mirror of https://github.com/blakeblackshear/frigate.git
synced 2026-02-20 13:54:36 +01:00
Full streaming support
@@ -25,6 +25,7 @@ from frigate.api.defs.response.chat_response import (
 )
 from frigate.api.defs.tags import Tags
 from frigate.api.event import events
+from frigate.genai.utils import build_assistant_message_for_conversation
 
 logger = logging.getLogger(__name__)
 
@@ -403,6 +404,78 @@ async def _execute_tool_internal(
     return {"error": f"Unknown tool: {tool_name}"}
 
 
+async def _execute_pending_tools(
+    pending_tool_calls: List[Dict[str, Any]],
+    request: Request,
+    allowed_cameras: List[str],
+) -> tuple[List[ToolCall], List[Dict[str, Any]]]:
+    """
+    Execute a list of tool calls; return (ToolCall list for API response, tool result dicts for conversation).
+    """
+    tool_calls_out: List[ToolCall] = []
+    tool_results: List[Dict[str, Any]] = []
+    for tool_call in pending_tool_calls:
+        tool_name = tool_call["name"]
+        tool_args = tool_call.get("arguments") or {}
+        tool_call_id = tool_call["id"]
+        logger.debug(
+            f"Executing tool: {tool_name} (id: {tool_call_id}) with arguments: {json.dumps(tool_args, indent=2)}"
+        )
+        try:
+            tool_result = await _execute_tool_internal(
+                tool_name, tool_args, request, allowed_cameras
+            )
+            if tool_name == "search_objects" and isinstance(tool_result, list):
+                tool_result = _format_events_with_local_time(tool_result)
+                _keys = {
+                    "id",
+                    "camera",
+                    "label",
+                    "zones",
+                    "start_time_local",
+                    "end_time_local",
+                    "sub_label",
+                    "event_count",
+                }
+                tool_result = [
+                    {k: evt[k] for k in _keys if k in evt}
+                    for evt in tool_result
+                    if isinstance(evt, dict)
+                ]
+            result_content = (
+                json.dumps(tool_result)
+                if isinstance(tool_result, (dict, list))
+                else (tool_result if isinstance(tool_result, str) else str(tool_result))
+            )
+            tool_calls_out.append(
+                ToolCall(name=tool_name, arguments=tool_args, response=result_content)
+            )
+            tool_results.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": tool_call_id,
+                    "content": result_content,
+                }
+            )
+        except Exception as e:
+            logger.error(
+                f"Error executing tool {tool_name} (id: {tool_call_id}): {e}",
+                exc_info=True,
+            )
+            error_content = json.dumps({"error": f"Tool execution failed: {str(e)}"})
+            tool_calls_out.append(
+                ToolCall(name=tool_name, arguments=tool_args, response=error_content)
+            )
+            tool_results.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": tool_call_id,
+                    "content": error_content,
+                }
+            )
+    return (tool_calls_out, tool_results)
+
+
 @router.post(
     "/chat/completion",
     dependencies=[Depends(allow_any_authenticated())],
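The _keys whitelist in the helper above keeps tool output compact: raw search_objects events carry bulky fields that would waste context, and per the original comment the local-time fields exist so the LLM doesn't hallucinate timestamps. A small illustration of the filter (the event dict is invented; sorted() is only to make the printout deterministic):

_keys = {
    "id", "camera", "label", "zones",
    "start_time_local", "end_time_local", "sub_label", "event_count",
}
evt = {
    "id": "1700000000.123456-abcdef",      # invented event
    "camera": "front",
    "label": "person",
    "start_time_local": "2026-02-20 13:00:00",
    "thumbnail": "<large base64 blob>",    # dropped by the filter
    "data": {"score": 0.92},               # dropped by the filter
}
print({k: evt[k] for k in sorted(_keys) if k in evt})
# {'camera': 'front', 'id': '1700000000.123456-abcdef',
#  'label': 'person', 'start_time_local': '2026-02-20 13:00:00'}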
@@ -527,6 +600,81 @@ Always be accurate with time calculations based on the current date provided.{ca
         f"{len(tools)} tool(s) available, max_iterations={max_iterations}"
     )
 
+    # True LLM streaming when client supports it and stream requested
+    if body.stream and hasattr(genai_client, "chat_with_tools_stream"):
+        stream_tool_calls: List[ToolCall] = []
+        stream_iterations = 0
+
+        async def stream_body_llm():
+            nonlocal conversation, stream_tool_calls, stream_iterations
+            while stream_iterations < max_iterations:
+                logger.debug(
+                    f"Streaming LLM (iteration {stream_iterations + 1}/{max_iterations}) "
+                    f"with {len(conversation)} message(s)"
+                )
+                async for event in genai_client.chat_with_tools_stream(
+                    messages=conversation,
+                    tools=tools if tools else None,
+                    tool_choice="auto",
+                ):
+                    kind, value = event
+                    if kind == "content_delta":
+                        yield (
+                            json.dumps({"type": "content", "delta": value}).encode(
+                                "utf-8"
+                            )
+                            + b"\n"
+                        )
+                    elif kind == "message":
+                        msg = value
+                        if msg.get("finish_reason") == "error":
+                            yield (
+                                json.dumps(
+                                    {
+                                        "type": "error",
+                                        "error": "An error occurred while processing your request.",
+                                    }
+                                ).encode("utf-8")
+                                + b"\n"
+                            )
+                            return
+                        pending = msg.get("tool_calls")
+                        if pending:
+                            stream_iterations += 1
+                            conversation.append(
+                                build_assistant_message_for_conversation(
+                                    msg.get("content"), pending
+                                )
+                            )
+                            executed_calls, tool_results = await _execute_pending_tools(
+                                pending, request, allowed_cameras
+                            )
+                            stream_tool_calls.extend(executed_calls)
+                            conversation.extend(tool_results)
+                            yield (
+                                json.dumps(
+                                    {
+                                        "type": "tool_calls",
+                                        "tool_calls": [
+                                            tc.model_dump() for tc in stream_tool_calls
+                                        ],
+                                    }
+                                ).encode("utf-8")
+                                + b"\n"
+                            )
+                            break
+                        else:
+                            yield (json.dumps({"type": "done"}).encode("utf-8") + b"\n")
+                            return
+            else:
+                yield json.dumps({"type": "done"}).encode("utf-8") + b"\n"
+
+        return StreamingResponse(
+            stream_body_llm(),
+            media_type="application/x-ndjson",
+            headers={"X-Accel-Buffering": "no"},
+        )
+
     try:
         while tool_iterations < max_iterations:
             logger.debug(
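The endpoint above streams newline-delimited JSON: each line is one event of type "content", "tool_calls", "error", or "done". A minimal consumer sketch — host, auth, and the request body fields are assumptions; only the event types and the application/x-ndjson framing come from the diff:

import asyncio
import json

import httpx


async def consume_chat_stream(prompt: str) -> None:
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream(
            "POST",
            "http://localhost:5000/api/chat/completion",  # hypothetical host
            json={"message": prompt, "stream": True},     # assumed body fields
        ) as response:
            async for line in response.aiter_lines():
                if not line.strip():
                    continue
                event = json.loads(line)
                if event["type"] == "content":
                    print(event["delta"], end="", flush=True)
                elif event["type"] == "tool_calls":
                    names = [tc["name"] for tc in event["tool_calls"]]
                    print(f"\n[tools executed: {names}]")
                elif event["type"] in ("error", "done"):
                    break


asyncio.run(consume_chat_stream("What happened on the front camera today?"))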
@@ -548,23 +696,11 @@ Always be accurate with time calculations based on the current date provided.{ca
                    status_code=500,
                )
 
-            assistant_message = {
-                "role": "assistant",
-                "content": response.get("content"),
-            }
-            if response.get("tool_calls"):
-                assistant_message["tool_calls"] = [
-                    {
-                        "id": tc["id"],
-                        "type": "function",
-                        "function": {
-                            "name": tc["name"],
-                            "arguments": json.dumps(tc["arguments"]),
-                        },
-                    }
-                    for tc in response["tool_calls"]
-                ]
-            conversation.append(assistant_message)
+            conversation.append(
+                build_assistant_message_for_conversation(
+                    response.get("content"), response.get("tool_calls")
+                )
+            )
 
             pending_tool_calls = response.get("tool_calls")
             if not pending_tool_calls:
@@ -574,6 +710,7 @@ Always be accurate with time calculations based on the current date provided.{ca
             final_content = response.get("content") or ""
 
             if body.stream:
+
                 async def stream_body() -> Any:
                     if tool_calls:
                         yield (
@@ -590,8 +727,9 @@ Always be accurate with time calculations based on the current date provided.{ca
                     # Stream content in word-sized chunks for smooth UX
                     for part in _chunk_content(final_content):
                         yield (
-                            json.dumps({"type": "content", "delta": part})
-                            .encode("utf-8")
+                            json.dumps({"type": "content", "delta": part}).encode(
+                                "utf-8"
+                            )
                             + b"\n"
                         )
                     yield json.dumps({"type": "done"}).encode("utf-8") + b"\n"
@@ -614,123 +752,15 @@ Always be accurate with time calculations based on the current date provided.{ca
                    ).model_dump(),
                )
 
-            # Execute tools
             tool_iterations += 1
             logger.debug(
                 f"Tool calls detected (iteration {tool_iterations}/{max_iterations}): "
                 f"{len(pending_tool_calls)} tool(s) to execute"
             )
-            tool_results = []
-
-            for tool_call in pending_tool_calls:
-                tool_name = tool_call["name"]
-                tool_args = tool_call["arguments"]
-                tool_call_id = tool_call["id"]
-
-                logger.debug(
-                    f"Executing tool: {tool_name} (id: {tool_call_id}) with arguments: {json.dumps(tool_args, indent=2)}"
-                )
-
-                try:
-                    tool_result = await _execute_tool_internal(
-                        tool_name, tool_args, request, allowed_cameras
-                    )
-
-                    # Add local time fields to search_objects results so the LLM doesn't hallucinate timestamps
-                    if tool_name == "search_objects" and isinstance(tool_result, list):
-                        tool_result = _format_events_with_local_time(tool_result)
-                        _keys = {
-                            "id",
-                            "camera",
-                            "label",
-                            "zones",
-                            "start_time_local",
-                            "end_time_local",
-                            "sub_label",
-                            "event_count",
-                        }
-                        tool_result = [
-                            {k: evt[k] for k in _keys if k in evt}
-                            for evt in tool_result
-                            if isinstance(evt, dict)
-                        ]
-
-                    if isinstance(tool_result, dict):
-                        result_content = json.dumps(tool_result)
-                        result_summary = tool_result
-                        if isinstance(tool_result, dict) and isinstance(
-                            tool_result.get("content"), list
-                        ):
-                            result_count = len(tool_result.get("content", []))
-                            result_summary = {
-                                "count": result_count,
-                                "sample": tool_result.get("content", [])[:2]
-                                if result_count > 0
-                                else [],
-                            }
-                        logger.debug(
-                            f"Tool {tool_name} (id: {tool_call_id}) completed successfully. "
-                            f"Result: {json.dumps(result_summary, indent=2)}"
-                        )
-                    elif isinstance(tool_result, list):
-                        result_content = json.dumps(tool_result)
-                        logger.debug(
-                            f"Tool {tool_name} (id: {tool_call_id}) completed successfully. "
-                            f"Result: {len(tool_result)} item(s)"
-                        )
-                    elif isinstance(tool_result, str):
-                        result_content = tool_result
-                        logger.debug(
-                            f"Tool {tool_name} (id: {tool_call_id}) completed successfully. "
-                            f"Result length: {len(result_content)} characters"
-                        )
-                    else:
-                        result_content = str(tool_result)
-                        logger.debug(
-                            f"Tool {tool_name} (id: {tool_call_id}) completed successfully. "
-                            f"Result type: {type(tool_result).__name__}"
-                        )
-
-                    tool_calls.append(
-                        ToolCall(
-                            name=tool_name,
-                            arguments=tool_args or {},
-                            response=result_content,
-                        )
-                    )
-                    tool_results.append(
-                        {
-                            "role": "tool",
-                            "tool_call_id": tool_call_id,
-                            "content": result_content,
-                        }
-                    )
-                except Exception as e:
-                    logger.error(
-                        f"Error executing tool {tool_name} (id: {tool_call_id}): {e}",
-                        exc_info=True,
-                    )
-                    error_content = json.dumps(
-                        {"error": f"Tool execution failed: {str(e)}"}
-                    )
-                    tool_calls.append(
-                        ToolCall(
-                            name=tool_name,
-                            arguments=tool_args or {},
-                            response=error_content,
-                        )
-                    )
-                    tool_results.append(
-                        {
-                            "role": "tool",
-                            "tool_call_id": tool_call_id,
-                            "content": error_content,
-                        }
-                    )
-                    logger.debug(
-                        f"Tool {tool_name} (id: {tool_call_id}) failed. Error result added to conversation."
-                    )
-
+            executed_calls, tool_results = await _execute_pending_tools(
+                pending_tool_calls, request, allowed_cameras
+            )
+            tool_calls.extend(executed_calls)
             conversation.extend(tool_results)
             logger.debug(
                 f"Added {len(tool_results)} tool result(s) to conversation. "
@@ -5,10 +5,12 @@ import json
 import logging
 from typing import Any, Optional
 
+import httpx
 import requests
 
 from frigate.config import GenAIProviderEnum
 from frigate.genai import GenAIClient, register_genai_provider
+from frigate.genai.utils import parse_tool_calls_from_message
 
 logger = logging.getLogger(__name__)
 
@@ -99,7 +101,76 @@ class LlamaCppClient(GenAIClient):
 
     def get_context_size(self) -> int:
         """Get the context window size for llama.cpp."""
-        return self.genai_config.provider_options.get("context_size", 4096)
+        return self.provider_options.get("context_size", 4096)
+
+    def _build_payload(
+        self,
+        messages: list[dict[str, Any]],
+        tools: Optional[list[dict[str, Any]]],
+        tool_choice: Optional[str],
+        stream: bool = False,
+    ) -> dict[str, Any]:
+        """Build request payload for chat completions (sync or stream)."""
+        openai_tool_choice = None
+        if tool_choice:
+            if tool_choice == "none":
+                openai_tool_choice = "none"
+            elif tool_choice == "auto":
+                openai_tool_choice = "auto"
+            elif tool_choice == "required":
+                openai_tool_choice = "required"
+
+        payload: dict[str, Any] = {"messages": messages}
+        if stream:
+            payload["stream"] = True
+        if tools:
+            payload["tools"] = tools
+        if openai_tool_choice is not None:
+            payload["tool_choice"] = openai_tool_choice
+        provider_opts = {
+            k: v for k, v in self.provider_options.items() if k != "context_size"
+        }
+        payload.update(provider_opts)
+        return payload
+
+    def _message_from_choice(self, choice: dict[str, Any]) -> dict[str, Any]:
+        """Parse OpenAI-style choice into {content, tool_calls, finish_reason}."""
+        message = choice.get("message", {})
+        content = message.get("content")
+        content = content.strip() if content else None
+        tool_calls = parse_tool_calls_from_message(message)
+        finish_reason = choice.get("finish_reason") or (
+            "tool_calls" if tool_calls else "stop" if content else "error"
+        )
+        return {
+            "content": content,
+            "tool_calls": tool_calls,
+            "finish_reason": finish_reason,
+        }
+
+    @staticmethod
+    def _streamed_tool_calls_to_list(
+        tool_calls_by_index: dict[int, dict[str, Any]],
+    ) -> Optional[list[dict[str, Any]]]:
+        """Convert streamed tool_calls index map to list of {id, name, arguments}."""
+        if not tool_calls_by_index:
+            return None
+        result = []
+        for idx in sorted(tool_calls_by_index.keys()):
+            t = tool_calls_by_index[idx]
+            args_str = t.get("arguments") or "{}"
+            try:
+                arguments = json.loads(args_str)
+            except json.JSONDecodeError:
+                arguments = {}
+            result.append(
+                {
+                    "id": t.get("id", ""),
+                    "name": t.get("name", ""),
+                    "arguments": arguments,
+                }
+            )
+        return result if result else None
 
     def chat_with_tools(
         self,
@@ -122,31 +193,8 @@ class LlamaCppClient(GenAIClient):
                "tool_calls": None,
                "finish_reason": "error",
            }
 
        try:
-            openai_tool_choice = None
-            if tool_choice:
-                if tool_choice == "none":
-                    openai_tool_choice = "none"
-                elif tool_choice == "auto":
-                    openai_tool_choice = "auto"
-                elif tool_choice == "required":
-                    openai_tool_choice = "required"
-
-            payload = {
-                "messages": messages,
-            }
-
-            if tools:
-                payload["tools"] = tools
-            if openai_tool_choice is not None:
-                payload["tool_choice"] = openai_tool_choice
-
-            provider_opts = {
-                k: v for k, v in self.provider_options.items() if k != "context_size"
-            }
-            payload.update(provider_opts)
-
+            payload = self._build_payload(messages, tools, tool_choice, stream=False)
            response = requests.post(
                f"{self.provider}/v1/chat/completions",
                json=payload,
@@ -154,60 +202,13 @@ class LlamaCppClient(GenAIClient):
            )
            response.raise_for_status()
            result = response.json()
 
            if result is None or "choices" not in result or len(result["choices"]) == 0:
                return {
                    "content": None,
                    "tool_calls": None,
                    "finish_reason": "error",
                }
 
-            choice = result["choices"][0]
-            message = choice.get("message", {})
-
-            content = message.get("content")
-            if content:
-                content = content.strip()
-            else:
-                content = None
-
-            tool_calls = None
-            if "tool_calls" in message and message["tool_calls"]:
-                tool_calls = []
-                for tool_call in message["tool_calls"]:
-                    try:
-                        function_data = tool_call.get("function", {})
-                        arguments_str = function_data.get("arguments", "{}")
-                        arguments = json.loads(arguments_str)
-                    except (json.JSONDecodeError, KeyError, TypeError) as e:
-                        logger.warning(
-                            f"Failed to parse tool call arguments: {e}, "
-                            f"tool: {function_data.get('name', 'unknown')}"
-                        )
-                        arguments = {}
-
-                    tool_calls.append(
-                        {
-                            "id": tool_call.get("id", ""),
-                            "name": function_data.get("name", ""),
-                            "arguments": arguments,
-                        }
-                    )
-
-            finish_reason = "error"
-            if "finish_reason" in choice and choice["finish_reason"]:
-                finish_reason = choice["finish_reason"]
-            elif tool_calls:
-                finish_reason = "tool_calls"
-            elif content:
-                finish_reason = "stop"
-
-            return {
-                "content": content,
-                "tool_calls": tool_calls,
-                "finish_reason": finish_reason,
-            }
-
+            return self._message_from_choice(result["choices"][0])
        except requests.exceptions.Timeout as e:
            logger.warning("llama.cpp request timed out: %s", str(e))
            return {
@@ -219,8 +220,7 @@ class LlamaCppClient(GenAIClient):
            error_detail = str(e)
            if hasattr(e, "response") and e.response is not None:
                try:
-                    error_body = e.response.text
-                    error_detail = f"{str(e)} - Response: {error_body[:500]}"
+                    error_detail = f"{str(e)} - Response: {e.response.text[:500]}"
                except Exception:
                    pass
            logger.warning("llama.cpp returned an error: %s", error_detail)
@@ -236,3 +236,106 @@ class LlamaCppClient(GenAIClient):
                "tool_calls": None,
                "finish_reason": "error",
            }
+
+    async def chat_with_tools_stream(
+        self,
+        messages: list[dict[str, Any]],
+        tools: Optional[list[dict[str, Any]]] = None,
+        tool_choice: Optional[str] = "auto",
+    ):
+        """Stream chat with tools via OpenAI-compatible streaming API."""
+        if self.provider is None:
+            logger.warning(
+                "llama.cpp provider has not been initialized. Check your llama.cpp configuration."
+            )
+            yield (
+                "message",
+                {
+                    "content": None,
+                    "tool_calls": None,
+                    "finish_reason": "error",
+                },
+            )
+            return
+        try:
+            payload = self._build_payload(messages, tools, tool_choice, stream=True)
+            content_parts: list[str] = []
+            tool_calls_by_index: dict[int, dict[str, Any]] = {}
+            finish_reason = "stop"
+
+            async with httpx.AsyncClient(timeout=float(self.timeout)) as client:
+                async with client.stream(
+                    "POST",
+                    f"{self.provider}/v1/chat/completions",
+                    json=payload,
+                ) as response:
+                    response.raise_for_status()
+                    async for line in response.aiter_lines():
+                        if not line.startswith("data: "):
+                            continue
+                        data_str = line[6:].strip()
+                        if data_str == "[DONE]":
+                            break
+                        try:
+                            data = json.loads(data_str)
+                        except json.JSONDecodeError:
+                            continue
+                        choices = data.get("choices") or []
+                        if not choices:
+                            continue
+                        delta = choices[0].get("delta", {})
+                        if choices[0].get("finish_reason"):
+                            finish_reason = choices[0]["finish_reason"]
+                        if delta.get("content"):
+                            content_parts.append(delta["content"])
+                            yield ("content_delta", delta["content"])
+                        for tc in delta.get("tool_calls") or []:
+                            idx = tc.get("index", 0)
+                            if idx not in tool_calls_by_index:
+                                tool_calls_by_index[idx] = {
+                                    "id": tc.get("id", ""),
+                                    "name": tc.get("name", ""),
+                                    "arguments": "",
+                                }
+                            t = tool_calls_by_index[idx]
+                            if tc.get("id"):
+                                t["id"] = tc["id"]
+                            if tc.get("name"):
+                                t["name"] = tc["name"]
+                            if tc.get("arguments"):
+                                t["arguments"] += tc["arguments"]
+
+            full_content = "".join(content_parts).strip() or None
+            tool_calls_list = self._streamed_tool_calls_to_list(tool_calls_by_index)
+            if tool_calls_list:
+                finish_reason = "tool_calls"
+            yield (
+                "message",
+                {
+                    "content": full_content,
+                    "tool_calls": tool_calls_list,
+                    "finish_reason": finish_reason,
+                },
+            )
+        except httpx.HTTPStatusError as e:
+            logger.warning("llama.cpp streaming HTTP error: %s", e)
+            yield (
+                "message",
+                {
+                    "content": None,
+                    "tool_calls": None,
+                    "finish_reason": "error",
+                },
+            )
+        except Exception as e:
+            logger.warning(
+                "Unexpected error in llama.cpp chat_with_tools_stream: %s", str(e)
+            )
+            yield (
+                "message",
+                {
+                    "content": None,
+                    "tool_calls": None,
+                    "finish_reason": "error",
+                },
+            )
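One subtlety in chat_with_tools_stream above: OpenAI-style servers split a single tool call across many stream chunks, keyed by index, with the arguments JSON arriving as string fragments that only parse once concatenated. A self-contained illustration of the same folding logic (the deltas are invented, and use the flat {id, name, arguments} delta shape this client reads):

import json

# Invented stream deltas: one tool call whose arguments arrive in two pieces.
deltas = [
    {"index": 0, "id": "call_1", "name": "search_objects", "arguments": '{"label'},
    {"index": 0, "arguments": '": "person"}'},
]

tool_calls_by_index: dict[int, dict] = {}
for tc in deltas:
    idx = tc.get("index", 0)
    slot = tool_calls_by_index.setdefault(
        idx, {"id": "", "name": "", "arguments": ""}
    )
    # Later chunks may repeat or omit id/name; arguments always concatenate.
    if tc.get("id"):
        slot["id"] = tc["id"]
    if tc.get("name"):
        slot["name"] = tc["name"]
    if tc.get("arguments"):
        slot["arguments"] += tc["arguments"]

for slot in tool_calls_by_index.values():
    print(slot["name"], json.loads(slot["arguments"]))
# search_objects {'label': 'person'}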
@@ -1,15 +1,16 @@
 """Ollama Provider for Frigate AI."""
 
-import json
 import logging
 from typing import Any, Optional
 
 from httpx import RemoteProtocolError, TimeoutException
+from ollama import AsyncClient as OllamaAsyncClient
 from ollama import Client as ApiClient
 from ollama import ResponseError
 
 from frigate.config import GenAIProviderEnum
 from frigate.genai import GenAIClient, register_genai_provider
+from frigate.genai.utils import parse_tool_calls_from_message
 
 logger = logging.getLogger(__name__)
 
@@ -88,6 +89,73 @@ class OllamaClient(GenAIClient):
            "num_ctx", 4096
        )
 
+    def _build_request_params(
+        self,
+        messages: list[dict[str, Any]],
+        tools: Optional[list[dict[str, Any]]],
+        tool_choice: Optional[str],
+        stream: bool = False,
+    ) -> dict[str, Any]:
+        """Build request_messages and params for chat (sync or stream)."""
+        request_messages = []
+        for msg in messages:
+            msg_dict = {
+                "role": msg.get("role"),
+                "content": msg.get("content", ""),
+            }
+            if msg.get("tool_call_id"):
+                msg_dict["tool_call_id"] = msg["tool_call_id"]
+            if msg.get("name"):
+                msg_dict["name"] = msg["name"]
+            if msg.get("tool_calls"):
+                msg_dict["tool_calls"] = msg["tool_calls"]
+            request_messages.append(msg_dict)
+
+        request_params: dict[str, Any] = {
+            "model": self.genai_config.model,
+            "messages": request_messages,
+            **self.provider_options,
+        }
+        if stream:
+            request_params["stream"] = True
+        if tools:
+            request_params["tools"] = tools
+        if tool_choice:
+            request_params["tool_choice"] = (
+                "none"
+                if tool_choice == "none"
+                else "required"
+                if tool_choice == "required"
+                else "auto"
+            )
+        return request_params
+
+    def _message_from_response(self, response: dict[str, Any]) -> dict[str, Any]:
+        """Parse Ollama chat response into {content, tool_calls, finish_reason}."""
+        if not response or "message" not in response:
+            return {
+                "content": None,
+                "tool_calls": None,
+                "finish_reason": "error",
+            }
+        message = response["message"]
+        content = message.get("content", "").strip() if message.get("content") else None
+        tool_calls = parse_tool_calls_from_message(message)
+        finish_reason = "error"
+        if response.get("done"):
+            finish_reason = (
+                "tool_calls" if tool_calls else "stop" if content else "error"
+            )
+        elif tool_calls:
+            finish_reason = "tool_calls"
+        elif content:
+            finish_reason = "stop"
+        return {
+            "content": content,
+            "tool_calls": tool_calls,
+            "finish_reason": finish_reason,
+        }
+
    def chat_with_tools(
        self,
        messages: list[dict[str, Any]],
@@ -103,93 +171,12 @@ class OllamaClient(GenAIClient):
                "tool_calls": None,
                "finish_reason": "error",
            }
 
        try:
-            request_messages = []
-            for msg in messages:
-                msg_dict = {
-                    "role": msg.get("role"),
-                    "content": msg.get("content", ""),
-                }
-                if msg.get("tool_call_id"):
-                    msg_dict["tool_call_id"] = msg["tool_call_id"]
-                if msg.get("name"):
-                    msg_dict["name"] = msg["name"]
-                if msg.get("tool_calls"):
-                    msg_dict["tool_calls"] = msg["tool_calls"]
-                request_messages.append(msg_dict)
-
-            request_params = {
-                "model": self.genai_config.model,
-                "messages": request_messages,
-            }
-
-            if tools:
-                request_params["tools"] = tools
-            if tool_choice:
-                if tool_choice == "none":
-                    request_params["tool_choice"] = "none"
-                elif tool_choice == "required":
-                    request_params["tool_choice"] = "required"
-                elif tool_choice == "auto":
-                    request_params["tool_choice"] = "auto"
-
-            request_params.update(self.provider_options)
-
-            response = self.provider.chat(**request_params)
-
-            if not response or "message" not in response:
-                return {
-                    "content": None,
-                    "tool_calls": None,
-                    "finish_reason": "error",
-                }
-
-            message = response["message"]
-            content = (
-                message.get("content", "").strip() if message.get("content") else None
-            )
-
-            tool_calls = None
-            if "tool_calls" in message and message["tool_calls"]:
-                tool_calls = []
-                for tool_call in message["tool_calls"]:
-                    try:
-                        function_data = tool_call.get("function", {})
-                        arguments_str = function_data.get("arguments", "{}")
-                        arguments = json.loads(arguments_str)
-                    except (json.JSONDecodeError, KeyError, TypeError) as e:
-                        logger.warning(
-                            f"Failed to parse tool call arguments: {e}, "
-                            f"tool: {function_data.get('name', 'unknown')}"
-                        )
-                        arguments = {}
-
-                    tool_calls.append(
-                        {
-                            "id": tool_call.get("id", ""),
-                            "name": function_data.get("name", ""),
-                            "arguments": arguments,
-                        }
-                    )
-
-            finish_reason = "error"
-            if "done" in response and response["done"]:
-                if tool_calls:
-                    finish_reason = "tool_calls"
-                elif content:
-                    finish_reason = "stop"
-            elif tool_calls:
-                finish_reason = "tool_calls"
-            elif content:
-                finish_reason = "stop"
-
-            return {
-                "content": content,
-                "tool_calls": tool_calls,
-                "finish_reason": finish_reason,
-            }
-
+            request_params = self._build_request_params(
+                messages, tools, tool_choice, stream=False
+            )
+            response = self.provider.chat(**request_params)
+            return self._message_from_response(response)
        except (TimeoutException, ResponseError, ConnectionError) as e:
            logger.warning("Ollama returned an error: %s", str(e))
            return {
@@ -204,3 +191,89 @@ class OllamaClient(GenAIClient):
                "tool_calls": None,
                "finish_reason": "error",
            }
+
+    async def chat_with_tools_stream(
+        self,
+        messages: list[dict[str, Any]],
+        tools: Optional[list[dict[str, Any]]] = None,
+        tool_choice: Optional[str] = "auto",
+    ):
+        """Stream chat with tools; yields content deltas then final message."""
+        if self.provider is None:
+            logger.warning(
+                "Ollama provider has not been initialized. Check your Ollama configuration."
+            )
+            yield (
+                "message",
+                {
+                    "content": None,
+                    "tool_calls": None,
+                    "finish_reason": "error",
+                },
+            )
+            return
+        try:
+            request_params = self._build_request_params(
+                messages, tools, tool_choice, stream=True
+            )
+            async_client = OllamaAsyncClient(
+                host=self.genai_config.base_url,
+                timeout=self.timeout,
+            )
+            content_parts: list[str] = []
+            final_message: dict[str, Any] | None = None
+            try:
+                stream = await async_client.chat(**request_params)
+                async for chunk in stream:
+                    if not chunk or "message" not in chunk:
+                        continue
+                    msg = chunk.get("message", {})
+                    delta = msg.get("content") or ""
+                    if delta:
+                        content_parts.append(delta)
+                        yield ("content_delta", delta)
+                    if chunk.get("done"):
+                        full_content = "".join(content_parts).strip() or None
+                        tool_calls = parse_tool_calls_from_message(msg)
+                        final_message = {
+                            "content": full_content,
+                            "tool_calls": tool_calls,
+                            "finish_reason": "tool_calls" if tool_calls else "stop",
+                        }
+                        break
+            finally:
+                await async_client.close()
+
+            if final_message is not None:
+                yield ("message", final_message)
+            else:
+                yield (
+                    "message",
+                    {
+                        "content": "".join(content_parts).strip() or None,
+                        "tool_calls": None,
+                        "finish_reason": "stop",
+                    },
+                )
+        except (TimeoutException, ResponseError, ConnectionError) as e:
+            logger.warning("Ollama streaming error: %s", str(e))
+            yield (
+                "message",
+                {
+                    "content": None,
+                    "tool_calls": None,
+                    "finish_reason": "error",
+                },
+            )
+        except Exception as e:
+            logger.warning(
+                "Unexpected error in Ollama chat_with_tools_stream: %s", str(e)
+            )
+            yield (
+                "message",
+                {
+                    "content": None,
+                    "tool_calls": None,
+                    "finish_reason": "error",
+                },
+            )
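Both providers now expose the same async-generator contract, which is what the hasattr(genai_client, "chat_with_tools_stream") check in the chat endpoint keys on: zero or more ("content_delta", str) tuples, then exactly one ("message", {content, tool_calls, finish_reason}). A minimal sketch of a consumer under that contract (client construction elided; any object with this method works):

async def drain_stream(genai_client, conversation, tools):
    # Print deltas as they arrive; return the final structured message,
    # which is what drives the tool-call loop in the chat endpoint.
    final = None
    async for kind, value in genai_client.chat_with_tools_stream(
        messages=conversation, tools=tools, tool_choice="auto"
    ):
        if kind == "content_delta":
            print(value, end="", flush=True)
        elif kind == "message":
            final = value
    return final  # {"content": ..., "tool_calls": ..., "finish_reason": ...}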
frigate/genai/utils.py (new file, +70)
@@ -0,0 +1,70 @@
+"""Shared helpers for GenAI providers and chat (OpenAI-style messages, tool call parsing)."""
+
+import json
+import logging
+from typing import Any, List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+def parse_tool_calls_from_message(
+    message: dict[str, Any],
+) -> Optional[list[dict[str, Any]]]:
+    """
+    Parse tool_calls from an OpenAI-style message dict.
+
+    Message may have "tool_calls" as a list of:
+    {"id": str, "function": {"name": str, "arguments": str}, ...}
+
+    Returns a list of {"id", "name", "arguments"} with arguments parsed as dict,
+    or None if no tool_calls. Used by Ollama and LlamaCpp (non-stream) responses.
+    """
+    raw = message.get("tool_calls")
+    if not raw or not isinstance(raw, list):
+        return None
+    result = []
+    for tool_call in raw:
+        function_data = tool_call.get("function") or {}
+        try:
+            arguments_str = function_data.get("arguments") or "{}"
+            arguments = json.loads(arguments_str)
+        except (json.JSONDecodeError, KeyError, TypeError) as e:
+            logger.warning(
+                "Failed to parse tool call arguments: %s, tool: %s",
+                e,
+                function_data.get("name", "unknown"),
+            )
+            arguments = {}
+        result.append(
+            {
+                "id": tool_call.get("id", ""),
+                "name": function_data.get("name", ""),
+                "arguments": arguments,
+            }
+        )
+    return result if result else None
+
+
+def build_assistant_message_for_conversation(
+    content: Any,
+    tool_calls_raw: Optional[List[dict[str, Any]]],
+) -> dict[str, Any]:
+    """
+    Build the assistant message dict in OpenAI format for appending to a conversation.
+
+    tool_calls_raw: list of {"id", "name", "arguments"} (arguments as dict), or None.
+    """
+    msg: dict[str, Any] = {"role": "assistant", "content": content}
+    if tool_calls_raw:
+        msg["tool_calls"] = [
+            {
+                "id": tc["id"],
+                "type": "function",
+                "function": {
+                    "name": tc["name"],
+                    "arguments": json.dumps(tc.get("arguments") or {}),
+                },
+            }
+            for tc in tool_calls_raw
+        ]
+    return msg
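The two helpers are inverses around the wire format: parse_tool_calls_from_message flattens a provider message into {id, name, arguments} dicts with arguments decoded, and build_assistant_message_for_conversation re-serializes them (arguments back to a JSON string) for the conversation history. A worked example with an invented call:

from frigate.genai.utils import (
    build_assistant_message_for_conversation,
    parse_tool_calls_from_message,
)

# Invented OpenAI-style provider message.
message = {
    "tool_calls": [
        {
            "id": "call_1",
            "function": {
                "name": "search_objects",
                "arguments": '{"camera": "front"}',
            },
        }
    ]
}

calls = parse_tool_calls_from_message(message)
# [{'id': 'call_1', 'name': 'search_objects', 'arguments': {'camera': 'front'}}]

assistant = build_assistant_message_for_conversation(None, calls)
# {'role': 'assistant', 'content': None, 'tool_calls': [{'id': 'call_1',
#  'type': 'function', 'function': {'name': 'search_objects',
#  'arguments': '{"camera": "front"}'}}]}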
@@ -62,6 +62,7 @@ export default function ChatPage() {
     setMessages((prev) => [...prev, assistantMessage]);
 
     let buffer = "";
+    let hadStreamError = false;
     for (;;) {
       const { done, value } = await reader.read();
       if (done) break;
@@ -81,6 +82,14 @@ export default function ChatPage() {
        } catch {
          continue;
        }
+        if (data.type === "error" && "error" in data) {
+          setError((data as { error?: string }).error ?? t("error"));
+          setMessages((prev) =>
+            prev.filter((m) => !(m.role === "assistant" && m.content === "")),
+          );
+          hadStreamError = true;
+          break;
+        }
        if (data.type === "tool_calls" && data.tool_calls?.length) {
          setMessages((prev) => {
            const next = [...prev];
@@ -105,8 +114,11 @@ export default function ChatPage() {
          });
        }
      }
+      if (hadStreamError) break;
    }
-    if (buffer.trim()) {
+    if (hadStreamError) {
+      // already set error and cleaned up
+    } else if (buffer.trim()) {
      try {
        const data = JSON.parse(buffer.trim()) as {
          type: string;
@@ -130,13 +142,15 @@ export default function ChatPage() {
        }
      }
 
-      setMessages((prev) => {
-        const next = [...prev];
-        const last = next[next.length - 1];
-        if (last?.role === "assistant" && last.content === "")
-          next[next.length - 1] = { ...last, content: " " };
-        return next;
-      });
+      if (!hadStreamError) {
+        setMessages((prev) => {
+          const next = [...prev];
+          const last = next[next.length - 1];
+          if (last?.role === "assistant" && last.content === "")
+            next[next.length - 1] = { ...last, content: " " };
+          return next;
+        });
+      }
    } catch {
      setError(t("error"));
      setMessages((prev) =>
@@ -4,7 +4,7 @@ import { defineConfig } from "vite";
 import react from "@vitejs/plugin-react-swc";
 import monacoEditorPlugin from "vite-plugin-monaco-editor";
 
-const proxyHost = process.env.PROXY_HOST || "localhost:5000";
+const proxyHost = process.env.PROXY_HOST || "192.168.50.106:5002";
 
 // https://vitejs.dev/config/
 export default defineConfig({