From 8364e686671493b913894d17064b1826dbe26712 Mon Sep 17 00:00:00 2001
From: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>
Date: Mon, 21 Oct 2024 10:00:45 -0500
Subject: [PATCH] Model and genai fixes (#14481)

* disable mem arena in options for cpu only

* add try/except around ollama initialization

* update docs
---
 docs/docs/configuration/genai.md |  8 ++++++--
 frigate/genai/ollama.py          | 18 +++++++++++++-----
 frigate/util/model.py            |  8 ++------
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/docs/docs/configuration/genai.md b/docs/docs/configuration/genai.md
index aace224f3..1a9a125c6 100644
--- a/docs/docs/configuration/genai.md
+++ b/docs/docs/configuration/genai.md
@@ -29,11 +29,15 @@ cameras:
 
 ## Ollama
 
-[Ollama](https://ollama.com/) allows you to self-host large language models and keep everything running locally. It provides a nice API over [llama.cpp](https://github.com/ggerganov/llama.cpp). It is highly recommended to host this server on a machine with an Nvidia graphics card, or on a Apple silicon Mac for best performance. Most of the 7b parameter 4-bit vision models will fit inside 8GB of VRAM. There is also a [docker container](https://hub.docker.com/r/ollama/ollama) available.
+[Ollama](https://ollama.com/) allows you to self-host large language models and keep everything running locally. It provides a nice API over [llama.cpp](https://github.com/ggerganov/llama.cpp). It is highly recommended to host this server on a machine with an Nvidia graphics card, or on an Apple silicon Mac for best performance. CPU inference is not recommended.
+
+Most of the 7b parameter 4-bit vision models will fit inside 8GB of VRAM. There is also a [docker container](https://hub.docker.com/r/ollama/ollama) available.
+
+Parallel requests also come with some caveats. See the [Ollama documentation](https://github.com/ollama/ollama/blob/main/docs/faq.md#how-does-ollama-handle-concurrent-requests).
 
 ### Supported Models
 
-You must use a vision capable model with Frigate. Current model variants can be found [in their model library](https://ollama.com/library). At the time of writing, this includes `llava`, `llava-llama3`, `llava-phi3`, and `moondream`.
+You must use a vision capable model with Frigate. Current model variants can be found [in their model library](https://ollama.com/library). At the time of writing, this includes `llava`, `llava-llama3`, `llava-phi3`, and `moondream`. Note that Frigate will not automatically download the model you specify in your config; you must download the model to your local instance of Ollama first.
 
 :::note
diff --git a/frigate/genai/ollama.py b/frigate/genai/ollama.py
index ae62208cb..e61441eba 100644
--- a/frigate/genai/ollama.py
+++ b/frigate/genai/ollama.py
@@ -21,12 +21,20 @@ class OllamaClient(GenAIClient):
 
     def _init_provider(self):
         """Initialize the client."""
-        client = ApiClient(host=self.genai_config.base_url, timeout=self.timeout)
-        response = client.pull(self.genai_config.model)
-        if response["status"] != "success":
-            logger.error("Failed to pull %s model from Ollama", self.genai_config.model)
+        try:
+            client = ApiClient(host=self.genai_config.base_url, timeout=self.timeout)
+            # ensure the model is available locally
+            response = client.show(self.genai_config.model)
+            if response.get("error"):
+                logger.error(
+                    "Ollama error: %s",
+                    response["error"],
+                )
+                return None
+            return client
+        except Exception as e:
+            logger.warning("Error initializing Ollama: %s", str(e))
             return None
-        return client
 
     def _send(self, prompt: str, images: list[bytes]) -> Optional[str]:
         """Submit a request to Ollama"""
diff --git a/frigate/util/model.py b/frigate/util/model.py
index b1c85b100..7aefe8b42 100644
--- a/frigate/util/model.py
+++ b/frigate/util/model.py
@@ -20,7 +20,7 @@ def get_ort_providers(
             ["CPUExecutionProvider"],
             [
                 {
-                    "arena_extend_strategy": "kSameAsRequested",
+                    "enable_cpu_mem_arena": False,
                 }
             ],
         )
@@ -53,7 +53,7 @@ def get_ort_providers(
             providers.append(provider)
             options.append(
                 {
-                    "arena_extend_strategy": "kSameAsRequested",
+                    "enable_cpu_mem_arena": False,
                 }
             )
         else:
@@ -85,12 +85,8 @@ class ONNXModelRunner:
         else:
            # Use ONNXRuntime
             self.type = "ort"
-            options = ort.SessionOptions()
-            if device == "CPU":
-                options.enable_cpu_mem_arena = False
             self.ort = ort.InferenceSession(
                 model_path,
-                sess_options=options,
                 providers=providers,
                 provider_options=options,
             )
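
A quick illustration of the ONNX Runtime change above: instead of toggling `enable_cpu_mem_arena` on an `ort.SessionOptions` object inside `ONNXModelRunner`, the flag now travels through the per-provider options that `get_ort_providers` returns. The sketch below is not part of the patch; the standalone variables and the `"model.onnx"` path are illustrative assumptions, and it only mirrors how `InferenceSession` consumes the provider/option pair produced by the diff.

```python
# Minimal sketch, assuming an ONNX model file is available locally.
# It reproduces the pattern the patch moves to: the CPU memory arena is
# disabled via provider options rather than via ort.SessionOptions.
import onnxruntime as ort

providers = ["CPUExecutionProvider"]
options = [
    {
        # same key/value that get_ort_providers now emits for the CPU provider
        "enable_cpu_mem_arena": False,
    }
]

session = ort.InferenceSession(
    "model.onnx",  # placeholder; Frigate passes its own model_path
    providers=providers,
    provider_options=options,  # one options dict per entry in providers
)
```

This keeps the CPU-only code path free of a dedicated `SessionOptions` object, which is what the commit message means by "disable mem arena in options for cpu only".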