From 8364e686671493b913894d17064b1826dbe26712 Mon Sep 17 00:00:00 2001
From: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>
Date: Mon, 21 Oct 2024 10:00:45 -0500
Subject: [PATCH] Model and genai fixes (#14481)

* disable mem arena in options for cpu only

* add try/except around ollama initialization

* update docs
---
 docs/docs/configuration/genai.md |  8 ++++++--
 frigate/genai/ollama.py          | 18 +++++++++++++-----
 frigate/util/model.py            |  8 ++------
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/docs/docs/configuration/genai.md b/docs/docs/configuration/genai.md
index aace224f3..1a9a125c6 100644
--- a/docs/docs/configuration/genai.md
+++ b/docs/docs/configuration/genai.md
@@ -29,11 +29,15 @@ cameras:
 
 ## Ollama
 
-[Ollama](https://ollama.com/) allows you to self-host large language models and keep everything running locally. It provides a nice API over [llama.cpp](https://github.com/ggerganov/llama.cpp). It is highly recommended to host this server on a machine with an Nvidia graphics card, or on a Apple silicon Mac for best performance. Most of the 7b parameter 4-bit vision models will fit inside 8GB of VRAM. There is also a [docker container](https://hub.docker.com/r/ollama/ollama) available.
+[Ollama](https://ollama.com/) allows you to self-host large language models and keep everything running locally. It provides a nice API over [llama.cpp](https://github.com/ggerganov/llama.cpp). It is highly recommended to host this server on a machine with an Nvidia graphics card, or on an Apple silicon Mac for best performance. CPU inference is not recommended.
+
+Most of the 7b parameter 4-bit vision models will fit inside 8GB of VRAM. There is also a [docker container](https://hub.docker.com/r/ollama/ollama) available.
+
+Parallel requests also come with some caveats. See the [Ollama documentation](https://github.com/ollama/ollama/blob/main/docs/faq.md#how-does-ollama-handle-concurrent-requests).
 
 ### Supported Models
 
-You must use a vision capable model with Frigate. Current model variants can be found [in their model library](https://ollama.com/library). At the time of writing, this includes `llava`, `llava-llama3`, `llava-phi3`, and `moondream`.
+You must use a vision capable model with Frigate. Current model variants can be found [in their model library](https://ollama.com/library). At the time of writing, this includes `llava`, `llava-llama3`, `llava-phi3`, and `moondream`. Note that Frigate will not automatically download the model you specify in your config; you must download the model to your local instance of Ollama first.
 
 :::note
diff --git a/frigate/genai/ollama.py b/frigate/genai/ollama.py
index ae62208cb..e61441eba 100644
--- a/frigate/genai/ollama.py
+++ b/frigate/genai/ollama.py
@@ -21,12 +21,20 @@ class OllamaClient(GenAIClient):
 
     def _init_provider(self):
         """Initialize the client."""
-        client = ApiClient(host=self.genai_config.base_url, timeout=self.timeout)
-        response = client.pull(self.genai_config.model)
-        if response["status"] != "success":
-            logger.error("Failed to pull %s model from Ollama", self.genai_config.model)
+        try:
+            client = ApiClient(host=self.genai_config.base_url, timeout=self.timeout)
+            # ensure the model is available locally
+            response = client.show(self.genai_config.model)
+            if response.get("error"):
+                logger.error(
+                    "Ollama error: %s",
+                    response["error"],
+                )
+                return None
+            return client
+        except Exception as e:
+            logger.warning("Error initializing Ollama: %s", str(e))
             return None
-        return client
 
     def _send(self, prompt: str, images: list[bytes]) -> Optional[str]:
         """Submit a request to Ollama"""
diff --git a/frigate/util/model.py b/frigate/util/model.py
index b1c85b100..7aefe8b42 100644
--- a/frigate/util/model.py
+++ b/frigate/util/model.py
@@ -20,7 +20,7 @@ def get_ort_providers(
             ["CPUExecutionProvider"],
             [
                 {
-                    "arena_extend_strategy": "kSameAsRequested",
+                    "enable_cpu_mem_arena": False,
                 }
             ],
         )
@@ -53,7 +53,7 @@ def get_ort_providers(
             providers.append(provider)
             options.append(
                 {
-                    "arena_extend_strategy": "kSameAsRequested",
+                    "enable_cpu_mem_arena": False,
                 }
             )
         else:
@@ -85,12 +85,8 @@ class ONNXModelRunner:
         else:
            # Use ONNXRuntime
             self.type = "ort"
-            options = ort.SessionOptions()
-            if device == "CPU":
-                options.enable_cpu_mem_arena = False
             self.ort = ort.InferenceSession(
                 model_path,
-                sess_options=options,
                 providers=providers,
                 provider_options=options,
             )
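
A quick illustration of the ONNX Runtime change above: instead of toggling `enable_cpu_mem_arena` on an `ort.SessionOptions` object inside `ONNXModelRunner`, the flag now travels through the per-provider options that `get_ort_providers` returns. The sketch below is not part of the patch; the standalone variables and the `"model.onnx"` path are illustrative assumptions, and it only mirrors how `InferenceSession` consumes the provider/option pair produced by the diff.

```python
# Minimal sketch, assuming an ONNX model file is available locally.
# It reproduces the pattern the patch moves to: the CPU memory arena is
# disabled via provider options rather than via ort.SessionOptions.
import onnxruntime as ort

providers = ["CPUExecutionProvider"]
options = [
    {
        # same key/value that get_ort_providers now emits for the CPU provider
        "enable_cpu_mem_arena": False,
    }
]

session = ort.InferenceSession(
    "model.onnx",  # placeholder; Frigate passes its own model_path
    providers=providers,
    provider_options=options,  # one options dict per entry in providers
)
```

This keeps the CPU-only code path free of a dedicated `SessionOptions` object, which is what the commit message means by "disable mem arena in options for cpu only".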