From fd96cd5daef58bb5cca9a0a7faf33848a22c44e0 Mon Sep 17 00:00:00 2001 From: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com> Date: Thu, 26 Jun 2025 08:22:31 -0500 Subject: [PATCH] Verify images before adding to batch for embedding (#18885) --- frigate/embeddings/embeddings.py | 33 +++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/frigate/embeddings/embeddings.py b/frigate/embeddings/embeddings.py index 096077916..30af15d8a 100644 --- a/frigate/embeddings/embeddings.py +++ b/frigate/embeddings/embeddings.py @@ -1,12 +1,14 @@ """SQLite-vec embeddings database.""" import datetime +import io import logging import os import threading import time from numpy import ndarray +from PIL import Image from playhouse.shortcuts import model_to_dict from frigate.comms.inter_process import InterProcessRequestor @@ -199,14 +201,31 @@ class Embeddings: @param: upsert If embedding should be upserted into vec DB """ start = datetime.datetime.now().timestamp() - ids = list(event_thumbs.keys()) - embeddings = self.vision_embedding(list(event_thumbs.values())) + valid_ids = [] + valid_thumbs = [] + for eid, thumb in event_thumbs.items(): + try: + img = Image.open(io.BytesIO(thumb)) + img.verify() # Will raise if corrupt + valid_ids.append(eid) + valid_thumbs.append(thumb) + except Exception as e: + logger.warning( + f"Embeddings reindexing: Skipping corrupt thumbnail for event {eid}: {e}" + ) + + if not valid_thumbs: + logger.warning( + "Embeddings reindexing: No valid thumbnails to embed in this batch." + ) + return [] + + embeddings = self.vision_embedding(valid_thumbs) if upsert: items = [] - - for i in range(len(ids)): - items.append(ids[i]) + for i in range(len(valid_ids)): + items.append(valid_ids[i]) items.append(serialize(embeddings[i])) self.image_eps.update() @@ -214,12 +233,12 @@ class Embeddings: """ INSERT OR REPLACE INTO vec_thumbnails(id, thumbnail_embedding) VALUES {} - """.format(", ".join(["(?, ?)"] * len(ids))), + """.format(", ".join(["(?, ?)"] * len(valid_ids))), items, ) duration = datetime.datetime.now().timestamp() - start - self.text_inference_speed.update(duration / len(ids)) + self.text_inference_speed.update(duration / len(valid_ids)) return embeddings