Verify images before adding to batch for embedding (#18885)

This commit is contained in:
Josh Hawkins 2025-06-26 08:22:31 -05:00 committed by GitHub
parent e0c1fea2ac
commit fd96cd5dae
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1,12 +1,14 @@
"""SQLite-vec embeddings database.""" """SQLite-vec embeddings database."""
import datetime import datetime
import io
import logging import logging
import os import os
import threading import threading
import time import time
from numpy import ndarray from numpy import ndarray
from PIL import Image
from playhouse.shortcuts import model_to_dict from playhouse.shortcuts import model_to_dict
from frigate.comms.inter_process import InterProcessRequestor from frigate.comms.inter_process import InterProcessRequestor
@@ -199,14 +201,31 @@ class Embeddings:
@param: upsert If embedding should be upserted into vec DB @param: upsert If embedding should be upserted into vec DB
""" """
start = datetime.datetime.now().timestamp() start = datetime.datetime.now().timestamp()
ids = list(event_thumbs.keys()) valid_ids = []
embeddings = self.vision_embedding(list(event_thumbs.values())) valid_thumbs = []
for eid, thumb in event_thumbs.items():
try:
img = Image.open(io.BytesIO(thumb))
img.verify() # Will raise if corrupt
valid_ids.append(eid)
valid_thumbs.append(thumb)
except Exception as e:
logger.warning(
f"Embeddings reindexing: Skipping corrupt thumbnail for event {eid}: {e}"
)
if not valid_thumbs:
logger.warning(
"Embeddings reindexing: No valid thumbnails to embed in this batch."
)
return []
embeddings = self.vision_embedding(valid_thumbs)
if upsert: if upsert:
items = [] items = []
for i in range(len(valid_ids)):
for i in range(len(ids)): items.append(valid_ids[i])
items.append(ids[i])
items.append(serialize(embeddings[i])) items.append(serialize(embeddings[i]))
self.image_eps.update() self.image_eps.update()
@@ -214,12 +233,12 @@ class Embeddings:
""" """
INSERT OR REPLACE INTO vec_thumbnails(id, thumbnail_embedding) INSERT OR REPLACE INTO vec_thumbnails(id, thumbnail_embedding)
VALUES {} VALUES {}
""".format(", ".join(["(?, ?)"] * len(ids))), """.format(", ".join(["(?, ?)"] * len(valid_ids))),
items, items,
) )
duration = datetime.datetime.now().timestamp() - start duration = datetime.datetime.now().timestamp() - start
self.text_inference_speed.update(duration / len(ids)) self.text_inference_speed.update(duration / len(valid_ids))
return embeddings return embeddings