2024-10-07 22:30:45 +02:00
|
|
|
"""SQLite-vec embeddings database."""
|
2024-06-21 23:30:19 +02:00
|
|
|
|
2024-06-23 15:13:02 +02:00
|
|
|
import json
|
2024-06-21 23:30:19 +02:00
|
|
|
import logging
|
|
|
|
import multiprocessing as mp
|
2024-10-07 22:30:45 +02:00
|
|
|
import os
|
2024-06-21 23:30:19 +02:00
|
|
|
import signal
|
|
|
|
import threading
|
|
|
|
from types import FrameType
|
2024-10-10 17:42:24 +02:00
|
|
|
from typing import Optional, Union
|
2024-06-21 23:30:19 +02:00
|
|
|
|
|
|
|
from setproctitle import setproctitle
|
|
|
|
|
2024-10-10 17:42:24 +02:00
|
|
|
from frigate.comms.embeddings_updater import EmbeddingsRequestEnum, EmbeddingsRequestor
|
2024-06-21 23:30:19 +02:00
|
|
|
from frigate.config import FrigateConfig
|
2024-06-23 15:13:02 +02:00
|
|
|
from frigate.const import CONFIG_DIR
|
2024-10-07 22:30:45 +02:00
|
|
|
from frigate.db.sqlitevecq import SqliteVecQueueDatabase
|
2024-06-21 23:30:19 +02:00
|
|
|
from frigate.models import Event
|
2024-10-10 17:42:24 +02:00
|
|
|
from frigate.util.builtin import serialize
|
2024-06-21 23:30:19 +02:00
|
|
|
from frigate.util.services import listen
|
|
|
|
|
2024-06-23 15:13:02 +02:00
|
|
|
from .maintainer import EmbeddingMaintainer
|
|
|
|
from .util import ZScoreNormalization
|
|
|
|
|
2024-06-21 23:30:19 +02:00
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
def manage_embeddings(config: FrigateConfig) -> None:
    """Set up the embeddings database connection and start the maintainer.

    Returns immediately when semantic search is disabled in ``config``.
    Otherwise this installs SIGTERM/SIGINT handlers that set a shared stop
    event, names the current thread and process, opens the Frigate database
    with the sqlite-vec extension loaded, and starts an
    ``EmbeddingMaintainer`` bound to that database.
    """
    # Only initialize embeddings if semantic search is enabled
    if not config.semantic_search.enabled:
        return

    stop_event = mp.Event()

    def _request_stop(signum: int, frame: Optional[FrameType]) -> None:
        # Both termination signals simply flag the maintainer to stop.
        stop_event.set()

    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, _request_stop)

    threading.current_thread().name = "process:embeddings_manager"
    setproctitle("frigate.embeddings_manager")
    listen()

    # Configure Frigate DB
    db_pragmas = {
        "auto_vacuum": "FULL",  # Does not defragment database
        "cache_size": -512 * 1000,  # 512MB of cache
        "synchronous": "NORMAL",  # Safe when using WAL https://www.sqlite.org/pragma.html#pragma_synchronous
    }
    # The timeout scales with the number of enabled cameras, with a 60 floor.
    enabled_cameras = sum(1 for cam in config.cameras.values() if cam.enabled)
    db = SqliteVecQueueDatabase(
        config.database.path,
        pragmas=db_pragmas,
        timeout=max(60, 10 * enabled_cameras),
        load_vec_extension=True,
    )
    db.bind([Event])

    embedding_maintainer = EmbeddingMaintainer(db, config, stop_event)
    embedding_maintainer.start()
|
2024-06-23 15:13:02 +02:00
|
|
|
|
|
|
|
|
|
|
|
class EmbeddingsContext:
    """Search and update embeddings stored in the sqlite-vec tables.

    Holds the database handle, the z-score normalization statistics for
    thumbnail and description searches (persisted as JSON in
    ``CONFIG_DIR/.search_stats.json``), and an ``EmbeddingsRequestor`` used
    to request embeddings that are not already stored.
    """

    def __init__(self, db: SqliteVecQueueDatabase):
        """Store the database handle and load saved search stats, if any.

        Args:
            db: Database used for the ``vec_thumbnails`` and
                ``vec_descriptions`` queries.
        """
        self.db = db
        self.thumb_stats = ZScoreNormalization()
        self.desc_stats = ZScoreNormalization()
        self.requestor = EmbeddingsRequestor()

        # load stats from disk
        stats_path = self._stats_path()
        try:
            with open(stats_path, "r") as f:
                data = json.load(f)
            self.thumb_stats.from_dict(data["thumb_stats"])
            self.desc_stats.from_dict(data["desc_stats"])
        except FileNotFoundError:
            # Nothing has been saved yet; keep the freshly created stats.
            pass
        except (json.JSONDecodeError, KeyError) as e:
            # A truncated or incomplete stats file (e.g. from an interrupted
            # write) must not prevent construction; the stats are rewritten
            # by stop().
            logger.warning(
                "Could not fully load search stats from %s: %s", stats_path, e
            )

    @staticmethod
    def _stats_path() -> str:
        """Return the path of the persisted search statistics file."""
        return os.path.join(CONFIG_DIR, ".search_stats.json")

    def stop(self):
        """Write the stats to disk as JSON on exit, then stop the requestor.

        The requestor is stopped even if writing the stats file fails; the
        write error still propagates to the caller.
        """
        contents = {
            "thumb_stats": self.thumb_stats.to_dict(),
            "desc_stats": self.desc_stats.to_dict(),
        }
        try:
            with open(self._stats_path(), "w") as f:
                json.dump(contents, f)
        finally:
            self.requestor.stop()

    def _knn_search(
        self,
        table: str,
        column: str,
        query_embedding,
        event_ids: Optional[list[str]] = None,
    ) -> list[tuple[str, float]]:
        """Run the nearest-neighbour query shared by both search methods.

        Args:
            table: Name of the vector table to query. Only ever an internal
                constant, never user input, since it is interpolated into
                the SQL text.
            column: Name of the embedding column in ``table`` (also an
                internal constant).
            query_embedding: Value bound to the ``MATCH`` placeholder.
            event_ids: Optional ids used to restrict the results.

        Returns:
            The fetched ``(id, distance)`` rows, ordered by distance.
        """
        sql_query = f"""
            SELECT
                id,
                distance
            FROM {table}
            WHERE {column} MATCH ?
                AND k = 100
        """
        parameters = [query_embedding]

        # Add the IN clause if event_ids is provided and not empty
        # this is the only filter supported by sqlite-vec as of 0.1.3
        # but it seems to be broken in this version
        if event_ids:
            sql_query += " AND id IN ({})".format(",".join("?" * len(event_ids)))
            parameters.extend(event_ids)

        # order by distance DESC is not implemented in this version of sqlite-vec
        # when it's implemented, we can use cosine similarity
        sql_query += " ORDER BY distance"

        return self.db.execute_sql(sql_query, parameters).fetchall()

    def search_thumbnail(
        self, query: Union[Event, str], event_ids: Optional[list[str]] = None
    ) -> list[tuple[str, float]]:
        """Find thumbnails nearest to an event's thumbnail or to a text query.

        Args:
            query: Either an ``Event``, whose stored thumbnail embedding is
                used (or requested from the requestor when none is stored),
                or a search string sent to the requestor to be embedded.
            event_ids: Optional ids used to restrict the results.

        Returns:
            ``(id, distance)`` rows ordered by distance, or an empty list
            when the requestor returns no embedding data.
        """
        query_embedding = None

        if isinstance(query, Event):
            cursor = self.db.execute_sql(
                """
                SELECT thumbnail_embedding FROM vec_thumbnails WHERE id = ?
                """,
                [query.id],
            )
            row = cursor.fetchone() if cursor else None

            if row:
                query_embedding = row[0]
            else:
                # If no embedding found, generate it and return it
                data = self.requestor.send_data(
                    EmbeddingsRequestEnum.embed_thumbnail.value,
                    {"id": str(query.id), "thumbnail": str(query.thumbnail)},
                )
                if not data:
                    return []
                query_embedding = serialize(data)
        else:
            data = self.requestor.send_data(
                EmbeddingsRequestEnum.generate_search.value, query
            )
            if not data:
                return []
            query_embedding = serialize(data)

        return self._knn_search(
            "vec_thumbnails", "thumbnail_embedding", query_embedding, event_ids
        )

    def search_description(
        self, query_text: str, event_ids: Optional[list[str]] = None
    ) -> list[tuple[str, float]]:
        """Find descriptions nearest to a text query.

        Args:
            query_text: Search string sent to the requestor to be embedded.
            event_ids: Optional ids used to restrict the results.

        Returns:
            ``(id, distance)`` rows ordered by distance, or an empty list
            when the requestor returns no embedding data.
        """
        data = self.requestor.send_data(
            EmbeddingsRequestEnum.generate_search.value, query_text
        )
        if not data:
            return []

        return self._knn_search(
            "vec_descriptions",
            "description_embedding",
            serialize(data),
            event_ids,
        )

    def update_description(self, event_id: str, description: str) -> None:
        """Send an ``embed_description`` request for an event's description.

        Args:
            event_id: Id of the event the description belongs to.
            description: Description text to embed.
        """
        self.requestor.send_data(
            EmbeddingsRequestEnum.embed_description.value,
            {"id": event_id, "description": description},
        )
|