Add ability to use Jina CLIP V2 for semantic search (#16826)
* add wheels
* move extra index url to bottom
* config model option
* add postprocess
* fix config
* jina v2 embedding class
* use jina v2 in embeddings
* fix ov inference
* frontend
* update reference config
* revert device
* fix truncation
* return np tensors
* use correct embeddings from inference
* manual preprocess
* clean up
* docs
* lower batch size for v2 only
* docs clarity
* wording
@@ -1,3 +1,4 @@
+from enum import Enum
 from typing import Dict, List, Optional
 
 from pydantic import Field
@@ -11,6 +12,11 @@ __all__ = [
 ]
 
 
+class SemanticSearchModelEnum(str, Enum):
+    jinav1 = "jinav1"
+    jinav2 = "jinav2"
+
+
 class BirdClassificationConfig(FrigateBaseModel):
     enabled: bool = Field(default=False, title="Enable bird classification.")
     threshold: float = Field(
@@ -30,7 +36,11 @@ class ClassificationConfig(FrigateBaseModel):
 class SemanticSearchConfig(FrigateBaseModel):
     enabled: bool = Field(default=False, title="Enable semantic search.")
     reindex: Optional[bool] = Field(
-        default=False, title="Reindex all detections on startup."
+        default=False, title="Reindex all tracked objects on startup."
     )
+    model: Optional[SemanticSearchModelEnum] = Field(
+        default=SemanticSearchModelEnum.jinav1,
+        title="The CLIP model to use for semantic search.",
+    )
     model_size: str = Field(
         default="small", title="The size of the embeddings model used."
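Together, these two hunks add an opt-in model selector to the semantic search config. A minimal sketch of the resulting surface, constructing the pydantic model directly (the import path matches the embeddings.py hunk below):

from frigate.config.classification import (
    SemanticSearchConfig,
    SemanticSearchModelEnum,
)

# Select Jina CLIP V2; omitting `model` keeps the jinav1 default.
cfg = SemanticSearchConfig(
    enabled=True,
    model=SemanticSearchModelEnum.jinav2,
    model_size="large",  # "large" also flips the device selection to GPU below
)
assert cfg.model == SemanticSearchModelEnum.jinav2
assert cfg.reindex is False  # default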
@@ -10,6 +10,7 @@ from playhouse.shortcuts import model_to_dict
 from frigate.comms.inter_process import InterProcessRequestor
 from frigate.config import FrigateConfig
+from frigate.config.classification import SemanticSearchModelEnum
 from frigate.const import (
     CONFIG_DIR,
     UPDATE_EMBEDDINGS_REINDEX_PROGRESS,
@@ -23,6 +24,7 @@ from frigate.util.builtin import serialize
 from frigate.util.path import get_event_thumbnail_bytes
 
 from .onnx.jina_v1_embedding import JinaV1ImageEmbedding, JinaV1TextEmbedding
+from .onnx.jina_v2_embedding import JinaV2Embedding
 
 logger = logging.getLogger(__name__)
@@ -75,18 +77,7 @@ class Embeddings:
         # Create tables if they don't exist
         self.db.create_embeddings_tables()
 
-        models = [
-            "jinaai/jina-clip-v1-text_model_fp16.onnx",
-            "jinaai/jina-clip-v1-tokenizer",
-            "jinaai/jina-clip-v1-vision_model_fp16.onnx"
-            if config.semantic_search.model_size == "large"
-            else "jinaai/jina-clip-v1-vision_model_quantized.onnx",
-            "jinaai/jina-clip-v1-preprocessor_config.json",
-            "facenet-facenet.onnx",
-            "paddleocr-onnx-detection.onnx",
-            "paddleocr-onnx-classification.onnx",
-            "paddleocr-onnx-recognition.onnx",
-        ]
+        models = self.get_model_definitions()
 
         for model in models:
             self.requestor.send_data(
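Hoisting the hard-coded V1 manifest into get_model_definitions() (added in the next hunk) lets the download list vary with the configured model while the download loop itself stays unchanged.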
@@ -97,17 +88,64 @@ class Embeddings:
             },
         )
 
-        self.text_embedding = JinaV1TextEmbedding(
-            model_size=config.semantic_search.model_size,
-            requestor=self.requestor,
-            device="CPU",
-        )
-        self.vision_embedding = JinaV1ImageEmbedding(
-            model_size=config.semantic_search.model_size,
-            requestor=self.requestor,
-            device="GPU" if config.semantic_search.model_size == "large" else "CPU",
-        )
+        if self.config.semantic_search.model == SemanticSearchModelEnum.jinav2:
+            # Single JinaV2Embedding instance for both text and vision
+            self.embedding = JinaV2Embedding(
+                model_size=self.config.semantic_search.model_size,
+                requestor=self.requestor,
+                device="GPU"
+                if self.config.semantic_search.model_size == "large"
+                else "CPU",
+            )
+            self.text_embedding = lambda input_data: self.embedding(
+                input_data, embedding_type="text"
+            )
+            self.vision_embedding = lambda input_data: self.embedding(
+                input_data, embedding_type="vision"
+            )
+        else:  # Default to jinav1
+            self.text_embedding = JinaV1TextEmbedding(
+                model_size=config.semantic_search.model_size,
+                requestor=self.requestor,
+                device="CPU",
+            )
+            self.vision_embedding = JinaV1ImageEmbedding(
+                model_size=config.semantic_search.model_size,
+                requestor=self.requestor,
+                device="GPU" if config.semantic_search.model_size == "large" else "CPU",
+            )
+
+    def get_model_definitions(self):
+        # Version-specific models
+        if self.config.semantic_search.model == SemanticSearchModelEnum.jinav2:
+            models = [
+                "jinaai/jina-clip-v2-tokenizer",
+                "jinaai/jina-clip-v2-model_fp16.onnx"
+                if self.config.semantic_search.model_size == "large"
+                else "jinaai/jina-clip-v2-model_quantized.onnx",
+                "jinaai/jina-clip-v2-preprocessor_config.json",
+            ]
+        else:  # Default to jinav1
+            models = [
+                "jinaai/jina-clip-v1-text_model_fp16.onnx",
+                "jinaai/jina-clip-v1-tokenizer",
+                "jinaai/jina-clip-v1-vision_model_fp16.onnx"
+                if self.config.semantic_search.model_size == "large"
+                else "jinaai/jina-clip-v1-vision_model_quantized.onnx",
+                "jinaai/jina-clip-v1-preprocessor_config.json",
+            ]
+
+        # Add common models
+        models.extend(
+            [
+                "facenet-facenet.onnx",
+                "paddleocr-onnx-detection.onnx",
+                "paddleocr-onnx-classification.onnx",
+                "paddleocr-onnx-recognition.onnx",
+            ]
+        )
+
+        return models
 
     def embed_thumbnail(
         self, event_id: str, thumbnail: bytes, upsert: bool = True
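Because V2 ships as a single fused text+vision ONNX graph, one JinaV2Embedding instance backs both modalities, and the lambdas keep the call sites identical to V1. A hedged sketch of the resulting call pattern (the embeddings instance and thumbnail bytes are placeholders):

# embeddings: an initialized Embeddings instance (placeholder)
# thumbnail: JPEG bytes for one tracked object (placeholder)
text_vectors = embeddings.text_embedding(["red car in the driveway"])
image_vectors = embeddings.vision_embedding([thumbnail])
# Under jinav2 both calls route through the same self.embedding object,
# differing only in embedding_type; under jinav1 they hit separate models.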
@@ -244,7 +282,11 @@ class Embeddings:
         # Get total count of events to process
         total_events = Event.select().count()
 
-        batch_size = 32
+        batch_size = (
+            4
+            if self.config.semantic_search.model == SemanticSearchModelEnum.jinav2
+            else 32
+        )
         current_page = 1
 
         totals = {
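The reindex batch size drops from 32 to 4 when V2 is selected. The commit message only says "lower batch size for v2 only"; plausibly the much larger fused V2 graph, which always carries tensors for both modalities, needs smaller batches to bound memory during bulk reindexing.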
@@ -72,6 +72,9 @@ class BaseEmbedding(ABC):
 
         return image
 
+    def _postprocess_outputs(self, outputs: any) -> any:
+        return outputs
+
     def __call__(
         self, inputs: list[str] | list[Image.Image] | list[str]
     ) -> list[np.ndarray]:
@@ -91,5 +94,7 @@ class BaseEmbedding(ABC):
             else:
                 logger.warning(f"Expected input '{key}' not found in onnx_inputs")
 
-        embeddings = self.runner.run(onnx_inputs)[0]
+        outputs = self.runner.run(onnx_inputs)[0]
+        embeddings = self._postprocess_outputs(outputs)
 
         return [embedding for embedding in embeddings]
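Since the base hook is the identity, V1 embeddings flow through __call__ exactly as before, while subclasses get one place to reshape raw ONNX outputs; JinaV2Embedding overrides it below to truncate Matryoshka embeddings. A standalone sketch of the two behaviors (function names are illustrative):

import numpy as np

def postprocess_identity(outputs: np.ndarray) -> np.ndarray:
    # Base-class behavior: pass raw ONNX outputs through untouched.
    return outputs

def postprocess_truncate(outputs: np.ndarray, dim: int = 768) -> np.ndarray:
    # V2-style override: keep only the leading `dim` Matryoshka components.
    return outputs[..., :dim] if outputs.shape[-1] > dim else outputs

raw = np.random.rand(2, 1024).astype(np.float32)
assert postprocess_identity(raw).shape == (2, 1024)
assert postprocess_truncate(raw).shape == (2, 768)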
frigate/embeddings/onnx/jina_v2_embedding.py (new file, 231 lines)
@@ -0,0 +1,231 @@
"""JinaV2 Embeddings."""

import io
import logging
import os

import numpy as np
from PIL import Image
from transformers import AutoTokenizer
from transformers.utils.logging import disable_progress_bar, set_verbosity_error

from frigate.comms.inter_process import InterProcessRequestor
from frigate.const import MODEL_CACHE_DIR, UPDATE_MODEL_STATE
from frigate.types import ModelStatusTypesEnum
from frigate.util.downloader import ModelDownloader

from .base_embedding import BaseEmbedding
from .runner import ONNXModelRunner

# disables the progress bar and download logging for downloading tokenizers and image processors
disable_progress_bar()
set_verbosity_error()
logger = logging.getLogger(__name__)


class JinaV2Embedding(BaseEmbedding):
    def __init__(
        self,
        model_size: str,
        requestor: InterProcessRequestor,
        device: str = "AUTO",
        embedding_type: str = None,
    ):
        model_file = (
            "model_fp16.onnx" if model_size == "large" else "model_quantized.onnx"
        )
        super().__init__(
            model_name="jinaai/jina-clip-v2",
            model_file=model_file,
            download_urls={
                model_file: f"https://huggingface.co/jinaai/jina-clip-v2/resolve/main/onnx/{model_file}",
                "preprocessor_config.json": "https://huggingface.co/jinaai/jina-clip-v2/resolve/main/preprocessor_config.json",
            },
        )
        self.tokenizer_file = "tokenizer"
        self.embedding_type = embedding_type
        self.requestor = requestor
        self.model_size = model_size
        self.device = device
        self.download_path = os.path.join(MODEL_CACHE_DIR, self.model_name)
        self.tokenizer = None
        self.image_processor = None
        self.runner = None
        files_names = list(self.download_urls.keys()) + [self.tokenizer_file]
        if not all(
            os.path.exists(os.path.join(self.download_path, n)) for n in files_names
        ):
            logger.debug(f"starting model download for {self.model_name}")
            self.downloader = ModelDownloader(
                model_name=self.model_name,
                download_path=self.download_path,
                file_names=files_names,
                download_func=self._download_model,
            )
            self.downloader.ensure_model_files()
        else:
            self.downloader = None
            ModelDownloader.mark_files_state(
                self.requestor,
                self.model_name,
                files_names,
                ModelStatusTypesEnum.downloaded,
            )
            self._load_model_and_utils()
            logger.debug(f"models are already downloaded for {self.model_name}")

    def _download_model(self, path: str):
        try:
            file_name = os.path.basename(path)

            if file_name in self.download_urls:
                ModelDownloader.download_from_url(self.download_urls[file_name], path)
            elif file_name == self.tokenizer_file:
                if not os.path.exists(os.path.join(path, self.model_name)):
                    logger.info(f"Downloading {self.model_name} tokenizer")

                tokenizer = AutoTokenizer.from_pretrained(
                    self.model_name,
                    trust_remote_code=True,
                    cache_dir=os.path.join(
                        MODEL_CACHE_DIR, self.model_name, "tokenizer"
                    ),
                    clean_up_tokenization_spaces=True,
                )
                tokenizer.save_pretrained(path)

            self.requestor.send_data(
                UPDATE_MODEL_STATE,
                {
                    "model": f"{self.model_name}-{file_name}",
                    "state": ModelStatusTypesEnum.downloaded,
                },
            )
        except Exception:
            self.requestor.send_data(
                UPDATE_MODEL_STATE,
                {
                    "model": f"{self.model_name}-{file_name}",
                    "state": ModelStatusTypesEnum.error,
                },
            )

    def _load_model_and_utils(self):
        if self.runner is None:
            if self.downloader:
                self.downloader.wait_for_download()

            tokenizer_path = os.path.join(
                f"{MODEL_CACHE_DIR}/{self.model_name}/tokenizer"
            )
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                cache_dir=tokenizer_path,
                trust_remote_code=True,
                clean_up_tokenization_spaces=True,
            )

            self.runner = ONNXModelRunner(
                os.path.join(self.download_path, self.model_file),
                self.device,
                self.model_size,
            )

    def _preprocess_image(self, image_data: bytes | Image.Image) -> np.ndarray:
        """
        Manually preprocess a single image from bytes or PIL.Image to (3, 512, 512).
        """
        if isinstance(image_data, bytes):
            image = Image.open(io.BytesIO(image_data))
        else:
            image = image_data

        if image.mode != "RGB":
            image = image.convert("RGB")

        image = image.resize((512, 512), Image.Resampling.LANCZOS)

        # Convert to numpy array, normalize to [0, 1], and transpose to (channels, height, width)
        image_array = np.array(image, dtype=np.float32) / 255.0
        image_array = np.transpose(image_array, (2, 0, 1))  # (H, W, C) -> (C, H, W)

        return image_array

    def _preprocess_inputs(self, raw_inputs):
        """
        Preprocess inputs into a list of real input tensors (no dummies).
        - For text: Returns list of input_ids.
        - For vision: Returns list of pixel_values.
        """
        if not isinstance(raw_inputs, list):
            raw_inputs = [raw_inputs]

        processed = []
        if self.embedding_type == "text":
            for text in raw_inputs:
                input_ids = self.tokenizer([text], return_tensors="np")["input_ids"]
                processed.append(input_ids)
        elif self.embedding_type == "vision":
            for img in raw_inputs:
                pixel_values = self._preprocess_image(img)
                processed.append(
                    pixel_values[np.newaxis, ...]
                )  # Add batch dim: (1, 3, 512, 512)
        else:
            raise ValueError(
                f"Invalid embedding_type: {self.embedding_type}. Must be 'text' or 'vision'."
            )
        return processed

    def _postprocess_outputs(self, outputs):
        """
        Process ONNX model outputs, truncating each embedding in the array to truncate_dim.
        - outputs: NumPy array of embeddings.
        - Returns: List of truncated embeddings.
        """
        # size of vector in database
        truncate_dim = 768

        # jina v2 defaults to 1024 and uses Matryoshka representation, so
        # truncating only causes an extremely minor decrease in retrieval accuracy
        if outputs.shape[-1] > truncate_dim:
            outputs = outputs[..., :truncate_dim]

        return outputs

    def __call__(
        self, inputs: list[str] | list[Image.Image] | list[str], embedding_type=None
    ) -> list[np.ndarray]:
        self.embedding_type = embedding_type
        if not self.embedding_type:
            raise ValueError(
                "embedding_type must be specified either in __init__ or __call__"
            )

        self._load_model_and_utils()
        processed = self._preprocess_inputs(inputs)
        batch_size = len(processed)

        # Prepare ONNX inputs with matching batch sizes
        onnx_inputs = {}
        if self.embedding_type == "text":
            onnx_inputs["input_ids"] = np.stack([x[0] for x in processed])
            onnx_inputs["pixel_values"] = np.zeros(
                (batch_size, 3, 512, 512), dtype=np.float32
            )
        elif self.embedding_type == "vision":
            onnx_inputs["input_ids"] = np.zeros((batch_size, 16), dtype=np.int64)
            onnx_inputs["pixel_values"] = np.stack([x[0] for x in processed])
        else:
            raise ValueError("Invalid embedding type")

        # Run inference
        outputs = self.runner.run(onnx_inputs)

        if self.embedding_type == "text":
            embeddings = outputs[2]  # text embeddings
        elif self.embedding_type == "vision":
            embeddings = outputs[3]  # image embeddings
        else:
            raise ValueError("Invalid embedding type")

        embeddings = self._postprocess_outputs(embeddings)
        return [embedding for embedding in embeddings]
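Two details of this class are worth calling out: the fused graph expects both input_ids and pixel_values on every run, so __call__ zero-fills whichever modality is unused, and outputs are truncated from V2's native 1024 dimensions to the database's 768. A hedged usage sketch (the requestor and input bytes are placeholders; shapes assume the defaults above):

# requestor: an InterProcessRequestor from the running app (placeholder)
embedder = JinaV2Embedding(model_size="small", requestor=requestor, device="CPU")

text_vec = embedder(["person walking a dog"], embedding_type="text")[0]
image_vec = embedder([jpeg_bytes], embedding_type="vision")[0]  # bytes or PIL.Image

# Both modalities land in the same truncated space, so text queries are
# directly comparable to stored thumbnail embeddings.
assert text_vec.shape == image_vec.shape == (768,)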
@@ -66,14 +66,9 @@ class ONNXModelRunner:
     def run(self, input: dict[str, Any]) -> Any:
         if self.type == "ov":
             infer_request = self.interpreter.create_infer_request()
-            input_tensor = list(input.values())
-
-            if len(input_tensor) == 1:
-                input_tensor = ov.Tensor(array=input_tensor[0])
-            else:
-                input_tensor = ov.Tensor(array=input_tensor)
 
-            infer_request.infer(input_tensor)
-            return [infer_request.get_output_tensor().data]
+            outputs = infer_request.infer(input)
+
+            return outputs
         elif self.type == "ort":
             return self.ort.run(None, input)
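This is the "fix ov inference" bullet from the commit message: the old path flattened the named inputs into a single ov.Tensor and returned only the first output tensor, which cannot serve a graph with two named inputs and multiple outputs. OpenVINO's infer_request.infer() accepts the input dict directly and returns all outputs, which the V2 path needs in order to pick the text and image embeddings by index.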