Cleanup detection (#17785)

* Fix yolov9 NMS
* Improve batched yolo NMS
* Consolidate grids and strides calculation
* Use existing variable
* Remove
* Ensure init is called
2025-09-28 17:53:51 +02:00 · 2025-04-18 10:26:34 -06:00 · 2025-04-18 10:26:34 -06:00 · 68382d89b4
commit 68382d89b4
parent 14a32a6472
5 changed files with 116 additions and 121 deletions
--- a/frigate/detectors/detection_api.py
+++ b/frigate/detectors/detection_api.py
@ -16,7 +16,7 @@ class DetectionApi(ABC):
    @abstractmethod
    def __init__(self, detector_config: BaseDetectorConfig):
        self.detector_config = detector_config
-        self.thresh = 0.5
+        self.thresh = 0.4
        self.height = detector_config.model.height
        self.width = detector_config.model.width
@ -24,58 +24,21 @@ class DetectionApi(ABC):
    def detect_raw(self, tensor_input):
        pass
-    def post_process_yolonas(self, output):
+    def calculate_grids_strides(self) -> None:
-        """
+        grids = []
-        @param output: output of inference
+        expanded_strides = []
        expected shape: [np.array(1, N, 4), np.array(1, N, 80)]
        where N depends on the input size e.g. N=2100 for 320x320 images
-        @return: best results: np.array(20, 6) where each row is
+        # decode and orient predictions
-        in this order (class_id, score, y1/height, x1/width, y2/height, x2/width)
+        strides = [8, 16, 32]
-        """
+        hsizes = [self.height // stride for stride in strides]
        wsizes = [self.width // stride for stride in strides]
-        N = output[0].shape[1]
+        for hsize, wsize, stride in zip(hsizes, wsizes, strides):
            xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
            grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
            grids.append(grid)
            shape = grid.shape[:2]
            expanded_strides.append(np.full((*shape, 1), stride))
-        boxes = output[0].reshape(N, 4)
+        self.grids = np.concatenate(grids, 1)
-        scores = output[1].reshape(N, 80)
+        self.expanded_strides = np.concatenate(expanded_strides, 1)
        class_ids = np.argmax(scores, axis=1)
        scores = scores[np.arange(N), class_ids]
        args_best = np.argwhere(scores > self.thresh)[:, 0]
        num_matches = len(args_best)
        if num_matches == 0:
            return np.zeros((20, 6), np.float32)
        elif num_matches > 20:
            args_best20 = np.argpartition(scores[args_best], -20)[-20:]
            args_best = args_best[args_best20]
        boxes = boxes[args_best]
        class_ids = class_ids[args_best]
        scores = scores[args_best]
        boxes = np.transpose(
            np.vstack(
                (
                    boxes[:, 1] / self.height,
                    boxes[:, 0] / self.width,
                    boxes[:, 3] / self.height,
                    boxes[:, 2] / self.width,
                )
            )
        )
        results = np.hstack(
            (class_ids[..., np.newaxis], scores[..., np.newaxis], boxes)
        )
        return np.resize(results, (20, 6))
    def post_process(self, output):
        if self.detector_config.model.model_type == ModelTypeEnum.yolonas:
            return self.post_process_yolonas(output)
        else:
            raise ValueError(
                f'Model type "{self.detector_config.model.model_type}" is currently not supported.'
            )
--- a/frigate/detectors/plugins/onnx.py
+++ b/frigate/detectors/plugins/onnx.py
@ -31,6 +31,8 @@ class ONNXDetector(DetectionApi):
    type_key = DETECTOR_KEY
    def __init__(self, detector_config: ONNXDetectorConfig):
        super().__init__(detector_config)
        try:
            import onnxruntime as ort
@ -52,31 +54,13 @@ class ONNXDetector(DetectionApi):
            path, providers=providers, provider_options=options
        )
        self.h = detector_config.model.height
        self.w = detector_config.model.width
        self.onnx_model_type = detector_config.model.model_type
        self.onnx_model_px = detector_config.model.input_pixel_format
        self.onnx_model_shape = detector_config.model.input_tensor
        path = detector_config.model.path
        if self.onnx_model_type == ModelTypeEnum.yolox:
-            grids = []
+            self.calculate_grids_strides()
            expanded_strides = []
            # decode and orient predictions
            strides = [8, 16, 32]
            hsizes = [self.h // stride for stride in strides]
            wsizes = [self.w // stride for stride in strides]
            for hsize, wsize, stride in zip(hsizes, wsizes, strides):
                xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
                grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
                grids.append(grid)
                shape = grid.shape[:2]
                expanded_strides.append(np.full((*shape, 1), stride))
            self.grids = np.concatenate(grids, 1)
            self.expanded_strides = np.concatenate(expanded_strides, 1)
        logger.info(f"ONNX: {path} loaded")
@ -86,10 +70,12 @@ class ONNXDetector(DetectionApi):
                None,
                {
                    "images": tensor_input,
-                    "orig_target_sizes": np.array([[self.h, self.w]], dtype=np.int64),
+                    "orig_target_sizes": np.array(
                        [[self.height, self.width]], dtype=np.int64
                    ),
                },
            )
-            return post_process_dfine(tensor_output, self.w, self.h)
+            return post_process_dfine(tensor_output, self.width, self.height)
        model_input_name = self.model.get_inputs()[0].name
        tensor_output = self.model.run(None, {model_input_name: tensor_input})
@ -111,17 +97,21 @@ class ONNXDetector(DetectionApi):
                detections[i] = [
                    class_id,
                    confidence,
-                    y_min / self.h,
+                    y_min / self.height,
-                    x_min / self.w,
+                    x_min / self.width,
-                    y_max / self.h,
+                    y_max / self.height,
-                    x_max / self.w,
+                    x_max / self.width,
                ]
            return detections
        elif self.onnx_model_type == ModelTypeEnum.yologeneric:
-            return post_process_yolo(tensor_output, self.w, self.h)
+            return post_process_yolo(tensor_output, self.width, self.height)
        elif self.onnx_model_type == ModelTypeEnum.yolox:
            return post_process_yolox(
-                tensor_output[0], self.w, self.h, self.grids, self.expanded_strides
+                tensor_output[0],
                self.width,
                self.height,
                self.grids,
                self.expanded_strides,
            )
        else:
            raise Exception(
--- a/frigate/detectors/plugins/openvino.py
+++ b/frigate/detectors/plugins/openvino.py
@ -38,6 +38,7 @@ class OvDetector(DetectionApi):
    ]
    def __init__(self, detector_config: OvDetectorConfig):
        super().__init__(detector_config)
        self.ov_core = ov.Core()
        self.ov_model_type = detector_config.model.model_type
@ -133,25 +134,7 @@ class OvDetector(DetectionApi):
                    break
            self.num_classes = tensor_shape[2] - 5
            logger.info(f"YOLOX model has {self.num_classes} classes")
-            self.set_strides_grids()
+            self.calculate_grids_strides()
    def set_strides_grids(self):
        """Precompute the cell grids and per-cell strides for the three output scales.

        Reads ``self.h`` and ``self.w`` and, for each stride in (8, 16, 32),
        builds the (x, y) index of every cell of the ``h // stride`` by
        ``w // stride`` grid. The per-scale pieces are concatenated along
        axis 1 and stored as ``self.grids`` (shape ``(1, cells, 2)``) and
        ``self.expanded_strides`` (shape ``(1, cells, 1)``).
        """
        grid_parts = []
        stride_parts = []

        for stride in (8, 16, 32):
            rows = self.h // stride
            cols = self.w // stride

            # x varies fastest, so cells are laid out row by row
            xs, ys = np.meshgrid(np.arange(cols), np.arange(rows))
            cells = np.stack((xs, ys), 2).reshape(1, -1, 2)
            grid_parts.append(cells)

            # one stride value per cell of this scale
            stride_parts.append(np.full((*cells.shape[:2], 1), stride))

        self.grids = np.concatenate(grid_parts, 1)
        self.expanded_strides = np.concatenate(stride_parts, 1)
    ## Takes in class ID, confidence score, and array of [x, y, w, h] that describes detection position,
    ## returns an array that's easily passable back to Frigate.
--- a/frigate/detectors/plugins/rknn.py
+++ b/frigate/detectors/plugins/rknn.py
@ -4,6 +4,7 @@ import re
 import urllib.request
 from typing import Literal
 import numpy as np
 from pydantic import Field
 from frigate.const import MODEL_CACHE_DIR
@ -150,6 +151,62 @@ class Rknn(DetectionApi):
                'Make sure to set the model input_tensor to "nhwc" in your config.'
            )
    def post_process_yolonas(self, output: list[np.ndarray]):
        """
        Reduce raw YOLO-NAS inference output to a fixed-size detection array.

        @param output: output of inference
        expected shape: [np.array(1, N, 4), np.array(1, N, C)]
        where N depends on the input size e.g. N=2100 for 320x320 images
        and C is the number of classes (e.g. 80)
        @return: best results: np.array(20, 6) of float32 where each row is
        in this order (class_id, score, y1/height, x1/width, y2/height, x2/width);
        rows past the last match are all zeros
        """
        max_detections = 20
        # Unused rows stay zero. np.resize would instead repeat the matched
        # rows to fill the array, reporting the same detection several times.
        results = np.zeros((max_detections, 6), np.float32)

        N = output[0].shape[1]
        if N == 0:
            return results

        boxes = output[0].reshape(N, 4)
        # infer the class count from the tensor rather than assuming 80
        scores = output[1].reshape(N, -1)

        # keep only the best class per candidate
        class_ids = np.argmax(scores, axis=1)
        scores = scores[np.arange(N), class_ids]

        args_best = np.argwhere(scores > self.thresh)[:, 0]
        if args_best.size == 0:
            return results
        if args_best.size > max_detections:
            # too many candidates: keep the highest scoring ones (unordered)
            top = np.argpartition(scores[args_best], -max_detections)[
                -max_detections:
            ]
            args_best = args_best[top]

        count = args_best.size
        boxes = boxes[args_best]

        results[:count, 0] = class_ids[args_best]
        results[:count, 1] = scores[args_best]
        # box columns are read as (x1, y1, x2, y2) and written out as
        # normalized (y1, x1, y2, x2)
        results[:count, 2] = boxes[:, 1] / self.height
        results[:count, 3] = boxes[:, 0] / self.width
        results[:count, 4] = boxes[:, 3] / self.height
        results[:count, 5] = boxes[:, 2] / self.width

        return results
    def post_process(self, output):
        """Route raw inference output to the post-processor for the configured model type.

        Only ``ModelTypeEnum.yolonas`` is handled; any other configured model
        type raises ``ValueError``.
        """
        model_type = self.detector_config.model.model_type

        # reject unsupported model types up front
        if model_type != ModelTypeEnum.yolonas:
            raise ValueError(
                f'Model type "{self.detector_config.model.model_type}" is currently not supported.'
            )

        return self.post_process_yolonas(output)
    def detect_raw(self, tensor_input):
        output = self.rknn.inference(
            [
--- a/frigate/util/model.py
+++ b/frigate/util/model.py
@ -148,27 +148,17 @@ def __post_process_multipart_yolo(
                    bw = ((dw * 2.0) ** 2) * anchor_w
                    bh = ((dh * 2.0) ** 2) * anchor_h
-                    x1 = max(0, bx - bw / 2) / width
+                    x1 = max(0, bx - bw / 2)
-                    y1 = max(0, by - bh / 2) / height
+                    y1 = max(0, by - bh / 2)
-                    x2 = min(width, bx + bw / 2) / width
+                    x2 = min(width, bx + bw / 2)
-                    y2 = min(height, by + bh / 2) / height
+                    y2 = min(height, by + bh / 2)
                    all_boxes.append([x1, y1, x2, y2])
                    all_scores.append(conf)
                    all_class_ids.append(class_id)
    formatted_boxes = [
        [
            int(x1 * width),
            int(y1 * height),
            int((x2 - x1) * width),
            int((y2 - y1) * height),
        ]
        for x1, y1, x2, y2 in all_boxes
    ]
    indices = cv2.dnn.NMSBoxes(
-        bboxes=formatted_boxes,
+        bboxes=all_boxes,
        scores=all_scores,
        score_threshold=0.4,
        nms_threshold=0.4,
@ -181,7 +171,14 @@ def __post_process_multipart_yolo(
            class_id = all_class_ids[idx]
            conf = all_scores[idx]
            x1, y1, x2, y2 = all_boxes[idx]
-            results[i] = [class_id, conf, y1, x1, y2, x2]
+            results[i] = [
                class_id,
                conf,
                y1 / height,
                x1 / width,
                y2 / height,
                x2 / width,
            ]
    return np.array(results, dtype=np.float32)
@ -200,9 +197,14 @@ def __post_process_nms_yolo(predictions: np.ndarray, width, height) -> np.ndarra
    # Rescale box
    boxes = predictions[:, :4]
    boxes_xyxy = np.ones_like(boxes)
    boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2
    boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2
    boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2
    boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2
    boxes = boxes_xyxy
-    input_shape = np.array([width, height, width, height])
+    # run NMS
    boxes = np.divide(boxes, input_shape, dtype=np.float32)
    indices = cv2.dnn.NMSBoxes(boxes, scores, score_threshold=0.4, nms_threshold=0.4)
    detections = np.zeros((20, 6), np.float32)
    for i, (bbox, confidence, class_id) in enumerate(
@ -214,10 +216,10 @@ def __post_process_nms_yolo(predictions: np.ndarray, width, height) -> np.ndarra
        detections[i] = [
            class_id,
            confidence,
-            bbox[1] - bbox[3] / 2,
+            bbox[1] / height,
-            bbox[0] - bbox[2] / 2,
+            bbox[0] / width,
-            bbox[1] + bbox[3] / 2,
+            bbox[3] / height,
-            bbox[0] + bbox[2] / 2,
+            bbox[2] / width,
        ]
    return detections