blakeblackshear.frigate/frigate/detectors/plugins/tensorrt.py

import ctypes
import logging

import numpy as np

try:
    import tensorrt as trt
    from cuda import cuda

    TRT_SUPPORT = True
except ModuleNotFoundError:
    TRT_SUPPORT = False

from pydantic import Field
from typing_extensions import Literal

from frigate.detectors.detection_api import DetectionApi
from frigate.detectors.detector_config import BaseDetectorConfig

logger = logging.getLogger(__name__)

DETECTOR_KEY = "tensorrt"

if TRT_SUPPORT:

    class TrtLogger(trt.ILogger):
        """TensorRT logger that forwards every record to this module's ``logger``."""

        def __init__(self):
            trt.ILogger.__init__(self)

        def log(self, severity, msg):
            """Log ``msg`` at the Python level matching the TensorRT ``severity``."""
            python_level = self.getSeverity(severity)
            logger.log(python_level, msg)

        def getSeverity(self, sev: trt.ILogger.Severity) -> int:
            """Translate a TensorRT severity into a ``logging`` level.

            Severities that match none of the known values map to
            ``logging.DEBUG``.
            """
            severity_table = (
                (trt.ILogger.VERBOSE, logging.DEBUG),
                (trt.ILogger.INFO, logging.INFO),
                (trt.ILogger.WARNING, logging.WARNING),
                (trt.ILogger.ERROR, logging.ERROR),
                (trt.ILogger.INTERNAL_ERROR, logging.CRITICAL),
            )
            for trt_severity, python_level in severity_table:
                if sev == trt_severity:
                    return python_level
            return logging.DEBUG


class TensorRTDetectorConfig(BaseDetectorConfig):
    # Configuration model for the TensorRT detector plugin.
    # `type` only accepts the literal DETECTOR_KEY value ("tensorrt").
    type: Literal[DETECTOR_KEY]
    # Index of the GPU to run on (default 0). TensorRtDetector.__init__ checks
    # it against the CUDA device count and passes it to cuCtxCreate.
    device: int = Field(default=0, title="GPU Device Index")


class HostDeviceMem(object):
    """Pair of a host buffer and a device buffer that belong together.

    The object owns both allocations and releases them in ``__del__``.

    Attributes:
        host: host buffer handle as passed in. Callers may rebind this
            attribute (``detect_raw`` stores a numpy array here); the
            allocation that is eventually freed is tracked separately.
        host_dev: value returned by ``cuMemHostGetDevicePointer`` for the
            host buffer.
        device: device buffer handle as passed in.
        nbytes: size of each buffer in bytes.
        size: number of elements in each buffer.
    """

    def __init__(self, host_mem, device_mem, nbytes, size):
        # Remember the owned allocations privately and before any CUDA call:
        # `host` gets reassigned by detect_raw, and __del__ must still free
        # the original allocations even if the lookup below raises.
        self._host_alloc = host_mem
        self._device_alloc = device_mem
        self.host = host_mem
        self.device = device_mem
        self.nbytes = nbytes
        self.size = size
        err, self.host_dev = cuda.cuMemHostGetDevicePointer(host_mem, 0)
        if err != cuda.CUresult.CUDA_SUCCESS:
            # Not fatal: host_dev is not used by the copy based inference path.
            logger.warning("cuMemHostGetDevicePointer returned %s", err)

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

    def __del__(self):
        """Free the host and device allocations handed to the constructor."""
        # getattr guards against a constructor that failed before assignment.
        host_alloc = getattr(self, "_host_alloc", None)
        if host_alloc is not None:
            cuda.cuMemFreeHost(host_alloc)
        device_alloc = getattr(self, "_device_alloc", None)
        if device_alloc is not None:
            cuda.cuMemFree(device_alloc)


class TensorRtDetector(DetectionApi):
    type_key = DETECTOR_KEY

    def _load_engine(self, model_path):
        """Deserialize the TensorRT engine stored at ``model_path``.

        Calls ``trt.init_libnvinfer_plugins`` and loads
        ``/usr/local/lib/libyolo_layer.so`` first. An ``OSError`` raised while
        doing so is logged and the engine is deserialized regardless.
        """
        try:
            trt.init_libnvinfer_plugins(self.trt_logger, "")

            ctypes.cdll.LoadLibrary("/usr/local/lib/libyolo_layer.so")
        except OSError as load_error:
            logger.error("ERROR: failed to load libraries. %s", load_error)

        with open(model_path, "rb") as engine_file:
            serialized_engine = engine_file.read()

        with trt.Runtime(self.trt_logger) as runtime:
            return runtime.deserialize_cuda_engine(serialized_engine)

    def _get_input_shape(self):
        """Describe the first binding of the engine, which must be an input.

        Returns:
            A tuple ``(dims, dtype)``. ``dims`` holds the binding dimensions
            from index 2 onwards for a 4-dimensional shape, or from index 1
            onwards for a 3-dimensional shape. ``dtype`` is the result of
            ``trt.nptype`` for the binding data type.

        Raises:
            ValueError: if the binding shape has neither 3 nor 4 dimensions.
        """
        binding = self.engine[0]
        assert self.engine.binding_is_input(binding)
        binding_dims = self.engine.get_binding_shape(binding)
        rank = len(binding_dims)

        if rank not in (3, 4):
            raise ValueError(
                "bad dims of binding %s: %s" % (binding, str(binding_dims))
            )

        # skip the leading two entries of a rank-4 shape, one of a rank-3 shape
        first_kept = 2 if rank == 4 else 1
        trailing_dims = tuple(binding_dims[first_kept:])
        numpy_dtype = trt.nptype(self.engine.get_binding_dtype(binding))
        return (trailing_dims, numpy_dtype)

    def _allocate_buffers(self):
        """Allocates all host/device in/out buffers required for an engine.

        For every binding the element count is derived from the binding shape
        (multiplied by ``engine.max_batch_size`` for 3-dimensional shapes). A
        host buffer is allocated with ``cuMemHostAlloc`` and a device buffer
        of the same byte size with ``cuMemAlloc``.

        Returns:
            A tuple ``(inputs, outputs, bindings)``: two lists of
            ``HostDeviceMem`` objects and the list of device buffer addresses
            as ``int`` in engine binding order.

        Raises:
            ValueError: if a binding shape has neither 3 nor 4 dimensions.
            AssertionError: if an allocation fails, an output element count
                is not a multiple of 7, or the engine does not expose exactly
                one input and one output.
        """
        inputs = []
        outputs = []
        bindings = []
        for binding in self.engine:
            binding_dims = self.engine.get_binding_shape(binding)
            if len(binding_dims) == 4:
                # explicit batch case (TensorRT 7+)
                size = trt.volume(binding_dims)
            elif len(binding_dims) == 3:
                # implicit batch case (TensorRT 6 or older)
                size = trt.volume(binding_dims) * self.engine.max_batch_size
            else:
                raise ValueError(
                    "bad dims of binding %s: %s" % (binding, str(binding_dims))
                )
            is_input = self.engine.binding_is_input(binding)
            if not is_input:
                # each grid has 3 anchors, each anchor generates a detection
                # output of 7 float32 values.
                # Checked before allocating so that a mismatching engine does
                # not leave buffers behind that nothing would free.
                assert size % 7 == 0, f"output size was {size}"
            binding_dtype = self.engine.get_binding_dtype(binding)
            nbytes = size * binding_dtype.itemsize
            # Allocate host and device buffers
            err, host_mem = cuda.cuMemHostAlloc(
                nbytes, Flags=cuda.CU_MEMHOSTALLOC_DEVICEMAP
            )
            assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"
            logger.debug(
                f"Allocated Tensor Binding {binding} Memory {nbytes} Bytes ({size} * {binding_dtype})"
            )
            err, device_mem = cuda.cuMemAlloc(nbytes)
            if err is not cuda.CUresult.CUDA_SUCCESS:
                # The host buffer is not yet owned by a HostDeviceMem, so it
                # has to be released here before reporting the failure.
                cuda.cuMemFreeHost(host_mem)
                raise AssertionError(f"cuMemAlloc returned {err}")
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if is_input:
                logger.debug(f"Input has Shape {binding_dims}")
                inputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
            else:
                logger.debug(f"Output has Shape {binding_dims}")
                outputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))
        assert len(inputs) == 1, f"inputs len was {len(inputs)}"
        assert len(outputs) == 1, f"output len was {len(outputs)}"
        return inputs, outputs, bindings

    def _do_inference(self):
        """do_inference (for TensorRT 7.0+)

        Copies every input buffer to the device, runs the execution context
        on ``self.stream``, copies every output buffer back and waits for the
        stream to finish. ``self.inputs`` and ``self.outputs`` are expected to
        be lists of HostDeviceMem objects.

        Returns:
            A list with one float32 numpy array per output, each copied from
            the host buffer of that output and ``out.size`` elements long.
        """
        # Push CUDA Context
        cuda.cuCtxPushCurrent(self.cu_ctx)
        try:
            # Transfer input data to the GPU.
            for inp in self.inputs:
                cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, self.stream)

            # Run inference.
            if not self.context.execute_async_v2(
                bindings=self.bindings, stream_handle=self.stream
            ):
                logger.warning("Execute returned false")

            # Transfer predictions back from the GPU.
            for out in self.outputs:
                cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, self.stream)

            # Synchronize the stream
            cuda.cuStreamSynchronize(self.stream)
        finally:
            # Pop CUDA Context, also when a call above raised, so the push
            # at the top is always balanced.
            cuda.cuCtxPopCurrent()

        # Return only the host outputs.
        return [
            np.array(
                (ctypes.c_float * out.size).from_address(out.host), dtype=np.float32
            )
            for out in self.outputs
        ]

    def __init__(self, detector_config: TensorRTDetectorConfig):
        """Create the CUDA context, load the engine and allocate its buffers.

        Args:
            detector_config: detector configuration; ``device`` selects the
                GPU and ``model.path`` is the serialized engine file.

        Raises:
            AssertionError: if TensorRT is not importable, a CUDA setup call
                reports an error, the device index is out of range, or the
                engine could not be deserialized.
            RuntimeError: if creating the execution context or allocating the
                buffers fails.
        """
        assert (
            TRT_SUPPORT
        ), f"TensorRT libraries not found, {DETECTOR_KEY} detector not present"

        (cuda_err,) = cuda.cuInit(0)
        assert (
            cuda_err == cuda.CUresult.CUDA_SUCCESS
        ), f"Failed to initialize cuda {cuda_err}"
        err, dev_count = cuda.cuDeviceGetCount()
        assert (
            err == cuda.CUresult.CUDA_SUCCESS
        ), f"Failed to query cuda device count {err}"
        logger.debug(f"Num Available Devices: {dev_count}")
        # A negative index would pass a plain upper-bound check.
        assert (
            0 <= detector_config.device < dev_count
        ), f"Invalid TensorRT Device Config. Device {detector_config.device} Invalid."
        err, self.cu_ctx = cuda.cuCtxCreate(
            cuda.CUctx_flags.CU_CTX_MAP_HOST, detector_config.device
        )
        assert (
            err == cuda.CUresult.CUDA_SUCCESS
        ), f"Failed to create cuda context {err}"

        self.conf_th = 0.4  ##TODO: model config parameter
        self.nms_threshold = 0.4
        err, self.stream = cuda.cuStreamCreate(0)
        assert (
            err == cuda.CUresult.CUDA_SUCCESS
        ), f"Failed to create cuda stream {err}"
        self.trt_logger = TrtLogger()
        self.engine = self._load_engine(detector_config.model.path)
        # Fail with a clear message here instead of an opaque TypeError when
        # the engine is indexed in _get_input_shape.
        assert (
            self.engine is not None
        ), f"Failed to load TensorRT engine from {detector_config.model.path}"
        self.input_shape = self._get_input_shape()

        try:
            self.context = self.engine.create_execution_context()
            (
                self.inputs,
                self.outputs,
                self.bindings,
            ) = self._allocate_buffers()
        except Exception as e:
            logger.error(e)
            raise RuntimeError("fail to allocate CUDA resources") from e

        logger.debug("TensorRT loaded. Input shape is %s", self.input_shape)
        # index 0 is the first character of the version string
        logger.debug("TensorRT version is %s", trt.__version__[0])

    def __del__(self):
        """Free CUDA memories."""
        if self.outputs is not None:
            del self.outputs
        if self.inputs is not None:
            del self.inputs
        if self.stream is not None:
            cuda.cuStreamDestroy(self.stream)
            del self.stream
        del self.engine
        del self.context
        del self.trt_logger
        cuda.cuCtxDestroy(self.cu_ctx)

    def _postprocess_yolo(self, trt_outputs, conf_th):
        """Postprocess TensorRT outputs.
        # Args
            trt_outputs: a list of 2 or 3 tensors, where each tensor
                        contains a multiple of 7 float32 numbers in
                        the order of [x, y, w, h, box_confidence, class_id, class_prob]
            conf_th: confidence threshold
        # Returns
            boxes, scores, classes
        """
        # filter low-conf detections and concatenate results of all yolo layers
        detections = []
        for o in trt_outputs:
            dets = o.reshape((-1, 7))
            dets = dets[dets[:, 4] * dets[:, 6] >= conf_th]
            detections.append(dets)
        detections = np.concatenate(detections, axis=0)

        return detections

    def detect_raw(self, tensor_input):
        # Input tensor has the shape of the [height, width, 3]
        # Output tensor of float32 of shape [20, 6] where:
        # O - class id
        # 1 - score
        # 2..5 - a value between 0 and 1 of the box: [top, left, bottom, right]

        # normalize
        if self.input_shape[-1] != trt.int8:
            tensor_input = tensor_input.astype(self.input_shape[-1])
            tensor_input /= 255.0

        self.inputs[0].host = np.ascontiguousarray(
            tensor_input.astype(self.input_shape[-1])
        )
        trt_outputs = self._do_inference()

        raw_detections = self._postprocess_yolo(trt_outputs, self.conf_th)

        if len(raw_detections) == 0:
            return np.zeros((20, 6), np.float32)

        # raw_detections: Nx7 numpy arrays of
        #             [[x, y, w, h, box_confidence, class_id, class_prob],

        # Calculate score as box_confidence x class_prob
        raw_detections[:, 4] = raw_detections[:, 4] * raw_detections[:, 6]
        # Reorder elements by the score, best on top, remove class_prob
        ordered = raw_detections[raw_detections[:, 4].argsort()[::-1]][:, 0:6]
        # transform width to right with clamp to 0..1
        ordered[:, 2] = np.clip(ordered[:, 2] + ordered[:, 0], 0, 1)
        # transform height to bottom with clamp to 0..1
        ordered[:, 3] = np.clip(ordered[:, 3] + ordered[:, 1], 0, 1)
        # put result into the correct order and limit to top 20
        detections = ordered[:, [5, 4, 1, 0, 3, 2]][:20]

        # pad to 20x6 shape
        append_cnt = 20 - len(detections)
        if append_cnt > 0:
            detections = np.append(
                detections, np.zeros((append_cnt, 6), np.float32), axis=0
            )

        return detections
Add isort and ruff linter (#6575) * Add isort and ruff linter Both linters are pretty common among modern python code bases. The isort tool provides stable sorting and grouping, as well as pruning of unused imports. Ruff is a modern linter, that is very fast due to being written in rust. It can detect many common issues in a python codebase. Removes the pylint dev requirement, since ruff replaces it. * treewide: fix issues detected by ruff * treewide: fix bare except clauses * .devcontainer: Set up isort * treewide: optimize imports * treewide: apply black * treewide: make regex patterns raw strings This is necessary for escape sequences to be properly recognized. 2023-05-29 12:31:17 +02:00			`import ctypes`
Nvidia TensorRT detector (#4718) * Initial WIP dockerfile and scripts to add tensorrt support * Add tensorRT detector * WIP attempt to install TensorRT 8.5 * Updates to detector for cuda python library * TensorRT Cuda library rework WIP Does not run * Fixes from rebase to detector factory * Fix parsing output memory pointer * Handle TensorRT logs with the python logger * Use non-async interface and convert input data to float32. Detection runs without error. * Make TensorRT a separate build from the base Frigate image. * Add script and documentation for generating TRT Models * Add support for TensorRT devcontainer * Add labelmap to trt model script and docs. Cleanup of old scripts. * Update detect to normalize input tensor using model input type * Add config for selecting GPU. Fix Async inference. Update documentation. * Update some CUDA libraries to clean up version warning * Add CI stage to build TensorRT tag * Add note in docs for image tag and model support 2022-12-30 17:53:17 +01:00			`import logging`

			`import numpy as np`

			`try:`
			`import tensorrt as trt`
			`from cuda import cuda`

			`TRT_SUPPORT = True`
Add isort and ruff linter (#6575) * Add isort and ruff linter Both linters are pretty common among modern python code bases. The isort tool provides stable sorting and grouping, as well as pruning of unused imports. Ruff is a modern linter, that is very fast due to being written in rust. It can detect many common issues in a python codebase. Removes the pylint dev requirement, since ruff replaces it. * treewide: fix issues detected by ruff * treewide: fix bare except clauses * .devcontainer: Set up isort * treewide: optimize imports * treewide: apply black * treewide: make regex patterns raw strings This is necessary for escape sequences to be properly recognized. 2023-05-29 12:31:17 +02:00			`except ModuleNotFoundError:`
Nvidia TensorRT detector (#4718) * Initial WIP dockerfile and scripts to add tensorrt support * Add tensorRT detector * WIP attempt to install TensorRT 8.5 * Updates to detector for cuda python library * TensorRT Cuda library rework WIP Does not run * Fixes from rebase to detector factory * Fix parsing output memory pointer * Handle TensorRT logs with the python logger * Use non-async interface and convert input data to float32. Detection runs without error. * Make TensorRT a separate build from the base Frigate image. * Add script and documentation for generating TRT Models * Add support for TensorRT devcontainer * Add labelmap to trt model script and docs. Cleanup of old scripts. * Update detect to normalize input tensor using model input type * Add config for selecting GPU. Fix Async inference. Update documentation. * Update some CUDA libraries to clean up version warning * Add CI stage to build TensorRT tag * Add note in docs for image tag and model support 2022-12-30 17:53:17 +01:00			`TRT_SUPPORT = False`

Add isort and ruff linter (#6575) * Add isort and ruff linter Both linters are pretty common among modern python code bases. The isort tool provides stable sorting and grouping, as well as pruning of unused imports. Ruff is a modern linter, that is very fast due to being written in rust. It can detect many common issues in a python codebase. Removes the pylint dev requirement, since ruff replaces it. * treewide: fix issues detected by ruff * treewide: fix bare except clauses * .devcontainer: Set up isort * treewide: optimize imports * treewide: apply black * treewide: make regex patterns raw strings This is necessary for escape sequences to be properly recognized. 2023-05-29 12:31:17 +02:00			`from pydantic import Field`
			`from typing_extensions import Literal`

Nvidia TensorRT detector (#4718) * Initial WIP dockerfile and scripts to add tensorrt support * Add tensorRT detector * WIP attempt to install TensorRT 8.5 * Updates to detector for cuda python library * TensorRT Cuda library rework WIP Does not run * Fixes from rebase to detector factory * Fix parsing output memory pointer * Handle TensorRT logs with the python logger * Use non-async interface and convert input data to float32. Detection runs without error. * Make TensorRT a separate build from the base Frigate image. * Add script and documentation for generating TRT Models * Add support for TensorRT devcontainer * Add labelmap to trt model script and docs. Cleanup of old scripts. * Update detect to normalize input tensor using model input type * Add config for selecting GPU. Fix Async inference. Update documentation. * Update some CUDA libraries to clean up version warning * Add CI stage to build TensorRT tag * Add note in docs for image tag and model support 2022-12-30 17:53:17 +01:00			`from frigate.detectors.detection_api import DetectionApi`
			`from frigate.detectors.detector_config import BaseDetectorConfig`

			`logger = logging.getLogger(__name__)`

			`DETECTOR_KEY = "tensorrt"`

			`if TRT_SUPPORT:`

			`class TrtLogger(trt.ILogger):`
			`def __init__(self):`
			`trt.ILogger.__init__(self)`

			`def log(self, severity, msg):`
			`logger.log(self.getSeverity(severity), msg)`

			`def getSeverity(self, sev: trt.ILogger.Severity) -> int:`
			`if sev == trt.ILogger.VERBOSE:`
			`return logging.DEBUG`
			`elif sev == trt.ILogger.INFO:`
			`return logging.INFO`
			`elif sev == trt.ILogger.WARNING:`
			`return logging.WARNING`
			`elif sev == trt.ILogger.ERROR:`
			`return logging.ERROR`
			`elif sev == trt.ILogger.INTERNAL_ERROR:`
			`return logging.CRITICAL`
			`else:`
			`return logging.DEBUG`


			`class TensorRTDetectorConfig(BaseDetectorConfig):`
			`type: Literal[DETECTOR_KEY]`
			`device: int = Field(default=0, title="GPU Device Index")`


			`class HostDeviceMem(object):`
			`"""Simple helper data class that's a little nicer to use than a 2-tuple."""`

			`def __init__(self, host_mem, device_mem, nbytes, size):`
			`self.host = host_mem`
			`err, self.host_dev = cuda.cuMemHostGetDevicePointer(self.host, 0)`
			`self.device = device_mem`
			`self.nbytes = nbytes`
			`self.size = size`

			`def __str__(self):`
			`return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)`

			`def __repr__(self):`
			`return self.__str__()`

			`def __del__(self):`
			`cuda.cuMemFreeHost(self.host)`
			`cuda.cuMemFree(self.device)`


			`class TensorRtDetector(DetectionApi):`
			`type_key = DETECTOR_KEY`

			`def _load_engine(self, model_path):`
			`try:`
			`trt.init_libnvinfer_plugins(self.trt_logger, "")`

Upgrade TensorRT to 8.5.3 (#7006) * Update to latest tensorrt (8.6.1) release * Build trt libyolo_layer.so in container * Update tensorrt_models script to convert models from the frigate container * Fix typo in model script * Fix paths to yolo lib and models folder * Add S6 scripts to test and convert specified TensortRT models at startup. Rearrange tensorrt files into a docker support folder. * Update TensorRT documentation to reflect the new model conversion process and minimum HW support. * Fix model_cache path to live in config directory * Move tensorrt s6 files to the correct directory * Fix issues in model generation script * Disable global timeout for s6 services * Add version folder to tensorrt model_cache path * Include TensorRT version 8.5.3 * Add numpy requirement prior to removal of np.bool * This TRT version uses a mixture of cuda dependencies * Redirect stdout from noisy model conversion 2023-07-06 21:20:33 +02:00			`ctypes.cdll.LoadLibrary("/usr/local/lib/libyolo_layer.so")`
Nvidia TensorRT detector (#4718) * Initial WIP dockerfile and scripts to add tensorrt support * Add tensorRT detector * WIP attempt to install TensorRT 8.5 * Updates to detector for cuda python library * TensorRT Cuda library rework WIP Does not run * Fixes from rebase to detector factory * Fix parsing output memory pointer * Handle TensorRT logs with the python logger * Use non-async interface and convert input data to float32. Detection runs without error. * Make TensorRT a separate build from the base Frigate image. * Add script and documentation for generating TRT Models * Add support for TensorRT devcontainer * Add labelmap to trt model script and docs. Cleanup of old scripts. * Update detect to normalize input tensor using model input type * Add config for selecting GPU. Fix Async inference. Update documentation. * Update some CUDA libraries to clean up version warning * Add CI stage to build TensorRT tag * Add note in docs for image tag and model support 2022-12-30 17:53:17 +01:00			`except OSError as e:`
			`logger.error(`
			`"ERROR: failed to load libraries. %s",`
			`e,`
			`)`

			`with open(model_path, "rb") as f, trt.Runtime(self.trt_logger) as runtime:`
			`return runtime.deserialize_cuda_engine(f.read())`

			`def _get_input_shape(self):`
			`"""Get input shape of the TensorRT YOLO engine."""`
			`binding = self.engine[0]`
			`assert self.engine.binding_is_input(binding)`
			`binding_dims = self.engine.get_binding_shape(binding)`
			`if len(binding_dims) == 4:`
			`return (`
			`tuple(binding_dims[2:]),`
			`trt.nptype(self.engine.get_binding_dtype(binding)),`
			`)`
			`elif len(binding_dims) == 3:`
			`return (`
			`tuple(binding_dims[1:]),`
			`trt.nptype(self.engine.get_binding_dtype(binding)),`
			`)`
			`else:`
			`raise ValueError(`
			`"bad dims of binding %s: %s" % (binding, str(binding_dims))`
			`)`

			`def _allocate_buffers(self):`
			`"""Allocates all host/device in/out buffers required for an engine."""`
			`inputs = []`
			`outputs = []`
			`bindings = []`
			`output_idx = 0`
			`for binding in self.engine:`
			`binding_dims = self.engine.get_binding_shape(binding)`
			`if len(binding_dims) == 4:`
			`# explicit batch case (TensorRT 7+)`
			`size = trt.volume(binding_dims)`
			`elif len(binding_dims) == 3:`
			`# implicit batch case (TensorRT 6 or older)`
			`size = trt.volume(binding_dims) * self.engine.max_batch_size`
			`else:`
			`raise ValueError(`
			`"bad dims of binding %s: %s" % (binding, str(binding_dims))`
			`)`
			`nbytes = size * self.engine.get_binding_dtype(binding).itemsize`
			`# Allocate host and device buffers`
			`err, host_mem = cuda.cuMemHostAlloc(`
			`nbytes, Flags=cuda.CU_MEMHOSTALLOC_DEVICEMAP`
			`)`
			`assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAllocHost returned {err}"`
			`logger.debug(`
			`f"Allocated Tensor Binding {binding} Memory {nbytes} Bytes ({size} * {self.engine.get_binding_dtype(binding)})"`
			`)`
			`err, device_mem = cuda.cuMemAlloc(nbytes)`
			`assert err is cuda.CUresult.CUDA_SUCCESS, f"cuMemAlloc returned {err}"`
			`# Append the device buffer to device bindings.`
			`bindings.append(int(device_mem))`
			`# Append to the appropriate list.`
			`if self.engine.binding_is_input(binding):`
			`logger.debug(f"Input has Shape {binding_dims}")`
			`inputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))`
			`else:`
			`# each grid has 3 anchors, each anchor generates a detection`
			`# output of 7 float32 values`
			`assert size % 7 == 0, f"output size was {size}"`
			`logger.debug(f"Output has Shape {binding_dims}")`
			`outputs.append(HostDeviceMem(host_mem, device_mem, nbytes, size))`
			`output_idx += 1`
			`assert len(inputs) == 1, f"inputs len was {len(inputs)}"`
			`assert len(outputs) == 1, f"output len was {len(outputs)}"`
			`return inputs, outputs, bindings`

			`def _do_inference(self):`
			`"""do_inference (for TensorRT 7.0+)`
			`This function is generalized for multiple inputs/outputs for full`
			`dimension networks.`
			`Inputs and outputs are expected to be lists of HostDeviceMem objects.`
			`"""`
			`# Push CUDA Context`
			`cuda.cuCtxPushCurrent(self.cu_ctx)`

			`# Transfer input data to the GPU.`
			`[`
			`cuda.cuMemcpyHtoDAsync(inp.device, inp.host, inp.nbytes, self.stream)`
			`for inp in self.inputs`
			`]`

			`# Run inference.`
			`if not self.context.execute_async_v2(`
			`bindings=self.bindings, stream_handle=self.stream`
			`):`
Add isort and ruff linter (#6575) * Add isort and ruff linter Both linters are pretty common among modern python code bases. The isort tool provides stable sorting and grouping, as well as pruning of unused imports. Ruff is a modern linter, that is very fast due to being written in rust. It can detect many common issues in a python codebase. Removes the pylint dev requirement, since ruff replaces it. * treewide: fix issues detected by ruff * treewide: fix bare except clauses * .devcontainer: Set up isort * treewide: optimize imports * treewide: apply black * treewide: make regex patterns raw strings This is necessary for escape sequences to be properly recognized. 2023-05-29 12:31:17 +02:00			`logger.warn("Execute returned false")`
Nvidia TensorRT detector (#4718) * Initial WIP dockerfile and scripts to add tensorrt support * Add tensorRT detector * WIP attempt to install TensorRT 8.5 * Updates to detector for cuda python library * TensorRT Cuda library rework WIP Does not run * Fixes from rebase to detector factory * Fix parsing output memory pointer * Handle TensorRT logs with the python logger * Use non-async interface and convert input data to float32. Detection runs without error. * Make TensorRT a separate build from the base Frigate image. * Add script and documentation for generating TRT Models * Add support for TensorRT devcontainer * Add labelmap to trt model script and docs. Cleanup of old scripts. * Update detect to normalize input tensor using model input type * Add config for selecting GPU. Fix Async inference. Update documentation. * Update some CUDA libraries to clean up version warning * Add CI stage to build TensorRT tag * Add note in docs for image tag and model support 2022-12-30 17:53:17 +01:00
			`# Transfer predictions back from the GPU.`
			`[`
			`cuda.cuMemcpyDtoHAsync(out.host, out.device, out.nbytes, self.stream)`
			`for out in self.outputs`
			`]`

			`# Synchronize the stream`
			`cuda.cuStreamSynchronize(self.stream)`

			`# Pop CUDA Context`
			`cuda.cuCtxPopCurrent()`

			`# Return only the host outputs.`
			`return [`
			`np.array(`
			`(ctypes.c_float * out.size).from_address(out.host), dtype=np.float32`
			`)`
			`for out in self.outputs`
			`]`

			`def __init__(self, detector_config: TensorRTDetectorConfig):`
			`assert (`
			`TRT_SUPPORT`
			`), f"TensorRT libraries not found, {DETECTOR_KEY} detector not present"`

			`(cuda_err,) = cuda.cuInit(0)`
			`assert (`
			`cuda_err == cuda.CUresult.CUDA_SUCCESS`
			`), f"Failed to initialize cuda {cuda_err}"`
			`err, dev_count = cuda.cuDeviceGetCount()`
			`logger.debug(f"Num Available Devices: {dev_count}")`
			`assert (`
			`detector_config.device < dev_count`
			`), f"Invalid TensorRT Device Config. Device {detector_config.device} Invalid."`
			`err, self.cu_ctx = cuda.cuCtxCreate(`
			`cuda.CUctx_flags.CU_CTX_MAP_HOST, detector_config.device`
			`)`

			`self.conf_th = 0.4 ##TODO: model config parameter`
			`self.nms_threshold = 0.4`
			`err, self.stream = cuda.cuStreamCreate(0)`
			`self.trt_logger = TrtLogger()`
			`self.engine = self._load_engine(detector_config.model.path)`
			`self.input_shape = self._get_input_shape()`

			`try:`
			`self.context = self.engine.create_execution_context()`
			`(`
			`self.inputs,`
			`self.outputs,`
			`self.bindings,`
			`) = self._allocate_buffers()`
			`except Exception as e:`
			`logger.error(e)`
			`raise RuntimeError("fail to allocate CUDA resources") from e`

			`logger.debug("TensorRT loaded. Input shape is %s", self.input_shape)`
			`logger.debug("TensorRT version is %s", trt.__version__[0])`

			`def __del__(self):`
			`"""Free CUDA memories."""`
			`if self.outputs is not None:`
			`del self.outputs`
			`if self.inputs is not None:`
			`del self.inputs`
			`if self.stream is not None:`
			`cuda.cuStreamDestroy(self.stream)`
			`del self.stream`
			`del self.engine`
			`del self.context`
			`del self.trt_logger`
			`cuda.cuCtxDestroy(self.cu_ctx)`

			`def _postprocess_yolo(self, trt_outputs, conf_th):`
			`"""Postprocess TensorRT outputs.`
			`# Args`
			`trt_outputs: a list of 2 or 3 tensors, where each tensor`
			`contains a multiple of 7 float32 numbers in`
			`the order of [x, y, w, h, box_confidence, class_id, class_prob]`
			`conf_th: confidence threshold`
			`# Returns`
			`boxes, scores, classes`
			`"""`
			`# filter low-conf detections and concatenate results of all yolo layers`
			`detections = []`
			`for o in trt_outputs:`
			`dets = o.reshape((-1, 7))`
			`dets = dets[dets[:, 4] * dets[:, 6] >= conf_th]`
			`detections.append(dets)`
			`detections = np.concatenate(detections, axis=0)`

			`return detections`

			`def detect_raw(self, tensor_input):`
			`# Input tensor has the shape of the [height, width, 3]`
			`# Output tensor of float32 of shape [20, 6] where:`
			`# O - class id`
			`# 1 - score`
			`# 2..5 - a value between 0 and 1 of the box: [top, left, bottom, right]`

			`# normalize`
			`if self.input_shape[-1] != trt.int8:`
			`tensor_input = tensor_input.astype(self.input_shape[-1])`
			`tensor_input /= 255.0`

			`self.inputs[0].host = np.ascontiguousarray(`
			`tensor_input.astype(self.input_shape[-1])`
			`)`
			`trt_outputs = self._do_inference()`

			`raw_detections = self._postprocess_yolo(trt_outputs, self.conf_th)`

			`if len(raw_detections) == 0:`
			`return np.zeros((20, 6), np.float32)`

			`# raw_detections: Nx7 numpy arrays of`
			`# [[x, y, w, h, box_confidence, class_id, class_prob],`

			`# Calculate score as box_confidence x class_prob`
			`raw_detections[:, 4] = raw_detections[:, 4] * raw_detections[:, 6]`
			`# Reorder elements by the score, best on top, remove class_prob`
			`ordered = raw_detections[raw_detections[:, 4].argsort()[::-1]][:, 0:6]`
			`# transform width to right with clamp to 0..1`
			`ordered[:, 2] = np.clip(ordered[:, 2] + ordered[:, 0], 0, 1)`
			`# transform height to bottom with clamp to 0..1`
			`ordered[:, 3] = np.clip(ordered[:, 3] + ordered[:, 1], 0, 1)`
			`# put result into the correct order and limit to top 20`
			`detections = ordered[:, [5, 4, 1, 0, 3, 2]][:20]`
Don't fail on invalid class IDs for TensorRT detector (#8438) * Don't fail on invalid class IDs * Fix whitespace * Make log warning 2023-11-04 03:19:58 +01:00
Nvidia TensorRT detector (#4718) * Initial WIP dockerfile and scripts to add tensorrt support * Add tensorRT detector * WIP attempt to install TensorRT 8.5 * Updates to detector for cuda python library * TensorRT Cuda library rework WIP Does not run * Fixes from rebase to detector factory * Fix parsing output memory pointer * Handle TensorRT logs with the python logger * Use non-async interface and convert input data to float32. Detection runs without error. * Make TensorRT a separate build from the base Frigate image. * Add script and documentation for generating TRT Models * Add support for TensorRT devcontainer * Add labelmap to trt model script and docs. Cleanup of old scripts. * Update detect to normalize input tensor using model input type * Add config for selecting GPU. Fix Async inference. Update documentation. * Update some CUDA libraries to clean up version warning * Add CI stage to build TensorRT tag * Add note in docs for image tag and model support 2022-12-30 17:53:17 +01:00			`# pad to 20x6 shape`
			`append_cnt = 20 - len(detections)`
			`if append_cnt > 0:`
			`detections = np.append(`
			`detections, np.zeros((append_cnt, 6), np.float32), axis=0`
			`)`

			`return detections`