features added:

- GPU via TensorRT
- CPU: TensorFlow Lite -> TensorFlow
Alexander Smirnov 2020-04-08 20:53:58 +01:00
parent 2bc57d271c
commit 2298ca740c
16 changed files with 914 additions and 55 deletions

@@ -2,5 +2,9 @@ README.md
diagram.png
.gitignore
debug
build
venv*
.idea
config/
docker-compose.*
*.pyc

.gitignore vendored (6 lines changed)

@@ -1,4 +1,8 @@
*.pyc
debug
build
venv*
.vscode
config/config.yml
.idea
config/config.yml
docker-compose.*

Dockerfile, Executable file → Normal file (32 lines changed)

@@ -9,22 +9,20 @@ RUN apt -qq update && apt -qq install --no-install-recommends -y \
build-essential \
gnupg wget unzip \
# libcap-dev \
&& add-apt-repository ppa:deadsnakes/ppa -y \
&& apt -qq install --no-install-recommends -y \
python3.7 \
python3.7-dev \
python3-dev \
python3-pip \
ffmpeg \
# VAAPI drivers for Intel hardware accel
libva-drm2 libva2 i965-va-driver vainfo \
&& python3.7 -m pip install -U wheel setuptools \
&& python3.7 -m pip install -U \
&& python3 -m pip install -U wheel pip setuptools \
&& python3 -m pip install -U \
opencv-python-headless \
# python-prctl \
numpy \
imutils \
scipy \
&& python3.7 -m pip install -U \
&& python3 -m pip install -U \
Flask \
paho-mqtt \
PyYAML \
@@ -37,23 +35,27 @@ RUN apt -qq update && apt -qq install --no-install-recommends -y \
&& apt -qq install --no-install-recommends -y \
libedgetpu1-max \
## Tensorflow lite (python 3.7 only)
&& wget -q https://dl.google.com/coral/python/tflite_runtime-2.1.0.post1-cp37-cp37m-linux_x86_64.whl \
&& python3.7 -m pip install tflite_runtime-2.1.0.post1-cp37-cp37m-linux_x86_64.whl \
&& rm tflite_runtime-2.1.0.post1-cp37-cp37m-linux_x86_64.whl \
&& wget -q https://dl.google.com/coral/python/tflite_runtime-2.1.0.post1-cp36-cp36m-linux_x86_64.whl \
&& python3 -m pip install tflite_runtime-2.1.0.post1-cp36-cp36m-linux_x86_64.whl \
&& rm tflite_runtime-2.1.0.post1-cp36-cp36m-linux_x86_64.whl \
&& rm -rf /var/lib/apt/lists/* \
&& (apt-get autoremove -y; apt-get autoclean -y)
&& (apt-get autoremove -y; apt-get autoclean -y) \
## Tensorflow
&& python3 -m pip install tensorflow==1.15.2
# get model and labels
RUN wget -q https://github.com/google-coral/edgetpu/raw/master/test_data/mobilenet_ssd_v2_coco_quant_postprocess_edgetpu.tflite -O /edgetpu_model.tflite --trust-server-names
RUN wget -q https://dl.google.com/coral/canned_models/coco_labels.txt -O /labelmap.txt --trust-server-names
RUN wget -q https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip -O /cpu_model.zip && \
unzip /cpu_model.zip detect.tflite -d / && \
mv /detect.tflite /cpu_model.tflite && \
rm /cpu_model.zip
RUN wget -q http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2018_01_28.tar.gz -O /cpu_model.tar.gz && \
tar -xf /cpu_model.tar.gz -C / ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb --strip-components 1 && \
mv /frozen_inference_graph.pb /cpu_model.pb && \
rm /cpu_model.tar.gz
WORKDIR /opt/frigate/
ADD frigate frigate/
COPY detect_objects.py .
COPY benchmark.py .
CMD ["python3.7", "-u", "detect_objects.py"]
ENV TF_CPP_MIN_LOG_LEVEL 2
CMD ["python3", "-u", "detect_objects.py"]

Dockerfile.gpu, new file (183 lines added)

@@ -0,0 +1,183 @@
FROM frigate AS base
#
# CUDA 10.2 base
#
# https://gitlab.com/nvidia/container-images/cuda/blob/master/dist/ubuntu18.04/10.2/base/Dockerfile
#
RUN apt-get update && apt-get install -y --no-install-recommends \
gnupg2 curl ca-certificates && \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \
echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
apt-get purge --autoremove -y curl && \
rm -rf /var/lib/apt/lists/*
ENV CUDA_VERSION 10.2.89
LABEL com.nvidia.cuda.version="${CUDA_VERSION}"
ENV CUDA_PKG_VERSION 10-2=$CUDA_VERSION-1
# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a
RUN apt-get update && apt-get install -y --no-install-recommends \
cuda-cudart-$CUDA_PKG_VERSION \
cuda-compat-10-2 && \
ln -s cuda-10.2 /usr/local/cuda && \
rm -rf /var/lib/apt/lists/*
# Required for nvidia-docker v1
RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
# nvidia-container-runtime
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411 brand=tesla,driver>=418,driver<419"
#
# CUDA 10.2 runtime
#
# https://gitlab.com/nvidia/container-images/cuda/blob/master/dist/ubuntu18.04/10.2/runtime/Dockerfile
#
ENV NCCL_VERSION 2.5.6
RUN apt-get update && apt-get install -y --no-install-recommends \
cuda-libraries-$CUDA_PKG_VERSION \
cuda-nvtx-$CUDA_PKG_VERSION \
libcublas10=10.2.2.89-1 \
libnccl2=$NCCL_VERSION-1+cuda10.2 && \
apt-mark hold libnccl2 && \
rm -rf /var/lib/apt/lists/*
#
# cuDNN 7.6.5.32 runtime
#
# https://gitlab.com/nvidia/container-images/cuda/blob/master/dist/ubuntu18.04/10.2/runtime/cudnn7/Dockerfile
#
ENV CUDNN_VERSION 7.6.5.32
LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}"
RUN apt-get update && apt-get install -y --no-install-recommends \
libcudnn7=$CUDNN_VERSION-1+cuda10.2 \
&& \
apt-mark hold libcudnn7 && \
rm -rf /var/lib/apt/lists/*
#
# TensorRT 6.0.1.8
#
# https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/tensorrt-601/tensorrt-install-guide/index.html#maclearn-net-repo-install
#
ENV TENSORRT_VERSION 6.0.1
LABEL com.nvidia.tensorrt.version="${TENSORRT_VERSION}"
RUN version=$TENSORRT_VERSION-1+cuda10.2 && \
apt-get update && apt-get install -y --no-install-recommends \
libnvinfer6=${version} \
libnvonnxparsers6=${version} libnvparsers6=${version} \
libnvinfer-plugin6=${version} \
python3-libnvinfer=${version} \
&& \
apt-mark hold \
libnvinfer6 \
libnvonnxparsers6 libnvparsers6 \
libnvinfer-plugin6 \
python3-libnvinfer \
&& \
rm -rf /var/lib/apt/lists/*
#
# Use a previous stage as a new temporary stage for building libraries
#
FROM base AS builder
#
# CUDA 10.2 devel
#
# https://gitlab.com/nvidia/container-images/cuda/blob/master/dist/ubuntu18.04/10.2/devel/Dockerfile
#
RUN apt-get update && apt-get install -y --no-install-recommends \
cuda-nvml-dev-$CUDA_PKG_VERSION \
cuda-command-line-tools-$CUDA_PKG_VERSION \
cuda-libraries-dev-$CUDA_PKG_VERSION \
cuda-minimal-build-$CUDA_PKG_VERSION \
libnccl-dev=$NCCL_VERSION-1+cuda10.2 \
libcublas-dev=10.2.2.89-1 \
&& \
rm -rf /var/lib/apt/lists/*
ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs
#
# cuDNN 7.6.5.32 devel
#
# https://gitlab.com/nvidia/container-images/cuda/blob/master/dist/ubuntu18.04/10.2/devel/cudnn7/Dockerfile
#
RUN apt-get update && apt-get install -y --no-install-recommends \
libcudnn7-dev=$CUDNN_VERSION-1+cuda10.2 \
&& \
rm -rf /var/lib/apt/lists/*
#
# TensorRT 6.0.1.8 devel
#
# https://docs.nvidia.com/deeplearning/sdk/tensorrt-archived/tensorrt-601/tensorrt-install-guide/index.html#maclearn-net-repo-install
#
RUN version=$TENSORRT_VERSION-1+cuda10.2 && \
apt-get update && apt-get install -y --no-install-recommends \
libnvinfer-dev=${version} \
libnvonnxparsers-dev=${version} libnvparsers-dev=${version} \
libnvinfer-plugin-dev=${version} \
&& \
apt-mark hold \
libnvinfer-dev \
libnvonnxparsers-dev libnvparsers-dev \
libnvinfer-plugin-dev \
&& \
rm -rf /var/lib/apt/lists/*
# Install PyCUDA
RUN python3 -m pip install pycuda \
&& python3 -m pip wheel --wheel-dir install pycuda
# Install Cmake
ENV CMAKE_VERSION 3.14.4
RUN cd /tmp && \
wget https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-Linux-x86_64.sh && \
chmod +x cmake-$CMAKE_VERSION-Linux-x86_64.sh && \
./cmake-$CMAKE_VERSION-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \
rm ./cmake-$CMAKE_VERSION-Linux-x86_64.sh
# Build plugin
ADD plugin plugin/
RUN mkdir -p build \
&& cd build \
&& cmake ../plugin \
&& make \
&& cd ..
#
# Copy libraries to the final image
#
FROM base AS result
COPY --from=builder /opt/frigate/install install
COPY --from=builder /opt/frigate/build/libflattenconcat.so /usr/lib
RUN python3 -m pip install install/* \
&& rm -r install
# Get UFF model
RUN wget -q https://github.com/dusty-nv/jetson-inference/releases/download/model-mirror-190618/SSD-Mobilenet-v2.tar.gz -O /gpu_model.tar.gz && \
tar -xf /gpu_model.tar.gz -C / SSD-Mobilenet-v2/ssd_mobilenet_v2_coco.uff --strip-components 1 && \
mv /ssd_mobilenet_v2_coco.uff /gpu_model.uff && \
rm /gpu_model.tar.gz
COPY engine.py .
COPY detect_objects_gpu.py .
CMD ["python3", "-u", "detect_objects_gpu.py"]

README.md

@@ -1,7 +1,7 @@
# Frigate - Realtime Object Detection for IP Cameras
Uses OpenCV and Tensorflow to perform realtime object detection locally for IP cameras. Designed for integration with HomeAssistant or others via MQTT.
Uses OpenCV and TensorFlow/TensorRT to perform realtime object detection locally for IP cameras. Designed for integration with HomeAssistant or others via MQTT.
Use of a [Google Coral USB Accelerator](https://coral.withgoogle.com/products/accelerator/) is optional, but highly recommended. On my Intel i7 processor, I can process 2-3 FPS with the CPU. The Coral can process 100+ FPS with very low CPU load.
Use of a [Google Coral USB Accelerator](https://coral.withgoogle.com/products/accelerator/) or [Nvidia CUDA GPUs](https://developer.nvidia.com/cuda-gpus) is optional, but highly recommended. On my Intel i7 processor, I can process 24 FPS with the CPU. A budget entry-level GPU processes 64 FPS, and a powerful GPU or the Coral can process 100+ FPS with very low CPU load.
- Leverages multiprocessing heavily with an emphasis on realtime over processing every frame
- Uses a very low overhead motion detection to determine where to run object detection
@@ -29,6 +29,10 @@ docker run --rm \
blakeblackshear/frigate:stable
```
To run the GPU-accelerated `frigate-gpu` Docker image, use the [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-docker/wiki/Installation-(Native-GPU-Support)).
If your GPU supports half precision (also known as FP16), you can boost performance by enabling this mode as follows:
`docker run --gpus all --env TRT_FLOAT_PRECISION=16 ...`
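Putting the pieces together, a hedged sketch of a full GPU run; the image tag, published port, and config mount are assumptions modeled on the CPU `docker run` example above, so adapt them to your setup:

```sh
# Sketch only: image tag, port, and volume paths are assumptions, not part of this commit.
docker run --rm \
  --gpus all \
  --env TRT_FLOAT_PRECISION=16 \
  -v /path/to/your/config:/config:ro \
  -p 5000:5000 \
  frigate-gpu
```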
Example docker-compose:
```yaml
frigate:
@@ -47,6 +51,8 @@ Example docker-compose:
FRIGATE_RTSP_PASSWORD: "password"
```
Please note that native GPU support has not landed in docker-compose [yet](https://github.com/docker/compose/issues/6691).
A `config.yml` file must exist in the `config` directory. See example [here](config/config.example.yml) and device specific info can be found [here](docs/DEVICES.md).
Access the mjpeg stream at `http://localhost:5000/<camera_name>` and the best snapshot for any object type at `http://localhost:5000/<camera_name>/<object_name>/best.jpg`
@@ -118,10 +124,12 @@ sensor:
unit_of_measurement: 'ms'
```
## Using a custom model
Models for both CPU and EdgeTPU (Coral) are bundled in the image. You can use your own models with volume mounts:
- CPU Model: `/cpu_model.tflite`
Models for CPU/GPU and EdgeTPU (Coral) are bundled in the images. You can use your own models with volume mounts (see the example run after this list):
- CPU Model: `/cpu_model.pb`
- GPU Model: `/gpu_model.uff`
- EdgeTPU Model: `/edgetpu_model.tflite`
- Labels: `/labelmap.txt`
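For example, a hedged sketch of overriding the bundled CPU model and labels; the host paths are placeholders and the mount points come from the list above:

```sh
# Placeholder host paths; other flags from the run examples above omitted for brevity.
docker run --rm \
  -v /path/to/my_frozen_inference_graph.pb:/cpu_model.pb:ro \
  -v /path/to/my_labelmap.txt:/labelmap.txt:ro \
  blakeblackshear/frigate:stable
```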
## Tips
- Lower the framerate of the video feed on the camera to reduce the CPU usage for capturing the feed
- Choose a smaller camera resolution, as the images are resized to the model's 300x300 input shape anyway

benchmark.py, Executable file → Normal file (0 lines changed)

detect_objects_gpu.py, new file (14 lines added)

@@ -0,0 +1,14 @@
import os
import subprocess
if __name__ == '__main__':
if not os.path.isfile('/gpu_model.buf'):
engine = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'engine.py')
subprocess.run(['python3', '-u', engine,
'-i', '/gpu_model.uff',
'-o', '/gpu_model.buf',
'-p', os.getenv('TRT_FLOAT_PRECISION', '32')
], check=True)
from detect_objects import main as detect_objects_main
detect_objects_main()

engine.py, new file (87 lines added)

@@ -0,0 +1,87 @@
import ctypes
import argparse
import sys
import os
import tensorrt as trt
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
def model_input_shape():
return 3, 300, 300
def build_engine(uff_model_path, trt_engine_datatype=trt.DataType.FLOAT, batch_size=1):
with trt.Builder(TRT_LOGGER) as builder, \
builder.create_network() as network, \
trt.UffParser() as parser:
builder.max_workspace_size = 1 << 30
builder.max_batch_size = batch_size
if trt_engine_datatype == trt.DataType.HALF:
builder.fp16_mode = True
parser.register_input("Input", model_input_shape())
parser.register_output("MarkOutput_0")
parser.parse(uff_model_path, network)
return builder.build_cuda_engine(network)
def save_engine(engine, engine_dest_path):
os.makedirs(os.path.dirname(engine_dest_path), exist_ok=True)
buf = engine.serialize()
with open(engine_dest_path, 'wb') as f:
f.write(buf)
def load_engine(trt_runtime, engine_path):
with open(engine_path, 'rb') as f:
engine_data = f.read()
engine = trt_runtime.deserialize_cuda_engine(engine_data)
return engine
def load_plugins():
trt.init_libnvinfer_plugins(TRT_LOGGER, '')
try:
ctypes.CDLL('libflattenconcat.so')
except Exception as e:
print("Error: {}\n{}".format(e, "Make sure FlattenConcat custom plugin layer is provided"))
sys.exit(1)
TRT_PRECISION_TO_DATATYPE = {
16: trt.DataType.HALF,
32: trt.DataType.FLOAT
}
if __name__ == '__main__':
# Define script command line arguments
parser = argparse.ArgumentParser(description='Utility to build TensorRT engine prior to inference.')
parser.add_argument('-i', "--input",
dest='uff_model_path', metavar='UFF_MODEL_PATH', required=True,
help='preprocessed TensorFlow model in UFF format')
parser.add_argument('-p', '--precision', type=int, choices=[32, 16], default=32,
help='desired TensorRT float precision to build an engine with')
parser.add_argument('-b', '--batch_size', type=int, default=1,
help='max TensorRT engine batch size')
parser.add_argument("-o", "--output", dest='trt_engine_path',
help="path of the output file",
default=os.path.join(os.path.dirname(os.path.abspath(__file__)), "engine.buf"))
# Parse arguments passed
args = parser.parse_args()
load_plugins()
# Using supplied .uff file alongside with UffParser build TensorRT engine
print("Building TensorRT engine. This may take few minutes.")
trt_engine = build_engine(
uff_model_path=args.uff_model_path,
trt_engine_datatype=TRT_PRECISION_TO_DATATYPE[args.precision],
batch_size=args.batch_size)
# Save the engine to file
save_engine(trt_engine, args.trt_engine_path)
print("TensorRT engine saved to {}".format(args.trt_engine_path))


@@ -7,6 +7,12 @@ import pyarrow.plasma as plasma
import tflite_runtime.interpreter as tflite
from tflite_runtime.interpreter import load_delegate
from frigate.util import EventsPerSecond, listen
from frigate.tensorflowcpu import ObjectDetector as CPUObjectDetector
try:
import pycuda.driver as cuda
from frigate.tensorrtgpu import ObjectDetector as GPUObjectDetector
except ImportError:
pass
def load_labels(path, encoding='utf-8'):
"""Loads labels from file (with or without index numbers).
@@ -28,26 +34,22 @@ def load_labels(path, encoding='utf-8'):
return {index: line.strip() for index, line in enumerate(lines)}
class ObjectDetector():
def __init__(self):
edge_tpu_delegate = None
try:
edge_tpu_delegate = load_delegate('libedgetpu.so.1.0')
except ValueError:
print("No EdgeTPU detected. Falling back to CPU.")
if edge_tpu_delegate is None:
self.interpreter = tflite.Interpreter(
model_path='/cpu_model.tflite')
else:
self.interpreter = tflite.Interpreter(
model_path='/edgetpu_model.tflite',
experimental_delegates=[edge_tpu_delegate])
def __init__(self, edge_tpu_delegate):
self.interpreter = tflite.Interpreter(
model_path='/edgetpu_model.tflite',
experimental_delegates=[edge_tpu_delegate])
self.interpreter.allocate_tensors()
self.tensor_input_details = self.interpreter.get_input_details()
self.tensor_output_details = self.interpreter.get_output_details()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
pass
def detect_raw(self, tensor_input):
self.interpreter.set_tensor(self.tensor_input_details[0]['index'], tensor_input)
self.interpreter.invoke()
@@ -57,34 +59,62 @@ class ObjectDetector():
detections = np.zeros((20,6), np.float32)
for i, score in enumerate(scores):
if i == detections.shape[0]:
break
detections[i] = [label_codes[i], score, boxes[i][0], boxes[i][1], boxes[i][2], boxes[i][3]]
return detections
def create_object_detector():
edge_tpu_delegate = None
try:
edge_tpu_delegate = load_delegate('libedgetpu.so.1.0')
except ValueError:
pass
if edge_tpu_delegate is not None:
return ObjectDetector(edge_tpu_delegate)
gpu_device_count = 0
try:
cuda.init()
gpu_device_count = cuda.Device.count()
except (RuntimeError, TypeError, NameError):
pass
except cuda.RuntimeError:
pass
if gpu_device_count > 0:
print("No EdgeTPU detected. Falling back to GPU.")
return GPUObjectDetector()
print("No EdgeTPU or GPU detected. Falling back to CPU.")
return CPUObjectDetector()
def run_detector(detection_queue, avg_speed, start):
print(f"Starting detection process: {os.getpid()}")
listen()
plasma_client = plasma.connect("/tmp/plasma")
object_detector = ObjectDetector()
while True:
object_id_str = detection_queue.get()
object_id_hash = hashlib.sha1(str.encode(object_id_str))
object_id = plasma.ObjectID(object_id_hash.digest())
object_id_out = plasma.ObjectID(hashlib.sha1(str.encode(f"out-{object_id_str}")).digest())
input_frame = plasma_client.get(object_id, timeout_ms=0)
with create_object_detector() as object_detector:
while True:
object_id_str = detection_queue.get()
object_id_hash = hashlib.sha1(str.encode(object_id_str))
object_id = plasma.ObjectID(object_id_hash.digest())
object_id_out = plasma.ObjectID(hashlib.sha1(str.encode(f"out-{object_id_str}")).digest())
input_frame = plasma_client.get(object_id, timeout_ms=0)
if input_frame is plasma.ObjectNotAvailable:
continue
if input_frame is plasma.ObjectNotAvailable:
continue
# detect and put the output in the plasma store
start.value = datetime.datetime.now().timestamp()
plasma_client.put(object_detector.detect_raw(input_frame), object_id_out)
duration = datetime.datetime.now().timestamp()-start.value
start.value = 0.0
# detect and put the output in the plasma store
start.value = datetime.datetime.now().timestamp()
plasma_client.put(object_detector.detect_raw(input_frame), object_id_out)
duration = datetime.datetime.now().timestamp()-start.value
start.value = 0.0
avg_speed.value = (avg_speed.value*9 + duration)/10
avg_speed.value = (avg_speed.value*9 + duration)/10
class EdgeTPUProcess():
def __init__(self):
self.detection_queue = mp.SimpleQueue()
@@ -114,7 +144,7 @@ class RemoteObjectDetector():
self.fps = EventsPerSecond()
self.plasma_client = plasma.connect("/tmp/plasma")
self.detection_queue = detection_queue
def detect(self, tensor_input, threshold=.4):
detections = []
@@ -139,4 +169,4 @@ class RemoteObjectDetector():
))
self.plasma_client.delete([object_id_frame, object_id_detections])
self.fps.update()
return detections
return detections

frigate/tensorflowcpu.py, new file (51 lines added)

@@ -0,0 +1,51 @@
import numpy as np
import tensorflow as tf
class ObjectDetector():
def __init__(self):
self.detection_graph = tf.Graph()
with self.detection_graph.as_default():
od_graph_def = tf.compat.v1.GraphDef()
with tf.io.gfile.GFile('/cpu_model.pb', 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
config = tf.compat.v1.ConfigProto(
device_count={'GPU': 0}
)
self.sess = tf.compat.v1.Session(
graph=self.detection_graph,
config=config)
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
pass
def detect_raw(self, tensor_input):
ops = self.detection_graph.get_operations()
all_tensor_names = {output.name for op in ops for output in op.outputs}
tensor_dict = {}
for key in ['detection_boxes', 'detection_scores', 'detection_classes']:
tensor_name = key + ':0'
if tensor_name in all_tensor_names:
tensor_dict[key] = self.detection_graph.get_tensor_by_name(tensor_name)
image_tensor = self.detection_graph.get_tensor_by_name('image_tensor:0')
output_dict = self.sess.run(tensor_dict,
feed_dict={image_tensor: tensor_input})
boxes = output_dict['detection_boxes'][0]
label_codes = output_dict['detection_classes'][0] - 1
scores = output_dict['detection_scores'][0]
detections = np.zeros((20, 6), np.float32)
for i, score in enumerate(scores):
if i == detections.shape[0]:
break
detections[i] = [label_codes[i], score, boxes[i][0], boxes[i][1], boxes[i][2], boxes[i][3]]
return detections

frigate/tensorrtgpu.py, new file (96 lines added)

@@ -0,0 +1,96 @@
import numpy as np
import pycuda.driver as cuda
import tensorrt as trt
import engine
from collections import namedtuple
from pycuda.tools import make_default_context
from pycuda.tools import clear_context_caches
HostDeviceMem = namedtuple('HostDeviceMem', 'host device')
class ObjectDetector():
def __init__(self):
self.context = make_default_context()
self.device = self.context.get_device()
engine.load_plugins()
self.trt_runtime = trt.Runtime(engine.TRT_LOGGER)
self.trt_engine = engine.load_engine(self.trt_runtime, '/gpu_model.buf')
self._allocate_buffers()
self.execution_context = self.trt_engine.create_execution_context()
input_volume = trt.volume(engine.model_input_shape())
self.numpy_array = np.zeros((self.trt_engine.max_batch_size, input_volume))
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.context.pop()
self.context = None
clear_context_caches()
def detect_raw(self, tensor_input):
# HWC -> CHW
img_np = tensor_input.transpose((0, 3, 1, 2))
# Normalize to [-1.0, 1.0] interval (expected by model)
img_np = (2.0 / 255.0) * img_np - 1.0
img_np = img_np.ravel()
np.copyto(self.inputs[0].host, img_np)
detection_out, keep_count_out = self._do_inference()
detections = np.zeros((20, 6), np.float32)
for i in range(int(keep_count_out[0])):
if i == detections.shape[0]:
break
pred_start_idx = i * 7
label = detection_out[pred_start_idx + 1] - 1
score = detection_out[pred_start_idx + 2]
xmin = detection_out[pred_start_idx + 3]
ymin = detection_out[pred_start_idx + 4]
xmax = detection_out[pred_start_idx + 5]
ymax = detection_out[pred_start_idx + 6]
detections[i] = [label, score, ymin, xmin, ymax, xmax]
return detections
def _do_inference(self):
[cuda.memcpy_htod_async(inp.device, inp.host, self.stream) for inp in self.inputs]
self.execution_context.execute_async(batch_size=self.trt_engine.max_batch_size,
bindings=self.bindings,
stream_handle=self.stream.handle)
[cuda.memcpy_dtoh_async(out.host, out.device, self.stream) for out in self.outputs]
self.stream.synchronize()
return [out.host for out in self.outputs]
def _allocate_buffers(self):
self.inputs = []
self.outputs = []
self.bindings = []
self.stream = cuda.Stream()
# NMS implementation in TRT 6 only supports DataType.FLOAT
binding_to_type = {"Input": np.float32,
"NMS": np.float32,
"NMS_1": np.int32}
for binding in self.trt_engine:
size = trt.volume(self.trt_engine.get_binding_shape(binding)) * self.trt_engine.max_batch_size
dtype = binding_to_type[str(binding)]
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
self.bindings.append(int(device_mem))
# Append to the appropriate list.
if self.trt_engine.binding_is_input(binding):
self.inputs.append(HostDeviceMem(host_mem, device_mem))
else:
self.outputs.append(HostDeviceMem(host_mem, device_mem))

frigate/util.py, Executable file → Normal file (0 lines changed)

frigate/video.py, Executable file → Normal file (0 lines changed)

plugin/CMakeLists.txt, new file (47 lines added)

@@ -0,0 +1,47 @@
cmake_minimum_required(VERSION 3.2 FATAL_ERROR)
project(FlattenConcat LANGUAGES CXX)
# Enable all compile warnings
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-long-long -pedantic -Wno-deprecated-declarations")
# Use C++11
set (CMAKE_CXX_STANDARD 11)
# Sets variable to a value if variable is unset.
macro(set_ifndef var val)
if (NOT ${var})
set(${var} ${val})
endif()
message(STATUS "Configurable variable ${var} set to ${${var}}")
endmacro()
# -------- CONFIGURATION --------
find_package(CUDA REQUIRED)
set_ifndef(TRT_LIB /usr/lib/x86_64-linux-gnu)
set_ifndef(TRT_INCLUDE /usr/include/x86_64-linux-gnu)
set_ifndef(CUDA_ROOT /usr/local/cuda)
# Find dependencies:
message("\nThe following variables are derived from the values of the previous variables unless provided explicitly:\n")
# TensorRT's nvinfer lib
find_library(_NVINFER_LIB nvinfer HINTS ${TRT_LIB} PATH_SUFFIXES lib lib64)
set_ifndef(NVINFER_LIB ${_NVINFER_LIB})
# cuBLAS
find_library(_CUBLAS_LIB cublas HINTS ${CUDA_ROOT} PATH_SUFFIXES lib lib64)
set_ifndef(CUBLAS_LIB ${_CUBLAS_LIB})
# CUDA include dir
find_path(_CUDA_INC_DIR cuda_runtime_api.h HINTS ${CUDA_ROOT} PATH_SUFFIXES include)
set_ifndef(CUDA_INC_DIR ${_CUDA_INC_DIR})
# -------- BUILDING --------
include_directories(${TRT_INCLUDE} ${CUDA_INC_DIR})
add_library(flattenconcat MODULE
${CMAKE_SOURCE_DIR}/FlattenConcat.cpp
)
# Link TensorRT's nvinfer lib
target_link_libraries(flattenconcat PRIVATE ${NVINFER_LIB} ${CUBLAS_LIB})

plugin/FlattenConcat.cpp, new file (320 lines added)

@@ -0,0 +1,320 @@
/*
* The TensorFlow SSD graph has some operations that are currently not supported in TensorRT.
* Using a preprocessor on the graph, multiple operations in the graph are combined into a
* single custom operation which is implemented as a plugin layer in TensorRT. The preprocessor
* stitches all nodes within a namespace into one custom node.
*
* The plugin called `FlattenConcat` is used to flatten each input and then concatenate the
* results. This is applied to the location and confidence data before it is fed to the post
* processor step.
*
* Loading the FlattenConcat plugin library using CDLL has a side effect of loading a FlattenConcat
* plugin into the internal TensorRT plugin registry: the registry already contains the FlattenConcat
* shipped with TensorRT, while we load our own version, and there are subtle differences between the
* built-in FlattenConcat and this one.
*
* The pre-trained TensorFlow model has been converted to UFF format using this FlattenConcat
* plugin, and we have to stick to it when building a TensorRT inference engine. To avoid a collision
* with the built-in plugin of the same name and version "1", we use version "B" and load it last.
*/
#include <algorithm>
#include <cassert>
#include <iostream>
#include <numeric>
#include <vector>
#include <cublas_v2.h>
#include "NvInferPlugin.h"
// Macro for calling GPU functions
#define CHECK(status) \
do \
{ \
auto ret = (status); \
if (ret != 0) \
{ \
std::cout << "Cuda failure: " << ret; \
abort(); \
} \
} while (0)
using namespace nvinfer1;
namespace
{
const char* FLATTENCONCAT_PLUGIN_VERSION{"B"};
const char* FLATTENCONCAT_PLUGIN_NAME{"FlattenConcat_TRT"};
}
// Flattens all input tensors and concats their flattened version together
// along the major non-batch dimension, i.e axis = 1
class FlattenConcat : public IPluginV2
{
public:
// Ordinary ctor, plugin not yet configured for particular inputs/output
FlattenConcat() {}
// Ctor for clone()
FlattenConcat(const int* flattenedInputSize, int numInputs, int flattenedOutputSize)
: mFlattenedOutputSize(flattenedOutputSize)
{
for (int i = 0; i < numInputs; ++i)
mFlattenedInputSize.push_back(flattenedInputSize[i]);
}
// Ctor for loading from serialized byte array
FlattenConcat(const void* data, size_t length)
{
const char* d = reinterpret_cast<const char*>(data);
const char* a = d;
size_t numInputs = read<size_t>(d);
for (size_t i = 0; i < numInputs; ++i)
{
mFlattenedInputSize.push_back(read<int>(d));
}
mFlattenedOutputSize = read<int>(d);
assert(d == a + length);
}
int getNbOutputs() const override
{
// We always return one output
return 1;
}
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
// At least one input
assert(nbInputDims >= 1);
// We only have one output, so it doesn't
// make sense to check index != 0
assert(index == 0);
size_t flattenedOutputSize = 0;
int inputVolume = 0;
for (int i = 0; i < nbInputDims; ++i)
{
// We only support NCHW. And inputs Dims are without batch num.
assert(inputs[i].nbDims == 3);
inputVolume = inputs[i].d[0] * inputs[i].d[1] * inputs[i].d[2];
flattenedOutputSize += inputVolume;
}
return DimsCHW(flattenedOutputSize, 1, 1);
}
int initialize() override
{
// Called on engine initialization, we initialize cuBLAS library here,
// since we'll be using it for inference
CHECK(cublasCreate(&mCublas));
return 0;
}
void terminate() override
{
// Called on engine destruction, we destroy cuBLAS data structures,
// which were created in initialize()
CHECK(cublasDestroy(mCublas));
}
size_t getWorkspaceSize(int maxBatchSize) const override
{
// The operation is done in place, it doesn't use GPU memory
return 0;
}
int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) override
{
// Does the actual concat of inputs, which is just
// copying all inputs bytes to output byte array
size_t inputOffset = 0;
float* output = reinterpret_cast<float*>(outputs[0]);
cublasSetStream(mCublas, stream);
for (size_t i = 0; i < mFlattenedInputSize.size(); ++i)
{
const float* input = reinterpret_cast<const float*>(inputs[i]);
for (int batchIdx = 0; batchIdx < batchSize; ++batchIdx)
{
CHECK(cublasScopy(mCublas, mFlattenedInputSize[i],
input + batchIdx * mFlattenedInputSize[i], 1,
output + (batchIdx * mFlattenedOutputSize + inputOffset), 1));
}
inputOffset += mFlattenedInputSize[i];
}
return 0;
}
size_t getSerializationSize() const override
{
// Returns FlattenConcat plugin serialization size
size_t size = sizeof(mFlattenedInputSize[0]) * mFlattenedInputSize.size()
+ sizeof(mFlattenedOutputSize)
+ sizeof(size_t); // For serializing mFlattenedInputSize vector size
return size;
}
void serialize(void* buffer) const override
{
// Serializes FlattenConcat plugin into byte array
// Cast buffer to char* and save its beginning to a,
// (since value of d will be changed during write)
char* d = reinterpret_cast<char*>(buffer);
char* a = d;
size_t numInputs = mFlattenedInputSize.size();
// Write FlattenConcat fields into buffer
write(d, numInputs);
for (size_t i = 0; i < numInputs; ++i)
{
write(d, mFlattenedInputSize[i]);
}
write(d, mFlattenedOutputSize);
// Sanity check - checks if d is offset
// from a by exactly the size of serialized plugin
assert(d == a + getSerializationSize());
}
void configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override
{
// We only support one output
assert(nbOutputs == 1);
// Reset plugin private data structures
mFlattenedInputSize.clear();
mFlattenedOutputSize = 0;
// For each input we save its size, we also validate it
for (int i = 0; i < nbInputs; ++i)
{
int inputVolume = 0;
// We only support NCHW. And inputs Dims are without batch num.
assert(inputs[i].nbDims == 3);
// All inputs dimensions along non concat axis should be same
for (size_t dim = 1; dim < 3; dim++)
{
assert(inputs[i].d[dim] == inputs[0].d[dim]);
}
// Size of flattened input
inputVolume = inputs[i].d[0] * inputs[i].d[1] * inputs[i].d[2];
mFlattenedInputSize.push_back(inputVolume);
mFlattenedOutputSize += mFlattenedInputSize[i];
}
}
bool supportsFormat(DataType type, PluginFormat format) const override
{
return (type == DataType::kFLOAT && format == PluginFormat::kNCHW);
}
const char* getPluginType() const override { return FLATTENCONCAT_PLUGIN_NAME; }
const char* getPluginVersion() const override { return FLATTENCONCAT_PLUGIN_VERSION; }
void destroy() override {}
IPluginV2* clone() const override
{
return new FlattenConcat(mFlattenedInputSize.data(), mFlattenedInputSize.size(), mFlattenedOutputSize);
}
void setPluginNamespace(const char* pluginNamespace) override
{
mPluginNamespace = pluginNamespace;
}
const char* getPluginNamespace() const override
{
return mPluginNamespace.c_str();
}
private:
template <typename T>
void write(char*& buffer, const T& val) const
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template <typename T>
T read(const char*& buffer)
{
T val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
return val;
}
// Number of elements in each plugin input, flattened
std::vector<int> mFlattenedInputSize;
// Number of elements in output, flattened
int mFlattenedOutputSize{0};
// cuBLAS library handle
cublasHandle_t mCublas;
// We're not using TensorRT namespaces in
// this sample, so it's just an empty string
std::string mPluginNamespace = "";
};
// PluginCreator boilerplate code for FlattenConcat plugin
class FlattenConcatPluginCreator : public IPluginCreator
{
public:
FlattenConcatPluginCreator()
{
mFC.nbFields = 0;
mFC.fields = 0;
}
~FlattenConcatPluginCreator() {}
const char* getPluginName() const override { return FLATTENCONCAT_PLUGIN_NAME; }
const char* getPluginVersion() const override { return FLATTENCONCAT_PLUGIN_VERSION; }
const PluginFieldCollection* getFieldNames() override { return &mFC; }
IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override
{
return new FlattenConcat();
}
IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override
{
return new FlattenConcat(serialData, serialLength);
}
void setPluginNamespace(const char* pluginNamespace) override
{
mPluginNamespace = pluginNamespace;
}
const char* getPluginNamespace() const override
{
return mPluginNamespace.c_str();
}
private:
static PluginFieldCollection mFC;
static std::vector<PluginField> mPluginAttributes;
std::string mPluginNamespace = "";
};
PluginFieldCollection FlattenConcatPluginCreator::mFC{};
std::vector<PluginField> FlattenConcatPluginCreator::mPluginAttributes;
REGISTER_TENSORRT_PLUGIN(FlattenConcatPluginCreator);

requirements.txt, new file (13 lines added)

@@ -0,0 +1,13 @@
numpy
imutils
scipy
Flask
paho-mqtt
PyYAML
matplotlib
pyarrow
pycuda
tensorrt
opencv-python
tensorflow
tflite_runtime