handle various scenarios with external process failures

This commit is contained in:
Blake Blackshear 2020-03-09 21:12:19 -05:00
parent a60b9211d2
commit 3a9781c4f8
6 changed files with 218 additions and 133 deletions

View File

@ -52,6 +52,12 @@ RUN wget -q https://storage.googleapis.com/download.tensorflow.org/models/tflite
mv /detect.tflite /cpu_model.tflite && \ mv /detect.tflite /cpu_model.tflite && \
rm /cpu_model.zip rm /cpu_model.zip
RUN apt -qq update && apt -qq install --no-install-recommends -y \
gdb \
python3.7-dbg \
&& rm -rf /var/lib/apt/lists/* \
&& (apt-get autoremove -y; apt-get autoclean -y)
WORKDIR /opt/frigate/ WORKDIR /opt/frigate/
ADD frigate frigate/ ADD frigate frigate/
COPY detect_objects.py . COPY detect_objects.py .

View File

@ -1,4 +1,7 @@
import os import os
import sys
import traceback
import signal
import cv2 import cv2
import time import time
import datetime import datetime
@ -58,14 +61,24 @@ GLOBAL_OBJECT_CONFIG = CONFIG.get('objects', {})
WEB_PORT = CONFIG.get('web_port', 5000) WEB_PORT = CONFIG.get('web_port', 5000)
DEBUG = (CONFIG.get('debug', '0') == '1') DEBUG = (CONFIG.get('debug', '0') == '1')
def start_plasma_store():
plasma_cmd = ['plasma_store', '-m', '400000000', '-s', '/tmp/plasma']
plasma_process = sp.Popen(plasma_cmd, stdout=sp.DEVNULL)
time.sleep(1)
rc = plasma_process.poll()
if rc is not None:
return None
return plasma_process
class CameraWatchdog(threading.Thread): class CameraWatchdog(threading.Thread):
def __init__(self, camera_processes, config, tflite_process, tracked_objects_queue, object_processor): def __init__(self, camera_processes, config, tflite_process, tracked_objects_queue, object_processor, plasma_process):
threading.Thread.__init__(self) threading.Thread.__init__(self)
self.camera_processes = camera_processes self.camera_processes = camera_processes
self.config = config self.config = config
self.tflite_process = tflite_process self.tflite_process = tflite_process
self.tracked_objects_queue = tracked_objects_queue self.tracked_objects_queue = tracked_objects_queue
self.object_processor = object_processor self.object_processor = object_processor
self.plasma_process = plasma_process
def run(self): def run(self):
time.sleep(10) time.sleep(10)
@ -73,12 +86,25 @@ class CameraWatchdog(threading.Thread):
# wait a bit before checking # wait a bit before checking
time.sleep(30) time.sleep(30)
# check the plasma process
rc = self.plasma_process.poll()
if rc != None:
print(f"plasma_process exited unexpectedly with {rc}")
self.plasma_process = start_plasma_store()
time.sleep(10)
# check the detection process
if (self.tflite_process.detection_start.value > 0.0 and if (self.tflite_process.detection_start.value > 0.0 and
datetime.datetime.now().timestamp() - self.tflite_process.detection_start.value > 10): datetime.datetime.now().timestamp() - self.tflite_process.detection_start.value > 10):
print("Detection appears to be stuck. Restarting detection process") print("Detection appears to be stuck. Restarting detection process")
self.tflite_process.start_or_restart() self.tflite_process.start_or_restart()
time.sleep(30) time.sleep(30)
elif not self.tflite_process.detect_process.is_alive():
print("Detection appears to have stopped. Restarting detection process")
self.tflite_process.start_or_restart()
time.sleep(30)
# check the camera processes
for name, camera_process in self.camera_processes.items(): for name, camera_process in self.camera_processes.items():
process = camera_process['process'] process = camera_process['process']
if not process.is_alive(): if not process.is_alive():
@ -86,14 +112,33 @@ class CameraWatchdog(threading.Thread):
camera_process['fps'].value = float(self.config[name]['fps']) camera_process['fps'].value = float(self.config[name]['fps'])
camera_process['skipped_fps'].value = 0.0 camera_process['skipped_fps'].value = 0.0
camera_process['detection_fps'].value = 0.0 camera_process['detection_fps'].value = 0.0
camera_process['read_start'].value = 0.0
camera_process['ffmpeg_pid'].value = 0
process = mp.Process(target=track_camera, args=(name, self.config[name], FFMPEG_DEFAULT_CONFIG, GLOBAL_OBJECT_CONFIG, process = mp.Process(target=track_camera, args=(name, self.config[name], FFMPEG_DEFAULT_CONFIG, GLOBAL_OBJECT_CONFIG,
self.tflite_process.detection_queue, self.tracked_objects_queue, self.tflite_process.detection_queue, self.tracked_objects_queue,
camera_process['fps'], camera_process['skipped_fps'], camera_process['detection_fps'])) camera_process['fps'], camera_process['skipped_fps'], camera_process['detection_fps'],
camera_process['read_start'], camera_process['ffmpeg_pid']))
process.daemon = True process.daemon = True
camera_process['process'] = process camera_process['process'] = process
process.start() process.start()
print(f"Camera_process started for {name}: {process.pid}") print(f"Camera_process started for {name}: {process.pid}")
if (camera_process['read_start'].value > 0.0 and
datetime.datetime.now().timestamp() - camera_process['read_start'].value > 10):
print(f"Process for {name} has been reading from ffmpeg for over 10 seconds long. Killing ffmpeg...")
ffmpeg_pid = camera_process['ffmpeg_pid'].value
if ffmpeg_pid != 0:
try:
os.kill(ffmpeg_pid, signal.SIGTERM)
except OSError:
print(f"Unable to terminate ffmpeg with pid {ffmpeg_pid}")
time.sleep(10)
try:
os.kill(ffmpeg_pid, signal.SIGKILL)
print(f"Unable to kill ffmpeg with pid {ffmpeg_pid}")
except OSError:
pass
def main(): def main():
# connect to mqtt and setup last will # connect to mqtt and setup last will
def on_connect(client, userdata, flags, rc): def on_connect(client, userdata, flags, rc):
@ -117,14 +162,7 @@ def main():
client.connect(MQTT_HOST, MQTT_PORT, 60) client.connect(MQTT_HOST, MQTT_PORT, 60)
client.loop_start() client.loop_start()
# start plasma store plasma_process = start_plasma_store()
plasma_cmd = ['plasma_store', '-m', '400000000', '-s', '/tmp/plasma']
plasma_process = sp.Popen(plasma_cmd, stdout=sp.DEVNULL)
time.sleep(1)
rc = plasma_process.poll()
if rc is not None:
raise RuntimeError("plasma_store exited unexpectedly with "
"code %d" % (rc,))
## ##
# Setup config defaults for cameras # Setup config defaults for cameras
@ -135,7 +173,7 @@ def main():
} }
# Queue for cameras to push tracked objects to # Queue for cameras to push tracked objects to
tracked_objects_queue = mp.Queue() tracked_objects_queue = mp.SimpleQueue()
# Start the shared tflite process # Start the shared tflite process
tflite_process = EdgeTPUProcess() tflite_process = EdgeTPUProcess()
@ -146,11 +184,14 @@ def main():
camera_processes[name] = { camera_processes[name] = {
'fps': mp.Value('d', float(config['fps'])), 'fps': mp.Value('d', float(config['fps'])),
'skipped_fps': mp.Value('d', 0.0), 'skipped_fps': mp.Value('d', 0.0),
'detection_fps': mp.Value('d', 0.0) 'detection_fps': mp.Value('d', 0.0),
'read_start': mp.Value('d', 0.0),
'ffmpeg_pid': mp.Value('i', 0)
} }
camera_process = mp.Process(target=track_camera, args=(name, config, FFMPEG_DEFAULT_CONFIG, GLOBAL_OBJECT_CONFIG, camera_process = mp.Process(target=track_camera, args=(name, config, FFMPEG_DEFAULT_CONFIG, GLOBAL_OBJECT_CONFIG,
tflite_process.detection_queue, tracked_objects_queue, tflite_process.detection_queue, tracked_objects_queue, camera_processes[name]['fps'],
camera_processes[name]['fps'], camera_processes[name]['skipped_fps'], camera_processes[name]['detection_fps'])) camera_processes[name]['skipped_fps'], camera_processes[name]['detection_fps'],
camera_processes[name]['read_start'], camera_processes[name]['ffmpeg_pid']))
camera_process.daemon = True camera_process.daemon = True
camera_processes[name]['process'] = camera_process camera_processes[name]['process'] = camera_process
@ -161,7 +202,7 @@ def main():
object_processor = TrackedObjectProcessor(CONFIG['cameras'], client, MQTT_TOPIC_PREFIX, tracked_objects_queue) object_processor = TrackedObjectProcessor(CONFIG['cameras'], client, MQTT_TOPIC_PREFIX, tracked_objects_queue)
object_processor.start() object_processor.start()
camera_watchdog = CameraWatchdog(camera_processes, CONFIG['cameras'], tflite_process, tracked_objects_queue, object_processor) camera_watchdog = CameraWatchdog(camera_processes, CONFIG['cameras'], tflite_process, tracked_objects_queue, object_processor, plasma_process)
camera_watchdog.start() camera_watchdog.start()
# create a flask app that encodes frames a mjpeg on demand # create a flask app that encodes frames a mjpeg on demand
@ -174,6 +215,23 @@ def main():
# return a healh # return a healh
return "Frigate is running. Alive and healthy!" return "Frigate is running. Alive and healthy!"
@app.route('/debug/stack')
def processor_stack():
frame = sys._current_frames().get(object_processor.ident, None)
if frame:
return "<br>".join(traceback.format_stack(frame)), 200
else:
return "no frame found", 200
@app.route('/debug/print_stack')
def print_stack():
pid = int(request.args.get('pid', 0))
if pid == 0:
return "missing pid", 200
else:
os.kill(pid, signal.SIGUSR1)
return "check logs", 200
@app.route('/debug/stats') @app.route('/debug/stats')
def stats(): def stats():
stats = {} stats = {}
@ -185,21 +243,22 @@ def main():
stats[name] = { stats[name] = {
'fps': round(camera_stats['fps'].value, 2), 'fps': round(camera_stats['fps'].value, 2),
'skipped_fps': round(camera_stats['skipped_fps'].value, 2), 'skipped_fps': round(camera_stats['skipped_fps'].value, 2),
'detection_fps': round(camera_stats['detection_fps'].value, 2) 'detection_fps': round(camera_stats['detection_fps'].value, 2),
'read_start': camera_stats['read_start'].value,
'pid': camera_stats['process'].pid,
'ffmpeg_pid': camera_stats['ffmpeg_pid'].value
} }
stats['coral'] = { stats['coral'] = {
'fps': round(total_detection_fps, 2), 'fps': round(total_detection_fps, 2),
'inference_speed': round(tflite_process.avg_inference_speed.value*1000, 2), 'inference_speed': round(tflite_process.avg_inference_speed.value*1000, 2),
'detection_queue': tflite_process.detection_queue.qsize(), 'detection_start': tflite_process.detection_start.value,
'detection_start': tflite_process.detection_start.value 'pid': tflite_process.detect_process.pid
} }
rc = plasma_process.poll() rc = camera_watchdog.plasma_process.poll()
stats['plasma_store_rc'] = rc stats['plasma_store_rc'] = rc
stats['tracked_objects_queue'] = tracked_objects_queue.qsize()
return jsonify(stats) return jsonify(stats)
@app.route('/<camera_name>/<label>/best.jpg') @app.route('/<camera_name>/<label>/best.jpg')

View File

@ -7,7 +7,7 @@ import SharedArray as sa
import pyarrow.plasma as plasma import pyarrow.plasma as plasma
import tflite_runtime.interpreter as tflite import tflite_runtime.interpreter as tflite
from tflite_runtime.interpreter import load_delegate from tflite_runtime.interpreter import load_delegate
from frigate.util import EventsPerSecond from frigate.util import EventsPerSecond, listen
def load_labels(path, encoding='utf-8'): def load_labels(path, encoding='utf-8'):
"""Loads labels from file (with or without index numbers). """Loads labels from file (with or without index numbers).
@ -64,6 +64,7 @@ class ObjectDetector():
def run_detector(detection_queue, avg_speed, start): def run_detector(detection_queue, avg_speed, start):
print(f"Starting detection process: {os.getpid()}") print(f"Starting detection process: {os.getpid()}")
listen()
plasma_client = plasma.connect("/tmp/plasma") plasma_client = plasma.connect("/tmp/plasma")
object_detector = ObjectDetector() object_detector = ObjectDetector()
@ -87,7 +88,7 @@ def run_detector(detection_queue, avg_speed, start):
class EdgeTPUProcess(): class EdgeTPUProcess():
def __init__(self): def __init__(self):
self.detection_queue = mp.Queue() self.detection_queue = mp.SimpleQueue()
self.avg_inference_speed = mp.Value('d', 0.01) self.avg_inference_speed = mp.Value('d', 0.01)
self.detection_start = mp.Value('d', 0.0) self.detection_start = mp.Value('d', 0.0)
self.detect_process = None self.detect_process = None

View File

@ -29,7 +29,6 @@ class TrackedObjectProcessor(threading.Thread):
self.client = client self.client = client
self.topic_prefix = topic_prefix self.topic_prefix = topic_prefix
self.tracked_objects_queue = tracked_objects_queue self.tracked_objects_queue = tracked_objects_queue
self.plasma_client = plasma.connect("/tmp/plasma")
self.camera_data = defaultdict(lambda: { self.camera_data = defaultdict(lambda: {
'best_objects': {}, 'best_objects': {},
'object_status': defaultdict(lambda: defaultdict(lambda: 'OFF')), 'object_status': defaultdict(lambda: defaultdict(lambda: 'OFF')),
@ -48,6 +47,9 @@ class TrackedObjectProcessor(threading.Thread):
return self.camera_data[camera]['current_frame'] return self.camera_data[camera]['current_frame']
def run(self): def run(self):
while True:
try:
self.plasma_client = plasma.connect("/tmp/plasma")
while True: while True:
camera, frame_time, tracked_objects = self.tracked_objects_queue.get() camera, frame_time, tracked_objects = self.tracked_objects_queue.get()
@ -147,3 +149,5 @@ class TrackedObjectProcessor(threading.Thread):
if ret: if ret:
jpg_bytes = jpg.tobytes() jpg_bytes = jpg.tobytes()
self.client.publish(f"{self.topic_prefix}/{camera}/{obj_name}/snapshot", jpg_bytes, retain=True) self.client.publish(f"{self.topic_prefix}/{camera}/{obj_name}/snapshot", jpg_bytes, retain=True)
except:
pass

View File

@ -1,4 +1,6 @@
import datetime import datetime
import signal
import traceback
import collections import collections
import numpy as np import numpy as np
import cv2 import cv2
@ -127,3 +129,9 @@ class EventsPerSecond:
now = datetime.datetime.now().timestamp() now = datetime.datetime.now().timestamp()
seconds = min(now-self._start, last_n_seconds) seconds = min(now-self._start, last_n_seconds)
return len([t for t in self._timestamps if t > (now-last_n_seconds)]) / seconds return len([t for t in self._timestamps if t > (now-last_n_seconds)]) / seconds
def print_stack(sig, frame):
traceback.print_stack(frame)
def listen():
signal.signal(signal.SIGUSR1, print_stack)

View File

@ -15,7 +15,7 @@ import copy
import itertools import itertools
import json import json
from collections import defaultdict from collections import defaultdict
from frigate.util import draw_box_with_label, area, calculate_region, clipped, intersection_over_union, intersection, EventsPerSecond from frigate.util import draw_box_with_label, area, calculate_region, clipped, intersection_over_union, intersection, EventsPerSecond, listen
from frigate.objects import ObjectTracker from frigate.objects import ObjectTracker
from frigate.edgetpu import RemoteObjectDetector from frigate.edgetpu import RemoteObjectDetector
from frigate.motion import MotionDetector from frigate.motion import MotionDetector
@ -98,28 +98,32 @@ def create_tensor_input(frame, region):
# Expand dimensions since the model expects images to have shape: [1, 300, 300, 3] # Expand dimensions since the model expects images to have shape: [1, 300, 300, 3]
return np.expand_dims(cropped_frame, axis=0) return np.expand_dims(cropped_frame, axis=0)
def start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, ffmpeg_process=None): def start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, pid, ffmpeg_process=None):
if not ffmpeg_process is None: if not ffmpeg_process is None:
print("Terminating the existing ffmpeg process...") print("Terminating the existing ffmpeg process...")
ffmpeg_process.terminate() ffmpeg_process.terminate()
try: try:
print("Waiting for ffmpeg to exit gracefully...") print("Waiting for ffmpeg to exit gracefully...")
ffmpeg_process.wait(timeout=30) ffmpeg_process.communicate(timeout=30)
except sp.TimeoutExpired: except sp.TimeoutExpired:
print("FFmpeg didnt exit. Force killing...") print("FFmpeg didnt exit. Force killing...")
ffmpeg_process.kill() ffmpeg_process.kill()
ffmpeg_process.wait() ffmpeg_process.communicate()
print("Creating ffmpeg process...") print("Creating ffmpeg process...")
print(" ".join(ffmpeg_cmd)) print(" ".join(ffmpeg_cmd))
return sp.Popen(ffmpeg_cmd, stdout = sp.PIPE, bufsize=frame_size*10) process = sp.Popen(ffmpeg_cmd, stdout = sp.PIPE, bufsize=frame_size*10)
pid.value = process.pid
return process
def track_camera(name, config, ffmpeg_global_config, global_objects_config, detection_queue, detected_objects_queue, fps, skipped_fps, detection_fps): def track_camera(name, config, ffmpeg_global_config, global_objects_config, detection_queue, detected_objects_queue, fps, skipped_fps, detection_fps, read_start, ffmpeg_pid):
print(f"Starting process for {name}: {os.getpid()}") print(f"Starting process for {name}: {os.getpid()}")
listen()
# Merge the ffmpeg config with the global config # Merge the ffmpeg config with the global config
ffmpeg = config.get('ffmpeg', {}) ffmpeg = config.get('ffmpeg', {})
ffmpeg_input = get_ffmpeg_input(ffmpeg['input']) ffmpeg_input = get_ffmpeg_input(ffmpeg['input'])
ffmpeg_restart_delay = ffmpeg.get('restart_delay', 0)
ffmpeg_global_args = ffmpeg.get('global_args', ffmpeg_global_config['global_args']) ffmpeg_global_args = ffmpeg.get('global_args', ffmpeg_global_config['global_args'])
ffmpeg_hwaccel_args = ffmpeg.get('hwaccel_args', ffmpeg_global_config['hwaccel_args']) ffmpeg_hwaccel_args = ffmpeg.get('hwaccel_args', ffmpeg_global_config['hwaccel_args'])
ffmpeg_input_args = ffmpeg.get('input_args', ffmpeg_global_config['input_args']) ffmpeg_input_args = ffmpeg.get('input_args', ffmpeg_global_config['input_args'])
@ -176,7 +180,7 @@ def track_camera(name, config, ffmpeg_global_config, global_objects_config, dete
object_tracker = ObjectTracker(10) object_tracker = ObjectTracker(10)
ffmpeg_process = start_or_restart_ffmpeg(ffmpeg_cmd, frame_size) ffmpeg_process = start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, ffmpeg_pid)
plasma_client = plasma.connect("/tmp/plasma") plasma_client = plasma.connect("/tmp/plasma")
frame_num = 0 frame_num = 0
@ -187,19 +191,22 @@ def track_camera(name, config, ffmpeg_global_config, global_objects_config, dete
skipped_fps_tracker.start() skipped_fps_tracker.start()
object_detector.fps.start() object_detector.fps.start()
while True: while True:
start = datetime.datetime.now().timestamp() rc = ffmpeg_process.poll()
if rc != None:
print(f"{name}: ffmpeg_process exited unexpectedly with {rc}")
print(f"Letting {name} rest for {ffmpeg_restart_delay} seconds before restarting...")
time.sleep(ffmpeg_restart_delay)
ffmpeg_process = start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, ffmpeg_pid, ffmpeg_process)
time.sleep(10)
read_start.value = datetime.datetime.now().timestamp()
frame_bytes = ffmpeg_process.stdout.read(frame_size) frame_bytes = ffmpeg_process.stdout.read(frame_size)
duration = datetime.datetime.now().timestamp()-start duration = datetime.datetime.now().timestamp()-read_start.value
read_start.value = 0.0
avg_wait = (avg_wait*99+duration)/100 avg_wait = (avg_wait*99+duration)/100
if not frame_bytes: if len(frame_bytes) == 0:
rc = ffmpeg_process.poll() print(f"{name}: ffmpeg_process didnt return any bytes")
if rc is not None:
print(f"{name}: ffmpeg_process exited unexpectedly with {rc}")
ffmpeg_process = start_or_restart_ffmpeg(ffmpeg_cmd, frame_size, ffmpeg_process)
time.sleep(10)
else:
print(f"{name}: ffmpeg_process is still running but didnt return any bytes")
continue continue
# limit frame rate # limit frame rate