Merge pull request #1 from blakeblackshear/regions

Regions
Blake Blackshear 2019-02-09 07:24:58 -06:00 committed by GitHub
commit 6e8409d203
3 changed files with 176 additions and 62 deletions

Dockerfile

@@ -61,17 +61,17 @@ RUN cd /usr/local/src/ \
 RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension

 # Download & build OpenCV
-RUN wget -q -P /usr/local/src/ --no-check-certificate https://github.com/opencv/opencv/archive/3.4.1.zip
+RUN wget -q -P /usr/local/src/ --no-check-certificate https://github.com/opencv/opencv/archive/4.0.1.zip
 RUN cd /usr/local/src/ \
-    && unzip 3.4.1.zip \
-    && rm 3.4.1.zip \
-    && cd /usr/local/src/opencv-3.4.1/ \
+    && unzip 4.0.1.zip \
+    && rm 4.0.1.zip \
+    && cd /usr/local/src/opencv-4.0.1/ \
     && mkdir build \
-    && cd /usr/local/src/opencv-3.4.1/build \
+    && cd /usr/local/src/opencv-4.0.1/build \
     && cmake -D CMAKE_INSTALL_TYPE=Release -D CMAKE_INSTALL_PREFIX=/usr/local/ .. \
     && make -j4 \
     && make install \
-    && rm -rf /usr/local/src/opencv-3.4.1
+    && rm -rf /usr/local/src/opencv-4.0.1

 # Minimize image size
 RUN (apt-get autoremove -y; \
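Note: the hunk above moves the image from OpenCV 3.4.1 to 4.0.1, still built from source with the same cmake flags. A minimal sanity check, assuming the image is tagged `realtime-od:latest` as in the README, is to run Python inside the container and inspect the build:

```
# Run inside the built container: confirms the OpenCV version compiled by the
# Dockerfile, and that FFMPEG support (needed for RTSP capture) was built in.
import cv2

print(cv2.__version__)  # expect 4.0.1
# the FFMPEG line should read YES for cv2.VideoCapture to open RTSP URLs
print([line for line in cv2.getBuildInformation().splitlines() if 'FFMPEG' in line])
```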

README.md

@@ -1,10 +1,12 @@
 # Realtime Object Detection for RTSP Cameras
+This results in an MJPEG stream with objects identified that has a lower latency than directly viewing the RTSP feed with VLC.
 - Prioritizes realtime processing over frames per second. Dropping frames is fine.
 - OpenCV runs in a separate process so it can grab frames as quickly as possible to ensure there aren't old frames in the buffer
 - Object detection with Tensorflow runs in a separate process and ignores frames that are more than 0.5 seconds old
 - Uses shared memory arrays for handing off frames between processes
 - Provides a url for viewing the video feed at a hard coded ~5FPS as an mjpeg stream
 - Frames are only encoded into mjpeg stream when it is being viewed
+- A process is created per detection region

 ## Getting Started
 Build the container with
@@ -23,13 +25,46 @@ docker run -it --rm \
 -v <path_to_labelmap.pbtext>:/label_map.pbtext:ro \
 -p 5000:5000 \
 -e RTSP_URL='<rtsp_url>' \
+-e REGIONS='<box_size_1>,<x_offset_1>,<y_offset_1>:<box_size_2>,<x_offset_2>,<y_offset_2>' \
 realtime-od:latest
 ```
 Access the mjpeg stream at http://localhost:5000

+## Tips
+- Lower the framerate of the RTSP feed on the camera to the rate you actually need; this reduces the CPU usage for capturing the feed
+- Use SSDLite models
+
 ## Future improvements
-- MQTT messages when detected objects change
-- Dynamic changes to processing speed, ie. only process 1FPS unless motion detected
-- Break incoming frame into multiple smaller images and run detection in parallel for lower latency (rather than input a lower resolution)
-- Parallel processing to increase FPS
+- [ ] Look for a subset of object types
+- [ ] Try and simplify the tensorflow model to just look for the objects we care about
+- [ ] MQTT messages when detected objects change
+- [ ] Implement basic motion detection with opencv and only look for objects in the regions with detected motion
+- [ ] Dynamic changes to processing speed, i.e. only process 1FPS unless motion detected
+- [x] Parallel processing to increase FPS
+- [ ] Look into GPU accelerated decoding of RTSP stream
+- [ ] Send video over a socket and use JSMPEG
+
+## Building Tensorflow from source for CPU optimizations
+https://www.tensorflow.org/install/source#docker_linux_builds
+used `tensorflow/tensorflow:1.12.0-devel-py3`
+
+## Optimizing the graph (can't say I saw much difference in CPU usage)
+https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/graph_transforms/README.md#optimizing-for-deployment
+```
+docker run -it -v ${PWD}:/lab -v ${PWD}/../back_camera_model/models/ssd_mobilenet_v2_coco_2018_03_29/frozen_inference_graph.pb:/frozen_inference_graph.pb:ro tensorflow/tensorflow:1.12.0-devel-py3 bash
+
+bazel build tensorflow/tools/graph_transforms:transform_graph
+bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
+--in_graph=/frozen_inference_graph.pb \
+--out_graph=/lab/optimized_inception_graph.pb \
+--inputs='image_tensor' \
+--outputs='num_detections,detection_scores,detection_boxes,detection_classes' \
+--transforms='
+strip_unused_nodes(type=float, shape="1,300,300,3")
+remove_nodes(op=Identity, op=CheckNumerics)
+fold_constants(ignore_errors=true)
+fold_batch_norms
+fold_old_batch_norms'
+```
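Note: the new `REGIONS` variable encodes one square detection region per colon-separated group, as `<box_size>,<x_offset>,<y_offset>`. A quick sketch of how a value decomposes, using the commented-out example value from detect_objects.py:

```
# Mirrors the parsing this PR adds in detect_objects.py; the string is the
# commented example from that file (three 600px regions across a frame).
regions_env = "600,0,380:600,600,380:600,1200,380"

regions = []
for region_string in regions_env.split(':'):
    size, x_offset, y_offset = (int(part) for part in region_string.split(','))
    regions.append({'size': size, 'x_offset': x_offset, 'y_offset': y_offset})

print(regions)
# [{'size': 600, 'x_offset': 0, 'y_offset': 380}, {'size': 600, 'x_offset': 600, ...}, ...]
```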

detect_objects.py

@@ -5,6 +5,7 @@ import datetime
 import ctypes
 import logging
 import multiprocessing as mp
+import threading
 from contextlib import closing
 import numpy as np
 import tensorflow as tf
@@ -23,15 +24,20 @@ PATH_TO_LABELS = '/label_map.pbtext'
 # TODO: make dynamic?
 NUM_CLASSES = 90

+#REGIONS = "600,0,380:600,600,380:600,1200,380"
+REGIONS = os.getenv('REGIONS')
+
+DETECTED_OBJECTS = []
+
 # Loading label map
 label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
 categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES,
                                                             use_display_name=True)
 category_index = label_map_util.create_category_index(categories)

-def detect_objects(image_np, sess, detection_graph):
+def detect_objects(cropped_frame, sess, detection_graph, region_size, region_x_offset, region_y_offset):
     # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
-    image_np_expanded = np.expand_dims(image_np, axis=0)
+    image_np_expanded = np.expand_dims(cropped_frame, axis=0)
     image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')

 # Each box represents a part of the image where a particular object was detected.
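Note: `detect_objects` now receives a single region crop rather than the full frame, so the only reshaping needed is the leading batch dimension. A toy illustration (the 300x300 shape is an assumption matching the SSD input size in the graph-transform flags, not something the function enforces):

```
# The model wants [1, None, None, 3]; np.expand_dims adds the batch axis.
import numpy as np

cropped_frame = np.zeros((300, 300, 3), dtype=np.uint8)  # stand-in region crop
image_np_expanded = np.expand_dims(cropped_frame, axis=0)
print(image_np_expanded.shape)  # (1, 300, 300, 3)
```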
@@ -51,25 +57,55 @@ def detect_objects(image_np, sess, detection_graph):
     # build an array of detected objects
     objects = []
     for index, value in enumerate(classes[0]):
-        object_dict = {}
-        if scores[0, index] > 0.5:
-            object_dict[(category_index.get(value)).get('name').encode('utf8')] = \
-                scores[0, index]
-            objects.append(object_dict)
+        score = scores[0, index]
+        if score > 0.1:
+            box = boxes[0, index].tolist()
+            box[0] = (box[0] * region_size) + region_y_offset
+            box[1] = (box[1] * region_size) + region_x_offset
+            box[2] = (box[2] * region_size) + region_y_offset
+            box[3] = (box[3] * region_size) + region_x_offset
+            objects += [value, scores[0, index]] + box
+        # only get the first 10 objects
+        if len(objects) == 60:
+            break

-    # draw boxes for detected objects on image
-    vis_util.visualize_boxes_and_labels_on_image_array(
-        image_np,
-        np.squeeze(boxes),
-        np.squeeze(classes).astype(np.int32),
-        np.squeeze(scores),
-        category_index,
-        use_normalized_coordinates=True,
-        line_thickness=4)
-
-    return objects, image_np
+    return objects
+
+class ObjectParser(threading.Thread):
+    def __init__(self, object_arrays):
+        threading.Thread.__init__(self)
+        self._object_arrays = object_arrays
+
+    def run(self):
+        global DETECTED_OBJECTS
+        while True:
+            detected_objects = []
+            for object_array in self._object_arrays:
+                object_index = 0
+                while(object_index < 60 and object_array[object_index] > 0):
+                    object_class = object_array[object_index]
+                    detected_objects.append({
+                        'name': str(category_index.get(object_class).get('name')),
+                        'score': object_array[object_index+1],
+                        'ymin': int(object_array[object_index+2]),
+                        'xmin': int(object_array[object_index+3]),
+                        'ymax': int(object_array[object_index+4]),
+                        'xmax': int(object_array[object_index+5])
+                    })
+                    object_index += 6
+            DETECTED_OBJECTS = detected_objects
+            time.sleep(0.01)
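Note: the handoff between `detect_objects` and `ObjectParser` is a flat array of doubles, six per detection (class, score, ymin, xmin, ymax, xmax), capped at ten detections (60 values). Boxes come back from the model normalized to the crop, so they are scaled by `region_size` and shifted by the region offsets to land in full-frame pixels. A worked example with made-up numbers:

```
# One detection encoded the way detect_objects packs it and ObjectParser
# decodes it. All values here are illustrative.
region_size, region_x_offset, region_y_offset = 600, 600, 380

value, score = 1.0, 0.92        # class id and confidence from the model
box = [0.10, 0.20, 0.50, 0.60]  # normalized [ymin, xmin, ymax, xmax] in the crop
box[0] = (box[0] * region_size) + region_y_offset  # ymin -> 440.0
box[1] = (box[1] * region_size) + region_x_offset  # xmin -> 720.0
box[2] = (box[2] * region_size) + region_y_offset  # ymax -> 680.0
box[3] = (box[3] * region_size) + region_x_offset  # xmax -> 960.0

objects = [value, score] + box  # six doubles; ten of these fill the 60-slot array

# ObjectParser-style decode of the first group
print({'class': objects[0], 'score': objects[1], 'ymin': int(objects[2]),
       'xmin': int(objects[3]), 'ymax': int(objects[4]), 'xmax': int(objects[5])})
```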
 def main():
+    # Parse selected regions
+    regions = []
+    for region_string in REGIONS.split(':'):
+        region_parts = region_string.split(',')
+        regions.append({
+            'size': int(region_parts[0]),
+            'x_offset': int(region_parts[1]),
+            'y_offset': int(region_parts[2])
+        })
+
     # capture a single frame and check the frame shape so the correct array
     # size can be allocated in memory
     video = cv2.VideoCapture(RTSP_URL)
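Note: the shared frame buffer must be allocated before the processes start, so `main` probes a single frame just to learn its shape. For a 1080p BGR stream the arithmetic works out as follows (the resolution is assumed for illustration):

```
# How the shared-array size falls out of the probe frame's shape.
frame_shape = (1080, 1920, 3)  # assumed 1080p BGR frame
flat_array_length = frame_shape[0] * frame_shape[1] * frame_shape[2]
print(flat_array_length)  # 6220800 entries in the flattened shared array
```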
@@ -81,29 +117,43 @@ def main():
         exit(1)
     video.release()

-    # create shared value for storing the time the frame was captured
-    # note: this must be a double even though the value you are storing
-    # is a float. otherwise it stops updating the value in shared
-    # memory. probably something to do with the size of the memory block
-    shared_frame_time = mp.Value('d', 0.0)
+    shared_memory_objects = []
+    for region in regions:
+        shared_memory_objects.append({
+            # create shared value for storing the time the frame was captured
+            # note: this must be a double even though the value you are storing
+            # is a float. otherwise it stops updating the value in shared
+            # memory. probably something to do with the size of the memory block
+            'frame_time': mp.Value('d', 0.0),
+            # create shared array for storing 10 detected objects
+            'output_array': mp.Array(ctypes.c_double, 6*10)
+        })

     # compute the flattened array length from the array shape
     flat_array_length = frame_shape[0] * frame_shape[1] * frame_shape[2]
-    # create shared array for passing the image data from capture to detect_objects
+    # create shared array for storing the full frame image data
     shared_arr = mp.Array(ctypes.c_uint16, flat_array_length)
-    # create shared array for passing the image data from detect_objects to flask
-    shared_output_arr = mp.Array(ctypes.c_uint16, flat_array_length)
-    # create a numpy array with the image shape from the shared memory array
-    # this is used by flask to output an mjpeg stream
-    frame_output_arr = tonumpyarray(shared_output_arr).reshape(frame_shape)
+    # shape current frame so it can be treated as an image
+    frame_arr = tonumpyarray(shared_arr).reshape(frame_shape)

-    capture_process = mp.Process(target=fetch_frames, args=(shared_arr, shared_frame_time, frame_shape))
+    capture_process = mp.Process(target=fetch_frames, args=(shared_arr, [obj['frame_time'] for obj in shared_memory_objects], frame_shape))
     capture_process.daemon = True

-    detection_process = mp.Process(target=process_frames, args=(shared_arr, shared_output_arr, shared_frame_time, frame_shape))
-    detection_process.daemon = True
+    detection_processes = []
+    for index, region in enumerate(regions):
+        detection_process = mp.Process(target=process_frames, args=(shared_arr,
+            shared_memory_objects[index]['output_array'],
+            shared_memory_objects[index]['frame_time'], frame_shape,
+            region['size'], region['x_offset'], region['y_offset']))
+        detection_process.daemon = True
+        detection_processes.append(detection_process)
+
+    object_parser = ObjectParser([obj['output_array'] for obj in shared_memory_objects])
+    object_parser.start()

     capture_process.start()
     print("capture_process pid ", capture_process.pid)
-    detection_process.start()
-    print("detection_process pid ", detection_process.pid)
+    for detection_process in detection_processes:
+        detection_process.start()
+        print("detection_process pid ", detection_process.pid)
@@ -115,20 +165,45 @@ def main():
         return Response(imagestream(),
                 mimetype='multipart/x-mixed-replace; boundary=frame')

     def imagestream():
+        global DETECTED_OBJECTS
         while True:
             # max out at 5 FPS
             time.sleep(0.2)
+            # make a copy of the current detected objects
+            detected_objects = DETECTED_OBJECTS.copy()
+            # make a copy of the current frame
+            frame = frame_arr.copy()
+            # convert to RGB for drawing
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            # draw the bounding boxes on the screen
+            for obj in detected_objects:
+                vis_util.draw_bounding_box_on_image_array(frame,
+                    obj['ymin'],
+                    obj['xmin'],
+                    obj['ymax'],
+                    obj['xmax'],
+                    color='red',
+                    thickness=2,
+                    display_str_list=["{}: {}%".format(obj['name'], int(obj['score']*100))],
+                    use_normalized_coordinates=False)
+
+            for region in regions:
+                cv2.rectangle(frame, (region['x_offset'], region['y_offset']),
+                    (region['x_offset']+region['size'], region['y_offset']+region['size']),
+                    (255,255,255), 2)
+
             # convert back to BGR
-            frame_bgr = cv2.cvtColor(frame_output_arr, cv2.COLOR_RGB2BGR)
+            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
             # encode the image into a jpg
-            ret, jpg = cv2.imencode('.jpg', frame_bgr)
+            ret, jpg = cv2.imencode('.jpg', frame)
             yield (b'--frame\r\n'
                    b'Content-Type: image/jpeg\r\n\r\n' + jpg.tobytes() + b'\r\n\r\n')

     app.run(host='0.0.0.0', debug=False)

     capture_process.join()
-    detection_process.join()
+    for detection_process in detection_processes:
+        detection_process.join()
+    object_parser.join()
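Note: `imagestream` now draws boxes and region outlines on a copy of the shared frame before encoding. The streaming itself is plain multipart MJPEG: each JPEG is one part, separated by the `--frame` boundary declared in the mimetype. A self-contained sketch of just that pattern, with a flat gray test frame standing in for `frame_arr`:

```
# Minimal standalone version of the MJPEG pattern used above.
import time
import cv2
import numpy as np
from flask import Flask, Response

app = Flask(__name__)

@app.route('/')
def index():
    def imagestream():
        while True:
            time.sleep(0.2)  # ~5 FPS cap, as in the PR
            frame = np.full((480, 640, 3), 128, dtype=np.uint8)  # test frame
            ret, jpg = cv2.imencode('.jpg', frame)
            yield (b'--frame\r\n'
                   b'Content-Type: image/jpeg\r\n\r\n' + jpg.tobytes() + b'\r\n\r\n')
    return Response(imagestream(), mimetype='multipart/x-mixed-replace; boundary=frame')

if __name__ == '__main__':
    app.run(host='0.0.0.0')
```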
 # convert shared memory array into numpy array
 def tonumpyarray(mp_arr):
@@ -136,7 +211,7 @@ def tonumpyarray(mp_arr):

 # fetch the frames as fast as possible, only decoding the frames when the
 # detection_process has consumed the current frame
-def fetch_frames(shared_arr, shared_frame_time, frame_shape):
+def fetch_frames(shared_arr, shared_frame_times, frame_shape):
     # convert shared memory array into numpy and shape into image array
     arr = tonumpyarray(shared_arr).reshape(frame_shape)
@@ -153,23 +228,24 @@ def fetch_frames(shared_arr, shared_frame_time, frame_shape):
         if ret:
             # if the detection_process is ready for the next frame decode it
             # otherwise skip this frame and move onto the next one
-            if shared_frame_time.value == 0.0:
+            if all(shared_frame_time.value == 0.0 for shared_frame_time in shared_frame_times):
                 # go ahead and decode the current frame
                 ret, frame = video.retrieve()
                 if ret:
-                    # copy the frame into the numpy array
                     arr[:] = frame
-                    # signal to the detection_process by setting the shared_frame_time
-                    shared_frame_time.value = frame_time.timestamp()
+                    # signal to the detection_processes by setting the shared_frame_time
+                    for shared_frame_time in shared_frame_times:
+                        shared_frame_time.value = frame_time.timestamp()
+            else:
+                # sleep a little to reduce CPU usage
+                time.sleep(0.01)

     video.release()
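Note: `fetch_frames` drains the RTSP buffer continuously but decodes only when every region's `frame_time` is back at 0.0; the `video.grab()` that pairs with the `video.retrieve()` shown here sits just above this hunk. A minimal sketch of that grab/retrieve split (the URL is a placeholder):

```
# grab() advances past a frame cheaply; retrieve() pays the decode cost only
# when the frame will actually be used, so the buffer never holds stale frames.
import cv2

video = cv2.VideoCapture('rtsp://example.invalid/stream')  # placeholder URL
consumer_ready = True  # stand-in for the all(frame_time == 0.0) check

while video.isOpened():
    if not video.grab():
        break
    if consumer_ready:
        ret, frame = video.retrieve()
        if ret:
            pass  # hand the decoded frame to the shared array here
video.release()
```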
 # do the actual object detection
-def process_frames(shared_arr, shared_output_arr, shared_frame_time, frame_shape):
+def process_frames(shared_arr, shared_output_arr, shared_frame_time, frame_shape, region_size, region_x_offset, region_y_offset):
     # shape shared input array into frame for processing
     arr = tonumpyarray(shared_arr).reshape(frame_shape)
-    # shape shared output array into frame so it can be copied into
-    output_arr = tonumpyarray(shared_output_arr).reshape(frame_shape)

     # Load a (frozen) Tensorflow model into memory before the processing loop
     detection_graph = tf.Graph()
@@ -193,6 +269,9 @@ def process_frames(shared_arr, shared_output_arr, shared_frame_time, frame_shape
         if no_frames_available > 0 and (datetime.datetime.now().timestamp() - no_frames_available) > 30:
             time.sleep(1)
             print("sleeping because no frames have been available in a while")
+        else:
+            # rest a little bit to avoid maxing out the CPU
+            time.sleep(0.01)
         continue

     # we got a valid frame, so reset the timer
@@ -202,22 +281,22 @@ def process_frames(shared_arr, shared_output_arr, shared_frame_time, frame_shape
         if (datetime.datetime.now().timestamp() - shared_frame_time.value) > 0.5:
             # signal that we need a new frame
             shared_frame_time.value = 0.0
+            # rest a little bit to avoid maxing out the CPU
+            time.sleep(0.01)
             continue

-        # make a copy of the frame
-        frame = arr.copy()
+        # make a copy of the cropped frame
+        cropped_frame = arr[region_y_offset:region_y_offset+region_size, region_x_offset:region_x_offset+region_size].copy()
         frame_time = shared_frame_time.value
         # signal that the frame has been used so a new one will be ready
         shared_frame_time.value = 0.0

         # convert to RGB
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        cropped_frame_rgb = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2RGB)
         # do the object detection
-        objects, frame_overlay = detect_objects(frame_rgb, sess, detection_graph)
-        # copy the output frame with the bounding boxes to the output array
-        output_arr[:] = frame_overlay
+        objects = detect_objects(cropped_frame_rgb, sess, detection_graph, region_size, region_x_offset, region_y_offset)
+        # copy the detected objects to the output array, filling the array when needed
+        shared_output_arr[:] = objects + [0.0] * (60-len(objects))
+        if(len(objects) > 0):
+            print(objects)

 if __name__ == '__main__':
     mp.freeze_support()
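Note: `process_frames` now crops its region out of the shared frame with a numpy slice before converting and detecting. A toy example of that slice (frame size and offsets are illustrative):

```
# The region crop is plain numpy slicing: rows by y, columns by x.
import numpy as np

frame = np.zeros((1080, 1920, 3), dtype=np.uint8)  # stand-in full frame
region_size, region_x_offset, region_y_offset = 600, 1200, 380
cropped_frame = frame[region_y_offset:region_y_offset+region_size,
                      region_x_offset:region_x_offset+region_size]
print(cropped_frame.shape)  # (600, 600, 3)
```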