diff --git a/Dockerfile b/Dockerfile
index c027df353..5fe568cc3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -61,17 +61,17 @@ RUN cd /usr/local/src/ \
 RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension
 
 # Download & build OpenCV
-RUN wget -q -P /usr/local/src/ --no-check-certificate https://github.com/opencv/opencv/archive/3.4.1.zip
+RUN wget -q -P /usr/local/src/ --no-check-certificate https://github.com/opencv/opencv/archive/4.0.1.zip
 RUN cd /usr/local/src/ \
-    && unzip 3.4.1.zip \
-    && rm 3.4.1.zip \
-    && cd /usr/local/src/opencv-3.4.1/ \
+    && unzip 4.0.1.zip \
+    && rm 4.0.1.zip \
+    && cd /usr/local/src/opencv-4.0.1/ \
     && mkdir build \
-    && cd /usr/local/src/opencv-3.4.1/build \
+    && cd /usr/local/src/opencv-4.0.1/build \
     && cmake -D CMAKE_INSTALL_TYPE=Release -D CMAKE_INSTALL_PREFIX=/usr/local/ .. \
     && make -j4 \
     && make install \
-    && rm -rf /usr/local/src/opencv-3.4.1
+    && rm -rf /usr/local/src/opencv-4.0.1
 
 # Minimize image size
 RUN (apt-get autoremove -y; \
diff --git a/README.md b/README.md
index 771fa775f..4f0003c50 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,12 @@
 # Realtime Object Detection for RTSP Cameras
+This results in an MJPEG stream with identified objects that has lower latency than viewing the RTSP feed directly in VLC.
 - Prioritizes realtime processing over frames per second. Dropping frames is fine.
 - OpenCV runs in a separate process so it can grab frames as quickly as possible to ensure there aren't old frames in the buffer
 - Object detection with Tensorflow runs in a separate process and ignores frames that are more than 0.5 seconds old
 - Uses shared memory arrays for handing off frames between processes
 - Provides a url for viewing the video feed at a hard coded ~5FPS as an mjpeg stream
 - Frames are only encoded into mjpeg stream when it is being viewed
+- A process is created per detection region
 
 ## Getting Started
 Build the container with
@@ -23,13 +25,46 @@ docker run -it --rm \
 -v <path_to_labelmap.pbtext>:/label_map.pbtext:ro \
 -p 5000:5000 \
 -e RTSP_URL='<rtsp_url>' \
+-e REGIONS='<box_size_1>,<x_offset_1>,<y_offset_1>:<box_size_2>,<x_offset_2>,<y_offset_2>' \
 realtime-od:latest
 ```
 
 Access the mjpeg stream at http://localhost:5000
 
+## Tips
+- Lower the framerate of the RTSP feed on the camera to the rate you actually need; this reduces the CPU usage for capturing the feed
+- Use SSDLite models
+
 ## Future improvements
-- MQTT messages when detected objects change
-- Dynamic changes to processing speed, ie. only process 1FPS unless motion detected
-- Break incoming frame into multiple smaller images and run detection in parallel for lower latency (rather than input a lower resolution)
-- Parallel processing to increase FPS
\ No newline at end of file
+- [ ] Look for a subset of object types
+- [ ] Try to simplify the Tensorflow model to just look for the objects we care about
+- [ ] MQTT messages when detected objects change
+- [ ] Implement basic motion detection with OpenCV and only look for objects in the regions with detected motion
+- [ ] Dynamic changes to processing speed, i.e. only process 1FPS unless motion detected
+- [x] Parallel processing to increase FPS
+- [ ] Look into GPU-accelerated decoding of the RTSP stream
+- [ ] Send video over a socket and use JSMPEG
+
+## Building Tensorflow from source for CPU optimizations
+https://www.tensorflow.org/install/source#docker_linux_builds
+Used the `tensorflow/tensorflow:1.12.0-devel-py3` image
+
+## Optimizing the graph (can't say I saw much difference in CPU usage)
+https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/graph_transforms/README.md#optimizing-for-deployment
+```
+docker run -it -v ${PWD}:/lab -v ${PWD}/../back_camera_model/models/ssd_mobilenet_v2_coco_2018_03_29/frozen_inference_graph.pb:/frozen_inference_graph.pb:ro tensorflow/tensorflow:1.12.0-devel-py3 bash
+
+bazel build tensorflow/tools/graph_transforms:transform_graph
+
+bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
+--in_graph=/frozen_inference_graph.pb \
+--out_graph=/lab/optimized_inception_graph.pb \
+--inputs='image_tensor' \
+--outputs='num_detections,detection_scores,detection_boxes,detection_classes' \
+--transforms='
+  strip_unused_nodes(type=float, shape="1,300,300,3")
+  remove_nodes(op=Identity, op=CheckNumerics)
+  fold_constants(ignore_errors=true)
+  fold_batch_norms
+  fold_old_batch_norms'
+```
\ No newline at end of file
diff --git a/detect_objects.py b/detect_objects.py
index 237ef0d88..d5625418a 100644
--- a/detect_objects.py
+++ b/detect_objects.py
@@ -5,6 +5,7 @@ import datetime
 import ctypes
 import logging
 import multiprocessing as mp
+import threading
 from contextlib import closing
 import numpy as np
 import tensorflow as tf
@@ -23,15 +24,20 @@ PATH_TO_LABELS = '/label_map.pbtext'
 # TODO: make dynamic?
 NUM_CLASSES = 90
 
+#REGIONS = "600,0,380:600,600,380:600,1200,380"
+REGIONS = os.getenv('REGIONS')
+
+DETECTED_OBJECTS = []
+
 # Loading label map
 label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
 categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
 category_index = label_map_util.create_category_index(categories)
 
-def detect_objects(image_np, sess, detection_graph):
+def detect_objects(cropped_frame, sess, detection_graph, region_size, region_x_offset, region_y_offset):
     # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
-    image_np_expanded = np.expand_dims(image_np, axis=0)
+    image_np_expanded = np.expand_dims(cropped_frame, axis=0)
     image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
 
     # Each box represents a part of the image where a particular object was detected.
@@ -51,25 +57,55 @@ def detect_objects(image_np, sess, detection_graph):
     # build an array of detected objects
     objects = []
     for index, value in enumerate(classes[0]):
-        object_dict = {}
-        if scores[0, index] > 0.5:
-            object_dict[(category_index.get(value)).get('name').encode('utf8')] = \
-                scores[0, index]
-            objects.append(object_dict)
+        score = scores[0, index]
+        if score > 0.1:
+            box = boxes[0, index].tolist()
+            box[0] = (box[0] * region_size) + region_y_offset
+            box[1] = (box[1] * region_size) + region_x_offset
+            box[2] = (box[2] * region_size) + region_y_offset
+            box[3] = (box[3] * region_size) + region_x_offset
+            objects += [value, score] + box
+        # only keep the first 10 objects (6 values each)
+        if len(objects) == 60:
+            break
 
-    # draw boxes for detected objects on image
-    vis_util.visualize_boxes_and_labels_on_image_array(
-        image_np,
-        np.squeeze(boxes),
-        np.squeeze(classes).astype(np.int32),
-        np.squeeze(scores),
-        category_index,
-        use_normalized_coordinates=True,
-        line_thickness=4)
+    return objects
 
-    return objects, image_np
+class ObjectParser(threading.Thread):
+    def __init__(self, object_arrays):
+        threading.Thread.__init__(self)
+        self._object_arrays = object_arrays
+
+    def run(self):
+        global DETECTED_OBJECTS
+        while True:
+            detected_objects = []
+            for object_array in self._object_arrays:
+                object_index = 0
+                while(object_index < 60 and object_array[object_index] > 0):
+                    object_class = object_array[object_index]
+                    detected_objects.append({
+                        'name': str(category_index.get(object_class).get('name')),
+                        'score': object_array[object_index+1],
+                        'ymin': int(object_array[object_index+2]),
+                        'xmin': int(object_array[object_index+3]),
+                        'ymax': int(object_array[object_index+4]),
+                        'xmax': int(object_array[object_index+5])
+                    })
+                    object_index += 6
+            DETECTED_OBJECTS = detected_objects
+            time.sleep(0.01)
 
 def main():
+    # Parse selected regions
+    regions = []
+    for region_string in REGIONS.split(':'):
+        region_parts = region_string.split(',')
+        regions.append({
+            'size': int(region_parts[0]),
+            'x_offset': int(region_parts[1]),
+            'y_offset': int(region_parts[2])
+        })
     # capture a single frame and check the frame shape so the correct array
     # size can be allocated in memory
     video = cv2.VideoCapture(RTSP_URL)
@@ -81,31 +117,45 @@ def main():
         exit(1)
     video.release()
 
-    # create shared value for storing the time the frame was captured
-    # note: this must be a double even though the value you are storing
-    # is a float. otherwise it stops updating the value in shared
-    # memory. probably something to do with the size of the memory block
-    shared_frame_time = mp.Value('d', 0.0)
+    shared_memory_objects = []
+    for region in regions:
+        shared_memory_objects.append({
+            # create shared value for storing the time the frame was captured
+            # note: this must be a double even though the value you are storing
+            # is a float. otherwise it stops updating the value in shared
+            # memory. probably something to do with the size of the memory block
+            'frame_time': mp.Value('d', 0.0),
+            # create shared array for storing up to 10 detected objects (6 doubles each)
+            'output_array': mp.Array(ctypes.c_double, 6*10)
+        })
+
     # compute the flattened array length from the array shape
     flat_array_length = frame_shape[0] * frame_shape[1] * frame_shape[2]
-    # create shared array for passing the image data from capture to detect_objects
+    # create shared array for storing the full frame image data
     shared_arr = mp.Array(ctypes.c_uint16, flat_array_length)
-    # create shared array for passing the image data from detect_objects to flask
-    shared_output_arr = mp.Array(ctypes.c_uint16, flat_array_length)
-    # create a numpy array with the image shape from the shared memory array
-    # this is used by flask to output an mjpeg stream
-    frame_output_arr = tonumpyarray(shared_output_arr).reshape(frame_shape)
+    # shape current frame so it can be treated as an image
+    frame_arr = tonumpyarray(shared_arr).reshape(frame_shape)
 
-    capture_process = mp.Process(target=fetch_frames, args=(shared_arr, shared_frame_time, frame_shape))
+    capture_process = mp.Process(target=fetch_frames, args=(shared_arr, [obj['frame_time'] for obj in shared_memory_objects], frame_shape))
     capture_process.daemon = True
-    detection_process = mp.Process(target=process_frames, args=(shared_arr, shared_output_arr, shared_frame_time, frame_shape))
-    detection_process.daemon = True
+    detection_processes = []
+    for index, region in enumerate(regions):
+        detection_process = mp.Process(target=process_frames, args=(shared_arr,
+            shared_memory_objects[index]['output_array'],
+            shared_memory_objects[index]['frame_time'], frame_shape,
+            region['size'], region['x_offset'], region['y_offset']))
+        detection_process.daemon = True
+        detection_processes.append(detection_process)
+
+    object_parser = ObjectParser([obj['output_array'] for obj in shared_memory_objects])
+    object_parser.start()
 
     capture_process.start()
     print("capture_process pid ", capture_process.pid)
-    detection_process.start()
-    print("detection_process pid ", detection_process.pid)
+    for detection_process in detection_processes:
+        detection_process.start()
+        print("detection_process pid ", detection_process.pid)
 
     app = Flask(__name__)
 
@@ -115,20 +165,45 @@ def main():
         return Response(imagestream(),
                         mimetype='multipart/x-mixed-replace; boundary=frame')
 
     def imagestream():
+        global DETECTED_OBJECTS
         while True:
             # max out at 5 FPS
             time.sleep(0.2)
+            # make a copy of the current detected objects
+            detected_objects = DETECTED_OBJECTS.copy()
+            # make a copy of the current frame
+            frame = frame_arr.copy()
+            # convert to RGB for drawing
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            # draw the bounding boxes on the screen
+            for obj in detected_objects:
+                vis_util.draw_bounding_box_on_image_array(frame,
+                    obj['ymin'],
+                    obj['xmin'],
+                    obj['ymax'],
+                    obj['xmax'],
+                    color='red',
+                    thickness=2,
+                    display_str_list=["{}: {}%".format(obj['name'], int(obj['score']*100))],
+                    use_normalized_coordinates=False)
+
+            for region in regions:
+                cv2.rectangle(frame, (region['x_offset'], region['y_offset']),
+                    (region['x_offset']+region['size'], region['y_offset']+region['size']),
+                    (255,255,255), 2)
             # convert back to BGR
-            frame_bgr = cv2.cvtColor(frame_output_arr, cv2.COLOR_RGB2BGR)
+            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
             # encode the image into a jpg
-            ret, jpg = cv2.imencode('.jpg', frame_bgr)
+            ret, jpg = cv2.imencode('.jpg', frame)
             yield (b'--frame\r\n'
                    b'Content-Type: image/jpeg\r\n\r\n' + jpg.tobytes() + b'\r\n\r\n')
     app.run(host='0.0.0.0', debug=False)
 
     capture_process.join()
-    detection_process.join()
+    for detection_process in detection_processes:
+        detection_process.join()
+    object_parser.join()
 
 # convert shared memory array into numpy array
 def tonumpyarray(mp_arr):
@@ -136,7 +211,7 @@ def tonumpyarray(mp_arr):
 
 # fetch the frames as fast as possible, only decoding the frames when the
 # detection_process has consumed the current frame
-def fetch_frames(shared_arr, shared_frame_time, frame_shape):
+def fetch_frames(shared_arr, shared_frame_times, frame_shape):
     # convert shared memory array into numpy and shape into image array
     arr = tonumpyarray(shared_arr).reshape(frame_shape)
 
@@ -153,23 +228,24 @@ def fetch_frames(shared_arr, shared_frame_time, frame_shape):
         if ret:
             # if the detection_process is ready for the next frame decode it
             # otherwise skip this frame and move onto the next one
-            if shared_frame_time.value == 0.0:
+            if all(shared_frame_time.value == 0.0 for shared_frame_time in shared_frame_times):
                 # go ahead and decode the current frame
                 ret, frame = video.retrieve()
                 if ret:
-                    # copy the frame into the numpy array
                     arr[:] = frame
-                    # signal to the detection_process by setting the shared_frame_time
-                    shared_frame_time.value = frame_time.timestamp()
+                    # signal to the detection_processes by setting the shared_frame_time
+                    for shared_frame_time in shared_frame_times:
+                        shared_frame_time.value = frame_time.timestamp()
+            else:
+                # sleep a little to reduce CPU usage
+                time.sleep(0.01)
 
     video.release()
 
 # do the actual object detection
-def process_frames(shared_arr, shared_output_arr, shared_frame_time, frame_shape):
+def process_frames(shared_arr, shared_output_arr, shared_frame_time, frame_shape, region_size, region_x_offset, region_y_offset):
     # shape shared input array into frame for processing
     arr = tonumpyarray(shared_arr).reshape(frame_shape)
-    # shape shared output array into frame so it can be copied into
-    output_arr = tonumpyarray(shared_output_arr).reshape(frame_shape)
 
     # Load a (frozen) Tensorflow model into memory before the processing loop
     detection_graph = tf.Graph()
@@ -193,6 +269,9 @@ def process_frames(shared_arr, shared_output_arr, shared_frame_time, frame_shape
             if no_frames_available > 0 and (datetime.datetime.now().timestamp() - no_frames_available) > 30:
                 time.sleep(1)
                 print("sleeping because no frames have been available in a while")
+            else:
+                # rest a little bit to avoid maxing out the CPU
+                time.sleep(0.01)
             continue
 
         # we got a valid frame, so reset the timer
@@ -202,22 +281,22 @@ def process_frames(shared_arr, shared_output_arr, shared_frame_time, frame_shape
         if (datetime.datetime.now().timestamp() - shared_frame_time.value) > 0.5:
             # signal that we need a new frame
             shared_frame_time.value = 0.0
+            # rest a little bit to avoid maxing out the CPU
+            time.sleep(0.01)
             continue
 
-        # make a copy of the frame
-        frame = arr.copy()
+        # make a copy of the cropped frame
+        cropped_frame = arr[region_y_offset:region_y_offset+region_size, region_x_offset:region_x_offset+region_size].copy()
         frame_time = shared_frame_time.value
         # signal that the frame has been used so a new one will be ready
         shared_frame_time.value = 0.0
 
         # convert to RGB
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        cropped_frame_rgb = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2RGB)
         # do the object detection
-        objects, frame_overlay = detect_objects(frame_rgb, sess, detection_graph)
-        # copy the output frame with the bounding boxes to the output array
-        output_arr[:] = frame_overlay
-        if(len(objects) > 0):
-            print(objects)
+        objects = detect_objects(cropped_frame_rgb, sess, detection_graph, region_size, region_x_offset, region_y_offset)
+        # copy the detected objects to the output array, zero-padding the unused slots
+        shared_output_arr[:] = objects + [0.0] * (60-len(objects))
 
 if __name__ == '__main__':
     mp.freeze_support()
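
Note on the shared-array protocol introduced above: `process_frames` and `ObjectParser` agree on a flat `mp.Array` of 60 doubles per region, packing each detection as class id, score, ymin, xmin, ymax, xmax and zero-padding the tail. A minimal standalone sketch of that encoding (the `pack`/`unpack` helper names are illustrative, not part of the patch):
```
# Sketch of the 60-double region output protocol: up to 10 detections,
# 6 doubles each, zero-padded. pack/unpack are hypothetical helper names.
import ctypes
import multiprocessing as mp

MAX_OBJECTS = 10
FIELDS = 6  # class id, score, ymin, xmin, ymax, xmax

def pack(detections, shared_arr):
    # flatten detections into the shared array, zero-padding the tail
    flat = []
    for d in detections[:MAX_OBJECTS]:
        flat += [d['class_id'], d['score'], d['ymin'], d['xmin'], d['ymax'], d['xmax']]
    shared_arr[:] = flat + [0.0] * (MAX_OBJECTS * FIELDS - len(flat))

def unpack(shared_arr):
    # read detections back until a zero class id marks the end
    detections = []
    for i in range(0, MAX_OBJECTS * FIELDS, FIELDS):
        if shared_arr[i] <= 0:
            break
        detections.append({
            'class_id': int(shared_arr[i]),
            'score': shared_arr[i + 1],
            'ymin': int(shared_arr[i + 2]),
            'xmin': int(shared_arr[i + 3]),
            'ymax': int(shared_arr[i + 4]),
            'xmax': int(shared_arr[i + 5]),
        })
    return detections

if __name__ == '__main__':
    arr = mp.Array(ctypes.c_double, MAX_OBJECTS * FIELDS)
    pack([{'class_id': 1, 'score': 0.87, 'ymin': 390, 'xmin': 35, 'ymax': 710, 'xmax': 330}], arr)
    print(unpack(arr))  # round-trips the single detection
```
A zero class id doubles as the end-of-list marker, which is why valid COCO class ids (all >= 1) can be stored directly alongside the box coordinates.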