import os import cv2 import imutils import time import datetime import ctypes import logging import multiprocessing as mp import threading import json from contextlib import closing import numpy as np import tensorflow as tf from object_detection.utils import label_map_util from object_detection.utils import visualization_utils as vis_util from flask import Flask, Response, make_response import paho.mqtt.client as mqtt RTSP_URL = os.getenv('RTSP_URL') # Path to frozen detection graph. This is the actual model that is used for the object detection. PATH_TO_CKPT = '/frozen_inference_graph.pb' # List of the strings that is used to add correct label for each box. PATH_TO_LABELS = '/label_map.pbtext' MQTT_HOST = os.getenv('MQTT_HOST') MQTT_TOPIC_PREFIX = os.getenv('MQTT_TOPIC_PREFIX') MQTT_OBJECT_CLASSES = os.getenv('MQTT_OBJECT_CLASSES') # TODO: make dynamic? NUM_CLASSES = 90 # REGIONS = "350,0,300,50:400,350,250,50:400,750,250,50" # REGIONS = "400,350,250,50" REGIONS = os.getenv('REGIONS') DETECTED_OBJECTS = [] # Loading label map label_map = label_map_util.load_labelmap(PATH_TO_LABELS) categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True) category_index = label_map_util.create_category_index(categories) def detect_objects(cropped_frame, sess, detection_graph, region_size, region_x_offset, region_y_offset): # Expand dimensions since the model expects images to have shape: [1, None, None, 3] image_np_expanded = np.expand_dims(cropped_frame, axis=0) image_tensor = detection_graph.get_tensor_by_name('image_tensor:0') # Each box represents a part of the image where a particular object was detected. boxes = detection_graph.get_tensor_by_name('detection_boxes:0') # Each score represent how level of confidence for each of the objects. # Score is shown on the result image, together with the class label. scores = detection_graph.get_tensor_by_name('detection_scores:0') classes = detection_graph.get_tensor_by_name('detection_classes:0') num_detections = detection_graph.get_tensor_by_name('num_detections:0') # Actual detection. (boxes, scores, classes, num_detections) = sess.run( [boxes, scores, classes, num_detections], feed_dict={image_tensor: image_np_expanded}) # build an array of detected objects objects = [] for index, value in enumerate(classes[0]): score = scores[0, index] if score > 0.1: box = boxes[0, index].tolist() box[0] = (box[0] * region_size) + region_y_offset box[1] = (box[1] * region_size) + region_x_offset box[2] = (box[2] * region_size) + region_y_offset box[3] = (box[3] * region_size) + region_x_offset objects += [value, scores[0, index]] + box # only get the first 10 objects if len(objects) == 60: break return objects class ObjectParser(threading.Thread): def __init__(self, object_arrays): threading.Thread.__init__(self) self._object_arrays = object_arrays def run(self): global DETECTED_OBJECTS while True: detected_objects = [] for object_array in self._object_arrays: object_index = 0 while(object_index < 60 and object_array[object_index] > 0): object_class = object_array[object_index] detected_objects.append({ 'name': str(category_index.get(object_class).get('name')), 'score': object_array[object_index+1], 'ymin': int(object_array[object_index+2]), 'xmin': int(object_array[object_index+3]), 'ymax': int(object_array[object_index+4]), 'xmax': int(object_array[object_index+5]) }) object_index += 6 DETECTED_OBJECTS = detected_objects time.sleep(0.1) class MqttPublisher(threading.Thread): def __init__(self, host, topic_prefix, object_classes, motion_flags): threading.Thread.__init__(self) self.client = mqtt.Client() self.client.will_set(topic_prefix+'/available', payload='offline', qos=1, retain=True) self.client.connect(host, 1883, 60) self.client.loop_start() self.client.publish(topic_prefix+'/available', 'online', retain=True) self.topic_prefix = topic_prefix self.object_classes = object_classes self.motion_flags = motion_flags def run(self): global DETECTED_OBJECTS last_sent_payload = "" last_motion = "" while True: # initialize the payload payload = {} for obj in self.object_classes: payload[obj] = [] # loop over detected objects and populate # the payload detected_objects = DETECTED_OBJECTS.copy() for obj in detected_objects: if obj['name'] in self.object_classes: payload[obj['name']].append(obj) new_payload = json.dumps(payload, sort_keys=True) if new_payload != last_sent_payload: last_sent_payload = new_payload self.client.publish(self.topic_prefix+'/objects', new_payload, retain=False) motion_status = 'OFF' if any(obj.value == 1 for obj in self.motion_flags): motion_status = 'ON' if motion_status != last_motion: last_motion = motion_status self.client.publish(self.topic_prefix+'/motion', motion_status, retain=False) time.sleep(0.1) def main(): # Parse selected regions regions = [] for region_string in REGIONS.split(':'): region_parts = region_string.split(',') regions.append({ 'size': int(region_parts[0]), 'x_offset': int(region_parts[1]), 'y_offset': int(region_parts[2]), 'min_object_size': int(region_parts[3]), # shared value for signaling to the capture process that we are ready for the next frame # (1 for ready 0 for not ready) 'ready_for_frame': mp.Value('i', 1), # shared value for motion detection signal (1 for motion 0 for no motion) 'motion_detected': mp.Value('i', 0), # create shared array for storing 10 detected objects # note: this must be a double even though the value you are storing # is a float. otherwise it stops updating the value in shared # memory. probably something to do with the size of the memory block 'output_array': mp.Array(ctypes.c_double, 6*10) }) # capture a single frame and check the frame shape so the correct array # size can be allocated in memory video = cv2.VideoCapture(RTSP_URL) ret, frame = video.read() if ret: frame_shape = frame.shape else: print("Unable to capture video stream") exit(1) video.release() # compute the flattened array length from the array shape flat_array_length = frame_shape[0] * frame_shape[1] * frame_shape[2] # create shared array for storing the full frame image data shared_arr = mp.Array(ctypes.c_uint16, flat_array_length) # create shared value for storing the frame_time shared_frame_time = mp.Value('d', 0.0) # shape current frame so it can be treated as an image frame_arr = tonumpyarray(shared_arr).reshape(frame_shape) capture_process = mp.Process(target=fetch_frames, args=(shared_arr, shared_frame_time, [region['ready_for_frame'] for region in regions], frame_shape)) capture_process.daemon = True detection_processes = [] for index, region in enumerate(regions): detection_process = mp.Process(target=process_frames, args=(shared_arr, region['output_array'], shared_frame_time, region['motion_detected'], frame_shape, region['size'], region['x_offset'], region['y_offset'])) detection_process.daemon = True detection_processes.append(detection_process) motion_processes = [] for index, region in enumerate(regions): motion_process = mp.Process(target=detect_motion, args=(shared_arr, shared_frame_time, region['ready_for_frame'], region['motion_detected'], frame_shape, region['size'], region['x_offset'], region['y_offset'], region['min_object_size'])) motion_process.daemon = True motion_processes.append(motion_process) object_parser = ObjectParser([region['output_array'] for region in regions]) object_parser.start() mqtt_publisher = MqttPublisher(MQTT_HOST, MQTT_TOPIC_PREFIX, MQTT_OBJECT_CLASSES.split(','), [region['motion_detected'] for region in regions]) mqtt_publisher.start() capture_process.start() print("capture_process pid ", capture_process.pid) for detection_process in detection_processes: detection_process.start() print("detection_process pid ", detection_process.pid) for motion_process in motion_processes: motion_process.start() print("motion_process pid ", motion_process.pid) app = Flask(__name__) @app.route('/') def index(): # return a multipart response return Response(imagestream(), mimetype='multipart/x-mixed-replace; boundary=frame') def imagestream(): global DETECTED_OBJECTS while True: # max out at 5 FPS time.sleep(0.2) # make a copy of the current detected objects detected_objects = DETECTED_OBJECTS.copy() # make a copy of the current frame frame = frame_arr.copy() # convert to RGB for drawing frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # draw the bounding boxes on the screen for obj in detected_objects: vis_util.draw_bounding_box_on_image_array(frame, obj['ymin'], obj['xmin'], obj['ymax'], obj['xmax'], color='red', thickness=2, display_str_list=["{}: {}%".format(obj['name'],int(obj['score']*100))], use_normalized_coordinates=False) for region in regions: color = (255,255,255) if region['motion_detected'].value == 1: color = (0,255,0) cv2.rectangle(frame, (region['x_offset'], region['y_offset']), (region['x_offset']+region['size'], region['y_offset']+region['size']), color, 2) cv2.putText(frame, datetime.datetime.now().strftime("%H:%M:%S"), (1125, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2) # convert back to BGR frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) # encode the image into a jpg ret, jpg = cv2.imencode('.jpg', frame) yield (b'--frame\r\n' b'Content-Type: image/jpeg\r\n\r\n' + jpg.tobytes() + b'\r\n\r\n') app.run(host='0.0.0.0', debug=False) capture_process.join() for detection_process in detection_processes: detection_process.join() for motion_process in motion_processes: motion_process.join() object_parser.join() mqtt_publisher.join() # convert shared memory array into numpy array def tonumpyarray(mp_arr): return np.frombuffer(mp_arr.get_obj(), dtype=np.uint16) # fetch the frames as fast a possible, only decoding the frames when the # detection_process has consumed the current frame def fetch_frames(shared_arr, shared_frame_time, ready_for_frame_flags, frame_shape): # convert shared memory array into numpy and shape into image array arr = tonumpyarray(shared_arr).reshape(frame_shape) # start the video capture video = cv2.VideoCapture(RTSP_URL) # keep the buffer small so we minimize old data video.set(cv2.CAP_PROP_BUFFERSIZE,1) while True: # grab the frame, but dont decode it yet ret = video.grab() # snapshot the time the frame was grabbed frame_time = datetime.datetime.now() if ret: # if the anyone is ready for the next frame decode it # otherwise skip this frame and move onto the next one if any(flag.value == 1 for flag in ready_for_frame_flags): # go ahead and decode the current frame ret, frame = video.retrieve() if ret: arr[:] = frame shared_frame_time.value = frame_time.timestamp() # signal to the detection_processes by setting the shared_frame_time for flag in ready_for_frame_flags: flag.value = 0 else: # sleep a little to reduce CPU usage time.sleep(0.1) video.release() # do the actual object detection def process_frames(shared_arr, shared_output_arr, shared_frame_time, shared_motion, frame_shape, region_size, region_x_offset, region_y_offset): # shape shared input array into frame for processing arr = tonumpyarray(shared_arr).reshape(frame_shape) # Load a (frozen) Tensorflow model into memory before the processing loop detection_graph = tf.Graph() with detection_graph.as_default(): od_graph_def = tf.GraphDef() with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid: serialized_graph = fid.read() od_graph_def.ParseFromString(serialized_graph) tf.import_graph_def(od_graph_def, name='') sess = tf.Session(graph=detection_graph) no_frames_available = -1 frame_time = 0.0 while True: now = datetime.datetime.now().timestamp() # if there is no motion detected if shared_motion.value == 0: time.sleep(0.1) continue # if there isnt a new frame ready for processing if shared_frame_time.value == frame_time: # save the first time there were no frames available if no_frames_available == -1: no_frames_available = now # if there havent been any frames available in 30 seconds, # sleep to avoid using so much cpu if the camera feed is down if no_frames_available > 0 and (now - no_frames_available) > 30: time.sleep(1) print("sleeping because no frames have been available in a while") else: # rest a little bit to avoid maxing out the CPU time.sleep(0.1) continue # we got a valid frame, so reset the timer no_frames_available = -1 # if the frame is more than 0.5 second old, ignore it if (now - shared_frame_time.value) > 0.5: # rest a little bit to avoid maxing out the CPU time.sleep(0.1) continue # make a copy of the cropped frame cropped_frame = arr[region_y_offset:region_y_offset+region_size, region_x_offset:region_x_offset+region_size].copy() frame_time = shared_frame_time.value # convert to RGB cropped_frame_rgb = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2RGB) # do the object detection objects = detect_objects(cropped_frame_rgb, sess, detection_graph, region_size, region_x_offset, region_y_offset) # copy the detected objects to the output array, filling the array when needed shared_output_arr[:] = objects + [0.0] * (60-len(objects)) # do the actual object detection def detect_motion(shared_arr, shared_frame_time, ready_for_frame, shared_motion, frame_shape, region_size, region_x_offset, region_y_offset, min_motion_area): # shape shared input array into frame for processing arr = tonumpyarray(shared_arr).reshape(frame_shape) no_frames_available = -1 avg_frame = None last_motion = -1 frame_time = 0.0 while True: now = datetime.datetime.now().timestamp() # if it has been 30 seconds since the last motion, clear the flag if last_motion > 0 and (now - last_motion) > 30: last_motion = -1 shared_motion.value = 0 # if there isnt a frame ready for processing if shared_frame_time.value == frame_time: # save the first time there were no frames available if no_frames_available == -1: no_frames_available = now # if there havent been any frames available in 30 seconds, # sleep to avoid using so much cpu if the camera feed is down if no_frames_available > 0 and (now - no_frames_available) > 30: time.sleep(1) print("sleeping because no frames have been available in a while") else: # rest a little bit to avoid maxing out the CPU time.sleep(0.1) if ready_for_frame.value == 0: ready_for_frame.value = 1 continue # we got a valid frame, so reset the timer no_frames_available = -1 # if the frame is more than 0.5 second old, discard it if (now - shared_frame_time.value) > 0.5: # signal that we need a new frame ready_for_frame.value = 1 # rest a little bit to avoid maxing out the CPU time.sleep(0.1) continue # make a copy of the cropped frame cropped_frame = arr[region_y_offset:region_y_offset+region_size, region_x_offset:region_x_offset+region_size].copy().astype('uint8') frame_time = shared_frame_time.value # signal that the frame has been used so a new one will be ready ready_for_frame.value = 1 # convert to grayscale gray = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2GRAY) # apply gaussian blur gray = cv2.GaussianBlur(gray, (21, 21), 0) if avg_frame is None: avg_frame = gray.copy().astype("float") continue # look at the delta from the avg_frame cv2.accumulateWeighted(gray, avg_frame, 0.5) frameDelta = cv2.absdiff(gray, cv2.convertScaleAbs(avg_frame)) thresh = cv2.threshold(frameDelta, 25, 255, cv2.THRESH_BINARY)[1] # dilate the thresholded image to fill in holes, then find contours # on thresholded image thresh = cv2.dilate(thresh, None, iterations=2) cnts = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) cnts = imutils.grab_contours(cnts) # loop over the contours for c in cnts: # if the contour is big enough report motion if cv2.contourArea(c) > min_motion_area: last_motion = now shared_motion.value = 1 break if __name__ == '__main__': mp.freeze_support() main()