Repository: ndrplz/surround_vehicles_awareness Branch: master Commit: b1d3283ef92f Files: 9 Total size: 17.9 KB Directory structure: gitextract_vw4fuvte/ ├── .gitignore ├── LICENSE ├── README.md ├── data/ │ └── sample_data.txt ├── img/ │ └── helloworld.psd ├── load_data.py ├── main.py ├── model.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.pyc .idea/ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2017 Andrea Palazzi Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Learning to Map Vehicles into Bird's Eye View

This code accompanies the paper *["Learning to map surrounding vehicles into bird's eye view using synthetic data"](https://arxiv.org/pdf/1706.08442.pdf)*. It contains the code for loading data and pre-trained SDPN model proposed in the paper. ## How-to-run Script entry-point is in **[main.py](main.py)**. When **[main.py](main.py)** is run, *pretrained weights* are automatically downloaded and injected in the **[model](model.py)**. Model is then used to perform and inference on a sample data, mapping a car from the dashboard camera view to the bird's eye view of the scene. If everything works correctly, the output should look like this.

#### Dependencies The code was developed with the following configuration: * python 2.7.11 * numpy 1.11.2 * opencv 3.1.0 * Theano 0.9.0.dev3 * Keras 1.1.2 Other configuration will reasonably work, but have never been explicitly tested. ## Dataset In this repository only one example is provided, to the end of verifying that the model is working correctly. The **whole dataset**, which comprises more than **1M** couples of bounding boxes, can be found here. To get an idea of how the data look like you can check [this video](https://www.youtube.com/watch?v=t2mXv9j6LNw). ================================================ FILE: data/sample_data.txt ================================================ 004190.jpg, 004190_b.jpg, 000694048, 004067225593, 20.367, 229.760, 0.535, 0.503, 0.618, 0.544, 0.549, 0.021, 0.679, 0.190 ================================================ FILE: load_data.py ================================================ import numpy as np import cv2 import os.path as path from utils import imagenet_mean_bgr def convert_from_relative_to_absolute(h, w, x_min, y_min, x_max, y_max): """ Convert from relative coordinates (range 0, 1) to absolute coordinates given a frame (range h, w) Parameters ---------- h : int Image height w : int Image width x_min : float X coordinate of top-left corner (in range 0, 1) y_min : float Y coordinate of top-left corner (in range 0, 1) x_max : float X coordinate of bottom-right corner (in range 0, 1) y_max : float Y coordinate of bottom-right corner (in range 0, 1) Returns ------- coords : list Input coordinates casted to image size -> range (0, h) and (0, w) """ x_min = x_min * w y_min = y_min * h x_max = x_max * w y_max = y_max * h return map(np.int32, [x_min, y_min, x_max, y_max]) def extract_crop(frame, x_min, y_min, x_max, y_max): """ Extract vehicle crop from the image. Crop is resized to 224x224 which is ResNet input size. Parameters ---------- frame : ndarray Image to process x_min : float X coordinate of top-left corner (in range 0, 1) y_min : float Y coordinate of top-left corner (in range 0, 1) x_max : float X coordinate of bottom-right corner (in range 0, 1) y_max : float Y coordinate of bottom-right corner (in range 0, 1) Returns ------- crop : ndarray Crop containing vehicle, resized to 224x224 pixel """ h, w = frame.shape[:2] x_min, y_min, x_max, y_max = convert_from_relative_to_absolute(h, w, x_min, y_min, x_max, y_max) # extract crop from frame crop = frame[y_min:y_max, x_min:x_max, :].copy() crop = cv2.resize(crop, (224, 224)) return crop def get_sample_batch(data_dir): """ Load sample data useful for model "hello world". """ X_coords, X_images, X_crops, X_images_original = [], [], [], [] Y_coords, Y_images, Y_crops, Y_dist, Y_yaw = [], [], [], [], [] with open(path.join(data_dir,'sample_data.txt'), 'rb') as f: logs = f.readlines() for log in logs: # retrieve line values log = log.strip().split(',') # parse a log line frame_f, frame_b = log[:2] bbox_id, bbox_model = log[2:4] bbox_dist, bbox_yaw = map(np.float32, log[4:6]) coords_frontal = map(np.float32, log[6:10]) coords_birdeye = map(np.float32, log[10:]) # load images frame_frontal_path = path.join(data_dir, frame_f.strip()) frame_birdeye_path = path.join(data_dir, frame_b.strip()) if not path.exists(frame_frontal_path) or not path.exists(frame_birdeye_path): continue frame_frontal = cv2.imread(frame_frontal_path, cv2.IMREAD_COLOR) frame_birdeye = cv2.imread(frame_birdeye_path, cv2.IMREAD_COLOR) # extract crops from whole frames crop_frontal = extract_crop(frame_frontal, *coords_frontal) crop_birdeye = extract_crop(frame_birdeye, *coords_birdeye) if crop_frontal is not None and crop_birdeye is not None: # convert from (0, 1) to tanh range (-1, 1) coords_birdeye = [2 * (c - 0.5) for c in coords_birdeye] # append all needed stuff to output structures X_coords.append(coords_frontal) # append frontal coords X_crops.append(crop_frontal) # append frontal crops X_images.append(frame_frontal) # append frontal frames X_images_original.append(frame_frontal) # append frontal frames Y_coords.append(coords_birdeye) # append birdeye coords Y_crops.append(crop_birdeye) # append birdeye crops Y_images.append(frame_birdeye) # append birdeye frames Y_dist.append(bbox_dist) # append bbox distance Y_yaw.append(bbox_yaw) # append bbox yaw # preprocess X crops by subtracting mean and put channels first for b in range(0, len(X_coords)): X_crops[b] = imagenet_mean_bgr(frame_bgr=X_crops[b], op='subtract').transpose(2, 0, 1) # convert all stuff to ndarray X_coords, Y_coords = np.array(X_coords), np.array(Y_coords) X_crops, Y_crops = np.array(X_crops), np.array(Y_crops) X_images, Y_images = np.array(X_images), np.array(Y_images) Y_dist, Y_yaw = np.array(Y_dist), np.array(Y_yaw) X_images_original = np.array(X_images_original) return X_coords, X_crops, X_images, X_images_original, Y_coords, Y_crops, Y_images, Y_dist, Y_yaw ================================================ FILE: main.py ================================================ from model import SDPN from keras.optimizers import Adam from load_data import get_sample_batch from utils import show_prediction from keras.utils.data_utils import get_file TH_WEIGHTS_PATH = 'http://imagelab.ing.unimore.it/files/pretrained_models/keras/SPDN_w.hdf5' if __name__ == '__main__': # Get model model = SDPN(summary=True) # Download pre-trained weights pretrained_weights_path = get_file('SPDN_w.h5', TH_WEIGHTS_PATH, cache_subdir='models') # Load pre-trained weights model.load_weights(pretrained_weights_path) model.compile(optimizer=Adam(lr=0.001, beta_1=0.9, beta_2=0.999), loss='mse') # Load sample batch X_coords, X_crops, X_images, X_images_original, Y_coords, Y_crops, Y_images, Y_dist, Y_yaw = get_sample_batch('data') # Perform prediction given (vehicle_coords, vehicle_crop) in dashboard camera view Y_pred = model.predict([X_coords, X_crops]) # Display sample prediction for b in range(len(X_coords)): show_prediction(X_images_original[b], Y_images[b], X_coords[b], Y_coords[b], Y_pred[b]) ================================================ FILE: model.py ================================================ from keras.models import Model from keras.layers import Dense, Input, Dropout, Reshape, merge from keras.applications import ResNet50 def SDPN(summary=True): """ Create and return Semantic-aware Dense Prediction Network. Parameters ---------- summary : bool If True, network summary is printed to stout. Returns ------- model : keras Model Model of SDPN """ input_coords = Input(shape=(4,)) input_crop = Input(shape=(3, 224, 224)) # extract feature from image crop resnet = ResNet50(include_top=False, weights='imagenet') for layer in resnet.layers: # set resnet as non-trainable layer.trainable = False crop_encoded = resnet(input_crop) # shape of `crop_encoded` is 2018x1x1 crop_encoded = Reshape(target_shape=(2048,))(crop_encoded) # encode input coordinates h = Dense(256, activation='relu')(input_coords) h = Dropout(p=0.25)(h) h = Dense(256, activation='relu')(h) h = Dropout(p=0.25)(h) h = Dense(256, activation='relu')(h) # merge feature vectors from crop and coords merged = merge([crop_encoded, h], mode='concat') # decoding into output coordinates h = Dense(1024, activation='relu')(merged) h = Dropout(p=0.25)(h) h = Dense(1024, activation='relu')(h) h = Dropout(p=0.25)(h) h = Dense(512, activation='relu')(h) h = Dropout(p=0.25)(h) h = Dense(256, activation='relu')(h) h = Dropout(p=0.25)(h) h = Dense(128, activation='relu')(h) h = Dropout(p=0.25)(h) output_coords = Dense(4, activation='tanh')(h) model = Model(input=[input_coords, input_crop], output=output_coords) if summary: model.summary() return model ================================================ FILE: utils.py ================================================ import numpy as np import cv2 class RelativeRectangle: """ 2D Rectangle defined by top-left and bottom-right corners. NOTICE: COORDS ARE EXPRESSED IN TERMS OF PERCENTAGE OF SCREEN W AND H Parameters ---------- x_min : float x coordinate of top-left corner. y_min : float y coordinate of top-left corner. x_max : float x coordinate of bottom-right corner. y_min : float y coordinate of bottom-right corner. """ def __init__(self, x_min, y_min, x_max, y_max): self.x_min = x_min self.y_min = y_min self.x_max = x_max self.y_max = y_max self.x_side = self.x_max - self.x_min self.y_side = self.y_max - self.y_min def draw(self, frame, color=255, thickness=1): """ Draw Rectangle on a given frame. Notice: while this function does not return anything, original image `frame` is modified. Parameters ---------- frame : 2D / 3D np.array The image on which the rectangle is drawn. color : tuple, optional Color used to draw the rectangle (default = 255) thickness : int, optional Line thickness used t draw the rectangle (default = 1) Returns ------- None """ h, w = frame.shape[:2] # convert back from relative coordinates to frame coordinates x_min = int(self.x_min * w) y_min = int(self.y_min * h) x_max = int(self.x_max * w) y_max = int(self.y_max * h) cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color, thickness) def get_mask(self, mask_shape): """ Return the foreground mask of the rectangle. Parameters ---------- mask_shape : tuple Tuple (height, width) that defines the shape of the fg_mask Returns ------- fg_mask : ndarray Foreground mask of this RelativeRectangle RelativeRectangle has relative coordinates, so the fg_mask shape must be passed as parameter. """ h, w = mask_shape fg_mask = np.zeros(shape=(h, w), dtype=np.float32) # convert back from relative coordinates to frame coordinates x_min = int(self.x_min * w) y_min = int(self.y_min * h) x_max = int(self.x_max * w) y_max = int(self.y_max * h) fg_mask[y_min:y_max, x_min:x_max] = 1. return fg_mask @property def tl_corner(self): """ Coordinates of the top-left corner of rectangle (as float32). Returns ------- tl_corner : float32 tuple """ return tuple(map(np.float32, (self.x_min, self.y_min))) @property def br_corner(self): """ Coordinates of the bottom-right corner of rectangle. Returns ------- br_corner : float32 tuple """ return tuple(map(np.float32, (self.x_max, self.y_max))) @property def coords(self): """ Coordinates (x_min, y_min, x_max, y_max) which define the Rectangle. Returns ------- coordinates : float32 tuple """ return tuple(map(np.float32, (self.x_min, self.y_min, self.x_max, self.y_max))) @property def area(self): """ Get the area of Rectangle Returns ------- area : float32 """ return np.float32(self.x_side * self.y_side) def imagenet_mean_bgr(frame_bgr, op='subtract'): """ Add or subtract ImageNet mean pixel value from a given BGR frame. """ imagenet_mean_BGR = np.array([123.68, 116.779, 103.939]) frame_bgr = np.float32(frame_bgr) for c in range(0, 3): if op == 'subtract': frame_bgr[:, :, c] -= imagenet_mean_BGR[c] elif op == 'add': frame_bgr[:, :, c] += imagenet_mean_BGR[c] return frame_bgr def stitch_together(input_images, layout, resize_dim=None, off_x=None, off_y=None, bg_color=(0, 0, 0)): """ Stitch together N input images into a bigger frame, using a grid layout. Input images can be either color or grayscale, but must all have the same size. :param input_images: list of input images :param layout: grid layout expressed (rows, cols) of the stitch :param resize_dim: if not None, stitch is resized to this size :param off_x: offset between stitched images along x axis :param off_y: offset between stitched images along y axis :param bg_color: color used for background :return: stitch of input images """ if len(set([img.shape for img in input_images])) > 1: raise ValueError('All images must have the same shape') # determine if input images are color (3 channels) or grayscale (single channel) if len(input_images[0].shape) == 2: mode = 'grayscale' img_h, img_w = input_images[0].shape elif len(input_images[0].shape) == 3: mode = 'color' img_h, img_w, img_c = input_images[0].shape else: raise ValueError('Unknown shape for input images') # if no offset is provided, set to 10% of image size if off_x is None: off_x = img_w // 10 if off_y is None: off_y = img_h // 10 # create stitch mask rows, cols = layout stitch_h = rows * img_h + (rows + 1) * off_y stitch_w = cols * img_w + (cols + 1) * off_x if mode == 'color': bg_color = np.array(bg_color)[None, None, :] # cast to ndarray add singleton dimensions stitch = np.uint8(np.repeat(np.repeat(bg_color, stitch_h, axis=0), stitch_w, axis=1)) elif mode == 'grayscale': stitch = np.zeros(shape=(stitch_h, stitch_w), dtype=np.uint8) for r in range(0, rows): for c in range(0, cols): list_idx = r * cols + c if list_idx < len(input_images): if mode == 'color': stitch[ r * (off_y + img_h) + off_y: r*(off_y+img_h) + off_y + img_h, c * (off_x + img_w) + off_x: c * (off_x + img_w) + off_x + img_w, :] = input_images[list_idx] elif mode == 'grayscale': stitch[ r * (off_y + img_h) + off_y: r*(off_y+img_h) + off_y + img_h, c * (off_x + img_w) + off_x: c * (off_x + img_w) + off_x + img_w]\ = input_images[list_idx] if resize_dim: stitch = cv2.resize(stitch, dsize=(resize_dim[::-1])) return stitch def show_prediction(frontal_image, birdeye_image, x_coords, y_coords, y_pred_coords): """ Display network prediction. Parameters ---------- frontal_image : ndarray Frame taken from dashboard camera view birdeye_image : ndarray Frame taken from bird's eye view x_coords : list Coords of vehicle in the frontal view y_coords : list Coords of vehicle in the bird's eye view (GT) y_pred_coords : list Coords of vehicle in the bird's eye view (pred) Returns ------- None """ birdeye_image_pred = birdeye_image.copy() birdeye_image_true = birdeye_image.copy() bbox_frontal = RelativeRectangle(*[x_coords[j] for j in range(0, 4)]) # cast back from tanh range (-1, 1) to (0, 1) bbox_pred = RelativeRectangle(*[((y_pred_coords[j] * 0.5) + 0.5) for j in range(0, 4)]) bbox_true = RelativeRectangle(*[((y_coords[j] * 0.5) + 0.5) for j in range(0, 4)]) # draw bounding boxes bbox_frontal.draw(frontal_image, color=(0, 0, 255), thickness=6) bbox_pred.draw(birdeye_image_pred, color=(0, 0, 255), thickness=6) bbox_true.draw(birdeye_image_true, color=(0, 0, 255), thickness=6) # stitch frames for showing stitch = stitch_together([frontal_image, birdeye_image_pred, birdeye_image_true], layout=(1, 3), resize_dim=(300, 1800)) cv2.imshow('Dashboard view | Birdeye Prediction | Birdeye GT', stitch) cv2.waitKey(0)