[
  {
    "path": ".gitignore",
    "content": "*.pyc\n.idea/"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2017 Andrea Palazzi\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# Learning to Map Vehicles into Bird's Eye View\n\n<p align=\"center\">\n  <img src=\"img/task_overview.png\" height=\"180\">\n</p>\n\nThis code accompanies the paper *[\"Learning to map surrounding vehicles into bird's eye view using synthetic data\"](https://arxiv.org/pdf/1706.08442.pdf)*.\n\nIt contains the code for loading data and pre-trained SDPN model proposed in the paper.\n\n## How-to-run\n\nScript entry-point is in **[main.py](main.py)**. \n\nWhen **[main.py](main.py)** is run, *pretrained weights* are automatically downloaded and injected in the **[model](model.py)**.\n\nModel is then used to perform and inference on a sample data, mapping a car from the dashboard camera view to the bird's eye view of the scene. If everything works correctly, the output should look like this.\n\n<p align=\"center\">\n  <img src=\"img/helloworld.PNG\">\n</p>\n\n#### Dependencies\nThe code was developed with the following configuration:\n* python 2.7.11\n* numpy 1.11.2\n* opencv 3.1.0\n* Theano 0.9.0.dev3\n* Keras 1.1.2\n\nOther configuration will reasonably work, but have never been explicitly tested.\n\n## Dataset \nIn this repository only one example is provided, to the end of verifying that the model is working correctly.\n\nThe **whole dataset**, which comprises more than **1M** couples of bounding boxes, can be found <a href=\"http://imagelab.ing.unimore.it/imagelab/page.asp?IdPage=19\" target=\"_blank\"><b>here</b></a>.\n\nTo get an idea of how the data look like you can check [this video](https://www.youtube.com/watch?v=t2mXv9j6LNw).\n"
  },
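  {
    "path": "requirements.txt",
    "content": "# Hypothetical pin file added for convenience: the original release did not ship one.\n# Versions mirror the configuration listed in README.md (Python 2.7.11); the Theano\n# dev build and OpenCV 3.1.0 may need to be installed from source or via conda rather\n# than from PyPI.\nnumpy==1.11.2\nTheano==0.9.0.dev3\nKeras==1.1.2\n"
  },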
  {
    "path": "data/sample_data.txt",
    "content": "004190.jpg, 004190_b.jpg, 000694048, 004067225593, 20.367, 229.760, 0.535, 0.503, 0.618, 0.544, 0.549, 0.021, 0.679, 0.190\n"
  },
  {
    "path": "load_data.py",
    "content": "import numpy as np\nimport cv2\nimport os.path as path\nfrom utils import imagenet_mean_bgr\n\n\ndef convert_from_relative_to_absolute(h, w, x_min, y_min, x_max, y_max):\n    \"\"\"\n    Convert from relative coordinates (range 0, 1) to absolute coordinates given a frame (range h, w)\n\n    Parameters\n    ----------\n    h : int\n        Image height\n    w : int\n        Image width\n    x_min : float\n        X coordinate of top-left corner (in range 0, 1)\n    y_min : float\n        Y coordinate of top-left corner (in range 0, 1)\n    x_max : float\n        X coordinate of bottom-right corner (in range 0, 1)\n    y_max : float\n        Y coordinate of bottom-right corner (in range 0, 1)\n\n    Returns\n    -------\n    coords : list\n        Input coordinates casted to image size -> range (0, h) and (0, w)\n    \"\"\"\n    x_min = x_min * w\n    y_min = y_min * h\n    x_max = x_max * w\n    y_max = y_max * h\n    return map(np.int32, [x_min, y_min, x_max, y_max])\n\n\ndef extract_crop(frame, x_min, y_min, x_max, y_max):\n    \"\"\"\n    Extract vehicle crop from the image.\n    Crop is resized to 224x224 which is ResNet input size.\n\n    Parameters\n    ----------\n    frame : ndarray\n        Image to process\n    x_min : float\n        X coordinate of top-left corner (in range 0, 1)\n    y_min : float\n        Y coordinate of top-left corner (in range 0, 1)\n    x_max : float\n        X coordinate of bottom-right corner (in range 0, 1)\n    y_max : float\n        Y coordinate of bottom-right corner (in range 0, 1)\n\n    Returns\n    -------\n    crop : ndarray\n        Crop containing vehicle, resized to 224x224 pixel\n    \"\"\"\n    h, w = frame.shape[:2]\n\n    x_min, y_min, x_max, y_max = convert_from_relative_to_absolute(h, w, x_min, y_min, x_max, y_max)\n\n    # extract crop from frame\n    crop = frame[y_min:y_max, x_min:x_max, :].copy()\n\n    crop = cv2.resize(crop, (224, 224))\n\n    return crop\n\n\ndef get_sample_batch(data_dir):\n    \"\"\"\n    Load sample data useful for model \"hello world\".\n    \"\"\"\n    X_coords, X_images, X_crops, X_images_original = [], [], [], []\n    Y_coords, Y_images, Y_crops, Y_dist, Y_yaw = [], [], [], [], []\n\n    with open(path.join(data_dir,'sample_data.txt'), 'rb') as f:\n        logs = f.readlines()\n\n        for log in logs:\n\n            # retrieve line values\n            log = log.strip().split(',')\n\n            # parse a log line\n            frame_f, frame_b = log[:2]\n            bbox_id, bbox_model = log[2:4]\n            bbox_dist, bbox_yaw = map(np.float32, log[4:6])\n            coords_frontal = map(np.float32, log[6:10])\n            coords_birdeye = map(np.float32, log[10:])\n\n            # load images\n            frame_frontal_path = path.join(data_dir, frame_f.strip())\n            frame_birdeye_path = path.join(data_dir, frame_b.strip())\n            if not path.exists(frame_frontal_path) or not path.exists(frame_birdeye_path): continue\n            frame_frontal = cv2.imread(frame_frontal_path, cv2.IMREAD_COLOR)\n            frame_birdeye = cv2.imread(frame_birdeye_path, cv2.IMREAD_COLOR)\n\n            # extract crops from whole frames\n            crop_frontal = extract_crop(frame_frontal, *coords_frontal)\n            crop_birdeye = extract_crop(frame_birdeye, *coords_birdeye)\n\n            if crop_frontal is not None and crop_birdeye is not None:\n\n                # convert from (0, 1) to tanh range (-1, 1)\n                coords_birdeye = [2 * (c - 0.5) for c in coords_birdeye]\n\n   
             # append all needed stuff to output structures\n                X_coords.append(coords_frontal)  # append frontal coords\n                X_crops.append(crop_frontal)  # append frontal crops\n                X_images.append(frame_frontal)  # append frontal frames\n                X_images_original.append(frame_frontal)  # append frontal frames\n                Y_coords.append(coords_birdeye)  # append birdeye coords\n                Y_crops.append(crop_birdeye)  # append birdeye crops\n                Y_images.append(frame_birdeye)  # append birdeye frames\n                Y_dist.append(bbox_dist)  # append bbox distance\n                Y_yaw.append(bbox_yaw)  # append bbox yaw\n\n    # preprocess X crops by subtracting mean and put channels first\n    for b in range(0, len(X_coords)):\n        X_crops[b] = imagenet_mean_bgr(frame_bgr=X_crops[b], op='subtract').transpose(2, 0, 1)\n\n    # convert all stuff to ndarray\n    X_coords, Y_coords = np.array(X_coords), np.array(Y_coords)\n    X_crops, Y_crops = np.array(X_crops), np.array(Y_crops)\n    X_images, Y_images = np.array(X_images), np.array(Y_images)\n    Y_dist, Y_yaw = np.array(Y_dist), np.array(Y_yaw)\n    X_images_original = np.array(X_images_original)\n\n    return X_coords, X_crops, X_images, X_images_original, Y_coords, Y_crops, Y_images, Y_dist, Y_yaw\n\n"
  },
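  {
    "path": "examples/coords_demo.py",
    "content": "\"\"\"\nIllustrative sketch (hypothetical file, not part of the original release).\n\nShows the two coordinate conversions used throughout the repo: relative (0, 1)\n-> absolute pixel coordinates (see load_data.convert_from_relative_to_absolute)\nand the (0, 1) <-> tanh (-1, 1) round trip applied to bird's eye coordinates in\nload_data.get_sample_batch and utils.show_prediction.\n\"\"\"\nimport numpy as np\n\nh, w = 720, 1280  # example frame size (assumed, any size works)\n\n# relative bbox (x_min, y_min, x_max, y_max); values taken from data/sample_data.txt\nrel = [0.535, 0.503, 0.618, 0.544]\n\n# relative -> absolute pixel coordinates, as in convert_from_relative_to_absolute\nabs_coords = [np.int32(rel[0] * w), np.int32(rel[1] * h),\n              np.int32(rel[2] * w), np.int32(rel[3] * h)]\nprint(abs_coords)  # [684, 362, 791, 391]\n\n# (0, 1) -> tanh range (-1, 1), as done on Y_coords in get_sample_batch\ntanh_coords = [2 * (c - 0.5) for c in rel]\n\n# ... and back, as done in show_prediction before drawing\nrel_again = [(c * 0.5) + 0.5 for c in tanh_coords]\nassert np.allclose(rel, rel_again)\n"
  },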
  {
    "path": "main.py",
    "content": "from model import SDPN\nfrom keras.optimizers import Adam\nfrom load_data import get_sample_batch\nfrom utils import show_prediction\nfrom keras.utils.data_utils import get_file\n\n\nTH_WEIGHTS_PATH = 'http://imagelab.ing.unimore.it/files/pretrained_models/keras/SPDN_w.hdf5'\n\n\nif __name__ == '__main__':\n\n    # Get model\n    model = SDPN(summary=True)\n\n    # Download pre-trained weights\n    pretrained_weights_path = get_file('SPDN_w.h5', TH_WEIGHTS_PATH, cache_subdir='models')\n\n    # Load pre-trained weights\n    model.load_weights(pretrained_weights_path)\n    model.compile(optimizer=Adam(lr=0.001, beta_1=0.9, beta_2=0.999), loss='mse')\n\n    # Load sample batch\n    X_coords, X_crops, X_images, X_images_original, Y_coords, Y_crops, Y_images, Y_dist, Y_yaw = get_sample_batch('data')\n\n    # Perform prediction given (vehicle_coords, vehicle_crop) in dashboard camera view\n    Y_pred = model.predict([X_coords, X_crops])\n\n    # Display sample prediction\n    for b in range(len(X_coords)):\n        show_prediction(X_images_original[b], Y_images[b], X_coords[b], Y_coords[b], Y_pred[b])\n"
  },
  {
    "path": "model.py",
    "content": "from keras.models import Model\nfrom keras.layers import Dense, Input, Dropout, Reshape, merge\nfrom keras.applications import ResNet50\n\n\ndef SDPN(summary=True):\n    \"\"\"\n    Create and return Semantic-aware Dense Prediction Network.\n\n    Parameters\n    ----------\n    summary : bool\n        If True, network summary is printed to stout.\n\n    Returns\n    -------\n    model : keras Model\n        Model of SDPN\n\n    \"\"\"\n    input_coords = Input(shape=(4,))\n    input_crop = Input(shape=(3, 224, 224))\n\n    # extract feature from image crop\n    resnet = ResNet50(include_top=False, weights='imagenet')\n    for layer in resnet.layers:  # set resnet as non-trainable\n        layer.trainable = False\n\n    crop_encoded = resnet(input_crop)  # shape of `crop_encoded` is 2018x1x1\n    crop_encoded = Reshape(target_shape=(2048,))(crop_encoded)\n\n    # encode input coordinates\n    h = Dense(256, activation='relu')(input_coords)\n    h = Dropout(p=0.25)(h)\n    h = Dense(256, activation='relu')(h)\n    h = Dropout(p=0.25)(h)\n    h = Dense(256, activation='relu')(h)\n\n    # merge feature vectors from crop and coords\n    merged = merge([crop_encoded, h], mode='concat')\n\n    # decoding into output coordinates\n    h = Dense(1024, activation='relu')(merged)\n    h = Dropout(p=0.25)(h)\n    h = Dense(1024, activation='relu')(h)\n    h = Dropout(p=0.25)(h)\n    h = Dense(512, activation='relu')(h)\n    h = Dropout(p=0.25)(h)\n    h = Dense(256, activation='relu')(h)\n    h = Dropout(p=0.25)(h)\n    h = Dense(128, activation='relu')(h)\n    h = Dropout(p=0.25)(h)\n\n    output_coords = Dense(4, activation='tanh')(h)\n\n    model = Model(input=[input_coords, input_crop], output=output_coords)\n\n    if summary:\n        model.summary()\n\n    return model"
  },
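  {
    "path": "examples/sdpn_shapes_demo.py",
    "content": "\"\"\"\nIllustrative sketch (hypothetical file, not part of the original release).\n\nFeeds random data through SDPN to verify input/output shapes. Assumes the\nTheano / Keras 1 configuration from the README, i.e. channels-first image\nordering (3, 224, 224). Note that building the model downloads the ImageNet\nResNet50 weights on first run.\n\"\"\"\nimport numpy as np\nfrom model import SDPN\n\nmodel = SDPN(summary=False)\n\nbatch_size = 2\ndummy_coords = np.random.rand(batch_size, 4).astype(np.float32)           # frontal bbox, range (0, 1)\ndummy_crops = np.random.rand(batch_size, 3, 224, 224).astype(np.float32)  # mean-subtracted BGR crop\n\npred = model.predict([dummy_coords, dummy_crops])\nprint(pred.shape)  # (2, 4): bird's eye bbox in tanh range (-1, 1)\n"
  },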
  {
    "path": "utils.py",
    "content": "import numpy as np\nimport cv2\n\n\nclass RelativeRectangle:\n    \"\"\"\n    2D Rectangle defined by top-left and bottom-right corners.\n    NOTICE: COORDS ARE EXPRESSED IN TERMS OF PERCENTAGE OF SCREEN W AND H\n\n    Parameters\n    ----------\n    x_min : float\n        x coordinate of top-left corner.\n    y_min : float\n        y coordinate of top-left corner.\n    x_max : float\n        x coordinate of bottom-right corner.\n    y_min : float\n        y coordinate of bottom-right corner.\n    \"\"\"\n\n    def __init__(self, x_min, y_min, x_max, y_max):\n        self.x_min = x_min\n        self.y_min = y_min\n        self.x_max = x_max\n        self.y_max = y_max\n\n        self.x_side = self.x_max - self.x_min\n        self.y_side = self.y_max - self.y_min\n\n    def draw(self, frame, color=255, thickness=1):\n        \"\"\"\n        Draw Rectangle on a given frame.\n\n        Notice: while this function does not return anything, original image `frame` is modified.\n\n        Parameters\n        ----------\n        frame : 2D / 3D np.array\n            The image on which the rectangle is drawn.\n        color : tuple, optional\n            Color used to draw the rectangle (default = 255)\n        thickness : int, optional\n            Line thickness used t draw the rectangle (default = 1)\n\n        Returns\n        -------\n        None\n        \"\"\"\n\n        h, w = frame.shape[:2]\n\n        # convert back from relative coordinates to frame coordinates\n        x_min = int(self.x_min * w)\n        y_min = int(self.y_min * h)\n        x_max = int(self.x_max * w)\n        y_max = int(self.y_max * h)\n\n        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color, thickness)\n\n    def get_mask(self, mask_shape):\n        \"\"\"\n        Return the foreground mask of the rectangle.\n\n        Parameters\n        ----------\n        mask_shape : tuple\n            Tuple (height, width) that defines the shape of the fg_mask\n\n        Returns\n        -------\n        fg_mask : ndarray\n            Foreground mask of this RelativeRectangle\n\n        RelativeRectangle has relative coordinates, so the fg_mask shape must be passed as parameter.\n        \"\"\"\n        h, w = mask_shape\n\n        fg_mask = np.zeros(shape=(h, w), dtype=np.float32)\n\n        # convert back from relative coordinates to frame coordinates\n        x_min = int(self.x_min * w)\n        y_min = int(self.y_min * h)\n        x_max = int(self.x_max * w)\n        y_max = int(self.y_max * h)\n\n        fg_mask[y_min:y_max, x_min:x_max] = 1.\n\n        return fg_mask\n\n    @property\n    def tl_corner(self):\n        \"\"\"\n        Coordinates of the top-left corner of rectangle (as float32).\n\n        Returns\n        -------\n        tl_corner : float32 tuple\n        \"\"\"\n        return tuple(map(np.float32, (self.x_min, self.y_min)))\n\n    @property\n    def br_corner(self):\n        \"\"\"\n        Coordinates of the bottom-right corner of rectangle.\n\n        Returns\n        -------\n        br_corner : float32 tuple\n        \"\"\"\n        return tuple(map(np.float32, (self.x_max, self.y_max)))\n\n    @property\n    def coords(self):\n        \"\"\"\n        Coordinates (x_min, y_min, x_max, y_max) which define the Rectangle.\n\n        Returns\n        -------\n        coordinates : float32 tuple\n        \"\"\"\n        return tuple(map(np.float32, (self.x_min, self.y_min, self.x_max, self.y_max)))\n\n    @property\n    def area(self):\n        \"\"\"\n        Get the area 
of Rectangle\n\n        Returns\n        -------\n        area : float32\n        \"\"\"\n        return np.float32(self.x_side * self.y_side)\n\n\ndef imagenet_mean_bgr(frame_bgr, op='subtract'):\n    \"\"\"\n    Add or subtract ImageNet mean pixel value from a given BGR frame.\n    \"\"\"\n    imagenet_mean_BGR = np.array([123.68, 116.779, 103.939])\n\n    frame_bgr = np.float32(frame_bgr)\n\n    for c in range(0, 3):\n        if op == 'subtract': frame_bgr[:, :, c] -= imagenet_mean_BGR[c]\n        elif op == 'add':    frame_bgr[:, :, c] += imagenet_mean_BGR[c]\n\n    return frame_bgr\n\n\ndef stitch_together(input_images, layout, resize_dim=None, off_x=None, off_y=None, bg_color=(0, 0, 0)):\n    \"\"\"\n    Stitch together N input images into a bigger frame, using a grid layout.\n    Input images can be either color or grayscale, but must all have the same size.\n    :param input_images: list of input images\n    :param layout: grid layout expressed (rows, cols) of the stitch\n    :param resize_dim: if not None, stitch is resized to this size\n    :param off_x: offset between stitched images along x axis\n    :param off_y: offset between stitched images along y axis\n    :param bg_color: color used for background\n    :return: stitch of input images\n    \"\"\"\n\n    if len(set([img.shape for img in input_images])) > 1:\n        raise ValueError('All images must have the same shape')\n\n    # determine if input images are color (3 channels) or grayscale (single channel)\n    if len(input_images[0].shape) == 2:\n        mode = 'grayscale'\n        img_h, img_w = input_images[0].shape\n    elif len(input_images[0].shape) == 3:\n        mode = 'color'\n        img_h, img_w, img_c = input_images[0].shape\n    else:\n        raise ValueError('Unknown shape for input images')\n\n    # if no offset is provided, set to 10% of image size\n    if off_x is None:\n        off_x = img_w // 10\n    if off_y is None:\n        off_y = img_h // 10\n\n    # create stitch mask\n    rows, cols = layout\n    stitch_h = rows * img_h + (rows + 1) * off_y\n    stitch_w = cols * img_w + (cols + 1) * off_x\n    if mode == 'color':\n        bg_color = np.array(bg_color)[None, None, :]  # cast to ndarray add singleton dimensions\n        stitch = np.uint8(np.repeat(np.repeat(bg_color, stitch_h, axis=0), stitch_w, axis=1))\n    elif mode == 'grayscale':\n        stitch = np.zeros(shape=(stitch_h, stitch_w), dtype=np.uint8)\n\n    for r in range(0, rows):\n        for c in range(0, cols):\n\n            list_idx =  r * cols + c\n\n            if list_idx < len(input_images):\n                if mode == 'color':\n                    stitch[ r * (off_y + img_h) + off_y: r*(off_y+img_h) + off_y + img_h,\n                            c * (off_x + img_w) + off_x: c * (off_x + img_w) + off_x + img_w,\n                            :] = input_images[list_idx]\n                elif mode == 'grayscale':\n                    stitch[ r * (off_y + img_h) + off_y: r*(off_y+img_h) + off_y + img_h,\n                            c * (off_x + img_w) + off_x: c * (off_x + img_w) + off_x + img_w]\\\n                        = input_images[list_idx]\n\n    if resize_dim:\n        stitch = cv2.resize(stitch, dsize=(resize_dim[::-1]))\n\n    return stitch\n\n\ndef show_prediction(frontal_image, birdeye_image, x_coords, y_coords, y_pred_coords):\n    \"\"\"\n    Display network prediction.\n\n    Parameters\n    ----------\n\n    frontal_image : ndarray\n        Frame taken from dashboard camera view\n    birdeye_image : ndarray\n        
Frame taken from bird's eye view\n    x_coords : list\n        Coords of vehicle in the frontal view\n    y_coords : list\n        Coords of vehicle in the bird's eye view (GT)\n    y_pred_coords : list\n        Coords of vehicle in the bird's eye view (pred)\n\n    Returns\n    -------\n    None\n    \"\"\"\n    birdeye_image_pred = birdeye_image.copy()\n    birdeye_image_true = birdeye_image.copy()\n\n    bbox_frontal = RelativeRectangle(*[x_coords[j] for j in range(0, 4)])\n\n    # cast back from tanh range (-1, 1) to (0, 1)\n    bbox_pred = RelativeRectangle(*[((y_pred_coords[j] * 0.5) + 0.5) for j in range(0, 4)])\n    bbox_true = RelativeRectangle(*[((y_coords[j] * 0.5) + 0.5) for j in range(0, 4)])\n\n    # draw bounding boxes\n    bbox_frontal.draw(frontal_image, color=(0, 0, 255), thickness=6)\n    bbox_pred.draw(birdeye_image_pred, color=(0, 0, 255), thickness=6)\n    bbox_true.draw(birdeye_image_true, color=(0, 0, 255), thickness=6)\n\n    # stitch frames for showing\n    stitch = stitch_together([frontal_image, birdeye_image_pred, birdeye_image_true],\n                             layout=(1, 3), resize_dim=(300, 1800))\n    cv2.imshow('Dashboard view | Birdeye Prediction | Birdeye GT', stitch)\n    cv2.waitKey(0)"
  }
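  ,
  {
    "path": "examples/rectangle_demo.py",
    "content": "\"\"\"\nIllustrative sketch (hypothetical file, not part of the original release).\n\nBasic usage of utils.RelativeRectangle: drawing a relative-coordinate bounding\nbox onto a frame and rasterizing its foreground mask.\n\"\"\"\nimport numpy as np\nfrom utils import RelativeRectangle\n\nrect = RelativeRectangle(x_min=0.25, y_min=0.25, x_max=0.75, y_max=0.5)\n\n# draw on a black 640x480 frame; `draw` modifies the frame in place\nframe = np.zeros(shape=(480, 640, 3), dtype=np.uint8)\nrect.draw(frame, color=(0, 0, 255), thickness=2)\n\n# rasterize the rectangle as a binary mask of the same size\nmask = rect.get_mask(mask_shape=(480, 640))\n\nprint(rect.coords)  # (0.25, 0.25, 0.75, 0.5)\nprint(rect.area)    # 0.125, i.e. x_side * y_side in relative units\nprint(mask.sum())   # 38400.0 = (0.5 * 640) * (0.25 * 480) pixels set to 1\n"
  }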
]