Repository: Tandon-A/emotic Branch: master Commit: 862a1cad4825 Files: 15 Total size: 168.3 KB Directory structure: gitextract_0dll9oq4/ ├── Colab_train_emotic.ipynb ├── LICENSE ├── README.md ├── emotic.py ├── emotic_dataset.py ├── inference.py ├── loss.py ├── main.py ├── mat2py.py ├── prepare_models.py ├── sample_inference_list.txt ├── test.py ├── train.py ├── yolo_inference.py └── yolo_utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: Colab_train_emotic.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "emotic.ipynb", "provenance": [], "collapsed_sections": [], "authorship_tag": "ABX9TyNTRS+z3BPWqTSv2PkmiNrg", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "a8cada3fef3846b2bffe52edacbc190d": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_ae34432e333e4671b3f7f934de91027b", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_674e1fd300d042cbaf9f0e53e7ac4ecd", "IPY_MODEL_88b121247db64a3490f8c1b16f68c696", "IPY_MODEL_57ffb85897da4061b318afddca2eed81" ] } }, "ae34432e333e4671b3f7f934de91027b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "674e1fd300d042cbaf9f0e53e7ac4ecd": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_9cb235dbbe234dbe805b4aa00f7d54ae", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": "100%", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_5e048b1fa84146c8bd2b63a19239cb9e" } }, "88b121247db64a3490f8c1b16f68c696": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_e6a413c0b59f466b9213c1904b1f57f8", "_dom_classes": [], "description": "", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 46830571, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 46830571, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_e0d0abfa1e9441f58722b064823c8119" } }, "57ffb85897da4061b318afddca2eed81": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_a1bbd4436c154378839f58483fa8c261", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 44.7M/44.7M [00:00<00:00, 134MB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_4f6e592ca3f34209af0ae78a635fc346" } }, "9cb235dbbe234dbe805b4aa00f7d54ae": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "5e048b1fa84146c8bd2b63a19239cb9e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "e6a413c0b59f466b9213c1904b1f57f8": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "e0d0abfa1e9441f58722b064823c8119": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "a1bbd4436c154378839f58483fa8c261": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "4f6e592ca3f34209af0ae78a635fc346": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } } } } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "_5Xan2tnR89K" }, "source": [ "

Emotions in context (Emotic)

\n", "
Using context information to recognize emotions in images
" ] }, { "cell_type": "markdown", "metadata": { "id": "rbCWI0rkt8yp" }, "source": [ "

Project context

\n", "\n", "Humans use their facial features or expressions to convey how they feel, such as a person may smile when happy and scowl when angry. Historically, computer vision research has focussed on analyzing and learning these facial features to recognize emotions. \n", "However, these facial features are not universal and vary extensively across cultures and situations. \n", "\n", "
\n", " \n", "
Fig 1: a) (Facial feature) The person looks angry or in pain b) (Whole scene) The person looks elated.
\n", "
\n", "\n", "\n", "A scene context, as shown in the figure above, can provide additional information about the situations. This project explores the use of context in recognizing emotions in images. \n", "\n", "This project uses the EMOTIC dataset and follows the methodology as introduced in the paper 'Context based emotion recognition using EMOTIC dataset'." ] }, { "cell_type": "code", "metadata": { "id": "1YFaW8HlNWnE", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "7cc564d6-4503-4b5a-bac8-a4fe0bdbcb65" }, "source": [ "# Linking Google drive to use preprocessed data \n", "from google.colab import drive\n", "\n", "# This will prompt for authorization.\n", "drive.mount('/content/drive')\n", "#/content/drive/My Drive//" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ "Mounted at /content/drive\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "FhzX7KUihZqu" }, "source": [ "# I. Prepare places pretrained model" ] }, { "cell_type": "code", "metadata": { "id": "uYgeeri3wdCM", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "59be98ac-4cc9-403c-e116-bac36e368e8b" }, "source": [ "# Get Resnet18 model trained on places dataset. \n", "!mkdir ./places\n", "!wget http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar -O ./places/resnet18_places365.pth.tar" ], "execution_count": 2, "outputs": [ { "output_type": "stream", "text": [ "--2021-08-17 17:32:18-- http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar\n", "Resolving places2.csail.mit.edu (places2.csail.mit.edu)... 128.30.195.26\n", "Connecting to places2.csail.mit.edu (places2.csail.mit.edu)|128.30.195.26|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 45506139 (43M) [application/x-tar]\n", "Saving to: ‘./places/resnet18_places365.pth.tar’\n", "\n", "./places/resnet18_p 100%[===================>] 43.40M 24.3MB/s in 1.8s \n", "\n", "2021-08-17 17:32:20 (24.3 MB/s) - ‘./places/resnet18_places365.pth.tar’ saved [45506139/45506139]\n", "\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "RhWL6Qi_w4qp", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "4803750e-9487-4589-ef86-d8244ed698ca" }, "source": [ "# Saving the model weights to use ahead in the notebook\n", "import torch\n", "from torch.autograd import Variable as V\n", "import torchvision.models as models\n", "from PIL import Image\n", "from torchvision import transforms as trn\n", "from torch.nn import functional as F\n", "import os\n", "\n", "# the architecture to use\n", "arch = 'resnet18'\n", "model_weight = os.path.join('./places', 'resnet18_places365.pth.tar')\n", "\n", "# create the network architecture\n", "model = models.__dict__[arch](num_classes=365)\n", "\n", "#model_weight = '%s_places365.pth.tar' % arch\n", "\n", "checkpoint = torch.load(model_weight, map_location=lambda storage, loc: storage) # model trained in GPU could be deployed in CPU machine like this!\n", "state_dict = {str.replace(k,'module.',''): v for k,v in checkpoint['state_dict'].items()} # the data parallel layer will add 'module' before each layer name\n", "model.load_state_dict(state_dict)\n", "model.eval()\n", "\n", "model.cpu()\n", "torch.save(model.state_dict(), './places/resnet18_state_dict.pth')\n", "print ('completed cell')" ], "execution_count": 3, "outputs": [ { "output_type": "stream", "text": [ "completed cell\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "ykNjfrUuhpbq" }, "source": [ "# II. General imports" ] }, { "cell_type": "code", "metadata": { "id": "vi-O8QgwvOQY", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "6f5857a3-f3af-4dbb-dd7f-8539fab5b9e7" }, "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import os\n", "from PIL import Image\n", "import scipy.io\n", "from sklearn.metrics import average_precision_score, precision_recall_curve\n", "\n", "import torch \n", "import torch.nn as nn \n", "import torch.nn.functional as F\n", "import torch.optim as optim \n", "from torch.utils.data import Dataset, DataLoader \n", "from torchsummary import summary\n", "from torchvision import transforms\n", "import torchvision.models as models\n", "from torch.optim.lr_scheduler import StepLR\n", "\n", "print ('completed cell')" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "text": [ "completed cell\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "AD0pBBBYh2vW" }, "source": [ "# III. Emotic classes" ] }, { "cell_type": "markdown", "metadata": { "id": "ZfPKerg4TWkR" }, "source": [ "## Emotic Model " ] }, { "cell_type": "code", "metadata": { "id": "ZWt88EcJVu0c", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "cd2365da-0d45-4616-800c-e1fd6f565c29" }, "source": [ "class Emotic(nn.Module):\n", " ''' Emotic Model'''\n", " def __init__(self, num_context_features, num_body_features):\n", " super(Emotic,self).__init__()\n", " self.num_context_features = num_context_features\n", " self.num_body_features = num_body_features\n", " self.fc1 = nn.Linear((self.num_context_features + num_body_features), 256)\n", " self.bn1 = nn.BatchNorm1d(256)\n", " self.d1 = nn.Dropout(p=0.5)\n", " self.fc_cat = nn.Linear(256, 26)\n", " self.fc_cont = nn.Linear(256, 3)\n", " self.relu = nn.ReLU()\n", "\n", " \n", " def forward(self, x_context, x_body):\n", " context_features = x_context.view(-1, self.num_context_features)\n", " body_features = x_body.view(-1, self.num_body_features)\n", " fuse_features = torch.cat((context_features, body_features), 1)\n", " fuse_out = self.fc1(fuse_features)\n", " fuse_out = self.bn1(fuse_out)\n", " fuse_out = self.relu(fuse_out)\n", " fuse_out = self.d1(fuse_out) \n", " cat_out = self.fc_cat(fuse_out)\n", " cont_out = self.fc_cont(fuse_out)\n", " return cat_out, cont_out\n", "\n", "print ('completed cell')" ], "execution_count": 5, "outputs": [ { "output_type": "stream", "text": [ "completed cell\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "zdzZGj6AxLaC" }, "source": [ "## Emotic Dataset" ] }, { "cell_type": "code", "metadata": { "id": "eKG5dNMXxlnm", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "890ab105-8973-4be7-be1f-670a816d6b79" }, "source": [ "class Emotic_PreDataset(Dataset):\n", " ''' Custom Emotic dataset class. Use preprocessed data stored in npy files. '''\n", " def __init__(self, x_context, x_body, y_cat, y_cont, transform, context_norm, body_norm):\n", " super(Emotic_PreDataset,self).__init__()\n", " self.x_context = x_context\n", " self.x_body = x_body\n", " self.y_cat = y_cat \n", " self.y_cont = y_cont\n", " self.transform = transform \n", " self.context_norm = transforms.Normalize(context_norm[0], context_norm[1]) # Normalizing the context image with context mean and context std\n", " self.body_norm = transforms.Normalize(body_norm[0], body_norm[1]) # Normalizing the body image with body mean and body std\n", "\n", " def __len__(self):\n", " return len(self.y_cat)\n", " \n", " def __getitem__(self, index):\n", " image_context = self.x_context[index]\n", " image_body = self.x_body[index]\n", " cat_label = self.y_cat[index]\n", " cont_label = self.y_cont[index]\n", " return self.context_norm(self.transform(image_context)), self.body_norm(self.transform(image_body)), torch.tensor(cat_label, dtype=torch.float32), torch.tensor(cont_label, dtype=torch.float32)/10.0\n", "\n", "print ('completed cell')" ], "execution_count": 6, "outputs": [ { "output_type": "stream", "text": [ "completed cell\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "JFuEQruAxQrK" }, "source": [ "## Emotic Losses" ] }, { "cell_type": "code", "metadata": { "id": "ObffJVXkqsJg", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "9665ef7f-44a7-4ddf-db6f-a4a0e6430061" }, "source": [ "class DiscreteLoss(nn.Module):\n", " ''' Class to measure loss between categorical emotion predictions and labels.'''\n", " def __init__(self, weight_type='mean', device=torch.device('cpu')):\n", " super(DiscreteLoss, self).__init__()\n", " self.weight_type = weight_type\n", " self.device = device\n", " if self.weight_type == 'mean':\n", " self.weights = torch.ones((1,26))/26.0\n", " self.weights = self.weights.to(self.device)\n", " elif self.weight_type == 'static':\n", " self.weights = torch.FloatTensor([0.1435, 0.1870, 0.1692, 0.1165, 0.1949, 0.1204, 0.1728, 0.1372, 0.1620,\n", " 0.1540, 0.1987, 0.1057, 0.1482, 0.1192, 0.1590, 0.1929, 0.1158, 0.1907,\n", " 0.1345, 0.1307, 0.1665, 0.1698, 0.1797, 0.1657, 0.1520, 0.1537]).unsqueeze(0)\n", " self.weights = self.weights.to(self.device)\n", " \n", " def forward(self, pred, target):\n", " if self.weight_type == 'dynamic':\n", " self.weights = self.prepare_dynamic_weights(target)\n", " self.weights = self.weights.to(self.device)\n", " loss = (((pred - target)**2) * self.weights)\n", " return loss.sum() \n", "\n", " def prepare_dynamic_weights(self, target):\n", " target_stats = torch.sum(target, dim=0).float().unsqueeze(dim=0).cpu()\n", " weights = torch.zeros((1,26))\n", " weights[target_stats != 0 ] = 1.0/torch.log(target_stats[target_stats != 0].data + 1.2)\n", " weights[target_stats == 0] = 0.0001\n", " return weights\n", "\n", "\n", "class ContinuousLoss_L2(nn.Module):\n", " ''' Class to measure loss between continuous emotion dimension predictions and labels. Using l2 loss as base. '''\n", " def __init__(self, margin=1):\n", " super(ContinuousLoss_L2, self).__init__()\n", " self.margin = margin\n", " \n", " def forward(self, pred, target):\n", " labs = torch.abs(pred - target)\n", " loss = labs ** 2 \n", " loss[ (labs < self.margin) ] = 0.0\n", " return loss.sum()\n", "\n", "\n", "class ContinuousLoss_SL1(nn.Module):\n", " ''' Class to measure loss between continuous emotion dimension predictions and labels. Using smooth l1 loss as base. '''\n", " def __init__(self, margin=1):\n", " super(ContinuousLoss_SL1, self).__init__()\n", " self.margin = margin\n", " \n", " def forward(self, pred, target):\n", " labs = torch.abs(pred - target)\n", " loss = 0.5 * (labs ** 2)\n", " loss[ (labs > self.margin) ] = labs[ (labs > self.margin) ] - 0.5\n", " return loss.sum()\n", "\n", "print ('completed cell')" ], "execution_count": 7, "outputs": [ { "output_type": "stream", "text": [ "completed cell\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "-AMUYcy5h9cM" }, "source": [ "# IV. Load preprocessed data" ] }, { "cell_type": "code", "metadata": { "id": "VSadne_Bc5va", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "cea63663-6140-4666-8a80-3e69434b92d6" }, "source": [ "# Change data_src variable as per your drive\n", "data_src = '/content/drive/My Drive/Colab/Emotic/data'\n", "\n", "\n", "# Load training preprocessed data\n", "train_context = np.load(os.path.join(data_src,'pre','train_context_arr.npy'))\n", "train_body = np.load(os.path.join(data_src,'pre','train_body_arr.npy'))\n", "train_cat = np.load(os.path.join(data_src,'pre','train_cat_arr.npy'))\n", "train_cont = np.load(os.path.join(data_src,'pre','train_cont_arr.npy'))\n", "\n", "# Load validation preprocessed data \n", "val_context = np.load(os.path.join(data_src,'pre','val_context_arr.npy'))\n", "val_body = np.load(os.path.join(data_src,'pre','val_body_arr.npy'))\n", "val_cat = np.load(os.path.join(data_src,'pre','val_cat_arr.npy'))\n", "val_cont = np.load(os.path.join(data_src,'pre','val_cont_arr.npy'))\n", "\n", "# Load testing preprocessed data\n", "test_context = np.load(os.path.join(data_src,'pre','test_context_arr.npy'))\n", "test_body = np.load(os.path.join(data_src,'pre','test_body_arr.npy'))\n", "test_cat = np.load(os.path.join(data_src,'pre','test_cat_arr.npy'))\n", "test_cont = np.load(os.path.join(data_src,'pre','test_cont_arr.npy'))\n", "\n", "# Categorical emotion classes\n", "cat = ['Affection', 'Anger', 'Annoyance', 'Anticipation', 'Aversion', 'Confidence', 'Disapproval', 'Disconnection',\n", " 'Disquietment', 'Doubt/Confusion', 'Embarrassment', 'Engagement', 'Esteem', 'Excitement', 'Fatigue', 'Fear',\n", " 'Happiness', 'Pain', 'Peace', 'Pleasure', 'Sadness', 'Sensitivity', 'Suffering', 'Surprise', 'Sympathy', 'Yearning']\n", "\n", "cat2ind = {}\n", "ind2cat = {}\n", "for idx, emotion in enumerate(cat):\n", " cat2ind[emotion] = idx\n", " ind2cat[idx] = emotion\n", "\n", "print ('train ', 'context ', train_context.shape, 'body', train_body.shape, 'cat ', train_cat.shape, 'cont', train_cont.shape)\n", "print ('val ', 'context ', val_context.shape, 'body', val_body.shape, 'cat ', val_cat.shape, 'cont', val_cont.shape)\n", "print ('test ', 'context ', test_context.shape, 'body', test_body.shape, 'cat ', test_cat.shape, 'cont', test_cont.shape)\n", "print ('completed cell')" ], "execution_count": 8, "outputs": [ { "output_type": "stream", "text": [ "train context (23266, 224, 224, 3) body (23266, 128, 128, 3) cat (23266, 26) cont (23266, 3)\n", "val context (3315, 224, 224, 3) body (3315, 128, 128, 3) cat (3315, 26) cont (3315, 3)\n", "test context (7203, 224, 224, 3) body (7203, 128, 128, 3) cat (7203, 26) cont (7203, 3)\n", "completed cell\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "JySFyUFZNgPy", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "84ba41f4-7fee-466e-b1b1-ff2fce976395" }, "source": [ "batch_size = 26\n", "\n", "context_mean = [0.4690646, 0.4407227, 0.40508908]\n", "context_std = [0.2514227, 0.24312855, 0.24266963]\n", "body_mean = [0.43832874, 0.3964344, 0.3706214]\n", "body_std = [0.24784276, 0.23621225, 0.2323653]\n", "context_norm = [context_mean, context_std]\n", "body_norm = [body_mean, body_std]\n", "\n", "\n", "train_transform = transforms.Compose([transforms.ToPILImage(), \n", " transforms.RandomHorizontalFlip(), \n", " transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4), \n", " transforms.ToTensor()])\n", "test_transform = transforms.Compose([transforms.ToPILImage(), \n", " transforms.ToTensor()])\n", "\n", "train_dataset = Emotic_PreDataset(train_context, train_body, train_cat, train_cont, \\\n", " train_transform, context_norm, body_norm)\n", "val_dataset = Emotic_PreDataset(val_context, val_body, val_cat, val_cont, \\\n", " test_transform, context_norm, body_norm)\n", "test_dataset = Emotic_PreDataset(test_context, test_body, test_cat, test_cont, \\\n", " test_transform, context_norm, body_norm)\n", "\n", "train_loader = DataLoader(train_dataset, batch_size, shuffle=True, drop_last=True)\n", "val_loader = DataLoader(val_dataset, batch_size, shuffle=False)\n", "test_loader = DataLoader(test_dataset, batch_size, shuffle=False) \n", "\n", "print ('train loader ', len(train_loader), 'val loader ', len(val_loader), 'test', len(test_loader))\n", "print ('completed cell')" ], "execution_count": 9, "outputs": [ { "output_type": "stream", "text": [ "train loader 894 val loader 128 test 278\n", "completed cell\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "wvPoFnAliZBC" }, "source": [ "# V. Prepare emotic model" ] }, { "cell_type": "code", "metadata": { "id": "cMSaPqJyVyEW", "colab": { "base_uri": "https://localhost:8080/", "height": 83, "referenced_widgets": [ "a8cada3fef3846b2bffe52edacbc190d", "ae34432e333e4671b3f7f934de91027b", "674e1fd300d042cbaf9f0e53e7ac4ecd", "88b121247db64a3490f8c1b16f68c696", "57ffb85897da4061b318afddca2eed81", "9cb235dbbe234dbe805b4aa00f7d54ae", "5e048b1fa84146c8bd2b63a19239cb9e", "e6a413c0b59f466b9213c1904b1f57f8", "e0d0abfa1e9441f58722b064823c8119", "a1bbd4436c154378839f58483fa8c261", "4f6e592ca3f34209af0ae78a635fc346" ] }, "outputId": "b1b68154-bcfc-438a-c711-31b84177d56c" }, "source": [ "model_path_places = './places'\n", "\n", "model_context = models.__dict__[arch](num_classes=365)\n", "context_state_dict = torch.load(os.path.join(model_path_places, 'resnet18_state_dict.pth'))\n", "model_context.load_state_dict(context_state_dict)\n", "\n", "model_body = models.resnet18(pretrained=True)\n", "\n", "emotic_model = Emotic(list(model_context.children())[-1].in_features, list(model_body.children())[-1].in_features)\n", "model_context = nn.Sequential(*(list(model_context.children())[:-1]))\n", "model_body = nn.Sequential(*(list(model_body.children())[:-1]))\n", "\n", "\n", "# print (summary(model_context, (3,224,224), device=\"cpu\"))\n", "# print (summary(model_body, (3,128,128), device=\"cpu\"))\n", "\n", "print ('completed cell')" ], "execution_count": 10, "outputs": [ { "output_type": "stream", "text": [ "Downloading: \"https://download.pytorch.org/models/resnet18-f37072fd.pth\" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth\n" ], "name": "stderr" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a8cada3fef3846b2bffe52edacbc190d", "version_minor": 0, "version_major": 2 }, "text/plain": [ " 0%| | 0.00/44.7M [00:00" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "markdown", "metadata": { "id": "cDa4nuQvjGSa" }, "source": [ "# VII. Test model" ] }, { "cell_type": "code", "metadata": { "id": "AFCcFv4mnmRi", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "bf220347-2681-4466-dc70-060c0291b5cc" }, "source": [ "def test_scikit_ap(cat_preds, cat_labels):\n", " ap = np.zeros(26, dtype=np.float32)\n", " for i in range(26):\n", " ap[i] = average_precision_score(cat_labels[i, :], cat_preds[i, :])\n", " print ('ap', ap, ap.shape, ap.mean())\n", " return ap.mean()\n", "\n", "\n", "def test_emotic_vad(cont_preds, cont_labels):\n", " vad = np.zeros(3, dtype=np.float32)\n", " for i in range(3):\n", " vad[i] = np.mean(np.abs(cont_preds[i, :] - cont_labels[i, :]))\n", " print ('vad', vad, vad.shape, vad.mean())\n", " return vad.mean()\n", "\n", "\n", "def get_thresholds(cat_preds, cat_labels):\n", " thresholds = np.zeros(26, dtype=np.float32)\n", " for i in range(26):\n", " p, r, t = precision_recall_curve(cat_labels[i, :], cat_preds[i, :])\n", " for k in range(len(p)):\n", " if p[k] == r[k]:\n", " thresholds[i] = t[k]\n", " break\n", " np.save('./thresholds.npy', thresholds)\n", " return thresholds\n", "\n", "print ('completed cell')" ], "execution_count": 14, "outputs": [ { "output_type": "stream", "text": [ "completed cell\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "KOeZRVdbUPNx", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "e20ad71b-9d42-47f5-cda8-0bd08abb27c4" }, "source": [ "def test_data(models, device, data_loader, num_images):\n", " model_context, model_body, emotic_model = models\n", " cat_preds = np.zeros((num_images, 26))\n", " cat_labels = np.zeros((num_images, 26))\n", " cont_preds = np.zeros((num_images, 3))\n", " cont_labels = np.zeros((num_images, 3))\n", "\n", " with torch.no_grad():\n", " model_context.to(device)\n", " model_body.to(device)\n", " emotic_model.to(device)\n", " model_context.eval()\n", " model_body.eval()\n", " emotic_model.eval()\n", " indx = 0\n", " print ('starting testing')\n", " for images_context, images_body, labels_cat, labels_cont in iter(data_loader):\n", " images_context = images_context.to(device)\n", " images_body = images_body.to(device)\n", "\n", " pred_context = model_context(images_context)\n", " pred_body = model_body(images_body)\n", " pred_cat, pred_cont = emotic_model(pred_context, pred_body)\n", "\n", " cat_preds[ indx : (indx + pred_cat.shape[0]), :] = pred_cat.to(\"cpu\").data.numpy()\n", " cat_labels[ indx : (indx + labels_cat.shape[0]), :] = labels_cat.to(\"cpu\").data.numpy()\n", " cont_preds[ indx : (indx + pred_cont.shape[0]), :] = pred_cont.to(\"cpu\").data.numpy() * 10\n", " cont_labels[ indx : (indx + labels_cont.shape[0]), :] = labels_cont.to(\"cpu\").data.numpy() * 10 \n", " indx = indx + pred_cat.shape[0]\n", "\n", " cat_preds = cat_preds.transpose()\n", " cat_labels = cat_labels.transpose()\n", " cont_preds = cont_preds.transpose()\n", " cont_labels = cont_labels.transpose()\n", " scipy.io.savemat('./cat_preds.mat',mdict={'cat_preds':cat_preds})\n", " scipy.io.savemat('./cat_labels.mat',mdict={'cat_labels':cat_labels})\n", " scipy.io.savemat('./cont_preds.mat',mdict={'cont_preds':cont_preds})\n", " scipy.io.savemat('./cont_labels.mat',mdict={'cont_labels':cont_labels})\n", " print ('completed testing')\n", " ap_mean = test_scikit_ap(cat_preds, cat_labels)\n", " vad_mean = test_emotic_vad(cont_preds, cont_labels)\n", " print (ap_mean, vad_mean)\n", " return ap_mean, vad_mean \n", "\n", "print ('completed cell')" ], "execution_count": 15, "outputs": [ { "output_type": "stream", "text": [ "completed cell\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qIUQLrXBZ2RR", "outputId": "c958d8ba-6e32-438f-c5c5-816d9b9ed829" }, "source": [ "model_context = torch.load('./models/model_context1.pth')\n", "model_body = torch.load('./models/model_body1.pth')\n", "emotic_model = torch.load('./models/model_emotic1.pth')\n", "\n", "print ('completed cell')" ], "execution_count": 16, "outputs": [ { "output_type": "stream", "text": [ "completed cell\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "oB69Xo-kLldG", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "b6be064a-25b2-43d3-e7e7-51a7fc8a9304" }, "source": [ "val_ap, val_vad = test_data([model_context, model_body, emotic_model], device, val_loader, val_dataset.__len__())\n", "test_ap, test_vad = test_data([model_context, model_body, emotic_model], device, test_loader, test_dataset.__len__())\n", "\n", "print ('validation Mean average precision=%.4f Mean VAD MAE=%.4f' %(val_ap, val_vad))\n", "print ('testing Mean average precision=%.4f Mean VAD MAE=%.4f' %(test_ap, test_vad))" ], "execution_count": 17, "outputs": [ { "output_type": "stream", "text": [ "starting testing\n", "completed testing\n", "ap [0.3983917 0.18015468 0.22337271 0.95204633 0.17163357 0.7866947\n", " 0.23361506 0.37178904 0.19096893 0.20868655 0.06009851 0.98069084\n", " 0.26645675 0.7951143 0.13405906 0.08186857 0.8081806 0.16670538\n", " 0.29040682 0.49211633 0.20419936 0.08260126 0.18704712 0.14419095\n", " 0.3501988 0.11717057] (26,) 0.34147915\n", "vad [0.70697206 0.8584789 0.86687875] (3,) 0.81077653\n", "0.34147915 0.81077653\n", "starting testing\n", "completed testing\n", "ap [0.29003292 0.08763415 0.14132965 0.56043494 0.07053518 0.75399864\n", " 0.11882206 0.2385993 0.16040386 0.173684 0.01993784 0.86009395\n", " 0.15641297 0.69662005 0.09915597 0.06025878 0.66563565 0.06506737\n", " 0.21911173 0.4214436 0.17897978 0.05904196 0.1752331 0.08228464\n", " 0.13343503 0.0820521 ] (26,) 0.2527015\n", "vad [0.8996919 1.0314642 0.943558 ] (3,) 0.95823807\n", "0.2527015 0.95823807\n", "validation Mean average precision=0.3415 Mean VAD MAE=0.8108\n", "testing Mean average precision=0.2527 Mean VAD MAE=0.9582\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "T-fc5LNp4len", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "8de112fa-a4bd-43c0-ff44-895b1ae32fe1" }, "source": [ "cat_labels = scipy.io.loadmat('./cat_labels.mat')\n", "cat_preds = scipy.io.loadmat('./cat_preds.mat')\n", "cat_preds = cat_preds['cat_preds']\n", "cat_labels = cat_labels['cat_labels']\n", "print (cat_preds.shape, cat_labels.shape)\n", "\n", "#thesholds calculation for inference \n", "thresholds = get_thresholds(cat_preds, cat_labels)\n", "print (thresholds, thresholds.shape)\n", "\n", "print ('completed cell')" ], "execution_count": 18, "outputs": [ { "output_type": "stream", "text": [ "(26, 7203) (26, 7203)\n", "[0.11334415 0.32935348 0.17811956 0.1820814 0.24816841 0.13238849\n", " 0.23765785 0.10895684 0.07811652 0.07971309 0.14207679 0.47783324\n", " 0.08085962 0.14741261 0.12622227 0.12906708 0.22126663 0.2721243\n", " 0.10970519 0.10124312 0.18777776 0.14807722 0.2636854 0.09791826\n", " 0.0983988 0.0875175 ] (26,)\n", "completed cell\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "owTpkHmOjLvr" }, "source": [ "# VIII. Average Precision computation using author's script" ] }, { "cell_type": "code", "metadata": { "id": "30PEDPHxrkXA", "colab": { "base_uri": "https://localhost:8080/", "height": 101 }, "outputId": "8d2ed78c-fadb-40fc-8f11-be409beb8ea0" }, "source": [ "!apt install octave" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Reading package lists... Done\n", "Building dependency tree \n", "Reading state information... Done\n", "octave is already the newest version (4.2.2-1ubuntu1).\n", "0 upgraded, 0 newly installed, 0 to remove and 31 not upgraded.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "6fWR4CTMr7Hf", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "outputId": "b7539f27-3a07-4184-f67f-d4b3d84350f7" }, "source": [ "%%writefile eval.m\n", "\n", "gt = load('./cat_labels.mat')\n", "gt = gt.cat_labels\n", "\n", "pred = load('./cat_preds.mat')\n", "pred = pred.cat_preds\n", "\n", "categories{1} = 'Affection';\n", "categories{2} = 'Anger';\n", "categories{3} = 'Annoyance';\n", "categories{4} = 'Anticipation';\n", "categories{5} = 'Aversion';\n", "categories{6} = 'Confidence';\n", "categories{7} = 'Disapproval';\n", "categories{8} = 'Disconnection';\n", "categories{9} = 'Disquietment';\n", "categories{10} = 'Doubt/Confusion';\n", "categories{11} = 'Embarrassment';\n", "categories{12} = 'Engagement';\n", "categories{13} = 'Esteem';\n", "categories{14} = 'Excitement';\n", "categories{15} = 'Fatigue';\n", "categories{16} = 'Fear';\n", "categories{17} = 'Happiness';\n", "categories{18} = 'Pain';\n", "categories{19} = 'Peace';\n", "categories{20} = 'Pleasure';\n", "categories{21} = 'Sadness';\n", "categories{22} = 'Sensitivity';\n", "categories{23} = 'Suffering';\n", "categories{24} = 'Surprise';\n", "categories{25} = 'Sympathy';\n", "categories{26} = 'Yearning';\n", "\n", "\n", "for c = 1:length(categories)\n", " confidence = pred(c,:)'; \n", " testClass = gt(c,:)';\n", " confidence = double(confidence);\n", "\n", " S = rand('state');\n", " rand('state',0);\n", " confidence = confidence + rand(size(confidence))*10^(-10);\n", " rand('state',S)\n", "\n", " [S,j] = sort(-confidence);\n", " C = testClass(j);\n", " n = length(C);\n", " \n", " REL = sum(C);\n", " if n>0\n", " RETREL = cumsum(C);\n", " RET = (1:n)';\n", " else\n", " RETREL = 0;\n", " RET = 1;\n", " end\n", "\n", " precision = 100*RETREL ./ RET;\n", " recall = 100*RETREL / REL;\n", " th = -S;\n", "\n", " % compute AP\n", " mrec=[0 ; recall ; 100];\n", " mpre=[0 ; precision ; 0];\n", " for i=numel(mpre)-1:-1:1\n", " mpre(i)=max(mpre(i),mpre(i+1));\n", " end\n", " i=find(mrec(2:end)~=mrec(1:end-1))+1;\n", " averagePrecision=sum((mrec(i)-mrec(i-1)).*mpre(i))/100;\n", " ap_list(c) = averagePrecision\n", "end\n", "\n", "display('#######################################')\n", "\n", "display('Average precision of predictions');\n", "for c = 1:length(categories)\n", " sp = '............................';\n", " cat = strcat(categories{c}, sp);\n", " cat = cat(1:18);\n", " display(cat);\n", " display(ap_list(c));\n", "end" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Overwriting eval.m\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "fA1Oc48zvI_l" }, "source": [ "!octave -W eval.m" ], "execution_count": null, "outputs": [] } ] } ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2020 Abhishek Tandon Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Emotic Humans use their facial features or expressions to convey how they feel, such as a person may smile when happy and scowl when angry. Historically, computer vision research has focussed on analyzing and learning these facial features to recognize emotions. However, these facial features are not universal and vary extensively across cultures and situations. ###### Fig 1: a) (Facial feature) The person looks angry or in pain b) (Whole scene) The person looks elated. A scene context, as shown in the figure above, can provide additional information about the situations. This project explores the use of context in recognizing emotions in images. ## Pipeline The project uses the EMOTIC dataset and follows the methodology as introduced in the paper *['Context based emotion recognition using EMOTIC dataset'](https://arxiv.org/pdf/2003.13401.pdf)*. ![Pipeline](https://raw.githubusercontent.com/Tandon-A/emotic/master/assets/pipeline%20model.jpg "Model Pipeline") ###### Fig 2: Model Pipeline ([Image source](https://arxiv.org/pdf/2003.13401.pdf)) Two feature extraction modules first extract features over an image. These features are then used by a third module to predict the continuous dimensions (valence, arousal and dominance) and the discrete emotion categories. ## Emotic Dataset The Emotic dataset can be used only for **non-commercial research and education purposes**. Please, fill out the following form to request access to the dataset and the corresponding annotations. [Access Request for EMOTIC](https://forms.gle/wvhComeDHwQPD6TE6) ## Usage Download the Emotic dataset & annotations, and prepare the directory following the below structure: ``` ├── ... │ ├── emotic │ | ├── ade20k │ | ├── emodb_small │ | ├── framesdb │ | ├── mscoco │ ├── Annotations │ | ├── Annotations.mat ``` 1. To convert annotations from mat object to csv files and preprocess the data: ``` > python mat2py.py --data_dir proj/data/emotic19 --generate_npy ``` * data_dir: Path of the directory containing the emotic and annotations folder as described in the above data directory structure. * generate_npy: Argument to specify to generate npy files (later used for training and testing) along with CSV files. If not passed only CSV files are generated. 2. To train the model: ``` > python main.py --mode train --data_path proj/data/emotic_pre --experiment_path proj/debug_exp ``` * mode: Mode to run the main file. * data_path: Path of the directory which contains the preprocessed data and CSV files generated in the first step. * experiment_path: Path of the experiment directory. The directory will save the results, models and logs. 3. To test the model: ``` > python main.py --mode test --data_path proj/data/emotic_pre --experiment_path proj/debug_exp ``` * mode: Mode to run the main file. * data_path: Path of the directory which contains the preprocessed data and CSV files generated in the first step. * experiment_path: Path of the experiment directory. Models stored in the the directory are used for testing. 4. To perform inference: ``` > python main.py --mode inference --inference_file proj/debug_exp/inference_file.txt --experiment_path proj/debug_exp ``` * mode: Mode to run the main file. * inference_file: Text file specifying images to perform inference. A row is: 'full_path_of_image x1 y1 x2 y2', where (x1,y1) and (x2,y2) specify the bounding box. Refer [sample_inference_list.txt](https://github.com/Tandon-A/emotic/blob/master/sample_inference_list.txt). * experiment_path: Path of the experiment directory. Models stored in the the directory are used for inference. You can also train and test models on Emotic dataset by using the [Colab_train_emotic notebook](https://github.com/Tandon-A/emotic/blob/master/Colab_train_emotic.ipynb). [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Tandon-A/emotic/blob/master/Colab_train_emotic.ipynb) The **trained models and thresholds** to use for inference purposes are availble [here](https://drive.google.com/drive/folders/1e-JLA7V73CQD5pjTFCSWnKCmB0gCpV1D?usp=sharing). ## Results ![Result GIF 1](https://github.com/Tandon-A/emotic/blob/master/assets/eld11_gif2.gif "Result GIF 1") ## Acknowledgements * [Places365-CNN](https://github.com/CSAILVision/places365) * [Pytorch-Yolo](https://github.com/eriklindernoren/PyTorch-YOLOv3) ### Context Based Emotion Recognition using Emotic Dataset _Ronak Kosti, Jose Alvarez, Adria Recasens, Agata Lapedriza_
[[Paper]](https://arxiv.org/pdf/2003.13401.pdf) [[Project Webpage]](http://sunai.uoc.edu/emotic/) [[Authors' Implementation]](https://github.com/rkosti/emotic) ``` @article{kosti2020context, title={Context based emotion recognition using emotic dataset}, author={Kosti, Ronak and Alvarez, Jose M and Recasens, Adria and Lapedriza, Agata}, journal={arXiv preprint arXiv:2003.13401}, year={2020} } ``` ## Author [Abhishek Tandon](https://github.com/Tandon-A) ================================================ FILE: emotic.py ================================================ import torch import torch.nn as nn class Emotic(nn.Module): ''' Emotic Model''' def __init__(self, num_context_features, num_body_features): super(Emotic,self).__init__() self.num_context_features = num_context_features self.num_body_features = num_body_features self.fc1 = nn.Linear((self.num_context_features + num_body_features), 256) self.bn1 = nn.BatchNorm1d(256) self.d1 = nn.Dropout(p=0.5) self.fc_cat = nn.Linear(256, 26) self.fc_cont = nn.Linear(256, 3) self.relu = nn.ReLU() def forward(self, x_context, x_body): context_features = x_context.view(-1, self.num_context_features) body_features = x_body.view(-1, self.num_body_features) fuse_features = torch.cat((context_features, body_features), 1) fuse_out = self.fc1(fuse_features) fuse_out = self.bn1(fuse_out) fuse_out = self.relu(fuse_out) fuse_out = self.d1(fuse_out) cat_out = self.fc_cat(fuse_out) cont_out = self.fc_cont(fuse_out) return cat_out, cont_out ================================================ FILE: emotic_dataset.py ================================================ import ast import numpy as np import os from PIL import Image import torch from torch.utils.data import Dataset from torchvision import transforms class Emotic_PreDataset(Dataset): ''' Custom Emotic dataset class. Use preprocessed data stored in npy files. ''' def __init__(self, x_context, x_body, y_cat, y_cont, transform, context_norm, body_norm): super(Emotic_PreDataset,self).__init__() self.x_context = x_context self.x_body = x_body self.y_cat = y_cat self.y_cont = y_cont self.transform = transform self.context_norm = transforms.Normalize(context_norm[0], context_norm[1]) # Normalizing the context image with context mean and context std self.body_norm = transforms.Normalize(body_norm[0], body_norm[1]) # Normalizing the body image with body mean and body std def __len__(self): return len(self.y_cat) def __getitem__(self, index): image_context = self.x_context[index] image_body = self.x_body[index] cat_label = self.y_cat[index] cont_label = self.y_cont[index] return self.context_norm(self.transform(image_context)), self.body_norm(self.transform(image_body)), torch.tensor(cat_label, dtype=torch.float32), torch.tensor(cont_label, dtype=torch.float32)/10.0 class Emotic_CSVDataset(Dataset): ''' Custom Emotic dataset class. Use csv files and generated data at runtime. ''' def __init__(self, data_df, cat2ind, transform, context_norm, body_norm, data_src = './'): super(Emotic_CSVDataset,self).__init__() self.data_df = data_df self.data_src = data_src self.transform = transform self.cat2ind = cat2ind self.context_norm = transforms.Normalize(context_norm[0], context_norm[1]) # Normalizing the context image with context mean and context std self.body_norm = transforms.Normalize(body_norm[0], body_norm[1]) # Normalizing the body image with body mean and body std def __len__(self): return len(self.data_df) def __getitem__(self, index): row = self.data_df.loc[index] image_context = Image.open(os.path.join(self.data_src, row['Folder'], row['Filename'])) bbox = ast.literal_eval(row['BBox']) image_body = image_context.crop((bbox[0], bbox[1], bbox[2], bbox[3])) image_context = image_context.resize((224, 224)) image_body = image_body.resize((128, 128)) cat_labels = ast.literal_eval(row['Categorical_Labels']) cont_labels = ast.literal_eval(row['Continuous_Labels']) one_hot_cat_labels = self.cat_to_one_hot(cat_labels) return self.context_norm(self.transform(image_context)), self.body_norm(self.transform(image_body)), torch.tensor(one_hot_cat_labels, dtype=torch.float32), torch.tensor(cont_labels, dtype=torch.float32)/10.0 def cat_to_one_hot(self, cat): one_hot_cat = np.zeros(26) for em in cat: one_hot_cat[self.cat2ind[em]] = 1 return one_hot_cat ================================================ FILE: inference.py ================================================ import cv2 import numpy as np import os import torch from torchvision import transforms from emotic import Emotic def process_images(context_norm, body_norm, image_context_path=None, image_context=None, image_body=None, bbox=None): ''' Prepare context and body image. :param context_norm: List containing mean and std values for context images. :param body_norm: List containing mean and std values for body images. :param image_context_path: Path of the context image. :param image_context: Numpy array of the context image. :param image_body: Numpy array of the body image. :param bbox: List to specify the bounding box to generate the body image. bbox = [x1, y1, x2, y2]. :return: Transformed image_context tensor and image_body tensor. ''' if image_context is None and image_context_path is None: raise ValueError('both image_context and image_context_path cannot be none. Please specify one of the two.') if image_body is None and bbox is None: raise ValueError('both body image and bounding box cannot be none. Please specify one of the two') if image_context_path is not None: image_context = cv2.cvtColor(cv2.imread(image_context_path), cv2.COLOR_BGR2RGB) if bbox is not None: image_body = image_context[bbox[1]:bbox[3],bbox[0]:bbox[2]].copy() image_context = cv2.resize(image_context, (224,224)) image_body = cv2.resize(image_body, (128,128)) test_transform = transforms.Compose([transforms.ToPILImage(),transforms.ToTensor()]) context_norm = transforms.Normalize(context_norm[0], context_norm[1]) body_norm = transforms.Normalize(body_norm[0], body_norm[1]) image_context = context_norm(test_transform(image_context)).unsqueeze(0) image_body = body_norm(test_transform(image_body)).unsqueeze(0) return image_context, image_body def infer(context_norm, body_norm, ind2cat, ind2vad, device, thresholds, models, image_context_path=None, image_context=None, image_body=None, bbox=None, to_print=True): ''' Perform inference over an image. :param context_norm: List containing mean and std values for context images. :param body_norm: List containing mean and std values for body images. :param ind2cat: Dictionary converting integer index to categorical emotion. :param ind2vad: Dictionary converting integer index to continuous emotion dimension (Valence, Arousal and Dominance). :param device: Torch device. Used to send tensors to GPU if available. :param image_context_path: Path of the context image. :param image_context: Numpy array of the context image. :param image_body: Numpy array of the body image. :param bbox: List to specify the bounding box to generate the body image. bbox = [x1, y1, x2, y2]. :param to_print: Variable to display inference results. :return: Categorical Emotions list and continuous emotion dimensions numpy array. ''' image_context, image_body = process_images(context_norm, body_norm, image_context_path=image_context_path, image_context=image_context, image_body=image_body, bbox=bbox) model_context, model_body, emotic_model = models with torch.no_grad(): image_context = image_context.to(device) image_body = image_body.to(device) pred_context = model_context(image_context) pred_body = model_body(image_body) pred_cat, pred_cont = emotic_model(pred_context, pred_body) pred_cat = pred_cat.squeeze(0) pred_cont = pred_cont.squeeze(0).to("cpu").data.numpy() bool_cat_pred = torch.gt(pred_cat, thresholds) cat_emotions = list() for i in range(len(bool_cat_pred)): if bool_cat_pred[i] == True: cat_emotions.append(ind2cat[i]) if to_print == True: print ('\n Image predictions') print ('Continuous Dimnesions Predictions') for i in range(len(pred_cont)): print ('Continuous %10s %.5f' %(ind2vad[i], 10*pred_cont[i])) print ('Categorical Emotion Predictions') for emotion in cat_emotions: print ('Categorical %16s' %(emotion)) return cat_emotions, 10*pred_cont def inference_emotic(images_list, model_path, result_path, context_norm, body_norm, ind2cat, ind2vad, args): ''' Infer on list of images defined in a text file. Save the results in inference_file.txt in the directory specified by the result_path. :param images_list: Text file specifying the images and their bounding box values to conduct inference. A row in the file is Path_of_image x1 y1 x2 y2. :param model_path: Directory path to load models and val_thresholds to perform inference. :param result_path: Directory path to save the results (text file containig categorical emotion and continuous emotion dimension prediction per image). :param context_norm: List containing mean and std values for context images. :param body_norm: List containing mean and std values for body images. :param ind2cat: Dictionary converting integer index to categorical emotion. :param ind2vad: Dictionary converting integer index to continuous emotion dimension (Valence, Arousal and Dominance). :param args: Runtime arguments. ''' with open(images_list, 'r') as f: lines = f.readlines() device = torch.device("cuda:%s" %(str(args.gpu)) if torch.cuda.is_available() else "cpu") thresholds = torch.FloatTensor(np.load(os.path.join(result_path, 'val_thresholds.npy'))).to(device) model_context = torch.load(os.path.join(model_path,'model_context1.pth')).to(device) model_body = torch.load(os.path.join(model_path,'model_body1.pth')).to(device) emotic_model = torch.load(os.path.join(model_path,'model_emotic1.pth')).to(device) model_context.eval() model_body.eval() emotic_model.eval() models = [model_context, model_body, emotic_model] result_file = os.path.join(result_path, 'inference_list.txt') with open(result_file, 'w') as f: pass for idx, line in enumerate(lines): image_context_path, x1, y1, x2, y2 = line.split('\n')[0].split(' ') bbox = [int(x1), int(y1), int(x2), int(y2)] pred_cat, pred_cont = infer(context_norm, body_norm, ind2cat, ind2vad, device, thresholds, models, image_context_path=image_context_path, bbox=bbox) write_line = list() write_line.append(image_context_path) for emotion in pred_cat: write_line.append(emotion) for continuous in pred_cont: write_line.append(str('%.4f' %(continuous))) write_line = ' '.join(write_line) with open(result_file, 'a') as f: f.writelines(write_line) f.writelines('\n') ================================================ FILE: loss.py ================================================ import torch import torch.nn as nn class DiscreteLoss(nn.Module): ''' Class to measure loss between categorical emotion predictions and labels.''' def __init__(self, weight_type='mean', device=torch.device('cpu')): super(DiscreteLoss, self).__init__() self.weight_type = weight_type self.device = device if self.weight_type == 'mean': self.weights = torch.ones((1,26))/26.0 self.weights = self.weights.to(self.device) elif self.weight_type == 'static': self.weights = torch.FloatTensor([0.1435, 0.1870, 0.1692, 0.1165, 0.1949, 0.1204, 0.1728, 0.1372, 0.1620, 0.1540, 0.1987, 0.1057, 0.1482, 0.1192, 0.1590, 0.1929, 0.1158, 0.1907, 0.1345, 0.1307, 0.1665, 0.1698, 0.1797, 0.1657, 0.1520, 0.1537]).unsqueeze(0) self.weights = self.weights.to(self.device) def forward(self, pred, target): if self.weight_type == 'dynamic': self.weights = self.prepare_dynamic_weights(target) self.weights = self.weights.to(self.device) loss = (((pred - target)**2) * self.weights) return loss.sum() def prepare_dynamic_weights(self, target): target_stats = torch.sum(target, dim=0).float().unsqueeze(dim=0).cpu() weights = torch.zeros((1,26)) weights[target_stats != 0 ] = 1.0/torch.log(target_stats[target_stats != 0].data + 1.2) weights[target_stats == 0] = 0.0001 return weights class ContinuousLoss_L2(nn.Module): ''' Class to measure loss between continuous emotion dimension predictions and labels. Using l2 loss as base. ''' def __init__(self, margin=1): super(ContinuousLoss_L2, self).__init__() self.margin = margin def forward(self, pred, target): labs = torch.abs(pred - target) loss = labs ** 2 loss[ (labs < self.margin) ] = 0.0 return loss.sum() class ContinuousLoss_SL1(nn.Module): ''' Class to measure loss between continuous emotion dimension predictions and labels. Using smooth l1 loss as base. ''' def __init__(self, margin=1): super(ContinuousLoss_SL1, self).__init__() self.margin = margin def forward(self, pred, target): labs = torch.abs(pred - target) loss = 0.5 * (labs ** 2) loss[ (labs > self.margin) ] = labs[ (labs > self.margin) ] - 0.5 return loss.sum() if __name__ == '__main__': # Discrete Loss function test target = torch.zeros((2,26)) target[0, 0:13] = 1 target[1, 13:] = 2 target[:, 13] = 0 pred = torch.ones((2,26)) * 1 target = target.cuda() pred = pred.cuda() pred.requires_grad = True target.requires_grad = False disc_loss = DiscreteLoss('dynamic', torch.device("cuda:0")) loss = disc_loss(pred, target) print ('discrete loss class', loss, loss.shape, loss.dtype, loss.requires_grad) # loss = 37.1217 #Continuous Loss function test target = torch.ones((2,3)) target[0, :] = 0.9 target[1, :] = 0.2 target = target.cuda() pred = torch.ones((2,3)) pred[0, :] = 0.7 pred[1, :] = 0.25 pred = pred.cuda() pred.requires_grad = True target.requires_grad = False cont_loss_SL1 = ContinuousLoss_SL1() loss = cont_loss_SL1(pred*10, target * 10) print ('continuous SL1 loss class', loss, loss.shape, loss.dtype, loss.requires_grad) # loss = 4.8750 cont_loss_L2 = ContinuousLoss_L2() loss = cont_loss_L2(pred*10, target * 10) print ('continuous L2 loss class', loss, loss.shape, loss.dtype, loss.requires_grad) # loss = 12.0 ================================================ FILE: main.py ================================================ import argparse import os from emotic import Emotic from train import train_emotic from test import test_emotic from inference import inference_emotic def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0, help='gpu id') parser.add_argument('--mode', type=str, default='train_test', choices=['train', 'test', 'train_test', 'inference']) parser.add_argument('--data_path', type=str, help='Path to preprocessed data npy files/ csv files') parser.add_argument('--experiment_path', type=str, required=True, help='Path to save experiment files (results, models, logs)') parser.add_argument('--model_dir_name', type=str, default='models', help='Name of the directory to save models') parser.add_argument('--result_dir_name', type=str, default='results', help='Name of the directory to save results(predictions, labels mat files)') parser.add_argument('--log_dir_name', type=str, default='logs', help='Name of the directory to save logs (train, val)') parser.add_argument('--inference_file', type=str, help='Text file containing image context paths and bounding box') parser.add_argument('--context_model', type=str, default='resnet18', choices=['resnet18', 'resnet50'], help='context model type') parser.add_argument('--body_model', type=str, default='resnet18', choices=['resnet18', 'resnet50'], help='body model type') parser.add_argument('--learning_rate', type=float, default=0.01) parser.add_argument('--weight_decay', type=float, default=5e-4) parser.add_argument('--cat_loss_weight', type=float, default=0.5, help='weight for discrete loss') parser.add_argument('--cont_loss_weight', type=float, default=0.5, help='weight fot continuous loss') parser.add_argument('--continuous_loss_type', type=str, default='Smooth L1', choices=['L2', 'Smooth L1'], help='type of continuous loss') parser.add_argument('--discrete_loss_weight_type', type=str, default='dynamic', choices=['dynamic', 'mean', 'static'], help='weight policy for discrete loss') parser.add_argument('--epochs', type=int, default=15) parser.add_argument('--batch_size', type=int, default=52) # use batch size = double(categorical emotion classes) # Generate args args = parser.parse_args() return args def check_paths(args): ''' Check (create if they don't exist) experiment directories. :param args: Runtime arguments as passed by the user. :return: List containing result_dir_path, model_dir_path, train_log_dir_path, val_log_dir_path. ''' folders= [args.result_dir_name, args.model_dir_name] paths = list() for folder in folders: folder_path = os.path.join(args.experiment_path, folder) if not os.path.exists(folder_path): os.makedirs(folder_path) paths.append(folder_path) log_folders = ['train', 'val'] for folder in log_folders: folder_path = os.path.join(args.experiment_path, args.log_dir_name, folder) if not os.path.exists(folder_path): os.makedirs(folder_path) paths.append(folder_path) return paths if __name__ == '__main__': args = parse_args() print ('mode ', args.mode) result_path, model_path, train_log_path, val_log_path = check_paths(args) cat = ['Affection', 'Anger', 'Annoyance', 'Anticipation', 'Aversion', 'Confidence', 'Disapproval', 'Disconnection', \ 'Disquietment', 'Doubt/Confusion', 'Embarrassment', 'Engagement', 'Esteem', 'Excitement', 'Fatigue', 'Fear','Happiness', \ 'Pain', 'Peace', 'Pleasure', 'Sadness', 'Sensitivity', 'Suffering', 'Surprise', 'Sympathy', 'Yearning'] cat2ind = {} ind2cat = {} for idx, emotion in enumerate(cat): cat2ind[emotion] = idx ind2cat[idx] = emotion vad = ['Valence', 'Arousal', 'Dominance'] ind2vad = {} for idx, continuous in enumerate(vad): ind2vad[idx] = continuous context_mean = [0.4690646, 0.4407227, 0.40508908] context_std = [0.2514227, 0.24312855, 0.24266963] body_mean = [0.43832874, 0.3964344, 0.3706214] body_std = [0.24784276, 0.23621225, 0.2323653] context_norm = [context_mean, context_std] body_norm = [body_mean, body_std] if args.mode == 'train': if args.data_path is None: raise ValueError('Data path not provided. Please pass a valid data path for training') with open(os.path.join(args.experiment_path, 'config.txt'), 'w') as f: print(args, file=f) train_emotic(result_path, model_path, train_log_path, val_log_path, ind2cat, ind2vad, context_norm, body_norm, args) elif args.mode == 'test': if args.data_path is None: raise ValueError('Data path not provided. Please pass a valid data path for testing') test_emotic(result_path, model_path, ind2cat, ind2vad, context_norm, body_norm, args) elif args.mode == 'train_test': if args.data_path is None: raise ValueError('Data path not provided. Please pass a valid data path for training and testing') with open(os.path.join(args.experiment_path, 'config.txt'), 'w') as f: print(args, file=f) train_emotic(result_path, model_path, train_log_path, val_log_path, ind2cat, ind2vad, context_norm, body_norm, args) test_emotic(result_path, model_path, ind2cat, ind2vad, context_norm, body_norm, args) elif args.mode == 'inference': if args.inference_file is None: raise ValueError('Inference file not provided. Please pass a valid inference file for inference') inference_emotic(args.inference_file, model_path, result_path, context_norm, body_norm, ind2cat, ind2vad, args) else: raise ValueError('Unknown mode') ================================================ FILE: mat2py.py ================================================ import argparse import csv import cv2 import numpy as np import os from scipy.io import loadmat class emotic_train: def __init__(self, filename, folder, image_size, person): self.filename = filename self.folder = folder self.im_size = [] self.bbox = [] self.cat = [] self.cont = [] self.gender = person[3][0] self.age = person[4][0] self.cat_annotators = 0 self.cont_annotators = 0 self.set_imsize(image_size) self.set_bbox(person[0]) self.set_cat(person[1]) self.set_cont(person[2]) self.check_cont() def set_imsize(self, image_size): image_size = np.array(image_size).flatten().tolist()[0] row = np.array(image_size[0]).flatten().tolist()[0] col = np.array(image_size[1]).flatten().tolist()[0] self.im_size.append(row) self.im_size.append(col) def validate_bbox(self, bbox): x1, y1, x2, y2 = bbox x1 = min(self.im_size[0], max(0, x1)) x2 = min(self.im_size[0], max(0, x2)) y1 = min(self.im_size[1], max(0, y1)) y2 = min(self.im_size[1], max(0, y2)) return [int(x1), int(y1), int(x2), int(y2)] def set_bbox(self, person_bbox): self.bbox = self.validate_bbox(np.array(person_bbox).flatten().tolist()) def set_cat(self, person_cat): cat = np.array(person_cat).flatten().tolist() cat = np.array(cat[0]).flatten().tolist() self.cat = [np.array(c).flatten().tolist()[0] for c in cat] self.cat_annotators = 1 def set_cont(self, person_cont): cont = np.array(person_cont).flatten().tolist()[0] self.cont = [np.array(c).flatten().tolist()[0] for c in cont] self.cont_annotators = 1 def check_cont(self): for c in self.cont: if np.isnan(c): self.cont_annotators = 0 break class emotic_test: def __init__(self, filename, folder, image_size, person): self.filename = filename self.folder = folder self.im_size = [] self.bbox = [] self.cat = [] self.cat_annotators = 0 self.comb_cat = [] self.cont_annotators = 0 self.cont = [] self.comb_cont = [] self.gender = person[5][0] self.age = person[6][0] self.set_imsize(image_size) self.set_bbox(person[0]) self.set_cat(person[1]) self.set_comb_cat(person[2]) self.set_cont(person[3]) self.set_comb_cont(person[4]) self.check_cont() def set_imsize(self, image_size): image_size = np.array(image_size).flatten().tolist()[0] row = np.array(image_size[0]).flatten().tolist()[0] col = np.array(image_size[1]).flatten().tolist()[0] self.im_size.append(row) self.im_size.append(col) def validate_bbox(self, bbox): x1, y1, x2, y2 = bbox x1 = min(self.im_size[0], max(0, x1)) x2 = min(self.im_size[0], max(0, x2)) y1 = min(self.im_size[1], max(0, y1)) y2 = min(self.im_size[1], max(0, y2)) return [int(x1), int(y1), int(x2), int(y2)] def set_bbox(self, person_bbox): self.bbox = self.validate_bbox(np.array(person_bbox).flatten().tolist()) def set_cat(self, person_cat): self.cat_annotators = len(person_cat[0]) for ann in range(self.cat_annotators): ann_cat = person_cat[0][ann] ann_cat = np.array(ann_cat).flatten().tolist() ann_cat = np.array(ann_cat[0]).flatten().tolist() ann_cat = [np.array(c).flatten().tolist()[0] for c in ann_cat] self.cat.append(ann_cat) def set_comb_cat(self, person_comb_cat): if self.cat_annotators != 0: self.comb_cat = [np.array(c).flatten().tolist()[0] for c in person_comb_cat[0]] else: self.comb_cat = [] def set_comb_cont(self, person_comb_cont): if self.cont_annotators != 0: comb_cont = [np.array(c).flatten().tolist()[0] for c in person_comb_cont[0]] self.comb_cont = [np.array(c).flatten().tolist()[0] for c in comb_cont[0]] else: self.comb_cont = [] def set_cont(self, person_cont): self.cont_annotators = len(person_cont[0]) for ann in range(self.cont_annotators): ann_cont = person_cont[0][ann] ann_cont = np.array(ann_cont).flatten().tolist() ann_cont = np.array(ann_cont[0]).flatten().tolist() ann_cont = [np.array(c).flatten().tolist()[0] for c in ann_cont] self.cont.append(ann_cont) def check_cont(self): for c in self.comb_cont: if np.isnan(c): self.cont_annotators = 0 break def cat_to_one_hot(y_cat): ''' One hot encode a categorical label. :param y_cat: Categorical label. :return: One hot encoded categorical label. ''' one_hot_cat = np.zeros(26) for em in y_cat: one_hot_cat[cat2ind[em]] = 1 return one_hot_cat def prepare_data(data_mat, data_path_src, save_dir, dataset_type='train', generate_npy=False, debug_mode=False): ''' Prepare csv files and save preprocessed data in npy files. :param data_mat: Mat data object for a label. :param data_path_src: Path of the parent directory containing the emotic images folders (mscoco, framesdb, emodb_small, ade20k) :param save_dir: Path of the directory to save the csv files and the npy files (if generate_npy files is True) :param dataset_type: Type of the dataset (train, val or test). Variable used in the name of csv files and npy files. :param generate_npy: If True the data is preprocessed and saved in npy files. Npy files are later used for training. ''' data_set = list() if generate_npy: context_arr = list() body_arr = list() cat_arr = list() cont_arr = list() to_break = 0 path_not_exist = 0 cat_cont_zero = 0 idx = 0 for ex_idx, ex in enumerate(data_mat[0]): nop = len(ex[4][0]) for person in range(nop): if dataset_type == 'train': et = emotic_train(ex[0][0],ex[1][0],ex[2],ex[4][0][person]) else: et = emotic_test(ex[0][0],ex[1][0],ex[2],ex[4][0][person]) try: image_path = os.path.join(data_path_src,et.folder,et.filename) if not os.path.exists(image_path): path_not_exist += 1 print ('path not existing', ex_idx, image_path) continue else: context = cv2.cvtColor(cv2.imread(image_path),cv2.COLOR_BGR2RGB) body = context[et.bbox[1]:et.bbox[3],et.bbox[0]:et.bbox[2]].copy() context_cv = cv2.resize(context, (224,224)) body_cv = cv2.resize(body, (128,128)) except Exception as e: to_break += 1 if debug_mode == True: print ('breaking at idx=%d, %d due to exception=%r' %(ex_idx, idx, e)) continue if (et.cat_annotators == 0 or et.cont_annotators == 0): cat_cont_zero += 1 continue data_set.append(et) if generate_npy == True: context_arr.append(context_cv) body_arr.append(body_cv) if dataset_type == 'train': cat_arr.append(cat_to_one_hot(et.cat)) cont_arr.append(np.array(et.cont)) else: cat_arr.append(cat_to_one_hot(et.comb_cat)) cont_arr.append(np.array(et.comb_cont)) if idx % 1000 == 0 and debug_mode==False: print (" Preprocessing data. Index = ", idx) elif idx % 20 == 0 and debug_mode==True: print (" Preprocessing data. Index = ", idx) idx = idx + 1 # for debugging purposes if debug_mode == True and idx >= 104: print (' ######## Breaking data prep step', idx, ex_idx, ' ######') print (to_break, path_not_exist, cat_cont_zero) cv2.imwrite(os.path.join(save_dir, 'context1.png'), context_arr[-1]) cv2.imwrite(os.path.join(save_dir, 'body1.png'), body_arr[-1]) break print (to_break, path_not_exist, cat_cont_zero) csv_path = os.path.join(save_dir, "%s.csv" %(dataset_type)) with open(csv_path, 'w') as csvfile: filewriter = csv.writer(csvfile, delimiter=',', dialect='excel') row = ['Index', 'Folder', 'Filename', 'Image Size', 'BBox', 'Categorical_Labels', 'Continuous_Labels', 'Gender', 'Age'] filewriter.writerow(row) for idx, ex in enumerate(data_set): if dataset_type == 'train': row = [idx, ex.folder, ex.filename, ex.im_size, ex.bbox, ex.cat, ex.cont, ex.gender, ex.age] else: row = [idx, ex.folder, ex.filename, ex.im_size, ex.bbox, ex.comb_cat, ex.comb_cont, ex.gender, ex.age] filewriter.writerow(row) print ('wrote file ', csv_path) if generate_npy == True: context_arr = np.array(context_arr) body_arr = np.array(body_arr) cat_arr = np.array(cat_arr) cont_arr = np.array(cont_arr) print (len(data_set), context_arr.shape, body_arr.shape) np.save(os.path.join(save_dir,'%s_context_arr.npy' %(dataset_type)), context_arr) np.save(os.path.join(save_dir,'%s_body_arr.npy' %(dataset_type)), body_arr) np.save(os.path.join(save_dir,'%s_cat_arr.npy' %(dataset_type)), cat_arr) np.save(os.path.join(save_dir,'%s_cont_arr.npy' %(dataset_type)), cont_arr) print (context_arr.shape, body_arr.shape, cat_arr.shape, cont_arr.shape) print ('completed generating %s data files' %(dataset_type)) def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--data_dir', type=str, required=True, help='Path to Emotic data and annotations') parser.add_argument('--save_dir_name', type=str, default='emotic_pre', help='Directory name in which preprocessed data will be stored') parser.add_argument('--label', type=str, default='all', choices=['train', 'val', 'test', 'all']) parser.add_argument('--generate_npy', action='store_true', help='Generate npy files') parser.add_argument('--debug_mode', action='store_true', help='Debug mode. Will only save a small subset of the data') # Generate args args = parser.parse_args() return args if __name__ == '__main__': args = parse_args() ann_path_src = os.path.join(args.data_dir, 'Annotations','Annotations.mat') data_path_src = os.path.join(args.data_dir, 'emotic') save_path = os.path.join(args.data_dir, args.save_dir_name) if not os.path.exists(save_path): os.makedirs(save_path) cat = ['Affection', 'Anger', 'Annoyance', 'Anticipation', 'Aversion', 'Confidence', 'Disapproval', 'Disconnection', 'Disquietment', 'Doubt/Confusion', 'Embarrassment', 'Engagement', 'Esteem', 'Excitement', 'Fatigue', 'Fear', 'Happiness', 'Pain', 'Peace', 'Pleasure', 'Sadness', 'Sensitivity', 'Suffering', 'Surprise', 'Sympathy', 'Yearning'] cat2ind = {} ind2cat = {} for idx, emotion in enumerate(cat): cat2ind[emotion] = idx ind2cat[idx] = emotion print ('loading Annotations') mat = loadmat(ann_path_src) if args.label.lower() == 'all': labels = ['train', 'val', 'test'] else: labels = [args.label.lower()] for label in labels: data_mat = mat[label] print ('starting label ', label) prepare_data(data_mat, data_path_src, save_path, dataset_type=label, generate_npy=args.generate_npy, debug_mode=args.debug_mode) ================================================ FILE: prepare_models.py ================================================ import os import torch from torch.autograd import Variable as V import torchvision.models as models from torch.nn import functional as F def prep_models(context_model='resnet18', body_model='resnet18', model_dir='./'): ''' Download imagenet pretrained models for context_model and body_model. :param context_model: Model to use for conetxt features. :param body_model: Model to use for body features. :param model_dir: Directory path where to store pretrained models. :return: Yolo model after loading model weights ''' model_name = '%s_places365.pth.tar' % context_model model_file = os.path.join(model_dir, model_name) if not os.path.exists(model_file): download_command = 'wget ' + 'http://places2.csail.mit.edu/models_places365/' + model_name +' -O ' + model_file os.system(download_command) save_file = os.path.join(model_dir,'%s_places365_py36.pth.tar' % context_model) from functools import partial import pickle pickle.load = partial(pickle.load, encoding="latin1") pickle.Unpickler = partial(pickle.Unpickler, encoding="latin1") model = torch.load(model_file, map_location=lambda storage, loc: storage, pickle_module=pickle) torch.save(model, save_file) # create the network architecture model_context = models.__dict__[context_model](num_classes=365) checkpoint = torch.load(save_file, map_location=lambda storage, loc: storage) # model trained in GPU could be deployed in CPU machine like this! if context_model == 'densenet161': state_dict = {str.replace(k,'module.',''): v for k,v in checkpoint['state_dict'].items()} state_dict = {str.replace(k,'norm.','norm'): v for k,v in state_dict.items()} state_dict = {str.replace(k,'conv.','conv'): v for k,v in state_dict.items()} state_dict = {str.replace(k,'normweight','norm.weight'): v for k,v in state_dict.items()} state_dict = {str.replace(k,'normrunning','norm.running'): v for k,v in state_dict.items()} state_dict = {str.replace(k,'normbias','norm.bias'): v for k,v in state_dict.items()} state_dict = {str.replace(k,'convweight','conv.weight'): v for k,v in state_dict.items()} else: state_dict = {str.replace(k,'module.',''): v for k,v in checkpoint['state_dict'].items()} # the data parallel layer will add 'module' before each layer name model_context.load_state_dict(state_dict) model_context.eval() model_context.cpu() torch.save(model_context, os.path.join(model_dir, 'context_model' + '.pth')) print ('completed preparing context model') model_body = models.__dict__[body_model](pretrained=True) model_body.cpu() torch.save(model_body, os.path.join(model_dir, 'body_model' + '.pth')) print ('completed preparing body model') return model_context, model_body if __name__ == '__main__': prep_models(model_dir='proj/debug_exp/models') ================================================ FILE: sample_inference_list.txt ================================================ /data/emotic19/emotic/mscoco/images/COCO_val2014_000000562243.jpg 86 58 564 628 /data/emotic19/emotic/mscoco/images/COCO_train2014_000000288841.jpg 485 149 605 473 ================================================ FILE: test.py ================================================ import numpy as np import os import scipy.io from sklearn.metrics import average_precision_score, precision_recall_curve import torch import torch.nn as nn from torch.utils.data import DataLoader import torchvision.models as models from torchvision import transforms from emotic import Emotic from emotic_dataset import Emotic_PreDataset def test_scikit_ap(cat_preds, cat_labels, ind2cat): ''' Calculate average precision per emotion category using sklearn library. :param cat_preds: Categorical emotion predictions. :param cat_labels: Categorical emotion labels. :param ind2cat: Dictionary converting integer index to categorical emotion. :return: Numpy array containing average precision per emotion category. ''' ap = np.zeros(26, dtype=np.float32) for i in range(26): ap[i] = average_precision_score(cat_labels[i, :], cat_preds[i, :]) print ('Category %16s %.5f' %(ind2cat[i], ap[i])) print ('Mean AP %.5f' %(ap.mean())) return ap def test_vad(cont_preds, cont_labels, ind2vad): ''' Calcaulate VAD (valence, arousal, dominance) errors. :param cont_preds: Continuous emotion predictions. :param cont_labels: Continuous emotion labels. :param ind2vad: Dictionary converting integer index to continuous emotion dimension (Valence, Arousal and Dominance). :return: Numpy array containing mean absolute error per continuous emotion dimension. ''' vad = np.zeros(3, dtype=np.float32) for i in range(3): vad[i] = np.mean(np.abs(cont_preds[i, :] - cont_labels[i, :])) print ('Continuous %10s %.5f' %(ind2vad[i], vad[i])) print ('Mean VAD Error %.5f' %(vad.mean())) return vad def get_thresholds(cat_preds, cat_labels): ''' Calculate thresholds where precision is equal to recall. These thresholds are then later for inference. :param cat_preds: Categorical emotion predictions. :param cat_labels: Categorical emotion labels. :return: Numpy array containing thresholds per emotion category where precision is equal to recall. ''' thresholds = np.zeros(26, dtype=np.float32) for i in range(26): p, r, t = precision_recall_curve(cat_labels[i, :], cat_preds[i, :]) for k in range(len(p)): if p[k] == r[k]: thresholds[i] = t[k] break return thresholds def test_data(models, device, data_loader, ind2cat, ind2vad, num_images, result_dir='./', test_type='val'): ''' Test models on data :param models: List containing model_context, model_body and emotic_model (fusion model) in that order. :param device: Torch device. Used to send tensors to GPU if available. :param data_loader: Dataloader iterating over dataset. :param ind2cat: Dictionary converting integer index to categorical emotion. :param ind2vad: Dictionary converting integer index to continuous emotion dimension (Valence, Arousal and Dominance) :param num_images: Number of images in the dataset. :param result_dir: Directory path to save results (predictions mat object and thresholds npy object). :param test_type: Test type variable. Variable used in the name of thresholds and predictio files. ''' model_context, model_body, emotic_model = models cat_preds = np.zeros((num_images, 26)) cat_labels = np.zeros((num_images, 26)) cont_preds = np.zeros((num_images, 3)) cont_labels = np.zeros((num_images, 3)) with torch.no_grad(): model_context.to(device) model_body.to(device) emotic_model.to(device) model_context.eval() model_body.eval() emotic_model.eval() indx = 0 print ('starting testing') for images_context, images_body, labels_cat, labels_cont in iter(data_loader): images_context = images_context.to(device) images_body = images_body.to(device) pred_context = model_context(images_context) pred_body = model_body(images_body) pred_cat, pred_cont = emotic_model(pred_context, pred_body) cat_preds[ indx : (indx + pred_cat.shape[0]), :] = pred_cat.to("cpu").data.numpy() cat_labels[ indx : (indx + labels_cat.shape[0]), :] = labels_cat.to("cpu").data.numpy() cont_preds[ indx : (indx + pred_cont.shape[0]), :] = pred_cont.to("cpu").data.numpy() * 10 cont_labels[ indx : (indx + labels_cont.shape[0]), :] = labels_cont.to("cpu").data.numpy() * 10 indx = indx + pred_cat.shape[0] cat_preds = cat_preds.transpose() cat_labels = cat_labels.transpose() cont_preds = cont_preds.transpose() cont_labels = cont_labels.transpose() print ('completed testing') # Mat files used for emotic testing (matlab script) scipy.io.savemat(os.path.join(result_dir, '%s_cat_preds.mat' %(test_type)), mdict={'cat_preds':cat_preds}) scipy.io.savemat(os.path.join(result_dir, '%s_cat_labels.mat' %(test_type)), mdict={'cat_labels':cat_labels}) scipy.io.savemat(os.path.join(result_dir, '%s_cont_preds.mat' %(test_type)), mdict={'cont_preds':cont_preds}) scipy.io.savemat(os.path.join(result_dir, '%s_cont_labels.mat' %(test_type)), mdict={'cont_labels':cont_labels}) print ('saved mat files') test_scikit_ap(cat_preds, cat_labels, ind2cat) test_vad(cont_preds, cont_labels, ind2vad) thresholds = get_thresholds(cat_preds, cat_labels) np.save(os.path.join(result_dir, '%s_thresholds.npy' %(test_type)), thresholds) print ('saved thresholds') def test_emotic(result_path, model_path, ind2cat, ind2vad, context_norm, body_norm, args): ''' Prepare test data and test models on the same. :param result_path: Directory path to save the results (val_predidictions mat object, val_thresholds npy object). :param model_path: Directory path to load pretrained base models and save the models after training. :param ind2cat: Dictionary converting integer index to categorical emotion. :param ind2vad: Dictionary converting integer index to continuous emotion dimension (Valence, Arousal and Dominance). :param context_norm: List containing mean and std values for context images. :param body_norm: List containing mean and std values for body images. :param args: Runtime arguments. ''' # Prepare models model_context = torch.load(os.path.join(model_path,'model_context1.pth')) model_body = torch.load(os.path.join(model_path,'model_body1.pth')) emotic_model = torch.load(os.path.join(model_path,'model_emotic1.pth')) print ('Succesfully loaded models') #Load data preprocessed npy files test_context = np.load(os.path.join(args.data_path, 'test_context_arr.npy')) test_body = np.load(os.path.join(args.data_path, 'test_body_arr.npy')) test_cat = np.load(os.path.join(args.data_path, 'test_cat_arr.npy')) test_cont = np.load(os.path.join(args.data_path, 'test_cont_arr.npy')) print ('test ', 'context ', test_context.shape, 'body', test_body.shape, 'cat ', test_cat.shape, 'cont', test_cont.shape) # Initialize Dataset and DataLoader test_transform = transforms.Compose([transforms.ToPILImage(),transforms.ToTensor()]) test_dataset = Emotic_PreDataset(test_context, test_body, test_cat, test_cont, test_transform, context_norm, body_norm) test_loader = DataLoader(test_dataset, args.batch_size, shuffle=False) print ('test loader ', len(test_loader)) device = torch.device("cuda:%s" %(str(args.gpu)) if torch.cuda.is_available() else "cpu") test_data([model_context, model_body, emotic_model], device, test_loader, ind2cat, ind2vad, len(test_dataset), result_dir=result_path, test_type='test') ================================================ FILE: train.py ================================================ import numpy as np import os import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from torch.optim.lr_scheduler import StepLR from torch.utils.data import DataLoader import torchvision.models as models from torchvision import transforms from tensorboardX import SummaryWriter from emotic import Emotic from emotic_dataset import Emotic_PreDataset from loss import DiscreteLoss, ContinuousLoss_SL1, ContinuousLoss_L2 from prepare_models import prep_models from test import test_data def train_data(opt, scheduler, models, device, train_loader, val_loader, disc_loss, cont_loss, train_writer, val_writer, model_path, args): ''' Training emotic model on train data using train loader. :param opt: Optimizer object. :param scheduler: Learning rate scheduler object. :param models: List containing model_context, model_body and emotic_model (fusion model) in that order. :param device: Torch device. Used to send tensors to GPU if available. :param train_loader: Dataloader iterating over train dataset. :param val_loader: Dataloader iterating over validation dataset. :param disc_loss: Discrete loss criterion. Loss measure between discrete emotion categories predictions and the target emotion categories. :param cont_loss: Continuous loss criterion. Loss measure between continuous VAD emotion predictions and the target VAD values. :param train_writer: SummaryWriter object to save train logs. :param val_writer: SummaryWriter object to save validation logs. :param model_path: Directory path to save the models after training. :param args: Runtime arguments. ''' model_context, model_body, emotic_model = models emotic_model.to(device) model_context.to(device) model_body.to(device) print ('starting training') for e in range(args.epochs): running_loss = 0.0 running_cat_loss = 0.0 running_cont_loss = 0.0 emotic_model.train() model_context.train() model_body.train() #train models for one epoch for images_context, images_body, labels_cat, labels_cont in iter(train_loader): images_context = images_context.to(device) images_body = images_body.to(device) labels_cat = labels_cat.to(device) labels_cont = labels_cont.to(device) opt.zero_grad() pred_context = model_context(images_context) pred_body = model_body(images_body) pred_cat, pred_cont = emotic_model(pred_context, pred_body) cat_loss_batch = disc_loss(pred_cat, labels_cat) cont_loss_batch = cont_loss(pred_cont * 10, labels_cont * 10) loss = (args.cat_loss_weight * cat_loss_batch) + (args.cont_loss_weight * cont_loss_batch) running_loss += loss.item() running_cat_loss += cat_loss_batch.item() running_cont_loss += cont_loss_batch.item() loss.backward() opt.step() if e % 1 == 0: print ('epoch = %d loss = %.4f cat loss = %.4f cont_loss = %.4f' %(e, running_loss, running_cat_loss, running_cont_loss)) train_writer.add_scalar('losses/total_loss', running_loss, e) train_writer.add_scalar('losses/categorical_loss', running_cat_loss, e) train_writer.add_scalar('losses/continuous_loss', running_cont_loss, e) running_loss = 0.0 running_cat_loss = 0.0 running_cont_loss = 0.0 emotic_model.eval() model_context.eval() model_body.eval() with torch.no_grad(): #validation for one epoch for images_context, images_body, labels_cat, labels_cont in iter(val_loader): images_context = images_context.to(device) images_body = images_body.to(device) labels_cat = labels_cat.to(device) labels_cont = labels_cont.to(device) pred_context = model_context(images_context) pred_body = model_body(images_body) pred_cat, pred_cont = emotic_model(pred_context, pred_body) cat_loss_batch = disc_loss(pred_cat, labels_cat) cont_loss_batch = cont_loss(pred_cont * 10, labels_cont * 10) loss = (args.cat_loss_weight * cat_loss_batch) + (args.cont_loss_weight * cont_loss_batch) running_loss += loss.item() running_cat_loss += cat_loss_batch.item() running_cont_loss += cont_loss_batch.item() if e % 1 == 0: print ('epoch = %d validation loss = %.4f cat loss = %.4f cont loss = %.4f ' %(e, running_loss, running_cat_loss, running_cont_loss)) val_writer.add_scalar('losses/total_loss', running_loss, e) val_writer.add_scalar('losses/categorical_loss', running_cat_loss, e) val_writer.add_scalar('losses/continuous_loss', running_cont_loss, e) scheduler.step() print ('completed training') emotic_model.to("cpu") model_context.to("cpu") model_body.to("cpu") torch.save(emotic_model, os.path.join(model_path, 'model_emotic1.pth')) torch.save(model_context, os.path.join(model_path, 'model_context1.pth')) torch.save(model_body, os.path.join(model_path, 'model_body1.pth')) print ('saved models') def train_emotic(result_path, model_path, train_log_path, val_log_path, ind2cat, ind2vad, context_norm, body_norm, args): ''' Prepare dataset, dataloders, models. :param result_path: Directory path to save the results (val_predidictions mat object, val_thresholds npy object). :param model_path: Directory path to load pretrained base models and save the models after training. :param train_log_path: Directory path to save the training logs. :param val_log_path: Directoty path to save the validation logs. :param ind2cat: Dictionary converting integer index to categorical emotion. :param ind2vad: Dictionary converting integer index to continuous emotion dimension (Valence, Arousal and Dominance). :param context_norm: List containing mean and std values for context images. :param body_norm: List containing mean and std values for body images. :param args: Runtime arguments. ''' # Load preprocessed data from npy files train_context = np.load(os.path.join(args.data_path, 'train_context_arr.npy')) train_body = np.load(os.path.join(args.data_path, 'train_body_arr.npy')) train_cat = np.load(os.path.join(args.data_path, 'train_cat_arr.npy')) train_cont = np.load(os.path.join(args.data_path, 'train_cont_arr.npy')) val_context = np.load(os.path.join(args.data_path, 'val_context_arr.npy')) val_body = np.load(os.path.join(args.data_path, 'val_body_arr.npy')) val_cat = np.load(os.path.join(args.data_path, 'val_cat_arr.npy')) val_cont = np.load(os.path.join(args.data_path, 'val_cont_arr.npy')) print ('train ', 'context ', train_context.shape, 'body', train_body.shape, 'cat ', train_cat.shape, 'cont', train_cont.shape) print ('val ', 'context ', val_context.shape, 'body', val_body.shape, 'cat ', val_cat.shape, 'cont', val_cont.shape) # Initialize Dataset and DataLoader train_transform = transforms.Compose([transforms.ToPILImage(),transforms.RandomHorizontalFlip(), transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4), transforms.ToTensor()]) test_transform = transforms.Compose([transforms.ToPILImage(),transforms.ToTensor()]) train_dataset = Emotic_PreDataset(train_context, train_body, train_cat, train_cont, train_transform, context_norm, body_norm) val_dataset = Emotic_PreDataset(val_context, val_body, val_cat, val_cont, test_transform, context_norm, body_norm) train_loader = DataLoader(train_dataset, args.batch_size, shuffle=True) val_loader = DataLoader(val_dataset, args.batch_size, shuffle=False) print ('train loader ', len(train_loader), 'val loader ', len(val_loader)) # Prepare models model_context, model_body = prep_models(context_model=args.context_model, body_model=args.body_model, model_dir=model_path) emotic_model = Emotic(list(model_context.children())[-1].in_features, list(model_body.children())[-1].in_features) model_context = nn.Sequential(*(list(model_context.children())[:-1])) model_body = nn.Sequential(*(list(model_body.children())[:-1])) for param in emotic_model.parameters(): param.requires_grad = True for param in model_context.parameters(): param.requires_grad = True for param in model_body.parameters(): param.requires_grad = True device = torch.device("cuda:%s" %(str(args.gpu)) if torch.cuda.is_available() else "cpu") opt = optim.Adam((list(emotic_model.parameters()) + list(model_context.parameters()) + list(model_body.parameters())), lr=args.learning_rate, weight_decay=args.weight_decay) scheduler = StepLR(opt, step_size=7, gamma=0.1) disc_loss = DiscreteLoss(args.discrete_loss_weight_type, device) if args.continuous_loss_type == 'Smooth L1': cont_loss = ContinuousLoss_SL1() else: cont_loss = ContinuousLoss_L2() train_writer = SummaryWriter(train_log_path) val_writer = SummaryWriter(val_log_path) # training train_data(opt, scheduler, [model_context, model_body, emotic_model], device, train_loader, val_loader, disc_loss, cont_loss, train_writer, val_writer, model_path, args) # validation test_data([model_context, model_body, emotic_model], device, val_loader, ind2cat, ind2vad, len(val_dataset), result_dir=result_path, test_type='val') ================================================ FILE: yolo_inference.py ================================================ import argparse import cv2 import numpy as np import os import torch from torchvision import transforms from emotic import Emotic from inference import infer from yolo_utils import prepare_yolo, rescale_boxes, non_max_suppression def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--gpu', type=int, default=0, help='gpu id') parser.add_argument('--experiment_path', type=str, required=True, help='Path of experiment files (results, models, logs)') parser.add_argument('--model_dir', type=str, default='models', help='Folder to access the models') parser.add_argument('--result_dir', type=str, default='results', help='Path to save the results') parser.add_argument('--inference_file', type=str, help='Text file containing image context paths and bounding box') parser.add_argument('--video_file', type=str, help='Test video file') # Generate args args = parser.parse_args() return args def get_bbox(yolo_model, device, image_context, yolo_image_size=416, conf_thresh=0.8, nms_thresh=0.4): ''' Use yolo to obtain bounding box of every person in context image. :param yolo_model: Yolo model to obtain bounding box of every person in context image. :param device: Torch device. Used to send tensors to GPU (if available) for faster processing. :yolo_image_size: Input image size for yolo model. :conf_thresh: Confidence threshold for yolo model. Predictions with object confidence > conf_thresh are returned. :nms_thresh: Non-maximal suppression threshold for yolo model. Predictions with IoU > nms_thresh are returned. :return: Numpy array of bounding boxes. Array shape = (no_of_persons, 4). ''' test_transform = transforms.Compose([transforms.ToPILImage(),transforms.ToTensor()]) image_yolo = test_transform(cv2.resize(image_context, (416, 416))).unsqueeze(0).to(device) with torch.no_grad(): detections = yolo_model(image_yolo) nms_det = non_max_suppression(detections, conf_thresh, nms_thresh)[0] det = rescale_boxes(nms_det, yolo_image_size, (image_context.shape[:2])) bboxes = [] for x1, y1, x2, y2, _, _, cls_pred in det: if cls_pred == 0: # checking if predicted_class = persons. x1 = int(min(image_context.shape[1], max(0, x1))) x2 = int(min(image_context.shape[1], max(x1, x2))) y1 = int(min(image_context.shape[0], max(15, y1))) y2 = int(min(image_context.shape[0], max(y1, y2))) bboxes.append([x1, y1, x2, y2]) return np.array(bboxes) def yolo_infer(images_list, result_path, model_path, context_norm, body_norm, ind2cat, ind2vad, args): ''' Infer on a list of images defined in images_list text file to obtain bounding boxes of persons in the images using yolo model. :param images_list: Text file specifying the images to conduct inference. A row in the file is Path_of_image. :param result_path: Directory path to save the results (images with the predicted emotion categories and continuous emotion dimesnions). :param model_path: Directory path to load models and val_thresholds to perform inference. :param context_norm: List containing mean and std values for context images. :param body_norm: List containing mean and std values for body images. :param ind2cat: Dictionary converting integer index to categorical emotion. :param ind2vad: Dictionary converting integer index to continuous emotion dimension (Valence, Arousal and Dominance). :param args: Runtime arguments. ''' device = torch.device("cuda:%s" %(str(args.gpu)) if torch.cuda.is_available() else "cpu") yolo = prepare_yolo(model_path) yolo = yolo.to(device) yolo.eval() thresholds = torch.FloatTensor(np.load(os.path.join(result_path, 'val_thresholds.npy'))).to(device) model_context = torch.load(os.path.join(model_path,'model_context1.pth')).to(device) model_body = torch.load(os.path.join(model_path,'model_body1.pth')).to(device) emotic_model = torch.load(os.path.join(model_path,'model_emotic1.pth')).to(device) models = [model_context, model_body, emotic_model] with open(images_list, 'r') as f: lines = f.readlines() for idx, line in enumerate(lines): image_context_path = line.split('\n')[0].split(' ')[0] image_context = cv2.cvtColor(cv2.imread(image_context_path), cv2.COLOR_BGR2RGB) try: bbox_yolo = get_bbox(yolo, device, image_context) for pred_bbox in bbox_yolo: pred_cat, pred_cont = infer(context_norm, body_norm, ind2cat, ind2vad, device, thresholds, models, image_context=image_context, bbox=pred_bbox, to_print=False) write_text_vad = list() for continuous in pred_cont: write_text_vad.append(str('%.1f' %(continuous))) write_text_vad = 'vad ' + ' '.join(write_text_vad) image_context = cv2.rectangle(image_context, (pred_bbox[0], pred_bbox[1]),(pred_bbox[2] , pred_bbox[3]), (255, 0, 0), 3) cv2.putText(image_context, write_text_vad, (pred_bbox[0], pred_bbox[1] - 5), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1) for i, emotion in enumerate(pred_cat): cv2.putText(image_context, emotion, (pred_bbox[0], pred_bbox[1] + (i+1)*12), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1) except Exception as e: print ('Exception for image ',image_context_path) print (e) cv2.imwrite(os.path.join(result_path, 'img_%r.jpg' %(idx)), cv2.cvtColor(image_context, cv2.COLOR_RGB2BGR)) print ('completed inference for image %d' %(idx)) def yolo_video(video_file, result_path, model_path, context_norm, body_norm, ind2cat, ind2vad, args): ''' Perform inference on a video. First yolo model is used to obtain bounding boxes of persons in every frame. After that the emotic model is used to obtain categoraical and continuous emotion predictions. :param video_file: Path of video file. :param result_path: Directory path to save the results (output video). :param model_path: Directory path to load models and val_thresholds to perform inference. :param context_norm: List containing mean and std values for context images. :param body_norm: List containing mean and std values for body images. :param ind2cat: Dictionary converting integer index to categorical emotion. :param ind2vad: Dictionary converting integer index to continuous emotion dimension (Valence, Arousal and Dominance). :param args: Runtime arguments. ''' device = torch.device("cuda:%s" %(str(args.gpu)) if torch.cuda.is_available() else "cpu") yolo = prepare_yolo(model_path) yolo = yolo.to(device) yolo.eval() thresholds = torch.FloatTensor(np.load(os.path.join(result_path, 'val_thresholds.npy'))).to(device) model_context = torch.load(os.path.join(model_path,'model_context1.pth')).to(device) model_body = torch.load(os.path.join(model_path,'model_body1.pth')).to(device) emotic_model = torch.load(os.path.join(model_path,'model_emotic1.pth')).to(device) model_context.eval() model_body.eval() emotic_model.eval() models = [model_context, model_body, emotic_model] video_stream = cv2.VideoCapture(video_file) writer = None print ('Starting testing on video') while True: (grabbed, frame) = video_stream.read() if not grabbed: break image_context = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) try: bbox_yolo = get_bbox(yolo, device, image_context) for pred_idx, pred_bbox in enumerate(bbox_yolo): pred_cat, pred_cont = infer(context_norm, body_norm, ind2cat, ind2vad, device, thresholds, models, image_context=image_context, bbox=pred_bbox, to_print=False) write_text_vad = list() for continuous in pred_cont: write_text_vad.append(str('%.1f' %(continuous))) write_text_vad = 'vad ' + ' '.join(write_text_vad) image_context = cv2.rectangle(image_context, (pred_bbox[0], pred_bbox[1]),(pred_bbox[2] , pred_bbox[3]), (255, 0, 0), 3) cv2.putText(image_context, write_text_vad, (pred_bbox[0], pred_bbox[1] - 5), cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 255), 2) for i, emotion in enumerate(pred_cat): cv2.putText(image_context, emotion, (pred_bbox[0], pred_bbox[1] + (i+1)*12), cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 255), 2) except Exception: pass if writer is None: fourcc = cv2.VideoWriter_fourcc(*"MJPG") writer = cv2.VideoWriter(os.path.join(result_path, 'result_vid.avi'), fourcc, 30, (image_context.shape[1], image_context.shape[0]), True) writer.write(cv2.cvtColor(image_context, cv2.COLOR_RGB2BGR)) writer.release() video_stream.release() print ('Completed video') def check_paths(args): ''' Check (create if they don't exist) experiment directories. :param args: Runtime arguments as passed by the user. :return: result_dir_path, model_dir_path. ''' if args.inference_file is not None: if not os.path.exists(args.inference_file): raise ValueError('inference file does not exist. Please pass a valid inference file') if args.video_file is not None: if not os.path.exists(args.video_file): raise ValueError('video file does not exist. Please pass a valid video file') if args.inference_file is None and args.video_file is None: raise ValueError(' both inference file and video file can\'t be none. Please specify one and run again') model_path = os.path.join(args.experiment_path, args.model_dir) if not os.path.exists(model_path): raise ValueError('model path %s does not exist. Please pass a valid model_path' %(model_path)) result_path = os.path.join(args.experiment_path, args.result_dir) if not os.path.exists(result_path): os.makedirs(result_path) return result_path, model_path if __name__=='__main__': args = parse_args() result_path, model_path = check_paths(args) cat = ['Affection', 'Anger', 'Annoyance', 'Anticipation', 'Aversion', 'Confidence', 'Disapproval', 'Disconnection', \ 'Disquietment', 'Doubt/Confusion', 'Embarrassment', 'Engagement', 'Esteem', 'Excitement', 'Fatigue', 'Fear','Happiness', \ 'Pain', 'Peace', 'Pleasure', 'Sadness', 'Sensitivity', 'Suffering', 'Surprise', 'Sympathy', 'Yearning'] cat2ind = {} ind2cat = {} for idx, emotion in enumerate(cat): cat2ind[emotion] = idx ind2cat[idx] = emotion vad = ['Valence', 'Arousal', 'Dominance'] ind2vad = {} for idx, continuous in enumerate(vad): ind2vad[idx] = continuous context_mean = [0.4690646, 0.4407227, 0.40508908] context_std = [0.2514227, 0.24312855, 0.24266963] body_mean = [0.43832874, 0.3964344, 0.3706214] body_std = [0.24784276, 0.23621225, 0.2323653] context_norm = [context_mean, context_std] body_norm = [body_mean, body_std] if args.inference_file is not None: print ('inference over inference file images') yolo_infer(args.inference_file, result_path, model_path, context_norm, body_norm, ind2cat, ind2vad, args) if args.video_file is not None: print ('inference over test video') yolo_video(args.video_file, result_path, model_path, context_norm, body_norm, ind2cat, ind2vad, args) ================================================ FILE: yolo_utils.py ================================================ import cv2 import numpy as np import os import torch import torch.nn as nn import torch.nn.functional as F def to_cpu(tensor): return tensor.detach().cpu() def xywh2xyxy(x): ''' Convert bounding box from [x, y, w, h] to [x1, y1, x2, y2] :param x: bounding boxes array :return: Converted bounding box array ''' y = x.new(x.shape) y[..., 0] = x[..., 0] - x[..., 2] / 2 y[..., 1] = x[..., 1] - x[..., 3] / 2 y[..., 2] = x[..., 0] + x[..., 2] / 2 y[..., 3] = x[..., 1] + x[..., 3] / 2 return y def bbox_iou(box1, box2, x1y1x2y2=True): """ Returns the IoU of two bounding boxes """ if not x1y1x2y2: # Transform from center and width to exact coordinates b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 else: # Get the coordinates of bounding boxes b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] # get the corrdinates of the intersection rectangle inter_rect_x1 = torch.max(b1_x1, b2_x1) inter_rect_y1 = torch.max(b1_y1, b2_y1) inter_rect_x2 = torch.min(b1_x2, b2_x2) inter_rect_y2 = torch.min(b1_y2, b2_y2) # Intersection area inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * torch.clamp( inter_rect_y2 - inter_rect_y1 + 1, min=0 ) # Union Area b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) return iou def rescale_boxes(boxes, current_dim, original_shape): """ Rescales bounding boxes to the original shape """ orig_h, orig_w = original_shape # The amount of padding that was added pad_x = max(orig_h - orig_w, 0) * (current_dim / max(original_shape)) pad_y = max(orig_w - orig_h, 0) * (current_dim / max(original_shape)) # Image height and width after padding is removed unpad_h = current_dim - pad_y unpad_w = current_dim - pad_x # Rescale bounding boxes to dimension of original image boxes[:, 0] = ((boxes[:, 0] - pad_x // 2) / unpad_w) * orig_w boxes[:, 1] = ((boxes[:, 1] - pad_y // 2) / unpad_h) * orig_h boxes[:, 2] = ((boxes[:, 2] - pad_x // 2) / unpad_w) * orig_w boxes[:, 3] = ((boxes[:, 3] - pad_y // 2) / unpad_h) * orig_h return boxes def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4): """ Removes detections with lower object confidence score than 'conf_thres' and performs Non-Maximum Suppression to further filter detections. Returns detections with shape: (x1, y1, x2, y2, object_conf, class_score, class_pred) """ # From (center x, center y, width, height) to (x1, y1, x2, y2) prediction[..., :4] = xywh2xyxy(prediction[..., :4]) output = [None for _ in range(len(prediction))] for image_i, image_pred in enumerate(prediction): # Filter out confidence scores below threshold image_pred = image_pred[image_pred[:, 4] >= conf_thres] # If none are remaining => process next image if not image_pred.size(0): continue # Object confidence times class confidence score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0] # Sort by it image_pred = image_pred[(-score).argsort()] class_confs, class_preds = image_pred[:, 5:].max(1, keepdim=True) detections = torch.cat((image_pred[:, :5], class_confs.float(), class_preds.float()), 1) # Perform non-maximum suppression keep_boxes = [] while detections.size(0): large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres label_match = detections[0, -1] == detections[:, -1] # Indices of boxes with lower confidence scores, large IOUs and matching labels invalid = large_overlap & label_match weights = detections[invalid, 4:5] # Merge overlapping bboxes by order of confidence detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum() keep_boxes += [detections[0]] detections = detections[~invalid] if keep_boxes: output[image_i] = torch.stack(keep_boxes) return output def parse_model_config(path): """Parses the yolo-v3 layer configuration file and returns module definitions""" file = open(path, 'r') lines = file.read().split('\n') lines = [x for x in lines if x and not x.startswith('#')] lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces module_defs = [] for line in lines: if line.startswith('['): # This marks the start of a new block module_defs.append({}) module_defs[-1]['type'] = line[1:-1].rstrip() if module_defs[-1]['type'] == 'convolutional': module_defs[-1]['batch_normalize'] = 0 else: key, value = line.split("=") value = value.strip() module_defs[-1][key.rstrip()] = value.strip() return module_defs def parse_data_config(path): """Parses the data configuration file""" options = dict() options['gpus'] = '0,1,2,3' options['num_workers'] = '10' with open(path, 'r') as fp: lines = fp.readlines() for line in lines: line = line.strip() if line == '' or line.startswith('#'): continue key, value = line.split('=') options[key.strip()] = value.strip() return options def create_modules(module_defs): """ Constructs module list of layer blocks from module configuration in module_defs """ hyperparams = module_defs.pop(0) output_filters = [int(hyperparams["channels"])] module_list = nn.ModuleList() for module_i, module_def in enumerate(module_defs): modules = nn.Sequential() if module_def["type"] == "convolutional": bn = int(module_def["batch_normalize"]) filters = int(module_def["filters"]) kernel_size = int(module_def["size"]) pad = (kernel_size - 1) // 2 modules.add_module( f"conv_{module_i}", nn.Conv2d( in_channels=output_filters[-1], out_channels=filters, kernel_size=kernel_size, stride=int(module_def["stride"]), padding=pad, bias=not bn, ), ) if bn: modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9, eps=1e-5)) if module_def["activation"] == "leaky": modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1)) elif module_def["type"] == "maxpool": kernel_size = int(module_def["size"]) stride = int(module_def["stride"]) if kernel_size == 2 and stride == 1: modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1))) maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2)) modules.add_module(f"maxpool_{module_i}", maxpool) elif module_def["type"] == "upsample": upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest") modules.add_module(f"upsample_{module_i}", upsample) elif module_def["type"] == "route": layers = [int(x) for x in module_def["layers"].split(",")] filters = sum([output_filters[1:][i] for i in layers]) modules.add_module(f"route_{module_i}", EmptyLayer()) elif module_def["type"] == "shortcut": filters = output_filters[1:][int(module_def["from"])] modules.add_module(f"shortcut_{module_i}", EmptyLayer()) elif module_def["type"] == "yolo": anchor_idxs = [int(x) for x in module_def["mask"].split(",")] # Extract anchors anchors = [int(x) for x in module_def["anchors"].split(",")] anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] anchors = [anchors[i] for i in anchor_idxs] num_classes = int(module_def["classes"]) img_size = int(hyperparams["height"]) # Define detection layer yolo_layer = YOLOLayer(anchors, num_classes, img_size) modules.add_module(f"yolo_{module_i}", yolo_layer) # Register module list and number of output filters module_list.append(modules) output_filters.append(filters) return hyperparams, module_list class Upsample(nn.Module): """ nn.Upsample is deprecated """ def __init__(self, scale_factor, mode="nearest"): super(Upsample, self).__init__() self.scale_factor = scale_factor self.mode = mode def forward(self, x): x = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode) return x class EmptyLayer(nn.Module): """Placeholder for 'route' and 'shortcut' layers""" def __init__(self): super(EmptyLayer, self).__init__() class YOLOLayer(nn.Module): """Detection layer""" def __init__(self, anchors, num_classes, img_dim=416): super(YOLOLayer, self).__init__() self.anchors = anchors self.num_anchors = len(anchors) self.num_classes = num_classes self.ignore_thres = 0.5 self.mse_loss = nn.MSELoss() self.bce_loss = nn.BCELoss() self.obj_scale = 1 self.noobj_scale = 100 self.metrics = {} self.img_dim = img_dim self.grid_size = 0 # grid size def compute_grid_offsets(self, grid_size, cuda=True): self.grid_size = grid_size g = self.grid_size FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor self.stride = self.img_dim / self.grid_size # Calculate offsets for each grid self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor) self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor) self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors]) self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1)) self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1)) def forward(self, x, targets=None, img_dim=None): # Tensors for cuda support FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor self.img_dim = img_dim num_samples = x.size(0) grid_size = x.size(2) prediction = ( x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size) .permute(0, 1, 3, 4, 2) .contiguous() ) # Get outputs x = torch.sigmoid(prediction[..., 0]) # Center x y = torch.sigmoid(prediction[..., 1]) # Center y w = prediction[..., 2] # Width h = prediction[..., 3] # Height pred_conf = torch.sigmoid(prediction[..., 4]) # Conf pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred. # If grid size does not match current we compute new offsets if grid_size != self.grid_size: self.compute_grid_offsets(grid_size, cuda=x.is_cuda) # Add offset and scale with anchors pred_boxes = FloatTensor(prediction[..., :4].shape) pred_boxes[..., 0] = x.data + self.grid_x pred_boxes[..., 1] = y.data + self.grid_y pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h output = torch.cat( ( pred_boxes.view(num_samples, -1, 4) * self.stride, pred_conf.view(num_samples, -1, 1), pred_cls.view(num_samples, -1, self.num_classes), ), -1, ) if targets is None: return output, 0 else: iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets( pred_boxes=pred_boxes, pred_cls=pred_cls, target=targets, anchors=self.scaled_anchors, ignore_thres=self.ignore_thres, ) # Loss : Mask outputs to ignore non-existing objects (except with conf. loss) loss_x = self.mse_loss(x[obj_mask], tx[obj_mask]) loss_y = self.mse_loss(y[obj_mask], ty[obj_mask]) loss_w = self.mse_loss(w[obj_mask], tw[obj_mask]) loss_h = self.mse_loss(h[obj_mask], th[obj_mask]) loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask]) loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask]) loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask]) total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls # Metrics cls_acc = 100 * class_mask[obj_mask].mean() conf_obj = pred_conf[obj_mask].mean() conf_noobj = pred_conf[noobj_mask].mean() conf50 = (pred_conf > 0.5).float() iou50 = (iou_scores > 0.5).float() iou75 = (iou_scores > 0.75).float() detected_mask = conf50 * class_mask * tconf precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16) recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16) recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16) self.metrics = { "loss": to_cpu(total_loss).item(), "x": to_cpu(loss_x).item(), "y": to_cpu(loss_y).item(), "w": to_cpu(loss_w).item(), "h": to_cpu(loss_h).item(), "conf": to_cpu(loss_conf).item(), "cls": to_cpu(loss_cls).item(), "cls_acc": to_cpu(cls_acc).item(), "recall50": to_cpu(recall50).item(), "recall75": to_cpu(recall75).item(), "precision": to_cpu(precision).item(), "conf_obj": to_cpu(conf_obj).item(), "conf_noobj": to_cpu(conf_noobj).item(), "grid_size": grid_size, } return output, total_loss class Darknet(nn.Module): """YOLOv3 object detection model""" def __init__(self, config_path, img_size=416): super(Darknet, self).__init__() self.module_defs = parse_model_config(config_path) self.hyperparams, self.module_list = create_modules(self.module_defs) self.yolo_layers = [layer[0] for layer in self.module_list if hasattr(layer[0], "metrics")] self.img_size = img_size self.seen = 0 self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32) def forward(self, x, targets=None): img_dim = x.shape[2] loss = 0 layer_outputs, yolo_outputs = [], [] for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): if module_def["type"] in ["convolutional", "upsample", "maxpool"]: x = module(x) elif module_def["type"] == "route": x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1) elif module_def["type"] == "shortcut": layer_i = int(module_def["from"]) x = layer_outputs[-1] + layer_outputs[layer_i] elif module_def["type"] == "yolo": x, layer_loss = module[0](x, targets, img_dim) loss += layer_loss yolo_outputs.append(x) layer_outputs.append(x) yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1)) return yolo_outputs if targets is None else (loss, yolo_outputs) def load_darknet_weights(self, weights_path): """Parses and loads the weights stored in 'weights_path'""" # Open the weights file with open(weights_path, "rb") as f: header = np.fromfile(f, dtype=np.int32, count=5) # First five are header values self.header_info = header # Needed to write header when saving weights self.seen = header[3] # number of images seen during training weights = np.fromfile(f, dtype=np.float32) # The rest are weights # Establish cutoff for loading backbone weights cutoff = None if "darknet53.conv.74" in weights_path: cutoff = 75 ptr = 0 for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): if i == cutoff: break if module_def["type"] == "convolutional": conv_layer = module[0] if module_def["batch_normalize"]: # Load BN bias, weights, running mean and running variance bn_layer = module[1] num_b = bn_layer.bias.numel() # Number of biases # Bias bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias) bn_layer.bias.data.copy_(bn_b) ptr += num_b # Weight bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight) bn_layer.weight.data.copy_(bn_w) ptr += num_b # Running Mean bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean) bn_layer.running_mean.data.copy_(bn_rm) ptr += num_b # Running Var bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var) bn_layer.running_var.data.copy_(bn_rv) ptr += num_b else: # Load conv. bias num_b = conv_layer.bias.numel() conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias) conv_layer.bias.data.copy_(conv_b) ptr += num_b # Load conv. weights num_w = conv_layer.weight.numel() conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight) conv_layer.weight.data.copy_(conv_w) ptr += num_w def save_darknet_weights(self, path, cutoff=-1): """ @:param path - path of the new weights file @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved) """ fp = open(path, "wb") self.header_info[3] = self.seen self.header_info.tofile(fp) # Iterate through layers for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): if module_def["type"] == "convolutional": conv_layer = module[0] # If batch norm, load bn first if module_def["batch_normalize"]: bn_layer = module[1] bn_layer.bias.data.cpu().numpy().tofile(fp) bn_layer.weight.data.cpu().numpy().tofile(fp) bn_layer.running_mean.data.cpu().numpy().tofile(fp) bn_layer.running_var.data.cpu().numpy().tofile(fp) # Load conv bias else: conv_layer.bias.data.cpu().numpy().tofile(fp) # Load conv weights conv_layer.weight.data.cpu().numpy().tofile(fp) fp.close() def prepare_yolo(model_dir): ''' Download yolo model files and load the model weights :param model_dir: Directory path where to store yolo model weights and yolo model configuration file. :return: Yolo model after loading model weights ''' cfg_file = os.path.join(model_dir, 'yolov3.cfg') if not os.path.exists(cfg_file): download_command = 'wget https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg -O ' + cfg_file os.system(download_command) weight_file = os.path.join(model_dir, 'yolov3.weights') if not os.path.exists(weight_file): download_command = 'wget https://pjreddie.com/media/files/yolov3.weights -O ' + weight_file os.system(download_command) yolo_model = Darknet(cfg_file, 416) yolo_model.load_darknet_weights(weight_file) print ('prepared yolo model') return yolo_model