[
  {
    "path": ".gitignore",
    "content": "**/__pycache__/\ndatasets/*/\n!datasets/.gitkeep\nassets/*\n!assets/.gitkeep\nckpts/*\n!ckpts/.gitkeep\noutputs/ckpts/*\n!outputs/ckpts/.gitkeep\noutputs/logs/*\n!outputs/logs/.gitkeep\noutputs/results/*\n!outputs/results/.gitkeep\npreds/*\n!preds/.gitkeep\ntmp"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "README.md",
    "content": "# Prophet\n\n[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/prompting-large-language-models-with-answer/visual-question-answering-on-a-okvqa)](https://paperswithcode.com/sota/visual-question-answering-on-a-okvqa?p=prompting-large-language-models-with-answer)\n[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/prompting-large-language-models-with-answer/visual-question-answering-on-ok-vqa)](https://paperswithcode.com/sota/visual-question-answering-on-ok-vqa?p=prompting-large-language-models-with-answer)\n\nThis repository is the official implementation of Prophet, a two-stage framework designed to prompt GPT-3 with answer heuristics for knowledge-based VQA. In stage one, we train a vanilla VQA model on a specific knowledge-based VQA dataset and extract two types of complementary answer heuristics from the model: answer candidates and answer-aware examples. In stage two, answer heuristics are used to prompt GPT-3 to generate better answers. Prophet significantly outperforms existing state-of-the-art methods on two datasets, delivering 61.1% on OK-VQA and 55.7% on A-OKVQA. Please refer to our [paper](https://arxiv.org/pdf/2303.01903.pdf) for details.\n\n![prophet](misc/framework.png)\n\n## Updates\nApril 28, 2023\n- Add pretrained and finetuned models on A-OKVQA.\n\nMarch 10, 2023\n- Training and testing codes of the two-stage Prophet framework.\n- Pretrained and finetuned models on OK-VQA.\n\n## Table of Contents\n\n- [Prerequisites](#prerequisites)\n- [Usage](#usage)\n- [Evaluation](#evaluation)\n- [Citation](#citation)\n- [License](#license)\n<!-- - [Acknowledgement](#acknowledgement) -->\n\n## Prerequisites\n\n### Hardware and Software Requirements\n\nTo conduct the following experiments, a machine with at least 1 RTX 3090 GPU, 50GB memory, and 300GB free disk space is recommended. 
We strongly recommend using an SSD drive to guarantee high-speed I/O.\n\nThe following software is needed:\n\n1. [Python](https://www.python.org/downloads/) >= 3.9\n2. [Cuda](https://developer.nvidia.com/cuda-toolkit) >= 11.3\n3. [Pytorch](https://pytorch.org/get-started/locally/) >= 1.12.0\n4. what you can find in [environment.yml](environment.yml)\n\nWe recommend downloading [Anaconda](https://www.anaconda.com/) first and then creating a new environment with the following command:\n\n``` shell\n$ conda env create -f environment.yml\n```\n\nThis command will create a new environment named `prophet` with all the required packages. To activate the environment, run:\n\n``` shell\n$ conda activate prophet\n```\n\n### Data Preparation\n\nBefore running the code, prepare two folders: `datasets` and `assets`. The `datasets` folder contains all the datasets and features used in this project, and the `assets` folder contains the pre-computed resources and other intermediate files (you can use them to skip some early experiment steps and save time).\n\nFirst, download the [datasets](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/Ebzd7EANzHVHnh3FvYvCJ7kBkJf56iT1Obe5L2PZAzgM2g?download=1) and [assets](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/Ec5NPIswAxlEqi74qwGjIf0BKInF0O6nwW5dtn4h3GOUsQ?download=1). Then put the `datasets` and `assets` folder in the root directory of this project. Download MSCOCO 2014 and 2017 images from [here](https://cocodataset.org/#download) (you can skip MSCOCO 2017 if you only experiment on OK-VQA) and put them in the `datasets` folder. 
Run the following command to extract the features of the images:\n\n``` shell\n$ bash scripts/extract_img_feats.sh\n```\n\nAfter that, the `datasets` and `assets` folder will have the following structure:\n\n<details>\n<summary>Click to expand</summary>\n\n```\ndatasets\n├── aokvqa\n│   ├── aokvqa_v1p0_test.json\n│   ├── aokvqa_v1p0_train.json\n│   └── aokvqa_v1p0_val.json\n├── coco2014\n│   ├── train2014\n│   └── val2014\n├── coco2014_feats\n│   ├── train2014\n│   └── val2014\n├── coco2017\n│   ├── test2017\n│   ├── train2017\n│   └── val2017\n├── coco2017_feats\n│   ├── test2017\n│   ├── train2017\n│   └── val2017\n├── okvqa\n│   ├── mscoco_train2014_annotations.json\n│   ├── mscoco_val2014_annotations.json\n│   ├── OpenEnded_mscoco_train2014_questions.json\n│   └── OpenEnded_mscoco_val2014_questions.json\n└── vqav2\n    ├── v2_mscoco_train2014_annotations.json\n    ├── v2_mscoco_val2014_annotations.json\n    ├── v2_OpenEnded_mscoco_train2014_questions.json\n    ├── v2_OpenEnded_mscoco_val2014_questions.json\n    ├── v2valvg_no_ok_annotations.json\n    ├── v2valvg_no_ok_questions.json\n    ├── vg_annotations.json\n    └── vg_questions.json\n```\n</details>\n\nWe've also provided a tree structure of the entire project in [misc/tree.txt](misc/tree.txt).\n\n## Usage\n\nWe provide bash scripts for each stage of the Prophet framework. You can find them in the `scripts` directory. There are two common arguments you should take care of when running each script:\n\n- `--task`: specify the task (i.e., the target dataset) you want to deal with. 
The available options are `ok` (training on `train` set of OK-VQA and evaluating on the `test` set of OK-VQA), `aok_val` (training on `train` set of A-OKVQA and evaluating on the `val` set of A-OKVQA) and `aok_test` (training on `train` set and `val` set of A-OKVQA and evaluating on the `test` set of A-OKVQA);\n\nNote that although Prophet uses VQA v2 datasets for pre-training, there are slight differences in how the datasets are used for different tasks (`ok`, `aok_val`, and `aok_test`), as detailed in [configs/task_to_split.py](configs/task_to_split.py). This means that different pre-training commands need to be followed for each task.\n\n- `--version`: specify the version name of this run. This name will be used to create a new folder in the `outputs` directory to store the results of this run.\n\nNotice that you can omit any arguments when invoking the following scripts; they will then use the default arguments written in the script files.\n\nBefore running any script, you can also update the configuration files (`*.yml`) in the `configs` directory to change hyperparameters.\n\n### 1. OK-VQA\n\nTake OK-VQA for example, Prophet consists of two phases, stage one for training a vanilla VQA model and extracting answer heuristics, and stage two for prompting GPT-3 with answer heuristics.\n\n#### **Stage one**\n\nAt this stage, we train an improved MCAN model (check the [paper](https://arxiv.org/pdf/2303.01903.pdf) for a detailed description) through pretraining on VQA v2 and finetuning on the target dataset. Multiple GPUs are supported by setting `--gpu 0,1,2,3` (for example). Run pretraining step with commands:\n\n```shell\n$ bash scripts/pretrain.sh \\\n    --task ok --version okvqa_pretrain_1 --gpu 0\n```\nWe've provided a pretrained model for OK-VQA [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EcdTatraOqRJnZXBDXfr7QQBPtn8QYCa2m3Pvq0LlEml9Q?download=1). 
Then, run finetuning step with commands:\n\n```shell\n$ bash scripts/finetune.sh \\\n    --task ok --version okvqa_finetune_1 --gpu 0 \\\n    --pretrained_model outputs/okvqa_pretrain_1/ckpts/epoch_13.pkl\n```\n\nAll epoch checkpoints are saved in `outputs/ckpts/{your_version_name}`. We've also provided a finetuned model for OK-VQA [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/ESUb093PgyZFtLnU_RIYJQsBN_PU0jJdu-eFUb1-4T4mIQ?download=1). You may pick one to generate answer heuristics by running the following command:\n\n```shell\n$ bash scripts/heuristics_gen.sh \\\n    --task ok --version okvqa_heuristics_1 \\\n    --gpu 0 --ckpt_path outputs/okvqa_finetune_1/ckpts/epoch_6.pkl \\\n    --candidate_num 10 --example_num 100\n```\n\nThe extracted answer heuristics will be stored as `candidates.json` and `examples.json` in `outputs/results/{your_version_name}` directory.\n\n#### **Stage two**\n\nYou may need the `candidates.json` and `examples.json` files generated in the former stage to step into this stage. **Or you can just skip stage one, and use the files of answer heuristics we provided in `assets`. Specifically, the `candidates.json` and `examples.json` files for OK-VQA are `answer_aware_examples_okvqa.json` and `candidates_okvqa.json`.** To prompt GPT-3 with answer heuristics and generate better answers, run the following command:\n\n```shell\n$ bash scripts/prompt.sh \\\n    --task ok --version okvqa_prompt_1 \\\n    --examples_path outputs/results/okvqa_heuristics_1/examples.json \\\n    --candidates_path outputs/results/okvqa_heuristics_1/candidates.json \\\n    --openai_key sk-xxxxxxxxxxxxxxxxxxxxxx\n```\nThe result file will be stored as `result.json` in `outputs/results/{your_version_name}` directory.\n\n\nWe also provide example scripts for the `aok_val` and `aok_test` modes on A-OKVQA.\n<details>\n<summary>Click to expand</summary>\n\n### 2. 
A-OKVQA (val)\n\n#### **Stage one**\nSimilarly, for the task of `aok_val`, run pretraining step with commands:\n\n```shell\n$ bash scripts/pretrain.sh \\\n    --task aok_val --version aokvqa_val_pretrain_1 --gpu 0\n```\nWe've provided a pretrained model for `aok_val` [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EYeIgGR521pNsEjxliqRkmEBGpcwS5p-qrMGTC9ro_SF6g?download=1). Then, run finetuning step with commands:\n\n```shell\n$ bash scripts/finetune.sh \\\n    --task aok_val --version aokvqa_val_finetune_1 --gpu 0 \\\n    --pretrained_model outputs/aokvqa_val_pretrain_1/ckpts/epoch_13.pkl\n```\n\nAll epoch checkpoints are saved in `outputs/ckpts/{your_version_name}`. We've also provided a finetuned model for `aok_val` [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EQXIIjAIiJJFrOpobVhyH9oBBeBAY-VttHqfS91qPOKlJw?download=1). You may pick one to generate answer heuristics by running the following command:\n\n```shell\n$ bash scripts/heuristics_gen.sh \\\n    --task aok_val --version aokvqa_val_heuristics_1 \\\n    --gpu 0 --ckpt_path outputs/aokvqa_val_finetune_1/ckpts/epoch_6.pkl \\\n    --candidate_num 10 --example_num 100\n```\n\nThe extracted answer heuristics will be stored as `candidates.json` and `examples.json` in `outputs/results/{your_version_name}` directory.\n\n#### **Stage two**\n\nYou may need the `candidates.json` and `examples.json` files generated in the former stage to step into this stage. **Or you can just skip stage one, and use the files of answer heuristics we provided in `assets`. 
Specifically, the `candidates.json` and `examples.json` files for `aok_val` are `examples_aokvqa_val.json` and `candidates_aokvqa_val.json`.** To prompt GPT-3 with answer heuristics and generate better answers, run the following command:\n\n```shell\n$ bash scripts/prompt.sh \\\n    --task aok_val --version aokvqa_val_prompt_1 \\\n    --examples_path outputs/results/aokvqa_val_heuristics_1/examples.json \\\n    --candidates_path outputs/results/aokvqa_val_heuristics_1/candidates.json \\\n    --captions_path assets/captions_aokvqa.json \\\n    --openai_key sk-xxxxxxxxxxxxxxxxxxxxxx\n```\nThe result file will be stored as `result.json` in `outputs/results/{your_version_name}` directory.\n\n\n\n### 3. A-OKVQA (test)\n\nFor the task of `aok_test`, run pretraining step with commands:\n#### **Stage one**\n```shell\n$ bash scripts/pretrain.sh \\\n    --task aok_test --version aokvqa_test_pretrain_1 --gpu 0\n```\nWe've provided a pretrained model for `aok_test` [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EWSBB1OrjIlBoPdTMso6RFABNQKYKBWo1iU4l0w2NVDvuQ?download=1). Then, run finetuning step with commands:\n\n```shell\n$ bash scripts/finetune.sh \\\n    --task aok_test --version aokvqa_test_finetune_1 --gpu 0 \\\n    --pretrained_model outputs/aokvqa_test_pretrain_1/ckpts/epoch_13.pkl\n```\n\nAll epoch checkpoints are saved in `outputs/ckpts/{your_version_name}`. We've also provided a finetuned model for `aok_test` [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EQ6gvWbv9VhHrhh0D08G79kBk6JEA_eqXEt5ULgueCf1tA?download=1). 
You may pick one to generate answer heuristics by running the following command:\n\n```shell\n$ bash scripts/heuristics_gen.sh \\\n    --task aok_test --version aokvqa_test_heuristics_1 \\\n    --gpu 0 --ckpt_path outputs/aokvqa_test_finetune_1/ckpts/epoch_6.pkl \\\n    --candidate_num 10 --example_num 100\n```\n\nThe extracted answer heuristics will be stored as `candidates.json` and `examples.json` in `outputs/results/{your_version_name}` directory.\n\n#### **Stage two**\n\nYou may need the `candidates.json` and `examples.json` files generated in the former stage to step into this stage. **Or you can just skip stage one, and use the files of answer heuristics we provided in `assets`. Specifically, the `candidates.json` and `examples.json` files for `aok_test` are `examples_aokvqa_test.json` and `candidates_aokvqa_test.json`.** To prompt GPT-3 with answer heuristics and generate better answers, run the following command:\n\n```shell\n$ bash scripts/prompt.sh \\\n    --task aok_test --version aokvqa_test_prompt_1 \\\n    --examples_path outputs/results/aokvqa_test_heuristics_1/examples.json \\\n    --candidates_path outputs/results/aokvqa_test_heuristics_1/candidates.json \\\n    --captions_path assets/captions_aokvqa.json \\\n    --openai_key sk-xxxxxxxxxxxxxxxxxxxxxx\n```\nThe result file will be stored as `result.json` in `outputs/results/{your_version_name}` directory.\n\n</details>\n\n## Evaluation\n\nFor the task of `ok` and `aok_val` whose annotations are available, the scores are automatically computed after finetuning and prompting. 
You can also evaluate the result files that are output after finetuning or prompting, by running\n\n```shell\n$ bash scripts/evaluate_file.sh \\\n    --task ok --result_path outputs/results/okvqa_prompt_1/result.json\n```\n\nUsing the corresponding result files and evaluation script above, we obtain the accuracies in the following table, respectively.\n\n\n<table border=\"2\">\n<tr><th> OK-VQA</th><th> A-OKVQA (val) </th><th> A-OKVQA (test) </th></tr>\n<tr><td>\n\n| MCAN | Prophet |\n|:--:|:--:|\n| [53.0%](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EVPAUDjTWX9Gn3GIqj7JwUoB5HMWwL3SRnNf18dSckJBOw?download=1) | [61.1%](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EUqH0N4fLVdPsLYJ48Wl_gsBneZzyGR23Tv5P9RskOBwNQ?download=1) |\n</td><td>\n\n| MCAN | Prophet |\n|:--:|:--:|\n| [52.0%](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EdBYZeS55iFEjdlOhUbyWRsBtYnQ3-zerho13mYj2YQ0Ag?download=1) | [58.2%](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EXDUxT3_LrpDugZ7xj-0BMYBynuFDJQS88M3EGeFEhU5dg?download=1) |\n</td><td>\n\n| MCAN | Prophet |\n|:--:|:--:|\n| 45.6% | 55.7% |\n</td></tr>\n</table>\n\nFor the task of `aok_test`, you need to submit the result file to the [A-OKVQA Leaderboard](https://leaderboard.allenai.org/a-okvqa/submissions/public) to evaluate the result.\n\n\n## Citation\n\nIf you use this code in your research, please cite our paper:\n\n```BibTex\n@inproceedings{shao2023prompting,\n  title={Prompting Large Language Models with Answer Heuristics for Knowledge-based Visual Question Answering},\n  author={Shao, Zhenwei and Yu, Zhou and Wang, Meng and Yu, Jun},\n  booktitle={Computer Vision and Pattern Recognition (CVPR)},\n  pages={14974--14983},\n  year={2023}\n}\n```\n\n## License\n\nThis project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.\n"
  },
  {
    "path": "assets/.gitkeep",
    "content": ""
  },
  {
    "path": "ckpts/.gitkeep",
    "content": ""
  },
  {
    "path": "configs/finetune.yml",
    "content": "# Network\nIMG_RESOLUTION: 512\nIMG_FEAT_GRID: 16\nIMG_FEAT_SIZE: 4096\nBERT_VERSION: bert-large-uncased\nMAX_TOKEN: 32\nARCH_CEIL: {\n  enc: ['SA', 'FFN'],\n  dec: ['SA_v', 'GA', 'FFN'],\n}\nLANG_FEAT_SIZE: 1024\nLAYER: 6\nHIDDEN_SIZE: 1024\nFF_SIZE: 4096\nMULTI_HEAD: 8\nDROPOUT_R: 0.1\nFLAT_MLP_SIZE: 1024\nFLAT_GLIMPSES: 1\nFLAT_OUT_SIZE: 2048\n\n# Training\nBATCH_SIZE: 64\nEVAL_BATCH_SIZE: 64\nBERT_LR_MULT: 0.01\nLR_BASE: 0.00005\nLR_DECAY_R: 0.2\nLR_DECAY_LIST: [5,]\nWARMUP_EPOCH: 0\nMAX_EPOCH: 6\nGRAD_NORM_CLIP: -1\nOPT: AdamW\nOPT_PARAMS: {betas: '(0.9, 0.98)', eps: '1e-9'}\n## optimizer for finetuning warmup (i.e., only update the new appended parameters as a warm-up)\nEPOPH_FTW: 1\nOPT_FTW: Adam\nLR_BASE_FTW: 0.001\nOPT_PARAMS_FTW: {betas: '(0.9, 0.98)', eps: '1e-9'}"
  },
  {
    "path": "configs/path_cfgs.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: set const paths and dirs\n# ------------------------------------------------------------------------------ #\n\nimport os\n\nclass PATH:\n    def __init__(self):\n\n        self.LOG_ROOT = 'outputs/logs/'\n        self.CKPT_ROOT = 'outputs/ckpts/'\n        self.RESULTS_ROOT = 'outputs/results/'\n        self.DATASET_ROOT = 'datasets/'\n        self.ASSETS_ROOT = 'assets/'\n\n\n        self.IMAGE_DIR = {\n            'train2014': self.DATASET_ROOT + 'coco2014/train2014/',\n            'val2014': self.DATASET_ROOT + 'coco2014/val2014/',\n            # 'test2015': self.DATASET_ROOT + 'coco2015/test2015/',\n            'train2017': self.DATASET_ROOT + 'coco2017/train2017/',\n            'val2017': self.DATASET_ROOT + 'coco2017/val2017/',\n            'test2017': self.DATASET_ROOT + 'coco2017/test2017/',\n        }\n\n        self.FEATS_DIR = {\n            'train2014': self.DATASET_ROOT + 'coco2014_feats/train2014/',\n            'val2014': self.DATASET_ROOT + 'coco2014_feats/val2014/',\n            'train2017': self.DATASET_ROOT + 'coco2017_feats/train2017/',\n            'val2017': self.DATASET_ROOT + 'coco2017_feats/val2017/',\n            'test2017': self.DATASET_ROOT + 'coco2017_feats/test2017/',\n        }\n\n        self.QUESTION_PATH = {\n            'v2train': self.DATASET_ROOT + 'vqav2/v2_OpenEnded_mscoco_train2014_questions.json',\n            'v2val': self.DATASET_ROOT + 'vqav2/v2_OpenEnded_mscoco_val2014_questions.json',\n            'vg': self.DATASET_ROOT + 'vqav2/vg_questions.json',\n            'v2valvg_no_ok': self.DATASET_ROOT + 'vqav2/v2valvg_no_ok_questions.json',\n            'oktrain': self.DATASET_ROOT + 'okvqa/OpenEnded_mscoco_train2014_questions.json',\n            'oktest': self.DATASET_ROOT + 'okvqa/OpenEnded_mscoco_val2014_questions.json',\n            'aoktrain': 
self.DATASET_ROOT + 'aokvqa/aokvqa_v1p0_train.json',\n            'aokval': self.DATASET_ROOT + 'aokvqa/aokvqa_v1p0_val.json',\n            'aoktest': self.DATASET_ROOT + 'aokvqa/aokvqa_v1p0_test.json',\n        }\n\n        self.ANSWER_PATH = {\n            'v2train': self.DATASET_ROOT + 'vqav2/v2_mscoco_train2014_annotations.json',\n            'v2val': self.DATASET_ROOT + 'vqav2/v2_mscoco_val2014_annotations.json',\n            'vg': self.DATASET_ROOT + 'vqav2/vg_annotations.json',\n            'v2valvg_no_ok': self.DATASET_ROOT + 'vqav2/v2valvg_no_ok_annotations.json',\n            'oktrain': self.DATASET_ROOT + 'okvqa/mscoco_train2014_annotations.json',\n            'oktest': self.DATASET_ROOT + 'okvqa/mscoco_val2014_annotations.json',\n            'aoktrain': self.DATASET_ROOT + 'aokvqa/aokvqa_v1p0_train.json',\n            'aokval': self.DATASET_ROOT + 'aokvqa/aokvqa_v1p0_val.json',\n        }\n\n        self.ANSWER_DICT_PATH = {\n            'v2': self.ASSETS_ROOT + 'answer_dict_vqav2.json',\n            'ok': self.ASSETS_ROOT + 'answer_dict_okvqa.json',\n            'aok': self.ASSETS_ROOT + 'answer_dict_aokvqa.json',\n        }\n\n\n"
  },
  {
    "path": "configs/pretrain.yml",
    "content": "# Network\nIMG_RESOLUTION: 512\nIMG_FEAT_GRID: 16\nIMG_FEAT_SIZE: 4096\nBERT_VERSION: bert-large-uncased\nMAX_TOKEN: 32\nARCH_CEIL: {\n  enc: ['SA', 'FFN'],\n  dec: ['SA_v', 'GA', 'FFN'],\n}\nLANG_FEAT_SIZE: 1024\nLAYER: 6\nHIDDEN_SIZE: 1024\nFF_SIZE: 4096\nMULTI_HEAD: 8\nDROPOUT_R: 0.1\nFLAT_MLP_SIZE: 1024\nFLAT_GLIMPSES: 1\nFLAT_OUT_SIZE: 2048\n\n# Training\nBATCH_SIZE: 64\nEVAL_BATCH_SIZE: 64\nBERT_LR_MULT: 0.01\nLR_BASE: 0.00007\nLR_DECAY_R: 0.2\nLR_DECAY_LIST: [10, 12]\nWARMUP_EPOCH: 3\nMAX_EPOCH: 13\nGRAD_NORM_CLIP: 2.0\nOPT: Adam\nOPT_PARAMS: {betas: '(0.9, 0.98)', eps: '1e-9'}\n"
  },
  {
    "path": "configs/prompt.yml",
    "content": "MODEL: text-davinci-002\nTEMPERATURE: 0.\nMAX_TOKENS: 8\nSLEEP_PER_INFER: 10\n\nPROMPT_HEAD: \"Please answer the question according to the context and candidate answers. Each candidate answer is associated with a confidence score within a bracket. The true answer may not be included in the candidate answers.\\n\\n\"\nLINE_PREFIX: \"===\\n\"\nN_EXAMPLES: 20\nK_CANDIDATES: 10\nT_INFER: 5"
  },
  {
    "path": "configs/task_cfgs.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: Object that manages the configuration of the experiments.\n# ------------------------------------------------------------------------------ #\n\nimport os\nimport random\nimport torch\nimport numpy as np\nfrom datetime import datetime\n\nfrom .path_cfgs import PATH\nfrom .task_to_split import *\n\n\nclass Cfgs(PATH):\n    \n    def __init__(self, args):\n        super(Cfgs, self).__init__()\n        self.set_silent_attr()\n\n        self.GPU = getattr(args, 'GPU', None)\n        if self.GPU is not None:\n            self.GPU_IDS = [int(i) for i in self.GPU.split(',')]\n            # print(f'Avaliable GPUs: {torch.cuda.device_count()}')\n            # print(f'Using GPU {self.GPU}')\n            self.CURRENT_GPU = self.GPU_IDS[0]\n            torch.cuda.set_device(f'cuda:{self.CURRENT_GPU}')\n            self.N_GPU = len(self.GPU_IDS)\n            self.SEED = getattr(args, 'SEED', 1111)\n            torch.manual_seed(self.SEED)\n            # torch.manual_seed_all(self.SEED)\n            if self.N_GPU < 2:\n                torch.cuda.manual_seed(self.SEED)\n            else:\n                torch.cuda.manual_seed_all(self.SEED)\n            torch.backends.cudnn.deterministic = True\n            np.random.seed(self.SEED)\n            random.seed(self.SEED)\n            torch.set_num_threads(2)\n\n        # -------------------------\n        # ---- Version Control ----\n        # -------------------------\n        self.TIMESTAMP = datetime.now().strftime('%Y%m%d%H%M%S')\n        self.VERSION = getattr(args, 'VERSION', self.TIMESTAMP)\n        \n        # paths and dirs\n        self.CKPTS_DIR = os.path.join(self.CKPT_ROOT, self.VERSION)\n        self.LOG_PATH = os.path.join(\n            self.LOG_ROOT, \n            self.VERSION, \n            f'log_{self.TIMESTAMP}.txt'\n        )\n        
self.RESULT_DIR = os.path.join(self.RESULTS_ROOT, self.VERSION)\n        self.RESULT_PATH = os.path.join(\n            self.RESULTS_ROOT,\n            self.VERSION,\n            'result_' + self.TIMESTAMP + '.json'\n        )\n\n        # about resume\n        self.RESUME = getattr(args, 'RESUME', False)\n        if self.RESUME and self.RUN_MODE == 'pretrain':\n            self.RESUME_VERSION = getattr(args, 'RESUME_VERSION', self.VERSION)\n            self.RESUME_EPOCH = getattr(args, 'RESUME_EPOCH', None)\n            resume_path = getattr(args, 'RESUME_PATH', None)\n            self.RESUME_PATH = os.path.join(\n                self.CKPTS_DIR, \n                self.RESUME_VERSION, \n                f'epoch_{self.RESUME_EPOCH}.pkl'\n            ) if resume_path is None else resume_path\n        \n        # for testing and heuristics generation\n        self.CKPT_PATH = getattr(args, 'CKPT_PATH', None)\n\n        # ----------------------\n        # ---- Task Control ----\n        # ----------------------\n\n        self.TASK = getattr(args, 'TASK', 'ok')\n        assert self.TASK in ['ok', 'aok_val', 'aok_test']\n\n        self.RUN_MODE = getattr(args, 'RUN_MODE', 'finetune')\n        assert self.RUN_MODE in ['pretrain', 'finetune', 'finetune_test', 'heuristics', 'prompt']\n\n        if self.RUN_MODE == 'pretrain':\n            self.DATA_TAG = 'v2'  # used to config answer dict\n            self.DATA_MODE = 'pretrain'\n        else:\n            self.DATA_TAG = self.TASK.split('_')[0]  # used to config answer dict\n            self.DATA_MODE = 'finetune'\n\n        \n        # config pipeline...\n        self.EVAL_NOW = True\n        if self.RUN_MODE == 'pretrain' or self.TASK == 'aok_test':\n            self.EVAL_NOW = False\n        # print(f'Eval Now: {self.EVAL_NOW}')\n\n        # ------------------------\n        # ---- Model Training ----\n        # ------------------------\n\n        self.NUM_WORKERS = 8\n        self.PIN_MEM = True\n\n        # 
--------------------------------\n        # ---- Heuristics Generations ----\n        # --------------------------------\n\n        self.CANDIDATE_NUM = getattr(args, 'CANDIDATE_NUM', None)\n        if self.CANDIDATE_NUM is not None:\n            self.CANDIDATE_FILE_PATH = os.path.join(\n                self.RESULTS_ROOT,\n                self.VERSION,\n                'candidates.json'\n            )\n            self.EXAMPLE_FILE_PATH = os.path.join(\n                self.RESULTS_ROOT,\n                self.VERSION,\n                'examples.json'\n            )\n            self.ANSWER_LATENTS_DIR = os.path.join(\n                self.RESULTS_ROOT,\n                self.VERSION,\n                'answer_latents'\n            ) # where answer latents will be saved\n\n\n        # write rest arguments to self\n        for attr in args.__dict__:\n            setattr(self, attr, getattr(args, attr))\n    \n    def __repr__(self):\n        _str = ''\n        for attr in self.__dict__:\n            if attr in self.__silent or getattr(self, attr) is None:\n                continue\n            _str += '{ %-17s }-> %s\\n' % (attr, getattr(self, attr))\n        \n        return _str\n    \n    def override_from_dict(self, dict_):\n        for key, value in dict_.items():\n            setattr(self, key, value)\n    \n    def set_silent_attr(self):\n        self.__silent = []\n        for attr in self.__dict__:\n            self.__silent.append(attr)\n        \n    @property\n    def TRAIN_SPLITS(self):\n        return TASK_TO_SPLIT[self.TASK][self.DATA_MODE]['train_split']\n    \n    @property\n    def EVAL_SPLITS(self):\n        return TASK_TO_SPLIT[self.TASK][self.DATA_MODE]['eval_split']\n        \n    @property\n    def FEATURE_SPLIT(self):\n        FEATURE_SPLIT = []\n        for split in self.TRAIN_SPLITS + self.EVAL_SPLITS:\n            feat_split = SPLIT_TO_IMGS[split]\n            if feat_split not in FEATURE_SPLIT:\n                
FEATURE_SPLIT.append(feat_split)\n        return FEATURE_SPLIT\n    \n    @property\n    def EVAL_QUESTION_PATH(self):\n        # if not self.EVAL_NOW:\n        #     return []\n        return self.QUESTION_PATH[self.EVAL_SPLITS[0]]\n    \n    @property\n    def EVAL_ANSWER_PATH(self):\n        if not self.EVAL_NOW:\n            return []\n        return self.ANSWER_PATH[self.EVAL_SPLITS[0]]"
  },
  {
    "path": "configs/task_to_split.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: The goal of this file is to define the mapping from task and data\n# mode to dataset splits.\n# ------------------------------------------------------------------------------ #\n\nclass DictSafe(dict):\n\n    def __init__(self, data={}):\n        dict.__init__(self, data)\n        for key, value in data.items():\n            if isinstance(value, dict):\n                self[key] = DictSafe(value)\n\n    def __getitem__(self, key):\n        return self.get(key, [])\n\n# TASK_TO_SPLIT[TASK][DATA_MODE]['train_split'] is a list of dataset split name for training\n# TASK_TO_SPLIT[TASK][DATA_MODE]['eval_split'] is a list of dataset split name for evaluation\n# 'pretrain' mode is used for pretrain, so it does not have 'eval_split'\n# 'finetune' mode is used for finetune, heuristics generation and prompting\nTASK_TO_SPLIT = {\n    'ok': {\n        'pretrain': {\n            'train_split': ['v2train', 'v2valvg_no_ok'],\n            # As the testing set of okvqa uses a subset of MSCOCO val2014 as the input images,\n            # we remove this subset from the training set of pretraining to avoid data leakage.\n        },\n        'finetune': {\n            'train_split': ['oktrain'],\n            'eval_split': ['oktest'],\n        }\n    },\n    'aok_val': {\n        'pretrain': {\n            'train_split': ['v2train'],\n        },\n        'finetune': {\n            'train_split': ['aoktrain'],\n            'eval_split': ['aokval'],\n        }\n    },\n    'aok_test': {\n        'pretrain': {\n            'train_split': ['v2train', 'v2val', 'vg'],\n        },\n        'finetune': {\n            'train_split': ['aoktrain', 'aokval'],\n            'eval_split': ['aoktest'],\n        }\n    },\n}\nTASK_TO_SPLIT = DictSafe(TASK_TO_SPLIT)\n\nSPLIT_TO_IMGS = {\n    'v2train': 'train2014',\n    'v2val': 
'val2014',\n    'v2valvg_no_ok': 'val2014',\n    'vg': 'val2014',\n    'oktrain': 'train2014',\n    'oktest': 'val2014',\n    'aoktrain': 'train2017',\n    'aokval': 'val2017',\n    'aoktest': 'test2017',\n}\n\n\nif __name__ == '__main__':\n    print(TASK_TO_SPLIT['okvqa']['test']['train_split'])"
  },
  {
    "path": "datasets/.gitkeep",
    "content": ""
  },
  {
    "path": "environment.yml",
    "content": "name: prophet\nchannels:\n  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch\n  - pytorch\n  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main\n  - conda-forge\n  - defaults\ndependencies:\n  - numpy=1.21.2=py39h20f2e39_0\n  - opt_einsum=3.3.0=pyhd8ed1ab_1\n  - pip=21.2.4=py39h06a4308_0\n  - python=3.9.11=h12debd9_2\n  - pytorch=1.12.0=py3.9_cuda11.3_cudnn8.3.2_0\n  - rich=12.5.1=py39h06a4308_0\n  - torchvision=0.13.0=py39_cu113\n  - pip:\n    - pyyaml==6.0\n    - einops==0.6.0\n    - huggingface-hub==0.12.1\n    - openai==0.18.0\n    - opencv-python==4.5.5.64\n    - pillow==9.3.0\n    - pyyaml==6.0\n    - sentence-transformers==2.2.2\n    - sentencepiece==0.1.96\n    - tokenizers==0.11.6\n    - tqdm==4.63.0\n    - transformers==4.26.1\n    - git+https://github.com/openai/CLIP.git\n\n"
  },
  {
    "path": "evaluation/ans_punct.py",
    "content": "# --------------------------------------------------------\n# mcan-vqa (Deep Modular Co-Attention Networks)\n# Licensed under The MIT License [see LICENSE for details]\n# Written by Yuhao Cui https://github.com/cuiyuhao1996\n# based on VQA Evaluation Code\n# --------------------------------------------------------\n\nimport re\n\ncontractions = {\n    \"aint\": \"ain't\", \"arent\": \"aren't\", \"cant\": \"can't\", \"couldve\":\n    \"could've\", \"couldnt\": \"couldn't\", \"couldn'tve\": \"couldn't've\",\n    \"couldnt've\": \"couldn't've\", \"didnt\": \"didn't\", \"doesnt\":\n    \"doesn't\", \"dont\": \"don't\", \"hadnt\": \"hadn't\", \"hadnt've\":\n    \"hadn't've\", \"hadn'tve\": \"hadn't've\", \"hasnt\": \"hasn't\", \"havent\":\n    \"haven't\", \"hed\": \"he'd\", \"hed've\": \"he'd've\", \"he'dve\":\n    \"he'd've\", \"hes\": \"he's\", \"howd\": \"how'd\", \"howll\": \"how'll\",\n    \"hows\": \"how's\", \"Id've\": \"I'd've\", \"I'dve\": \"I'd've\", \"Im\":\n    \"I'm\", \"Ive\": \"I've\", \"isnt\": \"isn't\", \"itd\": \"it'd\", \"itd've\":\n    \"it'd've\", \"it'dve\": \"it'd've\", \"itll\": \"it'll\", \"let's\": \"let's\",\n    \"maam\": \"ma'am\", \"mightnt\": \"mightn't\", \"mightnt've\":\n    \"mightn't've\", \"mightn'tve\": \"mightn't've\", \"mightve\": \"might've\",\n    \"mustnt\": \"mustn't\", \"mustve\": \"must've\", \"neednt\": \"needn't\",\n    \"notve\": \"not've\", \"oclock\": \"o'clock\", \"oughtnt\": \"oughtn't\",\n    \"ow's'at\": \"'ow's'at\", \"'ows'at\": \"'ow's'at\", \"'ow'sat\":\n    \"'ow's'at\", \"shant\": \"shan't\", \"shed've\": \"she'd've\", \"she'dve\":\n    \"she'd've\", \"she's\": \"she's\", \"shouldve\": \"should've\", \"shouldnt\":\n    \"shouldn't\", \"shouldnt've\": \"shouldn't've\", \"shouldn'tve\":\n    \"shouldn't've\", \"somebody'd\": \"somebodyd\", \"somebodyd've\":\n    \"somebody'd've\", \"somebody'dve\": \"somebody'd've\", \"somebodyll\":\n    \"somebody'll\", \"somebodys\": \"somebody's\", 
\"someoned\": \"someone'd\",\n    \"someoned've\": \"someone'd've\", \"someone'dve\": \"someone'd've\",\n    \"someonell\": \"someone'll\", \"someones\": \"someone's\", \"somethingd\":\n    \"something'd\", \"somethingd've\": \"something'd've\", \"something'dve\":\n    \"something'd've\", \"somethingll\": \"something'll\", \"thats\":\n    \"that's\", \"thered\": \"there'd\", \"thered've\": \"there'd've\",\n    \"there'dve\": \"there'd've\", \"therere\": \"there're\", \"theres\":\n    \"there's\", \"theyd\": \"they'd\", \"theyd've\": \"they'd've\", \"they'dve\":\n    \"they'd've\", \"theyll\": \"they'll\", \"theyre\": \"they're\", \"theyve\":\n    \"they've\", \"twas\": \"'twas\", \"wasnt\": \"wasn't\", \"wed've\":\n    \"we'd've\", \"we'dve\": \"we'd've\", \"weve\": \"we've\", \"werent\":\n    \"weren't\", \"whatll\": \"what'll\", \"whatre\": \"what're\", \"whats\":\n    \"what's\", \"whatve\": \"what've\", \"whens\": \"when's\", \"whered\":\n    \"where'd\", \"wheres\": \"where's\", \"whereve\": \"where've\", \"whod\":\n    \"who'd\", \"whod've\": \"who'd've\", \"who'dve\": \"who'd've\", \"wholl\":\n    \"who'll\", \"whos\": \"who's\", \"whove\": \"who've\", \"whyll\": \"why'll\",\n    \"whyre\": \"why're\", \"whys\": \"why's\", \"wont\": \"won't\", \"wouldve\":\n    \"would've\", \"wouldnt\": \"wouldn't\", \"wouldnt've\": \"wouldn't've\",\n    \"wouldn'tve\": \"wouldn't've\", \"yall\": \"y'all\", \"yall'll\":\n    \"y'all'll\", \"y'allll\": \"y'all'll\", \"yall'd've\": \"y'all'd've\",\n    \"y'alld've\": \"y'all'd've\", \"y'all'dve\": \"y'all'd've\", \"youd\":\n    \"you'd\", \"youd've\": \"you'd've\", \"you'dve\": \"you'd've\", \"youll\":\n    \"you'll\", \"youre\": \"you're\", \"youve\": \"you've\"\n}\n\nmanual_map = { 'none': '0',\n              'zero': '0',\n              'one': '1',\n              'two': '2',\n              'three': '3',\n              'four': '4',\n              'five': '5',\n              'six': '6',\n              'seven': '7',\n          
    'eight': '8',\n               'nine': '9',\n              'ten': '10'}\narticles = ['a', 'an', 'the']\nperiod_strip = re.compile(\"(?!<=\\d)(\\.)(?!\\d)\")\ncomma_strip = re.compile(\"(\\d)(\\,)(\\d)\")\npunct = [';', r\"/\", '[', ']', '\"', '{', '}',\n                '(', ')', '=', '+', '\\\\', '_', '-',\n                '>', '<', '@', '`', ',', '?', '!']\n\ndef process_punctuation(inText):\n    outText = inText\n    for p in punct:\n        if (p + ' ' in inText or ' ' + p in inText) \\\n           or (re.search(comma_strip, inText) != None):\n            outText = outText.replace(p, '')\n        else:\n            outText = outText.replace(p, ' ')\n    outText = period_strip.sub(\"\", outText, re.UNICODE)\n    return outText\n\n\ndef process_digit_article(inText):\n    outText = []\n    tempText = inText.lower().split()\n    for word in tempText:\n        word = manual_map.setdefault(word, word)\n        if word not in articles:\n            outText.append(word)\n        else:\n            pass\n    for wordId, word in enumerate(outText):\n        if word in contractions:\n            outText[wordId] = contractions[word]\n    outText = ' '.join(outText)\n    return outText\n\n\ndef prep_ans(answer):\n    answer = process_digit_article(process_punctuation(answer))\n    answer = answer.replace(',', '')\n    return answer\n"
  },
  {
    "path": "evaluation/aok_utils/eval_predictions.py",
    "content": "import argparse\nimport pathlib\nimport json\nimport glob\n\nfrom .load_aokvqa import load_aokvqa\n\n\ndef eval_aokvqa(dataset, preds, multiple_choice=False, strict=True):\n\n    if isinstance(dataset, list):\n        dataset = { dataset[i]['question_id'] : dataset[i] for i in range(len(dataset)) }\n\n    # print(f'Loaded dataset size: {len(dataset)}')\n    if multiple_choice is False:\n        dataset = {k:v for k,v in dataset.items() if v['difficult_direct_answer'] is False}\n    # print(f'Loaded dataset size: {len(dataset)}')\n\n    if strict:\n        dataset_qids = set(dataset.keys())\n        preds_qids = set(preds.keys())\n        assert dataset_qids.issubset(preds_qids)\n\n    # dataset = q_id (str) : dataset element (dict)\n    # preds = q_id (str) : prediction (str)\n\n    acc = []\n\n    for q in dataset.keys():\n        if q not in preds.keys():\n            acc.append(0.0)\n            continue\n\n        pred = preds[q]\n        choices = dataset[q]['choices']\n        direct_answers = dataset[q]['direct_answers']\n\n        ## Multiple Choice setting\n        if multiple_choice:\n            if strict:\n                assert pred in choices, 'Prediction must be a valid choice'\n            correct_choice_idx = dataset[q]['correct_choice_idx']\n            acc.append( float(pred == choices[correct_choice_idx]) )\n        ## Direct Answer setting\n        else:\n            num_match = sum([pred == da for da in direct_answers])\n            vqa_acc = min(1.0, num_match / 3.0)\n            # with open('2.txt', 'a') as f:\n            #     f.write(q + ' ' + str(vqa_acc) + '\\n')\n            acc.append(vqa_acc)\n\n    acc = sum(acc) / len(acc) * 100\n\n    return acc\n\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir')\n    parser.add_argument('--split', type=str, choices=['train', 'val', 'test_w_ans'], required=True)\n    
parser.add_argument('--preds', type=str, required=True, dest='prediction_files')\n    args = parser.parse_args()\n\n    dataset = load_aokvqa(args.aokvqa_dir, args.split)\n\n    for prediction_file in glob.glob(args.prediction_files):\n        predictions = json.load(open(prediction_file, 'r'))\n\n        # Multiple choice\n\n        mc_predictions = {}\n\n        for q in predictions.keys():\n            if 'multiple_choice' in predictions[q].keys():\n                mc_predictions[q] = predictions[q]['multiple_choice']\n\n        if mc_predictions != {}:\n            mc_acc = eval_aokvqa(\n                dataset,\n                mc_predictions,\n                multiple_choice=True,\n                strict=False\n            )\n            print(prediction_file, 'MC', mc_acc)\n\n        # Direct Answer\n\n        da_predictions = {}\n\n        for q in predictions.keys():\n            if 'direct_answer' in predictions[q].keys():\n                da_predictions[q] = predictions[q]['direct_answer']\n\n        if da_predictions != {}:\n            da_acc = eval_aokvqa(\n                dataset,\n                da_predictions,\n                multiple_choice=False,\n                strict=False\n            )\n            print(prediction_file, 'DA', da_acc)\n"
  },
  {
    "path": "evaluation/aok_utils/load_aokvqa.py",
    "content": "import os\nimport json\n\n\ndef load_aokvqa(aokvqa_dir, split, version='v1p0'):\n    assert split in ['train', 'val', 'test', 'test_w_ans']\n    dataset = json.load(open(\n        os.path.join(aokvqa_dir, f\"aokvqa_{version}_{split}.json\")\n    ))\n    return dataset\n\ndef get_coco_path(split, image_id, coco_dir):\n    return os.path.join(coco_dir, f\"{split}2017\", f\"{image_id:012}.jpg\")\n"
  },
  {
    "path": "evaluation/aok_utils/remap_predictions.py",
    "content": "import os \nos.environ['CUDA_VISIBLE_DEVICES'] = '1'\nimport argparse\nimport pathlib\nimport json\nfrom tqdm import tqdm\n\nfrom sentence_transformers import SentenceTransformer\nfrom sentence_transformers.util import cos_sim\n\nfrom .load_aokvqa import load_aokvqa\n\n\ndef map_to_choices(dataset, predictions, device='cpu'):\n    if isinstance(dataset, list):\n        dataset = { dataset[i]['question_id'] : dataset[i] for i in range(len(dataset)) }\n\n    if all([p in dataset[q]['choices'] for q, p in predictions.items()]):\n        return predictions\n\n    model = SentenceTransformer('sentence-transformers/average_word_embeddings_glove.6B.300d')\n    model.to(device)\n    for q in tqdm(predictions.keys()):\n        choices = dataset[q]['choices']\n        if predictions[q] not in choices:\n            choice_embeddings = model.encode([predictions[q]] + choices, convert_to_tensor=True)\n            a_idx = cos_sim(choice_embeddings[0], choice_embeddings[1:]).argmax().item()\n            predictions[q] = choices[a_idx]\n\n    return predictions\n\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir')\n    parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True)\n    parser.add_argument('--pred', type=argparse.FileType('r'), required=True, dest='prediction_file')\n    parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file')\n    args = parser.parse_args()\n\n\n    dataset = load_aokvqa(args.aokvqa_dir, args.split)\n    predictions = json.load(args.prediction_file)\n    # predictions = {qid: predictions[qid]['direct_answer'] for qid in predictions }\n    # json.dump(predictions, open('cache/mcan_da.json', 'w'))\n    predictions = map_to_choices(dataset, predictions)\n\n    json.dump(predictions, args.output_file)\n"
  },
  {
    "path": "evaluation/aokvqa_evaluate.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: Evaluation script for A-OKVQA\n# ------------------------------------------------------------------------------ #\n\nimport json\nfrom evaluation.aok_utils.eval_predictions import eval_aokvqa\nfrom evaluation.aok_utils.remap_predictions import map_to_choices\nfrom .ans_punct import prep_ans\nimport argparse\n\nclass AOKEvaluater:\n    def __init__(self, annotation_path: str, question_path: str):\n        self.annotation_path = annotation_path\n        self.question_path = question_path\n        self.dataset = json.load(open(question_path, 'r'))\n        self.result_file = {}\n        self.result_path = None\n        self.multiple_choice = False\n        self.map_to_mc = True\n    \n    def init(self):\n        self.result_file = []\n    \n    def set_mode(self, multiple_choice=None, map_to_mc=None):\n        if multiple_choice is not None:\n            self.multiple_choice = multiple_choice\n        if map_to_mc is not None:\n            self.map_to_mc = map_to_mc\n    \n    def prep_ans(self, answer):\n        return prep_ans(answer)\n    \n    def add(self, qid, answer):\n        if self.multiple_choice:\n            self.result_file[qid] = {\n                'multiple_choice': answer,\n            }\n        else:\n            self.result_file[qid] = {\n                'direct_answer': answer,\n            }\n    \n    def save(self, result_path: str):\n        self.result_path = result_path\n        if not self.multiple_choice and self.map_to_mc:\n            predictions = {qid: item['direct_answer'] for qid, item in self.result_file.items()}\n            predictions = map_to_choices(self.dataset, predictions, 'cuda:0')\n            for qid, answer in predictions.items():\n                self.result_file[qid]['multiple_choice'] = answer\n        json.dump(self.result_file, 
open(self.result_path, 'w'))\n    \n    def evaluate(self, logfile=None):\n        assert self.result_path is not None, \"Please save the result file first.\"\n\n        direct_answer = not self.multiple_choice\n        multiple_choice = self.multiple_choice or self.map_to_mc\n        eval_str = _evaluate(self.dataset, self.result_file, direct_answer=direct_answer, multiple_choice=multiple_choice)\n        print(eval_str)\n        if logfile is not None:\n            print(eval_str + '\\n', file=logfile)\n\n\ndef _evaluate(dataset, results, direct_answer=True, multiple_choice=True):\n    result_str = ''\n\n    if direct_answer:\n        # Direct Answer Evaluation\n        da_predictions = {}\n        for qid, item in results.items():\n            da_predictions[qid] = item['direct_answer']\n\n        da_acc = eval_aokvqa(\n            dataset,\n            da_predictions,\n            multiple_choice=False,\n            strict=False\n        )\n        result_str += f'DA: {da_acc: .2f}\\n'\n        \n    if multiple_choice:\n        # Multiple Choice Evaluation\n        mc_predictions = {}\n        for qid, item in results.items():\n            mc_predictions[qid] = item['multiple_choice']\n\n        mc_acc = eval_aokvqa(\n            dataset,\n            mc_predictions,\n            multiple_choice=True,\n            strict=False\n        )\n        result_str += f'MC: {mc_acc: .2f}\\n'\n    return result_str\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser(description='Evaluate A-OKVQA result file.')\n    parser.add_argument('--dataset_path', type=str, required=True)\n    parser.add_argument('--result_path', type=str, required=True)\n    parser.add_argument('--direct_answer', action='store_true')\n    parser.add_argument('--multiple_choice', action='store_true')\n    args = parser.parse_args()\n    dataset = json.load(open(args.dataset_path, 'r'))\n    result = json.load(open(args.result_path, 'r'))\n    result_str = _evaluate(dataset, result, 
direct_answer=args.direct_answer, multiple_choice=args.multiple_choice)\n    print(result_str)"
  },
  {
    "path": "evaluation/okvqa_evaluate.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: Evaluation script for OK-VQA\n# ------------------------------------------------------------------------------ #\n\nimport json\nfrom evaluation.vqa_utils.vqa import VQA\nfrom evaluation.vqa_utils.vqaEval import VQAEval\nfrom .ans_punct import prep_ans\nimport argparse\n\nclass OKEvaluater:\n    def __init__(self, annotation_path: str, question_path: str):\n        self.annotation_path = annotation_path\n        self.question_path = question_path\n        # print(f'== Annotation file: {self.annotation_path}')\n        # print(f'== Question file: {self.question_path}')\n        self.result_file = []\n        self.result_path = None\n\n    def init(self):\n        self.result_file = []\n\n    def prep_ans(self, answer):\n        return prep_ans(answer)\n    \n    def add(self, qid, answer):\n        qid = int(qid)\n        self.result_file.append({\n            'question_id': qid,\n            'answer': answer\n        })\n    \n    def save(self, result_path: str):\n        self.result_path = result_path\n        json.dump(self.result_file, open(self.result_path, 'w'))\n    \n    def evaluate(self, logfile=None):\n        assert self.result_path is not None, \"Please save the result file first.\"\n\n        eval_str = _evaluate(self.annotation_path, self.question_path, self.result_path)\n        print()\n        print(eval_str)\n        if logfile is not None:\n            print(eval_str + '\\n', file=logfile)\n\n\ndef _evaluate(annotation_file: str, question_file: str, result_file: str):\n    # print(f'== Annotation file: {annotation_file}')\n    # print(f'== Question file: {question_file}')\n    vqa = VQA(annotation_file, question_file)\n    vqaRes_prophet = vqa.loadRes(result_file, question_file)\n    vqaEval_prophet = VQAEval(vqa, vqaRes_prophet, n=2)\n    vqaEval_prophet.evaluate()\n\n    
question_types = {\n        \"eight\": \"Plants and Animals\",\n        \"nine\": \"Science and Technology\",\n        \"four\": \"Sports and Recreation\",\n        \"six\": \"Geography, History, Language and Culture\",\n        \"two\": \"Brands, Companies and Products\",\n        \"one\": \"Vehicles and Transportation\",\n        \"five\": \"Cooking and Food\",\n        \"ten\": \"Weather and Climate\",\n        \"seven\": \"People and Everyday life\",\n        \"three\": \"Objects, Material and Clothing\"\n        # \"other\": \"Other\",\n    }\n\n    result_str = ''\n    result_str += \"Overall Accuracy is: %.02f\\n\" % (vqaEval_prophet.accuracy['overall'])\n    result_str += f\"{'Question Type':40s}\\t{'Prophet'}\\n\"\n    for quesType in question_types:\n        result_str += \"%-40s\\t%.02f\\n\" % (question_types[quesType], vqaEval_prophet.accuracy['perQuestionType'][quesType])\n    # print(result_str)\n    return result_str\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser(description='Evaluate OK-VQA result file.')\n    parser.add_argument('--annotation_path', type=str, required=True)\n    parser.add_argument('--question_path', type=str, required=True)\n    parser.add_argument('--result_path', type=str, required=True)\n    args = parser.parse_args()\n    result_str = _evaluate(args.annotation_path, args.question_path, args.result_path)\n    print(result_str)"
  },
  {
    "path": "evaluation/vqa_utils/vqa.py",
    "content": "__author__ = 'aagrawal'\n__version__ = '0.9'\n\n# Interface for accessing the VQA dataset.\n\n# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link: \n# (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py).\n\n# The following functions are defined:\n#  VQA        - VQA class that loads VQA annotation file and prepares data structures.\n#  getQuesIds - Get question ids that satisfy given filter conditions.\n#  getImgIds  - Get image ids that satisfy given filter conditions.\n#  loadQA     - Load questions and answers with the specified question ids.\n#  showQA     - Display the specified questions and answers.\n#  loadRes    - Load result file and create result object.\n\n# Help on each function can be accessed by: \"help(COCO.function)\"\n\nimport json\nimport datetime\nimport copy\n\n\nclass VQA:\n\tdef __init__(self, annotation_file=None, question_file=None):\n\t\t\"\"\"\n       \tConstructor of VQA helper class for reading and visualizing questions and answers.\n        :param annotation_file (str): location of VQA annotation file\n        :return:\n\t\t\"\"\"\n\t\t# load dataset\n\t\tself.dataset = {}\n\t\tself.questions = {}\n\t\tself.qa = {}\n\t\tself.qqa = {}\n\t\tself.imgToQA = {}\n\t\tif not annotation_file == None and not question_file == None:\n\t\t\tprint('loading VQA annotations and questions into memory...')\n\t\t\ttime_t = datetime.datetime.utcnow()\n\t\t\tdataset = json.load(open(annotation_file, 'r'))\n\t\t\tquestions = json.load(open(question_file, 'r'))\n\t\t\tprint(datetime.datetime.utcnow() - time_t)\n\t\t\tself.dataset = dataset\n\t\t\tself.questions = questions\n\t\t\tself.createIndex()\n\n\tdef createIndex(self):\n\t\t# create index\n\t\tprint('creating index...')\n\t\timgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}\n\t\tqa = {ann['question_id']: [] for ann in self.dataset['annotations']}\n\t\tqqa = {ann['question_id']: [] 
for ann in self.dataset['annotations']}\n\t\tfor ann in self.dataset['annotations']:\n\t\t\timgToQA[ann['image_id']] += [ann]\n\t\t\tqa[ann['question_id']] = ann\n\t\tfor ques in self.questions['questions']:\n\t\t\tqqa[ques['question_id']] = ques\n\t\tprint('index created!')\n\n\t\t# create class members\n\t\tself.qa = qa\n\t\tself.qqa = qqa\n\t\tself.imgToQA = imgToQA\n\n\tdef info(self):\n\t\t\"\"\"\n\t\tPrint information about the VQA annotation file.\n\t\t:return:\n\t\t\"\"\"\n\t\tfor key, value in self.dataset['info'].items():\n\t\t\tprint('%s: %s' % (key, value))\n\n\tdef getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):\n\t\t\"\"\"\n\t\tGet question ids that satisfy given filter conditions. default skips that filter\n\t\t:param \timgIds    (int array)   : get question ids for given imgs\n\t\t\t\tquesTypes (str array)   : get question ids for given question types\n\t\t\t\tansTypes  (str array)   : get question ids for given answer types\n\t\t:return:    ids   (int array)   : integer array of question ids\n\t\t\"\"\"\n\t\timgIds = imgIds if type(imgIds) == list else [imgIds]\n\t\tquesTypes = quesTypes if type(quesTypes) == list else [quesTypes]\n\t\tansTypes = ansTypes if type(ansTypes) == list else [ansTypes]\n\n\t\tif len(imgIds) == len(quesTypes) == len(ansTypes) == 0:\n\t\t\tanns = self.dataset['annotations']\n\t\telse:\n\t\t\tif not len(imgIds) == 0:\n\t\t\t\tanns = sum([self.imgToQA[imgId] for imgId in imgIds if imgId in self.imgToQA], [])\n\t\t\telse:\n\t\t\t\tanns = self.dataset['annotations']\n\t\t\tanns = anns if len(quesTypes) == 0 else [ann for ann in anns if ann['question_type'] in quesTypes]\n\t\t\tanns = anns if len(ansTypes) == 0 else [ann for ann in anns if ann['answer_type'] in ansTypes]\n\t\tids = [ann['question_id'] for ann in anns]\n\t\treturn ids\n\n\tdef getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):\n\t\t\"\"\"\n\t\tGet image ids that satisfy given filter conditions. 
default skips that filter\n\t\t:param quesIds   (int array)   : get image ids for given question ids\n               quesTypes (str array)   : get image ids for given question types\n               ansTypes  (str array)   : get image ids for given answer types\n\t\t:return: ids     (int array)   : integer array of image ids\n\t\t\"\"\"\n\t\tquesIds = quesIds if type(quesIds) == list else [quesIds]\n\t\tquesTypes = quesTypes if type(quesTypes) == list else [quesTypes]\n\t\tansTypes = ansTypes if type(ansTypes) == list else [ansTypes]\n\n\t\tif len(quesIds) == len(quesTypes) == len(ansTypes) == 0:\n\t\t\tanns = self.dataset['annotations']\n\t\telse:\n\t\t\tif not len(quesIds) == 0:\n\t\t\t\tanns = sum([self.qa[quesId] for quesId in quesIds if quesId in self.qa], [])\n\t\t\telse:\n\t\t\t\tanns = self.dataset['annotations']\n\t\t\tanns = anns if len(quesTypes) == 0 else [ann for ann in anns if ann['question_type'] in quesTypes]\n\t\t\tanns = anns if len(ansTypes) == 0 else [ann for ann in anns if ann['answer_type'] in ansTypes]\n\t\tids = [ann['image_id'] for ann in anns]\n\t\treturn ids\n\n\tdef loadQA(self, ids=[]):\n\t\t\"\"\"\n\t\tLoad questions and answers with the specified question ids.\n\t\t:param ids (int array)       : integer ids specifying question ids\n\t\t:return: qa (object array)   : loaded qa objects\n\t\t\"\"\"\n\t\tif type(ids) == list:\n\t\t\treturn [self.qa[id] for id in ids]\n\t\telif type(ids) == int:\n\t\t\treturn [self.qa[ids]]\n\n\tdef showQA(self, anns):\n\t\t\"\"\"\n\t\tDisplay the specified annotations.\n\t\t:param anns (array of object): annotations to display\n\t\t:return: None\n\t\t\"\"\"\n\t\tif len(anns) == 0:\n\t\t\treturn 0\n\t\tfor ann in anns:\n\t\t\tquesId = ann['question_id']\n\t\t\tprint(\"Question: %s\" % (self.qqa[quesId]['question']))\n\t\t\tfor ans in ann['answers']:\n\t\t\t\tprint(\"Answer %d: %s\" % (ans['answer_id'], ans['answer']))\n\n\tdef loadRes(self, resFile, quesFile):\n\t\t\"\"\"\n\t\tLoad result file and return a 
result object.\n\t\t:param   resFile (str)     : file name of result file\n\t\t:return: res (obj)         : result api object\n\t\t\"\"\"\n\t\tres = VQA()\n\t\tres.questions = json.load(open(quesFile))\n\t\tres.dataset['info'] = copy.deepcopy(self.questions['info'])\n\t\tres.dataset['task_type'] = copy.deepcopy(self.questions['task_type'])\n\t\tres.dataset['data_type'] = copy.deepcopy(self.questions['data_type'])\n\t\tres.dataset['data_subtype'] = copy.deepcopy(self.questions['data_subtype'])\n\t\tres.dataset['license'] = copy.deepcopy(self.questions['license'])\n\n\t\tprint('Loading and preparing results...     ')\n\t\ttime_t = datetime.datetime.utcnow()\n\t\tanns = json.load(open(resFile))\n\t\tassert type(anns) == list, 'results is not an array of objects'\n\t\tannsQuesIds = [ann['question_id'] for ann in anns]\n\t\tassert set(annsQuesIds) == set(self.getQuesIds()), \\\n\t\t\t'Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file.'\n\t\tfor ann in anns:\n\t\t\tquesId = ann['question_id']\n\t\t\tif res.dataset['task_type'] == 'Multiple Choice':\n\t\t\t\tassert ann['answer'] in self.qqa[quesId][\n\t\t\t\t\t'multiple_choices'], 'predicted answer is not one of the multiple choices'\n\t\t\tqaAnn = self.qa[quesId]\n\t\t\tann['image_id'] = qaAnn['image_id']\n\t\t\tann['question_type'] = qaAnn['question_type']\n\t\t\tann['answer_type'] = qaAnn['answer_type']\n\t\tprint('DONE (t=%0.2fs)' % ((datetime.datetime.utcnow() - time_t).total_seconds()))\n\n\t\tres.dataset['annotations'] = anns\n\t\tres.createIndex()\n\t\treturn res\n"
  },
  {
    "path": "evaluation/vqa_utils/vqaEval.py",
    "content": "# coding=utf-8\n\n__author__='aagrawal'\n\n# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link: \n# (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).\nimport sys\nimport re\n\nclass VQAEval:\n\tdef __init__(self, vqa, vqaRes, n=2):\n\t\tself.n \t\t\t  = n\n\t\tself.accuracy     = {}\n\t\tself.evalQA       = {}\n\t\tself.evalQuesType = {}\n\t\tself.evalAnsType  = {}\n\t\tself.vqa \t\t  = vqa\n\t\tself.vqaRes       = vqaRes\n\t\tself.params\t\t  = {'question_id': vqa.getQuesIds()}\n\t\tself.contractions = {\"aint\": \"ain't\", \"arent\": \"aren't\", \"cant\": \"can't\", \"couldve\": \"could've\", \"couldnt\": \"couldn't\",\n\t\t\t\t\t\t\t \"couldn'tve\": \"couldn't've\", \"couldnt've\": \"couldn't've\", \"didnt\": \"didn't\", \"doesnt\": \"doesn't\", \"dont\": \"don't\", \"hadnt\": \"hadn't\",\n\t\t\t\t\t\t\t \"hadnt've\": \"hadn't've\", \"hadn'tve\": \"hadn't've\", \"hasnt\": \"hasn't\", \"havent\": \"haven't\", \"hed\": \"he'd\", \"hed've\": \"he'd've\",\n\t\t\t\t\t\t\t \"he'dve\": \"he'd've\", \"hes\": \"he's\", \"howd\": \"how'd\", \"howll\": \"how'll\", \"hows\": \"how's\", \"Id've\": \"I'd've\", \"I'dve\": \"I'd've\",\n\t\t\t\t\t\t\t \"Im\": \"I'm\", \"Ive\": \"I've\", \"isnt\": \"isn't\", \"itd\": \"it'd\", \"itd've\": \"it'd've\", \"it'dve\": \"it'd've\", \"itll\": \"it'll\", \"let's\": \"let's\",\n\t\t\t\t\t\t\t \"maam\": \"ma'am\", \"mightnt\": \"mightn't\", \"mightnt've\": \"mightn't've\", \"mightn'tve\": \"mightn't've\", \"mightve\": \"might've\",\n\t\t\t\t\t\t\t \"mustnt\": \"mustn't\", \"mustve\": \"must've\", \"neednt\": \"needn't\", \"notve\": \"not've\", \"oclock\": \"o'clock\", \"oughtnt\": \"oughtn't\",\n\t\t\t\t\t\t\t \"ow's'at\": \"'ow's'at\", \"'ows'at\": \"'ow's'at\", \"'ow'sat\": \"'ow's'at\", \"shant\": \"shan't\", \"shed've\": \"she'd've\", \"she'dve\": \"she'd've\",\n\t\t\t\t\t\t\t \"she's\": \"she's\", \"shouldve\": \"should've\", 
\"shouldnt\": \"shouldn't\", \"shouldnt've\": \"shouldn't've\", \"shouldn'tve\": \"shouldn't've\",\n\t\t\t\t\t\t\t \"somebodyd\": \"somebody'd\", \"somebodyd've\": \"somebody'd've\", \"somebody'dve\": \"somebody'd've\", \"somebodyll\": \"somebody'll\",\n\t\t\t\t\t\t\t \"somebodys\": \"somebody's\", \"someoned\": \"someone'd\", \"someoned've\": \"someone'd've\", \"someone'dve\": \"someone'd've\",\n\t\t\t\t\t\t\t \"someonell\": \"someone'll\", \"someones\": \"someone's\", \"somethingd\": \"something'd\", \"somethingd've\": \"something'd've\",\n\t\t\t\t\t\t\t \"something'dve\": \"something'd've\", \"somethingll\": \"something'll\", \"thats\": \"that's\", \"thered\": \"there'd\", \"thered've\": \"there'd've\",\n\t\t\t\t\t\t\t \"there'dve\": \"there'd've\", \"therere\": \"there're\", \"theres\": \"there's\", \"theyd\": \"they'd\", \"theyd've\": \"they'd've\",\n\t\t\t\t\t\t\t \"they'dve\": \"they'd've\", \"theyll\": \"they'll\", \"theyre\": \"they're\", \"theyve\": \"they've\", \"twas\": \"'twas\", \"wasnt\": \"wasn't\",\n\t\t\t\t\t\t\t \"wed've\": \"we'd've\", \"we'dve\": \"we'd've\", \"weve\": \"we've\", \"werent\": \"weren't\", \"whatll\": \"what'll\", \"whatre\": \"what're\",\n\t\t\t\t\t\t\t \"whats\": \"what's\", \"whatve\": \"what've\", \"whens\": \"when's\", \"whered\": \"where'd\", \"wheres\": \"where's\", \"whereve\": \"where've\",\n\t\t\t\t\t\t\t \"whod\": \"who'd\", \"whod've\": \"who'd've\", \"who'dve\": \"who'd've\", \"wholl\": \"who'll\", \"whos\": \"who's\", \"whove\": \"who've\", \"whyll\": \"why'll\",\n\t\t\t\t\t\t\t \"whyre\": \"why're\", \"whys\": \"why's\", \"wont\": \"won't\", \"wouldve\": \"would've\", \"wouldnt\": \"wouldn't\", \"wouldnt've\": \"wouldn't've\",\n\t\t\t\t\t\t\t \"wouldn'tve\": \"wouldn't've\", \"yall\": \"y'all\", \"yall'll\": \"y'all'll\", \"y'allll\": \"y'all'll\", \"yall'd've\": \"y'all'd've\",\n\t\t\t\t\t\t\t \"y'alld've\": \"y'all'd've\", \"y'all'dve\": \"y'all'd've\", \"youd\": \"you'd\", \"youd've\": \"you'd've\", \"you'dve\": 
\"you'd've\",\n\t\t\t\t\t\t\t \"youll\": \"you'll\", \"youre\": \"you're\", \"youve\": \"you've\"}\n\t\tself.manualMap    = { 'none': '0',\n\t\t\t\t\t\t\t  'zero': '0',\n\t\t\t\t\t\t\t  'one': '1',\n\t\t\t\t\t\t\t  'two': '2',\n\t\t\t\t\t\t\t  'three': '3',\n\t\t\t\t\t\t\t  'four': '4',\n\t\t\t\t\t\t\t  'five': '5',\n\t\t\t\t\t\t\t  'six': '6',\n\t\t\t\t\t\t\t  'seven': '7',\n\t\t\t\t\t\t\t  'eight': '8',\n\t\t\t\t\t\t\t  'nine': '9',\n\t\t\t\t\t\t\t  'ten': '10'\n\t\t\t\t\t\t\t}\n\t\tself.articles     = ['a',\n\t\t\t\t\t\t\t 'an',\n\t\t\t\t\t\t\t 'the'\n\t\t\t\t\t\t\t]\n \n\n\t\tself.periodStrip  = re.compile(\"(?!<=\\d)(\\.)(?!\\d)\")\n\t\tself.commaStrip   = re.compile(\"(\\d)(,)(\\d)\")\n\t\tself.punct        = [';', r\"/\", '[', ']', '\"', '{', '}',\n\t\t\t\t\t\t\t '(', ')', '=', '+', '\\\\', '_', '-',\n\t\t\t\t\t\t\t '>', '<', '@', '`', ',', '?', '!']\n\n\t\n\tdef evaluate(self, quesIds=None):\n\t\tif quesIds == None:\n\t\t\tquesIds = [quesId for quesId in self.params['question_id']]\n\t\tgts = {}\n\t\tres = {}\n\t\tfor quesId in quesIds:\n\t\t\tgts[quesId] = self.vqa.qa[quesId]\n\t\t\tres[quesId] = self.vqaRes.qa[quesId]\n\t\t\n\t\t# =================================================\n\t\t# Compute accuracy\n\t\t# =================================================\n\t\taccQA       = []\n\t\taccQuesType = {}\n\t\taccAnsType  = {}\n\t\tprint (\"computing accuracy\")\n\t\tstep = 0\n\t\tfor quesId in quesIds:\n\t\t\tresAns      = res[quesId]['answer']\n\t\t\tresAns      = resAns.replace('\\n', ' ')\n\t\t\tresAns      = resAns.replace('\\t', ' ')\n\t\t\tresAns      = resAns.strip()\n\t\t\tresAns      = self.processPunctuation(resAns)\n\t\t\tresAns      = self.processDigitArticle(resAns)\n\t\t\tgtAcc  = []\n\t\t\tgtAnswers = [ans['answer'] for ans in gts[quesId]['answers']]\n\t\t\tif len(set(gtAnswers)) > 1: \n\t\t\t\tfor ansDic in gts[quesId]['answers']:\n\t\t\t\t\tansDic['answer'] = self.processPunctuation(ansDic['answer'])\n\t\t\tfor gtAnsDatum in 
gts[quesId]['answers']:\n\t\t\t\totherGTAns = [item for item in gts[quesId]['answers'] if item!=gtAnsDatum]\n\t\t\t\tmatchingAns = [item for item in otherGTAns if item['answer']==resAns]\n\t\t\t\tacc = min(1, float(len(matchingAns))/3)\n\t\t\t\tgtAcc.append(acc)\n\t\t\tquesType    = gts[quesId]['question_type']\n\t\t\tansType     = gts[quesId]['answer_type']\n\t\t\tavgGTAcc = float(sum(gtAcc))/len(gtAcc)\n\t\t\taccQA.append(avgGTAcc)\n\t\t\tif quesType not in accQuesType:\n\t\t\t\taccQuesType[quesType] = []\n\t\t\taccQuesType[quesType].append(avgGTAcc)\n\t\t\tif ansType not in accAnsType:\n\t\t\t\taccAnsType[ansType] = []\n\t\t\taccAnsType[ansType].append(avgGTAcc)\n\t\t\tself.setEvalQA(quesId, avgGTAcc)\n\t\t\tself.setEvalQuesType(quesId, quesType, avgGTAcc)\n\t\t\tself.setEvalAnsType(quesId, ansType, avgGTAcc)\n\t\t\tif step%100 == 0:\n\t\t\t\tself.updateProgress(step/float(len(quesIds)))\n\t\t\tstep = step + 1\n\n\t\tself.setAccuracy(accQA, accQuesType, accAnsType)\n\t\tprint (\"Done computing accuracy\")\n\t\n\tdef processPunctuation(self, inText):\n\t\toutText = inText\n\t\tfor p in self.punct:\n\t\t\tif (p + ' ' in inText or ' ' + p in inText) or (re.search(self.commaStrip, inText) != None):\n\t\t\t\toutText = outText.replace(p, '')\n\t\t\telse:\n\t\t\t\toutText = outText.replace(p, ' ')\t\n\t\toutText = self.periodStrip.sub(\"\",\n\t\t\t\t\t\t\t\t\t  outText,\n\t\t\t\t\t\t\t\t\t  re.UNICODE)\n\t\treturn outText\n\t\n\tdef processDigitArticle(self, inText):\n\t\toutText = []\n\t\ttempText = inText.lower().split()\n\t\tfor word in tempText:\n\t\t\tword = self.manualMap.setdefault(word, word)\n\t\t\tif word not in self.articles:\n\t\t\t\toutText.append(word)\n\t\t\telse:\n\t\t\t\tpass\n\t\tfor wordId, word in enumerate(outText):\n\t\t\tif word in self.contractions: \n\t\t\t\toutText[wordId] = self.contractions[word]\n\t\toutText = ' '.join(outText)\n\t\treturn outText\n\n\tdef setAccuracy(self, accQA, accQuesType, accAnsType):\n\t\tself.accuracy['overall']      
   = round(100*float(sum(accQA))/len(accQA), self.n)\n\t\tself.accuracy['perQuestionType'] = {quesType: round(100*float(sum(accQuesType[quesType]))/len(accQuesType[quesType]), self.n) for quesType in accQuesType}\n\t\tself.accuracy['perAnswerType']   = {ansType:  round(100*float(sum(accAnsType[ansType]))/len(accAnsType[ansType]), self.n) for ansType in accAnsType}\n\t\t\t\n\tdef setEvalQA(self, quesId, acc):\n\t\tself.evalQA[quesId] = round(100*acc, self.n)\n\n\tdef setEvalQuesType(self, quesId, quesType, acc):\n\t\tif quesType not in self.evalQuesType:\n\t\t\tself.evalQuesType[quesType] = {}\n\t\tself.evalQuesType[quesType][quesId] = round(100*acc, self.n)\n\t\n\tdef setEvalAnsType(self, quesId, ansType, acc):\n\t\tif ansType not in self.evalAnsType:\n\t\t\tself.evalAnsType[ansType] = {}\n\t\tself.evalAnsType[ansType][quesId] = round(100*acc, self.n)\n\n\tdef updateProgress(self, progress):\n\t\tbarLength = 20\n\t\tstatus = \"\"\n\t\tif isinstance(progress, int):\n\t\t\tprogress = float(progress)\n\t\tif not isinstance(progress, float):\n\t\t\tprogress = 0\n\t\t\tstatus = \"error: progress var must be float\\r\\n\"\n\t\tif progress < 0:\n\t\t\tprogress = 0\n\t\t\tstatus = \"Halt...\\r\\n\"\n\t\tif progress >= 1:\n\t\t\tprogress = 1\n\t\t\tstatus = \"Done...\\r\\n\"\n\t\tblock = int(round(barLength*progress))\n\t\ttext = \"\\rFinshed Percent: [{0}] {1}% {2}\".format( \"#\"*block + \"-\"*(barLength-block), int(progress*100), status)\n\t\tsys.stdout.write(text)\n\t\tsys.stdout.flush()\n\n"
  },
  {
    "path": "main.py",
    "content": "import argparse\nimport yaml\nimport torch\n\nfrom evaluation.okvqa_evaluate import OKEvaluater\nfrom evaluation.aokvqa_evaluate import AOKEvaluater\nfrom configs.task_cfgs import Cfgs\nfrom prophet import get_args, get_runner\n\n# parse cfgs and args\nargs = get_args()\n__C = Cfgs(args)\nwith open(args.cfg_file, 'r') as f:\n    yaml_dict = yaml.load(f, Loader=yaml.FullLoader)\n__C.override_from_dict(yaml_dict)\nprint(__C)\n\n# build runner\nif __C.RUN_MODE == 'pretrain':\n    evaluater = None\nelif 'aok' in __C.TASK:\n    evaluater = AOKEvaluater(\n        __C.EVAL_ANSWER_PATH,\n        __C.EVAL_QUESTION_PATH,\n    )\nelse:\n    evaluater = OKEvaluater(\n        __C.EVAL_ANSWER_PATH,\n        __C.EVAL_QUESTION_PATH,\n    )\n\nrunner = get_runner(__C, evaluater)\n\n# run\nrunner.run()\n"
  },
  {
    "path": "misc/tree.txt",
    "content": "prophet\n├── assets\n│   ├── answer_aware_examples_okvqa.json\n│   ├── answer_dict_aokvqa.json\n│   ├── answer_dict_okvqa.json\n│   ├── answer_dict_vqav2.json\n│   ├── candidates_aokvqa_test.json\n│   ├── candidates_aokvqa_val.json\n│   ├── candidates_okvqa.json\n│   ├── captions_aokvqa.json\n│   ├── captions_okvqa.json\n│   ├── examples_aokvqa_test.json.json\n│   └── examples_aokvqa_val.json.json\n├── ckpts\n│   ├── mcan_ft_aokvqa_test.pkl\n│   ├── mcan_ft_aokvqa_val.pkl\n│   ├── mcan_ft_okvqa.pkl\n│   ├── mcan_pt_aokvqa_test.pkl\n│   └── mcan_pt_aokvqa_val.pkl\n│   ├── mcan_pt_okvqa.pkl\n├── configs\n│   ├── finetune.yml\n│   ├── path_cfgs.py\n│   ├── pretrain.yml\n│   ├── prompt.yml\n│   ├── task_cfgs.py\n│   └── task_to_split.py\n├── datasets\n│   ├── aokvqa\n│   │   ├── aokvqa_v1p0_test.json\n│   │   ├── aokvqa_v1p0_train.json\n│   │   └── aokvqa_v1p0_val.json\n│   ├── coco2014\n│   ├── coco2014_feats\n│   ├── coco2017\n│   ├── coco2017_feats\n│   ├── okvqa\n│   │   ├── mscoco_train2014_annotations.json\n│   │   ├── mscoco_val2014_annotations.json\n│   │   ├── OpenEnded_mscoco_train2014_questions.json\n│   │   └── OpenEnded_mscoco_val2014_questions.json\n│   └── vqav2\n│       ├── v2_mscoco_train2014_annotations.json\n│       ├── v2_mscoco_val2014_annotations.json\n│       ├── v2_OpenEnded_mscoco_train2014_questions.json\n│       ├── v2_OpenEnded_mscoco_val2014_questions.json\n│       ├── v2valvg_no_ok_annotations.json\n│       ├── v2valvg_no_ok_questions.json\n│       ├── vg_annotations.json\n│       └── vg_questions.json\n├── environment.yml\n├── evaluation\n│   ├── ans_punct.py\n│   ├── aok_utils\n│   │   ├── eval_predictions.py\n│   │   ├── load_aokvqa.py\n│   │   └── remap_predictions.py\n│   ├── aokvqa_evaluate.py\n│   ├── okvqa_evaluate.py\n│   └── vqa_utils\n│       ├── vqaEval.py\n│       └── vqa.py\n├── main.py\n├── misc\n│   └── framework.png\n├── outputs\n│   ├── ckpts\n│   ├── logs\n│   └── results\n├── preds\n│   ├── 
mcan_530_okvqa.json\n│   └── prophet_611_okvqa.json\n├── prophet\n│   ├── __init__.py\n│   ├── stage1\n│   │   ├── finetune.py\n│   │   ├── heuristics.py\n│   │   ├── model\n│   │   │   ├── layers.py\n│   │   │   ├── mcan_for_finetune.py\n│   │   │   ├── mcan.py\n│   │   │   ├── net_utils.py\n│   │   │   └── rope2d.py\n│   │   ├── pretrain.py\n│   │   └── utils\n│   │       ├── load_data.py\n│   │       ├── optim.py\n│   └── stage2\n│       ├── prompt.py\n│       └── utils\n│           ├── data_utils.py\n│           ├── fancy_pbar.py\n├── README.md\n├── scripts\n│   ├── evaluate_model.sh\n│   ├── extract_img_feats.sh\n│   ├── finetune.sh\n│   ├── heuristics_gen.sh\n│   ├── pretrain.sh\n│   └── prompt.sh\n└── tools\n    ├── extract_img_feats.py\n    └── transforms.py\n"
  },
  {
    "path": "outputs/ckpts/.gitkeep",
    "content": ""
  },
  {
    "path": "outputs/logs/.gitkeep",
    "content": ""
  },
  {
    "path": "outputs/results/.gitkeep",
    "content": ""
  },
  {
    "path": "preds/.gitkeep",
    "content": ""
  },
  {
    "path": "prophet/__init__.py",
    "content": "__author__ = 'Zhenwei Shao'\n__version__ = '1.0'\n\nimport argparse\n\ndef get_args():\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--task', dest='TASK', help=\"task name, one of ['ok', 'aok_val', 'aok_test']\", type=str, required=True)\n    parser.add_argument('--run_mode', dest='RUN_MODE', help=\"run mode, one of ['pretrain', 'finetune', 'finetune_test', 'heuristics', 'prompt']\", type=str, required=True)\n    parser.add_argument('--cfg', dest='cfg_file', help='config file', type=str, required=True)\n    parser.add_argument('--version', dest='VERSION', help='version name, output folder will be named as version name', type=str, required=True)\n    parser.add_argument('--ckpt_path', dest='CKPT_PATH', help='checkpoint path for test', type=str, default=None)\n    parser.add_argument('--pretrained_model', dest='PRETRAINED_MODEL_PATH', help='pretrained model path', type=str, default=None)\n    parser.add_argument('--debug', dest='DEBUG', help='debug mode', action='store_true')\n    parser.add_argument('--resume', dest='RESUME', help='resume previous run', action='store_true')\n    parser.add_argument('--gpu', dest='GPU', help='gpu id', type=str, default=None)\n    parser.add_argument('--grad_accu', dest='GRAD_ACCU_STEPS', help='random seed', type=int, default=None)\n    parser.add_argument('--seed', dest='SEED', help='random seed', type=int, default=99)\n    parser.add_argument('--candidate_num', dest='CANDIDATE_NUM', help='topk candidates', type=int, default=None)\n    parser.add_argument('--example_num', dest='EXAMPLE_NUM', help='number of most similar examples to be searched, default: 200', type=int, default=None)\n    parser.add_argument('--examples_path', dest='EXAMPLES_PATH', help='answer-aware example file path, default: \"assets/answer_aware_examples_for_ok.json\"', type=str, default=None)\n    parser.add_argument('--candidates_path', dest='CANDIDATES_PATH', help='candidates file path, default: 
\"assets/candidates_for_ok.json\"', type=str, default=None)\n    parser.add_argument('--captions_path', dest='CAPTIONS_PATH', help='captions file path, default: \"assets/captions_for_ok.json\"', type=str, default=None)\n    parser.add_argument('--openai_key', dest='OPENAI_KEY', help='openai api key', type=str, default=None)\n    args = parser.parse_args()\n    return args\n\n\n\ndef get_runner(__C, evaluater):\n    if __C.RUN_MODE == 'pretrain':\n        from .stage1.pretrain import Runner\n    elif __C.RUN_MODE == 'finetune':\n        from .stage1.finetune import Runner\n    elif __C.RUN_MODE == 'finetune_test':\n        from .stage1.finetune import Runner\n    elif __C.RUN_MODE == 'heuristics':\n        from .stage1.heuristics import Runner\n    elif __C.RUN_MODE == 'prompt':\n        from .stage2.prompt import Runner\n    else:\n        raise NotImplementedError\n    runner = Runner(__C, evaluater)\n    return runner"
  },
  {
    "path": "prophet/stage1/finetune.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: Runner that handles the finetuning and evaluation process\n# ------------------------------------------------------------------------------ #\n\nimport os, sys\n# sys.path.append(os.getcwd())\n\nfrom datetime import datetime\nimport pickle, random, math, time\nimport json\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.utils.data as Data\nimport argparse\nfrom pathlib import Path\nfrom copy import deepcopy\nimport yaml\n\nfrom configs.task_cfgs import Cfgs\nfrom .utils.load_data import CommonData, DataSet\nfrom .model.mcan_for_finetune import MCANForFinetune\nfrom .utils.optim import get_optim_for_finetune as get_optim\n\nclass Runner(object):\n    def __init__(self, __C, evaluater):\n        self.__C = __C\n        self.evaluater = evaluater\n        \n    def train(self, train_set, eval_set=None):\n        data_size = train_set.data_size\n\n        # Define the MCAN model\n        net = MCANForFinetune(self.__C, train_set.ans_size)\n\n        ## load the pretrained model\n        if self.__C.PRETRAINED_MODEL_PATH is not None:\n            print(f'Loading pretrained model from {self.__C.PRETRAINED_MODEL_PATH}')\n            ckpt = torch.load(self.__C.PRETRAINED_MODEL_PATH, map_location='cpu')\n            net.load_state_dict(ckpt['state_dict'], strict=False)\n            net.parameter_init()\n            print('Finish loading.')\n\n        # Define the optimizer\n        if self.__C.RESUME:\n            raise NotImplementedError('Resume training is not needed as the finetuning is fast')\n        else:\n            optim = get_optim(self.__C, net)\n            start_epoch = 0\n\n        # load to gpu\n        net.cuda()\n        # Define the multi-gpu training if needed\n        if self.__C.N_GPU > 1:\n            net = nn.DataParallel(net, 
device_ids=self.__C.GPU_IDS)\n\n        # Define the binary cross entropy loss\n        loss_fn = torch.nn.BCEWithLogitsLoss(reduction='sum')\n        epoch_loss = 0\n\n        # Define multi-thread dataloader\n        dataloader = Data.DataLoader(\n            train_set,\n            batch_size=self.__C.BATCH_SIZE,\n            shuffle=True,\n            num_workers=self.__C.NUM_WORKERS,\n            pin_memory=self.__C.PIN_MEM,\n            drop_last=True\n        )\n\n        # Training script\n        for epoch in range(start_epoch, self.__C.MAX_EPOCH):\n            net.train()\n            # Save log information\n            with open(self.__C.LOG_PATH, 'a+') as logfile:\n                logfile.write(\n                    f'nowTime: {datetime.now():%Y-%m-%d %H:%M:%S}\\n'\n                )\n\n            time_start = time.time()\n\n            # Iteration\n            for step, input_tuple in enumerate(dataloader):\n                iteration_loss = 0\n                optim.zero_grad()\n                input_tuple = [x.cuda() for x in input_tuple]\n                SUB_BATCH_SIZE = self.__C.BATCH_SIZE // self.__C.GRAD_ACCU_STEPS\n                for accu_step in range(self.__C.GRAD_ACCU_STEPS):\n\n                    sub_tuple = [x[accu_step * SUB_BATCH_SIZE:\n                        (accu_step + 1) * SUB_BATCH_SIZE] for x in input_tuple]\n                    \n                    sub_ans_iter = sub_tuple[-1]\n                    pred = net(sub_tuple[:-1])\n                    loss = loss_fn(pred, sub_ans_iter)\n                    loss.backward()\n                    loss_item = loss.item()\n                    iteration_loss += loss_item\n                    epoch_loss += loss_item# * self.__C.GRAD_ACCU_STEPS\n\n                print(\"\\r[version %s][epoch %2d][step %4d/%4d][Task %s][Mode %s] loss: %.4f, lr: %.2e\" % (\n                    self.__C.VERSION,\n                    epoch + 1,\n                    step,\n                    int(data_size / 
self.__C.BATCH_SIZE),\n                    self.__C.TASK,\n                    self.__C.RUN_MODE,\n                    iteration_loss / self.__C.BATCH_SIZE,\n                    optim.current_lr(),\n                ), end='          ')\n\n                optim.step()\n\n            time_end = time.time()\n            print('Finished in {}s'.format(int(time_end - time_start)))\n\n            # Logging\n            with open(self.__C.LOG_PATH, 'a+') as logfile:\n                logfile.write(f'epoch = {epoch + 1}  loss = {epoch_loss / data_size}\\nlr = {optim.current_lr()}\\n\\n')\n            \n            optim.schedule_step(epoch)\n\n            # Save checkpoint\n            state = {\n                'state_dict': net.state_dict() if self.__C.N_GPU == 1 \\\n                    else net.module.state_dict(),\n                'optimizer': optim.optimizer.state_dict(),\n                'warmup_lr_scale': optim.warmup_lr_scale,\n                'decay_lr_scale': optim.decay_lr_scale,\n            }\n            torch.save(\n                state,\n                f'{self.__C.CKPTS_DIR}/epoch{epoch + 1}.pkl'\n            )\n\n\n            # Eval after every epoch\n            if eval_set is not None:\n                self.eval(\n                    eval_set,\n                    net,\n                    eval_now=True\n                )\n            \n            epoch_loss = 0\n\n    # Evaluation\n    @torch.no_grad()\n    def eval(self, dataset, net=None, eval_now=False):\n        data_size = dataset.data_size\n\n        # if eval_now and self.evaluater is None:\n        #     self.build_evaluator(dataset)\n        \n        if net is None:\n            # Load parameters\n            path = self.__C.CKPT_PATH\n\n            print('Loading ckpt {}'.format(path))\n            net = MCANForFinetune(self.__C, dataset.ans_size)\n            ckpt = torch.load(path, map_location='cpu')\n            net.load_state_dict(ckpt['state_dict'], strict=False)\n            
net.cuda()\n            if self.__C.N_GPU > 1:\n                net = nn.DataParallel(net, device_ids=self.__C.GPU)\n            print('Finish!')\n\n        net.eval()\n        \n        dataloader = Data.DataLoader(\n            dataset,\n            batch_size=self.__C.EVAL_BATCH_SIZE,\n            shuffle=False,\n            num_workers=self.__C.NUM_WORKERS,\n            pin_memory=True\n        )\n\n        qid_idx = 0\n        self.evaluater.init()\n\n        for step, input_tuple in enumerate(dataloader):\n            print(\"\\rEvaluation: [step %4d/%4d]\" % (\n                step,\n                int(data_size / self.__C.EVAL_BATCH_SIZE),\n            ), end='          ')\n\n            input_tuple = [x.cuda() for x in input_tuple]\n\n\n            pred = net(input_tuple[:-1])\n            pred_np = pred.cpu().numpy()\n            pred_argmax = np.argmax(pred_np, axis=1)\n\n            # collect answers for every batch\n            for i in range(len(pred_argmax)):\n                qid = dataset.qids[qid_idx]\n                qid_idx += 1\n                ans_id = int(pred_argmax[i])\n                ans = dataset.ix_to_ans[ans_id]\n                # log result to evaluater\n                self.evaluater.add(qid, ans)\n        \n        print()\n        self.evaluater.save(self.__C.RESULT_PATH)\n        # evaluate if eval_now is True\n        if eval_now:\n            with open(self.__C.LOG_PATH, 'a+') as logfile:\n                self.evaluater.evaluate(logfile)\n\n    # def build_evaluator(self, valid_set):\n    #     if 'aok' in self.__C.TASK:\n    #         from evaluation.aokvqa_evaluate import Evaluater\n    #     elif 'ok' in self.__C.TASK:\n    #         from evaluation.okvqa_evaluate import Evaluater\n    #     else:\n    #         raise ValueError('Unknown dataset')\n    #     self.evaluater = Evaluater(\n    #         valid_set.annotation_path,\n    #         valid_set.question_path,\n    #     )\n\n    def run(self):\n        # Set ckpts and 
log path\n        ## where checkpoints will be saved\n        Path(self.__C.CKPTS_DIR).mkdir(parents=True, exist_ok=True)\n        ## where logs will be saved\n        Path(self.__C.LOG_PATH).parent.mkdir(parents=True, exist_ok=True)\n        ## where eval results will be saved\n        Path(self.__C.RESULT_PATH).parent.mkdir(parents=True, exist_ok=True)\n        with open(self.__C.LOG_PATH, 'w') as f:\n            f.write(str(self.__C) + '\\n')\n\n        # build dataset entities        \n        common_data = CommonData(self.__C)\n\n        if self.__C.RUN_MODE == 'finetune':\n            train_set = DataSet(\n                self.__C, \n                common_data,\n                self.__C.TRAIN_SPLITS\n            )\n            valid_set = None\n            if self.__C.EVAL_NOW:\n                valid_set = DataSet(\n                    self.__C,\n                    common_data,\n                    self.__C.EVAL_SPLITS\n                )\n            self.train(train_set, valid_set)\n        elif self.__C.RUN_MODE == 'finetune_test':\n            test_set = DataSet(\n                self.__C,\n                common_data,\n                self.__C.EVAL_SPLITS\n            )\n            self.eval(test_set, eval_now=self.__C.EVAL_NOW)\n        else:\n            raise ValueError('Invalid run mode')\n\ndef finetune_login_args(parser):\n    parser.add_argument('--task', dest='TASK', help='task name, e.g., ok, aok_val, aok_test', type=str, required=True)\n    parser.add_argument('--run_mode', dest='RUN_MODE', help='run mode', type=str, required=True)\n    parser.add_argument('--cfg', dest='cfg_file', help='optional config file', type=str, required=True)\n    parser.add_argument('--version', dest='VERSION', help='version name', type=str, required=True)\n    parser.add_argument('--resume', dest='RESUME', help='resume training', type=bool, default=False)\n    parser.add_argument('--resume_version', dest='RESUME_VERSION', help='checkpoint version name', type=str, 
default='')\n    parser.add_argument('--resume_epoch', dest='RESUME_EPOCH', help='checkpoint epoch', type=int, default=1)\n    parser.add_argument('--resume_path', dest='RESUME_PATH', help='checkpoint path', type=str, default='')\n    parser.add_argument('--ckpt_path', dest='CKPT_PATH', help='checkpoint path for test', type=str, default=None)\n    parser.add_argument('--gpu', dest='GPU', help='gpu id', type=str, default=None)\n    parser.add_argument('--seed', dest='SEED', help='random seed', type=int, default=None)\n    parser.add_argument('--grad_accu', dest='GRAD_ACCU_STEPS', help='random seed', type=int, default=None)\n    parser.add_argument('--pretrained_model', dest='PRETRAINED_MODEL_PATH', help='pretrained model path', type=str, default=None)\n\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser(description='Parameters for pretraining')\n    finetune_login_args(parser)\n    args = parser.parse_args()\n    __C = Cfgs(args)\n    with open(args.cfg_file, 'r') as f:\n        yaml_dict = yaml.load(f, Loader=yaml.FullLoader)\n    __C.override_from_dict(yaml_dict)\n    print(__C)\n    runner = Runner(__C)\n    runner.run()\n"
  },
  {
    "path": "prophet/stage1/heuristics.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: Runner that handles the heuristics generations process\n# ------------------------------------------------------------------------------ #\n\nimport os, sys\n# sys.path.append(os.getcwd())\n\nfrom datetime import datetime\nimport pickle, random, math, time\nimport json\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.nn.utils import clip_grad_norm_\nimport torch.utils.data as Data\nimport argparse\nfrom pathlib import Path\nimport yaml\nfrom copy import deepcopy\nfrom tqdm import tqdm\n\nfrom configs.task_cfgs import Cfgs\nfrom .utils.load_data import CommonData, DataSet\nfrom .model.mcan_for_finetune import MCANForFinetune\nfrom .utils.optim import get_optim_for_finetune as get_optim\n\nclass Runner(object):\n    def __init__(self, __C, *args, **kwargs):\n        self.__C = __C\n        self.net = None\n\n    # heuristics generation\n    @torch.no_grad()\n    def eval(self, dataset):\n        data_size = dataset.data_size\n\n        if self.net is None:\n            # Load parameters\n            path = self.__C.CKPT_PATH\n            print('Loading ckpt {}'.format(path))\n            net = MCANForFinetune(self.__C, dataset.ans_size)\n            ckpt = torch.load(path, map_location='cpu')\n            net.load_state_dict(ckpt['state_dict'], strict=False)\n            net.cuda()\n            if self.__C.N_GPU > 1:\n                net = nn.DataParallel(net, device_ids=self.__C.GPU_IDS)\n            print('Finish!')\n            self.net = net\n        else:\n            net = self.net\n\n\n        net.eval()\n        \n        dataloader = Data.DataLoader(\n            dataset,\n            batch_size=self.__C.EVAL_BATCH_SIZE,\n            shuffle=False,\n            num_workers=self.__C.NUM_WORKERS,\n            pin_memory=True\n        )\n\n   
     qid_idx = 0\n        topk_results = {}\n        latent_results = []\n        k = self.__C.CANDIDATE_NUM\n\n        for step, input_tuple in enumerate(dataloader):\n            print(\"\\rEvaluation: [step %4d/%4d]\" % (\n                step,\n                int(data_size / self.__C.EVAL_BATCH_SIZE),\n            ), end='          ')\n\n            input_tuple = [x.cuda() for x in input_tuple]\n\n\n            pred, answer_latents = net(input_tuple[:-1], output_answer_latent=True)\n            pred_np = pred.sigmoid().cpu().numpy()\n            answer_latents_np = answer_latents.cpu().numpy()\n\n            # collect answers for every batch\n            for i in range(len(pred_np)):\n                qid = dataset.qids[qid_idx]\n                qid_idx += 1\n                ans_np = pred_np[i]\n                ans_idx = np.argsort(-ans_np)[:k]\n                ans_item = []\n                for idx in ans_idx:\n                    ans_item.append(\n                        {\n                            'answer': dataset.ix_to_ans[idx],\n                            'confidence': float(ans_np[idx])\n                        }\n                    )\n                topk_results[qid] = ans_item\n\n                latent_np = answer_latents_np[i]\n                latent_results.append(latent_np)\n                np.save(\n                    os.path.join(self.__C.ANSWER_LATENTS_DIR, f'{qid}.npy'),\n                    latent_np\n                )\n        print()\n        \n        return topk_results, latent_results\n\n    def run(self):\n        # Set ckpts and log path\n        ## where checkpoints will be saved\n        Path(self.__C.CKPTS_DIR).mkdir(parents=True, exist_ok=True)\n        ## where the result file of topk candidates will be saved\n        Path(self.__C.CANDIDATE_FILE_PATH).parent.mkdir(parents=True, exist_ok=True)\n        ## where answer latents will be saved\n        Path(self.__C.ANSWER_LATENTS_DIR).mkdir(parents=True, exist_ok=True)\n\n       
 # build dataset entities        \n        common_data = CommonData(self.__C)\n        train_set = DataSet(\n            self.__C,\n            common_data,\n            self.__C.TRAIN_SPLITS\n        )\n        test_set = DataSet(\n            self.__C,\n            common_data,\n            self.__C.EVAL_SPLITS\n        )\n\n        # forward VQA model\n        train_topk_results, train_latent_results = self.eval(train_set)\n        test_topk_results, test_latent_results = self.eval(test_set)\n\n        # save topk candidates\n        topk_results = train_topk_results | test_topk_results\n        json.dump(\n            topk_results,\n            open(self.__C.CANDIDATE_FILE_PATH, 'w'),\n            indent=4\n        )\n\n        # search similar examples\n        train_features = np.vstack(train_latent_results)\n        train_features = train_features / np.linalg.norm(train_features, axis=1, keepdims=True)\n\n        test_features = np.vstack(test_latent_results)\n        test_features = test_features / np.linalg.norm(test_features, axis=1, keepdims=True)\n\n        # compute top-E similar examples for each testing input\n        E = self.__C.EXAMPLE_NUM\n        similar_qids = {}\n        print(f'\\ncompute top-{E} similar examples for each testing input')\n        for i, test_qid in enumerate(tqdm(test_set.qids)):\n            # cosine similarity\n            dists = np.dot(test_features[i], train_features.T)\n            top_E = np.argsort(-dists)[:E]\n            similar_qids[test_qid] = [train_set.qids[j] for j in top_E]\n        \n        # save similar qids\n        with open(self.__C.EXAMPLE_FILE_PATH, 'w') as f:\n            json.dump(similar_qids, f)\n\ndef heuristics_login_args(parser):\n    parser.add_argument('--task', dest='TASK', help='task name, e.g., ok, aok_val, aok_test', type=str, required=True)\n    parser.add_argument('--cfg', dest='cfg_file', help='optional config file', type=str, required=True)\n    parser.add_argument('--version', 
dest='VERSION', help='version name', type=str, required=True)\n    parser.add_argument('--ckpt_path', dest='CKPT_PATH', help='checkpoint path for heuristics', type=str, default=None)\n    parser.add_argument('--gpu', dest='GPU', help='gpu id', type=str, default=None)\n    parser.add_argument('--candidate_num', dest='CANDIDATE_NUM', help='topk candidates', type=int, default=None)\n    parser.add_argument('--example_num', dest='EXAMPLE_NUM', help='number of most similar examples to be searched, default: 200', type=int, default=None)\n\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser(description='Parameters for heuristics generation')\n    heuristics_login_args(parser)\n    args = parser.parse_args()\n    __C = Cfgs(args)\n    with open(args.cfg_file, 'r') as f:\n        yaml_dict = yaml.load(f, Loader=yaml.FullLoader)\n    __C.override_from_dict(yaml_dict)\n    print(__C)\n    runner = Runner(__C)\n    runner.run()\n"
  },
  {
    "path": "prophet/stage1/model/layers.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: basic layers & blocks of MCAN\n# ------------------------------------------------------------------------------ #\n\nimport torch\nfrom torch import nn\nfrom torch.nn import functional as F\nimport math\n\nfrom .net_utils import *\nfrom .rope2d import RoPE2d\n\nclass AttFlat(nn.Module):\n    def __init__(self, __C):\n        super(AttFlat, self).__init__()\n        self.__C = __C\n\n        self.mlp = MLP(\n            in_size=__C.HIDDEN_SIZE,\n            mid_size=__C.FLAT_MLP_SIZE,\n            out_size=__C.FLAT_GLIMPSES,\n            dropout_r=__C.DROPOUT_R,\n            use_relu=True\n        )\n\n        self.linear_merge = nn.Linear(\n            __C.HIDDEN_SIZE * __C.FLAT_GLIMPSES,\n            __C.FLAT_OUT_SIZE\n        )\n\n    def forward(self, x, x_mask):\n        att = self.mlp(x)\n        if x_mask is not None:\n            att = att.masked_fill(\n                x_mask.squeeze(1).squeeze(1).unsqueeze(2),\n                -1e9\n            )\n        att = F.softmax(att, dim=1)\n\n        att_list = []\n        for i in range(self.__C.FLAT_GLIMPSES):\n            att_list.append(\n                torch.sum(att[:, :, i: i + 1] * x, dim=1)\n            )\n\n        x_atted = torch.cat(att_list, dim=1)\n        x_atted = self.linear_merge(x_atted)\n\n        return x_atted\n\n\nclass MHAtt(nn.Module):\n    def __init__(self, __C):\n        super().__init__()\n        self.__C = __C\n        self.n_head = __C.MULTI_HEAD\n        self.external_dim = __C.HIDDEN_SIZE\n        self.internal_dim = __C.HIDDEN_SIZE // self.n_head\n\n        self.linear_v = nn.Linear(self.external_dim, self.external_dim, bias=False)\n        self.linear_k = nn.Linear(self.external_dim, self.external_dim)\n        self.linear_q = nn.Linear(self.external_dim, self.external_dim)\n        self.linear_merge = 
nn.Linear(self.external_dim, self.external_dim)\n\n        self.dropout = nn.Dropout(__C.DROPOUT_R)\n\n    def forward(self, v, k, q, mask):\n        n_batches = q.size(0)\n\n        v = self.linear_v(v).view(\n            n_batches, -1, self.n_head, self.internal_dim\n        ).transpose(1, 2)\n\n        k = self.linear_k(k).view(\n            n_batches, -1, self.n_head, self.internal_dim\n        ).transpose(1, 2)\n\n        q = self.linear_q(q).view(\n            n_batches, -1, self.n_head, self.internal_dim\n        ).transpose(1, 2)\n\n        atted = self.att(v, k, q, mask)\n        atted = atted.transpose(1, 2).contiguous().view(\n            n_batches, -1, self.external_dim\n        )\n        atted = self.linear_merge(atted)\n\n        return atted\n\n    def att(self, value, key, query, mask):\n        d_k = query.size(-1)\n\n        scores = torch.matmul(\n            query, key.transpose(-2, -1)\n        ) / math.sqrt(d_k)\n\n        if mask is not None:\n            scores = scores.masked_fill(mask, -1e9)\n\n        att_map = F.softmax(scores, dim=-1)\n        att_map = self.dropout(att_map)\n\n        return torch.matmul(att_map, value)\n\n\nclass SA_v(nn.Module):\n    def __init__(self, __C):\n        super().__init__()\n        self.__C = __C\n        self.n_head = __C.MULTI_HEAD\n        self.external_dim = __C.HIDDEN_SIZE\n        self.internal_dim = __C.HIDDEN_SIZE // self.n_head\n\n        self.linear_v = nn.Linear(self.external_dim, self.external_dim, bias=False)\n        self.linear_k = nn.Linear(self.external_dim, self.external_dim)\n        self.linear_q = nn.Linear(self.external_dim, self.external_dim)\n        self.linear_merge = nn.Linear(self.external_dim, self.external_dim)\n\n        self.dropout = nn.Dropout(__C.DROPOUT_R)\n\n\n        self.dropout1 = nn.Dropout(__C.DROPOUT_R)\n        self.norm1 = nn.LayerNorm(__C.HIDDEN_SIZE)\n        self.rope = RoPE2d(self.internal_dim, __C.IMG_FEAT_GRID)\n\n    def forward(self, *args):\n        
x, *_ = args\n        n_batches = x.size(0)\n\n        v = self.linear_v(x).view(\n            n_batches, -1, self.n_head, self.internal_dim\n        ).transpose(1, 2)\n\n        k = self.linear_k(x).view(\n            n_batches, -1, self.n_head, self.internal_dim\n        ).transpose(1, 2)\n\n        q = self.linear_q(x).view(\n            n_batches, -1, self.n_head, self.internal_dim\n        ).transpose(1, 2)\n\n        q, k = self.rope(q, k)\n\n        atted = self.att(v, k, q, None)\n        atted = atted.transpose(1, 2).contiguous().view(\n            n_batches, -1, self.external_dim\n        )\n        atted = self.linear_merge(atted)\n\n        x = self.norm1(x + self.dropout1(atted))\n\n        return x\n\n    def att(self, value, key, query, mask):\n        d_k = query.size(-1)\n\n        scores = torch.matmul(\n            query, key.transpose(-2, -1)\n        ) / math.sqrt(d_k)\n\n        if mask is not None:\n            scores = scores.masked_fill(mask, -1e9)\n\n        att_map = F.softmax(scores, dim=-1)\n        att_map = self.dropout(att_map)\n\n        return torch.matmul(att_map, value)\n\n\nclass FFN(nn.Module):\n    def __init__(self, __C):\n        super(FFN, self).__init__()\n\n        self.mlp = MLP(\n            in_size=__C.HIDDEN_SIZE,\n            mid_size=__C.FF_SIZE,\n            out_size=__C.HIDDEN_SIZE,\n            dropout_r=__C.DROPOUT_R,\n            use_relu=True\n        )\n        self.dropout1 = nn.Dropout(__C.DROPOUT_R)\n        self.norm1 = nn.LayerNorm(__C.HIDDEN_SIZE)\n\n    def forward(self, x, *args):\n        x = self.norm1(x + self.dropout1(\n            self.mlp(x)\n        ))\n        return x\n\n\nclass SA(nn.Module):\n    def __init__(self, __C):\n        super(SA, self).__init__()\n\n        self.mhatt = MHAtt(__C)\n\n        self.dropout1 = nn.Dropout(__C.DROPOUT_R)\n        self.norm1 = nn.LayerNorm(__C.HIDDEN_SIZE)\n\n    def forward(self, x, x_mask, *args):\n        x = self.norm1(x + self.dropout1(\n           
 self.mhatt(x, x, x, x_mask)\n        ))\n\n        return x\n\n\nclass GA(nn.Module):\n    def __init__(self, __C):\n        super().__init__()\n\n        self.mhatt1 = MHAtt(__C)\n\n        self.dropout1 = nn.Dropout(__C.DROPOUT_R)\n        self.norm1 = nn.LayerNorm(__C.HIDDEN_SIZE)\n\n    def forward(self, x, y, x_mask, y_mask, *args):\n\n        x = self.norm1(x + self.dropout1(\n            self.mhatt1(y, y, x, y_mask)\n        ))\n\n        return x"
  },
  {
    "path": "prophet/stage1/model/mcan.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: the definition of the improved MCAN\n# ------------------------------------------------------------------------------ #\n\nimport torch\nfrom torch import nn\nfrom torch.nn import functional as F\nimport math\nfrom transformers import AutoModel, logging\nlogging.set_verbosity_error()\n\nfrom .net_utils import *\nfrom .layers import *\n\n\nclass MCA_ED(nn.Module):\n    \"\"\"\n    The definition of the encoder-decoder backbone of MCAN.\n    \"\"\"\n    def __init__(self, __C):\n        super(MCA_ED, self).__init__()\n\n        enc = __C.ARCH_CEIL['enc'] * __C.LAYER\n        dec = __C.ARCH_CEIL['dec'] * __C.LAYER\n        self.enc_list = nn.ModuleList([eval(layer)(__C) for layer in enc])\n        self.dec_list = nn.ModuleList([eval(layer)(__C) for layer in dec])\n\n    def forward(self, x, y, x_mask, y_mask):\n        for enc in self.enc_list:\n            x = enc(x, x_mask)\n\n        for dec in self.dec_list:\n            y = dec(y, x, y_mask, x_mask)\n\n        return x, y\n\n\n\nclass MCAN(nn.Module):\n    \"\"\"\n    The definition of the complete network of the improved MCAN, mainly includes:\n    1. A pretrained BERT model used to encode questions (already represented as tokens)\n    2. A linear layer to project CLIP vision features (extracted beforehand, so the CLIP\n        model is not included) to a common embedding space\n    3. An encoder-decoder backbone to fuse question and image features in depth\n    4. 
A classifier head based on `AttFlat`\n    \"\"\"\n    def __init__(self, __C, answer_size):\n        super().__init__()\n\n        # answer_size = trainset.ans_size\n\n        self.__C = __C\n\n        self.bert = AutoModel.from_pretrained(__C.BERT_VERSION)\n\n        # self.clip_visual = trainset.clip_model.visual\n        # self.clip_visual.layer4 = Identity()\n        # self.clip_visual.float()\n\n        # for p in self.clip_visual.parameters():\n        #     p.requires_grad = False\n\n        self.img_feat_linear = nn.Sequential(\n            nn.Linear(__C.IMG_FEAT_SIZE, __C.HIDDEN_SIZE, bias=False),\n        )\n        self.lang_adapt = nn.Sequential(\n            nn.Linear(__C.LANG_FEAT_SIZE, __C.HIDDEN_SIZE),\n            nn.Tanh(),\n        )\n\n        self.backbone = MCA_ED(__C)\n        self.attflat_img = AttFlat(__C)\n        self.attflat_lang = AttFlat(__C)\n        self.proj_norm = nn.LayerNorm(__C.FLAT_OUT_SIZE)\n        self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)\n\n    def forward(self, input_tuple, output_answer_latent=False):\n        img_feat, ques_ix = input_tuple\n\n        # Make mask\n        lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))\n        img_feat_mask = None#self.make_mask(img_feat)\n\n        # Pre-process Language Feature\n        lang_feat = self.bert(\n            ques_ix, \n            attention_mask= ~lang_feat_mask.squeeze(1).squeeze(1)\n        )[0]\n        lang_feat = self.lang_adapt(lang_feat)\n\n        # Pre-process Image Feature\n        img_feat = self.img_feat_linear(img_feat)\n\n\n        # Backbone Framework\n        # img_feat = flatten(img_feat)\n        lang_feat, img_feat = self.backbone(\n            lang_feat,\n            img_feat,\n            lang_feat_mask,\n            img_feat_mask\n        )\n        lang_feat = self.attflat_lang(\n            lang_feat,\n            lang_feat_mask\n        )\n        img_feat = self.attflat_img(\n            img_feat,\n            img_feat_mask\n   
     )\n\n        proj_feat = lang_feat + img_feat\n        answer_latent = self.proj_norm(proj_feat)\n        proj_feat = self.proj(answer_latent)\n\n        if output_answer_latent:\n            return proj_feat, answer_latent\n\n        return proj_feat\n\n    # Masking\n    def make_mask(self, feature):\n        return (torch.sum(\n            torch.abs(feature),\n            dim=-1\n        ) == 0).unsqueeze(1).unsqueeze(2)\n"
  },
  {
    "path": "prophet/stage1/model/mcan_for_finetune.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: the definition of A wrapper of MCAN for finetuning with the \n# strategy described in the paper.\n# ------------------------------------------------------------------------------ #\n\nimport torch\nfrom torch import nn\nfrom torch.nn import functional as F\n\nfrom .mcan import *\n\n\nclass MCANForFinetune(MCAN):\n    \"\"\"\n    A wrapper of MCAN for finetuning with the strategy described \n    in the paper. We inherit the parameters of existing answers \n    and append new parameters for the new answers.\n    \"\"\"\n    def __init__(self, __C, answer_size, base_answer_size=3129):\n        super().__init__(__C, base_answer_size)\n\n        self.proj1 = nn.Linear(__C.FLAT_OUT_SIZE, answer_size - base_answer_size)\n\n    @torch.no_grad()\n    def parameter_init(self):\n        self.proj1.weight.data.zero_()\n        self.proj1.bias.data = self.proj.bias.data.mean() + torch.zeros(self.proj1.bias.data.shape)\n\n    def forward(self, input_tuple, output_answer_latent=False):\n        proj_feat, answer_latent = super().forward(input_tuple, output_answer_latent=True)\n        proj_feat = torch.cat([\n            proj_feat,\n            self.proj1(answer_latent)\n        ], dim=1)\n        \n        if output_answer_latent:\n            return proj_feat, answer_latent\n\n        return proj_feat\n"
  },
  {
    "path": "prophet/stage1/model/net_utils.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: Utilities for layer definitions\n# ------------------------------------------------------------------------------ #\n\nfrom torch import nn\nimport math\n\nclass FC(nn.Module):\n    def __init__(self, in_size, out_size, dropout_r=0., use_relu=True):\n        super(FC, self).__init__()\n        self.dropout_r = dropout_r\n        self.use_relu = use_relu\n\n        self.linear = nn.Linear(in_size, out_size)\n\n        if use_relu:\n            self.relu = nn.ReLU(inplace=True)\n\n        if dropout_r > 0:\n            self.dropout = nn.Dropout(dropout_r)\n\n    def forward(self, x):\n        x = self.linear(x)\n\n        if self.use_relu:\n            x = self.relu(x)\n\n        if self.dropout_r > 0:\n            x = self.dropout(x)\n\n        return x\n\n\nclass MLP(nn.Module):\n    def __init__(self, in_size, mid_size, out_size, dropout_r=0., use_relu=True):\n        super(MLP, self).__init__()\n\n        self.fc = FC(in_size, mid_size, dropout_r=dropout_r, use_relu=use_relu)\n        self.linear = nn.Linear(mid_size, out_size)\n\n    def forward(self, x):\n        return self.linear(self.fc(x))\n\n\ndef flatten(x):\n    x = x.view(x.shape[0], x.shape[1], -1)\\\n        .permute(0, 2, 1).contiguous()\n    return x\n\n\ndef unflatten(x, shape):\n    x = x.permute(0, 2, 1).contiguous()\\\n        .view(x.shape[0], -1, shape[0], shape[1])\n    return x\n\n\nclass Identity(nn.Module):\n    def __init__(self):\n        super().__init__()\n    \n    def forward(self, x):\n        return x\n"
  },
  {
    "path": "prophet/stage1/model/rope2d.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: A 2D version of rotary positional embeddings \n# (https://arxiv.org/abs/2104.09864).\n# ------------------------------------------------------------------------------ #\n\n\nimport math\nimport torch\nimport torch.nn.functional as F\nfrom torch import nn\n# from einops import rearrange, repeat\n\ndef rotate_every_two(x):\n    shape = x.shape\n    # x = rearrange(x, '... (d j) -> ... d j', j = 2)\n    # x1, x2 = x.unbind(dim = -1)\n    x = x.view(*shape[:-1], -1, 2)[..., [1, 0]]\n    x = x.view(*shape)\n    return x\n\ndef apply_rotary_pos_emb(q, k, sinu_pos):\n    sin, cos = sinu_pos\n    q, k = map(lambda t: (t * cos) + (rotate_every_two(t) * sin), (q, k))\n    return q, k\n\n# rotary embeddings for 2d position\nclass RoPE2d(nn.Module):\n    def __init__(self, in_dim, size):\n        super().__init__()\n        dim = in_dim // 2\n        inv_freq = 1. 
/ (40 ** (torch.arange(0, dim, 2).float() / dim))\n        position = torch.arange(0, size, dtype=torch.float)\n        sinusoid_inp = torch.einsum(\"i,j->ij\", position, inv_freq)\n        _sin = sinusoid_inp.sin()\n        _cos = sinusoid_inp.cos()\n        _sin, _cos = map(\n            lambda x: x.unsqueeze(-1).repeat(1, 1, 2),\n            (_sin, _cos)\n        )\n        _sin[..., 0] = -_sin[..., 0]\n        _sin, _cos = map(lambda x: x.view(*x.shape[:-2], -1), (_sin, _cos))\n        _sin, _cos = map(\n            lambda x: torch.cat([\n                x.unsqueeze(0).repeat(size, 1, 1),\n                x.unsqueeze(1).repeat(1, size, 1)\n            ], dim=-1).view(-1, in_dim),\n            (_sin, _cos)\n        )\n        self.register_buffer('sin', _sin)\n        self.register_buffer('cos', _cos)\n\n    def forward(self, k, q):\n        q, k = apply_rotary_pos_emb(q, k, (self.sin, self.cos))\n        return q, k\n\nif __name__ == '__main__':\n    rope = RoPE2d(512, size=4)\n    q = torch.randn(1, 16, 512)\n    k = torch.randn(1, 16, 512)\n    q, k = rope(q, k)\n    print(q.shape, k.shape)\n    \n"
  },
  {
    "path": "prophet/stage1/pretrain.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: Runner that handles the pretraining process\n# ------------------------------------------------------------------------------ #\n\nimport os, sys\n# sys.path.append(os.getcwd())\n\nfrom datetime import datetime\nimport pickle, random, math, time\nimport json\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.utils.data as Data\nimport argparse\nfrom pathlib import Path\nfrom copy import deepcopy\nimport yaml\n\nfrom configs.task_cfgs import Cfgs\nfrom .utils.load_data import CommonData, DataSet\nfrom .model.mcan import MCAN\nfrom .utils.optim import get_optim\n\nclass Runner(object):\n    def __init__(self, __C, *args, **kwargs):\n        self.__C = __C\n\n    def train(self, train_set, eval_set=None):\n        data_size = train_set.data_size\n\n        # Define the MCAN model\n        net = MCAN(self.__C, train_set.ans_size)\n\n        # Define the optimizer\n        # Load checkpoint if resume training\n        if self.__C.RESUME:\n            print(' ========== Resume training')\n\n            path = self.__C.RESUME_PATH\n\n            # Load the network parameters\n            print('Loading ckpt {}'.format(path))\n            ckpt = torch.load(path, map_location='cpu')\n            print('Finish loading.')\n            net.load_state_dict(ckpt['state_dict'])\n\n            # Load the optimizer paramters\n            optim = get_optim(self.__C, net)\n            optim.warmup_lr_scale = ckpt['warmup_lr_scale']\n            optim.decay_lr_scale = ckpt['decay_lr_scale']\n            optim.optimizer.load_state_dict(ckpt['optimizer'])\n            start_epoch = self.__C.CKPT_EPOCH\n\n        else:\n            optim = get_optim(self.__C, net)\n            start_epoch = 0\n\n        # load to gpu\n        net.cuda()\n        # Define the multi-gpu 
training if needed\n        if self.__C.N_GPU > 1:\n            net = nn.DataParallel(net, device_ids=self.__C.GPU_IDS)\n\n        # Define the binary cross entropy loss\n        loss_fn = torch.nn.BCEWithLogitsLoss(reduction='sum')\n        epoch_loss = 0\n\n        # Define multi-thread dataloader\n        dataloader = Data.DataLoader(\n            train_set,\n            batch_size=self.__C.BATCH_SIZE,\n            shuffle=True,\n            num_workers=self.__C.NUM_WORKERS,\n            pin_memory=self.__C.PIN_MEM,\n            drop_last=True\n        )\n\n        # Training script\n        for epoch in range(start_epoch, self.__C.MAX_EPOCH):\n            net.train()\n            # Save log information\n            with open(self.__C.LOG_PATH, 'a+') as logfile:\n                logfile.write(\n                    f'nowTime: {datetime.now():%Y-%m-%d %H:%M:%S}\\n'\n                )\n\n            time_start = time.time()\n\n            # Iteration\n            for step, input_tuple in enumerate(dataloader):\n                iteration_loss = 0\n                optim.zero_grad()\n                input_tuple = [x.cuda() for x in input_tuple]\n                SUB_BATCH_SIZE = self.__C.BATCH_SIZE // self.__C.GRAD_ACCU_STEPS\n                for accu_step in range(self.__C.GRAD_ACCU_STEPS):\n\n                    sub_tuple = [x[accu_step * SUB_BATCH_SIZE:\n                        (accu_step + 1) * SUB_BATCH_SIZE] for x in input_tuple]\n                    \n                    sub_ans_iter = sub_tuple[-1]\n                    pred = net(sub_tuple[:-1])\n                    loss = loss_fn(pred, sub_ans_iter)\n                    loss.backward()\n                    loss_item = loss.item()\n                    iteration_loss += loss_item\n                    epoch_loss += loss_item# * self.__C.GRAD_ACCU_STEPS\n\n                print(\"\\r[version %s][epoch %2d][step %4d/%4d][Task %s][Mode %s] loss: %.4f, lr: %.2e\" % (\n                    self.__C.VERSION,\n           
         epoch + 1,\n                    step,\n                    int(data_size / self.__C.BATCH_SIZE),\n                    self.__C.TASK,\n                    self.__C.RUN_MODE,\n                    iteration_loss / self.__C.BATCH_SIZE,\n                    optim.current_lr(),\n                ), end='          ')\n\n                optim.step()\n\n            time_end = time.time()\n            print('Finished in {}s'.format(int(time_end - time_start)))\n\n            # Logging\n            with open(self.__C.LOG_PATH, 'a+') as logfile:\n                logfile.write(f'epoch = {epoch + 1}  loss = {epoch_loss / data_size}\\nlr = {optim.current_lr()}\\n\\n')\n            \n            optim.schedule_step(epoch)\n\n            # Save checkpoint\n            state = {\n                'state_dict': net.state_dict() if self.__C.N_GPU == 1 \\\n                    else net.module.state_dict(),\n                'optimizer': optim.optimizer.state_dict(),\n                'warmup_lr_scale': optim.warmup_lr_scale,\n                'decay_lr_scale': optim.decay_lr_scale,\n            }\n            torch.save(\n                state,\n                f'{self.__C.CKPTS_DIR}/epoch{epoch + 1}.pkl'\n            )\n\n            epoch_loss = 0\n\n    def run(self):\n        # Set ckpts and log path\n        Path(self.__C.CKPTS_DIR).mkdir(parents=True, exist_ok=True)\n        Path(self.__C.LOG_PATH).parent.mkdir(parents=True, exist_ok=True)\n        with open(self.__C.LOG_PATH, 'w') as f:\n            f.write(str(self.__C) + '\\n')\n        \n        common_data = CommonData(self.__C)\n        train_set = DataSet(\n            self.__C, \n            common_data,\n            self.__C.TRAIN_SPLITS\n        )\n        valid_set = None\n        self.train(train_set, valid_set)\n\ndef pretrain_login_args(parser):\n    parser.add_argument('--task', dest='TASK', help='task name, e.g., ok, aok_val, aok_test', type=str, required=True)\n    parser.add_argument('--cfg', dest='cfg_file', 
help='optional config file', type=str, required=True)\n    parser.add_argument('--version', dest='VERSION', help='version name', type=str, required=True)\n    parser.add_argument('--resume', dest='RESUME', help='resume training', type=bool, default=False)\n    parser.add_argument('--resume_version', dest='RESUME_VERSION', help='checkpoint version name', type=str, default=None)\n    parser.add_argument('--resume_epoch', dest='RESUME_EPOCH', help='checkpoint epoch', type=int, default=None)\n    parser.add_argument('--resume_path', dest='RESUME_PATH', help='checkpoint path', type=str, default=None)\n    parser.add_argument('--gpu', dest='GPU', help='gpu id', type=str, default=None)\n    parser.add_argument('--seed', dest='SEED', help='random seed', type=int, default=None)\n    parser.add_argument('--grad_accu', dest='GRAD_ACCU_STEPS', help='gradient accumulation steps', type=int, default=None)\n\n\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser(description='Parameters for pretraining')\n    pretrain_login_args(parser)\n    args = parser.parse_args()\n    __C = Cfgs(args)\n    with open(args.cfg_file, 'r') as f:\n        yaml_dict = yaml.load(f, Loader=yaml.FullLoader)\n    __C.override_from_dict(yaml_dict)\n    print(__C)\n    runner = Runner(__C)\n    runner.run()\n"
  },
  {
    "path": "prophet/stage1/utils/load_data.py",
    "content": "# --------------------------------------------------------------------------------- #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: Data loading and preprocessing. Note that for the sake of simplicity,\n#              the code only supports the following datasets for now:\n#              * VQA 2.0\n#              * OK-VQA\n#              * A-OKVQA\n#              Transferring to other datasets is easy. You may need to modify a few \n#              line of code in this file.\n# --------------------------------------------------------------------------------- #\n\nimport numpy as np\nimport glob, json, pickle, random\nimport torch\nimport torch.utils.data as Data\nfrom transformers import AutoTokenizer\n\nfrom evaluation.ans_punct import prep_ans\n# from .transforms import _transform\n\n\ndef soft_target(answers, ans_to_ix, preprocess=True):\n    ans_score = np.zeros(ans_to_ix.__len__(), np.float32)\n    for ans in answers:\n        if preprocess:\n            ans = prep_ans(ans)\n        if ans in ans_to_ix:\n            ans_score[ans_to_ix[ans]] = min(1.0, ans_score[ans_to_ix[ans]] + 0.3)\n    return ans_score\n\n\nclass CommonData:\n    \"\"\"\n    load common data for all dataset objects:\n    * imgid_to_path\n    * bert tokenizer\n    * ans_to_ix, ix_to_ans\n    \"\"\"\n    def __init__(self, __C) -> None:\n        print('Loading common data...')\n        \n        # load imgid_to_path\n        self.img_feat_path_list = []\n        for split in __C.FEATURE_SPLIT:\n            feats_dir = __C.FEATS_DIR[split]\n            self.img_feat_path_list += glob.glob(feats_dir + '*.npz')\n        self.imgid_to_path = {}\n        for feat_path in self.img_feat_path_list:\n            img_id = int(feat_path.split('/')[-1].split('_')[-1].split('.')[0])\n            self.imgid_to_path[img_id] = feat_path\n        # self.preprocess = _transform(__C.RESOLUTION)\n        print(f'== Total image number: {len(self.imgid_to_path)}')\n\n       
 # load bert tokenizer\n        self.tokenizer = AutoTokenizer.from_pretrained(__C.BERT_VERSION)\n        self.token_size = self.tokenizer.vocab_size\n        print(f'== BertTokenizer loaded, vocab size: {self.token_size}')\n\n        # load ans_to_ix, ix_to_ans\n        ans_dict_path = __C.ANSWER_DICT_PATH[__C.DATA_TAG]\n        self.ix_to_ans = json.load(open(ans_dict_path, 'r'))\n        self.ans_to_ix = {ans: ix for ix, ans in enumerate(self.ix_to_ans)}\n        self.ans_size = len(self.ans_to_ix)\n        print(f'== Answer vocab size: {self.ans_size}')\n\n        print('Common data process is done.\\n')\n        \n\nclass DataSet(Data.Dataset):\n    def __init__(self, __C, common_data, split_name_list):\n        self.__C = __C\n        print(f'Loading dataset for {self.__C.TASK}|{self.__C.RUN_MODE}({split_name_list})')\n        self.split_name_list = split_name_list\n\n        # load all attributes from common data\n        self.imgid_to_path = common_data.imgid_to_path\n        self.tokenizer = common_data.tokenizer\n        self.token_size = common_data.token_size\n        self.ans_to_ix = common_data.ans_to_ix\n        self.ix_to_ans = common_data.ix_to_ans\n        self.ans_size = common_data.ans_size\n\n        # Loading question and answer list\n        self.ques_list = []\n        self.ans_list = []\n\n        for split_name in split_name_list:\n            ques_list = json.load(open(__C.QUESTION_PATH[split_name], 'r'))\n            if 'questions' in ques_list:\n                ques_list = ques_list['questions']\n            self.ques_list += ques_list\n            if split_name in __C.ANSWER_PATH:\n                ans_list = json.load(open(__C.ANSWER_PATH[split_name], 'r'))\n                if 'annotations' in ans_list:\n                    ans_list = ans_list['annotations']\n                self.ans_list += ans_list\n\n        # indexing data, note that all question_id is set to str,\n        # and all image_id is set to int\n        if 
len(self.ans_list) == len(self.ques_list):\n            self.annotated = True\n            self.qids = [str(ans['question_id']) for ans in self.ans_list]\n        elif len(self.ans_list) < len(self.ques_list):\n            self.annotated = False\n            self.qids = [str(ques['question_id']) for ques in self.ques_list]\n        else:\n            raise ValueError('Answer list is longer than question list!')\n\n        self.data_size = len(self.qids)\n        print(f'== data size: {self.data_size}\\n')\n\n        self.qid_to_ques = {str(ques['question_id']): ques for ques in self.ques_list}\n        self.qid_to_ans = {str(ans['question_id']): ans for ans in self.ans_list}\n\n\n    def __getitem__(self, idx):\n        # get question in token ids, image in features,\n        # and answer in binary-label vector\n\n        __C = self.__C\n\n        # For code safety\n        img_feat  = np.zeros(1)\n        ques_ids  = np.zeros(1)\n        ans_vec   = np.zeros(1)\n\n        qid = self.qids[idx]\n        ques_info = self.qid_to_ques[qid]\n        \n        # Process question\n        ques_str = ques_info['question']\n        ques_ids = self.bert_tokenize(ques_str, __C.MAX_TOKEN)\n\n        # Process image feature\n        img_id = int(ques_info['image_id'])\n        img_feat = np.load(self.imgid_to_path[img_id])['x']\n        assert img_feat.shape == (__C.IMG_FEAT_GRID, __C.IMG_FEAT_GRID, __C.IMG_FEAT_SIZE)\n        img_feat = img_feat.reshape(-1, __C.IMG_FEAT_SIZE)\n\n        # Process answer\n        # The code is compatible with VQA v2, OK-VQA, and A-OKVQA.\n        # It is no guarantee that it works for other datasets. 
If\n        # you want to use other datasets, please modify following\n        # code to fit your dataset.\n        if self.annotated:\n            ans_info = self.qid_to_ans[qid]\n            if 'answers' in ans_info:\n                ans_list = [ans['answer'] for ans in ans_info['answers']]\n            elif 'direct_answers' in ans_info:\n                ans_list = ans_info['direct_answers']\n            else:\n                raise ValueError('Error: annotation format is not supported!')\n            assert type(ans_list[0]) == str, 'Error: answer format is not supported!'\n            ans_vec = soft_target(ans_list, self.ans_to_ix)\n\n        return  torch.tensor(img_feat, dtype=torch.float), \\\n                torch.tensor(ques_ids, dtype=torch.long), \\\n                torch.tensor(ans_vec, dtype=torch.float)\n\n\n    def __len__(self):\n        return self.data_size\n\n    def bert_tokenize(self, text, max_token):\n        text = text.lower().replace('?', '')\n        tokens = self.tokenizer.tokenize(text)\n        if len(tokens) > max_token - 2:\n            tokens = tokens[:max_token-2]\n        tokens = ['[CLS]'] + tokens + ['[SEP]']\n        ids = self.tokenizer.convert_tokens_to_ids(tokens)\n        ids = ids + [0] * (max_token - len(ids))\n        ids = np.array(ids, np.int64)\n\n        return ids"
  },
  {
    "path": "prophet/stage1/utils/optim.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: Utilities for optimization\n# ------------------------------------------------------------------------------ #\n\nimport torch\nimport torch.optim as Optim\nfrom torch.nn.utils import clip_grad_norm_\n\nclass OptimizerWrapper(object):\n    \"\"\"\n    A Wrapper for optimizer to support learning rate warmup and decay.\n    It also support multiple optimizers and switching at different steps.\n    \"\"\"\n    def __init__(self, optimizers, \n                 warmup_schd_steps,\n                 decay_schd_step_list,\n                 decay_rate, \n                 cur_schd_step=-1,\n                 change_optim_step_list=None\n        ):\n        self.optimizer_list = optimizers\n        self.groups_lr_list = []\n        for _optim in self.optimizer_list:\n            self.groups_lr_list.append([])\n            for group in _optim.param_groups:\n                self.groups_lr_list[-1].append(group['lr'])\n        self.curr_optim_id = 0\n        self.optimizer = self.optimizer_list[self.curr_optim_id]\n        self.change_optim_step_list = change_optim_step_list\n        # self.total_schd_steps = total_schd_steps\n        self.warmup_schd_steps = warmup_schd_steps\n        self.decay_schd_step_list = decay_schd_step_list\n        self.decay_rate = decay_rate\n        self._step = 0\n        self.warmup_lr_scale = 1.0\n        self.decay_lr_scale = 1.0\n        self.schedule_step(cur_schd_step)\n\n    def zero_grad(self):\n        self.optimizer.zero_grad()\n\n    def step(self, step=None, schd_step=False):\n        if step is None:\n            step = self._step\n        if schd_step:\n            self.schedule_step(step)\n        \n        for group in self.optimizer.param_groups:\n            if '_grad_norm_clip' in group:\n                if group['_grad_norm_clip'] > 0:\n                    
clip_grad_norm_(group['params'], group['_grad_norm_clip'])\n        \n        self.optimizer.step()\n        self._step += 1\n    \n    def schedule_step(self, schd_step):\n        schd_step += 1\n        self.warmup_lr_scale = min(1., float(schd_step + 1) / float(self.warmup_schd_steps + 1))\n        if schd_step in self.decay_schd_step_list:\n            self.decay_lr_scale = self.decay_lr_scale * self.decay_rate\n        lr_scale = self.warmup_lr_scale * self.decay_lr_scale\n        # lr actually changes in following lines\n        if self.change_optim_step_list is not None:\n            if schd_step in self.change_optim_step_list:\n                self.curr_optim_id += 1\n                self.optimizer = self.optimizer_list[self.curr_optim_id]\n        for i, group in enumerate(self.optimizer.param_groups):\n            group['lr'] = lr_scale * self.groups_lr_list[self.curr_optim_id][i]\n\n    def current_lr(self):\n        return self.optimizer.param_groups[0]['lr']\n\n    def state_dict(self):\n        return self.optimizer.state_dict()\n\n    def load_state_dict(self, state_dict):\n        self.optimizer.load_state_dict(state_dict)\n    \n\ndef get_optim(__C, model):\n    optim_class = eval('Optim.' 
+ __C.OPT)\n    params = [\n        {'params': [], 'lr': __C.LR_BASE, '_grad_norm_clip': __C.GRAD_NORM_CLIP},\n        {'params': [], 'lr': __C.LR_BASE * __C.BERT_LR_MULT, '_grad_norm_clip': __C.GRAD_NORM_CLIP},\n    ]\n    for name, param in model.named_parameters():\n        if param.requires_grad:\n            if 'bert' in name:\n                params[1]['params'].append(param)\n            else:\n                params[0]['params'].append(param)\n    hyper_params = {k: eval(v) for k, v in __C.OPT_PARAMS.items()}\n    return OptimizerWrapper(\n        [optim_class(\n            params,\n            **hyper_params\n        ),],\n        warmup_schd_steps=__C.WARMUP_EPOCH,\n        decay_schd_step_list=__C.LR_DECAY_LIST,\n        decay_rate=__C.LR_DECAY_R,\n    )\n\n\ndef get_optim_for_finetune(__C, model, new_params_name='proj1'):\n    # optimizer for finetuning warmup\n    optim_class1 = eval('Optim.' + __C.OPT_FTW)\n    params1 = []\n    for name, param in model.named_parameters():\n        if new_params_name in name and param.requires_grad:\n            params1.append(param)\n    hyper_params1 = {k: eval(v) for k, v in __C.OPT_PARAMS_FTW.items()}\n    optimizer1 = optim_class1(\n        params1,\n        lr=__C.LR_BASE_FTW,\n        **hyper_params1\n    )\n\n    optim_class2 = eval('Optim.' 
+ __C.OPT)\n    params2 = [\n        {'params': [], 'lr': __C.LR_BASE, '_grad_norm_clip': __C.GRAD_NORM_CLIP},\n        {'params': [], 'lr': __C.LR_BASE * __C.BERT_LR_MULT, '_grad_norm_clip': __C.GRAD_NORM_CLIP},\n    ]\n    for name, param in model.named_parameters():\n        if param.requires_grad:\n            if 'bert' in name:\n                params2[1]['params'].append(param)\n            else:\n                params2[0]['params'].append(param)\n    hyper_params2 = {k: eval(v) for k, v in __C.OPT_PARAMS.items()}\n    optimizer2 = optim_class2(\n        params2,\n        **hyper_params2\n    )\n    return OptimizerWrapper(\n        [optimizer1, optimizer2],\n        warmup_schd_steps=__C.WARMUP_EPOCH,\n        decay_schd_step_list=__C.LR_DECAY_LIST,\n        decay_rate=__C.LR_DECAY_R,\n        change_optim_step_list=[__C.EPOPH_FTW,]        \n    )\n"
  },
  {
    "path": "prophet/stage2/prompt.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: Runner that handles the prompting process\n# ------------------------------------------------------------------------------ #\n\nimport os, sys\n# sys.path.append(os.getcwd())\n\nimport pickle\nimport json, time\nimport math\nimport random\nimport argparse\nfrom datetime import datetime\nfrom copy import deepcopy\nimport yaml\nfrom pathlib import Path\nimport openai\n\nfrom .utils.fancy_pbar import progress, info_column\nfrom .utils.data_utils import Qid2Data\nfrom configs.task_cfgs import Cfgs\n\n\nclass Runner:\n    def __init__(self, __C, evaluater):\n        self.__C = __C\n        self.evaluater = evaluater\n        openai.api_key = __C.OPENAI_KEY\n    \n    def gpt3_infer(self, prompt_text, _retry=0):\n        # print(prompt_text)\n        # exponential backoff\n        if _retry > 0:\n            print('retrying...')\n            st = 2 ** _retry\n            time.sleep(st)\n        \n        if self.__C.DEBUG:\n            # print(prompt_text)\n            time.sleep(0.05)\n            return 0, 0\n\n        try:\n            # print('calling gpt3...')\n            response = openai.Completion.create(\n                engine=self.__C.MODEL,\n                prompt=prompt_text,\n                temperature=self.__C.TEMPERATURE,\n                max_tokens=self.__C.MAX_TOKENS,\n                logprobs=1,\n                stop=[\"\\n\", \"<|endoftext|>\"],\n                # timeout=20,\n            )\n            # print('gpt3 called.')\n        except Exception as e:\n            print(type(e), e)\n            if str(e) == 'You exceeded your current quota, please check your plan and billing details.':\n                exit(1)\n            return self.gpt3_infer(prompt_text, _retry + 1)\n\n        response_txt = response.choices[0].text.strip()\n        # print(response_txt)\n        plist = 
[]\n        for ii in range(len(response['choices'][0]['logprobs']['tokens'])):\n            if response['choices'][0]['logprobs']['tokens'][ii] in [\"\\n\", \"<|endoftext|>\"]:\n                break\n            plist.append(response['choices'][0]['logprobs']['token_logprobs'][ii])\n        prob = math.exp(sum(plist))\n        \n        return response_txt, prob\n    \n    def sample_make(self, ques, capt, cands, ans=None):\n        line_prefix = self.__C.LINE_PREFIX\n        cands = cands[:self.__C.K_CANDIDATES]\n        prompt_text = line_prefix + f'Context: {capt}\\n'\n        prompt_text += line_prefix + f'Question: {ques}\\n'\n        cands_with_conf = [f'{cand[\"answer\"]}({cand[\"confidence\"]:.2f})' for cand in cands]\n        cands = ', '.join(cands_with_conf)\n        prompt_text += line_prefix + f'Candidates: {cands}\\n'\n        prompt_text += line_prefix + 'Answer:'\n        if ans is not None:\n            prompt_text += f' {ans}'\n        return prompt_text\n\n    def get_context(self, example_qids):\n        # making context text for one testing input\n        prompt_text = self.__C.PROMPT_HEAD\n        examples = []\n        for key in example_qids:\n            ques = self.trainset.get_question(key)\n            caption = self.trainset.get_caption(key)\n            cands = self.trainset.get_topk_candidates(key)\n            gt_ans = self.trainset.get_most_answer(key)\n            examples.append((ques, caption, cands, gt_ans))\n            prompt_text += self.sample_make(ques, caption, cands, ans=gt_ans)\n            prompt_text += '\\n\\n'\n        return prompt_text\n    \n    def run(self):\n        ## where logs will be saved\n        Path(self.__C.LOG_PATH).parent.mkdir(parents=True, exist_ok=True)\n        with open(self.__C.LOG_PATH, 'w') as f:\n            f.write(str(self.__C) + '\\n')\n        ## where results will be saved\n        Path(self.__C.RESULT_DIR).mkdir(parents=True, exist_ok=True)\n        \n        self.cache = {}\n        
self.cache_file_path = os.path.join(\n            self.__C.RESULT_DIR,\n            'cache.json'\n        )\n        if self.__C.RESUME:\n            self.cache = json.load(open(self.cache_file_path, 'r'))\n        \n        print('Note that the accuracies printed before final evaluation (the last printed one) are rough, just for checking if the process is normal!!!\\n')\n        self.trainset = Qid2Data(\n            self.__C, \n            self.__C.TRAIN_SPLITS,\n            True\n        )\n        self.valset = Qid2Data(\n            self.__C, \n            self.__C.EVAL_SPLITS,\n            self.__C.EVAL_NOW,\n            json.load(open(self.__C.EXAMPLES_PATH, 'r'))\n        )\n\n        # if 'aok' in self.__C.TASK:\n        #     from evaluation.aokvqa_evaluate import AOKEvaluater as Evaluater\n        # else:\n        #     from evaluation.okvqa_evaluate import OKEvaluater as Evaluater\n        # evaluater = Evaluater(\n        #     self.valset.annotation_path,\n        #     self.valset.question_path\n        # )\n\n        infer_times = self.__C.T_INFER\n        N_inctx = self.__C.N_EXAMPLES\n        \n        print()\n\n        for qid in progress.track(self.valset.qid_to_data, description=\"Working...  
\"):\n            if qid in self.cache:\n                continue\n            ques = self.valset.get_question(qid)\n            caption = self.valset.get_caption(qid)\n            cands = self.valset.get_topk_candidates(qid, self.__C.K_CANDIDATES)\n\n            prompt_query = self.sample_make(ques, caption, cands)\n            example_qids = self.valset.get_similar_qids(qid, k=infer_times * N_inctx)\n            random.shuffle(example_qids)\n\n            prompt_info_list = []\n            ans_pool = {}\n            # multi-times infer\n            for t in range(infer_times):\n                # print(f'Infer {t}...')\n                prompt_in_ctx = self.get_context(example_qids[(N_inctx * t):(N_inctx * t + N_inctx)])\n                prompt_text = prompt_in_ctx + prompt_query\n                gen_text, gen_prob = self.gpt3_infer(prompt_text)\n\n                ans = self.evaluater.prep_ans(gen_text)\n                if ans != '':\n                    ans_pool[ans] = ans_pool.get(ans, 0.) 
+ gen_prob\n\n                prompt_info = {\n                    'prompt': prompt_text,\n                    'answer': gen_text,\n                    'confidence': gen_prob\n                }\n                prompt_info_list.append(prompt_info)\n                time.sleep(self.__C.SLEEP_PER_INFER)\n            \n            # vote\n            if len(ans_pool) == 0:\n                answer = self.valset.get_topk_candidates(qid, 1)[0]['answer']\n            else:\n                answer = sorted(ans_pool.items(), key=lambda x: x[1], reverse=True)[0][0]\n            \n            self.evaluater.add(qid, answer)\n            self.cache[qid] = {\n                'question_id': qid,\n                'answer': answer,\n                'prompt_info': prompt_info_list\n            }\n            json.dump(self.cache, open(self.cache_file_path, 'w'))\n\n            ll = len(self.cache)\n            if self.__C.EVAL_NOW and not self.__C.DEBUG:\n                if ll > 21 and ll % 10 == 0:\n                    rt_accuracy = self.valset.rt_evaluate(self.cache.values())\n                    info_column.info = f'Acc: {rt_accuracy}'\n\n        self.evaluater.save(self.__C.RESULT_PATH)\n        if self.__C.EVAL_NOW:\n            with open(self.__C.LOG_PATH, 'a+') as logfile:\n                self.evaluater.evaluate(logfile)\n        \ndef prompt_login_args(parser):\n    parser.add_argument('--debug', dest='DEBUG', help='debug mode', action='store_true')\n    parser.add_argument('--resume', dest='RESUME', help='resume previous run', action='store_true')\n    parser.add_argument('--task', dest='TASK', help='task name, e.g., ok, aok_val, aok_test', type=str, required=True)\n    parser.add_argument('--version', dest='VERSION', help='version name', type=str, required=True)\n    parser.add_argument('--cfg', dest='cfg_file', help='optional config file', type=str, default='configs/prompt.yml')\n    parser.add_argument('--examples_path', dest='EXAMPLES_PATH', help='answer-aware example 
file path, default: \"assets/answer_aware_examples_for_ok.json\"', type=str, default=None)\n    parser.add_argument('--candidates_path', dest='CANDIDATES_PATH', help='candidates file path, default: \"assets/candidates_for_ok.json\"', type=str, default=None)\n    parser.add_argument('--captions_path', dest='CAPTIONS_PATH', help='captions file path, default: \"assets/captions_for_ok.json\"', type=str, default=None)\n    parser.add_argument('--openai_key', dest='OPENAI_KEY', help='openai api key', type=str, default=None)\n\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser(description='Heuristics-enhanced Prompting')\n    prompt_login_args(parser)\n    args = parser.parse_args()\n    __C = Cfgs(args)\n    with open(args.cfg_file, 'r') as f:\n        yaml_dict = yaml.load(f, Loader=yaml.FullLoader)\n    __C.override_from_dict(yaml_dict)\n    print(__C)\n\n    runner = Runner(__C)\n    runner.run()\n"
  },
  {
    "path": "prophet/stage2/utils/data_utils.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: dataset utils for stage2\n# ------------------------------------------------------------------------------ #\n\nimport json\nfrom typing import Dict\nimport pickle\nfrom collections import Counter\n\n# following two score is rough, and only for print accuracies during inferring.\ndef ok_score(gt_answers):\n    gt_answers = [a['answer'] for a in gt_answers]\n    ans2cnt = Counter(gt_answers)\n    # sort\n    ans2cnt = sorted(ans2cnt.items(), key=lambda x: x[1], reverse=True)\n    ans2score = {}\n    for ans, cnt in ans2cnt:\n        # ans2score[ans] = min(1.0, cnt / 3.0)\n        if cnt == 1:\n            ans2score[ans] = 0.3\n        elif cnt == 2:\n            ans2score[ans] = 0.6\n        elif cnt == 3:\n            ans2score[ans] = 0.9\n        else:\n            ans2score[ans] = 1.0\n    return ans2score\n\ndef aok_score(gt_answers):\n    gt_answers = [a for a in gt_answers]\n    ans2cnt = Counter(gt_answers)\n    # sort\n    ans2cnt = sorted(ans2cnt.items(), key=lambda x: x[1], reverse=True)\n    ans2score = {}\n    for ans, cnt in ans2cnt:\n        # ans2score[ans] = min(1.0, cnt / 3.0)\n        if cnt == 1:\n            ans2score[ans] = 1 / 3.\n        elif cnt == 2:\n            ans2score[ans] = 2 / 3.\n        else:\n            ans2score[ans] = 1.\n    return ans2score\n\n\nclass Qid2Data(Dict):\n    def __init__(self, __C, splits, annotated=False, similar_examples=None):\n        super().__init__()\n\n        self.__C = __C\n        self.annotated = annotated\n        \n        ques_set = []\n        for split in splits:\n            split_path = self.__C.QUESTION_PATH[split]\n            _ques_set = json.load(open(split_path, 'r'))\n            if 'questions' in _ques_set:\n                _ques_set = _ques_set['questions']\n            ques_set += _ques_set\n        qid_to_ques = 
{str(q['question_id']): q for q in ques_set}\n\n        if annotated:\n            anno_set = []\n            for split in splits:\n                split_path = self.__C.ANSWER_PATH[split]\n                _anno_set = json.load(open(split_path, 'r'))\n                if 'annotations' in _anno_set:\n                    _anno_set = _anno_set['annotations']\n                anno_set += _anno_set\n            qid_to_anno = {str(a['question_id']): a for a in anno_set}\n        \n        qid_to_topk = json.load(open(__C.CANDIDATES_PATH))\n        # qid_to_topk = {t['question_id']: t for t in topk}\n\n        iid_to_capt = json.load(open(__C.CAPTIONS_PATH))\n        \n        _score = aok_score if 'aok' in __C.TASK else ok_score\n        \n        qid_to_data = {}\n        # ques_set = ques_set['questions']\n        # anno_set = anno_set['annotations']\n        for qid in qid_to_ques:\n            q_item = qid_to_ques[qid]\n            t_item = qid_to_topk[qid]\n\n            iid = str(q_item['image_id'])\n            caption = iid_to_capt[iid].strip()\n            if caption[-1] != '.':\n                caption += '.'\n            \n            qid_to_data[qid] = {\n                'question_id': qid,\n                'image_id': iid,\n                'question': q_item['question'],\n                # 'most_answer': most_answer,\n                # 'gt_scores': ans2score,\n                'topk_candidates': t_item,\n                'caption': caption,\n            }\n            if annotated:\n                a_item = qid_to_anno[qid]\n                if 'answers' in a_item:\n                    answers = a_item['answers']\n                else:\n                    answers = a_item['direct_answers']\n\n                ans2score = _score(answers)\n\n                most_answer = list(ans2score.keys())[0]\n                if most_answer == '':\n                    most_answer = list(ans2score.keys())[1]\n                \n                qid_to_data[qid]['most_answer'] = 
most_answer\n                qid_to_data[qid]['gt_scores'] = ans2score\n\n        self.qid_to_data = qid_to_data\n\n        k = __C.K_CANDIDATES\n        if annotated:\n            print(f'Loaded dataset size: {len(self.qid_to_data)}, top{k} accuracy: {self.topk_accuracy(k)*100:.2f}, top1 accuracy: {self.topk_accuracy(1)*100:.2f}')\n        \n        if similar_examples:\n            for qid in similar_examples:\n                qid_to_data[qid]['similar_qids'] = similar_examples[qid]\n            \n            # check if all items have similar_qids\n            for qid, item in self.items():\n                if 'similar_qids' not in item:\n                    raise ValueError(f'qid {qid} does not have similar_qids')\n        \n        \n\n    def __getitem__(self, __key):\n        return self.qid_to_data[__key]\n    \n\n    def get_caption(self, qid):\n        caption = self[qid]['caption']\n        # if with_tag:\n        #     tags = self.get_tags(qid, k_tags)\n        #     caption += ' ' + ', '.join(tags) + '.'\n        return caption\n    \n    def get_question(self, qid):\n        return self[qid]['question']\n    \n    \n    def get_gt_answers(self, qid):\n        if not self.annotated:\n            return None\n        return self[qid]['gt_scores']\n    \n    def get_most_answer(self, qid):\n        if not self.annotated:\n            return None\n        return self[qid]['most_answer']\n\n    def get_topk_candidates(self, qid, k=None):\n        if k is None:\n            return self[qid]['topk_candidates']\n        else:\n            return self[qid]['topk_candidates'][:k]\n    \n    def get_similar_qids(self, qid, k=None):\n        similar_qids = self[qid]['similar_qids']\n        if k is not None:\n            similar_qids = similar_qids[:k]\n        return similar_qids\n    \n    def evaluate_by_threshold(self, ans_set, threshold=1.0):\n        if not self.annotated:\n            return -1\n        \n        total_score = 0.0\n        for item in 
ans_set:\n            qid = item['question_id']\n            topk_candidates = self.get_topk_candidates(qid)\n            top1_confid = topk_candidates[0]['confidence']\n            if top1_confid > threshold:\n                answer = topk_candidates[0]['answer']\n            else:\n                answer = item['answer']\n            gt_answers = self.get_gt_answers(qid)\n            if answer in gt_answers:\n                total_score += gt_answers[answer]\n        return total_score / len(ans_set)\n    \n    def topk_accuracy(self, k=1, sub_set=None):\n        if not self.annotated:\n            return -1\n        \n        total_score = 0.0\n        if sub_set is not None:\n            qids = sub_set\n        else:\n            qids = list(self.qid_to_data.keys())\n        for qid in qids:\n            topk_candidates = self.get_topk_candidates(qid)[:k]\n            gt_answers = self.get_gt_answers(qid)\n            score_list = [gt_answers.get(a['answer'], 0.0) for a in topk_candidates]\n            total_score += max(score_list)\n        return total_score / len(qids)\n    \n    def rt_evaluate(self, answer_set):\n        if not self.annotated:\n            return ''\n        score1 = self.evaluate_by_threshold(answer_set, 1.0) * 100\n        score2 = self.evaluate_by_threshold(answer_set, 0.0) * 100\n        score_string = f'{score2:.2f}->{score1:.2f}'\n        return score_string\n"
  },
  {
    "path": "prophet/stage2/utils/fancy_pbar.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: customized progress bar\n# ------------------------------------------------------------------------------ #\n\nfrom time import sleep\n\nfrom rich.table import Column\nfrom rich.progress import *\nimport atexit\n\nclass RichColumn(ProgressColumn):\n    def __init__(self, table_column: Optional[Column] = None) -> None:\n        super().__init__(table_column)\n        self.time_elapsed_column = TimeElapsedColumn()\n        self.time_remaining_column = TimeRemainingColumn()\n        self.m_of_n = MofNCompleteColumn()\n        self._completed = 0\n        self.sec_per_iter = -1\n        self.info = None\n    \n    def render(self, task: \"Task\") -> Text:\n        m_of_n = self.m_of_n.render(task)\n        m_of_n = Text(f'{m_of_n}'.replace(' ', ''), style=\"red\")\n        elapsed = self.time_elapsed_column.render(task)\n        elapsed = Text(f'{elapsed}', style=\"orange_red1\")\\\n                    + Text('/', style=\"dark_orange\")\n        remaining = self.time_remaining_column.render(task)\n        remaining = Text(f'{remaining}', style=\"yellow\")\n        if task.completed:\n            if self._completed < task.completed:\n                # do not update sec_per_iter if no new completed iterators\n                self._completed = task.completed\n                self.sec_per_iter = task.elapsed / task.completed\n            sec_per_iter = Text(f'({self.sec_per_iter:.1f}s/iter)', style=\"green\")\n        else:\n            sec_per_iter = Text(f'(--s/iter)', style=\"green\")\n\n        rendered = m_of_n + ' ' + elapsed + remaining + sec_per_iter\n        if self.info is None:\n            return rendered\n        info = Text(f' {self.info}', style=\"cyan\")\n        return rendered + info\n\ninfo_column = RichColumn()\nprogress = Progress(\n    TextColumn(\"[bold]{task.description}\", 
table_column=Column(ratio=1)), \n    BarColumn(bar_width=None, table_column=Column(ratio=8), complete_style=\"blue\"),\n    # MofNCompleteColumn(),\n    info_column,\n    expand=True,\n    redirect_stdout=False,\n    redirect_stderr=False\n)\nprogress.__enter__()\n\ndef exit_progress():\n    progress.__exit__(None, None, None)\natexit.register(exit_progress)\n\nif __name__ == '__main__':\n    # with progress:\n    for n in progress.track(range(10), description=\"Working...  \"):\n        sleep(0.01)\n        print(n)\n        if n == 8:\n            0 / 0"
  },
  {
    "path": "scripts/evaluate_file.sh",
    "content": "#!/bin/bash\n# This script is used to evaluate a result file.\n\n# Parse arguments\nwhile [[ $# -gt 0 ]]; do\n  case \"$1\" in\n    --task)\n      TASK=\"$2\"\n      shift 2;;\n    --result_path)\n      RESULT_PATH=\"$2\"\n      shift 2;;\n    *)\n      echo \"Unknown argument: $1\"\n      exit 1;;\n  esac\ndone\n\nTASK=${TASK:-ok} # task name, one of ['ok', 'aok_val', 'aok_test'], default 'ok'\nRESULT_PATH=${RESULT_PATH:-\"preds/prophet_611_okvqa.json\"} # path to the result file, default is the result from our experiments\n\nif [ $TASK == \"ok\" ]; then\n  python -m evaluation.okvqa_evaluate --result_path $RESULT_PATH \\\n    --question_path 'datasets/okvqa/OpenEnded_mscoco_val2014_questions.json' \\\n    --annotation_path 'datasets/okvqa/mscoco_val2014_annotations.json'\nelif [ $TASK == \"aok_val\" ]; then\n  python -m evaluation.aokvqa_evaluate --result_path $RESULT_PATH \\\n    --dataset_path 'datasets/aokvqa/aokvqa_v1p0_val.json' \\\n    --direct_answer --multiple_choice\nelif [ $TASK == \"aok_test\" ]; then\n  echo \"Please submit your result to the AOKVQA leaderboard.\"\nelse\n  echo \"Unknown task: $TASK\"\n  exit 1\nfi"
  },
  {
    "path": "scripts/evaluate_model.sh",
    "content": "#!/bin/bash\n# This script is used to evaluate a finetuned model.\n\n# Parse arguments\nwhile [[ $# -gt 0 ]]; do\n  case \"$1\" in\n    --gpu)\n      GPU=\"$2\"\n      shift 2;;\n    --task)\n      TASK=\"$2\"\n      shift 2;;\n    --ckpt_path)\n      CKPT_PATH=\"$2\"\n      shift 2;;\n    --version)\n      VERSION=\"$2\"\n      shift 2;;\n    *)\n      echo \"Unknown argument: $1\"\n      exit 1;;\n  esac\ndone\n\nTASK=${TASK:-ok} # task name, one of ['ok', 'aok_val', 'aok_test'], default 'ok'\nGPU=${GPU:-0} # GPU id(s) you want to use, default '0'\nCKPT_PATH=${CKPT_PATH:-\"ckpts/mcan_ft_okvqa.pkl\"} # path to the pretrained model, default is the result from our experiments\nVERSION=${VERSION:-\"eval_finetuned_${TASK}_model\"} # version name, default 'eval_finetuned_$TASK_model'\n\n# CUDA_VISIBLE_DEVICES=$GPU \\\npython main.py \\\n    --task $TASK --run_mode finetune_test \\\n    --cfg configs/finetune.yml \\\n    --version $VERSION \\\n    --ckpt_path $CKPT_PATH \\\n    --gpu $GPU --grad_accu 2\n"
  },
  {
    "path": "scripts/extract_img_feats.sh",
    "content": "#!/bin/bash\n# This script is used to extract image features.\n\n# Parse arguments\nwhile [[ $# -gt 0 ]]; do\n  case \"$1\" in\n    --gpu)\n      GPU=\"$2\"\n      shift 2;;\n    --dataset)\n      DATASET=\"$2\"\n      shift 2;;\n    --clip)\n      CLIP_MODEL=\"$2\"\n      shift 2;;\n    *)\n      echo \"Unknown argument: $1\"\n      exit 1;;\n  esac\ndone\n\nDATASET=${DATASET:-ok} # dataset name, one of ['ok', 'aok'], default 'ok'\nGPU=${GPU:-0} # GPU id(s) you want to use, default '0'\nCLIP_MODEL=${CLIP_MODEL:-RN50x64} # clip model name or path, default 'RN50x64'\n\n# CUDA_VISIBLE_DEVICES=$GPU \\\npython tools/extract_img_feats.py \\\n    --dataset $DATASET --gpu $GPU \\\n    --clip_model $CLIP_MODEL"
  },
  {
    "path": "scripts/finetune.sh",
    "content": "#!/bin/bash\n# This script is used to finetune the pretrained MCAN model.\n\n# Parse arguments\nwhile [[ $# -gt 0 ]]; do\n  case \"$1\" in\n    --gpu)\n      GPU=\"$2\"\n      shift 2;;\n    --task)\n      TASK=\"$2\"\n      shift 2;;\n    --pretrained_model)\n      PRETRAINED_MODEL_PATH=\"$2\"\n      shift 2;;\n    --version)\n      VERSION=\"$2\"\n      shift 2;;\n    *)\n      echo \"Unknown argument: $1\"\n      exit 1;;\n  esac\ndone\n\nTASK=${TASK:-ok} # task name, one of ['ok', 'aok_val', 'aok_test'], default 'ok'\nGPU=${GPU:-0} # GPU id(s) you want to use, default '0'\nPRETRAINED_MODEL_PATH=${PRETRAINED_MODEL_PATH:-\"ckpts/mcan_pt_okvqa.pkl\"} # path to the pretrained model, default is the result from our experiments\nVERSION=${VERSION:-finetuning_okvqa} # version name, default 'finetuning_for_$TASK'\n\n# run python script\n# CUDA_VISIBLE_DEVICES=$GPU \\\npython main.py \\\n    --task $TASK --run_mode finetune \\\n    --cfg configs/finetune.yml \\\n    --version $VERSION \\\n    --pretrained_model $PRETRAINED_MODEL_PATH \\\n    --gpu $GPU --seed 99 --grad_accu 2\n"
  },
  {
    "path": "scripts/heuristics_gen.sh",
    "content": "#!/bin/bash\n# This script is used to generate heuristics from a finetuned model.\n\n# Parse arguments\nwhile [[ $# -gt 0 ]]; do\n  case \"$1\" in\n    --gpu)\n      GPU=\"$2\"\n      shift 2;;\n    --task)\n      TASK=\"$2\"\n      shift 2;;\n    --ckpt_path)\n      CKPT_PATH=\"$2\"\n      shift 2;;\n    --candidate_num)\n      CANDIDATE_NUM=\"$2\"\n      shift 2;;\n    --example_num)\n      EXAMPLE_NUM=\"$2\"\n      shift 2;;\n    --version)\n      VERSION=\"$2\"\n      shift 2;;\n    *)\n      echo \"Unknown argument: $1\"\n      exit 1;;\n  esac\ndone\n\nTASK=${TASK:-ok} # task name, one of ['ok', 'aok_val', 'aok_test'], default 'ok'\nGPU=${GPU:-0} # GPU id(s) you want to use, default '0'\nCKPT_PATH=${CKPT_PATH:-\"ckpts/mcan_ft_okvqa.pkl\"} # path to the pretrained model, default is the result from our experiments\nCANDIDATE_NUM=${CANDIDATE_NUM:-10} # number of candidates to be generated\nEXAMPLE_NUM=${EXAMPLE_NUM:-100} # number of examples to be generated\nVERSION=${VERSION:-\"heuristics_okvqa\"} # version name, default 'heuristics1_for_$TASK'\n\n# CUDA_VISIBLE_DEVICES=$GPU \\\npython main.py \\\n    --task $TASK --run_mode heuristics \\\n    --version $VERSION \\\n    --cfg configs/finetune.yml \\\n    --ckpt_path $CKPT_PATH \\\n    --candidate_num $CANDIDATE_NUM \\\n    --example_num $EXAMPLE_NUM \\\n    --gpu $GPU"
  },
  {
    "path": "scripts/pretrain.sh",
    "content": "#!/bin/bash\n# This script is used to pretrain the MCAN model.\n\n# Parse arguments\nwhile [[ $# -gt 0 ]]; do\n  case \"$1\" in\n    --gpu)\n      GPU=\"$2\"\n      shift 2;;\n    --task)\n      TASK=\"$2\"\n      shift 2;;\n    --version)\n      VERSION=\"$2\"\n      shift 2;;\n    *)\n      echo \"Unknown argument: $1\"\n      exit 1;;\n  esac\ndone\n\nTASK=${TASK:-ok} # task name, one of ['ok', 'aok_val', 'aok_test'], default 'ok'\nGPU=${GPU:-0} # GPU id(s) you want to use, default '0'\nVERSION=${VERSION:-pretraining_okvqa} # version name, default 'pretraining_for_$TASK'\n\n# CUDA_VISIBLE_DEVICES=$GPU \\\npython main.py \\\n    --task $TASK --run_mode pretrain\\\n    --cfg configs/pretrain.yml \\\n    --version $VERSION \\\n    --gpu $GPU --seed 99 --grad_accu 2"
  },
  {
    "path": "scripts/prompt.sh",
    "content": "#!/bin/bash\n# This script is used to prompt GPT-3 to generate final answers.\n\n# Parse arguments\nwhile [[ $# -gt 0 ]]; do\n  case \"$1\" in\n    --task)\n      TASK=\"$2\"\n      shift 2;;\n    --version)\n      VERSION=\"$2\"\n      shift 2;;\n    --examples_path)\n      EXAMPLES_PATH=\"$2\"\n      shift 2;;\n    --candidates_path)\n      CANDIDATES_PATH=\"$2\"\n      shift 2;;\n    --captions_path)\n      CAPTIONS_PATH=\"$2\"\n      shift 2;;\n    --openai_key)\n      OPENAI_KEY=\"$2\"\n      shift 2;;\n    *)\n      echo \"Unknown argument: $1\"\n      exit 1;;\n  esac\ndone\n\nTASK=${TASK:-ok} # task name, one of ['ok', 'aok_val', 'aok_test'], default 'ok'\nVERSION=${VERSION:-\"prompt_okvqa\"} # version name, default 'prompt_for_$TASK'\nEXAMPLES_PATH=${EXAMPLES_PATH:-\"assets/answer_aware_examples_okvqa.json\"} # path to the examples, default is the result from our experiments\nCANDIDATES_PATH=${CANDIDATES_PATH:-\"assets/candidates_okvqa.json\"} # path to the candidates, default is the result from our experiments\nCAPTIONS_PATH=${CAPTIONS_PATH:-\"assets/captions_okvqa.json\"} # path to the captions, default is the result from our experiments\nOPENAI_KEY=${OPENAI_KEY:-\"\"} # path to the captions\n\n# CUDA_VISIBLE_DEVICES=$GPU \\\npython main.py \\\n    --task $TASK --run_mode prompt \\\n    --version $VERSION \\\n    --cfg configs/prompt.yml \\\n    --examples_path $EXAMPLES_PATH \\\n    --candidates_path $CANDIDATES_PATH \\\n    --captions_path $CAPTIONS_PATH \\\n    --openai_key $OPENAI_KEY"
  },
  {
    "path": "tools/extract_img_feats.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: Tool for extracting image features\n# ------------------------------------------------------------------------------ #\n\nimport os, sys\nsys.path.append(os.getcwd())\n\nimport glob, re, math, time, datetime\nimport numpy as np\nimport torch\nfrom torch import nn\nfrom PIL import Image\nimport clip\nfrom tqdm import tqdm\nimport argparse\nfrom pathlib import Path\n\nfrom configs.task_cfgs import Cfgs\nfrom configs.task_to_split import *\nfrom tools.transforms import _transform\n\n\n@torch.no_grad()\ndef _extract_feat(img_path, net, T, save_path):\n    # print(img_path)\n    img = Image.open(img_path)\n    # W, H = img.size\n    img = T(img).unsqueeze(0).cuda()\n    clip_feats = net(img).cpu().numpy()[0]\n    clip_feats = clip_feats.transpose(1, 2, 0)\n    # print(clip_feats.shape, save_path)\n    # return\n    Path(save_path).parent.mkdir(parents=True, exist_ok=True)\n    np.savez(\n        save_path,\n        x=clip_feats,\n    )\n\n\nclass ExtractModel:\n    def __init__(self, encoder) -> None:\n        encoder.attnpool = nn.Identity()\n        self.backbone = encoder\n\n        self.backbone.cuda().eval()\n    \n    @torch.no_grad()\n    def __call__(self, img):\n        x = self.backbone(img)\n        return x\n\n\ndef main(__C, dataset):\n    # find imgs\n    img_dir_list = []\n    for split in SPLIT_TO_IMGS:\n        if split.startswith(dataset):\n            img_dir_list.append(\n                __C.IMAGE_DIR[SPLIT_TO_IMGS[split]]\n            )\n    print('image dirs:', img_dir_list)\n    img_path_list = []\n    for img_dir in img_dir_list:\n        img_path_list += glob.glob(img_dir + '*.jpg')\n    print('total images:', len(img_path_list))\n\n    # load model\n    clip_model, _ = clip.load(__C.CLIP_VERSION, device='cpu')\n    img_encoder = clip_model.visual\n\n    model = 
ExtractModel(img_encoder)\n    T = _transform(__C.IMG_RESOLUTION)\n\n    for img_path in tqdm(img_path_list):\n        img_path_sep = img_path.split('/')\n        img_path_sep[-3] += '_feats'\n        save_path = '/'.join(img_path_sep).replace('.jpg', '.npz')\n        _extract_feat(img_path, model, T, save_path)\n\n\nif __name__ == '__main__':\n    parser = argparse.ArgumentParser('Tool for extracting CLIP image features.')\n    parser.add_argument('--dataset', dest='dataset', help='dataset name, e.g., ok, aok', type=str, required=True)\n    parser.add_argument('--gpu', dest='GPU', help='gpu id', type=str, default='0')\n    parser.add_argument('--clip_model', dest='CLIP_VERSION', help='clip model name or local model checkpoint path', type=str, default='RN50x64')\n    parser.add_argument('--img_resolution', dest='IMG_RESOLUTION', help='image resolution', type=int, default=512)\n    args = parser.parse_args()\n    __C = Cfgs(args)\n    main(__C, args.dataset)"
  },
  {
    "path": "tools/transforms.py",
    "content": "# ------------------------------------------------------------------------------ #\n# Author: Zhenwei Shao (https://github.com/ParadoxZW)\n# Description: Preprocessing images to be fed into the model, the script is\n#              adapted from the code of CLIP (github.com/openai/CLIP)\n# ------------------------------------------------------------------------------ #\n\nfrom math import ceil\nfrom PIL import Image\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nfrom torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize\nfrom PIL import ImageOps\n\ntry:\n    from torchvision.transforms import InterpolationMode\n    BICUBIC = InterpolationMode.BICUBIC\nexcept ImportError:\n    BICUBIC = Image.BICUBIC\n\ndef Pad():\n    def _pad(image):\n        W, H = image.size # debugged\n        if H < W:\n            pad_H = ceil((W - H) / 2)\n            pad_W = 0\n        else:\n            pad_H = 0\n            pad_W = ceil((H - W) / 2)\n        img = ImageOps.expand(image, border=(pad_W, pad_H, pad_W, pad_H), fill=0)\n        # print(img.size)\n        return img\n    return _pad\n\ndef _convert_image_to_rgb(image):\n    return image.convert(\"RGB\")\n\ndef identity(x):\n    return x\n\ndef _transform(n_px, pad=False, crop=False):\n    return Compose([\n        Pad() if pad else identity,\n        Resize([n_px, n_px], interpolation=BICUBIC),\n        CenterCrop(n_px) if crop else identity,\n        _convert_image_to_rgb,\n        ToTensor(),\n        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),\n    ])\n\n\nif __name__ == '__main__':\n    img = np.random.rand(100, 333, 3).astype('uint8')\n    img = Image.fromarray(img)\n    img = _transform(32 * 14)(img)\n    img = torch.Tensor(img)\n    print(img.size())\n"
  }
]