Repository: MILVLG/prophet
Branch: main
Commit: 3e92892ec5ec
Files: 50
Total size: 157.6 KB
Directory structure:
gitextract_h4opr1r6/
├── .gitignore
├── LICENSE
├── README.md
├── assets/
│ └── .gitkeep
├── ckpts/
│ └── .gitkeep
├── configs/
│ ├── finetune.yml
│ ├── path_cfgs.py
│ ├── pretrain.yml
│ ├── prompt.yml
│ ├── task_cfgs.py
│ └── task_to_split.py
├── datasets/
│ └── .gitkeep
├── environment.yml
├── evaluation/
│ ├── ans_punct.py
│ ├── aok_utils/
│ │ ├── eval_predictions.py
│ │ ├── load_aokvqa.py
│ │ └── remap_predictions.py
│ ├── aokvqa_evaluate.py
│ ├── okvqa_evaluate.py
│ └── vqa_utils/
│ ├── vqa.py
│ └── vqaEval.py
├── main.py
├── misc/
│ └── tree.txt
├── outputs/
│ ├── ckpts/
│ │ └── .gitkeep
│ ├── logs/
│ │ └── .gitkeep
│ └── results/
│ └── .gitkeep
├── preds/
│ └── .gitkeep
├── prophet/
│ ├── __init__.py
│ ├── stage1/
│ │ ├── finetune.py
│ │ ├── heuristics.py
│ │ ├── model/
│ │ │ ├── layers.py
│ │ │ ├── mcan.py
│ │ │ ├── mcan_for_finetune.py
│ │ │ ├── net_utils.py
│ │ │ └── rope2d.py
│ │ ├── pretrain.py
│ │ └── utils/
│ │ ├── load_data.py
│ │ └── optim.py
│ └── stage2/
│ ├── prompt.py
│ └── utils/
│ ├── data_utils.py
│ └── fancy_pbar.py
├── scripts/
│ ├── evaluate_file.sh
│ ├── evaluate_model.sh
│ ├── extract_img_feats.sh
│ ├── finetune.sh
│ ├── heuristics_gen.sh
│ ├── pretrain.sh
│ └── prompt.sh
└── tools/
├── extract_img_feats.py
└── transforms.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
**/__pycache__/
datasets/*/
!datasets/.gitkeep
assets/*
!assets/.gitkeep
ckpts/*
!ckpts/.gitkeep
outputs/ckpts/*
!outputs/ckpts/.gitkeep
outputs/logs/*
!outputs/logs/.gitkeep
outputs/results/*
!outputs/results/.gitkeep
preds/*
!preds/.gitkeep
tmp
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# Prophet
[](https://paperswithcode.com/sota/visual-question-answering-on-a-okvqa?p=prompting-large-language-models-with-answer)
[](https://paperswithcode.com/sota/visual-question-answering-on-ok-vqa?p=prompting-large-language-models-with-answer)
This repository is the official implementation of the Prophet, a two stage framework designed to prompt GPT-3 with answer heuristics for knowledge-based VQA. In stage one, we train a vanilla VQA model on a specific knowledge-based VQA dataset and extract two types of complementary answer heuristics from the model: answer candidates and answer-aware examples. In stage two, answer heuristics are used to prompt GPT-3 to generate better answers. Prophet significantly outperforms existing state-of-the-art methods on two datasets, delivering 61.1% on OK-VQA and 55.7% on A-OKVQA. Please refer to our [paper](https://arxiv.org/pdf/2303.01903.pdf) for details.

## Updates
April 28, 2023
- Add pretrained and finetuned models on A-OKVQA.
March 10, 2023
- Training and testing codes of the two-stage Prophet framework.
- Pretrained and finetuned models on OK-VQA.
## Table of Contents
- [Prerequisites](#prerequisites)
- [Usage](#usage)
- [Evaluation](#evaluation)
- [Citation](#citation)
- [License](#license)
## Prerequisites
### Hardware and Software Requirements
To conduct the following experiments, a machine with at least 1 RTX 3090 GPU, 50GB memory, and 300GB free disk space is recommended. We strongly recommend using an SSD drive to guarantee high-speed I/O.
Following software is needed:
1. [Python](https://www.python.org/downloads/) >= 3.9
2. [Cuda](https://developer.nvidia.com/cuda-toolkit) >= 11.3
3. [Pytorch](https://pytorch.org/get-started/locally/) >= 1.12
4. the packages listed in [environment.yml](environment.yml)
We recommend downloading [Anaconda](https://www.anaconda.com/) first and then creating a new environment with the following command:
``` shell
$ conda env create -f environment.yml
```
This command will create a new environment named `prophet` with all the required packages. To activate the environment, run:
``` shell
$ conda activate prophet
```
### Data Preparation
Before running the code, prepare two folders: `datasets` and `assets`. The `datasets` folder contains all the datasets and features used in this project, and the `assets` folder contains the pre-computed resources and other intermediate files (you can use them to skip some early experiment steps and save time).
First, download the [datasets](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/Ebzd7EANzHVHnh3FvYvCJ7kBkJf56iT1Obe5L2PZAzgM2g?download=1) and [assets](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/Ec5NPIswAxlEqi74qwGjIf0BKInF0O6nwW5dtn4h3GOUsQ?download=1). Then put the `datasets` and `assets` folder in the root directory of this project. Download MSCOCO 2014 and 2017 images from [here](https://cocodataset.org/#download) (you can skip MSCOCO 2017 if you only experiments on OK-VQA) and put them in the `datasets` folder. Run the following command to extract the features of the images:
``` shell
$ bash scripts/extract_img_feats.sh
```
After that, the `datasets` and `assets` folder will have the following structure:
Click to expand
```
datasets
├── aokvqa
│ ├── aokvqa_v1p0_test.json
│ ├── aokvqa_v1p0_train.json
│ └── aokvqa_v1p0_val.json
├── coco2014
│ ├── train2014
│ └── val2014
├── coco2014_feats
│ ├── train2014
│ └── val2014
├── coco2017
│ ├── test2017
│ ├── train2017
│ └── val2017
├── coco2017_feats
│ ├── test2017
│ ├── train2017
│ └── val2017
├── okvqa
│ ├── mscoco_train2014_annotations.json
│ ├── mscoco_val2014_annotations.json
│ ├── OpenEnded_mscoco_train2014_questions.json
│ └── OpenEnded_mscoco_val2014_questions.json
└── vqav2
├── v2_mscoco_train2014_annotations.json
├── v2_mscoco_val2014_annotations.json
├── v2_OpenEnded_mscoco_train2014_questions.json
├── v2_OpenEnded_mscoco_val2014_questions.json
├── v2valvg_no_ok_annotations.json
├── v2valvg_no_ok_questions.json
├── vg_annotations.json
└── vg_questions.json
```
We've also provided a tree structure of the entire project in [misc/tree.txt](misc/tree.txt).
## Usage
We provide bash scripts for each stage of the Prophet framework. You can find them in the `scripts` directory. There are two common arguments you should take care of when running each script:
- `--task`: specify the task (i.e., the target dataset) you want to deal with. The available options are `ok` (training on `train` set of OK-VQA and evaluating on the `test` set of OK-VQA), `aok_val` (training on `train` set of A-OKVQA and evaluating on the `val` set of A-OKVQA) and `aok_test` (training on `train` set and `val` set of A-OKVQA and evaluating on the `test` set of A-OKVQA);
Note that although Prophet uses VQA v2 datasets for pre-training, there are slight differences in how the datasets are used for different tasks (`ok`, `aok_val`, and `aok_test`), as detailed in [configs/task_to_split.py](configs/task_to_split.py). This means that different pre-training commands need to be followed for each task.
- `--version`: specify the version name of this run. This name will be used to create a new folder in the `outputs` directory to store the results of this run.
Notice that you can omit any arguments when invoking following scripts, it will then use the default arguments written in the script files.
Before running any script, you can also update the configuration files (`*.yml`) in the `configs` directory to change hyperparameters.
### 1. OK-VQA
Take OK-VQA for example: Prophet consists of two stages, stage one for training a vanilla VQA model and extracting answer heuristics, and stage two for prompting GPT-3 with answer heuristics.
#### **Stage one**
At this stage, we train an improved MCAN model (check the [paper](https://arxiv.org/pdf/2303.01903.pdf) for a detailed description) through pretraining on VQA v2 and finetuning on the target dataset. Multiple GPUs are supported by setting `--gpu 0,1,2,3` (for example). Run the pretraining step with the commands:
```shell
$ bash scripts/pretrain.sh \
--task ok --version okvqa_pretrain_1 --gpu 0
```
We've provided a pretrained model for OK-VQA [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EcdTatraOqRJnZXBDXfr7QQBPtn8QYCa2m3Pvq0LlEml9Q?download=1). Then, run finetuning step with commands:
```shell
$ bash scripts/finetune.sh \
--task ok --version okvqa_finetune_1 --gpu 0 \
--pretrained_model outputs/okvqa_pretrain_1/ckpts/epoch_13.pkl
```
All epoch checkpoints are saved in `outputs/ckpts/{your_version_name}`. We've also provided a finetuned model for OK-VQA [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/ESUb093PgyZFtLnU_RIYJQsBN_PU0jJdu-eFUb1-4T4mIQ?download=1). You may pick one to generate answer heuristics by running the following command:
```shell
$ bash scripts/heuristics_gen.sh \
--task ok --version okvqa_heuristics_1
--gpu 0 --ckpt_path outputs/okvqa_finetune_1/ckpts/epoch_6.pkl
--candidate_num 10 --example_num 100
```
The extracted answer heuristics will be stored as `candidates.json` and `examples.json` in `outputs/results/{your_version_name}` directory.
#### **Stage two**
You may need the `candidates.json` and `examples.json` files generated in the former stage to step into this stage. **Or you can just skip stage one, and use the files of answer heuristics we provided in `assets`. Especially, the `candidates.json` and `examples.json` files for OK-VQA are `answer_aware_examples_okvqa.json` and `candidates_okvqa.json`.** To prompt GPT-3 with answer heuristics and generate better answers, run the following command:
```shell
$ bash scripts/prompt.sh \
--task ok --version okvqa_prompt_1 \
--examples_path outputs/results/okvqa_heuristics_1/examples.json \
--candidates_path outputs/results/okvqa_heuristics_1/candidates.json \
--openai_key sk-xxxxxxxxxxxxxxxxxxxxxx
```
The result file will be stored as `result.json` in `outputs/results/{your_version_name}` directory.
We also provide example scripts for the `aok_val` and `aok_test` modes on A-OKVQA.
Click to expand
### 2. A-OKVQA (val)
#### **Stage one**
Similarly, for the task of `aok_val`, run the pretraining step with the commands:
```shell
$ bash scripts/pretrain.sh \
--task aok_val --version aokvqa_val_pretrain_1 --gpu 0
```
We've provided a pretrained model for `aok_val` [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EYeIgGR521pNsEjxliqRkmEBGpcwS5p-qrMGTC9ro_SF6g?download=1). Then, run the finetuning step with the commands:
```shell
$ bash scripts/finetune.sh \
--task aok_val --version aokvqa_val_finetune_1 --gpu 0 \
--pretrained_model outputs/aokvqa_val_pretrain_1/ckpts/epoch_13.pkl
```
All epoch checkpoints are saved in `outputs/ckpts/{your_version_name}`. We've also provided a finetuned model for `aok_val` [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EQXIIjAIiJJFrOpobVhyH9oBBeBAY-VttHqfS91qPOKlJw?download=1). You may pick one to generate answer heuristics by running the following command:
```shell
$ bash scripts/heuristics_gen.sh \
--task aok_val --version aokvqa_val_heuristics_1
--gpu 0 --ckpt_path outputs/aokvqa_val_finetune_1/ckpts/epoch_6.pkl
--candidate_num 10 --example_num 100
```
The extracted answer heuristics will be stored as `candidates.json` and `examples.json` in `outputs/results/{your_version_name}` directory.
#### **Stage two**
You may need the `candidates.json` and `examples.json` files generated in the former stage to step into this stage. **Or you can just skip stage one, and use the files of answer heuristics we provided in `assets`. Especially, the `candidates.json` and `examples.json` files for `aok_val` are `examples_aokvqa_val.json` and `candidates_aokvqa_val.json`.** To prompt GPT-3 with answer heuristics and generate better answers, run the following command:
```shell
$ bash scripts/prompt.sh \
--task aok_val --version aokvqa_val_prompt_1 \
--examples_path outputs/results/aokvqa_val_heuristics_1/examples.json \
--candidates_path outputs/results/aokvqa_val_heuristics_1/candidates.json \
--captions_path assets/captions_aokvqa.json \
--openai_key sk-xxxxxxxxxxxxxxxxxxxxxx
```
The result file will be stored as `result.json` in `outputs/results/{your_version_name}` directory.
### 3. A-OKVQA (test)
For the task of `aok_test`, run the pretraining step with the commands:
#### **Stage one**
```shell
$ bash scripts/pretrain.sh \
--task aok_test --version aokvqa_test_pretrain_1 --gpu 0
```
We've provided a pretrained model for `aok_test` [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EWSBB1OrjIlBoPdTMso6RFABNQKYKBWo1iU4l0w2NVDvuQ?download=1). Then, run finetuning step with commands:
```shell
$ bash scripts/finetune.sh \
--task aok_test --version aokvqa_test_finetune_1 --gpu 0 \
--pretrained_model outputs/aokvqa_test_pretrain_1/ckpts/epoch_13.pkl
```
All epoch checkpoints are saved in `outputs/ckpts/{your_version_name}`. We've also provided a finetuned model for `aok_test` [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EQ6gvWbv9VhHrhh0D08G79kBk6JEA_eqXEt5ULgueCf1tA?download=1). You may pick one to generate answer heuristics by running the following command:
```shell
$ bash scripts/heuristics_gen.sh \
--task aok_test --version aokvqa_test_heuristics_1
--gpu 0 --ckpt_path outputs/aokvqa_test_finetune_1/ckpts/epoch_6.pkl
--candidate_num 10 --example_num 100
```
The extracted answer heuristics will be stored as `candidates.json` and `examples.json` in `outputs/results/{your_version_name}` directory.
#### **Stage two**
You may need the `candidates.json` and `examples.json` files generated in the former stage to step into this stage. **Or you can just skip stage one, and use the files of answer heuristics we provided in `assets`. Especially, the `candidates.json` and `examples.json` files for `aok_test` are `examples_aokvqa_test.json` and `candidates_aokvqa_test.json`.** To prompt GPT-3 with answer heuristics and generate better answers, run the following command:
```shell
$ bash scripts/prompt.sh \
--task aok_test --version aokvqa_test_prompt_1 \
--examples_path outputs/results/aokvqa_test_heuristics_1/examples.json \
--candidates_path outputs/results/aokvqa_test_heuristics_1/candidates.json \
--captions_path assets/captions_aokvqa.json \
--openai_key sk-xxxxxxxxxxxxxxxxxxxxxx
```
The result file will be stored as `result.json` in `outputs/results/{your_version_name}` directory.
## Evaluation
For the task of `ok` and `aok_val`, whose annotations are available, the scores are automatically computed after finetuning and prompting. You can also evaluate the result files that are output after finetuning or prompting by running
```shell
$ bash scripts/evaluate_file.sh \
--task ok --result_path outputs/results/okvqa_prompt_1/result.json
```
Using the corresponding result files and evaluation script above, we obtain the accuracies in the following table, respectively.
| Model | OK-VQA | A-OKVQA (val) | A-OKVQA (test) |
|:--:|:--:|:--:|:--:|
| MCAN | [53.0%](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EVPAUDjTWX9Gn3GIqj7JwUoB5HMWwL3SRnNf18dSckJBOw?download=1) | [52.0%](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EdBYZeS55iFEjdlOhUbyWRsBtYnQ3-zerho13mYj2YQ0Ag?download=1) | 45.6% |
| Prophet | [61.1%](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EUqH0N4fLVdPsLYJ48Wl_gsBneZzyGR23Tv5P9RskOBwNQ?download=1) | [58.2%](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EXDUxT3_LrpDugZ7xj-0BMYBynuFDJQS88M3EGeFEhU5dg?download=1) | 55.7% |
For the task of `aok_test`, you need to submit the result file to the [A-OKVQA Leaderboard](https://leaderboard.allenai.org/a-okvqa/submissions/public) to evaluate the result.
## Citation
If you use this code in your research, please cite our paper:
```BibTex
@inproceedings{shao2023prompting,
title={Prompting Large Language Models with Answer Heuristics for Knowledge-based Visual Question Answering},
author={Shao, Zhenwei and Yu, Zhou and Wang, Meng and Yu, Jun},
booktitle={Computer Vision and Pattern Recognition (CVPR)},
pages={14974--14983},
year={2023}
}
```
## License
This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
================================================
FILE: assets/.gitkeep
================================================
================================================
FILE: ckpts/.gitkeep
================================================
================================================
FILE: configs/finetune.yml
================================================
# Network
IMG_RESOLUTION: 512
IMG_FEAT_GRID: 16
IMG_FEAT_SIZE: 4096
BERT_VERSION: bert-large-uncased
MAX_TOKEN: 32
ARCH_CEIL: {
enc: ['SA', 'FFN'],
dec: ['SA_v', 'GA', 'FFN'],
}
LANG_FEAT_SIZE: 1024
LAYER: 6
HIDDEN_SIZE: 1024
FF_SIZE: 4096
MULTI_HEAD: 8
DROPOUT_R: 0.1
FLAT_MLP_SIZE: 1024
FLAT_GLIMPSES: 1
FLAT_OUT_SIZE: 2048
# Training
BATCH_SIZE: 64
EVAL_BATCH_SIZE: 64
BERT_LR_MULT: 0.01
LR_BASE: 0.00005
LR_DECAY_R: 0.2
LR_DECAY_LIST: [5,]
WARMUP_EPOCH: 0
MAX_EPOCH: 6
GRAD_NORM_CLIP: -1
OPT: AdamW
OPT_PARAMS: {betas: '(0.9, 0.98)', eps: '1e-9'}
## optimizer for finetuning warmup (i.e., only update the new appended parameters as a warm-up)
EPOPH_FTW: 1
OPT_FTW: Adam
LR_BASE_FTW: 0.001
OPT_PARAMS_FTW: {betas: '(0.9, 0.98)', eps: '1e-9'}
================================================
FILE: configs/path_cfgs.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: set const paths and dirs
# ------------------------------------------------------------------------------ #
import os
class PATH:
    """Registry of constant paths and directories (outputs, datasets, assets) used project-wide."""

    def __init__(self):
        # Output locations for logs, checkpoints and result files.
        self.LOG_ROOT = 'outputs/logs/'
        self.CKPT_ROOT = 'outputs/ckpts/'
        self.RESULTS_ROOT = 'outputs/results/'
        # Input locations for raw datasets and precomputed assets.
        self.DATASET_ROOT = 'datasets/'
        self.ASSETS_ROOT = 'assets/'

        ds_root = self.DATASET_ROOT
        coco_splits = (
            ('coco2014', ('train2014', 'val2014')),
            ('coco2017', ('train2017', 'val2017', 'test2017')),
        )
        # Raw COCO image directories, keyed by split name.
        self.IMAGE_DIR = {
            split: f'{ds_root}{coco}/{split}/'
            for coco, splits in coco_splits
            for split in splits
        }
        # Pre-extracted image feature directories, mirroring the image splits.
        self.FEATS_DIR = {
            split: f'{ds_root}{coco}_feats/{split}/'
            for coco, splits in coco_splits
            for split in splits
        }

        vqav2_dir = ds_root + 'vqav2/'
        okvqa_dir = ds_root + 'okvqa/'
        aokvqa_dir = ds_root + 'aokvqa/'
        # Question files per dataset split. A-OKVQA ships a single json per
        # split that serves as both question and annotation source (the same
        # files appear in ANSWER_PATH below).
        self.QUESTION_PATH = {
            'v2train': vqav2_dir + 'v2_OpenEnded_mscoco_train2014_questions.json',
            'v2val': vqav2_dir + 'v2_OpenEnded_mscoco_val2014_questions.json',
            'vg': vqav2_dir + 'vg_questions.json',
            'v2valvg_no_ok': vqav2_dir + 'v2valvg_no_ok_questions.json',
            'oktrain': okvqa_dir + 'OpenEnded_mscoco_train2014_questions.json',
            'oktest': okvqa_dir + 'OpenEnded_mscoco_val2014_questions.json',
            'aoktrain': aokvqa_dir + 'aokvqa_v1p0_train.json',
            'aokval': aokvqa_dir + 'aokvqa_v1p0_val.json',
            'aoktest': aokvqa_dir + 'aokvqa_v1p0_test.json',
        }
        # Annotation (answer) files; note there is no 'aoktest' entry because
        # the A-OKVQA test split has no annotation file here.
        self.ANSWER_PATH = {
            'v2train': vqav2_dir + 'v2_mscoco_train2014_annotations.json',
            'v2val': vqav2_dir + 'v2_mscoco_val2014_annotations.json',
            'vg': vqav2_dir + 'vg_annotations.json',
            'v2valvg_no_ok': vqav2_dir + 'v2valvg_no_ok_annotations.json',
            'oktrain': okvqa_dir + 'mscoco_train2014_annotations.json',
            'oktest': okvqa_dir + 'mscoco_val2014_annotations.json',
            'aoktrain': aokvqa_dir + 'aokvqa_v1p0_train.json',
            'aokval': aokvqa_dir + 'aokvqa_v1p0_val.json',
        }
        # Answer-vocabulary dictionaries, one per dataset family.
        self.ANSWER_DICT_PATH = {
            'v2': self.ASSETS_ROOT + 'answer_dict_vqav2.json',
            'ok': self.ASSETS_ROOT + 'answer_dict_okvqa.json',
            'aok': self.ASSETS_ROOT + 'answer_dict_aokvqa.json',
        }
================================================
FILE: configs/pretrain.yml
================================================
# Network
IMG_RESOLUTION: 512
IMG_FEAT_GRID: 16
IMG_FEAT_SIZE: 4096
BERT_VERSION: bert-large-uncased
MAX_TOKEN: 32
ARCH_CEIL: {
enc: ['SA', 'FFN'],
dec: ['SA_v', 'GA', 'FFN'],
}
LANG_FEAT_SIZE: 1024
LAYER: 6
HIDDEN_SIZE: 1024
FF_SIZE: 4096
MULTI_HEAD: 8
DROPOUT_R: 0.1
FLAT_MLP_SIZE: 1024
FLAT_GLIMPSES: 1
FLAT_OUT_SIZE: 2048
# Training
BATCH_SIZE: 64
EVAL_BATCH_SIZE: 64
BERT_LR_MULT: 0.01
LR_BASE: 0.00007
LR_DECAY_R: 0.2
LR_DECAY_LIST: [10, 12]
WARMUP_EPOCH: 3
MAX_EPOCH: 13
GRAD_NORM_CLIP: 2.0
OPT: Adam
OPT_PARAMS: {betas: '(0.9, 0.98)', eps: '1e-9'}
================================================
FILE: configs/prompt.yml
================================================
MODEL: text-davinci-002
TEMPERATURE: 0.
MAX_TOKENS: 8
SLEEP_PER_INFER: 10
PROMPT_HEAD: "Please answer the question according to the context and candidate answers. Each candidate answer is associated with a confidence score within a bracket. The true answer may not be included in the candidate answers.\n\n"
LINE_PREFIX: "===\n"
N_EXAMPLES: 20
K_CANDIDATES: 10
T_INFER: 5
================================================
FILE: configs/task_cfgs.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: Object that manages the configuration of the experiments.
# ------------------------------------------------------------------------------ #
import os
import random
import torch
import numpy as np
from datetime import datetime
from .path_cfgs import PATH
from .task_to_split import *
class Cfgs(PATH):
    """Experiment configuration container.

    Combines inherited path settings (PATH), command-line arguments, and
    derived run-specific paths (checkpoints / logs / results). Any attribute
    that exists before set_silent_attr() runs (i.e. everything defined by
    PATH) is hidden from __repr__ output.
    """

    def __init__(self, args):
        super(Cfgs, self).__init__()
        # Record PATH-inherited attribute names so __repr__ can hide them.
        self.set_silent_attr()

        # ---- Device setup ----
        self.GPU = getattr(args, 'GPU', None)
        if self.GPU is not None:
            self.GPU_IDS = [int(i) for i in self.GPU.split(',')]
            # print(f'Avaliable GPUs: {torch.cuda.device_count()}')
            # print(f'Using GPU {self.GPU}')
            self.CURRENT_GPU = self.GPU_IDS[0]
            torch.cuda.set_device(f'cuda:{self.CURRENT_GPU}')
            self.N_GPU = len(self.GPU_IDS)

        # ---- Reproducibility: seed every RNG in use ----
        # NOTE(review): self.N_GPU is only assigned inside the GPU branch
        # above; the branch below assumes a GPU argument was provided —
        # confirm against the callers' argument defaults.
        self.SEED = getattr(args, 'SEED', 1111)
        torch.manual_seed(self.SEED)
        # torch.manual_seed_all(self.SEED)
        if self.N_GPU < 2:
            torch.cuda.manual_seed(self.SEED)
        else:
            torch.cuda.manual_seed_all(self.SEED)
        torch.backends.cudnn.deterministic = True
        np.random.seed(self.SEED)
        random.seed(self.SEED)
        torch.set_num_threads(2)

        # -------------------------
        # ---- Version Control ----
        # -------------------------
        self.TIMESTAMP = datetime.now().strftime('%Y%m%d%H%M%S')
        # VERSION names this run; fresh runs default to the timestamp.
        self.VERSION = getattr(args, 'VERSION', self.TIMESTAMP)
        # paths and dirs
        self.CKPTS_DIR = os.path.join(self.CKPT_ROOT, self.VERSION)
        self.LOG_PATH = os.path.join(
            self.LOG_ROOT,
            self.VERSION,
            f'log_{self.TIMESTAMP}.txt'
        )
        self.RESULT_DIR = os.path.join(self.RESULTS_ROOT, self.VERSION)
        self.RESULT_PATH = os.path.join(
            self.RESULTS_ROOT,
            self.VERSION,
            'result_' + self.TIMESTAMP + '.json'
        )

        # about resume
        # NOTE(review): self.RUN_MODE is assigned further down (Task Control
        # section); this condition reads it earlier and would raise
        # AttributeError when RESUME is truthy unless RUN_MODE is also set by
        # PATH or the args loop elsewhere — confirm before relying on it.
        self.RESUME = getattr(args, 'RESUME', False)
        if self.RESUME and self.RUN_MODE == 'pretrain':
            self.RESUME_VERSION = getattr(args, 'RESUME_VERSION', self.VERSION)
            self.RESUME_EPOCH = getattr(args, 'RESUME_EPOCH', None)
            resume_path = getattr(args, 'RESUME_PATH', None)
            # NOTE(review): CKPTS_DIR already embeds self.VERSION, so the
            # default resume path is <CKPT_ROOT>/<VERSION>/<RESUME_VERSION>/
            # epoch_N.pkl — verify this nesting is intended.
            self.RESUME_PATH = os.path.join(
                self.CKPTS_DIR,
                self.RESUME_VERSION,
                f'epoch_{self.RESUME_EPOCH}.pkl'
            ) if resume_path is None else resume_path

        # for testing and heuristics generation
        self.CKPT_PATH = getattr(args, 'CKPT_PATH', None)

        # ----------------------
        # ---- Task Control ----
        # ----------------------
        self.TASK = getattr(args, 'TASK', 'ok')
        assert self.TASK in ['ok', 'aok_val', 'aok_test']
        self.RUN_MODE = getattr(args, 'RUN_MODE', 'finetune')
        assert self.RUN_MODE in ['pretrain', 'finetune', 'finetune_test', 'heuristics', 'prompt']
        if self.RUN_MODE == 'pretrain':
            self.DATA_TAG = 'v2'  # used to config answer dict
            self.DATA_MODE = 'pretrain'
        else:
            self.DATA_TAG = self.TASK.split('_')[0]  # used to config answer dict
            self.DATA_MODE = 'finetune'
        # config pipeline: evaluation is skipped for pretraining and for the
        # A-OKVQA test split (its answers are not public).
        self.EVAL_NOW = True
        if self.RUN_MODE == 'pretrain' or self.TASK == 'aok_test':
            self.EVAL_NOW = False
        # print(f'Eval Now: {self.EVAL_NOW}')

        # ------------------------
        # ---- Model Training ----
        # ------------------------
        self.NUM_WORKERS = 8
        self.PIN_MEM = True

        # --------------------------------
        # ---- Heuristics Generations ----
        # --------------------------------
        self.CANDIDATE_NUM = getattr(args, 'CANDIDATE_NUM', None)
        if self.CANDIDATE_NUM is not None:
            self.CANDIDATE_FILE_PATH = os.path.join(
                self.RESULTS_ROOT,
                self.VERSION,
                'candidates.json'
            )
            self.EXAMPLE_FILE_PATH = os.path.join(
                self.RESULTS_ROOT,
                self.VERSION,
                'examples.json'
            )
            self.ANSWER_LATENTS_DIR = os.path.join(
                self.RESULTS_ROOT,
                self.VERSION,
                'answer_latents'
            )  # where answer latents will be saved

        # write rest arguments to self
        for attr in args.__dict__:
            setattr(self, attr, getattr(args, attr))

    def __repr__(self):
        """List every non-silent, non-None config entry, one per line."""
        _str = ''
        for attr in self.__dict__:
            if attr in self.__silent or getattr(self, attr) is None:
                continue
            _str += '{ %-17s }-> %s\n' % (attr, getattr(self, attr))
        return _str

    def override_from_dict(self, dict_):
        """Overwrite config entries from a plain dict (e.g. a loaded YAML)."""
        for key, value in dict_.items():
            setattr(self, key, value)

    def set_silent_attr(self):
        """Snapshot currently-existing attribute names; __repr__ hides them."""
        self.__silent = []
        for attr in self.__dict__:
            self.__silent.append(attr)

    @property
    def TRAIN_SPLITS(self):
        # Dataset splits used for training under the current task/data mode.
        return TASK_TO_SPLIT[self.TASK][self.DATA_MODE]['train_split']

    @property
    def EVAL_SPLITS(self):
        # Dataset splits used for evaluation under the current task/data mode.
        return TASK_TO_SPLIT[self.TASK][self.DATA_MODE]['eval_split']

    @property
    def FEATURE_SPLIT(self):
        # Unique image-feature splits covering all train + eval splits.
        FEATURE_SPLIT = []
        for split in self.TRAIN_SPLITS + self.EVAL_SPLITS:
            feat_split = SPLIT_TO_IMGS[split]
            if feat_split not in FEATURE_SPLIT:
                FEATURE_SPLIT.append(feat_split)
        return FEATURE_SPLIT

    @property
    def EVAL_QUESTION_PATH(self):
        # if not self.EVAL_NOW:
        #     return []
        return self.QUESTION_PATH[self.EVAL_SPLITS[0]]

    @property
    def EVAL_ANSWER_PATH(self):
        if not self.EVAL_NOW:
            return []
        return self.ANSWER_PATH[self.EVAL_SPLITS[0]]
================================================
FILE: configs/task_to_split.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: The goal of this file is to define the mapping from task and data
# mode to dataset splits.
# ------------------------------------------------------------------------------ #
class DictSafe(dict):
    """A dict that recursively wraps nested dicts and returns [] (instead of
    raising KeyError) for missing keys, so chained split lookups like
    TASK_TO_SPLIT[task][mode]['eval_split'] degrade to an empty list.
    """

    def __init__(self, data=None):
        # Fix: the original used a mutable default argument (data={}), which
        # is shared across calls; use None as the sentinel instead.
        data = {} if data is None else data
        dict.__init__(self, data)
        for key, value in data.items():
            if isinstance(value, dict):
                self[key] = DictSafe(value)

    def __getitem__(self, key):
        # Missing keys yield an empty list rather than raising.
        return self.get(key, [])
# TASK_TO_SPLIT[TASK][DATA_MODE]['train_split'] is a list of dataset split name for training
# TASK_TO_SPLIT[TASK][DATA_MODE]['eval_split'] is a list of dataset split name for evaluation
# 'pretrain' mode is used for pretrain, so it does not have 'eval_split'
# 'finetune' mode is used for finetune, heuristics generation and prompting
# Mapping: task -> data mode -> train/eval split name lists.
TASK_TO_SPLIT = {
    'ok': {
        'pretrain': {
            'train_split': ['v2train', 'v2valvg_no_ok'],
            # As the testing set of okvqa uses a subset of MSCOCO val2014 as the input images,
            # we remove this subset from the training set of pretraining to avoid data leakage.
        },
        'finetune': {
            'train_split': ['oktrain'],
            'eval_split': ['oktest'],
        }
    },
    'aok_val': {
        'pretrain': {
            'train_split': ['v2train'],
        },
        'finetune': {
            'train_split': ['aoktrain'],
            'eval_split': ['aokval'],
        }
    },
    'aok_test': {
        'pretrain': {
            'train_split': ['v2train', 'v2val', 'vg'],
        },
        'finetune': {
            # For the hidden test split, the val split is folded into training.
            'train_split': ['aoktrain', 'aokval'],
            'eval_split': ['aoktest'],
        }
    },
}
# Wrap so that lookups with missing task/mode/split keys return [] instead of raising.
TASK_TO_SPLIT = DictSafe(TASK_TO_SPLIT)
# Mapping: dataset split name -> COCO image directory (feature split) it draws from.
# OK-VQA uses COCO-2014 images; A-OKVQA uses COCO-2017 images.
SPLIT_TO_IMGS = {
    'v2train': 'train2014',
    'v2val': 'val2014',
    'v2valvg_no_ok': 'val2014',
    'vg': 'val2014',
    'oktrain': 'train2014',
    'oktest': 'val2014',
    'aoktrain': 'train2017',
    'aokval': 'val2017',
    'aoktest': 'test2017',
}
if __name__ == '__main__':
    # Demo lookup with valid keys. The original used TASK_TO_SPLIT['okvqa']['test'],
    # but 'okvqa' is not a task key: DictSafe returns [] for missing keys, and
    # [][...] then raises TypeError. Use the real 'ok'/'finetune' keys instead.
    print(TASK_TO_SPLIT['ok']['finetune']['train_split'])
================================================
FILE: datasets/.gitkeep
================================================
================================================
FILE: environment.yml
================================================
name: prophet
channels:
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch
- pytorch
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
- conda-forge
- defaults
dependencies:
- numpy=1.21.2=py39h20f2e39_0
- opt_einsum=3.3.0=pyhd8ed1ab_1
- pip=21.2.4=py39h06a4308_0
- python=3.9.11=h12debd9_2
- pytorch=1.12.0=py3.9_cuda11.3_cudnn8.3.2_0
- rich=12.5.1=py39h06a4308_0
- torchvision=0.13.0=py39_cu113
- pip:
- pyyaml==6.0
- einops==0.6.0
- huggingface-hub==0.12.1
- openai==0.18.0
- opencv-python==4.5.5.64
- pillow==9.3.0
- sentence-transformers==2.2.2
- sentencepiece==0.1.96
- tokenizers==0.11.6
- tqdm==4.63.0
- transformers==4.26.1
- git+https://github.com/openai/CLIP.git
================================================
FILE: evaluation/ans_punct.py
================================================
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# based on VQA Evaluation Code
# --------------------------------------------------------
import re
# Contraction normalization table from the official VQA evaluation code:
# maps common un-apostrophized forms to their canonical contraction.
# NOTE(review): the "somebody'd": "somebodyd" entry appears reversed relative
# to its neighbours; kept as-is for parity with the official VQA eval code.
contractions = {
    "aint": "ain't", "arent": "aren't", "cant": "can't", "couldve":
    "could've", "couldnt": "couldn't", "couldn'tve": "couldn't've",
    "couldnt've": "couldn't've", "didnt": "didn't", "doesnt":
    "doesn't", "dont": "don't", "hadnt": "hadn't", "hadnt've":
    "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't", "havent":
    "haven't", "hed": "he'd", "hed've": "he'd've", "he'dve":
    "he'd've", "hes": "he's", "howd": "how'd", "howll": "how'll",
    "hows": "how's", "Id've": "I'd've", "I'dve": "I'd've", "Im":
    "I'm", "Ive": "I've", "isnt": "isn't", "itd": "it'd", "itd've":
    "it'd've", "it'dve": "it'd've", "itll": "it'll", "let's": "let's",
    "maam": "ma'am", "mightnt": "mightn't", "mightnt've":
    "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've",
    "mustnt": "mustn't", "mustve": "must've", "neednt": "needn't",
    "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't",
    "ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat":
    "'ow's'at", "shant": "shan't", "shed've": "she'd've", "she'dve":
    "she'd've", "she's": "she's", "shouldve": "should've", "shouldnt":
    "shouldn't", "shouldnt've": "shouldn't've", "shouldn'tve":
    "shouldn't've", "somebody'd": "somebodyd", "somebodyd've":
    "somebody'd've", "somebody'dve": "somebody'd've", "somebodyll":
    "somebody'll", "somebodys": "somebody's", "someoned": "someone'd",
    "someoned've": "someone'd've", "someone'dve": "someone'd've",
    "someonell": "someone'll", "someones": "someone's", "somethingd":
    "something'd", "somethingd've": "something'd've", "something'dve":
    "something'd've", "somethingll": "something'll", "thats":
    "that's", "thered": "there'd", "thered've": "there'd've",
    "there'dve": "there'd've", "therere": "there're", "theres":
    "there's", "theyd": "they'd", "theyd've": "they'd've", "they'dve":
    "they'd've", "theyll": "they'll", "theyre": "they're", "theyve":
    "they've", "twas": "'twas", "wasnt": "wasn't", "wed've":
    "we'd've", "we'dve": "we'd've", "weve": "we've", "werent":
    "weren't", "whatll": "what'll", "whatre": "what're", "whats":
    "what's", "whatve": "what've", "whens": "when's", "whered":
    "where'd", "wheres": "where's", "whereve": "where've", "whod":
    "who'd", "whod've": "who'd've", "who'dve": "who'd've", "wholl":
    "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll",
    "whyre": "why're", "whys": "why's", "wont": "won't", "wouldve":
    "would've", "wouldnt": "wouldn't", "wouldnt've": "wouldn't've",
    "wouldn'tve": "wouldn't've", "yall": "y'all", "yall'll":
    "y'all'll", "y'allll": "y'all'll", "yall'd've": "y'all'd've",
    "y'alld've": "y'all'd've", "y'all'dve": "y'all'd've", "youd":
    "you'd", "youd've": "you'd've", "you'dve": "you'd've", "youll":
    "you'll", "youre": "you're", "youve": "you've"
}
# Number words mapped to digits, per the official VQA evaluation rules.
manual_map = {'none': '0',
              'zero': '0',
              'one': '1',
              'two': '2',
              'three': '3',
              'four': '4',
              'five': '5',
              'six': '6',
              'seven': '7',
              'eight': '8',
              'nine': '9',
              'ten': '10'}
# Articles are dropped during answer normalization.
articles = ['a', 'an', 'the']
# Fix: patterns are now raw strings; "\d" in a plain string is an invalid
# escape sequence (DeprecationWarning, SyntaxError in future Pythons). The
# compiled patterns are byte-identical to the originals.
# NOTE(review): '(?!<=\d)' looks like a typo for the negative lookbehind
# '(?<!\d)'; kept as-is for parity with the official VQA evaluation code.
period_strip = re.compile(r"(?!<=\d)(\.)(?!\d)")
comma_strip = re.compile(r"(\d)(\,)(\d)")
# Punctuation symbols handled by process_punctuation.
punct = [';', r"/", '[', ']', '"', '{', '}',
         '(', ')', '=', '+', '\\', '_', '-',
         '>', '<', '@', '`', ',', '?', '!']
def process_punctuation(inText):
    """Normalize punctuation following the official VQA evaluation rules.

    A punctuation symbol is deleted outright when it touches a space (word
    boundary) or when the text contains a digit-comma-digit pattern;
    otherwise it is replaced by a space. Periods not followed by a digit are
    stripped at the end.
    """
    outText = inText
    for p in punct:
        # Fix: 'is not None' instead of '!= None', and call .search on the
        # already-compiled pattern instead of re.search(compiled, text).
        if (p + ' ' in inText or ' ' + p in inText) \
           or (comma_strip.search(inText) is not None):
            outText = outText.replace(p, '')
        else:
            outText = outText.replace(p, ' ')
    # NOTE(review): re.UNICODE (== 32) is passed as re.sub's positional
    # 'count' argument, limiting the substitution to 32 replacements; kept
    # as-is for parity with the official VQA evaluation code.
    outText = period_strip.sub("", outText, re.UNICODE)
    return outText
def process_digit_article(inText):
    """Lowercase the text, map number words to digits, drop articles, and
    expand known contractions, per the official VQA evaluation rules.
    """
    outText = []
    tempText = inText.lower().split()
    for word in tempText:
        # Fix: the original used manual_map.setdefault(word, word), which
        # inserted every unseen word into the shared module-level map as a
        # side effect; .get performs the same lookup without mutation.
        word = manual_map.get(word, word)
        if word not in articles:
            outText.append(word)
    for wordId, word in enumerate(outText):
        if word in contractions:
            outText[wordId] = contractions[word]
    return ' '.join(outText)
def prep_ans(answer):
    """Fully normalize a raw answer string: punctuation, digits/articles,
    contractions, then remove any remaining commas.
    """
    cleaned = process_punctuation(answer)
    cleaned = process_digit_article(cleaned)
    return cleaned.replace(',', '')
================================================
FILE: evaluation/aok_utils/eval_predictions.py
================================================
import argparse
import pathlib
import json
import glob
from .load_aokvqa import load_aokvqa
def eval_aokvqa(dataset, preds, multiple_choice=False, strict=True):
    """Compute A-OKVQA accuracy as a percentage.

    Args:
        dataset: list of question dicts, or a {question_id: question dict} map.
        preds: {question_id: predicted answer string}.
        multiple_choice: score against the correct choice; otherwise use the
            VQA-style direct-answer soft accuracy.
        strict: assert every eligible question has a prediction, and (in MC
            mode) that each prediction is a valid choice.

    Returns:
        float accuracy in [0, 100]. Missing predictions count as 0.
    """
    if isinstance(dataset, list):
        dataset = {item['question_id']: item for item in dataset}

    if multiple_choice is False:
        # Direct-answer evaluation skips questions flagged as too hard for DA.
        dataset = {k: v for k, v in dataset.items()
                   if v['difficult_direct_answer'] is False}

    if strict:
        dataset_qids = set(dataset.keys())
        preds_qids = set(preds.keys())
        assert dataset_qids.issubset(preds_qids)

    if not dataset:
        # Fix: the original raised ZeroDivisionError when no eligible
        # questions remained (empty dataset or all filtered out).
        return 0.0

    # dataset = q_id (str) : dataset element (dict)
    # preds   = q_id (str) : prediction (str)
    acc = []
    for q, item in dataset.items():
        if q not in preds:
            acc.append(0.0)
            continue
        pred = preds[q]
        choices = item['choices']
        direct_answers = item['direct_answers']

        ## Multiple Choice setting
        if multiple_choice:
            if strict:
                assert pred in choices, 'Prediction must be a valid choice'
            correct_choice_idx = item['correct_choice_idx']
            acc.append(float(pred == choices[correct_choice_idx]))
        ## Direct Answer setting
        else:
            # Soft accuracy: full credit with >= 3 matching annotator answers.
            num_match = sum(pred == da for da in direct_answers)
            acc.append(min(1.0, num_match / 3.0))

    return sum(acc) / len(acc) * 100
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir')
    parser.add_argument('--split', type=str, choices=['train', 'val', 'test_w_ans'], required=True)
    parser.add_argument('--preds', type=str, required=True, dest='prediction_files')
    args = parser.parse_args()

    dataset = load_aokvqa(args.aokvqa_dir, args.split)

    # The --preds argument is a glob pattern; score each matching file.
    for prediction_file in glob.glob(args.prediction_files):
        all_preds = json.load(open(prediction_file, 'r'))

        # Split the combined prediction file into MC and DA sub-dicts.
        mc_predictions = {q: entry['multiple_choice']
                          for q, entry in all_preds.items()
                          if 'multiple_choice' in entry}
        da_predictions = {q: entry['direct_answer']
                          for q, entry in all_preds.items()
                          if 'direct_answer' in entry}

        # Multiple choice
        if mc_predictions:
            mc_acc = eval_aokvqa(dataset, mc_predictions,
                                 multiple_choice=True, strict=False)
            print(prediction_file, 'MC', mc_acc)

        # Direct Answer
        if da_predictions:
            da_acc = eval_aokvqa(dataset, da_predictions,
                                 multiple_choice=False, strict=False)
            print(prediction_file, 'DA', da_acc)
================================================
FILE: evaluation/aok_utils/load_aokvqa.py
================================================
import os
import json
def load_aokvqa(aokvqa_dir, split, version='v1p0'):
    """Load the A-OKVQA annotation JSON for one dataset split.

    Args:
        aokvqa_dir: directory containing the annotation files.
        split: one of 'train', 'val', 'test', 'test_w_ans'.
        version: dataset version tag embedded in the filename.

    Returns:
        The parsed JSON content (a list of question dicts).
    """
    assert split in ['train', 'val', 'test', 'test_w_ans']
    # Fix: the original left the file handle open (json.load(open(...)));
    # use a context manager so it is closed deterministically.
    with open(os.path.join(aokvqa_dir, f"aokvqa_{version}_{split}.json")) as f:
        dataset = json.load(f)
    return dataset
def get_coco_path(split, image_id, coco_dir):
    """Return the path of a COCO-2017 image given its split and numeric id."""
    filename = f"{image_id:012}.jpg"  # COCO filenames are zero-padded to 12 digits
    return os.path.join(coco_dir, f"{split}2017", filename)
================================================
FILE: evaluation/aok_utils/remap_predictions.py
================================================
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import argparse
import pathlib
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from .load_aokvqa import load_aokvqa
def map_to_choices(dataset, predictions, device='cpu'):
    """Snap free-form predictions onto each question's choice list.

    Any prediction not already among its question's choices is replaced by
    the choice whose GloVe sentence embedding is most cosine-similar to it.
    Mutates and returns the predictions dict.
    """
    if isinstance(dataset, list):
        dataset = {item['question_id']: item for item in dataset}

    # Fast path: nothing needs remapping.
    if all(pred in dataset[qid]['choices'] for qid, pred in predictions.items()):
        return predictions

    model = SentenceTransformer('sentence-transformers/average_word_embeddings_glove.6B.300d')
    model.to(device)
    for qid in tqdm(predictions.keys()):
        options = dataset[qid]['choices']
        if predictions[qid] in options:
            continue
        # Embed the prediction together with all choices, then pick the
        # closest choice by cosine similarity.
        embeddings = model.encode([predictions[qid]] + options, convert_to_tensor=True)
        best_idx = cos_sim(embeddings[0], embeddings[1:]).argmax().item()
        predictions[qid] = options[best_idx]

    return predictions
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir')
    parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True)
    parser.add_argument('--pred', type=argparse.FileType('r'), required=True, dest='prediction_file')
    parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file')
    args = parser.parse_args()

    dataset = load_aokvqa(args.aokvqa_dir, args.split)
    predictions = json.load(args.prediction_file)
    remapped = map_to_choices(dataset, predictions)
    json.dump(remapped, args.output_file)
================================================
FILE: evaluation/aokvqa_evaluate.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: Evaluation script for A-OKVQA
# ------------------------------------------------------------------------------ #
import json
from evaluation.aok_utils.eval_predictions import eval_aokvqa
from evaluation.aok_utils.remap_predictions import map_to_choices
from .ans_punct import prep_ans
import argparse
class AOKEvaluater:
    """Collects A-OKVQA predictions, saves them in the official format, and
    runs evaluation.

    Results are stored as a dict of
    {question_id: {'direct_answer': ...}} or {question_id: {'multiple_choice': ...}}
    depending on the current scoring mode.
    """

    def __init__(self, annotation_path: str, question_path: str):
        self.annotation_path = annotation_path
        self.question_path = question_path
        self.dataset = json.load(open(question_path, 'r'))
        self.result_file = {}
        self.result_path = None
        # Scoring mode flags; see set_mode().
        self.multiple_choice = False
        self.map_to_mc = True

    def init(self):
        """Reset the collected results.

        Fix: the original reset result_file to a list ([]), but add() stores
        entries by question-id key; any add() after a reset raised TypeError.
        Reset to a dict, matching __init__.
        """
        self.result_file = {}

    def set_mode(self, multiple_choice=None, map_to_mc=None):
        """Update scoring flags; None leaves the corresponding flag unchanged."""
        if multiple_choice is not None:
            self.multiple_choice = multiple_choice
        if map_to_mc is not None:
            self.map_to_mc = map_to_mc

    def prep_ans(self, answer):
        """Normalize an answer string with the shared VQA preprocessing."""
        return prep_ans(answer)

    def add(self, qid, answer):
        """Record one prediction under the key matching the current mode."""
        if self.multiple_choice:
            self.result_file[qid] = {
                'multiple_choice': answer,
            }
        else:
            self.result_file[qid] = {
                'direct_answer': answer,
            }

    def save(self, result_path: str):
        """Write results as JSON; optionally remap DA answers onto choices first."""
        self.result_path = result_path
        if not self.multiple_choice and self.map_to_mc:
            # Derive MC answers from DA predictions via embedding similarity.
            predictions = {qid: item['direct_answer'] for qid, item in self.result_file.items()}
            predictions = map_to_choices(self.dataset, predictions, 'cuda:0')
            for qid, answer in predictions.items():
                self.result_file[qid]['multiple_choice'] = answer
        json.dump(self.result_file, open(self.result_path, 'w'))

    def evaluate(self, logfile=None):
        """Score the saved results; save() must be called first."""
        assert self.result_path is not None, "Please save the result file first."
        direct_answer = not self.multiple_choice
        multiple_choice = self.multiple_choice or self.map_to_mc
        eval_str = _evaluate(self.dataset, self.result_file, direct_answer=direct_answer, multiple_choice=multiple_choice)
        print(eval_str)
        if logfile is not None:
            print(eval_str + '\n', file=logfile)
def _evaluate(dataset, results, direct_answer=True, multiple_choice=True):
    """Score a results dict against the dataset and return a summary string.

    results maps question_id -> {'direct_answer': ..., 'multiple_choice': ...};
    the two flags select which sub-evaluations to run.
    """
    lines = []
    if direct_answer:
        # Direct Answer Evaluation
        da_preds = {qid: entry['direct_answer'] for qid, entry in results.items()}
        da_acc = eval_aokvqa(
            dataset,
            da_preds,
            multiple_choice=False,
            strict=False
        )
        lines.append(f'DA: {da_acc: .2f}\n')
    if multiple_choice:
        # Multiple Choice Evaluation
        mc_preds = {qid: entry['multiple_choice'] for qid, entry in results.items()}
        mc_acc = eval_aokvqa(
            dataset,
            mc_preds,
            multiple_choice=True,
            strict=False
        )
        lines.append(f'MC: {mc_acc: .2f}\n')
    return ''.join(lines)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Evaluate A-OKVQA result file.')
    parser.add_argument('--dataset_path', type=str, required=True)
    parser.add_argument('--result_path', type=str, required=True)
    parser.add_argument('--direct_answer', action='store_true')
    parser.add_argument('--multiple_choice', action='store_true')
    args = parser.parse_args()

    dataset = json.load(open(args.dataset_path, 'r'))
    result = json.load(open(args.result_path, 'r'))
    report = _evaluate(dataset, result,
                       direct_answer=args.direct_answer,
                       multiple_choice=args.multiple_choice)
    print(report)
================================================
FILE: evaluation/okvqa_evaluate.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: Evaluation script for OK-VQA
# ------------------------------------------------------------------------------ #
import json
from evaluation.vqa_utils.vqa import VQA
from evaluation.vqa_utils.vqaEval import VQAEval
from .ans_punct import prep_ans
import argparse
class OKEvaluater:
    """Collects OK-VQA predictions in the official VQA result format
    ([{'question_id': int, 'answer': str}, ...]) and runs the evaluation.
    """

    def __init__(self, annotation_path: str, question_path: str):
        self.annotation_path = annotation_path
        self.question_path = question_path
        # print(f'== Annotation file: {self.annotation_path}')
        # print(f'== Question file: {self.question_path}')
        self.result_file = []
        self.result_path = None

    def init(self):
        """Reset the collected results."""
        self.result_file = []

    def prep_ans(self, answer):
        """Normalize an answer string with the shared VQA preprocessing."""
        return prep_ans(answer)

    def add(self, qid, answer):
        """Record one prediction; question ids are coerced to int per the format."""
        self.result_file.append({
            'question_id': int(qid),
            'answer': answer
        })

    def save(self, result_path: str):
        """Write the result list as JSON and remember the path for evaluate()."""
        self.result_path = result_path
        json.dump(self.result_file, open(self.result_path, 'w'))

    def evaluate(self, logfile=None):
        """Run the official evaluation; save() must be called first."""
        assert self.result_path is not None, "Please save the result file first."
        eval_str = _evaluate(self.annotation_path, self.question_path, self.result_path)
        print()
        print(eval_str)
        if logfile is not None:
            print(eval_str + '\n', file=logfile)
def _evaluate(annotation_file: str, question_file: str, result_file: str):
    """Run the official VQA evaluation on an OK-VQA result file and return a
    formatted per-category accuracy report.
    """
    # print(f'== Annotation file: {annotation_file}')
    # print(f'== Question file: {question_file}')
    vqa = VQA(annotation_file, question_file)
    res = vqa.loadRes(result_file, question_file)
    evaluator = VQAEval(vqa, res, n=2)
    evaluator.evaluate()

    # OK-VQA knowledge-category ids -> human-readable names.
    question_types = {
        "eight": "Plants and Animals",
        "nine": "Science and Technology",
        "four": "Sports and Recreation",
        "six": "Geography, History, Language and Culture",
        "two": "Brands, Companies and Products",
        "one": "Vehicles and Transportation",
        "five": "Cooking and Food",
        "ten": "Weather and Climate",
        "seven": "People and Everyday life",
        "three": "Objects, Material and Clothing"
        # "other": "Other",
    }

    report = "Overall Accuracy is: %.02f\n" % (evaluator.accuracy['overall'])
    report += f"{'Question Type':40s}\t{'Prophet'}\n"
    for type_id, type_name in question_types.items():
        report += "%-40s\t%.02f\n" % (type_name, evaluator.accuracy['perQuestionType'][type_id])
    # print(report)
    return report
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Evaluate OK-VQA result file.')
    parser.add_argument('--annotation_path', type=str, required=True)
    parser.add_argument('--question_path', type=str, required=True)
    parser.add_argument('--result_path', type=str, required=True)
    args = parser.parse_args()

    report = _evaluate(args.annotation_path, args.question_path, args.result_path)
    print(report)
================================================
FILE: evaluation/vqa_utils/vqa.py
================================================
__author__ = 'aagrawal'
__version__ = '0.9'
# Interface for accessing the VQA dataset.
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py).
# The following functions are defined:
# VQA - VQA class that loads VQA annotation file and prepares data structures.
# getQuesIds - Get question ids that satisfy given filter conditions.
# getImgIds - Get image ids that satisfy given filter conditions.
# loadQA - Load questions and answers with the specified question ids.
# showQA - Display the specified questions and answers.
# loadRes - Load result file and create result object.
# Help on each function can be accessed by: "help(COCO.function)"
import json
import datetime
import copy
class VQA:
    """Helper class for reading and indexing a VQA annotation/question pair
    (based on the MSCOCO Python API); see the module header for the method list.
    """

    def __init__(self, annotation_file=None, question_file=None):
        """
        Constructor of VQA helper class for reading and visualizing questions and answers.
        :param annotation_file (str): location of VQA annotation file
        :return:
        """
        # load dataset
        self.dataset = {}
        self.questions = {}
        self.qa = {}
        self.qqa = {}
        self.imgToQA = {}
        if not annotation_file == None and not question_file == None:
            print('loading VQA annotations and questions into memory...')
            time_t = datetime.datetime.utcnow()
            dataset = json.load(open(annotation_file, 'r'))
            questions = json.load(open(question_file, 'r'))
            print(datetime.datetime.utcnow() - time_t)
            self.dataset = dataset
            self.questions = questions
            self.createIndex()

    def createIndex(self):
        """Build the lookup indices: imgToQA (image_id -> annotation list),
        qa (question_id -> annotation), qqa (question_id -> question)."""
        # create index
        print('creating index...')
        imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
        qa = {ann['question_id']: [] for ann in self.dataset['annotations']}
        qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
        for ann in self.dataset['annotations']:
            imgToQA[ann['image_id']] += [ann]
            # qa/qqa are initialized to [] but each entry is overwritten with
            # a single dict below (question ids are unique).
            qa[ann['question_id']] = ann
        for ques in self.questions['questions']:
            qqa[ques['question_id']] = ques
        print('index created!')
        # create class members
        self.qa = qa
        self.qqa = qqa
        self.imgToQA = imgToQA

    def info(self):
        """
        Print information about the VQA annotation file.
        :return:
        """
        for key, value in self.dataset['info'].items():
            print('%s: %s' % (key, value))

    def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
        """
        Get question ids that satisfy given filter conditions. default skips that filter
        :param imgIds (int array) : get question ids for given imgs
               quesTypes (str array) : get question ids for given question types
               ansTypes (str array) : get question ids for given answer types
        :return: ids (int array) : integer array of question ids
        """
        # Scalars are wrapped into one-element lists so filtering is uniform.
        imgIds = imgIds if type(imgIds) == list else [imgIds]
        quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
        ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]

        if len(imgIds) == len(quesTypes) == len(ansTypes) == 0:
            # No filters: every annotation qualifies.
            anns = self.dataset['annotations']
        else:
            if not len(imgIds) == 0:
                anns = sum([self.imgToQA[imgId] for imgId in imgIds if imgId in self.imgToQA], [])
            else:
                anns = self.dataset['annotations']
            anns = anns if len(quesTypes) == 0 else [ann for ann in anns if ann['question_type'] in quesTypes]
            anns = anns if len(ansTypes) == 0 else [ann for ann in anns if ann['answer_type'] in ansTypes]
        ids = [ann['question_id'] for ann in anns]
        return ids

    def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
        """
        Get image ids that satisfy given filter conditions. default skips that filter
        :param quesIds (int array) : get image ids for given question ids
               quesTypes (str array) : get image ids for given question types
               ansTypes (str array) : get image ids for given answer types
        :return: ids (int array) : integer array of image ids
        """
        quesIds = quesIds if type(quesIds) == list else [quesIds]
        quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
        ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]

        if len(quesIds) == len(quesTypes) == len(ansTypes) == 0:
            anns = self.dataset['annotations']
        else:
            if not len(quesIds) == 0:
                anns = sum([self.qa[quesId] for quesId in quesIds if quesId in self.qa], [])
            else:
                anns = self.dataset['annotations']
            anns = anns if len(quesTypes) == 0 else [ann for ann in anns if ann['question_type'] in quesTypes]
            anns = anns if len(ansTypes) == 0 else [ann for ann in anns if ann['answer_type'] in ansTypes]
        ids = [ann['image_id'] for ann in anns]
        return ids

    def loadQA(self, ids=[]):
        """
        Load questions and answers with the specified question ids.
        :param ids (int array) : integer ids specifying question ids
        :return: qa (object array) : loaded qa objects
        """
        if type(ids) == list:
            return [self.qa[id] for id in ids]
        elif type(ids) == int:
            return [self.qa[ids]]

    def showQA(self, anns):
        """
        Display the specified annotations.
        :param anns (array of object): annotations to display
        :return: None
        """
        if len(anns) == 0:
            return 0
        for ann in anns:
            quesId = ann['question_id']
            print("Question: %s" % (self.qqa[quesId]['question']))
            for ans in ann['answers']:
                print("Answer %d: %s" % (ans['answer_id'], ans['answer']))

    def loadRes(self, resFile, quesFile):
        """
        Load result file and return a result object.
        :param resFile (str) : file name of result file
        :return: res (obj) : result api object
        """
        res = VQA()
        res.questions = json.load(open(quesFile))
        # Copy dataset metadata from this object's question file so the
        # result object looks like a regular VQA dataset.
        res.dataset['info'] = copy.deepcopy(self.questions['info'])
        res.dataset['task_type'] = copy.deepcopy(self.questions['task_type'])
        res.dataset['data_type'] = copy.deepcopy(self.questions['data_type'])
        res.dataset['data_subtype'] = copy.deepcopy(self.questions['data_subtype'])
        res.dataset['license'] = copy.deepcopy(self.questions['license'])
        print('Loading and preparing results...     ')
        time_t = datetime.datetime.utcnow()
        anns = json.load(open(resFile))
        assert type(anns) == list, 'results is not an array of objects'
        annsQuesIds = [ann['question_id'] for ann in anns]
        # Results must cover exactly the annotated question ids.
        assert set(annsQuesIds) == set(self.getQuesIds()), \
            'Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file.'
        for ann in anns:
            quesId = ann['question_id']
            if res.dataset['task_type'] == 'Multiple Choice':
                assert ann['answer'] in self.qqa[quesId][
                    'multiple_choices'], 'predicted answer is not one of the multiple choices'
            # Enrich each result entry (mutated in place) with metadata from
            # the corresponding ground-truth annotation.
            qaAnn = self.qa[quesId]
            ann['image_id'] = qaAnn['image_id']
            ann['question_type'] = qaAnn['question_type']
            ann['answer_type'] = qaAnn['answer_type']
        print('DONE (t=%0.2fs)' % ((datetime.datetime.utcnow() - time_t).total_seconds()))
        res.dataset['annotations'] = anns
        res.createIndex()
        return res
================================================
FILE: evaluation/vqa_utils/vqaEval.py
================================================
# coding=utf-8
__author__='aagrawal'
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).
import sys
import re
class VQAEval:
    """Official VQA accuracy evaluator.

    The predicted answer is normalized (punctuation, number words,
    articles, contractions) and scored against the human annotations as
    acc = min(#matching human answers / 3, 1), averaged over the
    leave-one-annotator-out subsets of the ground-truth answers.
    """
    def __init__(self, vqa, vqaRes, n=2):
        # n: number of decimal places kept in reported accuracies.
        self.n = n
        self.accuracy = {}
        self.evalQA = {}        # quesId -> accuracy (percentage)
        self.evalQuesType = {}  # quesType -> {quesId -> accuracy}
        self.evalAnsType = {}   # ansType -> {quesId -> accuracy}
        self.vqa = vqa          # ground-truth VQA object
        self.vqaRes = vqaRes    # predictions wrapped in a VQA result object
        self.params = {'question_id': vqa.getQuesIds()}
        # Maps apostrophe-less contractions back to canonical forms.
        # NOTE(review): the entry "somebody'd": "somebodyd" appears to have
        # key and value swapped relative to its neighbours — kept as-is to
        # match the reference VQA evaluation script.
        self.contractions = {"aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't",
            "couldn'tve": "couldn't've", "couldnt've": "couldn't've", "didnt": "didn't", "doesnt": "doesn't", "dont": "don't", "hadnt": "hadn't",
            "hadnt've": "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't", "havent": "haven't", "hed": "he'd", "hed've": "he'd've",
            "he'dve": "he'd've", "hes": "he's", "howd": "how'd", "howll": "how'll", "hows": "how's", "Id've": "I'd've", "I'dve": "I'd've",
            "Im": "I'm", "Ive": "I've", "isnt": "isn't", "itd": "it'd", "itd've": "it'd've", "it'dve": "it'd've", "itll": "it'll", "let's": "let's",
            "maam": "ma'am", "mightnt": "mightn't", "mightnt've": "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've",
            "mustnt": "mustn't", "mustve": "must've", "neednt": "needn't", "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't",
            "ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat": "'ow's'at", "shant": "shan't", "shed've": "she'd've", "she'dve": "she'd've",
            "she's": "she's", "shouldve": "should've", "shouldnt": "shouldn't", "shouldnt've": "shouldn't've", "shouldn'tve": "shouldn't've",
            "somebody'd": "somebodyd", "somebodyd've": "somebody'd've", "somebody'dve": "somebody'd've", "somebodyll": "somebody'll",
            "somebodys": "somebody's", "someoned": "someone'd", "someoned've": "someone'd've", "someone'dve": "someone'd've",
            "someonell": "someone'll", "someones": "someone's", "somethingd": "something'd", "somethingd've": "something'd've",
            "something'dve": "something'd've", "somethingll": "something'll", "thats": "that's", "thered": "there'd", "thered've": "there'd've",
            "there'dve": "there'd've", "therere": "there're", "theres": "there's", "theyd": "they'd", "theyd've": "they'd've",
            "they'dve": "they'd've", "theyll": "they'll", "theyre": "they're", "theyve": "they've", "twas": "'twas", "wasnt": "wasn't",
            "wed've": "we'd've", "we'dve": "we'd've", "weve": "we've", "werent": "weren't", "whatll": "what'll", "whatre": "what're",
            "whats": "what's", "whatve": "what've", "whens": "when's", "whered": "where'd", "wheres": "where's", "whereve": "where've",
            "whod": "who'd", "whod've": "who'd've", "who'dve": "who'd've", "wholl": "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll",
            "whyre": "why're", "whys": "why's", "wont": "won't", "wouldve": "would've", "wouldnt": "wouldn't", "wouldnt've": "wouldn't've",
            "wouldn'tve": "wouldn't've", "yall": "y'all", "yall'll": "y'all'll", "y'allll": "y'all'll", "yall'd've": "y'all'd've",
            "y'alld've": "y'all'd've", "y'all'dve": "y'all'd've", "youd": "you'd", "youd've": "you'd've", "you'dve": "you'd've",
            "youll": "you'll", "youre": "you're", "youve": "you've"}
        # Spells out number words as digits during normalization.
        self.manualMap = { 'none': '0',
            'zero': '0',
            'one': '1',
            'two': '2',
            'three': '3',
            'four': '4',
            'five': '5',
            'six': '6',
            'seven': '7',
            'eight': '8',
            'nine': '9',
            'ten': '10'
        }
        # Articles are dropped during normalization.
        self.articles = ['a',
            'an',
            'the'
        ]
        # Strips periods that are not part of a number.
        # NOTE(review): "(?!<=\d)" is a negative LOOKAHEAD on the literal
        # characters "<=" + digit, not the presumably intended negative
        # lookbehind "(?<!\d)". This typo exists in the reference VQA eval
        # code and is preserved here so scores stay comparable.
        self.periodStrip = re.compile("(?!<=\d)(\.)(?!\d)")
        self.commaStrip = re.compile("(\d)(,)(\d)")
        # Punctuation handled by processPunctuation().
        self.punct = [';', r"/", '[', ']', '"', '{', '}',
            '(', ')', '=', '+', '\\', '_', '-',
            '>', '<', '@', '`', ',', '?', '!']
    def evaluate(self, quesIds=None):
        """Compute accuracies for the given question ids (default: all)."""
        if quesIds == None:
            quesIds = [quesId for quesId in self.params['question_id']]
        gts = {}
        res = {}
        for quesId in quesIds:
            gts[quesId] = self.vqa.qa[quesId]
            res[quesId] = self.vqaRes.qa[quesId]
        # =================================================
        # Compute accuracy
        # =================================================
        accQA = []
        accQuesType = {}
        accAnsType = {}
        print ("computing accuracy")
        step = 0
        for quesId in quesIds:
            # Normalize the predicted answer.
            resAns = res[quesId]['answer']
            resAns = resAns.replace('\n', ' ')
            resAns = resAns.replace('\t', ' ')
            resAns = resAns.strip()
            resAns = self.processPunctuation(resAns)
            resAns = self.processDigitArticle(resAns)
            gtAcc = []
            gtAnswers = [ans['answer'] for ans in gts[quesId]['answers']]
            # GT answers are only normalized when annotators disagree
            # (mirrors the reference implementation).
            if len(set(gtAnswers)) > 1:
                for ansDic in gts[quesId]['answers']:
                    ansDic['answer'] = self.processPunctuation(ansDic['answer'])
            # Leave-one-annotator-out: each subset scores
            # min(#matching other annotators / 3, 1).
            for gtAnsDatum in gts[quesId]['answers']:
                otherGTAns = [item for item in gts[quesId]['answers'] if item!=gtAnsDatum]
                matchingAns = [item for item in otherGTAns if item['answer']==resAns]
                acc = min(1, float(len(matchingAns))/3)
                gtAcc.append(acc)
            quesType = gts[quesId]['question_type']
            ansType = gts[quesId]['answer_type']
            avgGTAcc = float(sum(gtAcc))/len(gtAcc)
            accQA.append(avgGTAcc)
            if quesType not in accQuesType:
                accQuesType[quesType] = []
            accQuesType[quesType].append(avgGTAcc)
            if ansType not in accAnsType:
                accAnsType[ansType] = []
            accAnsType[ansType].append(avgGTAcc)
            self.setEvalQA(quesId, avgGTAcc)
            self.setEvalQuesType(quesId, quesType, avgGTAcc)
            self.setEvalAnsType(quesId, ansType, avgGTAcc)
            # Refresh the progress bar every 100 questions.
            if step%100 == 0:
                self.updateProgress(step/float(len(quesIds)))
            step = step + 1
        self.setAccuracy(accQA, accQuesType, accAnsType)
        print ("Done computing accuracy")
    def processPunctuation(self, inText):
        """Strip or space-replace punctuation following the reference VQA rules."""
        outText = inText
        for p in self.punct:
            # Delete the symbol when it touches a space (or the text has a
            # digit,digit pattern); otherwise replace it with a space.
            if (p + ' ' in inText or ' ' + p in inText) or (re.search(self.commaStrip, inText) != None):
                outText = outText.replace(p, '')
            else:
                outText = outText.replace(p, ' ')
        # NOTE(review): re.UNICODE is passed positionally here, which is
        # Pattern.sub's `count` argument (count == 32), not a flag; kept
        # as-is to match the reference implementation.
        outText = self.periodStrip.sub("",
            outText,
            re.UNICODE)
        return outText
    def processDigitArticle(self, inText):
        """Lower-case, map number words to digits, drop articles, and
        restore apostrophes in known contractions."""
        outText = []
        tempText = inText.lower().split()
        for word in tempText:
            word = self.manualMap.setdefault(word, word)
            if word not in self.articles:
                outText.append(word)
            else:
                pass
        for wordId, word in enumerate(outText):
            if word in self.contractions:
                outText[wordId] = self.contractions[word]
        outText = ' '.join(outText)
        return outText
    def setAccuracy(self, accQA, accQuesType, accAnsType):
        """Aggregate per-question accuracies into overall / per-type scores."""
        self.accuracy['overall'] = round(100*float(sum(accQA))/len(accQA), self.n)
        self.accuracy['perQuestionType'] = {quesType: round(100*float(sum(accQuesType[quesType]))/len(accQuesType[quesType]), self.n) for quesType in accQuesType}
        self.accuracy['perAnswerType'] = {ansType: round(100*float(sum(accAnsType[ansType]))/len(accAnsType[ansType]), self.n) for ansType in accAnsType}
    def setEvalQA(self, quesId, acc):
        # Store per-question accuracy as a percentage.
        self.evalQA[quesId] = round(100*acc, self.n)
    def setEvalQuesType(self, quesId, quesType, acc):
        # Store per-question accuracy bucketed by question type.
        if quesType not in self.evalQuesType:
            self.evalQuesType[quesType] = {}
        self.evalQuesType[quesType][quesId] = round(100*acc, self.n)
    def setEvalAnsType(self, quesId, ansType, acc):
        # Store per-question accuracy bucketed by answer type.
        if ansType not in self.evalAnsType:
            self.evalAnsType[ansType] = {}
        self.evalAnsType[ansType][quesId] = round(100*acc, self.n)
    def updateProgress(self, progress):
        """Render a simple textual progress bar to stdout.

        progress is clamped to [0, 1]; non-numeric input resets to 0 with
        an error status.
        """
        barLength = 20
        status = ""
        if isinstance(progress, int):
            progress = float(progress)
        if not isinstance(progress, float):
            progress = 0
            status = "error: progress var must be float\r\n"
        if progress < 0:
            progress = 0
            status = "Halt...\r\n"
        if progress >= 1:
            progress = 1
            status = "Done...\r\n"
        block = int(round(barLength*progress))
        # NOTE(review): "Finshed" typo is in a runtime string; left
        # untouched by this documentation-only pass.
        text = "\rFinshed Percent: [{0}] {1}% {2}".format( "#"*block + "-"*(barLength-block), int(progress*100), status)
        sys.stdout.write(text)
        sys.stdout.flush()
================================================
FILE: main.py
================================================
import argparse
import yaml
import torch
from evaluation.okvqa_evaluate import OKEvaluater
from evaluation.aokvqa_evaluate import AOKEvaluater
from configs.task_cfgs import Cfgs
from prophet import get_args, get_runner
# Parse CLI arguments, then merge the YAML file into the task config.
args = get_args()
__C = Cfgs(args)
with open(args.cfg_file, 'r') as f:
    __C.override_from_dict(yaml.load(f, Loader=yaml.FullLoader))
print(__C)

# Pick the evaluater: pretraining needs none; otherwise choose by task family.
if __C.RUN_MODE == 'pretrain':
    evaluater = None
else:
    evaluater_cls = AOKEvaluater if 'aok' in __C.TASK else OKEvaluater
    evaluater = evaluater_cls(
        __C.EVAL_ANSWER_PATH,
        __C.EVAL_QUESTION_PATH,
    )

# Build the mode-specific runner and launch it.
runner = get_runner(__C, evaluater)
runner.run()
================================================
FILE: misc/tree.txt
================================================
prophet
├── assets
│ ├── answer_aware_examples_okvqa.json
│ ├── answer_dict_aokvqa.json
│ ├── answer_dict_okvqa.json
│ ├── answer_dict_vqav2.json
│ ├── candidates_aokvqa_test.json
│ ├── candidates_aokvqa_val.json
│ ├── candidates_okvqa.json
│ ├── captions_aokvqa.json
│ ├── captions_okvqa.json
│ ├── examples_aokvqa_test.json.json
│ └── examples_aokvqa_val.json.json
├── ckpts
│ ├── mcan_ft_aokvqa_test.pkl
│ ├── mcan_ft_aokvqa_val.pkl
│ ├── mcan_ft_okvqa.pkl
│ ├── mcan_pt_aokvqa_test.pkl
│ ├── mcan_pt_aokvqa_val.pkl
│ └── mcan_pt_okvqa.pkl
├── configs
│ ├── finetune.yml
│ ├── path_cfgs.py
│ ├── pretrain.yml
│ ├── prompt.yml
│ ├── task_cfgs.py
│ └── task_to_split.py
├── datasets
│ ├── aokvqa
│ │ ├── aokvqa_v1p0_test.json
│ │ ├── aokvqa_v1p0_train.json
│ │ └── aokvqa_v1p0_val.json
│ ├── coco2014
│ ├── coco2014_feats
│ ├── coco2017
│ ├── coco2017_feats
│ ├── okvqa
│ │ ├── mscoco_train2014_annotations.json
│ │ ├── mscoco_val2014_annotations.json
│ │ ├── OpenEnded_mscoco_train2014_questions.json
│ │ └── OpenEnded_mscoco_val2014_questions.json
│ └── vqav2
│ ├── v2_mscoco_train2014_annotations.json
│ ├── v2_mscoco_val2014_annotations.json
│ ├── v2_OpenEnded_mscoco_train2014_questions.json
│ ├── v2_OpenEnded_mscoco_val2014_questions.json
│ ├── v2valvg_no_ok_annotations.json
│ ├── v2valvg_no_ok_questions.json
│ ├── vg_annotations.json
│ └── vg_questions.json
├── environment.yml
├── evaluation
│ ├── ans_punct.py
│ ├── aok_utils
│ │ ├── eval_predictions.py
│ │ ├── load_aokvqa.py
│ │ └── remap_predictions.py
│ ├── aokvqa_evaluate.py
│ ├── okvqa_evaluate.py
│ └── vqa_utils
│ ├── vqaEval.py
│ └── vqa.py
├── main.py
├── misc
│ └── framework.png
├── outputs
│ ├── ckpts
│ ├── logs
│ └── results
├── preds
│ ├── mcan_530_okvqa.json
│ └── prophet_611_okvqa.json
├── prophet
│ ├── __init__.py
│ ├── stage1
│ │ ├── finetune.py
│ │ ├── heuristics.py
│ │ ├── model
│ │ │ ├── layers.py
│ │ │ ├── mcan_for_finetune.py
│ │ │ ├── mcan.py
│ │ │ ├── net_utils.py
│ │ │ └── rope2d.py
│ │ ├── pretrain.py
│ │ └── utils
│ │ ├── load_data.py
│ │ └── optim.py
│ └── stage2
│ ├── prompt.py
│ └── utils
│ ├── data_utils.py
│ └── fancy_pbar.py
├── README.md
├── scripts
│ ├── evaluate_model.sh
│ ├── extract_img_feats.sh
│ ├── finetune.sh
│ ├── heuristics_gen.sh
│ ├── pretrain.sh
│ └── prompt.sh
└── tools
├── extract_img_feats.py
└── transforms.py
================================================
FILE: outputs/ckpts/.gitkeep
================================================
================================================
FILE: outputs/logs/.gitkeep
================================================
================================================
FILE: outputs/results/.gitkeep
================================================
================================================
FILE: preds/.gitkeep
================================================
================================================
FILE: prophet/__init__.py
================================================
__author__ = 'Zhenwei Shao'
__version__ = '1.0'
import argparse
def get_args(argv=None):
    """Parse the command-line arguments shared by all Prophet run modes.

    :param argv: optional list of argument strings; defaults to sys.argv
        when None (new optional parameter, backward compatible).
    :return: argparse.Namespace whose UPPERCASE destinations match the
        config attributes they override.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', dest='TASK', help="task name, one of ['ok', 'aok_val', 'aok_test']", type=str, required=True)
    parser.add_argument('--run_mode', dest='RUN_MODE', help="run mode, one of ['pretrain', 'finetune', 'finetune_test', 'heuristics', 'prompt']", type=str, required=True)
    parser.add_argument('--cfg', dest='cfg_file', help='config file', type=str, required=True)
    parser.add_argument('--version', dest='VERSION', help='version name, output folder will be named as version name', type=str, required=True)
    parser.add_argument('--ckpt_path', dest='CKPT_PATH', help='checkpoint path for test', type=str, default=None)
    parser.add_argument('--pretrained_model', dest='PRETRAINED_MODEL_PATH', help='pretrained model path', type=str, default=None)
    parser.add_argument('--debug', dest='DEBUG', help='debug mode', action='store_true')
    parser.add_argument('--resume', dest='RESUME', help='resume previous run', action='store_true')
    parser.add_argument('--gpu', dest='GPU', help='gpu id', type=str, default=None)
    # help text fixed: previously said 'random seed' (copy-paste error)
    parser.add_argument('--grad_accu', dest='GRAD_ACCU_STEPS', help='gradient accumulation steps', type=int, default=None)
    parser.add_argument('--seed', dest='SEED', help='random seed', type=int, default=99)
    parser.add_argument('--candidate_num', dest='CANDIDATE_NUM', help='topk candidates', type=int, default=None)
    parser.add_argument('--example_num', dest='EXAMPLE_NUM', help='number of most similar examples to be searched, default: 200', type=int, default=None)
    parser.add_argument('--examples_path', dest='EXAMPLES_PATH', help='answer-aware example file path, default: "assets/answer_aware_examples_for_ok.json"', type=str, default=None)
    parser.add_argument('--candidates_path', dest='CANDIDATES_PATH', help='candidates file path, default: "assets/candidates_for_ok.json"', type=str, default=None)
    parser.add_argument('--captions_path', dest='CAPTIONS_PATH', help='captions file path, default: "assets/captions_for_ok.json"', type=str, default=None)
    parser.add_argument('--openai_key', dest='OPENAI_KEY', help='openai api key', type=str, default=None)
    args = parser.parse_args(argv)
    return args
def get_runner(__C, evaluater):
    """Instantiate the Runner class matching ``__C.RUN_MODE``.

    The stage-specific module is imported lazily inside the selected
    branch so only the needed stage's dependencies are loaded.

    :raises NotImplementedError: for an unrecognized run mode.
    """
    mode = __C.RUN_MODE
    if mode == 'pretrain':
        from .stage1.pretrain import Runner
    elif mode in ('finetune', 'finetune_test'):
        from .stage1.finetune import Runner
    elif mode == 'heuristics':
        from .stage1.heuristics import Runner
    elif mode == 'prompt':
        from .stage2.prompt import Runner
    else:
        raise NotImplementedError
    return Runner(__C, evaluater)
================================================
FILE: prophet/stage1/finetune.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: Runner that handles the finetuning and evaluation process
# ------------------------------------------------------------------------------ #
import os, sys
# sys.path.append(os.getcwd())
from datetime import datetime
import pickle, random, math, time
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
import argparse
from pathlib import Path
from copy import deepcopy
import yaml
from configs.task_cfgs import Cfgs
from .utils.load_data import CommonData, DataSet
from .model.mcan_for_finetune import MCANForFinetune
from .utils.optim import get_optim_for_finetune as get_optim
class Runner(object):
    """Runner that handles finetuning and evaluation of the stage-1 MCAN model.

    Modes (selected via __C.RUN_MODE in run()):
      * 'finetune'      -- train on TRAIN_SPLITS, optionally eval each epoch
      * 'finetune_test' -- evaluate a checkpoint on EVAL_SPLITS
    """

    def __init__(self, __C, evaluater):
        # __C: task configuration object.
        # evaluater: dataset-specific answer evaluater used by eval().
        self.__C = __C
        self.evaluater = evaluater

    def train(self, train_set, eval_set=None):
        """Finetune the model on ``train_set``.

        :param train_set: training DataSet
        :param eval_set: optional DataSet evaluated after every epoch
        """
        data_size = train_set.data_size
        # Define the MCAN model
        net = MCANForFinetune(self.__C, train_set.ans_size)
        ## load the pretrained model
        if self.__C.PRETRAINED_MODEL_PATH is not None:
            print(f'Loading pretrained model from {self.__C.PRETRAINED_MODEL_PATH}')
            ckpt = torch.load(self.__C.PRETRAINED_MODEL_PATH, map_location='cpu')
            # strict=False: the finetune head may differ from the pretrained one
            net.load_state_dict(ckpt['state_dict'], strict=False)
            net.parameter_init()
            print('Finish loading.')
        # Define the optimizer
        if self.__C.RESUME:
            raise NotImplementedError('Resume training is not needed as the finetuning is fast')
        else:
            optim = get_optim(self.__C, net)
            start_epoch = 0
        # load to gpu
        net.cuda()
        # Define the multi-gpu training if needed
        if self.__C.N_GPU > 1:
            net = nn.DataParallel(net, device_ids=self.__C.GPU_IDS)
        # Define the binary cross entropy loss (summed, normalized in logging)
        loss_fn = torch.nn.BCEWithLogitsLoss(reduction='sum')
        epoch_loss = 0
        # Define multi-thread dataloader
        dataloader = Data.DataLoader(
            train_set,
            batch_size=self.__C.BATCH_SIZE,
            shuffle=True,
            num_workers=self.__C.NUM_WORKERS,
            pin_memory=self.__C.PIN_MEM,
            drop_last=True
        )
        # Training script
        for epoch in range(start_epoch, self.__C.MAX_EPOCH):
            net.train()
            # Save log information
            with open(self.__C.LOG_PATH, 'a+') as logfile:
                logfile.write(
                    f'nowTime: {datetime.now():%Y-%m-%d %H:%M:%S}\n'
                )
            time_start = time.time()
            # Iteration
            for step, input_tuple in enumerate(dataloader):
                iteration_loss = 0
                optim.zero_grad()
                input_tuple = [x.cuda() for x in input_tuple]
                # Gradient accumulation: split the batch into sub-batches so
                # large effective batch sizes fit in GPU memory.
                SUB_BATCH_SIZE = self.__C.BATCH_SIZE // self.__C.GRAD_ACCU_STEPS
                for accu_step in range(self.__C.GRAD_ACCU_STEPS):
                    sub_tuple = [x[accu_step * SUB_BATCH_SIZE:
                        (accu_step + 1) * SUB_BATCH_SIZE] for x in input_tuple]
                    # the last tensor of the tuple is the answer target
                    sub_ans_iter = sub_tuple[-1]
                    pred = net(sub_tuple[:-1])
                    loss = loss_fn(pred, sub_ans_iter)
                    loss.backward()
                    loss_item = loss.item()
                    iteration_loss += loss_item
                    epoch_loss += loss_item
                print("\r[version %s][epoch %2d][step %4d/%4d][Task %s][Mode %s] loss: %.4f, lr: %.2e" % (
                    self.__C.VERSION,
                    epoch + 1,
                    step,
                    int(data_size / self.__C.BATCH_SIZE),
                    self.__C.TASK,
                    self.__C.RUN_MODE,
                    iteration_loss / self.__C.BATCH_SIZE,
                    optim.current_lr(),
                ), end=' ')
                optim.step()
            time_end = time.time()
            print('Finished in {}s'.format(int(time_end - time_start)))
            # Logging
            with open(self.__C.LOG_PATH, 'a+') as logfile:
                logfile.write(f'epoch = {epoch + 1} loss = {epoch_loss / data_size}\nlr = {optim.current_lr()}\n\n')
            optim.schedule_step(epoch)
            # Save checkpoint (unwrap DataParallel when multi-gpu)
            state = {
                'state_dict': net.state_dict() if self.__C.N_GPU == 1 \
                    else net.module.state_dict(),
                'optimizer': optim.optimizer.state_dict(),
                'warmup_lr_scale': optim.warmup_lr_scale,
                'decay_lr_scale': optim.decay_lr_scale,
            }
            torch.save(
                state,
                f'{self.__C.CKPTS_DIR}/epoch{epoch + 1}.pkl'
            )
            # Eval after every epoch
            if eval_set is not None:
                self.eval(
                    eval_set,
                    net,
                    eval_now=True
                )
            epoch_loss = 0

    # Evaluation
    @torch.no_grad()
    def eval(self, dataset, net=None, eval_now=False):
        """Run inference on ``dataset`` and log answers to the evaluater.

        :param dataset: DataSet to evaluate on
        :param net: an already-built network; when None, a model is loaded
            from __C.CKPT_PATH
        :param eval_now: if True, also compute metrics and append them to
            the log file
        """
        data_size = dataset.data_size
        if net is None:
            # Load parameters
            path = self.__C.CKPT_PATH
            print('Loading ckpt {}'.format(path))
            net = MCANForFinetune(self.__C, dataset.ans_size)
            ckpt = torch.load(path, map_location='cpu')
            net.load_state_dict(ckpt['state_dict'], strict=False)
            net.cuda()
            if self.__C.N_GPU > 1:
                # fixed: the original passed self.__C.GPU (the raw gpu-id
                # string) as device_ids; GPU_IDS is the parsed id list used
                # by train() above and by heuristics.py
                net = nn.DataParallel(net, device_ids=self.__C.GPU_IDS)
            print('Finish!')
        net.eval()
        dataloader = Data.DataLoader(
            dataset,
            batch_size=self.__C.EVAL_BATCH_SIZE,
            shuffle=False,
            num_workers=self.__C.NUM_WORKERS,
            pin_memory=True
        )
        qid_idx = 0
        self.evaluater.init()
        for step, input_tuple in enumerate(dataloader):
            print("\rEvaluation: [step %4d/%4d]" % (
                step,
                int(data_size / self.__C.EVAL_BATCH_SIZE),
            ), end=' ')
            input_tuple = [x.cuda() for x in input_tuple]
            pred = net(input_tuple[:-1])
            pred_np = pred.cpu().numpy()
            pred_argmax = np.argmax(pred_np, axis=1)
            # collect answers for every batch
            for i in range(len(pred_argmax)):
                qid = dataset.qids[qid_idx]
                qid_idx += 1
                ans_id = int(pred_argmax[i])
                ans = dataset.ix_to_ans[ans_id]
                # log result to evaluater
                self.evaluater.add(qid, ans)
        print()
        self.evaluater.save(self.__C.RESULT_PATH)
        # evaluate if eval_now is True
        if eval_now:
            with open(self.__C.LOG_PATH, 'a+') as logfile:
                self.evaluater.evaluate(logfile)

    def run(self):
        """Entry point: prepare output dirs, build datasets, then train or
        evaluate depending on __C.RUN_MODE.

        :raises ValueError: for a run mode other than 'finetune' /
            'finetune_test'
        """
        # Set ckpts and log path
        ## where checkpoints will be saved
        Path(self.__C.CKPTS_DIR).mkdir(parents=True, exist_ok=True)
        ## where logs will be saved
        Path(self.__C.LOG_PATH).parent.mkdir(parents=True, exist_ok=True)
        ## where eval results will be saved
        Path(self.__C.RESULT_PATH).parent.mkdir(parents=True, exist_ok=True)
        with open(self.__C.LOG_PATH, 'w') as f:
            f.write(str(self.__C) + '\n')
        # build dataset entities
        common_data = CommonData(self.__C)
        if self.__C.RUN_MODE == 'finetune':
            train_set = DataSet(
                self.__C,
                common_data,
                self.__C.TRAIN_SPLITS
            )
            valid_set = None
            if self.__C.EVAL_NOW:
                valid_set = DataSet(
                    self.__C,
                    common_data,
                    self.__C.EVAL_SPLITS
                )
            self.train(train_set, valid_set)
        elif self.__C.RUN_MODE == 'finetune_test':
            test_set = DataSet(
                self.__C,
                common_data,
                self.__C.EVAL_SPLITS
            )
            self.eval(test_set, eval_now=self.__C.EVAL_NOW)
        else:
            raise ValueError('Invalid run mode')
def finetune_login_args(parser):
    """Register the command-line options for finetuning / finetune_test.

    :param parser: an argparse.ArgumentParser to add the arguments to.
    """
    parser.add_argument('--task', dest='TASK', help='task name, e.g., ok, aok_val, aok_test', type=str, required=True)
    parser.add_argument('--run_mode', dest='RUN_MODE', help='run mode', type=str, required=True)
    parser.add_argument('--cfg', dest='cfg_file', help='optional config file', type=str, required=True)
    parser.add_argument('--version', dest='VERSION', help='version name', type=str, required=True)
    # NOTE(review): type=bool parses any non-empty string (even 'False') as
    # True; kept as-is because callers may already pass '--resume True'.
    parser.add_argument('--resume', dest='RESUME', help='resume training', type=bool, default=False)
    parser.add_argument('--resume_version', dest='RESUME_VERSION', help='checkpoint version name', type=str, default='')
    parser.add_argument('--resume_epoch', dest='RESUME_EPOCH', help='checkpoint epoch', type=int, default=1)
    parser.add_argument('--resume_path', dest='RESUME_PATH', help='checkpoint path', type=str, default='')
    parser.add_argument('--ckpt_path', dest='CKPT_PATH', help='checkpoint path for test', type=str, default=None)
    parser.add_argument('--gpu', dest='GPU', help='gpu id', type=str, default=None)
    parser.add_argument('--seed', dest='SEED', help='random seed', type=int, default=None)
    # help text fixed: previously said 'random seed' (copy-paste error)
    parser.add_argument('--grad_accu', dest='GRAD_ACCU_STEPS', help='gradient accumulation steps', type=int, default=None)
    parser.add_argument('--pretrained_model', dest='PRETRAINED_MODEL_PATH', help='pretrained model path', type=str, default=None)
if __name__ == '__main__':
    # description fixed: previously said 'Parameters for pretraining'
    parser = argparse.ArgumentParser(description='Parameters for finetuning')
    finetune_login_args(parser)
    args = parser.parse_args()
    __C = Cfgs(args)
    with open(args.cfg_file, 'r') as f:
        yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
    __C.override_from_dict(yaml_dict)
    print(__C)
    # Runner.__init__ requires an evaluater; the original called
    # Runner(__C) which raised a TypeError at startup. Build one the same
    # way main.py does (imported lazily so the module can still be imported
    # without the evaluation package on the path).
    if 'aok' in __C.TASK:
        from evaluation.aokvqa_evaluate import AOKEvaluater as Evaluater
    else:
        from evaluation.okvqa_evaluate import OKEvaluater as Evaluater
    evaluater = Evaluater(
        __C.EVAL_ANSWER_PATH,
        __C.EVAL_QUESTION_PATH,
    )
    runner = Runner(__C, evaluater)
    runner.run()
================================================
FILE: prophet/stage1/heuristics.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: Runner that handles the heuristics generations process
# ------------------------------------------------------------------------------ #
import os, sys
# sys.path.append(os.getcwd())
from datetime import datetime
import pickle, random, math, time
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
import torch.utils.data as Data
import argparse
from pathlib import Path
import yaml
from copy import deepcopy
from tqdm import tqdm
from configs.task_cfgs import Cfgs
from .utils.load_data import CommonData, DataSet
from .model.mcan_for_finetune import MCANForFinetune
from .utils.optim import get_optim_for_finetune as get_optim
class Runner(object):
    """Runner that generates stage-1 heuristics from a finetuned model:
    top-k answer candidates per question and, for each testing question,
    the most similar training examples in the answer-latent space.
    """
    def __init__(self, __C, *args, **kwargs):
        # Extra positional/keyword args (e.g. an evaluater) are accepted
        # for interface uniformity with the other runners and ignored.
        self.__C = __C
        self.net = None  # lazily-loaded model, cached across eval() calls
    # heuristics generation
    @torch.no_grad()
    def eval(self, dataset):
        """Run the finetuned model over ``dataset``.

        Returns:
            topk_results (dict): qid -> list of top-k
                {'answer': str, 'confidence': float} items
            latent_results (list): per-sample answer-latent vectors in
                dataset order; each is also saved to
                ANSWER_LATENTS_DIR/<qid>.npy as a side effect
        """
        data_size = dataset.data_size
        if self.net is None:
            # Load parameters
            path = self.__C.CKPT_PATH
            print('Loading ckpt {}'.format(path))
            net = MCANForFinetune(self.__C, dataset.ans_size)
            ckpt = torch.load(path, map_location='cpu')
            net.load_state_dict(ckpt['state_dict'], strict=False)
            net.cuda()
            if self.__C.N_GPU > 1:
                net = nn.DataParallel(net, device_ids=self.__C.GPU_IDS)
            print('Finish!')
            self.net = net
        else:
            net = self.net
        net.eval()
        dataloader = Data.DataLoader(
            dataset,
            batch_size=self.__C.EVAL_BATCH_SIZE,
            shuffle=False,
            num_workers=self.__C.NUM_WORKERS,
            pin_memory=True
        )
        qid_idx = 0
        topk_results = {}
        latent_results = []
        # number of answer candidates to keep per question
        k = self.__C.CANDIDATE_NUM
        for step, input_tuple in enumerate(dataloader):
            print("\rEvaluation: [step %4d/%4d]" % (
                step,
                int(data_size / self.__C.EVAL_BATCH_SIZE),
            ), end=' ')
            input_tuple = [x.cuda() for x in input_tuple]
            # answer_latents: hidden representations used for example search
            pred, answer_latents = net(input_tuple[:-1], output_answer_latent=True)
            pred_np = pred.sigmoid().cpu().numpy()
            answer_latents_np = answer_latents.cpu().numpy()
            # collect answers for every batch
            for i in range(len(pred_np)):
                qid = dataset.qids[qid_idx]
                qid_idx += 1
                ans_np = pred_np[i]
                # indices of the k highest-confidence answers
                ans_idx = np.argsort(-ans_np)[:k]
                ans_item = []
                for idx in ans_idx:
                    ans_item.append(
                        {
                            'answer': dataset.ix_to_ans[idx],
                            'confidence': float(ans_np[idx])
                        }
                    )
                topk_results[qid] = ans_item
                latent_np = answer_latents_np[i]
                latent_results.append(latent_np)
                np.save(
                    os.path.join(self.__C.ANSWER_LATENTS_DIR, f'{qid}.npy'),
                    latent_np
                )
        print()
        return topk_results, latent_results
    def run(self):
        """Generate candidates for the train and test splits, then compute
        the top-E most similar training examples for each test question."""
        # Set ckpts and log path
        ## where checkpoints will be saved
        Path(self.__C.CKPTS_DIR).mkdir(parents=True, exist_ok=True)
        ## where the result file of topk candidates will be saved
        Path(self.__C.CANDIDATE_FILE_PATH).parent.mkdir(parents=True, exist_ok=True)
        ## where answer latents will be saved
        Path(self.__C.ANSWER_LATENTS_DIR).mkdir(parents=True, exist_ok=True)
        # build dataset entities
        common_data = CommonData(self.__C)
        train_set = DataSet(
            self.__C,
            common_data,
            self.__C.TRAIN_SPLITS
        )
        test_set = DataSet(
            self.__C,
            common_data,
            self.__C.EVAL_SPLITS
        )
        # forward VQA model
        train_topk_results, train_latent_results = self.eval(train_set)
        test_topk_results, test_latent_results = self.eval(test_set)
        # save topk candidates (dict union: requires Python >= 3.9)
        topk_results = train_topk_results | test_topk_results
        json.dump(
            topk_results,
            open(self.__C.CANDIDATE_FILE_PATH, 'w'),
            indent=4
        )
        # search similar examples:
        # L2-normalize the latents so the dot product below equals cosine
        # similarity
        train_features = np.vstack(train_latent_results)
        train_features = train_features / np.linalg.norm(train_features, axis=1, keepdims=True)
        test_features = np.vstack(test_latent_results)
        test_features = test_features / np.linalg.norm(test_features, axis=1, keepdims=True)
        # compute top-E similar examples for each testing input
        E = self.__C.EXAMPLE_NUM
        similar_qids = {}
        print(f'\ncompute top-{E} similar examples for each testing input')
        for i, test_qid in enumerate(tqdm(test_set.qids)):
            # cosine similarity
            dists = np.dot(test_features[i], train_features.T)
            top_E = np.argsort(-dists)[:E]
            similar_qids[test_qid] = [train_set.qids[j] for j in top_E]
        # save similar qids
        with open(self.__C.EXAMPLE_FILE_PATH, 'w') as f:
            json.dump(similar_qids, f)
def heuristics_login_args(parser):
    """Register the command-line options used by heuristics generation."""
    # (flag, dest, type, help text, required)
    option_table = [
        ('--task', 'TASK', str, 'task name, e.g., ok, aok_val, aok_test', True),
        ('--cfg', 'cfg_file', str, 'optional config file', True),
        ('--version', 'VERSION', str, 'version name', True),
        ('--ckpt_path', 'CKPT_PATH', str, 'checkpoint path for heuristics', False),
        ('--gpu', 'GPU', str, 'gpu id', False),
        ('--candidate_num', 'CANDIDATE_NUM', int, 'topk candidates', False),
        ('--example_num', 'EXAMPLE_NUM', int, 'number of most similar examples to be searched, default: 200', False),
    ]
    for flag, dest, arg_type, help_text, required in option_table:
        if required:
            parser.add_argument(flag, dest=dest, help=help_text, type=arg_type, required=True)
        else:
            parser.add_argument(flag, dest=dest, help=help_text, type=arg_type, default=None)
if __name__ == '__main__':
    # description fixed: previously said 'Parameters for pretraining'
    # (copy-pasted from pretrain.py)
    parser = argparse.ArgumentParser(description='Parameters for heuristics generation')
    heuristics_login_args(parser)
    args = parser.parse_args()
    __C = Cfgs(args)
    # Merge the YAML config into the task config.
    with open(args.cfg_file, 'r') as f:
        yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
    __C.override_from_dict(yaml_dict)
    print(__C)
    # This Runner accepts (and ignores) extra args, so no evaluater needed.
    runner = Runner(__C)
    runner.run()
================================================
FILE: prophet/stage1/model/layers.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: basic layers & blocks of MCAN
# ------------------------------------------------------------------------------ #
import torch
from torch import nn
from torch.nn import functional as F
import math
from .net_utils import *
from .rope2d import RoPE2d
class AttFlat(nn.Module):
    """Attention-based flattening: pools a sequence of hidden states into a
    single vector via learned glimpse attention, then projects it."""

    def __init__(self, __C):
        super(AttFlat, self).__init__()
        self.__C = __C
        # Produces one attention logit per glimpse for every position.
        self.mlp = MLP(
            in_size=__C.HIDDEN_SIZE,
            mid_size=__C.FLAT_MLP_SIZE,
            out_size=__C.FLAT_GLIMPSES,
            dropout_r=__C.DROPOUT_R,
            use_relu=True
        )
        # Projects the concatenated glimpses to the flat output size.
        self.linear_merge = nn.Linear(
            __C.HIDDEN_SIZE * __C.FLAT_GLIMPSES,
            __C.FLAT_OUT_SIZE
        )

    def forward(self, x, x_mask):
        logits = self.mlp(x)
        if x_mask is not None:
            # Suppress padded positions before the softmax.
            logits = logits.masked_fill(
                x_mask.squeeze(1).squeeze(1).unsqueeze(2),
                -1e9
            )
        weights = F.softmax(logits, dim=1)
        # One weighted sum over the sequence per glimpse.
        pooled = [
            torch.sum(weights[:, :, g: g + 1] * x, dim=1)
            for g in range(self.__C.FLAT_GLIMPSES)
        ]
        return self.linear_merge(torch.cat(pooled, dim=1))
class MHAtt(nn.Module):
    """Standard multi-head attention with a final merge projection."""

    def __init__(self, __C):
        super().__init__()
        self.__C = __C
        self.n_head = __C.MULTI_HEAD
        self.external_dim = __C.HIDDEN_SIZE
        self.internal_dim = __C.HIDDEN_SIZE // self.n_head
        # The value projection intentionally carries no bias.
        self.linear_v = nn.Linear(self.external_dim, self.external_dim, bias=False)
        self.linear_k = nn.Linear(self.external_dim, self.external_dim)
        self.linear_q = nn.Linear(self.external_dim, self.external_dim)
        self.linear_merge = nn.Linear(self.external_dim, self.external_dim)
        self.dropout = nn.Dropout(__C.DROPOUT_R)

    def _split_heads(self, proj, n_batches):
        # (B, L, D) -> (B, H, L, D/H)
        return proj.view(
            n_batches, -1, self.n_head, self.internal_dim
        ).transpose(1, 2)

    def forward(self, v, k, q, mask):
        n_batches = q.size(0)
        v = self._split_heads(self.linear_v(v), n_batches)
        k = self._split_heads(self.linear_k(k), n_batches)
        q = self._split_heads(self.linear_q(q), n_batches)
        fused = self.att(v, k, q, mask)
        # Re-join heads: (B, H, L, D/H) -> (B, L, D)
        fused = fused.transpose(1, 2).contiguous().view(
            n_batches, -1, self.external_dim
        )
        return self.linear_merge(fused)

    def att(self, value, key, query, mask):
        # Scaled dot-product attention; `mask` marks positions to suppress.
        scores = torch.matmul(
            query, key.transpose(-2, -1)
        ) / math.sqrt(query.size(-1))
        if mask is not None:
            scores = scores.masked_fill(mask, -1e9)
        att_map = self.dropout(F.softmax(scores, dim=-1))
        return torch.matmul(att_map, value)
class SA_v(nn.Module):
    """Self-attention over visual features with 2D rotary position
    embedding (RoPE) applied to queries/keys, plus residual + LayerNorm."""

    def __init__(self, __C):
        super().__init__()
        self.__C = __C
        self.n_head = __C.MULTI_HEAD
        self.external_dim = __C.HIDDEN_SIZE
        self.internal_dim = __C.HIDDEN_SIZE // self.n_head
        # The value projection intentionally carries no bias.
        self.linear_v = nn.Linear(self.external_dim, self.external_dim, bias=False)
        self.linear_k = nn.Linear(self.external_dim, self.external_dim)
        self.linear_q = nn.Linear(self.external_dim, self.external_dim)
        self.linear_merge = nn.Linear(self.external_dim, self.external_dim)
        self.dropout = nn.Dropout(__C.DROPOUT_R)
        self.dropout1 = nn.Dropout(__C.DROPOUT_R)
        self.norm1 = nn.LayerNorm(__C.HIDDEN_SIZE)
        self.rope = RoPE2d(self.internal_dim, __C.IMG_FEAT_GRID)

    def forward(self, *args):
        # Only the first positional argument (the feature tensor) is used;
        # extra args (masks) are accepted for interface uniformity.
        x, *_ = args
        n_batches = x.size(0)

        def split_heads(proj):
            # (B, L, D) -> (B, H, L, D/H)
            return proj.view(
                n_batches, -1, self.n_head, self.internal_dim
            ).transpose(1, 2)

        v = split_heads(self.linear_v(x))
        k = split_heads(self.linear_k(x))
        q = split_heads(self.linear_q(x))
        # Rotate q/k with 2D RoPE before attention; no mask on image grid.
        q, k = self.rope(q, k)
        fused = self.att(v, k, q, None)
        fused = fused.transpose(1, 2).contiguous().view(
            n_batches, -1, self.external_dim
        )
        # Residual connection + post-norm.
        return self.norm1(x + self.dropout1(self.linear_merge(fused)))

    def att(self, value, key, query, mask):
        # Scaled dot-product attention (mask is always None in forward()).
        scores = torch.matmul(
            query, key.transpose(-2, -1)
        ) / math.sqrt(query.size(-1))
        if mask is not None:
            scores = scores.masked_fill(mask, -1e9)
        att_map = self.dropout(F.softmax(scores, dim=-1))
        return torch.matmul(att_map, value)
class FFN(nn.Module):
    """Position-wise feed-forward block with residual + LayerNorm."""

    def __init__(self, __C):
        super(FFN, self).__init__()
        self.mlp = MLP(
            in_size=__C.HIDDEN_SIZE,
            mid_size=__C.FF_SIZE,
            out_size=__C.HIDDEN_SIZE,
            dropout_r=__C.DROPOUT_R,
            use_relu=True
        )
        self.dropout1 = nn.Dropout(__C.DROPOUT_R)
        self.norm1 = nn.LayerNorm(__C.HIDDEN_SIZE)

    def forward(self, x, *args):
        # Extra args (masks) are accepted for interface uniformity, unused.
        residual = x
        transformed = self.dropout1(self.mlp(x))
        return self.norm1(residual + transformed)
class SA(nn.Module):
    """Self-attention block: MHAtt + residual + LayerNorm."""

    def __init__(self, __C):
        super(SA, self).__init__()
        self.mhatt = MHAtt(__C)
        self.dropout1 = nn.Dropout(__C.DROPOUT_R)
        self.norm1 = nn.LayerNorm(__C.HIDDEN_SIZE)

    def forward(self, x, x_mask, *args):
        # Queries, keys and values all come from x (self-attention).
        attended = self.mhatt(x, x, x, x_mask)
        return self.norm1(x + self.dropout1(attended))
class GA(nn.Module):
    """Guided-attention sublayer: x attends to y, with residual + norm."""

    def __init__(self, __C):
        super().__init__()
        self.mhatt1 = MHAtt(__C)
        self.dropout1 = nn.Dropout(__C.DROPOUT_R)
        self.norm1 = nn.LayerNorm(__C.HIDDEN_SIZE)

    def forward(self, x, y, x_mask, y_mask, *args):
        # Argument order follows the local attention helpers — presumably
        # (value, key, query, mask); y_mask hides padded positions of y.
        attended = self.mhatt1(y, y, x, y_mask)
        return self.norm1(x + self.dropout1(attended))
================================================
FILE: prophet/stage1/model/mcan.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: the definition of the improved MCAN
# ------------------------------------------------------------------------------ #
import torch
from torch import nn
from torch.nn import functional as F
import math
from transformers import AutoModel, logging
logging.set_verbosity_error()
from .net_utils import *
from .layers import *
class MCA_ED(nn.Module):
    """
    The definition of the encoder-decoder backbone of MCAN.

    The architecture is described in the config as lists of layer-class
    names (e.g. 'SA', 'GA', 'FFN') under __C.ARCH_CEIL, repeated __C.LAYER
    times for the encoder and decoder stacks.
    """
    def __init__(self, __C):
        super(MCA_ED, self).__init__()
        enc = __C.ARCH_CEIL['enc'] * __C.LAYER
        dec = __C.ARCH_CEIL['dec'] * __C.LAYER
        # Resolve layer-class names from this module's namespace instead of
        # eval(): identical behavior for valid config names, but a malformed
        # name raises KeyError instead of executing arbitrary code.
        self.enc_list = nn.ModuleList([globals()[layer](__C) for layer in enc])
        self.dec_list = nn.ModuleList([globals()[layer](__C) for layer in dec])

    def forward(self, x, y, x_mask, y_mask):
        """Encode x (language), then decode y (image) while attending to x."""
        for enc in self.enc_list:
            x = enc(x, x_mask)
        for dec in self.dec_list:
            y = dec(y, x, y_mask, x_mask)
        return x, y
class MCAN(nn.Module):
    """
    The definition of the complete network of the improved MCAN, mainly includes:
    1. A pretrained BERT model used to encode questions (already represented as tokens)
    2. A linear layer to project CLIP vision features (extracted beforehand, so the CLIP
        model is not included) to a common embedding space
    3. An encoder-decoder backbone to fuse question and image features in depth
    4. A classifier head based on `AttFlat`
    """
    def __init__(self, __C, answer_size):
        super().__init__()
        # answer_size = trainset.ans_size
        self.__C = __C
        self.bert = AutoModel.from_pretrained(__C.BERT_VERSION)
        # self.clip_visual = trainset.clip_model.visual
        # self.clip_visual.layer4 = Identity()
        # self.clip_visual.float()
        # for p in self.clip_visual.parameters():
        #     p.requires_grad = False
        # Project precomputed image grid features into the hidden space.
        self.img_feat_linear = nn.Sequential(
            nn.Linear(__C.IMG_FEAT_SIZE, __C.HIDDEN_SIZE, bias=False),
        )
        # Adapt BERT token features to the model's hidden size.
        self.lang_adapt = nn.Sequential(
            nn.Linear(__C.LANG_FEAT_SIZE, __C.HIDDEN_SIZE),
            nn.Tanh(),
        )
        self.backbone = MCA_ED(__C)
        # Attention-based flattening heads for the two modalities.
        self.attflat_img = AttFlat(__C)
        self.attflat_lang = AttFlat(__C)
        self.proj_norm = nn.LayerNorm(__C.FLAT_OUT_SIZE)
        self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)

    def forward(self, input_tuple, output_answer_latent=False):
        """Fuse (img_feat, ques_ix) and return answer logits.

        If output_answer_latent is True, also return the normalized fused
        feature right before the classifier.
        """
        img_feat, ques_ix = input_tuple

        # Make mask
        # Token id 0 rows sum to zero and are masked; image grid is unmasked.
        lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
        img_feat_mask = None#self.make_mask(img_feat)

        # Pre-process Language Feature
        lang_feat = self.bert(
            ques_ix,
            attention_mask= ~lang_feat_mask.squeeze(1).squeeze(1)
        )[0]
        lang_feat = self.lang_adapt(lang_feat)

        # Pre-process Image Feature
        img_feat = self.img_feat_linear(img_feat)

        # Backbone Framework
        # img_feat = flatten(img_feat)
        lang_feat, img_feat = self.backbone(
            lang_feat,
            img_feat,
            lang_feat_mask,
            img_feat_mask
        )

        # Flatten each modality to one vector, then fuse by addition.
        lang_feat = self.attflat_lang(
            lang_feat,
            lang_feat_mask
        )
        img_feat = self.attflat_img(
            img_feat,
            img_feat_mask
        )
        proj_feat = lang_feat + img_feat
        answer_latent = self.proj_norm(proj_feat)
        proj_feat = self.proj(answer_latent)

        if output_answer_latent:
            return proj_feat, answer_latent
        return proj_feat

    # Masking
    def make_mask(self, feature):
        """Return a (B, 1, 1, L) boolean mask, True where a row is all zeros."""
        return (torch.sum(
            torch.abs(feature),
            dim=-1
        ) == 0).unsqueeze(1).unsqueeze(2)
================================================
FILE: prophet/stage1/model/mcan_for_finetune.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: the definition of A wrapper of MCAN for finetuning with the
# strategy described in the paper.
# ------------------------------------------------------------------------------ #
import torch
from torch import nn
from torch.nn import functional as F
from .mcan import *
class MCANForFinetune(MCAN):
    """
    A wrapper of MCAN for finetuning with the strategy described
    in the paper. We inherit the parameters of existing answers
    and append new parameters for the new answers.
    """

    def __init__(self, __C, answer_size, base_answer_size=3129):
        # Build the base model over the pretraining answer vocabulary, then
        # append a second head for the extra answers of the target dataset.
        super().__init__(__C, base_answer_size)
        self.proj1 = nn.Linear(__C.FLAT_OUT_SIZE, answer_size - base_answer_size)

    @torch.no_grad()
    def parameter_init(self):
        """Initialize the new head: zero weights, bias set to the mean of
        the pretrained head's bias (a neutral prior for the new answers).
        """
        self.proj1.weight.data.zero_()
        # fill_ keeps the bias on its current device/dtype; the previous
        # `mean() + torch.zeros(shape)` built a fresh CPU float32 tensor.
        self.proj1.bias.data.fill_(self.proj.bias.data.mean())

    def forward(self, input_tuple, output_answer_latent=False):
        """Concatenate logits of the base head and the new head."""
        proj_feat, answer_latent = super().forward(input_tuple, output_answer_latent=True)
        proj_feat = torch.cat([
            proj_feat,
            self.proj1(answer_latent)
        ], dim=1)
        if output_answer_latent:
            return proj_feat, answer_latent
        return proj_feat
================================================
FILE: prophet/stage1/model/net_utils.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: Utilities for layer definitions
# ------------------------------------------------------------------------------ #
from torch import nn
import math
class FC(nn.Module):
    """Fully-connected layer with optional ReLU activation and dropout."""

    def __init__(self, in_size, out_size, dropout_r=0., use_relu=True):
        super().__init__()
        self.dropout_r = dropout_r
        self.use_relu = use_relu
        self.linear = nn.Linear(in_size, out_size)
        if use_relu:
            self.relu = nn.ReLU(inplace=True)
        if dropout_r > 0:
            self.dropout = nn.Dropout(dropout_r)

    def forward(self, x):
        out = self.linear(x)
        if self.use_relu:
            out = self.relu(out)
        if self.dropout_r > 0:
            out = self.dropout(out)
        return out
class MLP(nn.Module):
    """Two-layer perceptron: FC (optional ReLU/dropout) followed by a linear map."""

    def __init__(self, in_size, mid_size, out_size, dropout_r=0., use_relu=True):
        super().__init__()
        self.fc = FC(in_size, mid_size, dropout_r=dropout_r, use_relu=use_relu)
        self.linear = nn.Linear(mid_size, out_size)

    def forward(self, x):
        hidden = self.fc(x)
        return self.linear(hidden)
def flatten(x):
    """Collapse the spatial grid of a (B, C, H, W) feature map to (B, H*W, C)."""
    flat = x.view(x.shape[0], x.shape[1], -1)
    return flat.permute(0, 2, 1).contiguous()
def unflatten(x, shape):
    """Inverse of flatten: (B, H*W, C) back to (B, C, H, W) given shape=(H, W)."""
    t = x.permute(0, 2, 1).contiguous()
    return t.view(t.shape[0], -1, shape[0], shape[1])
class Identity(nn.Module):
    """Pass-through module; useful for disabling a submodule in place."""

    def forward(self, x):
        return x
================================================
FILE: prophet/stage1/model/rope2d.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: A 2D version of rotary positional embeddings
# (https://arxiv.org/abs/2104.09864).
# ------------------------------------------------------------------------------ #
import math
import torch
import torch.nn.functional as F
from torch import nn
# from einops import rearrange, repeat
def rotate_every_two(x):
    """Swap each adjacent pair along the last dim: (a, b) -> (b, a).

    The sign flip that usually accompanies RoPE's pair rotation is folded
    into the precomputed sine table, so no negation is done here.
    """
    orig_shape = x.shape
    pairs = x.view(*orig_shape[:-1], -1, 2)
    swapped = pairs[..., [1, 0]]
    return swapped.view(*orig_shape)
def apply_rotary_pos_emb(q, k, sinu_pos):
    """Apply the precomputed (sin, cos) rotary tables to query and key."""
    sin, cos = sinu_pos
    rotated_q = (q * cos) + (rotate_every_two(q) * sin)
    rotated_k = (k * cos) + (rotate_every_two(k) * sin)
    return rotated_q, rotated_k
# rotary embeddings for 2d position
class RoPE2d(nn.Module):
    """2D rotary positional embedding over a size x size grid.

    Precomputes flattened (size*size, in_dim) sin/cos tables where half of
    the channels encode the row position and half the column position; the
    tables are registered as buffers so they follow the module's device.
    """
    def __init__(self, in_dim, size):
        super().__init__()
        # Half the channels for each of the two grid axes.
        dim = in_dim // 2
        # NOTE(review): base 40 instead of RoPE's usual 10000 — presumably
        # tuned for small feature grids; confirm against the experiments.
        inv_freq = 1. / (40 ** (torch.arange(0, dim, 2).float() / dim))
        position = torch.arange(0, size, dtype=torch.float)
        sinusoid_inp = torch.einsum("i,j->ij", position, inv_freq)
        _sin = sinusoid_inp.sin()
        _cos = sinusoid_inp.cos()
        # Duplicate each frequency for the (even, odd) channel pair.
        _sin, _cos = map(
            lambda x: x.unsqueeze(-1).repeat(1, 1, 2),
            (_sin, _cos)
        )
        # Fold the rotation's sign flip into the sine table so that
        # rotate_every_two() can be a plain pair swap.
        _sin[..., 0] = -_sin[..., 0]
        _sin, _cos = map(lambda x: x.view(*x.shape[:-2], -1), (_sin, _cos))
        # Combine row and column tables into one (size*size, in_dim) table.
        _sin, _cos = map(
            lambda x: torch.cat([
                x.unsqueeze(0).repeat(size, 1, 1),
                x.unsqueeze(1).repeat(1, size, 1)
            ], dim=-1).view(-1, in_dim),
            (_sin, _cos)
        )
        self.register_buffer('sin', _sin)
        self.register_buffer('cos', _cos)

    def forward(self, k, q):
        # NOTE(review): parameters are declared (k, q) but the caller passes
        # (q, k) and unpacks the result as (q, k). Since the identical
        # rotation is applied to both tensors, this effectively swaps which
        # tensor the caller gets back as q vs k — confirm this is intended
        # before changing, as trained checkpoints may depend on it.
        q, k = apply_rotary_pos_emb(q, k, (self.sin, self.cos))
        return q, k
if __name__ == '__main__':
    # Quick smoke test: the rotary embedding must preserve input shapes.
    rope = RoPE2d(512, size=4)
    q = torch.randn(1, 16, 512)
    k = torch.randn(1, 16, 512)
    q, k = rope(q, k)
    print(q.shape, k.shape)
================================================
FILE: prophet/stage1/pretrain.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: Runner that handles the pretraining process
# ------------------------------------------------------------------------------ #
import os, sys
# sys.path.append(os.getcwd())
from datetime import datetime
import pickle, random, math, time
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
import argparse
from pathlib import Path
from copy import deepcopy
import yaml
from configs.task_cfgs import Cfgs
from .utils.load_data import CommonData, DataSet
from .model.mcan import MCAN
from .utils.optim import get_optim
class Runner(object):
    """Runner that handles the MCAN pretraining process."""

    def __init__(self, __C, *args, **kwargs):
        self.__C = __C

    def train(self, train_set, eval_set=None):
        """Train MCAN on train_set; eval_set is currently unused."""
        data_size = train_set.data_size

        # Define the MCAN model
        net = MCAN(self.__C, train_set.ans_size)

        # Define the optimizer
        # Load checkpoint if resume training
        if self.__C.RESUME:
            print(' ========== Resume training')
            path = self.__C.RESUME_PATH

            # Load the network parameters
            print('Loading ckpt {}'.format(path))
            ckpt = torch.load(path, map_location='cpu')
            print('Finish loading.')
            net.load_state_dict(ckpt['state_dict'])

            # Load the optimizer paramters
            optim = get_optim(self.__C, net)
            optim.warmup_lr_scale = ckpt['warmup_lr_scale']
            optim.decay_lr_scale = ckpt['decay_lr_scale']
            optim.optimizer.load_state_dict(ckpt['optimizer'])
            start_epoch = self.__C.CKPT_EPOCH
        else:
            optim = get_optim(self.__C, net)
            start_epoch = 0

        # load to gpu
        net.cuda()
        # Define the multi-gpu training if needed
        if self.__C.N_GPU > 1:
            net = nn.DataParallel(net, device_ids=self.__C.GPU_IDS)

        # Define the binary cross entropy loss (soft answer targets, summed)
        loss_fn = torch.nn.BCEWithLogitsLoss(reduction='sum')
        epoch_loss = 0

        # Define multi-thread dataloader
        dataloader = Data.DataLoader(
            train_set,
            batch_size=self.__C.BATCH_SIZE,
            shuffle=True,
            num_workers=self.__C.NUM_WORKERS,
            pin_memory=self.__C.PIN_MEM,
            drop_last=True
        )

        # Training script
        for epoch in range(start_epoch, self.__C.MAX_EPOCH):
            net.train()
            # Save log information
            with open(self.__C.LOG_PATH, 'a+') as logfile:
                logfile.write(
                    f'nowTime: {datetime.now():%Y-%m-%d %H:%M:%S}\n'
                )
            time_start = time.time()

            # Iteration
            for step, input_tuple in enumerate(dataloader):
                iteration_loss = 0
                optim.zero_grad()
                input_tuple = [x.cuda() for x in input_tuple]
                # Gradient accumulation: split each batch into sub-batches.
                SUB_BATCH_SIZE = self.__C.BATCH_SIZE // self.__C.GRAD_ACCU_STEPS
                for accu_step in range(self.__C.GRAD_ACCU_STEPS):
                    sub_tuple = [x[accu_step * SUB_BATCH_SIZE:
                        (accu_step + 1) * SUB_BATCH_SIZE] for x in input_tuple]
                    # last element is the soft answer-target vector
                    sub_ans_iter = sub_tuple[-1]
                    pred = net(sub_tuple[:-1])
                    loss = loss_fn(pred, sub_ans_iter)
                    loss.backward()
                    loss_item = loss.item()
                    iteration_loss += loss_item
                    epoch_loss += loss_item# * self.__C.GRAD_ACCU_STEPS
                print("\r[version %s][epoch %2d][step %4d/%4d][Task %s][Mode %s] loss: %.4f, lr: %.2e" % (
                    self.__C.VERSION,
                    epoch + 1,
                    step,
                    int(data_size / self.__C.BATCH_SIZE),
                    self.__C.TASK,
                    self.__C.RUN_MODE,
                    iteration_loss / self.__C.BATCH_SIZE,
                    optim.current_lr(),
                ), end=' ')
                optim.step()
            time_end = time.time()
            print('Finished in {}s'.format(int(time_end - time_start)))

            # Logging
            with open(self.__C.LOG_PATH, 'a+') as logfile:
                logfile.write(f'epoch = {epoch + 1}  loss = {epoch_loss / data_size}\nlr = {optim.current_lr()}\n\n')
            # Advance warmup/decay schedules once per epoch.
            optim.schedule_step(epoch)

            # Save checkpoint
            state = {
                'state_dict': net.state_dict() if self.__C.N_GPU == 1 \
                    else net.module.state_dict(),
                'optimizer': optim.optimizer.state_dict(),
                'warmup_lr_scale': optim.warmup_lr_scale,
                'decay_lr_scale': optim.decay_lr_scale,
            }
            torch.save(
                state,
                f'{self.__C.CKPTS_DIR}/epoch{epoch + 1}.pkl'
            )
            epoch_loss = 0

    def run(self):
        """Entry point: prepare checkpoint/log dirs, load data, start training."""
        # Set ckpts and log path
        Path(self.__C.CKPTS_DIR).mkdir(parents=True, exist_ok=True)
        Path(self.__C.LOG_PATH).parent.mkdir(parents=True, exist_ok=True)
        with open(self.__C.LOG_PATH, 'w') as f:
            f.write(str(self.__C) + '\n')

        common_data = CommonData(self.__C)
        train_set = DataSet(
            self.__C,
            common_data,
            self.__C.TRAIN_SPLITS
        )
        valid_set = None
        self.train(train_set, valid_set)
def pretrain_login_args(parser):
    """Register command-line arguments for the pretraining entry point."""
    parser.add_argument('--task', dest='TASK', help='task name, e.g., ok, aok_val, aok_test', type=str, required=True)
    parser.add_argument('--cfg', dest='cfg_file', help='optional config file', type=str, required=True)
    parser.add_argument('--version', dest='VERSION', help='version name', type=str, required=True)
    # argparse's type=bool treats any non-empty string (including 'False') as
    # True; parse the text explicitly so '--resume False' works as expected.
    parser.add_argument('--resume', dest='RESUME', help='resume training',
                        type=lambda s: str(s).lower() in ('1', 'true', 'yes'), default=False)
    parser.add_argument('--resume_version', dest='RESUME_VERSION', help='checkpoint version name', type=str, default=None)
    parser.add_argument('--resume_epoch', dest='RESUME_EPOCH', help='checkpoint epoch', type=int, default=None)
    parser.add_argument('--resume_path', dest='RESUME_PATH', help='checkpoint path', type=str, default=None)
    parser.add_argument('--gpu', dest='GPU', help='gpu id', type=str, default=None)
    parser.add_argument('--seed', dest='SEED', help='random seed', type=int, default=None)
    # help text previously said 'random seed' (copy-paste error)
    parser.add_argument('--grad_accu', dest='GRAD_ACCU_STEPS', help='gradient accumulation steps', type=int, default=None)
if __name__ == '__main__':
    # Parse CLI args, merge the YAML config on top, then launch pretraining.
    parser = argparse.ArgumentParser(description='Parameters for pretraining')
    pretrain_login_args(parser)
    args = parser.parse_args()
    __C = Cfgs(args)
    with open(args.cfg_file, 'r') as f:
        yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
    __C.override_from_dict(yaml_dict)
    print(__C)
    runner = Runner(__C)
    runner.run()
================================================
FILE: prophet/stage1/utils/load_data.py
================================================
# --------------------------------------------------------------------------------- #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: Data loading and preprocessing. Note that for the sake of simplicity,
# the code only supports the following datasets for now:
# * VQA 2.0
# * OK-VQA
# * A-OKVQA
# Transferring to other datasets is easy. You may need to modify a few
# line of code in this file.
# --------------------------------------------------------------------------------- #
import numpy as np
import glob, json, pickle, random
import torch
import torch.utils.data as Data
from transformers import AutoTokenizer
from evaluation.ans_punct import prep_ans
# from .transforms import _transform
def soft_target(answers, ans_to_ix, preprocess=True):
    """Build a soft binary-label vector over the answer vocabulary.

    Each occurrence of an in-vocabulary answer adds 0.3 to its score,
    capped at 1.0 (the standard VQA soft-target scheme).

    Args:
        answers: list of (possibly raw) answer strings.
        ans_to_ix: answer -> index mapping of the answer vocabulary.
        preprocess: if True, normalize each answer with prep_ans first.

    Returns:
        np.float32 vector of shape (len(ans_to_ix),).
    """
    ans_score = np.zeros(len(ans_to_ix), np.float32)
    for ans in answers:
        if preprocess:
            ans = prep_ans(ans)
        if ans in ans_to_ix:
            ans_score[ans_to_ix[ans]] = min(1.0, ans_score[ans_to_ix[ans]] + 0.3)
    return ans_score
class CommonData:
    """
    load common data for all dataset objects:
        * imgid_to_path
        * bert tokenizer
        * ans_to_ix, ix_to_ans
    """
    def __init__(self, __C) -> None:
        print('Loading common data...')

        # load imgid_to_path: map every image id to its .npz feature file
        self.img_feat_path_list = []
        for split in __C.FEATURE_SPLIT:
            feats_dir = __C.FEATS_DIR[split]
            self.img_feat_path_list += glob.glob(feats_dir + '*.npz')
        self.imgid_to_path = {}
        for feat_path in self.img_feat_path_list:
            # feature filenames end with ..._<image_id>.npz
            img_id = int(feat_path.split('/')[-1].split('_')[-1].split('.')[0])
            self.imgid_to_path[img_id] = feat_path
        # self.preprocess = _transform(__C.RESOLUTION)
        print(f'== Total image number: {len(self.imgid_to_path)}')

        # load bert tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(__C.BERT_VERSION)
        self.token_size = self.tokenizer.vocab_size
        print(f'== BertTokenizer loaded, vocab size: {self.token_size}')

        # load ans_to_ix, ix_to_ans
        # NOTE(review): the answer dict file is presumably a JSON list of
        # answers (index -> answer), given the enumerate() below — confirm.
        ans_dict_path = __C.ANSWER_DICT_PATH[__C.DATA_TAG]
        self.ix_to_ans = json.load(open(ans_dict_path, 'r'))
        self.ans_to_ix = {ans: ix for ix, ans in enumerate(self.ix_to_ans)}
        self.ans_size = len(self.ans_to_ix)
        print(f'== Answer vocab size: {self.ans_size}')
        print('Common data process is done.\n')
class DataSet(Data.Dataset):
    """Dataset yielding (image features, question token ids, soft answer target)."""

    def __init__(self, __C, common_data, split_name_list):
        self.__C = __C
        print(f'Loading dataset for {self.__C.TASK}|{self.__C.RUN_MODE}({split_name_list})')
        self.split_name_list = split_name_list

        # load all attributes from common data
        self.imgid_to_path = common_data.imgid_to_path
        self.tokenizer = common_data.tokenizer
        self.token_size = common_data.token_size
        self.ans_to_ix = common_data.ans_to_ix
        self.ix_to_ans = common_data.ix_to_ans
        self.ans_size = common_data.ans_size

        # Loading question and answer list
        self.ques_list = []
        self.ans_list = []
        for split_name in split_name_list:
            ques_list = json.load(open(__C.QUESTION_PATH[split_name], 'r'))
            # VQA-style files wrap the list in a 'questions' key
            if 'questions' in ques_list:
                ques_list = ques_list['questions']
            self.ques_list += ques_list
            if split_name in __C.ANSWER_PATH:
                ans_list = json.load(open(__C.ANSWER_PATH[split_name], 'r'))
                if 'annotations' in ans_list:
                    ans_list = ans_list['annotations']
                self.ans_list += ans_list

        # indexing data, note that all question_id is set to str,
        # and all image_id is set to int
        if len(self.ans_list) == len(self.ques_list):
            self.annotated = True
            self.qids = [str(ans['question_id']) for ans in self.ans_list]
        elif len(self.ans_list) < len(self.ques_list):
            # test-style split with fewer (or no) annotations
            self.annotated = False
            self.qids = [str(ques['question_id']) for ques in self.ques_list]
        else:
            raise ValueError('Answer list is longer than question list!')
        self.data_size = len(self.qids)
        print(f'== data size: {self.data_size}\n')

        self.qid_to_ques = {str(ques['question_id']): ques for ques in self.ques_list}
        self.qid_to_ans = {str(ans['question_id']): ans for ans in self.ans_list}

    def __getitem__(self, idx):
        # get question in token ids, image in features,
        # and answer in binary-label vector
        __C = self.__C
        # For code safety
        img_feat = np.zeros(1)
        ques_ids = np.zeros(1)
        ans_vec = np.zeros(1)

        qid = self.qids[idx]
        ques_info = self.qid_to_ques[qid]

        # Process question
        ques_str = ques_info['question']
        ques_ids = self.bert_tokenize(ques_str, __C.MAX_TOKEN)

        # Process image feature
        img_id = int(ques_info['image_id'])
        img_feat = np.load(self.imgid_to_path[img_id])['x']
        assert img_feat.shape == (__C.IMG_FEAT_GRID, __C.IMG_FEAT_GRID, __C.IMG_FEAT_SIZE)
        # flatten the (G, G, D) grid into (G*G, D)
        img_feat = img_feat.reshape(-1, __C.IMG_FEAT_SIZE)

        # Process answer
        # The code is compatible with VQA v2, OK-VQA, and A-OKVQA.
        # It is no guarantee that it works for other datasets. If
        # you want to use other datasets, please modify following
        # code to fit your dataset.
        if self.annotated:
            ans_info = self.qid_to_ans[qid]
            if 'answers' in ans_info:
                ans_list = [ans['answer'] for ans in ans_info['answers']]
            elif 'direct_answers' in ans_info:
                ans_list = ans_info['direct_answers']
            else:
                raise ValueError('Error: annotation format is not supported!')
            assert type(ans_list[0]) == str, 'Error: answer format is not supported!'
            ans_vec = soft_target(ans_list, self.ans_to_ix)

        return torch.tensor(img_feat, dtype=torch.float), \
            torch.tensor(ques_ids, dtype=torch.long), \
            torch.tensor(ans_vec, dtype=torch.float)

    def __len__(self):
        return self.data_size

    def bert_tokenize(self, text, max_token):
        """Tokenize a question into a fixed-length (max_token,) int64 id
        array with [CLS]/[SEP] wrapping, truncation, and zero padding."""
        text = text.lower().replace('?', '')
        tokens = self.tokenizer.tokenize(text)
        if len(tokens) > max_token - 2:
            tokens = tokens[:max_token-2]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        ids = ids + [0] * (max_token - len(ids))
        ids = np.array(ids, np.int64)
        return ids
================================================
FILE: prophet/stage1/utils/optim.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: Utilities for optimization
# ------------------------------------------------------------------------------ #
import torch
import torch.optim as Optim
from torch.nn.utils import clip_grad_norm_
class OptimizerWrapper(object):
    """
    A Wrapper for optimizer to support learning rate warmup and decay.
    It also support multiple optimizers and switching at different steps.
    """
    def __init__(self, optimizers,
            warmup_schd_steps,
            decay_schd_step_list,
            decay_rate,
            cur_schd_step=-1,
            change_optim_step_list=None
        ):
        # optimizers: list of torch optimizers; the active one is switched
        # when the schedule step reaches an entry of change_optim_step_list.
        self.optimizer_list = optimizers
        # Remember each param group's base lr so scales can be re-applied.
        self.groups_lr_list = []
        for _optim in self.optimizer_list:
            self.groups_lr_list.append([])
            for group in _optim.param_groups:
                self.groups_lr_list[-1].append(group['lr'])
        self.curr_optim_id = 0
        self.optimizer = self.optimizer_list[self.curr_optim_id]
        self.change_optim_step_list = change_optim_step_list
        # self.total_schd_steps = total_schd_steps
        self.warmup_schd_steps = warmup_schd_steps
        self.decay_schd_step_list = decay_schd_step_list
        self.decay_rate = decay_rate
        self._step = 0
        self.warmup_lr_scale = 1.0
        self.decay_lr_scale = 1.0
        # Apply the initial schedule immediately (cur_schd_step=-1 -> step 0).
        self.schedule_step(cur_schd_step)

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self, step=None, schd_step=False):
        """Clip gradients per param group (if configured), then step."""
        if step is None:
            step = self._step
        if schd_step:
            self.schedule_step(step)
        for group in self.optimizer.param_groups:
            if '_grad_norm_clip' in group:
                if group['_grad_norm_clip'] > 0:
                    clip_grad_norm_(group['params'], group['_grad_norm_clip'])
        self.optimizer.step()
        self._step += 1

    def schedule_step(self, schd_step):
        """Update warmup/decay scales for this schedule step and write the
        resulting lr into the active optimizer's param groups."""
        schd_step += 1
        self.warmup_lr_scale = min(1., float(schd_step + 1) / float(self.warmup_schd_steps + 1))
        if schd_step in self.decay_schd_step_list:
            self.decay_lr_scale = self.decay_lr_scale * self.decay_rate
        lr_scale = self.warmup_lr_scale * self.decay_lr_scale
        # lr actually changes in following lines
        if self.change_optim_step_list is not None:
            if schd_step in self.change_optim_step_list:
                # switch to the next optimizer in the list
                self.curr_optim_id += 1
                self.optimizer = self.optimizer_list[self.curr_optim_id]
        for i, group in enumerate(self.optimizer.param_groups):
            group['lr'] = lr_scale * self.groups_lr_list[self.curr_optim_id][i]

    def current_lr(self):
        """Return the lr of the first param group of the active optimizer."""
        return self.optimizer.param_groups[0]['lr']

    def state_dict(self):
        return self.optimizer.state_dict()

    def load_state_dict(self, state_dict):
        self.optimizer.load_state_dict(state_dict)
def get_optim(__C, model):
    """Build an OptimizerWrapper over all trainable parameters.

    BERT parameters get their own param group with LR scaled by
    __C.BERT_LR_MULT; all other parameters use the base LR. Both groups
    share the same gradient-norm clipping threshold.
    """
    # Look the optimizer class up by name (e.g. 'Adam' -> torch.optim.Adam).
    # getattr raises AttributeError on a bad name instead of eval-ing code.
    optim_class = getattr(Optim, __C.OPT)
    params = [
        {'params': [], 'lr': __C.LR_BASE, '_grad_norm_clip': __C.GRAD_NORM_CLIP},
        {'params': [], 'lr': __C.LR_BASE * __C.BERT_LR_MULT, '_grad_norm_clip': __C.GRAD_NORM_CLIP},
    ]
    for name, param in model.named_parameters():
        if param.requires_grad:
            if 'bert' in name:
                params[1]['params'].append(param)
            else:
                params[0]['params'].append(param)
    # OPT_PARAMS values are strings from the YAML config (e.g. '1e-9',
    # '(0.9, 0.98)'); eval is kept because the config is trusted input.
    hyper_params = {k: eval(v) for k, v in __C.OPT_PARAMS.items()}
    return OptimizerWrapper(
        [optim_class(
            params,
            **hyper_params
        ),],
        warmup_schd_steps=__C.WARMUP_EPOCH,
        decay_schd_step_list=__C.LR_DECAY_LIST,
        decay_rate=__C.LR_DECAY_R,
    )
def get_optim_for_finetune(__C, model, new_params_name='proj1'):
    """Build the two-phase finetuning optimizer.

    Phase 1 (warmup) optimizes only the newly added head (parameters whose
    name contains new_params_name); phase 2 switches to the regular
    optimizer over all trainable parameters (separate LR for BERT).
    """
    # optimizer for finetuning warmup
    # getattr lookup by name instead of eval (see get_optim).
    optim_class1 = getattr(Optim, __C.OPT_FTW)
    params1 = []
    for name, param in model.named_parameters():
        if new_params_name in name and param.requires_grad:
            params1.append(param)
    # OPT_PARAMS_* values are strings from the trusted YAML config.
    hyper_params1 = {k: eval(v) for k, v in __C.OPT_PARAMS_FTW.items()}
    optimizer1 = optim_class1(
        params1,
        lr=__C.LR_BASE_FTW,
        **hyper_params1
    )
    optim_class2 = getattr(Optim, __C.OPT)
    params2 = [
        {'params': [], 'lr': __C.LR_BASE, '_grad_norm_clip': __C.GRAD_NORM_CLIP},
        {'params': [], 'lr': __C.LR_BASE * __C.BERT_LR_MULT, '_grad_norm_clip': __C.GRAD_NORM_CLIP},
    ]
    for name, param in model.named_parameters():
        if param.requires_grad:
            if 'bert' in name:
                params2[1]['params'].append(param)
            else:
                params2[0]['params'].append(param)
    hyper_params2 = {k: eval(v) for k, v in __C.OPT_PARAMS.items()}
    optimizer2 = optim_class2(
        params2,
        **hyper_params2
    )
    return OptimizerWrapper(
        [optimizer1, optimizer2],
        warmup_schd_steps=__C.WARMUP_EPOCH,
        decay_schd_step_list=__C.LR_DECAY_LIST,
        decay_rate=__C.LR_DECAY_R,
        # NOTE(review): the config key is spelled EPOPH_FTW (sic); it must
        # match the attribute in the config files — confirm before renaming.
        change_optim_step_list=[__C.EPOPH_FTW,]
    )
================================================
FILE: prophet/stage2/prompt.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: Runner that handles the prompting process
# ------------------------------------------------------------------------------ #
import os, sys
# sys.path.append(os.getcwd())
import pickle
import json, time
import math
import random
import argparse
from datetime import datetime
from copy import deepcopy
import yaml
from pathlib import Path
import openai
from .utils.fancy_pbar import progress, info_column
from .utils.data_utils import Qid2Data
from configs.task_cfgs import Cfgs
class Runner:
    """Runner that handles the GPT-3 prompting process (Prophet stage 2)."""

    def __init__(self, __C, evaluater):
        self.__C = __C
        self.evaluater = evaluater
        openai.api_key = __C.OPENAI_KEY

    def gpt3_infer(self, prompt_text, _retry=0):
        """Query GPT-3 with prompt_text; return (answer_text, confidence).

        Retries with exponential backoff on API errors; in DEBUG mode no
        API call is made and (0, 0) is returned.
        """
        # print(prompt_text)
        # exponential backoff
        if _retry > 0:
            print('retrying...')
            st = 2 ** _retry
            time.sleep(st)
        if self.__C.DEBUG:
            # print(prompt_text)
            time.sleep(0.05)
            return 0, 0
        try:
            # print('calling gpt3...')
            response = openai.Completion.create(
                engine=self.__C.MODEL,
                prompt=prompt_text,
                temperature=self.__C.TEMPERATURE,
                max_tokens=self.__C.MAX_TOKENS,
                logprobs=1,
                stop=["\n", "<|endoftext|>"],
                # timeout=20,
            )
            # print('gpt3 called.')
        except Exception as e:
            print(type(e), e)
            # quota exhaustion is unrecoverable — stop the whole run
            if str(e) == 'You exceeded your current quota, please check your plan and billing details.':
                exit(1)
            return self.gpt3_infer(prompt_text, _retry + 1)

        response_txt = response.choices[0].text.strip()
        # print(response_txt)
        # Confidence = product of per-token probabilities up to a stop token.
        plist = []
        for ii in range(len(response['choices'][0]['logprobs']['tokens'])):
            if response['choices'][0]['logprobs']['tokens'][ii] in ["\n", "<|endoftext|>"]:
                break
            plist.append(response['choices'][0]['logprobs']['token_logprobs'][ii])
        prob = math.exp(sum(plist))
        return response_txt, prob

    def sample_make(self, ques, capt, cands, ans=None):
        """Format one prompt sample: context/question/candidates[/answer]."""
        line_prefix = self.__C.LINE_PREFIX
        cands = cands[:self.__C.K_CANDIDATES]
        prompt_text = line_prefix + f'Context: {capt}\n'
        prompt_text += line_prefix + f'Question: {ques}\n'
        cands_with_conf = [f'{cand["answer"]}({cand["confidence"]:.2f})' for cand in cands]
        cands = ', '.join(cands_with_conf)
        prompt_text += line_prefix + f'Candidates: {cands}\n'
        prompt_text += line_prefix + 'Answer:'
        if ans is not None:
            prompt_text += f' {ans}'
        return prompt_text

    def get_context(self, example_qids):
        """Build the in-context portion of the prompt from training examples."""
        # making context text for one testing input
        prompt_text = self.__C.PROMPT_HEAD
        examples = []
        for key in example_qids:
            ques = self.trainset.get_question(key)
            caption = self.trainset.get_caption(key)
            cands = self.trainset.get_topk_candidates(key)
            gt_ans = self.trainset.get_most_answer(key)
            examples.append((ques, caption, cands, gt_ans))
            prompt_text += self.sample_make(ques, caption, cands, ans=gt_ans)
            prompt_text += '\n\n'
        return prompt_text

    def run(self):
        """Main loop: prompt GPT-3 for every eval question (several
        inferences each), vote over answers, cache progress, evaluate."""
        ## where logs will be saved
        Path(self.__C.LOG_PATH).parent.mkdir(parents=True, exist_ok=True)
        with open(self.__C.LOG_PATH, 'w') as f:
            f.write(str(self.__C) + '\n')
        ## where results will be saved
        Path(self.__C.RESULT_DIR).mkdir(parents=True, exist_ok=True)

        # Cache finished questions so an interrupted run can resume.
        self.cache = {}
        self.cache_file_path = os.path.join(
            self.__C.RESULT_DIR,
            'cache.json'
        )
        if self.__C.RESUME:
            self.cache = json.load(open(self.cache_file_path, 'r'))

        print('Note that the accuracies printed before final evaluation (the last printed one) are rough, just for checking if the process is normal!!!\n')
        self.trainset = Qid2Data(
            self.__C,
            self.__C.TRAIN_SPLITS,
            True
        )
        self.valset = Qid2Data(
            self.__C,
            self.__C.EVAL_SPLITS,
            self.__C.EVAL_NOW,
            json.load(open(self.__C.EXAMPLES_PATH, 'r'))
        )

        # if 'aok' in self.__C.TASK:
        #     from evaluation.aokvqa_evaluate import AOKEvaluater as Evaluater
        # else:
        #     from evaluation.okvqa_evaluate import OKEvaluater as Evaluater
        # evaluater = Evaluater(
        #     self.valset.annotation_path,
        #     self.valset.question_path
        # )

        infer_times = self.__C.T_INFER
        N_inctx = self.__C.N_EXAMPLES

        print()
        for qid in progress.track(self.valset.qid_to_data, description="Working... "):
            if qid in self.cache:
                continue
            ques = self.valset.get_question(qid)
            caption = self.valset.get_caption(qid)
            cands = self.valset.get_topk_candidates(qid, self.__C.K_CANDIDATES)

            prompt_query = self.sample_make(ques, caption, cands)
            # Fetch enough similar training examples for all inference rounds.
            example_qids = self.valset.get_similar_qids(qid, k=infer_times * N_inctx)
            random.shuffle(example_qids)

            prompt_info_list = []
            ans_pool = {}
            # multi-times infer
            for t in range(infer_times):
                # print(f'Infer {t}...')
                # Each round uses a disjoint slice of in-context examples.
                prompt_in_ctx = self.get_context(example_qids[(N_inctx * t):(N_inctx * t + N_inctx)])
                prompt_text = prompt_in_ctx + prompt_query
                gen_text, gen_prob = self.gpt3_infer(prompt_text)

                ans = self.evaluater.prep_ans(gen_text)
                if ans != '':
                    # accumulate confidence per distinct answer for voting
                    ans_pool[ans] = ans_pool.get(ans, 0.) + gen_prob

                prompt_info = {
                    'prompt': prompt_text,
                    'answer': gen_text,
                    'confidence': gen_prob
                }
                prompt_info_list.append(prompt_info)
                time.sleep(self.__C.SLEEP_PER_INFER)

            # vote
            # fall back to the top stage-1 candidate if GPT-3 gave nothing usable
            if len(ans_pool) == 0:
                answer = self.valset.get_topk_candidates(qid, 1)[0]['answer']
            else:
                answer = sorted(ans_pool.items(), key=lambda x: x[1], reverse=True)[0][0]

            self.evaluater.add(qid, answer)
            self.cache[qid] = {
                'question_id': qid,
                'answer': answer,
                'prompt_info': prompt_info_list
            }
            json.dump(self.cache, open(self.cache_file_path, 'w'))

            ll = len(self.cache)
            if self.__C.EVAL_NOW and not self.__C.DEBUG:
                # periodically report a rough running accuracy
                if ll > 21 and ll % 10 == 0:
                    rt_accuracy = self.valset.rt_evaluate(self.cache.values())
                    info_column.info = f'Acc: {rt_accuracy}'

        self.evaluater.save(self.__C.RESULT_PATH)
        if self.__C.EVAL_NOW:
            with open(self.__C.LOG_PATH, 'a+') as logfile:
                self.evaluater.evaluate(logfile)
def prompt_login_args(parser):
    """Register command-line arguments for the prompting entry point."""
    # boolean flags
    for flag, dest, desc in (
        ('--debug', 'DEBUG', 'debug mode'),
        ('--resume', 'RESUME', 'resume previous run'),
    ):
        parser.add_argument(flag, dest=dest, help=desc, action='store_true')
    # required string options
    parser.add_argument('--task', dest='TASK', help='task name, e.g., ok, aok_val, aok_test', type=str, required=True)
    parser.add_argument('--version', dest='VERSION', help='version name', type=str, required=True)
    # optional string options (flag, dest, help, default)
    optional = (
        ('--cfg', 'cfg_file', 'optional config file', 'configs/prompt.yml'),
        ('--examples_path', 'EXAMPLES_PATH', 'answer-aware example file path, default: "assets/answer_aware_examples_for_ok.json"', None),
        ('--candidates_path', 'CANDIDATES_PATH', 'candidates file path, default: "assets/candidates_for_ok.json"', None),
        ('--captions_path', 'CAPTIONS_PATH', 'captions file path, default: "assets/captions_for_ok.json"', None),
        ('--openai_key', 'OPENAI_KEY', 'openai api key', None),
    )
    for flag, dest, desc, default in optional:
        parser.add_argument(flag, dest=dest, help=desc, type=str, default=default)
if __name__ == '__main__':
    # Parse CLI args, merge the YAML config on top, then start prompting.
    parser = argparse.ArgumentParser(description='Heuristics-enhanced Prompting')
    prompt_login_args(parser)
    args = parser.parse_args()
    __C = Cfgs(args)
    with open(args.cfg_file, 'r') as f:
        yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
    __C.override_from_dict(yaml_dict)
    print(__C)
    runner = Runner(__C)
    runner.run()
================================================
FILE: prophet/stage2/utils/data_utils.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: dataset utils for stage2
# ------------------------------------------------------------------------------ #
import json
from typing import Dict
import pickle
from collections import Counter
# following two score is rough, and only for print accuracies during inferring.
def ok_score(gt_answers):
    """Map each ground-truth OK-VQA answer to a rough soft score.

    Votes translate to 0.3 / 0.6 / 0.9 for 1 / 2 / 3 occurrences and 1.0
    beyond that. The returned dict is ordered by vote count, most frequent
    first (callers rely on the first key being the top answer).
    """
    votes = Counter(a['answer'] for a in gt_answers)
    # explicit step table keeps the scores exact decimals (vs. cnt * 0.3)
    step = {1: 0.3, 2: 0.6, 3: 0.9}
    ranked = sorted(votes.items(), key=lambda kv: kv[1], reverse=True)
    return {ans: step.get(cnt, 1.0) for ans, cnt in ranked}
def aok_score(gt_answers):
    """Map each ground-truth A-OKVQA direct answer to a rough soft score.

    The score is min(1, votes / 3). The returned dict is ordered by vote
    count, most frequent first (callers rely on the first key being the
    top answer).
    """
    votes = Counter(gt_answers)
    ranked = sorted(votes.items(), key=lambda kv: kv[1], reverse=True)
    return {ans: min(1.0, cnt / 3.0) for ans, cnt in ranked}
class Qid2Data(Dict):
    """Question-id-indexed dataset view used by stage-2 prompting.

    Merges questions, (optionally) annotations, top-k answer candidates and
    image captions into one mapping keyed by question id (as str). Also
    provides the rough accuracy metrics that are only used for progress
    printing while inferring.
    """

    def __init__(self, __C, splits, annotated=False, similar_examples=None):
        """Load and merge all data sources for the given splits.

        Args:
            __C: task config; provides QUESTION_PATH / ANSWER_PATH /
                CANDIDATES_PATH / CAPTIONS_PATH / TASK / K_CANDIDATES.
            splits: iterable of split names to load and concatenate.
            annotated: if True, also load ground-truth answers and scores.
            similar_examples: optional {qid: [similar qids]} mapping used for
                in-context example selection; when given, every loaded qid
                must be covered by it.
        """
        super().__init__()
        self.__C = __C
        self.annotated = annotated

        # gather questions from all requested splits
        ques_set = []
        for split in splits:
            split_path = self.__C.QUESTION_PATH[split]
            _ques_set = json.load(open(split_path, 'r'))
            if 'questions' in _ques_set:
                _ques_set = _ques_set['questions']
            ques_set += _ques_set
        qid_to_ques = {str(q['question_id']): q for q in ques_set}

        # gather annotations when ground truth is available
        if annotated:
            anno_set = []
            for split in splits:
                split_path = self.__C.ANSWER_PATH[split]
                _anno_set = json.load(open(split_path, 'r'))
                if 'annotations' in _anno_set:
                    _anno_set = _anno_set['annotations']
                anno_set += _anno_set
            qid_to_anno = {str(a['question_id']): a for a in anno_set}

        qid_to_topk = json.load(open(__C.CANDIDATES_PATH))
        iid_to_capt = json.load(open(__C.CAPTIONS_PATH))
        # the answer-scoring scheme differs between OK-VQA and A-OKVQA
        _score = aok_score if 'aok' in __C.TASK else ok_score

        qid_to_data = {}
        for qid in qid_to_ques:
            q_item = qid_to_ques[qid]
            t_item = qid_to_topk[qid]
            iid = str(q_item['image_id'])
            # normalize captions to end with a period for prompt formatting
            caption = iid_to_capt[iid].strip()
            if caption[-1] != '.':
                caption += '.'
            qid_to_data[qid] = {
                'question_id': qid,
                'image_id': iid,
                'question': q_item['question'],
                'topk_candidates': t_item,
                'caption': caption,
            }
            if annotated:
                a_item = qid_to_anno[qid]
                if 'answers' in a_item:
                    answers = a_item['answers']
                else:
                    answers = a_item['direct_answers']
                ans2score = _score(answers)
                # most-voted answer; skip an empty string if it ranks first
                most_answer = list(ans2score.keys())[0]
                if most_answer == '':
                    most_answer = list(ans2score.keys())[1]
                qid_to_data[qid]['most_answer'] = most_answer
                qid_to_data[qid]['gt_scores'] = ans2score
        self.qid_to_data = qid_to_data

        k = __C.K_CANDIDATES
        if annotated:
            print(f'Loaded dataset size: {len(self.qid_to_data)}, top{k} accuracy: {self.topk_accuracy(k)*100:.2f}, top1 accuracy: {self.topk_accuracy(1)*100:.2f}')

        if similar_examples:
            for qid in similar_examples:
                qid_to_data[qid]['similar_qids'] = similar_examples[qid]
            # BUGFIX: validate over qid_to_data. The original iterated
            # `self.items()`, which is always empty because records are
            # stored on self.qid_to_data rather than on the dict itself,
            # so this completeness check silently never ran.
            for qid, item in self.qid_to_data.items():
                if 'similar_qids' not in item:
                    raise ValueError(f'qid {qid} does not have similar_qids')

    def __getitem__(self, __key):
        # route item access through the merged per-question records
        return self.qid_to_data[__key]

    def get_caption(self, qid):
        """Return the (period-terminated) caption of the question's image."""
        return self[qid]['caption']

    def get_question(self, qid):
        """Return the question text."""
        return self[qid]['question']

    def get_gt_answers(self, qid):
        """Return the {answer: score} ground truth, or None if not annotated."""
        if not self.annotated:
            return None
        return self[qid]['gt_scores']

    def get_most_answer(self, qid):
        """Return the most-voted ground-truth answer, or None if not annotated."""
        if not self.annotated:
            return None
        return self[qid]['most_answer']

    def get_topk_candidates(self, qid, k=None):
        """Return the candidate list, truncated to k entries when k is given."""
        if k is None:
            return self[qid]['topk_candidates']
        return self[qid]['topk_candidates'][:k]

    def get_similar_qids(self, qid, k=None):
        """Return similar question ids, truncated to k entries when k is given."""
        similar_qids = self[qid]['similar_qids']
        if k is not None:
            similar_qids = similar_qids[:k]
        return similar_qids

    def evaluate_by_threshold(self, ans_set, threshold=1.0):
        """Rough accuracy of `ans_set`, trusting the top-1 candidate when its
        confidence exceeds `threshold` and the provided answer otherwise.

        Returns -1 when ground truth is unavailable.
        """
        if not self.annotated:
            return -1
        total_score = 0.0
        for item in ans_set:
            qid = item['question_id']
            topk_candidates = self.get_topk_candidates(qid)
            top1_confid = topk_candidates[0]['confidence']
            if top1_confid > threshold:
                answer = topk_candidates[0]['answer']
            else:
                answer = item['answer']
            gt_answers = self.get_gt_answers(qid)
            if answer in gt_answers:
                total_score += gt_answers[answer]
        return total_score / len(ans_set)

    def topk_accuracy(self, k=1, sub_set=None):
        """Rough accuracy when the best of the first k candidates is taken.

        Returns -1 when ground truth is unavailable.
        """
        if not self.annotated:
            return -1
        total_score = 0.0
        if sub_set is not None:
            qids = sub_set
        else:
            qids = list(self.qid_to_data.keys())
        for qid in qids:
            topk_candidates = self.get_topk_candidates(qid)[:k]
            gt_answers = self.get_gt_answers(qid)
            score_list = [gt_answers.get(a['answer'], 0.0) for a in topk_candidates]
            # `default` guards against an (unexpected) empty candidate list,
            # which previously raised ValueError from max()
            total_score += max(score_list, default=0.0)
        return total_score / len(qids)

    def rt_evaluate(self, answer_set):
        """Format 'no-threshold -> full-threshold' accuracies for logging.

        Returns '' when ground truth is unavailable.
        """
        if not self.annotated:
            return ''
        score1 = self.evaluate_by_threshold(answer_set, 1.0) * 100
        score2 = self.evaluate_by_threshold(answer_set, 0.0) * 100
        score_string = f'{score2:.2f}->{score1:.2f}'
        return score_string
================================================
FILE: prophet/stage2/utils/fancy_pbar.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: customized progress bar
# ------------------------------------------------------------------------------ #
from time import sleep
from rich.table import Column
from rich.progress import *
import atexit
class RichColumn(ProgressColumn):
    """Combined progress column: 'done/total elapsed/remaining (s/iter) [info]'.

    Wraps the stock MofN / elapsed / remaining columns and appends a cached
    seconds-per-iteration figure plus an optional free-form `info` string
    that callers may set between iterations.
    """

    def __init__(self, table_column: Optional[Column] = None) -> None:
        super().__init__(table_column)
        self.time_elapsed_column = TimeElapsedColumn()
        self.time_remaining_column = TimeRemainingColumn()
        self.m_of_n = MofNCompleteColumn()
        self._completed = 0     # completed count at the last rate update
        self.sec_per_iter = -1  # cached seconds per iteration
        self.info = None        # optional extra text, set externally

    def render(self, task: "Task") -> Text:
        counts = Text(f'{self.m_of_n.render(task)}'.replace(' ', ''), style="red")
        timing = (
            Text(f'{self.time_elapsed_column.render(task)}', style="orange_red1")
            + Text('/', style="dark_orange")
            + Text(f'{self.time_remaining_column.render(task)}', style="yellow")
        )
        if task.completed:
            # refresh the rate only when new iterations have completed
            if self._completed < task.completed:
                self._completed = task.completed
                self.sec_per_iter = task.elapsed / task.completed
            rate = Text(f'({self.sec_per_iter:.1f}s/iter)', style="green")
        else:
            rate = Text(f'(--s/iter)', style="green")
        rendered = counts + ' ' + timing + rate
        if self.info is None:
            return rendered
        return rendered + Text(f' {self.info}', style="cyan")
# Module-level singleton progress bar shared by the stage-2 pipeline.
# `info_column` is exposed so callers can set `.info` to show extra text.
info_column = RichColumn()
progress = Progress(
    TextColumn("[bold]{task.description}", table_column=Column(ratio=1)),
    BarColumn(bar_width=None, table_column=Column(ratio=8), complete_style="blue"),
    # MofNCompleteColumn(),
    info_column,
    expand=True,
    redirect_stdout=False,
    redirect_stderr=False
)
# Start the live display at import time; closed again via the atexit hook.
progress.__enter__()
def exit_progress():
    # Tear the live display down when the interpreter exits (even on error).
    progress.__exit__(None, None, None)
atexit.register(exit_progress)
if __name__ == '__main__':
    # Manual smoke test: drive the shared progress bar, then raise a
    # ZeroDivisionError on purpose to check that the atexit hook still
    # closes the live display cleanly on a crash.
    # with progress:
    for n in progress.track(range(10), description="Working... "):
        sleep(0.01)
        print(n)
        if n == 8:
            0 / 0
================================================
FILE: scripts/evaluate_file.sh
================================================
#!/bin/bash
# This script is used to evaluate a result file.
# Usage: evaluate_file.sh [--task ok|aok_val|aok_test] [--result_path FILE]

# Parse arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --task)
            TASK="$2"
            shift 2;;
        --result_path)
            RESULT_PATH="$2"
            shift 2;;
        *)
            echo "Unknown argument: $1"
            exit 1;;
    esac
done

TASK=${TASK:-ok} # task name, one of ['ok', 'aok_val', 'aok_test'], default 'ok'
RESULT_PATH=${RESULT_PATH:-"preds/prophet_611_okvqa.json"} # path to the result file, default is the result from our experiments

# Expansions are quoted so empty values or paths containing spaces do not
# break the test expressions or argument passing.
if [ "$TASK" == "ok" ]; then
    python -m evaluation.okvqa_evaluate --result_path "$RESULT_PATH" \
        --question_path 'datasets/okvqa/OpenEnded_mscoco_val2014_questions.json' \
        --annotation_path 'datasets/okvqa/mscoco_val2014_annotations.json'
elif [ "$TASK" == "aok_val" ]; then
    python -m evaluation.aokvqa_evaluate --result_path "$RESULT_PATH" \
        --dataset_path 'datasets/aokvqa/aokvqa_v1p0_val.json' \
        --direct_answer --multiple_choice
elif [ "$TASK" == "aok_test" ]; then
    echo "Please submit your result to the AOKVQA leaderboard."
else
    echo "Unknown task: $TASK"
    exit 1
fi
================================================
FILE: scripts/evaluate_model.sh
================================================
#!/bin/bash
# This script is used to evaluate a finetuned model.
# Usage: evaluate_model.sh [--gpu IDS] [--task TASK] [--ckpt_path FILE] [--version NAME]

# Parse arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --gpu)
            GPU="$2"
            shift 2;;
        --task)
            TASK="$2"
            shift 2;;
        --ckpt_path)
            CKPT_PATH="$2"
            shift 2;;
        --version)
            VERSION="$2"
            shift 2;;
        *)
            echo "Unknown argument: $1"
            exit 1;;
    esac
done

TASK=${TASK:-ok} # task name, one of ['ok', 'aok_val', 'aok_test'], default 'ok'
GPU=${GPU:-0} # GPU id(s) you want to use, default '0'
CKPT_PATH=${CKPT_PATH:-"ckpts/mcan_ft_okvqa.pkl"} # path to the pretrained model, default is the result from our experiments
VERSION=${VERSION:-"eval_finetuned_${TASK}_model"} # version name, default 'eval_finetuned_$TASK_model'

# Expansions are quoted so empty values or paths with spaces survive word
# splitting when handed to python.
# CUDA_VISIBLE_DEVICES=$GPU \
python main.py \
    --task "$TASK" --run_mode finetune_test \
    --cfg configs/finetune.yml \
    --version "$VERSION" \
    --ckpt_path "$CKPT_PATH" \
    --gpu "$GPU" --grad_accu 2
================================================
FILE: scripts/extract_img_feats.sh
================================================
#!/bin/bash
# This script is used to extract image features.
# Usage: extract_img_feats.sh [--gpu IDS] [--dataset ok|aok] [--clip MODEL]

# Parse arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --gpu)
            GPU="$2"
            shift 2;;
        --dataset)
            DATASET="$2"
            shift 2;;
        --clip)
            CLIP_MODEL="$2"
            shift 2;;
        *)
            echo "Unknown argument: $1"
            exit 1;;
    esac
done

DATASET=${DATASET:-ok} # dataset name, one of ['ok', 'aok'], default 'ok'
GPU=${GPU:-0} # GPU id(s) you want to use, default '0'
CLIP_MODEL=${CLIP_MODEL:-RN50x64} # clip model name or path, default 'RN50x64'

# Expansions are quoted so empty values or paths with spaces survive word
# splitting when handed to python.
# CUDA_VISIBLE_DEVICES=$GPU \
python tools/extract_img_feats.py \
    --dataset "$DATASET" --gpu "$GPU" \
    --clip_model "$CLIP_MODEL"
================================================
FILE: scripts/finetune.sh
================================================
#!/bin/bash
# This script is used to finetune the pretrained MCAN model.
# Usage: finetune.sh [--gpu IDS] [--task TASK] [--pretrained_model FILE] [--version NAME]

# Parse arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --gpu)
            GPU="$2"
            shift 2;;
        --task)
            TASK="$2"
            shift 2;;
        --pretrained_model)
            PRETRAINED_MODEL_PATH="$2"
            shift 2;;
        --version)
            VERSION="$2"
            shift 2;;
        *)
            echo "Unknown argument: $1"
            exit 1;;
    esac
done

TASK=${TASK:-ok} # task name, one of ['ok', 'aok_val', 'aok_test'], default 'ok'
GPU=${GPU:-0} # GPU id(s) you want to use, default '0'
PRETRAINED_MODEL_PATH=${PRETRAINED_MODEL_PATH:-"ckpts/mcan_pt_okvqa.pkl"} # path to the pretrained model, default is the result from our experiments
VERSION=${VERSION:-finetuning_okvqa} # version name, default 'finetuning_okvqa'

# run python script
# Expansions are quoted so empty values or paths with spaces survive word
# splitting when handed to python.
# CUDA_VISIBLE_DEVICES=$GPU \
python main.py \
    --task "$TASK" --run_mode finetune \
    --cfg configs/finetune.yml \
    --version "$VERSION" \
    --pretrained_model "$PRETRAINED_MODEL_PATH" \
    --gpu "$GPU" --seed 99 --grad_accu 2
================================================
FILE: scripts/heuristics_gen.sh
================================================
#!/bin/bash
# This script is used to generate heuristics from a finetuned model.
# Usage: heuristics_gen.sh [--gpu IDS] [--task TASK] [--ckpt_path FILE]
#                          [--candidate_num N] [--example_num N] [--version NAME]

# Parse arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --gpu)
            GPU="$2"
            shift 2;;
        --task)
            TASK="$2"
            shift 2;;
        --ckpt_path)
            CKPT_PATH="$2"
            shift 2;;
        --candidate_num)
            CANDIDATE_NUM="$2"
            shift 2;;
        --example_num)
            EXAMPLE_NUM="$2"
            shift 2;;
        --version)
            VERSION="$2"
            shift 2;;
        *)
            echo "Unknown argument: $1"
            exit 1;;
    esac
done

TASK=${TASK:-ok} # task name, one of ['ok', 'aok_val', 'aok_test'], default 'ok'
GPU=${GPU:-0} # GPU id(s) you want to use, default '0'
CKPT_PATH=${CKPT_PATH:-"ckpts/mcan_ft_okvqa.pkl"} # path to the pretrained model, default is the result from our experiments
CANDIDATE_NUM=${CANDIDATE_NUM:-10} # number of candidates to be generated
EXAMPLE_NUM=${EXAMPLE_NUM:-100} # number of examples to be generated
VERSION=${VERSION:-"heuristics_okvqa"} # version name, default 'heuristics_okvqa'

# Expansions are quoted so empty values or paths with spaces survive word
# splitting when handed to python.
# CUDA_VISIBLE_DEVICES=$GPU \
python main.py \
    --task "$TASK" --run_mode heuristics \
    --version "$VERSION" \
    --cfg configs/finetune.yml \
    --ckpt_path "$CKPT_PATH" \
    --candidate_num "$CANDIDATE_NUM" \
    --example_num "$EXAMPLE_NUM" \
    --gpu "$GPU"
================================================
FILE: scripts/pretrain.sh
================================================
#!/bin/bash
# This script is used to pretrain the MCAN model.
# Usage: pretrain.sh [--gpu IDS] [--task TASK] [--version NAME]

# Parse arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --gpu)
            GPU="$2"
            shift 2;;
        --task)
            TASK="$2"
            shift 2;;
        --version)
            VERSION="$2"
            shift 2;;
        *)
            echo "Unknown argument: $1"
            exit 1;;
    esac
done

TASK=${TASK:-ok} # task name, one of ['ok', 'aok_val', 'aok_test'], default 'ok'
GPU=${GPU:-0} # GPU id(s) you want to use, default '0'
VERSION=${VERSION:-pretraining_okvqa} # version name, default 'pretraining_okvqa'

# Expansions are quoted so empty values or paths with spaces survive word
# splitting; also restored the missing space before the line continuation
# after 'pretrain'.
# CUDA_VISIBLE_DEVICES=$GPU \
python main.py \
    --task "$TASK" --run_mode pretrain \
    --cfg configs/pretrain.yml \
    --version "$VERSION" \
    --gpu "$GPU" --seed 99 --grad_accu 2
================================================
FILE: scripts/prompt.sh
================================================
#!/bin/bash
# This script is used to prompt GPT-3 to generate final answers.
# Usage: prompt.sh [--task TASK] [--version NAME] [--examples_path FILE]
#                  [--candidates_path FILE] [--captions_path FILE] [--openai_key KEY]

# Parse arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --task)
            TASK="$2"
            shift 2;;
        --version)
            VERSION="$2"
            shift 2;;
        --examples_path)
            EXAMPLES_PATH="$2"
            shift 2;;
        --candidates_path)
            CANDIDATES_PATH="$2"
            shift 2;;
        --captions_path)
            CAPTIONS_PATH="$2"
            shift 2;;
        --openai_key)
            OPENAI_KEY="$2"
            shift 2;;
        *)
            echo "Unknown argument: $1"
            exit 1;;
    esac
done

TASK=${TASK:-ok} # task name, one of ['ok', 'aok_val', 'aok_test'], default 'ok'
VERSION=${VERSION:-"prompt_okvqa"} # version name, default 'prompt_okvqa'
EXAMPLES_PATH=${EXAMPLES_PATH:-"assets/answer_aware_examples_okvqa.json"} # path to the examples, default is the result from our experiments
CANDIDATES_PATH=${CANDIDATES_PATH:-"assets/candidates_okvqa.json"} # path to the candidates, default is the result from our experiments
CAPTIONS_PATH=${CAPTIONS_PATH:-"assets/captions_okvqa.json"} # path to the captions, default is the result from our experiments
OPENAI_KEY=${OPENAI_KEY:-""} # openai api key, default empty

# Expansions are quoted: with the previous unquoted form, an empty
# OPENAI_KEY collapsed to nothing and '--openai_key' was passed without a
# value, making argparse fail. Quoting passes an explicit empty string.
python main.py \
    --task "$TASK" --run_mode prompt \
    --version "$VERSION" \
    --cfg configs/prompt.yml \
    --examples_path "$EXAMPLES_PATH" \
    --candidates_path "$CANDIDATES_PATH" \
    --captions_path "$CAPTIONS_PATH" \
    --openai_key "$OPENAI_KEY"
================================================
FILE: tools/extract_img_feats.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: Tool for extracting image features
# ------------------------------------------------------------------------------ #
import os, sys
sys.path.append(os.getcwd())
import glob, re, math, time, datetime
import numpy as np
import torch
from torch import nn
from PIL import Image
import clip
from tqdm import tqdm
import argparse
from pathlib import Path
from configs.task_cfgs import Cfgs
from configs.task_to_split import *
from tools.transforms import _transform
@torch.no_grad()
def _extract_feat(img_path, net, T, save_path):
    """Extract grid features for one image and save them as a .npz file.

    Args:
        img_path: path of the input image.
        net: feature extractor; maps a (1, C, H, W) cuda tensor to a
            (1, D, h, w) feature map.
        T: torchvision-style preprocessing transform.
        save_path: output .npz path; parent dirs are created as needed.
    """
    # Use a context manager so the file handle is closed deterministically
    # (Image.open is lazy and otherwise keeps the file open — the original
    # never closed it, leaking one handle per image).
    with Image.open(img_path) as img:
        img = T(img).unsqueeze(0).cuda()
    clip_feats = net(img).cpu().numpy()[0]
    # (D, h, w) -> (h, w, D)
    clip_feats = clip_feats.transpose(1, 2, 0)
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    np.savez(
        save_path,
        x=clip_feats,
    )
class ExtractModel:
    """Wrap a CLIP visual encoder as a spatial-feature extractor.

    The attention-pooling head is replaced by an identity so the encoder
    returns the feature map instead of a pooled embedding.
    """

    def __init__(self, encoder) -> None:
        # drop the pooling head, keep the convolutional trunk
        encoder.attnpool = nn.Identity()
        self.backbone = encoder
        self.backbone.cuda().eval()

    @torch.no_grad()
    def __call__(self, img):
        """Run the backbone on a preprocessed image batch."""
        return self.backbone(img)
def main(__C, dataset):
    """Extract features for every image of all splits of *dataset*.

    Output .npz files mirror the image paths, with the third-from-last
    path component suffixed by '_feats'.
    """
    # collect image directories of every split belonging to this dataset
    img_dir_list = [
        __C.IMAGE_DIR[SPLIT_TO_IMGS[split]]
        for split in SPLIT_TO_IMGS
        if split.startswith(dataset)
    ]
    print('image dirs:', img_dir_list)

    img_path_list = []
    for img_dir in img_dir_list:
        img_path_list += glob.glob(img_dir + '*.jpg')
    print('total images:', len(img_path_list))

    # build the CLIP visual backbone and the matching preprocessing
    clip_model, _ = clip.load(__C.CLIP_VERSION, device='cpu')
    model = ExtractModel(clip_model.visual)
    T = _transform(__C.IMG_RESOLUTION)

    for img_path in tqdm(img_path_list):
        # save features into a sibling '<dir>_feats' tree next to the images
        parts = img_path.split('/')
        parts[-3] += '_feats'
        save_path = '/'.join(parts).replace('.jpg', '.npz')
        _extract_feat(img_path, model, T, save_path)
if __name__ == '__main__':
    # CLI entry: build the task config from the parsed flags and run the
    # extraction over the requested dataset ('ok' or 'aok').
    parser = argparse.ArgumentParser('Tool for extracting CLIP image features.')
    parser.add_argument('--dataset', dest='dataset', help='dataset name, e.g., ok, aok', type=str, required=True)
    parser.add_argument('--gpu', dest='GPU', help='gpu id', type=str, default='0')
    parser.add_argument('--clip_model', dest='CLIP_VERSION', help='clip model name or local model checkpoint path', type=str, default='RN50x64')
    parser.add_argument('--img_resolution', dest='IMG_RESOLUTION', help='image resolution', type=int, default=512)
    args = parser.parse_args()
    __C = Cfgs(args)
    main(__C, args.dataset)
================================================
FILE: tools/transforms.py
================================================
# ------------------------------------------------------------------------------ #
# Author: Zhenwei Shao (https://github.com/ParadoxZW)
# Description: Preprocessing images to be fed into the model, the script is
# adapted from the code of CLIP (github.com/openai/CLIP)
# ------------------------------------------------------------------------------ #
from math import ceil
from PIL import Image
import numpy as np
import torch
import torch.nn.functional as F
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import ImageOps
try:
from torchvision.transforms import InterpolationMode
BICUBIC = InterpolationMode.BICUBIC
except ImportError:
BICUBIC = Image.BICUBIC
def Pad():
    """Return a transform that zero-pads a PIL image towards a square.

    The shorter side gets ceil((long - short) / 2) black pixels on each
    end, so the result may overshoot a perfect square by one pixel when
    the side difference is odd.
    """
    def _pad(image):
        W, H = image.size  # PIL reports (width, height)
        diff = W - H
        if diff > 0:
            # landscape: pad top and bottom
            border = (0, ceil(diff / 2), 0, ceil(diff / 2))
        else:
            # portrait (or square, where the pad is zero): pad left and right
            border = (ceil(-diff / 2), 0, ceil(-diff / 2), 0)
        return ImageOps.expand(image, border=border, fill=0)
    return _pad
def _convert_image_to_rgb(image):
    # Normalize any PIL mode (L, RGBA, P, ...) to 3-channel RGB.
    return image.convert("RGB")
def identity(x):
    # No-op transform used to skip optional pipeline steps.
    return x
def _transform(n_px, pad=False, crop=False):
    """Build the CLIP-style image preprocessing pipeline.

    Args:
        n_px: target square resolution in pixels.
        pad: if True, zero-pad towards a square before resizing.
        crop: if True, center-crop to n_px after resizing.
    """
    steps = [
        Pad() if pad else identity,
        Resize([n_px, n_px], interpolation=BICUBIC),
        CenterCrop(n_px) if crop else identity,
        _convert_image_to_rgb,
        ToTensor(),
        # CLIP's published normalization constants
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ]
    return Compose(steps)
if __name__ == '__main__':
    # Smoke test: run a random image through the transform and print the
    # resulting tensor shape.
    # NOTE(review): rand() yields values in [0, 1), so astype('uint8')
    # produces an (almost) all-black image — fine for a shape check only.
    img = np.random.rand(100, 333, 3).astype('uint8')
    img = Image.fromarray(img)
    img = _transform(32 * 14)(img)
    img = torch.Tensor(img)
    print(img.size())