Showing preview only (2,764K chars total). Download the full file or copy to clipboard to get everything.
Repository: UX-Decoder/LLaVA-Grounding
Branch: main
Commit: 668b7cc3d536
Files: 257
Total size: 2.6 MB
Directory structure:
gitextract_hcvjbjkn/
├── .gitignore
├── LICENSE
├── README.md
├── configs/
│ ├── openseed/
│ │ ├── openseed_swint_lang_joint.yaml
│ │ ├── openseed_swint_lang_joint_2st.yaml
│ │ └── openseed_swint_lang_joint_2st_visual_prompt.yaml
│ └── semsam/
│ └── visual_prompt_encoder.yaml
├── datasets_os/
│ ├── __init__.py
│ ├── build.py
│ ├── custom_dataset_dataloader.py
│ ├── dataset_mappers/
│ │ ├── __init__.py
│ │ ├── coco_instance_new_baseline_dataset_mapper.py
│ │ ├── coco_instruct_grounding_dataset_interactive_mapper.py
│ │ ├── coco_instruct_grounding_dataset_mapper.py
│ │ ├── coco_interactive_panoptic_new_baseline_dataset_mapper.py
│ │ ├── coco_panoptic_interactive_dataset_mapper.py
│ │ ├── coco_panoptic_new_baseline_dataset_mapper.py
│ │ ├── flickr_instance_new_baseline_dataset_mapper.py
│ │ ├── flickr_instance_new_baseline_dataset_mapper_.py
│ │ ├── flickr_instance_new_baseline_dataset_mapper_end.py
│ │ ├── flickr_new_baseline_dataset_mapper.py
│ │ ├── inference_mapper_with_gt.py
│ │ ├── llava_dataset_mapper.py
│ │ ├── refcoco_dataset_mapper.py
│ │ └── vg_instance_new_baseline_dataset_mapper.py
│ ├── refer.py
│ ├── registration/
│ │ ├── __init__.py
│ │ ├── register_coco_instruct_grounding_dataset.py
│ │ ├── register_coco_panoptic_annos_grounding_interactive.py
│ │ ├── register_flickr_dataset.py
│ │ └── register_vg_dataset.py
│ └── semseg_loader.py
├── docs/
│ └── MODEL_ZOO.md
├── gradio_demo/
│ ├── LLaVA_G_Demo.py
│ └── __init__.py
├── llava/
│ ├── __init__.py
│ ├── constants.py
│ ├── conversation.py
│ ├── eval/
│ │ ├── LLaVA_G_Eval.py
│ │ ├── eval_gpt_review.py
│ │ ├── eval_gpt_review_bench.py
│ │ ├── eval_gpt_review_visual.py
│ │ ├── eval_gpt_review_visual2.py
│ │ ├── eval_science_qa.py
│ │ ├── eval_science_qa_gpt4.py
│ │ ├── eval_science_qa_gpt4_requery.py
│ │ ├── generate_webpage_data_from_table.py
│ │ ├── llava_mapper.py
│ │ ├── model_qa.py
│ │ ├── model_vqa.py
│ │ ├── model_vqa_science.py
│ │ ├── qa_baseline_gpt35.py
│ │ ├── run_llava.py
│ │ ├── summarize_gpt_review.py
│ │ └── webpage/
│ │ ├── index.html
│ │ ├── script.js
│ │ └── styles.css
│ ├── mm_utils.py
│ ├── model/
│ │ ├── __init__.py
│ │ ├── apply_delta.py
│ │ ├── builder.py
│ │ ├── consolidate.py
│ │ ├── language_model/
│ │ │ ├── llava_llama.py
│ │ │ ├── llava_llama_gd.py
│ │ │ ├── llava_mpt.py
│ │ │ └── mpt/
│ │ │ ├── adapt_tokenizer.py
│ │ │ ├── attention.py
│ │ │ ├── blocks.py
│ │ │ ├── configuration_mpt.py
│ │ │ ├── custom_embedding.py
│ │ │ ├── flash_attn_triton.py
│ │ │ ├── hf_prefixlm_converter.py
│ │ │ ├── meta_init_context.py
│ │ │ ├── modeling_mpt.py
│ │ │ ├── norm.py
│ │ │ └── param_init_fns.py
│ │ ├── llava_arch.py
│ │ ├── make_delta.py
│ │ ├── multimodal_encoder/
│ │ │ ├── builder.py
│ │ │ └── clip_encoder.py
│ │ ├── openseed/
│ │ │ ├── BaseModel.py
│ │ │ ├── __init__.py
│ │ │ ├── architectures/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── openseed_model.py
│ │ │ │ ├── openseed_model_decouple_train.py
│ │ │ │ └── registry.py
│ │ │ ├── backbone/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── backbone.py
│ │ │ │ ├── build.py
│ │ │ │ ├── focal.py
│ │ │ │ ├── focal_dw.py
│ │ │ │ ├── registry.py
│ │ │ │ └── swin.py
│ │ │ ├── body/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── decoder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── build.py
│ │ │ │ │ ├── modules.py
│ │ │ │ │ ├── openseed_decoder.py
│ │ │ │ │ ├── openseed_decoder_decouple.py
│ │ │ │ │ ├── registry.py
│ │ │ │ │ └── utils/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── dino_decoder.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── encoder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── build.py
│ │ │ │ │ ├── encoder_deform.py
│ │ │ │ │ ├── ops/
│ │ │ │ │ │ ├── functions/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── ms_deform_attn_func.py
│ │ │ │ │ │ ├── make.sh
│ │ │ │ │ │ ├── modules/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── ms_deform_attn.py
│ │ │ │ │ │ ├── setup.py
│ │ │ │ │ │ ├── src/
│ │ │ │ │ │ │ ├── cpu/
│ │ │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp
│ │ │ │ │ │ │ │ └── ms_deform_attn_cpu.h
│ │ │ │ │ │ │ ├── cuda/
│ │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu
│ │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.h
│ │ │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh
│ │ │ │ │ │ │ ├── ms_deform_attn.h
│ │ │ │ │ │ │ └── vision.cpp
│ │ │ │ │ │ └── test.py
│ │ │ │ │ ├── registry.py
│ │ │ │ │ └── transformer_encoder_fpn.py
│ │ │ │ ├── openseed_head.py
│ │ │ │ ├── registry.py
│ │ │ │ └── transformer_blocks.py
│ │ │ ├── language/
│ │ │ │ ├── LangEncoder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── build.py
│ │ │ │ │ ├── registry.py
│ │ │ │ │ └── transformer.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── encoder.py
│ │ │ │ ├── registry.py
│ │ │ │ └── vlpencoder.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── attention.py
│ │ │ │ ├── criterion.py
│ │ │ │ ├── matcher.py
│ │ │ │ ├── point_features.py
│ │ │ │ ├── position_encoding.py
│ │ │ │ └── postprocessing.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── box_ops.py
│ │ │ ├── config.py
│ │ │ └── misc.py
│ │ ├── semsam/
│ │ │ ├── BaseModel.py
│ │ │ ├── __init__.py
│ │ │ ├── architectures/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── idino_model_partwhole_all_llm_ref_feats_all_det_pretrainv1.py
│ │ │ │ └── registry.py
│ │ │ ├── backbone/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── backbone.py
│ │ │ │ ├── build.py
│ │ │ │ ├── focal.py
│ │ │ │ ├── focal_dw.py
│ │ │ │ ├── registry.py
│ │ │ │ ├── swin.py
│ │ │ │ └── swin_new.py
│ │ │ ├── body/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── decoder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── build.py
│ │ │ │ │ ├── idino_decoder_no_iou_token_partwhole_all_llm.py
│ │ │ │ │ ├── modules.py
│ │ │ │ │ ├── registry.py
│ │ │ │ │ └── utils/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── dino_decoder.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── encoder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── build.py
│ │ │ │ │ ├── encoder_deform.py
│ │ │ │ │ ├── ops/
│ │ │ │ │ │ ├── functions/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── ms_deform_attn_func.py
│ │ │ │ │ │ ├── make.sh
│ │ │ │ │ │ ├── modules/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── ms_deform_attn.py
│ │ │ │ │ │ ├── setup.py
│ │ │ │ │ │ ├── src/
│ │ │ │ │ │ │ ├── cpu/
│ │ │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp
│ │ │ │ │ │ │ │ └── ms_deform_attn_cpu.h
│ │ │ │ │ │ │ ├── cuda/
│ │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu
│ │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.h
│ │ │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh
│ │ │ │ │ │ │ ├── ms_deform_attn.h
│ │ │ │ │ │ │ └── vision.cpp
│ │ │ │ │ │ └── test.py
│ │ │ │ │ ├── registry.py
│ │ │ │ │ └── transformer_encoder_fpn.py
│ │ │ │ ├── openseed_head.py
│ │ │ │ ├── registry.py
│ │ │ │ └── transformer_blocks.py
│ │ │ ├── language/
│ │ │ │ ├── LangEncoder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── build.py
│ │ │ │ │ ├── registry.py
│ │ │ │ │ └── transformer.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── encoder.py
│ │ │ │ ├── fixencoder.py
│ │ │ │ ├── llama_encoder.py
│ │ │ │ ├── loss.py
│ │ │ │ ├── misc.py
│ │ │ │ ├── modeling_llama_os.py
│ │ │ │ ├── registry.py
│ │ │ │ └── vlpencoder.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── attention.py
│ │ │ │ ├── criterion_id_llm.py
│ │ │ │ ├── hooks.py
│ │ │ │ ├── matcher.py
│ │ │ │ ├── point_features.py
│ │ │ │ ├── position_encoding.py
│ │ │ │ └── postprocessing.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── box_ops.py
│ │ │ ├── config.py
│ │ │ └── misc.py
│ │ └── utils.py
│ ├── serve/
│ │ ├── __init__.py
│ │ ├── cli.py
│ │ ├── controller.py
│ │ ├── gradio_web_server.py
│ │ ├── register_worker.py
│ │ └── test_message.py
│ ├── train/
│ │ ├── llama_flash_attn_monkey_patch.py
│ │ ├── llava_trainer.py
│ │ ├── llava_trainer_gd.py
│ │ ├── llava_trainer_joint_train.py
│ │ ├── train.py
│ │ ├── train_grounding_1st.py
│ │ ├── train_joint_1st.py
│ │ ├── train_joint_2st.py
│ │ ├── train_joint_2st_interactive_refcoco_coco_instruction.py
│ │ └── train_mem.py
│ └── utils.py
├── pyproject.toml
├── scripts/
│ ├── convert_sqa_to_llava.py
│ ├── convert_sqa_to_llava_base_prompt.py
│ ├── finetune.sh
│ ├── finetune_visual_prompt.sh
│ ├── merge_lora_weights.py
│ └── pretrain_joint.sh
└── utils/
├── Config.py
├── __init__.py
├── arguments.py
├── constants.py
├── constants_ori.py
├── dist.py
├── distributed.py
├── misc.py
├── model.py
├── nms.py
├── prompt_engineering.py
├── utils.py
└── visualizer.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
*.err
*.out
*.pyc
wandb
/data_preparation/vis_results/
/data_preparation/vis_results_new/
/LLAVA_Stage1_Pretrained/
/work_dirs/
/llava.egg-info/
/data_preparation/data/
/vis_results/
model_worker*
/playground/
*.jsonl
*.pth
gradio_demo/tmp_files
llava_bench_results
symmary_results
eval_gpt4
vis_results_pdf_precision
vis_results_pdf_recall
output/
datasets/
output
datasets
*.log
*.json
__pycache__/
*/__pycache__
*/*/__pycache__
*/*/*/__pycache__
*/*/*/*/__pycache__
gradio_demo/examples/*.mp4
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
🌋 LLaVA-Grounding: Grounded Visual Chat with Large Multimodal Models
========
[[Project Page](https://llava-vl.github.io/llava-grounding)] [[Arxiv](https://arxiv.org/abs/2312.02949)] [[Demo](https://llava-grounding.deepdataspace.com/
)] [[Model Zoo](https://github.com/UX-Decoder/LLaVA-Grounding/blob/main/docs/MODEL_ZOO.md)]
<!-- [[`Paper`](xxx)] [[`BibTex`](#black_nib-citation)] -->
## :fire: News
[2024/1/14] Our training code is released.
[2023/12/6] Our paper is available in arxiv.
## Contents
- [🌋 LLaVA-Grounding: Grounded Visual Chat with Large Multimodal Models](#-llava-grounding-grounded-visual-chat-with-large-multimodal-models)
- [:fire: News](#fire-news)
- [Contents](#contents)
- [Install](#install)
- [LLaVA-Grounding Weights](#llava-grounding-weights)
- [Demo](#demo)
- [Training data](#training-data)
- [Flickr30k](#flickr30k)
- [COCO](#coco)
- [LLaVA](#llava)
- [Training](#training)
- [Citation](#citation)
### Install
1. Clone this repository and navigate to the LLaVA-Grounding folder:
```shell
git clone https://github.com/UX-Decoder/LLaVA-Grounding.git
cd LLaVA-Grounding
```
2. Install required packages:
```
conda create -n llava python=3.10 -y
conda activate llava
pip install --upgrade pip # enable PEP 660 support
pip install -e .
```
3. Install additional packages for training cases
```
pip install -e ".[train]"
pip install flash-attn --no-build-isolation
```
4. Install packages necessary for [OpenSeeD](https://github.com/IDEA-Research/OpenSeeD) and [Semantic-SAM](https://github.com/UX-Decoder/Semantic-SAM).
### LLaVA-Grounding Weights
Please check out our [Model Zoo](https://github.com/UX-Decoder/LLaVA-Grounding/blob/main/docs/MODEL_ZOO.md) for all public LLaVA-Grounding checkpoints, and the instructions on how to use the weights.
### Demo
After downloading the model weights, simply run the following commands to launch the demo on your own machine.
```shell
CUDA_VISIBLE_DEVICES=0 python gradio_demo/LLaVA_G_Demo.py --path_vision_cfg path_to_vision_cfg --path_inter_cfg path_to_inter_cfg --model_path path_to_ckpt_dir
# for example, after downloading weights into checkpoints/llava_grounding
CUDA_VISIBLE_DEVICES=0 python gradio_demo/LLaVA_G_Demo.py --path_vision_cfg configs/openseed/openseed_swint_lang_joint_2st_visual_prompt.yaml --path_inter_cfg configs/semsam/visual_prompt_encoder.yaml --model_path checkpoints/llava_grounding
```
Please refer to our [Online Demo](https://llava-grounding.deepdataspace.com/) for more detailed user guidance.
### Training data
```text
data
├── flickr30k_entities
│ ├── train/
│ ├── val/
│ ├── annotations
│ ├──final_flickr_separateGT_train.json
│ ├──final_flickr_separateGT_val.json
├── coco
│ ├── train2014/
│ ├── train2017/
│ ├── panoptic_train2017/
│ ├── panoptic_semseg_train2017/
│ ├── annotations
│ │ ├──instances_train2017.json
│ │ ├──instances_train2017_gvc.json
│ │ ├──grounded_visual_chat_data.json
│ │ ├──instances_train2014_filter.json
│ │ ├──panoptic_train2017_filter.json
│ │ ├──grounding_train2017.json
├── llava
│ ├── annotations
│ ├── cap600k_brackets_all.json
│ ├── llava_instruct_150k.json
│ ├── llava_instruct_150k_visual_prompt.json
```
#### Flickr30k
Please refer to [MDETR's pre-processed flickr30k data](https://github.com/ashkamath/mdetr/blob/main/.github/flickr.md).
#### COCO
Please download coco train2014 and train2017 images and panoptic segmentation and semantic segmentation data. Other annotations can be downloaded [here](https://github.com/UX-Decoder/LLaVA-Grounding/releases/tag/train_data).
#### LLaVA
The processed annotations can be downloaded [here](https://github.com/UX-Decoder/LLaVA-Grounding/releases/tag/train_data).
### Training
Stage 1
```shell
bash scripts/pretrain_joint.sh
```
Stage 2
```shell
bash scripts/finetune.sh
```
Stage 3
```shell
bash scripts/finetune_visual_prompt.sh
```
### Citation
If you find LLaVA-Grounding useful for your research and applications, please cite using this BibTeX:
```bibtex
@misc{zhang2023llavagrounding,
title={LLaVA-Grounding: Grounded Visual Chat with Large Multimodal Models},
author={Hao Zhang and Hongyang Li and Feng Li and Tianhe Ren and Xueyan Zou and Shilong Liu and Shijia Huang and Jianfeng Gao and Lei Zhang and Chunyuan Li and Jianwei Yang},
year={2023},
booktitle={arXiv}
}
@misc{liu2023llava,
title={Visual Instruction Tuning},
author={Liu, Haotian and Li, Chunyuan and Wu, Qingyang and Lee, Yong Jae},
publisher={arXiv:2304.08485},
year={2023}
}
```
================================================
FILE: configs/openseed/openseed_swint_lang_joint.yaml
================================================
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
##################
# Task settings
##################
WEIGHT: ''
PORT: 53711
VERBOSE: true
#OUTPUT_DIR: '../../data/output/test'
inference_only: true
OUTPUT_DIR: '../../data/output/test'
clip: true
# misc
LOADER:
JOINT: True
KEY_DATASET: 'flickr'
# model
MODEL:
NAME: openseed_model
HEAD: openseed_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 4096
BACKBONE_DIM: 768
BACKGROUND: False
WEIGHTS: ''
TEXT:
ARCH: encoder
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 18 # 18
WIDTH: 512
HEADS: 8
LAYERS: 12
AUTOGRESSIVE: True
BACKBONE:
NAME: swin
PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'
LOAD_PRETRAINED: true
SWIN:
PRETRAIN_IMG_SIZE: 224
PATCH_SIZE: 4
EMBED_DIM: 96
DEPTHS: [ 2, 2, 6, 2 ]
NUM_HEADS: [ 3, 6, 12, 24 ]
WINDOW_SIZE: 7
MLP_RATIO: 4.0
QKV_BIAS: true
QK_SCALE: ~
DROP_RATE: 0.0
ATTN_DROP_RATE: 0.0
DROP_PATH_RATE: 0.3
APE: false
PATCH_NORM: true
USE_CHECKPOINT: false
OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
ENCODER:
NAME: encoder_deform
IGNORE_VALUE: 255
NUM_CLASSES: 133
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
TOTAL_NUM_FEATURE_LEVELS: 4
NUM_FEATURE_LEVELS: 3
FEATURE_ORDER: "low2high"
DECODER:
NAME: openseed_decoder
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK: True
BOX: True
GROUNDING:
ENABLED: False
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.0
SIM_THRES: 0.95
CAPTIONING:
ENABLED: False
STEP: 50
RETRIEVAL:
ENABLED: False
DIM_IMG: 768
ENSEMBLE: True
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 4.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
COST_CLASS_WEIGHT: 4.0
COST_DICE_WEIGHT: 5.0
COST_MASK_WEIGHT: 5.0
COST_BOX_WEIGHT: 5.0
COST_GIOU_WEIGHT: 2.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 300
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TWO_STAGE: True
INITIALIZE_BOX_TYPE: 'no'
DN: seg
DN_NOISE_SCALE: 0.4
DN_NUM: 100
INITIAL_PRED: True
LEARN_TGT: False
TOTAL_NUM_FEATURE_LEVELS: 4
SEMANTIC_CE_LOSS: False
PANO_BOX_LOSS: False
COCO: True
O365: False
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.25
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
TEST_FOUCUS_ON_BOX: False
PANO_TRANSFORM_EVAL: True
PANO_TEMPERATURE: 0.06
TEST:
EVAL_PERIOD: 500000
PRECISE_BN:
NUM_ITER: 1
ENABLED: False
AUG:
ENABLED: False
SAM:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.99
MAX_SCALE: 1.01
DATASET_MAPPER_NAME: "sam"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'sam'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
COCO:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "coco_ref_panoptic_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
VLP:
INPUT:
IMAGE_SIZE: 224
DATASET_MAPPER_NAME: "vlpretrain"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TRAIN:
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
TEST:
BATCH_SIZE_TOTAL: 256
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ["flickr_train","coco_2017_train_panoptic_ref_full_with_sem_seg_caption_grounding"]
TEST: ["flickr_val"]
CLASS_CONCAT: false
SIZE_DIVISIBILITY: 32
PROPOSAL_FILES_TRAIN: []
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR_END: 0.0
MOMENTUM: 0.9
NESTEROV: False
CHECKPOINT_PERIOD: 5000
IMS_PER_BATCH: 1
REFERENCE_WORLD_SIZE: 0
BIAS_LR_FACTOR: 1.0
WEIGHT_DECAY_BIAS: None
# original
BASE_LR: 0.0001
STEPS: [327778, 355092]
MAX_ITER: 368750
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
# Evaluation Dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [640, 640]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 16
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SUN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SCAN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
BDD:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
CITY:
INPUT:
MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 1024
MAX_SIZE_TRAIN: 4096
MAX_SIZE_TEST: 2048
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [ 512, 1024 ]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: -1
FORMAT: "RGB"
DATASET_MAPPER_NAME: "mask_former_panoptic"
MASK_FORMAT: "polygon"
TEST:
EVAL_PERIOD: 5000
BATCH_SIZE_TOTAL: 1
AUG:
ENABLED: False
MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
MAX_SIZE: 4096
FLIP: True
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: True
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
PSACAL_PART:  # NOTE(review): likely a typo of "PASCAL_PART" — key kept verbatim because loader code may reference this exact spelling; verify before renaming
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "pascal_part_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
MODEL:
MASK_ON: True
KEYPOINT_ON: False
LOAD_PROPOSALS: False
# DATASET:
# DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
llava:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "llava"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
flickr:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "flickr"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
vg:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "vg"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
================================================
FILE: configs/openseed/openseed_swint_lang_joint_2st.yaml
================================================
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
##################
# Task settings
##################
WEIGHT: ''
PORT: 53711
detach_seg: False
VERBOSE: true
#OUTPUT_DIR: '../../data/output/test'
inference_only: true
OUTPUT_DIR: '../../data/output/test'
clip: true
# misc
LOADER:
JOINT: True
KEY_DATASET: 'flickr'
# model
MODEL:
NAME: openseed_model
HEAD: openseed_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 4096
BACKBONE_DIM: 768
BACKGROUND: False
WEIGHTS: ''
TEXT:
ARCH: encoder
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 18 # 18
WIDTH: 512
HEADS: 8
LAYERS: 12
AUTOGRESSIVE: True
BACKBONE:
NAME: swin
PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'
LOAD_PRETRAINED: true
SWIN:
PRETRAIN_IMG_SIZE: 224
PATCH_SIZE: 4
EMBED_DIM: 96
DEPTHS: [ 2, 2, 6, 2 ]
NUM_HEADS: [ 3, 6, 12, 24 ]
WINDOW_SIZE: 7
MLP_RATIO: 4.0
QKV_BIAS: true
QK_SCALE: ~
DROP_RATE: 0.0
ATTN_DROP_RATE: 0.0
DROP_PATH_RATE: 0.3
APE: false
PATCH_NORM: true
USE_CHECKPOINT: false
OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
ENCODER:
NAME: encoder_deform
IGNORE_VALUE: 255
NUM_CLASSES: 133
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
TOTAL_NUM_FEATURE_LEVELS: 4
NUM_FEATURE_LEVELS: 3
FEATURE_ORDER: "low2high"
DECODER:
NAME: openseed_decoder
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK: True
BOX: True
COCO_ONLY: True
GROUNDING:
ENABLED: False
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.0
SIM_THRES: 0.95
CAPTIONING:
ENABLED: False
STEP: 50
RETRIEVAL:
ENABLED: False
DIM_IMG: 768
ENSEMBLE: True
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 4.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
LLM_WEIGHT: 1.0
WEIGHT_MULTIPLIER: 1.0
COST_CLASS_WEIGHT: 4.0
COST_DICE_WEIGHT: 5.0
COST_MASK_WEIGHT: 5.0
COST_BOX_WEIGHT: 5.0
COST_GIOU_WEIGHT: 2.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 300
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TWO_STAGE: True
INITIALIZE_BOX_TYPE: 'no'
DN: seg
DN_NOISE_SCALE: 0.4
DN_NUM: 100
INITIAL_PRED: True
LEARN_TGT: False
TOTAL_NUM_FEATURE_LEVELS: 4
SEMANTIC_CE_LOSS: False
PANO_BOX_LOSS: False
COCO: True
O365: False
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.25
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
TEST_FOUCUS_ON_BOX: False
PANO_TRANSFORM_EVAL: True
PANO_TEMPERATURE: 0.06
TEST:
EVAL_PERIOD: 500000
PRECISE_BN:
NUM_ITER: 1
ENABLED: False
AUG:
ENABLED: False
SAM:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.99
MAX_SCALE: 1.01
DATASET_MAPPER_NAME: "sam"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'sam'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
COCO:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "coco_ref_panoptic_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
VLP:
INPUT:
IMAGE_SIZE: 224
DATASET_MAPPER_NAME: "vlpretrain"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TRAIN:
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
TEST:
BATCH_SIZE_TOTAL: 256
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ["coco_instruct_train_v3","flickr_train"]
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR_END: 0.0
MOMENTUM: 0.9
NESTEROV: False
CHECKPOINT_PERIOD: 5000
IMS_PER_BATCH: 1
REFERENCE_WORLD_SIZE: 0
BIAS_LR_FACTOR: 1.0
  WEIGHT_DECAY_BIAS: None  # NOTE(review): bare "None" parses as the string "None" in YAML, not null — confirm the config loader coerces it (YAML null is ~/null)
# original
BASE_LR: 0.0001
STEPS: [327778, 355092]
MAX_ITER: 368750
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
# Evaluation Dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [640, 640]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 16
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SUN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SCAN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
BDD:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
CITY:
INPUT:
MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 1024
MAX_SIZE_TRAIN: 4096
MAX_SIZE_TEST: 2048
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [ 512, 1024 ]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: -1
FORMAT: "RGB"
DATASET_MAPPER_NAME: "mask_former_panoptic"
MASK_FORMAT: "polygon"
TEST:
EVAL_PERIOD: 5000
BATCH_SIZE_TOTAL: 1
AUG:
ENABLED: False
MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
MAX_SIZE: 4096
FLIP: True
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: True
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
PSACAL_PART:  # NOTE(review): likely a typo of "PASCAL_PART" — key kept verbatim because loader code may reference this exact spelling; verify before renaming
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "pascal_part_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
MODEL:
MASK_ON: True
KEYPOINT_ON: False
LOAD_PROPOSALS: False
# DATASET:
# DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
llava:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "llava"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
flickr:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "flickr"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
coco_instruct:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "coco_instruct"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
vg:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "vg"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
================================================
FILE: configs/openseed/openseed_swint_lang_joint_2st_visual_prompt.yaml
================================================
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
##################
# Task settings
##################
WEIGHT: ''
PORT: 53711
detach_seg: False
VERBOSE: true
#OUTPUT_DIR: '../../data/output/test'
inference_only: true
OUTPUT_DIR: '../../data/output/test'
clip: true
# misc
LOADER:
JOINT: True
KEY_DATASET: 'flickr'
# model
MODEL:
NAME: openseed_model
HEAD: openseed_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 4096
BACKBONE_DIM: 768
BACKGROUND: False
WEIGHTS: ''
TEXT:
ARCH: encoder
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 18 # 18
WIDTH: 512
HEADS: 8
LAYERS: 12
AUTOGRESSIVE: True
BACKBONE:
NAME: swin
PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'
LOAD_PRETRAINED: true
SWIN:
PRETRAIN_IMG_SIZE: 224
PATCH_SIZE: 4
EMBED_DIM: 96
DEPTHS: [ 2, 2, 6, 2 ]
NUM_HEADS: [ 3, 6, 12, 24 ]
WINDOW_SIZE: 7
MLP_RATIO: 4.0
QKV_BIAS: true
QK_SCALE: ~
DROP_RATE: 0.0
ATTN_DROP_RATE: 0.0
DROP_PATH_RATE: 0.3
APE: false
PATCH_NORM: true
USE_CHECKPOINT: false
OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
ENCODER:
NAME: encoder_deform
IGNORE_VALUE: 255
NUM_CLASSES: 133
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
TOTAL_NUM_FEATURE_LEVELS: 4
NUM_FEATURE_LEVELS: 3
FEATURE_ORDER: "low2high"
DECODER:
NAME: openseed_decoder
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK: True
BOX: True
COCO_ONLY: True
GROUNDING:
ENABLED: False
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.0
SIM_THRES: 0.95
CAPTIONING:
ENABLED: False
STEP: 50
RETRIEVAL:
ENABLED: False
DIM_IMG: 768
ENSEMBLE: True
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 4.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
LLM_WEIGHT: 1.0
WEIGHT_MULTIPLIER: 1.0
COST_CLASS_WEIGHT: 4.0
COST_DICE_WEIGHT: 5.0
COST_MASK_WEIGHT: 5.0
COST_BOX_WEIGHT: 5.0
COST_GIOU_WEIGHT: 2.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 300
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TWO_STAGE: True
INITIALIZE_BOX_TYPE: 'no'
DN: seg
DN_NOISE_SCALE: 0.4
DN_NUM: 100
INITIAL_PRED: True
LEARN_TGT: False
TOTAL_NUM_FEATURE_LEVELS: 4
SEMANTIC_CE_LOSS: False
PANO_BOX_LOSS: False
COCO: True
O365: False
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.25
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
TEST_FOUCUS_ON_BOX: False
PANO_TRANSFORM_EVAL: True
PANO_TEMPERATURE: 0.06
TEST:
EVAL_PERIOD: 500000
PRECISE_BN:
NUM_ITER: 1
ENABLED: False
AUG:
ENABLED: False
SAM:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.99
MAX_SCALE: 1.01
DATASET_MAPPER_NAME: "sam"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'sam'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
COCO:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "coco_ref_panoptic_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
VLP:
INPUT:
IMAGE_SIZE: 224
DATASET_MAPPER_NAME: "vlpretrain"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TRAIN:
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
TEST:
BATCH_SIZE_TOTAL: 256
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ["coco_interactive_refcoco","coco_interactive","flickr_train"]
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR_END: 0.0
MOMENTUM: 0.9
NESTEROV: False
CHECKPOINT_PERIOD: 5000
IMS_PER_BATCH: 1
REFERENCE_WORLD_SIZE: 0
BIAS_LR_FACTOR: 1.0
  WEIGHT_DECAY_BIAS: None  # NOTE(review): bare "None" parses as the string "None" in YAML, not null — confirm the config loader coerces it (YAML null is ~/null)
# original
BASE_LR: 0.0001
STEPS: [327778, 355092]
MAX_ITER: 368750
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
# Evaluation Dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [640, 640]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 16
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SUN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SCAN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
BDD:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
CITY:
INPUT:
MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 1024
MAX_SIZE_TRAIN: 4096
MAX_SIZE_TEST: 2048
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [ 512, 1024 ]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: -1
FORMAT: "RGB"
DATASET_MAPPER_NAME: "mask_former_panoptic"
MASK_FORMAT: "polygon"
TEST:
EVAL_PERIOD: 5000
BATCH_SIZE_TOTAL: 1
AUG:
ENABLED: False
MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
MAX_SIZE: 4096
FLIP: True
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: True
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
PSACAL_PART:  # NOTE(review): likely a typo of "PASCAL_PART" — key kept verbatim because loader code may reference this exact spelling; verify before renaming
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "pascal_part_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
MODEL:
MASK_ON: True
KEYPOINT_ON: False
LOAD_PROPOSALS: False
# DATASET:
# DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
llava:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "llava"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
flickr:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "flickr"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
coco_instruct:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "coco_instruct"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
coco_interactive:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "coco_interactive"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
vg:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "vg"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
================================================
FILE: configs/semsam/visual_prompt_encoder.yaml
================================================
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
##################
# Task settings
##################
WEIGHT: ''
PORT: 53711
VERBOSE: true
#OUTPUT_DIR: '../../data/output/test'
inference_only: true
OUTPUT_DIR: '../../data/output/test'
# misc
LOADER:
JOINT: True
KEY_DATASET: 'coco'
# model
MODEL:
NAME: idino_model_partwhole_all_llm_ref_feats_all_det_pretrainv1
HEAD: openseed_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 512
BACKBONE_DIM: 768
BACKGROUND: False
WEIGHTS: None
LLAMA:
model_name_or_path: '/comp_robot/liushilong/data/LLAVA/LLAVA_7b'
cache_dir: None
model_max_length: 2048
hidden_size: 4096
tune_mm_mlp_adapter: True
im_width: 16
load_fp16: False
lora_r: 0
lora_alpha: 16
lora_dropout: 0.05
TEXT:
ARCH: llama_encoder
BACKBONE:
NAME: swin
PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'
LOAD_PRETRAINED: true
SWIN:
PRETRAIN_IMG_SIZE: 224
PATCH_SIZE: 4
EMBED_DIM: 96
DEPTHS: [ 2, 2, 6, 2 ]
NUM_HEADS: [ 3, 6, 12, 24 ]
WINDOW_SIZE: 7
MLP_RATIO: 4.0
QKV_BIAS: true
QK_SCALE: ~
DROP_RATE: 0.0
ATTN_DROP_RATE: 0.0
DROP_PATH_RATE: 0.3
APE: false
PATCH_NORM: true
USE_CHECKPOINT: false
OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
ENCODER:
NAME: encoder_deform
IGNORE_VALUE: 255
NUM_CLASSES: 1
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
TOTAL_NUM_FEATURE_LEVELS: 4
NUM_FEATURE_LEVELS: 3
FEATURE_ORDER: "low2high"
DECODER:
NAME: idino_decoder_no_iou_token_partwhole_all_llm
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK: True
BOX: True
PART: True
pretrain: True
match_loss: True
GROUNDING:
ENABLED: True
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
CAPTION:
ENABLED: True
PHRASE_PROB: 0.0
SIM_THRES: 0.95
CAPTIONING:
ENABLED: True
STEP: 50
RETRIEVAL:
ENABLED: True
DIM_IMG: 768
ENSEMBLE: True
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 4.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
IOU_WEIGHT: 1.0
LLAMA_WEIGHT: 5.0
llama_det_weight: 2.0
llama_ref_weight: 1.0
llama_region_cap_weight: 1.0
llama_img_cap_weight: 1.0
llama_gd_weight: 20.0
llama_gd_text_weight: 2.0
REFER_WEIGHT: 5.0
COST_CLASS_WEIGHT: 4.0
COST_DICE_WEIGHT: 5.0
COST_MASK_WEIGHT: 5.0
COST_BOX_WEIGHT: 5.0
COST_GIOU_WEIGHT: 2.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 0
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TWO_STAGE: False
INITIALIZE_BOX_TYPE: 'no'
DN: seg
DN_NOISE_SCALE: 0.4
DN_NUM: 100
INITIAL_PRED: False
LEARN_TGT: False
TOTAL_NUM_FEATURE_LEVELS: 4
SEMANTIC_CE_LOSS: False
PANO_BOX_LOSS: False
COCO: True
O365: False
SAM: True
PASCAL: True
RE_POINT: True
NUM_INTERACTIVE_TOKENS: 3
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.25
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
TEST_FOUCUS_ON_BOX: False
PANO_TRANSFORM_EVAL: True
PANO_TEMPERATURE: 0.06
TEST:
EVAL_PERIOD: 500000
PRECISE_BN:
NUM_ITER: 1
ENABLED: False
AUG:
ENABLED: False
SAM:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.99
MAX_SCALE: 1.01
DATASET_MAPPER_NAME: "sam"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'sam'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
COCO:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
VLP:
INPUT:
IMAGE_SIZE: 224
DATASET_MAPPER_NAME: "vlpretrain"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TRAIN:
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
TEST:
BATCH_SIZE_TOTAL: 256
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ["coco_2017_train_panoptic_filtrefgumdval_with_sem_seg_caption_grounding","mapillary_vistas_panoptic_train","ade20k_panoptic_train","sam_train","pascal_part_train","paco_train","partimagenet_train"]#,"sam_train","pascal_part_train"]#,"paco_train","partimagenet_train"]
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
  SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR_END: 0.0
MOMENTUM: 0.9
NESTEROV: False
CHECKPOINT_PERIOD: 5000
IMS_PER_BATCH: 1
REFERENCE_WORLD_SIZE: 0
BIAS_LR_FACTOR: 1.0
WEIGHT_DECAY_BIAS: None
# original
BASE_LR: 0.0001
STEPS: [327778, 355092]
MAX_ITER: 368750
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
# Evaluation Dataset
ADE20K:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'ade'
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 8
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 8
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SUN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SCAN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
BDD:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
CITY:
INPUT:
MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 1024
MAX_SIZE_TRAIN: 4096
MAX_SIZE_TEST: 2048
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [ 512, 1024 ]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: -1
FORMAT: "RGB"
DATASET_MAPPER_NAME: "mask_former_panoptic"
MASK_FORMAT: "polygon"
TEST:
EVAL_PERIOD: 5000
BATCH_SIZE_TOTAL: 1
AUG:
ENABLED: False
MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
MAX_SIZE: 4096
FLIP: True
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: True
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
PSACAL_PART:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "pascal_part_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
MODEL:
MASK_ON: True
KEYPOINT_ON: False
LOAD_PROPOSALS: False
# DATASET:
# DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
llava:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "llava"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
flickr:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "flickr"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
part:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "part"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
vg:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 1.0
MAX_SCALE: 1.0
DATASET_MAPPER_NAME: "vg"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
================================================
FILE: datasets_os/__init__.py
================================================
from . import registration
from .build import *
================================================
FILE: datasets_os/build.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
import os
import itertools
import logging
import copy
from typing import Any, Callable, Dict, List, Optional, Union
import torch
import torch.utils.data
import torch.utils.data as torchdata
import detectron2.utils.comm as comm
from detectron2.data.build import (
build_batch_data_loader,
load_proposals_into_dataset,
trivial_batch_collator,
)
from detectron2.data import MetadataCatalog
from detectron2.data.catalog import DatasetCatalog
from detectron2.data.common import DatasetFromList, MapDataset
from detectron2.data.dataset_mapper import DatasetMapper
from detectron2.data.samplers import InferenceSampler, TrainingSampler
from fvcore.common.config import CfgNode
from omegaconf import DictConfig, OmegaConf
from .dataset_mappers import (
COCOPanopticInteractiveDatasetMapper,
FlickrNewBaselineDatasetMapper,
VGNewBaselineDatasetMapper,
COCOInstructGroundingDatasetMapper,
COCOInterGroundingDatasetMapper,
)
from .custom_dataset_dataloader import build_custom_test_loader
from llava.model.openseed.utils import configurable
from detectron2.utils.comm import get_world_size, is_main_process
from typing import Any, Dict, List, Set
class JointLoader(torchdata.IterableDataset):
    """Iterate several dataloaders in lockstep, yielding one batch per dataset.

    Each loader in ``loaders`` is attached as an attribute named after the part
    of its key before the first underscore (so 'flickr_train' -> 'flickr').
    ``key_dataset`` names the loader whose length defines ``len(self)``.
    """

    def __init__(self, loaders, key_dataset):
        names = []
        for key, loader in loaders.items():
            # Attribute name is the key's prefix before the first '_'.
            short = key.split('_')[0]
            setattr(self, short, loader)
            names.append(short)
        self.dataset_names = names
        self.key_dataset = key_dataset

    def __iter__(self):
        # zip stops at the shortest loader; each step yields one batch per dataset.
        streams = [getattr(self, name) for name in self.dataset_names]
        for group in zip(*streams):
            yield dict(zip(self.dataset_names, group))

    def __len__(self):
        return len(getattr(self, self.key_dataset))
def filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names):
    """
    Filter out images with no annotations or only crowd annotations
    (i.e., images without non-crowd annotations).
    A common training-time preprocessing on COCO dataset.

    Args:
        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
        dataset_names: unused; kept for interface compatibility with callers.
    Returns:
        list[dict]: the same format, but filtered.
    """
    num_before = len(dataset_dicts)

    def has_non_crowd(anns):
        # An annotation entry may itself be a list of instances (grouped
        # annotations); treat a single dict as a one-element group.
        for ann in anns:
            group = ann if isinstance(ann, list) else [ann]
            if any(inst.get("iscrowd", 0) == 0 for inst in group):
                return True
        return False

    kept = [record for record in dataset_dicts if has_non_crowd(record["annotations"])]
    logging.getLogger(__name__).info(
        "Removed {} images with no usable annotations. {} images left.".format(
            num_before - len(kept), len(kept)
        )
    )
    return kept
def get_detection_dataset_dicts(
    dataset_names, filter_empty=True, proposal_files=None
):
    """
    Load and prepare dataset dicts for instance detection/segmentation and
    semantic segmentation.

    Args:
        dataset_names (str or list[str]): a dataset name or a list of dataset names
        filter_empty (bool): whether to filter out images without instance annotations
        proposal_files (list[str]): if given, a list of object proposal files
            that match each dataset in `dataset_names`.
    Returns:
        list[dict]: a list of dicts following the standard dataset dict format.
    """
    if isinstance(dataset_names, str):
        dataset_names = [dataset_names]
    assert len(dataset_names)

    per_dataset = [DatasetCatalog.get(name) for name in dataset_names]
    for name, dicts in zip(dataset_names, per_dataset):
        assert len(dicts), "Dataset '{}' is empty!".format(name)

    if proposal_files is not None:
        # Attach precomputed proposals; files must align 1:1 with datasets.
        assert len(dataset_names) == len(proposal_files)
        per_dataset = [
            load_proposals_into_dataset(dicts, proposal_file)
            for dicts, proposal_file in zip(per_dataset, proposal_files)
        ]

    merged = list(itertools.chain.from_iterable(per_dataset))

    # Only instance-style datasets (those carrying "annotations") are filtered.
    if filter_empty and "annotations" in merged[0]:
        merged = filter_images_with_only_crowd_annotations(merged, dataset_names)

    assert len(merged), "No valid data found in {}.".format(",".join(dataset_names))
    return merged
def _test_loader_from_config(cfg, dataset_name, mapper=None):
    """
    Uses the given `dataset_name` argument (instead of the names in cfg), because the
    standard practice is to evaluate each test set individually (not combining them).
    """
    names = [dataset_name] if isinstance(dataset_name, str) else dataset_name
    dataset = get_detection_dataset_dicts(
        names,
        filter_empty=False,
        proposal_files=None,
    )
    if mapper is None:
        # OmegaConf configs are converted to plain containers so a CfgNode
        # view of the relevant sections can be handed to DatasetMapper.
        if isinstance(cfg, DictConfig):
            cfg = OmegaConf.to_container(copy.deepcopy(cfg))
        mapper_cfg = CfgNode({'INPUT': cfg['INPUT'], 'MODEL': cfg['MODEL'], 'DATASETS': cfg['DATASETS']})
        mapper = DatasetMapper(mapper_cfg, False)

    world = get_world_size()
    assert cfg['TEST']['BATCH_SIZE_TOTAL'] % world == 0, "Evaluation total batchsize is not divisible by gpu number"
    return {
        "dataset": dataset,
        "mapper": mapper,
        "num_workers": cfg['DATALOADER']['NUM_WORKERS'],
        "sampler": InferenceSampler(len(dataset)),
        "batch_size": cfg['TEST']['BATCH_SIZE_TOTAL'] // world,
    }
@configurable(from_config=_test_loader_from_config)
def build_detection_test_loader(
    dataset: Union[List[Any], torchdata.Dataset],
    *,
    mapper: Callable[[Dict[str, Any]], Any],
    sampler: Optional[torchdata.Sampler] = None,
    batch_size: int = 1,
    num_workers: int = 0,
    collate_fn: Optional[Callable[[List[Any]], Any]] = None,
) -> torchdata.DataLoader:
    """Create a test-time DataLoader, analogous to `build_detection_train_loader`.

    Defaults to batch size 1 and an :class:`InferenceSampler`, which coordinates
    all workers so that together they produce the exact set of all samples.

    Args:
        dataset: a list of dataset dicts, or a pytorch dataset (map-style or
            iterable), e.g. from :func:`DatasetCatalog.get` or
            :func:`get_detection_dataset_dicts`.
        mapper: callable mapping one sample dict to the model's input format.
            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
        sampler: produces indices over ``dataset``; must be None when the
            dataset is iterable. Defaults to :class:`InferenceSampler`.
        batch_size: per-worker batch size; 1 is the convention when reporting
            inference time in papers.
        num_workers: number of parallel data loading workers.
        collate_fn: same as in `torch.utils.data.DataLoader`; default performs
            no collation and returns the list of mapped elements.

    Returns:
        DataLoader: loads the given detection dataset with test-time
        transformation and batching.

    Examples:
    ::
        data_loader = build_detection_test_loader(
            DatasetRegistry.get("my_test"),
            mapper=DatasetMapper(...))

        # or, instantiate with a CfgNode:
        data_loader = build_detection_test_loader(cfg, "my_test")
    """
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)
    if isinstance(dataset, torchdata.IterableDataset):
        assert sampler is None, "sampler must be None if dataset is IterableDataset"
    elif sampler is None:
        sampler = InferenceSampler(len(dataset))
    return torchdata.DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        drop_last=False,
        num_workers=num_workers,
        collate_fn=collate_fn if collate_fn is not None else trivial_batch_collator,
    )
def _train_loader_from_config(cfg, dataset_name, mapper, *, dataset=None, sampler=None):
    """Assemble keyword arguments for `build_detection_train_loader` from a
    dict-style config (plain dict / OmegaConf container, accessed by key).

    Args:
        cfg: config exposing 'DATASETS', 'DATALOADER' and 'TRAIN' sections.
        dataset_name (str or list[str]): registered dataset name(s) to load.
        mapper: per-sample mapper; None falls back to ``DatasetMapper(cfg, True)``.
        dataset: pre-loaded dataset dicts; None loads them from the catalog.
        sampler: pre-built index sampler; None creates a ``TrainingSampler``.
    """
    cfg_datasets = cfg['DATASETS']
    cfg_dataloader = cfg['DATALOADER']
    if dataset is None:
        dataset = get_detection_dataset_dicts(
            dataset_name,
            filter_empty=cfg_dataloader['FILTER_EMPTY_ANNOTATIONS'],
            proposal_files=cfg_datasets['PROPOSAL_FILES_TRAIN'] if cfg_dataloader['LOAD_PROPOSALS'] else None,
        )
    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    if sampler is None:
        sampler_name = cfg_dataloader['SAMPLER_TRAIN']
        logger = logging.getLogger(__name__)
        logger.info("Using training sampler {}".format(sampler_name))
        # NOTE(review): `sampler_name` is only logged; a TrainingSampler is
        # always constructed regardless of SAMPLER_TRAIN — confirm intended.
        sampler = TrainingSampler(len(dataset))
    return {
        "dataset": dataset,
        "sampler": sampler,
        "mapper": mapper,
        "total_batch_size": cfg['TRAIN']['BATCH_SIZE_TOTAL'],
        "aspect_ratio_grouping": cfg_dataloader['ASPECT_RATIO_GROUPING'],
        "num_workers": cfg_dataloader['NUM_WORKERS'],
    }
@configurable(from_config=_train_loader_from_config)
def build_detection_train_loader(
    dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0
):
    """Build a batched training dataloader for object detection with some
    default features. This interface is experimental.

    Args:
        dataset (list or torch.utils.data.Dataset): dataset dicts or a
            map-style pytorch dataset (e.g. from :func:`DatasetCatalog.get` or
            :func:`get_detection_dataset_dicts`).
        mapper (callable): maps one sample dict to the model's input format.
            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
        sampler (torch.utils.data.sampler.Sampler or None): produces indices
            over ``dataset``; defaults to :class:`TrainingSampler`, which
            coordinates a random shuffle sequence across all workers.
        total_batch_size (int): total batch size across all workers; batching
            simply puts data into a list.
        aspect_ratio_grouping (bool): group images with similar aspect ratio
            for efficiency; requires "width" and "height" keys in each element.
        num_workers (int): number of parallel data loading workers.

    Returns:
        torch.utils.data.DataLoader: each output is a ``list[mapped_element]``
        of length ``total_batch_size / num_workers``, where ``mapped_element``
        is produced by the ``mapper``.
    """
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)
    sampler = TrainingSampler(len(dataset)) if sampler is None else sampler
    assert isinstance(sampler, torch.utils.data.sampler.Sampler)
    return build_batch_data_loader(
        dataset,
        sampler,
        total_batch_size,
        aspect_ratio_grouping=aspect_ratio_grouping,
        num_workers=num_workers,
    )
def get_config_from_name(cfg, dataset_name):
    """Overlay the dataset-specific sub-config onto the top level of ``cfg``.

    Each dataset family has its own section in the YAML config (e.g. ``SAM``,
    ``COCO``, ``flickr``, ``vg``); this copies that section's keys (``INPUT``,
    ``DATALOADER``, ``TRAIN``, ...) over the top-level ones via ``cfg.update``
    so downstream loader builders read the right settings.

    NOTE(review): matching is by substring, so branch ORDER is significant —
    e.g. 'coco_instruct' must be checked before the generic 'coco', and 'vg'
    before 'part'. Keep any new branches ordered accordingly.

    Args:
        cfg (dict-like): the full config; mutated in place and also returned.
        dataset_name (str): registered dataset name to match.
    Returns:
        the same ``cfg`` object with the matching section merged in.
    Raises:
        AssertionError: if no branch matches ``dataset_name``.
    """
    # adjust config according to dataset
    if 'sam' in dataset_name:
        cfg.update(cfg['SAM'])
        return cfg
    elif 'flickr' in dataset_name:
        cfg.update(cfg['flickr'])
        return cfg
    elif 'coco_instruct' in dataset_name:
        cfg.update(cfg['coco_instruct'])
        return cfg
    elif 'coco_interactive' in dataset_name:
        cfg.update(cfg['coco_interactive'])
        return cfg
    elif 'lisa' in dataset_name:
        cfg.update(cfg['LISA_REF'])
        return cfg
    elif 'llava' in dataset_name:
        cfg.update(cfg['llava'])
        return cfg
    elif 'vg' in dataset_name:
        cfg.update(cfg['vg'])
        return cfg
    # Generic 'part' datasets, excluding the pascal_part/partimagenet families
    # which are handled by the PSACAL_PART section below.
    elif 'part' in dataset_name and 'pascal_part' not in dataset_name and 'partimagenet' not in dataset_name:
        cfg.update(cfg['part'])
        return cfg
    # NOTE: 'PSACAL_PART' is a typo preserved from the YAML config section name.
    elif 'pascal' in dataset_name or 'paco' in dataset_name or 'partimagenet' in dataset_name :
        cfg.update(cfg['PSACAL_PART'])
        return cfg
    elif 'coco' in dataset_name and 'refonly' in dataset_name:
        # if 'COCO' in cfg.keys():
        cfg.update(cfg['COCO_REF'])
        return cfg
    elif 'coco' in dataset_name:
        # Guarded: leaves cfg untouched when the section is absent.
        if 'COCO' in cfg.keys():
            cfg.update(cfg['COCO'])
        return cfg
    elif "mapillary" in dataset_name:
        if 'MAPILLARY' in cfg.keys():
            cfg.update(cfg['MAPILLARY'])
        return cfg
    elif 'ade' in dataset_name:
        if 'ADE20K' in cfg.keys():
            cfg.update(cfg['ADE20K'])
        return cfg
    elif 'imagenet' in dataset_name:
        if 'IMAGENET' in cfg.keys():
            cfg.update(cfg['IMAGENET'])
        return cfg
    elif 'vlp' in dataset_name:
        cfg.update(cfg['VLP'])
        return cfg
    elif 'sun' in dataset_name:
        cfg.update(cfg['SUN'])
        return cfg
    elif 'object365' in dataset_name:
        cfg.update(cfg['OBJECT365'])
        return cfg
    elif 'scan' in dataset_name:
        cfg.update(cfg['SCAN'])
        return cfg
    elif 'cityscape' in dataset_name:
        cfg.update(cfg['CITY'])
        return cfg
    elif 'bdd' in dataset_name:
        cfg.update(cfg['BDD'])
        return cfg
    else:
        assert False, "dataset not support."
def build_train_dataloader(cfg,tokenizer=None,data_args=None,preprocess=None,llava_cap_loader=None ):
    """Build the (possibly joint) training dataloader for every dataset in
    cfg['DATASETS']['TRAIN'].

    For each dataset name, the matching sub-config is overlaid onto ``cfg`` by
    `get_config_from_name`, a dataset-specific mapper is chosen from
    cfg['INPUT']['DATASET_MAPPER_NAME'], and a detection train loader is built.

    Args:
        cfg: full config dict; deep-copied so per-dataset overlays don't leak
            back to the caller.
        tokenizer, data_args, preprocess: LLaVA text-side objects forwarded to
            the grounding dataset mappers.
        llava_cap_loader: optional pre-built loader, added under key 'llava_cap'.

    Returns:
        A single loader when only one dataset is configured and LOADER.JOINT is
        off; otherwise a `JointLoader` iterating all loaders in lockstep, with
        length driven by LOADER.KEY_DATASET (default 'coco').
    """
    dataset_names = cfg['DATASETS']['TRAIN']
    loaders = {}
    cfg = copy.deepcopy(cfg)
    for dataset_name in dataset_names:
        # Overlay this dataset's sub-config (INPUT/DATALOADER/TRAIN...) onto cfg.
        cfg = get_config_from_name(cfg, dataset_name)
        mapper_name = cfg['INPUT']['DATASET_MAPPER_NAME']
        if mapper_name =="flickr":
            mapper=FlickrNewBaselineDatasetMapper(cfg,True,tokenizer=tokenizer,data_args=data_args,preprocess=preprocess)
            loaders['flickr'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        elif mapper_name =="coco_instruct":
            mapper=COCOInstructGroundingDatasetMapper(cfg,True,tokenizer=tokenizer,data_args=data_args,preprocess=preprocess)
            loaders['coco_instruct'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        elif mapper_name =="coco_interactive":
            # refcoco-based interactive data gets its own loader key so it can
            # be consumed separately from plain interactive data.
            if "refcoco" in dataset_name:
                refcoco=True
            else:
                refcoco=False
            mapper=COCOInterGroundingDatasetMapper(cfg,True,tokenizer=tokenizer,data_args=data_args,preprocess=preprocess,refcoco=refcoco)
            if refcoco:
                loaders['interactiveref'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
            else:
                loaders['interactive'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        elif mapper_name =="vg":
            mapper=VGNewBaselineDatasetMapper(cfg,True,tokenizer=tokenizer,data_args=data_args,preprocess=preprocess)
            loaders['vg'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        elif mapper_name == "coco_ref_panoptic_lsj":
            mapper = COCOPanopticInteractiveDatasetMapper(cfg, cfg.get('Train',True),tokenizer=tokenizer,data_args=data_args,preprocess=preprocess)
            loaders['refcoco'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        else:
            # Unknown mapper name: fall back to detectron2's default DatasetMapper.
            mapper = None
            loaders[dataset_name] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
    if llava_cap_loader is not None:
        loaders['llava_cap'] = llava_cap_loader
    if len(loaders) == 1 and not cfg['LOADER'].get('JOINT', False):
        for k, v in loaders.items():
            print("number of iterations per epoch: ", v, len(loaders[k]))
        return list(loaders.values())[0]
    else:
        return JointLoader(loaders, key_dataset=cfg['LOADER'].get('KEY_DATASET', 'coco'))
================================================
FILE: datasets_os/custom_dataset_dataloader.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Part of the code is from https://github.com/xingyizhou/UniDet/blob/master/projects/UniDet/unidet/data/multi_dataset_dataloader.py (Apache-2.0 License)
import copy
import logging
import numpy as np
import operator
from typing import Any, Callable, Dict, List, Optional, Union
import torch
import torch.utils.data as torchdata
import json
from detectron2.utils.comm import get_world_size
from detectron2.utils.logger import _log_api_usage, log_first_n
from detectron2.config import configurable
from detectron2.data import samplers
from torch.utils.data.sampler import BatchSampler, Sampler
from detectron2.data.common import DatasetFromList, MapDataset
from detectron2.data.dataset_mapper import DatasetMapper
from detectron2.data.build import get_detection_dataset_dicts, build_batch_data_loader
from detectron2.data.samplers import TrainingSampler, RepeatFactorTrainingSampler, InferenceSampler
from detectron2.data.build import worker_init_reset_seed, print_instances_class_histogram
from detectron2.data.build import filter_images_with_only_crowd_annotations
from detectron2.data.build import filter_images_with_few_keypoints
from detectron2.data.build import check_metadata_consistency
from detectron2.data.catalog import MetadataCatalog, DatasetCatalog
from detectron2.utils import comm
import itertools
import math
from collections import defaultdict
from typing import Optional
logger = logging.getLogger('detectron2.vlpart.data.custom_dataset_dataloader')
def _custom_test_loader_from_config(cfg, dataset_name, mapper=None):
    """Build kwargs for `build_custom_test_loader` from a detectron2 CfgNode."""
    names = [dataset_name] if isinstance(dataset_name, str) else dataset_name
    # Proposal files are looked up by the dataset's position in cfg.DATASETS.TEST.
    if cfg.MODEL.LOAD_PROPOSALS_TEST:
        proposal_files = [
            cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)]
            for x in names
        ]
    else:
        proposal_files = None
    dataset = get_detection_dataset_dicts(
        names,
        filter_empty=False,
        proposal_files=proposal_files,
    )
    if mapper is None:
        mapper = DatasetMapper(cfg, False)
    if isinstance(dataset, torchdata.IterableDataset):
        sampler = None
    else:
        sampler = InferenceSampler(len(dataset))
    return {
        "dataset": dataset,
        "mapper": mapper,
        "num_workers": cfg.DATALOADER.NUM_WORKERS,
        "sampler": sampler,
    }
@configurable(from_config=_custom_test_loader_from_config)
def build_custom_test_loader(
    dataset: Union[List[Any], torchdata.Dataset],
    *,
    mapper: Callable[[Dict[str, Any]], Any],
    sampler: Optional[torchdata.Sampler] = None,
    batch_size: int = 1,
    num_workers: int = 0,
    collate_fn: Optional[Callable[[List[Any]], Any]] = None,
) -> torchdata.DataLoader:
    """Test-time DataLoader builder mirroring detectron2's test loader.

    A list input is wrapped in :class:`DatasetFromList`; when ``mapper`` is
    given, samples are lazily mapped through it. Iterable datasets must not
    receive a sampler; map-style datasets default to :class:`InferenceSampler`.
    """
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)
    if isinstance(dataset, torchdata.IterableDataset):
        assert sampler is None, "sampler must be None if dataset is IterableDataset"
    elif sampler is None:
        sampler = InferenceSampler(len(dataset))
    return torchdata.DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        drop_last=False,
        num_workers=num_workers,
        collate_fn=collate_fn if collate_fn is not None else trivial_batch_collator,
    )
def trivial_batch_collator(batch):
    """Identity collate function: hand the list of mapped dicts to the model as-is."""
    return batch
def _custom_train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
    """Assemble kwargs for `build_custom_train_loader` from a detectron2 CfgNode.

    Selects the dataset-dict loading path and the sampler based on
    ``cfg.DATALOADER.SAMPLER_TRAIN``.

    Args:
        cfg: detectron2 CfgNode with DATASETS / DATALOADER / MODEL / SOLVER.
        mapper: per-sample mapper; None falls back to ``DatasetMapper(cfg, True)``.
        dataset: optional pre-loaded dataset (unused for loading here; the
            dicts are always (re)loaded into ``dataset_dicts``).
        sampler: optional pre-built sampler; when given, it is used as-is.
    Returns:
        dict of keyword arguments for `build_custom_train_loader`.
    Raises:
        ValueError: if ``SAMPLER_TRAIN`` names an unknown sampler.
    """
    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    if 'MultiDataset' in sampler_name:
        dataset_dicts = get_detection_dataset_dicts_with_source(
            cfg.DATASETS.TRAIN,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
        )
    else:
        dataset_dicts = get_detection_dataset_dicts(
            cfg.DATASETS.TRAIN,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
        )
    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    if sampler is not None:
        pass
    elif sampler_name == "TrainingSampler":
        # BUG FIX: this previously called TrainingSampler(len(dataset)), but
        # `dataset` is a keyword argument that defaults to None (the loaded
        # dicts live in `dataset_dicts`), so this branch raised TypeError.
        sampler = TrainingSampler(len(dataset_dicts))
    elif sampler_name == "MultiDatasetSampler":
        sampler = MultiDatasetSampler(
            dataset_dicts,
            dataset_ratio = cfg.DATALOADER.DATASET_RATIO,
            use_rfs = cfg.DATALOADER.USE_RFS,
            dataset_ann = cfg.DATALOADER.DATASET_ANN,
            repeat_threshold = cfg.DATALOADER.REPEAT_THRESHOLD,
        )
    elif sampler_name == "RepeatFactorTrainingSampler":
        repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD
        )
        sampler = RepeatFactorTrainingSampler(repeat_factors)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))
    return {
        "dataset": dataset_dicts,
        "sampler": sampler,
        "mapper": mapper,
        "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
        "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        "num_workers": cfg.DATALOADER.NUM_WORKERS,
        'multi_dataset_grouping': cfg.DATALOADER.MULTI_DATASET_GROUPING,
        'use_diff_bs_size': cfg.DATALOADER.USE_DIFF_BS_SIZE,
        'dataset_bs': cfg.DATALOADER.DATASET_BS,
        'num_datasets': len(cfg.DATASETS.TRAIN)
    }
@configurable(from_config=_custom_train_loader_from_config)
def build_custom_train_loader(
    dataset, *, mapper, sampler,
    total_batch_size=16,
    aspect_ratio_grouping=True,
    num_workers=0,
    num_datasets=1,
    multi_dataset_grouping=False,
    use_diff_bs_size=False,
    dataset_bs=None
):
    """
    Modified from detectron2.data.build.build_custom_train_loader, but supports
    different samplers.

    Args:
        dataset: list of dataset dicts or a map-style pytorch dataset.
        mapper: per-sample mapper; applied lazily via MapDataset when given.
        sampler: index sampler; None falls back to :class:`TrainingSampler`.
        total_batch_size (int): total batch size across all workers.
        aspect_ratio_grouping (bool): group similar-aspect images (single-dataset path).
        num_workers (int): number of data loading workers.
        num_datasets (int): number of source datasets (multi-dataset path).
        multi_dataset_grouping (bool): use the per-dataset grouped batch loader.
        use_diff_bs_size (bool): use per-dataset batch sizes from ``dataset_bs``.
        dataset_bs (list or None): per-dataset batch sizes; None means empty.
    """
    # FIX: `dataset_bs=[]` was a mutable default argument; use None as sentinel.
    if dataset_bs is None:
        dataset_bs = []
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)
    if sampler is None:
        sampler = TrainingSampler(len(dataset))
    assert isinstance(sampler, torch.utils.data.sampler.Sampler)
    if multi_dataset_grouping:
        return build_multi_dataset_batch_data_loader(
            use_diff_bs_size,
            dataset_bs,
            dataset,
            sampler,
            total_batch_size,
            num_datasets=num_datasets,
            num_workers=num_workers,
        )
    else:
        return build_batch_data_loader(
            dataset,
            sampler,
            total_batch_size,
            aspect_ratio_grouping=aspect_ratio_grouping,
            num_workers=num_workers,
        )
def build_multi_dataset_batch_data_loader(
    use_diff_bs_size, dataset_bs,
    dataset, sampler, total_batch_size, num_datasets, num_workers=0
):
    """
    Build a training loader whose batches are grouped per dataset (and aspect
    ratio) by a wrapping iterable dataset; with ``use_diff_bs_size`` each
    dataset gets its own batch size from ``dataset_bs``.
    """
    world_size = get_world_size()
    assert (
        total_batch_size > 0 and total_batch_size % world_size == 0
    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size, world_size
    )
    per_gpu_batch_size = total_batch_size // world_size

    # The inner DataLoader yields individual mapped dicts (collate_fn unwraps
    # the singleton batch); actual batching happens in the grouping wrapper.
    loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        num_workers=num_workers,
        batch_sampler=None,
        collate_fn=operator.itemgetter(0),
        worker_init_fn=worker_init_reset_seed,
    )

    if use_diff_bs_size:
        return DIFFMDAspectRatioGroupedDataset(loader, dataset_bs, num_datasets)
    return MDAspectRatioGroupedDataset(loader, per_gpu_batch_size, num_datasets)
def get_detection_dataset_dicts_with_source(
    dataset_names, filter_empty=True, min_keypoints=0, proposal_files=None
):
    """
    Load and concatenate dicts from several registered datasets, tagging each
    record with the index of the dataset it came from ('dataset_source').
    """
    assert len(dataset_names)
    per_dataset_dicts = [DatasetCatalog.get(name) for name in dataset_names]
    for name, dicts in zip(dataset_names, per_dataset_dicts):
        assert len(dicts), "Dataset '{}' is empty!".format(name)

    for source_id, (name, dicts) in enumerate(zip(dataset_names, per_dataset_dicts)):
        assert len(dicts), "Dataset '{}' is empty!".format(name)
        # Remember which dataset each record belongs to.
        for record in dicts:
            record['dataset_source'] = source_id

        if "annotations" in dicts[0]:
            try:
                class_names = MetadataCatalog.get(name).thing_classes
                check_metadata_consistency("thing_classes", name)
                print_instances_class_histogram(dicts, class_names)
            except AttributeError:  # class names are not available for this dataset
                pass

    # Proposal files are not supported by this multi-source loader.
    assert proposal_files is None

    flattened = list(itertools.chain.from_iterable(per_dataset_dicts))
    has_instances = "annotations" in flattened[0]
    if filter_empty and has_instances:
        flattened = filter_images_with_only_crowd_annotations(flattened)
    if min_keypoints > 0 and has_instances:
        flattened = filter_images_with_few_keypoints(flattened, min_keypoints)
    return flattened
class MultiDatasetSampler(Sampler):
    """
    Infinite stream of record indices for multi-dataset training.

    Indices are drawn (with replacement) from a multinomial whose weights
    combine a per-dataset sampling ratio with optional repeat-factor sampling
    (RFS), and the stream is sharded across distributed ranks.
    """

    def __init__(
        self,
        dataset_dicts,
        dataset_ratio,
        use_rfs,
        dataset_ann,
        repeat_threshold=0.001,
        seed: Optional[int] = None,
    ):
        """
        Args:
            dataset_dicts: concatenated records, each tagged 'dataset_source'.
            dataset_ratio: relative sampling ratio for each dataset.
            use_rfs: per-dataset flags enabling repeat-factor sampling.
            dataset_ann: per-dataset annotation kind; 'box' uses
                category-frequency RFS, anything else tag-frequency RFS.
            repeat_threshold: frequency threshold passed to the RFS functions.
            seed: shared RNG seed; drawn collectively across ranks when None.
        """
        sizes = [0 for _ in range(len(dataset_ratio))]
        for d in dataset_dicts:
            sizes[d['dataset_source']] += 1
        logger.info('dataset sizes {}'.format(sizes))
        self.sizes = sizes
        assert len(dataset_ratio) == len(sizes), \
            'length of dataset ratio {} should be equal to number of datasets {}'.format(
                len(dataset_ratio), len(sizes)
            )
        if seed is None:
            seed = comm.shared_random_seed()
        self._seed = int(seed)
        self._rank = comm.get_rank()
        self._world_size = comm.get_world_size()

        self.dataset_ids = torch.tensor(
            [d['dataset_source'] for d in dataset_dicts], dtype=torch.long)

        # Base weight: upsample every dataset to the size of the largest one,
        # then scale by its (normalized) requested ratio.
        # (FIX: dropped an unused `enumerate` index from the original.)
        dataset_weight = [torch.ones(s) * max(sizes) / s * r / sum(dataset_ratio)
                          for r, s in zip(dataset_ratio, sizes)]
        dataset_weight = torch.cat(dataset_weight)

        # Optional per-record repeat factors; each dataset's factors are
        # renormalized to sum to its size so the inter-dataset ratio is kept.
        rfs_factors = []
        st = 0
        for i, s in enumerate(sizes):
            if use_rfs[i]:
                if dataset_ann[i] == 'box':
                    rfs_func = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency
                else:
                    rfs_func = repeat_factors_from_tag_frequency
                rfs_factor = rfs_func(
                    dataset_dicts[st: st + s],
                    repeat_thresh=repeat_threshold)
                rfs_factor = rfs_factor * (s / rfs_factor.sum())
            else:
                rfs_factor = torch.ones(s)
            rfs_factors.append(rfs_factor)
            st = st + s
        rfs_factors = torch.cat(rfs_factors)

        self.weights = dataset_weight * rfs_factors
        self.sample_epoch_size = len(self.weights)

    def __iter__(self):
        # Shard the infinite stream across distributed ranks.
        start = self._rank
        yield from itertools.islice(
            self._infinite_indices(), start, None, self._world_size)

    def _infinite_indices(self):
        g = torch.Generator()
        g.manual_seed(self._seed)
        while True:
            # One "epoch" of indices, drawn with replacement by weight.
            # (FIX: removed a dead per-epoch computation of per-dataset counts
            # (`nums`) that was never used.)
            ids = torch.multinomial(
                self.weights, self.sample_epoch_size, generator=g,
                replacement=True)
            yield from ids
class MDAspectRatioGroupedDataset(torch.utils.data.IterableDataset):
    """
    Re-batch an iterable of mapped dicts so every yielded batch comes from a
    single dataset and shares one aspect-ratio orientation (landscape vs.
    portrait). Two buckets are kept per dataset; a bucket is flushed as soon
    as it reaches ``batch_size``.
    """

    def __init__(self, dataset, batch_size, num_datasets):
        self.dataset = dataset
        self.batch_size = batch_size
        self._buckets = [[] for _ in range(2 * num_datasets)]

    def __iter__(self):
        for item in self.dataset:
            landscape = item["width"] > item["height"]
            slot = item['dataset_source'] * 2 + (0 if landscape else 1)
            group = self._buckets[slot]
            group.append(item)
            if len(group) == self.batch_size:
                # Yield a copy, then reuse the bucket in place.
                yield group[:]
                del group[:]
class DIFFMDAspectRatioGroupedDataset(torch.utils.data.IterableDataset):
    """
    Like ``MDAspectRatioGroupedDataset`` but with a distinct batch size per
    dataset: bucket ``d`` is flushed once it holds ``batch_sizes[d]`` items.
    """

    def __init__(self, dataset, batch_sizes, num_datasets):
        self.dataset = dataset
        self.batch_sizes = batch_sizes
        self._buckets = [[] for _ in range(2 * num_datasets)]

    def __iter__(self):
        for item in self.dataset:
            source = item['dataset_source']
            landscape = item["width"] > item["height"]
            group = self._buckets[source * 2 + (0 if landscape else 1)]
            group.append(item)
            if len(group) == self.batch_sizes[source]:
                # Yield a copy, then reuse the bucket in place.
                yield group[:]
                del group[:]
def repeat_factors_from_tag_frequency(dataset_dicts, repeat_thresh):
    """
    Compute a per-image repeat factor from the frequency of its tags
    ('pos_category_ids'), mirroring detectron2's category-frequency repeat
    factors: a tag rarer than ``repeat_thresh`` is upweighted by
    sqrt(thresh / freq), and an image takes the max factor over its tags.
    """
    tag_freq = defaultdict(int)
    for record in dataset_dicts:
        for tag in record['pos_category_ids']:
            tag_freq[tag] += 1
    num_images = len(dataset_dicts)
    for tag in tag_freq:
        tag_freq[tag] /= num_images

    tag_rep = {
        tag: max(1.0, math.sqrt(repeat_thresh / freq))
        for tag, freq in tag_freq.items()
    }

    factors = [
        max({tag_rep[tag] for tag in record['pos_category_ids']}, default=1.0)
        for record in dataset_dicts
    ]
    return torch.tensor(factors, dtype=torch.float32)
================================================
FILE: datasets_os/dataset_mappers/__init__.py
================================================
from .coco_panoptic_interactive_dataset_mapper import COCOPanopticInteractiveDatasetMapper
from .flickr_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper as FlickrNewBaselineDatasetMapper
from .coco_instruct_grounding_dataset_mapper import COCOInstanceNewBaselineDatasetMapper as COCOInstructGroundingDatasetMapper
from .coco_instruct_grounding_dataset_interactive_mapper import COCOInstanceNewBaselineDatasetMapper as COCOInterGroundingDatasetMapper
from .vg_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper as VGNewBaselineDatasetMapper
================================================
FILE: datasets_os/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging
import numpy as np
import torch
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Instances
from pycocotools import mask as coco_mask
from llava.model.openseed.utils import configurable
__all__ = ["COCOInstanceNewBaselineDatasetMapper"]
def convert_coco_poly_to_mask(segmentations, height, width):
    """
    Rasterize COCO polygon segmentations into a stacked (N, height, width)
    binary mask tensor; returns an empty uint8 tensor when there are no
    segmentations.
    """
    per_instance = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        decoded = coco_mask.decode(rles)
        if decoded.ndim < 3:
            decoded = decoded[..., None]
        # Merge all polygon components of one instance into a single mask.
        merged = torch.as_tensor(decoded, dtype=torch.uint8).any(dim=2)
        per_instance.append(merged)
    if not per_instance:
        return torch.zeros((0, height, width), dtype=torch.uint8)
    return torch.stack(per_instance, dim=0)
def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config: optional
    random flip, large-scale-jitter resize, then a fixed-size crop.

    Returns:
        list[Augmentation]
    """
    assert is_train, "Only support training augmentation"
    cfg_input = cfg['INPUT']
    image_size = cfg_input['IMAGE_SIZE']
    min_scale = cfg_input['MIN_SCALE']
    max_scale = cfg_input['MAX_SCALE']

    flip_mode = cfg_input['RANDOM_FLIP']
    augmentation = []
    if flip_mode != "none":
        augmentation.append(
            T.RandomFlip(
                horizontal=flip_mode == "horizontal",
                vertical=flip_mode == "vertical",
            )
        )
    augmentation += [
        T.ResizeScale(
            min_scale=min_scale, max_scale=max_scale,
            target_height=image_size, target_width=image_size,
        ),
        T.FixedSizeCrop(crop_size=(image_size, image_size)),
    ]
    return augmentation
# This is specifically designed for the COCO dataset.
class COCOInstanceNewBaselineDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer.

    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation transform generators
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
        )
        self.img_format = image_format
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True):
        # Build augmentation
        tfm_gens = build_transform_gen(cfg, is_train)
        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg['INPUT']['FORMAT'],
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)
        # TODO: get padding mask
        # by feeding a "segmentation mask" to the same transforms
        padding_mask = np.ones(image.shape[:2])
        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        # the crop transformation has default padding value 0 for segmentation
        padding_mask = transforms.apply_segmentation(padding_mask)
        # True where the image was padded (the transformed all-ones mask is 0 there).
        padding_mask = ~ padding_mask.astype(bool)
        image_shape = image.shape[:2]  # h, w
        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))
        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            dataset_dict.pop("annotations", None)
            return dataset_dict
        if "annotations" in dataset_dict:
            # USER: Modify this if you want to keep them for some reason.
            for anno in dataset_dict["annotations"]:
                # Let's always keep mask
                # if not self.mask_on:
                #     anno.pop("segmentation", None)
                anno.pop("keypoints", None)
            # USER: Implement additional transformations if you have other types of data
            # Crowd annotations are dropped here.
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in dataset_dict.pop("annotations")
                if obj.get("iscrowd", 0) == 0
            ]
            # NOTE: does not support BitMask due to augmentation
            # Current BitMask cannot handle empty objects
            instances = utils.annotations_to_instances(annos, image_shape)
            # After transforms such as cropping are applied, the bounding box may no longer
            # tightly bound the object. As an example, imagine a triangle object
            # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
            # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
            # the intersection of original bounding box and the cropping box.
            instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
            # Need to filter empty instances first (due to augmentation)
            instances = utils.filter_empty_instances(instances)
            # Generate masks from polygon
            h, w = instances.image_size
            # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
            if hasattr(instances, 'gt_masks'):
                gt_masks = instances.gt_masks
                # Convert PolygonMasks into a dense (N, H, W) tensor.
                gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
                instances.gt_masks = gt_masks
            dataset_dict["instances"] = instances
        return dataset_dict
================================================
FILE: datasets_os/dataset_mappers/coco_instruct_grounding_dataset_interactive_mapper.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging
import random
import numpy as np
import torch
import PIL.Image as Image
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Instances
from pycocotools import mask as coco_mask
from llava.model.openseed.utils import configurable
from detectron2.structures import Boxes, BoxMode, PolygonMasks, RotatedBoxes
from llava import conversation as conversation_lib
from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
# from llava.train.train_hao_seg_flickr import ,preprocess
__all__ = ["COCOInstanceNewBaselineDatasetMapper"]
def convert_coco_poly_to_mask(segmentations, height, width):
    """
    Rasterize COCO polygon segmentations into a stacked (N, height, width)
    binary mask tensor; returns an empty uint8 tensor when there are no
    segmentations.
    """
    per_instance = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        decoded = coco_mask.decode(rles)
        if decoded.ndim < 3:
            decoded = decoded[..., None]
        # Merge all polygon components of one instance into a single mask.
        merged = torch.as_tensor(decoded, dtype=torch.uint8).any(dim=2)
        per_instance.append(merged)
    if not per_instance:
        return torch.zeros((0, height, width), dtype=torch.uint8)
    return torch.stack(per_instance, dim=0)
def preprocess_multimodal(
    sources,
    data_args
):
    """
    Normalize image-token placement in each conversation turn: move
    DEFAULT_IMAGE_TOKEN to the front of the turn, optionally wrap it in
    <Image></Image> (for "mmtag" conversation versions) or in
    IM_START/IM_END tokens (when ``data_args.mm_use_im_start_end``).
    Mutates ``sources`` in place and returns it.
    """
    if not data_args.is_multimodal:
        return sources

    for source in sources:
        for sentence in source:
            if DEFAULT_IMAGE_TOKEN in sentence['value']:
                # Strip the token wherever it was, then re-anchor it up front.
                remainder = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip()
                sentence['value'] = (DEFAULT_IMAGE_TOKEN + '\n' + remainder).strip()
                if "mmtag" in conversation_lib.default_conversation.version:
                    sentence['value'] = sentence['value'].replace(
                        DEFAULT_IMAGE_TOKEN, '<Image>' + DEFAULT_IMAGE_TOKEN + '</Image>')
            replace_token = DEFAULT_IMAGE_TOKEN
            if data_args.mm_use_im_start_end:
                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
            sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token)
    return sources
def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config: a
    large-scale-jitter resize followed by a fixed-size crop.

    Random flip is commented out in the original source — presumably because
    flips would invalidate spatial language in the grounded conversations
    ("left"/"right"); confirm before re-enabling.

    Returns:
        list[Augmentation]
    """
    # NOTE(review): the original had byte-identical `if is_train` / `else`
    # branches (copy-paste duplication); they are collapsed here. `is_train`
    # stays in the signature for interface compatibility.
    cfg_input = cfg['INPUT']
    image_size = cfg_input['IMAGE_SIZE']
    min_scale = cfg_input['MIN_SCALE']
    max_scale = cfg_input['MAX_SCALE']
    augmentation = [
        T.ResizeScale(
            min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
        ),
        T.FixedSizeCrop(crop_size=(image_size, image_size)),
    ]
    return augmentation
# This is specifically designed for the COCO dataset.
class COCOInstanceNewBaselineDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer.

    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors

    On top of the plain pipeline above, this variant also:
    - runs the LLaVA (CLIP) image preprocessing and stores it as "image_clip",
    - samples one or more (conversation, grounding) pairs from
      dataset_dict['conversations'] and tokenizes them, and
    - aligns 'grounding_info' annotations with the grounded phrases of the
      tokenized conversation via 'grounding_index'.
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
        tokenizer,
        data_args,
        preprocess,
        refcoco=None,
        max_sampled=5,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation transform generators
            image_format: an image format supported by :func:`detection_utils.read_image`.
            tokenizer: LLM tokenizer used to encode the sampled conversation
            data_args: options namespace; must expose `image_processor`,
                `image_aspect_ratio`, `is_multimodal` and `mm_use_im_start_end`
            preprocess: callable that tokenizes conversation sources
            refcoco: when truthy, several conversations are sampled and
                concatenated per image (RefCOCO-style data)
            max_sampled: upper bound on conversations sampled in refcoco mode
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
        )
        self.img_format = image_format
        self.is_train = is_train
        self.tokenizer = tokenizer
        # CLIP-style image processor for the LLaVA vision tower.
        self.processor = data_args.image_processor
        self.data_args = data_args
        self.preprocess = preprocess
        self.refcoco=refcoco
        self.max_sampled=max_sampled

    @classmethod
    def from_config(cls, cfg, is_train=True,tokenizer=None,data_args=None,preprocess=None,refcoco=None):
        # Build augmentation
        tfm_gens = build_transform_gen(cfg, is_train)
        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg['INPUT']['FORMAT'],
            "tokenizer": tokenizer,
            "data_args": data_args,
            "preprocess": preprocess,
            "refcoco":refcoco,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)
        #########llava image processing
        if self.data_args.image_aspect_ratio == 'pad':
            # Pad to a square with the processor's mean color so no content is
            # cropped by the CLIP preprocessing.
            # NOTE(review): `image` comes from utils.read_image and expand2square
            # calls PIL-style `.size`/`.paste` on it — confirm the 'pad' branch
            # actually receives a PIL image in this pipeline.
            def expand2square(pil_img, background_color):
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img, (0, (width - height) // 2))
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img, ((height - width) // 2, 0))
                    return result
            image_clip = expand2square(image, tuple(int(x * 255) for x in self.processor.image_mean))
            image_clip = self.processor.preprocess(image_clip, return_tensors='pt')['pixel_values'][0]
        else:
            image_clip = self.processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
        dataset_dict["image_clip"] = image_clip
        ##################
        # TODO: get padding mask
        # by feeding a "segmentation mask" to the same transforms
        padding_mask = np.ones(image.shape[:2])
        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        # Keep the augmented (but un-normalized) HWC image around as well.
        dataset_dict["image_ori"]=image
        # the crop transformation has default padding value 0 for segmentation
        padding_mask = transforms.apply_segmentation(padding_mask)
        padding_mask = ~ padding_mask.astype(bool)
        image_shape = image.shape[:2]  # h, w
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))
        num_conversations = len(dataset_dict['conversations'])
        if self.refcoco:
            # RefCOCO-style: sample several (conversation, grounding) pairs and
            # concatenate them into one multi-turn conversation.
            max_sampled=min(self.max_sampled,num_conversations)
            sample_num=random.randint(1,max_sampled)
            sampled_convs=random.sample(dataset_dict['conversations'], k=sample_num)
            grounding_list=[]
            selected_conversation=[]
            # Prepend the image token to the first human turn only.
            sampled_convs[0][0][0]['value']='<image>\n'+sampled_convs[0][0][0]['value']
            for conv,gd in sampled_convs:
                grounding_list.extend(gd)
                # The answer turn holds several candidate phrasings; keep one.
                conv[1]['value']=random.choice(conv[1]['value'])
                selected_conversation.extend(conv)
        else:
            # Otherwise pick a single (conversation, grounding) pair at random.
            rd = np.random.choice(num_conversations)
            selected_conversation, grounding_list = dataset_dict['conversations'][rd]
        dataset_dict['conversation'] = [selected_conversation]
        sources = preprocess_multimodal(
            copy.deepcopy(dataset_dict['conversation']),
            self.data_args)
        data_dict_conversation = self.preprocess(
            sources,
            self.tokenizer,
            has_image=True)
        data_dict_conversation = dict(input_ids=data_dict_conversation["input_ids"][0],
                                      labels=data_dict_conversation["labels"][0])
        dataset_dict.update(data_dict_conversation)
        dataset_dict['tokenizer'] = self.tokenizer
        # num_segs = sum([conv['value'].count('<seg>') for conv in selected_conversation])
        # grounding_list=
        assert "grounding_info" in dataset_dict and len(dataset_dict['grounding_info'])>0
        # Map COCO annotation ids to their positional index in grounding_info.
        anno_id2id=dict()
        for id,obj in enumerate(dataset_dict['grounding_info']):
            obj["bbox_mode"] = BoxMode.XYWH_ABS
            anno_id2id[obj['id']]=id
        # id2class=[[] for _ in range(len(dataset_dict['grounding_info']))]
        annos = [
            utils.transform_instance_annotations(obj, transforms, image_shape)
            for obj in dataset_dict["grounding_info"]
        ]
        # assert "segmentation" in annos[0]
        instances = utils.annotations_to_instances(annos, image_shape,mask_format="bitmask")
        h, w = instances.image_size
        # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
        if hasattr(instances, 'gt_masks'):
            gt_masks = instances.gt_masks
            # gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
            instances.gt_masks = gt_masks.tensor
        # NOTE(review): 1273 is presumably the tokenizer id of the grounding
        # (<seg>-style) token being counted here — confirm against the tokenizer.
        num_objs=(data_dict_conversation['input_ids']==1273).sum()
        grounding_list=[gd for gd in grounding_list if gd is not None]
        # Each grounding entry is a list of annotation ids; flatten them.
        merged_grounding_list=sum(grounding_list,[])
        # assert num_objs==len(merged_grounding_list)
        # Force the flattened list to the number of grounding tokens:
        # truncate, or pad by repeating the last entry.
        if num_objs<len(merged_grounding_list):
            merged_grounding_list=merged_grounding_list[:num_objs]
        elif num_objs>len(merged_grounding_list):
            merged_grounding_list=merged_grounding_list+[merged_grounding_list[-1]]*(num_objs-len(merged_grounding_list))
        # Translate annotation ids into positional indices of `instances`.
        merged_grounding_list=[anno_id2id[annid] for annid in merged_grounding_list]
        dataset_dict['grounding_index']=merged_grounding_list
        dataset_dict["instances"] = instances
        # if grounding_list is None:
        #     dataset_dict['grounding']=False
        #     grounding_mask=[False for _ in range(num_segs)]
        #     dataset_dict['grounding_mask']=grounding_mask
        # else:
        #     grounding_mask=[True if g is not None else False for g in grounding_list]
        #     dataset_dict['grounding_mask']=grounding_mask
        #     new_grounding_list=[g for g in grounding_list if g is not None]
        #     if sum(grounding_mask)==0:
        #         dataset_dict['grounding']=False
        #     else:
        #         dataset_dict['grounding']=True
        #     if dataset_dict['grounding']:
        #         # assert num_segs == len(grounding_list)
        #         for grounding_id,grounding in enumerate(new_grounding_list):
        #             if grounding is not None:
        #                 for annid in grounding:
        #                     id2class[anno_id2id[annid]].append(grounding_id)
        #
        #         instances.gt_classes=id2class
        #         dataset_dict["instances"] = instances
        #     else:
        #         dataset_dict['grounding'] = False
        #         grounding_mask = [False for _ in range(num_segs)]
        #         dataset_dict['grounding_mask'] = grounding_mask
        return dataset_dict
================================================
FILE: datasets_os/dataset_mappers/coco_instruct_grounding_dataset_mapper.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging
import numpy as np
import torch
import PIL.Image as Image
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Instances
from pycocotools import mask as coco_mask
from llava.model.openseed.utils import configurable
from detectron2.structures import Boxes, BoxMode, PolygonMasks, RotatedBoxes
from llava import conversation as conversation_lib
from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
# from llava.train.train_hao_seg_flickr import ,preprocess
__all__ = ["COCOInstanceNewBaselineDatasetMapper"]
# Natural-language instruction suffixes asking the model to also emit boxes
# and masks for noun phrases. One is sampled at random to substitute for the
# literal '(with grounding)' marker in human turns when `replace_suffix` is
# enabled in the mapper below.
suffix=[
    "Please also provide the boxes and masks for the noun phrases in the response."
    , "Kindly ensure that the response includes the relevant boxes and masks for each noun phrase."
    , "Additionally, include the boxes and masks that match each noun phrase in the response."
    , "Please provide the boxes and masks that correspond to every noun phrase in your response."
    , "It’s important to have the boxes and masks that align with each noun phrase in the response."
    , "Make sure to include the appropriate boxes and masks for each noun phrase in your response."
    , "In your response, include the boxes and masks that pertain to each noun phrase."
    , "Also, supply the boxes and masks that are linked to each noun phrase in the response."
    , "Additionally, please furnish the boxes and masks that correspond to each noun phrase in the response."
    , "Don’t forget to provide the boxes and masks associated with each noun phrase in your response."
    , "Ensure that each noun phrase in the response has its respective boxes and masks.",
]
def convert_coco_poly_to_mask(segmentations, height, width):
    """
    Rasterize COCO polygon segmentations into a stacked (N, height, width)
    binary mask tensor; returns an empty uint8 tensor when there are no
    segmentations.
    """
    per_instance = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        decoded = coco_mask.decode(rles)
        if decoded.ndim < 3:
            decoded = decoded[..., None]
        # Merge all polygon components of one instance into a single mask.
        merged = torch.as_tensor(decoded, dtype=torch.uint8).any(dim=2)
        per_instance.append(merged)
    if not per_instance:
        return torch.zeros((0, height, width), dtype=torch.uint8)
    return torch.stack(per_instance, dim=0)
def preprocess_multimodal(
    sources,
    data_args
):
    """
    Normalize image-token placement in each conversation turn: move
    DEFAULT_IMAGE_TOKEN to the front of the turn, optionally wrap it in
    <Image></Image> (for "mmtag" conversation versions) or in
    IM_START/IM_END tokens (when ``data_args.mm_use_im_start_end``).
    Mutates ``sources`` in place and returns it.
    """
    if not data_args.is_multimodal:
        return sources

    for source in sources:
        for sentence in source:
            if DEFAULT_IMAGE_TOKEN in sentence['value']:
                # Strip the token wherever it was, then re-anchor it up front.
                remainder = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip()
                sentence['value'] = (DEFAULT_IMAGE_TOKEN + '\n' + remainder).strip()
                if "mmtag" in conversation_lib.default_conversation.version:
                    sentence['value'] = sentence['value'].replace(
                        DEFAULT_IMAGE_TOKEN, '<Image>' + DEFAULT_IMAGE_TOKEN + '</Image>')
            replace_token = DEFAULT_IMAGE_TOKEN
            if data_args.mm_use_im_start_end:
                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
            sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token)
    return sources
def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config: a
    large-scale-jitter resize followed by a fixed-size crop.

    Random flip is commented out in the original source — presumably because
    flips would invalidate spatial language in the grounded conversations
    ("left"/"right"); confirm before re-enabling.

    Returns:
        list[Augmentation]
    """
    # NOTE(review): the original had byte-identical `if is_train` / `else`
    # branches (copy-paste duplication); they are collapsed here. `is_train`
    # stays in the signature for interface compatibility.
    cfg_input = cfg['INPUT']
    image_size = cfg_input['IMAGE_SIZE']
    min_scale = cfg_input['MIN_SCALE']
    max_scale = cfg_input['MAX_SCALE']
    augmentation = [
        T.ResizeScale(
            min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
        ),
        T.FixedSizeCrop(crop_size=(image_size, image_size)),
    ]
    return augmentation
# This is specifically designed for the COCO dataset.
class COCOInstanceNewBaselineDatasetMapper:
"""
A callable which takes a dataset dict in Detectron2 Dataset format,
and map it into a format used by MaskFormer.
This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
The callable currently does the following:
1. Read the image from "file_name"
2. Applies geometric transforms to the image and annotation
3. Find and applies suitable cropping to the image and annotation
4. Prepare image and annotation to Tensors
"""
    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
        tokenizer,
        data_args,
        preprocess,
        replace_suffix=False,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation transform generators
            image_format: an image format supported by :func:`detection_utils.read_image`.
            tokenizer: LLM tokenizer used to encode the sampled conversation
            data_args: options namespace; must expose `image_processor`,
                `image_aspect_ratio`, `is_multimodal` and `mm_use_im_start_end`
            preprocess: callable that tokenizes conversation sources
            replace_suffix: when True, the literal '(with grounding)' marker in
                human turns is replaced by a randomly sampled suffix from the
                module-level `suffix` list
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
        )
        self.img_format = image_format
        self.is_train = is_train
        self.tokenizer = tokenizer
        # CLIP-style image processor for the LLaVA vision tower.
        self.processor = data_args.image_processor
        self.data_args = data_args
        self.preprocess = preprocess
        self.replace_suffix=replace_suffix
@classmethod
def from_config(cls, cfg, is_train=True,tokenizer=None,data_args=None,preprocess=None):
# Build augmentation
tfm_gens = build_transform_gen(cfg, is_train)
ret = {
"is_train": is_train,
"tfm_gens": tfm_gens,
"image_format": cfg['INPUT']['FORMAT'],
"tokenizer": tokenizer,
"data_args": data_args,
"preprocess": preprocess,
"replace_suffix": cfg['MODEL'].get('REPLACE_SUFFIX', False),
}
return ret
def __call__(self, dataset_dict):
    """
    Map one sample for instruct-grounding training.

    Args:
        dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format;
            expected to carry "conversations" (list of (conversation,
            grounding_list) pairs) and optionally "grounding_info".
    Returns:
        dict: a format that builtin models in detectron2 accept, extended with
        "image_clip", "conversation", "input_ids"/"labels", "grounding",
        "grounding_mask" and (when grounding info exists) "instances".
    """
    dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
    image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
    utils.check_image_size(dataset_dict, image)
    #########llava image processing
    # Build the CLIP-branch input ("image_clip") with LLaVA-style preprocessing.
    if self.data_args.image_aspect_ratio == 'pad':
        def expand2square(pil_img, background_color):
            # Pad the shorter side so the result is square, centering the input.
            # NOTE(review): this helper uses the PIL API (.size unpack, Image.new,
            # .paste) while `utils.read_image` returns a numpy array above —
            # confirm the 'pad' branch is actually exercised with PIL images.
            width, height = pil_img.size
            if width == height:
                return pil_img
            elif width > height:
                result = Image.new(pil_img.mode, (width, width), background_color)
                result.paste(pil_img, (0, (width - height) // 2))
                return result
            else:
                result = Image.new(pil_img.mode, (height, height), background_color)
                result.paste(pil_img, ((height - width) // 2, 0))
                return result
        image_clip = expand2square(image, tuple(int(x * 255) for x in self.processor.image_mean))
        image_clip = self.processor.preprocess(image_clip, return_tensors='pt')['pixel_values'][0]
    else:
        image_clip = self.processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
    dataset_dict["image_clip"] = image_clip
    ##################
    # TODO: get padding mask
    # by feeding a "segmentation mask" to the same transforms
    padding_mask = np.ones(image.shape[:2])
    image, transforms = T.apply_transform_gens(self.tfm_gens, image)
    # Keep the augmented (pre-tensor) image for downstream consumers.
    dataset_dict["image_ori"]=image
    # the crop transformation has default padding value 0 for segmentation,
    # so after inversion True marks padded (invalid) pixels.
    padding_mask = transforms.apply_segmentation(padding_mask)
    padding_mask = ~ padding_mask.astype(bool)
    image_shape = image.shape[:2]  # h, w
    dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
    dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))
    # Sample one (conversation, grounding_list) pair for this iteration.
    num_conversations = len(dataset_dict['conversations'])
    rd = np.random.choice(num_conversations)
    selected_conversation, grounding_list = dataset_dict['conversations'][rd]
    dataset_dict['conversation'] = [selected_conversation]
    sources = preprocess_multimodal(
        copy.deepcopy(dataset_dict['conversation']),
        self.data_args)
    if self.replace_suffix:
        # Swap the literal '(with grounding)' marker in human turns for a
        # randomly drawn suffix (module-level `suffix` — defined elsewhere in
        # this file; TODO confirm its contents).
        for conv in sources[0]:
            sf=np.random.choice(suffix)
            if conv['from'] == 'human':
                conv['value'] = conv['value'].replace('(with grounding)', sf, 1)
    data_dict_conversation = self.preprocess(
        sources,
        self.tokenizer,
        has_image=True)
    # Only the first (and only) conversation's tensors are kept.
    data_dict_conversation = dict(input_ids=data_dict_conversation["input_ids"][0],
                                  labels=data_dict_conversation["labels"][0])
    dataset_dict.update(data_dict_conversation)
    dataset_dict['tokenizer'] = self.tokenizer
    # Number of <seg> placeholders mentioned across the chosen conversation.
    num_segs = sum([conv['value'].count('<seg>') for conv in selected_conversation])
    # grounding_list=
    if "grounding_info" in dataset_dict and len(dataset_dict['grounding_info'])>0:
        # Map annotation id -> positional index so grounding references can be
        # resolved into per-instance class lists below.
        anno_id2id=dict()
        for id,obj in enumerate(dataset_dict['grounding_info']):
            obj["bbox_mode"] = BoxMode.XYWH_ABS
            anno_id2id[obj['id']]=id
        # id2class[i] collects the <seg>-slot indices that instance i grounds.
        id2class=[[] for _ in range(len(dataset_dict['grounding_info']))]
        annos = [
            utils.transform_instance_annotations(obj, transforms, image_shape)
            for obj in dataset_dict["grounding_info"]
        ]
        # assert "segmentation" in annos[0]
        instances = utils.annotations_to_instances(annos, image_shape,mask_format="bitmask")
        h, w = instances.image_size
        # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
        if hasattr(instances, 'gt_masks'):
            gt_masks = instances.gt_masks
            # gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
            # Unwrap BitMasks to a raw tensor, which the model consumes directly.
            instances.gt_masks = gt_masks.tensor
        if grounding_list is None:
            # Conversation has no grounding at all.
            dataset_dict['grounding']=False
            grounding_mask=[False for _ in range(num_segs)]
            dataset_dict['grounding_mask']=grounding_mask
        else:
            # Per-<seg> flags: True where a grounding target exists.
            grounding_mask=[True if g is not None else False for g in grounding_list]
            dataset_dict['grounding_mask']=grounding_mask
            new_grounding_list=[g for g in grounding_list if g is not None]
            if sum(grounding_mask)==0:
                dataset_dict['grounding']=False
            else:
                dataset_dict['grounding']=True
        if dataset_dict['grounding']:
            # assert num_segs == len(grounding_list)
            # Attach each grounded instance to the index of the <seg> slot
            # (within the filtered list) that refers to it.
            for grounding_id,grounding in enumerate(new_grounding_list):
                if grounding is not None:
                    for annid in grounding:
                        id2class[anno_id2id[annid]].append(grounding_id)
        instances.gt_classes=id2class
        dataset_dict["instances"] = instances
    else:
        # No grounding annotations: mark every <seg> slot as ungrounded.
        dataset_dict['grounding'] = False
        grounding_mask = [False for _ in range(num_segs)]
        dataset_dict['grounding_mask'] = grounding_mask
    return dataset_dict
================================================
FILE: datasets_os/dataset_mappers/coco_interactive_panoptic_new_baseline_dataset_mapper.py
================================================
# ------------------------------------------------------------------------
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li.
import copy
import logging
import numpy as np
import torch
from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Boxes, Instances
__all__ = ["COCOInteractivePanopticNewBaselineDatasetMapper"]
def filter_empty_instances_by_box(
    instances, by_box=True, by_mask=False, box_threshold=1e-5, return_mask=False
):
    """Filter out instances with empty boxes (and optionally empty masks).

    Args:
        instances: an ``Instances`` object with ``gt_boxes`` (and possibly
            ``gt_masks``) fields.
        by_box: drop instances whose boxes are degenerate.
        by_mask: additionally drop instances whose masks are empty (only applied
            when the ``gt_masks`` field is present).
        box_threshold: minimum box extent to count as non-empty.
        return_mask: when True, also return the boolean keep-mask.

    Returns:
        The filtered instances, or ``(filtered, keep_mask)`` when
        ``return_mask`` is True. When no criterion yields a mask, the input is
        returned unchanged.
    """
    assert by_box or by_mask
    keep_masks = []
    if by_box:
        keep_masks.append(instances.gt_boxes.nonempty(threshold=box_threshold))
    if by_mask and instances.has("gt_masks"):
        keep_masks.append(instances.gt_masks.nonempty())
    # TODO: can also filter visible keypoints
    if not keep_masks:
        return instances
    keep = keep_masks[0]
    for extra in keep_masks[1:]:
        keep = keep & extra
    if return_mask:
        return instances[keep], keep
    return instances[keep]
def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config.
    Includes an optional random flip, a scale jitter, and a fixed-size crop.
    Returns:
        list[Augmentation]
    """
    assert is_train, "Only support training augmentation"
    image_size = cfg.INPUT.IMAGE_SIZE
    min_scale = cfg.INPUT.MIN_SCALE
    max_scale = cfg.INPUT.MAX_SCALE
    flip_mode = cfg.INPUT.RANDOM_FLIP

    augmentation = []
    if flip_mode != "none":
        # "horizontal" or "vertical"; anything else falls through with both False.
        augmentation.append(
            T.RandomFlip(
                horizontal=(flip_mode == "horizontal"),
                vertical=(flip_mode == "vertical"),
            )
        )
    augmentation.append(
        T.ResizeScale(
            min_scale=min_scale,
            max_scale=max_scale,
            target_height=image_size,
            target_width=image_size,
        )
    )
    augmentation.append(T.FixedSizeCrop(crop_size=(image_size, image_size)))
    return augmentation
# This is specifically designed for the COCO dataset.
class COCOInteractivePanopticNewBaselineDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer.
    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
    The callable currently does the following:
    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation transform generators
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
                str(self.tfm_gens)
            )
        )
        self.img_format = image_format
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True):
        # Build augmentation from the config; everything else is passed through.
        tfm_gens = build_transform_gen(cfg, is_train)
        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg.INPUT.FORMAT,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)
        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        image_shape = image.shape[:2]  # h, w
        # Keep the augmented (pre-tensor) image for downstream consumers.
        dataset_dict["image_ori"]=image
        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        # if not self.is_train:
        #     # USER: Modify this if you want to keep them for some reason.
        #     dataset_dict.pop("annotations", None)
        #     return dataset_dict
        if "pan_seg_file_name" in dataset_dict:
            pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
            segments_info = dataset_dict["segments_info"]
            # apply the same transformation to panoptic segmentation
            pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
            from panopticapi.utils import rgb2id
            # Collapse the RGB-encoded panoptic PNG into integer segment ids.
            pan_seg_gt = rgb2id(pan_seg_gt)
            instances = Instances(image_shape)
            classes = []
            masks = []
            # Crowd segments are skipped; each kept segment becomes one instance.
            for segment_info in segments_info:
                class_id = segment_info["category_id"]
                if not segment_info["iscrowd"]:
                    classes.append(class_id)
                    masks.append(pan_seg_gt == segment_info["id"])
            classes = np.array(classes)
            instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
            if len(masks) == 0:
                # Some image does not have annotation (all ignored)
                instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
                instances.gt_boxes = Boxes(torch.zeros((0, 4)))
            else:
                masks = BitMasks(
                    torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
                )
                # Store the raw mask tensor; derive boxes from the bitmasks.
                instances.gt_masks = masks.tensor
                instances.gt_boxes = masks.get_bounding_boxes()
            # Drop instances whose boxes became degenerate after crop/resize.
            dataset_dict["instances"] = filter_empty_instances_by_box(instances)
        return dataset_dict
================================================
FILE: datasets_os/dataset_mappers/coco_panoptic_interactive_dataset_mapper.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging
import random
import numpy as np
import torch
import PIL.Image as Image
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Boxes, Instances, BoxMode
from detectron2.structures.boxes import pairwise_iou
from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
from detectron2.data import MetadataCatalog
from pycocotools import mask as coco_mask
from utils.prompt_engineering import prompt_engineering, get_prompt_templates
from llava.model.openseed.utils import configurable
# from ..shapes.sampler import build_shape_sampler
__all__ = ["COCOPanopticInteractiveDatasetMapper"]
def filter_empty_instances_by_box(
    instances, by_box=True, by_mask=False, box_threshold=1e-5, return_mask=False
):
    """Remove instances whose boxes (and optionally masks) are empty.

    Args:
        instances: an ``Instances`` object with ``gt_boxes`` (and possibly
            ``gt_masks``) fields.
        by_box: filter on degenerate boxes.
        by_mask: also filter on empty masks when ``gt_masks`` exists.
        box_threshold: minimum box extent to count as non-empty.
        return_mask: when True, also return the boolean keep-mask.

    Returns:
        Filtered instances, or ``(filtered, keep_mask)`` when ``return_mask``
        is True; the input unchanged when no criterion applies.
    """
    assert by_box or by_mask
    criteria = []
    if by_box:
        criteria.append(instances.gt_boxes.nonempty(threshold=box_threshold))
    if by_mask and instances.has("gt_masks"):
        criteria.append(instances.gt_masks.nonempty())
    # TODO: can also filter visible keypoints
    if not criteria:
        return instances
    keep = criteria[0]
    for other in criteria[1:]:
        keep = keep & other
    return (instances[keep], keep) if return_mask else instances[keep]
def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config.
    Only scale jitter plus a fixed-size crop; random flip is intentionally
    not applied by this mapper.
    Returns:
        list[Augmentation]
    """
    input_cfg = cfg['INPUT']
    size = input_cfg['IMAGE_SIZE']
    return [
        T.ResizeScale(
            min_scale=input_cfg['MIN_SCALE'],
            max_scale=input_cfg['MAX_SCALE'],
            target_height=size,
            target_width=size,
        ),
        T.FixedSizeCrop(crop_size=(size, size)),
    ]
def convert_coco_poly_to_mask(segmentations, height, width):
    """Rasterize COCO polygon segmentations into per-instance binary masks.

    Args:
        segmentations: a list where each element holds the polygon(s) of one
            instance, in COCO format.
        height, width: output mask resolution in pixels.

    Returns:
        A tensor of shape ``(N, height, width)``; empty input yields a
        ``(0, height, width)`` uint8 tensor.
    """
    per_instance = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        decoded = coco_mask.decode(rles)
        if decoded.ndim < 3:
            # A single polygon decodes to 2-D; add the parts axis back.
            decoded = decoded[..., None]
        # Union all polygon parts of this instance into one mask.
        merged = torch.as_tensor(decoded, dtype=torch.uint8).any(dim=2)
        per_instance.append(merged)
    if not per_instance:
        return torch.zeros((0, height, width), dtype=torch.uint8)
    return torch.stack(per_instance, dim=0)
# This is specifically designed for the COCO dataset.
class COCOPanopticInteractiveDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer.
    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
    The callable currently does the following:
    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
        caption_thres,
        # lvis,
        # lvis_thres,
        max_grounding_num,
        tokenizer,
        data_args,
        preprocess,
        # shape_sampler,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation transform generators
            image_format: an image format supported by :func:`detection_utils.read_image`.
            caption_thres: caption-similarity threshold (from the decoder config)
            max_grounding_num: upper bound on grounding phrases sampled per image
            tokenizer: tokenizer used to encode the generated conversation
            data_args: holds the CLIP image processor and aspect-ratio options
            preprocess: callable turning conversations into input_ids/labels
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
                str(self.tfm_gens)
            )
        )
        self.img_format = image_format
        self.is_train = is_train
        self.caption_thres = caption_thres
        # Grounding branch is always on for this mapper.
        self.grounding = True
        # self.lvis = lvis
        # self.lvis_thres = lvis_thres
        self.max_grounding_num = max_grounding_num
        # Precomputed caption-similarity table registered under the 'logistic'
        # metadata entry. NOTE(review): loaded but not used in this class —
        # presumably consumed elsewhere; confirm.
        self.caption_similarity = torch.load(MetadataCatalog.get('logistic').get('caption_similarity_pth'))
        self.tokenizer = tokenizer
        self.processor = data_args.image_processor
        self.data_args = data_args
        self.preprocess = preprocess
        # self.shape_sampler = shape_sampler

    @classmethod
    def from_config(cls, cfg, is_train=True,tokenizer=None,data_args=None,preprocess=None):
        # Build augmentation; the rest is read straight out of the dict config.
        tfm_gens = build_transform_gen(cfg, is_train)
        # shape_sampler = build_shape_sampler(cfg)
        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg['INPUT']['FORMAT'],
            "caption_thres": cfg['MODEL']['DECODER']['CAPTION']['SIM_THRES'],
            "max_grounding_num": cfg['MODEL']['DECODER']['GROUNDING']['MAX_LEN'],
            "tokenizer": tokenizer,
            "data_args": data_args,
            "preprocess": preprocess,
            # "shape_sampler": shape_sampler,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)
        #########llava image processing
        # Build the CLIP-branch input ("image_clip") with LLaVA-style preprocessing.
        if self.data_args.image_aspect_ratio == 'pad':
            def expand2square(pil_img, background_color):
                # Pad the shorter side so the result is square, centering the input.
                # NOTE(review): uses the PIL API while `utils.read_image` returns a
                # numpy array here — confirm the 'pad' branch is exercised.
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img, (0, (width - height) // 2))
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img, ((height - width) // 2, 0))
                    return result
            image_clip = expand2square(image, tuple(int(x * 255) for x in self.processor.image_mean))
            image_clip = self.processor.preprocess(image_clip, return_tensors='pt')['pixel_values'][0]
        else:
            image_clip = self.processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
        dataset_dict["image_clip"] = image_clip
        ##################
        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        image_shape = image.shape[:2]  # h, w
        dataset_dict["image_ori"]=image
        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        if "pan_seg_file_name" in dataset_dict:
            pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
            segments_info = dataset_dict["segments_info"]
            # apply the same transformation to panoptic segmentation
            pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
            from panopticapi.utils import rgb2id
            # Collapse the RGB-encoded panoptic PNG into integer segment ids.
            pan_seg_gt = rgb2id(pan_seg_gt)
            instances = Instances(image_shape)
            classes = []
            masks = []
            for segment_info in segments_info:
                class_id = segment_info["category_id"]
                if not segment_info["iscrowd"]:
                    classes.append(class_id)
                    masks.append(pan_seg_gt == segment_info["id"])
            # is_things = [COCO_CATEGORIES[idx]['isthing'] for idx in classes]
            classes = np.array(classes)
            # is_things = np.array(is_things)
            instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
            # instances.is_things = torch.tensor(is_things, dtype=torch.int64)
            if len(masks) == 0:
                # Some image does not have annotation (all ignored).
                # NOTE: gt_masks is kept as a BitMasks object here (its .tensor
                # is taken later in the grounding branch).
                masks = BitMasks(torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])))
                instances.gt_masks = masks
                instances.gt_boxes = Boxes(torch.zeros((0, 4)))
            else:
                masks = BitMasks(
                    torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
                )
                instances.gt_masks = masks
                instances.gt_boxes = masks.get_bounding_boxes()
            if self.grounding:
                grounding_anno = dataset_dict['grounding_info']
                # How many grounding phrases to sample for this image.
                if self.is_train:
                    grounding_len = random.randint(1, self.max_grounding_num - 1)
                else:
                    grounding_len = 1
                if len(grounding_anno) > 0:
                    # "text" mode: use referring-expression annotations directly.
                    masks_grd = []
                    texts_grd = []
                    mode = 'text'
                    random.shuffle(grounding_anno)
                    for ann in grounding_anno:
                        rle = coco_mask.frPyObjects(
                            ann['segmentation'], dataset_dict['height'], dataset_dict['width'])
                        m = coco_mask.decode(rle)
                        # sometimes there are multiple binary map (corresponding to multiple segs)
                        m = np.sum(m, axis=2)>0
                        m = m.astype(np.uint8) # convert to np.uint8
                        # Apply the same geometric transforms as the image.
                        m = transforms.apply_segmentation(m[:, :, None])[:, :, 0]==1
                        masks_grd += [m]
                        # random select a sentence of a single annotation.
                        rand_index = random.randint(0, len(ann['sentences']) - 1)
                        texts_grd += [ann['sentences'][rand_index]['raw'].lower()]
                    # Keep at most grounding_len phrases, in random order.
                    max_len = min(grounding_len, len(texts_grd))
                    indices = np.random.permutation(max_len)
                    texts_grd = list(np.array(texts_grd)[indices])
                    masks_grd = torch.tensor(np.stack(masks_grd)[indices])
                    hash_grd = np.array([hash(txt) for txt in texts_grd])
                    # One class index per phrase, wrapped in a list per instance.
                    gt_classes = list(range(len(texts_grd)))
                    gt_classes = [[lb] for lb in gt_classes]
                    label_set=texts_grd
                else:
                    # "class" mode fallback: ground on panoptic category names.
                    assert self.is_train
                    masks_grd = instances.gt_masks.tensor
                    mode = 'class'
                    assert len(masks_grd) > 0
                    texts_grd = np.array([COCO_CATEGORIES[idx]['name'] for idx in classes])
                    hash_grd = np.array([hash(txt) for txt in texts_grd])
                    # Sample up to grounding_len distinct category names; keep
                    # every instance belonging to a sampled category.
                    unique_hash_grd = np.unique(hash_grd)
                    np.random.shuffle(unique_hash_grd)
                    max_len = min(grounding_len,len(unique_hash_grd))
                    indices = np.random.permutation(max_len)
                    selected_unique_hash_grd = unique_hash_grd[indices]
                    selected_mask = np.in1d(hash_grd, selected_unique_hash_grd)
                    texts_grd = texts_grd[selected_mask]
                    hash_grd = hash_grd[selected_mask]
                    masks_grd = masks_grd[selected_mask]
                    # Strip panoptic naming suffixes for natural-sounding text.
                    texts_grd = [
                        text.replace('-other', '').replace('-merged', '').replace('-stuff', '')
                        for text in texts_grd]
                    label_set=list(set(texts_grd))
                    gt_classes=[[label_set.index(lb)] for lb in texts_grd]
                instances_gd = Instances(image_shape)
                instances_gd.gt_masks = BitMasks(masks_grd)
                instances_gd.gt_boxes = BitMasks(masks_grd).get_bounding_boxes()
                # Unwrap to a raw tensor after boxes have been derived.
                instances_gd.gt_masks=instances_gd.gt_masks.tensor
                instances_gd.gt_classes=gt_classes
                dataset_dict["instances"] = instances_gd
                # Build one human/gpt turn pair per grounding phrase; only the
                # first turn carries the <image> token.
                conversations=[]
                for i in range(len(label_set)):
                    if i==0:
                        question={'from': 'human', 'value': f"<image>\n Please detect the object according to the text {label_set[i]} (referring)."}
                    else:
                        question={'from': 'human', 'value': f"Please detect the object according to the text {label_set[i]} (referring)."}
                    answer={'from': 'gpt', 'value': '<seg> .'}
                    conversations.append(question)
                    conversations.append(answer)
                dataset_dict['conversation'] = [conversations]
                data_dict_conversation = self.preprocess(
                    dataset_dict['conversation'],
                    self.tokenizer,
                    has_image=True)
                data_dict_conversation = dict(input_ids=data_dict_conversation["input_ids"][0],
                                              labels=data_dict_conversation["labels"][0])
                dataset_dict.update(data_dict_conversation)
                dataset_dict['tokenizer']=self.tokenizer
        return dataset_dict
================================================
FILE: datasets_os/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py
================================================
# ------------------------------------------------------------------------
# Copyright (c) 2022 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li.
import copy
import logging
import numpy as np
import torch
from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Boxes, Instances
__all__ = ["COCOPanopticNewBaselineDatasetMapper"]
def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config.
    Scale jitter followed by a fixed-size crop; random flip is intentionally
    not applied by this mapper.
    Returns:
        list[Augmentation]
    """
    size = cfg.INPUT.IMAGE_SIZE
    return [
        T.ResizeScale(
            min_scale=cfg.INPUT.MIN_SCALE,
            max_scale=cfg.INPUT.MAX_SCALE,
            target_height=size,
            target_width=size,
        ),
        T.FixedSizeCrop(crop_size=(size, size)),
    ]
# This is specifically designed for the COCO dataset.
class COCOPanopticNewBaselineDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer.
    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
    The callable currently does the following:
    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation transform generators
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
                str(self.tfm_gens)
            )
        )
        self.img_format = image_format
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True):
        # Build augmentation from the config; everything else is passed through.
        tfm_gens = build_transform_gen(cfg, is_train)
        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg.INPUT.FORMAT,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)
        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        image_shape = image.shape[:2]  # h, w
        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        # if not self.is_train:
        #     # USER: Modify this if you want to keep them for some reason.
        #     dataset_dict.pop("annotations", None)
        #     return dataset_dict
        if "pan_seg_file_name" in dataset_dict:
            pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
            segments_info = dataset_dict["segments_info"]
            # apply the same transformation to panoptic segmentation
            pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
            from panopticapi.utils import rgb2id
            # Collapse the RGB-encoded panoptic PNG into integer segment ids.
            pan_seg_gt = rgb2id(pan_seg_gt)
            instances = Instances(image_shape)
            classes = []
            masks = []
            # Crowd segments are skipped; each kept segment becomes one instance.
            for segment_info in segments_info:
                class_id = segment_info["category_id"]
                if not segment_info["iscrowd"]:
                    classes.append(class_id)
                    masks.append(pan_seg_gt == segment_info["id"])
            classes = np.array(classes)
            instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
            if len(masks) == 0:
                # Some image does not have annotation (all ignored)
                instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
                instances.gt_boxes = Boxes(torch.zeros((0, 4)))
            else:
                masks = BitMasks(
                    torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
                )
                # Store the raw mask tensor; derive boxes from the bitmasks.
                instances.gt_masks = masks.tensor
                instances.gt_boxes = masks.get_bounding_boxes()
            dataset_dict["instances"] = instances
        return dataset_dict
================================================
FILE: datasets_os/dataset_mappers/flickr_instance_new_baseline_dataset_mapper.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging
import numpy as np
import torch
import PIL.Image as Image
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Instances
from pycocotools import mask as coco_mask
from llava.model.openseed.utils import configurable
from detectron2.structures import Boxes, BoxMode, PolygonMasks, RotatedBoxes
# from llava.train.train_hao_seg_flickr import ,preprocess
__all__ = ["COCOInstanceNewBaselineDatasetMapper"]
def convert_coco_poly_to_mask(segmentations, height, width):
    """Decode COCO polygon segmentations into a stack of binary masks.

    Args:
        segmentations: list of per-instance COCO polygon annotations.
        height, width: output resolution of each mask.

    Returns:
        Tensor of shape ``(N, height, width)``; a ``(0, height, width)`` uint8
        tensor when the input list is empty.
    """
    decoded_masks = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        raw = coco_mask.decode(rles)
        if raw.ndim < 3:
            # Single-polygon instances decode to 2-D; restore the parts axis.
            raw = raw[..., None]
        # Union all polygon parts belonging to the same instance.
        decoded_masks.append(torch.as_tensor(raw, dtype=torch.uint8).any(dim=2))
    if not decoded_masks:
        return torch.zeros((0, height, width), dtype=torch.uint8)
    return torch.stack(decoded_masks, dim=0)
def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config.
    Scale jitter followed by a fixed-size crop; random flip is intentionally
    not applied by this mapper.

    Args:
        cfg: dict-style config with an 'INPUT' section providing
            'IMAGE_SIZE', 'MIN_SCALE' and 'MAX_SCALE'.
        is_train: kept for interface compatibility; the original
            implementation duplicated the exact same augmentation list in its
            train and eval branches, so the value does not affect the result
            and the dead duplication is collapsed here (behavior unchanged).

    Returns:
        list[Augmentation]
    """
    cfg_input = cfg['INPUT']
    image_size = cfg_input['IMAGE_SIZE']
    min_scale = cfg_input['MIN_SCALE']
    max_scale = cfg_input['MAX_SCALE']
    augmentation = [
        T.ResizeScale(
            min_scale=min_scale,
            max_scale=max_scale,
            target_height=image_size,
            target_width=image_size,
        ),
        T.FixedSizeCrop(crop_size=(image_size, image_size)),
    ]
    return augmentation
# This is specifically designed for the COCO dataset.
class COCOInstanceNewBaselineDatasetMapper:
"""
A callable which takes a dataset dict in Detectron2 Dataset format,
and map it into a format used by MaskFormer.
This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
The callable currently does the following:
1. Read the image from "file_name"
2. Applies geometric transforms to the image and annotation
3. Find and applies suitable cropping to the image and annotation
4. Prepare image and annotation to Tensors
"""
@configurable
def __init__(
    self,
    is_train=True,
    *,
    tfm_gens,
    image_format,
    tokenizer,
    data_args,
    preprocess,
    gd_mode="inter",
):
    """
    NOTE: this interface is experimental.

    Args:
        is_train: for training or inference
        tfm_gens: data augmentation transform generators
        image_format: an image format supported by :func:`detection_utils.read_image`.
        tokenizer: tokenizer used to encode conversations
        data_args: options object; must expose ``image_processor`` and
            ``image_aspect_ratio``
        preprocess: callable turning conversations into ``input_ids``/``labels``
        gd_mode: grounding mode flag read from the flickr config
            (defaults to "inter")
    """
    logger = logging.getLogger(__name__)
    self.tfm_gens = tfm_gens
    logger.info(
        "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
    )
    self.img_format = image_format
    self.is_train = is_train
    self.tokenizer = tokenizer
    self.data_args = data_args
    # Convenience alias: the CLIP image processor lives on data_args.
    self.processor = data_args.image_processor
    self.preprocess = preprocess
    self.gd_mode = gd_mode
@classmethod
def from_config(cls, cfg, is_train=True, tokenizer=None, data_args=None, preprocess=None):
    """Assemble the keyword arguments consumed by ``__init__`` from a config.

    The augmentation list is built here; ``gd_mode`` is optional on the
    flickr config section and defaults to "inter".
    """
    return {
        "is_train": is_train,
        "tfm_gens": build_transform_gen(cfg, is_train),
        "image_format": cfg['INPUT']['FORMAT'],
        "tokenizer": tokenizer,
        "data_args": data_args,
        "preprocess": preprocess,
        "gd_mode": cfg.flickr.get("gd_mode", "inter"),
    }
def __call__(self, dataset_dict):
    """
    Map one Flickr grounding sample into training format.

    Produces: a CLIP-preprocessed image ("image_clip"), the augmented image
    tensor ("image"/"padding_mask"), grounded Instances with per-instance
    phrase-span class ids, and a single-turn conversation whose answer is the
    caption re-tagged with <g_s>/<g_e>/<seg> grounding tokens.

    Args:
        dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
            Uses "file_name", "caption", and (optionally) "grounding_info",
            whose entries carry "tokens_positive" character spans into the
            caption.
    Returns:
        dict: a format that builtin models in detectron2 accept
    """
    dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
    image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
    utils.check_image_size(dataset_dict, image)
    ######### llava image processing
    if self.data_args.image_aspect_ratio == 'pad':
        def expand2square(pil_img, background_color):
            # Pad the short side with the CLIP mean color so the image becomes
            # square without distorting its aspect ratio.
            # NOTE(review): this helper expects a PIL image (.size/.mode), but
            # utils.read_image normally yields an ndarray — confirm this
            # branch is actually exercised with PIL inputs.
            width, height = pil_img.size
            if width == height:
                return pil_img
            elif width > height:
                result = Image.new(pil_img.mode, (width, width), background_color)
                result.paste(pil_img, (0, (width - height) // 2))
                return result
            else:
                result = Image.new(pil_img.mode, (height, height), background_color)
                result.paste(pil_img, ((height - width) // 2, 0))
                return result
        image_clip = expand2square(image, tuple(int(x * 255) for x in self.processor.image_mean))
        image_clip = self.processor.preprocess(image_clip, return_tensors='pt')['pixel_values'][0]
    else:
        image_clip = self.processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
    dataset_dict["image_clip"] = image_clip
    ##################
    # TODO: get padding mask
    # by feeding a "segmentation mask" to the same transforms
    padding_mask = np.ones(image.shape[:2])
    image, transforms = T.apply_transform_gens(self.tfm_gens, image)
    dataset_dict["image_ori"] = image
    # the crop transformation has default padding value 0 for segmentation
    padding_mask = transforms.apply_segmentation(padding_mask)
    padding_mask = ~ padding_mask.astype(bool)  # True where padding was introduced
    image_shape = image.shape[:2]  # h, w
    # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
    # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
    # Therefore it's important to use torch.Tensor.
    dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
    dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))
    # if not self.is_train:
    #     # USER: Modify this if you want to keep them for some reason.
    #     dataset_dict.pop("annotations", None)
    #     return dataset_dict
    if "grounding_info" in dataset_dict:
        # Attach to each object the caption substring covered by its first
        # character span.
        for obj in dataset_dict['grounding_info']:
            obj["bbox_mode"] = BoxMode.XYWH_ABS
            obj['tokens'] = dataset_dict['caption'][obj['tokens_positive'][0][0]:obj['tokens_positive'][0][1]]
        # USER: Implement additional transformations if you have other types of data
        annos = [
            utils.transform_instance_annotations(obj, transforms, image_shape)
            for obj in dataset_dict["grounding_info"]
        ]
        # NOTE: does not support BitMask due to augmentation
        # Current BitMask cannot handle empty objects
        assert len(annos) > 0
        assert "segmentation" in annos[0]
        instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask")
        # After transforms such as cropping are applied, the bounding box may no longer
        # tightly bound the object. As an example, imagine a triangle object
        # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
        # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
        # the intersection of original bounding box and the cropping box.
        # instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
        # Need to filter empty instances first (due to augmentation)
        # instances = utils.filter_empty_instances(instances)
        # Generate masks from polygon
        h, w = instances.image_size
        # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
        if hasattr(instances, 'gt_masks'):
            gt_masks = instances.gt_masks
            # gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
            instances.gt_masks = gt_masks.tensor
        # span_set: span start -> instance indices that ground this span.
        # end_dict: span start -> span end (each start maps to a unique end).
        span_set = dict()
        end_dict = dict()
        gt_classes = []
        for i, info in enumerate(dataset_dict['grounding_info']):
            gt_classes.append([])
            # if len(info['tokens_positive'])>1:
            #     print("multi class")
            for j in range(len(info['tokens_positive'])):
                if info['tokens_positive'][j][0] in span_set:
                    span_set[info['tokens_positive'][j][0]].append(i)
                else:
                    span_set[info['tokens_positive'][j][0]] = [i]
                if info['tokens_positive'][j][0] in end_dict:
                    # A given start offset must always pair with the same end.
                    assert end_dict[info['tokens_positive'][j][0]] == info['tokens_positive'][j][1]
                else:
                    end_dict[info['tokens_positive'][j][0]] = info['tokens_positive'][j][1]
                gt_classes[-1].append(info['tokens_positive'][j][0])
        # Re-index spans in caption order: after sorting, span k is the k-th
        # grounded phrase of the caption; gt_classes then holds span indices.
        end_dict = sorted(end_dict.items())  # now a list of (start, end) pairs
        start2id = dict()
        for i, (s, e) in enumerate(end_dict):
            start2id[s] = i
        gt_classes = [[start2id[s] for s in gt_class] for gt_class in gt_classes]
        instances.gt_classes = gt_classes
        dataset_dict["instances"] = instances
        # span_list = sorted(span_set.items())
        # for k, v in span_set:
        #     for i in range(len(v)):
        #         v[i] = positive_new_ids[v[i]]
        # Split the caption into alternating ungrounded/grounded pieces; odd
        # indices of cap_pieces are the grounded phrases.
        cap_pieces = []
        last_e = 0
        for s, e in end_dict:
            cap_pieces.append(dataset_dict['caption'][last_e:s])
            cap_pieces.append(dataset_dict['caption'][s:e])
            last_e = e
        cap_pieces.append(dataset_dict['caption'][last_e:])
        new_cap = []
        if 'end' in self.gd_mode:
            # "end"/"end_num" modes: wrap phrases in <g_s>/<g_e> (optionally
            # numbered) and emit every <seg> token in a numbered tail instead
            # of inline.
            k = 1
            for i, piece in enumerate(cap_pieces):
                if i % 2 == 1:
                    if self.gd_mode == 'end':
                        piece = '<g_s>' + piece + '<g_e>'
                    else:
                        assert self.gd_mode == 'end_num'
                        piece = f'<g_s> {k} ' + piece + '<g_e>'
                        k += 1
                new_cap.append(piece)
            new_cap = "".join(new_cap)
            tail = [f'{i + 1}: <seg>' for i in range(new_cap.count("<g_s>"))]
            tail = '; '.join(tail)
            new_cap += f' {tail}.'
        else:
            # "inter" mode: a <seg> token immediately follows each phrase.
            for i, piece in enumerate(cap_pieces):
                if i % 2 == 1:
                    piece = '<g_s>' + piece + '<g_e><seg>'
                new_cap.append(piece)
            new_cap = "".join(new_cap)
        # gt_ids = []
        # for s, e in end_dict:
        #     if len(span_set[s]) > 1:
        #         return dataset_dict
        #     gt_ids.append(span_set[s][0] + 1)
        # ground_annos = dict()
        # ground_annos['gt_ids'] = gt_ids
        # ground_annos['gt_anno_ids'] = [dataset_dict['grounding_info'][gt_id_ - 1]['id'] for gt_id_ in gt_ids]
        # ground_annos['caption'] = new_cap
        # Build a single-turn conversation: fixed grounding prompt as the
        # question, the tagged caption as the answer.
        question = {'from': 'human', 'value': "<image>\nPresent a compact description of the photo's key features.\n(with grounding)"}
        answer = {'from': 'gpt', 'value': new_cap}
        dataset_dict['conversation'] = [[question, answer]]
        # sources = preprocess_multimodal(
        #     copy.deepcopy(dataset_dict['conversation']),
        #     self.data_args)
        data_dict_conversation = self.preprocess(
            dataset_dict['conversation'],
            self.tokenizer,
            has_image=True)
        data_dict_conversation = dict(input_ids=data_dict_conversation["input_ids"][0],
                                      labels=data_dict_conversation["labels"][0])
        dataset_dict.update(data_dict_conversation)
        dataset_dict['tokenizer'] = self.tokenizer
    return dataset_dict
================================================
FILE: datasets_os/dataset_mappers/flickr_instance_new_baseline_dataset_mapper_.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging
import numpy as np
import torch
import PIL.Image as Image
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Instances
from pycocotools import mask as coco_mask
from llava.model.openseed.utils import configurable
from detectron2.structures import Boxes, BoxMode, PolygonMasks, RotatedBoxes
from llava import conversation as conversation_lib
from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
# from llava.train.train_hao_seg_flickr import ,preprocess
__all__ = ["COCOInstanceNewBaselineDatasetMapper"]
def convert_coco_poly_to_mask(segmentations, height, width):
    """
    Rasterize COCO polygon segmentations into a stacked boolean mask tensor.

    Args:
        segmentations: iterable of COCO polygon lists, one entry per instance.
        height: output mask height in pixels.
        width: output mask width in pixels.

    Returns:
        torch.Tensor: bool tensor of shape (N, height, width); an empty
        (0, height, width) tensor when `segmentations` is empty.
    """
    masks = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        mask = coco_mask.decode(rles)
        if len(mask.shape) < 3:
            mask = mask[..., None]  # promote single-part mask to (H, W, 1)
        mask = torch.as_tensor(mask, dtype=torch.uint8)
        # Collapse the per-polygon axis: a pixel is set if any part covers it.
        mask = mask.any(dim=2)
        masks.append(mask)
    if masks:
        masks = torch.stack(masks, dim=0)
    else:
        # BUGFIX: the empty case previously returned uint8 while the
        # non-empty case returns bool (result of .any()); use bool so the
        # dtype is consistent for downstream consumers.
        masks = torch.zeros((0, height, width), dtype=torch.bool)
    return masks
def preprocess_multimodal(
    sources,
    data_args
):
    """
    Normalize image-token placement in conversation sources, in place.

    For every sentence containing the image token: move the token to the
    front on its own line, optionally wrap it in <Image>...</Image> for
    "mmtag" conversation versions, and expand it with start/end tokens when
    ``data_args.mm_use_im_start_end`` is set.

    Args:
        sources: list of conversations, each a list of {'value': str, ...} dicts.
        data_args: must expose ``is_multimodal`` and ``mm_use_im_start_end``.

    Returns:
        The same ``sources`` object, mutated.
    """
    if not data_args.is_multimodal:
        return sources

    for conversation in sources:
        for turn in conversation:
            text = turn['value']
            if DEFAULT_IMAGE_TOKEN in text:
                # Hoist the image token to the start, on a line of its own.
                text = DEFAULT_IMAGE_TOKEN + '\n' + text.replace(DEFAULT_IMAGE_TOKEN, '').strip()
                turn['value'] = text.strip()
                if "mmtag" in conversation_lib.default_conversation.version:
                    turn['value'] = turn['value'].replace(
                        DEFAULT_IMAGE_TOKEN, '<Image>' + DEFAULT_IMAGE_TOKEN + '</Image>')
            replacement = (
                DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
                if data_args.mm_use_im_start_end
                else DEFAULT_IMAGE_TOKEN
            )
            turn["value"] = turn["value"].replace(DEFAULT_IMAGE_TOKEN, replacement)
    return sources
def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config.

    The pipeline is large-scale jitter (random resize within
    [MIN_SCALE, MAX_SCALE]) followed by a fixed-size crop. Random flipping is
    deliberately disabled (it was commented out upstream — presumably because
    flipping would invalidate left/right phrases in grounded captions;
    confirm before re-enabling).

    Args:
        cfg: config providing cfg['INPUT']['IMAGE_SIZE'/'MIN_SCALE'/'MAX_SCALE'].
        is_train: kept for interface compatibility; the original code built
            byte-identical pipelines in both the train and test branches, so
            the duplication is collapsed here and the flag is unused.

    Returns:
        list[Augmentation]
    """
    cfg_input = cfg['INPUT']
    image_size = cfg_input['IMAGE_SIZE']
    min_scale = cfg_input['MIN_SCALE']
    max_scale = cfg_input['MAX_SCALE']
    augmentation = [
        T.ResizeScale(
            min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
        ),
        T.FixedSizeCrop(crop_size=(image_size, image_size)),
    ]
    return augmentation
# This is specifically designed for the COCO dataset.
class COCOInstanceNewBaselineDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer.

    This dataset mapper applies the same transformation as DETR for COCO
    panoptic segmentation, then additionally prepares LLaVA-style inputs:
    a CLIP-preprocessed image, one randomly selected conversation (tokenized
    to input_ids/labels), and grounded Instances whose per-instance classes
    index the <seg> slots of that conversation.

    The callable currently does the following:
    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
        tokenizer,
        data_args,
        preprocess,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            is_train: for training or inference
            tfm_gens: data augmentation transform generators
            image_format: an image format supported by :func:`detection_utils.read_image`.
            tokenizer: tokenizer used to encode the selected conversation
            data_args: options holder; must expose ``image_processor`` (CLIP)
                and ``image_aspect_ratio``
            preprocess: callable that tokenizes conversations into
                input_ids/labels
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
        )
        self.img_format = image_format
        self.is_train = is_train
        self.tokenizer = tokenizer
        self.processor = data_args.image_processor
        self.data_args = data_args
        self.preprocess = preprocess

    @classmethod
    def from_config(cls, cfg, is_train=True, tokenizer=None, data_args=None, preprocess=None):
        """Build the ``__init__`` kwargs (consumed via ``configurable``)."""
        # Build augmentation
        tfm_gens = build_transform_gen(cfg, is_train)
        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg['INPUT']['FORMAT'],
            "tokenizer": tokenizer,
            "data_args": data_args,
            "preprocess": preprocess,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
                Uses "file_name", "grounding_info" (COCO-style annotations),
                and "conversations" — a list of (conversation, grounding_list)
                pairs, where grounding_list maps each <seg> token to the
                annotation ids it grounds (or None).
        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)
        ######### llava image processing
        if self.data_args.image_aspect_ratio == 'pad':
            def expand2square(pil_img, background_color):
                # Pad the short side with the CLIP mean color so the image
                # becomes square without distorting its aspect ratio.
                # NOTE(review): expects a PIL image (.size/.mode), but
                # utils.read_image normally yields an ndarray — confirm this
                # branch is exercised with PIL inputs.
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img, (0, (width - height) // 2))
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img, ((height - width) // 2, 0))
                    return result
            image_clip = expand2square(image, tuple(int(x * 255) for x in self.processor.image_mean))
            image_clip = self.processor.preprocess(image_clip, return_tensors='pt')['pixel_values'][0]
        else:
            image_clip = self.processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
        dataset_dict["image_clip"] = image_clip
        ##################
        # TODO: get padding mask
        # by feeding a "segmentation mask" to the same transforms
        padding_mask = np.ones(image.shape[:2])
        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        dataset_dict["image_ori"] = image
        # the crop transformation has default padding value 0 for segmentation
        padding_mask = transforms.apply_segmentation(padding_mask)
        padding_mask = ~ padding_mask.astype(bool)  # True where padding was introduced
        image_shape = image.shape[:2]  # h, w
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))
        if "grounding_info" in dataset_dict:
            # Map COCO annotation ids to their index within grounding_info.
            anno_id2id = dict()
            for id, obj in enumerate(dataset_dict['grounding_info']):
                obj["bbox_mode"] = BoxMode.XYWH_ABS
                anno_id2id[obj['id']] = id
            # id2class[i] collects the <seg>-slot indices instance i grounds.
            id2class = [[] for _ in range(len(dataset_dict['grounding_info']))]
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in dataset_dict["grounding_info"]
            ]
            # NOTE: does not support BitMask due to augmentation
            # Current BitMask cannot handle empty objects
            assert len(annos) > 0
            assert "segmentation" in annos[0]
            instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask")
            h, w = instances.image_size
            # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
            if hasattr(instances, 'gt_masks'):
                gt_masks = instances.gt_masks
                # gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
                instances.gt_masks = gt_masks.tensor
            # Randomly pick one (conversation, grounding_list) pair per sample.
            num_conversations = len(dataset_dict['conversations'])
            rd = np.random.choice(num_conversations)
            selected_conversation, grounding_list = dataset_dict['conversations'][rd]
            # The sample counts as grounded only when at least one <seg> slot
            # has a non-None grounding entry.
            if grounding_list is None:
                dataset_dict['grounding'] = False
            else:
                non_none = [1 for g in grounding_list if g is not None]
                if len(non_none) == 0:
                    dataset_dict['grounding'] = False
                else:
                    dataset_dict['grounding'] = True
            if dataset_dict['grounding']:
                # grounding_list must carry exactly one entry per <seg> token
                # appearing in the selected conversation.
                num_segs = sum([conv['value'].count('<seg>') for conv in selected_conversation])
                assert num_segs == len(grounding_list)
                for grounding_id, grounding in enumerate(grounding_list):
                    if grounding is not None:
                        for annid in grounding:
                            id2class[anno_id2id[annid]].append(grounding_id)
            instances.gt_classes = id2class
            dataset_dict["instances"] = instances
            dataset_dict['conversation'] = [selected_conversation]
            sources = preprocess_multimodal(
                copy.deepcopy(dataset_dict['conversation']),
                self.data_args)
            data_dict_conversation = self.preprocess(
                sources,
                self.tokenizer,
                has_image=True)
            data_dict_conversation = dict(input_ids=data_dict_conversation["input_ids"][0],
                                          labels=data_dict_conversation["labels"][0])
            dataset_dict.update(data_dict_conversation)
            dataset_dict['tokenizer'] = self.tokenizer
        return dataset_dict
================================================
FILE: datasets_os/dataset_mappers/flickr_instance_new_baseline_dataset_mapper_end.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging
import numpy as np
import torch
import PIL.Image as Image
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Instances
from pycocotools import mask as coco_mask
from llava.model.openseed.utils import configurable
from detectron2.structures import Boxes, BoxMode, PolygonMasks, RotatedBoxes
# from llava.train.train_hao_seg_flickr import ,preprocess
__all__ = ["COCOInstanceNewBaselineDatasetMapper"]
def convert_coco_poly_to_mask(segmentations, height, width):
    """
    Rasterize COCO polygon segmentations into a stacked boolean mask tensor.

    Args:
        segmentations: iterable of COCO polygon lists, one entry per instance.
        height: output mask height in pixels.
        width: output mask width in pixels.

    Returns:
        torch.Tensor: bool tensor of shape (N, height, width); an empty
        (0, height, width) tensor when `segmentations` is empty.
    """
    masks = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        mask = coco_mask.decode(rles)
        if len(mask.shape) < 3:
            mask = mask[..., None]  # promote single-part mask to (H, W, 1)
        mask = torch.as_tensor(mask, dtype=torch.uint8)
        # Collapse the per-polygon axis: a pixel is set if any part covers it.
        mask = mask.any(dim=2)
        masks.append(mask)
    if masks:
        masks = torch.stack(masks, dim=0)
    else:
        # BUGFIX: the empty case previously returned uint8 while the
        # non-empty case returns bool (result of .any()); use bool so the
        # dtype is consistent for downstream consumers.
        masks = torch.zeros((0, height, width), dtype=torch.bool)
    return masks
def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config.

    The pipeline is large-scale jitter (random resize within
    [MIN_SCALE, MAX_SCALE]) followed by a fixed-size crop. Random flipping is
    deliberately disabled (it was commented out upstream — presumably because
    flipping would invalidate left/right phrases in grounded captions;
    confirm before re-enabling).

    Args:
        cfg: config providing cfg['INPUT']['IMAGE_SIZE'/'MIN_SCALE'/'MAX_SCALE'].
        is_train: kept for interface compatibility; the original code built
            byte-identical pipelines in both the train and test branches, so
            the duplication is collapsed here and the flag is unused.

    Returns:
        list[Augmentation]
    """
    cfg_input = cfg['INPUT']
    image_size = cfg_input['IMAGE_SIZE']
    min_scale = cfg_input['MIN_SCALE']
    max_scale = cfg_input['MAX_SCALE']
    augmentation = [
        T.ResizeScale(
            min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
        ),
        T.FixedSizeCrop(crop_size=(image_size, image_size)),
    ]
    return augmentation
# This is specifically designed for the COCO dataset.
class COCOInstanceNewBaselineDatasetMapper:
"""
A callable which takes a dataset dict in Detectron2 Dataset format,
and map it into a format used by MaskFormer.
This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
The callable currently does the following:
1. Read the image from "file_name"
2. Applies geometric transforms to the image and annotation
3. Find and applies suitable cropping to the image and annotation
4. Prepare image and annotation to Tensors
"""
@configurable
def __init__(
self,
is_train=True,
*,
tfm_gens,
image_format,
tokenizer,
data_args,
preprocess,
):
"""
NOTE: this interface is experimental.
Args:
is_train: for training or inference
augmentations: a list of augmentations or deterministic transforms to apply
tfm_gens: data augmentation
image_format: an image format supported by :func:`detection_utils.read_image`.
"""
self.tfm_gens = tfm_gens
logging.getLogger(__name__).info(
"[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
)
self.img_format = image_format
self.is_train = is_train
self.tokenizer = tokenizer
self.processor = data_args.image_processor
self.data_args = data_args
gitextract_hcvjbjkn/
├── .gitignore
├── LICENSE
├── README.md
├── configs/
│ ├── openseed/
│ │ ├── openseed_swint_lang_joint.yaml
│ │ ├── openseed_swint_lang_joint_2st.yaml
│ │ └── openseed_swint_lang_joint_2st_visual_prompt.yaml
│ └── semsam/
│ └── visual_prompt_encoder.yaml
├── datasets_os/
│ ├── __init__.py
│ ├── build.py
│ ├── custom_dataset_dataloader.py
│ ├── dataset_mappers/
│ │ ├── __init__.py
│ │ ├── coco_instance_new_baseline_dataset_mapper.py
│ │ ├── coco_instruct_grounding_dataset_interactive_mapper.py
│ │ ├── coco_instruct_grounding_dataset_mapper.py
│ │ ├── coco_interactive_panoptic_new_baseline_dataset_mapper.py
│ │ ├── coco_panoptic_interactive_dataset_mapper.py
│ │ ├── coco_panoptic_new_baseline_dataset_mapper.py
│ │ ├── flickr_instance_new_baseline_dataset_mapper.py
│ │ ├── flickr_instance_new_baseline_dataset_mapper_.py
│ │ ├── flickr_instance_new_baseline_dataset_mapper_end.py
│ │ ├── flickr_new_baseline_dataset_mapper.py
│ │ ├── inference_mapper_with_gt.py
│ │ ├── llava_dataset_mapper.py
│ │ ├── refcoco_dataset_mapper.py
│ │ └── vg_instance_new_baseline_dataset_mapper.py
│ ├── refer.py
│ ├── registration/
│ │ ├── __init__.py
│ │ ├── register_coco_instruct_grounding_dataset.py
│ │ ├── register_coco_panoptic_annos_grounding_interactive.py
│ │ ├── register_flickr_dataset.py
│ │ └── register_vg_dataset.py
│ └── semseg_loader.py
├── docs/
│ └── MODEL_ZOO.md
├── gradio_demo/
│ ├── LLaVA_G_Demo.py
│ └── __init__.py
├── llava/
│ ├── __init__.py
│ ├── constants.py
│ ├── conversation.py
│ ├── eval/
│ │ ├── LLaVA_G_Eval.py
│ │ ├── eval_gpt_review.py
│ │ ├── eval_gpt_review_bench.py
│ │ ├── eval_gpt_review_visual.py
│ │ ├── eval_gpt_review_visual2.py
│ │ ├── eval_science_qa.py
│ │ ├── eval_science_qa_gpt4.py
│ │ ├── eval_science_qa_gpt4_requery.py
│ │ ├── generate_webpage_data_from_table.py
│ │ ├── llava_mapper.py
│ │ ├── model_qa.py
│ │ ├── model_vqa.py
│ │ ├── model_vqa_science.py
│ │ ├── qa_baseline_gpt35.py
│ │ ├── run_llava.py
│ │ ├── summarize_gpt_review.py
│ │ └── webpage/
│ │ ├── index.html
│ │ ├── script.js
│ │ └── styles.css
│ ├── mm_utils.py
│ ├── model/
│ │ ├── __init__.py
│ │ ├── apply_delta.py
│ │ ├── builder.py
│ │ ├── consolidate.py
│ │ ├── language_model/
│ │ │ ├── llava_llama.py
│ │ │ ├── llava_llama_gd.py
│ │ │ ├── llava_mpt.py
│ │ │ └── mpt/
│ │ │ ├── adapt_tokenizer.py
│ │ │ ├── attention.py
│ │ │ ├── blocks.py
│ │ │ ├── configuration_mpt.py
│ │ │ ├── custom_embedding.py
│ │ │ ├── flash_attn_triton.py
│ │ │ ├── hf_prefixlm_converter.py
│ │ │ ├── meta_init_context.py
│ │ │ ├── modeling_mpt.py
│ │ │ ├── norm.py
│ │ │ └── param_init_fns.py
│ │ ├── llava_arch.py
│ │ ├── make_delta.py
│ │ ├── multimodal_encoder/
│ │ │ ├── builder.py
│ │ │ └── clip_encoder.py
│ │ ├── openseed/
│ │ │ ├── BaseModel.py
│ │ │ ├── __init__.py
│ │ │ ├── architectures/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── openseed_model.py
│ │ │ │ ├── openseed_model_decouple_train.py
│ │ │ │ └── registry.py
│ │ │ ├── backbone/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── backbone.py
│ │ │ │ ├── build.py
│ │ │ │ ├── focal.py
│ │ │ │ ├── focal_dw.py
│ │ │ │ ├── registry.py
│ │ │ │ └── swin.py
│ │ │ ├── body/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── decoder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── build.py
│ │ │ │ │ ├── modules.py
│ │ │ │ │ ├── openseed_decoder.py
│ │ │ │ │ ├── openseed_decoder_decouple.py
│ │ │ │ │ ├── registry.py
│ │ │ │ │ └── utils/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── dino_decoder.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── encoder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── build.py
│ │ │ │ │ ├── encoder_deform.py
│ │ │ │ │ ├── ops/
│ │ │ │ │ │ ├── functions/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── ms_deform_attn_func.py
│ │ │ │ │ │ ├── make.sh
│ │ │ │ │ │ ├── modules/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── ms_deform_attn.py
│ │ │ │ │ │ ├── setup.py
│ │ │ │ │ │ ├── src/
│ │ │ │ │ │ │ ├── cpu/
│ │ │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp
│ │ │ │ │ │ │ │ └── ms_deform_attn_cpu.h
│ │ │ │ │ │ │ ├── cuda/
│ │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu
│ │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.h
│ │ │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh
│ │ │ │ │ │ │ ├── ms_deform_attn.h
│ │ │ │ │ │ │ └── vision.cpp
│ │ │ │ │ │ └── test.py
│ │ │ │ │ ├── registry.py
│ │ │ │ │ └── transformer_encoder_fpn.py
│ │ │ │ ├── openseed_head.py
│ │ │ │ ├── registry.py
│ │ │ │ └── transformer_blocks.py
│ │ │ ├── language/
│ │ │ │ ├── LangEncoder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── build.py
│ │ │ │ │ ├── registry.py
│ │ │ │ │ └── transformer.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── encoder.py
│ │ │ │ ├── registry.py
│ │ │ │ └── vlpencoder.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── attention.py
│ │ │ │ ├── criterion.py
│ │ │ │ ├── matcher.py
│ │ │ │ ├── point_features.py
│ │ │ │ ├── position_encoding.py
│ │ │ │ └── postprocessing.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── box_ops.py
│ │ │ ├── config.py
│ │ │ └── misc.py
│ │ ├── semsam/
│ │ │ ├── BaseModel.py
│ │ │ ├── __init__.py
│ │ │ ├── architectures/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── idino_model_partwhole_all_llm_ref_feats_all_det_pretrainv1.py
│ │ │ │ └── registry.py
│ │ │ ├── backbone/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── backbone.py
│ │ │ │ ├── build.py
│ │ │ │ ├── focal.py
│ │ │ │ ├── focal_dw.py
│ │ │ │ ├── registry.py
│ │ │ │ ├── swin.py
│ │ │ │ └── swin_new.py
│ │ │ ├── body/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── decoder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── build.py
│ │ │ │ │ ├── idino_decoder_no_iou_token_partwhole_all_llm.py
│ │ │ │ │ ├── modules.py
│ │ │ │ │ ├── registry.py
│ │ │ │ │ └── utils/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── dino_decoder.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── encoder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── build.py
│ │ │ │ │ ├── encoder_deform.py
│ │ │ │ │ ├── ops/
│ │ │ │ │ │ ├── functions/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── ms_deform_attn_func.py
│ │ │ │ │ │ ├── make.sh
│ │ │ │ │ │ ├── modules/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── ms_deform_attn.py
│ │ │ │ │ │ ├── setup.py
│ │ │ │ │ │ ├── src/
│ │ │ │ │ │ │ ├── cpu/
│ │ │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp
│ │ │ │ │ │ │ │ └── ms_deform_attn_cpu.h
│ │ │ │ │ │ │ ├── cuda/
│ │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu
│ │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.h
│ │ │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh
│ │ │ │ │ │ │ ├── ms_deform_attn.h
│ │ │ │ │ │ │ └── vision.cpp
│ │ │ │ │ │ └── test.py
│ │ │ │ │ ├── registry.py
│ │ │ │ │ └── transformer_encoder_fpn.py
│ │ │ │ ├── openseed_head.py
│ │ │ │ ├── registry.py
│ │ │ │ └── transformer_blocks.py
│ │ │ ├── language/
│ │ │ │ ├── LangEncoder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── build.py
│ │ │ │ │ ├── registry.py
│ │ │ │ │ └── transformer.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── build.py
│ │ │ │ ├── encoder.py
│ │ │ │ ├── fixencoder.py
│ │ │ │ ├── llama_encoder.py
│ │ │ │ ├── loss.py
│ │ │ │ ├── misc.py
│ │ │ │ ├── modeling_llama_os.py
│ │ │ │ ├── registry.py
│ │ │ │ └── vlpencoder.py
│ │ │ ├── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── attention.py
│ │ │ │ ├── criterion_id_llm.py
│ │ │ │ ├── hooks.py
│ │ │ │ ├── matcher.py
│ │ │ │ ├── point_features.py
│ │ │ │ ├── position_encoding.py
│ │ │ │ └── postprocessing.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ ├── box_ops.py
│ │ │ ├── config.py
│ │ │ └── misc.py
│ │ └── utils.py
│ ├── serve/
│ │ ├── __init__.py
│ │ ├── cli.py
│ │ ├── controller.py
│ │ ├── gradio_web_server.py
│ │ ├── register_worker.py
│ │ └── test_message.py
│ ├── train/
│ │ ├── llama_flash_attn_monkey_patch.py
│ │ ├── llava_trainer.py
│ │ ├── llava_trainer_gd.py
│ │ ├── llava_trainer_joint_train.py
│ │ ├── train.py
│ │ ├── train_grounding_1st.py
│ │ ├── train_joint_1st.py
│ │ ├── train_joint_2st.py
│ │ ├── train_joint_2st_interactive_refcoco_coco_instruction.py
│ │ └── train_mem.py
│ └── utils.py
├── pyproject.toml
├── scripts/
│ ├── convert_sqa_to_llava.py
│ ├── convert_sqa_to_llava_base_prompt.py
│ ├── finetune.sh
│ ├── finetune_visual_prompt.sh
│ ├── merge_lora_weights.py
│ └── pretrain_joint.sh
└── utils/
├── Config.py
├── __init__.py
├── arguments.py
├── constants.py
├── constants_ori.py
├── dist.py
├── distributed.py
├── misc.py
├── model.py
├── nms.py
├── prompt_engineering.py
├── utils.py
└── visualizer.py
SYMBOL INDEX (1862 symbols across 193 files)
FILE: datasets_os/build.py
class JointLoader (line 40) | class JointLoader(torchdata.IterableDataset):
method __init__ (line 41) | def __init__(self, loaders, key_dataset):
method __iter__ (line 51) | def __iter__(self):
method __len__ (line 55) | def __len__(self):
function filter_images_with_only_crowd_annotations (line 58) | def filter_images_with_only_crowd_annotations(dataset_dicts, dataset_nam...
function get_detection_dataset_dicts (line 94) | def get_detection_dataset_dicts(
function _test_loader_from_config (line 135) | def _test_loader_from_config(cfg, dataset_name, mapper=None):
function build_detection_test_loader (line 167) | def build_detection_test_loader(
function _train_loader_from_config (line 232) | def _train_loader_from_config(cfg, dataset_name, mapper, *, dataset=None...
function build_detection_train_loader (line 263) | def build_detection_train_loader(
function get_config_from_name (line 310) | def get_config_from_name(cfg, dataset_name):
function build_train_dataloader (line 382) | def build_train_dataloader(cfg,tokenizer=None,data_args=None,preprocess=...
FILE: datasets_os/custom_dataset_dataloader.py
function _custom_test_loader_from_config (line 35) | def _custom_test_loader_from_config(cfg, dataset_name, mapper=None):
function build_custom_test_loader (line 61) | def build_custom_test_loader(
function trivial_batch_collator (line 89) | def trivial_batch_collator(batch):
function _custom_train_loader_from_config (line 93) | def _custom_train_loader_from_config(cfg, mapper=None, *, dataset=None, ...
function build_custom_train_loader (line 150) | def build_custom_train_loader(
function build_multi_dataset_batch_data_loader (line 191) | def build_multi_dataset_batch_data_loader(
function get_detection_dataset_dicts_with_source (line 221) | def get_detection_dataset_dicts_with_source(
class MultiDatasetSampler (line 256) | class MultiDatasetSampler(Sampler):
method __init__ (line 257) | def __init__(
method __iter__ (line 313) | def __iter__(self):
method _infinite_indices (line 319) | def _infinite_indices(self):
class MDAspectRatioGroupedDataset (line 331) | class MDAspectRatioGroupedDataset(torch.utils.data.IterableDataset):
method __init__ (line 332) | def __init__(self, dataset, batch_size, num_datasets):
method __iter__ (line 339) | def __iter__(self):
class DIFFMDAspectRatioGroupedDataset (line 351) | class DIFFMDAspectRatioGroupedDataset(torch.utils.data.IterableDataset):
method __init__ (line 352) | def __init__(self, dataset, batch_sizes, num_datasets):
method __iter__ (line 359) | def __iter__(self):
function repeat_factors_from_tag_frequency (line 371) | def repeat_factors_from_tag_frequency(dataset_dicts, repeat_thresh):
FILE: datasets_os/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py
function convert_coco_poly_to_mask (line 21) | def convert_coco_poly_to_mask(segmentations, height, width):
function build_transform_gen (line 38) | def build_transform_gen(cfg, is_train):
class COCOInstanceNewBaselineDatasetMapper (line 72) | class COCOInstanceNewBaselineDatasetMapper:
method __init__ (line 88) | def __init__(
method from_config (line 112) | def from_config(cls, cfg, is_train=True):
method __call__ (line 123) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/coco_instruct_grounding_dataset_interactive_mapper.py
function convert_coco_poly_to_mask (line 26) | def convert_coco_poly_to_mask(segmentations, height, width):
function preprocess_multimodal (line 42) | def preprocess_multimodal(
function build_transform_gen (line 65) | def build_transform_gen(cfg, is_train):
class COCOInstanceNewBaselineDatasetMapper (line 121) | class COCOInstanceNewBaselineDatasetMapper:
method __init__ (line 137) | def __init__(
method from_config (line 172) | def from_config(cls, cfg, is_train=True,tokenizer=None,data_args=None,...
method __call__ (line 187) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/coco_instruct_grounding_dataset_mapper.py
function convert_coco_poly_to_mask (line 37) | def convert_coco_poly_to_mask(segmentations, height, width):
function preprocess_multimodal (line 53) | def preprocess_multimodal(
function build_transform_gen (line 76) | def build_transform_gen(cfg, is_train):
class COCOInstanceNewBaselineDatasetMapper (line 132) | class COCOInstanceNewBaselineDatasetMapper:
method __init__ (line 148) | def __init__(
method from_config (line 181) | def from_config(cls, cfg, is_train=True,tokenizer=None,data_args=None,...
method __call__ (line 196) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/coco_interactive_panoptic_new_baseline_dataset_mapper.py
function filter_empty_instances_by_box (line 21) | def filter_empty_instances_by_box(
function build_transform_gen (line 42) | def build_transform_gen(cfg, is_train):
class COCOInteractivePanopticNewBaselineDatasetMapper (line 75) | class COCOInteractivePanopticNewBaselineDatasetMapper:
method __init__ (line 91) | def __init__(
method from_config (line 118) | def from_config(cls, cfg, is_train=True):
method __call__ (line 129) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/coco_panoptic_interactive_dataset_mapper.py
function filter_empty_instances_by_box (line 24) | def filter_empty_instances_by_box(
function build_transform_gen (line 45) | def build_transform_gen(cfg, is_train):
function convert_coco_poly_to_mask (line 78) | def convert_coco_poly_to_mask(segmentations, height, width):
class COCOPanopticInteractiveDatasetMapper (line 96) | class COCOPanopticInteractiveDatasetMapper:
method __init__ (line 112) | def __init__(
method from_config (line 159) | def from_config(cls, cfg, is_train=True,tokenizer=None,data_args=None,...
method __call__ (line 177) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py
function build_transform_gen (line 21) | def build_transform_gen(cfg, is_train):
class COCOPanopticNewBaselineDatasetMapper (line 54) | class COCOPanopticNewBaselineDatasetMapper:
method __init__ (line 70) | def __init__(
method from_config (line 97) | def from_config(cls, cfg, is_train=True):
method __call__ (line 108) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/flickr_instance_new_baseline_dataset_mapper.py
function convert_coco_poly_to_mask (line 22) | def convert_coco_poly_to_mask(segmentations, height, width):
function build_transform_gen (line 39) | def build_transform_gen(cfg, is_train):
class COCOInstanceNewBaselineDatasetMapper (line 95) | class COCOInstanceNewBaselineDatasetMapper:
method __init__ (line 111) | def __init__(
method from_config (line 144) | def from_config(cls, cfg, is_train=True,tokenizer=None,data_args=None,...
method __call__ (line 159) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/flickr_instance_new_baseline_dataset_mapper_.py
function convert_coco_poly_to_mask (line 25) | def convert_coco_poly_to_mask(segmentations, height, width):
function preprocess_multimodal (line 41) | def preprocess_multimodal(
function build_transform_gen (line 64) | def build_transform_gen(cfg, is_train):
class COCOInstanceNewBaselineDatasetMapper (line 120) | class COCOInstanceNewBaselineDatasetMapper:
method __init__ (line 136) | def __init__(
method from_config (line 167) | def from_config(cls, cfg, is_train=True,tokenizer=None,data_args=None,...
method __call__ (line 181) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/flickr_instance_new_baseline_dataset_mapper_end.py
function convert_coco_poly_to_mask (line 22) | def convert_coco_poly_to_mask(segmentations, height, width):
function build_transform_gen (line 39) | def build_transform_gen(cfg, is_train):
class COCOInstanceNewBaselineDatasetMapper (line 95) | class COCOInstanceNewBaselineDatasetMapper:
method __init__ (line 111) | def __init__(
method from_config (line 142) | def from_config(cls, cfg, is_train=True,tokenizer=None,data_args=None,...
method __call__ (line 156) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/flickr_new_baseline_dataset_mapper.py
function filter_empty_instances_by_box (line 21) | def filter_empty_instances_by_box(
function build_transform_gen (line 42) | def build_transform_gen(cfg, is_train):
class COCOInteractivePanopticNewBaselineDatasetMapper (line 75) | class COCOInteractivePanopticNewBaselineDatasetMapper:
method __init__ (line 91) | def __init__(
method from_config (line 118) | def from_config(cls, cfg, is_train=True):
method __call__ (line 129) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/inference_mapper_with_gt.py
class CoCoInferenceDatasetMapper (line 21) | class CoCoInferenceDatasetMapper:
method __init__ (line 39) | def __init__(
method from_config (line 87) | def from_config(cls, cfg, is_train: bool = True):
method _transform_annotations (line 116) | def _transform_annotations(self, dataset_dict, transforms, image_shape):
method __call__ (line 145) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/llava_dataset_mapper.py
function convert_coco_poly_to_mask (line 22) | def convert_coco_poly_to_mask(segmentations, height, width):
function build_transform_gen (line 39) | def build_transform_gen(cfg, is_train):
class COCOInstanceNewBaselineDatasetMapper (line 95) | class COCOInstanceNewBaselineDatasetMapper:
method __init__ (line 111) | def __init__(
method from_config (line 135) | def from_config(cls, cfg, is_train=True):
method __call__ (line 146) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/refcoco_dataset_mapper.py
function build_transform_gen (line 26) | def build_transform_gen(cfg, is_train):
class RefCOCODatasetMapper (line 61) | class RefCOCODatasetMapper:
method __init__ (line 77) | def __init__(
method from_config (line 108) | def from_config(cls, cfg, is_train=True):
method __call__ (line 126) | def __call__(self, dataset_dict):
FILE: datasets_os/dataset_mappers/vg_instance_new_baseline_dataset_mapper.py
function convert_coco_poly_to_mask (line 22) | def convert_coco_poly_to_mask(segmentations, height, width):
function build_transform_gen (line 39) | def build_transform_gen(cfg, is_train):
class COCOInstanceNewBaselineDatasetMapper (line 95) | class COCOInstanceNewBaselineDatasetMapper:
method __init__ (line 111) | def __init__(
method from_config (line 144) | def from_config(cls, cfg, is_train=True,tokenizer=None,data_args=None,...
method __call__ (line 159) | def __call__(self, dataset_dict):
FILE: datasets_os/refer.py
class REFER (line 45) | class REFER:
method __init__ (line 46) | def __init__(self, data_root, dataset='refcoco', splitBy='unc'):
method createIndex (line 79) | def createIndex(self):
method getRefIds (line 143) | def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''):
method getAnnIds (line 176) | def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
method getImgIds (line 198) | def getImgIds(self, ref_ids=[]):
method getCatIds (line 208) | def getCatIds(self):
method loadRefs (line 211) | def loadRefs(self, ref_ids=[]):
method loadAnns (line 217) | def loadAnns(self, ann_ids=[]):
method loadImgs (line 223) | def loadImgs(self, image_ids=[]):
method loadCats (line 229) | def loadCats(self, cat_ids=[]):
method getRefBox (line 235) | def getRefBox(self, ref_id):
method showRef (line 240) | def showRef(self, ref, seg_box='seg'):
method getMask (line 286) | def getMask(self, ref):
method showMask (line 338) | def showMask(self, ref):
FILE: datasets_os/registration/register_coco_instruct_grounding_dataset.py
function get_metadata (line 41) | def get_metadata():
function load_coco_json (line 46) | def load_coco_json(image_root, annot_json,conversation, metadata):
function register_coco (line 112) | def register_coco(
function register_all_coco (line 128) | def register_all_coco(root):
FILE: datasets_os/registration/register_coco_panoptic_annos_grounding_interactive.py
function get_metadata (line 27) | def get_metadata():
function load_coco_panoptic_json (line 71) | def load_coco_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, gr...
function register_coco_panoptic_annos_caption_grounding_sem_seg (line 137) | def register_coco_panoptic_annos_caption_grounding_sem_seg(
function register_all_coco_panoptic_annos_caption_grounding_sem_seg (line 169) | def register_all_coco_panoptic_annos_caption_grounding_sem_seg(root):
FILE: datasets_os/registration/register_flickr_dataset.py
function get_metadata (line 29) | def get_metadata():
function load_flickr_json (line 34) | def load_flickr_json(image_root, annot_json, metadata):
function register_flickr (line 68) | def register_flickr(
function register_all_flickr (line 84) | def register_all_flickr(root,anno_root):
FILE: datasets_os/registration/register_vg_dataset.py
function get_metadata (line 25) | def get_metadata():
function load_vg_json (line 30) | def load_vg_json(image_root, annot_json, metadata):
function register_vg (line 59) | def register_vg(
function register_all_vg (line 75) | def register_all_vg(root,anno_root):
FILE: datasets_os/semseg_loader.py
function load_semseg (line 5) | def load_semseg(filename, loader_type):
FILE: gradio_demo/LLaVA_G_Demo.py
function get_image_name (line 13) | def get_image_name(dir_save="./gradio_demo/tmp_files", prefix="click_img...
function preprocess_multi_conv (line 21) | def preprocess_multi_conv(
function filter_empty_box_mask (line 65) | def filter_empty_box_mask(text, boxes_image, masks_image):
class InferenceDemo (line 101) | class InferenceDemo(object):
method __init__ (line 102) | def __init__(self,
method hitory2datadict (line 114) | def hitory2datadict(self, history, text):
method inference (line 161) | def inference(self, data_dict):
function generate_distinct_colors (line 176) | def generate_distinct_colors(count):
function add_text (line 192) | def add_text(history, text, image, threshold_slider, temporature_slider,...
function add_image (line 371) | def add_image(history, image):
function add_interaction_click (line 383) | def add_interaction_click(history, image, interaction_selector):
function bot (line 397) | def bot(history):
function clear_history (line 400) | def clear_history(history, txt, img):
function clear_response (line 402) | def clear_response(history):
function upvote_one (line 412) | def upvote_one(history):
function downvote_one (line 415) | def downvote_one(history):
function flag_one (line 418) | def flag_one(history):
FILE: llava/conversation.py
class SeparatorStyle (line 6) | class SeparatorStyle(Enum):
class Conversation (line 16) | class Conversation:
method get_prompt (line 29) | def get_prompt(self):
method append_message (line 106) | def append_message(self, role, message):
method get_images (line 109) | def get_images(self, return_pil=False):
method to_gradio_chatbot (line 158) | def to_gradio_chatbot(self):
method copy (line 191) | def copy(self):
method dict (line 202) | def dict(self):
FILE: llava/eval/LLaVA_G_Eval.py
function load_jsonl_file (line 22) | def load_jsonl_file(path_jsonl):
function save_jsonl_file (line 29) | def save_jsonl_file(data, path_save):
function load_benchmark (line 34) | def load_benchmark(image_root, path_benchmark):
function preprocess_v1 (line 66) | def preprocess_v1(
class Evaluator_MM (line 150) | class Evaluator_MM:
method __init__ (line 151) | def __init__(self,
method construct_model (line 176) | def construct_model(self, model_path, model_base=None, model_name=None...
method construct_vision_model (line 302) | def construct_vision_model(self, path_vision_model_cfg):
method load_parameters (line 384) | def load_parameters(self, path_model):
method evaluate_sample (line 392) | def evaluate_sample(self, input_data, get_box=True, get_mask=False):
class Evaluator_MM_Inter (line 402) | class Evaluator_MM_Inter(Evaluator_MM):
method __init__ (line 403) | def __init__(self, model_path, path_vision_model_cfg=None, path_inter_...
method construct_model (line 406) | def construct_model(self, model_path, model_base=None, model_name=None...
method construct_vision_model (line 532) | def construct_vision_model(self, path_vision_model_cfg):
method evaluate_sample (line 618) | def evaluate_sample(self, input_data):
function formatting (line 623) | def formatting(text, boxes, question_id):
function evaluate_ (line 695) | def evaluate_(path_benchmarks, dir_image, evaluator, matching_threshold):
function evaluate (line 792) | def evaluate(args=None):
FILE: llava/eval/eval_gpt_review.py
function get_eval (line 13) | def get_eval(content: str, max_tokens: int):
function parse_score (line 39) | def parse_score(review):
FILE: llava/eval/eval_gpt_review_bench.py
function get_eval (line 16) | def get_eval(content: str, max_tokens: int):
function parse_score (line 41) | def parse_score(review):
FILE: llava/eval/eval_gpt_review_visual.py
function get_eval (line 15) | def get_eval(content: str, max_tokens: int):
function parse_score (line 40) | def parse_score(review):
FILE: llava/eval/eval_gpt_review_visual2.py
function get_eval (line 15) | def get_eval(content: str, max_tokens: int):
function parse_score (line 40) | def parse_score(review):
FILE: llava/eval/eval_science_qa.py
function get_args (line 8) | def get_args():
function convert_caps (line 19) | def convert_caps(results):
function get_pred_idx (line 28) | def get_pred_idx(prediction, choices, options):
FILE: llava/eval/eval_science_qa_gpt4.py
function get_args (line 9) | def get_args():
function convert_caps (line 19) | def convert_caps(results):
function get_pred_idx (line 28) | def get_pred_idx(prediction, choices, options):
FILE: llava/eval/eval_science_qa_gpt4_requery.py
function get_args (line 9) | def get_args():
function convert_caps (line 21) | def convert_caps(results):
function get_pred_idx (line 30) | def get_pred_idx(prediction, choices, options):
FILE: llava/eval/generate_webpage_data_from_table.py
function read_jsonl (line 10) | def read_jsonl(path: str, key: str=None):
function trim_hanging_lines (line 23) | def trim_hanging_lines(s: str, n: int) -> str:
FILE: llava/eval/llava_mapper.py
function convert_coco_poly_to_mask (line 25) | def convert_coco_poly_to_mask(segmentations, height, width):
function preprocess_multimodal (line 41) | def preprocess_multimodal(
function build_transform_gen (line 63) | def build_transform_gen(cfg, is_train):
class COCOInstanceNewBaselineDatasetMapper (line 119) | class COCOInstanceNewBaselineDatasetMapper:
method __init__ (line 135) | def __init__(
method from_config (line 165) | def from_config(cls, cfg, is_train=True,tokenizer=None,image_processor...
method __call__ (line 179) | def __call__(self, dataset_dict):
FILE: llava/eval/model_qa.py
class KeywordsStoppingCriteria (line 14) | class KeywordsStoppingCriteria(StoppingCriteria):
method __init__ (line 15) | def __init__(self, keywords, tokenizer, input_ids):
method __call__ (line 21) | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTe...
function eval_model (line 33) | def eval_model(model_name, questions_file, answers_file):
FILE: llava/eval/model_vqa.py
function split_list (line 18) | def split_list(lst, n):
function get_chunk (line 24) | def get_chunk(lst, n, k):
function eval_model (line 29) | def eval_model(args):
FILE: llava/eval/model_vqa_science.py
function split_list (line 18) | def split_list(lst, n):
function get_chunk (line 24) | def get_chunk(lst, n, k):
function eval_model (line 29) | def eval_model(args):
FILE: llava/eval/qa_baseline_gpt35.py
function get_answer (line 16) | def get_answer(question_id: int, question: str, max_tokens: int):
FILE: llava/eval/run_llava.py
function load_image (line 17) | def load_image(image_file):
function eval_model (line 26) | def eval_model(args):
FILE: llava/eval/summarize_gpt_review.py
function parse_args (line 9) | def parse_args():
FILE: llava/eval/webpage/script.js
function text2Markdown (line 35) | function text2Markdown(text) {
function capitalizeFirstChar (line 41) | function capitalizeFirstChar(str) {
function updateQuestionSelect (line 48) | function updateQuestionSelect(question_id) {
function updateModelSelect (line 64) | function updateModelSelect() {
function populateModels (line 70) | function populateModels(models) {
function populateQuestions (line 81) | function populateQuestions(questions) {
function displayQuestion (line 110) | function displayQuestion(index) {
function displayAnswers (line 116) | function displayAnswers(index) {
function switchQuestionAndCategory (line 203) | function switchQuestionAndCategory() {
function updateExpandButtonVisibility (line 226) | function updateExpandButtonVisibility(card) {
FILE: llava/mm_utils.py
function load_image_from_base64 (line 10) | def load_image_from_base64(image):
function process_images (line 14) | def process_images(images, image_processor, model_cfg):
function tokenizer_image_token (line 18) | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOK...
function tokenizer_image_token_inter (line 39) | def tokenizer_image_token_inter(prompt, tokenizer, image_token_index=IMA...
function get_model_name_from_path (line 60) | def get_model_name_from_path(model_path):
class KeywordsStoppingCriteria (line 71) | class KeywordsStoppingCriteria(StoppingCriteria):
method __init__ (line 72) | def __init__(self, keywords, tokenizer, input_ids):
method __call__ (line 83) | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTe...
FILE: llava/model/apply_delta.py
function apply_delta (line 13) | def apply_delta(base_model_path, target_model_path, delta_path):
FILE: llava/model/builder.py
function load_pretrained_model (line 25) | def load_pretrained_model(model_path, model_base, model_name, load_8bit=...
FILE: llava/model/consolidate.py
function consolidate_ckpt (line 13) | def consolidate_ckpt(src_path, dst_path):
FILE: llava/model/language_model/llava_llama.py
class LlavaConfig (line 30) | class LlavaConfig(LlamaConfig):
class LlavaLlamaModel (line 34) | class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
method __init__ (line 37) | def __init__(self, config: LlamaConfig):
class LlavaLlamaForCausalLM (line 41) | class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
method __init__ (line 44) | def __init__(self, config):
method get_model (line 53) | def get_model(self):
method forward (line 56) | def forward(
method prepare_inputs_for_generation (line 122) | def prepare_inputs_for_generation(
FILE: llava/model/language_model/llava_llama_gd.py
class DataCollatorForSupervisedDataset (line 31) | class DataCollatorForSupervisedDataset(object):
method __call__ (line 36) | def __call__(self, instances,tokenizer):
class LlavaConfig (line 63) | class LlavaConfig(LlamaConfig):
class LlavaLlamaModel (line 67) | class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
method __init__ (line 70) | def __init__(self, config: LlamaConfig):
class LlavaLlamaForCausalLM (line 74) | class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
method __init__ (line 77) | def __init__(self, config):
method get_model (line 86) | def get_model(self):
method forward (line 89) | def forward(
method prepare_inputs_for_generation (line 150) | def prepare_inputs_for_generation(
class LlavaLlamaForCausalLM_gd (line 172) | class LlavaLlamaForCausalLM_gd(LlamaForCausalLM, LlavaMetaForCausalLM_gd):
method __init__ (line 175) | def __init__(self, config):
method get_model (line 184) | def get_model(self):
method forward (line 187) | def forward(self,**batched_inputs):
method forward_inner (line 206) | def forward_inner(
method prepare_inputs_for_generation (line 348) | def prepare_inputs_for_generation(
method forward_eval (line 370) | def forward_eval(self, inputs):
method forward_inner_eval (line 376) | def forward_inner_eval(
method auto_regressive_generate (line 407) | def auto_regressive_generate(self,
class LlavaLlamaForCausalLM_joint (line 470) | class LlavaLlamaForCausalLM_joint(LlavaLlamaForCausalLM_gd):
method forward (line 471) | def forward(self,**batched_inputs):
method forward_inner (line 491) | def forward_inner(
class LlavaLlamaForCausalLM_joint_2st (line 599) | class LlavaLlamaForCausalLM_joint_2st(LlavaLlamaForCausalLM_gd):
method forward (line 600) | def forward(self,**batched_inputs):
method forward_inner (line 620) | def forward_inner(
class LlavaLlamaForCausalLM_joint_2st_it_only_ref_instr (line 737) | class LlavaLlamaForCausalLM_joint_2st_it_only_ref_instr(LlamaForCausalLM...
method __init__ (line 740) | def __init__(self, config):
method get_model (line 749) | def get_model(self):
method forward (line 752) | def forward(self,**batched_inputs):
method forward_inner (line 790) | def forward_inner(
method forward_eval (line 870) | def forward_eval(self, batched_inputs):
method forward_inner_eval (line 891) | def forward_inner_eval(
method forward_inner_eval_interactive (line 922) | def forward_inner_eval_interactive(
method auto_regressive_generate (line 967) | def auto_regressive_generate(self,
FILE: llava/model/language_model/llava_mpt.py
class LlavaMPTConfig (line 30) | class LlavaMPTConfig(MPTConfig):
class LlavaMPTModel (line 34) | class LlavaMPTModel(LlavaMetaModel, MPTModel):
method __init__ (line 37) | def __init__(self, config: MPTConfig):
method embed_tokens (line 41) | def embed_tokens(self, x):
class LlavaMPTForCausalLM (line 45) | class LlavaMPTForCausalLM(MPTForCausalLM, LlavaMetaForCausalLM):
method __init__ (line 49) | def __init__(self, config):
method get_model (line 65) | def get_model(self):
method _set_gradient_checkpointing (line 68) | def _set_gradient_checkpointing(self, module, value=False):
method forward (line 72) | def forward(self, input_ids: torch.LongTensor, past_key_values: Option...
method prepare_inputs_for_generation (line 91) | def prepare_inputs_for_generation(self, input_ids, past_key_values=Non...
FILE: llava/model/language_model/mpt/adapt_tokenizer.py
function adapt_tokenizer_for_denoising (line 6) | def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
class AutoTokenizerForMOD (line 25) | class AutoTokenizerForMOD(AutoTokenizer):
method from_pretrained (line 37) | def from_pretrained(cls, *args, **kwargs):
FILE: llava/model/language_model/mpt/attention.py
function _reset_is_causal (line 12) | def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, origina...
function scaled_multihead_dot_product_attention (line 20) | def scaled_multihead_dot_product_attention(query, key, value, n_heads, p...
function check_valid_inputs (line 64) | def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bflo...
function flash_attn_fn (line 71) | def flash_attn_fn(query, key, value, n_heads, past_key_value=None, softm...
function triton_flash_attn_fn (line 107) | def triton_flash_attn_fn(query, key, value, n_heads, past_key_value=None...
class MultiheadAttention (line 151) | class MultiheadAttention(nn.Module):
method __init__ (line 158) | def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton'...
method forward (line 191) | def forward(self, x, past_key_value=None, attn_bias=None, attention_ma...
class MultiQueryAttention (line 204) | class MultiQueryAttention(nn.Module):
method __init__ (line 211) | def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton'...
method forward (line 245) | def forward(self, x, past_key_value=None, attn_bias=None, attention_ma...
function attn_bias_shape (line 258) | def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causa...
function build_attn_bias (line 272) | def build_attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False...
function gen_slopes (line 283) | def gen_slopes(n_heads, alibi_bias_max=8, device=None):
function build_alibi_bias (line 292) | def build_alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, dev...
FILE: llava/model/language_model/mpt/blocks.py
class MPTMLP (line 8) | class MPTMLP(nn.Module):
method __init__ (line 10) | def __init__(self, d_model: int, expansion_ratio: int, device: Optiona...
method forward (line 17) | def forward(self, x):
class MPTBlock (line 20) | class MPTBlock(nn.Module):
method __init__ (line 22) | def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, a...
method forward (line 34) | def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torc...
FILE: llava/model/language_model/mpt/configuration_mpt.py
class MPTConfig (line 7) | class MPTConfig(PretrainedConfig):
method __init__ (line 10) | def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=2...
method _set_config_defaults (line 90) | def _set_config_defaults(self, config, config_defaults):
method _validate_config (line 96) | def _validate_config(self):
FILE: llava/model/language_model/mpt/custom_embedding.py
class SharedEmbedding (line 6) | class SharedEmbedding(nn.Embedding):
method forward (line 8) | def forward(self, input: Tensor, unembed: bool=False) -> Tensor:
FILE: llava/model/language_model/mpt/flash_attn_triton.py
function _fwd_kernel (line 51) | def _fwd_kernel(Q, K, V, Bias, Out, Lse, TMP, softmax_scale, stride_qb, ...
function _bwd_preprocess_do_o_dot (line 155) | def _bwd_preprocess_do_o_dot(Out, DO, Delta, stride_ob, stride_oh, strid...
function _bwd_store_dk_dv (line 168) | def _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k,...
function _bwd_kernel_one_col_block (line 184) | def _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LS...
function init_to_zero (line 300) | def init_to_zero(name):
function _bwd_kernel (line 306) | def _bwd_kernel(Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, st...
function _flash_attn_forward (line 329) | def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=...
function _flash_attn_backward (line 366) | def _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=None, cau...
class FlashAttnQKVPackedFunc (line 401) | class FlashAttnQKVPackedFunc(torch.autograd.Function):
method forward (line 404) | def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
method backward (line 419) | def backward(ctx, do):
class FlashAttnKVPackedFunc (line 428) | class FlashAttnKVPackedFunc(torch.autograd.Function):
method forward (line 431) | def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
method backward (line 446) | def backward(ctx, do):
class FlashAttnFunc (line 457) | class FlashAttnFunc(torch.autograd.Function):
method forward (line 460) | def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
method backward (line 475) | def backward(ctx, do):
FILE: llava/model/language_model/mpt/hf_prefixlm_converter.py
function _convert_gpt_causal_lm_to_prefix_lm (line 29) | def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUS...
function _convert_bloom_causal_lm_to_prefix_lm (line 113) | def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> Bl...
function _convert_opt_causal_lm_to_prefix_lm (line 269) | def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTFor...
function convert_hf_causal_lm_to_prefix_lm (line 335) | def convert_hf_causal_lm_to_prefix_lm(model: CAUSAL_LM_TYPES) -> CAUSAL_...
function add_bidirectional_mask_if_missing (line 401) | def add_bidirectional_mask_if_missing(batch: Dict[str, Any]):
FILE: llava/model/language_model/mpt/meta_init_context.py
function init_empty_weights (line 6) | def init_empty_weights(include_buffers: bool=False):
function init_on_device (line 37) | def init_on_device(device: torch.device, include_buffers: bool=False):
FILE: llava/model/language_model/mpt/modeling_mpt.py
class MPTPreTrainedModel (line 28) | class MPTPreTrainedModel(PreTrainedModel):
class MPTModel (line 33) | class MPTModel(MPTPreTrainedModel):
method __init__ (line 35) | def __init__(self, config: MPTConfig):
method get_input_embeddings (line 81) | def get_input_embeddings(self):
method set_input_embeddings (line 84) | def set_input_embeddings(self, value):
method _attn_bias (line 88) | def _attn_bias(self, device, dtype, attention_mask: Optional[torch.Byt...
method _apply_prefix_mask (line 119) | def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: tor...
method _apply_sequence_id (line 134) | def _apply_sequence_id(self, attn_bias: torch.Tensor, sequence_id: tor...
method forward (line 144) | def forward(self, input_ids: torch.LongTensor, past_key_values: Option...
method param_init_fn (line 222) | def param_init_fn(self, module):
method fsdp_wrap_fn (line 226) | def fsdp_wrap_fn(self, module):
method activation_checkpointing_fn (line 229) | def activation_checkpointing_fn(self, module):
class MPTForCausalLM (line 232) | class MPTForCausalLM(MPTPreTrainedModel):
method __init__ (line 234) | def __init__(self, config: MPTConfig):
method get_input_embeddings (line 255) | def get_input_embeddings(self):
method set_input_embeddings (line 258) | def set_input_embeddings(self, value):
method get_output_embeddings (line 261) | def get_output_embeddings(self):
method set_output_embeddings (line 264) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 267) | def set_decoder(self, decoder):
method get_decoder (line 270) | def get_decoder(self):
method forward (line 273) | def forward(self, input_ids: torch.LongTensor, past_key_values: Option...
method param_init_fn (line 291) | def param_init_fn(self, module):
method fsdp_wrap_fn (line 295) | def fsdp_wrap_fn(self, module):
method activation_checkpointing_fn (line 298) | def activation_checkpointing_fn(self, module):
method prepare_inputs_for_generation (line 301) | def prepare_inputs_for_generation(self, input_ids, past_key_values=Non...
method _reorder_cache (line 322) | def _reorder_cache(past_key_values, beam_idx):
FILE: llava/model/language_model/mpt/norm.py
function _cast_if_autocast_enabled (line 3) | def _cast_if_autocast_enabled(tensor):
class LPLayerNorm (line 14) | class LPLayerNorm(torch.nn.LayerNorm):
method __init__ (line 16) | def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=Tru...
method forward (line 19) | def forward(self, x):
function rms_norm (line 27) | def rms_norm(x, weight=None, eps=1e-05):
class RMSNorm (line 33) | class RMSNorm(torch.nn.Module):
method __init__ (line 35) | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=Non...
method forward (line 43) | def forward(self, x):
class LPRMSNorm (line 46) | class LPRMSNorm(RMSNorm):
method __init__ (line 48) | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=Non...
method forward (line 51) | def forward(self, x):
FILE: llava/model/language_model/mpt/param_init_fns.py
function torch_default_param_init_fn_ (line 10) | def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kw...
function fused_init_helper_ (line 17) | def fused_init_helper_(module: nn.Module, init_fn_):
function generic_param_init_fn_ (line 28) | def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d...
function _normal_init_ (line 121) | def _normal_init_(std, mean=0.0):
function _normal_param_init_fn_ (line 124) | def _normal_param_init_fn_(module: nn.Module, std: float, n_layers: int,...
function baseline_param_init_fn_ (line 131) | def baseline_param_init_fn_(module: nn.Module, init_std: float, n_layers...
function small_param_init_fn_ (line 137) | def small_param_init_fn_(module: nn.Module, n_layers: int, d_model: int,...
function neox_param_init_fn_ (line 142) | def neox_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, ...
function kaiming_uniform_param_init_fn_ (line 155) | def kaiming_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_m...
function kaiming_normal_param_init_fn_ (line 162) | def kaiming_normal_param_init_fn_(module: nn.Module, n_layers: int, d_mo...
function xavier_uniform_param_init_fn_ (line 169) | def xavier_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_mo...
function xavier_normal_param_init_fn_ (line 176) | def xavier_normal_param_init_fn_(module: nn.Module, n_layers: int, d_mod...
FILE: llava/model/llava_arch.py
class LlavaMetaModel (line 31) | class LlavaMetaModel:
method __init__ (line 33) | def __init__(self, config):
method get_vision_tower (line 40) | def get_vision_tower(self):
method initialize_vision_modules (line 46) | def initialize_vision_modules(self, model_args, fsdp=None):
class LlavaMetaForCausalLM (line 78) | class LlavaMetaForCausalLM(ABC):
method get_model (line 81) | def get_model(self):
method get_vision_tower (line 84) | def get_vision_tower(self):
method encode_images (line 87) | def encode_images(self, images):
method prepare_inputs_labels_for_multimodal (line 92) | def prepare_inputs_labels_for_multimodal(
method initialize_vision_tokenizer (line 217) | def initialize_vision_tokenizer(self, model_args, tokenizer):
class LlavaMetaForCausalLM_gd (line 298) | class LlavaMetaForCausalLM_gd(ABC):
method get_model (line 301) | def get_model(self):
method get_vision_tower (line 304) | def get_vision_tower(self):
method encode_images (line 307) | def encode_images(self, images):
method prepare_inputs_labels_for_multimodal (line 312) | def prepare_inputs_labels_for_multimodal(
method initialize_vision_tokenizer (line 438) | def initialize_vision_tokenizer(self, model_args, tokenizer):
method initialize_seg_modules (line 513) | def initialize_seg_modules(self, cfg):
method freeze_seg_modules (line 518) | def freeze_seg_modules(self):
class LlavaMetaForCausalLM_gd_interactive (line 523) | class LlavaMetaForCausalLM_gd_interactive(ABC):
method get_model (line 526) | def get_model(self):
method get_vision_tower (line 529) | def get_vision_tower(self):
method encode_images (line 532) | def encode_images(self, images):
method prepare_inputs_labels_for_multimodal (line 537) | def prepare_inputs_labels_for_multimodal(
method prepare_inputs_labels_for_multimodal_NoInter (line 667) | def prepare_inputs_labels_for_multimodal_NoInter(
method initialize_vision_tokenizer (line 793) | def initialize_vision_tokenizer(self, model_args, tokenizer):
method initialize_seg_modules (line 869) | def initialize_seg_modules(self, cfg):
method initialize_interactive_modules (line 874) | def initialize_interactive_modules(self, cfg):
method freeze_seg_modules (line 882) | def freeze_seg_modules(self):
FILE: llava/model/make_delta.py
function make_delta (line 13) | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_...
FILE: llava/model/multimodal_encoder/builder.py
function build_vision_tower (line 4) | def build_vision_tower(vision_tower_cfg, **kwargs):
FILE: llava/model/multimodal_encoder/clip_encoder.py
class CLIPVisionTower (line 7) | class CLIPVisionTower(nn.Module):
method __init__ (line 8) | def __init__(self, vision_tower, args, delay_load=False):
method load_model (line 22) | def load_model(self):
method feature_select (line 29) | def feature_select(self, image_forward_outs):
method forward (line 40) | def forward(self, images):
method dummy_feature (line 54) | def dummy_feature(self):
method dtype (line 58) | def dtype(self):
method device (line 62) | def device(self):
method config (line 66) | def config(self):
method hidden_size (line 73) | def hidden_size(self):
method num_patches (line 77) | def num_patches(self):
FILE: llava/model/openseed/BaseModel.py
function align_and_update_state_dicts (line 12) | def align_and_update_state_dicts(model_state_dict, ckpt_state_dict):
class BaseModel (line 46) | class BaseModel(nn.Module):
method __init__ (line 47) | def __init__(self, opt, module: nn.Module):
method forward (line 52) | def forward(self, *inputs, **kwargs):
method save_pretrained (line 56) | def save_pretrained(self, save_dir):
method from_pretrained (line 59) | def from_pretrained(self, load_dir):
FILE: llava/model/openseed/architectures/build.py
function build_model (line 4) | def build_model(config, **kwargs):
FILE: llava/model/openseed/architectures/openseed_model.py
class OpenSeeD (line 25) | class OpenSeeD(nn.Module):
method __init__ (line 31) | def __init__(
method from_config (line 141) | def from_config(cls, cfg):
method device (line 271) | def device(self):
method forward (line 274) | def forward(self, batched_inputs, inference_task='seg'):
method forward_seg (line 348) | def forward_seg(self, batched_inputs, task='seg',default_text_embeddin...
method prepare_targets (line 541) | def prepare_targets(self, targets, images, task='seg'):
method semantic_inference (line 564) | def semantic_inference(self, mask_cls, mask_pred):
method panoptic_inference (line 583) | def panoptic_inference(self, mask_cls, mask_pred):
method instance_inference (line 645) | def instance_inference(self, mask_cls, mask_pred, mask_box_result):
method box_postprocess (line 686) | def box_postprocess(self, out_bbox, img_h, img_w):
method forward_eval (line 694) | def forward_eval(self, batched_inputs, text_embeddings):
method forward_inner_eval (line 708) | def forward_inner_eval(self, batched_inputs, task='seg',default_text_e...
function get_segmentation_model (line 759) | def get_segmentation_model(cfg, **kwargs):
FILE: llava/model/openseed/architectures/openseed_model_decouple_train.py
class OpenSeeD (line 26) | class OpenSeeD(nn.Module):
method __init__ (line 32) | def __init__(
method from_config (line 152) | def from_config(cls, cfg):
method device (line 298) | def device(self):
method forward (line 301) | def forward(self, batched_inputs, inference_task='seg'):
method forward_seg (line 329) | def forward_seg(self, batched_inputs, task='seg'):
method prepare_targets (line 478) | def prepare_targets(self, targets, images, task='seg'):
method semantic_inference (line 501) | def semantic_inference(self, mask_cls, mask_pred):
method panoptic_inference (line 520) | def panoptic_inference(self, mask_cls, mask_pred):
method instance_inference (line 582) | def instance_inference(self, mask_cls, mask_pred, mask_box_result,spli...
method box_postprocess (line 627) | def box_postprocess(self, out_bbox, img_h, img_w):
function get_segmentation_model (line 636) | def get_segmentation_model(cfg, **kwargs):
FILE: llava/model/openseed/architectures/registry.py
function register_model (line 3) | def register_model(fn):
function model_entrypoints (line 9) | def model_entrypoints(model_name):
function is_model (line 12) | def is_model(model_name):
FILE: llava/model/openseed/backbone/backbone.py
class Backbone (line 11) | class Backbone(nn.Module):
method __init__ (line 16) | def __init__(self):
method forward (line 22) | def forward(self):
method size_divisibility (line 32) | def size_divisibility(self) -> int:
method output_shape (line 42) | def output_shape(self):
FILE: llava/model/openseed/backbone/build.py
function build_backbone (line 6) | def build_backbone(config, **kwargs):
FILE: llava/model/openseed/backbone/focal.py
class Mlp (line 24) | class Mlp(nn.Module):
method __init__ (line 27) | def __init__(self, in_features, hidden_features=None, out_features=Non...
method forward (line 36) | def forward(self, x):
class FocalModulation (line 44) | class FocalModulation(nn.Module):
method __init__ (line 56) | def __init__(self, dim, proj_drop=0., focal_level=2, focal_window=7, f...
method forward (line 89) | def forward(self, x):
class FocalModulationBlock (line 118) | class FocalModulationBlock(nn.Module):
method __init__ (line 132) | def __init__(self, dim, mlp_ratio=4., drop=0., drop_path=0.,
method forward (line 166) | def forward(self, x):
class BasicLayer (line 197) | class BasicLayer(nn.Module):
method __init__ (line 214) | def __init__(self,
method forward (line 264) | def forward(self, x, H, W):
class PatchEmbed (line 287) | class PatchEmbed(nn.Module):
method __init__ (line 299) | def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=...
method forward (line 322) | def forward(self, x):
class FocalNet (line 340) | class FocalNet(nn.Module):
method __init__ (line 364) | def __init__(self,
method _freeze_stages (line 438) | def _freeze_stages(self):
method init_weights (line 452) | def init_weights(self, pretrained=None):
method load_weights (line 478) | def load_weights(self, pretrained_dict=None, pretrained_layers=[], ver...
method forward (line 566) | def forward(self, x):
method train (line 592) | def train(self, mode=True):
class D2FocalNet (line 598) | class D2FocalNet(FocalNet, Backbone):
method __init__ (line 599) | def __init__(self, cfg, input_shape):
method forward (line 652) | def forward(self, x):
method output_shape (line 669) | def output_shape(self):
method size_divisibility (line 678) | def size_divisibility(self):
function get_focal_backbone (line 682) | def get_focal_backbone(cfg):
FILE: llava/model/openseed/backbone/focal_dw.py
class Mlp (line 24) | class Mlp(nn.Module):
method __init__ (line 27) | def __init__(self, in_features, hidden_features=None, out_features=Non...
method forward (line 36) | def forward(self, x):
class FocalModulation (line 44) | class FocalModulation(nn.Module):
method __init__ (line 56) | def __init__(self, dim, proj_drop=0., focal_level=2, focal_window=7, f...
method forward (line 89) | def forward(self, x):
class FocalModulationBlock (line 118) | class FocalModulationBlock(nn.Module):
method __init__ (line 132) | def __init__(self, dim, mlp_ratio=4., drop=0., drop_path=0.,
method forward (line 168) | def forward(self, x):
class BasicLayer (line 206) | class BasicLayer(nn.Module):
method __init__ (line 223) | def __init__(self,
method forward (line 275) | def forward(self, x, H, W):
class PatchEmbed (line 368) | class PatchEmbed(nn.Module):
method __init__ (line 380) | def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=...
method forward (line 410) | def forward(self, x):
class FocalNet (line 434) | class FocalNet(nn.Module):
method __init__ (line 458) | def __init__(self,
method _freeze_stages (line 535) | def _freeze_stages(self):
method init_weights (line 549) | def init_weights(self, pretrained=None):
method load_weights (line 575) | def load_weights(self, pretrained_dict=None, pretrained_layers=[], ver...
method forward (line 663) | def forward(self, x):
method train (line 689) | def train(self, mode=True):
class D2FocalNet (line 695) | class D2FocalNet(FocalNet, Backbone):
method __init__ (line 696) | def __init__(self, cfg, input_shape):
method forward (line 749) | def forward(self, x):
method output_shape (line 766) | def output_shape(self):
method size_divisibility (line 775) | def size_divisibility(self):
function get_focal_backbone (line 779) | def get_focal_backbone(cfg):
FILE: llava/model/openseed/backbone/registry.py
function register_backbone (line 4) | def register_backbone(fn):
function model_entrypoints (line 10) | def model_entrypoints(model_name):
function is_model (line 13) | def is_model(model_name):
FILE: llava/model/openseed/backbone/swin.py
class Mlp (line 26) | class Mlp(nn.Module):
method __init__ (line 29) | def __init__(
method forward (line 40) | def forward(self, x):
function window_partition (line 49) | def window_partition(x, window_size):
function window_reverse (line 63) | def window_reverse(windows, window_size, H, W):
class WindowAttention (line 79) | class WindowAttention(nn.Module):
method __init__ (line 92) | def __init__(
method forward (line 136) | def forward(self, x, mask=None):
class SwinTransformerBlock (line 180) | class SwinTransformerBlock(nn.Module):
method __init__ (line 197) | def __init__(
method forward (line 241) | def forward(self, x, mask_matrix):
class PatchMerging (line 309) | class PatchMerging(nn.Module):
method __init__ (line 316) | def __init__(self, dim, norm_layer=nn.LayerNorm):
method forward (line 322) | def forward(self, x, H, W):
class BasicLayer (line 351) | class BasicLayer(nn.Module):
method __init__ (line 369) | def __init__(
method forward (line 417) | def forward(self, x, H, W):
class PatchEmbed (line 467) | class PatchEmbed(nn.Module):
method __init__ (line 476) | def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=...
method forward (line 490) | def forward(self, x):
class SwinTransformer (line 509) | class SwinTransformer(nn.Module):
method __init__ (line 537) | def __init__(
method _freeze_stages (line 629) | def _freeze_stages(self):
method init_weights (line 646) | def init_weights(self, pretrained=None):
method load_weights (line 663) | def load_weights(self, pretrained_dict=None, pretrained_layers=[], ver...
method forward (line 730) | def forward(self, x):
method train (line 763) | def train(self, mode=True):
class D2SwinTransformer (line 769) | class D2SwinTransformer(SwinTransformer, Backbone):
method __init__ (line 770) | def __init__(self, cfg, pretrain_img_size, patch_size, in_chans, embed...
method forward (line 810) | def forward(self, x):
method output_shape (line 827) | def output_shape(self):
method size_divisibility (line 837) | def size_divisibility(self):
function get_swin_backbone (line 842) | def get_swin_backbone(cfg):
FILE: llava/model/openseed/body/build.py
function build_openseed_head (line 6) | def build_openseed_head(config, *args, **kwargs):
FILE: llava/model/openseed/body/decoder/build.py
function build_decoder (line 5) | def build_decoder(config, *args, **kwargs):
FILE: llava/model/openseed/body/decoder/modules.py
class SelfAttentionLayer (line 12) | class SelfAttentionLayer(nn.Module):
method __init__ (line 14) | def __init__(self, d_model, nhead, dropout=0.0,
method _reset_parameters (line 27) | def _reset_parameters(self):
method with_pos_embed (line 32) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
method forward_post (line 35) | def forward_post(self, tgt,
method forward_pre (line 47) | def forward_pre(self, tgt,
method forward (line 59) | def forward(self, tgt,
class CrossAttentionLayer (line 70) | class CrossAttentionLayer(nn.Module):
method __init__ (line 72) | def __init__(self, d_model, nhead, dropout=0.0,
method _reset_parameters (line 85) | def _reset_parameters(self):
method with_pos_embed (line 90) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
method forward_post (line 93) | def forward_post(self, tgt, memory,
method forward_pre (line 106) | def forward_pre(self, tgt, memory,
method forward (line 120) | def forward(self, tgt, memory,
class FFNLayer (line 132) | class FFNLayer(nn.Module):
method __init__ (line 134) | def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
method _reset_parameters (line 149) | def _reset_parameters(self):
method with_pos_embed (line 154) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
method forward_post (line 157) | def forward_post(self, tgt):
method forward_pre (line 163) | def forward_pre(self, tgt):
method forward (line 169) | def forward(self, tgt):
function _get_activation_fn (line 175) | def _get_activation_fn(activation):
class MLP (line 186) | class MLP(nn.Module):
method __init__ (line 189) | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
method forward (line 195) | def forward(self, x):
FILE: llava/model/openseed/body/decoder/openseed_decoder.py
class OpenSeeDDecoder (line 26) | class OpenSeeDDecoder(nn.Module):
method __init__ (line 28) | def __init__(
method from_config (line 159) | def from_config(cls, cfg, in_channels, mask_classification, extra):
method prepare_for_dn (line 191) | def prepare_for_dn(self, targets, tgt, refpoint_emb, batch_size):
method dn_post_process (line 316) | def dn_post_process(self,outputs_class,outputs_coord,mask_dict,outputs...
method get_valid_ratio (line 336) | def get_valid_ratio(self, mask):
method pred_box (line 345) | def pred_box(self, reference, hs, ref0=None):
method compute_similarity (line 363) | def compute_similarity(self, v_emb,name='default'):
method forward (line 371) | def forward(self, x, mask_features, masks, targets=None, target_querie...
method forward_prediction_heads (line 531) | def forward_prediction_heads(self, output, mask_features, pred_mask=Tr...
method _set_aux_loss (line 546) | def _set_aux_loss(self, outputs_class, outputs_seg_masks, out_boxes=No...
function get_maskdino_transformer_decoder (line 568) | def get_maskdino_transformer_decoder(cfg, in_channels, mask_classificati...
FILE: llava/model/openseed/body/decoder/openseed_decoder_decouple.py
class MaskDINODecoder (line 25) | class MaskDINODecoder(nn.Module):
method __init__ (line 27) | def __init__(
method from_config (line 163) | def from_config(cls, cfg, in_channels, lang_encoder, mask_classificati...
method prepare_for_dn (line 197) | def prepare_for_dn(self, targets, tgt, refpoint_emb, batch_size,task="...
method dn_post_process (line 328) | def dn_post_process(self,outputs_class,outputs_coord,mask_dict,outputs...
method get_valid_ratio (line 348) | def get_valid_ratio(self, mask):
method pred_box (line 357) | def pred_box(self, reference, hs, ref0=None):
method forward_cls (line 375) | def forward_cls(self, x, mask_features, masks, targets=None, target_qu...
method forward (line 537) | def forward(self, x, mask_features, masks, targets=None, target_querie...
method forward_prediction_heads (line 701) | def forward_prediction_heads(self, output, mask_features, pred_mask=Tr...
method _set_aux_loss (line 716) | def _set_aux_loss(self, outputs_class, outputs_seg_masks, out_boxes=No...
function get_maskdino_transformer_decoder (line 744) | def get_maskdino_transformer_decoder(cfg, in_channels, lang_encoder, mas...
FILE: llava/model/openseed/body/decoder/registry.py
function register_decoder (line 3) | def register_decoder(fn):
function model_entrypoints (line 9) | def model_entrypoints(model_name):
function is_model (line 12) | def is_model(model_name):
FILE: llava/model/openseed/body/decoder/utils/dino_decoder.py
class TransformerDecoder (line 18) | class TransformerDecoder(nn.Module):
method __init__ (line 20) | def __init__(self, decoder_layer, num_layers, norm=None,
method _reset_parameters (line 88) | def _reset_parameters(self):
method forward (line 96) | def forward(self, tgt, memory,
class DeformableTransformerDecoderLayer (line 196) | class DeformableTransformerDecoderLayer(nn.Module):
method __init__ (line 198) | def __init__(self, d_model=256, d_ffn=1024,
method rm_self_attn_modules (line 230) | def rm_self_attn_modules(self):
method with_pos_embed (line 236) | def with_pos_embed(tensor, pos):
method forward_ffn (line 239) | def forward_ffn(self, tgt):
method forward (line 246) | def forward(self,
FILE: llava/model/openseed/body/decoder/utils/utils.py
class MLP (line 11) | class MLP(nn.Module):
method __init__ (line 14) | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
method forward (line 20) | def forward(self, x):
function inverse_sigmoid (line 26) | def inverse_sigmoid(x, eps=1e-5):
function gen_encoder_output_proposals (line 33) | def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tens...
function gen_sineembed_for_position (line 74) | def gen_sineembed_for_position(pos_tensor, dim=128):
function _get_activation_fn (line 103) | def _get_activation_fn(activation):
function _get_clones (line 118) | def _get_clones(module, N, layer_share=False):
FILE: llava/model/openseed/body/encoder/build.py
function build_encoder (line 7) | def build_encoder(config, *args, **kwargs):
FILE: llava/model/openseed/body/encoder/encoder_deform.py
class MSDeformAttnTransformerEncoderOnly (line 30) | class MSDeformAttnTransformerEncoderOnly(nn.Module):
method __init__ (line 31) | def __init__(self, d_model=256, nhead=8,
method _reset_parameters (line 49) | def _reset_parameters(self):
method get_valid_ratio (line 58) | def get_valid_ratio(self, mask):
method forward (line 67) | def forward(self, srcs, masks, pos_embeds, use_ckpt=False):
class MSDeformAttnTransformerEncoderLayer (line 104) | class MSDeformAttnTransformerEncoderLayer(nn.Module):
method __init__ (line 105) | def __init__(self,
method with_pos_embed (line 125) | def with_pos_embed(tensor, pos):
method forward_ffn (line 128) | def forward_ffn(self, src):
method forward (line 134) | def forward(self, src, pos, reference_points, spatial_shapes, level_st...
class MSDeformAttnTransformerEncoder (line 146) | class MSDeformAttnTransformerEncoder(nn.Module):
method __init__ (line 147) | def __init__(self, encoder_layer, num_layers):
method get_reference_points (line 153) | def get_reference_points(spatial_shapes, valid_ratios, device):
method forward (line 167) | def forward(self, src, spatial_shapes, level_start_index, valid_ratios...
class OpenSeeDEncoder (line 179) | class OpenSeeDEncoder(nn.Module):
method __init__ (line 184) | def __init__(
method from_config (line 331) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], *args, **...
method forward_features (line 357) | def forward_features(self, features, masks):
function get_maskdino_encoder_deform (line 436) | def get_maskdino_encoder_deform(cfg, input_shape):
FILE: llava/model/openseed/body/encoder/ops/functions/ms_deform_attn_func.py
class MSDeformAttnFunction (line 32) | class MSDeformAttnFunction(Function):
method forward (line 34) | def forward(ctx, value, value_spatial_shapes, value_level_start_index,...
method backward (line 43) | def backward(ctx, grad_output):
function ms_deform_attn_core_pytorch (line 52) | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_lo...
FILE: llava/model/openseed/body/encoder/ops/modules/ms_deform_attn.py
function _is_power_of_2 (line 28) | def _is_power_of_2(n):
class MSDeformAttn (line 34) | class MSDeformAttn(nn.Module):
method __init__ (line 35) | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
method _reset_parameters (line 66) | def _reset_parameters(self):
method forward (line 82) | def forward(self, query, reference_points, input_flatten, input_spatia...
FILE: llava/model/openseed/body/encoder/ops/setup.py
function get_extensions (line 26) | def get_extensions():
FILE: llava/model/openseed/body/encoder/ops/src/cpu/ms_deform_attn_cpu.cpp
function ms_deform_attn_cpu_forward (line 22) | at::Tensor
function ms_deform_attn_cpu_backward (line 34) | std::vector<at::Tensor>
FILE: llava/model/openseed/body/encoder/ops/src/ms_deform_attn.h
function im2col_step (line 32) | int im2col_step)
FILE: llava/model/openseed/body/encoder/ops/src/vision.cpp
function PYBIND11_MODULE (line 18) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: llava/model/openseed/body/encoder/ops/test.py
function check_forward_equal_with_pytorch_double (line 35) | def check_forward_equal_with_pytorch_double():
function check_forward_equal_with_pytorch_float (line 51) | def check_forward_equal_with_pytorch_float():
function check_gradient_numerical (line 66) | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_...
FILE: llava/model/openseed/body/encoder/registry.py
function register_encoder (line 3) | def register_encoder(fn):
function model_entrypoints (line 9) | def model_entrypoints(model_name):
function is_model (line 12) | def is_model(model_name):
FILE: llava/model/openseed/body/encoder/transformer_encoder_fpn.py
class BasePixelDecoder (line 22) | class BasePixelDecoder(nn.Module):
method __init__ (line 23) | def __init__(
method from_config (line 112) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
method forward_features (line 123) | def forward_features(self, features):
method forward (line 145) | def forward(self, features, targets=None):
class TransformerEncoderOnly (line 151) | class TransformerEncoderOnly(nn.Module):
method __init__ (line 152) | def __init__(
method _reset_parameters (line 175) | def _reset_parameters(self):
method forward (line 180) | def forward(self, src, mask, pos_embed):
class TransformerEncoderPixelDecoder (line 193) | class TransformerEncoderPixelDecoder(BasePixelDecoder):
method __init__ (line 195) | def __init__(
method from_config (line 262) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
method forward_features (line 276) | def forward_features(self, features):
method forward (line 304) | def forward(self, features, targets=None):
function get_transformer_encoder_fpn (line 312) | def get_transformer_encoder_fpn(cfg, input_shape):
FILE: llava/model/openseed/body/openseed_head.py
class OpenSeeDHead (line 21) | class OpenSeeDHead(nn.Module):
method __init__ (line 23) | def __init__(
method from_config (line 56) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], lang_enco...
method forward (line 77) | def forward(self, features, mask=None,targets=None, target_queries=Non...
function get_maskdino_head (line 86) | def get_maskdino_head(cfg, input_shape, lang_encoder, extra):
FILE: llava/model/openseed/body/registry.py
function register_body (line 4) | def register_body(fn):
function model_entrypoints (line 10) | def model_entrypoints(model_name):
function is_model (line 13) | def is_model(model_name):
FILE: llava/model/openseed/body/transformer_blocks.py
class Transformer (line 19) | class Transformer(nn.Module):
method __init__ (line 20) | def __init__(
method _reset_parameters (line 56) | def _reset_parameters(self):
method forward (line 61) | def forward(self, src, mask, query_embed, pos_embed):
class TransformerEncoder (line 78) | class TransformerEncoder(nn.Module):
method __init__ (line 79) | def __init__(self, encoder_layer, num_layers, norm=None):
method forward (line 85) | def forward(
class TransformerDecoder (line 105) | class TransformerDecoder(nn.Module):
method __init__ (line 106) | def __init__(self, decoder_layer, num_layers, norm=None, return_interm...
method forward (line 113) | def forward(
class TransformerEncoderLayer (line 154) | class TransformerEncoderLayer(nn.Module):
method __init__ (line 155) | def __init__(
method with_pos_embed (line 179) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
method forward_post (line 182) | def forward_post(
method forward_pre (line 201) | def forward_pre(
method forward (line 219) | def forward(
class TransformerDecoderLayer (line 231) | class TransformerDecoderLayer(nn.Module):
method __init__ (line 232) | def __init__(
method with_pos_embed (line 259) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
method forward_post (line 262) | def forward_post(
method forward_pre (line 293) | def forward_pre(
method forward (line 324) | def forward(
function _get_clones (line 358) | def _get_clones(module, N):
function _get_activation_fn (line 362) | def _get_activation_fn(activation):
FILE: llava/model/openseed/language/LangEncoder/build.py
function build_lang_encoder (line 10) | def build_lang_encoder(config_encoder, tokenizer, verbose, **kwargs):
function build_tokenizer (line 19) | def build_tokenizer(config_encoder):
FILE: llava/model/openseed/language/LangEncoder/registry.py
function register_lang_encoder (line 4) | def register_lang_encoder(fn):
function lang_encoders (line 13) | def lang_encoders(model_name):
function is_lang_encoder (line 17) | def is_lang_encoder(model_name):
FILE: llava/model/openseed/language/LangEncoder/transformer.py
class LayerNorm (line 21) | class LayerNorm(nn.Module):
method __init__ (line 22) | def __init__(self, hidden_size, eps=1e-12):
method forward (line 30) | def forward(self, x):
class QuickGELU (line 39) | class QuickGELU(nn.Module):
method forward (line 40) | def forward(self, x: torch.Tensor):
class ResidualAttentionBlock (line 44) | class ResidualAttentionBlock(nn.Module):
method __init__ (line 45) | def __init__(self,
method attention (line 63) | def attention(self, x: torch.Tensor, key_padding_mask: torch.Tensor = ...
method forward (line 75) | def forward(self, x: torch.Tensor, key_padding_mask: torch.Tensor = No...
class Transformer (line 81) | class Transformer(nn.Module):
method __init__ (line 82) | def __init__(self,
method dim_out (line 119) | def dim_out(self):
method build_attention_mask (line 122) | def build_attention_mask(self):
method _init_weights (line 130) | def _init_weights(self, m):
method load_pretrained (line 142) | def load_pretrained(self, pretrained='', pretrained_layers=[], verbose...
method no_weight_decay (line 188) | def no_weight_decay(self):
method forward (line 194) | def forward(self, input_ids, attention_mask=None):
function lang_encoder (line 210) | def lang_encoder(config_encoder, tokenizer, verbose, **kwargs):
FILE: llava/model/openseed/language/build.py
function build_language_encoder (line 5) | def build_language_encoder(config, **kwargs):
FILE: llava/model/openseed/language/encoder.py
class LanguageEncoder (line 13) | class LanguageEncoder(nn.Module):
method __init__ (line 16) | def __init__(
method from_config (line 33) | def from_config(cls, cfg):
method get_text_embeddings (line 54) | def get_text_embeddings(self, class_names, name='default', is_eval=Fal...
method forward_language (line 109) | def forward_language(self, texts, norm=True):
method compute_similarity (line 123) | def compute_similarity(self, v_emb, name='default'):
function get_language_model (line 131) | def get_language_model(cfg, **kwargs):
FILE: llava/model/openseed/language/registry.py
function register_model (line 3) | def register_model(fn):
function model_entrypoints (line 9) | def model_entrypoints(model_name):
function is_model (line 12) | def is_model(model_name):
FILE: llava/model/openseed/language/vlpencoder.py
class LanguageEncoder (line 19) | class LanguageEncoder(nn.Module):
method __init__ (line 22) | def __init__(
method from_config (line 46) | def from_config(cls, cfg):
method get_text_embeddings (line 70) | def get_text_embeddings(self, class_names, name='default', is_eval=Fal...
method get_text_token_embeddings (line 127) | def get_text_token_embeddings(self, txts, name='default', token=False,...
method forward_language (line 142) | def forward_language(self, texts, norm=True):
method forward_language_token (line 156) | def forward_language_token(self, texts, norm=False):
method compute_similarity (line 174) | def compute_similarity(self, v_emb, name='default', fake=False):
function get_language_model (line 184) | def get_language_model(cfg, **kwargs):
FILE: llava/model/openseed/modules/attention.py
function multi_head_attention_forward (line 13) | def multi_head_attention_forward(
class _LinearWithBias (line 324) | class _LinearWithBias(nn.Linear):
method __init__ (line 327) | def __init__(self, in_features: int, out_features: int) -> None:
class MultiheadAttention (line 331) | class MultiheadAttention(nn.Module):
method __init__ (line 364) | def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bi...
method _reset_parameters (line 403) | def _reset_parameters(self):
method __setstate__ (line 419) | def __setstate__(self, state):
method forward (line 426) | def forward(self, query: Tensor, key: Tensor, value: Tensor, key_paddi...
FILE: llava/model/openseed/modules/criterion.py
function sigmoid_focal_loss (line 29) | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, ...
function dice_loss (line 58) | def dice_loss(
function sigmoid_ce_loss (line 85) | def sigmoid_ce_loss(
function calculate_uncertainty (line 110) | def calculate_uncertainty(logits):
class SetCriterion (line 127) | class SetCriterion(nn.Module):
method __init__ (line 134) | def __init__(self, num_classes, matcher, weight_dict, eos_coef, top_x_...
method loss_labels_ce (line 168) | def loss_labels_ce(self, outputs, targets, indices, num_masks, layer_i...
method loss_labels_masked (line 193) | def loss_labels_masked(self, outputs, targets, indices, num_boxes, log...
method loss_labels (line 223) | def loss_labels(self, outputs, targets, indices, num_boxes, log=True, ...
method loss_boxes (line 257) | def loss_boxes(self, outputs, targets, indices, num_boxes, layer_id=No...
method loss_boxes_panoptic (line 285) | def loss_boxes_panoptic(self, outputs, targets, indices, num_boxes, la...
method loss_masks (line 316) | def loss_masks(self, outputs, targets, indices, num_masks, layer_id=No...
method prep_for_dn (line 374) | def prep_for_dn(self,mask_dict):
method _get_src_permutation_idx (line 385) | def _get_src_permutation_idx(self, indices):
method _get_tgt_permutation_idx (line 391) | def _get_tgt_permutation_idx(self, indices):
method get_loss (line 397) | def get_loss(self, loss, outputs, targets, indices, num_masks=None, la...
method forward (line 408) | def forward(self, outputs, targets, mask_dict=None, extra=None, task='...
method __repr__ (line 529) | def __repr__(self):
FILE: llava/model/openseed/modules/matcher.py
function batch_dice_loss (line 22) | def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
function batch_sigmoid_ce_loss (line 45) | def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
class HungarianMatcher (line 77) | class HungarianMatcher(nn.Module):
method __init__ (line 85) | def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_d...
method memory_efficient_forward (line 108) | def memory_efficient_forward(self, outputs, targets, cost=["cls", "box...
method forward (line 206) | def forward(self, outputs, targets, cost=["cls", "box", "mask"], mode=...
method __repr__ (line 235) | def __repr__(self, _repr_indent=4):
FILE: llava/model/openseed/modules/point_features.py
function point_sample (line 21) | def point_sample(input, point_coords, **kwargs):
function generate_regular_grid_point_coords (line 47) | def generate_regular_grid_point_coords(R, side_size, device):
function get_uncertain_point_coords_with_randomness (line 65) | def get_uncertain_point_coords_with_randomness(
function get_uncertain_point_coords_on_grid (line 121) | def get_uncertain_point_coords_on_grid(uncertainty_map, num_points):
function point_sample_fine_grained_features (line 148) | def point_sample_fine_grained_features(features_list, feature_scales, bo...
function get_point_coords_wrt_image (line 194) | def get_point_coords_wrt_image(boxes_coords, point_coords):
function sample_point_labels (line 221) | def sample_point_labels(instances, point_coords):
FILE: llava/model/openseed/modules/position_encoding.py
class PositionEmbeddingSine (line 12) | class PositionEmbeddingSine(nn.Module):
method __init__ (line 18) | def __init__(self, num_pos_feats=64, temperature=10000, normalize=Fals...
method forward (line 29) | def forward(self, x, mask=None):
method __repr__ (line 54) | def __repr__(self, _repr_indent=4):
FILE: llava/model/openseed/modules/postprocessing.py
function detector_postprocess (line 9) | def detector_postprocess(
function bbox_postprocess (line 77) | def bbox_postprocess(result, input_size, img_size, output_height, output...
function sem_seg_postprocess (line 99) | def sem_seg_postprocess(result, img_size, output_height, output_width):
FILE: llava/model/openseed/utils/box_ops.py
function box_cxcywh_to_xyxy (line 9) | def box_cxcywh_to_xyxy(x):
function box_xyxy_to_cxcywh (line 16) | def box_xyxy_to_cxcywh(x):
function box_xywh_to_xyxy (line 22) | def box_xywh_to_xyxy(x):
function box_iou (line 29) | def box_iou(boxes1, boxes2):
function generalized_box_iou (line 45) | def generalized_box_iou(boxes1, boxes2):
function masks_to_boxes (line 69) | def masks_to_boxes(masks):
FILE: llava/model/openseed/utils/config.py
function configurable (line 7) | def configurable(init_func=None, *, from_config=None):
function _called_with_cfg (line 94) | def _called_with_cfg(*args, **kwargs):
function _get_args_from_config (line 111) | def _get_args_from_config(from_config_func, *args, **kwargs):
FILE: llava/model/openseed/utils/misc.py
function _max_by_axis (line 26) | def _max_by_axis(the_list):
class NestedTensor (line 34) | class NestedTensor(object):
method __init__ (line 35) | def __init__(self, tensors, mask: Optional[Tensor]):
method to (line 39) | def to(self, device):
method decompose (line 50) | def decompose(self):
method __repr__ (line 53) | def __repr__(self):
function nested_tensor_from_tensor_list (line 56) | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
function _collate_and_pad_divisibility (line 98) | def _collate_and_pad_divisibility(tensor_list: list, div=32):
function _onnx_nested_tensor_from_tensor_list (line 132) | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> N...
function is_dist_avail_and_initialized (line 161) | def is_dist_avail_and_initialized():
FILE: llava/model/semsam/BaseModel.py
class BaseModel (line 12) | class BaseModel(nn.Module):
method __init__ (line 13) | def __init__(self, opt, module: nn.Module):
method forward (line 18) | def forward(self, *inputs, **kwargs):
method save_pretrained (line 22) | def save_pretrained(self, save_dir):
method from_pretrained (line 25) | def from_pretrained(self, load_dir):
FILE: llava/model/semsam/architectures/build.py
function build_model (line 4) | def build_model(config, **kwargs):
FILE: llava/model/semsam/architectures/idino_model_partwhole_all_llm_ref_feats_all_det_pretrainv1.py
function dice_loss (line 23) | def dice_loss(
function iou_score_loss (line 49) | def iou_score_loss(inputs, targets):
function sigmoid_ce_loss (line 59) | def sigmoid_ce_loss(
function calculate_uncertainty (line 87) | def calculate_uncertainty(logits):
function sigmoid_focal_loss (line 104) | def sigmoid_focal_loss(inputs, targets, alpha: float = 0.25, gamma: floa...
class SemanticSAM (line 132) | class SemanticSAM(nn.Module):
method __init__ (line 138) | def __init__(
method from_config (line 264) | def from_config(cls, cfg):
method device (line 418) | def device(self):
method evaluate_demo (line 421) | def evaluate_demo(self, batched_inputs, all_whole=None, all_parts=None...
method forward (line 478) | def forward(self, batched_inputs, inference_task='seg',detach=False):
method forward_det_pretrain (line 499) | def forward_det_pretrain(self, batched_inputs, task='seg',
method prepare_targets_sam (line 568) | def prepare_targets_sam(self, targets, images, prediction_switch, task...
function get_segmentation_model (line 657) | def get_segmentation_model(cfg, **kwargs):
FILE: llava/model/semsam/architectures/registry.py
function register_model (line 3) | def register_model(fn):
function model_entrypoints (line 9) | def model_entrypoints(model_name):
function is_model (line 12) | def is_model(model_name):
FILE: llava/model/semsam/backbone/backbone.py
class Backbone (line 11) | class Backbone(nn.Module):
method __init__ (line 16) | def __init__(self):
method forward (line 22) | def forward(self):
method size_divisibility (line 32) | def size_divisibility(self) -> int:
method output_shape (line 42) | def output_shape(self):
FILE: llava/model/semsam/backbone/build.py
function build_backbone (line 6) | def build_backbone(config, **kwargs):
FILE: llava/model/semsam/backbone/focal.py
class Mlp (line 24) | class Mlp(nn.Module):
method __init__ (line 27) | def __init__(self, in_features, hidden_features=None, out_features=Non...
method forward (line 36) | def forward(self, x):
class FocalModulation (line 44) | class FocalModulation(nn.Module):
method __init__ (line 56) | def __init__(self, dim, proj_drop=0., focal_level=2, focal_window=7, f...
method forward (line 89) | def forward(self, x):
class FocalModulationBlock (line 118) | class FocalModulationBlock(nn.Module):
method __init__ (line 132) | def __init__(self, dim, mlp_ratio=4., drop=0., drop_path=0.,
method forward (line 166) | def forward(self, x):
class BasicLayer (line 197) | class BasicLayer(nn.Module):
method __init__ (line 214) | def __init__(self,
method forward (line 264) | def forward(self, x, H, W):
class PatchEmbed (line 287) | class PatchEmbed(nn.Module):
method __init__ (line 299) | def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=...
method forward (line 322) | def forward(self, x):
class FocalNet (line 340) | class FocalNet(nn.Module):
method __init__ (line 364) | def __init__(self,
method _freeze_stages (line 438) | def _freeze_stages(self):
method init_weights (line 452) | def init_weights(self, pretrained=None):
method load_weights (line 478) | def load_weights(self, pretrained_dict=None, pretrained_layers=[], ver...
method forward (line 566) | def forward(self, x):
method train (line 592) | def train(self, mode=True):
class D2FocalNet (line 598) | class D2FocalNet(FocalNet, Backbone):
method __init__ (line 599) | def __init__(self, cfg, input_shape):
method forward (line 652) | def forward(self, x):
method output_shape (line 669) | def output_shape(self):
method size_divisibility (line 678) | def size_divisibility(self):
function get_focal_backbone (line 682) | def get_focal_backbone(cfg):
FILE: llava/model/semsam/backbone/focal_dw.py
class Mlp (line 24) | class Mlp(nn.Module):
method __init__ (line 27) | def __init__(self, in_features, hidden_features=None, out_features=Non...
method forward (line 36) | def forward(self, x):
class FocalModulation (line 44) | class FocalModulation(nn.Module):
method __init__ (line 56) | def __init__(self, dim, proj_drop=0., focal_level=2, focal_window=7, f...
method forward (line 89) | def forward(self, x):
class FocalModulationBlock (line 118) | class FocalModulationBlock(nn.Module):
method __init__ (line 132) | def __init__(self, dim, mlp_ratio=4., drop=0., drop_path=0.,
method forward (line 168) | def forward(self, x):
class BasicLayer (line 206) | class BasicLayer(nn.Module):
method __init__ (line 223) | def __init__(self,
method forward (line 275) | def forward(self, x, H, W):
class PatchEmbed (line 368) | class PatchEmbed(nn.Module):
method __init__ (line 380) | def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=...
method forward (line 410) | def forward(self, x):
class FocalNet (line 434) | class FocalNet(nn.Module):
method __init__ (line 458) | def __init__(self,
method _freeze_stages (line 535) | def _freeze_stages(self):
method init_weights (line 549) | def init_weights(self, pretrained=None):
method load_weights (line 575) | def load_weights(self, pretrained_dict=None, pretrained_layers=[], ver...
method forward (line 663) | def forward(self, x):
method train (line 689) | def train(self, mode=True):
class D2FocalNet (line 695) | class D2FocalNet(FocalNet, Backbone):
method __init__ (line 696) | def __init__(self, cfg, input_shape):
method forward (line 749) | def forward(self, x):
method output_shape (line 766) | def output_shape(self):
method size_divisibility (line 775) | def size_divisibility(self):
function get_focal_backbone (line 779) | def get_focal_backbone(cfg):
FILE: llava/model/semsam/backbone/registry.py
function register_backbone (line 4) | def register_backbone(fn):
function model_entrypoints (line 10) | def model_entrypoints(model_name):
function is_model (line 13) | def is_model(model_name):
FILE: llava/model/semsam/backbone/swin.py
class Mlp (line 26) | class Mlp(nn.Module):
method __init__ (line 29) | def __init__(
method forward (line 40) | def forward(self, x):
function window_partition (line 49) | def window_partition(x, window_size):
function window_reverse (line 63) | def window_reverse(windows, window_size, H, W):
class WindowAttention (line 79) | class WindowAttention(nn.Module):
method __init__ (line 92) | def __init__(
method forward (line 136) | def forward(self, x, mask=None):
class SwinTransformerBlock (line 180) | class SwinTransformerBlock(nn.Module):
method __init__ (line 197) | def __init__(
method forward (line 241) | def forward(self, x, mask_matrix):
class PatchMerging (line 309) | class PatchMerging(nn.Module):
method __init__ (line 316) | def __init__(self, dim, norm_layer=nn.LayerNorm):
method forward (line 322) | def forward(self, x, H, W):
class BasicLayer (line 351) | class BasicLayer(nn.Module):
method __init__ (line 369) | def __init__(
method forward (line 417) | def forward(self, x, H, W):
class PatchEmbed (line 467) | class PatchEmbed(nn.Module):
method __init__ (line 476) | def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=...
method forward (line 490) | def forward(self, x):
class SwinTransformer (line 509) | class SwinTransformer(nn.Module):
method __init__ (line 537) | def __init__(
method _freeze_stages (line 629) | def _freeze_stages(self):
method init_weights (line 646) | def init_weights(self, pretrained=None):
method load_weights (line 663) | def load_weights(self, pretrained_dict=None, pretrained_layers=[], ver...
method forward (line 730) | def forward(self, x):
method train (line 763) | def train(self, mode=True):
class D2SwinTransformer (line 769) | class D2SwinTransformer(SwinTransformer, Backbone):
method __init__ (line 770) | def __init__(self, cfg, pretrain_img_size, patch_size, in_chans, embed...
method forward (line 810) | def forward(self, x):
method output_shape (line 827) | def output_shape(self):
method size_divisibility (line 837) | def size_divisibility(self):
function get_swin_backbone (line 842) | def get_swin_backbone(cfg):
FILE: llava/model/semsam/backbone/swin_new.py
class Mlp (line 21) | class Mlp(nn.Module):
method __init__ (line 24) | def __init__(
method forward (line 35) | def forward(self, x):
function window_partition (line 44) | def window_partition(x, window_size):
function window_reverse (line 58) | def window_reverse(windows, window_size, H, W):
class WindowAttention (line 74) | class WindowAttention(nn.Module):
method __init__ (line 87) | def __init__(
method forward (line 131) | def forward(self, x, mask=None):
class SwinTransformerBlock (line 174) | class SwinTransformerBlock(nn.Module):
method __init__ (line 191) | def __init__(
method forward (line 235) | def forward(self, x, mask_matrix):
class PatchMerging (line 298) | class PatchMerging(nn.Module):
method __init__ (line 305) | def __init__(self, dim, norm_layer=nn.LayerNorm):
method forward (line 311) | def forward(self, x, H, W):
class BasicLayer (line 340) | class BasicLayer(nn.Module):
method __init__ (line 358) | def __init__(
method forward (line 406) | def forward(self, x, H, W):
class PatchEmbed (line 456) | class PatchEmbed(nn.Module):
method __init__ (line 465) | def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=...
method forward (line 479) | def forward(self, x):
class SwinTransformer (line 498) | class SwinTransformer(nn.Module):
method __init__ (line 526) | def __init__(
method _freeze_stages (line 618) | def _freeze_stages(self):
method init_weights (line 635) | def init_weights(self, pretrained=None):
method forward (line 651) | def forward(self, x):
method train (line 680) | def train(self, mode=True):
class D2SwinTransformer (line 687) | class D2SwinTransformer(SwinTransformer, Backbone):
method __init__ (line 688) | def __init__(self, cfg, input_shape):
method forward (line 743) | def forward(self, x):
method output_shape (line 760) | def output_shape(self):
method size_divisibility (line 769) | def size_divisibility(self):
FILE: llava/model/semsam/body/build.py
function build_openseed_head (line 6) | def build_openseed_head(config, *args, **kwargs):
FILE: llava/model/semsam/body/decoder/build.py
function build_decoder (line 5) | def build_decoder(config, *args, **kwargs):
FILE: llava/model/semsam/body/decoder/idino_decoder_no_iou_token_partwhole_all_llm.py
class MaskDINODecoder (line 24) | class MaskDINODecoder(nn.Module):
method __init__ (line 26) | def __init__(
method from_config (line 178) | def from_config(cls, cfg, in_channels, lang_encoder, mask_classificati...
method prepare_for_dn (line 213) | def prepare_for_dn(self, targets, tgt, refpoint_emb, batch_size):
method prepare_for_dn_o3 (line 338) | def prepare_for_dn_o3(self, targets, tgt, refpoint_emb, batch_size):
method prepare_for_dn_mo (line 463) | def prepare_for_dn_mo(self, targets, tgt, refpoint_emb, batch_size):
method prepare_for_dn_mo_infer (line 563) | def prepare_for_dn_mo_infer(self, targets, tgt, refpoint_emb, batch_si...
method dn_post_process (line 612) | def dn_post_process(self,outputs_class,outputs_coord,mask_dict,outputs...
method get_valid_ratio (line 632) | def get_valid_ratio(self, mask):
method pred_box (line 641) | def pred_box(self, reference, hs, ref0=None):
method pred_box_old (line 662) | def pred_box_old(self, reference, hs, ref0=None):
method forward (line 680) | def forward(self, x, mask_features, masks, targets=None, target_querie...
method forward_o365 (line 797) | def forward_o365(self, x, mask_features, masks, targets=None, target_q...
method forward_prediction_heads (line 906) | def forward_prediction_heads(self, output, mask_features, pred_mask=Tr...
method idno_forward_prediction_heads (line 920) | def idno_forward_prediction_heads(self, output, mask_features, pred_ma...
method _set_aux_loss (line 953) | def _set_aux_loss(self, outputs_class=None, outputs_seg_masks=None, ou...
function get_maskdino_transformer_decoder (line 980) | def get_maskdino_transformer_decoder(cfg, in_channels, lang_encoder, mas...
FILE: llava/model/semsam/body/decoder/modules.py
class SelfAttentionLayer (line 12) | class SelfAttentionLayer(nn.Module):
method __init__ (line 14) | def __init__(self, d_model, nhead, dropout=0.0,
method _reset_parameters (line 27) | def _reset_parameters(self):
method with_pos_embed (line 32) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
method forward_post (line 35) | def forward_post(self, tgt,
method forward_pre (line 47) | def forward_pre(self, tgt,
method forward (line 59) | def forward(self, tgt,
class CrossAttentionLayer (line 70) | class CrossAttentionLayer(nn.Module):
method __init__ (line 72) | def __init__(self, d_model, nhead, dropout=0.0,
method _reset_parameters (line 85) | def _reset_parameters(self):
method with_pos_embed (line 90) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
method forward_post (line 93) | def forward_post(self, tgt, memory,
method forward_pre (line 106) | def forward_pre(self, tgt, memory,
method forward (line 120) | def forward(self, tgt, memory,
class FFNLayer (line 132) | class FFNLayer(nn.Module):
method __init__ (line 134) | def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
method _reset_parameters (line 149) | def _reset_parameters(self):
method with_pos_embed (line 154) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
method forward_post (line 157) | def forward_post(self, tgt):
method forward_pre (line 163) | def forward_pre(self, tgt):
method forward (line 169) | def forward(self, tgt):
function _get_activation_fn (line 175) | def _get_activation_fn(activation):
class MLP (line 186) | class MLP(nn.Module):
method __init__ (line 189) | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
method forward (line 195) | def forward(self, x):
FILE: llava/model/semsam/body/decoder/registry.py
function register_decoder (line 3) | def register_decoder(fn):
function model_entrypoints (line 9) | def model_entrypoints(model_name):
function is_model (line 12) | def is_model(model_name):
FILE: llava/model/semsam/body/decoder/utils/dino_decoder.py
class TransformerDecoder (line 19) | class TransformerDecoder(nn.Module):
method __init__ (line 21) | def __init__(self, decoder_layer, num_layers, norm=None,
method _reset_parameters (line 89) | def _reset_parameters(self):
method forward (line 97) | def forward(self, tgt, memory,
class DeformableTransformerDecoderLayer (line 195) | class DeformableTransformerDecoderLayer(nn.Module):
method __init__ (line 197) | def __init__(self, d_model=256, d_ffn=1024,
method rm_self_attn_modules (line 229) | def rm_self_attn_modules(self):
method with_pos_embed (line 235) | def with_pos_embed(tensor, pos):
method forward_ffn (line 238) | def forward_ffn(self, tgt):
method forward (line 245) | def forward(self,
FILE: llava/model/semsam/body/decoder/utils/utils.py
class MLP (line 11) | class MLP(nn.Module):
method __init__ (line 14) | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
method forward (line 20) | def forward(self, x):
function inverse_sigmoid (line 26) | def inverse_sigmoid(x, eps=1e-5):
function gen_encoder_output_proposals (line 33) | def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tens...
function gen_sineembed_for_position (line 74) | def gen_sineembed_for_position(pos_tensor, dim=128):
function _get_activation_fn (line 103) | def _get_activation_fn(activation):
function _get_clones (line 118) | def _get_clones(module, N, layer_share=False):
FILE: llava/model/semsam/body/encoder/build.py
function build_encoder (line 7) | def build_encoder(config, *args, **kwargs):
FILE: llava/model/semsam/body/encoder/encoder_deform.py
class MSDeformAttnTransformerEncoderOnly (line 29) | class MSDeformAttnTransformerEncoderOnly(nn.Module):
method __init__ (line 30) | def __init__(self, d_model=256, nhead=8,
method _reset_parameters (line 48) | def _reset_parameters(self):
method get_valid_ratio (line 57) | def get_valid_ratio(self, mask):
method forward (line 66) | def forward(self, srcs, masks, pos_embeds, use_ckpt=False):
class MSDeformAttnTransformerEncoderLayer (line 103) | class MSDeformAttnTransformerEncoderLayer(nn.Module):
method __init__ (line 104) | def __init__(self,
method with_pos_embed (line 124) | def with_pos_embed(tensor, pos):
method forward_ffn (line 127) | def forward_ffn(self, src):
method forward (line 133) | def forward(self, src, pos, reference_points, spatial_shapes, level_st...
class MSDeformAttnTransformerEncoder (line 145) | class MSDeformAttnTransformerEncoder(nn.Module):
method __init__ (line 146) | def __init__(self, encoder_layer, num_layers):
method get_reference_points (line 152) | def get_reference_points(spatial_shapes, valid_ratios, device):
method forward (line 166) | def forward(self, src, spatial_shapes, level_start_index, valid_ratios...
class MaskDINOEncoder (line 179) | class MaskDINOEncoder(nn.Module):
method __init__ (line 184) | def __init__(
method from_config (line 331) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], *args, **...
method forward_features (line 357) | def forward_features(self, features, masks):
function get_maskdino_encoder_deform (line 428) | def get_maskdino_encoder_deform(cfg, input_shape):
FILE: llava/model/semsam/body/encoder/ops/functions/ms_deform_attn_func.py
class MSDeformAttnFunction (line 32) | class MSDeformAttnFunction(Function):
method forward (line 34) | def forward(ctx, value, value_spatial_shapes, value_level_start_index,...
method backward (line 43) | def backward(ctx, grad_output):
function ms_deform_attn_core_pytorch (line 52) | def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_lo...
FILE: llava/model/semsam/body/encoder/ops/modules/ms_deform_attn.py
function _is_power_of_2 (line 28) | def _is_power_of_2(n):
class MSDeformAttn (line 34) | class MSDeformAttn(nn.Module):
method __init__ (line 35) | def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
method _reset_parameters (line 66) | def _reset_parameters(self):
method forward (line 82) | def forward(self, query, reference_points, input_flatten, input_spatia...
FILE: llava/model/semsam/body/encoder/ops/setup.py
function get_extensions (line 26) | def get_extensions():
FILE: llava/model/semsam/body/encoder/ops/src/cpu/ms_deform_attn_cpu.cpp
function ms_deform_attn_cpu_forward (line 22) | at::Tensor
function ms_deform_attn_cpu_backward (line 34) | std::vector<at::Tensor>
FILE: llava/model/semsam/body/encoder/ops/src/ms_deform_attn.h
function im2col_step (line 32) | int im2col_step)
FILE: llava/model/semsam/body/encoder/ops/src/vision.cpp
function PYBIND11_MODULE (line 18) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: llava/model/semsam/body/encoder/ops/test.py
function check_forward_equal_with_pytorch_double (line 35) | def check_forward_equal_with_pytorch_double():
function check_forward_equal_with_pytorch_float (line 51) | def check_forward_equal_with_pytorch_float():
function check_gradient_numerical (line 66) | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_...
FILE: llava/model/semsam/body/encoder/registry.py
function register_encoder (line 3) | def register_encoder(fn):
function model_entrypoints (line 9) | def model_entrypoints(model_name):
function is_model (line 12) | def is_model(model_name):
FILE: llava/model/semsam/body/encoder/transformer_encoder_fpn.py
class BasePixelDecoder (line 22) | class BasePixelDecoder(nn.Module):
method __init__ (line 23) | def __init__(
method from_config (line 112) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
method forward_features (line 123) | def forward_features(self, features):
method forward (line 145) | def forward(self, features, targets=None):
class TransformerEncoderOnly (line 151) | class TransformerEncoderOnly(nn.Module):
method __init__ (line 152) | def __init__(
method _reset_parameters (line 175) | def _reset_parameters(self):
method forward (line 180) | def forward(self, src, mask, pos_embed):
class TransformerEncoderPixelDecoder (line 193) | class TransformerEncoderPixelDecoder(BasePixelDecoder):
method __init__ (line 195) | def __init__(
method from_config (line 262) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
method forward_features (line 276) | def forward_features(self, features):
method forward (line 304) | def forward(self, features, targets=None):
function get_transformer_encoder_fpn (line 312) | def get_transformer_encoder_fpn(cfg, input_shape):
FILE: llava/model/semsam/body/openseed_head.py
class MaskDINOHead (line 21) | class MaskDINOHead(nn.Module):
method __init__ (line 23) | def __init__(
method from_config (line 56) | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], lang_enco...
method forward (line 78) | def forward(self, features, mask=None, targets=None, target_queries=No...
method layers (line 81) | def layers(self, features, mask=None,targets=None, target_queries=None...
function get_maskdino_head (line 94) | def get_maskdino_head(cfg, input_shape, lang_encoder, extra):
FILE: llava/model/semsam/body/registry.py
function register_body (line 4) | def register_body(fn):
function model_entrypoints (line 10) | def model_entrypoints(model_name):
function is_model (line 13) | def is_model(model_name):
FILE: llava/model/semsam/body/transformer_blocks.py
class Transformer (line 19) | class Transformer(nn.Module):
method __init__ (line 20) | def __init__(
method _reset_parameters (line 56) | def _reset_parameters(self):
method forward (line 61) | def forward(self, src, mask, query_embed, pos_embed):
class TransformerEncoder (line 78) | class TransformerEncoder(nn.Module):
method __init__ (line 79) | def __init__(self, encoder_layer, num_layers, norm=None):
method forward (line 85) | def forward(
class TransformerDecoder (line 105) | class TransformerDecoder(nn.Module):
method __init__ (line 106) | def __init__(self, decoder_layer, num_layers, norm=None, return_interm...
method forward (line 113) | def forward(
class TransformerEncoderLayer (line 154) | class TransformerEncoderLayer(nn.Module):
method __init__ (line 155) | def __init__(
method with_pos_embed (line 179) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
method forward_post (line 182) | def forward_post(
method forward_pre (line 201) | def forward_pre(
method forward (line 219) | def forward(
class TransformerDecoderLayer (line 231) | class TransformerDecoderLayer(nn.Module):
method __init__ (line 232) | def __init__(
method with_pos_embed (line 259) | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
method forward_post (line 262) | def forward_post(
method forward_pre (line 293) | def forward_pre(
method forward (line 324) | def forward(
function _get_clones (line 358) | def _get_clones(module, N):
function _get_activation_fn (line 362) | def _get_activation_fn(activation):
FILE: llava/model/semsam/language/LangEncoder/build.py
function build_lang_encoder (line 10) | def build_lang_encoder(config_encoder, tokenizer, verbose, **kwargs):
function build_tokenizer (line 19) | def build_tokenizer(config_encoder):
FILE: llava/model/semsam/language/LangEncoder/registry.py
function register_lang_encoder (line 4) | def register_lang_encoder(fn):
function lang_encoders (line 13) | def lang_encoders(model_name):
function is_lang_encoder (line 17) | def is_lang_encoder(model_name):
FILE: llava/model/semsam/language/LangEncoder/transformer.py
class LayerNorm (line 21) | class LayerNorm(nn.Module):
method __init__ (line 22) | def __init__(self, hidden_size, eps=1e-12):
method forward (line 30) | def forward(self, x):
class QuickGELU (line 39) | class QuickGELU(nn.Module):
method forward (line 40) | def forward(self, x: torch.Tensor):
class ResidualAttentionBlock (line 44) | class ResidualAttentionBlock(nn.Module):
method __init__ (line 45) | def __init__(self,
method attention (line 63) | def attention(self, x: torch.Tensor, key_padding_mask: torch.Tensor = ...
method forward (line 75) | def forward(self, x: torch.Tensor, key_padding_mask: torch.Tensor = No...
class Transformer (line 81) | class Transformer(nn.Module):
method __init__ (line 82) | def __init__(self,
method dim_out (line 119) | def dim_out(self):
method build_attention_mask (line 122) | def build_attention_mask(self):
method _init_weights (line 130) | def _init_weights(self, m):
method load_pretrained (line 142) | def load_pretrained(self, pretrained='', pretrained_layers=[], verbose...
method no_weight_decay (line 188) | def no_weight_decay(self):
method forward (line 194) | def forward(self, input_ids, attention_mask=None):
function lang_encoder (line 210) | def lang_encoder(config_encoder, tokenizer, verbose, **kwargs):
FILE: llava/model/semsam/language/build.py
function build_language_encoder (line 5) | def build_language_encoder(config, **kwargs):
FILE: llava/model/semsam/language/encoder.py
class LanguageEncoder (line 13) | class LanguageEncoder(nn.Module):
method __init__ (line 16) | def __init__(
method from_config (line 33) | def from_config(cls, cfg):
method get_text_embeddings (line 54) | def get_text_embeddings(self, class_names, name='default', is_eval=Fal...
method forward_language (line 109) | def forward_language(self, texts, norm=True):
method compute_similarity (line 123) | def compute_similarity(self, v_emb, name='default'):
function get_language_model (line 131) | def get_language_model(cfg, **kwargs):
FILE: llava/model/semsam/language/fixencoder.py
class LanguageEncoder (line 13) | class LanguageEncoder(nn.Module):
method __init__ (line 16) | def __init__(
method from_config (line 33) | def from_config(cls, cfg):
method get_text_embeddings (line 54) | def get_text_embeddings(self, class_names, name='default', is_eval=Fal...
method forward_language (line 109) | def forward_language(self, texts, norm=True):
method compute_similarity (line 124) | def compute_similarity(self, v_emb, name='default'):
function get_language_model (line 132) | def get_language_model(cfg, **kwargs):
FILE: llava/model/semsam/language/llama_encoder.py
class ModelArguments (line 65) | class ModelArguments:
class DataArguments (line 80) | class DataArguments:
class TrainingArguments (line 91) | class TrainingArguments(transformers.TrainingArguments):
function safe_save_model_for_hf_trainer (line 105) | def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
function smart_tokenizer_and_embedding_resize (line 118) | def smart_tokenizer_and_embedding_resize(
function _tokenize_fn (line 143) | def _tokenize_fn(strings: Sequence[str],
function _mask_targets (line 170) | def _mask_targets(target, tokenized_lens, speakers):
function _add_speaker_and_signal (line 181) | def _add_speaker_and_signal(header, source, get_conversation=True):
function preprocess_multimodal (line 202) | def preprocess_multimodal(
function preprocess (line 223) | def preprocess(
class SupervisedDataset (line 253) | class SupervisedDataset(Dataset):
method __init__ (line 256) | def __init__(self, data_path: str,
method __len__ (line 269) | def __len__(self):
method __getitem__ (line 272) | def __getitem__(self, i) -> Dict[str, torch.Tensor]:
class LazySupervisedDataset (line 276) | class LazySupervisedDataset(Dataset):
method __init__ (line 279) | def __init__(self, data_path: str,
method __len__ (line 291) | def __len__(self):
method __getitem__ (line 294) | def __getitem__(self, i) -> Dict[str, torch.Tensor]:
class DataCollatorForSupervisedDataset (line 348) | class DataCollatorForSupervisedDataset(object):
method __call__ (line 353) | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
function make_supervised_data_module (line 379) | def make_supervised_data_module(tokenizer: transformers.PreTrainedTokeni...
function setup (line 402) | def setup(config_file):
function get_language_model (line 415) | def get_language_model(cfg, **kwargs):
function train (line 430) | def train():
FILE: llava/model/semsam/language/loss.py
function is_dist_initialized (line 13) | def is_dist_initialized():
function get_world_size (line 16) | def get_world_size():
function get_rank (line 21) | def get_rank():
function all_gather_grad (line 26) | def all_gather_grad(x):
function vl_multilabel_contrastive_loss (line 34) | def vl_multilabel_contrastive_loss(image_feat, text_feat, temperature=1):
function vl_contrastive_loss (line 93) | def vl_contrastive_loss(image_feat, text_feat, temperature=1):
function all_gather_pickle (line 112) | def all_gather_pickle(data, device):
function all_gather_arbitary_tensor (line 154) | def all_gather_arbitary_tensor(tensor):
function ql_contrastive_loss (line 165) | def ql_contrastive_loss(image_feat, text_feat, temperature=1):
function vl_similarity (line 178) | def vl_similarity(image_feat, text_feat, temperature=1):
function ql_multi_contrastive_loss (line 184) | def ql_multi_contrastive_loss(image_feat, text_feat, text_hash, temperat...
function image_text_contrastive_loss_queue (line 209) | def image_text_contrastive_loss_queue(image_feat_inp, text_feat_inp, lan...
FILE: llava/model/semsam/language/misc.py
function vl_similarity (line 11) | def vl_similarity(image_feat, text_feat, temperature=1):
function get_tag (line 17) | def get_tag(tokenized, tags):
function get_noun_phrase (line 27) | def get_noun_phrase(tokenized):
function text_noun_with_prompt_all (line 56) | def text_noun_with_prompt_all(text, phrase_prob=0.0, append_text=True):
FILE: llava/model/semsam/language/modeling_llama_os.py
function _make_causal_mask (line 41) | def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, p...
function _expand_mask (line 56) | def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Option...
class LlamaRMSNorm (line 70) | class LlamaRMSNorm(nn.Module):
method __init__ (line 71) | def __init__(self, hidden_size, eps=1e-6):
method forward (line 79) | def forward(self, hidden_states):
class LlamaRotaryEmbedding (line 90) | class LlamaRotaryEmbedding(torch.nn.Module):
method __init__ (line 91) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
method forward (line 105) | def forward(self, x, seq_len=None):
function rotate_half (line 122) | def rotate_half(x):
function apply_rotary_pos_emb (line 129) | def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0):
class LlamaMLP (line 137) | class LlamaMLP(nn.Module):
method __init__ (line 138) | def __init__(
method forward (line 150) | def forward(self, x):
class LlamaAttention (line 154) | class LlamaAttention(nn.Module):
method __init__ (line 157) | def __init__(
method _shape (line 194) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
method forward (line 197) | def forward(
class LlamaDecoderLayer (line 266) | class LlamaDecoderLayer(nn.Module):
method __init__ (line 267) | def __init__(self, config: LlamaConfig):
method forward (line 282) | def forward(
class LlamaPreTrainedModel (line 356) | class LlamaPreTrainedModel(PreTrainedModel):
method _init_weights (line 363) | def _init_weights(self, module):
method _set_gradient_checkpointing (line 374) | def _set_gradient_checkpointing(self, module, value=False):
class LlamaModel (line 443) | class LlamaModel(LlamaPreTrainedModel):
method __init__ (line 451) | def __init__(self, config: LlamaConfig):
method get_input_embeddings (line 472) | def get_input_embeddings(self):
method get_output_embeddings (line 475) | def get_output_embeddings(self):
method set_input_embeddings (line 478) | def set_input_embeddings(self, value):
method _prepare_decoder_attention_mask (line 482) | def _prepare_decoder_attention_mask(self, attention_mask, input_shape,...
method find_pattern_list (line 503) | def find_pattern_list(self, pattern, src):
method forward (line 518) | def forward(
class LlamaForCausalLM (line 783) | class LlamaForCausalLM(LlamaPreTrainedModel):
method __init__ (line 786) | def __init__(self, config):
method get_input_embeddings (line 795) | def get_input_embeddings(self):
method set_input_embeddings (line 798) | def set_input_embeddings(self, value):
method get_output_embeddings (line 801) | def get_output_embeddings(self):
method set_output_embeddings (line 804) | def set_output_embeddings(self, new_embeddings):
method set_decoder (line 807) | def set_decoder(self, decoder):
method get_decoder (line 810) | def get_decoder(self):
method forward (line 814) | def forward(
method prepare_inputs_for_generation (line 958) | def prepare_inputs_for_generation(
method _reorder_cache (line 981) | def _reorder_cache(past_key_values, beam_idx):
class LlamaForSequenceClassification (line 1003) | class LlamaForSequenceClassification(LlamaPreTrainedModel):
method __init__ (line 1006) | def __init__(self, config):
method get_input_embeddings (line 1015) | def get_input_embeddings(self):
method set_input_embeddings (line 1018) | def set_input_embeddings(self, value):
method forward (line 1022) | def forward(
FILE: llava/model/semsam/language/registry.py
function register_model (line 3) | def register_model(fn):
function model_entrypoints (line 9) | def model_entrypoints(model_name):
function is_model (line 12) | def is_model(model_name):
FILE: llava/model/semsam/language/vlpencoder.py
class LanguageEncoder (line 19) | class LanguageEncoder(nn.Module):
method __init__ (line 22) | def __init__(
method from_config (line 46) | def from_config(cls, cfg):
method get_text_embeddings (line 70) | def get_text_embeddings(self, class_names, name='default', is_eval=Fal...
method get_text_token_embeddings (line 127) | def get_text_token_embeddings(self, txts, name='default', token=False,...
method forward_language (line 142) | def forward_language(self, texts, norm=True):
method forward_language_token (line 156) | def forward_language_token(self, texts, norm=False):
method compute_similarity (line 174) | def compute_similarity(self, v_emb, name='default', fake=False):
function get_language_model (line 184) | def get_language_model(cfg, **kwargs):
FILE: llava/model/semsam/modules/attention.py
function multi_head_attention_forward (line 13) | def multi_head_attention_forward(
class _LinearWithBias (line 324) | class _LinearWithBias(nn.Linear):
method __init__ (line 327) | def __init__(self, in_features: int, out_features: int) -> None:
class MultiheadAttention (line 331) | class MultiheadAttention(nn.Module):
method __init__ (line 364) | def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bi...
method _reset_parameters (line 403) | def _reset_parameters(self):
method __setstate__ (line 419) | def __setstate__(self, state):
method forward (line 426) | def forward(self, query: Tensor, key: Tensor, value: Tensor, key_paddi...
FILE: llava/model/semsam/modules/criterion_id_llm.py
function sigmoid_focal_loss (line 24) | def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, ...
function dice_loss (line 52) | def dice_loss(
function iou_score_loss (line 78) | def iou_score_loss(inputs, targets):
function sigmoid_ce_loss (line 88) | def sigmoid_ce_loss(
function calculate_uncertainty (line 116) | def calculate_uncertainty(logits):
class SetCriterionLLM (line 133) | class SetCriterionLLM(nn.Module):
method __init__ (line 140) | def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses,
method loss_labels_ce (line 184) | def loss_labels_ce(self, outputs, targets, indices, num_masks):
method loss_labels (line 202) | def loss_labels(self, outputs, targets, indices, num_boxes, log=True, ...
method loss_labels_part (line 239) | def loss_labels_part(self, outputs, targets, indices, num_boxes, log=T...
method loss_boxes_o365 (line 273) | def loss_boxes_o365(self, outputs, targets, indices, num_boxes, layer_...
method loss_boxes (line 299) | def loss_boxes(self, outputs, targets, indices, num_boxes):
method loss_boxes_panoptic (line 337) | def loss_boxes_panoptic(self, outputs, targets, indices, num_boxes):
method loss_masks (line 362) | def loss_masks(self, outputs, targets, indices, num_masks):
method loss_labels_o365 (line 461) | def loss_labels_o365(self, outputs, targets, indices, num_boxes, log=T...
method prep_for_dn (line 489) | def prep_for_dn(self, mask_dict):
method _get_src_permutation_idx (line 500) | def _get_src_permutation_idx(self, indices):
method _get_tgt_permutation_idx (line 506) | def _get_tgt_permutation_idx(self, indices):
method get_loss (line 512) | def get_loss(self, loss, outputs, targets, indices, num_masks):
method forward (line 524) | def forward(self, outputs, targets, mask_dict=None, task='sam', extra=...
method __repr__ (line 630) | def __repr__(self):
FILE: llava/model/semsam/modules/hooks.py
class HookBase (line 13) | class HookBase:
method before_train (line 50) | def before_train(self):
method after_train (line 56) | def after_train(self):
method before_step (line 62) | def before_step(self):
method after_step (line 68) | def after_step(self):
method state_dict (line 74) | def state_dict(self):
class CallbackHook (line 129) | class CallbackHook(HookBase):
method __init__ (line 134) | def __init__(self, *, before_train=None, after_train=None, before_step...
method before_train (line 143) | def before_train(self):
method after_train (line 147) | def after_train(self):
method before_step (line 155) | def before_step(self):
method after_step (line 159) | def after_step(self):
class IterationTimer (line 164) | class IterationTimer(HookBase):
method __init__ (line 176) | def __init__(self, warmup_iter=3):
method before_train (line 187) | def before_train(self):
method after_train (line 192) | def after_train(self):
method before_step (line 218) | def before_step(self):
method after_step (line 222) | def after_step(self):
class PeriodicWriter (line 236) | class PeriodicWriter(HookBase):
method __init__ (line 244) | def __init__(self, writers, period=20):
method after_step (line 255) | def after_step(self):
method after_train (line 262) | def after_train(self):
class PeriodicCheckpointer (line 270) | class PeriodicCheckpointer(_PeriodicCheckpointer, HookBase):
method before_train (line 281) | def before_train(self):
method after_step (line 284) | def after_step(self):
class BestCheckpointer (line 289) | class BestCheckpointer(HookBase):
method __init__ (line 297) | def __init__(
method _update_best (line 330) | def _update_best(self, val, iteration):
method _best_checking (line 337) | def _best_checking(self):
method after_step (line 370) | def after_step(self):
method after_train (line 380) | def after_train(self):
class LRScheduler (line 386) | class LRScheduler(HookBase):
method __init__ (line 392) | def __init__(self, optimizer=None, scheduler=None):
method before_train (line 405) | def before_train(self):
method get_best_param_group_id (line 417) | def get_best_param_group_id(optimizer):
method after_step (line 435) | def after_step(self):
method scheduler (line 441) | def scheduler(self):
method state_dict (line 444) | def state_dict(self):
method load_state_dict (line 449) | def load_state_dict(self, state_dict):
class TorchProfiler (line 456) | class TorchProfiler(HookBase):
method __init__ (line 474) | def __init__(self, enable_predicate, output_dir, *, activities=None, s...
method before_step (line 489) | def before_step(self):
method after_step (line 514) | def after_step(self):
class AutogradProfiler (line 536) | class AutogradProfiler(TorchProfiler):
method __init__ (line 559) | def __init__(self, enable_predicate, output_dir, *, use_cuda=True):
method before_step (line 573) | def before_step(self):
class EvalHook (line 581) | class EvalHook(HookBase):
method __init__ (line 588) | def __init__(self, eval_period, eval_function, eval_after_train=True):
method _do_eval (line 607) | def _do_eval(self):
method after_step (line 630) | def after_step(self):
method after_train (line 637) | def after_train(self):
class PreciseBN (line 646) | class PreciseBN(HookBase):
method __init__ (line 656) | def __init__(self, period, model, data_loader, num_iter):
method after_step (line 685) | def after_step(self):
method update_stats (line 691) | def update_stats(self):
class TorchMemoryStats (line 718) | class TorchMemoryStats(HookBase):
method __init__ (line 723) | def __init__(self, period=20, max_runs=10):
method after_step (line 735) | def after_step(self):
FILE: llava/model/semsam/modules/matcher.py
function batch_dice_loss (line 22) | def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
function batch_sigmoid_ce_loss (line 45) | def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
class HungarianMatcher (line 77) | class HungarianMatcher(nn.Module):
method __init__ (line 85) | def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_d...
method memory_efficient_forward (line 108) | def memory_efficient_forward(self, outputs, targets, cost=["cls", "box...
method grounding_forward (line 196) | def grounding_forward(self, outputs, targets, extra):
method caption_forward_womask (line 259) | def caption_forward_womask(self, outputs, targets, extra):
method caption_forward_wmask (line 295) | def caption_forward_wmask(self, outputs, targets, extra):
method forward (line 365) | def forward(self, outputs, targets, cost=["cls", "box", "mask"], mode=...
method __repr__ (line 396) | def __repr__(self, _repr_indent=4):
FILE: llava/model/semsam/modules/point_features.py
function point_sample (line 21) | def point_sample(input, point_coords, **kwargs):
function generate_regular_grid_point_coords (line 47) | def generate_regular_grid_point_coords(R, side_size, device):
function get_uncertain_point_coords_with_randomness (line 65) | def get_uncertain_point_coords_with_randomness(
function get_uncertain_point_coords_on_grid (line 121) | def get_uncertain_point_coords_on_grid(uncertainty_map, num_points):
function point_sample_fine_grained_features (line 148) | def point_sample_fine_grained_features(features_list, feature_scales, bo...
function get_point_coords_wrt_image (line 194) | def get_point_coords_wrt_image(boxes_coords, point_coords):
function sample_point_labels (line 221) | def sample_point_labels(instances, point_coords):
FILE: llava/model/semsam/modules/position_encoding.py
class PositionEmbeddingSine (line 12) | class PositionEmbeddingSine(nn.Module):
method __init__ (line 18) | def __init__(self, num_pos_feats=64, temperature=10000, normalize=Fals...
method forward (line 29) | def forward(self, x, mask=None):
method __repr__ (line 54) | def __repr__(self, _repr_indent=4):
FILE: llava/model/semsam/modules/postprocessing.py
function detector_postprocess (line 9) | def detector_postprocess(
function bbox_postprocess (line 77) | def bbox_postprocess(result, input_size, img_size, output_height, output...
function sem_seg_postprocess (line 99) | def sem_seg_postprocess(result, img_size, output_height, output_width):
FILE: llava/model/semsam/utils/box_ops.py
function box_cxcywh_to_xyxy (line 9) | def box_cxcywh_to_xyxy(x):
function box_xyxy_to_cxcywh (line 16) | def box_xyxy_to_cxcywh(x):
function box_xywh_to_xyxy (line 22) | def box_xywh_to_xyxy(x):
function box_iou (line 29) | def box_iou(boxes1, boxes2):
function generalized_box_iou (line 45) | def generalized_box_iou(boxes1, boxes2):
function masks_to_boxes (line 69) | def masks_to_boxes(masks):
FILE: llava/model/semsam/utils/config.py
function configurable (line 7) | def configurable(init_func=None, *, from_config=None):
function _called_with_cfg (line 95) | def _called_with_cfg(*args, **kwargs):
function _get_args_from_config (line 112) | def _get_args_from_config(from_config_func, *args, **kwargs):
FILE: llava/model/semsam/utils/misc.py
function get_iou (line 25) | def get_iou(gt_masks, pred_masks, ignore_label=-1):
function _max_by_axis (line 34) | def _max_by_axis(the_list):
class NestedTensor (line 42) | class NestedTensor(object):
method __init__ (line 43) | def __init__(self, tensors, mask: Optional[Tensor]):
method to (line 47) | def to(self, device):
method decompose (line 58) | def decompose(self):
method __repr__ (line 61) | def __repr__(self):
function nested_tensor_from_tensor_list (line 64) | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
function _collate_and_pad_divisibility (line 106) | def _collate_and_pad_divisibility(tensor_list: list, div=32):
function _onnx_nested_tensor_from_tensor_list (line 140) | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> N...
function is_dist_avail_and_initialized (line 169) | def is_dist_avail_and_initialized():
function get_class_names (line 177) | def get_class_names(name, background=True):
FILE: llava/model/utils.py
function auto_upgrade (line 4) | def auto_upgrade(config):
FILE: llava/serve/cli.py
function load_image (line 18) | def load_image(image_file):
function main (line 27) | def main(args):
FILE: llava/serve/controller.py
class DispatchMethod (line 28) | class DispatchMethod(Enum):
method from_str (line 33) | def from_str(cls, name):
class WorkerInfo (line 43) | class WorkerInfo:
function heart_beat_controller (line 51) | def heart_beat_controller(controller):
class Controller (line 57) | class Controller:
method __init__ (line 58) | def __init__(self, dispatch_method: str):
method register_worker (line 69) | def register_worker(self, worker_name: str, check_heart_beat: bool,
method get_worker_status (line 88) | def get_worker_status(self, worker_name: str):
method remove_worker (line 101) | def remove_worker(self, worker_name: str):
method refresh_all_workers (line 104) | def refresh_all_workers(self):
method list_models (line 112) | def list_models(self):
method get_worker_address (line 120) | def get_worker_address(self, model_name: str):
method receive_heart_beat (line 173) | def receive_heart_beat(self, worker_name: str, queue_length: int):
method remove_stable_workers_by_expiration (line 183) | def remove_stable_workers_by_expiration(self):
method worker_api_generate_stream (line 193) | def worker_api_generate_stream(self, params):
method worker_api_get_status (line 220) | def worker_api_get_status(self):
function register_worker (line 243) | async def register_worker(request: Request):
function refresh_all_workers (line 251) | async def refresh_all_workers():
function list_models (line 256) | async def list_models():
function get_worker_address (line 262) | async def get_worker_address(request: Request):
function receive_heart_beat (line 269) | async def receive_heart_beat(request: Request):
function worker_api_generate_stream (line 277) | async def worker_api_generate_stream(request: Request):
function worker_api_get_status (line 284) | async def worker_api_get_status(request: Request):
FILE: llava/serve/gradio_web_server.py
function get_conv_log_filename (line 32) | def get_conv_log_filename():
function get_model_list (line 38) | def get_model_list():
function load_demo (line 58) | def load_demo(url_params, request: gr.Request):
function load_demo_refresh_model_list (line 78) | def load_demo_refresh_model_list(request: gr.Request):
function vote_last_response (line 92) | def vote_last_response(state, vote_type, model_selector, request: gr.Req...
function upvote_last_response (line 104) | def upvote_last_response(state, model_selector, request: gr.Request):
function downvote_last_response (line 110) | def downvote_last_response(state, model_selector, request: gr.Request):
function flag_last_response (line 116) | def flag_last_response(state, model_selector, request: gr.Request):
function regenerate (line 122) | def regenerate(state, image_process_mode, request: gr.Request):
function clear_history (line 132) | def clear_history(request: gr.Request):
function add_text (line 138) | def add_text(state, text, image, image_process_mode, request: gr.Request):
function http_bot (line 165) | def http_bot(state, model_selector, temperature, top_p, max_new_tokens, ...
function build_demo (line 310) | def build_demo(embed_mode):
FILE: llava/serve/test_message.py
function main (line 9) | def main():
FILE: llava/train/llama_flash_attn_monkey_patch.py
function forward (line 19) | def forward(
function _prepare_decoder_attention_mask (line 107) | def _prepare_decoder_attention_mask(
function replace_llama_attn_with_flash_attn (line 114) | def replace_llama_attn_with_flash_attn():
FILE: llava/train/llava_trainer.py
function maybe_zero_3 (line 8) | def maybe_zero_3(param, ignore_status=False, name=None):
function get_mm_adapter_state_maybe_zero_3 (line 22) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
class LLaVATrainer (line 28) | class LLaVATrainer(Trainer):
method _save_checkpoint (line 30) | def _save_checkpoint(self, model, trial, metrics=None):
method _save (line 51) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
FILE: llava/train/llava_trainer_gd.py
function maybe_zero_3 (line 9) | def maybe_zero_3(param, ignore_status=False, name=None):
function get_mm_adapter_state_maybe_zero_3 (line 23) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
class TrainerLLavaGD (line 29) | class TrainerLLavaGD(Trainer):
method __init__ (line 114) | def __init__(
method add_callback (line 508) | def add_callback(self, callback):
method pop_callback (line 519) | def pop_callback(self, callback):
method remove_callback (line 535) | def remove_callback(self, callback):
method _move_model_to_device (line 546) | def _move_model_to_device(self, model, device):
method _set_signature_columns_if_needed (line 552) | def _set_signature_columns_if_needed(self):
method _remove_unused_columns (line 560) | def _remove_unused_columns(self, dataset: "datasets.Dataset", descript...
method _get_collator_with_removed_columns (line 586) | def _get_collator_with_removed_columns(
method _get_train_sampler (line 604) | def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
method get_train_dataloader (line 629) | def get_train_dataloader(self) -> DataLoader:
method get_train_dataloaderd2 (line 663) | def get_train_dataloaderd2(self) -> DataLoader:
method _get_eval_sampler (line 666) | def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.u...
method get_eval_dataloader (line 688) | def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) ...
method get_test_dataloader (line 722) | def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
method create_optimizer_and_scheduler (line 754) | def create_optimizer_and_scheduler(self, num_training_steps: int):
method create_optimizer (line 770) | def create_optimizer(self):
method get_optimizer_cls_and_kwargs (line 876) | def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any...
method create_scheduler (line 998) | def create_scheduler(self, num_training_steps: int, optimizer: torch.o...
method num_examples (line 1016) | def num_examples(self, dataloader: DataLoader) -> int:
method _hp_search_setup (line 1030) | def _hp_search_setup(self, trial: Union["optuna.Trial", Dict[str, Any]]):
method _report_to_hp_search (line 1077) | def _report_to_hp_search(self, trial: Union["optuna.Trial", Dict[str, ...
method _tune_save_checkpoint (line 1095) | def _tune_save_checkpoint(self):
method call_model_init (line 1108) | def call_model_init(self, trial=None):
method torch_jit_model_eval (line 1122) | def torch_jit_model_eval(self, model, dataloader, training=False):
method ipex_optimize_model (line 1165) | def ipex_optimize_model(self, model, training=False, dtype=torch.float...
method _wrap_model (line 1188) | def _wrap_model(self, model, training=True, dataloader=None):
method train (line 1319) | def train(
method _inner_training_loop (line 1403) | def _inner_training_loop(
method _get_output_dir (line 1842) | def _get_output_dir(self, trial):
method _load_from_checkpoint (line 1862) | def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
method _load_best_model (line 1957) | def _load_best_model(self):
method _issue_warnings_after_load (line 2044) | def _issue_warnings_after_load(self, load_result):
method _maybe_log_save_evaluate (line 2057) | def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignor...
method _load_rng_state (line 2110) | def _load_rng_state(self, checkpoint):
method _save_checkpoint (line 2151) | def _save_checkpoint(self, model, trial, metrics=None):
method _load_optimizer_and_scheduler (line 2272) | def _load_optimizer_and_scheduler(self, checkpoint):
method hyperparameter_search (line 2350) | def hyperparameter_search(
method log (line 2425) | def log(self, logs: Dict[str, float]) -> None:
method _prepare_input (line 2442) | def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torc...
method _prepare_inputs (line 2460) | def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]])...
method compute_loss_context_manager (line 2476) | def compute_loss_context_manager(self):
method autocast_smart_context_manager (line 2482) | def autocast_smart_context_manager(self, cache_enabled: Optional[bool]...
method training_step (line 2498) | def training_step(self, model: nn.Module, inputs: Dict[str, Union[torc...
method compute_loss (line 2542) | def compute_loss(self, model, inputs, return_outputs=False):
method is_local_process_zero (line 2574) | def is_local_process_zero(self) -> bool:
method is_world_process_zero (line 2581) | def is_world_process_zero(self) -> bool:
method save_model (line 2593) | def save_model(self, output_dir: Optional[str] = None, _internal_call:...
method _save_tpu (line 2648) | def _save_tpu(self, output_dir: Optional[str] = None):
method _save (line 2676) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
method store_flos (line 2710) | def store_flos(self):
method _sorted_checkpoints (line 2721) | def _sorted_checkpoints(
method _rotate_checkpoints (line 2745) | def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None:
method evaluate (line 2770) | def evaluate(
method predict (line 2841) | def predict(
method evaluation_loop (line 2903) | def evaluation_loop(
method _nested_gather (line 3114) | def _nested_gather(self, tensors, name=None):
method prediction_step (line 3133) | def prediction_step(
method floating_point_ops (line 3238) | def floating_point_ops(self, inputs: Dict[str, Union[torch.Tensor, Any...
method init_git_repo (line 3256) | def init_git_repo(self, at_init: bool = False):
method create_model_card (line 3303) | def create_model_card(
method _push_from_checkpoint (line 3359) | def _push_from_checkpoint(self, checkpoint_folder):
method push_to_hub (line 3406) | def push_to_hub(self, commit_message: Optional[str] = "End of training...
method prediction_loop (line 3466) | def prediction_loop(
method _gather_and_numpify (line 3617) | def _gather_and_numpify(self, tensors, name):
method _add_sm_patterns_to_gitignore (line 3633) | def _add_sm_patterns_to_gitignore(self) -> None:
method create_accelerator_and_postprocess (line 3672) | def create_accelerator_and_postprocess(self):
class LLaVATrainer (line 3707) | class LLaVATrainer(TrainerLLavaGD):
method _save_checkpoint (line 3709) | def _save_checkpoint(self, model, trial, metrics=None):
method _save (line 3730) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
FILE: llava/train/llava_trainer_joint_train.py
function maybe_zero_3 (line 13) | def maybe_zero_3(param, ignore_status=False, name=None):
function get_mm_adapter_state_maybe_zero_3 (line 27) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
class DataCollatorForSupervisedDatasetEmpty (line 33) | class DataCollatorForSupervisedDatasetEmpty(object):
method __call__ (line 38) | def __call__(self, instances: Sequence[Dict]):
class TrainerLLavaGD (line 66) | class TrainerLLavaGD(Trainer):
method __init__ (line 151) | def __init__(
method add_callback (line 545) | def add_callback(self, callback):
method pop_callback (line 556) | def pop_callback(self, callback):
method remove_callback (line 572) | def remove_callback(self, callback):
method _move_model_to_device (line 583) | def _move_model_to_device(self, model, device):
method _set_signature_columns_if_needed (line 589) | def _set_signature_columns_if_needed(self):
method _remove_unused_columns (line 597) | def _remove_unused_columns(self, dataset: "datasets.Dataset", descript...
method _get_collator_with_removed_columns (line 623) | def _get_collator_with_removed_columns(
method _get_train_sampler (line 641) | def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
method get_train_dataloader (line 666) | def get_train_dataloader(self) -> DataLoader:
method get_train_dataloaderd2 (line 701) | def get_train_dataloaderd2(self) -> DataLoader:
method _get_eval_sampler (line 705) | def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.u...
method get_eval_dataloader (line 727) | def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) ...
method get_test_dataloader (line 761) | def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
method create_optimizer_and_scheduler (line 793) | def create_optimizer_and_scheduler(self, num_training_steps: int):
method create_optimizer (line 809) | def create_optimizer(self):
method get_optimizer_cls_and_kwargs (line 944) | def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any...
method create_scheduler (line 1066) | def create_scheduler(self, num_training_steps: int, optimizer: torch.o...
method num_examples (line 1084) | def num_examples(self, dataloader: DataLoader) -> int:
method _hp_search_setup (line 1098) | def _hp_search_setup(self, trial: Union["optuna.Trial", Dict[str, Any]]):
method _report_to_hp_search (line 1145) | def _report_to_hp_search(self, trial: Union["optuna.Trial", Dict[str, ...
method _tune_save_checkpoint (line 1163) | def _tune_save_checkpoint(self):
method call_model_init (line 1176) | def call_model_init(self, trial=None):
method torch_jit_model_eval (line 1190) | def torch_jit_model_eval(self, model, dataloader, training=False):
method ipex_optimize_model (line 1233) | def ipex_optimize_model(self, model, training=False, dtype=torch.float...
method _wrap_model (line 1256) | def _wrap_model(self, model, training=True, dataloader=None):
method train (line 1387) | def train(
method _inner_training_loop (line 1471) | def _inner_training_loop(
method _get_output_dir (line 1910) | def _get_output_dir(self, trial):
method _load_from_checkpoint (line 1930) | def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
method _load_best_model (line 2025) | def _load_best_model(self):
method _issue_warnings_after_load (line 2112) | def _issue_warnings_after_load(self, load_result):
method _maybe_log_save_evaluate (line 2125) | def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignor...
method _load_rng_state (line 2178) | def _load_rng_state(self, checkpoint):
method _save_checkpoint (line 2219) | def _save_checkpoint(self, model, trial, metrics=None):
method _load_optimizer_and_scheduler (line 2340) | def _load_optimizer_and_scheduler(self, checkpoint):
method hyperparameter_search (line 2418) | def hyperparameter_search(
method log (line 2493) | def log(self, logs: Dict[str, float]) -> None:
method _prepare_input (line 2510) | def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torc...
method _prepare_inputs (line 2528) | def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]])...
method compute_loss_context_manager (line 2544) | def compute_loss_context_manager(self):
method autocast_smart_context_manager (line 2550) | def autocast_smart_context_manager(self, cache_enabled: Optional[bool]...
method training_step (line 2566) | def training_step(self, model: nn.Module, inputs: Dict[str, Union[torc...
method compute_loss (line 2610) | def compute_loss(self, model, inputs, return_outputs=False):
method is_local_process_zero (line 2642) | def is_local_process_zero(self) -> bool:
method is_world_process_zero (line 2649) | def is_world_process_zero(self) -> bool:
method save_model (line 2661) | def save_model(self, output_dir: Optional[str] = None, _internal_call:...
method _save_tpu (line 2716) | def _save_tpu(self, output_dir: Optional[str] = None):
method _save (line 2744) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
method store_flos (line 2778) | def store_flos(self):
method _sorted_checkpoints (line 2789) | def _sorted_checkpoints(
method _rotate_checkpoints (line 2813) | def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None:
method evaluate (line 2838) | def evaluate(
method predict (line 2909) | def predict(
method evaluation_loop (line 2971) | def evaluation_loop(
method _nested_gather (line 3182) | def _nested_gather(self, tensors, name=None):
method prediction_step (line 3201) | def prediction_step(
method floating_point_ops (line 3306) | def floating_point_ops(self, inputs: Dict[str, Union[torch.Tensor, Any...
method init_git_repo (line 3324) | def init_git_repo(self, at_init: bool = False):
method create_model_card (line 3371) | def create_model_card(
method _push_from_checkpoint (line 3427) | def _push_from_checkpoint(self, checkpoint_folder):
method push_to_hub (line 3474) | def push_to_hub(self, commit_message: Optional[str] = "End of training...
method prediction_loop (line 3534) | def prediction_loop(
method _gather_and_numpify (line 3685) | def _gather_and_numpify(self, tensors, name):
method _add_sm_patterns_to_gitignore (line 3701) | def _add_sm_patterns_to_gitignore(self) -> None:
method create_accelerator_and_postprocess (line 3740) | def create_accelerator_and_postprocess(self):
class LLaVATrainer (line 3775) | class LLaVATrainer(TrainerLLavaGD):
method _save_checkpoint (line 3777) | def _save_checkpoint(self, model, trial, metrics=None):
method _save (line 3798) | def _save(self, output_dir: Optional[str] = None, state_dict=None):
FILE: llava/train/train.py
function rank0_print (line 43) | def rank0_print(*args):
class ModelArguments (line 49) | class ModelArguments:
class DataArguments (line 63) | class DataArguments:
class TrainingArguments (line 74) | class TrainingArguments(transformers.TrainingArguments):
function maybe_zero_3 (line 108) | def maybe_zero_3(param, ignore_status=False, name=None):
function get_peft_state_maybe_zero_3 (line 123) | def get_peft_state_maybe_zero_3(named_params, bias):
function get_peft_state_non_lora_maybe_zero_3 (line 148) | def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only...
function get_mm_adapter_state_maybe_zero_3 (line 156) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
function find_all_linear_names (line 162) | def find_all_linear_names(model):
function safe_save_model_for_hf_trainer (line 176) | def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
function smart_tokenizer_and_embedding_resize (line 195) | def smart_tokenizer_and_embedding_resize(
function _tokenize_fn (line 220) | def _tokenize_fn(strings: Sequence[str],
function _mask_targets (line 247) | def _mask_targets(target, tokenized_lens, speakers):
function _add_speaker_and_signal (line 258) | def _add_speaker_and_signal(header, source, get_conversation=True):
function preprocess_multimodal (line 279) | def preprocess_multimodal(
function preprocess_llama_2 (line 303) | def preprocess_llama_2(
function preprocess_v1 (line 385) | def preprocess_v1(
function preprocess_mpt (line 467) | def preprocess_mpt(
function preprocess_plain (line 533) | def preprocess_plain(
function preprocess (line 555) | def preprocess(
class LazySupervisedDataset (line 603) | class LazySupervisedDataset(Dataset):
method __init__ (line 606) | def __init__(self, data_path: str,
method __len__ (line 617) | def __len__(self):
method __getitem__ (line 620) | def __getitem__(self, i) -> Dict[str, torch.Tensor]:
class DataCollatorForSupervisedDataset (line 686) | class DataCollatorForSupervisedDataset(object):
method __call__ (line 691) | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
function make_supervised_data_module (line 719) | def make_supervised_data_module(tokenizer: transformers.PreTrainedTokeni...
function train (line 731) | def train():
FILE: llava/train/train_grounding_1st.py
function rank0_print (line 44) | def rank0_print(*args):
class ModelArguments (line 50) | class ModelArguments:
class DataArguments (line 68) | class DataArguments:
class TrainingArguments (line 79) | class TrainingArguments(transformers.TrainingArguments):
function maybe_zero_3 (line 117) | def maybe_zero_3(param, ignore_status=False, name=None):
function get_peft_state_maybe_zero_3 (line 132) | def get_peft_state_maybe_zero_3(named_params, bias):
function get_peft_state_non_lora_maybe_zero_3 (line 157) | def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only...
function get_mm_adapter_state_maybe_zero_3 (line 165) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
function find_all_linear_names (line 171) | def find_all_linear_names(model):
function safe_save_model_for_hf_trainer (line 185) | def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
function smart_tokenizer_and_embedding_resize (line 204) | def smart_tokenizer_and_embedding_resize(
function _tokenize_fn (line 229) | def _tokenize_fn(strings: Sequence[str],
function _mask_targets (line 256) | def _mask_targets(target, tokenized_lens, speakers):
function _add_speaker_and_signal (line 267) | def _add_speaker_and_signal(header, source, get_conversation=True):
function preprocess_multimodal (line 288) | def preprocess_multimodal(
function preprocess_llama_2 (line 312) | def preprocess_llama_2(
function preprocess_v1 (line 394) | def preprocess_v1(
function preprocess_mpt (line 476) | def preprocess_mpt(
function preprocess_plain (line 542) | def preprocess_plain(
function preprocess (line 564) | def preprocess(
class LazySupervisedDataset (line 612) | class LazySupervisedDataset(Dataset):
method __init__ (line 615) | def __init__(self, data_path: str,
method __len__ (line 626) | def __len__(self):
method __getitem__ (line 629) | def __getitem__(self, i) -> Dict[str, torch.Tensor]:
class DataCollatorForSupervisedDataset (line 684) | class DataCollatorForSupervisedDataset(object):
method __call__ (line 689) | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
function make_supervised_data_module (line 717) | def make_supervised_data_module(tokenizer: transformers.PreTrainedTokeni...
function setup (line 730) | def setup(args):
function train (line 743) | def train():
FILE: llava/train/train_joint_1st.py
function rank0_print (line 44) | def rank0_print(*args):
class ModelArguments (line 50) | class ModelArguments:
class DataArguments (line 68) | class DataArguments:
class TrainingArguments (line 79) | class TrainingArguments(transformers.TrainingArguments):
function maybe_zero_3 (line 116) | def maybe_zero_3(param, ignore_status=False, name=None):
function get_peft_state_maybe_zero_3 (line 131) | def get_peft_state_maybe_zero_3(named_params, bias):
function get_peft_state_non_lora_maybe_zero_3 (line 156) | def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only...
function get_mm_adapter_state_maybe_zero_3 (line 164) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
function find_all_linear_names (line 170) | def find_all_linear_names(model):
function safe_save_model_for_hf_trainer (line 184) | def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
function smart_tokenizer_and_embedding_resize (line 203) | def smart_tokenizer_and_embedding_resize(
function _tokenize_fn (line 228) | def _tokenize_fn(strings: Sequence[str],
function _mask_targets (line 255) | def _mask_targets(target, tokenized_lens, speakers):
function _add_speaker_and_signal (line 266) | def _add_speaker_and_signal(header, source, get_conversation=True):
function preprocess_multimodal (line 287) | def preprocess_multimodal(
function preprocess_llama_2 (line 311) | def preprocess_llama_2(
function preprocess_v1 (line 393) | def preprocess_v1(
function preprocess_mpt (line 475) | def preprocess_mpt(
function preprocess_plain (line 541) | def preprocess_plain(
function preprocess (line 563) | def preprocess(
class LazySupervisedDataset (line 611) | class LazySupervisedDataset(Dataset):
method __init__ (line 614) | def __init__(self, data_path: str,
method __len__ (line 625) | def __len__(self):
method __getitem__ (line 628) | def __getitem__(self, i) -> Dict[str, torch.Tensor]:
class DataCollatorForSupervisedDataset (line 683) | class DataCollatorForSupervisedDataset(object):
method __call__ (line 688) | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
class DataCollatorForSupervisedDatasetEmpty (line 716) | class DataCollatorForSupervisedDatasetEmpty(object):
method __call__ (line 721) | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
function make_supervised_data_module (line 749) | def make_supervised_data_module(tokenizer: transformers.PreTrainedTokeni...
function setup (line 763) | def setup(args):
function train (line 776) | def train():
FILE: llava/train/train_joint_2st.py
function rank0_print (line 44) | def rank0_print(*args):
class ModelArguments (line 50) | class ModelArguments:
class DataArguments (line 68) | class DataArguments:
class TrainingArguments (line 79) | class TrainingArguments(transformers.TrainingArguments):
function maybe_zero_3 (line 116) | def maybe_zero_3(param, ignore_status=False, name=None):
function get_peft_state_maybe_zero_3 (line 131) | def get_peft_state_maybe_zero_3(named_params, bias):
function get_peft_state_non_lora_maybe_zero_3 (line 156) | def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only...
function get_mm_adapter_state_maybe_zero_3 (line 164) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
function find_all_linear_names (line 170) | def find_all_linear_names(model):
function safe_save_model_for_hf_trainer (line 184) | def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
function smart_tokenizer_and_embedding_resize (line 203) | def smart_tokenizer_and_embedding_resize(
function _tokenize_fn (line 228) | def _tokenize_fn(strings: Sequence[str],
function _mask_targets (line 255) | def _mask_targets(target, tokenized_lens, speakers):
function _add_speaker_and_signal (line 266) | def _add_speaker_and_signal(header, source, get_conversation=True):
function preprocess_multimodal (line 287) | def preprocess_multimodal(
function preprocess_llama_2 (line 311) | def preprocess_llama_2(
function preprocess_v1 (line 393) | def preprocess_v1(
function preprocess_mpt (line 475) | def preprocess_mpt(
function preprocess_plain (line 541) | def preprocess_plain(
function preprocess (line 563) | def preprocess(
class LazySupervisedDataset (line 611) | class LazySupervisedDataset(Dataset):
method __init__ (line 614) | def __init__(self, data_path: str,
method __len__ (line 625) | def __len__(self):
method __getitem__ (line 628) | def __getitem__(self, i) -> Dict[str, torch.Tensor]:
class DataCollatorForSupervisedDataset (line 683) | class DataCollatorForSupervisedDataset(object):
method __call__ (line 688) | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
class DataCollatorForSupervisedDatasetEmpty (line 716) | class DataCollatorForSupervisedDatasetEmpty(object):
method __call__ (line 721) | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
function make_supervised_data_module (line 749) | def make_supervised_data_module(tokenizer: transformers.PreTrainedTokeni...
function setup (line 763) | def setup(args):
function train (line 776) | def train():
FILE: llava/train/train_joint_2st_interactive_refcoco_coco_instruction.py
function rank0_print (line 44) | def rank0_print(*args):
class ModelArguments (line 50) | class ModelArguments:
class DataArguments (line 70) | class DataArguments:
class TrainingArguments (line 81) | class TrainingArguments(transformers.TrainingArguments):
function maybe_zero_3 (line 118) | def maybe_zero_3(param, ignore_status=False, name=None):
function get_peft_state_maybe_zero_3 (line 133) | def get_peft_state_maybe_zero_3(named_params, bias):
function get_peft_state_non_lora_maybe_zero_3 (line 158) | def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only...
function get_mm_adapter_state_maybe_zero_3 (line 166) | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
function find_all_linear_names (line 172) | def find_all_linear_names(model):
function safe_save_model_for_hf_trainer (line 186) | def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
function smart_tokenizer_and_embedding_resize (line 205) | def smart_tokenizer_and_embedding_resize(
function _tokenize_fn (line 230) | def _tokenize_fn(strings: Sequence[str],
function _mask_targets (line 257) | def _mask_targets(target, tokenized_lens, speakers):
function _add_speaker_and_signal (line 268) | def _add_speaker_and_signal(header, source, get_conversation=True):
function preprocess_multimodal (line 289) | def preprocess_multimodal(
function preprocess_llama_2 (line 313) | def preprocess_llama_2(
function preprocess_v1 (line 395) | def preprocess_v1(
function preprocess_mpt (line 477) | def preprocess_mpt(
function preprocess_plain (line 543) | def preprocess_plain(
function preprocess (line 565) | def preprocess(
class LazySupervisedDataset (line 613) | class LazySupervisedDataset(Dataset):
method __init__ (line 616) | def __init__(self, data_path: str,
method __len__ (line 627) | def __len__(self):
method __getitem__ (line 630) | def __getitem__(self, i) -> Dict[str, torch.Tensor]:
class DataCollatorForSupervisedDataset (line 685) | class DataCollatorForSupervisedDataset(object):
method __call__ (line 690) | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
class DataCollatorForSupervisedDatasetEmpty (line 718) | class DataCollatorForSupervisedDatasetEmpty(object):
method __call__ (line 723) | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
function make_supervised_data_module (line 751) | def make_supervised_data_module(tokenizer: transformers.PreTrainedTokeni...
function setup (line 765) | def setup(args):
function train (line 782) | def train():
FILE: llava/utils.py
function build_logger (line 17) | def build_logger(logger_name, logger_filename):
class StreamToLogger (line 60) | class StreamToLogger(object):
method __init__ (line 64) | def __init__(self, logger, log_level=logging.INFO):
method __getattr__ (line 70) | def __getattr__(self, attr):
method write (line 73) | def write(self, buf):
method flush (line 87) | def flush(self):
function disable_torch_init (line 93) | def disable_torch_init():
function violates_moderation (line 102) | def violates_moderation(text):
function pretty_print_semaphore (line 123) | def pretty_print_semaphore(semaphore):
FILE: scripts/convert_sqa_to_llava.py
function convert_to_llava (line 8) | def convert_to_llava(base_dir, split, prompt_format="QCM-LEPA"):
function convert_to_jsonl (line 49) | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"):
function main (line 83) | def main(task, **kwargs):
FILE: scripts/convert_sqa_to_llava_base_prompt.py
function get_question_text (line 1) | def get_question_text(problem):
function get_context_text (line 6) | def get_context_text(problem, use_caption):
function get_choice_text (line 15) | def get_choice_text(probelm, options):
function get_answer (line 25) | def get_answer(problem, options):
function get_lecture_text (line 29) | def get_lecture_text(problem):
function get_solution_text (line 35) | def get_solution_text(problem):
function create_one_example_chatbot (line 41) | def create_one_example_chatbot(format, question, context, choice, answer...
function create_one_example (line 106) | def create_one_example(format, question, context, choice, answer, lectur...
function create_one_example_gpt4 (line 162) | def create_one_example_gpt4(format, question, context, choice, answer, l...
function build_prompt_chatbot (line 221) | def build_prompt_chatbot(problems, shot_qids, prompt_format, use_caption...
function build_prompt (line 244) | def build_prompt(problems, shot_qids, test_qid, args):
function build_prompt_gpt4 (line 291) | def build_prompt_gpt4(problems, shot_qids, test_qid, args):
FILE: scripts/merge_lora_weights.py
function merge_lora (line 6) | def merge_lora(args):
FILE: utils/Config.py
class CfgNode (line 3) | class CfgNode(_CfgNode):
method merge_from_dict (line 23) | def merge_from_dict(self, dict):
FILE: utils/arguments.py
function load_config_dict_to_opt (line 9) | def load_config_dict_to_opt(opt, config_dict):
function load_opt_from_config_files (line 30) | def load_opt_from_config_files(conf_files):
function load_opt_command (line 50) | def load_opt_command(args):
function save_opt_to_json (line 93) | def save_opt_to_json(opt, conf_file):
function save_opt_to_yaml (line 98) | def save_opt_to_yaml(opt, conf_file):
FILE: utils/dist.py
function init_distributed_mode (line 19) | def init_distributed_mode(args):
FILE: utils/misc.py
function hook_metadata (line 11) | def hook_metadata(metadata, name):
function hook_opt (line 16) | def hook_opt(model, name):
function hook_switcher (line 23) | def hook_switcher(model, name):
class AverageMeter (line 44) | class AverageMeter(object):
method __init__ (line 46) | def __init__(self):
method reset (line 49) | def reset(self):
method update (line 55) | def update(self, val, n=1, decay=0):
FILE: utils/model.py
function register_norm_module (line 25) | def register_norm_module(cls):
function align_and_update_state_dicts (line 29) | def align_and_update_state_dicts(model_state_dict, ckpt_state_dict):
FILE: utils/nms.py
function matrix_nms (line 4) | def matrix_nms(seg_masks, cate_labels, cate_scores, kernel='gaussian', s...
function matrix_nms_merge (line 26) | def matrix_nms_merge(seg_masks, cate_labels, cate_scores, kernel='gaussi...
function multiclass_nms (line 61) | def multiclass_nms(multi_bboxes,
FILE: utils/prompt_engineering.py
function get_prompt_templates (line 4) | def get_prompt_templates():
function prompt_engineering (line 90) | def prompt_engineering(classnames, topk=1, suffix='.'):
FILE: utils/utils.py
function slprint (line 4) | def slprint(x, name='x'):
FILE: utils/visualizer.py
class ColorMode (line 37) | class ColorMode(Enum):
class GenericMask (line 59) | class GenericMask:
method __init__ (line 67) | def __init__(self, mask_or_polygons, height, width):
method mask (line 99) | def mask(self):
method polygons (line 105) | def polygons(self):
method has_holes (line 111) | def has_holes(self):
method mask_to_polygons (line 119) | def mask_to_polygons(self, mask):
method polygons_to_mask (line 138) | def polygons_to_mask(self, polygons):
method area (line 143) | def area(self):
method bbox (line 146) | def bbox(self):
class _PanopticPrediction (line 155) | class _PanopticPrediction:
method __init__ (line 160) | def __init__(self, panoptic_seg, segments_info, metadata=None):
method non_empty_mask (line 196) | def non_empty_mask(self):
method semantic_masks (line 212) | def semantic_masks(self):
method instance_masks (line 220) | def instance_masks(self):
function _create_text_labels (line 230) | def _create_text_labels(classes, scores, class_names, is_crowd=None):
class VisImage (line 257) | class VisImage:
method __init__ (line 258) | def __init__(self, img, scale=1.0):
method _setup_figure (line 269) | def _setup_figure(self, img):
method reset_image (line 294) | def reset_image(self, img):
method save (line 302) | def save(self, filepath):
method get_image (line 310) | def get_image(self):
class Visualizer (line 331) | class Visualizer:
method __init__ (line 357) | def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=Co...
method draw_instance_predictions (line 384) | def draw_instance_predictions(self, predictions):
method draw_sem_seg (line 447) | def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.7):
method draw_panoptic_seg (line 483) | def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshol...
method draw_dataset_dict (line 549) | def draw_dataset_dict(self, dic):
method overlay_instances (line 618) | def overlay_instances(
method overlay_rotated_instances (line 760) | def overlay_rotated_instances(self, boxes=None, labels=None, assigned_...
method draw_and_connect_keypoints (line 798) | def draw_and_connect_keypoints(self, keypoints):
method draw_text (line 861) | def draw_text(
method draw_box (line 908) | def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
method draw_rotated_box_with_label (line 942) | def draw_rotated_box_with_label(
method draw_circle (line 997) | def draw_circle(self, circle_coord, color, radius=3):
method draw_line (line 1015) | def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=No...
method draw_binary_mask (line 1046) | def draw_binary_mask(
method draw_soft_mask (line 1097) | def draw_soft_mask(self, soft_mask, color=None, *, text=None, alpha=0.5):
method draw_polygon (line 1125) | def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
method _jitter (line 1161) | def _jitter(self, color):
method _create_grayscale_image (line 1181) | def _create_grayscale_image(self, mask=None):
method _change_color_brightness (line 1192) | def _change_color_brightness(self, color, brightness_factor):
method _convert_boxes (line 1217) | def _convert_boxes(self, boxes):
method _convert_masks (line 1226) | def _convert_masks(self, masks_or_polygons):
method _draw_text_in_mask (line 1249) | def _draw_text_in_mask(self, binary_mask, text, color):
method _convert_keypoints (line 1267) | def _convert_keypoints(self, keypoints):
method get_output (line 1273) | def get_output(self):
Condensed preview — 257 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,810K chars).
[
{
"path": ".gitignore",
"chars": 496,
"preview": "*.err\n*.out\n*.pyc\nwandb\n/data_preparation/vis_results/\n/data_preparation/vis_results_new/\n/LLAVA_Stage1_Pretrained/\n/wor"
},
{
"path": "LICENSE",
"chars": 11357,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "README.md",
"chars": 4645,
"preview": "🌋 LLaVA-Grounding: Grounded Visual Chat with Large Multimodal Models\n========\n\n[[Project Page](https://llava-vl.github.i"
},
{
"path": "configs/openseed/openseed_swint_lang_joint.yaml",
"chars": 13699,
"preview": "# --------------------------------------------------------\n# X-Decoder -- Generalized Decoding for Pixel, Image, and Lan"
},
{
"path": "configs/openseed/openseed_swint_lang_joint_2st.yaml",
"chars": 14424,
"preview": "# --------------------------------------------------------\n# X-Decoder -- Generalized Decoding for Pixel, Image, and Lan"
},
{
"path": "configs/openseed/openseed_swint_lang_joint_2st_visual_prompt.yaml",
"chars": 15232,
"preview": "# --------------------------------------------------------\n# X-Decoder -- Generalized Decoding for Pixel, Image, and Lan"
},
{
"path": "configs/semsam/visual_prompt_encoder.yaml",
"chars": 15082,
"preview": "# --------------------------------------------------------\n# X-Decoder -- Generalized Decoding for Pixel, Image, and Lan"
},
{
"path": "datasets_os/__init__.py",
"chars": 47,
"preview": "from . import registration\nfrom .build import *"
},
{
"path": "datasets_os/build.py",
"chars": 16760,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport os\nimport itertools\nimport logging\nimport copy\nfrom typing imp"
},
{
"path": "datasets_os/custom_dataset_dataloader.py",
"chars": 14335,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Part of the code is from https://github.com/xingyizhou/UniDet/blob/"
},
{
"path": "datasets_os/dataset_mappers/__init__.py",
"chars": 602,
"preview": "\nfrom .coco_panoptic_interactive_dataset_mapper import COCOPanopticInteractiveDatasetMapper\nfrom .flickr_instance_new_ba"
},
{
"path": "datasets_os/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py",
"chars": 7266,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/facebookresearch/de"
},
{
"path": "datasets_os/dataset_mappers/coco_instruct_grounding_dataset_interactive_mapper.py",
"chars": 13268,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/facebookresearch/de"
},
{
"path": "datasets_os/dataset_mappers/coco_instruct_grounding_dataset_mapper.py",
"chars": 13186,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/facebookresearch/de"
},
{
"path": "datasets_os/dataset_mappers/coco_interactive_panoptic_new_baseline_dataset_mapper.py",
"chars": 6667,
"preview": "# ------------------------------------------------------------------------\n# Copyright (c) 2022 IDEA. All Rights Reserve"
},
{
"path": "datasets_os/dataset_mappers/coco_panoptic_interactive_dataset_mapper.py",
"chars": 13884,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/facebookresearch/de"
},
{
"path": "datasets_os/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py",
"chars": 6034,
"preview": "# ------------------------------------------------------------------------\n# Copyright (c) 2022 IDEA. All Rights Reserve"
},
{
"path": "datasets_os/dataset_mappers/flickr_instance_new_baseline_dataset_mapper.py",
"chars": 13853,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/facebookresearch/de"
},
{
"path": "datasets_os/dataset_mappers/flickr_instance_new_baseline_dataset_mapper_.py",
"chars": 11425,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/facebookresearch/de"
},
{
"path": "datasets_os/dataset_mappers/flickr_instance_new_baseline_dataset_mapper_end.py",
"chars": 13147,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/facebookresearch/de"
},
{
"path": "datasets_os/dataset_mappers/flickr_new_baseline_dataset_mapper.py",
"chars": 6660,
"preview": "# ------------------------------------------------------------------------\n# Copyright (c) 2022 IDEA. All Rights Reserve"
},
{
"path": "datasets_os/dataset_mappers/inference_mapper_with_gt.py",
"chars": 9860,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport copy\nimport logging\nimport numpy as np\nfrom typing import List"
},
{
"path": "datasets_os/dataset_mappers/llava_dataset_mapper.py",
"chars": 8175,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/facebookresearch/de"
},
{
"path": "datasets_os/dataset_mappers/refcoco_dataset_mapper.py",
"chars": 7351,
"preview": "# --------------------------------------------------------\n# X-Decoder -- Generalized Decoding for Pixel, Image, and Lan"
},
{
"path": "datasets_os/dataset_mappers/vg_instance_new_baseline_dataset_mapper.py",
"chars": 11395,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/facebookresearch/de"
},
{
"path": "datasets_os/refer.py",
"chars": 14909,
"preview": "__author__ = 'licheng'\n\n\"\"\"\nThis interface provides access to four datasets:\n1) refclef\n2) refcoco\n3) refcoco+\n4) refcoc"
},
{
"path": "datasets_os/registration/__init__.py",
"chars": 226,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nfrom . import (\n register_coco_panoptic_annos_grounding_interactiv"
},
{
"path": "datasets_os/registration/register_coco_instruct_grounding_dataset.py",
"chars": 4674,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# --------------------------------------------------------\n# X-Decode"
},
{
"path": "datasets_os/registration/register_coco_panoptic_annos_grounding_interactive.py",
"chars": 7874,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport json\nimport os\nimport collections\n\nfrom detectron2.data import"
},
{
"path": "datasets_os/registration/register_flickr_dataset.py",
"chars": 2836,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# --------------------------------------------------------\n# X-Decode"
},
{
"path": "datasets_os/registration/register_vg_dataset.py",
"chars": 2409,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# --------------------------------------------------------\n# X-Decode"
},
{
"path": "datasets_os/semseg_loader.py",
"chars": 293,
"preview": "from PIL import Image\nimport scipy.io\nimport numpy as np\n\ndef load_semseg(filename, loader_type):\n if loader_type == "
},
{
"path": "docs/MODEL_ZOO.md",
"chars": 602,
"preview": "# LLaVA-Grounding Checkpoints\n\nWe will continuously update the model zoo.\n\n| Model Name | LLM version | Model Config | W"
},
{
"path": "gradio_demo/LLaVA_G_Demo.py",
"chars": 27456,
"preview": "\nimport gradio as gr\nimport os\nimport cv2\n\nimport torch\nimport numpy as np\nfrom llava.eval.LLaVA_G_Eval import Evaluator"
},
{
"path": "gradio_demo/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "llava/__init__.py",
"chars": 41,
"preview": "from .model import LlavaLlamaForCausalLM\n"
},
{
"path": "llava/constants.py",
"chars": 293,
"preview": "CONTROLLER_HEART_BEAT_EXPIRATION = 30\nWORKER_HEART_BEAT_INTERVAL = 15\n\nLOGDIR = \".\"\n\n# Model Constants\nIGNORE_INDEX = -1"
},
{
"path": "llava/conversation.py",
"chars": 15448,
"preview": "import dataclasses\nfrom enum import auto, Enum\nfrom typing import List, Tuple\n\n\nclass SeparatorStyle(Enum):\n \"\"\"Diffe"
},
{
"path": "llava/eval/LLaVA_G_Eval.py",
"chars": 38865,
"preview": "import os\nimport cv2\nimport json\nimport torch\nimport collections\nimport transformers\nimport numpy as np\nfrom llava.model"
},
{
"path": "llava/eval/eval_gpt_review.py",
"chars": 3620,
"preview": "import argparse\nimport json\nimport os\n\nimport openai\nimport tqdm\nimport ray\nimport time\n\nNUM_SECONDS_TO_SLEEP = 3\n\n@ray."
},
{
"path": "llava/eval/eval_gpt_review_bench.py",
"chars": 4297,
"preview": "import argparse\nimport json\nimport os\n\nimport openai\nimport time\n\nNUM_SECONDS_TO_SLEEP = 0.5\nopenai.api_type = \"azure\"\no"
},
{
"path": "llava/eval/eval_gpt_review_visual.py",
"chars": 4452,
"preview": "import argparse\nimport json\nimport os\n\nimport openai\nimport time\n\nNUM_SECONDS_TO_SLEEP = 0.5\nopenai.api_type = \"azure\"\no"
},
{
"path": "llava/eval/eval_gpt_review_visual2.py",
"chars": 4436,
"preview": "import argparse\nimport json\nimport os\n\nimport openai\nimport time\n\nNUM_SECONDS_TO_SLEEP = 0.5\n\nos.environ['OPENAI_API_KEY"
},
{
"path": "llava/eval/eval_science_qa.py",
"chars": 3225,
"preview": "import argparse\nimport json\nimport os\nimport re\nimport random\n\n\ndef get_args():\n parser = argparse.ArgumentParser()\n "
},
{
"path": "llava/eval/eval_science_qa_gpt4.py",
"chars": 3675,
"preview": "import argparse\nimport json\nimport os\nimport re\nimport random\nfrom collections import defaultdict\n\n\ndef get_args():\n "
},
{
"path": "llava/eval/eval_science_qa_gpt4_requery.py",
"chars": 5774,
"preview": "import argparse\nimport json\nimport os\nimport re\nimport random\nfrom collections import defaultdict\n\n\ndef get_args():\n "
},
{
"path": "llava/eval/generate_webpage_data_from_table.py",
"chars": 4088,
"preview": "\"\"\"Generate json file for webpage.\"\"\"\nimport json\nimport os\nimport re\n\n# models = ['llama', 'alpaca', 'gpt35', 'bard']\nm"
},
{
"path": "llava/eval/llava_mapper.py",
"chars": 11008,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/facebookresearch/de"
},
{
"path": "llava/eval/model_qa.py",
"chars": 3285,
"preview": "import argparse\nfrom transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria\nimport torch\nimport os\nim"
},
{
"path": "llava/eval/model_vqa.py",
"chars": 4856,
"preview": "import argparse\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\n\nfrom llava.constants import I"
},
{
"path": "llava/eval/model_vqa_science.py",
"chars": 6059,
"preview": "import argparse\nimport torch\nimport os\nimport json\nfrom tqdm import tqdm\nimport shortuuid\n\nfrom llava.constants import I"
},
{
"path": "llava/eval/qa_baseline_gpt35.py",
"chars": 2345,
"preview": "\"\"\"Generate answers with GPT-3.5\"\"\"\n# Note: you need to be using OpenAI Python v0.27.0 for the code below to work\nimport"
},
{
"path": "llava/eval/run_llava.py",
"chars": 3632,
"preview": "import argparse\nimport torch\n\nfrom llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN"
},
{
"path": "llava/eval/summarize_gpt_review.py",
"chars": 2009,
"preview": "import json\nimport os\nfrom collections import defaultdict\n\nimport numpy as np\n\nimport argparse\n\ndef parse_args():\n pa"
},
{
"path": "llava/eval/webpage/index.html",
"chars": 7664,
"preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n <meta charset=\"UTF-8\">\n <meta name=\"viewport\" content=\"width=device-width"
},
{
"path": "llava/eval/webpage/script.js",
"chars": 9967,
"preview": "// Description: Script for the evaluation webpage.\n\nlet currentQuestionIndex = 1;\n\n// Store the model name mapping for l"
},
{
"path": "llava/eval/webpage/styles.css",
"chars": 1822,
"preview": "body {\n font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;\n background-color: #f8f9fa;\n}\n\n.navbar-dark "
},
{
"path": "llava/mm_utils.py",
"chars": 3641,
"preview": "from PIL import Image\nfrom io import BytesIO\nimport base64\n\nimport torch\nfrom transformers import StoppingCriteria\nfrom "
},
{
"path": "llava/model/__init__.py",
"chars": 288,
"preview": "from .language_model.llava_llama_gd import LlavaLlamaForCausalLM,LlavaLlamaForCausalLM_gd,LlavaLlamaForCausalLM_joint,Ll"
},
{
"path": "llava/model/apply_delta.py",
"chars": 1956,
"preview": "\"\"\"\nUsage:\npython3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --de"
},
{
"path": "llava/model/builder.py",
"chars": 7711,
"preview": "# Copyright 2023 Haotian Liu\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not "
},
{
"path": "llava/model/consolidate.py",
"chars": 914,
"preview": "\"\"\"\nUsage:\npython3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate\n"
},
{
"path": "llava/model/language_model/llava_llama.py",
"chars": 5507,
"preview": "# Copyright 2023 Haotian Liu\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not "
},
{
"path": "llava/model/language_model/llava_llama_gd.py",
"chars": 51300,
"preview": "# Copyright 2023 Haotian Liu\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not "
},
{
"path": "llava/model/language_model/llava_mpt.py",
"chars": 5852,
"preview": "# Copyright 2023 Haotian Liu\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not "
},
{
"path": "llava/model/language_model/mpt/adapt_tokenizer.py",
"chars": 1752,
"preview": "from typing import Union\nfrom transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast\nTokenizer "
},
{
"path": "llava/model/language_model/mpt/attention.py",
"chars": 17692,
"preview": "\"\"\"Attention layers.\"\"\"\nimport math\nimport warnings\nfrom typing import Optional\nimport torch\nimport torch.nn as nn\nfrom "
},
{
"path": "llava/model/language_model/mpt/blocks.py",
"chars": 2551,
"preview": "\"\"\"GPT Blocks used for the GPT Model.\"\"\"\nfrom typing import Dict, Optional, Tuple\nimport torch\nimport torch.nn as nn\nfro"
},
{
"path": "llava/model/language_model/mpt/configuration_mpt.py",
"chars": 9198,
"preview": "\"\"\"A HuggingFace-style model configuration.\"\"\"\nfrom typing import Dict, Optional, Union\nfrom transformers import Pretrai"
},
{
"path": "llava/model/language_model/mpt/custom_embedding.py",
"chars": 305,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch import Tensor\n\nclass SharedEmbedding(nn.Em"
},
{
"path": "llava/model/language_model/mpt/flash_attn_triton.py",
"chars": 28182,
"preview": "\"\"\"\nCopied from https://github.com/HazyResearch/flash-attention/blob/eff9fe6b8076df59d64d7a3f464696738a3c7c24/flash_attn"
},
{
"path": "llava/model/language_model/mpt/hf_prefixlm_converter.py",
"chars": 27219,
"preview": "\"\"\"Converts Huggingface Causal LM to Prefix LM.\n\nConversion does lightweight surgery on a HuggingFace\nCausal LM to conve"
},
{
"path": "llava/model/language_model/mpt/meta_init_context.py",
"chars": 3639,
"preview": "from contextlib import contextmanager\nimport torch\nimport torch.nn as nn\n\n@contextmanager\ndef init_empty_weights(include"
},
{
"path": "llava/model/language_model/mpt/modeling_mpt.py",
"chars": 19710,
"preview": "\"\"\"A simple, flexible implementation of a GPT model.\n\nInspired by https://github.com/karpathy/minGPT/blob/master/mingpt/"
},
{
"path": "llava/model/language_model/mpt/norm.py",
"chars": 2563,
"preview": "import torch\n\ndef _cast_if_autocast_enabled(tensor):\n if torch.is_autocast_enabled():\n if tensor.device.type ="
},
{
"path": "llava/model/language_model/mpt/param_init_fns.py",
"chars": 12556,
"preview": "import math\nimport warnings\nfrom collections.abc import Sequence\nfrom functools import partial\nfrom typing import Option"
},
{
"path": "llava/model/llava_arch.py",
"chars": 52281,
"preview": "# Copyright 2023 Haotian Liu\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not "
},
{
"path": "llava/model/make_delta.py",
"chars": 2257,
"preview": "\"\"\"\nUsage:\npython3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~"
},
{
"path": "llava/model/multimodal_encoder/builder.py",
"chars": 428,
"preview": "from .clip_encoder import CLIPVisionTower\n\n\ndef build_vision_tower(vision_tower_cfg, **kwargs):\n vision_tower = getat"
},
{
"path": "llava/model/multimodal_encoder/clip_encoder.py",
"chars": 2710,
"preview": "import torch\nimport torch.nn as nn\n\nfrom transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig\n\n\ncla"
},
{
"path": "llava/model/openseed/BaseModel.py",
"chars": 2412,
"preview": "import os\nimport logging\n\nimport torch\nimport torch.nn as nn\n\n# from utils.model import align_and_update_state_dicts\n\nlo"
},
{
"path": "llava/model/openseed/__init__.py",
"chars": 148,
"preview": "from __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom .arch"
},
{
"path": "llava/model/openseed/architectures/__init__.py",
"chars": 107,
"preview": "from .openseed_model import *\n# from .openseed_model_decouple_train import *\nfrom .build import build_model"
},
{
"path": "llava/model/openseed/architectures/build.py",
"chars": 297,
"preview": "from .registry import model_entrypoints\nfrom .registry import is_model\n\ndef build_model(config, **kwargs):\n model_nam"
},
{
"path": "llava/model/openseed/architectures/openseed_model.py",
"chars": 37691,
"preview": "# ------------------------------------------------------------------------\n# DINO\n# Copyright (c) 2023 IDEA. All Rights "
},
{
"path": "llava/model/openseed/architectures/openseed_model_decouple_train.py",
"chars": 30237,
"preview": "# ------------------------------------------------------------------------\n# DINO\n# Copyright (c) 2023 IDEA. All Rights "
},
{
"path": "llava/model/openseed/architectures/registry.py",
"chars": 339,
"preview": "_model_entrypoints = {}\n\ndef register_model(fn):\n module_name_split = fn.__module__.split('.')\n model_name = modul"
},
{
"path": "llava/model/openseed/backbone/__init__.py",
"chars": 123,
"preview": "from .build import build_backbone\n\nfrom .focal import *\nfrom .focal_dw import *\nfrom .swin import *\nfrom .backbone impor"
},
{
"path": "llava/model/openseed/backbone/backbone.py",
"chars": 1500,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport torch.nn as nn\n\nfrom detectron2.modeling import ShapeSpec\n\n# f"
},
{
"path": "llava/model/openseed/backbone/build.py",
"chars": 336,
"preview": "from .registry import model_entrypoints\nfrom .registry import is_model\n\nfrom .backbone import *\n\ndef build_backbone(conf"
},
{
"path": "llava/model/openseed/backbone/focal.py",
"chars": 27324,
"preview": "# --------------------------------------------------------\n# FocalNet for Semantic Segmentation\n# Copyright (c) 2022 Mic"
},
{
"path": "llava/model/openseed/backbone/focal_dw.py",
"chars": 31620,
"preview": "# --------------------------------------------------------\n# FocalNet for Semantic Segmentation\n# Copyright (c) 2022 Mic"
},
{
"path": "llava/model/openseed/backbone/registry.py",
"chars": 344,
"preview": "_model_entrypoints = {}\n\n\ndef register_backbone(fn):\n module_name_split = fn.__module__.split('.')\n model_name = m"
},
{
"path": "llava/model/openseed/backbone/swin.py",
"chars": 32695,
"preview": "# --------------------------------------------------------\n# Swin Transformer\n# Copyright (c) 2021 Microsoft\n# Licensed "
},
{
"path": "llava/model/openseed/body/__init__.py",
"chars": 38,
"preview": "from .build import build_openseed_head"
},
{
"path": "llava/model/openseed/body/build.py",
"chars": 364,
"preview": "from .registry import model_entrypoints\nfrom .registry import is_model\nfrom .openseed_head import *\n\n\ndef build_openseed"
},
{
"path": "llava/model/openseed/body/decoder/__init__.py",
"chars": 105,
"preview": "from .build import build_decoder\nfrom .openseed_decoder import *\nfrom .openseed_decoder_decouple import *"
},
{
"path": "llava/model/openseed/body/decoder/build.py",
"chars": 325,
"preview": "from .registry import model_entrypoints\nfrom .registry import is_model\n\n\ndef build_decoder(config, *args, **kwargs):\n "
},
{
"path": "llava/model/openseed/body/decoder/modules.py",
"chars": 7368,
"preview": "from typing import Optional\n\nimport torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\n\nfrom timm.m"
},
{
"path": "llava/model/openseed/body/decoder/openseed_decoder.py",
"chars": 27966,
"preview": "# ------------------------------------------------------------------------\n# DINO\n# Copyright (c) 2023 IDEA. All Rights "
},
{
"path": "llava/model/openseed/body/decoder/openseed_decoder_decouple.py",
"chars": 37294,
"preview": "# ------------------------------------------------------------------------\n# DINO\n# Copyright (c) 2022 IDEA. All Rights "
},
{
"path": "llava/model/openseed/body/decoder/registry.py",
"chars": 341,
"preview": "_model_entrypoints = {}\n\ndef register_decoder(fn):\n module_name_split = fn.__module__.split('.')\n model_name = mod"
},
{
"path": "llava/model/openseed/body/decoder/utils/__init__.py",
"chars": 20,
"preview": "from .utils import *"
},
{
"path": "llava/model/openseed/body/decoder/utils/dino_decoder.py",
"chars": 13627,
"preview": "# ------------------------------------------------------------------------\n# DINO\n# Copyright (c) 2022 IDEA. All Rights "
},
{
"path": "llava/model/openseed/body/decoder/utils/utils.py",
"chars": 4805,
"preview": "import torch\nimport copy\nfrom torch import nn, Tensor\nimport os\n\nimport math\nimport torch.nn.functional as F\nfrom torch "
},
{
"path": "llava/model/openseed/body/encoder/__init__.py",
"chars": 32,
"preview": "from .build import build_encoder"
},
{
"path": "llava/model/openseed/body/encoder/build.py",
"chars": 394,
"preview": "from .registry import model_entrypoints\nfrom .registry import is_model\n\nfrom .transformer_encoder_fpn import *\nfrom .enc"
},
{
"path": "llava/model/openseed/body/encoder/encoder_deform.py",
"chars": 19624,
"preview": "# ------------------------------------------------------------------------\n# DINO\n# Copyright (c) 2023 IDEA. All Rights "
},
{
"path": "llava/model/openseed/body/encoder/ops/functions/__init__.py",
"chars": 734,
"preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
},
{
"path": "llava/model/openseed/body/encoder/ops/functions/ms_deform_attn_func.py",
"chars": 3728,
"preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
},
{
"path": "llava/model/openseed/body/encoder/ops/make.sh",
"chars": 736,
"preview": "#!/usr/bin/env bash\n# ------------------------------------------------------------------------------------------------\n#"
},
{
"path": "llava/model/openseed/body/encoder/ops/modules/__init__.py",
"chars": 720,
"preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
},
{
"path": "llava/model/openseed/body/encoder/ops/modules/ms_deform_attn.py",
"chars": 7133,
"preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
},
{
"path": "llava/model/openseed/body/encoder/ops/setup.py",
"chars": 3038,
"preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
},
{
"path": "llava/model/openseed/body/encoder/ops/src/cpu/ms_deform_attn_cpu.cpp",
"chars": 1399,
"preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
},
{
"path": "llava/model/openseed/body/encoder/ops/src/cpu/ms_deform_attn_cpu.h",
"chars": 1282,
"preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
},
{
"path": "llava/model/openseed/body/encoder/ops/src/cuda/ms_deform_attn_cuda.cu",
"chars": 7459,
"preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
},
{
"path": "llava/model/openseed/body/encoder/ops/src/cuda/ms_deform_attn_cuda.h",
"chars": 1283,
"preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
},
{
"path": "llava/model/openseed/body/encoder/ops/src/cuda/ms_deform_im2col_cuda.cuh",
"chars": 54837,
"preview": "/*!\n**************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 Se"
},
{
"path": "llava/model/openseed/body/encoder/ops/src/ms_deform_attn.h",
"chars": 1981,
"preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
},
{
"path": "llava/model/openseed/body/encoder/ops/src/vision.cpp",
"chars": 942,
"preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
},
{
"path": "llava/model/openseed/body/encoder/ops/test.py",
"chars": 4223,
"preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
},
{
"path": "llava/model/openseed/body/encoder/registry.py",
"chars": 342,
"preview": "_model_entrypoints = {}\n\ndef register_encoder(fn):\n module_name_split = fn.__module__.split('.')\n model_name = mod"
},
{
"path": "llava/model/openseed/body/encoder/transformer_encoder_fpn.py",
"chars": 12591,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport logging\nimport numpy as np\nfrom typing import Callable, Dict, "
},
{
"path": "llava/model/openseed/body/openseed_head.py",
"chars": 3587,
"preview": "# ------------------------------------------------------------------------\n# Copyright (c) 2022 IDEA. All Rights Reserve"
},
{
"path": "llava/model/openseed/body/registry.py",
"chars": 339,
"preview": "_model_entrypoints = {}\n\n\ndef register_body(fn):\n module_name_split = fn.__module__.split('.')\n model_name = modul"
},
{
"path": "llava/model/openseed/body/transformer_blocks.py",
"chars": 11944,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from: https://github.com/facebookresearch/d"
},
{
"path": "llava/model/openseed/language/LangEncoder/__init__.py",
"chars": 210,
"preview": "from __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom .buil"
},
{
"path": "llava/model/openseed/language/LangEncoder/build.py",
"chars": 1289,
"preview": "import os\n\nfrom transformers import CLIPTokenizer, CLIPTokenizerFast\nfrom transformers import AutoTokenizer\n\nfrom .regis"
},
{
"path": "llava/model/openseed/language/LangEncoder/registry.py",
"chars": 339,
"preview": "_lang_encoders = {}\n\n\ndef register_lang_encoder(fn):\n module_name_split = fn.__module__.split('.')\n model_name = m"
},
{
"path": "llava/model/openseed/language/LangEncoder/transformer.py",
"chars": 8398,
"preview": "from collections import OrderedDict\nfrom typing import Tuple, Union\nimport logging\nimport os\n\nimport numpy as np\nimport "
},
{
"path": "llava/model/openseed/language/__init__.py",
"chars": 120,
"preview": "# from .vlpencoder import *\n# from .encoder import *\n# # from .loss import *\n# from .build import build_language_encoder"
},
{
"path": "llava/model/openseed/language/build.py",
"chars": 317,
"preview": "from .registry import model_entrypoints\nfrom .registry import is_model\n\n\ndef build_language_encoder(config, **kwargs):\n "
},
{
"path": "llava/model/openseed/language/encoder.py",
"chars": 5143,
"preview": "import torch\nfrom torch import nn\nfrom torch.nn import functional as F\n\nfrom timm.models.layers import trunc_normal_\n\nfr"
},
{
"path": "llava/model/openseed/language/registry.py",
"chars": 339,
"preview": "_model_entrypoints = {}\n\ndef register_model(fn):\n module_name_split = fn.__module__.split('.')\n model_name = modul"
},
{
"path": "llava/model/openseed/language/vlpencoder.py",
"chars": 7231,
"preview": "# --------------------------------------------------------\n# X-Decoder -- Generalized Decoding for Pixel, Image, and Lan"
},
{
"path": "llava/model/openseed/modules/__init__.py",
"chars": 165,
"preview": "from .point_features import *\nfrom .position_encoding import *\nfrom .postprocessing import *\nfrom .attention import *\nfr"
},
{
"path": "llava/model/openseed/modules/attention.py",
"chars": 22826,
"preview": "import warnings\nfrom typing import Optional, Tuple\n\nimport torch\nimport torch.nn as nn\nfrom torch import Tensor\nfrom tor"
},
{
"path": "llava/model/openseed/modules/criterion.py",
"chars": 25390,
"preview": "# ------------------------------------------------------------------------\n# DINO\n# Copyright (c) 2023 IDEA. All Rights "
},
{
"path": "llava/model/openseed/modules/matcher.py",
"chars": 10599,
"preview": "# ------------------------------------------------------------------------\n# DINO\n# Copyright (c) 2023 IDEA. All Rights "
},
{
"path": "llava/model/openseed/modules/point_features.py",
"chars": 11822,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport torch\nfrom torch.nn import functional as F\n\nfrom detectron2.la"
},
{
"path": "llava/model/openseed/modules/position_encoding.py",
"chars": 2500,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# # Modified by Bowen Cheng from: https://github.com/facebookresearch"
},
{
"path": "llava/model/openseed/modules/postprocessing.py",
"chars": 4915,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport torch\nfrom torch.nn import functional as F\n\nfrom detectron2.st"
},
{
"path": "llava/model/openseed/utils/__init__.py",
"chars": 41,
"preview": "from .config import *\nfrom .misc import *"
},
{
"path": "llava/model/openseed/utils/box_ops.py",
"chars": 2707,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n\"\"\"\nUtilities for bounding box manipulation and G"
},
{
"path": "llava/model/openseed/utils/config.py",
"chars": 5263,
"preview": "# -*- coding: utf-8 -*-\n# Copyright (c) Facebook, Inc. and its affiliates.\n\nimport functools\nimport inspect\n\ndef configu"
},
{
"path": "llava/model/openseed/utils/misc.py",
"chars": 10059,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from https://github.com/facebookresearch/de"
},
{
"path": "llava/model/semsam/BaseModel.py",
"chars": 1515,
"preview": "import os\nimport logging\n\nimport torch\nimport torch.nn as nn\n\nfrom utils.model import align_and_update_state_dicts\n\nlogg"
},
{
"path": "llava/model/semsam/__init__.py",
"chars": 148,
"preview": "from __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom .arch"
},
{
"path": "llava/model/semsam/architectures/__init__.py",
"chars": 104,
"preview": "from .idino_model_partwhole_all_llm_ref_feats_all_det_pretrainv1 import *\nfrom .build import build_model"
},
{
"path": "llava/model/semsam/architectures/build.py",
"chars": 297,
"preview": "from .registry import model_entrypoints\nfrom .registry import is_model\n\ndef build_model(config, **kwargs):\n model_nam"
},
{
"path": "llava/model/semsam/architectures/idino_model_partwhole_all_llm_ref_feats_all_det_pretrainv1.py",
"chars": 29858,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nfrom typing import Tuple\nimport torch\nfrom torch import nn\nfrom torch"
},
{
"path": "llava/model/semsam/architectures/registry.py",
"chars": 339,
"preview": "_model_entrypoints = {}\n\ndef register_model(fn):\n module_name_split = fn.__module__.split('.')\n model_name = modul"
},
{
"path": "llava/model/semsam/backbone/__init__.py",
"chars": 123,
"preview": "from .build import build_backbone\n\nfrom .focal import *\nfrom .focal_dw import *\nfrom .swin import *\nfrom .backbone impor"
},
{
"path": "llava/model/semsam/backbone/backbone.py",
"chars": 1500,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport torch.nn as nn\n\nfrom detectron2.modeling import ShapeSpec\n\n# f"
},
{
"path": "llava/model/semsam/backbone/build.py",
"chars": 336,
"preview": "from .registry import model_entrypoints\nfrom .registry import is_model\n\nfrom .backbone import *\n\ndef build_backbone(conf"
},
{
"path": "llava/model/semsam/backbone/focal.py",
"chars": 27324,
"preview": "# --------------------------------------------------------\n# FocalNet for Semantic Segmentation\n# Copyright (c) 2022 Mic"
},
{
"path": "llava/model/semsam/backbone/focal_dw.py",
"chars": 31620,
"preview": "# --------------------------------------------------------\n# FocalNet for Semantic Segmentation\n# Copyright (c) 2022 Mic"
},
{
"path": "llava/model/semsam/backbone/registry.py",
"chars": 344,
"preview": "_model_entrypoints = {}\n\n\ndef register_backbone(fn):\n module_name_split = fn.__module__.split('.')\n model_name = m"
},
{
"path": "llava/model/semsam/backbone/swin.py",
"chars": 32767,
"preview": "# --------------------------------------------------------\n# Swin Transformer\n# Copyright (c) 2021 Microsoft\n# Licensed "
},
{
"path": "llava/model/semsam/backbone/swin_new.py",
"chars": 27476,
"preview": "# --------------------------------------------------------\n# Swin Transformer\n# Copyright (c) 2021 Microsoft\n# Licensed "
},
{
"path": "llava/model/semsam/body/__init__.py",
"chars": 38,
"preview": "from .build import build_openseed_head"
},
{
"path": "llava/model/semsam/body/build.py",
"chars": 364,
"preview": "from .registry import model_entrypoints\nfrom .registry import is_model\nfrom .openseed_head import *\n\n\ndef build_openseed"
},
{
"path": "llava/model/semsam/body/decoder/__init__.py",
"chars": 92,
"preview": "from .build import build_decoder\nfrom .idino_decoder_no_iou_token_partwhole_all_llm import *"
},
{
"path": "llava/model/semsam/body/decoder/build.py",
"chars": 325,
"preview": "from .registry import model_entrypoints\nfrom .registry import is_model\n\n\ndef build_decoder(config, *args, **kwargs):\n "
},
{
"path": "llava/model/semsam/body/decoder/idino_decoder_no_iou_token_partwhole_all_llm.py",
"chars": 46898,
"preview": "# ------------------------------------------------------------------------\n# DINO\n# Copyright (c) 2022 IDEA. All Rights "
},
{
"path": "llava/model/semsam/body/decoder/modules.py",
"chars": 7368,
"preview": "from typing import Optional\n\nimport torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\n\nfrom timm.m"
},
{
"path": "llava/model/semsam/body/decoder/registry.py",
"chars": 341,
"preview": "_model_entrypoints = {}\n\ndef register_decoder(fn):\n module_name_split = fn.__module__.split('.')\n model_name = mod"
},
{
"path": "llava/model/semsam/body/decoder/utils/__init__.py",
"chars": 20,
"preview": "from .utils import *"
},
{
"path": "llava/model/semsam/body/decoder/utils/dino_decoder.py",
"chars": 13870,
"preview": "# ------------------------------------------------------------------------\n# DINO\n# Copyright (c) 2022 IDEA. All Rights "
},
{
"path": "llava/model/semsam/body/decoder/utils/utils.py",
"chars": 4805,
"preview": "import torch\nimport copy\nfrom torch import nn, Tensor\nimport os\n\nimport math\nimport torch.nn.functional as F\nfrom torch "
},
{
"path": "llava/model/semsam/body/encoder/__init__.py",
"chars": 32,
"preview": "from .build import build_encoder"
},
{
"path": "llava/model/semsam/body/encoder/build.py",
"chars": 394,
"preview": "from .registry import model_entrypoints\nfrom .registry import is_model\n\nfrom .transformer_encoder_fpn import *\nfrom .enc"
},
{
"path": "llava/model/semsam/body/encoder/encoder_deform.py",
"chars": 19299,
"preview": "# ------------------------------------------------------------------------\n# DINO\n# Copyright (c) 2022 IDEA. All Rights "
},
{
"path": "llava/model/semsam/body/encoder/ops/functions/__init__.py",
"chars": 734,
"preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
},
{
"path": "llava/model/semsam/body/encoder/ops/functions/ms_deform_attn_func.py",
"chars": 3728,
"preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
},
{
"path": "llava/model/semsam/body/encoder/ops/make.sh",
"chars": 736,
"preview": "#!/usr/bin/env bash\n# ------------------------------------------------------------------------------------------------\n#"
},
{
"path": "llava/model/semsam/body/encoder/ops/modules/__init__.py",
"chars": 720,
"preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
},
{
"path": "llava/model/semsam/body/encoder/ops/modules/ms_deform_attn.py",
"chars": 7470,
"preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
},
{
"path": "llava/model/semsam/body/encoder/ops/setup.py",
"chars": 3038,
"preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
},
{
"path": "llava/model/semsam/body/encoder/ops/src/cpu/ms_deform_attn_cpu.cpp",
"chars": 1399,
"preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
},
{
"path": "llava/model/semsam/body/encoder/ops/src/cpu/ms_deform_attn_cpu.h",
"chars": 1282,
"preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
},
{
"path": "llava/model/semsam/body/encoder/ops/src/cuda/ms_deform_attn_cuda.cu",
"chars": 7459,
"preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
},
{
"path": "llava/model/semsam/body/encoder/ops/src/cuda/ms_deform_attn_cuda.h",
"chars": 1283,
"preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
},
{
"path": "llava/model/semsam/body/encoder/ops/src/cuda/ms_deform_im2col_cuda.cuh",
"chars": 54837,
"preview": "/*!\n**************************************************************************\n* Deformable DETR\n* Copyright (c) 2020 Se"
},
{
"path": "llava/model/semsam/body/encoder/ops/src/ms_deform_attn.h",
"chars": 1981,
"preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
},
{
"path": "llava/model/semsam/body/encoder/ops/src/vision.cpp",
"chars": 942,
"preview": "/*!\n**************************************************************************************************\n* Deformable DETR"
},
{
"path": "llava/model/semsam/body/encoder/ops/test.py",
"chars": 4223,
"preview": "# ------------------------------------------------------------------------------------------------\n# Deformable DETR\n# C"
},
{
"path": "llava/model/semsam/body/encoder/registry.py",
"chars": 342,
"preview": "_model_entrypoints = {}\n\ndef register_encoder(fn):\n module_name_split = fn.__module__.split('.')\n model_name = mod"
},
{
"path": "llava/model/semsam/body/encoder/transformer_encoder_fpn.py",
"chars": 12591,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport logging\nimport numpy as np\nfrom typing import Callable, Dict, "
},
{
"path": "llava/model/semsam/body/openseed_head.py",
"chars": 4176,
"preview": "# ------------------------------------------------------------------------\n# Copyright (c) 2022 IDEA. All Rights Reserve"
},
{
"path": "llava/model/semsam/body/registry.py",
"chars": 339,
"preview": "_model_entrypoints = {}\n\n\ndef register_body(fn):\n module_name_split = fn.__module__.split('.')\n model_name = modul"
},
{
"path": "llava/model/semsam/body/transformer_blocks.py",
"chars": 11944,
"preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# Modified by Bowen Cheng from: https://github.com/facebookresearch/d"
},
{
"path": "llava/model/semsam/language/LangEncoder/__init__.py",
"chars": 210,
"preview": "from __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nfrom .buil"
},
{
"path": "llava/model/semsam/language/LangEncoder/build.py",
"chars": 1289,
"preview": "import os\n\nfrom transformers import CLIPTokenizer, CLIPTokenizerFast\nfrom transformers import AutoTokenizer\n\nfrom .regis"
},
{
"path": "llava/model/semsam/language/LangEncoder/registry.py",
"chars": 339,
"preview": "_lang_encoders = {}\n\n\ndef register_lang_encoder(fn):\n module_name_split = fn.__module__.split('.')\n model_name = m"
},
{
"path": "llava/model/semsam/language/LangEncoder/transformer.py",
"chars": 8398,
"preview": "from collections import OrderedDict\nfrom typing import Tuple, Union\nimport logging\nimport os\n\nimport numpy as np\nimport "
},
{
"path": "llava/model/semsam/language/__init__.py",
"chars": 289,
"preview": "# from .vlpencoder import *\n# from .encoder import *\n# from .fixencoder import *\n# from .loss import *\n# from .modeling_"
},
{
"path": "llava/model/semsam/language/build.py",
"chars": 369,
"preview": "from .registry import model_entrypoints\nfrom .registry import is_model\n\n\ndef build_language_encoder(config, **kwargs):\n "
},
{
"path": "llava/model/semsam/language/encoder.py",
"chars": 5143,
"preview": "import torch\nfrom torch import nn\nfrom torch.nn import functional as F\n\nfrom timm.models.layers import trunc_normal_\n\nfr"
}
]
// ... and 57 more files (download for full content)
About this extraction
This page contains the full source code of the UX-Decoder/LLaVA-Grounding GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 257 files (2.6 MB), approximately 691.7k tokens, and a symbol index with 1862 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.