Repository: bytedance/dplm
Branch: main
Commit: 8a2e15e53416
Files: 319
Total size: 12.3 MB
Directory structure:
gitextract_uly26sb1/
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── analysis/
│ ├── TMalign
│ ├── TMalign.cpp
│ ├── TMscore
│ ├── TMscore.cpp
│ ├── cal_plddt_dir.py
│ ├── cal_tmscore.py
│ ├── motif_analysis.ipynb
│ ├── plddt_calculate.sh
│ ├── plot.ipynb
│ └── uncond_analysis.ipynb
├── configs/
│ ├── callbacks/
│ │ ├── default.yaml
│ │ ├── fixedbb.yaml
│ │ ├── lm.yaml
│ │ └── structok.yaml
│ ├── config.yaml
│ ├── datamodule/
│ │ ├── cath_4.3.yaml
│ │ ├── pdb.yaml
│ │ ├── tokenized_protein.yaml
│ │ ├── uniref50.yaml
│ │ └── uniref50_hf.yaml
│ ├── experiment/
│ │ ├── base.yaml
│ │ ├── dplm/
│ │ │ ├── cond_dplm_150m.yaml
│ │ │ ├── cond_dplm_3b.yaml
│ │ │ ├── cond_dplm_650m.yaml
│ │ │ ├── dplm_150m.yaml
│ │ │ ├── dplm_150m_ds.yaml
│ │ │ ├── dplm_150m_stage2.yaml
│ │ │ ├── dplm_15b_ds.yaml
│ │ │ ├── dplm_30b_ds.yaml
│ │ │ ├── dplm_3b.yaml
│ │ │ ├── dplm_3b_ds.yaml
│ │ │ ├── dplm_3b_stage2.yaml
│ │ │ ├── dplm_650m.yaml
│ │ │ ├── dplm_650m_ds.yaml
│ │ │ ├── dplm_650m_stage2.yaml
│ │ │ └── mlm_150m.yaml
│ │ ├── dplm2/
│ │ │ ├── dplm2_150m.yaml
│ │ │ ├── dplm2_3b.yaml
│ │ │ ├── dplm2_650m.yaml
│ │ │ ├── dplm2_650m_selfmixup.yaml
│ │ │ └── dplm2_bit_650m.yaml
│ │ └── structok/
│ │ ├── inference/
│ │ │ ├── forward_folding.yaml
│ │ │ ├── inverse_folding.yaml
│ │ │ ├── reconstruction.yaml
│ │ │ ├── unconditional.yaml
│ │ │ └── unconditional_codesign.yaml
│ │ └── structok_lfq_8k_pdb_swissprot_c512.yaml
│ ├── hydra/
│ │ └── default.yaml
│ ├── logger/
│ │ ├── tensorboard.yaml
│ │ └── wandb.yaml
│ ├── paths/
│ │ └── default.yaml
│ ├── test.yaml
│ └── trainer/
│ ├── ddp.yaml
│ ├── ddp_bf16.yaml
│ ├── ddp_fp16.yaml
│ ├── deepspeed_zero2.yaml
│ ├── deepspeed_zero2_bf16.yaml
│ ├── deepspeed_zero2_fp16.yaml
│ ├── deepspeed_zero2_offload.yaml
│ ├── deepspeed_zero3.yaml
│ ├── deepspeed_zero3_bf16.yaml
│ └── default.yaml
├── env.yml
├── generate_dplm.py
├── generate_dplm2.py
├── requirements.txt
├── run/
│ ├── scaffold_generate_dplm.py
│ └── scaffold_generate_dplm2.py
├── scripts/
│ ├── download_cath.sh
│ ├── download_metadata.sh
│ ├── download_motif_scaffolds.sh
│ ├── download_pdb_swissprot_hf.sh
│ ├── download_uniref50_hf.sh
│ └── install.sh
├── setup.cfg
├── setup.py
├── src/
│ └── byprot/
│ ├── __init__.py
│ ├── datamodules/
│ │ ├── __init__.py
│ │ ├── cath_datamodule.py
│ │ ├── dataset/
│ │ │ ├── __init__.py
│ │ │ ├── cath.py
│ │ │ ├── data_utils.py
│ │ │ ├── tokenized_protein.py
│ │ │ ├── uniref.py
│ │ │ └── uniref_hf.py
│ │ ├── pdb_dataset/
│ │ │ ├── __init__.py
│ │ │ ├── all_atom.py
│ │ │ ├── pdb_datamodule.py
│ │ │ ├── protein.py
│ │ │ ├── residue_constants.py
│ │ │ └── utils.py
│ │ ├── tokenized_protein_datamodule.py
│ │ ├── uniref50.py
│ │ └── uniref50_hf.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── dplm/
│ │ │ ├── __init__.py
│ │ │ ├── dplm.py
│ │ │ ├── dplm_invfold.py
│ │ │ └── modules/
│ │ │ ├── dplm_adapter.py
│ │ │ ├── dplm_modeling_esm.py
│ │ │ └── gvp_transformer_encoder.py
│ │ ├── dplm2/
│ │ │ ├── __init__.py
│ │ │ ├── dplm2.py
│ │ │ ├── dplm2_bit.py
│ │ │ └── modules/
│ │ │ ├── dplm2_bit_modeling_esm.py
│ │ │ └── dplm2_modeling_esm.py
│ │ ├── structok/
│ │ │ ├── modules/
│ │ │ │ ├── ema.py
│ │ │ │ ├── folding_utils/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── categorical_mixture.py
│ │ │ │ │ ├── decoder.py
│ │ │ │ │ ├── esmfold.py
│ │ │ │ │ ├── misc.py
│ │ │ │ │ ├── pretrained.py
│ │ │ │ │ ├── structure_module.py
│ │ │ │ │ ├── tri_self_attn_block.py
│ │ │ │ │ └── trunk.py
│ │ │ │ ├── gvp_encoder.py
│ │ │ │ ├── lfq.py
│ │ │ │ ├── loss.py
│ │ │ │ ├── nn.py
│ │ │ │ └── vqvae.py
│ │ │ └── structok_lfq.py
│ │ └── utils.py
│ ├── modules/
│ │ ├── __init__.py
│ │ ├── cross_entropy.py
│ │ ├── metrics.py
│ │ └── protein_metrics.py
│ ├── tasks/
│ │ ├── __init__.py
│ │ ├── lm/
│ │ │ ├── dplm.py
│ │ │ ├── dplm2.py
│ │ │ ├── dplm_invfold.py
│ │ │ └── mlm.py
│ │ └── struct_tokenizer/
│ │ └── structok.py
│ ├── testing_pipeline.py
│ ├── training_pipeline.py
│ └── utils/
│ ├── __init__.py
│ ├── callbacks.py
│ ├── config.py
│ ├── io.py
│ ├── logger.py
│ ├── lr_scheduler.py
│ ├── optim.py
│ ├── protein/
│ │ ├── __init__.py
│ │ ├── all_atom.py
│ │ ├── evaluator_dplm2.py
│ │ ├── folding_model.py
│ │ ├── residue_constants.py
│ │ ├── tokenize_pdb.py
│ │ └── utils.py
│ ├── registry.py
│ ├── scaffold_utils.py
│ └── strategies.py
├── test.py
├── train.py
└── vendor/
└── openfold/
├── CITATION.cff
├── Dockerfile
├── LICENSE
├── README.md
├── deepspeed_config.json
├── environment.yml
├── notebooks/
│ ├── OpenFold.ipynb
│ └── environment.yml
├── openfold/
│ ├── __init__.py
│ ├── config.py
│ ├── data/
│ │ ├── __init__.py
│ │ ├── data_modules.py
│ │ ├── data_pipeline.py
│ │ ├── data_transforms.py
│ │ ├── errors.py
│ │ ├── feature_pipeline.py
│ │ ├── input_pipeline.py
│ │ ├── mmcif_parsing.py
│ │ ├── parsers.py
│ │ ├── templates.py
│ │ └── tools/
│ │ ├── __init__.py
│ │ ├── hhblits.py
│ │ ├── hhsearch.py
│ │ ├── jackhmmer.py
│ │ ├── kalign.py
│ │ └── utils.py
│ ├── model/
│ │ ├── __init__.py
│ │ ├── dropout.py
│ │ ├── embedders.py
│ │ ├── evoformer.py
│ │ ├── heads.py
│ │ ├── model.py
│ │ ├── msa.py
│ │ ├── outer_product_mean.py
│ │ ├── pair_transition.py
│ │ ├── primitives.py
│ │ ├── structure_module.py
│ │ ├── template.py
│ │ ├── torchscript.py
│ │ ├── triangular_attention.py
│ │ └── triangular_multiplicative_update.py
│ ├── np/
│ │ ├── __init__.py
│ │ ├── protein.py
│ │ ├── relax/
│ │ │ ├── __init__.py
│ │ │ ├── amber_minimize.py
│ │ │ ├── cleanup.py
│ │ │ ├── relax.py
│ │ │ └── utils.py
│ │ └── residue_constants.py
│ ├── resources/
│ │ ├── __init__.py
│ │ └── stereo_chemical_props.txt
│ └── utils/
│ ├── __init__.py
│ ├── argparse.py
│ ├── callbacks.py
│ ├── checkpointing.py
│ ├── chunk_utils.py
│ ├── exponential_moving_average.py
│ ├── feats.py
│ ├── import_weights.py
│ ├── kernel/
│ │ ├── __init__.py
│ │ ├── attention_core.py
│ │ └── csrc/
│ │ ├── compat.h
│ │ ├── softmax_cuda.cpp
│ │ ├── softmax_cuda_kernel.cu
│ │ └── softmax_cuda_stub.cpp
│ ├── logger.py
│ ├── loss.py
│ ├── lr_schedulers.py
│ ├── precision_utils.py
│ ├── rigid_utils.py
│ ├── script_utils.py
│ ├── seed.py
│ ├── superimposition.py
│ ├── suppress_output.py
│ ├── tensor_utils.py
│ ├── trace_utils.py
│ └── validation_metrics.py
├── run_pretrained_openfold.py
├── scripts/
│ ├── activate_conda_env.sh
│ ├── alignment_db_scripts/
│ │ ├── create_alignment_db.py
│ │ └── unify_alignment_db_indices.py
│ ├── build_deepspeed_config.py
│ ├── colabfold_search.sh
│ ├── convert_of_weights_to_jax.py
│ ├── data_dir_to_fasta.py
│ ├── deactivate_conda_env.sh
│ ├── download_alphafold_dbs.sh
│ ├── download_alphafold_params.sh
│ ├── download_bfd.sh
│ ├── download_cameo.py
│ ├── download_colabfold_envdb.sh
│ ├── download_mgnify.sh
│ ├── download_mmseqs_dbs.sh
│ ├── download_openfold_params.sh
│ ├── download_openfold_params_gdrive.sh
│ ├── download_openfold_params_huggingface.sh
│ ├── download_pdb70.sh
│ ├── download_pdb_mmcif.sh
│ ├── download_roda_pdbs.sh
│ ├── download_small_bfd.sh
│ ├── download_uniclust30.sh
│ ├── download_uniref30.sh
│ ├── download_uniref90.sh
│ ├── flatten_roda.sh
│ ├── generate_alphafold_feature_dict.py
│ ├── generate_chain_data_cache.py
│ ├── generate_mmcif_cache.py
│ ├── install_hh_suite.sh
│ ├── install_third_party_dependencies.sh
│ ├── precompute_alignments.py
│ ├── precompute_alignments_mmseqs.py
│ ├── precompute_embeddings.py
│ ├── prep_mmseqs_dbs.sh
│ ├── prep_proteinnet_msas.py
│ ├── run_unit_tests.sh
│ ├── slurm_scripts/
│ │ └── run_uniclust30_search.sh
│ ├── unpack_proteinnet.py
│ ├── utils.py
│ ├── vars.sh
│ └── zero_to_fp32.py
├── setup.py
├── tests/
│ ├── __init__.py
│ ├── compare_utils.py
│ ├── config.py
│ ├── data_utils.py
│ ├── test_data/
│ │ ├── alignments/
│ │ │ ├── bfd_uniclust_hits.a3m
│ │ │ ├── mgnify_hits.sto
│ │ │ ├── pdb70_hits.hhr
│ │ │ └── uniref90_hits.sto
│ │ ├── alphafold_feature_dict.pickle
│ │ ├── features.pkl
│ │ ├── mmcifs/
│ │ │ ├── 1hf9.cif
│ │ │ ├── 1psm.cif
│ │ │ ├── 2crb.cif
│ │ │ ├── 2q2k.cif
│ │ │ ├── 3u8v.cif
│ │ │ ├── 3zee.cif
│ │ │ ├── 4i6p.cif
│ │ │ ├── 4zey.cif
│ │ │ └── 5kc1.cif
│ │ └── short.fasta
│ ├── test_data_pipeline.py
│ ├── test_data_transforms.py
│ ├── test_embedders.py
│ ├── test_evoformer.py
│ ├── test_feats.py
│ ├── test_import_weights.py
│ ├── test_kernels.py
│ ├── test_loss.py
│ ├── test_model.py
│ ├── test_msa.py
│ ├── test_outer_product_mean.py
│ ├── test_pair_transition.py
│ ├── test_primitives.py
│ ├── test_structure_module.py
│ ├── test_template.py
│ ├── test_triangular_attention.py
│ ├── test_triangular_multiplicative_update.py
│ └── test_utils.py
├── thread_sequence.py
└── train_openfold.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
### VisualStudioCode
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace
**/.vscode
# JetBrains
.idea/
# Lightning-Hydra-Template
configs/local/default.yaml
configs/local/*
!*/data
/data/
logs/
wandb/
.env
.autoenv
workspace.ipynb
run/logs
# model weight
*.ckpt
# pdb
*.pdb
byprot-checkpoints/
generation-results/
data-bin/scaffolding-pdbs/*.pdb
temp.ipynb
# run/
================================================
FILE: .pre-commit-config.yaml
================================================
default_language_version:
python: python3.9
exclude: ^(docs/|build/|node_modules/|venv/|\.venv/|vendor/)
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
# list of supported hooks: https://pre-commit.com/hooks.html
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-docstring-first
- id: check-yaml
- id: debug-statements
- id: detect-private-key
- id: check-toml
- id: check-case-conflict
- id: check-added-large-files
args: [--maxkb=1000]
- id: check-json
- id: check-merge-conflict
- id: check-shebang-scripts-are-executable
- id: fix-byte-order-marker
- id: fix-encoding-pragma
args: [--remove]
- id: mixed-line-ending
args: [--fix=lf]
# python code formatting
- repo: https://github.com/psf/black
rev: 22.6.0
hooks:
- id: black
args: [--line-length, "79"]
language_version: python3.9
# python import sorting
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
hooks:
- id: isort
args:
[
"--line-length=79",
"--multi-line=3",
"--profile=black",
"--filter-files",
]
language_version: python3.9
# python upgrading syntax to newer version
# - repo: https://github.com/asottile/pyupgrade
# rev: v2.32.1
# hooks:
# - id: pyupgrade
# args: [--py38-plus]
# python docstring formatting
- repo: https://github.com/myint/docformatter
rev: v1.4
hooks:
- id: docformatter
args: [--in-place, --wrap-summaries=79, --wrap-descriptions=79]
# python check (PEP8), programming errors and code complexity
- repo: https://github.com/PyCQA/flake8
rev: 4.0.1
hooks:
- id: flake8
args:
[
"--extend-ignore",
"E203,E402,E501,F401,F841",
"--exclude",
"logs/*,data/*",
]
# python security linter
- repo: https://github.com/PyCQA/bandit
rev: "1.7.1"
hooks:
- id: bandit
args: ["-s", "B101"]
# yaml formatting
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v2.7.1
hooks:
- id: prettier
types: [yaml]
# shell scripts linter
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: v0.8.0.4
hooks:
- id: shellcheck
# # md formatting
# - repo: https://github.com/executablebooks/mdformat
# rev: 0.7.14
# hooks:
# - id: mdformat
# args: ["--number"]
# additional_dependencies:
# - mdformat-gfm
# - mdformat-tables
# - mdformat_frontmatter
# # - mdformat-toc
# # - mdformat-black
# jupyter notebook cell output clearing
# - repo: https://github.com/kynan/nbstripout
# rev: 0.5.0
# hooks:
# - id: nbstripout
# jupyter notebook linting
# - repo: https://github.com/nbQA-dev/nbQA
# rev: 1.4.0
# hooks:
# - id: nbqa-black
# args: ["--line-length=79"]
# - id: nbqa-isort
# args: ["--profile=black"]
# - id: nbqa-flake8
# args:
# [
# "--extend-ignore=E203,E402,E501,F401,F841",
# "--exclude=logs/*,data/*",
# ]
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# The Family of Diffusion Protein Language Models (DPLM)

## Overview 🌟
This repository contains the official implementation of training and inference as well as the pre-trained weights for the Family of Diffusion Protein Language Models (DPLM), including:
- `DPLM` from ICML'24 paper ["Diffusion Language Models Are Versatile Protein Learners"](https://arxiv.org/abs/2402.18567), which introduces **d**iffusion **p**rotein **l**anguage **m**odel (DPLM), a versatile protein language model that demonstrates strong generative and predictive capabilities for protein sequences.
- `DPLM-2` from ICLR'25 paper ["DPLM-2: A Multimodal Diffusion Protein Language Model"](https://arxiv.org/abs/2410.13782), a multimodal protein foundation model that extends discrete diffusion protein language model to accommodate both sequences and structures.
- ICML'25 spotlight paper ["Elucidating the Design Space of Multimodal Protein Language Models"](https://arxiv.org/abs/2504.11454), where we elucidate the challenges of structure modeling of multimodal protein language models (e.g., DPLM-2 and ESM3) and propose advanced designs for better structure modeling. We have released the finer-grained bit-based generative modeling (`DPLM-2 Bit`). The full implementation of the paper will be released soon.
## Key Features 🔑
Specifically, the DPLM family exhibits impressive performance in protein (structure and sequence) co-generation, any-to-any conditional generation (e.g., folding, inverse folding, and motif scaffolding), and representation learning.
We develop DPLM based on the [ByProt](https://github.com/BytedProtein/ByProt). This repository contains pretraining scripts for DPLM and running scripts for various protein generation and understanding tasks, as detailed below:
- **Unconditional protein generation**:
**DPLM** is capable of unconditionally generating protein sequences with reasonable predicted structures. **DPLM-2** can generate diverse and highly plausible proteins through simultaneous structure-sequence co-generation.
- **Sequence-conditioned generation (forward folding)**:
DPLM-2 can generate reasonable protein structure given the input protein sequence, achieving close performance with the strong folding model (e.g., ESMFold).
- **Structure-conditioned generation (inverse folding)**:
DPLM and DPLM-2 can produce sequences that can confidently fold into the given backbone structure.
- **Motif scaffolding**:
DPLM can generate reasonable scaffold sequences given specific functional motifs. DPLM-2 achieves more successful motif scaffolding through multimodal motif conditioning.
- **Representation learning**:
DPLM is a superior protein sequence representation learner, while DPLM-2 offers structure-aware protein represenrations, demonstrating impressive performance across a variety of protein predictive tasks.
- **Controllable generation**:
DPLM enjoys plug-and-play programmability, generating samples satisfying provided secondary structure annotations.
**TODOs**
- [ ] Controllable/guided generation with discrete diffusion classifier guidance.
- [ ] Representation learning of DPLM-2
## DPLM
> ["Diffusion Language Models Are Versatile Protein Learners." Wang et al., In ICML 2024](https://arxiv.org/abs/2402.18567)

## DPLM-2
> ["DPLM-2: A Multimodal Diffusion Protein Language Model." Wang et al., In ICLR 2025](https://arxiv.org/abs/2410.13782)

## Updates 📢
- **[2025-07]** We update the default sampling strategy of **DPLM-2** to `annealing@2.0:0.1`.
- **[2025-04]** Our latest work **DPLM-2.1**, which focuses on analysis and better protein structure modeling of multimodal protein language models, is accepted to ICML'25 Spotlight! Check [Elucidating the Design Space of Multimodal Protein Language Models](https://arxiv.org/abs/2504.11454). We have released the implementation of finer-grained and better structure modeling (**DPLM-2 Bit**). The full implementation will be released soon.
- **[2024-10]** Check out our new work [DPLM-2](https://arxiv.org/abs/2410.13782), a multimodal protein foundation model that extends DPLM to simultaneously model, understand, and generate both sequences and structures!
- **[2024-03]** We release [DPLM](https://arxiv.org/abs/2402.18567), a versatile protein language model that demonstrates strong generative and predictive capabilities for protein sequences!
## Table of Contents 📚
- [Quick Start](#quick-start)
- [Installation](#installation)
- [Load Pretrained Models](#load-pretrained-models)
- [Generation Examples](#generation-examples)
- [Model Checkpoints](#model-checkpoints)
- [Advanced Usage](#advanced-usage)
- [Training](#training)
- [Unconditional protein (co-)generation](#unconditional-protein-co-generation)
- [Protein sequence generation (DPLM)](#protein-sequence-generation-dplm)
- [Protein sequence-structure co-generation (DPLM-2 & DPLM-2-Bit)](#protein-sequence-structure-co-generation-dplm-2--dplm-2-bit)
- [Sequence-conditioned Generation: Forward Folding](#sequence-conditioned-generation-forward-folding)
- [Structure-conditioned generation: inverse folding](#structure-conditioned-generation-inverse-folding)
- [Motif scaffolding](#motif-scaffolding)
- [Representation Learning](#representation-learning)
- [Acknowledgements](#acknowledgements)
- [Citation](#citation)
# Quick Start
## Installation
```bash
# clone project
git clone --recursive https://github.com/bytedance/dplm.git
cd dplm
# create conda virtual environment
env_name=dplm
conda create -n ${env_name} python=3.9 pip
conda activate ${env_name}
# automatically install everything else
bash scripts/install.sh
```
## Load Pretrained Models
Users can load DPLM/DPLM-2 checkpoint by:
```python
from byprot.models.dplm import DiffusionProteinLanguageModel as DPLM
from byprot.models.dplm2 import MultimodalDiffusionProteinLanguageModel as DPLM2
from byprot.models.dplm2 import DPLM2Bit
dplm = DPLM.from_pretrained("airkingbd/dplm_650m").cuda()
dplm2 = DPLM2.from_pretrained("airkingbd/dplm2_650m").cuda()
dplm2_bit = DPLM2Bit.from_pretrained("airkingbd/dplm2_bit_650m").cuda()
```
## Generation Examples
**Protein sequence generation**
```python
from generate_dplm import initialize_generation
input_tokens = initialize_generation(
length=200,
num_seqs=5,
tokenizer=dplm.tokenizer,
device=next(dplm.parameters()).device
)
samples = dplm.generate(
input_tokens=input_tokens,
max_iter=500,
)
print([''.join(seq.split(' ')) for seq in dplm.tokenizer.batch_decode(samples, skip_special_tokens=True)])
```
**Protein sequence-structure co-generation**
Users can check the generated sequence and structure in the `./generation-results` folder.
```python
from generate_dplm2 import initialize_generation, save_results
input_tokens = initialize_generation(
task="co_generation",
length=200,
num_seqs=5,
tokenizer=dplm2.tokenizer,
device=next(dplm2.parameters()).device
)[0]
samples = dplm2.generate(
input_tokens=input_tokens,
max_iter=500,
)
save_results(
outputs=samples,
task="co_generation",
save_dir="./generation-results/dplm2_generation",
tokenizer=dplm2.tokenizer,
struct_tokenizer=dplm2.struct_tokenizer, save_pdb=True
)
samples = dplm2_bit.generate(
input_tokens=input_tokens,
max_iter=500,
)
save_results(
outputs=samples,
task="co_generation",
save_dir="./generation-results/dplm2_bit_generation",
tokenizer=dplm2_bit.tokenizer,
struct_tokenizer=dplm2_bit.struct_tokenizer
)
```
## Model Checkpoints
Access pretrained models in varying sizes:
| Model name | Model size |
| ------------------------------------------------------------ | --------------- |
| [dplm-150m](https://huggingface.co/airkingbd/dplm_150m/tree/main) | 150M parameters |
| [dplm-650m](https://huggingface.co/airkingbd/dplm_650m/tree/main) | 650M parameters |
| [dplm-3b](https://huggingface.co/airkingbd/dplm_3b/tree/main) | 3B parameters |
| [dplm2-150m](https://huggingface.co/airkingbd/dplm2_150m/tree/main) | 150M parameters |
| [dplm2-650m](https://huggingface.co/airkingbd/dplm2_650m/tree/main) | 650M parameters |
| [dplm2-3b](https://huggingface.co/airkingbd/dplm2_3b/tree/main) | 3B parameters |
| [dplm2-bit-650m](https://huggingface.co/airkingbd/dplm2_bit_650m/tree/main) | 650M parameters |
# Advanced Usage
## Training
### DPLM
#### Dataset
We pretrain DPLM on the UniRef50 dataset, which contains about 42 million protein sequences. We obtain the preprocessed UniRef50 dataset provided by [EvoDiff (Alamdari et al, 2023)](https://www.biorxiv.org/content/10.1101/2023.09.11.556673v1), which can be downloaded from this [link](https://zenodo.org/record/6564798). After downloading, please place the dataset in the `./data-bin/uniref50` folder.
We also provide the preprocessed dataset in [HuggingFace datasets](https://huggingface.co/datasets/airkingbd/uniref50) format, which we recommend to use. Users can download the HF dataset locally in advance for faster loading by:
```bash
bash scripts/download_uniref50_hf.sh
```
#### Example of training
We train DPLM with approximately 1 million tokens per batch for 100,000 training steps.
The following command is run on one node with 8 A100 GPUs. If you want to train on multiple nodes, you can adjust the total number of tokens by ensuring that `max_tokens` \* `accumulate_grad_batches`\*`#GPUs` is approximately 1 million.
```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
max_tokens=8192
accumulate_grad_batches=16
# this means the effective batch size is #GPUs(8) * max_tokens(8192) * accumulate_grad_batches(16), resulting in approximately 1 million.
exp=dplm/dplm_650m
model_name=dplm_650m
python train.py \
experiment=${exp} name=${model_name} \
datamodule.max_tokens=${max_tokens} \
trainer.accumulate_grad_batches=${accumulate_grad_batches}
```
You can adjust the other training configurations in the `configs/experiment/dplm/dplm_650m.yaml` as needed.
### DPLM-2
#### Dataset
We use the experimental structures from [PDB](https://pubmed.ncbi.nlm.nih.gov/10592235/) and AF2-predicted structures from [SwissProt](https://academic.oup.com/nar/article/50/D1/D439/6430488) dataset as training data for DPLM-2. We provide a preprocessed [HuggingFace dataset](https://huggingface.co/datasets/airkingbd/pdb_swissprot) of PDB and SwissProt. User can download the HF dataset locally in advance for faster loading by:
```bash
bash scripts/download_pdb_swissprot.sh
```
#### Example of training
As noted in section 3.2 in [DPLM-2](https://arxiv.org/abs/2410.13782) paper, we propose an efficient warm-up training strategy to mitigate the scarcity of structure training data. During training, we initialize the DPLM-2 model with pretrained DPLM checkpoint, to leverage the evolutionary knowledge captured by sequence-based pLM during large-scale sequence pretraining, which is beneficial for structure modeling.
We train DPLM-2 with approximately 64,000 tokens per batch for 100,000 training steps. To preserve the evolutionary knowledge captured by DPLM, we use [LoRA](https://github.com/huggingface/peft) to prevent large parameter shifts. The training command is as follows:
```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
max_tokens=8192
accumulate_grad_batches=1
# this means the effective batch size is #GPUs(8) * max_tokens(8192) * accumulate_grad_batches(1), resulting in approximately 64 thousand.
exp=dplm2/dplm2_650m
model_name=dplm2_650m
python train.py \
experiment=${exp} name=${model_name} \
datamodule.max_tokens=${max_tokens} \
trainer.accumulate_grad_batches=${accumulate_grad_batches}
```
### DPLM-2 Bit-based Modeling
In our latest work [DPLM-2.1](https://arxiv.org/abs/2504.11454), we show that the index-based structure token is challenging for the model to predict. A finer-grained, bit-based modeling approach in the latent space (i.e., predicting each bit of the quantized structure feature instead of the index) leads to better structural modeling and generation performance.
The training dataset is the same as for DPLM-2, and the training command is as below:
```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
max_tokens=8192
accumulate_grad_batches=1
# this means the effective batch size is #GPU(8) * max_tokens(8192) * accumulate_grad_batches(1), resulting in approximately 64 thousand.
exp=dplm2/dplm2_bit_650m
model_name=dplm2_bit_650m
python train.py \
experiment=${exp} name=${model_name} \
datamodule.max_tokens=${max_tokens} \
trainer.accumulate_grad_batches=${accumulate_grad_batches}
```
## Unconditional protein (co-)generation
### Protein sequence generation (DPLM)
The results of unconditional protein sequence generation of DPLM of different scales (150M, 650M, 3B) are shown in the table below. For more details, please refer to our paper.
| Length | 100 | 200 | 300 | 400 | 500 | 600 | 700 | 800 | 900 | 1000 |
| ------ | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | -------------- | -------------- |
| 150M | 73.31 | 84.30 | 84.82 | 86.90 | 81.71 | 81.53 | 81.56 | 80.92 | 78.71 | 72.10 |
| 650M | 74.00 (+0.69) | 85.61 (+1.31) | 85.91 (+1.09) | 88.16 (+1.26) | 82.58 (+0.87) | 84.38 (+2.85) | 83.87 (+2.31) | 83.00 (+2.08) | 84.92 (+6.21) | 81.51 (+9.41) |
| 3B | 77.78 (+4.47) | 86.16 (+1.86) | 87.39 (+2.57) | 90.06 (+3.16) | 87.43 (+5.72) | 86.01 (+4.48) | 84.64 (+3.08) | 85.88 (+4.96) | 85.93 (+7.22) | 83.86 (+11.76) |
To generate new protein sequences using a pre-trained DPLM model:
```bash
model_name=dplm_650m # choose from dplm_150m, dplm_650m, dplm_3b
output_dir=generation-results/${model_name}/uncond_generation
mkdir -p generation-results
python generate_dplm.py --model_name airkingbd/${model_name} \
--seq_lens 100 200 300 400 500 \
--saveto ${output_dir}
# Evaluation
bash analysis/plddt_calculate.sh ${output_dir} # compute pLDDT using ESMFold
```
We also provide evaluation scripts in the `analysis` folder. Users can use the `analysis/uncond_analysis.ipynb` to obtain average pLDDT score of each length and draw the line chart of the pLDDT score.
### Protein sequence-structure co-generation (DPLM-2 & DPLM-2-Bit)
DPLM-2 can generate diverse and highly plausible proteins via simultaneous structure-sequence co-generation.
User can co-generate sequence and structure simultaneously with the command below:
```bash
# choose from dplm2_150m, dplm2_650m, dplm2_3b
model_name=dplm2_650m
# About the default sampling strategy, annealing@2.0:0.1,
# which anneals the temperature from 2.0 to 0.1.
# It begins with high randomness to maximize diversity
# and concludes with low randomness to ensure designability.
# This achieves a better trade-off between the quality and diversity.
sampling_strategy=annealing@2.0:0.1
output_dir=generation-results/${model_name}
task=co_generation
mkdir -p ${output_dir}
python generate_dplm2.py \
--model_name airkingbd/${model_name} \
--task ${task} \
--sampling_strategy ${sampling_strategy} \
--num_seqs 50 \
--max_iter 500 \
--seq_lens 100 200 300 400 500 \
--saveto ${output_dir}
# Evaluation
input_fasta_dir=${output_dir}/co_generation
python src/byprot/utils/protein/evaluator_dplm2.py -cn unconditional_codesign \
inference.input_fasta_dir=${input_fasta_dir}
```
User can use `analysis/plot.ipynb` to plot the rmsd, tmscore distribution and diversity of each length.
Co-generate sequence and structure with the DPLM-2 bit-based modeling variant:
```bash
model_name=dplm2_bit_650m
sampling_strategy=annealing@1.1:0.1
output_dir=generation-results/${model_name}
task=co_generation
mkdir -p ${output_dir}
python generate_dplm2.py \
--model_name airkingbd/${model_name} \
--task ${task} \
--bit_model \
--sampling_strategy ${sampling_strategy} \
--num_seqs 50 \
--max_iter 500 \
--seq_lens 100 200 300 400 500 \
--saveto ${output_dir}
```
## Sequence-conditioned Generation: Forward Folding
DPLM-2 spontaneously enables protein structure prediction given sequence (i.e., folding) in a zero-shot manner.
We use the [CAMEO 2022 (provided by EigenFold)](https://github.com/bjing2016/EigenFold) and a [PDB date split (provided by MultiFlow)](https://github.com/jasonkyuyim/multiflow) as testsets, and we provide our preprocessed dataset in this [link](https://zenodo.org/records/15424801), and can be downloaded by:
```bash
bash scripts/download_metadata.sh
```
Partial results are shown in the table below. For more details, please refer to [DPLM-2.1](https://arxiv.org/abs/2504.11454) paper.
| Models | CAMEO 2022 | | PDB date | |
|---|---|---|---|---|
| | rmsd | tmscore | rmsd | tmscore |
| ESMFold | 3.99 | 0.85 | 2.84 | 0.93 |
| DPLM-2 | 7.70 | 0.79 | 5.30 | 0.83 |
| DPLM-2 Bit | 6.40 | 0.84 | 3.22 | 0.90 |
The folding generation and evaluation script is as follows.
We utilize RMSD and TMscore between the predicted and ground truth structures for evaluation. DPLM-2 adopts argmax decoding for 100 sampling iterations.
```bash
model_name=dplm2_650m
output_dir=generation-results/${model_name}
task=folding
mkdir -p ${output_dir}
input_fasta_path=data-bin/cameo2022/aatype.fasta
python generate_dplm2.py \
--model_name airkingbd/${model_name} \
--task ${task} \
--input_fasta_path ${input_fasta_path} \
--max_iter 100 \
--unmasking_strategy deterministic \
--sampling_strategy argmax \
--saveto ${output_dir}
# Evaluation
input_fasta_dir=${output_dir}/folding
python src/byprot/utils/protein/evaluator_dplm2.py -cn forward_folding inference.input_fasta_dir=${input_fasta_dir}
```
For structure prediction conditioned on other customized sequences, users can input a FASTA file and modify the `input_fasta_path` variable to generate the predicted structure.
## Structure-conditioned generation: inverse folding
DPLM family can perform inverse folding in different ways according to DPLM variant. DPLM performs inverse folding by placing an adapter layer on the top of pLM, similar to [LM-Design](https://github.com/BytedProtein/ByProt). On the other hand, DPLM-2 directly conditions on the tokenized structure tokens to predict the sequence.
### Inverse Folding with DPLM
Partial results on the CATH 4.3 dataset are shown in the table below. For more details, please refer to our paper.
| Models | Trainable Params. | AAR | scTM | pLDDT |
|-----------|-------------------|-----------|----------|-----------|
| LM-Design | 6.3M/650M | 56.49 | 0.85 | 74.89 |
| DPLM-150M | 3.1M/150M | 53.27 | 0.85 | 75.31 |
| DPLM-650M | 6.3M/650M | _56.61_ | _0.86_ | _76.78_ |
| DPLM-3B | 68.2M/3.0B | **58.64** | **0.86** | **76.95** |
#### Data
**Download the preprocessed CATH datasets**
- CATH 4.2 dataset provided by [Generative Models for Graph-Based Protein Design (Ingraham et al, NeurIPS'19)](https://papers.nips.cc/paper/2019/hash/f3a4ff4839c56a5f460c88cce3666a2b-Abstract.html)
- CATH 4.3 dataset provided by [Learning inverse folding from millions of predicted structures (Hsu et al, ICML'22)](https://www.biorxiv.org/content/10.1101/2022.04.10.487779v1)
```bash
bash scripts/download_cath.sh
```
#### Training
We train structure-conditional DPLM based on the [LM-Design](https://github.com/BytedProtein/ByProt) framework, designating the pre-trained protein language model as DPLM. The training script is as below.
```bash
exp=dplm/dplm_650m_invfold
dataset=cath_4.3
name=${dataset}/dplm_650m/invfold
python train.py \
experiment=${exp} datamodule=${dataset} name=${name} \
logger=tensorboard trainer=ddp_fp16
```
#### Evaluation on valid/test datasets
Users can set the `eval_sc` to `true` to calculate the self-consistency TMscore and pLDDT, which will result in a significant evaluation time overhead.
```bash
dataset=cath_4.3
exp_path=${dataset}/dplm_650m/invfold
eval_sc=false
# if ${eval_sc} is set to true, the program will calculate the self-consistency
# TMscore and pLDDT during generation,
# thus significantly increasing the evaluation time.
python test.py \
experiment_path=${exp_path} \
data_split=test ckpt_path=best.ckpt mode=predict \
task.generator.max_iter=100 task.generator.eval_sc=${eval_sc}
```
### Inverse Folding with DPLM-2
We provide the CAMEO 2022 and PDB date test set split used in our paper, where the structure has been tokenized and saved to `data-bin/cameo2022/struct.fasta` and `data-bin/PDB_date/struct.fasta`.
User can use the following script to do the inverse folding and evaluation.
```bash
model_name=dplm2_650m
output_dir=generation-results/${model_name}
task=inverse_folding
mkdir -p ${output_dir}
input_fasta_path=data-bin/cameo2022/struct.fasta
python generate_dplm2.py \
--model_name airkingbd/${model_name} \
--task ${task} \
--input_fasta_path ${input_fasta_path} \
--max_iter 100 \
--unmasking_strategy deterministic \
--sampling_strategy argmax \
--saveto ${output_dir}
# Evaluation
input_fasta_dir=${output_dir}/inverse_folding
python src/byprot/utils/protein/evaluator_dplm2.py -cn inverse_folding inference.input_fasta_dir=${input_fasta_dir}
```
For any customized input structure, user can first tokenize the structure with structure tokenizer and save it to a FASTA file using the following script:
```bash
# Tokenize
# each protein is represented by a pdb file
input_pdb_folder=/path/to/your/input/structure
# this will save two fasta files in the ${input_pdb_folder}/tokenized_protein folder:
# 1) struct.fasta, containing the tokenized structure tokens
# 2) aatype.fasta, containing the amino acid tokens.
python src/byprot/utils/protein/tokenize_pdb.py --input_pdb_folder ${input_pdb_folder} --output_dir ${input_pdb_folder}/tokenized_protein
```
Then user can specify the path of generated `struct.fasta` as input and predict the sequence.
## Motif scaffolding
DPLM and DPLM-2 can both perform motif scaffolding. DPLM can condition on the motif sequence and predict the scaffold sequence. DPLM-2 is able to condition on both the sequence and structure of the motif and simultaneously co-generate the sequence and structure of the scaffold part, which leads to better performance.
We examine on the benchmark, provided by [FrameFlow](https://github.com/microsoft/protein-frame-flow/blob/main/motif_scaffolding/benchmark.csv). We use the motif pdb files which are provided by [EvoDiff](https://github.com/microsoft/evodiff/tree/main/examples/scaffolding-pdbs), and we also provide the pdbs and the corresponding structure tokens in this [link](https://zenodo.org/records/15424801). You can download the dataset by
```bash
bash scripts/download_motif_scaffolds.sh
```
For each motif-scaffolding problem, we sample 100 sequences and then calculate the success rate according to two aspects: motif part consistency and overall quality. For motif part consistency, we use motif-RMSD < 1$\AA$ as the success criterion. For overall quality, the assessment varies across different approaches: for the sequence-based method (DPLM) we use pLDDT > 70, while for the co-generation method (DPLM-2) we use scTM > 0.8. For more details, please refer to our paper.
The success rate of each motif-scaffold problem is shown below.
| | Pass rate | Avg. Success rate | 1BCF | 1PRW | 1QJG | 1YCR | 2KL8 | 3IXT | 4JHW | 4ZYP | 5IUS | 5TPN | 5TRV_long | 5TRV_med | 5TRV_short | 5WN9 | 5YUI | 6E6R_long | 6E6R_med | 6E6R_short | 6EXZ_long | 6EXZ_med | 6EXZ_short | 7MRX_long | 7MRX_med | 7MRX_short |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| DPLM | 11/24 | 0.19 | 0.00 | 0.83 | 0.00 | 0.38 | 0.08 | 0.17 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.65 | 0.94 | 0.87 | 0.01 | 0.00 | 0.00 | 0.02 | 0.31 | 0.34 |
| DPLM-2 | 18/24 | 0.29 | 0.01 | 0.84 | 0.02 | 0.53 | 0.57 | 0.41 | 0.00 | 0.10 | 0.00 | 0.00 | 0.00 | 0.02 | 0.03 | 0.00 | 0.00 | 0.78 | 0.77 | 0.64 | 0.44 | 0.55 | 0.58 | 0.20 | 0.22 | 0.24 |
### DPLM
We provide the following script to sample sequences for each motif-scaffolding problem. Note that before generation, you should download the motif pdbs and place them in the `data-bin/scaffolding-pdbs` folder.
```bash
export CUDA_VISIBLE_DEVICES=0
model_name=dplm_650m
output_dir=./generation-results/${model_name}/motif_scaffold
mkdir -p generation-results
# Generate scaffold
python run/scaffold_generate_dplm.py \
--model_name airkingbd/${model_name} \
--num_seqs 100 \
--saveto $output_dir
# Predict structure by ESMFold
max_tokens=1024
pdb_path=$output_dir/scaffold_fasta/esmfold_pdb
# folding
mkdir -p $pdb_path
echo 'folding by ESMFold'
output_filename_list=$(ls ${output_dir}/scaffold_fasta)
echo $output_filename_list
python analysis/cal_plddt_dir.py -i ${output_dir}/scaffold_fasta -o ${pdb_path} --max-tokens-per-batch ${max_tokens}
```
For evaluation, users can use the `analysis/motif_analysis.ipynb` to obtain success rate of each problem.
### DPLM-2
Before generation, the FASTA file of tokenized structure tokens and amino acid tokens of the motif should be in the `data-bin/scaffolding-pdbs` folder. Users can co-generate the scaffold sequence and structure, conditioning on the sequence and structure of the motif part.
```bash
export CUDA_VISIBLE_DEVICES=0
model_name=dplm2_650m
output_dir=./generation-results/${model_name}/motif_scaffold
mkdir -p generation-results
# Generate scaffold
python run/scaffold_generate_dplm2.py \
--model_name airkingbd/${model_name} \
--num_seqs 100 \
--saveto ${output_dir}
# Predict structure by ESMFold
max_tokens=1024
python analysis/cal_plddt_dir.py -i ${output_dir}/scaffold_fasta --max-tokens-per-batch ${max_tokens}
# Calculate sc-TMscore
python src/byprot/utils/protein/evaluator_dplm2.py -cn unconditional_codesign \
inference.input_fasta_dir=${output_dir}/scaffold_fasta inference.calculate_diversity=false
```
For evaluation, users can use the `analysis/motif_analysis.ipynb` to obtain success rate of each problem.
## Representation Learning
The DPLM family excels in various downstream protein predictive tasks. DPLM is a superior protein sequence representation learner, while DPLM-2 can perform multimodal representation learning by leveraging both structure and sequence information, demonstrating its versatility and effectiveness. The following table summarizes the DPLM family performance, and the italic number means performance of DPLM-2, which offers structure-aware protein representations and outperforms sequence-based DPLM on most of the predictive tasks. Meanwhile, we also find the performance improves along with the model size.
| Models | Thermostability | HumanPPI | Metal Ion Binding | EC | GO-MF | GO-BP | GO-CC | DeepLoc-Subcellular | DeepLoc-Binary |
| --------------------- | --------------- | --------- | ----------------- | --------- | :-------: | :-------: | :-------: | ------------------- | -------------- |
| ESM2 (650M) | 0.691 | 84.78 | 71.88 | 0.866 | 0.676 | 0.344 | 0.402 | 83.68 | 92.28 |
| AR-LM | 0.638 | 68.48 | 61.66 | 0.691 | 0.566 | 0.258 | 0.287 | 68.53 | 88.31 |
| DPLM (150M) | 0.687 | 80.98 | 72.17 | 0.822 | 0.662 | 0.328 | 0.379 | 82.41 | 92.63 |
| DPLM (650M) | 0.695 | 86.41 | 75.15 | 0.875 | 0.680 | 0.357 | 0.409 | 84.56 | 93.09 |
| DPLM-2 (650M) | **_0.714_** | _84.44_ | _74.28_ | _0.878_ | _0.680_ | _0.359_ | _0.411_ | 82.98 | _93.64_ |
| *DPLM-2 (650M) | -- | _87.78_ | -- | --| --| -- | -- | _83.42_ | -- |
| DPLM (3B) | 0.704 | **90.00** | **75.94** | **0.883** | **0.687** | **0.369** | **0.463** | **85.32** | **93.93** |
> We find DPLM-2 demonstrates a performance degradation on some tasks (e.g., HumanPPI and DeepLoc-Subcellular), due to continued training on a smaller amount of structure data, which results in overfitting and degradation of the representations learned during large-scale sequence pretraining. \* means training on the larger-scale [AFDB representative](https://www.nature.com/articles/s41586-023-06510-w) structure data, and we find that enlarging the structure data is indeed a key factor for better multimodal protein representations. Please refer to the DPLM-2 paper for more details about this.
The training and evaluation pipeline is based on the [SaProt](https://github.com/westlake-repl/SaProt/tree/main) repository, and we slightly modify the code to support DPLM. Users can select the "representationlearning" branch for the evaluation of protein predictive tasks.
# Acknowledgements
DPLM extends its gratitude to the following projects and individuals.
We draw inspiration and leverages/modifies implementations from:
- [microsoft/evodiff](https://github.com/microsoft/evodiff) for the preprocessed UniRef50 dataset, sequence sampling evaluation implementation and data pipeline.
- [westlake-repl/SaProt](https://github.com/westlake-repl/SaProt/tree/main) for the representation learning evaluation pipeline.
- [jingraham/neurips19-graph-protein-design](https://github.com/jingraham/neurips19-graph-protein-design) for the preprocessed CATH dataset.
- [facebook/esm](https://github.com/facebookresearch/esm/) for their ESM implementations and pretrained model weights.
- [jasonkyuyim/se3_diffusion](https://github.com/jasonkyuyim/se3_diffusion) for their self-consistency structural evaluation implementation.
- [jasonkyuyim/multiflow](https://github.com/jasonkyuyim/multiflow) for their evaluation pipeline, structure data processing and preprocessed PDB dataset.
- [bjing2016/EigenFold](https://github.com/bjing2016/EigenFold) for the CAMEO 2022 dataset.
We express our sincere appreciation to the authors of these repositories for their invaluable contributions to the development of DPLM family.
# Citation
```
@inproceedings{wang2024dplm,
title={Diffusion Language Models Are Versatile Protein Learners},
author={Wang, Xinyou and Zheng, Zaixiang and Ye, Fei and Xue, Dongyu and Huang, Shujian and Gu, Quanquan},
booktitle={International Conference on Machine Learning},
year={2024}
}
@inproceedings{wang2025dplm2,
title={DPLM-2: A Multimodal Diffusion Protein Language Model},
author={Wang, Xinyou and Zheng, Zaixiang and Ye, Fei and Xue, Dongyu and Huang, Shujian and Gu, Quanquan},
booktitle={International Conference on Learning Representations},
year={2025}
}
@inproceedings{hsieh2025dplm2_1,
title={Elucidating the Design Space of Multimodal Protein Language Models},
author={Hsieh, Cheng-Yen and Wang, Xinyou and Zhang, Daiheng and Xue, Dongyu and Ye, Fei and Huang, Shujian and Zheng, Zaixiang and Gu, Quanquan},
booktitle={International Conference on Machine Learning},
year={2025}
}
```
================================================
FILE: analysis/TMalign.cpp
================================================
/* TM-align: sequence-independent structure alignment of monomer proteins by
* TM-score superposition. Please report issues to yangzhanglab@umich.edu
*
* References to cite:
* Y Zhang, J Skolnick. Nucl Acids Res 33, 2302-9 (2005)
*
* DISCLAIMER:
* Permission to use, copy, modify, and distribute the Software for any
* purpose, with or without fee, is hereby granted, provided that the
* notices on the head, the reference information, and this copyright
* notice appear in all copies or substantial portions of the Software.
* It is provided "as is" without express or implied warranty.
*
* ==========================
* How to install the program
* ==========================
* The following command compiles the program in your Linux computer:
*
* g++ -static -O3 -ffast-math -lm -o TMalign TMalign.cpp
*
* The '-static' flag should be removed on Mac OS, which does not support
* building static executables.
*
* ======================
* How to use the program
* ======================
* You can run the program without argument to obtain the document.
* Briefly, you can compare two structures by:
*
* ./TMalign structure1.pdb structure2.pdb
*
* ==============
* Update history
* ==============
* 2012/01/24: A C/C++ code of TM-align was constructed by Jianyi Yang
* 2016/05/21: Several updates of this program were made by Jianji Wu:
* (1) fixed several compiling bugs
* (2) made I/O of C/C++ version consistent with the Fortran version
* (3) added outputs including full-atom and ligand structures
* (4) added options of '-i', '-I' and '-m'
* 2016/05/25: Fixed a bug on PDB file reading
* 2018/06/04: Several updates were made by Chengxin Zhang, including
* (1) Fixed bug in reading PDB files with negative residue index,
* (2) Implemented the fTM-align algorithm (by the '-fast' option)
* as described in R Dong, S Pan, Z Peng, Y Zhang, J Yang
* (2018) Nucleic acids research. gky430.
* (3) Included option to perform TM-align against a whole
* folder of PDB files. A full list of options not available
* in the Fortran version can be explored by TMalign -h
* 2018/07/27: Added the -byresi option for TM-score superposition without
* re-alignment as in TMscore and TMscore -c
* 2018/08/07: Added the -dir option
* 2018/08/14: Added the -split option
* 2018/08/16: Added the -infmt1, -infmt2 options.
* 2019/01/07: Added support for PDBx/mmCIF format.
* 2019/02/09: Fixed asymmetric alignment bug.
* 2019/03/17: Added the -cp option for circular permutation
* 2019/07/23: Supported RasMol output by '-o' option
* 2019/07/24: Fixed bug on PyMOL format output by '-o' option with mmCIF input
* 2019/08/18: Fixed bug on RasMol format output file *_atm. Removed excessive
* circular permutation alignment by -cp
* 2019/08/20: Clarified PyMOL syntax.
* 2019/08/22: Added four additional PyMOL scripts.
* 2020/12/12: Fixed bug in double precision coordinate cif file alignment.
* 2021/02/24: Fixed file format issue for new incentive PyMOL.
* 2022/04/12: Compatible with AlphaFold CIF
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include