Repository: chainer/chainer-chemistry Branch: master Commit: efe323aa21f6 Files: 374 Total size: 1.2 MB Directory structure: gitextract_w32vn7wx/ ├── .codecov.yml ├── .flexci/ │ ├── config.pbtxt │ ├── gen_config.py │ └── pytest_script.sh ├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── LICENSE ├── README.md ├── chainer_chemistry/ │ ├── __init__.py │ ├── _version.py │ ├── config.py │ ├── dataset/ │ │ ├── __init__.py │ │ ├── converters/ │ │ │ ├── __init__.py │ │ │ ├── cgcnn_converter.py │ │ │ ├── concat_mols.py │ │ │ └── megnet_converter.py │ │ ├── graph_dataset/ │ │ │ ├── __init__.py │ │ │ ├── base_graph_data.py │ │ │ ├── base_graph_dataset.py │ │ │ └── feature_converters.py │ │ ├── indexer.py │ │ ├── indexers/ │ │ │ ├── __init__.py │ │ │ └── numpy_tuple_dataset_feature_indexer.py │ │ ├── networkx_preprocessors/ │ │ │ ├── base_networkx.py │ │ │ └── reddit_coo.py │ │ ├── parsers/ │ │ │ ├── __init__.py │ │ │ ├── base_parser.py │ │ │ ├── csv_file_parser.py │ │ │ ├── data_frame_parser.py │ │ │ ├── sdf_file_parser.py │ │ │ └── smiles_parser.py │ │ ├── preprocessors/ │ │ │ ├── __init__.py │ │ │ ├── atomic_number_preprocessor.py │ │ │ ├── base_preprocessor.py │ │ │ ├── cgcnn_preprocessor.py │ │ │ ├── common.py │ │ │ ├── ecfp_preprocessor.py │ │ │ ├── ggnn_preprocessor.py │ │ │ ├── gin_preprocessor.py │ │ │ ├── gnnfilm_preprocessor.py │ │ │ ├── gwm_preprocessor.py │ │ │ ├── megnet_preprocessor.py │ │ │ ├── mol_preprocessor.py │ │ │ ├── nfp_preprocessor.py │ │ │ ├── relgat_preprocessor.py │ │ │ ├── relgcn_preprocessor.py │ │ │ ├── rsgcn_preprocessor.py │ │ │ ├── schnet_preprocessor.py │ │ │ ├── weavenet_preprocessor.py │ │ │ ├── wle.py │ │ │ ├── wle_atom_array_update.py │ │ │ ├── wle_io.py │ │ │ └── wle_util.py │ │ ├── splitters/ │ │ │ ├── __init__.py │ │ │ ├── base_splitter.py │ │ │ ├── deepchem_scaffold_splitter.py │ │ │ ├── random_splitter.py │ │ │ ├── scaffold_splitter.py │ │ │ ├── stratified_splitter.py │ │ │ └── time_splitter.py │ │ └── utils.py │ ├── datasets/ 
│ │ ├── __init__.py │ │ ├── citation_network/ │ │ │ ├── citation.py │ │ │ ├── citeseer.py │ │ │ └── cora.py │ │ ├── molnet/ │ │ │ ├── __init__.py │ │ │ ├── chembl_tasks.py │ │ │ ├── molnet.py │ │ │ ├── molnet_config.py │ │ │ ├── pdbbind_time.py │ │ │ └── toxcast_tasks.py │ │ ├── numpy_tuple_dataset.py │ │ ├── qm9.py │ │ ├── reddit/ │ │ │ └── reddit.py │ │ ├── tox21.py │ │ └── zinc.py │ ├── functions/ │ │ ├── __init__.py │ │ ├── activation/ │ │ │ ├── __init__.py │ │ │ ├── megnet_softplus.py │ │ │ ├── shifted_softplus.py │ │ │ └── softmax.py │ │ ├── evaluation/ │ │ │ ├── __init__.py │ │ │ └── r2_score.py │ │ ├── loss/ │ │ │ ├── __init__.py │ │ │ ├── mean_absolute_error.py │ │ │ └── mean_squared_error.py │ │ └── math/ │ │ ├── __init__.py │ │ └── matmul.py │ ├── iterators/ │ │ ├── __init__.py │ │ ├── balanced_serial_iterator.py │ │ └── index_iterator.py │ ├── link_hooks/ │ │ ├── __init__.py │ │ └── variable_monitor_link_hook.py │ ├── links/ │ │ ├── __init__.py │ │ ├── array/ │ │ │ ├── __init__.py │ │ │ └── shape_transformer_to_2d.py │ │ ├── connection/ │ │ │ ├── __init__.py │ │ │ ├── embed_atom_id.py │ │ │ ├── graph_linear.py │ │ │ └── graph_mlp.py │ │ ├── normalization/ │ │ │ ├── __init__.py │ │ │ └── graph_batch_normalization.py │ │ ├── readout/ │ │ │ ├── __init__.py │ │ │ ├── cgcnn_readout.py │ │ │ ├── general_readout.py │ │ │ ├── ggnn_readout.py │ │ │ ├── megnet_readout.py │ │ │ ├── mpnn_readout.py │ │ │ ├── nfp_readout.py │ │ │ ├── scatter_ggnn_readout.py │ │ │ ├── schnet_readout.py │ │ │ └── set2set.py │ │ ├── scaler/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── flow_scaler.py │ │ │ ├── max_abs_scaler.py │ │ │ ├── min_max_scaler.py │ │ │ └── standard_scaler.py │ │ └── update/ │ │ ├── __init__.py │ │ ├── cgcnn_update.py │ │ ├── ggnn_update.py │ │ ├── gin_update.py │ │ ├── gnn_film_update.py │ │ ├── megnet_update.py │ │ ├── mpnn_update.py │ │ ├── nfp_update.py │ │ ├── relgat_update.py │ │ ├── relgcn_update.py │ │ ├── rsgcn_update.py │ │ └── schnet_update.py │ 
├── models/ │ │ ├── __init__.py │ │ ├── cgcnn.py │ │ ├── cwle/ │ │ │ ├── __init__.py │ │ │ ├── cwle_graph_conv_model.py │ │ │ └── cwle_net.py │ │ ├── ggnn.py │ │ ├── gin.py │ │ ├── gnn_film.py │ │ ├── gwle/ │ │ │ ├── __init__.py │ │ │ ├── gwle_graph_conv_model.py │ │ │ └── gwle_net.py │ │ ├── gwm/ │ │ │ ├── __init__.py │ │ │ ├── gwm.py │ │ │ ├── gwm_graph_conv_model.py │ │ │ └── gwm_net.py │ │ ├── megnet.py │ │ ├── mlp.py │ │ ├── mpnn.py │ │ ├── nfp.py │ │ ├── prediction/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── classifier.py │ │ │ ├── graph_conv_predictor.py │ │ │ ├── node_classifier.py │ │ │ ├── regressor.py │ │ │ └── set_up_predictor.py │ │ ├── relgat.py │ │ ├── relgcn.py │ │ ├── rsgcn.py │ │ ├── schnet.py │ │ └── weavenet.py │ ├── saliency/ │ │ ├── __init__.py │ │ ├── calculator/ │ │ │ ├── __init__.py │ │ │ ├── base_calculator.py │ │ │ ├── calculator_utils.py │ │ │ ├── gradient_calculator.py │ │ │ ├── integrated_gradients_calculator.py │ │ │ └── occlusion_calculator.py │ │ └── visualizer/ │ │ ├── __init__.py │ │ ├── base_visualizer.py │ │ ├── image_visualizer.py │ │ ├── mol_visualizer.py │ │ ├── table_visualizer.py │ │ └── visualizer_utils.py │ ├── training/ │ │ ├── __init__.py │ │ └── extensions/ │ │ ├── __init__.py │ │ ├── auto_print_report.py │ │ ├── batch_evaluator.py │ │ ├── prc_auc_evaluator.py │ │ ├── r2_score_evaluator.py │ │ └── roc_auc_evaluator.py │ └── utils/ │ ├── __init__.py │ ├── extend.py │ ├── json_utils.py │ ├── permutation.py │ ├── sparse_utils.py │ └── train_utils.py ├── docker/ │ ├── conda/ │ │ ├── python36/ │ │ │ ├── Dockerfile │ │ │ └── conda-entrypoint.sh │ │ ├── python37/ │ │ │ ├── Dockerfile │ │ │ └── conda-entrypoint.sh │ │ ├── python37-chainerx-cpu-base/ │ │ │ ├── Dockerfile │ │ │ └── conda-entrypoint.sh │ │ ├── python37-chainerx-cpu-latest/ │ │ │ ├── Dockerfile │ │ │ └── conda-entrypoint.sh │ │ ├── python37-chainerx-cpu-stable/ │ │ │ ├── Dockerfile │ │ │ └── conda-entrypoint.sh │ │ ├── python37-chainerx-gpu-base/ │ │ │ ├── 
Dockerfile │ │ │ └── conda-entrypoint.sh │ │ ├── python37-chainerx-gpu-latest/ │ │ │ ├── Dockerfile │ │ │ └── conda-entrypoint.sh │ │ └── python37-chainerx-gpu-stable/ │ │ ├── Dockerfile │ │ └── conda-entrypoint.sh │ └── python3/ │ └── Dockerfile ├── docs/ │ ├── Makefile │ └── source/ │ ├── _autosummary_check.py │ ├── conf.py │ ├── contribution.rst │ ├── dataset.rst │ ├── datasets.rst │ ├── development.rst │ ├── environment.yml │ ├── functions.rst │ ├── index.rst │ ├── install.rst │ ├── iterators.rst │ ├── links.rst │ ├── models.rst │ ├── reference.rst │ ├── requirements.txt │ ├── training.rst │ ├── tutorial.rst │ └── utils.rst ├── examples/ │ ├── .gitignore │ ├── README.md │ ├── molnet/ │ │ ├── README.md │ │ ├── evaluate_models_molnet.sh │ │ ├── predict_molnet.py │ │ ├── summary_eval_molnet.py │ │ ├── test_molnet.sh │ │ └── train_molnet.py │ ├── molnet_wle/ │ │ ├── README.md │ │ ├── predict_molnet_wle.py │ │ └── train_molnet_wle.py │ ├── network_graph/ │ │ ├── README.md │ │ ├── citeseer/ │ │ │ └── .gitignore │ │ ├── cora/ │ │ │ └── .gitignore │ │ ├── padding_model_wrapper.py │ │ ├── reddit/ │ │ │ └── .gitignore │ │ └── train_network_graph.py │ ├── own_dataset/ │ │ ├── README.md │ │ ├── dataset_test.csv │ │ ├── dataset_train.csv │ │ ├── evaluate_own_dataset.sh │ │ ├── plot.py │ │ ├── predict_own_dataset.py │ │ ├── test_own_dataset.sh │ │ └── train_own_dataset.py │ ├── qm9/ │ │ ├── README.md │ │ ├── evaluate_models_qm9.sh │ │ ├── plot.py │ │ ├── predict_qm9.py │ │ ├── qm9_dataset_exploration.ipynb │ │ ├── test_qm9.sh │ │ └── train_qm9.py │ ├── test_examples.sh │ └── tox21/ │ ├── .gitignore │ ├── README.md │ ├── data.py │ ├── evaluate_models_tox21.sh │ ├── plot.py │ ├── predict_tox21_with_classifier.py │ ├── test_tox21.sh │ ├── tox21_dataset_exploration.ipynb │ └── train_tox21.py ├── setup.py └── tests/ ├── dataset_tests/ │ ├── parsers_tests/ │ │ ├── test_csv_file_parser.py │ │ ├── test_data_frame_parser.py │ │ ├── test_sdf_file_parser.py │ │ └── 
test_smiles_parser.py │ ├── preprocessor_tests/ │ │ └── test_common.py │ ├── preprocessors_tests/ │ │ ├── test_atomic_number_preprocessor.py │ │ ├── test_cgcnn_preprocessor.py │ │ ├── test_gat_preprocessor.py │ │ ├── test_ggnn_preprocessor.py │ │ ├── test_gwm_preprocessor.py │ │ ├── test_mol_preprocessor.py │ │ ├── test_nfp_preprocessor.py │ │ ├── test_relgcn_preprocessor.py │ │ ├── test_rsgcn_preprocessor.py │ │ ├── test_schnet_preprocessor.py │ │ ├── test_weavenet_preprocessor.py │ │ ├── test_wle.py │ │ ├── test_wle_atom_array_update.py │ │ └── test_wle_util.py │ ├── splitters_tests/ │ │ ├── test_deepchem_scaffold_splitter.py │ │ ├── test_random_splitter.py │ │ ├── test_scaffold_splitter.py │ │ ├── test_stratified_splitter.py │ │ └── test_time_splitter.py │ ├── test_converters.py │ └── test_numpy_tuple_feature_indexer.py ├── datasets_tests/ │ ├── molnet_tests/ │ │ ├── test_molnet.py │ │ └── test_pdbbind_time.py │ ├── test_numpy_tuple_dataset.py │ ├── test_qm9.py │ ├── test_tox21.py │ └── test_zinc.py ├── functions_tests/ │ ├── activation/ │ │ ├── test_megnet_softplus.py │ │ ├── test_shifted_softplus.py │ │ └── test_softmax.py │ ├── evaluation/ │ │ └── test_r2_score.py │ └── loss/ │ ├── test_mean_absolute_error.py │ └── test_mean_squared_error.py ├── iterators_tests/ │ ├── test_balanced_serial_iterator.py │ └── test_index_iterator.py ├── link_hooks_tests/ │ └── test_variable_monitor_link_hook.py ├── links_tests/ │ ├── array_tests/ │ │ └── test_shape_transformer_to_2d.py │ ├── connection_tests/ │ │ ├── test_embed_atom_id.py │ │ ├── test_graph_linear.py │ │ └── test_graph_mlp.py │ ├── readout_tests/ │ │ ├── test_cgcnn_readout.py │ │ ├── test_general_readout.py │ │ ├── test_ggnn_readout.py │ │ ├── test_megnet_readout.py │ │ ├── test_mpnn_readout.py │ │ ├── test_nfp_readout.py │ │ ├── test_schnet_readout.py │ │ └── test_set2set.py │ ├── scaler_tests/ │ │ ├── test_flow_scaler.py │ │ ├── test_max_abs_scaler.py │ │ ├── test_min_max_scaler.py │ │ └── 
test_standard_scaler.py │ └── update_tests/ │ ├── test_cgcnn_update.py │ ├── test_ggnn_update.py │ ├── test_gin_update.py │ ├── test_gnn_film_update.py │ ├── test_megnet_update.py │ ├── test_mpnn_update.py │ ├── test_nfp_update.py │ ├── test_relgat_update.py │ ├── test_relgcn_update.py │ ├── test_rsgcn_update.py │ └── test_schnet_update.py ├── models_tests/ │ ├── gwm_tests/ │ │ ├── test_gwm.py │ │ └── test_gwm_graph_conv_model.py │ ├── prediction_tests/ │ │ ├── test_base.py │ │ ├── test_classifier.py │ │ ├── test_graph_conv_predictor.py │ │ ├── test_regressor.py │ │ └── test_set_up_predictor.py │ ├── test_cgcnn.py │ ├── test_ggnn.py │ ├── test_gin.py │ ├── test_gnn_film.py │ ├── test_megnet.py │ ├── test_mlp.py │ ├── test_mpnn.py │ ├── test_nfp.py │ ├── test_relgat.py │ ├── test_relgcn.py │ ├── test_rsgcn.py │ ├── test_schnet.py │ └── test_weavenet.py ├── saliency_tests/ │ ├── calculator_tests/ │ │ ├── test_base_calculator.py │ │ ├── test_calculator_utils.py │ │ ├── test_gradient_calculator.py │ │ ├── test_integrated_gradient_calculator.py │ │ └── test_occlusion_calculator.py │ └── visualizer_tests/ │ ├── test_image_visualizer.py │ ├── test_mol_visualizer.py │ ├── test_table_visualizer.py │ └── test_visualizer_utils.py ├── test_init.py ├── training_tests/ │ └── extensions_tests/ │ ├── test_auto_print_report.py │ ├── test_prc_auc_evaluator.py │ ├── test_r2_score_evaluator.py │ └── test_roc_auc_evaluator.py └── utils_tests/ ├── test_extend.py ├── test_json_utils.py ├── test_permutation.py ├── test_sparse_utils.py └── test_train_utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .codecov.yml ================================================ coverage: status: # pull-requests only # Allow to drop coverage project: default: threshold: 5.0% patch: default: threshold: 20.0% comment: layout: "header, diff" require_changes: false branches: null 
behavior: default flags: null paths: null ================================================ FILE: .flexci/config.pbtxt ================================================ # DO NOT MODIFY THIS FILE MANUALLY. # USE gen_config.py INSTEAD. configs { key: "chainerch.py37.stable.cpu.chx" value { requirement { cpu: 4 memory: 16 disk: 10 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "37" } environment_variables { key: "CHAINER" value: "stable" } environment_variables { key: "CHAINERX" value: "1" } environment_variables { key: "GPU" value: "0" } } } configs { key: "chainerch.py37.stable.gpu.chx" value { requirement { cpu: 4 memory: 16 disk: 10 gpu: 1 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "37" } environment_variables { key: "CHAINER" value: "stable" } environment_variables { key: "CHAINERX" value: "1" } environment_variables { key: "GPU" value: "1" } } } configs { key: "chainerch.py37.stable.cpu" value { requirement { cpu: 4 memory: 16 disk: 10 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "37" } environment_variables { key: "CHAINER" value: "stable" } environment_variables { key: "CHAINERX" value: "0" } environment_variables { key: "GPU" value: "0" } } } configs { key: "chainerch.py37.stable.gpu" value { requirement { cpu: 4 memory: 16 disk: 10 gpu: 1 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "37" } environment_variables { key: "CHAINER" value: "stable" } environment_variables { key: "CHAINERX" value: "0" } environment_variables { key: "GPU" value: "1" } } } configs { key: "chainerch.py37.latest.cpu.chx" value { requirement { cpu: 4 memory: 16 disk: 10 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "37" } environment_variables { key: "CHAINER" value: "latest" } environment_variables { key: "CHAINERX" value: "1" } environment_variables { key: "GPU" value: "0" } 
} } configs { key: "chainerch.py37.latest.gpu.chx" value { requirement { cpu: 4 memory: 16 disk: 10 gpu: 1 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "37" } environment_variables { key: "CHAINER" value: "latest" } environment_variables { key: "CHAINERX" value: "1" } environment_variables { key: "GPU" value: "1" } } } configs { key: "chainerch.py37.latest.cpu" value { requirement { cpu: 4 memory: 16 disk: 10 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "37" } environment_variables { key: "CHAINER" value: "latest" } environment_variables { key: "CHAINERX" value: "0" } environment_variables { key: "GPU" value: "0" } } } configs { key: "chainerch.py37.latest.gpu" value { requirement { cpu: 4 memory: 16 disk: 10 gpu: 1 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "37" } environment_variables { key: "CHAINER" value: "latest" } environment_variables { key: "CHAINERX" value: "0" } environment_variables { key: "GPU" value: "1" } } } configs { key: "chainerch.py37.base.cpu.chx" value { requirement { cpu: 4 memory: 16 disk: 10 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "37" } environment_variables { key: "CHAINER" value: "base" } environment_variables { key: "CHAINERX" value: "1" } environment_variables { key: "GPU" value: "0" } } } configs { key: "chainerch.py37.base.gpu.chx" value { requirement { cpu: 4 memory: 16 disk: 10 gpu: 1 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "37" } environment_variables { key: "CHAINER" value: "base" } environment_variables { key: "CHAINERX" value: "1" } environment_variables { key: "GPU" value: "1" } } } configs { key: "chainerch.py37.base.cpu" value { requirement { cpu: 4 memory: 16 disk: 10 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "37" } environment_variables { key: "CHAINER" 
value: "base" } environment_variables { key: "CHAINERX" value: "0" } environment_variables { key: "GPU" value: "0" } } } configs { key: "chainerch.py37.base.gpu" value { requirement { cpu: 4 memory: 16 disk: 10 gpu: 1 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "37" } environment_variables { key: "CHAINER" value: "base" } environment_variables { key: "CHAINERX" value: "0" } environment_variables { key: "GPU" value: "1" } } } configs { key: "chainerch.py36.stable.gpu" value { requirement { cpu: 4 memory: 16 disk: 10 gpu: 1 } command: "bash .flexci/pytest_script.sh" environment_variables { key: "PYTHON" value: "36" } environment_variables { key: "CHAINER" value: "stable" } environment_variables { key: "CHAINERX" value: "0" } environment_variables { key: "GPU" value: "1" } } } ================================================ FILE: .flexci/gen_config.py ================================================ """Config generator for Flex CI Usage: $ python gen_config.py > config.pbtxt """ from __future__ import print_function from collections import OrderedDict import itertools def test_config(python, chainer, target, chainerx): if chainerx: s_chainerx = '.chx' else: s_chainerx = '' key = 'chainerch.py{}.{}.{}{}'.format(python, chainer, target, s_chainerx) value = OrderedDict(( ('requirement', OrderedDict(( ('cpu', 4), ('memory', 16), ('disk', 10), ))), ('command', 'bash .flexci/pytest_script.sh'), ('environment_variables', [ ('PYTHON', str(python)), ('CHAINER', chainer), ('CHAINERX', '1' if chainerx else '0'), ('GPU', '1' if target == 'gpu' else '0'), ]), )) if target == 'gpu': value['requirement']['gpu'] = 1 return key, value def main(): configs = [] for python, chainer in itertools.product( (37,), ('stable', 'latest', 'base')): for chainerx in (True, False): configs.append(test_config(python, chainer, 'cpu', chainerx)) configs.append(test_config(python, chainer, 'gpu', chainerx)) # small test in python 36 
configs.append(test_config(36, 'stable', 'gpu', False)) print('# DO NOT MODIFY THIS FILE MANUALLY.') print('# USE gen_config.py INSTEAD.') print() dump_pbtxt('configs', configs) def dump_pbtxt(key, value, level=0): indent = ' ' * level if isinstance(value, int): print('{}{}: {}'.format(indent, key, value)) elif isinstance(value, str): print('{}{}: "{}"'.format(indent, key, value)) elif isinstance(value, list): for k, v in value: print('{}{} {{'.format(indent, key)) dump_pbtxt('key', k, level + 1) dump_pbtxt('value', v, level + 1) print('{}}}'.format(indent)) elif isinstance(value, dict): print('{}{} {{'.format(indent, key)) for k, v in value.items(): dump_pbtxt(k, v, level + 1) print('{}}}'.format(indent)) if __name__ == '__main__': main() ================================================ FILE: .flexci/pytest_script.sh ================================================ #!/bin/bash set -eux BASE=6.0.0 service docker stop mount -t tmpfs -o size=100% tmpfs /var/lib/docker service docker start gcloud auth configure-docker if [ ${CHAINERX} -gt 0 ]; then if [ ${GPU} -gt 0 ]; then case ${CHAINER} in stable) DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON-chainerx-gpu-stable:latest ;; latest) DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON-chainerx-gpu-latest:latest ;; base) DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON-chainerx-gpu-base:latest ;; esac else case ${CHAINER} in stable) DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON-chainerx-cpu-stable:latest ;; latest) DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON-chainerx-cpu-latest:latest ;; base) DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON-chainerx-cpu-base:latest ;; esac fi echo "Use installed chainer in Docker image" else DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON:latest case ${CHAINER} in stable) echo pip install chainer >> install.sh ;; latest) echo pip install --pre chainer >> install.sh ;; base) echo pip 
install chainer==${BASE} >> install.sh ;; esac if [ ${GPU} -gt 0 ]; then case ${CHAINER} in stable) echo pip install cupy-cuda101 >> install.sh ;; latest) echo pip install --pre cupy-cuda101 >> install.sh ;; base) echo pip install cupy-cuda101==${BASE} >> install.sh ;; esac fi fi echo pip install pytest-cov pytest-xdist mock >> install.sh echo pip install -e . >> install.sh echo $DOCKER_IMAGE cat install.sh if [ ${GPU} -gt 0 ]; then PYTEST_OPTION="not slow" RUNTIME="--runtime=nvidia" else PYTEST_OPTION="not slow and not gpu" RUNTIME="" fi docker run $RUNTIME --interactive --rm \ --volume $(pwd):/repo/ --workdir /repo/\ $DOCKER_IMAGE sh -ex << EOD . ./install.sh pytest -n 4 --cov=chainer_chemistry -m '${PYTEST_OPTION}' tests/ EOD ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ docs/source/generated # PyBuilder target/ # IPython Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # dotenv .env # virtualenv venv/ ENV/ # Spyder project settings .spyderproject # Rope project settings .ropeproject # PyCharm project settings .idea/ examples/tox21/input/ examples/qm9/input/ examples/molnet/input/ # Trained models and plots examples/tox21/eval_* examples/qm9/eval_* examples/molnet/eval_* examples/own_dataset/eval_* # emacs *~ # VSCode .vscode/ # Visual Studio .vs/ *.sln *.pyproj .pytest_cache ================================================ FILE: .readthedocs.yml ================================================ name: chainer-chemistry type: sphinx base: docs/source conda: file: docs/source/environment.yml ================================================ FILE: .travis.yml ================================================ sudo: false language: python os: linux dist: trusty python: - 3.6 env: env: - CHAINER_VERSION="chainer==7.0.0" - CHAINER_VERSION="chainer" - CHAINER_VERSION="prerelease" install: - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - export PATH="$HOME/miniconda/bin:$PATH" - hash -r - conda config --set always_yes yes - conda update -q conda - conda info -a - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION - source activate test-environment - pip install codecov - pip install mock - conda install pip pytest pytest-cov - conda install -c rdkit rdkit==2019.03.2.0 - if [ "${CHAINER_VERSION}" = "prerelease" ]; then pip install 
--pre chainer; else pip install "${CHAINER_VERSION}"; fi - pip install --no-cache-dir -e . script: - if [ "${TRAVIS_EVENT_TYPE}" = "cron" ]; then pytest --cov=./ -m "not gpu" tests; (cd examples && bash -x test_examples.sh -1); else pytest --cov=./ -m "not (gpu or slow)" tests; fi after_success: - codecov ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2017 Preferred Networks, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ # Chainer Chemistry: A Library for Deep Learning in Biology and Chemistry [![PyPI](https://img.shields.io/pypi/v/chainer-chemistry.svg)](https://pypi.python.org/pypi/chainer-chemistry) [![GitHub license](https://img.shields.io/github/license/pfnet-research/chainer-chemistry.svg)](https://github.com/pfnet-research/chainer-chemistry/blob/master/LICENSE) [![travis](https://img.shields.io/travis/pfnet-research/chainer-chemistry/master.svg)](https://travis-ci.org/pfnet-research/chainer-chemistry) [![Read the Docs](https://readthedocs.org/projects/chainer-chemistry/badge/?version=latest)](http://chainer-chemistry.readthedocs.io/en/latest/?badge=latest)

Chainer Chemistry Overview

Chainer Chemistry is a deep learning framework (based on Chainer) with applications in Biology and Chemistry. It supports various state-of-the-art models (especially GCNN - Graph Convolutional Neural Network) for chemical property prediction. For more information, please refer to the [documentation](http://chainer-chemistry.readthedocs.io/en/latest/index.html). Also, a quick introduction to deep learning for molecules and Chainer Chemistry is available [here](https://www.slideshare.net/KentaOono/deep-learning-for-molecules-introduction-to-chainer-chemistry-93288837). ## Dependencies Chainer Chemistry depends on the following packages: - [`chainer`](https://docs.chainer.org/en/stable/index.html) - [`pandas`](https://pandas.pydata.org) - [`scikit-learn`](http://scikit-learn.org/stable/) - [`tqdm`](https://pypi.python.org/pypi/tqdm) - [`h5py`](https://pypi.python.org/pypi/h5py) These are automatically added to the system when installing the library via the `pip` command (see _Installation_). However, the following needs to be installed manually: - [`rdkit (release 2019.03.2.0)`](https://github.com/rdkit/rdkit) Please refer to the RDKit [documentation](http://www.rdkit.org/docs/Install.html) for more information regarding the installation steps. Note that only the following versions of Chainer Chemistry's dependencies are currently supported: | Chainer Chemistry | Chainer | RDKit | Python | | ------------------: | --------------: | -------------: | ---------------: | | v0.1.0 ~ v0.3.0 | v2.0 ~ v3.0 | 2017.09.3.0 | 2.7, 3.5, 3.6 | | v0.4.0 | v3.0 ~ v4.0 *1 | 2017.09.3.0 | 2.7, 3.5, 3.6 | | v0.5.0 | v3.0 ~ v5.0 *2 | 2017.09.3.0 | 2.7, 3.5, 3.6 | | v0.6.0 | v6.0 ~ *3 | 2017.09.3.0 | 2.7, 3.5, 3.6 | | v0.7.0 ~ v0.7.1 | v7.0 ~ | 2019.03.2.0 | 3.6, 3.7 *4 | | master branch *5 | v7.0 ~ | 2019.03.2.0 | 3.6, 3.7 | [Footnote] *1: We used `FunctionNode` in [this PR](https://github.com/pfnet-research/chainer-chemistry/pull/190), which is introduced after chainer v3. 
See [this issue](https://github.com/pfnet-research/chainer-chemistry/issues/192) for details. *2: Saliency modules only work after chainer v5. *3: Chainer v6 is released and [ChainerX](https://chainer.org/announcement/2018/12/03/chainerx.html) is newly introduced. In order to support this new feature & API, we broke backward compatibility for chainer chemistry v0.6.0 release. See [ChainerX Documentation](https://chainer.org/announcement/2018/12/03/chainerx.html) for details. *4: python 2.x support is dropped, following the same policy with `chainer` and `rdkit`. *5: As [announced in chainer blog](https://chainer.org/announcement/2019/12/05/released-v7.html), further development will be limited to only serious bug-fixes and maintenance. ## Installation Chainer Chemistry can be installed using the `pip` command, as follows: ``` pip install chainer-chemistry ``` Example to install rdkit with conda: ```bash # newer conda version is necessary to install rdkit 2019.03.2.0 conda install -n base conda==4.6.14 conda install -c rdkit rdkit==2019.03.2.0 ``` If you would like to use the latest sources, please checkout the master branch and install with the following commands: ``` git clone https://github.com/pfnet-research/chainer-chemistry.git pip install -e chainer-chemistry ``` ## Sample Code Sample code is provided with this repository. This includes, but is not limited to, the following: - Training a new model on a given dataset - Performing inference on a given dataset, using a pretrained model - Evaluating and reporting performance metrics of different models on a given dataset Please refer to the `examples` directory for more information. ## Supported Models The following graph convolutional neural networks are currently supported: - NFP: Neural Fingerprint [2, 3] - GGNN: Gated Graph Neural Network [4, 3] - WeaveNet [5, 3] - SchNet [6] - RSGCN: Renormalized Spectral Graph Convolutional Network [10]
\* The name is not from the original paper - see [PR #89](https://github.com/pfnet-research/chainer-chemistry/pull/89) for the naming convention. - RelGCN: Relational Graph Convolutional Network [14] - GAT: Graph Attention Networks [15] - GIN: Graph Isomorphism Networks [17] - MPNN: Message Passing Neural Networks [3] - Set2Set [19] - GNN-FiLM: Graph Neural Networks with Feature-wise Linear Modulation [20] - MEGNet: MatErials Graph Network [24] - CGCNN: Crystal Graph Convolutional Neural Networks [25] We test supporting the brand-new Graph Warp Module (GWM) [18]-attached models for: - NFP ('nfp_gwm') - GGNN ('ggnn_gwm') - RSGCN ('rsgcn_gwm') - GIN ('gin_gwm') In the directory `examples/molnet_wle`, we have implemented the new preprocessing ''Weisfeiler-Lehman Embedding for Molecular Graph Neural Networks'' [26] for several GNN architectures. Please find the Readme in that directory for the usage and the details. ## Supported Datasets The following datasets are currently supported: ### Chemical - QM9 [7, 8] - Tox21 [9] - MoleculeNet [11] - ZINC (only 250k dataset) [12, 13] - User (own) dataset ### Network - cora [21] - citeseer [22] - reddit [23] ## Research Projects If you use Chainer Chemistry in your research, feel free to submit a pull request and add the name of your project to this list: - BayesGrad: Explaining Predictions of Graph Convolutional Networks ([paper](https://arxiv.org/abs/1807.01985), [code](https://github.com/pfnet-research/bayesgrad)) - Graph Warp Module: an Auxiliary Module for Boosting the Power of Graph Neural Networks ([paper](https://arxiv.org/abs/1902.01020), [code](https://github.com/k-ishiguro/chainer-chemistry/tree/gwm_for_CC)) - GraphNVP: An Invertible Flow Model for Generating Molecular Graphs ([paper](https://arxiv.org/abs/1905.11600), [code](https://github.com/pfnet-research/graph-nvp)) - Graph Residual Flow for Molecular Graph Generation ([paper](https://arxiv.org/abs/1909.13521)) ## Useful Links Chainer Chemistry: - 
[Documentation](https://chainer-chemistry.readthedocs.io) - [Research Blog](https://preferredresearch.jp/2017/12/18/chainer-chemistry-beta-release/) Other Chainer frameworks: - [Chainer: A Flexible Framework of Neural Networks for Deep Learning](https://chainer.org/) - [ChainerRL: Deep Reinforcement Learning Library Built on Top of Chainer](https://github.com/chainer/chainerrl) - [ChainerCV: A Library for Deep Learning in Computer Vision](https://github.com/chainer/chainercv) - [ChainerMN: Scalable Distributed Deep Learning with Chainer](https://github.com/chainer/chainermn) - [ChainerUI: User Interface for Chainer](https://github.com/chainer/chainerui) ## License This project is released under the MIT License. Please refer to the [this page](https://github.com/pfnet-research/chainer-chemistry/blob/master/LICENSE) for more information. Please note that Chainer Chemistry is still in experimental development. We continuously strive to improve its functionality and performance, but at this stage we cannot guarantee the reproducibility of any results published in papers. Use the library at your own risk. ## References [1] Seiya Tokui, Kenta Oono, Shohei Hido, and Justin Clayton. Chainer: a next-generation open source framework for deep learning. In *Proceedings of Workshop on Machine Learning Systems (LearningSys) in Advances in Neural Information Processing System (NIPS) 28*, 2015. [2] David K Duvenaud, Dougal Maclaurin, Jorge Iparraguirre, Rafael Bombarell, Timothy Hirzel, Alan Aspuru-Guzik, and Ryan P Adams. Convolutional networks on graphs for learning molecular fingerprints. In C. Cortes, N. D. Lawrence, D. D. Lee, M. Sugiyama, and R. Garnett, editors, *Advances in Neural Information Processing Systems (NIPS) 28*, pages 2224–2232. Curran Asso- ciates, Inc., 2015. [3] Justin Gilmer, Samuel S Schoenholz, Patrick F Riley, Oriol Vinyals, and George E Dahl. Neural message passing for quantum chemistry. *arXiv preprint arXiv:1704.01212*, 2017. 
[4] Yujia Li, Daniel Tarlow, Marc Brockschmidt, and Richard Zemel. Gated graph sequence neural networks. *arXiv preprint arXiv:1511.05493*, 2015. [5] Steven Kearnes, Kevin McCloskey, Marc Berndl, Vijay Pande, and Patrick Riley. Molecular graph convolutions: moving beyond fingerprints. *Journal of computer-aided molecular design*, 30(8):595–608, 2016. [6] Kristof Schütt, Pieter-Jan Kindermans, Huziel Enoc Sauceda Felix, Stefan Chmiela, Alexandre Tkatchenko, and Klaus-Rober Müller. Schnet: A continuous-filter convolutional neural network for modeling quantum interactions. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, *Advances in Neural Information Processing Systems (NIPS) 30*, pages 992–1002. Curran Associates, Inc., 2017. [7] Lars Ruddigkeit, Ruud Van Deursen, Lorenz C Blum, and Jean-Louis Reymond. Enumeration of 166 billion organic small molecules in the chemical universe database gdb-17. *Journal of chemical information and modeling*, 52(11):2864–2875, 2012. [8] Raghunathan Ramakrishnan, Pavlo O Dral, Matthias Rupp, and O Anatole Von Lilienfeld. Quantum chemistry structures and properties of 134 kilo molecules. *Scientific data*, 1:140022, 2014. [9] Ruili Huang, Menghang Xia, Dac-Trung Nguyen, Tongan Zhao, Srilatha Sakamuru, Jinghua Zhao, Sampada A Shahane, Anna Rossoshek, and Anton Simeonov. Tox21challenge to build predictive models of nuclear receptor and stress response pathways as mediated by exposure to environmental chemicals and drugs. *Frontiers in Environmental Science*, 3:85, 2016. [10] Kipf, Thomas N. and Welling, Max. Semi-Supervised Classification with Graph Convolutional Networks. *International Conference on Learning Representations (ICLR)*, 2017. [11] Zhenqin Wu, Bharath Ramsundar, Evan N. Feinberg, Joseph Gomes, Caleb Geniesse, Aneesh S. Pappu, Karl Leswing, Vijay Pande, MoleculeNet: A Benchmark for Molecular Machine Learning, arXiv preprint, arXiv: 1703.00564, 2017. [12] J. J. Irwin, T. 
Sterling, M. M. Mysinger, E. S. Bolstad, and R. G. Coleman. Zinc: a free tool to discover chemistry for biology. *Journal of chemical information and modeling*, 52(7):1757–1768, 2012. [13] Preprocessed csv file downloaded from https://raw.githubusercontent.com/aspuru-guzik-group/chemical_vae/master/models/zinc_properties/250k_rndm_zinc_drugs_clean_3.csv [14] Michael Schlichtkrull, Thomas N. Kipf, Peter Bloem, Rianne van den Berg, Ivan Titov, Max Welling. Modeling Relational Data with Graph Convolutional Networks. *Extended Semantic Web Conference (ESWC)*, 2018. [15] Veličković, P., Cucurull, G., Casanova, A., Romero, A., Liò, P., & Bengio, Y. (2017). Graph Attention Networks. arXiv preprint arXiv:1710.10903. [16] Dan Busbridge, Dane Sherburn, Pietro Cavallo and Nils Y. Hammerla. (2019). Relational Graph Attention Networks. https://openreview.net/forum?id=Bklzkh0qFm [17] Keyulu Xu, Weihua Hu, Jure Leskovec, Stefanie Jegelka, ``How Powerful are Graph Neural Networks?'', arXiv:1810.00826 [cs.LG], 2018 (to appear at ICLR19). [18] K. Ishiguro, S. Maeda, and M. Koyama, ``Graph Warp Module: an Auxiliary Module for Boosting the Power of Graph Neural Networks'', arXiv:1902.01020 [cs.LG], 2019. [19] Oriol Vinyals, Samy Bengio, Manjunath Kudlur. Order Matters: Sequence to sequence for sets. *arXiv preprint arXiv:1511.06391*, 2015. [20] Marc Brockschmidt, ``GNN-FiLM: Graph Neural Networks with Feature-wise Linear Modulation'', arXiv:1906.12192 [cs.ML], 2019. [21] McCallum, Andrew Kachites and Nigam, Kamal and Rennie, Jason and Seymore, Kristie, Automating the Construction of Internet Portals with Machine Learning. *Information Retrieval*, 2000. [22] C. Lee Giles and Kurt D. Bollacker and Steve Lawrence, CiteSeer: An Automatic Citation Indexing System. *Proceedings of the Third ACM Conference on Digital Libraries*, 1998. [23] William L. Hamilton and Zhitao Ying and Jure Leskovec, Inductive Representation Learning on Large Graphs. 
*Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, 4-9 December 2017* [24] Chi Chen, Weike Ye, Yunxing Zuo, Chen Zheng, and Shyue Ping Ong. Graph networks as a universal machine learning framework for molecules and crystals. *Chemistry of Materials*, 31(9):3564–3572, 2019. [25] Tian Xie and Jeffrey C Grossman. Crystal graph convolutional neural networks for an accurate and interpretable prediction of material properties. *Physical review letters*, 120(14):145301, 2018. [26] Katsuhiko Ishiguro, Kenta Oono, and Kohei Hayashi, "Weisfeiler-Lehman Embedding for Molecular Graph Neural Networks", arXiv: 2006.06909, 2020. [paper link](https://arxiv.org/abs/2006.06909) ================================================ FILE: chainer_chemistry/__init__.py ================================================ import warnings from chainer_chemistry import dataset # NOQA try: from chainer_chemistry import datasets # NOQA except ImportError as e: if 'rdkit' in e.msg: warnings.warn( 'A module chainer_chemistry.datasets was not imported, ' 'probably because RDKit is not installed. 
' 'To install RDKit, please follow instruction in ' 'https://github.com/pfnet-research/chainer-chemistry#installation.', # NOQA UserWarning) else: raise(e) from chainer_chemistry import functions # NOQA from chainer_chemistry import links # NOQA from chainer_chemistry import models # NOQA from chainer_chemistry import training # NOQA # --- config variable definitions --- from chainer_chemistry.config import * # NOQA from chainer_chemistry import _version # NOQA __version__ = _version.__version__ ================================================ FILE: chainer_chemistry/_version.py ================================================ __version__ = '0.7.1' ================================================ FILE: chainer_chemistry/config.py ================================================ # --- Configuration --- # --- Constant definitions --- # The maximum atomic number in rdkit MAX_ATOMIC_NUM = 117 WEAVE_DEFAULT_NUM_MAX_ATOMS = 20 # 60 # paper ================================================ FILE: chainer_chemistry/dataset/__init__.py ================================================ from chainer_chemistry.dataset.indexer import BaseFeatureIndexer # NOQA from chainer_chemistry.dataset.indexer import BaseIndexer # NOQA ================================================ FILE: chainer_chemistry/dataset/converters/__init__.py ================================================ from chainer_chemistry.dataset.converters.cgcnn_converter import cgcnn_converter # NOQA from chainer_chemistry.dataset.converters.concat_mols import concat_mols # NOQA from chainer_chemistry.dataset.converters.megnet_converter import megnet_converter # NOQA converter_method_dict = { 'ecfp': concat_mols, 'nfp': concat_mols, 'nfp_gwm': concat_mols, 'ggnn': concat_mols, 'ggnn_gwm': concat_mols, 'gin': concat_mols, 'gin_gwm': concat_mols, 'schnet': concat_mols, 'weavenet': concat_mols, 'relgcn': concat_mols, 'rsgcn': concat_mols, 'rsgcn_gwm': concat_mols, 'relgat': concat_mols, 'gnnfilm': concat_mols, 'megnet': 
@chainer.dataset.converter()
def cgcnn_converter(batch, device=None, padding=None):
    """CGCNN converter

    Packs a list of (atom_feat, nbr_feat, nbr_idx, target) examples into
    one big graph: atom/neighbor features are concatenated along axis 0
    and neighbor indices are shifted by the number of atoms already packed.

    Args:
        batch (list): list of CGCNN examples given by a dataset iterator.
        device (chainer.backend.Device): device to send the arrays to.
        padding: unused; kept for converter-interface compatibility.

    Returns:
        tuple: (atom_feat, nbr_feat, batch_atom_idx, nbr_idx, target)

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError("batch is empty")

    atom_feat, nbr_feat, nbr_idx = [], [], []
    batch_atom_idx, target = [], []
    current_idx = 0
    xp = device.xp
    for element in batch:
        atom_feat.append(element[0])
        nbr_feat.append(element[1])
        # shift neighbor indices so they address rows of the packed matrix
        nbr_idx.append(element[2] + current_idx)
        target.append(element[3])
        n_atom = element[0].shape[0]
        atom_idx = numpy.arange(n_atom) + current_idx
        batch_atom_idx.append(atom_idx)
        current_idx += n_atom

    atom_feat = to_device(device, functions.concat(atom_feat, axis=0).data)
    nbr_feat = to_device(device, functions.concat(nbr_feat, axis=0).data)
    # Always use numpy array for batch_atom_index; this is a list of
    # variable-length arrays, so build the object array explicitly --
    # an implicit ragged ``numpy.array(...)`` raises ValueError on
    # NumPy >= 1.24 (deprecated since NumPy 1.20, NEP 34).
    batch_atom_idx_arr = numpy.empty(len(batch_atom_idx), dtype=object)
    batch_atom_idx_arr[:] = batch_atom_idx
    nbr_idx = to_device(device, functions.concat(nbr_idx, axis=0).data)
    target = to_device(device, xp.asarray(target))
    result = (atom_feat, nbr_feat, batch_atom_idx_arr, nbr_idx, target)
    return result
It repeats this for all positions and returns the resulting arrays. The output type depends on the type of examples in ``batch``. For instance, consider each example consists of two arrays ``(x, y)``. Then, this function concatenates ``x`` 's into one array, and ``y`` 's into another array, and returns a tuple of these two arrays. Another example: consider each example is a dictionary of two entries whose keys are ``'x'`` and ``'y'``, respectively, and values are arrays. Then, this function concatenates ``x`` 's into one array, and ``y`` 's into another array, and returns a dictionary with two entries ``x`` and ``y`` whose values are the concatenated arrays. When the arrays to concatenate have different shapes, the behavior depends on the ``padding`` value. If ``padding`` is ``None``, it raises an error. Otherwise, it builds an array of the minimum shape that the contents of all arrays can be substituted to. The padding value is then used to the extra elements of the resulting arrays. The current implementation is identical to :func:`~chainer.dataset.concat_examples` of Chainer, except the default value of the ``padding`` option is changed to ``0``. .. admonition:: Example >>> import numpy >>> from chainer_chemistry.dataset.converters import concat_mols >>> x0 = numpy.array([1, 2]) >>> x1 = numpy.array([4, 5, 6]) >>> dataset = [x0, x1] >>> results = concat_mols(dataset) >>> print(results) [[1 2 0] [4 5 6]] .. seealso:: :func:`chainer.dataset.concat_examples` Args: batch (list): A list of examples. This is typically given by a dataset iterator. device (int): Device ID to which each array is sent. Negative value indicates the host memory (CPU). If it is omitted, all arrays are left in the original device. padding: Scalar value for extra elements. If this is None (default), an error is raised on shape mismatch. Otherwise, an array of minimum dimensionalities that can accommodate all arrays is created, and elements outside of the examples are padded by this value. 
@chainer.dataset.converter()
def megnet_converter(batch, device=None, padding=0):
    """MEGNet converter

    Flattens a list of (atom_feat, pair_feat, global_feat, (start, end),
    target) examples into batched arrays on ``device``.  Pair start/end
    indices are shifted so they address rows of the concatenated atom
    feature matrix, and per-graph index vectors record which graph each
    atom/pair row belongs to.

    Args:
        batch (list): list of MEGNet examples given by a dataset iterator.
        device (chainer.backend.Device): device to send the arrays to.
        padding: unused; kept for converter-interface compatibility.

    Returns:
        tuple: (atom_feat, pair_feat, global_feat, atom_idx, pair_idx,
        start_idx, end_idx, target)

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError("batch is empty")

    atom_rows, pair_rows, global_rows, targets = [], [], [], []
    atom_graph_id, pair_graph_id = [], []
    starts, ends = [], []
    offset = 0  # number of atom rows packed so far
    for graph_id, example in enumerate(batch):
        n_atom = example[0].shape[0]
        n_pair = example[1].shape[0]
        atom_rows.extend(example[0])
        pair_rows.extend(example[1])
        global_rows.append(example[2])
        atom_graph_id.extend([graph_id] * n_atom)
        pair_graph_id.extend([graph_id] * n_pair)
        # shift pair endpoints into the packed atom-row coordinate system
        starts.extend(example[3][0] + offset)
        ends.extend(example[3][1] + offset)
        targets.append(example[4])
        offset += n_atom

    xp = device.xp

    def _send(values):
        # materialize with the device's array module, then transfer
        return to_device(device, xp.asarray(values))

    result = (_send(atom_rows), _send(pair_rows), _send(global_rows),
              _send(atom_graph_id), _send(pair_graph_id),
              _send(starts), _send(ends), _send(targets))
    return result
class BaseGraphData(object):
    """Base class of graph data"""

    def __init__(self, *args, **kwargs):
        # every keyword argument becomes an attribute of the instance
        for key, value in kwargs.items():
            setattr(self, key, value)

    def to_device(self, device):
        """Send self to `device`

        Args:
            device (chainer.backend.Device): device

        Returns:
            self sent to `device`
        """
        for key, value in self.__dict__.items():
            if isinstance(value, numpy.ndarray):
                setattr(self, key, device.send(value))
            elif isinstance(value, chainer.utils.CooMatrix):
                # CooMatrix cannot be sent wholesale; rebuild it from its
                # transferred components
                moved = chainer.utils.CooMatrix(
                    device.send(value.data.array),
                    device.send(value.row),
                    device.send(value.col),
                    value.shape, order=value.order)
                setattr(self, key, moved)
        return self


class PaddingGraphData(BaseGraphData):
    """Graph data class for padding pattern

    Args:
        x (numpy.ndarray): input node feature
        adj (numpy.ndarray): adjacency matrix
        y (int or numpy.ndarray): graph or node label
    """

    def __init__(self, x=None, adj=None, super_node=None, pos=None, y=None,
                 **kwargs):
        self.x = x
        self.adj = adj
        self.super_node = super_node
        self.pos = pos
        self.y = y
        # number of nodes is the leading dimension of the node features
        self.n_nodes = x.shape[0]
        super(PaddingGraphData, self).__init__(**kwargs)


class SparseGraphData(BaseGraphData):
    """Graph data class for sparse pattern

    Args:
        x (numpy.ndarray): input node feature
        edge_index (numpy.ndarray): sources and destinations of edges
        edge_attr (numpy.ndarray): attribution of edges
        y (int or numpy.ndarray): graph or node label
    """

    def __init__(self, x=None, edge_index=None, edge_attr=None,
                 pos=None, super_node=None, y=None, **kwargs):
        self.x = x
        self.edge_index = edge_index
        self.edge_attr = edge_attr
        self.pos = pos
        self.super_node = super_node
        self.y = y
        # number of nodes is the leading dimension of the node features
        self.n_nodes = x.shape[0]
        super(SparseGraphData, self).__init__(**kwargs)
import BaseGraphData # NOQA from chainer_chemistry.dataset.graph_dataset.feature_converters \ import batch_with_padding, batch_without_padding, concat, shift_concat, \ concat_with_padding, shift_concat_with_padding # NOQA class BaseGraphDataset(object): """Base class of graph dataset (list of graph data)""" _pattern = '' _feature_entries = [] _feature_batch_method = [] def __init__(self, data_list, *args, **kwargs): self.data_list = data_list def register_feature(self, key, batch_method, skip_if_none=True): """Register feature with batch method Args: key (str): name of the feature batch_method (function): batch method skip_if_none (bool, optional): If true, skip if `batch_method` is None. Defaults to True. """ if skip_if_none and getattr(self.data_list[0], key, None) is None: return self._feature_entries.append(key) self._feature_batch_method.append(batch_method) def update_feature(self, key, batch_method): """Update batch method of the feature Args: key (str): name of the feature batch_method (function): batch method """ index = self._feature_entries.index(key) self._feature_batch_method[index] = batch_method def __len__(self): return len(self.data_list) def __getitem__(self, item): return self.data_list[item] def converter(self, batch, device=None): """Converter Args: batch (list[BaseGraphData]): list of graph data device (int, optional): specifier of device. Defaults to None. 
Returns: self sent to `device` """ if not isinstance(device, Device): device = chainer.get_device(device) batch = [method(name, batch, device=device) for name, method in zip(self._feature_entries, self._feature_batch_method)] data = BaseGraphData( **{key: value for key, value in zip(self._feature_entries, batch)}) return data class PaddingGraphDataset(BaseGraphDataset): """Graph dataset class for padding pattern""" _pattern = 'padding' def __init__(self, data_list): super(PaddingGraphDataset, self).__init__(data_list) self.register_feature('x', batch_with_padding) self.register_feature('adj', batch_with_padding) self.register_feature('super_node', batch_with_padding) self.register_feature('pos', batch_with_padding) self.register_feature('y', batch_without_padding) self.register_feature('n_nodes', batch_without_padding) class SparseGraphDataset(BaseGraphDataset): """Graph dataset class for sparse pattern""" _pattern = 'sparse' def __init__(self, data_list): super(SparseGraphDataset, self).__init__(data_list) self.register_feature('x', concat) self.register_feature('edge_index', shift_concat) self.register_feature('edge_attr', concat) self.register_feature('super_node', concat) self.register_feature('pos', concat) self.register_feature('y', batch_without_padding) self.register_feature('n_nodes', batch_without_padding) def converter(self, batch, device=None): """Converter add `self.batch`, which represents the index of the graph each node belongs to. Args: batch (list[BaseGraphData]): list of graph data device (int, optional): specifier of device. Defaults to None. 
Returns: self sent to `device` """ data = super(SparseGraphDataset, self).converter(batch, device=device) if not isinstance(device, Device): device = chainer.get_device(device) data.batch = numpy.concatenate([ numpy.full((data.x.shape[0]), i, dtype=numpy.int) for i, data in enumerate(batch) ]) data.batch = device.send(data.batch) return data # for experiment # use converter for the normal use def converter_with_padding(self, batch, device=None): self.update_feature('x', concat_with_padding) self.update_feature('edge_index', shift_concat_with_padding) data = super(SparseGraphDataset, self).converter(batch, device=device) if not isinstance(device, Device): device = chainer.get_device(device) max_n_nodes = max([data.x.shape[0] for data in batch]) data.batch = numpy.concatenate([ numpy.full((max_n_nodes), i, dtype=numpy.int) for i, data in enumerate(batch) ]) data.batch = device.send(data.batch) return data ================================================ FILE: chainer_chemistry/dataset/graph_dataset/feature_converters.py ================================================ import numpy from chainer.dataset.convert import _concat_arrays def batch_with_padding(name, batch, device=None, pad=0): """Batch with padding (increase ndim by 1) Args: name (str): propaty name of graph data batch (list[BaseGraphData]): list of base graph data device (chainer.backend.Device, optional): device. Defaults to None. pad (int, optional): padding value. Defaults to 0. Returns: BaseGraphDataset: graph dataset sent to `device` """ feat = _concat_arrays( [getattr(example, name) for example in batch], pad) return device.send(feat) def batch_without_padding(name, batch, device=None): """Batch without padding (increase ndim by 1) Args: name (str): propaty name of graph data batch (list[BaseGraphData]): list of base graph data device (chainer.backend.Device, optional): device. Defaults to None. 
def concat_with_padding(name, batch, device=None, pad=0):
    """Concat without padding (ndim does not increase)

    Pads per-graph arrays to a common length, then flattens the batched
    2-D result into a single 1-D array.

    Args:
        name (str): property name of graph data
        batch (list[BaseGraphData]): list of base graph data
        device (chainer.backend.Device, optional): device. Defaults to None.
        pad (int, optional): padding value. Defaults to 0.

    Returns:
        flattened padded feature sent to `device`
    """
    padded = batch_with_padding(name, batch, device=device, pad=pad)
    n_graphs, n_items = padded.shape
    return padded.reshape((n_graphs * n_items))


def concat(name, batch, device=None, axis=0):
    """Concat (ndim does not increase)

    Args:
        name (str): property name of graph data
        batch (list[BaseGraphData]): list of base graph data
        device (chainer.backend.Device, optional): device. Defaults to None.
        axis (int, optional): concatenation axis. Defaults to 0.

    Returns:
        concatenated feature sent to `device`
    """
    arrays = [getattr(data, name) for data in batch]
    return device.send(numpy.concatenate(arrays, axis=axis))
def shift_concat_with_padding(name, batch, device=None, shift_attr='x',
                              shift_axis=1):
    """Concat with index shift and padding (ndim does not increase)

    Concatenate graphs into a big one, assuming every graph is padded to
    the size of the largest graph in the batch: the indices of graph ``i``
    are shifted by ``i * max_n_nodes``.  Used for sparse pattern batching.

    Args:
        name (str): property name of graph data
        batch (list[BaseGraphData]): list of base graph data
        device (chainer.backend.Device, optional): device. Defaults to None.
        shift_attr (str, optional): attribute whose first-axis length gives
            the node count of each graph. Defaults to 'x'.
        shift_axis (int, optional): axis along which the shifted arrays
            are concatenated. Defaults to 1.

    Returns:
        concatenated, shifted feature sent to `device`
    """
    # Bug fix: honor ``shift_attr`` like the sibling ``shift_concat`` does;
    # the previous implementation hard-coded ``data.x``, silently ignoring
    # the parameter (identical behavior for the default shift_attr='x').
    max_n_nodes = max(getattr(data, shift_attr).shape[0] for data in batch)
    shift_index_array = numpy.arange(0, len(batch) * max_n_nodes, max_n_nodes)
    feat = numpy.concatenate([
        getattr(data, name) + shift_index_array[i]
        for i, data in enumerate(batch)], axis=shift_axis)
    return device.send(feat)
""" def __init__(self, dataset): super(BaseFeatureIndexer, self).__init__() self.dataset = dataset def features_length(self): """Returns length of features Returns (int): feature length """ raise NotImplementedError @property def dataset_length(self): return len(self.dataset) @property def shape(self): return self.dataset_length, self.features_length() def extract_feature_by_slice(self, slice_index, j): """Extracts `slice_index`-th data's `j`-th feature. Here, `slice_index` is indices of slice object. This method may be override to support efficient feature extraction. If not override, `ExtractBySliceNotSupportedError` is raised by default, and in this case `extract_feature` is used instead. Args: slice_index (slice): slice of data index to be extracted j (int): `j`-th feature to be extracted Returns: feature """ raise ExtractBySliceNotSupportedError def extract_feature(self, i, j): """Extracts `i`-th data's `j`-th feature Args: i (int): `i`-th data to be extracted j (int): `j`-th feature to be extracted Returns: feature """ raise NotImplementedError def create_feature_index_list(self, feature_index): if isinstance(feature_index, slice): feature_index_list = numpy.arange( *feature_index.indices(self.features_length()) ) elif isinstance(feature_index, (list, numpy.ndarray)): if isinstance(feature_index[0], (bool, numpy.bool, numpy.bool_)): if len(feature_index) != self.features_length(): raise ValueError('Feature index wrong length {} instead of' ' {}'.format(len(feature_index), self.features_length())) feature_index_list = numpy.argwhere(feature_index ).ravel() else: feature_index_list = feature_index else: # assuming int type feature_index_list = [feature_index] return feature_index_list def preprocess(self, item): pass def postprocess(self, item): pass def __getitem__(self, item): self.preprocess(item) if isinstance(item, tuple): index_dim = len(item) # multi dimensional access if index_dim == 1: # This is not unexpected case... 
data_index = item[0] feature_index_list = self.create_feature_index_list( slice(None) ) elif index_dim == 2: data_index, feature_index = item feature_index_list = self.create_feature_index_list( feature_index ) else: raise IndexError('too many indices for features') else: data_index = item feature_index_list = self.create_feature_index_list(slice(None)) if len(feature_index_list) == 1: self._extract_single_feature = True ret = self._extract_feature(data_index, feature_index_list[0]) else: self._extract_single_feature = False ret = tuple([self._extract_feature(data_index, j) for j in feature_index_list]) self.postprocess(item) return ret def check_type_feature_index(self, j): if j >= self.features_length(): raise IndexError('index {} is out of bounds for axis 1 with ' 'size {}'.format(j, self.features_length())) def _extract_feature(self, data_index, j): """Format `data_index` and call proper method to extract feature. Args: data_index (int, slice, list or numpy.ndarray): j (int or key): """ self.check_type_feature_index(j) if isinstance(data_index, slice): try: return self.extract_feature_by_slice(data_index, j) except ExtractBySliceNotSupportedError: # Accessing by each index, copy occurs current, stop, step = data_index.indices(self.dataset_length) res = [self.extract_feature(i, j) for i in six.moves.range(current, stop, step)] elif isinstance(data_index, (list, numpy.ndarray)): if len(data_index) == 0: try: # HACKING return self.extract_feature_by_slice(slice(0, 0, 1), j) except ExtractBySliceNotSupportedError: res = [] else: if isinstance(data_index[0], (bool, numpy.bool, numpy.bool_)): # Access by bool flag list if len(data_index) != self.dataset_length: raise ValueError( 'Feature index wrong length {} instead of' ' {}'.format(len(data_index), self.dataset_length)) data_index = numpy.argwhere(data_index).ravel() res = [self.extract_feature(i, j) for i in data_index] else: # `data_index` is expected to be `int` return self.extract_feature(data_index, j) try: 
class NumpyTupleDatasetFeatureIndexer(BaseFeatureIndexer):
    """FeatureIndexer for NumpyTupleDataset

    Args:
        dataset (NumpyTupleDataset): dataset instance
    """

    def __init__(self, dataset):
        super(NumpyTupleDatasetFeatureIndexer, self).__init__(dataset)
        # one underlying array per feature
        self.datasets = dataset.get_datasets()

    def features_length(self):
        return len(self.datasets)

    def extract_feature_by_slice(self, slice_index, j):
        # numpy indexing handles slices the same way as integers here,
        # so slice extraction simply delegates to extract_feature
        return self.extract_feature(slice_index, j)

    def extract_feature(self, i, j):
        return self.datasets[j][i]
dtype=numpy.float32) for v, data in graph.nodes.data(): x[v] = data['x'] return x def get_y(self, graph): if 'y' in graph.graph: y = graph.graph['y'] else: y = numpy.empty(graph.number_of_nodes(), dtype=numpy.int32) for v, data in graph.nodes.data(): y[v] = data['y'] return y class BasePaddingNetworkxPreprocessor(BaseNetworkxPreprocessor): """Base class to preprocess `Networkx::Graph` into `PaddingGraphDataset` """ # NOQA def __init__(self, use_coo=False, *args, **kwargs): self.use_coo = use_coo def construct_data(self, graph): """Construct `PaddingGraphData` from `Networkx::Graph` Args: graph (Networkx::Graph): graph Returns: PaddingGraphData: graph data of padding pattern """ if not self.use_coo: return PaddingGraphData( x=self.get_x(graph), adj=networkx.to_numpy_array(graph, dtype=numpy.float32), y=self.get_y(graph), label_num=graph.graph['label_num'] ) n_edges = graph.number_of_edges() * 2 row = numpy.empty((n_edges), dtype=numpy.int) col = numpy.empty((n_edges), dtype=numpy.int) data = numpy.ones((n_edges), dtype=numpy.float32) for i, edge in enumerate(graph.edges): row[2 * i] = edge[0] row[2 * i + 1] = edge[1] col[2 * i] = edge[1] col[2 * i + 1] = edge[0] # ensure row is sorted if not numpy.all(row[:-1] <= row[1:]): order = numpy.argsort(row) row = row[order] col = col[order] assert numpy.all(row[:-1] <= row[1:]) adj = chainer.utils.CooMatrix( data=data, row=row, col=col, shape=(graph.number_of_nodes(), graph.number_of_nodes()), order='C') return PaddingGraphData( x=self.get_x(graph), adj=adj, y=self.get_y(graph), label_num=graph.graph['label_num'] ) def create_dataset(self, graph_list): """Create `PaddingGraphDataset` from list of `Networkx::Graph` Args: graph_list (list[Networkx::Graph]): list of graphs Returns: PaddingGraphDataset: graph dataset of padding pattern """ data_list = [ self.construct_data(graph) for graph in graph_list ] dataset = PaddingGraphDataset(data_list) dataset.register_feature('label_num', batch_without_padding) return dataset class 
BaseSparseNetworkxPreprocessor(BaseNetworkxPreprocessor): """Base class to preprocess `Networkx::Graph` into `SparseGraphDataset` """ def construct_data(self, graph): """Construct `SparseGraphData` from `Networkx::Graph` Args: graph (Networkx::Graph): graph Returns: SparseGraphData: graph data of sparse pattern """ edge_index = numpy.empty((2, graph.number_of_edges() * 2), dtype=numpy.int) for i, edge in enumerate(graph.edges): edge_index[0][2 * i] = edge[0] edge_index[0][2 * i + 1] = edge[1] edge_index[1][2 * i] = edge[1] edge_index[1][2 * i + 1] = edge[0] return SparseGraphData( x=self.get_x(graph), edge_index=numpy.array(edge_index, dtype=numpy.int), y=self.get_y(graph), label_num=graph.graph['label_num'] ) def add_self_loop(self, graph): for v in range(graph.number_of_nodes()): graph.add_edge(v, v) return graph def create_dataset(self, graph_list): """Create `SparseGraphDataset` from list of `Networkx::Graph` Args: graph_list (list[Networkx::Graph]): list of graphs Returns: SparseGraphDataset: graph dataset of sparse pattern """ data_list = [ self.construct_data(graph) for graph in graph_list ] dataset = SparseGraphDataset(data_list) dataset.register_feature('label_num', batch_without_padding) return dataset ================================================ FILE: chainer_chemistry/dataset/networkx_preprocessors/reddit_coo.py ================================================ import os import numpy import scipy import chainer from chainer_chemistry.dataset.graph_dataset.base_graph_data import PaddingGraphData # NOQA def get_reddit_coo_data(dirpath): """Temporary function to obtain reddit coo data for GIN (because it takes to much time to convert it to networkx) Returns: PaddingGraphData: `PaddingGraphData` of reddit """ print("Loading node feature and label") reddit_data = numpy.load(os.path.join(dirpath, "reddit_data.npz")) print("Loading edge data") coo_adj = scipy.sparse.load_npz(os.path.join(dirpath, "reddit_graph.npz")) row = coo_adj.row.astype(numpy.int32) 
def get_reddit_coo_data(dirpath):
    """Temporary function to obtain reddit coo data for GIN
    (because it takes too much time to convert it to networkx)

    Args:
        dirpath (str): directory containing ``reddit_data.npz`` and
            ``reddit_graph.npz``.

    Returns:
        PaddingGraphData: `PaddingGraphData` of reddit
    """
    print("Loading node feature and label")
    reddit_data = numpy.load(os.path.join(dirpath, "reddit_data.npz"))
    print("Loading edge data")
    coo_adj = scipy.sparse.load_npz(os.path.join(dirpath, "reddit_graph.npz"))
    row = coo_adj.row.astype(numpy.int32)
    col = coo_adj.col.astype(numpy.int32)
    data = coo_adj.data.astype(numpy.float32)

    # ensure row is sorted -- required because CooMatrix is created with
    # order='C' below; the column order within one row is left as-is.
    if not numpy.all(row[:-1] <= row[1:]):
        order = numpy.argsort(row)
        row = row[order]
        col = col[order]
        assert numpy.all(row[:-1] <= row[1:])
    adj = chainer.utils.CooMatrix(
        data=data, row=row, col=col, shape=coo_adj.shape, order='C')
    return PaddingGraphData(
        x=reddit_data['feature'].astype(numpy.float32),
        adj=adj,
        y=reddit_data['label'].astype(numpy.int32),
        # 41 = number of label classes in the reddit dataset
        # NOTE(review): hard-coded; verify against the dataset release.
        label_num=41
    )
class CSVFileParser(DataFrameParser):
    """csv file parser

    This FileParser parses .csv file.
    It should contain column which contain SMILES as input, and
    label column which is the target to predict.

    Args:
        preprocessor (BasePreprocessor): preprocessor instance
        labels (str or list): labels column
        smiles_col (str): smiles column
        postprocess_label (Callable): post processing function if necessary
        postprocess_fn (Callable): post processing function if necessary
        logger:
    """

    def __init__(self, preprocessor, labels=None, smiles_col='smiles',
                 postprocess_label=None, postprocess_fn=None, logger=None):
        super(CSVFileParser, self).__init__(
            preprocessor, labels=labels, smiles_col=smiles_col,
            postprocess_label=postprocess_label,
            postprocess_fn=postprocess_fn, logger=logger)

    def parse(self, filepath, return_smiles=False, target_index=None,
              return_is_successful=False):
        """parse csv file using `preprocessor`

        Reads the csv into a dataframe and delegates all the work to
        `DataFrameParser.parse`. Label is extracted from `labels` columns
        and input features are extracted from smiles information in
        `smiles` column.

        Args:
            filepath (str): file path to be parsed.
            return_smiles (bool): If set to True, this function returns
                preprocessed dataset and smiles list.
                If set to False, this function returns preprocessed dataset
                and `None`.
            target_index (list or None): target index list to partially
                extract dataset. If None (default), all examples are parsed.
            return_is_successful (bool): If set to `True`, boolean list is
                returned in the key 'is_successful'. It represents
                preprocessing has succeeded or not for each SMILES.
                If set to False, `None` is returned in the key 'is_success'.

        Returns (dict): dictionary that contains Dataset, 1-d numpy array
            with dtype=object(string) which is a vector of smiles for each
            example or None.
        """
        frame = pandas.read_csv(filepath)
        parent = super(CSVFileParser, self)
        return parent.parse(frame,
                            return_smiles=return_smiles,
                            target_index=target_index,
                            return_is_successful=return_is_successful)

    def extract_total_num(self, filepath):
        """Extracts total number of data which can be parsed

        We can use this method to determine the value fed to `target_index`
        option of `parse` method. For example, if we want to extract input
        feature from 10% of whole dataset, we need to know how many samples
        are in a file. The returned value of this method may not to be same
        as the final dataset size.

        Args:
            filepath (str): file path of to check the total number.

        Returns (int): total number of dataset can be parsed.
        """
        return pandas.read_csv(filepath).shape[0]
""" df = pandas.read_csv(filepath) return super(CSVFileParser, self).parse( df, return_smiles=return_smiles, target_index=target_index, return_is_successful=return_is_successful) def extract_total_num(self, filepath): """Extracts total number of data which can be parsed We can use this method to determine the value fed to `target_index` option of `parse` method. For example, if we want to extract input feature from 10% of whole dataset, we need to know how many samples are in a file. The returned value of this method may not to be same as the final dataset size. Args: filepath (str): file path of to check the total number. Returns (int): total number of dataset can be parsed. """ df = pandas.read_csv(filepath) return len(df) ================================================ FILE: chainer_chemistry/dataset/parsers/data_frame_parser.py ================================================ from logging import getLogger import numpy from rdkit import Chem from tqdm import tqdm from chainer_chemistry.dataset.parsers.base_parser import BaseFileParser from chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError # NOQA from chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor # NOQA import traceback class DataFrameParser(BaseFileParser): """data frame parser This FileParser parses pandas dataframe. It should contain column which contain SMILES as input, and label column which is the target to predict. 
Args: preprocessor (BasePreprocessor): preprocessor instance labels (str or list or None): labels column smiles_col (str): smiles column postprocess_label (Callable): post processing function if necessary postprocess_fn (Callable): post processing function if necessary logger: """ def __init__(self, preprocessor, labels=None, smiles_col='smiles', postprocess_label=None, postprocess_fn=None, logger=None): super(DataFrameParser, self).__init__(preprocessor) if isinstance(labels, str): labels = [labels, ] self.labels = labels # type: list self.smiles_col = smiles_col self.postprocess_label = postprocess_label self.postprocess_fn = postprocess_fn self.logger = logger or getLogger(__name__) def parse(self, df, return_smiles=False, target_index=None, return_is_successful=False): """parse DataFrame using `preprocessor` Label is extracted from `labels` columns and input features are extracted from smiles information in `smiles` column. Args: df (pandas.DataFrame): dataframe to be parsed. return_smiles (bool): If set to `True`, smiles list is returned in the key 'smiles', it is a list of SMILES from which input features are successfully made. If set to `False`, `None` is returned in the key 'smiles'. target_index (list or None): target index list to partially extract dataset. If None (default), all examples are parsed. return_is_successful (bool): If set to `True`, boolean list is returned in the key 'is_successful'. It represents preprocessing has succeeded or not for each SMILES. If set to False, `None` is returned in the key 'is_success'. Returns (dict): dictionary that contains Dataset, 1-d numpy array with dtype=object(string) which is a vector of smiles for each example or None. 
""" logger = self.logger pp = self.preprocessor smiles_list = [] is_successful_list = [] # counter = 0 if isinstance(pp, MolPreprocessor): if target_index is not None: df = df.iloc[target_index] features = None smiles_index = df.columns.get_loc(self.smiles_col) if self.labels is None: labels_index = [] # dummy list else: labels_index = [df.columns.get_loc(c) for c in self.labels] total_count = df.shape[0] fail_count = 0 success_count = 0 for row in tqdm(df.itertuples(index=False), total=df.shape[0]): smiles = row[smiles_index] # TODO(Nakago): Check. # currently it assumes list labels = [row[i] for i in labels_index] try: mol = Chem.MolFromSmiles(smiles) if mol is None: fail_count += 1 if return_is_successful: is_successful_list.append(False) continue # Note that smiles expression is not unique. # we obtain canonical smiles canonical_smiles, mol = pp.prepare_smiles_and_mol(mol) input_features = pp.get_input_features(mol) # Extract label if self.postprocess_label is not None: labels = self.postprocess_label(labels) if return_smiles: smiles_list.append(canonical_smiles) except MolFeatureExtractionError as e: # NOQA # This is expected error that extracting feature failed, # skip this molecule. 
fail_count += 1 if return_is_successful: is_successful_list.append(False) continue except Exception as e: logger.warning('parse(), type: {}, {}' .format(type(e).__name__, e.args)) logger.info(traceback.format_exc()) fail_count += 1 if return_is_successful: is_successful_list.append(False) continue # Initialize features: list of list if features is None: if isinstance(input_features, tuple): num_features = len(input_features) else: num_features = 1 if self.labels is not None: num_features += 1 features = [[] for _ in range(num_features)] if isinstance(input_features, tuple): for i in range(len(input_features)): features[i].append(input_features[i]) else: features[0].append(input_features) if self.labels is not None: features[len(features) - 1].append(labels) success_count += 1 if return_is_successful: is_successful_list.append(True) ret = [] for feature in features: try: feat_array = numpy.asarray(feature) except ValueError: # Temporal work around. # See, # https://stackoverflow.com/questions/26885508/why-do-i-get-error-trying-to-cast-np-arraysome-list-valueerror-could-not-broa feat_array = numpy.empty(len(feature), dtype=numpy.ndarray) feat_array[:] = feature[:] ret.append(feat_array) result = tuple(ret) logger.info('Preprocess finished. 
FAIL {}, SUCCESS {}, TOTAL {}' .format(fail_count, success_count, total_count)) else: raise NotImplementedError smileses = numpy.array( smiles_list, dtype=object) if return_smiles else None if return_is_successful: is_successful = numpy.array(is_successful_list) else: is_successful = None if isinstance(result, tuple): if self.postprocess_fn is not None: result = self.postprocess_fn(*result) dataset = pp.create_dataset(*result) else: if self.postprocess_fn is not None: result = self.postprocess_fn(result) dataset = pp.create_dataset(*result) return {"dataset": dataset, "smiles": smileses, "is_successful": is_successful} def extract_total_num(self, df): """Extracts total number of data which can be parsed We can use this method to determine the value fed to `target_index` option of `parse` method. For example, if we want to extract input feature from 10% of whole dataset, we need to know how many samples are in a file. The returned value of this method may not to be same as the final dataset size. Args: df (pandas.DataFrame): dataframe to be parsed. Returns (int): total number of dataset can be parsed. 
""" return len(df) ================================================ FILE: chainer_chemistry/dataset/parsers/sdf_file_parser.py ================================================ from logging import getLogger import numpy from rdkit import Chem from tqdm import tqdm from chainer_chemistry.dataset.parsers.base_parser import BaseFileParser from chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError # NOQA from chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor # NOQA class SDFFileParser(BaseFileParser): """sdf file parser Args: preprocessor (BasePreprocessor): preprocessor instance labels (str or list): labels column postprocess_label (Callable): post processing function if necessary postprocess_fn (Callable): post processing function if necessary logger: """ def __init__(self, preprocessor, labels=None, postprocess_label=None, postprocess_fn=None, logger=None): super(SDFFileParser, self).__init__(preprocessor) self.labels = labels self.postprocess_label = postprocess_label self.postprocess_fn = postprocess_fn self.logger = logger or getLogger(__name__) def parse(self, filepath, return_smiles=False, target_index=None, return_is_successful=False): """parse sdf file using `preprocessor` Note that label is extracted from preprocessor's method. Args: filepath (str): file path to be parsed. return_smiles (bool): If set to True, this function returns preprocessed dataset and smiles list. If set to False, this function returns preprocessed dataset and `None`. target_index (list or None): target index list to partially extract dataset. If None (default), all examples are parsed. return_is_successful (bool): If set to `True`, boolean list is returned in the key 'is_successful'. It represents preprocessing has succeeded or not for each SMILES. If set to False, `None` is returned in the key 'is_success'. 
Returns (dict): dictionary that contains Dataset, 1-d numpy array with dtype=object(string) which is a vector of smiles for each example or None. """ logger = self.logger pp = self.preprocessor smiles_list = [] is_successful_list = [] if isinstance(pp, MolPreprocessor): mol_supplier = Chem.SDMolSupplier(filepath) if target_index is None: target_index = list(range(len(mol_supplier))) features = None total_count = len(mol_supplier) fail_count = 0 success_count = 0 for index in tqdm(target_index): # `mol_supplier` does not accept numpy.integer, we must use int mol = mol_supplier[int(index)] if mol is None: fail_count += 1 if return_is_successful: is_successful_list.append(False) continue try: # Labels need to be extracted from `mol` before standardize # smiles. if self.labels is not None: label = pp.get_label(mol, self.labels) if self.postprocess_label is not None: label = self.postprocess_label(label) # Note that smiles expression is not unique. # we obtain canonical smiles smiles = Chem.MolToSmiles(mol) mol = Chem.MolFromSmiles(smiles) canonical_smiles, mol = pp.prepare_smiles_and_mol(mol) input_features = pp.get_input_features(mol) # Initialize features: list of list if features is None: if isinstance(input_features, tuple): num_features = len(input_features) else: num_features = 1 if self.labels is not None: num_features += 1 features = [[] for _ in range(num_features)] if return_smiles: smiles_list.append(canonical_smiles) except MolFeatureExtractionError as e: # NOQA # This is expected error that extracting feature failed, # skip this molecule. 
fail_count += 1 if return_is_successful: is_successful_list.append(False) continue except Exception as e: logger.warning('parse() error, type: {}, {}' .format(type(e).__name__, e.args)) fail_count += 1 if return_is_successful: is_successful_list.append(False) continue if isinstance(input_features, tuple): for i in range(len(input_features)): features[i].append(input_features[i]) else: features[0].append(input_features) if self.labels is not None: features[len(features) - 1].append(label) success_count += 1 if return_is_successful: is_successful_list.append(True) ret = [] for feature in features: try: feat_array = numpy.asarray(feature) except ValueError: # Temporal work around to convert object-type list into # numpy array. # See, https://goo.gl/kgJXwb feat_array = numpy.empty(len(feature), dtype=numpy.ndarray) feat_array[:] = feature[:] ret.append(feat_array) result = tuple(ret) logger.info('Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}' .format(fail_count, success_count, total_count)) else: # Spec not finalized yet for general case result = pp.process(filepath) smileses = numpy.array( smiles_list, dtype=object) if return_smiles else None if return_is_successful: is_successful = numpy.array(is_successful_list) else: is_successful = None if isinstance(result, tuple): if self.postprocess_fn is not None: result = self.postprocess_fn(*result) dataset = pp.create_dataset(*result) else: if self.postprocess_fn is not None: result = self.postprocess_fn(result) dataset = pp.create_dataset(*result) return {"dataset": dataset, "smiles": smileses, "is_successful": is_successful} def extract_total_num(self, filepath): """Extracts total number of data which can be parsed We can use this method to determine the value fed to `target_index` option of `parse` method. For example, if we want to extract input feature from 10% of whole dataset, we need to know how many samples are in a file. The returned value of this method may not to be same as the final dataset size. 
class SmilesParser(DataFrameParser):
    """smiles parser

    It parses `smiles_list`, which is a list of string of smiles.

    Args:
        preprocessor (BasePreprocessor): preprocessor instance
        postprocess_label (Callable): post processing function if necessary
        postprocess_fn (Callable): post processing function if necessary
        logger:
    """

    def __init__(self, preprocessor, postprocess_label=None,
                 postprocess_fn=None, logger=None):
        super(SmilesParser, self).__init__(
            preprocessor, labels=None, smiles_col='smiles',
            postprocess_label=postprocess_label,
            postprocess_fn=postprocess_fn, logger=logger)

    def parse(self, smiles_list, return_smiles=False, target_index=None,
              return_is_successful=False):
        """parse `smiles_list` using `preprocessor`

        Wraps the list into a one-column dataframe and delegates to
        `DataFrameParser.parse`. Input features are extracted from the
        smiles information in the 'smiles' column.

        Args:
            smiles_list (list): list of strings of smiles
            return_smiles (bool): If set to True, this function returns
                preprocessed dataset and smiles list.
                If set to False, this function returns preprocessed dataset
                and `None`.
            target_index (list or None): target index list to partially
                extract dataset. If None (default), all examples are parsed.
            return_is_successful (bool): If set to `True`, boolean list is
                returned in the key 'is_successful'. It represents
                preprocessing has succeeded or not for each SMILES.
                If set to False, `None` is returned in the key 'is_success'.

        Returns (dict): dictionary that contains Dataset, 1-d numpy array
            with dtype=object(string) which is a vector of smiles for each
            example or None.
        """
        frame = pandas.DataFrame({'smiles': smiles_list})
        parent = super(SmilesParser, self)
        return parent.parse(frame,
                            return_smiles=return_smiles,
                            target_index=target_index,
                            return_is_successful=return_is_successful)

    def extract_total_num(self, smiles_list):
        """Extracts total number of data which can be parsed

        We can use this method to determine the value fed to `target_index`
        option of `parse` method. For example, if we want to extract input
        feature from 10% of whole dataset, we need to know how many samples
        are in a file. The returned value of this method may not to be same
        as the final dataset size.

        Args:
            smiles_list (list): list of strings of smiles

        Returns (int): total number of dataset can be parsed.
        """
        return len(smiles_list)
GINPreprocessor, GINSparsePreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.gnnfilm_preprocessor import GNNFiLMPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.gwm_preprocessor import GGNNGWMPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.gwm_preprocessor import GINGWMPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.gwm_preprocessor import NFPGWMPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.gwm_preprocessor import RSGCNGWMPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.megnet_preprocessor import MEGNetPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.nfp_preprocessor import NFPPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.relgat_preprocessor import RelGATPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.relgcn_preprocessor import RelGCNPreprocessor, RelGCNSparsePreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.rsgcn_preprocessor import RSGCNPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.schnet_preprocessor import SchNetPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.weavenet_preprocessor import WeaveNetPreprocessor # NOQA preprocess_method_dict = { 'ecfp': ECFPPreprocessor, 'nfp': NFPPreprocessor, 'nfp_gwm': NFPGWMPreprocessor, 'ggnn': GGNNPreprocessor, 'ggnn_gwm': GGNNGWMPreprocessor, 'gin': GINPreprocessor, 'gin_gwm': GINGWMPreprocessor, 'schnet': SchNetPreprocessor, 'weavenet': WeaveNetPreprocessor, 'relgcn': RelGCNPreprocessor, 'rsgcn': RSGCNPreprocessor, 'rsgcn_gwm': RSGCNGWMPreprocessor, 'relgat': RelGATPreprocessor, 'relgcn_sparse': RelGCNSparsePreprocessor, 'gin_sparse': GINSparsePreprocessor, 'gnnfilm': GNNFiLMPreprocessor, 'megnet': MEGNetPreprocessor, 'cgcnn': CGCNNPreprocessor } ================================================ FILE: 
class AtomicNumberPreprocessor(MolPreprocessor):
    """Atomic number Preprocessor

    Args:
        max_atoms (int): Max number of atoms for each molecule, if the
            number of atoms is more than this value, this data is simply
            ignored.
            Setting negative value indicates no limit for max atoms.
        out_size (int): It specifies the size of array returned by
            `get_input_features`.
            If the number of atoms in the molecule is less than this value,
            the returned arrays is padded to have fixed size.
            Setting negative value indicates do not pad returned array.
    """

    def __init__(self, max_atoms=-1, out_size=-1):
        super(AtomicNumberPreprocessor, self).__init__()
        # Reject inconsistent combination up-front: the padded output must
        # be able to hold the maximum allowed number of atoms.
        if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size:
            raise ValueError('max_atoms {} must be less or equal to '
                             'out_size {}'.format(max_atoms, out_size))
        self.max_atoms = max_atoms
        self.out_size = out_size

    def get_input_features(self, mol):
        """get input features

        Args:
            mol (Mol): molecule whose features are extracted

        Returns:
            numpy.ndarray: 1-d int32 array of atomic numbers, padded with
            zeros up to `out_size` when `out_size` is non-negative.
        """
        # Raises MolFeatureExtractionError when the molecule is too large.
        type_check_num_atoms(mol, self.max_atoms)
        atom_array = construct_atomic_number_array(mol, out_size=self.out_size)
        return atom_array
def get_atom_init_json_filepath(download_if_not_exist=True):
    """Construct a filepath which stores atom_init_json

    This method check whether the file exist or not, and downloaded
    it if necessary.

    Args:
        download_if_not_exist (bool): If `True` download dataset
            if it is not downloaded yet.

    Returns (str): file path for atom_init_json
    """
    cache_root = download.get_dataset_directory(_root)
    cache_path = os.path.join(cache_root, file_name_atom_init_json)
    if not os.path.exists(cache_path) and download_if_not_exist:
        logger = getLogger(__name__)
        logger.info('Downloading atom_init.json...')
        # `cached_download` fetches into chainer's shared download cache;
        # copy the file into this dataset's own cache directory so later
        # calls hit the `os.path.exists` fast path.
        download_file_path = download.cached_download(download_url)
        shutil.copy(download_file_path, cache_path)
    return cache_path
class MolFeatureExtractionError(Exception):
    """Raised when feature extraction from a molecule fails."""
    pass


# --- Type check ---
def type_check_num_atoms(mol, num_max_atoms=-1):
    """Check number of atoms in `mol` does not exceed `num_max_atoms`

    If number of atoms in `mol` exceeds the number `num_max_atoms`, it will
    raise `MolFeatureExtractionError` exception.

    Args:
        mol (Mol):
        num_max_atoms (int): If negative value is set, not check number of
            atoms.
    """
    num_atoms = mol.GetNumAtoms()
    # A negative `num_max_atoms` disables the check entirely.
    if 0 <= num_max_atoms < num_atoms:
        # Skip extracting feature. ignore this case.
        raise MolFeatureExtractionError(
            'Number of atoms in mol {} exceeds num_max_atoms {}'
            .format(num_atoms, num_max_atoms))


# --- Atom preprocessing ---
def construct_atomic_number_array(mol, out_size=-1):
    """Returns atomic numbers of atoms consisting a molecule.

    Args:
        mol (rdkit.Chem.Mol): Input molecule.
        out_size (int): The size of returned array.
            If this option is negative, it does not take any effect.
            Otherwise, it must be larger than the number of atoms
            in the input molecules. In that case, the tail of
            the array is padded with zeros.

    Returns:
        numpy.ndarray: an array consisting of atomic numbers
            of atoms in the molecule.
    """
    numbers = numpy.array([atom.GetAtomicNum() for atom in mol.GetAtoms()],
                          dtype=numpy.int32)
    n_atom = len(numbers)

    if out_size < 0:
        # No padding requested: return the raw atomic-number vector.
        return numbers
    if out_size >= n_atom:
        # 'empty' padding for atom_list; 0 represents empty place for atom.
        padded = numpy.zeros(out_size, dtype=numpy.int32)
        padded[:n_atom] = numbers
        return padded
    raise ValueError('`out_size` (={}) must be negative or '
                     'larger than or equal to the number '
                     'of atoms in the input molecules (={})'
                     '.'.format(out_size, n_atom))
def construct_adj_matrix(mol, out_size=-1, self_connection=True):
    """Returns the adjacent matrix of the given molecule.

    This function returns the adjacent matrix of the given molecule.
    Contrary to the specification of
    :func:`rdkit.Chem.rdmolops.GetAdjacencyMatrix`,
    the diagonal entries of the returned matrix are all-one.

    Args:
        mol (rdkit.Chem.Mol): Input molecule.
        out_size (int): The size of the returned matrix.
            If this option is negative, it does not take any effect. Otherwise,
            it must be larger than the number of atoms in the input molecules.
            In that case, the adjacent matrix is expanded and zeros are padded
            to right columns and bottom rows.
        self_connection (bool): Add self connection or not. If True, diagonal
            element of adjacency matrix is filled with 1.

    Returns:
        adj_array (numpy.ndarray): The adjacent matrix of the input molecule.
            It is 2-dimensional array with shape (atoms1, atoms2), where
            atoms1 & atoms2 represent from and to of the edge respectively.
            If ``out_size`` is non-negative, the returned its size is equal to
            that value. Otherwise, it is equal to the number of atoms in the
            the molecule.
    """
    adj = rdmolops.GetAdjacencyMatrix(mol)
    s0, s1 = adj.shape
    if s0 != s1:
        # BUGFIX: added the missing trailing space so the message no longer
        # reads "...input moleculehas an invalid shape...".
        raise ValueError('The adjacent matrix of the input molecule '
                         'has an invalid shape: ({}, {}). '
                         'It must be square.'.format(s0, s1))

    if self_connection:
        adj = adj + numpy.eye(s0)
    if out_size < 0:
        adj_array = adj.astype(numpy.float32)
    elif out_size >= s0:
        # Zero-pad to (out_size, out_size); the molecule occupies the
        # top-left block.
        adj_array = numpy.zeros((out_size, out_size), dtype=numpy.float32)
        adj_array[:s0, :s1] = adj
    else:
        raise ValueError(
            '`out_size` (={}) must be negative or larger than or equal to the '
            'number of atoms in the input molecules (={}).'
            .format(out_size, s0))
    return adj_array
def construct_discrete_edge_matrix(mol, out_size=-1,
                                   add_self_connection_channel=False):
    """Returns the edge-type dependent adjacency matrix of the given molecule.

    Args:
        mol (rdkit.Chem.Mol): Input molecule.
        out_size (int): The size of the returned matrix.
            If this option is negative, it does not take any effect. Otherwise,
            it must be larger than the number of atoms in the input molecules.
            In that case, the adjacent matrix is expanded and zeros are padded
            to right columns and bottom rows.
        add_self_connection_channel (bool): Add self connection or not.
            If True, adjacency matrix whose diagonal element filled with 1 is
            added to last channel.

    Returns:
        adj_array (numpy.ndarray): The adjacent matrix of the input molecule.
            It is 3-dimensional array with shape (edge_type, atoms1, atoms2),
            where edge_type represents the bond type,
            atoms1 & atoms2 represent from and to of the edge respectively.
            If ``out_size`` is non-negative, its size is equal to that value.
            Otherwise, it is equal to the number of atoms in the the molecule.
    """
    if mol is None:
        raise MolFeatureExtractionError('mol is None')
    N = mol.GetNumAtoms()

    if out_size < 0:
        size = N
    elif out_size >= N:
        size = out_size
    else:
        raise ValueError(
            'out_size {} is smaller than number of atoms in mol {}'
            .format(out_size, N))

    # 4 channels for the bond types, plus an optional 5th self-connection
    # channel.
    if add_self_connection_channel:
        adjs = numpy.zeros((5, size, size), dtype=numpy.float32)
    else:
        adjs = numpy.zeros((4, size, size), dtype=numpy.float32)

    bond_type_to_channel = {
        Chem.BondType.SINGLE: 0,
        Chem.BondType.DOUBLE: 1,
        Chem.BondType.TRIPLE: 2,
        Chem.BondType.AROMATIC: 3
    }
    for bond in mol.GetBonds():
        bond_type = bond.GetBondType()
        ch = bond_type_to_channel[bond_type]
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        # Symmetric: bonds are undirected.
        adjs[ch, i, j] = 1.0
        adjs[ch, j, i] = 1.0
    if add_self_connection_channel:
        # BUGFIX: the previous `adjs[-1] = numpy.eye(N)` raised a broadcast
        # error whenever `out_size > N`, because the channel has shape
        # (size, size). Write the identity only into the top-left (N, N)
        # sub-block so the padded region stays zero.
        adjs[-1, :N, :N] = numpy.eye(N, dtype=numpy.float32)
    return adjs
def mol_basic_info_feature(mol, atom_array, adj):
    """Supernode feature: [number of atoms, sum of adjacency entries]."""
    n_atoms = mol.GetNumAtoms()
    if n_atoms != len(atom_array):
        raise ValueError("[ERROR] n_atoms {} != len(atom_array) {}"
                         .format(n_atoms, len(atom_array)))
    # Note: this is actual number of edges * 2.
    n_edges = adj.sum()
    return numpy.asarray([n_atoms, n_edges])


def mol_atom_type_feature(mol, atom_array, adj):
    """Supernode feature: 0/1 existence flag per atomic number (1..MAX)."""
    atom_count = numpy.bincount(atom_array, minlength=MAX_ATOMIC_NUM + 1)
    # BUGFIX: `numpy.float` was deprecated in NumPy 1.20 and removed in
    # 1.24; `numpy.float64` is the dtype it aliased, so behavior is
    # unchanged.
    return (atom_count > 0).astype(numpy.float64)[1:]


def mol_atom_freq_feature(mol, atom_array, adj):
    """Supernode feature: relative frequency per atomic number (1..MAX)."""
    atom_count = numpy.bincount(atom_array, minlength=MAX_ATOMIC_NUM + 1)
    return (atom_count / len(atom_array))[1:]


def mol_bond_type_feature(mol, atom_array, adj):
    """Supernode feature: per-channel 0/1 flag whether any bond exists."""
    # Promote a 2-way adjacency matrix to a single-channel 3-way array so
    # both forms are handled uniformly.
    if adj.ndim == 2:
        adj = numpy.expand_dims(adj, axis=0)
    adj = adj.reshape((adj.shape[0], -1))
    return adj.max(axis=1)


def mol_bond_freq_feature(mol, atom_array, adj):
    """Supernode feature: per-channel fraction of all adjacency weight."""
    if adj.ndim == 2:
        adj = numpy.expand_dims(adj, axis=0)
    adj = adj.reshape((adj.shape[0], -1))
    adj_sum = adj.sum()
    if adj_sum == 0:
        # Avoid division by zero for an edgeless graph; all channels are 0.
        return adj.sum(axis=1)
    else:
        return adj.sum(axis=1) / adj_sum


def construct_supernode_feature(mol, atom_array, adj, feature_functions=None):
    """Construct an input feature x' for a supernode

    `out_size` is automatically inferred by `atom_array` and `adj`

    Args:
        mol (rdkit.Chem.Mol): Input molecule
        atom_array (numpy.ndarray): array of atoms
        adj (numpy.ndarray): N by N 2-way array, or |E| by N by N 3-way
            array where |E| is the number of edgetypes.
        feature_functions (None or list): list of callable

    Returns:
        super_node_x (numpy.ndarray): 1-way array, the supernode feature.
        len(super_node_x) will be 2 + 2 + MAX_ATOMIC_NUM*2 for 2-way adjs,
        2 + 4*2 + MAX_ATOMIC_NUM*2 for 3-way adjs
    """
    if feature_functions is None:
        feature_functions = [
            mol_basic_info_feature, mol_bond_type_feature,
            mol_bond_freq_feature, mol_atom_type_feature,
            mol_atom_freq_feature]

    super_node_x = numpy.concatenate(
        [func(mol, atom_array, adj) for func in feature_functions])
    super_node_x = super_node_x.astype(numpy.float32)
    return super_node_x
len(super_node_x) will be 2 + 2 + MAX_ATOMIC_NUM*2 for 2-way adjs, 2 + 4*2 + MAX_ATOMIC_NUM*2 for 3-way adjs """ if feature_functions is None: feature_functions = [ mol_basic_info_feature, mol_bond_type_feature, mol_bond_freq_feature, mol_atom_type_feature, mol_atom_freq_feature] super_node_x = numpy.concatenate( [func(mol, atom_array, adj) for func in feature_functions]) super_node_x = super_node_x.astype(numpy.float32) return super_node_x ================================================ FILE: chainer_chemistry/dataset/preprocessors/ecfp_preprocessor.py ================================================ from logging import getLogger import numpy from rdkit.Chem import rdMolDescriptors from chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError # NOQA from chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor # NOQA class ECFPPreprocessor(MolPreprocessor): def __init__(self, radius=2): super(ECFPPreprocessor, self).__init__() self.radius = radius def get_input_features(self, mol): try: fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, self.radius) except Exception as e: logger = getLogger(__name__) logger.debug('exception caught at ECFPPreprocessor:', e) # Extracting feature failed raise MolFeatureExtractionError # TODO(Nakago): Test it. 
return numpy.asarray(fp, numpy.float32) ================================================ FILE: chainer_chemistry/dataset/preprocessors/ggnn_preprocessor.py ================================================ import numpy from chainer_chemistry.dataset.graph_dataset.base_graph_data import SparseGraphData # NOQA from chainer_chemistry.dataset.graph_dataset.base_graph_dataset import SparseGraphDataset # NOQA from chainer_chemistry.dataset.preprocessors.common \ import construct_atomic_number_array, construct_discrete_edge_matrix # NOQA from chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms # NOQA from chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor # NOQA class GGNNPreprocessor(MolPreprocessor): """GGNN Preprocessor Args: max_atoms (int): Max number of atoms for each molecule, if the number of atoms is more than this value, this data is simply ignored. Setting negative value indicates no limit for max atoms. out_size (int): It specifies the size of array returned by `get_input_features`. If the number of atoms in the molecule is less than this value, the returned arrays is padded to have fixed size. Setting negative value indicates do not pad returned array. add_Hs (bool): If True, implicit Hs are added. kekulize (bool): If True, Kekulizes the molecule. 
""" def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False, kekulize=False): super(GGNNPreprocessor, self).__init__( add_Hs=add_Hs, kekulize=kekulize) if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size: raise ValueError('max_atoms {} must be less or equal to ' 'out_size {}'.format(max_atoms, out_size)) self.max_atoms = max_atoms self.out_size = out_size def get_input_features(self, mol): """get input features Args: mol (Mol): Molecule input Returns: """ type_check_num_atoms(mol, self.max_atoms) atom_array = construct_atomic_number_array(mol, out_size=self.out_size) adj_array = construct_discrete_edge_matrix(mol, out_size=self.out_size) return atom_array, adj_array class GGNNSparsePreprocessor(GGNNPreprocessor): """Sparse GGNN Preprocessor""" def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False, kekulize=False): super(GGNNSparsePreprocessor, self).__init__( max_atoms=max_atoms, out_size=out_size, add_Hs=add_Hs, kekulize=kekulize) def construct_sparse_data(self, x, adj, y): """Construct `SparseGraphData` from `x`, `adj`, `y` Args: x (numpy.ndarray): input feature adj (numpy.ndarray): adjacency matrix y (numpy.ndarray): output label Returns: SparseGraphData: graph data object for sparse pattern """ edge_index = [[], []] edge_attr = [] label_num, n, _ = adj.shape for label in range(label_num): for i in range(n): for j in range(n): if adj[label, i, j] != 0.: edge_index[0].append(i) edge_index[1].append(i) edge_attr.append(label) return SparseGraphData( x=x, edge_index=numpy.array(edge_index, dtype=numpy.int), edge_attr=numpy.array(edge_attr, dtype=numpy.int), y=y ) def create_dataset(self, *args, **kwargs): """Create `SparseGraphData` from list of `(x, adj, y)` Returns: SparseGraphDataset: graph dataset object for sparse pattern """ # args: (atom_array, adj_array, label_array) data_list = [ self.construct_sparse_data(x, adj, y) for (x, adj, y) in zip(*args) ] return SparseGraphDataset(data_list) ================================================ FILE: 
chainer_chemistry/dataset/preprocessors/gin_preprocessor.py ================================================ import numpy from chainer_chemistry.dataset.graph_dataset.base_graph_data import SparseGraphData # NOQA from chainer_chemistry.dataset.graph_dataset.base_graph_dataset import SparseGraphDataset # NOQA from chainer_chemistry.dataset.preprocessors.common \ import construct_atomic_number_array, construct_adj_matrix # NOQA from chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms from chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor # NOQA class GINPreprocessor(MolPreprocessor): """GIN Preprocessor """ def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False): """initialize the GIN Preprocessor. Args: max_atoms (int): Max number of atoms for each molecule, if the number of atoms is more than this value, this data is simply ignored. Setting negative value indicates no limit for max atoms. out_size (int): It specifies the size of array returned by `get_input_features`. If the number of atoms in the molecule is less than this value, the returned arrays is padded to have fixed size. Setting negative value indicates do not pad returned array. add_Hs (bool): If true, add Hydrogens explicitly. 
""" super(GINPreprocessor, self).__init__(add_Hs=add_Hs) if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size: raise ValueError('max_atoms {} must be less or equal to ' 'out_size {}'.format(max_atoms, out_size)) self.max_atoms = max_atoms self.out_size = out_size def get_input_features(self, mol): """get input features Args: mol (Mol): Returns: """ type_check_num_atoms(mol, self.max_atoms) atom_array = construct_atomic_number_array(mol, out_size=self.out_size) adj_array = construct_adj_matrix(mol, out_size=self.out_size) return atom_array, adj_array class GINSparsePreprocessor(MolPreprocessor): """Sparse GIN Preprocessor""" def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False): super(GINSparsePreprocessor, self).__init__(add_Hs=add_Hs) if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size: raise ValueError('max_atoms {} must be less or equal to ' 'out_size {}'.format(max_atoms, out_size)) self.max_atoms = max_atoms self.out_size = out_size def get_input_features(self, mol): type_check_num_atoms(mol, self.max_atoms) atom_array = construct_atomic_number_array(mol, out_size=self.out_size) adj_array = construct_adj_matrix(mol, out_size=self.out_size) return atom_array, adj_array def construct_sparse_data(self, x, adj, y): """Construct `SparseGraphData` from `x`, `adj`, `y` Args: x (numpy.ndarray): input feature adj (numpy.ndarray): adjacency matrix y (numpy.ndarray): output label Returns: SparseGraphData: graph data object for sparse pattern """ edge_index = [[], []] n, _ = adj.shape for i in range(n): for j in range(n): if adj[i, j] != 0.: edge_index[0].append(i) edge_index[1].append(j) return SparseGraphData( x=x, edge_index=numpy.array(edge_index, dtype=numpy.int), y=y ) def create_dataset(self, *args, **kwargs): """Create `SparseGraphData` from list of `(x, adj, y)` Returns: SparseGraphDataset: graph dataset object for sparse pattern """ # args: (atom_array, adj_array, label_array) data_list = [ self.construct_sparse_data(x, adj, y) for (x, 
adj, y) in zip(*args) ] return SparseGraphDataset(data_list) ================================================ FILE: chainer_chemistry/dataset/preprocessors/gnnfilm_preprocessor.py ================================================ from chainer_chemistry.dataset.preprocessors.common \ import construct_atomic_number_array, construct_discrete_edge_matrix # NOQA from chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms # NOQA from chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor # NOQA class GNNFiLMPreprocessor(MolPreprocessor): """GNNFiLM Preprocessor Args: max_atoms (int): Max number of atoms for each molecule, if the number of atoms is more than this value, this data is simply ignored. Setting negative value indicates no limit for max atoms. out_size (int): It specifies the size of array returned by `get_input_features`. If the number of atoms in the molecule is less than this value, the returned arrays is padded to have fixed size. Setting negative value indicates do not pad returned array. add_Hs (bool): If True, implicit Hs are added. kekulize (bool): If True, Kekulizes the molecule. 
""" def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False, kekulize=False): super(GNNFiLMPreprocessor, self).__init__( add_Hs=add_Hs, kekulize=kekulize) if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size: raise ValueError('max_atoms {} must be less or equal to ' 'out_size {}'.format(max_atoms, out_size)) self.max_atoms = max_atoms self.out_size = out_size def get_input_features(self, mol): """get input features Args: mol (Mol): Molecule input Returns: """ type_check_num_atoms(mol, self.max_atoms) atom_array = construct_atomic_number_array(mol, out_size=self.out_size) adj_array = construct_discrete_edge_matrix( mol, out_size=self.out_size, add_self_connection_channel=True) return atom_array, adj_array ================================================ FILE: chainer_chemistry/dataset/preprocessors/gwm_preprocessor.py ================================================ from chainer_chemistry.dataset.preprocessors.common import construct_supernode_feature # NOQA from chainer_chemistry.dataset.preprocessors.ggnn_preprocessor import GGNNPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.gin_preprocessor import GINPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.nfp_preprocessor import NFPPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.rsgcn_preprocessor import RSGCNPreprocessor # NOQA class NFPGWMPreprocessor(NFPPreprocessor): def get_input_features(self, mol): atom_array, adj_array = super( NFPGWMPreprocessor, self).get_input_features(mol) super_node_x = construct_supernode_feature( mol, atom_array, adj_array) return atom_array, adj_array, super_node_x class GGNNGWMPreprocessor(GGNNPreprocessor): def get_input_features(self, mol): atom_array, adj_array = super( GGNNGWMPreprocessor, self).get_input_features(mol) super_node_x = construct_supernode_feature( mol, atom_array, adj_array) return atom_array, adj_array, super_node_x class GINGWMPreprocessor(GINPreprocessor): def get_input_features(self, mol): atom_array, 
adj_array = super( GINGWMPreprocessor, self).get_input_features(mol) super_node_x = construct_supernode_feature( mol, atom_array, adj_array) return atom_array, adj_array, super_node_x class RSGCNGWMPreprocessor(RSGCNPreprocessor): def get_input_features(self, mol): atom_array, adj_array = super( RSGCNGWMPreprocessor, self).get_input_features(mol) super_node_x = construct_supernode_feature( mol, atom_array, adj_array) return atom_array, adj_array, super_node_x ================================================ FILE: chainer_chemistry/dataset/preprocessors/megnet_preprocessor.py ================================================ from logging import getLogger import os import traceback import numpy from rdkit import Chem, RDConfig # NOQA from rdkit.Chem import AllChem, ChemicalFeatures, Descriptors, rdmolops # NOQA from chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError # NOQA from chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms # NOQA from chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor # NOQA from chainer_chemistry.dataset.utils import GaussianDistance MAX_ATOM_ELEMENT = 94 ATOM = ['H', 'C', 'N', 'O', 'F'] # create singleton class class ChemicalFeaturesFactory(object): _instance = None @classmethod def get_instance(self): if not self._instance: fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef') self._instance = ChemicalFeatures.BuildFeatureFactory(fdefName) return self._instance # --- atom feature extraction --- def construct_atom_type_vec(mol, num_max_atoms, atom_list=None, include_unknown_atom=False): atom_list = atom_list or ATOM if include_unknown_atom: # all atom not in `atom_list` as considered as "unknown atom" # and its index is `len(atom_list)` n_atom_type = len(atom_list) + 1 else: n_atom_type = len(atom_list) atom_type_vec = numpy.zeros((num_max_atoms, n_atom_type), dtype=numpy.float32) for i in range(num_max_atoms): a = mol.GetAtomWithIdx(i) try: atom_idx = 
atom_list.index(a.GetSymbol()) except ValueError as e: if include_unknown_atom: atom_idx = len(atom_list) else: raise MolFeatureExtractionError(e) atom_type_vec[i, atom_idx] = 1.0 return atom_type_vec def construct_atom_chirality_vec(mol, num_max_atoms): chirality_vec = numpy.zeros((num_max_atoms, 2), dtype=numpy.float32) # chiral_cc: (atom_index, chirality) : (1, 'S') chiral_cc = Chem.FindMolChiralCenters(mol) for chiral_dict in chiral_cc: if chiral_dict[1] == 'R': chirality_vec[chiral_dict[0]] = [1, 0] if chiral_dict[1] == 'S': chirality_vec[chiral_dict[0]] = [0, 1] return chirality_vec def construct_atom_ring_vec(mol, num_max_atoms): sssr = Chem.GetSymmSSSR(mol) ring_feature = numpy.zeros((num_max_atoms, 6,), dtype=numpy.float32) for ring in sssr: ring = list(ring) for i in range(num_max_atoms): if i in ring: ring_size = len(ring) if ring_size >= 3 and ring_size <= 8: ring_feature[i, ring_size - 3] = 1.0 return ring_feature def construct_hybridization_vec(mol, num_max_atoms): hybridization_vec = numpy.zeros((num_max_atoms, 3), dtype=numpy.float32) for i in range(num_max_atoms): a = mol.GetAtomWithIdx(i) hybridization_type = a.GetHybridization() if hybridization_type is None: continue hybridization_type = str(hybridization_type) if hybridization_type == 'SP1': hybridization_vec[i, 0] = 1.0 elif hybridization_type == 'SP2': hybridization_vec[i, 1] = 1.0 elif hybridization_type == 'SP3': hybridization_vec[i, 2] = 1.0 return hybridization_vec def construct_hydrogen_bonding(mol, num_max_atoms): factory = ChemicalFeaturesFactory.get_instance() feats = factory.GetFeaturesForMol(mol) hydrogen_bonding_vec = numpy.zeros((num_max_atoms, 2), dtype=numpy.float32) for f in feats: atom_type = f.GetFamily() if atom_type == 'Donor': idx = f.GetAtomIds()[0] hydrogen_bonding_vec[idx, 0] = 1.0 if atom_type == 'Acceptor': idx = f.GetAtomIds()[0] hydrogen_bonding_vec[idx, 1] = 1.0 return hydrogen_bonding_vec def construct_aromaticity_vec(mol, num_max_atoms): aromaticity_vec = 
numpy.zeros((num_max_atoms, 1), dtype=numpy.float32) aromatix_atoms = mol.GetAromaticAtoms() for a in aromatix_atoms: aromaticity_vec[a.GetIdx()] = 1.0 return aromaticity_vec def construct_atom_feature(mol, use_all_feature, atom_list=None, include_unknown_atom=False): """construct atom feature Args: mol (Mol): mol instance use_all_feature (bool): If True, all atom features are extracted. If False, a part of atom features is extracted. You can confirm the detail in the paper. atom_list (list): list of atoms to extract feature. If None, default `ATOM` is used as `atom_list` include_unknown_atom (bool): If False, when the `mol` includes atom which is not in `atom_list`, it will raise `MolFeatureExtractionError`. If True, even the atom is not in `atom_list`, `atom_type` is set as "unknown" atom. Returns: atom_feature (numpy.ndarray): The shape is (num_nodes, num_node_features). """ num_max_atoms = mol.GetNumAtoms() atom_type_vec = construct_atom_type_vec( mol, num_max_atoms, atom_list=atom_list, include_unknown_atom=include_unknown_atom) atom_chirality_vec = construct_atom_chirality_vec( mol, num_max_atoms=num_max_atoms) atom_ring_vec = construct_atom_ring_vec( mol, num_max_atoms=num_max_atoms) hybridization_vec = construct_hybridization_vec( mol, num_max_atoms=num_max_atoms) hydrogen_bonding = construct_hydrogen_bonding( mol, num_max_atoms=num_max_atoms) aromaticity_vec = construct_aromaticity_vec( mol, num_max_atoms=num_max_atoms) if use_all_feature: feature = numpy.hstack((atom_type_vec, atom_chirality_vec, atom_ring_vec, hybridization_vec, hydrogen_bonding, aromaticity_vec)) else: feature = construct_atom_type_vec( mol, num_max_atoms, atom_list=atom_list, include_unknown_atom=include_unknown_atom) return feature # --- pair feature extraction --- def construct_bond_vec(mol, i, j): bond_feature_vec = numpy.zeros((4, ), dtype=numpy.float32) k = mol.GetBondBetweenAtoms(i, j) if k is not None: bond_type = str(k.GetBondType()) if bond_type == 'SINGLE': 
bond_feature_vec[0] = 1.0 elif bond_type == 'DOUBLE': bond_feature_vec[1] = 1.0 elif bond_type == 'TRIPLE': bond_feature_vec[2] = 1.0 elif bond_type == 'AROMATIC': bond_feature_vec[3] = 1.0 else: raise ValueError("Unknown bond type {}".format(bond_type)) return bond_feature_vec def get_is_in_ring(mol): """create a cache about whether the atom is in a ring or not Args: mol (Mol): mol instance Returns is_in_ring (dict): key is the atom idx, value is the set() """ sssr = Chem.GetSymmSSSR(mol) is_in_ring = {} ring_idx = 0 for ring in sssr: ring = list(ring) for i in ring: if i not in is_in_ring: is_in_ring[i] = set() is_in_ring[i].add(ring_idx) ring_idx += 1 return is_in_ring def construct_ring_feature_vec(is_in_ring, i, j): ring_feature_vec = numpy.zeros((1, ), dtype=numpy.float32) if i in is_in_ring and j in is_in_ring and is_in_ring[i] & is_in_ring[j]: ring_feature_vec[0] = 1.0 return ring_feature_vec def construct_expanded_distance_vec(distance_matrix_3d, converter, i, j): # calculate the bond length distance = distance_matrix_3d[i, j] # convert from the bond length to vector expanded_distance_vec = converter.expand(distance) return expanded_distance_vec def construct_pair_feature(mol, use_all_feature): """construct pair feature Args: mol (Mol): mol instance use_all_feature (bool): If True, all pair features are extracted. If False, a part of pair features is extracted. You can confirm the detail in the paper. Returns: features (numpy.ndarray): The shape is (num_edges, num_edge_features) bond_idx (numpy.ndarray): The shape is (2, num_edges) bond_idx[0] represents the list of StartNodeIdx and bond_idx[1] represents the list of EndNodeIdx. """ converter = GaussianDistance() # prepare the data for extracting the pair feature bonds = mol.GetBonds() # (n_nodes, n_nodes): distance in terms of the graph bond. 
graph_distance_matrix = Chem.GetDistanceMatrix(mol) is_in_ring = get_is_in_ring(mol) confid = AllChem.EmbedMolecule(mol) try: distance_matrix_3d = rdmolops.Get3DDistanceMatrix( mol, confId=confid) except ValueError as e: logger = getLogger(__name__) logger.info('construct_distance_matrix failed, type: {}, {}' .format(type(e).__name__, e.args)) logger.debug(traceback.format_exc()) raise MolFeatureExtractionError feature = [] bond_idx = [] for bond in bonds: start_node = bond.GetBeginAtomIdx() end_node = bond.GetEndAtomIdx() # create pair feature distance_feature = numpy.array( graph_distance_matrix[start_node][end_node], dtype=numpy.float32) bond_feature = construct_bond_vec(mol, start_node, end_node) ring_feature = construct_ring_feature_vec( is_in_ring, start_node, end_node) bond_idx.append((start_node, end_node)) if use_all_feature: expanded_distance_feature = \ construct_expanded_distance_vec( distance_matrix_3d, converter, start_node, end_node) feature.append(numpy.hstack((bond_feature, ring_feature, distance_feature, expanded_distance_feature))) else: expanded_distance_feature = \ construct_expanded_distance_vec( distance_matrix_3d, converter, start_node, end_node) feature.append(expanded_distance_feature) bond_idx = numpy.array(bond_idx).T feature = numpy.array(feature) return feature, bond_idx def construct_global_state_feature(mol): """construct global state feature Args: mol (Mol): mol instance Returns: feature (numpy.ndarray): 1 dimensional array """ n_atom = mol.GetNumAtoms() ave_mol_wt = Descriptors.MolWt(mol) / n_atom ave_num_of_bonds = len(mol.GetBonds()) / n_atom feature = numpy.array([ave_mol_wt, ave_num_of_bonds], dtype=numpy.float32) return feature class MEGNetPreprocessor(MolPreprocessor): """MEGNetPreprocessor Args: For Molecule max_atoms (int): Max number of atoms for each molecule, if the number of atoms is more than this value, this data is simply ignored. Setting negative value indicates no limit for max atoms. 
add_Hs (bool): If True, implicit Hs are added. use_all_feature (bool): If True, all atom and pair features is extracted. If it is False, a part of atom and pair features is extracted. You can confirm the detail in the paper. atom_list (list): list of atoms to extract feature. If None, default `ATOM` is used as `atom_list` include_unknown_atom (bool): If False, when the `mol` includes atom which is not in `atom_list`, it will raise `MolFeatureExtractionError`. If True, even the atom is not in `atom_list`, `atom_type` is set as "unknown" atom. kekulize (bool): If True, Kekulizes the molecule. """ def __init__(self, max_atoms=-1, add_Hs=True, use_all_feature=False, atom_list=None, include_unknown_atom=False, kekulize=False, max_num_nbr=12, max_radius=8, expand_dim=100): super(MEGNetPreprocessor, self).__init__( add_Hs=add_Hs, kekulize=kekulize) self.max_atoms = max_atoms self.add_Hs = add_Hs self.use_all_feature = use_all_feature self.atom_list = atom_list self.include_unknown_atom = include_unknown_atom self.max_num_nbr = max_num_nbr self.max_radius = max_radius self.expand_dim = expand_dim self.gdf = GaussianDistance(centers=numpy.linspace(0, 5, expand_dim)) def get_input_features(self, mol): """get input features from mol object Args: mol (Mol): """ type_check_num_atoms(mol, self.max_atoms) atom_feature = construct_atom_feature(mol, self.use_all_feature, self.atom_list, self.include_unknown_atom) pair_feature, bond_idx = construct_pair_feature(mol, self.use_all_feature) global_feature = construct_global_state_feature(mol) return atom_feature, pair_feature, global_feature, bond_idx ================================================ FILE: chainer_chemistry/dataset/preprocessors/mol_preprocessor.py ================================================ from rdkit import Chem from chainer_chemistry.dataset.preprocessors.base_preprocessor import BasePreprocessor # NOQA from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset # NOQA class 
MolPreprocessor(BasePreprocessor): """preprocessor class specified for rdkit mol instance Args: add_Hs (bool): If True, implicit Hs are added. kekulize (bool): If True, Kekulizes the molecule. """ def __init__(self, add_Hs=False, kekulize=False): super(MolPreprocessor, self).__init__() self.add_Hs = add_Hs self.kekulize = kekulize def prepare_smiles_and_mol(self, mol): """Prepare `smiles` and `mol` used in following preprocessing. This method is called before `get_input_features` is called, by parser class. This method may be overriden to support custom `smile`/`mol` extraction Args: mol (mol): mol instance Returns (tuple): (`smiles`, `mol`) """ # Note that smiles expression is not unique. # we obtain canonical smiles which is unique in `mol` canonical_smiles = Chem.MolToSmiles(mol, isomericSmiles=False, canonical=True) mol = Chem.MolFromSmiles(canonical_smiles) if self.add_Hs: mol = Chem.AddHs(mol) if self.kekulize: Chem.Kekulize(mol) return canonical_smiles, mol def get_label(self, mol, label_names=None): """Extracts label information from a molecule. This method extracts properties whose keys are specified by ``label_names`` from a molecule ``mol`` and returns these values as a list. The order of the values is same as that of ``label_names``. If the molecule does not have a property with some label, this function fills the corresponding index of the returned list with ``None``. Args: mol (rdkit.Chem.Mol): molecule whose features to be extracted label_names (None or iterable): list of label names. Returns: list of str: label information. Its length is equal to that of ``label_names``. If ``label_names`` is ``None``, this function returns an empty list. """ if label_names is None: return [] label_list = [] for label_name in label_names: if mol.HasProp(label_name): label_list.append(mol.GetProp(label_name)) else: label_list.append(None) # TODO(Nakago): Review implementation # Label -1 work in case of classification. 
# However in regression, assign -1 is not a good strategy... # label_list.append(-1) # Failed to GetProp for label, skip this case. # print('no label') # raise MolFeatureExtractionError return label_list def get_input_features(self, mol): """get molecule's feature representation, descriptor. Each subclass must override this method. Args: mol (Mol): molecule whose feature to be extracted. `mol` is prepared by the method `prepare_smiles_and_mol`. """ raise NotImplementedError def create_dataset(self, *args, **kwargs): return NumpyTupleDataset(*args) def process(self, filepath): # Not used now... pass ================================================ FILE: chainer_chemistry/dataset/preprocessors/nfp_preprocessor.py ================================================ from chainer_chemistry.dataset.preprocessors.common import construct_adj_matrix from chainer_chemistry.dataset.preprocessors.common \ import construct_atomic_number_array from chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms from chainer_chemistry.dataset.preprocessors.mol_preprocessor \ import MolPreprocessor class NFPPreprocessor(MolPreprocessor): """NFP Preprocessor Args: max_atoms (int): Max number of atoms for each molecule, if the number of atoms is more than this value, this data is simply ignored. Setting negative value indicates no limit for max atoms. out_size (int): It specifies the size of array returned by `get_input_features`. If the number of atoms in the molecule is less than this value, the returned arrays is padded to have fixed size. Setting negative value indicates do not pad returned array. add_Hs (bool): If True, implicit Hs are added. kekulize (bool): If True, Kekulizes the molecule. 
""" def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False, kekulize=False): super(NFPPreprocessor, self).__init__( add_Hs=add_Hs, kekulize=kekulize) if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size: raise ValueError('max_atoms {} must be less or equal to ' 'out_size {}'.format(max_atoms, out_size)) self.max_atoms = max_atoms self.out_size = out_size def get_input_features(self, mol): """get input features Args: mol (Mol): Returns: """ type_check_num_atoms(mol, self.max_atoms) atom_array = construct_atomic_number_array(mol, out_size=self.out_size) adj_array = construct_adj_matrix(mol, out_size=self.out_size) return atom_array, adj_array ================================================ FILE: chainer_chemistry/dataset/preprocessors/relgat_preprocessor.py ================================================ from chainer_chemistry.dataset.preprocessors.common import construct_atomic_number_array # NOQA from chainer_chemistry.dataset.preprocessors.common import construct_discrete_edge_matrix # NOQA from chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError # NOQA from chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms from chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor # NOQA class RelGATPreprocessor(MolPreprocessor): """RelGAT Preprocessor Args: max_atoms (int): Max number of atoms for each molecule, if the number of atoms is more than this value, this data is simply ignored. Setting negative value indicates no limit for max atoms. out_size (int): It specifies the size of array returned by `get_input_features`. If the number of atoms in the molecule is less than this value, the returned arrays is padded to have fixed size. Setting negative value indicates do not pad returned array. 
""" def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False): super(RelGATPreprocessor, self).__init__(add_Hs=add_Hs) if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size: raise ValueError('max_atoms {} must be less or equal to ' 'out_size {}'.format(max_atoms, out_size)) self.max_atoms = max_atoms self.out_size = out_size def get_input_features(self, mol): """get input features Args: mol (Mol): Returns: """ type_check_num_atoms(mol, self.max_atoms) atom_array = construct_atomic_number_array(mol, out_size=self.out_size) adj_array = construct_discrete_edge_matrix(mol, out_size=self.out_size) return atom_array, adj_array ================================================ FILE: chainer_chemistry/dataset/preprocessors/relgcn_preprocessor.py ================================================ from chainer_chemistry.dataset.preprocessors.ggnn_preprocessor import GGNNPreprocessor, GGNNSparsePreprocessor # NOQA class RelGCNPreprocessor(GGNNPreprocessor): """RelGCN Preprocessor Args: max_atoms (int): Max number of atoms for each molecule, if the number of atoms is more than this value, this data is simply ignored. Setting negative value indicates no limit for max atoms. out_size (int): It specifies the size of array returned by `get_input_features`. If the number of atoms in the molecule is less than this value, the returned arrays is padded to have fixed size. Setting negative value indicates do not pad returned array. add_Hs (bool): If True, implicit Hs are added. kekulize (bool): If True, Kekulizes the molecule. 
""" def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False, kekulize=False): super(RelGCNPreprocessor, self).__init__( max_atoms=max_atoms, out_size=out_size, add_Hs=add_Hs, kekulize=kekulize) def get_input_features(self, mol): """get input features Args: mol (Mol): Returns: """ return super(RelGCNPreprocessor, self).get_input_features(mol) class RelGCNSparsePreprocessor(GGNNSparsePreprocessor): pass ================================================ FILE: chainer_chemistry/dataset/preprocessors/rsgcn_preprocessor.py ================================================ from chainer_chemistry.dataset.preprocessors.common import construct_adj_matrix # NOQA from chainer_chemistry.dataset.preprocessors.common import construct_atomic_number_array # NOQA from chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms # NOQA from chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor # NOQA import numpy class RSGCNPreprocessor(MolPreprocessor): """RSGCN Preprocessor Args: max_atoms (int): Max number of atoms for each molecule, if the number of atoms is more than this value, this data is simply ignored. Setting negative value indicates no limit for max atoms. out_size (int): It specifies the size of array returned by `get_input_features`. If the number of atoms in the molecule is less than this value, the returned arrays is padded to have fixed size. Setting negative value indicates do not pad returned array. add_Hs (bool): If True, implicit Hs are added. kekulize (bool): If True, Kekulizes the molecule. 
""" def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False, kekulize=False): super(RSGCNPreprocessor, self).__init__( add_Hs=add_Hs, kekulize=kekulize) if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size: raise ValueError('max_atoms {} must be less or equal to ' 'out_size {}'.format(max_atoms, out_size)) self.max_atoms = max_atoms self.out_size = out_size def get_input_features(self, mol): """get input features Args: mol (Mol): Returns: """ type_check_num_atoms(mol, self.max_atoms) num_atoms = mol.GetNumAtoms() # Construct the atom array and adjacency matrix. atom_array = construct_atomic_number_array(mol, out_size=self.out_size) adj_array = construct_adj_matrix(mol, out_size=self.out_size) # Adjust the adjacency matrix. degree_vec = numpy.sum(adj_array[:num_atoms], axis=1) degree_sqrt_inv = 1. / numpy.sqrt(degree_vec) adj_array[:num_atoms, :num_atoms] *= numpy.broadcast_to( degree_sqrt_inv[:, None], (num_atoms, num_atoms)) adj_array[:num_atoms, :num_atoms] *= numpy.broadcast_to( degree_sqrt_inv[None, :], (num_atoms, num_atoms)) return atom_array, adj_array ================================================ FILE: chainer_chemistry/dataset/preprocessors/schnet_preprocessor.py ================================================ from logging import getLogger import traceback import numpy from rdkit.Chem import AllChem from rdkit.Chem import rdmolops from chainer_chemistry.dataset.preprocessors.common \ import construct_atomic_number_array from chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError # NOQA from chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms from chainer_chemistry.dataset.preprocessors.mol_preprocessor \ import MolPreprocessor def construct_distance_matrix(mol, out_size=-1, contain_Hs=False): """Construct distance matrix Args: mol (Chem.Mol): out_size (int): contain_Hs (bool): Returns (numpy.ndarray): 2 dimensional array which represents distance between atoms """ if mol is None: raise 
            MolFeatureExtractionError('mol is None')
    N = mol.GetNumAtoms()

    if out_size < 0:
        size = N
    elif out_size >= N:
        size = out_size
    else:
        raise MolFeatureExtractionError('out_size {} is smaller than number '
                                        'of atoms in mol {}'
                                        .format(out_size, N))

    # 3D embedding needs explicit hydrogens; add them temporarily when the
    # input molecule does not already contain them.
    if contain_Hs:
        mol2 = mol
    else:
        mol2 = AllChem.AddHs(mol)
    conf_id = AllChem.EmbedMolecule(mol2)
    if not contain_Hs:
        mol2 = AllChem.RemoveHs(mol2)

    try:
        dist_matrix = rdmolops.Get3DDistanceMatrix(mol2, confId=conf_id)
    except ValueError as e:
        # Embedding can fail (e.g. no conformer could be generated); report
        # it as a feature-extraction error so callers can skip the molecule.
        logger = getLogger(__name__)
        logger.info('construct_distance_matrix failed, type: {}, {}'
                    .format(type(e).__name__, e.args))
        logger.debug(traceback.format_exc())
        raise MolFeatureExtractionError

    # Zero-pad to the requested output size when necessary.
    if size > N:
        dists = numpy.zeros((size, size), dtype=numpy.float32)
        a0, a1 = dist_matrix.shape
        dists[:a0, :a1] = dist_matrix
    else:
        dists = dist_matrix
    return dists.astype(numpy.float32)


class SchNetPreprocessor(MolPreprocessor):
    """SchNet Preprocessor

    Args:
        max_atoms (int): Max number of atoms for each molecule, if the
            number of atoms is more than this value, this data is simply
            ignored.
            Setting negative value indicates no limit for max atoms.
        out_size (int): It specifies the size of array returned by
            `get_input_features`.
            If the number of atoms in the molecule is less than this value,
            the returned arrays is padded to have fixed size.
            Setting negative value indicates do not pad returned array.
        add_Hs (bool): If True, implicit Hs are added.
        kekulize (bool): If True, Kekulizes the molecule.

    """

    def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False,
                 kekulize=False):
        super(SchNetPreprocessor, self).__init__(
            add_Hs=add_Hs, kekulize=kekulize)
        # A padded output must be at least as large as the atom limit.
        if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size:
            raise ValueError('max_atoms {} must be less or equal to '
                             'out_size {}'.format(max_atoms, out_size))
        self.max_atoms = max_atoms
        self.out_size = out_size

    def get_input_features(self, mol):
        """get input features

        Args:
            mol (Mol): molecule to featurize

        Returns:
            atom_array and the 3D interatomic distance matrix

        """
        type_check_num_atoms(mol, self.max_atoms)
        atom_array = construct_atomic_number_array(mol, out_size=self.out_size)
        dist_array = construct_distance_matrix(mol, out_size=self.out_size,
                                               contain_Hs=self.add_Hs)
        return atom_array, dist_array


================================================
FILE: chainer_chemistry/dataset/preprocessors/weavenet_preprocessor.py
================================================
import os

import numpy
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import ChemicalFeatures
from rdkit import RDConfig

from chainer_chemistry.config import WEAVE_DEFAULT_NUM_MAX_ATOMS
from chainer_chemistry.dataset.preprocessors.common \
    import construct_atomic_number_array
from chainer_chemistry.dataset.preprocessors.common \
    import MolFeatureExtractionError
from chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms
from chainer_chemistry.dataset.preprocessors.mol_preprocessor \
    import MolPreprocessor

# Element symbols recognized by the fixed atom-type feature.
ATOM = ['H', 'C', 'N', 'O', 'S', 'Cl', 'Br', 'F', 'P', 'I']
MAX_DISTANCE = 2  # 7


# --- Atom feature extraction ---
def construct_atom_type_vec(mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS,
                            atom_list=None, include_unknown_atom=False):
    # One-hot atom-type feature over `atom_list` (default `ATOM`), padded
    # to `num_max_atoms` rows.
    atom_list = atom_list or ATOM
    if include_unknown_atom:
        # all atom not in `atom_list` as considered as "unknown atom"
        # and its index is `len(atom_list)`
        n_atom_type = len(atom_list) + 1
    else:
        n_atom_type = len(atom_list)

    n_atom = mol.GetNumAtoms()
    atom_type_vec = numpy.zeros((num_max_atoms, n_atom_type),
                                dtype=numpy.float32)
    for i in \
            range(n_atom):
        a = mol.GetAtomWithIdx(i)
        try:
            atom_idx = atom_list.index(a.GetSymbol())
        except ValueError as e:
            if include_unknown_atom:
                atom_idx = len(atom_list)
            else:
                raise MolFeatureExtractionError(e)
        atom_type_vec[i, atom_idx] = 1.0
    return atom_type_vec


def construct_formal_charge_vec(mol,
                                num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):
    # One scalar per atom: the formal charge.
    n_atom = mol.GetNumAtoms()
    formal_charge_vec = numpy.zeros((num_max_atoms, 1), dtype=numpy.float32)
    for i in range(n_atom):
        a = mol.GetAtomWithIdx(i)
        formal_charge_vec[i, 0] = a.GetFormalCharge()
    return formal_charge_vec


def construct_hybridization_vec(mol,
                                num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):
    # TODO(Oono)
    # Can we enhance preprocessing speed by making factory once
    # prior to calling this function many times?
    # One-hot over three hybridization states per atom.
    # NOTE(review): RDKit stringifies sp hybridization as 'SP', not 'SP1' --
    # confirm whether the first slot can ever be set.
    n_atom = mol.GetNumAtoms()
    hybridization_vec = numpy.zeros((num_max_atoms, 3), dtype=numpy.float32)
    for i in range(n_atom):
        a = mol.GetAtomWithIdx(i)
        if a.GetHybridization() is None:
            continue
        hybridization_type = str(a.GetHybridization())
        if hybridization_type == 'SP1':
            hybridization_vec[i, 0] = 1.0
        elif hybridization_type == 'SP2':
            hybridization_vec[i, 1] = 1.0
        elif hybridization_type == 'SP3':
            hybridization_vec[i, 2] = 1.0
    return hybridization_vec


def construct_partial_charge_vec(
        mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):
    # Gasteiger partial charges; note this computes them in place on `mol`.
    AllChem.ComputeGasteigerCharges(mol)
    n = mol.GetNumAtoms()
    partial_charge_vec = numpy.zeros((num_max_atoms, 1), dtype=numpy.float32)
    for i in range(n):
        a = mol.GetAtomWithIdx(i)
        partial_charge_vec[i, 0] = a.GetProp("_GasteigerCharge")
    return partial_charge_vec


def construct_atom_ring_vec(mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):
    # One-hot over ring sizes 3..8 for every atom that belongs to a ring.
    nAtom = mol.GetNumAtoms()
    sssr = Chem.GetSymmSSSR(mol)
    ring_feature = numpy.zeros((num_max_atoms, 6,), dtype=numpy.float32)
    for ring in sssr:
        ring = list(ring)
        for i in range(nAtom):
            if i in ring:
                ring_size = len(ring)
                if ring_size >= 3 and ring_size <= 8:
                    ring_feature[i, ring_size - 3] = 1.0
    return ring_feature


def construct_hydrogen_bonding(mol,
                               num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):
    # Two flags per atom: hydrogen-bond donor / acceptor, detected with
    # RDKit's BaseFeatures chemical feature factory.
    fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
    feats = factory.GetFeaturesForMol(mol)
    hydrogen_bonding_vec = numpy.zeros((num_max_atoms, 2),
                                       dtype=numpy.float32)
    for f in feats:
        if f.GetFamily() == 'Donor':
            idx = f.GetAtomIds()[0]
            hydrogen_bonding_vec[idx, 0] = 1.0
        if f.GetFamily() == 'Acceptor':
            idx = f.GetAtomIds()[0]
            hydrogen_bonding_vec[idx, 1] = 1.0
    return hydrogen_bonding_vec


def construct_num_hydrogens_vec(mol,
                                num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):
    # Count explicit hydrogens bonded to each atom (O(n^2) bond scan).
    n_hydrogen_vec = numpy.zeros((num_max_atoms, 1), dtype=numpy.float32)
    n_atom = mol.GetNumAtoms()
    for i in range(n_atom):
        n = 0
        for j in range(n_atom):
            if i == j:
                continue
            a = mol.GetAtomWithIdx(j)
            if a.GetSymbol() != 'H':
                continue
            k = mol.GetBondBetweenAtoms(i, j)
            if k is not None:
                n += 1
        n_hydrogen_vec[i, 0] = n
    return n_hydrogen_vec


def construct_aromaticity_vec(mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):
    # 1.0 for every atom RDKit marks as aromatic.
    aromaticity_vec = numpy.zeros((num_max_atoms, 1), dtype=numpy.float32)
    aromatix_atoms = mol.GetAromaticAtoms()
    for a in aromatix_atoms:
        aromaticity_vec[a.GetIdx()] = 1.0
    return aromaticity_vec


def construct_atom_feature(mol, add_Hs,
                           num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS,
                           atom_list=None, include_unknown_atom=False):
    """construct atom feature

    Args:
        mol (Mol): mol instance
        add_Hs (bool): if the `mol` instance was added Hs, set True.
        num_max_atoms (int): number of max atoms
        atom_list (list): list of atoms to extract feature. If None, default
            `ATOM` is used as `atom_list`
        include_unknown_atom (bool): If False, when the `mol` includes atom
            which is not in `atom_list`, it will raise
            `MolFeatureExtractionError`.
            If True, even the atom is not in `atom_list`, `atom_type` is set
            as "unknown" atom.

    Returns (numpy.ndarray): 2 dimensional array. First axis size is
        `num_max_atoms`, representing each atom index.
        Second axis for feature.

""" atom_type_vec = construct_atom_type_vec( mol, num_max_atoms, atom_list=atom_list, include_unknown_atom=include_unknown_atom) # TODO(nakago): Chilarity formal_charge_vec = construct_formal_charge_vec( mol, num_max_atoms=num_max_atoms) partial_charge_vec = construct_partial_charge_vec( mol, num_max_atoms=num_max_atoms) atom_ring_vec = construct_atom_ring_vec( mol, num_max_atoms=num_max_atoms) hybridization_vec = construct_hybridization_vec( mol, num_max_atoms=num_max_atoms) hydrogen_bonding = construct_hydrogen_bonding( mol, num_max_atoms=num_max_atoms) aromaticity_vec = construct_aromaticity_vec( mol, num_max_atoms=num_max_atoms) if add_Hs: num_hydrogens_vec = construct_num_hydrogens_vec( mol, num_max_atoms=num_max_atoms) feature = numpy.hstack((atom_type_vec, formal_charge_vec, partial_charge_vec, atom_ring_vec, hybridization_vec, hydrogen_bonding, aromaticity_vec, num_hydrogens_vec)) else: feature = numpy.hstack((atom_type_vec, formal_charge_vec, partial_charge_vec, atom_ring_vec, hybridization_vec, hydrogen_bonding, aromaticity_vec)) return feature # --- Pair feature extraction --- def construct_bond_vec(mol, i, j): bond_feature_vec = numpy.zeros((4, ), dtype=numpy.float32) k = mol.GetBondBetweenAtoms(i, j) if k is not None: bond_type = str(k.GetBondType()) if bond_type == 'SINGLE': bond_feature_vec[0] = 1.0 elif bond_type == 'DOUBLE': bond_feature_vec[1] = 1.0 elif bond_type == 'TRIPLE': bond_feature_vec[2] = 1.0 elif bond_type == 'AROMATIC': bond_feature_vec[3] = 1.0 else: raise ValueError("Unknown bond type {}".format(bond_type)) return bond_feature_vec def construct_distance_vec(distance_matrix, i, j): distance = min(MAX_DISTANCE, int(distance_matrix[i][j])) distance_feature = numpy.zeros((MAX_DISTANCE, ), dtype=numpy.float32) distance_feature[:distance] = 1.0 return distance_feature def construct_ring_feature_vec(mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS): n_atom = mol.GetNumAtoms() sssr = Chem.GetSymmSSSR(mol) ring_feature_vec = numpy.zeros( 
(num_max_atoms ** 2, 1,), dtype=numpy.float32) for ring in sssr: ring = list(ring) n_atom_in_ring = len(ring) for i in range(n_atom_in_ring): for j in range(n_atom_in_ring): a0 = ring[i] a1 = ring[j] ring_feature_vec[a0 * n_atom + a1] = 1 return ring_feature_vec def construct_pair_feature(mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS): """construct pair feature Args: mol (Mol): mol instance num_max_atoms (int): number of max atoms Returns (numpy.ndarray): 2 dimensional array. First axis size is `num_max_atoms` ** 2, representing index of each atom pair. Second axis for feature. """ n_atom = mol.GetNumAtoms() distance_matrix = Chem.GetDistanceMatrix(mol) distance_feature = numpy.zeros((num_max_atoms ** 2, MAX_DISTANCE,), dtype=numpy.float32) for i in range(n_atom): for j in range(n_atom): distance_feature[i * n_atom + j] = construct_distance_vec( distance_matrix, i, j) bond_feature = numpy.zeros((num_max_atoms ** 2, 4,), dtype=numpy.float32) for i in range(n_atom): for j in range(n_atom): bond_feature[i * n_atom + j] = construct_bond_vec(mol, i, j) ring_feature = construct_ring_feature_vec(mol, num_max_atoms=num_max_atoms) feature = numpy.hstack((distance_feature, bond_feature, ring_feature)) return feature class WeaveNetPreprocessor(MolPreprocessor): """WeaveNetPreprocessor WeaveNet must have fixed-size atom list for now, zero_padding option is always set to True. Args: max_atoms (int): Max number of atoms for each molecule, if the number of atoms is more than this value, this data is simply ignored. Setting negative value indicates no limit for max atoms. add_Hs (bool): If True, implicit Hs are added. use_fixed_atom_feature (bool): If True, atom feature is extracted used in original paper. If it is False, atomic number is used instead. atom_list (list): list of atoms to extract feature. 
            If None, default `ATOM` is used as `atom_list`
        include_unknown_atom (bool): If False, when the `mol` includes atom
            which is not in `atom_list`, it will raise
            `MolFeatureExtractionError`.
            If True, even the atom is not in `atom_list`, `atom_type` is set
            as "unknown" atom.
        kekulize (bool): If True, Kekulizes the molecule.

    """

    def __init__(self, max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS, add_Hs=True,
                 use_fixed_atom_feature=False, atom_list=None,
                 include_unknown_atom=False, kekulize=False):
        super(WeaveNetPreprocessor, self).__init__(
            add_Hs=add_Hs, kekulize=kekulize)
        # WeaveNet always zero-pads, so a positive atom limit is required.
        zero_padding = True
        if zero_padding and max_atoms <= 0:
            raise ValueError('max_atoms must be set to positive value when '
                             'zero_padding is True')

        self.max_atoms = max_atoms
        self.add_Hs = add_Hs
        self.zero_padding = zero_padding
        self.use_fixed_atom_feature = use_fixed_atom_feature
        self.atom_list = atom_list
        self.include_unknown_atom = include_unknown_atom

    def get_input_features(self, mol):
        """get input features for WeaveNet

        WeaveNetPreprocessor automatically add `H` to `mol`

        Args:
            mol (Mol): molecule to featurize

        """
        type_check_num_atoms(mol, self.max_atoms)
        if self.use_fixed_atom_feature:
            # original paper feature extraction
            atom_array = construct_atom_feature(mol, self.add_Hs,
                                                self.max_atoms,
                                                self.atom_list,
                                                self.include_unknown_atom)
        else:
            # embed id of atomic numbers
            atom_array = construct_atomic_number_array(mol, self.max_atoms)

        pair_feature = construct_pair_feature(mol,
                                              num_max_atoms=self.max_atoms)
        return atom_array, pair_feature


================================================
FILE: chainer_chemistry/dataset/preprocessors/wle.py
================================================
import collections

import numpy as np

from chainer_chemistry.dataset.preprocessors import wle_io
from chainer_chemistry.dataset.preprocessors import wle_atom_array_update as wle_update

DEBUG = False


def apply_wle_for_datasets(datasets, cutoff=0, k=1):
    """
    Apply label Weisfeiler--Lehman Embedding for the tuple of datasets.

    Args:
        datasets: tuple of dataset (usually, train/val/test),
            each dataset consists of atom_array and adj_array
            and teach_signal
        cutoff: int, if more than 0, the expanded labels whose
            freq <= cutoff will be removed.
        k: int, the number of iterations of neighborhood aggregation.

    Returns:
        - tuple of dataset (usually, train/val/test),
          each dataset consists of atom_number_array and adj_tensor
          with expanded labels
        - list of all labels, used in the dataset parts.
        - dictionary of label frequencies key:label value:frequency count
    """
    atom_arrays, adj_arrays, teach_signals = \
        wle_io.load_dataset_elements(datasets)

    # Each pass relabels every node with its neighborhood signature.
    for _ in range(k):
        atom_arrays, labels_frequencies = wle_update.update_atom_arrays(
            atom_arrays, adj_arrays, cutoff)

    datasets_expanded = wle_io.create_datasets(atom_arrays, adj_arrays,
                                               teach_signals)
    expanded_labels = list(labels_frequencies.keys())
    return tuple(datasets_expanded), expanded_labels, labels_frequencies


def apply_cwle_for_datasets(datasets, k=1):
    """
    Apply Concatenated Weisfeiler--Lehman embedding for the tuple of
    datasets. This is also applicable for the Gated-sum Weisfeiler--Lehman
    embedding.

    Args:
        datasets: tuple of dataset (usually, train/val/test),
            each dataset consists of atom_array and adj_array
            and teach_signal
        k: int, the number of iterations of neighborhood aggregation.

    Returns:
        - tuple of dataset (usually, train/val/test),
          each dataset consists of atom_number_array,
          expanded_label_array, and adj_tensor
        - list of all expanded labels, used in the dataset parts.
        - dictionary of label frequencies key:label value:frequency count
    """
    if k <= 0:
        raise ValueError('Iterations should be a positive integer. '
' 'Found k={}'.format(k)) atom_arrays, adj_arrays, teach_signals = wle_io.load_dataset_elements(datasets) for i in range(k): if i != k - 1: atom_arrays, labels_frequencies = wle_update.update_atom_arrays( atom_arrays, adj_arrays, 0) else: wle_arrays, labels_frequencies = wle_update.update_atom_arrays( atom_arrays, adj_arrays, 0, False) datasets_expanded = wle_io.create_datasets( atom_arrays, adj_arrays, teach_signals, wle_arrays) expanded_labels = list(labels_frequencies.keys()) return tuple(datasets_expanded), expanded_labels, labels_frequencies def _findmaxidx(datasets, idx): atom_data_size = len(datasets[0][0]) if atom_data_size <= idx: raise ValueError( 'data index is out of index. ' 'atom_data size={} <= idx={}'.format( atom_data_size, idx)) max_idx = -1 for dataset in datasets: for mol_data in dataset: atom_array = mol_data[idx] max_atom_idx = np.max(atom_array) if max_atom_idx > max_idx: max_idx = max_atom_idx return max_idx + 1 # 0-origin def findmaxidx(datasets, target='atom_label'): """ Retruns the maximum number of the symbol index in an atom array, throughout the datasets. 
Args: datasets: dataset entity target: choice of 'atom_label' of 'wle_label' Returns: _findmaxidx(datasets, 0/2) """ if target == 'atom_label': return _findmaxidx(datasets, 0) elif target == 'wle_label': return _findmaxidx(datasets, 2) ================================================ FILE: chainer_chemistry/dataset/preprocessors/wle_atom_array_update.py ================================================ import collections import numpy as np from chainer_chemistry.dataset.preprocessors import wle_util def update_atom_arrays(atom_arrays, adj_arrays, cutoff, with_focus_atom=True): expanded_atom_lists, labels_frequencies = list_all_expanded_labels( atom_arrays, adj_arrays, with_focus_atom) if cutoff > 0: expanded_atom_lists, labels_frequencies = shrink_expanded_labels( expanded_atom_lists, labels_frequencies, cutoff) expanded_labels = list(labels_frequencies.keys()) atom_arrays = [wle_util.to_index(l, expanded_labels) for l in expanded_atom_lists] return atom_arrays, labels_frequencies def shrink_expanded_labels(expanded_atom_lists, labels_frequencies, cutoff): """ Cut off the few-appearance expanded labels Args: expanded_atom_lists: tuple of list of expanded labels labels_frequencies: list of label apperacne frequencies cutoff: int, frequency cut of expanded labels Returns: - 3 (train/val/test) tuple of expanded atom arrays (all nodes are associated with string representations of expanded signals) - dictionary of frequencies all labels (key: label, value: frequency) """ # atom_array values are expanded label "STRING", not numbers new_expanded_atom_lists = [] new_labels_frequencies = collections.defaultdict(lambda: 0) # for each train/val/test, do for set_expanded_atom_list in expanded_atom_lists: # for each molecule sample, do new_set_expanded_atom_list = [] for expanded_atom_list in set_expanded_atom_list: mol_expanded_atom_list = [] # for each node i in the molecule, # get the neighbor's atom label (number index) for expanded_label in expanded_atom_list: freq = 
                    labels_frequencies[expanded_label]
                # check frequency here
                if freq > cutoff:
                    label = expanded_label
                else:
                    # Rare label: fall back to the focus atom's own label.
                    label = wle_util.get_focus_node_label(expanded_label)
                mol_expanded_atom_list.append(label)
                new_labels_frequencies[label] = \
                    new_labels_frequencies[label] + 1
                # end cutoff-ifelse
            # end i-for
            new_set_expanded_atom_list.append(mol_expanded_atom_list)
        # end zip(atom_arrays, adj_array)-for
        new_expanded_atom_lists.append(new_set_expanded_atom_list)
    # end zip(atom_arrays, adj_array)-for

    return new_expanded_atom_lists, dict(new_labels_frequencies)


def list_all_expanded_labels(atom_arrays, adj_arrays, with_focus_atom=True):
    """
    Expand all nodes into WLE representation.
    At the same time, return the list of all labels after expansion

    Args:
        atom_arrays: 3 (train/val/test) tuple of atom arrays
        adj_arrays: 3 (train/val/test) tuple of adj.arrays
        with_focus_atom: bool, if True, the expanded label starts from the
            original atom label ("C-ON-OFN")
            if False, the expanded label does not include the original
            atom label ("-CN-OFN")

    Returns:
        - 3 (train/val/test) tuple of expanded atom arrays (all nodes are
          associated with string representations of expanded signals)
        - list of all labels appeared in the expanded atom arrays.
        - dictionary of frequencies all labels (key: label, value: frequency)
    """
    expanded_atom_lists = []
    # atom_array values are expanded label "STRING", not numbers
    labels_frequencies = collections.defaultdict(lambda: 0)

    # for each train/val/test, do
    for set_atom_arrays, set_adj_arrays in zip(atom_arrays, adj_arrays):
        # for each molecule sample, do
        set_expanded_atom_list = []
        for atom_array, adj_array in zip(set_atom_arrays, set_adj_arrays):
            N = len(atom_array)  # number of nodes (atoms) in this molecule
            # atom_array: N by F
            # adj_array: N by N or N by R by N

            # compress the relation axis
            adj_array = wle_util.compress_relation_axis(adj_array)
            assert adj_array.shape == (N, N)

            # find neighbors
            # array[0]: row index array[1]: column index
            neighbors = np.nonzero(adj_array)

            mol_expanded_atom_list = []
            # for each node i in the molecule,
            # get the neighbor's atom label (number index)
            for i in range(N):
                expanded_label = wle_util.get_neighbor_representation(
                    i, atom_array, neighbors, with_focus_atom)
                mol_expanded_atom_list.append(expanded_label)
                labels_frequencies[expanded_label] = \
                    labels_frequencies[expanded_label] + 1
            # end i-for
            set_expanded_atom_list.append(mol_expanded_atom_list)
        # end zip(atom_arrays, adj_array)-for
        expanded_atom_lists.append(set_expanded_atom_list)
    # end zip(atom_arrays, adj_array)-for

    # Convert to a normal dictionary because
    # we cannot pickle defaultdicts with lambdas.
    return expanded_atom_lists, dict(labels_frequencies)


================================================
FILE: chainer_chemistry/dataset/preprocessors/wle_io.py
================================================
import numpy as np

from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset

DEBUG = False


def create_datasets(atom_arrays, adj_arrays, teach_signals, wle_arrays=None):
    """
    Expand the atomic_num_arrays with the expanded labels,
    then return valid datasets (tuple of NumpyTupleDataset)

    Args:
        atom_arrays: 3-tuple of list of lists.
            atom_arrays[i][j][k] is the id of an atom
            i: train/val/test
            j: index of a sample (i.e. molecule)
            k: index of an atom
        adj_arrays: list of list of numpy.array, all mol's adjacency tensors
        teach_signals: list of list of numpy.array, all teacher
            (supervision) signals
        wle_arrays: None (for WLE) or 3-tuple of list of lists
            (for CWLE and GWLE).

    Returns:
        3 tuple of valid datasets (train/val/test) in NumpyTupleDataset
    """
    output_datasets = []

    # ToDo: try another indexing: e.g. original node label + extensions
    assert len(atom_arrays) == len(adj_arrays) == len(teach_signals)
    if wle_arrays is not None:
        assert len(atom_arrays) == len(wle_arrays)
    for i in range(len(atom_arrays)):
        # We have swapped the axes 0 and 1 for adj-arrays
        # (in load_dataset_elements); re-swap them back here.
        set_adj_arrays = np.array(adj_arrays[i])
        for m in range(len(set_adj_arrays)):
            set_adj_arrays[m] = np.swapaxes(set_adj_arrays[m], 0, 1)

        if wle_arrays is None:
            dataset = NumpyTupleDataset(np.array(atom_arrays[i]),
                                        set_adj_arrays,
                                        np.array(teach_signals[i]))
        else:
            dataset = NumpyTupleDataset(np.array(atom_arrays[i]),
                                        set_adj_arrays,
                                        np.array(wle_arrays[i]),
                                        np.array(teach_signals[i]))
        output_datasets.append(dataset)
    # end expanded-for
    return output_datasets


def load_dataset_elements(datasets):
    """
    Load all dataset tuples: atom array, adj. array, and teacher signals.

    Args:
        datasets: tuple of NumpyTupleDataset

    Returns:
        - tuple of lists of atom arrays, adj.arrays, and teacher signals.
    """
    if DEBUG:
        print('type(datasets)', type(datasets))  # tuple
    atom_arrays = []  # 3 by num_mols by N by F
    adj_arrays = []  # 3 by num_mols by N by N, or 3 by N by R by N by N by N
    teach_signals = []  # 3 by num_mols by N by (data-dependent)

    for dataset in datasets:
        if DEBUG:
            print('type(dataset)', type(dataset))  # NumpyTupleDataset
        set_atom_arrays = []  # Mol by N
        set_adj_arrays = []  # Mol by N by N or N by R by N by N
        set_teach_signals = []  # Mol by (data-dependent)
        for mol_data in dataset:
            atom_array = mol_data[0]
            adj_array = mol_data[1]
            teach_signal = mol_data[2]
            if DEBUG:
                print("type(mol_data)=", type(mol_data))  # tuple
                print("type(atom_arrray)=", type(atom_array))  # ndarray
                print("type(adj_arrray)=", type(adj_array))  # ndarray
                print("type(teach_signal)=", type(teach_signal))  # ndarray
            set_atom_arrays.append(atom_array)
            # for 3-D tensor, we swap axis here
            set_adj_arrays.append(adj_array.swapaxes(0, 1))
            set_teach_signals.append(teach_signal)
        # end dataset-for
        atom_arrays.append(set_atom_arrays)
        adj_arrays.append(set_adj_arrays)
        teach_signals.append(set_teach_signals)
    # end datasets-for
    return atom_arrays, adj_arrays, teach_signals


================================================
FILE: chainer_chemistry/dataset/preprocessors/wle_util.py
================================================
import numpy as np

DEBUG = False


def _index(atom, values):
    # Map one expanded-label string to its position in `values`.
    idx = values.index(atom)
    if DEBUG:
        print("idx=", idx)
        print("expanded_label=", atom)
    return idx


def to_index(mols, values):
    # Convert each per-molecule list of label strings to an int32 array of
    # indices into `values`.
    return np.array([np.array([_index(atom, values) for atom in mol],
                              dtype=np.int32)
                     for mol in mols])


def compress_relation_axis(adj_array):
    """Collapse a relational adjacency tensor to a plain (N, N) matrix.

    2-D input is returned unchanged; 3-D input is summed over axis 1
    (the relation axis, given the (N, R, N) layout used by the callers).
    """
    ndim = adj_array.ndim
    if ndim == 2:
        return adj_array
    elif ndim == 3:
        return np.sum(adj_array, axis=1, keepdims=False)
    else:
        raise ValueError(
            'ndim of adjacency matrix should be 2 or 3. '
            'Found ndim={}.'.format(ndim))


def _to_string(atom_label, neighbor_labels, with_focus_atom):
    # Signature string: "focus-n1.n2..." or just "n1.n2..." without focus.
    expanded_label = ".".join(map(str, neighbor_labels))
    if with_focus_atom:
        expanded_label = str(atom_label) + "-" + expanded_label
    if DEBUG:
        print("expanded_label=" + expanded_label)
    return expanded_label


def get_neighbor_representation(idx, atom_array, neighbors, with_focus_atom):
    """Build the WLE label string for node `idx` from its neighbor labels.

    Args:
        idx: focus node index.
        atom_array: per-node label array.
        neighbors: (row indices, column indices) as returned by np.nonzero
            on the adjacency matrix.
        with_focus_atom: prepend the focus node's own label when True.
    """
    atom_label = atom_array[idx]
    # Columns of the adjacency rows belonging to `idx`, i.e. its neighbors.
    neighbor = neighbors[1][np.where(neighbors[0] == idx)]
    if DEBUG:
        print(neighbor)
        print("len(neighbor_i)=", len(neighbor))
    # Self-loops are excluded; sorting makes the signature order-invariant.
    neighbor_labels = np.sort([atom_array[x] for x in neighbor if x != idx])
    return _to_string(atom_label, neighbor_labels, with_focus_atom)


def get_focus_node_label(expanded_label):
    """Return the focus-atom part of a "focus-neighbors" expanded label."""
    tokens = expanded_label.split('-')
    if len(tokens) != 2:
        raise ValueError(
            'Invalid label={}'.format(expanded_label))
    return tokens[0]


================================================
FILE: chainer_chemistry/dataset/splitters/__init__.py
================================================
from chainer_chemistry.dataset.splitters import base_splitter  # NOQA
from chainer_chemistry.dataset.splitters import random_splitter  # NOQA
from chainer_chemistry.dataset.splitters import scaffold_splitter  # NOQA
from chainer_chemistry.dataset.splitters import deepchem_scaffold_splitter  # NOQA
from chainer_chemistry.dataset.splitters import stratified_splitter  # NOQA
from chainer_chemistry.dataset.splitters import time_splitter  # NOQA
from chainer_chemistry.dataset.splitters.base_splitter import BaseSplitter  # NOQA
from chainer_chemistry.dataset.splitters.random_splitter import RandomSplitter  # NOQA
from chainer_chemistry.dataset.splitters.scaffold_splitter import ScaffoldSplitter  # NOQA
from chainer_chemistry.dataset.splitters.deepchem_scaffold_splitter import DeepChemScaffoldSplitter  # NOQA
from chainer_chemistry.dataset.splitters.stratified_splitter import StratifiedSplitter  # NOQA
from chainer_chemistry.dataset.splitters.time_splitter import TimeSplitter  # NOQA

# Mapping from split-method name to the corresponding splitter class.
split_method_dict = {
    'random': RandomSplitter,
    'stratified': StratifiedSplitter,
    'scaffold': ScaffoldSplitter,
    'dc_scaffold': DeepChemScaffoldSplitter,
    'time': TimeSplitter,
}


================================================
FILE: chainer_chemistry/dataset/splitters/base_splitter.py
================================================
from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset


def converter_default(dataset, indices):
    # Plain fancy indexing works for numpy arrays and similar sequences.
    return dataset[indices]


def converter_numpy_tuple_dataset(dataset, indices):
    # NumpyTupleDataset must be re-wrapped after indexing its features.
    return NumpyTupleDataset(*dataset.features[indices])


converter_dict = {
    NumpyTupleDataset: converter_numpy_tuple_dataset
}


class BaseSplitter(object):
    """Common interface for train/valid/test dataset splitting.

    Subclasses implement `_split`, returning three index arrays.
    """

    def k_fold_split(self, dataset, k):
        raise NotImplementedError

    def _split(self, dataset, **kwargs):
        raise NotImplementedError

    def train_valid_test_split(self, dataset, frac_train=0.8, frac_valid=0.1,
                               frac_test=0.1, converter=None,
                               return_index=True, **kwargs):
        if converter is None:
            converter = converter_dict.get(type(dataset), converter_default)
        train_inds, valid_inds, test_inds = self._split(dataset, frac_train,
                                                        frac_valid, frac_test,
                                                        **kwargs)
        if return_index:
            return train_inds, valid_inds, test_inds
        else:
            train = converter(dataset, train_inds)
            valid = converter(dataset, valid_inds)
            test = converter(dataset, test_inds)
            return train, valid, test,

    def train_valid_split(self, dataset, frac_train=0.9, frac_valid=0.1,
                          converter=None, return_index=True, **kwargs):
        train_inds, valid_inds, test_inds = self._split(dataset, frac_train,
                                                        frac_valid, 0.,
                                                        **kwargs)
        # A zero test fraction must yield an empty test split.
        assert len(test_inds) == 0
        if converter is None:
            converter = converter_dict.get(type(dataset), converter_default)
        if return_index:
            return train_inds, valid_inds
        else:
            train = converter(dataset, train_inds)
            valid = converter(dataset, valid_inds)
            return train, valid,


================================================
FILE: chainer_chemistry/dataset/splitters/deepchem_scaffold_splitter.py
================================================
from collections import defaultdict
def generate_scaffold(smiles, include_chirality=False):
    """Return the Bemis-Murcko scaffold SMILES string of the target molecule."""
    mol = Chem.MolFromSmiles(smiles)
    scaffold = MurckoScaffold\
        .MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)
    return scaffold


class DeepChemScaffoldSplitter(BaseSplitter):
    """Class for doing data splits by chemical scaffold.

    The split is deterministic: scaffold groups are assigned to train,
    then valid, then test, visiting the largest groups first. A ``seed``
    keyword is accepted for interface compatibility but has no effect.

    Referred Deepchem for the implementation,
    https://github.com/deepchem/deepchem/blob/master/deepchem/splits/splitters.py
    """
    def _split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1,
               **kwargs):
        numpy.testing.assert_almost_equal(frac_train + frac_valid + frac_test,
                                          1.)
        smiles_list = kwargs.get('smiles_list')
        include_chirality = kwargs.get('include_chirality')
        if len(dataset) != len(smiles_list):
            raise ValueError("The lengths of dataset and smiles_list are "
                             "different")

        # Group dataset indices by molecular scaffold. Indices are appended
        # in ascending order, so each group is already sorted.
        scaffolds = defaultdict(list)
        for ind, smiles in enumerate(smiles_list):
            scaffold = generate_scaffold(smiles, include_chirality)
            scaffolds[scaffold].append(ind)

        # Sort from largest to smallest scaffold sets; ties are broken by
        # the smallest contained index so the order is reproducible.
        scaffold_sets = [
            scaffold_set for (scaffold, scaffold_set) in sorted(
                scaffolds.items(),
                key=lambda x: (len(x[1]), x[1][0]),
                reverse=True)
        ]

        train_cutoff = frac_train * len(dataset)
        valid_cutoff = (frac_train + frac_valid) * len(dataset)
        train_inds, valid_inds, test_inds = [], [], []
        # Fill train first; any group that would overflow train goes to
        # valid, and any group that would overflow valid goes to test.
        for scaffold_set in scaffold_sets:
            if len(train_inds) + len(scaffold_set) > train_cutoff:
                if len(train_inds) + len(valid_inds) + len(scaffold_set) \
                        > valid_cutoff:
                    test_inds += scaffold_set
                else:
                    valid_inds += scaffold_set
            else:
                train_inds += scaffold_set
        return numpy.array(train_inds), numpy.array(valid_inds), \
            numpy.array(test_inds)

    def train_valid_test_split(self, dataset, smiles_list, frac_train=0.8,
                               frac_valid=0.1, frac_test=0.1, converter=None,
                               return_index=True, seed=None,
                               include_chirality=False, **kwargs):
        """Split dataset into train, valid and test set.

        Split indices are generated by splitting based on the scaffold of
        small molecules.

        Args:
            dataset(NumpyTupleDataset, numpy.ndarray): Dataset.
            smiles_list(list): SMILES list corresponding to dataset.
            seed (int): Accepted for interface compatibility; the scaffold
                split is deterministic and does not use it.
            frac_train(float): Fraction of dataset put into training data.
            frac_valid(float): Fraction of dataset put into validation data.
            frac_test(float): Fraction of dataset put into test data.
            converter(callable): Maps (dataset, indices) to a sub-dataset.
            return_index(bool): If `True`, this function returns only
                indices. If `False`, this function returns splitted dataset.

        Returns:
            SplittedDataset(tuple): splitted dataset or indices
        """
        return super(DeepChemScaffoldSplitter, self)\
            .train_valid_test_split(dataset, frac_train, frac_valid,
                                    frac_test, converter, return_index,
                                    seed=seed, smiles_list=smiles_list,
                                    include_chirality=include_chirality,
                                    **kwargs)

    def train_valid_split(self, dataset, smiles_list, frac_train=0.9,
                          frac_valid=0.1, converter=None, return_index=True,
                          seed=None, include_chirality=False, **kwargs):
        """Split dataset into train and valid set.

        Split indices are generated by splitting based on the scaffold of
        small molecules.

        Args:
            dataset(NumpyTupleDataset, numpy.ndarray): Dataset.
            smiles_list(list): SMILES list corresponding to dataset.
            seed (int): Accepted for interface compatibility; the scaffold
                split is deterministic and does not use it.
            frac_train(float): Fraction of dataset put into training data.
            frac_valid(float): Fraction of dataset put into validation data.
            converter(callable): Maps (dataset, indices) to a sub-dataset.
            return_index(bool): If `True`, this function returns only
                indices. If `False`, this function returns splitted dataset.

        Returns:
            SplittedDataset(tuple): splitted dataset or indices
        """
        return super(DeepChemScaffoldSplitter, self)\
            .train_valid_split(dataset, frac_train, frac_valid, converter,
                               return_index, seed=seed,
                               smiles_list=smiles_list,
                               include_chirality=include_chirality, **kwargs)
class RandomSplitter(BaseSplitter):
    """Splitter that assigns examples to the subsets uniformly at random."""

    def _split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1,
               **kwargs):
        numpy.testing.assert_almost_equal(frac_train + frac_valid + frac_test,
                                          1.)
        seed = kwargs.get('seed')
        # A seeded RandomState keeps the permutation reproducible; without a
        # seed the global numpy RNG is used.
        if seed is None:
            perm = numpy.random.permutation(len(dataset))
        else:
            perm = numpy.random.RandomState(seed).permutation(len(dataset))
        n_train = int(len(dataset) * frac_train)
        n_valid = int(len(dataset) * frac_valid)
        valid_end = n_train + n_valid
        return perm[:n_train], perm[n_train:valid_end], perm[valid_end:]

    def train_valid_test_split(self, dataset, frac_train=0.8, frac_valid=0.1,
                               frac_test=0.1, converter=None,
                               return_index=True, seed=None, **kwargs):
        """Generate indices to split data into train, valid and test set.

        Args:
            dataset(NumpyTupleDataset, numpy.ndarray): Dataset.
            seed (int): Random seed.
            frac_train(float): Fraction of dataset put into training data.
            frac_valid(float): Fraction of dataset put into validation data.
            frac_test(float): Fraction of dataset put into test data.
            converter(callable): Maps (dataset, indices) to a sub-dataset.
            return_index(bool): If `True`, this function returns only
                indexes. If `False`, this function returns splitted dataset.

        Returns:
            SplittedDataset(tuple): splitted dataset or indexes

        .. admonition:: Example

           >>> from chainer_chemistry.datasets import NumpyTupleDataset
           >>> from chainer_chemistry.dataset.splitters import RandomSplitter
           >>> a = numpy.random.random((10, 10))
           >>> b = numpy.random.random((10, 8))
           >>> c = numpy.random.random((10, 1))
           >>> d = NumpyTupleDataset(a, b, c)
           >>> splitter = RandomSplitter()
           >>> train, valid, test = splitter.train_valid_test_split(
           ...     d, return_index=False)
           >>> print(len(train), len(valid), len(test))
           8, 1, 1
        """
        return super(RandomSplitter, self).train_valid_test_split(
            dataset, frac_train, frac_valid, frac_test, converter,
            return_index, seed=seed, **kwargs)

    def train_valid_split(self, dataset, frac_train=0.9, frac_valid=0.1,
                          converter=None, return_index=True, seed=None,
                          **kwargs):
        """Generate indices to split data into train and valid set.

        Args:
            dataset(NumpyTupleDataset, numpy.ndarray): Dataset.
            seed (int): Random seed.
            frac_train(float): Fraction of dataset put into training data.
            frac_valid(float): Fraction of dataset put into validation data.
            converter(callable): Maps (dataset, indices) to a sub-dataset.
            return_index(bool): If `True`, this function returns only
                indexes. If `False`, this function returns splitted dataset.

        Returns:
            SplittedDataset(tuple): splitted dataset or indexes

        .. admonition:: Example

           >>> from chainer_chemistry.datasets import NumpyTupleDataset
           >>> from chainer_chemistry.dataset.splitters import RandomSplitter
           >>> a = numpy.random.random((10, 10))
           >>> b = numpy.random.random((10, 8))
           >>> c = numpy.random.random((10, 1))
           >>> d = NumpyTupleDataset(a, b, c)
           >>> splitter = RandomSplitter()
           >>> train, valid = splitter.train_valid_split(
           ...     d, return_index=False)
           >>> print(len(train), len(valid))
           9, 1
        """
        return super(RandomSplitter, self).train_valid_split(
            dataset, frac_train, frac_valid, converter, return_index,
            seed=seed, **kwargs)
def generate_scaffold(smiles, include_chirality=False):
    """Return the Bemis-Murcko scaffold SMILES string of the target molecule."""
    mol = Chem.MolFromSmiles(smiles)
    scaffold = MurckoScaffold\
        .MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)
    return scaffold


class ScaffoldSplitter(BaseSplitter):
    """Class for doing data splits by chemical scaffold.

    Referred Deepchem for the implementation, https://git.io/fXzF4
    """
    def _split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1,
               **kwargs):
        numpy.testing.assert_almost_equal(frac_train + frac_valid + frac_test,
                                          1.)
        seed = kwargs.get('seed', None)
        smiles_list = kwargs.get('smiles_list')
        include_chirality = kwargs.get('include_chirality')
        if len(dataset) != len(smiles_list):
            raise ValueError("The lengths of dataset and smiles_list are "
                             "different")

        rng = numpy.random.RandomState(seed)

        # Group dataset indices by molecular scaffold.
        scaffolds = defaultdict(list)
        for ind, smiles in enumerate(smiles_list):
            scaffold = generate_scaffold(smiles, include_chirality)
            scaffolds[scaffold].append(ind)

        # Shuffle the scaffold groups. Permute positions rather than the
        # nested list itself: ``rng.permutation(list_of_lists)`` builds a
        # ragged ndarray, which raises on NumPy >= 1.24. Shuffling index
        # positions consumes the RNG identically, so seeded splits are
        # unchanged.
        groups = list(scaffolds.values())
        scaffold_sets = [groups[i] for i in rng.permutation(len(groups))]

        n_total_valid = int(numpy.floor(frac_valid * len(dataset)))
        n_total_test = int(numpy.floor(frac_test * len(dataset)))

        train_index = []
        valid_index = []
        test_index = []

        # Greedily fill valid and test up to their quotas; any group that
        # would overflow both goes to train.
        for scaffold_set in scaffold_sets:
            if len(valid_index) + len(scaffold_set) <= n_total_valid:
                valid_index.extend(scaffold_set)
            elif len(test_index) + len(scaffold_set) <= n_total_test:
                test_index.extend(scaffold_set)
            else:
                train_index.extend(scaffold_set)

        return numpy.array(train_index), numpy.array(valid_index),\
            numpy.array(test_index)

    def train_valid_test_split(self, dataset, smiles_list, frac_train=0.8,
                               frac_valid=0.1, frac_test=0.1, converter=None,
                               return_index=True, seed=None,
                               include_chirality=False, **kwargs):
        """Split dataset into train, valid and test set.

        Split indices are generated by splitting based on the scaffold of
        small molecules.

        Args:
            dataset(NumpyTupleDataset, numpy.ndarray): Dataset.
            smiles_list(list): SMILES list corresponding to dataset.
            seed (int): Random seed.
            frac_train(float): Fraction of dataset put into training data.
            frac_valid(float): Fraction of dataset put into validation data.
            frac_test(float): Fraction of dataset put into test data.
            converter(callable): Maps (dataset, indices) to a sub-dataset.
            return_index(bool): If `True`, this function returns only
                indices. If `False`, this function returns splitted dataset.

        Returns:
            SplittedDataset(tuple): splitted dataset or indices
        """
        return super(ScaffoldSplitter, self)\
            .train_valid_test_split(dataset, frac_train, frac_valid,
                                    frac_test, converter, return_index,
                                    seed=seed, smiles_list=smiles_list,
                                    include_chirality=include_chirality,
                                    **kwargs)

    def train_valid_split(self, dataset, smiles_list, frac_train=0.9,
                          frac_valid=0.1, converter=None, return_index=True,
                          seed=None, include_chirality=False, **kwargs):
        """Split dataset into train and valid set.

        Split indices are generated by splitting based on the scaffold of
        small molecules.

        Args:
            dataset(NumpyTupleDataset, numpy.ndarray): Dataset.
            smiles_list(list): SMILES list corresponding to dataset.
            seed (int): Random seed.
            frac_train(float): Fraction of dataset put into training data.
            frac_valid(float): Fraction of dataset put into validation data.
            converter(callable): Maps (dataset, indices) to a sub-dataset.
            return_index(bool): If `True`, this function returns only
                indices. If `False`, this function returns splitted dataset.

        Returns:
            SplittedDataset(tuple): splitted dataset or indices
        """
        return super(ScaffoldSplitter, self)\
            .train_valid_split(dataset, frac_train, frac_valid, converter,
                               return_index, seed=seed,
                               smiles_list=smiles_list,
                               include_chirality=include_chirality, **kwargs)
Returns: SplittedDataset(tuple): splitted dataset or indices """ return super(ScaffoldSplitter, self)\ .train_valid_test_split(dataset, frac_train, frac_valid, frac_test, converter, return_index, seed=seed, smiles_list=smiles_list, include_chirality=include_chirality, **kwargs) def train_valid_split(self, dataset, smiles_list, frac_train=0.9, frac_valid=0.1, converter=None, return_index=True, seed=None, include_chirality=False, **kwargs): """Split dataset into train and valid set. Split indices are generated by splitting based on the scaffold of small molecules. Args: dataset(NumpyTupleDataset, numpy.ndarray): Dataset. smiles_list(list): SMILES list corresponding to datset. seed (int): Random seed. frac_train(float): Fraction of dataset put into training data. frac_valid(float): Fraction of dataset put into validation data. converter(callable): return_index(bool): If `True`, this function returns only indices. If `False`, this function returns splitted dataset. Returns: SplittedDataset(tuple): splitted dataset or indices """ return super(ScaffoldSplitter, self)\ .train_valid_split(dataset, frac_train, frac_valid, converter, return_index, seed=seed, smiles_list=smiles_list, include_chirality=include_chirality, **kwargs) ================================================ FILE: chainer_chemistry/dataset/splitters/stratified_splitter.py ================================================ import numpy import pandas from chainer_chemistry.dataset.splitters.base_splitter import BaseSplitter from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset def _approximate_mode(class_counts, n_draws): """Referred scikit-learn, https://git.io/fPMmB""" n_class = len(class_counts) continuous = class_counts * n_draws / class_counts.sum() floored = numpy.floor(continuous) assert n_draws // n_class == floored.sum() // n_class n_remainder = int(n_draws - floored.sum()) remainder = continuous - floored inds = numpy.argsort(remainder)[::-1] inds = inds[:n_remainder] 
floored[inds] += 1 assert n_draws == floored.sum() return floored.astype(numpy.int) class StratifiedSplitter(BaseSplitter): """Class for doing stratified data splits.""" def _split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1, labels=None, **kwargs): numpy.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.) seed = kwargs.get('seed', None) label_axis = kwargs.get('label_axis', -1) task_index = kwargs.get('task_index', 0) n_bin = kwargs.get('n_bin', 10) task_type = kwargs.get('task_type', 'auto') if task_type not in ['classification', 'regression', 'auto']: raise ValueError("{} is invalid. Please use 'classification'," "'regression' or 'auto'".format(task_type)) rng = numpy.random.RandomState(seed) if isinstance(labels, list): labels = numpy.array(labels) elif labels is None: if not isinstance(dataset, NumpyTupleDataset): raise ValueError("Please assign label dataset.") labels = dataset.features[:, label_axis] if labels.ndim == 1: labels = labels else: labels = labels[:, task_index] if task_type == 'auto': if labels.dtype.kind == 'i': task_type = 'classification' elif labels.dtype.kind == 'f': task_type = 'regression' else: raise ValueError if task_type == 'classification': classes, labels = numpy.unique(labels, return_inverse=True) elif task_type == 'regression': classes = numpy.arange(n_bin) labels = pandas.qcut(labels, n_bin, labels=False) else: raise ValueError n_classes = classes.shape[0] n_total_valid = int(numpy.floor(frac_valid * len(dataset))) n_total_test = int(numpy.floor(frac_test * len(dataset))) class_counts = numpy.bincount(labels) class_indices = numpy.split(numpy.argsort(labels, kind='mergesort'), numpy.cumsum(class_counts)[:-1]) # n_total_train is the remainder: n - n_total_valid - n_total_test n_valid_samples = _approximate_mode(class_counts, n_total_valid) class_counts = class_counts - n_valid_samples n_test_samples = _approximate_mode(class_counts, n_total_test) train_index = [] valid_index = [] test_index = [] 
class StratifiedSplitter(BaseSplitter):
    """Class for doing stratified data splits.

    Each class (or quantile bin, for regression labels) contributes to the
    train/valid/test partitions in proportion to its size.
    """

    def _split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1,
               labels=None, **kwargs):
        numpy.testing.assert_almost_equal(frac_train + frac_valid + frac_test,
                                          1.)

        seed = kwargs.get('seed', None)
        label_axis = kwargs.get('label_axis', -1)
        task_index = kwargs.get('task_index', 0)
        n_bin = kwargs.get('n_bin', 10)
        task_type = kwargs.get('task_type', 'auto')
        if task_type not in ['classification', 'regression', 'auto']:
            raise ValueError("{} is invalid. Please use 'classification',"
                             "'regression' or 'auto'".format(task_type))

        rng = numpy.random.RandomState(seed)

        # Resolve the label array: either given explicitly, or pulled out of
        # a NumpyTupleDataset's feature columns.
        if isinstance(labels, list):
            labels = numpy.array(labels)
        elif labels is None:
            if not isinstance(dataset, NumpyTupleDataset):
                raise ValueError("Please assign label dataset.")
            labels = dataset.features[:, label_axis]
        # Multi-task label matrices are reduced to the selected task column.
        if labels.ndim == 1:
            labels = labels
        else:
            labels = labels[:, task_index]

        # 'auto' infers the task from the label dtype: integer labels are
        # treated as classes, floating labels as a regression target.
        if task_type == 'auto':
            if labels.dtype.kind == 'i':
                task_type = 'classification'
            elif labels.dtype.kind == 'f':
                task_type = 'regression'
            else:
                raise ValueError

        if task_type == 'classification':
            classes, labels = numpy.unique(labels, return_inverse=True)
        elif task_type == 'regression':
            # Regression targets are discretized into n_bin equal-frequency
            # bins so they can be stratified like classes.
            classes = numpy.arange(n_bin)
            labels = pandas.qcut(labels, n_bin, labels=False)
        else:
            raise ValueError

        n_classes = classes.shape[0]
        n_total_valid = int(numpy.floor(frac_valid * len(dataset)))
        n_total_test = int(numpy.floor(frac_test * len(dataset)))

        class_counts = numpy.bincount(labels)
        # Stable (mergesort) argsort groups indices by class; split at the
        # cumulative class counts to get one index array per class.
        class_indices = numpy.split(numpy.argsort(labels, kind='mergesort'),
                                    numpy.cumsum(class_counts)[:-1])

        # n_total_train is the remainder: n - n_total_valid - n_total_test
        n_valid_samples = _approximate_mode(class_counts, n_total_valid)
        class_counts = class_counts - n_valid_samples
        n_test_samples = _approximate_mode(class_counts, n_total_test)

        train_index = []
        valid_index = []
        test_index = []

        for i in range(n_classes):
            n_valid = n_valid_samples[i]
            n_test = n_test_samples[i]

            # Shuffle within the class, then carve out valid, test and
            # train slices in that order.
            perm = rng.permutation(len(class_indices[i]))
            class_perm_index = class_indices[i][perm]

            class_valid_index = class_perm_index[:n_valid]
            class_test_index = class_perm_index[n_valid:n_valid+n_test]
            class_train_index = class_perm_index[n_valid+n_test:]

            train_index.extend(class_train_index)
            valid_index.extend(class_valid_index)
            test_index.extend(class_test_index)

        assert n_total_valid == len(valid_index)
        assert n_total_test == len(test_index)

        return numpy.array(train_index), numpy.array(valid_index),\
            numpy.array(test_index),

    def train_valid_test_split(self, dataset, labels=None, label_axis=-1,
                               task_index=0, frac_train=0.8, frac_valid=0.1,
                               frac_test=0.1, converter=None,
                               return_index=True, seed=None,
                               task_type='auto', n_bin=10, **kwargs):
        """Split dataset into train, valid and test set.

        Split indices are generated by stratified splitting of labels.

        Args:
            dataset(NumpyTupleDataset, numpy.ndarray): Dataset.
            labels(numpy.ndarray): Target label. If `None`, this function
                assumes that dataset is an instance of `NumpyTupleDataset`.
            label_axis(int): Dataset feature axis in NumpyTupleDataset.
            task_index(int): Target task index in dataset for stratification.
            seed (int): Random seed.
            frac_train(float): Fraction of dataset put into training data.
            frac_valid(float): Fraction of dataset put into validation data.
            frac_test(float): Fraction of dataset put into test data.
            converter(callable): Maps (dataset, indices) to a sub-dataset.
            return_index(bool): If `True`, this function returns only
                indexes. If `False`, this function returns splitted dataset.
            task_type(str): One of 'classification', 'regression' or 'auto'.
            n_bin(int): Number of quantile bins used for regression labels.

        Returns:
            SplittedDataset(tuple): splitted dataset or indexes

        .. admonition:: Example

           >>> from chainer_chemistry.datasets import NumpyTupleDataset
           >>> from chainer_chemistry.dataset.splitters import StratifiedSplitter # NOQA
           >>>
           >>> a = numpy.random.random((10, 10))
           >>> b = numpy.random.random((10, 8))
           >>> c = numpy.random.random((10, 1))
           >>> d = NumpyTupleDataset(a, b, c)
           >>> splitter = StratifiedSplitter()
           >>> train, valid, test = splitter.train_valid_test_split(
           ...     d, return_index=False)
           >>> print(len(train), len(valid), len(test))
           8, 1, 1
        """
        return super(StratifiedSplitter, self)\
            .train_valid_test_split(dataset, frac_train, frac_valid,
                                    frac_test, converter, return_index,
                                    seed=seed, label_axis=label_axis,
                                    task_type=task_type,
                                    task_index=task_index, n_bin=n_bin,
                                    labels=labels, **kwargs)

    def train_valid_split(self, dataset, labels=None, label_axis=-1,
                          task_index=0, frac_train=0.9, frac_valid=0.1,
                          converter=None, return_index=True, seed=None,
                          task_type='auto', n_bin=10, **kwargs):
        """Split dataset into train and valid set.

        Split indices are generated by stratified splitting of labels.

        Args:
            dataset(NumpyTupleDataset, numpy.ndarray): Dataset.
            labels(numpy.ndarray): Target label. If `None`, this function
                assumes that dataset is an instance of `NumpyTupleDataset`.
            label_axis(int): Dataset feature axis in NumpyTupleDataset.
            task_index(int): Target task index in dataset for stratification.
            seed (int): Random seed.
            frac_train(float): Fraction of dataset put into training data.
            frac_valid(float): Fraction of dataset put into validation data.
            converter(callable): Maps (dataset, indices) to a sub-dataset.
            return_index(bool): If `True`, this function returns only
                indexes. If `False`, this function returns splitted dataset.
            task_type(str): One of 'classification', 'regression' or 'auto'.
            n_bin(int): Number of quantile bins used for regression labels.

        Returns:
            SplittedDataset(tuple): splitted dataset or indexes

        .. admonition:: Example

           >>> from chainer_chemistry.datasets import NumpyTupleDataset
           >>> from chainer_chemistry.dataset.splitters \
           >>>     import StratifiedSplitter
           >>> a = numpy.random.random((10, 10))
           >>> b = numpy.random.random((10, 8))
           >>> c = numpy.random.random((10, 1))
           >>> d = NumpyTupleDataset(a, b, c)
           >>> splitter = StratifiedSplitter()
           >>> train, valid = splitter.train_valid_split(
           ...     d, return_index=False)
           >>> print(len(train), len(valid))
           9, 1
        """
        return super(StratifiedSplitter, self)\
            .train_valid_split(dataset, frac_train, frac_valid, converter,
                               return_index, seed=seed,
                               label_axis=label_axis, task_type=task_type,
                               task_index=task_index, n_bin=n_bin,
                               labels=labels, **kwargs)
class TimeSplitter(BaseSplitter):
    """Splitter that orders examples chronologically: the oldest entries go
    to train, then valid, and the newest to test.
    """

    def _split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1,
               **kwargs):
        numpy.testing.assert_almost_equal(
            frac_train + frac_valid + frac_test, 1.)
        time_list = kwargs.get('time_list')

        n_example = len(dataset)
        train_cutoff = int(frac_train * n_example)
        valid_cutoff = int((frac_train + frac_valid) * n_example)

        # Dataset indices sorted by timestamp (stable for ties), truncated
        # to the dataset length.
        order = sorted(range(len(time_list)), key=lambda i: time_list[i])
        chronological = order[:n_example]

        return (numpy.array(chronological[:train_cutoff]),
                numpy.array(chronological[train_cutoff:valid_cutoff]),
                numpy.array(chronological[valid_cutoff:]))

    def train_valid_test_split(self, dataset, time_list=None, frac_train=0.8,
                               frac_valid=0.1, frac_test=0.1, converter=None,
                               return_index=True, **kwargs):
        """Split dataset into train, valid and test set.

        Split indices are generated by splitting based on time order.

        Args:
            dataset(NumpyTupleDataset, numpy.ndarray): Dataset.
            time_list(list): Time list corresponding to dataset.
            frac_train(float): Fraction of dataset put into training data.
            frac_valid(float): Fraction of dataset put into validation data.
            frac_test(float): Fraction of dataset put into test data.
            converter(callable): Maps (dataset, indices) to a sub-dataset.
            return_index(bool): If `True`, this function returns only
                indexes. If `False`, this function returns splitted dataset.

        Returns:
            SplittedDataset(tuple): splitted dataset or indices

        .. admonition:: Example

           >>> from chainer_chemistry.datasets import NumpyTupleDataset
           >>> from chainer_chemistry.dataset.splitters import TimeSplitter
           >>> a = numpy.random.random((10, 10))
           >>> b = numpy.random.random((10, 8))
           >>> c = numpy.random.random((10, 1))
           >>> d = NumpyTupleDataset(a, b, c)
           >>> splitter = TimeSplitter()
           >>> train, valid, test = splitter.train_valid_test_split(
           ...     d, return_index=False)
           >>> print(len(train), len(valid), len(test))
           8, 1, 1
        """
        return super(TimeSplitter, self).train_valid_test_split(
            dataset, frac_train, frac_valid, frac_test, converter,
            return_index, time_list=time_list, **kwargs)

    def train_valid_split(self, dataset, time_list=None, frac_train=0.9,
                          frac_valid=0.1, converter=None, return_index=True,
                          **kwargs):
        """Split dataset into train and valid set.

        Split indices are generated by splitting based on time order.

        Args:
            dataset(NumpyTupleDataset, numpy.ndarray): Dataset.
            time_list(list): Time list corresponding to dataset.
            frac_train(float): Fraction of dataset put into training data.
            frac_valid(float): Fraction of dataset put into validation data.
            converter(callable): Maps (dataset, indices) to a sub-dataset.
            return_index(bool): If `True`, this function returns only
                indexes. If `False`, this function returns splitted dataset.

        Returns:
            SplittedDataset(tuple): splitted dataset or indexes

        .. admonition:: Example

           >>> from chainer_chemistry.datasets import NumpyTupleDataset
           >>> from chainer_chemistry.dataset.splitters import TimeSplitter
           >>> a = numpy.random.random((10, 10))
           >>> b = numpy.random.random((10, 8))
           >>> c = numpy.random.random((10, 1))
           >>> d = NumpyTupleDataset(a, b, c)
           >>> splitter = TimeSplitter()
           >>> train, valid = splitter.train_valid_split(
           ...     d, return_index=False)
           >>> print(len(train), len(valid))
           9, 1
        """
        return super(TimeSplitter, self).train_valid_split(
            dataset, frac_train, frac_valid, converter, return_index,
            time_list=time_list, **kwargs)
class GaussianDistance(object):
    """Expand a distance into a Gaussian radial basis.

    Each distance ``d`` is mapped onto ``len(centers)`` features, where
    feature ``i`` equals ``exp(-(d - centers[i])**2 / width**2)``.

    Args:
        centers (numpy.ndarray): 1 dimensional array. The positions of the
            center of the peak in a gaussian function.
        width (float): Normal distribution in a gaussian function.
    """

    def __init__(self, centers=None, width=0.5):
        # Default basis: 20 evenly spaced centers on [0, 4].
        self.centers = numpy.linspace(0, 4, 20) if centers is None else centers
        self.width = width

    def expand(self, d):
        """Expand a single distance with the configured parameters.

        Args:
            d (float): distance

        Returns:
            numpy.ndarray: 1-D float32 array with one entry per center.
        """
        diff = d - self.centers
        return numpy.exp(-diff ** 2 / self.width ** 2, dtype=numpy.float32)

    def expand_from_distances(self, distances):
        """Expand an array of distances with the configured parameters.

        The original implementation is below.
        https://github.com/txie-93/cgcnn/blob/fdcd7eec8771e223e60e1b0abf7e6c7bc7d006bf/cgcnn/data.py#L152

        Args:
            distances (numpy.ndarray): 1 dimensional array.

        Returns:
            numpy.ndarray: 2 dimensional float32 array. First axis size is
            the number of distances, second axis size is the number of
            centers.
        """
        diff = distances[..., numpy.newaxis] - self.centers
        return numpy.exp(-diff ** 2 / self.width ** 2, dtype=numpy.float32)
def citation_to_networkx(dirpath, name):
    """Load a citation dataset (e.g. cora, citeseer) into a networkx Graph.

    Reads ``<name>.content`` for node features and class labels, and
    ``<name>.cites`` for citation edges. String class labels are compressed
    to consecutive integer ids; the number of distinct labels is stored in
    ``G.graph['label_num']``. Edges whose endpoints are missing from the
    content file are skipped with a warning.
    """
    G = nx.Graph()

    # node feature, node label
    with open(os.path.join(dirpath, "{}.content".format(name))) as f:
        lines = f.readlines()

    label_to_id = {}
    acc = 0
    for line in tqdm(lines):
        fields = line.split()
        key, raw_label = fields[0], fields[-1]
        if raw_label in label_to_id:
            label = label_to_id[raw_label]
        else:
            # First time we see this class label: assign the next id.
            label_to_id[raw_label] = acc
            label = acc
            acc += 1
        G.add_node(key,
                   x=numpy.array(fields[1:-1], dtype=numpy.float32),
                   y=label)
    G.graph['label_num'] = acc

    # edge
    with open(os.path.join(dirpath, "{}.cites".format(name))) as f:
        lines = f.readlines()

    for line in tqdm(lines):
        u, v = line.split()
        if u not in G.nodes.keys():
            print("Warning: {} does not appear in {}{}.content".format(
                u, dirpath, name))
        elif v not in G.nodes.keys():
            print("Warning: {} does not appear in {}{}.content".format(
                v, dirpath, name))
        else:
            G.add_edge(u, v)

    G = nx.convert_node_labels_to_integers(G)
    print("Finished loading graph: {}".format(dirpath))
    print("number of nodes: {}, number of edges: {}".format(
        G.number_of_nodes(), G.number_of_edges()
    ))
    return G
def _get_citeseer_filepath():
    # type: () -> Tuple[str, str]
    """Construct a filepath which stores citeseer dataset.

    This method does not check if the file is already downloaded or not.

    Returns:
        feat_cache_path (str): file path for citeseer dataset (features).
        edge_cache_path (str): file path for citeseer dataset (edge index).
    """
    cache_root = download.get_dataset_directory(_root)
    feat_cache_path = os.path.join(cache_root, feat_file_name)
    edge_cache_path = os.path.join(cache_root, edge_file_name)
    return feat_cache_path, edge_cache_path


def download_and_extract_citeseer(save_dirpath):
    # type: (str) -> bool
    """Download the citeseer archive and extract it beside ``save_dirpath``.

    Returns:
        bool: ``True`` on completion.
    """
    print('downloading citeseer dataset...')
    download_file_path = download.cached_download(download_url)
    print('extracting citeseer dataset...')
    # Close the tarfile deterministically; the original left the handle
    # open until garbage collection.
    with tarfile.open(download_file_path, 'r') as tf:
        tf.extractall(os.path.dirname(save_dirpath))
    return True
def get_cora_label_names():
    # type: () -> List[str]
    """Return label names of Cora dataset."""
    return _label_names


def get_cora_dirpath(download_if_not_exist=True):
    # type: (bool) -> str
    """Construct a dirpath which stores Cora dataset.

    This method check whether the file exist or not, and downloaded it
    if necessary.

    Args:
        download_if_not_exist (bool): If ``True``, download dataset
            if it is not downloaded yet.

    Returns:
        dirpath (str): directory path for Cora dataset.
    """
    feat_cache_path, edge_cache_path = get_cora_filepath(
        download_if_not_exist=download_if_not_exist)
    dirpath = os.path.dirname(feat_cache_path)
    dirpath2 = os.path.dirname(edge_cache_path)
    # Both cached files live in the same dataset directory.
    assert dirpath == dirpath2
    return dirpath


def get_cora_filepath(download_if_not_exist=True):
    # type: (bool) -> Tuple[str, str]
    """Construct a filepath which stores Cora dataset.

    This method check whether the file exist or not, and downloaded it
    if necessary.

    Args:
        download_if_not_exist (bool): If ``True``, download dataset
            if it is not downloaded yet.

    Returns:
        feat_cache_path (str): file path for Cora dataset (features).
        edge_cache_path (str): file path for Cora dataset (edge index).
    """
    feat_cache_path, edge_cache_path = _get_cora_filepath()
    # Only the feature file is checked; both files come from one archive.
    if not os.path.exists(feat_cache_path):
        if download_if_not_exist:
            is_successful = download_and_extract_cora(
                save_dirpath=os.path.dirname(feat_cache_path))
            if not is_successful:
                logger = getLogger(__name__)
                logger.warning('Download failed.')
    return feat_cache_path, edge_cache_path


def _get_cora_filepath():
    # type: () -> Tuple[str, str]
    """Construct a filepath which stores Cora dataset.

    This method does not check if the file is already downloaded or not.

    Returns:
        feat_cache_path (str): file path for Cora dataset (features).
        edge_cache_path (str): file path for Cora dataset (edge index).
    """
    cache_root = download.get_dataset_directory(_root)
    feat_cache_path = os.path.join(cache_root, feat_file_name)
    edge_cache_path = os.path.join(cache_root, edge_file_name)
    return feat_cache_path, edge_cache_path


def download_and_extract_cora(save_dirpath):
    # type: (str) -> bool
    """Download the Cora archive and extract it beside ``save_dirpath``.

    Returns:
        bool: ``True`` on completion.
    """
    print('downloading cora dataset...')
    download_file_path = download.cached_download(download_url)
    print('extracting cora dataset...')
    # Close the tarfile deterministically; the original left the handle
    # open until garbage collection.
    with tarfile.open(download_file_path, 'r') as tf:
        tf.extractall(os.path.dirname(save_dirpath))
    return True
'CHEMBL1871', 'CHEMBL1873', 'CHEMBL1875', 'CHEMBL1881', 'CHEMBL1889', 'CHEMBL1898', 'CHEMBL1899', 'CHEMBL1900', 'CHEMBL1901', 'CHEMBL1906', 'CHEMBL1908', 'CHEMBL1913', 'CHEMBL1914', 'CHEMBL1916', 'CHEMBL1917', 'CHEMBL1921', 'CHEMBL1926', 'CHEMBL1936', 'CHEMBL1937', 'CHEMBL1941', 'CHEMBL1942', 'CHEMBL1951', 'CHEMBL1952', 'CHEMBL1955', 'CHEMBL1957', 'CHEMBL1968', 'CHEMBL1974', 'CHEMBL1977', 'CHEMBL1978', 'CHEMBL1980', 'CHEMBL1981', 'CHEMBL1983', 'CHEMBL1991', 'CHEMBL1994', 'CHEMBL1995', 'CHEMBL2000', 'CHEMBL2007', 'CHEMBL2014', 'CHEMBL2016', 'CHEMBL202', 'CHEMBL2028', 'CHEMBL203', 'CHEMBL2034', 'CHEMBL2035', 'CHEMBL2039', 'CHEMBL204', 'CHEMBL2041', 'CHEMBL2047', 'CHEMBL2049', 'CHEMBL205', 'CHEMBL2056', 'CHEMBL206', 'CHEMBL2069', 'CHEMBL208', 'CHEMBL209', 'CHEMBL210', 'CHEMBL211', 'CHEMBL213', 'CHEMBL214', 'CHEMBL2147', 'CHEMBL2148', 'CHEMBL215', 'CHEMBL216', 'CHEMBL217', 'CHEMBL218', 'CHEMBL2185', 'CHEMBL219', 'CHEMBL220', 'CHEMBL2208', 'CHEMBL221', 'CHEMBL222', 'CHEMBL223', 'CHEMBL224', 'CHEMBL2243', 'CHEMBL225', 'CHEMBL226', 'CHEMBL2265', 'CHEMBL2274', 'CHEMBL2276', 'CHEMBL228', 'CHEMBL2285', 'CHEMBL229', 'CHEMBL2292', 'CHEMBL230', 'CHEMBL2304402', 'CHEMBL2304404', 'CHEMBL231', 'CHEMBL2318', 'CHEMBL232', 'CHEMBL2326', 'CHEMBL2327', 'CHEMBL2329', 'CHEMBL233', 'CHEMBL2334', 'CHEMBL2335', 'CHEMBL2337', 'CHEMBL234', 'CHEMBL2345', 'CHEMBL235', 'CHEMBL236', 'CHEMBL2363', 'CHEMBL2366505', 'CHEMBL2366516', 'CHEMBL237', 'CHEMBL238', 'CHEMBL239', 'CHEMBL2391', 'CHEMBL2397', 'CHEMBL240', 'CHEMBL2409', 'CHEMBL241', 'CHEMBL242', 'CHEMBL2425', 'CHEMBL2431', 'CHEMBL244', 'CHEMBL245', 'CHEMBL246', 'CHEMBL247', 'CHEMBL2470', 'CHEMBL248', 'CHEMBL2487', 'CHEMBL2488', 'CHEMBL2489', 'CHEMBL249', 'CHEMBL2492', 'CHEMBL2499', 'CHEMBL251', 'CHEMBL2525', 'CHEMBL2527', 'CHEMBL253', 'CHEMBL2534', 'CHEMBL254', 'CHEMBL255', 'CHEMBL256', 'CHEMBL2563', 'CHEMBL2564', 'CHEMBL2575', 'CHEMBL258', 'CHEMBL2581', 'CHEMBL259', 'CHEMBL2599', 'CHEMBL260', 'CHEMBL261', 'CHEMBL262', 'CHEMBL2622', 
'CHEMBL2637', 'CHEMBL264', 'CHEMBL265', 'CHEMBL2652', 'CHEMBL267', 'CHEMBL268', 'CHEMBL269', 'CHEMBL2695', 'CHEMBL270', 'CHEMBL2716', 'CHEMBL2722', 'CHEMBL273', 'CHEMBL274', 'CHEMBL2742', 'CHEMBL2749', 'CHEMBL275', 'CHEMBL276', 'CHEMBL2782', 'CHEMBL2789', 'CHEMBL279', 'CHEMBL280', 'CHEMBL2803', 'CHEMBL2808', 'CHEMBL2815', 'CHEMBL2820', 'CHEMBL2828', 'CHEMBL283', 'CHEMBL2835', 'CHEMBL284', 'CHEMBL2842', 'CHEMBL2858', 'CHEMBL286', 'CHEMBL2868', 'CHEMBL287', 'CHEMBL2871', 'CHEMBL288', 'CHEMBL2882', 'CHEMBL2885', 'CHEMBL2902', 'CHEMBL2903', 'CHEMBL2949', 'CHEMBL2954', 'CHEMBL2959', 'CHEMBL2971', 'CHEMBL2973', 'CHEMBL298', 'CHEMBL299', 'CHEMBL2993', 'CHEMBL2996', 'CHEMBL301', 'CHEMBL3012', 'CHEMBL3018', 'CHEMBL302', 'CHEMBL3024', 'CHEMBL3025', 'CHEMBL3037', 'CHEMBL304', 'CHEMBL3045', 'CHEMBL3048', 'CHEMBL3060', 'CHEMBL3072', 'CHEMBL308', 'CHEMBL309', 'CHEMBL3105', 'CHEMBL3116', 'CHEMBL312', 'CHEMBL313', 'CHEMBL3130', 'CHEMBL3138', 'CHEMBL3142', 'CHEMBL3145', 'CHEMBL3155', 'CHEMBL3166', 'CHEMBL318', 'CHEMBL3180', 'CHEMBL3181', 'CHEMBL319', 'CHEMBL3192', 'CHEMBL3199', 'CHEMBL3202', 'CHEMBL321', 'CHEMBL322', 'CHEMBL3222', 'CHEMBL3223', 'CHEMBL3227', 'CHEMBL3229', 'CHEMBL3230', 'CHEMBL3231', 'CHEMBL324', 'CHEMBL3242', 'CHEMBL325', 'CHEMBL326', 'CHEMBL3267', 'CHEMBL3286', 'CHEMBL3305', 'CHEMBL331', 'CHEMBL3310', 'CHEMBL332', 'CHEMBL333', 'CHEMBL3332', 'CHEMBL335', 'CHEMBL3351', 'CHEMBL3358', 'CHEMBL3360', 'CHEMBL3361', 'CHEMBL3371', 'CHEMBL338', 'CHEMBL339', 'CHEMBL3403', 'CHEMBL3426', 'CHEMBL3438', 'CHEMBL344', 'CHEMBL3464', 'CHEMBL3468', 'CHEMBL3471', 'CHEMBL3473', 'CHEMBL3476', 'CHEMBL3510', 'CHEMBL3522', 'CHEMBL3524', 'CHEMBL3553', 'CHEMBL3563', 'CHEMBL3568', 'CHEMBL3571', 'CHEMBL3582', 'CHEMBL3587', 'CHEMBL3594', 'CHEMBL3602', 'CHEMBL3614', 'CHEMBL3629', 'CHEMBL3650', 'CHEMBL3687', 'CHEMBL3699', 'CHEMBL3706', 'CHEMBL3710', 'CHEMBL3717', 'CHEMBL3729', 'CHEMBL3746', 'CHEMBL3759', 'CHEMBL3766', 'CHEMBL3769', 'CHEMBL3772', 'CHEMBL3775', 'CHEMBL3776', 'CHEMBL3778', 
'CHEMBL3788', 'CHEMBL3795', 'CHEMBL3807', 'CHEMBL3836', 'CHEMBL3837', 'CHEMBL3864', 'CHEMBL3869', 'CHEMBL3892', 'CHEMBL3910', 'CHEMBL3912', 'CHEMBL3920', 'CHEMBL3943', 'CHEMBL3952', 'CHEMBL3969', 'CHEMBL3976', 'CHEMBL3979', 'CHEMBL3983', 'CHEMBL3991', 'CHEMBL3996', 'CHEMBL4005', 'CHEMBL4040', 'CHEMBL4072', 'CHEMBL4073', 'CHEMBL4077', 'CHEMBL4078', 'CHEMBL4093', 'CHEMBL4102', 'CHEMBL4124', 'CHEMBL4128', 'CHEMBL4140', 'CHEMBL4142', 'CHEMBL4145', 'CHEMBL4150', 'CHEMBL4153', 'CHEMBL4179', 'CHEMBL4188', 'CHEMBL4191', 'CHEMBL4203', 'CHEMBL4204', 'CHEMBL4224', 'CHEMBL4235', 'CHEMBL4247', 'CHEMBL4282', 'CHEMBL4296', 'CHEMBL4302', 'CHEMBL4321', 'CHEMBL4333', 'CHEMBL4336', 'CHEMBL4354', 'CHEMBL4361', 'CHEMBL4372', 'CHEMBL4393', 'CHEMBL4409', 'CHEMBL4414', 'CHEMBL4427', 'CHEMBL4429', 'CHEMBL4439', 'CHEMBL4465', 'CHEMBL4471', 'CHEMBL4477', 'CHEMBL4481', 'CHEMBL4482', 'CHEMBL4501', 'CHEMBL4523', 'CHEMBL4561', 'CHEMBL4586', 'CHEMBL4588', 'CHEMBL4599', 'CHEMBL4600', 'CHEMBL4608', 'CHEMBL4618', 'CHEMBL4625', 'CHEMBL4630', 'CHEMBL4644', 'CHEMBL4653', 'CHEMBL4657', 'CHEMBL4662', 'CHEMBL4681', 'CHEMBL4683', 'CHEMBL4687', 'CHEMBL4696', 'CHEMBL4722', 'CHEMBL4768', 'CHEMBL4777', 'CHEMBL4779', 'CHEMBL4780', 'CHEMBL4789', 'CHEMBL4792', 'CHEMBL4793', 'CHEMBL4794', 'CHEMBL4801', 'CHEMBL4802', 'CHEMBL4803', 'CHEMBL4804', 'CHEMBL4816', 'CHEMBL4822', 'CHEMBL4828', 'CHEMBL4829', 'CHEMBL4860', 'CHEMBL4895', 'CHEMBL4899', 'CHEMBL4975', 'CHEMBL4980', 'CHEMBL5017', 'CHEMBL5024', 'CHEMBL5067', 'CHEMBL5071', 'CHEMBL5076', 'CHEMBL5077', 'CHEMBL5103', 'CHEMBL5113', 'CHEMBL5122', 'CHEMBL5145', 'CHEMBL5147', 'CHEMBL5160', 'CHEMBL5205', 'CHEMBL5251', 'CHEMBL5314', 'CHEMBL5328', 'CHEMBL5331', 'CHEMBL5373', 'CHEMBL5407', 'CHEMBL5414', 'CHEMBL5441', 'CHEMBL5445', 'CHEMBL5457', 'CHEMBL5491', 'CHEMBL5508', 'CHEMBL5543', 'CHEMBL5570', 'CHEMBL5631', 'CHEMBL5658', 'CHEMBL5669', 'CHEMBL5763', 'CHEMBL5800', 'CHEMBL5847', 'CHEMBL5932', 'CHEMBL6007', 'CHEMBL6009', 'CHEMBL6137', 'CHEMBL6140', 'CHEMBL6154', 
    'CHEMBL6164', 'CHEMBL6166', 'CHEMBL6184']


================================================
FILE: chainer_chemistry/datasets/molnet/molnet.py
================================================
import joblib
from logging import getLogger
import os
import shutil
import tarfile

import numpy
import pandas

from chainer.dataset import download

from chainer_chemistry.dataset.parsers.csv_file_parser import CSVFileParser
from chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor  # NOQA
from chainer_chemistry.dataset.splitters.base_splitter import BaseSplitter
from chainer_chemistry.dataset.splitters.scaffold_splitter import ScaffoldSplitter  # NOQA
from chainer_chemistry.dataset.splitters.deepchem_scaffold_splitter import DeepChemScaffoldSplitter  # NOQA
from chainer_chemistry.dataset.splitters import split_method_dict
from chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config  # NOQA
from chainer_chemistry.datasets.molnet.pdbbind_time import get_pdbbind_time
from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset

# Cache sub-directory (under chainer's dataset root) for MoleculeNet files.
_root = 'pfnet/chainer/molnet'


def get_molnet_dataset(dataset_name, preprocessor=None, labels=None,
                       split=None, frac_train=.8, frac_valid=.1, frac_test=.1,
                       seed=777, return_smiles=False, return_pdb_id=False,
                       target_index=None, task_index=0, **kwargs):
    """Downloads, caches and preprocess MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            detail of MoleculeNet, please refer to
            `official site `_
            If you would like to know what dataset_name is available for
            chainer_chemistry, please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        split (str or BaseSplitter or None): How to split dataset into train,
            validation and test. If `None`, this functions use the splitter
            that is recommended by MoleculeNet. Additionally You can use an
            instance of BaseSplitter or choose it from 'random', 'stratified'
            and 'scaffold'.
        return_smiles (bool): If set to ``True``,
            smiles array is also returned.
        return_pdb_id (bool): If set to ``True``, PDB ID array is also
            returned. This argument is only used when you select
            'pdbbind_smiles'.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
        task_index (int): Target task index in dataset for stratification.
            (Stratified Splitter only)

    Returns (dict):
        Dictionary that contains dataset that is already split into train,
        valid and test dataset and 1-d numpy array with dtype=object(string)
        which is a vector of smiles for each example or `None`.

    """
    # NOTE(review): `seed` is accepted but not referenced anywhere in this
    # visible body — presumably splitters pick it up via **kwargs; confirm.
    if dataset_name not in molnet_default_config:
        raise ValueError("We don't support {} dataset. Please choose from {}".
                         format(dataset_name,
                                list(molnet_default_config.keys())))

    # The two PDBbind flavors have dedicated loaders; delegate early.
    if dataset_name == 'pdbbind_grid':
        pdbbind_subset = kwargs.get('pdbbind_subset')
        return get_pdbbind_grid(pdbbind_subset, split=split,
                                frac_train=frac_train, frac_valid=frac_valid,
                                frac_test=frac_test, task_index=task_index)
    if dataset_name == 'pdbbind_smiles':
        pdbbind_subset = kwargs.get('pdbbind_subset')
        time_list = kwargs.get('time_list')
        return get_pdbbind_smiles(pdbbind_subset, preprocessor=preprocessor,
                                  labels=labels, split=split,
                                  frac_train=frac_train,
                                  frac_valid=frac_valid,
                                  frac_test=frac_test,
                                  return_smiles=return_smiles,
                                  return_pdb_id=return_pdb_id,
                                  target_index=target_index,
                                  task_index=task_index,
                                  time_list=time_list)

    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [labels, ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    # Labels are post-processed depending on the task type:
    # regression -> float32 array; classification -> int32 with NaN -> -1
    # (used as the "ignore" label by sigmoid_cross_entropy).
    if dataset_config['task_type'] == 'regression':
        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':
        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)
    # NOTE(review): there is no branch for task_type == 'mix' (used by the
    # chembl config), so `postprocess_label` would be unbound below —
    # confirm whether 'mix' is expected to reach this code path.

    parser = CSVFileParser(preprocessor, labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        split = dataset_config['split'] if split is None else split

        # Resolve `split` into a splitter instance.
        if isinstance(split, str):
            splitter = split_method_dict[split]()
        elif isinstance(split, BaseSplitter):
            splitter = split
        else:
            raise TypeError("split must be None, str or instance of"
                            " BaseSplitter, but got {}".format(type(split)))

        # Scaffold splitters need SMILES regardless of the caller's choice.
        if isinstance(splitter, (ScaffoldSplitter, DeepChemScaffoldSplitter)):
            get_smiles = True
        else:
            get_smiles = return_smiles

        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=get_smiles,
                              target_index=target_index, **kwargs)
        dataset = result['dataset']
        smiles = result['smiles']

        train_ind, valid_ind, test_ind = \
            splitter.train_valid_test_split(dataset, smiles_list=smiles,
                                            task_index=task_index,
                                            frac_train=frac_train,
                                            frac_valid=frac_valid,
                                            frac_test=frac_test, **kwargs)
        train = NumpyTupleDataset(*dataset.features[train_ind])
        valid = NumpyTupleDataset(*dataset.features[valid_ind])
        test = NumpyTupleDataset(*dataset.features[test_ind])

        result['dataset'] = (train, valid, test)
        if return_smiles:
            train_smiles = smiles[train_ind]
            valid_smiles = smiles[valid_ind]
            test_smiles = smiles[test_ind]
            result['smiles'] = (train_smiles, valid_smiles, test_smiles)
        else:
            result['smiles'] = None
    elif dataset_config['dataset_type'] == 'separate_csv':
        # Pre-split datasets: parse the three CSV files independently.
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name,
                                                        'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name,
                                                        'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
return_smiles=return_smiles, target_index=target_index) result['dataset'] = (train_result['dataset'], valid_result['dataset'], test_result['dataset']) result['smiles'] = (train_result['smiles'], valid_result['smiles'], test_result['smiles']) else: raise ValueError('dataset_type={} is not supported' .format(dataset_config['dataset_type'])) return result def get_molnet_dataframe(dataset_name, pdbbind_subset=None): """Downloads, caches and get the dataframe of MoleculeNet dataset. Args: dataset_name (str): MoleculeNet dataset name. If you want to know the detail of MoleculeNet, please refer to `official site `_ If you would like to know what dataset_name is available for chainer_chemistry, please refer to `molnet_config.py`. pdbbind_subset (str): PDBbind dataset subset name. If you want to know the detail of subset, please refer to `official site ` Returns (pandas.DataFrame or tuple): DataFrame of dataset without any preprocessing. When the files of dataset are seprated, this function returns multiple DataFrame. """ if dataset_name not in molnet_default_config: raise ValueError("We don't support {} dataset. Please choose from {}". format(dataset_name, list(molnet_default_config.keys()))) if dataset_name == 'pdbbind_grid': raise ValueError('pdbbind_grid dataset is not supported. 
Please ', 'choose pdbbind_smiles dataset.') dataset_config = molnet_default_config[dataset_name] if dataset_config['dataset_type'] == 'one_file_csv': df = pandas.read_csv(get_molnet_filepath( dataset_name, pdbbind_subset=pdbbind_subset)) return df elif dataset_config['dataset_type'] == 'separate_csv': train_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'train')) valid_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'valid')) test_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'test')) return train_df, valid_df, test_df else: raise ValueError('dataset_type={} is not supported' .format(dataset_config['dataset_type'])) def get_molnet_filepath(dataset_name, filetype='onefile', download_if_not_exist=True, pdbbind_subset=None): """Construct a file path which stores MoleculeNet dataset. This method check whether the file exist or not, and downloaded it if necessary. Args: dataset_name (str): MoleculeNet dataset name. file_type (str): either 'onefile', 'train', 'valid', 'test' download_if_not_exist (bool): Download a file if it does not exist. 
    Returns (str): filepath for specific MoleculeNet dataset

    """
    filetype_supported = ['onefile', 'train', 'valid', 'test']
    if filetype not in filetype_supported:
        raise ValueError("filetype {} not supported, please choose filetype "
                         "from {}".format(filetype, filetype_supported))
    # The config stores the single-file URL under 'url' and the pre-split
    # files under 'train_url' / 'valid_url' / 'test_url'.
    if filetype == 'onefile':
        url_key = 'url'
    else:
        url_key = filetype + '_url'
    # pdbbind_smiles keeps one URL per subset ('core'/'full'/'refined').
    if dataset_name == 'pdbbind_smiles':
        file_url = molnet_default_config[dataset_name][url_key][pdbbind_subset]
    else:
        file_url = molnet_default_config[dataset_name][url_key]
    file_name = file_url.split('/')[-1]
    cache_path = _get_molnet_filepath(file_name)
    if not os.path.exists(cache_path):
        if download_if_not_exist:
            is_successful = download_dataset(file_url,
                                             save_filepath=cache_path)
            if not is_successful:
                logger = getLogger(__name__)
                logger.warning('Download failed.')
    return cache_path


def _get_molnet_filepath(file_name):
    """Construct a filepath which stores MoleculeNet dataset in csv

    This method does not check if the file is already downloaded or not.

    Args:
        file_name (str): file name of MoleculeNet dataset

    Returns (str): filepath for one of MoleculeNet dataset
    """
    cache_root = download.get_dataset_directory(_root)
    cache_path = os.path.join(cache_root, file_name)
    return cache_path


def download_dataset(dataset_url, save_filepath):
    """Download and caches MoleculeNet Dataset

    Args:
        dataset_url (str): URL of dataset
        save_filepath (str): filepath for dataset

    Returns (bool): If success downloading, returning `True`.
    """
    logger = getLogger(__name__)
    logger.warning('Downloading {} dataset, it takes time...'
                   .format(dataset_url.split('/')[-1]))
    download_file_path = download.cached_download(dataset_url)
    # NOTE(review): moving (not copying) removes the file from chainer's
    # shared download cache, so a later cached_download of the same URL
    # re-downloads it — confirm this is intentional.
    shutil.move(download_file_path, save_filepath)
    # pandas can load gzipped or tarball csv file
    return True


def get_pdbbind_smiles(pdbbind_subset, preprocessor=None, labels=None,
                       split=None, frac_train=.8, frac_valid=.1,
                       frac_test=.1, return_smiles=False,
                       return_pdb_id=True, target_index=None, task_index=0,
                       time_list=None, **kwargs):
    """Downloads, caches and preprocess PDBbind dataset.

    Args:
        pdbbind_subset (str): PDBbind dataset subset name. If you want to
            know the detail of subset, please refer to `official site `
        preprocessor (BasePreprocessor): Preprocessor.
            It should be chosen based on the network to be trained.
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        split (str or BaseSplitter or None): How to split dataset into train,
            validation and test. If `None`, this functions use the splitter
            that is recommended by MoleculeNet. Additionally You can use an
            instance of BaseSplitter or choose it from 'random', 'stratified'
            and 'scaffold'.
        return_smiles (bool): If set to ``True``,
            smiles array is also returned.
        return_pdb_id (bool): If set to ``True``, PDB ID array is also
            returned. This argument is only used when you select
            'pdbbind_smiles'.
        target_index (list or None): target index list to partially extract
            dataset. If `None` (default), all examples are parsed.
        task_index (int): Target task index in dataset for stratification.
            (Stratified Splitter only)

    Returns (dict):
        Dictionary that contains dataset that is already split into train,
        valid and test dataset and 1-d numpy arrays with
        dtype=object(string) which are vectors of smiles and pdb_id for
        each example or `None`.
""" config = molnet_default_config['pdbbind_smiles'] labels = labels or config['tasks'] if isinstance(labels, str): labels = [labels, ] if preprocessor is None: preprocessor = AtomicNumberPreprocessor() def postprocess_label(label_list): return numpy.asarray(label_list, dtype=numpy.float32) parser = CSVFileParser(preprocessor, labels=labels, smiles_col=config['smiles_columns'], postprocess_label=postprocess_label) split = config['split'] if split is None else split if isinstance(split, str): splitter = split_method_dict[split]() elif isinstance(split, BaseSplitter): splitter = split else: raise TypeError("split must be None, str or instance of" " BaseSplitter, but got {}".format(type(split))) result = parser.parse(get_molnet_filepath('pdbbind_smiles', pdbbind_subset=pdbbind_subset), return_smiles=return_smiles, return_is_successful=True, target_index=target_index) dataset = result['dataset'] smiles = result['smiles'] is_successful = result['is_successful'] if return_pdb_id: df = pandas.read_csv( get_molnet_filepath('pdbbind_smiles', pdbbind_subset=pdbbind_subset)) pdb_id = df['id'][is_successful] else: pdb_id = None train_ind, valid_ind, test_ind = \ splitter.train_valid_test_split(dataset, time_list=time_list, smiles_list=smiles, task_index=task_index, frac_train=frac_train, frac_valid=frac_valid, frac_test=frac_test, **kwargs) train = NumpyTupleDataset(*dataset.features[train_ind]) valid = NumpyTupleDataset(*dataset.features[valid_ind]) test = NumpyTupleDataset(*dataset.features[test_ind]) result['dataset'] = (train, valid, test) if return_smiles: train_smiles = smiles[train_ind] valid_smiles = smiles[valid_ind] test_smiles = smiles[test_ind] result['smiles'] = (train_smiles, valid_smiles, test_smiles) else: result['smiles'] = None if return_pdb_id: train_pdb_id = pdb_id[train_ind] valid_pdb_id = pdb_id[valid_ind] test_pdb_id = pdb_id[test_ind] result['pdb_id'] = (train_pdb_id, valid_pdb_id, test_pdb_id) else: result['pdb_id'] = None return result def 
get_pdbbind_grid(pdbbind_subset, split=None, frac_train=.8, frac_valid=.1, frac_test=.1, task_index=0, **kwargs): """Downloads, caches and grid-featurize PDBbind dataset. Args: pdbbind_subset (str): PDBbind dataset subset name. If you want to know the detail of subset, please refer to `official site ` split (str or BaseSplitter or None): How to split dataset into train, validation and test. If `None`, this functions use the splitter that is recommended by MoleculeNet. Additionally You can use an instance of BaseSplitter or choose it from 'random', 'stratified' and 'scaffold'. task_index (int): Target task index in dataset for stratification. (Stratified Splitter only) Returns (dict): Dictionary that contains dataset that is already split into train, valid and test dataset and 1-d numpy arrays with dtype=object(string) which are vectors of smiles and pdb_id for each example or `None`. """ result = {} dataset = get_grid_featurized_pdbbind_dataset(pdbbind_subset) if split is None: split = molnet_default_config['pdbbind_grid']['split'] if isinstance(split, str): splitter = split_method_dict[split]() elif isinstance(split, BaseSplitter): splitter = split else: raise TypeError("split must be None, str, or instance of" " BaseSplitter, but got {}".format(type(split))) time_list = get_pdbbind_time() train_ind, valid_ind, test_ind = \ splitter.train_valid_test_split(dataset, time_list=time_list, smiles_list=None, task_index=task_index, frac_train=frac_train, frac_valid=frac_valid, frac_test=frac_test, **kwargs) train = NumpyTupleDataset(*dataset.features[train_ind]) valid = NumpyTupleDataset(*dataset.features[valid_ind]) test = NumpyTupleDataset(*dataset.features[test_ind]) result['dataset'] = (train, valid, test) result['smiles'] = None return result def get_grid_featurized_pdbbind_dataset(subset): """Downloads and caches grid featurized PDBBind dataset. Args: subset (str): subset name of PDBBind dataset. Returns (NumpyTupleDataset): grid featurized PDBBind dataset. 
""" x_path, y_path = get_grid_featurized_pdbbind_filepath(subset) x = joblib.load(x_path).astype('i') y = joblib.load(y_path).astype('f') dataset = NumpyTupleDataset(x, y) return dataset def get_grid_featurized_pdbbind_dirpath(subset, download_if_not_exist=True): """Construct a directory path which stores grid featurized PDBBind dataset. This method check whether the file exist or not, and downloaded it if necessary. Args: subset (str): subset name of PDBBind dataset. download_if_not_exist (bool): Download a file if it does not exist. Returns (str): directory path for specific subset of PDBBind dataset. """ subset_supported = ['core', 'full', 'refined'] if subset not in subset_supported: raise ValueError("subset {} not supported, please choose filetype " "from {}".format(subset, subset_supported)) file_url = \ molnet_default_config['pdbbind_grid']['url'][subset] file_name = file_url.split('/')[-1] cache_path = _get_molnet_filepath(file_name) if not os.path.exists(cache_path): if download_if_not_exist: is_successful = download_dataset(file_url, save_filepath=cache_path) if not is_successful: logger = getLogger(__name__) logger.warning('Download failed.') return cache_path def get_grid_featurized_pdbbind_filepath(subset): """Construct a filepath which stores featurized PDBBind dataset in joblib This method does not check if the file is already downloaded or not. 
Args: subset (str): subset name of PDBBind dataset Returns: x_path (str): filepath for feature vectors y_path (str): filepath for -logKd/Ki """ dirpath = get_grid_featurized_pdbbind_dirpath(subset=subset) savedir = '/'.join(dirpath.split('/')[:-1]) + '/' with tarfile.open(dirpath, 'r:gz') as tar: tar.extractall(savedir) x_path = savedir + subset + '_grid/shard-0-X.joblib' y_path = savedir + subset + '_grid/shard-0-y.joblib' return x_path, y_path ================================================ FILE: chainer_chemistry/datasets/molnet/molnet_config.py ================================================ import chainer.functions as F import chainer_chemistry from chainer_chemistry.datasets.molnet.chembl_tasks import chembl_tasks from chainer_chemistry.datasets.molnet.toxcast_tasks import toxcast_tasks from chainer_chemistry.functions import mean_absolute_error from chainer_chemistry.functions import mean_squared_error from chainer_chemistry.training.extensions.prc_auc_evaluator import PRCAUCEvaluator # NOQA from chainer_chemistry.training.extensions.roc_auc_evaluator import ROCAUCEvaluator # NOQA molnet_base = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/' featurized_base = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/' \ + 'featurized_datasets/' def mae(x, t): return mean_absolute_error(x, t, ignore_nan=True) def mse(x, t): return mean_squared_error(x, t, ignore_nan=True) def rmse(x, t): return F.sqrt(mse(x, t)) def r2_score(x, t): return chainer_chemistry.functions.r2_score(x, t, ignore_nan=True) molnet_default_config = { "bace_Class": { "dataset_type": 'one_file_csv', "loss": F.sigmoid_cross_entropy, "metrics": {'binary_accuracy': F.binary_accuracy, 'roc_auc': ROCAUCEvaluator}, "smiles_columns": 'mol', "split": 'random', "task_type": 'classification', "tasks": ["Class"], "url": molnet_base + 'bace.csv', }, "bace_pIC50": { "dataset_type": 'one_file_csv', "loss": mse, "metrics": {'MAE': mae}, "smiles_columns": 'mol', "split": 'random', 
"task_type": 'regression', "tasks": ["pIC50"], "url": molnet_base + 'bace.csv', }, "bbbp": { "dataset_type": 'one_file_csv', "loss": F.sigmoid_cross_entropy, "metrics": {'binary_accuracy': F.binary_accuracy, 'roc_auc': ROCAUCEvaluator}, "smiles_columns": 'smiles', "split": 'scaffold', "task_type": 'classification', "tasks": ["p_np"], "url": molnet_base + 'BBBP.csv', }, # TODO(mottodora): There are many separating ways for chembl dataset # TODO(mottodora): only use 5thresh dataset(sparse dataset is not used.) # TODO(mottodora): support mix dataset type in example "chembl": { "dataset_type": 'one_file_csv', "loss": mse, "smiles_columns": 'smiles', "split": 'random', "task_type": 'mix', "tasks": chembl_tasks, "url": molnet_base + 'chembl_5thresh.csv.gz', }, "clearance": { "dataset_type": 'one_file_csv', "loss": mse, "metrics": {'RMSE': rmse}, "smiles_columns": 'smile', "split": 'random', "task_type": 'regression', "tasks": ["target"], "url": molnet_base + 'clearance.csv', }, "clintox": { "dataset_type": 'one_file_csv', "loss": F.sigmoid_cross_entropy, "metrics": {'binary_accuracy': F.binary_accuracy, 'roc_auc': ROCAUCEvaluator}, "smiles_columns": 'smiles', "split": 'random', "task_type": 'classification', "tasks": ["FDA_APPROVED", "CT_TOX"], "url": molnet_base + 'clintox.csv.gz', }, "delaney": { "dataset_type": 'one_file_csv', "loss": mse, "metrics": {'RMSE': rmse}, "smiles_columns": 'smiles', "split": 'random', "task_type": 'regression', "tasks": ['measured log solubility in mols per litre'], "url": molnet_base + 'delaney-processed.csv', }, "HIV": { "dataset_type": 'one_file_csv', "loss": F.sigmoid_cross_entropy, "metrics": {'binary_accuracy': F.binary_accuracy, 'roc_auc': ROCAUCEvaluator}, "smiles_columns": 'smiles', "split": 'scaffold', "task_type": 'classification', "tasks": ["HIV_active"], "url": molnet_base + 'HIV.csv', }, "hopv": { "dataset_type": 'one_file_csv', "loss": mse, "metrics": {'RMSE': rmse}, "smiles_columns": 'hopv.csv', "split": 'random', 
"task_type": 'regression', "tasks": ['HOMO', 'LUMO', 'electrochemical_gap', 'optical_gap', 'PCE', 'V_OC', 'J_SC', 'fill_factor'], "url": molnet_base + 'hopv.tar.gz', }, "kaggle": { "dataset_type": 'separate_csv', "loss": mse, "metrics": {'RMSE': rmse}, "smiles_columns": 'smiles', "split": 'random', "task_type": 'regression', "tasks": ['3A4', 'CB1', 'DPP4', 'HIVINT', 'HIV_PROT', 'LOGD', 'METAB', 'NK1', 'OX1', 'OX2', 'PGP', 'PPB', 'RAT_F', 'TDI', 'THROMBIN' ], "test_url": molnet_base + 'KAGGLE_test2_' 'disguised_combined_full.csv.gz', "train_url": molnet_base + 'KAGGLE_training_' 'disguised_combined_full.csv.gz', "valid_url": molnet_base + 'KAGGLE_test1_' 'disguised_combined_full.csv.gz', }, "lipo": { "dataset_type": 'one_file_csv', "loss": mse, "metrics": {'RMSE': rmse}, "smiles_columns": 'smiles', "split": 'random', "task_type": 'regression', "tasks": ['exp'], "url": molnet_base + 'Lipophilicity.csv', }, "muv": { "dataset_type": 'one_file_csv', "loss": F.sigmoid_cross_entropy, "metrics": {'binary_accuracy': F.binary_accuracy, 'prc_auc': PRCAUCEvaluator}, "smiles_columns": 'smiles', "split": 'random', "task_type": 'classification', "tasks": ['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644', 'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466', 'MUV-832'], "url": molnet_base + 'muv.csv.gz', }, "nci": { "dataset_type": 'one_file_csv', "loss": mse, "metrics": {'RMSE': rmse}, "smiles_columns": 'smiles', "split": 'random', "task_type": 'regression', "tasks": ['CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226', 'SR', 'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226', 'NCI-H23', 'NCI-H322M', 'NCI-H460', 'NCI-H522', 'COLO 205', 'HCC-2998', 'HCT-116', 'HCT-15', 'HT29', 'KM12', 'SW-620', 'SF-268', 'SF-295', 'SF-539', 'SNB-19', 'SNB-75', 'U251', 'LOX IMVI', 'MALME-3M', 'M14', 'MDA-MB-435', 'SK-MEL-2', 'SK-MEL-28', 'SK-MEL-5', 'UACC-257', 'UACC-62', 'IGR-OV1', 'OVCAR-3', 'OVCAR-4', 'OVCAR-5', 
'OVCAR-8', 'NCI/ADR-RES', 'SK-OV-3', '786-0', 'A498', 'ACHN', 'CAKI-1', 'RXF 393', 'SN12C', 'TK-10', 'UO-31', 'PC-3', 'DU-145', 'MCF7', 'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549', 'T-47D' ], "url": molnet_base + 'nci_unique.csv', }, "pcba": { "dataset_type": 'one_file_csv', "loss": F.sigmoid_cross_entropy, "metrics": {'binary_accuracy': F.binary_accuracy, 'prc_auc': PRCAUCEvaluator}, "smiles_columns": 'smiles', "split": 'random', "task_type": 'classification', "tasks": ['PCBA-1030', 'PCBA-1379', 'PCBA-1452', 'PCBA-1454', 'PCBA-1457', 'PCBA-1458', 'PCBA-1460', 'PCBA-1461', 'PCBA-1468', 'PCBA-1469', 'PCBA-1471', 'PCBA-1479', 'PCBA-1631', 'PCBA-1634', 'PCBA-1688', 'PCBA-1721', 'PCBA-2100', 'PCBA-2101', 'PCBA-2147', 'PCBA-2242', 'PCBA-2326', 'PCBA-2451', 'PCBA-2517', 'PCBA-2528', 'PCBA-2546', 'PCBA-2549', 'PCBA-2551', 'PCBA-2662', 'PCBA-2675', 'PCBA-2676', 'PCBA-411', 'PCBA-463254', 'PCBA-485281', 'PCBA-485290', 'PCBA-485294', 'PCBA-485297', 'PCBA-485313', 'PCBA-485314', 'PCBA-485341', 'PCBA-485349', 'PCBA-485353', 'PCBA-485360', 'PCBA-485364', 'PCBA-485367', 'PCBA-492947', 'PCBA-493208', 'PCBA-504327', 'PCBA-504332', 'PCBA-504333', 'PCBA-504339', 'PCBA-504444', 'PCBA-504466', 'PCBA-504467', 'PCBA-504706', 'PCBA-504842', 'PCBA-504845', 'PCBA-504847', 'PCBA-504891', 'PCBA-540276', 'PCBA-540317', 'PCBA-588342', 'PCBA-588453', 'PCBA-588456', 'PCBA-588579', 'PCBA-588590', 'PCBA-588591', 'PCBA-588795', 'PCBA-588855', 'PCBA-602179', 'PCBA-602233', 'PCBA-602310', 'PCBA-602313', 'PCBA-602332', 'PCBA-624170', 'PCBA-624171', 'PCBA-624173', 'PCBA-624202', 'PCBA-624246', 'PCBA-624287', 'PCBA-624288', 'PCBA-624291', 'PCBA-624296', 'PCBA-624297', 'PCBA-624417', 'PCBA-651635', 'PCBA-651644', 'PCBA-651768', 'PCBA-651965', 'PCBA-652025', 'PCBA-652104', 'PCBA-652105', 'PCBA-652106', 'PCBA-686970', 'PCBA-686978', 'PCBA-686979', 'PCBA-720504', 'PCBA-720532', 'PCBA-720542', 'PCBA-720551', 'PCBA-720553', 'PCBA-720579', 'PCBA-720580', 'PCBA-720707', 'PCBA-720708', 'PCBA-720709', 
'PCBA-720711', 'PCBA-743255', 'PCBA-743266', 'PCBA-875', 'PCBA-881', 'PCBA-883', 'PCBA-884', 'PCBA-885', 'PCBA-887', 'PCBA-891', 'PCBA-899', 'PCBA-902', 'PCBA-903', 'PCBA-904', 'PCBA-912', 'PCBA-914', 'PCBA-915', 'PCBA-924', 'PCBA-925', 'PCBA-926', 'PCBA-927', 'PCBA-938', 'PCBA-995'], "url": molnet_base + 'pcba.csv.gz', }, "pdbbind_smiles": { "subset": ["core", "full", "refined"], "dataset_type": 'one_file_csv', "url": {'core': molnet_base + 'core_smiles_labels.csv', 'full': molnet_base + 'full_smiles_labels.csv', 'refined': molnet_base + 'refined_smiles_labels.csv'}, "smiles_columns": 'smiles', "metrics": {'R2': r2_score}, "split": 'time', "task_type": 'regression', "tasks": ["-logKd/Ki"], }, "pdbbind_grid": { "pdbbind_subset": ["core", "full", "refined"], "dataset_type": 'joblib', "url": {'core': featurized_base + 'core_grid.tar.gz', 'full': featurized_base + 'full_grid.tar.gz', 'refined': featurized_base + 'refined_grid.tar.gz'}, "smiles_columns": '', "metrics": {'R2': r2_score}, "split": 'time', "task_type": 'regression', "tasks": ["-logKd/Ki"], }, "ppb": { "dataset_type": 'one_file_csv', "loss": mse, "metrics": {'RMSE': rmse}, "smiles_columns": 'smiles', "split": 'random', "task_type": 'regression', "tasks": ["exp"], "url": molnet_base + 'PPB.csv', }, # TODO(motoki): there are multiple data types in qm7 dataset. "qm7": { "dataset_type": 'one_file_csv', "loss": mse, "metrics": {'MAE': mae}, "smiles_columns": 'smiles', "split": 'stratified', "task_type": 'regression', "tasks": ["u0_atom"], "url": molnet_base + 'qm7.csv', }, # TODO(motoki): there are sdf data types in qm8 dataset. 
"qm8": { "dataset_type": 'one_file_csv', "loss": mse, "metrics": {'MAE': mae}, "smiles_columns": 'smiles', "split": 'random', "task_type": 'regression', "tasks": ["E1-CC2", "E2-CC2", "f1-CC2", "f2-CC2", "E1-PBE0", "E2-PBE0", "f1-PBE0", "f2-PBE0", "E1-PBE0", "E2-PBE0", "f1-PBE0", "f2-PBE0", "E1-CAM", "E2-CAM", "f1-CAM", "f2-CAM"], "url": molnet_base + 'qm8.csv', }, # TODO(motoki): there are sdf data types in qm9 dataset. "qm9": { "dataset_type": 'one_file_csv', "loss": mse, "metrics": {'MAE': mae}, "smiles_columns": 'smiles', "split": 'random', "task_type": 'regression', "tasks": ["mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "cv", "u0", "u298", "h298", "g298"], "url": molnet_base + 'qm9.csv', }, "SAMPL": { "dataset_type": 'one_file_csv', "loss": mse, "metrics": {'RMSE': rmse}, "smiles_columns": 'smiles', "split": 'random', "task_type": 'regression', "tasks": ["expt"], "url": molnet_base + 'SAMPL.csv', }, "sider": { "dataset_type": 'one_file_csv', "loss": F.sigmoid_cross_entropy, "metrics": {'binary_accuracy': F.binary_accuracy, 'roc_auc': ROCAUCEvaluator}, "smiles_columns": 'smiles', "split": 'random', "task_type": 'classification', "tasks": ['Hepatobiliary disorders', 'Metabolism and nutrition disorders', 'Product issues', 'Eye disorders', 'Investigations', 'Musculoskeletal and connective tissue disorders', 'Gastrointestinal disorders', 'Social circumstances', 'Immune system disorders', 'Reproductive system and breast disorders', 'Neoplasms benign, malignant and unspecified ' '(incl cysts and polyps)', 'General disorders and administration site conditions', 'Endocrine disorders', 'Surgical and medical procedures', 'Vascular disorders', 'Blood and lymphatic system disorders', 'Skin and subcutaneous tissue disorders', 'Congenital, familial and genetic disorders', 'Infections and infestations', 'Respiratory, thoracic and mediastinal disorders', 'Psychiatric disorders', 'Renal and urinary disorders', 'Pregnancy, puerperium and perinatal conditions', 'Ear and 
labyrinth disorders', 'Cardiac disorders', 'Nervous system disorders',
                  'Injury, poisoning and procedural complications'],
        "url": molnet_base + 'sider.csv.gz',
    },
    "tox21": {
        "dataset_type": 'one_file_csv',
        "loss": F.sigmoid_cross_entropy,
        "metrics": {'binary_accuracy': F.binary_accuracy,
                    'roc_auc': ROCAUCEvaluator},
        "smiles_columns": 'smiles',
        "split": 'random',
        "task_type": 'classification',
        "tasks": ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
                  'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',
                  'SR-HSE', 'SR-MMP', 'SR-p53'],
        "url": molnet_base + 'tox21.csv.gz',
    },
    "toxcast": {
        "dataset_type": 'one_file_csv',
        "loss": F.sigmoid_cross_entropy,
        "metrics": {'binary_accuracy': F.binary_accuracy,
                    'roc_auc': ROCAUCEvaluator},
        "smiles_columns": 'smiles',
        "split": 'random',
        "task_type": 'classification',
        # The full assay list is very large; it lives in toxcast_tasks.py.
        "tasks": toxcast_tasks,
        "url": molnet_base + 'toxcast_data.csv.gz',
    },
}
================================================
FILE: chainer_chemistry/datasets/molnet/pdbbind_time.py
================================================
from logging import getLogger
import os
import shutil

import pandas

from chainer.dataset import download


def get_pdbbind_time():
    """Get time list for PDBbind dataset.

    Args:

    Returns(list): Time list for PDBbind dataset.

    """
    # The year table is a header-less CSV; column 1 holds the time entry
    # for each complex -- presumably the publication year, TODO confirm
    # against the downloaded `pdbbind_year.csv`.
    df = pandas.read_csv(get_pdbbind_time_filepath(), header=None)
    time_list = df[1].values.tolist()
    return time_list


def get_pdbbind_time_filepath(download_if_not_exist=True):
    """Construct a file path which stores year table of PDBbind.

    This method checks whether the file exists or not, and downloads it
    if necessary.

    Args:
        download_if_not_exist(bool): Download a file if it does not exist.
    Returns(str): filepath for year table

    """
    url = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/' \
        'pdbbind_year.csv'
    # The cache file keeps the basename of the remote resource
    # ('pdbbind_year.csv').
    file_name = url.split('/')[-1]
    cache_path = _get_pdbbind_time_filepath(file_name)
    if not os.path.exists(cache_path):
        if download_if_not_exist:
            is_successful = download_pdbbind_time(url,
                                                  save_filepath=cache_path)
            if not is_successful:
                logger = getLogger(__name__)
                logger.warning('Download failed.')
    # The path is returned even when the download failed or was skipped,
    # so callers should be prepared for a missing file.
    return cache_path


def _get_pdbbind_time_filepath(file_name):
    """Construct a filepath which stores year table in csv.

    This method does not check if the file is already downloaded or not.

    Args:
        file_name(str): file name of year table

    Returns(str): filepath for one of year table
    """
    # Cached under Chainer's shared dataset directory.
    cache_root = download.get_dataset_directory('pfnet/chainer/molnet')
    cache_path = os.path.join(cache_root, file_name)
    return cache_path


def download_pdbbind_time(url, save_filepath):
    """Download and caches PDBBind year table.

    Args:
        url(str): URL of year table
        save_filepath(str): filepath for year table

    Returns(bool): If success downloading, returning `True`.
""" download_file_path = download.cached_download(url) shutil.move(download_file_path, save_filepath) return True ================================================ FILE: chainer_chemistry/datasets/molnet/toxcast_tasks.py ================================================ # flake8: noqa toxcast_tasks = ['ACEA_T47D_80hr_Negative', 'ACEA_T47D_80hr_Positive', 'APR_HepG2_CellCycleArrest_24h_dn', 'APR_HepG2_CellCycleArrest_24h_up', 'APR_HepG2_CellCycleArrest_72h_dn', 'APR_HepG2_CellLoss_24h_dn', 'APR_HepG2_CellLoss_72h_dn', 'APR_HepG2_MicrotubuleCSK_24h_dn', 'APR_HepG2_MicrotubuleCSK_24h_up', 'APR_HepG2_MicrotubuleCSK_72h_dn', 'APR_HepG2_MicrotubuleCSK_72h_up', 'APR_HepG2_MitoMass_24h_dn', 'APR_HepG2_MitoMass_24h_up', 'APR_HepG2_MitoMass_72h_dn', 'APR_HepG2_MitoMass_72h_up', 'APR_HepG2_MitoMembPot_1h_dn', 'APR_HepG2_MitoMembPot_24h_dn', 'APR_HepG2_MitoMembPot_72h_dn', 'APR_HepG2_MitoticArrest_24h_up', 'APR_HepG2_MitoticArrest_72h_up', 'APR_HepG2_NuclearSize_24h_dn', 'APR_HepG2_NuclearSize_72h_dn', 'APR_HepG2_NuclearSize_72h_up', 'APR_HepG2_OxidativeStress_24h_up', 'APR_HepG2_OxidativeStress_72h_up', 'APR_HepG2_StressKinase_1h_up', 'APR_HepG2_StressKinase_24h_up', 'APR_HepG2_StressKinase_72h_up', 'APR_HepG2_p53Act_24h_up', 'APR_HepG2_p53Act_72h_up', 'APR_Hepat_Apoptosis_24hr_up', 'APR_Hepat_Apoptosis_48hr_up', 'APR_Hepat_CellLoss_24hr_dn', 'APR_Hepat_CellLoss_48hr_dn', 'APR_Hepat_DNADamage_24hr_up', 'APR_Hepat_DNADamage_48hr_up', 'APR_Hepat_DNATexture_24hr_up', 'APR_Hepat_DNATexture_48hr_up', 'APR_Hepat_MitoFxnI_1hr_dn', 'APR_Hepat_MitoFxnI_24hr_dn', 'APR_Hepat_MitoFxnI_48hr_dn', 'APR_Hepat_NuclearSize_24hr_dn', 'APR_Hepat_NuclearSize_48hr_dn', 'APR_Hepat_Steatosis_24hr_up', 'APR_Hepat_Steatosis_48hr_up', 'ATG_AP_1_CIS_dn', 'ATG_AP_1_CIS_up', 'ATG_AP_2_CIS_dn', 'ATG_AP_2_CIS_up', 'ATG_AR_TRANS_dn', 'ATG_AR_TRANS_up', 'ATG_Ahr_CIS_dn', 'ATG_Ahr_CIS_up', 'ATG_BRE_CIS_dn', 'ATG_BRE_CIS_up', 'ATG_CAR_TRANS_dn', 'ATG_CAR_TRANS_up', 'ATG_CMV_CIS_dn', 'ATG_CMV_CIS_up', 
'ATG_CRE_CIS_dn', 'ATG_CRE_CIS_up', 'ATG_C_EBP_CIS_dn', 'ATG_C_EBP_CIS_up', 'ATG_DR4_LXR_CIS_dn', 'ATG_DR4_LXR_CIS_up', 'ATG_DR5_CIS_dn', 'ATG_DR5_CIS_up', 'ATG_E2F_CIS_dn', 'ATG_E2F_CIS_up', 'ATG_EGR_CIS_up', 'ATG_ERE_CIS_dn', 'ATG_ERE_CIS_up', 'ATG_ERRa_TRANS_dn', 'ATG_ERRg_TRANS_dn', 'ATG_ERRg_TRANS_up', 'ATG_ERa_TRANS_up', 'ATG_E_Box_CIS_dn', 'ATG_E_Box_CIS_up', 'ATG_Ets_CIS_dn', 'ATG_Ets_CIS_up', 'ATG_FXR_TRANS_up', 'ATG_FoxA2_CIS_dn', 'ATG_FoxA2_CIS_up', 'ATG_FoxO_CIS_dn', 'ATG_FoxO_CIS_up', 'ATG_GAL4_TRANS_dn', 'ATG_GATA_CIS_dn', 'ATG_GATA_CIS_up', 'ATG_GLI_CIS_dn', 'ATG_GLI_CIS_up', 'ATG_GRE_CIS_dn', 'ATG_GRE_CIS_up', 'ATG_GR_TRANS_dn', 'ATG_GR_TRANS_up', 'ATG_HIF1a_CIS_dn', 'ATG_HIF1a_CIS_up', 'ATG_HNF4a_TRANS_dn', 'ATG_HNF4a_TRANS_up', 'ATG_HNF6_CIS_dn', 'ATG_HNF6_CIS_up', 'ATG_HSE_CIS_dn', 'ATG_HSE_CIS_up', 'ATG_IR1_CIS_dn', 'ATG_IR1_CIS_up', 'ATG_ISRE_CIS_dn', 'ATG_ISRE_CIS_up', 'ATG_LXRa_TRANS_dn', 'ATG_LXRa_TRANS_up', 'ATG_LXRb_TRANS_dn', 'ATG_LXRb_TRANS_up', 'ATG_MRE_CIS_up', 'ATG_M_06_TRANS_up', 'ATG_M_19_CIS_dn', 'ATG_M_19_TRANS_dn', 'ATG_M_19_TRANS_up', 'ATG_M_32_CIS_dn', 'ATG_M_32_CIS_up', 'ATG_M_32_TRANS_dn', 'ATG_M_32_TRANS_up', 'ATG_M_61_TRANS_up', 'ATG_Myb_CIS_dn', 'ATG_Myb_CIS_up', 'ATG_Myc_CIS_dn', 'ATG_Myc_CIS_up', 'ATG_NFI_CIS_dn', 'ATG_NFI_CIS_up', 'ATG_NF_kB_CIS_dn', 'ATG_NF_kB_CIS_up', 'ATG_NRF1_CIS_dn', 'ATG_NRF1_CIS_up', 'ATG_NRF2_ARE_CIS_dn', 'ATG_NRF2_ARE_CIS_up', 'ATG_NURR1_TRANS_dn', 'ATG_NURR1_TRANS_up', 'ATG_Oct_MLP_CIS_dn', 'ATG_Oct_MLP_CIS_up', 'ATG_PBREM_CIS_dn', 'ATG_PBREM_CIS_up', 'ATG_PPARa_TRANS_dn', 'ATG_PPARa_TRANS_up', 'ATG_PPARd_TRANS_up', 'ATG_PPARg_TRANS_up', 'ATG_PPRE_CIS_dn', 'ATG_PPRE_CIS_up', 'ATG_PXRE_CIS_dn', 'ATG_PXRE_CIS_up', 'ATG_PXR_TRANS_dn', 'ATG_PXR_TRANS_up', 'ATG_Pax6_CIS_up', 'ATG_RARa_TRANS_dn', 'ATG_RARa_TRANS_up', 'ATG_RARb_TRANS_dn', 'ATG_RARb_TRANS_up', 'ATG_RARg_TRANS_dn', 'ATG_RARg_TRANS_up', 'ATG_RORE_CIS_dn', 'ATG_RORE_CIS_up', 'ATG_RORb_TRANS_dn', 'ATG_RORg_TRANS_dn', 'ATG_RORg_TRANS_up', 
'ATG_RXRa_TRANS_dn', 'ATG_RXRa_TRANS_up', 'ATG_RXRb_TRANS_dn', 'ATG_RXRb_TRANS_up', 'ATG_SREBP_CIS_dn', 'ATG_SREBP_CIS_up', 'ATG_STAT3_CIS_dn', 'ATG_STAT3_CIS_up', 'ATG_Sox_CIS_dn', 'ATG_Sox_CIS_up', 'ATG_Sp1_CIS_dn', 'ATG_Sp1_CIS_up', 'ATG_TAL_CIS_dn', 'ATG_TAL_CIS_up', 'ATG_TA_CIS_dn', 'ATG_TA_CIS_up', 'ATG_TCF_b_cat_CIS_dn', 'ATG_TCF_b_cat_CIS_up', 'ATG_TGFb_CIS_dn', 'ATG_TGFb_CIS_up', 'ATG_THRa1_TRANS_dn', 'ATG_THRa1_TRANS_up', 'ATG_VDRE_CIS_dn', 'ATG_VDRE_CIS_up', 'ATG_VDR_TRANS_dn', 'ATG_VDR_TRANS_up', 'ATG_XTT_Cytotoxicity_up', 'ATG_Xbp1_CIS_dn', 'ATG_Xbp1_CIS_up', 'ATG_p53_CIS_dn', 'ATG_p53_CIS_up', 'BSK_3C_Eselectin_down', 'BSK_3C_HLADR_down', 'BSK_3C_ICAM1_down', 'BSK_3C_IL8_down', 'BSK_3C_MCP1_down', 'BSK_3C_MIG_down', 'BSK_3C_Proliferation_down', 'BSK_3C_SRB_down', 'BSK_3C_Thrombomodulin_down', 'BSK_3C_Thrombomodulin_up', 'BSK_3C_TissueFactor_down', 'BSK_3C_TissueFactor_up', 'BSK_3C_VCAM1_down', 'BSK_3C_Vis_down', 'BSK_3C_uPAR_down', 'BSK_4H_Eotaxin3_down', 'BSK_4H_MCP1_down', 'BSK_4H_Pselectin_down', 'BSK_4H_Pselectin_up', 'BSK_4H_SRB_down', 'BSK_4H_VCAM1_down', 'BSK_4H_VEGFRII_down', 'BSK_4H_uPAR_down', 'BSK_4H_uPAR_up', 'BSK_BE3C_HLADR_down', 'BSK_BE3C_IL1a_down', 'BSK_BE3C_IP10_down', 'BSK_BE3C_MIG_down', 'BSK_BE3C_MMP1_down', 'BSK_BE3C_MMP1_up', 'BSK_BE3C_PAI1_down', 'BSK_BE3C_SRB_down', 'BSK_BE3C_TGFb1_down', 'BSK_BE3C_tPA_down', 'BSK_BE3C_uPAR_down', 'BSK_BE3C_uPAR_up', 'BSK_BE3C_uPA_down', 'BSK_CASM3C_HLADR_down', 'BSK_CASM3C_IL6_down', 'BSK_CASM3C_IL6_up', 'BSK_CASM3C_IL8_down', 'BSK_CASM3C_LDLR_down', 'BSK_CASM3C_LDLR_up', 'BSK_CASM3C_MCP1_down', 'BSK_CASM3C_MCP1_up', 'BSK_CASM3C_MCSF_down', 'BSK_CASM3C_MCSF_up', 'BSK_CASM3C_MIG_down', 'BSK_CASM3C_Proliferation_down', 'BSK_CASM3C_Proliferation_up', 'BSK_CASM3C_SAA_down', 'BSK_CASM3C_SAA_up', 'BSK_CASM3C_SRB_down', 'BSK_CASM3C_Thrombomodulin_down', 'BSK_CASM3C_Thrombomodulin_up', 'BSK_CASM3C_TissueFactor_down', 'BSK_CASM3C_VCAM1_down', 'BSK_CASM3C_VCAM1_up', 'BSK_CASM3C_uPAR_down', 
'BSK_CASM3C_uPAR_up', 'BSK_KF3CT_ICAM1_down', 'BSK_KF3CT_IL1a_down', 'BSK_KF3CT_IP10_down', 'BSK_KF3CT_IP10_up', 'BSK_KF3CT_MCP1_down', 'BSK_KF3CT_MCP1_up', 'BSK_KF3CT_MMP9_down', 'BSK_KF3CT_SRB_down', 'BSK_KF3CT_TGFb1_down', 'BSK_KF3CT_TIMP2_down', 'BSK_KF3CT_uPA_down', 'BSK_LPS_CD40_down', 'BSK_LPS_Eselectin_down', 'BSK_LPS_Eselectin_up', 'BSK_LPS_IL1a_down', 'BSK_LPS_IL1a_up', 'BSK_LPS_IL8_down', 'BSK_LPS_IL8_up', 'BSK_LPS_MCP1_down', 'BSK_LPS_MCSF_down', 'BSK_LPS_PGE2_down', 'BSK_LPS_PGE2_up', 'BSK_LPS_SRB_down', 'BSK_LPS_TNFa_down', 'BSK_LPS_TNFa_up', 'BSK_LPS_TissueFactor_down', 'BSK_LPS_TissueFactor_up', 'BSK_LPS_VCAM1_down', 'BSK_SAg_CD38_down', 'BSK_SAg_CD40_down', 'BSK_SAg_CD69_down', 'BSK_SAg_Eselectin_down', 'BSK_SAg_Eselectin_up', 'BSK_SAg_IL8_down', 'BSK_SAg_IL8_up', 'BSK_SAg_MCP1_down', 'BSK_SAg_MIG_down', 'BSK_SAg_PBMCCytotoxicity_down', 'BSK_SAg_PBMCCytotoxicity_up', 'BSK_SAg_Proliferation_down', 'BSK_SAg_SRB_down', 'BSK_hDFCGF_CollagenIII_down', 'BSK_hDFCGF_EGFR_down', 'BSK_hDFCGF_EGFR_up', 'BSK_hDFCGF_IL8_down', 'BSK_hDFCGF_IP10_down', 'BSK_hDFCGF_MCSF_down', 'BSK_hDFCGF_MIG_down', 'BSK_hDFCGF_MMP1_down', 'BSK_hDFCGF_MMP1_up', 'BSK_hDFCGF_PAI1_down', 'BSK_hDFCGF_Proliferation_down', 'BSK_hDFCGF_SRB_down', 'BSK_hDFCGF_TIMP1_down', 'BSK_hDFCGF_VCAM1_down', 'CEETOX_H295R_11DCORT_dn', 'CEETOX_H295R_ANDR_dn', 'CEETOX_H295R_CORTISOL_dn', 'CEETOX_H295R_DOC_dn', 'CEETOX_H295R_DOC_up', 'CEETOX_H295R_ESTRADIOL_dn', 'CEETOX_H295R_ESTRADIOL_up', 'CEETOX_H295R_ESTRONE_dn', 'CEETOX_H295R_ESTRONE_up', 'CEETOX_H295R_OHPREG_up', 'CEETOX_H295R_OHPROG_dn', 'CEETOX_H295R_OHPROG_up', 'CEETOX_H295R_PROG_up', 'CEETOX_H295R_TESTO_dn', 'CLD_ABCB1_48hr', 'CLD_ABCG2_48hr', 'CLD_CYP1A1_24hr', 'CLD_CYP1A1_48hr', 'CLD_CYP1A1_6hr', 'CLD_CYP1A2_24hr', 'CLD_CYP1A2_48hr', 'CLD_CYP1A2_6hr', 'CLD_CYP2B6_24hr', 'CLD_CYP2B6_48hr', 'CLD_CYP2B6_6hr', 'CLD_CYP3A4_24hr', 'CLD_CYP3A4_48hr', 'CLD_CYP3A4_6hr', 'CLD_GSTA2_48hr', 'CLD_SULT2A_24hr', 'CLD_SULT2A_48hr', 'CLD_UGT1A1_24hr', 
'CLD_UGT1A1_48hr', 'NCCT_HEK293T_CellTiterGLO', 'NCCT_QuantiLum_inhib_2_dn', 'NCCT_QuantiLum_inhib_dn', 'NCCT_TPO_AUR_dn', 'NCCT_TPO_GUA_dn', 'NHEERL_ZF_144hpf_TERATOSCORE_up', 'NVS_ADME_hCYP19A1', 'NVS_ADME_hCYP1A1', 'NVS_ADME_hCYP1A2', 'NVS_ADME_hCYP2A6', 'NVS_ADME_hCYP2B6', 'NVS_ADME_hCYP2C19', 'NVS_ADME_hCYP2C9', 'NVS_ADME_hCYP2D6', 'NVS_ADME_hCYP3A4', 'NVS_ADME_hCYP4F12', 'NVS_ADME_rCYP2C12', 'NVS_ENZ_hAChE', 'NVS_ENZ_hAMPKa1', 'NVS_ENZ_hAurA', 'NVS_ENZ_hBACE', 'NVS_ENZ_hCASP5', 'NVS_ENZ_hCK1D', 'NVS_ENZ_hDUSP3', 'NVS_ENZ_hES', 'NVS_ENZ_hElastase', 'NVS_ENZ_hFGFR1', 'NVS_ENZ_hGSK3b', 'NVS_ENZ_hMMP1', 'NVS_ENZ_hMMP13', 'NVS_ENZ_hMMP2', 'NVS_ENZ_hMMP3', 'NVS_ENZ_hMMP7', 'NVS_ENZ_hMMP9', 'NVS_ENZ_hPDE10', 'NVS_ENZ_hPDE4A1', 'NVS_ENZ_hPDE5', 'NVS_ENZ_hPI3Ka', 'NVS_ENZ_hPTEN', 'NVS_ENZ_hPTPN11', 'NVS_ENZ_hPTPN12', 'NVS_ENZ_hPTPN13', 'NVS_ENZ_hPTPN9', 'NVS_ENZ_hPTPRC', 'NVS_ENZ_hSIRT1', 'NVS_ENZ_hSIRT2', 'NVS_ENZ_hTrkA', 'NVS_ENZ_hVEGFR2', 'NVS_ENZ_oCOX1', 'NVS_ENZ_oCOX2', 'NVS_ENZ_rAChE', 'NVS_ENZ_rCNOS', 'NVS_ENZ_rMAOAC', 'NVS_ENZ_rMAOAP', 'NVS_ENZ_rMAOBC', 'NVS_ENZ_rMAOBP', 'NVS_ENZ_rabI2C', 'NVS_GPCR_bAdoR_NonSelective', 'NVS_GPCR_bDR_NonSelective', 'NVS_GPCR_g5HT4', 'NVS_GPCR_gH2', 'NVS_GPCR_gLTB4', 'NVS_GPCR_gLTD4', 'NVS_GPCR_gMPeripheral_NonSelective', 'NVS_GPCR_gOpiateK', 'NVS_GPCR_h5HT2A', 'NVS_GPCR_h5HT5A', 'NVS_GPCR_h5HT6', 'NVS_GPCR_h5HT7', 'NVS_GPCR_hAT1', 'NVS_GPCR_hAdoRA1', 'NVS_GPCR_hAdoRA2a', 'NVS_GPCR_hAdra2A', 'NVS_GPCR_hAdra2C', 'NVS_GPCR_hAdrb1', 'NVS_GPCR_hAdrb2', 'NVS_GPCR_hAdrb3', 'NVS_GPCR_hDRD1', 'NVS_GPCR_hDRD2s', 'NVS_GPCR_hDRD4.4', 'NVS_GPCR_hH1', 'NVS_GPCR_hLTB4_BLT1', 'NVS_GPCR_hM1', 'NVS_GPCR_hM2', 'NVS_GPCR_hM3', 'NVS_GPCR_hM4', 'NVS_GPCR_hNK2', 'NVS_GPCR_hOpiate_D1', 'NVS_GPCR_hOpiate_mu', 'NVS_GPCR_hTXA2', 'NVS_GPCR_p5HT2C', 'NVS_GPCR_r5HT1_NonSelective', 'NVS_GPCR_r5HT_NonSelective', 'NVS_GPCR_rAdra1B', 'NVS_GPCR_rAdra1_NonSelective', 'NVS_GPCR_rAdra2_NonSelective', 'NVS_GPCR_rAdrb_NonSelective', 'NVS_GPCR_rNK1', 'NVS_GPCR_rNK3', 
'NVS_GPCR_rOpiate_NonSelective', 'NVS_GPCR_rOpiate_NonSelectiveNa', 'NVS_GPCR_rSST', 'NVS_GPCR_rTRH', 'NVS_GPCR_rV1', 'NVS_GPCR_rabPAF', 'NVS_GPCR_rmAdra2B', 'NVS_IC_hKhERGCh', 'NVS_IC_rCaBTZCHL', 'NVS_IC_rCaDHPRCh_L', 'NVS_IC_rNaCh_site2', 'NVS_LGIC_bGABARa1', 'NVS_LGIC_h5HT3', 'NVS_LGIC_hNNR_NBungSens', 'NVS_LGIC_rGABAR_NonSelective', 'NVS_LGIC_rNNR_BungSens', 'NVS_MP_hPBR', 'NVS_MP_rPBR', 'NVS_NR_bER', 'NVS_NR_bPR', 'NVS_NR_cAR', 'NVS_NR_hAR', 'NVS_NR_hCAR_Antagonist', 'NVS_NR_hER', 'NVS_NR_hFXR_Agonist', 'NVS_NR_hFXR_Antagonist', 'NVS_NR_hGR', 'NVS_NR_hPPARa', 'NVS_NR_hPPARg', 'NVS_NR_hPR', 'NVS_NR_hPXR', 'NVS_NR_hRAR_Antagonist', 'NVS_NR_hRARa_Agonist', 'NVS_NR_hTRa_Antagonist', 'NVS_NR_mERa', 'NVS_NR_rAR', 'NVS_NR_rMR', 'NVS_OR_gSIGMA_NonSelective', 'NVS_TR_gDAT', 'NVS_TR_hAdoT', 'NVS_TR_hDAT', 'NVS_TR_hNET', 'NVS_TR_hSERT', 'NVS_TR_rNET', 'NVS_TR_rSERT', 'NVS_TR_rVMAT2', 'OT_AR_ARELUC_AG_1440', 'OT_AR_ARSRC1_0480', 'OT_AR_ARSRC1_0960', 'OT_ER_ERaERa_0480', 'OT_ER_ERaERa_1440', 'OT_ER_ERaERb_0480', 'OT_ER_ERaERb_1440', 'OT_ER_ERbERb_0480', 'OT_ER_ERbERb_1440', 'OT_ERa_EREGFP_0120', 'OT_ERa_EREGFP_0480', 'OT_FXR_FXRSRC1_0480', 'OT_FXR_FXRSRC1_1440', 'OT_NURR1_NURR1RXRa_0480', 'OT_NURR1_NURR1RXRa_1440', 'TOX21_ARE_BLA_Agonist_ch1', 'TOX21_ARE_BLA_Agonist_ch2', 'TOX21_ARE_BLA_agonist_ratio', 'TOX21_ARE_BLA_agonist_viability', 'TOX21_AR_BLA_Agonist_ch1', 'TOX21_AR_BLA_Agonist_ch2', 'TOX21_AR_BLA_Agonist_ratio', 'TOX21_AR_BLA_Antagonist_ch1', 'TOX21_AR_BLA_Antagonist_ch2', 'TOX21_AR_BLA_Antagonist_ratio', 'TOX21_AR_BLA_Antagonist_viability', 'TOX21_AR_LUC_MDAKB2_Agonist', 'TOX21_AR_LUC_MDAKB2_Antagonist', 'TOX21_AR_LUC_MDAKB2_Antagonist2', 'TOX21_AhR_LUC_Agonist', 'TOX21_Aromatase_Inhibition', 'TOX21_AutoFluor_HEK293_Cell_blue', 'TOX21_AutoFluor_HEK293_Media_blue', 'TOX21_AutoFluor_HEPG2_Cell_blue', 'TOX21_AutoFluor_HEPG2_Cell_green', 'TOX21_AutoFluor_HEPG2_Media_blue', 'TOX21_AutoFluor_HEPG2_Media_green', 'TOX21_ELG1_LUC_Agonist', 'TOX21_ERa_BLA_Agonist_ch1', 
'TOX21_ERa_BLA_Agonist_ch2', 'TOX21_ERa_BLA_Agonist_ratio', 'TOX21_ERa_BLA_Antagonist_ch1', 'TOX21_ERa_BLA_Antagonist_ch2', 'TOX21_ERa_BLA_Antagonist_ratio', 'TOX21_ERa_BLA_Antagonist_viability', 'TOX21_ERa_LUC_BG1_Agonist', 'TOX21_ERa_LUC_BG1_Antagonist', 'TOX21_ESRE_BLA_ch1', 'TOX21_ESRE_BLA_ch2', 'TOX21_ESRE_BLA_ratio', 'TOX21_ESRE_BLA_viability', 'TOX21_FXR_BLA_Antagonist_ch1', 'TOX21_FXR_BLA_Antagonist_ch2', 'TOX21_FXR_BLA_agonist_ch2', 'TOX21_FXR_BLA_agonist_ratio', 'TOX21_FXR_BLA_antagonist_ratio', 'TOX21_FXR_BLA_antagonist_viability', 'TOX21_GR_BLA_Agonist_ch1', 'TOX21_GR_BLA_Agonist_ch2', 'TOX21_GR_BLA_Agonist_ratio', 'TOX21_GR_BLA_Antagonist_ch2', 'TOX21_GR_BLA_Antagonist_ratio', 'TOX21_GR_BLA_Antagonist_viability', 'TOX21_HSE_BLA_agonist_ch1', 'TOX21_HSE_BLA_agonist_ch2', 'TOX21_HSE_BLA_agonist_ratio', 'TOX21_HSE_BLA_agonist_viability', 'TOX21_MMP_ratio_down', 'TOX21_MMP_ratio_up', 'TOX21_MMP_viability', 'TOX21_NFkB_BLA_agonist_ch1', 'TOX21_NFkB_BLA_agonist_ch2', 'TOX21_NFkB_BLA_agonist_ratio', 'TOX21_NFkB_BLA_agonist_viability', 'TOX21_PPARd_BLA_Agonist_viability', 'TOX21_PPARd_BLA_Antagonist_ch1', 'TOX21_PPARd_BLA_agonist_ch1', 'TOX21_PPARd_BLA_agonist_ch2', 'TOX21_PPARd_BLA_agonist_ratio', 'TOX21_PPARd_BLA_antagonist_ratio', 'TOX21_PPARd_BLA_antagonist_viability', 'TOX21_PPARg_BLA_Agonist_ch1', 'TOX21_PPARg_BLA_Agonist_ch2', 'TOX21_PPARg_BLA_Agonist_ratio', 'TOX21_PPARg_BLA_Antagonist_ch1', 'TOX21_PPARg_BLA_antagonist_ratio', 'TOX21_PPARg_BLA_antagonist_viability', 'TOX21_TR_LUC_GH3_Agonist', 'TOX21_TR_LUC_GH3_Antagonist', 'TOX21_VDR_BLA_Agonist_viability', 'TOX21_VDR_BLA_Antagonist_ch1', 'TOX21_VDR_BLA_agonist_ch2', 'TOX21_VDR_BLA_agonist_ratio', 'TOX21_VDR_BLA_antagonist_ratio', 'TOX21_VDR_BLA_antagonist_viability', 'TOX21_p53_BLA_p1_ch1', 'TOX21_p53_BLA_p1_ch2', 'TOX21_p53_BLA_p1_ratio', 'TOX21_p53_BLA_p1_viability', 'TOX21_p53_BLA_p2_ch1', 'TOX21_p53_BLA_p2_ch2', 'TOX21_p53_BLA_p2_ratio', 'TOX21_p53_BLA_p2_viability', 'TOX21_p53_BLA_p3_ch1', 
'TOX21_p53_BLA_p3_ch2', 'TOX21_p53_BLA_p3_ratio', 'TOX21_p53_BLA_p3_viability', 'TOX21_p53_BLA_p4_ch1', 'TOX21_p53_BLA_p4_ch2', 'TOX21_p53_BLA_p4_ratio', 'TOX21_p53_BLA_p4_viability', 'TOX21_p53_BLA_p5_ch1', 'TOX21_p53_BLA_p5_ch2', 'TOX21_p53_BLA_p5_ratio', 'TOX21_p53_BLA_p5_viability', 'Tanguay_ZF_120hpf_AXIS_up', 'Tanguay_ZF_120hpf_ActivityScore', 'Tanguay_ZF_120hpf_BRAI_up', 'Tanguay_ZF_120hpf_CFIN_up', 'Tanguay_ZF_120hpf_CIRC_up', 'Tanguay_ZF_120hpf_EYE_up', 'Tanguay_ZF_120hpf_JAW_up', 'Tanguay_ZF_120hpf_MORT_up', 'Tanguay_ZF_120hpf_OTIC_up', 'Tanguay_ZF_120hpf_PE_up', 'Tanguay_ZF_120hpf_PFIN_up', 'Tanguay_ZF_120hpf_PIG_up', 'Tanguay_ZF_120hpf_SNOU_up', 'Tanguay_ZF_120hpf_SOMI_up', 'Tanguay_ZF_120hpf_SWIM_up', 'Tanguay_ZF_120hpf_TRUN_up', 'Tanguay_ZF_120hpf_TR_up', 'Tanguay_ZF_120hpf_YSE_up'] ================================================ FILE: chainer_chemistry/datasets/numpy_tuple_dataset.py ================================================ import os import six import numpy from chainer_chemistry.dataset.converters import concat_mols from chainer_chemistry.dataset.indexers.numpy_tuple_dataset_feature_indexer import NumpyTupleDatasetFeatureIndexer # NOQA class NumpyTupleDataset(object): """Dataset of a tuple of datasets. It combines multiple datasets into one dataset. Each example is represented by a tuple whose ``i``-th item corresponds to the i-th dataset. And each ``i``-th dataset is expected to be an instance of numpy.ndarray. Args: datasets: Underlying datasets. The ``i``-th one is used for the ``i``-th item of each example. All datasets must have the same length. 
    """

    def __init__(self, *datasets):
        # Validate that at least one dataset is given and that every
        # dataset has the same length before storing them.
        if not datasets:
            raise ValueError('no datasets are given')
        length = len(datasets[0])
        for i, dataset in enumerate(datasets):
            if len(dataset) != length:
                raise ValueError(
                    'dataset of the index {} has a wrong length'.format(i))
        self._datasets = datasets
        self._length = length
        # Indexer backing the `features` property (2-axis extraction).
        self._features_indexer = NumpyTupleDatasetFeatureIndexer(self)

    def __getitem__(self, index):
        # Extract the same `index` from every underlying dataset.
        batches = [dataset[index] for dataset in self._datasets]
        if isinstance(index, (slice, list, numpy.ndarray)):
            # Fancy indexing returns a list of per-example tuples rather
            # than a tuple of batched arrays.
            length = len(batches[0])
            return [tuple([batch[i] for batch in batches])
                    for i in six.moves.range(length)]
        else:
            # A scalar index returns a single example as a tuple.
            return tuple(batches)

    def __len__(self):
        return self._length

    def get_datasets(self):
        # Raw underlying datasets (a tuple of ndarrays), in order.
        return self._datasets

    @property
    def converter(self):
        # Default batch converter compatible with this dataset.
        return concat_mols

    @property
    def features(self):
        """Extract features according to the specified index.

        - axis 0 is used to specify dataset id (`i`-th dataset)
        - axis 1 is used to specify feature index

        .. admonition:: Example

           >>> import numpy
           >>> from chainer_chemistry.datasets import NumpyTupleDataset
           >>> x = numpy.array([0, 1, 2], dtype=numpy.float32)
           >>> t = x * x
           >>> numpy_tuple_dataset = NumpyTupleDataset(x, t)
           >>> targets = numpy_tuple_dataset.features[:, 1]
           >>> print('targets', targets)  # We can extract only target value
           targets [0, 1, 4]

        """
        return self._features_indexer

    @classmethod
    def save(cls, filepath, numpy_tuple_dataset):
        """save the dataset to filepath in npz format

        Args:
            filepath (str): filepath to save dataset. It is recommended to
                end with '.npz' extension.
numpy_tuple_dataset (NumpyTupleDataset): dataset instance """ if not isinstance(numpy_tuple_dataset, NumpyTupleDataset): raise TypeError('numpy_tuple_dataset is not instance of ' 'NumpyTupleDataset, got {}' .format(type(numpy_tuple_dataset))) numpy.savez(filepath, *numpy_tuple_dataset._datasets) @classmethod def load(cls, filepath, allow_pickle=True): if not os.path.exists(filepath): return None load_data = numpy.load(filepath, allow_pickle=allow_pickle) result = [] i = 0 while True: key = 'arr_{}'.format(i) if key in load_data.keys(): result.append(load_data[key]) i += 1 else: break return NumpyTupleDataset(*result) ================================================ FILE: chainer_chemistry/datasets/qm9.py ================================================ import glob from logging import getLogger import os import shutil import tarfile import tempfile from chainer.dataset import download import numpy import pandas from tqdm import tqdm from chainer_chemistry.dataset.parsers.csv_file_parser import CSVFileParser from chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor # NOQA download_url = 'https://ndownloader.figshare.com/files/3195389' file_name = 'qm9.csv' _root = 'pfnet/chainer/qm9' _label_names = ['A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv'] _smiles_column_names = ['SMILES1', 'SMILES2'] def get_qm9_label_names(): """Returns label names of QM9 datasets.""" return _label_names def get_qm9(preprocessor=None, labels=None, return_smiles=False, target_index=None): """Downloads, caches and preprocesses QM9 dataset. Args: preprocessor (BasePreprocessor): Preprocessor. This should be chosen based on the network to be trained. If it is None, default `AtomicNumberPreprocessor` is used. labels (str or list): List of target labels. return_smiles (bool): If set to ``True``, smiles array is also returned. target_index (list or None): target index list to partially extract dataset. 
            If None (default), all examples are parsed.

    Returns:
        dataset, which is composed of `features`, which depends on
        `preprocess_method`.

    """
    labels = labels or get_qm9_label_names()
    if isinstance(labels, str):
        labels = [labels, ]

    def postprocess_label(label_list):
        # This is regression task, cast to float value.
        return numpy.asarray(label_list, dtype=numpy.float32)

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()
    # QM9 stores two SMILES columns; 'SMILES1' is used as the input here.
    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels, smiles_col='SMILES1')
    result = parser.parse(get_qm9_filepath(), return_smiles=return_smiles,
                          target_index=target_index)

    if return_smiles:
        return result['dataset'], result['smiles']
    else:
        return result['dataset']


def get_qm9_filepath(download_if_not_exist=True):
    """Construct a filepath which stores qm9 dataset for config_name

    This method checks whether the file exists or not, and downloads it
    if necessary.

    Args:
        download_if_not_exist (bool): If `True` download dataset
            if it is not downloaded yet.

    Returns (str): file path for qm9 dataset (formatted to csv)
    """
    cache_path = _get_qm9_filepath()
    if not os.path.exists(cache_path):
        if download_if_not_exist:
            is_successful = download_and_extract_qm9(save_filepath=cache_path)
            if not is_successful:
                logger = getLogger(__name__)
                logger.warning('Download failed.')
    # The path is returned even when the download failed, so callers
    # should be prepared for a missing file.
    return cache_path


def _get_qm9_filepath():
    """Construct a filepath which stores QM9 dataset in csv

    This method does not check if the file is already downloaded or not.
Returns (str): filepath for qm9 dataset """ cache_root = download.get_dataset_directory(_root) cache_path = os.path.join(cache_root, file_name) return cache_path def download_and_extract_qm9(save_filepath): logger = getLogger(__name__) logger.warning('Extracting QM9 dataset, it takes time...') download_file_path = download.cached_download(download_url) tf = tarfile.open(download_file_path, 'r') temp_dir = tempfile.mkdtemp() tf.extractall(temp_dir) file_re = os.path.join(temp_dir, '*.xyz') file_pathes = glob.glob(file_re) # Make sure the order is sorted file_pathes.sort() ls = [] for path in tqdm(file_pathes): with open(path, 'r') as f: data = [line.strip() for line in f] num_atom = int(data[0]) properties = list(map(float, data[1].split('\t')[1:])) smiles = data[3 + num_atom].split('\t') new_ls = smiles + properties ls.append(new_ls) df = pandas.DataFrame(ls, columns=_smiles_column_names + _label_names) df.to_csv(save_filepath) shutil.rmtree(temp_dir) return True ================================================ FILE: chainer_chemistry/datasets/reddit/reddit.py ================================================ from logging import getLogger import os from zipfile import ZipFile import networkx as nx import numpy import scipy from chainer.dataset import download download_url = 'https://s3.us-east-2.amazonaws.com/dgl.ai/dataset/reddit.zip' feat_file_name = 'reddit_data.npz' edge_file_name = 'reddit_graph.npz' _root = 'pfnet/chainer/reddit' def reddit_to_networkx(dirpath): print("Loading graph data") coo_adj = scipy.sparse.load_npz(os.path.join(dirpath, edge_file_name)) G = nx.from_scipy_sparse_matrix(coo_adj) print("Loading node feature and label") # node feature, edge label reddit_data = numpy.load(os.path.join(dirpath, feat_file_name)) G.graph['x'] = reddit_data['feature'].astype(numpy.float32) G.graph['y'] = reddit_data['label'].astype(numpy.int32) G.graph['label_num'] = 41 # G = nx.convert_node_labels_to_integers(G) print("Finish loading graph: {}".format(dirpath)) 
    return G


def get_reddit_dirpath(download_if_not_exist=True):
    # type: (bool) -> str
    """Construct a dirpath which stores reddit dataset.

    This method check whether the file exist or not, and downloaded it
    if necessary.

    Args:
        download_if_not_exist (bool): If ``True``, download dataset
            if it is not downloaded yet.

    Returns:
        dirpath (str): directory path for reddit dataset.
    """
    feat_cache_path, edge_cache_path = get_reddit_filepath(
        download_if_not_exist=download_if_not_exist)
    dirpath = os.path.dirname(feat_cache_path)
    dirpath2 = os.path.dirname(edge_cache_path)
    # Both cache files are expected to live in the same directory.
    assert dirpath == dirpath2
    return dirpath


def get_reddit_filepath(download_if_not_exist=True):
    # type: (bool) -> Tuple[str, str]
    # NOTE(review): `Tuple` is used in these type comments but
    # `typing.Tuple` does not appear to be imported -- harmless at
    # runtime, but worth confirming for type checkers.
    """Construct a filepath which stores reddit dataset.

    This method check whether the file exist or not, and downloaded it
    if necessary.

    Args:
        download_if_not_exist (bool): If ``True``, download dataset
            if it is not downloaded yet.

    Returns:
        feat_cache_path (str): file path for reddit dataset (features).
        edge_cache_path (str): file path for reddit dataset (edge index).
    """
    feat_cache_path, edge_cache_path = _get_reddit_filepath()
    # Only the feature file is checked; both npz files are extracted
    # together by `download_and_extract_reddit`.
    if not os.path.exists(feat_cache_path):
        if download_if_not_exist:
            is_successful = download_and_extract_reddit(
                save_dirpath=os.path.dirname(feat_cache_path))
            if not is_successful:
                logger = getLogger(__name__)
                logger.warning('Download failed.')
    return feat_cache_path, edge_cache_path


def _get_reddit_filepath():
    # type: () -> Tuple[str, str]
    """Construct a filepath which stores reddit dataset.

    This method does not check if the file is already downloaded or not.

    Returns:
        feat_cache_path (str): file path for reddit dataset (features).
        edge_cache_path (str): file path for reddit dataset (edge index).
""" cache_root = download.get_dataset_directory(_root) feat_cache_path = os.path.join(cache_root, feat_file_name) edge_cache_path = os.path.join(cache_root, edge_file_name) return feat_cache_path, edge_cache_path def download_and_extract_reddit(save_dirpath): # type: (str) -> bool print('downloading reddit dataset...') download_file_path = download.cached_download(download_url) print('extracting reddit dataset...') zip = ZipFile(download_file_path, 'r') zip.extractall(save_dirpath) return True ================================================ FILE: chainer_chemistry/datasets/tox21.py ================================================ from logging import getLogger import os import shutil import zipfile from chainer.dataset import download import numpy from chainer_chemistry.dataset.parsers.sdf_file_parser import SDFFileParser from chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor # NOQA _config = { 'train': { 'url': 'https://tripod.nih.gov/tox21/challenge/download?' 'id=tox21_10k_data_allsdf', 'filename': 'tox21_10k_data_all.sdf' }, 'val': { 'url': 'https://tripod.nih.gov/tox21/challenge/download?' 'id=tox21_10k_challenge_testsdf', 'filename': 'tox21_10k_challenge_test.sdf' }, 'test': { 'url': 'https://tripod.nih.gov/tox21/challenge/download?' 'id=tox21_10k_challenge_scoresdf', 'filename': 'tox21_10k_challenge_score.sdf' } } _root = 'pfnet/chainer/tox21' _label_names = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'] def get_tox21_label_names(): """Returns label names of Tox21 datasets.""" return _label_names def get_tox21(preprocessor=None, labels=None, return_smiles=False, train_target_index=None, val_target_index=None, test_target_index=None): """Downloads, caches and preprocesses Tox21 dataset. Args: preprocesssor (BasePreprocessor): Preprocessor. This should be chosen based on the network to be trained. 
            If it is None, default `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        return_smiles (bool): If set to True, smiles array is also returned.
        train_target_index (list or None): target index list to partially
            extract train dataset. If None (default), all examples are
            parsed.
        val_target_index (list or None): target index list to partially
            extract val dataset. If None (default), all examples are parsed.
        test_target_index (list or None): target index list to partially
            extract test dataset. If None (default), all examples are parsed.

    Returns:
        The 3-tuple consisting of train, validation and test
        datasets, respectively. Each dataset is composed of `features`,
        which depends on `preprocess_method`.
    """
    labels = labels or get_tox21_label_names()
    if isinstance(labels, str):
        labels = [labels, ]

    def postprocess_label(label_list):
        # Set -1 to the place where the label is not found,
        # this corresponds to not calculate loss with `sigmoid_cross_entropy`
        t = numpy.array([-1 if label is None else label for label in
                         label_list], dtype=numpy.int32)
        return t

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()
    parser = SDFFileParser(preprocessor, postprocess_label=postprocess_label,
                           labels=labels)
    # Each split is downloaded/cached on demand by `get_tox21_filepath`.
    train_result = parser.parse(
        get_tox21_filepath('train'), return_smiles=return_smiles,
        target_index=train_target_index
    )
    val_result = parser.parse(
        get_tox21_filepath('val'), return_smiles=return_smiles,
        target_index=val_target_index
    )
    test_result = parser.parse(
        get_tox21_filepath('test'), return_smiles=return_smiles,
        target_index=test_target_index
    )

    if return_smiles:
        train, train_smiles = train_result['dataset'], train_result['smiles']
        val, val_smiles = val_result['dataset'], val_result['smiles']
        test, test_smiles = test_result['dataset'], test_result['smiles']
        return train, val, test, train_smiles, val_smiles, test_smiles
    else:
        train = train_result['dataset']
        val = val_result['dataset']
        test = test_result['dataset']
        return train, val, test


def _get_tox21_filepath(dataset_type):
    """Returns a file path in which the tox21 dataset is cached.

    This function returns a file path in which `dataset_type` of the tox21
    dataset is cached. Note that this function does not check if the dataset
    has actually been downloaded or not.

    Args:
        dataset_type(str): Name of the target dataset type.
            Either 'train', 'val', or 'test'.

    Returns (str): file path for the tox21 dataset
    """
    if dataset_type not in _config.keys():
        raise ValueError("Invalid dataset type '{}'. Accepted values are "
                         "'train', 'val' or 'test'.".format(dataset_type))

    c = _config[dataset_type]
    sdffile = c['filename']

    cache_root = download.get_dataset_directory(_root)
    cache_path = os.path.join(cache_root, sdffile)
    return cache_path


def get_tox21_filepath(dataset_type, download_if_not_exist=True):
    """Returns a file path in which the tox21 dataset is cached.

    This function returns a file path in which `dataset_type` of the tox21
    dataset is or will be cached. If the dataset is not cached and
    if ``download_if_not_exist`` is ``True``, this function also downloads
    the dataset.

    Args:
        dataset_type: Name of the target dataset type.
            Either 'train', 'val', or 'test'
        download_if_not_exist (bool): If `True` download dataset
            if it is not downloaded yet.
Returns (str): file path for tox21 dataset """ cache_filepath = _get_tox21_filepath(dataset_type) if not os.path.exists(cache_filepath): if download_if_not_exist: is_successful = _download_and_extract_tox21(dataset_type, cache_filepath) if not is_successful: logger = getLogger(__name__) logger.warning('Download failed.') return cache_filepath def _download_and_extract_tox21(config_name, save_filepath): is_successful = False c = _config[config_name] url = c['url'] sdffile = c['filename'] # Download tox21 dataset download_file_path = download.cached_download(url) # Extract zipfile to get sdffile with zipfile.ZipFile(download_file_path, 'r') as z: z.extract(sdffile) shutil.move(sdffile, save_filepath) is_successful = True return is_successful def download_and_extract_tox21(): """Downloads and extracts Tox21 dataset. Returns: None """ for config in ['train', 'val', 'test']: _download_and_extract_tox21(config, _get_tox21_filepath(config)) ================================================ FILE: chainer_chemistry/datasets/zinc.py ================================================ from logging import getLogger import os from chainer.dataset import download import numpy import pandas from chainer_chemistry.dataset.parsers.csv_file_parser import CSVFileParser from chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor # NOQA download_url = 'https://raw.githubusercontent.com/aspuru-guzik-group/chemical_vae/master/models/zinc_properties/250k_rndm_zinc_drugs_clean_3.csv' # NOQA file_name_250k = 'zinc250k.csv' _root = 'pfnet/chainer/zinc' _label_names = ['logP', 'qed', 'SAS'] _smiles_column_names = ['smiles'] def get_zinc250k_label_names(): """Returns label names of ZINC250k datasets.""" return _label_names def get_zinc250k(preprocessor=None, labels=None, return_smiles=False, target_index=None): """Downloads, caches and preprocesses Zinc 250K dataset. Args: preprocessor (BasePreprocessor): Preprocessor. 
This should be chosen based on the network to be trained. If it is None, default `AtomicNumberPreprocessor` is used. labels (str or list): List of target labels. return_smiles (bool): If set to ``True``, smiles array is also returned. target_index (list or None): target index list to partially extract dataset. If None (default), all examples are parsed. Returns: dataset, which is composed of `features`, which depends on `preprocess_method`. """ labels = labels or get_zinc250k_label_names() if isinstance(labels, str): labels = [labels, ] def postprocess_label(label_list): # This is regression task, cast to float value. return numpy.asarray(label_list, dtype=numpy.float32) if preprocessor is None: preprocessor = AtomicNumberPreprocessor() parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label, labels=labels, smiles_col='smiles') result = parser.parse(get_zinc250k_filepath(), return_smiles=return_smiles, target_index=target_index) if return_smiles: return result['dataset'], result['smiles'] else: return result['dataset'] def get_zinc250k_filepath(download_if_not_exist=True): """Construct a filepath which stores ZINC250k dataset for config_name This method check whether the file exist or not, and downloaded it if necessary. Args: download_if_not_exist (bool): If `True` download dataset if it is not downloaded yet. Returns (str): file path for ZINC250k dataset (csv format) """ cache_path = _get_zinc250k_filepath() if not os.path.exists(cache_path): if download_if_not_exist: is_successful = download_and_extract_zinc250k( save_filepath=cache_path) if not is_successful: logger = getLogger(__name__) logger.warning('Download failed.') return cache_path def _get_zinc250k_filepath(): """Construct a filepath which stores ZINC250k dataset in csv This method does not check if the file is already downloaded or not. 
Returns (str): filepath for ZINC250k dataset """ cache_root = download.get_dataset_directory(_root) cache_path = os.path.join(cache_root, file_name_250k) return cache_path def _remove_new_line(s): return s.replace('\n', '') def download_and_extract_zinc250k(save_filepath): logger = getLogger(__name__) logger.info('Extracting ZINC250k dataset...') download_file_path = download.cached_download(download_url) df = pandas.read_csv(download_file_path) # 'smiles' column contains '\n', need to remove it. df['smiles'] = df['smiles'].apply(_remove_new_line) df.to_csv(save_filepath, columns=_smiles_column_names + _label_names) return True ================================================ FILE: chainer_chemistry/functions/__init__.py ================================================ from chainer_chemistry.functions.activation.megnet_softplus import megnet_softplus # NOQA from chainer_chemistry.functions.activation.shifted_softplus import shifted_softplus # NOQA from chainer_chemistry.functions.activation.softmax import softmax # NOQA from chainer_chemistry.functions.evaluation.r2_score import r2_score # NOQA from chainer_chemistry.functions.evaluation.r2_score import R2Score # NOQA from chainer_chemistry.functions.loss.mean_absolute_error import mean_absolute_error # NOQA from chainer_chemistry.functions.loss.mean_absolute_error import MeanAbsoluteError # NOQA from chainer_chemistry.functions.loss.mean_squared_error import mean_squared_error # NOQA from chainer_chemistry.functions.loss.mean_squared_error import MeanSquaredError # NOQA from chainer_chemistry.functions.math.matmul import matmul # NOQA ================================================ FILE: chainer_chemistry/functions/activation/__init__.py ================================================ ================================================ FILE: chainer_chemistry/functions/activation/megnet_softplus.py ================================================ from chainer import functions def megnet_softplus(x): """Modified 
softplus function used by MEGNet

    The original implementation is below.
    https://github.com/materialsvirtuallab/megnet/blob/f91773f0f3fa8402b494638af9ef2ed2807fcba7/megnet/activations.py#L6

    Args:
        x (Variable): Input variable

    Returns:
        output (Variable): Output variable whose shape is same with `x`
    """
    # relu(x) + log(0.5 * exp(-|x|) + 0.5) is a numerically stable
    # rewriting of log(1 + exp(x)) + log(0.5): it never exponentiates a
    # positive value, so it cannot overflow for large |x|.
    return functions.relu(x) + \
        functions.log(0.5 * functions.exp(-functions.absolute(x)) + 0.5)


================================================
FILE: chainer_chemistry/functions/activation/shifted_softplus.py
================================================
import chainer
from chainer import functions


def shifted_softplus(x, beta=1, shift=0.5, threshold=20):
    """shifted softplus function, which holds f(0)=0.

    Args:
        x (Variable): Input variable
        beta (float): Parameter :math:`\\beta`.
        shift (float): Shift Parameter
        threshold (float): threshold to avoid overflow

    Returns:
        output (Variable): Output variable whose shape is same with `x`
    """
    xp = chainer.cuda.get_array_module(x)
    # For inputs above `threshold`, softplus(x) ~= x, so the raw input is
    # used directly; this avoids overflow inside exp() in softplus.
    cond = chainer.as_variable(x).array > threshold
    x = functions.where(cond, x, functions.softplus(x, beta=beta))
    # With the default shift=0.5: softplus(0) + log(0.5) = log(2) - log(2)
    # = 0, which gives the f(0)=0 property stated above.
    x += xp.log(shift)
    return x


================================================
FILE: chainer_chemistry/functions/activation/softmax.py
================================================
from chainer import functions


def softmax(x, axis=1, mask=None, mask_value=1e10):
    """softmax function, which supports `mask`.

    Args:
        x (Variable): Input variable
        axis (int): The axis along which the softmax is to be computed.
        mask (Variable or None): Default value is `None` which does not use
            mask, this case the result is same with original `softmax`
            computation.
            When `mask` is not `None`, it is assumed to have value 1 or 0.
            1 indicates actual feature, and 0 indicates virtual feature to
            be masked.
        mask_value (int): The value used for masking.
Returns: output (Variable): Output variable whose shape is same with `x` """ if mask is None: h = x else: if x.shape != mask.shape: raise ValueError("x.shape={} and mask.shape={} must be same!" .format(x.shape, mask.shape)) h = x + (mask - 1.) * mask_value return functions.softmax(h, axis=axis) ================================================ FILE: chainer_chemistry/functions/evaluation/__init__.py ================================================ ================================================ FILE: chainer_chemistry/functions/evaluation/r2_score.py ================================================ from chainer.backends import cuda from chainer import function from chainer.utils import type_check class R2Score(function.Function): def __init__(self, sample_weight, multioutput, ignore_nan=False): if sample_weight is not None: raise NotImplementedError() if multioutput in ['uniform_average', 'raw_values']: self.multioutput = multioutput else: raise ValueError("invalid multioutput argument") self.ignore_nan = ignore_nan def check_type_forward(self, in_types): type_check.expect(in_types.size() == 2) pred_type, true_type = in_types type_check.expect( pred_type.dtype.kind == 'f', true_type.dtype.kind == 'f' ) type_check.expect( pred_type.shape == true_type.shape, ) def forward(self, inputs): xp = cuda.get_array_module(*inputs) pred, true = inputs diff = pred - true dev = true - xp.mean(true, axis=0) if self.ignore_nan: diff[xp.isnan(diff)] = 0. dev[xp.isnan(dev)] = 0. 
SS_res = xp.asarray( xp.sum(diff ** 2, axis=0)) SS_tot = xp.asarray( xp.sum(dev ** 2, axis=0)) SS_tot_iszero = SS_tot == 0 SS_tot[SS_tot_iszero] = 1 # Assign dummy value to avoid zero-division ret = xp.where( SS_tot_iszero, 0.0, 1 - SS_res / SS_tot).astype(pred.dtype) if self.multioutput == 'uniform_average': return xp.asarray(ret.mean()), elif self.multioutput == 'raw_values': return ret, def r2_score(pred, true, sample_weight=None, multioutput='uniform_average', ignore_nan=False): """Computes R^2(coefficient of determination) regression score function. Args: pred(Variable): Variable holding a vector, matrix or tensor of estimated target values. true(Variable): Variable holding a vector, matrix or tensor of correct target values. sample_weight: This argument is for compatibility with scikit-learn's implementation of r2_score. Current implementation admits None only. multioutput(string): ['uniform_average', 'raw_values']. if 'uniform_average', this function returns an average of R^2 score of multiple output. If 'raw_average', this function return a set of R^2 score of multiple output. Returns: Variable: A Variable holding a scalar array of the R^2 score if 'multioutput' is 'uniform_average' or a vector of R^2 scores if 'multioutput' is 'raw_values'. .. note:: This function is non-differentiable. 
""" return R2Score(sample_weight=sample_weight, multioutput=multioutput, ignore_nan=ignore_nan)(pred, true) ================================================ FILE: chainer_chemistry/functions/loss/__init__.py ================================================ ================================================ FILE: chainer_chemistry/functions/loss/mean_absolute_error.py ================================================ import numpy import chainer from chainer.backends import cuda from chainer import function_node from chainer.utils import type_check class MeanAbsoluteError(function_node.FunctionNode): """Mean absolute error function.""" def __init__(self, ignore_nan=False): # TODO(mottodora): implement task weight calculation self.ignore_nan = ignore_nan def check_type_forward(self, in_types): type_check.expect(in_types.size() == 2) type_check.expect( in_types[0].dtype == numpy.float32, in_types[1].dtype == numpy.float32, in_types[0].shape == in_types[1].shape ) def forward_cpu(self, inputs): self.retain_inputs((0, 1)) x0, x1 = inputs diff = (inputs[0] - inputs[1]).ravel() # TODO(mottodora): add reduce option if self.ignore_nan: diff[numpy.isnan(diff)] = 0. return numpy.array(abs(diff).sum() / diff.size, dtype=diff.dtype), def forward_gpu(self, inputs): self.retain_inputs((0, 1)) cupy = cuda.cupy diff = (inputs[0] - inputs[1]).ravel() if self.ignore_nan: diff[cupy.isnan(diff)] = 0. return abs(diff).sum() / diff.dtype.type(diff.size), def backward(self, indexes, gy): x0, x1 = self.get_retained_inputs() xp = cuda.get_array_module(x0) diff = x0 - x1 if self.ignore_nan: diff = chainer.functions.where(xp.isnan(diff.array), xp.zeros_like(diff.array), diff) gy0 = chainer.functions.broadcast_to(gy[0], diff.shape) gx0 = gy0 * chainer.functions.sign(diff) * 1. / diff.size return gx0, -gx0 def mean_absolute_error(x0, x1, ignore_nan=False): """Mean absolute error function. This function computes mean absolute error between two variables. The mean is taken over the minibatch. 
Args: x0 (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): Input variable. x1 (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): Input variable. ignore_nan (bool): If `True`, this function compute mean absolute error ignoring NaNs. The arithmetic mean is the sum of the non-NaN elements along the axis divided by the number of whole elements. Returns: ~chainer.Variable: A variable holding an array representing the mean absolute error of two inputs. """ return MeanAbsoluteError(ignore_nan).apply((x0, x1))[0] ================================================ FILE: chainer_chemistry/functions/loss/mean_squared_error.py ================================================ import numpy from chainer import cuda from chainer import function_node import chainer.functions from chainer.utils import type_check class MeanSquaredError(function_node.FunctionNode): """Mean squared error (a.k.a. Euclidean loss) function.""" def __init__(self, ignore_nan=False): # TODO(mottodora): implement task weight calculation self.ignore_nan = ignore_nan def check_type_forward(self, in_types): type_check.expect(in_types.size() == 2) type_check.expect( in_types[0].dtype == numpy.float32, in_types[1].dtype == numpy.float32, in_types[0].shape == in_types[1].shape ) def forward_cpu(self, inputs): self.retain_inputs((0, 1)) diff = (inputs[0] - inputs[1]).ravel() # TODO(mottodora): add reduce option if self.ignore_nan: diff[numpy.isnan(diff)] = 0. return numpy.array(diff.dot(diff) / diff.size, dtype=diff.dtype), def forward_gpu(self, inputs): cupy = cuda.cupy self.retain_inputs((0, 1)) diff = (inputs[0] - inputs[1]).ravel() # TODO(mottodora): add reduce option if self.ignore_nan: diff[cupy.isnan(diff)] = 0. 
return diff.dot(diff) / diff.dtype.type(diff.size), def backward(self, indexes, gy): x0, x1 = self.get_retained_inputs() xp = cuda.get_array_module(x0) ret = [] diff = x0 - x1 if self.ignore_nan: diff = chainer.functions.where(xp.isnan(diff.array), xp.zeros_like(diff.array), diff) gy0 = chainer.functions.broadcast_to(gy[0], diff.shape) gx0 = gy0 * diff * (2. / diff.size) if 0 in indexes: ret.append(gx0) if 1 in indexes: ret.append(-gx0) return ret def mean_squared_error(x0, x1, ignore_nan=False): """Mean squared error function. This function computes mean squared error between two variables. The mean is taken over the minibatch. Note that the error is not scaled by 1/2. Args: x0 (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): Input variable. x1 (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): Input variable. ignore_nan (bool): If `True`, this function compute mean squared error ignoring NaNs. The arithmetic mean is the sum of the non-NaN elements along the axis divided by the number of whole elements. Returns: ~chainer.Variable: A variable holding an array representing the mean squared error of two inputs. """ return MeanSquaredError(ignore_nan).apply((x0, x1))[0] ================================================ FILE: chainer_chemistry/functions/math/__init__.py ================================================ ================================================ FILE: chainer_chemistry/functions/math/matmul.py ================================================ import chainer if int(chainer.__version__[0]) >= 3: _matmul_fn = chainer.functions.matmul else: _matmul_fn = chainer.functions.batch_matmul def matmul(a, b, transa=False, transb=False): """Computes the matrix multiplication of two arrays. Args: a (Variable): The left operand of the matrix multiplication. If ``a`` and ``b`` are both 1-D arrays, ``matmul`` returns a dot product of vector `a` and vector `b`. 
If 2-D arrays, ``matmul`` returns matrix product of ``a`` and ``b``. If arrays' dimension is larger than 2, they are treated as a stack of matrices residing in the last two indexes. ``matmul`` returns a stack of each two arrays. ``a`` and ``b`` must have the same dimension. b (Variable): The right operand of the matrix multiplication. Its array is treated as a matrix in the same way as ``a``'s array. transa (bool): If ``True``, each matrices in ``a`` will be transposed. If ``a.ndim == 1``, do nothing. transb (bool): If ``True``, each matrices in ``b`` will be transposed. If ``b.ndim == 1``, do nothing. Returns: ~chainer.Variable: The result of the matrix multiplication. .. admonition:: Example >>> a = np.array([[1, 0], [0, 1]], 'f') >>> b = np.array([[4, 1], [2, 2]], 'f') >>> F.matmul(a, b).data array([[ 4., 1.], [ 2., 2.]], dtype=float32) """ return _matmul_fn(a, b, transa=transa, transb=transb) ================================================ FILE: chainer_chemistry/iterators/__init__.py ================================================ from chainer_chemistry.iterators.balanced_serial_iterator import BalancedSerialIterator # NOQA from chainer_chemistry.iterators.index_iterator import IndexIterator # NOQA ================================================ FILE: chainer_chemistry/iterators/balanced_serial_iterator.py ================================================ from __future__ import division from logging import getLogger from chainer.dataset import iterator import numpy from chainer_chemistry.iterators.index_iterator import IndexIterator class BalancedSerialIterator(iterator.Iterator): """Dataset iterator that serially reads the examples with balancing label. Args: dataset: Dataset to iterate. batch_size (int): Number of examples within each minibatch. labels (list or numpy.ndarray): 1d array which specifies label feature of `dataset`. Its size must be same as the length of `dataset`. repeat (bool): If ``True``, it infinitely loops over the dataset. 
Otherwise, it stops iteration at the end of the first epoch. shuffle (bool): If ``True``, the order of examples is shuffled at the beginning of each epoch. Otherwise, the order is permanently same as that of `dataset`. batch_balancing (bool): If ``True``, examples are sampled in the way that each label examples are roughly evenly sampled in each minibatch. Otherwise, the iterator only guarantees that total numbers of examples are same among label features. ignore_labels (int or list or None): Labels to be ignored. If not ``None``, the example whose label is in `ignore_labels` are not sampled by this iterator. """ def __init__(self, dataset, batch_size, labels, repeat=True, shuffle=True, batch_balancing=False, ignore_labels=None, logger=getLogger(__name__)): assert len(dataset) == len(labels) labels = numpy.asarray(labels) if len(dataset) != labels.size: raise ValueError('dataset length {} and labels size {} must be ' 'same!'.format(len(dataset), labels.size)) labels = numpy.ravel(labels) self.dataset = dataset self.batch_size = batch_size self.labels = labels self.logger = logger if ignore_labels is None: ignore_labels = [] elif isinstance(ignore_labels, int): ignore_labels = [ignore_labels, ] self.ignore_labels = list(ignore_labels) self._repeat = repeat self._shuffle = shuffle self._batch_balancing = batch_balancing self.labels_iterator_dict = {} max_label_count = -1 include_label_count = 0 for label in numpy.unique(labels): label_index = numpy.argwhere(labels == label).ravel() label_count = len(label_index) ii = IndexIterator(label_index, shuffle=shuffle) self.labels_iterator_dict[label] = ii if label in self.ignore_labels: continue if max_label_count < label_count: max_label_count = label_count include_label_count += 1 self.max_label_count = max_label_count self.N_augmented = max_label_count * include_label_count self.reset() def __next__(self): if not self._repeat and self.epoch > 0: raise StopIteration self._previous_epoch_detail = self.epoch_detail i = 
self.current_position i_end = i + self.batch_size N = self.N_augmented batch = [self.dataset[index] for index in self._order[i:i_end]] if i_end >= N: if self._repeat: rest = i_end - N self._update_order() if rest > 0: batch.extend([self.dataset[index] for index in self._order[:rest]]) self.current_position = rest else: self.current_position = 0 self.epoch += 1 self.is_new_epoch = True else: self.is_new_epoch = False self.current_position = i_end return batch next = __next__ @property def epoch_detail(self): return self.epoch + self.current_position / self.N_augmented @property def previous_epoch_detail(self): # This iterator saves ``-1`` as _previous_epoch_detail instead of # ``None`` because some serializers do not support ``None``. if self._previous_epoch_detail < 0: return None return self._previous_epoch_detail def serialize(self, serializer): self.current_position = serializer('current_position', self.current_position) self.epoch = serializer('epoch', self.epoch) self.is_new_epoch = serializer('is_new_epoch', self.is_new_epoch) if self._order is not None: serializer('order', self._order) self._previous_epoch_detail = serializer( 'previous_epoch_detail', self._previous_epoch_detail) for label, index_iterator in self.labels_iterator_dict.items(): self.labels_iterator_dict[label].serialize( serializer['index_iterator_{}'.format(label)]) def _update_order(self): indices_list = [] for label, index_iterator in self.labels_iterator_dict.items(): if label in self.ignore_labels: # Not include index of ignore_labels continue indices_list.append(index_iterator.get_next_indices( self.max_label_count)) if self._batch_balancing: # `indices_list` contains same number of indices of each label. # we can `transpose` and `ravel` it to get each label's index in # sequence, which guarantees that label in each batch is balanced. 
indices = numpy.array(indices_list).transpose().ravel() self._order = indices else: indices = numpy.array(indices_list).ravel() self._order = numpy.random.permutation(indices) def reset(self): self._update_order() self.current_position = 0 self.epoch = 0 self.is_new_epoch = False # use -1 instead of None internally. self._previous_epoch_detail = -1. def show_label_stats(self): self.logger.warning(' label count rate status') total = 0 for label, index_iterator in self.labels_iterator_dict.items(): count = len(index_iterator.index_list) total += count for label, index_iterator in self.labels_iterator_dict.items(): count = len(index_iterator.index_list) rate = count / len(self.dataset) status = 'ignored' if label in self.ignore_labels else 'included' self.logger.warning('{:>8} {:>8} {:>8.4f} {:>10}' .format(label, count, rate, status)) ================================================ FILE: chainer_chemistry/iterators/index_iterator.py ================================================ import numpy from chainer.dataset import iterator class IndexIterator(iterator.Iterator): """Index iterator IndexIterator is used internally in `BalancedSerialIterator`, as each label's index iterator Args: index_list (list): list of int which represents indices. shuffle (bool): shuffle flag. If True, indices specified by ``index_list`` will be randomly shuffled. num (int): number of indices to be extracted when ``___next___`` is called. 
""" def __init__(self, index_list, shuffle=True, num=0): self.index_list = numpy.asarray(index_list) assert self.index_list.ndim == 1 self.index_length = len(index_list) self.current_index_list = None self.current_pos = 0 self.shuffle = shuffle self.num = num self.update_current_index_list() def update_current_index_list(self): if self.shuffle: self.current_index_list = numpy.random.permutation(self.index_list) else: self.current_index_list = self.index_list def __next__(self): return self.get_next_indices(self.num) def get_next_indices(self, num): """get next indices Args: num (int): number for indices to extract. Returns (numpy.ndarray): 1d array of indices .. admonition:: Example >>> ii = IndexIterator([1, 3, 5, 10], shuffle=True) >>> print(ii.get_next_indices(5)) [ 5 1 10 3 10] >>> print(ii.get_next_indices(5)) [ 3 1 5 10 1] """ indices = [] if self.current_pos + num < self.index_length: indices.append(self.current_index_list[ self.current_pos: self.current_pos + num]) self.current_pos += num else: indices.append(self.current_index_list[self.current_pos:]) num -= (self.index_length - self.current_pos) # When `num` is twice bigger than `self.index_length`, `index_list` # is repeated `q` times to get desired length of `indices`. 
q, r = divmod(num, self.index_length) if self.shuffle: for _ in range(q): indices.append(numpy.random.permutation(self.index_list)) else: indices.append(numpy.tile(self.index_list, q)) self.update_current_index_list() indices.append(self.current_index_list[:r]) self.current_pos = r return numpy.concatenate(indices).ravel() def serialize(self, serializer): self.current_index_list = serializer('current_index_list', self.current_index_list) self.current_pos = serializer('current_pos', self.current_pos) ================================================ FILE: chainer_chemistry/link_hooks/__init__.py ================================================ try: from chainer_chemistry.link_hooks import variable_monitor_link_hook # NOQA from chainer_chemistry.link_hooks.variable_monitor_link_hook import VariableMonitorLinkHook # NOQA is_link_hooks_available = True except ImportError: import warnings warnings.warn('link_hooks failed to import, you need to upgrade chainer ' 'version to use link_hooks feature') is_link_hooks_available = False ================================================ FILE: chainer_chemistry/link_hooks/variable_monitor_link_hook.py ================================================ from collections import OrderedDict from logging import getLogger import chainer from chainer.link_hook import _ForwardPostprocessCallbackArgs, _ForwardPreprocessCallbackArgs # NOQA def _default_extract_pre(hook, args): """Default extract_fn when `timing='pre` Args: hook (VariableMonitorLinkHook): args (_ForwardPreprocessCallbackArgs): Returns (chainer.Variable): First input variable to the link. """ return args.args[0] def _default_extract_post(hook, args): """Default extract_fn when `timing='post` Args: hook (VariableMonitorLinkHook): args (_ForwardPostprocessCallbackArgs): Returns (chainer.Variable): Output variable to the link. 
""" return args.out class VariableMonitorLinkHook(chainer.LinkHook): """Monitor Variable of specific link input/output Args: target_link (chainer.Link): target link to monitor variable. name (str): name of this link hook timing (str): timing of this link hook to monitor. 'pre' or 'post'. If 'pre', the input of `target_link` is monitored. If 'post', the output of `target_link` is monitored. extract_fn (callable): Specify custom method to extract target variable Default behavior is to extract first input when `timing='pre'`, or extract output when `timing='post'`. It takes `hook, args` as argument. logger: .. admonition:: Example >>> import numpy >>> from chainer import cuda, links, functions # NOQA >>> from chainer_chemistry.link_hooks.variable_monitor_link_hook import VariableMonitorLinkHook # NOQA >>> class DummyModel(chainer.Chain): >>> def __init__(self): >>> super(DummyModel, self).__init__() >>> with self.init_scope(): >>> self.l1 = links.Linear(None, 1) >>> self.h = None >>> >>> def forward(self, x): >>> h = self.l1(x) >>> out = functions.sigmoid(h) >>> return out >>> model = DummyModel() >>> hook = VariableMonitorLinkHook(model.l1, timing='post') >>> x = numpy.array([1, 2, 3]) >>> # Example 1. `get_variable` of `target_link`. >>> with hook: >>> out = model(x) >>> # You can extract `h`, which is output of `model.l1` as follows. >>> var_h = hook.get_variable() >>> # Example 2. `add_process` to override value of target variable. >>> def _process_zeros(hook, args, target_var): >>> xp = cuda.get_array_module(target_var.array) >>> target_var.array = xp.zeros(target_var.array.shape) >>> hook.add_process('_process_zeros', _process_zeros) >>> with hook: >>> # During the forward, `h` is overriden to value 0. 
>>> out = model(x) >>> # Remove _process_zeros method >>> hook.delete_process('_process_zeros') """ def __init__(self, target_link, name='VariableMonitorLinkHook', timing='post', extract_fn=None, logger=None): if not isinstance(target_link, chainer.Link): raise TypeError('target_link must be instance of chainer.Link!' 'actual {}'.format(type(target_link))) if timing not in ['pre', 'post']: raise ValueError( "Unexpected value timing={}, " "must be either pre or post" .format(timing)) super(VariableMonitorLinkHook, self).__init__() self.target_link = target_link # This LinkHook maybe instantiated multiple times. # So it is allowed to change name by argument. self.name = name self.logger = logger or getLogger(__name__) if extract_fn is None: if timing == 'pre': extract_fn = _default_extract_pre elif timing == 'post': extract_fn = _default_extract_post else: raise ValueError("Unexpected value timing={}" .format(timing)) self.extract_fn = extract_fn self.process_fns = OrderedDict() # Additional process, if necessary self.timing = timing self.result = None def add_process(self, key, fn): """Add additional process for target variable Args: key (str): id for this process, you may remove added process by `delete_process` with this key. fn (callable): function which takes `hook, args, target_var` as arguments. """ if not isinstance(key, str): raise TypeError('key must be str, actual {}'.format(type(key))) if not callable(fn): raise TypeError('fn must be callable') self.process_fns[key] = fn def delete_process(self, key): """Delete process added at `add_process` Args: key (str): id for the process, named at `add_process`. """ if not isinstance(key, str): raise TypeError('key must be str, actual {}'.format(type(key))) if key in self.process_fns.keys(): del self.process_fns[key] else: # Nothing to delete self.logger.warning('{} is not in process_fns, skip delete_process' .format(key)) def get_variable(self): """Get target variable, which is input or output of `target_link`. 
Returns (chainer.Variable): target variable """ return self.result def forward_preprocess(self, args): if self.timing == 'pre' and args.link is self.target_link: self.result = self.extract_fn(self, args) if self.process_fns is not None: for key, fn in self.process_fns.items(): fn(self, args, self.result) def forward_postprocess(self, args): if self.timing == 'post' and args.link is self.target_link: self.result = self.extract_fn(self, args) if self.process_fns is not None: for key, fn in self.process_fns.items(): fn(self, args, self.result) ================================================ FILE: chainer_chemistry/links/__init__.py ================================================ from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID # NOQA from chainer_chemistry.links.connection.graph_linear import GraphLinear # NOQA from chainer_chemistry.links.connection.graph_mlp import GraphMLP # NOQA from chainer_chemistry.links.normalization.graph_batch_normalization import GraphBatchNormalization # NOQA from chainer_chemistry.links.readout.general_readout import GeneralReadout # NOQA from chainer_chemistry.links.readout.ggnn_readout import GGNNReadout # NOQA from chainer_chemistry.links.readout.mpnn_readout import MPNNReadout # NOQA from chainer_chemistry.links.readout.nfp_readout import NFPReadout # NOQA from chainer_chemistry.links.readout.schnet_readout import SchNetReadout # NOQA from chainer_chemistry.links.readout.set2set import Set2Set # NOQA from chainer_chemistry.links.scaler.flow_scaler import FlowScaler # NOQA from chainer_chemistry.links.scaler.standard_scaler import StandardScaler # NOQA from chainer_chemistry.links.update.ggnn_update import GGNNUpdate # NOQA from chainer_chemistry.links.update.gin_update import GINUpdate # NOQA from chainer_chemistry.links.update.mpnn_update import EdgeNet # NOQA from chainer_chemistry.links.update.mpnn_update import MPNNUpdate # NOQA from chainer_chemistry.links.update.nfp_update import NFPUpdate # NOQA from 
chainer_chemistry.links.update.relgat_update import RelGATUpdate # NOQA from chainer_chemistry.links.update.relgcn_update import RelGCNUpdate # NOQA from chainer_chemistry.links.update.rsgcn_update import RSGCNUpdate # NOQA from chainer_chemistry.links.update.schnet_update import SchNetUpdate # NOQA ================================================ FILE: chainer_chemistry/links/array/__init__.py ================================================ ================================================ FILE: chainer_chemistry/links/array/shape_transformer_to_2d.py ================================================ import chainer from chainer import functions class ShapeTransformerTo2D(chainer.Link): """Transforms input array `x` to 2-dim and reverts. It converts array to be 2-dim, where 1th axis is `axis` and the rest is gathered to 0th axis. Note that this class does not have any parameters but behaves as "function wrapper" which has internal attribute to `transform` and `inverse_transform`. Args: axis (int): feature axis, which will be 1st axis. 
""" def __init__(self, axis=1): super(ShapeTransformerTo2D, self).__init__() self.original_shape = None self.transpose_order = None self.axis = axis def transform(self, x): self.original_shape = x.shape axis = self.axis if axis < 0: axis += x.ndim transpose_order = [i for i in range(x.ndim)] transpose_order.pop(axis) transpose_order = transpose_order + [axis] x = functions.transpose(x, tuple(transpose_order)) x = functions.reshape(x, (-1, self.original_shape[axis])) self.transpose_order = transpose_order return x def inverse_transform(self, x): if x.ndim != 2: raise ValueError( "[ERROR] Unexpected value x.shape={}, 2-dim array is expected" .format(x.shape)) if self.original_shape is None: raise AttributeError( '[Error] original_shape is None, call transform beforehand!') ndim = len(self.original_shape) axis = self.axis if axis < 0: axis += ndim inverse_transpose_order = [i for i in range(ndim - 1)] inverse_transpose_order.insert(axis, ndim-1) x = functions.reshape(x, tuple([self.original_shape[i] for i in self.transpose_order])) x = functions.transpose(x, tuple(inverse_transpose_order)) return x ================================================ FILE: chainer_chemistry/links/connection/__init__.py ================================================ ================================================ FILE: chainer_chemistry/links/connection/embed_atom_id.py ================================================ import chainer from chainer_chemistry.config import MAX_ATOMIC_NUM class EmbedAtomID(chainer.links.EmbedID): """Embeddning specialized to atoms. This is a chain in the sense of Chainer that converts an atom, represented by a sequence of molecule IDs, to a sequence of embedding vectors of molecules. The operation is done in a minibatch manner, as most chains do. The forward propagation of link consists of ID embedding, which converts the input `x` into vector embedding `h` where its shape represents (minibatch, atom, channel) .. 
seealso:: :class:`chainer.links.EmbedID` """ def __init__(self, out_size, in_size=MAX_ATOMIC_NUM, initialW=None, ignore_label=None): super(EmbedAtomID, self).__init__( in_size=in_size, out_size=out_size, initialW=initialW, ignore_label=ignore_label) def __call__(self, x): """Forward propagaion. Args: x (:class:`chainer.Variable`, or :class:`numpy.ndarray` \ or :class:`cupy.ndarray`): Input array that should be an integer array whose ``ndim`` is 2. This method treats the array as a minibatch of atoms, each of which consists of a sequence of molecules represented by integer IDs. The first axis should be an index of atoms (i.e. minibatch dimension) and the second one be an index of molecules. Returns: :class:`chainer.Variable`: A 3-dimensional array consisting of embedded vectors of atoms, representing (minibatch, atom, channel). """ h = super(EmbedAtomID, self).__call__(x) return h ================================================ FILE: chainer_chemistry/links/connection/graph_linear.py ================================================ import chainer class GraphLinear(chainer.links.Linear): """Graph Linear layer. This function assumes its input is 3-dimensional. Differently from :class:`chainer.functions.linear`, it applies an affine transformation to the third axis of input `x`. .. seealso:: :class:`chainer.links.Linear` """ def __call__(self, x): """Forward propagation. Args: x (:class:`chainer.Variable`, or :class:`numpy.ndarray`\ or :class:`cupy.ndarray`): Input array that should be a float array whose ``ndim`` is 3. It represents a minibatch of atoms, each of which consists of a sequence of molecules. Each molecule is represented by integer IDs. The first axis is an index of atoms (i.e. minibatch dimension) and the second one an index of molecules. Returns: :class:`chainer.Variable`: A 3-dimeisional array. 
""" h = x # (minibatch, atom, ch) s0, s1, s2 = h.shape h = chainer.functions.reshape(h, (s0 * s1, s2)) h = super(GraphLinear, self).__call__(h) h = chainer.functions.reshape(h, (s0, s1, self.out_size)) return h ================================================ FILE: chainer_chemistry/links/connection/graph_mlp.py ================================================ import numpy import chainer from chainer.functions import relu from chainer_chemistry.links.connection.graph_linear import GraphLinear class GraphMLP(chainer.Chain): """Graph MLP layer Args: channels (list or numpy.ndarray): list of int, representing each layer's hidden dim. e.g., if [32, 16], it will construct 2-layer MLP with hidden dim 32 and output dim 16. in_channels (int or None): input channel size. activation (chainer.functions): activation function """ def __init__(self, channels, in_channels=None, activation=relu): super(GraphMLP, self).__init__() if not isinstance(channels, (list, numpy.ndarray)): raise TypeError('channels {} is expected to be list, actual {}' .format(channels, type(channels))) channels_list = [in_channels] + list(channels) layers = [GraphLinear(channels_list[i], channels_list[i+1]) for i in range(len(channels_list) - 1)] with self.init_scope(): self.layers = chainer.ChainList(*layers) self.activation = activation def __call__(self, x): h = x for l in self.layers[:-1]: h = self.activation(l(h)) h = self.layers[-1](h) return h ================================================ FILE: chainer_chemistry/links/normalization/__init__.py ================================================ ================================================ FILE: chainer_chemistry/links/normalization/graph_batch_normalization.py ================================================ import chainer class GraphBatchNormalization(chainer.links.BatchNormalization): """Graph Batch Normalization layer. .. seealso:: :class:`chainer.links.BatchNormalization` """ def __call__(self, x): """Forward propagation. 
Args: x (:class:`chainer.Variable`, or :class:`numpy.ndarray`\ or :class:`cupy.ndarray`): Input array that should be a float array whose ``ndim`` is 3. It represents a minibatch of atoms, each of which consists of a sequence of molecules. Each molecule is represented by integer IDs. The first axis is an index of atoms (i.e. minibatch dimension) and the second one an index of molecules. Returns: :class:`chainer.Variable`: A 3-dimeisional array. """ h = x # (minibatch, atom, ch) # The implemenataion of batch normalization for graph convolution below # is rather naive. To be precise, it is necessary to consider the # difference in the number of atoms for each graph. However, the # implementation below does not take it into account, and assumes # that all graphs have the same number of atoms, hence extra numbers # of zero are included when average is computed. In other word, the # results of batch normalization below is biased. s0, s1, s2 = h.shape h = chainer.functions.reshape(h, (s0 * s1, s2)) h = super(GraphBatchNormalization, self).__call__(h) h = chainer.functions.reshape(h, (s0, s1, s2)) return h ================================================ FILE: chainer_chemistry/links/readout/__init__.py ================================================ ================================================ FILE: chainer_chemistry/links/readout/cgcnn_readout.py ================================================ import chainer from chainer import functions, links # NOQA class CGCNNReadout(chainer.Chain): """CGCNN submodule for readout part. 
Args: out_dim (int): dimension of output feature vector """ def __init__(self, out_dim=128): super(CGCNNReadout, self).__init__() with self.init_scope(): self.linear = links.Linear(None, out_dim) def __call__(self, atom_feat, atom_idx): average_pool = [functions.mean(atom_feat[idx], axis=0, keepdims=True) for idx in atom_idx] h = functions.concat(average_pool, axis=0) h = self.linear(h) h = functions.softplus(h) return h ================================================ FILE: chainer_chemistry/links/readout/general_readout.py ================================================ import chainer from chainer import functions class GeneralReadout(chainer.Link): """General submodule for readout part. This class can be used for `rsgcn` and `weavenet`. Note that this class has no learnable parameter, even though this is subclass of `chainer.Link`. This class is under `links` module for consistency with other readout module. Args: mode (str): activation (callable): activation function """ def __init__(self, mode='sum', activation=None, **kwargs): super(GeneralReadout, self).__init__() self.mode = mode self.activation = activation def __call__(self, h, axis=1, **kwargs): if self.activation is not None: h = self.activation(h) else: h = h if self.mode == 'sum': y = functions.sum(h, axis=axis) elif self.mode == 'max': y = functions.max(h, axis=axis) elif self.mode == 'summax': h_sum = functions.sum(h, axis=axis) h_max = functions.max(h, axis=axis) y = functions.concat((h_sum, h_max), axis=axis) else: raise ValueError('mode {} is not supported'.format(self.mode)) return y class ScatterGeneralReadout(chainer.Link): """General submodule for readout part by scatter operation. This class is used in sparse pattern. 
Args: mode (str): activation (callable): activation function """ def __init__(self, mode='sum', activation=None, **kwargs): super(ScatterGeneralReadout, self).__init__() self.mode = mode self.activation = activation def __call__(self, h, batch, **kwargs): if self.activation is not None: h = self.activation(h) else: h = h if self.mode == 'sum': y = self.xp.zeros((batch[-1] + 1, h.shape[1]), dtype=self.xp.float32) y = functions.scatter_add(y, batch, h) else: raise ValueError('mode {} is not supported'.format(self.mode)) return y ================================================ FILE: chainer_chemistry/links/readout/ggnn_readout.py ================================================ import chainer from chainer import functions from chainer_chemistry.links.connection.graph_linear import GraphLinear class GGNNReadout(chainer.Chain): """GGNN submodule for readout part. Args: out_dim (int): dimension of output feature vector in_channels (int or None): dimension of feature vector associated to each node. `in_channels` is the total dimension of `h` and `h0`. nobias (bool): If ``True``, then this function does not use the bias activation (~chainer.Function or ~chainer.FunctionNode): activate function for node representation `functions.tanh` was suggested in original paper. activation_agg (~chainer.Function or ~chainer.FunctionNode): activate function for aggregation `functions.tanh` was suggested in original paper. 
""" def __init__(self, out_dim, in_channels=None, nobias=False, activation=functions.identity, activation_agg=functions.identity): super(GGNNReadout, self).__init__() with self.init_scope(): self.i_layer = GraphLinear(in_channels, out_dim, nobias=nobias) self.j_layer = GraphLinear(in_channels, out_dim, nobias=nobias) self.out_dim = out_dim self.in_channels = in_channels self.nobias = nobias self.activation = activation self.activation_agg = activation_agg def __call__(self, h, h0=None, is_real_node=None): # --- Readout part --- # h, h0: (minibatch, node, ch) # is_real_node: (minibatch, node) h1 = functions.concat((h, h0), axis=2) if h0 is not None else h g1 = functions.sigmoid(self.i_layer(h1)) g2 = self.activation(self.j_layer(h1)) g = g1 * g2 if is_real_node is not None: # mask virtual node feature to be 0 mask = self.xp.broadcast_to( is_real_node[:, :, None], g.shape) g = g * mask # sum along node axis g = self.activation_agg(functions.sum(g, axis=1)) return g ================================================ FILE: chainer_chemistry/links/readout/megnet_readout.py ================================================ import chainer from chainer import functions, links # NOQA from chainer_chemistry.functions import megnet_softplus from chainer_chemistry.links.readout.set2set import Set2Set class MEGNetReadout(chainer.Chain): """MEGNet submodule for readout part. Args: out_dim (int): dimension of output feature vector in_channels (int): dimension of feature vector associated to each node. Must not be `None`. n_layers (int): number of LSTM layers for set2set processing_steps (int): number of processing for set2set dropout_ratio (float): ratio of dropout activation (~chainer.Function or ~chainer.FunctionNode): activate function for megnet model `megnet_softplus` was used in original paper. 
""" def __init__(self, out_dim=32, in_channels=32, n_layers=1, processing_steps=3, dropout_ratio=-1, activation=megnet_softplus): super(MEGNetReadout, self).__init__() if processing_steps <= 0: raise ValueError("[ERROR] Unexpected value processing_steps={}" .format(processing_steps)) self.processing_steps = processing_steps self.dropout_ratio = dropout_ratio self.activation = activation with self.init_scope(): self.set2set_for_atom = Set2Set( in_channels=in_channels, n_layers=n_layers) self.set2set_for_pair = Set2Set( in_channels=in_channels, n_layers=n_layers) self.linear = links.Linear(None, out_dim) def __call__(self, atoms_feat, pair_feat, global_feat): a_f = atoms_feat p_f = pair_feat g_f = global_feat # readout for atom and pair feature self.set2set_for_atom.reset_state() self.set2set_for_pair.reset_state() for i in range(self.processing_steps): a_f_r = self.set2set_for_atom(a_f) p_f_r = self.set2set_for_pair(p_f) # concating all features h = functions.concat((a_f_r, p_f_r, g_f), axis=1) if self.dropout_ratio > 0.0: h = functions.dropout(h, ratio=self.dropout_ratio) out = self.activation(self.linear(h)) return out ================================================ FILE: chainer_chemistry/links/readout/mpnn_readout.py ================================================ import chainer from chainer import functions from chainer import links from chainer_chemistry.links.readout.set2set import Set2Set class MPNNReadout(chainer.Chain): """MPNN submodule for readout part. Args: out_dim (int): dimension of output feature vector in_channels (int): dimension of feature vector associated to each node. Must not be `None`. 
n_layers (int): number of LSTM layers for set2set processing_steps (int): number of processing for set2set """ def __init__(self, out_dim, in_channels, n_layers=1, processing_steps=3): # type: (int, int, int, int) -> None super(MPNNReadout, self).__init__() if processing_steps <= 0: raise ValueError("[ERROR] Unexpected value processing_steps={}" .format(processing_steps)) with self.init_scope(): self.set2set = Set2Set(in_channels=in_channels, n_layers=n_layers) self.linear1 = links.Linear(in_channels * 2, in_channels) self.linear2 = links.Linear(in_channels, out_dim) self.out_dim = out_dim self.in_channels = in_channels self.n_layers = n_layers self.processing_steps = processing_steps def __call__(self, h, **kwargs): # type: (chainer.Variable) -> chainer.Variable # h: (mb, node, ch) self.set2set.reset_state() for i in range(self.processing_steps): g = self.set2set(h) # g: (mb, ch * 2) g = functions.relu(self.linear1(g)) # g: (mb, ch) g = self.linear2(g) # g: (mb, out_dim) return g ================================================ FILE: chainer_chemistry/links/readout/nfp_readout.py ================================================ import chainer from chainer import functions from chainer_chemistry.links.connection.graph_linear import GraphLinear class NFPReadout(chainer.Chain): """NFP submodule for readout part. 
Args: out_dim (int): output dimension of feature vector associated to each graph in_channels (int or None): dimension of feature vector associated to each node """ def __init__(self, out_dim, in_channels): super(NFPReadout, self).__init__() with self.init_scope(): self.output_weight = GraphLinear(in_channels, out_dim) self.in_channels = in_channels self.out_dim = out_dim def __call__(self, h, is_real_node=None, **kwargs): # h: (minibatch, node, ch) # is_real_node: (minibatch, node) # ---Readout part --- i = self.output_weight(h) i = functions.softmax(i, axis=2) # softmax along channel axis if is_real_node is not None: # mask virtual node feature to be 0 mask = self.xp.broadcast_to( is_real_node[:, :, None], i.shape) i = i * mask i = functions.sum(i, axis=1) # sum along atom's axis return i ================================================ FILE: chainer_chemistry/links/readout/scatter_ggnn_readout.py ================================================ import numpy import chainer from chainer import functions class ScatterGGNNReadout(chainer.Chain): """GGNN submodule for readout part using scatter operation. Args: out_dim (int): dimension of output feature vector in_channels (int or None): dimension of feature vector associated to each node. `in_channels` is the total dimension of `h` and `h0`. nobias (bool): If ``True``, then this function does not use the bias activation (~chainer.Function or ~chainer.FunctionNode): activate function for node representation `functions.tanh` was suggested in original paper. activation_agg (~chainer.Function or ~chainer.FunctionNode): activate function for aggregation `functions.tanh` was suggested in original paper. concat_n_info (bool): If ``True``, node information is concated to the result. 
""" def __init__(self, out_dim, in_channels=None, nobias=False, activation=functions.identity, activation_agg=functions.identity, concat_n_info=False): super(ScatterGGNNReadout, self).__init__() self.concat_n_info = concat_n_info if self.concat_n_info: out_dim -= 1 with self.init_scope(): self.i_layer = chainer.links.Linear( in_channels, out_dim, nobias=nobias) self.j_layer = chainer.links.Linear( in_channels, out_dim, nobias=nobias) self.out_dim = out_dim self.in_channels = in_channels self.nobias = nobias self.activation = activation self.activation_agg = activation_agg def __call__(self, h, batch, h0=None, is_real_node=None): # --- Readout part --- h1 = functions.concat((h, h0), axis=1) if h0 is not None else h g1 = functions.sigmoid(self.i_layer(h1)) g2 = self.activation(self.j_layer(h1)) g = g1 * g2 # sum along node axis y = self.xp.zeros((int(batch[-1]) + 1, self.out_dim), dtype=numpy.float32) y = functions.scatter_add(y, batch, g) y = self.activation_agg(y) if self.concat_n_info: n_nodes = self.xp.zeros(y.shape[0], dtype=self.xp.float32) n_nodes = functions.scatter_add(n_nodes, batch, self.xp.ones(batch.shape[0])) y = functions.concat((y, n_nodes.reshape((-1, 1)))) return y ================================================ FILE: chainer_chemistry/links/readout/schnet_readout.py ================================================ import chainer from chainer import functions from chainer_chemistry.functions import shifted_softplus from chainer_chemistry.links.connection.graph_linear import GraphLinear class SchNetReadout(chainer.Chain): """SchNet submodule for readout part. 
Args: out_dim (int): dimension of output feature vector in_channels (int or None): dimension of feature vector for each node hidden_channels (int): dimension of feature vector for each node """ def __init__(self, out_dim=1, in_channels=None, hidden_channels=32): super(SchNetReadout, self).__init__() with self.init_scope(): self.linear1 = GraphLinear(in_channels, hidden_channels) self.linear2 = GraphLinear(hidden_channels, out_dim) self.out_dim = out_dim self.hidden_dim = in_channels def __call__(self, h, **kwargs): h = self.linear1(h) h = shifted_softplus(h) h = self.linear2(h) h = functions.sum(h, axis=1) return h ================================================ FILE: chainer_chemistry/links/readout/set2set.py ================================================ from typing import List, Optional # NOQA import chainer from chainer import cuda from chainer import functions from chainer import links import numpy # NOQA class Set2Set(chainer.Chain): r"""MPNN subsubmodule for readout part. See: Oriol Vinyals+, \ Order Matters: Sequence to sequence for sets. November 2015. 
`arXiv:1511.06391 ` Args: in_channels (int): dimension of input feature vector n_layers (int): number of LSTM layers Returns (chainer.Variable): Output feature vector: (minibatch, in_channels * 2) """ def __init__(self, in_channels, n_layers=1): # type: (int, int) -> None super(Set2Set, self).__init__() with self.init_scope(): self.lstm_layer = links.NStepLSTM( n_layers=n_layers, in_size=in_channels * 2, out_size=in_channels, dropout=0) self.in_channels = in_channels self.n_layers = n_layers self.hx = None # type: Optional[chainer.Variable] self.cx = None # type: Optional[chainer.Variable] self.q_star = None # type: Optional[List] def __call__(self, h): # type: (chainer.Variable) -> chainer.Variable xp = cuda.get_array_module(h) mb, node, ch = h.shape # type: int, int, int if self.q_star is None: self.q_star = [ xp.zeros((1, self.in_channels * 2)).astype('f') for _ in range(mb) ] self.hx, self.cx, q = self.lstm_layer(self.hx, self.cx, self.q_star) # self.hx: (mb, mb, ch) # self.cx: (mb, mb, ch) # q: List[(1, ch) * mb] q = functions.stack(q) # q: (mb, 1, ch) q_ = functions.transpose(q, axes=(0, 2, 1)) # q_: (mb, ch, 1) e = functions.matmul(h, q_) # e: (mb, node, 1) a = functions.softmax(e) # a: (mb, node, 1) a = functions.broadcast_to(a, h.shape) # a: (mb, node, ch) r = functions.sum((a * h), axis=1, keepdims=True) # r: (mb, 1, ch) q_star_ = functions.concat((q, r), axis=2) # q_star_: (mb, 1, ch*2) self.q_star = functions.separate(q_star_) return functions.reshape(q_star_, (mb, ch * 2)) def reset_state(self): # type: () -> None self.hx = None self.cx = None self.q_star = None ================================================ FILE: chainer_chemistry/links/scaler/__init__.py ================================================ ================================================ FILE: chainer_chemistry/links/scaler/base.py ================================================ import chainer def to_array(x): """Convert x into numpy.ndarray or cupy.ndarray""" if isinstance(x, 
chainer.Variable): x = x.data return x class BaseScaler(chainer.Link): """Base class for scaler. x maybe array or Variable """ def fit(self, x, **kwargs): """fit parameter from given input `x`. It should return self after fitting parameters. """ raise NotImplementedError def transform(self, x, **kwargs): """transform input `x` using fitted parameters. This method should be called after `fit` is called. """ raise NotImplementedError def inverse_transform(self, x, **kwargs): """inverse operation of `transform`. This method should be called after `fit` is called. """ raise NotImplementedError def fit_transform(self, x, **kwargs): return self.fit(x, **kwargs).transform(x) # `__call__` method invokes `forward` method. def forward(self, x, **kwargs): return self.transform(x, **kwargs) ================================================ FILE: chainer_chemistry/links/scaler/flow_scaler.py ================================================ import numpy import chainer from chainer_chemistry.links.scaler.base import BaseScaler, to_array # NOQA def _sigmoid_derivative(x): h = chainer.functions.sigmoid(x) return chainer.grad([h], [x], enable_double_backprop=True)[0] def format_x(x): """x may be array or Variable""" # currently, only consider the case x is 2-dim, (batchsize, feature) if x.ndim == 1: # Deal with as 1 feature with several samples. x = x[:, None] if x.ndim != 2: raise ValueError( "Unexpected value x.shape={}, only x.ndim=2 is supported." .format(x.shape)) return x class FlowScaler(BaseScaler): """Flow Scaler. Flow Scaler is a Scaler that scale data into the normal distribution. This scaler uses a technique named "flow". By using this technique, parametrized bijective function is learned to scale data that distributes arbitrary continuous distribution into specified continuous distribution. In this scaler, multi-layer perceptron whose weight is restricted into positive range is used as parametrized bijective function. 
Args: hidden_num(int): number of units in hidden layer of multi-layer perceptron. """ def __init__(self, hidden_num=20): super(FlowScaler, self).__init__() self.hidden_num = hidden_num self.mean = None self.register_persistent('mean') self.std = None self.register_persistent('std') self.eps = numpy.float32(1e-6) W_initializer = chainer.initializers.Normal(0.1) with self.init_scope(): self.W1_ = chainer.Parameter(W_initializer) self.b1 = chainer.Parameter(0) self.W2_ = chainer.Parameter(W_initializer) self.b2 = chainer.Parameter(0) def _initialize_params(self, in_size): self.W1_.initialize((self.hidden_num, in_size, 1, 1, 1, 1)) self.b1.initialize((self.hidden_num, in_size, 1)) self.W2_.initialize((1, in_size, 1, self.hidden_num, 1, 1)) self.b2.initialize((1, in_size, 1)) @property def W1(self): return chainer.functions.softplus(self.W1_) @property def W2(self): return chainer.functions.softplus(self.W2_) def _forward(self, x): x = chainer.functions.expand_dims(x, axis=1) x = chainer.functions.expand_dims(x, axis=3) h = chainer.functions.local_convolution_2d(x, self.W1, self.b1) h = chainer.functions.sigmoid(h) h = chainer.functions.local_convolution_2d(h, self.W2, self.b2) h = h[:, 0, :, 0] return h def _derivative(self, x): x = chainer.functions.expand_dims(x, axis=1) x = chainer.functions.expand_dims(x, axis=3) h = chainer.functions.local_convolution_2d(x, self.W1, self.b1) h = _sigmoid_derivative(h) h = h * chainer.functions.expand_dims(self.W1[:, :, 0, 0, 0], axis=0) h = chainer.functions.local_convolution_2d(h, self.W2) h = h[:, 0, :, 0] return h def _loss(self, x): # loss = -log(p(f(x))) - log|f'(x)| x_nan = self.xp.isnan(x) x_not_nan = self.xp.logical_not(x_nan) x = self.xp.nan_to_num(x) if not isinstance(x, chainer.Variable): x = chainer.Variable(x.astype(numpy.float32)) y = self._forward(x) gy = self._derivative(x) # gy, = chainer.grad([y], [x], enable_double_backprop=True) std_gaussian = chainer.distributions.Normal( self.xp.zeros(shape=x.shape, 
dtype=numpy.float32), self.xp.ones(shape=x.shape, dtype=numpy.float32)) loss = -std_gaussian.log_prob(y) loss -= chainer.functions.log(abs(gy) + self.eps) loss = chainer.functions.sum(loss[x_not_nan]) / x_not_nan.sum() chainer.reporter.report({'loss': loss}, self) return loss def fit(self, x, batch_size=100, iteration=3000): """Fitting parameter. Args: x(:class:`~chainer.Variable` or :ref:`ndarray`): data for learning. batch_size(int): size of batch used for learning multi-layer perceptron. iteration(int): number of iteration. Returns: self (FlowScaler): this instance. """ if isinstance(x, chainer.Variable): x = x.array x = format_x(x) self._initialize_params(x.shape[1]) xp = self.xp if xp is numpy: self.mean = xp.nanmean(x, axis=0) self.std = xp.nanstd(x, axis=0) else: if int(xp.sum(xp.isnan(x))) > 0: raise NotImplementedError( "FlowScaling with nan value on GPU is not supported.") # cupy.nanmean, cupy.nanstd is not implemented yet. self.mean = xp.mean(x, axis=0) self.std = xp.std(x, axis=0) x = (x - self.mean) / (self.std + self.eps) optimizer = chainer.optimizers.Adam(0.3) optimizer.setup(self) train = chainer.datasets.TupleDataset(x) train_iter = chainer.iterators.SerialIterator(train, batch_size) updater = chainer.training.updaters.StandardUpdater( train_iter, optimizer, loss_func=self._loss) trainer = chainer.training.Trainer( updater, (iteration, 'iteration')) trainer.extend(chainer.training.extensions.LogReport( trigger=(100, 'iteration'))) trainer.extend(chainer.training.extensions.PrintReport( ['epoch', 'iteration', 'main/loss', 'elapsed_time'])) trainer.run() return self def transform(self, x, batch_size=100): """Transform. Args: x(:class:`~chainer.Variable` or :ref:`ndarray`): data. batch_size(int): size of batch used for learning multi-layer perceptron. Returns: scaled_x(:class:`~chainer.Variable` or :ref:`ndarray`): transformed data. 
""" if self.mean is None: raise AttributeError('[Error] mean is None, call fit beforehand!') x_ = format_x(x) x_ = (x_ - self.mean) / (self.std + self.eps) y = [] for i in range((len(x) - 1) // batch_size + 1): y.append(self._forward( x_[i*batch_size: (i+1)*batch_size])) y = chainer.functions.concat(y, axis=0) if x.ndim == 1: y = y[:, 0] if isinstance(x_, chainer.Variable): return y else: return y.data ================================================ FILE: chainer_chemistry/links/scaler/max_abs_scaler.py ================================================ from logging import getLogger import numpy from chainer import cuda, Variable # NOQA from chainer_chemistry.links.scaler.base import BaseScaler, to_array # NOQA from chainer_chemistry.links.array.shape_transformer_to_2d import ShapeTransformerTo2D # NOQA def format_x(x): """x may be array or Variable.""" # currently, only consider the case x is 2-dim, (batchsize, feature) if x.ndim == 1: # Deal with as 1 feature with several samples. x = x[:, None] return x class MaxAbsScaler(BaseScaler): def __init__(self): super(MaxAbsScaler, self).__init__() self.indices = None self.register_persistent('indices') self.max_abs = None self.register_persistent('max_abs') def fit(self, x, indices=None, axis=1): """Fitting parameter. Args: x (numpy.ndarray or cupy.ndarray or Variable): indices (list or tuple or None): indices for applying standard scaling. axis (int): axis to calculate mean & std. Returns: self (StandardScaler): this instance. 
""" x = to_array(x) x = format_x(x) x = ShapeTransformerTo2D(axis=axis).transform(x).array if indices is None: pass elif isinstance(indices, (list, tuple)): indices = numpy.asarray(indices) self.indices = indices if self.indices is not None: x = x[:, self.indices] xp = self.xp if xp is numpy: x = cuda.to_cpu(x) else: x = cuda.to_gpu(x) self.max_abs = xp.nanmax(xp.abs(x), axis=0) # result consistency check if xp.sum(self.max_abs == 0) > 0: logger = getLogger(__name__) ind = numpy.argwhere(cuda.to_cpu(self.max_abs) == 0)[:, 0] logger.warning('fit: max_abs was 0 at indices {}'.format(ind)) return self def _compute_max_abs_all(self, input_dim): if self.indices is None: max_abs_all = self.xp.ones(input_dim, dtype=self.xp.float32) max_abs_all[self.max_abs != 0] = self.max_abs[self.max_abs != 0] return max_abs_all else: max_abs_all = self.xp.ones(input_dim, dtype=self.xp.float32) non_zero_indices = self.indices[self.max_abs != 0] max_abs_all[non_zero_indices] = self.max_abs[self.max_abs != 0] return max_abs_all def transform(self, x, axis=1): is_array = not isinstance(x, Variable) if self.max_abs is None: raise AttributeError( '[Error] max_abs is None, call fit beforehand!') x = format_x(x) shape_transformer = ShapeTransformerTo2D(axis=axis) x = shape_transformer.transform(x) max_abs_all = self._compute_max_abs_all(x.shape[1]) x = x / max_abs_all[None, :] x = shape_transformer.inverse_transform(x) if is_array: x = x.array return x def inverse_transform(self, x, axis=1): is_array = not isinstance(x, Variable) if self.max_abs is None: raise AttributeError( '[Error] max_abs is None, call fit beforehand!') x = format_x(x) shape_transformer = ShapeTransformerTo2D(axis=axis) x = shape_transformer.transform(x) max_abs_all = self._compute_max_abs_all(x.shape[1]) x = x * max_abs_all[None, :] x = shape_transformer.inverse_transform(x) if is_array: x = x.array return x ================================================ FILE: chainer_chemistry/links/scaler/min_max_scaler.py 
class MinMaxScaler(BaseScaler):
    """Scale each feature into [0, 1] by its fitted minimum and maximum."""

    def __init__(self):
        super(MinMaxScaler, self).__init__()
        # Fitted state, saved/loaded with the link via register_persistent.
        self.indices = None
        self.register_persistent('indices')
        self.min = None
        self.register_persistent('min')
        self.max = None
        self.register_persistent('max')

    def fit(self, x, indices=None, axis=1):
        """Fitting parameter.

        Args:
            x (numpy.ndarray or cupy.ndarray or Variable):
            indices (list or tuple or None):
                indices for applying min-max scaling.
            axis (int): axis to calculate min & max.

        Returns:
            self (MinMaxScaler): this instance.
        """
        x = to_array(x)
        x = format_x(x)
        # Move `axis` to the feature axis so statistics run over axis 0.
        x = ShapeTransformerTo2D(axis=axis).transform(x).array
        if indices is None:
            pass
        elif isinstance(indices, (list, tuple)):
            indices = numpy.asarray(indices)
        self.indices = indices
        if self.indices is not None:
            # Fit only the selected feature columns.
            x = x[:, self.indices]

        xp = self.xp
        # Move data to the same device as this link before reducing.
        if xp is numpy:
            x = cuda.to_cpu(x)
        else:
            x = cuda.to_gpu(x)
        # nan entries are ignored by nanmin/nanmax.
        self.min = xp.nanmin(x, axis=0)
        self.max = xp.nanmax(x, axis=0)

        # result consistency check
        if xp.sum(self.max - self.min == 0) > 0:
            logger = getLogger(__name__)
            ind = numpy.argwhere(cuda.to_cpu(self.max-self.min) == 0)[:, 0]
            logger.warning('fit: max-min was 0 at indices {}'.format(ind))
        return self

    def _compute_min_diff_all(self, input_dim):
        # Build per-feature offset (min) and scale (max - min) vectors of
        # length `input_dim`. Constant features (diff == 0) get scale 1 to
        # avoid division by zero; non-fitted features get offset 0, scale 1.
        diff = self.max - self.min
        diff_nonzero_indices = diff != 0
        if self.indices is None:
            diff_all = self.xp.ones(input_dim, dtype=self.xp.float32)
            diff_all[diff_nonzero_indices] = diff[diff_nonzero_indices]
            return self.min, diff_all
        else:
            min_all = self.xp.zeros(input_dim, dtype=self.xp.float32)
            min_all[self.indices] = self.min
            diff_all = self.xp.ones(input_dim, dtype=self.xp.float32)
            non_zero_indices = self.indices[diff_nonzero_indices]
            diff_all[non_zero_indices] = diff[diff_nonzero_indices]
            return min_all, diff_all

    def transform(self, x, axis=1):
        """Apply (x - min) / (max - min) along `axis` using fitted stats."""
        is_array = not isinstance(x, Variable)
        if self.min is None:
            raise AttributeError(
                '[Error] min is None, call fit beforehand!')
        x = format_x(x)
        shape_transformer = ShapeTransformerTo2D(axis=axis)
        x = shape_transformer.transform(x)
        min_all, diff_all = self._compute_min_diff_all(x.shape[1])
        x = (x - min_all[None, :]) / diff_all[None, :]
        x = shape_transformer.inverse_transform(x)
        if is_array:
            # Return the same container type the caller passed in.
            x = x.array
        return x

    def inverse_transform(self, x, axis=1):
        """Undo `transform`: x * (max - min) + min along `axis`."""
        is_array = not isinstance(x, Variable)
        if self.min is None:
            raise AttributeError(
                '[Error] min is None, call fit beforehand!')
        x = format_x(x)
        shape_transformer = ShapeTransformerTo2D(axis=axis)
        x = shape_transformer.transform(x)
        min_all, diff_all = self._compute_min_diff_all(x.shape[1])
        x = x * diff_all[None, :] + min_all[None, :]
        x = shape_transformer.inverse_transform(x)
        if is_array:
            x = x.array
        return x
class StandardScaler(BaseScaler):
    """Standardize each feature to zero mean and unit variance."""

    def __init__(self):
        super(StandardScaler, self).__init__()
        # Fitted state, saved/loaded with the link via register_persistent.
        self.indices = None
        self.register_persistent('indices')
        self.mean = None
        self.register_persistent('mean')
        self.std = None
        self.register_persistent('std')

    def fit(self, x, indices=None, axis=1):
        """Fitting parameter.

        Args:
            x (numpy.ndarray or cupy.ndarray or Variable):
            indices (list or tuple or None):
                indices for applying standard scaling.
            axis (int): axis to calculate mean & std.

        Returns:
            self (StandardScaler): this instance.
        """
        x = to_array(x)
        x = format_x(x)
        # Move `axis` to the feature axis so statistics run over axis 0.
        x = ShapeTransformerTo2D(axis=axis).transform(x).array
        if indices is None:
            pass
        elif isinstance(indices, (list, tuple)):
            indices = numpy.asarray(indices)
        self.indices = indices
        if self.indices is not None:
            # Fit only the selected feature columns.
            x = x[:, self.indices]

        xp = self.xp
        if xp is numpy:
            # CPU path: nan entries are ignored by nanmean/nanstd.
            x = cuda.to_cpu(x)
            self.mean = xp.nanmean(x, axis=0)
            self.std = xp.nanstd(x, axis=0)
        else:
            # GPU path: nan is not supported because
            # cupy.nanmean, cupy.nanstd is not implemented yet.
            x = cuda.to_gpu(x)
            if int(xp.sum(xp.isnan(x))) > 0:
                raise NotImplementedError(
                    "StandardScaling with nan value on GPU is not supported.")
            self.mean = xp.mean(x, axis=0)
            self.std = xp.std(x, axis=0)

        # result consistency check
        if xp.sum(self.std == 0) > 0:
            logger = getLogger(__name__)
            ind = numpy.argwhere(cuda.to_cpu(self.std) == 0)[:, 0]
            logger.warning('fit: std was 0 at indices {}'.format(ind))
        return self

    def _compute_mean_std_all(self, input_dim):
        # Build per-feature mean and std vectors of length `input_dim`.
        # Features with std == 0 fall back to std 1 (avoids division by
        # zero); non-fitted features get mean 0, std 1 (i.e. unchanged).
        if self.indices is None:
            std_all = self.xp.ones(input_dim, dtype=self.xp.float32)
            std_all[self.std != 0] = self.std[self.std != 0]
            return self.mean, std_all
        else:
            mean_all = self.xp.zeros(input_dim, dtype=self.xp.float32)
            mean_all[self.indices] = self.mean
            std_all = self.xp.ones(input_dim, dtype=self.xp.float32)
            non_zero_indices = self.indices[self.std != 0]
            std_all[non_zero_indices] = self.std[self.std != 0]
            return mean_all, std_all

    def transform(self, x, axis=1):
        """Apply (x - mean) / std along `axis` using fitted statistics."""
        is_array = not isinstance(x, Variable)
        if self.mean is None:
            raise AttributeError('[Error] mean is None, call fit beforehand!')
        x = format_x(x)
        shape_transformer = ShapeTransformerTo2D(axis=axis)
        x = shape_transformer.transform(x)
        mean_all, std_all = self._compute_mean_std_all(x.shape[1])
        x = (x - mean_all[None, :]) / std_all[None, :]
        x = shape_transformer.inverse_transform(x)
        if is_array:
            # Return the same container type the caller passed in.
            x = x.array
        return x

    def inverse_transform(self, x, axis=1):
        """Undo `transform`: x * std + mean along `axis`."""
        is_array = not isinstance(x, Variable)
        if self.mean is None:
            raise AttributeError('[Error] mean is None, call fit beforehand!')
        x = format_x(x)
        shape_transformer = ShapeTransformerTo2D(axis=axis)
        x = shape_transformer.transform(x)
        mean_all, std_all = self._compute_mean_std_all(x.shape[1])
        x = x * std_all[None, :] + mean_all[None, :]
        x = shape_transformer.inverse_transform(x)
        if is_array:
            x = x.array
        return x
functions # NOQA class CGCNNUpdate(chainer.Chain): """Update submodule for CGCNN Args: n_site_features (int): hidden dimension of atom feature vector. This value must be the same as n_site_feat. """ def __init__(self, n_site_features=64): super(CGCNNUpdate, self).__init__() with self.init_scope(): self.fc = links.Linear(None, 2*n_site_features) self.bn1 = links.BatchNormalization(2*n_site_features) self.bn2 = links.BatchNormalization(n_site_features) def __call__(self, site_feat, nbr_feat, nbr_feat_idx): n_site, n_nbr, n_nbr_feat = nbr_feat.shape _, n_site_feat = site_feat.shape site_nbr_feat = site_feat[nbr_feat_idx] total_feat = functions.concat([ functions.broadcast_to(site_feat[:, None, :], (n_site, n_nbr, n_site_feat)), site_nbr_feat, nbr_feat ], axis=2) total_feat = self.fc(total_feat.reshape( n_site*n_nbr, 2*n_site_feat+n_nbr_feat)) total_feat = self.bn1(total_feat).reshape(n_site, n_nbr, 2*n_site_feat) feat_gate, feat_core = functions.split_axis(total_feat, 2, axis=-1) feat_gate = functions.sigmoid(feat_gate) feat_core = functions.softplus(feat_core) feat_sum = functions.sum(feat_gate * feat_core, axis=1) feat_sum = self.bn2(feat_sum) out = functions.softplus(site_feat + feat_sum) return out ================================================ FILE: chainer_chemistry/links/update/ggnn_update.py ================================================ import chainer from chainer import functions from chainer import links import chainer_chemistry from chainer_chemistry.links.connection.graph_linear import GraphLinear from chainer_chemistry.utils import is_sparse class GGNNUpdate(chainer.Chain): """GGNN submodule for update part. Args: in_channels (int or None): input dim of feature vector for each node hidden_channels (int): dimension of feature vector for each node out_channels (int or None): output dime of feature vector for each node When `None`, `hidden_channels` is used. 
n_edge_types (int): number of types of edge """ def __init__(self, in_channels=None, hidden_channels=16, out_channels=None, n_edge_types=4, **kwargs): if out_channels is None: out_channels = hidden_channels super(GGNNUpdate, self).__init__() if in_channels is None: gru_in_channels = None else: gru_in_channels = in_channels + hidden_channels with self.init_scope(): self.graph_linear = GraphLinear( in_channels, n_edge_types * hidden_channels) self.update_layer = links.GRU(gru_in_channels, out_channels) self.n_edge_types = n_edge_types self.in_channels = in_channels self.hidden_channels = hidden_channels self.out_channels = out_channels def __call__(self, h, adj, **kwargs): hidden_ch = self.hidden_channels # --- Message part --- mb, atom, in_ch = h.shape m = functions.reshape(self.graph_linear(h), (mb, atom, hidden_ch, self.n_edge_types)) # m: (minibatch, atom, ch, edge_type) # Transpose m = functions.transpose(m, (0, 3, 1, 2)) # m: (minibatch, edge_type, atom, ch) # (minibatch * edge_type, atom, out_ch) m = functions.reshape(m, (mb * self.n_edge_types, atom, hidden_ch)) if is_sparse(adj): m = functions.sparse_matmul(adj, m) else: adj = functions.reshape(adj, (mb * self.n_edge_types, atom, atom)) m = chainer_chemistry.functions.matmul(adj, m) # (minibatch * edge_type, atom, out_ch) m = functions.reshape(m, (mb, self.n_edge_types, atom, hidden_ch)) m = functions.sum(m, axis=1) # (minibatch, atom, out_ch) # --- Update part --- # Contraction h = functions.reshape(h, (mb * atom, in_ch)) # Contraction m = functions.reshape(m, (mb * atom, hidden_ch)) out_h = self.update_layer(functions.concat((h, m), axis=1)) # Expansion out_h = functions.reshape(out_h, (mb, atom, self.out_channels)) return out_h def reset_state(self): self.update_layer.reset_state() ================================================ FILE: chainer_chemistry/links/update/gin_update.py ================================================ import chainer from chainer import functions import chainer_chemistry from 
class GINUpdate(chainer.Chain):
    r"""GIN submodule for update part.

    Simplest implementation of Graph Isomorphism Network (GIN):
    N-layered MLP + ReLU
    No learnable epsilon

    Batch Normalization is not implemented. instead we use dropout

    # TODO: implement Batch Normalization inside GraphMLP
    # Linear -> BN -> relu is used.

    See: Xu, Hu, Leskovec, and Jegelka, \
        "How powerful are graph neural networks?", in ICLR 2019.

    Args:
        in_channels (int or None): input dim of feature vector for each node
        hidden_channels (int): dimension of feature vector for each node
        out_channels (int or None): output dim of feature vector for each
            node. When `None`, `hidden_channels` is used.
        dropout_ratio (float): ratio of dropout, instead of batch
            normalization
        n_layers (int): layers used in `GraphMLP`
    """

    def __init__(self, in_channels=None, hidden_channels=16,
                 out_channels=None, dropout_ratio=0.5, n_layers=2, **kwargs):
        if out_channels is None:
            out_channels = hidden_channels
        super(GINUpdate, self).__init__()
        # Hidden layers followed by the output layer.
        channels = [hidden_channels] * (n_layers - 1) + [out_channels]
        with self.init_scope():
            # two Linear + RELU
            self.graph_mlp = GraphMLP(
                channels=channels,
                in_channels=in_channels,
                activation=functions.relu)
        self.dropout_ratio = dropout_ratio

    def __call__(self, h, adj, **kwargs):
        """Describing a layer.

        Args:
            h (numpy.ndarray): minibatch by num_nodes by hidden_dim
                numpy array. local node hidden states
            adj (numpy.ndarray): minibatch by num_nodes by num_nodes 1/0
                array. Adjacency matrices over several bond types

        Returns:
            updated h
        """
        # Support for one graph (node classification task)
        if h.ndim == 2:
            h = h[None]

        # (minibatch, atom, ch)
        mb, atom, ch = h.shape

        # --- Message part ---
        if isinstance(adj, chainer.utils.CooMatrix):
            # coo pattern
            # Support for one graph: promote to a batch of size 1.
            if adj.data.ndim == 1:
                adj.data = adj.data[None]
                adj.col = adj.col[None]
                adj.row = adj.row[None]
            fv = functions.sparse_matmul(adj, h)
        else:
            # padding pattern
            # adj (mb, atom, atom)
            # fv (minibatch, atom, ch)
            fv = chainer_chemistry.functions.matmul(adj, h)
        assert (fv.shape == (mb, atom, ch))

        # sum myself (self-loop; fixed epsilon = 0)
        sum_h = fv + h
        assert (sum_h.shape == (mb, atom, ch))

        # apply MLP
        new_h = self.graph_mlp(sum_h)
        new_h = functions.relu(new_h)
        if self.dropout_ratio > 0.0:
            new_h = functions.dropout(new_h, ratio=self.dropout_ratio)
        return new_h


class GINSparseUpdate(chainer.Chain):
    """sparse GIN submodule for update part"""

    def __init__(self, in_channels=None, hidden_channels=16,
                 out_channels=None, dropout_ratio=0.5, n_layers=2, **kwargs):
        # To avoid circular reference
        from chainer_chemistry.models.mlp import MLP
        if out_channels is None:
            out_channels = hidden_channels
        super(GINSparseUpdate, self).__init__()
        with self.init_scope():
            self.mlp = MLP(
                out_dim=out_channels,
                hidden_dim=hidden_channels,
                n_layers=n_layers,
                activation=functions.relu
            )
        self.dropout_ratio = dropout_ratio

    def __call__(self, h, edge_index):
        # add self node feature
        new_h = h
        # Scatter each source node's feature onto its destination node.
        messages = h[edge_index[0]]
        new_h = functions.scatter_add(new_h, edge_index[1], messages)
        # apply MLP
        new_h = self.mlp(new_h)
        if self.dropout_ratio > 0.0:
            new_h = functions.dropout(new_h, ratio=self.dropout_ratio)
        return new_h
GNNFiLMUpdate(chainer.Chain): """GNNFiLM submodule for update part. Args: hidden_channels (int): dimension of feature vector associated to each atom n_edge_types (int): number of types of edge """ def __init__(self, hidden_channels=16, n_edge_types=5, activation=functions.relu): super(GNNFiLMUpdate, self).__init__() self.n_edge_types = n_edge_types self.activation = activation with self.init_scope(): self.W_linear = GraphLinear( in_size=None, out_size=self.n_edge_types * hidden_channels, nobias=True) # W_l in eq. (6) self.W_g = GraphLinear( in_size=None, out_size=self.n_edge_types * hidden_channels * 2, nobias=True) # g in eq. (6) self.norm_layer = links.LayerNormalization() # l in eq. (6) def forward(self, h, adj): # --- Message part --- xp = self.xp mb, atom, ch = h.shape newshape = adj.shape + (ch, ) adj = functions.broadcast_to(adj[:, :, :, :, xp.newaxis], newshape) messages = functions.reshape(self.W_linear(h), (mb, atom, ch, self.n_edge_types)) messages = functions.transpose(messages, (3, 0, 1, 2)) film_weights = functions.reshape(self.W_g(h), (mb, atom, 2 * ch, self.n_edge_types)) film_weights = functions.transpose(film_weights, (3, 0, 1, 2)) # (n_edge_types, minibatch, atom, out_ch) gamma = film_weights[:, :, :, :ch] # (n_edge_types, minibatch, atom, out_ch) beta = film_weights[:, :, :, ch:] # --- Update part --- messages = functions.expand_dims( gamma, axis=3) * functions.expand_dims( messages, axis=2) + functions.expand_dims(beta, axis=3) messages = self.activation(messages) # (minibatch, n_edge_types, atom, atom, out_ch) messages = functions.transpose(messages, (1, 0, 2, 3, 4)) messages = adj * messages messages = functions.sum(messages, axis=3) # sum across atoms messages = functions.sum(messages, axis=1) # sum across n_edge_types messages = functions.reshape(messages, (mb * atom, ch)) messages = self.norm_layer(messages) messages = functions.reshape(messages, (mb, atom, ch)) return messages ================================================ FILE: 
chainer_chemistry/links/update/megnet_update.py ================================================ import chainer from chainer import functions, links # NOQA from chainer_chemistry.functions import megnet_softplus class DenseLayer(chainer.Chain): def __init__(self, hidden_dim=[64, 32], activation=megnet_softplus): super(DenseLayer, self).__init__() self.n_layers = len(hidden_dim) self.activation = activation with self.init_scope(): self.update_layer = chainer.ChainList( *[links.Linear(None, hidden_dim[i]) for i in range(self.n_layers)]) def __call__(self, v): for i in range(self.n_layers): v = self.activation(self.update_layer[i](v)) return v class UpdateLayer(chainer.Chain): def __init__(self, hidden_dim=[64, 64, 32], activation=megnet_softplus): super(UpdateLayer, self).__init__() self.n_layers = len(hidden_dim) self.activation = activation with self.init_scope(): self.update_layer = chainer.ChainList( *[links.Linear(None, hidden_dim[i]) for i in range(self.n_layers)]) def __call__(self, v): for i in range(self.n_layers): v = self.update_layer[i](v) # doesn't pass the activation at the last layer if i != (self.n_layers-1): v = self.activation(v) return v def get_mean_feat(feat, idx, out_shape, xp): """Return mean node or edge feature in each graph. This method is the same as average pooling about node or edge feature in each graph. """ zero = xp.zeros(out_shape, dtype=xp.float32) sum_vec = functions.scatter_add(zero, idx, feat) one = xp.ones(feat.shape, dtype=xp.float32) degree = functions.scatter_add(zero, idx, one) return sum_vec / degree class MEGNetUpdate(chainer.Chain): """Update submodule for MEGNet Args: dim_for_dense (list): dimension list of dense layer dim_for_update (list): dimension list of update layer dropout_ratio (float): ratio of dropout activation (~chainer.Function or ~chainer.FunctionNode): activate function for megnet model `megnet_softplus` was used in original paper. 
class MEGNetUpdate(chainer.Chain):
    """Update submodule for MEGNet

    Args:
        dim_for_dense (list): dimension list of dense layer
        dim_for_update (list): dimension list of update layer
        dropout_ratio (float): ratio of dropout
        activation (~chainer.Function or ~chainer.FunctionNode):
            activate function for megnet model
            `megnet_softplus` was used in original paper.
        skip_intermediate (bool): When `True`, intermediate feature after
            dense calculation is used for skip connection.
            When `False`, input feature is used for skip connection.
            It is `True` for first layer, and `False` for other layer
            in the original paper.
    """

    def __init__(self, dim_for_dense=[64, 32], dim_for_update=[64, 64, 32],
                 dropout_ratio=-1, activation=megnet_softplus,
                 skip_intermediate=True):
        super(MEGNetUpdate, self).__init__()
        if len(dim_for_dense) != 2:
            raise ValueError('dim_for_dense must have 2 elements')
        if len(dim_for_update) != 3:
            raise ValueError('dim_for_update must have 3 elements')
        self.dropout_ratio = dropout_ratio
        with self.init_scope():
            # for dense layer
            self.dense_for_atom = DenseLayer(dim_for_dense, activation)
            self.dense_for_pair = DenseLayer(dim_for_dense, activation)
            self.dense_for_global = DenseLayer(dim_for_dense, activation)
            # for update layer
            self.update_for_atom = UpdateLayer(dim_for_update, activation)
            self.update_for_pair = UpdateLayer(dim_for_update, activation)
            self.update_for_global = UpdateLayer(dim_for_update, activation)
        self.skip_intermediate = skip_intermediate

    def __call__(self, atoms_feat, pair_feat, global_feat, atom_idx, pair_idx,
                 start_idx, end_idx):
        # 1) Pass the Dense layer
        a_f_d = self.dense_for_atom(atoms_feat)
        p_f_d = self.dense_for_pair(pair_feat)
        g_f_d = self.dense_for_global(global_feat)

        # 2) Update the edge vector
        # Each edge sees its endpoint atoms and its graph's global state.
        start_node = a_f_d[start_idx]
        end_node = a_f_d[end_idx]
        g_f_extend_with_pair_idx = g_f_d[pair_idx]
        concat_p_v = functions.concat((p_f_d, start_node, end_node,
                                       g_f_extend_with_pair_idx))
        update_p = self.update_for_pair(concat_p_v)

        # 3) Update the node vector
        # 1. get sum edge feature of all nodes using scatter_add method
        zero = self.xp.zeros(a_f_d.shape, dtype=self.xp.float32)
        sum_edeg_vec = functions.scatter_add(zero, start_idx, update_p) + \
            functions.scatter_add(zero, end_idx, update_p)
        # 2. get degree of all nodes using scatter_add method
        one = self.xp.ones(p_f_d.shape, dtype=self.xp.float32)
        degree = functions.scatter_add(zero, start_idx, one) + \
            functions.scatter_add(zero, end_idx, one)
        # 3. get mean edge feature of all nodes
        mean_edge_vec = sum_edeg_vec / degree
        # 4. concatenate with the graph's global state
        g_f_extend_with_atom_idx = g_f_d[atom_idx]
        concat_a_v = functions.concat((a_f_d, mean_edge_vec,
                                       g_f_extend_with_atom_idx))
        update_a = self.update_for_atom(concat_a_v)

        # 4) Update the global vector
        # Global state sees the mean of updated edges and atoms per graph.
        out_shape = g_f_d.shape
        ave_p = get_mean_feat(update_p, pair_idx, out_shape, self.xp)
        ave_a = get_mean_feat(update_a, atom_idx, out_shape, self.xp)
        concat_g_v = functions.concat((ave_a, ave_p, g_f_d), axis=1)
        update_g = self.update_for_global(concat_g_v)

        # 5) Skip connection
        if self.skip_intermediate:
            # Skip intermediate feature, used for first layer.
            new_a_f = update_a + a_f_d
            new_p_f = update_p + p_f_d
            new_g_f = update_g + g_f_d
        else:
            # Skip input feature, used all layer except first layer.
            # input feature must be same dimension with updated feature.
            new_a_f = update_a + atoms_feat
            new_p_f = update_p + pair_feat
            new_g_f = update_g + global_feat

        # 6) dropout
        if self.dropout_ratio > 0.0:
            new_a_f = functions.dropout(new_a_f, ratio=self.dropout_ratio)
            new_p_f = functions.dropout(new_p_f, ratio=self.dropout_ratio)
            new_g_f = functions.dropout(new_g_f, ratio=self.dropout_ratio)

        return new_a_f, new_p_f, new_g_f
class MPNNUpdate(chainer.Chain):
    r"""MPNN submodule for update part.

    See: Justin Gilmer+, \
        Neural Message Passing for Quantum Chemistry. April 2017.
        `arXiv:1704.01212 <https://arxiv.org/abs/1704.01212>`

    Args:
        in_channels (int or None): input dim of feature vector for each node
        hidden_channels (int): dimension of feature vector for each node
        out_channels (int or None): output dim of feature vector for each
            node. When `None`, `hidden_channels` is used.
        nn (~chainer.Link):
    """

    def __init__(self, in_channels=None, hidden_channels=16,
                 out_channels=None, nn=None, **kwargs):
        if out_channels is None:
            out_channels = hidden_channels
        if in_channels is None:
            # Current `EdgeNet` hidden_channels must be same with input `h` dim
            in_channels = out_channels
        super(MPNNUpdate, self).__init__()
        with self.init_scope():
            self.message_layer = EdgeNet(out_channels=hidden_channels, nn=nn)
            self.update_layer = links.GRU(2 * hidden_channels, out_channels)
        self.in_channels = in_channels  # currently it is not used...
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.nn = nn

    def __call__(self, h, adj, **kwargs):
        # type: (chainer.Variable, chainer.Variable) -> chainer.Variable
        # adj: (mb, edge_type, node, node)
        mb, node, ch = h.shape
        # Message passing over edges, then GRU update per node.
        h = self.message_layer(h, adj)  # h: (mb, node, hidden_dim*2)
        h = functions.reshape(h, (mb * node, self.hidden_channels * 2))
        h = self.update_layer(h)  # h: (mb*node, hidden_dim)
        h = functions.reshape(h, (mb, node, self.out_channels))
        return h

    def reset_state(self):
        # Clears the GRU hidden state kept across calls.
        self.update_layer.reset_state()


class EdgeNet(chainer.Chain):
    """MPNN submodule for message part.

    Edge Network expands edge vector dimension to (d x d) matrix.
    If undirected graph, adj_in and adj_out are same.

    Args:
        out_channels (int): dimension of output feature vector
            Currently, it must be same with input dimension.
        nn (~chainer.Link):
    """

    def __init__(self, out_channels, nn=None):
        # type: (int, chainer.Link) -> None
        super(EdgeNet, self).__init__()
        if nn is None:
            from chainer_chemistry.models.mlp import MLP
            nn = MLP(out_dim=out_channels**2, hidden_dim=16)
        if not isinstance(nn, chainer.Link):
            raise ValueError('nn {} must be chainer.Link'.format(nn))
        with self.init_scope():
            # NOTE(review): the SAME `nn` instance is assigned to both
            # attributes, so the in- and out-edge networks share weights —
            # presumably intended as weight sharing; confirm against the
            # MPNN paper, which describes separate networks.
            self.nn_layer_in = nn
            self.nn_layer_out = nn
        self.out_channels = out_channels

    def __call__(self, h, adj):
        # type: (chainer.Variable, chainer.Variable) -> chainer.Variable
        mb, node, ch = h.shape
        if ch != self.out_channels:
            raise ValueError('hidden_channels must be equal to dimension '
                             'of feature vector associated to each atom, '
                             '{}, but it was set to {}'.format(
                                 ch, self.out_channels))

        # adj: (mb, edge_type, node, node)
        edge_type = adj.shape[1]
        adj_in = adj
        # Reverse edge direction for the "out" messages.
        adj_out = functions.transpose(adj, axes=(0, 1, 3, 2))

        # expand edge vector to matrix
        adj_in = functions.reshape(adj_in, (-1, edge_type))
        # adj_in: (mb*node*node, edge_type)
        adj_in = self.nn_layer_in(adj_in)
        # adj_in: (mb*node*node, out_ch*out_ch)
        adj_in = functions.reshape(adj_in, (mb, node, node, ch, ch))
        # Build the block matrix used for message passing.
        adj_in = functions.reshape(
            functions.transpose(adj_in, axes=(0, 1, 3, 2, 4)),
            (mb, node * ch, node * ch))

        adj_out = functions.reshape(adj_out, (-1, edge_type))
        # adj_out: (mb*node*node, edge_type)
        adj_out = self.nn_layer_out(adj_out)
        # adj_out: (mb*node*node, out_ch*out_ch)
        adj_out = functions.reshape(adj_out, (mb, node, node, ch, ch))
        adj_out = functions.reshape(
            functions.transpose(adj_out, axes=(0, 1, 3, 2, 4)),
            (mb, node * ch, node * ch))

        # calculate message
        h = functions.reshape(h, (mb, node * ch, 1))
        message_in = chainer_chemistry.functions.matmul(adj_in, h)
        # message_in: (mb, node*ch, 1)
        message_in = functions.reshape(message_in, (mb, node, ch))
        # message_in: (mb, node, out_ch)
        message_out = chainer_chemistry.functions.matmul(adj_out, h)
        # message_out: (mb, node*ch, 1)
        message_out = functions.reshape(message_out, (mb, node, ch))
        message = functions.concat([message_in, message_out], axis=2)
        return message  # message: (mb, node, out_ch * 2)
functions.concat([message_in, message_out], axis=2) return message # message: (mb, node, out_ch * 2) ================================================ FILE: chainer_chemistry/links/update/nfp_update.py ================================================ import chainer from chainer import functions import numpy import chainer_chemistry from chainer_chemistry.links.connection.graph_linear import GraphLinear class NFPUpdate(chainer.Chain): """NFP submodule for update part. Args: in_channels (int or None): input channel dimension out_channels (int): output channel dimension max_degree (int): max degree of edge """ def __init__(self, in_channels, out_channels, max_degree=6, **kwargs): super(NFPUpdate, self).__init__() num_degree_type = max_degree + 1 with self.init_scope(): self.graph_linears = chainer.ChainList( *[GraphLinear(in_channels, out_channels) for _ in range(num_degree_type)]) self.max_degree = max_degree self.in_channels = in_channels self.out_channels = out_channels def __call__(self, h, adj, deg_conds): # h: (minibatch, atom, ch) # h encodes each atom's info in ch axis of size hidden_dim # adjs: (minibatch, atom, atom) # --- Message part --- # Take sum along adjacent atoms # fv: (minibatch, atom, ch) fv = chainer_chemistry.functions.matmul(adj, h) # --- Update part --- # TODO(nakago): self.xp is chainerx if self.xp is numpy: zero_array = numpy.zeros(fv.shape, dtype=numpy.float32) else: zero_array = self.xp.zeros_like(fv.array) fvds = [functions.where(cond, fv, zero_array) for cond in deg_conds] out_h = 0 for graph_linear, fvd in zip(self.graph_linears, fvds): out_h = out_h + graph_linear(fvd) # out_h shape (minibatch, max_num_atoms, hidden_dim) out_h = functions.sigmoid(out_h) return out_h ================================================ FILE: chainer_chemistry/links/update/relgat_update.py ================================================ import chainer from chainer import functions from chainer_chemistry.links.connection.graph_linear import GraphLinear class 
class RelGATUpdate(chainer.Chain):
    """RelGAT submodule for update part.

    Args:
        in_channels (int or None): dimension of input feature vector
        out_channels (int): dimension of output feature vector
        n_heads (int): number of multi-head-attentions.
        n_edge_types (int): number of edge types.
        dropout_ratio (float): dropout ratio of the normalized attention
            coefficients
        negative_slope (float): LeakyRELU angle of the negative slope
        softmax_mode (str): take the softmax over the logits 'across' or
            'within' relation. If you would like to know the detail
            discussion, please refer Relational GAT paper.
        concat_heads (bool): Whether to concat or average multi-head
            attentions
    """

    def __init__(self, in_channels, out_channels, n_heads=3, n_edge_types=4,
                 dropout_ratio=-1., negative_slope=0.2, softmax_mode='across',
                 concat_heads=False):
        super(RelGATUpdate, self).__init__()
        with self.init_scope():
            self.message_layer = GraphLinear(
                in_channels, out_channels * n_edge_types * n_heads)
            self.attention_layer = GraphLinear(out_channels * 2, 1)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.n_edge_types = n_edge_types
        self.dropout_ratio = dropout_ratio
        self.softmax_mode = softmax_mode
        self.concat_heads = concat_heads
        self.negative_slope = negative_slope

    def __call__(self, h, adj, **kwargs):
        xp = self.xp
        # (minibatch, atom, channel)
        mb, atom, ch = h.shape
        # (minibatch, atom, EDGE_TYPE * heads * out_dim)
        h = self.message_layer(h)
        # (minibatch, atom, EDGE_TYPE, heads, out_dim)
        h = functions.reshape(h, (mb, atom, self.n_edge_types, self.n_heads,
                                  self.out_channels))
        # concat all pairs of atom
        # (minibatch, 1, atom, EDGE_TYPE, heads, out_dim)
        h_i = functions.reshape(h, (mb, 1, atom, self.n_edge_types,
                                    self.n_heads, self.out_channels))
        # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim)
        h_i = functions.broadcast_to(h_i, (mb, atom, atom, self.n_edge_types,
                                           self.n_heads, self.out_channels))
        # (minibatch, atom, 1, EDGE_TYPE, heads, out_dim)
        h_j = functions.reshape(h, (mb, atom, 1, self.n_edge_types,
                                    self.n_heads, self.out_channels))
        # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim)
        h_j = functions.broadcast_to(h_j, (mb, atom, atom, self.n_edge_types,
                                           self.n_heads, self.out_channels))
        # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim * 2)
        e = functions.concat([h_i, h_j], axis=5)
        # (minibatch, EDGE_TYPE, heads, atom, atom, out_dim * 2)
        e = functions.transpose(e, (0, 3, 4, 1, 2, 5))
        # (minibatch * EDGE_TYPE * heads, atom * atom, out_dim * 2)
        e = functions.reshape(e, (mb * self.n_edge_types * self.n_heads,
                                  atom * atom, self.out_channels * 2))
        # (minibatch * EDGE_TYPE * heads, atom * atom, 1)
        e = self.attention_layer(e)
        # (minibatch, EDGE_TYPE, heads, atom, atom)
        e = functions.reshape(e, (mb, self.n_edge_types, self.n_heads,
                                  atom, atom))
        e = functions.leaky_relu(e, self.negative_slope)

        # Mask attention logits of non-connected pairs.
        # NOTE: use builtin `bool` here — the `numpy.bool`/`cupy.bool`
        # aliases (previously reached via `xp.bool`) were deprecated and
        # removed in NumPy 1.24 / recent CuPy, which made the old
        # `astype(xp.bool)` raise AttributeError.
        if isinstance(adj, chainer.Variable):
            cond = adj.array.astype(bool)
        else:
            cond = adj.astype(bool)
        # (minibatch, EDGE_TYPE, 1, atom, atom)
        cond = xp.reshape(cond, (mb, self.n_edge_types, 1, atom, atom))
        # (minibatch, EDGE_TYPE, heads, atom, atom)
        cond = xp.broadcast_to(cond, e.array.shape)
        # TODO(mottodora): find better way to ignore non connected
        e = functions.where(cond, e,
                            xp.broadcast_to(xp.array(-10000), e.array.shape)
                            .astype(xp.float32))

        # In Relational Graph Attention Networks eq.(7)
        # ARGAT: take the softmax over the logits across node neighborhoods
        # irrespective of relation
        if self.softmax_mode == 'across':
            # (minibatch, heads, atom, EDGE_TYPE, atom)
            e = functions.transpose(e, (0, 2, 3, 1, 4))
            # (minibatch, heads, atom, EDGE_TYPE * atom)
            e = functions.reshape(e, (mb, self.n_heads, atom,
                                      self.n_edge_types * atom))
            # (minibatch, heads, atom, EDGE_TYPE * atom)
            alpha = functions.softmax(e, axis=3)
            if self.dropout_ratio >= 0:
                alpha = functions.dropout(alpha, ratio=self.dropout_ratio)
            # (minibatch, heads, atom, EDGE_TYPE, atom)
            alpha = functions.reshape(alpha, (mb, self.n_heads, atom,
                                              self.n_edge_types, atom))
            # (minibatch, EDGE_TYPE, heads, atom, atom)
            alpha = functions.transpose(alpha, (0, 3, 1, 2, 4))
        # In Relational Graph Attention Networks eq.(6)
        # WIRGAT: take the softmax over the logits independently for each
        # relation
        elif self.softmax_mode == 'within':
            alpha = functions.softmax(e, axis=4)
            if self.dropout_ratio >= 0:
                alpha = functions.dropout(alpha, ratio=self.dropout_ratio)
        else:
            raise ValueError("{} is invalid. Please use 'across' or 'within'"
                             .format(self.softmax_mode))

        # before: (minibatch, atom, EDGE_TYPE, heads, out_dim)
        # after: (minibatch, EDGE_TYPE, heads, atom, out_dim)
        h = functions.transpose(h, (0, 2, 3, 1, 4))
        # (minibatch, EDGE_TYPE, heads, atom, out_dim)
        h_new = functions.matmul(alpha, h)
        # (minibatch, heads, atom, out_dim): sum over edge types
        h_new = functions.sum(h_new, axis=1)
        if self.concat_heads:
            # -> (minibatch, atom, heads, out_dim)
            h_new = functions.transpose(h_new, (0, 2, 1, 3))
            bs, n_nodes, n_heads, outdim = h_new.shape
            # (minibatch, atom, heads * out_dim)
            h_new = functions.reshape(h_new, (bs, n_nodes, n_heads * outdim))
        else:
            # (minibatch, atom, out_dim): average over heads
            h_new = functions.mean(h_new, axis=1)
        return h_new
Args: in_channels (int or None): input channel dimension out_channels (int): output channel dimension num_edge_type (int): number of types of edge """ def __init__(self, in_channels, out_channels, n_edge_types=4, **kwargs): super(RelGCNUpdate, self).__init__() with self.init_scope(): self.graph_linear_self = GraphLinear(in_channels, out_channels) self.graph_linear_edge = GraphLinear( in_channels, out_channels * n_edge_types) self.n_edge_types = n_edge_types self.in_channels = in_channels self.out_channels = out_channels def __call__(self, h, adj, **kwargs): """main calculation Args: h: (batchsize, num_nodes, in_channels) adj: (batchsize, num_edge_type, num_nodes, num_nodes) Returns: (batchsize, num_nodes, ch) """ mb, node, ch = h.shape # --- self connection, apply linear function --- hs = self.graph_linear_self(h) # --- relational feature, from neighbor connection --- # Expected number of neighbors of a vertex # Since you have to divide by it, if its 0, you need to # arbitrarily set it to 1 m = self.graph_linear_edge(h) m = functions.reshape( m, (mb, node, self.out_channels, self.n_edge_types)) m = functions.transpose(m, (0, 3, 1, 2)) # m: (batchsize, edge_type, node, ch) # hrL (batchsize, edge_type, node, ch) hr = functions.matmul(adj, m) # hr: (batchsize, node, ch) hr = functions.sum(hr, axis=1) return hs + hr class RelGCNSparseUpdate(chainer.Chain): """sparse RelGCN submodule for update part""" def __init__(self, in_channels, out_channels, n_edge_types): super(RelGCNSparseUpdate, self).__init__() self.out_channels = out_channels self.n_edge_types = n_edge_types with self.init_scope(): self.root_weight = chainer.links.Linear(in_channels, out_channels) self.edge_weight = chainer.links.Linear( in_channels, n_edge_types * out_channels) def __call__(self, h, edge_index, edge_attr): next_h = self.root_weight(h) features = self.edge_weight( h) .reshape(-1, self.n_edge_types, self.out_channels) messages = features[edge_index[0], edge_attr, :] return 
functions.scatter_add(next_h, edge_index[1], messages) ================================================ FILE: chainer_chemistry/links/update/rsgcn_update.py ================================================ import chainer import chainer_chemistry from chainer_chemistry.links.connection.graph_linear import GraphLinear class RSGCNUpdate(chainer.Chain): """RSGCN submodule for message and update part. Args: in_channels (int or None): input channel dimension out_channels (int): output channel dimension """ def __init__(self, in_channels, out_channels, **kwargs): super(RSGCNUpdate, self).__init__() with self.init_scope(): self.graph_linear = GraphLinear( in_channels, out_channels, nobias=True) self.in_channels = in_channels self.out_channels = out_channels def __call__(self, h, adj, **kwargs): # --- Message part --- h = chainer_chemistry.functions.matmul(adj, h) # --- Update part --- h = self.graph_linear(h) return h ================================================ FILE: chainer_chemistry/links/update/schnet_update.py ================================================ """ Chainer implementation of CFConv. SchNet: A continuous-filter convolutional neural network for modeling quantum interactions Kristof et al. See: https://arxiv.org/abs/1706.08566 """ import chainer from chainer import functions from chainer import links from chainer_chemistry.functions import shifted_softplus from chainer_chemistry.links.connection.graph_linear import GraphLinear class CFConv(chainer.Chain): """CFConv Args: num_rbf (int): Number of RBF kernel radius_resolution (float): resolution of radius. Roughly `num_rbf * radius_resolution` ball is convolved in 1 step. gamma (float): coefficient to apply kernel. 
class CFConv(chainer.Chain):
    """CFConv

    Continuous-filter convolution: interatomic distances are expanded on a
    radial-basis (Gaussian) grid, passed through a two-layer filter network,
    and used to weight neighbor features.

    Args:
        num_rbf (int): Number of RBF kernel
        radius_resolution (float): resolution of radius.
            Roughly `num_rbf * radius_resolution` ball is convolved in 1 step.
        gamma (float): coefficient to apply kernel.
        hidden_dim (int): hidden dim
    """

    def __init__(self, num_rbf=300, radius_resolution=0.1, gamma=10.0,
                 hidden_dim=64):
        super(CFConv, self).__init__()
        with self.init_scope():
            self.dense1 = links.Linear(num_rbf, hidden_dim)
            self.dense2 = links.Linear(hidden_dim)
        self.hidden_dim = hidden_dim
        self.num_rbf = num_rbf
        self.radius_resolution = radius_resolution
        self.gamma = gamma

    def __call__(self, h, dist):
        """main calculation

        Args:
            h (numpy.ndarray): axis 0 represents minibatch index,
                axis 1 represents atom_index and axis2 represents
                feature dimension.
            dist (numpy.ndarray): axis 0 represents minibatch index,
                axis 1 and 2 represent distance between atoms.
        """
        mb, atom, ch = h.shape
        if ch != self.hidden_dim:
            raise ValueError('h.shape[2] {} and hidden_dim {} must be same!'
                             .format(ch, self.hidden_dim))
        # Grid of RBF centers: 0, res, 2*res, ..., (num_rbf-1)*res.
        embedlist = self.xp.arange(
            self.num_rbf).astype('f') * self.radius_resolution
        dist = functions.reshape(dist, (mb, atom, atom, 1))
        dist = functions.broadcast_to(dist, (mb, atom, atom, self.num_rbf))
        # Gaussian RBF expansion of each pairwise distance.
        dist = functions.exp(- self.gamma * (dist - embedlist) ** 2)
        # Filter-generating network: two dense layers with shifted softplus.
        dist = functions.reshape(dist, (-1, self.num_rbf))
        dist = self.dense1(dist)
        dist = shifted_softplus(dist)
        dist = self.dense2(dist)
        dist = shifted_softplus(dist)
        dist = functions.reshape(dist, (mb, atom, atom, self.hidden_dim))
        # Weight every neighbor's feature by its distance filter and sum
        # over the neighbor axis (axis=1).
        h = functions.reshape(h, (mb, atom, 1, self.hidden_dim))
        h = functions.broadcast_to(h, (mb, atom, atom, self.hidden_dim))
        h = functions.sum(h * dist, axis=1)
        return h


class SchNetUpdate(chainer.Chain):
    """Update submodule for SchNet

    Residual interaction block: linear -> CFConv -> linear -> shifted
    softplus -> linear, added back onto the input features.

    `in_channels` and `hidden_channels` must be same with
    `hidden_channels` in this module.

    Args:
        hidden_channels (int):
        num_rbf (int):
        radius_resolution (float):
        gamma (float):
    """

    def __init__(self, hidden_channels=64, num_rbf=300,
                 radius_resolution=0.1, gamma=10.0):
        super(SchNetUpdate, self).__init__()
        with self.init_scope():
            # Three linear layers: pre-conv, post-conv, and output.
            self.linear = chainer.ChainList(
                *[GraphLinear(None, hidden_channels) for _ in range(3)])
            self.cfconv = CFConv(
                num_rbf=num_rbf, radius_resolution=radius_resolution,
                gamma=gamma, hidden_dim=hidden_channels)
        self.hidden_channels = hidden_channels

    def __call__(self, h, adj, **kwargs):
        # NOTE: `adj` here carries pairwise distances, consumed by CFConv.
        v = self.linear[0](h)
        v = self.cfconv(v, adj)
        v = self.linear[1](v)
        v = shifted_softplus(v)
        v = self.linear[2](v)
        # Residual connection.
        return h + v
class CGCNN(chainer.Chain):
    """CGCNN

    See Tian Xie et al, \
        Crystal Graph Convolutional Neural Networks for an Accurate
        and Interpretable Prediction of Material Properties. \
        `arXiv:1710.10324 `_

    Args:
        out_dim (int): dimension of output feature vector
        n_update_layers (int): number of CGCNNUpdate layers
        n_atom_features (int): hidden dimension of atom feature vector
    """

    def __init__(self, out_dim=128, n_update_layers=3, n_atom_features=64):
        super(CGCNN, self).__init__()
        # Build the stack of crystal-graph convolution layers up front.
        conv_layers = [CGCNNUpdate(n_atom_features)
                       for _ in range(n_update_layers)]
        with self.init_scope():
            self.atom_feature_embedding = links.Linear(None, n_atom_features)
            self.crystal_convs = chainer.ChainList(*conv_layers)
            self.readout = CGCNNReadout(out_dim=out_dim)

    def __call__(self, atom_feat, nbr_feat, atom_idx, feat_idx):
        # Embed raw atom features, run every convolution, then pool
        # per-crystal with the readout.
        h = self.atom_feature_embedding(atom_feat)
        for update in self.crystal_convs:
            h = update(h, nbr_feat, feat_idx)
        return self.readout(h, atom_idx)
MAX_WLE_NUM = 800  # default vocabulary size for WLE (Weisfeiler-Lehman) labels


def to_array(x):
    """Convert x into numpy.ndarray or cupy.ndarray"""
    if isinstance(x, chainer.Variable):
        x = x.array
    return x


class CWLEGraphConvModel(chainer.Chain):
    """Unified module of Graph Convolution Model with CWLE

    Note that this module is experimental, all update_layer and
    readout_layer combination is not supported. Please refer
    `test_gwm_graph_conv_model.py` for tested combinations.
    This module might not be maintained in the future.

    Args:
        hidden_channels (int or list): hidden channels for update
        out_dim (int): output dim
        update_layer (chainer.links.Link):
        readout_layer (chainer.links.Link):
        n_update_layers (int or None):
        out_channels (None or lsit):
        wle_dim (int):
        n_atom_types (int):
        n_edge_types (int):
        dropout_ratio (float):
        with_wle (bool): enabler for Combined NLE
        concat_hidden (bool):
        sum_hidden (bool):
        weight_tying (bool):
        scale_adj (bool):
        activation (callable):
        use_batchnorm (bool):
        n_activation (int or None):
        update_kwargs (dict or None):
        readout_kwargs (dict or None):
        wle_kwargs (dict or None):
        n_wle_types (string):
    """

    def __init__(self, hidden_channels, out_dim, update_layer, readout_layer,
                 n_update_layers=None, out_channels=None, wle_dim=None,
                 n_atom_types=MAX_ATOMIC_NUM, n_edge_types=4,
                 dropout_ratio=-1.0, with_wle=True, concat_hidden=False,
                 sum_hidden=False, weight_tying=False, scale_adj=False,
                 activation=None, use_batchnorm=False, n_activation=None,
                 update_kwargs=None, readout_kwargs=None, wle_kwargs=None,
                 n_wle_types=MAX_WLE_NUM):
        super(CWLEGraphConvModel, self).__init__()
        # General: length of hidden_channels must be n_layers + 1
        if isinstance(hidden_channels, int):
            if n_update_layers is None:
                raise ValueError('n_update_layers is None')
            else:
                hidden_channels = [hidden_channels
                                   for _ in range(n_update_layers + 1)]
        elif isinstance(hidden_channels, list):
            if out_channels is None:
                n_update_layers = len(hidden_channels) - 1
            else:
                n_update_layers = len(hidden_channels)
        else:
            raise TypeError('Unexpected value for hidden_channels {}'
                            .format(hidden_channels))
        if readout_layer == GeneralReadout and hidden_channels[-1] != out_dim:
            # When use GWM, hidden channels must be same. But GeneralReadout
            # cannot change the dimension. So when use General Readout and
            # GWM, hidden channel and out_dim should be same.
            if with_wle:
                raise ValueError('Unsupported combination.')
            else:
                hidden_channels[-1] = out_dim

        # When use with_gwm, concat_hidden, sum_hidden and weight_tying
        # option, hidden_channels must be same
        if with_wle or concat_hidden or sum_hidden or weight_tying:
            if not all([in_dim == hidden_channels[0]
                        for in_dim in hidden_channels]):
                raise ValueError(
                    'hidden_channels must be same but different {}'
                    .format(hidden_channels))

        if with_wle and wle_dim is None:
            print('[WARNING] wle_dim is None, set to {}'
                  .format(hidden_channels[0]))
            wle_dim = hidden_channels[0]

        if out_channels is None:
            in_channels_list = hidden_channels[:-1]
            out_channels_list = hidden_channels[1:]
        else:
            # For RelGAT concat_heads option
            in_channels_list = hidden_channels
            out_channels_list = out_channels
        assert len(in_channels_list) == n_update_layers
        assert len(out_channels_list) == n_update_layers

        n_use_update_layers = 1 if weight_tying else n_update_layers
        n_readout_layers = \
            n_use_update_layers if concat_hidden or sum_hidden else 1
        n_activation = \
            n_use_update_layers if n_activation is None else n_activation

        if update_kwargs is None:
            update_kwargs = {}
        if readout_kwargs is None:
            readout_kwargs = {}
        if wle_kwargs is None:
            wle_kwargs = {}

        with self.init_scope():
            self.embed = EmbedAtomID(out_size=hidden_channels[0],
                                     in_size=n_atom_types)  # +1 for label 0
            self.update_layers = chainer.ChainList(
                *[update_layer(in_channels=in_channels_list[i],
                               out_channels=out_channels_list[i],
                               n_edge_types=n_edge_types, **update_kwargs)
                  for i in range(n_use_update_layers)])
            # when use weight_tying option, hidden_channels must be same.
            # So we can use -1 index
            self.readout_layers = chainer.ChainList(
                *[readout_layer(out_dim=out_dim,
                                # in_channels=hidden_channels[-1],
                                in_channels=None,
                                **readout_kwargs)
                  for _ in range(n_readout_layers)])
            if with_wle:
                self.embed_wle = links.EmbedID(out_size=wle_dim,
                                               in_size=n_wle_types)
                self.linear_for_concat_wle = GraphLinear(
                    in_size=wle_dim + hidden_channels[0],
                    out_size=hidden_channels[0])
            if use_batchnorm:
                self.bnorms = chainer.ChainList(
                    *[GraphBatchNormalization(out_channels_list[i])
                      for i in range(n_use_update_layers)])

        self.readout_layer = readout_layer
        self.update_layer = update_layer
        self.weight_tying = weight_tying
        self.with_wle = with_wle
        self.concat_hidden = concat_hidden
        self.sum_hidden = sum_hidden
        self.scale_adj = scale_adj
        self.activation = activation
        self.dropout_ratio = dropout_ratio
        self.use_batchnorm = use_batchnorm
        self.n_activation = n_activation
        self.n_update_layers = n_update_layers
        self.n_edge_types = n_edge_types

    def __call__(self, atom_array, adj, wle_array=None, is_real_node=None):
        self.reset_state()
        if atom_array.dtype == self.xp.int32:
            h = self.embed(atom_array)
        else:
            # TODO: GraphLinear or GraphMLP can be used.
            h = atom_array
        # Keep a copy of the initial embedding on the same device for
        # readouts that consume (h, h0).
        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)

        # all Combined NLE processes are done here: embed the WLE labels and
        # fuse them with the atom embedding via a linear layer.
        if self.with_wle:
            h_s = self.embed_wle(wle_array)
            h_h_s = functions.concat(
                (h, h_s), axis=2
            )
            h = self.linear_for_concat_wle(h_h_s)

        additional_kwargs = self.preprocess_addtional_kwargs(
            atom_array, adj, wle_array=wle_array, is_real_node=is_real_node)

        if self.scale_adj:
            adj = rescale_adj(adj)

        g_list = []
        for step in range(self.n_update_layers):
            update_layer_index = 0 if self.weight_tying else step
            h = self.update_layers[update_layer_index](
                h=h, adj=adj, **additional_kwargs)
            if self.use_batchnorm:
                h = self.bnorms[update_layer_index](h)
            if self.dropout_ratio > 0.:
                h = functions.dropout(h, ratio=self.dropout_ratio)
            if self.activation is not None and step < self.n_activation:
                h = self.activation(h)
            if self.concat_hidden or self.sum_hidden:
                g = self.readout_layers[step](
                    h=h, h0=h0, is_real_node=is_real_node,
                    **additional_kwargs)
                g_list.append(g)

        if self.concat_hidden:
            return functions.concat(g_list, axis=1)
        else:
            if self.sum_hidden:
                g = functions.sum(functions.stack(g_list), axis=0)
            else:
                # NOTE(review): unlike the per-step readout above, this final
                # readout does not forward **additional_kwargs — confirm
                # whether that asymmetry is intentional.
                g = self.readout_layers[0](
                    h=h, h0=h0, is_real_node=is_real_node)
            return g

    def reset_state(self):
        # Only some update layers (e.g. GRU-based ones) carry state.
        if hasattr(self.update_layers[0], 'reset_state'):
            [update_layer.reset_state()
             for update_layer in self.update_layers]

    def preprocess_addtional_kwargs(self, *args, **kwargs):
        # Hook for subclasses to inject extra kwargs into update/readout.
        return {}
class GGNN_CWLE(CWLEGraphConvModel):
    """GGNN with Combined WLE embedding."""

    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,
                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,
                 weight_tying=True, activation=functions.identity,
                 n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM):
        readout_kwargs = {'activation': activation,
                          'activation_agg': activation}
        super(GGNN_CWLE, self).__init__(
            update_layer=GGNNUpdate, readout_layer=GGNNReadout,
            out_dim=out_dim, hidden_channels=hidden_channels,
            n_update_layers=n_update_layers, n_atom_types=n_atom_types,
            concat_hidden=concat_hidden, weight_tying=weight_tying,
            n_edge_types=n_edge_types, with_wle=with_wle,
            readout_kwargs=readout_kwargs, n_wle_types=n_wle_types)


class RelGCN_CWLE(CWLEGraphConvModel):
    """RelGCN with Combined WLE embedding."""

    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,
                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,
                 weight_tying=True, activation=functions.identity,
                 n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM):
        readout_kwargs = {'activation': activation,
                          'activation_agg': activation}
        super(RelGCN_CWLE, self).__init__(
            update_layer=RelGCNUpdate, readout_layer=GGNNReadout,
            out_dim=out_dim, hidden_channels=hidden_channels,
            n_update_layers=n_update_layers, n_atom_types=n_atom_types,
            concat_hidden=concat_hidden, weight_tying=weight_tying,
            n_edge_types=n_edge_types, with_wle=with_wle,
            readout_kwargs=readout_kwargs, n_wle_types=n_wle_types)


class RelGAT_CWLE(CWLEGraphConvModel):
    """RelGAT with Combined WLE embedding."""

    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,
                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,
                 weight_tying=True, activation=functions.identity,
                 n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM):
        readout_kwargs = {'activation': activation,
                          'activation_agg': activation}
        super(RelGAT_CWLE, self).__init__(
            update_layer=RelGATUpdate, readout_layer=GGNNReadout,
            out_dim=out_dim, hidden_channels=hidden_channels,
            n_update_layers=n_update_layers, n_atom_types=n_atom_types,
            concat_hidden=concat_hidden, weight_tying=weight_tying,
            n_edge_types=n_edge_types, with_wle=with_wle,
            readout_kwargs=readout_kwargs, n_wle_types=n_wle_types)


class GIN_CWLE(CWLEGraphConvModel):
    """GIN with Combined WLE embedding."""

    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,
                 n_atom_types=MAX_ATOMIC_NUM, dropout_ratio=0.5,
                 concat_hidden=False, weight_tying=True,
                 activation=functions.identity, n_edge_types=4,
                 with_wle=True, n_wle_types=MAX_WLE_NUM):
        update_kwargs = {'dropout_ratio': dropout_ratio}
        readout_kwargs = {'activation': activation,
                          'activation_agg': activation}
        super(GIN_CWLE, self).__init__(
            update_layer=GINUpdate, readout_layer=GGNNReadout,
            out_dim=out_dim, hidden_channels=hidden_channels,
            n_update_layers=n_update_layers, n_atom_types=n_atom_types,
            concat_hidden=concat_hidden, weight_tying=weight_tying,
            n_edge_types=n_edge_types, with_wle=with_wle,
            update_kwargs=update_kwargs, readout_kwargs=readout_kwargs,
            n_wle_types=n_wle_types)


class NFP_CWLE(CWLEGraphConvModel):
    """NFP with Combined WLE embedding."""

    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,
                 max_degree=6, n_atom_types=MAX_ATOMIC_NUM,
                 concat_hidden=False, with_wle=True,
                 n_wle_types=MAX_WLE_NUM):
        update_kwargs = {'max_degree': max_degree}
        super(NFP_CWLE, self).__init__(
            update_layer=NFPUpdate, readout_layer=NFPReadout,
            out_dim=out_dim, hidden_channels=hidden_channels,
            n_update_layers=n_update_layers, n_atom_types=n_atom_types,
            concat_hidden=concat_hidden, sum_hidden=True, with_wle=with_wle,
            update_kwargs=update_kwargs, n_wle_types=n_wle_types)
        self.max_degree = max_degree
        self.n_degree_type = max_degree + 1
        self.ch0 = hidden_channels

    def preprocess_addtional_kwargs(self, *args, **kwargs):
        """Compute degree-condition masks required by NFPUpdate.

        Returns:
            dict with key ``deg_conds``: one boolean mask of shape
            (minibatch, atom, ch) per degree in [1, max_degree + 1].
        """
        atom_array, adj = args[:2]
        bs, num_node = atom_array.shape[:2]
        # For NFP Update: node degree = sum of adjacency entries.
        if adj.ndim == 4:
            degree_mat = self.xp.sum(to_array(adj), axis=(1, 2))
        elif adj.ndim == 3:
            degree_mat = self.xp.sum(to_array(adj), axis=1)
        else:
            # BUGFIX: the original format string had no '{}' placeholder,
            # so adj.shape was silently dropped from the message.
            raise ValueError('Unexpected value adj {}'.format(adj.shape))
        # deg_conds: (minibatch, atom, ch)
        deg_conds = [self.xp.broadcast_to(
            ((degree_mat - degree) == 0)[:, :, None],
            (bs, num_node, self.ch0))
            for degree in range(1, self.n_degree_type + 1)]
        return {'deg_conds': deg_conds}


class RSGCN_CWLE(CWLEGraphConvModel):
    """RSGCN with Combined WLE embedding."""

    def __init__(self, out_dim, hidden_channels=32, n_update_layers=4,
                 n_atom_types=MAX_ATOMIC_NUM, use_batch_norm=False,
                 readout=None, dropout_ratio=0.5, with_wle=True,
                 n_wle_types=MAX_WLE_NUM):
        if readout is None:
            readout = GeneralReadout
        super(RSGCN_CWLE, self).__init__(
            update_layer=RSGCNUpdate, readout_layer=readout,
            out_dim=out_dim, hidden_channels=hidden_channels,
            n_update_layers=n_update_layers, n_atom_types=n_atom_types,
            use_batchnorm=use_batch_norm, activation=functions.relu,
            n_activation=n_update_layers - 1, dropout_ratio=dropout_ratio,
            with_wle=with_wle, n_wle_types=n_wle_types)
class GGNN(chainer.Chain):
    """Gated Graph Neural Networks (GGNN)

    See: Li, Y., Tarlow, D., Brockschmidt, M., & Zemel, R. (2015).\
        Gated graph sequence neural networks. \
        `arXiv:1511.05493 `_

    Args:
        out_dim (int): dimension of output feature vector
        hidden_channels (int): dimension of feature vector for each node
        n_update_layers (int): number of layers
        n_atom_types (int): number of types of atoms
        concat_hidden (bool): If set to True, readout is executed in
            each layer and the result is concatenated
        weight_tying (bool): enable weight_tying or not
        activation (~chainer.Function or ~chainer.FunctionNode):
            activate function
        n_edge_types (int): number of edge type.
            Defaults to 4 for single, double, triple and aromatic bond.
    """

    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,
                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,
                 weight_tying=True, activation=functions.identity,
                 n_edge_types=4):
        super(GGNN, self).__init__()
        n_readout_layer = n_update_layers if concat_hidden else 1
        # With weight tying a single update layer is reused every step.
        n_message_layer = 1 if weight_tying else n_update_layers
        with self.init_scope():
            # Update
            self.embed = EmbedAtomID(
                out_size=hidden_channels, in_size=n_atom_types)
            self.update_layers = chainer.ChainList(*[GGNNUpdate(
                hidden_channels=hidden_channels, n_edge_types=n_edge_types)
                for _ in range(n_message_layer)])
            # Readout
            self.readout_layers = chainer.ChainList(*[GGNNReadout(
                out_dim=out_dim, in_channels=hidden_channels * 2,
                activation=activation, activation_agg=activation)
                for _ in range(n_readout_layer)])
        self.out_dim = out_dim
        self.hidden_channels = hidden_channels
        self.n_update_layers = n_update_layers
        self.n_edge_types = n_edge_types
        self.activation = activation
        self.concat_hidden = concat_hidden
        self.weight_tying = weight_tying

    def __call__(self, atom_array, adj, is_real_node=None):
        """Forward propagation

        Args:
            atom_array (numpy.ndarray): minibatch of molecular which is
                represented with atom IDs (representing C, O, S, ...)
                `atom_array[mol_index, atom_index]` represents
                `mol_index`-th molecule's `atom_index`-th atomic number
            adj (numpy.ndarray): minibatch of adjancency matrix with
                edge-type information
            is_real_node (numpy.ndarray): 2-dim array
                (minibatch, num_nodes). 1 for real node, 0 for virtual node.
                If `None`, all node is considered as real node.

        Returns:
            ~chainer.Variable: minibatch of fingerprint
        """
        # reset state (the GRU inside each update layer is stateful)
        self.reset_state()
        if atom_array.dtype == self.xp.int32:
            h = self.embed(atom_array)  # (minibatch, max_num_atoms)
        else:
            h = atom_array
        # Keep the initial embedding for readouts that consume (h, h0).
        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)
        g_list = []
        for step in range(self.n_update_layers):
            message_layer_index = 0 if self.weight_tying else step
            h = self.update_layers[message_layer_index](h, adj)
            if self.concat_hidden:
                g = self.readout_layers[step](h, h0, is_real_node)
                g_list.append(g)

        if self.concat_hidden:
            return functions.concat(g_list, axis=1)
        else:
            g = self.readout_layers[0](h, h0, is_real_node)
            return g

    def reset_state(self):
        # Idiom fix: plain loop instead of a list comprehension executed
        # purely for its side effects.
        for update_layer in self.update_layers:
            update_layer.reset_state()


class SparseGGNN(GGNN):
    """GGNN model for sparse matrix inputs.

    The constructor of this model is the same with that of GGNN.
    See the documentation of GGNN for the detail.
    """

    def __init__(self, *args, **kwargs):
        super(SparseGGNN, self).__init__(*args, **kwargs)

    def __call__(self, atom_array, data, row, col, edge_type,
                 is_real_node=None):
        """Forward propagation

        Args:
            atom_array (numpy.ndarray): minibatch of molecular which is
                represented with atom IDs (representing C, O, S, ...)
                `atom_array[mol_index, atom_index]` represents
                `mol_index`-th molecule's `atom_index`-th atomic number
            data (numpy.ndarray): the entries of the batched sparse matrix.
            row (numpy.ndarray): the row indices of the matrix entries.
            col (numpy.ndarray): the column indices of the matrix entries.
            edge_type (numpy.ndarray): edge type information of edges.
            is_real_node (numpy.ndarray): 2-dim array
                (minibatch, num_nodes). 1 for real node, 0 for virtual node.
                If `None`, all node is considered as real node.

        Returns:
            ~chainer.Variable: minibatch of fingerprint
        """
        # Densify the COO input into the (mb, edge_type, node, node) layout
        # the dense GGNN forward expects, then delegate.
        num_nodes = atom_array.shape[1]
        adj = convert_sparse_with_edge_type(
            data, row, col, num_nodes, edge_type, self.n_edge_types)
        return super(SparseGGNN, self).__call__(
            atom_array, adj, is_real_node=is_real_node)
""" def __init__(self, out_dim, node_embedding=False, hidden_channels=16, out_channels=None, n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM, dropout_ratio=0.5, concat_hidden=False, weight_tying=False, activation=functions.identity, n_edge_types=4): super(GIN, self).__init__() n_message_layer = 1 if weight_tying else n_update_layers n_readout_layer = n_update_layers if concat_hidden else 1 with self.init_scope(): # embedding self.embed = EmbedAtomID(out_size=hidden_channels, in_size=n_atom_types) self.first_mlp = GINUpdate( hidden_channels=hidden_channels, dropout_ratio=dropout_ratio, out_channels=hidden_channels).graph_mlp # two non-linear MLP part if out_channels is None: out_channels = hidden_channels self.update_layers = chainer.ChainList(*[GINUpdate( hidden_channels=hidden_channels, dropout_ratio=dropout_ratio, out_channels=(out_channels if i == n_message_layer - 1 else hidden_channels)) for i in range(n_message_layer)]) # Readout self.readout_layers = chainer.ChainList(*[GGNNReadout( out_dim=out_dim, in_channels=hidden_channels * 2, activation=activation, activation_agg=activation) for _ in range(n_readout_layer)]) # end with self.node_embedding = node_embedding self.out_dim = out_dim self.hidden_channels = hidden_channels self.n_update_layers = n_update_layers self.n_message_layers = n_message_layer self.n_readout_layer = n_readout_layer self.dropout_ratio = dropout_ratio self.concat_hidden = concat_hidden self.weight_tying = weight_tying self.n_edge_types = n_edge_types def __call__(self, atom_array, adj, is_real_node=None): """forward propagation Args: atom_array (numpy.ndarray): mol-minibatch by node numpy.ndarray, minibatch of molecular which is represented with atom IDs (representing C, O, S, ...) 
atom_array[m, i] = a represents m-th molecule's i-th node is value a (atomic number) adj (numpy.ndarray): mol-minibatch by relation-types by node by node numpy.ndarray, minibatch of multple relational adjancency matrix with edge-type information adj[i, j] = b represents m-th molecule's edge from node i to node j has value b is_real_node: Returns: numpy.ndarray: final molecule representation """ if atom_array.dtype == self.xp.int32: h = self.embed(atom_array) # (minibatch, max_num_atoms) else: h = atom_array h0 = functions.copy(h, cuda.get_device_from_array(h.data).id) g_list = [] for step in range(self.n_update_layers): message_layer_index = 0 if self.weight_tying else step h = self.update_layers[message_layer_index](h, adj) if step != self.n_message_layers - 1: h = functions.relu(h) if self.concat_hidden: g = self.readout_layers[step](h, h0, is_real_node) g_list.append(g) if self.node_embedding: return h if self.concat_hidden: return functions.concat(g_list, axis=1) else: g = self.readout_layers[0](h, h0, is_real_node) return g class GINSparse(chainer.Chain): """Simple implementation of sparseGraph Isomorphism Network (GIN)""" def __init__(self, out_dim, node_embedding=False, hidden_channels=16, out_channels=None, n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM, dropout_ratio=0.5, concat_hidden=False, weight_tying=False, activation=functions.identity, n_edge_types=4): super(GINSparse, self).__init__() n_message_layer = 1 if weight_tying else n_update_layers n_readout_layer = n_update_layers if concat_hidden else 1 with self.init_scope(): # embedding self.embed = EmbedAtomID(out_size=hidden_channels, in_size=n_atom_types) self.first_mlp = GINSparseUpdate( hidden_channels=hidden_channels, dropout_ratio=dropout_ratio, out_channels=hidden_channels).mlp # two non-linear MLP part if out_channels is None: out_channels = hidden_channels self.update_layers = chainer.ChainList(*[GINSparseUpdate( hidden_channels=hidden_channels, dropout_ratio=dropout_ratio, 
out_channels=(out_channels if i == n_message_layer - 1 else hidden_channels)) for i in range(n_message_layer)]) # Readout self.readout_layers = chainer.ChainList(*[ScatterGGNNReadout( out_dim=out_dim, in_channels=hidden_channels * 2, activation=activation, activation_agg=activation) for _ in range(n_readout_layer)]) # end with self.node_embedding = node_embedding self.out_dim = out_dim self.hidden_channels = hidden_channels self.n_message_layers = n_message_layer self.n_readout_layer = n_readout_layer self.dropout_ratio = dropout_ratio self.concat_hidden = concat_hidden self.weight_tying = weight_tying self.n_edge_types = n_edge_types def __call__(self, sparse_batch, is_real_node=None): if sparse_batch.x.dtype == self.xp.int32: h = self.embed(sparse_batch.x) # (minibatch, max_num_atoms) else: h = self.first_mlp(sparse_batch.x) h0 = functions.copy(h, cuda.get_device_from_array(h.data).id) g_list = [] for step in range(self.n_message_layers): message_layer_index = 0 if self.weight_tying else step h = self.update_layers[message_layer_index]( h, sparse_batch.edge_index) if step != self.n_message_layers - 1: h = functions.relu(h) if self.concat_hidden: g = self.readout_layers[step](h, h0, is_real_node) g_list.append(g) if self.node_embedding: return h if self.concat_hidden: return functions.concat(g_list, axis=1) else: g = self.readout_layers[0](h, sparse_batch.batch, h0, is_real_node) return g ================================================ FILE: chainer_chemistry/models/gnn_film.py ================================================ import chainer from chainer import cuda from chainer import functions from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID from chainer_chemistry.links.readout.ggnn_readout import GGNNReadout from chainer_chemistry.links.update.gnn_film_update import GNNFiLMUpdate class GNNFiLM(chainer.Chain): """Graph Neural Networks with Feature-wise Linear Modulation (GNN_FiLM) Marc 
Brockschmidt (2019).\ GNN-FiLM: Graph Neural Networks with Feature-wise Linear Modulation \ `arXiv:1906.12192 `_ Args: out_dim (int): dimension of output feature vector hidden_channels (int): dimension of feature vector associated to each atom n_update_layers (int): number of layers n_atom_types (int): number of types of atoms concat_hidden (bool): If set to True, readout is executed in each layer and the result is concatenated weight_tying (bool): enable weight_tying or not activation (~chainer.Function or ~chainer.FunctionNode): activate function n_edge_types (int): number of edge type. Defaults to 5 for single, double, triple, aromatic bond and self-connection. """ def __init__(self, out_dim, hidden_channels=16, n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False, weight_tying=True, activation=functions.identity, n_edge_types=5): super(GNNFiLM, self).__init__() n_readout_layer = n_update_layers if concat_hidden else 1 n_message_layer = 1 if weight_tying else n_update_layers with self.init_scope(): # Update self.embed = EmbedAtomID(out_size=hidden_channels, in_size=n_atom_types) self.update_layers = chainer.ChainList(*[GNNFiLMUpdate( hidden_channels=hidden_channels, n_edge_types=n_edge_types) for _ in range(n_message_layer)]) # Readout # self.readout_layers = chainer.ChainList(*[GeneralReadout( # out_dim=out_dim, hidden_channels=hidden_channels, # activation=activation, activation_agg=activation) # for _ in range(n_readout_layer)]) self.readout_layers = chainer.ChainList(*[GGNNReadout( out_dim=out_dim, in_channels=hidden_channels * 2, activation=activation, activation_agg=activation) for _ in range(n_readout_layer)]) self.out_dim = out_dim self.hidden_channels = hidden_channels self.n_update_layers = n_update_layers self.n_edge_types = n_edge_types self.activation = activation self.concat_hidden = concat_hidden self.weight_tying = weight_tying def __call__(self, atom_array, adj, is_real_node=None): """Forward propagation Args: atom_array 
(numpy.ndarray): minibatch of molecular which is represented with atom IDs (representing C, O, S, ...) `atom_array[mol_index, atom_index]` represents `mol_index`-th molecule's `atom_index`-th atomic number adj (numpy.ndarray): minibatch of adjancency matrix with edge-type information is_real_node (numpy.ndarray): 2-dim array (minibatch, num_nodes). 1 for real node, 0 for virtual node. If `None`, all node is considered as real node. Returns: ~chainer.Variable: minibatch of fingerprint """ # reset state # self.reset_state() if atom_array.dtype == self.xp.int32: h = self.embed(atom_array) # (minibatch, max_num_atoms) else: h = atom_array h0 = functions.copy(h, cuda.get_device_from_array(h.data).id) g_list = [] for step in range(self.n_update_layers): message_layer_index = 0 if self.weight_tying else step h = self.update_layers[message_layer_index](h, adj) if self.concat_hidden: g = self.readout_layers[step](h, h0, is_real_node) g_list.append(g) if self.concat_hidden: return functions.concat(g_list, axis=1) else: g = self.readout_layers[0](h, h0, is_real_node) return g def reset_state(self): [update_layer.reset_state() for update_layer in self.update_layers] ================================================ FILE: chainer_chemistry/models/gwle/__init__.py ================================================ #from chainer_chemistry.models.cwle import cwle from chainer_chemistry.models.gwle import gwle_graph_conv_model from chainer_chemistry.models.gwle import gwle_net from chainer_chemistry.models.gwle.gwle_net import GGNN_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import RelGAT_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import RelGCN_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import GIN_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import NFP_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import RSGCN_GWLE # NOQA ================================================ FILE: 
chainer_chemistry/models/gwle/gwle_graph_conv_model.py
================================================
import chainer
from chainer import cuda
from chainer import links
from chainer import functions

from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID
from chainer_chemistry.links.connection.graph_linear import GraphLinear
from chainer_chemistry.links.normalization.graph_batch_normalization import GraphBatchNormalization  # NOQA
from chainer_chemistry.links.readout.general_readout import GeneralReadout
from chainer_chemistry.config import MAX_ATOMIC_NUM
#from chainer_chemistry.models.gwm.gwm import GWM
from chainer_chemistry.models.relgcn import rescale_adj

# Upper bound on the number of distinct WLE (Weisfeiler-Lehman Embedding)
# labels handled by the label embedding table.
MAX_WLE_NUM = 800


def to_array(x):
    """Convert x into numpy.ndarray or cupy.ndarray"""
    if isinstance(x, chainer.Variable):
        x = x.array
    return x


class GWLEGraphConvModel(chainer.Chain):
    """Unified module of Graph Convolution Model with GWLE

    Note that this module is experimental, all update_layer and readout_layer
    combination is not supported. Please refer `test_gwm_graph_conv_model.py`
    for tested combinations.
    This module might not be maintained in the future.

    Args:
        hidden_channels (int or list): hidden channels for update
        out_dim (int): output dim
        update_layer (chainer.links.Link): class used to build each
            message-passing (update) layer
        readout_layer (chainer.links.Link): class used to build each
            readout layer
        n_update_layers (int or None): number of update layers; required
            when `hidden_channels` is an int
        out_channels (None or lsit): per-layer output channels
            (for RelGAT concat_heads option)
        wle_dim (int): dimension of the WLE label embedding
        n_atom_types (int): number of types of atoms
        n_edge_types (int): number of edge types
        dropout_ratio (float): negative value disables dropout
        with_wle (bool): enabler for Combined NLE
        concat_hidden (bool): concatenate per-layer readouts
        sum_hidden (bool): sum per-layer readouts
        weight_tying (bool): share one update layer across steps
        scale_adj (bool): normalize the adjacency matrix before updates
        activation (callable): applied after each of the first
            `n_activation` update steps
        use_batchnorm (bool): apply GraphBatchNormalization after updates
        n_activation (int or None): number of steps with activation
        update_kwargs (dict or None): extra kwargs for update_layer
        readout_kwargs (dict or None): extra kwargs for readout_layer
        wle_kwargs (dict or None): extra kwargs for the WLE part (unused
            after __init__ defaulting)
        n_wle_types (int): size of the WLE label vocabulary
    """

    def __init__(self, hidden_channels, out_dim, update_layer, readout_layer,
                 n_update_layers=None, out_channels=None, wle_dim=None,
                 n_atom_types=MAX_ATOMIC_NUM, n_edge_types=4,
                 dropout_ratio=-1.0, with_wle=True, concat_hidden=False,
                 sum_hidden=False, weight_tying=False, scale_adj=False,
                 activation=None, use_batchnorm=False, n_activation=None,
                 update_kwargs=None, readout_kwargs=None, wle_kwargs=None,
                 n_wle_types=MAX_WLE_NUM):
        super(GWLEGraphConvModel, self).__init__()
        # General: length of hidden_channels must be n_layers + 1
        if isinstance(hidden_channels, int):
            if n_update_layers is None:
                raise ValueError('n_update_layers is None')
            else:
                hidden_channels = [hidden_channels
                                   for _ in range(n_update_layers + 1)]
        elif isinstance(hidden_channels, list):
            # Infer the layer count from the channel list.
            if out_channels is None:
                n_update_layers = len(hidden_channels) - 1
            else:
                n_update_layers = len(hidden_channels)
        else:
            raise TypeError('Unexpected value for hidden_channels {}'
                            .format(hidden_channels))

        if readout_layer == GeneralReadout and hidden_channels[-1] != out_dim:
            # When use GWM, hidden channels must be same. But GeneralReadout
            # cannot change the dimension. So when use General Readout and GWM,
            # hidden channel and out_dim should be same.
            if with_wle:
                raise ValueError('Unsupported combination.')
            else:
                hidden_channels[-1] = out_dim

        # When use with_gwm, concat_hidden, sum_hidden and weight_tying option,
        # hidden_channels must be same
        if with_wle or concat_hidden or sum_hidden or weight_tying:
            if not all([in_dim == hidden_channels[0]
                        for in_dim in hidden_channels]):
                raise ValueError(
                    'hidden_channels must be same but different {}'
                    .format(hidden_channels))

        if with_wle and wle_dim is None:
            print('[WARNING] wle_dim is None, set to {}'
                  .format(hidden_channels[0]))
            wle_dim = hidden_channels[0]

        if out_channels is None:
            in_channels_list = hidden_channels[:-1]
            out_channels_list = hidden_channels[1:]
        else:
            # For RelGAT concat_heads option
            in_channels_list = hidden_channels
            out_channels_list = out_channels
        assert len(in_channels_list) == n_update_layers
        assert len(out_channels_list) == n_update_layers

        # Shared single layer under weight tying.
        n_use_update_layers = 1 if weight_tying else n_update_layers
        n_readout_layers = n_use_update_layers if concat_hidden or sum_hidden else 1
        n_activation = n_use_update_layers if n_activation is None else n_activation
        if update_kwargs is None:
            update_kwargs = {}
        if readout_kwargs is None:
            readout_kwargs = {}
        if wle_kwargs is None:
            wle_kwargs = {}

        with self.init_scope():
            self.embed = EmbedAtomID(out_size=hidden_channels[0],
                                     in_size=n_atom_types)
            # +1 for label 0
            self.update_layers = chainer.ChainList(
                *[update_layer(in_channels=in_channels_list[i],
                               out_channels=out_channels_list[i],
                               n_edge_types=n_edge_types, **update_kwargs)
                  for i in range(n_use_update_layers)])
            # when use weight_tying option, hidden_channels must be same.
            # So we can use -1 index
            self.readout_layers = chainer.ChainList(
                *[readout_layer(out_dim=out_dim,
                                # in_channels=hidden_channels[-1],
                                in_channels=None,
                                **readout_kwargs)
                  for _ in range(n_readout_layers)])
            if with_wle:
                # Embedding table for WLE labels.
                self.embed_wle = links.EmbedID(out_size=wle_dim,
                                               in_size=n_wle_types)
                # Gates
                self.gate_W1 = GraphLinear(in_size=hidden_channels[0],
                                           out_size=hidden_channels[0])
                self.gate_W2 = GraphLinear(in_size=wle_dim,
                                           out_size=hidden_channels[0])
            if use_batchnorm:
                self.bnorms = chainer.ChainList(
                    *[GraphBatchNormalization(out_channels_list[i])
                      for i in range(n_use_update_layers)])

        self.readout_layer = readout_layer
        self.update_layer = update_layer
        self.weight_tying = weight_tying
        self.with_wle = with_wle
        self.concat_hidden = concat_hidden
        self.sum_hidden = sum_hidden
        self.scale_adj = scale_adj
        self.activation = activation
        self.dropout_ratio = dropout_ratio
        self.use_batchnorm = use_batchnorm
        self.n_activation = n_activation
        self.n_update_layers = n_update_layers
        self.n_edge_types = n_edge_types

    def __call__(self, atom_array, adj, wle_array=None, is_real_node=None):
        self.reset_state()
        if atom_array.dtype == self.xp.int32:
            h = self.embed(atom_array)
        else:
            # TODO: GraphLinear or GraphMLP can be used.
            h = atom_array
        # Initial node features kept for readouts that consume (h, h0).
        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)

        # all Combined NLE processes are done here.
        if self.with_wle:
            h_s = self.embed_wle(wle_array)
            # gated sum
            gate_input = self.gate_W1(h) + self.gate_W2(h_s)
            gate_coefff = functions.sigmoid(gate_input)
            # Convex combination of atom embedding and WLE embedding.
            h = (1.0 - gate_coefff) * h + gate_coefff * h_s

        # Hook for subclasses (e.g. NFP needs degree conditions).
        additional_kwargs = self.preprocess_addtional_kwargs(
            atom_array, adj, wle_array=wle_array, is_real_node=is_real_node)

        if self.scale_adj:
            adj = rescale_adj(adj)

        g_list = []
        for step in range(self.n_update_layers):
            update_layer_index = 0 if self.weight_tying else step
            h = self.update_layers[update_layer_index](
                h=h, adj=adj, **additional_kwargs)
            if self.use_batchnorm:
                h = self.bnorms[update_layer_index](h)
            if self.dropout_ratio > 0.:
                h = functions.dropout(h, ratio=self.dropout_ratio)
            if self.activation is not None and step < self.n_activation:
                h = self.activation(h)
            if self.concat_hidden or self.sum_hidden:
                g = self.readout_layers[step](
                    h=h, h0=h0, is_real_node=is_real_node,
                    **additional_kwargs)
                g_list.append(g)

        if self.concat_hidden:
            return functions.concat(g_list, axis=1)
        else:
            if self.sum_hidden:
                g = functions.sum(functions.stack(g_list), axis=0)
            else:
                g = self.readout_layers[0](
                    h=h, h0=h0, is_real_node=is_real_node)
            return g

    def reset_state(self):
        # Only some update layers (e.g. GRU-based ones) carry state.
        if hasattr(self.update_layers[0], 'reset_state'):
            [update_layer.reset_state()
             for update_layer in self.update_layers]

    def preprocess_addtional_kwargs(self, *args, **kwargs):
        # Overridden by subclasses to feed extra kwargs to update/readout.
        return {}


================================================
FILE: chainer_chemistry/models/gwle/gwle_net.py
================================================
from chainer import functions

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.links import GINUpdate, NFPReadout, NFPUpdate, \
    RSGCNUpdate, GeneralReadout  # NOQA
from chainer_chemistry.links.readout.ggnn_readout import GGNNReadout
from chainer_chemistry.links.update.ggnn_update import GGNNUpdate
from chainer_chemistry.links.update.relgat_update import RelGATUpdate
from chainer_chemistry.links.update.relgcn_update \
    import RelGCNUpdate, RelGCNSparseUpdate
from \
chainer_chemistry.models.gwle.gwle_graph_conv_model import GWLEGraphConvModel # NOQA from chainer_chemistry.models.cwle.cwle_graph_conv_model import to_array from chainer_chemistry.models.cwle.cwle_graph_conv_model import MAX_WLE_NUM class GGNN_GWLE(GWLEGraphConvModel): def __init__(self, out_dim, hidden_channels=16, n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False, weight_tying=True, activation=functions.identity, n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM): readout_kwargs = {'activation': activation, 'activation_agg': activation} super(GGNN_GWLE, self).__init__( update_layer=GGNNUpdate, readout_layer=GGNNReadout, out_dim=out_dim, hidden_channels=hidden_channels, n_update_layers=n_update_layers, n_atom_types=n_atom_types, concat_hidden=concat_hidden, weight_tying=weight_tying, n_edge_types=n_edge_types, with_wle=with_wle, readout_kwargs=readout_kwargs, n_wle_types=n_wle_types) class RelGCN_GWLE(GWLEGraphConvModel): def __init__(self, out_dim, hidden_channels=16, n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False, weight_tying=True, activation=functions.identity, n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM): readout_kwargs = {'activation': activation, 'activation_agg': activation} super(RelGCN_GWLE, self).__init__( update_layer=RelGCNUpdate, readout_layer=GGNNReadout, out_dim=out_dim, hidden_channels=hidden_channels, n_update_layers=n_update_layers, n_atom_types=n_atom_types, concat_hidden=concat_hidden, weight_tying=weight_tying, n_edge_types=n_edge_types, with_wle=with_wle, readout_kwargs=readout_kwargs, n_wle_types=n_wle_types) class RelGAT_GWLE(GWLEGraphConvModel): def __init__(self, out_dim, hidden_channels=16, n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False, weight_tying=True, activation=functions.identity, n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM): readout_kwargs = {'activation': activation, 'activation_agg': activation} super(RelGAT_GWLE, self).__init__( 
update_layer=RelGATUpdate, readout_layer=GGNNReadout, out_dim=out_dim, hidden_channels=hidden_channels, n_update_layers=n_update_layers, n_atom_types=n_atom_types, concat_hidden=concat_hidden, weight_tying=weight_tying, n_edge_types=n_edge_types, with_wle=with_wle, readout_kwargs=readout_kwargs, n_wle_types=n_wle_types) class GIN_GWLE(GWLEGraphConvModel): def __init__(self, out_dim, hidden_channels=16, n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM, dropout_ratio=0.5, concat_hidden=False, weight_tying=True, activation=functions.identity, n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM): update_kwargs = {'dropout_ratio': dropout_ratio} readout_kwargs = {'activation': activation, 'activation_agg': activation} super(GIN_GWLE, self).__init__( update_layer=GINUpdate, readout_layer=GGNNReadout, out_dim=out_dim, hidden_channels=hidden_channels, n_update_layers=n_update_layers, n_atom_types=n_atom_types, concat_hidden=concat_hidden, weight_tying=weight_tying, n_edge_types=n_edge_types, with_wle=with_wle, update_kwargs=update_kwargs, readout_kwargs=readout_kwargs, n_wle_types=n_wle_types) class NFP_GWLE(GWLEGraphConvModel): def __init__(self, out_dim, hidden_channels=16, n_update_layers=4, max_degree=6, n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False, with_wle=True, n_wle_types=MAX_WLE_NUM): update_kwargs = {'max_degree': max_degree} super(NFP_GWLE, self).__init__( update_layer=NFPUpdate, readout_layer=NFPReadout, out_dim=out_dim, hidden_channels=hidden_channels, n_update_layers=n_update_layers, n_atom_types=n_atom_types, concat_hidden=concat_hidden, sum_hidden=True, with_wle=with_wle, update_kwargs=update_kwargs, n_wle_types=n_wle_types) self.max_degree = max_degree self.n_degree_type = max_degree + 1 self.ch0 = hidden_channels def preprocess_addtional_kwargs(self, *args, **kwargs): atom_array, adj = args[:2] bs, num_node = atom_array.shape[:2] # For NFP Update if adj.ndim == 4: degree_mat = self.xp.sum(to_array(adj), axis=(1, 2)) elif adj.ndim == 3: degree_mat = 
self.xp.sum(to_array(adj), axis=1) else: raise ValueError('Unexpected value adj ' .format(adj.shape)) # deg_conds: (minibatch, atom, ch) deg_conds = [self.xp.broadcast_to( ((degree_mat - degree) == 0)[:, :, None], (bs, num_node, self.ch0)) for degree in range(1, self.n_degree_type + 1)] return {'deg_conds': deg_conds} class RSGCN_GWLE(GWLEGraphConvModel): def __init__(self, out_dim, hidden_channels=32, n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM, use_batch_norm=False, readout=None, dropout_ratio=0.5, with_wle=True, n_wle_types=MAX_WLE_NUM): if readout is None: readout = GeneralReadout super(RSGCN_GWLE, self).__init__( update_layer=RSGCNUpdate, readout_layer=readout, out_dim=out_dim, hidden_channels=hidden_channels, n_update_layers=n_update_layers, n_atom_types=n_atom_types, use_batchnorm=use_batch_norm, activation=functions.relu, n_activation=n_update_layers-1, dropout_ratio=dropout_ratio, with_wle=with_wle, n_wle_types=n_wle_types) ================================================ FILE: chainer_chemistry/models/gwm/__init__.py ================================================ from chainer_chemistry.models.gwm import gwm # NOQA from chainer_chemistry.models.gwm import gwm_graph_conv_model # NOQA from chainer_chemistry.models.gwm import gwm_net # NOQA from chainer_chemistry.models.gwm.gwm import GWM # NOQA from chainer_chemistry.models.gwm.gwm_graph_conv_model import GWMGraphConvModel # NOQA from chainer_chemistry.models.gwm.gwm_net import GGNN_GWM # NOQA from chainer_chemistry.models.gwm.gwm_net import GIN_GWM # NOQA from chainer_chemistry.models.gwm.gwm_net import NFP_GWM # NOQA from chainer_chemistry.models.gwm.gwm_net import RSGCN_GWM # NOQA ================================================ FILE: chainer_chemistry/models/gwm/gwm.py ================================================ import chainer from chainer import functions from chainer import links from chainer_chemistry.links import GraphLinear class WarpGateUnit(chainer.Chain): """WarpGateUnit It computes 
    gated-sum mixing `merged` feature from normal node feature `h` and super
    node feature `g`, See Section "3.4 Warp Gate" of the paper.

    Args:
        output_type (str): supported type as below.
            graph: gate over per-node (graph) features, uses GraphLinear
            super: gate over the super-node feature, uses a plain Linear
        hidden_dim (int): hidden dim
        dropout_ratio (float): negative value indicates to not apply dropout.
        activation (callable): gate non-linearity (sigmoid by default)
    """

    def __init__(self, output_type='graph', hidden_dim=16,
                 dropout_ratio=-1, activation=functions.sigmoid):
        super(WarpGateUnit, self).__init__()
        # Pick the linear flavour matching the feature layout:
        # GraphLinear for (mb, node, ch), links.Linear for (mb, ch).
        if output_type == 'graph':
            LinearLink = GraphLinear
        elif output_type == 'super':
            LinearLink = links.Linear
        else:
            raise ValueError(
                'output_type = {} is unexpected. graph or super is supported.'
                .format(output_type))
        with self.init_scope():
            self.H = LinearLink(in_size=hidden_dim, out_size=hidden_dim)
            self.G = LinearLink(in_size=hidden_dim, out_size=hidden_dim)
        self.hidden_dim = hidden_dim
        self.dropout_ratio = dropout_ratio
        self.output_type = output_type
        self.activation = activation

    def __call__(self, h, g):
        # TODO(nakago): more efficient computation. Maybe we can calculate
        # self.G(g) as Linear layer followed by broadcast to each atom.
        z = self.H(h) + self.G(g)
        if self.dropout_ratio > 0.0:
            z = functions.dropout(z, ratio=self.dropout_ratio)
        z = self.activation(z)
        # Gated (convex) combination of the two inputs.
        merged = (1 - z) * h + z * g
        return merged


class SuperNodeTransmitterUnit(chainer.Chain):
    """SuperNodeTransmitterUnit

    It calculates message from super node to normal node.

    Args:
        hidden_dim_super (int): dimension of the super-node feature
        hidden_dim (int): hidden dim for the per-node message
        dropout_ratio (float): negative value indicates to not apply dropout.
    """

    def __init__(self, hidden_dim_super=16, hidden_dim=16, dropout_ratio=-1):
        super(SuperNodeTransmitterUnit, self).__init__()
        with self.init_scope():
            self.F_super = links.Linear(in_size=hidden_dim_super,
                                        out_size=hidden_dim)
        self.hidden_dim = hidden_dim
        self.hidden_dim_super = hidden_dim_super
        self.dropout_ratio = dropout_ratio

    def __call__(self, g, n_nodes):
        """main calculation

        Args:
            g: super node feature. shape (bs, hidden_dim_super)
            n_nodes (int): number of nodes

        Returns:
            g_trans: super --> original transmission
        """
        mb = len(g)
        # for local updates
        g_trans = self.F_super(g)
        # g_trans.shape == (mb, self.hidden_dim)
        g_trans = functions.tanh(g_trans)
        # g_trans.shape == (mb, 1, self.hidden_dim)
        g_trans = functions.expand_dims(g_trans, 1)
        # Broadcast the same message to every node.
        # g_trans.shape == (mb, atom, self.hidden_dim)
        g_trans = functions.broadcast_to(g_trans,
                                         (mb, n_nodes, self.hidden_dim))
        return g_trans


class GraphTransmitterUnit(chainer.Chain):
    """GraphTransmitterUnit

    It calculates message from normal node to super node.

    Args:
        hidden_dim_super (int): dimension of the super-node feature
        hidden_dim (int): dimension of per-node features
        n_heads (int): number of attention heads
        dropout_ratio (float): negative value disables dropout on attention
        activation (callable): output non-linearity (tanh by default)
    """

    def __init__(self, hidden_dim_super=16, hidden_dim=16, n_heads=8,
                 dropout_ratio=-1, activation=functions.tanh):
        super(GraphTransmitterUnit, self).__init__()
        hdim_n = hidden_dim * n_heads
        with self.init_scope():
            # Per-head value projection of node features.
            self.V_super = GraphLinear(hidden_dim, hdim_n)
            # Compresses the concatenated heads back to hidden_dim_super.
            self.W_super = links.Linear(hdim_n, hidden_dim_super)
            # Per-head key projection used for the attention logits.
            self.B = GraphLinear(hidden_dim, n_heads * hidden_dim_super)
        self.hidden_dim = hidden_dim
        self.hidden_dim_super = hidden_dim_super
        self.dropout_ratio = dropout_ratio
        self.n_heads = n_heads
        self.activation = activation

    def __call__(self, h, g, step=0):
        # Multi-head attention of the super node over all graph nodes.
        mb, atom, ch = h.shape
        h_j = self.V_super(h)
        h_j = functions.reshape(h_j, (mb, atom, self.n_heads, ch))
        # h_j (mb, atom, self.n_heads, ch)
        h_j = functions.transpose(h_j, (0, 2, 1, 3))
        # expand h_super
        # g_extend.shape (mb, 1, self.hidden_dim_super)
        g_extend = functions.expand_dims(g, 1)
        # g_extend.shape == (mb, self.n_heads, self.hidden_dim_super)
        g_extend = functions.broadcast_to(
            g_extend, (mb, self.n_heads, self.hidden_dim_super))
        # g_extend.shape == (mb, self.n_heads, 1, self.hidden_dim_super)
        g_extend = functions.expand_dims(g_extend, 2)

        # update for attention-message B h_i
        # h (mb, atom, ch)
        # Bh_i.shape == (mb, atom, self.n_heads * self.hidden_dim_super)
        Bh_i = self.B(h)
        # Bh_i.shape == (mb, atom, num_head, ch)
        Bh_i = functions.reshape(Bh_i,
                                 (mb, atom, self.n_heads,
                                  self.hidden_dim_super))
        # Bh_i.shape == (mb, num_head, atom, ch)
        Bh_i = functions.transpose(Bh_i, [0, 2, 1, 3])
        # take g^{T} * B * h_i
        # indexed by i
        # mb, self.n_heads atom(i)
        # b_hi.shape == (mb, self.n_heads, 1, atom)
        # This will reduce the last hidden_dim_super axis
        b_hi = functions.matmul(g_extend, Bh_i, transb=True)
        # softmax. sum/normalize over the last axis.
        # mb, self.n_heads, atom(i-normalized)
        # attention_i.shape == (mb, self.n_heads, 1, atom)
        attention_i = functions.softmax(b_hi, axis=3)
        if self.dropout_ratio > 0.0:
            attention_i = functions.dropout(attention_i,
                                            ratio=self.dropout_ratio)
        # element-wise product --> sum over i
        # mb, num_head, hidden_dim_super
        # attention_sum.shape == (mb, self.n_heads, 1, ch)
        attention_sum = functions.matmul(attention_i, h_j)
        # attention_sum.shape == (mb, self.n_heads * ch)
        attention_sum = functions.reshape(attention_sum,
                                          (mb, self.n_heads * ch))
        # weighting h for different heads
        # h_trans.shape == (mb, self.n_heads * ch)
        # compress heads
        h_trans = self.W_super(attention_sum)
        # h_trans.shape == (mb, self.hidden_dim_super)
        h_trans = self.activation(h_trans)
        return h_trans


class GWM(chainer.Chain):
    """Graph Warping Module (GWM)

    Module for a single layer update.

    See: Ishiguro, Maeda, and Koyama. "Graph Warp Module: an Auxiliary Module
    for Boosting the Power of Graph NeuralNetworks", arXiv, 2019.

    Args:
        hidden_dim (int): dimension of hidden vectors associated to
            each atom (local node)
        hidden_dim_super (int); dimension of super-node hidden vector
        n_layers (int): number of layers
        n_heads (int): number of heads
        dropout_ratio (float): dropout ratio.
            Negative value indicates to not apply dropout.
        tying_flag (bool): enable if you want to share params across layers.
        activation (callable): non-linearity for the super-node update
        wgu_activation (callable): gate non-linearity of the Warp Gate units
        gtu_activation (callable): output non-linearity of the
            GraphTransmitterUnit
    """

    def __init__(self, hidden_dim=16, hidden_dim_super=16, n_layers=4,
                 n_heads=8, dropout_ratio=-1, tying_flag=False,
                 activation=functions.relu, wgu_activation=functions.sigmoid,
                 gtu_activation=functions.tanh):
        super(GWM, self).__init__()
        # With parameter tying a single sub-unit is shared by all layers.
        n_use_layers = 1 if tying_flag else n_layers
        with self.init_scope():
            self.update_super = chainer.ChainList(
                *[links.Linear(in_size=hidden_dim_super,
                               out_size=hidden_dim_super)
                  for _ in range(n_use_layers)]
            )
            # for Transmitter unit
            self.super_transmitter = chainer.ChainList(
                *[SuperNodeTransmitterUnit(
                    hidden_dim=hidden_dim, hidden_dim_super=hidden_dim_super,
                    dropout_ratio=dropout_ratio)
                  for _ in range(n_use_layers)])
            self.graph_transmitter = chainer.ChainList(
                *[GraphTransmitterUnit(
                    hidden_dim=hidden_dim, hidden_dim_super=hidden_dim_super,
                    n_heads=n_heads, dropout_ratio=dropout_ratio,
                    activation=gtu_activation)
                  for _ in range(n_use_layers)])
            # for Warp Gate unit
            self.wgu_local = chainer.ChainList(
                *[WarpGateUnit(
                    output_type='graph', hidden_dim=hidden_dim,
                    dropout_ratio=dropout_ratio, activation=wgu_activation)
                  for _ in range(n_use_layers)])
            self.wgu_super = chainer.ChainList(
                *[WarpGateUnit(
                    output_type='super', hidden_dim=hidden_dim_super,
                    dropout_ratio=dropout_ratio, activation=wgu_activation)
                  for _ in range(n_use_layers)])
            # Weight tying: not layer-wise but recurrent through layers
            self.GRU_local = links.GRU(in_size=hidden_dim,
                                       out_size=hidden_dim)
            self.GRU_super = links.GRU(in_size=hidden_dim_super,
                                       out_size=hidden_dim_super)
        # end init_scope-with
        self.hidden_dim = hidden_dim
        self.hidden_dim_super = hidden_dim_super
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.dropout_ratio = dropout_ratio
        self.tying_flag = tying_flag
        self.activation = activation
        self.wgu_activation = wgu_activation

    def __call__(self, h, h_new, g, step=0):
        """main calculation

        Note: Do not forget to reset GRU for each batch.

        Args:
            h: Minibatch by num_nodes by hidden_dim numpy array.
                current local node hidden states as input of the vanilla GNN
            h_new: Minibatch by num_nodes by hidden_dim numpy array.
                updated local node hidden states as output from the
                vanilla GNN
            g: Minibatch by hidden_dim_super numpy array.
                current super node hidden state
            step (int): index selecting which (possibly tied) sub-unit
                to apply

        Returns:
            Updated h and g
        """
        # (minibatch, atom, ch)
        mb, n_nodes, ch = h.shape

        # non linear update of the super node
        g_new = self.activation(self.update_super[step](g))

        # Transmitter unit: inter-module message passing
        # original --> super transmission
        h_trans = self.graph_transmitter[step](h, g)
        # g_trans: super --> original transmission
        g_trans = self.super_transmitter[step](g, n_nodes)

        # Warp Gate unit: mix each module's own update with the message
        # received from the other module.
        merged_h = self.wgu_local[step](h_new, g_trans)
        merged_g = self.wgu_super[step](h_trans, g_new)

        # Self recurrent: GRUs shared across all layers (recurrent tying).
        out_h = functions.reshape(merged_h, (mb * n_nodes, self.hidden_dim))
        out_h = self.GRU_local(out_h)
        out_h = functions.reshape(out_h, (mb, n_nodes, self.hidden_dim))
        out_g = self.GRU_super(merged_g)
        return out_h, out_g

    def reset_state(self):
        # Must be called once per minibatch; clears both GRUs' states.
        self.GRU_local.reset_state()
        self.GRU_super.reset_state()


================================================
FILE: chainer_chemistry/models/gwm/gwm_graph_conv_model.py
================================================
import chainer
from chainer import cuda
from chainer import functions
from chainer import links

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID
from chainer_chemistry.links.normalization.graph_batch_normalization import GraphBatchNormalization  # NOQA
from chainer_chemistry.links.readout.general_readout import GeneralReadout
from chainer_chemistry.models.gwm.gwm import GWM
from chainer_chemistry.models.relgcn import rescale_adj


def to_array(x):
    """Convert x into numpy.ndarray or cupy.ndarray"""
    if isinstance(x, chainer.Variable):
        x = x.array
    return x


class GWMGraphConvModel(chainer.Chain):
    """Unified module of Graph Convolution Model with GWM

    Note that this module is experimental, all update_layer and readout_layer
    combination is not supported. Please refer `test_gwm_graph_conv_model.py`
    for tested combinations.
    This module might not be maintained in the future.

    Args:
        hidden_channels (int or list): hidden channels for update
        out_dim (int): output dim
        update_layer (chainer.links.Link): class used to build each
            message-passing (update) layer
        readout_layer (chainer.links.Link): class used to build each
            readout layer
        n_update_layers (int or None): number of update layers; required
            when `hidden_channels` is an int
        out_channels (None or lsit): per-layer output channels
            (for RelGAT concat_heads option)
        super_node_dim (int): dimension of the GWM super-node feature
        n_atom_types (int): number of types of atoms
        n_edge_types (int): number of edge types
        dropout_ratio (float): negative value disables dropout
        with_gwm (bool): enable the Graph Warp Module
        concat_hidden (bool): concatenate per-layer readouts
        sum_hidden (bool): sum per-layer readouts
        weight_tying (bool): share one update layer across steps
        scale_adj (bool): normalize the adjacency matrix before updates
        activation (callable): applied after each of the first
            `n_activation` update steps
        use_batchnorm (bool): apply GraphBatchNormalization after updates
        n_activation (int or None): number of steps with activation
        update_kwargs (dict or None): extra kwargs for update_layer
        readout_kwargs (dict or None): extra kwargs for readout_layer
        gwm_kwargs (dict or None): extra kwargs for GWM
    """

    def __init__(self, hidden_channels, out_dim, update_layer, readout_layer,
                 n_update_layers=None, out_channels=None, super_node_dim=None,
                 n_atom_types=MAX_ATOMIC_NUM, n_edge_types=4,
                 dropout_ratio=-1.0, with_gwm=True, concat_hidden=False,
                 sum_hidden=False, weight_tying=False, scale_adj=False,
                 activation=None, use_batchnorm=False, n_activation=None,
                 update_kwargs=None, readout_kwargs=None, gwm_kwargs=None):
        super(GWMGraphConvModel, self).__init__()
        # General: length of hidden_channels must be n_layers + 1
        if isinstance(hidden_channels, int):
            if n_update_layers is None:
                raise ValueError('n_update_layers is None')
            else:
                hidden_channels = [hidden_channels
                                   for _ in range(n_update_layers + 1)]
        elif isinstance(hidden_channels, list):
            # Infer the layer count from the channel list.
            if out_channels is None:
                n_update_layers = len(hidden_channels) - 1
            else:
                n_update_layers = len(hidden_channels)
        else:
            raise TypeError('Unexpected value for hidden_channels {}'
                            .format(hidden_channels))

        if readout_layer == GeneralReadout and hidden_channels[-1] != out_dim:
            # When use GWM, hidden channels must be same. But GeneralReadout
            # cannot change the dimension. So when use General Readout and GWM,
            # hidden channel and out_dim should be same.
            if with_gwm:
                raise ValueError('Unsupported combination.')
            else:
                hidden_channels[-1] = out_dim

        # When use with_gwm, concat_hidden, sum_hidden and weight_tying option,
        # hidden_channels must be same
        if with_gwm or concat_hidden or sum_hidden or weight_tying:
            if not all([in_dim == hidden_channels[0]
                        for in_dim in hidden_channels]):
                raise ValueError(
                    'hidden_channels must be same but different {}'
                    .format(hidden_channels))

        if with_gwm and super_node_dim is None:
            print('[WARNING] super_node_dim is None, set to {}'
                  .format(hidden_channels[0]))
            super_node_dim = hidden_channels[0]

        if out_channels is None:
            in_channels_list = hidden_channels[:-1]
            out_channels_list = hidden_channels[1:]
        else:
            # For RelGAT concat_heads option
            in_channels_list = hidden_channels
            out_channels_list = out_channels
        assert len(in_channels_list) == n_update_layers
        assert len(out_channels_list) == n_update_layers

        # Shared single layer under weight tying.
        n_use_update_layers = 1 if weight_tying else n_update_layers
        n_readout_layers = n_use_update_layers if concat_hidden or sum_hidden else 1  # NOQA
        n_activation = n_use_update_layers if n_activation is None else n_activation  # NOQA

        if update_kwargs is None:
            update_kwargs = {}
        if readout_kwargs is None:
            readout_kwargs = {}
        if gwm_kwargs is None:
            gwm_kwargs = {}

        with self.init_scope():
            self.embed = EmbedAtomID(out_size=hidden_channels[0],
                                     in_size=n_atom_types)
            self.update_layers = chainer.ChainList(
                *[update_layer(in_channels=in_channels_list[i],
                               out_channels=out_channels_list[i],
                               n_edge_types=n_edge_types, **update_kwargs)
                  for i in range(n_use_update_layers)])
            # when use weight_tying option, hidden_channels must be same.
            # So we can use -1 index
            self.readout_layers = chainer.ChainList(
                *[readout_layer(out_dim=out_dim,
                                # in_channels=hidden_channels[-1],
                                in_channels=None,
                                **readout_kwargs)
                  for _ in range(n_readout_layers)])
            if with_gwm:
                self.gwm = GWM(hidden_dim=hidden_channels[0],
                               hidden_dim_super=super_node_dim,
                               n_layers=n_use_update_layers, **gwm_kwargs)
                self.embed_super = links.Linear(None,
                                                out_size=super_node_dim)
                self.linear_for_concat_super = links.Linear(
                    in_size=None, out_size=out_dim)
            if use_batchnorm:
                self.bnorms = chainer.ChainList(
                    *[GraphBatchNormalization(out_channels_list[i])
                      for i in range(n_use_update_layers)])

        self.readout_layer = readout_layer
        self.update_layer = update_layer
        self.weight_tying = weight_tying
        self.with_gwm = with_gwm
        self.concat_hidden = concat_hidden
        self.sum_hidden = sum_hidden
        self.scale_adj = scale_adj
        self.activation = activation
        self.dropout_ratio = dropout_ratio
        self.use_batchnorm = use_batchnorm
        self.n_activation = n_activation
        self.n_update_layers = n_update_layers
        self.n_edge_types = n_edge_types

    def __call__(self, atom_array, adj, super_node=None, is_real_node=None):
        self.reset_state()
        if atom_array.dtype == self.xp.int32:
            h = self.embed(atom_array)
        else:
            # TODO(nakago): GraphLinear or GraphMLP can be used.
            h = atom_array
        # Initial node features kept for readouts that consume (h, h0).
        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)

        if self.with_gwm:
            h_s = self.embed_super(super_node)

        # Hook for subclasses (e.g. NFP needs degree conditions).
        additional_kwargs = self.preprocess_addtional_kwargs(
            atom_array, adj, super_node=super_node,
            is_real_node=is_real_node)

        if self.scale_adj:
            adj = rescale_adj(adj)

        g_list = []
        for step in range(self.n_update_layers):
            update_layer_index = 0 if self.weight_tying else step
            h_new = self.update_layers[update_layer_index](
                h=h, adj=adj, **additional_kwargs)
            if self.with_gwm:
                # GWM merges the GNN update with the super-node message.
                h_new, h_s = self.gwm(h, h_new, h_s, update_layer_index)
            h = h_new
            if self.use_batchnorm:
                h = self.bnorms[update_layer_index](h)
            if self.dropout_ratio > 0.:
                h = functions.dropout(h, ratio=self.dropout_ratio)
            if self.activation is not None and step < self.n_activation:
                h = self.activation(h)
            if self.concat_hidden or self.sum_hidden:
                g = self.readout_layers[step](
                    h=h, h0=h0, is_real_node=is_real_node,
                    **additional_kwargs)
                g_list.append(g)

        if self.concat_hidden:
            return functions.concat(g_list, axis=1)
        else:
            if self.sum_hidden:
                g = functions.sum(functions.stack(g_list), axis=0)
            else:
                g = self.readout_layers[0](
                    h=h, h0=h0, is_real_node=is_real_node)
            if self.with_gwm:
                # Append the final super-node state and project to out_dim.
                g = functions.concat((g, h_s), axis=1)
                g = functions.relu(self.linear_for_concat_super(g))
            return g

    def reset_state(self):
        # Only some update layers (e.g. GRU-based ones) carry state.
        if hasattr(self.update_layers[0], 'reset_state'):
            [update_layer.reset_state()
             for update_layer in self.update_layers]
        if self.with_gwm:
            self.gwm.reset_state()

    def preprocess_addtional_kwargs(self, *args, **kwargs):
        # Overridden by subclasses to feed extra kwargs to update/readout.
        return {}


================================================
FILE: chainer_chemistry/models/gwm/gwm_net.py
================================================
from chainer import functions

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.links import GINUpdate, NFPReadout, NFPUpdate, \
    RSGCNUpdate, GeneralReadout  # NOQA
from chainer_chemistry.links.readout.ggnn_readout import GGNNReadout
from chainer_chemistry.links.update.ggnn_update import GGNNUpdate
from
chainer_chemistry.models.gwm.gwm_graph_conv_model import GWMGraphConvModel, to_array # NOQA class GGNN_GWM(GWMGraphConvModel): def __init__(self, out_dim, hidden_channels=16, n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False, weight_tying=True, activation=functions.identity, n_edge_types=4, with_gwm=True): readout_kwargs = {'activation': activation, 'activation_agg': activation} super(GGNN_GWM, self).__init__( update_layer=GGNNUpdate, readout_layer=GGNNReadout, out_dim=out_dim, hidden_channels=hidden_channels, n_update_layers=n_update_layers, n_atom_types=n_atom_types, concat_hidden=concat_hidden, weight_tying=weight_tying, n_edge_types=n_edge_types, with_gwm=with_gwm, readout_kwargs=readout_kwargs) class GIN_GWM(GWMGraphConvModel): def __init__(self, out_dim, hidden_channels=16, n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM, dropout_ratio=0.5, concat_hidden=False, weight_tying=True, activation=functions.identity, n_edge_types=4, with_gwm=True): update_kwargs = {'dropout_ratio': dropout_ratio} readout_kwargs = {'activation': activation, 'activation_agg': activation} super(GIN_GWM, self).__init__( update_layer=GINUpdate, readout_layer=GGNNReadout, out_dim=out_dim, hidden_channels=hidden_channels, n_update_layers=n_update_layers, n_atom_types=n_atom_types, concat_hidden=concat_hidden, weight_tying=weight_tying, n_edge_types=n_edge_types, with_gwm=with_gwm, update_kwargs=update_kwargs, readout_kwargs=readout_kwargs ) class NFP_GWM(GWMGraphConvModel): def __init__(self, out_dim, hidden_channels=16, n_update_layers=4, max_degree=6, n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False, with_gwm=True): update_kwargs = {'max_degree': max_degree} super(NFP_GWM, self).__init__( update_layer=NFPUpdate, readout_layer=NFPReadout, out_dim=out_dim, hidden_channels=hidden_channels, n_update_layers=n_update_layers, n_atom_types=n_atom_types, concat_hidden=concat_hidden, sum_hidden=True, with_gwm=with_gwm, update_kwargs=update_kwargs ) self.max_degree = max_degree 
self.n_degree_type = max_degree + 1 self.ch0 = hidden_channels def preprocess_addtional_kwargs(self, *args, **kwargs): atom_array, adj = args[:2] bs, num_node = atom_array.shape[:2] # For NFP Update if adj.ndim == 4: degree_mat = self.xp.sum(to_array(adj), axis=(1, 2)) elif adj.ndim == 3: degree_mat = self.xp.sum(to_array(adj), axis=1) else: raise ValueError('Unexpected value adj ' .format(adj.shape)) # deg_conds: (minibatch, atom, ch) deg_conds = [self.xp.broadcast_to( ((degree_mat - degree) == 0)[:, :, None], (bs, num_node, self.ch0)) for degree in range(1, self.n_degree_type + 1)] return {'deg_conds': deg_conds} class RSGCN_GWM(GWMGraphConvModel): def __init__(self, out_dim, hidden_channels=32, n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM, use_batch_norm=False, readout=None, dropout_ratio=0.5, with_gwm=True): if readout is None: readout = GeneralReadout super(RSGCN_GWM, self).__init__( update_layer=RSGCNUpdate, readout_layer=readout, out_dim=out_dim, hidden_channels=hidden_channels, n_update_layers=n_update_layers, n_atom_types=n_atom_types, use_batchnorm=use_batch_norm, activation=functions.relu, n_activation=n_update_layers-1, dropout_ratio=dropout_ratio, with_gwm=with_gwm) ================================================ FILE: chainer_chemistry/models/megnet.py ================================================ import chainer from chainer.backend import get_array_module from chainer import functions from chainer_chemistry.functions import megnet_softplus from chainer_chemistry.links.readout.megnet_readout import MEGNetReadout from chainer_chemistry.links.update.megnet_update import MEGNetUpdate def reshaped_feat(feat, idx): """Convert node stack pattern into pad pattern This method is converting from node stack pattern to pad pattern about node and edge feature. This is because the current set2set implementation is only focus on pad pattern feature. 
""" xp = get_array_module(idx) max_idx = int(xp.max(idx)) vec_list = [feat[idx == i] for i in range(max_idx+1)] return functions.pad_sequence(vec_list) class MEGNet(chainer.Chain): """MEGNet See Chi Chen et al, \ Graph Networks as a Universal Machine Learning Framework for Molecules and Crystals. \ `arXiv:1812.05055 `_ Args: out_dim (int): dimension of output feature vector n_update_layers (int): the number of MEGNetUpdate layers dropout_ratio (float): ratio of dropout activation (~chainer.Function or ~chainer.FunctionNode): activate function for megnet model `megnet_softplus` was used in original paper. """ def __init__(self, out_dim=32, n_update_layers=3, dropout_ratio=-1, activation=megnet_softplus): super(MEGNet, self).__init__() if n_update_layers <= 0: raise ValueError('n_update_layers must be a positive integer, ' 'but it was set to {}'.format(n_update_layers)) self.n_update_layers = n_update_layers with self.init_scope(): self.update_layers = chainer.ChainList( *[MEGNetUpdate( dim_for_dense=[64, 32], dim_for_update=[64, 64, 32], dropout_ratio=dropout_ratio, activation=activation, skip_intermediate=(i == 0) ) for i in range(n_update_layers)]) self.readout = MEGNetReadout(out_dim=out_dim, in_channels=32, n_layers=1, processing_steps=3, dropout_ratio=dropout_ratio, activation=activation) def __call__(self, atoms_feat, pair_feat, global_feat, *args): a_f = atoms_feat p_f = pair_feat g_f = global_feat # --- MGENet update --- for i in range(self.n_update_layers): a_f, p_f, g_f = self.update_layers[i](a_f, p_f, g_f, *args) # --- reshape --- atom_idx = args[0] pair_idx = args[1] a_f = reshaped_feat(a_f, atom_idx) p_f = reshaped_feat(p_f, pair_idx) # --- MGENet readout --- out = self.readout(a_f, p_f, g_f) return out ================================================ FILE: chainer_chemistry/models/mlp.py ================================================ import chainer from chainer.functions import relu from chainer import links class MLP(chainer.Chain): """Basic 
implementation for MLP Args: out_dim (int): dimension of output feature vector hidden_dim (int): dimension of feature vector associated to each atom n_layers (int): number of layers activation (chainer.functions): activation function """ def __init__(self, out_dim, hidden_dim=16, n_layers=2, activation=relu): super(MLP, self).__init__() if n_layers <= 0: raise ValueError('n_layers must be a positive integer, but it was ' 'set to {}'.format(n_layers)) layers = [links.Linear(None, hidden_dim) for i in range(n_layers - 1)] with self.init_scope(): self.layers = chainer.ChainList(*layers) self.l_out = links.Linear(None, out_dim) self.activation = activation def __call__(self, x): h = x for l in self.layers: h = self.activation(l(h)) h = self.l_out(h) return h ================================================ FILE: chainer_chemistry/models/mpnn.py ================================================ from functools import partial from typing import Optional # NOQA import chainer from chainer import cuda, functions # NOQA from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links import EmbedAtomID from chainer_chemistry.links.readout.ggnn_readout import GGNNReadout from chainer_chemistry.links.readout.mpnn_readout import MPNNReadout from chainer_chemistry.links.update.ggnn_update import GGNNUpdate from chainer_chemistry.links.update.mpnn_update import MPNNUpdate class MPNN(chainer.Chain): """Message Passing Neural Networks (MPNN). Args: out_dim (int): dimension of output feature vector hidden_channels (int): dimension of feature vector for each node n_update_layers (int): number of update layers n_atom_types (int): number of types of atoms concat_hidden (bool): If set to True, readout is executed in each layer and the result is concatenated weight_tying (bool): enable weight_tying or not n_edge_types (int): number of edge type. Defaults to 4 for single, double, triple and aromatic bond. 
        nn (~chainer.Link): Neural Networks for expanding edge vector
            dimension
        message_func (str): message function. 'edgenet' and 'ggnn' are
            supported.
        readout_func (str): readout function. 'set2set' and 'ggnn' are
            supported.
    """

    def __init__(
            self,
            out_dim,  # type: int
            hidden_channels=16,  # type: int
            n_update_layers=4,  # type: int
            n_atom_types=MAX_ATOMIC_NUM,  # type: int
            concat_hidden=False,  # type: bool
            weight_tying=True,  # type: bool
            n_edge_types=4,  # type: int
            nn=None,  # type: Optional[chainer.Link]
            message_func='edgenet',  # type: str
            readout_func='set2set',  # type: str
    ):
        # type: (...) -> None
        super(MPNN, self).__init__()
        # Validate string options early, before building any sub-links.
        if message_func not in ('edgenet', 'ggnn'):
            raise ValueError(
                'Invalid message function: {}'.format(message_func))
        if readout_func not in ('set2set', 'ggnn'):
            raise ValueError(
                'Invalid readout function: {}'.format(readout_func))
        n_readout_layer = n_update_layers if concat_hidden else 1
        n_message_layer = 1 if weight_tying else n_update_layers
        with self.init_scope():
            # Update
            self.embed = EmbedAtomID(out_size=hidden_channels,
                                     in_size=n_atom_types)
            if message_func == 'ggnn':
                self.update_layers = chainer.ChainList(*[
                    GGNNUpdate(hidden_channels=hidden_channels,
                               n_edge_types=n_edge_types)
                    for _ in range(n_message_layer)
                ])
            else:
                self.update_layers = chainer.ChainList(*[
                    MPNNUpdate(hidden_channels=hidden_channels, nn=nn)
                    for _ in range(n_message_layer)
                ])
            # Readout
            if readout_func == 'ggnn':
                # GGNNReadout consumes [h, h0] concatenated -> 2x channels.
                self.readout_layers = chainer.ChainList(*[
                    GGNNReadout(out_dim=out_dim,
                                in_channels=hidden_channels * 2)
                    for _ in range(n_readout_layer)
                ])
            else:
                self.readout_layers = chainer.ChainList(*[
                    MPNNReadout(out_dim=out_dim,
                                in_channels=hidden_channels,
                                n_layers=1)
                    for _ in range(n_readout_layer)
                ])
        self.out_dim = out_dim
        self.hidden_channels = hidden_channels
        self.n_update_layers = n_update_layers
        self.n_edge_types = n_edge_types
        self.concat_hidden = concat_hidden
        self.weight_tying = weight_tying
        self.message_func = message_func
        self.readout_func = readout_func

    def __call__(self, atom_array, adj):
        # type: (numpy.ndarray, numpy.ndarray) -> chainer.Variable
        """Forward propagation.

        Args:
            atom_array (numpy.ndarray): minibatch of molecular which is
                represented with atom IDs (representing C, O, S, ...)
                `atom_array[mol_index, atom_index]` represents `mol_index`-th
                molecule's `atom_index`-th atomic number
            adj (numpy.ndarray): minibatch of adjancency matrix with
                edge-type information

        Returns:
            ~chainer.Variable: minibatch of fingerprint
        """
        # reset state
        self.reset_state()
        if atom_array.dtype == self.xp.int32:
            h = self.embed(atom_array)
        else:
            h = atom_array
        if self.readout_func == 'ggnn':
            # GGNN readout also needs the initial embedding h0; bind it here.
            h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)
            readout_layers = [
                partial(readout_layer, h0=h0)
                for readout_layer in self.readout_layers
            ]
        else:
            readout_layers = self.readout_layers
        g_list = []
        for step in range(self.n_update_layers):
            message_layer_index = 0 if self.weight_tying else step
            h = self.update_layers[message_layer_index](h, adj)
            if self.concat_hidden:
                g = readout_layers[step](h)
                g_list.append(g)

        if self.concat_hidden:
            return functions.concat(g_list, axis=1)
        else:
            g = readout_layers[0](h)
            return g

    def reset_state(self):
        # type: () -> None
        """Reset the internal state of all update layers."""
        [update_layer.reset_state()
         for update_layer in self.update_layers]


================================================
FILE: chainer_chemistry/models/nfp.py
================================================
import chainer
from chainer import Variable, functions  # NOQA

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.links import EmbedAtomID
from chainer_chemistry.links.readout.nfp_readout import NFPReadout
from chainer_chemistry.links.update.nfp_update import NFPUpdate


class NFP(chainer.Chain):
    """Neural Finger Print (NFP)

    See: David K Duvenaud, Dougal Maclaurin, Jorge Iparraguirre, Rafael
        Bombarell, Timothy Hirzel, Alan Aspuru-Guzik, and Ryan P Adams (2015).
        Convolutional networks on graphs for learning molecular fingerprints.
        *Advances in Neural Information Processing Systems (NIPS) 28*,

    Args:
        out_dim (int): dimension of output feature vector
        hidden_channels (int): dimension of feature vector for each node
        n_update_layers (int): number of layers
        max_degree (int): max degree of atoms
            when molecules are regarded as graphs
        n_atom_types (int): number of types of atoms
        concat_hidden (bool): If set to True, readout is executed in each layer
            and the result is concatenated
    """

    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,
                 max_degree=6, n_atom_types=MAX_ATOMIC_NUM,
                 concat_hidden=False):
        super(NFP, self).__init__()
        # Degrees range over 0..max_degree -> max_degree + 1 distinct types.
        n_degree_types = max_degree + 1
        with self.init_scope():
            self.embed = EmbedAtomID(in_size=n_atom_types,
                                     out_size=hidden_channels)
            self.layers = chainer.ChainList(
                *[NFPUpdate(hidden_channels, hidden_channels,
                            max_degree=max_degree)
                  for _ in range(n_update_layers)])
            self.readout_layers = chainer.ChainList(
                *[NFPReadout(out_dim=out_dim, in_channels=hidden_channels)
                  for _ in range(n_update_layers)])
        self.out_dim = out_dim
        self.hidden_channels = hidden_channels
        self.max_degree = max_degree
        self.n_degree_types = n_degree_types
        self.n_update_layers = n_update_layers
        self.concat_hidden = concat_hidden

    def __call__(self, atom_array, adj, is_real_node=None):
        """Forward propagation

        Args:
            atom_array (numpy.ndarray): minibatch of molecular which is
                represented with atom IDs (representing C, O, S, ...)
                `atom_array[mol_index, atom_index]` represents `mol_index`-th
                molecule's `atom_index`-th atomic number
            adj (numpy.ndarray): minibatch of adjancency matrix
                `adj[mol_index]` represents `mol_index`-th molecule's
                adjacency matrix
            is_real_node (numpy.ndarray): 2-dim array (minibatch, num_nodes).
                1 for real node, 0 for virtual node.
                If `None`, all node is considered as real node.

        Returns:
            ~chainer.Variable: minibatch of fingerprint
        """
        if atom_array.dtype == self.xp.int32:
            # atom_array: (minibatch, atom)
            h = self.embed(atom_array)
        else:
            h = atom_array
        # h: (minibatch, atom, ch)
        g = 0

        # --- NFP update & readout ---
        # degree_mat: (minibatch, max_num_atoms)
        if isinstance(adj, Variable):
            adj_array = adj.data
        else:
            adj_array = adj
        degree_mat = self.xp.sum(adj_array, axis=1)
        # deg_conds: (minibatch, atom, ch)
        deg_conds = [self.xp.broadcast_to(
            ((degree_mat - degree) == 0)[:, :, None], h.shape)
            for degree in range(1, self.n_degree_types + 1)]
        g_list = []
        # Fingerprint is accumulated (summed) across every layer's readout.
        for update, readout in zip(self.layers, self.readout_layers):
            h = update(h, adj, deg_conds)
            dg = readout(h, is_real_node)
            g = g + dg
            if self.concat_hidden:
                g_list.append(g)

        if self.concat_hidden:
            return functions.concat(g_list, axis=2)
        else:
            return g


================================================
FILE: chainer_chemistry/models/prediction/__init__.py
================================================
from chainer_chemistry.models.prediction import base  # NOQA
from chainer_chemistry.models.prediction import classifier  # NOQA
from chainer_chemistry.models.prediction import graph_conv_predictor  # NOQA
from chainer_chemistry.models.prediction import regressor  # NOQA

from chainer_chemistry.models.prediction.base import BaseForwardModel  # NOQA
from chainer_chemistry.models.prediction.classifier import Classifier  # NOQA
from chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor  # NOQA
from chainer_chemistry.models.prediction.regressor import Regressor  # NOQA
from chainer_chemistry.models.prediction.set_up_predictor import set_up_predictor  # NOQA


================================================
FILE: chainer_chemistry/models/prediction/base.py
================================================
import pickle

import numpy

import chainer
from chainer import cuda
from chainer.dataset.convert import concat_examples
from chainer.iterators import SerialIterator
from chainer import link
import chainerx  # NOQA


def _to_tuple(x):
    # Normalize a single value into a 1-tuple; tuples pass through unchanged.
    if not isinstance(x, tuple):
        x = (x,)
    return x


def _extract_numpy(x):
    # Unwrap a Variable and move the underlying array to host memory.
    if isinstance(x, chainer.Variable):
        x = x.data
    return cuda.to_cpu(x)


class BaseForwardModel(link.Chain):
    """A base model which supports forward functionality.

    It also supports pickle save/load functionality.
    """

    def __init__(self):
        super(BaseForwardModel, self).__init__()
        # Populated by `_forward` when retain_inputs=True.
        self.inputs = None

    def initialize(self, device=-1):
        """Initialization of the model.

        It must be executed **after** the link registration
        (often done by `with self.init_scope()` finished.

        Args:
            device (int or chainer._backend.Device): GPU device id of this
                model to be used. -1 indicates to use in CPU.
        """
        self.update_device(device=device)

    def update_device(self, device=-1):
        # Move the model to `device`, going through CPU to reset state.
        if not isinstance(device, chainer._backend.Device):
            device = chainer.get_device(device)  # type: chainerx.Device
        if self.device != device:
            device.use()
            # reset current state
            self.to_cpu()
            # update the model to specified device
            self.to_device(device)

    def _forward(self, data, fn, batchsize=16,
                 converter=concat_examples,
                 retain_inputs=False, preprocess_fn=None,
                 postprocess_fn=None):
        """Forward data by iterating with batch

        Args:
            data: "train_x array" or "chainer dataset"
            fn (Callable): Main function to forward. Its input argument is
                either Variable, cupy.ndarray or numpy.ndarray, and returns
                Variable.
            batchsize (int): batch size
            converter (Callable): convert from `data` to `inputs`
            retain_inputs (bool): If True, this instance keeps inputs in
                `self.inputs` or not.
            preprocess_fn (Callable): Its input is numpy.ndarray or
                cupy.ndarray, it can return either Variable, cupy.ndarray or
                numpy.ndarray
            postprocess_fn (Callable): Its input argument is Variable,
                but this method may return either Variable, cupy.ndarray or
                numpy.ndarray.

        Returns (tuple or numpy.ndarray): forward result
        """
        input_list = None
        output_list = None
        it = SerialIterator(data, batch_size=batchsize, repeat=False,
                            shuffle=False)
        for batch in it:
            inputs = converter(batch, self.device)
            inputs = _to_tuple(inputs)

            if preprocess_fn:
                inputs = preprocess_fn(*inputs)
                inputs = _to_tuple(inputs)

            outputs = fn(*inputs)
            outputs = _to_tuple(outputs)

            # Init
            if retain_inputs:
                if input_list is None:
                    input_list = [[] for _ in range(len(inputs))]
                for j, input in enumerate(inputs):
                    input_list[j].append(cuda.to_cpu(input))
            if output_list is None:
                output_list = [[] for _ in range(len(outputs))]

            if postprocess_fn:
                outputs = postprocess_fn(*outputs)
                outputs = _to_tuple(outputs)
            for j, output in enumerate(outputs):
                # Outputs are accumulated per position, then concatenated
                # over the batch axis after the loop.
                output_list[j].append(_extract_numpy(output))

        if retain_inputs:
            self.inputs = [numpy.concatenate(
                in_array) for in_array in input_list]

        result = [numpy.concatenate(output) for output in output_list]

        # Unwrap a single-output result for convenience.
        if len(result) == 1:
            return result[0]
        else:
            return result

    def save_pickle(self, filepath, protocol=None):
        """Save the model to `filepath` as a pickle file

        This function send the parameters to CPU before saving the model so
        that the pickled file can be loaded with in CPU-only environment.
        After the model is saved, it is sent back to the original device.

        Saved pickle file can be loaded with `load_pickle` static method.

        Note that the transportability of the saved file follows the
        specification of `pickle` module, namely serialized data depends on
        the specific class or attribute structure when saved. The file may
        not be loaded in different environment (version of python or
        dependent libraries), or after large refactoring of the pickled
        object class. If you want to avoid it, use
        `chainer.serializers.save_npz` method instead to save only model
        parameters.

        ..
        admonition:: Example

           >>> from chainer_chemistry.models import BaseForwardModel
           >>> class DummyForwardModel(BaseForwardModel):
           >>>
           >>>     def __init__(self, device=-1):
           >>>         super(DummyForwardModel, self).__init__()
           >>>         with self.init_scope():
           >>>             self.l = chainer.links.Linear(3, 10)
           >>>         self.initialize(device)
           >>>
           >>>     def __call__(self, x):
           >>>         return self.l(x)
           >>>
           >>> model = DummyForwardModel()
           >>> filepath = 'model.pkl'
           >>> model.save_pickle(filepath)

        Args:
            filepath (str): file path of pickle file.
            protocol (int or None): protocol version used in `pickle`.
                Use 2 if you need python2/python3 compatibility.
                3 or higher is used for python3.
                Please refer the official document [1] for more details.
                [1]: https://docs.python.org/3.6/library/pickle.html#module-interface
        """  # NOQA
        current_device = self.device

        # --- Move the model to CPU for saving ---
        self.update_device(-1)
        with open(filepath, mode='wb') as f:
            pickle.dump(self, f, protocol=protocol)

        # --- Revert the model to original device ---
        self.update_device(current_device)

    @staticmethod
    def load_pickle(filepath, device=-1):
        """Load the model from `filepath` of pickle file, and send to `device`

        The file saved by `save_pickle` method can be loaded, but it may fail
        to load when loading from different develop environment or after
        updating library version.
        See `save_pickle` method for the transportability of the saved file.

        .. admonition:: Example

           >>> from chainer_chemistry.models import BaseForwardModel
           >>> filepath = 'model.pkl'
           >>> # `load_pickle` is static method, call from Class to get an
           >>> # instance
           >>> model = BaseForwardModel.load_pickle(filepath)

        Args:
            filepath (str): file path of pickle file.
            device (int or chainerx.Device): GPU device id of this model to
                be used. -1 indicates to use in CPU.
        """
        with open(filepath, mode='rb') as f:
            model = pickle.load(f)

        if not isinstance(model, BaseForwardModel):
            raise TypeError('Unexpected type {}'.format(type(model)))

        # --- Revert the model to specified device ---
        model.initialize(device)
        return model


================================================
FILE: chainer_chemistry/models/prediction/classifier.py
================================================
import warnings

import numpy

import chainer
from chainer.dataset.convert import concat_examples
from chainer.functions.evaluation import accuracy
from chainer.functions.loss import softmax_cross_entropy
from chainer import cuda, Variable  # NOQA
from chainer import reporter

from chainer_chemistry.models.prediction.base import BaseForwardModel


def _argmax(*args):
    # Default postprocess for `predict`: take the most probable class index.
    x = args[0]
    return chainer.functions.argmax(x, axis=1)


class Classifier(BaseForwardModel):
    """A simple classifier model.

    This is an example of chain that wraps another chain. It computes the
    loss and accuracy based on a given input/label pair.

    Args:
        predictor (~chainer.Link): Predictor network.
        lossfun (function): Loss function.
        accfun (function): DEPRECATED. Please use `metrics_fun` instead.
        metrics_fun (function or dict or None): Function that computes
            metrics.
        label_key (int or str): Key to specify label variable from arguments.
            When it is ``int``, a variable in positional arguments is used.
            And when it is ``str``, a variable in keyword arguments is used.
        device (int or chainer._backend.Device): GPU device id of this
            Regressor to be used. -1 indicates to use in CPU.

    Attributes:
        predictor (~chainer.Link): Predictor network.
        lossfun (function): Loss function.
        accfun (function): DEPRECATED. Please use `metrics_fun` instead.
        y (~chainer.Variable): Prediction for the last minibatch.
        loss (~chainer.Variable): Loss value for the last minibatch.
        metrics (dict): Metrics computed in last minibatch
        compute_metrics (bool): If ``True``, compute metrics on the forward
            computation. The default value is ``True``.

    ..
note:: The differences between original `Classifier` class in chainer and chainer chemistry are as follows. 1. `predict` and `predict_proba` methods are supported. 2. `device` can be managed internally by the `Classifier` 3. `accfun` is deprecated, `metrics_fun` is used instead. 4. `metrics_fun` can be `dict` which specifies the metrics name as key and function as value. .. note:: This link uses :func:`chainer.softmax_cross_entropy` with default arguments as a loss function (specified by ``lossfun``), if users do not explicitly change it. In particular, the loss function does not support double backpropagation. If you need second or higher order differentiation, you need to turn it on with ``enable_double_backprop=True``: >>> import chainer.functions as F >>> import chainer.links as L >>> >>> def lossfun(x, t): ... return F.softmax_cross_entropy( ... x, t, enable_double_backprop=True) >>> >>> predictor = L.Linear(10) >>> model = L.Classifier(predictor, lossfun=lossfun) """ compute_metrics = True def __init__(self, predictor, lossfun=softmax_cross_entropy.softmax_cross_entropy, accfun=None, metrics_fun=accuracy.accuracy, label_key=-1, device=-1): if not (isinstance(label_key, (int, str))): raise TypeError('label_key must be int or str, but is %s' % type(label_key)) if accfun is not None: warnings.warn( 'accfun is deprecated, please use metrics_fun instead') warnings.warn('overriding metrics by accfun...') # override metrics by accfun metrics_fun = accfun super(Classifier, self).__init__() self.lossfun = lossfun if metrics_fun is None: self.compute_metrics = False self.metrics_fun = {} elif callable(metrics_fun): self.metrics_fun = {'accuracy': metrics_fun} elif isinstance(metrics_fun, dict): self.metrics_fun = metrics_fun else: raise TypeError('Unexpected type metrics_fun must be None or ' 'Callable or dict. 
actual {}'.format(type(accfun))) self.y = None self.loss = None self.metrics = None self.label_key = label_key with self.init_scope(): self.predictor = predictor # `initialize` must be called after `init_scope`. self.initialize(device) def _convert_to_scalar(self, value): """Converts an input value to a scalar if its type is a Variable, numpy or cupy array, otherwise it returns the value as it is. """ if isinstance(value, Variable): value = value.array if numpy.isscalar(value): return value if type(value) is not numpy.array: value = cuda.to_cpu(value) return numpy.asscalar(value) def __call__(self, *args, **kwargs): """Computes the loss value for an input and label pair. It also computes accuracy and stores it to the attribute. Args: args (list of ~chainer.Variable): Input minibatch. kwargs (dict of ~chainer.Variable): Input minibatch. When ``label_key`` is ``int``, the correpoding element in ``args`` is treated as ground truth labels. And when it is ``str``, the element in ``kwargs`` is used. The all elements of ``args`` and ``kwargs`` except the ground trush labels are features. It feeds features to the predictor and compare the result with ground truth labels. Returns: ~chainer.Variable: Loss value. 
""" # --- Separate `args` and `t` --- if isinstance(self.label_key, int): if not (-len(args) <= self.label_key < len(args)): msg = 'Label key %d is out of bounds' % self.label_key raise ValueError(msg) t = args[self.label_key] if self.label_key == -1: args = args[:-1] else: args = args[:self.label_key] + args[self.label_key + 1:] elif isinstance(self.label_key, str): if self.label_key not in kwargs: msg = 'Label key "%s" is not found' % self.label_key raise ValueError(msg) t = kwargs[self.label_key] del kwargs[self.label_key] else: raise TypeError('Label key type {} not supported' .format(type(self.label_key))) self.y = None self.loss = None self.metrics = None self.y = self.predictor(*args, **kwargs) self.loss = self.lossfun(self.y, t) reporter.report( {'loss': self._convert_to_scalar(self.loss)}, self) if self.compute_metrics: # Note: self.accuracy is `dict`, which is different from original # chainer implementation self.metrics = {key: self._convert_to_scalar(value(self.y, t)) for key, value in self.metrics_fun.items()} reporter.report(self.metrics, self) return self.loss def predict_proba( self, data, batchsize=16, converter=concat_examples, retain_inputs=False, preprocess_fn=None, postprocess_fn=chainer.functions.softmax): """Calculate probability of each category. Args: data: "train_x array" or "chainer dataset" fn (Callable): Main function to forward. Its input argument is either Variable, cupy.ndarray or numpy.ndarray, and returns Variable. batchsize (int): batch size converter (Callable): convert from `data` to `inputs` preprocess_fn (Callable): Its input is numpy.ndarray or cupy.ndarray, it can return either Variable, cupy.ndarray or numpy.ndarray postprocess_fn (Callable): Its input argument is Variable, but this method may return either Variable, cupy.ndarray or numpy.ndarray. retain_inputs (bool): If True, this instance keeps inputs in `self.inputs` or not. 
Returns (tuple or numpy.ndarray): Typically, it is 2-dimensional float array with shape (batchsize, number of category) which represents each examples probability to be each category. """ with chainer.no_backprop_mode(), chainer.using_config('train', False): proba = self._forward( data, fn=self.predictor, batchsize=batchsize, converter=converter, retain_inputs=retain_inputs, preprocess_fn=preprocess_fn, postprocess_fn=postprocess_fn) return proba def predict( self, data, batchsize=16, converter=concat_examples, retain_inputs=False, preprocess_fn=None, postprocess_fn=_argmax): """Predict label of each category by taking . Args: data: input data batchsize (int): batch size converter (Callable): convert from `data` to `inputs` preprocess_fn (Callable): Its input is numpy.ndarray or cupy.ndarray, it can return either Variable, cupy.ndarray or numpy.ndarray postprocess_fn (Callable): Its input argument is Variable, but this method may return either Variable, cupy.ndarray or numpy.ndarray. retain_inputs (bool): If True, this instance keeps inputs in `self.inputs` or not. Returns (tuple or numpy.ndarray): Typically, it is 1-dimensional int array with shape (batchsize, ) which represents each examples category prediction. 
""" with chainer.no_backprop_mode(), chainer.using_config('train', False): predict_labels = self._forward( data, fn=self.predictor, batchsize=batchsize, converter=converter, retain_inputs=retain_inputs, preprocess_fn=preprocess_fn, postprocess_fn=postprocess_fn) return predict_labels # --- For backward compatibility --- @property def compute_accuracy(self): warnings.warn('compute_accuracy is deprecated,' 'please use compute_metrics instead') return self.compute_metrics @compute_accuracy.setter def compute_accuracy(self, value): warnings.warn('compute_accuracy is deprecated,' 'please use compute_metrics instead') self.compute_metrics = value @property def accuracy(self): warnings.warn('accuracy is deprecated,' 'please use metrics instead') return self.metrics @accuracy.setter def accuracy(self, value): warnings.warn('accuracy is deprecated,' 'please use metrics instead') self.metrics = value @property def accfun(self): warnings.warn('accfun is deprecated,' 'please use metrics_fun instead') return self.metrics_fun @accfun.setter def accfun(self, value): warnings.warn('accfun is deprecated,' 'please use metrics_fun instead') self.metrics_fun = value ================================================ FILE: chainer_chemistry/models/prediction/graph_conv_predictor.py ================================================ from typing import Optional # NOQA import chainer import numpy # NOQA class GraphConvPredictor(chainer.Chain): """Wrapper class that combines a graph convolution and MLP.""" def __init__( self, graph_conv, # type: chainer.Link mlp=None, # type: Optional[chainer.Link] label_scaler=None, # type: Optional[chainer.Link] postprocess_fn=None # type: Optional[chainer.FunctionNode] ): # type: (...) -> None """Initialize the graph convolution predictor. Args: graph_conv (chainer.Chain): The graph convolution network required to obtain molecule feature representation. mlp (chainer.Chain or None): Multi layer perceptron; used as the final fully connected layer. 
                Set it to `None` if no operation is necessary after
                the `graph_conv` calculation.
            label_scaler (chainer.Link or None): scaler link
            postprocess_fn (chainer.FunctionNode or None):
                postprocess function for prediction.
        """
        super(GraphConvPredictor, self).__init__()
        # Register each component under `init_scope` only when it is a
        # `chainer.Link` (so its parameters are tracked); otherwise store it
        # as a plain attribute.
        with self.init_scope():
            self.graph_conv = graph_conv
            if isinstance(mlp, chainer.Link):
                self.mlp = mlp
            if isinstance(label_scaler, chainer.Link):
                self.label_scaler = label_scaler
        if not isinstance(mlp, chainer.Link):
            self.mlp = mlp
        if not isinstance(label_scaler, chainer.Link):
            self.label_scaler = label_scaler
        self.postprocess_fn = postprocess_fn or chainer.functions.identity

    def __call__(self, *args, **kwargs):
        # Forward: graph conv -> (optional) MLP -> (optional) inverse scaling.
        x = self.graph_conv(*args, **kwargs)
        if self.mlp:
            x = self.mlp(x)
        if self.label_scaler is not None:
            x = self.label_scaler.inverse_transform(x)
        return x

    def predict(self, atoms, adjs):
        # type: (numpy.ndarray, numpy.ndarray) -> chainer.Variable
        # TODO(nakago): support super_node & is_real_node args.
        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            x = self.__call__(atoms, adjs)
            return self.postprocess_fn(x)


================================================
FILE: chainer_chemistry/models/prediction/node_classifier.py
================================================
from chainer import reporter

from chainer_chemistry.models.prediction.classifier import Classifier


class NodeClassifier(Classifier):
    """A simple node classifier model."""

    def __call__(self, data, train_mask, valid_mask, *args, **kwargs):
        """Computes the loss value for an input and label pair.

        Both train and valid losses/metrics are reported, but only the
        train loss is returned (it is the value backpropagated by the
        updater).
        """
        self.metrics = None
        self.y = self.predictor(data)
        # Support for padding pattern
        if self.y.ndim == 3:
            assert self.y.shape[0] == 1
            self.y = self.y[0]
        self.train_loss = self.lossfun(self.y[train_mask], data.y[train_mask])
        self.valid_loss = self.lossfun(self.y[valid_mask], data.y[valid_mask])

        reporter.report(
            {'loss(train)': self._convert_to_scalar(self.train_loss)}, self)
        reporter.report(
            {'loss(valid)': self._convert_to_scalar(self.valid_loss)}, self)

        if self.compute_metrics:
            # Note: self.accuracy is `dict`, which is different from original
            # chainer implementation
            self.train_metrics = {key + "(train)": self._convert_to_scalar(
                value(self.y[train_mask], data.y[train_mask]))
                for key, value in self.metrics_fun.items()}
            self.valid_metrics = {key + "(valid)": self._convert_to_scalar(
                value(self.y[valid_mask], data.y[valid_mask]))
                for key, value in self.metrics_fun.items()}
            reporter.report(self.train_metrics, self)
            reporter.report(self.valid_metrics, self)
        return self.train_loss


================================================
FILE: chainer_chemistry/models/prediction/regressor.py
================================================
import numpy

import chainer
from chainer.dataset.convert import concat_examples
from chainer import cuda, Variable  # NOQA
from chainer import reporter

from chainer_chemistry.dataset.graph_dataset.base_graph_data import BaseGraphData  # NOQA
from chainer_chemistry.models.prediction.base import BaseForwardModel


class Regressor(BaseForwardModel):
    """A simple regressor model.

    This is an example of chain that wraps another chain. It computes the
    loss and metrics based on a given input/label pair.

    Args:
        predictor (~chainer.Link): Predictor network.
        lossfun (function): Loss function.
        metrics_fun (function or dict or None): Function that computes
            metrics.
        label_key (int or str): Key to specify label variable from arguments.
            When it is ``int``, a variable in positional arguments is used.
            And when it is ``str``, a variable in keyword arguments is used.
        device (int or chainer._backend.Device): GPU device id of this
            Regressor to be used. -1 indicates to use in CPU.

    Attributes:
        predictor (~chainer.Link): Predictor network.
        lossfun (function): Loss function.
        y (~chainer.Variable): Prediction for the last minibatch.
        loss (~chainer.Variable): Loss value for the last minibatch.
metrics (dict): Metrics computed in last minibatch compute_metrics (bool): If ``True``, compute metrics on the forward computation. The default value is ``True``. """ compute_metrics = True def __init__(self, predictor, lossfun=chainer.functions.mean_squared_error, metrics_fun=None, label_key=-1, device=-1): if not (isinstance(label_key, (int, str))): raise TypeError('label_key must be int or str, but is %s' % type(label_key)) super(Regressor, self).__init__() self.lossfun = lossfun if metrics_fun is None: self.compute_metrics = False self.metrics_fun = {} elif callable(metrics_fun): self.metrics_fun = {'metrics': metrics_fun} elif isinstance(metrics_fun, dict): self.metrics_fun = metrics_fun else: raise TypeError('Unexpected type metrics_fun must be None or ' 'Callable or dict. actual {}' .format(type(metrics_fun))) self.y = None self.loss = None self.metrics = None self.label_key = label_key with self.init_scope(): self.predictor = predictor # `initialize` must be called after `init_scope`. self.initialize(device) def _convert_to_scalar(self, value): """Converts an input value to a scalar if its type is a Variable, numpy or cupy array, otherwise it returns the value as it is. """ if isinstance(value, Variable): value = value.array if numpy.isscalar(value): return value if type(value) is not numpy.array: value = cuda.to_cpu(value) return numpy.asscalar(value) def __call__(self, *args, **kwargs): """Computes the loss value for an input and label pair. It also computes metrics and stores it to the attribute. Args: args (list of ~chainer.Variable): Input minibatch. kwargs (dict of ~chainer.Variable): Input minibatch. When ``label_key`` is ``int``, the correpoding element in ``args`` is treated as ground truth labels. And when it is ``str``, the element in ``kwargs`` is used. The all elements of ``args`` and ``kwargs`` except the ground trush labels are features. It feeds features to the predictor and compare the result with ground truth labels. 
Returns: ~chainer.Variable: Loss value. """ # --- Separate `args` and `t` --- if isinstance(args[0], BaseGraphData): # for graph dataset t = args[0].y elif isinstance(self.label_key, int): if not (-len(args) <= self.label_key < len(args)): msg = 'Label key %d is out of bounds' % self.label_key raise ValueError(msg) t = args[self.label_key] if self.label_key == -1: args = args[:-1] else: args = args[:self.label_key] + args[self.label_key + 1:] elif isinstance(self.label_key, str): if self.label_key not in kwargs: msg = 'Label key "%s" is not found' % self.label_key raise ValueError(msg) t = kwargs[self.label_key] del kwargs[self.label_key] else: raise TypeError('Label key type {} not supported' .format(type(self.label_key))) self.y = None self.loss = None self.metrics = None self.y = self.predictor(*args, **kwargs) self.loss = self.lossfun(self.y, t) # When the reported data is a numpy array, the loss and metrics values # are scalars. When the reported data is a cupy array, sometimes the # same values become arrays instead. This seems to be a bug inside the # reporter class, which needs to be addressed and fixed. Until then, # the reported values will be converted to numpy arrays. reporter.report( {'loss': self._convert_to_scalar(self.loss)}, self) if self.compute_metrics: # Note: self.metrics_fun is `dict`, # which is different from original chainer implementation self.metrics = {key: self._convert_to_scalar(value(self.y, t)) for key, value in self.metrics_fun.items()} reporter.report(self.metrics, self) return self.loss def predict( self, data, batchsize=16, converter=concat_examples, retain_inputs=False, preprocess_fn=None, postprocess_fn=None): """Predict label of each category by taking . 
Args: data: input data batchsize (int): batch size converter (Callable): convert from `data` to `inputs` preprocess_fn (Callable): Its input is numpy.ndarray or cupy.ndarray, it can return either Variable, cupy.ndarray or numpy.ndarray postprocess_fn (Callable): Its input argument is Variable, but this method may return either Variable, cupy.ndarray or numpy.ndarray. retain_inputs (bool): If True, this instance keeps inputs in `self.inputs` or not. Returns (tuple or numpy.ndarray): Typically, it is 1-dimensional int array with shape (batchsize, ) which represents each examples category prediction. """ with chainer.no_backprop_mode(), chainer.using_config('train', False): predict_labels = self._forward( data, fn=self.predictor, batchsize=batchsize, converter=converter, retain_inputs=retain_inputs, preprocess_fn=preprocess_fn, postprocess_fn=postprocess_fn) return predict_labels ================================================ FILE: chainer_chemistry/models/prediction/set_up_predictor.py ================================================ from typing import Any # NOQA from typing import Dict # NOQA from typing import Optional # NOQA import chainer # NOQA from chainer_chemistry.models.cgcnn import CGCNN from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.models.ggnn import GGNN from chainer_chemistry.models.gin import GIN, GINSparse # NOQA from chainer_chemistry.models.gnn_film import GNNFiLM from chainer_chemistry.models.megnet import MEGNet from chainer_chemistry.models.mlp import MLP from chainer_chemistry.models.nfp import NFP from chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor # NOQA from chainer_chemistry.models.relgat import RelGAT from chainer_chemistry.models.relgcn import RelGCN, RelGCNSparse # NOQA from chainer_chemistry.models.rsgcn import RSGCN from chainer_chemistry.models.schnet import SchNet from chainer_chemistry.models.weavenet import WeaveNet from chainer_chemistry.models.gwm.gwm_net import 
GGNN_GWM # NOQA from chainer_chemistry.models.gwm.gwm_net import GIN_GWM # NOQA from chainer_chemistry.models.gwm.gwm_net import NFP_GWM # NOQA from chainer_chemistry.models.gwm.gwm_net import RSGCN_GWM # NOQA from chainer_chemistry.models.cwle.cwle_net import GGNN_CWLE # NOQA from chainer_chemistry.models.cwle.cwle_net import RelGAT_CWLE # NOQA from chainer_chemistry.models.cwle.cwle_net import RelGCN_CWLE # NOQA from chainer_chemistry.models.cwle.cwle_net import GIN_CWLE # NOQA from chainer_chemistry.models.cwle.cwle_net import NFP_CWLE # NOQA from chainer_chemistry.models.cwle.cwle_net import RSGCN_CWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import GGNN_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import RelGAT_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import RelGCN_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import GIN_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import NFP_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import RSGCN_GWLE # NOQA from chainer_chemistry.models.cwle.cwle_graph_conv_model import MAX_WLE_NUM def set_up_predictor( method, # type: str n_unit, # type: int conv_layers, # type: int class_num, # type: int label_scaler=None, # type: Optional[chainer.Link] postprocess_fn=None, # type: Optional[chainer.FunctionNode] n_atom_types=MAX_ATOMIC_NUM, conv_kwargs=None, # type: Optional[Dict[str, Any]] n_wle_types=MAX_WLE_NUM # type: int ): # type: (...) -> GraphConvPredictor """Set up the predictor, consisting of a GCN and a MLP. Args: method (str): Method name. n_unit (int): Number of hidden units. conv_layers (int): Number of convolutional layers for the graph convolution network. class_num (int): Number of output classes. label_scaler (chainer.Link or None): scaler link postprocess_fn (chainer.FunctionNode or None): postprocess function for prediction. conv_kwargs (dict): keyword args for GraphConvolution model. 
""" mlp = MLP(out_dim=class_num, hidden_dim=n_unit) # type: Optional[MLP] if conv_kwargs is None: conv_kwargs = {} if method == 'nfp' or method == 'nfp_wle': print('Set up NFP predictor...') conv = NFP( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'ggnn' or method == 'ggnn_wle': print('Set up GGNN predictor...') conv = GGNN( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'schnet': print('Set up SchNet predictor...') conv = SchNet( out_dim=class_num, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, **conv_kwargs) mlp = None elif method == 'weavenet': print('Set up WeaveNet predictor...') conv = WeaveNet(hidden_dim=n_unit, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'rsgcn' or method == 'rsgcn_wle': print('Set up RSGCN predictor...') conv = RSGCN(out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'relgcn' or method == 'relgcn_wle': print('Set up Relational GCN predictor...') num_edge_type = 4 conv = RelGCN( out_dim=n_unit, n_edge_types=num_edge_type, scale_adj=True, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'relgat' or method == 'relgat_wle': print('Set up Relational GAT predictor...') conv = RelGAT( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'gin' or method == 'gin_wle': print('Set up GIN predictor...') conv = GIN( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'nfp_gwm': print('Set up NFP_GWM predictor...') conv = NFP_GWM( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'ggnn_gwm': print('Set up GGNN_GWM predictor...') conv = GGNN_GWM( 
out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'rsgcn_gwm': print('Set up RSGCN_GWM predictor...') conv = RSGCN_GWM( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'gin_gwm': print('Set up GIN_GWM predictor...') conv = GIN_GWM( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'nfp_cwle': print('Set up NFP_CWLE predictor...') conv = NFP_CWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'ggnn_cwle': print('Set up GGNN_CWLE predictor...') conv = GGNN_CWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'relgat_cwle': print('Set up RelGAT_CWLE predictor...') conv = RelGAT_CWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'relgcn_cwle': print('Set up RelGCN_CWLE predictor...') conv = RelGCN_CWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'rsgcn_cwle': print('Set up RSGCN_CWLE predictor...') conv = RSGCN_CWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'gin_cwle': print('Set up GIN_CWLE predictor...') conv = GIN_CWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'nfp_gwle': print('Set up NFP_GWLE predictor...') conv = NFP_GWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, 
n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'ggnn_gwle': print('Set up GGNN_GWLE predictor...') conv = GGNN_GWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'relgat_gwle': print('Set up RelGAT_GWLE predictor...') conv = RelGAT_GWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'relgcn_gwle': print('Set up RelGCN_GWLE predictor...') conv = RelGCN_GWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'rsgcn_gwle': print('Set up RSGCN_GWLE predictor...') conv = RSGCN_GWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'gin_cwle': print('Set up GIN_CWLE predictor...') conv = GIN_CWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'gin_gwle': print('Set up GIN_gWLE predictor...') conv = GIN_GWLE( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, n_wle_types=n_wle_types, **conv_kwargs) elif method == 'relgcn_sparse': print('Set up RelGCNSparse predictor...') conv = RelGCNSparse( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'gin_sparse': print('Set up GIN predictor...') conv = GINSparse( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_atom_types=n_atom_types, **conv_kwargs) elif method == 'gnnfilm': print('Training a GNN_FiLM predictor...') conv = GNNFiLM( out_dim=n_unit, hidden_channels=n_unit, n_update_layers=conv_layers, n_edge_types=5, n_atom_types=n_atom_types, 
**conv_kwargs) elif method == 'megnet': print('Set up MEGNet predictor...') conv = MEGNet( out_dim=n_unit, n_update_layers=conv_layers, **conv_kwargs) elif method == 'cgcnn': print('Set up CGCNN predictor...') conv = CGCNN( out_dim=n_unit, n_update_layers=conv_layers, **conv_kwargs) else: raise ValueError('[ERROR] Invalid method: {}'.format(method)) predictor = GraphConvPredictor(conv, mlp, label_scaler, postprocess_fn) return predictor ================================================ FILE: chainer_chemistry/models/relgat.py ================================================ # -*- coding: utf-8 -*- import chainer from chainer import functions, cuda # NOQA from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links import EmbedAtomID from chainer_chemistry.links.readout.ggnn_readout import GGNNReadout from chainer_chemistry.links.update.relgat_update import RelGATUpdate class RelGAT(chainer.Chain): """Relational Graph Attention Networks (GAT) See: Veličković, Petar, et al. (2017).\ Graph Attention Networks.\ `arXiv:1701.10903 `\ Dan Busbridge, et al. (2018).\ Relational Graph Attention Networks ``\ Args: out_dim (int): dimension of output feature vector hidden_channels (int): dimension of feature vector for each node n_update_layers (int): number of layers n_atom_types (int): number of types of atoms concat_hidden (bool): If set to True, readout is executed in each layer and the result is concatenated dropout_ratio (float): dropout ratio of the normalized attention coefficients weight_tying (bool): enable weight_tying or not activation (~chainer.Function or ~chainer.FunctionNode): activate function n_edge_types (int): number of edge type. Defaults to 4 for single, double, triple and aromatic bond. n_heads (int): number of multi-head-attentions. negative_slope (float): LeakyRELU angle of the negative slope softmax_mode (str): take the softmax over the logits 'across' or 'within' relation. 
            If you would like to know the detail discussion, please refer
            Relational GAT paper.
        concat_heads (bool) : Whether to concat or average multi-head
            attentions
    """

    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,
                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,
                 dropout_ratio=-1., weight_tying=False,
                 activation=functions.identity, n_edge_types=4, n_heads=3,
                 negative_slope=0.2, softmax_mode='across',
                 concat_heads=False):
        super(RelGAT, self).__init__()
        n_readout_layer = n_update_layers if concat_hidden else 1
        n_message_layer = n_update_layers
        with self.init_scope():
            self.embed = EmbedAtomID(out_size=hidden_channels,
                                     in_size=n_atom_types)
            update_layers = []
            for i in range(n_message_layer):
                # When heads are concatenated, every layer after the first
                # receives `n_heads` times wider input.
                if i > 0 and concat_heads:
                    input_dim = hidden_channels * n_heads
                else:
                    input_dim = hidden_channels
                update_layers.append(
                    RelGATUpdate(input_dim, hidden_channels, n_heads=n_heads,
                                 n_edge_types=n_edge_types,
                                 dropout_ratio=dropout_ratio,
                                 negative_slope=negative_slope,
                                 softmax_mode=softmax_mode,
                                 concat_heads=concat_heads))
            self.update_layers = chainer.ChainList(*update_layers)
            # Readout consumes [h, h0] concatenated, hence the widths below.
            if concat_heads:
                in_channels = hidden_channels * (n_heads + 1)
            else:
                in_channels = hidden_channels * 2
            self.readout_layers = chainer.ChainList(*[GGNNReadout(
                out_dim=out_dim, in_channels=in_channels,
                activation=activation, activation_agg=activation)
                for _ in range(n_readout_layer)])
        self.out_dim = out_dim
        self.n_heads = n_heads
        self.hidden_channels = hidden_channels
        self.n_update_layers = n_update_layers
        self.concat_hidden = concat_hidden
        self.concat_heads = concat_heads
        self.weight_tying = weight_tying
        self.negative_slope = negative_slope
        self.n_edge_types = n_edge_types
        self.dropout_ratio = dropout_ratio

    def __call__(self, atom_array, adj):
        """Forward propagation

        Args:
            atom_array (numpy.ndarray): minibatch of molecular which is
                represented with atom IDs (representing C, O, S, ...)
                `atom_array[mol_index, atom_index]` represents
                `mol_index`-th molecule's `atom_index`-th atomic number
            adj (numpy.ndarray): minibatch of adjancency matrix with
                edge-type information

        Returns:
            ~chainer.Variable: minibatch of fingerprint
        """
        # reset state
        # int input means atom IDs (embed them); float input is taken as
        # already-computed node features.
        if atom_array.dtype == self.xp.int32:
            h = self.embed(atom_array)  # (minibatch, max_num_atoms)
        else:
            h = atom_array
        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)
        g_list = []
        for step in range(self.n_update_layers):
            message_layer_index = 0 if self.weight_tying else step
            h = self.update_layers[message_layer_index](h, adj)
            if self.concat_hidden:
                g = self.readout_layers[step](h, h0)
                g_list.append(g)

        if self.concat_hidden:
            return functions.concat(g_list, axis=1)
        else:
            g = self.readout_layers[0](h, h0)
            return g


================================================
FILE: chainer_chemistry/models/relgcn.py
================================================
import chainer
from chainer import functions, cuda  # NOQA
from chainer.links import Linear

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.links.readout.scatter_ggnn_readout import ScatterGGNNReadout  # NOQA
from chainer_chemistry.links import EmbedAtomID, GraphLinear  # NOQA
from chainer_chemistry.links.readout.ggnn_readout import GGNNReadout
from chainer_chemistry.links.update.relgcn_update import RelGCNUpdate, RelGCNSparseUpdate  # NOQA


def rescale_adj(adj):
    """Normalize adjacency matrix

    It ensures that activations are on a similar scale irrespective of
    the number of neighbors

    Args:
        adj (:class:`chainer.Variable`, or :class:`numpy.ndarray` \
        or :class:`cupy.ndarray`): adjacency matrix

    Returns:
        :class:`chainer.Variable`: normalized adjacency matrix
    """
    xp = cuda.get_array_module(adj)
    # Degree of each node, summed over edge types and source nodes.
    num_neighbors = functions.sum(adj, axis=(1, 2))
    base = xp.ones(num_neighbors.shape, dtype=xp.float32)
    # Guard against division by zero for isolated nodes.
    cond = num_neighbors.data != 0
    num_neighbors_inv = 1 / functions.where(cond, num_neighbors, base)
    return adj * functions.broadcast_to(
        num_neighbors_inv[:, None, None, :], adj.shape)


class RelGCN(chainer.Chain):
    """Relational GCN (RelGCN)

    See: Michael Schlichtkrull+, \
        Modeling Relational Data with Graph Convolutional Networks. \
        March 2017. \
        `arXiv:1703.06103 `

    Args:
        out_dim (int): dimension of output feature vector
        hidden_channels (None or int or list): dimension of feature vector
            for each node
        n_update_layers (int): number of layers
        n_atom_types (int): number of types of atoms
        n_edge_types (int): number of edge type.
            Defaults to 4 for single, double, triple and aromatic bond.
        scale_adj (bool): If ``True``, then this network normalizes
            adjacency matrix
    """

    def __init__(self, out_dim=64, hidden_channels=None, n_update_layers=None,
                 n_atom_types=MAX_ATOMIC_NUM, n_edge_types=4,
                 input_type='int', scale_adj=False):
        super(RelGCN, self).__init__()
        if hidden_channels is None:
            hidden_channels = [16, 128, 64]
        elif isinstance(hidden_channels, int):
            if not isinstance(n_update_layers, int):
                raise ValueError(
                    'Must specify n_update_layers when hidden_channels is int')
            hidden_channels = [hidden_channels] * n_update_layers
        with self.init_scope():
            if input_type == 'int':
                self.embed = EmbedAtomID(out_size=hidden_channels[0],
                                         in_size=n_atom_types)
            elif input_type == 'float':
                self.embed = GraphLinear(None, hidden_channels[0])
            else:
                raise ValueError("[ERROR] Unexpected value input_type={}"
                                 .format(input_type))
            self.rgcn_convs = chainer.ChainList(*[
                RelGCNUpdate(hidden_channels[i], hidden_channels[i + 1],
                             n_edge_types)
                for i in range(len(hidden_channels) - 1)])
            self.rgcn_readout = GGNNReadout(
                out_dim=out_dim, in_channels=hidden_channels[-1],
                nobias=True, activation=functions.tanh)
        # self.num_relations = num_edge_type
        self.input_type = input_type
        self.scale_adj = scale_adj

    def __call__(self, x, adj):
        """main calculation

        Args:
            x: (batchsize, num_nodes, in_channels)
            adj: (batchsize, num_edge_type, num_nodes, num_nodes)

        Returns: (batchsize, hidden_channels)
        """
        if x.dtype == self.xp.int32:
            assert self.input_type == 'int'
        else:
            assert self.input_type == 'float'
        h = self.embed(x)  # (minibatch, max_num_atoms)
        if self.scale_adj:
            adj = rescale_adj(adj)
        for rgcn_conv in self.rgcn_convs:
            h = functions.tanh(rgcn_conv(h, adj))
        h = self.rgcn_readout(h)
        return h


class RelGCNSparse(chainer.Chain):
    """Relational GCN (RelGCN) Sparse Pattern

    See: Michael Schlichtkrull+, \
        Modeling Relational Data with Graph Convolutional Networks. \
        March 2017. \
        `arXiv:1703.06103 `

    Args:
        out_dim (int): dimension of output feature vector
        hidden_channels (None or int or list): dimension of feature vector
            for each node
        n_update_layers (int): number of layers
        n_atom_types (int): number of types of atoms
        n_edge_types (int): number of edge type.
            Defaults to 4 for single, double, triple and aromatic bond.
        scale_adj (bool): If ``True``, then this network normalizes
            adjacency matrix
    """

    def __init__(self, out_dim=64, hidden_channels=None, n_update_layers=None,
                 n_atom_types=MAX_ATOMIC_NUM, n_edge_types=4,
                 input_type='int', scale_adj=False):
        super(RelGCNSparse, self).__init__()
        if hidden_channels is None:
            hidden_channels = [16, 128, 64]
        elif isinstance(hidden_channels, int):
            if not isinstance(n_update_layers, int):
                raise ValueError(
                    'Must specify n_update_layers when hidden_channels is int')
            hidden_channels = [hidden_channels] * n_update_layers
        with self.init_scope():
            if input_type == 'int':
                self.embed = EmbedAtomID(out_size=hidden_channels[0],
                                         in_size=n_atom_types)
            elif input_type == 'float':
                self.embed = Linear(None, hidden_channels[0])
            else:
                raise ValueError("[ERROR] Unexpected value input_type={}"
                                 .format(input_type))
            self.rgcn_convs = chainer.ChainList(*[
                RelGCNSparseUpdate(hidden_channels[i], hidden_channels[i + 1],
                                   n_edge_types)
                for i in range(len(hidden_channels) - 1)])
            self.rgcn_readout = ScatterGGNNReadout(
                out_dim=out_dim, in_channels=hidden_channels[-1],
                nobias=True, activation=functions.tanh)
        # self.num_relations = num_edge_type
        self.input_type = input_type
        self.scale_adj = scale_adj

    def __call__(self, sparse_batch):
        """main calculation

        Args:
            sparse_batch: sparse graph batch carrying `x` (node features),
                `edge_index`, `edge_attr` and `batch` (graph assignment).
                (Docstring fixed: it previously described dense `x`/`adj`
                arguments that this method does not take.)

        Returns: (batchsize, hidden_channels)
        """
        if sparse_batch.x.dtype == self.xp.int32:
            assert self.input_type == 'int'
        else:
            assert self.input_type == 'float'
        h = self.embed(sparse_batch.x)  # (minibatch, max_num_atoms)
        if self.scale_adj:
            raise NotImplementedError
        for rgcn_conv in self.rgcn_convs:
            h = functions.tanh(rgcn_conv(
                h, sparse_batch.edge_index, sparse_batch.edge_attr))
        h = self.rgcn_readout(h, sparse_batch.batch)
        return h


================================================
FILE: chainer_chemistry/models/rsgcn.py
================================================
import chainer
from chainer import functions, Variable  # NOQA

import chainer_chemistry
from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.links.readout.general_readout import GeneralReadout
from chainer_chemistry.links.update.rsgcn_update import RSGCNUpdate


class RSGCN(chainer.Chain):
    """Renormalized Spectral Graph Convolutional Network (RSGCN)

    See: Thomas N. Kipf and Max Welling, \
        Semi-Supervised Classification with Graph Convolutional Networks. \
        September 2016. \
        `arXiv:1609.02907 `_

    The name of this model "Renormalized Spectral Graph Convolutional Network
    (RSGCN)" is named by us rather than the authors of the paper above.
    The authors call this model just "Graph Convolution Network (GCN)", but
    we think that "GCN" is bit too general and may cause namespace issue.
    That is why we did not name this model as GCN.

    Args:
        out_dim (int): dimension of output feature vector
        hidden_channels (int): dimension of feature vector for each node
        n_update_layers (int): number of layers
        n_atom_types (int): number of types of atoms
        use_batch_norm (bool): If True, batch normalization is applied after
            graph convolution.
        readout (Callable): readout function. If None,
            `GeneralReadout(mode='sum')` is used.
        To the best of our knowledge, the paper of RSGCN model does not
        give any suggestion on readout.
        dropout_ratio (float): ratio used in dropout function.
            If 0 or negative value is set, dropout function is skipped.
    """

    def __init__(self, out_dim, hidden_channels=32, n_update_layers=4,
                 n_atom_types=MAX_ATOMIC_NUM, use_batch_norm=False,
                 readout=None, dropout_ratio=0.5):
        super(RSGCN, self).__init__()
        in_dims = [hidden_channels for _ in range(n_update_layers)]
        out_dims = [hidden_channels for _ in range(n_update_layers)]
        # Only the last convolution maps to `out_dim`.
        out_dims[n_update_layers - 1] = out_dim
        if readout is None:
            readout = GeneralReadout()
        with self.init_scope():
            self.embed = chainer_chemistry.links.EmbedAtomID(
                out_size=hidden_channels, in_size=n_atom_types)
            self.gconvs = chainer.ChainList(
                *[RSGCNUpdate(in_dims[i], out_dims[i])
                  for i in range(n_update_layers)])
            if use_batch_norm:
                self.bnorms = chainer.ChainList(
                    *[chainer_chemistry.links.GraphBatchNormalization(
                        out_dims[i]) for i in range(n_update_layers)])
            else:
                self.bnorms = [None for _ in range(n_update_layers)]
            # Register readout inside `init_scope` only when it is a Link
            # (so its parameters are tracked); otherwise store it as a
            # plain attribute.
            if isinstance(readout, chainer.Link):
                self.readout = readout
        if not isinstance(readout, chainer.Link):
            self.readout = readout
        self.out_dim = out_dim
        self.hidden_channels = hidden_channels
        self.n_update_layers = n_update_layers
        self.dropout_ratio = dropout_ratio

    def __call__(self, atom_array, adj, **kwargs):
        """Forward propagation

        Args:
            atom_array (numpy.ndarray): minibatch of molecular which is
                represented with atom IDs (representing C, O, S, ...)
                `atom_array[mol_index, atom_index]` represents
                `mol_index`-th molecule's `atom_index`-th atomic number
            adj (numpy.ndarray): minibatch of adjancency matrix
                `adj[mol_index]` represents `mol_index`-th molecule's
                adjacency matrix

        Returns:
            ~chainer.Variable: minibatch of fingerprint
        """
        if atom_array.dtype == self.xp.int32:
            # atom_array: (minibatch, nodes)
            h = self.embed(atom_array)
        else:
            h = atom_array
        # h: (minibatch, nodes, ch)

        # Detach the adjacency matrix from the graph; no gradient should
        # flow through it.
        if isinstance(adj, Variable):
            w_adj = adj.data
        else:
            w_adj = adj
        w_adj = Variable(w_adj, requires_grad=False)

        # --- RSGCN update ---
        for i, (gconv, bnorm) in enumerate(zip(self.gconvs, self.bnorms)):
            h = gconv(h, w_adj)
            if bnorm is not None:
                h = bnorm(h)
            if self.dropout_ratio > 0.:
                h = functions.dropout(h, ratio=self.dropout_ratio)
            # No ReLU after the final convolution.
            if i < self.n_update_layers - 1:
                h = functions.relu(h)
        # --- readout ---
        y = self.readout(h)
        return y


================================================
FILE: chainer_chemistry/models/schnet.py
================================================
import chainer
from chainer import functions

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.links import EmbedAtomID
from chainer_chemistry.links.readout.schnet_readout import SchNetReadout
from chainer_chemistry.links.update.schnet_update import SchNetUpdate


class SchNet(chainer.Chain):
    """SchNet

    See Kristof et al, \
        SchNet: A continuous-filter convolutional neural network for modeling
        quantum interactions. \
        `arXiv:1706.08566 `_

    Args:
        out_dim (int): dimension of output feature vector
        hidden_channels (int): dimension of feature vector for each node
        n_update_layers (int): number of layers
        readout_hidden_dim (int): dimension of feature vector
            associated to each molecule
        n_atom_types (int): number of types of atoms
        concat_hidden (bool): If set to True, readout is executed in each
            layer and the result is concatenated
        num_rbf (int): Number of RDF kernels used in `CFConv`.
        radius_resolution (float): Resolution of radius.
The range (radius_resolution * 1 ~ radius_resolution * num_rbf) are taken inside `CFConv`. gamma (float): exponential factor of `CFConv`'s radius kernel. """ def __init__(self, out_dim=1, hidden_channels=64, n_update_layers=3, readout_hidden_dim=32, n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False, num_rbf=300, radius_resolution=0.1, gamma=10.0): super(SchNet, self).__init__() with self.init_scope(): self.embed = EmbedAtomID(out_size=hidden_channels, in_size=n_atom_types) self.update_layers = chainer.ChainList( *[SchNetUpdate( hidden_channels, num_rbf=num_rbf, radius_resolution=radius_resolution, gamma=gamma) for _ in range(n_update_layers)]) self.readout_layer = SchNetReadout( out_dim, in_channels=None, hidden_channels=readout_hidden_dim) self.out_dim = out_dim self.hidden_channels = hidden_channels self.readout_hidden_dim = readout_hidden_dim self.n_update_layers = n_update_layers self.concat_hidden = concat_hidden def __call__(self, atom_features, dist_features): x = self.embed(atom_features) h = [] # --- update part --- for i in range(self.n_update_layers): x = self.update_layers[i](x, dist_features) if self.concat_hidden: h.append(x) # --- readout part --- if self.concat_hidden: x = functions.concat(h, axis=2) x = self.readout_layer(x) return x ================================================ FILE: chainer_chemistry/models/weavenet.py ================================================ import chainer from chainer import functions from chainer import links from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.config import WEAVE_DEFAULT_NUM_MAX_ATOMS from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID from chainer_chemistry.links.readout.general_readout import GeneralReadout WEAVENET_DEFAULT_WEAVE_CHANNELS = [50, ] class LinearLayer(chainer.Chain): def __init__(self, n_channel, n_layer): super(LinearLayer, self).__init__() with self.init_scope(): self.layers = chainer.ChainList( *[links.Linear(None, n_channel) for _ in 
                  range(n_layer)]
            )
        # Width of the final linear layer; used to restore the 3-d shape.
        self.n_output_channel = n_channel

    def forward(self, x):
        """Apply the stack of Linear+ReLU layers per atom.

        `x` is (n_batch, n_atom, n_channel); it is flattened so each atom
        row passes through the shared linear layers independently.
        """
        n_batch, n_atom, n_channel = x.shape
        x = functions.reshape(x, (n_batch * n_atom, n_channel))
        for l in self.layers:
            x = l(x)
            x = functions.relu(x)
        # Restore (n_batch, n_atom, ch) layout.
        x = functions.reshape(x, (n_batch, n_atom, self.n_output_channel))
        return x


class AtomToPair(chainer.Chain):
    """Builds pair features from atom features for WeaveNet.

    For every ordered atom pair (i, j) the features of atom i and atom j
    are concatenated and passed through shared linear layers; the results
    for (i, j) and (j, i) are summed so the output is order-invariant.
    """

    def __init__(self, n_channel, n_layer, n_atom):
        super(AtomToPair, self).__init__()
        with self.init_scope():
            self.linear_layers = chainer.ChainList(
                *[links.Linear(None, n_channel) for _ in range(n_layer)]
            )
        self.n_atom = n_atom
        self.n_channel = n_channel

    def forward(self, x):
        n_batch, n_atom, n_feature = x.shape
        # atom_repeat[b, i, j] == x[b, j]  (varies along axis 2)
        atom_repeat = functions.reshape(x, (n_batch, 1, n_atom, n_feature))
        atom_repeat = functions.broadcast_to(
            atom_repeat, (n_batch, n_atom, n_atom, n_feature))
        atom_repeat = functions.reshape(atom_repeat,
                                        (n_batch, n_atom * n_atom, n_feature))

        # atom_tile[b, i, j] == x[b, i]  (varies along axis 1)
        atom_tile = functions.reshape(x, (n_batch, n_atom, 1, n_feature))
        atom_tile = functions.broadcast_to(
            atom_tile, (n_batch, n_atom, n_atom, n_feature))
        atom_tile = functions.reshape(atom_tile,
                                      (n_batch, n_atom * n_atom, n_feature))

        # Pair features in (i, j) order ...
        pair_x0 = functions.concat((atom_tile, atom_repeat), axis=2)
        pair_x0 = functions.reshape(pair_x0,
                                    (n_batch * n_atom * n_atom, n_feature * 2))
        for l in self.linear_layers:
            pair_x0 = l(pair_x0)
            pair_x0 = functions.relu(pair_x0)
        pair_x0 = functions.reshape(pair_x0,
                                    (n_batch, n_atom * n_atom, self.n_channel))

        # ... and in (j, i) order, through the same shared layers.
        pair_x1 = functions.concat((atom_repeat, atom_tile), axis=2)
        pair_x1 = functions.reshape(pair_x1,
                                    (n_batch * n_atom * n_atom, n_feature * 2))
        for l in self.linear_layers:
            pair_x1 = l(pair_x1)
            pair_x1 = functions.relu(pair_x1)
        pair_x1 = functions.reshape(pair_x1,
                                    (n_batch, n_atom * n_atom, self.n_channel))
        # Sum makes the pair representation symmetric in (i, j).
        return pair_x0 + pair_x1


class PairToAtom(chainer.Chain):
    """Aggregates pair features back into per-atom features for WeaveNet."""

    def __init__(self, n_channel, n_layer, n_atom, mode='sum'):
        super(PairToAtom, self).__init__()
        with self.init_scope():
            self.linearLayer = chainer.ChainList(
                *[links.Linear(None, n_channel) for _ in range(n_layer)]
) self.readout = GeneralReadout(mode=mode) self.n_atom = n_atom self.n_channel = n_channel self.mode = mode def forward(self, x): n_batch, n_pair, n_feature = x.shape a = functions.reshape( x, (n_batch * (self.n_atom * self.n_atom), n_feature)) for l in self.linearLayer: a = l(a) a = functions.relu(a) a = functions.reshape(a, (n_batch, self.n_atom, self.n_atom, self.n_channel)) a = self.readout(a, axis=2) return a class WeaveModule(chainer.Chain): def __init__(self, n_atom, output_channel, n_sub_layer, readout_mode='sum'): super(WeaveModule, self).__init__() with self.init_scope(): self.atom_layer = LinearLayer(output_channel, n_sub_layer) self.pair_layer = LinearLayer(output_channel, n_sub_layer) self.atom_to_atom = LinearLayer(output_channel, n_sub_layer) self.pair_to_pair = LinearLayer(output_channel, n_sub_layer) self.atom_to_pair = AtomToPair(output_channel, n_sub_layer, n_atom) self.pair_to_atom = PairToAtom(output_channel, n_sub_layer, n_atom, mode=readout_mode) self.n_atom = n_atom self.n_channel = output_channel self.readout_mode = readout_mode def forward(self, atom_x, pair_x, atom_only=False): a0 = self.atom_to_atom.forward(atom_x) a1 = self.pair_to_atom.forward(pair_x) a = functions.concat([a0, a1], axis=2) next_atom = self.atom_layer.forward(a) next_atom = functions.relu(next_atom) if atom_only: return next_atom p0 = self.atom_to_pair.forward(atom_x) p1 = self.pair_to_pair.forward(pair_x) p = functions.concat([p0, p1], axis=2) next_pair = self.pair_layer.forward(p) next_pair = functions.relu(next_pair) return next_atom, next_pair class WeaveNet(chainer.Chain): """WeaveNet implementation Args: weave_channels (list): list of int, output dimension for each weave module hidden_dim (int): hidden dim n_atom (int): number of atom of input array n_sub_layer (int): number of layer for each `AtomToPair`, `PairToAtom` layer n_atom_types (int): number of atom id readout_mode (str): 'sum' or 'max' or 'summax' """ def __init__(self, weave_channels=None, 
hidden_dim=16, n_atom=WEAVE_DEFAULT_NUM_MAX_ATOMS, n_sub_layer=1, n_atom_types=MAX_ATOMIC_NUM, readout_mode='sum'): weave_channels = weave_channels or WEAVENET_DEFAULT_WEAVE_CHANNELS weave_module = [ WeaveModule(n_atom, c, n_sub_layer, readout_mode=readout_mode) for c in weave_channels ] super(WeaveNet, self).__init__() with self.init_scope(): self.embed = EmbedAtomID(out_size=hidden_dim, in_size=n_atom_types) self.weave_module = chainer.ChainList(*weave_module) self.readout = GeneralReadout(mode=readout_mode) self.readout_mode = readout_mode def __call__(self, atom_x, pair_x, train=True): if atom_x.dtype == self.xp.int32: # atom_array: (minibatch, atom) atom_x = self.embed(atom_x) for i in range(len(self.weave_module)): if i == len(self.weave_module) - 1: # last layer, only `atom_x` is needed. atom_x = self.weave_module[i].forward(atom_x, pair_x, atom_only=True) else: # not last layer, both `atom_x` and `pair_x` are needed atom_x, pair_x = self.weave_module[i].forward(atom_x, pair_x) x = self.readout(atom_x, axis=1) return x ================================================ FILE: chainer_chemistry/saliency/__init__.py ================================================ from chainer_chemistry.saliency import calculator # NOQA from chainer_chemistry.saliency import visualizer # NOQA ================================================ FILE: chainer_chemistry/saliency/calculator/__init__.py ================================================ from chainer_chemistry.saliency.calculator import base_calculator # NOQA from chainer_chemistry.saliency.calculator import calculator_utils # NOQA from chainer_chemistry.saliency.calculator import gradient_calculator # NOQA from chainer_chemistry.saliency.calculator import integrated_gradients_calculator # NOQA from chainer_chemistry.saliency.calculator import occlusion_calculator # NOQA from chainer_chemistry.saliency.calculator.base_calculator import BaseCalculator # NOQA from chainer_chemistry.saliency.calculator.gradient_calculator 
import GradientCalculator # NOQA from chainer_chemistry.saliency.calculator.integrated_gradients_calculator import IntegratedGradientsCalculator # NOQA from chainer_chemistry.saliency.calculator.occlusion_calculator import OcclusionCalculator # NOQA from chainer_chemistry.saliency.calculator.calculator_utils import GaussianNoiseSampler # NOQA ================================================ FILE: chainer_chemistry/saliency/calculator/base_calculator.py ================================================ from logging import getLogger import numpy import chainer from chainer import cuda from chainer.dataset.convert import concat_examples, _concat_arrays_with_padding # NOQA from chainer.iterators import SerialIterator from chainer_chemistry.link_hooks import is_link_hooks_available from tqdm import tqdm if is_link_hooks_available: from chainer import LinkHook from chainer_chemistry.link_hooks import VariableMonitorLinkHook _sampling_axis = 0 def _to_tuple(x): if not isinstance(x, tuple): x = (x,) return x def _to_variable(x): if not isinstance(x, chainer.Variable): x = chainer.Variable(x) return x def _extract_numpy(x): if isinstance(x, chainer.Variable): x = x.data return cuda.to_cpu(x) def _concat(batch_list): try: return numpy.concatenate(batch_list) except Exception as e: # NOQA # Thre is a case that each input has different shape, # we cannot concatenate into array in this case. elem_list = [elem for batch in batch_list for elem in batch] return _concat_arrays_with_padding(elem_list, padding=0) def add_linkhook(linkhook, prefix='', logger=None): link_hooks = chainer._get_link_hooks() name = prefix + linkhook.name if name in link_hooks: logger = logger or getLogger(__name__) logger.warning('hook {} already exists, overwrite.'.format(name)) pass # skip this case... 
# raise KeyError('hook %s already exists' % name) link_hooks[name] = linkhook linkhook.added(None) return linkhook def delete_linkhook(linkhook, prefix='', logger=None): name = prefix + linkhook.name link_hooks = chainer._get_link_hooks() if name not in link_hooks.keys(): logger = logger or getLogger(__name__) logger.warning('linkhook {} is not registered'.format(name)) return link_hooks[name].deleted(None) del link_hooks[name] class BaseCalculator(object): """Base class for saliency calculator Use `compute`, `aggregate` method to calculate saliency. This base class supports to calculate SmoothGrad[1] and BayesGrad[2] of concrete subclass. See: Daniel Smilkov, Nikhil Thorat, Been Kim, Fernanda Viegas, and Martin Wattenberg. SmoothGrad: removing noise by adding noise. `arXiv:1706.03825 `_ See: Akita, Hirotaka and Nakago, Kosuke and Komatsu, Tomoki and Sugawara, Yohei and Maeda, Shin-ichi and Baba, Yukino and Kashima, Hisashi BayesGrad: Explaining Predictions of Graph Convolutional Networks `arXiv:1807.01985 `_ Args: model (chainer.Chain): target model to calculate saliency. target_extractor (VariableMonitorLinkHook or None): It determines `target_var`, target variable to calculate saliency. If `None`, first argument of input to the model is treated as `target_var`. output_extractor (VariableMonitorLinkHook or None): It determines `output_var`, output variable to calculate saliency. If `None`, output of the model is treated as `output_var`. device (int or None): device id to calculate saliency. If `None`, device id is inferred automatically from `model`. 
logger: """ def __init__(self, model, target_extractor=None, output_extractor=None, device=None, logger=None): self.model = model # type: chainer.Chain if device is not None: self._device = device else: self._device = cuda.get_device_from_array(*model.params()).id self.target_extractor = target_extractor self.output_extractor = output_extractor self.logger = logger or getLogger(__name__) def compute(self, data, M=1, batchsize=16, converter=concat_examples, retain_inputs=False, preprocess_fn=None, postprocess_fn=None, train=False, noise_sampler=None, show_progress=True): """computes saliency_samples Args: data: dataset to calculate saliency M (int): sampling size. `M > 1` may be set with SmoothGrad or BayesGrad configuration. See `train` and `noise_sampler` description. batchsize (int): batch size converter (function): converter to make batch from `data` retain_inputs (bool): retain input flag preprocess_fn (function or None): preprocess function postprocess_fn (function or None): postprocess function train (bool): chainer.config.train flag. When the `model` contains `dropout` (or other stochastic) function, `train=True` corresponds to calculate BayesGrad. noise_sampler: noise sampler class with `sample` method. If this is set, noise is added to `target_var`. It can be used to calculate SmoothGrad. If `None`, noise is not sampled. show_progress (bool): Show progress bar or not. Returns: saliency_samples (numpy.ndarray): M samples of saliency array. Its shape is (M,) + target_var.shape, i.e., sampling axis is added to the first axis. 
""" saliency_list = [] for _ in tqdm(range(M), disable=not show_progress): with chainer.using_config('train', train): saliency = self._forward( data, batchsize=batchsize, converter=converter, retain_inputs=retain_inputs, preprocess_fn=preprocess_fn, postprocess_fn=postprocess_fn, noise_sampler=noise_sampler) saliency_array = cuda.to_cpu(saliency) saliency_list.append(saliency_array) return numpy.stack(saliency_list, axis=_sampling_axis) def aggregate(self, saliency_arrays, method='raw', ch_axis=None): """Aggregate saliency samples into one saliency score. Args: saliency_arrays (numpy.ndarray): M samples of saliency array calculated by `compute` method. method (str): It supports following methods for aggregation. raw: simply take mean of samples. absolute: calc absolute mean of samples. square: calc squared mean of samples. ch_axis (int, tuple or None): channel axis. The ch_axis is considered as reduced axis for saliency calculation. Returns: saliency (numpy.ndarray): saliency score """ if method == 'raw': h = saliency_arrays # do nothing elif method == 'abs': h = numpy.abs(saliency_arrays) elif method == 'square': h = saliency_arrays ** 2 else: raise ValueError("[ERROR] Unexpected value method={}" .format(method)) if ch_axis is not None: h = numpy.sum(h, axis=ch_axis) sampling_axis = _sampling_axis return numpy.mean(h, axis=sampling_axis) def _compute_core(self, *inputs): """Core computation routine Each concrete subclass should implement this method """ raise NotImplementedError def get_target_var(self, inputs): if isinstance(self.target_extractor, VariableMonitorLinkHook): target_var = self.target_extractor.get_variable() else: if isinstance(inputs, tuple): target_var = inputs[0] else: target_var = inputs if target_var is None: self.logger.warning( 'target_var is None. 
This may be caused because "model" is not' ' forwarded in advance or "model" does not implement "forward"' ' method and LinkHook is not triggered.') return target_var def get_output_var(self, outputs): if isinstance(self.output_extractor, VariableMonitorLinkHook): output_var = self.output_extractor.get_variable() else: output_var = outputs if output_var is None: self.logger.warning( 'output_var is None. This may be caused because "model" is not' ' forwarded in advance or "model" does not implement "forward"' ' method and LinkHook is not triggered.') return output_var def _forward(self, data, batchsize=16, converter=concat_examples, retain_inputs=False, preprocess_fn=None, postprocess_fn=None, noise_sampler=None): """Forward data by iterating with batch Args: data: "train_x array" or "chainer dataset" batchsize (int): batch size converter (Callable): convert from `data` to `inputs` retain_inputs (bool): If True, this instance keeps inputs in `self.inputs` or not. preprocess_fn (Callable): Its input is numpy.ndarray or cupy.ndarray, it can return either Variable, cupy.ndarray or numpy.ndarray postprocess_fn (Callable): Its input argument is Variable, but this method may return either Variable, cupy.ndarray or numpy.ndarray. 
Returns (tuple or numpy.ndarray): forward result """ input_list = None output_list = None it = SerialIterator(data, batch_size=batchsize, repeat=False, shuffle=False) if isinstance(self.target_extractor, LinkHook): add_linkhook(self.target_extractor, prefix='/saliency/target/', logger=self.logger) if isinstance(self.output_extractor, LinkHook): add_linkhook(self.output_extractor, prefix='/saliency/output/', logger=self.logger) for batch in it: inputs = converter(batch, self._device) inputs = _to_tuple(inputs) if preprocess_fn: inputs = preprocess_fn(*inputs) inputs = _to_tuple(inputs) inputs = [_to_variable(x) for x in inputs] # --- Main saliency computation ---- if noise_sampler is None: # VanillaGrad computation outputs = self._compute_core(*inputs) else: # SmoothGrad computation if self.target_extractor is None: # inputs[0] is considered as "target_var" noise = noise_sampler.sample(inputs[0].array) inputs[0].array += noise outputs = self._compute_core(*inputs) else: # Add process to LinkHook def add_noise(hook, args, target_var): noise = noise_sampler.sample(target_var.array) target_var.array += noise self.target_extractor.add_process('/saliency/add_noise', add_noise) outputs = self._compute_core(*inputs) self.target_extractor.delete_process('/saliency/add_noise') # --- Main saliency computation end --- # Init if retain_inputs: if input_list is None: input_list = [[] for _ in range(len(inputs))] for j, input in enumerate(inputs): input_list[j].append(cuda.to_cpu(input)) if output_list is None: output_list = [[] for _ in range(len(outputs))] if postprocess_fn: outputs = postprocess_fn(*outputs) outputs = _to_tuple(outputs) for j, output in enumerate(outputs): output_list[j].append(_extract_numpy(output)) if isinstance(self.target_extractor, LinkHook): delete_linkhook(self.target_extractor, prefix='/saliency/target/', logger=self.logger) if isinstance(self.output_extractor, LinkHook): delete_linkhook(self.output_extractor, prefix='/saliency/output/', 
logger=self.logger) if retain_inputs: self.inputs = [numpy.concatenate( in_array) for in_array in input_list] result = [_concat(output) for output in output_list] if len(result) == 1: return result[0] else: self.logger.error('return multiple result handling is not ' 'implemented yet and not supported.') return result ================================================ FILE: chainer_chemistry/saliency/calculator/calculator_utils.py ================================================ from chainer import cuda class GaussianNoiseSampler(object): """Default noise sampler class to calculate SmoothGrad""" def __init__(self, mode='relative', scale=0.15): self.mode = mode self.scale = scale def sample(self, target_array): xp = cuda.get_array_module(target_array) noise = xp.random.normal( 0, self.scale, target_array.shape) if self.mode == 'absolute': # `scale` is used as is pass elif self.mode == 'relative': # `scale_axis` is used to calculate `max` and `min` of target_array # As default, all axes except batch axis are used. scale_axis = tuple(range(1, target_array.ndim)) vmax = xp.max(target_array, axis=scale_axis, keepdims=True) vmin = xp.min(target_array, axis=scale_axis, keepdims=True) noise = noise * (vmax - vmin) else: raise ValueError("[ERROR] Unexpected value mode={}" .format(self.mode)) return noise ================================================ FILE: chainer_chemistry/saliency/calculator/gradient_calculator.py ================================================ import chainer # NOQA from chainer import functions from chainer_chemistry.saliency.calculator.base_calculator import BaseCalculator # NOQA class GradientCalculator(BaseCalculator): """Gradient saliency calculator Use `compute`, `aggregate` method to calculate saliency. See: Dumitru Erhan, Yoshua Bengio, Aaron Courville, Pascal Vincent (2009). Visualizing Higher-Layer Features of a Deep Network. See: Karen Simonyan, Andrea Vedaldi, and Andrew Zisserman. 
Deep inside convolutional networks: Visualising image classication models and saliency maps. `arXiv:1312.6034 `_ Args: model (chainer.Chain): target model to calculate saliency. target_extractor (VariableMonitorLinkHook or None): It determines `target_var`, target variable to calculate saliency. If `None`, first argument of input to the model is treated as `target_var`. output_extractor (VariableMonitorLinkHook or None): It determines `output_var`, output variable to calculate saliency. If `None`, output of the model is treated as `output_var`. eval_fun (callable): If multiply_target (bool): If `False`, return value is `target_var.grad`. If `True`, return value is `target_var.grad * target_var`. device (int or None): device id to calculate saliency. If `None`, device id is inferred automatically from `model`. """ def __init__(self, model, target_extractor=None, output_extractor=None, eval_fun=None, multiply_target=False, device=None): super(GradientCalculator, self).__init__( model, target_extractor=target_extractor, output_extractor=output_extractor, device=device) self.eval_fun = eval_fun or model.__call__ self.multiply_target = multiply_target def _compute_core(self, *inputs): self.model.cleargrads() outputs = self.eval_fun(*inputs) target_var = self.get_target_var(inputs) target_var.grad = None # Need to reset grad beforehand of backward. output_var = self.get_output_var(outputs) # --- type check for output_var --- if output_var.size != 1: self.logger.warning( 'output_var.size is not 1, calculate scalar value. 
' 'functions.sum is applied.') output_var = functions.sum(output_var) output_var.backward(retain_grad=True) saliency = target_var.grad if self.multiply_target: saliency *= target_var.data outputs = (saliency,) return outputs ================================================ FILE: chainer_chemistry/saliency/calculator/integrated_gradients_calculator.py ================================================ import numpy from chainer_chemistry.saliency.calculator.gradient_calculator import GradientCalculator # NOQA class IntegratedGradientsCalculator(GradientCalculator): """Integrated gradient saliency calculator Use `compute`, `aggregate` method to calculate saliency. See: Mukund Sundararajan, Ankur Taly, and Qiqi Yan (2017). Axiomatic attribution for deep networks. PMLR. URL http://proceedings.mlr.press/v70/sundararajan17a.html. Args: model (chainer.Chain): target model to calculate saliency. target_extractor (VariableMonitorLinkHook or None): It determines `target_var`, target variable to calculate saliency. If `None`, first argument of input to the model is treated as `target_var`. output_extractor (VariableMonitorLinkHook or None): It determines `output_var`, output variable to calculate saliency. If `None`, output of the model is treated as `output_var`. eval_fun (callable): If baseline (numpy.ndarray or None): If `None`, baseline is set as 0. steps (int): Number of separation to calculate integrated gradient. device (int or None): device id to calculate saliency. If `None`, device id is inferred automatically from `model`. """ def __init__(self, model, target_extractor=None, output_extractor=None, eval_fun=None, baseline=None, steps=25, device=None): super(IntegratedGradientsCalculator, self).__init__( model, target_extractor=target_extractor, output_extractor=output_extractor, multiply_target=False, eval_fun=eval_fun, device=device) self.baseline = baseline or 0. self.steps = steps def _compute_core(self, *inputs): total_grads = 0. 
self.model.cleargrads() # Need to forward once to get target_var outputs = self.eval_fun(*inputs) # NOQA target_var = self.get_target_var(inputs) # output_var = self.get_output_var(outputs) base = self.baseline diff = target_var.array - base for alpha in numpy.linspace(0., 1., self.steps): if self.target_extractor is None: interpolated_inputs = base + alpha * diff inputs[0].array = interpolated_inputs total_grads += super( IntegratedGradientsCalculator, self)._compute_core( *inputs)[0] else: def interpolate_target_var(hook, args, _target_var): interpolated_inputs = base + alpha * diff _target_var.array[:] = interpolated_inputs self.target_extractor.add_process( '/saliency/interpolate_target_var', interpolate_target_var) total_grads += super( IntegratedGradientsCalculator, self)._compute_core( *inputs)[0] self.target_extractor.delete_process( '/saliency/interpolate_target_var') saliency = total_grads * diff / self.steps return saliency, ================================================ FILE: chainer_chemistry/saliency/calculator/occlusion_calculator.py ================================================ import itertools import six import chainer from chainer import cuda from chainer_chemistry.saliency.calculator.base_calculator import BaseCalculator # NOQA def _to_tuple(x): if isinstance(x, int): x = (x,) elif isinstance(x, (list, tuple)): x = tuple(x) else: raise TypeError('Unexpected type {}'.format(type(x))) return x class OcclusionCalculator(BaseCalculator): """Occlusion saliency calculator Use `compute`, `aggregate` method to calculate saliency. See: Matthew D Zeiler and Rob Fergus (2014). Visualizing and understanding convolutional networks. In European conference on computer vision, pp. 818-833. Springer. Args: model (chainer.Chain): target model to calculate saliency. target_extractor (VariableMonitorLinkHook or None): It determines `target_var`, target variable to calculate saliency. If `None`, first argument of input to the model is treated as `target_var`. 
output_extractor (VariableMonitorLinkHook or None): It determines `output_var`, output variable to calculate saliency. If `None`, output of the model is treated as `output_var`. eval_fun (callable): If enable_backprop (bool): chainer.config.enable_backprop option. size (int or tuple): occlusion window size. If `int`, window has same size along `slide_axis`. If `tuple`, its length must be same with `slide_axis`. slide_axis (int or tuple): slide axis which occlusion window moves. device (int or None): device id to calculate saliency. If `None`, device id is inferred automatically from `model`. """ def __init__(self, model, target_extractor=None, output_extractor=None, eval_fun=None, device=None, enable_backprop=False, size=1, slide_axis=(2, 3)): super(OcclusionCalculator, self).__init__( model, target_extractor=target_extractor, output_extractor=output_extractor, device=device) self.eval_fun = eval_fun or model.__call__ self.enable_backprop = enable_backprop self.slide_axis = _to_tuple(slide_axis) size = _to_tuple(size) if len(self.slide_axis) != size: size = size * len(self.slide_axis) self.size = size def _compute_core(self, *inputs): # Usually, backward() is not necessary for calculating occlusion with chainer.using_config('enable_backprop', self.enable_backprop): original_result = self.eval_fun(*inputs) target_var = self.get_target_var(inputs) original_target_array = target_var.array.copy() original_score = self.get_output_var(original_result) xp = cuda.get_array_module(target_var.array) value = 0. 
# fill with `value` target_dim = target_var.ndim batch_size = target_var.shape[0] occlusion_window_shape = [1] * target_dim occlusion_window_shape[0] = batch_size for axis, size in zip(self.slide_axis, self.size): occlusion_window_shape[axis] = size occlusion_scores_shape = [1] * target_dim occlusion_scores_shape[0] = batch_size for axis, size in zip(self.slide_axis, self.size): occlusion_scores_shape[axis] = target_var.shape[axis] occlusion_window = xp.ones(occlusion_window_shape, dtype=target_var.dtype) * value occlusion_scores = xp.zeros(occlusion_scores_shape, dtype=xp.float32) def _extract_index(slide_axis, size, start_indices): colon = slice(None) index = [colon] * target_dim for axis, size, start in zip(slide_axis, size, start_indices): index[axis] = slice(start, start + size, 1) return tuple(index) end_list = [target_var.data.shape[axis] - size + 1 for axis, size in zip(self.slide_axis, self.size)] for start in itertools.product(*[six.moves.range(end) for end in end_list]): occlude_index = _extract_index(self.slide_axis, self.size, start) if self.target_extractor is None: inputs[0].array = original_target_array.copy() inputs[0].array[occlude_index] = occlusion_window with chainer.using_config('enable_backprop', self.enable_backprop): occluded_result = self.eval_fun(*inputs) else: def mask_target_var(hook, args, _target_var): _target_var.array = original_target_array.copy() _target_var.array[occlude_index] = occlusion_window self.target_extractor.add_process( '/saliency/mask_target_var', mask_target_var) with chainer.using_config('enable_backprop', self.enable_backprop): occluded_result = self.eval_fun(*inputs) self.target_extractor.delete_process( '/saliency/mask_target_var') occluded_score = self.get_output_var(occluded_result) score_diff_var = original_score - occluded_score # (bs, 1) # expand_dim for ch_axis score_diff = xp.reshape(score_diff_var.array, occlusion_window_shape) occlusion_scores[occlude_index] += score_diff outputs = (occlusion_scores,) 
return outputs ================================================ FILE: chainer_chemistry/saliency/visualizer/__init__.py ================================================ from chainer_chemistry.saliency.visualizer import base_visualizer # NOQA from chainer_chemistry.saliency.visualizer import image_visualizer # NOQA from chainer_chemistry.saliency.visualizer import mol_visualizer # NOQA from chainer_chemistry.saliency.visualizer import table_visualizer # NOQA from chainer_chemistry.saliency.visualizer import visualizer_utils # NOQA from chainer_chemistry.saliency.visualizer.base_visualizer import BaseVisualizer # NOQA from chainer_chemistry.saliency.visualizer.image_visualizer import ImageVisualizer # NOQA from chainer_chemistry.saliency.visualizer.mol_visualizer import MolVisualizer # NOQA from chainer_chemistry.saliency.visualizer.mol_visualizer import SmilesVisualizer # NOQA from chainer_chemistry.saliency.visualizer.table_visualizer import TableVisualizer # NOQA from chainer_chemistry.saliency.visualizer.visualizer_utils import abs_max_scaler # NOQA from chainer_chemistry.saliency.visualizer.visualizer_utils import min_max_scaler # NOQA from chainer_chemistry.saliency.visualizer.visualizer_utils import normalize_scaler # NOQA from chainer_chemistry.saliency.visualizer.visualizer_utils import red_blue_cmap # NOQA ================================================ FILE: chainer_chemistry/saliency/visualizer/base_visualizer.py ================================================ class BaseVisualizer(object): """Base saliency visualizer""" def visualize(self, *args, **kwargs): """Main visualization routine Each concrete subclass should implement this method """ raise NotImplementedError ================================================ FILE: chainer_chemistry/saliency/visualizer/image_visualizer.py ================================================ from logging import getLogger import matplotlib.cm as cm import matplotlib.pyplot as plt import numpy from chainer import cuda 
from chainer_chemistry.saliency.visualizer.base_visualizer import BaseVisualizer # NOQA from chainer_chemistry.saliency.visualizer.visualizer_utils import abs_max_scaler # NOQA class ImageVisualizer(BaseVisualizer): """Saliency visualizer for image data Args: logger: """ def __init__(self, logger=None): self.logger = logger or getLogger(__name__) def visualize(self, saliency, image=None, save_filepath=None, scaler=abs_max_scaler, title='Image saliency map', cmap=cm.jet, alpha=0.5, show_colorbar=False, bbox_inches='tight'): """Visualize or save `saliency` of image. Args: saliency (numpy.ndarray): Saliency array. Must be either 2-dim (h, w) or 3-dim (ch, h, w). image (numpy.ndarray or PIL.Image or None): If set, image is drawn in background, and saliency is shown in foreground. If numpy array, must be in the order of 2-dim (h, w) or 3-dim (ch, h, w). save_filepath (str or None): If specified, file is saved to path. scaler (callable): function which takes `x` as input and outputs scaled `x`, for plotting. title (str or None): title of plot cmap: color map used to plot saliency alpha (float): alpha value of fore ground saliency. This option is used only when `image` is set. show_colorbar (bool): show colorbar in plot or not. bbox_inches (str or Bbox or None): used for `plt.savefig` option. 
""" # --- type check --- if saliency.ndim == 3: # (ch, h, w) -> (h, w, ch) saliency = cuda.to_cpu(saliency) saliency_image = numpy.transpose(saliency, (1, 2, 0)) elif saliency.ndim == 2: # (h, w) saliency_image = saliency else: raise ValueError("[ERROR] Unexpected value saliency.shape={}" .format(saliency.shape)) if image is not None: # If `image` is PIL Image, convert to numpy array image = numpy.asarray(image) if image.ndim == 3: # Convert to (h, w, ch) order if image.shape[0] == 3 or image.shape[0] == 4: # Assume (ch, h, w) order -> (h, w, ch) image = numpy.transpose(image, (1, 2, 0)) elif image.ndim == 2: # (h, w) order pass else: raise ValueError("[ERROR] Unexpected value image.shape={}" .format(image.shape)) if image.shape[:2] != saliency_image.shape[:2]: self.logger.warning( 'saliency and image height or width is different\n' 'saliency_image.shape {}, image.shape [}' .format(saliency_image.shape, image.shape)) # Normalize to [-1, 1] or [0, 1] if scaler is not None: saliency_image = scaler(saliency_image) fig = plt.figure() plt.clf() if title is not None: plt.title(title) if image is None: # Only show saliency image, not set alpha im = plt.imshow(saliency_image, cmap=cmap) else: # Show original image, and overlay saliency image with alpha plt.imshow(image) im = plt.imshow(saliency_image, alpha=alpha, cmap=cmap) if show_colorbar: fig.colorbar(im) if save_filepath: plt.savefig(save_filepath, bbox_inches=bbox_inches) else: plt.show() ================================================ FILE: chainer_chemistry/saliency/visualizer/mol_visualizer.py ================================================ from logging import getLogger import numpy from rdkit import Chem from rdkit.Chem.Draw import rdMolDraw2D from rdkit.Chem import rdDepictor from chainer_chemistry.saliency.visualizer.base_visualizer import BaseVisualizer # NOQA from chainer_chemistry.saliency.visualizer.visualizer_utils import red_blue_cmap, abs_max_scaler # NOQA def _convert_to_2d(axes, nrows, ncols): if 
    if nrows == 1 and ncols == 1:
        # single Axes object -> wrap into a 2-dim (1, 1) array
        axes = numpy.array([[axes]])
    elif nrows == 1:
        # 1-dim row of Axes -> add a leading axis
        axes = axes[None, :]
    elif ncols == 1:
        # 1-dim column of Axes -> add a trailing axis
        axes = axes[:, None]
    else:
        # already a 2-dim grid of Axes
        pass
    assert axes.ndim == 2
    return axes


def is_visible(begin, end):
    # Map the two endpoint atom saliencies of a bond to one value in [0, 1]:
    # 0 when either endpoint is non-positive, 1 when either endpoint
    # saturates (>= 1), otherwise the mean of both endpoints.
    if begin <= 0 or end <= 0:
        return 0
    elif begin >= 1 or end >= 1:
        return 1
    else:
        return (begin + end) * 0.5


class MolVisualizer(BaseVisualizer):
    """Saliency visualizer for mol data

    Args:
        logger:
    """

    def __init__(self, logger=None):
        self.logger = logger or getLogger(__name__)

    def visualize(self, saliency, mol, save_filepath=None,
                  visualize_ratio=1.0, color_fn=red_blue_cmap,
                  scaler=abs_max_scaler, legend='', raise_import_error=False
                  ):
        """Visualize or save `saliency` with molecule

        returned value can be used for visualization.

        .. admonition:: Example

            >>> svg = visualizer.visualize(saliency, mol)
            >>>
            >>> # For a Jupyter user, it will show figure on notebook.
            >>> from IPython.core.display import SVG
            >>> SVG(svg.replace('svg:', ''))
            >>>
            >>> # For a user who want to save a file as png
            >>> import cairosvg
            >>> cairosvg.svg2png(bytestring=svg, write_to="foo.png")

        Args:
            saliency (numpy.ndarray): 1-dim saliency array (num_node,)
            mol (Chem.Mol): mol instance of this saliency
            save_filepath (str or None): If specified, file is saved to path.
            visualize_ratio (float): If set, only plot saliency color of
                top-X atoms.
            color_fn (callable): color function to show saliency
            scaler (callable): function which takes `x` as input and outputs
                scaled `x`, for plotting.
            legend (str): legend for the plot
            raise_import_error (bool): raise error when `ImportError` is
                raised

        Returns:
            svg (str): drawed svg text.
""" rdDepictor.Compute2DCoords(mol) Chem.SanitizeMol(mol) Chem.Kekulize(mol) num_atoms = mol.GetNumAtoms() # --- type check --- if saliency.ndim != 1: raise ValueError("Unexpected value saliency.shape={}" .format(saliency.shape)) # Cut saliency array for unnecessary tail part saliency = saliency[:num_atoms] if scaler is not None: # Normalize to [-1, 1] or [0, 1] saliency = scaler(saliency) abs_saliency = numpy.abs(saliency) if visualize_ratio < 1.0: threshold_index = int(num_atoms * visualize_ratio) idx = numpy.argsort(abs_saliency) idx = numpy.flip(idx, axis=0) # set threshold to top `visualize_ratio` saliency threshold = abs_saliency[idx[threshold_index]] saliency = numpy.where(abs_saliency < threshold, 0., saliency) else: threshold = numpy.min(saliency) highlight_atoms = list(map(lambda g: g.__int__(), numpy.where( abs_saliency >= threshold)[0])) atom_colors = {i: color_fn(e) for i, e in enumerate(saliency)} bondlist = [bond.GetIdx() for bond in mol.GetBonds()] def color_bond(bond): begin = saliency[bond.GetBeginAtomIdx()] end = saliency[bond.GetEndAtomIdx()] return color_fn(is_visible(begin, end)) bondcolorlist = {i: color_bond(bond) for i, bond in enumerate(mol.GetBonds())} drawer = rdMolDraw2D.MolDraw2DSVG(500, 375) drawer.DrawMolecule( mol, highlightAtoms=highlight_atoms, highlightAtomColors=atom_colors, highlightBonds=bondlist, highlightBondColors=bondcolorlist, legend=legend) drawer.FinishDrawing() svg = drawer.GetDrawingText() if save_filepath: extention = save_filepath.split('.')[-1] if extention == 'svg': with open(save_filepath, 'w') as f: f.write(svg) elif extention == 'png': # TODO(nakago): check it is possible without cairosvg or not try: import cairosvg cairosvg.svg2png(bytestring=svg, write_to=save_filepath) except ImportError as e: self.logger.error( 'cairosvg is not installed! 
' 'Please install cairosvg to save by png format.\n' 'pip install cairosvg') if raise_import_error: raise e else: raise ValueError( 'Unsupported extention {} for save_filepath {}' .format(extention, save_filepath)) return svg class SmilesVisualizer(MolVisualizer): def visualize(self, saliency, smiles, save_filepath=None, visualize_ratio=1.0, color_fn=red_blue_cmap, scaler=abs_max_scaler, legend='', add_Hs=False, use_canonical_smiles=True, raise_import_error=False): """Visualize or save `saliency` with molecule See parent `MolVisualizer` class for further usage. Args: saliency (numpy.ndarray): 1-dim saliency array (num_node,) smiles (str): smiles of the molecule. save_filepath (str or None): If specified, file is saved to path. visualize_ratio (float): If set, only plot saliency color of top-X atoms. color_fn (callable): color function to show saliency scaler (callable): function which takes `x` as input and outputs scaled `x`, for plotting. legend (str): legend for the plot add_Hs (bool): Add explicit H or not use_canonical_smiles (bool): If `True`, smiles are converted to canonical smiles before constructing `mol` raise_import_error (bool): raise error when `ImportError` is raised Returns: svg (str): drawed svg text. 
""" mol = Chem.MolFromSmiles(smiles) if use_canonical_smiles: smiles = Chem.MolToSmiles(mol, canonical=True) mol = Chem.MolFromSmiles(smiles) if add_Hs: mol = Chem.AddHs(mol) return super(SmilesVisualizer, self).visualize( saliency, mol, save_filepath=save_filepath, visualize_ratio=visualize_ratio, color_fn=color_fn, scaler=scaler, legend=legend, raise_import_error=raise_import_error) ================================================ FILE: chainer_chemistry/saliency/visualizer/table_visualizer.py ================================================ import matplotlib.pyplot as plt import numpy from chainer_chemistry.saliency.visualizer.base_visualizer import BaseVisualizer # NOQA from chainer_chemistry.saliency.visualizer.visualizer_utils import abs_max_scaler # NOQA class TableVisualizer(BaseVisualizer): """Saliency visualizer for table data""" def visualize(self, saliency, feature_names=None, save_filepath=None, num_visualize=-1, scaler=abs_max_scaler, sort='descending', title='Feature Importance', color='b', xlabel='Importance', bbox_inches='tight'): """Visualize or save `saliency` in bar plot. Args: saliency (numpy.ndarray): 1-dim saliency array (num_feature,) feature_names (list or numpy.ndarray): Feature names of `saliency` save_filepath (str or None): If specified, file is saved to path. num_visualize (int): If positive value is set, only plot specified number of features. scaler (callable): function which takes `x` as input and outputs scaled `x`, for plotting. sort (str): Below sort options are supported. none: not sort ascending: plot in ascending order descending: plot in descending order title (str or None): title of plot color (str): color of bar in plot xlabel (str): x label legend bbox_inches (str or Bbox or None): used for `plt.savefig` option. 
""" # --- type check --- if saliency.ndim != 1: raise ValueError("[ERROR] Unexpected value saliency.shape={}" .format(saliency.shape)) num_total_feat = saliency.shape[0] if feature_names is not None: # type check if len(feature_names) != num_total_feat: raise ValueError( "feature_names={} must have same length with `saliency`" .format(feature_names)) else: feature_names = numpy.arange(num_total_feat) if sort == 'none': indices = numpy.arange(num_total_feat) elif sort == 'ascending': indices = numpy.argsort(saliency)[::-1] elif sort == 'descending': indices = numpy.argsort(saliency) else: raise ValueError("[ERROR] Unexpected value sort={}".format(sort)) saliency = saliency[indices] feature_names = numpy.asarray(feature_names)[indices] if scaler is not None: # Normalize to [-1, 1] or [0, 1] saliency = scaler(saliency) if num_visualize > 0: saliency = saliency[:num_visualize] if feature_names is not None: feature_names = feature_names[:num_visualize] else: num_visualize = num_total_feat plt.figure() plt.clf() if title is not None: plt.title(title) plt.barh(range(num_visualize), saliency, color=color, align='center') plt.yticks(range(num_visualize), feature_names) plt.xlabel(xlabel) if save_filepath: plt.savefig(save_filepath, bbox_inches=bbox_inches) else: plt.show() ================================================ FILE: chainer_chemistry/saliency/visualizer/visualizer_utils.py ================================================ from logging import getLogger import numpy # NOQA from chainer import cuda def red_blue_cmap(x): """Red to Blue color map Args: x (float): value between -1 ~ 1, represents normalized saliency score Returns (tuple): tuple of 3 float values representing R, G, B. """ if x > 0: # Red for positive value # x=0 -> 1, 1, 1 (white) # x=1 -> 1, 0, 0 (red) return 1., 1. - x, 1. - x else: # Blue for negative value x *= -1 return 1. - x, 1. - x, 1. 
def min_max_scaler(saliency, logger=None): """Normalize saliency to value 0~1 Args: saliency (numpy.ndarray or cupy.ndarray): saliency array logger: Returns (numpy.ndarray or cupy.ndarray): normalized saliency array """ xp = cuda.get_array_module(saliency) maxv = xp.max(saliency) minv = xp.min(saliency) if maxv == minv: logger = logger or getLogger(__name__) logger.info('All saliency value is 0') saliency = xp.zeros_like(saliency) else: saliency = (saliency - minv) / (maxv - minv) return saliency def abs_max_scaler(saliency, logger=None): """Normalize saliency to value -1~1 Args: saliency (numpy.ndarray or cupy.ndarray): saliency array logger: Returns (numpy.ndarray or cupy.ndarray): normalized saliency array """ xp = cuda.get_array_module(saliency) maxv = xp.max(xp.abs(saliency)) if maxv <= 0: logger = logger or getLogger(__name__) logger.info('All saliency value is 0') return xp.zeros_like(saliency) else: return saliency / maxv def normalize_scaler(saliency, axis=None, logger=None): """Normalize saliency to be sum=1 Args: saliency (numpy.ndarray or cupy.ndarray): saliency array. axis (int): axis to take sum for normalization. 
        logger:

    Returns (numpy.ndarray or cupy.ndarray): normalized saliency array
    """
    xp = cuda.get_array_module(saliency)
    if xp.sum(saliency < 0) > 0:
        logger = logger or getLogger(__name__)
        logger.warning('saliency array contains negative number, '
                       'which is unexpected!')
    vsum = xp.sum(xp.abs(saliency), axis=axis, keepdims=True)
    # NOTE(review): when `axis` is not None, `vsum` is an array and the truth
    # value test below would raise "ambiguous" -- presumably only axis=None
    # (full reduction to a scalar) is used in practice; confirm.
    if vsum <= 0:
        logger = logger or getLogger(__name__)
        logger.info('All saliency value is 0')
        return xp.zeros_like(saliency)
    else:
        return saliency / vsum
================================================ FILE: chainer_chemistry/training/__init__.py ================================================
from chainer_chemistry.training import extensions  # NOQA
================================================ FILE: chainer_chemistry/training/extensions/__init__.py ================================================
from chainer_chemistry.training.extensions import batch_evaluator  # NOQA
from chainer_chemistry.training.extensions import r2_score_evaluator  # NOQA
from chainer_chemistry.training.extensions import roc_auc_evaluator  # NOQA

# import class and function
from chainer_chemistry.training.extensions.batch_evaluator import BatchEvaluator  # NOQA
from chainer_chemistry.training.extensions.r2_score_evaluator import R2ScoreEvaluator  # NOQA
from chainer_chemistry.training.extensions.roc_auc_evaluator import ROCAUCEvaluator  # NOQA
================================================ FILE: chainer_chemistry/training/extensions/auto_print_report.py ================================================
from copy import deepcopy
import os
import sys

from chainer.training import extension
from chainer.training.extensions import log_report as log_report_module
from chainer.training.extensions import util


def create_header_and_templates(entries):
    # format information
    # Each column is at least 10 characters wide, or as wide as its entry
    # name; `templates` holds (entry, format-template, blank-filler) triples
    # used later by `_print`.
    entry_widths = [max(10, len(s)) for s in entries]
    header = ' '.join(('{:%d}' % w for w in entry_widths)).format(
        *entries) + '\n'
    templates = []
    for entry, w in zip(entries, entry_widths):
        templates.append((entry,
                          '{:<%dg} ' % w, ' ' * (w + 2)))
    return header, templates


def filter_and_sort_entries(all_entries, unit='epoch'):
    # Reorder the reported entries for display: the trigger unit
    # ('epoch' or 'iteration') goes first, 'elapsed_time' goes last.
    entries = deepcopy(all_entries)
    # TODO(nakago): sort other entries if necessary
    if 'iteration' in entries:
        # move iteration to head
        entries.pop(entries.index('iteration'))
        if unit == 'iteration':
            entries = ['iteration'] + entries
    if 'epoch' in entries:
        # move epoch to head
        entries.pop(entries.index('epoch'))
        if unit == 'epoch':
            entries = ['epoch'] + entries
    if 'elapsed_time' in entries:
        # move elapsed_time to tail
        entries.pop(entries.index('elapsed_time'))
        entries.append('elapsed_time')
    return entries


class AutoPrintReport(extension.Extension):
    """`PrintReport` with auto `entries` detection.

    This extension uses the log accumulated by a :class:`LogReport` extension
    to print specified entries of the log in a human-readable format.

    Args:
        log_report (str or LogReport): Log report to accumulate the
            observations. This is either the name of a LogReport extensions
            registered to the trainer, or a LogReport instance to use
            internally.
        out: Stream to print the bar. Standard output is used by default.
    """

    def __init__(self, log_report='LogReport', out=sys.stdout):
        self._entries = []
        self._log_report = log_report
        self._out = out
        self._log_len = 0  # number of observations already printed
        header, templates = create_header_and_templates([])
        self._header = header  # printed at the first call
        self._templates = templates
        self._all_entries = []

    def get_log_report(self, trainer):
        # Resolve `self._log_report` into an actual LogReport instance:
        # a string is looked up on the trainer, an instance is invoked to
        # refresh its log.
        log_report = self._log_report
        if isinstance(log_report, str):
            log_report = trainer.get_extension(log_report)
        elif isinstance(log_report, log_report_module.LogReport):
            log_report(trainer)  # update the log report
        else:
            raise TypeError('log report has a wrong type %s' %
                            type(log_report))
        return log_report

    def __call__(self, trainer):
        # --- update entries ---
        # Scan the observations accumulated since the last call; any new
        # entry name triggers a rebuild of the header and row templates.
        log_report = self.get_log_report(trainer)
        log = log_report.log
        updated_flag = False
        aggregate_entries = log[self._log_len:]
        for obs in aggregate_entries:
            for entry in obs.keys():
                if entry not in self._all_entries:
                    self._all_entries.append(entry)
                    updated_flag = True
        if updated_flag:
            if hasattr(log_report, '_trigger') and hasattr(
                    log_report._trigger, 'unit'):
                unit = log_report._trigger.unit
            else:
                # Failed to infer `unit`, use epoch as default
                unit = 'epoch'
            entries = filter_and_sort_entries(self._all_entries, unit=unit)
            self._entries = entries
            header, templates = create_header_and_templates(entries)
            self._header = header  # printed at the first call
            self._templates = templates
        out = self._out

        if self._header:
            out.write(self._header)
            self._header = None

        log_len = self._log_len
        while len(log) > log_len:
            # delete the printed contents from the current cursor
            if os.name == 'nt':
                util.erase_console(0, 0)
            else:
                out.write('\033[J')
            self._print(log[log_len])
            log_len += 1
        self._log_len = log_len

    def serialize(self, serializer):
        # Only an internally-held LogReport instance needs serialization;
        # a named report is serialized by the trainer itself.
        log_report = self._log_report
        if isinstance(log_report, log_report_module.LogReport):
            log_report.serialize(serializer['_log_report'])

    def _print(self, observation):
        # Print one observation as a row: known entries are formatted with
        # their column template, missing entries are blank-filled.
        out = self._out
        for entry, template, empty in self._templates:
            if entry in observation:
                out.write(template.format(observation[entry]))
            else:
                out.write(empty)
        out.write('\n')
        if hasattr(out, 'flush'):
            out.flush()
================================================ FILE: chainer_chemistry/training/extensions/batch_evaluator.py ================================================
import copy
from logging import getLogger

import numpy

import chainer
from chainer import cuda
from chainer.dataset import convert
from chainer import reporter
from chainer.training.extensions import Evaluator


def _get_1d_numpy_array(v):
    """Convert array or Variable to 1d numpy array

    Args:
        v (numpy.ndarray or cupy.ndarray or chainer.Variable): array to be
            converted to 1d numpy array

    Returns (numpy.ndarray): Raveled 1d numpy array
    """
    if isinstance(v, chainer.Variable):
        v = v.data
    return cuda.to_cpu(v).ravel()


class BatchEvaluator(Evaluator):
    # Evaluator that accumulates predictions and labels over the whole
    # dataset and applies metric function(s) to the concatenated arrays.

    def __init__(self, iterator, target, converter=convert.concat_examples,
                 device=None, eval_hook=None, eval_func=None, metrics_fun=None,
                 name=None, logger=None):
        super(BatchEvaluator, self).__init__(
            iterator, target, converter=converter, device=device,
            eval_hook=eval_hook, eval_func=eval_func)
        self.name = name
        self.logger = logger or getLogger()

        if callable(metrics_fun):
            # TODO(mottodora): use better name or infer
            self.metrics_fun = {"evaluation": metrics_fun}
        elif isinstance(metrics_fun, dict):
            self.metrics_fun = metrics_fun
        else:
            raise TypeError('Unexpected type metrics_fun must be Callable or '
                            'dict.')

    def evaluate(self):
        iterator = self._iterators['main']
        eval_func = self.eval_func or self._targets['main']

        if self.eval_hook:
            self.eval_hook(self)

        if hasattr(iterator, 'reset'):
            iterator.reset()
            it = iterator
        else:
            it = copy.copy(iterator)

        y_total = []
        t_total = []
        for batch in it:
            in_arrays = self.converter(batch, self.device)
            # last element of the converted batch is the true label;
            # the rest are model inputs
            with chainer.no_backprop_mode(), chainer.using_config(
                    'train', False):
                y = eval_func(*in_arrays[:-1])
            t = in_arrays[-1]
            y_data = _get_1d_numpy_array(y)
            t_data = _get_1d_numpy_array(t)
            y_total.append(y_data)
            t_total.append(t_data)

        y_total = numpy.concatenate(y_total).ravel()
        t_total = numpy.concatenate(t_total).ravel()
        # metrics_value = self.metrics_fun(y_total, t_total)
        metrics = {key: metric_fun(y_total, t_total)
                   for key, metric_fun in self.metrics_fun.items()}

        observation = {}
        with reporter.report_scope(observation):
            reporter.report(metrics, self._targets['main'])
        return observation
================================================ FILE: chainer_chemistry/training/extensions/prc_auc_evaluator.py ================================================
import numpy

from chainer.dataset import convert
from sklearn import metrics

from chainer_chemistry.training.extensions.batch_evaluator import BatchEvaluator  # NOQA


def _to_list(a):
    """convert value `a` to list

    Args:
        a: value to be convert to `list`

    Returns (list):
    """
    if isinstance(a, (int, float)):
        return [a, ]
    else:
        # expected to be list or some iterable class
        return a


class PRCAUCEvaluator(BatchEvaluator):
    """Evaluator which calculates PRC AUC score

    Note that this Evaluator is only applicable to binary classification task.

    Args:
        iterator: Dataset iterator for the dataset to calculate PRC AUC score.
            It can also be a dictionary of iterators.
            If this is just an iterator, the iterator is registered by the
            name ``'main'``.
        target: Link object or a dictionary of links to evaluate. If this is
            just a link object, the link is registered by the name ``'main'``.
        converter: Converter function to build input arrays and true label.
            :func:`~chainer.dataset.concat_examples` is used by default.
            It is expected to return input arrays of the form
            `[x_0, ..., x_n, t]`, where `x_0, ..., x_n` are the inputs to
            the evaluation function and `t` is the true label.
        device: Device to which the training data is sent. Negative value
            indicates the host memory (CPU).
        eval_hook: Function to prepare for each evaluation process. It is
            called at the beginning of the evaluation. The evaluator
            extension object is passed at each call.
        eval_func: Evaluation function called at each iteration.
            The target link to evaluate as a callable is used by default.
        name (str): name of this extension. When `name` is None,
            `default_name='validation'` which is defined in super class
            `Evaluator` is used as extension name. This name affects to the
            reported key name.
        pos_labels (int or list): labels of the positive class, other classes
            are considered as negative.
        ignore_labels (int or list or None): labels to be ignored.
            `None` is used to not ignore all labels.
        raise_value_error (bool): If `False`, `ValueError` caused by
            `roc_auc_score` calculation is suppressed and ignored with a
            warning message.
        logger:

    Attributes:
        converter: Converter function.
        device: Device to which the training data is sent.
        eval_hook: Function to prepare for each evaluation process.
        eval_func: Evaluation function called at each iteration.
        pos_labels (list): labels of the positive class
        ignore_labels (list): labels to be ignored.
    """

    def __init__(self, iterator, target, converter=convert.concat_examples,
                 device=None, eval_hook=None, eval_func=None, name=None,
                 pos_labels=1, ignore_labels=None, raise_value_error=True,
                 logger=None):
        metrics_fun = {'prc_auc': self.prc_auc_score}
        super(PRCAUCEvaluator, self).__init__(
            iterator, target, converter=converter, device=device,
            eval_hook=eval_hook, eval_func=eval_func, metrics_fun=metrics_fun,
            name=name, logger=logger)
        self.pos_labels = _to_list(pos_labels)
        self.ignore_labels = _to_list(ignore_labels)
        self.raise_value_error = raise_value_error

    def prc_auc_score(self, y_total, t_total):
        # Compute area under the precision-recall curve for binarized labels.
        # --- ignore labels if specified ---
        if self.ignore_labels:
            valid_ind = numpy.in1d(t_total, self.ignore_labels, invert=True)
            y_total = y_total[valid_ind]
            t_total = t_total[valid_ind]

        # --- set positive labels to 1, negative labels to 0 ---
        pos_indices = numpy.in1d(t_total, self.pos_labels)
        t_total = numpy.where(pos_indices, 1, 0)
        if len(numpy.unique(t_total)) != 2:
            # PRC AUC is undefined with a single class: raise or return NaN
            if self.raise_value_error:
                raise ValueError("Only one class present in y_true. PRC AUC "
                                 "score is not defined in that case.")
            else:
                return numpy.nan
        precision, recall, _ = metrics.precision_recall_curve(t_total,
                                                              y_total)
        prc_auc = metrics.auc(recall, precision)
        return prc_auc
================================================ FILE: chainer_chemistry/training/extensions/r2_score_evaluator.py ================================================
from chainer.backends import cuda
from chainer.dataset import convert

from chainer_chemistry.training.extensions.batch_evaluator import BatchEvaluator  # NOQA


class R2ScoreEvaluator(BatchEvaluator):
    """Evaluator with calculates R^2 (coefficient of determination)
    regression score.

    Args:
        iterator: Dataset iterator for the dataset to calculate
            R^2(coefficient of determination) regression score.
            It can also be a dictionary of iterators. If this is just an
            iterator, the iterator is registered by the name ``'main'``.
        target: Link object or a dictionary of links to evaluate. If this is
            just a link object, the link is registered by the name ``'main'``.
        converter: Converter function to build input arrays and true label.
            :func:`~chainer.dataset.concat_examples` is used by default.
            It is expected to return input arrays of the form
            `[x_0, ..., x_n, t]`, where `x_0, ..., x_n` are the inputs to
            the evaluation function and `t` is the true label.
        device: Device to which the training data is sent. Negative value
            indicates the host memory (CPU).
        eval_hook: Function to prepare for each evaluation process.
            It is called at the beginning of the evaluation.
            The evaluator extension object is passed at each call.
        eval_func: Evaluation function called at each iteration.
            The target link to evaluate as a callable is used by default.
        name (str): name of this extension. When `name` is None,
            `default_name='validation'` which is defined in super class
            `Evaluator` is used as extension name. This name affects to the
            reported key name.
        pos_labels (int or list): labels of the positive class, other classes
            are considered as negative.
        ignore_labels (int or list or None): labels to be ignored.
            `None` is used to not ignore all labels.
        raise_value_error (bool): If `False`, `ValueError` caused by
            `roc_auc_score` calculation is suppressed and ignored with a
            warning message.
        logger:
        sample_weight: This argument is for compatibility with scikit-learn's
            implementation of r2_score. Current implementation admits None
            only.
        multioutput (str): If 'uniform_average', this function returns an
            average of R^2 score of multiple output. If 'raw_average', this
            function return a set of R^2 score of multiple output.

    Attributes:
        converter: Converter function.
        device: Device to which the training data is sent.
        eval_hook: Function to prepare for each evaluation process.
        eval_func: Evaluation function called at each iteration.
        pos_labels (list): labels of the positive class
        ignore_labels (list): labels to be ignored.
    """

    def __init__(self, iterator, target, converter=convert.concat_examples,
                 device=None, eval_hook=None, eval_func=None, name=None,
                 pos_label=1, ignore_labels=None, raise_value_error=True,
                 logger=None, sample_weight=None,
                 multioutput='uniform_average', ignore_nan=False):
        metrics_fun = {'r2_score': self.r2_score}
        super(R2ScoreEvaluator, self).__init__(
            iterator, target, converter=converter, device=device,
            eval_hook=eval_hook, eval_func=eval_func, metrics_fun=metrics_fun,
            name=name, logger=logger)
        self.pos_label = pos_label
        self.ignore_labels = ignore_labels
        self.raise_value_error = raise_value_error
        self.sample_weight = sample_weight
        self.multioutput = multioutput
        self.ignore_nan = ignore_nan

    def r2_score(self, pred, true, sample_weight=None,
                 multioutput='uniform_average', ignore_nan=False):
        # NOTE(review): the `sample_weight`, `multioutput` and `ignore_nan`
        # parameters of this method are never read; the values stored on
        # `self` at construction time are used instead -- confirm intended.
        if self.sample_weight is not None:
            raise NotImplementedError()
        if self.multioutput not in ['uniform_average', 'raw_values']:
            raise ValueError('invalid multioutput argument')
        xp = cuda.get_array_module(pred)
        # residuals and deviations from the per-output mean
        diff = pred - true
dev = true - xp.mean(true, axis=0) if self.ignore_nan: diff[xp.isnan(diff)] = 0. dev[xp.isnan(dev)] = 0. SS_res = xp.asarray(xp.sum(diff ** 2, axis=0)) SS_tot = xp.asarray(xp.sum(dev ** 2, axis=0)) SS_tot_iszero = SS_tot == 0 SS_tot[SS_tot_iszero] = 1 # Assign dummy value to avoid zero-division ret = xp.where( SS_tot_iszero, 0.0, 1 - SS_res / SS_tot).astype(pred.dtype) if self.multioutput == 'uniform_average': return xp.asarray(ret.mean()) elif self.multioutput == 'raw_values': return ret ================================================ FILE: chainer_chemistry/training/extensions/roc_auc_evaluator.py ================================================ import numpy from chainer.dataset import convert from sklearn import metrics from chainer_chemistry.training.extensions.batch_evaluator import BatchEvaluator # NOQA def _to_list(a): """convert value `a` to list Args: a: value to be convert to `list` Returns (list): """ if isinstance(a, (int, float)): return [a, ] else: # expected to be list or some iterable class return a class ROCAUCEvaluator(BatchEvaluator): """Evaluator which calculates ROC AUC score Note that this Evaluator is only applicable to binary classification task. Args: iterator: Dataset iterator for the dataset to calculate ROC AUC score. It can also be a dictionary of iterators. If this is just an iterator, the iterator is registered by the name ``'main'``. target: Link object or a dictionary of links to evaluate. If this is just a link object, the link is registered by the name ``'main'``. converter: Converter function to build input arrays and true label. :func:`~chainer.dataset.concat_examples` is used by default. It is expected to return input arrays of the form `[x_0, ..., x_n, t]`, where `x_0, ..., x_n` are the inputs to the evaluation function and `t` is the true label. device: Device to which the training data is sent. Negative value indicates the host memory (CPU). eval_hook: Function to prepare for each evaluation process. 
It is called at the beginning of the evaluation. The evaluator extension object is passed at each call. eval_func: Evaluation function called at each iteration. The target link to evaluate as a callable is used by default. name (str): name of this extension. When `name` is None, `default_name='validation'` which is defined in super class `Evaluator` is used as extension name. This name affects to the reported key name. pos_labels (int or list): labels of the positive class, other classes are considered as negative. ignore_labels (int or list or None): labels to be ignored. `None` is used to not ignore all labels. raise_value_error (bool): If `False`, `ValueError` caused by `roc_auc_score` calculation is suppressed and ignored with a warning message. logger: Attributes: converter: Converter function. device: Device to which the training data is sent. eval_hook: Function to prepare for each evaluation process. eval_func: Evaluation function called at each iteration. pos_labels (list): labels of the positive class ignore_labels (list): labels to be ignored. 
""" def __init__(self, iterator, target, converter=convert.concat_examples, device=None, eval_hook=None, eval_func=None, name=None, pos_labels=1, ignore_labels=None, raise_value_error=True, logger=None): metrics_fun = {'roc_auc': self.roc_auc_score} super(ROCAUCEvaluator, self).__init__( iterator, target, converter=converter, device=device, eval_hook=eval_hook, eval_func=eval_func, metrics_fun=metrics_fun, name=name, logger=logger) self.pos_labels = _to_list(pos_labels) self.ignore_labels = _to_list(ignore_labels) self.raise_value_error = raise_value_error def roc_auc_score(self, y_total, t_total): # --- ignore labels if specified --- if self.ignore_labels: valid_ind = numpy.in1d(t_total, self.ignore_labels, invert=True) y_total = y_total[valid_ind] t_total = t_total[valid_ind] # --- set positive labels to 1, negative labels to 0 --- pos_indices = numpy.in1d(t_total, self.pos_labels) t_total = numpy.where(pos_indices, 1, 0) try: roc_auc = metrics.roc_auc_score(t_total, y_total) except ValueError as e: # When only one class present in `y_true`, `ValueError` is raised. # ROC AUC score is not defined in that case. if self.raise_value_error: raise e else: self.logger.warning( 'ValueError detected during roc_auc_score calculation. 
{}' .format(e.args)) roc_auc = numpy.nan return roc_auc ================================================ FILE: chainer_chemistry/utils/__init__.py ================================================ from chainer_chemistry.utils.json_utils import load_json # NOQA from chainer_chemistry.utils.json_utils import save_json # NOQA from chainer_chemistry.utils.sparse_utils import convert_sparse_with_edge_type # NOQA from chainer_chemistry.utils.sparse_utils import is_sparse # NOQA from chainer_chemistry.utils.train_utils import run_train # NOQA ================================================ FILE: chainer_chemistry/utils/extend.py ================================================ from collections import Iterable from logging import getLogger import six from chainer import cuda def _to_list(a): if isinstance(a, Iterable): a = list(a) else: a = [a] return a def extend_node(node, out_size, axis=-1, value=0): """Extend size of `node` array For now, this function works same with `extend_array` method, this is just an alias function. Args: node (numpy.ndarray): the array whose `axis` to be extended. first axis is considered as "batch" axis. out_size (int): target output size for specified `axis`. axis (int): node feature axis to be extended. Default is `axis=-1`, which extends only last axis. value (int or float): value to be filled for extended place. Returns (numpy.ndarray): extended `node` array, extended place is filled with `value` """ return extend_arrays_to_size( node, out_size=out_size, axis=axis, value=value) def extend_adj(adj, out_size, axis=None, value=0): """Extend size of `adj` array For now, this function only differs default `axis` value from `extend_array` method, this is an alias function. Args: adj (numpy.ndarray): the array whose `axis` to be extended. first axis is considered as "batch" axis. out_size (int): target output size for specified `axis`. axis (list or None): node feature axis to be extended. 
Default is None, in this case `axis=[-1, -2]` is used to extend last 2 axes. value (int or float): value to be filled for extended place. Returns (numpy.ndarray): extended `adj` array, extended place is filled with `value` """ axis = axis or [-1, -2] return extend_arrays_to_size( adj, out_size=out_size, axis=axis, value=value) def extend_arrays_to_size(arrays, out_size, axis=-1, value=0): """Extend size of `arrays` array Args: arrays (numpy.ndarray): the array whose `axis` to be extended. first axis is considered as "batch" axis. out_size (int): target output size for specified `axis`. axis (int or list): node feature axis to be extended. value (int or float): value to be filled for extended place. Returns (numpy.ndarray): extended array, extended place is filled with `value` """ batch_size = len(arrays) in_shape = _to_list(arrays[0].shape) out_shape = [batch_size] + in_shape axis = _to_list(axis) for ax in axis: if ax == 0: logger = getLogger(__name__) logger.warning('axis 0 detected, but axis=0 is expected to be ' 'batch size dimension.') if out_shape[ax] > out_size: raise ValueError( 'current size={} is larger than out_size={} at axis={}' .format(out_shape[ax], out_size, ax)) out_shape[ax] = out_size return extend_arrays_to_shape(arrays, out_shape, value=value) def extend_arrays_to_shape(arrays, out_shape, value=0): # Ref: `_concat_arrays_with_padding` method in chainer convert.py # https://github.com/chainer/chainer/blob/master/chainer/dataset/convert.py xp = cuda.get_array_module(arrays[0]) with cuda.get_device_from_array(arrays[0]): result = xp.full(out_shape, value, dtype=arrays[0].dtype) for i in six.moves.range(len(arrays)): src = arrays[i] slices = tuple(slice(dim) for dim in src.shape) result[(i,) + slices] = src return result ================================================ FILE: chainer_chemistry/utils/json_utils.py ================================================ import json from logging import getLogger import numpy try: from pathlib import PurePath 
_is_pathlib_available = True except ImportError: _is_pathlib_available = False from chainer import cuda class JSONEncoderEX(json.JSONEncoder): """Encoder class used for `json.dump`""" def default(self, obj): if isinstance(obj, numpy.integer): return int(obj) elif isinstance(obj, numpy.floating): return float(obj) elif isinstance(obj, numpy.ndarray): return obj.tolist() elif isinstance(obj, cuda.ndarray): return cuda.to_cpu(obj).tolist() elif _is_pathlib_available and isinstance(obj, PurePath): # save as str representation # convert windows path separator to linux format return str(obj).replace('\\', '/') else: return super(JSONEncoderEX, self).default(obj) def save_json(filepath, params, ignore_error=False, indent=4, logger=None): """Save `params` to `filepath` in json format. It also supports `numpy` & `cupy` array serialization by converting them to `list` format. Args: filepath (str): filepath to save args params (dict or list): parameters to be saved. ignore_error (bool): If `True`, it will ignore exception with printing error logs, which prevents to stop. indent (int): Indent for saved file. logger: """ try: with open(filepath, 'w') as f: json.dump(params, f, indent=indent, cls=JSONEncoderEX) except Exception as e: if not ignore_error: raise e else: logger = logger or getLogger(__name__) logger.warning('Error occurred at save_json, but ignoring...') logger.warning('The file {} may not be saved or corrupted.' .format(filepath)) logger.warning(e) def load_json(filepath): """Load params, which is stored in json format. Args: filepath (str): filepath to json file to load. 
Returns (dict or list): params """ with open(filepath, 'r') as f: params = json.load(f) return params ================================================ FILE: chainer_chemistry/utils/permutation.py ================================================ import numpy def permute_node(node, permutation_index, axis=-1): """Permute index of `node` array Args: node (numpy.ndarray): the array whose `axis` to be permuted. permutation_index (numpy.ndarray): 1d numpy array whose size should be same as permutation axis of `node`. axis (int): permutation axis. Returns (numpy.ndarray): permutated `node` array. """ if node.shape[axis] != len(permutation_index): raise ValueError( 'node.shape[{}] = {} and len(permutation_index) = {} do not match!' .format(axis, node.shape[axis], len(permutation_index))) out_node = numpy.take(node, permutation_index, axis=axis).copy() return out_node def permute_adj(adj, permutation_index, axis=None): """Permute index of adjacency matrix array Args: adj (numpy.ndarray): the array whose `axis` to be permuted. It is considered as adjacency matrix. permutation_index (numpy.ndarray): 1d numpy array whose size should be same as permutation axis of `node`. axis (list or tuple or None): list of 2d int, indicates the permutation axis. When None is passed (default), it uses -1 and -2 as `axis`, it means that last 2 axis are considered to be permuted. Returns (numpy.ndarray): permutated `adj` array. 
""" if axis is not None: if not isinstance(axis, (list, tuple)): raise TypeError('axis must be list or tuple, got {}' .format(type(axis))) if len(axis) != 2: raise ValueError('axis length must 2, got {}'.format(len(axis))) else: axis = [-1, -2] # default value is to use last 2 axis num_node = len(permutation_index) for ax in axis: if adj.shape[ax] != len(permutation_index): raise ValueError( 'adj.shape[{}] = {} and len(permutation_index) = {} do not ' 'match!'.format(axis, adj.shape[axis], len(permutation_index))) out_adj = numpy.zeros_like(adj) ndim = adj.ndim for i in range(num_node): for j in range(num_node): in_indices = [slice(None)] * ndim out_indices = [slice(None)] * ndim in_indices[axis[0]] = i in_indices[axis[1]] = j out_indices[axis[0]] = permutation_index[i] out_indices[axis[1]] = permutation_index[j] out_adj[tuple(in_indices)] = adj[tuple(out_indices)] return out_adj ================================================ FILE: chainer_chemistry/utils/sparse_utils.py ================================================ import chainer from chainer import cuda import numpy as np try: from chainer.utils import CooMatrix _coomatrix_imported = True except Exception: _coomatrix_imported = False def _flatten(x): if isinstance(x, chainer.Variable): x = x.data x = chainer.backends.cuda.to_cpu(x) return x.flatten() def sparse_utils_available(): from distutils.version import StrictVersion return _coomatrix_imported and\ StrictVersion(np.__version__) >= StrictVersion('1.16') def is_sparse(x): if _coomatrix_imported and isinstance(x, CooMatrix): return True else: return False def convert_sparse_with_edge_type(data, row, col, num_nodes, edge_type, num_edge_type): """Convert a sparse matrix with edge type to a regular COO matrix. Args: data (numpy.ndarray): the entries of the batched sparse matrix. row (numpy.ndarray): the row indices of the matrix entries. col (numpy.ndarray): the column indices of the matrix entries. num_nodes (int): the number of nodes in the batched graph. 
edge_type (numpy.ndarray): edge type information of edges. num_edge_type (int): number of edge type. Returns (chainer.utils.CooMatrix): new sparse COO matrix whose minibatch size is equal to ((original minibatch size) * num_edge_type). """ assert len(data.shape) == 2 assert row.shape == data.shape assert col.shape == data.shape assert edge_type.shape == data.shape mb, length = data.shape xp = cuda.get_array_module(data) data = _flatten(data) row = _flatten(row) col = _flatten(col) edge_type = _flatten(edge_type) # From now on, suppose that # edge_type = [[1, 1, 3, 1], [0, 2, 1, 0]] as example. # Then, # pos_mb = [1, 1, 3, 1, 4, 6, 5, 4]. pos_mb = np.repeat(np.arange(mb), length) * num_edge_type + edge_type # argsort = [0, 1, 3, 2, 4, 7, 6, 5] # sorted_pos = [1, 1, 1, 3, 4, 4, 5, 6] argsort = pos_mb.argsort() sorted_pos = pos_mb[argsort] # df = [0, 0, 0, 1, 1, 0, 1, 1] df = np.diff(sorted_pos, prepend=sorted_pos[0]) != 0 # extract = [3, 4, 6, 7] extract = np.arange(mb * length)[df] # d_extract = [3, 1, 2, 1] d_extract = np.diff(extract, prepend=0) # p = [0, 0, 0, 3, 1, 0, 2, 1] p = np.zeros(mb * length, dtype=np.int32) p[df] = d_extract # pos_i_perm = [0, 1, 2, 0, 0, 1, 0, 0] pos_i_perm = np.arange(mb * length) - p.cumsum() # pos_i = [0, 1, 0, 2, 0, 0, 0, 1] pos_i = np.zeros_like(pos_i_perm) pos_i[argsort] = pos_i_perm # new_length = 3 new_length = pos_i.max() + 1 new_mb = mb * num_edge_type new_data = xp.zeros((new_mb, new_length), dtype=data.dtype) new_data[pos_mb, pos_i] = data new_row = xp.zeros((new_mb, new_length), dtype=np.int32) new_row[pos_mb, pos_i] = row new_col = xp.zeros((new_mb, new_length), dtype=np.int32) new_col[pos_mb, pos_i] = col new_shape = (num_nodes, num_nodes) return chainer.utils.CooMatrix(new_data, new_row, new_col, new_shape) def _convert_to_sparse(dense_adj): # naive conversion function mainly for testing xp = cuda.get_array_module(dense_adj) dense_adj = cuda.to_cpu(dense_adj) batch_size, num_edge_type, atom_size = dense_adj.shape[:3] 
data = [] row = [] col = [] edge_type = [] for mb in range(batch_size): data.append([]) row.append([]) col.append([]) edge_type.append([]) for e in range(num_edge_type): for i in range(atom_size): for j in range(atom_size): data[-1].append(dense_adj[mb, e, i, j]) row[-1].append(i) col[-1].append(j) edge_type[-1].append(e) data = xp.array(data, dtype=dense_adj.dtype) row = xp.array(row, dtype=xp.int32) col = xp.array(col, dtype=xp.int32) edge_type = xp.array(edge_type, dtype=xp.int32) return data, row, col, edge_type ================================================ FILE: chainer_chemistry/utils/train_utils.py ================================================ import chainer from chainer import optimizers, training, Optimizer # NOQA from chainer._backend import Device from chainer.dataset import convert, Iterator # NOQA from chainer.iterators import SerialIterator from chainer.training import extensions from chainer_chemistry.training.extensions.auto_print_report import AutoPrintReport # NOQA def run_train(model, train, valid=None, batch_size=16, epoch=10, optimizer=None, out='result', extensions_list=None, device=-1, converter=convert.concat_examples, use_default_extensions=True, resume_path=None): """Util function to train chainer's model with StandardUpdater. Typical Regression/Classification tasks suffices to use this method to train chainer model. Args: model (chainer.Chain): model to train train (dataset or Iterator): training dataset or train iterator valid (dataset or Iterator): validation dataset or valid iterator batch_size (int): batch size for training epoch (int): epoch for training optimizer (Optimizer): out (str): path for `trainer`'s out directory extensions_list (None or list): list of extensions to add to `trainer` device (Device): chainer Device converter (callable): use_default_extensions (bool): If `True`, default extensions are added to `trainer`. resume_path (None or str): If specified, `trainer` is resumed with this serialized file. 
""" if optimizer is None: # Use Adam optimizer as default optimizer = optimizers.Adam() elif not isinstance(optimizer, Optimizer): raise ValueError("[ERROR] optimizer must be instance of Optimizer, " "but passed {}".format(type(Optimizer))) optimizer.setup(model) if isinstance(train, Iterator): train_iter = train else: # Assume `train` as training dataset, Use SerialIterator as default. train_iter = SerialIterator(train, batch_size=batch_size) updater = training.StandardUpdater( train_iter, optimizer, device=device, converter=converter) trainer = training.Trainer(updater, (epoch, 'epoch'), out=out) if use_default_extensions: if valid is not None: if isinstance(valid, Iterator): valid_iter = valid else: # Assume `valid` as validation dataset, # Use SerialIterator as default. valid_iter = SerialIterator(valid, batch_size=batch_size, shuffle=False, repeat=False) trainer.extend(extensions.Evaluator( valid_iter, model, device=device, converter=converter)) trainer.extend(extensions.LogReport()) trainer.extend(AutoPrintReport()) trainer.extend(extensions.ProgressBar(update_interval=10)) # TODO(nakago): consider to include snapshot as default extension. 
# trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch')) if extensions_list is not None: for e in extensions_list: trainer.extend(e) if resume_path: chainer.serializers.load_npz(resume_path, trainer) trainer.run() return def run_node_classification_train(model, data, train_mask, valid_mask, epoch=10, optimizer=None, out='result', extensions_list=None, device=-1, converter=None, use_default_extensions=True, resume_path=None): if optimizer is None: # Use Adam optimizer as default optimizer = optimizers.Adam() elif not isinstance(optimizer, Optimizer): raise ValueError("[ERROR] optimizer must be instance of Optimizer, " "but passed {}".format(type(Optimizer))) optimizer.setup(model) def one_batch_converter(batch, device): if not isinstance(device, Device): device = chainer.get_device(device) data, train_mask, valid_mask = batch[0] return (data.to_device(device), device.send(train_mask), device.send(valid_mask)) data_iter = SerialIterator([(data, train_mask, valid_mask)], batch_size=1) updater = training.StandardUpdater( data_iter, optimizer, device=device, converter=one_batch_converter) trainer = training.Trainer(updater, (epoch, 'epoch'), out=out) if use_default_extensions: trainer.extend(extensions.LogReport()) trainer.extend(AutoPrintReport()) trainer.extend(extensions.ProgressBar(update_interval=10)) # TODO(nakago): consider to include snapshot as default extension. 
# trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch')) if extensions_list is not None: for e in extensions_list: trainer.extend(e) if resume_path: chainer.serializers.load_npz(resume_path, trainer) trainer.run() return ================================================ FILE: docker/conda/python36/Dockerfile ================================================ FROM nvidia/cuda:10.1-cudnn7-devel RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ git \ wget \ bzip2 \ ca-certificates \ curl \ cmake \ libblas3 \ libblas-dev \ libxext6 \ libgl1-mesa-glx \ libxrender-dev \ && \ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH /opt/conda/bin:$PATH RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \ /bin/bash ~/miniconda.sh -b -p /opt/conda && \ rm ~/miniconda.sh && \ /opt/conda/bin/conda clean -tipsy && \ ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ echo "conda activate base" >> ~/.bashrc RUN conda update -n base -c defaults conda RUN conda create -n py36 python=3.6 conda && \ . /opt/conda/etc/profile.d/conda.sh && \ conda init bash && \ conda activate py36 && \ conda install -c rdkit rdkit && \ pip install pytest mock ADD conda-entrypoint.sh /conda-entrypoint.sh ENTRYPOINT [ "/conda-entrypoint.sh" ] ================================================ FILE: docker/conda/python36/conda-entrypoint.sh ================================================ #!/bin/bash . 
/opt/conda/etc/profile.d/conda.sh conda activate py36 exec "$@" ================================================ FILE: docker/conda/python37/Dockerfile ================================================ FROM nvidia/cuda:10.1-cudnn7-devel RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ git \ wget \ bzip2 \ ca-certificates \ curl \ cmake \ libblas3 \ libblas-dev \ libxext6 \ libgl1-mesa-glx \ libxrender-dev \ && \ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH /opt/conda/bin:$PATH RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \ /bin/bash ~/miniconda.sh -b -p /opt/conda && \ rm ~/miniconda.sh && \ /opt/conda/bin/conda clean -tipsy && \ ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ echo "conda activate base" >> ~/.bashrc RUN conda update -n base -c defaults conda RUN conda create -n py37 python=3.7 conda && \ . /opt/conda/etc/profile.d/conda.sh && \ conda init bash && \ conda activate py37 && \ conda install -c rdkit rdkit && \ pip install pytest mock ADD conda-entrypoint.sh /conda-entrypoint.sh ENTRYPOINT [ "/conda-entrypoint.sh" ] ================================================ FILE: docker/conda/python37/conda-entrypoint.sh ================================================ #!/bin/bash . 
/opt/conda/etc/profile.d/conda.sh conda activate py37 exec "$@" ================================================ FILE: docker/conda/python37-chainerx-cpu-base/Dockerfile ================================================ FROM nvidia/cuda:10.1-cudnn7-devel RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ git \ wget \ bzip2 \ ca-certificates \ curl \ cmake \ libblas3 \ libblas-dev \ libxext6 \ libgl1-mesa-glx \ libxrender-dev \ && \ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH /opt/conda/bin:$PATH RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \ /bin/bash ~/miniconda.sh -b -p /opt/conda && \ rm ~/miniconda.sh && \ /opt/conda/bin/conda clean -tipsy && \ ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ echo "conda activate base" >> ~/.bashrc RUN conda update -n base -c defaults conda ENV MAKEFLAGS -j4 RUN conda create -n py37 python=3.7 conda && \ . /opt/conda/etc/profile.d/conda.sh && \ conda init bash && \ conda activate py37 && \ CHAINER_BUILD_CHAINERX=1 pip install -vvvv --no-cache-dir chainer==6.0.0 && \ conda install -c rdkit rdkit==2019.03.4.0 ADD conda-entrypoint.sh /conda-entrypoint.sh ENTRYPOINT [ "/conda-entrypoint.sh" ] ================================================ FILE: docker/conda/python37-chainerx-cpu-base/conda-entrypoint.sh ================================================ #!/bin/bash . 
/opt/conda/etc/profile.d/conda.sh conda activate py37 exec "$@" ================================================ FILE: docker/conda/python37-chainerx-cpu-latest/Dockerfile ================================================ FROM nvidia/cuda:10.1-cudnn7-devel RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ git \ wget \ bzip2 \ ca-certificates \ curl \ cmake \ libblas3 \ libblas-dev \ libxext6 \ libgl1-mesa-glx \ libxrender-dev \ && \ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH /opt/conda/bin:$PATH RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \ /bin/bash ~/miniconda.sh -b -p /opt/conda && \ rm ~/miniconda.sh && \ /opt/conda/bin/conda clean -tipsy && \ ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ echo "conda activate base" >> ~/.bashrc RUN conda update -n base -c defaults conda ENV MAKEFLAGS -j4 RUN conda create -n py37 python=3.7 conda && \ . /opt/conda/etc/profile.d/conda.sh && \ conda init bash && \ conda activate py37 && \ CHAINER_BUILD_CHAINERX=1 pip install -vvvv --no-cache-dir chainer==7.0.0b2 && \ conda install -c rdkit rdkit==2019.03.4.0 ADD conda-entrypoint.sh /conda-entrypoint.sh ENTRYPOINT [ "/conda-entrypoint.sh" ] ================================================ FILE: docker/conda/python37-chainerx-cpu-latest/conda-entrypoint.sh ================================================ #!/bin/bash . 
/opt/conda/etc/profile.d/conda.sh conda activate py37 exec "$@" ================================================ FILE: docker/conda/python37-chainerx-cpu-stable/Dockerfile ================================================ FROM nvidia/cuda:10.1-cudnn7-devel RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ git \ wget \ bzip2 \ ca-certificates \ curl \ cmake \ libblas3 \ libblas-dev \ libxext6 \ libgl1-mesa-glx \ libxrender-dev \ && \ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH /opt/conda/bin:$PATH RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \ /bin/bash ~/miniconda.sh -b -p /opt/conda && \ rm ~/miniconda.sh && \ /opt/conda/bin/conda clean -tipsy && \ ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ echo "conda activate base" >> ~/.bashrc RUN conda update -n base -c defaults conda ENV MAKEFLAGS -j4 RUN conda create -n py37 python=3.7 conda && \ . /opt/conda/etc/profile.d/conda.sh && \ conda init bash && \ conda activate py37 && \ CHAINER_BUILD_CHAINERX=1 pip install -vvvv --no-cache-dir chainer==6.2.0 && \ conda install -c rdkit rdkit==2019.03.4.0 ADD conda-entrypoint.sh /conda-entrypoint.sh ENTRYPOINT [ "/conda-entrypoint.sh" ] ================================================ FILE: docker/conda/python37-chainerx-cpu-stable/conda-entrypoint.sh ================================================ #!/bin/bash . 
/opt/conda/etc/profile.d/conda.sh conda activate py37 exec "$@" ================================================ FILE: docker/conda/python37-chainerx-gpu-base/Dockerfile ================================================ FROM nvidia/cuda:10.1-cudnn7-devel RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ git \ wget \ bzip2 \ ca-certificates \ curl \ cmake \ libblas3 \ libblas-dev \ libxext6 \ libgl1-mesa-glx \ libxrender-dev \ && \ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH /opt/conda/bin:$PATH RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \ /bin/bash ~/miniconda.sh -b -p /opt/conda && \ rm ~/miniconda.sh && \ /opt/conda/bin/conda clean -tipsy && \ ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ echo "conda activate base" >> ~/.bashrc RUN conda update -n base -c defaults conda ENV MAKEFLAGS -j4 RUN conda create -n py37 python=3.7 conda && \ . /opt/conda/etc/profile.d/conda.sh && \ conda init bash && \ conda activate py37 && \ CHAINER_BUILD_CHAINERX=1 CHAINERX_BUILD_CUDA=1 pip install -vvvv --no-cache-dir cupy-cuda101==6.0.0 chainer==6.0.0 && \ conda install -c rdkit rdkit==2019.03.4.0 ADD conda-entrypoint.sh /conda-entrypoint.sh ENTRYPOINT [ "/conda-entrypoint.sh" ] ================================================ FILE: docker/conda/python37-chainerx-gpu-base/conda-entrypoint.sh ================================================ #!/bin/bash . 
/opt/conda/etc/profile.d/conda.sh conda activate py37 exec "$@" ================================================ FILE: docker/conda/python37-chainerx-gpu-latest/Dockerfile ================================================ FROM nvidia/cuda:10.1-cudnn7-devel RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ git \ wget \ bzip2 \ ca-certificates \ curl \ cmake \ libblas3 \ libblas-dev \ libxext6 \ libgl1-mesa-glx \ libxrender-dev \ && \ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH /opt/conda/bin:$PATH RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \ /bin/bash ~/miniconda.sh -b -p /opt/conda && \ rm ~/miniconda.sh && \ /opt/conda/bin/conda clean -tipsy && \ ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ echo "conda activate base" >> ~/.bashrc RUN conda update -n base -c defaults conda ENV MAKEFLAGS -j4 RUN conda create -n py37 python=3.7 conda && \ . /opt/conda/etc/profile.d/conda.sh && \ conda init bash && \ conda activate py37 && \ CHAINER_BUILD_CHAINERX=1 CHAINERX_BUILD_CUDA=1 pip install -vvvv --no-cache-dir cupy-cuda101==7.0.0b2 chainer==7.0.0b2 && \ conda install -c rdkit rdkit==2019.03.4.0 ADD conda-entrypoint.sh /conda-entrypoint.sh ENTRYPOINT [ "/conda-entrypoint.sh" ] ================================================ FILE: docker/conda/python37-chainerx-gpu-latest/conda-entrypoint.sh ================================================ #!/bin/bash . 
/opt/conda/etc/profile.d/conda.sh conda activate py37 exec "$@" ================================================ FILE: docker/conda/python37-chainerx-gpu-stable/Dockerfile ================================================ FROM nvidia/cuda:10.1-cudnn7-devel RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ git \ wget \ bzip2 \ ca-certificates \ curl \ cmake \ libblas3 \ libblas-dev \ libxext6 \ libgl1-mesa-glx \ libxrender-dev \ && \ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH /opt/conda/bin:$PATH RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \ /bin/bash ~/miniconda.sh -b -p /opt/conda && \ rm ~/miniconda.sh && \ /opt/conda/bin/conda clean -tipsy && \ ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ echo "conda activate base" >> ~/.bashrc RUN conda update -n base -c defaults conda ENV MAKEFLAGS -j4 RUN conda create -n py37 python=3.7 conda && \ . /opt/conda/etc/profile.d/conda.sh && \ conda init bash && \ conda activate py37 && \ CHAINER_BUILD_CHAINERX=1 CHAINERX_BUILD_CUDA=1 pip install -vvvv --no-cache-dir cupy-cuda101==6.2.0 chainer==6.2.0 && \ conda install -c rdkit rdkit==2019.03.4.0 ADD conda-entrypoint.sh /conda-entrypoint.sh ENTRYPOINT [ "/conda-entrypoint.sh" ] ================================================ FILE: docker/conda/python37-chainerx-gpu-stable/conda-entrypoint.sh ================================================ #!/bin/bash . 
/opt/conda/etc/profile.d/conda.sh conda activate py37 exec "$@" ================================================ FILE: docker/python3/Dockerfile ================================================ FROM chainer/chainer:v6.1.0-python3 RUN apt-get update -y && \ apt-get install -y --no-install-recommends \ curl ca-certificates \ libboost-dev \ libboost-python-dev \ libboost-serialization-dev \ libboost-iostreams-dev \ libboost-thread-dev \ libboost-system-dev \ libeigen3-dev && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* # build & install rdkit ARG RDKIT_VERSION=Release_2017_09_3 RUN curl -sLo ${RDKIT_VERSION}.tar.gz https://github.com/rdkit/rdkit/archive/${RDKIT_VERSION}.tar.gz && \ tar xf ${RDKIT_VERSION}.tar.gz && \ mkdir -p rdkit-${RDKIT_VERSION}/build && \ base_dir=$(pwd) && \ cd rdkit-${RDKIT_VERSION}/build && \ cmake \ -D RDK_BUILD_SWIG_SUPPORT=OFF \ -D RDK_BUILD_PYTHON_WRAPPERS=ON \ -D RDK_BUILD_COMPRESSED_SUPPLIERS=ON \ -D RDK_BUILD_INCHI_SUPPORT=ON \ -D RDK_BUILD_AVALON_SUPPORT=ON \ -D RDK_BUILD_CPP_TESTS=OFF \ -D RDK_INSTALL_INTREE=OFF \ -D RDK_INSTALL_STATIC_LIBS=OFF \ -D PYTHON_EXECUTABLE=/usr/bin/python3.5 \ -D PYTHON_NUMPY_INCLUDE_PATH=/usr/local/lib/python3.5/dist-packages/numpy/core/include \ -D PYTHON_INSTDIR=/usr/local/lib/python3.5/dist-packages \ -D Python_ADDITIONAL_VERSIONS=3.5 \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=/usr/local \ .. && \ make -j $(nproc) && \ make install && \ cd "$base_dir" && \ rm -rf rdkit-${RDKIT_VERSION} ${RDKIT_VERSION}.tar.gz && \ ldconfig # install chainer-chemistry # matplotlib >= 3.1 requires upgrade of pip # pandas >= 0.25 doesn't support python3.5.2 which is installed for ubuntu16.04 RUN pip3 install --no-cache-dir matplotlib==3.0 pandas==0.24 chainer-chemistry ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line. 
SPHINXOPTS = SPHINXBUILD = sphinx-build SPHINXPROJ = Chainer-Chemistry SOURCEDIR = source BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/source/_autosummary_check.py ================================================ import inspect import os import types import chainer_chemistry.functions import chainer_chemistry.links import chainer_chemistry.models def _is_rst_exists(entity): return os.path.exists('source/generated/{}.rst'.format(entity)) def check(app, exception): missing_entities = [] missing_entities += [ name for name in _list_chainer_functions() if not _is_rst_exists(name)] missing_entities += [ name for name in _list_chainer_links() if not _is_rst_exists(name)] missing_entities += [ name for name in _list_chainer_models() if not _is_rst_exists(name)] if len(missing_entities) != 0: app.warn('\n'.join([ 'Undocumented entities found.', '', ] + missing_entities)) def _list_chainer_functions(): # List exported functions under chainer.functions. return ['chainer_chemistry.functions.{}'.format(name) for (name, func) in chainer_chemistry.functions.__dict__.items() if isinstance(func, types.FunctionType)] def _list_chainer_links(): # List exported classes under chainer.links. return ['chainer_chemistry.links.{}'.format(name) for (name, link) in chainer_chemistry.links.__dict__.items() if inspect.isclass(link)] def _list_chainer_models(): # List exported classes under chainer.links. 
return ['chainer_chemistry.models.{}'.format(name) for (name, model) in chainer_chemistry.models.__dict__.items() if inspect.isclass(model)] ================================================ FILE: docs/source/conf.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import pkg_resources import sys sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) import sphinx_rtd_theme import _autosummary_check __version__ = pkg_resources.get_distribution('chainer-chemistry').version # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.mathjax', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode', 'sphinx.ext.autosummary', 'sphinx.ext.napoleon'] autosummary_generate = True # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The master toctree document. 
master_doc = 'index' # General information about the project. project = 'Chainer Chemistry' copyright = '2017, Preferred Networks, Inc.' author = 'Preferred Networks, Inc.' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = __version__ # The full version, including alpha/beta/rc tags. release = __version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path exclude_patterns = [] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = "sphinx_rtd_theme" html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. 
# # This is required for the alabaster theme # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars html_sidebars = { '**': [ 'relations.html', # needs 'show_related': True theme option to display 'searchbox.html', ] } # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. htmlhelp_basename = 'Chainer-Chemistrydoc' # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, 'Chainer-Chemistry.tex', 'Chainer Chemistry Documentation', 'Preferred Networks, Inc.', 'manual'), ] # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ (master_doc, 'chainer-chemistry', 'Chainer Chemistry Documentation', [author], 1) ] # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'Chainer Chemistry', 'Chainer Chemistry Documentation', author, 'Chainer Chemistry', 'One line description of project.', 'Miscellaneous'), ] # Example configuration for intersphinx: refer to the Python standard library. 
intersphinx_mapping = {'https://docs.python.org/': None} def setup(app): app.connect('build-finished', _build_finished) def _build_finished(app, exception): if exception is None: _autosummary_check.check(app, exception) ================================================ FILE: docs/source/contribution.rst ================================================ ================== Contribution guide ================== We welcome any type of contribution that helps to improve and promote Chainer Chemistry. Typical contribution includes: * Send pull requests (PRs) to the `repository `_ (We recommend developers making PRs to read the :ref:`development-policy` before starting to implement). * Report bugs or problems as `issues `_. * Send questions to developer community sites like `Stackoverflow `_ or Chainer Slack (`en `_, `jp `_). * Write a blog post about Chainer Chemistry or its use case. ================================================ FILE: docs/source/dataset.rst ================================================ ======= Dataset ======= Converters ========== .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.dataset.converters.concat_mols Indexers ======== .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.dataset.indexer.BaseIndexer chainer_chemistry.dataset.indexer.BaseFeatureIndexer chainer_chemistry.dataset.indexers.NumpyTupleDatasetFeatureIndexer Parsers ======= .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.dataset.parsers.BaseParser chainer_chemistry.dataset.parsers.CSVFileParser chainer_chemistry.dataset.parsers.SDFFileParser chainer_chemistry.dataset.parsers.DataFrameParser chainer_chemistry.dataset.parsers.SmilesParser Preprocessors ============= Base preprocessors ------------------ .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.dataset.preprocessors.BasePreprocessor chainer_chemistry.dataset.preprocessors.MolPreprocessor Concrete preprocessors ---------------------- .. 
autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.dataset.preprocessors.AtomicNumberPreprocessor chainer_chemistry.dataset.preprocessors.ECFPPreprocessor chainer_chemistry.dataset.preprocessors.GGNNPreprocessor chainer_chemistry.dataset.preprocessors.NFPPreprocessor chainer_chemistry.dataset.preprocessors.SchNetPreprocessor chainer_chemistry.dataset.preprocessors.WeaveNetPreprocessor chainer_chemistry.dataset.preprocessors.RelGATPreprocessor chainer_chemistry.dataset.preprocessors.RelGCNPreprocessor chainer_chemistry.dataset.preprocessors.RSGCNPreprocessor Utilities --------- .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.dataset.preprocessors.MolFeatureExtractionError chainer_chemistry.dataset.preprocessors.type_check_num_atoms chainer_chemistry.dataset.preprocessors.construct_atomic_number_array chainer_chemistry.dataset.preprocessors.construct_adj_matrix Splitters ========== .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.dataset.splitters.RandomSplitter chainer_chemistry.dataset.splitters.StratifiedSplitter chainer_chemistry.dataset.splitters.ScaffoldSplitter ================================================ FILE: docs/source/datasets.rst ================================================ ======== Datasets ======== Dataset implementations ======================= .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.datasets.NumpyTupleDataset Dataset loaders =============== .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.datasets.tox21.get_tox21 chainer_chemistry.datasets.qm9.get_qm9 chainer_chemistry.datasets.molnet.get_molnet_dataset chainer_chemistry.datasets.molnet.get_molnet_dataframe ================================================ FILE: docs/source/development.rst ================================================ .. 
_development-policy: ================== Development policy ================== In this section, we describe the development policy that the core developers follow. Developers who are thinking to send PRs to the repository are encouraged to read the following sections before starting implementation. Versioning policy ================= Basically, we follow the `semantic versioning v2.0.0 `_. In Chainer Chemistry, *public APIs* in the sense of semantic versioning are ones in `the document `_. We follow these rules about versioning during the major version zero in addition to ones described in the the semantic versioning: * We do not plan any scheduled releases. * We do not plan any pre releases. * We release the minor version when the core development team agrees. Typically, we do so when (1) sufficient number of features are added since the last minor release (2) the latest release cannot run the example code in the master branch of the repository (3) critical bugs are found. But we are not restricted to them. * If we find critical bugs, we should release a patch version or a minor version that fixes them. The core development team will determine which version to release. We do not have a concrete plan about versioning strategy after v1.0.0. Compatibiity policy =================== As an immediate consequence of the semantic versioning, we may break compatibility of public APIs including addition, deletion, and changes in their semantics anytime in the major version zero. Since APIs of Chainer Chemistry are still immature and unstable, we expect introduction of new features can sometime involve compatibility break. If we are faced with a dilemma between cost for backward compatibility and benefit of new features, we are likely to give up the former because we want to place importance on introducing new features as soon as possible. Of course, we care backward compatibility whenever it is easy and low-cost. 
Like `ChainerCV `_, Chainer Chemistry provides several off-the-shelf deep learning models (e.g. Neural Finger Print) whose papers are available in such as arXiv or conferences related to machine learning. Although, most of published papers reports evaluation results of the models with publicly available datasets, we do *NOT* guarantee the reproducibility of experiments in the papers. At some point, coding examples in the master branch of the official repository may not work even with the latest release. In that case, users are recommended to either use the example code of the latest release or update the library code to the master branch. As of v0.3.0, we have introduced `BaseForwardModel`, which provides methods for serializing itself to and loading from a file. As these methods intenally use `pickle `_, portability of the class depends on that of pickling. Especially, serialized instances of `BaseForwardModel` made with older Chainer Chemistry may not be loaded with newer one, partly because we may change their internal structures for refactoring, performance improvement, and so on. See the document of `BaseForwardModel` and their subclasses (e.g. `Classifier`, `Regressor`). Branch strategy =============== The official repository of Chainer Chemistry is https://github.com/pfnet-research/chainer-chemistry. We use the *master* branch of the repository for development. Therefore, developer who makes PRs should send them to the master branch. During major version zero, we do not maintain any released versions. When a bug is found, changes for the bug should be merged to the next version (either minor or patch). If the bug is critical, we will release the next version as soon as possible. Coding guideline ================ We basically adopt `PEP8 _` as a style guide. You can check it with `flake8`, which we can install by:: $ pip install flake8 and run with ``flake8`` command. In addition to PEP8, we use upper camel case (e.g. 
``FooBar``) for class names and snake case (e.g. ``foo_bar``) for function, method, variable and package names. Although we recommend developers to follow these rules as well, they are not mandatory. For documents, we follow the `Google Python Style Guide `_ and compile it with `Napoleon `_, which is an extension of `Sphinx `_. Testing guideline ================= Chainer Chemistry uses `pytest `_ as a unit-test framework. All unit tests are located in ``tests/`` directory. We can run tests with normal usage of pytest. For example, the following command runs all unit tests:: $ pytest tests Some unit tests require GPUs, which are annotated with ``@pytest.mark.gpu``. Therefore, you can skip them with ``-m`` option:: $ pytest -m "not gpu" tests If a develop who write a unit test that uses GPUs, you must anotate it with ``@pytest.mark.gpu``. Similarly, some unit tests take long time to complete. We annotated them with ``@pytest.mark.slow`` and can skip them with ``-m`` option:: $ pytest -m "not slow" tests Any unit test that uses GPUs muct be annotated with ``@pytest.mark.slow``. We can skip both GPU and slow tests with the following command:: $ pytest -m "not (gpu or slow)" tests Terminology =========== In the context of machine learning, especially chemoinformatics, we use several terms such as feature, feature vectors, descriptor and so on to indicate representation of inputs. To avoid disambiguity and align naming convention within the library code, we use these terms in the following way: * *Feature* is a representation of a sample of interest (typically molecules in Chainer Chemistry). * *Label* is a target value of we want to predict. * *Input feature* is a representation of a sample from which we want to predict the target value. For example, consider a suepervised learning task whose dataset consisting of input-output pairs ``((x_1, y_1), ..., (x_N, y_N))``, where ``N`` is the number of samples. 
In Chainer Chemistry ``x_i`` and ``y_i``
* Loaders for several well-known datasets (QM9, Tox21 etc.) Introductory to deep learning for molecules and Chainer Chemistry is also available `here (SlideShare) `_. .. toctree:: :maxdepth: 1 :caption: Contents install tutorial contribution development reference ================================================ FILE: docs/source/install.rst ================================================ ============ Installation ============ Dependency ======================== Following packages are required to install Chainer Chemistry and are automatically installed when you install the library by `pip` command. * `chainer `_ * `pandas `_ * `scikit-learn `_ * `tqdm `_ Also, it uses following library, which you need to manually install. * `rdkit `_ See the `official document `_ for installation. If you have setup ``anaconda``, you may install ``rdkit`` by following command:: $ conda install -c rdkit rdkit Install via pip ======================== It can be installed by ``pip`` command:: $ pip install chainer-chemistry Install from source ======================== The tarball of the source tree is available via ``pip download chainer-chemistry``. You can use ``setup.py`` to install Chainer Chemistry from the tarball:: $ tar zxf chainer-chemistry-x.x.x.tar.gz $ cd chainer-chemistry-x.x.x $ python setup.py install Install from the latest source from the master branch:: $ git clone https://github.com/pfnet-research/chainer-chemistry.git $ pip install -e chainer-chemistry Run example training code ========================= `The official repository `_ provides examples of training several graph convolution networks. 
The code can be obtained by cloning the repository:: $ git clone https://github.com/pfnet-research/chainer-chemistry.git The following code is how to train Neural Fingerprint (NFP) with the Tox21 dataset on CPU:: $ cd chainer-chemistry/examples/tox21 $ python train_tox21.py --method=nfp --gpu=-1 # set --gpu=0 if you have GPU ================================================ FILE: docs/source/iterators.rst ================================================ ========= Iterators ========= Iterator Implementations ======================== .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.iterators.BalancedSerialIterator chainer_chemistry.iterators.IndexIterator ================================================ FILE: docs/source/links.rst ================================================ ===== Links ===== Link implementations ==================== .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.links.EmbedAtomID chainer_chemistry.links.GraphLinear chainer_chemistry.links.GraphBatchNormalization Scaler implementations ====================== .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.links.StandardScaler Update implementations ====================== .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.links.GGNNUpdate chainer_chemistry.links.NFPUpdate chainer_chemistry.links.RelGATUpdate chainer_chemistry.links.RelGCNUpdate chainer_chemistry.links.RSGCNUpdate chainer_chemistry.links.SchNetUpdate Readout implementations ======================= .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.links.GeneralReadout chainer_chemistry.links.GGNNReadout chainer_chemistry.links.NFPReadout chainer_chemistry.links.SchNetReadout ================================================ FILE: docs/source/models.rst ================================================ ====== Models ====== Model implementations ===================== .. 
autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.models.NFP chainer_chemistry.models.GGNN chainer_chemistry.models.MLP chainer_chemistry.models.SchNet chainer_chemistry.models.WeaveNet chainer_chemistry.models.RelGAT chainer_chemistry.models.RelGCN chainer_chemistry.models.RSGCN Wrapper models ============== .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.models.BaseForwardModel chainer_chemistry.models.Classifier chainer_chemistry.models.Regressor ================================================ FILE: docs/source/reference.rst ================================================ ============= API Reference ============= .. toctree:: :maxdepth: 1 dataset datasets functions iterators links models utils training ================================================ FILE: docs/source/requirements.txt ================================================ chainer scipy scikit-learn pandas tqdm ================================================ FILE: docs/source/training.rst ================================================ ========= Training ========= Extensions ========== .. autosummary:: :toctree: generated/ :nosignatures: chainer_chemistry.training.extensions.batch_evaluator.BatchEvaluator chainer_chemistry.training.extensions.roc_auc_evaluator.ROCAUCEvaluator chainer_chemistry.training.extensions.prc_auc_evaluator.PRCAUCEvaluator ================================================ FILE: docs/source/tutorial.rst ================================================ ============ Tutorial ============ Abstract ======================== In this tutorial, we predict Highest Occupied Molecular Orbital (HOMO) level of the molecules in `QM9 dataset `_ [1][2] by `Neural Finger Print (NFP) `_ [3][4]. We concentrate on exaplaining usage of Chainer Chemistry briefly and do not look over the detail of NFP implementation. .. 
_environment: Tested Environment ======================== - Chainer Chemistry >= 0.0.1 (See :doc:`install`) - Chainer >= 2.0.2 - CUDA == 8.0, CuPy >= 1.0.3 (Required only when using GPU) - For CUDA 9.0, CuPy >= 2.0.0 is required - sklearn >= 0.17.1 (Only for preprocessing) QM9 Dataset ======================== QM9 is a publicly available dataset of small organic molecule structures and their simulated properties for data driven researches of material property prediction and chemical space exploration. It contains 133,885 stable small organic molecules made up of CHONF. The available properties are geometric, energetic, electronic, and thermodynamic ones. In this tutorial, we predict HOMO level in the properties. Physically, we need quantum chemical calculations to compute HOMO level. From mathematical viewpoint it requires a solution of an internal eigenvalue problem for a Hamiltonian matrix. It is a big challenge to predict HOMO level accurately by a neural network, because the network should approximate both calculating the Hamiltonian matrix and solving the internal eigenvalue problem. HOMO prediction by NFP ======================== At first you should clone the library repository from `GitHub `_. There is a Python script ``examples/qm9/train_qm9.py`` in the repository. It executes a whole training procedure, that is, downloads QM9 dataset, preprocess it, define an NFP model and run trainning on them. Execute the following commands on a machine satisfying the tested environment in :ref:`environment`. .. code-block:: shell ~$ git clone git@github.com:pfnet-research/chainer-chemistry.git ~$ cd chainer-chemistry/examples/qm9/ Hereafter all shell commands should be executed in this directory. If you are a beginner for Chainer, `Chainer handson `_ will greatly help you. Especially the explanation of inclusion relationship of Chainer classes in Sec. 4 in `Chap. 2 `_ is helpful when you read the sample script. 
Next the dataset preparation part and the model definition part in ``train_qm9.py`` are explained. If you are not interested in them, skip :ref:`dataset-preparation` and :ref:`model-definition`, and jump to :ref:`run`. .. _dataset-preparation: Dataset Preparation ------------------------ Chainer Chemistry accepts the same dataset type with Chainer, such as ``chainer.datasets.SubDataset``. In this section we learn how to download QM9 dataset and use it as a Chainer dataset. The following Python script downloads and saves the dataset in ``.npz`` format. .. code-block:: python #!/usr/bin/env python from chainer_chemistry import datasets as D from chainer_chemistry.dataset.preprocessors import preprocess_method_dict from chainer_chemistry.datasets import NumpyTupleDataset preprocessor = preprocess_method_dict['nfp']() dataset = D.get_qm9(preprocessor, labels='homo') cache_dir = 'input/nfp_homo/' os.makedirs(cache_dir) NumpyTupleDataset.save(cache_dir + 'data.npz', dataset) The last two lines save the dataset to ``input/nfp_homo/data.npz`` and we need not to download the dataset next time. The following Python script read the dataset from the saved ``.npz`` file and split the data points into training and validation sets. .. code-block:: python #!/usr/bin/env python from chainer.datasets import split_dataset_random from chainer_chemistry import datasets as D from chainer_chemistry.dataset.preprocessors import preprocess_method_dict from chainer_chemistry.datasets import NumpyTupleDataset cache_dir = 'input/nfp_homo/' dataset = NumpyTupleDataset.load(cache_dir + 'data.npz') train_data_ratio = 0.7 train_data_size = int(len(dataset) * train_data_ratio) train, val = split_dataset_random(dataset, train_data_size, 777) print('train dataset size:', len(train)) print('validation dataset size:', len(val)) The function ``split_dataset_random()`` returns a tuple of two ``chainer.datasets.SubDataset`` objects (training and validation set). 
Now you have prepared training and validation data points and you can construct ``chainer.iterator.Iterator`` objects, needed for updaters in Chainer. .. _model-definition: Model Definition ------------------------ In Chainer, a neural network model is defined as a ``chainer.Chain`` object. Graph convolutional networks such as NFP are generally connection of graph convolution layers and multi perceptron layers. Therefore it is convenient to define a class which inherits ``chainer.Chain`` and compose two ``chainer.Chain`` objects corresponding to the two kind of layers. Execute the following Python script and check you can define such a class. ``NFP`` and ``MLP`` are already defined ``chainer.Chain`` classes. .. code-block:: python #!/usr/bin/env python import chainer from chainer_chemistry.models import MLP, NFP class GraphConvPredictor(chainer.Chain): def __init__(self, graph_conv, mlp): super(GraphConvPredictor, self).__init__() with self.init_scope(): self.graph_conv = graph_conv self.mlp = mlp def __call__(self, atoms, adjs): x = self.graph_conv(atoms, adjs) x = self.mlp(x) return x n_unit = 16 conv_layers = 4 model = GraphConvPredictor(NFP(n_unit, n_unit, conv_layers), MLP(n_unit, 1)) .. _run: Run ------------------------ You have defined the dataset and the NFP model on Chainer. There are no other procedures specific to Chainer Chemistry. Hereafter you should just follow the usual procedures in Chainer to execute training. The sample script ``examples/qm9/train_qm9.py`` contains all the procedures and you can execute training just by invoking the script. The following command starts training for 20 epochs and reports loss and accuracy during training. They are reported for each of ``main`` (dataset for training) and ``validation`` (dataset for validation). The ``--gpu 0`` option is to utilize a GPU with device id = 0. If you do not have a GPU, set ``--gpu -1`` or just drop ``--gpu 0`` to use CPU for all the calculation. 
In most cases, calculation with GPU is much faster than that only with CPU. .. code-block:: shell ~/chainer-chemistry/examples/qm9$ python train_qm9.py --method nfp --label homo --gpu 0 # If GPU is unavailable, set --gpu -1 Train NFP model... epoch main/loss main/accuracy validation/main/loss validation/main/accuracy elapsed_time 1 0.746135 0.0336724 0.680088 0.0322597 58.4605 2 0.642823 0.0311715 0.622942 0.0307055 113.748 (...) 19 0.540646 0.0277585 0.532406 0.0276445 1052.41 20 0.537062 0.0276631 0.551695 0.0277499 1107.29 After finished, you will find ``log`` file in ``result/`` directory. Evaluation ------------------------ In the loss and accuracy report, we are mainly interested in ``validation/main/accuracy``. Although it decreases during training, the ``accuracy`` field is actually mean absolute error. The unit is Hartree. Therefore the last line means validation mean absolute error is 0.0277499 Hartree. See ``scaled_abs_error()`` function in ``train_qm9.py`` for the detailed definition of mean absolute error. .. 1 kcal/mol = 0.0016 Hartree = 0.043 eV = 500 K .. 17.4133 kcal/mol = 0.0277499 Hartree = 0.755114 eV = 8762.78 K .. DFT error of HOMO level reported in https://arxiv.org/pdf/1702.05532.pdf is 2.0 eV = 0.073 Hartree. You can also train other type models like GGNN, SchNet or WeaveNet, and other target values like LUMO, dipole moment and internal energy, just by changing ``--model`` and ``--label`` options, respectively. See output of ``python train_qm9.py --help``. Using your own dataset ======================== You can use your own dataset in Chainer Chemistry. `example/own_dataset `_ shows an example. Reference ======================== [1] L. Ruddigkeit, R. van Deursen, L. C. Blum, J.-L. Reymond, Enumeration of 166 billion organic small molecules in the chemical universe database GDB-17, J. Chem. Inf. Model. 52, 2864–2875, 2012. [2] R. Ramakrishnan, P. O. Dral, M. Rupp, O. A. 
von Lilienfeld, Quantum chemistry structures and properties of 134 kilo molecules, Scientific Data 1, 140022, 2014. [3] Duvenaud, D. K., Maclaurin, D., Iparraguirre, J., Bombarell, R., Hirzel, T., Aspuru-Guzik, A., & Adams, R. P. (2015). Convolutional networks on graphs for learning molecular fingerprints. In Advances in neural information processing systems (pp. 2224-2232). [4] Gilmer, J., Schoenholz, S. S., Riley, P. F., Vinyals, O., & Dahl, G. E. (2017). Neural message passing for quantum chemistry. arXiv preprint arXiv:1704.01212. ================================================ FILE: docs/source/utils.rst ================================================ ========= Utilities ========= ================================================ FILE: examples/.gitignore ================================================ result/ ================================================ FILE: examples/README.md ================================================ # Chainer Chemistry examples These examples are implemented to train the model. * Tox21: 12 types of toxity classification * QM9: Chemical property regression * Own dataset: Own dataset (prepared in csv format) regression * Molcule Net: Various dataset for both classification and regression ## Test To test code of all examples, run ``` bash -x test_examples.sh -1 # for CPU bash -x test_examples.sh 0 # for GPU ``` If you encounter errors, please report them to [Github issues](https://github.com/pfnet-research/chainer-chemistry/issues) along with error logs. We appreciate your help. ================================================ FILE: examples/molnet/README.md ================================================ # MoleculeNet [MoleculeNet](http://moleculenet.ai/) provides various dataset, which ranges Physics, Chemistry, Bio and Physiology. You can specify dataset type, and train the model for the dataset. ## How to run the code ### Train the model by specifying dataset You can specify dataset type by `--dataset` option. 
Please refer [molnet_config.py](https://github.com/pfnet-research/chainer-chemistry/blob/master/chainer_chemistry/datasets/molnet/molnet_config.py) for the list of available dataset in Chainer Chemistry. For example, if you want to train "bbbp" dataset, With CPU: ```angular2html python train_molnet.py --dataset=bbbp ``` With GPU: ```angular2html python train_molnet.py --dataset=bbbp -g 0 ``` ================================================ FILE: examples/molnet/evaluate_models_molnet.sh ================================================ #!/usr/bin/env bash set -e # List of available datasets. # TODO: Investigate why training on `clearance` fails. datasets=(bace_Class bace_pIC50 bbbp clintox delaney HIV hopv lipo \ muv nci pcba ppb qm7 qm8 qm9 SAMPL sider tox21 toxcast) methods=(relgcn) # device identifier; set it to -1 to train on the CPU (default). device=${1:--1} # Remove directories with previously trained models. [ -d result ] && rm -rf result for dataset in ${datasets[@]}; do for method in ${methods[@]}; do python train_molnet.py \ --dataset ${dataset} \ --method ${method} \ --device ${device} \ --epoch 1 \ --unit-num 10 \ --conv-layers 1 \ --num-data 100 \ --out result python predict_molnet.py \ --dataset ${dataset} \ --method ${method} \ --in-dir result \ --device ${device} \ --num-data 100 done done ================================================ FILE: examples/molnet/predict_molnet.py ================================================ #!/usr/bin/env python from __future__ import print_function import argparse import os import chainer from chainer.iterators import SerialIterator from chainer.training.extensions import Evaluator from chainer_chemistry.training.extensions.roc_auc_evaluator import ROCAUCEvaluator # NOQA # Proposed by Ishiguro # ToDo: consider go/no-go with following modification # Re-load the best-validation score snapshot using serializers # from chainer import serializers from chainer_chemistry.dataset.converters import converter_method_dict 
from chainer_chemistry.datasets import NumpyTupleDataset from chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config # NOQA from chainer_chemistry.models.prediction import Classifier from chainer_chemistry.models.prediction import Regressor from chainer_chemistry.utils import save_json # These import is necessary for pickle to work from chainer_chemistry.links.scaler.standard_scaler import StandardScaler # NOQA from chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor # NOQA from train_molnet import dataset_part_filename from train_molnet import download_entire_dataset def parse_arguments(): # Lists of supported preprocessing methods/models. method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'relgat', 'gin', 'gnnfilm', 'megnet', 'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm'] # scale_list = ['standardize', 'none'] dataset_names = list(molnet_default_config.keys()) # Set up the argument parser. parser = argparse.ArgumentParser(description='Prediction on Molnet.') parser.add_argument('--dataset', '-d', type=str, choices=dataset_names, default='bbbp', help='name of the dataset that training is run on') parser.add_argument('--method', '-m', type=str, choices=method_list, help='method name', default='nfp') parser.add_argument('--label', '-l', type=str, default='', help='target label for regression; empty string means ' 'predicting all properties at once') # parser.add_argument('--scale', type=str, choices=scale_list, # help='label scaling method', default='standardize') parser.add_argument( '--device', type=str, default='-1', help='Device specifier. Either ChainerX device specifier or an ' 'integer. If non-negative integer, CuPy arrays with specified ' 'device id are used. 
If negative integer, NumPy arrays are used') parser.add_argument('--in-dir', '-i', type=str, default='result', help='directory to load model data from') parser.add_argument('--num-data', type=int, default=-1, help='amount of data to be parsed; -1 indicates ' 'parsing all data.') return parser.parse_args() def main(): args = parse_arguments() # Set up some useful variables that will be used later on. dataset_name = args.dataset method = args.method num_data = args.num_data if args.label: labels = args.label cache_dir = os.path.join('input', '{}_{}_{}'.format(dataset_name, method, labels)) else: labels = None cache_dir = os.path.join('input', '{}_{}_all'.format(dataset_name, method)) # Load the cached dataset. filename = dataset_part_filename('test', num_data) path = os.path.join(cache_dir, filename) if os.path.exists(path): print('Loading cached dataset from {}.'.format(path)) test = NumpyTupleDataset.load(path) else: _, _, test = download_entire_dataset(dataset_name, num_data, labels, method, cache_dir) # Model-related data is stored this directory. model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir)) model_filename = {'classification': 'classifier.pkl', 'regression': 'regressor.pkl'} task_type = molnet_default_config[dataset_name]['task_type'] model_path = os.path.join(model_dir, model_filename[task_type]) print("model_path=" + model_path) print('Loading model weights from {}...'.format(model_path)) device = chainer.get_device(args.device) if task_type == 'classification': model = Classifier.load_pickle(model_path, device=device) elif task_type == 'regression': model = Regressor.load_pickle(model_path, device=device) else: raise ValueError('Invalid task type ({}) encountered when processing ' 'dataset ({}).'.format(task_type, dataset_name)) # Re-load the best-validation score snapshot # serializers.load_npz(os.path.join( # model_dir, "best_val_" + model_filename[task_type]), model) # Run an evaluator on the test dataset. 
print('Evaluating...') converter = converter_method_dict[method] test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False) eval_result = Evaluator(test_iterator, model, converter=converter, device=device)() print('Evaluation result: ', eval_result) # Add more stats if task_type == 'regression': # loss = cuda.to_cpu(numpy.array(eval_result['main/loss'])) # eval_result['main/loss'] = loss # convert to native values.. for k, v in eval_result.items(): eval_result[k] = float(v) elif task_type == "classification": # For Classifier, we do not equip the model with ROC-AUC evalation # function. use separate ROC-AUC Evaluator rocauc_result = ROCAUCEvaluator( test_iterator, model, converter=converter, device=device, eval_func=model.predictor, name='test', ignore_labels=-1)() print('ROCAUC Evaluation result: ', rocauc_result) save_json(os.path.join(model_dir, 'rocauc_result.json'), rocauc_result) else: print('[WARNING] unknown task_type {}.'.format(task_type)) # Save the evaluation results. save_json(os.path.join(model_dir, 'eval_result.json'), eval_result) if __name__ == '__main__': main() ================================================ FILE: examples/molnet/summary_eval_molnet.py ================================================ #! 
-*- coding: utf-8 -*- import argparse import json import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import os import seaborn as sns import numpy as np from chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config # NOQA from pandas import DataFrame def save_evaluation_plot(x, y_mean, metric, dataset_name, filename): plt.figure() sns.set() ax = sns.barplot(y=x, x=y_mean) # If "text" does not work, change the attribute name to "s" for n, (label, _y) in enumerate(zip(x, y_mean)): ax.annotate( s='{:.3f}'.format(abs(_y)), xy=(_y, n), ha='right', va='center', xytext=(-5, 0), textcoords='offset points', color='white') plt.title('Performance on ' + dataset_name) plt.xlabel(metric) plt.savefig(filename) def main(): parser = argparse.ArgumentParser() parser.add_argument('--prefix', required=True) parser.add_argument('--methods', nargs='+', required=True) parser.add_argument('--dataset', required=True) parser.add_argument('--runs', type=int, required=True) parser.add_argument('--out_prefix', default="result_") args = parser.parse_args() # # load the config file in the designated directory # dataset_name = args.dataset task_type = molnet_default_config[dataset_name]['task_type'] print('task type=\'' + str(task_type) + "\'") if task_type=='regression': metrics = ['main/MAE', 'main/RMSE'] elif task_type=='classification': metrics = ['test/main/roc_auc'] x = args.methods for metric in metrics: y = np.zeros( (len(args.methods), args.runs) ) for m, method in enumerate(args.methods): for run in range(0, args.runs): #for run in range(1, args.runs+1): with open(os.path.join(args.prefix + "_" + method + "_" + str(run), 'eval_result.json')) as f: result = json.load(f) y[m, run-1,] = result[metric] # end with # end run-for # end method-for metric_lastslash = metric.rindex("/") metric_name = metric[metric_lastslash+1:] # draw figure save_evaluation_plot(x, np.mean(y, axis=1), metric, dataset_name, args.out_prefix + metric_name + '.png') 
save_evaluation_plot(x, np.mean(y, axis=1), metric, dataset_name, args.out_prefix + metric_name + '.pdf') # output as text. mean/std y_mean = np.mean(y, axis=1) y_std = np.std(y, axis=1) with open(args.out_prefix + "_summary_" + metric_name + ".tsv", "w") as fout: for m, method in enumerate(args.methods): fout.write(method + "\t" + str(y_mean[m]) + "\t" + str(y_std[m]) + "\n") # end-for # end with # end metric-for if __name__ == "__main__": main() ================================================ FILE: examples/molnet/test_molnet.sh ================================================ #!/usr/bin/env bash set -e # List of available datasets. # TODO: Investigate why training on `clearance` fails. datasets=(bace_Class bace_pIC50 bbbp clintox delaney HIV hopv lipo \ muv nci pcba ppb qm7 qm8 qm9 SAMPL sider tox21 toxcast) # device identifier; set it to -1 to train on the CPU (default). device=${1:--1} # Remove directories with previously trained models. [ -d input ] && rm -rf input for dataset in ${datasets[@]} do # Run the training script for the current dataset. 
python train_molnet.py \ --dataset $dataset \ --method nfp \ --conv-layers 1 \ --device ${device} \ --epoch 1 \ --unit-num 10 \ --out nfp_${dataset} \ --batchsize 32 \ --num-data=100 done ================================================ FILE: examples/molnet/train_molnet.py ================================================ #!/usr/bin/env python from __future__ import print_function import argparse import numpy import os import types import chainer from chainer import iterators from chainer import optimizers from chainer import training from chainer.training import extensions as E from chainer_chemistry.dataset.converters import converter_method_dict from chainer_chemistry.dataset.preprocessors import preprocess_method_dict from chainer_chemistry import datasets as D from chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config # NOQA from chainer_chemistry.datasets import NumpyTupleDataset from chainer_chemistry.links import StandardScaler from chainer_chemistry.models.prediction import Classifier from chainer_chemistry.models.prediction import Regressor from chainer_chemistry.models.prediction import set_up_predictor from chainer_chemistry.training.extensions import BatchEvaluator, ROCAUCEvaluator # NOQA from chainer_chemistry.training.extensions.auto_print_report import AutoPrintReport # NOQA from chainer_chemistry.utils import save_json def parse_arguments(): # Lists of supported preprocessing methods/models and datasets. 
method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'relgat', 'gin', 'gnnfilm', 'megnet', 'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm'] dataset_names = list(molnet_default_config.keys()) scale_list = ['standardize', 'none'] parser = argparse.ArgumentParser(description='molnet example') parser.add_argument('--method', '-m', type=str, choices=method_list, help='method name', default='nfp') parser.add_argument('--label', '-l', type=str, default='', help='target label for regression; empty string means ' 'predicting all properties at once') parser.add_argument('--conv-layers', '-c', type=int, default=4, help='number of convolution layers') parser.add_argument('--batchsize', '-b', type=int, default=32, help='batch size') parser.add_argument( '--device', type=str, default='-1', help='Device specifier. Either ChainerX device specifier or an ' 'integer. If non-negative integer, CuPy arrays with specified ' 'device id are used. If negative integer, NumPy arrays are used') parser.add_argument('--out', '-o', type=str, default='result', help='path to save the computed model to') parser.add_argument('--epoch', '-e', type=int, default=20, help='number of epochs') parser.add_argument('--unit-num', '-u', type=int, default=16, help='number of units in one layer of the model') parser.add_argument('--dataset', '-d', type=str, choices=dataset_names, default='bbbp', help='name of the dataset that training is run on') parser.add_argument('--protocol', type=int, default=2, help='pickle protocol version') parser.add_argument('--num-data', type=int, default=-1, help='amount of data to be parsed; -1 indicates ' 'parsing all data.') parser.add_argument('--scale', type=str, choices=scale_list, help='label scaling method', default='standardize') return parser.parse_args() def dataset_part_filename(dataset_part, num_data): """Returns the filename corresponding to a train/valid/test parts of a dataset, based on the amount of data samples that need to be parsed. 
Args: dataset_part: String containing any of the following 'train', 'valid' or 'test'. num_data: Amount of data samples to be parsed from the dataset. """ if num_data >= 0: return '{}_data_{}.npz'.format(dataset_part, str(num_data)) return '{}_data.npz'.format(dataset_part) def download_entire_dataset(dataset_name, num_data, labels, method, cache_dir): """Downloads the train/valid/test parts of a dataset and stores them in the cache directory. Args: dataset_name: Dataset to be downloaded. num_data: Amount of data samples to be parsed from the dataset. labels: Target labels for regression. method: Method name. See `parse_arguments`. cache_dir: Directory to store the dataset to. """ print('Downloading {}...'.format(dataset_name)) preprocessor = preprocess_method_dict[method]() # Select the first `num_data` samples from the dataset. target_index = numpy.arange(num_data) if num_data >= 0 else None dataset_parts = D.molnet.get_molnet_dataset(dataset_name, preprocessor, labels=labels, target_index=target_index) dataset_parts = dataset_parts['dataset'] # Cache the downloaded dataset. if not os.path.exists(cache_dir): os.makedirs(cache_dir) for i, part in enumerate(['train', 'valid', 'test']): filename = dataset_part_filename(part, num_data) path = os.path.join(cache_dir, filename) NumpyTupleDataset.save(path, dataset_parts[i]) return dataset_parts def fit_scaler(datasets): """Standardizes (scales) the dataset labels. Args: datasets: Tuple containing the datasets. Returns: Datasets with standardized labels and the scaler object. """ scaler = StandardScaler() # Collect all labels in order to apply scaling over the entire dataset. labels = None offsets = [] for dataset in datasets: if labels is None: labels = dataset.get_datasets()[-1] else: labels = numpy.vstack([labels, dataset.get_datasets()[-1]]) offsets.append(len(labels)) scaler.fit(labels) return scaler def main(): args = parse_arguments() # Set up some useful variables that will be used later on. 
dataset_name = args.dataset method = args.method num_data = args.num_data n_unit = args.unit_num conv_layers = args.conv_layers task_type = molnet_default_config[dataset_name]['task_type'] model_filename = {'classification': 'classifier.pkl', 'regression': 'regressor.pkl'} print('Using dataset: {}...'.format(dataset_name)) # Set up some useful variables that will be used later on. if args.label: labels = args.label cache_dir = os.path.join('input', '{}_{}_{}'.format(dataset_name, method, labels)) class_num = len(labels) if isinstance(labels, list) else 1 else: labels = None cache_dir = os.path.join('input', '{}_{}_all'.format(dataset_name, method)) class_num = len(molnet_default_config[args.dataset]['tasks']) # Load the train and validation parts of the dataset. filenames = [dataset_part_filename(p, num_data) for p in ['train', 'valid']] paths = [os.path.join(cache_dir, f) for f in filenames] if all([os.path.exists(path) for path in paths]): dataset_parts = [] for path in paths: print('Loading cached dataset from {}.'.format(path)) dataset_parts.append(NumpyTupleDataset.load(path)) else: dataset_parts = download_entire_dataset(dataset_name, num_data, labels, method, cache_dir) train, valid = dataset_parts[0], dataset_parts[1] # Scale the label values, if necessary. scaler = None if args.scale == 'standardize': if task_type == 'regression': print('Applying standard scaling to the labels.') scaler = fit_scaler(dataset_parts) else: print('Label scaling is not available for classification tasks.') else: print('No label scaling was selected.') # Set up the predictor. predictor = set_up_predictor(method, n_unit, conv_layers, class_num, label_scaler=scaler) # Set up the iterators. train_iter = iterators.SerialIterator(train, args.batchsize) valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False, shuffle=False) # Load metrics for the current dataset. 
metrics = molnet_default_config[dataset_name]['metrics'] metrics_fun = {k: v for k, v in metrics.items() if isinstance(v, types.FunctionType)} loss_fun = molnet_default_config[dataset_name]['loss'] device = chainer.get_device(args.device) if task_type == 'regression': model = Regressor(predictor, lossfun=loss_fun, metrics_fun=metrics_fun, device=device) elif task_type == 'classification': model = Classifier(predictor, lossfun=loss_fun, metrics_fun=metrics_fun, device=device) else: raise ValueError('Invalid task type ({}) encountered when processing ' 'dataset ({}).'.format(task_type, dataset_name)) # Set up the optimizer. optimizer = optimizers.Adam() optimizer.setup(model) # Save model-related output to this directory. if not os.path.exists(args.out): os.makedirs(args.out) save_json(os.path.join(args.out, 'args.json'), vars(args)) model_dir = os.path.join(args.out, os.path.basename(cache_dir)) if not os.path.exists(model_dir): os.makedirs(model_dir) # Set up the updater. converter = converter_method_dict[method] updater = training.StandardUpdater(train_iter, optimizer, device=device, converter=converter) # Set up the trainer. 
trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir) trainer.extend(E.Evaluator(valid_iter, model, device=device, converter=converter)) trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch')) trainer.extend(E.LogReport()) # TODO: consider go/no-go of the following block # # (i) more reporting for val/evalutaion # # (ii) best validation score snapshot # if task_type == 'regression': # metric_name_list = list(metrics.keys()) # if 'RMSE' in metric_name_list: # trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]), # trigger=training.triggers.MinValueTrigger('validation/main/RMSE')) # elif 'MAE' in metric_name_list: # trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]), # trigger=training.triggers.MinValueTrigger('validation/main/MAE')) # else: # print("[WARNING] No validation metric defined?") # # elif task_type == 'classification': # train_eval_iter = iterators.SerialIterator( # train, args.batchsize, repeat=False, shuffle=False) # trainer.extend(ROCAUCEvaluator( # train_eval_iter, predictor, eval_func=predictor, # device=args.gpu, converter=concat_mols, name='train', # pos_labels=1, ignore_labels=-1, raise_value_error=False)) # # extension name='validation' is already used by `Evaluator`, # # instead extension name `val` is used. # trainer.extend(ROCAUCEvaluator( # valid_iter, predictor, eval_func=predictor, # device=args.gpu, converter=concat_mols, name='val', # pos_labels=1, ignore_labels=-1, raise_value_error=False)) # # trainer.extend(E.snapshot_object( # model, "best_val_" + model_filename[task_type]), # trigger=training.triggers.MaxValueTrigger('val/main/roc_auc')) # else: # raise NotImplementedError( # 'Not implemented task_type = {}'.format(task_type)) trainer.extend(AutoPrintReport()) trainer.extend(E.ProgressBar()) trainer.run() # Save the model's parameters. 
model_path = os.path.join(model_dir, model_filename[task_type]) print('Saving the trained model to {}...'.format(model_path)) model.save_pickle(model_path, protocol=args.protocol) if __name__ == '__main__': main() ================================================ FILE: examples/molnet_wle/README.md ================================================ # Weisfeiler-Lehman Embedding preprocessor implementations In this directory, we provide an implementaion of [Weisfeiler-Lehman Embedding (WLE)](https://arxiv.org/abs/2006.06909) [1] preprocessor for ChainerChemistry GNN models. ## How to run the code ### Test run command ```bash # Training tox21 dataset using RSGCN-CWLE model. Short 3 epoch for testing. python train_molnet_wle.py --dataset tox21 --method rsgcn_cwle --epoch 3 --device 0 # Prediction with trained model python predict_molnet_wle.py --dataset tox21 --method rsgcn_cwle --in-dir result --device 0 ``` ### Train the model by specifying dataset Basically, no changes from the original molnet examples (examples/molnet/train_molnet.py). The main difference is the choice of '--method' option. To test WLE, choose one of 'xxx_wle', 'xxx_cwle', and 'xxx_gwle' where 'xxx' is a GNN architecture identifier (e.g. 'rsgcn', 'relgat'). - xxx_wle: apply the naive WLE to the GNN 'xxx' - xxx_cwle (recommended): apply the Concat WLE to the GNN 'xxx' - xxx_gwle: apply the Gated-sum WLE to the GNN 'xxx' #### Additional options Introducing the WLE, we have some more additional options. In general you do not need to specify these options (use the default values!). ## Performance The paper [1] shows that the use of (C)WLE consistently improves the generalization (test) performance of the several GNN architectures (if hyperparameters are optimized by a Black-box optimizer such as [Optuna] (https://preferred.jp/ja/projects/optuna/). 
## References [1] Katsuhiko Ishiguro, Kenta Oono, and Kohei Hayashi, "Weisfeiler-Lehman Embedding for Molecular Graph Neural Networks", arXiv: 2006.06909, 2020. [paper link](https://arxiv.org/abs/2006.06909) ================================================ FILE: examples/molnet_wle/predict_molnet_wle.py ================================================ #!/usr/bin/env python from __future__ import print_function import argparse import os import chainer from chainer.iterators import SerialIterator from chainer.training.extensions import Evaluator from chainer_chemistry.training.extensions.roc_auc_evaluator import ROCAUCEvaluator # NOQA # Proposed by Ishiguro # ToDo: consider go/no-go with following modification # Re-load the best-validation score snapshot using serializers from chainer import serializers from chainer_chemistry.dataset.converters import concat_mols from chainer_chemistry.datasets import NumpyTupleDataset from chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config # NOQA from chainer_chemistry.models.prediction import Classifier from chainer_chemistry.models.prediction import Regressor from chainer_chemistry.utils import save_json # These import is necessary for pickle to work from chainer import functions as F from chainer_chemistry.links.scaler.standard_scaler import StandardScaler # NOQA from chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor # NOQA from train_molnet_wle import dict_for_wles from train_molnet_wle import dataset_part_filename from train_molnet_wle import download_entire_dataset dict_for_wles() def parse_arguments(): # Lists of supported preprocessing methods/models. 
method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'relgat', 'gin', 'gnnfilm', 'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm', 'nfp_wle', 'ggnn_wle', 'relgat_wle', 'relgcn_wle', 'rsgcn_wle', 'gin_wle', 'nfp_cwle', 'ggnn_cwle', 'relgat_cwle', 'relgcn_cwle', 'rsgcn_cwle', 'gin_cwle', 'nfp_gwle', 'ggnn_gwle', 'relgat_gwle', 'relgcn_gwle', 'rsgcn_gwle', 'gin_gwle'] # scale_list = ['standardize', 'none'] dataset_names = list(molnet_default_config.keys()) # Set up the argument parser. parser = argparse.ArgumentParser(description='Prediction on Molnet.') parser.add_argument('--dataset', '-d', type=str, choices=dataset_names, default='bbbp', help='name of the dataset that training is run on') parser.add_argument('--method', '-m', type=str, choices=method_list, help='method name', default='nfp') parser.add_argument('--label', '-l', type=str, default='', help='target label for regression; empty string means ' 'predicting all properties at once') # parser.add_argument('--scale', type=str, choices=scale_list, # help='label scaling method', default='standardize') parser.add_argument( '--device', type=str, default='-1', help='Device specifier. Either ChainerX device specifier or an ' 'integer. If non-negative integer, CuPy arrays with specified ' 'device id are used. If negative integer, NumPy arrays are used') parser.add_argument('--in-dir', '-i', type=str, default='result', help='directory to load model data from') parser.add_argument('--num-data', type=int, default=-1, help='amount of data to be parsed; -1 indicates ' 'parsing all data.') return parser.parse_args() def main(): args = parse_arguments() # Set up some useful variables that will be used later on. 
dataset_name = args.dataset method = args.method num_data = args.num_data if args.label: labels = args.label cache_dir = os.path.join('input', '{}_{}_{}'.format(dataset_name, method, labels)) else: labels = None cache_dir = os.path.join('input', '{}_{}_all'.format(dataset_name, method)) # Load the cached dataset. filename = dataset_part_filename('test', num_data) path = os.path.join(cache_dir, filename) if os.path.exists(path): print('Loading cached dataset from {}.'.format(path)) test = NumpyTupleDataset.load(path) else: _, _, test = download_entire_dataset(dataset_name, num_data, labels, method, cache_dir) # Model-related data is stored this directory. model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir)) model_filename = {'classification': 'classifier.pkl', 'regression': 'regressor.pkl'} task_type = molnet_default_config[dataset_name]['task_type'] model_path = os.path.join(model_dir, model_filename[task_type]) print("model_path=" + model_path) print('Loading model weights from {}...'.format(model_path)) device = chainer.get_device(args.device) if task_type == 'classification': model = Classifier.load_pickle(model_path, device=device) elif task_type == 'regression': model = Regressor.load_pickle(model_path, device=device) else: raise ValueError('Invalid task type ({}) encountered when processing ' 'dataset ({}).'.format(task_type, dataset_name)) # Re-load the best-validation score snapshot # serializers.load_npz(os.path.join( # model_dir, "best_val_" + model_filename[task_type]), model) # Run an evaluator on the test dataset. print('Evaluating...') test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False) eval_result = Evaluator(test_iterator, model, converter=concat_mols, device=device)() print('Evaluation result: ', eval_result) # Add more stats if task_type == 'regression': # loss = cuda.to_cpu(numpy.array(eval_result['main/loss'])) # eval_result['main/loss'] = loss # convert to native values.. 
for k, v in eval_result.items(): eval_result[k] = float(v) elif task_type == "classification": # For Classifier, we do not equip the model with ROC-AUC evalation # function. use separate ROC-AUC Evaluator rocauc_result = ROCAUCEvaluator( test_iterator, model, converter=concat_mols, device=device, eval_func=model.predictor, name='test', ignore_labels=-1)() print('ROCAUC Evaluation result: ', rocauc_result) # add for k, v in rocauc_result.items(): eval_result[k] = float(v) #save_json(os.path.join(model_dir, 'rocauc_result.json'), rocauc_result) else: print('[WARNING] unknown task_type {}.'.format(task_type)) # Save the evaluation results. save_json(os.path.join(model_dir, 'eval_result.json'), eval_result) if __name__ == '__main__': main() ================================================ FILE: examples/molnet_wle/train_molnet_wle.py ================================================ #!/usr/bin/env python from __future__ import print_function import argparse import numpy import os import types import pickle import chainer from chainer import iterators from chainer import optimizers from chainer import training from chainer.training import extensions as E from chainer_chemistry.dataset.converters import converter_method_dict from chainer_chemistry.dataset.preprocessors import preprocess_method_dict, wle from chainer_chemistry import datasets as D from chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config # NOQA from chainer_chemistry.datasets import NumpyTupleDataset from chainer_chemistry.dataset.splitters.deepchem_scaffold_splitter import DeepChemScaffoldSplitter # NOQA from chainer_chemistry.links import StandardScaler from chainer_chemistry.models.prediction import Classifier from chainer_chemistry.models.prediction import Regressor from chainer_chemistry.models.prediction import set_up_predictor from chainer_chemistry.training.extensions.auto_print_report import AutoPrintReport # NOQA from chainer_chemistry.utils import save_json from 
chainer_chemistry.models.cwle.cwle_graph_conv_model import MAX_WLE_NUM def dict_for_wles(): wle_keys = ['nfp_wle', 'ggnn_wle', 'relgat_wle', 'relgcn_wle', 'rsgcn_wle', 'gin_wle', 'nfp_cwle', 'ggnn_cwle', 'relgat_cwle', 'relgcn_cwle', 'rsgcn_cwle', 'gin_cwle', 'nfp_gwle', 'ggnn_gwle', 'relgat_gwle', 'relgcn_gwle', 'rsgcn_gwle', 'gin_gwle'] from chainer_chemistry.dataset.converters.concat_mols import concat_mols from chainer_chemistry.dataset.preprocessors.nfp_preprocessor import NFPPreprocessor from chainer_chemistry.dataset.preprocessors.ggnn_preprocessor import GGNNPreprocessor from chainer_chemistry.dataset.preprocessors.gin_preprocessor import GINPreprocessor from chainer_chemistry.dataset.preprocessors.relgat_preprocessor import RelGATPreprocessor from chainer_chemistry.dataset.preprocessors.relgcn_preprocessor import RelGCNPreprocessor from chainer_chemistry.dataset.preprocessors.rsgcn_preprocessor import RSGCNPreprocessor for key in wle_keys: converter_method_dict[key] = concat_mols if key.startswith('nfp'): preprocess_method_dict[key] = NFPPreprocessor elif key.startswith('ggnn'): preprocess_method_dict[key] = GGNNPreprocessor elif key.startswith('gin'): preprocess_method_dict[key] = GINPreprocessor elif key.startswith('relgcn'): preprocess_method_dict[key] = RelGCNPreprocessor elif key.startswith('rsgcn'): preprocess_method_dict[key] = RSGCNPreprocessor elif key.startswith('relgat'): preprocess_method_dict[key] = RelGATPreprocessor else: assert key in wle_keys # should be die dict_for_wles() def parse_arguments(): # Lists of supported preprocessing methods/models and datasets. 
method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'relgat', 'gin', 'gnnfilm', 'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm', 'nfp_wle', 'ggnn_wle', 'relgat_wle', 'relgcn_wle', 'rsgcn_wle', 'gin_wle', 'nfp_cwle', 'ggnn_cwle', 'relgat_cwle', 'relgcn_cwle', 'rsgcn_cwle', 'gin_cwle', 'nfp_gwle', 'ggnn_gwle', 'relgat_gwle', 'relgcn_gwle', 'rsgcn_gwle', 'gin_gwle'] dataset_names = list(molnet_default_config.keys()) scale_list = ['standardize', 'none'] parser = argparse.ArgumentParser(description='molnet example') parser.add_argument('--method', '-m', type=str, choices=method_list, help='method name', default='nfp') parser.add_argument('--label', '-l', type=str, default='', help='target label for regression; empty string means ' 'predicting all properties at once') parser.add_argument('--conv-layers', '-c', type=int, default=4, help='number of convolution layers') parser.add_argument('--batchsize', '-b', type=int, default=32, help='batch size') parser.add_argument( '--device', type=str, default='-1', help='Device specifier. Either ChainerX device specifier or an ' 'integer. If non-negative integer, CuPy arrays with specified ' 'device id are used. 
If negative integer, NumPy arrays are used') parser.add_argument('--out', '-o', type=str, default='result', help='path to save the computed model to') parser.add_argument('--epoch', '-e', type=int, default=20, help='number of epochs') parser.add_argument('--unit-num', '-u', type=int, default=16, help='number of units in one layer of the model') parser.add_argument('--dataset', '-d', type=str, choices=dataset_names, default='bbbp', help='name of the dataset that training is run on') parser.add_argument('--protocol', type=int, default=2, help='pickle protocol version') parser.add_argument('--num-data', type=int, default=-1, help='amount of data to be parsed; -1 indicates ' 'parsing all data.') parser.add_argument('--scale', type=str, choices=scale_list, help='label scaling method', default='standardize') parser.add_argument('--adam-alpha', type=float, help='alpha of adam', default=0.001) # WLE options parser.add_argument('--cutoff-wle', type=int, default=0, help="set more than zero to cut-off WL expanded labels") parser.add_argument('--hop-num', '-k', type=int, default=1, help="The number of iterations of WLs") return parser.parse_args() def dataset_part_filename(dataset_part, num_data): """Returns the filename corresponding to a train/valid/test parts of a dataset, based on the amount of data samples that need to be parsed. Args: dataset_part: String containing any of the following 'train', 'valid' or 'test'. num_data: Amount of data samples to be parsed from the dataset. """ if num_data >= 0: return '{}_data_{}.npz'.format(dataset_part, str(num_data)) return '{}_data.npz'.format(dataset_part) def download_entire_dataset(dataset_name, num_data, labels, method, cache_dir, apply_wle_flag=False, cutoff_wle=0, apply_cwle_flag=False, apply_gwle_flag=False, n_hop=1): """Downloads the train/valid/test parts of a dataset and stores them in the cache directory. Args: dataset_name: Dataset to be downloaded. num_data: Amount of data samples to be parsed from the dataset. 
labels: Target labels for regression. method: Method name. See `parse_arguments`. cache_dir: Directory to store the dataset to. apply_wle_flag: boolean, set True if you apply the naive WL embeddding cutoff_wle: int set more than zero to cut off WEEs apply_cwle_flag: boolean, set True if you apply Concatenating WLE (CWLE) apply_gwle_flag: boolean, set True if you apply Gated-sum WLE (GWLE) """ print('Downloading {}...'.format(dataset_name)) preprocessor = preprocess_method_dict[method]() # Select the first `num_data` samples from the dataset. target_index = numpy.arange(num_data) if num_data >= 0 else None # To force DeepChem scaffold split dc_scaffold_splitter = DeepChemScaffoldSplitter() dataset_parts = D.molnet.get_molnet_dataset(dataset_name, preprocessor, labels=labels, split=dc_scaffold_splitter, target_index=target_index) dataset_parts = dataset_parts['dataset'] # Cache the downloaded dataset. if not os.path.exists(cache_dir): os.makedirs(cache_dir) # apply Neighboring Label Expansion if apply_wle_flag: dataset_parts_expand, labels_expanded, labels_frequency = wle.apply_wle_for_datasets(dataset_parts, cutoff_wle, n_hop) dataset_parts = dataset_parts_expand num_expanded_symbols = len(labels_expanded) print("WLE Expanded Labels Applied to datasets: vocab=", num_expanded_symbols) print(labels_expanded) # save in text file_name = "WLE_labels.dat" path = os.path.join(cache_dir, file_name) with open(path, "w") as fout: for label in labels_expanded: fout.write(label + " " + str(labels_frequency[label]) + "\n") # save binaries file_name = "WLE_labels.pkl" outfile = cache_dir + "/" + file_name with open(outfile, "wb") as fout: pickle.dump( (labels_expanded, labels_frequency), fout) elif apply_cwle_flag: dataset_parts_expand, labels_expanded, labels_frequency = wle.apply_cwle_for_datasets(dataset_parts, n_hop) dataset_parts = dataset_parts_expand num_expanded_symbols = len(labels_expanded) print("Concatenating WLE Expanded Labels Applied to datasets: vocab=", 
num_expanded_symbols) print(labels_expanded) # save in text file_name = "CWLE_labels.dat" path = os.path.join(cache_dir, file_name) with open(path, "w") as fout: for label in labels_expanded: fout.write(label + " " + str(labels_frequency[label]) + "\n") # save binaries file_name = "CWLE_labels.pkl" outfile = cache_dir + "/" + file_name with open(outfile, "wb") as fout: pickle.dump( (labels_expanded, labels_frequency), fout) elif apply_gwle_flag: dataset_parts_expand, labels_expanded, labels_frequency = wle.apply_cwle_for_datasets(dataset_parts, n_hop) dataset_parts = dataset_parts_expand num_expanded_symbols = len(labels_expanded) print("Gated-sum WLE Expanded Labels Applied to datasets: vocab=", num_expanded_symbols) print(labels_expanded) # save in text file_name = "GWLE_labels.dat" path = os.path.join(cache_dir, file_name) with open(path, "w") as fout: for label in labels_expanded: fout.write(label + " " + str(labels_frequency[label]) + "\n") # save binaries file_name = "GWLE_labels.pkl" outfile = cache_dir + "/" + file_name with open(outfile, "wb") as fout: pickle.dump( (labels_expanded, labels_frequency), fout) else: labels_expanded = [] # ToDO: scaler should be placed here # ToDo: fit the scaler # ToDo: transform dataset_parts[0-2] for i, part in enumerate(['train', 'valid', 'test']): filename = dataset_part_filename(part, num_data) path = os.path.join(cache_dir, filename) if False: print(type(dataset_parts[i])) print(type(dataset_parts[i][0])) print(type(dataset_parts[i][0][0])) print(type(dataset_parts[i][0][1])) print(type(dataset_parts[i][0][2])) print(dataset_parts[i][0][0].shape) print(dataset_parts[i][0][1].shape) print(dataset_parts[i][0][2].shape) print(dataset_parts[i][0][0].dtype) print(dataset_parts[i][0][1].dtype) print(dataset_parts[i][0][2].dtype) NumpyTupleDataset.save(path, dataset_parts[i]) return dataset_parts def fit_scaler(datasets): """Standardizes (scales) the dataset labels. Args: datasets: Tuple containing the datasets. 
Returns: Datasets with standardized labels and the scaler object. """ scaler = StandardScaler() # Collect all labels in order to apply scaling over the entire dataset. labels = None offsets = [] for dataset in datasets: if labels is None: labels = dataset.get_datasets()[-1] else: labels = numpy.vstack([labels, dataset.get_datasets()[-1]]) offsets.append(len(labels)) scaler.fit(labels) return scaler def main(): args = parse_arguments() print(args) # Set up some useful variables that will be used later on. dataset_name = args.dataset method = args.method num_data = args.num_data n_unit = args.unit_num conv_layers = args.conv_layers adam_alpha = args.adam_alpha cutoff_wle = args.cutoff_wle n_hop = args.hop_num apply_wle_flag = method in ['nfp_wle', 'ggnn_wle', 'relgat_wle', 'relgcn_wle', 'rsgcn_wle', 'gin_wle'] apply_cwle_flag = method in ['nfp_cwle', 'ggnn_cwle', 'relgat_cwle', 'relgcn_cwle', 'rsgcn_cwle', 'gin_cwle'] apply_gwle_flag = method in ['nfp_gwle', 'ggnn_gwle', 'relgat_gwle', 'relgcn_gwle', 'rsgcn_gwle', 'gin_gwle'] task_type = molnet_default_config[dataset_name]['task_type'] model_filename = {'classification': 'classifier.pkl', 'regression': 'regressor.pkl'} print('Using dataset: {}...'.format(dataset_name)) # Set up some useful variables that will be used later on. if args.label: labels = args.label cache_dir = os.path.join('input', '{}_{}_{}'.format(dataset_name, method, labels)) class_num = len(labels) if isinstance(labels, list) else 1 else: labels = None cache_dir = os.path.join('input', '{}_{}_all'.format(dataset_name, method)) class_num = len(molnet_default_config[args.dataset]['tasks']) # Load the train and validation parts of the dataset. filenames = [dataset_part_filename(p, num_data) for p in ['train', 'valid', 'test']] # ToDo: We need to incoporeat scaler into download_entire_dataset, instead of predictors. 
    # Reuse the cached .npz parts when all three exist; otherwise download
    # (and, for WLE variants, expand) the dataset from scratch.
    paths = [os.path.join(cache_dir, f) for f in filenames]
    if all([os.path.exists(path) for path in paths]):
        dataset_parts = []
        for path in paths:
            print('Loading cached dataset from {}.'.format(path))
            dataset_parts.append(NumpyTupleDataset.load(path))
    else:
        dataset_parts = download_entire_dataset(dataset_name, num_data,
                                                labels, method, cache_dir,
                                                apply_wle_flag, cutoff_wle,
                                                apply_cwle_flag,
                                                apply_gwle_flag, n_hop)
    train, valid = dataset_parts[0], dataset_parts[1]

    # ToDo: scaler must be incorporated into download_entire_datasets. not here
    # Scale the label values, if necessary (regression tasks only).
    scaler = None
    if args.scale == 'standardize':
        if task_type == 'regression':
            print('Applying standard scaling to the labels.')
            scaler = fit_scaler(dataset_parts)
        else:
            print('Label scaling is not available for classification tasks.')
    else:
        print('No label scaling was selected.')
    # ToDo: set label_scaler always None

    # Set up the predictor. WLE variants need the expanded-label vocabulary
    # size so the input embedding is wide enough.
    if apply_wle_flag:
        # find the num_atoms
        max_symbol_index = wle.findmaxidx(dataset_parts)
        print("number of expanded symbols (WLE) = ", max_symbol_index)
        predictor = set_up_predictor(
            method, n_unit, conv_layers, class_num, label_scaler=scaler,
            n_atom_types=max_symbol_index)
    elif apply_cwle_flag or apply_gwle_flag:
        n_wle_types = wle.findmaxidx(dataset_parts, 'wle_label')
        # Kenta Oono (oono@preferred.jp)
        # In the previous implementation, we use MAX_WLE_NUM
        # as the dimension of one-hot vectors for WLE labels
        # when the model is CWLE or WLNE and hop_num k = 1.
        # When k >= 2, # of wle labels can be larger than MAX_WLE_NUM,
        # which causes an error.
        # Therefore, we have increased the dimension of vectors.
        # To align with the previous experiments,
        # we change n_wle_types only if it exceeds MAX_WLE_NUM.
        n_wle_types = max(n_wle_types, MAX_WLE_NUM)
        print("number of expanded symbols (CWLE/GWLE) = ", n_wle_types)
        predictor = set_up_predictor(
            method, n_unit, conv_layers, class_num, label_scaler=scaler,
            n_wle_types=n_wle_types)
    else:
        # Plain (non-WLE) models need no vocabulary-size overrides.
        predictor = set_up_predictor(
            method, n_unit, conv_layers, class_num, label_scaler=scaler)

    # Set up the iterators.
    train_iter = iterators.SerialIterator(train, args.batchsize)
    valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False,
                                          shuffle=False)

    # Load metrics for the current dataset. Only plain functions can be
    # attached to the model as metrics; other entries are filtered out here.
    metrics = molnet_default_config[dataset_name]['metrics']
    metrics_fun = {k: v for k, v in metrics.items()
                   if isinstance(v, types.FunctionType)}
    loss_fun = molnet_default_config[dataset_name]['loss']

    # Wrap the predictor in the task-appropriate model wrapper.
    device = chainer.get_device(args.device)
    if task_type == 'regression':
        model = Regressor(predictor, lossfun=loss_fun,
                          metrics_fun=metrics_fun, device=device)
    elif task_type == 'classification':
        model = Classifier(predictor, lossfun=loss_fun,
                           metrics_fun=metrics_fun, device=device)
    else:
        raise ValueError('Invalid task type ({}) encountered when processing '
                         'dataset ({}).'.format(task_type, dataset_name))

    # Set up the optimizer.
    optimizer = optimizers.Adam(alpha=adam_alpha)
    optimizer.setup(model)

    # Save model-related output to this directory.
    if not os.path.exists(args.out):
        os.makedirs(args.out)
    save_json(os.path.join(args.out, 'args.json'), vars(args))
    model_dir = os.path.join(args.out, os.path.basename(cache_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set up the updater. The converter is method-specific (e.g. concat_mols
    # vs. specialized converters for MEGNet/CGCNN-style inputs).
    converter = converter_method_dict[method]
    updater = training.StandardUpdater(train_iter, optimizer, device=device,
                                       converter=converter)

    # Set up the trainer.
    # Train for the requested number of epochs, evaluating on the validation
    # split and snapshotting at the end of training.
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)
    trainer.extend(E.Evaluator(valid_iter, model, device=device,
                               converter=converter))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())
    # TODO: consider go/no-go of the following block
    # # (i) more reporting for val/evalutaion
    # # (ii) best validation score snapshot
    # if task_type == 'regression':
    #     metric_name_list = list(metrics.keys())
    #     if 'RMSE' in metric_name_list:
    #         trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]),
    #                        trigger=training.triggers.MinValueTrigger('validation/main/RMSE'))
    #     elif 'MAE' in metric_name_list:
    #         trainer.extend(E.snapshot_object(model, "best_val_" + model_filename[task_type]),
    #                        trigger=training.triggers.MinValueTrigger('validation/main/MAE'))
    #     else:
    #         print("[WARNING] No validation metric defined?")
    #
    # elif task_type == 'classification':
    #     train_eval_iter = iterators.SerialIterator(
    #         train, args.batchsize, repeat=False, shuffle=False)
    #     trainer.extend(ROCAUCEvaluator(
    #         train_eval_iter, predictor, eval_func=predictor,
    #         device=args.gpu, converter=concat_mols, name='train',
    #         pos_labels=1, ignore_labels=-1, raise_value_error=False))
    #     # extension name='validation' is already used by `Evaluator`,
    #     # instead extension name `val` is used.
    #     trainer.extend(ROCAUCEvaluator(
    #         valid_iter, predictor, eval_func=predictor,
    #         device=args.gpu, converter=concat_mols, name='val',
    #         pos_labels=1, ignore_labels=-1, raise_value_error=False))
    #
    #     trainer.extend(E.snapshot_object(
    #         model, "best_val_" + model_filename[task_type]),
    #         trigger=training.triggers.MaxValueTrigger('val/main/roc_auc'))
    # else:
    #     raise NotImplementedError(
    #         'Not implemented task_type = {}'.format(task_type))
    trainer.extend(AutoPrintReport())
    trainer.extend(E.ProgressBar())
    trainer.run()

    # Save the model's parameters.
    # Pickle the whole trained model (classifier.pkl / regressor.pkl) so that
    # the prediction scripts can reload it with Model.load_pickle.
    model_path = os.path.join(model_dir, model_filename[task_type])
    print('Saving the trained model to {}...'.format(model_path))
    model.save_pickle(model_path, protocol=args.protocol)

    # dump the parameter, if CWLE
    # NOTE(review): disabled CWLE weight-dump/visualization code below;
    # kept verbatim, candidate for removal or extraction into a tool script.
    #if apply_cwle_flag:
    #    cwle = predictor.graph_conv
    #    #print(cwle)
    #    concatW = cwle.linear_for_concat_wle.W.data
    #    #print(type(concatW))
    #
    #    # dump the raw W
    #    out_prefix = args.out + "/" + method + "_" + dataset_name +"_learnedW"
    #    with open(out_prefix + ".dat", 'w') as fout:
    #        import csv
    #        writer = csv.writer(fout, lineterminator="\n")
    #        writer.writerows(concatW)
    #    # end with
    #
    #    import matplotlib
    #    matplotlib.use('Agg')
    #    import matplotlib.pyplot as plt
    #    # visualize
    #    fig1, ax1 = plt.subplots()
    #    plt.imshow(concatW, cmap="jet")
    #    plt.colorbar(ax=ax1)
    #
    #    plt.title('Learned W on ' + dataset_name + ' + ' + method)
    #    plt.savefig(out_prefix + ".png")
    #    plt.savefig(out_prefix + ".pdf")
    #
    #    # visualize the absolute value
    #    fig2, ax2 = plt.subplots()
    #    plt.imshow(numpy.abs(concatW), cmap="jet")
    #    plt.colorbar(ax=ax2)
    #
    #    plt.title('Learned abs(W) on ' + dataset_name + ' + ' + method)
    #    plt.savefig(out_prefix + "_abs.png")
    #    plt.savefig(out_prefix + "_abs.pdf")


if __name__ == '__main__':
    main()


================================================
FILE: examples/network_graph/README.md
================================================
# Network Node Classification Example

This example performs semi-supervised node classification.
## Dependencies

Before running the example, the following packages also need to be installed:
- [`matplotlib`](https://matplotlib.org/)
- [`seaborn`](https://seaborn.pydata.org/)
- [`scikit-learn`](http://scikit-learn.org/stable/)

## Supported datasets

- [Cora](https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz)
- [Citeseer](https://linqs-data.soe.ucsc.edu/public/lbc/citeseer.tgz)
- [Reddit](https://s3.us-east-2.amazonaws.com/dgl.ai/dataset/reddit.zip)
  - we use the dataset provided by the [dmlc/dgl](https://github.com/dmlc/dgl/blob/master/python/dgl/data/reddit.py) repository.

Note that the dataset is downloaded automatically.

## How to run the code

### Train a model

To train a model, run the following:

On the CPU:
```angular2html
python train_network_graph.py --dataset cora
```

Train a sparse model with a GPU:
```angular2html
python train_network_graph.py --dataset cora --device 0 --method gin_sparse
```

### Train a model with the reddit dataset

Because the reddit dataset is large, training on it can only be run with a specific configuration. Please turn on the `coo` option to run training on the reddit dataset.
```angular2html python train_network_graph.py --dataset reddit --device 0 --method gin --coo true ``` ================================================ FILE: examples/network_graph/citeseer/.gitignore ================================================ citeseer.cites citeseer.content README ================================================ FILE: examples/network_graph/cora/.gitignore ================================================ cora.cites cora.content README ================================================ FILE: examples/network_graph/padding_model_wrapper.py ================================================ import chainer from chainer_chemistry.dataset.graph_dataset.base_graph_data import PaddingGraphData # NOQA class PaddingModelWrapper(chainer.Chain): def __init__(self, predictor): super(PaddingModelWrapper, self).__init__() with self.init_scope(): self.predictor = predictor def forward(self, data): assert isinstance(data, PaddingGraphData) return self.predictor(data.x, data.adj) ================================================ FILE: examples/network_graph/reddit/.gitignore ================================================ reddit.zip reddit_data.npz reddit_graph.npz ================================================ FILE: examples/network_graph/train_network_graph.py ================================================ import argparse from distutils.util import strtobool import numpy from chainer_chemistry.datasets.citation_network.citation import citation_to_networkx # NOQA from chainer_chemistry.datasets.citation_network.citeseer import \ get_citeseer_dirpath from chainer_chemistry.datasets.citation_network.cora import get_cora_dirpath from chainer_chemistry.datasets.reddit.reddit import reddit_to_networkx, \ get_reddit_dirpath from chainer_chemistry.dataset.networkx_preprocessors.base_networkx import BasePaddingNetworkxPreprocessor, BaseSparseNetworkxPreprocessor # NOQA from chainer_chemistry.dataset.graph_dataset.base_graph_data import PaddingGraphData # NOQA from 
chainer_chemistry.utils.train_utils import run_node_classification_train
from chainer_chemistry.models.prediction.node_classifier import NodeClassifier
from chainer_chemistry.models.gin import GINSparse, GIN
from chainer_chemistry.dataset.networkx_preprocessors.reddit_coo import get_reddit_coo_data  # NOQA

from padding_model_wrapper import PaddingModelWrapper  # NOQA


def get_cora():
    # Cora citation network loaded as a networkx graph.
    return citation_to_networkx(get_cora_dirpath(), "cora")


def get_citeseer():
    # Citeseer citation network loaded as a networkx graph.
    return citation_to_networkx(get_citeseer_dirpath(), "citeseer")


def get_reddit():
    # Reddit post graph loaded as a networkx graph.
    return reddit_to_networkx(get_reddit_dirpath())


# Maps a dataset name to its loader function.
dataset_dict = {
    'cora': get_cora,
    'citeseer': get_citeseer,
    'reddit': get_reddit,
}

# Maps a method name to the model class.
method_dict = {
    'gin': GIN,
    'gin_sparse': GINSparse,
}

# Maps a method name to the networkx-to-dataset preprocessor class.
preprocessor_dict = {
    'gin': BasePaddingNetworkxPreprocessor,
    'gin_sparse': BaseSparseNetworkxPreprocessor,
}


def parse_arguments():
    """Parses the command-line options of the node-classification example."""
    # Lists of supported preprocessing methods/models.
    dataset_list = ['cora', 'citeseer', 'reddit']
    method_list = ['gin', 'gin_sparse']

    # Set up the argument parser.
    parser = argparse.ArgumentParser(
        description='Node classification on network a graph')
    parser.add_argument('--dataset', type=str, choices=dataset_list,
                        default='cora', help='dataset name')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='gin_sparse', help='method name')
    parser.add_argument('--conv-layers', '-c', type=int, default=2,
                        help='number of convolution layers')
    parser.add_argument(
        '--device', '-d', type=str, default='-1',
        help='Device specifier. Either ChainerX device specifier or an '
             'integer. If non-negative integer, CuPy arrays with specified '
             'device id are used. If negative integer, NumPy arrays are used')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to save the computed model to')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=32,
                        help='number of units in one layer of the model')
    parser.add_argument('--seed', '-s', type=int, default=777,
                        help='random seed value')
    parser.add_argument('--train-data-ratio', '-r', type=float, default=0.2,
                        help='ratio of training data w.r.t the dataset')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='dropout ratio')
    parser.add_argument('--coo', type=strtobool, default='false',
                        help='use Coo matrix')
    return parser.parse_args()


def generate_random_mask(n, train_num, seed=777):
    """Randomly select `train_num` of `n` nodes for training.

    Returns a pair of complementary boolean arrays
    ``(train_mask, val_mask)`` of length ``n``.
    """
    numpy.random.seed(seed)
    mask = numpy.zeros(n, dtype=bool)
    mask[:train_num] = True
    numpy.random.shuffle(mask)
    return mask, numpy.logical_not(mask)  # (train_mask, val_mask)


if __name__ == '__main__':
    args = parse_arguments()
    if args.dataset == 'reddit' and args.coo:
        # because it takes time to load reddit coo data via networkx
        data = get_reddit_coo_data(get_reddit_dirpath())
    else:
        networkx_graph = dataset_dict[args.dataset]()
        preprocessor = preprocessor_dict[args.method](use_coo=args.coo)
        data = preprocessor.construct_data(networkx_graph)
    print('label num: {}'.format(data.label_num))

    # Build the GNN in node-embedding mode so it outputs per-node logits.
    gnn = method_dict[args.method](out_dim=None, node_embedding=True,
                                   out_channels=data.label_num,
                                   hidden_channels=args.unit_num,
                                   n_update_layers=args.conv_layers,
                                   dropout_ratio=args.dropout)
    if isinstance(data, PaddingGraphData):
        # Padding-format data needs a wrapper to unpack (x, adj).
        gnn = PaddingModelWrapper(gnn)
    predictor = NodeClassifier(gnn, device=args.device)

    # Split the nodes into train/validation sets by a random boolean mask.
    train_label_num = int(data.n_nodes * args.train_data_ratio)
    train_mask, valid_mask = generate_random_mask(
        data.n_nodes, train_label_num)
    print("train label: {}, validation label: {}".format(
        train_label_num, data.n_nodes - train_label_num))
    run_node_classification_train(
        predictor, data, train_mask,
valid_mask, epoch=args.epoch, device=args.device) ================================================ FILE: examples/own_dataset/README.md ================================================ # Example of using your own dataset This example shows how to train models with your own dataset stored in the CSV format. A regression task is performed using [`Regressor`](http://chainer-chemistry.readthedocs.io/en/stable/generated/chainer_chemistry.models.Regressor.html#chainer_chemistry.models.Regressor). For a classification setting that makes use of [`Classifier`](http://chainer-chemistry.readthedocs.io/en/stable/generated/chainer_chemistry.models.Classifier.html#chainer_chemistry.models.Classifier), please refer to the `tox21` example. ## Dependencies Before running the example, the following packages also need to be installed: - [`matplotlib`](https://matplotlib.org/) - [`seaborn`](https://seaborn.pydata.org/) - [`scikit-learn`](http://scikit-learn.org/stable/) ## How to run the code ### Dataset preparation Prepare a CSV file containing the training data samples, one per row. Each row contains the SMILES string of one molecule, followed by the (label) values of the molecule's desired properties. The first line of the CSV file contains label names. Below you can find an example: ``` SMILES,value1,value2 CC1CC1CN1CC1C,-0.2190999984741211,0.08590000122785568 C#CCC(=N)OC=O,-0.2750999927520752,-0.032999999821186066 Cc1cnc(C=O)n1C,-0.23080000281333923,-0.053700000047683716 N=COCC(C=O)CO,-0.26260000467300415,-0.043699998408555984 [...] ``` Save one CSV file for training (e.g., `dataset_train.csv`) and one for testing (e.g., `dataset_test.csv`). Then pass them to the training and testing scripts, as shown below. ### Train a model To train a new model, run the following: ``` python train_own_dataset.py --datafile dataset_train.csv --label value1 value2 ``` The `--label` option specifies which columns in `dataset_train.csv` are trained. 
Type `python train_own_dataset.py --help` to see the complete set of options.

### Inference using a pretrained model

To perform inference using a pretrained model, run the following:

```
python predict_own_dataset.py --datafile dataset_test.csv --label value1 value2
```

Type `python predict_own_dataset.py --help` to see the complete set of options.

### Evaluation of implemented models

To evaluate the performance of the currently implemented models, run the following:

```
bash evaluate_own_dataset.sh [gpu_id] [epoch]
```

where `gpu_id` is the identifier of your GPU and `epoch` is the number of training epochs. To run the code on the CPU, set `gpu_id` to `-1`. The scripts start the training process. Inference is then performed and evaluation metrics are reported. For regression tasks (such as the current example), these are MAE and RMSE. One plot per metric is created (saved as `eval_[metric]_own.png` in the example directory); each plot shows these values as reported by the different models.

================================================
FILE: examples/own_dataset/dataset_test.csv
================================================
SMILES,value1,value2
CC1CC1CN1CC1C,-0.2190999984741211,0.08590000122785568
C#CCC(=N)OC=O,-0.2750999927520752,-0.032999999821186066
Cc1cnc(C=O)n1C,-0.23080000281333923,-0.053700000047683716
N=COCC(C=O)CO,-0.26260000467300415,-0.043699998408555984
CC1=C2CC3C(C1)C23C,-0.19580000638961792,-0.022700000554323196
CC1C=CCC(C)C1O,-0.2443999946117401,0.019099999219179153
C1c2n[nH]nc2C2CN12,-0.23350000381469727,-0.011800000444054604
COC1(C#N)CCC1C,-0.27480000257492065,0.02250000089406967
N=CNC(=O)C1CCO1,-0.25049999356269836,-0.020800000056624413
O=CC1(O)COC1=O,-0.27869999408721924,-0.06939999759197235
================================================
FILE: examples/own_dataset/dataset_train.csv
================================================
SMILES,value1,value2
CC1=CC2CC(CC1)O2,-0.227400004863739,0.010400000028312206
O=Cc1nccn1C=O,-0.2678000032901764,-0.09380000084638596 CCC(C)(C)C(O)C=O,-0.2685000002384186,-0.038100000470876694 C#CCC(C)(CO)OC,-0.2535000145435333,0.044599998742341995 Nc1coc(=O)nc1N,-0.2303999960422516,-0.04170000180602074 CC12C=CC(CCC1)C2,-0.2312999963760376,0.02239999920129776 CC12CCC1C2OC=O,-0.2605000138282776,0.005400000140070915 CC1C2CC3(COC3)N12,-0.23430000245571136,0.0697999969124794 O=C1NC=NC12CC2,-0.24070000648498535,-0.017000000923871994 C1=CC2CN2CC2NC12,-0.22169999778270721,0.007699999958276749 CC1C2COCC12O,-0.2467000037431717,0.07410000264644623 CC(=O)C1OCOC1=O,-0.2590000033378601,-0.042500000447034836 CC1N2C3CC1(C)C32,-0.2295999974012375,0.0835999995470047 CC1=CC2OC2(C#N)C1,-0.25999999046325684,-0.019899999722838402 OC1CCC1,-0.25600001215934753,0.08009999990463257 C#CC1(O)COC1C#N,-0.2849000096321106,-0.01769999973475933 CC1(C#N)CC12CCC2,-0.2685000002384186,0.03460000082850456 CCCC(N)(C#N)CO,-0.25760000944137573,0.028999999165534973 NC1=NC2(CC2)CC1=O,-0.22470000386238098,-0.053700000047683716 C#CC12C3CC1(C)OC32,-0.2273000031709671,0.026900000870227814 CC(C)C#CCC=O,-0.24539999663829803,-0.02669999934732914 CC#CC(C=O)CC,-0.24169999361038208,-0.02539999969303608 CC1OC2C1=CC1OC12,-0.2485000044107437,-0.01769999973475933 CNC(=N)C(C#N)OC,-0.23420000076293945,-0.0013000000035390258 C#CC(C#C)OCC=O,-0.26100000739097595,-0.031599998474121094 CN1CC(O)C12CC2,-0.20479999482631683,0.08730000257492065 OC1C2C3OC4C1C2C34,-0.24469999969005585,0.04230000078678131 OCC1C(O)C2CC12O,-0.24169999361038208,0.05739999935030937 O=C([O-])C12[NH2+]CC1C2O,-0.2508000135421753,-0.0003000000142492354 Cn1cc(O)c(CO)n1,-0.2045000046491623,0.01850000023841858 O=C1COC2C3OC2C13,-0.2498999983072281,-0.03700000047683716 C1#CCCOC=NCC1,-0.24279999732971191,0.012600000016391277 O=c1ocncc1CO,-0.2563000023365021,-0.06289999932050705 CC1NC1C(O)C(N)=O,-0.2547999918460846,0.023800000548362732 CC1OC(=N)CC2CC21,-0.2498999983072281,0.032499998807907104 
OC12CCC3CN3C1C2,-0.21709999442100525,0.07280000299215317 C#CC(CCO)OC,-0.2581999897956848,0.033900000154972076 CCC1COC(CO)=N1,-0.2540999948978424,0.019200000911951065 ON=C1C=CC2C(O)C12,-0.2184000015258789,-0.04349999874830246 CN=c1cconn1,-0.23919999599456787,-0.037700001150369644 CC1(C)CC2CC2C1O,-0.2540999948978424,0.066600002348423 CCC1CCC(=N)O1,-0.2526000142097473,0.032600000500679016 O=C1C2CCC1C1NC21,-0.2282000035047531,-0.00279999990016222 CCOc1ccc(C)o1,-0.19059999287128448,0.033799998462200165 O=C1C2CC3C4C2C1N34,-0.23479999601840973,-0.026100000366568565 O=C1C=CCC=CC1=O,-0.24130000174045563,-0.08780000358819962 Cc1cc(F)c[nH]c1=O,-0.2117999941110611,-0.042100001126527786 CC1=CCc2nocc21,-0.22419999539852142,-0.019200000911951065 N#CC1(O)CN=COC1,-0.26980000734329224,-0.002400000113993883 Nc1n[nH]cc1N1CC1,-0.18649999797344208,0.03739999979734421 CN1C2CC3(O)C1C23C,-0.19619999825954437,0.07779999822378159 N=c1nccco1,-0.23680000007152557,-0.0689999982714653 COC12COC1(C)C2C,-0.22339999675750732,0.07020000368356705 CCOC1COC(=N)O1,-0.2547000050544739,0.0560000017285347 COC1(C(N)=O)CC1,-0.23800000548362732,0.0284000001847744 C#CCC#CC1NC1C,-0.23970000445842743,0.03180000185966492 C1NC1CN1C2CCC21,-0.2379000037908554,0.06539999693632126 CC(O)c1cc(N)[nH]n1,-0.21449999511241913,0.029899999499320984 CC1(O)C(O)C1C=O,-0.24230000376701355,-0.022099999710917473 C#CC1(C)C2C3OC3C21,-0.23819999396800995,0.025800000876188278 c1c[nH]c2cccc-2c1,-0.17229999601840973,-0.037300001829862595 CCC1(O)C(C)C1C=O,-0.24089999496936798,-0.01810000091791153 C1=C2C(CC1)CC1NC21,-0.2231999933719635,0.01940000057220459 C#CC1C2C(O)C1C2O,-0.24420000612735748,0.041999999433755875 CC1(C)CN2CC(C2)O1,-0.2093999981880188,0.07599999755620956 CC1OC1C1C2CN1C2,-0.22990000247955322,0.08429999649524689 CC(=O)C12CC(=O)C1C2,-0.25049999356269836,-0.04270000010728836 CC12C3=NCC1CC2O3,-0.23119999468326569,-0.016599999740719795 c1cc2onnc2[nH]1,-0.23520000278949738,-0.042399998754262924 
O=CCCC1OC2CC12,-0.24369999766349792,-0.01850000023841858 OCCC1C2C3CC3N12,-0.2175000011920929,0.06040000170469284 OCC#CC1CC1,-0.23720000684261322,0.03359999880194664 OC1C2CC3C1N1C2C31,-0.22709999978542328,0.0640999972820282 CC1(C=O)C=CC(=O)N1,-0.25369998812675476,-0.05649999901652336 CC1CC23CC12CCO3,-0.20999999344348907,0.08139999955892563 CC(O)(C(N)=O)C1CO1,-0.24469999969005585,0.02889999933540821 CC1=NC2(CC2)C(=N)N1,-0.2134999930858612,0.0024999999441206455 N#CCCC(=O)C(N)=O,-0.25949999690055847,-0.08160000294446945 CC(O)(C#N)COC=N,-0.27379998564720154,0.00570000009611249 CC12C=CC(C)(N1)C2O,-0.22859999537467957,-0.0012000000569969416 CC12COC1CCO2,-0.2468000054359436,0.07940000295639038 c1noc2c1CCOC2,-0.24819999933242798,-0.010700000450015068 C#CC1CCCCOC1,-0.2467000037431717,0.053599998354911804 CN1C2C3OC2(C=O)C31,-0.23469999432563782,-0.04619999974966049 CCn1cc(O)nn1,-0.22519999742507935,0.0013000000035390258 CCOC(=NC)C(C)=O,-0.23420000076293945,-0.05640000104904175 CC12CC1(C#N)C1CC12,-0.26750001311302185,0.02070000022649765 CC(=O)C1OC1CC=O,-0.251800000667572,-0.04360000044107437 Nc1cc(=O)cno1,-0.23770000040531158,-0.053700000047683716 O=C1CC=CCC1O,-0.25519999861717224,-0.027300000190734863 ================================================ FILE: examples/own_dataset/evaluate_own_dataset.sh ================================================ #!/usr/bin/env bash set -e # List of available graph convolution methods. methods=(nfp ggnn schnet weavenet rsgcn relgcn relgat megnet) # device identifier; set it to -1 to train on the CPU (default). device=${1:--1} # Number of training epochs (default: 1). epoch=${2:-1} for method in ${methods[@]} do # Train with the current method. python train_own_dataset.py \ --method ${method} \ --label value1 \ --conv-layers 1 \ --device ${device} \ --epoch ${epoch} \ --unit-num 10 \ --out eval_${method} # Run inference on the test set. 
python predict_own_dataset.py \ --method ${method} \ --label value1 \ --conv-layers 1 \ --device ${device} \ --epoch ${epoch} \ --unit-num 10 \ --in-dir eval_${method} \ --out eval_${method} done # Create plot showing the evaluation performance. python plot.py --prefix eval_ --methods ${methods[@]} ================================================ FILE: examples/own_dataset/plot.py ================================================ #!/usr/bin/env python import argparse import json import matplotlib.pyplot as plt import os import seaborn as sns def save_evaluation_plot(x, y, metric, filename): plt.figure() sns.set() ax = sns.barplot(y=x, x=y) for n, (label, _y) in enumerate(zip(x, y)): ax.annotate( '{:.3f}'.format(abs(_y)), xy=(_y, n), ha='right', va='center', xytext=(-5, 0), textcoords='offset points', color='white') plt.title('Performance on own dataset') plt.xlabel(metric) plt.savefig(filename) def main(): parser = argparse.ArgumentParser() parser.add_argument('--prefix', required=True) parser.add_argument('--methods', nargs='+', required=True) args = parser.parse_args() metrics = ['mean_abs_error', 'root_mean_sqr_error'] x = args.methods y = {metric: [] for metric in metrics} for method in args.methods: with open(os.path.join(args.prefix + method, 'eval_result.json')) as f: result = json.load(f) for metric in metrics: y[metric].append(result['main/' + metric]) for metric in metrics: save_evaluation_plot( x, y[metric], metric, 'eval_' + metric + '_own.png') if __name__ == "__main__": main() ================================================ FILE: examples/own_dataset/predict_own_dataset.py ================================================ #!/usr/bin/env python from __future__ import print_function import chainer import numpy import os from argparse import ArgumentParser from chainer.iterators import SerialIterator from chainer.training.extensions import Evaluator from chainer_chemistry.models.prediction import Regressor from chainer_chemistry.dataset.parsers import 
CSVFileParser from chainer_chemistry.dataset.converters import converter_method_dict from chainer_chemistry.dataset.preprocessors import preprocess_method_dict # These imports are necessary for pickle to work. from chainer_chemistry.links.scaler.standard_scaler import StandardScaler # NOQA from chainer_chemistry.models.prediction import GraphConvPredictor # NOQA from chainer_chemistry.utils import save_json from train_own_dataset import rmse def parse_arguments(): # Lists of supported preprocessing methods/models. method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'relgat', 'megnet'] scale_list = ['standardize', 'none'] # Set up the argument parser. parser = ArgumentParser(description='Regression on own dataset') parser.add_argument('--datafile', '-d', type=str, default='dataset_test.csv', help='csv file containing the dataset') parser.add_argument('--method', '-m', type=str, choices=method_list, help='method name', default='nfp') parser.add_argument('--label', '-l', nargs='+', default=['value1', 'value2'], help='target label for regression') parser.add_argument('--scale', type=str, choices=scale_list, help='label scaling method', default='standardize') parser.add_argument('--conv-layers', '-c', type=int, default=4, help='number of convolution layers') parser.add_argument('--batchsize', '-b', type=int, default=32, help='batch size') parser.add_argument( '--device', type=str, default='-1', help='Device specifier. Either ChainerX device specifier or an ' 'integer. If non-negative integer, CuPy arrays with specified ' 'device id are used. 
If negative integer, NumPy arrays are used') parser.add_argument('--out', '-o', type=str, default='result', help='path to save the computed model to') parser.add_argument('--epoch', '-e', type=int, default=10, help='number of epochs') parser.add_argument('--unit-num', '-u', type=int, default=16, help='number of units in one layer of the model') parser.add_argument('--protocol', type=int, default=2, help='pickle protocol version') parser.add_argument('--in-dir', '-i', type=str, default='result', help='directory containing the saved model') parser.add_argument('--model-filename', type=str, default='regressor.pkl', help='saved model filename') return parser.parse_args() def main(): # Parse the arguments. args = parse_arguments() if args.label: labels = args.label else: raise ValueError('No target label was specified.') # Dataset preparation. def postprocess_label(label_list): return numpy.asarray(label_list, dtype=numpy.float32) print('Preprocessing dataset...') preprocessor = preprocess_method_dict[args.method]() parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label, labels=labels, smiles_col='SMILES') dataset = parser.parse(args.datafile)['dataset'] test = dataset print('Predicting...') # Set up the regressor. device = chainer.get_device(args.device) model_path = os.path.join(args.in_dir, args.model_filename) regressor = Regressor.load_pickle(model_path, device=device) # Perform the prediction. 
print('Evaluating...') converter = converter_method_dict[args.method] test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False) eval_result = Evaluator(test_iterator, regressor, converter=converter, device=device)() print('Evaluation result: ', eval_result) save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result) if __name__ == '__main__': main() ================================================ FILE: examples/own_dataset/test_own_dataset.sh ================================================ #!/usr/bin/env bash set -e # device specifier given from first argument, default value is -1 device=${1:--1} for method in nfp ggnn schnet weavenet rsgcn relgcn megnet do python train_own_dataset.py --datafile dataset_train.csv --method ${method} --label value1 --conv-layers 1 --device ${device} --epoch 1 --unit-num 10 --batchsize 32 --out eval_${method} python predict_own_dataset.py --datafile dataset_test.csv --method ${method} --label value1 --conv-layers 1 --device ${device} --epoch 1 --unit-num 10 --in-dir eval_${method} --out eval_${method} done ================================================ FILE: examples/own_dataset/train_own_dataset.py ================================================ #!/usr/bin/env python from __future__ import print_function import chainer import numpy import os from argparse import ArgumentParser from chainer.datasets import split_dataset_random from chainer import functions as F from chainer_chemistry.dataset.parsers import CSVFileParser from chainer_chemistry.dataset.converters import converter_method_dict from chainer_chemistry.dataset.preprocessors import preprocess_method_dict from chainer_chemistry.links.scaler.standard_scaler import StandardScaler from chainer_chemistry.models import Regressor from chainer_chemistry.models.prediction import set_up_predictor from chainer_chemistry.utils import run_train def rmse(x0, x1): return F.sqrt(F.mean_squared_error(x0, x1)) def parse_arguments(): # Lists of supported preprocessing 
methods/models. method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'relgat', 'mpnn', 'gnnfilm', 'megnet'] scale_list = ['standardize', 'none'] # Set up the argument parser. parser = ArgumentParser(description='Regression on own dataset') parser.add_argument('--datafile', '-d', type=str, default='dataset_train.csv', help='csv file containing the dataset') parser.add_argument('--method', '-m', type=str, choices=method_list, help='method name', default='nfp') parser.add_argument('--label', '-l', nargs='+', default=['value1', 'value2'], help='target label for regression') parser.add_argument('--scale', type=str, choices=scale_list, help='label scaling method', default='standardize') parser.add_argument('--conv-layers', '-c', type=int, default=4, help='number of convolution layers') parser.add_argument('--batchsize', '-b', type=int, default=32, help='batch size') parser.add_argument( '--device', type=str, default='-1', help='Device specifier. Either ChainerX device specifier or an ' 'integer. If non-negative integer, CuPy arrays with specified ' 'device id are used. If negative integer, NumPy arrays are used') parser.add_argument('--out', '-o', type=str, default='result', help='path to save the computed model to') parser.add_argument('--epoch', '-e', type=int, default=10, help='number of epochs') parser.add_argument('--unit-num', '-u', type=int, default=16, help='number of units in one layer of the model') parser.add_argument('--seed', '-s', type=int, default=777, help='random seed value') parser.add_argument('--train-data-ratio', '-r', type=float, default=0.7, help='ratio of training data w.r.t the dataset') parser.add_argument('--protocol', type=int, default=2, help='pickle protocol version') parser.add_argument('--model-filename', type=str, default='regressor.pkl', help='saved model filename') return parser.parse_args() def main(): # Parse the arguments. 
args = parse_arguments() if args.label: labels = args.label class_num = len(labels) if isinstance(labels, list) else 1 else: raise ValueError('No target label was specified.') # Dataset preparation. Postprocessing is required for the regression task. def postprocess_label(label_list): return numpy.asarray(label_list, dtype=numpy.float32) # Apply a preprocessor to the dataset. print('Preprocessing dataset...') preprocessor = preprocess_method_dict[args.method]() parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label, labels=labels, smiles_col='SMILES') dataset = parser.parse(args.datafile)['dataset'] # Scale the label values, if necessary. if args.scale == 'standardize': scaler = StandardScaler() scaler.fit(dataset.get_datasets()[-1]) else: scaler = None # Split the dataset into training and validation. train_data_size = int(len(dataset) * args.train_data_ratio) train, _ = split_dataset_random(dataset, train_data_size, args.seed) # Set up the predictor. predictor = set_up_predictor( args.method, args.unit_num, args.conv_layers, class_num, label_scaler=scaler) # Set up the regressor. device = chainer.get_device(args.device) metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse} regressor = Regressor(predictor, lossfun=F.mean_squared_error, metrics_fun=metrics_fun, device=device) print('Training...') converter = converter_method_dict[args.method] run_train(regressor, train, valid=None, batch_size=args.batchsize, epoch=args.epoch, out=args.out, extensions_list=None, device=device, converter=converter, resume_path=None) # Save the regressor's parameters. model_path = os.path.join(args.out, args.model_filename) print('Saving the trained model to {}...'.format(model_path)) # TODO(nakago): ChainerX array cannot be sent to numpy array when internal # state has gradients. 
if hasattr(regressor.predictor.graph_conv, 'reset_state'): regressor.predictor.graph_conv.reset_state() regressor.save_pickle(model_path, protocol=args.protocol) if __name__ == '__main__': main() ================================================ FILE: examples/qm9/README.md ================================================ # QM9 Regression Example This example performs regression on the QM9 dataset. ## Dependencies Before running the example, the following packages also need to be installed: - [`matplotlib`](https://matplotlib.org/) - [`seaborn`](https://seaborn.pydata.org/) - [`scikit-learn`](http://scikit-learn.org/stable/) ## How to run the code ### Train a model To train a model, run the following: On the CPU: ```angular2html python train_qm9.py ``` On the GPU: ```angular2html python train_qm9.py -g 0 ``` ### Inference using a pretrained model As of v0.3.0, the `Regressor` class has been introduced, which provides the `predict` method for easier inference. `Regressor` also supports the `load_pickle` method, which allows for loading of a pretrained model, using the `pickle` library. The perform inference using a pretrained model, run the following: On the CPU: ``` python predict_qm9.py [-i /path/to/training/result/directory] ``` On the GPU: ``` python predict_qm9.py -g 0 [-i /path/to/training/result/directory] ``` ### Evaluation of implemented models To evaluate the performance of the currently implemented models, run the following: On the CPU: ``` bash evaluate_models_qm9.sh -1 [epoch] ``` On the GPU: ``` bash evaluate_models_qm9.sh 0 [epoch] ``` This scripts start the training process for a number of `epoch` epochs per model. Inference is then performed and evaluation metrics are reported. For regression tasks (such as with QM9), these are MAE and RMSE. One plot per metric is then createad (saved as `eval_[metric]_qm9.png` in the example directory), which outputs these values as reported by the diffent models. 
================================================ FILE: examples/qm9/evaluate_models_qm9.sh ================================================ set -eu # List of available graph convolution methods. methods=(nfp ggnn schnet weavenet rsgcn relgcn relgat megnet) prefix=eval_ # device identifier; set it to -1 to train on the CPU (default). device=${1:--1} # Number of training epochs (default: 1). epoch=${2:-1} label=${3:-all} echo evaluating label ${label} for method in ${methods[@]} do result_dir=${prefix}${method} python train_qm9.py \ --method ${method} \ --device ${device} \ --out ${result_dir} \ --epoch ${epoch} \ --label ${label} python predict_qm9.py \ --in-dir ${result_dir} \ --method ${method} \ --label ${label} done python plot.py --prefix ${prefix} --methods ${methods[@]} ================================================ FILE: examples/qm9/plot.py ================================================ #! -*- coding: utf-8 -*- import argparse import json from collections import defaultdict import matplotlib.pyplot as plt import os import seaborn as sns from chainer_chemistry.utils import load_json def save_evaluation_plot(x, y, metric, filename): plt.figure() sns.set() ax = sns.barplot(y=x, x=y) for n, (label, _y) in enumerate(zip(x, y)): ax.annotate( s='{:.4g}'.format(abs(_y)), xy=(_y, n), ha='left', va='center', xytext=(5, 0), textcoords='offset points', color='gray') plt.title('Performance on qm9: {}'.format(metric)) plt.xlabel(metric) plt.savefig(filename) plt.close() def main(): parser = argparse.ArgumentParser() parser.add_argument('--prefix', required=True) parser.add_argument('--methods', nargs='+', required=True) args = parser.parse_args() x = args.methods y = defaultdict(list) for method in args.methods: result = load_json(os.path.join( args.prefix + method, 'eval_result_mae.json')) for label, value in result.items(): y[label].append(value) for label in y.keys(): save_evaluation_plot( x, y[label], label, 'eval_qm9_{}_mae.png'.format(label)) if __name__ == 
"__main__": main() ================================================ FILE: examples/qm9/predict_qm9.py ================================================ #!/usr/bin/env python from __future__ import print_function import argparse import os import chainer import numpy import pandas from chainer.datasets import split_dataset_random from chainer.iterators import SerialIterator from chainer.training.extensions import Evaluator from chainer_chemistry.dataset.converters import converter_method_dict from chainer_chemistry.dataset.preprocessors import preprocess_method_dict from chainer_chemistry import datasets as D from chainer_chemistry.datasets import NumpyTupleDataset from chainer_chemistry.models.prediction import Regressor from chainer_chemistry.utils import save_json # These import is necessary for pickle to work from chainer_chemistry.links.scaler.standard_scaler import StandardScaler # NOQA from chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor # NOQA from train_qm9 import rmse def parse_arguments(): # Lists of supported preprocessing methods/models. method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'relgat', 'gin', 'gnnfilm', 'relgcn_sparse', 'gin_sparse', 'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm', 'megnet'] label_names = ['A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv'] scale_list = ['standardize', 'none'] # Set up the argument parser. 
parser = argparse.ArgumentParser(description='Regression on QM9.') parser.add_argument('--method', '-m', type=str, choices=method_list, help='method name', default='nfp') parser.add_argument('--label', '-l', type=str, choices=label_names + ['all'], default='all', help='target label for regression; all means ' 'predicting all properties at once') parser.add_argument('--scale', type=str, choices=scale_list, help='label scaling method', default='standardize') parser.add_argument( '--device', '-d', type=str, default='-1', help='Device specifier. Either ChainerX device specifier or an ' 'integer. If non-negative integer, CuPy arrays with specified ' 'device id are used. If negative integer, NumPy arrays are used') parser.add_argument('--seed', '-s', type=int, default=777, help='random seed value') parser.add_argument('--train-data-ratio', '-r', type=float, default=0.7, help='ratio of training data w.r.t the dataset') parser.add_argument('--in-dir', '-i', type=str, default='result', help='directory to load model data from') parser.add_argument('--model-filename', type=str, default='regressor.pkl', help='saved model filename') parser.add_argument('--num-data', type=int, default=-1, help='amount of data to be parsed; -1 indicates ' 'parsing all data.') return parser.parse_args() def main(): # Parse the arguments. args = parse_arguments() device = chainer.get_device(args.device) # Set up some useful variables that will be used later on. method = args.method if args.label != 'all': label = args.label cache_dir = os.path.join('input', '{}_{}'.format(method, label)) labels = [label] else: labels = D.get_qm9_label_names() cache_dir = os.path.join('input', '{}_all'.format(method)) # Get the filename corresponding to the cached dataset, based on the amount # of data samples that need to be parsed from the original dataset. num_data = args.num_data if num_data >= 0: dataset_filename = 'data_{}.npz'.format(num_data) else: dataset_filename = 'data.npz' # Load the cached dataset. 
dataset_cache_path = os.path.join(cache_dir, dataset_filename) dataset = None if os.path.exists(dataset_cache_path): print('Loading cached data from {}.'.format(dataset_cache_path)) dataset = NumpyTupleDataset.load(dataset_cache_path) if dataset is None: print('Preprocessing dataset...') preprocessor = preprocess_method_dict[method]() if num_data >= 0: # Select the first `num_data` samples from the dataset. target_index = numpy.arange(num_data) dataset = D.get_qm9(preprocessor, labels=labels, target_index=target_index) else: # Load the entire dataset. dataset = D.get_qm9(preprocessor, labels=labels) # Cache the newly preprocessed dataset. if not os.path.exists(cache_dir): os.mkdir(cache_dir) if isinstance(dataset, NumpyTupleDataset): NumpyTupleDataset.save(dataset_cache_path, dataset) # Use a predictor with scaled output labels. model_path = os.path.join(args.in_dir, args.model_filename) regressor = Regressor.load_pickle(model_path, device=device) # Split the dataset into training and testing. train_data_size = int(len(dataset) * args.train_data_ratio) _, test = split_dataset_random(dataset, train_data_size, args.seed) # This callback function extracts only the inputs and discards the labels. # TODO(nakago): consider how to switch which `converter` to use. if isinstance(dataset, NumpyTupleDataset): converter = converter_method_dict[method] @chainer.dataset.converter() def extract_inputs(batch, device=None): return converter(batch, device=device)[:-1] # Extract the ground-truth labels as numpy array. original_t = converter(test, device=-1)[-1] else: converter = dataset.converter extract_inputs = converter # Extract the ground-truth labels as numpy array. original_t = converter(test, device=-1).y # Predict the output labels. 
print('Predicting...') y_pred = regressor.predict( test, converter=extract_inputs) df_dict = {} for i, l in enumerate(labels): df_dict.update({'y_pred_{}'.format(l): y_pred[:, i], 't_{}'.format(l): original_t[:, i], }) df = pandas.DataFrame(df_dict) # Show a prediction/ground truth table with 5 random examples. print(df.sample(5)) n_eval = 10 for target_label in range(y_pred.shape[1]): label_name = labels[target_label] diff = y_pred[:n_eval, target_label] - original_t[:n_eval, target_label] print('label_name = {}, y_pred = {}, t = {}, diff = {}' .format(label_name, y_pred[:n_eval, target_label], original_t[:n_eval, target_label], diff)) # Run an evaluator on the test dataset. print('Evaluating...') test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False) eval_result = Evaluator(test_iterator, regressor, converter=converter, device=device)() print('Evaluation result: ', eval_result) # Save the evaluation results. save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result) # Calculate mean abs error for each label mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0) eval_result = {} for i, l in enumerate(labels): eval_result.update({l: mae[i]}) save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result) if __name__ == '__main__': main() ================================================ FILE: examples/qm9/qm9_dataset_exploration.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## QM9 Dataset exploration" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The purpose of this notebook is as follows,\n", "\n", " - Explain [QM9 dataset](http://quantum-machine.org/datasets/): Check the labels and visualization of molecules to understand what kind of data are stored.\n", " - Explain internal structure of QM9 dataset in `chainer_chemistry`: We handle the dataset with `NumpyTupleDataset`.\n", " - Explain how `preprocessor` and `parser` work on 
`chainer_chemistry`: One concrete example using `GGNNPreprocessor` is explained.\n", "\n", "It is out of scope of this notebook to explain how to train graph convolutional network using this dataset, please refer [document tutorial](http://chainer-chemistry.readthedocs.io/en/latest/tutorial.html#) or try `train_qm9.py` in [QM9 example](https://github.com/pfnet-research/chainer-chemistry/tree/master/examples/qm9) for the model training.\n", "\n", "[Note]\n", "This notebook is executed on 1, March, 2018.\n", "The behavior of QM9 dataset in `chainer_chemistry` might change in the future." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Loading modules and set loglevel." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import logging\n", "from rdkit import RDLogger\n", "from chainer_chemistry import datasets\n", "\n", "# Disable errors by RDKit occurred in preprocessing QM9 dataset.\n", "lg = RDLogger.logger()\n", "lg.setLevel(RDLogger.CRITICAL)\n", "\n", "# show INFO level log from chainer chemistry\n", "logging.basicConfig(level=logging.INFO)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "QM9 dataset can be downloaded automatically by chainer chemistry. \n", "Original format of QM9 dataset is zipped file where each molecule's information is stored in each \"xyz\" file.\n", "\n", "Chainer Chemistry automatically merge these information in one csv file internally, you may check the file path of this csv file with `get_qm9_filepath` method. " ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dataset_filepath = datasets.get_qm9_filepath()\n", "\n", "print('dataset_filepath =', dataset_filepath)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The dataset contains several chemical/physical properties. The labels of QM9 dataset can be checked by `get_qm9_label_names` method." 
] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "QM9 label_names = ['A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']\n" ] } ], "source": [ "label_names = datasets.get_qm9_label_names()\n", "print('QM9 label_names =', label_names)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "More detail information is described in `readme.txt` of QM9 dataset, which can be downloaded from \n", " - [https://figshare.com/articles/Readme_file%3A_Data_description_for__Quantum_chemistry_structures_and_properties_of_134_kilo_molecules_/1057641](https://figshare.com/articles/Readme_file%3A_Data_description_for__Quantum_chemistry_structures_and_properties_of_134_kilo_molecules_/1057641)\n", "\n", "Below is the description of each property(label), written in readme.txt\n", "\n", "\n", "
\n", "
\n",
    "I.  Property  Unit         Description\n",
    "--  --------  -----------  --------------\n",
    " 1  tag       -            \"gdb9\"; string constant to ease extraction via grep\n",
    " 2  index     -            Consecutive, 1-based integer identifier of molecule\n",
    " 3  A         GHz          Rotational constant A\n",
    " 4  B         GHz          Rotational constant B\n",
    " 5  C         GHz          Rotational constant C\n",
    " 6  mu        Debye        Dipole moment\n",
    " 7  alpha     Bohr^3       Isotropic polarizability\n",
    " 8  homo      Hartree      Energy of Highest occupied molecular orbital (HOMO)\n",
    " 9  lumo      Hartree      Energy of Lowest unoccupied molecular orbital (LUMO)\n",
    "10  gap       Hartree      Gap, difference between LUMO and HOMO\n",
    "11  r2        Bohr^2       Electronic spatial extent\n",
    "12  zpve      Hartree      Zero point vibrational energy\n",
    "13  U0        Hartree      Internal energy at 0 K\n",
    "14  U         Hartree      Internal energy at 298.15 K\n",
    "15  H         Hartree      Enthalpy at 298.15 K\n",
    "16  G         Hartree      Free energy at 298.15 K\n",
    "17  Cv        cal/(mol K)  Heat capacity at 298.15 K\n",
    "
\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preprocessing dataset\n", "\n", "Dataset extraction depends on the preprocessing method, which is determined by `preprocessor`.\n", "\n", "Here, let's look an example of using `GGNNPreprocessor` preprocessor for QM9 dataset extraction.\n", "\n", "Procedure is as follows,\n", "\n", "1. Instantiate `preprocessor` (here `GGNNPreprocessor` is used).\n", "2. call `get_qm9` method with `preprocessor`.\n", " - `labels=None` option is used to extract all labels. In this case, 15 types of physical properties are extracted (see above).\n", "\n", "Note that `return_smiles` option can be used to get SMILES information together with the dataset itself." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|████████████████████████████████████████████████████████████████████████| 133885/133885 [01:35<00:00, 1406.47it/s]\n", "INFO:chainer_chemistry.dataset.parsers.csv_file_parser:Preprocess finished. FAIL 0, SUCCESS 133885, TOTAL 133885\n" ] } ], "source": [ "from chainer_chemistry.dataset.preprocessors.ggnn_preprocessor import \\\n", " GGNNPreprocessor\n", " \n", "preprocessor = GGNNPreprocessor()\n", "dataset, dataset_smiles = datasets.get_qm9(preprocessor, labels=None, return_smiles=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Check extracted dataset\n", "\n", "First, let's check type and number of dataset." 
] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dataset information...\n", "dataset 133885\n", "smiles information...\n", "dataset_smiles 133885\n" ] } ], "source": [ "print('dataset information...')\n", "print('dataset', type(dataset), len(dataset))\n", "\n", "print('smiles information...')\n", "print('dataset_smiles', type(dataset_smiles), len(dataset_smiles))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As you can see, QM9 dataset consists of 133885 data.\n", "\n", "The dataset is a class of `NumpyTupleDataset`, where i-th dataset features can be accessed by `dataset[i]`.\n", "\n", "When `GGNNPreprocessor` is used, each dataset consists of following features\n", " 1. atom feature: representing atomic number of given molecule. \n", " 2. adjacency matrix feature: representing adjacency matrix of given molecule.\n", " `GGNNPreprocessor` extracts adjacency matrix of each bonding type.\n", " 3. label feature: representing chemical properties (label) of given molecule.\n", " Please refer [above table](#table1) for details." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's look an example of 7777-th dataset" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "index=7777, SMILES=CC1=NCCC(C)O1\n", "atom (8,) [6 6 7 6 6 6 6 8]\n", "adj (4, 8, 8)\n", "adjacency matrix for SINGLE bond type\n", " [[0. 1. 0. 0. 0. 0. 0. 0.]\n", " [1. 0. 0. 0. 0. 0. 0. 1.]\n", " [0. 0. 0. 1. 0. 0. 0. 0.]\n", " [0. 0. 1. 0. 1. 0. 0. 0.]\n", " [0. 0. 0. 1. 0. 1. 0. 0.]\n", " [0. 0. 0. 0. 1. 0. 1. 1.]\n", " [0. 0. 0. 0. 0. 1. 0. 0.]\n", " [0. 1. 0. 0. 0. 1. 0. 0.]]\n", "adjacency matrix for DOUBLE bond type\n", " [[0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 1. 0. 0. 0. 0. 0.]\n", " [0. 1. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 
0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]]\n", "adjacency matrix for TRIPLE bond type\n", " [[0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]]\n", "adjacency matrix for AROMATIC bond type\n", " [[0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0.]]\n", "labels [ 3.1431000e+00 1.8749400e+00 1.2443100e+00 1.9313999e+00\n", " 7.3379997e+01 -2.3750000e-01 3.4699999e-02 2.7219999e-01\n", " 1.0124120e+03 1.6597100e-01 -3.6510001e+02 -3.6509183e+02\n", " -3.6509088e+02 -3.6513235e+02 3.0584999e+01]\n" ] } ], "source": [ "index = 7777\n", "\n", "print('index={}, SMILES={}'.format(index, dataset_smiles[index]))\n", "atom, adj, labels = dataset[index]\n", "# This molecule has N=8 atoms.\n", "print('atom', atom.shape, atom)\n", "# adjacency matrix is NxN matrix, where N is number of atoms in the molecule.\n", "# Unlike usual adjacency matrix, diagonal elements are filled with 1, for NFP calculation purpose.\n", "print('adj', adj.shape)\n", "print('adjacency matrix for SINGLE bond type\\n', adj[0])\n", "print('adjacency matrix for DOUBLE bond type\\n', adj[1])\n", "print('adjacency matrix for TRIPLE bond type\\n', adj[2])\n", "print('adjacency matrix for AROMATIC bond type\\n', adj[3])\n", "\n", "print('labels', labels)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Visualizing the molecule\n", "\n", "One might want to visualize molecule given SMILES information. 
Here is an example code:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# This script is referred from http://rdkit.blogspot.jp/2015/02/new-drawing-code.html\n", "# and http://cheminformist.itmol.com/TEST/wp-content/uploads/2015/07/rdkit_moldraw2d_2.html\n", "from __future__ import print_function\n", "from rdkit import Chem\n", "from rdkit.Chem.Draw import IPythonConsole\n", "from IPython.display import SVG\n", "\n", "from rdkit.Chem import rdDepictor\n", "from rdkit.Chem.Draw import rdMolDraw2D\n", "def moltosvg(mol,molSize=(450,150),kekulize=True):\n", " mc = Chem.Mol(mol.ToBinary())\n", " if kekulize:\n", " try:\n", " Chem.Kekulize(mc)\n", " except:\n", " mc = Chem.Mol(mol.ToBinary())\n", " if not mc.GetNumConformers():\n", " rdDepictor.Compute2DCoords(mc)\n", " drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])\n", " drawer.DrawMolecule(mc)\n", " drawer.FinishDrawing()\n", " svg = drawer.GetDrawingText()\n", " return svg\n", "\n", "def render_svg(svg):\n", " # It seems that the svg renderer used doesn't quite hit the spec.\n", " # Here are some fixes to make it work in the notebook, although I think\n", " # the underlying issue needs to be resolved at the generation step\n", " return SVG(svg.replace('svg:',''))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "smiles: CC1=NCCC(C)O1\n" ] }, { "data": { "image/svg+xml": [ "\n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "N\n", "O\n", "" ], "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "smiles = dataset_smiles[index]\n", "mol = Chem.MolFromSmiles(dataset_smiles[index])\n", "\n", "print('smiles:', smiles)\n", "svg = moltosvg(mol)\n", "render_svg(svg)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Note] SVG 
images cannot be displayed on GitHub, but you can see an image of molecule when you execute it on jupyter notebook." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Interactively watch through the QM9 dataset\n", "\n", "Jupyter notebook provides handy module to check/visualize the data. Here interact module can be used to interactively check the internal of QM9 dataset." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "index=114829, SMILES=CCC1CC2OCC1O2\n", "atom [6 6 6 6 6 8 6 6 8]\n", "labels [ 3.248 1.224 1.16 1.911 76.47 -0.249 0.082 0.331\n", " 1179.493 0.185 -424.24 -424.232 -424.231 -424.272 31.209]\n" ] }, { "data": { "image/svg+xml": [ "\n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "O\n", "O\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from ipywidgets import interact\n", "import numpy as np\n", "np.set_printoptions(precision=3, suppress=True)\n", "\n", "def show_dataset(index):\n", " print('index={}, SMILES={}'.format(index, dataset_smiles[index]))\n", " atom, adj, labels = dataset[index]\n", " print('atom', atom)\n", " # print('adj', adj)\n", " print('labels', labels)\n", " mol = Chem.MolFromSmiles(dataset_smiles[index])\n", " return render_svg(moltosvg(mol))\n", "\n", "interact(show_dataset, index=(0, len(dataset) - 1, 1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Appendix: how to save the molecule figure?\n", "\n", "\n", "### 1. Save with SVG format\n", "\n", "First method is simply save svg in file." 
] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os\n", "dirpath = 'images'\n", "\n", "if not os.path.exists(dirpath):\n", " os.mkdir(dirpath)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def save_svg(mol, filepath):\n", " svg = moltosvg(mol)\n", " with open(filepath, \"w\") as fw:\n", " fw.write(svg)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "drawing images\\mol_7777.svg\n" ] } ], "source": [ "index = 7777\n", "save_filepath = os.path.join(dirpath, 'mol_{}.svg'.format(index))\n", "print('drawing {}'.format(save_filepath))\n", "\n", "mol = Chem.MolFromSmiles(dataset_smiles[index])\n", "save_svg(mol, save_filepath)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. Save with png format\n", "\n", "`rdkit` provides `Draw.MolToFile` method to visualize mol instance and save it to png format." 
] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from rdkit.Chem import Draw\n", "\n", "def save_png(mol, filepath, size=(600, 600)):\n", " Draw.MolToFile(mol, filepath, size=size)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "drawing images\\mol_7777.png\n" ] } ], "source": [ "from rdkit.Chem import Draw\n", "index = 7777\n", "save_filepath = os.path.join(dirpath, 'mol_{}.png'.format(index))\n", "print('drawing {}'.format(save_filepath))\n", "\n", "mol = Chem.MolFromSmiles(dataset_smiles[index])\n", "save_png(mol, save_filepath, size=(600, 600))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: examples/qm9/test_qm9.sh ================================================ #!/usr/bin/env bash set -e # List of available graph convolution methods. # schnet test is skipped, since it takes long time to preprocess... methods=(nfp ggnn weavenet rsgcn relgcn relgat gin gnnfilm megnet nfp_gwm ggnn_gwm rsgcn_gwm gin_gwm relgcn_sparse gin_sparse megnet) # device identifier; set it to -1 to train on the CPU (default). device=${1:--1} # Number of training epochs (default: 1). epoch=${2:-1} for method in ${methods[@]} do # Remove any previously cached models. [ -d "input" ] && rm -rf input # Train with the current method (one label). 
def rmse(x0, x1):
    """Root mean squared error metric between two Chainer variables."""
    mse = F.mean_squared_error(x0, x1)
    return F.sqrt(mse)


def parse_arguments():
    """Build and evaluate the command-line interface for QM9 regression.

    Returns:
        argparse.Namespace: parsed command-line options.
    """
    # Supported graph-convolution methods, QM9 target properties and
    # label-scaling choices.
    supported_methods = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn',
                         'relgcn', 'relgat', 'gin', 'gnnfilm',
                         'relgcn_sparse', 'gin_sparse', 'nfp_gwm', 'ggnn_gwm',
                         'rsgcn_gwm', 'gin_gwm', 'megnet']
    qm9_labels = ['A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2',
                  'zpve', 'U0', 'U', 'H', 'G', 'Cv']
    scaling_choices = ['standardize', 'none']

    ap = argparse.ArgumentParser(description='Regression on QM9.')
    ap.add_argument('--method', '-m', type=str, choices=supported_methods,
                    default='nfp', help='method name')
    ap.add_argument('--label', '-l', type=str,
                    choices=qm9_labels + ['all'], default='all',
                    help='target label for regression; all means '
                         'predicting all properties at once')
    ap.add_argument('--scale', type=str, choices=scaling_choices,
                    default='standardize', help='label scaling method')
    ap.add_argument('--conv-layers', '-c', type=int, default=4,
                    help='number of convolution layers')
    ap.add_argument('--batchsize', '-b', type=int, default=32,
                    help='batch size')
    ap.add_argument('--device', '-d', type=str, default='-1',
                    help='Device specifier. Either ChainerX device specifier '
                         'or an '
                         'integer. If non-negative integer, CuPy arrays with '
                         'specified '
                         'device id are used. If negative integer, NumPy '
                         'arrays are used')
    ap.add_argument('--out', '-o', type=str, default='result',
                    help='path to save the computed model to')
    ap.add_argument('--epoch', '-e', type=int, default=20,
                    help='number of epochs')
    ap.add_argument('--unit-num', '-u', type=int, default=16,
                    help='number of units in one layer of the model')
    ap.add_argument('--seed', '-s', type=int, default=777,
                    help='random seed value')
    ap.add_argument('--train-data-ratio', '-r', type=float, default=0.7,
                    help='ratio of training data w.r.t the dataset')
    ap.add_argument('--protocol', type=int, default=2,
                    help='pickle protocol version')
    ap.add_argument('--model-filename', type=str, default='regressor.pkl',
                    help='saved model filename')
    ap.add_argument('--num-data', type=int, default=-1,
                    help='amount of data to be parsed; -1 indicates '
                         'parsing all data.')
    return ap.parse_args()
def main():
    """Train a QM9 regression model end-to-end.

    Parses the CLI, loads (or preprocesses and caches) the QM9 dataset,
    optionally standardizes the labels, splits into train/validation,
    builds the predictor/regressor, trains it and pickles the result.
    """
    # Parse the arguments.
    args = parse_arguments()

    # Set up some useful variables that will be used later on.
    method = args.method
    if args.label != 'all':
        labels = args.label
        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
        # `labels` here is a single label name (str), so class_num is 1.
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join('input', '{}_all'.format(method))
        class_num = len(D.get_qm9_label_names())

    # Get the filename corresponding to the cached dataset, based on the
    # amount of data samples that need to be parsed from the original dataset.
    num_data = args.num_data
    if num_data >= 0:
        dataset_filename = 'data_{}.npz'.format(num_data)
    else:
        dataset_filename = 'data.npz'

    # Load the cached dataset.
    dataset_cache_path = os.path.join(cache_dir, dataset_filename)
    dataset = None
    if os.path.exists(dataset_cache_path):
        print('Loading cached dataset from {}.'.format(dataset_cache_path))
        dataset = NumpyTupleDataset.load(dataset_cache_path)
    if dataset is None:
        print('Preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()

        if num_data >= 0:
            # Select the first `num_data` samples from the dataset.
            target_index = numpy.arange(num_data)
            dataset = D.get_qm9(preprocessor, labels=labels,
                                target_index=target_index)
        else:
            # Load the entire dataset.
            dataset = D.get_qm9(preprocessor, labels=labels)

        # Cache the loaded dataset.
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        if isinstance(dataset, NumpyTupleDataset):
            NumpyTupleDataset.save(dataset_cache_path, dataset)
        # TODO: support caching of other dataset type...

    # Scale the label values, if necessary.
    if args.scale == 'standardize':
        print('Fit StandardScaler to the labels.')
        scaler = StandardScaler()
        if isinstance(dataset, NumpyTupleDataset):
            # Labels are stored as the last array of the tuple dataset.
            scaler.fit(dataset.get_datasets()[-1])
        else:
            # Sparse/graph datasets expose labels via the `.y` attribute.
            y = numpy.array([data.y for data in dataset])
            scaler.fit(y)
    else:
        print('No standard scaling was selected.')
        scaler = None

    # Split the dataset into training and validation.
    train_data_size = int(len(dataset) * args.train_data_ratio)
    train, valid = split_dataset_random(dataset, train_data_size, args.seed)

    # Set up the predictor.
    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,
                                 class_num, scaler)

    # Set up the regressor.
    device = chainer.get_device(args.device)
    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
    regressor = Regressor(predictor, lossfun=F.mean_squared_error,
                          metrics_fun=metrics_fun, device=device)

    # TODO(nakago): consider how to switch which `converter` to use.
    if isinstance(dataset, NumpyTupleDataset):
        converter = converter_method_dict[method]
    else:
        converter = dataset.converter

    print('Training...')
    run_train(regressor, train, valid=valid,
              batch_size=args.batchsize, epoch=args.epoch, out=args.out,
              extensions_list=None, device=device, converter=converter,
              resume_path=None)

    # Save the regressor's parameters.
    model_path = os.path.join(args.out, args.model_filename)
    print('Saving the trained model to {}...'.format(model_path))
    regressor.save_pickle(model_path, protocol=args.protocol)


if __name__ == '__main__':
    main()
================================================ FILE: examples/tox21/.gitignore ================================================ prediction.npz ================================================ FILE: examples/tox21/README.md ================================================ # Training graph convolution models with Tox21 dataset This is an example of learning toxicity of chemical molecules with graph convolution networks in a multi-task supervised setting. We use graph convolution models that takes molecules represented as graphs as predictor. Chainer Chemistry provides off-the-shelf graph convolution models including [NFP](https://arxiv.org/abs/1509.09292), [GGNN](https://arxiv.org/abs/1511.05493), [SchNet](https://arxiv.org/abs/1706.08566) and so on. We use Tox21 dataset, provided by [The Toxicology in the 21st Century (Tox21)](https://ncats.nih.gov/tox21). It is one of the most widely used datasets in bio and chemo informatics and consists of the chemical information of molecules and their assessments of toxicity. ## How to run the code ### Train the model with tox21 dataset With CPU: ```angular2html python train_tox21.py ``` With GPU: ```angular2html python train_tox21.py -g 0 ``` This script trains the model with the tox21 dataset and outputs trained parameters and other information to a specified directory. We specify an ID of GPU in use by `-g` or `--gpu` option. Negative value indicate running the code with CPU. The output directory can be specified by `-o` option. Its default value is `result`. The Tox21 dataset consists of several assays. Some molecules can have more than one types of assay results. We can specify which assay to use by specifying an assay name with `-l` option. 
Assay names are available by running the script with `-h` or `--help` or by executing the following command:

```
python -c "import chainer_chemistry; print(chainer_chemistry.datasets.get_tox21_label_names())"
```

If the `-l` option is not specified, this script conducts multitask learning with all labels.

The full options available, including `-g` and `-o`, are found by running the following command:

```
python train_tox21.py -h
```

### Inference with a trained model using Classifier

As of v0.3.0, the `Classifier` class is introduced, which supports `predict` and `predict_proba` methods for easier inference. `Classifier` also supports the `load_pickle` method, so the user may load an instance of a pretrained model from a `pickle` file. The example is implemented in `predict_tox21_with_classifier.py`.

With CPU:
```
python predict_tox21_with_classifier.py [-i /path/to/training/result/directory]
```

With GPU:
```
python predict_tox21_with_classifier.py -g 0 [-i /path/to/training/result/directory]
```

### Evaluation of Models

`seaborn` is required to run this script.

```
bash examples/tox21/evaluate_models_tox21.sh
```

This script evaluates each method and generates a graph.
================================================ FILE: examples/tox21/data.py ================================================ import os import numpy from chainer_chemistry.dataset.preprocessors import preprocess_method_dict from chainer_chemistry import datasets as D from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset class _CacheNamePolicy(object): train_file_name = 'train.npz' val_file_name = 'val.npz' test_file_name = 'test.npz' def _get_cache_directory_path(self, method, labels, prefix, num_data): num_data_str = '_{}'.format(num_data) if num_data >= 0 else '' if labels: return os.path.join(prefix, '{}_{}{}'.format(method, labels, num_data_str)) else: return os.path.join(prefix, '{}_all{}'.format(method, num_data_str)) def __init__(self, method, labels, prefix='input', num_data=-1): self.method = method self.labels = labels self.prefix = prefix self.num_data = num_data self.cache_dir = self._get_cache_directory_path( method, labels, prefix, num_data) def get_train_file_path(self): return os.path.join(self.cache_dir, self.train_file_name) def get_val_file_path(self): return os.path.join(self.cache_dir, self.val_file_name) def get_test_file_path(self): return os.path.join(self.cache_dir, self.test_file_name) def create_cache_directory(self): try: os.makedirs(self.cache_dir) except OSError: if not os.path.isdir(self.cache_dir): raise def load_dataset(method, labels, prefix='input', num_data=-1): policy = _CacheNamePolicy(method, labels, prefix, num_data=num_data) train_path = policy.get_train_file_path() val_path = policy.get_val_file_path() test_path = policy.get_test_file_path() train, val, test = None, None, None print() if os.path.exists(policy.cache_dir): print('load from cache {}'.format(policy.cache_dir)) train = NumpyTupleDataset.load(train_path) val = NumpyTupleDataset.load(val_path) test = NumpyTupleDataset.load(test_path) if train is None or val is None or test is None: print('preprocessing dataset...') preprocessor = 
def load_dataset(method, labels, prefix='input', num_data=-1):
    """Load (or build and cache) the preprocessed tox21 train/val/test splits.

    Args:
        method (str): preprocessing method name; key into
            ``preprocess_method_dict``.
        labels: label selection forwarded to ``D.get_tox21``; a falsy value
            means all labels.
        prefix (str): root directory of the on-disk cache.
        num_data (int): when non-negative, only the first ``num_data``
            examples are used for the *training* split.

    Returns:
        tuple: ``(train, val, test)`` datasets.
    """
    policy = _CacheNamePolicy(method, labels, prefix, num_data=num_data)
    train_path = policy.get_train_file_path()
    val_path = policy.get_val_file_path()
    test_path = policy.get_test_file_path()
    train, val, test = None, None, None
    print()
    # Fast path: reuse previously preprocessed arrays when the cache exists.
    if os.path.exists(policy.cache_dir):
        print('load from cache {}'.format(policy.cache_dir))
        train = NumpyTupleDataset.load(train_path)
        val = NumpyTupleDataset.load(val_path)
        test = NumpyTupleDataset.load(test_path)
    # Rebuild everything if any of the three splits failed to load.
    if train is None or val is None or test is None:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        if num_data >= 0:
            # Use `num_data` examples for train
            target_index = numpy.arange(num_data)
            train, val, test = D.get_tox21(
                preprocessor, labels=labels,
                train_target_index=target_index, val_target_index=None,
                test_target_index=None
            )
        else:
            train, val, test = D.get_tox21(preprocessor, labels=labels)
        # Cache dataset
        policy.create_cache_directory()
        NumpyTupleDataset.save(train_path, train)
        NumpyTupleDataset.save(val_path, val)
        NumpyTupleDataset.save(test_path, test)
    return train, val, test
# -*- coding: utf-8 -*-
"""Plot the test ROC-AUC of each evaluated tox21 method as a bar chart.

Reads ``<prefix><method>/eval_result.json`` (written by
``predict_tox21_with_classifier.py``) for every method given on the command
line and saves the chart to ``eval_results_tox21.png``.
"""
import argparse
import json
import os

import matplotlib.pyplot as plt
import seaborn as sns

parser = argparse.ArgumentParser()
parser.add_argument('--prefix', required=True)
parser.add_argument('--methods', nargs='+', required=True)
args = parser.parse_args()

sns.set()

# Method names on the y-axis, their ROC-AUC scores on the x-axis.
x = args.methods
y = []
for method in args.methods:
    with open(os.path.join(args.prefix + method, 'eval_result.json')) as f:
        result = json.load(f)
    y.append(result["test/main/roc_auc"])

ax = sns.barplot(y=x, x=y)
# Write each score inside its bar, right-aligned.
# NOTE(review): `Axes.annotate(s=...)` was renamed to `text=` in newer
# matplotlib releases — confirm the pinned matplotlib version supports `s=`.
for n, (label, _y) in enumerate(zip(x, y)):
    ax.annotate(
        s='{:.3f}'.format(abs(_y)),
        xy=(_y, n),
        ha='right', va='center', xytext=(-5, 0),
        textcoords='offset points',
        color='white'
    )
plt.title("Performance on tox21")
plt.xlabel("ROC-AUC")
plt.savefig('eval_results_tox21.png')
def main():
    """Load a pickled ``Classifier`` and evaluate it on a tox21 split.

    Reads ``config.json`` from the training output directory to recover the
    method/labels the model was trained with, runs ``predict`` /
    ``predict_proba``, prints per-task accuracy, saves raw predictions to
    ``prediction.npz`` and dumps ROC-AUC scores to ``eval_result.json``.
    """
    parser = argparse.ArgumentParser(
        description='Predict with a trained model.')
    parser.add_argument('--in-dir', '-i', type=str, default='result',
                        help='Path to the result directory of the training '
                        'script.')
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='batch size')
    parser.add_argument(
        '--device', type=str, default='-1',
        help='Device specifier. Either ChainerX device specifier or an '
        'integer. If non-negative integer, CuPy arrays with specified '
        'device id are used. If negative integer, NumPy arrays are used')
    parser.add_argument('--model-filename', type=str, default='classifier.pkl',
                        help='file name for pickled model')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='Number of data to be parsed from parser.'
                        '-1 indicates to parse all data.')
    args = parser.parse_args()

    # Recover the training configuration so the same preprocessing and
    # converter are used at prediction time.
    with open(os.path.join(args.in_dir, 'config.json'), 'r') as i:
        config = json.loads(i.read())

    method = config['method']
    labels = config['labels']

    # NOTE(review): this unpacks the *second* element of (train, val, test),
    # i.e. the validation split, as `test` — confirm this is intentional.
    _, test, _ = data.load_dataset(method, labels, num_data=args.num_data)
    y_test = test.get_datasets()[-1]

    device = chainer.get_device(args.device)

    # Load pretrained model
    clf = Classifier.load_pickle(
        os.path.join(args.in_dir, args.model_filename),
        device=device)  # type: Classifier

    # ---- predict ---
    print('Predicting...')

    # We need to feed only input features `x` to `predict`/`predict_proba`.
    # This converter extracts only inputs (x1, x2, ...) from the features
    # which consist of input `x` and label `t` (x1, x2, ..., t).
    converter = converter_method_dict[method]

    def extract_inputs(batch, device=None):
        return converter(batch, device=device)[:-1]

    def postprocess_pred(x):
        # Threshold the raw predictor output at 0 to get hard 0/1 labels.
        x_array = cuda.to_cpu(x.data)
        return numpy.where(x_array > 0, 1, 0)

    y_pred = clf.predict(test, converter=extract_inputs,
                         postprocess_fn=postprocess_pred)
    y_proba = clf.predict_proba(test, converter=extract_inputs,
                                postprocess_fn=F.sigmoid)

    # `predict` method returns the prediction label (0: non-toxic, 1:toxic)
    # Fixed message typo: was 'y_pread.shape'.
    print('y_pred.shape = {}, y_pred[:5, 0] = {}'
          .format(y_pred.shape, y_pred[:5, 0]))
    # `predict_proba` method returns the probability to be toxic
    print('y_proba.shape = {}, y_proba[:5, 0] = {}'
          .format(y_proba.shape, y_proba[:5, 0]))
    # --- predict end ---

    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]

    if y_pred.shape != y_test.shape:
        raise RuntimeError('The shape of the prediction result array and '
                           'that of the ground truth array do not match. '
                           'Contents of the input directory may be corrupted '
                           'or modified.')

    # Per-task accuracy; label value -1 marks "no measurement" and is skipped.
    statistics = []
    for t, p in six.moves.zip(y_test.T, y_pred.T):
        idx = t != -1
        n_correct = (t[idx] == p[idx]).sum()
        n_total = len(t[idx])
        accuracy = float(n_correct) / n_total
        statistics.append([n_correct, n_total, accuracy])

    print('{:>6} {:>8} {:>8} {:>8}'
          .format('TaskID', 'Correct', 'Total', 'Accuracy'))
    for idx, (n_correct, n_total, accuracy) in enumerate(statistics):
        print('task{:>2} {:>8} {:>8} {:>8.4f}'
              .format(idx, n_correct, n_total, accuracy))

    prediction_result_file = 'prediction.npz'
    print('Save prediction result to {}'.format(prediction_result_file))
    numpy.savez_compressed(prediction_result_file, y_pred)

    # --- evaluate ---
    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
    print('Evaluating...')
    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
    eval_result = Evaluator(test_iterator, clf, converter=converter,
                            device=device)()
    print('Evaluation result: ', eval_result)
    rocauc_result = ROCAUCEvaluator(
        test_iterator, clf, converter=converter, device=device,
        eval_func=clf.predictor, name='test', ignore_labels=-1)()
    print('ROCAUC Evaluation result: ', rocauc_result)
    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
        json.dump(rocauc_result, f)
    # --- evaluate end ---


if __name__ == '__main__':
    main()
-f "input" ]; then rm -rf input fi # Tox21 classification task with only one label out_dir=nr_ar_${method} python train_tox21.py --method ${method} --label NR-AR --conv-layers 1 --device ${device} --epoch 1 --unit-num 10 --out ${out_dir} --batchsize 32 --num-data=${tox21_num_data} python predict_tox21_with_classifier.py --in-dir ${out_dir} --device ${device} --num-data=${tox21_num_data} # Tox21 classification task with all labels out_dir=all_${method} python train_tox21.py --method ${method} --conv-layers 1 --device ${device} --epoch 1 --unit-num 10 --out ${out_dir} --batchsize 16 --num-data=${tox21_num_data} python predict_tox21_with_classifier.py --in-dir ${out_dir} --num-data=${tox21_num_data} done # BalancedSerialIterator test with Tox21 python train_tox21.py --method nfp --label NR-AR --conv-layers 1 --device ${device} --epoch 1 --unit-num 10 --out nr_ar_nfp_balanced --iterator-type balanced --eval-mode 0 --num-data 1000 # ROCAUCEvaluator test with Tox21 python train_tox21.py --method nfp --label NR-AR --conv-layers 1 --device ${device} --epoch 1 --unit-num 10 --out nr_ar_nfp_balanced --iterator-type serial --eval-mode 1 --num-data 1000 ================================================ FILE: examples/tox21/tox21_dataset_exploration.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Tox 21 dataset exploration\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The purpose of this notebook is as follows,\n", "\n", " - Explain [Tox21 dataset](https://tripod.nih.gov/tox21/challenge/): Check the labels and visualization of molecules to understand what kind of data are stored.\n", " - Explain internal structure of tox21 dataset in `chainer_chemistry`: We handle the dataset with `NumpyTupleDataset`.\n", " - Explain how `preprocessor` and `parser` work on `chainer_chemistry`: One concrete example using `NFPPreprocessor` is explained.\n", "\n", "It is out of 
scope of this notebook to explain how to train graph convolutional network using this dataset, please refer [document tutorial](http://chainer-chemistry.readthedocs.io/en/latest/tutorial.html#) or try `train_tox21.py` in [tox21 example](https://github.com/pfnet-research/chainer-chemistry/tree/master/examples/tox21) for the model training.\n", "\n", "[Note]\n", "This notebook is executed on 1, March, 2018. \n", "The behavior of tox21 dataset in `chainer_chemistry` might change in the future." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Loading modules and set loglevel." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import logging\n", "from rdkit import RDLogger\n", "from chainer_chemistry import datasets\n", "\n", "# Disable errors by RDKit occurred in preprocessing Tox21 dataset.\n", "lg = RDLogger.logger()\n", "lg.setLevel(RDLogger.CRITICAL)\n", "\n", "# show INFO level log from chainer chemistry\n", "logging.basicConfig(level=logging.INFO)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Tox 21 dataset consists of train/validation/test data and they can be downloaded automatically with chainer chemistry. \n", "The format of tox21 dataset is \"sdf\" file.\n", "You can check the file path of downloaded sdf file with `get_tox21_filepath` method. 
" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "train_filepath = datasets.get_tox21_filepath('train')\n", "val_filepath = datasets.get_tox21_filepath('val')\n", "test_filepath = datasets.get_tox21_filepath('test')\n", "\n", "print('train_filepath =', train_filepath)\n", "print('val_filepath =', val_filepath)\n", "print('test_filepath =', test_filepath)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Dataset contains 12 types of toxity, the label of toxity can be checked by `get_tox21_label_names` method.\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tox21 label_names = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']\n" ] } ], "source": [ "label_names = datasets.get_tox21_label_names()\n", "print('tox21 label_names =', label_names)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preprocessing dataset\n", "\n", "Dataset extraction depends on the preprocessing method, which is determined by `preprocessor`.\n", "\n", "Here, let's look an example of using `NFPPreprocessor` preprocessor for tox21 dataset exraction.\n", "\n", "Procedure is as follows,\n", "\n", "1. Instantiate `preprocessor` (here `NFPPreprocessor` is used).\n", "2. call `get_tox21` method with `preprocessor`.\n", " - `labels=None` option is used to extract all labels. In this case, 12 types of toxity labels are extracted (see above).\n", "\n", "[Note] \n", " - `return_smiles` option can be used to get SMILES information together with the dataset itself.\n", " - Preprocessing result depends on RDKit version. \n", "You might get different results due to the difference of RDKit behavior between version." 
] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RDKit version: 2017.03.3\n" ] } ], "source": [ "import rdkit\n", "\n", "print('RDKit version: ', rdkit.__version__)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|███████████████████████████████████████████████████████████████████████████| 11764/11764 [00:22<00:00, 531.76it/s]\n", "INFO:chainer_chemistry.dataset.parsers.sdf_file_parser:Preprocess finished. FAIL 0, SUCCESS 11757, TOTAL 11757\n", "100%|███████████████████████████████████████████████████████████████████████████████| 296/296 [00:00<00:00, 488.77it/s]\n", "INFO:chainer_chemistry.dataset.parsers.sdf_file_parser:Preprocess finished. FAIL 0, SUCCESS 295, TOTAL 295\n", "100%|███████████████████████████████████████████████████████████████████████████████| 647/647 [00:01<00:00, 609.91it/s]\n", "INFO:chainer_chemistry.dataset.parsers.sdf_file_parser:Preprocess finished. FAIL 0, SUCCESS 645, TOTAL 645\n" ] } ], "source": [ "from chainer_chemistry.dataset.preprocessors.nfp_preprocessor import \\\n", " NFPPreprocessor\n", "\n", "preprocessor = NFPPreprocessor()\n", "train, val, test, train_smiles, val_smiles, test_smiles = datasets.get_tox21(preprocessor, labels=None, return_smiles=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Dataset extraction depends on the `preprocessor`, and you may use other type of `preprocessor` as well.\n", "\n", "Below is another example of using `GGNNPreprocessor` for dataset extraction. But it takes little bit of time, you can skip it for the following tutorial." 
] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|███████████████████████████████████████████████████████████████████████████| 11764/11764 [00:29<00:00, 401.74it/s]\n", "INFO:chainer_chemistry.dataset.parsers.sdf_file_parser:Preprocess finished. FAIL 0, SUCCESS 11757, TOTAL 11757\n", "100%|███████████████████████████████████████████████████████████████████████████████| 296/296 [00:00<00:00, 336.99it/s]\n", "INFO:chainer_chemistry.dataset.parsers.sdf_file_parser:Preprocess finished. FAIL 0, SUCCESS 295, TOTAL 295\n", "100%|███████████████████████████████████████████████████████████████████████████████| 647/647 [00:01<00:00, 479.05it/s]\n", "INFO:chainer_chemistry.dataset.parsers.sdf_file_parser:Preprocess finished. FAIL 0, SUCCESS 645, TOTAL 645\n" ] } ], "source": [ "from chainer_chemistry.dataset.preprocessors.ggnn_preprocessor import \\\n", " GGNNPreprocessor\n", "\n", "# uncomment it if you want to try `GGNNPreprocessor`\n", "ggnn_preprocessor = GGNNPreprocessor()\n", "results = datasets.get_tox21(ggnn_preprocessor, labels=None, return_smiles=True)\n", "train_ggnn, val_ggnn, test_ggnn, train_smiles_ggnn, val_smiles_ggnn, test_smiles_ggnn = results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check extracted dataset\n", "\n", "First, let's check number of data for train/validation/test dataset." 
] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dataset information...\n", "train 11757\n", "val 295\n", "test 645\n", "smiles information...\n", "train_smiles 11757\n" ] } ], "source": [ "print('dataset information...')\n", "print('train', type(train), len(train))\n", "print('val', type(val), len(val))\n", "print('test', type(test), len(test))\n", "\n", "print('smiles information...')\n", "print('train_smiles', type(train_smiles), len(train_smiles))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are 11757 data in `train`, 295 data in `val` and 645 data in `test` respectively.\n", "(You might get different result with different version of `rdkit`.)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The dataset is a class of `NumpyTupleDataset`, where i-th dataset features can be accessed by `dataset[i]`.\n", "\n", "When `NFPPreprocessor` is used, each dataset consists of following features\n", " 1. atom feature: representing atomic number of given molecule. \n", " 2. adjacency matrix feature: representing adjacency matrix of given molecule.\n", " 3. label feature: representing toxity (label) of given molecule.\n", " Here, 0 indicates negative (no toxity), 1 indicates positive (toxic) and -1 indicates data is not available, respectively." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's look an example of 6-th train dataset" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "index=6, SMILES=Cc1ccc([N+](=O)[O-])c2c1O[Hg]2\n", "atom (12,) [ 6 6 6 6 6 7 8 8 6 6 8 80]\n", "adj (12, 12)\n", "[[1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", " [1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n", " [0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0.]\n", " [0. 
0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.]\n", " [0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1.]\n", " [0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0.]\n", " [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]\n", " [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1.]]\n", "labels [-1 -1 -1 -1 -1 -1 -1 -1 -1 1 -1 -1]\n" ] } ], "source": [ "index = 6\n", "\n", "print('index={}, SMILES={}'.format(index, train_smiles[index]))\n", "atom, adj, labels = train[index]\n", "# This molecule has N=12 atoms.\n", "print('atom', atom.shape, atom)\n", "# adjacency matrix is NxN matrix, where N is number of atoms in the molecule.\n", "# Unlike usual adjacency matrix, diagonal elements are filled with 1, for NFP calculation purpose.\n", "print('adj', adj.shape)\n", "print(adj)\n", "print('labels', labels)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Visualizing the molecule\n", "\n", "One might want to visualize molecule given SMILES information.\n", "Here is an example code:\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# This script is referred from http://rdkit.blogspot.jp/2015/02/new-drawing-code.html\n", "# and http://cheminformist.itmol.com/TEST/wp-content/uploads/2015/07/rdkit_moldraw2d_2.html\n", "from __future__ import print_function\n", "from rdkit import Chem\n", "from rdkit.Chem.Draw import IPythonConsole\n", "from IPython.display import SVG\n", "\n", "from rdkit.Chem import rdDepictor\n", "from rdkit.Chem.Draw import rdMolDraw2D\n", "def moltosvg(mol,molSize=(450,150),kekulize=True):\n", " mc = Chem.Mol(mol.ToBinary())\n", " if kekulize:\n", " try:\n", " Chem.Kekulize(mc)\n", " except:\n", " mc = Chem.Mol(mol.ToBinary())\n", " if not mc.GetNumConformers():\n", " rdDepictor.Compute2DCoords(mc)\n", " drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])\n", " drawer.DrawMolecule(mc)\n", " drawer.FinishDrawing()\n", " svg = 
drawer.GetDrawingText()\n", " return svg\n", "\n", "def render_svg(svg):\n", " # It seems that the svg renderer used doesn't quite hit the spec.\n", " # Here are some fixes to make it work in the notebook, although I think\n", " # the underlying issue needs to be resolved at the generation step\n", " return SVG(svg.replace('svg:',''))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "smiles: Cc1ccc([N+](=O)[O-])c2c1O[Hg]2\n" ] }, { "data": { "image/svg+xml": [ "\n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "N+\n", "O\n", "O-\n", "O\n", "Hg\n", "" ], "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "smiles = train_smiles[index]\n", "mol = Chem.MolFromSmiles(train_smiles[index])\n", "\n", "print('smiles:', smiles)\n", "render_svg(moltosvg(mol))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[Note] SVG images cannot be displayed on GitHub, but you can see an image of molecule when you execute it on jupyter notebook." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Interactively watch through the tox21 dataset\n", "\n", "Jupyter notebook provides handy module to check/visualize the data.\n", "Here `interact` module can be used to interactively check the internal of tox 21 dataset." 
] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "index=5878, SMILES=CN(C)CCn1nnnc1SCC1=C(C(=O)O)N2C(=O)C(NC(=O)Cc3csc(N)n3)C2SC1.Cl\n", "atom [ 6 7 6 6 6 7 7 7 7 6 16 6 6 6 6 8 8 7 6 8 6 7 6 8\n", " 6 6 6 16 6 7 7 6 16 6 17]\n", "labels [ 0 0 0 0 0 0 0 -1 0 -1 0 0]\n" ] }, { "data": { "image/svg+xml": [ "\n", " \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "N\n", "N\n", "N\n", "N\n", "N\n", "S\n", "O\n", "HO\n", "N\n", "O\n", "NH\n", "O\n", "S\n", "NH2\n", "N\n", "S\n", "ClH\n", "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from ipywidgets import interact\n", "\n", "def show_train_dataset(index):\n", " atom, adj, labels = train[index]\n", " smiles = train_smiles[index]\n", " print('index={}, SMILES={}'.format(index, smiles))\n", " print('atom', atom)\n", " # print('adj', adj)\n", " print('labels', labels)\n", " mol = Chem.MolFromSmiles(train_smiles[index])\n", " return render_svg(moltosvg(mol))\n", "\n", "interact(show_train_dataset, index=(0, len(train) - 1, 1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Appendix: how to save the molecule figure?\n", "\n", "### 1. 
Save with SVG format\n", "\n", "First method is simply save svg in file.\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os\n", "dirpath = 'images'\n", "\n", "if not os.path.exists(dirpath):\n", " os.mkdir(dirpath)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def save_svg(mol, filepath):\n", " svg = moltosvg(mol)\n", " with open(filepath, \"w\") as fw:\n", " fw.write(svg)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "drawing images\\mol_6.svg\n" ] } ], "source": [ "index = 6\n", "save_filepath = os.path.join(dirpath, 'mol_{}.svg'.format(index))\n", "print('drawing {}'.format(save_filepath))\n", "\n", "mol = Chem.MolFromSmiles(train_smiles[index])\n", "save_svg(mol, save_filepath)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. Save with png format\n", "\n", "`rdkit` provides `Draw.MolToFile` method to visualize mol instance and save it to png format." 
] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from rdkit.Chem import Draw\n", "\n", "def save_png(mol, filepath, size=(600, 600)):\n", " Draw.MolToFile(mol, filepath, size=size)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "drawing images\\mol_6.png\n" ] } ], "source": [ "index = 6\n", "save_filepath = os.path.join(dirpath, 'mol_{}.png'.format(index))\n", "print('drawing {}'.format(save_filepath))\n", "\n", "mol = Chem.MolFromSmiles(train_smiles[index])\n", "save_png(mol, save_filepath, size=(600, 600))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 } ================================================ FILE: examples/tox21/train_tox21.py ================================================ #!/usr/bin/env python from __future__ import print_function import os import logging import argparse import chainer from chainer import functions as F from chainer import iterators as I from rdkit import RDLogger from chainer_chemistry.dataset.converters import converter_method_dict from chainer_chemistry import datasets as D from chainer_chemistry.iterators.balanced_serial_iterator import BalancedSerialIterator # NOQA from chainer_chemistry.models.prediction import Classifier from chainer_chemistry.models.prediction import 
set_up_predictor from chainer_chemistry.training.extensions import ROCAUCEvaluator # NOQA from chainer_chemistry.utils import run_train, save_json import data # Disable errors by RDKit occurred in preprocessing Tox21 dataset. lg = RDLogger.logger() lg.setLevel(RDLogger.CRITICAL) # show INFO level log from chainer chemistry logging.basicConfig(level=logging.INFO) def main(): # Supported preprocessing/network list method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn', 'relgat', 'megnet'] label_names = D.get_tox21_label_names() iterator_type = ['serial', 'balanced'] parser = argparse.ArgumentParser( description='Multitask Learning with Tox21.') parser.add_argument('--method', '-m', type=str, choices=method_list, default='nfp', help='graph convolution model to use ' 'as a predictor.') parser.add_argument('--label', '-l', type=str, choices=label_names, default='', help='target label for logistic ' 'regression. Use all labels if this option ' 'is not specified.') parser.add_argument('--iterator-type', type=str, choices=iterator_type, default='serial', help='iterator type. If `balanced` ' 'is specified, data is sampled to take same number of' 'positive/negative labels during training.') parser.add_argument('--eval-mode', type=int, default=1, help='Evaluation mode.' '0: only binary_accuracy is calculated.' '1: binary_accuracy and ROC-AUC score is calculated') parser.add_argument('--conv-layers', '-c', type=int, default=4, help='number of convolution layers') parser.add_argument('--batchsize', '-b', type=int, default=32, help='batch size') parser.add_argument( '--device', type=str, default='-1', help='Device specifier. Either ChainerX device specifier or an ' 'integer. If non-negative integer, CuPy arrays with specified ' 'device id are used. 
If negative integer, NumPy arrays are used') parser.add_argument('--out', '-o', type=str, default='result', help='path to output directory') parser.add_argument('--epoch', '-e', type=int, default=10, help='number of epochs') parser.add_argument('--unit-num', '-u', type=int, default=16, help='number of units in one layer of the model') parser.add_argument('--resume', '-r', type=str, default='', help='path to a trainer snapshot') parser.add_argument('--frequency', '-f', type=int, default=-1, help='Frequency of taking a snapshot') parser.add_argument('--protocol', type=int, default=2, help='protocol version for pickle') parser.add_argument('--model-filename', type=str, default='classifier.pkl', help='file name for pickled model') parser.add_argument('--num-data', type=int, default=-1, help='Number of data to be parsed from parser.' '-1 indicates to parse all data.') args = parser.parse_args() method = args.method if args.label: labels = args.label class_num = len(labels) if isinstance(labels, list) else 1 else: labels = None class_num = len(label_names) # Dataset preparation train, val, _ = data.load_dataset(method, labels, num_data=args.num_data) # Network predictor_ = set_up_predictor( method, args.unit_num, args.conv_layers, class_num) iterator_type = args.iterator_type if iterator_type == 'serial': train_iter = I.SerialIterator(train, args.batchsize) elif iterator_type == 'balanced': if class_num > 1: raise ValueError('BalancedSerialIterator can be used with only one' 'label classification, please specify label to' 'be predicted by --label option.') train_iter = BalancedSerialIterator( train, args.batchsize, train.features[:, -1], ignore_labels=-1) train_iter.show_label_stats() else: raise ValueError('Invalid iterator type {}'.format(iterator_type)) device = chainer.get_device(args.device) classifier = Classifier(predictor_, lossfun=F.sigmoid_cross_entropy, metrics_fun=F.binary_accuracy, device=device) extensions_list = [] eval_mode = args.eval_mode converter = 
converter_method_dict[method] if eval_mode == 1: train_eval_iter = I.SerialIterator(train, args.batchsize, repeat=False, shuffle=False) extensions_list.append(ROCAUCEvaluator( train_eval_iter, classifier, eval_func=predictor_, device=device, converter=converter, name='train', pos_labels=1, ignore_labels=-1, raise_value_error=False)) # extension name='validation' is already used by `Evaluator`, # instead extension name `val` is used. val_iter = I.SerialIterator(val, args.batchsize, repeat=False, shuffle=False) extensions_list.append(ROCAUCEvaluator( val_iter, classifier, eval_func=predictor_, device=device, converter=converter, name='val', pos_labels=1, ignore_labels=-1)) run_train(classifier, train_iter, valid=val, batch_size=args.batchsize, epoch=args.epoch, out=args.out, device=device, converter=converter, extensions_list=extensions_list, resume_path=args.resume) # frequency = args.epoch if args.frequency == -1 else max(1, args.frequency) # trainer.extend(E.snapshot(), trigger=(frequency, 'epoch')) # trainer.run() config = {'method': args.method, 'conv_layers': args.conv_layers, 'unit_num': args.unit_num, 'labels': args.label} save_json(os.path.join(args.out, 'config.json'), config) classifier.save_pickle(os.path.join(args.out, args.model_filename), protocol=args.protocol) if __name__ == '__main__': main() ================================================ FILE: setup.py ================================================ from distutils.core import setup import os from setuptools import find_packages setup_requires = [] install_requires = [ 'chainer >=7.0.0', 'joblib', 'matplotlib', 'pandas', 'scikit-learn', 'scipy', 'tqdm', ] here = os.path.abspath(os.path.dirname(__file__)) # Get __version__ variable exec(open(os.path.join(here, 'chainer_chemistry', '_version.py')).read()) setup(name='chainer-chemistry', version=__version__, # NOQA description='Chainer Chemistry: A Library for Deep Learning in Biology\ and Chemistry', author='Kosuke Nakago', 
author_email='nakago@preferred.jp', packages=find_packages(), license='MIT', url='http://chainer-chemistry.readthedocs.io/en/latest/index.html', setup_requires=setup_requires, install_requires=install_requires ) ================================================ FILE: tests/dataset_tests/parsers_tests/test_csv_file_parser.py ================================================ import os import numpy import pandas import pytest from rdkit import Chem import six from chainer_chemistry.dataset.parsers import CSVFileParser from chainer_chemistry.dataset.preprocessors import NFPPreprocessor @pytest.fixture def mol_smiles(): mol_smiles1 = 'CN=C=O' mol_smiles2 = 'Cc1ccccc1' mol_smiles3 = 'CC1=CC2CC(CC1)O2' return [mol_smiles1, mol_smiles2, mol_smiles3] @pytest.fixture def mols(mol_smiles): return [Chem.MolFromSmiles(smiles) for smiles in mol_smiles] @pytest.fixture() def label_a(): return [2.1, 5.3, -1.2] @pytest.fixture() def csv_file(tmpdir, mol_smiles, label_a): fname = os.path.join(str(tmpdir), 'test.csv') df = pandas.DataFrame({ 'smiles': mol_smiles, 'labelA': label_a }) df.to_csv(fname) return fname @pytest.fixture() def csv_file_invalid(tmpdir): """CSV file with invalid SMILES""" fname = os.path.join(str(tmpdir), 'test_invalid.csv') df = pandas.DataFrame({ 'smiles': ['var', 'CN=C=O', 'hoge', 'Cc1ccccc1', 'CC1=CC2CC(CC1)O2'], 'labelA': [0., 2.1, 0., 5.3, -1.2], }) df.to_csv(fname) return fname def check_input_features(actual, expect): assert len(actual) == len(expect) for d, e in six.moves.zip(actual, expect): numpy.testing.assert_array_equal(d, e) def check_features(actual, expect_input_features, expect_label): assert len(actual) == len(expect_input_features) + 1 # input features testing for d, e in six.moves.zip(actual[:-1], expect_input_features): numpy.testing.assert_array_equal(d, e) # label testing assert actual[-1] == expect_label def test_csv_file_parser_not_return_smiles(csv_file, mols): preprocessor = NFPPreprocessor() parser = CSVFileParser(preprocessor, 
smiles_col='smiles') # Actually, `dataset, smiles = parser.parse(..)` is enough. result = parser.parse(csv_file, return_smiles=False) dataset = result['dataset'] smiles = result['smiles'] is_successful = result['is_successful'] assert len(dataset) == 3 assert smiles is None assert is_successful is None # As we want test CSVFileParser, we assume # NFPPreprocessor works as documented. for i in range(3): expect = preprocessor.get_input_features(mols[i]) check_input_features(dataset[i], expect) def test_csv_file_parser_return_smiles(csv_file, mols, label_a): """test `labels` option and retain_smiles=True.""" preprocessor = NFPPreprocessor() parser = CSVFileParser(preprocessor, labels='labelA', smiles_col='smiles') result = parser.parse(csv_file, return_smiles=True) dataset = result['dataset'] smiles = result['smiles'] assert len(dataset) == 3 # As we want test CSVFileParser, we assume # NFPPreprocessor works as documented. for i in range(3): expect = preprocessor.get_input_features(mols[i]) check_features(dataset[i], expect, label_a[i]) # check smiles array assert type(smiles) == numpy.ndarray assert smiles.ndim == 1 assert len(smiles) == len(dataset) assert smiles[0] == 'CN=C=O' assert smiles[1] == 'Cc1ccccc1' assert smiles[2] == 'CC1=CC2CC(CC1)O2' def test_csv_file_parser_target_index(csv_file_invalid, mols, label_a): """test `labels` option and retain_smiles=True.""" preprocessor = NFPPreprocessor() parser = CSVFileParser(preprocessor, labels='labelA', smiles_col='smiles') result = parser.parse(csv_file_invalid, return_smiles=True, target_index=[1, 2, 4], return_is_successful=True) dataset = result['dataset'] smiles = result['smiles'] assert len(dataset) == 2 is_successful = result['is_successful'] assert numpy.array_equal(is_successful, numpy.array([True, False, True])) assert len(is_successful) == 3 # As we want test CSVFileParser, we assume # NFPPreprocessor works as documented. 
expect = preprocessor.get_input_features(mols[0]) check_features(dataset[0], expect, label_a[0]) expect = preprocessor.get_input_features(mols[2]) check_features(dataset[1], expect, label_a[2]) # check smiles array assert type(smiles) == numpy.ndarray assert smiles.ndim == 1 assert len(smiles) == len(dataset) assert smiles[0] == 'CN=C=O' assert smiles[1] == 'CC1=CC2CC(CC1)O2' def test_csv_file_parser_extract_total_num(csv_file): preprocessor = NFPPreprocessor() parser = CSVFileParser(preprocessor, labels='labelA', smiles_col='smiles') num = parser.extract_total_num(csv_file) assert num == 3 def test_csv_parser_return_is_successful(csv_file_invalid, mols, label_a): """test `labels` option and retain_smiles=True.""" preprocessor = NFPPreprocessor() parser = CSVFileParser(preprocessor, labels='labelA', smiles_col='smiles') result = parser.parse(csv_file_invalid, return_smiles=True, return_is_successful=True) dataset = result['dataset'] # smiles = result['smiles'] assert len(dataset) == 3 is_successful = result['is_successful'] assert len(is_successful) == 5 # print('is_successful', is_successful) assert numpy.alltrue(is_successful[[1, 3, 4]]) assert numpy.alltrue(~is_successful[[0, 2]]) # We assume NFPPreprocessor works as documented. 
for i in range(3): expect = preprocessor.get_input_features(mols[i]) check_features(dataset[i], expect, label_a[i]) if __name__ == '__main__': pytest.main([__file__, '-s', '-v']) ================================================ FILE: tests/dataset_tests/parsers_tests/test_data_frame_parser.py ================================================ import numpy import pandas import pytest from rdkit import Chem import six from chainer_chemistry.dataset.parsers import DataFrameParser from chainer_chemistry.dataset.preprocessors import NFPPreprocessor @pytest.fixture def mol_smiles(): mol_smiles1 = 'CN=C=O' mol_smiles2 = 'Cc1ccccc1' mol_smiles3 = 'CC1=CC2CC(CC1)O2' return [mol_smiles1, mol_smiles2, mol_smiles3] @pytest.fixture def mols(mol_smiles): return [Chem.MolFromSmiles(smiles) for smiles in mol_smiles] @pytest.fixture() def label_a(): return [2.1, 5.3, -1.2] @pytest.fixture() def data_frame(mol_smiles, label_a): df = pandas.DataFrame({ 'smiles': mol_smiles, 'labelA': label_a }) return df def check_input_features(actual, expect): assert len(actual) == len(expect) for d, e in six.moves.zip(actual, expect): numpy.testing.assert_array_equal(d, e) def check_features(actual, expect_input_features, expect_label): assert len(actual) == len(expect_input_features) + 1 # input features testing for d, e in six.moves.zip(actual[:-1], expect_input_features): numpy.testing.assert_array_equal(d, e) # label testing assert actual[-1] == expect_label def test_data_frame_parser_not_return_smiles(data_frame, mols): """Test default behavior""" preprocessor = NFPPreprocessor() parser = DataFrameParser(preprocessor, smiles_col='smiles') # Actually, `dataset, smiles = parser.parse(..)` is enough. 
result = parser.parse(data_frame, return_smiles=False) dataset = result['dataset'] smiles = result['smiles'] is_successful = result['is_successful'] assert len(dataset) == 3 assert smiles is None assert is_successful is None # As we want test DataFrameParser, we assume # NFPPreprocessor works as documented. for i in range(3): expect = preprocessor.get_input_features(mols[i]) check_input_features(dataset[i], expect) def test_data_frame_parser_return_smiles(data_frame, mols, label_a): """test `labels` option and retain_smiles=True.""" preprocessor = NFPPreprocessor() parser = DataFrameParser(preprocessor, labels='labelA', smiles_col='smiles') result = parser.parse(data_frame, return_smiles=True) dataset = result['dataset'] smiles = result['smiles'] assert len(dataset) == 3 # We assume NFPPreprocessor works as documented. for i in range(3): expect = preprocessor.get_input_features(mols[i]) check_features(dataset[i], expect, label_a[i]) # check smiles array assert type(smiles) == numpy.ndarray assert smiles.ndim == 1 assert len(smiles) == len(dataset) assert smiles[0] == 'CN=C=O' assert smiles[1] == 'Cc1ccccc1' assert smiles[2] == 'CC1=CC2CC(CC1)O2' def test_data_frame_parser_target_index(data_frame, mols, label_a): """test `labels` option and retain_smiles=True.""" preprocessor = NFPPreprocessor() parser = DataFrameParser(preprocessor, labels='labelA', smiles_col='smiles') result = parser.parse(data_frame, return_smiles=True, target_index=[0, 2], return_is_successful=True) dataset = result['dataset'] smiles = result['smiles'] assert len(dataset) == 2 is_successful = result['is_successful'] assert numpy.alltrue(is_successful) assert len(is_successful) == 2 # We assume NFPPreprocessor works as documented. 
expect = preprocessor.get_input_features(mols[0]) check_features(dataset[0], expect, label_a[0]) expect = preprocessor.get_input_features(mols[2]) check_features(dataset[1], expect, label_a[2]) # check smiles array assert type(smiles) == numpy.ndarray assert smiles.ndim == 1 assert len(smiles) == len(dataset) assert smiles[0] == 'CN=C=O' assert smiles[1] == 'CC1=CC2CC(CC1)O2' def test_data_frame_parser_return_is_successful(mols, label_a): """test `labels` option and retain_smiles=True.""" preprocessor = NFPPreprocessor() parser = DataFrameParser(preprocessor, labels='labelA', smiles_col='smiles') df = pandas.DataFrame({ 'smiles': ['var', 'CN=C=O', 'hoge', 'Cc1ccccc1', 'CC1=CC2CC(CC1)O2'], 'labelA': [0., 2.1, 0., 5.3, -1.2], }) result = parser.parse(df, return_smiles=True, return_is_successful=True) dataset = result['dataset'] # smiles = result['smiles'] assert len(dataset) == 3 is_successful = result['is_successful'] assert len(is_successful) == 5 # print('is_successful', is_successful) assert numpy.alltrue(is_successful[[1, 3, 4]]) assert numpy.alltrue(~is_successful[[0, 2]]) # We assume NFPPreprocessor works as documented. 
for i in range(3): expect = preprocessor.get_input_features(mols[i]) check_features(dataset[i], expect, label_a[i]) def test_data_frame_parser_extract_total_num(data_frame): """test `labels` option and retain_smiles=True.""" preprocessor = NFPPreprocessor() parser = DataFrameParser(preprocessor) num = parser.extract_total_num(data_frame) assert num == 3 if __name__ == '__main__': pytest.main([__file__, '-s', '-v']) ================================================ FILE: tests/dataset_tests/parsers_tests/test_sdf_file_parser.py ================================================ import os import numpy import pytest from rdkit import Chem import six from chainer_chemistry.dataset.parsers import SDFFileParser from chainer_chemistry.dataset.preprocessors import NFPPreprocessor @pytest.fixture def mols(): mol1 = Chem.MolFromSmiles('CN=C=O') mol2 = Chem.MolFromSmiles('Cc1ccccc1') mol3 = Chem.MolFromSmiles('CC1=CC2CC(CC1)O2') return [mol1, mol2, mol3] @pytest.fixture() def sdf_file(tmpdir, mols): # Chem.AllChem.Compute2DCoords(mol1) fname = os.path.join(str(tmpdir), 'test.sdf') writer = Chem.SDWriter(fname) for mol in mols: writer.write(mol) return fname @pytest.fixture() def sdf_file_long(tmpdir): """SDFFile with long smiles (ccc...)""" fname = os.path.join(str(tmpdir), 'test_long.sdf') writer = Chem.SDWriter(fname) for smiles in ['CCCCCCCCCCCC', 'CN=C=O', 'CCCCCCCCCCCCCCCC', 'Cc1ccccc1', 'CC1=CC2CC(CC1)O2']: mol = Chem.MolFromSmiles(smiles) writer.write(mol) return fname def check_input_features(actual, expect): assert len(actual) == len(expect) for d, e in six.moves.zip(actual, expect): numpy.testing.assert_array_equal(d, e) def test_sdf_file_parser_not_return_smiles(sdf_file, mols): preprocessor = NFPPreprocessor() parser = SDFFileParser(preprocessor) result = parser.parse(sdf_file, return_smiles=False) dataset = result['dataset'] smiles = result['smiles'] is_successful = result['is_successful'] assert len(dataset) == 3 assert smiles is None assert is_successful is None # 
As we want test SDFFileParser, we assume # NFPPreprocessor works as documented. for i in range(3): expect = preprocessor.get_input_features(mols[i]) check_input_features(dataset[i], expect) def test_sdf_file_parser_return_smiles(sdf_file, mols): preprocessor = NFPPreprocessor() parser = SDFFileParser(preprocessor) result = parser.parse(sdf_file, return_smiles=True) dataset = result['dataset'] smiles = result['smiles'] assert len(dataset) == 3 # As we want test SDFFileParser, we assume # NFPPreprocessor works as documented. for i in range(3): expect = preprocessor.get_input_features(mols[i]) check_input_features(dataset[i], expect) # check smiles array assert type(smiles) == numpy.ndarray assert smiles.ndim == 1 assert len(smiles) == len(dataset) assert smiles[0] == 'CN=C=O' assert smiles[1] == 'Cc1ccccc1' assert smiles[2] == 'CC1=CC2CC(CC1)O2' def test_sdf_file_parser_target_index(sdf_file, mols): preprocessor = NFPPreprocessor() parser = SDFFileParser(preprocessor) result = parser.parse(sdf_file, return_smiles=True, target_index=[0, 2], return_is_successful=True) dataset = result['dataset'] smiles = result['smiles'] assert len(dataset) == 2 is_successful = result['is_successful'] assert numpy.alltrue(is_successful) assert len(is_successful) == 2 # As we want test SDFFileParser, we assume # NFPPreprocessor works as documented. 
expect = preprocessor.get_input_features(mols[0]) check_input_features(dataset[0], expect) expect = preprocessor.get_input_features(mols[2]) check_input_features(dataset[1], expect) # check smiles array assert type(smiles) == numpy.ndarray assert smiles.ndim == 1 assert len(smiles) == len(dataset) assert smiles[0] == 'CN=C=O' assert smiles[1] == 'CC1=CC2CC(CC1)O2' def test_sdf_file_parser_return_is_successful(sdf_file_long, mols): """test `labels` option and retain_smiles=True.""" preprocessor = NFPPreprocessor(max_atoms=10) parser = SDFFileParser(preprocessor) result = parser.parse(sdf_file_long, return_smiles=True, return_is_successful=True) dataset = result['dataset'] # smiles = result['smiles'] assert len(dataset) == 3 is_successful = result['is_successful'] assert len(is_successful) == 5 assert numpy.alltrue(is_successful[[1, 3, 4]]) assert numpy.alltrue(~is_successful[[0, 2]]) # We assume NFPPreprocessor works as documented. for i in range(3): expect = preprocessor.get_input_features(mols[i]) check_input_features(dataset[i], expect) def test_sdf_file_parser_extract_total_num(sdf_file): preprocessor = NFPPreprocessor() parser = SDFFileParser(preprocessor) num = parser.extract_total_num(sdf_file) assert num == 3 if __name__ == '__main__': pytest.main([__file__, '-s', '-v']) ================================================ FILE: tests/dataset_tests/parsers_tests/test_smiles_parser.py ================================================ import numpy import pytest from rdkit import Chem import six from chainer_chemistry.dataset.parsers import SmilesParser from chainer_chemistry.dataset.preprocessors import NFPPreprocessor @pytest.fixture def mol_smiles(): mol_smiles1 = 'CN=C=O' mol_smiles2 = 'Cc1ccccc1' mol_smiles3 = 'CC1=CC2CC(CC1)O2' return [mol_smiles1, mol_smiles2, mol_smiles3] @pytest.fixture def mols(mol_smiles): return [Chem.MolFromSmiles(smiles) for smiles in mol_smiles] def check_input_features(actual, expect): assert len(actual) == len(expect) for d, e in 
six.moves.zip(actual, expect): numpy.testing.assert_array_equal(d, e) def test_smiles_parser_not_return_smiles(mol_smiles, mols): preprocessor = NFPPreprocessor() parser = SmilesParser(preprocessor) result = parser.parse(mol_smiles, return_smiles=False) dataset = result['dataset'] smiles = result['smiles'] is_successful = result['is_successful'] assert len(dataset) == 3 assert smiles is None assert is_successful is None # As we want test CSVFileParser, we assume # NFPPreprocessor works as documented. for i in range(3): expect = preprocessor.get_input_features(mols[i]) check_input_features(dataset[i], expect) def test_smiles_parser_return_smiles(mol_smiles, mols): """test `labels` option and retain_smiles=True.""" preprocessor = NFPPreprocessor() parser = SmilesParser(preprocessor) result = parser.parse(mol_smiles, return_smiles=True) dataset = result['dataset'] smiles = result['smiles'] assert len(dataset) == 3 # As we want test CSVFileParser, we assume # NFPPreprocessor works as documented. for i in range(3): expect = preprocessor.get_input_features(mols[i]) check_input_features(dataset[i], expect) # check smiles array assert type(smiles) == numpy.ndarray assert smiles.ndim == 1 assert len(smiles) == len(dataset) assert smiles[0] == 'CN=C=O' assert smiles[1] == 'Cc1ccccc1' assert smiles[2] == 'CC1=CC2CC(CC1)O2' def test_smiles_parser_target_index(mol_smiles, mols): """test `labels` option and retain_smiles=True.""" preprocessor = NFPPreprocessor() parser = SmilesParser(preprocessor) result = parser.parse(mol_smiles, return_smiles=True, target_index=[0, 2], return_is_successful=True) dataset = result['dataset'] smiles = result['smiles'] assert len(dataset) == 2 is_successful = result['is_successful'] assert numpy.alltrue(is_successful) assert len(is_successful) == 2 # As we want test CSVFileParser, we assume # NFPPreprocessor works as documented. 
expect = preprocessor.get_input_features(mols[0]) check_input_features(dataset[0], expect) expect = preprocessor.get_input_features(mols[2]) check_input_features(dataset[1], expect) # check smiles array assert type(smiles) == numpy.ndarray assert smiles.ndim == 1 assert len(smiles) == len(dataset) assert smiles[0] == 'CN=C=O' assert smiles[1] == 'CC1=CC2CC(CC1)O2' def test_smiles_parser_return_is_successful(mols): """test `labels` option and retain_smiles=True.""" preprocessor = NFPPreprocessor() parser = SmilesParser(preprocessor) mol_smiles_with_invalid = [ 'var', 'CN=C=O', 'hoge', 'Cc1ccccc1', 'CC1=CC2CC(CC1)O2'] result = parser.parse(mol_smiles_with_invalid, return_smiles=True, return_is_successful=True) dataset = result['dataset'] assert len(dataset) == 3 is_successful = result['is_successful'] assert len(is_successful) == 5 assert numpy.alltrue(is_successful[[1, 3, 4]]) assert numpy.alltrue(~is_successful[[0, 2]]) # We assume NFPPreprocessor works as documented. for i in range(3): expect = preprocessor.get_input_features(mols[i]) check_input_features(dataset[i], expect) def test_smiles_parser_extract_total_num(mol_smiles): preprocessor = NFPPreprocessor() parser = SmilesParser(preprocessor) num = parser.extract_total_num(mol_smiles) assert num == 3 if __name__ == '__main__': pytest.main([__file__, '-s', '-v']) ================================================ FILE: tests/dataset_tests/preprocessor_tests/test_common.py ================================================ import numpy import pytest from rdkit import Chem from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.dataset.preprocessors import common from chainer_chemistry.utils.extend import extend_adj @pytest.fixture def sample_molecule(): return Chem.MolFromSmiles('CN=C=O') @pytest.fixture def sample_molecule_2(): return Chem.MolFromSmiles('Cc1ccccc1') class TestGetAtomicNumbers(object): def test_normal(self, sample_molecule): actual = 
common.construct_atomic_number_array(sample_molecule) assert actual.shape == (4,) expect = numpy.array([6, 7, 6, 8], dtype=numpy.int32) numpy.testing.assert_equal(actual, expect) def test_padding(self, sample_molecule): actual = common.construct_atomic_number_array(sample_molecule, 5) assert actual.shape == (5,) expect = numpy.array([6, 7, 6, 8, 0], dtype=numpy.int32) numpy.testing.assert_equal(actual, expect) def test_normal_truncated(self, sample_molecule): with pytest.raises(ValueError): adj = common.construct_atomic_number_array(sample_molecule, 3) # NOQA class TestGetAdjMatrix(object): def test_normal(self, sample_molecule_2): adj = common.construct_adj_matrix(sample_molecule_2) assert adj.shape == (7, 7) expect = numpy.array( [[1., 1., 0., 0., 0., 0., 0., ], [1., 1., 1., 0., 0., 0., 1., ], [0., 1., 1., 1., 0., 0., 0., ], [0., 0., 1., 1., 1., 0., 0., ], [0., 0., 0., 1., 1., 1., 0., ], [0., 0., 0., 0., 1., 1., 1., ], [0., 1., 0., 0., 0., 1., 1., ]], dtype=numpy.float32) numpy.testing.assert_equal(adj, expect) def test_normal_no_self_connection(self, sample_molecule_2): adj = common.construct_adj_matrix(sample_molecule_2, self_connection=False) assert adj.shape == (7, 7) expect = numpy.array( [[0., 1., 0., 0., 0., 0., 0.], [1., 0., 1., 0., 0., 0., 1.], [0., 1., 0., 1., 0., 0., 0.], [0., 0., 1., 0., 1., 0., 0.], [0., 0., 0., 1., 0., 1., 0.], [0., 0., 0., 0., 1., 0., 1.], [0., 1., 0., 0., 0., 1., 0.]], dtype=numpy.float32) numpy.testing.assert_equal(adj, expect) def test_normal_padding(self, sample_molecule_2): adj = common.construct_adj_matrix(sample_molecule_2, 8) assert adj.shape == (8, 8) expect = numpy.array( [[1., 1., 0., 0., 0., 0., 0., 0.], [1., 1., 1., 0., 0., 0., 1., 0.], [0., 1., 1., 1., 0., 0., 0., 0.], [0., 0., 1., 1., 1., 0., 0., 0.], [0., 0., 0., 1., 1., 1., 0., 0.], [0., 0., 0., 0., 1., 1., 1., 0.], [0., 1., 0., 0., 0., 1., 1., 0.], [0., 0., 0., 0., 0., 0., 0., 0.]], dtype=numpy.float32) numpy.testing.assert_equal(adj, expect) def 
test_normal_truncated(self, sample_molecule_2): with pytest.raises(ValueError): adj = common.construct_adj_matrix(sample_molecule_2, 6) # NOQA class TestConstructDiscreteEdgeMatrix(object): expect_adj = numpy.array( [[[0., 1., 0., 0., 0., 0., 0.], [1., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 0., 0., 1.], [0., 1., 0., 1., 0., 0., 0.], [0., 0., 1., 0., 1., 0., 0.], [0., 0., 0., 1., 0., 1., 0.], [0., 0., 0., 0., 1., 0., 1.], [0., 1., 0., 0., 0., 1., 0.]]], dtype=numpy.float32) def test_default(self, sample_molecule_2): adj = common.construct_discrete_edge_matrix(sample_molecule_2) assert adj.shape == (4, 7, 7) numpy.testing.assert_equal(adj, self.expect_adj) def test_add_self_connection_channel(self, sample_molecule_2): adj = common.construct_discrete_edge_matrix( sample_molecule_2, add_self_connection_channel=True) assert adj.shape == (5, 7, 7) numpy.testing.assert_equal(adj[:4], self.expect_adj) numpy.testing.assert_equal(adj[4], numpy.eye(7, 7)) def test_padding(self, sample_molecule_2): adj = common.construct_discrete_edge_matrix(sample_molecule_2, 8) assert adj.shape == (4, 8, 8) expect = extend_adj(self.expect_adj, out_size=8, axis=[-1, -2]) numpy.testing.assert_equal(adj, expect) def test_truncated(self, sample_molecule_2): with pytest.raises(ValueError): adj = common.construct_discrete_edge_matrix(sample_molecule_2, 6) # NOQA def 
test_construct_super_node_feature_adj_ndim2(sample_molecule): adj = common.construct_adj_matrix(sample_molecule) atom_array = common.construct_atomic_number_array(sample_molecule) s = common.construct_supernode_feature(sample_molecule, atom_array, adj) # print(s) assert s.shape == (MAX_ATOMIC_NUM * 2 + 4,) assert s[0] == len(atom_array) assert s[1] == adj.sum() assert s[2] == 1 assert s[3] == 1 assert s[3 + 6] == 1 # C assert s[3 + 7] == 1 # N assert s[3 + 8] == 1 # O assert s[3 + MAX_ATOMIC_NUM] == 0 # other assert s[3 + MAX_ATOMIC_NUM + 6] == 2 / len(atom_array) assert s[3 + MAX_ATOMIC_NUM + 7] == 1 / len(atom_array) assert s[3 + MAX_ATOMIC_NUM + 8] == 1 / len(atom_array) assert s[3 + MAX_ATOMIC_NUM * 2] == 0 def test_construct_super_node_feature_adj_ndim3(sample_molecule): adj = common.construct_discrete_edge_matrix(sample_molecule) atom_array = common.construct_atomic_number_array(sample_molecule) s = common.construct_supernode_feature(sample_molecule, atom_array, adj) assert s.shape == (MAX_ATOMIC_NUM * 2 + 10,) assert s[0] == len(atom_array) assert s[1] == adj.sum() assert s[2] == 1 assert s[3] == 1 assert s[4] == 0 assert s[5] == 0 assert pytest.approx(s[6], 1 * 2 / adj.sum()) # symmetric assert pytest.approx(s[7], 2 * 2 / adj.sum()) # symmetric assert s[8] == 0 assert s[9] == 0 assert s[9 + 6] == 1 # C assert s[9 + 6] == 1 # N assert s[9 + 7] == 1 # O assert s[9 + MAX_ATOMIC_NUM] == 0 # other assert s[9 + MAX_ATOMIC_NUM + 6] == 2 / len(atom_array) assert s[9 + MAX_ATOMIC_NUM + 7] == 1 / len(atom_array) assert s[9 + MAX_ATOMIC_NUM + 8] == 1 / len(atom_array) assert s[9 + MAX_ATOMIC_NUM * 2] == 0 if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_atomic_number_preprocessor.py ================================================ import numpy import pytest from rdkit import Chem from chainer_chemistry.dataset.parsers import SmilesParser from 
chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor # NOQA from chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError # NOQA @pytest.fixture def mol(): ret = Chem.MolFromSmiles('CN=C=O') return ret def test_atomic_number_default_preprocessor(mol): preprocessor = AtomicNumberPreprocessor() ret_atom_array = preprocessor.get_input_features(mol) expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32) numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array) def test_atomic_number_non_default_padding_preprocessor(mol): preprocessor = AtomicNumberPreprocessor(out_size=10) ret_atom_array = preprocessor.get_input_features(mol) expect_atom_array = numpy.array([6, 7, 6, 8, 0, 0, 0, 0, 0, 0], dtype=numpy.int32) numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array) def test_atomic_number_non_default_max_atoms_preprocessor(mol): preprocessor = AtomicNumberPreprocessor(max_atoms=5) ret_atom_array = preprocessor.get_input_features(mol) expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32) numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array) preprocessor = AtomicNumberPreprocessor(max_atoms=3) with pytest.raises(MolFeatureExtractionError): preprocessor.get_input_features(mol) def test_atomic_number_preprocessor(mol): preprocessor = AtomicNumberPreprocessor(max_atoms=5, out_size=10) ret_atom_array = preprocessor.get_input_features(mol) expect_atom_array = numpy.array([6, 7, 6, 8, 0, 0, 0, 0, 0, 0], dtype=numpy.int32) numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array) def test_atomic_number_preprocessor_default(): preprocessor = AtomicNumberPreprocessor() dataset = SmilesParser(preprocessor).parse( ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'])['dataset'] index = numpy.random.choice(len(dataset), None) atoms, = dataset[index] assert atoms.ndim == 1 assert atoms.dtype == numpy.int32 def test_atomic_number_preprocessor_assert_raises(): with 
pytest.raises(ValueError): AtomicNumberPreprocessor(max_atoms=3, out_size=2) # NOQA if __name__ == '__main__': pytest.main() ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_cgcnn_preprocessor.py ================================================ import pytest from chainer_chemistry.dataset.preprocessors import CGCNNPreprocessor def test_cgcnn_preprocessor_init(): pp = CGCNNPreprocessor() print('pp.atom_features', pp.atom_features) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_gat_preprocessor.py ================================================ import numpy import pytest from chainer_chemistry.dataset.parsers import SDFFileParser from chainer_chemistry.dataset.preprocessors import RelGATPreprocessor from chainer_chemistry.datasets import get_tox21_filepath @pytest.mark.slow def test_gat_preprocessor(): preprocessor = RelGATPreprocessor() def postprocess_label(label_list): # Set -1 to the place where the label is not found, # this corresponds to not calculate loss with `sigmoid_cross_entropy` return [-1 if label is None else label for label in label_list] dataset = SDFFileParser(preprocessor, postprocess_label=postprocess_label ).parse(get_tox21_filepath('train'))["dataset"] index = numpy.random.choice(len(dataset), None) atoms, adjs = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (edge_type, atom from, atom to) assert adjs.ndim == 3 assert adjs.dtype == numpy.float32 def test_gat_preprocessor_assert_raises(): with pytest.raises(ValueError): pp = RelGATPreprocessor(max_atoms=3, out_size=2) # NOQA if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_ggnn_preprocessor.py ================================================ import numpy import pytest from 
chainer_chemistry.dataset.parsers import SmilesParser from chainer_chemistry.dataset.preprocessors import GGNNPreprocessor def test_ggnn_preprocessor(): preprocessor = GGNNPreprocessor() dataset = SmilesParser(preprocessor).parse( ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'] )["dataset"] index = numpy.random.choice(len(dataset), None) atoms, adjs = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (edge_type, atom from, atom to) assert adjs.ndim == 3 assert adjs.dtype == numpy.float32 atoms0, adjs0 = dataset[0] assert numpy.allclose(atoms0, numpy.array([6, 7], numpy.int32)) expect_adjs = numpy.array( [[[0., 0.], [0., 0.]], [[0., 0.], [0., 0.]], [[0., 1.], [1., 0.]], [[0., 0.], [0., 0.]]], dtype=numpy.float32) assert numpy.allclose(adjs0, expect_adjs) atoms1, adjs1 = dataset[1] assert numpy.allclose( atoms1, numpy.array([6, 6, 6, 7, 6, 6, 8, 7, 6], numpy.int32)) # include aromatic bond (ch=3) expect_adjs = numpy.array( [[[0., 1., 0., 0., 0., 0., 0., 0., 0.], [1., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 1., 0., 0., 0.], [0., 0., 0., 0., 1., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 1.], [0., 0., 0., 0., 0., 0., 0., 1., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 1., 0., 0.], [0., 0., 0., 0., 0., 1., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[0., 
0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 0., 0., 0., 1., 0.], [0., 1., 0., 1., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 1., 0., 0., 0., 0.], [0., 0., 0., 1., 0., 0., 0., 1., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 1., 0., 0., 1., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=numpy.float32) assert numpy.allclose(adjs1, expect_adjs) def test_ggnn_preprocessor_kekulize(): preprocessor = GGNNPreprocessor(kekulize=True) dataset = SmilesParser(preprocessor).parse( ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'] )["dataset"] atoms1, adjs1 = dataset[1] assert numpy.allclose( atoms1, numpy.array([6, 6, 6, 7, 6, 6, 8, 7, 6], numpy.int32)) # NOT include aromatic bond (ch=3) expect_adjs = numpy.array( [[[0., 1., 0., 0., 0., 0., 0., 0., 0.], [1., 0., 0., 0., 0., 0., 0., 1., 0.], [0., 0., 0., 1., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 1., 0., 1., 0.], [0., 0., 0., 0., 1., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 1., 0., 0., 1., 0., 0., 0., 1.], [0., 0., 0., 0., 0., 0., 0., 1., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 0., 0., 0., 0., 0.], [0., 1., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 1., 0., 0., 0., 0.], [0., 0., 0., 1., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 1., 0., 0.], [0., 0., 0., 0., 0., 1., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 
0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=numpy.float32) assert numpy.allclose(adjs1, expect_adjs) def test_ggnn_preprocessor_assert_raises(): with pytest.raises(ValueError): pp = GGNNPreprocessor(max_atoms=3, out_size=2) # NOQA if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_gwm_preprocessor.py ================================================ import pytest from rdkit import Chem from chainer_chemistry.dataset.preprocessors.gwm_preprocessor import ( NFPGWMPreprocessor, GGNNGWMPreprocessor, GINGWMPreprocessor, RSGCNGWMPreprocessor) # NOQA @pytest.fixture def mol(): ret = Chem.MolFromSmiles('CN=C=O') return ret @pytest.mark.parametrize('pp_type', [ NFPGWMPreprocessor, GGNNGWMPreprocessor, GINGWMPreprocessor, RSGCNGWMPreprocessor]) def test_gwm_preprocessor(mol, pp_type): pp = pp_type() ret = pp.get_input_features(mol) # currently all preprocessor returns `super_node_x` at 3rd args. 
assert len(ret) == 3 super_node_x = ret[2] # print('super_node_x', super_node_x.shape, super_node_x) assert super_node_x.ndim == 1 if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_mol_preprocessor.py ================================================ import pytest from rdkit import Chem from chainer_chemistry.dataset.preprocessors import MolPreprocessor @pytest.fixture def mol(): ret = Chem.MolFromSmiles('CN=C=O') ret.SetProp('foo', '1') ret.SetProp('bar', '2') return ret @pytest.fixture def pp(): return MolPreprocessor() class TestGetLabel(object): def test_default(self, mol, pp): labels = pp.get_label(mol) assert labels == [] def test_empty(self, mol, pp): labels = pp.get_label(mol, []) assert labels == [] def test_one_label(self, mol, pp): labels = pp.get_label(mol, ['foo']) assert labels == ['1'] def test_two_labels(self, mol, pp): labels = pp.get_label(mol, ['bar', 'foo']) assert labels == ['2', '1'] def test_non_existent_label(self, mol, pp): labels = pp.get_label(mol, ['foo', 'buz']) assert labels == ['1', None] if __name__ == '__main__': pytest.main() ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_nfp_preprocessor.py ================================================ import numpy import pytest from rdkit import Chem from chainer_chemistry.dataset.parsers import SmilesParser from chainer_chemistry.dataset.preprocessors import NFPPreprocessor @pytest.fixture def mol(): ret = Chem.MolFromSmiles('CN=C=O') return ret @pytest.fixture def pp(): return NFPPreprocessor() def test_nfp_preprocessor(mol, pp): ret = pp.get_input_features(mol) assert len(ret) == 2 actual_atom_array, actual_adj_array = ret expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32) numpy.testing.assert_array_equal(actual_atom_array, expect_atom_array) expect_adj_array = numpy.array([[1, 1, 0, 0], [1, 1, 1, 0], [0, 1, 1, 1], 
[0, 0, 1, 1]], dtype=numpy.float32) numpy.testing.assert_array_equal(actual_adj_array, expect_adj_array) def test_nfp_preprocessor_default(): preprocessor = NFPPreprocessor() dataset = SmilesParser(preprocessor).parse( ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'])['dataset'] index = numpy.random.choice(len(dataset), None) atoms, adjs = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) assert adjs.ndim == 2 assert adjs.dtype == numpy.float32 def test_nfp_preprocessor_assert_raises(): with pytest.raises(ValueError): pp = NFPPreprocessor(max_atoms=3, out_size=2) # NOQA if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_relgcn_preprocessor.py ================================================ import numpy import pytest from chainer_chemistry.dataset.parsers import SmilesParser from chainer_chemistry.dataset.preprocessors import RelGCNPreprocessor def test_relgcn_preprocessor(): preprocessor = RelGCNPreprocessor() dataset = SmilesParser(preprocessor).parse( ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'] )["dataset"] index = numpy.random.choice(len(dataset), None) atoms, adjs = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (edge_type, atom from, atom to) assert adjs.ndim == 3 assert adjs.dtype == numpy.float32 atoms0, adjs0 = dataset[0] assert numpy.allclose(atoms0, numpy.array([6, 7], numpy.int32)) expect_adjs = numpy.array( [[[0., 0.], [0., 0.]], [[0., 0.], [0., 0.]], [[0., 1.], [1., 0.]], [[0., 0.], [0., 0.]]], dtype=numpy.float32) assert numpy.allclose(adjs0, expect_adjs) atoms1, adjs1 = dataset[1] assert numpy.allclose( atoms1, numpy.array([6, 6, 6, 7, 6, 6, 8, 7, 6], numpy.int32)) # include aromatic bond (ch=3) expect_adjs = numpy.array( [[[0., 1., 0., 0., 0., 0., 0., 0., 0.], [1., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 
0., 0.], [0., 0., 0., 0., 0., 1., 0., 0., 0.], [0., 0., 0., 0., 1., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 1.], [0., 0., 0., 0., 0., 0., 0., 1., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 1., 0., 0.], [0., 0., 0., 0., 0., 1., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 0., 0., 0., 1., 0.], [0., 1., 0., 1., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 1., 0., 0., 0., 0.], [0., 0., 0., 1., 0., 0., 0., 1., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 1., 0., 0., 1., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=numpy.float32) assert numpy.allclose(adjs1, expect_adjs) def test_relgcn_preprocessor_kekulize(): preprocessor = RelGCNPreprocessor(kekulize=True) dataset = SmilesParser(preprocessor).parse( ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'] )["dataset"] atoms1, adjs1 = dataset[1] assert numpy.allclose( atoms1, numpy.array([6, 6, 6, 7, 6, 6, 8, 7, 6], numpy.int32)) # NOT include aromatic bond (ch=3) expect_adjs = numpy.array( [[[0., 1., 0., 0., 0., 0., 0., 0., 0.], [1., 0., 0., 0., 0., 0., 0., 1., 0.], [0., 0., 0., 1., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 1., 0., 1., 0.], [0., 0., 0., 0., 1., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 1., 0., 0., 1., 0., 0., 0., 1.], [0., 0., 0., 0., 0., 0., 0., 1., 
0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 1., 0., 0., 0., 0., 0., 0.], [0., 1., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 1., 0., 0., 0., 0.], [0., 0., 0., 1., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 1., 0., 0.], [0., 0., 0., 0., 0., 1., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=numpy.float32) assert numpy.allclose(adjs1, expect_adjs) def test_relgcn_preprocessor_assert_raises(): with pytest.raises(ValueError): pp = RelGCNPreprocessor(max_atoms=3, out_size=2) # NOQA if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_rsgcn_preprocessor.py ================================================ import numpy import pytest from rdkit import Chem from chainer_chemistry.dataset.parsers import SmilesParser from chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError # NOQA from chainer_chemistry.dataset.preprocessors.rsgcn_preprocessor import RSGCNPreprocessor # NOQA @pytest.fixture def mol(): return Chem.MolFromSmiles('CN=C=O') def test_rsgcn_default_preprocessor(mol): preprocessor = RSGCNPreprocessor() ret_atom_array, ret_adj_array = preprocessor.get_input_features(mol) expect_atom_array = 
numpy.array([6, 7, 6, 8], dtype=numpy.int32) expect_adj_array = numpy.array( [[0.5, 0.4082, 0, 0], [0.4082, 0.3333, 0.3333, 0], [0, 0.3333, 0.3333, 0.4082], [0, 0, 0.4082, 0.5]], dtype=numpy.float32) numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array) numpy.testing.assert_allclose( ret_adj_array, expect_adj_array, rtol=1e-03, atol=1e-03) def test_rsgcn_non_default_padding_preprocessor(mol): preprocessor = RSGCNPreprocessor(out_size=7) ret_atom_array, ret_adj_array = preprocessor.get_input_features(mol) expect_atom_array = numpy.array([6, 7, 6, 8, 0, 0, 0], dtype=numpy.int32) expect_adj_array = numpy.array( [[0.5, 0.4082, 0, 0, 0, 0, 0], [0.4082, 0.3333, 0.3333, 0, 0, 0, 0], [0, 0.3333, 0.3333, 0.4082, 0, 0, 0], [0, 0, 0.4082, 0.5, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], dtype=numpy.float32) numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array) numpy.testing.assert_allclose( ret_adj_array, expect_adj_array, rtol=1e-03, atol=1e-03) def test_rsgcn_non_default_max_atoms_preprocessor(mol): preprocessor = RSGCNPreprocessor(max_atoms=5) ret_atom_array, ret_adj_array = preprocessor.get_input_features(mol) expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32) expect_adj_array = numpy.array( [[0.5, 0.4082, 0, 0], [0.4082, 0.3333, 0.3333, 0], [0, 0.3333, 0.3333, 0.4082], [0, 0, 0.4082, 0.5]], dtype=numpy.float32) numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array) numpy.testing.assert_allclose( ret_adj_array, expect_adj_array, rtol=1e-03, atol=1e-03) preprocessor = RSGCNPreprocessor(max_atoms=3) with pytest.raises(MolFeatureExtractionError): preprocessor.get_input_features(mol) def test_rsgcn_preprocessor(mol): preprocessor = RSGCNPreprocessor(max_atoms=4, out_size=4) ret_atom_array, ret_adj_array = preprocessor.get_input_features(mol) expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32) expect_adj_array = numpy.array( [[0.5, 0.4082, 0, 0], [0.4082, 0.3333, 0.3333, 0], 
[0, 0.3333, 0.3333, 0.4082], [0, 0, 0.4082, 0.5]], dtype=numpy.float32) numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array) numpy.testing.assert_allclose( ret_adj_array, expect_adj_array, rtol=1e-03, atol=1e-03) def test_rsgcn_preprocessor_default(): preprocessor = RSGCNPreprocessor() dataset = SmilesParser(preprocessor).parse( ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'])['dataset'] index = numpy.random.choice(len(dataset), None) atoms, adjacency = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 assert adjacency.ndim == 2 assert adjacency.dtype == numpy.float32 def test_rsgcn_preprocessor_assert_raises(): with pytest.raises(ValueError): RSGCNPreprocessor(max_atoms=3, out_size=2) # NOQA if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_schnet_preprocessor.py ================================================ import numpy import pytest from rdkit import Chem from chainer_chemistry.dataset.parsers import SmilesParser from chainer_chemistry.dataset.preprocessors.schnet_preprocessor import SchNetPreprocessor # NOQA @pytest.fixture def mol(): ret = Chem.MolFromSmiles('CN=C=O') return ret @pytest.fixture def pp(): return SchNetPreprocessor() def test_schnet_preprocessor(mol, pp): ret = pp.get_input_features(mol) assert len(ret) == 2 actual_atom_array, actual_adj_array = ret expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32) numpy.testing.assert_array_equal(actual_atom_array, expect_atom_array) # TODO(nakago): write test for adj matrix. 
# print(actual_adj_array) # expect_adj_array = numpy.array([[1, 1, 0, 0], # [1, 1, 1, 0], # [0, 1, 1, 1], # [0, 0, 1, 1]], dtype=numpy.float32) # numpy.testing.assert_array_equal(actual_adj_array, expect_adj_array) def test_schnet_preprocessor_default(): preprocessor = SchNetPreprocessor() dataset = SmilesParser(preprocessor).parse( ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'])['dataset'] index = numpy.random.choice(len(dataset), None) atoms, adjs = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) assert adjs.ndim == 2 assert adjs.dtype == numpy.float32 def test_schnet_preprocessor_assert_raises(): with pytest.raises(ValueError): pp = SchNetPreprocessor(max_atoms=3, out_size=2) # NOQA if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_weavenet_preprocessor.py ================================================ import numpy import pytest from chainer_chemistry.dataset.parsers import SmilesParser from chainer_chemistry.dataset.preprocessors.weavenet_preprocessor import WeaveNetPreprocessor # NOQA @pytest.mark.parametrize('max_atoms', [20, 30]) @pytest.mark.parametrize('use_fixed_atom_feature', [True, False]) def test_weave_preprocessor(max_atoms, use_fixed_atom_feature): preprocessor = WeaveNetPreprocessor( max_atoms=max_atoms, use_fixed_atom_feature=use_fixed_atom_feature) dataset = SmilesParser(preprocessor).parse( ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'] )["dataset"] index = numpy.random.choice(len(dataset), None) atoms, adjs = dataset[index] if use_fixed_atom_feature: assert atoms.ndim == 2 # (atom, ch) assert atoms.dtype == numpy.float32 else: assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from * atom to, ch) assert adjs.ndim == 2 assert adjs.shape[0] == max_atoms * max_atoms assert adjs.dtype == numpy.float32 # TODO(nakago): test feature extraction behavior... 
atoms0, adjs0 = dataset[0] if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_wle.py ================================================ import numpy as np import pytest from chainer_chemistry.dataset.preprocessors import wle as WLE # NOQA from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset @pytest.fixture def small_datasets(): N_1 = 3 N_2 = 5 # one-hot atom labels: 1 tp N atom_array_1 = np.arange(N_1) atom_array_2 = np.arange(N_2) # adj-array, manually # all connectes. expanded labels is a permutaion of 0,1,2 adj_array_1 = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]).astype(np.int32) # node 0 --> 0-1.2 # node 1 --> 1-0.2 # node 2 --> 2-0.1 adj_array_2 = np.array([[1, 1, 0, 0, 1], [1, 1, 0, 0, 1], [0, 0, 1, 1, 0], [0, 0, 1, 1, 0], [1, 1, 0, 0, 1]]).astype(np.float32) # node 0 --> 0-1.4 # node 1 --> 1-0.4 # node 2 --> 2-3 # node 3 --> 3-2 # node 4 --> 4-0.1 # supervised labels, dummy teach_signal_1 = np.array(1).astype(np.int) teach_signal_2 = np.array(0).astype(np.int) # concat in a one numpy array! 
atom_arrays = np.array([atom_array_1, atom_array_2]) adj_arrays = np.array([adj_array_1, adj_array_2]) teach_signals = np.array([teach_signal_1, teach_signal_2]) # train/val/test dataset, respectively datasets = [NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals), NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals), NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals)] return datasets def _get_elements(datasets, idx): return [[mol[1] for mol in d] for d in datasets] def _get_atom_arrays(datasets): return _get_elements(datasets, 0) def _get_adj_arrays(datasets): return _get_elements(datasets, 1) def _get_wle_arrays(datasets): return _get_elements(datasets, 2) def _get_teach_signals(datasets, is_cwle=False): if is_cwle: return _get_elements(datasets, 2) else: return _get_elements(datasets, 3) def _check_np_array(actuals, expects): assert len(actuals) == len(expects) == 3 # train/test/val for actual_adjs, expect_adjs in zip(actuals, expects): assert len(actual_adjs) == len(expect_adjs) [np.testing.assert_array_equal(a, e) for a, e in zip(actual_adjs, expect_adjs)] def test_wle(small_datasets): ret_value = WLE.apply_wle_for_datasets(small_datasets, 0) actual_datasets, actual_labels, actual_frequency = ret_value expected_frequency = {'0-1.2': 3, '1-0.2': 3, '2-0.1': 3, '0-1.4': 3, '1-0.4': 3, '2-3': 3, '3-2': 3, '4-0.1': 3} assert expected_frequency == actual_frequency expected_labels = set(expected_frequency.keys()) assert expected_labels == set(actual_labels) actual_adj_arrays = _get_adj_arrays(actual_datasets) expect_adj_arrays = _get_adj_arrays(small_datasets) _check_np_array(actual_adj_arrays, expect_adj_arrays) actual_signal_arrays = _get_teach_signals(actual_datasets) expect_signal_arrays = _get_teach_signals(small_datasets) _check_np_array(actual_signal_arrays, expect_signal_arrays) # Check atom_arrays of train/val/test datasets are identical. # 2 is the number of samples in each (train/val/test) dataset. 
atom_arrays = _get_atom_arrays(actual_datasets) first_mols = [d[0] for d in atom_arrays] second_mols = [d[1] for d in atom_arrays] for mols in (first_mols, second_mols): assert len(mols) == 3 np.testing.assert_array_equal(mols[0], mols[1]) np.testing.assert_array_equal(mols[1], mols[2]) def test_2_hop_wle(small_datasets): k = 2 ret_value = WLE.apply_wle_for_datasets(small_datasets, 0, k) actual_datasets, actual_labels, actual_frequency = ret_value expected_frequency = {'0-1.2': 3, '1-0.2': 3, '2-0.1': 3, '3-4.7': 3, '4-3.7': 3, '5-6': 3, '6-5': 3, '7-3.4': 3} # Kenta Oono (oono@preferred.jp) # The following assertion checks too strong condition. # Specifically it assumes that the WLE algorithm assigns # the extended atom labels appeared in the first iteration # in a certain order and runs the second iteration. # Strictly speaking, this is not required in the algorithm. assert expected_frequency == actual_frequency expected_labels = set(expected_frequency.keys()) assert expected_labels == set(actual_labels) actual_adj_arrays = _get_adj_arrays(actual_datasets) expect_adj_arrays = _get_adj_arrays(small_datasets) _check_np_array(actual_adj_arrays, expect_adj_arrays) actual_signal_arrays = _get_teach_signals(actual_datasets) expect_signal_arrays = _get_teach_signals(small_datasets) _check_np_array(actual_signal_arrays, expect_signal_arrays) # Check atom_arrays of train/val/test datasets are identical. # 2 is the number of samples in each (train/val/test) dataset. 
atom_arrays = _get_atom_arrays(actual_datasets) first_mols = [d[0] for d in atom_arrays] second_mols = [d[1] for d in atom_arrays] for mols in (first_mols, second_mols): assert len(mols) == 3 np.testing.assert_array_equal(mols[0], mols[1]) np.testing.assert_array_equal(mols[1], mols[2]) def test_cwle(small_datasets): ret_value = WLE.apply_cwle_for_datasets(small_datasets) actual_datasets, actual_labels, actual_frequency = ret_value expected_frequency = {'1.2': 3, '0.2': 3, '0.1': 6, '1.4': 3, '0.4': 3, '3': 3, '2': 3} assert expected_frequency == actual_frequency expected_labels = set(expected_frequency.keys()) assert expected_labels == set(actual_labels) actual_adj_arrays = _get_adj_arrays(actual_datasets) expect_adj_arrays = _get_adj_arrays(small_datasets) _check_np_array(actual_adj_arrays, expect_adj_arrays) actual_signal_arrays = _get_teach_signals(actual_datasets, True) expect_signal_arrays = _get_teach_signals(small_datasets) _check_np_array(actual_signal_arrays, expect_signal_arrays) # Check atom_arrays of train/val/test datasets are identical. atom_arrays = _get_atom_arrays(actual_datasets) first_mols = [d[0] for d in atom_arrays] second_mols = [d[1] for d in atom_arrays] for mols in (first_mols, second_mols): assert len(mols) == 3 np.testing.assert_array_equal(mols[0], mols[1]) np.testing.assert_array_equal(mols[1], mols[2]) # Check wle_arrays of train/val/test datasets are identical. 
wle_arrays = _get_wle_arrays(actual_datasets) first_mols = [d[0] for d in wle_arrays] second_mols = [d[1] for d in wle_arrays] for mols in [first_mols, second_mols]: assert len(mols) == 3 np.testing.assert_array_equal(mols[0], mols[1]) np.testing.assert_array_equal(mols[1], mols[2]) def test_findmaxidx_atom_label(small_datasets): actual = WLE.findmaxidx(small_datasets, 'atom_label') expect = 5 assert actual == expect @pytest.fixture def cwle_datasets(): B = 10 D_atom = 5 D_wle = 50 K_large = 10000 atom_arrays = [np.full((B, D_atom), K_large) for _ in range(3)] adj_arrays = [np.eye(B, dtype=np.int32) for _ in range(3)] wle_arrays = [np.arange(B * D_wle, dtype=np.int32).reshape(B, -1) for _ in range(3)] signal_arrays = [np.full(B, K_large) for _ in range(3)] print(wle_arrays[0].shape) datasets = [NumpyTupleDataset(atom_arrays[i], adj_arrays[i], wle_arrays[i], signal_arrays[i]) for i in range(3)] return datasets def test_findmaxidx_wle(cwle_datasets): actual = WLE.findmaxidx(cwle_datasets, 'wle_label') expect = 10 * 50 assert actual == expect ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_wle_atom_array_update.py ================================================ import itertools import numpy as np import pytest from chainer_chemistry.dataset.preprocessors import wle_atom_array_update as wle_update from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset @pytest.fixture def k3_datasets(): train_atoms = np.array([np.zeros(3, dtype=np.int32)]) val_atoms = np.array([np.ones(3, dtype=np.int32)]) test_atoms = np.array([np.full(3, 2, dtype=np.int32)]) train_adjs = np.array([np.ones((3, 3), dtype=np.int32)]) val_adjs = np.array([np.ones((3, 3), dtype=np.int32)]) test_adjs = np.array([np.ones((3, 3), dtype=np.int32)]) return ((train_atoms, val_atoms, test_atoms), (train_adjs, val_adjs, test_adjs)) def _is_all_same(arr): arr = np.array(arr) assert arr.size > 0 return np.all(arr == arr.item(0)) def 
_is_all_different(arr): for x, y in itertools.combinations(arr, 2): if x == y: return False return True @pytest.mark.parametrize('cutoff', (0, 1, 2, 3, 4)) def test_update_atom_array(k3_datasets, cutoff): atom_arrays, adj_arrays = k3_datasets actual_atom_arrays, actual_label_frequency = wle_update.update_atom_arrays( atom_arrays, adj_arrays, cutoff) mols = [d[0] for d in actual_atom_arrays] for m in mols: assert _is_all_same(m) # train/val/test atoms must have different labels. assert _is_all_different((mols[0][0], mols[1][0], mols[2][0])) if cutoff >= 3: expect_label_frequency = {'0': 3, '1': 3, '2': 3} else: expect_label_frequency = {'0-0.0': 3, '1-1.1': 3, '2-2.2': 3} assert actual_label_frequency == expect_label_frequency @pytest.fixture def single_atom_datasets(): train_atoms = np.array([[0], [1], [2]], dtype=np.int32) val_atoms = np.array([[1], [1], [5]], dtype=np.int32) test_atoms = np.array([[4], [4], [2]], dtype=np.int32) train_adjs = np.array([[[1]], [[1]], [[1]]], dtype=np.int32) val_adjs = np.array([[[1]], [[1]], [[1]]], dtype=np.int32) test_adjs = np.array([[[1]], [[1]], [[1]]], dtype=np.int32) return ((train_atoms, val_atoms, test_atoms), (train_adjs, val_adjs, test_adjs)) @pytest.mark.parametrize('cutoff', (0, 1, 2)) def test_update_atom_array_2(single_atom_datasets, cutoff): atom_arrays, adj_arrays = single_atom_datasets actual_atom_arrays, actual_label_frequency = wle_update.update_atom_arrays( atom_arrays, adj_arrays, cutoff) # Note that labels after expansion need not # same as the original atom labels. # For example, assigning ids accoring to # appearance order # 0 -> 0, 1 -> 1, 2 -> 2, 5 -> 3, 4 -> 4, # results in # Atom arrays # train: [[0], [1], [2]] # val: [[1], [1], [3]] # test: [[4], [4], [2]] # Label Frequency # {'0': 1, '1': 3, '2': 2, '3': 1, '4': 2} # This is acceptable. 
train, val, test = actual_atom_arrays assert _is_all_same((train[1], val[0], val[1])) assert _is_all_same((train[2], test[2])) assert _is_all_same((test[0], test[1])) assert _is_all_different((train[0], train[1], train[2], val[2], test[0])) expect_label_frequency = {'0-': 1, '1-': 3, '2-': 2, '4-': 2, '5-': 1} # Equal as a multiset. assert (sorted(actual_label_frequency.values()) == sorted(expect_label_frequency.values())) @pytest.fixture def different_sample_size_datasets(): train_atoms = np.array([[0]], dtype=np.int32) val_atoms = np.array([[0], [0]], dtype=np.int32) test_atoms = np.array([[0], [0], [0]], dtype=np.int32) train_adjs = np.array([[[1]]], dtype=np.int32) val_adjs = np.array([[[1]], [[1]]], dtype=np.int32) test_adjs = np.array([[[1]], [[1]], [[1]]], dtype=np.int32) return ((train_atoms, val_atoms, test_atoms), (train_adjs, val_adjs, test_adjs)) def test_update_atom_array_with_diffent_sample_sizes( different_sample_size_datasets): atom_arrays, adj_arrays = different_sample_size_datasets actual_atom_arrays, actual_label_frequency = wle_update.update_atom_arrays( atom_arrays, adj_arrays, 0) all_atoms = sum([list(a.ravel()) for a in actual_atom_arrays], []) assert _is_all_same(all_atoms) expect_label_frequency = {'0-': 6} assert actual_label_frequency == expect_label_frequency @pytest.fixture def different_graph_size_datasets(): train_atoms = np.array([[0]], dtype=np.int32) val_atoms = np.array([[0, 0]], dtype=np.int32) test_atoms = np.array([[0, 0, 0]], dtype=np.int32) train_adjs = np.array([[[1]]], dtype=np.int32) val_adjs = np.array([[[1, 1], [1, 1]]], dtype=np.int32) test_adjs = np.array([[[1, 1, 1], [1, 1, 1], [1, 1, 1]]], dtype=np.int32) return ((train_atoms, val_atoms, test_atoms), (train_adjs, val_adjs, test_adjs)) def test_update_atom_array_with_different_graph_size( different_graph_size_datasets): atom_arrays, adj_arrays = different_graph_size_datasets actual_atom_arrays, actual_label_frequency = wle_update.update_atom_arrays( atom_arrays, 
adj_arrays, 0) mols = [d[0] for d in actual_atom_arrays] for m in mols: assert _is_all_same(m) expect_label_frequency = {'0-': 1, '0-0': 2, '0-0.0': 3} assert actual_label_frequency == expect_label_frequency @pytest.fixture def line_graph_datasets(): train_atoms = np.zeros(5, dtype=np.int32).reshape(1, -1) val_atoms = np.array([[1]], dtype=np.int32) test_atoms = np.array([[1]], dtype=np.int32) train_adjs = np.array([[[1, 1, 0, 0, 0], [1, 1, 1, 0, 0], [0, 1, 1, 1, 0], [0, 0, 1, 1, 1], [0, 0, 0, 1, 1]]], dtype=np.int32) val_adjs = np.array([[[1]]], dtype=np.int32) test_adjs = np.array([[[1]]], dtype=np.int32) return ((train_atoms, val_atoms, test_atoms), (train_adjs, val_adjs, test_adjs)) def test_update_atom_array_twice(line_graph_datasets): atom_arrays, adj_arrays = line_graph_datasets for _ in range(2): atom_arrays, actual_label_frequency = wle_update.update_atom_arrays( atom_arrays, adj_arrays, 0) expect_label_frequency = {'0-1': 2, '1-0.1': 2, '1-1.1': 1, '2-': 2} # atoms in test and val datasets assert actual_label_frequency == expect_label_frequency @pytest.fixture def small_datasets(): N_1 = 3 N_2 = 5 # one-hot atom labels: 1 tp N atom_array_1 = np.arange(N_1) atom_array_2 = np.arange(N_2) # adj-array, manually # all connectes. expanded labels is a permutaion of 0,1,2 adj_array_1 = np.ones((3, 3), dtype=np.int32) # node 0 --> 0-1.2 # node 1 --> 1-0.2 # node 2 --> 2-0.1 adj_array_2 = np.array([[1, 1, 0, 0, 1], [1, 1, 0, 0, 1], [0, 0, 1, 1, 0], [0, 0, 1, 1, 0], [1, 1, 0, 0, 1]]).astype(np.float32) # node 0 --> 0-1.4 # node 1 --> 1-0.4 # node 2 --> 2-3 # node 3 --> 3-2 # node 4 --> 4-0.1 # supervised labels, dummy teach_signal_1 = np.array(1).astype(np.int) teach_signal_2 = np.array(0).astype(np.int) # concat in a one numpy array! 
atom_arrays = np.array([atom_array_1, atom_array_2]) adj_arrays = np.array([adj_array_1, adj_array_2]) teach_signals = np.array([teach_signal_1, teach_signal_2]) # train/val/test dataset, respectively datasets = [NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals), NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals), NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals)] return datasets def test_list_all_expanded_labels_with_focus_atom(small_datasets): atom_arrays = [[mol[0] for mol in d] for d in small_datasets] adj_arrays = [[mol[1] for mol in d] for d in small_datasets] actual_atom_lists, actual_frequencies = wle_update.list_all_expanded_labels( atom_arrays, adj_arrays, True) expected_frequency = {'0-1.2': 3, '1-0.2': 3, '2-0.1': 3, '0-1.4': 3, '1-0.4': 3, '2-3': 3, '3-2': 3, '4-0.1': 3} assert expected_frequency == actual_frequencies expect_atom_list = [ set(['0-1.2', '1-0.2', '2-0.1']), set(['0-1.4', '1-0.4', '2-3', '3-2', '4-0.1'])] for actual_atom_list in actual_atom_lists: for a, e in zip(actual_atom_list, expect_atom_list): assert set(a) == e def test_list_all_expanded_labels_without_focus_atom(small_datasets): atom_arrays = [[mol[0] for mol in d] for d in small_datasets] adj_arrays = [[mol[1] for mol in d] for d in small_datasets] actual_atom_lists, actual_frequencies = wle_update.list_all_expanded_labels( atom_arrays, adj_arrays, False) expected_frequency = {'1.2': 3, '0.2': 3, '0.1': 6, '1.4': 3, '0.4': 3, '3': 3, '2': 3} assert expected_frequency == actual_frequencies expect_atom_list = [ set(['1.2', '0.2', '0.1']), set(['1.4', '0.4', '3', '2', '0.1'])] for actual_atom_list in actual_atom_lists: for a, e in zip(actual_atom_list, expect_atom_list): assert set(a) == e ================================================ FILE: tests/dataset_tests/preprocessors_tests/test_wle_util.py ================================================ import numpy as np import pytest from chainer_chemistry.dataset.preprocessors import wle_util def test_to_index(): 
def test_to_index():
    values = ['foo', 'bar', 'buz', 'non-exist']
    mols = [['foo', 'bar', 'buz'],
            ['foo', 'foo'],
            ['buz', 'bar']]
    actual = wle_util.to_index(mols, values)
    # The per-molecule index arrays have different lengths, so they are
    # kept in a plain list: wrapping them in np.array() would build a
    # ragged array, which raises ValueError on NumPy >= 1.24.
    expect = [np.array([0, 1, 2], np.int32),
              np.array([0, 0], np.int32),
              np.array([2, 1], np.int32)]
    assert len(actual) == len(expect)
    for a, e in zip(actual, expect):
        np.testing.assert_array_equal(a, e)


def test_to_index_non_existence():
    """Labels absent from `values` must raise ValueError."""
    values = ['foo', 'bar']
    mols = [['strange_label']]
    with pytest.raises(ValueError):
        wle_util.to_index(mols, values)


def test_compress_relation_axis_2_dim():
    """A 2-D array has no relation axis and passes through unchanged."""
    arr = np.random.uniform(size=(10, 2))
    actual = wle_util.compress_relation_axis(arr)
    np.testing.assert_array_equal(actual, arr)


def test_compress_relation_axis_3_dim():
    arr = np.array(
        [
            [
                [1, 0],
                [2, 0],
            ],
            [
                [1, 1],
                [0, 0]
            ]
        ]
    )
    # Move the relation axis to the middle, as compress_relation_axis
    # expects (N, R, N)-like input.
    arr = np.swapaxes(arr, 0, 1)
    ret = wle_util.compress_relation_axis(arr)
    actual = ret != 0
    expect = np.array(
        [[True, True],
         [True, False]]
    )
    np.testing.assert_array_equal(actual, expect)


def test_compress_relation_axis_invalid_ndim():
    """Only 2-D and 3-D arrays are accepted."""
    arr = np.zeros(3)
    with pytest.raises(ValueError):
        wle_util.compress_relation_axis(arr)

    arr = np.zeros((1, 2, 3, 4))
    with pytest.raises(ValueError):
        wle_util.compress_relation_axis(arr)


@pytest.fixture
def small_molecule():
    # a-b-c d  (d is isolated)
    atom_array = ['a', 'b', 'c', 'd']
    neighbors = np.array(
        [
            [0, 1, 1, 2],  # first end of edges
            [1, 0, 2, 1]   # second end of edges
        ]
    )
    return atom_array, neighbors


def test_get_neighbor_representation_with_focus_atom(small_molecule):
    atom_array, neighbors = small_molecule
    expects = ['a-b', 'b-a.c', 'c-b', 'd-']
    for i in range(len(expects)):
        actual = wle_util.get_neighbor_representation(
            i, atom_array, neighbors, True)
        assert actual == expects[i]


def test_get_neighbor_representation_without_focus_atom(small_molecule):
    atom_array, neighbors = small_molecule
    expects = ['b', 'a.c', 'b', '']
    for i in range(len(expects)):
        actual = wle_util.get_neighbor_representation(
            i, atom_array, neighbors, False)
        assert actual == expects[i]
@pytest.mark.parametrize('label, expect', [ ('a-b', 'a'), ('a-b.c', 'a'), ('aa-b', 'aa'), ('a-', 'a'), ('aa-', 'aa'), ]) def test_get_focus_node_label(label, expect): actual = wle_util.get_focus_node_label(label) assert actual == expect @pytest.mark.parametrize('label', ['aa', 'a-a-a', 'a--']) def test_get_focus_node_label_invalid(label): with pytest.raises(ValueError): wle_util.get_focus_node_label(label) ================================================ FILE: tests/dataset_tests/splitters_tests/test_deepchem_scaffold_splitter.py ================================================ import numpy import pandas import pytest from chainer_chemistry.dataset.parsers.data_frame_parser import DataFrameParser # NOQA from chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor from chainer_chemistry.dataset.splitters.deepchem_scaffold_splitter import generate_scaffold # NOQA from chainer_chemistry.dataset.splitters.deepchem_scaffold_splitter import DeepChemScaffoldSplitter # NOQA from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset @pytest.fixture def smiles_list(): smileses = [ "CC1=CC2CC(CC1)O2", "O=Cc1nccn1C=O", "CCC(C)(C)C(O)C=O", "C#CCC(C)(CO)OC", "Nc1coc(=O)nc1N", "CC12C=CC(CCC1)C2", "CC12CCC1C2OC=O", "CC1C2CC3(COC3)N12", "O=C1NC=NC12CC2", "C1=CC2CN2CC2NC12", ] return smileses @pytest.fixture def dataset(smiles_list): df = pandas.DataFrame(data={'smiles': smiles_list, 'value': numpy.random.rand(10)}) pp = AtomicNumberPreprocessor() parser = DataFrameParser(pp, labels='value') dataset = parser.parse(df, return_smiles=True) return dataset def test_generate_scaffold(): smiles = "Nc1coc(=O)nc1N" actual = generate_scaffold(smiles) expect = 'O=c1nccco1' assert actual == expect def test_split(dataset): splitter = DeepChemScaffoldSplitter() train_ind, valid_ind, test_ind = splitter._split( dataset=dataset['dataset'], smiles_list=dataset['smiles']) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 8 assert 
valid_ind.shape[0] == 1 assert test_ind.shape[0] == 1 train_ind, valid_ind, test_ind = splitter._split( dataset=dataset['dataset'], smiles_list=dataset['smiles'], frac_train=0.5, frac_valid=0.3, frac_test=0.2) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 5 assert valid_ind.shape[0] == 3 assert test_ind.shape[0] == 2 def test_split_fix_seed(dataset): splitter = DeepChemScaffoldSplitter() train_ind1, valid_ind1, test_ind1 = splitter._split( dataset=dataset['dataset'], smiles_list=dataset['smiles'], seed=44) train_ind2, valid_ind2, test_ind2 = splitter._split( dataset=dataset['dataset'], smiles_list=dataset['smiles'], seed=44) assert numpy.array_equal(train_ind1, train_ind2) assert numpy.array_equal(valid_ind1, valid_ind2) assert numpy.array_equal(test_ind1, test_ind2) def test_split_fail(dataset): splitter = DeepChemScaffoldSplitter() with pytest.raises(AssertionError): train_ind, valid_ind, test_ind = splitter._split( dataset=dataset['dataset'], smiles_list=dataset['smiles'], frac_train=0.4, frac_valid=0.3, frac_test=0.2) def test_train_valid_test_split(dataset): splitter = DeepChemScaffoldSplitter() train_ind, valid_ind, test_ind = splitter.train_valid_test_split( dataset=dataset['dataset'], smiles_list=dataset['smiles']) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 8 assert valid_ind.shape[0] == 1 assert test_ind.shape[0] == 1 def test_train_valid_test_split_return_dataset(dataset): splitter = DeepChemScaffoldSplitter() train, valid, test = splitter.train_valid_test_split( dataset=dataset['dataset'], smiles_list=dataset['smiles'], return_index=False) assert type(train) == NumpyTupleDataset assert type(valid) == NumpyTupleDataset assert type(test) == NumpyTupleDataset assert len(train) == 8 assert len(valid) == 1 assert len(test) == 1 def test_train_valid_split(dataset): splitter = DeepChemScaffoldSplitter() train_ind, valid_ind = splitter.train_valid_split( dataset=dataset['dataset'], smiles_list=dataset['smiles']) 
assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 9 assert valid_ind.shape[0] == 1 def test_train_valid_split_return_dataset(dataset): splitter = DeepChemScaffoldSplitter() train, valid = splitter.train_valid_split(dataset=dataset['dataset'], smiles_list=dataset['smiles'], return_index=False) assert type(train) == NumpyTupleDataset assert type(valid) == NumpyTupleDataset assert len(train) == 9 assert len(valid) == 1 ================================================ FILE: tests/dataset_tests/splitters_tests/test_random_splitter.py ================================================ import numpy import pytest from chainer_chemistry.dataset.splitters.random_splitter import RandomSplitter from chainer_chemistry.datasets import NumpyTupleDataset @pytest.fixture def dataset(): a = numpy.random.random((10, 10)) b = numpy.random.random((10, 8)) c = numpy.random.random((10, 1)) return NumpyTupleDataset(a, b, c) @pytest.fixture def ndarray_dataset(): a = numpy.random.random((10, 10)) return a def test_split(dataset): splitter = RandomSplitter() train_ind, valid_ind, test_ind = splitter._split(dataset) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 8 assert valid_ind.shape[0] == 1 assert test_ind.shape[0] == 1 train_ind, valid_ind, test_ind = splitter._split(dataset, frac_train=0.5, frac_valid=0.3, frac_test=0.2) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 5 assert valid_ind.shape[0] == 3 assert test_ind.shape[0] == 2 def test_split_fix_seed(dataset): splitter = RandomSplitter() train_ind1, valid_ind1, test_ind1 = splitter._split(dataset, seed=44) train_ind2, valid_ind2, test_ind2 = splitter._split(dataset, seed=44) assert numpy.array_equal(train_ind1, train_ind2) assert numpy.array_equal(valid_ind1, valid_ind2) assert numpy.array_equal(test_ind1, test_ind2) def test_split_fail(dataset): splitter = RandomSplitter() with pytest.raises(AssertionError): train_ind, valid_ind, test_ind = splitter._split(dataset, 
frac_train=0.4, frac_valid=0.3, frac_test=0.2) def test_train_valid_test_split(dataset): splitter = RandomSplitter() train_ind, valid_ind, test_ind = splitter.train_valid_test_split(dataset) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 8 assert valid_ind.shape[0] == 1 assert test_ind.shape[0] == 1 def test_train_valid_test_split_return_dataset(dataset): splitter = RandomSplitter() train, valid, test = splitter.train_valid_test_split(dataset, return_index=False) assert type(train) == NumpyTupleDataset assert type(valid) == NumpyTupleDataset assert type(test) == NumpyTupleDataset assert len(train) == 8 assert len(valid) == 1 assert len(test) == 1 def test_train_valid_test_split_ndarray_return_dataset(ndarray_dataset): splitter = RandomSplitter() train, valid, test = splitter.train_valid_test_split(ndarray_dataset, return_index=False) assert type(train) == numpy.ndarray assert type(valid) == numpy.ndarray assert type(test) == numpy.ndarray assert len(train) == 8 assert len(valid) == 1 assert len(test) == 1 def test_train_valid_split(dataset): splitter = RandomSplitter() train_ind, valid_ind = splitter.train_valid_split(dataset) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 9 assert valid_ind.shape[0] == 1 def test_train_valid_split_return_dataset(dataset): splitter = RandomSplitter() train, valid = splitter.train_valid_split(dataset, return_index=False) assert type(train) == NumpyTupleDataset assert type(valid) == NumpyTupleDataset assert len(train) == 9 assert len(valid) == 1 ================================================ FILE: tests/dataset_tests/splitters_tests/test_scaffold_splitter.py ================================================ import numpy import pandas import pytest from chainer_chemistry.dataset.parsers.data_frame_parser import DataFrameParser # NOQA from chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor from chainer_chemistry.dataset.splitters.scaffold_splitter import 
generate_scaffold # NOQA from chainer_chemistry.dataset.splitters.scaffold_splitter import ScaffoldSplitter # NOQA from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset @pytest.fixture def smiles_list(): smileses = [ "CC1=CC2CC(CC1)O2", "O=Cc1nccn1C=O", "CCC(C)(C)C(O)C=O", "C#CCC(C)(CO)OC", "Nc1coc(=O)nc1N", "CC12C=CC(CCC1)C2", "CC12CCC1C2OC=O", "CC1C2CC3(COC3)N12", "O=C1NC=NC12CC2", "C1=CC2CN2CC2NC12", ] return smileses @pytest.fixture def dataset(smiles_list): df = pandas.DataFrame(data={'smiles': smiles_list, 'value': numpy.random.rand(10)}) pp = AtomicNumberPreprocessor() parser = DataFrameParser(pp, labels='value') dataset = parser.parse(df, return_smiles=True) return dataset def test_generate_scaffold(): smiles = "Nc1coc(=O)nc1N" actual = generate_scaffold(smiles) expect = 'O=c1nccco1' assert actual == expect def test_split(dataset): splitter = ScaffoldSplitter() train_ind, valid_ind, test_ind = splitter._split( dataset=dataset['dataset'], smiles_list=dataset['smiles']) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 8 assert valid_ind.shape[0] == 1 assert test_ind.shape[0] == 1 train_ind, valid_ind, test_ind = splitter._split( dataset=dataset['dataset'], smiles_list=dataset['smiles'], frac_train=0.5, frac_valid=0.3, frac_test=0.2) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 5 assert valid_ind.shape[0] == 3 assert test_ind.shape[0] == 2 def test_split_fix_seed(dataset): splitter = ScaffoldSplitter() train_ind1, valid_ind1, test_ind1 = splitter._split( dataset=dataset['dataset'], smiles_list=dataset['smiles'], seed=44) train_ind2, valid_ind2, test_ind2 = splitter._split( dataset=dataset['dataset'], smiles_list=dataset['smiles'], seed=44) assert numpy.array_equal(train_ind1, train_ind2) assert numpy.array_equal(valid_ind1, valid_ind2) assert numpy.array_equal(test_ind1, test_ind2) def test_split_fail(dataset): splitter = ScaffoldSplitter() with pytest.raises(AssertionError): train_ind, 
valid_ind, test_ind = splitter._split( dataset=dataset['dataset'], smiles_list=dataset['smiles'], frac_train=0.4, frac_valid=0.3, frac_test=0.2) def test_train_valid_test_split(dataset): splitter = ScaffoldSplitter() train_ind, valid_ind, test_ind = splitter.train_valid_test_split( dataset=dataset['dataset'], smiles_list=dataset['smiles']) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 8 assert valid_ind.shape[0] == 1 assert test_ind.shape[0] == 1 def test_train_valid_test_split_return_dataset(dataset): splitter = ScaffoldSplitter() train, valid, test = splitter.train_valid_test_split( dataset=dataset['dataset'], smiles_list=dataset['smiles'], return_index=False) assert type(train) == NumpyTupleDataset assert type(valid) == NumpyTupleDataset assert type(test) == NumpyTupleDataset assert len(train) == 8 assert len(valid) == 1 assert len(test) == 1 def test_train_valid_split(dataset): splitter = ScaffoldSplitter() train_ind, valid_ind = splitter.train_valid_split( dataset=dataset['dataset'], smiles_list=dataset['smiles']) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 9 assert valid_ind.shape[0] == 1 def test_train_valid_split_return_dataset(dataset): splitter = ScaffoldSplitter() train, valid = splitter.train_valid_split(dataset=dataset['dataset'], smiles_list=dataset['smiles'], return_index=False) assert type(train) == NumpyTupleDataset assert type(valid) == NumpyTupleDataset assert len(train) == 9 assert len(valid) == 1 ================================================ FILE: tests/dataset_tests/splitters_tests/test_stratified_splitter.py ================================================ import numpy import pytest from chainer_chemistry.dataset.splitters.stratified_splitter import StratifiedSplitter # NOQA from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset @pytest.fixture def cls_dataset(): a = numpy.random.random((30, 10)) b = numpy.random.random((30, 8)) c = numpy.concatenate([numpy.zeros(20), 
@pytest.fixture
def cls_dataset():
    """A 30-sample dataset whose last member is a binary class label."""
    a = numpy.random.random((30, 10))
    b = numpy.random.random((30, 8))
    # numpy.int was removed in NumPy 1.24; the builtin int is the exact
    # replacement (it was an alias).
    c = numpy.concatenate([numpy.zeros(20),
                           numpy.ones(10)]).astype(int)
    return NumpyTupleDataset(a, b, c)


@pytest.fixture
def cls_label():
    """Binary labels matching cls_dataset's last member."""
    c = numpy.concatenate([numpy.zeros(20), numpy.ones(10)]).astype(int)
    return c


@pytest.fixture
def cls_ndarray_dataset():
    """Plain ndarray dataset and labels (20 zeros followed by 10 ones)."""
    a = numpy.concatenate([numpy.zeros(20), numpy.ones(10)]).astype(int)
    b = numpy.concatenate([numpy.zeros(20), numpy.ones(10)]).astype(int)
    return a, b


@pytest.fixture
def reg_dataset():
    """A 100-sample regression dataset with targets 0..99."""
    a = numpy.random.random((100, 10))
    b = numpy.random.random((100, 8))
    # numpy.float was also removed in NumPy 1.24; builtin float is the
    # exact replacement.
    c = numpy.arange(100).astype(float)
    return NumpyTupleDataset(a, b, c)


def test_classification_split(cls_dataset):
    splitter = StratifiedSplitter()

    # Default fractions: 0.8 / 0.1 / 0.1, stratified on the class label.
    train_ind, valid_ind, test_ind = splitter._split(cls_dataset)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 24
    assert valid_ind.shape[0] == 3
    assert test_ind.shape[0] == 3

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    test = NumpyTupleDataset(*cls_dataset.features[test_ind])
    # The positive class (1/3 of the samples) must be represented
    # proportionally in each subset.
    assert (train.features[:, -1] == 1).sum() == 8
    assert (valid.features[:, -1] == 1).sum() == 1
    assert (test.features[:, -1] == 1).sum() == 1

    # Custom fractions.
    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,
                                                     frac_train=0.5,
                                                     frac_valid=0.3,
                                                     frac_test=0.2)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 15
    assert valid_ind.shape[0] == 9
    assert test_ind.shape[0] == 6

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    test = NumpyTupleDataset(*cls_dataset.features[test_ind])
    assert (train.features[:, -1] == 1).sum() == 5
    assert (valid.features[:, -1] == 1).sum() == 3
    assert (test.features[:, -1] == 1).sum() == 2


def test_classification_split_by_labels_ndarray(cls_dataset, cls_label):
    """Stratify on an explicitly supplied ndarray of labels."""
    splitter = StratifiedSplitter()
    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,
                                                     labels=cls_label)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 24
    assert valid_ind.shape[0] == 3
    assert test_ind.shape[0] == 3

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    test = NumpyTupleDataset(*cls_dataset.features[test_ind])
    assert (train.features[:, -1] == 1).sum() == 8
    assert (valid.features[:, -1] == 1).sum() == 1
    assert (test.features[:, -1] == 1).sum() == 1

    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,
                                                     labels=cls_label,
                                                     frac_train=0.5,
                                                     frac_valid=0.3,
                                                     frac_test=0.2)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 15
    assert valid_ind.shape[0] == 9
    assert test_ind.shape[0] == 6

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    test = NumpyTupleDataset(*cls_dataset.features[test_ind])
    assert (train.features[:, -1] == 1).sum() == 5
    assert (valid.features[:, -1] == 1).sum() == 3
    assert (test.features[:, -1] == 1).sum() == 2


def test_classification_split_by_labels_list(cls_dataset, cls_label):
    """Stratify on labels supplied as a plain Python list."""
    cls_label = cls_label.tolist()
    splitter = StratifiedSplitter()
    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,
                                                     labels=cls_label)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 24
    assert valid_ind.shape[0] == 3
    assert test_ind.shape[0] == 3

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    test = NumpyTupleDataset(*cls_dataset.features[test_ind])
    assert (train.features[:, -1] == 1).sum() == 8
    assert (valid.features[:, -1] == 1).sum() == 1
    assert (test.features[:, -1] == 1).sum() == 1

    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,
                                                     labels=cls_label,
                                                     frac_train=0.5,
                                                     frac_valid=0.3,
                                                     frac_test=0.2)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 15
    assert valid_ind.shape[0] == 9
    assert test_ind.shape[0] == 6

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    test = NumpyTupleDataset(*cls_dataset.features[test_ind])
    assert (train.features[:, -1] == 1).sum() == 5
    assert (valid.features[:, -1] == 1).sum() == 3
    assert (test.features[:, -1] == 1).sum() == 2


def test_regression_split(reg_dataset):
    """Regression stratification keeps each subset's target mean near 50."""
    splitter = StratifiedSplitter()
    train_ind, valid_ind, test_ind = splitter._split(reg_dataset)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 80
    assert valid_ind.shape[0] == 10
    assert test_ind.shape[0] == 10

    train = NumpyTupleDataset(*reg_dataset.features[train_ind])
    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])
    test = NumpyTupleDataset(*reg_dataset.features[test_ind])
    assert 45.0 < train.features[:, -1].mean() < 55.0
    assert 45.0 < valid.features[:, -1].mean() < 55.0
    assert 45.0 < test.features[:, -1].mean() < 55.0

    train_ind, valid_ind, test_ind = splitter._split(reg_dataset,
                                                     frac_train=0.5,
                                                     frac_valid=0.3,
                                                     frac_test=0.2)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 50
    assert valid_ind.shape[0] == 30
    assert test_ind.shape[0] == 20

    train = NumpyTupleDataset(*reg_dataset.features[train_ind])
    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])
    test = NumpyTupleDataset(*reg_dataset.features[test_ind])
    assert 45.0 < train.features[:, -1].mean() < 55.0
    assert 45.0 < valid.features[:, -1].mean() < 55.0
    assert 45.0 < test.features[:, -1].mean() < 55.0


def test_classification_split_fix_seed(cls_dataset):
    """The same seed must reproduce the same split."""
    splitter = StratifiedSplitter()
    train_ind1, valid_ind1, test_ind1 = splitter._split(cls_dataset, seed=44)
    train_ind2, valid_ind2, test_ind2 = splitter._split(cls_dataset, seed=44)
    assert numpy.array_equal(train_ind1, train_ind2)
    assert numpy.array_equal(valid_ind1, valid_ind2)
    assert numpy.array_equal(test_ind1, test_ind2)


def test_split_fail_by_frac_ratio(cls_dataset):
    """Fractions that do not sum to 1 must be rejected."""
    splitter = StratifiedSplitter()
    with pytest.raises(AssertionError):
        train_ind, valid_ind, test_ind = splitter._split(cls_dataset,
                                                         frac_train=0.4,
                                                         frac_valid=0.3,
                                                         frac_test=0.2)
def test_split_fail_by_invalid_task_type(cls_dataset):
    """An unknown task_type must raise ValueError."""
    splitter = StratifiedSplitter()
    with pytest.raises(ValueError):
        train_ind, valid_ind, test_ind = splitter._split(cls_dataset,
                                                         frac_train=0.5,
                                                         frac_valid=0.3,
                                                         frac_test=0.2,
                                                         task_type='mix')


def test_regression_split_fix_seed(reg_dataset):
    """The same seed must reproduce the same regression split."""
    splitter = StratifiedSplitter()
    train_ind1, valid_ind1, test_ind1 = splitter._split(reg_dataset, seed=44)
    train_ind2, valid_ind2, test_ind2 = splitter._split(reg_dataset, seed=44)
    assert numpy.array_equal(train_ind1, train_ind2)
    assert numpy.array_equal(valid_ind1, valid_ind2)
    assert numpy.array_equal(test_ind1, test_ind2)


def test_train_valid_test_classification_split(cls_dataset):
    splitter = StratifiedSplitter()
    train_ind, valid_ind, test_ind =\
        splitter.train_valid_test_split(cls_dataset)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 24
    assert valid_ind.shape[0] == 3
    assert test_ind.shape[0] == 3

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    test = NumpyTupleDataset(*cls_dataset.features[test_ind])
    # Positive class (1/3 of samples) is spread proportionally.
    assert (train.features[:, -1] == 1).sum() == 8
    assert (valid.features[:, -1] == 1).sum() == 1
    assert (test.features[:, -1] == 1).sum() == 1


def test_train_valid_test_classification_split_return_dataset(cls_dataset):
    splitter = StratifiedSplitter()
    train, valid, test = splitter.train_valid_test_split(cls_dataset,
                                                         return_index=False)
    assert type(train) == NumpyTupleDataset
    assert type(valid) == NumpyTupleDataset
    assert type(test) == NumpyTupleDataset
    assert len(train) == 24
    assert len(valid) == 3
    assert len(test) == 3
    assert (train.features[:, -1] == 1).sum() == 8
    assert (valid.features[:, -1] == 1).sum() == 1
    assert (test.features[:, -1] == 1).sum() == 1


def test_train_valid_test_classification_split_ndarray_return_dataset(
        cls_ndarray_dataset):
    """Splitting a plain ndarray with explicit labels returns ndarrays."""
    cls_dataset, cls_label = cls_ndarray_dataset
    splitter = StratifiedSplitter()
    train, valid, test = splitter.train_valid_test_split(cls_dataset,
                                                         labels=cls_label,
                                                         return_index=False)
    assert type(train) == numpy.ndarray
    assert type(valid) == numpy.ndarray
    assert type(test) == numpy.ndarray
    assert len(train) == 24
    assert len(valid) == 3
    assert len(test) == 3
    assert (train == 1).sum() == 8
    assert (valid == 1).sum() == 1
    assert (test == 1).sum() == 1


def test_train_valid_test_regression_split(reg_dataset):
    splitter = StratifiedSplitter()
    train_ind, valid_ind, test_ind =\
        splitter.train_valid_test_split(reg_dataset)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 80
    assert valid_ind.shape[0] == 10
    assert test_ind.shape[0] == 10

    train = NumpyTupleDataset(*reg_dataset.features[train_ind])
    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])
    test = NumpyTupleDataset(*reg_dataset.features[test_ind])
    assert 45.0 < train.features[:, -1].mean() < 55.0
    assert 45.0 < valid.features[:, -1].mean() < 55.0
    assert 45.0 < test.features[:, -1].mean() < 55.0


def test_train_valid_test_regression_split_return_dataset(reg_dataset):
    splitter = StratifiedSplitter()
    train, valid, test = splitter.train_valid_test_split(reg_dataset,
                                                         return_index=False)
    assert type(train) == NumpyTupleDataset
    assert type(valid) == NumpyTupleDataset
    assert type(test) == NumpyTupleDataset
    assert len(train) == 80
    assert len(valid) == 10
    assert len(test) == 10
    assert 45.0 < train.features[:, -1].mean() < 55.0
    assert 45.0 < valid.features[:, -1].mean() < 55.0
    assert 45.0 < test.features[:, -1].mean() < 55.0


def test_train_valid_classification_split(cls_dataset):
    splitter = StratifiedSplitter()
    train_ind, valid_ind = splitter.train_valid_split(cls_dataset)
    assert type(train_ind) == numpy.ndarray
    assert train_ind.shape[0] == 27
    assert valid_ind.shape[0] == 3

    train = NumpyTupleDataset(*cls_dataset.features[train_ind])
    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])
    assert (train.features[:, -1] == 1).sum() == 9
    assert (valid.features[:, -1] == 1).sum() == 1


def test_train_valid_classification_split_return_dataset(cls_dataset):
    splitter = StratifiedSplitter()
    train, valid = splitter.train_valid_split(cls_dataset, return_index=False)
    assert type(train) == NumpyTupleDataset
    assert type(valid) == NumpyTupleDataset
    assert len(train) == 27
    assert len(valid) == 3
    assert (train.features[:, -1] == 1).sum() == 9
    assert (valid.features[:, -1] == 1).sum() == 1


def test_train_valid_classification_split_ndarray_return_dataset(
        cls_ndarray_dataset):
    cls_dataset, cls_label = cls_ndarray_dataset
    splitter = StratifiedSplitter()
    train, valid = splitter.train_valid_split(cls_dataset,
                                              labels=cls_label,
                                              return_index=False)
    assert type(train) == numpy.ndarray
    assert type(valid) == numpy.ndarray
    assert len(train) == 27
    assert len(valid) == 3
    assert (train == 1).sum() == 9
    assert (valid == 1).sum() == 1


def test_train_valid_test_cls_split_by_labels_return_dataset(cls_dataset,
                                                             cls_label):
    splitter = StratifiedSplitter()
    train, valid, test = splitter.train_valid_test_split(cls_dataset,
                                                         labels=cls_label,
                                                         return_index=False)
    assert type(train) == NumpyTupleDataset
    assert type(valid) == NumpyTupleDataset
    assert type(test) == NumpyTupleDataset
    assert len(train) == 24
    assert len(valid) == 3
    assert len(test) == 3
    assert (train.features[:, -1] == 1).sum() == 8
    assert (valid.features[:, -1] == 1).sum() == 1
    assert (test.features[:, -1] == 1).sum() == 1


def test_train_valid_cls_split_by_labels_return_dataset(cls_dataset,
                                                        cls_label):
    splitter = StratifiedSplitter()
    train, valid = splitter.train_valid_split(cls_dataset,
                                              labels=cls_label,
                                              return_index=False)
    assert type(train) == NumpyTupleDataset
    assert type(valid) == NumpyTupleDataset
    assert len(train) == 27
    assert len(valid) == 3
    assert (train.features[:, -1] == 1).sum() == 9
    assert (valid.features[:, -1] == 1).sum() == 1


def test_train_valid_regression_split(reg_dataset):
    splitter = StratifiedSplitter()
    train_ind, valid_ind = splitter.train_valid_split(reg_dataset)
    assert type(train_ind) == numpy.ndarray
assert train_ind.shape[0] == 90 assert valid_ind.shape[0] == 10 train = NumpyTupleDataset(*reg_dataset.features[train_ind]) valid = NumpyTupleDataset(*reg_dataset.features[valid_ind]) assert 45.0 < train.features[:, -1].mean() < 55.0 assert 45.0 < valid.features[:, -1].mean() < 55.0 def test_train_valid_regression_split_return_dataset(reg_dataset): splitter = StratifiedSplitter() train, valid = splitter.train_valid_split(reg_dataset, return_index=False) assert type(train) == NumpyTupleDataset assert type(valid) == NumpyTupleDataset assert len(train) == 90 assert len(valid) == 10 assert 45.0 < train.features[:, -1].mean() < 55.0 assert 45.0 < valid.features[:, -1].mean() < 55.0 ================================================ FILE: tests/dataset_tests/splitters_tests/test_time_splitter.py ================================================ import numpy import pytest from chainer_chemistry.dataset.splitters.time_splitter import TimeSplitter from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset @pytest.fixture def time_list(): times = [ 1980, 1990, 2010, 2020, 2000, 2050, 2030, 2040, 1960, 1970 ] return times @pytest.fixture() def dataset(): a = numpy.random.random((10, 10)) b = numpy.random.random((10, 8)) c = numpy.random.random((10, 1)) return NumpyTupleDataset(a, b, c) def test_split(dataset, time_list): splitter = TimeSplitter() train_ind, valid_ind, test_ind = splitter._split( dataset, time_list=time_list) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 8 assert valid_ind.shape[0] == 1 assert test_ind.shape[0] == 1 assert train_ind.tolist() == [8, 9, 0, 1, 4, 2, 3, 6] assert valid_ind.tolist() == [7] assert test_ind.tolist() == [5] train_ind, valid_ind, test_ind = splitter._split( dataset, frac_train=0.5, frac_valid=0.3, frac_test=0.2, time_list=time_list) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 5 assert valid_ind.shape[0] == 3 assert test_ind.shape[0] == 2 assert train_ind.tolist() == [8, 9, 
0, 1, 4] assert valid_ind.tolist() == [2, 3, 6] assert test_ind.tolist() == [7, 5] def test_split_fail(dataset, time_list): splitter = TimeSplitter() with pytest.raises(AssertionError): train_ind, valid_ind, test_ind = splitter._split( dataset, frac_train=0.4, frac_valid=0.3, frac_test=0.2, time_list=time_list) def test_train_valid_test_split(dataset, time_list): splitter = TimeSplitter() train_ind, valid_ind, test_ind = splitter.train_valid_test_split( dataset, time_list=time_list) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 8 assert valid_ind.shape[0] == 1 assert test_ind.shape[0] == 1 assert train_ind.tolist() == [8, 9, 0, 1, 4, 2, 3, 6] assert valid_ind.tolist() == [7] assert test_ind.tolist() == [5] def test_train_valid_test_split_return_dataset(dataset, time_list): splitter = TimeSplitter() train, valid, test = splitter.train_valid_test_split( dataset, return_index=False, time_list=time_list) assert type(train) == NumpyTupleDataset assert type(valid) == NumpyTupleDataset assert type(test) == NumpyTupleDataset assert len(train) == 8 assert len(valid) == 1 assert len(test) == 1 def test_train_valid_split(dataset, time_list): splitter = TimeSplitter() train_ind, valid_ind = splitter.train_valid_split( dataset, time_list=time_list) assert type(train_ind) == numpy.ndarray assert train_ind.shape[0] == 9 assert valid_ind.shape[0] == 1 assert train_ind.tolist() == [8, 9, 0, 1, 4, 2, 3, 6, 7] assert valid_ind.tolist() == [5] def test_train_split_return_dataset(dataset, time_list): splitter = TimeSplitter() train, valid = splitter.train_valid_split( dataset, return_index=False, time_list=time_list) assert type(train) == NumpyTupleDataset assert type(valid) == NumpyTupleDataset assert len(train) == 9 assert len(valid) == 1 ================================================ FILE: tests/dataset_tests/test_converters.py ================================================ import chainer import numpy import pytest from chainer_chemistry.dataset.converters 
import concat_mols @pytest.fixture def data_1d(): a = numpy.array([1, 2]) b = numpy.array([4, 5, 6]) return a, b @pytest.fixture def data_1d_expect(): a = numpy.array([1, 2, 0]) b = numpy.array([4, 5, 6]) return a, b @pytest.fixture def data_2d(): a = numpy.array([[1, 2], [3, 4]]) b = numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) return a, b @pytest.fixture def data_2d_expect(): a = numpy.array([[1, 2, 0], [3, 4, 0], [0, 0, 0]]) b = numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) return a, b def test_concat_mols_1d_cpu(data_1d, data_1d_expect): result = concat_mols(data_1d, device=-1) assert numpy.array_equal(result[0], data_1d_expect[0]) assert numpy.array_equal(result[1], data_1d_expect[1]) def test_concat_mols_2d_cpu(data_2d, data_2d_expect): result = concat_mols(data_2d, device=-1) assert numpy.array_equal(result[0], data_2d_expect[0]) assert numpy.array_equal(result[1], data_2d_expect[1]) @pytest.mark.gpu def test_concat_mols_1d_gpu(data_1d, data_1d_expect): result = concat_mols(data_1d, device=0) assert chainer.cuda.get_device_from_array(result[0]).id == 0 assert chainer.cuda.get_device_from_array(result[1]).id == 0 assert numpy.array_equal(chainer.cuda.to_cpu(result[0]), data_1d_expect[0]) assert numpy.array_equal(chainer.cuda.to_cpu(result[1]), data_1d_expect[1]) @pytest.mark.gpu def test_concat_mols_2d_gpu(data_2d, data_2d_expect): result = concat_mols(data_2d, device=0) assert chainer.cuda.get_device_from_array(result[0]).id == 0 assert chainer.cuda.get_device_from_array(result[1]).id == 0 assert numpy.array_equal(chainer.cuda.to_cpu(result[0]), data_2d_expect[0]) assert numpy.array_equal(chainer.cuda.to_cpu(result[1]), data_2d_expect[1]) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/dataset_tests/test_numpy_tuple_feature_indexer.py ================================================ import numpy import pytest from 
chainer_chemistry.dataset.indexers.numpy_tuple_dataset_feature_indexer import NumpyTupleDatasetFeatureIndexer # NOQA from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset @pytest.fixture def data(): a = numpy.array([1, 2]) b = numpy.array([4, 5]) c = numpy.array([[6, 7, 8], [8, 9, 10]]) return a, b, c @pytest.fixture def indexer(data): dataset = NumpyTupleDataset(*data) indexer = NumpyTupleDatasetFeatureIndexer(dataset) return indexer class TestNumpyTupleDatasetFeatureIndexer(object): def test_feature_length(self, indexer): assert indexer.features_length() == 3 @pytest.mark.parametrize('slice_index', [ 0, 1, slice(0, 2, None), slice(0, 0, None)]) @pytest.mark.parametrize('j', [0, 1]) def test_extract_feature_by_slice(self, indexer, data, slice_index, j): numpy.testing.assert_array_equal( indexer.extract_feature_by_slice(slice_index, j), data[j][slice_index]) # indexer's __getitem__ should call `extract_feature_by_slice` method, # result should be same with above. numpy.testing.assert_array_equal( indexer[slice_index, j], data[j][slice_index]) @pytest.mark.parametrize('ndarray_index', [ numpy.asarray([0, 1]), numpy.asarray([1]), numpy.asarray([], dtype=numpy.int32)]) @pytest.mark.parametrize('j', [0, 1]) def test_extract_feature_by_ndarray(self, indexer, data, ndarray_index, j): numpy.testing.assert_array_equal( indexer.extract_feature_by_slice(ndarray_index, j), data[j][ndarray_index]) # indexer's __getitem__ should call `extract_feature_by_slice` method, # result should be same with above. 
numpy.testing.assert_array_equal( indexer[ndarray_index, j], data[j][ndarray_index]) if __name__ == '__main__': pytest.main([__file__, '-v']) ================================================ FILE: tests/datasets_tests/molnet_tests/test_molnet.py ================================================ import os import numpy import pandas import pytest from chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor # NOQA from chainer_chemistry.datasets import molnet from chainer_chemistry.datasets import NumpyTupleDataset expect_bbbp_lengths = [1633, 203, 203] expect_bbbp_lengths2 = [1021, 611, 407] expect_clearance_lengths = [669, 83, 85] expect_pdbbind_lengths = [134, 16, 18] expect_featurized_pdbbind_lengths = [151, 18, 20] expect_qm7_lengths = [5468, 683, 683] def test_get_molnet_filepath_without_download(): filepath = molnet.get_molnet_filepath('bbbp', download_if_not_exist=False) if os.path.exists(filepath): os.remove(filepath) # ensure a cache file does not exist. filepath = molnet.get_molnet_filepath('bbbp', download_if_not_exist=False) assert isinstance(filepath, str) assert not os.path.exists(filepath) @pytest.mark.slow def test_get_molnet_filepath_with_download(): filepath = molnet.get_molnet_filepath('bbbp', download_if_not_exist=False) if os.path.exists(filepath): os.remove(filepath) # ensure a cache file does not exist. 
filepath = molnet.get_molnet_filepath('bbbp', download_if_not_exist=True) assert isinstance(filepath, str) assert os.path.exists(filepath) def test_get_grid_featurized_pdbbind_dataset(): # Test core dataset dataset = molnet.get_grid_featurized_pdbbind_dataset('core') assert isinstance(dataset, NumpyTupleDataset) x, y = dataset.get_datasets() assert x.shape == (189, 2052) assert x.dtype == numpy.int32 assert y.shape == (189, 1) assert y.dtype == numpy.float32 # Test full dataset dataset = molnet.get_grid_featurized_pdbbind_dataset('full') assert isinstance(dataset, NumpyTupleDataset) x, y = dataset.get_datasets() assert x.shape == (11303, 2052) assert x.dtype == numpy.int32 assert y.shape == (11303, 1) assert y.dtype == numpy.float32 # Test refined dataset dataset = molnet.get_grid_featurized_pdbbind_dataset('refined') assert isinstance(dataset, NumpyTupleDataset) x, y = dataset.get_datasets() assert x.shape == (3568, 2052) assert x.dtype == numpy.int32 assert y.shape == (3568, 1) assert y.dtype == numpy.float32 # bbbp is one of classification task dataset @pytest.mark.slow def test_get_molnet_bbbp_dataset(): # test default behavior pp = AtomicNumberPreprocessor() datasets = molnet.get_molnet_dataset('bbbp', preprocessor=pp) assert 'smiles' in datasets.keys() assert 'dataset' in datasets.keys() datasets = datasets['dataset'] assert len(datasets) == 3 assert type(datasets[0]) == NumpyTupleDataset assert type(datasets[1]) == NumpyTupleDataset assert type(datasets[2]) == NumpyTupleDataset # Test each train, valid and test dataset for i, dataset in enumerate(datasets): # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert label.ndim == 1 assert label.shape[0] == 1 assert label.dtype == numpy.int32 assert len(dataset) == expect_bbbp_lengths[i] # bbbp is one of 
classification task dataset @pytest.mark.slow def test_get_molnet_bbbp_dataset_change_split_ratio(): # test default behavior pp = AtomicNumberPreprocessor() datasets = molnet.get_molnet_dataset('bbbp', preprocessor=pp, frac_train=0.5, frac_valid=0.3, frac_test=0.2) assert 'smiles' in datasets.keys() assert 'dataset' in datasets.keys() datasets = datasets['dataset'] assert len(datasets) == 3 assert type(datasets[0]) == NumpyTupleDataset assert type(datasets[1]) == NumpyTupleDataset assert type(datasets[2]) == NumpyTupleDataset # Test each train, valid and test dataset for i, dataset in enumerate(datasets): # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert label.ndim == 1 assert label.shape[0] == 1 assert label.dtype == numpy.int32 assert len(dataset) == expect_bbbp_lengths2[i] @pytest.mark.slow def test_get_molnet_bbbp_dataset_with_smiles(): # test default behavior pp = AtomicNumberPreprocessor() datasets = molnet.get_molnet_dataset('bbbp', preprocessor=pp, return_smiles=True) assert 'smiles' in datasets.keys() assert 'dataset' in datasets.keys() smileses = datasets['smiles'] datasets = datasets['dataset'] assert len(smileses) == 3 assert len(datasets) == 3 # Test each train, valid and test dataset for i, dataset in enumerate(datasets): # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert label.ndim == 1 # NOQA assert label.shape[0] == 1 assert label.dtype == numpy.int32 assert len(dataset) == expect_bbbp_lengths[i] assert len(smileses[i]) == expect_bbbp_lengths[i] # clearance is one of classification task dataset @pytest.mark.slow def 
test_get_molnet_clearance_dataset(): # test default behavior pp = AtomicNumberPreprocessor() datasets = molnet.get_molnet_dataset('clearance', preprocessor=pp) assert 'smiles' in datasets.keys() assert 'dataset' in datasets.keys() datasets = datasets['dataset'] assert len(datasets) == 3 # Test each train, valid and test dataset for i, dataset in enumerate(datasets): # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert label.ndim == 1 assert label.shape[0] == 1 assert label.dtype == numpy.float32 # --- Test number of dataset --- assert len(dataset) == expect_clearance_lengths[i] @pytest.mark.slow def test_get_molnet_clearance_dataset_with_return_smiles_enabled(): # test default behavior pp = AtomicNumberPreprocessor() datasets = molnet.get_molnet_dataset('clearance', preprocessor=pp, return_smiles=True) assert 'smiles' in datasets.keys() assert 'dataset' in datasets.keys() smileses = datasets['smiles'] datasets = datasets['dataset'] assert len(datasets) == 3 assert len(smileses) == 3 # Test each train, valid and test dataset for i, dataset in enumerate(datasets): # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert label.ndim == 1 assert label.shape[0] == 1 assert label.dtype == numpy.float32 # --- Test number of dataset --- assert len(dataset) == expect_clearance_lengths[i] assert len(smileses[i]) == expect_clearance_lengths[i] @pytest.mark.slow def test_get_molnet_pdbbind_dataset(): # test default behavior pp = AtomicNumberPreprocessor() time_list = numpy.random.randint(1000, size=168).tolist() datasets = molnet.get_molnet_dataset('pdbbind_smiles', preprocessor=pp, 
pdbbind_subset='core', time_list=time_list, split='random') assert 'smiles' in datasets.keys() assert 'dataset' in datasets.keys() assert 'pdb_id' in datasets.keys() datasets = datasets['dataset'] assert len(datasets) == 3 assert type(datasets[0]) == NumpyTupleDataset assert type(datasets[1]) == NumpyTupleDataset assert type(datasets[2]) == NumpyTupleDataset # Test each train, valid and test dataset for i, dataset in enumerate(datasets): # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert label.ndim == 1 assert label.shape[0] == 1 assert label.dtype == numpy.float32 # --- Test number of dataset --- assert len(dataset) == expect_pdbbind_lengths[i] @pytest.mark.slow def test_get_molnet_pdbbind_dataset_with_pdb_id(): # test default behavior pp = AtomicNumberPreprocessor() time_list = numpy.random.randint(1000, size=168).tolist() datasets = molnet.get_molnet_dataset('pdbbind_smiles', preprocessor=pp, pdbbind_subset='core', return_pdb_id=True, time_list=time_list, split='random') assert 'smiles' in datasets.keys() assert 'dataset' in datasets.keys() assert 'pdb_id' in datasets.keys() pdb_ids = datasets['pdb_id'] datasets = datasets['dataset'] assert len(pdb_ids) == 3 assert len(datasets) == 3 # Test each train, valid and test dataset for i, dataset in enumerate(datasets): # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert label.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert label.ndim == 1 assert label.shape[0] == 1 assert label.dtype == numpy.float32 # --Test number of dataset --- assert len(dataset) == expect_pdbbind_lengths[i] assert len(pdb_ids[i]) == expect_pdbbind_lengths[i] @pytest.mark.slow def 
test_get_molnet_grid_featurized_pdbbind_dataset(): # test default behavioer datasets = molnet.get_molnet_dataset('pdbbind_grid', pdbbind_subset='core', split='random') assert 'dataset' in datasets.keys() datasets = datasets['dataset'] assert len(datasets) == 3 assert type(datasets[0]) == NumpyTupleDataset assert type(datasets[1]) == NumpyTupleDataset assert type(datasets[2]) == NumpyTupleDataset # Test each train, valid and test dataset for i, dataset in enumerate(datasets): # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert label.ndim == 1 assert label.shape[0] == 1 assert label.dtype == numpy.float32 # --- Test number of dataset --- assert len(dataset) == expect_featurized_pdbbind_lengths[i] # For qm7 dataset, stratified splitting is recommended. @pytest.mark.slow def test_get_molnet_qm7_dataset(): # test default behavior pp = AtomicNumberPreprocessor() datasets = molnet.get_molnet_dataset('qm7', preprocessor=pp) assert 'smiles' in datasets.keys() assert 'dataset' in datasets.keys() datasets = datasets['dataset'] assert len(datasets) == 3 assert type(datasets[0]) == NumpyTupleDataset assert type(datasets[1]) == NumpyTupleDataset assert type(datasets[2]) == NumpyTupleDataset # Test each train, valid and test dataset for i, dataset in enumerate(datasets): # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert label.ndim == 1 assert label.shape[0] == 1 assert label.dtype == numpy.float32 # --- Test number of dataset --- assert len(dataset) == expect_qm7_lengths[i] # For qm7 dataset, stratified splitting is recommended. 
@pytest.mark.slow def test_get_molnet_qm7_dataset_with_smiles(): # test default behavior pp = AtomicNumberPreprocessor() datasets = molnet.get_molnet_dataset('qm7', preprocessor=pp, return_smiles=True) assert 'smiles' in datasets.keys() assert 'dataset' in datasets.keys() smileses = datasets['smiles'] datasets = datasets['dataset'] assert len(datasets) == 3 assert len(smileses) == 3 assert type(datasets[0]) == NumpyTupleDataset assert type(datasets[1]) == NumpyTupleDataset assert type(datasets[2]) == NumpyTupleDataset # Test each train, valid and test dataset for i, dataset in enumerate(datasets): # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert label.ndim == 1 assert label.shape[0] == 1 assert label.dtype == numpy.float32 # --- Test number of dataset --- assert len(dataset) == expect_qm7_lengths[i] assert len(smileses[i]) == expect_qm7_lengths[i] def test_get_molnet_bbbp_dataframe(): datasets = molnet.get_molnet_dataframe('bbbp') assert isinstance(datasets, pandas.DataFrame) assert len(datasets) == 2050 def test_get_molnet_pdbbind_smiles_dataframe(): datasets = molnet.get_molnet_dataframe('pdbbind_smiles', pdbbind_subset='core') assert isinstance(datasets, pandas.DataFrame) assert len(datasets) == 168 def test_get_molnet_pdbbind_grid_dataframe(): with pytest.raises(ValueError): datasets = molnet.get_molnet_dataframe('pdbbind_grid', # NOQA pdbbind_subset='core') if __name__ == '__main__': args = [__file__, '-v', '-s'] pytest.main(args=args) ================================================ FILE: tests/datasets_tests/molnet_tests/test_pdbbind_time.py ================================================ import os import pytest from chainer_chemistry.datasets.molnet import pdbbind_time @pytest.mark.slow def test_get_pdbbind_time_filepath(): filepath = 
pdbbind_time.get_pdbbind_time_filepath( download_if_not_exist=False) if os.path.exists(filepath): os.remove(filepath) filepath = pdbbind_time.get_pdbbind_time_filepath( download_if_not_exist=True) assert isinstance(filepath, str) assert os.path.exists(filepath) def test_get_pdbbind_time(): time_list = pdbbind_time.get_pdbbind_time() assert isinstance(time_list, list) for time in time_list: assert 1900 < time < 2100 ================================================ FILE: tests/datasets_tests/test_numpy_tuple_dataset.py ================================================ import os import tempfile import numpy import pytest import six from chainer_chemistry.datasets import NumpyTupleDataset @pytest.fixture def data(): a = numpy.array([1, 2]) b = numpy.array([4, 5]) c = numpy.array([[6, 7, 8], [8, 9, 10]]) return a, b, c @pytest.fixture def long_data(): a = numpy.array([1, 2, 3, 4]) b = numpy.array([4, 5, 6, 7]) c = numpy.array([[6, 7, 8], [8, 9, 10], [11, 12, 13], [14, 15, 16]]) return a, b, c class TestNumpyTupleDataset(object): def test_len(self, data): dataset = NumpyTupleDataset(*data) assert len(dataset) == 2 @pytest.mark.parametrize('index', [0, 1]) def test_get_item_integer_index(self, data, index): dataset = NumpyTupleDataset(*data) actual = dataset[index] assert len(actual) == len(data) for a, d in six.moves.zip(actual, data): numpy.testing.assert_array_equal(a, d[index]) @pytest.mark.parametrize('index', [slice(0, 2, None)]) def test_get_item_slice_index(self, data, index): dataset = NumpyTupleDataset(*data) actual = dataset[index] batches = [d[index] for d in data] length = len(batches[0]) expect = [tuple([batch[i] for batch in batches]) for i in six.moves.range(length)] assert len(actual) == len(expect) for tuple_a, tuple_e in six.moves.zip(actual, expect): assert len(tuple_a) == len(tuple_e) for a, e in six.moves.zip(tuple_a, tuple_e): numpy.testing.assert_array_equal(a, e) @pytest.mark.parametrize('index', [ numpy.asarray([2, 0]), numpy.asarray([1]), 
numpy.asarray([], dtype=numpy.int32)]) def test_get_item_ndarray_index(self, long_data, index): dataset = NumpyTupleDataset(*long_data) actual = dataset[index] batches = [d[index] for d in long_data] length = len(batches[0]) expect = [tuple([batch[i] for batch in batches]) for i in six.moves.range(length)] assert len(actual) == len(expect) for tuple_a, tuple_e in six.moves.zip(actual, expect): assert len(tuple_a) == len(tuple_e) for a, e in six.moves.zip(tuple_a, tuple_e): numpy.testing.assert_array_equal(a, e) @pytest.mark.parametrize('index', [[2, 0], [1]]) def test_get_item_list_index(self, long_data, index): dataset = NumpyTupleDataset(*long_data) actual = dataset[index] batches = [d[index] for d in long_data] length = len(batches[0]) expect = [tuple([batch[i] for batch in batches]) for i in six.moves.range(length)] assert len(actual) == len(expect) for tuple_a, tuple_e in six.moves.zip(actual, expect): assert len(tuple_a) == len(tuple_e) for a, e in six.moves.zip(tuple_a, tuple_e): numpy.testing.assert_array_equal(a, e) def test_invalid_datasets(self): a = numpy.array([1, 2]) b = numpy.array([1, 2, 3]) with pytest.raises(ValueError): NumpyTupleDataset(a, b) def test_save_load(self, data): tmp_cache_path = os.path.join(tempfile.mkdtemp(), 'tmp.npz') dataset = NumpyTupleDataset(*data) NumpyTupleDataset.save(tmp_cache_path, dataset) assert os.path.exists(tmp_cache_path) load_dataset = NumpyTupleDataset.load(tmp_cache_path) os.remove(tmp_cache_path) assert len(dataset._datasets) == len(load_dataset._datasets) for a, d in six.moves.zip(dataset._datasets, load_dataset._datasets): numpy.testing.assert_array_equal(a, d) def test_get_datasets(self, data): dataset = NumpyTupleDataset(*data) datasets = dataset.get_datasets() assert len(datasets) == len(data) for i in range(len(datasets)): numpy.testing.assert_array_equal(datasets[i], data[i]) if __name__ == '__main__': pytest.main([__file__, '-v']) ================================================ FILE: 
tests/datasets_tests/test_qm9.py ================================================ import os import numpy import pytest from chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor # NOQA from chainer_chemistry.datasets import qm9 QM9_NUM_LABEL = 15 QM9_NUM_DATASET = 133885 def test_get_qm9_filepath_without_download(): filepath = qm9.get_qm9_filepath(download_if_not_exist=False) if os.path.exists(filepath): os.remove(filepath) # ensure a cache file does not exist. filepath = qm9.get_qm9_filepath(download_if_not_exist=False) assert isinstance(filepath, str) assert not os.path.exists(filepath) @pytest.mark.slow def test_get_qm9_filepath_with_download(): filepath = qm9.get_qm9_filepath(download_if_not_exist=False) if os.path.exists(filepath): os.remove(filepath) # ensure a cache file does not exist. # This method downloads the file if not exist filepath = qm9.get_qm9_filepath(download_if_not_exist=True) assert isinstance(filepath, str) assert os.path.exists(filepath) @pytest.mark.slow def test_get_qm9(): # test default behavior pp = AtomicNumberPreprocessor() dataset = qm9.get_qm9(preprocessor=pp) # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert label.ndim == 1 assert label.shape[0] == QM9_NUM_LABEL assert label.dtype == numpy.float32 # --- Test number of dataset --- assert len(dataset) == QM9_NUM_DATASET @pytest.mark.slow def test_get_qm9_smiles(): # test default behavior pp = AtomicNumberPreprocessor() dataset, smiles = qm9.get_qm9(preprocessor=pp, return_smiles=True) # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert 
label.ndim == 1 assert label.shape[0] == QM9_NUM_LABEL assert label.dtype == numpy.float32 # --- Test number of dataset --- assert len(dataset) == QM9_NUM_DATASET assert len(smiles) == QM9_NUM_DATASET # --- Test order of dataset --- atoms0, labels0 = dataset[0] assert smiles[0] == 'C' assert numpy.alltrue(atoms0 == numpy.array([6], dtype=numpy.int32)) atoms7777, labels7777 = dataset[7777] assert smiles[7777] == 'CC1=NCCC(C)O1' assert numpy.alltrue( atoms7777 == numpy.array([6, 6, 7, 6, 6, 6, 6, 8], dtype=numpy.int32)) atoms133884, labels133884 = dataset[133884] assert smiles[133884] == 'C1N2C3C4C5OC13C2C54' assert numpy.alltrue( atoms133884 == numpy.array([6, 7, 6, 6, 6, 8, 6, 6, 6], dtype=numpy.int32)) def test_get_qm9_label_names(): label_names = qm9.get_qm9_label_names() assert isinstance(label_names, list) for label in label_names: assert isinstance(label, str) if __name__ == '__main__': args = [__file__, '-v', '-s'] pytest.main(args=args) ================================================ FILE: tests/datasets_tests/test_tox21.py ================================================ import os import numpy import pytest from chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor # NOQA from chainer_chemistry.datasets import tox21 TOX21_NUM_LABEL = 12 dataset_types = [ 'train', 'val', 'test' ] @pytest.mark.parametrize('dataset_type', dataset_types) def test_get_tox21_filepath_without_download(dataset_type): filepath = tox21.get_tox21_filepath(dataset_type, download_if_not_exist=False) if os.path.exists(filepath): os.remove(filepath) # ensure a cache file does not exist. 
filepath = tox21.get_tox21_filepath(dataset_type, download_if_not_exist=False) assert isinstance(filepath, str) assert not os.path.exists(filepath) @pytest.mark.slow @pytest.mark.parametrize('dataset_type', dataset_types) def test_get_tox21_filepath_with_download(dataset_type): filepath = tox21.get_tox21_filepath(dataset_type, download_if_not_exist=False) if os.path.exists(filepath): os.remove(filepath) # ensure a cache file does not exist. # This method downloads the file if not exist filepath = tox21.get_tox21_filepath(dataset_type, download_if_not_exist=True) assert isinstance(filepath, str) assert os.path.exists(filepath) @pytest.mark.slow def test_get_tox21(): # test default behavior pp = AtomicNumberPreprocessor() train, val, test = tox21.get_tox21(preprocessor=pp) # --- Test dataset is correctly obtained --- for dataset in [train, val, test]: index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 assert label.ndim == 1 assert label.shape[0] == TOX21_NUM_LABEL assert label.dtype == numpy.int32 def test_get_tox21_label_names(): label_names = tox21.get_tox21_label_names() assert isinstance(label_names, list) for label in label_names: assert isinstance(label, str) def test_get_tox21_filepath_assert_raises(): with pytest.raises(ValueError): tox21.get_tox21_filepath('other') if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/datasets_tests/test_zinc.py ================================================ import os import numpy import pytest from chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor # NOQA from chainer_chemistry.datasets import zinc ZINC250K_NUM_LABEL = 3 ZINC250K_NUM_DATASET = 249455 def test_get_zinc_filepath_without_download(): filepath = zinc.get_zinc250k_filepath(download_if_not_exist=False) if os.path.exists(filepath): os.remove(filepath) # 
ensure a cache file does not exist. filepath = zinc.get_zinc250k_filepath(download_if_not_exist=False) assert isinstance(filepath, str) assert not os.path.exists(filepath) @pytest.mark.slow def test_get_zinc_filepath_with_download(): filepath = zinc.get_zinc250k_filepath(download_if_not_exist=False) if os.path.exists(filepath): os.remove(filepath) # ensure a cache file does not exist. # This method downloads the file if not exist filepath = zinc.get_zinc250k_filepath(download_if_not_exist=True) assert isinstance(filepath, str) assert os.path.exists(filepath) @pytest.mark.slow def test_get_zinc(): # test default behavior pp = AtomicNumberPreprocessor() dataset = zinc.get_zinc250k(preprocessor=pp) # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 assert label.ndim == 1 assert label.shape[0] == ZINC250K_NUM_LABEL assert label.dtype == numpy.float32 # --- Test number of dataset --- assert len(dataset) == ZINC250K_NUM_DATASET @pytest.mark.slow def test_get_zinc_smiles(): # test smiles extraction and dataset order pp = AtomicNumberPreprocessor() target_index = [0, 7777, 249454] # set target_index for fast testing... 
dataset, smiles = zinc.get_zinc250k(preprocessor=pp, return_smiles=True, target_index=target_index) # --- Test dataset is correctly obtained --- index = numpy.random.choice(len(dataset), None) atoms, label = dataset[index] assert atoms.ndim == 1 # (atom, ) assert atoms.dtype == numpy.int32 # (atom from, atom to) or (edge_type, atom from, atom to) assert label.ndim == 1 assert label.shape[0] == ZINC250K_NUM_LABEL assert label.dtype == numpy.float32 # --- Test number of dataset --- assert len(dataset) == len(target_index) assert len(smiles) == len(target_index) # --- Test order of dataset --- assert smiles[0] == 'CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1' atoms0, labels0 = dataset[0] assert numpy.alltrue(atoms0 == numpy.array( [6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 8, 7, 6, 6, 6, 6, 6, 6, 9, 6, 6], dtype=numpy.int32)) assert numpy.alltrue(labels0 == numpy.array( [5.0506, 0.70201224, 2.0840945], dtype=numpy.float32)) assert smiles[1] == 'CCCc1cc(NC(=O)Nc2ccc3c(c2)OCCO3)n(C)n1' atoms7777, labels7777 = dataset[1] assert numpy.alltrue(atoms7777 == numpy.array( [6, 6, 6, 6, 6, 6, 7, 6, 8, 7, 6, 6, 6, 6, 6, 6, 8, 6, 6, 8, 7, 6, 7], dtype=numpy.int32)) assert numpy.alltrue(labels7777 == numpy.array( [2.7878, 0.9035222, 2.3195992], dtype=numpy.float32)) assert smiles[2] == 'O=C(CC(c1ccccc1)c1ccccc1)N1CCN(S(=O)(=O)c2ccccc2[N+](=O)[O-])CC1' # NOQA atoms249454, labels249454 = dataset[2] assert numpy.alltrue(atoms249454 == numpy.array( [8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 6, 7, 16, 8, 8, 6, 6, 6, 6, 6, 6, 7, 8, 8, 6, 6], dtype=numpy.int32)) assert numpy.alltrue(labels249454 == numpy.array( [3.6499, 0.37028658, 2.2142494], dtype=numpy.float32)) def test_get_zinc_label_names(): label_names = zinc.get_zinc250k_label_names() assert isinstance(label_names, list) for label in label_names: assert isinstance(label, str) if __name__ == '__main__': args = [__file__, '-v', '-s'] pytest.main(args=args) ================================================ FILE: 
tests/functions_tests/activation/test_megnet_softplus.py ================================================ import numpy import pytest from chainer import cuda from chainer_chemistry.functions.activation.megnet_softplus \ import megnet_softplus def test_forward_cpu(): x = numpy.array([[1, 2, 3], [6, 5, 4]], dtype=numpy.float32) output = megnet_softplus(x) expect_output = numpy.array([ [0.62011445, 1.4337809, 2.3554401], [5.3093286, 4.313568, 3.3250027]], dtype=numpy.float32) numpy.allclose(output.array, expect_output) def test_forward_zero_cpu(): x = numpy.zeros((2, 3), dtype=numpy.float32) output = megnet_softplus(x) expect_output = numpy.zeros((2, 3), dtype=numpy.float32) numpy.allclose(output.array, expect_output) def test_forward_avoid_overflow_cpu(): x = numpy.array([1e5], dtype=numpy.float32) output = megnet_softplus(x) expect_output = numpy.array([1e5], dtype=numpy.float32) numpy.allclose(output.array, expect_output) @pytest.mark.gpu def test_forward_gpu(): x = cuda.to_gpu(numpy.array([[1, 2, 3], [6, 5, 4]], dtype=numpy.float32)) output = megnet_softplus(x) expect_output = numpy.array([ [0.62011445, 1.4337809, 2.3554401], [5.3093286, 4.313568, 3.3250027]], dtype=numpy.float32) numpy.allclose(cuda.to_cpu(output.array), expect_output) @pytest.mark.gpu def test_forward_zero_gpu(): x = cuda.to_gpu(numpy.zeros((2, 3), dtype=numpy.float32)) output = megnet_softplus(x) expect_output = numpy.zeros((2, 3), dtype=numpy.float32) numpy.allclose(cuda.to_cpu(output.array), expect_output) @pytest.mark.gpu def test_forward_avoid_overflow_gpu(): x = cuda.to_gpu(numpy.array([1e5], dtype=numpy.float32)) output = megnet_softplus(x) expect_output = numpy.array([1e5], dtype=numpy.float32) numpy.allclose(cuda.to_cpu(output.array), expect_output) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/functions_tests/activation/test_shifted_softplus.py ================================================ import numpy 
import pytest from chainer import cuda from chainer_chemistry.functions.activation.shifted_softplus import shifted_softplus # NOQA def test_forward_cpu(): x = numpy.array([[1, 2, 3], [6, 5, 4]], dtype=numpy.float32) output = shifted_softplus(x) expect_output = numpy.array([ [0.62011445, 1.4337809, 2.3554401], [5.3093286, 4.313568, 3.3250027]], dtype=numpy.float32) numpy.allclose(output.array, expect_output) def test_forward_zero_cpu(): x = numpy.zeros((2, 3), dtype=numpy.float32) output = shifted_softplus(x) expect_output = numpy.zeros((2, 3), dtype=numpy.float32) numpy.allclose(output.array, expect_output) def test_forward_avoid_overflow_cpu(): x = numpy.array([1e5], dtype=numpy.float32) output = shifted_softplus(x) expect_output = numpy.array([1e5], dtype=numpy.float32) numpy.allclose(output.array, expect_output) @pytest.mark.gpu def test_forward_gpu(): x = cuda.to_gpu(numpy.array([[1, 2, 3], [6, 5, 4]], dtype=numpy.float32)) output = shifted_softplus(x) expect_output = numpy.array([ [0.62011445, 1.4337809, 2.3554401], [5.3093286, 4.313568, 3.3250027]], dtype=numpy.float32) numpy.allclose(cuda.to_cpu(output.array), expect_output) @pytest.mark.gpu def test_forward_zero_gpu(): x = cuda.to_gpu(numpy.zeros((2, 3), dtype=numpy.float32)) output = shifted_softplus(x) expect_output = numpy.zeros((2, 3), dtype=numpy.float32) numpy.allclose(cuda.to_cpu(output.array), expect_output) @pytest.mark.gpu def test_forward_avoid_overflow_gpu(): x = cuda.to_gpu(numpy.array([1e5], dtype=numpy.float32)) output = shifted_softplus(x) expect_output = numpy.array([1e5], dtype=numpy.float32) numpy.allclose(cuda.to_cpu(output.array), expect_output) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/functions_tests/activation/test_softmax.py ================================================ import numpy import pytest from chainer import cuda from chainer_chemistry.functions.activation.softmax import softmax def 
test_forward_cpu(): x = numpy.array([[1, 2, 3], [6, 5, 4]], dtype=numpy.float32) output = softmax(x) expect_output = numpy.array([ [0.09003057, 0.24472848, 0.66524094], [0.66524094, 0.24472848, 0.09003057]], dtype=numpy.float32) numpy.allclose(output.array, expect_output) def test_forward_cpu_with_mask(): x = numpy.array([[1, 2, 3, 2, 5], [1, 6, 5, 4, 2]], dtype=numpy.float32) mask = numpy.array([[1, 1, 1, 0, 0], [0, 1, 1, 1, 0]], dtype=numpy.float32) output = softmax(x, mask=mask) expect_output = numpy.array([ [0.09003057, 0.24472848, 0.66524094, 0., 0.], [0., 0.66524094, 0.24472848, 0.09003057, 0.]], dtype=numpy.float32) numpy.allclose(output.array, expect_output) @pytest.mark.gpu def test_forward_gpu(): x = cuda.to_gpu(numpy.array([[1, 2, 3], [6, 5, 4]], dtype=numpy.float32)) output = softmax(x) expect_output = numpy.array([ [0.09003057, 0.24472848, 0.66524094], [0.66524094, 0.24472848, 0.09003057]], dtype=numpy.float32) numpy.allclose(cuda.to_cpu(output.array), expect_output) @pytest.mark.gpu def test_forward_gpu_with_mask(): x = numpy.array([[1, 2, 3, 2, 5], [1, 6, 5, 4, 2]], dtype=numpy.float32) mask = numpy.array([[1, 1, 1, 0, 0], [0, 1, 1, 1, 0]], dtype=numpy.float32) x, mask = map(cuda.to_gpu, (x, mask)) output = softmax(x, mask=mask) expect_output = numpy.array([ [0.09003057, 0.24472848, 0.66524094, 0., 0.], [0., 0.66524094, 0.24472848, 0.09003057, 0.]], dtype=numpy.float32) numpy.allclose(cuda.to_cpu(output.array), expect_output) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/functions_tests/evaluation/test_r2_score.py ================================================ import numpy import pytest from chainer import cuda import chainer_chemistry def r2_score(pred, true, sample_weight=None, multioutput="uniform_average", ignore_nan=False): pred = cuda.to_cpu(pred) true = cuda.to_cpu(true) diff = pred - true dev = true - numpy.mean(true, axis=0) if ignore_nan: diff[numpy.isnan(diff)] 
= 0. dev[numpy.isnan(dev)] = 0. SS_res = numpy.asarray( numpy.sum(diff ** 2, axis=0)) SS_tot = numpy.asarray( numpy.sum(dev ** 2, axis=0)) if multioutput == 'uniform_average': if numpy.any(SS_tot == 0): return 0.0 else: return (1 - SS_res / SS_tot).mean() elif multioutput == 'raw_values': if numpy.any(SS_tot == 0): # Assign dummy value to avoid zero-division SS_tot_iszero = SS_tot == 0 SS_tot[SS_tot_iszero] = 1 return numpy.where(SS_tot_iszero, 0.0, 1 - SS_res / SS_tot) else: return 1 - SS_res / SS_tot @pytest.fixture def inputs(): numpy.random.seed(0) x0 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32) # Add sufficient margin to prevent computational error diff = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32) diff[abs(diff) < 0.01] = 0.5 x1 = x0 + diff x2 = numpy.asarray([[0.3, numpy.nan, 0.2], [numpy.nan, 0.1, 0.5], [0.9, 0.7, numpy.nan], [0.2, -0.3, 0.4]]).astype(numpy.float32) return x0, x1, x2 def check_forward(inputs): x0, x1, _ = inputs y = chainer_chemistry.functions.r2_score(x0, x1) assert y.data.dtype == 'f' assert y.data.shape == () expect = r2_score(x0, x1) assert numpy.allclose(cuda.to_cpu(y.data), expect) def check_forward_ignore_nan(inputs): x0, _, x2 = inputs y = chainer_chemistry.functions.r2_score(x0, x2, ignore_nan=True) assert y.data.dtype == 'f' assert y.data.shape == () expect = r2_score(x0, x2, ignore_nan=True) assert numpy.allclose(cuda.to_cpu(y.data), expect) def check_forward_ignore_nan_with_nonnan_value(inputs): x0, x1, _ = inputs y = chainer_chemistry.functions.r2_score(x0, x1, ignore_nan=True) assert y.data.dtype == 'f' assert y.data.shape == () expect = r2_score(x0, x1, ignore_nan=True) assert numpy.allclose(y.data, expect) def test_forward_cpu(inputs): check_forward(inputs) check_forward_ignore_nan(inputs) check_forward_ignore_nan_with_nonnan_value(inputs) @pytest.mark.gpu def test_forward_gpu(inputs): x0, x1, x2 = inputs check_forward((cuda.to_gpu(x0), cuda.to_gpu(x1), None)) 
check_forward_ignore_nan((cuda.to_gpu(x0), None, cuda.to_gpu(x2))) if __name__ == '__main__': pytest.main([__file__, '-v']) ================================================ FILE: tests/functions_tests/loss/test_mean_absolute_error.py ================================================ import numpy import pytest import chainer from chainer import cuda from chainer import gradient_check import chainer_chemistry @pytest.fixture def inputs(): numpy.random.seed(0) x0 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32) # Add sufficient margin to prevent computational error diff = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32) diff[abs(diff) < 0.01] = 0.5 x1 = x0 + diff x2 = numpy.asarray([[0.3, numpy.nan, 0.2], [numpy.nan, 0.1, 0.5], [0.9, 0.7, numpy.nan], [0.2, -0.3, 0.4]]).astype(numpy.float32) return x0, x1, x2 @pytest.fixture def grads(): numpy.random.seed(0) gy = numpy.random.uniform(-1, 1, ()).astype(numpy.float32) ggx0 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32) ggx1 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32) return gy, ggx0, ggx1 def check_forward(inputs): x0_data, x1_data, _ = inputs x0 = chainer.Variable(x0_data) x1 = chainer.Variable(x1_data) loss = chainer_chemistry.functions.mean_absolute_error(x0, x1) loss_value = cuda.to_cpu(loss.data) assert loss.dtype == numpy.float32 assert loss_value.shape == () loss_expect = numpy.zeros(()) x0_data = cuda.to_cpu(x0_data) x1_data = cuda.to_cpu(x1_data) for i in numpy.ndindex(x0_data.shape): loss_expect += abs((x0_data[i] - x1_data[i])) loss_expect /= x0_data.size assert numpy.allclose(loss_value, loss_expect) def check_forward_ignore_nan(inputs): x0_data, _, x2_data = inputs x0 = chainer.Variable(x0_data) x2 = chainer.Variable(x2_data) loss = chainer_chemistry.functions.mean_absolute_error(x0, x2, ignore_nan=True) loss_value = cuda.to_cpu(loss.data) assert loss.dtype == numpy.float32 assert loss_value.shape == () loss_expect = numpy.zeros(()) x0_data = 
cuda.to_cpu(x0_data) x2_data = cuda.to_cpu(x2_data) nan_mask = numpy.invert(numpy.isnan(x2_data)).astype(x2_data.dtype) for i in numpy.ndindex(x0_data.shape): loss_expect += abs(x0_data[i] - numpy.nan_to_num(x2_data[i])) * nan_mask[i] loss_expect /= x0_data.size assert numpy.allclose(loss_value, loss_expect) def check_forward_ignore_nan_with_nonnan_value(inputs): x0_data, x1_data, _ = inputs x0 = chainer.Variable(x0_data) x1 = chainer.Variable(x1_data) loss = chainer_chemistry.functions.mean_absolute_error(x0, x1, ignore_nan=True) loss_value = cuda.to_cpu(loss.data) assert loss.dtype == numpy.float32 assert loss_value.shape == () loss_expect = numpy.zeros(()) x0_data = cuda.to_cpu(x0_data) x1_data = cuda.to_cpu(x1_data) nan_mask = numpy.invert(numpy.isnan(x1_data)).astype(x1_data.dtype) for i in numpy.ndindex(x0_data.shape): loss_expect += abs(x0_data[i] - numpy.nan_to_num(x1_data[i])) * nan_mask[i] loss_expect /= x0_data.size assert numpy.allclose(loss_value, loss_expect) def test_forward_cpu(inputs): check_forward(inputs) check_forward_ignore_nan(inputs) check_forward_ignore_nan_with_nonnan_value(inputs) @pytest.mark.gpu def test_forward_gpu(inputs): x0, x1, x2 = inputs check_forward((cuda.to_gpu(x0), cuda.to_gpu(x1), None)) check_forward_ignore_nan((cuda.to_gpu(x0), None, cuda.to_gpu(x2))) def check_backward(inputs): x0_data, x1_data, _ = inputs gradient_check.check_backward( chainer_chemistry.functions.mean_absolute_error, (x0_data, x1_data), None, eps=1e-2) def check_backward_ignore_nan(inputs): x0_data, _, x2_data = inputs def func(x0, x1): return chainer_chemistry.functions.mean_absolute_error(x0, x1, ignore_nan=True) gradient_check.check_backward(func, (x0_data, x2_data), None, eps=1e-2, atol=1e-3, rtol=1e-3) def check_backward_ignore_nan_with_nonnan_value(inputs): x0_data, x1_data, _ = inputs def func(x0, x1): return chainer_chemistry.functions.mean_absolute_error(x0, x1, ignore_nan=True) gradient_check.check_backward(func, (x0_data, x1_data), None, 
eps=1e-2) def test_backward_cpu(inputs): check_backward(inputs) check_backward_ignore_nan(inputs) check_backward_ignore_nan_with_nonnan_value(inputs) @pytest.mark.gpu def test_backward_gpu(inputs): x0, x1, x2 = inputs check_backward((cuda.to_gpu(x0), cuda.to_gpu(x1), None)) check_backward_ignore_nan((cuda.to_gpu(x0), None, cuda.to_gpu(x2))) check_backward_ignore_nan_with_nonnan_value((cuda.to_gpu(x0), cuda.to_gpu(x1), None)) def check_double_backward(inputs, grads): x0, x1, _ = inputs gy, ggx0, ggx1 = grads def func(*xs): y = chainer_chemistry.functions.mean_absolute_error(*xs) return y * y gradient_check.check_double_backward(func, (x0, x1), gy, (ggx0, ggx1)) def check_double_backward_ignore_nan(inputs, grads): x0, _, x2 = inputs gy, ggx0, ggx1 = grads def func(*xs): y = chainer_chemistry.functions.mean_absolute_error(*xs, ignore_nan=True) return y * y gradient_check.check_double_backward(func, (x0, x2), gy, (ggx0, ggx1)) def check_double_backward_ignore_nan_with_nonnan_value(inputs, grads): x0, x1, _ = inputs gy, ggx0, ggx1 = grads def func(*xs): y = chainer_chemistry.functions.mean_absolute_error(*xs, ignore_nan=True) return y * y gradient_check.check_double_backward(func, (x0, x1), gy, (ggx0, ggx1)) def test_double_backward_cpu(inputs, grads): check_double_backward(inputs, grads) check_double_backward_ignore_nan(inputs, grads) check_double_backward_ignore_nan_with_nonnan_value(inputs, grads) @pytest.mark.gpu def test_double_backward_gpu(inputs, grads): x0, x1, x2 = inputs gy, ggx0, ggx1 = grads check_double_backward((cuda.to_gpu(x0), cuda.to_gpu(x1), None), (cuda.to_gpu(gy), cuda.to_gpu(ggx0), cuda.to_gpu(ggx1))) check_double_backward_ignore_nan_with_nonnan_value((cuda.to_gpu(x0), cuda.to_gpu(x1), None), (cuda.to_gpu(gy), cuda.to_gpu(ggx0), cuda.to_gpu(ggx1))) if __name__ == '__main__': pytest.main([__file__, '-v']) ================================================ FILE: tests/functions_tests/loss/test_mean_squared_error.py 
================================================ import numpy import pytest import chainer from chainer import cuda from chainer import gradient_check import chainer_chemistry @pytest.fixture def inputs(): numpy.random.seed(0) x0 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32) x1 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32) x2 = numpy.asarray([[0.3, numpy.nan, 0.2], [numpy.nan, 0.1, 0.5], [0.9, 0.7, numpy.nan], [0.2, -0.3, 0.4]]).astype(numpy.float32) return x0, x1, x2 @pytest.fixture def grads(): numpy.random.seed(0) gy = numpy.random.uniform(-1, 1, ()).astype(numpy.float32) ggx0 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32) ggx1 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32) return gy, ggx0, ggx1 def check_forward(inputs): x0_data, x1_data, _ = inputs x0 = chainer.Variable(x0_data) x1 = chainer.Variable(x1_data) loss = chainer_chemistry.functions.mean_squared_error(x0, x1) loss_value = cuda.to_cpu(loss.data) assert loss.dtype == numpy.float32 assert loss_value.shape == () loss_expect = numpy.zeros(()) x0_data = cuda.to_cpu(x0_data) x1_data = cuda.to_cpu(x1_data) for i in numpy.ndindex(x0_data.shape): loss_expect += ((x0_data[i] - x1_data[i]) ** 2) loss_expect /= x0_data.size assert numpy.allclose(loss_value, loss_expect) def check_forward_ignore_nan(inputs): x0_data, _, x2_data = inputs x0 = chainer.Variable(x0_data) x2 = chainer.Variable(x2_data) loss = chainer_chemistry.functions.mean_squared_error(x0, x2, ignore_nan=True) loss_value = cuda.to_cpu(loss.data) assert loss.dtype == numpy.float32 assert loss_value.shape == () loss_expect = numpy.zeros(()) x0_data = cuda.to_cpu(x0_data) x2_data = cuda.to_cpu(x2_data) nan_mask = numpy.invert(numpy.isnan(x2_data)).astype(x2_data.dtype) for i in numpy.ndindex(x0_data.shape): loss_expect += ((x0_data[i] - numpy.nan_to_num(x2_data[i])) ** 2 * nan_mask[i]) loss_expect /= x0_data.size assert numpy.allclose(loss_value, loss_expect) def 
check_forward_ignore_nan_with_nonnan_value(inputs): x0_data, x1_data, _ = inputs x0 = chainer.Variable(x0_data) x1 = chainer.Variable(x1_data) loss = chainer_chemistry.functions.mean_squared_error(x0, x1, ignore_nan=True) loss_value = cuda.to_cpu(loss.data) assert loss.dtype == numpy.float32 assert loss_value.shape == () loss_expect = numpy.zeros(()) x0_data = cuda.to_cpu(x0_data) x1_data = cuda.to_cpu(x1_data) nan_mask = numpy.invert(numpy.isnan(x1_data)).astype(x1_data.dtype) for i in numpy.ndindex(x0_data.shape): loss_expect += ((x0_data[i] - numpy.nan_to_num(x1_data[i])) ** 2 * nan_mask[i]) loss_expect /= x0_data.size assert numpy.allclose(loss_value, loss_expect) def test_forward_cpu(inputs): check_forward(inputs) check_forward_ignore_nan(inputs) check_forward_ignore_nan_with_nonnan_value(inputs) @pytest.mark.gpu def test_forward_gpu(inputs): x0, x1, x2 = inputs check_forward((cuda.to_gpu(x0), cuda.to_gpu(x1), None)) check_forward_ignore_nan((cuda.to_gpu(x0), None, cuda.to_gpu(x2))) def check_backward(inputs): x0_data, x1_data, _ = inputs gradient_check.check_backward( chainer_chemistry.functions.mean_squared_error, (x0_data, x1_data), None, eps=1e-2) def check_backward_ignore_nan(inputs): x0_data, _, x2_data = inputs def func(x0, x1): return chainer_chemistry.functions.mean_squared_error(x0, x1, ignore_nan=True) gradient_check.check_backward(func, (x0_data, x2_data), None, eps=1e-2) def check_backward_ignore_nan_with_nonnan_value(inputs): x0_data, x1_data, _ = inputs def func(x0, x1): return chainer_chemistry.functions.mean_squared_error(x0, x1, ignore_nan=True) gradient_check.check_backward(func, (x0_data, x1_data), None, eps=1e-2) def test_backward_cpu(inputs): check_backward(inputs) check_backward_ignore_nan(inputs) check_backward_ignore_nan_with_nonnan_value(inputs) @pytest.mark.gpu def test_backward_gpu(inputs): x0, x1, x2 = inputs check_backward((cuda.to_gpu(x0), cuda.to_gpu(x1), None)) check_backward_ignore_nan((cuda.to_gpu(x0), None, cuda.to_gpu(x2))) 
check_backward_ignore_nan_with_nonnan_value((cuda.to_gpu(x0), cuda.to_gpu(x1), None)) def check_double_backward(inputs, grads): x0, x1, _ = inputs gy, ggx0, ggx1 = grads gradient_check.check_double_backward( chainer_chemistry.functions.mean_squared_error, (x0, x1), gy, (ggx0, ggx1)) def check_double_backward_ignore_nan(inputs, grads): x0, _, x2 = inputs gy, ggx0, ggx1 = grads def func(x0, x1): return chainer_chemistry.functions.mean_squared_error(x0, x1, ignore_nan=True) gradient_check.check_double_backward(func, (x0, x2), gy, (ggx0, ggx1)) def check_double_backward_ignore_nan_with_nonnan_value(inputs, grads): x0, x1, _ = inputs gy, ggx0, ggx1 = grads def func(x0, x1): return chainer_chemistry.functions.mean_squared_error(x0, x1, ignore_nan=True) gradient_check.check_double_backward(func, (x0, x1), gy, (ggx0, ggx1)) def test_double_backward_cpu(inputs, grads): check_double_backward(inputs, grads) check_double_backward_ignore_nan(inputs, grads) check_double_backward_ignore_nan_with_nonnan_value(inputs, grads) @pytest.mark.gpu def test_double_backward_gpu(inputs, grads): x0, x1, x2 = inputs gy, ggx0, ggx1 = grads check_double_backward((cuda.to_gpu(x0), cuda.to_gpu(x1), None), (cuda.to_gpu(gy), cuda.to_gpu(ggx0), cuda.to_gpu(ggx1))) check_double_backward_ignore_nan((cuda.to_gpu(x0), None, cuda.to_gpu(x2)), (cuda.to_gpu(gy), cuda.to_gpu(ggx0), cuda.to_gpu(ggx1))) check_double_backward_ignore_nan_with_nonnan_value((cuda.to_gpu(x0), cuda.to_gpu(x1), None), (cuda.to_gpu(gy), cuda.to_gpu(ggx0), cuda.to_gpu(ggx1))) if __name__ == '__main__': pytest.main([__file__, '-v']) ================================================ FILE: tests/iterators_tests/test_balanced_serial_iterator.py ================================================ import numpy import pytest from chainer import serializer from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset from chainer_chemistry.iterators.balanced_serial_iterator import BalancedSerialIterator # NOQA class 
DummySerializer(serializer.Serializer): def __init__(self, target): super(DummySerializer, self).__init__() self.target = target def __getitem__(self, key): target_child = dict() self.target[key] = target_child return DummySerializer(target_child) def __call__(self, key, value): self.target[key] = value return self.target[key] class DummyDeserializer(serializer.Deserializer): def __init__(self, target): super(DummyDeserializer, self).__init__() self.target = target def __getitem__(self, key): target_child = self.target[key] return DummyDeserializer(target_child) def __call__(self, key, value): if value is None: value = self.target[key] elif isinstance(value, numpy.ndarray): numpy.copyto(value, self.target[key]) else: value = type(value)(numpy.asarray(self.target[key])) return value def test_balanced_serial_iterator(): _test_balanced_serial_iterator_no_batch_balancing() _test_balanced_serial_iterator_with_batch_balancing() def _test_balanced_serial_iterator_no_batch_balancing(): x = numpy.arange(8) t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1]) iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=9, labels=t, ignore_labels=-1, batch_balancing=False) # In this case, we have 3 examples of label=1. # When BalancedSerialIterator runs, all label examples are sampled 3 times # in one epoch. 
# Therefore, number of data is "augmented" as 9 # 3 (number of label types) * 3 (number of maximum examples in one label) expect_N_augmented = 9 assert iterator.N_augmented == expect_N_augmented # iterator.show_label_stats() # we can show label stats batch = iterator.next() assert len(batch) == 9 labels_batch = numpy.array([example[-1] for example in batch]) assert numpy.sum(labels_batch == 0) == 3 assert numpy.sum(labels_batch == 1) == 3 assert numpy.sum(labels_batch == 2) == 3 def _test_balanced_serial_iterator_with_batch_balancing(): x = numpy.arange(8) t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1]) iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=3, labels=t, ignore_labels=-1, batch_balancing=True) expect_N_augmented = 9 assert iterator.N_augmented == expect_N_augmented batch1 = iterator.next() batch2 = iterator.next() batch3 = iterator.next() for batch in [batch1, batch2, batch3]: assert len(batch) == 3 labels_batch = numpy.array([example[-1] for example in batch]) assert numpy.sum(labels_batch == 0) == 1 assert numpy.sum(labels_batch == 1) == 1 assert numpy.sum(labels_batch == 2) == 1 def test_balanced_serial_iterator_serialization(): _test_balanced_serial_iterator_serialization_no_batch_balancing() _test_balanced_serial_iterator_serialization_with_batch_balancing() def _test_balanced_serial_iterator_serialization_no_batch_balancing(): x = numpy.arange(8) t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1]) iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=9, labels=t, ignore_labels=-1, batch_balancing=False) batch = iterator.next() # NOQA assert iterator.current_position == 0 assert iterator.epoch == 1 assert iterator.is_new_epoch target = dict() iterator.serialize(DummySerializer(target)) current_index_list_orig = dict() current_pos_orig = dict() for label, index_iterator in iterator.labels_iterator_dict.items(): ii_label = 'index_iterator_{}'.format(label) current_index_list_orig[ii_label] = 
index_iterator.current_index_list current_pos_orig[ii_label] = index_iterator.current_pos iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=9, labels=t, ignore_labels=-1, batch_balancing=False) iterator.serialize(DummyDeserializer(target)) assert iterator.current_position == 0 assert iterator.epoch == 1 assert iterator.is_new_epoch for label, index_iterator in iterator.labels_iterator_dict.items(): ii_label = 'index_iterator_{}'.format(label) assert numpy.array_equal(index_iterator.current_index_list, current_index_list_orig[ii_label]) assert index_iterator.current_pos == current_pos_orig[ii_label] def _test_balanced_serial_iterator_serialization_with_batch_balancing(): x = numpy.arange(8) t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1]) iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=3, labels=t, ignore_labels=-1, batch_balancing=True) batch1 = iterator.next() # NOQA batch2 = iterator.next() # NOQA batch3 = iterator.next() # NOQA assert iterator.current_position == 0 assert iterator.epoch == 1 assert iterator.is_new_epoch target = dict() iterator.serialize(DummySerializer(target)) current_index_list_orig = dict() current_pos_orig = dict() for label, index_iterator in iterator.labels_iterator_dict.items(): ii_label = 'index_iterator_{}'.format(label) current_index_list_orig[ii_label] = index_iterator.current_index_list current_pos_orig[ii_label] = index_iterator.current_pos iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=3, labels=t, ignore_labels=-1, batch_balancing=True) iterator.serialize(DummyDeserializer(target)) assert iterator.current_position == 0 assert iterator.epoch == 1 assert iterator.is_new_epoch for label, index_iterator in iterator.labels_iterator_dict.items(): ii_label = 'index_iterator_{}'.format(label) assert numpy.array_equal(index_iterator.current_index_list, current_index_list_orig[ii_label]) assert index_iterator.current_pos == current_pos_orig[ii_label] if __name__ == '__main__': 
pytest.main([__file__, '-s', '-v']) ================================================ FILE: tests/iterators_tests/test_index_iterator.py ================================================ import numpy import pytest from chainer import serializer from chainer_chemistry.iterators.index_iterator import IndexIterator class DummySerializer(serializer.Serializer): def __init__(self, target): super(DummySerializer, self).__init__() self.target = target def __getitem__(self, key): target_child = dict() self.target[key] = target_child return DummySerializer(target_child) def __call__(self, key, value): self.target[key] = value return self.target[key] class DummyDeserializer(serializer.Deserializer): def __init__(self, target): super(DummyDeserializer, self).__init__() self.target = target def __getitem__(self, key): target_child = self.target[key] return DummyDeserializer(target_child) def __call__(self, key, value): if value is None: value = self.target[key] elif isinstance(value, numpy.ndarray): numpy.copyto(value, self.target[key]) else: value = type(value)(numpy.asarray(self.target[key])) return value def test_index_iterator(): _test_index_iterator_no_shuffle() _test_index_iterator_with_shuffle() def _test_index_iterator_no_shuffle(): index_list = [1, 3, 5, 10] ii = IndexIterator(index_list, shuffle=False, num=2) indices1 = ii.get_next_indices(3) indices2 = ii.get_next_indices(6) indices3 = ii.__next__() assert isinstance(indices1, numpy.ndarray) assert len(indices1) == 3 assert isinstance(indices2, numpy.ndarray) assert len(indices2) == 6 assert isinstance(indices3, numpy.ndarray) assert len(indices3) == 2 assert indices1[0] == index_list[0] assert indices1[1] == index_list[1] assert indices1[2] == index_list[2] assert indices2[0] == index_list[3] assert indices2[1] == index_list[0] assert indices2[2] == index_list[1] assert indices2[3] == index_list[2] assert indices2[4] == index_list[3] assert indices2[5] == index_list[0] assert indices3[0] == index_list[1] assert 
indices3[1] == index_list[2] def _test_index_iterator_with_shuffle(): index_list = [1, 3, 5, 10] ii = IndexIterator(index_list, shuffle=True, num=2) indices1 = ii.get_next_indices(3) indices2 = ii.get_next_indices(6) indices3 = ii.__next__() assert isinstance(indices1, numpy.ndarray) assert len(indices1) == 3 assert isinstance(indices2, numpy.ndarray) assert len(indices2) == 6 assert isinstance(indices3, numpy.ndarray) assert len(indices3) == 2 for indices in [indices1, indices2, indices3]: for index in indices: assert index in index_list def test_index_iterator_serialization(): _test_index_iterator_serialization_no_shuffle() _test_index_iterator_serialization_with_shuffle() def _test_index_iterator_serialization_no_shuffle(): index_list = [1, 3, 5, 10] ii = IndexIterator(index_list, shuffle=False, num=2) indices1 = ii.get_next_indices(3) # NOQA indices2 = ii.get_next_indices(6) # NOQA indices3 = ii.__next__() # NOQA assert len(ii.current_index_list) == len(index_list) assert numpy.array_equal(ii.current_index_list, numpy.asarray(index_list)) assert ii.current_pos == (3 + 6) % len(index_list) + 2 target = dict() ii.serialize(DummySerializer(target)) ii = IndexIterator(index_list, shuffle=False, num=2) ii.serialize(DummyDeserializer(target)) assert len(ii.current_index_list) == len(index_list) assert numpy.array_equal(ii.current_index_list, numpy.asarray(index_list)) assert ii.current_pos == (3 + 6) % len(index_list) + 2 def _test_index_iterator_serialization_with_shuffle(): index_list = [1, 3, 5, 10] ii = IndexIterator(index_list, shuffle=True, num=2) indices1 = ii.get_next_indices(3) # NOQA indices2 = ii.get_next_indices(6) # NOQA indices3 = ii.__next__() # NOQA assert len(ii.current_index_list) == len(index_list) for index in ii.current_index_list: assert index in index_list assert ii.current_pos == (3 + 6) % len(index_list) + 2 target = dict() ii.serialize(DummySerializer(target)) current_index_list_orig = ii.current_index_list ii = IndexIterator(index_list, 
shuffle=True, num=2) ii.serialize(DummyDeserializer(target)) assert numpy.array_equal(ii.current_index_list, current_index_list_orig) assert ii.current_pos == (3 + 6) % len(index_list) + 2 if __name__ == '__main__': pytest.main([__file__, '-s', '-v']) ================================================ FILE: tests/link_hooks_tests/test_variable_monitor_link_hook.py ================================================ import numpy import pytest import chainer from chainer import Variable, cuda # NOQA from chainer.links import Linear from chainer_chemistry.link_hooks import is_link_hooks_available if is_link_hooks_available: from chainer_chemistry.link_hooks import VariableMonitorLinkHook class DummyModel(chainer.Chain): def __init__(self): super(DummyModel, self).__init__() with self.init_scope(): self.l1 = Linear( 3, 1, initialW=numpy.array([[1, 3, 2]]), nobias=True) self.h = None def forward(self, x): self.h = self.l1(x) out = self.h * 3 return out @pytest.fixture def model(): return DummyModel() @pytest.mark.skipif(not is_link_hooks_available, reason='Link Hook is not available') def test_variable_monitor_link_hook_pre(model): x = numpy.array([[1, 5, 8]], dtype=numpy.float32) x = Variable(x) pre_hook = VariableMonitorLinkHook(target_link=model.l1, timing='pre') with pre_hook: model(x) var = pre_hook.get_variable() assert var is x @pytest.mark.skipif(not is_link_hooks_available, reason='Link Hook is not available') def test_variable_monitor_link_hook_post(model): x = numpy.array([[1, 5, 8]], dtype=numpy.float32) x = Variable(x) pre_hook = VariableMonitorLinkHook(target_link=model.l1, timing='post') with pre_hook: model(x) var = pre_hook.get_variable() assert var is model.h @pytest.mark.skipif(not is_link_hooks_available, reason='Link Hook is not available') def test_variable_monitor_link_hook_process(model): x = numpy.array([[1, 5, 8]], dtype=numpy.float32) x = Variable(x) pre_hook = VariableMonitorLinkHook(target_link=model.l1, timing='post') # Add process def 
_process_zeros(hook, args, target_var): xp = cuda.get_array_module(target_var.array) target_var.array = xp.zeros(target_var.array.shape) pre_hook.add_process('_process_zeros', _process_zeros) with pre_hook: model(x) assert numpy.allclose(model.h.array, numpy.zeros(model.h.shape)) assert '_process_zeros' in pre_hook.process_fns.keys() # Delete process pre_hook.delete_process('_process_zeros') assert '_process_zeros' not in pre_hook.process_fns.keys() @pytest.mark.skipif(not is_link_hooks_available, reason='Link Hook is not available') def test_variable_monitor_link_hook_assert_raises(model): with pytest.raises(TypeError): # target_link must be chainer.Link pre_hook = VariableMonitorLinkHook(target_link='hoge') with pytest.raises(ValueError): # check timing args pre_hook = VariableMonitorLinkHook(target_link=model.l1, timing='hoge') # NOQA hook = VariableMonitorLinkHook(target_link=model.l1) def _process(hook, args, target_var): pass with pytest.raises(TypeError): # key is wrong hook.add_process(1, _process) with pytest.raises(TypeError): # fn is wrong hook.add_process('hoge', 'var') hook.add_process('hoge', _process) with pytest.raises(TypeError): # key is wrong hook.delete_process(1) hook.delete_process('hoge') if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/array_tests/test_shape_transformer_to_2d.py ================================================ import numpy import pytest from chainer_chemistry.links.array.shape_transformer_to_2d import ShapeTransformerTo2D # NOQA @pytest.mark.parametrize('axis', [0, 1, -1]) def test_shape_transformer_2d_2d_array(axis): st = ShapeTransformerTo2D(axis=axis) x = numpy.arange(6).reshape((2, 3)) xt = st.transform(x) xit = st.inverse_transform(xt) if axis == 0: assert numpy.allclose(xt.array, numpy.array([[0, 3], [1, 4], [2, 5]])) elif axis == 1 or axis == -1: assert numpy.allclose(x, xt.array) assert numpy.allclose(x, xit.array) 
@pytest.mark.parametrize('axis', [0, 1, 2, -1]) def test_shape_transformer_2d_3d_array(axis): st = ShapeTransformerTo2D(axis=axis) x = numpy.arange(12).reshape((2, 3, 2)) xt = st.transform(x) xit = st.inverse_transform(xt) if axis == 0: assert numpy.allclose( xt.array, numpy.array([[0, 6], [1, 7], [2, 8], [3, 9], [4, 10], [5, 11]])) elif axis == 1: assert numpy.allclose( xt.array, numpy.array([[0, 2, 4], [1, 3, 5], [6, 8, 10], [7, 9, 11]])) elif axis == 2 or axis == -1: assert numpy.allclose( xt.array, x.reshape(6, 2)) assert numpy.allclose(x, xit.array) def test_shape_transformer_2d_error(): st = ShapeTransformerTo2D(axis=1) x = numpy.arange(6).reshape(2, 3) with pytest.raises(AttributeError): # call `inverse_transform` before `transform` xt = st.inverse_transform(x) # NOQA if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/connection_tests/test_embed_atom_id.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry import links in_size = 3 atom_size = 5 out_size = 4 batch_size = 2 @pytest.fixture def model(): l = links.EmbedAtomID(in_size=in_size, out_size=out_size) l.cleargrads() return l @pytest.fixture def data(): x_data = numpy.random.randint( in_size, size=(batch_size, atom_size)).astype(numpy.int32) y_grad = numpy.random.uniform( -1, 1, (batch_size, atom_size, out_size)).astype(numpy.float32) return x_data, y_grad def check_forward(model, x_data): def forward(W, x): y = W[x] return y y_expect = forward(cuda.to_cpu(model.W.data), cuda.to_cpu(x_data)) y_actual = cuda.to_cpu(model(x_data).data) numpy.testing.assert_equal(y_actual, y_expect) def test_forward_cpu(model, data): x_data = data[0] check_forward(model, x_data) @pytest.mark.gpu def test_forward_gpu(model, data): x_data = cuda.to_gpu(data[0]) model.to_gpu() check_forward(model, x_data) def test_backward_cpu(model, 
data): x_data, y_grad = data gradient_check.check_backward(model, x_data, y_grad, model.W, atol=1e-3, rtol=1e-3) @pytest.mark.gpu def test_backward_gpu(model, data): x_data, y_grad = [cuda.to_gpu(d) for d in data] model.to_gpu() gradient_check.check_backward(model, x_data, y_grad, model.W, atol=1e-3, rtol=1e-3) if __name__ == '__main__': pytest.main([__file__, '-v']) ================================================ FILE: tests/links_tests/connection_tests/test_graph_linear.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.links.connection.graph_linear import GraphLinear # NOQA in_size = 3 atom_size = 5 out_size = 4 batch_size = 2 @pytest.fixture def model(): l = GraphLinear(in_size=in_size, out_size=out_size) l.cleargrads() return l @pytest.fixture def data(): x_data = numpy.random.uniform( -1, 1, (batch_size, atom_size, in_size)).astype(numpy.float32) y_grad = numpy.random.uniform( -1, 1, (batch_size, atom_size, out_size)).astype(numpy.float32) return x_data, y_grad def test_forward_cpu(model, data): # only testing shape for now... 
x_data = data[0] y_actual = model(x_data) assert y_actual.shape == (batch_size, atom_size, out_size) @pytest.mark.gpu def test_forward_gpu(model, data): x_data = cuda.to_gpu(data[0]) model.to_gpu() y_actual = model(x_data) assert y_actual.shape == (batch_size, atom_size, out_size) def test_backward_cpu(model, data): x_data, y_grad = data gradient_check.check_backward(model, x_data, y_grad, list(model.params()), atol=1e-3, rtol=1e-3) @pytest.mark.gpu def test_backward_gpu(model, data): x_data, y_grad = [cuda.to_gpu(d) for d in data] model.to_gpu() gradient_check.check_backward(model, x_data, y_grad, list(model.params()), atol=1e-3, rtol=1e-3) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/connection_tests/test_graph_mlp.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.links.connection.graph_mlp import GraphMLP # NOQA in_size = 3 atom_size = 5 out_size = 4 channels = [16, out_size] batch_size = 2 @pytest.fixture def model(): l = GraphMLP(channels, in_channels=in_size) l.cleargrads() return l @pytest.fixture def data(): x_data = numpy.random.uniform( -1, 1, (batch_size, atom_size, in_size)).astype(numpy.float32) y_grad = numpy.random.uniform( -1, 1, (batch_size, atom_size, out_size)).astype(numpy.float32) return x_data, y_grad def test_forward_cpu(model, data): # only testing shape for now... 
x_data = data[0] y_actual = model(x_data) assert y_actual.shape == (batch_size, atom_size, out_size) assert len(model.layers) == len(channels) @pytest.mark.gpu def test_forward_gpu(model, data): x_data = cuda.to_gpu(data[0]) model.to_gpu() y_actual = model(x_data) assert y_actual.shape == (batch_size, atom_size, out_size) assert len(model.layers) == len(channels) def test_backward_cpu(model, data): x_data, y_grad = data gradient_check.check_backward(model, x_data, y_grad, list(model.params()), atol=1e-3, rtol=1e-3) @pytest.mark.gpu def test_backward_gpu(model, data): x_data, y_grad = [cuda.to_gpu(d) for d in data] model.to_gpu() gradient_check.check_backward(model, x_data, y_grad, list(model.params()), atol=1e-3, rtol=1e-3) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/readout_tests/test_cgcnn_readout.py ================================================ import numpy import pytest from chainer import cuda from chainer_chemistry.links.readout.cgcnn_readout import CGCNNReadout # node_size_list means the first moleculae has three nodes, # and the seconde molecule has five nodes node_size_list = [3, 5] node_feature_dim = 32 out_dim = 4 batch_size = 2 @pytest.fixture def readout(): return CGCNNReadout(out_dim=out_dim) @pytest.fixture def data(): if len(node_size_list) != batch_size: raise ValueError("Invalid fixture data for CGCNN") numpy.random.seed(0) total_node_size = sum(node_size_list) # atom_feat atom_feat = numpy.random.rand( total_node_size, node_feature_dim).astype(numpy.float32) # atom_idx curr_idx = 0 atom_idx = [] for val in node_size_list: atom_idx.append(numpy.arange(curr_idx, val)) curr_idx += val atom_idx = numpy.asarray(atom_idx) y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype(numpy.float32) return atom_feat, atom_idx, y_grad def check_forward(readout, data): y_actual = cuda.to_cpu(readout(*data).data) assert y_actual.shape == (batch_size, out_dim) 
def test_forward_cpu(readout, data): atom_feat, atom_idx = data[:-1] check_forward(readout, (atom_feat, atom_idx)) @pytest.mark.gpu def test_forward_gpu(readout, data): atom_feat, atom_idx, _ = data # atom_idx is list format... use numpy array input_data = (cuda.to_gpu(atom_feat), atom_idx) readout.to_gpu() check_forward(readout, tuple(input_data)) # def test_backward_cpu(readout, data): # input_data, y_grad = data[0:-1], data[-1] # gradient_check.check_backward(readout, tuple(input_data), y_grad, # atol=5e-1, rtol=1e-1) # @pytest.mark.gpu # def test_backward_gpu(readout, data): # data = [cuda.to_gpu(d) for d in data] # input_data, y_grad = data[0:-1], data[-1] # readout.to_gpu() # gradient_check.check_backward(readout, tuple(input_data), y_grad, # atol=5e-1, rtol=1e-1) if __name__ == '__main__': pytest.main([__file__, '-v']) ================================================ FILE: tests/links_tests/readout_tests/test_general_readout.py ================================================ from chainer import cuda from chainer import functions from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.readout.general_readout import GeneralReadout from chainer_chemistry.utils.permutation import permute_node atom_size = 5 hidden_dim = 7 batch_size = 2 @pytest.fixture def readouts(): modes = ['sum', 'max', 'summax'] return (GeneralReadout(mode=mode) for mode in modes) @pytest.fixture def data(): numpy.random.seed(0) atom_data = numpy.random.uniform( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size, hidden_dim) ).astype('f') y_grad = numpy.random.uniform(-1, 1, (batch_size, hidden_dim)).astype('f') return atom_data, y_grad def check_forward(readout, atom_data): y_actual = cuda.to_cpu(readout(atom_data).data) if readout.mode == ('sum' and 'max'): assert y_actual.shape == (batch_size, hidden_dim) elif readout.mode == 'summax': assert y_actual.shape == (batch_size, hidden_dim * 2) def 
test_forward_cpu(readouts, data): atom_data = data[0] for readout in readouts: check_forward(readout, atom_data) @pytest.mark.gpu def test_forward_gpu(readouts, data): atom_data = cuda.to_gpu(data[0]) for readout in readouts: readout.to_gpu() check_forward(readout, atom_data) def test_forward_cpu_assert_raises(data): atom_data = data[0] readout = GeneralReadout(mode='invalid') with pytest.raises(ValueError): cuda.to_cpu(readout(atom_data).data) def test_backward_cpu(readouts, data): atom_data, y_grad = data for readout in readouts: if readout.mode == 'summax': y_grad = functions.concat((y_grad, y_grad), axis=1).data gradient_check.check_backward( readout, atom_data, y_grad, atol=1e-2, rtol=1e-2) @pytest.mark.gpu def test_backward_gpu(readouts, data): atom_data, y_grad = map(cuda.to_gpu, data) for readout in readouts: readout.to_gpu() if readout.mode == 'summax': y_grad = functions.concat((y_grad, y_grad), axis=1).data # TODO(nakago): check why tolerance is so high. gradient_check.check_backward( readout, atom_data, y_grad, atol=1e-1, rtol=1e-1) def test_forward_cpu_graph_invariant(readouts, data): atom_data = data[0] permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) for readout in readouts: y_actual = cuda.to_cpu(readout(atom_data).data) permute_y_actual = cuda.to_cpu(readout(permute_atom_data).data) numpy.testing.assert_allclose( y_actual, permute_y_actual, rtol=1e-5, atol=1e-5) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/readout_tests/test_ggnn_readout.py ================================================ from chainer import cuda from chainer import functions from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.readout.ggnn_readout import GGNNReadout from chainer_chemistry.utils.permutation import permute_node 
atom_size = 5 in_channels = 7 out_dim = 4 batch_size = 2 @pytest.fixture def readout(): return GGNNReadout(out_dim=out_dim, in_channels=None) @pytest.fixture def data(): numpy.random.seed(0) atom_data = numpy.random.uniform( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size, in_channels) ).astype('f') atom_data0 = functions.copy( atom_data, cuda.get_device_from_array(atom_data.data).id).data y_grad = numpy.random.uniform( -1, 1, (batch_size, out_dim)).astype('f') return atom_data, atom_data0, y_grad def check_forward(readout, atom_data, atom_data0): y_actual = cuda.to_cpu(readout(atom_data, atom_data0).data) assert y_actual.shape == (batch_size, out_dim) def test_forward_cpu(readout, data): atom_data, atom_data0 = data[:2] check_forward(readout, atom_data, atom_data0) @pytest.mark.gpu def test_forward_gpu(readout, data): atom_data, atom_data0 = cuda.to_gpu(data[0]), cuda.to_gpu(data[1]) readout.to_gpu() check_forward(readout, atom_data, atom_data0) def test_backward_cpu(readout, data): atom_data, atom_data0, y_grad = data gradient_check.check_backward( readout, (atom_data, atom_data0), y_grad, atol=1e-1, rtol=1e-1) @pytest.mark.gpu def test_backward_gpu(readout, data): atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data] readout.to_gpu() gradient_check.check_backward(readout, (atom_data, adj_data), y_grad, atol=1e-1, rtol=1e-1) def test_forward_cpu_graph_invariant(readout, data): atom_data, atom_data0 = data[:2] y_actual = cuda.to_cpu(readout(atom_data, atom_data0).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) permute_atom_data0 = permute_node(atom_data0, permutation_index, axis=1) permute_y_actual = cuda.to_cpu(readout( permute_atom_data, permute_atom_data0).data) numpy.testing.assert_allclose( y_actual, permute_y_actual, rtol=1e-5, atol=1e-5) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: 
tests/links_tests/readout_tests/test_megnet_readout.py ================================================

from chainer import cuda
from chainer import gradient_check
import numpy
import pytest

from chainer_chemistry.links.readout.megnet_readout import MEGNetReadout

# Fixture dimensions shared by all MEGNetReadout tests below.
max_node_num = 6
max_edge_num = 10
# This value is the same as the atom and pair feature dimension
in_channels = 10
global_feature_dim = 5
out_dim = 4
batch_size = 2


@pytest.fixture
def readout():
    # Fresh readout link per test.
    return MEGNetReadout(in_channels=in_channels, out_dim=out_dim)


@pytest.fixture
def data():
    """Seeded random (atom_feat, pair_feat, global_feat, y_grad) batch."""
    numpy.random.seed(0)
    atom_feat = numpy.random.rand(batch_size, max_node_num,
                                  in_channels).astype(numpy.float32)
    pair_feat = numpy.random.rand(batch_size, max_edge_num,
                                  in_channels).astype(numpy.float32)
    global_feat = numpy.random.rand(batch_size,
                                    global_feature_dim).astype(numpy.float32)
    y_grad = numpy.random.uniform(
        -1, 1, (batch_size, out_dim)).astype(numpy.float32)
    return atom_feat, pair_feat, global_feat, y_grad


def check_forward(readout, data):
    # Forward pass; only the output shape is verified.
    y_actual = cuda.to_cpu(readout(*data).data)
    assert y_actual.shape == (batch_size, out_dim)


def test_forward_cpu(readout, data):
    atom_feat, pair_feat, global_feat = data[:-1]
    check_forward(readout, (atom_feat, pair_feat, global_feat))


@pytest.mark.gpu
def test_forward_gpu(readout, data):
    input_data = [cuda.to_gpu(d) for d in data[:-1]]
    readout.to_gpu()
    check_forward(readout, tuple(input_data))


def test_backward_cpu(readout, data):
    # NOTE(review): tolerances are loose (atol=5e-1) -- presumably to absorb
    # numerical-gradient noise; confirm before tightening.
    input_data, y_grad = data[0:-1], data[-1]
    gradient_check.check_backward(readout, tuple(input_data), y_grad,
                                  atol=5e-1, rtol=1e-1)


@pytest.mark.gpu
def test_backward_gpu(readout, data):
    data = [cuda.to_gpu(d) for d in data]
    input_data, y_grad = data[0:-1], data[-1]
    readout.to_gpu()
    gradient_check.check_backward(readout, tuple(input_data), y_grad,
                                  atol=5e-1, rtol=1e-1)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])

================================================ FILE: tests/links_tests/readout_tests/test_mpnn_readout.py
================================================

from typing import Tuple  # NOQA

import numpy
import pytest
from chainer import cuda
from chainer import gradient_check

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.links.readout.mpnn_readout import MPNNReadout
from chainer_chemistry.utils.permutation import permute_node

# Fixture dimensions shared by all MPNNReadout tests below.
atom_size = 5
in_channels = 7
out_dim = 4
batch_size = 2


@pytest.fixture
def readout():
    # type: () -> MPNNReadout
    # Fresh two-layer readout link per test.
    return MPNNReadout(out_dim=out_dim, in_channels=in_channels, n_layers=2)


@pytest.fixture
def data():
    # type: () -> Tuple[numpy.ndarray, numpy.ndarray]
    # Seeded random node features plus an upstream gradient for backward tests.
    numpy.random.seed(0)
    atom_data = numpy.random.uniform(
        0, high=MAX_ATOMIC_NUM,
        size=(batch_size, atom_size, in_channels)).astype('f')
    y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype('f')
    return atom_data, y_grad


def check_forward(readout, atom_data):
    # type: (MPNNReadout, numpy.ndarray) -> None
    # Only the output shape is verified for now.
    y_actual = cuda.to_cpu(readout(atom_data).data)
    assert y_actual.shape == (batch_size, out_dim)


# NOTE(review): "foward" in the test names below is a typo for "forward";
# kept as-is since renaming is out of scope for a documentation-only pass.
def test_foward_cpu(readout, data):
    # type: (MPNNReadout, Tuple[numpy.ndarray, numpy.ndarray]) -> None
    atom_data = data[0]
    check_forward(readout, atom_data)


@pytest.mark.gpu
def test_foward_gpu(readout, data):
    # type: (MPNNReadout, Tuple[numpy.ndarray, numpy.ndarray]) -> None
    atom_data = cuda.to_gpu(data[0])
    readout.to_gpu()
    check_forward(readout, atom_data)


def test_backward_cpu(readout, data):
    # type: (MPNNReadout, Tuple[numpy.ndarray, numpy.ndarray]) -> None
    atom_data, y_grad = data
    gradient_check.check_backward(
        readout, atom_data, y_grad, atol=1e-1, rtol=1e-1)


@pytest.mark.gpu
def test_backward_gpu(readout, data):
    # type: (MPNNReadout, Tuple[numpy.ndarray, numpy.ndarray]) -> None
    atom_data, y_grad = map(cuda.to_gpu, data)
    readout.to_gpu()
    gradient_check.check_backward(
        readout, atom_data, y_grad, atol=1e-1, rtol=1e-1)


def test_foward_cpu_graph_invariant(readout, data):
    # type: (MPNNReadout, Tuple[numpy.ndarray, numpy.ndarray]) -> None
    atom_data = data[0]
y_actual = cuda.to_cpu(readout(atom_data).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) permute_y_actual = cuda.to_cpu(readout(permute_atom_data).data) numpy.testing.assert_allclose( y_actual, permute_y_actual, rtol=1e-5, atol=1e-5) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/readout_tests/test_nfp_readout.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.readout.nfp_readout import NFPReadout from chainer_chemistry.utils.permutation import permute_node atom_size = 5 hidden_dim = 7 out_dim = 4 batch_size = 2 @pytest.fixture def readout(): return NFPReadout(in_channels=hidden_dim, out_dim=out_dim) @pytest.fixture def data(): numpy.random.seed(0) atom_data = numpy.random.uniform( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size, hidden_dim) ).astype('f') y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype('f') return atom_data, y_grad def check_forward(readout, atom_data): y_actual = cuda.to_cpu(readout(atom_data).data) assert y_actual.shape == (batch_size, out_dim) def test_forward_cpu(readout, data): atom_data = data[0] check_forward(readout, atom_data) @pytest.mark.gpu def test_forward_gpu(readout, data): atom_data = cuda.to_gpu(data[0]) readout.to_gpu() check_forward(readout, atom_data) def test_backward_cpu(readout, data): atom_data, y_grad = data gradient_check.check_backward( readout, atom_data, y_grad, atol=1e-3, rtol=1e-3) @pytest.mark.gpu def test_backward_gpu(readout, data): atom_data, y_grad = cuda.to_gpu(data[0]), cuda.to_gpu(data[1]) readout.to_gpu() gradient_check.check_backward( readout, atom_data, y_grad, atol=1e-3, rtol=1e-3) def test_forward_cpu_graph_invariant(readout, data): atom_data = 
data[0] y_actual = cuda.to_cpu(readout(atom_data).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) permute_y_actual = cuda.to_cpu(readout(permute_atom_data).data) numpy.testing.assert_allclose( y_actual, permute_y_actual, rtol=1e-3, atol=1e-3) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/readout_tests/test_schnet_readout.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.readout.schnet_readout import SchNetReadout from chainer_chemistry.utils.permutation import permute_node atom_size = 5 in_channels = 7 out_dim = 4 batch_size = 2 @pytest.fixture def readout(): return SchNetReadout(out_dim=out_dim, in_channels=in_channels) @pytest.fixture def data(): numpy.random.seed(0) atom_data = numpy.random.uniform( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size, in_channels) ).astype('f') y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype('f') return atom_data, y_grad def check_forward(readout, atom_data): y_actual = cuda.to_cpu(readout(atom_data).data) assert y_actual.shape == (batch_size, out_dim) def test_forward_cpu(readout, data): atom_data = data[0] check_forward(readout, atom_data) @pytest.mark.gpu def test_forward_gpu(readout, data): atom_data = cuda.to_gpu(data[0]) readout.to_gpu() check_forward(readout, atom_data) def test_backward_cpu(readout, data): atom_data, y_grad = data gradient_check.check_backward( readout, atom_data, y_grad, atol=1e-1, rtol=1e-1) @pytest.mark.gpu def test_backward_gpu(readout, data): atom_data, y_grad = cuda.to_gpu(data[0]), cuda.to_gpu(data[1]) readout.to_gpu() gradient_check.check_backward( readout, atom_data, y_grad, atol=1e-1, rtol=1e-1) def 
test_forward_cpu_graph_invariant(readout, data): atom_data = data[0] y_actual = cuda.to_cpu(readout(atom_data).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) permute_y_actual = cuda.to_cpu(readout(permute_atom_data).data) numpy.testing.assert_allclose( y_actual, permute_y_actual, rtol=1e-5, atol=1e-5) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/readout_tests/test_set2set.py ================================================ from typing import Tuple # NOQA import numpy import pytest from chainer import cuda from chainer import gradient_check from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.readout.set2set import Set2Set from chainer_chemistry.utils.permutation import permute_node atom_size = 5 in_channels = 7 batch_size = 2 @pytest.fixture def readout(): # type: () -> Set2Set return Set2Set(in_channels=in_channels, n_layers=2) @pytest.fixture def data(): # type: () -> Tuple[numpy.ndarray, numpy.ndarray] numpy.random.seed(0) atom_data = numpy.random.uniform( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size, in_channels)).astype('f') y_grad = numpy.random.uniform(-1, 1, (batch_size, in_channels * 2)).astype('f') return atom_data, y_grad def check_forward(readout, atom_data): # type: (Set2Set, numpy.ndarray) -> None readout.reset_state() y_actual = cuda.to_cpu(readout(atom_data).data) assert y_actual.shape == (batch_size, in_channels * 2) def test_forward_cpu(readout, data): # type: (Set2Set, Tuple[numpy.ndarray, numpy.ndarray]) -> None atom_data = data[0] check_forward(readout, atom_data) @pytest.mark.gpu def test_forward_gpu(readout, data): # type: (Set2Set, Tuple[numpy.ndarray, numpy.ndarray]) -> None atom_data = cuda.to_gpu(data[0]) readout.to_gpu() check_forward(readout, atom_data) def check_backward(readout, atom_data, y_grad): # type: (Set2Set, 
numpy.ndarray, numpy.ndarray) -> None """Check gradient of Set2Set. This function is different from other backward tests. Because of LSTM, reset_state method has to be called explicitly before gradient calculation. Args: readout: atom_data: y_grad: """ def f(atom_data): readout.reset_state() return readout(atom_data), gradient_check.check_backward( f, (atom_data), y_grad, atol=1e-1, rtol=1e-1) def test_backward_cpu(readout, data): # type: (Set2Set, Tuple[numpy.ndarray, numpy.ndarray]) -> None check_backward(readout, *data) @pytest.mark.gpu def test_backward_gpu(readout, data): # type: (Set2Set, Tuple[numpy.ndarray, numpy.ndarray]) -> None atom_data, y_grad = [cuda.to_gpu(d) for d in data] readout.to_gpu() check_backward(readout, atom_data, y_grad) def test_forward_cpu_graph_invariant(readout, data): # type: (Set2Set, Tuple[numpy.ndarray, numpy.ndarray]) -> None atom_data = data[0] readout.reset_state() y_actual = cuda.to_cpu(readout(atom_data).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) readout.reset_state() permute_y_actual = cuda.to_cpu(readout(permute_atom_data).data) numpy.testing.assert_allclose( y_actual, permute_y_actual, rtol=1e-6, atol=1e-6) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/scaler_tests/test_flow_scaler.py ================================================ import os import numpy import pytest import scipy.stats from chainer import serializers, Variable, cuda, testing # NOQA from chainer_chemistry.links.scaler.flow_scaler import FlowScaler @testing.with_requires('chainer>=5.0.0') @pytest.mark.slow def test_flow_scaler_transform_uniform(): x = numpy.random.uniform(50, 100, size=100).astype(numpy.float32) scaler = FlowScaler(5) scaler.fit(x) # fit takes time x_scaled = scaler.transform(x) assert scipy.stats.kstest(x_scaled, 'norm').pvalue > 0.05 
@testing.with_requires('chainer>=5.0.0') @pytest.mark.slow def test_flow_scaler_transform_mix_gaussian(): plus = numpy.random.binomial(n=1, p=0.6, size=100).astype(numpy.float32) x = plus * numpy.random.normal(10, 5, size=100).astype(numpy.float32) x += (1 - plus) * numpy.random.normal( -10, 5, size=100).astype(numpy.float32) scaler = FlowScaler(5) scaler.fit(x) # fit takes time x_scaled = scaler.transform(x) assert scipy.stats.kstest(x_scaled, 'norm').pvalue > 0.05 @testing.with_requires('chainer>=5.0.0') @pytest.mark.slow def test_flow_scaler_transform_variable(): x = numpy.random.uniform(50, 100, size=100).astype(numpy.float32) xvar = Variable(x) scaler = FlowScaler(5) scaler.fit(xvar) # fit takes time x_scaled = scaler.transform(xvar) assert isinstance(x_scaled, Variable) assert scipy.stats.kstest(x_scaled.array, 'norm').pvalue > 0.05 @testing.with_requires('chainer>=5.0.0') @pytest.mark.gpu @pytest.mark.slow def test_flow_scaler_transform_gpu(): x = numpy.random.uniform(50, 100, size=100).astype(numpy.float32) scaler = FlowScaler(5) scaler.to_gpu() x = cuda.to_gpu(x) scaler.fit(x) # fit takes time x_scaled = scaler.transform(x) assert isinstance(x_scaled, cuda.cupy.ndarray) assert scipy.stats.kstest(cuda.to_cpu(x_scaled), 'norm').pvalue > 0.05 @testing.with_requires('chainer>=5.0.0') @pytest.mark.slow def test_flow_scaler_serialize(tmpdir): x = numpy.random.uniform(50, 100, size=100).astype(numpy.float32) scaler = FlowScaler(5) scaler.fit(x) # fit takes time x_scaled = scaler.transform(x) scaler_filepath = os.path.join(str(tmpdir), 'scaler.npz') serializers.save_npz(scaler_filepath, scaler) scaler2 = FlowScaler(5) serializers.load_npz(scaler_filepath, scaler2) x_scaled2 = scaler2.transform(x) assert numpy.allclose(scaler.W1.array, scaler2.W1.array) assert numpy.allclose(scaler.b1.array, scaler2.b1.array) assert numpy.allclose(scaler.W2.array, scaler2.W2.array) assert numpy.allclose(scaler.b2.array, scaler2.b2.array) assert numpy.allclose(x_scaled, x_scaled2) 
@testing.with_requires('chainer>=5.0.0') def test_flow_scaler_pipeline(): # Only to test each method without fail, for fast testing. x = numpy.random.uniform(50, 100, size=100).astype(numpy.float32) scaler = FlowScaler(5) scaler.fit(x, iteration=1) x_scaled = scaler.transform(x) assert x_scaled.shape == x.shape @testing.with_requires('chainer>=5.0.0') @pytest.mark.gpu def test_flow_scaler_pipeline_gpu(): # Only to test each method without fail, for fast testing. x = numpy.random.uniform(50, 100, size=100).astype(numpy.float32) x = cuda.to_gpu(x) scaler = FlowScaler(5) scaler.to_gpu() scaler.fit(x, iteration=1) x_scaled = scaler.transform(x) assert isinstance(x_scaled, cuda.cupy.ndarray) assert x_scaled.shape == x.shape if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/scaler_tests/test_max_abs_scaler.py ================================================ import os import numpy import pytest from chainer import serializers, Variable, cuda # NOQA from chainer_chemistry.links.scaler.max_abs_scaler import MaxAbsScaler @pytest.fixture def data(): x = numpy.array( [[0.1, 10., 0.3], [0.2, 20., 0.1], [-0.3, 30., 0.], [0.4, -40., 0.]], dtype=numpy.float32) expect_x_scaled = numpy.array( [[0.25, 0.25, 1.], [0.5, 0.5, 0.3333333], [-0.75, 0.75, 0.], [1., -1., 0.]], dtype=numpy.float32) return x, expect_x_scaled @pytest.mark.parametrize('indices', [None, [0], [1, 2]]) def test_max_abs_scaler_transform(data, indices): x, expect_x_scaled = data scaler = MaxAbsScaler() scaler.fit(x, indices=indices) x_scaled = scaler.transform(x) if indices is None: indices = numpy.arange(x.shape[1]) numpy.allclose(scaler.max_abs, numpy.array([0.4, 40, 0.3])[indices]) for index in range(x.shape[1]): if index in indices: assert numpy.allclose(x_scaled[:, index], expect_x_scaled[:, index]) else: assert numpy.allclose(x_scaled[:, index], x[:, index]) def test_max_abs_scaler_transform_variable(data): x, expect_x_scaled 
= data xvar = Variable(x) scaler = MaxAbsScaler() scaler.fit(xvar) x_scaled = scaler.transform(xvar) assert isinstance(x_scaled, Variable) assert numpy.allclose(x_scaled.array, expect_x_scaled) @pytest.mark.gpu def test_max_abs_scaler_transform_gpu(data): x, expect_x_scaled = data scaler = MaxAbsScaler() scaler.to_gpu() x = cuda.to_gpu(x) scaler.fit(x) x_scaled = scaler.transform(x) assert isinstance(x_scaled, cuda.cupy.ndarray) assert numpy.allclose(cuda.to_cpu(x_scaled), expect_x_scaled) @pytest.mark.parametrize('indices', [None, [0], [1, 2]]) def test_max_abs_scaler_inverse_transform(data, indices): x, expect_x_scaled = data scaler = MaxAbsScaler() scaler.fit(x, indices=indices) x_inverse = scaler.inverse_transform(expect_x_scaled) if indices is None: indices = numpy.arange(x.shape[1]) for index in range(x.shape[1]): if index in indices: assert numpy.allclose(x_inverse[:, index], x[:, index]) else: assert numpy.allclose(x_inverse[:, index], expect_x_scaled[:, index]) @pytest.mark.parametrize('axis', [1, 2]) def test_max_abs_scaler_3darray(data, axis): x, expect_x_scaled = data s0, s1 = x.shape if axis == 1: # feature axis is 1, insert other axis to 2nd axis x = numpy.broadcast_to(x[:, :, None], (s0, s1, 2)) expect_x_scaled = numpy.broadcast_to( expect_x_scaled[:, :, None], (s0, s1, 2)) elif axis == 2: # feature axis is 2, insert other axis to 1st axis x = numpy.broadcast_to(x[:, None, :], (s0, 3, s1)) expect_x_scaled = numpy.broadcast_to( expect_x_scaled[:, None, :], (s0, 3, s1)) assert x.ndim == 3 indices = None scaler = MaxAbsScaler() scaler.fit(x, indices=indices, axis=axis) x_scaled = scaler.transform(x, axis=axis) assert x_scaled.shape == expect_x_scaled.shape assert numpy.allclose(x_scaled, expect_x_scaled, atol=1e-7) x_inverse = scaler.inverse_transform(expect_x_scaled, axis=axis) for index in numpy.arange(x.shape[1]): assert numpy.allclose(x_inverse[:, index], x[:, index], atol=1e-7) def test_max_abs_scaler_fit_transform(data): x, expect_x_scaled = data 
scaler = MaxAbsScaler() x_scaled = scaler.fit_transform(x) assert numpy.allclose(x_scaled, expect_x_scaled) # TODO(nakago): fix Chainer serializer. # Behavior changed from numpy versioin 1.16.3. # allow_pickle=True must be passed to numpy.load function, # in order to load `None`. # For now, skip test for serialize `None`. # @pytest.mark.parametrize('indices', [None, [0]]) @pytest.mark.parametrize('indices', [[0]]) def test_max_abs_scaler_serialize(tmpdir, data, indices): x, expect_x_scaled = data scaler = MaxAbsScaler() scaler.fit(x, indices=indices) scaler_filepath = os.path.join(str(tmpdir), 'scaler.npz') serializers.save_npz(scaler_filepath, scaler) scaler2 = MaxAbsScaler() serializers.load_npz(scaler_filepath, scaler2) # print('scaler2 attribs:', scaler2.max_abs, scaler2.indices) assert numpy.allclose(scaler.max_abs, scaler2.max_abs) assert scaler.indices == scaler2.indices def test_max_abs_scaler_assert_raises(): x = numpy.array([[0.1, 0.2, 0.3], [0.5, 0.3, 0.1]], dtype=numpy.float32) scaler = MaxAbsScaler() # call transform before fit raises error with pytest.raises(AttributeError): scaler.transform(x) with pytest.raises(AttributeError): scaler.inverse_transform(x) def test_max_abs_scaler_transform_zero_max(): x = numpy.array([[0, 2], [0, 2], [0, 2]], dtype=numpy.float32) expect_x_scaled = numpy.array([[0, 1], [0, 1], [0, 1]], dtype=numpy.float32) scaler = MaxAbsScaler() scaler.fit(x) x_scaled = scaler.transform(x) # print('max_abs', scaler.max_abs) assert numpy.allclose(x_scaled, expect_x_scaled) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/scaler_tests/test_min_max_scaler.py ================================================ import os import numpy import pytest from chainer import serializers, Variable, cuda # NOQA from chainer_chemistry.links.scaler.min_max_scaler import MinMaxScaler @pytest.fixture def data(): x = numpy.array( [[0.1, 10., 0.3], [0.2, 20., 0.1], [-0.3, 
30., 0.], [0.4, -40., 0.]], dtype=numpy.float32) expect_x_scaled = numpy.array( [[0.57142854, 0.71428573, 1.], [0.7142857, 0.85714287, 0.3333333], [0., 1., 0.], [1., 0., 0.]], dtype=numpy.float32) return x, expect_x_scaled @pytest.mark.parametrize('indices', [None, [0], [1, 2]]) def test_min_max_scaler_transform(data, indices): x, expect_x_scaled = data scaler = MinMaxScaler() scaler.fit(x, indices=indices) x_scaled = scaler.transform(x) if indices is None: indices = numpy.arange(x.shape[1]) numpy.allclose(scaler.max, numpy.array([0.4, 30, 0.3])[indices]) numpy.allclose(scaler.min, numpy.array([-0.3, -40, 0])[indices]) for index in range(x.shape[1]): if index in indices: assert numpy.allclose(x_scaled[:, index], expect_x_scaled[:, index]) else: assert numpy.allclose(x_scaled[:, index], x[:, index]) def test_min_max_scaler_transform_variable(data): x, expect_x_scaled = data xvar = Variable(x) scaler = MinMaxScaler() scaler.fit(xvar) x_scaled = scaler.transform(xvar) assert isinstance(x_scaled, Variable) assert numpy.allclose(x_scaled.array, expect_x_scaled) @pytest.mark.gpu def test_min_max_scaler_transform_gpu(data): x, expect_x_scaled = data scaler = MinMaxScaler() scaler.to_gpu() x = cuda.to_gpu(x) scaler.fit(x) x_scaled = scaler.transform(x) assert isinstance(x_scaled, cuda.cupy.ndarray) assert numpy.allclose(cuda.to_cpu(x_scaled), expect_x_scaled) @pytest.mark.parametrize('indices', [None, [0], [1, 2]]) def test_min_max_scaler_inverse_transform(data, indices): x, expect_x_scaled = data scaler = MinMaxScaler() scaler.fit(x, indices=indices) x_inverse = scaler.inverse_transform(expect_x_scaled) if indices is None: indices = numpy.arange(x.shape[1]) for index in range(x.shape[1]): if index in indices: assert numpy.allclose(x_inverse[:, index], x[:, index]) else: assert numpy.allclose(x_inverse[:, index], expect_x_scaled[:, index]) @pytest.mark.parametrize('axis', [1, 2]) def test_min_max_scaler_3darray(data, axis): x, expect_x_scaled = data s0, s1 = x.shape if 
axis == 1: # feature axis is 1, insert other axis to 2nd axis x = numpy.broadcast_to(x[:, :, None], (s0, s1, 2)) expect_x_scaled = numpy.broadcast_to( expect_x_scaled[:, :, None], (s0, s1, 2)) elif axis == 2: # feature axis is 2, insert other axis to 1st axis x = numpy.broadcast_to(x[:, None, :], (s0, 3, s1)) expect_x_scaled = numpy.broadcast_to( expect_x_scaled[:, None, :], (s0, 3, s1)) assert x.ndim == 3 indices = None scaler = MinMaxScaler() scaler.fit(x, indices=indices, axis=axis) x_scaled = scaler.transform(x, axis=axis) assert x_scaled.shape == expect_x_scaled.shape assert numpy.allclose(x_scaled, expect_x_scaled, atol=1e-7) x_inverse = scaler.inverse_transform(expect_x_scaled, axis=axis) for index in numpy.arange(x.shape[1]): assert numpy.allclose(x_inverse[:, index], x[:, index], atol=1e-7) def test_min_max_scaler_fit_transform(data): x, expect_x_scaled = data scaler = MinMaxScaler() x_scaled = scaler.fit_transform(x) assert numpy.allclose(x_scaled, expect_x_scaled) # TODO(nakago): fix Chainer serializer. # Behavior changed from numpy versioin 1.16.3. # allow_pickle=True must be passed to numpy.load function, # in order to load `None`. # For now, skip test for serialize `None`. 
# @pytest.mark.parametrize('indices', [None, [0]]) @pytest.mark.parametrize('indices', [[0]]) def test_min_max_scaler_serialize(tmpdir, data, indices): x, expect_x_scaled = data scaler = MinMaxScaler() scaler.fit(x, indices=indices) scaler_filepath = os.path.join(str(tmpdir), 'scaler.npz') serializers.save_npz(scaler_filepath, scaler) scaler2 = MinMaxScaler() serializers.load_npz(scaler_filepath, scaler2) # print('scaler2 attribs:', scaler2.min, scaler2.max, scaler2.indices) assert numpy.allclose(scaler.min, scaler2.min) assert numpy.allclose(scaler.max, scaler2.max) assert scaler.indices == scaler2.indices def test_min_max_scaler_assert_raises(): x = numpy.array([[0.1, 0.2, 0.3], [0.5, 0.3, 0.1]], dtype=numpy.float32) scaler = MinMaxScaler() # call transform before fit raises error with pytest.raises(AttributeError): scaler.transform(x) with pytest.raises(AttributeError): scaler.inverse_transform(x) def test_min_max_scaler_transform_zero_max(): x = numpy.array([[0, 2], [0, 2], [0, 2]], dtype=numpy.float32) expect_x_scaled = numpy.array([[0, 0], [0, 0], [0, 0]], dtype=numpy.float32) scaler = MinMaxScaler() scaler.fit(x) x_scaled = scaler.transform(x) # print('min', scaler.min, 'max', scaler.max) assert numpy.allclose(x_scaled, expect_x_scaled) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/scaler_tests/test_standard_scaler.py ================================================ import os import chainer import numpy import pytest from chainer import serializers, Variable, cuda # NOQA from chainer_chemistry.links.scaler.standard_scaler import StandardScaler @pytest.fixture def data(): x = numpy.array( [[0.1, 10., 0.3], [0.2, 20., 0.1], [0.3, 30., 0.], [0.4, 40., 0.]], dtype=numpy.float32) expect_x_scaled = numpy.array( [[-1.3416407, -1.3416408, 1.6329931], [-0.44721353, -0.4472136, 0.], [0.44721368, 0.4472136, -0.8164965], [1.3416407, 1.3416408, -0.8164965]], dtype=numpy.float32) 
return x, expect_x_scaled @pytest.mark.parametrize('indices', [None, [0], [1, 2]]) def test_standard_scaler_transform(data, indices): x, expect_x_scaled = data scaler = StandardScaler() scaler.fit(x, indices=indices) x_scaled = scaler.transform(x) if indices is None: indices = numpy.arange(x.shape[1]) for index in range(x.shape[1]): if index in indices: assert numpy.allclose(x_scaled[:, index], expect_x_scaled[:, index]) else: assert numpy.allclose(x_scaled[:, index], x[:, index]) def test_standard_scaler_transform_variable(data): x, expect_x_scaled = data xvar = Variable(x) scaler = StandardScaler() scaler.fit(xvar) x_scaled = scaler.transform(xvar) assert isinstance(x_scaled, Variable) assert numpy.allclose(x_scaled.array, expect_x_scaled) @pytest.mark.gpu def test_standard_scaler_transform_gpu(data): x, expect_x_scaled = data scaler = StandardScaler() scaler.to_gpu() x = cuda.to_gpu(x) scaler.fit(x) x_scaled = scaler.transform(x) assert isinstance(x_scaled, cuda.cupy.ndarray) assert numpy.allclose(cuda.to_cpu(x_scaled), expect_x_scaled) @pytest.mark.parametrize('indices', [None, [0], [1, 2]]) def test_standard_scaler_inverse_transform(data, indices): x, expect_x_scaled = data scaler = StandardScaler() scaler.fit(x, indices=indices) x_inverse = scaler.inverse_transform(expect_x_scaled) if indices is None: indices = numpy.arange(x.shape[1]) for index in range(x.shape[1]): if index in indices: assert numpy.allclose(x_inverse[:, index], x[:, index]) else: assert numpy.allclose(x_inverse[:, index], expect_x_scaled[:, index]) @pytest.mark.parametrize('axis', [1, 2]) def test_standard_scaler_3darray(data, axis): x, expect_x_scaled = data s0, s1 = x.shape if axis == 1: # feature axis is 1, insert other axis to 2nd axis x = numpy.broadcast_to(x[:, :, None], (s0, s1, 2)) expect_x_scaled = numpy.broadcast_to( expect_x_scaled[:, :, None], (s0, s1, 2)) elif axis == 2: # feature axis is 2, insert other axis to 1st axis x = numpy.broadcast_to(x[:, None, :], (s0, 3, s1)) 
expect_x_scaled = numpy.broadcast_to( expect_x_scaled[:, None, :], (s0, 3, s1)) assert x.ndim == 3 indices = None scaler = StandardScaler() scaler.fit(x, indices=indices, axis=axis) x_scaled = scaler.transform(x, axis=axis) assert x_scaled.shape == expect_x_scaled.shape assert numpy.allclose(x_scaled, expect_x_scaled, atol=1e-7) x_inverse = scaler.inverse_transform(expect_x_scaled, axis=axis) for index in numpy.arange(x.shape[1]): assert numpy.allclose(x_inverse[:, index], x[:, index], atol=1e-7) def test_standard_scaler_fit_transform(data): x, expect_x_scaled = data scaler = StandardScaler() x_scaled = scaler.fit_transform(x) assert numpy.allclose(x_scaled, expect_x_scaled) # TODO(nakago): fix Chainer serializer. # Behavior changed from numpy versioin 1.16.3. # allow_pickle=True must be passed to numpy.load function, # in order to load `None`. # For now, skip test for serialize `None`. # @pytest.mark.parametrize('indices', [None, [0]]) @pytest.mark.parametrize('indices', [[0]]) def test_standard_scaler_serialize(tmpdir, data, indices): x, expect_x_scaled = data scaler = StandardScaler() scaler.fit(x, indices=indices) scaler_filepath = os.path.join(str(tmpdir), 'scaler.npz') serializers.save_npz(scaler_filepath, scaler) scaler2 = StandardScaler() serializers.load_npz(scaler_filepath, scaler2) # print('scaler2 attribs:', scaler2.mean, scaler2.std, scaler2.indices) assert numpy.allclose(scaler.mean, scaler2.mean) assert numpy.allclose(scaler.std, scaler2.std) assert scaler.indices == scaler2.indices def test_standard_scaler_assert_raises(): x = numpy.array([[0.1, 0.2, 0.3], [0.5, 0.3, 0.1]], dtype=numpy.float32) scaler = StandardScaler() # call transform before fit raises error with pytest.raises(AttributeError): scaler.transform(x) with pytest.raises(AttributeError): scaler.inverse_transform(x) def test_standard_scaler_transform_zero_std(): x = numpy.array([[1, 2], [1, 2], [1, 2]], dtype=numpy.float32) expect_x_scaled = numpy.array([[0, 0], [0, 0], [0, 0]], 
dtype=numpy.float32) scaler = StandardScaler() scaler.fit(x) x_scaled = scaler.transform(x) assert numpy.allclose(x_scaled, expect_x_scaled) def test_standard_scaler_forward(data): # test `forward` and `__call__` method. indices = [0] x, expect_x_scaled = data scaler = StandardScaler() scaler.fit(x, indices=indices) x_scaled_transform = scaler.transform(x) x_scaled_forward = scaler.forward(x) assert numpy.allclose(x_scaled_transform, x_scaled_forward) if int(chainer.__version__.split('.')[0]) >= 5: # `__call__` invokes `forward` method from version 5. # Skip test for chainer v4. x_scaled_call = scaler(x) assert numpy.allclose(x_scaled_transform, x_scaled_call) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/update_tests/test_cgcnn_update.py ================================================ import numpy import pytest from chainer import cuda from chainer_chemistry.links.update.cgcnn_update import CGCNNUpdate # node_size_list means the first moleculae has three nodes, # and the seconde molecule has five nodes node_size_list = [3, 5] max_num_nbr = 6 node_feature_dim = 10 edge_feature_dim = 15 out_dim = node_feature_dim batch_size = 2 @pytest.fixture def update(): return CGCNNUpdate(n_site_features=node_feature_dim) @pytest.fixture def data(): if len(node_size_list) != batch_size: raise ValueError("Invalid fixture data for CGCNN") numpy.random.seed(0) total_node_size = sum(node_size_list) atom_feat = numpy.random.rand(total_node_size, node_feature_dim).astype(numpy.float32) nbr_feat = numpy.random.rand(total_node_size, max_num_nbr, edge_feature_dim).astype(numpy.float32) # nbr_idx curr_idx = 0 nbr_idx = [] for val in node_size_list: for _ in range(val): max_val = curr_idx + val nbr_idx.append(numpy.random.randint(curr_idx, max_val, max_num_nbr)) curr_idx += val nbr_idx = numpy.array(nbr_idx, dtype=numpy.int32) y_grad = numpy.random.uniform(-1, 1, (batch_size, 
out_dim)).astype(numpy.float32) return atom_feat, nbr_feat, nbr_idx, y_grad def check_forward(update, data): y_actual = cuda.to_cpu(update(*data).data) assert y_actual.shape == (sum(node_size_list), out_dim) def test_forward_cpu(update, data): atom_feat, nbr_feat, nbr_idx = data[:-1] check_forward(update, (atom_feat, nbr_feat, nbr_idx)) @pytest.mark.gpu def test_forward_gpu(update, data): input_data = [cuda.to_gpu(d) for d in data[:-1]] update.to_gpu() check_forward(update, tuple(input_data)) # def test_backward_cpu(update, data): # input_data, y_grad = data[0:-1], data[-1] # gradient_check.check_backward(update, tuple(input_data), y_grad, # atol=5e-1, rtol=1e-1) # @pytest.mark.gpu # def test_backward_gpu(update, data): # atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data] # update.to_gpu() # gradient_check.check_backward(update, (atom_data, adj_data), y_grad, # atol=5e-1, rtol=1e-1) if __name__ == '__main__': pytest.main([__file__, '-v']) ================================================ FILE: tests/links_tests/update_tests/test_ggnn_update.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID from chainer_chemistry.links.update.ggnn_update import GGNNUpdate from chainer_chemistry.utils.permutation import permute_adj from chainer_chemistry.utils.permutation import permute_node from chainer_chemistry.utils.sparse_utils import _convert_to_sparse from chainer_chemistry.utils.sparse_utils import convert_sparse_with_edge_type from chainer_chemistry.utils.sparse_utils import sparse_utils_available atom_size = 5 in_channels = 4 hidden_channels = 7 batch_size = 2 n_edge_types = 2 @pytest.fixture def update(): return GGNNUpdate(in_channels=in_channels, hidden_channels=hidden_channels, n_edge_types=n_edge_types) @pytest.fixture def data(): numpy.random.seed(0) 
atom_data = numpy.random.randint( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size) ).astype('i') adj_data = numpy.random.uniform( 0, high=2, size=(batch_size, n_edge_types, atom_size, atom_size) ).astype('f') y_grad = numpy.random.uniform( -1, 1, (batch_size, atom_size, hidden_channels)).astype('f') embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels) embed_atom_data = embed(atom_data).data return embed_atom_data, adj_data, y_grad @pytest.mark.skipif(not sparse_utils_available()) def convert_to_sparse(dense_adj): # auxiliary function data, row, col, edge_type = _convert_to_sparse(dense_adj) return convert_sparse_with_edge_type(data, row, col, atom_size, edge_type, n_edge_types) def check_forward(update, atom_data, adj_data): update.reset_state() y_actual = cuda.to_cpu(update(atom_data, adj_data).data) assert y_actual.shape == (batch_size, atom_size, hidden_channels) return y_actual def test_forward_cpu(update, data): atom_data, adj_data = data[:2] y_dense = check_forward(update, atom_data, adj_data) if sparse_utils_available(): sparse_adj = convert_to_sparse(adj_data) y_sparse = check_forward(update, atom_data, sparse_adj) # results for dense matrix and sparse matrix must be same numpy.testing.assert_allclose( y_dense, y_sparse, atol=1e-4, rtol=1e-4) @pytest.mark.gpu def test_forward_gpu(update, data): atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1]) update.to_gpu() y_dense = check_forward(update, atom_data, adj_data) if sparse_utils_available(): sparse_adj = convert_to_sparse(adj_data) y_sparse = check_forward(update, atom_data, sparse_adj) numpy.testing.assert_allclose( cuda.to_cpu(y_dense), cuda.to_cpu(y_sparse), atol=1e-4, rtol=1e-4) def check_backward(update, atom_data, adj_data, y_grad): """Check gradient of GGNNUpdate. This function is different from other backward tests. Because of GRU, reset_state method has to be called explicitly before gradient calculation. 
Args: update (callable): atom_data (numpy.ndarray): adj_data (numpy.ndarray): y_grad (numpy.ndarray): """ def f(atom_data): # skip adj_data check. update.reset_state() return update(atom_data, adj_data) gradient_check.check_backward( f, (atom_data), y_grad, atol=1e-1, rtol=1e-1) def test_backward_cpu(update, data): atom_data, adj_data, y_grad = data check_backward(update, atom_data, adj_data, y_grad) if sparse_utils_available(): sparse_adj = convert_to_sparse(adj_data) check_backward(update, atom_data, sparse_adj, y_grad) @pytest.mark.gpu def test_backward_gpu(update, data): update.to_gpu() atom_data, adj_data, y_grad = map(cuda.to_gpu, data) check_backward(update, atom_data, adj_data, y_grad) if sparse_utils_available(): sparse_adj = convert_to_sparse(adj_data) check_backward(update, atom_data, sparse_adj, y_grad) def test_forward_cpu_graph_invariant(update, data): permutation_index = numpy.random.permutation(atom_size) atom_data, adj_data = data[:2] update.reset_state() y_actual = cuda.to_cpu(update(atom_data, adj_data).data) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) permute_adj_data = permute_adj(adj_data, permutation_index) update.reset_state() permute_y_actual = cuda.to_cpu(update( permute_atom_data, permute_adj_data).data) numpy.testing.assert_allclose( permute_node(y_actual, permutation_index, axis=1), permute_y_actual, rtol=1e-5, atol=1e-5) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/update_tests/test_gin_update.py ================================================ from typing import Tuple # NOQA import chainer # NOQA from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID from chainer_chemistry.links.update.gin_update import GINUpdate from chainer_chemistry.utils.permutation import 
permute_adj from chainer_chemistry.utils.permutation import permute_node atom_size = 5 in_channels = 4 hidden_channels = 6 batch_size = 3 num_edge_type = 7 @pytest.fixture def update(): # type: () -> GINUpdate return GINUpdate(in_channels=in_channels, hidden_channels=hidden_channels, dropout_ratio=0) @pytest.fixture def data(): # type: () -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray] numpy.random.seed(0) atom_data = numpy.random.randint( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i') adj_data = numpy.random.randint( 0, high=2, size=(batch_size, atom_size, atom_size)).astype('f') y_grad = numpy.random.uniform( -1, 1, (batch_size, atom_size, hidden_channels)).astype('f') embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels) embed_atom_data = embed(atom_data).data return embed_atom_data, adj_data, y_grad # Test Update Function def check_forward(update, atom_data, adj_data): # type: (GINUpdate, numpy.ndarray, numpy.ndarray) -> None y_actual = cuda.to_cpu(update(atom_data, adj_data).data) assert y_actual.shape == (batch_size, atom_size, hidden_channels) def test_forward_cpu(update, data): # type: (GINUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data = data[:2] check_forward(update, atom_data, adj_data) @pytest.mark.gpu def test_forward_gpu(update, data): # type: (GINUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data = map(cuda.to_gpu, data[:2]) update.to_gpu() check_forward(update, atom_data, adj_data) def test_backward_cpu(update, data): # type: (GINUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data, y_grad = data gradient_check.check_backward( update, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3) @pytest.mark.gpu def test_backward_gpu(update, data): # type: (GINUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data, y_grad = map(cuda.to_gpu, data[:3]) 
update.to_gpu() gradient_check.check_backward( update, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3) def test_forward_cpu_graph_invariant(update, data): # type: (GINUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data = data[:2] y_actual = cuda.to_cpu(update(atom_data, adj_data).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) permute_adj_data = permute_adj(adj_data, permutation_index) permute_y_actual = cuda.to_cpu( update(permute_atom_data, permute_adj_data).data) numpy.testing.assert_allclose( permute_node(y_actual, permutation_index, axis=1), permute_y_actual, rtol=1e-3, atol=1e-3) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/update_tests/test_gnn_film_update.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID from chainer_chemistry.links.update.gnn_film_update import GNNFiLMUpdate from chainer_chemistry.utils.permutation import permute_adj from chainer_chemistry.utils.permutation import permute_node atom_size = 5 in_channels = 7 hidden_channels = 7 batch_size = 2 n_edge_types = 5 @pytest.fixture def update(): return GNNFiLMUpdate(hidden_channels=hidden_channels, n_edge_types=n_edge_types) @pytest.fixture def data(): numpy.random.seed(0) atom_data = numpy.random.randint( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size) ).astype('i') adj_data = numpy.random.uniform( 0, high=2, size=(batch_size, n_edge_types, atom_size, atom_size) ).astype('f') y_grad = numpy.random.uniform( -1, 1, (batch_size, atom_size, hidden_channels)).astype('f') embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels) embed_atom_data = 
embed(atom_data).data adj_data = adj_data return embed_atom_data, adj_data, y_grad # Test Update Function def check_forward(update, atom_data, adj_data): # type: (GNNFiLMUpdate, numpy.ndarray, numpy.ndarray) -> None y_actual = cuda.to_cpu(update(atom_data, adj_data).data) assert y_actual.shape == (batch_size, atom_size, hidden_channels) def test_forward_cpu(update, data): # type: (GNNFiLMUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data = data[:2] check_forward(update, atom_data, adj_data) @pytest.mark.gpu def test_forward_gpu(update, data): # type: (GNNFiLMUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data = map(cuda.to_gpu, data[:2]) update.to_gpu() check_forward(update, atom_data, adj_data) def test_backward_cpu(update, data): # type: (GNNFiLMUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data, y_grad = data gradient_check.check_backward( update, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2) @pytest.mark.gpu def test_backward_gpu(update, data): # type: (GNNFiLMUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data, y_grad = map(cuda.to_gpu, data[:3]) update.to_gpu() # print(type(adj_data)) gradient_check.check_backward( update, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2) def test_forward_cpu_graph_invariant(update, data): # type: (GNNFiLMUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data = data[:2] y_actual = cuda.to_cpu(update(atom_data, adj_data).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) permute_adj_data = permute_adj(adj_data, permutation_index) permute_y_actual = cuda.to_cpu( update(permute_atom_data, permute_adj_data).data) numpy.testing.assert_allclose( permute_node(y_actual, permutation_index, axis=1), permute_y_actual, rtol=1e-3, 
atol=1e-3) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/update_tests/test_megnet_update.py ================================================ from chainer import cuda import numpy import pytest from chainer_chemistry.links.update.megnet_update import MEGNetUpdate # node_size_list means the first moleculae has six nodes, # and the seconde molecule has four nodes node_size_list = [6, 4] # edge_size_list means the first moleculae has eight edges, # and the seconde molecule has four edges edge_size_list = [8, 4] node_feature_dim = 5 edge_feature_dim = 10 global_feature_dim = 2 out_dim = 32 batch_size = 2 @pytest.fixture def update(): return MEGNetUpdate() @pytest.fixture def data(): if len(node_size_list) != batch_size or len(edge_size_list) != batch_size: raise ValueError("Invalid fixture for MEGNet") numpy.random.seed(0) total_node_size = sum(node_size_list) total_edge_size = sum(edge_size_list) atom_feat = numpy.random.rand(total_node_size, node_feature_dim).astype(numpy.float32) pair_feat = numpy.random.rand(total_edge_size, edge_feature_dim).astype(numpy.float32) global_feat = numpy.random.rand(batch_size, global_feature_dim).astype(numpy.float32) # atom idx atom_idx = numpy.hstack([[i] * node_size_list[i] for i in range(batch_size)]).astype(numpy.int32) # pair idx pair_idx = numpy.hstack([[i] * edge_size_list[i] for i in range(batch_size)]).astype(numpy.int32) # create start and end idx edge_idx = [] acc_node_size = [sum(node_size_list[:i+1]) for i in range(batch_size)] low = numpy.roll(acc_node_size + [0], 1)[0:batch_size+1] high = numpy.array(acc_node_size) for i in range(batch_size): idx = [numpy.random.choice(numpy.arange(low[i], high[i]), 2, replace=False) for _ in range(edge_size_list[i])] edge_idx.extend(idx) start_idx = numpy.array(edge_idx, dtype=numpy.int32)[:, 0] end_idx = numpy.array(edge_idx, dtype=numpy.int32)[:, 1] y_grad_atom = numpy.random.uniform( -1, 1, 
(batch_size, out_dim)).astype(numpy.float32) y_grad_pair = numpy.random.uniform( -1, 1, (batch_size, out_dim)).astype(numpy.float32) y_grad_global = numpy.random.uniform( -1, 1, (batch_size, out_dim)).astype(numpy.float32) return atom_feat, pair_feat, global_feat, \ atom_idx, pair_idx, start_idx, end_idx, \ y_grad_atom, y_grad_pair, y_grad_global def check_forward(update, data): y_actual = [cuda.to_cpu(d.data) for d in update(*data)] atom_feat, pair_feat, global_feat = y_actual assert atom_feat.shape == (sum(node_size_list), out_dim) assert pair_feat.shape == (sum(edge_size_list), out_dim) assert global_feat.shape == (batch_size, out_dim) def test_forward_cpu(update, data): atom_feat, pair_feat, global_feat, \ atom_idx, pair_idx, start_idx, end_idx = data[:-3] check_forward(update, (atom_feat, pair_feat, global_feat, atom_idx, pair_idx, start_idx, end_idx)) @pytest.mark.gpu def test_forward_gpu(update, data): input_data = [cuda.to_gpu(d) for d in data[:-3]] update.to_gpu() check_forward(update, tuple(input_data)) # def test_backward_cpu(update, data): # input_data, y_grad = data[0:-3], data[-3:] # gradient_check.check_backward(update, tuple(input_data), tuple(y_grad), # atol=5e-1, rtol=1e-1) # @pytest.mark.gpu # def test_backward_gpu(update, data): # data = [cuda.to_gpu(d) for d in data] # input_data, y_grad = data[0:-3], data[-3:] # update.to_gpu() # gradient_check.check_backward(update, tuple(input_data), tuple(y_grad), # atol=5e-1, rtol=1e-1) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/update_tests/test_mpnn_update.py ================================================ from typing import Tuple # NOQA import chainer # NOQA from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID from 
chainer_chemistry.links.update.mpnn_update import EdgeNet from chainer_chemistry.links.update.mpnn_update import MPNNUpdate atom_size = 5 hidden_channels = 4 batch_size = 3 num_edge_type = 7 @pytest.fixture def message(): # type: () -> EdgeNet return EdgeNet(out_channels=hidden_channels) @pytest.fixture def update(): # type: () -> MPNNUpdate return MPNNUpdate(hidden_channels=hidden_channels) @pytest.fixture def data(): # type: () -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray] # NOQA numpy.random.seed(0) atom_data = numpy.random.randint( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i') adj_data = numpy.random.randint( 0, high=2, size=(batch_size, num_edge_type, atom_size, atom_size)).astype('f') y_grad = numpy.random.uniform( -1, 1, (batch_size, atom_size, hidden_channels)).astype('f') y_grad_ = numpy.random.uniform( -1, 1, (batch_size, atom_size, hidden_channels)).astype('f') embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=hidden_channels) embed_atom_data = embed(atom_data).data return embed_atom_data, adj_data, y_grad, y_grad_ # Test Message Function def check_message_forward(message, atom_data, adj_data): # type: (EdgeNet, numpy.ndarray, numpy.ndarray) -> None y_actual = cuda.to_cpu(message(atom_data, adj_data).data) assert y_actual.shape == (batch_size, atom_size, hidden_channels * 2) def test_message_forward_cpu(message, data): # type: (EdgeNet, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data = data[:2] check_message_forward(message, atom_data, adj_data) @pytest.mark.gpu def test_message_forward_gpu(message, data): # type: (EdgeNet, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data = map(cuda.to_gpu, data[:2]) message.to_gpu() check_message_forward(message, atom_data, adj_data) def test_message_backward_cpu(message, data): # type: (EdgeNet, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None 
# NOQA atom_data, adj_data, y_grad, y_grad_ = data y_grad = numpy.concatenate([y_grad, y_grad_], axis=2) gradient_check.check_backward( message, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2) @pytest.mark.gpu def test_message_backward_gpu(message, data): # type: (EdgeNet, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data, y_grad, y_grad_ = map(cuda.to_gpu, data) xp = cuda.get_array_module(atom_data) y_grad = xp.concatenate([y_grad, y_grad_], axis=2) message.to_gpu() gradient_check.check_backward( message, (atom_data, adj_data), y_grad, atol=1e-1, rtol=1e-1) # Test Update Function def check_forward(update, atom_data, adj_data): # type: (MPNNUpdate, numpy.ndarray, numpy.ndarray) -> None y_actual = cuda.to_cpu(update(atom_data, adj_data).data) assert y_actual.shape == (batch_size, atom_size, hidden_channels) def test_forward_cpu(update, data): # type: (MPNNUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data = data[:2] check_forward(update, atom_data, adj_data) @pytest.mark.gpu def test_forward_gpu(update, data): # type: (MPNNUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data = map(cuda.to_gpu, data[:2]) update.to_gpu() check_forward(update, atom_data, adj_data) def check_backward(update, atom_data, adj_data, y_grad): # type: (MPNNUpdate, numpy.ndarray, numpy.ndarray, numpy.ndarray) -> None """Check gradient of MPNNUpdate. This function is different from other backward tests. Because of GRU, reset_state method has to be called explicitly before gradient calculation. 
Args: update (callable): atom_data (numpy.ndarray): adj_data (numpy.ndarray): y_grad (numpy.ndarray): """ def f(*args, **kwargs): update.reset_state() return update(*args, **kwargs) gradient_check.check_backward( f, (atom_data, adj_data), y_grad, atol=1e-1, rtol=1e-1) def test_backward_cpu(update, data): # type: (MPNNUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data, y_grad = data[:3] check_backward(update, atom_data, adj_data, y_grad) @pytest.mark.gpu def test_backward_gpu(update, data): # type: (MPNNUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None # NOQA atom_data, adj_data, y_grad = map(cuda.to_gpu, data[:3]) update.to_gpu() # gradient_check.check_backward(update, (atom_data, adj_data), y_grad, # atol=1e-1, rtol=1e-1) check_backward(update, atom_data, adj_data, y_grad) if __name__ == '__main__': pytest.main([__file__, '-v', '-s', '-x']) ================================================ FILE: tests/links_tests/update_tests/test_nfp_update.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID from chainer_chemistry.links.update.nfp_update import NFPUpdate from chainer_chemistry.utils.permutation import permute_adj from chainer_chemistry.utils.permutation import permute_node atom_size = 5 hidden_channels = 4 batch_size = 2 num_degree_type = 7 @pytest.fixture def update(): return NFPUpdate(in_channels=hidden_channels, out_channels=hidden_channels) @pytest.fixture def data(): numpy.random.seed(0) atom_data = numpy.random.randint( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i') adj_data = numpy.random.randint( 0, high=2, size=(batch_size, atom_size, atom_size)).astype('f') y_grad = numpy.random.uniform( -1, 1, (batch_size, atom_size, 
hidden_channels)).astype('f') embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=hidden_channels) embed_atom_data = embed(atom_data).data degree_mat = numpy.sum(adj_data, axis=1) deg_conds = numpy.array([numpy.broadcast_to( ((degree_mat - degree) == 0)[:, :, None], embed_atom_data.shape) for degree in range(1, num_degree_type + 1)]) return embed_atom_data, adj_data, deg_conds, y_grad def check_forward(update, atom_data, adj_data, deg_conds): y_actual = cuda.to_cpu(update(atom_data, adj_data, deg_conds).data) assert y_actual.shape == (batch_size, atom_size, hidden_channels) def test_forward_cpu(update, data): atom_data, adj_data, deg_conds = data[:3] check_forward(update, atom_data, adj_data, deg_conds) @pytest.mark.gpu def test_forward_gpu(update, data): atom_data, adj_data, deg_conds = map(cuda.to_gpu, data[:3]) update.to_gpu() check_forward(update, atom_data, adj_data, deg_conds) def test_backward_cpu(update, data): atom_data, adj_data, deg_conds, y_grad = data gradient_check.check_backward( update, (atom_data, adj_data, deg_conds), y_grad, atol=1e-3, rtol=1e-3) @pytest.mark.gpu def test_backward_gpu(update, data): atom_data, adj_data, deg_conds, y_grad = map(cuda.to_gpu, data) update.to_gpu() gradient_check.check_backward( update, (atom_data, adj_data, deg_conds), y_grad, atol=1e-3, rtol=1e-3) def test_forward_cpu_graph_invariant(update, data): atom_data, adj_data, deg_conds = data[:3] y_actual = cuda.to_cpu(update(atom_data, adj_data, deg_conds).data) permutation_index = numpy.random.permutation(atom_size) # atom_data: (batch_size, atom_size, hidden_channels) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) permute_adj_data = permute_adj(adj_data, permutation_index) # deg_conds: (num_degree_type, batch_size, atom_size, hidden_channels) permute_deg_conds = permute_node(deg_conds, permutation_index, axis=2) permute_y_actual = cuda.to_cpu(update( permute_atom_data, permute_adj_data, permute_deg_conds).data) numpy.testing.assert_allclose( 
permute_node(y_actual, permutation_index, axis=1), permute_y_actual, rtol=1e-5, atol=1e-5) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/update_tests/test_relgat_update.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID from chainer_chemistry.links.update.relgat_update import RelGATUpdate from chainer_chemistry.utils.permutation import permute_adj from chainer_chemistry.utils.permutation import permute_node in_channels = 3 out_channels = 4 atom_size = 5 batch_size = 2 num_edge_type = 7 @pytest.fixture def update(): return RelGATUpdate(in_channels=in_channels, out_channels=out_channels, n_edge_types=num_edge_type) @pytest.fixture def data(): numpy.random.seed(0) atom_data = numpy.random.randint( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i') adj_data = numpy.random.randint( 0, high=2, size=(batch_size, num_edge_type, atom_size, atom_size) ).astype('f') y_grad = numpy.random.uniform( -1, 1, (batch_size, atom_size, out_channels)).astype('f') embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels) embed_atom_data = embed(atom_data).data return embed_atom_data, adj_data, y_grad def check_forward(update, atom_data, adj_data): y_actual = cuda.to_cpu(update(atom_data, adj_data).data) assert y_actual.shape == (batch_size, atom_size, out_channels) def test_forward_cpu(update, data): atom_data, adj_data = data[:2] check_forward(update, atom_data, adj_data) @pytest.mark.gpu def test_forward_gpu(update, data): atom_data, adj_data = map(cuda.to_gpu, data[:2]) update.to_gpu() check_forward(update, atom_data, adj_data) def test_backward_cpu(update, data): atom_data, adj_data, y_grad = data # gradient_check.check_backward( # update, (atom_data, adj_data), 
y_grad, atol=1e-3, rtol=1e-3) params = tuple(update.params()) # NOQA gradient_check.check_backward(update, (atom_data, adj_data), y_grad, no_grads=[False, True], atol=1e-1, rtol=1e-1) @pytest.mark.gpu def test_backward_gpu(update, data): atom_data, adj_data, y_grad = map(cuda.to_gpu, data) update.to_gpu() gradient_check.check_backward( update, (atom_data, adj_data), y_grad, atol=1e-1, rtol=1e-1) def test_forward_cpu_graph_invariant(update, data): atom_data, adj_data = data[:2] y_actual = cuda.to_cpu(update(atom_data, adj_data).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) permute_adj_data = permute_adj(adj_data, permutation_index) permute_y_actual = cuda.to_cpu( update(permute_atom_data, permute_adj_data).data) numpy.testing.assert_allclose( permute_node(y_actual, permutation_index, axis=1), permute_y_actual, rtol=1e-5, atol=1e-5) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/update_tests/test_relgcn_update.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID from chainer_chemistry.links.update.relgcn_update import RelGCNUpdate from chainer_chemistry.utils.permutation import permute_adj from chainer_chemistry.utils.permutation import permute_node atom_size = 5 in_channels = 3 hidden_dim = 4 batch_size = 2 n_edge_types = 7 @pytest.fixture def update(): return RelGCNUpdate(in_channels=in_channels, out_channels=hidden_dim, n_edge_types=n_edge_types) @pytest.fixture def data(): numpy.random.seed(0) atom_data = numpy.random.randint( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i') adj_data = numpy.random.randint( 0, high=2, size=(batch_size, n_edge_types, atom_size, 
atom_size) ).astype('f') y_grad = numpy.random.uniform( -1, 1, (batch_size, atom_size, hidden_dim)).astype('f') embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels) embed_atom_data = embed(atom_data).data return embed_atom_data, adj_data, y_grad def check_forward(update, atom_data, adj_data): y_actual = cuda.to_cpu(update(atom_data, adj_data).data) assert y_actual.shape == (batch_size, atom_size, hidden_dim) def test_forward_cpu(update, data): atom_data, adj_data = data[:2] check_forward(update, atom_data, adj_data) @pytest.mark.gpu def test_forward_gpu(update, data): atom_data, adj_data = map(cuda.to_gpu, data[:2]) update.to_gpu() check_forward(update, atom_data, adj_data) def test_backward_cpu(update, data): atom_data, adj_data, y_grad = data gradient_check.check_backward( update, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2) @pytest.mark.gpu def test_backward_gpu(update, data): atom_data, adj_data, y_grad = map(cuda.to_gpu, data) update.to_gpu() gradient_check.check_backward( update, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2) def test_forward_cpu_graph_invariant(update, data): atom_data, adj_data = data[:2] y_actual = cuda.to_cpu(update(atom_data, adj_data).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) permute_adj_data = permute_adj(adj_data, permutation_index) permute_y_actual = cuda.to_cpu( update(permute_atom_data, permute_adj_data).data) numpy.testing.assert_allclose( permute_node(y_actual, permutation_index, axis=1), permute_y_actual, rtol=1e-5, atol=1e-5) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/update_tests/test_rsgcn_update.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from 
chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID from chainer_chemistry.links.update.rsgcn_update import RSGCNUpdate from chainer_chemistry.utils.permutation import permute_adj from chainer_chemistry.utils.permutation import permute_node atom_size = 5 in_channels = 3 hidden_dim = 4 batch_size = 2 num_edge_type = 7 @pytest.fixture def update(): return RSGCNUpdate(in_channels=in_channels, out_channels=hidden_dim) @pytest.fixture def data(): numpy.random.seed(0) atom_data = numpy.random.randint( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i') adj_data = numpy.random.randint( 0, high=2, size=(batch_size, atom_size, atom_size)).astype('f') y_grad = numpy.random.uniform( -1, 1, (batch_size, atom_size, hidden_dim)).astype('f') embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels) embed_atom_data = embed(atom_data).data return embed_atom_data, adj_data, y_grad def check_forward(update, atom_data, adj_data): y_actual = cuda.to_cpu(update(atom_data, adj_data).data) assert y_actual.shape == (batch_size, atom_size, hidden_dim) def test_forward_cpu(update, data): atom_data, adj_data = data[:2] check_forward(update, atom_data, adj_data) @pytest.mark.gpu def test_forward_gpu(update, data): atom_data, adj_data = map(cuda.to_gpu, data[:2]) update.to_gpu() check_forward(update, atom_data, adj_data) def test_backward_cpu(update, data): atom_data, adj_data, y_grad = data gradient_check.check_backward( update, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3) @pytest.mark.gpu def test_backward_gpu(update, data): atom_data, adj_data, y_grad = map(cuda.to_gpu, data) update.to_gpu() gradient_check.check_backward( update, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3) def test_forward_cpu_graph_invariant(update, data): atom_data, adj_data = data[:2] y_actual = cuda.to_cpu(update(atom_data, adj_data).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) 
permute_adj_data = permute_adj(adj_data, permutation_index) permute_y_actual = cuda.to_cpu( update(permute_atom_data, permute_adj_data).data) numpy.testing.assert_allclose( permute_node(y_actual, permutation_index, axis=1), permute_y_actual, rtol=1e-5, atol=1e-5) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/links_tests/update_tests/test_schnet_update.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID from chainer_chemistry.links.update.schnet_update import SchNetUpdate from chainer_chemistry.utils.permutation import permute_adj from chainer_chemistry.utils.permutation import permute_node atom_size = 5 in_channels = 4 hidden_channels = in_channels # must be same for now batch_size = 2 @pytest.fixture def update(): return SchNetUpdate(hidden_channels=hidden_channels) @pytest.fixture def data(): numpy.random.seed(0) atom_data = numpy.random.randint( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size) ).astype('i') # symmetric matrix dist_data = numpy.random.uniform( 0, high=30, size=(batch_size, atom_size, atom_size)).astype('f') dist_data = (dist_data + dist_data.swapaxes(-1, -2)) / 2. 
y_grad = numpy.random.uniform( -1, 1, (batch_size, atom_size, hidden_channels)).astype('f') embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels) embed_atom_data = embed(atom_data).data return embed_atom_data, dist_data, y_grad def check_forward(update, atom_data, dist_data): y_actual = cuda.to_cpu(update(atom_data, dist_data).data) assert y_actual.shape == (batch_size, atom_size, hidden_channels) def test_forward_cpu(update, data): atom_data, dist_data = data[:2] check_forward(update, atom_data, dist_data) @pytest.mark.gpu def test_forward_gpu(update, data): atom_data, dist_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1]) update.to_gpu() check_forward(update, atom_data, dist_data) def test_backward_cpu(update, data): atom_data, dist_data, y_grad = data gradient_check.check_backward( update, (atom_data, dist_data), y_grad, atol=1e-3, rtol=1e-3) @pytest.mark.gpu def test_backward_gpu(update, data): atom_data, dist_data, y_grad = map(cuda.to_gpu, data) update.to_gpu() gradient_check.check_backward( update, (atom_data, dist_data), y_grad, atol=1e-3, rtol=1e-3) def test_forward_cpu_graph_invariant(update, data): atom_data, dist_data = data[:2] y_actual = cuda.to_cpu(update(atom_data, dist_data).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index, axis=1) permute_dist_data = permute_adj(dist_data, permutation_index) permute_y_actual = cuda.to_cpu(update( permute_atom_data, permute_dist_data).data) numpy.testing.assert_allclose( permute_node(y_actual, permutation_index, axis=1), permute_y_actual, rtol=1e-5, atol=1e-5) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/models_tests/gwm_tests/test_gwm.py ================================================ from chainer import cuda from chainer import functions from chainer import gradient_check import numpy import pytest from chainer_chemistry.models.gwm.gwm import GWM, 
WarpGateUnit, SuperNodeTransmitterUnit, GraphTransmitterUnit # NOQA from chainer_chemistry.utils.permutation import permute_node atom_size = 5 hidden_dim = 4 supernode_dim = 7 batch_size = 2 num_edge_type = 2 @pytest.fixture def graph_warp_gate_unit(): return WarpGateUnit(output_type='graph', hidden_dim=hidden_dim) @pytest.fixture def super_warp_gate_unit(): return WarpGateUnit(output_type='super', hidden_dim=supernode_dim) @pytest.fixture def super_node_transmitter_unit(): return SuperNodeTransmitterUnit(hidden_dim_super=supernode_dim, hidden_dim=hidden_dim) @pytest.fixture def graph_transmitter_unit(): return GraphTransmitterUnit(hidden_dim_super=supernode_dim, hidden_dim=hidden_dim) @pytest.fixture def gwm(): # relu is difficult to test return GWM(hidden_dim=hidden_dim, hidden_dim_super=supernode_dim, n_layers=2, activation=functions.identity, wgu_activation=functions.identity, gtu_activation=functions.identity) @pytest.fixture def data(): numpy.random.seed(0) # too difficult to pass unit test by using EmbedAtomID embed_atom_data = numpy.random.uniform( -0.01, 0.01, (batch_size, atom_size, hidden_dim)).astype('f') new_embed_atom_data = numpy.random.uniform( -0.01, 0.01, (batch_size, atom_size, hidden_dim)).astype('f') y_grad = numpy.random.uniform( -0.01, 0.01, (batch_size, atom_size, hidden_dim)).astype('f') supernode = numpy.random.uniform( -0.01, 0.01, (batch_size, supernode_dim)).astype('f') supernode_grad = numpy.random.uniform( -0.01, 0.01, (batch_size, supernode_dim)).astype('f') return embed_atom_data, new_embed_atom_data, supernode, y_grad,\ supernode_grad def test_graph_transmitter_unit_forward(graph_transmitter_unit, data): embed_atom_data = data[0] supernode = data[2] h_trans = graph_transmitter_unit(embed_atom_data, supernode) assert h_trans.array.shape == (batch_size, supernode_dim) def test_graph_transmitter_unit_backward(graph_transmitter_unit, data): embed_atom_data = data[0] supernode = data[2] supernode_grad = data[4] 
gradient_check.check_backward(graph_transmitter_unit, (embed_atom_data, supernode), supernode_grad, eps=0.1) def test_super_node_transmitter_unit_forward(super_node_transmitter_unit, data): supernode = data[2] g_trans = super_node_transmitter_unit(supernode, atom_size) assert g_trans.array.shape == (batch_size, atom_size, hidden_dim) def test_super_node_transmitter_unit_backward(super_node_transmitter_unit, data): supernode = data[2] y_grad = data[3] gradient_check.check_backward( lambda x: super_node_transmitter_unit(x, atom_size), supernode, y_grad) def test_graph_warp_gate_unit_forward(graph_warp_gate_unit, data): embed_atom_data = data[0] new_embed_atom_data = data[1] merged = graph_warp_gate_unit(embed_atom_data, new_embed_atom_data) assert merged.array.shape == (batch_size, atom_size, hidden_dim) def test_graph_warp_gate_unit_backward(graph_warp_gate_unit, data): embed_atom_data = data[0] new_embed_atom_data = data[1] y_grad = data[3] gradient_check.check_backward(graph_warp_gate_unit, (embed_atom_data, new_embed_atom_data), y_grad, eps=0.01) def test_super_warp_gate_unit_forward(super_warp_gate_unit, data): supernode = data[2] merged = super_warp_gate_unit(supernode, supernode) assert merged.array.shape == (batch_size, supernode_dim) def test_super_warp_gate_unit_backward(super_warp_gate_unit, data): supernode = data[2] supernode_grad = data[4] gradient_check.check_backward(super_warp_gate_unit, (supernode, supernode), supernode_grad, eps=0.01) def check_forward(gwm, embed_atom_data, new_embed_atom_data, supernode): gwm.reset_state() h_actual, g_actual = gwm(embed_atom_data, new_embed_atom_data, supernode) assert h_actual.array.shape == (batch_size, atom_size, hidden_dim) assert g_actual.array.shape == (batch_size, supernode_dim) def test_forward_cpu(gwm, data): embed_atom_data, new_embed_atom_data, supernode = data[:3] check_forward(gwm, embed_atom_data, new_embed_atom_data, supernode) @pytest.mark.gpu def test_forward_gpu(gwm, data): embed_atom_data, 
new_embed_atom_data, supernode = data[:3] embed_atom_data = cuda.to_gpu(embed_atom_data) new_embed_atom_data = cuda.to_gpu(new_embed_atom_data) supernode = cuda.to_gpu(supernode) gwm.to_gpu() check_forward(gwm, embed_atom_data, new_embed_atom_data, supernode) def check_backward(gwm, embed_atom_data, new_embed_atom_data, supernode, y_grad, supernode_grad): gwm.reset_state() # TODO(nakago): rtol is too high! GWM is too large to calculate # numerical differentiation gradient_check.check_backward(gwm, (embed_atom_data, new_embed_atom_data, supernode), (y_grad, supernode_grad), eps=0.1, rtol=1e1) def test_backward_cpu(gwm, data): check_backward(gwm, *data) @pytest.mark.gpu def test_backward_gpu(gwm, data): gwm.to_gpu() check_backward(gwm, *map(cuda.to_gpu, data)) def test_forward_cpu_graph_invariant(gwm, data): permutation_index = numpy.random.permutation(atom_size) gwm.reset_state() embed_atom_data, new_embed_atom_data, supernode = data[:3] h_actual, g_actual = gwm(embed_atom_data, new_embed_atom_data, supernode) permute_embed_atom_data = permute_node( embed_atom_data, permutation_index, axis=1) permute_new_embed_atom_data = permute_node( new_embed_atom_data, permutation_index, axis=1) gwm.reset_state() permute_h_actual, permute_g_actual = gwm( permute_embed_atom_data, permute_new_embed_atom_data, supernode) numpy.testing.assert_allclose( permute_node(h_actual.data, permutation_index, axis=1), permute_h_actual.data, rtol=1e-5, atol=1e-5) numpy.testing.assert_allclose(g_actual.data, permute_g_actual.data, rtol=1e-5, atol=1e-5) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/models_tests/gwm_tests/test_gwm_graph_conv_model.py ================================================ import itertools import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.links.readout.general_readout import GeneralReadout from chainer_chemistry.links.readout.ggnn_readout 
import GGNNReadout from chainer_chemistry.links.readout.nfp_readout import NFPReadout from chainer_chemistry.links.readout.schnet_readout import SchNetReadout from chainer_chemistry.links.update.ggnn_update import GGNNUpdate from chainer_chemistry.links.update.gin_update import GINUpdate from chainer_chemistry.links.update.relgat_update import RelGATUpdate from chainer_chemistry.links.update.relgcn_update import RelGCNUpdate from chainer_chemistry.links.update.rsgcn_update import RSGCNUpdate from chainer_chemistry.models.gwm.gwm_graph_conv_model import GWMGraphConvModel atom_size = 5 super_dim = 7 in_channels = 6 out_dim = 4 batch_size = 2 n_edge_types = 3 # TODO(nakago): SchNetUpdate need `in_channels` kwargs, not supported. updates_2dim = [GINUpdate, RSGCNUpdate] # TODO(nakago): Support MPNNUpdate. updates_3dim = [GGNNUpdate, RelGATUpdate, RelGCNUpdate] updates = updates_2dim + updates_3dim # TODO(nakago): MPNNReadout need to specify `in_channels` and not supported. readouts = [GGNNReadout, NFPReadout, SchNetReadout] hidden_channels = [[6, 6, 6, 6], 6] use_bn = [True, False] use_weight_tying = [True, False] params = list(itertools.product( updates, readouts, hidden_channels, use_bn, use_weight_tying, )) @pytest.fixture(params=params) def plain_context(request): update, readout, ch, bn, wt = request.param if update in updates_3dim: adj_type = 3 elif update in updates_2dim: adj_type = 2 else: raise ValueError data = make_data(adj_type) model = make_model(update, readout, ch, bn, wt) return model, data @pytest.fixture(params=params) def gwm_context(request): update, readout, ch, bn, wt = request.param if update in updates_3dim: adj_type = 3 elif update in updates_2dim: adj_type = 2 else: raise ValueError data = make_data(adj_type) model = make_gwm_model(update, readout, ch, bn, wt) return model, data def make_model(update, readout, ch, bn, wt): # print('update', update, 'readout', readout, 'ch', ch, 'bn', bn, 'wt', wt) return GWMGraphConvModel( update_layer=update, 
readout_layer=readout, n_update_layers=3, hidden_channels=ch, n_edge_types=n_edge_types, weight_tying=wt, out_dim=out_dim, with_gwm=False, use_batchnorm=bn) def make_gwm_model(update, readout, ch, bn, wt): return GWMGraphConvModel( update_layer=update, readout_layer=readout, n_update_layers=3, hidden_channels=ch, n_edge_types=n_edge_types, weight_tying=wt, super_node_dim=super_dim, out_dim=out_dim, with_gwm=True, use_batchnorm=bn) def make_data(adj_type): numpy.random.seed(0) atom_data = numpy.random.randint( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size) ).astype(numpy.int32) if adj_type == 2: adj_data = numpy.random.randint( 0, high=2, size=(batch_size, atom_size, atom_size) ).astype(numpy.float32) elif adj_type == 3: adj_data = numpy.random.randint( 0, high=2, size=(batch_size, n_edge_types, atom_size, atom_size) ).astype(numpy.float32) else: raise ValueError super_data = numpy.random.uniform(-1, 1, (batch_size, super_dim) ).astype(numpy.float32) y_grad = numpy.random.uniform( -1, 1, (batch_size, out_dim)).astype(numpy.float32) return atom_data, adj_data, super_data, y_grad def test_plain_model_forward(plain_context): model, data = plain_context atom_array = data[0] adj = data[1] y_actual = model(atom_array, adj) assert y_actual.shape == (batch_size, out_dim) if model.weight_tying: assert len(model.update_layers) == 1 else: assert len(model.update_layers) == 3 def test_gwm_model_forward(gwm_context): model, data = gwm_context atom_array = data[0] adj = data[1] super_node = data[2] y_actual = model(atom_array, adj, super_node) assert y_actual.shape == (batch_size, out_dim) if model.weight_tying: assert len(model.update_layers) == 1 else: assert len(model.update_layers) == 3 # SchNet is not supported sp_params = list(itertools.product( updates_2dim[:-1] + updates_3dim, [[6, 6, 6, 6], [4, 4, 4, 4], [6, 5, 3, 4]], )) @pytest.mark.parametrize(('update', 'ch'), sp_params) def test_plain_model_forward_general_readout( update, ch): if update in updates_3dim: 
adj_type = 3 elif update in updates_2dim: adj_type = 2 else: raise ValueError data = make_data(adj_type) model = GWMGraphConvModel(update_layer=update, readout_layer=GeneralReadout, hidden_channels=ch, out_dim=out_dim, n_edge_types=n_edge_types, with_gwm=False) atom_array = data[0] adj = data[1] y_actual = model(atom_array, adj) assert y_actual.shape == (batch_size, out_dim) @pytest.mark.parametrize('update', updates_2dim[:-1] + updates_3dim) def test_gwm_model_forward_general_readout(update): if update in updates_3dim: adj_type = 3 elif update in updates_2dim: adj_type = 2 else: raise ValueError data = make_data(adj_type) ch = [6, 6, 6, 6] with pytest.raises(ValueError): model = GWMGraphConvModel(update_layer=update, readout_layer=GeneralReadout, hidden_channels=ch, out_dim=out_dim, n_edge_types=n_edge_types, super_node_dim=super_dim, with_gwm=True) ch = [4, 4, 4, 4] model = GWMGraphConvModel(update_layer=update, readout_layer=GeneralReadout, hidden_channels=ch, out_dim=out_dim, n_edge_types=n_edge_types, super_node_dim=super_dim, with_gwm=True) atom_array = data[0] adj = data[1] super_node = data[2] y_actual = model(atom_array, adj, super_node) assert y_actual.shape == (batch_size, out_dim) p = list(itertools.product(updates_2dim[:-1] + updates_3dim, readouts, [True, False])) @pytest.mark.parametrize(('update', 'readout', 'gwm'), p) def test_model_forward_general_weight_tying(update, readout, gwm): if update in updates_3dim: adj_type = 3 elif update in updates_2dim: adj_type = 2 else: raise ValueError data = make_data(adj_type) ch = [6, 7, 8, 6] if gwm: with pytest.raises(ValueError): model = GWMGraphConvModel(update_layer=update, readout_layer=GeneralReadout, hidden_channels=ch, out_dim=out_dim, n_edge_types=n_edge_types, super_node_dim=super_dim, with_gwm=gwm) else: model = GWMGraphConvModel(update_layer=update, readout_layer=GeneralReadout, hidden_channels=ch, out_dim=out_dim, n_edge_types=n_edge_types, super_node_dim=super_dim, with_gwm=gwm) atom_array = 
data[0] adj = data[1] super_node = data[2] # NOQA y_actual = model(atom_array, adj) assert y_actual.shape == (batch_size, out_dim) @pytest.mark.parametrize(('update', 'readout', 'gwm'), p) def test_model_forward_general_concat_hidden(update, readout, gwm): if update in updates_3dim: adj_type = 3 elif update in updates_2dim: adj_type = 2 else: raise ValueError data = make_data(adj_type) ch = [6, 6, 6, 6] model = GWMGraphConvModel(update_layer=update, readout_layer=readout, hidden_channels=ch, out_dim=out_dim, n_edge_types=n_edge_types, super_node_dim=super_dim, concat_hidden=True, with_gwm=gwm) atom_array = data[0] adj = data[1] super_node = data[2] y_actual = model(atom_array, adj, super_node) assert y_actual.shape == (batch_size, out_dim * (len(ch) - 1)) @pytest.mark.parametrize(('update', 'readout', 'gwm'), p) def test_model_forward_general_sum_hidden(update, readout, gwm): if update in updates_3dim: adj_type = 3 elif update in updates_2dim: adj_type = 2 else: raise ValueError data = make_data(adj_type) ch = [6, 6, 6, 6] model = GWMGraphConvModel(update_layer=update, readout_layer=readout, hidden_channels=ch, out_dim=out_dim, n_edge_types=n_edge_types, super_node_dim=super_dim, sum_hidden=True, with_gwm=gwm) atom_array = data[0] adj = data[1] super_node = data[2] y_actual = model(atom_array, adj, super_node) assert y_actual.shape == (batch_size, out_dim) if __name__ == '__main__': # -x is to stop when first failed. 
    pytest.main([__file__, '-v', '-s', '-x'])


================================================ FILE: tests/models_tests/prediction_tests/test_base.py ================================================


import os

import chainer
from chainer import cuda
import numpy
import pytest

from chainer_chemistry.models.prediction.base import BaseForwardModel


class DummyForwardModel(BaseForwardModel):
    """Minimal BaseForwardModel subclass used to exercise pickle save/load."""

    def __init__(self, device=-1, dummy_str='dummy'):
        super(DummyForwardModel, self).__init__()
        with self.init_scope():
            self.l = chainer.links.Linear(3, 10)
        # non-parameter attribute, used to check attributes survive pickling
        self.dummy_str = dummy_str
        self.initialize(device)

    def __call__(self, x):
        return self.l(x)


# test `_forward` is done by `Classifier` and `Regressor` concrete class.


def _test_save_load_pickle(device, tmpdir):
    """Round-trip a model through save_pickle/load_pickle on `device` and
    check class, attributes and parameters are preserved."""
    model = DummyForwardModel(device=device, dummy_str='hoge')
    filepath = os.path.join(str(tmpdir), 'model.pkl')
    model.save_pickle(filepath)
    model_load = DummyForwardModel.load_pickle(filepath, device=device)

    # --- check model class ---
    assert isinstance(model_load, DummyForwardModel)
    # --- check model attribute is same ---
    assert model_load.dummy_str == model.dummy_str
    assert model_load.dummy_str == 'hoge'
    assert model_load.device == chainer.get_device(device)
    # --- check model parameter is same ---
    params = model.namedparams()
    params_load = dict(model_load.namedparams())
    for k, v in params:
        v_load = params_load[k]
        # loaded parameters must live on the requested device (-1 == CPU)
        assert cuda.get_device_from_array(v_load.data).id == device
        assert numpy.allclose(cuda.to_cpu(v.data), cuda.to_cpu(v_load.data))


def test_save_load_pickle_cpu(tmpdir):
    _test_save_load_pickle(device=-1, tmpdir=tmpdir)


@pytest.mark.gpu
def test_save_load_pickle_gpu(tmpdir):
    _test_save_load_pickle(device=0, tmpdir=tmpdir)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================ FILE: tests/models_tests/prediction_tests/test_classifier.py ================================================


import mock
import numpy
import pytest

import chainer
from chainer import cuda
from
chainer import functions
from chainer import links
from chainer import reporter

from chainer_chemistry.models.prediction.classifier import Classifier


# testing.parameterize takes a list of dictionaries.
# Currently, we cannot set a function to the value of the dictionaries.
# As a workaround, we wrap the function and invoke it in __call__ method.
# See issue #1337 for detail.
class AccuracyWithIgnoreLabel(object):

    def __call__(self, y, t):
        return functions.accuracy(y, t, ignore_label=1)


class DummyPredictor(chainer.Chain):
    """Identity predictor: returns its input unchanged."""

    def __call__(self, x):
        return x


@pytest.mark.parametrize(
    'metrics_fun', [AccuracyWithIgnoreLabel(), None,
                    {'user_key': AccuracyWithIgnoreLabel()}])
@pytest.mark.parametrize('compute_metrics', [True, False])
class TestClassifier(object):
    """Tests for Classifier.__call__ argument handling and reporting."""

    def setup_method(self, method):
        self.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)
        self.t = numpy.random.randint(3, size=5).astype(numpy.int32)
        self.y = numpy.random.uniform(-1, 1, (5, 7)).astype(numpy.float32)

    def check_call(
            self, gpu, label_key, args, kwargs, model_args, model_kwargs,
            metrics_fun, compute_metrics):
        """Call the link and check the predictor receives exactly
        ``model_args``/``model_kwargs`` (the label stripped per
        ``label_key``) and that y/loss/metrics attributes are set."""
        init_kwargs = {'label_key': label_key}
        if metrics_fun is not None:
            init_kwargs['metrics_fun'] = metrics_fun
        link = Classifier(chainer.Link(), **init_kwargs)
        if gpu:
            xp = cuda.cupy
            link.to_gpu()
        else:
            xp = numpy
        link.compute_metrics = compute_metrics

        # replace the predictor with a mock so the exact call can be asserted
        y = chainer.Variable(self.y)
        link.predictor = mock.MagicMock(return_value=y)

        loss = link(*args, **kwargs)
        link.predictor.assert_called_with(*model_args, **model_kwargs)

        assert hasattr(link, 'y')
        assert link.y is not None
        assert hasattr(link, 'loss')
        xp.testing.assert_allclose(link.loss.data, loss.data)
        assert hasattr(link, 'metrics')
        if compute_metrics:
            assert link.metrics is not None
        else:
            assert link.metrics is None

    def test_call_cpu(self, metrics_fun, compute_metrics):
        self.check_call(
            False, -1, (self.x, self.t), {}, (self.x,), {}, metrics_fun,
            compute_metrics)

    def test_call_three_args_cpu(self, metrics_fun, compute_metrics):
        self.check_call(
            False, -1, (self.x, self.x, self.t), {}, (self.x, self.x), {},
            metrics_fun, compute_metrics)

    def test_call_positive_cpu(self, metrics_fun, compute_metrics):
        self.check_call(
            False, 2, (self.x, self.x, self.t), {}, (self.x, self.x), {},
            metrics_fun, compute_metrics)

    def test_call_kwargs_cpu(self, metrics_fun, compute_metrics):
        self.check_call(
            False, 't', (self.x,), {'t': self.t}, (self.x,), {}, metrics_fun,
            compute_metrics)

    def test_call_no_arg_cpu(self, metrics_fun, compute_metrics):
        self.check_call(
            False, 0, (self.t,), {}, (), {}, metrics_fun, compute_metrics)

    @pytest.mark.gpu
    def test_call_gpu(self, metrics_fun, compute_metrics):
        self.to_gpu()
        self.check_call(
            True, -1, (self.x, self.t), {}, (self.x,), {}, metrics_fun,
            compute_metrics)

    @pytest.mark.gpu
    def test_call_three_args_gpu(self, metrics_fun, compute_metrics):
        self.to_gpu()
        self.check_call(
            True, -1, (self.x, self.x, self.t), {}, (self.x, self.x), {},
            metrics_fun, compute_metrics)

    @pytest.mark.gpu
    def test_call_positive_gpu(self, metrics_fun, compute_metrics):
        self.to_gpu()
        self.check_call(
            True, 2, (self.x, self.x, self.t), {}, (self.x, self.x), {},
            metrics_fun, compute_metrics)

    @pytest.mark.gpu
    def test_call_kwargs_gpu(self, metrics_fun, compute_metrics):
        self.to_gpu()
        self.check_call(
            True, 't', (self.x,), {'t': self.t}, (self.x,), {}, metrics_fun,
            compute_metrics)

    @pytest.mark.gpu
    def test_call_no_arg_gpu(self, metrics_fun, compute_metrics):
        self.to_gpu()
        self.check_call(
            True, 0, (self.t,), {}, (), {}, metrics_fun, compute_metrics)

    def to_gpu(self):
        self.x = cuda.to_gpu(self.x)
        self.t = cuda.to_gpu(self.t)
        self.y = cuda.to_gpu(self.y)

    def test_report_key(self, metrics_fun, compute_metrics):
        """Check which observation keys the link reports for each
        metrics_fun form (None / dict / callable)."""
        repo = chainer.Reporter()
        link = Classifier(predictor=DummyPredictor(), metrics_fun=metrics_fun)
        link.compute_metrics = compute_metrics
        repo.add_observer('target', link)
        with repo:
            observation = {}
            with reporter.report_scope(observation):
                link(self.x, self.t)

        # print('observation ', observation)
        actual_keys = set(observation.keys())
        if compute_metrics:
            if metrics_fun is None:
                assert set(['target/loss']) == actual_keys
            elif isinstance(metrics_fun, dict):
                assert set(['target/loss', 'target/user_key']) == actual_keys
            elif callable(metrics_fun):
                assert set(['target/loss', 'target/accuracy']) == actual_keys
            else:
                raise TypeError()
        else:
            assert set(['target/loss']) == actual_keys


class TestInvalidArgument(object):
    """Calling with too few arguments must raise TypeError."""

    @classmethod
    def setup_class(cls):
        cls.link = Classifier(links.Linear(10, 3))
        cls.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)

    def check_invalid_argument(self):
        x = chainer.Variable(self.link.xp.asarray(self.x))
        with pytest.raises(TypeError):
            # link.__call__ raises TypeError as the number of arguments
            # is illegal
            self.link(x)

    def test_invalid_argument_cpu(self):
        self.check_invalid_argument()

    @pytest.mark.gpu
    def test_invalid_argument_gpu(self):
        self.link.to_gpu()
        self.check_invalid_argument()


class TestInvalidLabelKey(object):
    """label_key must be int (in range) or an existing kwarg name."""

    @classmethod
    def setup_class(cls):
        cls.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)

    def test_invalid_label_key_type(self):
        with pytest.raises(TypeError):
            Classifier(links.Linear(10, 3), label_key=None)

    def check_invalid_key(self, gpu, label_key):
        link = Classifier(links.Linear(10, 3), label_key=label_key)
        if gpu:
            link.to_gpu()
        x = chainer.Variable(link.xp.asarray(self.x))
        with pytest.raises(ValueError):
            link(x)

    def test_invalid_index_cpu(self):
        self.check_invalid_key(False, 1)

    # NOTE(review): name duplicates TestInvalidArgument's test; probably
    # meant test_invalid_index_gpu -- confirm before renaming.
    @pytest.mark.gpu
    def test_invalid_argument_gpu(self):
        self.check_invalid_key(True, 1)

    def test_invalid_index_too_small_cpu(self):
        self.check_invalid_key(False, -2)

    @pytest.mark.gpu
    def test_invalid_index_too_small_gpu(self):
        self.check_invalid_key(True, -2)

    def test_invalid_str_key_cpu(self):
        self.check_invalid_key(False, 't')

    @pytest.mark.gpu
    def test_invalid_str_key_gpu(self):
        self.check_invalid_key(True, 't')


class TestClassifierPrediction(object):
    """Tests for Classifier.predict / predict_proba."""

    @classmethod
    def setup_class(cls):
        cls.predictor = DummyPredictor()
cls.x = numpy.array([[0., 1.], [-1., -2.], [4., 0.]], dtype=numpy.float32) cls.t = numpy.array([1, 0, 0], dtype=numpy.int32) def test_predict_cpu(self): clf = Classifier(self.predictor) actual_t = clf.predict(self.x) assert actual_t.shape == (3,) assert actual_t.dtype == numpy.int32 assert numpy.alltrue(actual_t == self.t) @pytest.mark.gpu def test_predict_gpu(self): clf = Classifier(self.predictor, device=0) actual_t = clf.predict(self.x) assert numpy.alltrue(actual_t == self.t) def check_predict_proba(self, device): clf = Classifier(self.predictor, device=device) actual_y = clf.predict_proba(self.x) assert actual_y.shape == (3, 2) assert actual_y.dtype == numpy.float32 assert numpy.alltrue(0 <= actual_y) assert numpy.alltrue(actual_y <= 1.) actual_t = numpy.argmax(actual_y, axis=1) assert numpy.alltrue(actual_t == self.t) def test_predict_proba_cpu(self): self.check_predict_proba(-1) @pytest.mark.gpu def test_predict_proba_gpu(self): self.check_predict_proba(0) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/models_tests/prediction_tests/test_graph_conv_predictor.py ================================================ from typing import Tuple # NOQA from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.models.ggnn import GGNN from chainer_chemistry.models.mlp import MLP from chainer_chemistry.models.prediction import GraphConvPredictor from chainer_chemistry.utils.permutation import permute_adj from chainer_chemistry.utils.permutation import permute_node atom_size = 5 class_num = 7 n_unit = 11 out_dim = 4 batch_size = 2 n_edge_types = 3 @pytest.fixture def model(): # type: () -> GraphConvPredictor mlp = MLP(out_dim=class_num, hidden_dim=n_unit) ggnn = GGNN( out_dim=out_dim, hidden_channels=n_unit, n_edge_types=n_edge_types) return GraphConvPredictor(ggnn, mlp) @pytest.fixture def 
data():
    # type: () -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]
    # fixed seed so CPU/GPU tests see identical inputs
    numpy.random.seed(0)
    atom_data = numpy.random.randint(
        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')
    adj_data = numpy.random.randint(
        0, high=2,
        size=(batch_size, n_edge_types, atom_size, atom_size)).astype('f')
    y_grad = numpy.random.uniform(-1, 1, (batch_size, class_num)).astype('f')
    return atom_data, adj_data, y_grad


def check_forward(model, atom_data, adj_data):
    # type: (GraphConvPredictor, numpy.ndarray, numpy.ndarray) -> None
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)
    assert y_actual.shape == (batch_size, class_num)


def test_forward_cpu(model, data):
    # type: (GraphConvPredictor, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA
    check_forward(model, *data[:2])


@pytest.mark.gpu
def test_forward_gpu(model, data):
    # type: (GraphConvPredictor, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA
    atom_data, adj_data = map(cuda.to_gpu, data[:2])
    model.to_gpu()
    check_forward(model, atom_data, adj_data)


def test_backward_cpu(model, data):
    # type: (GraphConvPredictor, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA
    atom_data, adj_data, y_grad = data
    gradient_check.check_backward(
        model, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3)


@pytest.mark.gpu
def test_backward_gpu(model, data):
    # type: (GraphConvPredictor, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA
    atom_data, adj_data, y_grad = map(cuda.to_gpu, data)
    model.to_gpu()
    gradient_check.check_backward(
        model, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3)


def test_forward_cpu_graph_invariant(model, data):
    # type: (GraphConvPredictor, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA
    # the prediction must not change under a node permutation of the graph
    atom_data, adj_data = data[:2]
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)

    permutation_index = numpy.random.permutation(atom_size)
    permute_atom_data = permute_node(atom_data, permutation_index)
    permute_adj_data = permute_adj(adj_data, permutation_index)
    permute_y_actual = cuda.to_cpu(
        model(permute_atom_data, permute_adj_data).data)
    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-5, atol=1e-5)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s', '-x'])


================================================ FILE: tests/models_tests/prediction_tests/test_regressor.py ================================================


import mock
import numpy
import pytest

import chainer
from chainer import cuda
from chainer import links
from chainer import reporter

from chainer_chemistry.models.prediction.regressor import Regressor


class DummyPredictor(chainer.Chain):
    """Deterministic predictor: doubles its input."""

    def __call__(self, x):
        return 2 * x


@pytest.mark.parametrize(
    'metrics_fun', [None, chainer.functions.mean_absolute_error,
                    {'user_key': chainer.functions.mean_absolute_error}])
@pytest.mark.parametrize('compute_metrics', [True, False])
class TestRegressor(object):
    """Tests for Regressor.__call__ argument handling and reporting."""

    def setup_method(self, method):
        self.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)
        self.t = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)
        self.y = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)

    def check_call(
            self, gpu, label_key, args, kwargs, model_args, model_kwargs,
            metrics_fun, compute_metrics):
        """Call the link and check the predictor receives exactly
        ``model_args``/``model_kwargs`` and y/loss/metrics are set."""
        init_kwargs = {'label_key': label_key}
        if metrics_fun is not None:
            init_kwargs['metrics_fun'] = metrics_fun
        link = Regressor(chainer.Link(), **init_kwargs)
        if gpu:
            xp = cuda.cupy
            link.to_gpu()
        else:
            xp = numpy
        link.compute_metrics = compute_metrics

        # replace the predictor with a mock so the exact call can be asserted
        y = chainer.Variable(self.y)
        link.predictor = mock.MagicMock(return_value=y)

        loss = link(*args, **kwargs)
        link.predictor.assert_called_with(*model_args, **model_kwargs)

        assert hasattr(link, 'y')
        assert link.y is not None
        assert hasattr(link, 'loss')
        xp.testing.assert_allclose(link.loss.data, loss.data)
        assert hasattr(link, 'metrics')
        if compute_metrics:
            assert link.metrics is not None
        else:
            assert link.metrics is None

    def test_call_cpu(self, metrics_fun, compute_metrics):
        self.check_call(
            False, -1, (self.x, self.t), {}, (self.x,), {}, metrics_fun,
            compute_metrics)

    def test_call_three_args_cpu(self, metrics_fun, compute_metrics):
        self.check_call(
            False, -1, (self.x, self.x, self.t), {}, (self.x, self.x), {},
            metrics_fun, compute_metrics)

    def test_call_positive_cpu(self, metrics_fun, compute_metrics):
        self.check_call(
            False, 2, (self.x, self.x, self.t), {}, (self.x, self.x), {},
            metrics_fun, compute_metrics)

    def test_call_kwargs_cpu(self, metrics_fun, compute_metrics):
        self.check_call(
            False, 't', (self.x,), {'t': self.t}, (self.x,), {}, metrics_fun,
            compute_metrics)

    def test_call_no_arg_cpu(self, metrics_fun, compute_metrics):
        self.check_call(
            False, 0, (self.t,), {}, (), {}, metrics_fun, compute_metrics)

    @pytest.mark.gpu
    def test_call_gpu(self, metrics_fun, compute_metrics):
        self.to_gpu()
        self.check_call(
            True, -1, (self.x, self.t), {}, (self.x,), {}, metrics_fun,
            compute_metrics)

    @pytest.mark.gpu
    def test_call_three_args_gpu(self, metrics_fun, compute_metrics):
        self.to_gpu()
        self.check_call(
            True, -1, (self.x, self.x, self.t), {}, (self.x, self.x), {},
            metrics_fun, compute_metrics)

    @pytest.mark.gpu
    def test_call_positive_gpu(self, metrics_fun, compute_metrics):
        self.to_gpu()
        self.check_call(
            True, 2, (self.x, self.x, self.t), {}, (self.x, self.x), {},
            metrics_fun, compute_metrics)

    @pytest.mark.gpu
    def test_call_kwargs_gpu(self, metrics_fun, compute_metrics):
        self.to_gpu()
        self.check_call(
            True, 't', (self.x,), {'t': self.t}, (self.x,), {}, metrics_fun,
            compute_metrics)

    @pytest.mark.gpu
    def test_call_no_arg_gpu(self, metrics_fun, compute_metrics):
        self.to_gpu()
        self.check_call(
            True, 0, (self.t,), {}, (), {}, metrics_fun, compute_metrics)

    def to_gpu(self):
        self.x = cuda.to_gpu(self.x)
        self.t = cuda.to_gpu(self.t)
        self.y = cuda.to_gpu(self.y)

    def test_report_key(self, metrics_fun, compute_metrics):
        """Check which observation keys the link reports for each
        metrics_fun form (None / dict / callable)."""
        repo = chainer.Reporter()
        link = Regressor(predictor=DummyPredictor(), metrics_fun=metrics_fun)
        link.compute_metrics = compute_metrics
        repo.add_observer('target', link)
        with repo:
            observation = {}
            with reporter.report_scope(observation):
                link(self.x, self.t)

        # print('observation ', observation)
        actual_keys = set(observation.keys())
        if compute_metrics:
            if metrics_fun is None:
                assert set(['target/loss']) == actual_keys
            elif isinstance(metrics_fun, dict):
                assert set(['target/loss', 'target/user_key']) == actual_keys
            elif callable(metrics_fun):
                assert set(['target/loss', 'target/metrics']) == actual_keys
            else:
                raise TypeError()
        else:
            assert set(['target/loss']) == actual_keys


class TestInvalidArgument(object):
    """Calling with too few arguments must raise TypeError."""

    @classmethod
    def setup_class(cls):
        cls.link = Regressor(links.Linear(10, 3))
        cls.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)

    def check_invalid_argument(self):
        x = chainer.Variable(self.link.xp.asarray(self.x))
        with pytest.raises(TypeError):
            # link.__call__ raises TypeError as the number of arguments
            # is illegal
            self.link(x)

    def test_invalid_argument_cpu(self):
        self.check_invalid_argument()

    @pytest.mark.gpu
    def test_invalid_argument_gpu(self):
        self.link.to_gpu()
        self.check_invalid_argument()


class TestInvalidLabelKey(object):
    """label_key must be int (in range) or an existing kwarg name."""

    @classmethod
    def setup_class(cls):
        cls.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)

    def test_invalid_label_key_type(self):
        with pytest.raises(TypeError):
            Regressor(links.Linear(10, 3), label_key=None)

    def check_invalid_key(self, gpu, label_key):
        link = Regressor(links.Linear(10, 3), label_key=label_key)
        if gpu:
            link.to_gpu()
        x = chainer.Variable(link.xp.asarray(self.x))
        with pytest.raises(ValueError):
            link(x)

    def test_invalid_index_cpu(self):
        self.check_invalid_key(False, 1)

    # NOTE(review): name duplicates TestInvalidArgument's test; probably
    # meant test_invalid_index_gpu -- confirm before renaming.
    @pytest.mark.gpu
    def test_invalid_argument_gpu(self):
        self.check_invalid_key(True, 1)

    def test_invalid_index_too_small_cpu(self):
        self.check_invalid_key(False, -2)

    @pytest.mark.gpu
    def test_invalid_index_too_small_gpu(self):
        self.check_invalid_key(True, -2)

    def test_invalid_str_key_cpu(self):
        self.check_invalid_key(False, 't')

    @pytest.mark.gpu
    def
test_invalid_str_key_gpu(self): self.check_invalid_key(True, 't') class TestRegressorPrediction(object): @classmethod def setup_class(cls): cls.predictor = DummyPredictor() cls.x = numpy.array([[0., 1.], [-1., -2.], [4., 0.]], dtype=numpy.float32) cls.t = cls.x * 2 def test_predict_cpu(self): clf = Regressor(self.predictor) actual_t = clf.predict(self.x) assert actual_t.shape == (3, 2) assert actual_t.dtype == numpy.float32 assert numpy.alltrue(actual_t == self.t) @pytest.mark.gpu def test_predict_gpu(self): clf = Regressor(self.predictor, device=0) actual_t = clf.predict(self.x) assert numpy.alltrue(actual_t == self.t) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/models_tests/prediction_tests/test_set_up_predictor.py ================================================ from typing import Dict # NOQA import chainer # NOQA import pytest from chainer_chemistry.models.ggnn import GGNN from chainer_chemistry.models.gin import GIN from chainer_chemistry.models.gnn_film import GNNFiLM from chainer_chemistry.models.nfp import NFP from chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor # NOQA from chainer_chemistry.models.prediction.set_up_predictor import set_up_predictor # NOQA from chainer_chemistry.models.relgat import RelGAT from chainer_chemistry.models.relgcn import RelGCN from chainer_chemistry.models.rsgcn import RSGCN from chainer_chemistry.models.schnet import SchNet from chainer_chemistry.models.weavenet import WeaveNet from chainer_chemistry.models.gwm.gwm_net import GGNN_GWM # NOQA from chainer_chemistry.models.gwm.gwm_net import GIN_GWM # NOQA from chainer_chemistry.models.gwm.gwm_net import NFP_GWM # NOQA from chainer_chemistry.models.gwm.gwm_net import RSGCN_GWM # NOQA from chainer_chemistry.models.cwle.cwle_net import GGNN_CWLE # NOQA from chainer_chemistry.models.cwle.cwle_net import RelGAT_CWLE # NOQA from chainer_chemistry.models.cwle.cwle_net 
import RelGCN_CWLE # NOQA from chainer_chemistry.models.cwle.cwle_net import GIN_CWLE # NOQA from chainer_chemistry.models.cwle.cwle_net import NFP_CWLE # NOQA from chainer_chemistry.models.cwle.cwle_net import RSGCN_CWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import GGNN_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import RelGAT_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import RelGCN_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import GIN_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import NFP_GWLE # NOQA from chainer_chemistry.models.gwle.gwle_net import RSGCN_GWLE # NOQA class_num = 7 n_unit = 11 conv_layers = 3 @pytest.fixture def models_dict(): # type: () -> Dict[str, chainer.Link] return { 'nfp': NFP, 'ggnn': GGNN, 'schnet': SchNet, 'weavenet': WeaveNet, 'rsgcn': RSGCN, 'relgcn': RelGCN, 'relgat': RelGAT, 'gin': GIN, 'nfp_gwm': NFP_GWM, 'ggnn_gwm': GGNN_GWM, 'rsgcn_gwm': RSGCN_GWM, 'gin_gwm': GIN_GWM, 'gnnfilm': GNNFiLM, 'nfp_wle': NFP, 'ggnn_wle': GGNN, 'relgat_wle': RelGAT, 'relgcn_wle': RelGCN, 'rsgcn_wle': RSGCN, 'gin_wle': GIN, 'nfp_cwle': NFP_CWLE, 'ggnn_cwle': GGNN_CWLE, 'relgat_cwle': RelGAT_CWLE, 'relgcn_cwle': RelGCN_CWLE, 'rsgcn_cwle': RSGCN_CWLE, 'gin_cwle': GIN_CWLE, 'nfp_gwle': NFP_GWLE, 'ggnn_gwle': GGNN_GWLE, 'relgat_gwle': RelGAT_GWLE, 'relgcn_gwle': RelGCN_GWLE, 'rsgcn_gwle': RSGCN_GWLE, 'gin_gwle': GIN_GWLE } def test_setup_predictor(models_dict): # type: (Dict[str, chainer.Link]) -> None for method, instance in models_dict.items(): predictor = set_up_predictor( method=method, n_unit=n_unit, conv_layers=conv_layers, class_num=class_num) assert isinstance(predictor.graph_conv, instance) assert isinstance(predictor, GraphConvPredictor) def test_call_invalid_model(): # type: () -> None with pytest.raises(ValueError): set_up_predictor( method='invalid', n_unit=n_unit, conv_layers=conv_layers, class_num=class_num) def test_set_up_predictor_with_conv_kwargs(): # type: () -> None predictor 
= set_up_predictor( method='nfp', n_unit=n_unit, conv_layers=conv_layers, class_num=class_num, conv_kwargs={ 'max_degree': 4, 'concat_hidden': True }) assert predictor.graph_conv.max_degree == 4 assert predictor.graph_conv.concat_hidden is True if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/models_tests/test_cgcnn.py ================================================ from chainer import cuda import numpy import pytest from chainer_chemistry.models.cgcnn import CGCNN # node_size_list means the first moleculae has three nodes, # and the seconde molecule has five nodes node_size_list = [3, 5] max_num_nbr = 6 node_feature_dim = 5 edge_feature_dim = 10 out_dim = 4 batch_size = 2 @pytest.fixture def model(): return CGCNN(out_dim=out_dim) @pytest.fixture def data(): if len(node_size_list) != batch_size: raise ValueError("Invalid fixture data for CGCNN") numpy.random.seed(0) total_node_size = sum(node_size_list) # one-hot vector atom_feat = numpy.random.choice( [0, 1], (total_node_size, node_feature_dim)).astype(numpy.float32) nbr_feat = numpy.random.rand(total_node_size, max_num_nbr, edge_feature_dim).astype(numpy.float32) # atom_idx & nbr_idx curr_idx = 0 atom_idx = [] nbr_idx = [] for val in node_size_list: atom_idx.append(numpy.arange(curr_idx, val)) for _ in range(val): max_val = curr_idx + val nbr_idx.append(numpy.random.randint(curr_idx, max_val, max_num_nbr)) curr_idx += val atom_idx = numpy.asarray(atom_idx) nbr_idx = numpy.array(nbr_idx, dtype=numpy.int32) y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype(numpy.float32) return atom_feat, nbr_feat, atom_idx, nbr_idx, y_grad def check_forward(model, data): y_actual = cuda.to_cpu(model(*data).data) assert y_actual.shape == (batch_size, out_dim) def test_forward_cpu(model, data): atom_feat, nbr_feat, atom_idx, nbr_idx = data[:-1] check_forward(model, (atom_feat, nbr_feat, atom_idx, nbr_idx)) @pytest.mark.gpu def 
test_forward_gpu(model, data):
    atom_feat, nbr_feat, atom_idx, nbr_idx = data[:-1]
    # atom_idx is list format... use numpy array
    input_data = (cuda.to_gpu(atom_feat), cuda.to_gpu(nbr_feat),
                  atom_idx, cuda.to_gpu(nbr_idx))
    model.to_gpu()
    check_forward(model, tuple(input_data))


# def test_backward_cpu(model, data):
#     input_data, y_grad = data[0:-1], data[-1]
#     gradient_check.check_backward(model, tuple(input_data), y_grad,
#                                   atol=5e-1, rtol=1e-1)


# @pytest.mark.gpu
# def test_backward_gpu(model, data):
#     atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]
#     model.to_gpu()
#     gradient_check.check_backward(model, (atom_data, adj_data), y_grad,
#                                   atol=5e-1, rtol=1e-1)


if __name__ == '__main__':
    pytest.main([__file__, '-v'])


================================================ FILE: tests/models_tests/test_ggnn.py ================================================


from chainer import cuda
from chainer import gradient_check
import numpy
import pytest

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.models.ggnn import GGNN
from chainer_chemistry.models.ggnn import SparseGGNN
from chainer_chemistry.utils.extend import extend_node, extend_adj  # NOQA
from chainer_chemistry.utils.permutation import permute_adj
from chainer_chemistry.utils.permutation import permute_node
from chainer_chemistry.utils.sparse_utils import _convert_to_sparse
from chainer_chemistry.utils.sparse_utils import sparse_utils_available

atom_size = 5
out_dim = 4
batch_size = 2
n_edge_types = 3


@pytest.fixture
def model():
    # seeded so dense and sparse models start from identical parameters
    numpy.random.seed(0)
    return GGNN(out_dim=out_dim, n_edge_types=n_edge_types)


@pytest.fixture
def sparse_model():
    numpy.random.seed(0)
    return SparseGGNN(out_dim=out_dim, n_edge_types=n_edge_types)


@pytest.fixture
def data():
    numpy.random.seed(0)
    atom_data = numpy.random.randint(
        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)
    ).astype(numpy.int32)
    adj_data = numpy.random.randint(
        0, high=2, size=(batch_size, n_edge_types, atom_size, atom_size)
    ).astype(numpy.float32)
    y_grad = numpy.random.uniform(
        -1, 1, (batch_size, out_dim)).astype(numpy.float32)
    return atom_data, adj_data, y_grad


def check_forward(model, *args):
    numpy.random.seed(0)  # reset seed to initialize model params consistently
    y_actual = cuda.to_cpu(model(*args).data)
    assert y_actual.shape == (batch_size, out_dim)
    return y_actual


def test_forward_cpu(model, sparse_model, data):
    atom_data, adj_data = data[0], data[1]
    y_dense = check_forward(model, atom_data, adj_data)

    # test for sparse forward result is same with dense
    if sparse_utils_available():
        y_sparse = check_forward(sparse_model, atom_data,
                                 *_convert_to_sparse(adj_data))
        numpy.testing.assert_allclose(
            y_dense, y_sparse, atol=1e-4, rtol=1e-4)


@pytest.mark.gpu
def test_forward_gpu(model, sparse_model, data):
    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])
    model.to_gpu()
    check_forward(model, atom_data, adj_data)

    if sparse_utils_available():
        sparse_model.to_gpu()
        check_forward(sparse_model, atom_data, *_convert_to_sparse(adj_data))


def test_backward_cpu(model, data):
    atom_data, adj_data, y_grad = data
    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,
                                  atol=1e-3, rtol=1e-3)


# there is no backward test for sparse model, because there will be no
# gradient for input data.
@pytest.mark.gpu
def test_backward_gpu(model, data):
    # Same gradient check as CPU, on GPU-resident arrays.
    atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]
    model.to_gpu()
    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,
                                  atol=1e-3, rtol=1e-3)


def test_forward_cpu_graph_invariant(model, data):
    # Output must not change when nodes (and adjacency) are permuted together.
    atom_data, adj_data = data[0], data[1]
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)

    permutation_index = numpy.random.permutation(atom_size)
    permute_atom_data = permute_node(atom_data, permutation_index)
    permute_adj_data = permute_adj(adj_data, permutation_index)
    permute_y_actual = cuda.to_cpu(model(
        permute_atom_data, permute_adj_data).data)
    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-5, atol=1e-6)


def test_forward_cpu_input_size_invariant(model, data):
    # Padding graphs to a larger node count (with is_real_node mask)
    # must not change the output.
    atom_data, adj_data = data[0], data[1]
    is_real_node = numpy.ones(atom_data.shape, dtype=numpy.float32)
    y_actual = cuda.to_cpu(model(
        atom_array=atom_data, adj=adj_data, is_real_node=is_real_node).data)

    atom_data_ex = extend_node(atom_data, out_size=8)
    adj_data_ex = extend_adj(adj_data, out_size=8)
    is_real_node_ex = extend_node(is_real_node, out_size=8)
    y_actual_ex = cuda.to_cpu(model(
        atom_array=atom_data_ex, adj=adj_data_ex,
        is_real_node=is_real_node_ex).data)
    assert numpy.allclose(y_actual, y_actual_ex, rtol=1e-5, atol=1e-6)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/models_tests/test_gin.py
================================================
# Tests for the GIN model: forward shape, backward gradients and
# permutation invariance.
from typing import Tuple  # NOQA

import chainer  # NOQA
from chainer import cuda
from chainer import gradient_check
import numpy
import pytest

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.models.gin import GIN
from chainer_chemistry.utils.permutation import permute_adj
from chainer_chemistry.utils.permutation import permute_node

atom_size = 5
out_dim = 4
batch_size = 3


@pytest.fixture
def model():
    # type: () -> GIN
    # dropout_ratio=0 so forward is deterministic for gradient checks.
    return GIN(out_dim=out_dim, dropout_ratio=0)


@pytest.fixture
def data():
    # type: () -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]
    numpy.random.seed(0)
    atom_data = numpy.random.randint(
        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')
    adj_data = numpy.random.randint(
        0, high=2, size=(batch_size, atom_size, atom_size)).astype('f')
    y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype('f')
    return atom_data, adj_data, y_grad


def check_forward(model, atom_data, adj_data):
    # type: (GIN, numpy.ndarray, numpy.ndarray) -> None
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)
    assert y_actual.shape == (batch_size, out_dim)


def test_forward_cpu(model, data):
    # type: (GIN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None
    atom_data, adj_data = data[:2]
    check_forward(model, atom_data, adj_data)


@pytest.mark.gpu
def test_forward_gpu(model, data):
    # type: (GIN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None
    atom_data, adj_data = map(cuda.to_gpu, data[:2])
    model.to_gpu()
    check_forward(model, atom_data, adj_data)


def test_backward_cpu(model, data):
    # type: (GIN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None
    atom_data, adj_data, y_grad = data
    gradient_check.check_backward(
        model, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)


@pytest.mark.gpu
def test_backward_gpu(model, data):
    # type: (GIN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None
    atom_data, adj_data, y_grad = map(cuda.to_gpu, data)
    model.to_gpu()
    gradient_check.check_backward(
        model, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)


def test_forward_cpu_graph_invariant(model, data):
    # type: (GIN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None
    atom_data, adj_data = data[:2]
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)

    permutation_index = numpy.random.permutation(atom_size)
    permute_atom_data = permute_node(atom_data, permutation_index)
    permute_adj_data = permute_adj(adj_data, permutation_index)
    permute_y_actual = cuda.to_cpu(
        model(permute_atom_data, permute_adj_data).data)
    numpy.testing.assert_allclose(
        y_actual, permute_y_actual, rtol=1e-5, atol=1e-5)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/models_tests/test_gnn_film.py
================================================
# Tests for the GNNFiLM model: forward shape, backward gradients and
# permutation invariance.
from typing import Tuple  # NOQA

import chainer  # NOQA
from chainer import cuda
from chainer import gradient_check
import numpy
import pytest

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.models.gnn_film import GNNFiLM
from chainer_chemistry.utils.permutation import permute_adj
from chainer_chemistry.utils.permutation import permute_node

atom_size = 5
out_dim = 4
batch_size = 3
n_edge_types = 5


@pytest.fixture
def model():
    # type: () -> chainer.Chain
    return GNNFiLM(out_dim=out_dim)


@pytest.fixture
def data():
    # type: () -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]
    numpy.random.seed(0)
    atom_data = numpy.random.randint(
        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')
    adj_data = numpy.random.randint(
        0, high=2, size=(batch_size, n_edge_types, atom_size, atom_size)
    ).astype('f')
    y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype('f')
    return atom_data, adj_data, y_grad


def check_forward(model, atom_data, adj_data):
    # type: (chainer.Chain, numpy.ndarray, numpy.ndarray) -> None
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)
    assert y_actual.shape == (batch_size, out_dim)


def test_forward_cpu(model, data):
    # type: (chainer.Chain, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA
    atom_data, adj_data = data[:2]
    check_forward(model, atom_data, adj_data)


@pytest.mark.gpu
def test_forward_gpu(model, data):
    # type: (chainer.Chain, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA
    atom_data, adj_data = map(cuda.to_gpu, data[:2])
    model.to_gpu()
    check_forward(model, atom_data, adj_data)


def test_backward_cpu(model, data):
    # type: (chainer.Chain, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA
#       (type-comment continuation from the line above)
    atom_data, adj_data, y_grad = data
    gradient_check.check_backward(
        model, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)


@pytest.mark.gpu
def test_backward_gpu(model, data):
    # type: (chainer.Chain, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA
    atom_data, adj_data, y_grad = map(cuda.to_gpu, data)
    model.to_gpu()
    gradient_check.check_backward(
        model, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)


def test_forward_cpu_graph_invariant(model, data):
    # type: (chainer.Chain, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA
    # Output must not change when nodes and adjacency are permuted together.
    atom_data, adj_data = data[:2]
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)

    permutation_index = numpy.random.permutation(atom_size)
    permute_atom_data = permute_node(atom_data, permutation_index)
    permute_adj_data = permute_adj(adj_data, permutation_index)
    permute_y_actual = cuda.to_cpu(
        model(permute_atom_data, permute_adj_data).data)
    numpy.testing.assert_allclose(
        y_actual, permute_y_actual, rtol=1e-5, atol=1e-5)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/models_tests/test_megnet.py
================================================
# Tests for the MEGNet model, which takes flat per-node/per-edge feature
# arrays plus index arrays mapping them back to graphs in the batch.
from chainer import cuda
import numpy
import pytest

from chainer_chemistry.models.megnet import MEGNet

# node_size_list means the first molecule has six nodes,
# and the second molecule has four nodes
node_size_list = [6, 4]
# edge_size_list means the first molecule has eight edges,
# and the second molecule has four edges
edge_size_list = [8, 4]
node_feature_dim = 5
edge_feature_dim = 10
global_feature_dim = 2
out_dim = 4
batch_size = 2


@pytest.fixture
def model():
    return MEGNet(out_dim=out_dim)


@pytest.fixture
def data():
    # Guard: the per-graph size lists above must match batch_size.
    if len(node_size_list) != batch_size or len(edge_size_list) != batch_size:
        raise ValueError("Invalid fixture data for MEGNet")

    numpy.random.seed(0)
    total_node_size = sum(node_size_list)
    total_edge_size = sum(edge_size_list)
    atom_feat = numpy.random.rand(
        total_node_size, node_feature_dim).astype(numpy.float32)
    pair_feat = numpy.random.rand(
        total_edge_size, edge_feature_dim).astype(numpy.float32)
    global_feat = numpy.random.rand(
        batch_size, global_feature_dim).astype(numpy.float32)
    # atom idx: graph index of each node in the flattened node array
    atom_idx = numpy.hstack([[i] * node_size_list[i]
                             for i in range(batch_size)]).astype(numpy.int32)
    # pair idx: graph index of each edge in the flattened edge array
    pair_idx = numpy.hstack([[i] * edge_size_list[i]
                             for i in range(batch_size)]).astype(numpy.int32)
    # create start and end idx
    # low/high give the node-index range belonging to each graph, so each
    # edge connects two distinct nodes within the same graph.
    edge_idx = []
    acc_node_size = [sum(node_size_list[:i+1]) for i in range(batch_size)]
    low = numpy.roll(acc_node_size + [0], 1)[0:batch_size+1]
    high = numpy.array(acc_node_size)
    for i in range(batch_size):
        idx = [numpy.random.choice(numpy.arange(low[i], high[i]), 2,
                                   replace=False)
               for _ in range(edge_size_list[i])]
        edge_idx.extend(idx)
    start_idx = numpy.array(edge_idx, dtype=numpy.int32)[:, 0]
    end_idx = numpy.array(edge_idx, dtype=numpy.int32)[:, 1]
    y_grad = numpy.random.uniform(
        -1, 1, (batch_size, out_dim)).astype(numpy.float32)
    return atom_feat, pair_feat, global_feat, \
        atom_idx, pair_idx, start_idx, end_idx, y_grad


def check_forward(model, data):
    y_actual = cuda.to_cpu(model(*data).data)
    assert y_actual.shape == (batch_size, out_dim)


def test_forward_cpu(model, data):
    atom_feat, pair_feat, global_feat, \
        atom_idx, pair_idx, start_idx, end_idx = data[:-1]
    check_forward(model, (atom_feat, pair_feat, global_feat,
                          atom_idx, pair_idx, start_idx, end_idx))


@pytest.mark.gpu
def test_forward_gpu(model, data):
    input_data = [cuda.to_gpu(d) for d in data[:-1]]
    model.to_gpu()
    check_forward(model, tuple(input_data))


# def test_backward_cpu(model, data):
#     input_data, y_grad = data[0:-1], data[-1]
#     gradient_check.check_backward(model, tuple(input_data), y_grad,
#                                   atol=5e-1, rtol=1e-1)


# @pytest.mark.gpu
# def test_backward_gpu(model, data):
#     data = [cuda.to_gpu(d) for d in data]
#     input_data, y_grad = data[0:-1], data[-1]
#     model.to_gpu()
#     gradient_check.check_backward(model, tuple(input_data), y_grad,
#                                   atol=5e-1, rtol=1e-1)


if __name__ == '__main__':
    pytest.main([__file__, '-v'])


================================================
FILE: tests/models_tests/test_mlp.py
================================================
# Tests for the MLP readout head: forward shape, invalid-argument check
# and backward gradients.
from chainer import cuda
from chainer import gradient_check
import numpy
import pytest

from chainer_chemistry.models.mlp import MLP

batch_size = 2
hidden_dim = 16
out_dim = 4


@pytest.fixture
def model():
    return MLP(out_dim=out_dim)


@pytest.fixture
def data():
    numpy.random.seed(0)
    hidden = numpy.random.rand(batch_size, hidden_dim).astype(numpy.float32)
    y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype(
        numpy.float32)
    return hidden, y_grad


def check_forward(model, data):
    y_actual = cuda.to_cpu(model(data).data)
    assert y_actual.shape == (batch_size, out_dim)


def test_forward_cpu(model, data):
    check_forward(model, data[0])


@pytest.mark.gpu
def test_forward_gpu(model, data):
    model.to_gpu()
    check_forward(model, cuda.to_gpu(data[0]))


def test_mlp_assert_raises():
    # Negative layer count must be rejected at construction time.
    with pytest.raises(ValueError):
        MLP(out_dim=out_dim, n_layers=-1)


def test_backward_cpu(model, data):
    hidden, y_grad = data
    gradient_check.check_backward(model, hidden, y_grad, atol=1e0, rtol=1e0)


@pytest.mark.gpu
def test_backward_gpu(model, data):
    hidden, y_grad = [cuda.to_gpu(d) for d in data]
    model.to_gpu()
    gradient_check.check_backward(model, hidden, y_grad, atol=1e0, rtol=1e0)


if __name__ == '__main__':
    pytest.main([__file__, '-v'])


================================================
FILE: tests/models_tests/test_mpnn.py
================================================
# Tests for the MPNN model over all message/readout function combinations.
from typing import List  # NOQA
from typing import Tuple  # NOQA

from chainer import cuda
from chainer import gradient_check
import numpy
import pytest

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.models.mpnn import MPNN
from chainer_chemistry.utils.permutation import permute_adj
from chainer_chemistry.utils.permutation import permute_node

atom_size = 5
out_dim = 4
batch_size = 2
num_edge_type = 3


# Parametrized over every (message_func, readout_func) combination.
@pytest.fixture(params=[('edgenet', 'set2set'), ('edgenet', 'ggnn'),
                        ('ggnn', 'set2set'), ('ggnn', 'ggnn')])
def model(request):
    # type: (pytest.fixture.SubRequest) -> MPNN
    message_func, readout_func = request.param
    return MPNN(out_dim=out_dim, n_edge_types=num_edge_type,
                message_func=message_func, readout_func=readout_func)


@pytest.fixture
def data():
    # type: () -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]
    numpy.random.seed(0)
    atom_data = numpy.random.randint(
        0, high=MAX_ATOMIC_NUM,
        size=(batch_size, atom_size)).astype(numpy.int32)
    adj_data = numpy.random.randint(
        0, high=2,
        size=(batch_size, num_edge_type, atom_size,
              atom_size)).astype(numpy.float32)
    y_grad = numpy.random.uniform(
        -1, 1, (batch_size, out_dim)).astype(numpy.float32)
    return atom_data, adj_data, y_grad


def check_forward(model, atom_data, adj_data):
    # type: (MPNN, numpy.ndarray, numpy.ndarray) -> None
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)
    assert y_actual.shape == (batch_size, out_dim)


def test_forward_cpu(model, data):
    # type: (MPNN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None
    atom_data, adj_data = data[0], data[1]
    check_forward(model, atom_data, adj_data)


@pytest.mark.gpu
def test_forward_gpu(model, data):
    # type: (MPNN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None
    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])
    model.to_gpu()
    check_forward(model, atom_data, adj_data)


def test_backward_cpu(model, data):
    # type: (MPNN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None
    atom_data, adj_data, y_grad = data
    gradient_check.check_backward(
        model, (atom_data, adj_data), y_grad, atol=1e-0, rtol=1e-0)


@pytest.mark.gpu
def test_backward_gpu(model, data):
    # type: (MPNN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None
    atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]
    model.to_gpu()
    gradient_check.check_backward(
        model, (atom_data, adj_data), y_grad, atol=1e-0, rtol=1e-0)


def test_forward_cpu_graph_invariant(model, data):
    # type: (MPNN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None
    if model.message_func == 'edgenet':
        # Because EdgeNet uses NN for expanding edge vector dimension,
        # graph invariant is not ensured.
        return
    atom_data, adj_data = data[0], data[1]
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)

    permutation_index = numpy.random.permutation(atom_size)
    permute_atom_data = permute_node(atom_data, permutation_index)
    permute_adj_data = permute_adj(adj_data, permutation_index)
    permute_y_actual = cuda.to_cpu(
        model(permute_atom_data, permute_adj_data).data)
    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-3, atol=1e-3)


def test_invalid_message_funcion():
    # type: () -> None
    with pytest.raises(ValueError):
        MPNN(out_dim=out_dim, message_func='invalid')


def test_invalid_readout_funcion():
    # type: () -> None
    with pytest.raises(ValueError):
        MPNN(out_dim=out_dim, readout_func='invalid')


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/models_tests/test_nfp.py
================================================
# Tests for the NFP model: forward shape, backward gradients, permutation
# invariance and input-size invariance.
from chainer import cuda
from chainer import gradient_check
import numpy
import pytest

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.models.nfp import NFP
from chainer_chemistry.utils.extend import extend_adj, extend_node  # NOQA
from chainer_chemistry.utils.permutation import permute_adj
from chainer_chemistry.utils.permutation import permute_node

atom_size = 5
out_dim = 4
batch_size = 2


@pytest.fixture
def model():
    return NFP(out_dim=out_dim)


@pytest.fixture
def data():
    numpy.random.seed(0)
    atom_data = numpy.random.randint(
        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)
    ).astype(numpy.int32)
    adj_data = numpy.random.randint(
        0, high=2, size=(batch_size, atom_size, atom_size)
    ).astype(numpy.float32)
    y_grad = numpy.random.uniform(
        -1, 1, (batch_size, out_dim)).astype(numpy.float32)
    return atom_data, adj_data, y_grad


def check_forward(model, atom_data, adj_data):
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)
    assert y_actual.shape == (batch_size, out_dim)


def test_forward_cpu(model, data):
    atom_data, adj_data = data[0], data[1]
    check_forward(model, atom_data, adj_data)


@pytest.mark.gpu
def test_forward_gpu(model, data):
    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])
    model.to_gpu()
    check_forward(model, atom_data, adj_data)


# TODO(nakago): check why tolerance is high
def test_backward_cpu(model, data):
    atom_data, adj_data, y_grad = data
    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,
                                  atol=1e0, rtol=1e0)


# TODO(nakago): check why tolerance is high
@pytest.mark.gpu
def test_backward_gpu(model, data):
    atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]
    model.to_gpu()
    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,
                                  atol=1e0, rtol=1e0)


def test_forward_cpu_graph_invariant(model, data):
    atom_data, adj_data = data[0], data[1]
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)

    permutation_index = numpy.random.permutation(atom_size)
    permute_atom_data = permute_node(atom_data, permutation_index)
    permute_adj_data = permute_adj(adj_data, permutation_index)
    permute_y_actual = cuda.to_cpu(model(
        permute_atom_data, permute_adj_data).data)
    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-5, atol=1e-6)


def test_forward_cpu_input_size_invariant(model, data):
    atom_data, adj_data = data[0], data[1]
    is_real_node = numpy.ones(atom_data.shape, dtype=numpy.float32)
    y_actual = cuda.to_cpu(model(
        atom_array=atom_data, adj=adj_data, is_real_node=is_real_node).data)

    atom_data_ex = extend_node(atom_data, out_size=8)
    adj_data_ex = extend_adj(adj_data, out_size=8)
    is_real_node_ex = extend_node(is_real_node, out_size=8)
    y_actual_ex = cuda.to_cpu(model(
        atom_array=atom_data_ex, adj=adj_data_ex,
        is_real_node=is_real_node_ex).data)
    assert numpy.allclose(y_actual, y_actual_ex, rtol=1e-5, atol=1e-6)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/models_tests/test_relgat.py
================================================
# Tests for the RelGAT model, parametrized over head concatenation mode.
from chainer import cuda
from chainer import gradient_check
import numpy
import pytest

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.models.relgat import RelGAT
from chainer_chemistry.utils.permutation import permute_adj
from chainer_chemistry.utils.permutation import permute_node

atom_size = 5
out_dim = 4
batch_size = 2
num_edge_type = 4


@pytest.fixture(params=[True, False])
def model(request):
    return RelGAT(out_dim=out_dim, concat_heads=request.param)


@pytest.fixture
def data():
    numpy.random.seed(0)
    atom_data = numpy.random.randint(
        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)
    ).astype(numpy.int32)
    adj_data = numpy.random.randint(
        0, high=2, size=(batch_size, num_edge_type, atom_size, atom_size)
    ).astype(numpy.float32)
    y_grad = numpy.random.uniform(
        -1, 1, (batch_size, out_dim)).astype(numpy.float32)
    return atom_data, adj_data, y_grad


def check_forward(model, atom_data, adj_data):
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)
    assert y_actual.shape == (batch_size, out_dim)


def test_forward_cpu(model, data):
    atom_data, adj_data = data[0], data[1]
    check_forward(model, atom_data, adj_data)


@pytest.mark.gpu
def test_forward_gpu(model, data):
    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])
    model.to_gpu()
    check_forward(model, atom_data, adj_data)


# TODO(mottodora): check why tolerance is high
# NOTE(review): atol/rtol of 1e3 makes this check nearly vacuous; kept as-is
# pending the TODO above.
def test_backward_cpu(model, data):
    atom_data, adj_data, y_grad = data
    params = tuple(model.params())
    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,
                                  params=params,
                                  no_grads=[True, True],
                                  atol=1e3, rtol=1e3)


# TODO(nakago): check why tolerance is high
@pytest.mark.gpu
def test_backward_gpu(model, data):
    atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]
    model.to_gpu()
    params = tuple(model.params())
    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,
                                  params=params,
                                  no_grads=[True, True],
                                  atol=1e3, rtol=1e3)


def test_forward_cpu_graph_invariant(model, data):
    atom_data, adj_data = data[0], data[1]
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)

    permutation_index = numpy.random.permutation(atom_size)
    permute_atom_data = permute_node(atom_data, permutation_index)
    permute_adj_data = permute_adj(adj_data, permutation_index)
    permute_y_actual = cuda.to_cpu(model(
        permute_atom_data, permute_adj_data).data)
    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-5, atol=1e-6)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/models_tests/test_relgcn.py
================================================
# Tests for the RelGCN model plus the shared rescale_adj helper.
from chainer import cuda
from chainer import gradient_check
import numpy
import pytest

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.models.gwm.gwm_graph_conv_model import rescale_adj  # NOQA
from chainer_chemistry.models.relgcn import RelGCN
from chainer_chemistry.utils.permutation import permute_adj
from chainer_chemistry.utils.permutation import permute_node

atom_size = 5
out_ch = 4
batch_size = 2
num_edge_type = 4


@pytest.fixture
def model():
    return RelGCN(out_dim=out_ch)


@pytest.fixture
def data():
    numpy.random.seed(0)
    atom_data = numpy.random.randint(
        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')
    adj_data = numpy.random.randint(
        0, high=2,
        size=(batch_size, num_edge_type, atom_size, atom_size)).astype('f')
    y_grad = numpy.random.uniform(-1, 1, (batch_size, out_ch)).astype('f')
    return atom_data, adj_data, y_grad


def check_forward(model, atom_data, adj_data):
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)
    assert y_actual.shape == (batch_size, out_ch)


def test_forward_cpu(model, data):
    atom_data, adj_data = data[0], data[1]
    check_forward(model, atom_data, adj_data)


@pytest.mark.gpu
def test_forward_gpu(model, data):
    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])
    model.to_gpu()
    check_forward(model, atom_data, adj_data)


def test_backward_cpu(model, data):
    atom_data, adj_data, y_grad = data
    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,
                                  atol=1e-3, rtol=1e-3)


@pytest.mark.gpu
def test_backward_gpu(model, data):
    atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]
    model.to_gpu()
    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,
                                  atol=1e-3, rtol=1e-3)


def test_forward_cpu_invariant(model, data):
    atom_data, adj_data = data[0], data[1]
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)

    permutation_index = numpy.random.permutation(atom_size)
    permute_atom_data = permute_node(atom_data, permutation_index)
    permute_adj_data = permute_adj(adj_data, permutation_index)
    permute_y_actual = cuda.to_cpu(model(
        permute_atom_data, permute_adj_data).data)
    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-5, atol=1e-5)


def test_rescale_adj(data):
    # After rescaling, incoming edge weights of each node sum to one.
    adj = data[1]
    numpy.testing.assert_allclose(rescale_adj(adj).data.sum(axis=(1, 2)),
                                  numpy.ones((batch_size, atom_size)),
                                  atol=1e-5, rtol=1e-5)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/models_tests/test_rsgcn.py
================================================
# Tests for the RSGCN model, with and without a custom NFP readout,
# and with dropout disabled for gradient checking.
import chainer
from chainer import cuda
from chainer import gradient_check
import numpy
import pytest

from chainer_chemistry.config import MAX_ATOMIC_NUM
from chainer_chemistry.links import NFPReadout
from chainer_chemistry.models.rsgcn import RSGCN
from chainer_chemistry.utils.extend import extend_adj
from chainer_chemistry.utils.extend import extend_node
from chainer_chemistry.utils.permutation import permute_adj
from chainer_chemistry.utils.permutation import permute_node

atom_size = 5
out_dim = 4
batch_size = 2


@pytest.fixture
def model():
    return RSGCN(out_dim=out_dim)


@pytest.fixture
def model_no_dropout():
    # To check backward gradient by `gradient_check`,
    # we need to skip stochastic dropout function.
    return RSGCN(out_dim=out_dim, dropout_ratio=0.)


@pytest.fixture
def model_with_nfp():
    return RSGCN(out_dim=out_dim,
                 readout=NFPReadout(in_channels=out_dim, out_dim=out_dim))


@pytest.fixture
def model_with_nfp_no_dropout():
    return RSGCN(out_dim=out_dim,
                 readout=NFPReadout(in_channels=out_dim, out_dim=out_dim),
                 dropout_ratio=0.)


@pytest.fixture
def data():
    numpy.random.seed(0)
    atom_data = numpy.random.randint(
        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)
    ).astype(numpy.int32)
    # adj_data is symmetric matrix
    adj_data = numpy.random.uniform(
        0, high=1, size=(batch_size, atom_size, atom_size)
    ).astype(numpy.float32)
    adj_data = adj_data + adj_data.swapaxes(-1, -2)
    y_grad = numpy.random.uniform(
        -1, 1, (batch_size, out_dim)).astype(numpy.float32)
    return atom_data, adj_data, y_grad


def check_forward(model, atom_data, adj_data):
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)
    assert y_actual.shape == (batch_size, out_dim)


def test_forward_cpu(model, data):
    atom_data, adj_data = data[0], data[1]
    check_forward(model, atom_data, adj_data)


@pytest.mark.gpu
def test_forward_gpu(model, data):
    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])
    model.to_gpu()
    check_forward(model, atom_data, adj_data)


def test_forward_cpu_with_nfp(model_with_nfp, data):
    atom_data, adj_data = data[0], data[1]
    check_forward(model_with_nfp, atom_data, adj_data)


def test_backward_cpu(model_no_dropout, data):
    atom_data, adj_data, y_grad = data
    if int(chainer.__version__[0]) <= 2:
        # somehow the test fails with `params` when using chainer version 2...
        # TODO(nakago): investigate why the test fails.
params = () else: params = tuple(model_no_dropout.params()) # TODO(nakago): check why tolerance is high gradient_check.check_backward( model_no_dropout, (atom_data, adj_data), y_grad, params=params, atol=1e-1, rtol=1e-1, no_grads=[True, True]) @pytest.mark.gpu def test_backward_gpu(model_no_dropout, data): atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data] model_no_dropout.to_gpu() if int(chainer.__version__[0]) <= 2: # somehow the test fails with `params` when using chainer version 2... # TODO(nakago): investigate why the test fails. params = () else: params = tuple(model_no_dropout.params()) # TODO(nakago): check why tolerance is high gradient_check.check_backward( model_no_dropout, (atom_data, adj_data), y_grad, params=params, atol=1e-1, rtol=1e-1, no_grads=[True, True]) def test_backward_cpu_with_nfp(model_with_nfp_no_dropout, data): atom_data, adj_data, y_grad = data if int(chainer.__version__[0]) <= 2: params = () else: params = tuple(model_with_nfp_no_dropout.params()) gradient_check.check_backward( model_with_nfp_no_dropout, (atom_data, adj_data), y_grad, params=params, atol=1e-4, rtol=1e-4, no_grads=[True, True]) def test_forward_cpu_graph_invariant(model, data): # This RSGCN uses dropout, so we need to forward with test mode # to remove stochastic calculation. atom_data, adj_data = data[0], data[1] with chainer.using_config('train', False): y_actual = cuda.to_cpu(model(atom_data, adj_data).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index) permute_adj_data = permute_adj(adj_data, permutation_index) with chainer.using_config('train', False): permute_y_actual = cuda.to_cpu(model( permute_atom_data, permute_adj_data).data) assert numpy.allclose(y_actual, permute_y_actual, rtol=1.e-4, atol=1.e-5) def test_forward_cpu_input_size_invariant(model, data): # This RSGCN uses dropout, so we need to forward with test mode # to remove stochastic calculation. 
atom_data, adj_data = data[0], data[1] with chainer.using_config('train', False): y_actual = cuda.to_cpu(model(atom_data, adj_data).data) # Set bigger size than original `atom_size`. atom_data_ex = extend_node(atom_data, out_size=8) adj_data_ex = extend_adj(adj_data, out_size=8) # print('size', atom_data.shape, adj_data.shape, # atom_data_ex.shape, adj_data_ex.shape) with chainer.using_config('train', False): y_actual_ex = cuda.to_cpu(model( atom_data_ex, adj_data_ex).data) assert numpy.allclose(y_actual, y_actual_ex, rtol=1.e-4, atol=1.e-5) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/models_tests/test_schnet.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.models.schnet import SchNet from chainer_chemistry.utils.permutation import permute_adj from chainer_chemistry.utils.permutation import permute_node atom_size = 5 out_dim = 4 batch_size = 2 @pytest.fixture def model(): return SchNet(out_dim=out_dim) @pytest.fixture def data(): numpy.random.seed(0) atom_data = numpy.random.randint( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size) ).astype(numpy.int32) # symmetric matrix adj_data = numpy.random.uniform( 0, high=30, size=(batch_size, atom_size, atom_size) ).astype(numpy.float32) adj_data = (adj_data + adj_data.swapaxes(-1, -2)) / 2. 
y_grad = numpy.random.uniform( -1, 1, (batch_size, out_dim)).astype(numpy.float32) return atom_data, adj_data, y_grad def check_forward(model, atom_data, adj_data): y_actual = cuda.to_cpu(model(atom_data, adj_data).data) assert y_actual.shape == (batch_size, out_dim) def test_forward_cpu(model, data): atom_data, adj_data = data[0], data[1] check_forward(model, atom_data, adj_data) @pytest.mark.gpu def test_forward_gpu(model, data): atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1]) model.to_gpu() check_forward(model, atom_data, adj_data) def test_backward_cpu(model, data): atom_data, adj_data, y_grad = data gradient_check.check_backward(model, (atom_data, adj_data), y_grad, atol=5e-1, rtol=1e-1) @pytest.mark.gpu def test_backward_gpu(model, data): atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data] model.to_gpu() gradient_check.check_backward(model, (atom_data, adj_data), y_grad, atol=5e-1, rtol=1e-1) def test_forward_cpu_graph_invariant(model, data): atom_data, adj_data = data[0], data[1] y_actual = cuda.to_cpu(model(atom_data, adj_data).data) permutation_index = numpy.random.permutation(atom_size) permute_atom_data = permute_node(atom_data, permutation_index) permute_adj_data = permute_adj(adj_data, permutation_index) permute_y_actual = cuda.to_cpu(model( permute_atom_data, permute_adj_data).data) assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-5, atol=1e-5) if __name__ == '__main__': pytest.main([__file__, '-v']) ================================================ FILE: tests/models_tests/test_weavenet.py ================================================ from chainer import cuda from chainer import gradient_check import numpy import pytest from chainer_chemistry.config import MAX_ATOMIC_NUM from chainer_chemistry.models.weavenet import WeaveNet from chainer_chemistry.utils.permutation import permute_adj from chainer_chemistry.utils.permutation import permute_node atom_size = 5 weave_channels = [50, 50] batch_size = 2 atom_feature_dim 
= 23 pair_feature_dim = 10 out_dim = weave_channels[-1] @pytest.fixture def model(): return WeaveNet(weave_channels=weave_channels, n_atom=atom_size) @pytest.fixture def model_processed(): """model to test `atom_data_processed` input""" return WeaveNet(weave_channels=weave_channels, n_atom=atom_size) @pytest.fixture def data(): numpy.random.seed(0) atom_data_processed = numpy.random.uniform( 0, high=1, size=(batch_size, atom_size, atom_feature_dim) ).astype(numpy.float32) atom_data = numpy.random.randint( 0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size) ).astype(numpy.int32) adj_data = numpy.random.uniform( 0, high=1, size=(batch_size, pair_feature_dim, atom_size, atom_size) ).astype(numpy.float32) # adj_data is symmetric along pair of atoms # adj_data = adj_data + adj_data.swapaxes(-1, -2) adj_data = adj_data.transpose((0, 3, 2, 1)).reshape( batch_size, atom_size * atom_size, pair_feature_dim ).astype(numpy.float32) y_grad = numpy.random.uniform( -1, 1, (batch_size, out_dim)).astype(numpy.float32) return atom_data_processed, atom_data, adj_data, y_grad def check_forward(model, atom_data, adj_data): y_actual = cuda.to_cpu(model(atom_data, adj_data).data) print('y_actual', y_actual.shape) assert y_actual.shape == (batch_size, out_dim) def test_forward_cpu(model, model_processed, data): atom_data_processed, atom_data, adj_data = data[0:3] check_forward(model, atom_data, adj_data) check_forward(model_processed, atom_data_processed, adj_data) @pytest.mark.gpu def test_forward_gpu(model, model_processed, data): atom_data_processed, atom_data, adj_data = \ [cuda.to_gpu(d) for d in data[0:3]] model.to_gpu() model_processed.to_gpu() check_forward(model, atom_data, adj_data) check_forward(model_processed, atom_data_processed, adj_data) def test_backward_cpu(model, model_processed, data): atom_data_processed, atom_data, adj_data, y_grad = data gradient_check.check_backward(model, (atom_data, adj_data), y_grad, atol=1e-1, rtol=1e-1) 
gradient_check.check_backward(model_processed,
                              (atom_data_processed, adj_data), y_grad,
                              atol=1e-1, rtol=1e-1)


@pytest.mark.gpu
def test_backward_gpu(model, model_processed, data):
    atom_data_processed, atom_data, adj_data, y_grad = \
        [cuda.to_gpu(d) for d in data]
    model.to_gpu()
    model_processed.to_gpu()
    gradient_check.check_backward(
        model, (atom_data, adj_data), y_grad, atol=1e-1, rtol=1e-1)
    gradient_check.check_backward(
        model_processed, (atom_data_processed, adj_data), y_grad,
        atol=1e-1, rtol=1e-1)


def _test_forward_cpu_graph_invariant(
        model, atom_data, adj_data, node_permute_axis=-1):
    # Shared invariance check: permuting atoms must not change the output.
    # `node_permute_axis` differs between raw atomic-number input (last
    # axis) and pre-embedded features (axis 1).
    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)

    permutation_index = numpy.random.permutation(atom_size)
    permute_atom_data = permute_node(atom_data, permutation_index,
                                     axis=node_permute_axis)
    # Un-flatten pairs to (bs, atom, atom, pair), permute both atom axes
    # consistently, then flatten back to the WeaveNet layout.
    permute_adj_data = adj_data.reshape(
        batch_size, atom_size, atom_size, pair_feature_dim
    ).astype(numpy.float32)
    permute_adj_data = permute_adj(
        permute_adj_data, permutation_index, axis=[1, 2])
    permute_adj_data = permute_adj_data.reshape(
        batch_size, atom_size * atom_size, pair_feature_dim
    ).astype(numpy.float32)
    permute_y_actual = cuda.to_cpu(model(
        permute_atom_data, permute_adj_data).data)
    assert numpy.allclose(y_actual, permute_y_actual, rtol=1.e-4, atol=1.e-6)


def test_forward_cpu_graph_invariant_embed(model, data):
    atom_data, adj_data = data[1], data[2]
    _test_forward_cpu_graph_invariant(
        model, atom_data, adj_data, node_permute_axis=-1)


def test_forward_cpu_graph_invariant_processed(model_processed, data):
    atom_data_processed, adj_data = data[0], data[2]
    _test_forward_cpu_graph_invariant(
        model_processed, atom_data_processed, adj_data, node_permute_axis=1)


if __name__ == '__main__':
    pytest.main([__file__, '-v'])


================================================
FILE: tests/saliency_tests/calculator_tests/test_base_calculator.py
================================================
import numpy
import pytest

import chainer
from chainer.links import Linear

from
chainer_chemistry.link_hooks import is_link_hooks_available
if is_link_hooks_available:
    from chainer_chemistry.link_hooks import VariableMonitorLinkHook
    from chainer_chemistry.saliency.calculator.base_calculator import BaseCalculator  # NOQA
    from chainer_chemistry.saliency.calculator import GaussianNoiseSampler


class DummyCalculator(BaseCalculator):
    """Dummy calculator which returns target_var"""

    def _compute_core(self, *inputs):
        self.model(*inputs)
        return self.get_target_var(inputs)


class DummyModel(chainer.Chain):
    # Linear model with fixed weights so expected saliencies are
    # deterministic; keeps the intermediate activation in `self.h`.

    def __init__(self):
        super(DummyModel, self).__init__()
        with self.init_scope():
            self.l1 = Linear(
                3, 1, initialW=numpy.array([[1, 3, 2]]), nobias=True)
        self.h = None

    def forward(self, x):
        # Store the intermediate so tests can compare against it.
        self.h = self.l1(x)
        out = self.h * 3
        return out


@pytest.fixture
def model():
    return DummyModel()


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_base_calculator_compute(model):
    calculator = DummyCalculator(model)
    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)
    saliency = calculator.compute(x)
    # DummyCalculator returns `saliency` as input `x`.
    assert numpy.allclose(saliency, x)


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_base_calculator_compute_noise_sampler(model):
    # With M=2 samples, compute returns one saliency per noisy sample.
    calculator = DummyCalculator(model)
    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)
    saliency = calculator.compute(x, M=2,
                                  noise_sampler=GaussianNoiseSampler())
    assert saliency.shape == (2, 3)
    # noise is added, should be different from original input
    assert not numpy.allclose(saliency[0], x)
    assert not numpy.allclose(saliency[1], x)


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_base_calculator_compute_target_extractor(model):
    # It should extract `target_var` as after `l1`, which is `model.h`.
calculator = DummyCalculator(
    model, target_extractor=VariableMonitorLinkHook(model.l1))
x = numpy.array([[1, 5, 8]], dtype=numpy.float32)
saliency = calculator.compute(x)
assert numpy.allclose(saliency, model.h.array)


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_base_calculator_aggregate():
    model = DummyModel()
    calculator = DummyCalculator(model)
    saliency = numpy.array([[-1, -1, -1], [2, 2, 2]], dtype=numpy.float32)
    # `aggregate` applies `method` elementwise then averages over the
    # sampling axis: raw -> mean, abs -> mean(|x|), square -> mean(x^2).
    saliency_raw = calculator.aggregate(saliency, method='raw', ch_axis=None)
    assert numpy.allclose(saliency_raw,
                          numpy.array([[0.5, 0.5, 0.5]],
                                      dtype=numpy.float32))
    saliency_abs = calculator.aggregate(saliency, method='abs', ch_axis=None)
    assert numpy.allclose(saliency_abs,
                          numpy.array([[1.5, 1.5, 1.5]],
                                      dtype=numpy.float32))
    saliency_square = calculator.aggregate(saliency, method='square',
                                           ch_axis=None)
    assert numpy.allclose(saliency_square,
                          numpy.array([[2.5, 2.5, 2.5]],
                                      dtype=numpy.float32))


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/saliency_tests/calculator_tests/test_calculator_utils.py
================================================
import numpy
import pytest

from chainer_chemistry.saliency.calculator.calculator_utils import GaussianNoiseSampler  # NOQA


@pytest.mark.parametrize('mode', ['relative', 'absolute'])
def test_gaussian_noise_sampler(mode):
    # Sampled noise must keep the shape of the target array in both modes.
    shape = (3, 4, 5)
    target_array = numpy.random.uniform(0, 1, shape)
    sampler = GaussianNoiseSampler(mode=mode, scale=0.15)
    noise = sampler.sample(target_array)
    assert noise.shape == shape


def test_gaussian_noise_sampler_assert_raises():
    # An unknown `mode` must raise ValueError.
    shape = (3, 4, 5)
    target_array = numpy.random.uniform(0, 1, shape)
    with pytest.raises(ValueError):
        sampler = GaussianNoiseSampler(mode='invalid_mode', scale=0.15)
        sampler.sample(target_array)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE:
tests/saliency_tests/calculator_tests/test_gradient_calculator.py
================================================
import numpy
import pytest

import chainer
from chainer.links import Linear

from chainer_chemistry.link_hooks import is_link_hooks_available
if is_link_hooks_available:
    from chainer_chemistry.link_hooks import VariableMonitorLinkHook
    from chainer_chemistry.saliency.calculator.gradient_calculator import GradientCalculator  # NOQA


class DummyModel(chainer.Chain):
    # Plain linear model with fixed weights, so the gradient w.r.t. the
    # input is exactly `initialW`.

    def __init__(self):
        super(DummyModel, self).__init__()
        with self.init_scope():
            self.l1 = Linear(
                3, 1, initialW=numpy.array([[1, 3, 2]]), nobias=True)

    def forward(self, x):
        return self.l1(x)


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_gradient_calculator():
    model = DummyModel()
    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)
    calculator = GradientCalculator(model)
    saliency = calculator.compute(x)
    # Gradient is equal to `initialW` of DummyModel.
    assert numpy.allclose(saliency, numpy.array([[1, 3, 2]]))


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_gradient_calculator_multiple_output():
    model = DummyModel()
    x = numpy.array([[1, 5, 8], [2, 3, 4]], dtype=numpy.float32)
    calculator = GradientCalculator(model)
    # even batchsize=2 sum is applied automatically inside `compute`,
    # so gradient can be calculated.
    saliency = calculator.compute(x)
    # Gradient is equal to `initialW` of DummyModel.
assert numpy.allclose(saliency, numpy.array([[1, 3, 2]]))


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_gradient_calculator_multiply_target():
    model = DummyModel()
    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)
    calculator = GradientCalculator(model, multiply_target=True)
    saliency = calculator.compute(x)
    # gradient * input
    assert numpy.allclose(saliency, numpy.array([[1, 15, 16]]))


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_gradient_calculator_target_extractor():
    model = DummyModel()
    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)
    # `timing='pre'` extracts the *input* of `l1` as the saliency target;
    # the result gains a leading axis.
    calculator = GradientCalculator(
        model,
        target_extractor=VariableMonitorLinkHook(model.l1, timing='pre'))
    saliency = calculator.compute(x)
    # Gradient is equal to `initialW` of DummyModel.
    assert numpy.allclose(saliency, numpy.array([[[1, 3, 2]]]))
    assert saliency.shape == (1, 1, 3)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/saliency_tests/calculator_tests/test_integrated_gradient_calculator.py
================================================
import numpy
import pytest

import chainer
from chainer.links import Linear

from chainer_chemistry.link_hooks import is_link_hooks_available
if is_link_hooks_available:
    from chainer_chemistry.link_hooks import VariableMonitorLinkHook
    from chainer_chemistry.saliency.calculator.integrated_gradients_calculator import IntegratedGradientsCalculator  # NOQA


class DummyModel(chainer.Chain):
    # Linear model with fixed weights: integrated gradients have the
    # closed form grad * (input - baseline).

    def __init__(self):
        super(DummyModel, self).__init__()
        with self.init_scope():
            self.l1 = Linear(
                3, 1, initialW=numpy.array([[1, 3, 2]]), nobias=True)

    def forward(self, x):
        return self.l1(x)


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_integrated_gradient_calculator():
    model = DummyModel()
    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)
    calculator = IntegratedGradientsCalculator(model,
steps=3)
saliency = calculator.compute(x)
# gradient is always [1, 3, 2] * (input - base) is [1, 5, 8]
assert numpy.allclose(saliency, numpy.array([[1, 15, 16]]))


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_integrated_gradient_calculator_target_extractor():
    model = DummyModel()
    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)
    calculator = IntegratedGradientsCalculator(
        model, steps=4,
        target_extractor=VariableMonitorLinkHook(model.l1, timing='pre'))
    saliency = calculator.compute(x)
    # gradient is always [1, 3, 2] * (input - base) is [1, 5, 8]
    assert numpy.allclose(saliency, numpy.array([[[1, 15, 16]]]))
    assert saliency.shape == (1, 1, 3)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/saliency_tests/calculator_tests/test_occlusion_calculator.py
================================================
import numpy
import pytest

import chainer
from chainer.links import Linear, Convolution2D  # NOQA

from chainer_chemistry.link_hooks import is_link_hooks_available
if is_link_hooks_available:
    from chainer_chemistry.link_hooks import VariableMonitorLinkHook
    from chainer_chemistry.saliency.calculator.occlusion_calculator import OcclusionCalculator  # NOQA


class DummyModel(chainer.Chain):
    # Linear model: occluding feature i changes output by w_i * x_i.

    def __init__(self):
        super(DummyModel, self).__init__()
        with self.init_scope():
            self.l1 = Linear(
                3, 1, initialW=numpy.array([[1, 3, 2]]), nobias=True)

    def forward(self, x):
        return self.l1(x)


class DummyCNNModel(chainer.Chain):
    # All-ones 3x3 convolution: occluding a pixel changes the output by
    # exactly that pixel's value.

    def __init__(self):
        super(DummyCNNModel, self).__init__()
        with self.init_scope():
            self.l1 = Convolution2D(
                1, 1, ksize=3,
                initialW=numpy.ones((1, 1, 3, 3), numpy.float32),
                nobias=True)

    def forward(self, x):
        return self.l1(x)


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_occlusion_calculator():
    model = DummyModel()
    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)
    calculator = OcclusionCalculator(model,
slide_axis=1)
saliency = calculator.compute(x)
assert numpy.allclose(saliency, numpy.array([[[1, 15, 16]]]))
assert saliency.shape == (1, 1, 3)


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_occlusion_calculator_cnn():
    model = DummyCNNModel()
    # x (1, 1, 3, 3): (bs, ch, h, w)
    x = numpy.array([[[[1, 5, 8], [2, 4, 1], [3, 2, 9]]]],
                    dtype=numpy.float32)
    calculator = OcclusionCalculator(model, slide_axis=(2, 3))
    saliency = calculator.compute(x)
    assert numpy.allclose(saliency, x)
    assert saliency.shape == (1, 1, 1, 3, 3)  # (M, bs, ch, h, w)


@pytest.mark.skipif(not is_link_hooks_available,
                    reason='Link Hook is not available')
def test_occlusion_calculator_target_extractor():
    model = DummyModel()
    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)
    calculator = OcclusionCalculator(
        model, slide_axis=1,
        target_extractor=VariableMonitorLinkHook(model.l1, timing='pre'))
    saliency = calculator.compute(x)
    assert numpy.allclose(saliency, numpy.array([[[1, 15, 16]]]))
    assert saliency.shape == (1, 1, 3)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/saliency_tests/visualizer_tests/test_image_visualizer.py
================================================
import os
import sys

import matplotlib.pyplot as plt
import numpy
import pytest

from chainer_chemistry.saliency.visualizer.image_visualizer import ImageVisualizer  # NOQA

is_python_version2 = sys.version_info[0] < 3


@pytest.mark.skipif(is_python_version2,
                    reason='matplotlib configuration is necessary with'
                           'python version 2')
def test_image_visualizer(tmpdir):
    # Only test file is saved without error
    ch = 3
    h = 32
    w = 32
    saliency = numpy.random.uniform(0, 1, (ch, h, w))
    visualizer = ImageVisualizer()

    # 1. test with setting save_filepath
    save_filepath = os.path.join(str(tmpdir), 'tmp.png')
    visualizer.visualize(saliency, save_filepath=save_filepath)
    assert os.path.exists(save_filepath)

    # 2.
# test with `save_filepath=None` runs without error
image = numpy.random.uniform(0, 1, (ch, h, w))
plt.ion()
visualizer.visualize(
    saliency, save_filepath=None, image=image, show_colorbar=True)
plt.close()


# NOTE(review): name looks copy-pasted from the table-visualizer tests --
# this function actually exercises ImageVisualizer.
def test_table_visualizer_assert_raises():
    visualizer = ImageVisualizer()
    with pytest.raises(ValueError):
        # --- Invalid saliency shape ---
        saliency_invalid = numpy.array([0.5, 0.3, 0.2])
        visualizer.visualize(saliency_invalid)

    ch = 3
    h = 32
    w = 32
    saliency = numpy.random.uniform(0, 1, (ch, h, w))
    with pytest.raises(ValueError):
        # --- Invalid sort key ---
        image_invalid = numpy.array([0.5, 0.3, 0.2])
        visualizer.visualize(saliency, image=image_invalid)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/saliency_tests/visualizer_tests/test_mol_visualizer.py
================================================
import os

import numpy
import pytest
from rdkit import Chem

from chainer_chemistry.saliency.visualizer.mol_visualizer import MolVisualizer  # NOQA
from chainer_chemistry.saliency.visualizer.mol_visualizer import SmilesVisualizer  # NOQA


def test_mol_visualizer(tmpdir):
    # Only test file is saved without error
    smiles = 'OCO'
    mol = Chem.MolFromSmiles(smiles)
    saliency = numpy.array([0.5, 0.3, 0.2])
    visualizer = MolVisualizer()

    # 1. test with setting save_filepath
    save_filepath = os.path.join(str(tmpdir), 'tmp.svg')
    svg = visualizer.visualize(saliency, mol, save_filepath=save_filepath)
    assert isinstance(svg, str)
    assert os.path.exists(save_filepath)

    # 2. test with `save_filepath=None` runs without error
    svg = visualizer.visualize(
        saliency, mol, save_filepath=None, visualize_ratio=0.5,)
    assert isinstance(svg, str)


def test_smiles_visualizer(tmpdir):
    # Only test file is saved without error
    smiles = 'OCO'
    saliency = numpy.array([0.5, 0.3, 0.2])
    visualizer = SmilesVisualizer()

    # 1.
# test with setting save_filepath
save_filepath = os.path.join(str(tmpdir), 'tmp.svg')
svg = visualizer.visualize(saliency, smiles, save_filepath=save_filepath,
                           add_Hs=False)
assert os.path.exists(save_filepath)
assert isinstance(svg, str)

save_filepath = os.path.join(str(tmpdir), 'tmp.png')
svg = visualizer.visualize(saliency, smiles, save_filepath=save_filepath,
                           add_Hs=False)
assert isinstance(svg, str)
# TODO(nakago): support png save test.
# Do not test for now (cairosvg is necessary)
# assert os.path.exists(save_filepath)

# 2. test with `save_filepath=None` runs without error
svg = visualizer.visualize(
    saliency, smiles, save_filepath=None, visualize_ratio=0.5,
    add_Hs=False, use_canonical_smiles=True)
assert isinstance(svg, str)


def test_mol_visualizer_assert_raises(tmpdir):
    visualizer = MolVisualizer()
    smiles = 'OCO'
    mol = Chem.MolFromSmiles(smiles)
    with pytest.raises(ValueError):
        # --- Invalid saliency shape ---
        saliency = numpy.array([[0.5, 0.3, 0.2], [0.5, 0.3, 0.2]])
        visualizer.visualize(saliency, mol)

    with pytest.raises(ValueError):
        # --- Invalid sort key ---
        saliency = numpy.array([0.5, 0.3, 0.2])
        invalid_ext_filepath = os.path.join(str(tmpdir), 'tmp.hoge')
        visualizer.visualize(saliency, mol,
                             save_filepath=invalid_ext_filepath)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/saliency_tests/visualizer_tests/test_table_visualizer.py
================================================
import os
import sys

import matplotlib.pyplot as plt
import numpy
import pytest

from chainer_chemistry.saliency.visualizer.table_visualizer import TableVisualizer  # NOQA

is_python_version2 = sys.version_info[0] < 3


@pytest.mark.skipif(is_python_version2,
                    reason='matplotlib configuration is necessary with'
                           'python version 2')
def test_table_visualizer(tmpdir):
    # Only test file is saved without error
    saliency = numpy.array([0.5, 0.3, 0.2])
    visualizer = TableVisualizer()

    # 1.
# test with setting save_filepath
save_filepath = os.path.join(str(tmpdir), 'tmp.png')
visualizer.visualize(saliency, save_filepath=save_filepath)
assert os.path.exists(save_filepath)

# 2. test with `save_filepath=None` runs without error
plt.ion()
visualizer.visualize(
    saliency, save_filepath=None, feature_names=['hoge', 'huga', 'piyo'],
    num_visualize=2)
plt.close()


def test_table_visualizer_assert_raises():
    visualizer = TableVisualizer()
    with pytest.raises(ValueError):
        # --- Invalid saliency shape ---
        saliency = numpy.array([[0.5, 0.3, 0.2], [0.5, 0.3, 0.2]])
        visualizer.visualize(saliency)

    with pytest.raises(ValueError):
        # --- Invalid sort key ---
        saliency = numpy.array([0.5, 0.3, 0.2])
        visualizer.visualize(saliency, sort='invalidkey')

    with pytest.raises(ValueError):
        # --- Invalid feature_names key ---
        saliency = numpy.array([0.5, 0.3, 0.2])
        feature_names = ['a', 'b', 'c', 'd']
        visualizer.visualize(saliency, feature_names=feature_names)


if __name__ == '__main__':
    pytest.main([__file__, '-v', '-s'])


================================================
FILE: tests/saliency_tests/visualizer_tests/test_visualizer_utils.py
================================================
import numpy
import pytest

from chainer_chemistry.saliency.visualizer.visualizer_utils import abs_max_scaler  # NOQA
from chainer_chemistry.saliency.visualizer.visualizer_utils import min_max_scaler  # NOQA
from chainer_chemistry.saliency.visualizer.visualizer_utils import normalize_scaler  # NOQA
from chainer_chemistry.saliency.visualizer.visualizer_utils import red_blue_cmap  # NOQA


def test_abs_max_scaler():
    # Scales by the maximum absolute value.
    saliency = numpy.array([1., 2., 3.])
    result = abs_max_scaler(saliency)
    expected = numpy.array([1. / 3, 2.
/ 3., 1.])
assert numpy.allclose(result, expected)

# test with 0 arrays
saliency = numpy.array([0, 0, 0])
result = abs_max_scaler(saliency)
expected = numpy.array([0, 0, 0])
assert numpy.allclose(result, expected)


def test_min_max_scaler():
    # Maps the minimum to 0 and the maximum to 1.
    saliency = numpy.array([1., -3., 3.])
    result = min_max_scaler(saliency)
    expected = numpy.array([4. / 6, 0., 1.])
    assert numpy.allclose(result, expected)

    # test with 0 arrays
    saliency = numpy.array([0, 0, 0])
    result = min_max_scaler(saliency)
    expected = numpy.array([0, 0, 0])
    assert numpy.allclose(result, expected)


def test_normalize_scaler():
    # Divides by the total so the result sums to 1.
    saliency = numpy.array([1., 2., 3.])
    result = normalize_scaler(saliency)
    expected = numpy.array([1./6., 2./6, 3./6.])
    assert numpy.allclose(result, expected)

    # test with 0 arrays
    saliency = numpy.array([0, 0, 0])
    result = normalize_scaler(saliency)
    expected = numpy.array([0, 0, 0])
    assert numpy.allclose(result, expected)


def test_red_blue_cmap():
    # Positive saliency maps to red, zero to white, negative to blue.
    assert red_blue_cmap(1) == (1., 0., 0.)  # Red
    assert red_blue_cmap(0) == (1., 1., 1.)  # White
    assert red_blue_cmap(-1) == (0., 0., 1.)
# Blue if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/test_init.py ================================================ import pkg_resources import chainer_chemistry import pytest def test_version(): expect = pkg_resources.get_distribution('chainer_chemistry').version actual = chainer_chemistry.__version__ assert expect == actual if __name__ == '__main__': pytest.main([__file__, '-v']) ================================================ FILE: tests/training_tests/extensions_tests/test_auto_print_report.py ================================================ import tempfile import pytest import mock from chainer import testing from chainer.training import extensions class TestAutoPrintReport(object): def _setup(self, stream=None, delete_flush=False): self.logreport = mock.MagicMock(spec=extensions.LogReport( ['epoch'], trigger=(1, 'iteration'), log_name=None)) if stream is None: self.stream = mock.MagicMock() if delete_flush: del self.stream.flush else: self.stream = stream self.report = extensions.PrintReport( ['epoch'], log_report=self.logreport, out=self.stream) self.trainer = testing.get_trainer_with_mock_updater( stop_trigger=(1, 'iteration')) self.trainer.extend(self.logreport) self.trainer.extend(self.report) self.logreport.log = [{'epoch': 0}] def test_stream_with_flush_is_flushed(self): self._setup(delete_flush=False) assert hasattr(self.stream, 'flush') self.stream.flush.assert_not_called() self.report(self.trainer) self.stream.flush.assert_called_with() def test_stream_without_flush_raises_no_exception(self): self._setup(delete_flush=True) assert not hasattr(self.stream, 'flush') self.report(self.trainer) def test_real_stream_raises_no_exception(self): with tempfile.TemporaryFile(mode='w') as stream: self._setup(stream=stream) self.report(self.trainer) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: 
tests/training_tests/extensions_tests/test_prc_auc_evaluator.py ================================================ """ PRCAUCEvaluator uses `sklearn.metrics.precision_recall_curve` and `sklearn.metrics.auc` internally. Refer: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.\ prc_auc_score.html """ import numpy import pytest import chainer from chainer.iterators import SerialIterator from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset # NOQA from chainer_chemistry.training.extensions.prc_auc_evaluator import PRCAUCEvaluator # NOQA @pytest.fixture def data0(): # `t` is correct label, `y` is dummy predict value by predictor t = numpy.array([0, 0, 1, 1], dtype=numpy.int32)[:, None] y = numpy.array([0.1, 0.4, 0.35, 0.8], dtype=numpy.float32)[:, None] return y, t @pytest.fixture def data1(): # `t` is correct label, `y` is dummy predict value by predictor t = numpy.array([0, 1, -1, 0, 2, -1], dtype=numpy.int32)[:, None] y = numpy.array([0.1, 0.35, 0.2, 0.4, 0.8, 0.35], dtype=numpy.float32)[:, None] return y, t @pytest.fixture def data2(): # Example of bad example case # `t` only contains correct label, `y` is dummy predict value by predictor t = numpy.array([0, 0, 0, 0], dtype=numpy.int32)[:, None] y = numpy.array([0.1, 0.4, 0.35, 0.8], dtype=numpy.float32)[:, None] return y, t class DummyPredictor(chainer.Chain): def __call__(self, y): # it receives `y` and return `y` directly return y def test_prc_auc_evaluator(data0, data1): _test_prc_auc_evaluator_default_args(data0) _test_prc_auc_evaluator_with_labels(data1) def _test_prc_auc_evaluator_default_args(data0): predictor = DummyPredictor() dataset = NumpyTupleDataset(*data0) iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False) evaluator = PRCAUCEvaluator( iterator, predictor, name='train', pos_labels=1, ignore_labels=None ) repo = chainer.Reporter() repo.add_observer('target', predictor) with repo: observation = evaluator.evaluate() expected_prc_auc = 0.7916 
pytest.approx(observation['target/prc_auc'], expected_prc_auc) # --- test __call__ --- result = evaluator() pytest.approx(result['train/main/prc_auc'], expected_prc_auc) def _test_prc_auc_evaluator_with_labels(data1): """test `pos_labels` and `ignore_labels` behavior""" predictor = DummyPredictor() dataset = NumpyTupleDataset(*data1) iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False) evaluator = PRCAUCEvaluator( iterator, predictor, name='val', pos_labels=[1, 2], ignore_labels=-1, ) # --- test evaluate --- repo = chainer.Reporter() repo.add_observer('target', predictor) with repo: observation = evaluator.evaluate() expected_prc_auc = 0.7916 pytest.approx(observation['target/prc_auc'], expected_prc_auc) # --- test __call__ --- result = evaluator() pytest.approx(result['val/main/prc_auc'], expected_prc_auc) def test_prc_auc_evaluator_raise_value_error(data2): with pytest.raises(ValueError): _test_prc_auc_evaluator_raise_error(data2, raise_value_error=True) res = _test_prc_auc_evaluator_raise_error(data2, raise_value_error=False) assert numpy.isnan(res) def _test_prc_auc_evaluator_raise_error(data, raise_value_error=True): predictor = DummyPredictor() dataset = NumpyTupleDataset(*data) iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False) evaluator = PRCAUCEvaluator( iterator, predictor, name='train', pos_labels=1, ignore_labels=None, raise_value_error=raise_value_error ) repo = chainer.Reporter() repo.add_observer('target', predictor) with repo: observation = evaluator.evaluate() return observation['target/prc_auc'] if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/training_tests/extensions_tests/test_r2_score_evaluator.py ================================================ import numpy import pytest import chainer from chainer import cuda from chainer.iterators import SerialIterator from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset from 
chainer_chemistry.training.extensions.r2_score_evaluator import R2ScoreEvaluator # NOQA @pytest.fixture def inputs(): numpy.random.seed(0) x0 = numpy.random.uniform(-1, 1, (4, 3)).astype('f') # Add sufficient margin to prevent computational error diff = numpy.random.uniform(-1, 1, (4, 3)).astype('f') diff[abs(diff) < 0.01] = 0.5 x1 = x0 + diff x2 = numpy.asarray([[0.3, numpy.nan, 0.2], [numpy.nan, 0.1, 0.5], [0.9, 0.7, numpy.nan], [0.2, -0.3, 0.4]]).astype('f') return x0, x1, x2 def r2_score(pred, true, sample_weight=None, multioutput="uniform_average", ignore_nan=False): pred = cuda.to_cpu(pred) true = cuda.to_cpu(true) diff = pred - true dev = true - numpy.mean(true, axis=0) if ignore_nan: diff[numpy.isnan(diff)] = 0. dev[numpy.isnan(dev)] = 0. SS_res = numpy.asarray( numpy.sum(diff ** 2, axis=0)) SS_tot = numpy.asarray( numpy.sum(dev ** 2, axis=0)) if multioutput == 'uniform_average': if numpy.any(SS_tot == 0): return 0.0 else: return (1 - SS_res / SS_tot).mean() elif multioutput == 'raw_values': if numpy.any(SS_tot == 0): # Assign dummy value to avoid zero-division SS_tot_iszero = SS_tot == 0 SS_tot[SS_tot_iszero] = 1 return numpy.where(SS_tot_iszero, 0.0, 1 - SS_res / SS_tot) else: return 1 - SS_res / SS_tot class DummyPredictor(chainer.Chain): def __call__(self, y): # it receives `y` and return `y` directly return y def test_r2_score_evaluator(inputs): _test_r2_score_evaluator(inputs) _test_r2_score_evaluator_ignore_nan(inputs) _test_r2_score_evaluator_ignore_nan_with_nonnan_value(inputs) _test_r2_score_evaluator_raw_values(inputs) @pytest.mark.gpu def test_r2_score_evaluator_gpu(inputs): x0, x1, x2 = inputs _test_r2_score_evaluator((cuda.to_gpu(x0), cuda.to_gpu(x1), None)) _test_r2_score_evaluator_ignore_nan( (cuda.to_gpu(x0), None, cuda.to_gpu(x2))) _test_r2_score_evaluator_ignore_nan_with_nonnan_value( (cuda.to_gpu(x0), cuda.to_gpu(x1), None)) _test_r2_score_evaluator_raw_values( (cuda.to_gpu(x0), cuda.to_gpu(x1), None)) def 
_test_r2_score_evaluator(inputs): predictor = DummyPredictor() x0, x1, _ = inputs dataset = NumpyTupleDataset(x0, x1) iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False) evaluator = R2ScoreEvaluator(iterator, predictor, name='train') repo = chainer.Reporter() repo.add_observer('target', predictor) with repo: observation = evaluator.evaluate() expected = r2_score(x0, x1) pytest.approx(observation['target/r2_score'], expected) # --- test __call__ --- result = evaluator() pytest.approx(result['train/main/r2_score'], expected) def _test_r2_score_evaluator_ignore_nan(inputs): predictor = DummyPredictor() x0, _, x2 = inputs dataset = NumpyTupleDataset(x0, x2) iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False) evaluator = R2ScoreEvaluator( iterator, predictor, name='train', ignore_nan=True) repo = chainer.Reporter() repo.add_observer('target', predictor) with repo: observation = evaluator.evaluate() expected = r2_score(x0, x2, ignore_nan=True) pytest.approx(observation['target/r2_score'], expected) # --- test __call__ --- result = evaluator() pytest.approx(result['train/main/r2_score'], expected) def _test_r2_score_evaluator_ignore_nan_with_nonnan_value(inputs): predictor = DummyPredictor() x0, x1, _ = inputs dataset = NumpyTupleDataset(x0, x1) iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False) evaluator = R2ScoreEvaluator( iterator, predictor, name='train', ignore_nan=True) repo = chainer.Reporter() repo.add_observer('target', predictor) with repo: observation = evaluator.evaluate() expected = r2_score(x0, x1, ignore_nan=True) pytest.approx(observation['target/r2_score'], expected) # --- test __call__ --- result = evaluator() pytest.approx(result['train/main/r2_score'], expected) def _test_r2_score_evaluator_raw_values(inputs): predictor = DummyPredictor() x0, x1, _ = inputs dataset = NumpyTupleDataset(x0, x1) iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False) evaluator = R2ScoreEvaluator( iterator, 
predictor, name='train', multioutput='raw_values') repo = chainer.Reporter() repo.add_observer('target', predictor) with repo: observation = evaluator.evaluate() expected = r2_score(x0, x1, multioutput='raw_values') pytest.approx(observation['target/r2_score'], expected) # --- test __call__ --- result = evaluator() pytest.approx(result['train/main/r2_score'], expected) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/training_tests/extensions_tests/test_roc_auc_evaluator.py ================================================ """ ROCAUCEvaluator uses `sklearn.metrics.roc_auc_score` internally. Refer: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.\ roc_auc_score.html """ import numpy import pytest import chainer from chainer.iterators import SerialIterator from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset # NOQA from chainer_chemistry.training.extensions.roc_auc_evaluator import ROCAUCEvaluator # NOQA @pytest.fixture def data0(): # `t` is correct label, `y` is dummy predict value by predictor t = numpy.array([0, 0, 1, 1], dtype=numpy.int32)[:, None] y = numpy.array([0.1, 0.4, 0.35, 0.8], dtype=numpy.float32)[:, None] return y, t @pytest.fixture def data1(): # `t` is correct label, `y` is dummy predict value by predictor t = numpy.array([0, 1, -1, 0, 2, -1], dtype=numpy.int32)[:, None] y = numpy.array([0.1, 0.35, 0.2, 0.4, 0.8, 0.35], dtype=numpy.float32)[:, None] return y, t @pytest.fixture def data2(): # Example of bad example case # `t` only contains correct label, `y` is dummy predict value by predictor t = numpy.array([0, 0, 0, 0], dtype=numpy.int32)[:, None] y = numpy.array([0.1, 0.4, 0.35, 0.8], dtype=numpy.float32)[:, None] return y, t class DummyPredictor(chainer.Chain): def __call__(self, y): # it receives `y` and return `y` directly return y def test_roc_auc_evaluator(data0, data1): _test_roc_auc_evaluator_default_args(data0) 
_test_roc_auc_evaluator_with_labels(data1)


def _test_roc_auc_evaluator_default_args(data0):
    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data0)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = ROCAUCEvaluator(
        iterator, predictor, name='train',
        pos_labels=1, ignore_labels=None
    )
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    expected_roc_auc = 0.75
    # print('observation ', observation)
    assert observation['target/roc_auc'] == expected_roc_auc

    # --- test __call__ ---
    result = evaluator()
    # print('result ', result)
    assert result['train/main/roc_auc'] == expected_roc_auc


def _test_roc_auc_evaluator_with_labels(data1):
    """test `pos_labels` and `ignore_labels` behavior"""
    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data1)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = ROCAUCEvaluator(
        iterator, predictor, name='val',
        pos_labels=[1, 2], ignore_labels=-1,
    )

    # --- test evaluate ---
    repo = chainer.Reporter()
    repo.add_observer('target', predictor)
    with repo:
        observation = evaluator.evaluate()

    expected_roc_auc = 0.75
    # print('observation ', observation)
    assert observation['target/roc_auc'] == expected_roc_auc

    # --- test __call__ ---
    result = evaluator()
    # print('result ', result)
    assert result['val/main/roc_auc'] == expected_roc_auc


def test_roc_auc_evaluator_raise_value_error(data2):
    # All-negative labels make ROC-AUC undefined: raises by default,
    # returns NaN when `raise_value_error=False`.
    with pytest.raises(ValueError):
        _test_roc_auc_evaluator_raise_error(data2, raise_value_error=True)

    res = _test_roc_auc_evaluator_raise_error(data2, raise_value_error=False)
    assert numpy.isnan(res)


def _test_roc_auc_evaluator_raise_error(data, raise_value_error=True):
    predictor = DummyPredictor()
    dataset = NumpyTupleDataset(*data)

    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)
    evaluator = ROCAUCEvaluator(
        iterator, predictor, name='train',
        pos_labels=1, ignore_labels=None,
        raise_value_error=raise_value_error
    )
    repo =
chainer.Reporter() repo.add_observer('target', predictor) with repo: observation = evaluator.evaluate() return observation['target/roc_auc'] if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/utils_tests/test_extend.py ================================================ import numpy import pytest from chainer_chemistry.utils.extend import extend_node, extend_adj # NOQA batchsize = 2 num_node = 3 ch = 5 x_2d = numpy.arange(batchsize * num_node).reshape( (batchsize, num_node)) x_3d = numpy.arange(batchsize * num_node * ch).reshape( (batchsize, num_node, ch)) adj_3d = numpy.arange(batchsize * num_node * num_node).reshape( (batchsize, num_node, num_node)) @pytest.mark.parametrize('x', [x_2d, x_2d.astype(numpy.float32)]) def test_extend_node_2d(x): x_extended = extend_node(x, out_size=6) x_expected = numpy.array([[0, 1, 2, 0, 0, 0], [3, 4, 5, 0, 0, 0]], dtype=x.dtype) print('x type', x_extended.dtype) assert x_extended.shape == (batchsize, 6) assert x_extended.dtype == x.dtype assert numpy.array_equal(x_extended, x_expected) @pytest.mark.parametrize('x', [x_3d, x_3d.astype(numpy.float32)]) @pytest.mark.parametrize('axis', [-1, 2]) def test_extend_node_3d(x, axis): x_extended = extend_node(x, out_size=6, axis=axis) x_expected = numpy.array([ [[0, 1, 2, 3, 4, 0], [5, 6, 7, 8, 9, 0], [10, 11, 12, 13, 14, 0]], [[15, 16, 17, 18, 19, 0], [20, 21, 22, 23, 24, 0], [25, 26, 27, 28, 29, 0]]]) assert x_extended.shape == (batchsize, num_node, 6) assert x_extended.dtype == x.dtype assert numpy.array_equal(x_extended, x_expected) def test_extend_node_assert_raises(): with pytest.raises(ValueError): extend_node(x_2d, out_size=1) @pytest.mark.parametrize('adj', [adj_3d, adj_3d.astype(numpy.float32)]) def test_extend_adj(adj): adj_extended = extend_adj(adj, out_size=6) assert adj_extended.shape == (batchsize, 6, 6) assert adj_extended.dtype == adj.dtype assert numpy.array_equal(adj_extended[:, :num_node, :num_node], 
adj) assert numpy.alltrue(adj_extended[:, num_node:, :] == 0) assert numpy.alltrue(adj_extended[:, :, num_node:] == 0) def test_extend_adj_assert_raises(): with pytest.raises(ValueError): extend_adj(adj_3d, out_size=1) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/utils_tests/test_json_utils.py ================================================ import os import numpy import pytest from chainer_chemistry.utils.json_utils import load_json from chainer_chemistry.utils.json_utils import save_json params = { 'a_int': 1, 'b_str': 'string', 'c_list': [1, 2, 3], 'd_tuple': (1, 2), 'n_int_scalar': numpy.array(1), 'n_int_array': numpy.array([1]), 'n_float': numpy.array([[1.0, 2.0], [3.0, 4.0]]), } try: # pathlib is not available with python 2.7 from pathlib import Path params['path'] = Path('/tmp/hoge') _is_pathlib_available = True except ImportError: _is_pathlib_available = False params_invalid = { 'lambda_function': lambda x: x * 2, } def test_save_json(tmpdir): filepath = os.path.join(str(tmpdir), 'tmp.json') save_json(filepath, params) assert os.path.exists(filepath) def test_save_json_ignore_error(tmpdir): filepath = os.path.join(str(tmpdir), 'tmp.json') # 1. should raise error when ignore_error=False with pytest.raises(TypeError): save_json(filepath, params_invalid, ignore_error=False) # 2. should not raise error when ignore_error=False save_json(filepath, params_invalid, ignore_error=True) def test_load_json(tmpdir): filepath = os.path.join(str(tmpdir), 'tmp.json') # TODO(nakago): better to remove `save_json` dependency for unittest. 
save_json(filepath, params) params_load = load_json(filepath) expected_params_load = { 'a_int': 1, 'b_str': 'string', 'c_list': [1, 2, 3], 'd_tuple': [1, 2], 'n_float': [[1.0, 2.0], [3.0, 4.0]], 'n_int_array': [1], 'n_int_scalar': 1, } if _is_pathlib_available: # PurePath is converted to str expected_params_load['path'] = '/tmp/hoge' assert params_load == expected_params_load if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/utils_tests/test_permutation.py ================================================ import numpy import pytest from chainer_chemistry.utils.permutation import permute_adj from chainer_chemistry.utils.permutation import permute_node batchsize = 1 num_node = 3 ch = 5 @pytest.mark.parametrize('x', [ numpy.random.randint(10, size=(batchsize, num_node), dtype=numpy.int32), numpy.random.random(size=(batchsize, num_node)) ]) def test_permute_node_2d(x): perm = numpy.random.permutation(num_node) x_perm = permute_node(x, perm) assert x.shape == x_perm.shape for i in range(num_node): assert numpy.allclose(x[:, perm[i]], x_perm[:, i]) @pytest.mark.parametrize('x', [ numpy.random.randint(10, size=(batchsize, num_node, ch), dtype=numpy.int32), numpy.random.random(size=(batchsize, num_node, ch)) ]) @pytest.mark.parametrize('axis', [-1, -2, 1, 2]) def test_permute_node_3d(x, axis): perm = numpy.random.permutation(x.shape[axis]) x_perm = permute_node(x, perm, axis=axis) assert x.shape == x_perm.shape if axis == -1 or axis == 2: for i in range(num_node): assert numpy.allclose(x[:, :, perm[i]], x_perm[:, :, i]) else: for i in range(num_node): assert numpy.allclose(x[:, perm[i], :], x_perm[:, i, :]) @pytest.mark.parametrize('adj', [ numpy.random.randint(10, size=(batchsize, num_node, num_node), dtype=numpy.int32), numpy.random.randint(10, size=(batchsize, ch, num_node, num_node), dtype=numpy.int32) ]) def test_permute_adj(adj): perm = numpy.random.permutation(num_node) adj_perm = 
permute_adj(adj, perm) assert adj.shape == adj_perm.shape for i in range(num_node): for j in range(num_node): assert numpy.array_equal( adj[..., perm[i], perm[j]], adj_perm[..., i, j]) def test_permute_adj_axis12(): adj = numpy.random.randint( 10, size=(batchsize, num_node, num_node, ch), dtype=numpy.int32) perm = numpy.random.permutation(num_node) adj_perm = permute_adj(adj, perm, axis=[1, 2]) assert adj.shape == adj_perm.shape for i in range(num_node): for j in range(num_node): assert numpy.allclose( adj[:, perm[i], perm[j], :], adj_perm[:, i, j, :]) def test_permute_adj_error(): adj = numpy.random.randint( 10, size=(batchsize, ch, num_node, num_node), dtype=numpy.int32) perm = numpy.random.permutation(num_node) with pytest.raises(TypeError): permute_adj(adj, perm, axis=1) with pytest.raises(ValueError): permute_adj(adj, perm, axis=[1, 2, 3]) if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/utils_tests/test_sparse_utils.py ================================================ import numpy import pytest from chainer_chemistry.utils.sparse_utils import convert_sparse_with_edge_type from chainer_chemistry.utils.sparse_utils import sparse_utils_available if not sparse_utils_available(): pytest.skip('sparse_utils is available if chainer>=5 and numpy>=1.16', allow_module_level=True) def naive_convert(data, row, col, edge_type, num_edge_type): mb, length = data.shape new_mb = mb * num_edge_type new_data = [[] for _ in range(new_mb)] new_row = [[] for _ in range(new_mb)] new_col = [[] for _ in range(new_mb)] for i in range(mb): for j in range(length): k = i * num_edge_type + edge_type[i, j] new_data[k].append(data[i, j]) new_row[k].append(row[i, j]) new_col[k].append(col[i, j]) new_length = max(len(arr) for arr in new_data) def pad(arr_2d, dtype=numpy.int32): for arr in arr_2d: arr.extend([0] * (new_length - len(arr))) return numpy.array(arr_2d) ret = [] for d, r, c in zip(pad(new_data, data.dtype), 
pad(new_row), pad(new_col)): ret.append(list(sorted(zip(d, r, c)))) return ret @pytest.mark.parametrize('in_shape,num_edge_type', [ ((2, 4), 4), ((5, 10), 2), ((1, 1), 1), ((10, 1), 10), ((10, 10), 10), ]) def test_convert_sparse_with_edge_type(in_shape, num_edge_type): num_nodes = 10 data = numpy.random.uniform(size=in_shape).astype(numpy.float32) row = numpy.random.randint(size=in_shape, low=0, high=num_nodes) col = numpy.random.randint(size=in_shape, low=0, high=num_nodes) edge_type = numpy.random.randint(size=in_shape, low=0, high=num_edge_type) received = convert_sparse_with_edge_type(data, row, col, num_nodes, edge_type, num_edge_type) expected = naive_convert(data, row, col, edge_type, num_edge_type) # check by minibatch-wise for i, expected_batch in enumerate(expected): d = received.data.data[i, :].tolist() r = received.row[i, :].tolist() c = received.col[i, :].tolist() received_batch = list(sorted(zip(d, r, c))) assert expected_batch == received_batch if __name__ == '__main__': pytest.main([__file__, '-v', '-s']) ================================================ FILE: tests/utils_tests/test_train_utils.py ================================================ import chainer import numpy import pytest from chainer.iterators import SerialIterator from chainer import links import chainerx from chainer_chemistry.datasets import NumpyTupleDataset from chainer_chemistry.models import Regressor from chainer_chemistry.utils import run_train input_dim = 5 output_dim = 7 train_data_size = 9 valid_data_size = 8 batch_size = 4 @pytest.fixture def model(): return Regressor(links.Linear(None, output_dim)) @pytest.fixture def train_data(): x = numpy.random.uniform( 0, 1, (train_data_size, input_dim)).astype(numpy.float32) y = numpy.random.uniform( 0, 1, (train_data_size, output_dim)).astype(numpy.float32) return NumpyTupleDataset(x, y) @pytest.fixture def valid_data(): x = numpy.random.uniform( 0, 1, (valid_data_size, input_dim)).astype(numpy.float32) y = numpy.random.uniform( 
0, 1, (valid_data_size, output_dim)).astype(numpy.float32) return NumpyTupleDataset(x, y) def test_run_train_cpu(model, train_data, valid_data): run_train(model, train_data, valid=valid_data, epoch=1, batch_size=8) def test_run_train_cpu_iterator(model, train_data, valid_data): train_iter = SerialIterator(train_data, batch_size=4) valid_iter = SerialIterator(valid_data, batch_size=4, shuffle=False, repeat=False) run_train(model, train_iter, valid=valid_iter, epoch=1, batch_size=8, extensions_list=[lambda t: None]) def test_run_train_invalid(model, train_data): with pytest.raises(ValueError): run_train(model, train_data, optimizer=1) @pytest.mark.gpu def test_run_train_gpu(model, train_data, valid_data): device = 0 model.to_gpu(device) run_train(model, train_data, valid=valid_data, epoch=1, batch_size=8, device=device) @pytest.mark.skipif(not chainerx.is_available(), reason='chainerx is not available') def test_run_train_chainerx_native(model, train_data, valid_data): device = chainer.get_device('native') model.to_device(device) run_train(model, train_data, valid=valid_data, epoch=1, batch_size=8, device=device) @pytest.mark.gpu @pytest.mark.skipif(not chainerx.is_available(), reason='chainerx is not available') def test_run_train_chainerx_cuda0(model, train_data, valid_data): device = chainer.get_device('cuda:0') model.to_device(device) run_train(model, train_data, valid=valid_data, epoch=1, batch_size=8, device=device) if __name__ == '__main__': pytest.main([__file__, '-v', '-s'])