[
  {
    "path": ".codecov.yml",
    "content": "coverage:\n  status:\n    # pull-requests only\n    # Allow to drop coverage\n    project:\n      default:\n        threshold: 5.0%\n    patch:\n      default:\n        threshold: 20.0%\ncomment:\n  layout: \"header, diff\"\n  require_changes: false\n  branches: null\n  behavior: default\n  flags: null\n  paths: null"
  },
  {
    "path": ".flexci/config.pbtxt",
    "content": "# DO NOT MODIFY THIS FILE MANUALLY.\n# USE gen_config.py INSTEAD.\n\nconfigs {\n  key: \"chainerch.py37.stable.cpu.chx\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    environment_variables {\n      key: \"PYTHON\"\n      value: \"37\"\n    }\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"stable\"\n    }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"1\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"0\"\n    }\n  }\n}\nconfigs {\n  key: \"chainerch.py37.stable.gpu.chx\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n      gpu: 1\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    environment_variables {\n      key: \"PYTHON\"\n      value: \"37\"\n    }\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"stable\"\n    }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"1\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"1\"\n    }\n  }\n}\nconfigs {\n  key: \"chainerch.py37.stable.cpu\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    environment_variables {\n      key: \"PYTHON\"\n      value: \"37\"\n    }\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"stable\"\n    }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"0\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"0\"\n    }\n  }\n}\nconfigs {\n  key: \"chainerch.py37.stable.gpu\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n      gpu: 1\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    environment_variables {\n      key: \"PYTHON\"\n      value: \"37\"\n    }\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"stable\"\n   
 }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"0\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"1\"\n    }\n  }\n}\nconfigs {\n  key: \"chainerch.py37.latest.cpu.chx\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    environment_variables {\n      key: \"PYTHON\"\n      value: \"37\"\n    }\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"latest\"\n    }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"1\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"0\"\n    }\n  }\n}\nconfigs {\n  key: \"chainerch.py37.latest.gpu.chx\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n      gpu: 1\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    environment_variables {\n      key: \"PYTHON\"\n      value: \"37\"\n    }\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"latest\"\n    }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"1\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"1\"\n    }\n  }\n}\nconfigs {\n  key: \"chainerch.py37.latest.cpu\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    environment_variables {\n      key: \"PYTHON\"\n      value: \"37\"\n    }\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"latest\"\n    }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"0\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"0\"\n    }\n  }\n}\nconfigs {\n  key: \"chainerch.py37.latest.gpu\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n      gpu: 1\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    environment_variables {\n      key: \"PYTHON\"\n      value: \"37\"\n    
}\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"latest\"\n    }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"0\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"1\"\n    }\n  }\n}\nconfigs {\n  key: \"chainerch.py37.base.cpu.chx\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    environment_variables {\n      key: \"PYTHON\"\n      value: \"37\"\n    }\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"base\"\n    }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"1\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"0\"\n    }\n  }\n}\nconfigs {\n  key: \"chainerch.py37.base.gpu.chx\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n      gpu: 1\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    environment_variables {\n      key: \"PYTHON\"\n      value: \"37\"\n    }\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"base\"\n    }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"1\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"1\"\n    }\n  }\n}\nconfigs {\n  key: \"chainerch.py37.base.cpu\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    environment_variables {\n      key: \"PYTHON\"\n      value: \"37\"\n    }\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"base\"\n    }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"0\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"0\"\n    }\n  }\n}\nconfigs {\n  key: \"chainerch.py37.base.gpu\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n      gpu: 1\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    
environment_variables {\n      key: \"PYTHON\"\n      value: \"37\"\n    }\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"base\"\n    }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"0\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"1\"\n    }\n  }\n}\nconfigs {\n  key: \"chainerch.py36.stable.gpu\"\n  value {\n    requirement {\n      cpu: 4\n      memory: 16\n      disk: 10\n      gpu: 1\n    }\n    command: \"bash .flexci/pytest_script.sh\"\n    environment_variables {\n      key: \"PYTHON\"\n      value: \"36\"\n    }\n    environment_variables {\n      key: \"CHAINER\"\n      value: \"stable\"\n    }\n    environment_variables {\n      key: \"CHAINERX\"\n      value: \"0\"\n    }\n    environment_variables {\n      key: \"GPU\"\n      value: \"1\"\n    }\n  }\n}\n"
  },
  {
    "path": ".flexci/gen_config.py",
    "content": "\"\"\"Config generator for Flex CI\nUsage:\n    $ python gen_config.py > config.pbtxt\n\"\"\"\n\nfrom __future__ import print_function\nfrom collections import OrderedDict\nimport itertools\n\n\ndef test_config(python, chainer, target, chainerx):\n\n    if chainerx:\n        s_chainerx = '.chx'\n    else:\n        s_chainerx = ''\n    key = 'chainerch.py{}.{}.{}{}'.format(python, chainer, target, s_chainerx)\n\n    value = OrderedDict((\n        ('requirement', OrderedDict((\n            ('cpu', 4),\n            ('memory', 16),\n            ('disk', 10),\n        ))),\n        ('command', 'bash .flexci/pytest_script.sh'),\n        ('environment_variables', [\n            ('PYTHON', str(python)),\n            ('CHAINER', chainer),\n            ('CHAINERX', '1' if chainerx else '0'),\n            ('GPU', '1' if target == 'gpu' else '0'),\n        ]),\n    ))\n\n    if target == 'gpu':\n        value['requirement']['gpu'] = 1\n\n    return key, value\n\n\ndef main():\n    configs = []\n\n    for python, chainer in itertools.product(\n            (37,), ('stable', 'latest', 'base')):\n        for chainerx in (True, False):\n            configs.append(test_config(python, chainer, 'cpu', chainerx))\n            configs.append(test_config(python, chainer, 'gpu', chainerx))\n    # small test in python 36\n    configs.append(test_config(36, 'stable', 'gpu', False))\n\n    print('# DO NOT MODIFY THIS FILE MANUALLY.')\n    print('# USE gen_config.py INSTEAD.')\n    print()\n\n    dump_pbtxt('configs', configs)\n\n\ndef dump_pbtxt(key, value, level=0):\n    indent = '  ' * level\n    if isinstance(value, int):\n        print('{}{}: {}'.format(indent, key, value))\n    elif isinstance(value, str):\n        print('{}{}: \"{}\"'.format(indent, key, value))\n    elif isinstance(value, list):\n        for k, v in value:\n            print('{}{} {{'.format(indent, key))\n            dump_pbtxt('key', k, level + 1)\n            dump_pbtxt('value', v, level + 1)\n      
      print('{}}}'.format(indent))\n    elif isinstance(value, dict):\n        print('{}{} {{'.format(indent, key))\n        for k, v in value.items():\n            dump_pbtxt(k, v, level + 1)\n        print('{}}}'.format(indent))\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": ".flexci/pytest_script.sh",
    "content": "#!/bin/bash\nset -eux\n\nBASE=6.0.0\n\nservice docker stop\nmount -t tmpfs -o size=100% tmpfs /var/lib/docker\nservice docker start\ngcloud auth configure-docker\n\nif [ ${CHAINERX} -gt 0 ]; then\n    if [ ${GPU} -gt 0 ]; then\n        case ${CHAINER} in\n            stable)\n                DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON-chainerx-gpu-stable:latest\n                ;;\n            latest)\n                DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON-chainerx-gpu-latest:latest\n                ;;\n            base)\n                DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON-chainerx-gpu-base:latest\n                ;;\n        esac\n    else\n        case ${CHAINER} in\n            stable)\n                DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON-chainerx-cpu-stable:latest\n                ;;\n            latest)\n                DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON-chainerx-cpu-latest:latest\n                ;;\n            base)\n                DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON-chainerx-cpu-base:latest\n                ;;\n        esac\n    fi\n    echo \"Use installed chainer in Docker image\"\nelse\n    DOCKER_IMAGE=asia.gcr.io/pfn-public-ci/chainer-chem-py$PYTHON:latest\n    case ${CHAINER} in\n        stable)\n            echo pip install chainer >> install.sh\n            ;;\n        latest)\n            echo pip install --pre chainer >> install.sh\n            ;;\n        base)\n            echo pip install chainer==${BASE} >> install.sh\n            ;;\n    esac\n    if [ ${GPU} -gt 0 ]; then\n        case ${CHAINER} in\n            stable)\n                echo pip install cupy-cuda101 >> install.sh\n                ;;\n            latest)\n                echo pip install --pre cupy-cuda101 >> install.sh\n                ;;\n            base)\n                echo pip install cupy-cuda101==${BASE} >> 
install.sh\n                ;;\n        esac\n    fi\nfi\n\necho pip install pytest-cov pytest-xdist mock >> install.sh\necho pip install -e . >> install.sh\n\necho $DOCKER_IMAGE\ncat install.sh\n\nif [ ${GPU} -gt 0 ]; then\n    PYTEST_OPTION=\"not slow\"\n    RUNTIME=\"--runtime=nvidia\"\nelse\n    PYTEST_OPTION=\"not slow and not gpu\"\n    RUNTIME=\"\"\nfi\n\ndocker run $RUNTIME --interactive --rm \\\n    --volume $(pwd):/repo/ --workdir /repo/\\\n    $DOCKER_IMAGE sh -ex << EOD\n. ./install.sh\npytest -n 4 --cov=chainer_chemistry -m '${PYTEST_OPTION}' tests/\nEOD\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nenv/\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\n*.egg-info/\n.installed.cfg\n*.egg\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*,cover\n.hypothesis/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\ndocs/source/generated\n\n# PyBuilder\ntarget/\n\n# IPython Notebook\n.ipynb_checkpoints\n\n# pyenv\n.python-version\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# dotenv\n.env\n\n# virtualenv\nvenv/\nENV/\n\n# Spyder project settings\n.spyderproject\n\n# Rope project settings\n.ropeproject\n\n# PyCharm project settings\n.idea/\nexamples/tox21/input/\nexamples/qm9/input/\nexamples/molnet/input/\n\n# Trained models and plots\nexamples/tox21/eval_*\nexamples/qm9/eval_*\nexamples/molnet/eval_*\nexamples/own_dataset/eval_*\n\n# emacs\n*~\n\n# VSCode\n.vscode/\n\n# Visual Studio\n.vs/\n*.sln\n*.pyproj\n\n.pytest_cache\n"
  },
  {
    "path": ".readthedocs.yml",
    "content": "name: chainer-chemistry\ntype: sphinx\nbase: docs/source\nconda:\n  file: docs/source/environment.yml\n"
  },
  {
    "path": ".travis.yml",
    "content": "sudo: false\nlanguage: python\n\nos: linux\ndist: trusty\npython:\n  - 3.6\nenv:\nenv:\n  - CHAINER_VERSION=\"chainer==7.0.0\"\n  - CHAINER_VERSION=\"chainer\"\n  - CHAINER_VERSION=\"prerelease\"\n\ninstall:\n  - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh\n  - bash miniconda.sh -b -p $HOME/miniconda\n  - export PATH=\"$HOME/miniconda/bin:$PATH\"\n  - hash -r\n  - conda config --set always_yes yes\n  - conda update -q conda\n  - conda info -a\n  - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION\n  - source activate test-environment\n  - pip install codecov\n  - pip install mock\n  - conda install pip pytest pytest-cov\n  - conda install -c rdkit rdkit==2019.03.2.0\n  - if [ \"${CHAINER_VERSION}\" = \"prerelease\" ]; then\n      pip install --pre chainer;\n    else\n      pip install \"${CHAINER_VERSION}\";\n    fi\n  - pip install --no-cache-dir -e .\n\nscript:\n  - if [ \"${TRAVIS_EVENT_TYPE}\" = \"cron\" ]; then\n      pytest --cov=./ -m \"not gpu\" tests;\n      (cd examples && bash -x test_examples.sh -1);\n    else\n      pytest --cov=./ -m \"not (gpu or slow)\" tests;\n    fi\n\nafter_success:\n  - codecov"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2017 Preferred Networks, Inc.\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# Chainer Chemistry: A Library for Deep Learning in Biology and Chemistry\n\n[![PyPI](https://img.shields.io/pypi/v/chainer-chemistry.svg)](https://pypi.python.org/pypi/chainer-chemistry)\n[![GitHub license](https://img.shields.io/github/license/pfnet-research/chainer-chemistry.svg)](https://github.com/pfnet-research/chainer-chemistry/blob/master/LICENSE)\n[![travis](https://img.shields.io/travis/pfnet-research/chainer-chemistry/master.svg)](https://travis-ci.org/pfnet-research/chainer-chemistry)\n[![Read the Docs](https://readthedocs.org/projects/chainer-chemistry/badge/?version=latest)](http://chainer-chemistry.readthedocs.io/en/latest/?badge=latest)\n\n<p align=\"center\">\n  <img src=\"assets/chainer-chemistry-overview.png\" alt=\"Chainer Chemistry Overview\" width=\"600\" />\n</p>\n\nChainer Chemistry is a deep learning framework (based on Chainer) with\napplications in Biology and Chemistry. It supports various state-of-the-art\nmodels (especially GCNN - Graph Convolutional Neural Network) for chemical property prediction.\n\nFor more information, please refer to the [documentation](http://chainer-chemistry.readthedocs.io/en/latest/index.html).\nAlso, a quick introduction to deep learning for molecules and Chainer Chemistry\nis available [here](https://www.slideshare.net/KentaOono/deep-learning-for-molecules-introduction-to-chainer-chemistry-93288837).\n\n## Dependencies\n\nChainer Chemistry depends on the following packages:\n\n - [`chainer`](https://docs.chainer.org/en/stable/index.html)\n - [`pandas`](https://pandas.pydata.org)\n - [`scikit-learn`](http://scikit-learn.org/stable/)\n - [`tqdm`](https://pypi.python.org/pypi/tqdm)\n - [`h5py`](https://pypi.python.org/pypi/h5py)\n\nThese are automatically added to the system when installing the library via the\n`pip` command (see _Installation_). 
However, the following  needs to be\ninstalled manually:\n\n - [`rdkit (release 2019.03.2.0)`](https://github.com/rdkit/rdkit)\n\nPlease refer to the RDKit [documentation](http://www.rdkit.org/docs/Install.html)\nfor more information regarding the installation steps.\n\nNote that only the following versions of Chainer Chemistry's dependencies are\ncurrently supported:\n\n| Chainer Chemistry   | Chainer         | RDKit          | Python           |\n| ------------------: | --------------: | -------------: | ---------------: |\n| v0.1.0 ~ v0.3.0     | v2.0 ~ v3.0     | 2017.09.3.0    | 2.7, 3.5, 3.6    |\n| v0.4.0              | v3.0 ~ v4.0 *1  | 2017.09.3.0    | 2.7, 3.5, 3.6    |\n| v0.5.0              | v3.0 ~ v5.0 *2  | 2017.09.3.0    | 2.7, 3.5, 3.6    |\n| v0.6.0              | v6.0 ~      *3  | 2017.09.3.0    | 2.7, 3.5, 3.6    |\n| v0.7.0 ~ v0.7.1     | v7.0 ~          | 2019.03.2.0    | 3.6, 3.7      *4 |\n| master branch    *5 | v7.0 ~          | 2019.03.2.0    | 3.6, 3.7         |\n\n\n[Footnote]\n\n*1: We used `FunctionNode` in [this PR](https://github.com/pfnet-research/chainer-chemistry/pull/190),\nwhich is introduced after chainer v3. 
See [this issue](https://github.com/pfnet-research/chainer-chemistry/issues/192) for details.\n\n*2: Saliency modules only work after chainer v5.\n\n*3: Chainer v6 is released and [ChainerX](https://chainer.org/announcement/2018/12/03/chainerx.html) is newly introduced.\n    In order to support this new feature & API, we broke backward compatibility for chainer chemistry v0.6.0 release.\n    See [ChainerX Documentation](https://chainer.org/announcement/2018/12/03/chainerx.html) for details.\n\n*4: python 2.x support is dropped, following the same policy with `chainer` and `rdkit`.\n\n*5: As [announced in chainer blog](https://chainer.org/announcement/2019/12/05/released-v7.html),\n    further development will be limited to only serious bug-fixes and maintenance.\n\n## Installation\n\nChainer Chemistry can be installed using the `pip` command, as follows:\n\n```\npip install chainer-chemistry\n```\n\nExample to install rdkit with conda:\n```bash\n# newer conda version is necessary to install rdkit 2019.03.2.0\nconda install -n base conda==4.6.14\nconda install -c rdkit rdkit==2019.03.2.0\n```\n\nIf you would like to use the latest sources, please checkout the master branch\nand install with the following commands:\n\n```\ngit clone https://github.com/pfnet-research/chainer-chemistry.git\npip install -e chainer-chemistry\n```\n\n## Sample Code\n\nSample code is provided with this repository. 
This includes, but is not limited\nto, the following:\n\n- Training a new model on a given dataset\n- Performing inference on a given dataset, using a pretrained model\n- Evaluating and reporting performance metrics of different models on a given\ndataset\n\nPlease refer to the `examples` directory for more information.\n\n## Supported Models\n\nThe following graph convolutional neural networks are currently supported:\n\n- NFP: Neural Fingerprint [2, 3]\n- GGNN: Gated Graph Neural Network [4, 3]\n- WeaveNet [5, 3]\n- SchNet [6]\n- RSGCN: Renormalized Spectral Graph Convolutional Network [10]<br/>\n \\* The name is not from the original paper - see [PR #89](https://github.com/pfnet-research/chainer-chemistry/pull/89) for the naming convention.\n- RelGCN: Relational Graph Convolutional Network [14]\n- GAT: Graph Attention Networks [15]\n- GIN: Graph Isomorphism Networks [17]\n- MPNN: Message Passing Neural Networks [3]\n- Set2Set [19]\n- GNN-FiLM: Graph Neural Networks with Feature-wise Linear Modulation [20]\n- MEGNet: MatErials Graph Network [24]\n- CGCNN: Crystal Graph Convolutional Neural Networks [25]\n\nWe test supporting the brand-new Graph Warp Module (GWM) [18]-attached models for:\n- NFP ('nfp_gwm')\n- GGNN ('ggnn_gwm')\n- RSGCN ('rsgcn_gwm')\n- GIN ('gin_gwm')\n\nIn the directory `examples/molnet_wle`, we have implemented the new preprocessing ''Weisfeiler-Lehman Embedding for Molecular Graph Neural Networks'' [26] for several GNN architectures. Please find the Readme in that directory for the usage and the details. 
\n\n## Supported Datasets\n\nThe following datasets are currently supported:\n\n### Chemical\n- QM9 [7, 8]\n- Tox21 [9]\n- MoleculeNet [11]\n- ZINC (only 250k dataset) [12, 13]\n- User (own) dataset\n\n### Network\n- cora [21]\n- citeseer [22]\n- reddit [23]\n\n## Research Projects\n\nIf you use Chainer Chemistry in your research, feel free to submit a\npull request and add the name of your project to this list:\n\n - BayesGrad: Explaining Predictions of Graph Convolutional Networks ([paper](https://arxiv.org/abs/1807.01985), [code](https://github.com/pfnet-research/bayesgrad))\n - Graph Warp Module: an Auxiliary Module for Boosting the Power of Graph Neural Networks ([paper](https://arxiv.org/abs/1902.01020), [code](https://github.com/k-ishiguro/chainer-chemistry/tree/gwm_for_CC))\n - GraphNVP: An Invertible Flow Model for Generating Molecular Graphs ([paper](https://arxiv.org/abs/1905.11600), [code](https://github.com/pfnet-research/graph-nvp))\n - Graph Residual Flow for Molecular Graph Generation ([paper](https://arxiv.org/abs/1909.13521))\n\n## Useful Links\n\nChainer Chemistry:\n\n - [Documentation](https://chainer-chemistry.readthedocs.io)\n - [Research Blog](https://preferredresearch.jp/2017/12/18/chainer-chemistry-beta-release/)\n\nOther Chainer frameworks:\n\n - [Chainer: A Flexible Framework of Neural Networks for Deep Learning](https://chainer.org/)\n - [ChainerRL: Deep Reinforcement Learning Library Built on Top of Chainer](https://github.com/chainer/chainerrl)\n - [ChainerCV: A Library for Deep Learning in Computer Vision](https://github.com/chainer/chainercv)\n - [ChainerMN: Scalable Distributed Deep Learning with Chainer](https://github.com/chainer/chainermn)\n - [ChainerUI: User Interface for Chainer](https://github.com/chainer/chainerui)\n\n## License\n\nThis project is released under the MIT License. 
Please refer to the\n[this page](https://github.com/pfnet-research/chainer-chemistry/blob/master/LICENSE)\nfor more information.\n\nPlease note that Chainer Chemistry is still in experimental development.\nWe continuously strive to improve its functionality and performance, but at\nthis stage we cannot guarantee the reproducibility of any results published in\npapers. Use the library at your own risk.\n\n\n## References\n\n[1] Seiya Tokui, Kenta Oono, Shohei Hido, and Justin Clayton. Chainer: a next-generation open source framework for deep learning. In *Proceedings of Workshop on Machine Learning Systems (LearningSys) in Advances in Neural Information Processing System (NIPS) 28*, 2015.\n\n[2] David K Duvenaud, Dougal Maclaurin, Jorge Iparraguirre, Rafael Bombarell, Timothy Hirzel, Alan Aspuru-Guzik, and Ryan P Adams. Convolutional networks on graphs for learning molecular fingerprints. In C. Cortes, N. D. Lawrence, D. D. Lee, M. Sugiyama, and R. Garnett, editors, *Advances in Neural Information Processing Systems (NIPS) 28*, pages 2224–2232. Curran Asso- ciates, Inc., 2015.\n\n[3] Justin Gilmer, Samuel S Schoenholz, Patrick F Riley, Oriol Vinyals, and George E Dahl. Neural message passing for quantum chemistry. *arXiv preprint arXiv:1704.01212*, 2017.\n\n[4] Yujia Li, Daniel Tarlow, Marc Brockschmidt, and Richard Zemel. Gated graph sequence neural networks. *arXiv preprint arXiv:1511.05493*, 2015.\n\n[5] Steven Kearnes, Kevin McCloskey, Marc Berndl, Vijay Pande, and Patrick Riley. Molecular graph convolutions: moving beyond fingerprints. *Journal of computer-aided molecular design*, 30(8):595–608, 2016.\n\n[6] Kristof Schütt, Pieter-Jan Kindermans, Huziel Enoc Sauceda Felix, Stefan Chmiela, Alexandre Tkatchenko, and Klaus-Rober Müller. Schnet: A continuous-filter convolutional neural network for modeling quantum interactions. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. 
Garnett, editors, *Advances in Neural Information Processing Systems (NIPS) 30*, pages 992–1002. Curran Associates, Inc., 2017.\n\n[7] Lars Ruddigkeit, Ruud Van Deursen, Lorenz C Blum, and Jean-Louis Reymond. Enumeration of 166 billion organic small molecules in the chemical universe database gdb-17. *Journal of chemical information and modeling*, 52(11):2864–2875, 2012.\n\n[8] Raghunathan Ramakrishnan, Pavlo O Dral, Matthias Rupp, and O Anatole Von Lilienfeld. Quantum chemistry structures and properties of 134 kilo molecules. *Scientific data*, 1:140022, 2014.\n\n[9] Ruili Huang, Menghang Xia, Dac-Trung Nguyen, Tongan Zhao, Srilatha Sakamuru, Jinghua Zhao, Sampada A Shahane, Anna Rossoshek, and Anton Simeonov. Tox21challenge to build predictive models of nuclear receptor and stress response pathways as mediated by exposure to environmental chemicals and drugs. *Frontiers in Environmental Science*, 3:85, 2016.\n\n[10] Kipf, Thomas N. and Welling, Max. Semi-Supervised Classification with Graph Convolutional Networks. *International Conference on Learning Representations (ICLR)*, 2017.\n\n[11] Zhenqin Wu, Bharath Ramsundar, Evan N. Feinberg, Joseph Gomes, Caleb Geniesse, Aneesh S. Pappu, Karl Leswing, Vijay Pande, MoleculeNet: A Benchmark for Molecular Machine Learning, arXiv preprint, arXiv: 1703.00564, 2017.\n\n[12] J. J. Irwin, T. Sterling, M. M. Mysinger, E. S. Bolstad, and R. G. Coleman. Zinc: a free tool to discover chemistry for biology. *Journal of chemical information and modeling*, 52(7):1757–1768, 2012.\n\n[13] Preprocessed csv file downloaded from https://raw.githubusercontent.com/aspuru-guzik-group/chemical_vae/master/models/zinc_properties/250k_rndm_zinc_drugs_clean_3.csv\n\n[14] Michael Schlichtkrull, Thomas N. Kipf, Peter Bloem, Rianne van den Berg, Ivan Titov, Max Welling. Modeling Relational Data with Graph Convolutional Networks. 
*Extended Semantic Web Conference (ESWC)*, 2018.\n\n[15] Veličković, P., Cucurull, G., Casanova, A., Romero, A., Liò, P., & Bengio, Y. (2017). Graph Attention Networks. arXiv preprint arXiv:1710.10903.\n\n[16] Dan Busbridge, Dane Sherburn, Pietro Cavallo and Nils Y. Hammerla. (2019). Relational Graph Attention Networks. https://openreview.net/forum?id=Bklzkh0qFm\n\n[17] Keyulu Xu, Weihua Hu, Jure Leskovec, Stefanie Jegelka, ``How Powerful are Graph Neural Networks?'', \tarXiv:1810.00826 [cs.LG], 2018 (to appear at ICLR19).\n\n[18] K. Ishiguro, S. Maeda, and M. Koyama, ``Graph Warp Module: an Auxiliary Module for Boosting the Power of Graph Neural Networks'', arXiv:1902.01020 [cs.LG], 2019.\n\n[19] Oriol Vinyals, Samy Bengio, Manjunath Kudlur. Order Matters: Sequence to sequence for sets. *arXiv preprint arXiv:1511.06391*, 2015.\n\n[20] Marc Brockschmidt, ``GNN-FiLM: Graph Neural Networks with Feature-wise Linear Modulation'', arXiv:1906.12192 [cs.ML], 2019.\n\n[21] McCallum, Andrew Kachites and Nigam, Kamal and Rennie, Jason and Seymore, Kristie, Automating the Construction of Internet Portals with Machine Learning. *Information Retrieval*, 2000.\n\n[22] C. Lee Giles and Kurt D. Bollacker and Steve Lawrence, CiteSeer: An Automatic Citation Indexing System. *Proceedings of the Third ACM Conference on Digital Libraries*, 1998.\n\n[23] William L. Hamilton and Zhitao Ying and Jure Leskovec, Inductive Representation Learning on Large Graphs. *Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, 4-9 December 2017*\n\n[24] Chi Chen, Weike Ye, Yunxing Zuo, Chen Zheng, and Shyue Ping Ong. Graph networks as a universal machine learning framework for molecules and crystals. *Chemistry of Materials*, 31(9):3564–3572, 2019.\n\n[25] Tian Xie and Jeffrey C Grossman. Crystal graph convolutional neural networks for an accurate and interpretable prediction of material properties. 
*Physical review letters*, 120(14):145301, 2018.\n\n[26] Katsuhiko Ishiguro, Kenta Oono, and Kohei Hayashi, \"Weisfeiler-Lehman Embedding for Molecular Graph Neural Networks\", arXiv: 2006.06909, 2020. [paper link](https://arxiv.org/abs/2006.06909)\n"
  },
  {
    "path": "chainer_chemistry/__init__.py",
    "content": "import warnings\n\nfrom chainer_chemistry import dataset  # NOQA\ntry:\n    from chainer_chemistry import datasets  # NOQA\nexcept ImportError as e:\n    if 'rdkit' in e.msg:\n        warnings.warn(\n            'A module chainer_chemistry.datasets was not imported, '\n            'probably because RDKit is not installed. '\n            'To install RDKit, please follow instruction in '\n            'https://github.com/pfnet-research/chainer-chemistry#installation.',  # NOQA\n            UserWarning)\n    else:\n        raise(e)\nfrom chainer_chemistry import functions  # NOQA\nfrom chainer_chemistry import links  # NOQA\nfrom chainer_chemistry import models  # NOQA\nfrom chainer_chemistry import training  # NOQA\n\n# --- config variable definitions ---\nfrom chainer_chemistry.config import *  # NOQA\n\n\nfrom chainer_chemistry import _version  # NOQA\n\n\n__version__ = _version.__version__\n"
  },
  {
    "path": "chainer_chemistry/_version.py",
    "content": "__version__ = '0.7.1'\n"
  },
  {
    "path": "chainer_chemistry/config.py",
    "content": "# --- Configuration ---\n\n\n# --- Constant definitions ---\n# The maximum atomic number in rdkit\nMAX_ATOMIC_NUM = 117\nWEAVE_DEFAULT_NUM_MAX_ATOMS = 20  # 60  # paper\n"
  },
  {
    "path": "chainer_chemistry/dataset/__init__.py",
    "content": "from chainer_chemistry.dataset.indexer import BaseFeatureIndexer  # NOQA\nfrom chainer_chemistry.dataset.indexer import BaseIndexer  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/dataset/converters/__init__.py",
    "content": "from chainer_chemistry.dataset.converters.cgcnn_converter import cgcnn_converter  # NOQA\nfrom chainer_chemistry.dataset.converters.concat_mols import concat_mols  # NOQA\nfrom chainer_chemistry.dataset.converters.megnet_converter import megnet_converter  # NOQA\n\nconverter_method_dict = {\n    'ecfp': concat_mols,\n    'nfp': concat_mols,\n    'nfp_gwm': concat_mols,\n    'ggnn': concat_mols,\n    'ggnn_gwm': concat_mols,\n    'gin': concat_mols,\n    'gin_gwm': concat_mols,\n    'schnet': concat_mols,\n    'weavenet': concat_mols,\n    'relgcn': concat_mols,\n    'rsgcn': concat_mols,\n    'rsgcn_gwm': concat_mols,\n    'relgat': concat_mols,\n    'gnnfilm': concat_mols,\n    'megnet': megnet_converter,\n    'cgcnn': cgcnn_converter\n}\n"
  },
  {
    "path": "chainer_chemistry/dataset/converters/cgcnn_converter.py",
    "content": "import numpy\n\nimport chainer\nfrom chainer.dataset.convert import to_device\nfrom chainer import functions\n\n\n@chainer.dataset.converter()\ndef cgcnn_converter(batch, device=None, padding=None):\n    \"\"\"CGCNN converter\"\"\"\n    if len(batch) == 0:\n        raise ValueError(\"batch is empty\")\n\n    atom_feat, nbr_feat, nbr_idx = [], [], []\n    batch_atom_idx, target = [], []\n    current_idx = 0\n    xp = device.xp\n    for element in batch:\n        atom_feat.append(element[0])\n        nbr_feat.append(element[1])\n        nbr_idx.append(element[2] + current_idx)\n        target.append(element[3])\n        n_atom = element[0].shape[0]\n        atom_idx = numpy.arange(n_atom) + current_idx\n        batch_atom_idx.append(atom_idx)\n        current_idx += n_atom\n\n    atom_feat = to_device(device, functions.concat(atom_feat, axis=0).data)\n    nbr_feat = to_device(device, functions.concat(nbr_feat, axis=0).data)\n    # Always use numpy array for batch_atom_index\n    # this is list of variable length array\n    batch_atom_idx = numpy.array(batch_atom_idx)\n    nbr_idx = to_device(device, functions.concat(nbr_idx, axis=0).data)\n    target = to_device(device, xp.asarray(target))\n    result = (atom_feat, nbr_feat, batch_atom_idx, nbr_idx, target)\n    return result\n"
  },
  {
    "path": "chainer_chemistry/dataset/converters/concat_mols.py",
    "content": "import chainer\n\n\n@chainer.dataset.converter()\ndef concat_mols(batch, device=None, padding=0):\n    \"\"\"Concatenates a list of molecules into array(s).\n\n    This function converts an \"array of tuples\" into a \"tuple of arrays\".\n    Specifically, given a list of examples each of which consists of\n    a list of elements, this function first makes an array\n    by taking the element in the same position from each example\n    and concatenates them along the newly-inserted first axis\n    (called `batch dimension`) into one array.\n    It repeats this for all positions and returns the resulting arrays.\n\n    The output type depends on the type of examples in ``batch``.\n    For instance, consider each example consists of two arrays ``(x, y)``.\n    Then, this function concatenates ``x`` 's into one array, and ``y`` 's\n    into another array, and returns a tuple of these two arrays. Another\n    example: consider each example is a dictionary of two entries whose keys\n    are ``'x'`` and ``'y'``, respectively, and values are arrays. Then, this\n    function concatenates ``x`` 's into one array, and ``y`` 's into another\n    array, and returns a dictionary with two entries ``x`` and ``y`` whose\n    values are the concatenated arrays.\n\n    When the arrays to concatenate have different shapes, the behavior depends\n    on the ``padding`` value. If ``padding`` is ``None``, it raises an error.\n    Otherwise, it builds an array of the minimum shape that the\n    contents of all arrays can be substituted to. The padding value is then\n    used to the extra elements of the resulting arrays.\n\n    The current implementation is identical to\n    :func:`~chainer.dataset.concat_examples` of Chainer, except the default\n    value of the ``padding`` option is changed to ``0``.\n\n    .. 
admonition:: Example\n\n       >>> import numpy\n       >>> from chainer_chemistry.dataset.converters import concat_mols\n       >>> x0 = numpy.array([1, 2])\n       >>> x1 = numpy.array([4, 5, 6])\n       >>> dataset = [x0, x1]\n       >>> results = concat_mols(dataset)\n       >>> print(results)\n       [[1 2 0]\n        [4 5 6]]\n\n    .. seealso:: :func:`chainer.dataset.concat_examples`\n\n    Args:\n        batch (list):\n            A list of examples. This is typically given by a dataset\n            iterator.\n        device (int):\n            Device ID to which each array is sent. Negative value\n            indicates the host memory (CPU). If it is omitted, all arrays are\n            left in the original device.\n        padding:\n            Scalar value for extra elements. If this is None (default),\n            an error is raised on shape mismatch. Otherwise, an array of\n            minimum dimensionalities that can accommodate all arrays is\n            created, and elements outside of the examples are padded by this\n            value.\n\n    Returns:\n        Array, a tuple of arrays, or a dictionary of arrays:\n        The type depends on the type of each example in the batch.\n    \"\"\"\n    return chainer.dataset.concat_examples(batch, device, padding=padding)\n"
  },
  {
    "path": "chainer_chemistry/dataset/converters/megnet_converter.py",
    "content": "import chainer\nfrom chainer.dataset.convert import to_device\n\n\n@chainer.dataset.converter()\ndef megnet_converter(batch, device=None, padding=0):\n    \"\"\"MEGNet converter\"\"\"\n    if len(batch) == 0:\n        raise ValueError(\"batch is empty\")\n\n    atom_feat, pair_feat, global_feat, target = [], [], [], []\n    atom_idx, pair_idx, start_idx, end_idx = [], [], [], []\n    batch_size = len(batch)\n    current_atom_idx = 0\n    for i in range(batch_size):\n        element = batch[i]\n        n_atom = element[0].shape[0]\n        n_pair = element[1].shape[0]\n        atom_feat.extend(element[0])\n        pair_feat.extend(element[1])\n        global_feat.append(element[2])\n        atom_idx.extend([i]*n_atom)\n        pair_idx.extend([i]*n_pair)\n        start_idx.extend(element[3][0] + current_atom_idx)\n        end_idx.extend(element[3][1] + current_atom_idx)\n        target.append(element[4])\n        current_atom_idx += n_atom\n\n    xp = device.xp\n    atom_feat = to_device(device, xp.asarray(atom_feat))\n    pair_feat = to_device(device, xp.asarray(pair_feat))\n    global_feat = to_device(device, xp.asarray(global_feat))\n    atom_idx = to_device(device, xp.asarray(atom_idx))\n    pair_idx = to_device(device, xp.asarray(pair_idx))\n    start_idx = to_device(device, xp.asarray(start_idx))\n    end_idx = to_device(device, xp.asarray(end_idx))\n    target = to_device(device, xp.asarray(target))\n    result = (atom_feat, pair_feat, global_feat, atom_idx, pair_idx,\n              start_idx, end_idx, target)\n\n    return result\n"
  },
  {
    "path": "chainer_chemistry/dataset/graph_dataset/__init__.py",
    "content": ""
  },
  {
    "path": "chainer_chemistry/dataset/graph_dataset/base_graph_data.py",
    "content": "import numpy\n\nimport chainer\n\n\nclass BaseGraphData(object):\n    \"\"\"Base class of graph data \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        for k, v in kwargs.items():\n            setattr(self, k, v)\n\n    def to_device(self, device):\n        \"\"\"Send self to `device`\n\n        Args:\n            device (chainer.backend.Device): device\n\n        Returns:\n            self sent to `device`\n        \"\"\"\n        for k, v in self.__dict__.items():\n            if isinstance(v, (numpy.ndarray)):\n                setattr(self, k, device.send(v))\n            elif isinstance(v, (chainer.utils.CooMatrix)):\n                data = device.send(v.data.array)\n                row = device.send(v.row)\n                col = device.send(v.col)\n                device_coo_matrix = chainer.utils.CooMatrix(\n                    data, row, col, v.shape, order=v.order)\n                setattr(self, k, device_coo_matrix)\n        return self\n\n\nclass PaddingGraphData(BaseGraphData):\n    \"\"\"Graph data class for padding pattern\n\n    Args:\n        x (numpy.ndarray): input node feature\n        adj (numpy.ndarray): adjacency matrix\n        y (int or numpy.ndarray): graph or node label\n    \"\"\"\n\n    def __init__(self, x=None, adj=None, super_node=None, pos=None, y=None,\n                 **kwargs):\n        self.x = x\n        self.adj = adj\n        self.super_node = super_node\n        self.pos = pos\n        self.y = y\n        self.n_nodes = x.shape[0]\n        super(PaddingGraphData, self).__init__(**kwargs)\n\n\nclass SparseGraphData(BaseGraphData):\n    \"\"\"Graph data class for sparse pattern\n\n    Args:\n        x (numpy.ndarray): input node feature\n        edge_index (numpy.ndarray): sources and destinations of edges\n        edge_attr (numpy.ndarray): attribution of edges\n        y (int or numpy.ndarray): graph or node label\n    \"\"\"\n\n    def __init__(self, x=None, edge_index=None, edge_attr=None,\n         
        pos=None, super_node=None, y=None, **kwargs):\n        self.x = x\n        self.edge_index = edge_index\n        self.edge_attr = edge_attr\n        self.pos = pos\n        self.super_node = super_node\n        self.y = y\n        self.n_nodes = x.shape[0]\n        super(SparseGraphData, self).__init__(**kwargs)\n"
  },
  {
    "path": "chainer_chemistry/dataset/graph_dataset/base_graph_dataset.py",
    "content": "import numpy\n\nimport chainer\nfrom chainer._backend import Device\nfrom chainer_chemistry.dataset.graph_dataset.base_graph_data import BaseGraphData  # NOQA\nfrom chainer_chemistry.dataset.graph_dataset.feature_converters \\\n    import batch_with_padding, batch_without_padding, concat, shift_concat, \\\n    concat_with_padding, shift_concat_with_padding  # NOQA\n\n\nclass BaseGraphDataset(object):\n    \"\"\"Base class of graph dataset (list of graph data)\"\"\"\n    _pattern = ''\n    _feature_entries = []\n    _feature_batch_method = []\n\n    def __init__(self, data_list, *args, **kwargs):\n        self.data_list = data_list\n\n    def register_feature(self, key, batch_method, skip_if_none=True):\n        \"\"\"Register feature with batch method\n\n        Args:\n            key (str): name of the feature\n            batch_method (function): batch method\n            skip_if_none (bool, optional): If true, skip if `batch_method` is\n                None. Defaults to True.\n        \"\"\"\n        if skip_if_none and getattr(self.data_list[0], key, None) is None:\n            return\n        self._feature_entries.append(key)\n        self._feature_batch_method.append(batch_method)\n\n    def update_feature(self, key, batch_method):\n        \"\"\"Update batch method of the feature\n\n        Args:\n            key (str): name of the feature\n            batch_method (function): batch method\n        \"\"\"\n\n        index = self._feature_entries.index(key)\n        self._feature_batch_method[index] = batch_method\n\n    def __len__(self):\n        return len(self.data_list)\n\n    def __getitem__(self, item):\n        return self.data_list[item]\n\n    def converter(self, batch, device=None):\n        \"\"\"Converter\n\n        Args:\n            batch (list[BaseGraphData]): list of graph data\n            device (int, optional): specifier of device. 
Defaults to None.\n\n        Returns:\n            self sent to `device`\n        \"\"\"\n        if not isinstance(device, Device):\n            device = chainer.get_device(device)\n        batch = [method(name, batch, device=device) for name, method in\n                 zip(self._feature_entries, self._feature_batch_method)]\n        data = BaseGraphData(\n            **{key: value for key, value in zip(self._feature_entries, batch)})\n        return data\n\n\nclass PaddingGraphDataset(BaseGraphDataset):\n    \"\"\"Graph dataset class for padding pattern\"\"\"\n    _pattern = 'padding'\n\n    def __init__(self, data_list):\n        super(PaddingGraphDataset, self).__init__(data_list)\n        self.register_feature('x', batch_with_padding)\n        self.register_feature('adj', batch_with_padding)\n        self.register_feature('super_node', batch_with_padding)\n        self.register_feature('pos', batch_with_padding)\n        self.register_feature('y', batch_without_padding)\n        self.register_feature('n_nodes', batch_without_padding)\n\n\nclass SparseGraphDataset(BaseGraphDataset):\n    \"\"\"Graph dataset class for sparse pattern\"\"\"\n    _pattern = 'sparse'\n\n    def __init__(self, data_list):\n        super(SparseGraphDataset, self).__init__(data_list)\n        self.register_feature('x', concat)\n        self.register_feature('edge_index', shift_concat)\n        self.register_feature('edge_attr', concat)\n        self.register_feature('super_node', concat)\n        self.register_feature('pos', concat)\n        self.register_feature('y', batch_without_padding)\n        self.register_feature('n_nodes', batch_without_padding)\n\n    def converter(self, batch, device=None):\n        \"\"\"Converter\n\n        add `self.batch`, which represents the index of the graph each node\n        belongs to.\n\n        Args:\n            batch (list[BaseGraphData]): list of graph data\n            device (int, optional): specifier of device. 
Defaults to None.\n\n        Returns:\n            self sent to `device`\n        \"\"\"\n        data = super(SparseGraphDataset, self).converter(batch, device=device)\n        if not isinstance(device, Device):\n            device = chainer.get_device(device)\n        data.batch = numpy.concatenate([\n            numpy.full((data.x.shape[0]), i, dtype=numpy.int)\n            for i, data in enumerate(batch)\n        ])\n        data.batch = device.send(data.batch)\n        return data\n\n    # for experiment\n    # use converter for the normal use\n    def converter_with_padding(self, batch, device=None):\n        self.update_feature('x', concat_with_padding)\n        self.update_feature('edge_index', shift_concat_with_padding)\n        data = super(SparseGraphDataset, self).converter(batch, device=device)\n        if not isinstance(device, Device):\n            device = chainer.get_device(device)\n        max_n_nodes = max([data.x.shape[0] for data in batch])\n        data.batch = numpy.concatenate([\n            numpy.full((max_n_nodes), i, dtype=numpy.int)\n            for i, data in enumerate(batch)\n        ])\n        data.batch = device.send(data.batch)\n        return data\n"
  },
  {
    "path": "chainer_chemistry/dataset/graph_dataset/feature_converters.py",
    "content": "import numpy\n\nfrom chainer.dataset.convert import _concat_arrays\n\n\ndef batch_with_padding(name, batch, device=None, pad=0):\n    \"\"\"Batch with padding (increase ndim by 1)\n\n    Args:\n        name (str): propaty name of graph data\n        batch (list[BaseGraphData]): list of base graph data\n        device (chainer.backend.Device, optional): device. Defaults to None.\n        pad (int, optional): padding value. Defaults to 0.\n\n    Returns:\n        BaseGraphDataset: graph dataset sent to `device`\n    \"\"\"\n    feat = _concat_arrays(\n        [getattr(example, name) for example in batch], pad)\n    return device.send(feat)\n\n\ndef batch_without_padding(name, batch, device=None):\n    \"\"\"Batch without padding (increase ndim by 1)\n\n    Args:\n        name (str): propaty name of graph data\n        batch (list[BaseGraphData]): list of base graph data\n        device (chainer.backend.Device, optional): device. Defaults to None.\n\n    Returns:\n        BaseGraphDataset: graph dataset sent to `device`\n    \"\"\"\n    feat = _concat_arrays(\n        [getattr(example, name) for example in batch], None)\n    return device.send(feat)\n\n\ndef concat_with_padding(name, batch, device=None, pad=0):\n    \"\"\"Concat without padding (ndim does not increase)\n\n    Args:\n        name (str): propaty name of graph data\n        batch (list[BaseGraphData]): list of base graph data\n        device (chainer.backend.Device, optional): device. Defaults to None.\n        pad (int, optional): padding value. 
Defaults to 0.\n\n    Returns:\n        BaseGraphDataset: graph dataset sent to `device`\n    \"\"\"\n    feat = batch_with_padding(name, batch, device=device, pad=pad)\n    a, b = feat.shape\n    return feat.reshape((a * b))\n\n\ndef concat(name, batch, device=None, axis=0):\n    \"\"\"Concat with padding (ndim does not increase)\n\n    Args:\n        name (str): propaty name of graph data\n        batch (list[BaseGraphData]): list of base graph data\n        device (chainer.backend.Device, optional): device. Defaults to None.\n        pad (int, optional): padding value. Defaults to 0.\n\n    Returns:\n        BaseGraphDataset: graph dataset sent to `device`\n    \"\"\"\n    feat = numpy.concatenate([getattr(data, name) for data in batch],\n                             axis=axis)\n    return device.send(feat)\n\n\ndef shift_concat(name, batch, device=None, shift_attr='x', shift_axis=1):\n    \"\"\"Concat with index shift (ndim does not increase)\n\n    Concatenate graphs into a big one.\n    Used for sparse pattern batching.\n\n    Args:\n        name (str): propaty name of graph data\n        batch (list[BaseGraphData]): list of base graph data\n        device (chainer.backend.Device, optional): device. 
Defaults to None.\n\n    Returns:\n        BaseGraphDataset: graph dataset sent to `device`\n    \"\"\"\n    shift_index_array = numpy.cumsum(\n        numpy.array([0] + [getattr(data, shift_attr).shape[0]\n                           for data in batch]))\n    feat = numpy.concatenate([\n        getattr(data, name) + shift_index_array[i]\n        for i, data in enumerate(batch)], axis=shift_axis)\n    return device.send(feat)\n\n\ndef shift_concat_with_padding(name, batch, device=None, shift_attr='x',\n                              shift_axis=1):\n    \"\"\"Concat with index shift and padding (ndim does not increase)\n\n    Concatenate graphs into a big one.\n    Used for sparse pattern batching.\n\n    Args:\n        name (str): propaty name of graph data\n        batch (list[BaseGraphData]): list of base graph data\n        device (chainer.backend.Device, optional): device. Defaults to None.\n\n    Returns:\n        BaseGraphDataset: graph dataset sent to `device`\n    \"\"\"\n    max_n_nodes = max([data.x.shape[0] for data in batch])\n    shift_index_array = numpy.arange(0, len(batch) * max_n_nodes, max_n_nodes)\n    feat = numpy.concatenate([\n        getattr(data, name) + shift_index_array[i]\n        for i, data in enumerate(batch)], axis=shift_axis)\n    return device.send(feat)\n"
  },
  {
    "path": "chainer_chemistry/dataset/indexer.py",
    "content": "import numpy\nimport six\n\n\nclass ExtractBySliceNotSupportedError(Exception):\n    pass\n\n\nclass BaseIndexer(object):\n    \"\"\"Base class for Indexer\"\"\"\n\n    def __getitem__(self, item):\n        raise NotImplementedError\n\n\nclass BaseFeatureIndexer(BaseIndexer):\n\n    \"\"\"Base class for FeatureIndexer\n\n    FeatureIndexer can be accessed by 2-dimensional indices, axis=0 is used for\n    dataset index and axis=1 is used for feature index.\n    For example, let `features` be the instance of `BaseFeatureIndexer`, then\n    `features[i, j]` returns `i`-th dataset of `j`-th feature.\n\n    `features[ind]` works same with `features[ind, :]`\n\n    Note that the returned value will be numpy array, even though the\n    dataset is initilized with other format (e.g. list).\n\n    \"\"\"\n\n    def __init__(self, dataset):\n        super(BaseFeatureIndexer, self).__init__()\n        self.dataset = dataset\n\n    def features_length(self):\n        \"\"\"Returns length of features\n\n        Returns (int): feature length\n\n        \"\"\"\n        raise NotImplementedError\n\n    @property\n    def dataset_length(self):\n        return len(self.dataset)\n\n    @property\n    def shape(self):\n        return self.dataset_length, self.features_length()\n\n    def extract_feature_by_slice(self, slice_index, j):\n        \"\"\"Extracts `slice_index`-th data's `j`-th feature.\n\n        Here, `slice_index` is indices of slice object.\n        This method may be override to support efficient feature extraction.\n        If not override, `ExtractBySliceNotSupportedError` is raised by\n        default, and in this case `extract_feature` is used instead.\n\n        Args:\n            slice_index (slice): slice of data index to be extracted\n            j (int): `j`-th feature to be extracted\n\n        Returns: feature\n        \"\"\"\n\n        raise ExtractBySliceNotSupportedError\n\n    def extract_feature(self, i, j):\n        \"\"\"Extracts `i`-th 
data's `j`-th feature\n\n        Args:\n            i (int): `i`-th data to be extracted\n            j (int): `j`-th feature to be extracted\n\n        Returns: feature\n\n        \"\"\"\n        raise NotImplementedError\n\n    def create_feature_index_list(self, feature_index):\n        if isinstance(feature_index, slice):\n            feature_index_list = numpy.arange(\n                *feature_index.indices(self.features_length())\n            )\n        elif isinstance(feature_index, (list, numpy.ndarray)):\n            if isinstance(feature_index[0],\n                          (bool, numpy.bool, numpy.bool_)):\n                if len(feature_index) != self.features_length():\n                    raise ValueError('Feature index wrong length {} instead of'\n                                     ' {}'.format(len(feature_index),\n                                                  self.features_length()))\n                feature_index_list = numpy.argwhere(feature_index\n                                                    ).ravel()\n            else:\n                feature_index_list = feature_index\n        else:\n            # assuming int type\n            feature_index_list = [feature_index]\n        return feature_index_list\n\n    def preprocess(self, item):\n        pass\n\n    def postprocess(self, item):\n        pass\n\n    def __getitem__(self, item):\n        self.preprocess(item)\n        if isinstance(item, tuple):\n            index_dim = len(item)\n            # multi dimensional access\n            if index_dim == 1:\n                # This is not unexpected case...\n                data_index = item[0]\n                feature_index_list = self.create_feature_index_list(\n                    slice(None)\n                )\n            elif index_dim == 2:\n                data_index, feature_index = item\n                feature_index_list = self.create_feature_index_list(\n                    feature_index\n                )\n            
else:\n                raise IndexError('too many indices for features')\n        else:\n            data_index = item\n            feature_index_list = self.create_feature_index_list(slice(None))\n        if len(feature_index_list) == 1:\n            self._extract_single_feature = True\n            ret = self._extract_feature(data_index, feature_index_list[0])\n        else:\n            self._extract_single_feature = False\n            ret = tuple([self._extract_feature(data_index, j) for j in\n                         feature_index_list])\n        self.postprocess(item)\n        return ret\n\n    def check_type_feature_index(self, j):\n        if j >= self.features_length():\n            raise IndexError('index {} is out of bounds for axis 1 with '\n                             'size {}'.format(j, self.features_length()))\n\n    def _extract_feature(self, data_index, j):\n        \"\"\"Format `data_index` and call proper method to extract feature.\n\n        Args:\n            data_index (int, slice, list or numpy.ndarray):\n            j (int or key):\n\n        \"\"\"\n        self.check_type_feature_index(j)\n        if isinstance(data_index, slice):\n            try:\n                return self.extract_feature_by_slice(data_index, j)\n            except ExtractBySliceNotSupportedError:\n                # Accessing by each index, copy occurs\n                current, stop, step = data_index.indices(self.dataset_length)\n                res = [self.extract_feature(i, j) for i in\n                       six.moves.range(current, stop, step)]\n        elif isinstance(data_index, (list, numpy.ndarray)):\n            if len(data_index) == 0:\n                try:\n                    # HACKING\n                    return self.extract_feature_by_slice(slice(0, 0, 1), j)\n                except ExtractBySliceNotSupportedError:\n                    res = []\n            else:\n                if isinstance(data_index[0], (bool, numpy.bool, numpy.bool_)):\n            
        # Access by bool flag list\n                    if len(data_index) != self.dataset_length:\n                        raise ValueError(\n                            'Feature index wrong length {} instead of'\n                            ' {}'.format(len(data_index),\n                                         self.dataset_length))\n                    data_index = numpy.argwhere(data_index).ravel()\n\n                res = [self.extract_feature(i, j) for i in data_index]\n        else:\n            # `data_index` is expected to be `int`\n            return self.extract_feature(data_index, j)\n        try:\n            feature = numpy.asarray(res)\n        except ValueError:\n            feature = numpy.empty(len(res), dtype=object)\n            feature[:] = res[:]\n        return feature\n"
  },
  {
    "path": "chainer_chemistry/dataset/indexers/__init__.py",
    "content": "from chainer_chemistry.dataset.indexers.numpy_tuple_dataset_feature_indexer import NumpyTupleDatasetFeatureIndexer  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/dataset/indexers/numpy_tuple_dataset_feature_indexer.py",
    "content": "from chainer_chemistry.dataset.indexer import BaseFeatureIndexer\n\n\nclass NumpyTupleDatasetFeatureIndexer(BaseFeatureIndexer):\n    \"\"\"FeatureIndexer for NumpyTupleDataset\n\n    Args:\n        dataset (NumpyTupleDataset): dataset instance\n\n    \"\"\"\n\n    def __init__(self, dataset):\n        super(NumpyTupleDatasetFeatureIndexer, self).__init__(dataset)\n        self.datasets = dataset.get_datasets()\n\n    def features_length(self):\n        return len(self.datasets)\n\n    def extract_feature_by_slice(self, slice_index, j):\n        return self.datasets[j][slice_index]\n\n    def extract_feature(self, i, j):\n        return self.datasets[j][i]\n"
  },
  {
    "path": "chainer_chemistry/dataset/networkx_preprocessors/base_networkx.py",
    "content": "import networkx\nimport numpy\n\nimport chainer\nfrom chainer_chemistry.dataset.graph_dataset.base_graph_dataset import PaddingGraphDataset, SparseGraphDataset  # NOQA\nfrom chainer_chemistry.dataset.graph_dataset.base_graph_data import PaddingGraphData, SparseGraphData  # NOQA\nfrom chainer_chemistry.dataset.graph_dataset.feature_converters import batch_without_padding  # NOQA\n\n\nclass BaseNetworkxPreprocessor(object):\n    \"\"\"Base class to preprocess `Networkx::Graph` object\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        pass\n\n    def get_x(self, graph):\n        if 'x' in graph.graph:\n            x = graph.graph['x']\n        else:\n            feature_dim, = graph.nodes[0]['x'].shape\n            x = numpy.empty((graph.number_of_nodes(), feature_dim),\n                            dtype=numpy.float32)\n            for v, data in graph.nodes.data():\n                x[v] = data['x']\n        return x\n\n    def get_y(self, graph):\n        if 'y' in graph.graph:\n            y = graph.graph['y']\n        else:\n            y = numpy.empty(graph.number_of_nodes(), dtype=numpy.int32)\n            for v, data in graph.nodes.data():\n                y[v] = data['y']\n        return y\n\n\nclass BasePaddingNetworkxPreprocessor(BaseNetworkxPreprocessor):\n    \"\"\"Base class to preprocess `Networkx::Graph` into `PaddingGraphDataset`\n    \n    \"\"\"  # NOQA\n\n    def __init__(self, use_coo=False, *args, **kwargs):\n        self.use_coo = use_coo\n\n    def construct_data(self, graph):\n        \"\"\"Construct `PaddingGraphData` from `Networkx::Graph`\n\n        Args:\n            graph (Networkx::Graph): graph\n\n        Returns:\n            PaddingGraphData: graph data of padding pattern\n        \"\"\"\n        if not self.use_coo:\n            return PaddingGraphData(\n                x=self.get_x(graph),\n                adj=networkx.to_numpy_array(graph, dtype=numpy.float32),\n                y=self.get_y(graph),\n           
     label_num=graph.graph['label_num']\n            )\n\n        n_edges = graph.number_of_edges() * 2\n        row = numpy.empty((n_edges), dtype=numpy.int)\n        col = numpy.empty((n_edges), dtype=numpy.int)\n        data = numpy.ones((n_edges), dtype=numpy.float32)\n        for i, edge in enumerate(graph.edges):\n            row[2 * i] = edge[0]\n            row[2 * i + 1] = edge[1]\n            col[2 * i] = edge[1]\n            col[2 * i + 1] = edge[0]\n\n        # ensure row is sorted\n        if not numpy.all(row[:-1] <= row[1:]):\n            order = numpy.argsort(row)\n            row = row[order]\n            col = col[order]\n        assert numpy.all(row[:-1] <= row[1:])\n\n        adj = chainer.utils.CooMatrix(\n            data=data, row=row, col=col,\n            shape=(graph.number_of_nodes(), graph.number_of_nodes()),\n            order='C')\n\n        return PaddingGraphData(\n            x=self.get_x(graph),\n            adj=adj,\n            y=self.get_y(graph),\n            label_num=graph.graph['label_num']\n        )\n\n    def create_dataset(self, graph_list):\n        \"\"\"Create `PaddingGraphDataset` from list of `Networkx::Graph`\n\n        Args:\n            graph_list (list[Networkx::Graph]): list of graphs\n\n        Returns:\n            PaddingGraphDataset: graph dataset of padding pattern\n        \"\"\"\n        data_list = [\n            self.construct_data(graph) for graph in graph_list\n        ]\n        dataset = PaddingGraphDataset(data_list)\n        dataset.register_feature('label_num', batch_without_padding)\n        return dataset\n\n\nclass BaseSparseNetworkxPreprocessor(BaseNetworkxPreprocessor):\n    \"\"\"Base class to preprocess `Networkx::Graph` into `SparseGraphDataset`\n\n    \"\"\"\n\n    def construct_data(self, graph):\n        \"\"\"Construct `SparseGraphData` from `Networkx::Graph`\n\n        Args:\n            graph (Networkx::Graph): graph\n\n        Returns:\n            SparseGraphData: graph data of 
sparse pattern\n        \"\"\"\n        edge_index = numpy.empty((2, graph.number_of_edges() * 2),\n                                 dtype=numpy.int)\n        for i, edge in enumerate(graph.edges):\n            edge_index[0][2 * i] = edge[0]\n            edge_index[0][2 * i + 1] = edge[1]\n            edge_index[1][2 * i] = edge[1]\n            edge_index[1][2 * i + 1] = edge[0]\n\n        return SparseGraphData(\n            x=self.get_x(graph),\n            edge_index=numpy.array(edge_index, dtype=numpy.int),\n            y=self.get_y(graph),\n            label_num=graph.graph['label_num']\n        )\n\n    def add_self_loop(self, graph):\n        for v in range(graph.number_of_nodes()):\n            graph.add_edge(v, v)\n        return graph\n\n    def create_dataset(self, graph_list):\n        \"\"\"Create `SparseGraphDataset` from list of `Networkx::Graph`\n\n        Args:\n            graph_list (list[Networkx::Graph]): list of graphs\n\n        Returns:\n            SparseGraphDataset: graph dataset of sparse pattern\n        \"\"\"\n        data_list = [\n            self.construct_data(graph) for graph in graph_list\n        ]\n        dataset = SparseGraphDataset(data_list)\n        dataset.register_feature('label_num', batch_without_padding)\n        return dataset\n"
  },
  {
    "path": "chainer_chemistry/dataset/networkx_preprocessors/reddit_coo.py",
    "content": "import os\n\nimport numpy\nimport scipy\n\nimport chainer\n\nfrom chainer_chemistry.dataset.graph_dataset.base_graph_data import PaddingGraphData  # NOQA\n\n\ndef get_reddit_coo_data(dirpath):\n    \"\"\"Temporary function to obtain reddit coo data for GIN\n\n    (because it takes to much time to convert it to networkx)\n\n    Returns:\n        PaddingGraphData: `PaddingGraphData` of reddit\n    \"\"\"\n\n    print(\"Loading node feature and label\")\n    reddit_data = numpy.load(os.path.join(dirpath, \"reddit_data.npz\"))\n\n    print(\"Loading edge data\")\n    coo_adj = scipy.sparse.load_npz(os.path.join(dirpath, \"reddit_graph.npz\"))\n    row = coo_adj.row.astype(numpy.int32)\n    col = coo_adj.col.astype(numpy.int32)\n    data = coo_adj.data.astype(numpy.float32)\n\n    # ensure row is sorted\n    if not numpy.all(row[:-1] <= row[1:]):\n        order = numpy.argsort(row)\n        row = row[order]\n        col = col[order]\n    assert numpy.all(row[:-1] <= row[1:])\n\n    adj = chainer.utils.CooMatrix(\n        data=data, row=row, col=col,\n        shape=coo_adj.shape,\n        order='C')\n\n    return PaddingGraphData(\n        x=reddit_data['feature'].astype(numpy.float32),\n        adj=adj,\n        y=reddit_data['label'].astype(numpy.int32),\n        label_num=41\n    )\n"
  },
  {
    "path": "chainer_chemistry/dataset/parsers/__init__.py",
    "content": "from chainer_chemistry.dataset.parsers import base_parser  # NOQA\nfrom chainer_chemistry.dataset.parsers import csv_file_parser  # NOQA\nfrom chainer_chemistry.dataset.parsers import data_frame_parser  # NOQA\nfrom chainer_chemistry.dataset.parsers import sdf_file_parser  # NOQA\nfrom chainer_chemistry.dataset.parsers import smiles_parser  # NOQA\n\nfrom chainer_chemistry.dataset.parsers.base_parser import BaseFileParser  # NOQA\nfrom chainer_chemistry.dataset.parsers.base_parser import BaseParser  # NOQA\nfrom chainer_chemistry.dataset.parsers.csv_file_parser import CSVFileParser  # NOQA\nfrom chainer_chemistry.dataset.parsers.data_frame_parser import DataFrameParser  # NOQA\nfrom chainer_chemistry.dataset.parsers.sdf_file_parser import SDFFileParser  # NOQA\nfrom chainer_chemistry.dataset.parsers.smiles_parser import SmilesParser  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/dataset/parsers/base_parser.py",
    "content": "class BaseParser(object):\n    def __init__(self):\n        pass\n\n\nclass BaseFileParser(BaseParser):\n    \"\"\"base class for file parser\"\"\"\n\n    def __init__(self, preprocessor):\n        super(BaseFileParser, self).__init__()\n        self.preprocessor = preprocessor\n\n    def parse(self, filepath):\n        raise NotImplementedError\n"
  },
  {
    "path": "chainer_chemistry/dataset/parsers/csv_file_parser.py",
    "content": "import pandas\n\nfrom chainer_chemistry.dataset.parsers.data_frame_parser import DataFrameParser\n\n\nclass CSVFileParser(DataFrameParser):\n    \"\"\"csv file parser\n\n    This FileParser parses .csv file.\n    It should contain column which contain SMILES as input, and\n    label column which is the target to predict.\n\n    Args:\n        preprocessor (BasePreprocessor): preprocessor instance\n        labels (str or list): labels column\n        smiles_col (str): smiles column\n        postprocess_label (Callable): post processing function if necessary\n        postprocess_fn (Callable): post processing function if necessary\n        logger:\n    \"\"\"\n\n    def __init__(self, preprocessor,\n                 labels=None,\n                 smiles_col='smiles',\n                 postprocess_label=None, postprocess_fn=None,\n                 logger=None):\n        super(CSVFileParser, self).__init__(\n            preprocessor, labels=labels, smiles_col=smiles_col,\n            postprocess_label=postprocess_label, postprocess_fn=postprocess_fn,\n            logger=logger)\n\n    def parse(self, filepath, return_smiles=False, target_index=None,\n              return_is_successful=False):\n        \"\"\"parse csv file using `preprocessor`\n\n        Label is extracted from `labels` columns and input features are\n        extracted from smiles information in `smiles` column.\n\n        Args:\n            filepath (str): file path to be parsed.\n            return_smiles (bool): If set to True, this function returns\n                preprocessed dataset and smiles list.\n                If set to False, this function returns preprocessed dataset and\n                `None`.\n            target_index (list or None): target index list to partially extract\n                dataset. If None (default), all examples are parsed.\n            return_is_successful (bool): If set to `True`, boolean list is\n                returned in the key 'is_successful'. 
It represents whether\n                preprocessing has succeeded or not for each SMILES.\n                If set to False, `None` is returned in the key 'is_success'.\n\n        Returns (dict): dictionary that contains Dataset, 1-d numpy array with\n            dtype=object(string) which is a vector of smiles for each example\n            or None.\n\n        \"\"\"\n        df = pandas.read_csv(filepath)\n        return super(CSVFileParser, self).parse(\n            df, return_smiles=return_smiles, target_index=target_index,\n            return_is_successful=return_is_successful)\n\n    def extract_total_num(self, filepath):\n        \"\"\"Extracts total number of data which can be parsed\n\n        We can use this method to determine the value fed to `target_index`\n        option of `parse` method. For example, if we want to extract input\n        feature from 10% of whole dataset, we need to know how many samples\n        are in a file. The returned value of this method may not be the same as\n        the final dataset size.\n\n        Args:\n            filepath (str): file path to check the total number.\n\n        Returns (int): total number of dataset can be parsed.\n\n        \"\"\"\n        df = pandas.read_csv(filepath)\n        return len(df)\n"
  },
  {
    "path": "chainer_chemistry/dataset/parsers/data_frame_parser.py",
    "content": "from logging import getLogger\n\nimport numpy\nfrom rdkit import Chem\nfrom tqdm import tqdm\n\nfrom chainer_chemistry.dataset.parsers.base_parser import BaseFileParser\nfrom chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor  # NOQA\n\nimport traceback\n\n\nclass DataFrameParser(BaseFileParser):\n    \"\"\"data frame parser\n\n    This FileParser parses pandas dataframe.\n    It should contain column which contain SMILES as input, and\n    label column which is the target to predict.\n\n    Args:\n        preprocessor (BasePreprocessor): preprocessor instance\n        labels (str or list or None): labels column\n        smiles_col (str): smiles column\n        postprocess_label (Callable): post processing function if necessary\n        postprocess_fn (Callable): post processing function if necessary\n        logger:\n    \"\"\"\n\n    def __init__(self, preprocessor,\n                 labels=None,\n                 smiles_col='smiles',\n                 postprocess_label=None, postprocess_fn=None,\n                 logger=None):\n        super(DataFrameParser, self).__init__(preprocessor)\n        if isinstance(labels, str):\n            labels = [labels, ]\n        self.labels = labels  # type: list\n        self.smiles_col = smiles_col\n        self.postprocess_label = postprocess_label\n        self.postprocess_fn = postprocess_fn\n        self.logger = logger or getLogger(__name__)\n\n    def parse(self, df, return_smiles=False, target_index=None,\n              return_is_successful=False):\n        \"\"\"parse DataFrame using `preprocessor`\n\n        Label is extracted from `labels` columns and input features are\n        extracted from smiles information in `smiles` column.\n\n        Args:\n            df (pandas.DataFrame): dataframe to be parsed.\n            return_smiles (bool): If set to `True`, smiles list is 
returned in\n                the key 'smiles', it is a list of SMILES from which input\n                features are successfully made.\n                If set to `False`, `None` is returned in the key 'smiles'.\n            target_index (list or None): target index list to partially extract\n                dataset. If None (default), all examples are parsed.\n            return_is_successful (bool): If set to `True`, boolean list is\n                returned in the key 'is_successful'. It represents\n                preprocessing has succeeded or not for each SMILES.\n                If set to False, `None` is returned in the key 'is_success'.\n\n        Returns (dict): dictionary that contains Dataset, 1-d numpy array with\n            dtype=object(string) which is a vector of smiles for each example\n            or None.\n\n        \"\"\"\n        logger = self.logger\n        pp = self.preprocessor\n        smiles_list = []\n        is_successful_list = []\n\n        # counter = 0\n        if isinstance(pp, MolPreprocessor):\n            if target_index is not None:\n                df = df.iloc[target_index]\n\n            features = None\n            smiles_index = df.columns.get_loc(self.smiles_col)\n            if self.labels is None:\n                labels_index = []  # dummy list\n            else:\n                labels_index = [df.columns.get_loc(c) for c in self.labels]\n\n            total_count = df.shape[0]\n            fail_count = 0\n            success_count = 0\n            for row in tqdm(df.itertuples(index=False), total=df.shape[0]):\n                smiles = row[smiles_index]\n                # TODO(Nakago): Check.\n                # currently it assumes list\n                labels = [row[i] for i in labels_index]\n                try:\n                    mol = Chem.MolFromSmiles(smiles)\n                    if mol is None:\n                        fail_count += 1\n                        if return_is_successful:\n                       
     is_successful_list.append(False)\n                        continue\n                    # Note that smiles expression is not unique.\n                    # we obtain canonical smiles\n                    canonical_smiles, mol = pp.prepare_smiles_and_mol(mol)\n                    input_features = pp.get_input_features(mol)\n\n                    # Extract label\n                    if self.postprocess_label is not None:\n                        labels = self.postprocess_label(labels)\n\n                    if return_smiles:\n                        smiles_list.append(canonical_smiles)\n                except MolFeatureExtractionError as e:  # NOQA\n                    # This is expected error that extracting feature failed,\n                    # skip this molecule.\n                    fail_count += 1\n                    if return_is_successful:\n                        is_successful_list.append(False)\n                    continue\n                except Exception as e:\n                    logger.warning('parse(), type: {}, {}'\n                                   .format(type(e).__name__, e.args))\n                    logger.info(traceback.format_exc())\n                    fail_count += 1\n                    if return_is_successful:\n                        is_successful_list.append(False)\n                    continue\n                # Initialize features: list of list\n                if features is None:\n                    if isinstance(input_features, tuple):\n                        num_features = len(input_features)\n                    else:\n                        num_features = 1\n                    if self.labels is not None:\n                        num_features += 1\n                    features = [[] for _ in range(num_features)]\n\n                if isinstance(input_features, tuple):\n                    for i in range(len(input_features)):\n                        features[i].append(input_features[i])\n                else:\n          
          features[0].append(input_features)\n                if self.labels is not None:\n                    features[len(features) - 1].append(labels)\n                success_count += 1\n                if return_is_successful:\n                    is_successful_list.append(True)\n            ret = []\n\n            for feature in features:\n                try:\n                    feat_array = numpy.asarray(feature)\n                except ValueError:\n                    # Temporal work around.\n                    # See,\n                    # https://stackoverflow.com/questions/26885508/why-do-i-get-error-trying-to-cast-np-arraysome-list-valueerror-could-not-broa\n                    feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)\n                    feat_array[:] = feature[:]\n                ret.append(feat_array)\n            result = tuple(ret)\n            logger.info('Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'\n                        .format(fail_count, success_count, total_count))\n        else:\n            raise NotImplementedError\n\n        smileses = numpy.array(\n            smiles_list, dtype=object) if return_smiles else None\n        if return_is_successful:\n            is_successful = numpy.array(is_successful_list)\n        else:\n            is_successful = None\n\n        if isinstance(result, tuple):\n            if self.postprocess_fn is not None:\n                result = self.postprocess_fn(*result)\n            dataset = pp.create_dataset(*result)\n        else:\n            if self.postprocess_fn is not None:\n                result = self.postprocess_fn(result)\n            dataset = pp.create_dataset(*result)\n        return {\"dataset\": dataset,\n                \"smiles\": smileses,\n                \"is_successful\": is_successful}\n\n    def extract_total_num(self, df):\n        \"\"\"Extracts total number of data which can be parsed\n\n        We can use this method to determine the value fed to 
`target_index`\n        option of `parse` method. For example, if we want to extract input\n        feature from 10% of whole dataset, we need to know how many samples\n        are in a file. The returned value of this method may not be the same as\n        the final dataset size.\n\n        Args:\n            df (pandas.DataFrame): dataframe to be parsed.\n\n        Returns (int): total number of dataset can be parsed.\n\n        \"\"\"\n        return len(df)\n"
  },
  {
    "path": "chainer_chemistry/dataset/parsers/sdf_file_parser.py",
    "content": "from logging import getLogger\n\nimport numpy\nfrom rdkit import Chem\nfrom tqdm import tqdm\n\nfrom chainer_chemistry.dataset.parsers.base_parser import BaseFileParser\nfrom chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor  # NOQA\n\n\nclass SDFFileParser(BaseFileParser):\n    \"\"\"sdf file parser\n\n    Args:\n        preprocessor (BasePreprocessor): preprocessor instance\n        labels (str or list): labels column\n        postprocess_label (Callable): post processing function if necessary\n        postprocess_fn (Callable): post processing function if necessary\n        logger:\n    \"\"\"\n\n    def __init__(self, preprocessor, labels=None, postprocess_label=None,\n                 postprocess_fn=None, logger=None):\n        super(SDFFileParser, self).__init__(preprocessor)\n        self.labels = labels\n        self.postprocess_label = postprocess_label\n        self.postprocess_fn = postprocess_fn\n        self.logger = logger or getLogger(__name__)\n\n    def parse(self, filepath, return_smiles=False, target_index=None,\n              return_is_successful=False):\n        \"\"\"parse sdf file using `preprocessor`\n\n        Note that label is extracted from preprocessor's method.\n\n        Args:\n            filepath (str): file path to be parsed.\n            return_smiles (bool): If set to True, this function returns\n                preprocessed dataset and smiles list.\n                If set to False, this function returns preprocessed dataset and\n                `None`.\n            target_index (list or None): target index list to partially extract\n                dataset. If None (default), all examples are parsed.\n            return_is_successful (bool): If set to `True`, boolean list is\n                returned in the key 'is_successful'. 
It represents\n                preprocessing has succeeded or not for each SMILES.\n                If set to False, `None` is returned in the key 'is_success'.\n\n        Returns (dict): dictionary that contains Dataset, 1-d numpy array with\n            dtype=object(string) which is a vector of smiles for each example\n            or None.\n\n        \"\"\"\n        logger = self.logger\n        pp = self.preprocessor\n        smiles_list = []\n        is_successful_list = []\n\n        if isinstance(pp, MolPreprocessor):\n            mol_supplier = Chem.SDMolSupplier(filepath)\n\n            if target_index is None:\n                target_index = list(range(len(mol_supplier)))\n\n            features = None\n\n            total_count = len(mol_supplier)\n            fail_count = 0\n            success_count = 0\n            for index in tqdm(target_index):\n                # `mol_supplier` does not accept numpy.integer, we must use int\n                mol = mol_supplier[int(index)]\n\n                if mol is None:\n                    fail_count += 1\n                    if return_is_successful:\n                        is_successful_list.append(False)\n                    continue\n                try:\n                    # Labels need to be extracted from `mol` before standardize\n                    # smiles.\n                    if self.labels is not None:\n                        label = pp.get_label(mol, self.labels)\n                        if self.postprocess_label is not None:\n                            label = self.postprocess_label(label)\n\n                    # Note that smiles expression is not unique.\n                    # we obtain canonical smiles\n                    smiles = Chem.MolToSmiles(mol)\n                    mol = Chem.MolFromSmiles(smiles)\n                    canonical_smiles, mol = pp.prepare_smiles_and_mol(mol)\n                    input_features = pp.get_input_features(mol)\n\n                    # Initialize features: 
list of list\n                    if features is None:\n                        if isinstance(input_features, tuple):\n                            num_features = len(input_features)\n                        else:\n                            num_features = 1\n                        if self.labels is not None:\n                            num_features += 1\n                        features = [[] for _ in range(num_features)]\n\n                    if return_smiles:\n                        smiles_list.append(canonical_smiles)\n                except MolFeatureExtractionError as e:  # NOQA\n                    # This is expected error that extracting feature failed,\n                    # skip this molecule.\n                    fail_count += 1\n                    if return_is_successful:\n                        is_successful_list.append(False)\n                    continue\n                except Exception as e:\n                    logger.warning('parse() error, type: {}, {}'\n                                   .format(type(e).__name__, e.args))\n                    fail_count += 1\n                    if return_is_successful:\n                        is_successful_list.append(False)\n                    continue\n\n                if isinstance(input_features, tuple):\n                    for i in range(len(input_features)):\n                        features[i].append(input_features[i])\n                else:\n                    features[0].append(input_features)\n                if self.labels is not None:\n                    features[len(features) - 1].append(label)\n                success_count += 1\n                if return_is_successful:\n                    is_successful_list.append(True)\n\n            ret = []\n\n            for feature in features:\n                try:\n                    feat_array = numpy.asarray(feature)\n                except ValueError:\n                    # Temporal work around to convert object-type list into\n           
         # numpy array.\n                    # See, https://goo.gl/kgJXwb\n                    feat_array = numpy.empty(len(feature), dtype=numpy.ndarray)\n                    feat_array[:] = feature[:]\n                ret.append(feat_array)\n            result = tuple(ret)\n            logger.info('Preprocess finished. FAIL {}, SUCCESS {}, TOTAL {}'\n                        .format(fail_count, success_count, total_count))\n        else:\n            # Spec not finalized yet for general case\n            result = pp.process(filepath)\n\n        smileses = numpy.array(\n            smiles_list, dtype=object) if return_smiles else None\n        if return_is_successful:\n            is_successful = numpy.array(is_successful_list)\n        else:\n            is_successful = None\n\n        if isinstance(result, tuple):\n            if self.postprocess_fn is not None:\n                result = self.postprocess_fn(*result)\n            dataset = pp.create_dataset(*result)\n        else:\n            if self.postprocess_fn is not None:\n                result = self.postprocess_fn(result)\n            dataset = pp.create_dataset(*result)\n        return {\"dataset\": dataset,\n                \"smiles\": smileses,\n                \"is_successful\": is_successful}\n\n    def extract_total_num(self, filepath):\n        \"\"\"Extracts total number of data which can be parsed\n\n        We can use this method to determine the value fed to `target_index`\n        option of `parse` method. For example, if we want to extract input\n        feature from 10% of whole dataset, we need to know how many samples\n        are in a file. The returned value of this method may not to be same as\n        the final dataset size.\n\n        Args:\n            filepath (str): file path of to check the total number.\n\n        Returns (int): total number of dataset can be parsed.\n\n        \"\"\"\n        mol_supplier = Chem.SDMolSupplier(filepath)\n        return len(mol_supplier)\n"
  },
  {
    "path": "chainer_chemistry/dataset/parsers/smiles_parser.py",
    "content": "import pandas\n\nfrom chainer_chemistry.dataset.parsers.data_frame_parser import DataFrameParser\n\n\nclass SmilesParser(DataFrameParser):\n    \"\"\"smiles parser\n\n    It parses `smiles_list`, which is a list of string of smiles.\n\n    Args:\n        preprocessor (BasePreprocessor): preprocessor instance\n        postprocess_label (Callable): post processing function if necessary\n        postprocess_fn (Callable): post processing function if necessary\n        logger:\n    \"\"\"\n\n    def __init__(self, preprocessor,\n                 postprocess_label=None, postprocess_fn=None,\n                 logger=None):\n        super(SmilesParser, self).__init__(\n            preprocessor, labels=None, smiles_col='smiles',\n            postprocess_label=postprocess_label, postprocess_fn=postprocess_fn,\n            logger=logger)\n\n    def parse(self, smiles_list, return_smiles=False, target_index=None,\n              return_is_successful=False):\n        \"\"\"parse `smiles_list` using `preprocessor`\n\n        Label is extracted from `labels` columns and input features are\n        extracted from smiles information in `smiles` column.\n\n        Args:\n            smiles_list (list): list of strings of smiles\n            return_smiles (bool): If set to True, this function returns\n                preprocessed dataset and smiles list.\n                If set to False, this function returns preprocessed dataset and\n                `None`.\n            target_index (list or None): target index list to partially extract\n                dataset. If None (default), all examples are parsed.\n            return_is_successful (bool): If set to `True`, boolean list is\n                returned in the key 'is_successful'. 
It represents whether\n                preprocessing has succeeded or not for each SMILES.\n                If set to False, `None` is returned in the key 'is_success'.\n\n        Returns (dict): dictionary that contains Dataset, 1-d numpy array with\n            dtype=object(string) which is a vector of smiles for each example\n            or None.\n\n        \"\"\"\n        df = pandas.DataFrame({'smiles': smiles_list})\n        return super(SmilesParser, self).parse(\n            df, return_smiles=return_smiles, target_index=target_index,\n            return_is_successful=return_is_successful)\n\n    def extract_total_num(self, smiles_list):\n        \"\"\"Extracts total number of data which can be parsed\n\n        We can use this method to determine the value fed to `target_index`\n        option of `parse` method. For example, if we want to extract input\n        feature from 10% of whole dataset, we need to know how many samples\n        are in a file. The returned value of this method may not be the same as\n        the final dataset size.\n\n        Args:\n            smiles_list (list): list of strings of smiles\n\n        Returns (int): total number of dataset can be parsed.\n\n        \"\"\"\n        return len(smiles_list)\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/__init__.py",
    "content": "from chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.base_preprocessor import BasePreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.cgcnn_preprocessor import CGCNNPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import construct_adj_matrix  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import construct_atomic_number_array  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import construct_discrete_edge_matrix  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import construct_supernode_feature  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.ecfp_preprocessor import ECFPPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.ggnn_preprocessor import GGNNPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.gin_preprocessor import GINPreprocessor, GINSparsePreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.gnnfilm_preprocessor import GNNFiLMPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.gwm_preprocessor import GGNNGWMPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.gwm_preprocessor import GINGWMPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.gwm_preprocessor import NFPGWMPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.gwm_preprocessor import RSGCNGWMPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.megnet_preprocessor import MEGNetPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.nfp_preprocessor import NFPPreprocessor  # NOQA\nfrom 
chainer_chemistry.dataset.preprocessors.relgat_preprocessor import RelGATPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.relgcn_preprocessor import RelGCNPreprocessor, RelGCNSparsePreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.rsgcn_preprocessor import RSGCNPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.schnet_preprocessor import SchNetPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.weavenet_preprocessor import WeaveNetPreprocessor  # NOQA\n\npreprocess_method_dict = {\n    'ecfp': ECFPPreprocessor,\n    'nfp': NFPPreprocessor,\n    'nfp_gwm': NFPGWMPreprocessor,\n    'ggnn': GGNNPreprocessor,\n    'ggnn_gwm': GGNNGWMPreprocessor,\n    'gin': GINPreprocessor,\n    'gin_gwm': GINGWMPreprocessor,\n    'schnet': SchNetPreprocessor,\n    'weavenet': WeaveNetPreprocessor,\n    'relgcn': RelGCNPreprocessor,\n    'rsgcn': RSGCNPreprocessor,\n    'rsgcn_gwm': RSGCNGWMPreprocessor,\n    'relgat': RelGATPreprocessor,\n    'relgcn_sparse': RelGCNSparsePreprocessor,\n    'gin_sparse': GINSparsePreprocessor,\n    'gnnfilm': GNNFiLMPreprocessor,\n    'megnet': MEGNetPreprocessor,\n    'cgcnn': CGCNNPreprocessor\n}\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/atomic_number_preprocessor.py",
    "content": "from chainer_chemistry.dataset.preprocessors.common \\\n    import construct_atomic_number_array\nfrom chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor \\\n    import MolPreprocessor\n\n\nclass AtomicNumberPreprocessor(MolPreprocessor):\n    \"\"\"Atomic number Preprocessor\n\n    Args:\n        max_atoms (int): Max number of atoms for each molecule, if the\n            number of atoms is more than this value, this data is simply\n            ignored.\n            Setting negative value indicates no limit for max atoms.\n        out_size (int): It specifies the size of array returned by\n            `get_input_features`.\n            If the number of atoms in the molecule is less than this value,\n            the returned arrays is padded to have fixed size.\n            Setting negative value indicates do not pad returned array.\n\n    \"\"\"\n\n    def __init__(self, max_atoms=-1, out_size=-1):\n        super(AtomicNumberPreprocessor, self).__init__()\n        if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size:\n            raise ValueError('max_atoms {} must be less or equal to '\n                             'out_size {}'.format(max_atoms, out_size))\n        self.max_atoms = max_atoms\n        self.out_size = out_size\n\n    def get_input_features(self, mol):\n        \"\"\"get input features\n\n        Args:\n            mol (Mol):\n\n        Returns:\n\n        \"\"\"\n        type_check_num_atoms(mol, self.max_atoms)\n        atom_array = construct_atomic_number_array(mol, out_size=self.out_size)\n        return atom_array\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/base_preprocessor.py",
    "content": "\"\"\"\nPreprocessor supports feature extraction for each model (network)\n\"\"\"\n\n\nclass BasePreprocessor(object):\n    \"\"\"Base class for preprocessor\"\"\"\n\n    def __init__(self):\n        pass\n\n    def process(self, filepath):\n        pass\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/cgcnn_preprocessor.py",
    "content": "from logging import getLogger\nimport numpy\nimport os\nimport shutil\n\nfrom chainer.dataset import download\n\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.utils import GaussianDistance\nfrom chainer_chemistry.utils import load_json\n\ndownload_url = 'https://raw.githubusercontent.com/txie-93/cgcnn/master/data/sample-regression/atom_init.json'  # NOQA\nfile_name_atom_init_json = 'atom_init.json'\n\n_root = 'pfnet/chainer/cgcnn'\n\n\ndef get_atom_init_json_filepath(download_if_not_exist=True):\n    \"\"\"Construct a filepath which stores atom_init_json\n\n    This method check whether the file exist or not,  and downloaded it if\n    necessary.\n\n    Args:\n        download_if_not_exist (bool): If `True` download dataset\n            if it is not downloaded yet.\n\n    Returns (str): file path for atom_init_json\n    \"\"\"\n    cache_root = download.get_dataset_directory(_root)\n    cache_path = os.path.join(cache_root, file_name_atom_init_json)\n    if not os.path.exists(cache_path) and download_if_not_exist:\n        logger = getLogger(__name__)\n        logger.info('Downloading atom_init.json...')\n        download_file_path = download.cached_download(download_url)\n        shutil.copy(download_file_path, cache_path)\n    return cache_path\n\n\nclass CGCNNPreprocessor(MolPreprocessor):\n    \"\"\"CGCNNPreprocessor\n\n    Args:\n    For Molecule: TODO\n    \"\"\"\n\n    def __init__(self, max_num_nbr=12, max_radius=8, expand_dim=40):\n        super(CGCNNPreprocessor, self).__init__()\n\n        self.max_num_nbr = max_num_nbr\n        self.max_radius = max_radius\n        self.gdf = GaussianDistance(centers=numpy.linspace(0, 8, expand_dim))\n        feat_dict = load_json(get_atom_init_json_filepath())\n        self.atom_features = {int(key): numpy.array(value,\n                                                    dtype=numpy.float32)\n                              for 
key, value in feat_dict.items()}\n\n    def get_input_features(self, mol):\n        raise NotImplementedError()\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/common.py",
    "content": "\"\"\"Common preprocess method is gethered in this file\"\"\"\nimport numpy\nfrom rdkit import Chem\nfrom rdkit.Chem import rdmolops\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\n\n\nclass MolFeatureExtractionError(Exception):\n    pass\n\n\n# --- Type check ---\ndef type_check_num_atoms(mol, num_max_atoms=-1):\n    \"\"\"Check number of atoms in `mol` does not exceed `num_max_atoms`\n\n    If number of atoms in `mol` exceeds the number `num_max_atoms`, it will\n    raise `MolFeatureExtractionError` exception.\n\n    Args:\n        mol (Mol):\n        num_max_atoms (int): If negative value is set, not check number of\n            atoms.\n\n    \"\"\"\n    num_atoms = mol.GetNumAtoms()\n    if num_max_atoms >= 0 and num_atoms > num_max_atoms:\n        # Skip extracting feature. ignore this case.\n        raise MolFeatureExtractionError(\n            'Number of atoms in mol {} exceeds num_max_atoms {}'\n            .format(num_atoms, num_max_atoms))\n\n\n# --- Atom preprocessing ---\ndef construct_atomic_number_array(mol, out_size=-1):\n    \"\"\"Returns atomic numbers of atoms consisting a molecule.\n\n    Args:\n        mol (rdkit.Chem.Mol): Input molecule.\n        out_size (int): The size of returned array.\n            If this option is negative, it does not take any effect.\n            Otherwise, it must be larger than the number of atoms\n            in the input molecules. 
In that case, the tail of\n            the array is padded with zeros.\n\n    Returns:\n        numpy.ndarray: an array consisting of atomic numbers\n            of atoms in the molecule.\n    \"\"\"\n\n    atom_list = [a.GetAtomicNum() for a in mol.GetAtoms()]\n    n_atom = len(atom_list)\n\n    if out_size < 0:\n        return numpy.array(atom_list, dtype=numpy.int32)\n    elif out_size >= n_atom:\n        # 'empty' padding for atom_list\n        # 0 represents empty place for atom\n        atom_array = numpy.zeros(out_size, dtype=numpy.int32)\n        atom_array[:n_atom] = numpy.array(atom_list, dtype=numpy.int32)\n        return atom_array\n    else:\n        raise ValueError('`out_size` (={}) must be negative or '\n                         'larger than or equal to the number '\n                         'of atoms in the input molecules (={})'\n                         '.'.format(out_size, n_atom))\n\n\n# --- Adjacency matrix preprocessing ---\ndef construct_adj_matrix(mol, out_size=-1, self_connection=True):\n    \"\"\"Returns the adjacent matrix of the given molecule.\n\n    This function returns the adjacent matrix of the given molecule.\n    Contrary to the specification of\n    :func:`rdkit.Chem.rdmolops.GetAdjacencyMatrix`,\n    The diagonal entries of the returned matrix are all-one.\n\n    Args:\n        mol (rdkit.Chem.Mol): Input molecule.\n        out_size (int): The size of the returned matrix.\n            If this option is negative, it does not take any effect.\n            Otherwise, it must be larger than the number of atoms\n            in the input molecules. 
In that case, the adjacent\n            matrix is expanded and zeros are padded to right\n            columns and bottom rows.\n        self_connection (bool): Add self connection or not.\n            If True, diagonal element of adjacency matrix is filled with 1.\n\n    Returns:\n        adj_array (numpy.ndarray): The adjacent matrix of the input molecule.\n            It is 2-dimensional array with shape (atoms1, atoms2), where\n            atoms1 & atoms2 represent from and to of the edge respectively.\n            If ``out_size`` is non-negative, the returned\n            its size is equal to that value. Otherwise,\n            it is equal to the number of atoms in the the molecule.\n    \"\"\"\n\n    adj = rdmolops.GetAdjacencyMatrix(mol)\n    s0, s1 = adj.shape\n    if s0 != s1:\n        raise ValueError('The adjacent matrix of the input molecule'\n                         'has an invalid shape: ({}, {}). '\n                         'It must be square.'.format(s0, s1))\n\n    if self_connection:\n        adj = adj + numpy.eye(s0)\n    if out_size < 0:\n        adj_array = adj.astype(numpy.float32)\n    elif out_size >= s0:\n        adj_array = numpy.zeros((out_size, out_size),\n                                dtype=numpy.float32)\n        adj_array[:s0, :s1] = adj\n    else:\n        raise ValueError(\n            '`out_size` (={}) must be negative or larger than or equal to the '\n            'number of atoms in the input molecules (={}).'\n            .format(out_size, s0))\n    return adj_array\n\n\ndef construct_discrete_edge_matrix(mol, out_size=-1,\n                                   add_self_connection_channel=False):\n    \"\"\"Returns the edge-type dependent adjacency matrix of the given molecule.\n\n    Args:\n        mol (rdkit.Chem.Mol): Input molecule.\n        out_size (int): The size of the returned matrix.\n            If this option is negative, it does not take any effect.\n            Otherwise, it must be larger than the number of atoms\n  
          in the input molecules. In that case, the adjacent\n            matrix is expanded and zeros are padded to right\n            columns and bottom rows.\n        add_self_connection_channel (bool): Add self connection or not.\n            If True, adjacency matrix whose diagonal element filled with 1\n            is added to last channel.\n\n    Returns:\n        adj_array (numpy.ndarray): The adjacent matrix of the input molecule.\n            It is 3-dimensional array with shape (edge_type, atoms1, atoms2),\n            where edge_type represents the bond type,\n            atoms1 & atoms2 represent from and to of the edge respectively.\n            If ``out_size`` is non-negative, its size is equal to that value.\n            Otherwise, it is equal to the number of atoms in the the molecule.\n    \"\"\"\n    if mol is None:\n        raise MolFeatureExtractionError('mol is None')\n    N = mol.GetNumAtoms()\n\n    if out_size < 0:\n        size = N\n    elif out_size >= N:\n        size = out_size\n    else:\n        raise ValueError(\n            'out_size {} is smaller than number of atoms in mol {}'\n            .format(out_size, N))\n    if add_self_connection_channel:\n        adjs = numpy.zeros((5, size, size), dtype=numpy.float32)\n    else:\n        adjs = numpy.zeros((4, size, size), dtype=numpy.float32)\n\n    bond_type_to_channel = {\n        Chem.BondType.SINGLE: 0,\n        Chem.BondType.DOUBLE: 1,\n        Chem.BondType.TRIPLE: 2,\n        Chem.BondType.AROMATIC: 3\n    }\n    for bond in mol.GetBonds():\n        bond_type = bond.GetBondType()\n        ch = bond_type_to_channel[bond_type]\n        i = bond.GetBeginAtomIdx()\n        j = bond.GetEndAtomIdx()\n        adjs[ch, i, j] = 1.0\n        adjs[ch, j, i] = 1.0\n    if add_self_connection_channel:\n        adjs[-1] = numpy.eye(N)\n    return adjs\n\n\ndef mol_basic_info_feature(mol, atom_array, adj):\n    n_atoms = mol.GetNumAtoms()\n    if n_atoms != len(atom_array):\n        raise 
ValueError(\"[ERROR] n_atoms {} != len(atom_array) {}\"\n                         .format(n_atoms, len(atom_array)))\n\n    # Note: this is actual number of edges * 2.\n    n_edges = adj.sum()\n    return numpy.asarray([n_atoms, n_edges])\n\n\ndef mol_atom_type_feature(mol, atom_array, adj):\n    atom_count = numpy.bincount(atom_array, minlength=MAX_ATOMIC_NUM + 1)\n    return (atom_count > 0).astype(numpy.float)[1:]\n\n\ndef mol_atom_freq_feature(mol, atom_array, adj):\n    atom_count = numpy.bincount(atom_array, minlength=MAX_ATOMIC_NUM + 1)\n    return (atom_count / len(atom_array))[1:]\n\n\ndef mol_bond_type_feature(mol, atom_array, adj):\n    if adj.ndim == 2:\n        adj = numpy.expand_dims(adj, axis=0)\n    adj = adj.reshape((adj.shape[0], -1))\n    return adj.max(axis=1)\n\n\ndef mol_bond_freq_feature(mol, atom_array, adj):\n    if adj.ndim == 2:\n        adj = numpy.expand_dims(adj, axis=0)\n    adj = adj.reshape((adj.shape[0], -1))\n    adj_sum = adj.sum()\n    if adj_sum == 0:\n        return adj.sum(axis=1)\n    else:\n        return adj.sum(axis=1) / adj_sum\n\n\ndef construct_supernode_feature(mol, atom_array, adj, feature_functions=None):\n    \"\"\"Construct an input feature x' for a supernode\n\n    `out_size` is automatically inferred by `atom_array` and `adj`\n\n    Args:\n        mol (rdkit.Chem.Mol): Input molecule\n        atom_array (numpy.ndarray) : array of atoms\n        adj (numpy.ndarray): N by N 2-way array, or |E| by N by N 3-way array\n            where |E| is the number of edgetypes.\n        feature_functions (None or list): list of callable\n\n    Returns:\n        super_node_x (numpy.ndarray); 1-way array, the supernode feature.\n        len(super_node_x) will be 2 + 2 + MAX_ATOMIC_NUM*2 for 2-way adjs,\n            2 + 4*2 + MAX_ATOMIC_NUM*2 for 3-way adjs\n\n    \"\"\"\n\n    if feature_functions is None:\n        feature_functions = [\n            mol_basic_info_feature, mol_bond_type_feature,\n            
mol_bond_freq_feature, mol_atom_type_feature,\n            mol_atom_freq_feature]\n    super_node_x = numpy.concatenate(\n        [func(mol, atom_array, adj) for func in feature_functions])\n    super_node_x = super_node_x.astype(numpy.float32)\n    return super_node_x\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/ecfp_preprocessor.py",
    "content": "from logging import getLogger\n\nimport numpy\nfrom rdkit.Chem import rdMolDescriptors\n\nfrom chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor  # NOQA\n\n\nclass ECFPPreprocessor(MolPreprocessor):\n\n    def __init__(self, radius=2):\n        super(ECFPPreprocessor, self).__init__()\n        self.radius = radius\n\n    def get_input_features(self, mol):\n        try:\n            fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol,\n                                                                self.radius)\n        except Exception as e:\n            logger = getLogger(__name__)\n            logger.debug('exception caught at ECFPPreprocessor:', e)\n            # Extracting feature failed\n            raise MolFeatureExtractionError\n        # TODO(Nakago): Test it.\n        return numpy.asarray(fp, numpy.float32)\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/ggnn_preprocessor.py",
    "content": "import numpy\n\nfrom chainer_chemistry.dataset.graph_dataset.base_graph_data import SparseGraphData  # NOQA\nfrom chainer_chemistry.dataset.graph_dataset.base_graph_dataset import SparseGraphDataset  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common \\\n    import construct_atomic_number_array, construct_discrete_edge_matrix  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor  # NOQA\n\n\nclass GGNNPreprocessor(MolPreprocessor):\n    \"\"\"GGNN Preprocessor\n\n    Args:\n        max_atoms (int): Max number of atoms for each molecule, if the\n            number of atoms is more than this value, this data is simply\n            ignored.\n            Setting negative value indicates no limit for max atoms.\n        out_size (int): It specifies the size of array returned by\n            `get_input_features`.\n            If the number of atoms in the molecule is less than this value,\n            the returned arrays is padded to have fixed size.\n            Setting negative value indicates do not pad returned array.\n        add_Hs (bool): If True, implicit Hs are added.\n        kekulize (bool): If True, Kekulizes the molecule.\n\n    \"\"\"\n\n    def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False,\n                 kekulize=False):\n        super(GGNNPreprocessor, self).__init__(\n            add_Hs=add_Hs, kekulize=kekulize)\n        if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size:\n            raise ValueError('max_atoms {} must be less or equal to '\n                             'out_size {}'.format(max_atoms, out_size))\n        self.max_atoms = max_atoms\n        self.out_size = out_size\n\n    def get_input_features(self, mol):\n        \"\"\"get input features\n\n        Args:\n            mol (Mol): Molecule input\n\n        Returns:\n\n        \"\"\"\n        
type_check_num_atoms(mol, self.max_atoms)\n        atom_array = construct_atomic_number_array(mol, out_size=self.out_size)\n        adj_array = construct_discrete_edge_matrix(mol, out_size=self.out_size)\n\n        return atom_array, adj_array\n\n\nclass GGNNSparsePreprocessor(GGNNPreprocessor):\n    \"\"\"Sparse GGNN Preprocessor\"\"\"\n\n    def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False,\n                 kekulize=False):\n        super(GGNNSparsePreprocessor, self).__init__(\n            max_atoms=max_atoms, out_size=out_size, add_Hs=add_Hs,\n            kekulize=kekulize)\n\n    def construct_sparse_data(self, x, adj, y):\n        \"\"\"Construct `SparseGraphData` from `x`, `adj`, `y`\n\n        Args:\n            x (numpy.ndarray): input feature\n            adj (numpy.ndarray): adjacency matrix\n            y (numpy.ndarray): output label\n\n        Returns:\n            SparseGraphData: graph data object for sparse pattern\n        \"\"\"\n        edge_index = [[], []]\n        edge_attr = []\n        label_num, n, _ = adj.shape\n        for label in range(label_num):\n            for i in range(n):\n                for j in range(n):\n                    if adj[label, i, j] != 0.:\n                        edge_index[0].append(i)\n                        edge_index[1].append(i)\n                        edge_attr.append(label)\n        return SparseGraphData(\n            x=x,\n            edge_index=numpy.array(edge_index, dtype=numpy.int),\n            edge_attr=numpy.array(edge_attr, dtype=numpy.int),\n            y=y\n        )\n\n    def create_dataset(self, *args, **kwargs):\n        \"\"\"Create `SparseGraphData` from list of `(x, adj, y)`\n\n        Returns:\n            SparseGraphDataset: graph dataset object for sparse pattern\n        \"\"\"\n        # args: (atom_array, adj_array, label_array)\n        data_list = [\n            self.construct_sparse_data(x, adj, y) for (x, adj, y) in zip(*args)\n        ]\n        return 
SparseGraphDataset(data_list)\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/gin_preprocessor.py",
    "content": "import numpy\n\nfrom chainer_chemistry.dataset.graph_dataset.base_graph_data import SparseGraphData  # NOQA\nfrom chainer_chemistry.dataset.graph_dataset.base_graph_dataset import SparseGraphDataset  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common \\\n    import construct_atomic_number_array, construct_adj_matrix  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor  # NOQA\n\n\nclass GINPreprocessor(MolPreprocessor):\n    \"\"\"GIN Preprocessor\n\n    \"\"\"\n\n    def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False):\n        \"\"\"initialize the GIN Preprocessor.\n\n        Args:\n            max_atoms (int): Max number of atoms for each molecule,\n                if the number of atoms is more than this value,\n                this data is simply ignored.\n                Setting negative value indicates no limit for max atoms.\n            out_size (int): It specifies the size of array returned by\n                `get_input_features`.\n                If the number of atoms in the molecule is less than this value,\n                the returned arrays is padded to have fixed size.\n                Setting negative value indicates do not pad returned array.\n            add_Hs (bool): If true, add Hydrogens explicitly.\n        \"\"\"\n        super(GINPreprocessor, self).__init__(add_Hs=add_Hs)\n        if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size:\n            raise ValueError('max_atoms {} must be less or equal to '\n                             'out_size {}'.format(max_atoms, out_size))\n        self.max_atoms = max_atoms\n        self.out_size = out_size\n\n    def get_input_features(self, mol):\n        \"\"\"get input features\n\n        Args:\n            mol (Mol):\n\n        Returns:\n\n        \"\"\"\n        type_check_num_atoms(mol, self.max_atoms)\n        atom_array = 
construct_atomic_number_array(mol, out_size=self.out_size)\n        adj_array = construct_adj_matrix(mol, out_size=self.out_size)\n        return atom_array, adj_array\n\n\nclass GINSparsePreprocessor(MolPreprocessor):\n    \"\"\"Sparse GIN Preprocessor\"\"\"\n\n    def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False):\n        super(GINSparsePreprocessor, self).__init__(add_Hs=add_Hs)\n        if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size:\n            raise ValueError('max_atoms {} must be less or equal to '\n                             'out_size {}'.format(max_atoms, out_size))\n        self.max_atoms = max_atoms\n        self.out_size = out_size\n\n    def get_input_features(self, mol):\n        type_check_num_atoms(mol, self.max_atoms)\n        atom_array = construct_atomic_number_array(mol, out_size=self.out_size)\n        adj_array = construct_adj_matrix(mol, out_size=self.out_size)\n        return atom_array, adj_array\n\n    def construct_sparse_data(self, x, adj, y):\n        \"\"\"Construct `SparseGraphData` from `x`, `adj`, `y`\n\n        Args:\n            x (numpy.ndarray): input feature\n            adj (numpy.ndarray): adjacency matrix\n            y (numpy.ndarray): output label\n\n        Returns:\n            SparseGraphData: graph data object for sparse pattern\n        \"\"\"\n        edge_index = [[], []]\n        n, _ = adj.shape\n        for i in range(n):\n            for j in range(n):\n                if adj[i, j] != 0.:\n                    edge_index[0].append(i)\n                    edge_index[1].append(j)\n        return SparseGraphData(\n            x=x,\n            edge_index=numpy.array(edge_index, dtype=numpy.int),\n            y=y\n        )\n\n    def create_dataset(self, *args, **kwargs):\n        \"\"\"Create `SparseGraphData` from list of `(x, adj, y)`\n\n        Returns:\n            SparseGraphDataset: graph dataset object for sparse pattern\n        \"\"\"\n        # args: (atom_array, adj_array, 
label_array)\n        data_list = [\n            self.construct_sparse_data(x, adj, y) for (x, adj, y) in zip(*args)\n        ]\n        return SparseGraphDataset(data_list)\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/gnnfilm_preprocessor.py",
    "content": "from chainer_chemistry.dataset.preprocessors.common \\\n    import construct_atomic_number_array, construct_discrete_edge_matrix  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor  # NOQA\n\n\nclass GNNFiLMPreprocessor(MolPreprocessor):\n    \"\"\"GNNFiLM Preprocessor\n\n    Args:\n        max_atoms (int): Max number of atoms for each molecule, if the\n            number of atoms is more than this value, this data is simply\n            ignored.\n            Setting negative value indicates no limit for max atoms.\n        out_size (int): It specifies the size of array returned by\n            `get_input_features`.\n            If the number of atoms in the molecule is less than this value,\n            the returned arrays is padded to have fixed size.\n            Setting negative value indicates do not pad returned array.\n        add_Hs (bool): If True, implicit Hs are added.\n        kekulize (bool): If True, Kekulizes the molecule.\n\n    \"\"\"\n\n    def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False,\n                 kekulize=False):\n        super(GNNFiLMPreprocessor, self).__init__(\n            add_Hs=add_Hs, kekulize=kekulize)\n        if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size:\n            raise ValueError('max_atoms {} must be less or equal to '\n                             'out_size {}'.format(max_atoms, out_size))\n        self.max_atoms = max_atoms\n        self.out_size = out_size\n\n    def get_input_features(self, mol):\n        \"\"\"get input features\n\n        Args:\n            mol (Mol): Molecule input\n\n        Returns:\n\n        \"\"\"\n        type_check_num_atoms(mol, self.max_atoms)\n        atom_array = construct_atomic_number_array(mol, out_size=self.out_size)\n        adj_array = construct_discrete_edge_matrix(\n            mol, out_size=self.out_size, 
add_self_connection_channel=True)\n        return atom_array, adj_array\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/gwm_preprocessor.py",
    "content": "from chainer_chemistry.dataset.preprocessors.common import construct_supernode_feature  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.ggnn_preprocessor import GGNNPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.gin_preprocessor import GINPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.nfp_preprocessor import NFPPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.rsgcn_preprocessor import RSGCNPreprocessor  # NOQA\n\n\nclass NFPGWMPreprocessor(NFPPreprocessor):\n    def get_input_features(self, mol):\n        atom_array, adj_array = super(\n            NFPGWMPreprocessor, self).get_input_features(mol)\n        super_node_x = construct_supernode_feature(\n            mol, atom_array, adj_array)\n        return atom_array, adj_array, super_node_x\n\n\nclass GGNNGWMPreprocessor(GGNNPreprocessor):\n    def get_input_features(self, mol):\n        atom_array, adj_array = super(\n            GGNNGWMPreprocessor, self).get_input_features(mol)\n        super_node_x = construct_supernode_feature(\n            mol, atom_array, adj_array)\n        return atom_array, adj_array, super_node_x\n\n\nclass GINGWMPreprocessor(GINPreprocessor):\n    def get_input_features(self, mol):\n        atom_array, adj_array = super(\n            GINGWMPreprocessor, self).get_input_features(mol)\n        super_node_x = construct_supernode_feature(\n            mol, atom_array, adj_array)\n        return atom_array, adj_array, super_node_x\n\n\nclass RSGCNGWMPreprocessor(RSGCNPreprocessor):\n    def get_input_features(self, mol):\n        atom_array, adj_array = super(\n            RSGCNGWMPreprocessor, self).get_input_features(mol)\n        super_node_x = construct_supernode_feature(\n            mol, atom_array, adj_array)\n        return atom_array, adj_array, super_node_x\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/megnet_preprocessor.py",
    "content": "from logging import getLogger\nimport os\nimport traceback\n\nimport numpy\nfrom rdkit import Chem, RDConfig  # NOQA\nfrom rdkit.Chem import AllChem, ChemicalFeatures, Descriptors, rdmolops  # NOQA\n\nfrom chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.utils import GaussianDistance\n\n\nMAX_ATOM_ELEMENT = 94\nATOM = ['H', 'C', 'N', 'O', 'F']\n\n\n# create singleton class\nclass ChemicalFeaturesFactory(object):\n    _instance = None\n\n    @classmethod\n    def get_instance(self):\n        if not self._instance:\n            fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')\n            self._instance = ChemicalFeatures.BuildFeatureFactory(fdefName)\n\n        return self._instance\n\n\n# --- atom feature extraction ---\ndef construct_atom_type_vec(mol, num_max_atoms, atom_list=None,\n                            include_unknown_atom=False):\n    atom_list = atom_list or ATOM\n    if include_unknown_atom:\n        # all atom not in `atom_list` as considered as \"unknown atom\"\n        # and its index is `len(atom_list)`\n        n_atom_type = len(atom_list) + 1\n    else:\n        n_atom_type = len(atom_list)\n\n    atom_type_vec = numpy.zeros((num_max_atoms, n_atom_type),\n                                dtype=numpy.float32)\n    for i in range(num_max_atoms):\n        a = mol.GetAtomWithIdx(i)\n        try:\n            atom_idx = atom_list.index(a.GetSymbol())\n        except ValueError as e:\n            if include_unknown_atom:\n                atom_idx = len(atom_list)\n            else:\n                raise MolFeatureExtractionError(e)\n        atom_type_vec[i, atom_idx] = 1.0\n    return atom_type_vec\n\n\ndef construct_atom_chirality_vec(mol, num_max_atoms):\n    
chirality_vec = numpy.zeros((num_max_atoms, 2), dtype=numpy.float32)\n\n    # chiral_cc: (atom_index, chirality) : (1, 'S')\n    chiral_cc = Chem.FindMolChiralCenters(mol)\n    for chiral_dict in chiral_cc:\n        if chiral_dict[1] == 'R':\n            chirality_vec[chiral_dict[0]] = [1, 0]\n        if chiral_dict[1] == 'S':\n            chirality_vec[chiral_dict[0]] = [0, 1]\n\n    return chirality_vec\n\n\ndef construct_atom_ring_vec(mol, num_max_atoms):\n    sssr = Chem.GetSymmSSSR(mol)\n    ring_feature = numpy.zeros((num_max_atoms, 6,), dtype=numpy.float32)\n    for ring in sssr:\n        ring = list(ring)\n        for i in range(num_max_atoms):\n            if i in ring:\n                ring_size = len(ring)\n                if ring_size >= 3 and ring_size <= 8:\n                    ring_feature[i, ring_size - 3] = 1.0\n    return ring_feature\n\n\ndef construct_hybridization_vec(mol, num_max_atoms):\n    hybridization_vec = numpy.zeros((num_max_atoms, 3), dtype=numpy.float32)\n    for i in range(num_max_atoms):\n        a = mol.GetAtomWithIdx(i)\n        hybridization_type = a.GetHybridization()\n        if hybridization_type is None:\n            continue\n        hybridization_type = str(hybridization_type)\n        if hybridization_type == 'SP1':\n            hybridization_vec[i, 0] = 1.0\n        elif hybridization_type == 'SP2':\n            hybridization_vec[i, 1] = 1.0\n        elif hybridization_type == 'SP3':\n            hybridization_vec[i, 2] = 1.0\n    return hybridization_vec\n\n\ndef construct_hydrogen_bonding(mol, num_max_atoms):\n    factory = ChemicalFeaturesFactory.get_instance()\n    feats = factory.GetFeaturesForMol(mol)\n    hydrogen_bonding_vec = numpy.zeros((num_max_atoms, 2), dtype=numpy.float32)\n    for f in feats:\n        atom_type = f.GetFamily()\n        if atom_type == 'Donor':\n            idx = f.GetAtomIds()[0]\n            hydrogen_bonding_vec[idx, 0] = 1.0\n        if atom_type == 'Acceptor':\n            idx = 
f.GetAtomIds()[0]\n            hydrogen_bonding_vec[idx, 1] = 1.0\n    return hydrogen_bonding_vec\n\n\ndef construct_aromaticity_vec(mol, num_max_atoms):\n    aromaticity_vec = numpy.zeros((num_max_atoms, 1), dtype=numpy.float32)\n    aromatix_atoms = mol.GetAromaticAtoms()\n    for a in aromatix_atoms:\n        aromaticity_vec[a.GetIdx()] = 1.0\n\n    return aromaticity_vec\n\n\ndef construct_atom_feature(mol, use_all_feature, atom_list=None,\n                           include_unknown_atom=False):\n    \"\"\"construct atom feature\n\n    Args:\n        mol (Mol): mol instance\n        use_all_feature (bool):\n            If True, all atom features are extracted.\n            If False, a part of atom features is extracted.\n            You can confirm the detail in the paper.\n        atom_list (list): list of atoms to extract feature. If None, default\n            `ATOM` is used as `atom_list`\n        include_unknown_atom (bool): If False, when the `mol` includes atom\n            which is not in `atom_list`, it will raise\n            `MolFeatureExtractionError`.\n            If True, even the atom is not in `atom_list`, `atom_type` is set\n            as \"unknown\" atom.\n\n    Returns:\n        atom_feature (numpy.ndarray):\n            The shape is (num_nodes, num_node_features).\n    \"\"\"\n    num_max_atoms = mol.GetNumAtoms()\n    atom_type_vec = construct_atom_type_vec(\n        mol, num_max_atoms, atom_list=atom_list,\n        include_unknown_atom=include_unknown_atom)\n    atom_chirality_vec = construct_atom_chirality_vec(\n        mol, num_max_atoms=num_max_atoms)\n    atom_ring_vec = construct_atom_ring_vec(\n        mol, num_max_atoms=num_max_atoms)\n    hybridization_vec = construct_hybridization_vec(\n        mol, num_max_atoms=num_max_atoms)\n    hydrogen_bonding = construct_hydrogen_bonding(\n        mol, num_max_atoms=num_max_atoms)\n    aromaticity_vec = construct_aromaticity_vec(\n        mol, num_max_atoms=num_max_atoms)\n\n    if 
use_all_feature:\n        feature = numpy.hstack((atom_type_vec, atom_chirality_vec,\n                                atom_ring_vec, hybridization_vec,\n                                hydrogen_bonding, aromaticity_vec))\n    else:\n        feature = construct_atom_type_vec(\n            mol, num_max_atoms, atom_list=atom_list,\n            include_unknown_atom=include_unknown_atom)\n\n    return feature\n\n\n# --- pair feature extraction ---\ndef construct_bond_vec(mol, i, j):\n    bond_feature_vec = numpy.zeros((4, ), dtype=numpy.float32)\n    k = mol.GetBondBetweenAtoms(i, j)\n    if k is not None:\n        bond_type = str(k.GetBondType())\n        if bond_type == 'SINGLE':\n            bond_feature_vec[0] = 1.0\n        elif bond_type == 'DOUBLE':\n            bond_feature_vec[1] = 1.0\n        elif bond_type == 'TRIPLE':\n            bond_feature_vec[2] = 1.0\n        elif bond_type == 'AROMATIC':\n            bond_feature_vec[3] = 1.0\n        else:\n            raise ValueError(\"Unknown bond type {}\".format(bond_type))\n    return bond_feature_vec\n\n\ndef get_is_in_ring(mol):\n    \"\"\"create a cache about whether the atom is in a ring or not\n\n    Args:\n        mol (Mol): mol instance\n\n    Returns\n        is_in_ring (dict): key is the atom idx, value is the set()\n    \"\"\"\n    sssr = Chem.GetSymmSSSR(mol)\n    is_in_ring = {}\n    ring_idx = 0\n    for ring in sssr:\n        ring = list(ring)\n        for i in ring:\n            if i not in is_in_ring:\n                is_in_ring[i] = set()\n            is_in_ring[i].add(ring_idx)\n        ring_idx += 1\n\n    return is_in_ring\n\n\ndef construct_ring_feature_vec(is_in_ring, i, j):\n    ring_feature_vec = numpy.zeros((1, ), dtype=numpy.float32)\n    if i in is_in_ring and j in is_in_ring and is_in_ring[i] & is_in_ring[j]:\n        ring_feature_vec[0] = 1.0\n\n    return ring_feature_vec\n\n\ndef construct_expanded_distance_vec(distance_matrix_3d, converter, i, j):\n    # calculate the bond 
length\n    distance = distance_matrix_3d[i, j]\n    # convert from the bond length to vector\n    expanded_distance_vec = converter.expand(distance)\n    return expanded_distance_vec\n\n\ndef construct_pair_feature(mol, use_all_feature):\n    \"\"\"construct pair feature\n\n    Args:\n        mol (Mol): mol instance\n        use_all_feature (bool):\n            If True, all pair features are extracted.\n            If False, a part of pair features is extracted.\n            You can confirm the detail in the paper.\n\n    Returns:\n        features (numpy.ndarray): The shape is (num_edges, num_edge_features)\n        bond_idx (numpy.ndarray): The shape is (2, num_edges)\n            bond_idx[0] represents the list of StartNodeIdx and bond_idx[1]\n            represents the list of EndNodeIdx.\n    \"\"\"\n    converter = GaussianDistance()\n\n    # prepare the data for extracting the pair feature\n    bonds = mol.GetBonds()\n    # (n_nodes, n_nodes): distance in terms of the graph bond.\n    graph_distance_matrix = Chem.GetDistanceMatrix(mol)\n    is_in_ring = get_is_in_ring(mol)\n    confid = AllChem.EmbedMolecule(mol)\n    try:\n        distance_matrix_3d = rdmolops.Get3DDistanceMatrix(\n            mol, confId=confid)\n    except ValueError as e:\n        logger = getLogger(__name__)\n        logger.info('construct_distance_matrix failed, type: {}, {}'\n                    .format(type(e).__name__, e.args))\n        logger.debug(traceback.format_exc())\n        raise MolFeatureExtractionError\n\n    feature = []\n    bond_idx = []\n    for bond in bonds:\n        start_node = bond.GetBeginAtomIdx()\n        end_node = bond.GetEndAtomIdx()\n\n        # create pair feature\n        distance_feature = numpy.array(\n            graph_distance_matrix[start_node][end_node], dtype=numpy.float32)\n        bond_feature = construct_bond_vec(mol, start_node, end_node)\n        ring_feature = construct_ring_feature_vec(\n            is_in_ring, start_node, end_node)\n\n    
    bond_idx.append((start_node, end_node))\n        if use_all_feature:\n            expanded_distance_feature = \\\n                construct_expanded_distance_vec(\n                    distance_matrix_3d, converter, start_node, end_node)\n            feature.append(numpy.hstack((bond_feature, ring_feature,\n                                         distance_feature,\n                                         expanded_distance_feature)))\n        else:\n            expanded_distance_feature = \\\n                construct_expanded_distance_vec(\n                    distance_matrix_3d, converter, start_node, end_node)\n            feature.append(expanded_distance_feature)\n\n    bond_idx = numpy.array(bond_idx).T\n    feature = numpy.array(feature)\n    return feature, bond_idx\n\n\ndef construct_global_state_feature(mol):\n    \"\"\"construct global state feature\n\n    Args:\n        mol (Mol): mol instance\n\n    Returns:\n        feature (numpy.ndarray): 1 dimensional array\n    \"\"\"\n    n_atom = mol.GetNumAtoms()\n    ave_mol_wt = Descriptors.MolWt(mol) / n_atom\n    ave_num_of_bonds = len(mol.GetBonds()) / n_atom\n    feature = numpy.array([ave_mol_wt, ave_num_of_bonds], dtype=numpy.float32)\n    return feature\n\n\nclass MEGNetPreprocessor(MolPreprocessor):\n    \"\"\"MEGNetPreprocessor\n\n    Args:\n    For Molecule\n        max_atoms (int): Max number of atoms for each molecule, if the\n            number of atoms is more than this value, this data is simply\n            ignored.\n            Setting negative value indicates no limit for max atoms.\n        add_Hs (bool): If True, implicit Hs are added.\n        use_all_feature (bool):\n            If True, all atom and pair features is extracted.\n            If it is False, a part of atom and pair features is extracted.\n            You can confirm the detail in the paper.\n        atom_list (list): list of atoms to extract feature. 
If None, default\n            `ATOM` is used as `atom_list`\n        include_unknown_atom (bool): If False, when the `mol` includes atom\n            which is not in `atom_list`, it will raise\n            `MolFeatureExtractionError`.\n            If True, even the atom is not in `atom_list`, `atom_type` is set\n            as \"unknown\" atom.\n        kekulize (bool): If True, Kekulizes the molecule.\n    \"\"\"\n\n    def __init__(self, max_atoms=-1, add_Hs=True,\n                 use_all_feature=False, atom_list=None,\n                 include_unknown_atom=False, kekulize=False,\n                 max_num_nbr=12, max_radius=8, expand_dim=100):\n        super(MEGNetPreprocessor, self).__init__(\n            add_Hs=add_Hs, kekulize=kekulize)\n\n        self.max_atoms = max_atoms\n        self.add_Hs = add_Hs\n        self.use_all_feature = use_all_feature\n        self.atom_list = atom_list\n        self.include_unknown_atom = include_unknown_atom\n        self.max_num_nbr = max_num_nbr\n        self.max_radius = max_radius\n        self.expand_dim = expand_dim\n        self.gdf = GaussianDistance(centers=numpy.linspace(0, 5, expand_dim))\n\n    def get_input_features(self, mol):\n        \"\"\"get input features from mol object\n\n        Args:\n           mol (Mol):\n\n        \"\"\"\n        type_check_num_atoms(mol, self.max_atoms)\n        atom_feature = construct_atom_feature(mol, self.use_all_feature,\n                                              self.atom_list,\n                                              self.include_unknown_atom)\n\n        pair_feature, bond_idx = construct_pair_feature(mol,\n                                                        self.use_all_feature)\n        global_feature = construct_global_state_feature(mol)\n        return atom_feature, pair_feature, global_feature, bond_idx\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/mol_preprocessor.py",
    "content": "from rdkit import Chem\n\nfrom chainer_chemistry.dataset.preprocessors.base_preprocessor import BasePreprocessor  # NOQA\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset  # NOQA\n\n\nclass MolPreprocessor(BasePreprocessor):\n    \"\"\"preprocessor class specified for rdkit mol instance\n\n    Args:\n        add_Hs (bool): If True, implicit Hs are added.\n        kekulize (bool): If True, Kekulizes the molecule.\n    \"\"\"\n\n    def __init__(self, add_Hs=False, kekulize=False):\n        super(MolPreprocessor, self).__init__()\n        self.add_Hs = add_Hs\n        self.kekulize = kekulize\n\n    def prepare_smiles_and_mol(self, mol):\n        \"\"\"Prepare `smiles` and `mol` used in following preprocessing.\n\n        This method is called before `get_input_features` is called, by parser\n        class.\n        This method may be overriden to support custom `smile`/`mol` extraction\n\n        Args:\n            mol (mol): mol instance\n\n        Returns (tuple): (`smiles`, `mol`)\n        \"\"\"\n        # Note that smiles expression is not unique.\n        # we obtain canonical smiles which is unique in `mol`\n        canonical_smiles = Chem.MolToSmiles(mol, isomericSmiles=False,\n                                            canonical=True)\n        mol = Chem.MolFromSmiles(canonical_smiles)\n        if self.add_Hs:\n            mol = Chem.AddHs(mol)\n        if self.kekulize:\n            Chem.Kekulize(mol)\n        return canonical_smiles, mol\n\n    def get_label(self, mol, label_names=None):\n        \"\"\"Extracts label information from a molecule.\n\n        This method extracts properties whose keys are\n        specified by ``label_names`` from a molecule ``mol``\n        and returns these values as a list.\n        The order of the values is same as that of ``label_names``.\n        If the molecule does not have a\n        property with some label, this function fills the corresponding\n        index of the 
returned list with ``None``.\n\n        Args:\n            mol (rdkit.Chem.Mol): molecule whose features to be extracted\n            label_names (None or iterable): list of label names.\n\n        Returns:\n            list of str: label information. Its length is equal to\n            that of ``label_names``. If ``label_names`` is ``None``,\n            this function returns an empty list.\n\n        \"\"\"\n        if label_names is None:\n            return []\n\n        label_list = []\n        for label_name in label_names:\n            if mol.HasProp(label_name):\n                label_list.append(mol.GetProp(label_name))\n            else:\n                label_list.append(None)\n\n                # TODO(Nakago): Review implementation\n                # Label -1 work in case of classification.\n                # However in regression, assign -1 is not a good strategy...\n                # label_list.append(-1)\n\n                # Failed to GetProp for label, skip this case.\n                # print('no label')\n                # raise MolFeatureExtractionError\n\n        return label_list\n\n    def get_input_features(self, mol):\n        \"\"\"get molecule's feature representation, descriptor.\n\n        Each subclass must override this method.\n\n        Args:\n            mol (Mol): molecule whose feature to be extracted.\n                `mol` is prepared by the method `prepare_smiles_and_mol`.\n        \"\"\"\n        raise NotImplementedError\n\n    def create_dataset(self, *args, **kwargs):\n        return NumpyTupleDataset(*args)\n\n    def process(self, filepath):\n        # Not used now...\n        pass\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/nfp_preprocessor.py",
    "content": "from chainer_chemistry.dataset.preprocessors.common import construct_adj_matrix\nfrom chainer_chemistry.dataset.preprocessors.common \\\n    import construct_atomic_number_array\nfrom chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor \\\n    import MolPreprocessor\n\n\nclass NFPPreprocessor(MolPreprocessor):\n    \"\"\"NFP Preprocessor\n\n    Args:\n        max_atoms (int): Max number of atoms for each molecule, if the\n            number of atoms is more than this value, this data is simply\n            ignored.\n            Setting negative value indicates no limit for max atoms.\n        out_size (int): It specifies the size of array returned by\n            `get_input_features`.\n            If the number of atoms in the molecule is less than this value,\n            the returned arrays is padded to have fixed size.\n            Setting negative value indicates do not pad returned array.\n        add_Hs (bool): If True, implicit Hs are added.\n        kekulize (bool): If True, Kekulizes the molecule.\n\n    \"\"\"\n\n    def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False,\n                 kekulize=False):\n        super(NFPPreprocessor, self).__init__(\n            add_Hs=add_Hs, kekulize=kekulize)\n        if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size:\n            raise ValueError('max_atoms {} must be less or equal to '\n                             'out_size {}'.format(max_atoms, out_size))\n        self.max_atoms = max_atoms\n        self.out_size = out_size\n\n    def get_input_features(self, mol):\n        \"\"\"get input features\n\n        Args:\n            mol (Mol):\n\n        Returns:\n\n        \"\"\"\n        type_check_num_atoms(mol, self.max_atoms)\n        atom_array = construct_atomic_number_array(mol, out_size=self.out_size)\n        adj_array = construct_adj_matrix(mol, out_size=self.out_size)\n        return 
atom_array, adj_array\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/relgat_preprocessor.py",
    "content": "from chainer_chemistry.dataset.preprocessors.common import construct_atomic_number_array  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import construct_discrete_edge_matrix  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor  # NOQA\n\n\nclass RelGATPreprocessor(MolPreprocessor):\n    \"\"\"RelGAT Preprocessor\n\n    Args:\n        max_atoms (int): Max number of atoms for each molecule, if the\n            number of atoms is more than this value, this data is simply\n            ignored.\n            Setting negative value indicates no limit for max atoms.\n        out_size (int): It specifies the size of array returned by\n            `get_input_features`.\n            If the number of atoms in the molecule is less than this value,\n            the returned arrays is padded to have fixed size.\n            Setting negative value indicates do not pad returned array.\n\n    \"\"\"\n\n    def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False):\n        super(RelGATPreprocessor, self).__init__(add_Hs=add_Hs)\n        if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size:\n            raise ValueError('max_atoms {} must be less or equal to '\n                             'out_size {}'.format(max_atoms, out_size))\n        self.max_atoms = max_atoms\n        self.out_size = out_size\n\n    def get_input_features(self, mol):\n        \"\"\"get input features\n\n        Args:\n            mol (Mol):\n\n        Returns:\n\n        \"\"\"\n        type_check_num_atoms(mol, self.max_atoms)\n        atom_array = construct_atomic_number_array(mol, out_size=self.out_size)\n        adj_array = construct_discrete_edge_matrix(mol, out_size=self.out_size)\n        return atom_array, adj_array\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/relgcn_preprocessor.py",
    "content": "from chainer_chemistry.dataset.preprocessors.ggnn_preprocessor import GGNNPreprocessor, GGNNSparsePreprocessor  # NOQA\n\n\nclass RelGCNPreprocessor(GGNNPreprocessor):\n    \"\"\"RelGCN Preprocessor\n\n    Args:\n        max_atoms (int): Max number of atoms for each molecule, if the\n            number of atoms is more than this value, this data is simply\n            ignored.\n            Setting negative value indicates no limit for max atoms.\n        out_size (int): It specifies the size of array returned by\n            `get_input_features`.\n            If the number of atoms in the molecule is less than this value,\n            the returned arrays is padded to have fixed size.\n            Setting negative value indicates do not pad returned array.\n        add_Hs (bool): If True, implicit Hs are added.\n        kekulize (bool): If True, Kekulizes the molecule.\n\n    \"\"\"\n\n    def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False,\n                 kekulize=False):\n        super(RelGCNPreprocessor, self).__init__(\n            max_atoms=max_atoms, out_size=out_size, add_Hs=add_Hs,\n            kekulize=kekulize)\n\n    def get_input_features(self, mol):\n        \"\"\"get input features\n\n        Args:\n            mol (Mol):\n\n        Returns:\n\n        \"\"\"\n        return super(RelGCNPreprocessor, self).get_input_features(mol)\n\n\nclass RelGCNSparsePreprocessor(GGNNSparsePreprocessor):\n    pass\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/rsgcn_preprocessor.py",
    "content": "from chainer_chemistry.dataset.preprocessors.common import construct_adj_matrix  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import construct_atomic_number_array  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor  # NOQA\n\n\nimport numpy\n\n\nclass RSGCNPreprocessor(MolPreprocessor):\n    \"\"\"RSGCN Preprocessor\n\n    Args:\n        max_atoms (int): Max number of atoms for each molecule, if the\n            number of atoms is more than this value, this data is simply\n            ignored.\n            Setting negative value indicates no limit for max atoms.\n        out_size (int): It specifies the size of array returned by\n            `get_input_features`.\n            If the number of atoms in the molecule is less than this value,\n            the returned arrays is padded to have fixed size.\n            Setting negative value indicates do not pad returned array.\n        add_Hs (bool): If True, implicit Hs are added.\n        kekulize (bool): If True, Kekulizes the molecule.\n\n    \"\"\"\n\n    def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False,\n                 kekulize=False):\n        super(RSGCNPreprocessor, self).__init__(\n            add_Hs=add_Hs, kekulize=kekulize)\n        if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size:\n            raise ValueError('max_atoms {} must be less or equal to '\n                             'out_size {}'.format(max_atoms, out_size))\n        self.max_atoms = max_atoms\n        self.out_size = out_size\n\n    def get_input_features(self, mol):\n        \"\"\"get input features\n\n        Args:\n            mol (Mol):\n\n        Returns:\n\n        \"\"\"\n        type_check_num_atoms(mol, self.max_atoms)\n        num_atoms = mol.GetNumAtoms()\n\n        # Construct the atom array and adjacency matrix.\n        atom_array = 
construct_atomic_number_array(mol, out_size=self.out_size)\n        adj_array = construct_adj_matrix(mol, out_size=self.out_size)\n\n        # Adjust the adjacency matrix.\n        degree_vec = numpy.sum(adj_array[:num_atoms], axis=1)\n        degree_sqrt_inv = 1. / numpy.sqrt(degree_vec)\n\n        adj_array[:num_atoms, :num_atoms] *= numpy.broadcast_to(\n            degree_sqrt_inv[:, None], (num_atoms, num_atoms))\n        adj_array[:num_atoms, :num_atoms] *= numpy.broadcast_to(\n            degree_sqrt_inv[None, :], (num_atoms, num_atoms))\n\n        return atom_array, adj_array\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/schnet_preprocessor.py",
    "content": "from logging import getLogger\nimport traceback\n\nimport numpy\nfrom rdkit.Chem import AllChem\nfrom rdkit.Chem import rdmolops\n\nfrom chainer_chemistry.dataset.preprocessors.common \\\n    import construct_atomic_number_array\nfrom chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor \\\n    import MolPreprocessor\n\n\ndef construct_distance_matrix(mol, out_size=-1, contain_Hs=False):\n    \"\"\"Construct distance matrix\n\n    Args:\n        mol (Chem.Mol):\n        out_size (int):\n        contain_Hs (bool):\n\n    Returns (numpy.ndarray): 2 dimensional array which represents distance\n        between atoms\n\n    \"\"\"\n    if mol is None:\n        raise MolFeatureExtractionError('mol is None')\n\n    N = mol.GetNumAtoms()\n    if out_size < 0:\n        size = N\n    elif out_size >= N:\n        size = out_size\n    else:\n        raise MolFeatureExtractionError('out_size {} is smaller than number '\n                                        'of atoms in mol {}'\n                                        .format(out_size, N))\n\n    if contain_Hs:\n        mol2 = mol\n    else:\n        mol2 = AllChem.AddHs(mol)\n\n    conf_id = AllChem.EmbedMolecule(mol2)\n    if not contain_Hs:\n        mol2 = AllChem.RemoveHs(mol2)\n\n    try:\n        dist_matrix = rdmolops.Get3DDistanceMatrix(mol2, confId=conf_id)\n    except ValueError as e:\n        logger = getLogger(__name__)\n        logger.info('construct_distance_matrix failed, type: {}, {}'\n                    .format(type(e).__name__, e.args))\n        logger.debug(traceback.format_exc())\n        raise MolFeatureExtractionError\n\n    if size > N:\n        dists = numpy.zeros((size, size), dtype=numpy.float32)\n        a0, a1 = dist_matrix.shape\n        dists[:a0, :a1] = dist_matrix\n    else:\n        dists = 
dist_matrix\n    return dists.astype(numpy.float32)\n\n\nclass SchNetPreprocessor(MolPreprocessor):\n    \"\"\"SchNet Preprocessor\n\n    Args:\n        max_atoms (int): Max number of atoms for each molecule, if the\n            number of atoms is more than this value, this data is simply\n            ignored.\n            Setting negative value indicates no limit for max atoms.\n        out_size (int): It specifies the size of array returned by\n            `get_input_features`.\n            If the number of atoms in the molecule is less than this value,\n            the returned arrays is padded to have fixed size.\n            Setting negative value indicates do not pad returned array.\n        add_Hs (bool): If True, implicit Hs are added.\n        kekulize (bool): If True, Kekulizes the molecule.\n\n    \"\"\"\n\n    def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False,\n                 kekulize=False):\n        super(SchNetPreprocessor, self).__init__(\n            add_Hs=add_Hs, kekulize=kekulize)\n        if max_atoms >= 0 and out_size >= 0 and max_atoms > out_size:\n            raise ValueError('max_atoms {} must be less or equal to '\n                             'out_size {}'.format(max_atoms, out_size))\n        self.max_atoms = max_atoms\n        self.out_size = out_size\n\n    def get_input_features(self, mol):\n        \"\"\"get input features\n\n        Args:\n            mol (Mol):\n\n        Returns:\n\n        \"\"\"\n        type_check_num_atoms(mol, self.max_atoms)\n        atom_array = construct_atomic_number_array(mol, out_size=self.out_size)\n        dist_array = construct_distance_matrix(mol, out_size=self.out_size,\n                                               contain_Hs=self.add_Hs)\n        return atom_array, dist_array\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/weavenet_preprocessor.py",
    "content": "import os\n\nimport numpy\nfrom rdkit import Chem\nfrom rdkit.Chem import AllChem\nfrom rdkit.Chem import ChemicalFeatures\nfrom rdkit import RDConfig\n\nfrom chainer_chemistry.config import WEAVE_DEFAULT_NUM_MAX_ATOMS\nfrom chainer_chemistry.dataset.preprocessors.common \\\n    import construct_atomic_number_array\nfrom chainer_chemistry.dataset.preprocessors.common \\\n    import MolFeatureExtractionError\nfrom chainer_chemistry.dataset.preprocessors.common import type_check_num_atoms\nfrom chainer_chemistry.dataset.preprocessors.mol_preprocessor \\\n    import MolPreprocessor\n\n\nATOM = ['H', 'C', 'N', 'O', 'S', 'Cl', 'Br', 'F', 'P', 'I']\nMAX_DISTANCE = 2  # 7\n\n\n# --- Atom feature extraction ---\ndef construct_atom_type_vec(mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS,\n                            atom_list=None, include_unknown_atom=False):\n    atom_list = atom_list or ATOM\n    if include_unknown_atom:\n        # all atom not in `atom_list` as considered as \"unknown atom\"\n        # and its index is `len(atom_list)`\n        n_atom_type = len(atom_list) + 1\n    else:\n        n_atom_type = len(atom_list)\n    n_atom = mol.GetNumAtoms()\n    atom_type_vec = numpy.zeros((num_max_atoms, n_atom_type),\n                                dtype=numpy.float32)\n    for i in range(n_atom):\n        a = mol.GetAtomWithIdx(i)\n        try:\n            atom_idx = atom_list.index(a.GetSymbol())\n        except ValueError as e:\n            if include_unknown_atom:\n                atom_idx = len(atom_list)\n            else:\n                raise MolFeatureExtractionError(e)\n        atom_type_vec[i, atom_idx] = 1.0\n    return atom_type_vec\n\n\ndef construct_formal_charge_vec(mol,\n                                num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):\n    n_atom = mol.GetNumAtoms()\n    formal_charge_vec = numpy.zeros((num_max_atoms, 1), dtype=numpy.float32)\n    for i in range(n_atom):\n        a = mol.GetAtomWithIdx(i)\n        
formal_charge_vec[i, 0] = a.GetFormalCharge()\n    return formal_charge_vec\n\n\ndef construct_hybridization_vec(mol,\n                                num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):\n    # TODO(Oono)\n    # Can we enhance preprocessing speed by making factory once\n    # prior to calling this function many times?\n    n_atom = mol.GetNumAtoms()\n    hybridization_vec = numpy.zeros((num_max_atoms, 3), dtype=numpy.float32)\n    for i in range(n_atom):\n        a = mol.GetAtomWithIdx(i)\n        if a.GetHybridization() is None:\n            continue\n        hybridization_type = str(a.GetHybridization())\n        if hybridization_type == 'SP1':\n            hybridization_vec[i, 0] = 1.0\n        elif hybridization_type == 'SP2':\n            hybridization_vec[i, 1] = 1.0\n        elif hybridization_type == 'SP3':\n            hybridization_vec[i, 2] = 1.0\n    return hybridization_vec\n\n\ndef construct_partial_charge_vec(\n        mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):\n    AllChem.ComputeGasteigerCharges(mol)\n    n = mol.GetNumAtoms()\n    partial_charge_vec = numpy.zeros((num_max_atoms, 1), dtype=numpy.float32)\n    for i in range(n):\n        a = mol.GetAtomWithIdx(i)\n        partial_charge_vec[i, 0] = a.GetProp(\"_GasteigerCharge\")\n    return partial_charge_vec\n\n\ndef construct_atom_ring_vec(mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):\n    nAtom = mol.GetNumAtoms()\n    sssr = Chem.GetSymmSSSR(mol)\n    ring_feature = numpy.zeros((num_max_atoms, 6,), dtype=numpy.float32)\n    for ring in sssr:\n        ring = list(ring)\n        for i in range(nAtom):\n            if i in ring:\n                ring_size = len(ring)\n                if ring_size >= 3 and ring_size <= 8:\n                    ring_feature[i, ring_size - 3] = 1.0\n    return ring_feature\n\n\ndef construct_hydrogen_bonding(mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):\n    fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')\n    factory = 
ChemicalFeatures.BuildFeatureFactory(fdefName)\n    feats = factory.GetFeaturesForMol(mol)\n    hydrogen_bonding_vec = numpy.zeros((num_max_atoms, 2), dtype=numpy.float32)\n    for f in feats:\n        if f.GetFamily() == 'Donor':\n            idx = f.GetAtomIds()[0]\n            hydrogen_bonding_vec[idx, 0] = 1.0\n        if f.GetFamily() == 'Acceptor':\n            idx = f.GetAtomIds()[0]\n            hydrogen_bonding_vec[idx, 1] = 1.0\n    return hydrogen_bonding_vec\n\n\ndef construct_num_hydrogens_vec(mol,\n                                num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):\n    n_hydrogen_vec = numpy.zeros((num_max_atoms, 1), dtype=numpy.float32)\n    n_atom = mol.GetNumAtoms()\n    for i in range(n_atom):\n        n = 0\n        for j in range(n_atom):\n            if i == j:\n                continue\n            a = mol.GetAtomWithIdx(j)\n            if a.GetSymbol() != 'H':\n                continue\n            k = mol.GetBondBetweenAtoms(i, j)\n            if k is not None:\n                n += 1\n        n_hydrogen_vec[i, 0] = n\n    return n_hydrogen_vec\n\n\ndef construct_aromaticity_vec(mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):\n    aromaticity_vec = numpy.zeros((num_max_atoms, 1), dtype=numpy.float32)\n    aromatix_atoms = mol.GetAromaticAtoms()\n    for a in aromatix_atoms:\n        aromaticity_vec[a.GetIdx()] = 1.0\n\n    return aromaticity_vec\n\n\ndef construct_atom_feature(mol, add_Hs,\n                           num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS,\n                           atom_list=None, include_unknown_atom=False):\n    \"\"\"construct atom feature\n\n    Args:\n        mol (Mol): mol instance\n        add_Hs (bool): if the `mol` instance was added Hs, set True.\n        num_max_atoms (int): number of max atoms\n        atom_list (list): list of atoms to extract feature. 
If None, default\n            `ATOM` is used as `atom_list`\n        include_unknown_atom (bool): If False, when the `mol` includes atom\n            which is not in `atom_list`, it will raise\n            `MolFeatureExtractionError`.\n            If True, even the atom is not in `atom_list`, `atom_type` is set\n            as \"unknown\" atom.\n\n    Returns (numpy.ndarray): 2 dimensional array. First axis size is\n        `num_max_atoms`, representing each atom index.\n        Second axis for feature.\n\n    \"\"\"\n    atom_type_vec = construct_atom_type_vec(\n        mol, num_max_atoms, atom_list=atom_list,\n        include_unknown_atom=include_unknown_atom)\n    # TODO(nakago): Chilarity\n    formal_charge_vec = construct_formal_charge_vec(\n        mol, num_max_atoms=num_max_atoms)\n    partial_charge_vec = construct_partial_charge_vec(\n        mol, num_max_atoms=num_max_atoms)\n    atom_ring_vec = construct_atom_ring_vec(\n        mol, num_max_atoms=num_max_atoms)\n    hybridization_vec = construct_hybridization_vec(\n        mol, num_max_atoms=num_max_atoms)\n    hydrogen_bonding = construct_hydrogen_bonding(\n        mol, num_max_atoms=num_max_atoms)\n    aromaticity_vec = construct_aromaticity_vec(\n        mol, num_max_atoms=num_max_atoms)\n    if add_Hs:\n        num_hydrogens_vec = construct_num_hydrogens_vec(\n            mol, num_max_atoms=num_max_atoms)\n        feature = numpy.hstack((atom_type_vec, formal_charge_vec,\n                                partial_charge_vec, atom_ring_vec,\n                                hybridization_vec, hydrogen_bonding,\n                                aromaticity_vec, num_hydrogens_vec))\n    else:\n        feature = numpy.hstack((atom_type_vec, formal_charge_vec,\n                                partial_charge_vec, atom_ring_vec,\n                                hybridization_vec, hydrogen_bonding,\n                                aromaticity_vec))\n    return feature\n\n\n# --- Pair feature extraction ---\ndef 
construct_bond_vec(mol, i, j):\n    bond_feature_vec = numpy.zeros((4, ), dtype=numpy.float32)\n    k = mol.GetBondBetweenAtoms(i, j)\n    if k is not None:\n        bond_type = str(k.GetBondType())\n        if bond_type == 'SINGLE':\n            bond_feature_vec[0] = 1.0\n        elif bond_type == 'DOUBLE':\n            bond_feature_vec[1] = 1.0\n        elif bond_type == 'TRIPLE':\n            bond_feature_vec[2] = 1.0\n        elif bond_type == 'AROMATIC':\n            bond_feature_vec[3] = 1.0\n        else:\n            raise ValueError(\"Unknown bond type {}\".format(bond_type))\n    return bond_feature_vec\n\n\ndef construct_distance_vec(distance_matrix, i, j):\n    distance = min(MAX_DISTANCE, int(distance_matrix[i][j]))\n    distance_feature = numpy.zeros((MAX_DISTANCE, ), dtype=numpy.float32)\n    distance_feature[:distance] = 1.0\n    return distance_feature\n\n\ndef construct_ring_feature_vec(mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):\n    n_atom = mol.GetNumAtoms()\n    sssr = Chem.GetSymmSSSR(mol)\n    ring_feature_vec = numpy.zeros(\n        (num_max_atoms ** 2, 1,), dtype=numpy.float32)\n    for ring in sssr:\n        ring = list(ring)\n        n_atom_in_ring = len(ring)\n        for i in range(n_atom_in_ring):\n            for j in range(n_atom_in_ring):\n                a0 = ring[i]\n                a1 = ring[j]\n                ring_feature_vec[a0 * n_atom + a1] = 1\n    return ring_feature_vec\n\n\ndef construct_pair_feature(mol, num_max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS):\n    \"\"\"construct pair feature\n\n    Args:\n        mol (Mol): mol instance\n        num_max_atoms (int): number of max atoms\n\n    Returns (numpy.ndarray): 2 dimensional array. 
First axis size is\n        `num_max_atoms` ** 2, representing index of each atom pair.\n        Second axis for feature.\n\n    \"\"\"\n    n_atom = mol.GetNumAtoms()\n    distance_matrix = Chem.GetDistanceMatrix(mol)\n    distance_feature = numpy.zeros((num_max_atoms ** 2, MAX_DISTANCE,),\n                                   dtype=numpy.float32)\n    for i in range(n_atom):\n        for j in range(n_atom):\n            distance_feature[i * n_atom + j] = construct_distance_vec(\n                distance_matrix, i, j)\n    bond_feature = numpy.zeros((num_max_atoms ** 2, 4,), dtype=numpy.float32)\n    for i in range(n_atom):\n        for j in range(n_atom):\n            bond_feature[i * n_atom + j] = construct_bond_vec(mol, i, j)\n    ring_feature = construct_ring_feature_vec(mol, num_max_atoms=num_max_atoms)\n    feature = numpy.hstack((distance_feature, bond_feature, ring_feature))\n    return feature\n\n\nclass WeaveNetPreprocessor(MolPreprocessor):\n\n    \"\"\"WeaveNetPreprocessor\n\n     WeaveNet must have fixed-size atom list for now, zero_padding option\n     is always set to True.\n\n    Args:\n        max_atoms (int): Max number of atoms for each molecule, if the\n            number of atoms is more than this value, this data is simply\n            ignored.\n            Setting negative value indicates no limit for max atoms.\n        add_Hs (bool): If True, implicit Hs are added.\n        use_fixed_atom_feature (bool):\n            If True, atom feature is extracted used in original paper.\n            If it is False, atomic number is used instead.\n        atom_list (list): list of atoms to extract feature. 
If None, default\n            `ATOM` is used as `atom_list`\n        include_unknown_atom (bool): If False, when the `mol` includes atom\n            which is not in `atom_list`, it will raise\n            `MolFeatureExtractionError`.\n            If True, even the atom is not in `atom_list`, `atom_type` is set\n            as \"unknown\" atom.\n        kekulize (bool): If True, Kekulizes the molecule.\n    \"\"\"\n\n    def __init__(self, max_atoms=WEAVE_DEFAULT_NUM_MAX_ATOMS, add_Hs=True,\n                 use_fixed_atom_feature=False, atom_list=None,\n                 include_unknown_atom=False, kekulize=False):\n        super(WeaveNetPreprocessor, self).__init__(\n            add_Hs=add_Hs, kekulize=kekulize)\n        zero_padding = True\n        if zero_padding and max_atoms <= 0:\n            raise ValueError('max_atoms must be set to positive value when '\n                             'zero_padding is True')\n\n        self.max_atoms = max_atoms\n        self.add_Hs = add_Hs\n        self.zero_padding = zero_padding\n        self.use_fixed_atom_feature = use_fixed_atom_feature\n        self.atom_list = atom_list\n        self.include_unknown_atom = include_unknown_atom\n\n    def get_input_features(self, mol):\n        \"\"\"get input features for WeaveNet\n\n        WeaveNetPreprocessor automatically add `H` to `mol`\n\n        Args:\n            mol (Mol):\n\n        \"\"\"\n        type_check_num_atoms(mol, self.max_atoms)\n        if self.use_fixed_atom_feature:\n            # original paper feature extraction\n            atom_array = construct_atom_feature(mol, self.add_Hs,\n                                                self.max_atoms, self.atom_list,\n                                                self.include_unknown_atom)\n        else:\n            # embed id of atomic numbers\n            atom_array = construct_atomic_number_array(mol, self.max_atoms)\n        pair_feature = construct_pair_feature(mol,\n                                          
    num_max_atoms=self.max_atoms)\n        return atom_array, pair_feature\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/wle.py",
    "content": "import collections\n\nimport numpy as np\n\nfrom chainer_chemistry.dataset.preprocessors import wle_io\nfrom chainer_chemistry.dataset.preprocessors import wle_atom_array_update as wle_update\n\nDEBUG = False\n\ndef apply_wle_for_datasets(datasets, cutoff=0, k=1):\n    \"\"\"\n    Apply label Weisfeiler--Lehman Embedding for the tuple of datasets.\n\n    Args:\n        datasets: tuple of dataset (usually, train/val/test),\n                     each dataset consists of atom_array and\n                     adj_array and teach_signal\n        cutoff: int, if more than 0, the expanded labels\n                   whose freq <= cutoff will be removed.\n        k: int, the number of iterations of neighborhood\n              aggregation.\n\n    Returns:\n        - tuple of dataset (usually, train/val/test),\n               each dataest consists of atom_number_array and\n               adj_tensor with expanded labels\n        - list of all labels, used in the dataset parts.\n        - dictionary of label frequencies key:label valeu:frequency count\n    \"\"\"\n\n    atom_arrays, adj_arrays, teach_signals = wle_io.load_dataset_elements(datasets)\n\n    for _ in range(k):\n        atom_arrays, labels_frequencies = wle_update.update_atom_arrays(\n            atom_arrays, adj_arrays, cutoff)\n\n    datasets_expanded = wle_io.create_datasets(atom_arrays, adj_arrays, teach_signals)\n    expanded_labels = list(labels_frequencies.keys())\n    return tuple(datasets_expanded), expanded_labels, labels_frequencies\n\n\ndef apply_cwle_for_datasets(datasets, k=1):\n    \"\"\"\n    Apply Concatenated Weisfeiler--Lehman embedding for the tuple of datasets.\n    This also applicalbe for the Gated-sum Weisfeiler--Lehman embedding.\n\n    Args:\n        datasets: tuple of dataset (usually, train/val/test),\n                     each dataset consists of atom_array and\n                     adj_array and teach_signal\n        k: int, the number of iterations of neighborhood\n     
         aggregation.\n\n    Returns:\n        - tuple of dataset (usually, train/val/test),\n               each dataest consists of atom_number_array,\n               expanded_label_array, and adj_tensor\n        - list of all expanded labels, used in the dataset parts.\n        - dictionary of label frequencies key:label valeu:frequency count\n    \"\"\"\n\n    if k <= 0:\n        raise ValueError('Iterations should be a positive integer. '\n                         'Found k={}'.format(k))\n\n    atom_arrays, adj_arrays, teach_signals = wle_io.load_dataset_elements(datasets)\n\n    for i in range(k):\n        if i != k - 1:\n            atom_arrays, labels_frequencies = wle_update.update_atom_arrays(\n                atom_arrays, adj_arrays, 0)\n        else:\n            wle_arrays, labels_frequencies = wle_update.update_atom_arrays(\n                atom_arrays, adj_arrays, 0, False)\n\n    datasets_expanded = wle_io.create_datasets(\n        atom_arrays, adj_arrays, teach_signals, wle_arrays)\n    expanded_labels = list(labels_frequencies.keys())\n    return tuple(datasets_expanded), expanded_labels, labels_frequencies\n\ndef _findmaxidx(datasets, idx):\n    atom_data_size = len(datasets[0][0])\n    if atom_data_size <= idx:\n        raise ValueError(\n            'data index is out of index. 
'\n            'atom_data size={} <= idx={}'.format(\n                atom_data_size, idx))\n\n    max_idx = -1\n    for dataset in datasets:\n        for mol_data in dataset:\n            atom_array = mol_data[idx]\n            max_atom_idx = np.max(atom_array)\n            if max_atom_idx > max_idx:\n                max_idx = max_atom_idx\n\n    return max_idx + 1  # 0-origin\n\ndef findmaxidx(datasets, target='atom_label'):\n    \"\"\"\n    Retruns the maximum number of the symbol index in an atom array,\n    throughout the datasets.\n\n    Args:\n        datasets: dataset entity\n        target: choice of 'atom_label' of 'wle_label'\n\n    Returns:\n        _findmaxidx(datasets, 0/2)\n    \"\"\"\n\n    if target == 'atom_label':\n        return _findmaxidx(datasets, 0)\n    elif target == 'wle_label':\n        return _findmaxidx(datasets, 2)\n\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/wle_atom_array_update.py",
    "content": "import collections\n\nimport numpy as np\n\nfrom chainer_chemistry.dataset.preprocessors import wle_util\n\n\ndef update_atom_arrays(atom_arrays, adj_arrays, cutoff, with_focus_atom=True):\n    expanded_atom_lists, labels_frequencies = list_all_expanded_labels(\n            atom_arrays, adj_arrays, with_focus_atom)\n    if cutoff > 0:\n        expanded_atom_lists, labels_frequencies = shrink_expanded_labels(\n                expanded_atom_lists, labels_frequencies, cutoff)\n    expanded_labels = list(labels_frequencies.keys())\n    atom_arrays = [wle_util.to_index(l, expanded_labels)\n                        for l in expanded_atom_lists]\n    return atom_arrays, labels_frequencies\n\n\ndef shrink_expanded_labels(expanded_atom_lists,\n                           labels_frequencies,\n                           cutoff):\n    \"\"\"\n    Cut off the few-appearance expanded labels\n\n    Args:\n        expanded_atom_lists: tuple of list of expanded labels\n        labels_frequencies: list of label apperacne frequencies\n        cutoff: int, frequency cut of expanded labels\n\n    Returns:\n        - 3 (train/val/test) tuple of expanded atom arrays\n          (all nodes are associated with string representations of expanded signals)\n        - dictionary of frequencies all labels (key: label, value: frequency)\n    \"\"\"\n\n    # atom_array values are expanded label \"STRING\", not numbers\n    new_expanded_atom_lists = []\n    new_labels_frequencies = collections.defaultdict(lambda: 0)\n\n    # for each train/val/test, do\n    for set_expanded_atom_list in expanded_atom_lists:\n\n        # for each molecule sample, do\n        new_set_expanded_atom_list = []\n        for expanded_atom_list in set_expanded_atom_list:\n\n            mol_expanded_atom_list = []\n            # for each node i in the molecule,\n            # get the neighbor's atom label (number index)\n            for expanded_label in expanded_atom_list:\n                freq = 
labels_frequencies[expanded_label]\n\n                # check frequency here\n                if freq > cutoff:\n                    label = expanded_label\n                else:\n                    label = wle_util.get_focus_node_label(expanded_label)\n                mol_expanded_atom_list.append(label)\n                new_labels_frequencies[label] = new_labels_frequencies[label] + 1\n                # end cutoff-ifelse\n\n            # end i-for\n            new_set_expanded_atom_list.append(mol_expanded_atom_list)\n        # end zip(atom_arrays, adj_array)-for\n\n        new_expanded_atom_lists.append(new_set_expanded_atom_list)\n    # end zip(atom_arrays, adj_array)-for\n\n    return new_expanded_atom_lists, dict(new_labels_frequencies)\n\n\ndef list_all_expanded_labels(atom_arrays, adj_arrays, with_focus_atom=True):\n    \"\"\"\n    Exapnd all nodes into WLE representation. At the same time, return the list of all labels after expansion\n\n    Args:\n        atom_arrays: 3 (train/val/test) tuple of atom arrays\n        adj_arrays: 3 (train/val/test) tuple of adj.arrays\n        with_focus_atom: bool, if True, the expanded label starts from the original atom label (\"C-ON-OFN\")\n                                   if False, the exnapndd label do not include the original atom albel(\"-CN-OFN\")\n\n    Returns:\n        - 3 (train/val/test) tuple of expanded atom arrays\n             (all nodes are associated with string representations of expanded signals)\n        - list of all labels appeared in the expanded atom arrays.\n        - dictionary of frequencies all labels (key: label, value: frequency)\n\n    \"\"\"\n\n    expanded_atom_lists = []  # atom_array values are expanded label \"STRING\", not numbers\n    labels_frequencies = collections.defaultdict(lambda: 0)\n\n    # for each train/val/test, do\n    for set_atom_arrays, set_adj_arrays in zip(atom_arrays, adj_arrays):\n        # for each molecule sample, do\n        set_expanded_atom_list = []\n      
  for atom_array, adj_array in zip(set_atom_arrays, set_adj_arrays):\n            N = len(atom_array)  # number of molecules\n            # atom_array: N by F\n            # adj_array: N by N or N by R by N\n\n            # compress the relation axis\n            adj_array = wle_util.compress_relation_axis(adj_array)\n            assert adj_array.shape == (N, N)\n            # find neighbors\n            # array[0]: row index array[1]: column index\n            neighbors = np.nonzero(adj_array)\n\n            mol_expanded_atom_list = []\n            # for each node i in the molecule,\n            # get the neighbor's atom label (number index)\n            for i in range(N):\n                expanded_label = wle_util.get_neighbor_representation(\n                    i, atom_array, neighbors, with_focus_atom)\n                mol_expanded_atom_list.append(expanded_label)\n                labels_frequencies[expanded_label] = labels_frequencies[expanded_label] + 1\n            # end i-for\n            set_expanded_atom_list.append(mol_expanded_atom_list)\n        # end zip(atom_arrays, adj_array)-for\n\n        expanded_atom_lists.append(set_expanded_atom_list)\n    # end zip(atom_arrays, adj_array)-for\n\n    # Convert to a normal dictionary because\n    # we cannot pickle defaultdicts with lambdas.\n    return expanded_atom_lists, dict(labels_frequencies)\n"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/wle_io.py",
    "content": "import numpy as np\n\n\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\n\n\nDEBUG = False\n\n\ndef create_datasets(atom_arrays, adj_arrays, teach_signals, wle_arrays=None):\n    \"\"\"\n    Expand the atomic_num_arrays with the expanded labels,\n    then return valid datasets (tuple of NumpyTupleDataset)\n\n    Args:\n        atom_arrays: 3-tuple of list of lists.\n                        atom_arrays[i][j][k] is the id of an atom\n                        i: train/val/test\n                        j: index of a sample (i.e. molcule)\n                        k: index of an atom\n        adj_arrays: list of list of numpy.array, all mol's adjacnecy tensors\n        teach_signals: list of list of numpy.array,\n                          all teacher (supervision) signals\n        wle_arrays: None (for WLE) or 3-tuple of list of lists (for CWLE and GWLE).\n\n    Returns: 3 tuple of valid datasets (train/vel/test) in NumpyTuppleDataset\n\n    \"\"\"\n\n    output_datasets = []\n\n    # ToDo: try another indexing: e.g. orignal node label + extneions\n    assert len(atom_arrays) == len(adj_arrays) == len(teach_signals)\n    if wle_arrays is not None:\n        assert len(atom_arrays) == len(wle_arrays)\n    for i in range(len(atom_arrays)):\n        # We have swaped the axes 0 and 1 for adj-arrays. 
re-swap\n        set_adj_arrays = np.array(adj_arrays[i])\n        for m in range(len(set_adj_arrays)):\n            set_adj_arrays[m] = np.swapaxes(set_adj_arrays[m], 0, 1)\n\n        if wle_arrays is None:\n            dataset = NumpyTupleDataset(np.array(atom_arrays[i]),\n                                        set_adj_arrays,\n                                        np.array(teach_signals[i]))\n        else:\n            dataset = NumpyTupleDataset(np.array(atom_arrays[i]),\n                                        set_adj_arrays,\n                                        np.array(wle_arrays[i]),\n                                        np.array(teach_signals[i]))\n        output_datasets.append(dataset)\n    # end expanded-for\n\n    return output_datasets\n\n\ndef load_dataset_elements(datasets):\n    \"\"\"\n    Load all dataset tuples: atom array, adj. array, and teacher signals.\n\n    Args:\n        datasets: tuple of NumpyTupleDataset\n\n    Returns:\n        - tuple of lists of atom arrays, adj.arrays, and teacher signals.\n\n    \"\"\"\n\n    if DEBUG:\n        print('type(datasets)', type(datasets))  # tuple\n\n    atom_arrays = []  # 3 by num_mols by N by F\n    adj_arrays = []  # 3 by num_mols by N by N, or 3 by N by R by N by N by N\n    teach_signals = []  # 3 by num_mols by N by (data-dependent)\n    for dataset in datasets:\n\n        if DEBUG:\n            print('type(dataset)', type(dataset))  # NumpyTupleDataset\n\n        set_atom_arrays = []  # Mol by N\n        set_adj_arrays = []  # Mol by N by N or N by R by N by N\n        set_teach_signals = []  # Mol by (data-dependent)\n\n        for mol_data in dataset:\n\n            atom_array = mol_data[0]\n            adj_array = mol_data[1]\n            teach_signal = mol_data[2]\n\n            if DEBUG:\n                print(\"type(mol_data)=\", type(mol_data))  # tuple\n                print(\"type(atom_arrray)=\", type(atom_array))  # ndarray\n                print(\"type(adj_arrray)=\", 
type(adj_array))  # ndarray\n                print(\"type(teach_signal)=\", type(teach_signal))  # ndarray\n\n            set_atom_arrays.append(atom_array)\n\n            # for 3-D tensor, we swap axis here\n            set_adj_arrays.append(adj_array.swapaxes(0, 1))\n            set_teach_signals.append(teach_signal)\n        # end dataset-for\n\n        atom_arrays.append(set_atom_arrays)\n        adj_arrays.append(set_adj_arrays)\n        teach_signals.append(set_teach_signals)\n    # end datasets-for\n    return atom_arrays, adj_arrays, teach_signals"
  },
  {
    "path": "chainer_chemistry/dataset/preprocessors/wle_util.py",
    "content": "import numpy as np\n\n\nDEBUG = False\n\n\ndef _index(atom, values):\n    idx = values.index(atom)\n    if DEBUG:\n        print(\"idx=\", idx)\n        print(\"expanded_label=\", atom)\n    return idx\n\n\ndef to_index(mols, values):\n    return np.array([np.array([_index(atom, values) for atom in mol],\n                              dtype=np.int32)\n                     for mol in mols])\n\n\ndef compress_relation_axis(adj_array):\n    ndim = adj_array.ndim\n    if ndim == 2:\n        return adj_array\n    elif ndim == 3:\n        return np.sum(adj_array, axis=1, keepdims=False)\n    else:\n        raise ValueError(\n                'ndim of adjacency matrix should be 2 or 3. '\n                'Found ndim={}.'.format(ndim))\n\n\ndef _to_string(atom_label, neighbor_labels, with_focus_atom):\n    expanded_label = \".\".join(map(str, neighbor_labels))\n    if with_focus_atom:\n        expanded_label = str(atom_label) + \"-\" + expanded_label\n    if DEBUG:\n        print(\"expanded_label=\" + expanded_label)\n    return expanded_label\n\n\ndef get_neighbor_representation(idx, atom_array, neighbors, with_focus_atom):\n    atom_label = atom_array[idx]\n    neighbor = neighbors[1][np.where(neighbors[0] == idx)]\n    if DEBUG:\n        print(neighbor)\n        print(\"len(neighbor_i)=\", len(neighbor))\n    neighbor_labels = np.sort([atom_array[x] for x in neighbor if x != idx])\n    return _to_string(\n        atom_label, neighbor_labels, with_focus_atom)\n\n\ndef get_focus_node_label(expanded_label):\n    tokens = expanded_label.split('-')\n    if len(tokens) != 2:\n        raise ValueError(\n            'Invalid label={}'.format(expanded_label))\n    return tokens[0]\n"
  },
  {
    "path": "chainer_chemistry/dataset/splitters/__init__.py",
    "content": "from chainer_chemistry.dataset.splitters import base_splitter  # NOQA\nfrom chainer_chemistry.dataset.splitters import random_splitter  # NOQA\nfrom chainer_chemistry.dataset.splitters import scaffold_splitter  # NOQA\nfrom chainer_chemistry.dataset.splitters import deepchem_scaffold_splitter  # NOQA\nfrom chainer_chemistry.dataset.splitters import stratified_splitter  # NOQA\nfrom chainer_chemistry.dataset.splitters import time_splitter  # NOQA\n\nfrom chainer_chemistry.dataset.splitters.base_splitter import BaseSplitter  # NOQA\nfrom chainer_chemistry.dataset.splitters.random_splitter import RandomSplitter  # NOQA\nfrom chainer_chemistry.dataset.splitters.scaffold_splitter import ScaffoldSplitter  # NOQA\nfrom chainer_chemistry.dataset.splitters.deepchem_scaffold_splitter import DeepChemScaffoldSplitter  # NOQA\nfrom chainer_chemistry.dataset.splitters.stratified_splitter import StratifiedSplitter  # NOQA\nfrom chainer_chemistry.dataset.splitters.time_splitter import TimeSplitter  # NOQA\n\nsplit_method_dict = {\n    'random': RandomSplitter,\n    'stratified': StratifiedSplitter,\n    'scaffold': ScaffoldSplitter,\n    'dc_scaffold': DeepChemScaffoldSplitter,\n    'time': TimeSplitter,\n}\n"
  },
  {
    "path": "chainer_chemistry/dataset/splitters/base_splitter.py",
    "content": "from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\n\n\ndef converter_default(dataset, indices):\n    return dataset[indices]\n\n\ndef converter_numpy_tuple_dataset(dataset, indices):\n    return NumpyTupleDataset(*dataset.features[indices])\n\n\nconverter_dict = {\n    NumpyTupleDataset: converter_numpy_tuple_dataset\n}\n\n\nclass BaseSplitter(object):\n    def k_fold_split(self, dataset, k):\n        raise NotImplementedError\n\n    def _split(self, dataset, **kwargs):\n        raise NotImplementedError\n\n    def train_valid_test_split(self, dataset, frac_train=0.8, frac_valid=0.1,\n                               frac_test=0.1, converter=None,\n                               return_index=True, **kwargs):\n\n        if converter is None:\n            converter = converter_dict.get(type(dataset), converter_default)\n\n        train_inds, valid_inds, test_inds = self._split(dataset, frac_train,\n                                                        frac_valid, frac_test,\n                                                        **kwargs)\n\n        if return_index:\n            return train_inds, valid_inds, test_inds\n        else:\n            train = converter(dataset, train_inds)\n            valid = converter(dataset, valid_inds)\n            test = converter(dataset, test_inds)\n            return train, valid, test,\n\n    def train_valid_split(self, dataset, frac_train=0.9, frac_valid=0.1,\n                          converter=None, return_index=True, **kwargs):\n\n        train_inds, valid_inds, test_inds = self._split(dataset, frac_train,\n                                                        frac_valid, 0.,\n                                                        **kwargs)\n        assert len(test_inds) == 0\n\n        if converter is None:\n            converter = converter_dict.get(type(dataset), converter_default)\n\n        if return_index:\n            return train_inds, valid_inds\n        else:\n         
   train = converter(dataset, train_inds)\n            valid = converter(dataset, valid_inds)\n            return train, valid,\n"
  },
  {
    "path": "chainer_chemistry/dataset/splitters/deepchem_scaffold_splitter.py",
    "content": "from collections import defaultdict\n\nimport numpy\nfrom rdkit import Chem\nfrom rdkit.Chem.Scaffolds import MurckoScaffold\n\nfrom chainer_chemistry.dataset.splitters.base_splitter import BaseSplitter\n\n\ndef generate_scaffold(smiles, include_chirality=False):\n    \"\"\"return scaffold string of target molecule\"\"\"\n    mol = Chem.MolFromSmiles(smiles)\n    scaffold = MurckoScaffold\\\n        .MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)\n    return scaffold\n\n\nclass DeepChemScaffoldSplitter(BaseSplitter):\n    \"\"\"Class for doing data splits by chemical scaffold.\n\n    Referred Deepchem for the implementation,  https://github.com/deepchem/deepchem/blob/master/deepchem/splits/splitters.py\n    \"\"\"\n    def _split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1,\n               **kwargs):\n        print(\"Using DeepChem Scaffold\")\n        numpy.testing.assert_almost_equal(frac_train + frac_valid + frac_test,\n                                          1.)\n        seed = kwargs.get('seed', None)\n        smiles_list = kwargs.get('smiles_list')\n        include_chirality = kwargs.get('include_chirality')\n        if len(dataset) != len(smiles_list):\n            raise ValueError(\"The lengths of dataset and smiles_list are \"\n                             \"different\")\n\n        rng = numpy.random.RandomState(seed)\n\n        scaffolds = {}\n\n        data_len = len(dataset)\n        for ind, smiles in enumerate(smiles_list):\n            scaffold = generate_scaffold(smiles, include_chirality)\n            if scaffold not in scaffolds:\n                scaffolds[scaffold] = [ind]\n            else:\n                scaffolds[scaffold].append(ind)\n\n        # Sort from largest to smallest scaffold sets\n        scaffolds = {key: sorted(value) for key, value in scaffolds.items()}\n        scaffold_sets = [ scaffold_set for (scaffold, scaffold_set) in sorted(scaffolds.items(), key=lambda x: (len(x[1]), 
x[1][0]), reverse=True) ]\n        train_cutoff = frac_train * len(dataset)\n        valid_cutoff = (frac_train + frac_valid) * len(dataset)\n        train_inds, valid_inds, test_inds = [], [], []\n\n        for scaffold_set in scaffold_sets:\n            if len(train_inds) + len(scaffold_set) > train_cutoff:\n                if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:\n                    test_inds += scaffold_set\n                else:\n                    valid_inds += scaffold_set\n            else:\n                train_inds += scaffold_set\n\n        return numpy.array(train_inds), numpy.array(valid_inds),\\\n            numpy.array(test_inds),\\\n\n\n    def train_valid_test_split(self, dataset, smiles_list, frac_train=0.8,\n                               frac_valid=0.1, frac_test=0.1, converter=None,\n                               return_index=True, seed=None,\n                               include_chirality=False, **kwargs):\n        \"\"\"Split dataset into train, valid and test set.\n\n        Split indices are generated by splitting based on the scaffold of small\n        molecules.\n\n        Args:\n            dataset(NumpyTupleDataset, numpy.ndarray):\n                Dataset.\n            smiles_list(list):\n                SMILES list corresponding to datset.\n            seed (int):\n                Random seed.\n            frac_train(float):\n                Fraction of dataset put into training data.\n            frac_valid(float):\n                Fraction of dataset put into validation data.\n            converter(callable):\n            return_index(bool):\n                If `True`, this function returns only indices. 
If `False`, this\n                function returns splitted dataset.\n\n        Returns:\n            SplittedDataset(tuple): splitted dataset or indices\n\n        \"\"\"\n        return super(DeepChemScaffoldSplitter, self)\\\n            .train_valid_test_split(dataset, frac_train, frac_valid, frac_test,\n                                    converter, return_index, seed=seed,\n                                    smiles_list=smiles_list,\n                                    include_chirality=include_chirality,\n                                    **kwargs)\n\n    def train_valid_split(self, dataset, smiles_list, frac_train=0.9,\n                          frac_valid=0.1, converter=None, return_index=True,\n                          seed=None, include_chirality=False, **kwargs):\n        \"\"\"Split dataset into train and valid set.\n\n        Split indices are generated by splitting based on the scaffold of small\n        molecules.\n\n        Args:\n            dataset(NumpyTupleDataset, numpy.ndarray):\n                Dataset.\n            smiles_list(list):\n                SMILES list corresponding to datset.\n            seed (int):\n                Random seed.\n            frac_train(float):\n                Fraction of dataset put into training data.\n            frac_valid(float):\n                Fraction of dataset put into validation data.\n            converter(callable):\n            return_index(bool):\n                If `True`, this function returns only indices. 
If `False`, this\n                function returns splitted dataset.\n\n        Returns:\n            SplittedDataset(tuple): splitted dataset or indices\n\n        \"\"\"\n        return super(DeepChemScaffoldSplitter, self)\\\n            .train_valid_split(dataset, frac_train, frac_valid, converter,\n                               return_index, seed=seed,\n                               smiles_list=smiles_list,\n                               include_chirality=include_chirality, **kwargs)\n"
  },
  {
    "path": "chainer_chemistry/dataset/splitters/random_splitter.py",
    "content": "import numpy\n\nfrom chainer_chemistry.dataset.splitters.base_splitter import BaseSplitter\n\n\nclass RandomSplitter(BaseSplitter):\n    \"\"\"Class for doing random data splits.\"\"\"\n    def _split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1,\n               **kwargs):\n        seed = kwargs.get('seed')\n        numpy.testing.assert_almost_equal(frac_train + frac_valid + frac_test,\n                                          1.)\n\n        if seed is not None:\n            perm = numpy.random.RandomState(seed).permutation(len(dataset))\n        else:\n            perm = numpy.random.permutation(len(dataset))\n        train_data_size = int(len(dataset) * frac_train)\n        valid_data_size = int(len(dataset) * frac_valid)\n        return (perm[:train_data_size],\n                perm[train_data_size:train_data_size + valid_data_size],\n                perm[train_data_size + valid_data_size:])\n\n    def train_valid_test_split(self, dataset, frac_train=0.8, frac_valid=0.1,\n                               frac_test=0.1, converter=None,\n                               return_index=True, seed=None, **kwargs):\n        \"\"\"Generate indices to split data into train, valid and test set.\n\n        Args:\n            dataset(NumpyTupleDataset, numpy.ndarray):\n                Dataset.\n            seed (int):\n                Random seed.\n            frac_train(float):\n                Fraction of dataset put into training data.\n            frac_valid(float):\n                Fraction of dataset put into validation data.\n            frac_test(float):\n                Fraction of dataset put into test data.\n            converter(callable):\n            return_index(bool):\n                If `True`, this function returns only indexes. If `False`, this\n                function returns splitted dataset.\n\n        Returns:\n            SplittedDataset(tuple):\n                splitted dataset or indexes\n\n        .. 
admonition:: Example\n            >>> from chainer_chemistry.datasets import NumpyTupleDataset\n            >>> from chainer_chemistry.dataset.splitters import RandomSplitter\n            >>> a = numpy.random.random((10, 10))\n            >>> b = numpy.random.random((10, 8))\n            >>> c = numpy.random.random((10, 1))\n            >>> d = NumpyTupleDataset(a, b, c)\n            >>> splitter = RandomSplitter()\n            >>> train, valid, test =\n                    splitter.train_valid_test_split(dataset,\n                                                    return_index=False)\n            >>> print(len(train), len(valid), len(test))\n            8, 1, 1\n\n        \"\"\"\n        return super(RandomSplitter, self).train_valid_test_split(dataset,\n                                                                  frac_train,\n                                                                  frac_valid,\n                                                                  frac_test,\n                                                                  converter,\n                                                                  return_index,\n                                                                  seed=seed,\n                                                                  **kwargs)\n\n    def train_valid_split(self, dataset, frac_train=0.9, frac_valid=0.1,\n                          converter=None, return_index=True, seed=None,\n                          **kwargs):\n        \"\"\"Generate indices to split data into train and valid set.\n\n        Args:\n            dataset(NumpyTupleDataset, numpy.ndarray):\n                Dataset.\n            seed (int):\n                Random seed.\n            frac_train(float):\n                Fraction of dataset put into training data.\n            frac_valid(float):\n                Fraction of dataset put into validation data.\n            converter(callable):\n            return_index(bool):\n                
If `True`, this function returns only indexes. If `False`, this\n                function returns splitted dataset.\n\n        Returns:\n            SplittedDataset(tuple):\n                splitted dataset or indexes\n\n        .. admonition:: Example\n            >>> from chainer_chemistry.datasets import NumpyTupleDataset\n            >>> from chainer_chemistry.dataset.splitters import RandomSplitter\n            >>> a = numpy.random.random((10, 10))\n            >>> b = numpy.random.random((10, 8))\n            >>> c = numpy.random.random((10, 1))\n            >>> d = NumpyTupleDataset(a, b, c)\n            >>> splitter = RandomSplitter()\n            >>> train, valid =\n                    splitter.train_valid_split(dataset, return_index=False)\n            >>> print(len(train), len(valid))\n            9, 1\n\n        \"\"\"\n        return super(RandomSplitter, self).train_valid_split(dataset,\n                                                             frac_train,\n                                                             frac_valid,\n                                                             converter,\n                                                             return_index,\n                                                             seed=seed,\n                                                             **kwargs)\n"
  },
  {
    "path": "chainer_chemistry/dataset/splitters/scaffold_splitter.py",
    "content": "from collections import defaultdict\n\nimport numpy\nfrom rdkit import Chem\nfrom rdkit.Chem.Scaffolds import MurckoScaffold\n\nfrom chainer_chemistry.dataset.splitters.base_splitter import BaseSplitter\n\n\ndef generate_scaffold(smiles, include_chirality=False):\n    \"\"\"return scaffold string of target molecule\"\"\"\n    mol = Chem.MolFromSmiles(smiles)\n    scaffold = MurckoScaffold\\\n        .MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)\n    return scaffold\n\n\nclass ScaffoldSplitter(BaseSplitter):\n    \"\"\"Class for doing data splits by chemical scaffold.\n\n    Referred Deepchem for the implementation, https://git.io/fXzF4\n    \"\"\"\n    def _split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1,\n               **kwargs):\n        numpy.testing.assert_almost_equal(frac_train + frac_valid + frac_test,\n                                          1.)\n        seed = kwargs.get('seed', None)\n        smiles_list = kwargs.get('smiles_list')\n        include_chirality = kwargs.get('include_chirality')\n        if len(dataset) != len(smiles_list):\n            raise ValueError(\"The lengths of dataset and smiles_list are \"\n                             \"different\")\n\n        rng = numpy.random.RandomState(seed)\n\n        scaffolds = defaultdict(list)\n        for ind, smiles in enumerate(smiles_list):\n            scaffold = generate_scaffold(smiles, include_chirality)\n            scaffolds[scaffold].append(ind)\n\n        scaffold_sets = rng.permutation(list(scaffolds.values()))\n\n        n_total_valid = int(numpy.floor(frac_valid * len(dataset)))\n        n_total_test = int(numpy.floor(frac_test * len(dataset)))\n\n        train_index = []\n        valid_index = []\n        test_index = []\n\n        for scaffold_set in scaffold_sets:\n            if len(valid_index) + len(scaffold_set) <= n_total_valid:\n                valid_index.extend(scaffold_set)\n            elif len(test_index) + 
len(scaffold_set) <= n_total_test:\n                test_index.extend(scaffold_set)\n            else:\n                train_index.extend(scaffold_set)\n\n        return numpy.array(train_index), numpy.array(valid_index),\\\n            numpy.array(test_index),\\\n\n\n    def train_valid_test_split(self, dataset, smiles_list, frac_train=0.8,\n                               frac_valid=0.1, frac_test=0.1, converter=None,\n                               return_index=True, seed=None,\n                               include_chirality=False, **kwargs):\n        \"\"\"Split dataset into train, valid and test set.\n\n        Split indices are generated by splitting based on the scaffold of small\n        molecules.\n\n        Args:\n            dataset(NumpyTupleDataset, numpy.ndarray):\n                Dataset.\n            smiles_list(list):\n                SMILES list corresponding to datset.\n            seed (int):\n                Random seed.\n            frac_train(float):\n                Fraction of dataset put into training data.\n            frac_valid(float):\n                Fraction of dataset put into validation data.\n            converter(callable):\n            return_index(bool):\n                If `True`, this function returns only indices. 
If `False`, this\n                function returns splitted dataset.\n\n        Returns:\n            SplittedDataset(tuple): splitted dataset or indices\n\n        \"\"\"\n        return super(ScaffoldSplitter, self)\\\n            .train_valid_test_split(dataset, frac_train, frac_valid, frac_test,\n                                    converter, return_index, seed=seed,\n                                    smiles_list=smiles_list,\n                                    include_chirality=include_chirality,\n                                    **kwargs)\n\n    def train_valid_split(self, dataset, smiles_list, frac_train=0.9,\n                          frac_valid=0.1, converter=None, return_index=True,\n                          seed=None, include_chirality=False, **kwargs):\n        \"\"\"Split dataset into train and valid set.\n\n        Split indices are generated by splitting based on the scaffold of small\n        molecules.\n\n        Args:\n            dataset(NumpyTupleDataset, numpy.ndarray):\n                Dataset.\n            smiles_list(list):\n                SMILES list corresponding to datset.\n            seed (int):\n                Random seed.\n            frac_train(float):\n                Fraction of dataset put into training data.\n            frac_valid(float):\n                Fraction of dataset put into validation data.\n            converter(callable):\n            return_index(bool):\n                If `True`, this function returns only indices. If `False`, this\n                function returns splitted dataset.\n\n        Returns:\n            SplittedDataset(tuple): splitted dataset or indices\n\n        \"\"\"\n        return super(ScaffoldSplitter, self)\\\n            .train_valid_split(dataset, frac_train, frac_valid, converter,\n                               return_index, seed=seed,\n                               smiles_list=smiles_list,\n                               include_chirality=include_chirality, **kwargs)\n"
  },
  {
    "path": "chainer_chemistry/dataset/splitters/stratified_splitter.py",
    "content": "import numpy\nimport pandas\n\nfrom chainer_chemistry.dataset.splitters.base_splitter import BaseSplitter\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\n\n\ndef _approximate_mode(class_counts, n_draws):\n    \"\"\"Referred scikit-learn, https://git.io/fPMmB\"\"\"\n    n_class = len(class_counts)\n    continuous = class_counts * n_draws / class_counts.sum()\n    floored = numpy.floor(continuous)\n    assert n_draws // n_class == floored.sum() // n_class\n    n_remainder = int(n_draws - floored.sum())\n    remainder = continuous - floored\n    inds = numpy.argsort(remainder)[::-1]\n    inds = inds[:n_remainder]\n    floored[inds] += 1\n    assert n_draws == floored.sum()\n    return floored.astype(numpy.int)\n\n\nclass StratifiedSplitter(BaseSplitter):\n    \"\"\"Class for doing stratified data splits.\"\"\"\n\n    def _split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1,\n               labels=None, **kwargs):\n        numpy.testing.assert_almost_equal(frac_train + frac_valid + frac_test,\n                                          1.)\n\n        seed = kwargs.get('seed', None)\n        label_axis = kwargs.get('label_axis', -1)\n        task_index = kwargs.get('task_index', 0)\n        n_bin = kwargs.get('n_bin', 10)\n        task_type = kwargs.get('task_type', 'auto')\n        if task_type not in ['classification', 'regression', 'auto']:\n            raise ValueError(\"{} is invalid. 
Please use 'classification',\"\n                             \"'regression' or 'auto'\".format(task_type))\n\n        rng = numpy.random.RandomState(seed)\n\n        if isinstance(labels, list):\n            labels = numpy.array(labels)\n        elif labels is None:\n            if not isinstance(dataset, NumpyTupleDataset):\n                raise ValueError(\"Please assign label dataset.\")\n            labels = dataset.features[:, label_axis]\n\n        if labels.ndim == 1:\n            labels = labels\n        else:\n            labels = labels[:, task_index]\n\n        if task_type == 'auto':\n            if labels.dtype.kind == 'i':\n                task_type = 'classification'\n            elif labels.dtype.kind == 'f':\n                task_type = 'regression'\n            else:\n                raise ValueError\n\n        if task_type == 'classification':\n            classes, labels = numpy.unique(labels, return_inverse=True)\n        elif task_type == 'regression':\n            classes = numpy.arange(n_bin)\n            labels = pandas.qcut(labels, n_bin, labels=False)\n        else:\n            raise ValueError\n\n        n_classes = classes.shape[0]\n        n_total_valid = int(numpy.floor(frac_valid * len(dataset)))\n        n_total_test = int(numpy.floor(frac_test * len(dataset)))\n\n        class_counts = numpy.bincount(labels)\n        class_indices = numpy.split(numpy.argsort(labels,\n                                                  kind='mergesort'),\n                                    numpy.cumsum(class_counts)[:-1])\n\n        # n_total_train is the remainder: n - n_total_valid - n_total_test\n        n_valid_samples = _approximate_mode(class_counts, n_total_valid)\n        class_counts = class_counts - n_valid_samples\n        n_test_samples = _approximate_mode(class_counts, n_total_test)\n\n        train_index = []\n        valid_index = []\n        test_index = []\n\n        for i in range(n_classes):\n            n_valid = 
n_valid_samples[i]\n            n_test = n_test_samples[i]\n\n            perm = rng.permutation(len(class_indices[i]))\n            class_perm_index = class_indices[i][perm]\n\n            class_valid_index = class_perm_index[:n_valid]\n            class_test_index = class_perm_index[n_valid:n_valid+n_test]\n            class_train_index = class_perm_index[n_valid+n_test:]\n\n            train_index.extend(class_train_index)\n            valid_index.extend(class_valid_index)\n            test_index.extend(class_test_index)\n\n        assert n_total_valid == len(valid_index)\n        assert n_total_test == len(test_index)\n\n        return numpy.array(train_index), numpy.array(valid_index),\\\n            numpy.array(test_index),\n\n    def train_valid_test_split(self, dataset, labels=None, label_axis=-1,\n                               task_index=0, frac_train=0.8, frac_valid=0.1,\n                               frac_test=0.1, converter=None,\n                               return_index=True, seed=None, task_type='auto',\n                               n_bin=10, **kwargs):\n        \"\"\"Split dataset into train, valid and test set.\n\n        Split indices are generated by stratified splitting of labels.\n\n        Args:\n            dataset(NumpyTupleDataset, numpy.ndarray):\n                Dataset.\n            labels(numpy.ndarray):\n                Target label. 
If `None`, this function assumes that dataset is\n                an instance of `NumpyTupleDataset`.\n            labels_axis(int):\n                Dataset feature axis in NumpyTupleDataset.\n            task_index(int):\n                Target task index in dataset for stratification.\n            seed (int):\n                Random seed.\n            frac_train(float):\n                Fraction of dataset put into training data.\n            frac_valid(float):\n                Fraction of dataset put into validation data.\n            return_index(bool):\n                If `True`, this function returns only indexes. If `False`, this\n                function returns splitted dataset.\n\n        Returns:\n            SplittedDataset(tuple):\n                splitted dataset or indexes\n\n        .. admonition:: Example\n            >>> from chainer_chemistry.datasets import NumpyTupleDataset\n            >>> from chainer_chemistry.dataset.splitters import StratifiedSplitter  # NOQA\n            >>>\n            >>> a = numpy.random.random((10, 10))\n            >>> b = numpy.random.random((10, 8))\n            >>> c = numpy.random.random((10, 1))\n            >>> d = NumpyTupleDataset(a, b, c)\n            >>> splitter = StratifiedSplitter()\n            >>> train, valid, test =\n                splitter.train_valid_test_split(dataset, return_index=False)\n            >>> print(len(train), len(valid))\n            8, 1, 1\n        \"\"\"\n\n        return super(StratifiedSplitter, self)\\\n            .train_valid_test_split(dataset, frac_train, frac_valid, frac_test,\n                                    converter, return_index, seed=seed,\n                                    label_axis=label_axis, task_type=task_type,\n                                    task_index=task_index, n_bin=n_bin,\n                                    labels=labels, **kwargs)\n\n    def train_valid_split(self, dataset, labels=None, label_axis=-1,\n                          
task_index=0, frac_train=0.9, frac_valid=0.1,\n                          converter=None, return_index=True, seed=None,\n                          task_type='auto', n_bin=10, **kwargs):\n        \"\"\"Split dataset into train and valid set.\n\n        Split indices are generated by stratified splitting of labels.\n\n        Args:\n            dataset(NumpyTupleDataset, numpy.ndarray):\n                Dataset.\n            labels(numpy.ndarray):\n                Target label. If `None`, this function assumes that dataset is\n                an instance of `NumpyTupleDataset`.\n            labels_axis(int):\n                Dataset feature axis in NumpyTupleDataset.\n            task_index(int):\n                Target task index in dataset for stratification.\n            seed (int):\n                Random seed.\n            frac_train(float):\n                Fraction of dataset put into training data.\n            frac_valid(float):\n                Fraction of dataset put into validation data.\n            return_index(bool):\n                If `True`, this function returns only indexes. If `False`, this\n                function returns splitted dataset.\n\n        Returns:\n            SplittedDataset(tuple):\n                splitted dataset or indexes\n\n        .. 
admonition:: Example\n            >>> from chainer_chemistry.datasets import NumpyTupleDataset\n            >>> from chainer_chemistry.dataset.splitters \\\n            >>>                                 import StratifiedSplitter\n            >>> a = numpy.random.random((10, 10))\n            >>> b = numpy.random.random((10, 8))\n            >>> c = numpy.random.random((10, 1))\n            >>> d = NumpyTupleDataset(a, b, c)\n            >>> splitter = StratifiedSplitter()\n            >>> train, valid =\n                    splitter.train_valid_split(dataset, return_index=False)\n            >>> print(len(train), len(valid))\n            9, 1\n        \"\"\"\n\n        return super(StratifiedSplitter, self)\\\n            .train_valid_split(dataset, frac_train, frac_valid, converter,\n                               return_index, seed=seed, label_axis=label_axis,\n                               task_type=task_type, task_index=task_index,\n                               n_bin=n_bin, labels=labels, **kwargs)\n"
  },
  {
    "path": "chainer_chemistry/dataset/splitters/time_splitter.py",
    "content": "import numpy\n\nfrom chainer_chemistry.dataset.splitters.base_splitter import BaseSplitter\n\n\nclass TimeSplitter(BaseSplitter):\n    \"\"\"Class for doing time order splits.\"\"\"\n\n    def _split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1,\n               **kwargs):\n        numpy.testing.assert_almost_equal(\n            frac_train + frac_valid + frac_test, 1.)\n\n        time_list = kwargs.get('time_list')\n\n        train_cutoff = int(frac_train * len(dataset))\n        valid_cutoff = int((frac_train + frac_valid) * len(dataset))\n\n        index = [idx for idx, _ in sorted(\n            enumerate(time_list), key=lambda x: x[1])][:len(dataset)]\n\n        train_index = index[:train_cutoff]\n        valid_index = index[train_cutoff:valid_cutoff]\n        test_index = index[valid_cutoff:]\n\n        return numpy.array(train_index), numpy.array(valid_index), \\\n            numpy.array(test_index)\n\n    def train_valid_test_split(self, dataset, time_list=None, frac_train=0.8,\n                               frac_valid=0.1, frac_test=0.1, converter=None,\n                               return_index=True, **kwargs):\n        \"\"\"Split dataset into train, valid and test set.\n\n        Split indices are generated by splitting based on time order.\n\n        Args:\n            dataset(NumpyTupleDataset, numpy.ndarray):\n                Dataset.\n            time_list(list):\n                Time list corresponding to dataset.\n            frac_train(float):\n                Fraction of dataset put into training data.\n            frac_valid(float):\n                Fraction of dataset put into validation data.\n            frac_test(float):\n                Fraction of dataset put into test data.\n            converter(callable):\n            return_index(bool):\n                If `True`, this function returns only indexes. 
If `False`, this\n                function returns splitted dataset.\n\n        Returns:\n            SplittedDataset(tuple): splitted dataset or indices\n\n        .. admonition:: Example\n            >>> from chainer_chemistry.datasets import NumpyTupleDataset\n            >>> from chainer_chemistry.dataset.splitters import TimeSplitter\n            >>> a = numpy.random.random((10, 10))\n            >>> b = numpy.random.random((10, 8))\n            >>> c = numpy.random.random((10, 1))\n            >>> d = NumpyTupleDataset(a, b, c)\n            >>> splitter = TimeSplitter()\n            >>> train, valid, test =\n                    splitter.train_valid_test_split(dataset,\n                                                    return_index=False)\n            >>> print(len(train), len(valid))\n            8, 1, 1\n        \"\"\"\n        return super(TimeSplitter, self).train_valid_test_split(\n            dataset, frac_train, frac_valid, frac_test, converter,\n            return_index, time_list=time_list, **kwargs)\n\n    def train_valid_split(self, dataset, time_list=None, frac_train=0.9,\n                          frac_valid=0.1, converter=None, return_index=True,\n                          **kwargs):\n        \"\"\"Split dataset into train and valid set.\n\n        Split indices are generated by splitting based on time order.\n\n        Args:\n            dataset(NumpyTupleDataset, numpy.ndarray):\n                Dataset.\n            time_list(list):\n                Time list corresponding to dataset.\n            frac_train(float):\n                Fraction of dataset put into training data.\n            frac_valid(float):\n                Fraction of dataset put into validation data.\n            converter(callable):\n            return_index(bool):\n                If `True`, this function returns only indexes. 
If `False`, this\n                function returns splitted dataset.\n\n        Returns:\n            SplittedDataset(tuple):\n                splitted dataset or indexes\n\n        .. admonition:: Example\n            >>> from chainer_chemistry.datasets import NumpyTupleDataset\n            >>> from chainer_chemistry.dataset.splitters import TimeSplitter\n            >>> a = numpy.random.random((10, 10))\n            >>> b = numpy.random.random((10, 8))\n            >>> c = numpy.random.random((10, 1))\n            >>> d = NumpyTupleDataset(a, b, c)\n            >>> splitter = TimeSplitter()\n            >>> train, valid =\n                    splitter.train_valid_split(dataset, return_index=False)\n            >>> print(len(train), len(valid))\n            9, 1\n        \"\"\"\n        return super(TimeSplitter, self).train_valid_split(\n            dataset, frac_train, frac_valid, converter, return_index,\n            time_list=time_list, **kwargs)\n"
  },
  {
    "path": "chainer_chemistry/dataset/utils.py",
    "content": "import numpy\n\n\nclass GaussianDistance(object):\n    \"\"\"Expand distance with Gaussian basis sit at centers and with width 0.5.\n\n    Args:\n        centers (numpy.ndarray): 1 dimensional array.\n            The positions of the center of the peak in a gaussian function.\n        width (float): Normal distribution in a gaussian function.\n    \"\"\"\n\n    def __init__(self, centers=None, width=0.5):\n        if centers is None:\n            centers = numpy.linspace(0, 4, 20)\n\n        self.centers = centers\n        self.width = width\n\n    def expand(self, d):\n        \"\"\"Expand distance with given parameters.\n\n        Args:\n            d (float): distance\n\n        Returns\n            expanded_distance (numpy.1darray):\n                M dimension with M the length of centers\n        \"\"\"\n        return numpy.exp(-(d-self.centers)**2 / self.width**2,\n                         dtype=numpy.float32)\n\n    def expand_from_distances(self, distances):\n        \"\"\"Expand distances with given parameters.\n\n        The original implemantation is below.\n        https://github.com/txie-93/cgcnn/blob/fdcd7eec8771e223e60e1b0abf7e6c7bc7d006bf/cgcnn/data.py#L152\n\n        Args:\n            distances (numpy.ndarray): 1 dimensional array.\n\n        Returns\n            expanded_distances (numpy.ndarray): 2 dimensional array.\n                First axis size is the number of distance,\n                Second axis size is M dimension with M the length of centers\n        \"\"\"\n        return numpy.exp(-(distances[..., numpy.newaxis] - self.centers)**2\n                         / self.width**2, dtype=numpy.float32)\n"
  },
  {
    "path": "chainer_chemistry/datasets/__init__.py",
    "content": "from chainer_chemistry.datasets import molnet  # NOQA\nfrom chainer_chemistry.datasets import qm9  # NOQA\nfrom chainer_chemistry.datasets import tox21  # NOQA\nfrom chainer_chemistry.datasets import zinc  # NOQA\n\n\n# import class and function\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset  # NOQA\nfrom chainer_chemistry.datasets.qm9 import get_qm9  # NOQA\nfrom chainer_chemistry.datasets.qm9 import get_qm9_filepath  # NOQA\nfrom chainer_chemistry.datasets.qm9 import get_qm9_label_names  # NOQA\nfrom chainer_chemistry.datasets.tox21 import get_tox21  # NOQA\nfrom chainer_chemistry.datasets.tox21 import get_tox21_filepath  # NOQA\nfrom chainer_chemistry.datasets.tox21 import get_tox21_label_names  # NOQA\nfrom chainer_chemistry.datasets.zinc import get_zinc250k  # NOQA\nfrom chainer_chemistry.datasets.zinc import get_zinc250k_filepath  # NOQA\nfrom chainer_chemistry.datasets.zinc import get_zinc250k_label_names  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/datasets/citation_network/citation.py",
    "content": "import os\n\nimport networkx as nx\nimport numpy\nfrom tqdm import tqdm\n\n\ndef citation_to_networkx(dirpath, name):\n    G = nx.Graph()\n    # node feature, node label\n    with open(os.path.join(dirpath, \"{}.content\".format(name))) as f:\n        lines = f.readlines()\n        compressor = {}\n        acc = 0\n        for line in tqdm(lines):\n            lis = line.split()\n            key, val = lis[0], lis[-1]\n            if val in compressor:\n                val = compressor[val]\n            else:\n                compressor[val] = acc\n                val = acc\n                acc += 1\n            G.add_node(key,\n                       x=numpy.array(lis[1:-1], dtype=numpy.float32),\n                       y=val)\n        G.graph['label_num'] = acc\n\n    # edge\n    with open(os.path.join(dirpath, \"{}.cites\".format(name))) as f:\n        lines = f.readlines()\n        for line in tqdm(lines):\n            u, v = line.split()\n            if u not in G.nodes.keys():\n                print(\"Warning: {} does not appear in {}{}.content\".format(\n                    u, dirpath, name))\n            elif v not in G.nodes.keys():\n                print(\"Warning: {} does not appear in {}{}.content\".format(\n                    v, dirpath, name))\n            else:\n                G.add_edge(u, v)\n\n    G = nx.convert_node_labels_to_integers(G)\n    print(\"Finished loading graph: {}\".format(dirpath))\n    print(\"number of nodes: {}, number of edges: {}\".format(\n        G.number_of_nodes(), G.number_of_edges()\n    ))\n    return G\n"
  },
  {
    "path": "chainer_chemistry/datasets/citation_network/citeseer.py",
    "content": "from logging import getLogger\nimport os\nimport tarfile\nfrom typing import List, Tuple  # NOQA\n\nfrom chainer.dataset import download\n\ndownload_url = 'https://linqs-data.soe.ucsc.edu/public/lbc/citeseer.tgz'\nfeat_file_name = 'citeseer.content'\nedge_file_name = 'citeseer.cites'\n\n_root = 'pfnet/chainer/citeseer'\n\n_label_names = ['Agents', 'AI', 'DB', 'IR', 'ML', 'HCI']\n\n\ndef get_citeseer_label_names():\n    # type: () -> List[str]\n    \"\"\"Return label names of Cora dataset.\"\"\"\n    return _label_names\n\n\ndef get_citeseer_dirpath(download_if_not_exist=True):\n    # type: (bool) -> str\n    \"\"\"Construct a dirpath which stores citeseer dataset.\n\n    This method check whether the file exist or not, and downloaded it\n    if necessary.\n\n    Args:\n        download_if_not_exist (bool): If ``True``, download dataset\n            if it is not downloaded yet.\n\n    Returns:\n        dirpath (str): directory path for citeseer dataset.\n    \"\"\"\n    feat_cache_path, edge_cache_path = get_citeseer_filepath(\n        download_if_not_exist=download_if_not_exist)\n    dirpath = os.path.dirname(feat_cache_path)\n    dirpath2 = os.path.dirname(edge_cache_path)\n    assert dirpath == dirpath2\n    return dirpath\n\n\ndef get_citeseer_filepath(download_if_not_exist=True):\n    # type: (bool) -> Tuple[str, str]\n    \"\"\"Construct a filepath which stores citeseer dataset.\n\n    This method check whether the file exist or not, and downloaded it\n    if necessary.\n\n    Args:\n        download_if_not_exist (bool): If ``True``, download dataset\n            if it is not downloaded yet.\n    Returns:\n        feat_cache_path (str): file path for citeseer dataset (features).\n        edge_cache_path (str): file path for citeseer dataset (edge index).\n    \"\"\"\n    feat_cache_path, edge_cache_path = _get_citeseer_filepath()\n    if not os.path.exists(feat_cache_path):\n        if download_if_not_exist:\n            is_successful = 
download_and_extract_citeseer(\n                save_dirpath=os.path.dirname(feat_cache_path))\n            if not is_successful:\n                logger = getLogger(__name__)\n                logger.warning('Download failed.')\n    return feat_cache_path, edge_cache_path\n\n\ndef _get_citeseer_filepath():\n    # type: () -> Tuple[str, str]\n    \"\"\"Construct a filepath which stores citeseer dataset.\n\n    This method does not check if the file is already downloaded or not.\n    Returns:\n        feat_cache_path (str): file path for citeseer dataset (features).\n        edge_cache_path (str): file path for citeseer dataset (edge index).\n    \"\"\"\n    cache_root = download.get_dataset_directory(_root)\n    feat_cache_path = os.path.join(cache_root, feat_file_name)\n    edge_cache_path = os.path.join(cache_root, edge_file_name)\n    return feat_cache_path, edge_cache_path\n\n\ndef download_and_extract_citeseer(save_dirpath):\n    # type: (str) -> bool\n    print('downloading citeseer dataset...')\n    download_file_path = download.cached_download(download_url)\n    print('extracting citeseer dataset...')\n    tf = tarfile.open(download_file_path, 'r')\n    tf.extractall(os.path.dirname(save_dirpath))\n    return True\n"
  },
  {
    "path": "chainer_chemistry/datasets/citation_network/cora.py",
    "content": "from logging import getLogger\nimport os\nimport tarfile\nfrom typing import List, Tuple  # NOQA\n\nfrom chainer.dataset import download\n\ndownload_url = 'https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz'\nfeat_file_name = 'cora.content'\nedge_file_name = 'cora.cites'\n\n_root = 'pfnet/chainer/cora'\n\n_label_names = [\n    'Case_Based', 'Genetic_Algorithms', 'Neural_Networks',\n    'Probabilistic_Methods', 'Reinforcement_Learning', 'Rule_Learning',\n    'Theory'\n]\n\n\ndef get_cora_label_names():\n    # type: () -> List[str]\n    \"\"\"Return label names of Cora dataset.\"\"\"\n    return _label_names\n\n\ndef get_cora_dirpath(download_if_not_exist=True):\n    # type: (bool) -> str\n    \"\"\"Construct a dirpath which stores Cora dataset.\n\n    This method check whether the file exist or not, and downloaded it\n    if necessary.\n\n    Args:\n        download_if_not_exist (bool): If ``True``, download dataset\n            if it is not downloaded yet.\n\n    Returns:\n        dirpath (str): directory path for Cora dataset.\n    \"\"\"\n    feat_cache_path, edge_cache_path = get_cora_filepath(\n        download_if_not_exist=download_if_not_exist)\n    dirpath = os.path.dirname(feat_cache_path)\n    dirpath2 = os.path.dirname(edge_cache_path)\n    assert dirpath == dirpath2\n    return dirpath\n\n\ndef get_cora_filepath(download_if_not_exist=True):\n    # type: (bool) -> Tuple[str, str]\n    \"\"\"Construct a filepath which stores Cora dataset.\n\n    This method check whether the file exist or not, and downloaded it\n    if necessary.\n\n    Args:\n        download_if_not_exist (bool): If ``True``, download dataset\n            if it is not downloaded yet.\n\n    Returns:\n        feat_cache_path (str): file path for Cora dataset (features).\n        edge_cache_path (str): file path for Cora dataset (edge index).\n    \"\"\"\n    feat_cache_path, edge_cache_path = _get_cora_filepath()\n    if not os.path.exists(feat_cache_path):\n        if 
download_if_not_exist:\n            is_successful = download_and_extract_cora(\n                save_dirpath=os.path.dirname(feat_cache_path))\n            if not is_successful:\n                logger = getLogger(__name__)\n                logger.warning('Download failed.')\n    return feat_cache_path, edge_cache_path\n\n\ndef _get_cora_filepath():\n    # type: () -> Tuple[str, str]\n    \"\"\"Construct a filepath which stores Cora dataset.\n\n    This method does not check if the file is already downloaded or not.\n\n    Returns:\n        feat_cache_path (str): file path for Cora dataset (features).\n        edge_cache_path (str): file path for Cora dataset (edge index).\n    \"\"\"\n    cache_root = download.get_dataset_directory(_root)\n    feat_cache_path = os.path.join(cache_root, feat_file_name)\n    edge_cache_path = os.path.join(cache_root, edge_file_name)\n    return feat_cache_path, edge_cache_path\n\n\ndef download_and_extract_cora(save_dirpath):\n    # type: (str) -> bool\n    print('downloading cora dataset...')\n    download_file_path = download.cached_download(download_url)\n    print('extracting cora dataset...')\n    tf = tarfile.open(download_file_path, 'r')\n    tf.extractall(os.path.dirname(save_dirpath))\n    return True\n"
  },
  {
    "path": "chainer_chemistry/datasets/molnet/__init__.py",
    "content": "from chainer_chemistry.datasets.molnet import chembl_tasks  # NOQA\nfrom chainer_chemistry.datasets.molnet import molnet  # NOQA\nfrom chainer_chemistry.datasets.molnet import molnet_config  # NOQA\nfrom chainer_chemistry.datasets.molnet import pdbbind_time  # NOQA\nfrom chainer_chemistry.datasets.molnet import toxcast_tasks  # NOQA\n\nfrom chainer_chemistry.datasets.molnet.molnet import get_grid_featurized_pdbbind_dataset  # NOQA\nfrom chainer_chemistry.datasets.molnet.molnet import get_molnet_dataframe  # NOQA\nfrom chainer_chemistry.datasets.molnet.molnet import get_molnet_dataset  # NOQA\nfrom chainer_chemistry.datasets.molnet.molnet import get_molnet_filepath  # NOQA\nfrom chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/datasets/molnet/chembl_tasks.py",
    "content": "# flake8: noqa\n\nchembl_tasks = [\n    'CHEMBL1075104', 'CHEMBL1075228', 'CHEMBL1075284', 'CHEMBL1163101',\n    'CHEMBL1163125', 'CHEMBL1255149', 'CHEMBL1293289', 'CHEMBL1741186',\n    'CHEMBL1790', 'CHEMBL1792', 'CHEMBL1801', 'CHEMBL1804',\n    'CHEMBL1811', 'CHEMBL1821', 'CHEMBL1824', 'CHEMBL1827',\n    'CHEMBL1829', 'CHEMBL1833', 'CHEMBL1836', 'CHEMBL1844',\n    'CHEMBL1850', 'CHEMBL1853', 'CHEMBL1862', 'CHEMBL1865',\n    'CHEMBL1867', 'CHEMBL1868', 'CHEMBL1871', 'CHEMBL1873',\n    'CHEMBL1875', 'CHEMBL1881', 'CHEMBL1889', 'CHEMBL1898',\n    'CHEMBL1899', 'CHEMBL1900', 'CHEMBL1901', 'CHEMBL1906',\n    'CHEMBL1908', 'CHEMBL1913', 'CHEMBL1914', 'CHEMBL1916',\n    'CHEMBL1917', 'CHEMBL1921', 'CHEMBL1926', 'CHEMBL1936',\n    'CHEMBL1937', 'CHEMBL1941', 'CHEMBL1942', 'CHEMBL1951',\n    'CHEMBL1952', 'CHEMBL1955', 'CHEMBL1957', 'CHEMBL1968',\n    'CHEMBL1974', 'CHEMBL1977', 'CHEMBL1978', 'CHEMBL1980',\n    'CHEMBL1981', 'CHEMBL1983', 'CHEMBL1991', 'CHEMBL1994',\n    'CHEMBL1995', 'CHEMBL2000', 'CHEMBL2007', 'CHEMBL2014',\n    'CHEMBL2016', 'CHEMBL202', 'CHEMBL2028', 'CHEMBL203',\n    'CHEMBL2034', 'CHEMBL2035', 'CHEMBL2039', 'CHEMBL204',\n    'CHEMBL2041', 'CHEMBL2047', 'CHEMBL2049', 'CHEMBL205',\n    'CHEMBL2056', 'CHEMBL206', 'CHEMBL2069', 'CHEMBL208',\n    'CHEMBL209', 'CHEMBL210', 'CHEMBL211', 'CHEMBL213',\n    'CHEMBL214', 'CHEMBL2147', 'CHEMBL2148', 'CHEMBL215',\n    'CHEMBL216', 'CHEMBL217', 'CHEMBL218', 'CHEMBL2185',\n    'CHEMBL219', 'CHEMBL220', 'CHEMBL2208', 'CHEMBL221',\n    'CHEMBL222', 'CHEMBL223', 'CHEMBL224', 'CHEMBL2243',\n    'CHEMBL225', 'CHEMBL226', 'CHEMBL2265', 'CHEMBL2274',\n    'CHEMBL2276', 'CHEMBL228', 'CHEMBL2285', 'CHEMBL229',\n    'CHEMBL2292', 'CHEMBL230', 'CHEMBL2304402', 'CHEMBL2304404',\n    'CHEMBL231', 'CHEMBL2318', 'CHEMBL232', 'CHEMBL2326',\n    'CHEMBL2327', 'CHEMBL2329', 'CHEMBL233', 'CHEMBL2334',\n    'CHEMBL2335', 'CHEMBL2337', 'CHEMBL234', 'CHEMBL2345',\n    'CHEMBL235', 'CHEMBL236', 'CHEMBL2363', 
'CHEMBL2366505',\n    'CHEMBL2366516', 'CHEMBL237', 'CHEMBL238', 'CHEMBL239',\n    'CHEMBL2391', 'CHEMBL2397', 'CHEMBL240', 'CHEMBL2409',\n    'CHEMBL241', 'CHEMBL242', 'CHEMBL2425', 'CHEMBL2431',\n    'CHEMBL244', 'CHEMBL245', 'CHEMBL246', 'CHEMBL247',\n    'CHEMBL2470', 'CHEMBL248', 'CHEMBL2487', 'CHEMBL2488',\n    'CHEMBL2489', 'CHEMBL249', 'CHEMBL2492', 'CHEMBL2499',\n    'CHEMBL251', 'CHEMBL2525', 'CHEMBL2527', 'CHEMBL253',\n    'CHEMBL2534', 'CHEMBL254', 'CHEMBL255', 'CHEMBL256',\n    'CHEMBL2563', 'CHEMBL2564', 'CHEMBL2575', 'CHEMBL258',\n    'CHEMBL2581', 'CHEMBL259', 'CHEMBL2599', 'CHEMBL260',\n    'CHEMBL261', 'CHEMBL262', 'CHEMBL2622', 'CHEMBL2637',\n    'CHEMBL264', 'CHEMBL265', 'CHEMBL2652', 'CHEMBL267',\n    'CHEMBL268', 'CHEMBL269', 'CHEMBL2695', 'CHEMBL270',\n    'CHEMBL2716', 'CHEMBL2722', 'CHEMBL273', 'CHEMBL274',\n    'CHEMBL2742', 'CHEMBL2749', 'CHEMBL275', 'CHEMBL276',\n    'CHEMBL2782', 'CHEMBL2789', 'CHEMBL279', 'CHEMBL280',\n    'CHEMBL2803', 'CHEMBL2808', 'CHEMBL2815', 'CHEMBL2820',\n    'CHEMBL2828', 'CHEMBL283', 'CHEMBL2835', 'CHEMBL284',\n    'CHEMBL2842', 'CHEMBL2858', 'CHEMBL286', 'CHEMBL2868',\n    'CHEMBL287', 'CHEMBL2871', 'CHEMBL288', 'CHEMBL2882',\n    'CHEMBL2885', 'CHEMBL2902', 'CHEMBL2903', 'CHEMBL2949',\n    'CHEMBL2954', 'CHEMBL2959', 'CHEMBL2971', 'CHEMBL2973',\n    'CHEMBL298', 'CHEMBL299', 'CHEMBL2993', 'CHEMBL2996',\n    'CHEMBL301', 'CHEMBL3012', 'CHEMBL3018', 'CHEMBL302',\n    'CHEMBL3024', 'CHEMBL3025', 'CHEMBL3037', 'CHEMBL304',\n    'CHEMBL3045', 'CHEMBL3048', 'CHEMBL3060', 'CHEMBL3072',\n    'CHEMBL308', 'CHEMBL309', 'CHEMBL3105', 'CHEMBL3116',\n    'CHEMBL312', 'CHEMBL313', 'CHEMBL3130', 'CHEMBL3138',\n    'CHEMBL3142', 'CHEMBL3145', 'CHEMBL3155', 'CHEMBL3166',\n    'CHEMBL318', 'CHEMBL3180', 'CHEMBL3181', 'CHEMBL319',\n    'CHEMBL3192', 'CHEMBL3199', 'CHEMBL3202', 'CHEMBL321',\n    'CHEMBL322', 'CHEMBL3222', 'CHEMBL3223', 'CHEMBL3227',\n    'CHEMBL3229', 'CHEMBL3230', 'CHEMBL3231', 'CHEMBL324',\n    'CHEMBL3242', 
'CHEMBL325', 'CHEMBL326', 'CHEMBL3267',\n    'CHEMBL3286', 'CHEMBL3305', 'CHEMBL331', 'CHEMBL3310',\n    'CHEMBL332', 'CHEMBL333', 'CHEMBL3332', 'CHEMBL335',\n    'CHEMBL3351', 'CHEMBL3358', 'CHEMBL3360', 'CHEMBL3361',\n    'CHEMBL3371', 'CHEMBL338', 'CHEMBL339', 'CHEMBL3403',\n    'CHEMBL3426', 'CHEMBL3438', 'CHEMBL344', 'CHEMBL3464',\n    'CHEMBL3468', 'CHEMBL3471', 'CHEMBL3473', 'CHEMBL3476',\n    'CHEMBL3510', 'CHEMBL3522', 'CHEMBL3524', 'CHEMBL3553',\n    'CHEMBL3563', 'CHEMBL3568', 'CHEMBL3571', 'CHEMBL3582',\n    'CHEMBL3587', 'CHEMBL3594', 'CHEMBL3602', 'CHEMBL3614',\n    'CHEMBL3629', 'CHEMBL3650', 'CHEMBL3687', 'CHEMBL3699',\n    'CHEMBL3706', 'CHEMBL3710', 'CHEMBL3717', 'CHEMBL3729',\n    'CHEMBL3746', 'CHEMBL3759', 'CHEMBL3766', 'CHEMBL3769',\n    'CHEMBL3772', 'CHEMBL3775', 'CHEMBL3776', 'CHEMBL3778',\n    'CHEMBL3788', 'CHEMBL3795', 'CHEMBL3807', 'CHEMBL3836',\n    'CHEMBL3837', 'CHEMBL3864', 'CHEMBL3869', 'CHEMBL3892',\n    'CHEMBL3910', 'CHEMBL3912', 'CHEMBL3920', 'CHEMBL3943',\n    'CHEMBL3952', 'CHEMBL3969', 'CHEMBL3976', 'CHEMBL3979',\n    'CHEMBL3983', 'CHEMBL3991', 'CHEMBL3996', 'CHEMBL4005',\n    'CHEMBL4040', 'CHEMBL4072', 'CHEMBL4073', 'CHEMBL4077',\n    'CHEMBL4078', 'CHEMBL4093', 'CHEMBL4102', 'CHEMBL4124',\n    'CHEMBL4128', 'CHEMBL4140', 'CHEMBL4142', 'CHEMBL4145',\n    'CHEMBL4150', 'CHEMBL4153', 'CHEMBL4179', 'CHEMBL4188',\n    'CHEMBL4191', 'CHEMBL4203', 'CHEMBL4204', 'CHEMBL4224',\n    'CHEMBL4235', 'CHEMBL4247', 'CHEMBL4282', 'CHEMBL4296',\n    'CHEMBL4302', 'CHEMBL4321', 'CHEMBL4333', 'CHEMBL4336',\n    'CHEMBL4354', 'CHEMBL4361', 'CHEMBL4372', 'CHEMBL4393',\n    'CHEMBL4409', 'CHEMBL4414', 'CHEMBL4427', 'CHEMBL4429',\n    'CHEMBL4439', 'CHEMBL4465', 'CHEMBL4471', 'CHEMBL4477',\n    'CHEMBL4481', 'CHEMBL4482', 'CHEMBL4501', 'CHEMBL4523',\n    'CHEMBL4561', 'CHEMBL4586', 'CHEMBL4588', 'CHEMBL4599',\n    'CHEMBL4600', 'CHEMBL4608', 'CHEMBL4618', 'CHEMBL4625',\n    'CHEMBL4630', 'CHEMBL4644', 'CHEMBL4653', 'CHEMBL4657',\n    
'CHEMBL4662', 'CHEMBL4681', 'CHEMBL4683', 'CHEMBL4687',\n    'CHEMBL4696', 'CHEMBL4722', 'CHEMBL4768', 'CHEMBL4777',\n    'CHEMBL4779', 'CHEMBL4780', 'CHEMBL4789', 'CHEMBL4792',\n    'CHEMBL4793', 'CHEMBL4794', 'CHEMBL4801', 'CHEMBL4802',\n    'CHEMBL4803', 'CHEMBL4804', 'CHEMBL4816', 'CHEMBL4822',\n    'CHEMBL4828', 'CHEMBL4829', 'CHEMBL4860', 'CHEMBL4895',\n    'CHEMBL4899', 'CHEMBL4975', 'CHEMBL4980', 'CHEMBL5017',\n    'CHEMBL5024', 'CHEMBL5067', 'CHEMBL5071', 'CHEMBL5076',\n    'CHEMBL5077', 'CHEMBL5103', 'CHEMBL5113', 'CHEMBL5122',\n    'CHEMBL5145', 'CHEMBL5147', 'CHEMBL5160', 'CHEMBL5205',\n    'CHEMBL5251', 'CHEMBL5314', 'CHEMBL5328', 'CHEMBL5331',\n    'CHEMBL5373', 'CHEMBL5407', 'CHEMBL5414', 'CHEMBL5441',\n    'CHEMBL5445', 'CHEMBL5457', 'CHEMBL5491', 'CHEMBL5508',\n    'CHEMBL5543', 'CHEMBL5570', 'CHEMBL5631', 'CHEMBL5658',\n    'CHEMBL5669', 'CHEMBL5763', 'CHEMBL5800', 'CHEMBL5847',\n    'CHEMBL5932', 'CHEMBL6007', 'CHEMBL6009', 'CHEMBL6137',\n    'CHEMBL6140', 'CHEMBL6154', 'CHEMBL6164', 'CHEMBL6166',\n    'CHEMBL6184']"
  },
  {
    "path": "chainer_chemistry/datasets/molnet/molnet.py",
    "content": "import joblib\nfrom logging import getLogger\nimport os\nimport shutil\nimport tarfile\n\nimport numpy\nimport pandas\n\nfrom chainer.dataset import download\n\nfrom chainer_chemistry.dataset.parsers.csv_file_parser import CSVFileParser\nfrom chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.splitters.base_splitter import BaseSplitter\nfrom chainer_chemistry.dataset.splitters.scaffold_splitter import ScaffoldSplitter  # NOQA\nfrom chainer_chemistry.dataset.splitters.deepchem_scaffold_splitter import DeepChemScaffoldSplitter  # NOQA\nfrom chainer_chemistry.dataset.splitters import split_method_dict\nfrom chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config  # NOQA\nfrom chainer_chemistry.datasets.molnet.pdbbind_time import get_pdbbind_time\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\n\n_root = 'pfnet/chainer/molnet'\n\n\ndef get_molnet_dataset(dataset_name, preprocessor=None, labels=None,\n                       split=None, frac_train=.8, frac_valid=.1,\n                       frac_test=.1, seed=777, return_smiles=False,\n                       return_pdb_id=False, target_index=None, task_index=0,\n                       **kwargs):\n    \"\"\"Downloads, caches and preprocess MoleculeNet dataset.\n\n    Args:\n        dataset_name (str): MoleculeNet dataset name. 
If you want to know the\n            detail of MoleculeNet, please refer to\n            `official site <http://moleculenet.ai/datasets-1>`_\n            If you would like to know what dataset_name is available for\n            chainer_chemistry, please refer to `molnet_config.py`.\n        preprocessor (BasePreprocessor): Preprocessor.\n            It should be chosen based on the network to be trained.\n            If it is None, default `AtomicNumberPreprocessor` is used.\n        labels (str or list): List of target labels.\n        split (str or BaseSplitter or None): How to split dataset into train,\n            validation and test. If `None`, this functions use the splitter\n            that is recommended by MoleculeNet. Additionally You can use an\n            instance of BaseSplitter or choose it from 'random', 'stratified'\n            and 'scaffold'.\n        return_smiles (bool): If set to ``True``,\n            smiles array is also returned.\n        return_pdb_id (bool): If set to ``True``,\n            PDB ID array is also returned.\n            This argument is only used when you select 'pdbbind_smiles'.\n        target_index (list or None): target index list to partially extract\n            dataset. If `None` (default), all examples are parsed.\n        task_index (int): Target task index in dataset for stratification.\n            (Stratified Splitter only)\n    Returns (dict):\n        Dictionary that contains dataset that is already split into train,\n        valid and test dataset and 1-d numpy array with dtype=object(string)\n        which is a vector of smiles for each example or `None`.\n\n    \"\"\"\n    if dataset_name not in molnet_default_config:\n        raise ValueError(\"We don't support {} dataset. 
Please choose from {}\".\n                         format(dataset_name,\n                                list(molnet_default_config.keys())))\n\n    if dataset_name == 'pdbbind_grid':\n        pdbbind_subset = kwargs.get('pdbbind_subset')\n        return get_pdbbind_grid(pdbbind_subset, split=split,\n                                frac_train=frac_train, frac_valid=frac_valid,\n                                frac_test=frac_test, task_index=task_index)\n    if dataset_name == 'pdbbind_smiles':\n        pdbbind_subset = kwargs.get('pdbbind_subset')\n        time_list = kwargs.get('time_list')\n        return get_pdbbind_smiles(pdbbind_subset, preprocessor=preprocessor,\n                                  labels=labels, split=split,\n                                  frac_train=frac_train, frac_valid=frac_valid,\n                                  frac_test=frac_test,\n                                  return_smiles=return_smiles,\n                                  return_pdb_id=return_pdb_id,\n                                  target_index=target_index,\n                                  task_index=task_index,\n                                  time_list=time_list)\n\n    dataset_config = molnet_default_config[dataset_name]\n    labels = labels or dataset_config['tasks']\n    if isinstance(labels, str):\n        labels = [labels, ]\n\n    if preprocessor is None:\n        preprocessor = AtomicNumberPreprocessor()\n\n    if dataset_config['task_type'] == 'regression':\n        def postprocess_label(label_list):\n            return numpy.asarray(label_list, dtype=numpy.float32)\n    elif dataset_config['task_type'] == 'classification':\n        def postprocess_label(label_list):\n            label_list = numpy.asarray(label_list)\n            label_list[numpy.isnan(label_list)] = -1\n            return label_list.astype(numpy.int32)\n\n    parser = CSVFileParser(preprocessor, labels=labels,\n                           smiles_col=dataset_config['smiles_columns'],\n       
                    postprocess_label=postprocess_label)\n    if dataset_config['dataset_type'] == 'one_file_csv':\n        split = dataset_config['split'] if split is None else split\n\n        if isinstance(split, str):\n            splitter = split_method_dict[split]()\n        elif isinstance(split, BaseSplitter):\n            splitter = split\n        else:\n            raise TypeError(\"split must be None, str or instance of\"\n                            \" BaseSplitter, but got {}\".format(type(split)))\n\n        if isinstance(splitter, (ScaffoldSplitter, DeepChemScaffoldSplitter)):\n            get_smiles = True\n        else:\n            get_smiles = return_smiles\n\n        result = parser.parse(get_molnet_filepath(dataset_name),\n                              return_smiles=get_smiles,\n                              target_index=target_index, **kwargs)\n        dataset = result['dataset']\n        smiles = result['smiles']\n        train_ind, valid_ind, test_ind = \\\n            splitter.train_valid_test_split(dataset, smiles_list=smiles,\n                                            task_index=task_index,\n                                            frac_train=frac_train,\n                                            frac_valid=frac_valid,\n                                            frac_test=frac_test, **kwargs)\n        train = NumpyTupleDataset(*dataset.features[train_ind])\n        valid = NumpyTupleDataset(*dataset.features[valid_ind])\n        test = NumpyTupleDataset(*dataset.features[test_ind])\n\n        result['dataset'] = (train, valid, test)\n        if return_smiles:\n            train_smiles = smiles[train_ind]\n            valid_smiles = smiles[valid_ind]\n            test_smiles = smiles[test_ind]\n            result['smiles'] = (train_smiles, valid_smiles, test_smiles)\n        else:\n            result['smiles'] = None\n    elif dataset_config['dataset_type'] == 'separate_csv':\n        result = {}\n        train_result = 
parser.parse(get_molnet_filepath(dataset_name, 'train'),\n                                    return_smiles=return_smiles,\n                                    target_index=target_index)\n        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),\n                                    return_smiles=return_smiles,\n                                    target_index=target_index)\n        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),\n                                   return_smiles=return_smiles,\n                                   target_index=target_index)\n        result['dataset'] = (train_result['dataset'], valid_result['dataset'],\n                             test_result['dataset'])\n        result['smiles'] = (train_result['smiles'], valid_result['smiles'],\n                            test_result['smiles'])\n    else:\n        raise ValueError('dataset_type={} is not supported'\n                         .format(dataset_config['dataset_type']))\n\n    return result\n\n\ndef get_molnet_dataframe(dataset_name, pdbbind_subset=None):\n    \"\"\"Downloads, caches and get the dataframe of MoleculeNet dataset.\n\n    Args:\n        dataset_name (str): MoleculeNet dataset name. If you want to know the\n            detail of MoleculeNet, please refer to\n            `official site <http://moleculenet.ai/datasets-1>`_\n            If you would like to know what dataset_name is available for\n            chainer_chemistry, please refer to `molnet_config.py`.\n        pdbbind_subset (str): PDBbind dataset subset name. If you want to know\n            the detail of subset, please refer to `official site\n            <http://www.pdbbind.org.cn/download/pdbbind_2017_intro.pdf>`\n    Returns (pandas.DataFrame or tuple):\n        DataFrame of dataset without any preprocessing. 
When the files of\n        dataset are seprated, this function returns multiple DataFrame.\n\n    \"\"\"\n    if dataset_name not in molnet_default_config:\n        raise ValueError(\"We don't support {} dataset. Please choose from {}\".\n                         format(dataset_name,\n                                list(molnet_default_config.keys())))\n    if dataset_name == 'pdbbind_grid':\n        raise ValueError('pdbbind_grid dataset is not supported. Please ',\n                         'choose pdbbind_smiles dataset.')\n    dataset_config = molnet_default_config[dataset_name]\n    if dataset_config['dataset_type'] == 'one_file_csv':\n        df = pandas.read_csv(get_molnet_filepath(\n            dataset_name, pdbbind_subset=pdbbind_subset))\n        return df\n    elif dataset_config['dataset_type'] == 'separate_csv':\n        train_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'train'))\n        valid_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'valid'))\n        test_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'test'))\n        return train_df, valid_df, test_df\n    else:\n        raise ValueError('dataset_type={} is not supported'\n                         .format(dataset_config['dataset_type']))\n\n\ndef get_molnet_filepath(dataset_name, filetype='onefile',\n                        download_if_not_exist=True, pdbbind_subset=None):\n    \"\"\"Construct a file path which stores MoleculeNet dataset.\n\n    This method check whether the file exist or not, and downloaded it if\n    necessary.\n\n    Args:\n        dataset_name (str): MoleculeNet dataset name.\n        file_type (str): either 'onefile', 'train', 'valid', 'test'\n        download_if_not_exist (bool): Download a file if it does not exist.\n\n    Returns (str): filepath for specific MoleculeNet dataset\n\n    \"\"\"\n    filetype_supported = ['onefile', 'train', 'valid', 'test']\n    if filetype not in filetype_supported:\n        raise ValueError(\"filetype {} not 
supported, please choose filetype \"\n                         \"from {}\".format(filetype, filetype_supported))\n    if filetype == 'onefile':\n        url_key = 'url'\n    else:\n        url_key = filetype + '_url'\n    if dataset_name == 'pdbbind_smiles':\n        file_url = molnet_default_config[dataset_name][url_key][pdbbind_subset]\n    else:\n        file_url = molnet_default_config[dataset_name][url_key]\n    file_name = file_url.split('/')[-1]\n    cache_path = _get_molnet_filepath(file_name)\n    if not os.path.exists(cache_path):\n        if download_if_not_exist:\n            is_successful = download_dataset(file_url,\n                                             save_filepath=cache_path)\n            if not is_successful:\n                logger = getLogger(__name__)\n                logger.warning('Download failed.')\n    return cache_path\n\n\ndef _get_molnet_filepath(file_name):\n    \"\"\"Construct a filepath which stores MoleculeNet dataset in csv\n\n    This method does not check if the file is already downloaded or not.\n\n    Args:\n        file_name (str): file name of MoleculeNet dataset\n\n    Returns (str): filepath for one of MoleculeNet dataset\n\n    \"\"\"\n    cache_root = download.get_dataset_directory(_root)\n    cache_path = os.path.join(cache_root, file_name)\n    return cache_path\n\n\ndef download_dataset(dataset_url, save_filepath):\n    \"\"\"Download and caches MoleculeNet Dataset\n\n    Args:\n        dataset_url (str): URL of dataset\n        save_filepath (str): filepath for dataset\n\n    Returns (bool): If success downloading, returning `True`.\n\n    \"\"\"\n    logger = getLogger(__name__)\n    logger.warning('Downloading {} dataset, it takes time...'\n                   .format(dataset_url.split('/')[-1]))\n    download_file_path = download.cached_download(dataset_url)\n    shutil.move(download_file_path, save_filepath)\n    # pandas can load gzipped or tarball csv file\n    return True\n\n\ndef 
get_pdbbind_smiles(pdbbind_subset, preprocessor=None, labels=None,\n                       split=None, frac_train=.8, frac_valid=.1,\n                       frac_test=.1, return_smiles=False, return_pdb_id=True,\n                       target_index=None, task_index=0, time_list=None,\n                       **kwargs):\n    \"\"\"Downloads, caches and preprocess PDBbind dataset.\n\n    Args:\n        pdbbind_subset (str): PDBbind dataset subset name. If you want to know\n            the detail of subset, please refer to `official site\n            <http://www.pdbbind.org.cn/download/pdbbind_2017_intro.pdf>`\n        preprocessor (BasePreprocessor): Preprocessor.\n            It should be chosen based on the network to be trained.\n            If it is None, default `AtomicNumberPreprocessor` is used.\n        labels (str or list): List of target labels.\n        split (str or BaseSplitter or None): How to split dataset into train,\n            validation and test. If `None`, this functions use the splitter\n            that is recommended by MoleculeNet. Additionally You can use an\n            instance of BaseSplitter or choose it from 'random', 'stratified'\n            and 'scaffold'.\n        return_smiles (bool): If set to ``True``,\n            smiles array is also returned.\n        return_pdb_id (bool): If set to ``True``,\n            PDB ID array is also returned.\n            This argument is only used when you select 'pdbbind_smiles'.\n        target_index (list or None): target index list to partially extract\n            dataset. 
If `None` (default), all examples are parsed.\n        task_index (int): Target task index in dataset for stratification.\n            (Stratified Splitter only)\n    Returns (dict):\n        Dictionary that contains dataset that is already split into train,\n        valid and test dataset and 1-d numpy arrays with dtype=object(string)\n        which are vectors of smiles and pdb_id for each example or `None`.\n\n    \"\"\"\n    config = molnet_default_config['pdbbind_smiles']\n    labels = labels or config['tasks']\n    if isinstance(labels, str):\n        labels = [labels, ]\n\n    if preprocessor is None:\n        preprocessor = AtomicNumberPreprocessor()\n\n    def postprocess_label(label_list):\n        return numpy.asarray(label_list, dtype=numpy.float32)\n\n    parser = CSVFileParser(preprocessor, labels=labels,\n                           smiles_col=config['smiles_columns'],\n                           postprocess_label=postprocess_label)\n    split = config['split'] if split is None else split\n    if isinstance(split, str):\n        splitter = split_method_dict[split]()\n    elif isinstance(split, BaseSplitter):\n        splitter = split\n    else:\n        raise TypeError(\"split must be None, str or instance of\"\n                        \" BaseSplitter, but got {}\".format(type(split)))\n\n    result = parser.parse(get_molnet_filepath('pdbbind_smiles',\n                                              pdbbind_subset=pdbbind_subset),\n                          return_smiles=return_smiles,\n                          return_is_successful=True,\n                          target_index=target_index)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    is_successful = result['is_successful']\n\n    if return_pdb_id:\n        df = pandas.read_csv(\n            get_molnet_filepath('pdbbind_smiles',\n                                pdbbind_subset=pdbbind_subset))\n        pdb_id = df['id'][is_successful]\n    else:\n        pdb_id = None\n\n    
train_ind, valid_ind, test_ind = \\\n        splitter.train_valid_test_split(dataset, time_list=time_list,\n                                        smiles_list=smiles,\n                                        task_index=task_index,\n                                        frac_train=frac_train,\n                                        frac_valid=frac_valid,\n                                        frac_test=frac_test, **kwargs)\n    train = NumpyTupleDataset(*dataset.features[train_ind])\n    valid = NumpyTupleDataset(*dataset.features[valid_ind])\n    test = NumpyTupleDataset(*dataset.features[test_ind])\n\n    result['dataset'] = (train, valid, test)\n\n    if return_smiles:\n        train_smiles = smiles[train_ind]\n        valid_smiles = smiles[valid_ind]\n        test_smiles = smiles[test_ind]\n        result['smiles'] = (train_smiles, valid_smiles, test_smiles)\n    else:\n        result['smiles'] = None\n\n    if return_pdb_id:\n        train_pdb_id = pdb_id[train_ind]\n        valid_pdb_id = pdb_id[valid_ind]\n        test_pdb_id = pdb_id[test_ind]\n        result['pdb_id'] = (train_pdb_id, valid_pdb_id, test_pdb_id)\n    else:\n        result['pdb_id'] = None\n    return result\n\n\ndef get_pdbbind_grid(pdbbind_subset, split=None, frac_train=.8, frac_valid=.1,\n                     frac_test=.1, task_index=0, **kwargs):\n    \"\"\"Downloads, caches and grid-featurize PDBbind dataset.\n\n    Args:\n        pdbbind_subset (str): PDBbind dataset subset name. If you want to know\n            the detail of subset, please refer to `official site\n            <http://www.pdbbind.org.cn/download/pdbbind_2017_intro.pdf>`\n        split (str or BaseSplitter or None): How to split dataset into train,\n            validation and test. If `None`, this functions use the splitter\n            that is recommended by MoleculeNet. 
Additionally You can use an\n            instance of BaseSplitter or choose it from 'random', 'stratified'\n            and 'scaffold'.\n        task_index (int): Target task index in dataset for stratification.\n            (Stratified Splitter only)\n    Returns (dict):\n        Dictionary that contains dataset that is already split into train,\n        valid and test dataset and 1-d numpy arrays with dtype=object(string)\n        which are vectors of smiles and pdb_id for each example or `None`.\n\n    \"\"\"\n    result = {}\n    dataset = get_grid_featurized_pdbbind_dataset(pdbbind_subset)\n    if split is None:\n        split = molnet_default_config['pdbbind_grid']['split']\n    if isinstance(split, str):\n        splitter = split_method_dict[split]()\n    elif isinstance(split, BaseSplitter):\n        splitter = split\n    else:\n        raise TypeError(\"split must be None, str, or instance of\"\n                        \" BaseSplitter, but got {}\".format(type(split)))\n    time_list = get_pdbbind_time()\n    train_ind, valid_ind, test_ind = \\\n        splitter.train_valid_test_split(dataset, time_list=time_list,\n                                        smiles_list=None,\n                                        task_index=task_index,\n                                        frac_train=frac_train,\n                                        frac_valid=frac_valid,\n                                        frac_test=frac_test, **kwargs)\n    train = NumpyTupleDataset(*dataset.features[train_ind])\n    valid = NumpyTupleDataset(*dataset.features[valid_ind])\n    test = NumpyTupleDataset(*dataset.features[test_ind])\n\n    result['dataset'] = (train, valid, test)\n    result['smiles'] = None\n    return result\n\n\ndef get_grid_featurized_pdbbind_dataset(subset):\n    \"\"\"Downloads and caches grid featurized PDBBind dataset.\n\n    Args:\n        subset (str): subset name of PDBBind dataset.\n\n    Returns (NumpyTupleDataset):\n        grid featurized PDBBind 
dataset.\n\n    \"\"\"\n    x_path, y_path = get_grid_featurized_pdbbind_filepath(subset)\n    x = joblib.load(x_path).astype('i')\n    y = joblib.load(y_path).astype('f')\n    dataset = NumpyTupleDataset(x, y)\n    return dataset\n\n\ndef get_grid_featurized_pdbbind_dirpath(subset, download_if_not_exist=True):\n    \"\"\"Construct a directory path which stores grid featurized PDBBind dataset.\n\n    This method check whether the file exist or not, and downloaded it if\n    necessary.\n\n    Args:\n        subset (str): subset name of PDBBind dataset.\n        download_if_not_exist (bool): Download a file if it does not exist.\n\n    Returns (str): directory path for specific subset of PDBBind dataset.\n\n    \"\"\"\n    subset_supported = ['core', 'full', 'refined']\n    if subset not in subset_supported:\n        raise ValueError(\"subset {} not supported, please choose filetype \"\n                         \"from {}\".format(subset, subset_supported))\n    file_url = \\\n        molnet_default_config['pdbbind_grid']['url'][subset]\n    file_name = file_url.split('/')[-1]\n    cache_path = _get_molnet_filepath(file_name)\n    if not os.path.exists(cache_path):\n        if download_if_not_exist:\n            is_successful = download_dataset(file_url,\n                                             save_filepath=cache_path)\n            if not is_successful:\n                logger = getLogger(__name__)\n                logger.warning('Download failed.')\n    return cache_path\n\n\ndef get_grid_featurized_pdbbind_filepath(subset):\n    \"\"\"Construct a filepath which stores featurized PDBBind dataset in joblib\n\n    This method does not check if the file is already downloaded or not.\n\n    Args:\n        subset (str): subset name of PDBBind dataset\n\n    Returns:\n        x_path (str): filepath for feature vectors\n        y_path (str): filepath for -logKd/Ki\n\n    \"\"\"\n    dirpath = get_grid_featurized_pdbbind_dirpath(subset=subset)\n    savedir = 
'/'.join(dirpath.split('/')[:-1]) + '/'\n    with tarfile.open(dirpath, 'r:gz') as tar:\n        tar.extractall(savedir)\n        x_path = savedir + subset + '_grid/shard-0-X.joblib'\n        y_path = savedir + subset + '_grid/shard-0-y.joblib'\n    return x_path, y_path\n"
  },
  {
    "path": "chainer_chemistry/datasets/molnet/molnet_config.py",
    "content": "import chainer.functions as F\nimport chainer_chemistry\n\nfrom chainer_chemistry.datasets.molnet.chembl_tasks import chembl_tasks\nfrom chainer_chemistry.datasets.molnet.toxcast_tasks import toxcast_tasks\nfrom chainer_chemistry.functions import mean_absolute_error\nfrom chainer_chemistry.functions import mean_squared_error\nfrom chainer_chemistry.training.extensions.prc_auc_evaluator import PRCAUCEvaluator  # NOQA\nfrom chainer_chemistry.training.extensions.roc_auc_evaluator import ROCAUCEvaluator  # NOQA\n\nmolnet_base = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/'\nfeaturized_base = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/' \\\n                  + 'featurized_datasets/'\n\n\ndef mae(x, t):\n    return mean_absolute_error(x, t, ignore_nan=True)\n\n\ndef mse(x, t):\n    return mean_squared_error(x, t, ignore_nan=True)\n\n\ndef rmse(x, t):\n    return F.sqrt(mse(x, t))\n\n\ndef r2_score(x, t):\n    return chainer_chemistry.functions.r2_score(x, t, ignore_nan=True)\n\n\nmolnet_default_config = {\n    \"bace_Class\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": F.sigmoid_cross_entropy,\n        \"metrics\": {'binary_accuracy': F.binary_accuracy,\n                    'roc_auc': ROCAUCEvaluator},\n        \"smiles_columns\": 'mol',\n        \"split\": 'random',\n        \"task_type\": 'classification',\n        \"tasks\": [\"Class\"],\n        \"url\": molnet_base + 'bace.csv',\n    },\n    \"bace_pIC50\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": mse,\n        \"metrics\": {'MAE': mae},\n        \"smiles_columns\": 'mol',\n        \"split\": 'random',\n        \"task_type\": 'regression',\n        \"tasks\": [\"pIC50\"],\n        \"url\": molnet_base + 'bace.csv',\n    },\n    \"bbbp\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": F.sigmoid_cross_entropy,\n        \"metrics\": {'binary_accuracy': F.binary_accuracy,\n                    'roc_auc': 
ROCAUCEvaluator},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'scaffold',\n        \"task_type\": 'classification',\n        \"tasks\": [\"p_np\"],\n        \"url\": molnet_base + 'BBBP.csv',\n    },\n    # TODO(mottodora): There are many separating ways for chembl dataset\n    # TODO(mottodora): only use 5thresh dataset(sparse dataset is not used.)\n    # TODO(mottodora): support mix dataset type in example\n    \"chembl\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": mse,\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'mix',\n        \"tasks\": chembl_tasks,\n        \"url\": molnet_base + 'chembl_5thresh.csv.gz',\n    },\n    \"clearance\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": mse,\n        \"metrics\": {'RMSE': rmse},\n        \"smiles_columns\": 'smile',\n        \"split\": 'random',\n        \"task_type\": 'regression',\n        \"tasks\": [\"target\"],\n        \"url\": molnet_base + 'clearance.csv',\n    },\n    \"clintox\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": F.sigmoid_cross_entropy,\n        \"metrics\": {'binary_accuracy': F.binary_accuracy,\n                    'roc_auc': ROCAUCEvaluator},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'classification',\n        \"tasks\": [\"FDA_APPROVED\", \"CT_TOX\"],\n        \"url\": molnet_base + 'clintox.csv.gz',\n    },\n    \"delaney\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": mse,\n        \"metrics\": {'RMSE': rmse},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'regression',\n        \"tasks\": ['measured log solubility in mols per litre'],\n        \"url\": molnet_base + 'delaney-processed.csv',\n    },\n    \"HIV\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": F.sigmoid_cross_entropy,\n        \"metrics\": {'binary_accuracy': 
F.binary_accuracy,\n                    'roc_auc': ROCAUCEvaluator},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'scaffold',\n        \"task_type\": 'classification',\n        \"tasks\": [\"HIV_active\"],\n        \"url\": molnet_base + 'HIV.csv',\n    },\n    \"hopv\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": mse,\n        \"metrics\": {'RMSE': rmse},\n        \"smiles_columns\": 'hopv.csv',\n        \"split\": 'random',\n        \"task_type\": 'regression',\n        \"tasks\": ['HOMO', 'LUMO', 'electrochemical_gap', 'optical_gap',\n                  'PCE', 'V_OC', 'J_SC', 'fill_factor'],\n        \"url\": molnet_base + 'hopv.tar.gz',\n    },\n    \"kaggle\": {\n        \"dataset_type\": 'separate_csv',\n        \"loss\": mse,\n        \"metrics\": {'RMSE': rmse},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'regression',\n        \"tasks\": ['3A4', 'CB1', 'DPP4', 'HIVINT', 'HIV_PROT', 'LOGD', 'METAB',\n                  'NK1', 'OX1', 'OX2', 'PGP', 'PPB', 'RAT_F', 'TDI', 'THROMBIN'\n                  ],\n        \"test_url\": molnet_base + 'KAGGLE_test2_'\n                                  'disguised_combined_full.csv.gz',\n        \"train_url\": molnet_base + 'KAGGLE_training_'\n                                   'disguised_combined_full.csv.gz',\n        \"valid_url\": molnet_base + 'KAGGLE_test1_'\n                                   'disguised_combined_full.csv.gz',\n    },\n\n    \"lipo\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": mse,\n        \"metrics\": {'RMSE': rmse},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'regression',\n        \"tasks\": ['exp'],\n        \"url\": molnet_base + 'Lipophilicity.csv',\n    },\n    \"muv\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": F.sigmoid_cross_entropy,\n        \"metrics\": {'binary_accuracy': F.binary_accuracy,\n                    
'prc_auc': PRCAUCEvaluator},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'classification',\n        \"tasks\": ['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',\n                  'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',\n                  'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',\n                  'MUV-466', 'MUV-832'],\n        \"url\": molnet_base + 'muv.csv.gz',\n    },\n    \"nci\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": mse,\n        \"metrics\": {'RMSE': rmse},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'regression',\n        \"tasks\": ['CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226',\n                  'SR', 'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226',\n                  'NCI-H23', 'NCI-H322M', 'NCI-H460', 'NCI-H522', 'COLO 205',\n                  'HCC-2998', 'HCT-116', 'HCT-15', 'HT29', 'KM12', 'SW-620',\n                  'SF-268', 'SF-295', 'SF-539', 'SNB-19', 'SNB-75', 'U251',\n                  'LOX IMVI', 'MALME-3M', 'M14', 'MDA-MB-435', 'SK-MEL-2',\n                  'SK-MEL-28', 'SK-MEL-5', 'UACC-257', 'UACC-62', 'IGR-OV1',\n                  'OVCAR-3', 'OVCAR-4', 'OVCAR-5', 'OVCAR-8', 'NCI/ADR-RES',\n                  'SK-OV-3', '786-0', 'A498', 'ACHN', 'CAKI-1', 'RXF 393',\n                  'SN12C', 'TK-10', 'UO-31', 'PC-3', 'DU-145', 'MCF7',\n                  'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549', 'T-47D'\n                  ],\n        \"url\": molnet_base + 'nci_unique.csv',\n    },\n    \"pcba\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": F.sigmoid_cross_entropy,\n        \"metrics\": {'binary_accuracy': F.binary_accuracy,\n                    'prc_auc': PRCAUCEvaluator},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'classification',\n        \"tasks\":\n            ['PCBA-1030', 
'PCBA-1379', 'PCBA-1452', 'PCBA-1454', 'PCBA-1457',\n             'PCBA-1458', 'PCBA-1460', 'PCBA-1461', 'PCBA-1468', 'PCBA-1469',\n             'PCBA-1471', 'PCBA-1479', 'PCBA-1631', 'PCBA-1634', 'PCBA-1688',\n             'PCBA-1721', 'PCBA-2100', 'PCBA-2101', 'PCBA-2147', 'PCBA-2242',\n             'PCBA-2326', 'PCBA-2451', 'PCBA-2517', 'PCBA-2528', 'PCBA-2546',\n             'PCBA-2549', 'PCBA-2551', 'PCBA-2662', 'PCBA-2675', 'PCBA-2676',\n             'PCBA-411', 'PCBA-463254', 'PCBA-485281', 'PCBA-485290',\n             'PCBA-485294', 'PCBA-485297', 'PCBA-485313', 'PCBA-485314',\n             'PCBA-485341', 'PCBA-485349', 'PCBA-485353', 'PCBA-485360',\n             'PCBA-485364', 'PCBA-485367', 'PCBA-492947', 'PCBA-493208',\n             'PCBA-504327', 'PCBA-504332', 'PCBA-504333', 'PCBA-504339',\n             'PCBA-504444', 'PCBA-504466', 'PCBA-504467', 'PCBA-504706',\n             'PCBA-504842', 'PCBA-504845', 'PCBA-504847', 'PCBA-504891',\n             'PCBA-540276', 'PCBA-540317', 'PCBA-588342', 'PCBA-588453',\n             'PCBA-588456', 'PCBA-588579', 'PCBA-588590', 'PCBA-588591',\n             'PCBA-588795', 'PCBA-588855', 'PCBA-602179', 'PCBA-602233',\n             'PCBA-602310', 'PCBA-602313', 'PCBA-602332', 'PCBA-624170',\n             'PCBA-624171', 'PCBA-624173', 'PCBA-624202', 'PCBA-624246',\n             'PCBA-624287', 'PCBA-624288', 'PCBA-624291', 'PCBA-624296',\n             'PCBA-624297', 'PCBA-624417', 'PCBA-651635', 'PCBA-651644',\n             'PCBA-651768', 'PCBA-651965', 'PCBA-652025', 'PCBA-652104',\n             'PCBA-652105', 'PCBA-652106', 'PCBA-686970', 'PCBA-686978',\n             'PCBA-686979', 'PCBA-720504', 'PCBA-720532', 'PCBA-720542',\n             'PCBA-720551', 'PCBA-720553', 'PCBA-720579', 'PCBA-720580',\n             'PCBA-720707', 'PCBA-720708', 'PCBA-720709', 'PCBA-720711',\n             'PCBA-743255', 'PCBA-743266', 'PCBA-875', 'PCBA-881', 'PCBA-883',\n             'PCBA-884', 'PCBA-885', 'PCBA-887', 'PCBA-891', 
'PCBA-899',\n             'PCBA-902', 'PCBA-903', 'PCBA-904', 'PCBA-912', 'PCBA-914',\n             'PCBA-915', 'PCBA-924', 'PCBA-925', 'PCBA-926', 'PCBA-927',\n             'PCBA-938', 'PCBA-995'],\n        \"url\": molnet_base + 'pcba.csv.gz',\n    },\n    \"pdbbind_smiles\": {\n        \"subset\": [\"core\", \"full\", \"refined\"],\n        \"dataset_type\": 'one_file_csv',\n        \"url\": {'core': molnet_base + 'core_smiles_labels.csv',\n                'full': molnet_base + 'full_smiles_labels.csv',\n                'refined': molnet_base + 'refined_smiles_labels.csv'},\n        \"smiles_columns\": 'smiles',\n        \"metrics\": {'R2': r2_score},\n        \"split\": 'time',\n        \"task_type\": 'regression',\n        \"tasks\": [\"-logKd/Ki\"],\n    },\n    \"pdbbind_grid\": {\n        \"pdbbind_subset\": [\"core\", \"full\", \"refined\"],\n        \"dataset_type\": 'joblib',\n        \"url\": {'core': featurized_base + 'core_grid.tar.gz',\n                'full': featurized_base + 'full_grid.tar.gz',\n                'refined': featurized_base + 'refined_grid.tar.gz'},\n        \"smiles_columns\": '',\n        \"metrics\": {'R2': r2_score},\n        \"split\": 'time',\n        \"task_type\": 'regression',\n        \"tasks\": [\"-logKd/Ki\"],\n    },\n    \"ppb\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": mse,\n        \"metrics\": {'RMSE': rmse},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'regression',\n        \"tasks\": [\"exp\"],\n        \"url\": molnet_base + 'PPB.csv',\n    },\n    # TODO(motoki): there are multiple data types in qm7 dataset.\n    \"qm7\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": mse,\n        \"metrics\": {'MAE': mae},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'stratified',\n        \"task_type\": 'regression',\n        \"tasks\": [\"u0_atom\"],\n        \"url\": molnet_base + 'qm7.csv',\n    },\n    # TODO(motoki): 
there are sdf data types in qm8 dataset.\n    \"qm8\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": mse,\n        \"metrics\": {'MAE': mae},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'regression',\n        \"tasks\": [\"E1-CC2\", \"E2-CC2\", \"f1-CC2\", \"f2-CC2\", \"E1-PBE0\", \"E2-PBE0\",\n                  \"f1-PBE0\", \"f2-PBE0\", \"E1-PBE0\", \"E2-PBE0\", \"f1-PBE0\",\n                  \"f2-PBE0\", \"E1-CAM\", \"E2-CAM\", \"f1-CAM\", \"f2-CAM\"],\n        \"url\": molnet_base + 'qm8.csv',\n    },\n    # TODO(motoki): there are sdf data types in qm9 dataset.\n    \"qm9\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": mse,\n        \"metrics\": {'MAE': mae},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'regression',\n        \"tasks\": [\"mu\", \"alpha\", \"homo\", \"lumo\", \"gap\", \"r2\", \"zpve\", \"cv\",\n                  \"u0\", \"u298\", \"h298\", \"g298\"],\n        \"url\": molnet_base + 'qm9.csv',\n    },\n    \"SAMPL\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": mse,\n        \"metrics\": {'RMSE': rmse},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'regression',\n        \"tasks\": [\"expt\"],\n        \"url\": molnet_base + 'SAMPL.csv',\n    },\n    \"sider\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": F.sigmoid_cross_entropy,\n        \"metrics\": {'binary_accuracy': F.binary_accuracy,\n                    'roc_auc': ROCAUCEvaluator},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'classification',\n        \"tasks\": ['Hepatobiliary disorders',\n                  'Metabolism and nutrition disorders', 'Product issues',\n                  'Eye disorders', 'Investigations',\n                  'Musculoskeletal and connective tissue disorders',\n                  'Gastrointestinal 
disorders', 'Social circumstances',\n                  'Immune system disorders',\n                  'Reproductive system and breast disorders',\n                  'Neoplasms benign, malignant and unspecified '\n                  '(incl cysts and polyps)',\n                  'General disorders and administration site conditions',\n                  'Endocrine disorders', 'Surgical and medical procedures',\n                  'Vascular disorders', 'Blood and lymphatic system disorders',\n                  'Skin and subcutaneous tissue disorders',\n                  'Congenital, familial and genetic disorders',\n                  'Infections and infestations',\n                  'Respiratory, thoracic and mediastinal disorders',\n                  'Psychiatric disorders', 'Renal and urinary disorders',\n                  'Pregnancy, puerperium and perinatal conditions',\n                  'Ear and labyrinth disorders', 'Cardiac disorders',\n                  'Nervous system disorders',\n                  'Injury, poisoning and procedural complications'],\n        \"url\": molnet_base + 'sider.csv.gz',\n    },\n    \"tox21\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": F.sigmoid_cross_entropy,\n        \"metrics\": {'binary_accuracy': F.binary_accuracy,\n                    'roc_auc': ROCAUCEvaluator},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n        \"task_type\": 'classification',\n        \"tasks\": ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',\n                  'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE',\n                  'SR-MMP', 'SR-p53'],\n        \"url\": molnet_base + 'tox21.csv.gz',\n    },\n    \"toxcast\": {\n        \"dataset_type\": 'one_file_csv',\n        \"loss\": F.sigmoid_cross_entropy,\n        \"metrics\": {'binary_accuracy': F.binary_accuracy,\n                    'roc_auc': ROCAUCEvaluator},\n        \"smiles_columns\": 'smiles',\n        \"split\": 'random',\n      
  \"task_type\": 'classification',\n        \"tasks\": toxcast_tasks,\n        \"url\": molnet_base + 'toxcast_data.csv.gz',\n    },\n}\n"
  },
  {
    "path": "chainer_chemistry/datasets/molnet/pdbbind_time.py",
    "content": "from logging import getLogger\nimport os\nimport shutil\n\nimport pandas\n\nfrom chainer.dataset import download\n\n\ndef get_pdbbind_time():\n    \"\"\"Get time list for PDBbind dataset.\n\n    Args:\n\n    Returns(list):\n        Time list for PDBbind dataset.\n\n    \"\"\"\n    df = pandas.read_csv(get_pdbbind_time_filepath(), header=None)\n    time_list = df[1].values.tolist()\n    return time_list\n\n\ndef get_pdbbind_time_filepath(download_if_not_exist=True):\n    \"\"\"Construct a file path which stores year table of PDBbind.\n\n    This method check whether the file exist or not, and download it if\n    necessary.\n\n    Args:\n        download_if_not_exist(bool): Download a file if it does not exist.\n\n    Returns(str): filepath for year table\n\n    \"\"\"\n    url = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/' \\\n          'pdbbind_year.csv'\n    file_name = url.split('/')[-1]\n    cache_path = _get_pdbbind_time_filepath(file_name)\n    if not os.path.exists(cache_path):\n        if download_if_not_exist:\n            is_successful = download_pdbbind_time(url,\n                                                  save_filepath=cache_path)\n            if not is_successful:\n                logger = getLogger(__name__)\n                logger.warning('Download failed.')\n    return cache_path\n\n\ndef _get_pdbbind_time_filepath(file_name):\n    \"\"\"Construct a filepath which stores year table in csv.\n\n    This method does not check if the file is already downloaded or not.\n\n    Args:\n        file_name(str): file name of year table\n\n    Returns(str): filepath for one of year table\n\n    \"\"\"\n    cache_root = download.get_dataset_directory('pfnet/chainer/molnet')\n    cache_path = os.path.join(cache_root, file_name)\n    return cache_path\n\n\ndef download_pdbbind_time(url, save_filepath):\n    \"\"\"Download and caches PDBBind year table.\n\n    Args:\n        url(str): URL of year table\n        
save_filepath(str): filepath for year table\n\n    Returns(bool): If success downloading, returning `True`.\n    \"\"\"\n    download_file_path = download.cached_download(url)\n    shutil.move(download_file_path, save_filepath)\n    return True\n"
  },
  {
    "path": "chainer_chemistry/datasets/molnet/toxcast_tasks.py",
    "content": "# flake8: noqa\n\ntoxcast_tasks = ['ACEA_T47D_80hr_Negative', 'ACEA_T47D_80hr_Positive',\n       'APR_HepG2_CellCycleArrest_24h_dn',\n       'APR_HepG2_CellCycleArrest_24h_up',\n       'APR_HepG2_CellCycleArrest_72h_dn', 'APR_HepG2_CellLoss_24h_dn',\n       'APR_HepG2_CellLoss_72h_dn', 'APR_HepG2_MicrotubuleCSK_24h_dn',\n       'APR_HepG2_MicrotubuleCSK_24h_up',\n       'APR_HepG2_MicrotubuleCSK_72h_dn',\n       'APR_HepG2_MicrotubuleCSK_72h_up', 'APR_HepG2_MitoMass_24h_dn',\n       'APR_HepG2_MitoMass_24h_up', 'APR_HepG2_MitoMass_72h_dn',\n       'APR_HepG2_MitoMass_72h_up', 'APR_HepG2_MitoMembPot_1h_dn',\n       'APR_HepG2_MitoMembPot_24h_dn', 'APR_HepG2_MitoMembPot_72h_dn',\n       'APR_HepG2_MitoticArrest_24h_up', 'APR_HepG2_MitoticArrest_72h_up',\n       'APR_HepG2_NuclearSize_24h_dn', 'APR_HepG2_NuclearSize_72h_dn',\n       'APR_HepG2_NuclearSize_72h_up', 'APR_HepG2_OxidativeStress_24h_up',\n       'APR_HepG2_OxidativeStress_72h_up', 'APR_HepG2_StressKinase_1h_up',\n       'APR_HepG2_StressKinase_24h_up', 'APR_HepG2_StressKinase_72h_up',\n       'APR_HepG2_p53Act_24h_up', 'APR_HepG2_p53Act_72h_up',\n       'APR_Hepat_Apoptosis_24hr_up', 'APR_Hepat_Apoptosis_48hr_up',\n       'APR_Hepat_CellLoss_24hr_dn', 'APR_Hepat_CellLoss_48hr_dn',\n       'APR_Hepat_DNADamage_24hr_up', 'APR_Hepat_DNADamage_48hr_up',\n       'APR_Hepat_DNATexture_24hr_up', 'APR_Hepat_DNATexture_48hr_up',\n       'APR_Hepat_MitoFxnI_1hr_dn', 'APR_Hepat_MitoFxnI_24hr_dn',\n       'APR_Hepat_MitoFxnI_48hr_dn', 'APR_Hepat_NuclearSize_24hr_dn',\n       'APR_Hepat_NuclearSize_48hr_dn', 'APR_Hepat_Steatosis_24hr_up',\n       'APR_Hepat_Steatosis_48hr_up', 'ATG_AP_1_CIS_dn', 'ATG_AP_1_CIS_up',\n       'ATG_AP_2_CIS_dn', 'ATG_AP_2_CIS_up', 'ATG_AR_TRANS_dn',\n       'ATG_AR_TRANS_up', 'ATG_Ahr_CIS_dn', 'ATG_Ahr_CIS_up',\n       'ATG_BRE_CIS_dn', 'ATG_BRE_CIS_up', 'ATG_CAR_TRANS_dn',\n       'ATG_CAR_TRANS_up', 'ATG_CMV_CIS_dn', 'ATG_CMV_CIS_up',\n       'ATG_CRE_CIS_dn', 
'ATG_CRE_CIS_up', 'ATG_C_EBP_CIS_dn',\n       'ATG_C_EBP_CIS_up', 'ATG_DR4_LXR_CIS_dn', 'ATG_DR4_LXR_CIS_up',\n       'ATG_DR5_CIS_dn', 'ATG_DR5_CIS_up', 'ATG_E2F_CIS_dn',\n       'ATG_E2F_CIS_up', 'ATG_EGR_CIS_up', 'ATG_ERE_CIS_dn',\n       'ATG_ERE_CIS_up', 'ATG_ERRa_TRANS_dn', 'ATG_ERRg_TRANS_dn',\n       'ATG_ERRg_TRANS_up', 'ATG_ERa_TRANS_up', 'ATG_E_Box_CIS_dn',\n       'ATG_E_Box_CIS_up', 'ATG_Ets_CIS_dn', 'ATG_Ets_CIS_up',\n       'ATG_FXR_TRANS_up', 'ATG_FoxA2_CIS_dn', 'ATG_FoxA2_CIS_up',\n       'ATG_FoxO_CIS_dn', 'ATG_FoxO_CIS_up', 'ATG_GAL4_TRANS_dn',\n       'ATG_GATA_CIS_dn', 'ATG_GATA_CIS_up', 'ATG_GLI_CIS_dn',\n       'ATG_GLI_CIS_up', 'ATG_GRE_CIS_dn', 'ATG_GRE_CIS_up',\n       'ATG_GR_TRANS_dn', 'ATG_GR_TRANS_up', 'ATG_HIF1a_CIS_dn',\n       'ATG_HIF1a_CIS_up', 'ATG_HNF4a_TRANS_dn', 'ATG_HNF4a_TRANS_up',\n       'ATG_HNF6_CIS_dn', 'ATG_HNF6_CIS_up', 'ATG_HSE_CIS_dn',\n       'ATG_HSE_CIS_up', 'ATG_IR1_CIS_dn', 'ATG_IR1_CIS_up',\n       'ATG_ISRE_CIS_dn', 'ATG_ISRE_CIS_up', 'ATG_LXRa_TRANS_dn',\n       'ATG_LXRa_TRANS_up', 'ATG_LXRb_TRANS_dn', 'ATG_LXRb_TRANS_up',\n       'ATG_MRE_CIS_up', 'ATG_M_06_TRANS_up', 'ATG_M_19_CIS_dn',\n       'ATG_M_19_TRANS_dn', 'ATG_M_19_TRANS_up', 'ATG_M_32_CIS_dn',\n       'ATG_M_32_CIS_up', 'ATG_M_32_TRANS_dn', 'ATG_M_32_TRANS_up',\n       'ATG_M_61_TRANS_up', 'ATG_Myb_CIS_dn', 'ATG_Myb_CIS_up',\n       'ATG_Myc_CIS_dn', 'ATG_Myc_CIS_up', 'ATG_NFI_CIS_dn',\n       'ATG_NFI_CIS_up', 'ATG_NF_kB_CIS_dn', 'ATG_NF_kB_CIS_up',\n       'ATG_NRF1_CIS_dn', 'ATG_NRF1_CIS_up', 'ATG_NRF2_ARE_CIS_dn',\n       'ATG_NRF2_ARE_CIS_up', 'ATG_NURR1_TRANS_dn', 'ATG_NURR1_TRANS_up',\n       'ATG_Oct_MLP_CIS_dn', 'ATG_Oct_MLP_CIS_up', 'ATG_PBREM_CIS_dn',\n       'ATG_PBREM_CIS_up', 'ATG_PPARa_TRANS_dn', 'ATG_PPARa_TRANS_up',\n       'ATG_PPARd_TRANS_up', 'ATG_PPARg_TRANS_up', 'ATG_PPRE_CIS_dn',\n       'ATG_PPRE_CIS_up', 'ATG_PXRE_CIS_dn', 'ATG_PXRE_CIS_up',\n       'ATG_PXR_TRANS_dn', 'ATG_PXR_TRANS_up', 'ATG_Pax6_CIS_up',\n       
'ATG_RARa_TRANS_dn', 'ATG_RARa_TRANS_up', 'ATG_RARb_TRANS_dn',\n       'ATG_RARb_TRANS_up', 'ATG_RARg_TRANS_dn', 'ATG_RARg_TRANS_up',\n       'ATG_RORE_CIS_dn', 'ATG_RORE_CIS_up', 'ATG_RORb_TRANS_dn',\n       'ATG_RORg_TRANS_dn', 'ATG_RORg_TRANS_up', 'ATG_RXRa_TRANS_dn',\n       'ATG_RXRa_TRANS_up', 'ATG_RXRb_TRANS_dn', 'ATG_RXRb_TRANS_up',\n       'ATG_SREBP_CIS_dn', 'ATG_SREBP_CIS_up', 'ATG_STAT3_CIS_dn',\n       'ATG_STAT3_CIS_up', 'ATG_Sox_CIS_dn', 'ATG_Sox_CIS_up',\n       'ATG_Sp1_CIS_dn', 'ATG_Sp1_CIS_up', 'ATG_TAL_CIS_dn',\n       'ATG_TAL_CIS_up', 'ATG_TA_CIS_dn', 'ATG_TA_CIS_up',\n       'ATG_TCF_b_cat_CIS_dn', 'ATG_TCF_b_cat_CIS_up', 'ATG_TGFb_CIS_dn',\n       'ATG_TGFb_CIS_up', 'ATG_THRa1_TRANS_dn', 'ATG_THRa1_TRANS_up',\n       'ATG_VDRE_CIS_dn', 'ATG_VDRE_CIS_up', 'ATG_VDR_TRANS_dn',\n       'ATG_VDR_TRANS_up', 'ATG_XTT_Cytotoxicity_up', 'ATG_Xbp1_CIS_dn',\n       'ATG_Xbp1_CIS_up', 'ATG_p53_CIS_dn', 'ATG_p53_CIS_up',\n       'BSK_3C_Eselectin_down', 'BSK_3C_HLADR_down', 'BSK_3C_ICAM1_down',\n       'BSK_3C_IL8_down', 'BSK_3C_MCP1_down', 'BSK_3C_MIG_down',\n       'BSK_3C_Proliferation_down', 'BSK_3C_SRB_down',\n       'BSK_3C_Thrombomodulin_down', 'BSK_3C_Thrombomodulin_up',\n       'BSK_3C_TissueFactor_down', 'BSK_3C_TissueFactor_up',\n       'BSK_3C_VCAM1_down', 'BSK_3C_Vis_down', 'BSK_3C_uPAR_down',\n       'BSK_4H_Eotaxin3_down', 'BSK_4H_MCP1_down', 'BSK_4H_Pselectin_down',\n       'BSK_4H_Pselectin_up', 'BSK_4H_SRB_down', 'BSK_4H_VCAM1_down',\n       'BSK_4H_VEGFRII_down', 'BSK_4H_uPAR_down', 'BSK_4H_uPAR_up',\n       'BSK_BE3C_HLADR_down', 'BSK_BE3C_IL1a_down', 'BSK_BE3C_IP10_down',\n       'BSK_BE3C_MIG_down', 'BSK_BE3C_MMP1_down', 'BSK_BE3C_MMP1_up',\n       'BSK_BE3C_PAI1_down', 'BSK_BE3C_SRB_down', 'BSK_BE3C_TGFb1_down',\n       'BSK_BE3C_tPA_down', 'BSK_BE3C_uPAR_down', 'BSK_BE3C_uPAR_up',\n       'BSK_BE3C_uPA_down', 'BSK_CASM3C_HLADR_down', 'BSK_CASM3C_IL6_down',\n       'BSK_CASM3C_IL6_up', 'BSK_CASM3C_IL8_down', 
'BSK_CASM3C_LDLR_down',\n       'BSK_CASM3C_LDLR_up', 'BSK_CASM3C_MCP1_down', 'BSK_CASM3C_MCP1_up',\n       'BSK_CASM3C_MCSF_down', 'BSK_CASM3C_MCSF_up', 'BSK_CASM3C_MIG_down',\n       'BSK_CASM3C_Proliferation_down', 'BSK_CASM3C_Proliferation_up',\n       'BSK_CASM3C_SAA_down', 'BSK_CASM3C_SAA_up', 'BSK_CASM3C_SRB_down',\n       'BSK_CASM3C_Thrombomodulin_down', 'BSK_CASM3C_Thrombomodulin_up',\n       'BSK_CASM3C_TissueFactor_down', 'BSK_CASM3C_VCAM1_down',\n       'BSK_CASM3C_VCAM1_up', 'BSK_CASM3C_uPAR_down', 'BSK_CASM3C_uPAR_up',\n       'BSK_KF3CT_ICAM1_down', 'BSK_KF3CT_IL1a_down',\n       'BSK_KF3CT_IP10_down', 'BSK_KF3CT_IP10_up', 'BSK_KF3CT_MCP1_down',\n       'BSK_KF3CT_MCP1_up', 'BSK_KF3CT_MMP9_down', 'BSK_KF3CT_SRB_down',\n       'BSK_KF3CT_TGFb1_down', 'BSK_KF3CT_TIMP2_down',\n       'BSK_KF3CT_uPA_down', 'BSK_LPS_CD40_down', 'BSK_LPS_Eselectin_down',\n       'BSK_LPS_Eselectin_up', 'BSK_LPS_IL1a_down', 'BSK_LPS_IL1a_up',\n       'BSK_LPS_IL8_down', 'BSK_LPS_IL8_up', 'BSK_LPS_MCP1_down',\n       'BSK_LPS_MCSF_down', 'BSK_LPS_PGE2_down', 'BSK_LPS_PGE2_up',\n       'BSK_LPS_SRB_down', 'BSK_LPS_TNFa_down', 'BSK_LPS_TNFa_up',\n       'BSK_LPS_TissueFactor_down', 'BSK_LPS_TissueFactor_up',\n       'BSK_LPS_VCAM1_down', 'BSK_SAg_CD38_down', 'BSK_SAg_CD40_down',\n       'BSK_SAg_CD69_down', 'BSK_SAg_Eselectin_down',\n       'BSK_SAg_Eselectin_up', 'BSK_SAg_IL8_down', 'BSK_SAg_IL8_up',\n       'BSK_SAg_MCP1_down', 'BSK_SAg_MIG_down',\n       'BSK_SAg_PBMCCytotoxicity_down', 'BSK_SAg_PBMCCytotoxicity_up',\n       'BSK_SAg_Proliferation_down', 'BSK_SAg_SRB_down',\n       'BSK_hDFCGF_CollagenIII_down', 'BSK_hDFCGF_EGFR_down',\n       'BSK_hDFCGF_EGFR_up', 'BSK_hDFCGF_IL8_down', 'BSK_hDFCGF_IP10_down',\n       'BSK_hDFCGF_MCSF_down', 'BSK_hDFCGF_MIG_down',\n       'BSK_hDFCGF_MMP1_down', 'BSK_hDFCGF_MMP1_up',\n       'BSK_hDFCGF_PAI1_down', 'BSK_hDFCGF_Proliferation_down',\n       'BSK_hDFCGF_SRB_down', 'BSK_hDFCGF_TIMP1_down',\n       'BSK_hDFCGF_VCAM1_down', 
'CEETOX_H295R_11DCORT_dn',\n       'CEETOX_H295R_ANDR_dn', 'CEETOX_H295R_CORTISOL_dn',\n       'CEETOX_H295R_DOC_dn', 'CEETOX_H295R_DOC_up',\n       'CEETOX_H295R_ESTRADIOL_dn', 'CEETOX_H295R_ESTRADIOL_up',\n       'CEETOX_H295R_ESTRONE_dn', 'CEETOX_H295R_ESTRONE_up',\n       'CEETOX_H295R_OHPREG_up', 'CEETOX_H295R_OHPROG_dn',\n       'CEETOX_H295R_OHPROG_up', 'CEETOX_H295R_PROG_up',\n       'CEETOX_H295R_TESTO_dn', 'CLD_ABCB1_48hr', 'CLD_ABCG2_48hr',\n       'CLD_CYP1A1_24hr', 'CLD_CYP1A1_48hr', 'CLD_CYP1A1_6hr',\n       'CLD_CYP1A2_24hr', 'CLD_CYP1A2_48hr', 'CLD_CYP1A2_6hr',\n       'CLD_CYP2B6_24hr', 'CLD_CYP2B6_48hr', 'CLD_CYP2B6_6hr',\n       'CLD_CYP3A4_24hr', 'CLD_CYP3A4_48hr', 'CLD_CYP3A4_6hr',\n       'CLD_GSTA2_48hr', 'CLD_SULT2A_24hr', 'CLD_SULT2A_48hr',\n       'CLD_UGT1A1_24hr', 'CLD_UGT1A1_48hr', 'NCCT_HEK293T_CellTiterGLO',\n       'NCCT_QuantiLum_inhib_2_dn', 'NCCT_QuantiLum_inhib_dn',\n       'NCCT_TPO_AUR_dn', 'NCCT_TPO_GUA_dn',\n       'NHEERL_ZF_144hpf_TERATOSCORE_up', 'NVS_ADME_hCYP19A1',\n       'NVS_ADME_hCYP1A1', 'NVS_ADME_hCYP1A2', 'NVS_ADME_hCYP2A6',\n       'NVS_ADME_hCYP2B6', 'NVS_ADME_hCYP2C19', 'NVS_ADME_hCYP2C9',\n       'NVS_ADME_hCYP2D6', 'NVS_ADME_hCYP3A4', 'NVS_ADME_hCYP4F12',\n       'NVS_ADME_rCYP2C12', 'NVS_ENZ_hAChE', 'NVS_ENZ_hAMPKa1',\n       'NVS_ENZ_hAurA', 'NVS_ENZ_hBACE', 'NVS_ENZ_hCASP5', 'NVS_ENZ_hCK1D',\n       'NVS_ENZ_hDUSP3', 'NVS_ENZ_hES', 'NVS_ENZ_hElastase',\n       'NVS_ENZ_hFGFR1', 'NVS_ENZ_hGSK3b', 'NVS_ENZ_hMMP1',\n       'NVS_ENZ_hMMP13', 'NVS_ENZ_hMMP2', 'NVS_ENZ_hMMP3', 'NVS_ENZ_hMMP7',\n       'NVS_ENZ_hMMP9', 'NVS_ENZ_hPDE10', 'NVS_ENZ_hPDE4A1',\n       'NVS_ENZ_hPDE5', 'NVS_ENZ_hPI3Ka', 'NVS_ENZ_hPTEN',\n       'NVS_ENZ_hPTPN11', 'NVS_ENZ_hPTPN12', 'NVS_ENZ_hPTPN13',\n       'NVS_ENZ_hPTPN9', 'NVS_ENZ_hPTPRC', 'NVS_ENZ_hSIRT1',\n       'NVS_ENZ_hSIRT2', 'NVS_ENZ_hTrkA', 'NVS_ENZ_hVEGFR2',\n       'NVS_ENZ_oCOX1', 'NVS_ENZ_oCOX2', 'NVS_ENZ_rAChE', 'NVS_ENZ_rCNOS',\n       'NVS_ENZ_rMAOAC', 
'NVS_ENZ_rMAOAP', 'NVS_ENZ_rMAOBC',\n       'NVS_ENZ_rMAOBP', 'NVS_ENZ_rabI2C', 'NVS_GPCR_bAdoR_NonSelective',\n       'NVS_GPCR_bDR_NonSelective', 'NVS_GPCR_g5HT4', 'NVS_GPCR_gH2',\n       'NVS_GPCR_gLTB4', 'NVS_GPCR_gLTD4',\n       'NVS_GPCR_gMPeripheral_NonSelective', 'NVS_GPCR_gOpiateK',\n       'NVS_GPCR_h5HT2A', 'NVS_GPCR_h5HT5A', 'NVS_GPCR_h5HT6',\n       'NVS_GPCR_h5HT7', 'NVS_GPCR_hAT1', 'NVS_GPCR_hAdoRA1',\n       'NVS_GPCR_hAdoRA2a', 'NVS_GPCR_hAdra2A', 'NVS_GPCR_hAdra2C',\n       'NVS_GPCR_hAdrb1', 'NVS_GPCR_hAdrb2', 'NVS_GPCR_hAdrb3',\n       'NVS_GPCR_hDRD1', 'NVS_GPCR_hDRD2s', 'NVS_GPCR_hDRD4.4',\n       'NVS_GPCR_hH1', 'NVS_GPCR_hLTB4_BLT1', 'NVS_GPCR_hM1',\n       'NVS_GPCR_hM2', 'NVS_GPCR_hM3', 'NVS_GPCR_hM4', 'NVS_GPCR_hNK2',\n       'NVS_GPCR_hOpiate_D1', 'NVS_GPCR_hOpiate_mu', 'NVS_GPCR_hTXA2',\n       'NVS_GPCR_p5HT2C', 'NVS_GPCR_r5HT1_NonSelective',\n       'NVS_GPCR_r5HT_NonSelective', 'NVS_GPCR_rAdra1B',\n       'NVS_GPCR_rAdra1_NonSelective', 'NVS_GPCR_rAdra2_NonSelective',\n       'NVS_GPCR_rAdrb_NonSelective', 'NVS_GPCR_rNK1', 'NVS_GPCR_rNK3',\n       'NVS_GPCR_rOpiate_NonSelective', 'NVS_GPCR_rOpiate_NonSelectiveNa',\n       'NVS_GPCR_rSST', 'NVS_GPCR_rTRH', 'NVS_GPCR_rV1', 'NVS_GPCR_rabPAF',\n       'NVS_GPCR_rmAdra2B', 'NVS_IC_hKhERGCh', 'NVS_IC_rCaBTZCHL',\n       'NVS_IC_rCaDHPRCh_L', 'NVS_IC_rNaCh_site2', 'NVS_LGIC_bGABARa1',\n       'NVS_LGIC_h5HT3', 'NVS_LGIC_hNNR_NBungSens',\n       'NVS_LGIC_rGABAR_NonSelective', 'NVS_LGIC_rNNR_BungSens',\n       'NVS_MP_hPBR', 'NVS_MP_rPBR', 'NVS_NR_bER', 'NVS_NR_bPR',\n       'NVS_NR_cAR', 'NVS_NR_hAR', 'NVS_NR_hCAR_Antagonist', 'NVS_NR_hER',\n       'NVS_NR_hFXR_Agonist', 'NVS_NR_hFXR_Antagonist', 'NVS_NR_hGR',\n       'NVS_NR_hPPARa', 'NVS_NR_hPPARg', 'NVS_NR_hPR', 'NVS_NR_hPXR',\n       'NVS_NR_hRAR_Antagonist', 'NVS_NR_hRARa_Agonist',\n       'NVS_NR_hTRa_Antagonist', 'NVS_NR_mERa', 'NVS_NR_rAR', 'NVS_NR_rMR',\n       'NVS_OR_gSIGMA_NonSelective', 'NVS_TR_gDAT', 'NVS_TR_hAdoT',\n       
'NVS_TR_hDAT', 'NVS_TR_hNET', 'NVS_TR_hSERT', 'NVS_TR_rNET',\n       'NVS_TR_rSERT', 'NVS_TR_rVMAT2', 'OT_AR_ARELUC_AG_1440',\n       'OT_AR_ARSRC1_0480', 'OT_AR_ARSRC1_0960', 'OT_ER_ERaERa_0480',\n       'OT_ER_ERaERa_1440', 'OT_ER_ERaERb_0480', 'OT_ER_ERaERb_1440',\n       'OT_ER_ERbERb_0480', 'OT_ER_ERbERb_1440', 'OT_ERa_EREGFP_0120',\n       'OT_ERa_EREGFP_0480', 'OT_FXR_FXRSRC1_0480', 'OT_FXR_FXRSRC1_1440',\n       'OT_NURR1_NURR1RXRa_0480', 'OT_NURR1_NURR1RXRa_1440',\n       'TOX21_ARE_BLA_Agonist_ch1', 'TOX21_ARE_BLA_Agonist_ch2',\n       'TOX21_ARE_BLA_agonist_ratio', 'TOX21_ARE_BLA_agonist_viability',\n       'TOX21_AR_BLA_Agonist_ch1', 'TOX21_AR_BLA_Agonist_ch2',\n       'TOX21_AR_BLA_Agonist_ratio', 'TOX21_AR_BLA_Antagonist_ch1',\n       'TOX21_AR_BLA_Antagonist_ch2', 'TOX21_AR_BLA_Antagonist_ratio',\n       'TOX21_AR_BLA_Antagonist_viability', 'TOX21_AR_LUC_MDAKB2_Agonist',\n       'TOX21_AR_LUC_MDAKB2_Antagonist', 'TOX21_AR_LUC_MDAKB2_Antagonist2',\n       'TOX21_AhR_LUC_Agonist', 'TOX21_Aromatase_Inhibition',\n       'TOX21_AutoFluor_HEK293_Cell_blue',\n       'TOX21_AutoFluor_HEK293_Media_blue',\n       'TOX21_AutoFluor_HEPG2_Cell_blue',\n       'TOX21_AutoFluor_HEPG2_Cell_green',\n       'TOX21_AutoFluor_HEPG2_Media_blue',\n       'TOX21_AutoFluor_HEPG2_Media_green', 'TOX21_ELG1_LUC_Agonist',\n       'TOX21_ERa_BLA_Agonist_ch1', 'TOX21_ERa_BLA_Agonist_ch2',\n       'TOX21_ERa_BLA_Agonist_ratio', 'TOX21_ERa_BLA_Antagonist_ch1',\n       'TOX21_ERa_BLA_Antagonist_ch2', 'TOX21_ERa_BLA_Antagonist_ratio',\n       'TOX21_ERa_BLA_Antagonist_viability', 'TOX21_ERa_LUC_BG1_Agonist',\n       'TOX21_ERa_LUC_BG1_Antagonist', 'TOX21_ESRE_BLA_ch1',\n       'TOX21_ESRE_BLA_ch2', 'TOX21_ESRE_BLA_ratio',\n       'TOX21_ESRE_BLA_viability', 'TOX21_FXR_BLA_Antagonist_ch1',\n       'TOX21_FXR_BLA_Antagonist_ch2', 'TOX21_FXR_BLA_agonist_ch2',\n       'TOX21_FXR_BLA_agonist_ratio', 'TOX21_FXR_BLA_antagonist_ratio',\n       'TOX21_FXR_BLA_antagonist_viability', 
'TOX21_GR_BLA_Agonist_ch1',\n       'TOX21_GR_BLA_Agonist_ch2', 'TOX21_GR_BLA_Agonist_ratio',\n       'TOX21_GR_BLA_Antagonist_ch2', 'TOX21_GR_BLA_Antagonist_ratio',\n       'TOX21_GR_BLA_Antagonist_viability', 'TOX21_HSE_BLA_agonist_ch1',\n       'TOX21_HSE_BLA_agonist_ch2', 'TOX21_HSE_BLA_agonist_ratio',\n       'TOX21_HSE_BLA_agonist_viability', 'TOX21_MMP_ratio_down',\n       'TOX21_MMP_ratio_up', 'TOX21_MMP_viability',\n       'TOX21_NFkB_BLA_agonist_ch1', 'TOX21_NFkB_BLA_agonist_ch2',\n       'TOX21_NFkB_BLA_agonist_ratio', 'TOX21_NFkB_BLA_agonist_viability',\n       'TOX21_PPARd_BLA_Agonist_viability',\n       'TOX21_PPARd_BLA_Antagonist_ch1', 'TOX21_PPARd_BLA_agonist_ch1',\n       'TOX21_PPARd_BLA_agonist_ch2', 'TOX21_PPARd_BLA_agonist_ratio',\n       'TOX21_PPARd_BLA_antagonist_ratio',\n       'TOX21_PPARd_BLA_antagonist_viability',\n       'TOX21_PPARg_BLA_Agonist_ch1', 'TOX21_PPARg_BLA_Agonist_ch2',\n       'TOX21_PPARg_BLA_Agonist_ratio', 'TOX21_PPARg_BLA_Antagonist_ch1',\n       'TOX21_PPARg_BLA_antagonist_ratio',\n       'TOX21_PPARg_BLA_antagonist_viability', 'TOX21_TR_LUC_GH3_Agonist',\n       'TOX21_TR_LUC_GH3_Antagonist', 'TOX21_VDR_BLA_Agonist_viability',\n       'TOX21_VDR_BLA_Antagonist_ch1', 'TOX21_VDR_BLA_agonist_ch2',\n       'TOX21_VDR_BLA_agonist_ratio', 'TOX21_VDR_BLA_antagonist_ratio',\n       'TOX21_VDR_BLA_antagonist_viability', 'TOX21_p53_BLA_p1_ch1',\n       'TOX21_p53_BLA_p1_ch2', 'TOX21_p53_BLA_p1_ratio',\n       'TOX21_p53_BLA_p1_viability', 'TOX21_p53_BLA_p2_ch1',\n       'TOX21_p53_BLA_p2_ch2', 'TOX21_p53_BLA_p2_ratio',\n       'TOX21_p53_BLA_p2_viability', 'TOX21_p53_BLA_p3_ch1',\n       'TOX21_p53_BLA_p3_ch2', 'TOX21_p53_BLA_p3_ratio',\n       'TOX21_p53_BLA_p3_viability', 'TOX21_p53_BLA_p4_ch1',\n       'TOX21_p53_BLA_p4_ch2', 'TOX21_p53_BLA_p4_ratio',\n       'TOX21_p53_BLA_p4_viability', 'TOX21_p53_BLA_p5_ch1',\n       'TOX21_p53_BLA_p5_ch2', 'TOX21_p53_BLA_p5_ratio',\n       'TOX21_p53_BLA_p5_viability', 
'Tanguay_ZF_120hpf_AXIS_up',\n       'Tanguay_ZF_120hpf_ActivityScore', 'Tanguay_ZF_120hpf_BRAI_up',\n       'Tanguay_ZF_120hpf_CFIN_up', 'Tanguay_ZF_120hpf_CIRC_up',\n       'Tanguay_ZF_120hpf_EYE_up', 'Tanguay_ZF_120hpf_JAW_up',\n       'Tanguay_ZF_120hpf_MORT_up', 'Tanguay_ZF_120hpf_OTIC_up',\n       'Tanguay_ZF_120hpf_PE_up', 'Tanguay_ZF_120hpf_PFIN_up',\n       'Tanguay_ZF_120hpf_PIG_up', 'Tanguay_ZF_120hpf_SNOU_up',\n       'Tanguay_ZF_120hpf_SOMI_up', 'Tanguay_ZF_120hpf_SWIM_up',\n       'Tanguay_ZF_120hpf_TRUN_up', 'Tanguay_ZF_120hpf_TR_up',\n       'Tanguay_ZF_120hpf_YSE_up']\n"
  },
  {
    "path": "chainer_chemistry/datasets/numpy_tuple_dataset.py",
    "content": "import os\nimport six\n\nimport numpy\n\nfrom chainer_chemistry.dataset.converters import concat_mols\nfrom chainer_chemistry.dataset.indexers.numpy_tuple_dataset_feature_indexer import NumpyTupleDatasetFeatureIndexer  # NOQA\n\n\nclass NumpyTupleDataset(object):\n\n    \"\"\"Dataset of a tuple of datasets.\n\n    It combines multiple datasets into one dataset. Each example is represented\n    by a tuple whose ``i``-th item corresponds to the i-th dataset.\n    And each ``i``-th dataset is expected to be an instance of numpy.ndarray.\n\n    Args:\n        datasets: Underlying datasets. The ``i``-th one is used for the\n            ``i``-th item of each example. All datasets must have the same\n            length.\n\n    \"\"\"\n\n    def __init__(self, *datasets):\n        if not datasets:\n            raise ValueError('no datasets are given')\n        length = len(datasets[0])\n        for i, dataset in enumerate(datasets):\n            if len(dataset) != length:\n                raise ValueError(\n                    'dataset of the index {} has a wrong length'.format(i))\n        self._datasets = datasets\n        self._length = length\n        self._features_indexer = NumpyTupleDatasetFeatureIndexer(self)\n\n    def __getitem__(self, index):\n        batches = [dataset[index] for dataset in self._datasets]\n        if isinstance(index, (slice, list, numpy.ndarray)):\n            length = len(batches[0])\n            return [tuple([batch[i] for batch in batches])\n                    for i in six.moves.range(length)]\n        else:\n            return tuple(batches)\n\n    def __len__(self):\n        return self._length\n\n    def get_datasets(self):\n        return self._datasets\n\n    @property\n    def converter(self):\n        return concat_mols\n\n    @property\n    def features(self):\n        \"\"\"Extract features according to the specified index.\n\n        - axis 0 is used to specify dataset id (`i`-th dataset)\n        - axis 1 is 
used to specify feature index\n\n        .. admonition:: Example\n\n           >>> import numpy\n           >>> from chainer_chemistry.datasets import NumpyTupleDataset\n           >>> x = numpy.array([0, 1, 2], dtype=numpy.float32)\n           >>> t = x * x\n           >>> numpy_tuple_dataset = NumpyTupleDataset(x, t)\n           >>> targets = numpy_tuple_dataset.features[:, 1]\n           >>> print('targets', targets)  # We can extract only target value\n           targets [0, 1, 4]\n\n        \"\"\"\n        return self._features_indexer\n\n    @classmethod\n    def save(cls, filepath, numpy_tuple_dataset):\n        \"\"\"save the dataset to filepath in npz format\n\n        Args:\n            filepath (str): filepath to save dataset. It is recommended to end\n                with '.npz' extension.\n            numpy_tuple_dataset (NumpyTupleDataset): dataset instance\n\n        \"\"\"\n        if not isinstance(numpy_tuple_dataset, NumpyTupleDataset):\n            raise TypeError('numpy_tuple_dataset is not instance of '\n                            'NumpyTupleDataset, got {}'\n                            .format(type(numpy_tuple_dataset)))\n        numpy.savez(filepath, *numpy_tuple_dataset._datasets)\n\n    @classmethod\n    def load(cls, filepath, allow_pickle=True):\n        if not os.path.exists(filepath):\n            return None\n        load_data = numpy.load(filepath, allow_pickle=allow_pickle)\n        result = []\n        i = 0\n        while True:\n            key = 'arr_{}'.format(i)\n            if key in load_data.keys():\n                result.append(load_data[key])\n                i += 1\n            else:\n                break\n        return NumpyTupleDataset(*result)\n"
  },
  {
    "path": "chainer_chemistry/datasets/qm9.py",
    "content": "import glob\nfrom logging import getLogger\nimport os\nimport shutil\nimport tarfile\nimport tempfile\n\nfrom chainer.dataset import download\nimport numpy\nimport pandas\nfrom tqdm import tqdm\n\nfrom chainer_chemistry.dataset.parsers.csv_file_parser import CSVFileParser\nfrom chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor  # NOQA\n\ndownload_url = 'https://ndownloader.figshare.com/files/3195389'\nfile_name = 'qm9.csv'\n\n_root = 'pfnet/chainer/qm9'\n\n_label_names = ['A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2',\n                'zpve', 'U0', 'U', 'H', 'G', 'Cv']\n_smiles_column_names = ['SMILES1', 'SMILES2']\n\n\ndef get_qm9_label_names():\n    \"\"\"Returns label names of QM9 datasets.\"\"\"\n    return _label_names\n\n\ndef get_qm9(preprocessor=None, labels=None, return_smiles=False,\n            target_index=None):\n    \"\"\"Downloads, caches and preprocesses QM9 dataset.\n\n    Args:\n        preprocessor (BasePreprocessor): Preprocessor.\n            This should be chosen based on the network to be trained.\n            If it is None, default `AtomicNumberPreprocessor` is used.\n        labels (str or list): List of target labels.\n        return_smiles (bool): If set to ``True``,\n            smiles array is also returned.\n        target_index (list or None): target index list to partially extract\n            dataset. 
If None (default), all examples are parsed.\n\n    Returns:\n        dataset, which is composed of `features`, which depends on\n        `preprocess_method`.\n\n    \"\"\"\n    labels = labels or get_qm9_label_names()\n    if isinstance(labels, str):\n        labels = [labels, ]\n\n    def postprocess_label(label_list):\n        # This is regression task, cast to float value.\n        return numpy.asarray(label_list, dtype=numpy.float32)\n\n    if preprocessor is None:\n        preprocessor = AtomicNumberPreprocessor()\n    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,\n                           labels=labels, smiles_col='SMILES1')\n    result = parser.parse(get_qm9_filepath(), return_smiles=return_smiles,\n                          target_index=target_index)\n\n    if return_smiles:\n        return result['dataset'], result['smiles']\n    else:\n        return result['dataset']\n\n\ndef get_qm9_filepath(download_if_not_exist=True):\n    \"\"\"Construct a filepath which stores qm9 dataset for config_name\n\n    This method check whether the file exist or not,  and downloaded it if\n    necessary.\n\n    Args:\n        download_if_not_exist (bool): If `True` download dataset\n            if it is not downloaded yet.\n\n    Returns (str): file path for qm9 dataset (formatted to csv)\n\n    \"\"\"\n    cache_path = _get_qm9_filepath()\n    if not os.path.exists(cache_path):\n        if download_if_not_exist:\n            is_successful = download_and_extract_qm9(save_filepath=cache_path)\n            if not is_successful:\n                logger = getLogger(__name__)\n                logger.warning('Download failed.')\n    return cache_path\n\n\ndef _get_qm9_filepath():\n    \"\"\"Construct a filepath which stores QM9 dataset in csv\n\n    This method does not check if the file is already downloaded or not.\n\n    Returns (str): filepath for qm9 dataset\n\n    \"\"\"\n    cache_root = download.get_dataset_directory(_root)\n    cache_path = 
os.path.join(cache_root, file_name)\n    return cache_path\n\n\ndef download_and_extract_qm9(save_filepath):\n    logger = getLogger(__name__)\n    logger.warning('Extracting QM9 dataset, it takes time...')\n    download_file_path = download.cached_download(download_url)\n    tf = tarfile.open(download_file_path, 'r')\n    temp_dir = tempfile.mkdtemp()\n    tf.extractall(temp_dir)\n    file_re = os.path.join(temp_dir, '*.xyz')\n    file_pathes = glob.glob(file_re)\n    # Make sure the order is sorted\n    file_pathes.sort()\n    ls = []\n    for path in tqdm(file_pathes):\n        with open(path, 'r') as f:\n            data = [line.strip() for line in f]\n\n        num_atom = int(data[0])\n        properties = list(map(float, data[1].split('\\t')[1:]))\n        smiles = data[3 + num_atom].split('\\t')\n        new_ls = smiles + properties\n        ls.append(new_ls)\n\n    df = pandas.DataFrame(ls, columns=_smiles_column_names + _label_names)\n    df.to_csv(save_filepath)\n    shutil.rmtree(temp_dir)\n    return True\n"
  },
  {
    "path": "chainer_chemistry/datasets/reddit/reddit.py",
    "content": "from logging import getLogger\nimport os\nfrom zipfile import ZipFile\n\nimport networkx as nx\nimport numpy\nimport scipy\n\nfrom chainer.dataset import download\n\ndownload_url = 'https://s3.us-east-2.amazonaws.com/dgl.ai/dataset/reddit.zip'\nfeat_file_name = 'reddit_data.npz'\nedge_file_name = 'reddit_graph.npz'\n\n_root = 'pfnet/chainer/reddit'\n\n\ndef reddit_to_networkx(dirpath):\n    print(\"Loading graph data\")\n    coo_adj = scipy.sparse.load_npz(os.path.join(dirpath, edge_file_name))\n    G = nx.from_scipy_sparse_matrix(coo_adj)\n\n    print(\"Loading node feature and label\")\n    # node feature, edge label\n    reddit_data = numpy.load(os.path.join(dirpath, feat_file_name))\n    G.graph['x'] = reddit_data['feature'].astype(numpy.float32)\n    G.graph['y'] = reddit_data['label'].astype(numpy.int32)\n\n    G.graph['label_num'] = 41\n    # G = nx.convert_node_labels_to_integers(G)\n    print(\"Finish loading graph: {}\".format(dirpath))\n    return G\n\n\ndef get_reddit_dirpath(download_if_not_exist=True):\n    # type: (bool) -> str\n    \"\"\"Construct a dirpath which stores reddit dataset.\n\n    This method check whether the file exist or not, and downloaded it\n    if necessary.\n\n    Args:\n        download_if_not_exist (bool): If ``True``, download dataset\n            if it is not downloaded yet.\n\n    Returns:\n        dirpath (str): directory path for reddit dataset.\n    \"\"\"\n    feat_cache_path, edge_cache_path = get_reddit_filepath(\n        download_if_not_exist=download_if_not_exist)\n    dirpath = os.path.dirname(feat_cache_path)\n    dirpath2 = os.path.dirname(edge_cache_path)\n    assert dirpath == dirpath2\n    return dirpath\n\n\ndef get_reddit_filepath(download_if_not_exist=True):\n    # type: (bool) -> Tuple[str, str]\n    \"\"\"Construct a filepath which stores reddit dataset.\n\n    This method check whether the file exist or not, and downloaded it\n    if necessary.\n\n    Args:\n        download_if_not_exist 
(bool): If ``True``, download dataset\n            if it is not downloaded yet.\n    Returns:\n        feat_cache_path (str): file path for reddit dataset (features).\n        edge_cache_path (str): file path for reddit dataset (edge index).\n    \"\"\"\n    feat_cache_path, edge_cache_path = _get_reddit_filepath()\n    if not os.path.exists(feat_cache_path):\n        if download_if_not_exist:\n            is_successful = download_and_extract_reddit(\n                save_dirpath=os.path.dirname(feat_cache_path))\n            if not is_successful:\n                logger = getLogger(__name__)\n                logger.warning('Download failed.')\n    return feat_cache_path, edge_cache_path\n\n\ndef _get_reddit_filepath():\n    # type: () -> Tuple[str, str]\n    \"\"\"Construct a filepath which stores reddit dataset.\n\n    This method does not check if the file is already downloaded or not.\n\n    Returns:\n        feat_cache_path (str): file path for reddit dataset (features).\n        edge_cache_path (str): file path for reddit dataset (edge index).\n    \"\"\"\n    cache_root = download.get_dataset_directory(_root)\n    feat_cache_path = os.path.join(cache_root, feat_file_name)\n    edge_cache_path = os.path.join(cache_root, edge_file_name)\n    return feat_cache_path, edge_cache_path\n\n\ndef download_and_extract_reddit(save_dirpath):\n    # type: (str) -> bool\n    print('downloading reddit dataset...')\n    download_file_path = download.cached_download(download_url)\n    print('extracting reddit dataset...')\n    zip = ZipFile(download_file_path, 'r')\n    zip.extractall(save_dirpath)\n    return True\n"
  },
  {
    "path": "chainer_chemistry/datasets/tox21.py",
    "content": "from logging import getLogger\nimport os\nimport shutil\nimport zipfile\n\nfrom chainer.dataset import download\nimport numpy\n\nfrom chainer_chemistry.dataset.parsers.sdf_file_parser import SDFFileParser\nfrom chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor  # NOQA\n\n\n_config = {\n    'train': {\n        'url': 'https://tripod.nih.gov/tox21/challenge/download?'\n        'id=tox21_10k_data_allsdf',\n        'filename': 'tox21_10k_data_all.sdf'\n    },\n    'val': {\n        'url': 'https://tripod.nih.gov/tox21/challenge/download?'\n        'id=tox21_10k_challenge_testsdf',\n        'filename': 'tox21_10k_challenge_test.sdf'\n    },\n    'test': {\n        'url': 'https://tripod.nih.gov/tox21/challenge/download?'\n        'id=tox21_10k_challenge_scoresdf',\n        'filename': 'tox21_10k_challenge_score.sdf'\n    }\n}\n\n_root = 'pfnet/chainer/tox21'\n\n_label_names = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',\n                'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',\n                'SR-HSE', 'SR-MMP', 'SR-p53']\n\n\ndef get_tox21_label_names():\n    \"\"\"Returns label names of Tox21 datasets.\"\"\"\n    return _label_names\n\n\ndef get_tox21(preprocessor=None, labels=None, return_smiles=False,\n              train_target_index=None, val_target_index=None,\n              test_target_index=None):\n    \"\"\"Downloads, caches and preprocesses Tox21 dataset.\n\n    Args:\n        preprocesssor (BasePreprocessor): Preprocessor.\n            This should be chosen based on the network to be trained.\n            If it is None, default `AtomicNumberPreprocessor` is used.\n        labels (str or list): List of target labels.\n        return_smiles (bool): If set to True, smiles array is also returned.\n        train_target_index (list or None): target index list to partially\n            extract train dataset. 
If None (default), all examples are parsed.\n        val_target_index (list or None): target index list to partially\n            extract val dataset. If None (default), all examples are parsed.\n        test_target_index (list or None): target index list to partially\n            extract test dataset. If None (default), all examples are parsed.\n\n    Returns:\n        The 3-tuple consisting of train, validation and test\n        datasets, respectively. Each dataset is composed of `features`,\n        which depends on `preprocess_method`.\n    \"\"\"\n    labels = labels or get_tox21_label_names()\n    if isinstance(labels, str):\n        labels = [labels, ]\n\n    def postprocess_label(label_list):\n        # Set -1 to the place where the label is not found,\n        # this corresponds to not calculate loss with `sigmoid_cross_entropy`\n        t = numpy.array([-1 if label is None else label for label in\n                         label_list], dtype=numpy.int32)\n        return t\n\n    if preprocessor is None:\n        preprocessor = AtomicNumberPreprocessor()\n    parser = SDFFileParser(preprocessor,\n                           postprocess_label=postprocess_label,\n                           labels=labels)\n\n    train_result = parser.parse(\n        get_tox21_filepath('train'), return_smiles=return_smiles,\n        target_index=train_target_index\n    )\n    val_result = parser.parse(\n        get_tox21_filepath('val'), return_smiles=return_smiles,\n        target_index=val_target_index\n    )\n\n    test_result = parser.parse(\n        get_tox21_filepath('test'), return_smiles=return_smiles,\n        target_index=test_target_index\n    )\n\n    if return_smiles:\n        train, train_smiles = train_result['dataset'], train_result['smiles']\n        val, val_smiles = val_result['dataset'], val_result['smiles']\n        test, test_smiles = test_result['dataset'], test_result['smiles']\n        return train, val, test, train_smiles, val_smiles, test_smiles\n    
else:\n        train = train_result['dataset']\n        val = val_result['dataset']\n        test = test_result['dataset']\n        return train, val, test\n\n\ndef _get_tox21_filepath(dataset_type):\n    \"\"\"Returns a file path in which the tox21 dataset is cached.\n\n    This function returns a file path in which `dataset_type`\n    of the tox21 dataset is cached.\n    Note that this function does not check if the dataset has actually\n    been downloaded or not.\n\n    Args:\n        dataset_type(str): Name of the target dataset type.\n            Either 'train', 'val', or 'test'.\n\n    Returns (str): file path for the tox21 dataset\n\n    \"\"\"\n    if dataset_type not in _config.keys():\n        raise ValueError(\"Invalid dataset type '{}'. Accepted values are \"\n                         \"'train', 'val' or 'test'.\".format(dataset_type))\n\n    c = _config[dataset_type]\n    sdffile = c['filename']\n\n    cache_root = download.get_dataset_directory(_root)\n    cache_path = os.path.join(cache_root, sdffile)\n    return cache_path\n\n\ndef get_tox21_filepath(dataset_type, download_if_not_exist=True):\n    \"\"\"Returns a file path in which the tox21 dataset is cached.\n\n    This function returns a file path in which `dataset_type`\n    of the tox21 dataset is or will be cached.\n\n    If the dataset is not cached and if ``download_if_not_exist``\n    is ``True``, this function also downloads the dataset.\n\n    Args:\n        dataset_type: Name of the target dataset type.\n            Either 'train', 'val', or 'test'\n        download_if_not_exist (bool): If `True` download dataset\n            if it is not downloaded yet.\n\n    Returns (str): file path for tox21 dataset\n\n    \"\"\"\n    cache_filepath = _get_tox21_filepath(dataset_type)\n    if not os.path.exists(cache_filepath):\n        if download_if_not_exist:\n            is_successful = _download_and_extract_tox21(dataset_type,\n                                                        
cache_filepath)\n            if not is_successful:\n                logger = getLogger(__name__)\n                logger.warning('Download failed.')\n    return cache_filepath\n\n\ndef _download_and_extract_tox21(config_name, save_filepath):\n    is_successful = False\n    c = _config[config_name]\n    url = c['url']\n    sdffile = c['filename']\n\n    # Download tox21 dataset\n    download_file_path = download.cached_download(url)\n\n    # Extract zipfile to get sdffile\n    with zipfile.ZipFile(download_file_path, 'r') as z:\n        z.extract(sdffile)\n        shutil.move(sdffile, save_filepath)\n\n    is_successful = True\n    return is_successful\n\n\ndef download_and_extract_tox21():\n    \"\"\"Downloads and extracts Tox21 dataset.\n\n    Returns: None\n\n    \"\"\"\n    for config in ['train', 'val', 'test']:\n        _download_and_extract_tox21(config, _get_tox21_filepath(config))\n"
  },
  {
    "path": "chainer_chemistry/datasets/zinc.py",
    "content": "from logging import getLogger\nimport os\n\nfrom chainer.dataset import download\nimport numpy\nimport pandas\n\nfrom chainer_chemistry.dataset.parsers.csv_file_parser import CSVFileParser\nfrom chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor  # NOQA\n\ndownload_url = 'https://raw.githubusercontent.com/aspuru-guzik-group/chemical_vae/master/models/zinc_properties/250k_rndm_zinc_drugs_clean_3.csv'  # NOQA\nfile_name_250k = 'zinc250k.csv'\n\n_root = 'pfnet/chainer/zinc'\n\n_label_names = ['logP', 'qed', 'SAS']\n_smiles_column_names = ['smiles']\n\n\ndef get_zinc250k_label_names():\n    \"\"\"Returns label names of ZINC250k datasets.\"\"\"\n    return _label_names\n\n\ndef get_zinc250k(preprocessor=None, labels=None, return_smiles=False,\n                 target_index=None):\n    \"\"\"Downloads, caches and preprocesses Zinc 250K dataset.\n\n    Args:\n        preprocessor (BasePreprocessor): Preprocessor.\n            This should be chosen based on the network to be trained.\n            If it is None, default `AtomicNumberPreprocessor` is used.\n        labels (str or list): List of target labels.\n        return_smiles (bool): If set to ``True``,\n            smiles array is also returned.\n        target_index (list or None): target index list to partially extract\n            dataset. 
If None (default), all examples are parsed.\n\n    Returns:\n        dataset, which is composed of `features`, which depends on\n        `preprocess_method`.\n\n    \"\"\"\n    labels = labels or get_zinc250k_label_names()\n    if isinstance(labels, str):\n        labels = [labels, ]\n\n    def postprocess_label(label_list):\n        # This is regression task, cast to float value.\n        return numpy.asarray(label_list, dtype=numpy.float32)\n\n    if preprocessor is None:\n        preprocessor = AtomicNumberPreprocessor()\n    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,\n                           labels=labels, smiles_col='smiles')\n    result = parser.parse(get_zinc250k_filepath(), return_smiles=return_smiles,\n                          target_index=target_index)\n\n    if return_smiles:\n        return result['dataset'], result['smiles']\n    else:\n        return result['dataset']\n\n\ndef get_zinc250k_filepath(download_if_not_exist=True):\n    \"\"\"Construct a filepath which stores ZINC250k dataset for config_name\n\n    This method check whether the file exist or not,  and downloaded it if\n    necessary.\n\n    Args:\n        download_if_not_exist (bool): If `True` download dataset\n            if it is not downloaded yet.\n\n    Returns (str): file path for ZINC250k dataset (csv format)\n\n    \"\"\"\n    cache_path = _get_zinc250k_filepath()\n    if not os.path.exists(cache_path):\n        if download_if_not_exist:\n            is_successful = download_and_extract_zinc250k(\n                save_filepath=cache_path)\n            if not is_successful:\n                logger = getLogger(__name__)\n                logger.warning('Download failed.')\n    return cache_path\n\n\ndef _get_zinc250k_filepath():\n    \"\"\"Construct a filepath which stores ZINC250k dataset in csv\n\n    This method does not check if the file is already downloaded or not.\n\n    Returns (str): filepath for ZINC250k dataset\n\n    \"\"\"\n    
cache_root = download.get_dataset_directory(_root)\n    cache_path = os.path.join(cache_root, file_name_250k)\n    return cache_path\n\n\ndef _remove_new_line(s):\n    return s.replace('\\n', '')\n\n\ndef download_and_extract_zinc250k(save_filepath):\n    logger = getLogger(__name__)\n    logger.info('Extracting ZINC250k dataset...')\n    download_file_path = download.cached_download(download_url)\n    df = pandas.read_csv(download_file_path)\n    # 'smiles' column contains '\\n', need to remove it.\n    df['smiles'] = df['smiles'].apply(_remove_new_line)\n    df.to_csv(save_filepath, columns=_smiles_column_names + _label_names)\n    return True\n"
  },
  {
    "path": "chainer_chemistry/functions/__init__.py",
    "content": "from chainer_chemistry.functions.activation.megnet_softplus import megnet_softplus  # NOQA\nfrom chainer_chemistry.functions.activation.shifted_softplus import shifted_softplus  # NOQA\nfrom chainer_chemistry.functions.activation.softmax import softmax  # NOQA\n\nfrom chainer_chemistry.functions.evaluation.r2_score import r2_score  # NOQA\nfrom chainer_chemistry.functions.evaluation.r2_score import R2Score  # NOQA\n\nfrom chainer_chemistry.functions.loss.mean_absolute_error import mean_absolute_error  # NOQA\nfrom chainer_chemistry.functions.loss.mean_absolute_error import MeanAbsoluteError  # NOQA\nfrom chainer_chemistry.functions.loss.mean_squared_error import mean_squared_error  # NOQA\nfrom chainer_chemistry.functions.loss.mean_squared_error import MeanSquaredError  # NOQA\n\nfrom chainer_chemistry.functions.math.matmul import matmul  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/functions/activation/__init__.py",
    "content": ""
  },
  {
    "path": "chainer_chemistry/functions/activation/megnet_softplus.py",
    "content": "from chainer import functions\n\n\ndef megnet_softplus(x):\n    \"\"\"Modified softplus function used by MEGNet\n\n    The original implemantation is below.\n    https://github.com/materialsvirtuallab/megnet/blob/f91773f0f3fa8402b494638af9ef2ed2807fcba7/megnet/activations.py#L6\n\n    Args:\n        x (Variable): Input variable\n    Returns:\n        output (Variable): Output variable whose shape is same with `x`\n    \"\"\"\n    return functions.relu(x) + \\\n        functions.log(0.5 * functions.exp(-functions.absolute(x)) + 0.5)\n"
  },
  {
    "path": "chainer_chemistry/functions/activation/shifted_softplus.py",
    "content": "import chainer\nfrom chainer import functions\n\n\ndef shifted_softplus(x, beta=1, shift=0.5, threshold=20):\n    \"\"\"shifted softplus function, which holds f(0)=0.\n\n     Args:\n        x (Variable): Input variable\n        beta (float): Parameter :math:`\\\\beta`.\n        shift (float): Shift Parameter\n        threshold (float): threshold to avoid overflow\n\n    Returns:\n        output (Variable): Output variable whose shape is same with `x`\n    \"\"\"\n    xp = chainer.cuda.get_array_module(x)\n    cond = chainer.as_variable(x).array > threshold\n    x = functions.where(cond, x,\n                        functions.softplus(x, beta=beta))\n    x += xp.log(shift)\n    return x\n"
  },
  {
    "path": "chainer_chemistry/functions/activation/softmax.py",
    "content": "from chainer import functions\n\n\ndef softmax(x, axis=1, mask=None, mask_value=1e10):\n    \"\"\"softmax function, which supports `mask`.\n\n    Args:\n        x (Variable): Input variable\n        axis (int): The axis along which the softmax is to be computed.\n        mask (Variable or None):\n            Default value is `None` which does not use mask,\n            this case the result is same with original `softmax` computation.\n            When `mask` is not `None`, it is assumed to have value 1 or 0.\n            1 indicates actual feature, and 0 indicates virtual feature to be\n            masked.\n        mask_value (int): The value used for masking.\n\n    Returns:\n        output (Variable): Output variable whose shape is same with `x`\n\n    \"\"\"\n    if mask is None:\n        h = x\n    else:\n        if x.shape != mask.shape:\n            raise ValueError(\"x.shape={} and mask.shape={} must be same!\"\n                             .format(x.shape, mask.shape))\n        h = x + (mask - 1.) * mask_value\n    return functions.softmax(h, axis=axis)\n"
  },
  {
    "path": "chainer_chemistry/functions/evaluation/__init__.py",
    "content": ""
  },
  {
    "path": "chainer_chemistry/functions/evaluation/r2_score.py",
    "content": "from chainer.backends import cuda\nfrom chainer import function\nfrom chainer.utils import type_check\n\n\nclass R2Score(function.Function):\n\n    def __init__(self, sample_weight, multioutput, ignore_nan=False):\n        if sample_weight is not None:\n            raise NotImplementedError()\n        if multioutput in ['uniform_average', 'raw_values']:\n            self.multioutput = multioutput\n        else:\n            raise ValueError(\"invalid multioutput argument\")\n        self.ignore_nan = ignore_nan\n\n    def check_type_forward(self, in_types):\n        type_check.expect(in_types.size() == 2)\n        pred_type, true_type = in_types\n\n        type_check.expect(\n            pred_type.dtype.kind == 'f',\n            true_type.dtype.kind == 'f'\n        )\n\n        type_check.expect(\n            pred_type.shape == true_type.shape,\n        )\n\n    def forward(self, inputs):\n        xp = cuda.get_array_module(*inputs)\n        pred, true = inputs\n        diff = pred - true\n        dev = true - xp.mean(true, axis=0)\n        if self.ignore_nan:\n            diff[xp.isnan(diff)] = 0.\n            dev[xp.isnan(dev)] = 0.\n        SS_res = xp.asarray(\n            xp.sum(diff ** 2, axis=0))\n        SS_tot = xp.asarray(\n            xp.sum(dev ** 2, axis=0))\n        SS_tot_iszero = SS_tot == 0\n        SS_tot[SS_tot_iszero] = 1  # Assign dummy value to avoid zero-division\n        ret = xp.where(\n            SS_tot_iszero, 0.0, 1 - SS_res / SS_tot).astype(pred.dtype)\n        if self.multioutput == 'uniform_average':\n            return xp.asarray(ret.mean()),\n        elif self.multioutput == 'raw_values':\n            return ret,\n\n\ndef r2_score(pred, true, sample_weight=None, multioutput='uniform_average',\n             ignore_nan=False):\n    \"\"\"Computes R^2(coefficient of determination) regression score function.\n\n    Args:\n        pred(Variable): Variable holding a vector, matrix or tensor of\n                estimated 
target values.\n        true(Variable): Variable holding a vector, matrix or tensor of\n                correct target values.\n        sample_weight: This argument is for compatibility with scikit-learn's\n                implementation of r2_score. Current implementation admits None\n                only.\n        multioutput(string): ['uniform_average', 'raw_values']. if\n                'uniform_average', this function returns an average of R^2\n                score of multiple output. If 'raw_average', this function\n                return a set of R^2 score of multiple output.\n\n    Returns:\n        Variable: A Variable holding a scalar array of the R^2 score if\n        'multioutput' is 'uniform_average' or a vector of R^2 scores if\n        'multioutput' is 'raw_values'.\n\n    .. note:: This function is non-differentiable.\n    \"\"\"\n    return R2Score(sample_weight=sample_weight,\n                   multioutput=multioutput, ignore_nan=ignore_nan)(pred, true)\n"
  },
  {
    "path": "chainer_chemistry/functions/loss/__init__.py",
    "content": ""
  },
  {
    "path": "chainer_chemistry/functions/loss/mean_absolute_error.py",
    "content": "import numpy\n\nimport chainer\nfrom chainer.backends import cuda\nfrom chainer import function_node\nfrom chainer.utils import type_check\n\n\nclass MeanAbsoluteError(function_node.FunctionNode):\n\n    \"\"\"Mean absolute error function.\"\"\"\n\n    def __init__(self, ignore_nan=False):\n        # TODO(mottodora): implement task weight calculation\n        self.ignore_nan = ignore_nan\n\n    def check_type_forward(self, in_types):\n        type_check.expect(in_types.size() == 2)\n        type_check.expect(\n            in_types[0].dtype == numpy.float32,\n            in_types[1].dtype == numpy.float32,\n            in_types[0].shape == in_types[1].shape\n        )\n\n    def forward_cpu(self, inputs):\n        self.retain_inputs((0, 1))\n        x0, x1 = inputs\n        diff = (inputs[0] - inputs[1]).ravel()\n        # TODO(mottodora): add reduce option\n        if self.ignore_nan:\n            diff[numpy.isnan(diff)] = 0.\n        return numpy.array(abs(diff).sum() / diff.size, dtype=diff.dtype),\n\n    def forward_gpu(self, inputs):\n        self.retain_inputs((0, 1))\n        cupy = cuda.cupy\n        diff = (inputs[0] - inputs[1]).ravel()\n        if self.ignore_nan:\n            diff[cupy.isnan(diff)] = 0.\n        return abs(diff).sum() / diff.dtype.type(diff.size),\n\n    def backward(self, indexes, gy):\n        x0, x1 = self.get_retained_inputs()\n        xp = cuda.get_array_module(x0)\n        diff = x0 - x1\n        if self.ignore_nan:\n            diff = chainer.functions.where(xp.isnan(diff.array),\n                                           xp.zeros_like(diff.array), diff)\n        gy0 = chainer.functions.broadcast_to(gy[0], diff.shape)\n        gx0 = gy0 * chainer.functions.sign(diff) * 1. / diff.size\n        return gx0, -gx0\n\n\ndef mean_absolute_error(x0, x1, ignore_nan=False):\n    \"\"\"Mean absolute error function.\n\n    This function computes mean absolute error between two variables. 
The mean\n    is taken over the minibatch.\n\n    Args:\n        x0 (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \\\n        :class:`cupy.ndarray`): Input variable.\n        x1 (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \\\n        :class:`cupy.ndarray`): Input variable.\n        ignore_nan (bool): If `True`, this function compute mean absolute error\n            ignoring NaNs. The arithmetic mean is the sum of the non-NaN\n            elements along the axis divided by the number of whole elements.\n\n    Returns:\n        ~chainer.Variable:\n            A variable holding an array representing the mean absolute\n            error of two inputs.\n    \"\"\"\n    return MeanAbsoluteError(ignore_nan).apply((x0, x1))[0]\n"
  },
  {
    "path": "chainer_chemistry/functions/loss/mean_squared_error.py",
    "content": "import numpy\n\nfrom chainer import cuda\nfrom chainer import function_node\nimport chainer.functions\nfrom chainer.utils import type_check\n\n\nclass MeanSquaredError(function_node.FunctionNode):\n\n    \"\"\"Mean squared error (a.k.a. Euclidean loss) function.\"\"\"\n\n    def __init__(self, ignore_nan=False):\n        # TODO(mottodora): implement task weight calculation\n        self.ignore_nan = ignore_nan\n\n    def check_type_forward(self, in_types):\n        type_check.expect(in_types.size() == 2)\n        type_check.expect(\n            in_types[0].dtype == numpy.float32,\n            in_types[1].dtype == numpy.float32,\n            in_types[0].shape == in_types[1].shape\n        )\n\n    def forward_cpu(self, inputs):\n        self.retain_inputs((0, 1))\n        diff = (inputs[0] - inputs[1]).ravel()\n        # TODO(mottodora): add reduce option\n        if self.ignore_nan:\n            diff[numpy.isnan(diff)] = 0.\n        return numpy.array(diff.dot(diff) / diff.size, dtype=diff.dtype),\n\n    def forward_gpu(self, inputs):\n        cupy = cuda.cupy\n        self.retain_inputs((0, 1))\n        diff = (inputs[0] - inputs[1]).ravel()\n        # TODO(mottodora): add reduce option\n        if self.ignore_nan:\n            diff[cupy.isnan(diff)] = 0.\n        return diff.dot(diff) / diff.dtype.type(diff.size),\n\n    def backward(self, indexes, gy):\n        x0, x1 = self.get_retained_inputs()\n        xp = cuda.get_array_module(x0)\n        ret = []\n        diff = x0 - x1\n        if self.ignore_nan:\n            diff = chainer.functions.where(xp.isnan(diff.array),\n                                           xp.zeros_like(diff.array), diff)\n        gy0 = chainer.functions.broadcast_to(gy[0], diff.shape)\n        gx0 = gy0 * diff * (2. 
/ diff.size)\n        if 0 in indexes:\n            ret.append(gx0)\n        if 1 in indexes:\n            ret.append(-gx0)\n        return ret\n\n\ndef mean_squared_error(x0, x1, ignore_nan=False):\n    \"\"\"Mean squared error function.\n\n    This function computes mean squared error between two variables. The mean\n    is taken over the minibatch. Note that the error is not scaled by 1/2.\n\n    Args:\n        x0 (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \\\n        :class:`cupy.ndarray`): Input variable.\n        x1 (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \\\n        :class:`cupy.ndarray`): Input variable.\n        ignore_nan (bool): If `True`, this function compute mean squared error\n            ignoring NaNs. The arithmetic mean is the sum of the non-NaN\n            elements along the axis divided by the number of whole elements.\n\n    Returns:\n        ~chainer.Variable:\n            A variable holding an array representing the mean squared\n            error of two inputs.\n    \"\"\"\n    return MeanSquaredError(ignore_nan).apply((x0, x1))[0]\n"
  },
  {
    "path": "chainer_chemistry/functions/math/__init__.py",
    "content": ""
  },
  {
    "path": "chainer_chemistry/functions/math/matmul.py",
    "content": "import chainer\n\n\nif int(chainer.__version__[0]) >= 3:\n    _matmul_fn = chainer.functions.matmul\nelse:\n    _matmul_fn = chainer.functions.batch_matmul\n\n\ndef matmul(a, b, transa=False, transb=False):\n    \"\"\"Computes the matrix multiplication of two arrays.\n\n    Args:\n        a (Variable): The left operand of the matrix multiplication.\n            If ``a`` and ``b`` are both 1-D arrays, ``matmul`` returns a dot\n            product of vector `a` and vector `b`. If 2-D arrays, ``matmul``\n            returns matrix product of ``a`` and ``b``. If arrays' dimension is\n            larger than 2, they are treated as a stack of matrices residing in\n            the last two indexes. ``matmul`` returns a stack of each two\n            arrays. ``a`` and ``b`` must have the same dimension.\n        b (Variable): The right operand of the matrix multiplication.\n            Its array is treated as a matrix in the same way as ``a``'s array.\n        transa (bool): If ``True``, each matrices in ``a`` will be transposed.\n            If ``a.ndim == 1``, do nothing.\n        transb (bool): If ``True``, each matrices in ``b`` will be transposed.\n            If ``b.ndim == 1``, do nothing.\n\n    Returns:\n        ~chainer.Variable: The result of the matrix multiplication.\n\n    .. admonition:: Example\n\n        >>> a = np.array([[1, 0], [0, 1]], 'f')\n        >>> b = np.array([[4, 1], [2, 2]], 'f')\n        >>> F.matmul(a, b).data\n        array([[ 4.,  1.],\n               [ 2.,  2.]], dtype=float32)\n\n    \"\"\"\n    return _matmul_fn(a, b, transa=transa, transb=transb)\n"
  },
  {
    "path": "chainer_chemistry/iterators/__init__.py",
    "content": "from chainer_chemistry.iterators.balanced_serial_iterator import BalancedSerialIterator  # NOQA\nfrom chainer_chemistry.iterators.index_iterator import IndexIterator  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/iterators/balanced_serial_iterator.py",
    "content": "from __future__ import division\n\nfrom logging import getLogger\n\nfrom chainer.dataset import iterator\nimport numpy\n\nfrom chainer_chemistry.iterators.index_iterator import IndexIterator\n\n\nclass BalancedSerialIterator(iterator.Iterator):\n\n    \"\"\"Dataset iterator that serially reads the examples with balancing label.\n\n    Args:\n        dataset: Dataset to iterate.\n        batch_size (int): Number of examples within each minibatch.\n        labels (list or numpy.ndarray): 1d array which specifies label feature\n            of `dataset`. Its size must be same as the length of `dataset`.\n        repeat (bool): If ``True``, it infinitely loops over the dataset.\n            Otherwise, it stops iteration at the end of the first epoch.\n        shuffle (bool): If ``True``, the order of examples is shuffled at the\n            beginning of each epoch.\n            Otherwise, the order is permanently same as that of `dataset`.\n        batch_balancing (bool):  If ``True``, examples are sampled in the way\n            that each label examples are roughly evenly sampled in each\n            minibatch. 
Otherwise, the iterator only guarantees that total\n            numbers of examples are same among label features.\n        ignore_labels (int or list or None): Labels to be ignored.\n            If not ``None``, the example whose label is in `ignore_labels`\n            are not sampled by this iterator.\n\n    \"\"\"\n\n    def __init__(self, dataset, batch_size, labels, repeat=True, shuffle=True,\n                 batch_balancing=False, ignore_labels=None,\n                 logger=getLogger(__name__)):\n        assert len(dataset) == len(labels)\n        labels = numpy.asarray(labels)\n        if len(dataset) != labels.size:\n            raise ValueError('dataset length {} and labels size {} must be '\n                             'same!'.format(len(dataset), labels.size))\n        labels = numpy.ravel(labels)\n        self.dataset = dataset\n        self.batch_size = batch_size\n        self.labels = labels\n        self.logger = logger\n\n        if ignore_labels is None:\n            ignore_labels = []\n        elif isinstance(ignore_labels, int):\n            ignore_labels = [ignore_labels, ]\n        self.ignore_labels = list(ignore_labels)\n        self._repeat = repeat\n        self._shuffle = shuffle\n        self._batch_balancing = batch_balancing\n\n        self.labels_iterator_dict = {}\n\n        max_label_count = -1\n        include_label_count = 0\n        for label in numpy.unique(labels):\n            label_index = numpy.argwhere(labels == label).ravel()\n            label_count = len(label_index)\n            ii = IndexIterator(label_index, shuffle=shuffle)\n            self.labels_iterator_dict[label] = ii\n            if label in self.ignore_labels:\n                continue\n            if max_label_count < label_count:\n                max_label_count = label_count\n            include_label_count += 1\n\n        self.max_label_count = max_label_count\n        self.N_augmented = max_label_count * include_label_count\n        self.reset()\n\n  
  def __next__(self):\n        if not self._repeat and self.epoch > 0:\n            raise StopIteration\n\n        self._previous_epoch_detail = self.epoch_detail\n\n        i = self.current_position\n        i_end = i + self.batch_size\n        N = self.N_augmented\n\n        batch = [self.dataset[index] for index in self._order[i:i_end]]\n\n        if i_end >= N:\n            if self._repeat:\n                rest = i_end - N\n                self._update_order()\n                if rest > 0:\n                    batch.extend([self.dataset[index]\n                                  for index in self._order[:rest]])\n                self.current_position = rest\n            else:\n                self.current_position = 0\n\n            self.epoch += 1\n            self.is_new_epoch = True\n        else:\n            self.is_new_epoch = False\n            self.current_position = i_end\n\n        return batch\n\n    next = __next__\n\n    @property\n    def epoch_detail(self):\n        return self.epoch + self.current_position / self.N_augmented\n\n    @property\n    def previous_epoch_detail(self):\n        # This iterator saves ``-1`` as _previous_epoch_detail instead of\n        # ``None`` because some serializers do not support ``None``.\n        if self._previous_epoch_detail < 0:\n            return None\n        return self._previous_epoch_detail\n\n    def serialize(self, serializer):\n        self.current_position = serializer('current_position',\n                                           self.current_position)\n        self.epoch = serializer('epoch', self.epoch)\n        self.is_new_epoch = serializer('is_new_epoch', self.is_new_epoch)\n        if self._order is not None:\n            serializer('order', self._order)\n        self._previous_epoch_detail = serializer(\n            'previous_epoch_detail', self._previous_epoch_detail)\n\n        for label, index_iterator in self.labels_iterator_dict.items():\n            
self.labels_iterator_dict[label].serialize(\n                serializer['index_iterator_{}'.format(label)])\n\n    def _update_order(self):\n        indices_list = []\n        for label, index_iterator in self.labels_iterator_dict.items():\n            if label in self.ignore_labels:\n                # Not include index of ignore_labels\n                continue\n            indices_list.append(index_iterator.get_next_indices(\n                self.max_label_count))\n\n        if self._batch_balancing:\n            # `indices_list` contains same number of indices of each label.\n            # we can `transpose` and `ravel` it to get each label's index in\n            # sequence, which guarantees that label in each batch is balanced.\n            indices = numpy.array(indices_list).transpose().ravel()\n            self._order = indices\n        else:\n            indices = numpy.array(indices_list).ravel()\n            self._order = numpy.random.permutation(indices)\n\n    def reset(self):\n        self._update_order()\n        self.current_position = 0\n        self.epoch = 0\n        self.is_new_epoch = False\n\n        # use -1 instead of None internally.\n        self._previous_epoch_detail = -1.\n\n    def show_label_stats(self):\n        self.logger.warning('   label    count     rate     status')\n        total = 0\n        for label, index_iterator in self.labels_iterator_dict.items():\n            count = len(index_iterator.index_list)\n            total += count\n\n        for label, index_iterator in self.labels_iterator_dict.items():\n            count = len(index_iterator.index_list)\n            rate = count / len(self.dataset)\n            status = 'ignored' if label in self.ignore_labels else 'included'\n            self.logger.warning('{:>8} {:>8} {:>8.4f} {:>10}'\n                                .format(label, count, rate, status))\n"
  },
  {
    "path": "chainer_chemistry/iterators/index_iterator.py",
    "content": "import numpy\n\nfrom chainer.dataset import iterator\n\n\nclass IndexIterator(iterator.Iterator):\n    \"\"\"Index iterator\n\n    IndexIterator is used internally in `BalancedSerialIterator`, as each\n    label's index iterator\n\n    Args:\n        index_list (list): list of int which represents indices.\n        shuffle (bool): shuffle flag. If True, indices specified by\n            ``index_list`` will be randomly shuffled.\n        num (int): number of indices to be extracted when ``___next___`` is\n            called.\n    \"\"\"\n\n    def __init__(self, index_list, shuffle=True, num=0):\n        self.index_list = numpy.asarray(index_list)\n        assert self.index_list.ndim == 1\n        self.index_length = len(index_list)\n        self.current_index_list = None\n        self.current_pos = 0\n        self.shuffle = shuffle\n        self.num = num\n\n        self.update_current_index_list()\n\n    def update_current_index_list(self):\n        if self.shuffle:\n            self.current_index_list = numpy.random.permutation(self.index_list)\n        else:\n            self.current_index_list = self.index_list\n\n    def __next__(self):\n        return self.get_next_indices(self.num)\n\n    def get_next_indices(self, num):\n        \"\"\"get next indices\n\n        Args:\n            num (int): number for indices to extract.\n\n        Returns (numpy.ndarray): 1d array of indices\n\n        .. 
admonition:: Example\n\n           >>> ii = IndexIterator([1, 3, 5, 10], shuffle=True)\n           >>> print(ii.get_next_indices(5))\n           [ 5  1 10  3 10]\n           >>> print(ii.get_next_indices(5))\n           [ 3  1  5 10  1]\n\n        \"\"\"\n\n        indices = []\n        if self.current_pos + num < self.index_length:\n            indices.append(self.current_index_list[\n                           self.current_pos: self.current_pos + num])\n            self.current_pos += num\n        else:\n            indices.append(self.current_index_list[self.current_pos:])\n            num -= (self.index_length - self.current_pos)\n            # When `num` is twice bigger than `self.index_length`, `index_list`\n            # is repeated `q` times to get desired length of `indices`.\n            q, r = divmod(num, self.index_length)\n            if self.shuffle:\n                for _ in range(q):\n                    indices.append(numpy.random.permutation(self.index_list))\n            else:\n                indices.append(numpy.tile(self.index_list, q))\n            self.update_current_index_list()\n            indices.append(self.current_index_list[:r])\n            self.current_pos = r\n\n        return numpy.concatenate(indices).ravel()\n\n    def serialize(self, serializer):\n        self.current_index_list = serializer('current_index_list',\n                                             self.current_index_list)\n        self.current_pos = serializer('current_pos', self.current_pos)\n"
  },
  {
    "path": "chainer_chemistry/link_hooks/__init__.py",
    "content": "try:\n    from chainer_chemistry.link_hooks import variable_monitor_link_hook  # NOQA\n\n    from chainer_chemistry.link_hooks.variable_monitor_link_hook import VariableMonitorLinkHook  # NOQA\n    is_link_hooks_available = True\nexcept ImportError:\n    import warnings\n    warnings.warn('link_hooks failed to import, you need to upgrade chainer '\n                  'version to use link_hooks feature')\n    is_link_hooks_available = False\n"
  },
  {
    "path": "chainer_chemistry/link_hooks/variable_monitor_link_hook.py",
    "content": "from collections import OrderedDict\nfrom logging import getLogger\n\nimport chainer\nfrom chainer.link_hook import _ForwardPostprocessCallbackArgs, _ForwardPreprocessCallbackArgs  # NOQA\n\n\ndef _default_extract_pre(hook, args):\n    \"\"\"Default extract_fn when `timing='pre`\n\n    Args:\n        hook (VariableMonitorLinkHook):\n        args (_ForwardPreprocessCallbackArgs):\n\n    Returns (chainer.Variable): First input variable to the link.\n    \"\"\"\n    return args.args[0]\n\n\ndef _default_extract_post(hook, args):\n    \"\"\"Default extract_fn when `timing='post`\n\n    Args:\n        hook (VariableMonitorLinkHook):\n        args (_ForwardPostprocessCallbackArgs):\n\n    Returns (chainer.Variable): Output variable to the link.\n    \"\"\"\n    return args.out\n\n\nclass VariableMonitorLinkHook(chainer.LinkHook):\n    \"\"\"Monitor Variable of specific link input/output\n\n    Args:\n        target_link (chainer.Link): target link to monitor variable.\n        name (str): name of this link hook\n        timing (str): timing of this link hook to monitor. 'pre' or 'post'.\n            If 'pre', the input of `target_link` is monitored.\n            If 'post', the output of `target_link` is monitored.\n        extract_fn (callable): Specify custom method to extract target variable\n            Default behavior is to extract first input when `timing='pre'`,\n            or extract output when `timing='post'`.\n            It takes `hook, args` as argument.\n        logger:\n\n    .. 
admonition:: Example\n\n       >>> import numpy\n       >>> from chainer import cuda, links, functions  # NOQA\n       >>> from chainer_chemistry.link_hooks.variable_monitor_link_hook import VariableMonitorLinkHook  # NOQA\n\n       >>> class DummyModel(chainer.Chain):\n       >>>    def __init__(self):\n       >>>        super(DummyModel, self).__init__()\n       >>>        with self.init_scope():\n       >>>            self.l1 = links.Linear(None, 1)\n       >>>        self.h = None\n       >>>\n       >>>    def forward(self, x):\n       >>>        h = self.l1(x)\n       >>>        out = functions.sigmoid(h)\n       >>>        return out\n\n       >>> model = DummyModel()\n       >>> hook = VariableMonitorLinkHook(model.l1, timing='post')\n       >>> x = numpy.array([1, 2, 3])\n\n       >>> # Example 1. `get_variable` of `target_link`.\n       >>> with hook:\n       >>>     out = model(x)\n       >>> # You can extract `h`, which is output of `model.l1` as follows.\n       >>> var_h = hook.get_variable()\n\n       >>> # Example 2. 
`add_process` to override value of target variable.\n       >>> def _process_zeros(hook, args, target_var):\n       >>>     xp = cuda.get_array_module(target_var.array)\n       >>>     target_var.array = xp.zeros(target_var.array.shape)\n       >>> hook.add_process('_process_zeros', _process_zeros)\n       >>> with hook:\n       >>>     # During the forward, `h` is overriden to value 0.\n       >>>     out = model(x)\n       >>> # Remove _process_zeros method\n       >>> hook.delete_process('_process_zeros')\n    \"\"\"\n\n    def __init__(self, target_link, name='VariableMonitorLinkHook',\n                 timing='post', extract_fn=None, logger=None):\n        if not isinstance(target_link, chainer.Link):\n            raise TypeError('target_link must be instance of chainer.Link!'\n                            'actual {}'.format(type(target_link)))\n        if timing not in ['pre', 'post']:\n            raise ValueError(\n                \"Unexpected value timing={}, \"\n                \"must be either pre or post\"\n                .format(timing))\n        super(VariableMonitorLinkHook, self).__init__()\n        self.target_link = target_link\n\n        # This LinkHook maybe instantiated multiple times.\n        # So it is allowed to change name by argument.\n        self.name = name\n        self.logger = logger or getLogger(__name__)\n\n        if extract_fn is None:\n            if timing == 'pre':\n                extract_fn = _default_extract_pre\n            elif timing == 'post':\n                extract_fn = _default_extract_post\n            else:\n                raise ValueError(\"Unexpected value timing={}\"\n                                 .format(timing))\n        self.extract_fn = extract_fn\n        self.process_fns = OrderedDict()  # Additional process, if necessary\n\n        self.timing = timing\n        self.result = None\n\n    def add_process(self, key, fn):\n        \"\"\"Add additional process for target variable\n\n        Args:\n       
     key (str): id for this process, you may remove added process by\n                `delete_process` with this key.\n            fn (callable): function which takes `hook, args, target_var` as\n                arguments.\n        \"\"\"\n        if not isinstance(key, str):\n            raise TypeError('key must be str, actual {}'.format(type(key)))\n        if not callable(fn):\n            raise TypeError('fn must be callable')\n        self.process_fns[key] = fn\n\n    def delete_process(self, key):\n        \"\"\"Delete process added at `add_process`\n\n        Args:\n            key (str): id for the process, named at `add_process`.\n        \"\"\"\n        if not isinstance(key, str):\n            raise TypeError('key must be str, actual {}'.format(type(key)))\n        if key in self.process_fns.keys():\n            del self.process_fns[key]\n        else:\n            # Nothing to delete\n            self.logger.warning('{} is not in process_fns, skip delete_process'\n                                .format(key))\n\n    def get_variable(self):\n        \"\"\"Get target variable, which is input or output of `target_link`.\n\n        Returns (chainer.Variable): target variable\n        \"\"\"\n        return self.result\n\n    def forward_preprocess(self, args):\n        if self.timing == 'pre' and args.link is self.target_link:\n            self.result = self.extract_fn(self, args)\n            if self.process_fns is not None:\n                for key, fn in self.process_fns.items():\n                    fn(self, args, self.result)\n\n    def forward_postprocess(self, args):\n        if self.timing == 'post' and args.link is self.target_link:\n            self.result = self.extract_fn(self, args)\n            if self.process_fns is not None:\n                for key, fn in self.process_fns.items():\n                    fn(self, args, self.result)\n"
  },
  {
    "path": "chainer_chemistry/links/__init__.py",
    "content": "from chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID  # NOQA\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear  # NOQA\nfrom chainer_chemistry.links.connection.graph_mlp import GraphMLP  # NOQA\n\nfrom chainer_chemistry.links.normalization.graph_batch_normalization import GraphBatchNormalization  # NOQA\n\nfrom chainer_chemistry.links.readout.general_readout import GeneralReadout  # NOQA\nfrom chainer_chemistry.links.readout.ggnn_readout import GGNNReadout  # NOQA\nfrom chainer_chemistry.links.readout.mpnn_readout import MPNNReadout  # NOQA\nfrom chainer_chemistry.links.readout.nfp_readout import NFPReadout  # NOQA\nfrom chainer_chemistry.links.readout.schnet_readout import SchNetReadout  # NOQA\nfrom chainer_chemistry.links.readout.set2set import Set2Set  # NOQA\n\nfrom chainer_chemistry.links.scaler.flow_scaler import FlowScaler  # NOQA\nfrom chainer_chemistry.links.scaler.standard_scaler import StandardScaler  # NOQA\n\nfrom chainer_chemistry.links.update.ggnn_update import GGNNUpdate  # NOQA\nfrom chainer_chemistry.links.update.gin_update import GINUpdate  # NOQA\nfrom chainer_chemistry.links.update.mpnn_update import EdgeNet  # NOQA\nfrom chainer_chemistry.links.update.mpnn_update import MPNNUpdate  # NOQA\nfrom chainer_chemistry.links.update.nfp_update import NFPUpdate  # NOQA\nfrom chainer_chemistry.links.update.relgat_update import RelGATUpdate  # NOQA\nfrom chainer_chemistry.links.update.relgcn_update import RelGCNUpdate  # NOQA\nfrom chainer_chemistry.links.update.rsgcn_update import RSGCNUpdate  # NOQA\nfrom chainer_chemistry.links.update.schnet_update import SchNetUpdate  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/links/array/__init__.py",
    "content": ""
  },
  {
    "path": "chainer_chemistry/links/array/shape_transformer_to_2d.py",
    "content": "import chainer\nfrom chainer import functions\n\n\nclass ShapeTransformerTo2D(chainer.Link):\n    \"\"\"Transforms input array `x` to 2-dim and reverts.\n\n    It converts array to be 2-dim, where 1th axis is `axis` and the rest is\n    gathered to 0th axis.\n\n    Note that this class does not have any parameters\n    but behaves as \"function wrapper\" which has internal attribute to\n    `transform` and `inverse_transform`.\n\n    Args:\n        axis (int): feature axis, which will be 1st axis.\n    \"\"\"\n\n    def __init__(self, axis=1):\n        super(ShapeTransformerTo2D, self).__init__()\n        self.original_shape = None\n        self.transpose_order = None\n        self.axis = axis\n\n    def transform(self, x):\n        self.original_shape = x.shape\n        axis = self.axis\n        if axis < 0:\n            axis += x.ndim\n\n        transpose_order = [i for i in range(x.ndim)]\n        transpose_order.pop(axis)\n        transpose_order = transpose_order + [axis]\n        x = functions.transpose(x, tuple(transpose_order))\n        x = functions.reshape(x, (-1, self.original_shape[axis]))\n        self.transpose_order = transpose_order\n        return x\n\n    def inverse_transform(self, x):\n        if x.ndim != 2:\n            raise ValueError(\n                \"[ERROR] Unexpected value x.shape={}, 2-dim array is expected\"\n                .format(x.shape))\n        if self.original_shape is None:\n            raise AttributeError(\n                '[Error] original_shape is None, call transform beforehand!')\n\n        ndim = len(self.original_shape)\n        axis = self.axis\n        if axis < 0:\n            axis += ndim\n        inverse_transpose_order = [i for i in range(ndim - 1)]\n        inverse_transpose_order.insert(axis, ndim-1)\n        x = functions.reshape(x, tuple([self.original_shape[i]\n                                        for i in self.transpose_order]))\n        x = functions.transpose(x, 
tuple(inverse_transpose_order))\n        return x\n"
  },
  {
    "path": "chainer_chemistry/links/connection/__init__.py",
    "content": ""
  },
  {
    "path": "chainer_chemistry/links/connection/embed_atom_id.py",
    "content": "import chainer\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\n\n\nclass EmbedAtomID(chainer.links.EmbedID):\n    \"\"\"Embedding specialized to atoms.\n\n    This is a chain in the sense of Chainer that converts\n    an atom, represented by a sequence of molecule IDs,\n    to a sequence of embedding vectors of molecules.\n    The operation is done in a minibatch manner, as most chains do.\n\n    The forward propagation of link consists of ID embedding,\n    which converts the input `x` into vector embedding `h` where\n    its shape represents (minibatch, atom, channel)\n\n    .. seealso:: :class:`chainer.links.EmbedID`\n    \"\"\"\n\n    def __init__(self, out_size, in_size=MAX_ATOMIC_NUM, initialW=None,\n                 ignore_label=None):\n        super(EmbedAtomID, self).__init__(\n            in_size=in_size, out_size=out_size, initialW=initialW,\n            ignore_label=ignore_label)\n\n    def __call__(self, x):\n        \"\"\"Forward propagation.\n\n        Args:\n            x (:class:`chainer.Variable`, or :class:`numpy.ndarray` \\\n            or :class:`cupy.ndarray`):\n                Input array that should be an integer array\n                whose ``ndim`` is 2. This method treats the array\n                as a minibatch of atoms, each of which consists\n                of a sequence of molecules represented by integer IDs.\n                The first axis should be an index of atoms\n                (i.e. minibatch dimension) and the second one be an\n                index of molecules.\n\n        Returns:\n            :class:`chainer.Variable`:\n                A 3-dimensional array consisting of embedded vectors of atoms,\n                representing (minibatch, atom, channel).\n\n        \"\"\"\n\n        h = super(EmbedAtomID, self).__call__(x)\n        return h\n"
  },
  {
    "path": "chainer_chemistry/links/connection/graph_linear.py",
    "content": "import chainer\n\n\nclass GraphLinear(chainer.links.Linear):\n    \"\"\"Graph Linear layer.\n\n    This function assumes its input is 3-dimensional.\n    Differently from :class:`chainer.functions.linear`, it applies an affine\n    transformation to the third axis of input `x`.\n\n    .. seealso:: :class:`chainer.links.Linear`\n    \"\"\"\n\n    def __call__(self, x):\n        \"\"\"Forward propagation.\n\n        Args:\n            x (:class:`chainer.Variable`, or :class:`numpy.ndarray`\\\n            or :class:`cupy.ndarray`):\n                Input array that should be a float array whose ``ndim`` is 3.\n\n                It represents a minibatch of atoms, each of which consists\n                of a sequence of molecules. Each molecule is represented\n                by integer IDs. The first axis is an index of atoms\n                (i.e. minibatch dimension) and the second one an index\n                of molecules.\n\n        Returns:\n            :class:`chainer.Variable`:\n                A 3-dimensional array.\n\n        \"\"\"\n        h = x\n        # (minibatch, atom, ch)\n        s0, s1, s2 = h.shape\n        h = chainer.functions.reshape(h, (s0 * s1, s2))\n        h = super(GraphLinear, self).__call__(h)\n        h = chainer.functions.reshape(h, (s0, s1, self.out_size))\n        return h\n"
  },
  {
    "path": "chainer_chemistry/links/connection/graph_mlp.py",
    "content": "import numpy\n\nimport chainer\nfrom chainer.functions import relu\n\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\n\n\nclass GraphMLP(chainer.Chain):\n\n    \"\"\"Graph MLP layer\n\n    Args:\n        channels (list or numpy.ndarray): list of int, representing each\n            layer's hidden dim. e.g., if [32, 16], it will construct 2-layer\n            MLP with hidden dim 32 and output dim 16.\n        in_channels (int or None): input channel size.\n        activation (chainer.functions): activation function\n    \"\"\"\n\n    def __init__(self, channels, in_channels=None, activation=relu):\n        super(GraphMLP, self).__init__()\n        if not isinstance(channels, (list, numpy.ndarray)):\n            raise TypeError('channels {} is expected to be list, actual {}'\n                            .format(channels, type(channels)))\n\n        channels_list = [in_channels] + list(channels)\n        layers = [GraphLinear(channels_list[i], channels_list[i+1])\n                  for i in range(len(channels_list) - 1)]\n        with self.init_scope():\n            self.layers = chainer.ChainList(*layers)\n        self.activation = activation\n\n    def __call__(self, x):\n        h = x\n        for l in self.layers[:-1]:\n            h = self.activation(l(h))\n        h = self.layers[-1](h)\n        return h\n"
  },
  {
    "path": "chainer_chemistry/links/normalization/__init__.py",
    "content": ""
  },
  {
    "path": "chainer_chemistry/links/normalization/graph_batch_normalization.py",
    "content": "import chainer\n\n\nclass GraphBatchNormalization(chainer.links.BatchNormalization):\n    \"\"\"Graph Batch Normalization layer.\n\n    .. seealso:: :class:`chainer.links.BatchNormalization`\n    \"\"\"\n\n    def __call__(self, x):\n        \"\"\"Forward propagation.\n\n        Args:\n            x (:class:`chainer.Variable`, or :class:`numpy.ndarray`\\\n            or :class:`cupy.ndarray`):\n                Input array that should be a float array whose ``ndim`` is 3.\n\n                It represents a minibatch of atoms, each of which consists\n                of a sequence of molecules. Each molecule is represented\n                by integer IDs. The first axis is an index of atoms\n                (i.e. minibatch dimension) and the second one an index\n                of molecules.\n\n        Returns:\n            :class:`chainer.Variable`:\n                A 3-dimensional array.\n\n        \"\"\"\n        h = x\n        # (minibatch, atom, ch)\n\n        # The implementation of batch normalization for graph convolution below\n        # is rather naive. To be precise, it is necessary to consider the\n        # difference in the number of atoms for each graph. However, the\n        # implementation below does not take it into account, and assumes\n        # that all graphs have the same number of atoms, hence extra numbers\n        # of zero are included when average is computed. In other words, the\n        # results of batch normalization below is biased.\n\n        s0, s1, s2 = h.shape\n        h = chainer.functions.reshape(h, (s0 * s1, s2))\n        h = super(GraphBatchNormalization, self).__call__(h)\n        h = chainer.functions.reshape(h, (s0, s1, s2))\n\n        return h\n"
  },
  {
    "path": "chainer_chemistry/links/readout/__init__.py",
    "content": ""
  },
  {
    "path": "chainer_chemistry/links/readout/cgcnn_readout.py",
    "content": "import chainer\nfrom chainer import functions, links  # NOQA\n\n\nclass CGCNNReadout(chainer.Chain):\n    \"\"\"CGCNN submodule for readout part.\n\n    Args:\n        out_dim (int): dimension of output feature vector\n    \"\"\"\n\n    def __init__(self, out_dim=128):\n        super(CGCNNReadout, self).__init__()\n        with self.init_scope():\n            self.linear = links.Linear(None, out_dim)\n\n    def __call__(self, atom_feat, atom_idx):\n        average_pool = [functions.mean(atom_feat[idx], axis=0, keepdims=True)\n                        for idx in atom_idx]\n        h = functions.concat(average_pool, axis=0)\n        h = self.linear(h)\n        h = functions.softplus(h)\n        return h\n"
  },
  {
    "path": "chainer_chemistry/links/readout/general_readout.py",
    "content": "import chainer\nfrom chainer import functions\n\n\nclass GeneralReadout(chainer.Link):\n    \"\"\"General submodule for readout part.\n\n    This class can be used for `rsgcn` and `weavenet`.\n    Note that this class has no learnable parameter,\n    even though this is subclass of `chainer.Link`.\n    This class is under `links` module for consistency\n    with other readout module.\n\n    Args:\n        mode (str):\n        activation (callable): activation function\n    \"\"\"\n\n    def __init__(self, mode='sum', activation=None, **kwargs):\n        super(GeneralReadout, self).__init__()\n        self.mode = mode\n        self.activation = activation\n\n    def __call__(self, h, axis=1, **kwargs):\n        if self.activation is not None:\n            h = self.activation(h)\n        else:\n            h = h\n\n        if self.mode == 'sum':\n            y = functions.sum(h, axis=axis)\n        elif self.mode == 'max':\n            y = functions.max(h, axis=axis)\n        elif self.mode == 'summax':\n            h_sum = functions.sum(h, axis=axis)\n            h_max = functions.max(h, axis=axis)\n            y = functions.concat((h_sum, h_max), axis=axis)\n        else:\n            raise ValueError('mode {} is not supported'.format(self.mode))\n        return y\n\n\nclass ScatterGeneralReadout(chainer.Link):\n    \"\"\"General submodule for readout part by scatter operation.\n\n    This class is used in sparse pattern.\n\n    Args:\n        mode (str):\n        activation (callable): activation function\n    \"\"\"\n\n    def __init__(self, mode='sum', activation=None, **kwargs):\n        super(ScatterGeneralReadout, self).__init__()\n        self.mode = mode\n        self.activation = activation\n\n    def __call__(self, h, batch, **kwargs):\n        if self.activation is not None:\n            h = self.activation(h)\n        else:\n            h = h\n\n        if self.mode == 'sum':\n            y = self.xp.zeros((batch[-1] + 1, h.shape[1]),\n  
                            dtype=self.xp.float32)\n            y = functions.scatter_add(y, batch, h)\n        else:\n            raise ValueError('mode {} is not supported'.format(self.mode))\n        return y\n"
  },
  {
    "path": "chainer_chemistry/links/readout/ggnn_readout.py",
    "content": "import chainer\nfrom chainer import functions\n\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\n\n\nclass GGNNReadout(chainer.Chain):\n    \"\"\"GGNN submodule for readout part.\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        in_channels (int or None): dimension of feature vector associated to\n            each node. `in_channels` is the total dimension of `h` and `h0`.\n        nobias (bool): If ``True``, then this function does not use\n            the bias\n        activation (~chainer.Function or ~chainer.FunctionNode):\n            activate function for node representation\n            `functions.tanh` was suggested in original paper.\n        activation_agg (~chainer.Function or ~chainer.FunctionNode):\n            activate function for aggregation\n            `functions.tanh` was suggested in original paper.\n    \"\"\"\n\n    def __init__(self, out_dim, in_channels=None, nobias=False,\n                 activation=functions.identity,\n                 activation_agg=functions.identity):\n        super(GGNNReadout, self).__init__()\n        with self.init_scope():\n            self.i_layer = GraphLinear(in_channels, out_dim, nobias=nobias)\n            self.j_layer = GraphLinear(in_channels, out_dim, nobias=nobias)\n        self.out_dim = out_dim\n        self.in_channels = in_channels\n        self.nobias = nobias\n        self.activation = activation\n        self.activation_agg = activation_agg\n\n    def __call__(self, h, h0=None, is_real_node=None):\n        # --- Readout part ---\n        # h, h0: (minibatch, node, ch)\n        # is_real_node: (minibatch, node)\n        h1 = functions.concat((h, h0), axis=2) if h0 is not None else h\n\n        g1 = functions.sigmoid(self.i_layer(h1))\n        g2 = self.activation(self.j_layer(h1))\n        g = g1 * g2\n        if is_real_node is not None:\n            # mask virtual node feature to be 0\n            mask = 
self.xp.broadcast_to(\n                is_real_node[:, :, None], g.shape)\n            g = g * mask\n        # sum along node axis\n        g = self.activation_agg(functions.sum(g, axis=1))\n        return g\n"
  },
  {
    "path": "chainer_chemistry/links/readout/megnet_readout.py",
    "content": "import chainer\nfrom chainer import functions, links  # NOQA\n\n\nfrom chainer_chemistry.functions import megnet_softplus\nfrom chainer_chemistry.links.readout.set2set import Set2Set\n\n\nclass MEGNetReadout(chainer.Chain):\n    \"\"\"MEGNet submodule for readout part.\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        in_channels (int): dimension of feature vector associated to\n            each node. Must not be `None`.\n        n_layers (int): number of LSTM layers for set2set\n        processing_steps (int): number of processing for set2set\n        dropout_ratio (float): ratio of dropout\n        activation (~chainer.Function or ~chainer.FunctionNode):\n            activate function for megnet model\n            `megnet_softplus` was used in original paper.\n    \"\"\"\n\n    def __init__(self, out_dim=32, in_channels=32, n_layers=1,\n                 processing_steps=3, dropout_ratio=-1,\n                 activation=megnet_softplus):\n        super(MEGNetReadout, self).__init__()\n        if processing_steps <= 0:\n            raise ValueError(\"[ERROR] Unexpected value processing_steps={}\"\n                             .format(processing_steps))\n\n        self.processing_steps = processing_steps\n        self.dropout_ratio = dropout_ratio\n        self.activation = activation\n        with self.init_scope():\n            self.set2set_for_atom = Set2Set(\n                in_channels=in_channels, n_layers=n_layers)\n            self.set2set_for_pair = Set2Set(\n                in_channels=in_channels, n_layers=n_layers)\n            self.linear = links.Linear(None, out_dim)\n\n    def __call__(self, atoms_feat, pair_feat, global_feat):\n        a_f = atoms_feat\n        p_f = pair_feat\n        g_f = global_feat\n\n        # readout for atom and pair feature\n        self.set2set_for_atom.reset_state()\n        self.set2set_for_pair.reset_state()\n        for i in range(self.processing_steps):\n            a_f_r 
= self.set2set_for_atom(a_f)\n            p_f_r = self.set2set_for_pair(p_f)\n\n        # concating all features\n        h = functions.concat((a_f_r, p_f_r, g_f), axis=1)\n        if self.dropout_ratio > 0.0:\n            h = functions.dropout(h, ratio=self.dropout_ratio)\n        out = self.activation(self.linear(h))\n        return out\n"
  },
  {
    "path": "chainer_chemistry/links/readout/mpnn_readout.py",
    "content": "import chainer\nfrom chainer import functions\nfrom chainer import links\n\nfrom chainer_chemistry.links.readout.set2set import Set2Set\n\n\nclass MPNNReadout(chainer.Chain):\n    \"\"\"MPNN submodule for readout part.\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        in_channels (int): dimension of feature vector associated to\n            each node. Must not be `None`.\n        n_layers (int): number of LSTM layers for set2set\n        processing_steps (int): number of processing for set2set\n    \"\"\"\n\n    def __init__(self, out_dim, in_channels, n_layers=1, processing_steps=3):\n        # type: (int, int, int, int) -> None\n        super(MPNNReadout, self).__init__()\n        if processing_steps <= 0:\n            raise ValueError(\"[ERROR] Unexpected value processing_steps={}\"\n                             .format(processing_steps))\n        with self.init_scope():\n            self.set2set = Set2Set(in_channels=in_channels, n_layers=n_layers)\n            self.linear1 = links.Linear(in_channels * 2, in_channels)\n            self.linear2 = links.Linear(in_channels, out_dim)\n        self.out_dim = out_dim\n        self.in_channels = in_channels\n        self.n_layers = n_layers\n        self.processing_steps = processing_steps\n\n    def __call__(self, h, **kwargs):\n        # type: (chainer.Variable) -> chainer.Variable\n        # h: (mb, node, ch)\n        self.set2set.reset_state()\n        for i in range(self.processing_steps):\n            g = self.set2set(h)  # g: (mb, ch * 2)\n        g = functions.relu(self.linear1(g))  # g: (mb, ch)\n        g = self.linear2(g)  # g: (mb, out_dim)\n        return g\n"
  },
  {
    "path": "chainer_chemistry/links/readout/nfp_readout.py",
    "content": "import chainer\nfrom chainer import functions\n\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\n\n\nclass NFPReadout(chainer.Chain):\n    \"\"\"NFP submodule for readout part.\n\n    Args:\n        out_dim (int): output dimension of feature vector associated\n            to each graph\n        in_channels (int or None): dimension of feature vector associated to\n            each node\n    \"\"\"\n\n    def __init__(self, out_dim, in_channels):\n        super(NFPReadout, self).__init__()\n        with self.init_scope():\n            self.output_weight = GraphLinear(in_channels, out_dim)\n        self.in_channels = in_channels\n        self.out_dim = out_dim\n\n    def __call__(self, h, is_real_node=None, **kwargs):\n        # h: (minibatch, node, ch)\n        # is_real_node: (minibatch, node)\n\n        # ---Readout part ---\n        i = self.output_weight(h)\n        i = functions.softmax(i, axis=2)  # softmax along channel axis\n        if is_real_node is not None:\n            # mask virtual node feature to be 0\n            mask = self.xp.broadcast_to(\n                is_real_node[:, :, None], i.shape)\n            i = i * mask\n        i = functions.sum(i, axis=1)  # sum along atom's axis\n        return i\n"
  },
  {
    "path": "chainer_chemistry/links/readout/scatter_ggnn_readout.py",
    "content": "import numpy\n\nimport chainer\nfrom chainer import functions\n\n\nclass ScatterGGNNReadout(chainer.Chain):\n    \"\"\"GGNN submodule for readout part using scatter operation.\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        in_channels (int or None): dimension of feature vector associated to\n            each node. `in_channels` is the total dimension of `h` and `h0`.\n        nobias (bool): If ``True``, then this function does not use\n            the bias\n        activation (~chainer.Function or ~chainer.FunctionNode):\n            activate function for node representation\n            `functions.tanh` was suggested in original paper.\n        activation_agg (~chainer.Function or ~chainer.FunctionNode):\n            activate function for aggregation\n            `functions.tanh` was suggested in original paper.\n        concat_n_info (bool): If ``True``, node information is concated\n            to the result.\n    \"\"\"\n\n    def __init__(self, out_dim, in_channels=None, nobias=False,\n                 activation=functions.identity,\n                 activation_agg=functions.identity,\n                 concat_n_info=False):\n        super(ScatterGGNNReadout, self).__init__()\n        self.concat_n_info = concat_n_info\n        if self.concat_n_info:\n            out_dim -= 1\n        with self.init_scope():\n            self.i_layer = chainer.links.Linear(\n                in_channels, out_dim, nobias=nobias)\n            self.j_layer = chainer.links.Linear(\n                in_channels, out_dim, nobias=nobias)\n        self.out_dim = out_dim\n        self.in_channels = in_channels\n        self.nobias = nobias\n        self.activation = activation\n        self.activation_agg = activation_agg\n\n    def __call__(self, h, batch, h0=None, is_real_node=None):\n        # --- Readout part ---\n        h1 = functions.concat((h, h0), axis=1) if h0 is not None else h\n\n        g1 = 
functions.sigmoid(self.i_layer(h1))\n        g2 = self.activation(self.j_layer(h1))\n        g = g1 * g2\n\n        # sum along node axis\n        y = self.xp.zeros((int(batch[-1]) + 1, self.out_dim),\n                          dtype=numpy.float32)\n        y = functions.scatter_add(y, batch, g)\n        y = self.activation_agg(y)\n\n        if self.concat_n_info:\n            n_nodes = self.xp.zeros(y.shape[0], dtype=self.xp.float32)\n            n_nodes = functions.scatter_add(n_nodes, batch,\n                                            self.xp.ones(batch.shape[0]))\n            y = functions.concat((y, n_nodes.reshape((-1, 1))))\n\n        return y\n"
  },
  {
    "path": "chainer_chemistry/links/readout/schnet_readout.py",
    "content": "import chainer\nfrom chainer import functions\n\nfrom chainer_chemistry.functions import shifted_softplus\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\n\n\nclass SchNetReadout(chainer.Chain):\n    \"\"\"SchNet submodule for readout part.\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        in_channels (int or None): dimension of feature vector for each node\n        hidden_channels (int): dimension of feature vector for each node\n    \"\"\"\n\n    def __init__(self, out_dim=1, in_channels=None,\n                 hidden_channels=32):\n        super(SchNetReadout, self).__init__()\n        with self.init_scope():\n            self.linear1 = GraphLinear(in_channels, hidden_channels)\n            self.linear2 = GraphLinear(hidden_channels, out_dim)\n        self.out_dim = out_dim\n        self.hidden_dim = in_channels\n\n    def __call__(self, h, **kwargs):\n        h = self.linear1(h)\n        h = shifted_softplus(h)\n        h = self.linear2(h)\n        h = functions.sum(h, axis=1)\n        return h\n"
  },
  {
    "path": "chainer_chemistry/links/readout/set2set.py",
    "content": "from typing import List, Optional  # NOQA\n\nimport chainer\nfrom chainer import cuda\nfrom chainer import functions\nfrom chainer import links\nimport numpy  # NOQA\n\n\nclass Set2Set(chainer.Chain):\n    r\"\"\"MPNN subsubmodule for readout part.\n\n    See: Oriol Vinyals+, \\\n        Order Matters: Sequence to sequence for sets. November 2015.\n        `arXiv:1511.06391 <https://arxiv.org/abs/1511.06391>`\n\n    Args:\n        in_channels (int): dimension of input feature vector\n        n_layers (int): number of LSTM layers\n\n    Returns (chainer.Variable):\n        Output feature vector: (minibatch, in_channels * 2)\n\n    \"\"\"\n\n    def __init__(self, in_channels, n_layers=1):\n        # type: (int, int) -> None\n        super(Set2Set, self).__init__()\n        with self.init_scope():\n            self.lstm_layer = links.NStepLSTM(\n                n_layers=n_layers,\n                in_size=in_channels * 2,\n                out_size=in_channels,\n                dropout=0)\n        self.in_channels = in_channels\n        self.n_layers = n_layers\n        self.hx = None  # type: Optional[chainer.Variable]\n        self.cx = None  # type: Optional[chainer.Variable]\n        self.q_star = None  # type: Optional[List]\n\n    def __call__(self, h):\n        # type: (chainer.Variable) -> chainer.Variable\n        xp = cuda.get_array_module(h)\n        mb, node, ch = h.shape  # type: int, int, int\n        if self.q_star is None:\n            self.q_star = [\n                xp.zeros((1, self.in_channels * 2)).astype('f')\n                for _ in range(mb)\n            ]\n        self.hx, self.cx, q = self.lstm_layer(self.hx, self.cx, self.q_star)\n        # self.hx: (mb, mb, ch)\n        # self.cx: (mb, mb, ch)\n        # q: List[(1, ch) * mb]\n        q = functions.stack(q)  # q: (mb, 1, ch)\n        q_ = functions.transpose(q, axes=(0, 2, 1))  # q_: (mb, ch, 1)\n        e = functions.matmul(h, q_)  # e: (mb, node, 1)\n        a = 
functions.softmax(e)  # a: (mb, node, 1)\n        a = functions.broadcast_to(a, h.shape)  # a: (mb, node, ch)\n        r = functions.sum((a * h), axis=1, keepdims=True)  # r: (mb, 1, ch)\n        q_star_ = functions.concat((q, r), axis=2)  # q_star_: (mb, 1, ch*2)\n        self.q_star = functions.separate(q_star_)\n        return functions.reshape(q_star_, (mb, ch * 2))\n\n    def reset_state(self):\n        # type: () -> None\n        self.hx = None\n        self.cx = None\n        self.q_star = None\n"
  },
  {
    "path": "chainer_chemistry/links/scaler/__init__.py",
    "content": ""
  },
  {
    "path": "chainer_chemistry/links/scaler/base.py",
    "content": "import chainer\n\n\ndef to_array(x):\n    \"\"\"Convert x into numpy.ndarray or cupy.ndarray\"\"\"\n    if isinstance(x, chainer.Variable):\n        x = x.data\n    return x\n\n\nclass BaseScaler(chainer.Link):\n    \"\"\"Base class for scaler.\n\n    x maybe array or Variable\n    \"\"\"\n\n    def fit(self, x, **kwargs):\n        \"\"\"fit parameter from given input `x`.\n\n        It should return self after fitting parameters.\n        \"\"\"\n        raise NotImplementedError\n\n    def transform(self, x, **kwargs):\n        \"\"\"transform input `x` using fitted parameters.\n\n        This method should be called after `fit` is called.\n        \"\"\"\n        raise NotImplementedError\n\n    def inverse_transform(self, x, **kwargs):\n        \"\"\"inverse operation of `transform`.\n\n        This method should be called after `fit` is called.\n        \"\"\"\n        raise NotImplementedError\n\n    def fit_transform(self, x, **kwargs):\n        return self.fit(x, **kwargs).transform(x)\n\n    # `__call__` method invokes `forward` method.\n    def forward(self, x, **kwargs):\n        return self.transform(x, **kwargs)\n"
  },
  {
    "path": "chainer_chemistry/links/scaler/flow_scaler.py",
    "content": "import numpy\n\nimport chainer\n\nfrom chainer_chemistry.links.scaler.base import BaseScaler, to_array  # NOQA\n\n\ndef _sigmoid_derivative(x):\n    h = chainer.functions.sigmoid(x)\n    return chainer.grad([h], [x], enable_double_backprop=True)[0]\n\n\ndef format_x(x):\n    \"\"\"x may be array or Variable\"\"\"\n    # currently, only consider the case x is 2-dim, (batchsize, feature)\n    if x.ndim == 1:\n        # Deal with as 1 feature with several samples.\n        x = x[:, None]\n    if x.ndim != 2:\n        raise ValueError(\n            \"Unexpected value x.shape={}, only x.ndim=2 is supported.\"\n            .format(x.shape))\n    return x\n\n\nclass FlowScaler(BaseScaler):\n    \"\"\"Flow Scaler.\n\n    Flow Scaler is a Scaler that scale data into the normal distribution.\n    This scaler uses a technique named \"flow\". By using this technique,\n    parametrized bijective function is learned to scale data that distributes\n    arbitrary continuous distribution into specified continuous distribution.\n    In this scaler, multi-layer perceptron whose weight is restricted into\n    positive range is used as parametrized bijective function.\n\n    Args:\n        hidden_num(int): number of units in hidden layer of multi-layer\n            perceptron.\n    \"\"\"\n\n    def __init__(self, hidden_num=20):\n        super(FlowScaler, self).__init__()\n\n        self.hidden_num = hidden_num\n\n        self.mean = None\n        self.register_persistent('mean')\n        self.std = None\n        self.register_persistent('std')\n        self.eps = numpy.float32(1e-6)\n        W_initializer = chainer.initializers.Normal(0.1)\n        with self.init_scope():\n            self.W1_ = chainer.Parameter(W_initializer)\n            self.b1 = chainer.Parameter(0)\n            self.W2_ = chainer.Parameter(W_initializer)\n            self.b2 = chainer.Parameter(0)\n\n    def _initialize_params(self, in_size):\n        self.W1_.initialize((self.hidden_num, 
in_size, 1, 1, 1, 1))\n        self.b1.initialize((self.hidden_num, in_size, 1))\n        self.W2_.initialize((1, in_size, 1, self.hidden_num, 1, 1))\n        self.b2.initialize((1, in_size, 1))\n\n    @property\n    def W1(self):\n        return chainer.functions.softplus(self.W1_)\n\n    @property\n    def W2(self):\n        return chainer.functions.softplus(self.W2_)\n\n    def _forward(self, x):\n        x = chainer.functions.expand_dims(x, axis=1)\n        x = chainer.functions.expand_dims(x, axis=3)\n        h = chainer.functions.local_convolution_2d(x, self.W1, self.b1)\n        h = chainer.functions.sigmoid(h)\n        h = chainer.functions.local_convolution_2d(h, self.W2, self.b2)\n        h = h[:, 0, :, 0]\n        return h\n\n    def _derivative(self, x):\n        x = chainer.functions.expand_dims(x, axis=1)\n        x = chainer.functions.expand_dims(x, axis=3)\n        h = chainer.functions.local_convolution_2d(x, self.W1, self.b1)\n        h = _sigmoid_derivative(h)\n        h = h * chainer.functions.expand_dims(self.W1[:, :, 0, 0, 0], axis=0)\n        h = chainer.functions.local_convolution_2d(h, self.W2)\n        h = h[:, 0, :, 0]\n        return h\n\n    def _loss(self, x):\n        # loss = -log(p(f(x))) - log|f'(x)|\n        x_nan = self.xp.isnan(x)\n        x_not_nan = self.xp.logical_not(x_nan)\n        x = self.xp.nan_to_num(x)\n        if not isinstance(x, chainer.Variable):\n            x = chainer.Variable(x.astype(numpy.float32))\n        y = self._forward(x)\n        gy = self._derivative(x)\n        # gy, = chainer.grad([y], [x], enable_double_backprop=True)\n        std_gaussian = chainer.distributions.Normal(\n            self.xp.zeros(shape=x.shape, dtype=numpy.float32),\n            self.xp.ones(shape=x.shape, dtype=numpy.float32))\n        loss = -std_gaussian.log_prob(y)\n        loss -= chainer.functions.log(abs(gy) + self.eps)\n        loss = chainer.functions.sum(loss[x_not_nan]) / x_not_nan.sum()\n        
chainer.reporter.report({'loss': loss}, self)\n        return loss\n\n    def fit(self, x, batch_size=100, iteration=3000):\n        \"\"\"Fitting parameter.\n\n        Args:\n            x(:class:`~chainer.Variable` or :ref:`ndarray`): data for learning.\n            batch_size(int): size of batch used for learning multi-layer\n                perceptron.\n            iteration(int): number of iteration.\n\n        Returns:\n            self (FlowScaler): this instance.\n        \"\"\"\n        if isinstance(x, chainer.Variable):\n            x = x.array\n\n        x = format_x(x)\n\n        self._initialize_params(x.shape[1])\n\n        xp = self.xp\n        if xp is numpy:\n            self.mean = xp.nanmean(x, axis=0)\n            self.std = xp.nanstd(x, axis=0)\n        else:\n            if int(xp.sum(xp.isnan(x))) > 0:\n                raise NotImplementedError(\n                    \"FlowScaling with nan value on GPU is not supported.\")\n            # cupy.nanmean, cupy.nanstd is not implemented yet.\n            self.mean = xp.mean(x, axis=0)\n            self.std = xp.std(x, axis=0)\n\n        x = (x - self.mean) / (self.std + self.eps)\n\n        optimizer = chainer.optimizers.Adam(0.3)\n        optimizer.setup(self)\n\n        train = chainer.datasets.TupleDataset(x)\n        train_iter = chainer.iterators.SerialIterator(train, batch_size)\n\n        updater = chainer.training.updaters.StandardUpdater(\n            train_iter, optimizer, loss_func=self._loss)\n\n        trainer = chainer.training.Trainer(\n            updater, (iteration, 'iteration'))\n        trainer.extend(chainer.training.extensions.LogReport(\n            trigger=(100, 'iteration')))\n        trainer.extend(chainer.training.extensions.PrintReport(\n            ['epoch', 'iteration', 'main/loss', 'elapsed_time']))\n\n        trainer.run()\n\n        return self\n\n    def transform(self, x, batch_size=100):\n        \"\"\"Transform.\n\n        Args:\n            
x(:class:`~chainer.Variable` or :ref:`ndarray`): data.\n            batch_size(int): size of batch used for learning multi-layer\n                perceptron.\n\n        Returns:\n            scaled_x(:class:`~chainer.Variable` or :ref:`ndarray`):\n                transformed data.\n        \"\"\"\n        if self.mean is None:\n            raise AttributeError('[Error] mean is None, call fit beforehand!')\n\n        x_ = format_x(x)\n        x_ = (x_ - self.mean) / (self.std + self.eps)\n\n        y = []\n        for i in range((len(x) - 1) // batch_size + 1):\n            y.append(self._forward(\n                x_[i*batch_size: (i+1)*batch_size]))\n\n        y = chainer.functions.concat(y, axis=0)\n\n        if x.ndim == 1:\n            y = y[:, 0]\n\n        if isinstance(x_, chainer.Variable):\n            return y\n        else:\n            return y.data\n"
  },
  {
    "path": "chainer_chemistry/links/scaler/max_abs_scaler.py",
    "content": "from logging import getLogger\n\nimport numpy\n\nfrom chainer import cuda, Variable  # NOQA\n\nfrom chainer_chemistry.links.scaler.base import BaseScaler, to_array  # NOQA\nfrom chainer_chemistry.links.array.shape_transformer_to_2d import ShapeTransformerTo2D  # NOQA\n\n\ndef format_x(x):\n    \"\"\"x may be array or Variable.\"\"\"\n    # currently, only consider the case x is 2-dim, (batchsize, feature)\n    if x.ndim == 1:\n        # Deal with as 1 feature with several samples.\n        x = x[:, None]\n    return x\n\n\nclass MaxAbsScaler(BaseScaler):\n\n    def __init__(self):\n        super(MaxAbsScaler, self).__init__()\n        self.indices = None\n        self.register_persistent('indices')\n        self.max_abs = None\n        self.register_persistent('max_abs')\n\n    def fit(self, x, indices=None, axis=1):\n        \"\"\"Fitting parameter.\n\n        Args:\n            x (numpy.ndarray or cupy.ndarray or Variable):\n            indices (list or tuple or None):\n                indices for applying standard scaling.\n            axis (int): axis to calculate mean & std.\n\n        Returns:\n            self (StandardScaler): this instance.\n        \"\"\"\n        x = to_array(x)\n        x = format_x(x)\n        x = ShapeTransformerTo2D(axis=axis).transform(x).array\n\n        if indices is None:\n            pass\n        elif isinstance(indices, (list, tuple)):\n            indices = numpy.asarray(indices)\n        self.indices = indices\n        if self.indices is not None:\n            x = x[:, self.indices]\n\n        xp = self.xp\n        if xp is numpy:\n            x = cuda.to_cpu(x)\n        else:\n            x = cuda.to_gpu(x)\n        self.max_abs = xp.nanmax(xp.abs(x), axis=0)\n\n        # result consistency check\n        if xp.sum(self.max_abs == 0) > 0:\n            logger = getLogger(__name__)\n            ind = numpy.argwhere(cuda.to_cpu(self.max_abs) == 0)[:, 0]\n            logger.warning('fit: max_abs was 0 at indices 
{}'.format(ind))\n        return self\n\n    def _compute_max_abs_all(self, input_dim):\n        if self.indices is None:\n            max_abs_all = self.xp.ones(input_dim, dtype=self.xp.float32)\n            max_abs_all[self.max_abs != 0] = self.max_abs[self.max_abs != 0]\n            return max_abs_all\n        else:\n            max_abs_all = self.xp.ones(input_dim, dtype=self.xp.float32)\n            non_zero_indices = self.indices[self.max_abs != 0]\n            max_abs_all[non_zero_indices] = self.max_abs[self.max_abs != 0]\n            return max_abs_all\n\n    def transform(self, x, axis=1):\n        is_array = not isinstance(x, Variable)\n        if self.max_abs is None:\n            raise AttributeError(\n                '[Error] max_abs is None, call fit beforehand!')\n        x = format_x(x)\n        shape_transformer = ShapeTransformerTo2D(axis=axis)\n        x = shape_transformer.transform(x)\n        max_abs_all = self._compute_max_abs_all(x.shape[1])\n        x = x / max_abs_all[None, :]\n        x = shape_transformer.inverse_transform(x)\n        if is_array:\n            x = x.array\n        return x\n\n    def inverse_transform(self, x, axis=1):\n        is_array = not isinstance(x, Variable)\n        if self.max_abs is None:\n            raise AttributeError(\n                '[Error] max_abs is None, call fit beforehand!')\n        x = format_x(x)\n        shape_transformer = ShapeTransformerTo2D(axis=axis)\n        x = shape_transformer.transform(x)\n        max_abs_all = self._compute_max_abs_all(x.shape[1])\n        x = x * max_abs_all[None, :]\n        x = shape_transformer.inverse_transform(x)\n        if is_array:\n            x = x.array\n        return x\n"
  },
  {
    "path": "chainer_chemistry/links/scaler/min_max_scaler.py",
    "content": "from logging import getLogger\n\nimport numpy\n\nfrom chainer import cuda, Variable  # NOQA\n\nfrom chainer_chemistry.links.scaler.base import BaseScaler, to_array  # NOQA\nfrom chainer_chemistry.links.array.shape_transformer_to_2d import ShapeTransformerTo2D  # NOQA\n\n\ndef format_x(x):\n    \"\"\"x may be array or Variable.\"\"\"\n    # currently, only consider the case x is 2-dim, (batchsize, feature)\n    if x.ndim == 1:\n        # Deal with as 1 feature with several samples.\n        x = x[:, None]\n    return x\n\n\nclass MinMaxScaler(BaseScaler):\n\n    def __init__(self):\n        super(MinMaxScaler, self).__init__()\n        self.indices = None\n        self.register_persistent('indices')\n        self.min = None\n        self.register_persistent('min')\n        self.max = None\n        self.register_persistent('max')\n\n    def fit(self, x, indices=None, axis=1):\n        \"\"\"Fitting parameter.\n\n        Args:\n            x (numpy.ndarray or cupy.ndarray or Variable):\n            indices (list or tuple or None):\n                indices for applying standard scaling.\n            axis (int): axis to calculate min & max.\n\n        Returns:\n            self (MinMaxScaler): this instance.\n        \"\"\"\n        x = to_array(x)\n        x = format_x(x)\n        x = ShapeTransformerTo2D(axis=axis).transform(x).array\n\n        if indices is None:\n            pass\n        elif isinstance(indices, (list, tuple)):\n            indices = numpy.asarray(indices)\n        self.indices = indices\n        if self.indices is not None:\n            x = x[:, self.indices]\n\n        xp = self.xp\n        if xp is numpy:\n            x = cuda.to_cpu(x)\n        else:\n            x = cuda.to_gpu(x)\n        self.min = xp.nanmin(x, axis=0)\n        self.max = xp.nanmax(x, axis=0)\n\n        # result consistency check\n        if xp.sum(self.max - self.min == 0) > 0:\n            logger = getLogger(__name__)\n            ind = 
numpy.argwhere(cuda.to_cpu(self.max-self.min) == 0)[:, 0]\n            logger.warning('fit: max-min was 0 at indices {}'.format(ind))\n        return self\n\n    def _compute_min_diff_all(self, input_dim):\n        diff = self.max - self.min\n        diff_nonzero_indices = diff != 0\n        if self.indices is None:\n            diff_all = self.xp.ones(input_dim, dtype=self.xp.float32)\n            diff_all[diff_nonzero_indices] = diff[diff_nonzero_indices]\n            return self.min, diff_all\n        else:\n            min_all = self.xp.zeros(input_dim, dtype=self.xp.float32)\n            min_all[self.indices] = self.min\n            diff_all = self.xp.ones(input_dim, dtype=self.xp.float32)\n            non_zero_indices = self.indices[diff_nonzero_indices]\n            diff_all[non_zero_indices] = diff[diff_nonzero_indices]\n            return min_all, diff_all\n\n    def transform(self, x, axis=1):\n        is_array = not isinstance(x, Variable)\n        if self.min is None:\n            raise AttributeError(\n                '[Error] min is None, call fit beforehand!')\n        x = format_x(x)\n        shape_transformer = ShapeTransformerTo2D(axis=axis)\n        x = shape_transformer.transform(x)\n        min_all, diff_all = self._compute_min_diff_all(x.shape[1])\n        x = (x - min_all[None, :]) / diff_all[None, :]\n        x = shape_transformer.inverse_transform(x)\n        if is_array:\n            x = x.array\n        return x\n\n    def inverse_transform(self, x, axis=1):\n        is_array = not isinstance(x, Variable)\n        if self.min is None:\n            raise AttributeError(\n                '[Error] min is None, call fit beforehand!')\n        x = format_x(x)\n        shape_transformer = ShapeTransformerTo2D(axis=axis)\n        x = shape_transformer.transform(x)\n\n        min_all, diff_all = self._compute_min_diff_all(x.shape[1])\n        x = x * diff_all[None, :] + min_all[None, :]\n\n        x = shape_transformer.inverse_transform(x)\n      
  if is_array:\n            x = x.array\n        return x\n"
  },
  {
    "path": "chainer_chemistry/links/scaler/standard_scaler.py",
    "content": "from logging import getLogger\n\nimport numpy\n\nfrom chainer import cuda, Variable  # NOQA\n\nfrom chainer_chemistry.links.scaler.base import BaseScaler, to_array  # NOQA\nfrom chainer_chemistry.links.array.shape_transformer_to_2d import ShapeTransformerTo2D  # NOQA\n\n\ndef format_x(x):\n    \"\"\"x may be array or Variable.\"\"\"\n    # currently, only consider the case x is 2-dim, (batchsize, feature)\n    if x.ndim == 1:\n        # Deal with as 1 feature with several samples.\n        x = x[:, None]\n    return x\n\n\nclass StandardScaler(BaseScaler):\n\n    def __init__(self):\n        super(StandardScaler, self).__init__()\n        self.indices = None\n        self.register_persistent('indices')\n        self.mean = None\n        self.register_persistent('mean')\n        self.std = None\n        self.register_persistent('std')\n\n    def fit(self, x, indices=None, axis=1):\n        \"\"\"Fitting parameter.\n\n        Args:\n            x (numpy.ndarray or cupy.ndarray or Variable):\n            indices (list or tuple or None):\n                indices for applying standard scaling.\n            axis (int): axis to calculate mean & std.\n\n        Returns:\n            self (StandardScaler): this instance.\n        \"\"\"\n        x = to_array(x)\n        x = format_x(x)\n        x = ShapeTransformerTo2D(axis=axis).transform(x).array\n\n        if indices is None:\n            pass\n        elif isinstance(indices, (list, tuple)):\n            indices = numpy.asarray(indices)\n        self.indices = indices\n        if self.indices is not None:\n            x = x[:, self.indices]\n\n        xp = self.xp\n        if xp is numpy:\n            x = cuda.to_cpu(x)\n            self.mean = xp.nanmean(x, axis=0)\n            self.std = xp.nanstd(x, axis=0)\n        else:\n            x = cuda.to_gpu(x)\n            if int(xp.sum(xp.isnan(x))) > 0:\n                raise NotImplementedError(\n                    \"StandardScaling with nan value on GPU 
is not supported.\")\n            # cupy.nanmean, cupy.nanstd is not implemented yet.\n            self.mean = xp.mean(x, axis=0)\n            self.std = xp.std(x, axis=0)\n\n        # result consistency check\n        if xp.sum(self.std == 0) > 0:\n            logger = getLogger(__name__)\n            ind = numpy.argwhere(cuda.to_cpu(self.std) == 0)[:, 0]\n            logger.warning('fit: std was 0 at indices {}'.format(ind))\n        return self\n\n    def _compute_mean_std_all(self, input_dim):\n        if self.indices is None:\n            std_all = self.xp.ones(input_dim, dtype=self.xp.float32)\n            std_all[self.std != 0] = self.std[self.std != 0]\n            return self.mean, std_all\n        else:\n            mean_all = self.xp.zeros(input_dim, dtype=self.xp.float32)\n            mean_all[self.indices] = self.mean\n            std_all = self.xp.ones(input_dim, dtype=self.xp.float32)\n            non_zero_indices = self.indices[self.std != 0]\n            std_all[non_zero_indices] = self.std[self.std != 0]\n            return mean_all, std_all\n\n    def transform(self, x, axis=1):\n        is_array = not isinstance(x, Variable)\n        if self.mean is None:\n            raise AttributeError('[Error] mean is None, call fit beforehand!')\n        x = format_x(x)\n        shape_transformer = ShapeTransformerTo2D(axis=axis)\n        x = shape_transformer.transform(x)\n        mean_all, std_all = self._compute_mean_std_all(x.shape[1])\n        x = (x - mean_all[None, :]) / std_all[None, :]\n        x = shape_transformer.inverse_transform(x)\n        if is_array:\n            x = x.array\n        return x\n\n    def inverse_transform(self, x, axis=1):\n        is_array = not isinstance(x, Variable)\n        if self.mean is None:\n            raise AttributeError('[Error] mean is None, call fit beforehand!')\n        x = format_x(x)\n        shape_transformer = ShapeTransformerTo2D(axis=axis)\n        x = shape_transformer.transform(x)\n        mean_all, 
std_all = self._compute_mean_std_all(x.shape[1])\n        x = x * std_all[None, :] + mean_all[None, :]\n        x = shape_transformer.inverse_transform(x)\n        if is_array:\n            x = x.array\n        return x\n"
  },
  {
    "path": "chainer_chemistry/links/update/__init__.py",
    "content": ""
  },
  {
    "path": "chainer_chemistry/links/update/cgcnn_update.py",
    "content": "import chainer\nfrom chainer import links, functions  # NOQA\n\n\nclass CGCNNUpdate(chainer.Chain):\n    \"\"\"Update submodule for CGCNN\n\n    Args:\n        n_site_features (int): hidden dimension of atom feature vector.\n            This value must be the same as n_site_feat.\n    \"\"\"\n\n    def __init__(self, n_site_features=64):\n        super(CGCNNUpdate, self).__init__()\n        with self.init_scope():\n            self.fc = links.Linear(None, 2*n_site_features)\n            self.bn1 = links.BatchNormalization(2*n_site_features)\n            self.bn2 = links.BatchNormalization(n_site_features)\n\n    def __call__(self, site_feat, nbr_feat, nbr_feat_idx):\n        n_site, n_nbr, n_nbr_feat = nbr_feat.shape\n        _, n_site_feat = site_feat.shape\n        site_nbr_feat = site_feat[nbr_feat_idx]\n        total_feat = functions.concat([\n            functions.broadcast_to(site_feat[:, None, :],\n                                   (n_site, n_nbr, n_site_feat)),\n            site_nbr_feat,\n            nbr_feat\n        ], axis=2)\n\n        total_feat = self.fc(total_feat.reshape(\n            n_site*n_nbr, 2*n_site_feat+n_nbr_feat))\n        total_feat = self.bn1(total_feat).reshape(n_site, n_nbr, 2*n_site_feat)\n        feat_gate, feat_core = functions.split_axis(total_feat, 2, axis=-1)\n        feat_gate = functions.sigmoid(feat_gate)\n        feat_core = functions.softplus(feat_core)\n        feat_sum = functions.sum(feat_gate * feat_core, axis=1)\n        feat_sum = self.bn2(feat_sum)\n        out = functions.softplus(site_feat + feat_sum)\n        return out\n"
  },
  {
    "path": "chainer_chemistry/links/update/ggnn_update.py",
    "content": "import chainer\nfrom chainer import functions\nfrom chainer import links\n\nimport chainer_chemistry\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\nfrom chainer_chemistry.utils import is_sparse\n\n\nclass GGNNUpdate(chainer.Chain):\n    \"\"\"GGNN submodule for update part.\n\n    Args:\n        in_channels (int or None): input dim of feature vector for each node\n        hidden_channels (int): dimension of feature vector for each node\n        out_channels (int or None): output dime of feature vector for each node\n            When `None`, `hidden_channels` is used.\n        n_edge_types (int): number of types of edge\n    \"\"\"\n\n    def __init__(self, in_channels=None, hidden_channels=16,\n                 out_channels=None, n_edge_types=4, **kwargs):\n        if out_channels is None:\n            out_channels = hidden_channels\n        super(GGNNUpdate, self).__init__()\n        if in_channels is None:\n            gru_in_channels = None\n        else:\n            gru_in_channels = in_channels + hidden_channels\n        with self.init_scope():\n            self.graph_linear = GraphLinear(\n                in_channels, n_edge_types * hidden_channels)\n            self.update_layer = links.GRU(gru_in_channels, out_channels)\n        self.n_edge_types = n_edge_types\n        self.in_channels = in_channels\n        self.hidden_channels = hidden_channels\n        self.out_channels = out_channels\n\n    def __call__(self, h, adj, **kwargs):\n        hidden_ch = self.hidden_channels\n        # --- Message part ---\n        mb, atom, in_ch = h.shape\n        m = functions.reshape(self.graph_linear(h),\n                              (mb, atom, hidden_ch, self.n_edge_types))\n        # m: (minibatch, atom, ch, edge_type)\n        # Transpose\n        m = functions.transpose(m, (0, 3, 1, 2))\n        # m: (minibatch, edge_type, atom, ch)\n\n        # (minibatch * edge_type, atom, out_ch)\n        m = functions.reshape(m, (mb * 
self.n_edge_types, atom, hidden_ch))\n\n        if is_sparse(adj):\n            m = functions.sparse_matmul(adj, m)\n        else:\n            adj = functions.reshape(adj, (mb * self.n_edge_types, atom, atom))\n            m = chainer_chemistry.functions.matmul(adj, m)\n\n        # (minibatch * edge_type, atom, out_ch)\n        m = functions.reshape(m, (mb, self.n_edge_types, atom, hidden_ch))\n        m = functions.sum(m, axis=1)\n        # (minibatch, atom, out_ch)\n\n        # --- Update part ---\n        # Contraction\n        h = functions.reshape(h, (mb * atom, in_ch))\n\n        # Contraction\n        m = functions.reshape(m, (mb * atom, hidden_ch))\n\n        out_h = self.update_layer(functions.concat((h, m), axis=1))\n        # Expansion\n        out_h = functions.reshape(out_h, (mb, atom, self.out_channels))\n        return out_h\n\n    def reset_state(self):\n        self.update_layer.reset_state()\n"
  },
  {
    "path": "chainer_chemistry/links/update/gin_update.py",
    "content": "import chainer\nfrom chainer import functions\n\nimport chainer_chemistry\nfrom chainer_chemistry.links import GraphMLP\n\n\nclass GINUpdate(chainer.Chain):\n    r\"\"\"GIN submodule for update part.\n\n    Simplest implementation of Graph Isomorphism Network (GIN):\n    N-layered MLP + ReLU\n    No learnable epsilon\n\n    Batch Normalization is not implemented. instead we use dropout\n\n    # TODO: implement Batch Normalization inside GraphMLP\n    # Linear -> BN -> relu is used.\n\n    See: Xu, Hu, Leskovec, and Jegelka, \\\n        \"How powerful are graph neural networks?\", in ICLR 2019.\n\n    Args:\n        in_channels (int or None): input dim of feature vector for each node\n        hidden_channels (int): dimension of feature vector for each node\n        out_channels (int or None): output dime of feature vector for each node\n            When `None`, `hidden_channels` is used.\n        dropout_ratio (float): ratio of dropout, instead of batch normalization\n        n_layers (int): layers used in `GraphMLP`\n    \"\"\"\n\n    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None,\n                 dropout_ratio=0.5, n_layers=2, **kwargs):\n        if out_channels is None:\n            out_channels = hidden_channels\n        super(GINUpdate, self).__init__()\n        channels = [hidden_channels] * (n_layers - 1) + [out_channels]\n        with self.init_scope():\n            # two Linear + RELU\n            self.graph_mlp = GraphMLP(\n                channels=channels, in_channels=in_channels,\n                activation=functions.relu)\n        self.dropout_ratio = dropout_ratio\n\n    def __call__(self, h, adj, **kwargs):\n        \"\"\"Describing a layer.\n\n        Args:\n            h (numpy.ndarray): minibatch by num_nodes by hidden_dim\n                numpy array. 
local node hidden states\n            adj (numpy.ndarray): minibatch by num_nodes by num_nodes 1/0 array.\n                Adjacency matrices over several bond types\n\n        Returns:\n            updated h\n        \"\"\"\n        # Support for one graph (node classification task)\n        if h.ndim == 2:\n            h = h[None]\n\n        # (minibatch, atom, ch)\n        mb, atom, ch = h.shape\n\n        # --- Message part ---\n        if isinstance(adj, chainer.utils.CooMatrix):\n            # coo pattern\n            # Support for one graph\n            if adj.data.ndim == 1:\n                adj.data = adj.data[None]\n                adj.col = adj.col[None]\n                adj.row = adj.row[None]\n            fv = functions.sparse_matmul(adj, h)\n        else:\n            # padding pattern\n            # adj (mb, atom, atom)\n            # fv   (minibatch, atom, ch)\n            fv = chainer_chemistry.functions.matmul(adj, h)\n            assert (fv.shape == (mb, atom, ch))\n\n        # sum myself\n        sum_h = fv + h\n        assert (sum_h.shape == (mb, atom, ch))\n\n        # apply MLP\n        new_h = self.graph_mlp(sum_h)\n        new_h = functions.relu(new_h)\n        if self.dropout_ratio > 0.0:\n            new_h = functions.dropout(new_h, ratio=self.dropout_ratio)\n        return new_h\n\n\nclass GINSparseUpdate(chainer.Chain):\n    \"\"\"sparse GIN submodule for update part\"\"\"\n\n    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None,\n                 dropout_ratio=0.5, n_layers=2, **kwargs):\n        # To avoid circular reference\n        from chainer_chemistry.models.mlp import MLP\n\n        if out_channels is None:\n            out_channels = hidden_channels\n        super(GINSparseUpdate, self).__init__()\n        with self.init_scope():\n            self.mlp = MLP(\n                out_dim=out_channels, hidden_dim=hidden_channels,\n                n_layers=n_layers, activation=functions.relu\n            )\n   
     self.dropout_ratio = dropout_ratio\n\n    def __call__(self, h, edge_index):\n        # add self node feature\n        new_h = h\n        messages = h[edge_index[0]]\n        new_h = functions.scatter_add(new_h, edge_index[1], messages)\n        # apply MLP\n        new_h = self.mlp(new_h)\n        if self.dropout_ratio > 0.0:\n            new_h = functions.dropout(new_h, ratio=self.dropout_ratio)\n        return new_h\n"
  },
  {
    "path": "chainer_chemistry/links/update/gnn_film_update.py",
    "content": "import chainer\nfrom chainer import functions\nfrom chainer import links\n\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\n\n\nclass GNNFiLMUpdate(chainer.Chain):\n    \"\"\"GNNFiLM submodule for update part.\n\n    Args:\n        hidden_channels (int): dimension of feature vector associated to\n            each atom\n        n_edge_types (int): number of types of edge\n    \"\"\"\n\n    def __init__(self, hidden_channels=16, n_edge_types=5,\n                 activation=functions.relu):\n        super(GNNFiLMUpdate, self).__init__()\n        self.n_edge_types = n_edge_types\n        self.activation = activation\n        with self.init_scope():\n            self.W_linear = GraphLinear(\n                in_size=None, out_size=self.n_edge_types * hidden_channels,\n                nobias=True)  # W_l in eq. (6)\n            self.W_g = GraphLinear(\n                in_size=None, out_size=self.n_edge_types * hidden_channels * 2,\n                nobias=True)  # g in eq. (6)\n            self.norm_layer = links.LayerNormalization()  # l in eq. 
(6)\n\n    def forward(self, h, adj):\n        # --- Message part ---\n\n        xp = self.xp\n        mb, atom, ch = h.shape\n        newshape = adj.shape + (ch, )\n        adj = functions.broadcast_to(adj[:, :, :, :, xp.newaxis], newshape)\n        messages = functions.reshape(self.W_linear(h),\n                                     (mb, atom, ch, self.n_edge_types))\n        messages = functions.transpose(messages, (3, 0, 1, 2))\n        film_weights = functions.reshape(self.W_g(h),\n                                         (mb, atom, 2 * ch, self.n_edge_types))\n        film_weights = functions.transpose(film_weights, (3, 0, 1, 2))\n        # (n_edge_types, minibatch, atom, out_ch)\n        gamma = film_weights[:, :, :, :ch]\n        # (n_edge_types, minibatch, atom, out_ch)\n        beta = film_weights[:, :, :, ch:]\n\n        # --- Update part ---\n\n        messages = functions.expand_dims(\n            gamma, axis=3) * functions.expand_dims(\n            messages, axis=2) + functions.expand_dims(beta, axis=3)\n        messages = self.activation(messages)\n        # (minibatch, n_edge_types, atom, atom, out_ch)\n        messages = functions.transpose(messages, (1, 0, 2, 3, 4))\n        messages = adj * messages\n        messages = functions.sum(messages, axis=3)  # sum across atoms\n        messages = functions.sum(messages, axis=1)  # sum across n_edge_types\n        messages = functions.reshape(messages, (mb * atom, ch))\n        messages = self.norm_layer(messages)\n        messages = functions.reshape(messages, (mb, atom, ch))\n        return messages\n"
  },
  {
    "path": "chainer_chemistry/links/update/megnet_update.py",
    "content": "import chainer\nfrom chainer import functions, links  # NOQA\n\n\nfrom chainer_chemistry.functions import megnet_softplus\n\n\nclass DenseLayer(chainer.Chain):\n    def __init__(self, hidden_dim=[64, 32], activation=megnet_softplus):\n        super(DenseLayer, self).__init__()\n        self.n_layers = len(hidden_dim)\n        self.activation = activation\n        with self.init_scope():\n            self.update_layer = chainer.ChainList(\n                *[links.Linear(None, hidden_dim[i])\n                  for i in range(self.n_layers)])\n\n    def __call__(self, v):\n        for i in range(self.n_layers):\n            v = self.activation(self.update_layer[i](v))\n        return v\n\n\nclass UpdateLayer(chainer.Chain):\n    def __init__(self, hidden_dim=[64, 64, 32], activation=megnet_softplus):\n        super(UpdateLayer, self).__init__()\n        self.n_layers = len(hidden_dim)\n        self.activation = activation\n        with self.init_scope():\n            self.update_layer = chainer.ChainList(\n                *[links.Linear(None, hidden_dim[i])\n                    for i in range(self.n_layers)])\n\n    def __call__(self, v):\n        for i in range(self.n_layers):\n            v = self.update_layer[i](v)\n            # doesn't pass the activation at the last layer\n            if i != (self.n_layers-1):\n                v = self.activation(v)\n        return v\n\n\ndef get_mean_feat(feat, idx, out_shape, xp):\n    \"\"\"Return mean node or edge feature in each graph.\n\n    This method is the same as average pooling\n    about node or edge feature in each graph.\n    \"\"\"\n    zero = xp.zeros(out_shape, dtype=xp.float32)\n    sum_vec = functions.scatter_add(zero, idx, feat)\n    one = xp.ones(feat.shape, dtype=xp.float32)\n    degree = functions.scatter_add(zero, idx, one)\n    return sum_vec / degree\n\n\nclass MEGNetUpdate(chainer.Chain):\n    \"\"\"Update submodule for MEGNet\n\n    Args:\n        dim_for_dense (list): dimension list 
of dense layer\n        dim_for_update (list): dimension list of update layer\n        dropout_ratio (float): ratio of dropout\n        activation (~chainer.Function or ~chainer.FunctionNode):\n            activate function for megnet model\n            `megnet_softplus` was used in original paper.\n        skip_intermediate (bool): When `True`, intermediate feature after dense\n            calculation is used for skip connection. When `False`, input\n            feature is used for skip connection.\n            It is `True` for first layer, and `False` for other layer in the\n            original paper.\n    \"\"\"\n\n    def __init__(self, dim_for_dense=[64, 32], dim_for_update=[64, 64, 32],\n                 dropout_ratio=-1, activation=megnet_softplus,\n                 skip_intermediate=True):\n        super(MEGNetUpdate, self).__init__()\n        if len(dim_for_dense) != 2:\n            raise ValueError('dim_for_dense must have 2 elements')\n\n        if len(dim_for_update) != 3:\n            raise ValueError('dim_for_update must have 3 elements')\n\n        self.dropout_ratio = dropout_ratio\n        with self.init_scope():\n            # for dense layer\n            self.dense_for_atom = DenseLayer(dim_for_dense, activation)\n            self.dense_for_pair = DenseLayer(dim_for_dense, activation)\n            self.dense_for_global = DenseLayer(dim_for_dense, activation)\n            # for update layer\n            self.update_for_atom = UpdateLayer(dim_for_update, activation)\n            self.update_for_pair = UpdateLayer(dim_for_update, activation)\n            self.update_for_global = UpdateLayer(dim_for_update, activation)\n        self.skip_intermediate = skip_intermediate\n\n    def __call__(self, atoms_feat, pair_feat, global_feat,\n                 atom_idx, pair_idx, start_idx, end_idx):\n        # 1) Pass the Dense layer\n        a_f_d = self.dense_for_atom(atoms_feat)\n        p_f_d = self.dense_for_pair(pair_feat)\n        g_f_d = 
self.dense_for_global(global_feat)\n\n        # 2) Update the edge vector\n        start_node = a_f_d[start_idx]\n        end_node = a_f_d[end_idx]\n        g_f_extend_with_pair_idx = g_f_d[pair_idx]\n        concat_p_v = functions.concat((p_f_d, start_node, end_node,\n                                       g_f_extend_with_pair_idx))\n        update_p = self.update_for_pair(concat_p_v)\n\n        # 3) Update the node vector\n        # 1. get sum edge feature of all nodes using scatter_add method\n        zero = self.xp.zeros(a_f_d.shape, dtype=self.xp.float32)\n        sum_edeg_vec = functions.scatter_add(zero, start_idx, update_p) + \\\n            functions.scatter_add(zero, end_idx, update_p)\n        # 2. get degree of all nodes using scatter_add method\n        one = self.xp.ones(p_f_d.shape, dtype=self.xp.float32)\n        degree = functions.scatter_add(zero, start_idx, one) + \\\n            functions.scatter_add(zero, end_idx, one)\n        # 3. get mean edge feature of all nodes\n        mean_edge_vec = sum_edeg_vec / degree\n        # 4. 
concating\n        g_f_extend_with_atom_idx = g_f_d[atom_idx]\n        concat_a_v = functions.concat((a_f_d, mean_edge_vec,\n                                       g_f_extend_with_atom_idx))\n        update_a = self.update_for_atom(concat_a_v)\n\n        # 4) Update the global vector\n        out_shape = g_f_d.shape\n        ave_p = get_mean_feat(update_p, pair_idx, out_shape, self.xp)\n        ave_a = get_mean_feat(update_a, atom_idx, out_shape, self.xp)\n        concat_g_v = functions.concat((ave_a, ave_p, g_f_d), axis=1)\n        update_g = self.update_for_global(concat_g_v)\n\n        # 5) Skip connection\n        if self.skip_intermediate:\n            # Skip intermediate feature, used for first layer.\n            new_a_f = update_a + a_f_d\n            new_p_f = update_p + p_f_d\n            new_g_f = update_g + g_f_d\n        else:\n            # Skip input feature, used all layer except first layer.\n            # input feature must be same dimension with updated feature.\n            new_a_f = update_a + atoms_feat\n            new_p_f = update_p + pair_feat\n            new_g_f = update_g + global_feat\n\n        # 6) dropout\n        if self.dropout_ratio > 0.0:\n            new_a_f = functions.dropout(new_a_f, ratio=self.dropout_ratio)\n            new_p_f = functions.dropout(new_p_f, ratio=self.dropout_ratio)\n            new_g_f = functions.dropout(new_g_f, ratio=self.dropout_ratio)\n\n        return new_a_f, new_p_f, new_g_f\n"
  },
  {
    "path": "chainer_chemistry/links/update/mpnn_update.py",
    "content": "import chainer\nfrom chainer import functions\nfrom chainer import links\n\nimport chainer_chemistry\n\n\nclass MPNNUpdate(chainer.Chain):\n    r\"\"\"MPNN submodule for update part.\n\n    See: Justin Gilmer+, \\\n        Neural Message Passing for Quantum Chemistry. April 2017.\n        `arXiv:1704.01212 <https://arxiv.org/abs/1704.01212>`\n\n    Args:\n        in_channels (int or None): input dim of feature vector for each node\n        hidden_channels (int): dimension of feature vector for each node\n        out_channels (int or None): output dime of feature vector for each node\n            When `None`, `hidden_channels` is used.\n        nn (~chainer.Link):\n\n    \"\"\"\n\n    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None,\n                 nn=None, **kwargs):\n        if out_channels is None:\n            out_channels = hidden_channels\n        if in_channels is None:\n            # Current `EdgeNet` hidden_channels must be same with input `h` dim\n            in_channels = out_channels\n        super(MPNNUpdate, self).__init__()\n        with self.init_scope():\n            self.message_layer = EdgeNet(out_channels=hidden_channels, nn=nn)\n            self.update_layer = links.GRU(2 * hidden_channels, out_channels)\n        self.in_channels = in_channels  # currently it is not used...\n        self.hidden_channels = hidden_channels\n        self.out_channels = out_channels\n        self.nn = nn\n\n    def __call__(self, h, adj, **kwargs):\n        # type: (chainer.Variable, chainer.Variable) -> chainer.Variable\n        # adj: (mb, edge_type, node, node)\n        mb, node, ch = h.shape\n        h = self.message_layer(h, adj)  # h: (mb, node, hidden_dim*2)\n        h = functions.reshape(h, (mb * node, self.hidden_channels * 2))\n        h = self.update_layer(h)  # h: (mb*node, hidden_dim)\n        h = functions.reshape(h, (mb, node, self.out_channels))\n        return h\n\n    def reset_state(self):\n        
self.update_layer.reset_state()\n\n\nclass EdgeNet(chainer.Chain):\n    \"\"\"MPNN submodule for message part.\n\n    Edge Network expands edge vector dimension to (d x d) matrix.\n    If undirected graph, adj_in and adj_out are same.\n\n    Args:\n        out_channels (int): dimension of output feature vector\n            Currently, it must be same with input dimension.\n        nn (~chainer.Link):\n    \"\"\"\n\n    def __init__(self, out_channels, nn=None):\n        # type: (int, chainer.Link) -> None\n        super(EdgeNet, self).__init__()\n        if nn is None:\n            from chainer_chemistry.models.mlp import MLP\n            nn = MLP(out_dim=out_channels**2, hidden_dim=16)\n        if not isinstance(nn, chainer.Link):\n            raise ValueError('nn {} must be chainer.Link'.format(nn))\n        with self.init_scope():\n            self.nn_layer_in = nn\n            self.nn_layer_out = nn\n        self.out_channels = out_channels\n\n    def __call__(self, h, adj):\n        # type: (chainer.Variable, chainer.Variable) -> chainer.Variable\n        mb, node, ch = h.shape\n        if ch != self.out_channels:\n            raise ValueError('hidden_channels must be equal to dimension '\n                             'of feature vector associated to each atom, '\n                             '{}, but it was set to {}'.format(\n                                 ch, self.out_channels))\n        # adj: (mb, edge_type, node, node)\n        edge_type = adj.shape[1]\n        adj_in = adj\n        adj_out = functions.transpose(adj, axes=(0, 1, 3, 2))\n\n        # expand edge vector to matrix\n        adj_in = functions.reshape(adj_in, (-1, edge_type))\n        # adj_in: (mb*node*node, edge_type)\n        adj_in = self.nn_layer_in(adj_in)\n        # adj_in: (mb*node*node, out_ch*out_ch)\n        adj_in = functions.reshape(adj_in, (mb, node, node, ch, ch))\n        adj_in = functions.reshape(\n            functions.transpose(adj_in, axes=(0, 1, 3, 2, 4)),\n            
(mb, node * ch, node * ch))\n\n        adj_out = functions.reshape(adj_out, (-1, edge_type))\n        # adj_out: (mb*node*node, edge_type)\n        adj_out = self.nn_layer_out(adj_out)\n        # adj_out: (mb*node*node, out_ch*out_ch)\n        adj_out = functions.reshape(adj_out, (mb, node, node, ch, ch))\n        adj_out = functions.reshape(\n            functions.transpose(adj_out, axes=(0, 1, 3, 2, 4)),\n            (mb, node * ch, node * ch))\n\n        # calculate message\n        h = functions.reshape(h, (mb, node * ch, 1))\n        message_in = chainer_chemistry.functions.matmul(adj_in, h)\n        # message_in: (mb, node*ch, 1)\n        message_in = functions.reshape(message_in, (mb, node, ch))\n        # message_in: (mb, node, out_ch)\n        message_out = chainer_chemistry.functions.matmul(adj_out, h)\n        # message_out: (mb, node*ch, 1)\n        message_out = functions.reshape(message_out, (mb, node, ch))\n        message = functions.concat([message_in, message_out], axis=2)\n        return message  # message: (mb, node, out_ch * 2)\n"
  },
  {
    "path": "chainer_chemistry/links/update/nfp_update.py",
    "content": "import chainer\nfrom chainer import functions\nimport numpy\n\nimport chainer_chemistry\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\n\n\nclass NFPUpdate(chainer.Chain):\n    \"\"\"NFP submodule for update part.\n\n    Args:\n        in_channels (int or None): input channel dimension\n        out_channels (int): output channel dimension\n        max_degree (int): max degree of edge\n    \"\"\"\n\n    def __init__(self, in_channels, out_channels, max_degree=6,\n                 **kwargs):\n        super(NFPUpdate, self).__init__()\n        num_degree_type = max_degree + 1\n        with self.init_scope():\n            self.graph_linears = chainer.ChainList(\n                *[GraphLinear(in_channels, out_channels)\n                  for _ in range(num_degree_type)])\n        self.max_degree = max_degree\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n\n    def __call__(self, h, adj, deg_conds):\n        # h: (minibatch, atom, ch)\n        # h encodes each atom's info in ch axis of size hidden_dim\n        # adjs: (minibatch, atom, atom)\n\n        # --- Message part ---\n        # Take sum along adjacent atoms\n\n        # fv: (minibatch, atom, ch)\n        fv = chainer_chemistry.functions.matmul(adj, h)\n\n        # --- Update part ---\n        # TODO(nakago): self.xp is chainerx\n        if self.xp is numpy:\n            zero_array = numpy.zeros(fv.shape, dtype=numpy.float32)\n        else:\n            zero_array = self.xp.zeros_like(fv.array)\n\n        fvds = [functions.where(cond, fv, zero_array) for cond in deg_conds]\n\n        out_h = 0\n        for graph_linear, fvd in zip(self.graph_linears, fvds):\n            out_h = out_h + graph_linear(fvd)\n\n        # out_h shape (minibatch, max_num_atoms, hidden_dim)\n        out_h = functions.sigmoid(out_h)\n        return out_h\n"
  },
  {
    "path": "chainer_chemistry/links/update/relgat_update.py",
    "content": "import chainer\nfrom chainer import functions\n\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\n\n\nclass RelGATUpdate(chainer.Chain):\n    \"\"\"RelGAT submodule for update part.\n\n    Args:\n        in_channels (int or None): dimension of input feature vector\n        out_channels (int): dimension of output feature vector\n        n_heads (int): number of multi-head-attentions.\n        n_edge_types (int): number of edge types.\n        dropout_ratio (float): dropout ratio of the normalized attention\n            coefficients\n        negative_slope (float): LeakyRELU angle of the negative slope\n        softmax_mode (str): take the softmax over the logits 'across' or\n            'within' relation. If you would like to know the detail discussion,\n            please refer Relational GAT paper.\n        concat_heads (bool) : Whether to concat or average multi-head\n            attentions\n    \"\"\"\n    def __init__(self, in_channels, out_channels, n_heads=3, n_edge_types=4,\n                 dropout_ratio=-1., negative_slope=0.2, softmax_mode='across',\n                 concat_heads=False):\n        super(RelGATUpdate, self).__init__()\n        with self.init_scope():\n            self.message_layer = GraphLinear(\n                in_channels, out_channels * n_edge_types * n_heads)\n            self.attention_layer = GraphLinear(out_channels * 2, 1)\n\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n        self.n_heads = n_heads\n        self.n_edge_types = n_edge_types\n        self.dropout_ratio = dropout_ratio\n        self.softmax_mode = softmax_mode\n        self.concat_heads = concat_heads\n        self.negative_slope = negative_slope\n\n    def __call__(self, h, adj, **kwargs):\n        xp = self.xp\n        # (minibatch, atom, channel)\n        mb, atom, ch = h.shape\n        # (minibatch, atom, EDGE_TYPE * heads * out_dim)\n        h = self.message_layer(h)\n        # 
(minibatch, atom, EDGE_TYPE, heads, out_dim)\n        h = functions.reshape(h, (mb, atom, self.n_edge_types, self.n_heads,\n                                  self.out_channels))\n        # concat all pairs of atom\n        # (minibatch, 1, atom, heads, out_dim)\n        h_i = functions.reshape(h, (mb, 1, atom, self.n_edge_types,\n                                    self.n_heads, self.out_channels))\n        # (minibatch, atom, atom, heads, out_dim)\n        h_i = functions.broadcast_to(h_i, (mb, atom, atom, self.n_edge_types,\n                                           self.n_heads, self.out_channels))\n\n        # (minibatch, atom, 1, EDGE_TYPE, heads, out_dim)\n        h_j = functions.reshape(h, (mb, atom, 1, self.n_edge_types,\n                                    self.n_heads, self.out_channels))\n        # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim)\n        h_j = functions.broadcast_to(h_j, (mb, atom, atom, self.n_edge_types,\n                                           self.n_heads, self.out_channels))\n\n        # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim * 2)\n        e = functions.concat([h_i, h_j], axis=5)\n\n        # (minibatch, EDGE_TYPE, heads, atom, atom, out_dim * 2)\n        e = functions.transpose(e, (0, 3, 4, 1, 2, 5))\n        # (minibatch * EDGE_TYPE * heads, atom * atom, out_dim * 2)\n        e = functions.reshape(e, (mb * self.n_edge_types * self.n_heads,\n                                  atom * atom, self.out_channels * 2))\n        # (minibatch * EDGE_TYPE * heads, atom * atom, 1)\n        e = self.attention_layer(e)\n\n        # (minibatch, EDGE_TYPE, heads, atom, atom)\n        e = functions.reshape(e, (mb, self.n_edge_types, self.n_heads, atom,\n                                  atom))\n        e = functions.leaky_relu(e, self.negative_slope)\n\n        # (minibatch, EDGE_TYPE, atom, atom)\n        if isinstance(adj, chainer.Variable):\n            cond = adj.array.astype(xp.bool)\n        else:\n            cond = 
adj.astype(xp.bool)\n        # (minibatch, EDGE_TYPE, 1, atom, atom)\n        cond = xp.reshape(cond, (mb, self.n_edge_types, 1, atom, atom))\n        # (minibatch, EDGE_TYPE, heads, atom, atom)\n        cond = xp.broadcast_to(cond, e.array.shape)\n        # TODO(mottodora): find better way to ignore non connected\n        e = functions.where(cond, e,\n                            xp.broadcast_to(xp.array(-10000), e.array.shape)\n                            .astype(xp.float32))\n        # In Relational Graph Attention Networks eq.(7)\n        # ARGAT: take the softmax over the logits across node neighborhoods\n        # irrespective of relation\n        if self.softmax_mode == 'across':\n            # (minibatch, heads, atom, EDGE_TYPE, atom)\n            e = functions.transpose(e, (0, 2, 3, 1, 4))\n            # (minibatch, heads, atom, EDGE_TYPE * atom)\n            e = functions.reshape(e, (mb, self.n_heads, atom,\n                                      self.n_edge_types * atom))\n            # (minibatch, heads, atom, EDGE_TYPE * atom)\n            alpha = functions.softmax(e, axis=3)\n            if self.dropout_ratio >= 0:\n                alpha = functions.dropout(alpha, ratio=self.dropout_ratio)\n            # (minibatch, heads, atom, EDGE_TYPE, atom)\n            alpha = functions.reshape(alpha, (mb, self.n_heads, atom,\n                                              self.n_edge_types, atom))\n            # (minibatch, EDGE_TYPE, heads, atom, atom)\n            alpha = functions.transpose(alpha, (0, 3, 1, 2, 4))\n\n        # In Relational Graph Attention Networks eq.(6)\n        # WIRGAT: take the softmax over the logits independently for each\n        # relation\n        elif self.softmax_mode == 'within':\n            alpha = functions.softmax(e, axis=4)\n            if self.dropout_ratio >= 0:\n                alpha = functions.dropout(alpha, ratio=self.dropout_ratio)\n        else:\n            raise ValueError(\"{} is invalid. 
Please use 'across' or 'within'\"\n                             .format(self.softmax_mode))\n\n        # before: (minibatch, atom, EDGE_TYPE, heads, out_dim)\n        # after: (minibatch, EDGE_TYPE, heads, atom, out_dim)\n        h = functions.transpose(h, (0, 2, 3, 1, 4))\n        # (minibatch, EDGE_TYPE, heads, atom, out_dim)\n        h_new = functions.matmul(alpha, h)\n        # (minibatch, heads, atom, out_dim)\n        h_new = functions.sum(h_new, axis=1)\n        if self.concat_heads:\n            # -> (minibatch, atom, heads, out_dim)\n            h_new = functions.transpose(h_new, (0, 2, 1, 3))\n            bs, n_nodes, n_heads, outdim = h_new.shape\n            # (minibatch, atom, heads * out_dim)\n            h_new = functions.reshape(h_new, (bs, n_nodes, n_heads * outdim))\n        else:\n            # (minibatch, atom, out_dim)\n            h_new = functions.mean(h_new, axis=1)\n        return h_new\n"
  },
  {
    "path": "chainer_chemistry/links/update/relgcn_update.py",
    "content": "import chainer\nfrom chainer import functions\n\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\n\n\nclass RelGCNUpdate(chainer.Chain):\n    \"\"\"RelGUN submodule for update part.\n\n    Args:\n        in_channels (int or None): input channel dimension\n        out_channels (int): output channel dimension\n        num_edge_type (int): number of types of edge\n    \"\"\"\n\n    def __init__(self, in_channels, out_channels, n_edge_types=4,\n                 **kwargs):\n        super(RelGCNUpdate, self).__init__()\n        with self.init_scope():\n            self.graph_linear_self = GraphLinear(in_channels, out_channels)\n            self.graph_linear_edge = GraphLinear(\n                in_channels, out_channels * n_edge_types)\n        self.n_edge_types = n_edge_types\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n\n    def __call__(self, h, adj, **kwargs):\n        \"\"\"main calculation\n\n        Args:\n            h: (batchsize, num_nodes, in_channels)\n            adj: (batchsize, num_edge_type, num_nodes, num_nodes)\n\n        Returns:\n            (batchsize, num_nodes, ch)\n        \"\"\"\n        mb, node, ch = h.shape\n\n        # --- self connection, apply linear function ---\n        hs = self.graph_linear_self(h)\n        # --- relational feature, from neighbor connection ---\n        # Expected number of neighbors of a vertex\n        # Since you have to divide by it, if its 0, you need to\n        # arbitrarily set it to 1\n        m = self.graph_linear_edge(h)\n        m = functions.reshape(\n            m, (mb, node, self.out_channels, self.n_edge_types))\n        m = functions.transpose(m, (0, 3, 1, 2))\n        # m: (batchsize, edge_type, node, ch)\n        # hrL (batchsize, edge_type, node, ch)\n        hr = functions.matmul(adj, m)\n        # hr: (batchsize, node, ch)\n        hr = functions.sum(hr, axis=1)\n        return hs + hr\n\n\nclass 
RelGCNSparseUpdate(chainer.Chain):\n    \"\"\"sparse RelGCN submodule for update part\"\"\"\n\n    def __init__(self, in_channels, out_channels, n_edge_types):\n        super(RelGCNSparseUpdate, self).__init__()\n        self.out_channels = out_channels\n        self.n_edge_types = n_edge_types\n        with self.init_scope():\n            self.root_weight = chainer.links.Linear(in_channels, out_channels)\n            self.edge_weight = chainer.links.Linear(\n                in_channels, n_edge_types * out_channels)\n\n    def __call__(self, h, edge_index, edge_attr):\n        next_h = self.root_weight(h)\n        features = self.edge_weight(\n            h) .reshape(-1, self.n_edge_types, self.out_channels)\n        messages = features[edge_index[0], edge_attr, :]\n        return functions.scatter_add(next_h, edge_index[1], messages)\n"
  },
  {
    "path": "chainer_chemistry/links/update/rsgcn_update.py",
    "content": "import chainer\n\nimport chainer_chemistry\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\n\n\nclass RSGCNUpdate(chainer.Chain):\n    \"\"\"RSGCN submodule for message and update part.\n\n    Args:\n        in_channels (int or None): input channel dimension\n        out_channels (int): output channel dimension\n    \"\"\"\n\n    def __init__(self, in_channels, out_channels, **kwargs):\n        super(RSGCNUpdate, self).__init__()\n        with self.init_scope():\n            self.graph_linear = GraphLinear(\n                in_channels, out_channels, nobias=True)\n        self.in_channels = in_channels\n        self.out_channels = out_channels\n\n    def __call__(self, h, adj, **kwargs):\n        # --- Message part ---\n        h = chainer_chemistry.functions.matmul(adj, h)\n        # --- Update part ---\n        h = self.graph_linear(h)\n        return h\n"
  },
  {
    "path": "chainer_chemistry/links/update/schnet_update.py",
    "content": "\"\"\"\nChainer implementation of CFConv.\n\nSchNet: A continuous-filter convolutional neural network for modeling\n    quantum interactions\nKristof et al.\nSee: https://arxiv.org/abs/1706.08566\n\"\"\"\n\nimport chainer\nfrom chainer import functions\nfrom chainer import links\n\nfrom chainer_chemistry.functions import shifted_softplus\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\n\n\nclass CFConv(chainer.Chain):\n    \"\"\"CFConv\n\n    Args:\n        num_rbf (int): Number of RBF kernel\n        radius_resolution (float): resolution of radius.\n            Roughly `num_rbf * radius_resolution` ball is convolved in 1 step.\n        gamma (float): coefficient to apply kernel.\n        hidden_dim (int): hidden dim\n    \"\"\"\n\n    def __init__(self, num_rbf=300, radius_resolution=0.1, gamma=10.0,\n                 hidden_dim=64):\n        super(CFConv, self).__init__()\n        with self.init_scope():\n            self.dense1 = links.Linear(num_rbf, hidden_dim)\n            self.dense2 = links.Linear(hidden_dim)\n        self.hidden_dim = hidden_dim\n        self.num_rbf = num_rbf\n        self.radius_resolution = radius_resolution\n        self.gamma = gamma\n\n    def __call__(self, h, dist):\n        \"\"\"main calculation\n\n        Args:\n            h (numpy.ndarray): axis 0 represents minibatch index,\n                axis 1 represents atom_index and axis2 represents\n                feature dimension.\n            dist (numpy.ndarray): axis 0 represents minibatch index,\n                axis 1 and 2 represent distance between atoms.\n        \"\"\"\n        mb, atom, ch = h.shape\n        if ch != self.hidden_dim:\n            raise ValueError('h.shape[2] {} and hidden_dim {} must be same!'\n                             .format(ch, self.hidden_dim))\n        embedlist = self.xp.arange(\n            self.num_rbf).astype('f') * self.radius_resolution\n        dist = functions.reshape(dist, (mb, atom, atom, 1))\n   
     dist = functions.broadcast_to(dist, (mb, atom, atom, self.num_rbf))\n        dist = functions.exp(- self.gamma * (dist - embedlist) ** 2)\n        dist = functions.reshape(dist, (-1, self.num_rbf))\n        dist = self.dense1(dist)\n        dist = shifted_softplus(dist)\n        dist = self.dense2(dist)\n        dist = shifted_softplus(dist)\n        dist = functions.reshape(dist, (mb, atom, atom, self.hidden_dim))\n        h = functions.reshape(h, (mb, atom, 1, self.hidden_dim))\n        h = functions.broadcast_to(h, (mb, atom, atom, self.hidden_dim))\n        h = functions.sum(h * dist, axis=1)\n        return h\n\n\nclass SchNetUpdate(chainer.Chain):\n    \"\"\"Update submodule for SchNet\n\n    `in_channels` and `hidden_channels` must be same with `hidden_channels` in\n     this module.\n\n    Args:\n        hidden_channels (int):\n        num_rbf (int):\n        radius_resolution (float):\n        gamma (float):\n    \"\"\"\n\n    def __init__(self, hidden_channels=64, num_rbf=300,\n                 radius_resolution=0.1, gamma=10.0):\n        super(SchNetUpdate, self).__init__()\n        with self.init_scope():\n            self.linear = chainer.ChainList(\n                *[GraphLinear(None, hidden_channels) for _ in range(3)])\n            self.cfconv = CFConv(\n                num_rbf=num_rbf, radius_resolution=radius_resolution,\n                gamma=gamma, hidden_dim=hidden_channels)\n        self.hidden_channels = hidden_channels\n\n    def __call__(self, h, adj, **kwargs):\n        v = self.linear[0](h)\n        v = self.cfconv(v, adj)\n        v = self.linear[1](v)\n        v = shifted_softplus(v)\n        v = self.linear[2](v)\n        return h + v\n"
  },
  {
    "path": "chainer_chemistry/models/__init__.py",
    "content": "from chainer_chemistry.models import ggnn  # NOQA\nfrom chainer_chemistry.models import gin  # NOQA\nfrom chainer_chemistry.models import gwm  # NOQA\nfrom chainer_chemistry.models import mlp  # NOQA\nfrom chainer_chemistry.models import mpnn  # NOQA\nfrom chainer_chemistry.models import nfp  # NOQA\nfrom chainer_chemistry.models import prediction  # NOQA\nfrom chainer_chemistry.models import relgat  # NOQA\nfrom chainer_chemistry.models import relgcn  # NOQA\nfrom chainer_chemistry.models import rsgcn  # NOQA\nfrom chainer_chemistry.models import schnet  # NOQA\nfrom chainer_chemistry.models import weavenet  # NOQA\n\nfrom chainer_chemistry.models.ggnn import GGNN  # NOQA\nfrom chainer_chemistry.models.ggnn import SparseGGNN  # NOQA\nfrom chainer_chemistry.models.gin import GIN  # NOQA\nfrom chainer_chemistry.models.gnn_film import GNNFiLM  # NOQA\nfrom chainer_chemistry.models.mlp import MLP  # NOQA\nfrom chainer_chemistry.models.mpnn import MPNN  # NOQA\nfrom chainer_chemistry.models.nfp import NFP  # NOQA\nfrom chainer_chemistry.models.relgat import RelGAT  # NOQA\nfrom chainer_chemistry.models.relgcn import RelGCN  # NOQA\nfrom chainer_chemistry.models.rsgcn import RSGCN  # NOQA\nfrom chainer_chemistry.models.schnet import SchNet  # NOQA\nfrom chainer_chemistry.models.weavenet import WeaveNet  # NOQA\n\nfrom chainer_chemistry.models.gwm.gwm_net import GGNN_GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import GIN_GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import NFP_GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import RSGCN_GWM  # NOQA\n\nfrom chainer_chemistry.models.cwle.cwle_net import GGNN_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import RelGAT_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import RelGCN_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import GIN_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import NFP_CWLE  # NOQA\nfrom 
chainer_chemistry.models.cwle.cwle_net import RSGCN_CWLE  # NOQA\n\nfrom chainer_chemistry.models.gwle.gwle_net import GGNN_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import RelGAT_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import RelGCN_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import GIN_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import NFP_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import RSGCN_GWLE  # NOQA\n\nfrom chainer_chemistry.models.prediction.base import BaseForwardModel  # NOQA\nfrom chainer_chemistry.models.prediction.classifier import Classifier  # NOQA\nfrom chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor  # NOQA\nfrom chainer_chemistry.models.prediction.regressor import Regressor  # NOQA\nfrom chainer_chemistry.models.prediction.set_up_predictor import set_up_predictor  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/models/cgcnn.py",
    "content": "import chainer\nfrom chainer import links\n\nfrom chainer_chemistry.links.readout.cgcnn_readout import CGCNNReadout\nfrom chainer_chemistry.links.update.cgcnn_update import CGCNNUpdate\n\n\nclass CGCNN(chainer.Chain):\n    \"\"\"CGCNN\n\n    See Tian Xie et al, \\\n        Crystal Graph Convolutional Neural Networks for an Accurate and\n        Interpretable Prediction of Material Properties. \\\n        `arXiv:1710.10324 <https://arxiv.org/abs/1710.10324>`_\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        n_update_layers (int): number of CGCNNUpdate layers\n        n_atom_features (int): hidden dimension of atom feature vector\n    \"\"\"\n\n    def __init__(self, out_dim=128, n_update_layers=3, n_atom_features=64):\n        super(CGCNN, self).__init__()\n        with self.init_scope():\n            self.atom_feature_embedding = links.Linear(None, n_atom_features)\n            self.crystal_convs = chainer.ChainList(\n                *[CGCNNUpdate(n_atom_features) for _ in range(n_update_layers)]\n            )\n            self.readout = CGCNNReadout(out_dim=out_dim)\n\n    def __call__(self, atom_feat, nbr_feat, atom_idx, feat_idx):\n        # atom feature embedding\n        atom_feat = self.atom_feature_embedding(atom_feat)\n        # --- CGCNN update ---\n        for conv_layer in self.crystal_convs:\n            atom_feat = conv_layer(atom_feat, nbr_feat, feat_idx)\n        # --- CGCNN readout ---\n        pool = self.readout(atom_feat, atom_idx)\n        return pool\n"
  },
  {
    "path": "chainer_chemistry/models/cwle/__init__.py",
    "content": "#from chainer_chemistry.models.cwle import cwle\nfrom chainer_chemistry.models.cwle import cwle_graph_conv_model\nfrom chainer_chemistry.models.cwle import cwle_net\n#from chainer_chemistry.models.cwle.gwm import GWM\nfrom chainer_chemistry.models.cwle.cwle_graph_conv_model import CWLEGraphConvModel  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import GGNN_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import RelGAT_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import RelGCN_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import GIN_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import NFP_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import RSGCN_CWLE  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/models/cwle/cwle_graph_conv_model.py",
    "content": "import chainer\nfrom chainer import cuda\nfrom chainer import links\nfrom chainer import functions\n\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\nfrom chainer_chemistry.links.normalization.graph_batch_normalization import GraphBatchNormalization  # NOQA\nfrom chainer_chemistry.links.readout.general_readout import GeneralReadout\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\n#from chainer_chemistry.models.gwm.gwm import GWM\nfrom chainer_chemistry.models.relgcn import rescale_adj\n\n\nMAX_WLE_NUM = 800\n\ndef to_array(x):\n    \"\"\"Convert x into numpy.ndarray or cupy.ndarray\"\"\"\n    if isinstance(x, chainer.Variable):\n        x = x.array\n    return x\n\n\nclass CWLEGraphConvModel(chainer.Chain):\n    \"\"\"Unified module of Graph Convolution Model with CWLE\n\n    Note that this module is experimental, all update_layer and\n    readout_layer combination is not supported.\n    Please refer `test_gwm_graph_conv_model.py` for tested combinations.\n    This module might not be maintained in the future.\n\n    Args:\n        hidden_channels (int or list): hidden channels for update\n        out_dim (int): output dim\n        update_layer (chainer.links.Link):\n        readout_layer (chainer.links.Link):\n        n_update_layers (int or None):\n        out_channels (None or lsit):\n        wle_dim (int):\n        n_atom_types (int):\n        n_edge_types (int):\n        dropout_ratio (float):\n        with_wle (bool): enabler for Combined NLE\n        concat_hidden (bool):\n        sum_hidden (bool):\n        weight_tying (bool):\n        scale_adj (bool):\n        activation (callable):\n        use_batchnorm (bool):\n        n_activation (int or None):\n        update_kwargs (dict or None):\n        readout_kwargs (dict or None):\n        wle_kwargs (dict or None):\n        n_wle_types (string):\n    \"\"\"\n    def __init__(self, 
hidden_channels, out_dim, update_layer, readout_layer,\n                 n_update_layers=None, out_channels=None, wle_dim=None,\n                 n_atom_types=MAX_ATOMIC_NUM, n_edge_types=4,\n                 dropout_ratio=-1.0, with_wle=True,\n                 concat_hidden=False, sum_hidden=False, weight_tying=False,\n                 scale_adj=False, activation=None, use_batchnorm=False,\n                 n_activation=None, update_kwargs=None, readout_kwargs=None,\n                 wle_kwargs=None, n_wle_types=MAX_WLE_NUM):\n        super(CWLEGraphConvModel, self).__init__()\n\n        # General: length of hidden_channels must be n_layers + 1\n        if isinstance(hidden_channels, int):\n            if n_update_layers is None:\n                raise ValueError('n_update_layers is None')\n            else:\n                hidden_channels = [hidden_channels\n                                   for _ in range(n_update_layers + 1)]\n        elif isinstance(hidden_channels, list):\n            if out_channels is None:\n                n_update_layers = len(hidden_channels) - 1\n            else:\n                n_update_layers = len(hidden_channels)\n        else:\n            raise TypeError('Unexpected value for hidden_channels {}'\n                            .format(hidden_channels))\n\n        if readout_layer == GeneralReadout and hidden_channels[-1] != out_dim:\n            # When use GWM, hidden channels must be same. But GeneralReadout\n            # cannot change the dimension. 
So when use General Readout and GWM,\n            # hidden channel and out_dim should be same.\n            if with_wle:\n                raise ValueError('Unsupported combination.')\n            else:\n                hidden_channels[-1] = out_dim\n\n        # When use with_gwm, concat_hidden, sum_hidden and weight_tying option,\n        # hidden_channels must be same\n        if with_wle or concat_hidden or sum_hidden or weight_tying:\n            if not all([in_dim == hidden_channels[0]\n                        for in_dim in hidden_channels]):\n                raise ValueError(\n                    'hidden_channels must be same but different {}'\n                    .format(hidden_channels))\n\n        if with_wle and wle_dim is None:\n            print('[WARNING] wle_dim is None, set to {}'\n                  .format(hidden_channels[0]))\n            wle_dim = hidden_channels[0]\n\n        if out_channels is None:\n            in_channels_list = hidden_channels[:-1]\n            out_channels_list = hidden_channels[1:]\n        else:\n            # For RelGAT concat_heads option\n            in_channels_list = hidden_channels\n            out_channels_list = out_channels\n        assert len(in_channels_list) == n_update_layers\n        assert len(out_channels_list) == n_update_layers\n\n        n_use_update_layers = 1 if weight_tying else n_update_layers\n        n_readout_layers = n_use_update_layers if concat_hidden or sum_hidden else 1\n        n_activation = n_use_update_layers if n_activation is None else n_activation\n\n        if update_kwargs is None:\n            update_kwargs = {}\n        if readout_kwargs is None:\n            readout_kwargs = {}\n        if wle_kwargs is None:\n            wle_kwargs = {}\n\n        with self.init_scope():\n            self.embed = EmbedAtomID(out_size=hidden_channels[0],\n                                     in_size=n_atom_types) # +1 for label 0\n            self.update_layers = chainer.ChainList(\n                
*[update_layer(in_channels=in_channels_list[i],\n                               out_channels=out_channels_list[i],\n                               n_edge_types=n_edge_types, **update_kwargs)\n                  for i in range(n_use_update_layers)])\n            # when use weight_tying option, hidden_channels must be same. So we can use -1 index\n            self.readout_layers = chainer.ChainList(\n                *[readout_layer(out_dim=out_dim,\n                                # in_channels=hidden_channels[-1],\n                                in_channels=None,\n                                **readout_kwargs)\n                  for _ in range(n_readout_layers)])\n            if with_wle:\n                self.embed_wle = links.EmbedID(out_size=wle_dim, in_size=n_wle_types)\n                self.linear_for_concat_wle = GraphLinear(in_size=wle_dim + hidden_channels[0],\n                                                         out_size=hidden_channels[0])\n            if use_batchnorm:\n                self.bnorms = chainer.ChainList(\n                    *[GraphBatchNormalization(\n                        out_channels_list[i]) for i in range(n_use_update_layers)])\n\n        self.readout_layer = readout_layer\n        self.update_layer = update_layer\n        self.weight_tying = weight_tying\n        self.with_wle = with_wle\n        self.concat_hidden = concat_hidden\n        self.sum_hidden = sum_hidden\n        self.scale_adj = scale_adj\n        self.activation = activation\n        self.dropout_ratio = dropout_ratio\n        self.use_batchnorm = use_batchnorm\n        self.n_activation = n_activation\n        self.n_update_layers = n_update_layers\n        self.n_edge_types = n_edge_types\n\n    def __call__(self, atom_array, adj, wle_array=None, is_real_node=None):\n        self.reset_state()\n\n        if atom_array.dtype == self.xp.int32:\n            h = self.embed(atom_array)\n        else:\n            # TODO: GraphLinear or GraphMLP can be used.\n      
      h = atom_array\n\n        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)\n\n        # all Combined NLE processes are done here.\n        if self.with_wle:\n            h_s = self.embed_wle(wle_array)\n            h_h_s = functions.concat( (h, h_s), axis=2 )\n            h = self.linear_for_concat_wle(h_h_s)\n\n        additional_kwargs = self.preprocess_addtional_kwargs(\n            atom_array, adj, wle_array=wle_array, is_real_node=is_real_node)\n\n        if self.scale_adj:\n            adj = rescale_adj(adj)\n\n        g_list = []\n        for step in range(self.n_update_layers):\n            update_layer_index = 0 if self.weight_tying else step\n            h = self.update_layers[update_layer_index](\n                h=h, adj=adj, **additional_kwargs)\n\n            if self.use_batchnorm:\n                h = self.bnorms[update_layer_index](h)\n\n            if self.dropout_ratio > 0.:\n                h = functions.dropout(h, ratio=self.dropout_ratio)\n\n            if self.activation is not None and step < self.n_activation:\n                h = self.activation(h)\n\n            if self.concat_hidden or self.sum_hidden:\n                g = self.readout_layers[step](\n                    h=h, h0=h0, is_real_node=is_real_node, **additional_kwargs)\n                g_list.append(g)\n\n        if self.concat_hidden:\n            return functions.concat(g_list, axis=1)\n        else:\n            if self.sum_hidden:\n                g = functions.sum(functions.stack(g_list), axis=0)\n            else:\n                g = self.readout_layers[0](\n                    h=h, h0=h0, is_real_node=is_real_node)\n\n            return g\n\n\n    def reset_state(self):\n        if hasattr(self.update_layers[0], 'reset_state'):\n            [update_layer.reset_state() for update_layer in self.update_layers]\n\n\n    def preprocess_addtional_kwargs(self, *args, **kwargs):\n        return {}\n"
  },
  {
    "path": "chainer_chemistry/models/cwle/cwle_net.py",
    "content": "from chainer import functions\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links import GINUpdate, NFPReadout, NFPUpdate, \\\n    RSGCNUpdate, GeneralReadout  # NOQA\nfrom chainer_chemistry.links.readout.ggnn_readout import GGNNReadout\nfrom chainer_chemistry.links.update.ggnn_update import GGNNUpdate\nfrom chainer_chemistry.links.update.relgat_update import RelGATUpdate\nfrom chainer_chemistry.links.update.relgcn_update \\\n    import RelGCNUpdate, RelGCNSparseUpdate\n\n\nfrom chainer_chemistry.models.cwle.cwle_graph_conv_model import CWLEGraphConvModel  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_graph_conv_model import to_array\nfrom chainer_chemistry.models.cwle.cwle_graph_conv_model import MAX_WLE_NUM\n\n\nclass GGNN_CWLE(CWLEGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,\n                 weight_tying=True, activation=functions.identity,\n                 n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM):\n        readout_kwargs = {'activation': activation,\n                          'activation_agg': activation}\n        super(GGNN_CWLE, self).__init__(\n            update_layer=GGNNUpdate, readout_layer=GGNNReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers,\n            n_atom_types=n_atom_types, concat_hidden=concat_hidden,\n            weight_tying=weight_tying, n_edge_types=n_edge_types,\n            with_wle=with_wle, readout_kwargs=readout_kwargs,\n            n_wle_types=n_wle_types)\n\n\nclass RelGCN_CWLE(CWLEGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,\n                 weight_tying=True, activation=functions.identity,\n                 n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM):\n        
readout_kwargs = {'activation': activation,\n                          'activation_agg': activation}\n        super(RelGCN_CWLE, self).__init__(\n            update_layer=RelGCNUpdate, readout_layer=GGNNReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers,\n            n_atom_types=n_atom_types, concat_hidden=concat_hidden,\n            weight_tying=weight_tying, n_edge_types=n_edge_types,\n            with_wle=with_wle, readout_kwargs=readout_kwargs,\n            n_wle_types=n_wle_types)\n\n\nclass RelGAT_CWLE(CWLEGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,\n                 weight_tying=True, activation=functions.identity,\n                 n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM):\n        readout_kwargs = {'activation': activation,\n                          'activation_agg': activation}\n        super(RelGAT_CWLE, self).__init__(\n            update_layer=RelGATUpdate, readout_layer=GGNNReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers,\n            n_atom_types=n_atom_types, concat_hidden=concat_hidden,\n            weight_tying=weight_tying, n_edge_types=n_edge_types,\n            with_wle=with_wle, readout_kwargs=readout_kwargs,\n            n_wle_types=n_wle_types)\n\n\nclass GIN_CWLE(CWLEGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16,\n                 n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM,\n                 dropout_ratio=0.5, concat_hidden=False,\n                 weight_tying=True, activation=functions.identity,\n                 n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM):\n        update_kwargs = {'dropout_ratio': dropout_ratio}\n        readout_kwargs = {'activation': activation,\n                          'activation_agg': activation}\n        
super(GIN_CWLE, self).__init__(\n            update_layer=GINUpdate, readout_layer=GGNNReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers, n_atom_types=n_atom_types,\n            concat_hidden=concat_hidden, weight_tying=weight_tying,\n            n_edge_types=n_edge_types, with_wle=with_wle,\n            update_kwargs=update_kwargs, readout_kwargs=readout_kwargs,\n            n_wle_types=n_wle_types)\n\n\nclass NFP_CWLE(CWLEGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 max_degree=6, n_atom_types=MAX_ATOMIC_NUM,\n                 concat_hidden=False, with_wle=True, n_wle_types=MAX_WLE_NUM):\n        update_kwargs = {'max_degree': max_degree}\n        super(NFP_CWLE, self).__init__(\n            update_layer=NFPUpdate, readout_layer=NFPReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers,\n            n_atom_types=n_atom_types, concat_hidden=concat_hidden,\n            sum_hidden=True, with_wle=with_wle, update_kwargs=update_kwargs,\n            n_wle_types=n_wle_types)\n        self.max_degree = max_degree\n        self.n_degree_type = max_degree + 1\n        self.ch0 = hidden_channels\n\n    def preprocess_addtional_kwargs(self, *args, **kwargs):\n        atom_array, adj = args[:2]\n        bs, num_node = atom_array.shape[:2]\n        # For NFP Update\n        if adj.ndim == 4:\n            degree_mat = self.xp.sum(to_array(adj), axis=(1, 2))\n        elif adj.ndim == 3:\n            degree_mat = self.xp.sum(to_array(adj), axis=1)\n        else:\n            raise ValueError('Unexpected value adj '\n                             .format(adj.shape))\n        # deg_conds: (minibatch, atom, ch)\n        deg_conds = [self.xp.broadcast_to(\n            ((degree_mat - degree) == 0)[:, :, None],\n            (bs, num_node, self.ch0))\n            for degree in range(1, 
self.n_degree_type + 1)]\n        return {'deg_conds': deg_conds}\n\n\nclass RSGCN_CWLE(CWLEGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=32, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM,\n                 use_batch_norm=False, readout=None, dropout_ratio=0.5,\n                 with_wle=True, n_wle_types=MAX_WLE_NUM):\n        if readout is None:\n            readout = GeneralReadout\n        super(RSGCN_CWLE, self).__init__(\n            update_layer=RSGCNUpdate, readout_layer=readout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers, n_atom_types=n_atom_types,\n            use_batchnorm=use_batch_norm, activation=functions.relu,\n            n_activation=n_update_layers-1, dropout_ratio=dropout_ratio,\n            with_wle=with_wle, n_wle_types=n_wle_types)\n"
  },
  {
    "path": "chainer_chemistry/models/ggnn.py",
    "content": "import chainer\nfrom chainer import functions, cuda  # NOQA\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links import EmbedAtomID\nfrom chainer_chemistry.links.readout.ggnn_readout import GGNNReadout\nfrom chainer_chemistry.links.update.ggnn_update import GGNNUpdate\nfrom chainer_chemistry.utils import convert_sparse_with_edge_type\n\n\nclass GGNN(chainer.Chain):\n    \"\"\"Gated Graph Neural Networks (GGNN)\n\n    See: Li, Y., Tarlow, D., Brockschmidt, M., & Zemel, R. (2015).\\\n        Gated graph sequence neural networks. \\\n        `arXiv:1511.05493 <https://arxiv.org/abs/1511.05493>`_\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        hidden_channels (int): dimension of feature vector for each node\n        n_update_layers (int): number of layers\n        n_atom_types (int): number of types of atoms\n        concat_hidden (bool): If set to True, readout is executed in each layer\n            and the result is concatenated\n        weight_tying (bool): enable weight_tying or not\n        activation (~chainer.Function or ~chainer.FunctionNode):\n            activate function\n        n_edge_types (int): number of edge type.\n            Defaults to 4 for single, double, triple and aromatic bond.\n    \"\"\"\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,\n                 weight_tying=True, activation=functions.identity,\n                 n_edge_types=4):\n        super(GGNN, self).__init__()\n        n_readout_layer = n_update_layers if concat_hidden else 1\n        n_message_layer = 1 if weight_tying else n_update_layers\n        with self.init_scope():\n            # Update\n            self.embed = EmbedAtomID(\n                out_size=hidden_channels, in_size=n_atom_types)\n            self.update_layers = chainer.ChainList(*[GGNNUpdate(\n                hidden_channels=hidden_channels, 
n_edge_types=n_edge_types)\n                for _ in range(n_message_layer)])\n            # Readout\n            self.readout_layers = chainer.ChainList(*[GGNNReadout(\n                out_dim=out_dim, in_channels=hidden_channels * 2,\n                activation=activation, activation_agg=activation)\n                for _ in range(n_readout_layer)])\n        self.out_dim = out_dim\n        self.hidden_channels = hidden_channels\n        self.n_update_layers = n_update_layers\n        self.n_edge_types = n_edge_types\n        self.activation = activation\n        self.concat_hidden = concat_hidden\n        self.weight_tying = weight_tying\n\n    def __call__(self, atom_array, adj, is_real_node=None):\n        \"\"\"Forward propagation\n\n        Args:\n            atom_array (numpy.ndarray): minibatch of molecular which is\n                represented with atom IDs (representing C, O, S, ...)\n                `atom_array[mol_index, atom_index]` represents `mol_index`-th\n                molecule's `atom_index`-th atomic number\n            adj (numpy.ndarray): minibatch of adjancency matrix with edge-type\n                information\n            is_real_node (numpy.ndarray): 2-dim array (minibatch, num_nodes).\n                1 for real node, 0 for virtual node.\n                If `None`, all node is considered as real node.\n        Returns:\n            ~chainer.Variable: minibatch of fingerprint\n        \"\"\"\n        # reset state\n        self.reset_state()\n        if atom_array.dtype == self.xp.int32:\n            h = self.embed(atom_array)  # (minibatch, max_num_atoms)\n        else:\n            h = atom_array\n        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)\n        g_list = []\n        for step in range(self.n_update_layers):\n            message_layer_index = 0 if self.weight_tying else step\n            h = self.update_layers[message_layer_index](h, adj)\n            if self.concat_hidden:\n                g = 
self.readout_layers[step](h, h0, is_real_node)\n                g_list.append(g)\n\n        if self.concat_hidden:\n            return functions.concat(g_list, axis=1)\n        else:\n            g = self.readout_layers[0](h, h0, is_real_node)\n            return g\n\n    def reset_state(self):\n        [update_layer.reset_state() for update_layer in self.update_layers]\n\n\nclass SparseGGNN(GGNN):\n    \"\"\"GGNN model for sparse matrix inputs.\n\n    The constructor of this model is the same with that of GGNN.\n    See the documentation of GGNN for the detail.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        super(SparseGGNN, self).__init__(*args, **kwargs)\n\n    def __call__(self, atom_array, data, row, col, edge_type,\n                 is_real_node=None):\n        \"\"\"Forward propagation\n\n        Args:\n            atom_array (numpy.ndarray): minibatch of molecular which is\n                represented with atom IDs (representing C, O, S, ...)\n                `atom_array[mol_index, atom_index]` represents `mol_index`-th\n                molecule's `atom_index`-th atomic number\n            data (numpy.ndarray): the entries of the batched sparse matrix.\n            row (numpy.ndarray): the row indices of the matrix entries.\n            col (numpy.ndarray): the column indices of the matrix entries.\n            edge_type (numpy.ndarray): edge type information of edges.\n            is_real_node (numpy.ndarray): 2-dim array (minibatch, num_nodes).\n                1 for real node, 0 for virtual node.\n                If `None`, all node is considered as real node.\n\n        Returns:\n            ~chainer.Variable: minibatch of fingerprint\n        \"\"\"\n        num_nodes = atom_array.shape[1]\n        adj = convert_sparse_with_edge_type(\n            data, row, col, num_nodes, edge_type, self.n_edge_types)\n        return super(SparseGGNN, self).__call__(\n            atom_array, adj, is_real_node=is_real_node)\n"
  },
  {
    "path": "chainer_chemistry/models/gin.py",
    "content": "import chainer\nfrom chainer import functions, cuda  # NOQA\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links import EmbedAtomID\nfrom chainer_chemistry.links.readout.ggnn_readout import GGNNReadout\nfrom chainer_chemistry.links.readout.scatter_ggnn_readout import ScatterGGNNReadout  # NOQA\nfrom chainer_chemistry.links.update.gin_update import GINUpdate, GINSparseUpdate  # NOQA\n\n\nclass GIN(chainer.Chain):\n    \"\"\"Simple implementation of Graph Isomorphism Network (GIN)\n\n    See: Xu, Hu, Leskovec, and Jegelka, \\\n    \"How powerful are graph neural networks?\", in ICLR 2019.\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        hidden_channels (int): dimension of feature vector for each node\n        n_update_layers (int): number of layers\n        n_atom_types (int): number of types of atoms\n        concat_hidden (bool): If set to True, readout is executed in each layer\n            and the result is concatenated\n        dropout_ratio (float): dropout ratio. 
Negative value indicates not\n            apply dropout\n        weight_tying (bool): enable weight_tying or not\n        activation (~chainer.Function or ~chainer.FunctionNode):\n            activate function\n        n_edge_types (int): number of edge type.\n            Defaults to 4 for single, double, triple and aromatic bond.\n    \"\"\"\n\n    def __init__(self, out_dim, node_embedding=False, hidden_channels=16,\n                 out_channels=None,\n                 n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM,\n                 dropout_ratio=0.5, concat_hidden=False,\n                 weight_tying=False, activation=functions.identity,\n                 n_edge_types=4):\n        super(GIN, self).__init__()\n        n_message_layer = 1 if weight_tying else n_update_layers\n        n_readout_layer = n_update_layers if concat_hidden else 1\n        with self.init_scope():\n            # embedding\n            self.embed = EmbedAtomID(out_size=hidden_channels,\n                                     in_size=n_atom_types)\n            self.first_mlp = GINUpdate(\n                hidden_channels=hidden_channels, dropout_ratio=dropout_ratio,\n                out_channels=hidden_channels).graph_mlp\n\n            # two non-linear MLP part\n            if out_channels is None:\n                out_channels = hidden_channels\n            self.update_layers = chainer.ChainList(*[GINUpdate(\n                hidden_channels=hidden_channels, dropout_ratio=dropout_ratio,\n                out_channels=(out_channels if i == n_message_layer - 1 else\n                              hidden_channels))\n                for i in range(n_message_layer)])\n\n            # Readout\n            self.readout_layers = chainer.ChainList(*[GGNNReadout(\n                out_dim=out_dim, in_channels=hidden_channels * 2,\n                activation=activation, activation_agg=activation)\n                for _ in range(n_readout_layer)])\n        # end with\n\n        self.node_embedding = 
node_embedding\n        self.out_dim = out_dim\n        self.hidden_channels = hidden_channels\n        self.n_update_layers = n_update_layers\n        self.n_message_layers = n_message_layer\n        self.n_readout_layer = n_readout_layer\n        self.dropout_ratio = dropout_ratio\n        self.concat_hidden = concat_hidden\n        self.weight_tying = weight_tying\n        self.n_edge_types = n_edge_types\n\n    def __call__(self, atom_array, adj, is_real_node=None):\n        \"\"\"forward propagation\n\n        Args:\n            atom_array (numpy.ndarray): mol-minibatch by node numpy.ndarray,\n                minibatch of molecular which is represented with atom IDs\n                (representing C, O, S, ...) atom_array[m, i] = a represents\n                m-th molecule's i-th node is value a (atomic number)\n            adj (numpy.ndarray): mol-minibatch by relation-types by node by\n                node numpy.ndarray,\n                minibatch of multple relational adjancency matrix with\n                edge-type information adj[i, j] = b represents\n                m-th molecule's  edge from node i to node j has value b\n            is_real_node:\n\n        Returns:\n            numpy.ndarray: final molecule representation\n        \"\"\"\n        if atom_array.dtype == self.xp.int32:\n            h = self.embed(atom_array)  # (minibatch, max_num_atoms)\n        else:\n            h = atom_array\n\n        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)\n\n        g_list = []\n        for step in range(self.n_update_layers):\n            message_layer_index = 0 if self.weight_tying else step\n            h = self.update_layers[message_layer_index](h, adj)\n            if step != self.n_message_layers - 1:\n                h = functions.relu(h)\n            if self.concat_hidden:\n                g = self.readout_layers[step](h, h0, is_real_node)\n                g_list.append(g)\n\n        if self.node_embedding:\n            return h\n\n  
      if self.concat_hidden:\n            return functions.concat(g_list, axis=1)\n        else:\n            g = self.readout_layers[0](h, h0, is_real_node)\n            return g\n\n\nclass GINSparse(chainer.Chain):\n    \"\"\"Simple implementation of sparseGraph Isomorphism Network (GIN)\"\"\"\n\n    def __init__(self, out_dim, node_embedding=False, hidden_channels=16,\n                 out_channels=None,\n                 n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM,\n                 dropout_ratio=0.5, concat_hidden=False,\n                 weight_tying=False, activation=functions.identity,\n                 n_edge_types=4):\n        super(GINSparse, self).__init__()\n        n_message_layer = 1 if weight_tying else n_update_layers\n        n_readout_layer = n_update_layers if concat_hidden else 1\n        with self.init_scope():\n            # embedding\n            self.embed = EmbedAtomID(out_size=hidden_channels,\n                                     in_size=n_atom_types)\n            self.first_mlp = GINSparseUpdate(\n                hidden_channels=hidden_channels, dropout_ratio=dropout_ratio,\n                out_channels=hidden_channels).mlp\n\n            # two non-linear MLP part\n            if out_channels is None:\n                out_channels = hidden_channels\n            self.update_layers = chainer.ChainList(*[GINSparseUpdate(\n                hidden_channels=hidden_channels, dropout_ratio=dropout_ratio,\n                out_channels=(out_channels if i == n_message_layer - 1 else\n                              hidden_channels))\n                for i in range(n_message_layer)])\n\n            # Readout\n            self.readout_layers = chainer.ChainList(*[ScatterGGNNReadout(\n                out_dim=out_dim, in_channels=hidden_channels * 2,\n                activation=activation, activation_agg=activation)\n                for _ in range(n_readout_layer)])\n        # end with\n\n        self.node_embedding = node_embedding\n        
self.out_dim = out_dim\n        self.hidden_channels = hidden_channels\n        self.n_message_layers = n_message_layer\n        self.n_readout_layer = n_readout_layer\n        self.dropout_ratio = dropout_ratio\n        self.concat_hidden = concat_hidden\n        self.weight_tying = weight_tying\n        self.n_edge_types = n_edge_types\n\n    def __call__(self, sparse_batch, is_real_node=None):\n        if sparse_batch.x.dtype == self.xp.int32:\n            h = self.embed(sparse_batch.x)  # (minibatch, max_num_atoms)\n        else:\n            h = self.first_mlp(sparse_batch.x)\n\n        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)\n\n        g_list = []\n        for step in range(self.n_message_layers):\n            message_layer_index = 0 if self.weight_tying else step\n            h = self.update_layers[message_layer_index](\n                h, sparse_batch.edge_index)\n            if step != self.n_message_layers - 1:\n                h = functions.relu(h)\n            if self.concat_hidden:\n                g = self.readout_layers[step](h, h0, is_real_node)\n                g_list.append(g)\n\n        if self.node_embedding:\n            return h\n\n        if self.concat_hidden:\n            return functions.concat(g_list, axis=1)\n        else:\n            g = self.readout_layers[0](h, sparse_batch.batch, h0, is_real_node)\n            return g\n"
  },
  {
    "path": "chainer_chemistry/models/gnn_film.py",
    "content": "import chainer\nfrom chainer import cuda\nfrom chainer import functions\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.readout.ggnn_readout import GGNNReadout\nfrom chainer_chemistry.links.update.gnn_film_update import GNNFiLMUpdate\n\n\nclass GNNFiLM(chainer.Chain):\n    \"\"\"Graph Neural Networks with Feature-wise Linear Modulation (GNN_FiLM)\n\n    Marc Brockschmidt (2019).\\\n        GNN-FiLM: Graph Neural Networks with Feature-wise Linear Modulation \\\n        `arXiv:1906.12192 <https://arxiv.org/abs/1906.12192>`_\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        hidden_channels (int): dimension of feature vector\n            associated to each atom\n        n_update_layers (int): number of layers\n        n_atom_types (int): number of types of atoms\n        concat_hidden (bool): If set to True, readout is executed in each layer\n            and the result is concatenated\n        weight_tying (bool): enable weight_tying or not\n        activation (~chainer.Function or ~chainer.FunctionNode):\n            activate function\n        n_edge_types (int): number of edge type.\n            Defaults to 5 for single, double, triple, aromatic bond\n            and self-connection.\n    \"\"\"\n\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,\n                 weight_tying=True, activation=functions.identity,\n                 n_edge_types=5):\n        super(GNNFiLM, self).__init__()\n        n_readout_layer = n_update_layers if concat_hidden else 1\n        n_message_layer = 1 if weight_tying else n_update_layers\n        with self.init_scope():\n            # Update\n            self.embed = EmbedAtomID(out_size=hidden_channels,\n                                     in_size=n_atom_types)\n            
self.update_layers = chainer.ChainList(*[GNNFiLMUpdate(\n                hidden_channels=hidden_channels, n_edge_types=n_edge_types)\n                for _ in range(n_message_layer)])\n            # Readout\n            # self.readout_layers = chainer.ChainList(*[GeneralReadout(\n            #     out_dim=out_dim, hidden_channels=hidden_channels,\n            #     activation=activation, activation_agg=activation)\n            #     for _ in range(n_readout_layer)])\n            self.readout_layers = chainer.ChainList(*[GGNNReadout(\n                out_dim=out_dim, in_channels=hidden_channels * 2,\n                activation=activation, activation_agg=activation)\n                for _ in range(n_readout_layer)])\n        self.out_dim = out_dim\n        self.hidden_channels = hidden_channels\n        self.n_update_layers = n_update_layers\n        self.n_edge_types = n_edge_types\n        self.activation = activation\n        self.concat_hidden = concat_hidden\n        self.weight_tying = weight_tying\n\n    def __call__(self, atom_array, adj, is_real_node=None):\n        \"\"\"Forward propagation\n\n        Args:\n            atom_array (numpy.ndarray): minibatch of molecular which is\n                represented with atom IDs (representing C, O, S, ...)\n                `atom_array[mol_index, atom_index]` represents `mol_index`-th\n                molecule's `atom_index`-th atomic number\n            adj (numpy.ndarray): minibatch of adjancency matrix with edge-type\n                information\n            is_real_node (numpy.ndarray): 2-dim array (minibatch, num_nodes).\n                1 for real node, 0 for virtual node.\n                If `None`, all node is considered as real node.\n\n        Returns:\n            ~chainer.Variable: minibatch of fingerprint\n        \"\"\"\n        # reset state\n        # self.reset_state()\n        if atom_array.dtype == self.xp.int32:\n            h = self.embed(atom_array)  # (minibatch, max_num_atoms)\n        
else:\n            h = atom_array\n        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)\n        g_list = []\n        for step in range(self.n_update_layers):\n            message_layer_index = 0 if self.weight_tying else step\n            h = self.update_layers[message_layer_index](h, adj)\n            if self.concat_hidden:\n                g = self.readout_layers[step](h, h0, is_real_node)\n                g_list.append(g)\n\n        if self.concat_hidden:\n            return functions.concat(g_list, axis=1)\n        else:\n            g = self.readout_layers[0](h, h0, is_real_node)\n            return g\n\n    def reset_state(self):\n        [update_layer.reset_state() for update_layer in self.update_layers]\n"
  },
  {
    "path": "chainer_chemistry/models/gwle/__init__.py",
    "content": "#from chainer_chemistry.models.cwle import cwle\nfrom chainer_chemistry.models.gwle import gwle_graph_conv_model\nfrom chainer_chemistry.models.gwle import gwle_net\n\nfrom chainer_chemistry.models.gwle.gwle_net import GGNN_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import RelGAT_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import RelGCN_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import GIN_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import NFP_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import RSGCN_GWLE  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/models/gwle/gwle_graph_conv_model.py",
    "content": "import chainer\nfrom chainer import cuda\nfrom chainer import links\nfrom chainer import functions\n\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear\nfrom chainer_chemistry.links.normalization.graph_batch_normalization import GraphBatchNormalization  # NOQA\nfrom chainer_chemistry.links.readout.general_readout import GeneralReadout\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\n#from chainer_chemistry.models.gwm.gwm import GWM\nfrom chainer_chemistry.models.relgcn import rescale_adj\n\n\nMAX_WLE_NUM = 800\n\ndef to_array(x):\n    \"\"\"Convert x into numpy.ndarray or cupy.ndarray\"\"\"\n    if isinstance(x, chainer.Variable):\n        x = x.array\n    return x\n\n\nclass GWLEGraphConvModel(chainer.Chain):\n    \"\"\"Unified module of Graph Convolution Model with GWLE\n\n    Note that this module is experimental, all update_layer and\n    readout_layer combination is not supported.\n    Please refer `test_gwm_graph_conv_model.py` for tested combinations.\n    This module might not be maintained in the future.\n\n    Args:\n        hidden_channels (int or list): hidden channels for update\n        out_dim (int): output dim\n        update_layer (chainer.links.Link):\n        readout_layer (chainer.links.Link):\n        n_update_layers (int or None):\n        out_channels (None or lsit):\n        wle_dim (int):\n        n_atom_types (int):\n        n_edge_types (int):\n        dropout_ratio (float):\n        with_wle (bool): enabler for Combined NLE\n        concat_hidden (bool):\n        sum_hidden (bool):\n        weight_tying (bool):\n        scale_adj (bool):\n        activation (callable):\n        use_batchnorm (bool):\n        n_activation (int or None):\n        update_kwargs (dict or None):\n        readout_kwargs (dict or None):\n        wle_kwargs (dict or None):\n        n_wle_types (int):\n    \"\"\"\n    def __init__(self, 
hidden_channels, out_dim, update_layer, readout_layer,\n                 n_update_layers=None, out_channels=None, wle_dim=None,\n                 n_atom_types=MAX_ATOMIC_NUM, n_edge_types=4,\n                 dropout_ratio=-1.0, with_wle=True,\n                 concat_hidden=False, sum_hidden=False, weight_tying=False,\n                 scale_adj=False, activation=None, use_batchnorm=False,\n                 n_activation=None, update_kwargs=None, readout_kwargs=None,\n                 wle_kwargs=None, n_wle_types=MAX_WLE_NUM):\n        super(GWLEGraphConvModel, self).__init__()\n\n        # General: length of hidden_channels must be n_layers + 1\n        if isinstance(hidden_channels, int):\n            if n_update_layers is None:\n                raise ValueError('n_update_layers is None')\n            else:\n                hidden_channels = [hidden_channels\n                                   for _ in range(n_update_layers + 1)]\n        elif isinstance(hidden_channels, list):\n            if out_channels is None:\n                n_update_layers = len(hidden_channels) - 1\n            else:\n                n_update_layers = len(hidden_channels)\n        else:\n            raise TypeError('Unexpected value for hidden_channels {}'\n                            .format(hidden_channels))\n\n        if readout_layer == GeneralReadout and hidden_channels[-1] != out_dim:\n            # When use GWM, hidden channels must be same. But GeneralReadout\n            # cannot change the dimension. 
So when use General Readout and GWM,\n            # hidden channel and out_dim should be same.\n            if with_wle:\n                raise ValueError('Unsupported combination.')\n            else:\n                hidden_channels[-1] = out_dim\n\n        # When use with_gwm, concat_hidden, sum_hidden and weight_tying option,\n        # hidden_channels must be same\n        if with_wle or concat_hidden or sum_hidden or weight_tying:\n            if not all([in_dim == hidden_channels[0]\n                        for in_dim in hidden_channels]):\n                raise ValueError(\n                    'hidden_channels must be same but different {}'\n                    .format(hidden_channels))\n\n        if with_wle and wle_dim is None:\n            print('[WARNING] wle_dim is None, set to {}'\n                  .format(hidden_channels[0]))\n            wle_dim = hidden_channels[0]\n\n        if out_channels is None:\n            in_channels_list = hidden_channels[:-1]\n            out_channels_list = hidden_channels[1:]\n        else:\n            # For RelGAT concat_heads option\n            in_channels_list = hidden_channels\n            out_channels_list = out_channels\n        assert len(in_channels_list) == n_update_layers\n        assert len(out_channels_list) == n_update_layers\n\n        n_use_update_layers = 1 if weight_tying else n_update_layers\n        n_readout_layers = n_use_update_layers if concat_hidden or sum_hidden else 1\n        n_activation = n_use_update_layers if n_activation is None else n_activation\n\n        if update_kwargs is None:\n            update_kwargs = {}\n        if readout_kwargs is None:\n            readout_kwargs = {}\n        if wle_kwargs is None:\n            wle_kwargs = {}\n\n        with self.init_scope():\n            self.embed = EmbedAtomID(out_size=hidden_channels[0],\n                                     in_size=n_atom_types) # +1 for label 0\n            self.update_layers = chainer.ChainList(\n                
*[update_layer(in_channels=in_channels_list[i],\n                               out_channels=out_channels_list[i],\n                               n_edge_types=n_edge_types, **update_kwargs)\n                  for i in range(n_use_update_layers)])\n            # when use weight_tying option, hidden_channels must be same. So we can use -1 index\n            self.readout_layers = chainer.ChainList(\n                *[readout_layer(out_dim=out_dim,\n                                # in_channels=hidden_channels[-1],\n                                in_channels=None,\n                                **readout_kwargs)\n                  for _ in range(n_readout_layers)])\n            if with_wle:\n                self.embed_wle = links.EmbedID(out_size=wle_dim, in_size=n_wle_types)\n                # Gates\n                self.gate_W1 = GraphLinear(in_size=hidden_channels[0], out_size=hidden_channels[0])\n                self.gate_W2 = GraphLinear(in_size=wle_dim, out_size=hidden_channels[0])\n\n            if use_batchnorm:\n                self.bnorms = chainer.ChainList(\n                    *[GraphBatchNormalization(\n                        out_channels_list[i]) for i in range(n_use_update_layers)])\n\n        self.readout_layer = readout_layer\n        self.update_layer = update_layer\n        self.weight_tying = weight_tying\n        self.with_wle = with_wle\n        self.concat_hidden = concat_hidden\n        self.sum_hidden = sum_hidden\n        self.scale_adj = scale_adj\n        self.activation = activation\n        self.dropout_ratio = dropout_ratio\n        self.use_batchnorm = use_batchnorm\n        self.n_activation = n_activation\n        self.n_update_layers = n_update_layers\n        self.n_edge_types = n_edge_types\n\n    def __call__(self, atom_array, adj, wle_array=None, is_real_node=None):\n        self.reset_state()\n\n        if atom_array.dtype == self.xp.int32:\n            h = self.embed(atom_array)\n        else:\n            # TODO: 
GraphLinear or GraphMLP can be used.\n            h = atom_array\n\n        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)\n\n        # all Combined NLE processes are done here.\n        if self.with_wle:\n            h_s = self.embed_wle(wle_array)\n\n            # gated sum\n            gate_input = self.gate_W1(h) + self.gate_W2(h_s)\n            gate_coefff = functions.sigmoid(gate_input)\n            h = (1.0 - gate_coefff) * h + gate_coefff * h_s\n\n        additional_kwargs = self.preprocess_addtional_kwargs(\n            atom_array, adj, wle_array=wle_array, is_real_node=is_real_node)\n\n        if self.scale_adj:\n            adj = rescale_adj(adj)\n\n        g_list = []\n        for step in range(self.n_update_layers):\n            update_layer_index = 0 if self.weight_tying else step\n            h = self.update_layers[update_layer_index](\n                h=h, adj=adj, **additional_kwargs)\n\n            if self.use_batchnorm:\n                h = self.bnorms[update_layer_index](h)\n\n            if self.dropout_ratio > 0.:\n                h = functions.dropout(h, ratio=self.dropout_ratio)\n\n            if self.activation is not None and step < self.n_activation:\n                h = self.activation(h)\n\n            if self.concat_hidden or self.sum_hidden:\n                g = self.readout_layers[step](\n                    h=h, h0=h0, is_real_node=is_real_node, **additional_kwargs)\n                g_list.append(g)\n\n        if self.concat_hidden:\n            return functions.concat(g_list, axis=1)\n        else:\n            if self.sum_hidden:\n                g = functions.sum(functions.stack(g_list), axis=0)\n            else:\n                g = self.readout_layers[0](\n                    h=h, h0=h0, is_real_node=is_real_node)\n\n            return g\n\n\n    def reset_state(self):\n        if hasattr(self.update_layers[0], 'reset_state'):\n            [update_layer.reset_state() for update_layer in self.update_layers]\n\n\n 
   def preprocess_addtional_kwargs(self, *args, **kwargs):\n        return {}\n"
  },
  {
    "path": "chainer_chemistry/models/gwle/gwle_net.py",
    "content": "from chainer import functions\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links import GINUpdate, NFPReadout, NFPUpdate, \\\n    RSGCNUpdate, GeneralReadout  # NOQA\nfrom chainer_chemistry.links.readout.ggnn_readout import GGNNReadout\nfrom chainer_chemistry.links.update.ggnn_update import GGNNUpdate\nfrom chainer_chemistry.links.update.relgat_update import RelGATUpdate\nfrom chainer_chemistry.links.update.relgcn_update \\\n    import RelGCNUpdate, RelGCNSparseUpdate\n\n\nfrom chainer_chemistry.models.gwle.gwle_graph_conv_model import GWLEGraphConvModel  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_graph_conv_model import to_array\nfrom chainer_chemistry.models.cwle.cwle_graph_conv_model import MAX_WLE_NUM\n\n\nclass GGNN_GWLE(GWLEGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,\n                 weight_tying=True, activation=functions.identity,\n                 n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM):\n        readout_kwargs = {'activation': activation,\n                          'activation_agg': activation}\n        super(GGNN_GWLE, self).__init__(\n            update_layer=GGNNUpdate, readout_layer=GGNNReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers,\n            n_atom_types=n_atom_types, concat_hidden=concat_hidden,\n            weight_tying=weight_tying, n_edge_types=n_edge_types,\n            with_wle=with_wle, readout_kwargs=readout_kwargs,\n            n_wle_types=n_wle_types)\n\n\nclass RelGCN_GWLE(GWLEGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,\n                 weight_tying=True, activation=functions.identity,\n                 n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM):\n        
readout_kwargs = {'activation': activation,\n                          'activation_agg': activation}\n        super(RelGCN_GWLE, self).__init__(\n            update_layer=RelGCNUpdate, readout_layer=GGNNReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers,\n            n_atom_types=n_atom_types, concat_hidden=concat_hidden,\n            weight_tying=weight_tying, n_edge_types=n_edge_types,\n            with_wle=with_wle, readout_kwargs=readout_kwargs,\n            n_wle_types=n_wle_types)\n\n\nclass RelGAT_GWLE(GWLEGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,\n                 weight_tying=True, activation=functions.identity,\n                 n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM):\n        readout_kwargs = {'activation': activation,\n                          'activation_agg': activation}\n        super(RelGAT_GWLE, self).__init__(\n            update_layer=RelGATUpdate, readout_layer=GGNNReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers,\n            n_atom_types=n_atom_types, concat_hidden=concat_hidden,\n            weight_tying=weight_tying, n_edge_types=n_edge_types,\n            with_wle=with_wle, readout_kwargs=readout_kwargs,\n            n_wle_types=n_wle_types)\n\n\nclass GIN_GWLE(GWLEGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16,\n                 n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM,\n                 dropout_ratio=0.5, concat_hidden=False,\n                 weight_tying=True, activation=functions.identity,\n                 n_edge_types=4, with_wle=True, n_wle_types=MAX_WLE_NUM):\n        update_kwargs = {'dropout_ratio': dropout_ratio}\n        readout_kwargs = {'activation': activation,\n                          'activation_agg': activation}\n        
super(GIN_GWLE, self).__init__(\n            update_layer=GINUpdate, readout_layer=GGNNReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers, n_atom_types=n_atom_types,\n            concat_hidden=concat_hidden, weight_tying=weight_tying,\n            n_edge_types=n_edge_types, with_wle=with_wle,\n            update_kwargs=update_kwargs, readout_kwargs=readout_kwargs,\n            n_wle_types=n_wle_types)\n\n\nclass NFP_GWLE(GWLEGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 max_degree=6, n_atom_types=MAX_ATOMIC_NUM,\n                 concat_hidden=False, with_wle=True, n_wle_types=MAX_WLE_NUM):\n        update_kwargs = {'max_degree': max_degree}\n        super(NFP_GWLE, self).__init__(\n            update_layer=NFPUpdate, readout_layer=NFPReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers,\n            n_atom_types=n_atom_types, concat_hidden=concat_hidden,\n            sum_hidden=True, with_wle=with_wle, update_kwargs=update_kwargs,\n            n_wle_types=n_wle_types)\n        self.max_degree = max_degree\n        self.n_degree_type = max_degree + 1\n        self.ch0 = hidden_channels\n\n    def preprocess_addtional_kwargs(self, *args, **kwargs):\n        atom_array, adj = args[:2]\n        bs, num_node = atom_array.shape[:2]\n        # For NFP Update\n        if adj.ndim == 4:\n            degree_mat = self.xp.sum(to_array(adj), axis=(1, 2))\n        elif adj.ndim == 3:\n            degree_mat = self.xp.sum(to_array(adj), axis=1)\n        else:\n            raise ValueError('Unexpected value adj '\n                             .format(adj.shape))\n        # deg_conds: (minibatch, atom, ch)\n        deg_conds = [self.xp.broadcast_to(\n            ((degree_mat - degree) == 0)[:, :, None],\n            (bs, num_node, self.ch0))\n            for degree in range(1, 
self.n_degree_type + 1)]\n        return {'deg_conds': deg_conds}\n\n\nclass RSGCN_GWLE(GWLEGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=32, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM,\n                 use_batch_norm=False, readout=None, dropout_ratio=0.5,\n                 with_wle=True, n_wle_types=MAX_WLE_NUM):\n        if readout is None:\n            readout = GeneralReadout\n        super(RSGCN_GWLE, self).__init__(\n            update_layer=RSGCNUpdate, readout_layer=readout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers, n_atom_types=n_atom_types,\n            use_batchnorm=use_batch_norm, activation=functions.relu,\n            n_activation=n_update_layers-1, dropout_ratio=dropout_ratio,\n            with_wle=with_wle, n_wle_types=n_wle_types)\n"
  },
  {
    "path": "chainer_chemistry/models/gwm/__init__.py",
    "content": "from chainer_chemistry.models.gwm import gwm  # NOQA\nfrom chainer_chemistry.models.gwm import gwm_graph_conv_model  # NOQA\nfrom chainer_chemistry.models.gwm import gwm_net  # NOQA\n\nfrom chainer_chemistry.models.gwm.gwm import GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_graph_conv_model import GWMGraphConvModel  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import GGNN_GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import GIN_GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import NFP_GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import RSGCN_GWM  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/models/gwm/gwm.py",
    "content": "import chainer\nfrom chainer import functions\nfrom chainer import links\n\nfrom chainer_chemistry.links import GraphLinear\n\n\nclass WarpGateUnit(chainer.Chain):\n    \"\"\"WarpGateUnit\n\n    It computes gated-sum mixing `merged` feature from normal node feature `h`\n    and super node feature `g`,\n\n    See Section \"3.4 Warp Gate\" of the paper.\n\n    Args:\n        output_type (str): supported type as below.\n            graph:\n            super:\n        hidden_dim (int): hidden dim\n        dropout_ratio (float): negative value indicates to not apply dropout.\n        activation (callable):\n    \"\"\"\n    def __init__(self, output_type='graph', hidden_dim=16,\n                 dropout_ratio=-1, activation=functions.sigmoid):\n        super(WarpGateUnit, self).__init__()\n        if output_type == 'graph':\n            LinearLink = GraphLinear\n        elif output_type == 'super':\n            LinearLink = links.Linear\n        else:\n            raise ValueError(\n                'output_type = {} is unexpected. graph or super is supported.'\n                .format(output_type))\n\n        with self.init_scope():\n            self.H = LinearLink(in_size=hidden_dim, out_size=hidden_dim)\n            self.G = LinearLink(in_size=hidden_dim, out_size=hidden_dim)\n\n        self.hidden_dim = hidden_dim\n        self.dropout_ratio = dropout_ratio\n        self.output_type = output_type\n        self.activation = activation\n\n    def __call__(self, h, g):\n        # TODO(nakago): more efficient computation. 
Maybe we can calculate\n        # self.G(g) as Linear layer followed by broadcast to each atom.\n        z = self.H(h) + self.G(g)\n\n        if self.dropout_ratio > 0.0:\n            z = functions.dropout(z, ratio=self.dropout_ratio)\n        z = self.activation(z)\n        merged = (1 - z) * h + z * g\n        return merged\n\n\nclass SuperNodeTransmitterUnit(chainer.Chain):\n    \"\"\"SuperNodeTransmitterUnit\n\n    It calculates message from super node to normal node.\n\n    Args:\n        hidden_dim_super (int):\n        hidden_dim (int): hiddem dim for\n        dropout_ratio (float): negative value indicates to not apply dropout.\n    \"\"\"\n\n    def __init__(self, hidden_dim_super=16, hidden_dim=16, dropout_ratio=-1):\n        super(SuperNodeTransmitterUnit, self).__init__()\n        with self.init_scope():\n            self.F_super = links.Linear(in_size=hidden_dim_super,\n                                        out_size=hidden_dim)\n        self.hidden_dim = hidden_dim\n        self.hidden_dim_super = hidden_dim_super\n        self.dropout_ratio = dropout_ratio\n\n    def __call__(self, g, n_nodes):\n        \"\"\"main calculation\n\n        Args:\n            g: super node feature. 
shape (bs, hidden_dim_super)\n            n_nodes (int): number of nodes\n\n        Returns:\n            g_trans: super --> original transmission\n        \"\"\"\n        mb = len(g)\n        # for local updates\n        g_trans = self.F_super(g)\n        # intermediate_h_super.shape == (mb, self.hidden_dim)\n        g_trans = functions.tanh(g_trans)\n        # intermediate_h_super.shape == (mb, 1, self.hidden_dim)\n        g_trans = functions.expand_dims(g_trans, 1)\n        # intermediate_h_super.shape == (mb, atom, self.hidden_dim)\n        g_trans = functions.broadcast_to(g_trans,\n                                         (mb, n_nodes, self.hidden_dim))\n        return g_trans\n\n\nclass GraphTransmitterUnit(chainer.Chain):\n    \"\"\"GraphTransmitterUnit\n\n    It calculates message from normal node to super node.\n\n    Args:\n        hidden_dim_super (int):\n        hidden_dim (int):\n        n_heads (int):\n        dropout_ratio (float):\n        activation (callable):\n    \"\"\"\n    def __init__(self, hidden_dim_super=16, hidden_dim=16, n_heads=8,\n                 dropout_ratio=-1, activation=functions.tanh):\n        super(GraphTransmitterUnit, self).__init__()\n        hdim_n = hidden_dim * n_heads\n        with self.init_scope():\n            self.V_super = GraphLinear(hidden_dim, hdim_n)\n            self.W_super = links.Linear(hdim_n, hidden_dim_super)\n            self.B = GraphLinear(hidden_dim, n_heads * hidden_dim_super)\n        self.hidden_dim = hidden_dim\n        self.hidden_dim_super = hidden_dim_super\n        self.dropout_ratio = dropout_ratio\n        self.n_heads = n_heads\n        self.activation = activation\n\n    def __call__(self, h, g, step=0):\n        mb, atom, ch = h.shape\n\n        h_j = self.V_super(h)\n        h_j = functions.reshape(h_j, (mb, atom, self.n_heads, ch))\n        # h_j (mb, atom, self.n_heads, ch)\n        h_j = functions.transpose(h_j, (0, 2, 1, 3))\n\n        # expand h_super\n        # g_extend.shape (mb, 
1, self.hidden_dim_super)\n        g_extend = functions.expand_dims(g, 1)\n        # g_extend.shape == (mb, self.n_heads, self.hidden_dim_super)\n        g_extend = functions.broadcast_to(g_extend, (mb, self.n_heads,\n                                                     self.hidden_dim_super))\n        # g_extend.shape == (mb, self.n_heads, 1, self.hidden_dim_super)\n        g_extend = functions.expand_dims(g_extend, 2)\n\n        # update for attention-message B h_i\n        # h (mb, atom, ch)\n        # Bh_i.shape == (mb, atom, self.n_heads * self.hidden_dim_super)\n        Bh_i = self.B(h)\n        # Bh_i.shpae == (mb, atom, num_head, ch)\n        Bh_i = functions.reshape(Bh_i, (mb, atom, self.n_heads,\n                                        self.hidden_dim_super))\n        # Bh_i.shape == (mb, num_head, atom, ch)\n        Bh_i = functions.transpose(Bh_i, [0, 2, 1, 3])\n\n        # take g^{T} * B * h_i\n        # indexed by i\n        # mb, self.n_haeds atom(i)\n        # b_hi.shape == (mb, self.n_heads, 1, atom)\n        # This will reduce the last hidden_dim_super axis\n        b_hi = functions.matmul(g_extend, Bh_i, transb=True)\n\n        # softmax. 
sum/normalize over the last axis.\n        # mb, self.n_heda, atom(i-normzlied)\n        # attention_i.shape == (mb, self.n_heads, 1, atom)\n        attention_i = functions.softmax(b_hi, axis=3)\n        if self.dropout_ratio > 0.0:\n            attention_i = functions.dropout(attention_i,\n                                            ratio=self.dropout_ratio)\n\n        # element-wise product --> sum over i\n        # mb, num_head, hidden_dim_super\n        # attention_sum.shape == (mb, self.n_heads, 1, ch)\n        attention_sum = functions.matmul(attention_i, h_j)\n        # attention_sum.shape == (mb, self.n_heads * ch)\n        attention_sum = functions.reshape(attention_sum,\n                                          (mb, self.n_heads * ch))\n\n        # weighting h for different heads\n        # intermediate_h.shape == (mb, self.n_heads * ch)\n        # compress heads\n        h_trans = self.W_super(attention_sum)\n        # intermediate_h.shape == (mb, self.hidden_dim_super)\n        h_trans = self.activation(h_trans)\n        return h_trans\n\n\nclass GWM(chainer.Chain):\n    \"\"\"Graph Warping Module (GWM)\n\n    Module for a single layer update.\n\n    See: Ishiguro, Maeda, and Koyama. 
\"Graph Warp Module: an Auxiliary Module\n        for Boosting the Power of Graph NeuralNetworks\", arXiv, 2019.\n\n    Args:\n        hidden_dim (int): dimension of hidden vectors\n            associated to each atom (local node)\n        hidden_dim_super (int); dimension of super-node hidden vector\n        n_layers (int): number of layers\n        n_heads (int): number of heads\n        dropout_ratio (float): dropout ratio.\n            Negative value indicates to not apply dropout.\n        tying_flag (bool): enable if you want to share params across layers.\n        activation (callable):\n        wgu_activation (callable):\n        gtu_activation (callable):\n    \"\"\"\n    def __init__(self, hidden_dim=16, hidden_dim_super=16, n_layers=4,\n                 n_heads=8, dropout_ratio=-1,\n                 tying_flag=False, activation=functions.relu,\n                 wgu_activation=functions.sigmoid,\n                 gtu_activation=functions.tanh):\n        super(GWM, self).__init__()\n\n        n_use_layers = 1 if tying_flag else n_layers\n\n        with self.init_scope():\n            self.update_super = chainer.ChainList(\n                *[links.Linear(in_size=hidden_dim_super,\n                               out_size=hidden_dim_super)\n                  for _ in range(n_use_layers)]\n            )\n\n            # for Transmitter unit\n            self.super_transmitter = chainer.ChainList(\n                *[SuperNodeTransmitterUnit(\n                    hidden_dim=hidden_dim, hidden_dim_super=hidden_dim_super,\n                    dropout_ratio=dropout_ratio) for _ in range(n_use_layers)])\n            self.graph_transmitter = chainer.ChainList(\n                *[GraphTransmitterUnit(\n                    hidden_dim=hidden_dim, hidden_dim_super=hidden_dim_super,\n                    n_heads=n_heads, dropout_ratio=dropout_ratio,\n                    activation=gtu_activation) for _ in range(n_use_layers)])\n\n            # for Warp Gate unit\n          
  self.wgu_local = chainer.ChainList(\n                *[WarpGateUnit(\n                    output_type='graph', hidden_dim=hidden_dim,\n                    dropout_ratio=dropout_ratio, activation=wgu_activation)\n                    for _ in range(n_use_layers)])\n            self.wgu_super = chainer.ChainList(\n                *[WarpGateUnit(\n                    output_type='super', hidden_dim=hidden_dim_super,\n                    dropout_ratio=dropout_ratio, activation=wgu_activation)\n                    for _ in range(n_use_layers)])\n\n            # Weight tying: not layer-wise but recurrent through layers\n            self.GRU_local = links.GRU(in_size=hidden_dim, out_size=hidden_dim)\n            self.GRU_super = links.GRU(in_size=hidden_dim_super,\n                                       out_size=hidden_dim_super)\n        # end init_scope-with\n        self.hidden_dim = hidden_dim\n        self.hidden_dim_super = hidden_dim_super\n        self.n_layers = n_layers\n        self.n_heads = n_heads\n        self.dropout_ratio = dropout_ratio\n        self.tying_flag = tying_flag\n        self.activation = activation\n        self.wgu_activation = wgu_activation\n\n    def __call__(self, h, h_new, g, step=0):\n        \"\"\"main calculation\n\n        Note: Do not forget to reset GRU for each batch.\n\n        Args:\n            h: Minibatch by num_nodes by hidden_dim numpy array.\n                current local node hidden states as input of the vanilla GNN\n            h_new: Minibatch by num_nodes by hidden_dim numpy array.\n                updated local node hidden states as output from the vanilla GNN\n            g: Minibatch by bond_types by num_nodes by num_nodes 1/0\n                array. 
Adjacency matrices over several bond types\n            step: Minibatch by hidden_dim_super numpy array.\n                current super node hiddden state\n\n        Returns: Updated h and g\n        \"\"\"\n        # (minibatch, atom, ch)\n        mb, n_nodes, ch = h.shape\n        # non linear update of the super node\n        g_new = self.activation(self.update_super[step](g))\n\n        # Transmitter unit: inter-module message passing\n        # original --> super transmission\n        h_trans = self.graph_transmitter[step](h, g)\n        # g_trans: super --> original transmission\n        g_trans = self.super_transmitter[step](g, n_nodes)\n\n        # Warp Gate unit\n        merged_h = self.wgu_local[step](h_new, g_trans)\n        merged_g = self.wgu_super[step](h_trans, g_new)\n\n        # Self recurrent\n        out_h = functions.reshape(merged_h, (mb * n_nodes, self.hidden_dim))\n        out_h = self.GRU_local(out_h)\n        out_h = functions.reshape(out_h, (mb, n_nodes, self.hidden_dim))\n\n        out_g = self.GRU_super(merged_g)\n\n        return out_h, out_g\n\n    def reset_state(self):\n        self.GRU_local.reset_state()\n        self.GRU_super.reset_state()\n"
  },
  {
    "path": "chainer_chemistry/models/gwm/gwm_graph_conv_model.py",
    "content": "import chainer\nfrom chainer import cuda\nfrom chainer import functions\nfrom chainer import links\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.normalization.graph_batch_normalization import GraphBatchNormalization  # NOQA\nfrom chainer_chemistry.links.readout.general_readout import GeneralReadout\nfrom chainer_chemistry.models.gwm.gwm import GWM\nfrom chainer_chemistry.models.relgcn import rescale_adj\n\n\ndef to_array(x):\n    \"\"\"Convert x into numpy.ndarray or cupy.ndarray\"\"\"\n    if isinstance(x, chainer.Variable):\n        x = x.array\n    return x\n\n\nclass GWMGraphConvModel(chainer.Chain):\n    \"\"\"Unified module of Graph Convolution Model with GWM\n\n    Note that this module is experimental, all update_layer and\n    readout_layer combination is not supported.\n    Please refer `test_gwm_graph_conv_model.py` for tested combinations.\n    This module might not be maintained in the future.\n\n    Args:\n        hidden_channels (int or list): hidden channels for update\n        out_dim (int): output dim\n        update_layer (chainer.links.Link):\n        readout_layer (chainer.links.Link):\n        n_update_layers (int or None):\n        out_channels (None or lsit):\n        super_node_dim (int):\n        n_atom_types (int):\n        n_edge_types (int):\n        dropout_ratio (float):\n        with_gwm (bool):\n        concat_hidden (bool):\n        sum_hidden (bool):\n        weight_tying (bool):\n        scale_adj (bool):\n        activation (callable):\n        use_batchnorm (bool):\n        n_activation (int or None):\n        update_kwargs (dict or None):\n        readout_kwargs (dict or None):\n        gwm_kwargs (dict or None):\n    \"\"\"\n    def __init__(self, hidden_channels, out_dim, update_layer, readout_layer,\n                 n_update_layers=None, out_channels=None, super_node_dim=None,\n                 
n_atom_types=MAX_ATOMIC_NUM, n_edge_types=4,\n                 dropout_ratio=-1.0, with_gwm=True,\n                 concat_hidden=False, sum_hidden=False, weight_tying=False,\n                 scale_adj=False, activation=None, use_batchnorm=False,\n                 n_activation=None, update_kwargs=None, readout_kwargs=None,\n                 gwm_kwargs=None):\n        super(GWMGraphConvModel, self).__init__()\n\n        # General: length of hidden_channels must be n_layers + 1\n        if isinstance(hidden_channels, int):\n            if n_update_layers is None:\n                raise ValueError('n_update_layers is None')\n            else:\n                hidden_channels = [hidden_channels\n                                   for _ in range(n_update_layers + 1)]\n        elif isinstance(hidden_channels, list):\n            if out_channels is None:\n                n_update_layers = len(hidden_channels) - 1\n            else:\n                n_update_layers = len(hidden_channels)\n        else:\n            raise TypeError('Unexpected value for hidden_channels {}'\n                            .format(hidden_channels))\n\n        if readout_layer == GeneralReadout and hidden_channels[-1] != out_dim:\n            # When use GWM, hidden channels must be same. But GeneralReadout\n            # cannot change the dimension. 
So when use General Readout and GWM,\n            # hidden channel and out_dim should be same.\n            if with_gwm:\n                raise ValueError('Unsupported combination.')\n            else:\n                hidden_channels[-1] = out_dim\n\n        # When use with_gwm, concat_hidden, sum_hidden and weight_tying option,\n        # hidden_channels must be same\n        if with_gwm or concat_hidden or sum_hidden or weight_tying:\n            if not all([in_dim == hidden_channels[0]\n                        for in_dim in hidden_channels]):\n                raise ValueError(\n                    'hidden_channels must be same but different {}'\n                    .format(hidden_channels))\n\n        if with_gwm and super_node_dim is None:\n            print('[WARNING] super_node_dim is None, set to {}'\n                  .format(hidden_channels[0]))\n            super_node_dim = hidden_channels[0]\n\n        if out_channels is None:\n            in_channels_list = hidden_channels[:-1]\n            out_channels_list = hidden_channels[1:]\n        else:\n            # For RelGAT concat_heads option\n            in_channels_list = hidden_channels\n            out_channels_list = out_channels\n        assert len(in_channels_list) == n_update_layers\n        assert len(out_channels_list) == n_update_layers\n\n        n_use_update_layers = 1 if weight_tying else n_update_layers\n        n_readout_layers = n_use_update_layers if concat_hidden or sum_hidden else 1  # NOQA\n        n_activation = n_use_update_layers if n_activation is None else n_activation  # NOQA\n\n        if update_kwargs is None:\n            update_kwargs = {}\n        if readout_kwargs is None:\n            readout_kwargs = {}\n        if gwm_kwargs is None:\n            gwm_kwargs = {}\n\n        with self.init_scope():\n            self.embed = EmbedAtomID(out_size=hidden_channels[0],\n                                     in_size=n_atom_types)\n            self.update_layers = 
chainer.ChainList(\n                *[update_layer(in_channels=in_channels_list[i],\n                               out_channels=out_channels_list[i],\n                               n_edge_types=n_edge_types, **update_kwargs)\n                  for i in range(n_use_update_layers)])\n            # when use weight_tying option, hidden_channels must be same.\n            # So we can use -1 index\n            self.readout_layers = chainer.ChainList(\n                *[readout_layer(out_dim=out_dim,\n                                # in_channels=hidden_channels[-1],\n                                in_channels=None,\n                                **readout_kwargs)\n                  for _ in range(n_readout_layers)])\n            if with_gwm:\n                self.gwm = GWM(hidden_dim=hidden_channels[0],\n                               hidden_dim_super=super_node_dim,\n                               n_layers=n_use_update_layers, **gwm_kwargs)\n                self.embed_super = links.Linear(None, out_size=super_node_dim)\n                self.linear_for_concat_super = links.Linear(in_size=None,\n                                                            out_size=out_dim)\n            if use_batchnorm:\n                self.bnorms = chainer.ChainList(\n                    *[GraphBatchNormalization(\n                        out_channels_list[i]) for i in\n                        range(n_use_update_layers)])\n\n        self.readout_layer = readout_layer\n        self.update_layer = update_layer\n        self.weight_tying = weight_tying\n        self.with_gwm = with_gwm\n        self.concat_hidden = concat_hidden\n        self.sum_hidden = sum_hidden\n        self.scale_adj = scale_adj\n        self.activation = activation\n        self.dropout_ratio = dropout_ratio\n        self.use_batchnorm = use_batchnorm\n        self.n_activation = n_activation\n        self.n_update_layers = n_update_layers\n        self.n_edge_types = n_edge_types\n\n    def __call__(self, 
atom_array, adj, super_node=None, is_real_node=None):\n        self.reset_state()\n\n        if atom_array.dtype == self.xp.int32:\n            h = self.embed(atom_array)\n        else:\n            # TODO(nakago): GraphLinear or GraphMLP can be used.\n            h = atom_array\n\n        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)\n        if self.with_gwm:\n            h_s = self.embed_super(super_node)\n\n        additional_kwargs = self.preprocess_addtional_kwargs(\n            atom_array, adj, super_node=super_node, is_real_node=is_real_node)\n\n        if self.scale_adj:\n            adj = rescale_adj(adj)\n\n        g_list = []\n        for step in range(self.n_update_layers):\n            update_layer_index = 0 if self.weight_tying else step\n            h_new = self.update_layers[update_layer_index](\n                h=h, adj=adj, **additional_kwargs)\n\n            if self.with_gwm:\n                h_new, h_s = self.gwm(h, h_new, h_s, update_layer_index)\n            h = h_new\n\n            if self.use_batchnorm:\n                h = self.bnorms[update_layer_index](h)\n\n            if self.dropout_ratio > 0.:\n                h = functions.dropout(h, ratio=self.dropout_ratio)\n\n            if self.activation is not None and step < self.n_activation:\n                h = self.activation(h)\n\n            if self.concat_hidden or self.sum_hidden:\n                g = self.readout_layers[step](\n                    h=h, h0=h0, is_real_node=is_real_node, **additional_kwargs)\n                g_list.append(g)\n\n        if self.concat_hidden:\n            return functions.concat(g_list, axis=1)\n        else:\n            if self.sum_hidden:\n                g = functions.sum(functions.stack(g_list), axis=0)\n            else:\n                g = self.readout_layers[0](\n                    h=h, h0=h0, is_real_node=is_real_node)\n            if self.with_gwm:\n                g = functions.concat((g, h_s), axis=1)\n                g = 
functions.relu(self.linear_for_concat_super(g))\n            return g\n\n    def reset_state(self):\n        if hasattr(self.update_layers[0], 'reset_state'):\n            [update_layer.reset_state() for update_layer in self.update_layers]\n\n        if self.with_gwm:\n            self.gwm.reset_state()\n\n    def preprocess_addtional_kwargs(self, *args, **kwargs):\n        return {}\n"
  },
  {
    "path": "chainer_chemistry/models/gwm/gwm_net.py",
    "content": "from chainer import functions\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links import GINUpdate, NFPReadout, NFPUpdate, \\\n    RSGCNUpdate, GeneralReadout  # NOQA\nfrom chainer_chemistry.links.readout.ggnn_readout import GGNNReadout\nfrom chainer_chemistry.links.update.ggnn_update import GGNNUpdate\nfrom chainer_chemistry.models.gwm.gwm_graph_conv_model import GWMGraphConvModel, to_array  # NOQA\n\n\nclass GGNN_GWM(GWMGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,\n                 weight_tying=True, activation=functions.identity,\n                 n_edge_types=4, with_gwm=True):\n        readout_kwargs = {'activation': activation,\n                          'activation_agg': activation}\n        super(GGNN_GWM, self).__init__(\n            update_layer=GGNNUpdate, readout_layer=GGNNReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers,\n            n_atom_types=n_atom_types, concat_hidden=concat_hidden,\n            weight_tying=weight_tying, n_edge_types=n_edge_types,\n            with_gwm=with_gwm, readout_kwargs=readout_kwargs)\n\n\nclass GIN_GWM(GWMGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16,\n                 n_update_layers=4, n_atom_types=MAX_ATOMIC_NUM,\n                 dropout_ratio=0.5, concat_hidden=False,\n                 weight_tying=True, activation=functions.identity,\n                 n_edge_types=4, with_gwm=True):\n        update_kwargs = {'dropout_ratio': dropout_ratio}\n        readout_kwargs = {'activation': activation,\n                          'activation_agg': activation}\n        super(GIN_GWM, self).__init__(\n            update_layer=GINUpdate, readout_layer=GGNNReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers, 
n_atom_types=n_atom_types,\n            concat_hidden=concat_hidden, weight_tying=weight_tying,\n            n_edge_types=n_edge_types, with_gwm=with_gwm,\n            update_kwargs=update_kwargs, readout_kwargs=readout_kwargs\n        )\n\n\nclass NFP_GWM(GWMGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 max_degree=6, n_atom_types=MAX_ATOMIC_NUM,\n                 concat_hidden=False, with_gwm=True):\n        update_kwargs = {'max_degree': max_degree}\n        super(NFP_GWM, self).__init__(\n            update_layer=NFPUpdate, readout_layer=NFPReadout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers,\n            n_atom_types=n_atom_types, concat_hidden=concat_hidden,\n            sum_hidden=True, with_gwm=with_gwm, update_kwargs=update_kwargs\n        )\n        self.max_degree = max_degree\n        self.n_degree_type = max_degree + 1\n        self.ch0 = hidden_channels\n\n    def preprocess_addtional_kwargs(self, *args, **kwargs):\n        atom_array, adj = args[:2]\n        bs, num_node = atom_array.shape[:2]\n        # For NFP Update\n        if adj.ndim == 4:\n            degree_mat = self.xp.sum(to_array(adj), axis=(1, 2))\n        elif adj.ndim == 3:\n            degree_mat = self.xp.sum(to_array(adj), axis=1)\n        else:\n            raise ValueError('Unexpected value adj '\n                             .format(adj.shape))\n        # deg_conds: (minibatch, atom, ch)\n        deg_conds = [self.xp.broadcast_to(\n            ((degree_mat - degree) == 0)[:, :, None],\n            (bs, num_node, self.ch0))\n            for degree in range(1, self.n_degree_type + 1)]\n        return {'deg_conds': deg_conds}\n\n\nclass RSGCN_GWM(GWMGraphConvModel):\n    def __init__(self, out_dim, hidden_channels=32, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM,\n                 use_batch_norm=False, readout=None, dropout_ratio=0.5,\n     
            with_gwm=True):\n        if readout is None:\n            readout = GeneralReadout\n        super(RSGCN_GWM, self).__init__(\n            update_layer=RSGCNUpdate, readout_layer=readout,\n            out_dim=out_dim, hidden_channels=hidden_channels,\n            n_update_layers=n_update_layers, n_atom_types=n_atom_types,\n            use_batchnorm=use_batch_norm, activation=functions.relu,\n            n_activation=n_update_layers-1, dropout_ratio=dropout_ratio,\n            with_gwm=with_gwm)\n"
  },
  {
    "path": "chainer_chemistry/models/megnet.py",
    "content": "import chainer\nfrom chainer.backend import get_array_module\nfrom chainer import functions\n\nfrom chainer_chemistry.functions import megnet_softplus\nfrom chainer_chemistry.links.readout.megnet_readout import MEGNetReadout\nfrom chainer_chemistry.links.update.megnet_update import MEGNetUpdate\n\n\ndef reshaped_feat(feat, idx):\n    \"\"\"Convert node stack pattern into pad pattern\n\n    This method is converting from node stack pattern to pad pattern\n    about node and edge feature. This is because the current set2set\n    implementation is only focus on pad pattern feature.\n    \"\"\"\n    xp = get_array_module(idx)\n    max_idx = int(xp.max(idx))\n    vec_list = [feat[idx == i] for i in range(max_idx+1)]\n    return functions.pad_sequence(vec_list)\n\n\nclass MEGNet(chainer.Chain):\n    \"\"\"MEGNet\n\n    See Chi Chen et al, \\\n        Graph Networks as a Universal Machine Learning Framework for Molecules\n        and Crystals. \\\n        `arXiv:1812.05055 <https://arxiv.org/abs/1812.05055>`_\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        n_update_layers (int): the number of MEGNetUpdate layers\n        dropout_ratio (float): ratio of dropout\n        activation (~chainer.Function or ~chainer.FunctionNode):\n            activate function for megnet model\n            `megnet_softplus` was used in original paper.\n    \"\"\"\n\n    def __init__(self, out_dim=32, n_update_layers=3, dropout_ratio=-1,\n                 activation=megnet_softplus):\n        super(MEGNet, self).__init__()\n        if n_update_layers <= 0:\n            raise ValueError('n_update_layers must be a positive integer, '\n                             'but it was set to {}'.format(n_update_layers))\n\n        self.n_update_layers = n_update_layers\n        with self.init_scope():\n            self.update_layers = chainer.ChainList(\n                *[MEGNetUpdate(\n                    dim_for_dense=[64, 32], dim_for_update=[64, 64, 
32],\n                    dropout_ratio=dropout_ratio, activation=activation,\n                    skip_intermediate=(i == 0)\n                ) for i in range(n_update_layers)])\n            self.readout = MEGNetReadout(out_dim=out_dim, in_channels=32,\n                                         n_layers=1, processing_steps=3,\n                                         dropout_ratio=dropout_ratio,\n                                         activation=activation)\n\n    def __call__(self, atoms_feat, pair_feat, global_feat, *args):\n        a_f = atoms_feat\n        p_f = pair_feat\n        g_f = global_feat\n        # --- MGENet update ---\n        for i in range(self.n_update_layers):\n            a_f, p_f, g_f = self.update_layers[i](a_f, p_f, g_f, *args)\n        # --- reshape ---\n        atom_idx = args[0]\n        pair_idx = args[1]\n        a_f = reshaped_feat(a_f, atom_idx)\n        p_f = reshaped_feat(p_f, pair_idx)\n        # --- MGENet readout ---\n        out = self.readout(a_f, p_f, g_f)\n        return out\n"
  },
  {
    "path": "chainer_chemistry/models/mlp.py",
    "content": "import chainer\nfrom chainer.functions import relu\nfrom chainer import links\n\n\nclass MLP(chainer.Chain):\n\n    \"\"\"Basic implementation for MLP\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        hidden_dim (int): dimension of feature vector\n            associated to each atom\n        n_layers (int): number of layers\n        activation (chainer.functions): activation function\n    \"\"\"\n\n    def __init__(self, out_dim, hidden_dim=16, n_layers=2, activation=relu):\n        super(MLP, self).__init__()\n        if n_layers <= 0:\n            raise ValueError('n_layers must be a positive integer, but it was '\n                             'set to {}'.format(n_layers))\n        layers = [links.Linear(None, hidden_dim) for i in range(n_layers - 1)]\n        with self.init_scope():\n            self.layers = chainer.ChainList(*layers)\n            self.l_out = links.Linear(None, out_dim)\n        self.activation = activation\n\n    def __call__(self, x):\n        h = x\n        for l in self.layers:\n            h = self.activation(l(h))\n        h = self.l_out(h)\n        return h\n"
  },
  {
    "path": "chainer_chemistry/models/mpnn.py",
    "content": "from functools import partial\nfrom typing import Optional  # NOQA\n\nimport chainer\nfrom chainer import cuda, functions  # NOQA\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links import EmbedAtomID\nfrom chainer_chemistry.links.readout.ggnn_readout import GGNNReadout\nfrom chainer_chemistry.links.readout.mpnn_readout import MPNNReadout\nfrom chainer_chemistry.links.update.ggnn_update import GGNNUpdate\nfrom chainer_chemistry.links.update.mpnn_update import MPNNUpdate\n\n\nclass MPNN(chainer.Chain):\n    \"\"\"Message Passing Neural Networks (MPNN).\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        hidden_channels (int): dimension of feature vector for each node\n        n_update_layers (int): number of update layers\n        n_atom_types (int): number of types of atoms\n        concat_hidden (bool): If set to True, readout is executed in\n            each layer and the result is concatenated\n        weight_tying (bool): enable weight_tying or not\n        n_edge_types (int): number of edge type.\n            Defaults to 4 for single, double, triple and aromatic bond.\n        nn (~chainer.Link): Neural Networks for expanding edge vector\n            dimension\n        message_func (str): message function. 'edgenet' and 'ggnn' are\n            supported.\n        readout_func (str): readout function. 
'set2set' and 'ggnn' are\n            supported.\n    \"\"\"\n\n    def __init__(\n            self,\n            out_dim,  # type: int\n            hidden_channels=16,  # type: int\n            n_update_layers=4,  # type: int\n            n_atom_types=MAX_ATOMIC_NUM,  # type: int\n            concat_hidden=False,  # type: bool\n            weight_tying=True,  # type: bool\n            n_edge_types=4,  # type: int\n            nn=None,  # type: Optional[chainer.Link]\n            message_func='edgenet',  # type: str\n            readout_func='set2set',  # type: str\n    ):\n        # type: (...) -> None\n        super(MPNN, self).__init__()\n        if message_func not in ('edgenet', 'ggnn'):\n            raise ValueError(\n                'Invalid message function: {}'.format(message_func))\n        if readout_func not in ('set2set', 'ggnn'):\n            raise ValueError(\n                'Invalid readout function: {}'.format(readout_func))\n        n_readout_layer = n_update_layers if concat_hidden else 1\n        n_message_layer = 1 if weight_tying else n_update_layers\n        with self.init_scope():\n            # Update\n            self.embed = EmbedAtomID(out_size=hidden_channels,\n                                     in_size=n_atom_types)\n            if message_func == 'ggnn':\n                self.update_layers = chainer.ChainList(*[\n                    GGNNUpdate(\n                        hidden_channels=hidden_channels,\n                        n_edge_types=n_edge_types)\n                    for _ in range(n_message_layer)\n                ])\n            else:\n                self.update_layers = chainer.ChainList(*[\n                    MPNNUpdate(hidden_channels=hidden_channels, nn=nn)\n                    for _ in range(n_message_layer)\n                ])\n\n            # Readout\n            if readout_func == 'ggnn':\n                self.readout_layers = chainer.ChainList(*[\n                    GGNNReadout(out_dim=out_dim,\n                 
               in_channels=hidden_channels * 2)\n                    for _ in range(n_readout_layer)\n                ])\n            else:\n                self.readout_layers = chainer.ChainList(*[\n                    MPNNReadout(\n                        out_dim=out_dim, in_channels=hidden_channels,\n                        n_layers=1)\n                    for _ in range(n_readout_layer)\n                ])\n        self.out_dim = out_dim\n        self.hidden_channels = hidden_channels\n        self.n_update_layers = n_update_layers\n        self.n_edge_types = n_edge_types\n        self.concat_hidden = concat_hidden\n        self.weight_tying = weight_tying\n        self.message_func = message_func\n        self.readout_func = readout_func\n\n    def __call__(self, atom_array, adj):\n        # type: (numpy.ndarray, numpy.ndarray) -> chainer.Variable\n        \"\"\"Forward propagation.\n\n        Args:\n            atom_array (numpy.ndarray): minibatch of molecular which is\n                represented with atom IDs (representing C, O, S, ...)\n                `atom_array[mol_index, atom_index]` represents `mol_index`-th\n                molecule's `atom_index`-th atomic number\n            adj (numpy.ndarray): minibatch of adjancency matrix with edge-type\n                information\n        Returns:\n            ~chainer.Variable: minibatch of fingerprint\n        \"\"\"\n        # reset state\n        self.reset_state()\n        if atom_array.dtype == self.xp.int32:\n            h = self.embed(atom_array)\n        else:\n            h = atom_array\n        if self.readout_func == 'ggnn':\n            h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)\n            readout_layers = [\n                partial(readout_layer, h0=h0)\n                for readout_layer in self.readout_layers\n            ]\n        else:\n            readout_layers = self.readout_layers\n        g_list = []\n        for step in range(self.n_update_layers):\n            
message_layer_index = 0 if self.weight_tying else step\n            h = self.update_layers[message_layer_index](h, adj)\n            if self.concat_hidden:\n                g = readout_layers[step](h)\n                g_list.append(g)\n\n        if self.concat_hidden:\n            return functions.concat(g_list, axis=1)\n        else:\n            g = readout_layers[0](h)\n            return g\n\n    def reset_state(self):\n        # type: () -> None\n        [update_layer.reset_state() for update_layer in self.update_layers]\n"
  },
  {
    "path": "chainer_chemistry/models/nfp.py",
    "content": "import chainer\nfrom chainer import Variable, functions  # NOQA\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links import EmbedAtomID\nfrom chainer_chemistry.links.readout.nfp_readout import NFPReadout\nfrom chainer_chemistry.links.update.nfp_update import NFPUpdate\n\n\nclass NFP(chainer.Chain):\n    \"\"\"Neural Finger Print (NFP)\n\n    See: David K Duvenaud, Dougal Maclaurin, Jorge Iparraguirre, Rafael\n        Bombarell, Timothy Hirzel, Alan Aspuru-Guzik, and Ryan P Adams (2015).\n        Convolutional networks on graphs for learning molecular fingerprints.\n        *Advances in Neural Information Processing Systems (NIPS) 28*,\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        hidden_channels (int): dimension of feature vector for each node\n        n_update_layers (int): number of layers\n        max_degree (int): max degree of atoms\n            when molecules are regarded as graphs\n        n_atom_types (int): number of types of atoms\n        concat_hidden (bool): If set to True, readout is executed in each layer\n            and the result is concatenated\n    \"\"\"\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 max_degree=6, n_atom_types=MAX_ATOMIC_NUM,\n                 concat_hidden=False):\n        super(NFP, self).__init__()\n        n_degree_types = max_degree + 1\n        with self.init_scope():\n            self.embed = EmbedAtomID(in_size=n_atom_types,\n                                     out_size=hidden_channels)\n            self.layers = chainer.ChainList(\n                *[NFPUpdate(hidden_channels, hidden_channels,\n                            max_degree=max_degree)\n                  for _ in range(n_update_layers)])\n            self.readout_layers = chainer.ChainList(\n                *[NFPReadout(out_dim=out_dim, in_channels=hidden_channels)\n                  for _ in range(n_update_layers)])\n        
self.out_dim = out_dim\n        self.hidden_channels = hidden_channels\n        self.max_degree = max_degree\n        self.n_degree_types = n_degree_types\n        self.n_update_layers = n_update_layers\n        self.concat_hidden = concat_hidden\n\n    def __call__(self, atom_array, adj, is_real_node=None):\n        \"\"\"Forward propagation\n\n        Args:\n            atom_array (numpy.ndarray): minibatch of molecular which is\n                represented with atom IDs (representing C, O, S, ...)\n                `atom_array[mol_index, atom_index]` represents `mol_index`-th\n                molecule's `atom_index`-th atomic number\n            adj (numpy.ndarray): minibatch of adjancency matrix\n                `adj[mol_index]` represents `mol_index`-th molecule's\n                adjacency matrix\n            is_real_node (numpy.ndarray): 2-dim array (minibatch, num_nodes).\n                1 for real node, 0 for virtual node.\n                If `None`, all node is considered as real node.\n\n        Returns:\n            ~chainer.Variable: minibatch of fingerprint\n        \"\"\"\n        if atom_array.dtype == self.xp.int32:\n            # atom_array: (minibatch, atom)\n            h = self.embed(atom_array)\n        else:\n            h = atom_array\n        # h: (minibatch, atom, ch)\n        g = 0\n\n        # --- NFP update & readout ---\n        # degree_mat: (minibatch, max_num_atoms)\n        if isinstance(adj, Variable):\n            adj_array = adj.data\n        else:\n            adj_array = adj\n        degree_mat = self.xp.sum(adj_array, axis=1)\n        # deg_conds: (minibatch, atom, ch)\n        deg_conds = [self.xp.broadcast_to(\n            ((degree_mat - degree) == 0)[:, :, None], h.shape)\n            for degree in range(1, self.n_degree_types + 1)]\n        g_list = []\n        for update, readout in zip(self.layers, self.readout_layers):\n            h = update(h, adj, deg_conds)\n            dg = readout(h, is_real_node)\n            g 
= g + dg\n            if self.concat_hidden:\n                g_list.append(g)\n\n        if self.concat_hidden:\n            return functions.concat(g_list, axis=2)\n        else:\n            return g\n"
  },
  {
    "path": "chainer_chemistry/models/prediction/__init__.py",
    "content": "from chainer_chemistry.models.prediction import base  # NOQA\nfrom chainer_chemistry.models.prediction import classifier  # NOQA\nfrom chainer_chemistry.models.prediction import graph_conv_predictor  # NOQA\nfrom chainer_chemistry.models.prediction import regressor  # NOQA\n\nfrom chainer_chemistry.models.prediction.base import BaseForwardModel  # NOQA\nfrom chainer_chemistry.models.prediction.classifier import Classifier  # NOQA\nfrom chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor  # NOQA\nfrom chainer_chemistry.models.prediction.regressor import Regressor  # NOQA\nfrom chainer_chemistry.models.prediction.set_up_predictor import set_up_predictor  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/models/prediction/base.py",
    "content": "import pickle\n\nimport numpy\n\nimport chainer\nfrom chainer import cuda\nfrom chainer.dataset.convert import concat_examples\nfrom chainer.iterators import SerialIterator\nfrom chainer import link\nimport chainerx  # NOQA\n\n\ndef _to_tuple(x):\n    if not isinstance(x, tuple):\n        x = (x,)\n    return x\n\n\ndef _extract_numpy(x):\n    if isinstance(x, chainer.Variable):\n        x = x.data\n    return cuda.to_cpu(x)\n\n\nclass BaseForwardModel(link.Chain):\n\n    \"\"\"A base model which supports forward functionality.\n\n    It also supports pickle save/load functionality.\n    \"\"\"\n\n    def __init__(self):\n        super(BaseForwardModel, self).__init__()\n\n        self.inputs = None\n\n    def initialize(self, device=-1):\n        \"\"\"Initialization of the model.\n\n        It must be executed **after** the link registration\n        (often done by `with self.init_scope()` finished.\n\n        Args:\n            device (int or chainer._backend.Device):\n                GPU device id of this model to be used.\n                -1 indicates to use in CPU.\n\n        \"\"\"\n        self.update_device(device=device)\n\n    def update_device(self, device=-1):\n        if not isinstance(device, chainer._backend.Device):\n            device = chainer.get_device(device)  # type: chainerx.Device\n\n        if self.device != device:\n            device.use()\n            # reset current state\n            self.to_cpu()\n            # update the model to specified device\n            self.to_device(device)\n\n    def _forward(self, data, fn, batchsize=16,\n                 converter=concat_examples, retain_inputs=False,\n                 preprocess_fn=None, postprocess_fn=None):\n        \"\"\"Forward data by iterating with batch\n\n        Args:\n            data: \"train_x array\" or \"chainer dataset\"\n            fn (Callable): Main function to forward. 
Its input argument is\n                either Variable, cupy.ndarray or numpy.ndarray, and returns\n                Variable.\n            batchsize (int): batch size\n            converter (Callable): convert from `data` to `inputs`\n            retain_inputs (bool): If True, this instance keeps inputs in\n                `self.inputs` or not.\n            preprocess_fn (Callable): Its input is numpy.ndarray or\n                cupy.ndarray, it can return either Variable, cupy.ndarray or\n                numpy.ndarray\n            postprocess_fn (Callable): Its input argument is Variable,\n                but this method may return either Variable, cupy.ndarray or\n                numpy.ndarray.\n\n        Returns (tuple or numpy.ndarray): forward result\n\n        \"\"\"\n        input_list = None\n        output_list = None\n        it = SerialIterator(data, batch_size=batchsize, repeat=False,\n                            shuffle=False)\n        for batch in it:\n            inputs = converter(batch, self.device)\n            inputs = _to_tuple(inputs)\n\n            if preprocess_fn:\n                inputs = preprocess_fn(*inputs)\n                inputs = _to_tuple(inputs)\n\n            outputs = fn(*inputs)\n            outputs = _to_tuple(outputs)\n\n            # Init\n            if retain_inputs:\n                if input_list is None:\n                    input_list = [[] for _ in range(len(inputs))]\n                for j, input in enumerate(inputs):\n                    input_list[j].append(cuda.to_cpu(input))\n            if output_list is None:\n                output_list = [[] for _ in range(len(outputs))]\n\n            if postprocess_fn:\n                outputs = postprocess_fn(*outputs)\n                outputs = _to_tuple(outputs)\n            for j, output in enumerate(outputs):\n                output_list[j].append(_extract_numpy(output))\n\n        if retain_inputs:\n            self.inputs = [numpy.concatenate(\n                
in_array) for in_array in input_list]\n\n        result = [numpy.concatenate(output) for output in output_list]\n        if len(result) == 1:\n            return result[0]\n        else:\n            return result\n\n    def save_pickle(self, filepath, protocol=None):\n        \"\"\"Save the model to `filepath` as a pickle file\n\n        This function send the parameters to CPU before saving the model so\n        that the pickled file can be loaded with in CPU-only environment. \n        After the model is saved, it is sent back to the original device.\n\n        Saved pickle file can be loaded with `load_pickle` static method.\n\n        Note that the transportability of the saved file follows the\n        specification of `pickle` module, namely serialized data depends on the\n        specific class or attribute structure when saved. The file may not be\n        loaded in different environment (version of python or dependent\n        libraries), or after large refactoring of the pickled object class.\n        If you want to avoid it, use `chainer.serializers.save_npz`\n        method instead to save only model parameters.\n\n    .. 
admonition:: Example\n\n       >>> from chainer_chemistry.models import BaseForwardModel\n       >>> class DummyForwardModel(BaseForwardModel):\n       >>> \n       >>>     def __init__(self, device=-1):\n       >>>         super(DummyForwardModel, self).__init__()\n       >>>         with self.init_scope():\n       >>>             self.l = chainer.links.Linear(3, 10)\n       >>>         self.initialize(device)\n       >>> \n       >>>     def __call__(self, x):\n       >>>         return self.l(x)\n       >>>\n       >>> model = DummyForwardModel()\n       >>> filepath = 'model.pkl'\n       >>> model.save_pickle(filepath)  \n\n        Args:\n            filepath (str): file path of pickle file.\n            protocol (int or None): protocol version used in `pickle`.\n                Use 2 if you need python2/python3 compatibility.\n                3 or higher is used for python3.\n                Please refer the official document [1] for more details.\n                [1]: https://docs.python.org/3.6/library/pickle.html#module-interface\n\n        \"\"\"  # NOQA\n        current_device = self.device\n\n        # --- Move the model to CPU for saving ---\n        self.update_device(-1)\n        with open(filepath, mode='wb') as f:\n            pickle.dump(self, f, protocol=protocol)\n\n        # --- Revert the model to original device ---\n        self.update_device(current_device)\n\n    @staticmethod\n    def load_pickle(filepath, device=-1):\n        \"\"\"Load the model from `filepath` of pickle file, and send to `device`\n\n        The file saved by `save_pickle` method can be loaded, but it may fail\n        to load when loading from different develop environment or after\n        updating library version.\n        See `save_pickle` method for the transportability of the saved file.\n\n    .. 
admonition:: Example\n\n       >>> from chainer_chemistry.models import BaseForwardModel\n       >>> filepath = 'model.pkl'\n       >>> # `load_pickle` is static method, call from Class to get an instance\n       >>> model = BaseForwardModel.load_pickle(filepath)\n\n        Args:\n            filepath (str): file path of pickle file.\n            device (int or chainerx.Device): GPU device id of this model to be\n                used. -1 indicates to use in CPU.\n\n        \"\"\"\n        with open(filepath, mode='rb') as f:\n            model = pickle.load(f)\n\n        if not isinstance(model, BaseForwardModel):\n            raise TypeError('Unexpected type {}'.format(type(model)))\n\n        # --- Revert the model to specified device ---\n        model.initialize(device)\n        return model\n"
  },
  {
    "path": "chainer_chemistry/models/prediction/classifier.py",
    "content": "import warnings\n\nimport numpy\n\nimport chainer\nfrom chainer.dataset.convert import concat_examples\nfrom chainer.functions.evaluation import accuracy\nfrom chainer.functions.loss import softmax_cross_entropy\nfrom chainer import cuda, Variable  # NOQA\nfrom chainer import reporter\nfrom chainer_chemistry.models.prediction.base import BaseForwardModel\n\n\ndef _argmax(*args):\n    x = args[0]\n    return chainer.functions.argmax(x, axis=1)\n\n\nclass Classifier(BaseForwardModel):\n\n    \"\"\"A simple classifier model.\n\n    This is an example of chain that wraps another chain. It computes the\n    loss and accuracy based on a given input/label pair.\n\n    Args:\n        predictor (~chainer.Link): Predictor network.\n        lossfun (function): Loss function.\n        accfun (function): DEPRECATED. Please use `metrics_fun` instead.\n        metrics_fun (function or dict or None): Function that computes metrics.\n        label_key (int or str): Key to specify label variable from arguments.\n            When it is ``int``, a variable in positional arguments is used.\n            And when it is ``str``, a variable in keyword arguments is used.\n        device (int or chainer._backend.Device):\n             GPU device id of this Regressor to be used.\n             -1 indicates to use in CPU.\n\n    Attributes:\n        predictor (~chainer.Link): Predictor network.\n        lossfun (function): Loss function.\n        accfun (function): DEPRECATED. Please use `metrics_fun` instead.\n        y (~chainer.Variable): Prediction for the last minibatch.\n        loss (~chainer.Variable): Loss value for the last minibatch.\n        metrics (dict): Metrics computed in last minibatch\n        compute_metrics (bool): If ``True``, compute metrics on the forward\n            computation. The default value is ``True``.\n\n    .. 
note::\n        The differences between original `Classifier` class in chainer and\n        chainer chemistry are as follows.\n        1. `predict` and `predict_proba` methods are supported.\n        2. `device` can be managed internally by the `Classifier`\n        3. `accfun` is deprecated, `metrics_fun` is used instead.\n        4. `metrics_fun` can be `dict` which specifies the metrics name as key\n           and function as value.\n\n    .. note::\n        This link uses :func:`chainer.softmax_cross_entropy` with\n        default arguments as a loss function (specified by ``lossfun``),\n        if users do not explicitly change it. In particular, the loss function\n        does not support double backpropagation.\n        If you need second or higher order differentiation, you need to turn\n        it on with ``enable_double_backprop=True``:\n\n          >>> import chainer.functions as F\n          >>> import chainer.links as L\n          >>>\n          >>> def lossfun(x, t):\n          ...     return F.softmax_cross_entropy(\n          ...         
x, t, enable_double_backprop=True)\n          >>>\n          >>> predictor = L.Linear(10)\n          >>> model = L.Classifier(predictor, lossfun=lossfun)\n\n    \"\"\"\n\n    compute_metrics = True\n\n    def __init__(self, predictor,\n                 lossfun=softmax_cross_entropy.softmax_cross_entropy,\n                 accfun=None, metrics_fun=accuracy.accuracy,\n                 label_key=-1, device=-1):\n        if not (isinstance(label_key, (int, str))):\n            raise TypeError('label_key must be int or str, but is %s' %\n                            type(label_key))\n        if accfun is not None:\n            warnings.warn(\n                'accfun is deprecated, please use metrics_fun instead')\n            warnings.warn('overriding metrics by accfun...')\n            # override metrics by accfun\n            metrics_fun = accfun\n\n        super(Classifier, self).__init__()\n        self.lossfun = lossfun\n        if metrics_fun is None:\n            self.compute_metrics = False\n            self.metrics_fun = {}\n        elif callable(metrics_fun):\n            self.metrics_fun = {'accuracy': metrics_fun}\n        elif isinstance(metrics_fun, dict):\n            self.metrics_fun = metrics_fun\n        else:\n            raise TypeError('Unexpected type metrics_fun must be None or '\n                            'Callable or dict. 
actual {}'.format(type(accfun)))\n        self.y = None\n        self.loss = None\n        self.metrics = None\n        self.label_key = label_key\n\n        with self.init_scope():\n            self.predictor = predictor\n\n        # `initialize` must be called after `init_scope`.\n        self.initialize(device)\n\n    def _convert_to_scalar(self, value):\n        \"\"\"Converts an input value to a scalar if its type is a Variable,\n\n        numpy or cupy array, otherwise it returns the value as it is.\n        \"\"\"\n        if isinstance(value, Variable):\n            value = value.array\n        if numpy.isscalar(value):\n            return value\n        if type(value) is not numpy.array:\n            value = cuda.to_cpu(value)\n        return numpy.asscalar(value)\n\n    def __call__(self, *args, **kwargs):\n        \"\"\"Computes the loss value for an input and label pair.\n\n        It also computes accuracy and stores it to the attribute.\n\n        Args:\n            args (list of ~chainer.Variable): Input minibatch.\n            kwargs (dict of ~chainer.Variable): Input minibatch.\n\n        When ``label_key`` is ``int``, the correpoding element in ``args``\n        is treated as ground truth labels. 
And when it is ``str``, the\n        element in ``kwargs`` is used.\n        The all elements of ``args`` and ``kwargs`` except the ground trush\n        labels are features.\n        It feeds features to the predictor and compare the result\n        with ground truth labels.\n\n        Returns:\n            ~chainer.Variable: Loss value.\n\n        \"\"\"\n\n        # --- Separate `args` and `t` ---\n        if isinstance(self.label_key, int):\n            if not (-len(args) <= self.label_key < len(args)):\n                msg = 'Label key %d is out of bounds' % self.label_key\n                raise ValueError(msg)\n            t = args[self.label_key]\n            if self.label_key == -1:\n                args = args[:-1]\n            else:\n                args = args[:self.label_key] + args[self.label_key + 1:]\n        elif isinstance(self.label_key, str):\n            if self.label_key not in kwargs:\n                msg = 'Label key \"%s\" is not found' % self.label_key\n                raise ValueError(msg)\n            t = kwargs[self.label_key]\n            del kwargs[self.label_key]\n        else:\n            raise TypeError('Label key type {} not supported'\n                            .format(type(self.label_key)))\n\n        self.y = None\n        self.loss = None\n        self.metrics = None\n        self.y = self.predictor(*args, **kwargs)\n        self.loss = self.lossfun(self.y, t)\n        reporter.report(\n            {'loss': self._convert_to_scalar(self.loss)}, self)\n        if self.compute_metrics:\n            # Note: self.accuracy is `dict`, which is different from original\n            # chainer implementation\n            self.metrics = {key: self._convert_to_scalar(value(self.y, t))\n                            for key, value in self.metrics_fun.items()}\n            reporter.report(self.metrics, self)\n        return self.loss\n\n    def predict_proba(\n            self, data, batchsize=16, converter=concat_examples,\n            
retain_inputs=False, preprocess_fn=None,\n            postprocess_fn=chainer.functions.softmax):\n        \"\"\"Calculate probability of each category.\n\n        Args:\n            data: \"train_x array\" or \"chainer dataset\"\n            fn (Callable): Main function to forward. Its input argument is\n                either Variable, cupy.ndarray or numpy.ndarray, and returns\n                Variable.\n            batchsize (int): batch size\n            converter (Callable): convert from `data` to `inputs`\n            preprocess_fn (Callable): Its input is numpy.ndarray or\n                cupy.ndarray, it can return either Variable, cupy.ndarray or\n                numpy.ndarray\n            postprocess_fn (Callable): Its input argument is Variable,\n                but this method may return either Variable, cupy.ndarray or\n                numpy.ndarray.\n            retain_inputs (bool): If True, this instance keeps inputs in\n                `self.inputs` or not.\n\n        Returns (tuple or numpy.ndarray): Typically, it is 2-dimensional float\n            array with shape (batchsize, number of category) which represents\n            each examples probability to be each category.\n\n        \"\"\"\n        with chainer.no_backprop_mode(), chainer.using_config('train', False):\n            proba = self._forward(\n                data, fn=self.predictor, batchsize=batchsize,\n                converter=converter, retain_inputs=retain_inputs,\n                preprocess_fn=preprocess_fn, postprocess_fn=postprocess_fn)\n        return proba\n\n    def predict(\n            self, data, batchsize=16, converter=concat_examples,\n            retain_inputs=False, preprocess_fn=None, postprocess_fn=_argmax):\n        \"\"\"Predict label of each category by taking .\n\n        Args:\n            data: input data\n            batchsize (int): batch size\n            converter (Callable): convert from `data` to `inputs`\n            preprocess_fn (Callable): Its input 
is numpy.ndarray or\n                cupy.ndarray, it can return either Variable, cupy.ndarray or\n                numpy.ndarray\n            postprocess_fn (Callable): Its input argument is Variable,\n                but this method may return either Variable, cupy.ndarray or\n                numpy.ndarray.\n            retain_inputs (bool): If True, this instance keeps inputs in\n                `self.inputs` or not.\n\n        Returns (tuple or numpy.ndarray): Typically, it is 1-dimensional int\n            array with shape (batchsize, ) which represents each examples\n            category prediction.\n\n        \"\"\"\n        with chainer.no_backprop_mode(), chainer.using_config('train', False):\n            predict_labels = self._forward(\n                data, fn=self.predictor, batchsize=batchsize,\n                converter=converter, retain_inputs=retain_inputs,\n                preprocess_fn=preprocess_fn, postprocess_fn=postprocess_fn)\n        return predict_labels\n\n    # --- For backward compatibility ---\n    @property\n    def compute_accuracy(self):\n        warnings.warn('compute_accuracy is deprecated,'\n                      'please use compute_metrics instead')\n        return self.compute_metrics\n\n    @compute_accuracy.setter\n    def compute_accuracy(self, value):\n        warnings.warn('compute_accuracy is deprecated,'\n                      'please use compute_metrics instead')\n        self.compute_metrics = value\n\n    @property\n    def accuracy(self):\n        warnings.warn('accuracy is deprecated,'\n                      'please use metrics instead')\n        return self.metrics\n\n    @accuracy.setter\n    def accuracy(self, value):\n        warnings.warn('accuracy is deprecated,'\n                      'please use metrics instead')\n        self.metrics = value\n\n    @property\n    def accfun(self):\n        warnings.warn('accfun is deprecated,'\n                      'please use metrics_fun instead')\n        return 
self.metrics_fun\n\n    @accfun.setter\n    def accfun(self, value):\n        warnings.warn('accfun is deprecated,'\n                      'please use metrics_fun instead')\n        self.metrics_fun = value\n"
  },
  {
    "path": "chainer_chemistry/models/prediction/graph_conv_predictor.py",
    "content": "from typing import Optional  # NOQA\n\nimport chainer\nimport numpy  # NOQA\n\n\nclass GraphConvPredictor(chainer.Chain):\n    \"\"\"Wrapper class that combines a graph convolution and MLP.\"\"\"\n\n    def __init__(\n            self,\n            graph_conv,  # type: chainer.Link\n            mlp=None,  # type: Optional[chainer.Link]\n            label_scaler=None,  # type: Optional[chainer.Link]\n            postprocess_fn=None  # type: Optional[chainer.FunctionNode]\n    ):\n        # type: (...) -> None\n        \"\"\"Initialize the graph convolution predictor.\n\n        Args:\n            graph_conv (chainer.Chain): The graph convolution network\n                required to obtain molecule feature representation.\n            mlp (chainer.Chain or None): Multi layer perceptron;\n                used as the final fully connected layer. Set it to\n                `None` if no operation is necessary after the\n                `graph_conv` calculation.\n            label_scaler (chainer.Link or None): scaler link\n            postprocess_fn (chainer.FunctionNode or None):\n                postprocess function for prediction.\n        \"\"\"\n        super(GraphConvPredictor, self).__init__()\n        with self.init_scope():\n            self.graph_conv = graph_conv\n            if isinstance(mlp, chainer.Link):\n                self.mlp = mlp\n            if isinstance(label_scaler, chainer.Link):\n                self.label_scaler = label_scaler\n        if not isinstance(mlp, chainer.Link):\n            self.mlp = mlp\n        if not isinstance(label_scaler, chainer.Link):\n            self.label_scaler = label_scaler\n        self.postprocess_fn = postprocess_fn or chainer.functions.identity\n\n    def __call__(self, *args, **kwargs):\n        x = self.graph_conv(*args, **kwargs)\n        if self.mlp:\n            x = self.mlp(x)\n        if self.label_scaler is not None:\n            x = self.label_scaler.inverse_transform(x)\n        return 
x\n\n    def predict(self, atoms, adjs):\n        # type: (numpy.ndarray, numpy.ndarray) -> chainer.Variable\n        # TODO(nakago): support super_node & is_real_node args.\n        with chainer.no_backprop_mode(), chainer.using_config('train', False):\n            x = self.__call__(atoms, adjs)\n            return self.postprocess_fn(x)\n"
  },
  {
    "path": "chainer_chemistry/models/prediction/node_classifier.py",
    "content": "from chainer import reporter\nfrom chainer_chemistry.models.prediction.classifier import Classifier\n\n\nclass NodeClassifier(Classifier):\n    \"\"\"A simple node classifier model.\"\"\"\n\n    def __call__(self, data, train_mask, valid_mask, *args, **kwargs):\n        \"\"\"Computes the loss value for an input and label pair.\"\"\"\n        self.metrics = None\n        self.y = self.predictor(data)\n        # Support for padding pattern\n        if self.y.ndim == 3:\n            assert self.y.shape[0] == 1\n            self.y = self.y[0]\n        self.train_loss = self.lossfun(self.y[train_mask], data.y[train_mask])\n        self.valid_loss = self.lossfun(self.y[valid_mask], data.y[valid_mask])\n        reporter.report(\n            {'loss(train)': self._convert_to_scalar(self.train_loss)}, self)\n        reporter.report(\n            {'loss(valid)': self._convert_to_scalar(self.valid_loss)}, self)\n        if self.compute_metrics:\n            # Note: self.accuracy is `dict`, which is different from original\n            # chainer implementation\n            self.train_metrics = {key + \"(train)\":\n                                  self._convert_to_scalar(\n                                      value(self.y[train_mask],\n                                            data.y[train_mask]))\n                                  for key, value in self.metrics_fun.items()}\n            self.valid_metrics = {key + \"(valid)\":\n                                  self._convert_to_scalar(\n                                      value(self.y[valid_mask],\n                                            data.y[valid_mask]))\n                                  for key, value in self.metrics_fun.items()}\n            reporter.report(self.train_metrics, self)\n            reporter.report(self.valid_metrics, self)\n        return self.train_loss\n"
  },
  {
    "path": "chainer_chemistry/models/prediction/regressor.py",
    "content": "import numpy\n\nimport chainer\nfrom chainer.dataset.convert import concat_examples\nfrom chainer import cuda, Variable  # NOQA\nfrom chainer import reporter\n\nfrom chainer_chemistry.dataset.graph_dataset.base_graph_data import BaseGraphData  # NOQA\nfrom chainer_chemistry.models.prediction.base import BaseForwardModel\n\n\nclass Regressor(BaseForwardModel):\n    \"\"\"A simple regressor model.\n\n    This is an example of chain that wraps another chain. It computes the\n    loss and metrics based on a given input/label pair.\n\n    Args:\n        predictor (~chainer.Link): Predictor network.\n        lossfun (function): Loss function.\n        metrics_fun (function or dict or None): Function that computes metrics.\n        label_key (int or str): Key to specify label variable from arguments.\n            When it is ``int``, a variable in positional arguments is used.\n            And when it is ``str``, a variable in keyword arguments is used.\n        device (int or chainer._backend.Device):\n             GPU device id of this Regressor to be used.\n             -1 indicates to use in CPU.\n\n    Attributes:\n        predictor (~chainer.Link): Predictor network.\n        lossfun (function): Loss function.\n        y (~chainer.Variable): Prediction for the last minibatch.\n        loss (~chainer.Variable): Loss value for the last minibatch.\n        metrics (dict): Metrics computed in last minibatch\n        compute_metrics (bool): If ``True``, compute metrics on the forward\n            computation. 
The default value is ``True``.\n\n    \"\"\"\n\n    compute_metrics = True\n\n    def __init__(self, predictor,\n                 lossfun=chainer.functions.mean_squared_error,\n                 metrics_fun=None, label_key=-1, device=-1):\n        if not (isinstance(label_key, (int, str))):\n            raise TypeError('label_key must be int or str, but is %s' %\n                            type(label_key))\n        super(Regressor, self).__init__()\n        self.lossfun = lossfun\n        if metrics_fun is None:\n            self.compute_metrics = False\n            self.metrics_fun = {}\n        elif callable(metrics_fun):\n            self.metrics_fun = {'metrics': metrics_fun}\n        elif isinstance(metrics_fun, dict):\n            self.metrics_fun = metrics_fun\n        else:\n            raise TypeError('Unexpected type metrics_fun must be None or '\n                            'Callable or dict. actual {}'\n                            .format(type(metrics_fun)))\n        self.y = None\n        self.loss = None\n        self.metrics = None\n        self.label_key = label_key\n\n        with self.init_scope():\n            self.predictor = predictor\n\n        # `initialize` must be called after `init_scope`.\n        self.initialize(device)\n\n    def _convert_to_scalar(self, value):\n        \"\"\"Converts an input value to a scalar if its type is a Variable,\n\n        numpy or cupy array, otherwise it returns the value as it is.\n        \"\"\"\n        if isinstance(value, Variable):\n            value = value.array\n        if numpy.isscalar(value):\n            return value\n        if type(value) is not numpy.array:\n            value = cuda.to_cpu(value)\n        return numpy.asscalar(value)\n\n    def __call__(self, *args, **kwargs):\n        \"\"\"Computes the loss value for an input and label pair.\n\n        It also computes metrics and stores it to the attribute.\n\n        Args:\n            args (list of ~chainer.Variable): Input minibatch.\n  
          kwargs (dict of ~chainer.Variable): Input minibatch.\n\n        When ``label_key`` is ``int``, the corresponding element in ``args``\n        is treated as ground truth labels. And when it is ``str``, the\n        element in ``kwargs`` is used.\n        All the elements of ``args`` and ``kwargs`` except the ground truth\n        labels are features.\n        It feeds features to the predictor and compares the result\n        with ground truth labels.\n\n        Returns:\n            ~chainer.Variable: Loss value.\n\n        \"\"\"\n\n        # --- Separate `args` and `t` ---\n        if isinstance(args[0], BaseGraphData):\n            # for graph dataset\n            t = args[0].y\n        elif isinstance(self.label_key, int):\n            if not (-len(args) <= self.label_key < len(args)):\n                msg = 'Label key %d is out of bounds' % self.label_key\n                raise ValueError(msg)\n            t = args[self.label_key]\n            if self.label_key == -1:\n                args = args[:-1]\n            else:\n                args = args[:self.label_key] + args[self.label_key + 1:]\n        elif isinstance(self.label_key, str):\n            if self.label_key not in kwargs:\n                msg = 'Label key \"%s\" is not found' % self.label_key\n                raise ValueError(msg)\n            t = kwargs[self.label_key]\n            del kwargs[self.label_key]\n        else:\n            raise TypeError('Label key type {} not supported'\n                            .format(type(self.label_key)))\n\n        self.y = None\n        self.loss = None\n        self.metrics = None\n        self.y = self.predictor(*args, **kwargs)\n        self.loss = self.lossfun(self.y, t)\n\n        # When the reported data is a numpy array, the loss and metrics values\n        # are scalars. When the reported data is a cupy array, sometimes the\n        # same values become arrays instead. 
This seems to be a bug inside the\n        # reporter class, which needs to be addressed and fixed. Until then,\n        # the reported values will be converted to numpy arrays.\n        reporter.report(\n            {'loss': self._convert_to_scalar(self.loss)}, self)\n\n        if self.compute_metrics:\n            # Note: self.metrics_fun is `dict`,\n            # which is different from original chainer implementation\n            self.metrics = {key: self._convert_to_scalar(value(self.y, t))\n                            for key, value in self.metrics_fun.items()}\n            reporter.report(self.metrics, self)\n        return self.loss\n\n    def predict(\n            self, data, batchsize=16, converter=concat_examples,\n            retain_inputs=False, preprocess_fn=None, postprocess_fn=None):\n        \"\"\"Predict label of each category.\n\n        Args:\n            data: input data\n            batchsize (int): batch size\n            converter (Callable): convert from `data` to `inputs`\n            preprocess_fn (Callable): Its input is numpy.ndarray or\n                cupy.ndarray, it can return either Variable, cupy.ndarray or\n                numpy.ndarray\n            postprocess_fn (Callable): Its input argument is Variable,\n                but this method may return either Variable, cupy.ndarray or\n                numpy.ndarray.\n            retain_inputs (bool): If True, this instance keeps inputs in\n                `self.inputs` or not.\n\n        Returns (tuple or numpy.ndarray): Typically, it is 1-dimensional int\n            array with shape (batchsize, ) which represents each example's\n            category prediction.\n\n        \"\"\"\n        with chainer.no_backprop_mode(), chainer.using_config('train', False):\n            predict_labels = self._forward(\n                data, fn=self.predictor, batchsize=batchsize,\n                converter=converter, retain_inputs=retain_inputs,\n                
preprocess_fn=preprocess_fn, postprocess_fn=postprocess_fn)\n        return predict_labels\n"
  },
  {
    "path": "chainer_chemistry/models/prediction/set_up_predictor.py",
    "content": "from typing import Any  # NOQA\nfrom typing import Dict  # NOQA\nfrom typing import Optional  # NOQA\n\nimport chainer  # NOQA\n\nfrom chainer_chemistry.models.cgcnn import CGCNN\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.models.ggnn import GGNN\nfrom chainer_chemistry.models.gin import GIN, GINSparse  # NOQA\nfrom chainer_chemistry.models.gnn_film import GNNFiLM\nfrom chainer_chemistry.models.megnet import MEGNet\nfrom chainer_chemistry.models.mlp import MLP\nfrom chainer_chemistry.models.nfp import NFP\nfrom chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor  # NOQA\nfrom chainer_chemistry.models.relgat import RelGAT\nfrom chainer_chemistry.models.relgcn import RelGCN, RelGCNSparse  # NOQA\nfrom chainer_chemistry.models.rsgcn import RSGCN\nfrom chainer_chemistry.models.schnet import SchNet\nfrom chainer_chemistry.models.weavenet import WeaveNet\n\n\nfrom chainer_chemistry.models.gwm.gwm_net import GGNN_GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import GIN_GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import NFP_GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import RSGCN_GWM  # NOQA\n\nfrom chainer_chemistry.models.cwle.cwle_net import GGNN_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import RelGAT_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import RelGCN_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import GIN_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import NFP_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import RSGCN_CWLE  # NOQA\n\nfrom chainer_chemistry.models.gwle.gwle_net import GGNN_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import RelGAT_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import RelGCN_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import GIN_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import NFP_GWLE  # NOQA\nfrom 
chainer_chemistry.models.gwle.gwle_net import RSGCN_GWLE  # NOQA\n\nfrom chainer_chemistry.models.cwle.cwle_graph_conv_model import MAX_WLE_NUM\n\n\ndef set_up_predictor(\n        method,  # type: str\n        n_unit,  # type: int\n        conv_layers,  # type: int\n        class_num,  # type: int\n        label_scaler=None,  # type: Optional[chainer.Link]\n        postprocess_fn=None,  # type: Optional[chainer.FunctionNode]\n        n_atom_types=MAX_ATOMIC_NUM,\n        conv_kwargs=None,  # type: Optional[Dict[str, Any]]\n        n_wle_types=MAX_WLE_NUM  # type: int\n):\n    # type: (...) -> GraphConvPredictor\n    \"\"\"Set up the predictor, consisting of a GCN and a MLP.\n\n    Args:\n        method (str): Method name.\n        n_unit (int): Number of hidden units.\n        conv_layers (int): Number of convolutional layers for the graph\n            convolution network.\n        class_num (int): Number of output classes.\n        label_scaler (chainer.Link or None): scaler link\n        postprocess_fn (chainer.FunctionNode or None):\n            postprocess function for prediction.\n        conv_kwargs (dict): keyword args for GraphConvolution model.\n    \"\"\"\n    mlp = MLP(out_dim=class_num, hidden_dim=n_unit)  # type: Optional[MLP]\n    if conv_kwargs is None:\n        conv_kwargs = {}\n\n    if method == 'nfp' or method == 'nfp_wle':\n        print('Set up NFP predictor...')\n        conv = NFP(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'ggnn' or method == 'ggnn_wle':\n        print('Set up GGNN predictor...')\n        conv = GGNN(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'schnet':\n        print('Set up SchNet predictor...')\n        conv = SchNet(\n   
         out_dim=class_num,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n        mlp = None\n    elif method == 'weavenet':\n        print('Set up WeaveNet predictor...')\n        conv = WeaveNet(hidden_dim=n_unit, n_atom_types=n_atom_types, **conv_kwargs)\n    elif method == 'rsgcn' or method == 'rsgcn_wle':\n        print('Set up RSGCN predictor...')\n        conv = RSGCN(out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'relgcn' or method == 'relgcn_wle':\n        print('Set up Relational GCN predictor...')\n        num_edge_type = 4\n        conv = RelGCN(\n            out_dim=n_unit,\n            n_edge_types=num_edge_type,\n            scale_adj=True,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'relgat' or method == 'relgat_wle':\n        print('Set up Relational GAT predictor...')\n        conv = RelGAT(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'gin' or method == 'gin_wle':\n        print('Set up GIN predictor...')\n        conv = GIN(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'nfp_gwm':\n        print('Set up NFP_GWM predictor...')\n        conv = NFP_GWM(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'ggnn_gwm':\n        print('Set up GGNN_GWM predictor...')\n        conv = GGNN_GWM(\n            out_dim=n_unit,\n            
hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'rsgcn_gwm':\n        print('Set up RSGCN_GWM predictor...')\n        conv = RSGCN_GWM(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'gin_gwm':\n        print('Set up GIN_GWM predictor...')\n        conv = GIN_GWM(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'nfp_cwle':\n        print('Set up NFP_CWLE predictor...')\n        conv = NFP_CWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    elif method == 'ggnn_cwle':\n        print('Set up GGNN_CWLE predictor...')\n        conv = GGNN_CWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    elif method == 'relgat_cwle':\n        print('Set up RelGAT_CWLE predictor...')\n        conv = RelGAT_CWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    elif method == 'relgcn_cwle':\n        print('Set up RelGCN_CWLE predictor...')\n        conv = RelGCN_CWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    
elif method == 'rsgcn_cwle':\n        print('Set up RSGCN_CWLE predictor...')\n        conv = RSGCN_CWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    elif method == 'gin_cwle':\n        print('Set up GIN_CWLE predictor...')\n        conv = GIN_CWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    elif method == 'nfp_gwle':\n        print('Set up NFP_GWLE predictor...')\n        conv = NFP_GWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    elif method == 'ggnn_gwle':\n        print('Set up GGNN_GWLE predictor...')\n        conv = GGNN_GWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    elif method == 'relgat_gwle':\n        print('Set up RelGAT_GWLE predictor...')\n        conv = RelGAT_GWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    elif method == 'relgcn_gwle':\n        print('Set up RelGCN_GWLE predictor...')\n        conv = RelGCN_GWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    elif method == 'rsgcn_gwle':\n        print('Set up 
RSGCN_GWLE predictor...')\n        conv = RSGCN_GWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    elif method == 'gin_cwle':\n        print('Set up GIN_CWLE predictor...')\n        conv = GIN_CWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    elif method == 'gin_gwle':\n        print('Set up GIN_GWLE predictor...')\n        conv = GIN_GWLE(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            n_wle_types=n_wle_types,\n            **conv_kwargs)\n    elif method == 'relgcn_sparse':\n        print('Set up RelGCNSparse predictor...')\n        conv = RelGCNSparse(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'gin_sparse':\n        print('Set up GIN predictor...')\n        conv = GINSparse(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'gnnfilm':\n        print('Training a GNN_FiLM predictor...')\n        conv = GNNFiLM(\n            out_dim=n_unit,\n            hidden_channels=n_unit,\n            n_update_layers=conv_layers,\n            n_edge_types=5,\n            n_atom_types=n_atom_types,\n            **conv_kwargs)\n    elif method == 'megnet':\n        print('Set up MEGNet predictor...')\n        conv = MEGNet(\n            out_dim=n_unit,\n            n_update_layers=conv_layers,\n            **conv_kwargs)\n    elif 
method == 'cgcnn':\n        print('Set up CGCNN predictor...')\n        conv = CGCNN(\n            out_dim=n_unit,\n            n_update_layers=conv_layers,\n            **conv_kwargs)\n    else:\n        raise ValueError('[ERROR] Invalid method: {}'.format(method))\n\n    predictor = GraphConvPredictor(conv, mlp, label_scaler, postprocess_fn)\n    return predictor\n"
  },
  {
    "path": "chainer_chemistry/models/relgat.py",
    "content": "# -*- coding: utf-8 -*-\nimport chainer\nfrom chainer import functions, cuda  # NOQA\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links import EmbedAtomID\nfrom chainer_chemistry.links.readout.ggnn_readout import GGNNReadout\nfrom chainer_chemistry.links.update.relgat_update import RelGATUpdate\n\n\nclass RelGAT(chainer.Chain):\n    \"\"\"Relational Graph Attention Networks (GAT)\n\n    See: Veličković, Petar, et al. (2017).\\\n        Graph Attention Networks.\\\n        `arXiv:1701.10903 <https://arxiv.org/abs/1710.10903>`\\\n        Dan Busbridge, et al. (2018).\\\n        Relational Graph Attention Networks\n        `<https://openreview.net/forum?id=Bklzkh0qFm>`\\\n\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        hidden_channels (int): dimension of feature vector for each node\n        n_update_layers (int): number of layers\n        n_atom_types (int): number of types of atoms\n        concat_hidden (bool): If set to True, readout is executed in each layer\n            and the result is concatenated\n        dropout_ratio (float): dropout ratio of the normalized attention\n            coefficients\n        weight_tying (bool): enable weight_tying or not\n        activation (~chainer.Function or ~chainer.FunctionNode):\n            activate function\n        n_edge_types (int): number of edge type.\n            Defaults to 4 for single, double, triple and aromatic bond.\n        n_heads (int): number of multi-head-attentions.\n        negative_slope (float): LeakyRELU angle of the negative slope\n        softmax_mode (str): take the softmax over the logits 'across' or\n            'within' relation. 
If you would like to know the detail discussion,\n            please refer Relational GAT paper.\n        concat_heads (bool) : Whether to concat or average multi-head\n            attentions\n    \"\"\"\n    def __init__(self, out_dim, hidden_channels=16, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM, concat_hidden=False,\n                 dropout_ratio=-1., weight_tying=False,\n                 activation=functions.identity, n_edge_types=4,\n                 n_heads=3, negative_slope=0.2,\n                 softmax_mode='across', concat_heads=False):\n        super(RelGAT, self).__init__()\n        n_readout_layer = n_update_layers if concat_hidden else 1\n        n_message_layer = n_update_layers\n        with self.init_scope():\n            self.embed = EmbedAtomID(out_size=hidden_channels,\n                                     in_size=n_atom_types)\n            update_layers = []\n            for i in range(n_message_layer):\n                if i > 0 and concat_heads:\n                    input_dim = hidden_channels * n_heads\n                else:\n                    input_dim = hidden_channels\n                update_layers.append(\n                    RelGATUpdate(input_dim, hidden_channels, n_heads=n_heads,\n                                 n_edge_types=n_edge_types,\n                                 dropout_ratio=dropout_ratio,\n                                 negative_slope=negative_slope,\n                                 softmax_mode=softmax_mode,\n                                 concat_heads=concat_heads))\n            self.update_layers = chainer.ChainList(*update_layers)\n            if concat_heads:\n                in_channels = hidden_channels * (n_heads + 1)\n            else:\n                in_channels = hidden_channels * 2\n            self.readout_layers = chainer.ChainList(*[GGNNReadout(\n                out_dim=out_dim, in_channels=in_channels,\n                activation=activation, activation_agg=activation)\n       
         for _ in range(n_readout_layer)])\n\n        self.out_dim = out_dim\n        self.n_heads = n_heads\n        self.hidden_channels = hidden_channels\n        self.n_update_layers = n_update_layers\n        self.concat_hidden = concat_hidden\n        self.concat_heads = concat_heads\n        self.weight_tying = weight_tying\n        self.negative_slope = negative_slope\n        self.n_edge_types = n_edge_types\n        self.dropout_ratio = dropout_ratio\n\n    def __call__(self, atom_array, adj):\n        \"\"\"Forward propagation\n\n        Args:\n            atom_array (numpy.ndarray): minibatch of molecular which is\n                represented with atom IDs (representing C, O, S, ...)\n                `atom_array[mol_index, atom_index]` represents `mol_index`-th\n                molecule's `atom_index`-th atomic number\n            adj (numpy.ndarray): minibatch of adjancency matrix with edge-type\n                information\n\n        Returns:\n            ~chainer.Variable: minibatch of fingerprint\n        \"\"\"\n        # reset state\n        if atom_array.dtype == self.xp.int32:\n            h = self.embed(atom_array)  # (minibatch, max_num_atoms)\n        else:\n            h = atom_array\n        h0 = functions.copy(h, cuda.get_device_from_array(h.data).id)\n        g_list = []\n        for step in range(self.n_update_layers):\n            message_layer_index = 0 if self.weight_tying else step\n            h = self.update_layers[message_layer_index](h, adj)\n            if self.concat_hidden:\n                g = self.readout_layers[step](h, h0)\n                g_list.append(g)\n\n        if self.concat_hidden:\n            return functions.concat(g_list, axis=1)\n        else:\n            g = self.readout_layers[0](h, h0)\n            return g\n"
  },
  {
    "path": "chainer_chemistry/models/relgcn.py",
    "content": "import chainer\nfrom chainer import functions, cuda  # NOQA\n\nfrom chainer.links import Linear\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.readout.scatter_ggnn_readout import ScatterGGNNReadout  # NOQA\nfrom chainer_chemistry.links import EmbedAtomID, GraphLinear  # NOQA\nfrom chainer_chemistry.links.readout.ggnn_readout import GGNNReadout\nfrom chainer_chemistry.links.update.relgcn_update import RelGCNUpdate, RelGCNSparseUpdate  # NOQA\n\n\ndef rescale_adj(adj):\n    \"\"\"Normalize adjacency matrix\n\n    It ensures that activations are on a similar scale irrespective of\n    the number of neighbors\n\n    Args:\n        adj (:class:`chainer.Variable`, or :class:`numpy.ndarray` \\\n        or :class:`cupy.ndarray`):\n            adjacency matrix\n\n    Returns:\n        :class:`chainer.Variable`: normalized adjacency matrix\n    \"\"\"\n    xp = cuda.get_array_module(adj)\n    num_neighbors = functions.sum(adj, axis=(1, 2))\n    base = xp.ones(num_neighbors.shape, dtype=xp.float32)\n    cond = num_neighbors.data != 0\n    num_neighbors_inv = 1 / functions.where(cond, num_neighbors, base)\n    return adj * functions.broadcast_to(\n        num_neighbors_inv[:, None, None, :], adj.shape)\n\n\nclass RelGCN(chainer.Chain):\n\n    \"\"\"Relational GCN (RelGCN)\n\n    See: Michael Schlichtkrull+, \\\n        Modeling Relational Data with Graph Convolutional Networks. \\\n        March 2017. 
\\\n        `arXiv:1703.06103 <https://arxiv.org/abs/1703.06103>`\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        hidden_channels (None or int or list):\n            dimension of feature vector for each node\n        n_update_layers (int): number of layers\n        n_atom_types (int): number of types of atoms\n        n_edge_types (int): number of edge type.\n            Defaults to 4 for single, double, triple and aromatic bond.\n        scale_adj (bool): If ``True``, then this network normalizes\n            adjacency matrix\n    \"\"\"\n\n    def __init__(self, out_dim=64, hidden_channels=None, n_update_layers=None,\n                 n_atom_types=MAX_ATOMIC_NUM, n_edge_types=4, input_type='int',\n                 scale_adj=False):\n        super(RelGCN, self).__init__()\n        if hidden_channels is None:\n            hidden_channels = [16, 128, 64]\n        elif isinstance(hidden_channels, int):\n            if not isinstance(n_update_layers, int):\n                raise ValueError(\n                    'Must specify n_update_layers when hidden_channels is int')\n            hidden_channels = [hidden_channels] * n_update_layers\n        with self.init_scope():\n            if input_type == 'int':\n                self.embed = EmbedAtomID(out_size=hidden_channels[0],\n                                         in_size=n_atom_types)\n            elif input_type == 'float':\n                self.embed = GraphLinear(None, hidden_channels[0])\n            else:\n                raise ValueError(\"[ERROR] Unexpected value input_type={}\"\n                                 .format(input_type))\n            self.rgcn_convs = chainer.ChainList(*[\n                RelGCNUpdate(hidden_channels[i], hidden_channels[i + 1],\n                             n_edge_types)\n                for i in range(len(hidden_channels) - 1)])\n            self.rgcn_readout = GGNNReadout(\n                out_dim=out_dim, in_channels=hidden_channels[-1],\n       
         nobias=True, activation=functions.tanh)\n        # self.num_relations = num_edge_type\n        self.input_type = input_type\n        self.scale_adj = scale_adj\n\n    def __call__(self, x, adj):\n        \"\"\"main calculation\n\n        Args:\n            x: (batchsize, num_nodes, in_channels)\n            adj: (batchsize, num_edge_type, num_nodes, num_nodes)\n\n        Returns: (batchsize, hidden_channels)\n        \"\"\"\n        if x.dtype == self.xp.int32:\n            assert self.input_type == 'int'\n        else:\n            assert self.input_type == 'float'\n        h = self.embed(x)  # (minibatch, max_num_atoms)\n        if self.scale_adj:\n            adj = rescale_adj(adj)\n        for rgcn_conv in self.rgcn_convs:\n            h = functions.tanh(rgcn_conv(h, adj))\n        h = self.rgcn_readout(h)\n        return h\n\n\nclass RelGCNSparse(chainer.Chain):\n\n    \"\"\"Relational GCN (RelGCN) Sparse Pattern\n\n    See: Michael Schlichtkrull+, \\\n        Modeling Relational Data with Graph Convolutional Networks. \\\n        March 2017. 
\\\n        `arXiv:1703.06103 <https://arxiv.org/abs/1703.06103>`\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        hidden_channels (None or int or list):\n            dimension of feature vector for each node\n        n_update_layers (int): number of layers\n        n_atom_types (int): number of types of atoms\n        n_edge_types (int): number of edge type.\n            Defaults to 4 for single, double, triple and aromatic bond.\n        scale_adj (bool): If ``True``, then this network normalizes\n            adjacency matrix\n    \"\"\"\n\n    def __init__(self, out_dim=64, hidden_channels=None, n_update_layers=None,\n                 n_atom_types=MAX_ATOMIC_NUM, n_edge_types=4, input_type='int',\n                 scale_adj=False):\n        super(RelGCNSparse, self).__init__()\n        if hidden_channels is None:\n            hidden_channels = [16, 128, 64]\n        elif isinstance(hidden_channels, int):\n            if not isinstance(n_update_layers, int):\n                raise ValueError(\n                    'Must specify n_update_layers when hidden_channels is int')\n            hidden_channels = [hidden_channels] * n_update_layers\n        with self.init_scope():\n            if input_type == 'int':\n                self.embed = EmbedAtomID(out_size=hidden_channels[0],\n                                         in_size=n_atom_types)\n            elif input_type == 'float':\n                self.embed = Linear(None, hidden_channels[0])\n            else:\n                raise ValueError(\"[ERROR] Unexpected value input_type={}\"\n                                 .format(input_type))\n            self.rgcn_convs = chainer.ChainList(*[\n                RelGCNSparseUpdate(hidden_channels[i], hidden_channels[i + 1],\n                                   n_edge_types)\n                for i in range(len(hidden_channels) - 1)])\n            self.rgcn_readout = ScatterGGNNReadout(\n                out_dim=out_dim, 
in_channels=hidden_channels[-1],\n                nobias=True, activation=functions.tanh)\n        # self.num_relations = num_edge_type\n        self.input_type = input_type\n        self.scale_adj = scale_adj\n\n    def __call__(self, sparse_batch):\n        \"\"\"main calculation\n\n        Args:\n            x: (batchsize, num_nodes, in_channels)\n            adj: (batchsize, num_edge_type, num_nodes, num_nodes)\n\n        Returns: (batchsize, hidden_channels)\n        \"\"\"\n        if sparse_batch.x.dtype == self.xp.int32:\n            assert self.input_type == 'int'\n        else:\n            assert self.input_type == 'float'\n        h = self.embed(sparse_batch.x)  # (minibatch, max_num_atoms)\n        if self.scale_adj:\n            raise NotImplementedError\n        for rgcn_conv in self.rgcn_convs:\n            h = functions.tanh(rgcn_conv(\n                h, sparse_batch.edge_index, sparse_batch.edge_attr))\n        h = self.rgcn_readout(h, sparse_batch.batch)\n        return h\n"
  },
  {
    "path": "chainer_chemistry/models/rsgcn.py",
    "content": "import chainer\nfrom chainer import functions, Variable  # NOQA\n\nimport chainer_chemistry\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.readout.general_readout import GeneralReadout\nfrom chainer_chemistry.links.update.rsgcn_update import RSGCNUpdate\n\n\nclass RSGCN(chainer.Chain):\n\n    \"\"\"Renormalized Spectral Graph Convolutional Network (RSGCN)\n\n    See: Thomas N. Kipf and Max Welling, \\\n        Semi-Supervised Classification with Graph Convolutional Networks. \\\n        September 2016. \\\n        `arXiv:1609.02907 <https://arxiv.org/abs/1609.02907>`_\n\n    The name of this model \"Renormalized Spectral Graph Convolutional Network\n    (RSGCN)\" is named by us rather than the authors of the paper above.\n    The authors call this model just \"Graph Convolution Network (GCN)\", but\n    we think that \"GCN\" is bit too general and may cause namespace issue.\n    That is why we did not name this model as GCN.\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        hidden_channels (int): dimension of feature vector for each node\n        n_update_layers (int): number of layers\n        n_atom_types (int): number of types of atoms\n        use_batch_norm (bool): If True, batch normalization is applied after\n            graph convolution.\n        readout (Callable): readout function. 
If None,\n            `GeneralReadout(mode='sum)` is used.\n            To the best of our knowledge, the paper of RSGCN model does\n            not give any suggestion on readout.\n        dropout_ratio (float): ratio used in dropout function.\n            If 0 or negative value is set, dropout function is skipped.\n    \"\"\"\n\n    def __init__(self, out_dim, hidden_channels=32, n_update_layers=4,\n                 n_atom_types=MAX_ATOMIC_NUM,\n                 use_batch_norm=False, readout=None, dropout_ratio=0.5):\n        super(RSGCN, self).__init__()\n        in_dims = [hidden_channels for _ in range(n_update_layers)]\n        out_dims = [hidden_channels for _ in range(n_update_layers)]\n        out_dims[n_update_layers - 1] = out_dim\n        if readout is None:\n            readout = GeneralReadout()\n        with self.init_scope():\n            self.embed = chainer_chemistry.links.EmbedAtomID(out_size=hidden_channels, in_size=n_atom_types)\n            self.gconvs = chainer.ChainList(\n                *[RSGCNUpdate(in_dims[i], out_dims[i])\n                  for i in range(n_update_layers)])\n            if use_batch_norm:\n                self.bnorms = chainer.ChainList(\n                    *[chainer_chemistry.links.GraphBatchNormalization(\n                        out_dims[i]) for i in range(n_update_layers)])\n            else:\n                self.bnorms = [None for _ in range(n_update_layers)]\n            if isinstance(readout, chainer.Link):\n                self.readout = readout\n        if not isinstance(readout, chainer.Link):\n            self.readout = readout\n        self.out_dim = out_dim\n        self.hidden_channels = hidden_channels\n        self.n_update_layers = n_update_layers\n        self.dropout_ratio = dropout_ratio\n\n\n    def __call__(self, atom_array, adj, **kwargs):\n        \"\"\"Forward propagation\n\n        Args:\n            atom_array (numpy.ndarray): minibatch of molecular which is\n                represented with 
atom IDs (representing C, O, S, ...)\n                `atom_array[mol_index, atom_index]` represents `mol_index`-th\n                molecule's `atom_index`-th atomic number\n            adj (numpy.ndarray): minibatch of adjancency matrix\n                `adj[mol_index]` represents `mol_index`-th molecule's\n                adjacency matrix\n        Returns:\n            ~chainer.Variable: minibatch of fingerprint\n        \"\"\"\n\n        if atom_array.dtype == self.xp.int32:\n            # atom_array: (minibatch, nodes)\n            h = self.embed(atom_array)\n        else:\n            h = atom_array\n        # h: (minibatch, nodes, ch)\n\n        if isinstance(adj, Variable):\n            w_adj = adj.data\n        else:\n            w_adj = adj\n        w_adj = Variable(w_adj, requires_grad=False)\n\n        # --- RSGCN update ---\n        for i, (gconv, bnorm) in enumerate(zip(self.gconvs,\n                                               self.bnorms)):\n            #print(h.shape)\n\n            h = gconv(h, w_adj)\n            if bnorm is not None:\n                h = bnorm(h)\n            if self.dropout_ratio > 0.:\n                h = functions.dropout(h, ratio=self.dropout_ratio)\n            if i < self.n_update_layers - 1:\n                h = functions.relu(h)\n\n        # --- readout ---\n        y = self.readout(h)\n        return y\n"
  },
  {
    "path": "chainer_chemistry/models/schnet.py",
    "content": "import chainer\nfrom chainer import functions\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links import EmbedAtomID\nfrom chainer_chemistry.links.readout.schnet_readout import SchNetReadout\nfrom chainer_chemistry.links.update.schnet_update import SchNetUpdate\n\n\nclass SchNet(chainer.Chain):\n    \"\"\"SchNet\n\n    See Kristof et al, \\\n        SchNet: A continuous-filter convolutional neural network for modeling\n        quantum interactions. \\\n        `arXiv:1706.08566 <https://arxiv.org/abs/1706.08566>`_\n\n    Args:\n        out_dim (int): dimension of output feature vector\n        hidden_channels (int): dimension of feature vector for each node\n        n_update_layers (int): number of layers\n        readout_hidden_dim (int): dimension of feature vector\n            associated to each molecule\n        n_atom_types (int): number of types of atoms\n        concat_hidden (bool): If set to True, readout is executed in each layer\n            and the result is concatenated\n        num_rbf (int): Number of RDF kernels used in `CFConv`.\n        radius_resolution (float): Resolution of radius.\n            The range (radius_resolution * 1 ~ radius_resolution * num_rbf)\n            are taken inside `CFConv`.\n        gamma (float): exponential factor of `CFConv`'s radius kernel.\n    \"\"\"\n\n    def __init__(self, out_dim=1, hidden_channels=64, n_update_layers=3,\n                 readout_hidden_dim=32, n_atom_types=MAX_ATOMIC_NUM,\n                 concat_hidden=False, num_rbf=300, radius_resolution=0.1,\n                 gamma=10.0):\n        super(SchNet, self).__init__()\n        with self.init_scope():\n            self.embed = EmbedAtomID(out_size=hidden_channels,\n                                     in_size=n_atom_types)\n            self.update_layers = chainer.ChainList(\n                *[SchNetUpdate(\n                    hidden_channels,\n                    num_rbf=num_rbf, 
radius_resolution=radius_resolution,\n                    gamma=gamma) for _ in range(n_update_layers)])\n            self.readout_layer = SchNetReadout(\n                out_dim, in_channels=None, hidden_channels=readout_hidden_dim)\n        self.out_dim = out_dim\n        self.hidden_channels = hidden_channels\n        self.readout_hidden_dim = readout_hidden_dim\n        self.n_update_layers = n_update_layers\n        self.concat_hidden = concat_hidden\n\n    def __call__(self, atom_features, dist_features):\n        x = self.embed(atom_features)\n        h = []\n        # --- update part ---\n        for i in range(self.n_update_layers):\n            x = self.update_layers[i](x, dist_features)\n            if self.concat_hidden:\n                h.append(x)\n        # --- readout part ---\n        if self.concat_hidden:\n            x = functions.concat(h, axis=2)\n        x = self.readout_layer(x)\n        return x\n"
  },
  {
    "path": "chainer_chemistry/models/weavenet.py",
    "content": "import chainer\nfrom chainer import functions\nfrom chainer import links\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.config import WEAVE_DEFAULT_NUM_MAX_ATOMS\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.readout.general_readout import GeneralReadout\n\n\nWEAVENET_DEFAULT_WEAVE_CHANNELS = [50, ]\n\n\nclass LinearLayer(chainer.Chain):\n\n    def __init__(self, n_channel, n_layer):\n        super(LinearLayer, self).__init__()\n        with self.init_scope():\n            self.layers = chainer.ChainList(\n                *[links.Linear(None, n_channel) for _ in range(n_layer)]\n            )\n        self.n_output_channel = n_channel\n\n    def forward(self, x):\n        n_batch, n_atom, n_channel = x.shape\n        x = functions.reshape(x, (n_batch * n_atom, n_channel))\n        for l in self.layers:\n            x = l(x)\n            x = functions.relu(x)\n        x = functions.reshape(x, (n_batch, n_atom, self.n_output_channel))\n        return x\n\n\nclass AtomToPair(chainer.Chain):\n    def __init__(self, n_channel, n_layer, n_atom):\n        super(AtomToPair, self).__init__()\n        with self.init_scope():\n            self.linear_layers = chainer.ChainList(\n                *[links.Linear(None, n_channel) for _ in range(n_layer)]\n            )\n        self.n_atom = n_atom\n        self.n_channel = n_channel\n\n    def forward(self, x):\n        n_batch, n_atom, n_feature = x.shape\n        atom_repeat = functions.reshape(x, (n_batch, 1, n_atom, n_feature))\n        atom_repeat = functions.broadcast_to(\n            atom_repeat, (n_batch, n_atom, n_atom, n_feature))\n        atom_repeat = functions.reshape(atom_repeat,\n                                        (n_batch, n_atom * n_atom, n_feature))\n\n        atom_tile = functions.reshape(x, (n_batch, n_atom, 1, n_feature))\n        atom_tile = functions.broadcast_to(\n            atom_tile, (n_batch, 
n_atom, n_atom, n_feature))\n        atom_tile = functions.reshape(atom_tile,\n                                      (n_batch, n_atom * n_atom, n_feature))\n\n        pair_x0 = functions.concat((atom_tile, atom_repeat), axis=2)\n        pair_x0 = functions.reshape(pair_x0,\n                                    (n_batch * n_atom * n_atom, n_feature * 2))\n        for l in self.linear_layers:\n            pair_x0 = l(pair_x0)\n            pair_x0 = functions.relu(pair_x0)\n        pair_x0 = functions.reshape(pair_x0,\n                                    (n_batch, n_atom * n_atom, self.n_channel))\n\n        pair_x1 = functions.concat((atom_repeat, atom_tile), axis=2)\n        pair_x1 = functions.reshape(pair_x1,\n                                    (n_batch * n_atom * n_atom, n_feature * 2))\n        for l in self.linear_layers:\n            pair_x1 = l(pair_x1)\n            pair_x1 = functions.relu(pair_x1)\n        pair_x1 = functions.reshape(pair_x1,\n                                    (n_batch, n_atom * n_atom, self.n_channel))\n        return pair_x0 + pair_x1\n\n\nclass PairToAtom(chainer.Chain):\n    def __init__(self, n_channel, n_layer, n_atom, mode='sum'):\n        super(PairToAtom, self).__init__()\n        with self.init_scope():\n            self.linearLayer = chainer.ChainList(\n                *[links.Linear(None, n_channel) for _ in range(n_layer)]\n            )\n            self.readout = GeneralReadout(mode=mode)\n        self.n_atom = n_atom\n        self.n_channel = n_channel\n        self.mode = mode\n\n    def forward(self, x):\n        n_batch, n_pair, n_feature = x.shape\n        a = functions.reshape(\n            x, (n_batch * (self.n_atom * self.n_atom), n_feature))\n        for l in self.linearLayer:\n            a = l(a)\n            a = functions.relu(a)\n        a = functions.reshape(a, (n_batch, self.n_atom, self.n_atom,\n                                  self.n_channel))\n        a = self.readout(a, axis=2)\n        return 
a\n\n\nclass WeaveModule(chainer.Chain):\n\n    def __init__(self, n_atom, output_channel, n_sub_layer,\n                 readout_mode='sum'):\n        super(WeaveModule, self).__init__()\n        with self.init_scope():\n            self.atom_layer = LinearLayer(output_channel, n_sub_layer)\n            self.pair_layer = LinearLayer(output_channel, n_sub_layer)\n            self.atom_to_atom = LinearLayer(output_channel, n_sub_layer)\n            self.pair_to_pair = LinearLayer(output_channel, n_sub_layer)\n            self.atom_to_pair = AtomToPair(output_channel, n_sub_layer, n_atom)\n            self.pair_to_atom = PairToAtom(output_channel, n_sub_layer, n_atom,\n                                           mode=readout_mode)\n        self.n_atom = n_atom\n        self.n_channel = output_channel\n        self.readout_mode = readout_mode\n\n    def forward(self, atom_x, pair_x, atom_only=False):\n        a0 = self.atom_to_atom.forward(atom_x)\n        a1 = self.pair_to_atom.forward(pair_x)\n        a = functions.concat([a0, a1], axis=2)\n        next_atom = self.atom_layer.forward(a)\n        next_atom = functions.relu(next_atom)\n        if atom_only:\n            return next_atom\n\n        p0 = self.atom_to_pair.forward(atom_x)\n        p1 = self.pair_to_pair.forward(pair_x)\n        p = functions.concat([p0, p1], axis=2)\n        next_pair = self.pair_layer.forward(p)\n        next_pair = functions.relu(next_pair)\n        return next_atom, next_pair\n\n\nclass WeaveNet(chainer.Chain):\n    \"\"\"WeaveNet implementation\n\n    Args:\n        weave_channels (list): list of int, output dimension for each weave\n            module\n        hidden_dim (int): hidden dim\n        n_atom (int): number of atom of input array\n        n_sub_layer (int): number of layer for each `AtomToPair`, `PairToAtom`\n            layer\n        n_atom_types (int): number of atom id\n        readout_mode (str): 'sum' or 'max' or 'summax'\n    \"\"\"\n\n    def __init__(self, 
weave_channels=None, hidden_dim=16,\n                 n_atom=WEAVE_DEFAULT_NUM_MAX_ATOMS,\n                 n_sub_layer=1, n_atom_types=MAX_ATOMIC_NUM,\n                 readout_mode='sum'):\n        weave_channels = weave_channels or WEAVENET_DEFAULT_WEAVE_CHANNELS\n        weave_module = [\n            WeaveModule(n_atom, c, n_sub_layer, readout_mode=readout_mode)\n            for c in weave_channels\n        ]\n\n        super(WeaveNet, self).__init__()\n        with self.init_scope():\n            self.embed = EmbedAtomID(out_size=hidden_dim, in_size=n_atom_types)\n            self.weave_module = chainer.ChainList(*weave_module)\n            self.readout = GeneralReadout(mode=readout_mode)\n        self.readout_mode = readout_mode\n\n    def __call__(self, atom_x, pair_x, train=True):\n        if atom_x.dtype == self.xp.int32:\n            # atom_array: (minibatch, atom)\n            atom_x = self.embed(atom_x)\n\n        for i in range(len(self.weave_module)):\n            if i == len(self.weave_module) - 1:\n                # last layer, only `atom_x` is needed.\n                atom_x = self.weave_module[i].forward(atom_x, pair_x,\n                                                      atom_only=True)\n            else:\n                # not last layer, both `atom_x` and `pair_x` are needed\n                atom_x, pair_x = self.weave_module[i].forward(atom_x, pair_x)\n        x = self.readout(atom_x, axis=1)\n        return x\n"
  },
  {
    "path": "chainer_chemistry/saliency/__init__.py",
    "content": "from chainer_chemistry.saliency import calculator  # NOQA\nfrom chainer_chemistry.saliency import visualizer  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/saliency/calculator/__init__.py",
    "content": "from chainer_chemistry.saliency.calculator import base_calculator  # NOQA\nfrom chainer_chemistry.saliency.calculator import calculator_utils  # NOQA\nfrom chainer_chemistry.saliency.calculator import gradient_calculator  # NOQA\nfrom chainer_chemistry.saliency.calculator import integrated_gradients_calculator  # NOQA\nfrom chainer_chemistry.saliency.calculator import occlusion_calculator  # NOQA\n\nfrom chainer_chemistry.saliency.calculator.base_calculator import BaseCalculator  # NOQA\nfrom chainer_chemistry.saliency.calculator.gradient_calculator import GradientCalculator  # NOQA\nfrom chainer_chemistry.saliency.calculator.integrated_gradients_calculator import IntegratedGradientsCalculator  # NOQA\nfrom chainer_chemistry.saliency.calculator.occlusion_calculator import OcclusionCalculator  # NOQA\n\nfrom chainer_chemistry.saliency.calculator.calculator_utils import GaussianNoiseSampler  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/saliency/calculator/base_calculator.py",
    "content": "from logging import getLogger\n\nimport numpy\n\nimport chainer\nfrom chainer import cuda\nfrom chainer.dataset.convert import concat_examples, _concat_arrays_with_padding  # NOQA\nfrom chainer.iterators import SerialIterator\n\nfrom chainer_chemistry.link_hooks import is_link_hooks_available\nfrom tqdm import tqdm\n\nif is_link_hooks_available:\n    from chainer import LinkHook\n    from chainer_chemistry.link_hooks import VariableMonitorLinkHook\n\n_sampling_axis = 0\n\n\ndef _to_tuple(x):\n    if not isinstance(x, tuple):\n        x = (x,)\n    return x\n\n\ndef _to_variable(x):\n    if not isinstance(x, chainer.Variable):\n        x = chainer.Variable(x)\n    return x\n\n\ndef _extract_numpy(x):\n    if isinstance(x, chainer.Variable):\n        x = x.data\n    return cuda.to_cpu(x)\n\n\ndef _concat(batch_list):\n    try:\n        return numpy.concatenate(batch_list)\n    except Exception as e:  # NOQA\n        # Thre is a case that each input has different shape,\n        # we cannot concatenate into array in this case.\n\n        elem_list = [elem for batch in batch_list for elem in batch]\n        return _concat_arrays_with_padding(elem_list, padding=0)\n\n\ndef add_linkhook(linkhook, prefix='', logger=None):\n    link_hooks = chainer._get_link_hooks()\n    name = prefix + linkhook.name\n    if name in link_hooks:\n        logger = logger or getLogger(__name__)\n        logger.warning('hook {} already exists, overwrite.'.format(name))\n        pass  # skip this case...\n        # raise KeyError('hook %s already exists' % name)\n    link_hooks[name] = linkhook\n    linkhook.added(None)\n    return linkhook\n\n\ndef delete_linkhook(linkhook, prefix='', logger=None):\n    name = prefix + linkhook.name\n    link_hooks = chainer._get_link_hooks()\n    if name not in link_hooks.keys():\n        logger = logger or getLogger(__name__)\n        logger.warning('linkhook {} is not registered'.format(name))\n        return\n    
link_hooks[name].deleted(None)\n    del link_hooks[name]\n\n\nclass BaseCalculator(object):\n\n    \"\"\"Base class for saliency calculator\n\n    Use `compute`, `aggregate` method to calculate saliency.\n    This base class supports to calculate SmoothGrad[1] and BayesGrad[2] of\n    concrete subclass.\n\n    See: Daniel Smilkov, Nikhil Thorat, Been Kim, Fernanda Viegas, and Martin\n        Wattenberg. SmoothGrad: removing noise by adding noise.\n        `arXiv:1706.03825 <https://arxiv.org/abs/1706.03825>`_\n\n    See: Akita, Hirotaka and Nakago, Kosuke and Komatsu, Tomoki and Sugawara,\n        Yohei and Maeda, Shin-ichi and Baba, Yukino and Kashima, Hisashi\n        BayesGrad: Explaining Predictions of Graph Convolutional Networks\n        `arXiv:1807.01985 <https://arxiv.org/abs/1807.01985>`_\n\n    Args:\n        model (chainer.Chain): target model to calculate saliency.\n        target_extractor (VariableMonitorLinkHook or None):\n            It determines `target_var`, target variable to calculate saliency.\n            If `None`, first argument of input to the model is treated as\n            `target_var`.\n        output_extractor (VariableMonitorLinkHook or None):\n            It determines `output_var`, output variable to calculate saliency.\n            If `None`, output of the model is treated as `output_var`.\n        device (int or None): device id to calculate saliency.\n            If `None`, device id is inferred automatically from `model`.\n        logger:\n    \"\"\"\n\n    def __init__(self, model, target_extractor=None, output_extractor=None,\n                 device=None, logger=None):\n        self.model = model  # type: chainer.Chain\n        if device is not None:\n            self._device = device\n        else:\n            self._device = cuda.get_device_from_array(*model.params()).id\n        self.target_extractor = target_extractor\n        self.output_extractor = output_extractor\n        self.logger = logger or 
getLogger(__name__)\n\n    def compute(self, data, M=1, batchsize=16,\n                converter=concat_examples, retain_inputs=False,\n                preprocess_fn=None, postprocess_fn=None, train=False,\n                noise_sampler=None, show_progress=True):\n        \"\"\"computes saliency_samples\n\n        Args:\n            data: dataset to calculate saliency\n            M (int): sampling size. `M > 1` may be set with SmoothGrad or\n                BayesGrad configuration. See `train` and `noise_sampler`\n                description.\n            batchsize (int): batch size\n            converter (function): converter to make batch from `data`\n            retain_inputs (bool): retain input flag\n            preprocess_fn (function or None): preprocess function\n            postprocess_fn (function or None): postprocess function\n            train (bool): chainer.config.train flag. When the `model` contains\n                `dropout` (or other stochastic) function, `train=True`\n                 corresponds to calculate BayesGrad.\n            noise_sampler: noise sampler class with `sample` method.\n                If this is set, noise is added to `target_var`. 
It can be\n                used to calculate SmoothGrad.\n                If `None`, noise is not sampled.\n            show_progress (bool): Show progress bar or not.\n\n        Returns:\n            saliency_samples (numpy.ndarray): M samples of saliency array.\n                Its shape is (M,) + target_var.shape, i.e., sampling axis is\n                added to the first axis.\n        \"\"\"\n        saliency_list = []\n        for _ in tqdm(range(M), disable=not show_progress):\n            with chainer.using_config('train', train):\n                saliency = self._forward(\n                    data, batchsize=batchsize,\n                    converter=converter,\n                    retain_inputs=retain_inputs, preprocess_fn=preprocess_fn,\n                    postprocess_fn=postprocess_fn, noise_sampler=noise_sampler)\n            saliency_array = cuda.to_cpu(saliency)\n            saliency_list.append(saliency_array)\n        return numpy.stack(saliency_list, axis=_sampling_axis)\n\n    def aggregate(self, saliency_arrays, method='raw', ch_axis=None):\n        \"\"\"Aggregate saliency samples into one saliency score.\n\n        Args:\n            saliency_arrays (numpy.ndarray): M samples of saliency array\n                calculated by `compute` method.\n            method (str): It supports following methods for aggregation.\n                raw: simply take mean of samples.\n                absolute: calc absolute mean of samples.\n                square: calc squared mean of samples.\n            ch_axis (int, tuple or None): channel axis. 
The ch_axis is\n                considered as reduced axis for saliency calculation.\n\n        Returns:\n            saliency (numpy.ndarray): saliency score\n        \"\"\"\n        if method == 'raw':\n            h = saliency_arrays  # do nothing\n        elif method == 'abs':\n            h = numpy.abs(saliency_arrays)\n        elif method == 'square':\n            h = saliency_arrays ** 2\n        else:\n            raise ValueError(\"[ERROR] Unexpected value method={}\"\n                             .format(method))\n\n        if ch_axis is not None:\n            h = numpy.sum(h, axis=ch_axis)\n        sampling_axis = _sampling_axis\n        return numpy.mean(h, axis=sampling_axis)\n\n    def _compute_core(self, *inputs):\n        \"\"\"Core computation routine\n\n        Each concrete subclass should implement this method\n        \"\"\"\n        raise NotImplementedError\n\n    def get_target_var(self, inputs):\n        if isinstance(self.target_extractor, VariableMonitorLinkHook):\n            target_var = self.target_extractor.get_variable()\n        else:\n            if isinstance(inputs, tuple):\n                target_var = inputs[0]\n            else:\n                target_var = inputs\n\n        if target_var is None:\n            self.logger.warning(\n                'target_var is None. This may be caused because \"model\" is not'\n                ' forwarded in advance or \"model\" does not implement \"forward\"'\n                ' method and LinkHook is not triggered.')\n        return target_var\n\n    def get_output_var(self, outputs):\n        if isinstance(self.output_extractor, VariableMonitorLinkHook):\n            output_var = self.output_extractor.get_variable()\n        else:\n            output_var = outputs\n        if output_var is None:\n            self.logger.warning(\n                'output_var is None. 
This may be caused because \"model\" is not'\n                ' forwarded in advance or \"model\" does not implement \"forward\"'\n                ' method and LinkHook is not triggered.')\n        return output_var\n\n    def _forward(self, data, batchsize=16,\n                 converter=concat_examples, retain_inputs=False,\n                 preprocess_fn=None, postprocess_fn=None, noise_sampler=None):\n        \"\"\"Forward data by iterating with batch\n\n        Args:\n            data: \"train_x array\" or \"chainer dataset\"\n            batchsize (int): batch size\n            converter (Callable): convert from `data` to `inputs`\n            retain_inputs (bool): If True, this instance keeps inputs in\n                `self.inputs` or not.\n            preprocess_fn (Callable): Its input is numpy.ndarray or\n                cupy.ndarray, it can return either Variable, cupy.ndarray or\n                numpy.ndarray\n            postprocess_fn (Callable): Its input argument is Variable,\n                but this method may return either Variable, cupy.ndarray or\n                numpy.ndarray.\n\n        Returns (tuple or numpy.ndarray): forward result\n        \"\"\"\n        input_list = None\n        output_list = None\n        it = SerialIterator(data, batch_size=batchsize, repeat=False,\n                            shuffle=False)\n        if isinstance(self.target_extractor, LinkHook):\n            add_linkhook(self.target_extractor, prefix='/saliency/target/',\n                         logger=self.logger)\n        if isinstance(self.output_extractor, LinkHook):\n            add_linkhook(self.output_extractor, prefix='/saliency/output/',\n                         logger=self.logger)\n\n        for batch in it:\n            inputs = converter(batch, self._device)\n            inputs = _to_tuple(inputs)\n\n            if preprocess_fn:\n                inputs = preprocess_fn(*inputs)\n                inputs = _to_tuple(inputs)\n\n            inputs = 
[_to_variable(x) for x in inputs]\n\n            # --- Main saliency computation ----\n            if noise_sampler is None:\n                # VanillaGrad computation\n                outputs = self._compute_core(*inputs)\n            else:\n                # SmoothGrad computation\n                if self.target_extractor is None:\n                    # inputs[0] is considered as \"target_var\"\n                    noise = noise_sampler.sample(inputs[0].array)\n                    inputs[0].array += noise\n                    outputs = self._compute_core(*inputs)\n                else:\n                    # Add process to LinkHook\n                    def add_noise(hook, args, target_var):\n                        noise = noise_sampler.sample(target_var.array)\n                        target_var.array += noise\n                    self.target_extractor.add_process('/saliency/add_noise',\n                                                      add_noise)\n                    outputs = self._compute_core(*inputs)\n                    self.target_extractor.delete_process('/saliency/add_noise')\n            # --- Main saliency computation end ---\n\n            # Init\n            if retain_inputs:\n                if input_list is None:\n                    input_list = [[] for _ in range(len(inputs))]\n                for j, input in enumerate(inputs):\n                    input_list[j].append(cuda.to_cpu(input))\n\n            if output_list is None:\n                output_list = [[] for _ in range(len(outputs))]\n\n            if postprocess_fn:\n                outputs = postprocess_fn(*outputs)\n                outputs = _to_tuple(outputs)\n            for j, output in enumerate(outputs):\n                output_list[j].append(_extract_numpy(output))\n\n        if isinstance(self.target_extractor, LinkHook):\n            delete_linkhook(self.target_extractor, prefix='/saliency/target/',\n                            logger=self.logger)\n        if 
isinstance(self.output_extractor, LinkHook):\n            delete_linkhook(self.output_extractor, prefix='/saliency/output/',\n                            logger=self.logger)\n\n        if retain_inputs:\n            self.inputs = [numpy.concatenate(\n                in_array) for in_array in input_list]\n\n        result = [_concat(output) for output in output_list]\n        if len(result) == 1:\n            return result[0]\n        else:\n            self.logger.error('return multiple result handling is not '\n                              'implemented yet and not supported.')\n            return result\n"
  },
  {
    "path": "chainer_chemistry/saliency/calculator/calculator_utils.py",
    "content": "from chainer import cuda\n\n\nclass GaussianNoiseSampler(object):\n    \"\"\"Default noise sampler class to calculate SmoothGrad\"\"\"\n\n    def __init__(self, mode='relative', scale=0.15):\n        self.mode = mode\n        self.scale = scale\n\n    def sample(self, target_array):\n        xp = cuda.get_array_module(target_array)\n        noise = xp.random.normal(\n            0, self.scale, target_array.shape)\n        if self.mode == 'absolute':\n            # `scale` is used as is\n            pass\n        elif self.mode == 'relative':\n            # `scale_axis` is used to calculate `max` and `min` of target_array\n            # As default, all axes except batch axis are used.\n            scale_axis = tuple(range(1, target_array.ndim))\n            vmax = xp.max(target_array, axis=scale_axis, keepdims=True)\n            vmin = xp.min(target_array, axis=scale_axis, keepdims=True)\n            noise = noise * (vmax - vmin)\n        else:\n            raise ValueError(\"[ERROR] Unexpected value mode={}\"\n                             .format(self.mode))\n        return noise\n"
  },
  {
    "path": "chainer_chemistry/saliency/calculator/gradient_calculator.py",
    "content": "import chainer  # NOQA\nfrom chainer import functions\nfrom chainer_chemistry.saliency.calculator.base_calculator import BaseCalculator  # NOQA\n\n\nclass GradientCalculator(BaseCalculator):\n\n    \"\"\"Gradient saliency calculator\n\n    Use `compute`, `aggregate` method to calculate saliency.\n\n    See: Dumitru Erhan, Yoshua Bengio, Aaron Courville, Pascal Vincent (2009).\n        Visualizing Higher-Layer Features of a Deep Network.\n\n    See: Karen Simonyan, Andrea Vedaldi, and Andrew Zisserman.\n        Deep inside convolutional networks: Visualising image classication\n        models and saliency maps.\n        `arXiv:1312.6034 <https://arxiv.org/abs/1312.6034>`_\n\n    Args:\n        model (chainer.Chain): target model to calculate saliency.\n        target_extractor (VariableMonitorLinkHook or None):\n            It determines `target_var`, target variable to calculate saliency.\n            If `None`, first argument of input to the model is treated as\n            `target_var`.\n        output_extractor (VariableMonitorLinkHook or None):\n            It determines `output_var`, output variable to calculate saliency.\n            If `None`, output of the model is treated as `output_var`.\n        eval_fun (callable): If\n        multiply_target (bool):\n            If `False`, return value is `target_var.grad`.\n            If `True`,  return value is `target_var.grad * target_var`.\n        device (int or None): device id to calculate saliency.\n            If `None`, device id is inferred automatically from `model`.\n    \"\"\"\n\n    def __init__(self, model, target_extractor=None, output_extractor=None,\n                 eval_fun=None, multiply_target=False, device=None):\n        super(GradientCalculator, self).__init__(\n            model, target_extractor=target_extractor,\n            output_extractor=output_extractor, device=device)\n        self.eval_fun = eval_fun or model.__call__\n        self.multiply_target = 
multiply_target\n\n    def _compute_core(self, *inputs):\n        self.model.cleargrads()\n        outputs = self.eval_fun(*inputs)\n        target_var = self.get_target_var(inputs)\n        target_var.grad = None  # Need to reset grad beforehand of backward.\n        output_var = self.get_output_var(outputs)\n\n        # --- type check for output_var ---\n        if output_var.size != 1:\n            self.logger.warning(\n                'output_var.size is not 1, calculate scalar value. '\n                'functions.sum is applied.')\n            output_var = functions.sum(output_var)\n\n        output_var.backward(retain_grad=True)\n        saliency = target_var.grad\n        if self.multiply_target:\n            saliency *= target_var.data\n        outputs = (saliency,)\n        return outputs\n"
  },
  {
    "path": "chainer_chemistry/saliency/calculator/integrated_gradients_calculator.py",
    "content": "import numpy\n\nfrom chainer_chemistry.saliency.calculator.gradient_calculator import GradientCalculator  # NOQA\n\n\nclass IntegratedGradientsCalculator(GradientCalculator):\n\n    \"\"\"Integrated gradient saliency calculator\n\n    Use `compute`, `aggregate` method to calculate saliency.\n\n    See: Mukund Sundararajan, Ankur Taly, and Qiqi Yan (2017).\n        Axiomatic attribution for deep networks. PMLR.\n        URL http://proceedings.mlr.press/v70/sundararajan17a.html.\n\n    Args:\n        model (chainer.Chain): target model to calculate saliency.\n        target_extractor (VariableMonitorLinkHook or None):\n            It determines `target_var`, target variable to calculate saliency.\n            If `None`, first argument of input to the model is treated as\n            `target_var`.\n        output_extractor (VariableMonitorLinkHook or None):\n            It determines `output_var`, output variable to calculate saliency.\n            If `None`, output of the model is treated as `output_var`.\n        eval_fun (callable): If\n        baseline (numpy.ndarray or None):\n            If `None`, baseline is set as 0.\n        steps (int): Number of separation to calculate integrated gradient.\n        device (int or None): device id to calculate saliency.\n            If `None`, device id is inferred automatically from `model`.\n    \"\"\"\n    def __init__(self, model, target_extractor=None, output_extractor=None,\n                 eval_fun=None, baseline=None, steps=25, device=None):\n\n        super(IntegratedGradientsCalculator, self).__init__(\n            model, target_extractor=target_extractor,\n            output_extractor=output_extractor, multiply_target=False,\n            eval_fun=eval_fun, device=device)\n        self.baseline = baseline or 0.\n        self.steps = steps\n\n    def _compute_core(self, *inputs):\n\n        total_grads = 0.\n        self.model.cleargrads()\n        # Need to forward once to get target_var\n        
outputs = self.eval_fun(*inputs)  # NOQA\n        target_var = self.get_target_var(inputs)\n        # output_var = self.get_output_var(outputs)\n\n        base = self.baseline\n        diff = target_var.array - base\n\n        for alpha in numpy.linspace(0., 1., self.steps):\n            if self.target_extractor is None:\n                interpolated_inputs = base + alpha * diff\n                inputs[0].array = interpolated_inputs\n                total_grads += super(\n                    IntegratedGradientsCalculator, self)._compute_core(\n                    *inputs)[0]\n            else:\n                def interpolate_target_var(hook, args, _target_var):\n                    interpolated_inputs = base + alpha * diff\n                    _target_var.array[:] = interpolated_inputs\n\n                self.target_extractor.add_process(\n                    '/saliency/interpolate_target_var', interpolate_target_var)\n                total_grads += super(\n                    IntegratedGradientsCalculator, self)._compute_core(\n                    *inputs)[0]\n                self.target_extractor.delete_process(\n                    '/saliency/interpolate_target_var')\n        saliency = total_grads * diff / self.steps\n        return saliency,\n"
  },
  {
    "path": "chainer_chemistry/saliency/calculator/occlusion_calculator.py",
    "content": "import itertools\nimport six\n\nimport chainer\nfrom chainer import cuda\n\nfrom chainer_chemistry.saliency.calculator.base_calculator import BaseCalculator  # NOQA\n\n\ndef _to_tuple(x):\n    if isinstance(x, int):\n        x = (x,)\n    elif isinstance(x, (list, tuple)):\n        x = tuple(x)\n    else:\n        raise TypeError('Unexpected type {}'.format(type(x)))\n    return x\n\n\nclass OcclusionCalculator(BaseCalculator):\n\n    \"\"\"Occlusion saliency calculator\n\n    Use `compute`, `aggregate` method to calculate saliency.\n\n    See: Matthew D Zeiler and Rob Fergus (2014).\n        Visualizing and understanding convolutional networks.\n        In European conference on computer vision, pp. 818-833. Springer.\n\n    Args:\n        model (chainer.Chain): target model to calculate saliency.\n        target_extractor (VariableMonitorLinkHook or None):\n            It determines `target_var`, target variable to calculate saliency.\n            If `None`, first argument of input to the model is treated as\n            `target_var`.\n        output_extractor (VariableMonitorLinkHook or None):\n            It determines `output_var`, output variable to calculate saliency.\n            If `None`, output of the model is treated as `output_var`.\n        eval_fun (callable): If\n        enable_backprop (bool): chainer.config.enable_backprop option.\n        size (int or tuple): occlusion window size.\n            If `int`, window has same size along `slide_axis`.\n            If `tuple`, its length must be same with `slide_axis`.\n        slide_axis (int or tuple): slide axis which occlusion window moves.\n        device (int or None): device id to calculate saliency.\n            If `None`, device id is inferred automatically from `model`.\n    \"\"\"\n    def __init__(self, model, target_extractor=None, output_extractor=None,\n                 eval_fun=None, device=None,\n                 enable_backprop=False, size=1, slide_axis=(2, 3)):\n        
super(OcclusionCalculator, self).__init__(\n            model, target_extractor=target_extractor,\n            output_extractor=output_extractor, device=device)\n\n        self.eval_fun = eval_fun or model.__call__\n        self.enable_backprop = enable_backprop\n        self.slide_axis = _to_tuple(slide_axis)\n        size = _to_tuple(size)\n        if len(self.slide_axis) != len(size):\n            size = size * len(self.slide_axis)\n        self.size = size\n\n    def _compute_core(self, *inputs):\n        # Usually, backward() is not necessary for calculating occlusion\n        with chainer.using_config('enable_backprop', self.enable_backprop):\n            original_result = self.eval_fun(*inputs)\n        target_var = self.get_target_var(inputs)\n        original_target_array = target_var.array.copy()\n        original_score = self.get_output_var(original_result)\n\n        xp = cuda.get_array_module(target_var.array)\n        value = 0.\n\n        # fill with `value`\n        target_dim = target_var.ndim\n        batch_size = target_var.shape[0]\n        occlusion_window_shape = [1] * target_dim\n        occlusion_window_shape[0] = batch_size\n        for axis, size in zip(self.slide_axis, self.size):\n            occlusion_window_shape[axis] = size\n        occlusion_scores_shape = [1] * target_dim\n        occlusion_scores_shape[0] = batch_size\n        for axis, size in zip(self.slide_axis, self.size):\n            occlusion_scores_shape[axis] = target_var.shape[axis]\n        occlusion_window = xp.ones(occlusion_window_shape,\n                                   dtype=target_var.dtype) * value\n        occlusion_scores = xp.zeros(occlusion_scores_shape, dtype=xp.float32)\n\n        def _extract_index(slide_axis, size, start_indices):\n            colon = slice(None)\n            index = [colon] * target_dim\n            for axis, size, start in zip(slide_axis, size, start_indices):\n                index[axis] = slice(start, start + size, 1)\n            return 
tuple(index)\n\n        end_list = [target_var.data.shape[axis] - size + 1 for axis, size\n                    in zip(self.slide_axis, self.size)]\n\n        for start in itertools.product(*[six.moves.range(end)\n                                         for end in end_list]):\n            occlude_index = _extract_index(self.slide_axis, self.size, start)\n\n            if self.target_extractor is None:\n                inputs[0].array = original_target_array.copy()\n                inputs[0].array[occlude_index] = occlusion_window\n                with chainer.using_config('enable_backprop',\n                                          self.enable_backprop):\n                    occluded_result = self.eval_fun(*inputs)\n            else:\n                def mask_target_var(hook, args, _target_var):\n                    _target_var.array = original_target_array.copy()\n                    _target_var.array[occlude_index] = occlusion_window\n\n                self.target_extractor.add_process(\n                    '/saliency/mask_target_var', mask_target_var)\n                with chainer.using_config('enable_backprop',\n                                          self.enable_backprop):\n                    occluded_result = self.eval_fun(*inputs)\n                self.target_extractor.delete_process(\n                    '/saliency/mask_target_var')\n\n            occluded_score = self.get_output_var(occluded_result)\n            score_diff_var = original_score - occluded_score  # (bs, 1)\n            # expand_dim for ch_axis\n            score_diff = xp.reshape(score_diff_var.array,\n                                    occlusion_window_shape)\n            occlusion_scores[occlude_index] += score_diff\n        outputs = (occlusion_scores,)\n        return outputs\n"
  },
  {
    "path": "chainer_chemistry/saliency/visualizer/__init__.py",
    "content": "from chainer_chemistry.saliency.visualizer import base_visualizer  # NOQA\nfrom chainer_chemistry.saliency.visualizer import image_visualizer  # NOQA\nfrom chainer_chemistry.saliency.visualizer import mol_visualizer  # NOQA\nfrom chainer_chemistry.saliency.visualizer import table_visualizer  # NOQA\nfrom chainer_chemistry.saliency.visualizer import visualizer_utils  # NOQA\n\nfrom chainer_chemistry.saliency.visualizer.base_visualizer import BaseVisualizer  # NOQA\nfrom chainer_chemistry.saliency.visualizer.image_visualizer import ImageVisualizer  # NOQA\nfrom chainer_chemistry.saliency.visualizer.mol_visualizer import MolVisualizer  # NOQA\nfrom chainer_chemistry.saliency.visualizer.mol_visualizer import SmilesVisualizer  # NOQA\nfrom chainer_chemistry.saliency.visualizer.table_visualizer import TableVisualizer  # NOQA\n\nfrom chainer_chemistry.saliency.visualizer.visualizer_utils import abs_max_scaler  # NOQA\nfrom chainer_chemistry.saliency.visualizer.visualizer_utils import min_max_scaler  # NOQA\nfrom chainer_chemistry.saliency.visualizer.visualizer_utils import normalize_scaler  # NOQA\nfrom chainer_chemistry.saliency.visualizer.visualizer_utils import red_blue_cmap  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/saliency/visualizer/base_visualizer.py",
    "content": "class BaseVisualizer(object):\n\n    \"\"\"Base saliency visualizer\"\"\"\n\n    def visualize(self, *args, **kwargs):\n        \"\"\"Main visualization routine\n\n        Each concrete subclass should implement this method\n        \"\"\"\n        raise NotImplementedError\n"
  },
  {
    "path": "chainer_chemistry/saliency/visualizer/image_visualizer.py",
    "content": "from logging import getLogger\n\nimport matplotlib.cm as cm\nimport matplotlib.pyplot as plt\nimport numpy\n\nfrom chainer import cuda\n\nfrom chainer_chemistry.saliency.visualizer.base_visualizer import BaseVisualizer  # NOQA\nfrom chainer_chemistry.saliency.visualizer.visualizer_utils import abs_max_scaler  # NOQA\n\n\nclass ImageVisualizer(BaseVisualizer):\n\n    \"\"\"Saliency visualizer for image data\n\n    Args:\n        logger:\n    \"\"\"\n\n    def __init__(self, logger=None):\n        self.logger = logger or getLogger(__name__)\n\n    def visualize(self, saliency, image=None, save_filepath=None,\n                  scaler=abs_max_scaler, title='Image saliency map',\n                  cmap=cm.jet, alpha=0.5, show_colorbar=False,\n                  bbox_inches='tight'):\n        \"\"\"Visualize or save `saliency` of image.\n\n        Args:\n            saliency (numpy.ndarray): Saliency array. Must be either\n                2-dim (h, w) or 3-dim (ch, h, w).\n            image (numpy.ndarray or PIL.Image or None): If set, image is drawn\n                in background, and saliency is shown in foreground.\n                If numpy array, must be in the order of 2-dim (h, w) or\n                3-dim (ch, h, w).\n            save_filepath (str or None): If specified, file is saved to path.\n            scaler (callable): function which takes `x` as input and outputs\n                scaled `x`, for plotting.\n            title (str or None): title of plot\n            cmap: color map used to plot saliency\n            alpha (float): alpha value of fore ground saliency. 
This option is\n                used only when `image` is set.\n            show_colorbar (bool): show colorbar in plot or not.\n            bbox_inches (str or Bbox or None): used for `plt.savefig` option.\n        \"\"\"\n        # --- type check ---\n        if saliency.ndim == 3:\n            # (ch, h, w) -> (h, w, ch)\n            saliency = cuda.to_cpu(saliency)\n            saliency_image = numpy.transpose(saliency, (1, 2, 0))\n        elif saliency.ndim == 2:\n            # (h, w)\n            saliency_image = saliency\n        else:\n            raise ValueError(\"[ERROR] Unexpected value saliency.shape={}\"\n                             .format(saliency.shape))\n\n        if image is not None:\n            # If `image` is PIL Image, convert to numpy array\n            image = numpy.asarray(image)\n            if image.ndim == 3:\n                # Convert to (h, w, ch) order\n                if image.shape[0] == 3 or image.shape[0] == 4:\n                    # Assume (ch, h, w) order -> (h, w, ch)\n                    image = numpy.transpose(image, (1, 2, 0))\n            elif image.ndim == 2:\n                # (h, w) order\n                pass\n            else:\n                raise ValueError(\"[ERROR] Unexpected value image.shape={}\"\n                                 .format(image.shape))\n            if image.shape[:2] != saliency_image.shape[:2]:\n                self.logger.warning(\n                    'saliency and image height or width is different\\n'\n                    'saliency_image.shape {}, image.shape {}'\n                    .format(saliency_image.shape, image.shape))\n\n        # Normalize to [-1, 1] or [0, 1]\n        if scaler is not None:\n            saliency_image = scaler(saliency_image)\n\n        fig = plt.figure()\n        plt.clf()\n        if title is not None:\n            plt.title(title)\n\n        if image is None:\n            # Only show saliency image, not set alpha\n            im = plt.imshow(saliency_image, 
cmap=cmap)\n        else:\n            # Show original image, and overlay saliency image with alpha\n            plt.imshow(image)\n            im = plt.imshow(saliency_image, alpha=alpha, cmap=cmap)\n\n        if show_colorbar:\n            fig.colorbar(im)\n        if save_filepath:\n            plt.savefig(save_filepath, bbox_inches=bbox_inches)\n        else:\n            plt.show()\n"
  },
  {
    "path": "chainer_chemistry/saliency/visualizer/mol_visualizer.py",
    "content": "from logging import getLogger\n\nimport numpy\n\nfrom rdkit import Chem\nfrom rdkit.Chem.Draw import rdMolDraw2D\nfrom rdkit.Chem import rdDepictor\n\n\nfrom chainer_chemistry.saliency.visualizer.base_visualizer import BaseVisualizer  # NOQA\nfrom chainer_chemistry.saliency.visualizer.visualizer_utils import red_blue_cmap, abs_max_scaler  # NOQA\n\n\ndef _convert_to_2d(axes, nrows, ncols):\n    if nrows == 1 and ncols == 1:\n        axes = numpy.array([[axes]])\n    elif nrows == 1:\n        axes = axes[None, :]\n    elif ncols == 1:\n        axes = axes[:, None]\n    else:\n        pass\n    assert axes.ndim == 2\n    return axes\n\n\ndef is_visible(begin, end):\n    if begin <= 0 or end <= 0:\n        return 0\n    elif begin >= 1 or end >= 1:\n        return 1\n    else:\n        return (begin + end) * 0.5\n\n\nclass MolVisualizer(BaseVisualizer):\n\n    \"\"\"Saliency visualizer for mol data\n\n    Args:\n        logger:\n    \"\"\"\n\n    def __init__(self, logger=None):\n        self.logger = logger or getLogger(__name__)\n\n    def visualize(self, saliency, mol, save_filepath=None,\n                  visualize_ratio=1.0, color_fn=red_blue_cmap,\n                  scaler=abs_max_scaler, legend='', raise_import_error=False\n                  ):\n        \"\"\"Visualize or save `saliency` with molecule\n\n        returned value can be used for visualization.\n\n        .. 
admonition:: Example\n\n           >>> svg = visualizer.visualize(saliency, mol)\n           >>>\n           >>> # For a Jupyter user, it will show figure on notebook.\n           >>> from IPython.core.display import SVG\n           >>> SVG(svg.replace('svg:', ''))\n           >>>\n           >>> # For a user who want to save a file as png\n           >>> import cairosvg\n           >>> cairosvg.svg2png(bytestring=svg, write_to=\"foo.png\")\n\n        Args:\n            saliency (numpy.ndarray): 1-dim saliency array (num_node,)\n            mol (Chem.Mol): mol instance of this saliency\n            save_filepath (str or None): If specified, file is saved to path.\n            visualize_ratio (float): If set, only plot saliency color of top-X\n                atoms.\n            color_fn (callable): color function to show saliency\n            scaler (callable): function which takes `x` as input and outputs\n                scaled `x`, for plotting.\n            legend (str): legend for the plot\n            raise_import_error (bool): raise error when `ImportError` is raised\n\n        Returns:\n            svg (str): drawed svg text.\n        \"\"\"\n        rdDepictor.Compute2DCoords(mol)\n        Chem.SanitizeMol(mol)\n        Chem.Kekulize(mol)\n        num_atoms = mol.GetNumAtoms()\n\n        # --- type check ---\n        if saliency.ndim != 1:\n            raise ValueError(\"Unexpected value saliency.shape={}\"\n                             .format(saliency.shape))\n\n        # Cut saliency array for unnecessary tail part\n        saliency = saliency[:num_atoms]\n        if scaler is not None:\n            # Normalize to [-1, 1] or [0, 1]\n            saliency = scaler(saliency)\n\n        abs_saliency = numpy.abs(saliency)\n        if visualize_ratio < 1.0:\n            threshold_index = int(num_atoms * visualize_ratio)\n            idx = numpy.argsort(abs_saliency)\n            idx = numpy.flip(idx, axis=0)\n            # set threshold to top 
`visualize_ratio` saliency\n            threshold = abs_saliency[idx[threshold_index]]\n            saliency = numpy.where(abs_saliency < threshold, 0., saliency)\n        else:\n            threshold = numpy.min(saliency)\n\n        highlight_atoms = list(map(lambda g: g.__int__(), numpy.where(\n            abs_saliency >= threshold)[0]))\n        atom_colors = {i: color_fn(e) for i, e in enumerate(saliency)}\n        bondlist = [bond.GetIdx() for bond in mol.GetBonds()]\n\n        def color_bond(bond):\n            begin = saliency[bond.GetBeginAtomIdx()]\n            end = saliency[bond.GetEndAtomIdx()]\n            return color_fn(is_visible(begin, end))\n        bondcolorlist = {i: color_bond(bond)\n                         for i, bond in enumerate(mol.GetBonds())}\n        drawer = rdMolDraw2D.MolDraw2DSVG(500, 375)\n        drawer.DrawMolecule(\n            mol, highlightAtoms=highlight_atoms,\n            highlightAtomColors=atom_colors, highlightBonds=bondlist,\n            highlightBondColors=bondcolorlist, legend=legend)\n        drawer.FinishDrawing()\n        svg = drawer.GetDrawingText()\n        if save_filepath:\n            extention = save_filepath.split('.')[-1]\n            if extention == 'svg':\n                with open(save_filepath, 'w') as f:\n                    f.write(svg)\n            elif extention == 'png':\n                # TODO(nakago): check it is possible without cairosvg or not\n                try:\n                    import cairosvg\n                    cairosvg.svg2png(bytestring=svg, write_to=save_filepath)\n                except ImportError as e:\n                    self.logger.error(\n                        'cairosvg is not installed! 
'\n                        'Please install cairosvg to save by png format.\\n'\n                        'pip install cairosvg')\n                    if raise_import_error:\n                        raise e\n            else:\n                raise ValueError(\n                    'Unsupported extention {} for save_filepath {}'\n                    .format(extention, save_filepath))\n        return svg\n\n\nclass SmilesVisualizer(MolVisualizer):\n\n    def visualize(self, saliency, smiles, save_filepath=None,\n                  visualize_ratio=1.0, color_fn=red_blue_cmap,\n                  scaler=abs_max_scaler, legend='', add_Hs=False,\n                  use_canonical_smiles=True, raise_import_error=False):\n        \"\"\"Visualize or save `saliency` with molecule\n\n        See parent `MolVisualizer` class for further usage.\n\n        Args:\n            saliency (numpy.ndarray): 1-dim saliency array (num_node,)\n            smiles (str): smiles of the molecule.\n            save_filepath (str or None): If specified, file is saved to path.\n            visualize_ratio (float): If set, only plot saliency color of top-X\n                atoms.\n            color_fn (callable): color function to show saliency\n            scaler (callable): function which takes `x` as input and outputs\n                scaled `x`, for plotting.\n            legend (str): legend for the plot\n            add_Hs (bool): Add explicit H or not\n            use_canonical_smiles (bool): If `True`, smiles are converted to\n                canonical smiles before constructing `mol`\n            raise_import_error (bool): raise error when `ImportError` is raised\n\n        Returns:\n            svg (str): drawed svg text.\n        \"\"\"\n        mol = Chem.MolFromSmiles(smiles)\n        if use_canonical_smiles:\n            smiles = Chem.MolToSmiles(mol, canonical=True)\n            mol = Chem.MolFromSmiles(smiles)\n        if add_Hs:\n            mol = Chem.AddHs(mol)\n        return 
super(SmilesVisualizer, self).visualize(\n            saliency, mol, save_filepath=save_filepath,\n            visualize_ratio=visualize_ratio, color_fn=color_fn, scaler=scaler,\n            legend=legend, raise_import_error=raise_import_error)\n"
  },
  {
    "path": "chainer_chemistry/saliency/visualizer/table_visualizer.py",
    "content": "import matplotlib.pyplot as plt\nimport numpy\n\nfrom chainer_chemistry.saliency.visualizer.base_visualizer import BaseVisualizer  # NOQA\nfrom chainer_chemistry.saliency.visualizer.visualizer_utils import abs_max_scaler  # NOQA\n\n\nclass TableVisualizer(BaseVisualizer):\n\n    \"\"\"Saliency visualizer for table data\"\"\"\n\n    def visualize(self, saliency, feature_names=None, save_filepath=None,\n                  num_visualize=-1, scaler=abs_max_scaler,\n                  sort='descending', title='Feature Importance', color='b',\n                  xlabel='Importance', bbox_inches='tight'):\n        \"\"\"Visualize or save `saliency` in bar plot.\n\n        Args:\n            saliency (numpy.ndarray): 1-dim saliency array (num_feature,)\n            feature_names (list or numpy.ndarray): Feature names of `saliency`\n            save_filepath (str or None): If specified, file is saved to path.\n            num_visualize (int): If positive value is set, only plot specified\n               number of features.\n            scaler (callable): function which takes `x` as input and outputs\n                scaled `x`, for plotting.\n            sort (str): Below sort options are supported.\n                none: not sort\n                ascending: plot in ascending order\n                descending: plot in descending order\n            title (str or None): title of plot\n            color (str): color of bar in plot\n            xlabel (str): x label legend\n            bbox_inches (str or Bbox or None): used for `plt.savefig` option.\n\n        \"\"\"\n        # --- type check ---\n        if saliency.ndim != 1:\n            raise ValueError(\"[ERROR] Unexpected value saliency.shape={}\"\n                             .format(saliency.shape))\n\n        num_total_feat = saliency.shape[0]\n        if feature_names is not None:\n            # type check\n            if len(feature_names) != num_total_feat:\n                raise ValueError(\n          
          \"feature_names={} must have same length with `saliency`\"\n                    .format(feature_names))\n        else:\n            feature_names = numpy.arange(num_total_feat)\n\n        if sort == 'none':\n            indices = numpy.arange(num_total_feat)\n        elif sort == 'ascending':\n            indices = numpy.argsort(saliency)[::-1]\n        elif sort == 'descending':\n            indices = numpy.argsort(saliency)\n        else:\n            raise ValueError(\"[ERROR] Unexpected value sort={}\".format(sort))\n\n        saliency = saliency[indices]\n        feature_names = numpy.asarray(feature_names)[indices]\n\n        if scaler is not None:\n            # Normalize to [-1, 1] or [0, 1]\n            saliency = scaler(saliency)\n\n        if num_visualize > 0:\n            saliency = saliency[:num_visualize]\n            if feature_names is not None:\n                feature_names = feature_names[:num_visualize]\n        else:\n            num_visualize = num_total_feat\n\n        plt.figure()\n        plt.clf()\n        if title is not None:\n            plt.title(title)\n        plt.barh(range(num_visualize), saliency, color=color, align='center')\n        plt.yticks(range(num_visualize), feature_names)\n        plt.xlabel(xlabel)\n        if save_filepath:\n            plt.savefig(save_filepath, bbox_inches=bbox_inches)\n        else:\n            plt.show()\n"
  },
  {
    "path": "chainer_chemistry/saliency/visualizer/visualizer_utils.py",
    "content": "from logging import getLogger\nimport numpy  # NOQA\n\nfrom chainer import cuda\n\n\ndef red_blue_cmap(x):\n    \"\"\"Red to Blue color map\n\n    Args:\n        x (float): value between -1 ~ 1, represents normalized saliency score\n\n    Returns (tuple): tuple of 3 float values representing R, G, B.\n    \"\"\"\n    if x > 0:\n        # Red for positive value\n        # x=0 -> 1, 1, 1 (white)\n        # x=1 -> 1, 0, 0 (red)\n        return 1., 1. - x, 1. - x\n    else:\n        # Blue for negative value\n        x *= -1\n        return 1. - x, 1. - x, 1.\n\n\ndef min_max_scaler(saliency, logger=None):\n    \"\"\"Normalize saliency to value 0~1\n\n    Args:\n        saliency (numpy.ndarray or cupy.ndarray): saliency array\n        logger:\n\n    Returns (numpy.ndarray or cupy.ndarray): normalized saliency array\n\n    \"\"\"\n    xp = cuda.get_array_module(saliency)\n    maxv = xp.max(saliency)\n    minv = xp.min(saliency)\n    if maxv == minv:\n        logger = logger or getLogger(__name__)\n        logger.info('All saliency value is 0')\n        saliency = xp.zeros_like(saliency)\n    else:\n        saliency = (saliency - minv) / (maxv - minv)\n    return saliency\n\n\ndef abs_max_scaler(saliency, logger=None):\n    \"\"\"Normalize saliency to value -1~1\n\n    Args:\n        saliency (numpy.ndarray or cupy.ndarray): saliency array\n        logger:\n\n    Returns (numpy.ndarray or cupy.ndarray): normalized saliency array\n\n    \"\"\"\n    xp = cuda.get_array_module(saliency)\n    maxv = xp.max(xp.abs(saliency))\n    if maxv <= 0:\n        logger = logger or getLogger(__name__)\n        logger.info('All saliency value is 0')\n        return xp.zeros_like(saliency)\n    else:\n        return saliency / maxv\n\n\ndef normalize_scaler(saliency, axis=None, logger=None):\n    \"\"\"Normalize saliency to be sum=1\n\n    Args:\n        saliency (numpy.ndarray or cupy.ndarray): saliency array.\n        axis (int): axis to take sum for normalization.\n     
   logger:\n\n    Returns (numpy.ndarray or cupy.ndarray): normalized saliency array\n\n    \"\"\"\n    xp = cuda.get_array_module(saliency)\n    if xp.sum(saliency < 0) > 0:\n        logger = logger or getLogger(__name__)\n        logger.warning('saliency array contains negative number, '\n                       'which is unexpected!')\n    vsum = xp.sum(xp.abs(saliency), axis=axis, keepdims=True)\n    if vsum <= 0:\n        logger = logger or getLogger(__name__)\n        logger.info('All saliency value is 0')\n        return xp.zeros_like(saliency)\n    else:\n        return saliency / vsum\n"
  },
  {
    "path": "chainer_chemistry/training/__init__.py",
    "content": "from chainer_chemistry.training import extensions  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/training/extensions/__init__.py",
    "content": "from chainer_chemistry.training.extensions import batch_evaluator  # NOQA\nfrom chainer_chemistry.training.extensions import r2_score_evaluator  # NOQA\nfrom chainer_chemistry.training.extensions import roc_auc_evaluator  # NOQA\n\n# import class and function\nfrom chainer_chemistry.training.extensions.batch_evaluator import BatchEvaluator  # NOQA\nfrom chainer_chemistry.training.extensions.r2_score_evaluator import R2ScoreEvaluator  # NOQA\nfrom chainer_chemistry.training.extensions.roc_auc_evaluator import ROCAUCEvaluator  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/training/extensions/auto_print_report.py",
    "content": "from copy import deepcopy\nimport os\nimport sys\n\nfrom chainer.training import extension\nfrom chainer.training.extensions import log_report as log_report_module\nfrom chainer.training.extensions import util\n\n\ndef create_header_and_templates(entries):\n    # format information\n    entry_widths = [max(10, len(s)) for s in entries]\n\n    header = '  '.join(('{:%d}' % w for w in entry_widths)).format(\n        *entries) + '\\n'\n    templates = []\n    for entry, w in zip(entries, entry_widths):\n        templates.append((entry, '{:<%dg}  ' % w, ' ' * (w + 2)))\n    return header, templates\n\n\ndef filter_and_sort_entries(all_entries, unit='epoch'):\n    entries = deepcopy(all_entries)\n    # TODO(nakago): sort other entries if necessary\n\n    if 'iteration' in entries:\n        # move iteration to head\n        entries.pop(entries.index('iteration'))\n        if unit == 'iteration':\n            entries = ['iteration'] + entries\n    if 'epoch' in entries:\n        # move epoch to head\n        entries.pop(entries.index('epoch'))\n        if unit == 'epoch':\n            entries = ['epoch'] + entries\n    if 'elapsed_time' in entries:\n        # move elapsed_time to tail\n        entries.pop(entries.index('elapsed_time'))\n        entries.append('elapsed_time')\n    return entries\n\n\nclass AutoPrintReport(extension.Extension):\n\n    \"\"\"`PrintReport` with auto `entries` detection.\n\n    This extension uses the log accumulated by a :class:`LogReport` extension\n    to print specified entries of the log in a human-readable format.\n\n    Args:\n        log_report (str or LogReport): Log report to accumulate the\n            observations. This is either the name of a LogReport extensions\n            registered to the trainer, or a LogReport instance to use\n            internally.\n        out: Stream to print the bar. 
Standard output is used by default.\n\n    \"\"\"\n\n    def __init__(self, log_report='LogReport', out=sys.stdout):\n        self._entries = []\n        self._log_report = log_report\n        self._out = out\n\n        self._log_len = 0  # number of observations already printed\n\n        header, templates = create_header_and_templates([])\n        self._header = header  # printed at the first call\n        self._templates = templates\n        self._all_entries = []\n\n    def get_log_report(self, trainer):\n        log_report = self._log_report\n        if isinstance(log_report, str):\n            log_report = trainer.get_extension(log_report)\n        elif isinstance(log_report, log_report_module.LogReport):\n            log_report(trainer)  # update the log report\n        else:\n            raise TypeError('log report has a wrong type %s' %\n                            type(log_report))\n        return log_report\n\n    def __call__(self, trainer):\n        # --- update entries ---\n        log_report = self.get_log_report(trainer)\n        log = log_report.log\n\n        updated_flag = False\n        aggregate_entries = log[self._log_len:]\n        for obs in aggregate_entries:\n            for entry in obs.keys():\n                if entry not in self._all_entries:\n                    self._all_entries.append(entry)\n                    updated_flag = True\n\n        if updated_flag:\n            if hasattr(log_report, '_trigger') and hasattr(log_report._trigger,\n                                                           'unit'):\n                unit = log_report._trigger.unit\n            else:\n                # Failed to infer `unit`, use epoch as default\n                unit = 'epoch'\n            entries = filter_and_sort_entries(self._all_entries, unit=unit)\n            self._entries = entries\n            header, templates = create_header_and_templates(entries)\n            self._header = header  # printed at the first call\n            
self._templates = templates\n\n        out = self._out\n\n        if self._header:\n            out.write(self._header)\n            self._header = None\n\n        log_len = self._log_len\n        while len(log) > log_len:\n            # delete the printed contents from the current cursor\n            if os.name == 'nt':\n                util.erase_console(0, 0)\n            else:\n                out.write('\\033[J')\n            self._print(log[log_len])\n            log_len += 1\n        self._log_len = log_len\n\n    def serialize(self, serializer):\n        log_report = self._log_report\n        if isinstance(log_report, log_report_module.LogReport):\n            log_report.serialize(serializer['_log_report'])\n\n    def _print(self, observation):\n        out = self._out\n        for entry, template, empty in self._templates:\n            if entry in observation:\n                out.write(template.format(observation[entry]))\n            else:\n                out.write(empty)\n        out.write('\\n')\n        if hasattr(out, 'flush'):\n            out.flush()\n"
  },
  {
    "path": "chainer_chemistry/training/extensions/batch_evaluator.py",
    "content": "import copy\nfrom logging import getLogger\n\nimport numpy\n\nimport chainer\nfrom chainer import cuda\nfrom chainer.dataset import convert\nfrom chainer import reporter\nfrom chainer.training.extensions import Evaluator\n\n\ndef _get_1d_numpy_array(v):\n    \"\"\"Convert array or Variable to 1d numpy array\n\n    Args:\n        v (numpy.ndarray or cupy.ndarray or chainer.Variable): array to be\n            converted to 1d numpy array\n\n    Returns (numpy.ndarray): Raveled 1d numpy array\n\n    \"\"\"\n    if isinstance(v, chainer.Variable):\n        v = v.data\n    return cuda.to_cpu(v).ravel()\n\n\nclass BatchEvaluator(Evaluator):\n\n    def __init__(self, iterator, target, converter=convert.concat_examples,\n                 device=None, eval_hook=None, eval_func=None, metrics_fun=None,\n                 name=None, logger=None):\n        super(BatchEvaluator, self).__init__(\n            iterator, target, converter=converter, device=device,\n            eval_hook=eval_hook, eval_func=eval_func)\n        self.name = name\n        self.logger = logger or getLogger()\n\n        if callable(metrics_fun):\n            # TODO(mottodora): use better name or infer\n            self.metrics_fun = {\"evaluation\": metrics_fun}\n        elif isinstance(metrics_fun, dict):\n            self.metrics_fun = metrics_fun\n        else:\n            raise TypeError('Unexpected type metrics_fun must be Callable or '\n                            'dict.')\n\n    def evaluate(self):\n        iterator = self._iterators['main']\n        eval_func = self.eval_func or self._targets['main']\n\n        if self.eval_hook:\n            self.eval_hook(self)\n\n        if hasattr(iterator, 'reset'):\n            iterator.reset()\n            it = iterator\n        else:\n            it = copy.copy(iterator)\n\n        y_total = []\n        t_total = []\n        for batch in it:\n            in_arrays = self.converter(batch, self.device)\n            with 
chainer.no_backprop_mode(), chainer.using_config('train',\n                                                                  False):\n                y = eval_func(*in_arrays[:-1])\n            t = in_arrays[-1]\n            y_data = _get_1d_numpy_array(y)\n            t_data = _get_1d_numpy_array(t)\n            y_total.append(y_data)\n            t_total.append(t_data)\n\n        y_total = numpy.concatenate(y_total).ravel()\n        t_total = numpy.concatenate(t_total).ravel()\n        # metrics_value = self.metrics_fun(y_total, t_total)\n        metrics = {key: metric_fun(y_total, t_total) for key, metric_fun in\n                   self.metrics_fun.items()}\n\n        observation = {}\n        with reporter.report_scope(observation):\n            reporter.report(metrics, self._targets['main'])\n        return observation\n"
  },
  {
    "path": "chainer_chemistry/training/extensions/prc_auc_evaluator.py",
    "content": "import numpy\n\nfrom chainer.dataset import convert\nfrom sklearn import metrics\n\nfrom chainer_chemistry.training.extensions.batch_evaluator import BatchEvaluator  # NOQA\n\n\ndef _to_list(a):\n    \"\"\"convert value `a` to list\n\n    Args:\n        a: value to be convert to `list`\n\n    Returns (list):\n\n    \"\"\"\n    if isinstance(a, (int, float)):\n        return [a, ]\n    else:\n        # expected to be list or some iterable class\n        return a\n\n\nclass PRCAUCEvaluator(BatchEvaluator):\n\n    \"\"\"Evaluator which calculates PRC AUC score\n\n    Note that this Evaluator is only applicable to binary classification task.\n\n    Args:\n        iterator: Dataset iterator for the dataset to calculate PRC AUC score.\n            It can also be a dictionary of iterators. If this is just an\n            iterator, the iterator is registered by the name ``'main'``.\n        target: Link object or a dictionary of links to evaluate. If this is\n            just a link object, the link is registered by the name ``'main'``.\n        converter: Converter function to build input arrays and true label.\n            :func:`~chainer.dataset.concat_examples` is used by default.\n            It is expected to return input arrays of the form\n            `[x_0, ..., x_n, t]`, where `x_0, ..., x_n` are the inputs to\n            the evaluation function and `t` is the true label.\n        device: Device to which the training data is sent. Negative value\n            indicates the host memory (CPU).\n        eval_hook: Function to prepare for each evaluation process. It is\n            called at the beginning of the evaluation. The evaluator extension\n            object is passed at each call.\n        eval_func: Evaluation function called at each iteration. The target\n            link to evaluate as a callable is used by default.\n        name (str): name of this extension. 
When `name` is None,\n            `default_name='validation'` which is defined in super class\n            `Evaluator` is used as extension name. This name affects to the\n            reported key name.\n        pos_labels (int or list): labels of the positive class, other classes\n            are considered as negative.\n        ignore_labels (int or list or None): labels to be ignored.\n            `None` is used to not ignore all labels.\n        raise_value_error (bool): If `False`, `ValueError` caused by\n            `roc_auc_score` calculation is suppressed and ignored with a\n            warning message.\n        logger:\n\n    Attributes:\n        converter: Converter function.\n        device: Device to which the training data is sent.\n        eval_hook: Function to prepare for each evaluation process.\n        eval_func: Evaluation function called at each iteration.\n        pos_labels (list): labels of the positive class\n        ignore_labels (list): labels to be ignored.\n\n    \"\"\"\n\n    def __init__(self, iterator, target, converter=convert.concat_examples,\n                 device=None, eval_hook=None, eval_func=None, name=None,\n                 pos_labels=1, ignore_labels=None, raise_value_error=True,\n                 logger=None):\n        metrics_fun = {'prc_auc': self.prc_auc_score}\n        super(PRCAUCEvaluator, self).__init__(\n            iterator, target, converter=converter, device=device,\n            eval_hook=eval_hook, eval_func=eval_func, metrics_fun=metrics_fun,\n            name=name, logger=logger)\n\n        self.pos_labels = _to_list(pos_labels)\n        self.ignore_labels = _to_list(ignore_labels)\n        self.raise_value_error = raise_value_error\n\n    def prc_auc_score(self, y_total, t_total):\n        # --- ignore labels if specified ---\n        if self.ignore_labels:\n            valid_ind = numpy.in1d(t_total, self.ignore_labels, invert=True)\n            y_total = y_total[valid_ind]\n            t_total = 
t_total[valid_ind]\n\n        # --- set positive labels to 1, negative labels to 0 ---\n        pos_indices = numpy.in1d(t_total, self.pos_labels)\n        t_total = numpy.where(pos_indices, 1, 0)\n\n        if len(numpy.unique(t_total)) != 2:\n            if self.raise_value_error:\n                raise ValueError(\"Only one class present in y_true. PRC AUC \"\n                                 \"score is not defined in that case.\")\n            else:\n                return numpy.nan\n\n        precision, recall, _ = metrics.precision_recall_curve(t_total, y_total)\n        prc_auc = metrics.auc(recall, precision)\n        return prc_auc\n"
  },
  {
    "path": "chainer_chemistry/training/extensions/r2_score_evaluator.py",
    "content": "from chainer.backends import cuda\nfrom chainer.dataset import convert\n\nfrom chainer_chemistry.training.extensions.batch_evaluator import BatchEvaluator  # NOQA\n\n\nclass R2ScoreEvaluator(BatchEvaluator):\n\n    \"\"\"Evaluator with calculates R^2 (coefficient of determination)\n\n    regression score.\n\n    Args:\n        iterator: Dataset iterator for the dataset to calculate\n            R^2(coefficient of determination) regression score.\n            It can also be a dictionary of iterators. If this is just an\n            iterator, the iterator is registered by the name ``'main'``.\n        target: Link object or a dictionary of links to evaluate. If this is\n            just a link object, the link is registered by the name ``'main'``.\n        converter: Converter function to build input arrays and true label.\n            :func:`~chainer.dataset.concat_examples` is used by default.\n            It is expected to return input arrays of the form\n            `[x_0, ..., x_n, t]`, where `x_0, ..., x_n` are the inputs to\n            the evaluation function and `t` is the true label.\n        device: Device to which the training data is sent. Negative value\n            indicates the host memory (CPU).\n        eval_hook: Function to prepare for each evaluation process. It is\n            called at the beginning of the evaluation. The evaluator extension\n            object is passed at each call.\n        eval_func: Evaluation function called at each iteration. The target\n            link to evaluate as a callable is used by default.\n        name (str): name of this extension. When `name` is None,\n            `default_name='validation'` which is defined in super class\n            `Evaluator` is used as extension name. 
This name affects to the\n            reported key name.\n        pos_labels (int or list): labels of the positive class, other classes\n            are considered as negative.\n        ignore_labels (int or list or None): labels to be ignored.\n            `None` is used to not ignore all labels.\n        raise_value_error (bool): If `False`, `ValueError` caused by\n            `roc_auc_score` calculation is suppressed and ignored with a\n            warning message.\n        logger:\n        sample_weight: This argument is for compatibility with\n            scikit-learn's implementation of r2_score. Current\n            implementation admits None only.\n        multioutput (str): If 'uniform_average', this function returns an\n            average of R^2 score of multiple output. If 'raw_average', this\n            function return a set of R^2 score of multiple output.\n\n    Attributes:\n        converter: Converter function.\n        device: Device to which the training data is sent.\n        eval_hook: Function to prepare for each evaluation process.\n        eval_func: Evaluation function called at each iteration.\n        pos_labels (list): labels of the positive class\n        ignore_labels (list): labels to be ignored.\n    \"\"\"\n\n    def __init__(self, iterator, target, converter=convert.concat_examples,\n                 device=None, eval_hook=None, eval_func=None, name=None,\n                 pos_label=1, ignore_labels=None, raise_value_error=True,\n                 logger=None, sample_weight=None,\n                 multioutput='uniform_average', ignore_nan=False):\n        metrics_fun = {'r2_score': self.r2_score}\n        super(R2ScoreEvaluator, self).__init__(\n            iterator, target, converter=converter, device=device,\n            eval_hook=eval_hook, eval_func=eval_func, metrics_fun=metrics_fun,\n            name=name, logger=logger)\n\n        self.pos_label = pos_label\n        self.ignore_labels = ignore_labels\n        
self.raise_value_error = raise_value_error\n        self.sample_weight = sample_weight\n        self.multioutput = multioutput\n        self.ignore_nan = ignore_nan\n\n    def r2_score(self, pred, true, sample_weight=None,\n                 multioutput='uniform_average', ignore_nan=False):\n\n        if self.sample_weight is not None:\n            raise NotImplementedError()\n        if self.multioutput not in ['uniform_average', 'raw_values']:\n            raise ValueError('invalid multioutput argument')\n\n        xp = cuda.get_array_module(pred)\n        diff = pred - true\n        dev = true - xp.mean(true, axis=0)\n        if self.ignore_nan:\n            diff[xp.isnan(diff)] = 0.\n            dev[xp.isnan(dev)] = 0.\n        SS_res = xp.asarray(xp.sum(diff ** 2, axis=0))\n        SS_tot = xp.asarray(xp.sum(dev ** 2, axis=0))\n        SS_tot_iszero = SS_tot == 0\n        SS_tot[SS_tot_iszero] = 1  # Assign dummy value to avoid zero-division\n        ret = xp.where(\n            SS_tot_iszero, 0.0, 1 - SS_res / SS_tot).astype(pred.dtype)\n        if self.multioutput == 'uniform_average':\n            return xp.asarray(ret.mean())\n        elif self.multioutput == 'raw_values':\n            return ret\n"
  },
  {
    "path": "chainer_chemistry/training/extensions/roc_auc_evaluator.py",
    "content": "import numpy\n\nfrom chainer.dataset import convert\nfrom sklearn import metrics\n\nfrom chainer_chemistry.training.extensions.batch_evaluator import BatchEvaluator  # NOQA\n\n\ndef _to_list(a):\n    \"\"\"convert value `a` to list\n\n    Args:\n        a: value to be convert to `list`\n\n    Returns (list):\n\n    \"\"\"\n    if isinstance(a, (int, float)):\n        return [a, ]\n    else:\n        # expected to be list or some iterable class\n        return a\n\n\nclass ROCAUCEvaluator(BatchEvaluator):\n\n    \"\"\"Evaluator which calculates ROC AUC score\n\n    Note that this Evaluator is only applicable to binary classification task.\n\n    Args:\n        iterator: Dataset iterator for the dataset to calculate ROC AUC score.\n            It can also be a dictionary of iterators. If this is just an\n            iterator, the iterator is registered by the name ``'main'``.\n        target: Link object or a dictionary of links to evaluate. If this is\n            just a link object, the link is registered by the name ``'main'``.\n        converter: Converter function to build input arrays and true label.\n            :func:`~chainer.dataset.concat_examples` is used by default.\n            It is expected to return input arrays of the form\n            `[x_0, ..., x_n, t]`, where `x_0, ..., x_n` are the inputs to\n            the evaluation function and `t` is the true label.\n        device: Device to which the training data is sent. Negative value\n            indicates the host memory (CPU).\n        eval_hook: Function to prepare for each evaluation process. It is\n            called at the beginning of the evaluation. The evaluator extension\n            object is passed at each call.\n        eval_func: Evaluation function called at each iteration. The target\n            link to evaluate as a callable is used by default.\n        name (str): name of this extension. 
When `name` is None,\n            `default_name='validation'` which is defined in super class\n            `Evaluator` is used as extension name. This name affects to the\n            reported key name.\n        pos_labels (int or list): labels of the positive class, other classes\n            are considered as negative.\n        ignore_labels (int or list or None): labels to be ignored.\n            `None` is used to not ignore all labels.\n        raise_value_error (bool): If `False`, `ValueError` caused by\n            `roc_auc_score` calculation is suppressed and ignored with a\n            warning message.\n        logger:\n\n    Attributes:\n        converter: Converter function.\n        device: Device to which the training data is sent.\n        eval_hook: Function to prepare for each evaluation process.\n        eval_func: Evaluation function called at each iteration.\n        pos_labels (list): labels of the positive class\n        ignore_labels (list): labels to be ignored.\n\n    \"\"\"\n\n    def __init__(self, iterator, target, converter=convert.concat_examples,\n                 device=None, eval_hook=None, eval_func=None, name=None,\n                 pos_labels=1, ignore_labels=None, raise_value_error=True,\n                 logger=None):\n        metrics_fun = {'roc_auc': self.roc_auc_score}\n        super(ROCAUCEvaluator, self).__init__(\n            iterator, target, converter=converter, device=device,\n            eval_hook=eval_hook, eval_func=eval_func, metrics_fun=metrics_fun,\n            name=name, logger=logger)\n\n        self.pos_labels = _to_list(pos_labels)\n        self.ignore_labels = _to_list(ignore_labels)\n        self.raise_value_error = raise_value_error\n\n    def roc_auc_score(self, y_total, t_total):\n        # --- ignore labels if specified ---\n        if self.ignore_labels:\n            valid_ind = numpy.in1d(t_total, self.ignore_labels, invert=True)\n            y_total = y_total[valid_ind]\n            t_total = 
t_total[valid_ind]\n\n        # --- set positive labels to 1, negative labels to 0 ---\n        pos_indices = numpy.in1d(t_total, self.pos_labels)\n        t_total = numpy.where(pos_indices, 1, 0)\n        try:\n            roc_auc = metrics.roc_auc_score(t_total, y_total)\n        except ValueError as e:\n            # When only one class present in `y_true`, `ValueError` is raised.\n            # ROC AUC score is not defined in that case.\n            if self.raise_value_error:\n                raise e\n            else:\n                self.logger.warning(\n                    'ValueError detected during roc_auc_score calculation. {}'\n                    .format(e.args))\n                roc_auc = numpy.nan\n        return roc_auc\n"
  },
  {
    "path": "chainer_chemistry/utils/__init__.py",
    "content": "from chainer_chemistry.utils.json_utils import load_json  # NOQA\nfrom chainer_chemistry.utils.json_utils import save_json  # NOQA\nfrom chainer_chemistry.utils.sparse_utils import convert_sparse_with_edge_type  # NOQA\nfrom chainer_chemistry.utils.sparse_utils import is_sparse  # NOQA\nfrom chainer_chemistry.utils.train_utils import run_train  # NOQA\n"
  },
  {
    "path": "chainer_chemistry/utils/extend.py",
    "content": "from collections import Iterable\nfrom logging import getLogger\nimport six\n\nfrom chainer import cuda\n\n\ndef _to_list(a):\n    if isinstance(a, Iterable):\n        a = list(a)\n    else:\n        a = [a]\n    return a\n\n\ndef extend_node(node, out_size, axis=-1, value=0):\n    \"\"\"Extend size of `node` array\n\n    For now, this function works same with `extend_array` method,\n    this is just an alias function.\n\n    Args:\n        node (numpy.ndarray): the array whose `axis` to be extended.\n            first axis is considered as \"batch\" axis.\n        out_size (int): target output size for specified `axis`.\n        axis (int): node feature axis to be extended.\n            Default is `axis=-1`, which extends only last axis.\n        value (int or float): value to be filled for extended place.\n\n    Returns (numpy.ndarray): extended `node` array, extended place is filled\n        with `value`\n\n    \"\"\"\n    return extend_arrays_to_size(\n        node, out_size=out_size, axis=axis, value=value)\n\n\ndef extend_adj(adj, out_size, axis=None, value=0):\n    \"\"\"Extend size of `adj` array\n\n    For now, this function only differs default `axis` value from\n    `extend_array` method, this is an alias function.\n\n    Args:\n        adj (numpy.ndarray): the array whose `axis` to be extended.\n            first axis is considered as \"batch\" axis.\n        out_size (int): target output size for specified `axis`.\n        axis (list or None): node feature axis to be extended. 
Default is None,\n            in this case `axis=[-1, -2]` is used to extend last 2 axes.\n        value (int or float): value to be filled for extended place.\n\n    Returns (numpy.ndarray): extended `adj` array, extended place is filled\n        with `value`\n\n    \"\"\"\n    axis = axis or [-1, -2]\n    return extend_arrays_to_size(\n        adj, out_size=out_size, axis=axis, value=value)\n\n\ndef extend_arrays_to_size(arrays, out_size, axis=-1, value=0):\n    \"\"\"Extend size of `arrays` array\n\n    Args:\n        arrays (numpy.ndarray): the array whose `axis` to be extended.\n            first axis is considered as \"batch\" axis.\n        out_size (int): target output size for specified `axis`.\n        axis (int or list): node feature axis to be extended.\n        value (int or float): value to be filled for extended place.\n\n    Returns (numpy.ndarray): extended array, extended place is filled\n        with `value`\n\n    \"\"\"\n    batch_size = len(arrays)\n\n    in_shape = _to_list(arrays[0].shape)\n    out_shape = [batch_size] + in_shape\n\n    axis = _to_list(axis)\n    for ax in axis:\n        if ax == 0:\n            logger = getLogger(__name__)\n            logger.warning('axis 0 detected, but axis=0 is expected to be '\n                           'batch size dimension.')\n        if out_shape[ax] > out_size:\n            raise ValueError(\n                'current size={} is larger than out_size={} at axis={}'\n                .format(out_shape[ax], out_size, ax))\n        out_shape[ax] = out_size\n    return extend_arrays_to_shape(arrays, out_shape, value=value)\n\n\ndef extend_arrays_to_shape(arrays, out_shape, value=0):\n    # Ref: `_concat_arrays_with_padding` method in chainer convert.py\n    # https://github.com/chainer/chainer/blob/master/chainer/dataset/convert.py\n    xp = cuda.get_array_module(arrays[0])\n    with cuda.get_device_from_array(arrays[0]):\n        result = xp.full(out_shape, value, dtype=arrays[0].dtype)\n        for i 
in six.moves.range(len(arrays)):\n            src = arrays[i]\n            slices = tuple(slice(dim) for dim in src.shape)\n            result[(i,) + slices] = src\n    return result\n"
  },
  {
    "path": "chainer_chemistry/utils/json_utils.py",
    "content": "import json\nfrom logging import getLogger\nimport numpy\ntry:\n    from pathlib import PurePath\n    _is_pathlib_available = True\nexcept ImportError:\n    _is_pathlib_available = False\n\nfrom chainer import cuda\n\n\nclass JSONEncoderEX(json.JSONEncoder):\n    \"\"\"Encoder class used for `json.dump`\"\"\"\n\n    def default(self, obj):\n        if isinstance(obj, numpy.integer):\n            return int(obj)\n        elif isinstance(obj, numpy.floating):\n            return float(obj)\n        elif isinstance(obj, numpy.ndarray):\n            return obj.tolist()\n        elif isinstance(obj, cuda.ndarray):\n            return cuda.to_cpu(obj).tolist()\n        elif _is_pathlib_available and isinstance(obj, PurePath):\n            # save as str representation\n            # convert windows path separator to linux format\n            return str(obj).replace('\\\\', '/')\n        else:\n            return super(JSONEncoderEX, self).default(obj)\n\n\ndef save_json(filepath, params, ignore_error=False, indent=4, logger=None):\n    \"\"\"Save `params` to `filepath` in json format.\n\n    It also supports `numpy` & `cupy` array serialization by converting them to\n    `list` format.\n\n    Args:\n        filepath (str): filepath to save args\n        params (dict or list): parameters to be saved.\n        ignore_error (bool): If `True`, it will ignore exception with printing\n            error logs, which prevents to stop.\n        indent (int): Indent for saved file.\n        logger:\n\n    \"\"\"\n    try:\n        with open(filepath, 'w') as f:\n            json.dump(params, f, indent=indent, cls=JSONEncoderEX)\n    except Exception as e:\n        if not ignore_error:\n            raise e\n        else:\n            logger = logger or getLogger(__name__)\n            logger.warning('Error occurred at save_json, but ignoring...')\n            logger.warning('The file {} may not be saved or corrupted.'\n                           .format(filepath))\n   
         logger.warning(e)\n\n\ndef load_json(filepath):\n    \"\"\"Load params, which is stored in json format.\n\n    Args:\n        filepath (str): filepath to json file to load.\n\n    Returns (dict or list): params\n    \"\"\"\n    with open(filepath, 'r') as f:\n        params = json.load(f)\n    return params\n"
  },
  {
    "path": "chainer_chemistry/utils/permutation.py",
    "content": "import numpy\n\n\ndef permute_node(node, permutation_index, axis=-1):\n    \"\"\"Permute index of `node` array\n\n    Args:\n        node (numpy.ndarray): the array whose `axis` to be permuted.\n        permutation_index (numpy.ndarray): 1d numpy array whose size should be\n            same as permutation axis of `node`.\n        axis (int): permutation axis.\n\n    Returns (numpy.ndarray): permutated `node` array.\n\n    \"\"\"\n    if node.shape[axis] != len(permutation_index):\n        raise ValueError(\n            'node.shape[{}] = {} and len(permutation_index) = {} do not match!'\n            .format(axis, node.shape[axis], len(permutation_index)))\n    out_node = numpy.take(node, permutation_index, axis=axis).copy()\n    return out_node\n\n\ndef permute_adj(adj, permutation_index, axis=None):\n    \"\"\"Permute index of adjacency matrix array\n\n    Args:\n        adj (numpy.ndarray): the array whose `axis` to be permuted.\n            It is considered as adjacency matrix.\n        permutation_index (numpy.ndarray): 1d numpy array whose size should be\n            same as permutation axis of `node`.\n        axis (list or tuple or None): list of 2d int, indicates the permutation\n            axis. 
When None is passed (default), it uses -1 and -2 as `axis`,\n            it means that last 2 axis are considered to be permuted.\n\n    Returns (numpy.ndarray): permutated `adj` array.\n\n    \"\"\"\n    if axis is not None:\n        if not isinstance(axis, (list, tuple)):\n            raise TypeError('axis must be list or tuple, got {}'\n                            .format(type(axis)))\n        if len(axis) != 2:\n            raise ValueError('axis length must 2, got {}'.format(len(axis)))\n    else:\n        axis = [-1, -2]  # default value is to use last 2 axis\n\n    num_node = len(permutation_index)\n    for ax in axis:\n        if adj.shape[ax] != len(permutation_index):\n            raise ValueError(\n                'adj.shape[{}] = {} and len(permutation_index) = {} do not '\n                'match!'.format(axis, adj.shape[axis], len(permutation_index)))\n\n    out_adj = numpy.zeros_like(adj)\n    ndim = adj.ndim\n    for i in range(num_node):\n        for j in range(num_node):\n            in_indices = [slice(None)] * ndim\n            out_indices = [slice(None)] * ndim\n            in_indices[axis[0]] = i\n            in_indices[axis[1]] = j\n            out_indices[axis[0]] = permutation_index[i]\n            out_indices[axis[1]] = permutation_index[j]\n            out_adj[tuple(in_indices)] = adj[tuple(out_indices)]\n    return out_adj\n"
  },
  {
    "path": "chainer_chemistry/utils/sparse_utils.py",
    "content": "import chainer\nfrom chainer import cuda\nimport numpy as np\n\ntry:\n    from chainer.utils import CooMatrix\n    _coomatrix_imported = True\nexcept Exception:\n    _coomatrix_imported = False\n\n\ndef _flatten(x):\n    if isinstance(x, chainer.Variable):\n        x = x.data\n    x = chainer.backends.cuda.to_cpu(x)\n    return x.flatten()\n\n\ndef sparse_utils_available():\n    from distutils.version import StrictVersion\n    return _coomatrix_imported and\\\n        StrictVersion(np.__version__) >= StrictVersion('1.16')\n\n\ndef is_sparse(x):\n    if _coomatrix_imported and isinstance(x, CooMatrix):\n        return True\n    else:\n        return False\n\n\ndef convert_sparse_with_edge_type(data, row, col, num_nodes,\n                                  edge_type, num_edge_type):\n    \"\"\"Convert a sparse matrix with edge type to a regular COO matrix.\n\n    Args:\n        data (numpy.ndarray): the entries of the batched sparse matrix.\n        row (numpy.ndarray): the row indices of the matrix entries.\n        col (numpy.ndarray): the column indices of the matrix entries.\n        num_nodes (int): the number of nodes in the batched graph.\n        edge_type (numpy.ndarray): edge type information of edges.\n        num_edge_type (int): number of edge type.\n\n    Returns (chainer.utils.CooMatrix): new sparse COO matrix whose minibatch\n        size is equal to ((original minibatch size) * num_edge_type).\n    \"\"\"\n    assert len(data.shape) == 2\n    assert row.shape == data.shape\n    assert col.shape == data.shape\n    assert edge_type.shape == data.shape\n\n    mb, length = data.shape\n    xp = cuda.get_array_module(data)\n\n    data = _flatten(data)\n    row = _flatten(row)\n    col = _flatten(col)\n    edge_type = _flatten(edge_type)\n\n    # From now on, suppose that\n    # edge_type = [[1, 1, 3, 1], [0, 2, 1, 0]] as example.\n    # Then,\n    # pos_mb    = [1, 1, 3, 1, 4, 6, 5, 4].\n    pos_mb = np.repeat(np.arange(mb), length) * 
num_edge_type + edge_type\n\n    # argsort    = [0, 1, 3, 2, 4, 7, 6, 5]\n    # sorted_pos = [1, 1, 1, 3, 4, 4, 5, 6]\n    argsort = pos_mb.argsort()\n    sorted_pos = pos_mb[argsort]\n\n    # df         = [0, 0, 0, 1, 1, 0, 1, 1]\n    df = np.diff(sorted_pos, prepend=sorted_pos[0]) != 0\n    # extract    = [3, 4, 6, 7]\n    extract = np.arange(mb * length)[df]\n    # d_extract  = [3, 1, 2, 1]\n    d_extract = np.diff(extract, prepend=0)\n\n    # p          = [0, 0, 0, 3, 1, 0, 2, 1]\n    p = np.zeros(mb * length, dtype=np.int32)\n    p[df] = d_extract\n    # pos_i_perm = [0, 1, 2, 0, 0, 1, 0, 0]\n    pos_i_perm = np.arange(mb * length) - p.cumsum()\n    # pos_i      = [0, 1, 0, 2, 0, 0, 0, 1]\n    pos_i = np.zeros_like(pos_i_perm)\n    pos_i[argsort] = pos_i_perm\n\n    # new_length = 3\n    new_length = pos_i.max() + 1\n    new_mb = mb * num_edge_type\n\n    new_data = xp.zeros((new_mb, new_length), dtype=data.dtype)\n    new_data[pos_mb, pos_i] = data\n\n    new_row = xp.zeros((new_mb, new_length), dtype=np.int32)\n    new_row[pos_mb, pos_i] = row\n\n    new_col = xp.zeros((new_mb, new_length), dtype=np.int32)\n    new_col[pos_mb, pos_i] = col\n\n    new_shape = (num_nodes, num_nodes)\n    return chainer.utils.CooMatrix(new_data, new_row, new_col, new_shape)\n\n\ndef _convert_to_sparse(dense_adj):\n    # naive conversion function mainly for testing\n    xp = cuda.get_array_module(dense_adj)\n    dense_adj = cuda.to_cpu(dense_adj)\n    batch_size, num_edge_type, atom_size = dense_adj.shape[:3]\n    data = []\n    row = []\n    col = []\n    edge_type = []\n    for mb in range(batch_size):\n        data.append([])\n        row.append([])\n        col.append([])\n        edge_type.append([])\n        for e in range(num_edge_type):\n            for i in range(atom_size):\n                for j in range(atom_size):\n                    data[-1].append(dense_adj[mb, e, i, j])\n                    row[-1].append(i)\n                    col[-1].append(j)\n               
     edge_type[-1].append(e)\n\n    data = xp.array(data, dtype=dense_adj.dtype)\n    row = xp.array(row, dtype=xp.int32)\n    col = xp.array(col, dtype=xp.int32)\n    edge_type = xp.array(edge_type, dtype=xp.int32)\n\n    return data, row, col, edge_type\n"
  },
  {
    "path": "chainer_chemistry/utils/train_utils.py",
    "content": "import chainer\nfrom chainer import optimizers, training, Optimizer  # NOQA\nfrom chainer._backend import Device\nfrom chainer.dataset import convert, Iterator  # NOQA\nfrom chainer.iterators import SerialIterator\nfrom chainer.training import extensions\n\nfrom chainer_chemistry.training.extensions.auto_print_report import AutoPrintReport  # NOQA\n\n\ndef run_train(model, train, valid=None,\n              batch_size=16, epoch=10,\n              optimizer=None,\n              out='result',\n              extensions_list=None,\n              device=-1,\n              converter=convert.concat_examples,\n              use_default_extensions=True,\n              resume_path=None):\n    \"\"\"Util function to train chainer's model with StandardUpdater.\n\n    Typical Regression/Classification tasks suffices to use this method to\n    train chainer model.\n\n    Args:\n        model (chainer.Chain): model to train\n        train (dataset or Iterator): training dataset or train iterator\n        valid (dataset or Iterator): validation dataset or valid iterator\n        batch_size (int): batch size for training\n        epoch (int): epoch for training\n        optimizer (Optimizer):\n        out (str): path for `trainer`'s out directory\n        extensions_list (None or list): list of extensions to add to `trainer`\n        device (Device): chainer Device\n        converter (callable):\n        use_default_extensions (bool): If `True`, default extensions are added\n            to `trainer`.\n        resume_path (None or str): If specified, `trainer` is resumed with this\n            serialized file.\n    \"\"\"\n    if optimizer is None:\n        # Use Adam optimizer as default\n        optimizer = optimizers.Adam()\n    elif not isinstance(optimizer, Optimizer):\n        raise ValueError(\"[ERROR] optimizer must be instance of Optimizer, \"\n                         \"but passed {}\".format(type(Optimizer)))\n\n    optimizer.setup(model)\n\n    if 
isinstance(train, Iterator):\n        train_iter = train\n    else:\n        # Assume `train` as training dataset, Use SerialIterator as default.\n        train_iter = SerialIterator(train, batch_size=batch_size)\n\n    updater = training.StandardUpdater(\n        train_iter, optimizer, device=device, converter=converter)\n    trainer = training.Trainer(updater, (epoch, 'epoch'), out=out)\n    if use_default_extensions:\n        if valid is not None:\n            if isinstance(valid, Iterator):\n                valid_iter = valid\n            else:\n                # Assume `valid` as validation dataset,\n                # Use SerialIterator as default.\n                valid_iter = SerialIterator(valid, batch_size=batch_size,\n                                            shuffle=False, repeat=False)\n            trainer.extend(extensions.Evaluator(\n                valid_iter, model, device=device, converter=converter))\n\n        trainer.extend(extensions.LogReport())\n        trainer.extend(AutoPrintReport())\n        trainer.extend(extensions.ProgressBar(update_interval=10))\n        # TODO(nakago): consider to include snapshot as default extension.\n        # trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))\n\n    if extensions_list is not None:\n        for e in extensions_list:\n            trainer.extend(e)\n\n    if resume_path:\n        chainer.serializers.load_npz(resume_path, trainer)\n    trainer.run()\n\n    return\n\n\ndef run_node_classification_train(model, data,\n                                  train_mask, valid_mask,\n                                  epoch=10,\n                                  optimizer=None,\n                                  out='result',\n                                  extensions_list=None,\n                                  device=-1,\n                                  converter=None,\n                                  use_default_extensions=True,\n                                  
resume_path=None):\n    if optimizer is None:\n        # Use Adam optimizer as default\n        optimizer = optimizers.Adam()\n    elif not isinstance(optimizer, Optimizer):\n        raise ValueError(\"[ERROR] optimizer must be instance of Optimizer, \"\n                         \"but passed {}\".format(type(Optimizer)))\n\n    optimizer.setup(model)\n\n    def one_batch_converter(batch, device):\n        if not isinstance(device, Device):\n            device = chainer.get_device(device)\n        data, train_mask, valid_mask = batch[0]\n        return (data.to_device(device),\n                device.send(train_mask), device.send(valid_mask))\n\n    data_iter = SerialIterator([(data, train_mask, valid_mask)], batch_size=1)\n    updater = training.StandardUpdater(\n        data_iter, optimizer, device=device,\n        converter=one_batch_converter)\n    trainer = training.Trainer(updater, (epoch, 'epoch'), out=out)\n    if use_default_extensions:\n        trainer.extend(extensions.LogReport())\n        trainer.extend(AutoPrintReport())\n        trainer.extend(extensions.ProgressBar(update_interval=10))\n        # TODO(nakago): consider to include snapshot as default extension.\n        # trainer.extend(extensions.snapshot(), trigger=(frequency, 'epoch'))\n\n    if extensions_list is not None:\n        for e in extensions_list:\n            trainer.extend(e)\n\n    if resume_path:\n        chainer.serializers.load_npz(resume_path, trainer)\n    trainer.run()\n\n    return\n"
  },
  {
    "path": "docker/conda/python36/Dockerfile",
    "content": "FROM nvidia/cuda:10.1-cudnn7-devel\nRUN apt-get update -y && \\\n    apt-get install -y --no-install-recommends \\\n    git \\\n    wget \\\n    bzip2 \\\n    ca-certificates \\\n    curl \\\n    cmake \\\n    libblas3 \\\n    libblas-dev \\\n    libxext6 \\\n    libgl1-mesa-glx \\\n    libxrender-dev \\\n    && \\\n    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*\n\nENV LANG=C.UTF-8 LC_ALL=C.UTF-8\n\nENV PATH /opt/conda/bin:$PATH\nRUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \\\n    /bin/bash ~/miniconda.sh -b -p /opt/conda && \\\n    rm ~/miniconda.sh && \\\n    /opt/conda/bin/conda clean -tipsy && \\\n    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \\\n    echo \". /opt/conda/etc/profile.d/conda.sh\" >> ~/.bashrc && \\\n    echo \"conda activate base\" >> ~/.bashrc\nRUN conda update -n base -c defaults conda\nRUN conda create -n py36 python=3.6 conda && \\\n    . /opt/conda/etc/profile.d/conda.sh && \\\n    conda init bash && \\\n    conda activate py36 && \\\n    conda install -c rdkit rdkit && \\\n    pip install pytest mock\n\nADD conda-entrypoint.sh /conda-entrypoint.sh\nENTRYPOINT [ \"/conda-entrypoint.sh\" ]\n"
  },
  {
    "path": "docker/conda/python36/conda-entrypoint.sh",
    "content": "#!/bin/bash\n. /opt/conda/etc/profile.d/conda.sh\nconda activate py36\nexec \"$@\"\n"
  },
  {
    "path": "docker/conda/python37/Dockerfile",
    "content": "FROM nvidia/cuda:10.1-cudnn7-devel\nRUN apt-get update -y && \\\n    apt-get install -y --no-install-recommends \\\n    git \\\n    wget \\\n    bzip2 \\\n    ca-certificates \\\n    curl \\\n    cmake \\\n    libblas3 \\\n    libblas-dev \\\n    libxext6 \\\n    libgl1-mesa-glx \\\n    libxrender-dev \\\n    && \\\n    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*\n\nENV LANG=C.UTF-8 LC_ALL=C.UTF-8\n\nENV PATH /opt/conda/bin:$PATH\nRUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \\\n    /bin/bash ~/miniconda.sh -b -p /opt/conda && \\\n    rm ~/miniconda.sh && \\\n    /opt/conda/bin/conda clean -tipsy && \\\n    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \\\n    echo \". /opt/conda/etc/profile.d/conda.sh\" >> ~/.bashrc && \\\n    echo \"conda activate base\" >> ~/.bashrc\nRUN conda update -n base -c defaults conda\nRUN conda create -n py37 python=3.7 conda && \\\n    . /opt/conda/etc/profile.d/conda.sh && \\\n    conda init bash && \\\n    conda activate py37 && \\\n    conda install -c rdkit rdkit && \\\n    pip install pytest mock\n\nADD conda-entrypoint.sh /conda-entrypoint.sh\nENTRYPOINT [ \"/conda-entrypoint.sh\" ]\n"
  },
  {
    "path": "docker/conda/python37/conda-entrypoint.sh",
    "content": "#!/bin/bash\n. /opt/conda/etc/profile.d/conda.sh\nconda activate py37\nexec \"$@\"\n"
  },
  {
    "path": "docker/conda/python37-chainerx-cpu-base/Dockerfile",
    "content": "FROM nvidia/cuda:10.1-cudnn7-devel\nRUN apt-get update -y && \\\n    apt-get install -y --no-install-recommends \\\n    git \\\n    wget \\\n    bzip2 \\\n    ca-certificates \\\n    curl \\\n    cmake \\\n    libblas3 \\\n    libblas-dev \\\n    libxext6 \\\n    libgl1-mesa-glx \\\n    libxrender-dev \\\n    && \\\n    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*\n\nENV LANG=C.UTF-8 LC_ALL=C.UTF-8\n\nENV PATH /opt/conda/bin:$PATH\nRUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \\\n    /bin/bash ~/miniconda.sh -b -p /opt/conda && \\\n    rm ~/miniconda.sh && \\\n    /opt/conda/bin/conda clean -tipsy && \\\n    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \\\n    echo \". /opt/conda/etc/profile.d/conda.sh\" >> ~/.bashrc && \\\n    echo \"conda activate base\" >> ~/.bashrc\nRUN conda update -n base -c defaults conda\nENV MAKEFLAGS -j4\nRUN conda create -n py37 python=3.7 conda && \\\n    . /opt/conda/etc/profile.d/conda.sh && \\\n    conda init bash && \\\n    conda activate py37 && \\\n    CHAINER_BUILD_CHAINERX=1 pip install -vvvv --no-cache-dir chainer==6.0.0 && \\\n    conda install -c rdkit rdkit==2019.03.4.0\n\nADD conda-entrypoint.sh /conda-entrypoint.sh\nENTRYPOINT [ \"/conda-entrypoint.sh\" ]\n"
  },
  {
    "path": "docker/conda/python37-chainerx-cpu-base/conda-entrypoint.sh",
    "content": "#!/bin/bash\n. /opt/conda/etc/profile.d/conda.sh\nconda activate py37\nexec \"$@\"\n"
  },
  {
    "path": "docker/conda/python37-chainerx-cpu-latest/Dockerfile",
    "content": "FROM nvidia/cuda:10.1-cudnn7-devel\nRUN apt-get update -y && \\\n    apt-get install -y --no-install-recommends \\\n    git \\\n    wget \\\n    bzip2 \\\n    ca-certificates \\\n    curl \\\n    cmake \\\n    libblas3 \\\n    libblas-dev \\\n    libxext6 \\\n    libgl1-mesa-glx \\\n    libxrender-dev \\\n    && \\\n    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*\n\nENV LANG=C.UTF-8 LC_ALL=C.UTF-8\n\nENV PATH /opt/conda/bin:$PATH\nRUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \\\n    /bin/bash ~/miniconda.sh -b -p /opt/conda && \\\n    rm ~/miniconda.sh && \\\n    /opt/conda/bin/conda clean -tipsy && \\\n    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \\\n    echo \". /opt/conda/etc/profile.d/conda.sh\" >> ~/.bashrc && \\\n    echo \"conda activate base\" >> ~/.bashrc\nRUN conda update -n base -c defaults conda\nENV MAKEFLAGS -j4\nRUN conda create -n py37 python=3.7 conda && \\\n    . /opt/conda/etc/profile.d/conda.sh && \\\n    conda init bash && \\\n    conda activate py37 && \\\n    CHAINER_BUILD_CHAINERX=1 pip install -vvvv --no-cache-dir chainer==7.0.0b2 && \\\n    conda install -c rdkit rdkit==2019.03.4.0\n\nADD conda-entrypoint.sh /conda-entrypoint.sh\nENTRYPOINT [ \"/conda-entrypoint.sh\" ]\n"
  },
  {
    "path": "docker/conda/python37-chainerx-cpu-latest/conda-entrypoint.sh",
    "content": "#!/bin/bash\n. /opt/conda/etc/profile.d/conda.sh\nconda activate py37\nexec \"$@\"\n"
  },
  {
    "path": "docker/conda/python37-chainerx-cpu-stable/Dockerfile",
    "content": "FROM nvidia/cuda:10.1-cudnn7-devel\nRUN apt-get update -y && \\\n    apt-get install -y --no-install-recommends \\\n    git \\\n    wget \\\n    bzip2 \\\n    ca-certificates \\\n    curl \\\n    cmake \\\n    libblas3 \\\n    libblas-dev \\\n    libxext6 \\\n    libgl1-mesa-glx \\\n    libxrender-dev \\\n    && \\\n    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*\n\nENV LANG=C.UTF-8 LC_ALL=C.UTF-8\n\nENV PATH /opt/conda/bin:$PATH\nRUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \\\n    /bin/bash ~/miniconda.sh -b -p /opt/conda && \\\n    rm ~/miniconda.sh && \\\n    /opt/conda/bin/conda clean -tipsy && \\\n    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \\\n    echo \". /opt/conda/etc/profile.d/conda.sh\" >> ~/.bashrc && \\\n    echo \"conda activate base\" >> ~/.bashrc\nRUN conda update -n base -c defaults conda\nENV MAKEFLAGS -j4\nRUN conda create -n py37 python=3.7 conda && \\\n    . /opt/conda/etc/profile.d/conda.sh && \\\n    conda init bash && \\\n    conda activate py37 && \\\n    CHAINER_BUILD_CHAINERX=1 pip install -vvvv --no-cache-dir chainer==6.2.0 && \\\n    conda install -c rdkit rdkit==2019.03.4.0\n\nADD conda-entrypoint.sh /conda-entrypoint.sh\nENTRYPOINT [ \"/conda-entrypoint.sh\" ]\n"
  },
  {
    "path": "docker/conda/python37-chainerx-cpu-stable/conda-entrypoint.sh",
    "content": "#!/bin/bash\n. /opt/conda/etc/profile.d/conda.sh\nconda activate py37\nexec \"$@\"\n"
  },
  {
    "path": "docker/conda/python37-chainerx-gpu-base/Dockerfile",
    "content": "FROM nvidia/cuda:10.1-cudnn7-devel\nRUN apt-get update -y && \\\n    apt-get install -y --no-install-recommends \\\n    git \\\n    wget \\\n    bzip2 \\\n    ca-certificates \\\n    curl \\\n    cmake \\\n    libblas3 \\\n    libblas-dev \\\n    libxext6 \\\n    libgl1-mesa-glx \\\n    libxrender-dev \\\n    && \\\n    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*\n\nENV LANG=C.UTF-8 LC_ALL=C.UTF-8\n\nENV PATH /opt/conda/bin:$PATH\nRUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \\\n    /bin/bash ~/miniconda.sh -b -p /opt/conda && \\\n    rm ~/miniconda.sh && \\\n    /opt/conda/bin/conda clean -tipsy && \\\n    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \\\n    echo \". /opt/conda/etc/profile.d/conda.sh\" >> ~/.bashrc && \\\n    echo \"conda activate base\" >> ~/.bashrc\nRUN conda update -n base -c defaults conda\nENV MAKEFLAGS -j4\nRUN conda create -n py37 python=3.7 conda && \\\n    . /opt/conda/etc/profile.d/conda.sh && \\\n    conda init bash && \\\n    conda activate py37 && \\\n    CHAINER_BUILD_CHAINERX=1 CHAINERX_BUILD_CUDA=1 pip install -vvvv --no-cache-dir cupy-cuda101==6.0.0 chainer==6.0.0 && \\\n    conda install -c rdkit rdkit==2019.03.4.0\n\nADD conda-entrypoint.sh /conda-entrypoint.sh\nENTRYPOINT [ \"/conda-entrypoint.sh\" ]\n"
  },
  {
    "path": "docker/conda/python37-chainerx-gpu-base/conda-entrypoint.sh",
    "content": "#!/bin/bash\n. /opt/conda/etc/profile.d/conda.sh\nconda activate py37\nexec \"$@\"\n"
  },
  {
    "path": "docker/conda/python37-chainerx-gpu-latest/Dockerfile",
    "content": "FROM nvidia/cuda:10.1-cudnn7-devel\nRUN apt-get update -y && \\\n    apt-get install -y --no-install-recommends \\\n    git \\\n    wget \\\n    bzip2 \\\n    ca-certificates \\\n    curl \\\n    cmake \\\n    libblas3 \\\n    libblas-dev \\\n    libxext6 \\\n    libgl1-mesa-glx \\\n    libxrender-dev \\\n    && \\\n    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*\n\nENV LANG=C.UTF-8 LC_ALL=C.UTF-8\n\nENV PATH /opt/conda/bin:$PATH\nRUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \\\n    /bin/bash ~/miniconda.sh -b -p /opt/conda && \\\n    rm ~/miniconda.sh && \\\n    /opt/conda/bin/conda clean -tipsy && \\\n    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \\\n    echo \". /opt/conda/etc/profile.d/conda.sh\" >> ~/.bashrc && \\\n    echo \"conda activate base\" >> ~/.bashrc\nRUN conda update -n base -c defaults conda\nENV MAKEFLAGS -j4\nRUN conda create -n py37 python=3.7 conda && \\\n    . /opt/conda/etc/profile.d/conda.sh && \\\n    conda init bash && \\\n    conda activate py37 && \\\n    CHAINER_BUILD_CHAINERX=1 CHAINERX_BUILD_CUDA=1 pip install -vvvv --no-cache-dir cupy-cuda101==7.0.0b2 chainer==7.0.0b2 && \\\n    conda install -c rdkit rdkit==2019.03.4.0\n\nADD conda-entrypoint.sh /conda-entrypoint.sh\nENTRYPOINT [ \"/conda-entrypoint.sh\" ]\n"
  },
  {
    "path": "docker/conda/python37-chainerx-gpu-latest/conda-entrypoint.sh",
    "content": "#!/bin/bash\n. /opt/conda/etc/profile.d/conda.sh\nconda activate py37\nexec \"$@\"\n"
  },
  {
    "path": "docker/conda/python37-chainerx-gpu-stable/Dockerfile",
    "content": "FROM nvidia/cuda:10.1-cudnn7-devel\nRUN apt-get update -y && \\\n    apt-get install -y --no-install-recommends \\\n    git \\\n    wget \\\n    bzip2 \\\n    ca-certificates \\\n    curl \\\n    cmake \\\n    libblas3 \\\n    libblas-dev \\\n    libxext6 \\\n    libgl1-mesa-glx \\\n    libxrender-dev \\\n    && \\\n    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*\n\nENV LANG=C.UTF-8 LC_ALL=C.UTF-8\n\nENV PATH /opt/conda/bin:$PATH\nRUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \\\n    /bin/bash ~/miniconda.sh -b -p /opt/conda && \\\n    rm ~/miniconda.sh && \\\n    /opt/conda/bin/conda clean -tipsy && \\\n    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \\\n    echo \". /opt/conda/etc/profile.d/conda.sh\" >> ~/.bashrc && \\\n    echo \"conda activate base\" >> ~/.bashrc\nRUN conda update -n base -c defaults conda\nENV MAKEFLAGS -j4\nRUN conda create -n py37 python=3.7 conda && \\\n    . /opt/conda/etc/profile.d/conda.sh && \\\n    conda init bash && \\\n    conda activate py37 && \\\n    CHAINER_BUILD_CHAINERX=1 CHAINERX_BUILD_CUDA=1 pip install -vvvv --no-cache-dir cupy-cuda101==6.2.0 chainer==6.2.0 && \\\n    conda install -c rdkit rdkit==2019.03.4.0\n\nADD conda-entrypoint.sh /conda-entrypoint.sh\nENTRYPOINT [ \"/conda-entrypoint.sh\" ]\n"
  },
  {
    "path": "docker/conda/python37-chainerx-gpu-stable/conda-entrypoint.sh",
    "content": "#!/bin/bash\n. /opt/conda/etc/profile.d/conda.sh\nconda activate py37\nexec \"$@\"\n"
  },
  {
    "path": "docker/python3/Dockerfile",
    "content": "FROM chainer/chainer:v6.1.0-python3\n\nRUN apt-get update -y && \\\n    apt-get install -y --no-install-recommends \\\n    curl ca-certificates \\\n    libboost-dev \\\n    libboost-python-dev \\\n    libboost-serialization-dev \\\n    libboost-iostreams-dev \\\n    libboost-thread-dev \\\n    libboost-system-dev \\\n    libeigen3-dev && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n\n# build & install rdkit\nARG RDKIT_VERSION=Release_2017_09_3\nRUN curl -sLo ${RDKIT_VERSION}.tar.gz https://github.com/rdkit/rdkit/archive/${RDKIT_VERSION}.tar.gz && \\\n    tar xf ${RDKIT_VERSION}.tar.gz && \\\n    mkdir -p rdkit-${RDKIT_VERSION}/build && \\\n    base_dir=$(pwd) && \\\n    cd rdkit-${RDKIT_VERSION}/build && \\\n    cmake \\\n    -D RDK_BUILD_SWIG_SUPPORT=OFF \\\n    -D RDK_BUILD_PYTHON_WRAPPERS=ON \\\n    -D RDK_BUILD_COMPRESSED_SUPPLIERS=ON \\\n    -D RDK_BUILD_INCHI_SUPPORT=ON \\\n    -D RDK_BUILD_AVALON_SUPPORT=ON \\\n    -D RDK_BUILD_CPP_TESTS=OFF \\\n    -D RDK_INSTALL_INTREE=OFF \\\n    -D RDK_INSTALL_STATIC_LIBS=OFF \\\n    -D PYTHON_EXECUTABLE=/usr/bin/python3.5 \\\n    -D PYTHON_NUMPY_INCLUDE_PATH=/usr/local/lib/python3.5/dist-packages/numpy/core/include \\\n    -D PYTHON_INSTDIR=/usr/local/lib/python3.5/dist-packages \\\n    -D Python_ADDITIONAL_VERSIONS=3.5 \\\n    -D CMAKE_BUILD_TYPE=Release \\\n    -D CMAKE_INSTALL_PREFIX=/usr/local \\\n    .. && \\\n    make -j $(nproc) && \\\n    make install && \\\n    cd \"$base_dir\" && \\\n    rm -rf rdkit-${RDKIT_VERSION} ${RDKIT_VERSION}.tar.gz && \\\n    ldconfig\n\n# install chainer-chemistry\n# matplotlib >= 3.1 requires upgrade of pip\n# pandas >= 0.25 doesn't support python3.5.2 which is installed for ubuntu16.04\nRUN pip3 install --no-cache-dir matplotlib==3.0 pandas==0.24 chainer-chemistry\n\n"
  },
  {
    "path": "docs/Makefile",
    "content": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line.\nSPHINXOPTS    =\nSPHINXBUILD   = sphinx-build\nSPHINXPROJ    = Chainer-Chemistry\nSOURCEDIR     = source\nBUILDDIR      = build\n\n# Put it first so that \"make\" without argument is like \"make help\".\nhelp:\n\t@$(SPHINXBUILD) -M help \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n\n.PHONY: help Makefile\n\n# Catch-all target: route all unknown targets to Sphinx using the new\n# \"make mode\" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).\n%: Makefile\n\t@$(SPHINXBUILD) -M $@ \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)"
  },
  {
    "path": "docs/source/_autosummary_check.py",
    "content": "import inspect\nimport os\nimport types\n\nimport chainer_chemistry.functions\nimport chainer_chemistry.links\nimport chainer_chemistry.models\n\n\ndef _is_rst_exists(entity):\n    return os.path.exists('source/generated/{}.rst'.format(entity))\n\n\ndef check(app, exception):\n    missing_entities = []\n\n    missing_entities += [\n        name for name in _list_chainer_functions()\n        if not _is_rst_exists(name)]\n\n    missing_entities += [\n        name for name in _list_chainer_links()\n        if not _is_rst_exists(name)]\n\n    missing_entities += [\n        name for name in _list_chainer_models()\n        if not _is_rst_exists(name)]\n\n    if len(missing_entities) != 0:\n        app.warn('\\n'.join([\n            'Undocumented entities found.',\n            '',\n        ] + missing_entities))\n\n\ndef _list_chainer_functions():\n    # List exported functions under chainer.functions.\n    return ['chainer_chemistry.functions.{}'.format(name)\n            for (name, func) in chainer_chemistry.functions.__dict__.items()\n            if isinstance(func, types.FunctionType)]\n\n\ndef _list_chainer_links():\n    # List exported classes under chainer.links.\n    return ['chainer_chemistry.links.{}'.format(name)\n            for (name, link) in chainer_chemistry.links.__dict__.items()\n            if inspect.isclass(link)]\n\n\ndef _list_chainer_models():\n    # List exported classes under chainer.links.\n    return ['chainer_chemistry.models.{}'.format(name)\n            for (name, model) in chainer_chemistry.models.__dict__.items()\n            if inspect.isclass(model)]\n"
  },
  {
    "path": "docs/source/conf.py",
    "content": "#!/usr/bin/env python3\n# -*- coding: utf-8 -*-\n#\n# This file is execfile()d with the current directory set to its\n# containing dir.\n#\n# Note that not all possible configuration values are present in this\n# autogenerated file.\n#\n# All configuration values have a default; values that are commented out\n# serve to show the default.\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n#\nimport os\nimport pkg_resources\nimport sys\n\nsys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))\nimport sphinx_rtd_theme\nimport _autosummary_check\n\n\n__version__ = pkg_resources.get_distribution('chainer-chemistry').version\n\n# -- General configuration ------------------------------------------------\n\n# If your documentation needs a minimal Sphinx version, state it here.\n#\n# needs_sphinx = '1.0'\n\n# Add any Sphinx extension module names here, as strings. 
They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = ['sphinx.ext.autodoc',\n              'sphinx.ext.doctest',\n              'sphinx.ext.intersphinx',\n              'sphinx.ext.todo',\n              'sphinx.ext.coverage',\n              'sphinx.ext.mathjax',\n              'sphinx.ext.ifconfig',\n              'sphinx.ext.viewcode',\n              'sphinx.ext.autosummary',\n              'sphinx.ext.napoleon']\n\nautosummary_generate = True\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = ['_templates']\n\n# The suffix(es) of source filenames.\n# You can specify multiple suffix as a list of string:\n#\n# source_suffix = ['.rst', '.md']\nsource_suffix = '.rst'\n\n# The master toctree document.\nmaster_doc = 'index'\n\n# General information about the project.\nproject = 'Chainer Chemistry'\ncopyright = '2017, Preferred Networks, Inc.'\nauthor = 'Preferred Networks, Inc.'\n\n# The version info for the project you're documenting, acts as replacement for\n# |version| and |release|, also used in various other places throughout the\n# built documents.\n#\n# The short X.Y version.\nversion = __version__\n# The full version, including alpha/beta/rc tags.\nrelease = __version__\n\n# The language for content autogenerated by Sphinx. 
Refer to documentation\n# for a list of supported languages.\n#\n# This is also used if you do content translation via gettext catalogs.\n# Usually you set \"language\" from the command line for these cases.\nlanguage = None\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\n# This patterns also effect to html_static_path and html_extra_path\nexclude_patterns = []\n\n# The name of the Pygments (syntax highlighting) style to use.\npygments_style = 'sphinx'\n\n# If true, `todo` and `todoList` produce output, else they produce nothing.\ntodo_include_todos = True\n\n\n# -- Options for HTML output ----------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  See the documentation for\n# a list of builtin themes.\n#\n\nhtml_theme = \"sphinx_rtd_theme\"\n\nhtml_theme_path = [sphinx_rtd_theme.get_html_theme_path()]\n\n# Theme options are theme-specific and customize the look and feel of a theme\n# further.  For a list of options available for each theme, see the\n# documentation.\n#\n# html_theme_options = {}\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. 
They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\nhtml_static_path = ['_static']\n\n# Custom sidebar templates, must be a dictionary that maps document names\n# to template names.\n#\n# This is required for the alabaster theme\n# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars\nhtml_sidebars = {\n    '**': [\n        'relations.html',  # needs 'show_related': True theme option to display\n        'searchbox.html',\n    ]\n}\n\n\n# -- Options for HTMLHelp output ------------------------------------------\n\n# Output file base name for HTML help builder.\nhtmlhelp_basename = 'Chainer-Chemistrydoc'\n\n\n# -- Options for LaTeX output ---------------------------------------------\n\nlatex_elements = {\n    # The paper size ('letterpaper' or 'a4paper').\n    #\n    # 'papersize': 'letterpaper',\n\n    # The font size ('10pt', '11pt' or '12pt').\n    #\n    # 'pointsize': '10pt',\n\n    # Additional stuff for the LaTeX preamble.\n    #\n    # 'preamble': '',\n\n    # Latex figure (float) alignment\n    #\n    # 'figure_align': 'htbp',\n}\n\n# Grouping the document tree into LaTeX files. List of tuples\n# (source start file, target name, title,\n#  author, documentclass [howto, manual, or own class]).\nlatex_documents = [\n    (master_doc, 'Chainer-Chemistry.tex', 'Chainer Chemistry Documentation',\n     'Preferred Networks, Inc.', 'manual'),\n]\n\n\n# -- Options for manual page output ---------------------------------------\n\n# One entry per manual page. List of tuples\n# (source start file, name, description, authors, manual section).\nman_pages = [\n    (master_doc, 'chainer-chemistry', 'Chainer Chemistry Documentation',\n     [author], 1)\n]\n\n\n# -- Options for Texinfo output -------------------------------------------\n\n# Grouping the document tree into Texinfo files. 
List of tuples\n# (source start file, target name, title, author,\n#  dir menu entry, description, category)\ntexinfo_documents = [\n    (master_doc, 'Chainer Chemistry', 'Chainer Chemistry Documentation',\n     author, 'Chainer Chemistry', 'One line description of project.',\n     'Miscellaneous'),\n]\n\n\n# Example configuration for intersphinx: refer to the Python standard library.\nintersphinx_mapping = {'https://docs.python.org/': None}\n\ndef setup(app):\n    app.connect('build-finished', _build_finished)\n\ndef _build_finished(app, exception):\n    if exception is None:\n        _autosummary_check.check(app, exception)\n"
  },
  {
    "path": "docs/source/contribution.rst",
    "content": "==================\nContribution guide\n==================\n\nWe welcome any type of contribution that helps to improve and promote Chainer Chemistry.\nTypical contribution includes:\n\n* Send pull requests (PRs) to the `repository <https://github.com/pfnet-research/chainer-chemistry>`_ (We recommend developers making PRs to read the :ref:`development-policy` before starting to implement).\n* Report bugs or problems as `issues <https://github.com/pfnet-research/chainer-chemistry/issues>`_.\n* Send questions to developer community sites like `Stackoverflow <https://stackoverflow.com>`_ or Chainer Slack (`en <https://docs.google.com/forms/d/1yYoDBNDm1s21sDcLZZIODU34XHDSEhXU_rRl8QoupZc/viewform>`_, `jp <https://docs.google.com/forms/d/e/1FAIpQLSfqL9XjnqZUIwLOz4K9Oxm8-Ce246IRP51-vZa7HOrofJT9rA/viewform>`_).\n* Write a blog post about Chainer Chemistry or its use case."
  },
  {
    "path": "docs/source/dataset.rst",
    "content": "=======\nDataset\n=======\n\n\nConverters\n==========\n\n.. autosummary::\n   :toctree: generated/\n   :nosignatures:\n\n   chainer_chemistry.dataset.converters.concat_mols\n\n\nIndexers\n========\n\n.. autosummary::\n   :toctree: generated/\n   :nosignatures:\n\n   chainer_chemistry.dataset.indexer.BaseIndexer\n   chainer_chemistry.dataset.indexer.BaseFeatureIndexer\n   chainer_chemistry.dataset.indexers.NumpyTupleDatasetFeatureIndexer\n\n\nParsers\n=======\n\n.. autosummary::\n   :toctree: generated/\n   :nosignatures:\n\n   chainer_chemistry.dataset.parsers.BaseParser\n   chainer_chemistry.dataset.parsers.CSVFileParser\n   chainer_chemistry.dataset.parsers.SDFFileParser\n   chainer_chemistry.dataset.parsers.DataFrameParser\n   chainer_chemistry.dataset.parsers.SmilesParser\n\n\nPreprocessors\n=============\n\nBase preprocessors\n------------------\n\n\n.. autosummary::\n   :toctree: generated/\n   :nosignatures:\n\n   chainer_chemistry.dataset.preprocessors.BasePreprocessor\n   chainer_chemistry.dataset.preprocessors.MolPreprocessor\n\nConcrete preprocessors\n----------------------\n\n.. autosummary::\n   :toctree: generated/\n   :nosignatures:\n\n\n   chainer_chemistry.dataset.preprocessors.AtomicNumberPreprocessor\n   chainer_chemistry.dataset.preprocessors.ECFPPreprocessor\n   chainer_chemistry.dataset.preprocessors.GGNNPreprocessor\n   chainer_chemistry.dataset.preprocessors.NFPPreprocessor\n   chainer_chemistry.dataset.preprocessors.SchNetPreprocessor\n   chainer_chemistry.dataset.preprocessors.WeaveNetPreprocessor\n   chainer_chemistry.dataset.preprocessors.RelGATPreprocessor\n   chainer_chemistry.dataset.preprocessors.RelGCNPreprocessor\n   chainer_chemistry.dataset.preprocessors.RSGCNPreprocessor\n\nUtilities\n---------\n\n.. 
autosummary::\n   :toctree: generated/\n   :nosignatures:\n\n   chainer_chemistry.dataset.preprocessors.MolFeatureExtractionError\n   chainer_chemistry.dataset.preprocessors.type_check_num_atoms\n   chainer_chemistry.dataset.preprocessors.construct_atomic_number_array\n   chainer_chemistry.dataset.preprocessors.construct_adj_matrix\n\n\n\nSplitters\n==========\n\n.. autosummary::\n   :toctree: generated/\n   :nosignatures:\n\n   chainer_chemistry.dataset.splitters.RandomSplitter\n   chainer_chemistry.dataset.splitters.StratifiedSplitter\n   chainer_chemistry.dataset.splitters.ScaffoldSplitter\n"
  },
  {
    "path": "docs/source/datasets.rst",
    "content": "========\nDatasets\n========\n\nDataset implementations\n=======================\n\n.. autosummary::\n\t:toctree: generated/\n        :nosignatures:\n\n\tchainer_chemistry.datasets.NumpyTupleDataset\n\n\nDataset loaders\n===============\n\n.. autosummary::\n\t:toctree: generated/\n        :nosignatures:\n\n\n\tchainer_chemistry.datasets.tox21.get_tox21\n\tchainer_chemistry.datasets.qm9.get_qm9\n\tchainer_chemistry.datasets.molnet.get_molnet_dataset\n\tchainer_chemistry.datasets.molnet.get_molnet_dataframe\n"
  },
  {
    "path": "docs/source/development.rst",
    "content": ".. _development-policy:\n\n==================\nDevelopment policy\n==================\n\nIn this section, we describe the development policy that the core developers follow.\nDevelopers who are thinking of sending PRs to the repository are encouraged to read the following sections\nbefore starting implementation.\n\n\nVersioning policy\n=================\n\nBasically, we follow the `semantic versioning v2.0.0 <https://semver.org/spec/v2.0.0.html>`_.\nIn Chainer Chemistry, *public APIs* in the sense of semantic versioning are ones in `the document <http://chainer-chemistry.readthedocs.io/en/latest/index.html>`_.\n\nWe follow these rules about versioning during the major version zero in addition to ones described in the semantic versioning:\n\n* We do not plan any scheduled releases.\n* We do not plan any pre-releases.\n* We release the minor version when the core development team agrees. Typically, we do so when (1) a sufficient number of features are added since the last minor release (2) the latest release cannot run the example code in the master branch of the repository (3) critical bugs are found. But we are not restricted to them.\n* If we find critical bugs, we should release a patch version or a minor version that fixes them. 
The core development team will determine which version to release.\n\nWe do not have a concrete plan about versioning strategy after v1.0.0.\n\n\nCompatibility policy\n====================\n\nAs an immediate consequence of the semantic versioning, we may break compatibility of public APIs including addition, deletion, and changes in their semantics anytime in the major version zero.\nSince APIs of Chainer Chemistry are still immature and unstable, we expect introduction of new features can sometimes involve a compatibility break.\nIf we are faced with a dilemma between cost for backward compatibility and benefit of new features, we are likely to give up the former because we want to place importance on introducing new features as soon as possible. Of course, we care about backward compatibility whenever it is easy and low-cost.\n\nLike `ChainerCV <https://github.com/chainer/chainercv>`_, Chainer Chemistry provides several off-the-shelf deep learning models (e.g. Neural Finger Print) whose papers are available on arXiv or at conferences related to machine learning.\nAlthough most of the published papers report evaluation results of the models with publicly available datasets, we do *NOT* guarantee the reproducibility of experiments in the papers.\n\nAt some point, coding examples in the master branch of the official repository may not work even with the latest release. 
In that case, users are recommended to either use the example code of the latest release or update the library code to the master branch.\n\nAs of v0.3.0, we have introduced `BaseForwardModel`, which provides methods for serializing itself to and loading from a file.\nAs these methods internally use `pickle <https://docs.python.org/3/library/pickle.html>`_, portability of the class depends on that of pickling.\nEspecially, serialized instances of `BaseForwardModel` made with older Chainer Chemistry may not be loaded with a newer one, partly because we may change their internal structures for refactoring, performance improvement, and so on.\nSee the document of `BaseForwardModel` and its subclasses (e.g. `Classifier`, `Regressor`).\n\nBranch strategy\n===============\n\nThe official repository of Chainer Chemistry is https://github.com/pfnet-research/chainer-chemistry. \nWe use the *master* branch of the repository for development. Therefore, developers who make PRs should send them to the master branch.\n\nDuring major version zero, we do not maintain any released versions.\nWhen a bug is found, changes for the bug should be merged to the next version (either minor or patch). If the bug is critical, we will release the next version as soon as possible.\n\n\nCoding guideline\n================\n\nWe basically adopt `PEP8 <https://www.python.org/dev/peps/pep-0008/>`_ as a style guide.\nYou can check it with `flake8`, which we can install by::\n\n   $ pip install flake8\n\nand run with ``flake8`` command.\n\nIn addition to PEP8, we use upper camel case (e.g. ``FooBar``) for class names and snake case (e.g. 
``foo_bar``) for function, method, variable and package names.\nAlthough we recommend developers to follow these rules as well, they are not mandatory.\n\nFor documents, we follow the `Google Python Style Guide <http://google.github.io/styleguide/pyguide.html#Comments>`_\nand compile it with `Napoleon <http://sphinxcontrib-napoleon.readthedocs.io/en/latest/index.html>`_,\nwhich is an extension of `Sphinx <http://www.sphinx-doc.org/en/stable/>`_.\n\n\nTesting guideline\n=================\n\nChainer Chemistry uses `pytest <https://docs.pytest.org/en/latest/index.html>`_  as a unit-test framework.\nAll unit tests are located in the ``tests/`` directory. We can run tests with normal usage of pytest.\nFor example, the following command runs all unit tests::\n\n   $ pytest tests\n\nSome unit tests require GPUs, which are annotated with ``@pytest.mark.gpu``.\nTherefore, you can skip them with ``-m`` option::\n\n   $ pytest -m \"not gpu\" tests\n\nIf you write a unit test that uses GPUs, you must annotate it with ``@pytest.mark.gpu``.\n\nSimilarly, some unit tests take a long time to complete.\nWe annotated them with ``@pytest.mark.slow`` and can skip them with ``-m`` option::\n\n   $ pytest -m \"not slow\" tests\n\nAny unit test that uses GPUs must be annotated with ``@pytest.mark.slow``.\n\nWe can skip both GPU and slow tests with the following command::\n\n   $ pytest -m \"not (gpu or slow)\" tests\n\n\nTerminology\n===========\n\nIn the context of machine learning, especially chemoinformatics, we use several terms such as feature, feature vectors, descriptor and so on\nto indicate representation of inputs. 
To avoid ambiguity and align naming convention within the library code, we use these terms in the following way:\n\n* *Feature* is a representation of a sample of interest (typically molecules in Chainer Chemistry).\n* *Label* is a target value that we want to predict.\n* *Input feature* is a representation of a sample from which we want to predict the target value.\n\nFor example, consider a supervised learning task whose dataset consists of input-output pairs ``((x_1, y_1), ..., (x_N, y_N))``, where ``N`` is the number of samples.\nIn Chainer Chemistry ``x_i`` and ``y_i`` are called input feature and label, respectively, and a pair of ``(x_i, y_i)`` is a feature for each ``i``.\n\n\nRelation to Chainer\n===================\n\n`Chainer <https://chainer.org>`_ is a deep learning framework written in Python that features dynamic\ncomputational graph construction (the \"define-by-run\" paradigm) for flexible and intuitive model development.\nAs the name indicates, Chainer Chemistry is an extension library of Chainer built on top of it.\nThe core development team members of Chainer and that of Chainer Chemistry work together tightly."
  },
  {
    "path": "docs/source/environment.yml",
    "content": "\nname: chainer-chemistry\nchannels: !!python/tuple\n- defaults\ndependencies:\n- rdkit::boost=1.63.0=py36_1\n- rdkit::rdkit=2017.09.1=py36_1\n"
  },
  {
    "path": "docs/source/functions.rst",
    "content": "=========\nFunctions\n=========\n\nFunction implementations\n========================\n\n\n.. autosummary::\n   :toctree: generated/\n   :nosignatures:\n\n   chainer_chemistry.functions.matmul\n   chainer_chemistry.functions.mean_squared_error\n   chainer_chemistry.functions.mean_absolute_error\n   chainer_chemistry.functions.r2_score\n"
  },
  {
    "path": "docs/source/index.rst",
    "content": "Chainer Chemistry: Chainer extension library for Biology and Chemistry\n======================================================================\n\n`Chainer Chemistry <https://github.com/pfnet-research/chainer-chemistry>`_ is a collection of tools to train and run neural networks for tasks in biology and chemistry using `Chainer <https://chainer.org>`_ .\n\nFeatures\n--------\n\n* State-of-the-art deep learning neural network models (especially graph convolutions) for chemical molecules (NFP, GGNN, Weave, SchNet etc.)\n* Preprocessors of molecules tailored for these models\n* Parsers for several standard file formats (CSV, SDF etc.)\n* Loaders for several well-known datasets (QM9, Tox21 etc.)\n\n\nIntroductory to deep learning for molecules and Chainer Chemistry is also available `here (SlideShare) <https://www.slideshare.net/KentaOono/deep-learning-for-molecules-introduction-to-chainer-chemistry-93288837>`_.\n\n.. toctree::\n   :maxdepth: 1\n   :caption: Contents\n\n   install\n   tutorial\n   contribution\n   development\n   reference\n\n"
  },
  {
    "path": "docs/source/install.rst",
    "content": "============\nInstallation\n============\n\nDependency\n========================\n\nFollowing packages are required to install Chainer Chemistry and are automatically\ninstalled when you install the library by `pip` command.\n\n* `chainer <https://docs.chainer.org/en/stable/index.html>`_\n* `pandas <https://pandas.pydata.org>`_\n* `scikit-learn <http://scikit-learn.org/stable/>`_\n* `tqdm <https://pypi.python.org/pypi/tqdm>`_\n\nAlso, it uses following library, which you need to manually install.\n\n* `rdkit <https://github.com/rdkit/rdkit>`_\n\nSee the `official document <http://www.rdkit.org/docs/Install.html>`_ for installation.\nIf you have setup ``anaconda``, you may install ``rdkit`` by following command::\n\n   $ conda install -c rdkit rdkit\n\n\nInstall via pip\n========================\n\nIt can be installed by ``pip`` command::\n\n   $ pip install chainer-chemistry\n\nInstall from source\n========================\n\nThe tarball of the source tree is available via ``pip download chainer-chemistry``.\nYou can use ``setup.py`` to install Chainer Chemistry from the tarball::\n\n   $ tar zxf chainer-chemistry-x.x.x.tar.gz\n   $ cd chainer-chemistry-x.x.x\n   $ python setup.py install\n\nInstall from the latest source from the master branch::\n\n   $ git clone https://github.com/pfnet-research/chainer-chemistry.git\n   $ pip install -e chainer-chemistry\n\nRun example training code\n=========================\n\n`The official repository <https://github.com/pfnet-research/chainer-chemistry>`_ provides examples\nof training several graph convolution networks. The code can be obtained by cloning the repository::\n\n   $ git clone https://github.com/pfnet-research/chainer-chemistry.git\n\nThe following code is how to train Neural Fingerprint (NFP) with the Tox21 dataset on CPU::\n\n   $ cd chainer-chemistry/examples/tox21\n   $ python train_tox21.py --method=nfp  --gpu=-1  # set --gpu=0 if you have GPU"
  },
  {
    "path": "docs/source/iterators.rst",
    "content": "=========\nIterators\n=========\n\n\nIterator Implementations\n========================\n\n.. autosummary::\n   :toctree: generated/\n   :nosignatures:\n\n   chainer_chemistry.iterators.BalancedSerialIterator\n   chainer_chemistry.iterators.IndexIterator\n"
  },
  {
    "path": "docs/source/links.rst",
    "content": "=====\nLinks\n=====\n\nLink implementations\n====================\n\n.. autosummary::\n\t:toctree: generated/\n        :nosignatures:\n\n\tchainer_chemistry.links.EmbedAtomID\n\tchainer_chemistry.links.GraphLinear\n    chainer_chemistry.links.GraphBatchNormalization\n\nScaler implementations\n======================\n.. autosummary::\n\t:toctree: generated/\n        :nosignatures:\n\n    chainer_chemistry.links.StandardScaler\n\n\nUpdate implementations\n======================\n\n.. autosummary::\n\t:toctree: generated/\n        :nosignatures:\n\n    chainer_chemistry.links.GGNNUpdate\n    chainer_chemistry.links.NFPUpdate\n    chainer_chemistry.links.RelGATUpdate\n    chainer_chemistry.links.RelGCNUpdate\n    chainer_chemistry.links.RSGCNUpdate\n    chainer_chemistry.links.SchNetUpdate\n\n\nReadout implementations\n=======================\n\n.. autosummary::\n\t:toctree: generated/\n        :nosignatures:\n\n    chainer_chemistry.links.GeneralReadout\n    chainer_chemistry.links.GGNNReadout\n    chainer_chemistry.links.NFPReadout\n    chainer_chemistry.links.SchNetReadout\n"
  },
  {
    "path": "docs/source/models.rst",
    "content": "======\nModels\n======\n\nModel implementations\n=====================\n\n.. autosummary::\n   :toctree: generated/\n   :nosignatures:\n\n   chainer_chemistry.models.NFP\n   chainer_chemistry.models.GGNN\n   chainer_chemistry.models.MLP\n   chainer_chemistry.models.SchNet\n   chainer_chemistry.models.WeaveNet\n   chainer_chemistry.models.RelGAT\n   chainer_chemistry.models.RelGCN\n   chainer_chemistry.models.RSGCN\n\n\nWrapper models\n==============\n\n.. autosummary::\n   :toctree: generated/\n   :nosignatures:\n\n   chainer_chemistry.models.BaseForwardModel\n   chainer_chemistry.models.Classifier\n   chainer_chemistry.models.Regressor\n"
  },
  {
    "path": "docs/source/reference.rst",
    "content": "=============\nAPI Reference\n=============\n\n.. toctree::\n   :maxdepth: 1\n\n   dataset\n   datasets\n   functions\n   iterators\n   links\n   models\n   utils\n   training\n"
  },
  {
    "path": "docs/source/requirements.txt",
    "content": "chainer\nscipy\nscikit-learn\npandas\ntqdm\n"
  },
  {
    "path": "docs/source/training.rst",
    "content": "=========\nTraining\n=========\n\n\n\nExtensions\n==========\n\n.. autosummary::\n   :toctree: generated/\n   :nosignatures:\n\n   chainer_chemistry.training.extensions.batch_evaluator.BatchEvaluator\n   chainer_chemistry.training.extensions.roc_auc_evaluator.ROCAUCEvaluator\n   chainer_chemistry.training.extensions.prc_auc_evaluator.PRCAUCEvaluator"
  },
  {
    "path": "docs/source/tutorial.rst",
    "content": "============\nTutorial\n============\n\nAbstract\n========================\n\nIn this tutorial, we predict Highest Occupied Molecular Orbital (HOMO) level of the molecules in `QM9 dataset <http://quantum-machine.org/datasets/>`_ [1][2] by `Neural Finger Print (NFP) <https://arxiv.org/abs/1509.09292>`_ [3][4].\nWe concentrate on explaining usage of Chainer Chemistry briefly and do not look over the detail of NFP implementation.\n\n\n.. _environment:\nTested Environment\n========================\n- Chainer Chemistry >= 0.0.1 (See :doc:`install`)\n- Chainer >= 2.0.2\n- CUDA == 8.0, CuPy >= 1.0.3 (Required only when using GPU)\n\n  - For CUDA 9.0, CuPy >= 2.0.0 is required\n- sklearn >= 0.17.1 (Only for preprocessing)\n\n\nQM9 Dataset\n========================\nQM9 is a publicly available dataset of small organic molecule structures and their simulated properties for data-driven research of material property prediction and chemical space exploration.\nIt contains 133,885 stable small organic molecules made up of CHONF.\nThe available properties are geometric, energetic, electronic, and thermodynamic ones.\n\nIn this tutorial, we predict HOMO level in the properties.\nPhysically, we need quantum chemical calculations to compute HOMO level.\nFrom a mathematical viewpoint it requires a solution of an internal eigenvalue problem for a Hamiltonian matrix.\nIt is a big challenge to predict HOMO level accurately by a neural network,\nbecause the network should approximate both calculating the Hamiltonian matrix and solving the internal eigenvalue problem.\n\n\nHOMO prediction by NFP\n========================\n\nAt first you should clone the library repository from `GitHub <https://github.com/pfnet-research/chainer-chemistry>`_.\nThere is a Python script ``examples/qm9/train_qm9.py`` in the repository.\nIt executes a whole training procedure, that is, downloads the QM9 dataset, preprocesses it, defines an NFP model and runs training on them.\n\nExecute the following 
commands on a machine satisfying the tested environment in :ref:`environment`.\n\n.. code-block:: shell\n\n    ~$ git clone git@github.com:pfnet-research/chainer-chemistry.git\n    ~$ cd chainer-chemistry/examples/qm9/\n\nHereafter all shell commands should be executed in this directory.\n\nIf you are a beginner for Chainer, `Chainer handson <https://github.com/mitmul/chainer-handson>`_ will greatly help you.\nEspecially the explanation of inclusion relationship of Chainer classes in Sec. 4 in `Chap. 2 <https://github.com/mitmul/chainer-notebooks/blob/master/2_how_to_use_trainer.ipynb>`_ is helpful when you read the sample script.\n\nNext the dataset preparation part and the model definition part in ``train_qm9.py`` are explained.\nIf you are not interested in them, skip :ref:`dataset-preparation` and :ref:`model-definition`, and jump to :ref:`run`.\n\n\n.. _dataset-preparation:\n\nDataset Preparation\n------------------------\n\nChainer Chemistry accepts the same dataset type with Chainer, such as ``chainer.datasets.SubDataset``.\nIn this section we learn how to download QM9 dataset and use it as a Chainer dataset.\n\nThe following Python script downloads and saves the dataset in ``.npz`` format.\n\n.. code-block:: python\n\n    #!/usr/bin/env python\n    from chainer_chemistry import datasets as D\n    from chainer_chemistry.dataset.preprocessors import preprocess_method_dict\n    from chainer_chemistry.datasets import NumpyTupleDataset\n\n    preprocessor = preprocess_method_dict['nfp']()\n    dataset = D.get_qm9(preprocessor, labels='homo')\n    cache_dir = 'input/nfp_homo/'\n    os.makedirs(cache_dir)\n    NumpyTupleDataset.save(cache_dir + 'data.npz', dataset)\n\nThe last two lines save the dataset to ``input/nfp_homo/data.npz`` and we need not to download the dataset next time.\n\nThe following Python script read the dataset from the saved ``.npz`` file and split the data points into training and validation sets.\n\n.. 
code-block:: python\n\n    #!/usr/bin/env python\n    from chainer.datasets import split_dataset_random\n    from chainer_chemistry import datasets as D\n    from chainer_chemistry.dataset.preprocessors import preprocess_method_dict\n    from chainer_chemistry.datasets import NumpyTupleDataset\n\n    cache_dir = 'input/nfp_homo/'\n    dataset = NumpyTupleDataset.load(cache_dir + 'data.npz')\n    train_data_ratio = 0.7\n    train_data_size = int(len(dataset) * train_data_ratio)\n    train, val = split_dataset_random(dataset, train_data_size, 777)\n    print('train dataset size:', len(train))\n    print('validation dataset size:', len(val))\n\nThe function ``split_dataset_random()`` returns a tuple of two ``chainer.datasets.SubDataset`` objects (training and validation set).\nNow you have prepared training and validation data points and you can construct ``chainer.iterator.Iterator`` objects, needed for updaters in Chainer.\n\n\n.. _model-definition:\n\nModel Definition\n------------------------\n\nIn Chainer, a neural network model is defined as a ``chainer.Chain`` object.\n\nGraph convolutional networks such as NFP are generally connection of graph convolution layers and multi perceptron layers.\nTherefore it is convenient to define a class which inherits ``chainer.Chain`` and compose two ``chainer.Chain`` objects corresponding to the two kind of layers.\n\nExecute the following Python script and check you can define such a class.\n``NFP`` and ``MLP`` are already defined ``chainer.Chain`` classes.\n\n.. 
code-block:: python\n\n    #!/usr/bin/env python\n    import chainer\n    from chainer_chemistry.models import MLP, NFP\n\n    class GraphConvPredictor(chainer.Chain):\n\n        def __init__(self, graph_conv, mlp):\n            super(GraphConvPredictor, self).__init__()\n            with self.init_scope():\n                self.graph_conv = graph_conv\n                self.mlp = mlp\n\n        def __call__(self, atoms, adjs):\n            x = self.graph_conv(atoms, adjs)\n            x = self.mlp(x)\n            return x\n\n    n_unit = 16\n    conv_layers = 4\n    model = GraphConvPredictor(NFP(n_unit, n_unit, conv_layers),\n                               MLP(n_unit, 1))\n\n\n.. _run:\n\nRun\n------------------------\n\nYou have defined the dataset and the NFP model on Chainer.\nThere are no other procedures specific to Chainer Chemistry.\nHereafter you should just follow the usual procedures in Chainer to execute training.\n\nThe sample script ``examples/qm9/train_qm9.py`` contains all the procedures and you can execute training just by invoking the script.\nThe following command starts training for 20 epochs and reports loss and accuracy during training.\nThey are reported for each of ``main`` (dataset for training) and ``validation`` (dataset for validation).\n\nThe ``--gpu 0`` option is to utilize a GPU with device id = 0.\nIf you do not have a GPU, set ``--gpu -1`` or just drop ``--gpu 0`` to use CPU for all the calculation.\nIn most cases, calculation with GPU is much faster than that only with CPU.\n\n.. 
code-block:: shell\n\n    ~/chainer-chemistry/examples/qm9$ python train_qm9.py --method nfp --label homo --gpu 0  # If GPU is unavailable, set --gpu -1\n\n    Train NFP model...\n    epoch       main/loss   main/accuracy  validation/main/loss  validation/main/accuracy  elapsed_time\n    1           0.746135    0.0336724      0.680088              0.0322597                 58.4605\n    2           0.642823    0.0311715      0.622942              0.0307055                 113.748\n    (...)\n    19          0.540646    0.0277585      0.532406              0.0276445                 1052.41\n    20          0.537062    0.0276631      0.551695              0.0277499                 1107.29\n\nAfter finished, you will find ``log`` file in ``result/`` directory.\n\n\nEvaluation\n------------------------\n\nIn the loss and accuracy report, we are mainly interested in ``validation/main/accuracy``.\nAlthough it decreases during training, the ``accuracy`` field is actually mean absolute error.\nThe unit is Hartree.\nTherefore the last line means validation mean absolute error is 0.0277499 Hartree.\nSee ``scaled_abs_error()`` function in ``train_qm9.py`` for the detailed definition of mean absolute error.\n\n.. 1 kcal/mol = 0.0016 Hartree = 0.043 eV = 500 K\n.. 17.4133 kcal/mol = 0.0277499 Hartree = 0.755114 eV = 8762.78 K\n.. DFT error of HOMO level reported in https://arxiv.org/pdf/1702.05532.pdf is 2.0 eV = 0.073 Hartree.\n\nYou can also train other type models like GGNN, SchNet or WeaveNet, and other target values like LUMO, dipole moment and internal energy, just by changing ``--model`` and ``--label`` options, respectively.\nSee output of ``python train_qm9.py --help``.\n\n\nUsing your own dataset\n========================\nYou can use your own dataset in Chainer Chemistry.\n`example/own_dataset <https://github.com/pfnet-research/chainer-chemistry/tree/master/examples/own_dataset/>`_ shows an example.\n\nReference\n========================\n[1] L. Ruddigkeit, R. 
van Deursen, L. C. Blum, J.-L. Reymond, Enumeration of 166 billion organic small molecules in the chemical universe database GDB-17, J. Chem. Inf. Model. 52, 2864–2875, 2012.\n\n[2] R. Ramakrishnan, P. O. Dral, M. Rupp, O. A. von Lilienfeld, Quantum chemistry structures and properties of 134 kilo molecules, Scientific Data 1, 140022, 2014.\n\n[3] Duvenaud, D. K., Maclaurin, D., Iparraguirre, J., Bombarell, R., Hirzel, T., Aspuru-Guzik, A., & Adams, R. P. (2015). Convolutional networks on graphs for learning molecular fingerprints. In Advances in neural information processing systems (pp. 2224-2232).\n\n[4] Gilmer, J., Schoenholz, S. S., Riley, P. F., Vinyals, O., & Dahl, G. E. (2017). Neural message passing for quantum chemistry. arXiv preprint arXiv:1704.01212.\n"
  },
  {
    "path": "docs/source/utils.rst",
    "content": "=========\nUtilities\n========="
  },
  {
    "path": "examples/.gitignore",
    "content": "result/\n"
  },
  {
    "path": "examples/README.md",
    "content": "# Chainer Chemistry examples\n\nThese examples are implemented to train the model.\n\n* Tox21: 12 types of toxicity classification\n* QM9: Chemical property regression\n* Own dataset: Own dataset (prepared in csv format) regression\n* Molecule Net: Various datasets for both classification and regression\n\n## Test\n\nTo test code of all examples, run\n\n```\nbash -x test_examples.sh -1 # for CPU\nbash -x test_examples.sh 0  # for GPU\n```\n\nIf you encounter errors, please report them to\n[Github issues](https://github.com/pfnet-research/chainer-chemistry/issues)\nalong with error logs. We appreciate your help.\n"
  },
  {
    "path": "examples/molnet/README.md",
    "content": "# MoleculeNet\n\n[MoleculeNet](http://moleculenet.ai/) provides various dataset, which ranges\nPhysics, Chemistry, Bio and Physiology.\n\nYou can specify dataset type, and train the model for the dataset.\n\n## How to run the code\n\n### Train the model by specifying dataset\n\nYou can specify dataset type by `--dataset` option.\nPlease refer [molnet_config.py](https://github.com/pfnet-research/chainer-chemistry/blob/master/chainer_chemistry/datasets/molnet/molnet_config.py) \nfor the list of available dataset in Chainer Chemistry.\n\nFor example, if you want to train \"bbbp\" dataset,\n\nWith CPU:\n```angular2html\npython train_molnet.py --dataset=bbbp\n```\n\nWith GPU:\n```angular2html\npython train_molnet.py --dataset=bbbp -g 0\n```\n"
  },
  {
    "path": "examples/molnet/evaluate_models_molnet.sh",
    "content": "#!/usr/bin/env bash\nset -e\n\n# List of available datasets.\n# TODO: Investigate why training on `clearance` fails.\ndatasets=(bace_Class bace_pIC50 bbbp clintox delaney HIV hopv lipo \\\n          muv nci pcba ppb qm7 qm8 qm9 SAMPL sider tox21 toxcast)\nmethods=(relgcn)\n\n# device identifier; set it to -1 to train on the CPU (default).\ndevice=${1:--1}\n\n# Remove directories with previously trained models.\n[ -d result ] && rm -rf result\n\nfor dataset in ${datasets[@]}; do\n    for method in ${methods[@]}; do\n        python train_molnet.py \\\n            --dataset ${dataset} \\\n            --method ${method} \\\n            --device ${device} \\\n            --epoch 1 \\\n            --unit-num 10 \\\n            --conv-layers 1 \\\n            --num-data 100 \\\n            --out result\n\n        python predict_molnet.py \\\n            --dataset ${dataset} \\\n            --method ${method} \\\n            --in-dir result \\\n            --device ${device} \\\n            --num-data 100\n    done\ndone\n"
  },
  {
    "path": "examples/molnet/predict_molnet.py",
    "content": "#!/usr/bin/env python\nfrom __future__ import print_function\n\nimport argparse\nimport os\n\nimport chainer\nfrom chainer.iterators import SerialIterator\nfrom chainer.training.extensions import Evaluator\nfrom chainer_chemistry.training.extensions.roc_auc_evaluator import ROCAUCEvaluator  # NOQA\n# Proposed by Ishiguro\n# ToDo: consider go/no-go with following modification\n# Re-load the best-validation score snapshot using serializers\n# from chainer import serializers\n\nfrom chainer_chemistry.dataset.converters import converter_method_dict\nfrom chainer_chemistry.datasets import NumpyTupleDataset\nfrom chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config  # NOQA\nfrom chainer_chemistry.models.prediction import Classifier\nfrom chainer_chemistry.models.prediction import Regressor\nfrom chainer_chemistry.utils import save_json\n\n# These import is necessary for pickle to work\nfrom chainer_chemistry.links.scaler.standard_scaler import StandardScaler  # NOQA\nfrom chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor  # NOQA\nfrom train_molnet import dataset_part_filename\nfrom train_molnet import download_entire_dataset\n\n\ndef parse_arguments():\n    # Lists of supported preprocessing methods/models.\n    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',\n                   'relgat', 'gin', 'gnnfilm', 'megnet',\n                   'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm']\n#    scale_list = ['standardize', 'none']\n    dataset_names = list(molnet_default_config.keys())\n\n    # Set up the argument parser.\n    parser = argparse.ArgumentParser(description='Prediction on Molnet.')\n    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,\n                        default='bbbp',\n                        help='name of the dataset that training is run on')\n    parser.add_argument('--method', '-m', type=str, choices=method_list,\n                        
help='method name', default='nfp')\n    parser.add_argument('--label', '-l', type=str, default='',\n                        help='target label for regression; empty string means '\n                        'predicting all properties at once')\n#    parser.add_argument('--scale', type=str, choices=scale_list,\n#                        help='label scaling method', default='standardize')\n    parser.add_argument(\n        '--device', type=str, default='-1',\n        help='Device specifier. Either ChainerX device specifier or an '\n             'integer. If non-negative integer, CuPy arrays with specified '\n             'device id are used. If negative integer, NumPy arrays are used')\n    parser.add_argument('--in-dir', '-i', type=str, default='result',\n                        help='directory to load model data from')\n    parser.add_argument('--num-data', type=int, default=-1,\n                        help='amount of data to be parsed; -1 indicates '\n                        'parsing all data.')\n    return parser.parse_args()\n\n\ndef main():\n    args = parse_arguments()\n\n    # Set up some useful variables that will be used later on.\n    dataset_name = args.dataset\n    method = args.method\n    num_data = args.num_data\n\n    if args.label:\n        labels = args.label\n        cache_dir = os.path.join('input', '{}_{}_{}'.format(dataset_name,\n                                                            method, labels))\n    else:\n        labels = None\n        cache_dir = os.path.join('input', '{}_{}_all'.format(dataset_name,\n                                                             method))\n\n    # Load the cached dataset.\n    filename = dataset_part_filename('test', num_data)\n    path = os.path.join(cache_dir, filename)\n    if os.path.exists(path):\n        print('Loading cached dataset from {}.'.format(path))\n        test = NumpyTupleDataset.load(path)\n    else:\n        _, _, test = download_entire_dataset(dataset_name, num_data, labels,\n       
                                      method, cache_dir)\n\n    # Model-related data is stored this directory.\n    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))\n\n    model_filename = {'classification': 'classifier.pkl',\n                      'regression': 'regressor.pkl'}\n    task_type = molnet_default_config[dataset_name]['task_type']\n    model_path = os.path.join(model_dir, model_filename[task_type])\n    print(\"model_path=\" + model_path)\n    print('Loading model weights from {}...'.format(model_path))\n\n    device = chainer.get_device(args.device)\n    if task_type == 'classification':\n        model = Classifier.load_pickle(model_path, device=device)\n    elif task_type == 'regression':\n        model = Regressor.load_pickle(model_path, device=device)\n    else:\n        raise ValueError('Invalid task type ({}) encountered when processing '\n                         'dataset ({}).'.format(task_type, dataset_name))\n\n    # Re-load the best-validation score snapshot\n    # serializers.load_npz(os.path.join(\n    #     model_dir, \"best_val_\" + model_filename[task_type]), model)\n\n    # Run an evaluator on the test dataset.\n    print('Evaluating...')\n    converter = converter_method_dict[method]\n    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)\n    eval_result = Evaluator(test_iterator, model, converter=converter,\n                            device=device)()\n    print('Evaluation result: ', eval_result)\n\n    # Add more stats\n    if task_type == 'regression':\n        # loss = cuda.to_cpu(numpy.array(eval_result['main/loss']))\n        # eval_result['main/loss'] = loss\n\n        # convert to native values..\n        for k, v in eval_result.items():\n            eval_result[k] = float(v)\n\n    elif task_type == \"classification\":\n        # For Classifier, we do not equip the model with ROC-AUC evalation\n        # function. 
use separate ROC-AUC Evaluator\n        rocauc_result = ROCAUCEvaluator(\n            test_iterator, model, converter=converter, device=device,\n            eval_func=model.predictor, name='test', ignore_labels=-1)()\n        print('ROCAUC Evaluation result: ', rocauc_result)\n        save_json(os.path.join(model_dir, 'rocauc_result.json'), rocauc_result)\n    else:\n        print('[WARNING] unknown task_type {}.'.format(task_type))\n\n    # Save the evaluation results.\n    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/molnet/summary_eval_molnet.py",
    "content": "#! -*- coding: utf-8 -*-\nimport argparse\nimport json\nimport matplotlib\nmatplotlib.use('Agg')\nimport matplotlib.pyplot as plt\nimport os\nimport seaborn as sns\n\nimport numpy as np\n\n\nfrom chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config  # NOQA\nfrom pandas import DataFrame\n\n\ndef save_evaluation_plot(x, y_mean, metric, dataset_name, filename):\n    plt.figure()\n\n    sns.set()\n    ax = sns.barplot(y=x, x=y_mean)\n\n    # If \"text\" does not work, change the attribute name to \"s\"\n    for n, (label, _y) in enumerate(zip(x, y_mean)):\n        ax.annotate(\n            s='{:.3f}'.format(abs(_y)),\n            xy=(_y, n),\n            ha='right',\n            va='center',\n            xytext=(-5, 0),\n            textcoords='offset points',\n            color='white')\n\n    plt.title('Performance on ' + dataset_name)\n    plt.xlabel(metric)\n    plt.savefig(filename)\n\ndef main():\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--prefix', required=True)\n    parser.add_argument('--methods', nargs='+', required=True)\n    parser.add_argument('--dataset', required=True)\n    parser.add_argument('--runs', type=int, required=True)\n    parser.add_argument('--out_prefix', default=\"result_\")\n    args = parser.parse_args()\n\n    #\n    # load the config file in the designated directory\n    #\n\n    dataset_name = args.dataset\n    task_type = molnet_default_config[dataset_name]['task_type']\n    print('task type=\\'' + str(task_type) + \"\\'\")\n\n    if task_type=='regression':\n        metrics = ['main/MAE', 'main/RMSE']\n    elif task_type=='classification':\n        metrics = ['test/main/roc_auc']\n\n    x = args.methods\n\n    for metric in metrics:\n\n        y = np.zeros( (len(args.methods), args.runs) )\n\n        for m, method in enumerate(args.methods):\n            for run in range(0, args.runs):\n                #for run in range(1, args.runs+1):\n                with 
open(os.path.join(args.prefix + \"_\" + method + \"_\" + str(run), 'eval_result.json')) as f:\n                    result = json.load(f)\n                    y[m, run-1,] = result[metric]\n                # end with\n            # end run-for\n\n        # end method-for\n\n        metric_lastslash = metric.rindex(\"/\")\n        metric_name = metric[metric_lastslash+1:]\n\n        # draw figure\n        save_evaluation_plot(x, np.mean(y, axis=1), metric, dataset_name, args.out_prefix + metric_name + '.png')\n        save_evaluation_plot(x, np.mean(y, axis=1), metric, dataset_name, args.out_prefix + metric_name + '.pdf')\n\n        # output as text. mean/std\n        y_mean = np.mean(y, axis=1)\n        y_std = np.std(y, axis=1)\n\n        with open(args.out_prefix + \"_summary_\" + metric_name + \".tsv\", \"w\") as fout:\n            for m, method in enumerate(args.methods):\n                fout.write(method + \"\\t\" + str(y_mean[m]) + \"\\t\" + str(y_std[m]) + \"\\n\")\n            # end-for\n        # end with\n\n    # end metric-for\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/molnet/test_molnet.sh",
    "content": "#!/usr/bin/env bash\n\nset -e\n\n# List of available datasets.\n# TODO: Investigate why training on `clearance` fails.\ndatasets=(bace_Class bace_pIC50 bbbp clintox delaney HIV hopv lipo \\\n          muv nci pcba ppb qm7 qm8 qm9 SAMPL sider tox21 toxcast)\n\n# device identifier; set it to -1 to train on the CPU (default).\ndevice=${1:--1}\n\n# Remove directories with previously trained models.\n[ -d input ] && rm -rf input\n\nfor dataset in ${datasets[@]}\ndo\n    # Run the training script for the current dataset.\n    python train_molnet.py \\\n        --dataset $dataset \\\n        --method nfp \\\n        --conv-layers 1 \\\n        --device ${device} \\\n        --epoch 1 \\\n        --unit-num 10 \\\n        --out nfp_${dataset} \\\n        --batchsize 32 \\\n        --num-data=100\ndone\n"
  },
  {
    "path": "examples/molnet/train_molnet.py",
    "content": "#!/usr/bin/env python\nfrom __future__ import print_function\n\nimport argparse\nimport numpy\nimport os\nimport types\n\nimport chainer\nfrom chainer import iterators\nfrom chainer import optimizers\nfrom chainer import training\n\nfrom chainer.training import extensions as E\nfrom chainer_chemistry.dataset.converters import converter_method_dict\nfrom chainer_chemistry.dataset.preprocessors import preprocess_method_dict\nfrom chainer_chemistry import datasets as D\nfrom chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config  # NOQA\nfrom chainer_chemistry.datasets import NumpyTupleDataset\nfrom chainer_chemistry.links import StandardScaler\nfrom chainer_chemistry.models.prediction import Classifier\nfrom chainer_chemistry.models.prediction import Regressor\nfrom chainer_chemistry.models.prediction import set_up_predictor\nfrom chainer_chemistry.training.extensions import BatchEvaluator, ROCAUCEvaluator  # NOQA\nfrom chainer_chemistry.training.extensions.auto_print_report import AutoPrintReport  # NOQA\nfrom chainer_chemistry.utils import save_json\n\n\ndef parse_arguments():\n    # Lists of supported preprocessing methods/models and datasets.\n    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',\n                   'relgat', 'gin', 'gnnfilm', 'megnet',\n                   'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm']\n    dataset_names = list(molnet_default_config.keys())\n    scale_list = ['standardize', 'none']\n\n    parser = argparse.ArgumentParser(description='molnet example')\n    parser.add_argument('--method', '-m', type=str, choices=method_list,\n                        help='method name', default='nfp')\n    parser.add_argument('--label', '-l', type=str, default='',\n                        help='target label for regression; empty string means '\n                        'predicting all properties at once')\n    parser.add_argument('--conv-layers', '-c', type=int, default=4,\n                   
     help='number of convolution layers')\n    parser.add_argument('--batchsize', '-b', type=int, default=32,\n                        help='batch size')\n    parser.add_argument(\n        '--device', type=str, default='-1',\n        help='Device specifier. Either ChainerX device specifier or an '\n             'integer. If non-negative integer, CuPy arrays with specified '\n             'device id are used. If negative integer, NumPy arrays are used')\n    parser.add_argument('--out', '-o', type=str, default='result',\n                        help='path to save the computed model to')\n    parser.add_argument('--epoch', '-e', type=int, default=20,\n                        help='number of epochs')\n    parser.add_argument('--unit-num', '-u', type=int, default=16,\n                        help='number of units in one layer of the model')\n    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,\n                        default='bbbp',\n                        help='name of the dataset that training is run on')\n    parser.add_argument('--protocol', type=int, default=2,\n                        help='pickle protocol version')\n    parser.add_argument('--num-data', type=int, default=-1,\n                        help='amount of data to be parsed; -1 indicates '\n                        'parsing all data.')\n    parser.add_argument('--scale', type=str, choices=scale_list,\n                        help='label scaling method', default='standardize')\n    return parser.parse_args()\n\n\ndef dataset_part_filename(dataset_part, num_data):\n    \"\"\"Returns the filename corresponding to a train/valid/test parts of a\n    dataset, based on the amount of data samples that need to be parsed.\n    Args:\n        dataset_part: String containing any of the following 'train', 'valid'\n                      or 'test'.\n        num_data: Amount of data samples to be parsed from the dataset.\n    \"\"\"\n    if num_data >= 0:\n        return 
'{}_data_{}.npz'.format(dataset_part, str(num_data))\n    return '{}_data.npz'.format(dataset_part)\n\n\ndef download_entire_dataset(dataset_name, num_data, labels, method, cache_dir):\n    \"\"\"Downloads the train/valid/test parts of a dataset and stores them in the\n    cache directory.\n    Args:\n        dataset_name: Dataset to be downloaded.\n        num_data: Amount of data samples to be parsed from the dataset.\n        labels: Target labels for regression.\n        method: Method name. See `parse_arguments`.\n        cache_dir: Directory to store the dataset to.\n    \"\"\"\n\n    print('Downloading {}...'.format(dataset_name))\n    preprocessor = preprocess_method_dict[method]()\n\n    # Select the first `num_data` samples from the dataset.\n    target_index = numpy.arange(num_data) if num_data >= 0 else None\n    dataset_parts = D.molnet.get_molnet_dataset(dataset_name, preprocessor,\n                                                labels=labels,\n                                                target_index=target_index)\n    dataset_parts = dataset_parts['dataset']\n\n    # Cache the downloaded dataset.\n    if not os.path.exists(cache_dir):\n        os.makedirs(cache_dir)\n\n    for i, part in enumerate(['train', 'valid', 'test']):\n        filename = dataset_part_filename(part, num_data)\n        path = os.path.join(cache_dir, filename)\n        NumpyTupleDataset.save(path, dataset_parts[i])\n    return dataset_parts\n\n\ndef fit_scaler(datasets):\n    \"\"\"Standardizes (scales) the dataset labels.\n    Args:\n        datasets: Tuple containing the datasets.\n    Returns:\n        Datasets with standardized labels and the scaler object.\n    \"\"\"\n    scaler = StandardScaler()\n\n    # Collect all labels in order to apply scaling over the entire dataset.\n    labels = None\n    offsets = []\n    for dataset in datasets:\n        if labels is None:\n            labels = dataset.get_datasets()[-1]\n        else:\n            labels = 
numpy.vstack([labels, dataset.get_datasets()[-1]])\n        offsets.append(len(labels))\n\n    scaler.fit(labels)\n\n    return scaler\n\n\ndef main():\n    args = parse_arguments()\n\n    # Set up some useful variables that will be used later on.\n    dataset_name = args.dataset\n    method = args.method\n    num_data = args.num_data\n    n_unit = args.unit_num\n    conv_layers = args.conv_layers\n\n    task_type = molnet_default_config[dataset_name]['task_type']\n    model_filename = {'classification': 'classifier.pkl',\n                      'regression': 'regressor.pkl'}\n\n    print('Using dataset: {}...'.format(dataset_name))\n\n    # Set up some useful variables that will be used later on.\n    if args.label:\n        labels = args.label\n        cache_dir = os.path.join('input', '{}_{}_{}'.format(dataset_name,\n                                                            method, labels))\n        class_num = len(labels) if isinstance(labels, list) else 1\n    else:\n        labels = None\n        cache_dir = os.path.join('input', '{}_{}_all'.format(dataset_name,\n                                                             method))\n        class_num = len(molnet_default_config[args.dataset]['tasks'])\n\n    # Load the train and validation parts of the dataset.\n    filenames = [dataset_part_filename(p, num_data)\n                 for p in ['train', 'valid']]\n\n    paths = [os.path.join(cache_dir, f) for f in filenames]\n    if all([os.path.exists(path) for path in paths]):\n        dataset_parts = []\n        for path in paths:\n            print('Loading cached dataset from {}.'.format(path))\n            dataset_parts.append(NumpyTupleDataset.load(path))\n    else:\n        dataset_parts = download_entire_dataset(dataset_name, num_data, labels,\n                                                method, cache_dir)\n    train, valid = dataset_parts[0], dataset_parts[1]\n\n    # Scale the label values, if necessary.\n    scaler = None\n    if args.scale == 
'standardize':\n        if task_type == 'regression':\n            print('Applying standard scaling to the labels.')\n            scaler = fit_scaler(dataset_parts)\n        else:\n            print('Label scaling is not available for classification tasks.')\n    else:\n        print('No label scaling was selected.')\n\n    # Set up the predictor.\n    predictor = set_up_predictor(method, n_unit, conv_layers, class_num,\n                                 label_scaler=scaler)\n\n    # Set up the iterators.\n    train_iter = iterators.SerialIterator(train, args.batchsize)\n    valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False,\n                                          shuffle=False)\n\n    # Load metrics for the current dataset.\n    metrics = molnet_default_config[dataset_name]['metrics']\n    metrics_fun = {k: v for k, v in metrics.items()\n                   if isinstance(v, types.FunctionType)}\n    loss_fun = molnet_default_config[dataset_name]['loss']\n\n    device = chainer.get_device(args.device)\n    if task_type == 'regression':\n        model = Regressor(predictor, lossfun=loss_fun,\n                          metrics_fun=metrics_fun, device=device)\n    elif task_type == 'classification':\n        model = Classifier(predictor, lossfun=loss_fun,\n                           metrics_fun=metrics_fun, device=device)\n    else:\n        raise ValueError('Invalid task type ({}) encountered when processing '\n                         'dataset ({}).'.format(task_type, dataset_name))\n\n    # Set up the optimizer.\n    optimizer = optimizers.Adam()\n    optimizer.setup(model)\n\n    # Save model-related output to this directory.\n    if not os.path.exists(args.out):\n        os.makedirs(args.out)\n    save_json(os.path.join(args.out, 'args.json'), vars(args))\n    model_dir = os.path.join(args.out, os.path.basename(cache_dir))\n    if not os.path.exists(model_dir):\n        os.makedirs(model_dir)\n\n    # Set up the updater.\n    converter = 
converter_method_dict[method]\n    updater = training.StandardUpdater(train_iter, optimizer, device=device,\n                                       converter=converter)\n\n    # Set up the trainer.\n    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)\n    trainer.extend(E.Evaluator(valid_iter, model, device=device,\n                               converter=converter))\n    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))\n    trainer.extend(E.LogReport())\n\n    # TODO: consider go/no-go of the following block\n    # # (i) more reporting for val/evalutaion\n    # # (ii) best validation score snapshot\n    # if task_type == 'regression':\n    #     metric_name_list = list(metrics.keys())\n    #     if 'RMSE' in metric_name_list:\n    #         trainer.extend(E.snapshot_object(model, \"best_val_\" + model_filename[task_type]),\n    #                        trigger=training.triggers.MinValueTrigger('validation/main/RMSE'))\n    #     elif 'MAE' in metric_name_list:\n    #         trainer.extend(E.snapshot_object(model, \"best_val_\" + model_filename[task_type]),\n    #                        trigger=training.triggers.MinValueTrigger('validation/main/MAE'))\n    #     else:\n    #         print(\"[WARNING] No validation metric defined?\")\n    #\n    # elif task_type == 'classification':\n    #     train_eval_iter = iterators.SerialIterator(\n    #         train, args.batchsize, repeat=False, shuffle=False)\n    #     trainer.extend(ROCAUCEvaluator(\n    #         train_eval_iter, predictor, eval_func=predictor,\n    #         device=args.gpu, converter=concat_mols, name='train',\n    #         pos_labels=1, ignore_labels=-1, raise_value_error=False))\n    #     # extension name='validation' is already used by `Evaluator`,\n    #     # instead extension name `val` is used.\n    #     trainer.extend(ROCAUCEvaluator(\n    #         valid_iter, predictor, eval_func=predictor,\n    #         device=args.gpu, converter=concat_mols, 
name='val',\n    #         pos_labels=1, ignore_labels=-1, raise_value_error=False))\n    #\n    #     trainer.extend(E.snapshot_object(\n    #         model, \"best_val_\" + model_filename[task_type]),\n    #         trigger=training.triggers.MaxValueTrigger('val/main/roc_auc'))\n    # else:\n    #     raise NotImplementedError(\n    #         'Not implemented task_type = {}'.format(task_type))\n\n    trainer.extend(AutoPrintReport())\n    trainer.extend(E.ProgressBar())\n    trainer.run()\n\n    # Save the model's parameters.\n    model_path = os.path.join(model_dir,  model_filename[task_type])\n    print('Saving the trained model to {}...'.format(model_path))\n    model.save_pickle(model_path, protocol=args.protocol)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/molnet_wle/README.md",
    "content": "# Weisfeiler-Lehman Embedding preprocessor implementations\n\nIn this directory, we provide an implementaion of [Weisfeiler-Lehman Embedding (WLE)](https://arxiv.org/abs/2006.06909) [1] preprocessor for ChainerChemistry GNN models. \n\n## How to run the code\n\n### Test run command\n\n```bash\n# Training tox21 dataset using RSGCN-CWLE model. Short 3 epoch for testing.\npython train_molnet_wle.py --dataset tox21 --method rsgcn_cwle --epoch 3  --device 0\n\n# Prediction with trained model\npython predict_molnet_wle.py --dataset tox21 --method rsgcn_cwle --in-dir result --device 0\n```\n\n### Train the model by specifying dataset\n\nBasically, no changes from the original molnet examples (examples/molnet/train_molnet.py).\nThe main difference is the choice of '--method' option.\nTo test WLE, choose one of 'xxx_wle', 'xxx_cwle', and 'xxx_gwle' where 'xxx' is a GNN architecture identifier (e.g. 'rsgcn', 'relgat').\n\n- xxx_wle: apply the naive WLE to the GNN 'xxx'\n- xxx_cwle (recommended): apply the Concat WLE to the GNN 'xxx'\n- xxx_gwle: apply the Gated-sum WLE to the GNN 'xxx'\n\n#### Additional options\n\nIntroducing the WLE, we have some more additional options.\nIn general you do not need to specify these options (use the default values!).\n\n\n## Performance\n\nThe paper [1] shows that the use of (C)WLE consistently improves the generalization (test) performance of the several GNN architectures (if hyperparameters are optimized by a Black-box optimizer such as [Optuna] (https://preferred.jp/ja/projects/optuna/).\n\n\n\n## References\n\n[1] Katsuhiko Ishiguro, Kenta Oono, and Kohei Hayashi, \"Weisfeiler-Lehman Embedding for Molecular Graph Neural Networks\", arXiv: 2006.06909, 2020. [paper link](https://arxiv.org/abs/2006.06909) \n\n"
  },
  {
    "path": "examples/molnet_wle/predict_molnet_wle.py",
    "content": "#!/usr/bin/env python\nfrom __future__ import print_function\n\nimport argparse\nimport os\n\nimport chainer\nfrom chainer.iterators import SerialIterator\nfrom chainer.training.extensions import Evaluator\nfrom chainer_chemistry.training.extensions.roc_auc_evaluator import ROCAUCEvaluator  # NOQA\n# Proposed by Ishiguro\n# ToDo: consider go/no-go with following modification\n# Re-load the best-validation score snapshot using serializers\nfrom chainer import serializers\n\nfrom chainer_chemistry.dataset.converters import concat_mols\nfrom chainer_chemistry.datasets import NumpyTupleDataset\nfrom chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config  # NOQA\nfrom chainer_chemistry.models.prediction import Classifier\nfrom chainer_chemistry.models.prediction import Regressor\nfrom chainer_chemistry.utils import save_json\n\n# These import is necessary for pickle to work\nfrom chainer import functions as F\nfrom chainer_chemistry.links.scaler.standard_scaler import StandardScaler  # NOQA\nfrom chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor  # NOQA\nfrom train_molnet_wle import dict_for_wles\nfrom train_molnet_wle import dataset_part_filename\nfrom train_molnet_wle import download_entire_dataset\n\ndict_for_wles()\n\ndef parse_arguments():\n    # Lists of supported preprocessing methods/models.\n    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',\n                   'relgat', 'gin', 'gnnfilm',\n                   'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm',\n                   'nfp_wle', 'ggnn_wle', 'relgat_wle', 'relgcn_wle', 'rsgcn_wle', 'gin_wle',\n                   'nfp_cwle', 'ggnn_cwle', 'relgat_cwle', 'relgcn_cwle', 'rsgcn_cwle', 'gin_cwle',\n                   'nfp_gwle', 'ggnn_gwle', 'relgat_gwle', 'relgcn_gwle', 'rsgcn_gwle', 'gin_gwle']\n#    scale_list = ['standardize', 'none']\n    dataset_names = list(molnet_default_config.keys())\n\n    # Set up the 
argument parser.\n    parser = argparse.ArgumentParser(description='Prediction on Molnet.')\n    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,\n                        default='bbbp',\n                        help='name of the dataset that training is run on')\n    parser.add_argument('--method', '-m', type=str, choices=method_list,\n                        help='method name', default='nfp')\n    parser.add_argument('--label', '-l', type=str, default='',\n                        help='target label for regression; empty string means '\n                        'predicting all properties at once')\n#    parser.add_argument('--scale', type=str, choices=scale_list,\n#                        help='label scaling method', default='standardize')\n    parser.add_argument(\n        '--device', type=str, default='-1',\n        help='Device specifier. Either ChainerX device specifier or an '\n             'integer. If non-negative integer, CuPy arrays with specified '\n             'device id are used. 
If negative integer, NumPy arrays are used')\n    parser.add_argument('--in-dir', '-i', type=str, default='result',\n                        help='directory to load model data from')\n    parser.add_argument('--num-data', type=int, default=-1,\n                        help='amount of data to be parsed; -1 indicates '\n                        'parsing all data.')\n    return parser.parse_args()\n\n\ndef main():\n    args = parse_arguments()\n\n    # Set up some useful variables that will be used later on.\n    dataset_name = args.dataset\n    method = args.method\n    num_data = args.num_data\n\n    if args.label:\n        labels = args.label\n        cache_dir = os.path.join('input', '{}_{}_{}'.format(dataset_name,\n                                                            method, labels))\n    else:\n        labels = None\n        cache_dir = os.path.join('input', '{}_{}_all'.format(dataset_name,\n                                                             method))\n\n    # Load the cached dataset.\n    filename = dataset_part_filename('test', num_data)\n    path = os.path.join(cache_dir, filename)\n    if os.path.exists(path):\n        print('Loading cached dataset from {}.'.format(path))\n        test = NumpyTupleDataset.load(path)\n    else:\n        _, _, test = download_entire_dataset(dataset_name, num_data, labels,\n                                             method, cache_dir)\n\n    # Model-related data is stored this directory.\n    model_dir = os.path.join(args.in_dir, os.path.basename(cache_dir))\n\n    model_filename = {'classification': 'classifier.pkl',\n                      'regression': 'regressor.pkl'}\n    task_type = molnet_default_config[dataset_name]['task_type']\n    model_path = os.path.join(model_dir, model_filename[task_type])\n    print(\"model_path=\" + model_path)\n    print('Loading model weights from {}...'.format(model_path))\n\n    device = chainer.get_device(args.device)\n    if task_type == 'classification':\n        model = 
Classifier.load_pickle(model_path, device=device)\n    elif task_type == 'regression':\n        model = Regressor.load_pickle(model_path, device=device)\n    else:\n        raise ValueError('Invalid task type ({}) encountered when processing '\n                         'dataset ({}).'.format(task_type, dataset_name))\n\n    # Re-load the best-validation score snapshot\n    # serializers.load_npz(os.path.join(\n    #     model_dir, \"best_val_\" + model_filename[task_type]), model)\n\n    # Run an evaluator on the test dataset.\n    print('Evaluating...')\n    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)\n    eval_result = Evaluator(test_iterator, model, converter=concat_mols,\n                            device=device)()\n    print('Evaluation result: ', eval_result)\n\n    # Add more stats\n    if task_type == 'regression':\n        # loss = cuda.to_cpu(numpy.array(eval_result['main/loss']))\n        # eval_result['main/loss'] = loss\n\n        # convert to native values..\n        for k, v in eval_result.items():\n            eval_result[k] = float(v)\n\n    elif task_type == \"classification\":\n        # For Classifier, we do not equip the model with ROC-AUC evalation\n        # function. use separate ROC-AUC Evaluator\n        rocauc_result = ROCAUCEvaluator(\n            test_iterator, model, converter=concat_mols, device=device,\n            eval_func=model.predictor, name='test', ignore_labels=-1)()\n        print('ROCAUC Evaluation result: ', rocauc_result)\n\n        # add\n        for k, v in rocauc_result.items():\n            eval_result[k] = float(v)\n        #save_json(os.path.join(model_dir, 'rocauc_result.json'), rocauc_result)\n    else:\n        print('[WARNING] unknown task_type {}.'.format(task_type))\n\n    # Save the evaluation results.\n    save_json(os.path.join(model_dir, 'eval_result.json'), eval_result)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/molnet_wle/train_molnet_wle.py",
    "content": "#!/usr/bin/env python\nfrom __future__ import print_function\n\nimport argparse\nimport numpy\nimport os\nimport types\n\nimport pickle\n\nimport chainer\nfrom chainer import iterators\nfrom chainer import optimizers\nfrom chainer import training\n\nfrom chainer.training import extensions as E\n\nfrom chainer_chemistry.dataset.converters import converter_method_dict\nfrom chainer_chemistry.dataset.preprocessors import preprocess_method_dict, wle\nfrom chainer_chemistry import datasets as D\nfrom chainer_chemistry.datasets.molnet.molnet_config import molnet_default_config  # NOQA\nfrom chainer_chemistry.datasets import NumpyTupleDataset\nfrom chainer_chemistry.dataset.splitters.deepchem_scaffold_splitter import DeepChemScaffoldSplitter  # NOQA\nfrom chainer_chemistry.links import StandardScaler\nfrom chainer_chemistry.models.prediction import Classifier\nfrom chainer_chemistry.models.prediction import Regressor\nfrom chainer_chemistry.models.prediction import set_up_predictor\nfrom chainer_chemistry.training.extensions.auto_print_report import AutoPrintReport  # NOQA\nfrom chainer_chemistry.utils import save_json\nfrom chainer_chemistry.models.cwle.cwle_graph_conv_model import MAX_WLE_NUM\n\n\ndef dict_for_wles():\n    wle_keys = ['nfp_wle', 'ggnn_wle',  'relgat_wle', 'relgcn_wle', 'rsgcn_wle', 'gin_wle',\n                   'nfp_cwle', 'ggnn_cwle',  'relgat_cwle', 'relgcn_cwle', 'rsgcn_cwle', 'gin_cwle',\n                   'nfp_gwle', 'ggnn_gwle',  'relgat_gwle', 'relgcn_gwle', 'rsgcn_gwle', 'gin_gwle']\n\n    from chainer_chemistry.dataset.converters.concat_mols import concat_mols\n    from chainer_chemistry.dataset.preprocessors.nfp_preprocessor import NFPPreprocessor\n    from chainer_chemistry.dataset.preprocessors.ggnn_preprocessor import GGNNPreprocessor\n    from chainer_chemistry.dataset.preprocessors.gin_preprocessor import GINPreprocessor\n    from chainer_chemistry.dataset.preprocessors.relgat_preprocessor import RelGATPreprocessor\n    
from chainer_chemistry.dataset.preprocessors.relgcn_preprocessor import RelGCNPreprocessor\n    from chainer_chemistry.dataset.preprocessors.rsgcn_preprocessor import RSGCNPreprocessor\n\n    for key in wle_keys:\n        converter_method_dict[key] = concat_mols\n\n        if key.startswith('nfp'):\n            preprocess_method_dict[key] = NFPPreprocessor\n        elif key.startswith('ggnn'):\n            preprocess_method_dict[key] = GGNNPreprocessor\n        elif key.startswith('gin'):\n            preprocess_method_dict[key] = GINPreprocessor\n        elif key.startswith('relgcn'):\n            preprocess_method_dict[key] = RelGCNPreprocessor\n        elif key.startswith('rsgcn'):\n            preprocess_method_dict[key] = RSGCNPreprocessor\n        elif key.startswith('relgat'):\n            preprocess_method_dict[key] = RelGATPreprocessor\n        else:\n            assert key in wle_keys # should be die\ndict_for_wles()\n\ndef parse_arguments():\n    # Lists of supported preprocessing methods/models and datasets.\n    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',\n                   'relgat', 'gin', 'gnnfilm',\n                   'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm',\n                   'nfp_wle', 'ggnn_wle',  'relgat_wle', 'relgcn_wle', 'rsgcn_wle', 'gin_wle',\n                   'nfp_cwle', 'ggnn_cwle',  'relgat_cwle', 'relgcn_cwle', 'rsgcn_cwle', 'gin_cwle',\n                   'nfp_gwle', 'ggnn_gwle',  'relgat_gwle', 'relgcn_gwle', 'rsgcn_gwle', 'gin_gwle']\n    dataset_names = list(molnet_default_config.keys())\n    scale_list = ['standardize', 'none']\n\n    parser = argparse.ArgumentParser(description='molnet example')\n    parser.add_argument('--method', '-m', type=str, choices=method_list,\n                        help='method name', default='nfp')\n    parser.add_argument('--label', '-l', type=str, default='',\n                        help='target label for regression; empty string means '\n                        
'predicting all properties at once')\n    parser.add_argument('--conv-layers', '-c', type=int, default=4,\n                        help='number of convolution layers')\n    parser.add_argument('--batchsize', '-b', type=int, default=32,\n                        help='batch size')\n    parser.add_argument(\n        '--device', type=str, default='-1',\n        help='Device specifier. Either ChainerX device specifier or an '\n             'integer. If non-negative integer, CuPy arrays with specified '\n             'device id are used. If negative integer, NumPy arrays are used')\n    parser.add_argument('--out', '-o', type=str, default='result',\n                        help='path to save the computed model to')\n    parser.add_argument('--epoch', '-e', type=int, default=20,\n                        help='number of epochs')\n    parser.add_argument('--unit-num', '-u', type=int, default=16,\n                        help='number of units in one layer of the model')\n    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,\n                        default='bbbp',\n                        help='name of the dataset that training is run on')\n    parser.add_argument('--protocol', type=int, default=2,\n                        help='pickle protocol version')\n    parser.add_argument('--num-data', type=int, default=-1,\n                        help='amount of data to be parsed; -1 indicates '\n                        'parsing all data.')\n    parser.add_argument('--scale', type=str, choices=scale_list,\n                        help='label scaling method', default='standardize')\n    parser.add_argument('--adam-alpha', type=float, help='alpha of adam', default=0.001)\n\n    # WLE options\n    parser.add_argument('--cutoff-wle', type=int, default=0, help=\"set more than zero to cut-off WL expanded labels\")\n    parser.add_argument('--hop-num', '-k', type=int, default=1, help=\"The number of iterations of WLs\")\n\n    return parser.parse_args()\n\n\ndef 
dataset_part_filename(dataset_part, num_data):\n    \"\"\"Returns the filename corresponding to a train/valid/test parts of a\n    dataset, based on the amount of data samples that need to be parsed.\n    Args:\n        dataset_part: String containing any of the following 'train', 'valid'\n                      or 'test'.\n        num_data: Amount of data samples to be parsed from the dataset.\n    \"\"\"\n    if num_data >= 0:\n        return '{}_data_{}.npz'.format(dataset_part, str(num_data))\n    return '{}_data.npz'.format(dataset_part)\n\n\ndef download_entire_dataset(dataset_name, num_data, labels, method, cache_dir, apply_wle_flag=False, cutoff_wle=0, apply_cwle_flag=False, apply_gwle_flag=False, n_hop=1):\n    \"\"\"Downloads the train/valid/test parts of a dataset and stores them in the\n    cache directory.\n    Args:\n        dataset_name: Dataset to be downloaded.\n        num_data: Amount of data samples to be parsed from the dataset.\n        labels: Target labels for regression.\n        method: Method name. 
See `parse_arguments`.\n        cache_dir: Directory to store the dataset to.\n        apply_wle_flag: boolean, set True if you apply the naive WL embeddding\n        cutoff_wle: int set more than zero to cut off WEEs\n        apply_cwle_flag: boolean, set True if you apply Concatenating WLE (CWLE)\n        apply_gwle_flag: boolean, set True if you apply Gated-sum WLE (GWLE)\n    \"\"\"\n\n    print('Downloading {}...'.format(dataset_name))\n    preprocessor = preprocess_method_dict[method]()\n\n    # Select the first `num_data` samples from the dataset.\n    target_index = numpy.arange(num_data) if num_data >= 0 else None\n\n    # To force DeepChem scaffold split\n    dc_scaffold_splitter = DeepChemScaffoldSplitter()\n    dataset_parts = D.molnet.get_molnet_dataset(dataset_name, preprocessor,\n                                                labels=labels,\n                                                split=dc_scaffold_splitter,\n                                                target_index=target_index)\n\n    dataset_parts = dataset_parts['dataset']\n\n    # Cache the downloaded dataset.\n    if not os.path.exists(cache_dir):\n        os.makedirs(cache_dir)\n\n    # apply Neighboring Label Expansion\n    if apply_wle_flag:\n        dataset_parts_expand, labels_expanded, labels_frequency = wle.apply_wle_for_datasets(dataset_parts, cutoff_wle, n_hop)\n        dataset_parts = dataset_parts_expand\n        num_expanded_symbols = len(labels_expanded)\n        print(\"WLE Expanded Labels Applied to datasets: vocab=\", num_expanded_symbols)\n        print(labels_expanded)\n\n        # save in text\n        file_name = \"WLE_labels.dat\"\n        path = os.path.join(cache_dir, file_name)\n        with open(path, \"w\") as fout:\n            for label in labels_expanded:\n                fout.write(label + \" \" + str(labels_frequency[label]) + \"\\n\")\n\n        # save binaries\n        file_name = \"WLE_labels.pkl\"\n        outfile = cache_dir + \"/\" + file_name\n  
      with open(outfile, \"wb\") as fout:\n            pickle.dump( (labels_expanded, labels_frequency), fout)\n\n    elif apply_cwle_flag:\n        dataset_parts_expand, labels_expanded, labels_frequency = wle.apply_cwle_for_datasets(dataset_parts, n_hop)\n        dataset_parts = dataset_parts_expand\n        num_expanded_symbols = len(labels_expanded)\n        print(\"Concatenating WLE Expanded Labels Applied to datasets: vocab=\", num_expanded_symbols)\n        print(labels_expanded)\n\n        # save in text\n        file_name = \"CWLE_labels.dat\"\n        path = os.path.join(cache_dir, file_name)\n        with open(path, \"w\") as fout:\n            for label in labels_expanded:\n                fout.write(label + \" \" + str(labels_frequency[label]) + \"\\n\")\n\n        # save binaries\n        file_name = \"CWLE_labels.pkl\"\n        outfile = cache_dir + \"/\" + file_name\n        with open(outfile, \"wb\") as fout:\n            pickle.dump( (labels_expanded, labels_frequency), fout)\n\n    elif apply_gwle_flag:\n        dataset_parts_expand, labels_expanded, labels_frequency = wle.apply_cwle_for_datasets(dataset_parts, n_hop)\n        dataset_parts = dataset_parts_expand\n        num_expanded_symbols = len(labels_expanded)\n        print(\"Gated-sum WLE Expanded Labels Applied to datasets: vocab=\", num_expanded_symbols)\n        print(labels_expanded)\n\n        # save in text\n        file_name = \"GWLE_labels.dat\"\n        path = os.path.join(cache_dir, file_name)\n        with open(path, \"w\") as fout:\n            for label in labels_expanded:\n                fout.write(label + \" \" + str(labels_frequency[label]) + \"\\n\")\n\n        # save binaries\n        file_name = \"GWLE_labels.pkl\"\n        outfile = cache_dir + \"/\" + file_name\n        with open(outfile, \"wb\") as fout:\n            pickle.dump( (labels_expanded, labels_frequency), fout)\n\n\n    else:\n        labels_expanded = []\n\n    # ToDO: scaler should be placed here\n    # 
ToDo: fit the scaler\n    # ToDo: transform dataset_parts[0-2]\n\n    for i, part in enumerate(['train', 'valid', 'test']):\n        filename = dataset_part_filename(part, num_data)\n        path = os.path.join(cache_dir, filename)\n        if False:\n            print(type(dataset_parts[i]))\n            print(type(dataset_parts[i][0]))\n            print(type(dataset_parts[i][0][0]))\n            print(type(dataset_parts[i][0][1]))\n            print(type(dataset_parts[i][0][2]))\n            print(dataset_parts[i][0][0].shape)\n            print(dataset_parts[i][0][1].shape)\n            print(dataset_parts[i][0][2].shape)\n            print(dataset_parts[i][0][0].dtype)\n            print(dataset_parts[i][0][1].dtype)\n            print(dataset_parts[i][0][2].dtype)\n        NumpyTupleDataset.save(path, dataset_parts[i])\n\n    return dataset_parts\n\n\ndef fit_scaler(datasets):\n    \"\"\"Standardizes (scales) the dataset labels.\n    Args:\n        datasets: Tuple containing the datasets.\n    Returns:\n        Datasets with standardized labels and the scaler object.\n    \"\"\"\n    scaler = StandardScaler()\n\n    # Collect all labels in order to apply scaling over the entire dataset.\n    labels = None\n    offsets = []\n    for dataset in datasets:\n        if labels is None:\n            labels = dataset.get_datasets()[-1]\n        else:\n            labels = numpy.vstack([labels, dataset.get_datasets()[-1]])\n        offsets.append(len(labels))\n\n    scaler.fit(labels)\n\n    return scaler\n\n\ndef main():\n    args = parse_arguments()\n    print(args)\n\n    # Set up some useful variables that will be used later on.\n    dataset_name = args.dataset\n    method = args.method\n    num_data = args.num_data\n    n_unit = args.unit_num\n    conv_layers = args.conv_layers\n    adam_alpha = args.adam_alpha\n\n    cutoff_wle = args.cutoff_wle\n    n_hop = args.hop_num\n\n    apply_wle_flag = method in ['nfp_wle', 'ggnn_wle',  'relgat_wle', 'relgcn_wle', 
'rsgcn_wle', 'gin_wle']\n    apply_cwle_flag = method in ['nfp_cwle', 'ggnn_cwle',  'relgat_cwle', 'relgcn_cwle', 'rsgcn_cwle', 'gin_cwle']\n    apply_gwle_flag = method in ['nfp_gwle', 'ggnn_gwle',  'relgat_gwle', 'relgcn_gwle', 'rsgcn_gwle', 'gin_gwle']\n\n    task_type = molnet_default_config[dataset_name]['task_type']\n    model_filename = {'classification': 'classifier.pkl',\n                      'regression': 'regressor.pkl'}\n\n    print('Using dataset: {}...'.format(dataset_name))\n\n    # Set up some useful variables that will be used later on.\n    if args.label:\n        labels = args.label\n        cache_dir = os.path.join('input', '{}_{}_{}'.format(dataset_name,\n                                                            method, labels))\n        class_num = len(labels) if isinstance(labels, list) else 1\n    else:\n        labels = None\n        cache_dir = os.path.join('input', '{}_{}_all'.format(dataset_name,\n                                                             method))\n        class_num = len(molnet_default_config[args.dataset]['tasks'])\n\n    # Load the train and validation parts of the dataset.\n    filenames = [dataset_part_filename(p, num_data)\n                 for p in ['train', 'valid', 'test']]\n\n\n    # ToDo: We need to incoporeat scaler into download_entire_dataset, instead of predictors. 
\n    paths = [os.path.join(cache_dir, f) for f in filenames]\n    if all([os.path.exists(path) for path in paths]):\n        dataset_parts = []\n        for path in paths:\n            print('Loading cached dataset from {}.'.format(path))\n            dataset_parts.append(NumpyTupleDataset.load(path))\n    else:\n        dataset_parts = download_entire_dataset(dataset_name, num_data, labels,\n                                                method, cache_dir,\n                                                apply_wle_flag, cutoff_wle, apply_cwle_flag, apply_gwle_flag, n_hop)\n    train, valid = dataset_parts[0], dataset_parts[1]\n\n    # ToDo: scaler must be incorporated into download_entire_datasets. not here\n    # Scale the label values, if necessary.\n    scaler = None\n    if args.scale == 'standardize':\n        if task_type == 'regression':\n            print('Applying standard scaling to the labels.')\n            scaler = fit_scaler(dataset_parts)\n        else:\n            print('Label scaling is not available for classification tasks.')\n    else:\n        print('No label scaling was selected.')\n\n    # ToDo: set label_scaler always None\n    # Set up the predictor.\n\n    if apply_wle_flag:\n        # find the num_atoms\n        max_symbol_index = wle.findmaxidx(dataset_parts)\n        print(\"number of expanded symbols (WLE) = \", max_symbol_index)\n        predictor = set_up_predictor(\n            method, n_unit, conv_layers, class_num,\n            label_scaler=scaler, n_atom_types=max_symbol_index)\n    elif apply_cwle_flag or apply_gwle_flag:\n        n_wle_types = wle.findmaxidx(\n            dataset_parts, 'wle_label')\n        # Kenta Oono (oono@preferred.jp)\n        # In the previous implementation, we use MAX_WLE_NUM\n        # as the dimension of one-hot vectors for WLE labels\n        # when the model is CWLE or WLNE and hop_num k = 1.\n        # When k >= 2, # of wle labels can be larger than MAX_WLE_NUM,\n        # which causes an 
error.\n        # Therefore, we have increased the dimension of vectors.\n        # To align with the previous experiments,\n        # we change n_wle_types only if it exceeds MAX_WLE_NUM.\n        n_wle_types = max(n_wle_types, MAX_WLE_NUM)\n        print(\"number of expanded symbols (CWLE/GWLE) = \", n_wle_types)\n        predictor = set_up_predictor(\n            method, n_unit, conv_layers, class_num,\n            label_scaler=scaler, n_wle_types=n_wle_types)\n    else:\n        predictor = set_up_predictor(\n            method, n_unit, conv_layers, class_num,\n            label_scaler=scaler)\n\n    # Set up the iterators.\n    train_iter = iterators.SerialIterator(train, args.batchsize)\n    valid_iter = iterators.SerialIterator(valid, args.batchsize, repeat=False,\n                                          shuffle=False)\n\n    # Load metrics for the current dataset.\n    metrics = molnet_default_config[dataset_name]['metrics']\n    metrics_fun = {k: v for k, v in metrics.items()\n                   if isinstance(v, types.FunctionType)}\n    loss_fun = molnet_default_config[dataset_name]['loss']\n\n    device = chainer.get_device(args.device)\n    if task_type == 'regression':\n        model = Regressor(predictor, lossfun=loss_fun,\n                          metrics_fun=metrics_fun, device=device)\n    elif task_type == 'classification':\n        model = Classifier(predictor, lossfun=loss_fun,\n                           metrics_fun=metrics_fun, device=device)\n    else:\n        raise ValueError('Invalid task type ({}) encountered when processing '\n                         'dataset ({}).'.format(task_type, dataset_name))\n\n    # Set up the optimizer.\n    optimizer = optimizers.Adam(alpha=adam_alpha)\n    optimizer.setup(model)\n\n    # Save model-related output to this directory.\n    if not os.path.exists(args.out):\n        os.makedirs(args.out)\n    save_json(os.path.join(args.out, 'args.json'), vars(args))\n    model_dir = os.path.join(args.out, 
os.path.basename(cache_dir))\n    if not os.path.exists(model_dir):\n        os.makedirs(model_dir)\n\n    # Set up the updater.\n    converter = converter_method_dict[method]\n    updater = training.StandardUpdater(train_iter, optimizer, device=device,\n                                       converter=converter)\n\n    # Set up the trainer.\n    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=model_dir)\n    trainer.extend(E.Evaluator(valid_iter, model, device=device,\n                               converter=converter))\n    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))\n    trainer.extend(E.LogReport())\n\n    # TODO: consider go/no-go of the following block\n    # # (i) more reporting for val/evalutaion\n    # # (ii) best validation score snapshot\n    # if task_type == 'regression':\n    #     metric_name_list = list(metrics.keys())\n    #     if 'RMSE' in metric_name_list:\n    #         trainer.extend(E.snapshot_object(model, \"best_val_\" + model_filename[task_type]),\n    #                        trigger=training.triggers.MinValueTrigger('validation/main/RMSE'))\n    #     elif 'MAE' in metric_name_list:\n    #         trainer.extend(E.snapshot_object(model, \"best_val_\" + model_filename[task_type]),\n    #                        trigger=training.triggers.MinValueTrigger('validation/main/MAE'))\n    #     else:\n    #         print(\"[WARNING] No validation metric defined?\")\n    #\n    # elif task_type == 'classification':\n    #     train_eval_iter = iterators.SerialIterator(\n    #         train, args.batchsize, repeat=False, shuffle=False)\n    #     trainer.extend(ROCAUCEvaluator(\n    #         train_eval_iter, predictor, eval_func=predictor,\n    #         device=args.gpu, converter=concat_mols, name='train',\n    #         pos_labels=1, ignore_labels=-1, raise_value_error=False))\n    #     # extension name='validation' is already used by `Evaluator`,\n    #     # instead extension name `val` is used.\n    #     
trainer.extend(ROCAUCEvaluator(\n    #         valid_iter, predictor, eval_func=predictor,\n    #         device=args.gpu, converter=concat_mols, name='val',\n    #         pos_labels=1, ignore_labels=-1, raise_value_error=False))\n    #\n    #     trainer.extend(E.snapshot_object(\n    #         model, \"best_val_\" + model_filename[task_type]),\n    #         trigger=training.triggers.MaxValueTrigger('val/main/roc_auc'))\n    # else:\n    #     raise NotImplementedError(\n    #         'Not implemented task_type = {}'.format(task_type))\n\n    trainer.extend(AutoPrintReport())\n    trainer.extend(E.ProgressBar())\n    trainer.run()\n\n    # Save the model's parameters.\n    model_path = os.path.join(model_dir,  model_filename[task_type])\n    print('Saving the trained model to {}...'.format(model_path))\n    model.save_pickle(model_path, protocol=args.protocol)\n\n    # dump the parameter, if CWLE\n    #if apply_cwle_flag:\n    #    cwle = predictor.graph_conv\n    #    #print(cwle)\n    #    concatW = cwle.linear_for_concat_wle.W.data\n    #    #print(type(concatW))\n    #\n    #    # dump the raw W\n    #    out_prefix = args.out + \"/\" + method + \"_\" + dataset_name +\"_learnedW\"\n    #    with open(out_prefix + \".dat\", 'w') as fout:\n    #        import csv\n    #        writer = csv.writer(fout, lineterminator=\"\\n\")\n    #        writer.writerows(concatW)\n    #    # end with\n    #\n    #    import matplotlib\n    #    matplotlib.use('Agg')\n    #    import matplotlib.pyplot as plt\n    #    # visualize\n    #    fig1, ax1 = plt.subplots()\n    #    plt.imshow(concatW, cmap=\"jet\")\n    #    plt.colorbar(ax=ax1)\n    #\n    #    plt.title('Learned W on ' + dataset_name + ' + ' + method)\n    #    plt.savefig(out_prefix + \".png\")\n    #    plt.savefig(out_prefix + \".pdf\")\n    #\n    #    # visualize the absolute value\n    #    fig2, ax2 = plt.subplots()\n    #    plt.imshow(numpy.abs(concatW), cmap=\"jet\")\n    #    plt.colorbar(ax=ax2)\n    
#\n    #    plt.title('Learned abs(W) on ' + dataset_name + ' + ' + method)\n    #    plt.savefig(out_prefix + \"_abs.png\")\n    #    plt.savefig(out_prefix + \"_abs.pdf\")\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/network_graph/README.md",
    "content": "# Network Node Classification Example\n\nThis example performs semi-supervised node classification.\n\n## Dependencies\n\nBefore running the example, the following packages also need to be installed:\n\n- [`matplotlib`](https://matplotlib.org/)\n- [`seaborn`](https://seaborn.pydata.org/)\n- [`scikit-learn`](http://scikit-learn.org/stable/)\n\n\n## Supported dataset\n\n- [Cora](https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz)\n- [Citeseer](https://linqs-data.soe.ucsc.edu/public/lbc/citeseer.tgz)\n- [Reddit](https://s3.us-east-2.amazonaws.com/dgl.ai/dataset/reddit.zip)\n    - we use the dataset provided by [dmlc/dgl](https://github.com/dmlc/dgl/blob/master/python/dgl/data/reddit.py) repository.\n\nNote that dataset is downloaded automatically.\n\n## How to run the code\n\n### Train a model\n\nTo train a model, run the following:\n\nOn the CPU:\n```angular2html\npython train_network_graph.py --dataset cora\n```\n\nTrain sparse model with GPU:\n```angular2html\npython train_network_graph.py --dataset cora --device 0 --method gin_sparse\n```\n\n### Train a model with reddit dataset\n\nreddit dataset contains, it can run only with specific configuration.\nPlease turn on coo option to run training of reddit dataset.\n\n```angular2html\npython train_network_graph.py --dataset reddit --device 0 --method gin --coo true\n```\n"
  },
  {
    "path": "examples/network_graph/citeseer/.gitignore",
    "content": "citeseer.cites\nciteseer.content\nREADME"
  },
  {
    "path": "examples/network_graph/cora/.gitignore",
    "content": "cora.cites\ncora.content\nREADME"
  },
  {
    "path": "examples/network_graph/padding_model_wrapper.py",
    "content": "import chainer\nfrom chainer_chemistry.dataset.graph_dataset.base_graph_data import PaddingGraphData  # NOQA\n\n\nclass PaddingModelWrapper(chainer.Chain):\n    def __init__(self, predictor):\n        super(PaddingModelWrapper, self).__init__()\n        with self.init_scope():\n            self.predictor = predictor\n\n    def forward(self, data):\n        assert isinstance(data, PaddingGraphData)\n        return self.predictor(data.x, data.adj)\n"
  },
  {
    "path": "examples/network_graph/reddit/.gitignore",
    "content": "reddit.zip\nreddit_data.npz\nreddit_graph.npz"
  },
  {
    "path": "examples/network_graph/train_network_graph.py",
    "content": "import argparse\nfrom distutils.util import strtobool\nimport numpy\n\nfrom chainer_chemistry.datasets.citation_network.citation import citation_to_networkx  # NOQA\nfrom chainer_chemistry.datasets.citation_network.citeseer import \\\n    get_citeseer_dirpath\nfrom chainer_chemistry.datasets.citation_network.cora import get_cora_dirpath\nfrom chainer_chemistry.datasets.reddit.reddit import reddit_to_networkx, \\\n    get_reddit_dirpath\nfrom chainer_chemistry.dataset.networkx_preprocessors.base_networkx import BasePaddingNetworkxPreprocessor, BaseSparseNetworkxPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.graph_dataset.base_graph_data import PaddingGraphData  # NOQA\nfrom chainer_chemistry.utils.train_utils import run_node_classification_train\nfrom chainer_chemistry.models.prediction.node_classifier import NodeClassifier\nfrom chainer_chemistry.models.gin import GINSparse, GIN\nfrom chainer_chemistry.dataset.networkx_preprocessors.reddit_coo import get_reddit_coo_data  # NOQA\n\nfrom padding_model_wrapper import PaddingModelWrapper  # NOQA\n\n\ndef get_cora():\n    return citation_to_networkx(get_cora_dirpath(), \"cora\")\n\n\ndef get_citeseer():\n    return citation_to_networkx(get_citeseer_dirpath(), \"citeseer\")\n\n\ndef get_reddit():\n    return reddit_to_networkx(get_reddit_dirpath())\n\n\ndataset_dict = {\n    'cora': get_cora,\n    'citeseer': get_citeseer,\n    'reddit': get_reddit,\n}\nmethod_dict = {\n    'gin': GIN,\n    'gin_sparse': GINSparse,\n}\npreprocessor_dict = {\n    'gin': BasePaddingNetworkxPreprocessor,\n    'gin_sparse': BaseSparseNetworkxPreprocessor,\n}\n\n\ndef parse_arguments():\n    # Lists of supported preprocessing methods/models.\n    dataset_list = ['cora', 'citeseer', 'reddit']\n    method_list = ['gin', 'gin_sparse']\n\n    # Set up the argument parser.\n    parser = argparse.ArgumentParser(\n        description='Node classification on network a graph')\n    parser.add_argument('--dataset', type=str, 
choices=dataset_list,\n                        default='cora', help='dataset name')\n    parser.add_argument('--method', '-m', type=str, choices=method_list,\n                        default='gin_sparse', help='method name')\n    parser.add_argument('--conv-layers', '-c', type=int, default=2,\n                        help='number of convolution layers')\n    parser.add_argument(\n        '--device', '-d', type=str, default='-1',\n        help='Device specifier. Either ChainerX device specifier or an '\n             'integer. If non-negative integer, CuPy arrays with specified '\n             'device id are used. If negative integer, NumPy arrays are used')\n    parser.add_argument('--out', '-o', type=str, default='result',\n                        help='path to save the computed model to')\n    parser.add_argument('--epoch', '-e', type=int, default=20,\n                        help='number of epochs')\n    parser.add_argument('--unit-num', '-u', type=int, default=32,\n                        help='number of units in one layer of the model')\n    parser.add_argument('--seed', '-s', type=int, default=777,\n                        help='random seed value')\n    parser.add_argument('--train-data-ratio', '-r', type=float, default=0.2,\n                        help='ratio of training data w.r.t the dataset')\n    parser.add_argument('--dropout', type=float, default=0.0,\n                        help='dropout ratio')\n    parser.add_argument('--coo', type=strtobool, default='false',\n                        help='use Coo matrix')\n    return parser.parse_args()\n\n\ndef generate_random_mask(n, train_num, seed=777):\n    numpy.random.seed(seed)\n    mask = numpy.zeros(n, dtype=bool)\n    mask[:train_num] = True\n    numpy.random.shuffle(mask)\n    return mask, numpy.logical_not(mask)  # (train_mask, val_mask)\n\n\nif __name__ == '__main__':\n    args = parse_arguments()\n    if args.dataset == 'reddit' and args.coo:\n        # because it takes time to load reddit coo data 
via networkx\n        data = get_reddit_coo_data(get_reddit_dirpath())\n    else:\n        networkx_graph = dataset_dict[args.dataset]()\n        preprocessor = preprocessor_dict[args.method](use_coo=args.coo)\n        data = preprocessor.construct_data(networkx_graph)\n    print('label num: {}'.format(data.label_num))\n\n    gnn = method_dict[args.method](out_dim=None, node_embedding=True,\n                                   out_channels=data.label_num,\n                                   hidden_channels=args.unit_num,\n                                   n_update_layers=args.conv_layers,\n                                   dropout_ratio=args.dropout)\n\n    if isinstance(data, PaddingGraphData):\n        gnn = PaddingModelWrapper(gnn)\n\n    predictor = NodeClassifier(gnn, device=args.device)\n    train_label_num = int(data.n_nodes * args.train_data_ratio)\n    train_mask, valid_mask = generate_random_mask(\n        data.n_nodes, train_label_num)\n    print(\"train label: {}, validation label: {}\".format(\n        train_label_num, data.n_nodes - train_label_num))\n    run_node_classification_train(\n        predictor, data, train_mask, valid_mask,\n        epoch=args.epoch, device=args.device)\n"
  },
  {
    "path": "examples/own_dataset/README.md",
    "content": "# Example of using your own dataset\n\nThis example shows how to train models with your own dataset stored in the CSV format.\n\nA regression task is performed using [`Regressor`](http://chainer-chemistry.readthedocs.io/en/stable/generated/chainer_chemistry.models.Regressor.html#chainer_chemistry.models.Regressor). For a classification setting that makes use of [`Classifier`](http://chainer-chemistry.readthedocs.io/en/stable/generated/chainer_chemistry.models.Classifier.html#chainer_chemistry.models.Classifier), \nplease refer to the `tox21` example.\n\n## Dependencies\n\nBefore running the example, the following packages also need to be installed:\n\n- [`matplotlib`](https://matplotlib.org/)\n- [`seaborn`](https://seaborn.pydata.org/)\n- [`scikit-learn`](http://scikit-learn.org/stable/)\n\n## How to run the code\n\n### Dataset preparation\n\nPrepare a CSV file containing the training data samples, one per row. Each row contains the SMILES string of one molecule, followed by the (label) values of the molecule's desired properties. The first line of the CSV file contains label names.\n\nBelow you can find an example:\n\n```\nSMILES,value1,value2\nCC1CC1CN1CC1C,-0.2190999984741211,0.08590000122785568\nC#CCC(=N)OC=O,-0.2750999927520752,-0.032999999821186066\nCc1cnc(C=O)n1C,-0.23080000281333923,-0.053700000047683716\nN=COCC(C=O)CO,-0.26260000467300415,-0.043699998408555984\n[...]\n```\n\nSave one CSV file for training (e.g., `dataset_train.csv`) and one for testing (e.g., `dataset_test.csv`). 
Then pass them to the training and testing scripts, as shown below.\n\n### Train a model\n\nTo train a new model, run the following:\n```\npython train_own_dataset.py --datafile dataset_train.csv --label value1 value2\n```\n\nThe `--label` option specifies which columns in `dataset_train.csv` are trained.\nType `python train_own_dataset.py --help` to see the complete set of options.\n\n### Inference using a pretrained model\n\nTo perform inference using a pretrained model, run the following:\n```\npython predict_own_dataset.py --datafile dataset_test.csv --label value1 value2\n```\nType `python predict_own_dataset.py --help` to see the complete set of options.\n\n### Evaluation of implemented models\n\nTo evaluate the performance of the currently implemented models, run the following:\n```\nbash evaluate_own_dataset.sh [gpu_id] [epoch]\n```\nwhere `gpu_id` is the identifier of your GPU and `epoch` is the number of training epochs.\nTo run the code on CPU, set `gpu_id` to `-1`.\n\nThe scripts start the training process. Inference is then performed and evaluation metrics are reported. \nFor regression tasks (such as the current example), these are MAE and RMSE. \nOne plot per metric is created (saved as `eval_[metric]_own.png` in the example directory), which outputs these values as reported by the different models.\n"
  },
  {
    "path": "examples/own_dataset/dataset_test.csv",
    "content": "SMILES,value1,value2\nCC1CC1CN1CC1C,-0.2190999984741211,0.08590000122785568\nC#CCC(=N)OC=O,-0.2750999927520752,-0.032999999821186066\nCc1cnc(C=O)n1C,-0.23080000281333923,-0.053700000047683716\nN=COCC(C=O)CO,-0.26260000467300415,-0.043699998408555984\nCC1=C2CC3C(C1)C23C,-0.19580000638961792,-0.022700000554323196\nCC1C=CCC(C)C1O,-0.2443999946117401,0.019099999219179153\nC1c2n[nH]nc2C2CN12,-0.23350000381469727,-0.011800000444054604\nCOC1(C#N)CCC1C,-0.27480000257492065,0.02250000089406967\nN=CNC(=O)C1CCO1,-0.25049999356269836,-0.020800000056624413\nO=CC1(O)COC1=O,-0.27869999408721924,-0.06939999759197235\n"
  },
  {
    "path": "examples/own_dataset/dataset_train.csv",
    "content": "SMILES,value1,value2\nCC1=CC2CC(CC1)O2,-0.227400004863739,0.010400000028312206\nO=Cc1nccn1C=O,-0.2678000032901764,-0.09380000084638596\nCCC(C)(C)C(O)C=O,-0.2685000002384186,-0.038100000470876694\nC#CCC(C)(CO)OC,-0.2535000145435333,0.044599998742341995\nNc1coc(=O)nc1N,-0.2303999960422516,-0.04170000180602074\nCC12C=CC(CCC1)C2,-0.2312999963760376,0.02239999920129776\nCC12CCC1C2OC=O,-0.2605000138282776,0.005400000140070915\nCC1C2CC3(COC3)N12,-0.23430000245571136,0.0697999969124794\nO=C1NC=NC12CC2,-0.24070000648498535,-0.017000000923871994\nC1=CC2CN2CC2NC12,-0.22169999778270721,0.007699999958276749\nCC1C2COCC12O,-0.2467000037431717,0.07410000264644623\nCC(=O)C1OCOC1=O,-0.2590000033378601,-0.042500000447034836\nCC1N2C3CC1(C)C32,-0.2295999974012375,0.0835999995470047\nCC1=CC2OC2(C#N)C1,-0.25999999046325684,-0.019899999722838402\nOC1CCC1,-0.25600001215934753,0.08009999990463257\nC#CC1(O)COC1C#N,-0.2849000096321106,-0.01769999973475933\nCC1(C#N)CC12CCC2,-0.2685000002384186,0.03460000082850456\nCCCC(N)(C#N)CO,-0.25760000944137573,0.028999999165534973\nNC1=NC2(CC2)CC1=O,-0.22470000386238098,-0.053700000047683716\nC#CC12C3CC1(C)OC32,-0.2273000031709671,0.026900000870227814\nCC(C)C#CCC=O,-0.24539999663829803,-0.02669999934732914\nCC#CC(C=O)CC,-0.24169999361038208,-0.02539999969303608\nCC1OC2C1=CC1OC12,-0.2485000044107437,-0.01769999973475933\nCNC(=N)C(C#N)OC,-0.23420000076293945,-0.0013000000035390258\nC#CC(C#C)OCC=O,-0.26100000739097595,-0.031599998474121094\nCN1CC(O)C12CC2,-0.20479999482631683,0.08730000257492065\nOC1C2C3OC4C1C2C34,-0.24469999969005585,0.04230000078678131\nOCC1C(O)C2CC12O,-0.24169999361038208,0.05739999935030937\nO=C([O-])C12[NH2+]CC1C2O,-0.2508000135421753,-0.0003000000142492354\nCn1cc(O)c(CO)n1,-0.2045000046491623,0.01850000023841858\nO=C1COC2C3OC2C13,-0.2498999983072281,-0.03700000047683716\nC1#CCCOC=NCC1,-0.24279999732971191,0.012600000016391277\nO=c1ocncc1CO,-0.2563000023365021,-0.06289999932050705\nCC1NC1C(O)C(N)=O,-0.2547999918460846,0.
023800000548362732\nCC1OC(=N)CC2CC21,-0.2498999983072281,0.032499998807907104\nOC12CCC3CN3C1C2,-0.21709999442100525,0.07280000299215317\nC#CC(CCO)OC,-0.2581999897956848,0.033900000154972076\nCCC1COC(CO)=N1,-0.2540999948978424,0.019200000911951065\nON=C1C=CC2C(O)C12,-0.2184000015258789,-0.04349999874830246\nCN=c1cconn1,-0.23919999599456787,-0.037700001150369644\nCC1(C)CC2CC2C1O,-0.2540999948978424,0.066600002348423\nCCC1CCC(=N)O1,-0.2526000142097473,0.032600000500679016\nO=C1C2CCC1C1NC21,-0.2282000035047531,-0.00279999990016222\nCCOc1ccc(C)o1,-0.19059999287128448,0.033799998462200165\nO=C1C2CC3C4C2C1N34,-0.23479999601840973,-0.026100000366568565\nO=C1C=CCC=CC1=O,-0.24130000174045563,-0.08780000358819962\nCc1cc(F)c[nH]c1=O,-0.2117999941110611,-0.042100001126527786\nCC1=CCc2nocc21,-0.22419999539852142,-0.019200000911951065\nN#CC1(O)CN=COC1,-0.26980000734329224,-0.002400000113993883\nNc1n[nH]cc1N1CC1,-0.18649999797344208,0.03739999979734421\nCN1C2CC3(O)C1C23C,-0.19619999825954437,0.07779999822378159\nN=c1nccco1,-0.23680000007152557,-0.0689999982714653\nCOC12COC1(C)C2C,-0.22339999675750732,0.07020000368356705\nCCOC1COC(=N)O1,-0.2547000050544739,0.0560000017285347\nCOC1(C(N)=O)CC1,-0.23800000548362732,0.0284000001847744\nC#CCC#CC1NC1C,-0.23970000445842743,0.03180000185966492\nC1NC1CN1C2CCC21,-0.2379000037908554,0.06539999693632126\nCC(O)c1cc(N)[nH]n1,-0.21449999511241913,0.029899999499320984\nCC1(O)C(O)C1C=O,-0.24230000376701355,-0.022099999710917473\nC#CC1(C)C2C3OC3C21,-0.23819999396800995,0.025800000876188278\nc1c[nH]c2cccc-2c1,-0.17229999601840973,-0.037300001829862595\nCCC1(O)C(C)C1C=O,-0.24089999496936798,-0.01810000091791153\nC1=C2C(CC1)CC1NC21,-0.2231999933719635,0.01940000057220459\nC#CC1C2C(O)C1C2O,-0.24420000612735748,0.041999999433755875\nCC1(C)CN2CC(C2)O1,-0.2093999981880188,0.07599999755620956\nCC1OC1C1C2CN1C2,-0.22990000247955322,0.08429999649524689\nCC(=O)C12CC(=O)C1C2,-0.25049999356269836,-0.04270000010728836\nCC12C3=NCC1CC2O3,-0.23119999468326569,-0.01659
9999740719795\nc1cc2onnc2[nH]1,-0.23520000278949738,-0.042399998754262924\nO=CCCC1OC2CC12,-0.24369999766349792,-0.01850000023841858\nOCCC1C2C3CC3N12,-0.2175000011920929,0.06040000170469284\nOCC#CC1CC1,-0.23720000684261322,0.03359999880194664\nOC1C2CC3C1N1C2C31,-0.22709999978542328,0.0640999972820282\nCC1(C=O)C=CC(=O)N1,-0.25369998812675476,-0.05649999901652336\nCC1CC23CC12CCO3,-0.20999999344348907,0.08139999955892563\nCC(O)(C(N)=O)C1CO1,-0.24469999969005585,0.02889999933540821\nCC1=NC2(CC2)C(=N)N1,-0.2134999930858612,0.0024999999441206455\nN#CCCC(=O)C(N)=O,-0.25949999690055847,-0.08160000294446945\nCC(O)(C#N)COC=N,-0.27379998564720154,0.00570000009611249\nCC12C=CC(C)(N1)C2O,-0.22859999537467957,-0.0012000000569969416\nCC12COC1CCO2,-0.2468000054359436,0.07940000295639038\nc1noc2c1CCOC2,-0.24819999933242798,-0.010700000450015068\nC#CC1CCCCOC1,-0.2467000037431717,0.053599998354911804\nCN1C2C3OC2(C=O)C31,-0.23469999432563782,-0.04619999974966049\nCCn1cc(O)nn1,-0.22519999742507935,0.0013000000035390258\nCCOC(=NC)C(C)=O,-0.23420000076293945,-0.05640000104904175\nCC12CC1(C#N)C1CC12,-0.26750001311302185,0.02070000022649765\nCC(=O)C1OC1CC=O,-0.251800000667572,-0.04360000044107437\nNc1cc(=O)cno1,-0.23770000040531158,-0.053700000047683716\nO=C1CC=CCC1O,-0.25519999861717224,-0.027300000190734863\n"
  },
  {
    "path": "examples/own_dataset/evaluate_own_dataset.sh",
    "content": "#!/usr/bin/env bash\n\nset -e\n\n# List of available graph convolution methods.\nmethods=(nfp ggnn schnet weavenet rsgcn relgcn relgat megnet)\n\n# device identifier; set it to -1 to train on the CPU (default).\ndevice=${1:--1}\n# Number of training epochs (default: 1).\nepoch=${2:-1}\n\nfor method in ${methods[@]}\ndo\n    # Train with the current method.\n    python train_own_dataset.py \\\n        --method ${method} \\\n        --label value1 \\\n        --conv-layers 1 \\\n        --device ${device} \\\n        --epoch ${epoch} \\\n        --unit-num 10 \\\n        --out eval_${method}\n\n    # Run inference on the test set.\n    python predict_own_dataset.py \\\n        --method ${method} \\\n        --label value1 \\\n        --conv-layers 1 \\\n        --device ${device} \\\n        --epoch ${epoch} \\\n        --unit-num 10 \\\n        --in-dir eval_${method} \\\n        --out eval_${method}\ndone\n\n# Create plot showing the evaluation performance.\npython plot.py --prefix eval_ --methods ${methods[@]}\n"
  },
  {
    "path": "examples/own_dataset/plot.py",
    "content": "#!/usr/bin/env python\n\nimport argparse\nimport json\nimport matplotlib.pyplot as plt\nimport os\nimport seaborn as sns\n\n\ndef save_evaluation_plot(x, y, metric, filename):\n    plt.figure()\n\n    sns.set()\n    ax = sns.barplot(y=x, x=y)\n\n    for n, (label, _y) in enumerate(zip(x, y)):\n        ax.annotate(\n            '{:.3f}'.format(abs(_y)),\n            xy=(_y, n),\n            ha='right',\n            va='center',\n            xytext=(-5, 0),\n            textcoords='offset points',\n            color='white')\n\n    plt.title('Performance on own dataset')\n    plt.xlabel(metric)\n    plt.savefig(filename)\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--prefix', required=True)\n    parser.add_argument('--methods', nargs='+', required=True)\n    args = parser.parse_args()\n\n    metrics = ['mean_abs_error', 'root_mean_sqr_error']\n    x = args.methods\n    y = {metric: [] for metric in metrics}\n\n    for method in args.methods:\n        with open(os.path.join(args.prefix + method, 'eval_result.json')) as f:\n            result = json.load(f)\n            for metric in metrics:\n                y[metric].append(result['main/' + metric])\n\n    for metric in metrics:\n        save_evaluation_plot(\n            x, y[metric], metric, 'eval_' + metric + '_own.png')\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/own_dataset/predict_own_dataset.py",
    "content": "#!/usr/bin/env python\n\nfrom __future__ import print_function\n\n\nimport chainer\nimport numpy\nimport os\n\nfrom argparse import ArgumentParser\nfrom chainer.iterators import SerialIterator\nfrom chainer.training.extensions import Evaluator\n\nfrom chainer_chemistry.models.prediction import Regressor\nfrom chainer_chemistry.dataset.parsers import CSVFileParser\nfrom chainer_chemistry.dataset.converters import converter_method_dict\nfrom chainer_chemistry.dataset.preprocessors import preprocess_method_dict\n\n# These imports are necessary for pickle to work.\nfrom chainer_chemistry.links.scaler.standard_scaler import StandardScaler  # NOQA\nfrom chainer_chemistry.models.prediction import GraphConvPredictor  # NOQA\nfrom chainer_chemistry.utils import save_json\nfrom train_own_dataset import rmse\n\n\ndef parse_arguments():\n    # Lists of supported preprocessing methods/models.\n    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',\n                   'relgat', 'megnet']\n    scale_list = ['standardize', 'none']\n\n    # Set up the argument parser.\n    parser = ArgumentParser(description='Regression on own dataset')\n    parser.add_argument('--datafile', '-d', type=str,\n                        default='dataset_test.csv',\n                        help='csv file containing the dataset')\n    parser.add_argument('--method', '-m', type=str, choices=method_list,\n                        help='method name', default='nfp')\n    parser.add_argument('--label', '-l', nargs='+',\n                        default=['value1', 'value2'],\n                        help='target label for regression')\n    parser.add_argument('--scale', type=str, choices=scale_list,\n                        help='label scaling method', default='standardize')\n    parser.add_argument('--conv-layers', '-c', type=int, default=4,\n                        help='number of convolution layers')\n    parser.add_argument('--batchsize', '-b', type=int, default=32,\n       
                 help='batch size')\n    parser.add_argument(\n        '--device', type=str, default='-1',\n        help='Device specifier. Either ChainerX device specifier or an '\n             'integer. If non-negative integer, CuPy arrays with specified '\n             'device id are used. If negative integer, NumPy arrays are used')\n    parser.add_argument('--out', '-o', type=str, default='result',\n                        help='path to save the computed model to')\n    parser.add_argument('--epoch', '-e', type=int, default=10,\n                        help='number of epochs')\n    parser.add_argument('--unit-num', '-u', type=int, default=16,\n                        help='number of units in one layer of the model')\n    parser.add_argument('--protocol', type=int, default=2,\n                        help='pickle protocol version')\n    parser.add_argument('--in-dir', '-i', type=str, default='result',\n                        help='directory containing the saved model')\n    parser.add_argument('--model-filename', type=str, default='regressor.pkl',\n                        help='saved model filename')\n    return parser.parse_args()\n\n\ndef main():\n    # Parse the arguments.\n    args = parse_arguments()\n\n    if args.label:\n        labels = args.label\n    else:\n        raise ValueError('No target label was specified.')\n\n    # Dataset preparation.\n    def postprocess_label(label_list):\n        return numpy.asarray(label_list, dtype=numpy.float32)\n\n    print('Preprocessing dataset...')\n    preprocessor = preprocess_method_dict[args.method]()\n    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,\n                           labels=labels, smiles_col='SMILES')\n    dataset = parser.parse(args.datafile)['dataset']\n\n    test = dataset\n\n    print('Predicting...')\n    # Set up the regressor.\n    device = chainer.get_device(args.device)\n    model_path = os.path.join(args.in_dir, args.model_filename)\n    regressor = 
Regressor.load_pickle(model_path, device=device)\n\n    # Perform the prediction.\n    print('Evaluating...')\n    converter = converter_method_dict[args.method]\n    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)\n    eval_result = Evaluator(test_iterator, regressor, converter=converter,\n                            device=device)()\n    print('Evaluation result: ', eval_result)\n\n    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/own_dataset/test_own_dataset.sh",
    "content": "#!/usr/bin/env bash\nset -e\n\n# device specifier given from first argument, default value is -1\ndevice=${1:--1}\nfor method in nfp ggnn schnet weavenet rsgcn relgcn megnet\ndo\n    python train_own_dataset.py --datafile dataset_train.csv --method ${method} --label value1 --conv-layers 1 --device ${device} --epoch 1 --unit-num 10 --batchsize 32 --out eval_${method}\n    python predict_own_dataset.py --datafile dataset_test.csv --method ${method} --label value1 --conv-layers 1 --device ${device} --epoch 1 --unit-num 10 --in-dir eval_${method} --out eval_${method}\ndone\n"
  },
  {
    "path": "examples/own_dataset/train_own_dataset.py",
    "content": "#!/usr/bin/env python\n\nfrom __future__ import print_function\n\nimport chainer\nimport numpy\nimport os\n\nfrom argparse import ArgumentParser\nfrom chainer.datasets import split_dataset_random\nfrom chainer import functions as F\n\n\nfrom chainer_chemistry.dataset.parsers import CSVFileParser\nfrom chainer_chemistry.dataset.converters import converter_method_dict\nfrom chainer_chemistry.dataset.preprocessors import preprocess_method_dict\nfrom chainer_chemistry.links.scaler.standard_scaler import StandardScaler\nfrom chainer_chemistry.models import Regressor\nfrom chainer_chemistry.models.prediction import set_up_predictor\nfrom chainer_chemistry.utils import run_train\n\n\ndef rmse(x0, x1):\n    return F.sqrt(F.mean_squared_error(x0, x1))\n\n\ndef parse_arguments():\n    # Lists of supported preprocessing methods/models.\n    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',\n                   'relgat', 'mpnn', 'gnnfilm', 'megnet']\n    scale_list = ['standardize', 'none']\n\n    # Set up the argument parser.\n    parser = ArgumentParser(description='Regression on own dataset')\n    parser.add_argument('--datafile', '-d', type=str,\n                        default='dataset_train.csv',\n                        help='csv file containing the dataset')\n    parser.add_argument('--method', '-m', type=str, choices=method_list,\n                        help='method name', default='nfp')\n    parser.add_argument('--label', '-l', nargs='+',\n                        default=['value1', 'value2'],\n                        help='target label for regression')\n    parser.add_argument('--scale', type=str, choices=scale_list,\n                        help='label scaling method', default='standardize')\n    parser.add_argument('--conv-layers', '-c', type=int, default=4,\n                        help='number of convolution layers')\n    parser.add_argument('--batchsize', '-b', type=int, default=32,\n                        help='batch 
size')\n    parser.add_argument(\n        '--device', type=str, default='-1',\n        help='Device specifier. Either ChainerX device specifier or an '\n             'integer. If non-negative integer, CuPy arrays with specified '\n             'device id are used. If negative integer, NumPy arrays are used')\n    parser.add_argument('--out', '-o', type=str, default='result',\n                        help='path to save the computed model to')\n    parser.add_argument('--epoch', '-e', type=int, default=10,\n                        help='number of epochs')\n    parser.add_argument('--unit-num', '-u', type=int, default=16,\n                        help='number of units in one layer of the model')\n    parser.add_argument('--seed', '-s', type=int, default=777,\n                        help='random seed value')\n    parser.add_argument('--train-data-ratio', '-r', type=float, default=0.7,\n                        help='ratio of training data w.r.t the dataset')\n    parser.add_argument('--protocol', type=int, default=2,\n                        help='pickle protocol version')\n    parser.add_argument('--model-filename', type=str, default='regressor.pkl',\n                        help='saved model filename')\n    return parser.parse_args()\n\n\ndef main():\n    # Parse the arguments.\n    args = parse_arguments()\n\n    if args.label:\n        labels = args.label\n        class_num = len(labels) if isinstance(labels, list) else 1\n    else:\n        raise ValueError('No target label was specified.')\n\n    # Dataset preparation. 
Postprocessing is required for the regression task.\n    def postprocess_label(label_list):\n        return numpy.asarray(label_list, dtype=numpy.float32)\n\n    # Apply a preprocessor to the dataset.\n    print('Preprocessing dataset...')\n    preprocessor = preprocess_method_dict[args.method]()\n    parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,\n                           labels=labels, smiles_col='SMILES')\n    dataset = parser.parse(args.datafile)['dataset']\n\n    # Scale the label values, if necessary.\n    if args.scale == 'standardize':\n        scaler = StandardScaler()\n        scaler.fit(dataset.get_datasets()[-1])\n    else:\n        scaler = None\n\n    # Split the dataset into training and validation.\n    train_data_size = int(len(dataset) * args.train_data_ratio)\n    train, _ = split_dataset_random(dataset, train_data_size, args.seed)\n\n    # Set up the predictor.\n    predictor = set_up_predictor(\n        args.method, args.unit_num,\n        args.conv_layers, class_num, label_scaler=scaler)\n\n    # Set up the regressor.\n    device = chainer.get_device(args.device)\n    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}\n    regressor = Regressor(predictor, lossfun=F.mean_squared_error,\n                          metrics_fun=metrics_fun, device=device)\n\n    print('Training...')\n    converter = converter_method_dict[args.method]\n    run_train(regressor, train, valid=None,\n              batch_size=args.batchsize, epoch=args.epoch,\n              out=args.out, extensions_list=None,\n              device=device, converter=converter,\n              resume_path=None)\n\n    # Save the regressor's parameters.\n    model_path = os.path.join(args.out, args.model_filename)\n    print('Saving the trained model to {}...'.format(model_path))\n\n    # TODO(nakago): ChainerX array cannot be sent to numpy array when internal\n    # state has gradients.\n    if hasattr(regressor.predictor.graph_conv, 'reset_state'):\n    
    regressor.predictor.graph_conv.reset_state()\n\n    regressor.save_pickle(model_path, protocol=args.protocol)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/qm9/README.md",
    "content": "# QM9 Regression Example\n\nThis example performs regression on the QM9 dataset.\n\n## Dependencies\n\nBefore running the example, the following packages also need to be installed:\n\n- [`matplotlib`](https://matplotlib.org/)\n- [`seaborn`](https://seaborn.pydata.org/)\n- [`scikit-learn`](http://scikit-learn.org/stable/)\n\n## How to run the code\n\n### Train a model\n\nTo train a model, run the following:\n\nOn the CPU:\n```angular2html\npython train_qm9.py\n```\n\nOn the GPU:\n```angular2html\npython train_qm9.py -g 0\n```\n\n### Inference using a pretrained model\n\nAs of v0.3.0, the `Regressor` class has been introduced, which provides the\n`predict` method for easier inference. `Regressor` also supports the\n`load_pickle` method, which allows for loading of a pretrained model, using the\n`pickle` library.\n\nThe perform inference using a pretrained model, run the following:\n\nOn the CPU:\n```\npython predict_qm9.py [-i /path/to/training/result/directory]\n```\n\nOn the GPU:\n```\npython predict_qm9.py -g 0 [-i /path/to/training/result/directory]\n```\n\n### Evaluation of implemented models\n\nTo evaluate the performance of the currently implemented models, run the\nfollowing:\n\nOn the CPU:\n```\nbash evaluate_models_qm9.sh -1 [epoch]\n```\n\nOn the GPU:\n```\nbash evaluate_models_qm9.sh 0 [epoch]\n```\n\nThis scripts start the training process for a number of `epoch` epochs per\nmodel. Inference is then performed and evaluation metrics are reported. For\nregression tasks (such as with QM9), these are MAE and RMSE. One plot per\nmetric is then createad (saved as `eval_[metric]_qm9.png` in the example\ndirectory), which outputs these values as reported by the diffent models.\n"
  },
  {
    "path": "examples/qm9/evaluate_models_qm9.sh",
    "content": "set -eu\n\n# List of available graph convolution methods.\nmethods=(nfp ggnn schnet weavenet rsgcn relgcn relgat megnet)\n\nprefix=eval_\n\n# device identifier; set it to -1 to train on the CPU (default).\ndevice=${1:--1}\n# Number of training epochs (default: 1).\nepoch=${2:-1}\nlabel=${3:-all}\n\necho evaluating label ${label}\n\nfor method in ${methods[@]}\ndo\n    result_dir=${prefix}${method}\n\n    python train_qm9.py \\\n        --method ${method} \\\n        --device ${device} \\\n        --out ${result_dir} \\\n        --epoch ${epoch} \\\n        --label ${label}\n\n    python predict_qm9.py \\\n        --in-dir ${result_dir} \\\n        --method ${method} \\\n        --label ${label}\ndone\n\npython plot.py --prefix ${prefix} --methods ${methods[@]}\n"
  },
  {
    "path": "examples/qm9/plot.py",
    "content": "#! -*- coding: utf-8 -*-\nimport argparse\nimport json\nfrom collections import defaultdict\n\nimport matplotlib.pyplot as plt\nimport os\nimport seaborn as sns\nfrom chainer_chemistry.utils import load_json\n\n\ndef save_evaluation_plot(x, y, metric, filename):\n    plt.figure()\n\n    sns.set()\n    ax = sns.barplot(y=x, x=y)\n\n    for n, (label, _y) in enumerate(zip(x, y)):\n        ax.annotate(\n            s='{:.4g}'.format(abs(_y)),\n            xy=(_y, n),\n            ha='left',\n            va='center',\n            xytext=(5, 0),\n            textcoords='offset points',\n            color='gray')\n\n    plt.title('Performance on qm9: {}'.format(metric))\n    plt.xlabel(metric)\n    plt.savefig(filename)\n    plt.close()\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n    parser.add_argument('--prefix', required=True)\n    parser.add_argument('--methods', nargs='+', required=True)\n    args = parser.parse_args()\n\n    x = args.methods\n    y = defaultdict(list)\n\n    for method in args.methods:\n        result = load_json(os.path.join(\n            args.prefix + method, 'eval_result_mae.json'))\n        for label, value in result.items():\n            y[label].append(value)\n\n    for label in y.keys():\n        save_evaluation_plot(\n            x, y[label], label, 'eval_qm9_{}_mae.png'.format(label))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/qm9/predict_qm9.py",
    "content": "#!/usr/bin/env python\nfrom __future__ import print_function\n\nimport argparse\nimport os\n\nimport chainer\nimport numpy\nimport pandas\n\nfrom chainer.datasets import split_dataset_random\nfrom chainer.iterators import SerialIterator\nfrom chainer.training.extensions import Evaluator\n\nfrom chainer_chemistry.dataset.converters import converter_method_dict\nfrom chainer_chemistry.dataset.preprocessors import preprocess_method_dict\nfrom chainer_chemistry import datasets as D\nfrom chainer_chemistry.datasets import NumpyTupleDataset\nfrom chainer_chemistry.models.prediction import Regressor\nfrom chainer_chemistry.utils import save_json\n\n# These import is necessary for pickle to work\nfrom chainer_chemistry.links.scaler.standard_scaler import StandardScaler  # NOQA\nfrom chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor  # NOQA\nfrom train_qm9 import rmse\n\n\ndef parse_arguments():\n    # Lists of supported preprocessing methods/models.\n    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',\n                   'relgat', 'gin', 'gnnfilm', 'relgcn_sparse', 'gin_sparse',\n                   'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm', 'megnet']\n    label_names = ['A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2',\n                   'zpve', 'U0', 'U', 'H', 'G', 'Cv']\n    scale_list = ['standardize', 'none']\n\n    # Set up the argument parser.\n    parser = argparse.ArgumentParser(description='Regression on QM9.')\n    parser.add_argument('--method', '-m', type=str, choices=method_list,\n                        help='method name', default='nfp')\n    parser.add_argument('--label', '-l', type=str,\n                        choices=label_names + ['all'], default='all',\n                        help='target label for regression; all means '\n                        'predicting all properties at once')\n    parser.add_argument('--scale', type=str, choices=scale_list,\n                    
    help='label scaling method', default='standardize')\n    parser.add_argument(\n        '--device', '-d', type=str, default='-1',\n        help='Device specifier. Either ChainerX device specifier or an '\n             'integer. If non-negative integer, CuPy arrays with specified '\n             'device id are used. If negative integer, NumPy arrays are used')\n    parser.add_argument('--seed', '-s', type=int, default=777,\n                        help='random seed value')\n    parser.add_argument('--train-data-ratio', '-r', type=float, default=0.7,\n                        help='ratio of training data w.r.t the dataset')\n    parser.add_argument('--in-dir', '-i', type=str, default='result',\n                        help='directory to load model data from')\n    parser.add_argument('--model-filename', type=str, default='regressor.pkl',\n                        help='saved model filename')\n    parser.add_argument('--num-data', type=int, default=-1,\n                        help='amount of data to be parsed; -1 indicates '\n                        'parsing all data.')\n    return parser.parse_args()\n\n\ndef main():\n    # Parse the arguments.\n    args = parse_arguments()\n    device = chainer.get_device(args.device)\n\n    # Set up some useful variables that will be used later on.\n    method = args.method\n    if args.label != 'all':\n        label = args.label\n        cache_dir = os.path.join('input', '{}_{}'.format(method, label))\n        labels = [label]\n    else:\n        labels = D.get_qm9_label_names()\n        cache_dir = os.path.join('input', '{}_all'.format(method))\n\n    # Get the filename corresponding to the cached dataset, based on the amount\n    # of data samples that need to be parsed from the original dataset.\n    num_data = args.num_data\n    if num_data >= 0:\n        dataset_filename = 'data_{}.npz'.format(num_data)\n    else:\n        dataset_filename = 'data.npz'\n\n    # Load the cached dataset.\n    dataset_cache_path = 
os.path.join(cache_dir, dataset_filename)\n\n    dataset = None\n    if os.path.exists(dataset_cache_path):\n        print('Loading cached data from {}.'.format(dataset_cache_path))\n        dataset = NumpyTupleDataset.load(dataset_cache_path)\n    if dataset is None:\n        print('Preprocessing dataset...')\n        preprocessor = preprocess_method_dict[method]()\n        if num_data >= 0:\n            # Select the first `num_data` samples from the dataset.\n            target_index = numpy.arange(num_data)\n            dataset = D.get_qm9(preprocessor, labels=labels,\n                                target_index=target_index)\n        else:\n            # Load the entire dataset.\n            dataset = D.get_qm9(preprocessor, labels=labels)\n\n        # Cache the newly preprocessed dataset.\n        if not os.path.exists(cache_dir):\n            os.mkdir(cache_dir)\n        if isinstance(dataset, NumpyTupleDataset):\n            NumpyTupleDataset.save(dataset_cache_path, dataset)\n\n    # Use a predictor with scaled output labels.\n    model_path = os.path.join(args.in_dir, args.model_filename)\n    regressor = Regressor.load_pickle(model_path, device=device)\n\n    # Split the dataset into training and testing.\n    train_data_size = int(len(dataset) * args.train_data_ratio)\n    _, test = split_dataset_random(dataset, train_data_size, args.seed)\n\n    # This callback function extracts only the inputs and discards the labels.\n    # TODO(nakago): consider how to switch which `converter` to use.\n    if isinstance(dataset, NumpyTupleDataset):\n        converter = converter_method_dict[method]\n\n        @chainer.dataset.converter()\n        def extract_inputs(batch, device=None):\n            return converter(batch, device=device)[:-1]\n\n        # Extract the ground-truth labels as numpy array.\n        original_t = converter(test, device=-1)[-1]\n    else:\n        converter = dataset.converter\n        extract_inputs = converter\n\n        # Extract the 
ground-truth labels as numpy array.\n        original_t = converter(test, device=-1).y\n\n    # Predict the output labels.\n    print('Predicting...')\n    y_pred = regressor.predict(\n        test, converter=extract_inputs)\n\n    df_dict = {}\n    for i, l in enumerate(labels):\n        df_dict.update({'y_pred_{}'.format(l): y_pred[:, i],\n                        't_{}'.format(l): original_t[:, i], })\n    df = pandas.DataFrame(df_dict)\n\n    # Show a prediction/ground truth table with 5 random examples.\n    print(df.sample(5))\n\n    n_eval = 10\n    for target_label in range(y_pred.shape[1]):\n        label_name = labels[target_label]\n        diff = y_pred[:n_eval, target_label] - original_t[:n_eval,\n                                                          target_label]\n        print('label_name = {}, y_pred = {}, t = {}, diff = {}'\n              .format(label_name, y_pred[:n_eval, target_label],\n                      original_t[:n_eval, target_label], diff))\n\n    # Run an evaluator on the test dataset.\n    print('Evaluating...')\n    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)\n    eval_result = Evaluator(test_iterator, regressor, converter=converter,\n                            device=device)()\n    print('Evaluation result: ', eval_result)\n    # Save the evaluation results.\n    save_json(os.path.join(args.in_dir, 'eval_result.json'), eval_result)\n\n    # Calculate mean abs error for each label\n    mae = numpy.mean(numpy.abs(y_pred - original_t), axis=0)\n    eval_result = {}\n    for i, l in enumerate(labels):\n        eval_result.update({l: mae[i]})\n    save_json(os.path.join(args.in_dir, 'eval_result_mae.json'), eval_result)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/qm9/qm9_dataset_exploration.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## QM9 Dataset exploration\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The purpose of this notebook is as follows,\\n\",\n    \"\\n\",\n    \" - Explain [QM9 dataset](http://quantum-machine.org/datasets/): Check the labels and visualization of molecules to understand what kind of data are stored.\\n\",\n    \" - Explain internal structure of QM9 dataset in `chainer_chemistry`: We handle the dataset with `NumpyTupleDataset`.\\n\",\n    \" - Explain how `preprocessor` and `parser` work on `chainer_chemistry`: One concrete example using `GGNNPreprocessor` is explained.\\n\",\n    \"\\n\",\n    \"It is out of scope of this notebook to explain how to train graph convolutional network using this dataset, please refer [document tutorial](http://chainer-chemistry.readthedocs.io/en/latest/tutorial.html#) or try `train_qm9.py` in [QM9 example](https://github.com/pfnet-research/chainer-chemistry/tree/master/examples/qm9) for the model training.\\n\",\n    \"\\n\",\n    \"[Note]\\n\",\n    \"This notebook is executed on 1, March, 2018.\\n\",\n    \"The behavior of QM9 dataset in `chainer_chemistry` might change in the future.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Loading modules and set loglevel.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import logging\\n\",\n    \"from rdkit import RDLogger\\n\",\n    \"from chainer_chemistry import datasets\\n\",\n    \"\\n\",\n    \"# Disable errors by RDKit occurred in preprocessing QM9 dataset.\\n\",\n    \"lg = RDLogger.logger()\\n\",\n    \"lg.setLevel(RDLogger.CRITICAL)\\n\",\n    \"\\n\",\n    \"# show INFO level log from chainer chemistry\\n\",\n    
\"logging.basicConfig(level=logging.INFO)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"QM9 dataset can be downloaded automatically by chainer chemistry. \\n\",\n    \"Original format of QM9 dataset is zipped file where each molecule's information is stored in each \\\"xyz\\\" file.\\n\",\n    \"\\n\",\n    \"Chainer Chemistry automatically merge these information in one csv file internally, you may check the file path of this csv file with `get_qm9_filepath` method. \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"dataset_filepath = datasets.get_qm9_filepath()\\n\",\n    \"\\n\",\n    \"print('dataset_filepath =', dataset_filepath)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The dataset contains several chemical/physical properties. The labels of QM9 dataset can be checked by `get_qm9_label_names` method.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"QM9 label_names = ['A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"label_names = datasets.get_qm9_label_names()\\n\",\n    \"print('QM9 label_names =', label_names)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"More detail information is described in `readme.txt` of QM9 dataset, which can be downloaded from \\n\",\n    \" - 
[https://figshare.com/articles/Readme_file%3A_Data_description_for__Quantum_chemistry_structures_and_properties_of_134_kilo_molecules_/1057641](https://figshare.com/articles/Readme_file%3A_Data_description_for__Quantum_chemistry_structures_and_properties_of_134_kilo_molecules_/1057641)\\n\",\n    \"\\n\",\n    \"Below is the description of each property(label), written in readme.txt\\n\",\n    \"\\n\",\n    \"<a id='table1'></a>\\n\",\n    \"<blockquote cite=\\\"https://figshare.com/articles/Readme_file%3A_Data_description_for__Quantum_chemistry_structures_and_properties_of_134_kilo_molecules_/1057641\\\">\\n\",\n    \"<pre>\\n\",\n    \"I.  Property  Unit         Description\\n\",\n    \"--  --------  -----------  --------------\\n\",\n    \" 1  tag       -            \\\"gdb9\\\"; string constant to ease extraction via grep\\n\",\n    \" 2  index     -            Consecutive, 1-based integer identifier of molecule\\n\",\n    \" 3  A         GHz          Rotational constant A\\n\",\n    \" 4  B         GHz          Rotational constant B\\n\",\n    \" 5  C         GHz          Rotational constant C\\n\",\n    \" 6  mu        Debye        Dipole moment\\n\",\n    \" 7  alpha     Bohr^3       Isotropic polarizability\\n\",\n    \" 8  homo      Hartree      Energy of Highest occupied molecular orbital (HOMO)\\n\",\n    \" 9  lumo      Hartree      Energy of Lowest occupied molecular orbital (LUMO)\\n\",\n    \"10  gap       Hartree      Gap, difference between LUMO and HOMO\\n\",\n    \"11  r2        Bohr^2       Electronic spatial extent\\n\",\n    \"12  zpve      Hartree      Zero point vibrational energy\\n\",\n    \"13  U0        Hartree      Internal energy at 0 K\\n\",\n    \"14  U         Hartree      Internal energy at 298.15 K\\n\",\n    \"15  H         Hartree      Enthalpy at 298.15 K\\n\",\n    \"16  G         Hartree      Free energy at 298.15 K\\n\",\n    \"17  Cv        cal/(mol K)  Heat capacity at 298.15 K\\n\",\n    \"</pre>\\n\",\n    
\"</blockquote>\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Preprocessing dataset\\n\",\n    \"\\n\",\n    \"Dataset extraction depends on the preprocessing method, which is determined by `preprocessor`.\\n\",\n    \"\\n\",\n    \"Here, let's look an example of using `GGNNPreprocessor` preprocessor for QM9 dataset extraction.\\n\",\n    \"\\n\",\n    \"Procedure is as follows,\\n\",\n    \"\\n\",\n    \"1. Instantiate `preprocessor` (here `GGNNPreprocessor` is used).\\n\",\n    \"2. call `get_qm9` method with `preprocessor`.\\n\",\n    \" - `labels=None` option is used to extract all labels. In this case, 15 types of physical properties are extracted (see above).\\n\",\n    \"\\n\",\n    \"Note that `return_smiles` option can be used to get SMILES information together with the dataset itself.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|████████████████████████████████████████████████████████████████████████| 133885/133885 [01:35<00:00, 1406.47it/s]\\n\",\n      \"INFO:chainer_chemistry.dataset.parsers.csv_file_parser:Preprocess finished. 
FAIL 0, SUCCESS 133885, TOTAL 133885\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from chainer_chemistry.dataset.preprocessors.ggnn_preprocessor import \\\\\\n\",\n    \"    GGNNPreprocessor\\n\",\n    \"    \\n\",\n    \"preprocessor = GGNNPreprocessor()\\n\",\n    \"dataset, dataset_smiles = datasets.get_qm9(preprocessor, labels=None, return_smiles=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Check extracted dataset\\n\",\n    \"\\n\",\n    \"First, let's check type and number of dataset.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"dataset information...\\n\",\n      \"dataset <class 'chainer_chemistry.datasets.numpy_tuple_dataset.NumpyTupleDataset'> 133885\\n\",\n      \"smiles information...\\n\",\n      \"dataset_smiles <class 'numpy.ndarray'> 133885\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print('dataset information...')\\n\",\n    \"print('dataset', type(dataset), len(dataset))\\n\",\n    \"\\n\",\n    \"print('smiles information...')\\n\",\n    \"print('dataset_smiles', type(dataset_smiles), len(dataset_smiles))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"As you can see, QM9 dataset consists of 133885 data.\\n\",\n    \"\\n\",\n    \"The dataset is a class of `NumpyTupleDataset`, where i-th dataset features can be accessed by `dataset[i]`.\\n\",\n    \"\\n\",\n    \"When `GGNNPreprocessor` is used, each dataset consists of following features\\n\",\n    \" 1. atom feature: representing atomic number of given molecule. \\n\",\n    \" 2. adjacency matrix feature: representing adjacency matrix of given molecule.\\n\",\n    \"    `GGNNPreprocessor` extracts adjacency matrix of each bonding type.\\n\",\n    \" 3. 
label feature: representing chemical properties (label) of given molecule.\\n\",\n    \"    Please refer [above table](#table1) for details.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Let's look an example of 7777-th dataset\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"index=7777, SMILES=CC1=NCCC(C)O1\\n\",\n      \"atom (8,) [6 6 7 6 6 6 6 8]\\n\",\n      \"adj (4, 8, 8)\\n\",\n      \"adjacency matrix for SINGLE bond type\\n\",\n      \" [[0. 1. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [1. 0. 0. 0. 0. 0. 0. 1.]\\n\",\n      \" [0. 0. 0. 1. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 1. 0. 1. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 1. 0. 1. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 1. 0. 1. 1.]\\n\",\n      \" [0. 0. 0. 0. 0. 1. 0. 0.]\\n\",\n      \" [0. 1. 0. 0. 0. 1. 0. 0.]]\\n\",\n      \"adjacency matrix for DOUBLE bond type\\n\",\n      \" [[0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 1. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 1. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]]\\n\",\n      \"adjacency matrix for TRIPLE bond type\\n\",\n      \" [[0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]]\\n\",\n      \"adjacency matrix for AROMATIC bond type\\n\",\n      \" [[0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 
0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0.]]\\n\",\n      \"labels [ 3.1431000e+00  1.8749400e+00  1.2443100e+00  1.9313999e+00\\n\",\n      \"  7.3379997e+01 -2.3750000e-01  3.4699999e-02  2.7219999e-01\\n\",\n      \"  1.0124120e+03  1.6597100e-01 -3.6510001e+02 -3.6509183e+02\\n\",\n      \" -3.6509088e+02 -3.6513235e+02  3.0584999e+01]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"index = 7777\\n\",\n    \"\\n\",\n    \"print('index={}, SMILES={}'.format(index, dataset_smiles[index]))\\n\",\n    \"atom, adj, labels = dataset[index]\\n\",\n    \"# This molecule has N=8 atoms.\\n\",\n    \"print('atom', atom.shape, atom)\\n\",\n    \"# adjacency matrix is NxN matrix, where N is number of atoms in the molecule.\\n\",\n    \"# Unlike usual adjacency matrix, diagonal elements are filled with 1, for NFP calculation purpose.\\n\",\n    \"print('adj', adj.shape)\\n\",\n    \"print('adjacency matrix for SINGLE bond type\\\\n', adj[0])\\n\",\n    \"print('adjacency matrix for DOUBLE bond type\\\\n', adj[1])\\n\",\n    \"print('adjacency matrix for TRIPLE bond type\\\\n', adj[2])\\n\",\n    \"print('adjacency matrix for AROMATIC bond type\\\\n', adj[3])\\n\",\n    \"\\n\",\n    \"print('labels', labels)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Visualizing the molecule\\n\",\n    \"\\n\",\n    \"One might want to visualize molecule given SMILES information. 
Here is an example code:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {\n    \"collapsed\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# This script is referred from http://rdkit.blogspot.jp/2015/02/new-drawing-code.html\\n\",\n    \"# and http://cheminformist.itmol.com/TEST/wp-content/uploads/2015/07/rdkit_moldraw2d_2.html\\n\",\n    \"from __future__ import print_function\\n\",\n    \"from rdkit import Chem\\n\",\n    \"from rdkit.Chem.Draw import IPythonConsole\\n\",\n    \"from IPython.display import SVG\\n\",\n    \"\\n\",\n    \"from rdkit.Chem import rdDepictor\\n\",\n    \"from rdkit.Chem.Draw import rdMolDraw2D\\n\",\n    \"def moltosvg(mol,molSize=(450,150),kekulize=True):\\n\",\n    \"    mc = Chem.Mol(mol.ToBinary())\\n\",\n    \"    if kekulize:\\n\",\n    \"        try:\\n\",\n    \"            Chem.Kekulize(mc)\\n\",\n    \"        except:\\n\",\n    \"            mc = Chem.Mol(mol.ToBinary())\\n\",\n    \"    if not mc.GetNumConformers():\\n\",\n    \"        rdDepictor.Compute2DCoords(mc)\\n\",\n    \"    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])\\n\",\n    \"    drawer.DrawMolecule(mc)\\n\",\n    \"    drawer.FinishDrawing()\\n\",\n    \"    svg = drawer.GetDrawingText()\\n\",\n    \"    return svg\\n\",\n    \"\\n\",\n    \"def render_svg(svg):\\n\",\n    \"    # It seems that the svg renderer used doesn't quite hit the spec.\\n\",\n    \"    # Here are some fixes to make it work in the notebook, although I think\\n\",\n    \"    # the underlying issue needs to be resolved at the generation step\\n\",\n    \"    return SVG(svg.replace('svg:',''))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"smiles: CC1=NCCC(C)O1\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      
\"image/svg+xml\": [\n       \"<svg baseProfile=\\\"full\\\" height=\\\"150px\\\" version=\\\"1.1\\\" width=\\\"450px\\\" xml:space=\\\"preserve\\\" xmlns:rdkit=\\\"http://www.rdkit.org/xml\\\" xmlns:svg=\\\"http://www.w3.org/2000/svg\\\" xmlns:xlink=\\\"http://www.w3.org/1999/xlink\\\">\\n\",\n       \"<rect height=\\\"150\\\" style=\\\"opacity:1.0;fill:#FFFFFF;stroke:none\\\" width=\\\"450\\\" x=\\\"0\\\" y=\\\"0\\\"> </rect>\\n\",\n       \"<path d=\\\"M 106.906,143.182 165.953,109.091\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 165.953,109.091 165.953,78.75\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 165.953,78.75 165.953,48.4091\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 179.589,99.9886 179.589,78.75\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 179.589,78.75 179.589,57.5114\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 165.953,109.091 191.974,124.114\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 191.974,124.114 217.995,139.138\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 172.453,37.156 198.727,21.9871\\\" 
style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 198.727,21.9871 225,6.81818\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 225,6.81818 284.047,40.9091\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 284.047,40.9091 284.047,109.091\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 284.047,109.091 343.094,143.182\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 284.047,109.091 258.026,124.114\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 258.026,124.114 232.005,139.138\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<text style=\\\"font-size:15px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#0000FF\\\" x=\\\"159.452\\\" y=\\\"48.4091\\\"><tspan>N</tspan></text>\\n\",\n       \"<text style=\\\"font-size:15px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\\\" x=\\\"217.995\\\" y=\\\"150.682\\\"><tspan>O</tspan></text>\\n\",\n       \"</svg>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.SVG object>\"\n      ]\n     },\n     \"execution_count\": 11,\n     \"metadata\": {},\n     \"output_type\": 
\"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"smiles = dataset_smiles[index]\\n\",\n    \"mol = Chem.MolFromSmiles(dataset_smiles[index])\\n\",\n    \"\\n\",\n    \"print('smiles:', smiles)\\n\",\n    \"svg = moltosvg(mol)\\n\",\n    \"render_svg(svg)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"[Note] SVG images cannot be displayed on GitHub, but you can see an image of molecule when you execute it on jupyter notebook.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Interactively watch through the QM9 dataset\\n\",\n    \"\\n\",\n    \"Jupyter notebook provides handy module to check/visualize the data. Here interact module can be used to interactively check the internal of QM9 dataset.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"index=114829, SMILES=CCC1CC2OCC1O2\\n\",\n      \"atom [6 6 6 6 6 8 6 6 8]\\n\",\n      \"labels [   3.248    1.224    1.16     1.911   76.47    -0.249    0.082    0.331\\n\",\n      \" 1179.493    0.185 -424.24  -424.232 -424.231 -424.272   31.209]\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"image/svg+xml\": [\n       \"<svg baseProfile=\\\"full\\\" height=\\\"150px\\\" version=\\\"1.1\\\" width=\\\"450px\\\" xml:space=\\\"preserve\\\" xmlns:rdkit=\\\"http://www.rdkit.org/xml\\\" xmlns:svg=\\\"http://www.w3.org/2000/svg\\\" xmlns:xlink=\\\"http://www.w3.org/1999/xlink\\\">\\n\",\n       \"<rect height=\\\"150\\\" style=\\\"opacity:1.0;fill:#FFFFFF;stroke:none\\\" width=\\\"450\\\" x=\\\"0\\\" y=\\\"0\\\"> </rect>\\n\",\n       \"<path d=\\\"M 400.396,89.0292 334.271,30.345\\\" 
style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 334.271,30.345 250.386,58.269\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 250.386,58.269 223.671,142.546\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 250.386,58.269 178.489,6.81818\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 223.671,142.546 135.263,143.182\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 135.263,143.182 102.817,119.963\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 102.817,119.963 70.3709,96.7438\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 135.263,143.182 167.336,119.526\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 167.336,119.526 199.408,95.8694\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 65.7435,84.231 77.9124,45.8424\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 77.9124,45.8424 
90.0812,7.45367\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 90.0812,7.45367 178.489,6.81818\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 178.489,6.81818 191.203,45.0105\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 191.203,45.0105 203.917,83.2028\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<text style=\\\"font-size:15px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\\\" x=\\\"56.3613\\\" y=\\\"99.231\\\"><tspan>O</tspan></text>\\n\",\n       \"<text style=\\\"font-size:15px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\\\" x=\\\"199.408\\\" y=\\\"98.2028\\\"><tspan>O</tspan></text>\\n\",\n       \"</svg>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.SVG object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"from ipywidgets import interact\\n\",\n    \"import numpy as np\\n\",\n    \"np.set_printoptions(precision=3, suppress=True)\\n\",\n    \"\\n\",\n    \"def show_dataset(index):\\n\",\n    \"    print('index={}, SMILES={}'.format(index, dataset_smiles[index]))\\n\",\n    \"    atom, adj, labels = dataset[index]\\n\",\n    \"    print('atom', atom)\\n\",\n    \"    # print('adj', adj)\\n\",\n    \"    print('labels', labels)\\n\",\n    \"    mol = Chem.MolFromSmiles(dataset_smiles[index])\\n\",\n    \"    return render_svg(moltosvg(mol))\\n\",\n    
\"\\n\",\n    \"interact(show_dataset, index=(0, len(dataset) - 1, 1))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Appendix: how to save the molecule figure?\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"### 1. Save with SVG format\\n\",\n    \"\\n\",\n    \"First method is simply save svg in file.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {\n    \"collapsed\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"dirpath = 'images'\\n\",\n    \"\\n\",\n    \"if not os.path.exists(dirpath):\\n\",\n    \"    os.mkdir(dirpath)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {\n    \"collapsed\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"def save_svg(mol, filepath):\\n\",\n    \"    svg = moltosvg(mol)\\n\",\n    \"    with open(filepath, \\\"w\\\") as fw:\\n\",\n    \"        fw.write(svg)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"drawing images\\\\mol_7777.svg\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"index = 7777\\n\",\n    \"save_filepath = os.path.join(dirpath, 'mol_{}.svg'.format(index))\\n\",\n    \"print('drawing {}'.format(save_filepath))\\n\",\n    \"\\n\",\n    \"mol = Chem.MolFromSmiles(dataset_smiles[index])\\n\",\n    \"save_svg(mol, save_filepath)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2. 
Save with png format\\n\",\n    \"\\n\",\n    \"`rdkit` provides `Draw.MolToFile` method to visualize mol instance and save it to png format.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {\n    \"collapsed\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"from rdkit.Chem import Draw\\n\",\n    \"\\n\",\n    \"def save_png(mol, filepath, size=(600, 600)):\\n\",\n    \"    Draw.MolToFile(mol, filepath, size=size)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"drawing images\\\\mol_7777.png\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from rdkit.Chem import Draw\\n\",\n    \"index = 7777\\n\",\n    \"save_filepath = os.path.join(dirpath, 'mol_{}.png'.format(index))\\n\",\n    \"print('drawing {}'.format(save_filepath))\\n\",\n    \"\\n\",\n    \"mol = Chem.MolFromSmiles(dataset_smiles[index])\\n\",\n    \"save_png(mol, save_filepath, size=(600, 600))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"collapsed\": true\n   },\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"anaconda-cloud\": {},\n  \"kernelspec\": {\n   \"display_name\": \"Python [conda root]\",\n   \"language\": \"python\",\n   \"name\": \"conda-root-py\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.5.2\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "examples/qm9/test_qm9.sh",
    "content": "#!/usr/bin/env bash\n\nset -e\n\n# List of available graph convolution methods.\n# schnet test is skipped, since it takes long time to preprocess...\nmethods=(nfp ggnn weavenet rsgcn relgcn relgat gin gnnfilm megnet nfp_gwm ggnn_gwm rsgcn_gwm gin_gwm relgcn_sparse gin_sparse megnet)\n\n# device identifier; set it to -1 to train on the CPU (default).\ndevice=${1:--1}\n# Number of training epochs (default: 1).\nepoch=${2:-1}\n\nfor method in ${methods[@]}\ndo\n    # Remove any previously cached models.\n    [ -d \"input\" ] && rm -rf input\n\n    # Train with the current method (one label).\n    python train_qm9.py \\\n        --method ${method} \\\n        --label A \\\n        --conv-layers 1 \\\n        --device ${device} \\\n        --epoch ${epoch} \\\n        --unit-num 10 \\\n        --num-data 100\n\n    # Predict with the current method (one label).\n    python predict_qm9.py \\\n        --method ${method} \\\n        --label A \\\n        --device ${device} \\\n        --num-data 100\n\n    # Train with the current method (all labels).\n    python train_qm9.py \\\n        --method ${method} \\\n        --conv-layers 1 \\\n        --device ${device} \\\n        --epoch ${epoch} \\\n        --unit-num 10 \\\n        --num-data 100\n\n    # Predict with the current method (all labels).\n    python predict_qm9.py \\\n        --method ${method} \\\n        --device ${device} \\\n        --num-data 100\ndone\n"
  },
  {
    "path": "examples/qm9/train_qm9.py",
    "content": "#!/usr/bin/env python\nfrom __future__ import print_function\n\nimport argparse\n\nimport chainer\nimport numpy\nimport os\n\nfrom chainer.datasets import split_dataset_random\nfrom chainer import functions as F\n\nfrom chainer_chemistry.dataset.converters import converter_method_dict\nfrom chainer_chemistry.dataset.preprocessors import preprocess_method_dict\nfrom chainer_chemistry import datasets as D\nfrom chainer_chemistry.datasets import NumpyTupleDataset\nfrom chainer_chemistry.links.scaler.standard_scaler import StandardScaler\nfrom chainer_chemistry.models.prediction.regressor import Regressor\nfrom chainer_chemistry.models.prediction import set_up_predictor\nfrom chainer_chemistry.utils import run_train\n\n\ndef rmse(x0, x1):\n    return F.sqrt(F.mean_squared_error(x0, x1))\n\n\ndef parse_arguments():\n    # Lists of supported preprocessing methods/models.\n    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',\n                   'relgat', 'gin', 'gnnfilm', 'relgcn_sparse', 'gin_sparse',\n                   'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm', 'megnet']\n    label_names = ['A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2',\n                   'zpve', 'U0', 'U', 'H', 'G', 'Cv']\n    scale_list = ['standardize', 'none']\n\n    # Set up the argument parser.\n    parser = argparse.ArgumentParser(description='Regression on QM9.')\n    parser.add_argument('--method', '-m', type=str, choices=method_list,\n                        default='nfp', help='method name')\n    parser.add_argument('--label', '-l', type=str,\n                        choices=label_names + ['all'], default='all',\n                        help='target label for regression; all means '\n                        'predicting all properties at once')\n    parser.add_argument('--scale', type=str, choices=scale_list,\n                        default='standardize', help='label scaling method')\n    parser.add_argument('--conv-layers', '-c', 
type=int, default=4,\n                        help='number of convolution layers')\n    parser.add_argument('--batchsize', '-b', type=int, default=32,\n                        help='batch size')\n    parser.add_argument(\n        '--device', '-d', type=str, default='-1',\n        help='Device specifier. Either ChainerX device specifier or an '\n             'integer. If non-negative integer, CuPy arrays with specified '\n             'device id are used. If negative integer, NumPy arrays are used')\n    parser.add_argument('--out', '-o', type=str, default='result',\n                        help='path to save the computed model to')\n    parser.add_argument('--epoch', '-e', type=int, default=20,\n                        help='number of epochs')\n    parser.add_argument('--unit-num', '-u', type=int, default=16,\n                        help='number of units in one layer of the model')\n    parser.add_argument('--seed', '-s', type=int, default=777,\n                        help='random seed value')\n    parser.add_argument('--train-data-ratio', '-r', type=float, default=0.7,\n                        help='ratio of training data w.r.t the dataset')\n    parser.add_argument('--protocol', type=int, default=2,\n                        help='pickle protocol version')\n    parser.add_argument('--model-filename', type=str, default='regressor.pkl',\n                        help='saved model filename')\n    parser.add_argument('--num-data', type=int, default=-1,\n                        help='amount of data to be parsed; -1 indicates '\n                        'parsing all data.')\n    return parser.parse_args()\n\n\ndef main():\n    # Parse the arguments.\n    args = parse_arguments()\n\n    # Set up some useful variables that will be used later on.\n    method = args.method\n    if args.label != 'all':\n        labels = args.label\n        cache_dir = os.path.join('input', '{}_{}'.format(method, labels))\n        class_num = len(labels) if isinstance(labels, list) else 1\n   
 else:\n        labels = None\n        cache_dir = os.path.join('input', '{}_all'.format(method))\n        class_num = len(D.get_qm9_label_names())\n\n    # Get the filename corresponding to the cached dataset, based on the amount\n    # of data samples that need to be parsed from the original dataset.\n    num_data = args.num_data\n    if num_data >= 0:\n        dataset_filename = 'data_{}.npz'.format(num_data)\n    else:\n        dataset_filename = 'data.npz'\n\n    # Load the cached dataset.\n    dataset_cache_path = os.path.join(cache_dir, dataset_filename)\n\n    dataset = None\n    if os.path.exists(dataset_cache_path):\n        print('Loading cached dataset from {}.'.format(dataset_cache_path))\n        dataset = NumpyTupleDataset.load(dataset_cache_path)\n    if dataset is None:\n        print('Preprocessing dataset...')\n        preprocessor = preprocess_method_dict[method]()\n\n        if num_data >= 0:\n            # Select the first `num_data` samples from the dataset.\n            target_index = numpy.arange(num_data)\n            dataset = D.get_qm9(preprocessor, labels=labels,\n                                target_index=target_index)\n        else:\n            # Load the entire dataset.\n            dataset = D.get_qm9(preprocessor, labels=labels)\n\n        # Cache the laded dataset.\n        if not os.path.exists(cache_dir):\n            os.makedirs(cache_dir)\n        if isinstance(dataset, NumpyTupleDataset):\n            NumpyTupleDataset.save(dataset_cache_path, dataset)\n        # TODO: support caching of other dataset type...\n\n    # Scale the label values, if necessary.\n    if args.scale == 'standardize':\n        print('Fit StandardScaler to the labels.')\n        scaler = StandardScaler()\n        if isinstance(dataset, NumpyTupleDataset):\n            scaler.fit(dataset.get_datasets()[-1])\n        else:\n            y = numpy.array([data.y for data in dataset])\n            scaler.fit(y)\n    else:\n        print('No standard 
scaling was selected.')\n        scaler = None\n\n    # Split the dataset into training and validation.\n    train_data_size = int(len(dataset) * args.train_data_ratio)\n    train, valid = split_dataset_random(dataset, train_data_size, args.seed)\n\n    # Set up the predictor.\n    predictor = set_up_predictor(method, args.unit_num, args.conv_layers,\n                                 class_num, scaler)\n\n    # Set up the regressor.\n    device = chainer.get_device(args.device)\n    metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}\n    regressor = Regressor(predictor, lossfun=F.mean_squared_error,\n                          metrics_fun=metrics_fun, device=device)\n\n    # TODO(nakago): consider how to switch which `converter` to use.\n    if isinstance(dataset, NumpyTupleDataset):\n        converter = converter_method_dict[method]\n    else:\n        converter = dataset.converter\n\n    print('Training...')\n    run_train(regressor, train, valid=valid,\n              batch_size=args.batchsize, epoch=args.epoch,\n              out=args.out, extensions_list=None,\n              device=device, converter=converter,\n              resume_path=None)\n\n    # Save the regressor's parameters.\n    model_path = os.path.join(args.out, args.model_filename)\n    print('Saving the trained model to {}...'.format(model_path))\n    regressor.save_pickle(model_path, protocol=args.protocol)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/test_examples.sh",
    "content": "#!/usr/bin/env bash\n\nset -e\n\ngpu=${1:--1}\necho Using gpu ${gpu}\n\n# Tox21\necho --- Testing Tox21 ---\ncd tox21 && bash -x test_tox21.sh ${gpu} && cd ..\n\n# QM9\necho --- Testing QM9 ---\ncd qm9 && bash -x test_qm9.sh ${gpu} && cd ..\n\n# Own dataset\necho --- Testing on own dataset ---\ncd own_dataset && bash -x test_own_dataset.sh ${gpu} && cd ..\n\n# MolNet\necho --- Testing MolNet dataset ---\ncd molnet && bash -x test_molnet.sh ${gpu} && cd ..\n"
  },
  {
    "path": "examples/tox21/.gitignore",
    "content": "prediction.npz"
  },
  {
    "path": "examples/tox21/README.md",
    "content": "# Training graph convolution models with Tox21 dataset\n\nThis is an example of learning toxicity of chemical molecules with graph convolution networks in a multi-task supervised setting.\n\nWe use graph convolution models that takes molecules represented as graphs as predictor.\nChainer Chemistry provides off-the-shelf graph convolution models including [NFP](https://arxiv.org/abs/1509.09292), [GGNN](https://arxiv.org/abs/1511.05493), [SchNet](https://arxiv.org/abs/1706.08566) and so on.\n\nWe use Tox21 dataset, provided by [The Toxicology in the 21st Century (Tox21)](https://ncats.nih.gov/tox21).\nIt is one of the most widely used datasets in bio and chemo informatics\nand consists of the chemical information of molecules and their assessments of toxicity.\n\n## How to run the code\n\n### Train the model with tox21 dataset\n\nWith CPU:\n```angular2html\npython train_tox21.py\n```\n\nWith GPU:\n```angular2html\npython train_tox21.py -g 0\n```\n\nThis script trains the model with the tox21 dataset\nand outputs trained parameters and other information to a specified directory.\nWe specify an ID of GPU in use by `-g` or `--gpu` option.\nNegative value indicate running the code with CPU.\nThe output directory can be specified by `-o` option.\nIts default value is `result`.\nThe Tox21 dataset consists of several assays.\nSome molecules can have more than one types of assay results.\nWe can specify which assay to use by specifying an assay name with `-l` option.\nAssay names are available by running the script with `-h` or `--help`\nor execute the following command:\n\n```\npython -c import chainer_chemistry; chainer_chemistry.datasets.get_tox21_label_names()\n```\n\nIf `-l` option is not specified, this script conducts multitask learning with all labels.\n\nThe full options available including `-g` and `-o` are found\nby running the following command:\n\n```\npython train_tox21.py -h\n```\n\n### Inference with a trained model using Classifier\n\nAs of 
v0.3.0, `Classifier` class is introduced which supports `predict` and\n`predict_proba` methods for easier inference.\n\n`Classifier` also supports `load_pickle` method, user may load\nthe instance of pretrained-model using `pickle` file.\n\nThe example implemented in `predict_tox21_with_classifier.py`.\n\nWith CPU:\n```\npython predict_tox21_with_classifier.py [-i /path/to/training/result/directory]\n```\n\nWith GPU:\n```\npython predict_tox21_with_classifier.py -g 0 [-i /path/to/training/result/directory]\n```\n\n### Evaluation of Models\n`seaborn` is required to run this script.\n\n```\nbash examples/tox21/evaluate_models_tox21.sh\n```\n\nThis script evaluates each method and generate a graph.\n"
  },
  {
    "path": "examples/tox21/data.py",
    "content": "import os\n\nimport numpy\n\nfrom chainer_chemistry.dataset.preprocessors import preprocess_method_dict\nfrom chainer_chemistry import datasets as D\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\n\n\nclass _CacheNamePolicy(object):\n\n    train_file_name = 'train.npz'\n    val_file_name = 'val.npz'\n    test_file_name = 'test.npz'\n\n    def _get_cache_directory_path(self, method, labels, prefix, num_data):\n        num_data_str = '_{}'.format(num_data) if num_data >= 0 else ''\n        if labels:\n            return os.path.join(prefix,\n                                '{}_{}{}'.format(method, labels, num_data_str))\n        else:\n            return os.path.join(prefix,\n                                '{}_all{}'.format(method, num_data_str))\n\n    def __init__(self, method, labels, prefix='input', num_data=-1):\n        self.method = method\n        self.labels = labels\n        self.prefix = prefix\n        self.num_data = num_data\n        self.cache_dir = self._get_cache_directory_path(\n            method, labels, prefix, num_data)\n\n    def get_train_file_path(self):\n        return os.path.join(self.cache_dir, self.train_file_name)\n\n    def get_val_file_path(self):\n        return os.path.join(self.cache_dir, self.val_file_name)\n\n    def get_test_file_path(self):\n        return os.path.join(self.cache_dir, self.test_file_name)\n\n    def create_cache_directory(self):\n        try:\n            os.makedirs(self.cache_dir)\n        except OSError:\n            if not os.path.isdir(self.cache_dir):\n                raise\n\n\ndef load_dataset(method, labels, prefix='input', num_data=-1):\n    policy = _CacheNamePolicy(method, labels, prefix, num_data=num_data)\n    train_path = policy.get_train_file_path()\n    val_path = policy.get_val_file_path()\n    test_path = policy.get_test_file_path()\n\n    train, val, test = None, None, None\n    print()\n    if os.path.exists(policy.cache_dir):\n        
print('load from cache {}'.format(policy.cache_dir))\n        train = NumpyTupleDataset.load(train_path)\n        val = NumpyTupleDataset.load(val_path)\n        test = NumpyTupleDataset.load(test_path)\n    if train is None or val is None or test is None:\n        print('preprocessing dataset...')\n        preprocessor = preprocess_method_dict[method]()\n        if num_data >= 0:\n            # Use `num_data` examples for train\n            target_index = numpy.arange(num_data)\n            train, val, test = D.get_tox21(\n                preprocessor, labels=labels,\n                train_target_index=target_index, val_target_index=None,\n                test_target_index=None\n            )\n        else:\n            train, val, test = D.get_tox21(preprocessor, labels=labels)\n        # Cache dataset\n        policy.create_cache_directory()\n        NumpyTupleDataset.save(train_path, train)\n        NumpyTupleDataset.save(val_path, val)\n        NumpyTupleDataset.save(test_path, test)\n    return train, val, test\n"
  },
  {
    "path": "examples/tox21/evaluate_models_tox21.sh",
    "content": "set -eu\n\ndevice=-1\nmethods=(nfp ggnn schnet weavenet rsgcn relgcn relgat megnet)\nprefix=eval_\n\nfor method in ${methods[@]}\ndo\n    result_dir=${prefix}${method}\n    python train_tox21.py --method ${method} --device ${device} --out ${result_dir}\n    python predict_tox21_with_classifier.py --in-dir ${result_dir}\ndone\n\npython plot.py --prefix ${prefix} --methods ${methods[@]}\n"
  },
  {
    "path": "examples/tox21/plot.py",
    "content": "#! -*- coding: utf-8 -*-\nimport argparse\nimport json\nimport os\n\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\nparser = argparse.ArgumentParser()\nparser.add_argument('--prefix', required=True)\nparser.add_argument('--methods', nargs='+', required=True)\nargs = parser.parse_args()\n\nsns.set()\n\nx = args.methods\ny = []\nfor method in args.methods:\n    with open(os.path.join(args.prefix + method, 'eval_result.json')) as f:\n        result = json.load(f)\n        y.append(result[\"test/main/roc_auc\"])\n\nax = sns.barplot(y=x, x=y)\n\nfor n, (label, _y) in enumerate(zip(x, y)):\n    ax.annotate(\n        s='{:.3f}'.format(abs(_y)),\n        xy=(_y, n),\n        ha='right', va='center',\n        xytext=(-5, 0),\n        textcoords='offset points',\n        color='white'\n    )\nplt.title(\"Performance on tox21\")\nplt.xlabel(\"ROC-AUC\")\nplt.savefig('eval_results_tox21.png')\n"
  },
  {
    "path": "examples/tox21/predict_tox21_with_classifier.py",
    "content": "import os\n\nimport argparse\nimport json\n\nimport chainer\nimport numpy\n\nfrom chainer import cuda\nimport chainer.functions as F\nfrom chainer.iterators import SerialIterator\nfrom chainer.training.extensions import Evaluator\nfrom rdkit import RDLogger\nimport six\n\nfrom chainer_chemistry.dataset.converters import converter_method_dict\nfrom chainer_chemistry.models.prediction import Classifier\nfrom chainer_chemistry.training.extensions.roc_auc_evaluator import ROCAUCEvaluator  # NOQA\n\nimport data\n\n\n# Disable errors by RDKit occurred in preprocessing Tox21 dataset.\nlg = RDLogger.logger()\nlg.setLevel(RDLogger.CRITICAL)\n\n\ndef main():\n    parser = argparse.ArgumentParser(\n        description='Predict with a trained model.')\n    parser.add_argument('--in-dir', '-i', type=str, default='result',\n                        help='Path to the result directory of the training '\n                        'script.')\n    parser.add_argument('--batchsize', '-b', type=int, default=128,\n                        help='batch size')\n    parser.add_argument(\n        '--device', type=str, default='-1',\n        help='Device specifier. Either ChainerX device specifier or an '\n             'integer. If non-negative integer, CuPy arrays with specified '\n             'device id are used. 
If negative integer, NumPy arrays are used')\n    parser.add_argument('--model-filename', type=str, default='classifier.pkl',\n                        help='file name for pickled model')\n    parser.add_argument('--num-data', type=int, default=-1,\n                        help='Number of data to be parsed from parser.'\n                             '-1 indicates to parse all data.')\n    args = parser.parse_args()\n\n    with open(os.path.join(args.in_dir, 'config.json'), 'r') as i:\n        config = json.loads(i.read())\n\n    method = config['method']\n    labels = config['labels']\n\n    _, test, _ = data.load_dataset(method, labels, num_data=args.num_data)\n    y_test = test.get_datasets()[-1]\n\n    device = chainer.get_device(args.device)\n    # Load pretrained model\n    clf = Classifier.load_pickle(\n        os.path.join(args.in_dir, args.model_filename),\n        device=device)  # type: Classifier\n\n    # ---- predict ---\n    print('Predicting...')\n\n    # We need to feed only input features `x` to `predict`/`predict_proba`.\n    # This converter extracts only inputs (x1, x2, ...) 
from the features which\n    # consist of input `x` and label `t` (x1, x2, ..., t).\n    converter = converter_method_dict[method]\n\n    def extract_inputs(batch, device=None):\n        return converter(batch, device=device)[:-1]\n\n    def postprocess_pred(x):\n        x_array = cuda.to_cpu(x.data)\n        return numpy.where(x_array > 0, 1, 0)\n    y_pred = clf.predict(test, converter=extract_inputs,\n                         postprocess_fn=postprocess_pred)\n    y_proba = clf.predict_proba(test, converter=extract_inputs,\n                                postprocess_fn=F.sigmoid)\n\n    # `predict` method returns the prediction label (0: non-toxic, 1:toxic)\n    print('y_pread.shape = {}, y_pred[:5, 0] = {}'\n          .format(y_pred.shape, y_pred[:5, 0]))\n    # `predict_proba` method returns the probability to be toxic\n    print('y_proba.shape = {}, y_proba[:5, 0] = {}'\n          .format(y_proba.shape, y_proba[:5, 0]))\n    # --- predict end ---\n\n    if y_pred.ndim == 1:\n        y_pred = y_pred[:, None]\n\n    if y_pred.shape != y_test.shape:\n        raise RuntimeError('The shape of the prediction result array and '\n                           'that of the ground truth array do not match. 
'\n                           'Contents of the input directory may be corrupted '\n                           'or modified.')\n\n    statistics = []\n    for t, p in six.moves.zip(y_test.T, y_pred.T):\n        idx = t != -1\n        n_correct = (t[idx] == p[idx]).sum()\n        n_total = len(t[idx])\n        accuracy = float(n_correct) / n_total\n        statistics.append([n_correct, n_total, accuracy])\n\n    print('{:>6} {:>8} {:>8} {:>8}'\n          .format('TaskID', 'Correct', 'Total', 'Accuracy'))\n    for idx, (n_correct, n_total, accuracy) in enumerate(statistics):\n        print('task{:>2} {:>8} {:>8} {:>8.4f}'\n              .format(idx, n_correct, n_total, accuracy))\n\n    prediction_result_file = 'prediction.npz'\n    print('Save prediction result to {}'.format(prediction_result_file))\n    numpy.savez_compressed(prediction_result_file, y_pred)\n\n    # --- evaluate ---\n    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`\n    print('Evaluating...')\n    test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)\n    eval_result = Evaluator(\n        test_iterator, clf, converter=converter, device=device)()\n    print('Evaluation result: ', eval_result)\n    rocauc_result = ROCAUCEvaluator(\n        test_iterator, clf, converter=converter, device=device,\n        eval_func=clf.predictor, name='test', ignore_labels=-1)()\n    print('ROCAUC Evaluation result: ', rocauc_result)\n    with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:\n        json.dump(rocauc_result, f)\n    # --- evaluate end ---\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "examples/tox21/test_tox21.sh",
    "content": "#!/usr/bin/env bash\n\nset -e\n\n# device specifier given from first argument, default value is -1\ndevice=${1:--1}\n\n# Preprocessor parse result must contain both pos/neg samples\ntox21_num_data=100\n\nfor method in nfp ggnn\ndo\n    if [ ! -f \"input\" ]; then\n        rm -rf input\n    fi\n\n    # Tox21 classification task with only one label\n    out_dir=nr_ar_${method}\n    python train_tox21.py --method ${method} --label NR-AR --conv-layers 1 --device ${device} --epoch 1 --unit-num 10 --out ${out_dir} --batchsize 32 --num-data=${tox21_num_data}\n    python predict_tox21_with_classifier.py --in-dir ${out_dir} --device ${device} --num-data=${tox21_num_data}\n\n    # Tox21 classification task with all labels\n    out_dir=all_${method}\n    python train_tox21.py --method ${method} --conv-layers 1 --device ${device} --epoch 1 --unit-num 10 --out ${out_dir} --batchsize 16 --num-data=${tox21_num_data}\n    python predict_tox21_with_classifier.py --in-dir ${out_dir} --num-data=${tox21_num_data}\ndone\n\n# BalancedSerialIterator test with Tox21\npython train_tox21.py --method nfp --label NR-AR --conv-layers 1 --device ${device} --epoch 1 --unit-num 10 --out nr_ar_nfp_balanced --iterator-type balanced --eval-mode 0 --num-data 1000\n# ROCAUCEvaluator test with Tox21\npython train_tox21.py --method nfp --label NR-AR --conv-layers 1 --device ${device} --epoch 1 --unit-num 10 --out nr_ar_nfp_balanced --iterator-type serial --eval-mode 1 --num-data 1000\n"
  },
  {
    "path": "examples/tox21/tox21_dataset_exploration.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {\n    \"collapsed\": true\n   },\n   \"source\": [\n    \"## Tox 21 dataset exploration\\n\",\n    \"\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The purpose of this notebook is as follows,\\n\",\n    \"\\n\",\n    \" - Explain [Tox21 dataset](https://tripod.nih.gov/tox21/challenge/): Check the labels and visualization of molecules to understand what kind of data are stored.\\n\",\n    \" - Explain internal structure of tox21 dataset in `chainer_chemistry`: We handle the dataset with `NumpyTupleDataset`.\\n\",\n    \" - Explain how `preprocessor` and `parser` work on `chainer_chemistry`: One concrete example using `NFPPreprocessor` is explained.\\n\",\n    \"\\n\",\n    \"It is out of scope of this notebook to explain how to train graph convolutional network using this dataset, please refer [document tutorial](http://chainer-chemistry.readthedocs.io/en/latest/tutorial.html#) or try `train_tox21.py` in [tox21 example](https://github.com/pfnet-research/chainer-chemistry/tree/master/examples/tox21) for the model training.\\n\",\n    \"\\n\",\n    \"[Note]\\n\",\n    \"This notebook is executed on 1, March, 2018. 
\\n\",\n    \"The behavior of tox21 dataset in `chainer_chemistry` might change in the future.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Loading modules and set loglevel.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import logging\\n\",\n    \"from rdkit import RDLogger\\n\",\n    \"from chainer_chemistry import datasets\\n\",\n    \"\\n\",\n    \"# Disable errors by RDKit occurred in preprocessing Tox21 dataset.\\n\",\n    \"lg = RDLogger.logger()\\n\",\n    \"lg.setLevel(RDLogger.CRITICAL)\\n\",\n    \"\\n\",\n    \"# show INFO level log from chainer chemistry\\n\",\n    \"logging.basicConfig(level=logging.INFO)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Tox 21 dataset consists of train/validation/test data and they can be downloaded automatically with chainer chemistry. \\n\",\n    \"The format of tox21 dataset is \\\"sdf\\\" file.\\n\",\n    \"You can check the file path of downloaded sdf file with `get_tox21_filepath` method. 
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"train_filepath = datasets.get_tox21_filepath('train')\\n\",\n    \"val_filepath = datasets.get_tox21_filepath('val')\\n\",\n    \"test_filepath = datasets.get_tox21_filepath('test')\\n\",\n    \"\\n\",\n    \"print('train_filepath =', train_filepath)\\n\",\n    \"print('val_filepath =', val_filepath)\\n\",\n    \"print('test_filepath =', test_filepath)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Dataset contains 12 types of toxity, the label of toxity can be checked by `get_tox21_label_names` method.\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"tox21 label_names = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"label_names = datasets.get_tox21_label_names()\\n\",\n    \"print('tox21 label_names =', label_names)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Preprocessing dataset\\n\",\n    \"\\n\",\n    \"Dataset extraction depends on the preprocessing method, which is determined by `preprocessor`.\\n\",\n    \"\\n\",\n    \"Here, let's look an example of using `NFPPreprocessor` preprocessor for tox21 dataset exraction.\\n\",\n    \"\\n\",\n    \"Procedure is as follows,\\n\",\n    \"\\n\",\n    \"1. Instantiate `preprocessor` (here `NFPPreprocessor` is used).\\n\",\n    \"2. call `get_tox21` method with `preprocessor`.\\n\",\n    \" - `labels=None` option is used to extract all labels. 
In this case, 12 types of toxity labels are extracted (see above).\\n\",\n    \"\\n\",\n    \"[Note] \\n\",\n    \" - `return_smiles` option can be used to get SMILES information together with the dataset itself.\\n\",\n    \" - Preprocessing result depends on RDKit version. \\n\",\n    \"You might get different results due to the difference of RDKit behavior between version.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"RDKit version:  2017.03.3\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"import rdkit\\n\",\n    \"\\n\",\n    \"print('RDKit version: ', rdkit.__version__)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|███████████████████████████████████████████████████████████████████████████| 11764/11764 [00:22<00:00, 531.76it/s]\\n\",\n      \"INFO:chainer_chemistry.dataset.parsers.sdf_file_parser:Preprocess finished. FAIL 0, SUCCESS 11757, TOTAL 11757\\n\",\n      \"100%|███████████████████████████████████████████████████████████████████████████████| 296/296 [00:00<00:00, 488.77it/s]\\n\",\n      \"INFO:chainer_chemistry.dataset.parsers.sdf_file_parser:Preprocess finished. FAIL 0, SUCCESS 295, TOTAL 295\\n\",\n      \"100%|███████████████████████████████████████████████████████████████████████████████| 647/647 [00:01<00:00, 609.91it/s]\\n\",\n      \"INFO:chainer_chemistry.dataset.parsers.sdf_file_parser:Preprocess finished. 
FAIL 0, SUCCESS 645, TOTAL 645\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from chainer_chemistry.dataset.preprocessors.nfp_preprocessor import \\\\\\n\",\n    \"    NFPPreprocessor\\n\",\n    \"\\n\",\n    \"preprocessor = NFPPreprocessor()\\n\",\n    \"train, val, test, train_smiles, val_smiles, test_smiles = datasets.get_tox21(preprocessor, labels=None, return_smiles=True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Dataset extraction depends on the `preprocessor`, and you may use other type of `preprocessor` as well.\\n\",\n    \"\\n\",\n    \"Below is another example of using `GGNNPreprocessor` for dataset extraction. But it takes little bit of time, you can skip it for the following tutorial.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"100%|███████████████████████████████████████████████████████████████████████████| 11764/11764 [00:29<00:00, 401.74it/s]\\n\",\n      \"INFO:chainer_chemistry.dataset.parsers.sdf_file_parser:Preprocess finished. FAIL 0, SUCCESS 11757, TOTAL 11757\\n\",\n      \"100%|███████████████████████████████████████████████████████████████████████████████| 296/296 [00:00<00:00, 336.99it/s]\\n\",\n      \"INFO:chainer_chemistry.dataset.parsers.sdf_file_parser:Preprocess finished. FAIL 0, SUCCESS 295, TOTAL 295\\n\",\n      \"100%|███████████████████████████████████████████████████████████████████████████████| 647/647 [00:01<00:00, 479.05it/s]\\n\",\n      \"INFO:chainer_chemistry.dataset.parsers.sdf_file_parser:Preprocess finished. 
FAIL 0, SUCCESS 645, TOTAL 645\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"from chainer_chemistry.dataset.preprocessors.ggnn_preprocessor import \\\\\\n\",\n    \"    GGNNPreprocessor\\n\",\n    \"\\n\",\n    \"# uncomment it if you want to try `GGNNPreprocessor`\\n\",\n    \"ggnn_preprocessor = GGNNPreprocessor()\\n\",\n    \"results = datasets.get_tox21(ggnn_preprocessor, labels=None, return_smiles=True)\\n\",\n    \"train_ggnn, val_ggnn, test_ggnn, train_smiles_ggnn, val_smiles_ggnn, test_smiles_ggnn = results\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Check extracted dataset\\n\",\n    \"\\n\",\n    \"First, let's check number of data for train/validation/test dataset.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"dataset information...\\n\",\n      \"train <class 'chainer_chemistry.datasets.numpy_tuple_dataset.NumpyTupleDataset'> 11757\\n\",\n      \"val <class 'chainer_chemistry.datasets.numpy_tuple_dataset.NumpyTupleDataset'> 295\\n\",\n      \"test <class 'chainer_chemistry.datasets.numpy_tuple_dataset.NumpyTupleDataset'> 645\\n\",\n      \"smiles information...\\n\",\n      \"train_smiles <class 'numpy.ndarray'> 11757\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print('dataset information...')\\n\",\n    \"print('train', type(train), len(train))\\n\",\n    \"print('val', type(val), len(val))\\n\",\n    \"print('test', type(test), len(test))\\n\",\n    \"\\n\",\n    \"print('smiles information...')\\n\",\n    \"print('train_smiles', type(train_smiles), len(train_smiles))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"There are 11757 data in `train`, 295 data in `val` and 645 data in `test` respectively.\\n\",\n    \"(You might get 
different result with different version of `rdkit`.)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"The dataset is a class of `NumpyTupleDataset`, where i-th dataset features can be accessed by `dataset[i]`.\\n\",\n    \"\\n\",\n    \"When `NFPPreprocessor` is used, each dataset consists of following features\\n\",\n    \" 1. atom feature: representing atomic number of given molecule. \\n\",\n    \" 2. adjacency matrix feature: representing adjacency matrix of given molecule.\\n\",\n    \" 3. label feature: representing toxity (label) of given molecule.\\n\",\n    \"    Here, 0 indicates negative (no toxity), 1 indicates positive (toxic) and -1 indicates data is not available, respectively.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Let's look an example of 6-th train dataset\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"index=6, SMILES=Cc1ccc([N+](=O)[O-])c2c1O[Hg]2\\n\",\n      \"atom (12,) [ 6  6  6  6  6  7  8  8  6  6  8 80]\\n\",\n      \"adj (12, 12)\\n\",\n      \"[[1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\\n\",\n      \" [0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.]\\n\",\n      \" [0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1.]\\n\",\n      \" [0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]\\n\",\n      \" [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 
1.]]\\n\",\n      \"labels [-1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"index = 6\\n\",\n    \"\\n\",\n    \"print('index={}, SMILES={}'.format(index, train_smiles[index]))\\n\",\n    \"atom, adj, labels = train[index]\\n\",\n    \"# This molecule has N=12 atoms.\\n\",\n    \"print('atom', atom.shape, atom)\\n\",\n    \"# adjacency matrix is NxN matrix, where N is number of atoms in the molecule.\\n\",\n    \"# Unlike usual adjacency matrix, diagonal elements are filled with 1, for NFP calculation purpose.\\n\",\n    \"print('adj', adj.shape)\\n\",\n    \"print(adj)\\n\",\n    \"print('labels', labels)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Visualizing the molecule\\n\",\n    \"\\n\",\n    \"One might want to visualize molecule given SMILES information.\\n\",\n    \"Here is an example code:\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {\n    \"collapsed\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"# This script is referred from http://rdkit.blogspot.jp/2015/02/new-drawing-code.html\\n\",\n    \"# and http://cheminformist.itmol.com/TEST/wp-content/uploads/2015/07/rdkit_moldraw2d_2.html\\n\",\n    \"from __future__ import print_function\\n\",\n    \"from rdkit import Chem\\n\",\n    \"from rdkit.Chem.Draw import IPythonConsole\\n\",\n    \"from IPython.display import SVG\\n\",\n    \"\\n\",\n    \"from rdkit.Chem import rdDepictor\\n\",\n    \"from rdkit.Chem.Draw import rdMolDraw2D\\n\",\n    \"def moltosvg(mol,molSize=(450,150),kekulize=True):\\n\",\n    \"    mc = Chem.Mol(mol.ToBinary())\\n\",\n    \"    if kekulize:\\n\",\n    \"        try:\\n\",\n    \"            Chem.Kekulize(mc)\\n\",\n    \"        except:\\n\",\n    \"            mc = Chem.Mol(mol.ToBinary())\\n\",\n    \"    if not mc.GetNumConformers():\\n\",\n    \"        rdDepictor.Compute2DCoords(mc)\\n\",\n    \"    drawer 
= rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])\\n\",\n    \"    drawer.DrawMolecule(mc)\\n\",\n    \"    drawer.FinishDrawing()\\n\",\n    \"    svg = drawer.GetDrawingText()\\n\",\n    \"    return svg\\n\",\n    \"\\n\",\n    \"def render_svg(svg):\\n\",\n    \"    # It seems that the svg renderer used doesn't quite hit the spec.\\n\",\n    \"    # Here are some fixes to make it work in the notebook, although I think\\n\",\n    \"    # the underlying issue needs to be resolved at the generation step\\n\",\n    \"    return SVG(svg.replace('svg:',''))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"smiles: Cc1ccc([N+](=O)[O-])c2c1O[Hg]2\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"image/svg+xml\": [\n       \"<svg baseProfile=\\\"full\\\" height=\\\"150px\\\" version=\\\"1.1\\\" width=\\\"450px\\\" xml:space=\\\"preserve\\\" xmlns:rdkit=\\\"http://www.rdkit.org/xml\\\" xmlns:svg=\\\"http://www.w3.org/2000/svg\\\" xmlns:xlink=\\\"http://www.w3.org/1999/xlink\\\">\\n\",\n       \"<rect height=\\\"150\\\" style=\\\"opacity:1.0;fill:#FFFFFF;stroke:none\\\" width=\\\"450\\\" x=\\\"0\\\" y=\\\"0\\\"> </rect>\\n\",\n       \"<path d=\\\"M 110.606,38.6588 158.225,44.9246\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 158.225,44.9246 187.461,6.81818\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 170.231,45.0558 190.697,18.3813\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 158.225,44.9246 
176.608,89.2969\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 187.461,6.81818 235.08,13.084\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 235.08,13.084 253.463,57.4563\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 228.963,23.4165 241.831,54.4771\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 253.463,57.4563 272.4,59.9481\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 272.4,59.9481 291.338,62.4399\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 253.463,57.4563 224.227,95.5627\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 300.293,74.3658 306.002,88.1468\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 306.002,88.1468 311.712,101.928\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 309.167,70.6892 314.877,84.4702\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 
314.877,84.4702 320.586,98.2511\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 307.838,54.9167 315.7,44.6689\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 315.7,44.6689 323.562,34.4211\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 224.227,95.5627 176.608,89.2969\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 218.337,85.099 185.004,80.7129\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 224.227,95.5627 219.014,135.177\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 176.608,89.2969 174.002,109.104\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 174.002,109.104 171.395,128.911\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 177.818,137.9 191.749,139.733\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 191.749,139.733 205.68,141.566\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<text 
style=\\\"font-size:16px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#0000FF\\\" x=\\\"291.338\\\" y=\\\"72.5275\\\"><tspan>N</tspan><tspan style=\\\"baseline-shift:super;font-size:12px;\\\">+</tspan><tspan/></text>\\n\",\n       \"<text style=\\\"font-size:16px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\\\" x=\\\"311.989\\\" y=\\\"116.099\\\"><tspan>O</tspan></text>\\n\",\n       \"<text style=\\\"font-size:16px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\\\" x=\\\"321.242\\\" y=\\\"34.4211\\\"><tspan>O</tspan><tspan style=\\\"baseline-shift:super;font-size:12px;\\\">-</tspan><tspan/></text>\\n\",\n       \"<text style=\\\"font-size:16px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\\\" x=\\\"162.866\\\" y=\\\"144.921\\\"><tspan>O</tspan></text>\\n\",\n       \"<text style=\\\"font-size:16px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#000000\\\" x=\\\"205.68\\\" y=\\\"151.187\\\"><tspan>Hg</tspan></text>\\n\",\n       \"</svg>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.SVG object>\"\n      ]\n     },\n     \"execution_count\": 19,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"smiles = train_smiles[index]\\n\",\n    \"mol = Chem.MolFromSmiles(train_smiles[index])\\n\",\n    \"\\n\",\n    \"print('smiles:', smiles)\\n\",\n    \"render_svg(moltosvg(mol))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"[Note] SVG images cannot be displayed on GitHub, but you can see an image of molecule when you execute it on jupyter notebook.\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   
\"metadata\": {},\n   \"source\": [\n    \"### Interactively watch through the tox21 dataset\\n\",\n    \"\\n\",\n    \"Jupyter notebook provides handy module to check/visualize the data.\\n\",\n    \"Here `interact` module can be used to interactively check the internal of tox 21 dataset.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 20,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"index=5878, SMILES=CN(C)CCn1nnnc1SCC1=C(C(=O)O)N2C(=O)C(NC(=O)Cc3csc(N)n3)C2SC1.Cl\\n\",\n      \"atom [ 6  7  6  6  6  7  7  7  7  6 16  6  6  6  6  8  8  7  6  8  6  7  6  8\\n\",\n      \"  6  6  6 16  6  7  7  6 16  6 17]\\n\",\n      \"labels [ 0  0  0  0  0  0  0 -1  0 -1  0  0]\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"image/svg+xml\": [\n       \"<svg baseProfile=\\\"full\\\" height=\\\"150px\\\" version=\\\"1.1\\\" width=\\\"450px\\\" xml:space=\\\"preserve\\\" xmlns:rdkit=\\\"http://www.rdkit.org/xml\\\" xmlns:svg=\\\"http://www.w3.org/2000/svg\\\" xmlns:xlink=\\\"http://www.w3.org/1999/xlink\\\">\\n\",\n       \"<rect height=\\\"150\\\" style=\\\"opacity:1.0;fill:#FFFFFF;stroke:none\\\" width=\\\"450\\\" x=\\\"0\\\" y=\\\"0\\\"> </rect>\\n\",\n       \"<path d=\\\"M 63.4663,27.5705 72.4585,31.2406\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 72.4585,31.2406 81.4507,34.9107\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 88.1003,33.689 95.5312,27.9255\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 95.5312,27.9255 102.962,22.162\\\" 
style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 85.3008,40.1037 86.5994,49.5872\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 86.5994,49.5872 87.8981,59.0706\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 87.8981,59.0706 109.207,67.7678\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 109.207,67.7678 110.506,77.2513\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 110.506,77.2513 111.805,86.7348\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 109.005,93.7589 99.0422,103.312\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 115.655,92.3537 124.134,96.9007\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 124.134,96.9007 132.613,101.448\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 97.5715,110.336 103.879,123.386\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 102.662,110.29 
107.078,119.425\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 109.058,126.767 125.212,124.555\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 129.227,120.264 130.92,110.856\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 130.92,110.856 132.613,101.448\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 125.205,116.626 126.39,110.04\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 126.39,110.04 127.575,103.455\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 132.613,101.448 141.438,97.1819\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 141.438,97.1819 150.264,92.9162\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#CCCC00;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 156.407,93.5191 164.389,98.9441\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#CCCC00;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 164.389,98.9441 172.37,104.369\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 
172.37,104.369 193.092,94.3528\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 193.092,94.3528 212.128,107.29\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 198.535,92.4864 211.86,101.543\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 193.092,94.3528 194.779,71.399\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 212.128,107.29 210.441,130.244\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 212.128,107.29 220.826,103.086\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 220.826,103.086 229.525,98.8812\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 209.147,132.148 216.874,137.399\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 216.874,137.399 224.6,142.65\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 211.735,128.341 219.461,133.592\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path 
d=\\\"M 219.461,133.592 227.187,138.843\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 210.441,130.244 203.534,133.583\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 203.534,133.583 196.627,136.922\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 236.174,97.5184 245.989,98.2396\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 245.989,98.2396 255.804,98.9607\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 233.132,93.4382 233.834,83.8792\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 233.834,83.8792 234.536,74.3203\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 254.061,100.465 259.925,107.258\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 259.925,107.258 265.789,114.052\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 257.546,97.4569 263.41,104.251\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n      
 \"<path d=\\\"M 263.41,104.251 269.273,111.044\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 255.804,98.9607 257.49,76.0069\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 257.49,76.0069 263.98,70.4058\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 263.98,70.4058 270.469,64.8047\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 257.49,76.0069 234.536,74.3203\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 281.563,63.2847 289.106,65.9117\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 289.106,65.9117 296.649,68.5387\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 294.388,68.9699 296.178,78.3561\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 296.178,78.3561 297.968,87.7422\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 298.91,68.1076 300.7,77.4937\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n     
  \"<path d=\\\"M 300.7,77.4937 302.49,86.8799\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#FF0000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 296.649,68.5387 314.072,53.5005\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 314.072,53.5005 335.807,61.0706\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 335.807,61.0706 354.707,47.936\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 341.269,62.8804 354.499,53.6862\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 335.807,61.0706 338.554,70.1695\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 338.554,70.1695 341.301,79.2683\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 354.707,47.936 362.338,53.7283\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 362.338,53.7283 369.968,59.5205\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#CCCC00;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 371.703,65.6881 368.586,74.6377\\\" 
style=\\\"fill:none;fill-rule:evenodd;stroke:#CCCC00;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 368.586,74.6377 365.469,83.5873\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 365.469,83.5873 370.57,90.9275\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 370.57,90.9275 375.672,98.2677\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 365.469,83.5873 355.626,83.3807\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 355.626,83.3807 345.784,83.1741\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 362.613,78.9232 355.723,78.7786\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 355.723,78.7786 348.833,78.6339\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#0000FF;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 234.536,74.3203 226.554,68.8953\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 226.554,68.8953 218.573,63.4703\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#CCCC00;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 212.429,62.8674 
203.604,67.1332\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#CCCC00;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<path d=\\\"M 203.604,67.1332 194.779,71.399\\\" style=\\\"fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:2px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1\\\"/>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#0000FF\\\" x=\\\"81.4507\\\" y=\\\"40.1037\\\"><tspan>N</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#0000FF\\\" x=\\\"109.005\\\" y=\\\"94.4067\\\"><tspan>N</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#0000FF\\\" x=\\\"92.3926\\\" y=\\\"110.336\\\"><tspan>N</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#0000FF\\\" x=\\\"102.409\\\" y=\\\"131.058\\\"><tspan>N</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#0000FF\\\" x=\\\"125.212\\\" y=\\\"127.936\\\"><tspan>N</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#CCCC00\\\" x=\\\"150.264\\\" y=\\\"95.2674\\\"><tspan>S</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\\\" x=\\\"225.894\\\" y=\\\"147.018\\\"><tspan>O</tspan></text>\\n\",\n       \"<text 
style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\\\" x=\\\"182.812\\\" y=\\\"144.096\\\"><tspan>HO</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#0000FF\\\" x=\\\"229.525\\\" y=\\\"101.11\\\"><tspan>N</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\\\" x=\\\"267.259\\\" y=\\\"120.22\\\"><tspan>O</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#0000FF\\\" x=\\\"268.264\\\" y=\\\"64.8047\\\"><tspan>NH</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#FF0000\\\" x=\\\"297.378\\\" y=\\\"94.9829\\\"><tspan>O</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#CCCC00\\\" x=\\\"369.968\\\" y=\\\"65.6881\\\"><tspan>S</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#0000FF\\\" x=\\\"370.674\\\" y=\\\"106.707\\\"><tspan>NH</tspan><tspan style=\\\"baseline-shift:sub;font-size:5.25px;\\\">2</tspan><tspan/></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#0000FF\\\" x=\\\"339.134\\\" y=\\\"86.9402\\\"><tspan>N</tspan></text>\\n\",\n       \"<text 
style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#CCCC00\\\" x=\\\"212.429\\\" y=\\\"65.2186\\\"><tspan>S</tspan></text>\\n\",\n       \"<text style=\\\"font-size:7px;font-style:normal;font-weight:normal;fill-opacity:1;stroke:none;font-family:sans-serif;text-anchor:start;fill:#00CC00\\\" x=\\\"209.454\\\" y=\\\"10.6541\\\"><tspan>ClH</tspan></text>\\n\",\n       \"</svg>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.SVG object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"<function __main__.show_train_dataset>\"\n      ]\n     },\n     \"execution_count\": 20,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"from ipywidgets import interact\\n\",\n    \"\\n\",\n    \"def show_train_dataset(index):\\n\",\n    \"    atom, adj, labels = train[index]\\n\",\n    \"    smiles = train_smiles[index]\\n\",\n    \"    print('index={}, SMILES={}'.format(index, smiles))\\n\",\n    \"    print('atom', atom)\\n\",\n    \"    # print('adj', adj)\\n\",\n    \"    print('labels', labels)\\n\",\n    \"    mol = Chem.MolFromSmiles(train_smiles[index])\\n\",\n    \"    return render_svg(moltosvg(mol))\\n\",\n    \"\\n\",\n    \"interact(show_train_dataset, index=(0, len(train) - 1, 1))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Appendix: how to save the molecule figure?\\n\",\n    \"\\n\",\n    \"### 1. 
Save with SVG format\\n\",\n    \"\\n\",\n    \"First method is simply save svg in file.\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 21,\n   \"metadata\": {\n    \"collapsed\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"import os\\n\",\n    \"dirpath = 'images'\\n\",\n    \"\\n\",\n    \"if not os.path.exists(dirpath):\\n\",\n    \"    os.mkdir(dirpath)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 22,\n   \"metadata\": {\n    \"collapsed\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"def save_svg(mol, filepath):\\n\",\n    \"    svg = moltosvg(mol)\\n\",\n    \"    with open(filepath, \\\"w\\\") as fw:\\n\",\n    \"        fw.write(svg)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 23,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"drawing images\\\\mol_6.svg\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"index = 6\\n\",\n    \"save_filepath = os.path.join(dirpath, 'mol_{}.svg'.format(index))\\n\",\n    \"print('drawing {}'.format(save_filepath))\\n\",\n    \"\\n\",\n    \"mol = Chem.MolFromSmiles(train_smiles[index])\\n\",\n    \"save_svg(mol, save_filepath)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2. 
Save with png format\\n\",\n    \"\\n\",\n    \"`rdkit` provides `Draw.MolToFile` method to visualize mol instance and save it to png format.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 24,\n   \"metadata\": {\n    \"collapsed\": true\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"from rdkit.Chem import Draw\\n\",\n    \"\\n\",\n    \"def save_png(mol, filepath, size=(600, 600)):\\n\",\n    \"    Draw.MolToFile(mol, filepath, size=size)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 25,\n   \"metadata\": {\n    \"collapsed\": false\n   },\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"drawing images\\\\mol_6.png\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"index = 6\\n\",\n    \"save_filepath = os.path.join(dirpath, 'mol_{}.png'.format(index))\\n\",\n    \"print('drawing {}'.format(save_filepath))\\n\",\n    \"\\n\",\n    \"mol = Chem.MolFromSmiles(train_smiles[index])\\n\",\n    \"save_png(mol, save_filepath, size=(600, 600))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"collapsed\": true\n   },\n   \"outputs\": [],\n   \"source\": []\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"collapsed\": true\n   },\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"anaconda-cloud\": {},\n  \"kernelspec\": {\n   \"display_name\": \"Python [conda root]\",\n   \"language\": \"python\",\n   \"name\": \"conda-root-py\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.5.2\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 0\n}\n"
  },
  {
    "path": "examples/tox21/train_tox21.py",
    "content": "#!/usr/bin/env python\n\nfrom __future__ import print_function\n\nimport os\n\nimport logging\n\nimport argparse\nimport chainer\nfrom chainer import functions as F\nfrom chainer import iterators as I\nfrom rdkit import RDLogger\n\nfrom chainer_chemistry.dataset.converters import converter_method_dict\nfrom chainer_chemistry import datasets as D\nfrom chainer_chemistry.iterators.balanced_serial_iterator import BalancedSerialIterator  # NOQA\nfrom chainer_chemistry.models.prediction import Classifier\nfrom chainer_chemistry.models.prediction import set_up_predictor\nfrom chainer_chemistry.training.extensions import ROCAUCEvaluator  # NOQA\nfrom chainer_chemistry.utils import run_train, save_json\n\nimport data\n\n# Disable errors by RDKit occurred in preprocessing Tox21 dataset.\n\nlg = RDLogger.logger()\nlg.setLevel(RDLogger.CRITICAL)\n# show INFO level log from chainer chemistry\nlogging.basicConfig(level=logging.INFO)\n\n\ndef main():\n    # Supported preprocessing/network list\n    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',\n                   'relgat', 'megnet']\n    label_names = D.get_tox21_label_names()\n    iterator_type = ['serial', 'balanced']\n\n    parser = argparse.ArgumentParser(\n        description='Multitask Learning with Tox21.')\n    parser.add_argument('--method', '-m', type=str, choices=method_list,\n                        default='nfp', help='graph convolution model to use '\n                        'as a predictor.')\n    parser.add_argument('--label', '-l', type=str, choices=label_names,\n                        default='', help='target label for logistic '\n                        'regression. Use all labels if this option '\n                        'is not specified.')\n    parser.add_argument('--iterator-type', type=str, choices=iterator_type,\n                        default='serial', help='iterator type. 
If `balanced` '\n                        'is specified, data is sampled to take same number of'\n                        'positive/negative labels during training.')\n    parser.add_argument('--eval-mode', type=int, default=1,\n                        help='Evaluation mode.'\n                        '0: only binary_accuracy is calculated.'\n                        '1: binary_accuracy and ROC-AUC score is calculated')\n    parser.add_argument('--conv-layers', '-c', type=int, default=4,\n                        help='number of convolution layers')\n    parser.add_argument('--batchsize', '-b', type=int, default=32,\n                        help='batch size')\n    parser.add_argument(\n        '--device', type=str, default='-1',\n        help='Device specifier. Either ChainerX device specifier or an '\n             'integer. If non-negative integer, CuPy arrays with specified '\n             'device id are used. If negative integer, NumPy arrays are used')\n    parser.add_argument('--out', '-o', type=str, default='result',\n                        help='path to output directory')\n    parser.add_argument('--epoch', '-e', type=int, default=10,\n                        help='number of epochs')\n    parser.add_argument('--unit-num', '-u', type=int, default=16,\n                        help='number of units in one layer of the model')\n    parser.add_argument('--resume', '-r', type=str, default='',\n                        help='path to a trainer snapshot')\n    parser.add_argument('--frequency', '-f', type=int, default=-1,\n                        help='Frequency of taking a snapshot')\n    parser.add_argument('--protocol', type=int, default=2,\n                        help='protocol version for pickle')\n    parser.add_argument('--model-filename', type=str, default='classifier.pkl',\n                        help='file name for pickled model')\n    parser.add_argument('--num-data', type=int, default=-1,\n                        help='Number of data to be parsed from 
parser.'\n                             '-1 indicates to parse all data.')\n    args = parser.parse_args()\n\n    method = args.method\n    if args.label:\n        labels = args.label\n        class_num = len(labels) if isinstance(labels, list) else 1\n    else:\n        labels = None\n        class_num = len(label_names)\n\n    # Dataset preparation\n    train, val, _ = data.load_dataset(method, labels, num_data=args.num_data)\n\n    # Network\n    predictor_ = set_up_predictor(\n        method, args.unit_num, args.conv_layers, class_num)\n\n    iterator_type = args.iterator_type\n    if iterator_type == 'serial':\n        train_iter = I.SerialIterator(train, args.batchsize)\n    elif iterator_type == 'balanced':\n        if class_num > 1:\n            raise ValueError('BalancedSerialIterator can be used with only one'\n                             'label classification, please specify label to'\n                             'be predicted by --label option.')\n        train_iter = BalancedSerialIterator(\n            train, args.batchsize, train.features[:, -1], ignore_labels=-1)\n        train_iter.show_label_stats()\n    else:\n        raise ValueError('Invalid iterator type {}'.format(iterator_type))\n\n    device = chainer.get_device(args.device)\n    classifier = Classifier(predictor_,\n                            lossfun=F.sigmoid_cross_entropy,\n                            metrics_fun=F.binary_accuracy,\n                            device=device)\n\n    extensions_list = []\n    eval_mode = args.eval_mode\n    converter = converter_method_dict[method]\n    if eval_mode == 1:\n        train_eval_iter = I.SerialIterator(train, args.batchsize,\n                                           repeat=False, shuffle=False)\n\n        extensions_list.append(ROCAUCEvaluator(\n            train_eval_iter, classifier, eval_func=predictor_,\n            device=device, converter=converter, name='train',\n            pos_labels=1, ignore_labels=-1, raise_value_error=False))\n 
       # extension name='validation' is already used by `Evaluator`,\n        # instead extension name `val` is used.\n        val_iter = I.SerialIterator(val, args.batchsize,\n                                    repeat=False, shuffle=False)\n        extensions_list.append(ROCAUCEvaluator(\n            val_iter, classifier, eval_func=predictor_,\n            device=device, converter=converter, name='val',\n            pos_labels=1, ignore_labels=-1))\n\n    run_train(classifier, train_iter, valid=val,\n              batch_size=args.batchsize, epoch=args.epoch, out=args.out,\n              device=device, converter=converter,\n              extensions_list=extensions_list, resume_path=args.resume)\n\n    # frequency = args.epoch if args.frequency == -1 else max(1, args.frequency)\n    # trainer.extend(E.snapshot(), trigger=(frequency, 'epoch'))\n    # trainer.run()\n\n    config = {'method': args.method,\n              'conv_layers': args.conv_layers,\n              'unit_num': args.unit_num,\n              'labels': args.label}\n    save_json(os.path.join(args.out, 'config.json'), config)\n\n    classifier.save_pickle(os.path.join(args.out, args.model_filename),\n                           protocol=args.protocol)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "setup.py",
    "content": "from distutils.core import setup\nimport os\n\nfrom setuptools import find_packages\n\nsetup_requires = []\ninstall_requires = [\n    'chainer >=7.0.0',\n    'joblib',\n    'matplotlib',\n    'pandas',\n    'scikit-learn',\n    'scipy',\n    'tqdm',\n]\n\n\nhere = os.path.abspath(os.path.dirname(__file__))\n# Get __version__ variable\nexec(open(os.path.join(here, 'chainer_chemistry', '_version.py')).read())\n\nsetup(name='chainer-chemistry',\n      version=__version__,  # NOQA\n      description='Chainer Chemistry: A Library for Deep Learning in Biology\\\n      and Chemistry',\n      author='Kosuke Nakago',\n      author_email='nakago@preferred.jp',\n      packages=find_packages(),\n      license='MIT',\n      url='http://chainer-chemistry.readthedocs.io/en/latest/index.html',\n      setup_requires=setup_requires,\n      install_requires=install_requires\n      )\n"
  },
  {
    "path": "tests/dataset_tests/parsers_tests/test_csv_file_parser.py",
    "content": "import os\n\nimport numpy\nimport pandas\nimport pytest\nfrom rdkit import Chem\nimport six\n\nfrom chainer_chemistry.dataset.parsers import CSVFileParser\nfrom chainer_chemistry.dataset.preprocessors import NFPPreprocessor\n\n\n@pytest.fixture\ndef mol_smiles():\n    mol_smiles1 = 'CN=C=O'\n    mol_smiles2 = 'Cc1ccccc1'\n    mol_smiles3 = 'CC1=CC2CC(CC1)O2'\n    return [mol_smiles1, mol_smiles2, mol_smiles3]\n\n\n@pytest.fixture\ndef mols(mol_smiles):\n    return [Chem.MolFromSmiles(smiles) for smiles in mol_smiles]\n\n\n@pytest.fixture()\ndef label_a():\n    return [2.1, 5.3, -1.2]\n\n\n@pytest.fixture()\ndef csv_file(tmpdir, mol_smiles, label_a):\n    fname = os.path.join(str(tmpdir), 'test.csv')\n    df = pandas.DataFrame({\n        'smiles': mol_smiles,\n        'labelA': label_a\n    })\n    df.to_csv(fname)\n    return fname\n\n\n@pytest.fixture()\ndef csv_file_invalid(tmpdir):\n    \"\"\"CSV file with invalid SMILES\"\"\"\n    fname = os.path.join(str(tmpdir), 'test_invalid.csv')\n    df = pandas.DataFrame({\n        'smiles': ['var', 'CN=C=O', 'hoge', 'Cc1ccccc1', 'CC1=CC2CC(CC1)O2'],\n        'labelA': [0., 2.1, 0., 5.3, -1.2],\n    })\n    df.to_csv(fname)\n    return fname\n\n\ndef check_input_features(actual, expect):\n    assert len(actual) == len(expect)\n    for d, e in six.moves.zip(actual, expect):\n        numpy.testing.assert_array_equal(d, e)\n\n\ndef check_features(actual, expect_input_features, expect_label):\n    assert len(actual) == len(expect_input_features) + 1\n    # input features testing\n    for d, e in six.moves.zip(actual[:-1], expect_input_features):\n        numpy.testing.assert_array_equal(d, e)\n    # label testing\n    assert actual[-1] == expect_label\n\n\ndef test_csv_file_parser_not_return_smiles(csv_file, mols):\n    preprocessor = NFPPreprocessor()\n    parser = CSVFileParser(preprocessor, smiles_col='smiles')\n    # Actually, `dataset, smiles = parser.parse(..)` is enough.\n    result = 
parser.parse(csv_file, return_smiles=False)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    is_successful = result['is_successful']\n    assert len(dataset) == 3\n    assert smiles is None\n    assert is_successful is None\n\n    # As we want test CSVFileParser, we assume\n    # NFPPreprocessor works as documented.\n    for i in range(3):\n        expect = preprocessor.get_input_features(mols[i])\n        check_input_features(dataset[i], expect)\n\n\ndef test_csv_file_parser_return_smiles(csv_file, mols, label_a):\n    \"\"\"test `labels` option and retain_smiles=True.\"\"\"\n    preprocessor = NFPPreprocessor()\n    parser = CSVFileParser(preprocessor, labels='labelA', smiles_col='smiles')\n    result = parser.parse(csv_file, return_smiles=True)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    assert len(dataset) == 3\n\n    # As we want test CSVFileParser, we assume\n    # NFPPreprocessor works as documented.\n    for i in range(3):\n        expect = preprocessor.get_input_features(mols[i])\n        check_features(dataset[i], expect, label_a[i])\n\n    # check smiles array\n    assert type(smiles) == numpy.ndarray\n    assert smiles.ndim == 1\n    assert len(smiles) == len(dataset)\n    assert smiles[0] == 'CN=C=O'\n    assert smiles[1] == 'Cc1ccccc1'\n    assert smiles[2] == 'CC1=CC2CC(CC1)O2'\n\n\ndef test_csv_file_parser_target_index(csv_file_invalid, mols, label_a):\n    \"\"\"test `labels` option and retain_smiles=True.\"\"\"\n    preprocessor = NFPPreprocessor()\n    parser = CSVFileParser(preprocessor, labels='labelA', smiles_col='smiles')\n    result = parser.parse(csv_file_invalid, return_smiles=True,\n                          target_index=[1, 2, 4], return_is_successful=True)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    assert len(dataset) == 2\n    is_successful = result['is_successful']\n    assert numpy.array_equal(is_successful, numpy.array([True, False, True]))\n    assert 
len(is_successful) == 3\n\n    # As we want test CSVFileParser, we assume\n    # NFPPreprocessor works as documented.\n    expect = preprocessor.get_input_features(mols[0])\n    check_features(dataset[0], expect, label_a[0])\n\n    expect = preprocessor.get_input_features(mols[2])\n    check_features(dataset[1], expect, label_a[2])\n\n    # check smiles array\n    assert type(smiles) == numpy.ndarray\n    assert smiles.ndim == 1\n    assert len(smiles) == len(dataset)\n    assert smiles[0] == 'CN=C=O'\n    assert smiles[1] == 'CC1=CC2CC(CC1)O2'\n\n\ndef test_csv_file_parser_extract_total_num(csv_file):\n    preprocessor = NFPPreprocessor()\n    parser = CSVFileParser(preprocessor, labels='labelA', smiles_col='smiles')\n    num = parser.extract_total_num(csv_file)\n    assert num == 3\n\n\ndef test_csv_parser_return_is_successful(csv_file_invalid, mols, label_a):\n    \"\"\"test `labels` option and retain_smiles=True.\"\"\"\n    preprocessor = NFPPreprocessor()\n    parser = CSVFileParser(preprocessor, labels='labelA',\n                           smiles_col='smiles')\n    result = parser.parse(csv_file_invalid, return_smiles=True,\n                          return_is_successful=True)\n\n    dataset = result['dataset']\n    # smiles = result['smiles']\n    assert len(dataset) == 3\n    is_successful = result['is_successful']\n    assert len(is_successful) == 5\n    # print('is_successful', is_successful)\n    assert numpy.alltrue(is_successful[[1, 3, 4]])\n    assert numpy.alltrue(~is_successful[[0, 2]])\n\n    # We assume NFPPreprocessor works as documented.\n    for i in range(3):\n        expect = preprocessor.get_input_features(mols[i])\n        check_features(dataset[i], expect, label_a[i])\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-s', '-v'])\n"
  },
  {
    "path": "tests/dataset_tests/parsers_tests/test_data_frame_parser.py",
    "content": "import numpy\nimport pandas\nimport pytest\nfrom rdkit import Chem\nimport six\n\nfrom chainer_chemistry.dataset.parsers import DataFrameParser\nfrom chainer_chemistry.dataset.preprocessors import NFPPreprocessor\n\n\n@pytest.fixture\ndef mol_smiles():\n    mol_smiles1 = 'CN=C=O'\n    mol_smiles2 = 'Cc1ccccc1'\n    mol_smiles3 = 'CC1=CC2CC(CC1)O2'\n    return [mol_smiles1, mol_smiles2, mol_smiles3]\n\n\n@pytest.fixture\ndef mols(mol_smiles):\n    return [Chem.MolFromSmiles(smiles) for smiles in mol_smiles]\n\n\n@pytest.fixture()\ndef label_a():\n    return [2.1, 5.3, -1.2]\n\n\n@pytest.fixture()\ndef data_frame(mol_smiles, label_a):\n    df = pandas.DataFrame({\n        'smiles': mol_smiles,\n        'labelA': label_a\n    })\n    return df\n\n\ndef check_input_features(actual, expect):\n    assert len(actual) == len(expect)\n    for d, e in six.moves.zip(actual, expect):\n        numpy.testing.assert_array_equal(d, e)\n\n\ndef check_features(actual, expect_input_features, expect_label):\n    assert len(actual) == len(expect_input_features) + 1\n    # input features testing\n    for d, e in six.moves.zip(actual[:-1], expect_input_features):\n        numpy.testing.assert_array_equal(d, e)\n    # label testing\n    assert actual[-1] == expect_label\n\n\ndef test_data_frame_parser_not_return_smiles(data_frame, mols):\n    \"\"\"Test default behavior\"\"\"\n    preprocessor = NFPPreprocessor()\n    parser = DataFrameParser(preprocessor, smiles_col='smiles')\n    # Actually, `dataset, smiles = parser.parse(..)` is enough.\n    result = parser.parse(data_frame, return_smiles=False)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    is_successful = result['is_successful']\n    assert len(dataset) == 3\n    assert smiles is None\n    assert is_successful is None\n\n    # As we want test DataFrameParser, we assume\n    # NFPPreprocessor works as documented.\n    for i in range(3):\n        expect = preprocessor.get_input_features(mols[i])\n 
       check_input_features(dataset[i], expect)\n\n\ndef test_data_frame_parser_return_smiles(data_frame, mols, label_a):\n    \"\"\"test `labels` option and retain_smiles=True.\"\"\"\n    preprocessor = NFPPreprocessor()\n    parser = DataFrameParser(preprocessor, labels='labelA',\n                             smiles_col='smiles')\n    result = parser.parse(data_frame, return_smiles=True)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    assert len(dataset) == 3\n\n    # We assume NFPPreprocessor works as documented.\n    for i in range(3):\n        expect = preprocessor.get_input_features(mols[i])\n        check_features(dataset[i], expect, label_a[i])\n\n    # check smiles array\n    assert type(smiles) == numpy.ndarray\n    assert smiles.ndim == 1\n    assert len(smiles) == len(dataset)\n    assert smiles[0] == 'CN=C=O'\n    assert smiles[1] == 'Cc1ccccc1'\n    assert smiles[2] == 'CC1=CC2CC(CC1)O2'\n\n\ndef test_data_frame_parser_target_index(data_frame, mols, label_a):\n    \"\"\"test `labels` option and retain_smiles=True.\"\"\"\n    preprocessor = NFPPreprocessor()\n    parser = DataFrameParser(preprocessor, labels='labelA',\n                             smiles_col='smiles')\n    result = parser.parse(data_frame, return_smiles=True, target_index=[0, 2],\n                          return_is_successful=True)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    assert len(dataset) == 2\n    is_successful = result['is_successful']\n    assert numpy.alltrue(is_successful)\n    assert len(is_successful) == 2\n\n    # We assume NFPPreprocessor works as documented.\n    expect = preprocessor.get_input_features(mols[0])\n    check_features(dataset[0], expect, label_a[0])\n\n    expect = preprocessor.get_input_features(mols[2])\n    check_features(dataset[1], expect, label_a[2])\n\n    # check smiles array\n    assert type(smiles) == numpy.ndarray\n    assert smiles.ndim == 1\n    assert len(smiles) == len(dataset)\n    assert 
smiles[0] == 'CN=C=O'\n    assert smiles[1] == 'CC1=CC2CC(CC1)O2'\n\n\ndef test_data_frame_parser_return_is_successful(mols, label_a):\n    \"\"\"test `labels` option and retain_smiles=True.\"\"\"\n    preprocessor = NFPPreprocessor()\n    parser = DataFrameParser(preprocessor, labels='labelA',\n                             smiles_col='smiles')\n    df = pandas.DataFrame({\n        'smiles': ['var', 'CN=C=O', 'hoge', 'Cc1ccccc1', 'CC1=CC2CC(CC1)O2'],\n        'labelA': [0., 2.1, 0., 5.3, -1.2],\n    })\n    result = parser.parse(df, return_smiles=True, return_is_successful=True)\n\n    dataset = result['dataset']\n    # smiles = result['smiles']\n    assert len(dataset) == 3\n    is_successful = result['is_successful']\n    assert len(is_successful) == 5\n    # print('is_successful', is_successful)\n    assert numpy.alltrue(is_successful[[1, 3, 4]])\n    assert numpy.alltrue(~is_successful[[0, 2]])\n\n    # We assume NFPPreprocessor works as documented.\n    for i in range(3):\n        expect = preprocessor.get_input_features(mols[i])\n        check_features(dataset[i], expect, label_a[i])\n\n\ndef test_data_frame_parser_extract_total_num(data_frame):\n    \"\"\"test `labels` option and retain_smiles=True.\"\"\"\n    preprocessor = NFPPreprocessor()\n    parser = DataFrameParser(preprocessor)\n    num = parser.extract_total_num(data_frame)\n    assert num == 3\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-s', '-v'])\n"
  },
  {
    "path": "tests/dataset_tests/parsers_tests/test_sdf_file_parser.py",
    "content": "import os\n\nimport numpy\nimport pytest\nfrom rdkit import Chem\nimport six\n\nfrom chainer_chemistry.dataset.parsers import SDFFileParser\nfrom chainer_chemistry.dataset.preprocessors import NFPPreprocessor\n\n\n@pytest.fixture\ndef mols():\n    mol1 = Chem.MolFromSmiles('CN=C=O')\n    mol2 = Chem.MolFromSmiles('Cc1ccccc1')\n    mol3 = Chem.MolFromSmiles('CC1=CC2CC(CC1)O2')\n    return [mol1, mol2, mol3]\n\n\n@pytest.fixture()\ndef sdf_file(tmpdir, mols):\n    # Chem.AllChem.Compute2DCoords(mol1)\n    fname = os.path.join(str(tmpdir), 'test.sdf')\n    writer = Chem.SDWriter(fname)\n    for mol in mols:\n        writer.write(mol)\n    return fname\n\n\n@pytest.fixture()\ndef sdf_file_long(tmpdir):\n    \"\"\"SDFFile with long smiles (ccc...)\"\"\"\n    fname = os.path.join(str(tmpdir), 'test_long.sdf')\n    writer = Chem.SDWriter(fname)\n    for smiles in ['CCCCCCCCCCCC', 'CN=C=O', 'CCCCCCCCCCCCCCCC',\n                   'Cc1ccccc1', 'CC1=CC2CC(CC1)O2']:\n        mol = Chem.MolFromSmiles(smiles)\n        writer.write(mol)\n    return fname\n\n\ndef check_input_features(actual, expect):\n    assert len(actual) == len(expect)\n    for d, e in six.moves.zip(actual, expect):\n        numpy.testing.assert_array_equal(d, e)\n\n\ndef test_sdf_file_parser_not_return_smiles(sdf_file, mols):\n    preprocessor = NFPPreprocessor()\n    parser = SDFFileParser(preprocessor)\n    result = parser.parse(sdf_file, return_smiles=False)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    is_successful = result['is_successful']\n    assert len(dataset) == 3\n    assert smiles is None\n    assert is_successful is None\n\n    # As we want test SDFFileParser, we assume\n    # NFPPreprocessor works as documented.\n    for i in range(3):\n        expect = preprocessor.get_input_features(mols[i])\n        check_input_features(dataset[i], expect)\n\n\ndef test_sdf_file_parser_return_smiles(sdf_file, mols):\n    preprocessor = NFPPreprocessor()\n    parser = 
SDFFileParser(preprocessor)\n    result = parser.parse(sdf_file, return_smiles=True)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    assert len(dataset) == 3\n\n    # As we want test SDFFileParser, we assume\n    # NFPPreprocessor works as documented.\n    for i in range(3):\n        expect = preprocessor.get_input_features(mols[i])\n        check_input_features(dataset[i], expect)\n\n    # check smiles array\n    assert type(smiles) == numpy.ndarray\n    assert smiles.ndim == 1\n    assert len(smiles) == len(dataset)\n    assert smiles[0] == 'CN=C=O'\n    assert smiles[1] == 'Cc1ccccc1'\n    assert smiles[2] == 'CC1=CC2CC(CC1)O2'\n\n\ndef test_sdf_file_parser_target_index(sdf_file, mols):\n    preprocessor = NFPPreprocessor()\n    parser = SDFFileParser(preprocessor)\n    result = parser.parse(sdf_file, return_smiles=True, target_index=[0, 2],\n                          return_is_successful=True)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    assert len(dataset) == 2\n    is_successful = result['is_successful']\n    assert numpy.alltrue(is_successful)\n    assert len(is_successful) == 2\n\n    # As we want test SDFFileParser, we assume\n    # NFPPreprocessor works as documented.\n    expect = preprocessor.get_input_features(mols[0])\n    check_input_features(dataset[0], expect)\n\n    expect = preprocessor.get_input_features(mols[2])\n    check_input_features(dataset[1], expect)\n\n    # check smiles array\n    assert type(smiles) == numpy.ndarray\n    assert smiles.ndim == 1\n    assert len(smiles) == len(dataset)\n    assert smiles[0] == 'CN=C=O'\n    assert smiles[1] == 'CC1=CC2CC(CC1)O2'\n\n\ndef test_sdf_file_parser_return_is_successful(sdf_file_long, mols):\n    \"\"\"test `labels` option and retain_smiles=True.\"\"\"\n    preprocessor = NFPPreprocessor(max_atoms=10)\n    parser = SDFFileParser(preprocessor)\n    result = parser.parse(sdf_file_long,\n                          return_smiles=True, 
return_is_successful=True)\n\n    dataset = result['dataset']\n    # smiles = result['smiles']\n    assert len(dataset) == 3\n    is_successful = result['is_successful']\n    assert len(is_successful) == 5\n    assert numpy.alltrue(is_successful[[1, 3, 4]])\n    assert numpy.alltrue(~is_successful[[0, 2]])\n\n    # We assume NFPPreprocessor works as documented.\n    for i in range(3):\n        expect = preprocessor.get_input_features(mols[i])\n        check_input_features(dataset[i], expect)\n\n\ndef test_sdf_file_parser_extract_total_num(sdf_file):\n    preprocessor = NFPPreprocessor()\n    parser = SDFFileParser(preprocessor)\n    num = parser.extract_total_num(sdf_file)\n    assert num == 3\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-s', '-v'])\n"
  },
  {
    "path": "tests/dataset_tests/parsers_tests/test_smiles_parser.py",
    "content": "import numpy\nimport pytest\nfrom rdkit import Chem\nimport six\n\nfrom chainer_chemistry.dataset.parsers import SmilesParser\nfrom chainer_chemistry.dataset.preprocessors import NFPPreprocessor\n\n\n@pytest.fixture\ndef mol_smiles():\n    mol_smiles1 = 'CN=C=O'\n    mol_smiles2 = 'Cc1ccccc1'\n    mol_smiles3 = 'CC1=CC2CC(CC1)O2'\n    return [mol_smiles1, mol_smiles2, mol_smiles3]\n\n\n@pytest.fixture\ndef mols(mol_smiles):\n    return [Chem.MolFromSmiles(smiles) for smiles in mol_smiles]\n\n\ndef check_input_features(actual, expect):\n    assert len(actual) == len(expect)\n    for d, e in six.moves.zip(actual, expect):\n        numpy.testing.assert_array_equal(d, e)\n\n\ndef test_smiles_parser_not_return_smiles(mol_smiles, mols):\n    preprocessor = NFPPreprocessor()\n    parser = SmilesParser(preprocessor)\n    result = parser.parse(mol_smiles, return_smiles=False)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    is_successful = result['is_successful']\n    assert len(dataset) == 3\n    assert smiles is None\n    assert is_successful is None\n\n    # As we want test CSVFileParser, we assume\n    # NFPPreprocessor works as documented.\n    for i in range(3):\n        expect = preprocessor.get_input_features(mols[i])\n        check_input_features(dataset[i], expect)\n\n\ndef test_smiles_parser_return_smiles(mol_smiles, mols):\n    \"\"\"test `labels` option and retain_smiles=True.\"\"\"\n    preprocessor = NFPPreprocessor()\n    parser = SmilesParser(preprocessor)\n    result = parser.parse(mol_smiles, return_smiles=True)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    assert len(dataset) == 3\n\n    # As we want test CSVFileParser, we assume\n    # NFPPreprocessor works as documented.\n    for i in range(3):\n        expect = preprocessor.get_input_features(mols[i])\n        check_input_features(dataset[i], expect)\n\n    # check smiles array\n    assert type(smiles) == numpy.ndarray\n    assert smiles.ndim 
== 1\n    assert len(smiles) == len(dataset)\n    assert smiles[0] == 'CN=C=O'\n    assert smiles[1] == 'Cc1ccccc1'\n    assert smiles[2] == 'CC1=CC2CC(CC1)O2'\n\n\ndef test_smiles_parser_target_index(mol_smiles, mols):\n    \"\"\"test `labels` option and retain_smiles=True.\"\"\"\n    preprocessor = NFPPreprocessor()\n    parser = SmilesParser(preprocessor)\n    result = parser.parse(mol_smiles, return_smiles=True, target_index=[0, 2],\n                          return_is_successful=True)\n    dataset = result['dataset']\n    smiles = result['smiles']\n    assert len(dataset) == 2\n    is_successful = result['is_successful']\n    assert numpy.alltrue(is_successful)\n    assert len(is_successful) == 2\n\n    # As we want test CSVFileParser, we assume\n    # NFPPreprocessor works as documented.\n    expect = preprocessor.get_input_features(mols[0])\n    check_input_features(dataset[0], expect)\n\n    expect = preprocessor.get_input_features(mols[2])\n    check_input_features(dataset[1], expect)\n\n    # check smiles array\n    assert type(smiles) == numpy.ndarray\n    assert smiles.ndim == 1\n    assert len(smiles) == len(dataset)\n    assert smiles[0] == 'CN=C=O'\n    assert smiles[1] == 'CC1=CC2CC(CC1)O2'\n\n\ndef test_smiles_parser_return_is_successful(mols):\n    \"\"\"test `labels` option and retain_smiles=True.\"\"\"\n    preprocessor = NFPPreprocessor()\n    parser = SmilesParser(preprocessor)\n    mol_smiles_with_invalid = [\n        'var', 'CN=C=O', 'hoge', 'Cc1ccccc1', 'CC1=CC2CC(CC1)O2']\n    result = parser.parse(mol_smiles_with_invalid, return_smiles=True,\n                          return_is_successful=True)\n\n    dataset = result['dataset']\n    assert len(dataset) == 3\n    is_successful = result['is_successful']\n    assert len(is_successful) == 5\n    assert numpy.alltrue(is_successful[[1, 3, 4]])\n    assert numpy.alltrue(~is_successful[[0, 2]])\n\n    # We assume NFPPreprocessor works as documented.\n    for i in range(3):\n        expect = 
preprocessor.get_input_features(mols[i])\n        check_input_features(dataset[i], expect)\n\n\ndef test_smiles_parser_extract_total_num(mol_smiles):\n    preprocessor = NFPPreprocessor()\n    parser = SmilesParser(preprocessor)\n    num = parser.extract_total_num(mol_smiles)\n    assert num == 3\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-s', '-v'])\n"
  },
  {
    "path": "tests/dataset_tests/preprocessor_tests/test_common.py",
    "content": "import numpy\nimport pytest\nfrom rdkit import Chem\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.dataset.preprocessors import common\nfrom chainer_chemistry.utils.extend import extend_adj\n\n\n@pytest.fixture\ndef sample_molecule():\n    return Chem.MolFromSmiles('CN=C=O')\n\n\n@pytest.fixture\ndef sample_molecule_2():\n    return Chem.MolFromSmiles('Cc1ccccc1')\n\n\nclass TestGetAtomicNumbers(object):\n\n    def test_normal(self, sample_molecule):\n        actual = common.construct_atomic_number_array(sample_molecule)\n\n        assert actual.shape == (4,)\n        expect = numpy.array([6, 7, 6, 8], dtype=numpy.int32)\n        numpy.testing.assert_equal(actual, expect)\n\n    def test_padding(self, sample_molecule):\n        actual = common.construct_atomic_number_array(sample_molecule, 5)\n\n        assert actual.shape == (5,)\n        expect = numpy.array([6, 7, 6, 8, 0], dtype=numpy.int32)\n        numpy.testing.assert_equal(actual, expect)\n\n    def test_normal_truncated(self, sample_molecule):\n        with pytest.raises(ValueError):\n            adj = common.construct_atomic_number_array(sample_molecule, 3)  # NOQA\n\n\nclass TestGetAdjMatrix(object):\n\n    def test_normal(self, sample_molecule_2):\n        adj = common.construct_adj_matrix(sample_molecule_2)\n\n        assert adj.shape == (7, 7)\n        expect = numpy.array(\n            [[1., 1., 0., 0., 0., 0., 0., ],\n             [1., 1., 1., 0., 0., 0., 1., ],\n             [0., 1., 1., 1., 0., 0., 0., ],\n             [0., 0., 1., 1., 1., 0., 0., ],\n             [0., 0., 0., 1., 1., 1., 0., ],\n             [0., 0., 0., 0., 1., 1., 1., ],\n             [0., 1., 0., 0., 0., 1., 1., ]],\n            dtype=numpy.float32)\n        numpy.testing.assert_equal(adj, expect)\n\n    def test_normal_no_self_connection(self, sample_molecule_2):\n        adj = common.construct_adj_matrix(sample_molecule_2,\n                                          
self_connection=False)\n\n        assert adj.shape == (7, 7)\n        expect = numpy.array(\n            [[0., 1., 0., 0., 0., 0., 0.],\n             [1., 0., 1., 0., 0., 0., 1.],\n             [0., 1., 0., 1., 0., 0., 0.],\n             [0., 0., 1., 0., 1., 0., 0.],\n             [0., 0., 0., 1., 0., 1., 0.],\n             [0., 0., 0., 0., 1., 0., 1.],\n             [0., 1., 0., 0., 0., 1., 0.]],\n            dtype=numpy.float32)\n        numpy.testing.assert_equal(adj, expect)\n\n    def test_normal_padding(self, sample_molecule_2):\n        adj = common.construct_adj_matrix(sample_molecule_2, 8)\n\n        assert adj.shape == (8, 8)\n        expect = numpy.array(\n            [[1., 1., 0., 0., 0., 0., 0., 0.],\n             [1., 1., 1., 0., 0., 0., 1., 0.],\n             [0., 1., 1., 1., 0., 0., 0., 0.],\n             [0., 0., 1., 1., 1., 0., 0., 0.],\n             [0., 0., 0., 1., 1., 1., 0., 0.],\n             [0., 0., 0., 0., 1., 1., 1., 0.],\n             [0., 1., 0., 0., 0., 1., 1., 0.],\n             [0., 0., 0., 0., 0., 0., 0., 0.]],\n            dtype=numpy.float32)\n        numpy.testing.assert_equal(adj, expect)\n\n    def test_normal_truncated(self, sample_molecule_2):\n        with pytest.raises(ValueError):\n            adj = common.construct_adj_matrix(sample_molecule_2, 6)  # NOQA\n\n\nclass TestConstructDiscreteEdgeMatrix(object):\n\n    expect_adj = numpy.array(\n            [[[0., 1., 0., 0., 0., 0., 0.],\n              [1., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.]],\n             [[0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n        
      [0., 0., 0., 0., 0., 0., 0.]],\n             [[0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 0., 0., 0., 0., 0.]],\n             [[0., 0., 0., 0., 0., 0., 0.],\n              [0., 0., 1., 0., 0., 0., 1.],\n              [0., 1., 0., 1., 0., 0., 0.],\n              [0., 0., 1., 0., 1., 0., 0.],\n              [0., 0., 0., 1., 0., 1., 0.],\n              [0., 0., 0., 0., 1., 0., 1.],\n              [0., 1., 0., 0., 0., 1., 0.]]], dtype=numpy.float32)\n\n    def test_default(self, sample_molecule_2):\n        adj = common.construct_discrete_edge_matrix(sample_molecule_2)\n        assert adj.shape == (4, 7, 7)\n        numpy.testing.assert_equal(adj, self.expect_adj)\n\n    def test_add_self_connection_channel(self, sample_molecule_2):\n        adj = common.construct_discrete_edge_matrix(\n            sample_molecule_2, add_self_connection_channel=True)\n        assert adj.shape == (5, 7, 7)\n        numpy.testing.assert_equal(adj[:4], self.expect_adj)\n        numpy.testing.assert_equal(adj[4], numpy.eye(7, 7))\n\n    def test_padding(self, sample_molecule_2):\n        adj = common.construct_discrete_edge_matrix(sample_molecule_2, 8)\n\n        assert adj.shape == (4, 8, 8)\n        expect = extend_adj(self.expect_adj, out_size=8, axis=[-1, -2])\n        numpy.testing.assert_equal(adj, expect)\n\n    def test_truncated(self, sample_molecule_2):\n        with pytest.raises(ValueError):\n            adj = common.construct_discrete_edge_matrix(sample_molecule_2, 6)  # NOQA\n\n\ndef test_construct_super_node_feature_adj_ndim2(sample_molecule):\n    adj = common.construct_adj_matrix(sample_molecule)\n    atom_array = common.construct_atomic_number_array(sample_molecule)\n    s = common.construct_supernode_feature(sample_molecule, 
atom_array, adj)\n    # print(s)\n    assert s.shape == (MAX_ATOMIC_NUM * 2 + 4,)\n    assert s[0] == len(atom_array)\n    assert s[1] == adj.sum()\n    assert s[2] == 1\n    assert s[3] == 1\n    assert s[3 + 6] == 1  # C\n    assert s[3 + 7] == 1  # N\n    assert s[3 + 8] == 1  # O\n    assert s[3 + MAX_ATOMIC_NUM] == 0  # other\n    assert s[3 + MAX_ATOMIC_NUM + 6] == 2 / len(atom_array)\n    assert s[3 + MAX_ATOMIC_NUM + 7] == 1 / len(atom_array)\n    assert s[3 + MAX_ATOMIC_NUM + 8] == 1 / len(atom_array)\n    assert s[3 + MAX_ATOMIC_NUM * 2] == 0\n\n\ndef test_construct_super_node_feature_adj_ndim3(sample_molecule):\n    adj = common.construct_discrete_edge_matrix(sample_molecule)\n    atom_array = common.construct_atomic_number_array(sample_molecule)\n    s = common.construct_supernode_feature(sample_molecule, atom_array, adj)\n    assert s.shape == (MAX_ATOMIC_NUM * 2 + 10,)\n    assert s[0] == len(atom_array)\n    assert s[1] == adj.sum()\n    assert s[2] == 1\n    assert s[3] == 1\n    assert s[4] == 0\n    assert s[5] == 0\n    assert pytest.approx(s[6], 1 * 2 / adj.sum())  # symmetric\n    assert pytest.approx(s[7], 2 * 2 / adj.sum())  # symmetric\n    assert s[8] == 0\n    assert s[9] == 0\n    assert s[9 + 6] == 1  # C\n    assert s[9 + 6] == 1  # N\n    assert s[9 + 7] == 1  # O\n    assert s[9 + MAX_ATOMIC_NUM] == 0  # other\n    assert s[9 + MAX_ATOMIC_NUM + 6] == 2 / len(atom_array)\n    assert s[9 + MAX_ATOMIC_NUM + 7] == 1 / len(atom_array)\n    assert s[9 + MAX_ATOMIC_NUM + 8] == 1 / len(atom_array)\n    assert s[9 + MAX_ATOMIC_NUM * 2] == 0\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_atomic_number_preprocessor.py",
    "content": "import numpy\nimport pytest\nfrom rdkit import Chem\n\nfrom chainer_chemistry.dataset.parsers import SmilesParser\nfrom chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError  # NOQA\n\n\n@pytest.fixture\ndef mol():\n    ret = Chem.MolFromSmiles('CN=C=O')\n    return ret\n\n\ndef test_atomic_number_default_preprocessor(mol):\n    preprocessor = AtomicNumberPreprocessor()\n    ret_atom_array = preprocessor.get_input_features(mol)\n    expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32)\n    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)\n\n\ndef test_atomic_number_non_default_padding_preprocessor(mol):\n    preprocessor = AtomicNumberPreprocessor(out_size=10)\n    ret_atom_array = preprocessor.get_input_features(mol)\n    expect_atom_array = numpy.array([6, 7, 6, 8, 0, 0, 0, 0, 0, 0],\n                                    dtype=numpy.int32)\n    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)\n\n\ndef test_atomic_number_non_default_max_atoms_preprocessor(mol):\n    preprocessor = AtomicNumberPreprocessor(max_atoms=5)\n    ret_atom_array = preprocessor.get_input_features(mol)\n    expect_atom_array = numpy.array([6, 7, 6, 8],\n                                    dtype=numpy.int32)\n    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)\n\n    preprocessor = AtomicNumberPreprocessor(max_atoms=3)\n    with pytest.raises(MolFeatureExtractionError):\n        preprocessor.get_input_features(mol)\n\n\ndef test_atomic_number_preprocessor(mol):\n    preprocessor = AtomicNumberPreprocessor(max_atoms=5, out_size=10)\n    ret_atom_array = preprocessor.get_input_features(mol)\n    expect_atom_array = numpy.array([6, 7, 6, 8, 0, 0, 0, 0, 0, 0],\n                                    dtype=numpy.int32)\n    numpy.testing.assert_array_equal(ret_atom_array, 
expect_atom_array)\n\n\ndef test_atomic_number_preprocessor_default():\n    preprocessor = AtomicNumberPreprocessor()\n    dataset = SmilesParser(preprocessor).parse(\n        ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'])['dataset']\n    index = numpy.random.choice(len(dataset), None)\n    atoms, = dataset[index]\n\n    assert atoms.ndim == 1\n    assert atoms.dtype == numpy.int32\n\n\ndef test_atomic_number_preprocessor_assert_raises():\n    with pytest.raises(ValueError):\n        AtomicNumberPreprocessor(max_atoms=3, out_size=2)  # NOQA\n\n\nif __name__ == '__main__':\n    pytest.main()\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_cgcnn_preprocessor.py",
    "content": "import pytest\n\nfrom chainer_chemistry.dataset.preprocessors import CGCNNPreprocessor\n\n\ndef test_cgcnn_preprocessor_init():\n    pp = CGCNNPreprocessor()\n    print('pp.atom_features', pp.atom_features)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_gat_preprocessor.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.dataset.parsers import SDFFileParser\nfrom chainer_chemistry.dataset.preprocessors import RelGATPreprocessor\nfrom chainer_chemistry.datasets import get_tox21_filepath\n\n\n@pytest.mark.slow\ndef test_gat_preprocessor():\n    preprocessor = RelGATPreprocessor()\n\n    def postprocess_label(label_list):\n        # Set -1 to the place where the label is not found,\n        # this corresponds to not calculate loss with `sigmoid_cross_entropy`\n        return [-1 if label is None else label for label in label_list]\n\n    dataset = SDFFileParser(preprocessor, postprocess_label=postprocess_label\n                            ).parse(get_tox21_filepath('train'))[\"dataset\"]\n\n    index = numpy.random.choice(len(dataset), None)\n    atoms, adjs = dataset[index]\n\n    assert atoms.ndim == 1  # (atom, )\n    assert atoms.dtype == numpy.int32\n    # (edge_type, atom from, atom to)\n    assert adjs.ndim == 3\n    assert adjs.dtype == numpy.float32\n\n\ndef test_gat_preprocessor_assert_raises():\n    with pytest.raises(ValueError):\n        pp = RelGATPreprocessor(max_atoms=3, out_size=2)  # NOQA\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_ggnn_preprocessor.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.dataset.parsers import SmilesParser\nfrom chainer_chemistry.dataset.preprocessors import GGNNPreprocessor\n\n\ndef test_ggnn_preprocessor():\n    preprocessor = GGNNPreprocessor()\n    dataset = SmilesParser(preprocessor).parse(\n        ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1']\n    )[\"dataset\"]\n\n    index = numpy.random.choice(len(dataset), None)\n    atoms, adjs = dataset[index]\n    assert atoms.ndim == 1  # (atom, )\n    assert atoms.dtype == numpy.int32\n    # (edge_type, atom from, atom to)\n    assert adjs.ndim == 3\n    assert adjs.dtype == numpy.float32\n\n    atoms0, adjs0 = dataset[0]\n    assert numpy.allclose(atoms0, numpy.array([6, 7], numpy.int32))\n    expect_adjs = numpy.array(\n        [[[0., 0.],\n          [0., 0.]],\n         [[0., 0.],\n          [0., 0.]],\n         [[0., 1.],\n          [1., 0.]],\n         [[0., 0.],\n          [0., 0.]]], dtype=numpy.float32)\n    assert numpy.allclose(adjs0, expect_adjs)\n\n    atoms1, adjs1 = dataset[1]\n    assert numpy.allclose(\n        atoms1, numpy.array([6, 6, 6, 7, 6, 6, 8, 7, 6], numpy.int32))\n    # include aromatic bond (ch=3)\n    expect_adjs = numpy.array(\n        [[[0., 1., 0., 0., 0., 0., 0., 0., 0.],\n          [1., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 1., 0., 0., 0.],\n          [0., 0., 0., 0., 1., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 1.],\n          [0., 0., 0., 0., 0., 0., 0., 1., 0.]],\n         [[0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 1., 0., 0.],\n          [0., 0., 0., 0., 0., 1., 0., 0., 
0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.]],\n         [[0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.]],\n         [[0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 1., 0., 0., 0., 0., 1., 0.],\n          [0., 1., 0., 1., 0., 0., 0., 0., 0.],\n          [0., 0., 1., 0., 1., 0., 0., 0., 0.],\n          [0., 0., 0., 1., 0., 0., 0., 1., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 1., 0., 0., 1., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=numpy.float32)\n    assert numpy.allclose(adjs1, expect_adjs)\n\n\ndef test_ggnn_preprocessor_kekulize():\n    preprocessor = GGNNPreprocessor(kekulize=True)\n    dataset = SmilesParser(preprocessor).parse(\n        ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1']\n    )[\"dataset\"]\n    atoms1, adjs1 = dataset[1]\n    assert numpy.allclose(\n        atoms1, numpy.array([6, 6, 6, 7, 6, 6, 8, 7, 6], numpy.int32))\n    # NOT include aromatic bond (ch=3)\n    expect_adjs = numpy.array(\n        [[[0., 1., 0., 0., 0., 0., 0., 0., 0.],\n          [1., 0., 0., 0., 0., 0., 0., 1., 0.],\n          [0., 0., 0., 1., 0., 0., 0., 0., 0.],\n          [0., 0., 1., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 1., 0., 1., 0.],\n          [0., 0., 0., 0., 1., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 1., 0., 0., 1., 0., 0., 0., 1.],\n          [0., 0., 0., 0., 0., 0., 0., 1., 0.]],\n         [[0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 1., 0., 0., 0., 
0., 0., 0.],\n          [0., 1., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 1., 0., 0., 0., 0.],\n          [0., 0., 0., 1., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 1., 0., 0.],\n          [0., 0., 0., 0., 0., 1., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.]],\n         [[0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.]],\n         [[0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=numpy.float32)\n    assert numpy.allclose(adjs1, expect_adjs)\n\n\ndef test_ggnn_preprocessor_assert_raises():\n    with pytest.raises(ValueError):\n        pp = GGNNPreprocessor(max_atoms=3, out_size=2)  # NOQA\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_gwm_preprocessor.py",
    "content": "import pytest\nfrom rdkit import Chem\n\nfrom chainer_chemistry.dataset.preprocessors.gwm_preprocessor import (\n    NFPGWMPreprocessor, GGNNGWMPreprocessor, GINGWMPreprocessor,\n    RSGCNGWMPreprocessor)  # NOQA\n\n\n@pytest.fixture\ndef mol():\n    ret = Chem.MolFromSmiles('CN=C=O')\n    return ret\n\n\n@pytest.mark.parametrize('pp_type', [\n    NFPGWMPreprocessor, GGNNGWMPreprocessor, GINGWMPreprocessor,\n    RSGCNGWMPreprocessor])\ndef test_gwm_preprocessor(mol, pp_type):\n    pp = pp_type()\n    ret = pp.get_input_features(mol)\n    # currently all preprocessor returns `super_node_x` at 3rd args.\n    assert len(ret) == 3\n    super_node_x = ret[2]\n\n    # print('super_node_x', super_node_x.shape, super_node_x)\n    assert super_node_x.ndim == 1\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_mol_preprocessor.py",
    "content": "import pytest\nfrom rdkit import Chem\n\nfrom chainer_chemistry.dataset.preprocessors import MolPreprocessor\n\n\n@pytest.fixture\ndef mol():\n    ret = Chem.MolFromSmiles('CN=C=O')\n    ret.SetProp('foo', '1')\n    ret.SetProp('bar', '2')\n    return ret\n\n\n@pytest.fixture\ndef pp():\n    return MolPreprocessor()\n\n\nclass TestGetLabel(object):\n\n    def test_default(self, mol, pp):\n        labels = pp.get_label(mol)\n        assert labels == []\n\n    def test_empty(self, mol, pp):\n        labels = pp.get_label(mol, [])\n        assert labels == []\n\n    def test_one_label(self, mol, pp):\n        labels = pp.get_label(mol, ['foo'])\n        assert labels == ['1']\n\n    def test_two_labels(self, mol, pp):\n        labels = pp.get_label(mol, ['bar', 'foo'])\n        assert labels == ['2', '1']\n\n    def test_non_existent_label(self, mol, pp):\n        labels = pp.get_label(mol, ['foo', 'buz'])\n        assert labels == ['1', None]\n\n\nif __name__ == '__main__':\n    pytest.main()\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_nfp_preprocessor.py",
    "content": "import numpy\nimport pytest\nfrom rdkit import Chem\n\nfrom chainer_chemistry.dataset.parsers import SmilesParser\nfrom chainer_chemistry.dataset.preprocessors import NFPPreprocessor\n\n\n@pytest.fixture\ndef mol():\n    ret = Chem.MolFromSmiles('CN=C=O')\n    return ret\n\n\n@pytest.fixture\ndef pp():\n    return NFPPreprocessor()\n\n\ndef test_nfp_preprocessor(mol, pp):\n    ret = pp.get_input_features(mol)\n    assert len(ret) == 2\n    actual_atom_array, actual_adj_array = ret\n\n    expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32)\n    numpy.testing.assert_array_equal(actual_atom_array, expect_atom_array)\n\n    expect_adj_array = numpy.array([[1, 1, 0, 0],\n                                    [1, 1, 1, 0],\n                                    [0, 1, 1, 1],\n                                    [0, 0, 1, 1]], dtype=numpy.float32)\n    numpy.testing.assert_array_equal(actual_adj_array, expect_adj_array)\n\n\ndef test_nfp_preprocessor_default():\n    preprocessor = NFPPreprocessor()\n\n    dataset = SmilesParser(preprocessor).parse(\n        ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'])['dataset']\n\n    index = numpy.random.choice(len(dataset), None)\n    atoms, adjs = dataset[index]\n\n    assert atoms.ndim == 1  # (atom, )\n    assert atoms.dtype == numpy.int32\n    # (atom from, atom to)\n    assert adjs.ndim == 2\n    assert adjs.dtype == numpy.float32\n\n\ndef test_nfp_preprocessor_assert_raises():\n    with pytest.raises(ValueError):\n        pp = NFPPreprocessor(max_atoms=3, out_size=2)  # NOQA\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_relgcn_preprocessor.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.dataset.parsers import SmilesParser\nfrom chainer_chemistry.dataset.preprocessors import RelGCNPreprocessor\n\n\ndef test_relgcn_preprocessor():\n    preprocessor = RelGCNPreprocessor()\n    dataset = SmilesParser(preprocessor).parse(\n        ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1']\n    )[\"dataset\"]\n\n    index = numpy.random.choice(len(dataset), None)\n    atoms, adjs = dataset[index]\n    assert atoms.ndim == 1  # (atom, )\n    assert atoms.dtype == numpy.int32\n    # (edge_type, atom from, atom to)\n    assert adjs.ndim == 3\n    assert adjs.dtype == numpy.float32\n\n    atoms0, adjs0 = dataset[0]\n    assert numpy.allclose(atoms0, numpy.array([6, 7], numpy.int32))\n    expect_adjs = numpy.array(\n        [[[0., 0.],\n          [0., 0.]],\n         [[0., 0.],\n          [0., 0.]],\n         [[0., 1.],\n          [1., 0.]],\n         [[0., 0.],\n          [0., 0.]]], dtype=numpy.float32)\n    assert numpy.allclose(adjs0, expect_adjs)\n\n    atoms1, adjs1 = dataset[1]\n    assert numpy.allclose(\n        atoms1, numpy.array([6, 6, 6, 7, 6, 6, 8, 7, 6], numpy.int32))\n    # include aromatic bond (ch=3)\n    expect_adjs = numpy.array(\n        [[[0., 1., 0., 0., 0., 0., 0., 0., 0.],\n          [1., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 1., 0., 0., 0.],\n          [0., 0., 0., 0., 1., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 1.],\n          [0., 0., 0., 0., 0., 0., 0., 1., 0.]],\n         [[0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 1., 0., 0.],\n          [0., 0., 0., 0., 0., 1., 0., 
0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.]],\n         [[0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.]],\n         [[0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 1., 0., 0., 0., 0., 1., 0.],\n          [0., 1., 0., 1., 0., 0., 0., 0., 0.],\n          [0., 0., 1., 0., 1., 0., 0., 0., 0.],\n          [0., 0., 0., 1., 0., 0., 0., 1., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 1., 0., 0., 1., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=numpy.float32)\n    assert numpy.allclose(adjs1, expect_adjs)\n\n\ndef test_relgcn_preprocessor_kekulize():\n    preprocessor = RelGCNPreprocessor(kekulize=True)\n    dataset = SmilesParser(preprocessor).parse(\n        ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1']\n    )[\"dataset\"]\n    atoms1, adjs1 = dataset[1]\n    assert numpy.allclose(\n        atoms1, numpy.array([6, 6, 6, 7, 6, 6, 8, 7, 6], numpy.int32))\n    # NOT include aromatic bond (ch=3)\n    expect_adjs = numpy.array(\n        [[[0., 1., 0., 0., 0., 0., 0., 0., 0.],\n          [1., 0., 0., 0., 0., 0., 0., 1., 0.],\n          [0., 0., 0., 1., 0., 0., 0., 0., 0.],\n          [0., 0., 1., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 1., 0., 1., 0.],\n          [0., 0., 0., 0., 1., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 1., 0., 0., 1., 0., 0., 0., 1.],\n          [0., 0., 0., 0., 0., 0., 0., 1., 0.]],\n         [[0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 1., 0., 
0., 0., 0., 0., 0.],\n          [0., 1., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 1., 0., 0., 0., 0.],\n          [0., 0., 0., 1., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 1., 0., 0.],\n          [0., 0., 0., 0., 0., 1., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.]],\n         [[0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.]],\n         [[0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.],\n          [0., 0., 0., 0., 0., 0., 0., 0., 0.]]], dtype=numpy.float32)\n    assert numpy.allclose(adjs1, expect_adjs)\n\n\ndef test_relgcn_preprocessor_assert_raises():\n    with pytest.raises(ValueError):\n        pp = RelGCNPreprocessor(max_atoms=3, out_size=2)  # NOQA\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_rsgcn_preprocessor.py",
    "content": "import numpy\nimport pytest\nfrom rdkit import Chem\n\nfrom chainer_chemistry.dataset.parsers import SmilesParser\nfrom chainer_chemistry.dataset.preprocessors.common import MolFeatureExtractionError  # NOQA\nfrom chainer_chemistry.dataset.preprocessors.rsgcn_preprocessor import RSGCNPreprocessor  # NOQA\n\n\n@pytest.fixture\ndef mol():\n    return Chem.MolFromSmiles('CN=C=O')\n\n\ndef test_rsgcn_default_preprocessor(mol):\n    preprocessor = RSGCNPreprocessor()\n    ret_atom_array, ret_adj_array = preprocessor.get_input_features(mol)\n    expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32)\n    expect_adj_array = numpy.array(\n        [[0.5, 0.4082, 0, 0], [0.4082, 0.3333, 0.3333, 0],\n         [0, 0.3333, 0.3333, 0.4082], [0, 0, 0.4082, 0.5]],\n        dtype=numpy.float32)\n\n    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)\n    numpy.testing.assert_allclose(\n        ret_adj_array, expect_adj_array, rtol=1e-03, atol=1e-03)\n\n\ndef test_rsgcn_non_default_padding_preprocessor(mol):\n    preprocessor = RSGCNPreprocessor(out_size=7)\n    ret_atom_array, ret_adj_array = preprocessor.get_input_features(mol)\n    expect_atom_array = numpy.array([6, 7, 6, 8, 0, 0, 0], dtype=numpy.int32)\n    expect_adj_array = numpy.array(\n        [[0.5, 0.4082, 0, 0, 0, 0, 0], [0.4082, 0.3333, 0.3333, 0, 0, 0, 0],\n         [0, 0.3333, 0.3333, 0.4082, 0, 0, 0], [0, 0, 0.4082, 0.5, 0, 0, 0],\n         [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]],\n        dtype=numpy.float32)\n\n    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)\n    numpy.testing.assert_allclose(\n        ret_adj_array, expect_adj_array, rtol=1e-03, atol=1e-03)\n\n\ndef test_rsgcn_non_default_max_atoms_preprocessor(mol):\n    preprocessor = RSGCNPreprocessor(max_atoms=5)\n    ret_atom_array, ret_adj_array = preprocessor.get_input_features(mol)\n    expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32)\n    
expect_adj_array = numpy.array(\n        [[0.5, 0.4082, 0, 0], [0.4082, 0.3333, 0.3333, 0],\n         [0, 0.3333, 0.3333, 0.4082], [0, 0, 0.4082, 0.5]],\n        dtype=numpy.float32)\n\n    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)\n    numpy.testing.assert_allclose(\n        ret_adj_array, expect_adj_array, rtol=1e-03, atol=1e-03)\n\n    preprocessor = RSGCNPreprocessor(max_atoms=3)\n    with pytest.raises(MolFeatureExtractionError):\n        preprocessor.get_input_features(mol)\n\n\ndef test_rsgcn_preprocessor(mol):\n    preprocessor = RSGCNPreprocessor(max_atoms=4, out_size=4)\n    ret_atom_array, ret_adj_array = preprocessor.get_input_features(mol)\n    expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32)\n    expect_adj_array = numpy.array(\n        [[0.5, 0.4082, 0, 0], [0.4082, 0.3333, 0.3333, 0],\n         [0, 0.3333, 0.3333, 0.4082], [0, 0, 0.4082, 0.5]],\n        dtype=numpy.float32)\n\n    numpy.testing.assert_array_equal(ret_atom_array, expect_atom_array)\n    numpy.testing.assert_allclose(\n        ret_adj_array, expect_adj_array, rtol=1e-03, atol=1e-03)\n\n\ndef test_rsgcn_preprocessor_default():\n    preprocessor = RSGCNPreprocessor()\n\n    dataset = SmilesParser(preprocessor).parse(\n        ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'])['dataset']\n\n    index = numpy.random.choice(len(dataset), None)\n    atoms, adjacency = dataset[index]\n\n    assert atoms.ndim == 1  # (atom, )\n    assert atoms.dtype == numpy.int32\n    assert adjacency.ndim == 2\n    assert adjacency.dtype == numpy.float32\n\n\ndef test_rsgcn_preprocessor_assert_raises():\n    with pytest.raises(ValueError):\n        RSGCNPreprocessor(max_atoms=3, out_size=2)  # NOQA\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_schnet_preprocessor.py",
    "content": "import numpy\nimport pytest\nfrom rdkit import Chem\n\nfrom chainer_chemistry.dataset.parsers import SmilesParser\nfrom chainer_chemistry.dataset.preprocessors.schnet_preprocessor import SchNetPreprocessor  # NOQA\n\n\n@pytest.fixture\ndef mol():\n    ret = Chem.MolFromSmiles('CN=C=O')\n    return ret\n\n\n@pytest.fixture\ndef pp():\n    return SchNetPreprocessor()\n\n\ndef test_schnet_preprocessor(mol, pp):\n    ret = pp.get_input_features(mol)\n    assert len(ret) == 2\n    actual_atom_array, actual_adj_array = ret\n\n    expect_atom_array = numpy.array([6, 7, 6, 8], dtype=numpy.int32)\n    numpy.testing.assert_array_equal(actual_atom_array, expect_atom_array)\n\n    # TODO(nakago): write test for adj matrix.\n    # print(actual_adj_array)\n    # expect_adj_array = numpy.array([[1, 1, 0, 0],\n    #                                 [1, 1, 1, 0],\n    #                                 [0, 1, 1, 1],\n    #                                 [0, 0, 1, 1]], dtype=numpy.float32)\n    # numpy.testing.assert_array_equal(actual_adj_array, expect_adj_array)\n\n\ndef test_schnet_preprocessor_default():\n    preprocessor = SchNetPreprocessor()\n\n    dataset = SmilesParser(preprocessor).parse(\n        ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1'])['dataset']\n\n    index = numpy.random.choice(len(dataset), None)\n    atoms, adjs = dataset[index]\n\n    assert atoms.ndim == 1  # (atom, )\n    assert atoms.dtype == numpy.int32\n    # (atom from, atom to)\n    assert adjs.ndim == 2\n    assert adjs.dtype == numpy.float32\n\n\ndef test_schnet_preprocessor_assert_raises():\n    with pytest.raises(ValueError):\n        pp = SchNetPreprocessor(max_atoms=3, out_size=2)  # NOQA\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_weavenet_preprocessor.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.dataset.parsers import SmilesParser\nfrom chainer_chemistry.dataset.preprocessors.weavenet_preprocessor import WeaveNetPreprocessor  # NOQA\n\n\n@pytest.mark.parametrize('max_atoms', [20, 30])\n@pytest.mark.parametrize('use_fixed_atom_feature', [True, False])\ndef test_weave_preprocessor(max_atoms, use_fixed_atom_feature):\n    preprocessor = WeaveNetPreprocessor(\n        max_atoms=max_atoms, use_fixed_atom_feature=use_fixed_atom_feature)\n    dataset = SmilesParser(preprocessor).parse(\n        ['C#N', 'Cc1cnc(C=O)n1C', 'c1ccccc1']\n    )[\"dataset\"]\n\n    index = numpy.random.choice(len(dataset), None)\n    atoms, adjs = dataset[index]\n    if use_fixed_atom_feature:\n        assert atoms.ndim == 2  # (atom, ch)\n        assert atoms.dtype == numpy.float32\n    else:\n        assert atoms.ndim == 1  # (atom, )\n        assert atoms.dtype == numpy.int32\n    # (atom from * atom to, ch)\n    assert adjs.ndim == 2\n    assert adjs.shape[0] == max_atoms * max_atoms\n    assert adjs.dtype == numpy.float32\n\n    # TODO(nakago): test feature extraction behavior...\n    atoms0, adjs0 = dataset[0]\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_wle.py",
    "content": "import numpy as np\r\nimport pytest\r\n\r\nfrom chainer_chemistry.dataset.preprocessors import wle as WLE  # NOQA\r\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\r\n\r\n\r\n@pytest.fixture\r\ndef small_datasets():\r\n    N_1 = 3\r\n    N_2 = 5\r\n\r\n    # one-hot atom labels: 1 tp N\r\n    atom_array_1 = np.arange(N_1)\r\n    atom_array_2 = np.arange(N_2)\r\n\r\n    # adj-array, manually\r\n    # all connectes. expanded labels is a permutaion of 0,1,2\r\n    adj_array_1 = np.array([[1, 1, 1],\r\n                            [1, 1, 1],\r\n                            [1, 1, 1]]).astype(np.int32)\r\n    # node 0 --> 0-1.2\r\n    # node 1 --> 1-0.2\r\n    # node 2 --> 2-0.1\r\n\r\n    adj_array_2 = np.array([[1, 1, 0, 0, 1],\r\n                            [1, 1, 0, 0, 1],\r\n                            [0, 0, 1, 1, 0],\r\n                            [0, 0, 1, 1, 0],\r\n                            [1, 1, 0, 0, 1]]).astype(np.float32)\r\n    # node 0 --> 0-1.4\r\n    # node 1 --> 1-0.4\r\n    # node 2 --> 2-3\r\n    # node 3 --> 3-2\r\n    # node 4 --> 4-0.1\r\n\r\n    # supervised labels, dummy\r\n    teach_signal_1 = np.array(1).astype(np.int)\r\n    teach_signal_2 = np.array(0).astype(np.int)\r\n\r\n    # concat in a one numpy array!\r\n    atom_arrays = np.array([atom_array_1, atom_array_2])\r\n    adj_arrays = np.array([adj_array_1, adj_array_2])\r\n    teach_signals = np.array([teach_signal_1, teach_signal_2])\r\n\r\n    # train/val/test dataset, respectively\r\n    datasets = [NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals),\r\n                NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals),\r\n                NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals)]\r\n    return datasets\r\n\r\n\r\ndef _get_elements(datasets, idx):\r\n    return [[mol[1] for mol in d] for d in datasets]\r\n\r\n\r\ndef _get_atom_arrays(datasets):\r\n    return _get_elements(datasets, 0)\r\n\r\n\r\ndef 
_get_adj_arrays(datasets):\r\n    return _get_elements(datasets, 1)\r\n\r\n\r\ndef _get_wle_arrays(datasets):\r\n    return _get_elements(datasets, 2)\r\n\r\n\r\ndef _get_teach_signals(datasets, is_cwle=False):\r\n    if is_cwle:\r\n        return _get_elements(datasets, 2)\r\n    else:\r\n        return _get_elements(datasets, 3)\r\n\r\n\r\ndef _check_np_array(actuals, expects):\r\n    assert len(actuals) == len(expects) == 3  # train/test/val\r\n    for actual_adjs, expect_adjs in zip(actuals, expects):\r\n        assert len(actual_adjs) == len(expect_adjs)\r\n        [np.testing.assert_array_equal(a, e)\r\n            for a, e in zip(actual_adjs, expect_adjs)]\r\n\r\n\r\ndef test_wle(small_datasets):\r\n    ret_value = WLE.apply_wle_for_datasets(small_datasets, 0)\r\n    actual_datasets, actual_labels, actual_frequency = ret_value\r\n\r\n    expected_frequency = {'0-1.2': 3,\r\n                          '1-0.2': 3,\r\n                          '2-0.1': 3,\r\n                          '0-1.4': 3,\r\n                          '1-0.4': 3,\r\n                          '2-3': 3,\r\n                          '3-2': 3,\r\n                          '4-0.1': 3}\r\n    assert expected_frequency == actual_frequency\r\n\r\n    expected_labels = set(expected_frequency.keys())\r\n    assert expected_labels == set(actual_labels)\r\n\r\n    actual_adj_arrays = _get_adj_arrays(actual_datasets)\r\n    expect_adj_arrays = _get_adj_arrays(small_datasets)\r\n    _check_np_array(actual_adj_arrays, expect_adj_arrays)\r\n\r\n    actual_signal_arrays = _get_teach_signals(actual_datasets)\r\n    expect_signal_arrays = _get_teach_signals(small_datasets)\r\n    _check_np_array(actual_signal_arrays, expect_signal_arrays)\r\n\r\n    # Check atom_arrays of train/val/test datasets are identical.\r\n    # 2 is the number of samples in each (train/val/test) dataset.\r\n    atom_arrays = _get_atom_arrays(actual_datasets)\r\n    first_mols = [d[0] for d in atom_arrays]\r\n    second_mols = [d[1] 
for d in atom_arrays]\r\n    for mols in (first_mols, second_mols):\r\n        assert len(mols) == 3\r\n        np.testing.assert_array_equal(mols[0], mols[1])\r\n        np.testing.assert_array_equal(mols[1], mols[2])\r\n\r\n\r\ndef test_2_hop_wle(small_datasets):\r\n    k = 2\r\n    ret_value = WLE.apply_wle_for_datasets(small_datasets, 0, k)\r\n    actual_datasets, actual_labels, actual_frequency = ret_value\r\n\r\n    expected_frequency = {'0-1.2': 3,\r\n                          '1-0.2': 3,\r\n                          '2-0.1': 3,\r\n                          '3-4.7': 3,\r\n                          '4-3.7': 3,\r\n                          '5-6': 3,\r\n                          '6-5': 3,\r\n                          '7-3.4': 3}\r\n    # Kenta Oono (oono@preferred.jp)\r\n    # The following assertion checks too strong condition.\r\n    # Specifically it assumes that the WLE algorithm assigns\r\n    # the extended atom labels appeared in the first iteration\r\n    # in a certain order and runs the second iteration.\r\n    # Strictly speaking, this is not required in the algorithm.\r\n    assert expected_frequency == actual_frequency\r\n\r\n    expected_labels = set(expected_frequency.keys())\r\n    assert expected_labels == set(actual_labels)\r\n\r\n    actual_adj_arrays = _get_adj_arrays(actual_datasets)\r\n    expect_adj_arrays = _get_adj_arrays(small_datasets)\r\n    _check_np_array(actual_adj_arrays, expect_adj_arrays)\r\n\r\n    actual_signal_arrays = _get_teach_signals(actual_datasets)\r\n    expect_signal_arrays = _get_teach_signals(small_datasets)\r\n    _check_np_array(actual_signal_arrays, expect_signal_arrays)\r\n\r\n    # Check atom_arrays of train/val/test datasets are identical.\r\n    # 2 is the number of samples in each (train/val/test) dataset.\r\n    atom_arrays = _get_atom_arrays(actual_datasets)\r\n    first_mols = [d[0] for d in atom_arrays]\r\n    second_mols = [d[1] for d in atom_arrays]\r\n    for mols in (first_mols, second_mols):\r\n    
    assert len(mols) == 3\r\n        np.testing.assert_array_equal(mols[0], mols[1])\r\n        np.testing.assert_array_equal(mols[1], mols[2])\r\n\r\n\r\ndef test_cwle(small_datasets):\r\n    ret_value = WLE.apply_cwle_for_datasets(small_datasets)\r\n    actual_datasets, actual_labels, actual_frequency = ret_value\r\n\r\n    expected_frequency = {'1.2': 3,\r\n                          '0.2': 3,\r\n                          '0.1': 6,\r\n                          '1.4': 3,\r\n                          '0.4': 3,\r\n                          '3': 3,\r\n                          '2': 3}\r\n    assert expected_frequency == actual_frequency\r\n\r\n    expected_labels = set(expected_frequency.keys())\r\n    assert expected_labels == set(actual_labels)\r\n\r\n    actual_adj_arrays = _get_adj_arrays(actual_datasets)\r\n    expect_adj_arrays = _get_adj_arrays(small_datasets)\r\n    _check_np_array(actual_adj_arrays, expect_adj_arrays)\r\n\r\n    actual_signal_arrays = _get_teach_signals(actual_datasets, True)\r\n    expect_signal_arrays = _get_teach_signals(small_datasets)\r\n    _check_np_array(actual_signal_arrays, expect_signal_arrays)\r\n\r\n    # Check atom_arrays of train/val/test datasets are identical.\r\n    atom_arrays = _get_atom_arrays(actual_datasets)\r\n    first_mols = [d[0] for d in atom_arrays]\r\n    second_mols = [d[1] for d in atom_arrays]\r\n    for mols in (first_mols, second_mols):\r\n        assert len(mols) == 3\r\n        np.testing.assert_array_equal(mols[0], mols[1])\r\n        np.testing.assert_array_equal(mols[1], mols[2])\r\n\r\n    # Check wle_arrays of train/val/test datasets are identical.\r\n    wle_arrays = _get_wle_arrays(actual_datasets)\r\n    first_mols = [d[0] for d in wle_arrays]\r\n    second_mols = [d[1] for d in wle_arrays]\r\n    for mols in [first_mols, second_mols]:\r\n        assert len(mols) == 3\r\n        np.testing.assert_array_equal(mols[0], mols[1])\r\n        np.testing.assert_array_equal(mols[1], 
mols[2])\r\n\r\n\r\ndef test_findmaxidx_atom_label(small_datasets):\r\n    actual = WLE.findmaxidx(small_datasets, 'atom_label')\r\n    expect = 5\r\n    assert actual == expect\r\n\r\n\r\n@pytest.fixture\r\ndef cwle_datasets():\r\n    B = 10\r\n    D_atom = 5\r\n    D_wle = 50\r\n    K_large = 10000\r\n\r\n    atom_arrays = [np.full((B, D_atom), K_large) for _ in range(3)]\r\n    adj_arrays = [np.eye(B, dtype=np.int32) for _ in range(3)]\r\n    wle_arrays = [np.arange(B * D_wle, dtype=np.int32).reshape(B, -1)\r\n                  for _ in range(3)]\r\n    signal_arrays = [np.full(B, K_large) for _ in range(3)]\r\n\r\n    print(wle_arrays[0].shape)\r\n\r\n    datasets = [NumpyTupleDataset(atom_arrays[i],\r\n                                  adj_arrays[i],\r\n                                  wle_arrays[i],\r\n                                  signal_arrays[i])\r\n                for i in range(3)]\r\n    return datasets\r\n\r\n\r\ndef test_findmaxidx_wle(cwle_datasets):\r\n    actual = WLE.findmaxidx(cwle_datasets, 'wle_label')\r\n    expect = 10 * 50\r\n    assert actual == expect\r\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_wle_atom_array_update.py",
    "content": "import itertools\n\nimport numpy as np\nimport pytest\n\nfrom chainer_chemistry.dataset.preprocessors import wle_atom_array_update as wle_update\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\n\n\n@pytest.fixture\ndef k3_datasets():\n    train_atoms = np.array([np.zeros(3, dtype=np.int32)])\n    val_atoms = np.array([np.ones(3, dtype=np.int32)])\n    test_atoms = np.array([np.full(3, 2, dtype=np.int32)])\n\n    train_adjs = np.array([np.ones((3, 3), dtype=np.int32)])\n    val_adjs = np.array([np.ones((3, 3), dtype=np.int32)])\n    test_adjs = np.array([np.ones((3, 3), dtype=np.int32)])\n    return ((train_atoms, val_atoms, test_atoms),\n            (train_adjs, val_adjs, test_adjs))\n\n\ndef _is_all_same(arr):\n    arr = np.array(arr)\n    assert arr.size > 0\n    return np.all(arr == arr.item(0))\n\n\ndef _is_all_different(arr):\n    for x, y in itertools.combinations(arr, 2):\n        if x == y:\n            return False\n    return True\n\n\n@pytest.mark.parametrize('cutoff', (0, 1, 2, 3, 4))\ndef test_update_atom_array(k3_datasets, cutoff):\n    atom_arrays, adj_arrays = k3_datasets\n    actual_atom_arrays, actual_label_frequency = wle_update.update_atom_arrays(\n        atom_arrays, adj_arrays, cutoff)\n\n    mols = [d[0] for d in actual_atom_arrays]\n    for m in mols:\n        assert _is_all_same(m)\n\n    # train/val/test atoms must have different labels.\n    assert _is_all_different((mols[0][0], mols[1][0], mols[2][0]))\n\n    if cutoff >= 3:\n        expect_label_frequency = {'0': 3, '1': 3, '2': 3}\n    else:\n        expect_label_frequency = {'0-0.0': 3, '1-1.1': 3, '2-2.2': 3}\n    assert actual_label_frequency == expect_label_frequency\n\n\n@pytest.fixture\ndef single_atom_datasets():\n    train_atoms = np.array([[0], [1], [2]], dtype=np.int32)\n    val_atoms = np.array([[1], [1], [5]], dtype=np.int32)\n    test_atoms = np.array([[4], [4], [2]], dtype=np.int32)\n\n    train_adjs = np.array([[[1]], [[1]], 
[[1]]], dtype=np.int32)\n    val_adjs = np.array([[[1]], [[1]], [[1]]], dtype=np.int32)\n    test_adjs = np.array([[[1]], [[1]], [[1]]], dtype=np.int32)   \n    return ((train_atoms, val_atoms, test_atoms),\n            (train_adjs, val_adjs, test_adjs))\n\n\n@pytest.mark.parametrize('cutoff', (0, 1, 2))\ndef test_update_atom_array_2(single_atom_datasets, cutoff):\n    atom_arrays, adj_arrays = single_atom_datasets\n    actual_atom_arrays, actual_label_frequency = wle_update.update_atom_arrays(\n        atom_arrays, adj_arrays, cutoff)\n\n    # Note that labels after expansion need not\n    # be the same as the original atom labels.\n    # For example, assigning ids according to\n    # appearance order\n    # 0 -> 0, 1 -> 1, 2 -> 2, 5 -> 3, 4 -> 4,\n    # results in \n    # Atom arrays\n    #   train: [[0], [1], [2]]\n    #   val:   [[1], [1], [3]]\n    #   test:  [[4], [4], [2]]\n    # Label Frequency\n    #    {'0': 1, '1': 3, '2': 2, '3': 1, '4': 2}\n    # This is acceptable.\n\n    train, val, test =  actual_atom_arrays\n    assert _is_all_same((train[1], val[0], val[1]))\n    assert _is_all_same((train[2], test[2]))\n    assert _is_all_same((test[0], test[1]))\n    assert _is_all_different((train[0], train[1], train[2], val[2], test[0]))\n\n    expect_label_frequency = {'0-': 1, '1-': 3, '2-': 2, '4-': 2, '5-': 1}\n    # Equal as a multiset.\n    assert (sorted(actual_label_frequency.values())\n             == sorted(expect_label_frequency.values()))\n\n\n@pytest.fixture\ndef different_sample_size_datasets():\n    train_atoms = np.array([[0]], dtype=np.int32)\n    val_atoms = np.array([[0], [0]], dtype=np.int32)\n    test_atoms = np.array([[0], [0], [0]], dtype=np.int32)\n\n    train_adjs = np.array([[[1]]], dtype=np.int32)\n    val_adjs = np.array([[[1]], [[1]]], dtype=np.int32)\n    test_adjs = np.array([[[1]], [[1]], [[1]]], dtype=np.int32)\n    return ((train_atoms, val_atoms, test_atoms),\n            (train_adjs, val_adjs, test_adjs))\n\n\ndef 
test_update_atom_array_with_diffent_sample_sizes(\n    different_sample_size_datasets):\n    atom_arrays, adj_arrays = different_sample_size_datasets\n    actual_atom_arrays, actual_label_frequency = wle_update.update_atom_arrays(\n        atom_arrays, adj_arrays, 0)\n\n    all_atoms = sum([list(a.ravel()) for a in actual_atom_arrays], [])\n    assert _is_all_same(all_atoms)\n\n    expect_label_frequency = {'0-': 6}\n    assert actual_label_frequency == expect_label_frequency\n\n\n@pytest.fixture\ndef different_graph_size_datasets():\n    train_atoms = np.array([[0]], dtype=np.int32)\n    val_atoms = np.array([[0, 0]], dtype=np.int32)\n    test_atoms = np.array([[0, 0, 0]], dtype=np.int32)\n\n    train_adjs = np.array([[[1]]], dtype=np.int32)\n    val_adjs = np.array([[[1, 1],\n                          [1, 1]]], dtype=np.int32)\n    test_adjs = np.array([[[1, 1, 1],\n                           [1, 1, 1],\n                           [1, 1, 1]]], dtype=np.int32)\n    return ((train_atoms, val_atoms, test_atoms),\n            (train_adjs, val_adjs, test_adjs))\n\n\ndef test_update_atom_array_with_different_graph_size(\n    different_graph_size_datasets):\n    atom_arrays, adj_arrays = different_graph_size_datasets\n    actual_atom_arrays, actual_label_frequency = wle_update.update_atom_arrays(\n        atom_arrays, adj_arrays, 0)\n\n    mols = [d[0] for d in actual_atom_arrays]\n    for m in mols:\n        assert _is_all_same(m)\n\n    expect_label_frequency = {'0-': 1, '0-0': 2, '0-0.0': 3}\n    assert actual_label_frequency == expect_label_frequency\n\n\n@pytest.fixture\ndef line_graph_datasets():\n    train_atoms = np.zeros(5, dtype=np.int32).reshape(1, -1)\n    val_atoms = np.array([[1]], dtype=np.int32)\n    test_atoms = np.array([[1]], dtype=np.int32)\n\n    train_adjs = np.array([[[1, 1, 0, 0, 0],\n                            [1, 1, 1, 0, 0],\n                            [0, 1, 1, 1, 0],\n                            [0, 0, 1, 1, 1],\n                           
 [0, 0, 0, 1, 1]]],\n                            dtype=np.int32)\n    val_adjs = np.array([[[1]]], dtype=np.int32)\n    test_adjs = np.array([[[1]]], dtype=np.int32)\n    return ((train_atoms, val_atoms, test_atoms),\n            (train_adjs, val_adjs, test_adjs))\n\n\ndef test_update_atom_array_twice(line_graph_datasets):\n    atom_arrays, adj_arrays = line_graph_datasets\n\n    for _ in range(2):\n        atom_arrays, actual_label_frequency = wle_update.update_atom_arrays(\n            atom_arrays, adj_arrays, 0)\n\n    expect_label_frequency = {'0-1': 2,\n                              '1-0.1': 2,\n                              '1-1.1': 1,\n                              '2-': 2}    # atoms in test and val datasets\n    assert actual_label_frequency == expect_label_frequency\n\n\n@pytest.fixture\ndef small_datasets():\n    N_1 = 3\n    N_2 = 5\n\n    # one-hot atom labels: 1 to N\n    atom_array_1 = np.arange(N_1)\n    atom_array_2 = np.arange(N_2)\n\n    # adj-array, manually\n    # all connected. 
expanded labels are a permutation of 0,1,2\n    adj_array_1 = np.ones((3, 3), dtype=np.int32)\n    # node 0 --> 0-1.2\n    # node 1 --> 1-0.2\n    # node 2 --> 2-0.1\n\n    adj_array_2 = np.array([[1, 1, 0, 0, 1],\n                            [1, 1, 0, 0, 1],\n                            [0, 0, 1, 1, 0],\n                            [0, 0, 1, 1, 0],\n                            [1, 1, 0, 0, 1]]).astype(np.float32)\n    # node 0 --> 0-1.4\n    # node 1 --> 1-0.4\n    # node 2 --> 2-3\n    # node 3 --> 3-2\n    # node 4 --> 4-0.1\n\n    # supervised labels, dummy\n    teach_signal_1 = np.array(1).astype(np.int)\n    teach_signal_2 = np.array(0).astype(np.int)\n\n    # concat in a one numpy array!\n    atom_arrays = np.array([atom_array_1, atom_array_2])\n    adj_arrays = np.array([adj_array_1, adj_array_2])\n    teach_signals = np.array([teach_signal_1, teach_signal_2])\n\n    # train/val/test dataset, respectively\n    datasets = [NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals),\n                NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals),\n                NumpyTupleDataset(atom_arrays, adj_arrays, teach_signals)]\n    return datasets\n\n\ndef test_list_all_expanded_labels_with_focus_atom(small_datasets):\n    atom_arrays = [[mol[0] for mol in d] for d in small_datasets]\n    adj_arrays = [[mol[1] for mol in d] for d in small_datasets]\n\n    actual_atom_lists, actual_frequencies = wle_update.list_all_expanded_labels(\n        atom_arrays, adj_arrays, True)\n\n    expected_frequency = {'0-1.2': 3,\n                          '1-0.2': 3,\n                          '2-0.1': 3,\n                          '0-1.4': 3,\n                          '1-0.4': 3,\n                          '2-3': 3,\n                          '3-2': 3,\n                          '4-0.1': 3}\n    assert expected_frequency == actual_frequencies\n\n    expect_atom_list = [\n        set(['0-1.2', '1-0.2', '2-0.1']),\n        set(['0-1.4', '1-0.4', '2-3', '3-2', '4-0.1'])]\n    
for actual_atom_list in actual_atom_lists:\n        for a, e in zip(actual_atom_list, expect_atom_list):\n            assert set(a) == e\n\n\ndef test_list_all_expanded_labels_without_focus_atom(small_datasets):\n    atom_arrays = [[mol[0] for mol in d] for d in small_datasets]\n    adj_arrays = [[mol[1] for mol in d] for d in small_datasets]\n    actual_atom_lists, actual_frequencies = wle_update.list_all_expanded_labels(\n        atom_arrays, adj_arrays, False)\n\n    expected_frequency = {'1.2': 3,\n                          '0.2': 3,\n                          '0.1': 6,\n                          '1.4': 3,\n                          '0.4': 3,\n                          '3': 3,\n                          '2': 3}\n    assert expected_frequency == actual_frequencies\n\n    expect_atom_list = [\n        set(['1.2', '0.2', '0.1']),\n        set(['1.4', '0.4', '3', '2', '0.1'])]\n    for actual_atom_list in actual_atom_lists:\n        for a, e in zip(actual_atom_list, expect_atom_list):\n            assert set(a) == e\n"
  },
  {
    "path": "tests/dataset_tests/preprocessors_tests/test_wle_util.py",
    "content": "import numpy as np\nimport pytest\n\nfrom chainer_chemistry.dataset.preprocessors import wle_util\n\n\ndef test_to_index():\n    values = ['foo', 'bar', 'buz', 'non-exist']\n    mols = [['foo', 'bar', 'buz'], ['foo', 'foo'], ['buz', 'bar']]\n\n    actual = wle_util.to_index(mols, values)\n    expect = np.array([np.array([0, 1, 2], np.int32),\n                       np.array([0, 0], np.int32),\n                       np.array([2, 1], np.int32)])\n    assert len(actual) == len(expect)\n    for a, e in zip(actual, expect):\n        np.testing.assert_array_equal(a, e)\n\n\ndef test_to_index_non_existence():\n    values = ['foo', 'bar']\n    mols = [['strange_label']]\n\n    with pytest.raises(ValueError):\n        wle_util.to_index(mols, values)\n\n\ndef test_compress_relation_axis_2_dim():\n    arr = np.random.uniform(size=(10, 2))\n    actual = wle_util.compress_relation_axis(arr)\n    np.testing.assert_array_equal(actual, arr)\n\n\ndef test_compress_relation_axis_3_dim():\n    arr = np.array(\n        [\n            [\n                [1, 0],\n                [2, 0],\n            ],\n            [\n                [1, 1],\n                [0, 0]\n            ]\n        ]\n    )\n    arr = np.swapaxes(arr, 0, 1)\n    ret = wle_util.compress_relation_axis(arr)\n    actual = ret != 0\n    expect = np.array(\n        [[True, True],\n        [True, False]]\n    )\n    np.testing.assert_array_equal(actual, expect)\n\n\ndef test_compress_relation_axis_invalid_ndim():\n    arr = np.zeros(3)\n    with pytest.raises(ValueError):\n        wle_util.compress_relation_axis(arr)\n\n    arr = np.zeros((1, 2, 3, 4))\n    with pytest.raises(ValueError):\n        wle_util.compress_relation_axis(arr)\n\n\n@pytest.fixture\ndef small_molecule():\n    # a-b-c d\n    atom_array = ['a', 'b', 'c', 'd']\n    neighbors = np.array(\n        [\n            [0, 1, 1, 2],  # first end of edges\n            [1, 0, 2, 1]   # second end of edges\n        ]\n    )\n    return 
atom_array, neighbors\n\n\ndef test_get_neighbor_representation_with_focus_atom(small_molecule):\n    atom_array, neighbors = small_molecule\n    expects = ['a-b', 'b-a.c', 'c-b', 'd-']\n    for i in range(len(expects)):\n        actual = wle_util.get_neighbor_representation(\n            i, atom_array, neighbors, True)\n        assert actual == expects[i]\n\n\ndef test_get_neighbor_representation_without_focus_atom(small_molecule):\n    atom_array, neighbors = small_molecule\n    expects = ['b', 'a.c', 'b', '']\n    for i in range(len(expects)):\n        actual = wle_util.get_neighbor_representation(\n            i, atom_array, neighbors, False)\n        assert actual == expects[i]\n\n\n@pytest.mark.parametrize('label, expect', [\n    ('a-b', 'a'),\n    ('a-b.c', 'a'),\n    ('aa-b', 'aa'),\n    ('a-', 'a'),\n    ('aa-', 'aa'),\n])\ndef test_get_focus_node_label(label, expect):\n    actual = wle_util.get_focus_node_label(label)\n    assert actual == expect\n\n\n@pytest.mark.parametrize('label', ['aa', 'a-a-a', 'a--'])\ndef test_get_focus_node_label_invalid(label):\n    with pytest.raises(ValueError):\n        wle_util.get_focus_node_label(label)\n"
  },
  {
    "path": "tests/dataset_tests/splitters_tests/test_deepchem_scaffold_splitter.py",
    "content": "import numpy\nimport pandas\nimport pytest\n\nfrom chainer_chemistry.dataset.parsers.data_frame_parser import DataFrameParser  # NOQA\nfrom chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor\nfrom chainer_chemistry.dataset.splitters.deepchem_scaffold_splitter import generate_scaffold  # NOQA\nfrom chainer_chemistry.dataset.splitters.deepchem_scaffold_splitter import DeepChemScaffoldSplitter  # NOQA\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\n\n\n@pytest.fixture\ndef smiles_list():\n    smileses = [\n        \"CC1=CC2CC(CC1)O2\",\n        \"O=Cc1nccn1C=O\",\n        \"CCC(C)(C)C(O)C=O\",\n        \"C#CCC(C)(CO)OC\",\n        \"Nc1coc(=O)nc1N\",\n        \"CC12C=CC(CCC1)C2\",\n        \"CC12CCC1C2OC=O\",\n        \"CC1C2CC3(COC3)N12\",\n        \"O=C1NC=NC12CC2\",\n        \"C1=CC2CN2CC2NC12\",\n    ]\n    return smileses\n\n\n@pytest.fixture\ndef dataset(smiles_list):\n    df = pandas.DataFrame(data={'smiles': smiles_list,\n                                'value': numpy.random.rand(10)})\n    pp = AtomicNumberPreprocessor()\n    parser = DataFrameParser(pp, labels='value')\n    dataset = parser.parse(df, return_smiles=True)\n    return dataset\n\n\ndef test_generate_scaffold():\n    smiles = \"Nc1coc(=O)nc1N\"\n    actual = generate_scaffold(smiles)\n    expect = 'O=c1nccco1'\n    assert actual == expect\n\n\ndef test_split(dataset):\n    splitter = DeepChemScaffoldSplitter()\n    train_ind, valid_ind, test_ind = splitter._split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'])\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 8\n    assert valid_ind.shape[0] == 1\n    assert test_ind.shape[0] == 1\n\n    train_ind, valid_ind, test_ind = splitter._split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'],\n        frac_train=0.5, frac_valid=0.3, frac_test=0.2)\n    assert type(train_ind) == numpy.ndarray\n    assert 
train_ind.shape[0] == 5\n    assert valid_ind.shape[0] == 3\n    assert test_ind.shape[0] == 2\n\n\ndef test_split_fix_seed(dataset):\n    splitter = DeepChemScaffoldSplitter()\n    train_ind1, valid_ind1, test_ind1 = splitter._split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'], seed=44)\n    train_ind2, valid_ind2, test_ind2 = splitter._split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'], seed=44)\n\n    assert numpy.array_equal(train_ind1, train_ind2)\n    assert numpy.array_equal(valid_ind1, valid_ind2)\n    assert numpy.array_equal(test_ind1, test_ind2)\n\n\ndef test_split_fail(dataset):\n    splitter = DeepChemScaffoldSplitter()\n    with pytest.raises(AssertionError):\n        train_ind, valid_ind, test_ind = splitter._split(\n            dataset=dataset['dataset'], smiles_list=dataset['smiles'],\n            frac_train=0.4, frac_valid=0.3, frac_test=0.2)\n\n\ndef test_train_valid_test_split(dataset):\n    splitter = DeepChemScaffoldSplitter()\n    train_ind, valid_ind, test_ind = splitter.train_valid_test_split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'])\n\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 8\n    assert valid_ind.shape[0] == 1\n    assert test_ind.shape[0] == 1\n\n\ndef test_train_valid_test_split_return_dataset(dataset):\n    splitter = DeepChemScaffoldSplitter()\n    train, valid, test = splitter.train_valid_test_split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'],\n        return_index=False)\n\n    assert type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert type(test) == NumpyTupleDataset\n    assert len(train) == 8\n    assert len(valid) == 1\n    assert len(test) == 1\n\n\ndef test_train_valid_split(dataset):\n    splitter = DeepChemScaffoldSplitter()\n    train_ind, valid_ind = splitter.train_valid_split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'])\n\n    
assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 9\n    assert valid_ind.shape[0] == 1\n\n\ndef test_train_valid_split_return_dataset(dataset):\n    splitter = DeepChemScaffoldSplitter()\n    train, valid = splitter.train_valid_split(dataset=dataset['dataset'],\n                                              smiles_list=dataset['smiles'],\n                                              return_index=False)\n\n    assert type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert len(train) == 9\n    assert len(valid) == 1\n"
  },
  {
    "path": "tests/dataset_tests/splitters_tests/test_random_splitter.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.dataset.splitters.random_splitter import RandomSplitter\nfrom chainer_chemistry.datasets import NumpyTupleDataset\n\n\n@pytest.fixture\ndef dataset():\n    a = numpy.random.random((10, 10))\n    b = numpy.random.random((10, 8))\n    c = numpy.random.random((10, 1))\n    return NumpyTupleDataset(a, b, c)\n\n\n@pytest.fixture\ndef ndarray_dataset():\n    a = numpy.random.random((10, 10))\n    return a\n\n\ndef test_split(dataset):\n    splitter = RandomSplitter()\n    train_ind, valid_ind, test_ind = splitter._split(dataset)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 8\n    assert valid_ind.shape[0] == 1\n    assert test_ind.shape[0] == 1\n\n    train_ind, valid_ind, test_ind = splitter._split(dataset,\n                                                     frac_train=0.5,\n                                                     frac_valid=0.3,\n                                                     frac_test=0.2)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 5\n    assert valid_ind.shape[0] == 3\n    assert test_ind.shape[0] == 2\n\n\ndef test_split_fix_seed(dataset):\n    splitter = RandomSplitter()\n    train_ind1, valid_ind1, test_ind1 = splitter._split(dataset, seed=44)\n    train_ind2, valid_ind2, test_ind2 = splitter._split(dataset, seed=44)\n\n    assert numpy.array_equal(train_ind1, train_ind2)\n    assert numpy.array_equal(valid_ind1, valid_ind2)\n    assert numpy.array_equal(test_ind1, test_ind2)\n\n\ndef test_split_fail(dataset):\n    splitter = RandomSplitter()\n    with pytest.raises(AssertionError):\n        train_ind, valid_ind, test_ind = splitter._split(dataset,\n                                                         frac_train=0.4,\n                                                         frac_valid=0.3,\n                                                         frac_test=0.2)\n\n\ndef 
test_train_valid_test_split(dataset):\n    splitter = RandomSplitter()\n    train_ind, valid_ind, test_ind = splitter.train_valid_test_split(dataset)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 8\n    assert valid_ind.shape[0] == 1\n    assert test_ind.shape[0] == 1\n\n\ndef test_train_valid_test_split_return_dataset(dataset):\n    splitter = RandomSplitter()\n    train, valid, test = splitter.train_valid_test_split(dataset,\n                                                         return_index=False)\n    assert type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert type(test) == NumpyTupleDataset\n    assert len(train) == 8\n    assert len(valid) == 1\n    assert len(test) == 1\n\n\ndef test_train_valid_test_split_ndarray_return_dataset(ndarray_dataset):\n    splitter = RandomSplitter()\n    train, valid, test = splitter.train_valid_test_split(ndarray_dataset,\n                                                         return_index=False)\n    assert type(train) == numpy.ndarray\n    assert type(valid) == numpy.ndarray\n    assert type(test) == numpy.ndarray\n    assert len(train) == 8\n    assert len(valid) == 1\n    assert len(test) == 1\n\n\ndef test_train_valid_split(dataset):\n    splitter = RandomSplitter()\n    train_ind, valid_ind = splitter.train_valid_split(dataset)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 9\n    assert valid_ind.shape[0] == 1\n\n\ndef test_train_valid_split_return_dataset(dataset):\n    splitter = RandomSplitter()\n    train, valid = splitter.train_valid_split(dataset, return_index=False)\n    assert type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert len(train) == 9\n    assert len(valid) == 1\n"
  },
  {
    "path": "tests/dataset_tests/splitters_tests/test_scaffold_splitter.py",
    "content": "import numpy\nimport pandas\nimport pytest\n\nfrom chainer_chemistry.dataset.parsers.data_frame_parser import DataFrameParser  # NOQA\nfrom chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor\nfrom chainer_chemistry.dataset.splitters.scaffold_splitter import generate_scaffold  # NOQA\nfrom chainer_chemistry.dataset.splitters.scaffold_splitter import ScaffoldSplitter  # NOQA\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\n\n\n@pytest.fixture\ndef smiles_list():\n    smileses = [\n        \"CC1=CC2CC(CC1)O2\",\n        \"O=Cc1nccn1C=O\",\n        \"CCC(C)(C)C(O)C=O\",\n        \"C#CCC(C)(CO)OC\",\n        \"Nc1coc(=O)nc1N\",\n        \"CC12C=CC(CCC1)C2\",\n        \"CC12CCC1C2OC=O\",\n        \"CC1C2CC3(COC3)N12\",\n        \"O=C1NC=NC12CC2\",\n        \"C1=CC2CN2CC2NC12\",\n    ]\n    return smileses\n\n\n@pytest.fixture\ndef dataset(smiles_list):\n    df = pandas.DataFrame(data={'smiles': smiles_list,\n                                'value': numpy.random.rand(10)})\n    pp = AtomicNumberPreprocessor()\n    parser = DataFrameParser(pp, labels='value')\n    dataset = parser.parse(df, return_smiles=True)\n    return dataset\n\n\ndef test_generate_scaffold():\n    smiles = \"Nc1coc(=O)nc1N\"\n    actual = generate_scaffold(smiles)\n    expect = 'O=c1nccco1'\n    assert actual == expect\n\n\ndef test_split(dataset):\n    splitter = ScaffoldSplitter()\n    train_ind, valid_ind, test_ind = splitter._split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'])\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 8\n    assert valid_ind.shape[0] == 1\n    assert test_ind.shape[0] == 1\n\n    train_ind, valid_ind, test_ind = splitter._split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'],\n        frac_train=0.5, frac_valid=0.3, frac_test=0.2)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 5\n    assert 
valid_ind.shape[0] == 3\n    assert test_ind.shape[0] == 2\n\n\ndef test_split_fix_seed(dataset):\n    splitter = ScaffoldSplitter()\n    train_ind1, valid_ind1, test_ind1 = splitter._split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'], seed=44)\n    train_ind2, valid_ind2, test_ind2 = splitter._split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'], seed=44)\n\n    assert numpy.array_equal(train_ind1, train_ind2)\n    assert numpy.array_equal(valid_ind1, valid_ind2)\n    assert numpy.array_equal(test_ind1, test_ind2)\n\n\ndef test_split_fail(dataset):\n    splitter = ScaffoldSplitter()\n    with pytest.raises(AssertionError):\n        train_ind, valid_ind, test_ind = splitter._split(\n            dataset=dataset['dataset'], smiles_list=dataset['smiles'],\n            frac_train=0.4, frac_valid=0.3, frac_test=0.2)\n\n\ndef test_train_valid_test_split(dataset):\n    splitter = ScaffoldSplitter()\n    train_ind, valid_ind, test_ind = splitter.train_valid_test_split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'])\n\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 8\n    assert valid_ind.shape[0] == 1\n    assert test_ind.shape[0] == 1\n\n\ndef test_train_valid_test_split_return_dataset(dataset):\n    splitter = ScaffoldSplitter()\n    train, valid, test = splitter.train_valid_test_split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'],\n        return_index=False)\n\n    assert type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert type(test) == NumpyTupleDataset\n    assert len(train) == 8\n    assert len(valid) == 1\n    assert len(test) == 1\n\n\ndef test_train_valid_split(dataset):\n    splitter = ScaffoldSplitter()\n    train_ind, valid_ind = splitter.train_valid_split(\n        dataset=dataset['dataset'], smiles_list=dataset['smiles'])\n\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 9\n 
   assert valid_ind.shape[0] == 1\n\n\ndef test_train_valid_split_return_dataset(dataset):\n    splitter = ScaffoldSplitter()\n    train, valid = splitter.train_valid_split(dataset=dataset['dataset'],\n                                              smiles_list=dataset['smiles'],\n                                              return_index=False)\n\n    assert type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert len(train) == 9\n    assert len(valid) == 1\n"
  },
  {
    "path": "tests/dataset_tests/splitters_tests/test_stratified_splitter.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.dataset.splitters.stratified_splitter import StratifiedSplitter  # NOQA\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\n\n\n@pytest.fixture\ndef cls_dataset():\n    a = numpy.random.random((30, 10))\n    b = numpy.random.random((30, 8))\n    c = numpy.concatenate([numpy.zeros(20), numpy.ones(10)]).astype(numpy.int)\n    return NumpyTupleDataset(a, b, c)\n\n\n@pytest.fixture\ndef cls_label():\n    c = numpy.concatenate([numpy.zeros(20), numpy.ones(10)]).astype(numpy.int)\n    return c\n\n\n@pytest.fixture\ndef cls_ndarray_dataset():\n    a = numpy.concatenate([numpy.zeros(20), numpy.ones(10)]).astype(numpy.int)\n    b = numpy.concatenate([numpy.zeros(20), numpy.ones(10)]).astype(numpy.int)\n    return a, b\n\n\n@pytest.fixture\ndef reg_dataset():\n    a = numpy.random.random((100, 10))\n    b = numpy.random.random((100, 8))\n    c = numpy.arange(100).astype(numpy.float)\n    return NumpyTupleDataset(a, b, c)\n\n\ndef test_classification_split(cls_dataset):\n    splitter = StratifiedSplitter()\n    train_ind, valid_ind, test_ind = splitter._split(cls_dataset)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 24\n    assert valid_ind.shape[0] == 3\n    assert test_ind.shape[0] == 3\n\n    train = NumpyTupleDataset(*cls_dataset.features[train_ind])\n    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])\n    test = NumpyTupleDataset(*cls_dataset.features[test_ind])\n    assert (train.features[:, -1] == 1).sum() == 8\n    assert (valid.features[:, -1] == 1).sum() == 1\n    assert (test.features[:, -1] == 1).sum() == 1\n\n    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,\n                                                     frac_train=0.5,\n                                                     frac_valid=0.3,\n                                                     frac_test=0.2)\n    assert type(train_ind) == 
numpy.ndarray\n    assert train_ind.shape[0] == 15\n    assert valid_ind.shape[0] == 9\n    assert test_ind.shape[0] == 6\n\n    train = NumpyTupleDataset(*cls_dataset.features[train_ind])\n    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])\n    test = NumpyTupleDataset(*cls_dataset.features[test_ind])\n    assert (train.features[:, -1] == 1).sum() == 5\n    assert (valid.features[:, -1] == 1).sum() == 3\n    assert (test.features[:, -1] == 1).sum() == 2\n\n\ndef test_classification_split_by_labels_ndarray(cls_dataset, cls_label):\n    splitter = StratifiedSplitter()\n    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,\n                                                     labels=cls_label)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 24\n    assert valid_ind.shape[0] == 3\n    assert test_ind.shape[0] == 3\n\n    train = NumpyTupleDataset(*cls_dataset.features[train_ind])\n    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])\n    test = NumpyTupleDataset(*cls_dataset.features[test_ind])\n    assert (train.features[:, -1] == 1).sum() == 8\n    assert (valid.features[:, -1] == 1).sum() == 1\n    assert (test.features[:, -1] == 1).sum() == 1\n\n    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,\n                                                     labels=cls_label,\n                                                     frac_train=0.5,\n                                                     frac_valid=0.3,\n                                                     frac_test=0.2)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 15\n    assert valid_ind.shape[0] == 9\n    assert test_ind.shape[0] == 6\n\n    train = NumpyTupleDataset(*cls_dataset.features[train_ind])\n    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])\n    test = NumpyTupleDataset(*cls_dataset.features[test_ind])\n    assert (train.features[:, -1] == 1).sum() == 5\n    assert 
(valid.features[:, -1] == 1).sum() == 3\n    assert (test.features[:, -1] == 1).sum() == 2\n\n\ndef test_classification_split_by_labels_list(cls_dataset, cls_label):\n    cls_label = cls_label.tolist()\n    splitter = StratifiedSplitter()\n    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,\n                                                     labels=cls_label)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 24\n    assert valid_ind.shape[0] == 3\n    assert test_ind.shape[0] == 3\n\n    train = NumpyTupleDataset(*cls_dataset.features[train_ind])\n    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])\n    test = NumpyTupleDataset(*cls_dataset.features[test_ind])\n    assert (train.features[:, -1] == 1).sum() == 8\n    assert (valid.features[:, -1] == 1).sum() == 1\n    assert (test.features[:, -1] == 1).sum() == 1\n\n    train_ind, valid_ind, test_ind = splitter._split(cls_dataset,\n                                                     labels=cls_label,\n                                                     frac_train=0.5,\n                                                     frac_valid=0.3,\n                                                     frac_test=0.2)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 15\n    assert valid_ind.shape[0] == 9\n    assert test_ind.shape[0] == 6\n\n    train = NumpyTupleDataset(*cls_dataset.features[train_ind])\n    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])\n    test = NumpyTupleDataset(*cls_dataset.features[test_ind])\n    assert (train.features[:, -1] == 1).sum() == 5\n    assert (valid.features[:, -1] == 1).sum() == 3\n    assert (test.features[:, -1] == 1).sum() == 2\n\n\ndef test_regression_split(reg_dataset):\n    splitter = StratifiedSplitter()\n    train_ind, valid_ind, test_ind = splitter._split(reg_dataset)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 80\n    assert valid_ind.shape[0] == 
10\n    assert test_ind.shape[0] == 10\n\n    train = NumpyTupleDataset(*reg_dataset.features[train_ind])\n    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])\n    test = NumpyTupleDataset(*reg_dataset.features[test_ind])\n    assert 45.0 < train.features[:, -1].mean() < 55.0\n    assert 45.0 < valid.features[:, -1].mean() < 55.0\n    assert 45.0 < test.features[:, -1].mean() < 55.0\n\n    train_ind, valid_ind, test_ind = splitter._split(reg_dataset,\n                                                     frac_train=0.5,\n                                                     frac_valid=0.3,\n                                                     frac_test=0.2)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 50\n    assert valid_ind.shape[0] == 30\n    assert test_ind.shape[0] == 20\n\n    train = NumpyTupleDataset(*reg_dataset.features[train_ind])\n    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])\n    test = NumpyTupleDataset(*reg_dataset.features[test_ind])\n    assert 45.0 < train.features[:, -1].mean() < 55.0\n    assert 45.0 < valid.features[:, -1].mean() < 55.0\n    assert 45.0 < test.features[:, -1].mean() < 55.0\n\n\ndef test_classification_split_fix_seed(cls_dataset):\n    splitter = StratifiedSplitter()\n    train_ind1, valid_ind1, test_ind1 = splitter._split(cls_dataset, seed=44)\n    train_ind2, valid_ind2, test_ind2 = splitter._split(cls_dataset, seed=44)\n\n    assert numpy.array_equal(train_ind1, train_ind2)\n    assert numpy.array_equal(valid_ind1, valid_ind2)\n    assert numpy.array_equal(test_ind1, test_ind2)\n\n\ndef test_split_fail_by_frac_ratio(cls_dataset):\n    splitter = StratifiedSplitter()\n    with pytest.raises(AssertionError):\n        train_ind, valid_ind, test_ind = splitter._split(cls_dataset,\n                                                         frac_train=0.4,\n                                                         frac_valid=0.3,\n                                            
             frac_test=0.2)\n\n\ndef test_split_fail_by_invalid_task_type(cls_dataset):\n    splitter = StratifiedSplitter()\n    with pytest.raises(ValueError):\n        train_ind, valid_ind, test_ind = splitter._split(cls_dataset,\n                                                         frac_train=0.5,\n                                                         frac_valid=0.3,\n                                                         frac_test=0.2,\n                                                         task_type='mix')\n\n\ndef test_regression_split_fix_seed(reg_dataset):\n    splitter = StratifiedSplitter()\n    train_ind1, valid_ind1, test_ind1 = splitter._split(reg_dataset, seed=44)\n    train_ind2, valid_ind2, test_ind2 = splitter._split(reg_dataset, seed=44)\n\n    assert numpy.array_equal(train_ind1, train_ind2)\n    assert numpy.array_equal(valid_ind1, valid_ind2)\n    assert numpy.array_equal(test_ind1, test_ind2)\n\n\ndef test_train_valid_test_classification_split(cls_dataset):\n    splitter = StratifiedSplitter()\n    train_ind, valid_ind, test_ind =\\\n        splitter.train_valid_test_split(cls_dataset)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 24\n    assert valid_ind.shape[0] == 3\n    assert test_ind.shape[0] == 3\n\n    train = NumpyTupleDataset(*cls_dataset.features[train_ind])\n    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])\n    test = NumpyTupleDataset(*cls_dataset.features[test_ind])\n    assert (train.features[:, -1] == 1).sum() == 8\n    assert (valid.features[:, -1] == 1).sum() == 1\n    assert (test.features[:, -1] == 1).sum() == 1\n\n\ndef test_train_valid_test_classification_split_return_dataset(cls_dataset):\n    splitter = StratifiedSplitter()\n    train, valid, test = splitter.train_valid_test_split(cls_dataset,\n                                                         return_index=False)\n    assert type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n  
  assert type(test) == NumpyTupleDataset\n    assert len(train) == 24\n    assert len(valid) == 3\n    assert len(test) == 3\n    assert (train.features[:, -1] == 1).sum() == 8\n    assert (valid.features[:, -1] == 1).sum() == 1\n    assert (test.features[:, -1] == 1).sum() == 1\n\n\ndef test_train_valid_test_classification_split_ndarray_return_dataset(\n        cls_ndarray_dataset):\n    cls_dataset, cls_label = cls_ndarray_dataset\n    splitter = StratifiedSplitter()\n    train, valid, test = splitter.train_valid_test_split(cls_dataset,\n                                                         labels=cls_label,\n                                                         return_index=False)\n    assert type(train) == numpy.ndarray\n    assert type(valid) == numpy.ndarray\n    assert type(test) == numpy.ndarray\n    assert len(train) == 24\n    assert len(valid) == 3\n    assert len(test) == 3\n    assert (train == 1).sum() == 8\n    assert (valid == 1).sum() == 1\n    assert (test == 1).sum() == 1\n\n\ndef test_train_valid_test_regression_split(reg_dataset):\n    splitter = StratifiedSplitter()\n    train_ind, valid_ind, test_ind =\\\n        splitter.train_valid_test_split(reg_dataset)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 80\n    assert valid_ind.shape[0] == 10\n    assert test_ind.shape[0] == 10\n\n    train = NumpyTupleDataset(*reg_dataset.features[train_ind])\n    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])\n    test = NumpyTupleDataset(*reg_dataset.features[test_ind])\n    assert 45.0 < train.features[:, -1].mean() < 55.0\n    assert 45.0 < valid.features[:, -1].mean() < 55.0\n    assert 45.0 < test.features[:, -1].mean() < 55.0\n\n\ndef test_train_valid_test_regression_split_return_dataset(reg_dataset):\n    splitter = StratifiedSplitter()\n    train, valid, test = splitter.train_valid_test_split(reg_dataset,\n                                                         return_index=False)\n    assert 
type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert type(test) == NumpyTupleDataset\n    assert len(train) == 80\n    assert len(valid) == 10\n    assert len(test) == 10\n    assert 45.0 < train.features[:, -1].mean() < 55.0\n    assert 45.0 < valid.features[:, -1].mean() < 55.0\n    assert 45.0 < test.features[:, -1].mean() < 55.0\n\n\ndef test_train_valid_classification_split(cls_dataset):\n    splitter = StratifiedSplitter()\n    train_ind, valid_ind = splitter.train_valid_split(cls_dataset)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 27\n    assert valid_ind.shape[0] == 3\n\n    train = NumpyTupleDataset(*cls_dataset.features[train_ind])\n    valid = NumpyTupleDataset(*cls_dataset.features[valid_ind])\n    assert (train.features[:, -1] == 1).sum() == 9\n    assert (valid.features[:, -1] == 1).sum() == 1\n\n\ndef test_train_valid_classification_split_return_dataset(cls_dataset):\n    splitter = StratifiedSplitter()\n    train, valid = splitter.train_valid_split(cls_dataset, return_index=False)\n    assert type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert len(train) == 27\n    assert len(valid) == 3\n    assert (train.features[:, -1] == 1).sum() == 9\n    assert (valid.features[:, -1] == 1).sum() == 1\n\n\ndef test_train_valid_classification_split_ndarray_return_dataset(\n        cls_ndarray_dataset):\n    cls_dataset, cls_label = cls_ndarray_dataset\n    splitter = StratifiedSplitter()\n    train, valid = splitter.train_valid_split(cls_dataset, labels=cls_label,\n                                              return_index=False)\n    assert type(train) == numpy.ndarray\n    assert type(valid) == numpy.ndarray\n    assert len(train) == 27\n    assert len(valid) == 3\n    assert (train == 1).sum() == 9\n    assert (valid == 1).sum() == 1\n\n\ndef test_train_valid_test_cls_split_by_labels_return_dataset(cls_dataset,\n                                       
                      cls_label):\n    splitter = StratifiedSplitter()\n    train, valid, test = splitter.train_valid_test_split(cls_dataset,\n                                                         labels=cls_label,\n                                                         return_index=False)\n    assert type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert type(test) == NumpyTupleDataset\n    assert len(train) == 24\n    assert len(valid) == 3\n    assert len(test) == 3\n    assert (train.features[:, -1] == 1).sum() == 8\n    assert (valid.features[:, -1] == 1).sum() == 1\n    assert (test.features[:, -1] == 1).sum() == 1\n\n\ndef test_train_valid_cls_split_by_labels_return_dataset(cls_dataset,\n                                                        cls_label):\n    splitter = StratifiedSplitter()\n    train, valid = splitter.train_valid_split(cls_dataset, labels=cls_label,\n                                              return_index=False)\n    assert type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert len(train) == 27\n    assert len(valid) == 3\n    assert (train.features[:, -1] == 1).sum() == 9\n    assert (valid.features[:, -1] == 1).sum() == 1\n\n\ndef test_train_valid_regression_split(reg_dataset):\n    splitter = StratifiedSplitter()\n    train_ind, valid_ind = splitter.train_valid_split(reg_dataset)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 90\n    assert valid_ind.shape[0] == 10\n\n    train = NumpyTupleDataset(*reg_dataset.features[train_ind])\n    valid = NumpyTupleDataset(*reg_dataset.features[valid_ind])\n    assert 45.0 < train.features[:, -1].mean() < 55.0\n    assert 45.0 < valid.features[:, -1].mean() < 55.0\n\n\ndef test_train_valid_regression_split_return_dataset(reg_dataset):\n    splitter = StratifiedSplitter()\n    train, valid = splitter.train_valid_split(reg_dataset, return_index=False)\n    assert type(train) == 
NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert len(train) == 90\n    assert len(valid) == 10\n    assert 45.0 < train.features[:, -1].mean() < 55.0\n    assert 45.0 < valid.features[:, -1].mean() < 55.0\n"
  },
  {
    "path": "tests/dataset_tests/splitters_tests/test_time_splitter.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.dataset.splitters.time_splitter import TimeSplitter\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\n\n\n@pytest.fixture\ndef time_list():\n    times = [\n        1980,\n        1990,\n        2010,\n        2020,\n        2000,\n        2050,\n        2030,\n        2040,\n        1960,\n        1970\n    ]\n    return times\n\n\n@pytest.fixture()\ndef dataset():\n    a = numpy.random.random((10, 10))\n    b = numpy.random.random((10, 8))\n    c = numpy.random.random((10, 1))\n    return NumpyTupleDataset(a, b, c)\n\n\ndef test_split(dataset, time_list):\n    splitter = TimeSplitter()\n    train_ind, valid_ind, test_ind = splitter._split(\n        dataset, time_list=time_list)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 8\n    assert valid_ind.shape[0] == 1\n    assert test_ind.shape[0] == 1\n    assert train_ind.tolist() == [8, 9, 0, 1, 4, 2, 3, 6]\n    assert valid_ind.tolist() == [7]\n    assert test_ind.tolist() == [5]\n\n    train_ind, valid_ind, test_ind = splitter._split(\n        dataset, frac_train=0.5, frac_valid=0.3, frac_test=0.2,\n        time_list=time_list)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 5\n    assert valid_ind.shape[0] == 3\n    assert test_ind.shape[0] == 2\n    assert train_ind.tolist() == [8, 9, 0, 1, 4]\n    assert valid_ind.tolist() == [2, 3, 6]\n    assert test_ind.tolist() == [7, 5]\n\n\ndef test_split_fail(dataset, time_list):\n    splitter = TimeSplitter()\n    with pytest.raises(AssertionError):\n        train_ind, valid_ind, test_ind = splitter._split(\n            dataset, frac_train=0.4, frac_valid=0.3, frac_test=0.2,\n            time_list=time_list)\n\n\ndef test_train_valid_test_split(dataset, time_list):\n    splitter = TimeSplitter()\n    train_ind, valid_ind, test_ind = splitter.train_valid_test_split(\n        dataset, time_list=time_list)\n    
assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 8\n    assert valid_ind.shape[0] == 1\n    assert test_ind.shape[0] == 1\n    assert train_ind.tolist() == [8, 9, 0, 1, 4, 2, 3, 6]\n    assert valid_ind.tolist() == [7]\n    assert test_ind.tolist() == [5]\n\n\ndef test_train_valid_test_split_return_dataset(dataset, time_list):\n    splitter = TimeSplitter()\n    train, valid, test = splitter.train_valid_test_split(\n        dataset, return_index=False, time_list=time_list)\n    assert type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert type(test) == NumpyTupleDataset\n    assert len(train) == 8\n    assert len(valid) == 1\n    assert len(test) == 1\n\n\ndef test_train_valid_split(dataset, time_list):\n    splitter = TimeSplitter()\n    train_ind, valid_ind = splitter.train_valid_split(\n        dataset, time_list=time_list)\n    assert type(train_ind) == numpy.ndarray\n    assert train_ind.shape[0] == 9\n    assert valid_ind.shape[0] == 1\n    assert train_ind.tolist() == [8, 9, 0, 1, 4, 2, 3, 6, 7]\n    assert valid_ind.tolist() == [5]\n\n\ndef test_train_split_return_dataset(dataset, time_list):\n    splitter = TimeSplitter()\n    train, valid = splitter.train_valid_split(\n        dataset, return_index=False, time_list=time_list)\n    assert type(train) == NumpyTupleDataset\n    assert type(valid) == NumpyTupleDataset\n    assert len(train) == 9\n    assert len(valid) == 1\n"
  },
  {
    "path": "tests/dataset_tests/test_converters.py",
    "content": "import chainer\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.dataset.converters import concat_mols\n\n\n@pytest.fixture\ndef data_1d():\n    a = numpy.array([1, 2])\n    b = numpy.array([4, 5, 6])\n    return a, b\n\n\n@pytest.fixture\ndef data_1d_expect():\n    a = numpy.array([1, 2, 0])\n    b = numpy.array([4, 5, 6])\n    return a, b\n\n\n@pytest.fixture\ndef data_2d():\n    a = numpy.array([[1, 2], [3, 4]])\n    b = numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n    return a, b\n\n\n@pytest.fixture\ndef data_2d_expect():\n    a = numpy.array([[1, 2, 0], [3, 4, 0], [0, 0, 0]])\n    b = numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n    return a, b\n\n\ndef test_concat_mols_1d_cpu(data_1d, data_1d_expect):\n    result = concat_mols(data_1d, device=-1)\n    assert numpy.array_equal(result[0], data_1d_expect[0])\n    assert numpy.array_equal(result[1], data_1d_expect[1])\n\n\ndef test_concat_mols_2d_cpu(data_2d, data_2d_expect):\n    result = concat_mols(data_2d, device=-1)\n    assert numpy.array_equal(result[0], data_2d_expect[0])\n    assert numpy.array_equal(result[1], data_2d_expect[1])\n\n\n@pytest.mark.gpu\ndef test_concat_mols_1d_gpu(data_1d, data_1d_expect):\n    result = concat_mols(data_1d, device=0)\n    assert chainer.cuda.get_device_from_array(result[0]).id == 0\n    assert chainer.cuda.get_device_from_array(result[1]).id == 0\n    assert numpy.array_equal(chainer.cuda.to_cpu(result[0]),\n                             data_1d_expect[0])\n    assert numpy.array_equal(chainer.cuda.to_cpu(result[1]),\n                             data_1d_expect[1])\n\n\n@pytest.mark.gpu\ndef test_concat_mols_2d_gpu(data_2d, data_2d_expect):\n    result = concat_mols(data_2d, device=0)\n    assert chainer.cuda.get_device_from_array(result[0]).id == 0\n    assert chainer.cuda.get_device_from_array(result[1]).id == 0\n    assert numpy.array_equal(chainer.cuda.to_cpu(result[0]),\n                             data_2d_expect[0])\n    assert 
numpy.array_equal(chainer.cuda.to_cpu(result[1]),\n                             data_2d_expect[1])\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/dataset_tests/test_numpy_tuple_feature_indexer.py",
    "content": "import numpy\nimport pytest\n\n\nfrom chainer_chemistry.dataset.indexers.numpy_tuple_dataset_feature_indexer import NumpyTupleDatasetFeatureIndexer  # NOQA\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\n\n\n@pytest.fixture\ndef data():\n    a = numpy.array([1, 2])\n    b = numpy.array([4, 5])\n    c = numpy.array([[6, 7, 8], [8, 9, 10]])\n    return a, b, c\n\n\n@pytest.fixture\ndef indexer(data):\n    dataset = NumpyTupleDataset(*data)\n    indexer = NumpyTupleDatasetFeatureIndexer(dataset)\n    return indexer\n\n\nclass TestNumpyTupleDatasetFeatureIndexer(object):\n\n    def test_feature_length(self, indexer):\n        assert indexer.features_length() == 3\n\n    @pytest.mark.parametrize('slice_index', [\n        0, 1, slice(0, 2, None), slice(0, 0, None)])\n    @pytest.mark.parametrize('j', [0, 1])\n    def test_extract_feature_by_slice(self, indexer, data, slice_index, j):\n        numpy.testing.assert_array_equal(\n            indexer.extract_feature_by_slice(slice_index, j),\n            data[j][slice_index])\n        # indexer's __getitem__ should call `extract_feature_by_slice` method,\n        # result should be same with above.\n        numpy.testing.assert_array_equal(\n            indexer[slice_index, j],\n            data[j][slice_index])\n\n    @pytest.mark.parametrize('ndarray_index', [\n        numpy.asarray([0, 1]), numpy.asarray([1]),\n        numpy.asarray([], dtype=numpy.int32)])\n    @pytest.mark.parametrize('j', [0, 1])\n    def test_extract_feature_by_ndarray(self, indexer, data, ndarray_index, j):\n        numpy.testing.assert_array_equal(\n            indexer.extract_feature_by_slice(ndarray_index, j),\n            data[j][ndarray_index])\n        # indexer's __getitem__ should call `extract_feature_by_slice` method,\n        # result should be same with above.\n        numpy.testing.assert_array_equal(\n            indexer[ndarray_index, j],\n            data[j][ndarray_index])\n\n\nif 
__name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/datasets_tests/molnet_tests/test_molnet.py",
    "content": "import os\n\nimport numpy\nimport pandas\nimport pytest\n\nfrom chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor  # NOQA\nfrom chainer_chemistry.datasets import molnet\nfrom chainer_chemistry.datasets import NumpyTupleDataset\n\nexpect_bbbp_lengths = [1633, 203, 203]\nexpect_bbbp_lengths2 = [1021, 611, 407]\nexpect_clearance_lengths = [669, 83, 85]\nexpect_pdbbind_lengths = [134, 16, 18]\nexpect_featurized_pdbbind_lengths = [151, 18, 20]\nexpect_qm7_lengths = [5468, 683, 683]\n\n\ndef test_get_molnet_filepath_without_download():\n    filepath = molnet.get_molnet_filepath('bbbp', download_if_not_exist=False)\n    if os.path.exists(filepath):\n        os.remove(filepath)  # ensure a cache file does not exist.\n\n    filepath = molnet.get_molnet_filepath('bbbp', download_if_not_exist=False)\n    assert isinstance(filepath, str)\n    assert not os.path.exists(filepath)\n\n\n@pytest.mark.slow\ndef test_get_molnet_filepath_with_download():\n    filepath = molnet.get_molnet_filepath('bbbp', download_if_not_exist=False)\n    if os.path.exists(filepath):\n        os.remove(filepath)  # ensure a cache file does not exist.\n\n    filepath = molnet.get_molnet_filepath('bbbp', download_if_not_exist=True)\n    assert isinstance(filepath, str)\n    assert os.path.exists(filepath)\n\n\ndef test_get_grid_featurized_pdbbind_dataset():\n    # Test core dataset\n    dataset = molnet.get_grid_featurized_pdbbind_dataset('core')\n    assert isinstance(dataset, NumpyTupleDataset)\n    x, y = dataset.get_datasets()\n    assert x.shape == (189, 2052)\n    assert x.dtype == numpy.int32\n    assert y.shape == (189, 1)\n    assert y.dtype == numpy.float32\n\n    # Test full dataset\n    dataset = molnet.get_grid_featurized_pdbbind_dataset('full')\n    assert isinstance(dataset, NumpyTupleDataset)\n    x, y = dataset.get_datasets()\n    assert x.shape == (11303, 2052)\n    assert x.dtype == numpy.int32\n    assert y.shape == 
(11303, 1)\n    assert y.dtype == numpy.float32\n\n    # Test refined dataset\n    dataset = molnet.get_grid_featurized_pdbbind_dataset('refined')\n    assert isinstance(dataset, NumpyTupleDataset)\n    x, y = dataset.get_datasets()\n    assert x.shape == (3568, 2052)\n    assert x.dtype == numpy.int32\n    assert y.shape == (3568, 1)\n    assert y.dtype == numpy.float32\n\n\n# bbbp is one of classification task dataset\n@pytest.mark.slow\ndef test_get_molnet_bbbp_dataset():\n    # test default behavior\n    pp = AtomicNumberPreprocessor()\n    datasets = molnet.get_molnet_dataset('bbbp', preprocessor=pp)\n    assert 'smiles' in datasets.keys()\n    assert 'dataset' in datasets.keys()\n    datasets = datasets['dataset']\n    assert len(datasets) == 3\n    assert type(datasets[0]) == NumpyTupleDataset\n    assert type(datasets[1]) == NumpyTupleDataset\n    assert type(datasets[2]) == NumpyTupleDataset\n\n    # Test each train, valid and test dataset\n    for i, dataset in enumerate(datasets):\n        # --- Test dataset is correctly obtained ---\n        index = numpy.random.choice(len(dataset), None)\n        atoms, label = dataset[index]\n\n        assert atoms.ndim == 1  # (atom, )\n        assert atoms.dtype == numpy.int32\n        # (atom from, atom to) or (edge_type, atom from, atom to)\n        assert label.ndim == 1\n        assert label.shape[0] == 1\n        assert label.dtype == numpy.int32\n        assert len(dataset) == expect_bbbp_lengths[i]\n\n\n# bbbp is one of classification task dataset\n@pytest.mark.slow\ndef test_get_molnet_bbbp_dataset_change_split_ratio():\n    # test default behavior\n    pp = AtomicNumberPreprocessor()\n    datasets = molnet.get_molnet_dataset('bbbp', preprocessor=pp,\n                                         frac_train=0.5, frac_valid=0.3,\n                                         frac_test=0.2)\n    assert 'smiles' in datasets.keys()\n    assert 'dataset' in datasets.keys()\n    datasets = datasets['dataset']\n    assert 
len(datasets) == 3\n    assert type(datasets[0]) == NumpyTupleDataset\n    assert type(datasets[1]) == NumpyTupleDataset\n    assert type(datasets[2]) == NumpyTupleDataset\n\n    # Test each train, valid and test dataset\n    for i, dataset in enumerate(datasets):\n        # --- Test dataset is correctly obtained ---\n        index = numpy.random.choice(len(dataset), None)\n        atoms, label = dataset[index]\n\n        assert atoms.ndim == 1  # (atom, )\n        assert atoms.dtype == numpy.int32\n        # (atom from, atom to) or (edge_type, atom from, atom to)\n        assert label.ndim == 1\n        assert label.shape[0] == 1\n        assert label.dtype == numpy.int32\n        assert len(dataset) == expect_bbbp_lengths2[i]\n\n\n@pytest.mark.slow\ndef test_get_molnet_bbbp_dataset_with_smiles():\n    # test default behavior\n    pp = AtomicNumberPreprocessor()\n    datasets = molnet.get_molnet_dataset('bbbp', preprocessor=pp,\n                                         return_smiles=True)\n\n    assert 'smiles' in datasets.keys()\n    assert 'dataset' in datasets.keys()\n    smileses = datasets['smiles']\n    datasets = datasets['dataset']\n    assert len(smileses) == 3\n    assert len(datasets) == 3\n\n    # Test each train, valid and test dataset\n    for i, dataset in enumerate(datasets):\n        # --- Test dataset is correctly obtained ---\n        index = numpy.random.choice(len(dataset), None)\n        atoms, label = dataset[index]\n\n        assert atoms.ndim == 1  # (atom, )\n        assert atoms.dtype == numpy.int32\n        # (atom from, atom to) or (edge_type, atom from, atom to)\n        assert label.ndim == 1\n        assert label.shape[0] == 1\n        assert label.dtype == numpy.int32\n        assert len(dataset) == expect_bbbp_lengths[i]\n        assert len(smileses[i]) == expect_bbbp_lengths[i]\n\n\n# clearance is one of regression task dataset\n@pytest.mark.slow\ndef test_get_molnet_clearance_dataset():\n    # test default behavior\n    pp = 
AtomicNumberPreprocessor()\n    datasets = molnet.get_molnet_dataset('clearance', preprocessor=pp)\n    assert 'smiles' in datasets.keys()\n    assert 'dataset' in datasets.keys()\n    datasets = datasets['dataset']\n    assert len(datasets) == 3\n\n    # Test each train, valid and test dataset\n    for i, dataset in enumerate(datasets):\n        # --- Test dataset is correctly obtained ---\n        index = numpy.random.choice(len(dataset), None)\n        atoms, label = dataset[index]\n\n        assert atoms.ndim == 1  # (atom, )\n        assert atoms.dtype == numpy.int32\n        # (atom from, atom to) or (edge_type, atom from, atom to)\n        assert label.ndim == 1\n        assert label.shape[0] == 1\n        assert label.dtype == numpy.float32\n\n        # --- Test number of dataset ---\n        assert len(dataset) == expect_clearance_lengths[i]\n\n\n@pytest.mark.slow\ndef test_get_molnet_clearance_dataset_with_return_smiles_enabled():\n    # test default behavior\n    pp = AtomicNumberPreprocessor()\n    datasets = molnet.get_molnet_dataset('clearance', preprocessor=pp,\n                                         return_smiles=True)\n    assert 'smiles' in datasets.keys()\n    assert 'dataset' in datasets.keys()\n    smileses = datasets['smiles']\n    datasets = datasets['dataset']\n    assert len(datasets) == 3\n    assert len(smileses) == 3\n\n    # Test each train, valid and test dataset\n    for i, dataset in enumerate(datasets):\n        # --- Test dataset is correctly obtained ---\n        index = numpy.random.choice(len(dataset), None)\n        atoms, label = dataset[index]\n\n        assert atoms.ndim == 1  # (atom, )\n        assert atoms.dtype == numpy.int32\n        # (atom from, atom to) or (edge_type, atom from, atom to)\n        assert label.ndim == 1\n        assert label.shape[0] == 1\n        assert label.dtype == numpy.float32\n\n        # --- Test number of dataset ---\n        assert len(dataset) == expect_clearance_lengths[i]\n        
assert len(smileses[i]) == expect_clearance_lengths[i]\n\n\n@pytest.mark.slow\ndef test_get_molnet_pdbbind_dataset():\n    # test default behavior\n    pp = AtomicNumberPreprocessor()\n    time_list = numpy.random.randint(1000, size=168).tolist()\n    datasets = molnet.get_molnet_dataset('pdbbind_smiles', preprocessor=pp,\n                                         pdbbind_subset='core',\n                                         time_list=time_list, split='random')\n    assert 'smiles' in datasets.keys()\n    assert 'dataset' in datasets.keys()\n    assert 'pdb_id' in datasets.keys()\n    datasets = datasets['dataset']\n    assert len(datasets) == 3\n    assert type(datasets[0]) == NumpyTupleDataset\n    assert type(datasets[1]) == NumpyTupleDataset\n    assert type(datasets[2]) == NumpyTupleDataset\n\n    # Test each train, valid and test dataset\n    for i, dataset in enumerate(datasets):\n        # --- Test dataset is correctly obtained ---\n        index = numpy.random.choice(len(dataset), None)\n        atoms, label = dataset[index]\n\n        assert atoms.ndim == 1  # (atom, )\n        assert atoms.dtype == numpy.int32\n        # (atom from, atom to) or (edge_type, atom from, atom to)\n        assert label.ndim == 1\n        assert label.shape[0] == 1\n        assert label.dtype == numpy.float32\n\n        # --- Test number of dataset ---\n        assert len(dataset) == expect_pdbbind_lengths[i]\n\n\n@pytest.mark.slow\ndef test_get_molnet_pdbbind_dataset_with_pdb_id():\n    # test default behavior\n    pp = AtomicNumberPreprocessor()\n    time_list = numpy.random.randint(1000, size=168).tolist()\n    datasets = molnet.get_molnet_dataset('pdbbind_smiles', preprocessor=pp,\n                                         pdbbind_subset='core',\n                                         return_pdb_id=True,\n                                         time_list=time_list, split='random')\n    assert 'smiles' in datasets.keys()\n    assert 'dataset' in datasets.keys()\n    
assert 'pdb_id' in datasets.keys()\n    pdb_ids = datasets['pdb_id']\n    datasets = datasets['dataset']\n    assert len(pdb_ids) == 3\n    assert len(datasets) == 3\n\n    # Test each train, valid and test dataset\n    for i, dataset in enumerate(datasets):\n        # --- Test dataset is correctly obtained ---\n        index = numpy.random.choice(len(dataset), None)\n        atoms, label = dataset[index]\n\n        assert atoms.ndim == 1  # (atom, )\n        assert atoms.dtype == numpy.int32\n        # (atom from, atom to) or (edge_type, atom from, atom to)\n        assert label.ndim == 1\n        assert label.shape[0] == 1\n        assert label.dtype == numpy.float32\n\n        # --- Test number of dataset ---\n        assert len(dataset) == expect_pdbbind_lengths[i]\n        assert len(pdb_ids[i]) == expect_pdbbind_lengths[i]\n\n\n@pytest.mark.slow\ndef test_get_molnet_grid_featurized_pdbbind_dataset():\n    # test default behavior\n    datasets = molnet.get_molnet_dataset('pdbbind_grid', pdbbind_subset='core',\n                                         split='random')\n    assert 'dataset' in datasets.keys()\n    datasets = datasets['dataset']\n    assert len(datasets) == 3\n    assert type(datasets[0]) == NumpyTupleDataset\n    assert type(datasets[1]) == NumpyTupleDataset\n    assert type(datasets[2]) == NumpyTupleDataset\n\n    # Test each train, valid and test dataset\n    for i, dataset in enumerate(datasets):\n        # --- Test dataset is correctly obtained ---\n        index = numpy.random.choice(len(dataset), None)\n        atoms, label = dataset[index]\n\n        assert atoms.ndim == 1  # (atom, )\n        assert atoms.dtype == numpy.int32\n        # (atom from, atom to) or (edge_type, atom from, atom to)\n        assert label.ndim == 1\n        assert label.shape[0] == 1\n        assert label.dtype == numpy.float32\n\n        # --- Test number of dataset ---\n        assert len(dataset) == expect_featurized_pdbbind_lengths[i]\n\n\n# For qm7 dataset, 
stratified splitting is recommended.\n@pytest.mark.slow\ndef test_get_molnet_qm7_dataset():\n    # test default behavior\n    pp = AtomicNumberPreprocessor()\n    datasets = molnet.get_molnet_dataset('qm7', preprocessor=pp)\n    assert 'smiles' in datasets.keys()\n    assert 'dataset' in datasets.keys()\n    datasets = datasets['dataset']\n    assert len(datasets) == 3\n    assert type(datasets[0]) == NumpyTupleDataset\n    assert type(datasets[1]) == NumpyTupleDataset\n    assert type(datasets[2]) == NumpyTupleDataset\n\n    # Test each train, valid and test dataset\n    for i, dataset in enumerate(datasets):\n        # --- Test dataset is correctly obtained ---\n        index = numpy.random.choice(len(dataset), None)\n        atoms, label = dataset[index]\n\n        assert atoms.ndim == 1  # (atom, )\n        assert atoms.dtype == numpy.int32\n        # (atom from, atom to) or (edge_type, atom from, atom to)\n        assert label.ndim == 1\n        assert label.shape[0] == 1\n        assert label.dtype == numpy.float32\n\n        # --- Test number of dataset ---\n        assert len(dataset) == expect_qm7_lengths[i]\n\n\n# For qm7 dataset, stratified splitting is recommended.\n@pytest.mark.slow\ndef test_get_molnet_qm7_dataset_with_smiles():\n    # test default behavior\n    pp = AtomicNumberPreprocessor()\n    datasets = molnet.get_molnet_dataset('qm7', preprocessor=pp,\n                                         return_smiles=True)\n    assert 'smiles' in datasets.keys()\n    assert 'dataset' in datasets.keys()\n    smileses = datasets['smiles']\n    datasets = datasets['dataset']\n    assert len(datasets) == 3\n    assert len(smileses) == 3\n    assert type(datasets[0]) == NumpyTupleDataset\n    assert type(datasets[1]) == NumpyTupleDataset\n    assert type(datasets[2]) == NumpyTupleDataset\n\n    # Test each train, valid and test dataset\n    for i, dataset in enumerate(datasets):\n        # --- Test dataset is correctly obtained ---\n        index = 
numpy.random.choice(len(dataset), None)\n        atoms, label = dataset[index]\n\n        assert atoms.ndim == 1  # (atom, )\n        assert atoms.dtype == numpy.int32\n        # (atom from, atom to) or (edge_type, atom from, atom to)\n        assert label.ndim == 1\n        assert label.shape[0] == 1\n        assert label.dtype == numpy.float32\n\n        # --- Test number of dataset ---\n        assert len(dataset) == expect_qm7_lengths[i]\n        assert len(smileses[i]) == expect_qm7_lengths[i]\n\n\ndef test_get_molnet_bbbp_dataframe():\n    datasets = molnet.get_molnet_dataframe('bbbp')\n    assert isinstance(datasets, pandas.DataFrame)\n    assert len(datasets) == 2050\n\n\ndef test_get_molnet_pdbbind_smiles_dataframe():\n    datasets = molnet.get_molnet_dataframe('pdbbind_smiles',\n                                           pdbbind_subset='core')\n    assert isinstance(datasets, pandas.DataFrame)\n    assert len(datasets) == 168\n\n\ndef test_get_molnet_pdbbind_grid_dataframe():\n    with pytest.raises(ValueError):\n        datasets = molnet.get_molnet_dataframe('pdbbind_grid',  # NOQA\n                                               pdbbind_subset='core')\n\n\nif __name__ == '__main__':\n    args = [__file__, '-v', '-s']\n    pytest.main(args=args)\n"
  },
  {
    "path": "tests/datasets_tests/molnet_tests/test_pdbbind_time.py",
    "content": "import os\n\nimport pytest\n\nfrom chainer_chemistry.datasets.molnet import pdbbind_time\n\n\n@pytest.mark.slow\ndef test_get_pdbbind_time_filepath():\n    filepath = pdbbind_time.get_pdbbind_time_filepath(\n        download_if_not_exist=False)\n    if os.path.exists(filepath):\n        os.remove(filepath)\n\n    filepath = pdbbind_time.get_pdbbind_time_filepath(\n        download_if_not_exist=True)\n    assert isinstance(filepath, str)\n    assert os.path.exists(filepath)\n\n\ndef test_get_pdbbind_time():\n    time_list = pdbbind_time.get_pdbbind_time()\n    assert isinstance(time_list, list)\n    for time in time_list:\n        assert 1900 < time < 2100\n"
  },
  {
    "path": "tests/datasets_tests/test_numpy_tuple_dataset.py",
    "content": "import os\nimport tempfile\n\nimport numpy\nimport pytest\nimport six\n\nfrom chainer_chemistry.datasets import NumpyTupleDataset\n\n\n@pytest.fixture\ndef data():\n    a = numpy.array([1, 2])\n    b = numpy.array([4, 5])\n    c = numpy.array([[6, 7, 8], [8, 9, 10]])\n    return a, b, c\n\n\n@pytest.fixture\ndef long_data():\n    a = numpy.array([1, 2, 3, 4])\n    b = numpy.array([4, 5, 6, 7])\n    c = numpy.array([[6, 7, 8], [8, 9, 10], [11, 12, 13], [14, 15, 16]])\n    return a, b, c\n\n\nclass TestNumpyTupleDataset(object):\n\n    def test_len(self, data):\n        dataset = NumpyTupleDataset(*data)\n        assert len(dataset) == 2\n\n    @pytest.mark.parametrize('index', [0, 1])\n    def test_get_item_integer_index(self, data, index):\n        dataset = NumpyTupleDataset(*data)\n        actual = dataset[index]\n\n        assert len(actual) == len(data)\n        for a, d in six.moves.zip(actual, data):\n            numpy.testing.assert_array_equal(a, d[index])\n\n    @pytest.mark.parametrize('index', [slice(0, 2, None)])\n    def test_get_item_slice_index(self, data, index):\n        dataset = NumpyTupleDataset(*data)\n        actual = dataset[index]\n\n        batches = [d[index] for d in data]\n        length = len(batches[0])\n        expect = [tuple([batch[i] for batch in batches])\n                  for i in six.moves.range(length)]\n\n        assert len(actual) == len(expect)\n        for tuple_a, tuple_e in six.moves.zip(actual, expect):\n            assert len(tuple_a) == len(tuple_e)\n            for a, e in six.moves.zip(tuple_a, tuple_e):\n                numpy.testing.assert_array_equal(a, e)\n\n    @pytest.mark.parametrize('index', [\n        numpy.asarray([2, 0]), numpy.asarray([1]),\n        numpy.asarray([], dtype=numpy.int32)])\n    def test_get_item_ndarray_index(self, long_data, index):\n        dataset = NumpyTupleDataset(*long_data)\n        actual = dataset[index]\n\n        batches = [d[index] for d in long_data]\n        
length = len(batches[0])\n        expect = [tuple([batch[i] for batch in batches])\n                  for i in six.moves.range(length)]\n\n        assert len(actual) == len(expect)\n        for tuple_a, tuple_e in six.moves.zip(actual, expect):\n            assert len(tuple_a) == len(tuple_e)\n            for a, e in six.moves.zip(tuple_a, tuple_e):\n                numpy.testing.assert_array_equal(a, e)\n\n    @pytest.mark.parametrize('index', [[2, 0], [1]])\n    def test_get_item_list_index(self, long_data, index):\n        dataset = NumpyTupleDataset(*long_data)\n        actual = dataset[index]\n\n        batches = [d[index] for d in long_data]\n        length = len(batches[0])\n        expect = [tuple([batch[i] for batch in batches])\n                  for i in six.moves.range(length)]\n\n        assert len(actual) == len(expect)\n        for tuple_a, tuple_e in six.moves.zip(actual, expect):\n            assert len(tuple_a) == len(tuple_e)\n            for a, e in six.moves.zip(tuple_a, tuple_e):\n                numpy.testing.assert_array_equal(a, e)\n\n    def test_invalid_datasets(self):\n        a = numpy.array([1, 2])\n        b = numpy.array([1, 2, 3])\n        with pytest.raises(ValueError):\n            NumpyTupleDataset(a, b)\n\n    def test_save_load(self, data):\n        tmp_cache_path = os.path.join(tempfile.mkdtemp(), 'tmp.npz')\n        dataset = NumpyTupleDataset(*data)\n        NumpyTupleDataset.save(tmp_cache_path, dataset)\n        assert os.path.exists(tmp_cache_path)\n        load_dataset = NumpyTupleDataset.load(tmp_cache_path)\n        os.remove(tmp_cache_path)\n\n        assert len(dataset._datasets) == len(load_dataset._datasets)\n        for a, d in six.moves.zip(dataset._datasets, load_dataset._datasets):\n            numpy.testing.assert_array_equal(a, d)\n\n    def test_get_datasets(self, data):\n        dataset = NumpyTupleDataset(*data)\n        datasets = dataset.get_datasets()\n        assert len(datasets) == len(data)\n        
for i in range(len(datasets)):\n            numpy.testing.assert_array_equal(datasets[i], data[i])\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/datasets_tests/test_qm9.py",
    "content": "import os\n\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor  # NOQA\nfrom chainer_chemistry.datasets import qm9\n\n\nQM9_NUM_LABEL = 15\nQM9_NUM_DATASET = 133885\n\n\ndef test_get_qm9_filepath_without_download():\n    filepath = qm9.get_qm9_filepath(download_if_not_exist=False)\n    if os.path.exists(filepath):\n        os.remove(filepath)  # ensure a cache file does not exist.\n\n    filepath = qm9.get_qm9_filepath(download_if_not_exist=False)\n    assert isinstance(filepath, str)\n    assert not os.path.exists(filepath)\n\n\n@pytest.mark.slow\ndef test_get_qm9_filepath_with_download():\n    filepath = qm9.get_qm9_filepath(download_if_not_exist=False)\n    if os.path.exists(filepath):\n        os.remove(filepath)  # ensure a cache file does not exist.\n\n    # This method downloads the file if not exist\n    filepath = qm9.get_qm9_filepath(download_if_not_exist=True)\n    assert isinstance(filepath, str)\n    assert os.path.exists(filepath)\n\n\n@pytest.mark.slow\ndef test_get_qm9():\n    # test default behavior\n    pp = AtomicNumberPreprocessor()\n    dataset = qm9.get_qm9(preprocessor=pp)\n\n    # --- Test dataset is correctly obtained ---\n    index = numpy.random.choice(len(dataset), None)\n    atoms, label = dataset[index]\n\n    assert atoms.ndim == 1  # (atom, )\n    assert atoms.dtype == numpy.int32\n    # (atom from, atom to) or (edge_type, atom from, atom to)\n    assert label.ndim == 1\n    assert label.shape[0] == QM9_NUM_LABEL\n    assert label.dtype == numpy.float32\n\n    # --- Test number of dataset ---\n    assert len(dataset) == QM9_NUM_DATASET\n\n\n@pytest.mark.slow\ndef test_get_qm9_smiles():\n    # test default behavior\n    pp = AtomicNumberPreprocessor()\n    dataset, smiles = qm9.get_qm9(preprocessor=pp, return_smiles=True)\n\n    # --- Test dataset is correctly obtained ---\n    index = numpy.random.choice(len(dataset), None)\n    
atoms, label = dataset[index]\n\n    assert atoms.ndim == 1  # (atom, )\n    assert atoms.dtype == numpy.int32\n    # (atom from, atom to) or (edge_type, atom from, atom to)\n    assert label.ndim == 1\n    assert label.shape[0] == QM9_NUM_LABEL\n    assert label.dtype == numpy.float32\n\n    # --- Test number of dataset ---\n    assert len(dataset) == QM9_NUM_DATASET\n    assert len(smiles) == QM9_NUM_DATASET\n\n    # --- Test order of dataset ---\n    atoms0, labels0 = dataset[0]\n    assert smiles[0] == 'C'\n    assert numpy.alltrue(atoms0 == numpy.array([6], dtype=numpy.int32))\n\n    atoms7777, labels7777 = dataset[7777]\n    assert smiles[7777] == 'CC1=NCCC(C)O1'\n    assert numpy.alltrue(\n        atoms7777 == numpy.array([6, 6, 7, 6, 6, 6, 6, 8], dtype=numpy.int32))\n\n    atoms133884, labels133884 = dataset[133884]\n    assert smiles[133884] == 'C1N2C3C4C5OC13C2C54'\n    assert numpy.alltrue(\n        atoms133884 == numpy.array([6, 7, 6, 6, 6, 8, 6, 6, 6],\n                                   dtype=numpy.int32))\n\n\ndef test_get_qm9_label_names():\n    label_names = qm9.get_qm9_label_names()\n    assert isinstance(label_names, list)\n    for label in label_names:\n        assert isinstance(label, str)\n\n\nif __name__ == '__main__':\n    args = [__file__, '-v', '-s']\n    pytest.main(args=args)\n"
  },
  {
    "path": "tests/datasets_tests/test_tox21.py",
    "content": "import os\n\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor  # NOQA\nfrom chainer_chemistry.datasets import tox21\n\n\nTOX21_NUM_LABEL = 12\ndataset_types = [\n    'train',\n    'val',\n    'test'\n]\n\n\n@pytest.mark.parametrize('dataset_type', dataset_types)\ndef test_get_tox21_filepath_without_download(dataset_type):\n    filepath = tox21.get_tox21_filepath(dataset_type,\n                                        download_if_not_exist=False)\n    if os.path.exists(filepath):\n        os.remove(filepath)  # ensure a cache file does not exist.\n\n    filepath = tox21.get_tox21_filepath(dataset_type,\n                                        download_if_not_exist=False)\n    assert isinstance(filepath, str)\n    assert not os.path.exists(filepath)\n\n\n@pytest.mark.slow\n@pytest.mark.parametrize('dataset_type', dataset_types)\ndef test_get_tox21_filepath_with_download(dataset_type):\n    filepath = tox21.get_tox21_filepath(dataset_type,\n                                        download_if_not_exist=False)\n    if os.path.exists(filepath):\n        os.remove(filepath)  # ensure a cache file does not exist.\n\n    # This method downloads the file if not exist\n    filepath = tox21.get_tox21_filepath(dataset_type,\n                                        download_if_not_exist=True)\n    assert isinstance(filepath, str)\n    assert os.path.exists(filepath)\n\n\n@pytest.mark.slow\ndef test_get_tox21():\n    # test default behavior\n    pp = AtomicNumberPreprocessor()\n    train, val, test = tox21.get_tox21(preprocessor=pp)\n\n    # --- Test dataset is correctly obtained ---\n    for dataset in [train, val, test]:\n        index = numpy.random.choice(len(dataset), None)\n        atoms, label = dataset[index]\n\n        assert atoms.ndim == 1  # (atom, )\n        assert atoms.dtype == numpy.int32\n        assert label.ndim == 1\n        assert label.shape[0] == 
TOX21_NUM_LABEL\n        assert label.dtype == numpy.int32\n\n\ndef test_get_tox21_label_names():\n    label_names = tox21.get_tox21_label_names()\n    assert isinstance(label_names, list)\n    for label in label_names:\n        assert isinstance(label, str)\n\n\ndef test_get_tox21_filepath_assert_raises():\n    with pytest.raises(ValueError):\n        tox21.get_tox21_filepath('other')\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/datasets_tests/test_zinc.py",
    "content": "import os\n\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.dataset.preprocessors.atomic_number_preprocessor import AtomicNumberPreprocessor  # NOQA\nfrom chainer_chemistry.datasets import zinc\n\n\nZINC250K_NUM_LABEL = 3\nZINC250K_NUM_DATASET = 249455\n\n\ndef test_get_zinc_filepath_without_download():\n    filepath = zinc.get_zinc250k_filepath(download_if_not_exist=False)\n    if os.path.exists(filepath):\n        os.remove(filepath)  # ensure a cache file does not exist.\n\n    filepath = zinc.get_zinc250k_filepath(download_if_not_exist=False)\n    assert isinstance(filepath, str)\n    assert not os.path.exists(filepath)\n\n\n@pytest.mark.slow\ndef test_get_zinc_filepath_with_download():\n    filepath = zinc.get_zinc250k_filepath(download_if_not_exist=False)\n    if os.path.exists(filepath):\n        os.remove(filepath)  # ensure a cache file does not exist.\n\n    # This method downloads the file if not exist\n    filepath = zinc.get_zinc250k_filepath(download_if_not_exist=True)\n    assert isinstance(filepath, str)\n    assert os.path.exists(filepath)\n\n\n@pytest.mark.slow\ndef test_get_zinc():\n    # test default behavior\n    pp = AtomicNumberPreprocessor()\n    dataset = zinc.get_zinc250k(preprocessor=pp)\n\n    # --- Test dataset is correctly obtained ---\n    index = numpy.random.choice(len(dataset), None)\n    atoms, label = dataset[index]\n\n    assert atoms.ndim == 1  # (atom, )\n    assert atoms.dtype == numpy.int32\n    assert label.ndim == 1\n    assert label.shape[0] == ZINC250K_NUM_LABEL\n    assert label.dtype == numpy.float32\n\n    # --- Test number of dataset ---\n    assert len(dataset) == ZINC250K_NUM_DATASET\n\n\n@pytest.mark.slow\ndef test_get_zinc_smiles():\n    # test smiles extraction and dataset order\n    pp = AtomicNumberPreprocessor()\n    target_index = [0, 7777, 249454]  # set target_index for fast testing...\n    dataset, smiles = zinc.get_zinc250k(preprocessor=pp, return_smiles=True,\n                     
                   target_index=target_index)\n\n    # --- Test dataset is correctly obtained ---\n    index = numpy.random.choice(len(dataset), None)\n    atoms, label = dataset[index]\n\n    assert atoms.ndim == 1  # (atom, )\n    assert atoms.dtype == numpy.int32\n    # (atom from, atom to) or (edge_type, atom from, atom to)\n    assert label.ndim == 1\n    assert label.shape[0] == ZINC250K_NUM_LABEL\n    assert label.dtype == numpy.float32\n\n    # --- Test number of dataset ---\n    assert len(dataset) == len(target_index)\n    assert len(smiles) == len(target_index)\n\n    # --- Test order of dataset ---\n    assert smiles[0] == 'CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1'\n    atoms0, labels0 = dataset[0]\n    assert numpy.alltrue(atoms0 == numpy.array(\n        [6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 8, 7, 6, 6, 6, 6, 6, 6, 9, 6,\n         6], dtype=numpy.int32))\n    assert numpy.alltrue(labels0 == numpy.array(\n        [5.0506, 0.70201224, 2.0840945], dtype=numpy.float32))\n\n    assert smiles[1] == 'CCCc1cc(NC(=O)Nc2ccc3c(c2)OCCO3)n(C)n1'\n    atoms7777, labels7777 = dataset[1]\n    assert numpy.alltrue(atoms7777 == numpy.array(\n        [6, 6, 6, 6, 6, 6, 7, 6, 8, 7, 6, 6, 6, 6, 6, 6, 8, 6, 6, 8, 7, 6, 7],\n        dtype=numpy.int32))\n    assert numpy.alltrue(labels7777 == numpy.array(\n        [2.7878, 0.9035222, 2.3195992], dtype=numpy.float32))\n\n    assert smiles[2] == 'O=C(CC(c1ccccc1)c1ccccc1)N1CCN(S(=O)(=O)c2ccccc2[N+](=O)[O-])CC1'  # NOQA\n    atoms249454, labels249454 = dataset[2]\n    assert numpy.alltrue(atoms249454 == numpy.array(\n        [8,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,\n         6,  6,  7, 16,  8,  8,  6,  6,  6,  6,  6,  6,  7,  8,  8,  6,  6],\n        dtype=numpy.int32))\n    assert numpy.alltrue(labels249454 == numpy.array(\n        [3.6499, 0.37028658, 2.2142494], dtype=numpy.float32))\n\n\ndef test_get_zinc_label_names():\n    label_names = zinc.get_zinc250k_label_names()\n    assert 
isinstance(label_names, list)\n    for label in label_names:\n        assert isinstance(label, str)\n\n\nif __name__ == '__main__':\n    args = [__file__, '-v', '-s']\n    pytest.main(args=args)\n"
  },
  {
    "path": "tests/functions_tests/activation/test_megnet_softplus.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer import cuda\n\nfrom chainer_chemistry.functions.activation.megnet_softplus \\\n    import megnet_softplus\n\n\ndef test_forward_cpu():\n    x = numpy.array([[1, 2, 3], [6, 5, 4]], dtype=numpy.float32)\n    output = megnet_softplus(x)\n    expect_output = numpy.array([\n        [0.62011445, 1.4337809, 2.3554401],\n        [5.3093286, 4.313568, 3.3250027]], dtype=numpy.float32)\n    numpy.allclose(output.array, expect_output)\n\n\ndef test_forward_zero_cpu():\n    x = numpy.zeros((2, 3), dtype=numpy.float32)\n    output = megnet_softplus(x)\n    expect_output = numpy.zeros((2, 3), dtype=numpy.float32)\n    numpy.allclose(output.array, expect_output)\n\n\ndef test_forward_avoid_overflow_cpu():\n    x = numpy.array([1e5], dtype=numpy.float32)\n    output = megnet_softplus(x)\n    expect_output = numpy.array([1e5], dtype=numpy.float32)\n    numpy.allclose(output.array, expect_output)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu():\n    x = cuda.to_gpu(numpy.array([[1, 2, 3], [6, 5, 4]], dtype=numpy.float32))\n    output = megnet_softplus(x)\n    expect_output = numpy.array([\n        [0.62011445, 1.4337809, 2.3554401],\n        [5.3093286, 4.313568, 3.3250027]], dtype=numpy.float32)\n    numpy.allclose(cuda.to_cpu(output.array), expect_output)\n\n\n@pytest.mark.gpu\ndef test_forward_zero_gpu():\n    x = cuda.to_gpu(numpy.zeros((2, 3), dtype=numpy.float32))\n    output = megnet_softplus(x)\n    expect_output = numpy.zeros((2, 3), dtype=numpy.float32)\n    numpy.allclose(cuda.to_cpu(output.array), expect_output)\n\n\n@pytest.mark.gpu\ndef test_forward_avoid_overflow_gpu():\n    x = cuda.to_gpu(numpy.array([1e5], dtype=numpy.float32))\n    output = megnet_softplus(x)\n    expect_output = numpy.array([1e5], dtype=numpy.float32)\n    numpy.allclose(cuda.to_cpu(output.array), expect_output)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/functions_tests/activation/test_shifted_softplus.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer import cuda\n\nfrom chainer_chemistry.functions.activation.shifted_softplus import shifted_softplus  # NOQA\n\n\ndef test_forward_cpu():\n    x = numpy.array([[1, 2, 3], [6, 5, 4]], dtype=numpy.float32)\n    output = shifted_softplus(x)\n    expect_output = numpy.array([\n        [0.62011445, 1.4337809, 2.3554401],\n        [5.3093286, 4.313568, 3.3250027]], dtype=numpy.float32)\n    numpy.allclose(output.array, expect_output)\n\n\ndef test_forward_zero_cpu():\n    x = numpy.zeros((2, 3), dtype=numpy.float32)\n    output = shifted_softplus(x)\n    expect_output = numpy.zeros((2, 3), dtype=numpy.float32)\n    numpy.allclose(output.array, expect_output)\n\n\ndef test_forward_avoid_overflow_cpu():\n    x = numpy.array([1e5], dtype=numpy.float32)\n    output = shifted_softplus(x)\n    expect_output = numpy.array([1e5], dtype=numpy.float32)\n    numpy.allclose(output.array, expect_output)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu():\n    x = cuda.to_gpu(numpy.array([[1, 2, 3], [6, 5, 4]], dtype=numpy.float32))\n    output = shifted_softplus(x)\n    expect_output = numpy.array([\n        [0.62011445, 1.4337809, 2.3554401],\n        [5.3093286, 4.313568, 3.3250027]], dtype=numpy.float32)\n    numpy.allclose(cuda.to_cpu(output.array), expect_output)\n\n\n@pytest.mark.gpu\ndef test_forward_zero_gpu():\n    x = cuda.to_gpu(numpy.zeros((2, 3), dtype=numpy.float32))\n    output = shifted_softplus(x)\n    expect_output = numpy.zeros((2, 3), dtype=numpy.float32)\n    numpy.allclose(cuda.to_cpu(output.array), expect_output)\n\n\n@pytest.mark.gpu\ndef test_forward_avoid_overflow_gpu():\n    x = cuda.to_gpu(numpy.array([1e5], dtype=numpy.float32))\n    output = shifted_softplus(x)\n    expect_output = numpy.array([1e5], dtype=numpy.float32)\n    numpy.allclose(cuda.to_cpu(output.array), expect_output)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/functions_tests/activation/test_softmax.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer import cuda\n\nfrom chainer_chemistry.functions.activation.softmax import softmax\n\n\ndef test_forward_cpu():\n    x = numpy.array([[1, 2, 3], [6, 5, 4]], dtype=numpy.float32)\n    output = softmax(x)\n    expect_output = numpy.array([\n        [0.09003057, 0.24472848, 0.66524094],\n        [0.66524094, 0.24472848, 0.09003057]], dtype=numpy.float32)\n    numpy.allclose(output.array, expect_output)\n\n\ndef test_forward_cpu_with_mask():\n    x = numpy.array([[1, 2, 3, 2, 5], [1, 6, 5, 4, 2]], dtype=numpy.float32)\n    mask = numpy.array([[1, 1, 1, 0, 0], [0, 1, 1, 1, 0]], dtype=numpy.float32)\n    output = softmax(x, mask=mask)\n    expect_output = numpy.array([\n        [0.09003057, 0.24472848, 0.66524094, 0., 0.],\n        [0., 0.66524094, 0.24472848, 0.09003057, 0.]], dtype=numpy.float32)\n    numpy.allclose(output.array, expect_output)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu():\n    x = cuda.to_gpu(numpy.array([[1, 2, 3], [6, 5, 4]], dtype=numpy.float32))\n    output = softmax(x)\n    expect_output = numpy.array([\n        [0.09003057, 0.24472848, 0.66524094],\n        [0.66524094, 0.24472848, 0.09003057]], dtype=numpy.float32)\n    numpy.allclose(cuda.to_cpu(output.array), expect_output)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu_with_mask():\n    x = numpy.array([[1, 2, 3, 2, 5], [1, 6, 5, 4, 2]], dtype=numpy.float32)\n    mask = numpy.array([[1, 1, 1, 0, 0], [0, 1, 1, 1, 0]], dtype=numpy.float32)\n    x, mask = map(cuda.to_gpu, (x, mask))\n    output = softmax(x, mask=mask)\n    expect_output = numpy.array([\n        [0.09003057, 0.24472848, 0.66524094, 0., 0.],\n        [0., 0.66524094, 0.24472848, 0.09003057, 0.]], dtype=numpy.float32)\n    numpy.allclose(cuda.to_cpu(output.array), expect_output)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/functions_tests/evaluation/test_r2_score.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer import cuda\n\nimport chainer_chemistry\n\n\ndef r2_score(pred, true, sample_weight=None, multioutput=\"uniform_average\",\n             ignore_nan=False):\n    pred = cuda.to_cpu(pred)\n    true = cuda.to_cpu(true)\n    diff = pred - true\n    dev = true - numpy.mean(true, axis=0)\n    if ignore_nan:\n        diff[numpy.isnan(diff)] = 0.\n        dev[numpy.isnan(dev)] = 0.\n    SS_res = numpy.asarray(\n        numpy.sum(diff ** 2, axis=0))\n    SS_tot = numpy.asarray(\n        numpy.sum(dev ** 2, axis=0))\n\n    if multioutput == 'uniform_average':\n        if numpy.any(SS_tot == 0):\n            return 0.0\n        else:\n            return (1 - SS_res / SS_tot).mean()\n    elif multioutput == 'raw_values':\n        if numpy.any(SS_tot == 0):\n            # Assign dummy value to avoid zero-division\n            SS_tot_iszero = SS_tot == 0\n            SS_tot[SS_tot_iszero] = 1\n\n            return numpy.where(SS_tot_iszero, 0.0, 1 - SS_res / SS_tot)\n        else:\n            return 1 - SS_res / SS_tot\n\n\n@pytest.fixture\ndef inputs():\n    numpy.random.seed(0)\n    x0 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32)\n    # Add sufficient margin to prevent computational error\n    diff = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32)\n    diff[abs(diff) < 0.01] = 0.5\n    x1 = x0 + diff\n    x2 = numpy.asarray([[0.3, numpy.nan, 0.2],\n                        [numpy.nan, 0.1, 0.5],\n                        [0.9, 0.7, numpy.nan],\n                        [0.2, -0.3, 0.4]]).astype(numpy.float32)\n    return x0, x1, x2\n\n\ndef check_forward(inputs):\n    x0, x1, _ = inputs\n    y = chainer_chemistry.functions.r2_score(x0, x1)\n    assert y.data.dtype == 'f'\n    assert y.data.shape == ()\n\n    expect = r2_score(x0, x1)\n    assert numpy.allclose(cuda.to_cpu(y.data), expect)\n\n\ndef check_forward_ignore_nan(inputs):\n    x0, _, x2 = inputs\n    y = 
chainer_chemistry.functions.r2_score(x0, x2, ignore_nan=True)\n    assert y.data.dtype == 'f'\n    assert y.data.shape == ()\n\n    expect = r2_score(x0, x2, ignore_nan=True)\n    assert numpy.allclose(cuda.to_cpu(y.data), expect)\n\n\ndef check_forward_ignore_nan_with_nonnan_value(inputs):\n    x0, x1, _ = inputs\n    y = chainer_chemistry.functions.r2_score(x0, x1, ignore_nan=True)\n    assert y.data.dtype == 'f'\n    assert y.data.shape == ()\n\n    expect = r2_score(x0, x1, ignore_nan=True)\n    assert numpy.allclose(y.data, expect)\n\n\ndef test_forward_cpu(inputs):\n    check_forward(inputs)\n    check_forward_ignore_nan(inputs)\n    check_forward_ignore_nan_with_nonnan_value(inputs)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(inputs):\n    x0, x1, x2 = inputs\n    check_forward((cuda.to_gpu(x0), cuda.to_gpu(x1), None))\n    check_forward_ignore_nan((cuda.to_gpu(x0), None, cuda.to_gpu(x2)))\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/functions_tests/loss/test_mean_absolute_error.py",
    "content": "import numpy\nimport pytest\n\nimport chainer\nfrom chainer import cuda\nfrom chainer import gradient_check\n\nimport chainer_chemistry\n\n\n@pytest.fixture\ndef inputs():\n    numpy.random.seed(0)\n    x0 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32)\n    # Add sufficient margin to prevent computational error\n    diff = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32)\n    diff[abs(diff) < 0.01] = 0.5\n    x1 = x0 + diff\n    x2 = numpy.asarray([[0.3, numpy.nan, 0.2],\n                        [numpy.nan, 0.1, 0.5],\n                        [0.9, 0.7, numpy.nan],\n                        [0.2, -0.3, 0.4]]).astype(numpy.float32)\n    return x0, x1, x2\n\n\n@pytest.fixture\ndef grads():\n    numpy.random.seed(0)\n    gy = numpy.random.uniform(-1, 1, ()).astype(numpy.float32)\n    ggx0 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32)\n    ggx1 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32)\n    return gy, ggx0, ggx1\n\n\ndef check_forward(inputs):\n    x0_data, x1_data, _ = inputs\n    x0 = chainer.Variable(x0_data)\n    x1 = chainer.Variable(x1_data)\n    loss = chainer_chemistry.functions.mean_absolute_error(x0, x1)\n    loss_value = cuda.to_cpu(loss.data)\n    assert loss.dtype == numpy.float32\n    assert loss_value.shape == ()\n\n    loss_expect = numpy.zeros(())\n    x0_data = cuda.to_cpu(x0_data)\n    x1_data = cuda.to_cpu(x1_data)\n    for i in numpy.ndindex(x0_data.shape):\n        loss_expect += abs((x0_data[i] - x1_data[i]))\n    loss_expect /= x0_data.size\n    assert numpy.allclose(loss_value, loss_expect)\n\n\ndef check_forward_ignore_nan(inputs):\n    x0_data, _, x2_data = inputs\n    x0 = chainer.Variable(x0_data)\n    x2 = chainer.Variable(x2_data)\n    loss = chainer_chemistry.functions.mean_absolute_error(x0, x2,\n                                                           ignore_nan=True)\n    loss_value = cuda.to_cpu(loss.data)\n    assert loss.dtype == numpy.float32\n    assert 
loss_value.shape == ()\n\n    loss_expect = numpy.zeros(())\n    x0_data = cuda.to_cpu(x0_data)\n    x2_data = cuda.to_cpu(x2_data)\n    nan_mask = numpy.invert(numpy.isnan(x2_data)).astype(x2_data.dtype)\n    for i in numpy.ndindex(x0_data.shape):\n        loss_expect += abs(x0_data[i] -\n                           numpy.nan_to_num(x2_data[i])) * nan_mask[i]\n    loss_expect /= x0_data.size\n    assert numpy.allclose(loss_value, loss_expect)\n\n\ndef check_forward_ignore_nan_with_nonnan_value(inputs):\n    x0_data, x1_data, _ = inputs\n    x0 = chainer.Variable(x0_data)\n    x1 = chainer.Variable(x1_data)\n    loss = chainer_chemistry.functions.mean_absolute_error(x0, x1,\n                                                           ignore_nan=True)\n    loss_value = cuda.to_cpu(loss.data)\n    assert loss.dtype == numpy.float32\n    assert loss_value.shape == ()\n\n    loss_expect = numpy.zeros(())\n    x0_data = cuda.to_cpu(x0_data)\n    x1_data = cuda.to_cpu(x1_data)\n    nan_mask = numpy.invert(numpy.isnan(x1_data)).astype(x1_data.dtype)\n    for i in numpy.ndindex(x0_data.shape):\n        loss_expect += abs(x0_data[i] -\n                           numpy.nan_to_num(x1_data[i])) * nan_mask[i]\n    loss_expect /= x0_data.size\n    assert numpy.allclose(loss_value, loss_expect)\n\n\ndef test_forward_cpu(inputs):\n    check_forward(inputs)\n    check_forward_ignore_nan(inputs)\n    check_forward_ignore_nan_with_nonnan_value(inputs)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(inputs):\n    x0, x1, x2 = inputs\n    check_forward((cuda.to_gpu(x0), cuda.to_gpu(x1), None))\n    check_forward_ignore_nan((cuda.to_gpu(x0), None, cuda.to_gpu(x2)))\n\n\ndef check_backward(inputs):\n    x0_data, x1_data, _ = inputs\n    gradient_check.check_backward(\n        chainer_chemistry.functions.mean_absolute_error,\n        (x0_data, x1_data), None, eps=1e-2)\n\n\ndef check_backward_ignore_nan(inputs):\n    x0_data, _, x2_data = inputs\n\n    def func(x0, x1):\n        return 
chainer_chemistry.functions.mean_absolute_error(x0, x1,\n                                                               ignore_nan=True)\n    gradient_check.check_backward(func, (x0_data, x2_data), None, eps=1e-2,\n                                  atol=1e-3, rtol=1e-3)\n\n\ndef check_backward_ignore_nan_with_nonnan_value(inputs):\n    x0_data, x1_data, _ = inputs\n\n    def func(x0, x1):\n        return chainer_chemistry.functions.mean_absolute_error(x0, x1,\n                                                               ignore_nan=True)\n    gradient_check.check_backward(func, (x0_data, x1_data), None, eps=1e-2)\n\n\ndef test_backward_cpu(inputs):\n    check_backward(inputs)\n    check_backward_ignore_nan(inputs)\n    check_backward_ignore_nan_with_nonnan_value(inputs)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(inputs):\n    x0, x1, x2 = inputs\n    check_backward((cuda.to_gpu(x0), cuda.to_gpu(x1), None))\n    check_backward_ignore_nan((cuda.to_gpu(x0), None, cuda.to_gpu(x2)))\n    check_backward_ignore_nan_with_nonnan_value((cuda.to_gpu(x0),\n                                                 cuda.to_gpu(x1), None))\n\n\ndef check_double_backward(inputs, grads):\n    x0, x1, _ = inputs\n    gy, ggx0, ggx1 = grads\n\n    def func(*xs):\n        y = chainer_chemistry.functions.mean_absolute_error(*xs)\n        return y * y\n    gradient_check.check_double_backward(func, (x0, x1), gy, (ggx0, ggx1))\n\n\ndef check_double_backward_ignore_nan(inputs, grads):\n    x0, _, x2 = inputs\n    gy, ggx0, ggx1 = grads\n\n    def func(*xs):\n        y = chainer_chemistry.functions.mean_absolute_error(*xs,\n                                                            ignore_nan=True)\n        return y * y\n    gradient_check.check_double_backward(func, (x0, x2), gy, (ggx0, ggx1))\n\n\ndef check_double_backward_ignore_nan_with_nonnan_value(inputs, grads):\n    x0, x1, _ = inputs\n    gy, ggx0, ggx1 = grads\n\n    def func(*xs):\n        y = 
chainer_chemistry.functions.mean_absolute_error(*xs,\n                                                            ignore_nan=True)\n        return y * y\n    gradient_check.check_double_backward(func, (x0, x1), gy, (ggx0, ggx1))\n\n\ndef test_double_backward_cpu(inputs, grads):\n    check_double_backward(inputs, grads)\n    check_double_backward_ignore_nan(inputs, grads)\n    check_double_backward_ignore_nan_with_nonnan_value(inputs, grads)\n\n\n@pytest.mark.gpu\ndef test_double_backward_gpu(inputs, grads):\n    x0, x1, x2 = inputs\n    gy, ggx0, ggx1 = grads\n    check_double_backward((cuda.to_gpu(x0), cuda.to_gpu(x1), None),\n                          (cuda.to_gpu(gy), cuda.to_gpu(ggx0),\n                           cuda.to_gpu(ggx1)))\n    check_double_backward_ignore_nan_with_nonnan_value((cuda.to_gpu(x0),\n                                                        cuda.to_gpu(x1),\n                                                        None),\n                                                       (cuda.to_gpu(gy),\n                                                        cuda.to_gpu(ggx0),\n                                                        cuda.to_gpu(ggx1)))\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/functions_tests/loss/test_mean_squared_error.py",
    "content": "import numpy\nimport pytest\n\nimport chainer\nfrom chainer import cuda\nfrom chainer import gradient_check\n\nimport chainer_chemistry\n\n\n@pytest.fixture\ndef inputs():\n    numpy.random.seed(0)\n    x0 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32)\n    x1 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32)\n    x2 = numpy.asarray([[0.3, numpy.nan, 0.2],\n                        [numpy.nan, 0.1, 0.5],\n                        [0.9, 0.7, numpy.nan],\n                        [0.2, -0.3, 0.4]]).astype(numpy.float32)\n    return x0, x1, x2\n\n\n@pytest.fixture\ndef grads():\n    numpy.random.seed(0)\n    gy = numpy.random.uniform(-1, 1, ()).astype(numpy.float32)\n    ggx0 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32)\n    ggx1 = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32)\n    return gy, ggx0, ggx1\n\n\ndef check_forward(inputs):\n    x0_data, x1_data, _ = inputs\n    x0 = chainer.Variable(x0_data)\n    x1 = chainer.Variable(x1_data)\n    loss = chainer_chemistry.functions.mean_squared_error(x0, x1)\n    loss_value = cuda.to_cpu(loss.data)\n    assert loss.dtype == numpy.float32\n    assert loss_value.shape == ()\n\n    loss_expect = numpy.zeros(())\n    x0_data = cuda.to_cpu(x0_data)\n    x1_data = cuda.to_cpu(x1_data)\n    for i in numpy.ndindex(x0_data.shape):\n        loss_expect += ((x0_data[i] - x1_data[i]) ** 2)\n    loss_expect /= x0_data.size\n    assert numpy.allclose(loss_value, loss_expect)\n\n\ndef check_forward_ignore_nan(inputs):\n    x0_data, _, x2_data = inputs\n    x0 = chainer.Variable(x0_data)\n    x2 = chainer.Variable(x2_data)\n    loss = chainer_chemistry.functions.mean_squared_error(x0, x2,\n                                                          ignore_nan=True)\n    loss_value = cuda.to_cpu(loss.data)\n    assert loss.dtype == numpy.float32\n    assert loss_value.shape == ()\n\n    loss_expect = numpy.zeros(())\n    x0_data = cuda.to_cpu(x0_data)\n    x2_data = 
cuda.to_cpu(x2_data)\n    nan_mask = numpy.invert(numpy.isnan(x2_data)).astype(x2_data.dtype)\n    for i in numpy.ndindex(x0_data.shape):\n        loss_expect += ((x0_data[i] - numpy.nan_to_num(x2_data[i])) ** 2\n                        * nan_mask[i])\n    loss_expect /= x0_data.size\n    assert numpy.allclose(loss_value, loss_expect)\n\n\ndef check_forward_ignore_nan_with_nonnan_value(inputs):\n    x0_data, x1_data, _ = inputs\n    x0 = chainer.Variable(x0_data)\n    x1 = chainer.Variable(x1_data)\n    loss = chainer_chemistry.functions.mean_squared_error(x0, x1,\n                                                          ignore_nan=True)\n    loss_value = cuda.to_cpu(loss.data)\n    assert loss.dtype == numpy.float32\n    assert loss_value.shape == ()\n\n    loss_expect = numpy.zeros(())\n    x0_data = cuda.to_cpu(x0_data)\n    x1_data = cuda.to_cpu(x1_data)\n    nan_mask = numpy.invert(numpy.isnan(x1_data)).astype(x1_data.dtype)\n    for i in numpy.ndindex(x0_data.shape):\n        loss_expect += ((x0_data[i] - numpy.nan_to_num(x1_data[i])) ** 2\n                        * nan_mask[i])\n    loss_expect /= x0_data.size\n    assert numpy.allclose(loss_value, loss_expect)\n\n\ndef test_forward_cpu(inputs):\n    check_forward(inputs)\n    check_forward_ignore_nan(inputs)\n    check_forward_ignore_nan_with_nonnan_value(inputs)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(inputs):\n    x0, x1, x2 = inputs\n    check_forward((cuda.to_gpu(x0), cuda.to_gpu(x1), None))\n    check_forward_ignore_nan((cuda.to_gpu(x0), None, cuda.to_gpu(x2)))\n\n\ndef check_backward(inputs):\n    x0_data, x1_data, _ = inputs\n    gradient_check.check_backward(\n        chainer_chemistry.functions.mean_squared_error,\n        (x0_data, x1_data), None, eps=1e-2)\n\n\ndef check_backward_ignore_nan(inputs):\n    x0_data, _, x2_data = inputs\n\n    def func(x0, x1):\n        return chainer_chemistry.functions.mean_squared_error(x0, x1,\n                                                              
ignore_nan=True)\n    gradient_check.check_backward(func, (x0_data, x2_data), None, eps=1e-2)\n\n\ndef check_backward_ignore_nan_with_nonnan_value(inputs):\n    x0_data, x1_data, _ = inputs\n\n    def func(x0, x1):\n        return chainer_chemistry.functions.mean_squared_error(x0, x1,\n                                                              ignore_nan=True)\n    gradient_check.check_backward(func, (x0_data, x1_data), None, eps=1e-2)\n\n\ndef test_backward_cpu(inputs):\n    check_backward(inputs)\n    check_backward_ignore_nan(inputs)\n    check_backward_ignore_nan_with_nonnan_value(inputs)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(inputs):\n    x0, x1, x2 = inputs\n    check_backward((cuda.to_gpu(x0), cuda.to_gpu(x1), None))\n    check_backward_ignore_nan((cuda.to_gpu(x0), None, cuda.to_gpu(x2)))\n    check_backward_ignore_nan_with_nonnan_value((cuda.to_gpu(x0),\n                                                 cuda.to_gpu(x1), None))\n\n\ndef check_double_backward(inputs, grads):\n    x0, x1, _ = inputs\n    gy, ggx0, ggx1 = grads\n    gradient_check.check_double_backward(\n        chainer_chemistry.functions.mean_squared_error,\n        (x0, x1), gy, (ggx0, ggx1))\n\n\ndef check_double_backward_ignore_nan(inputs, grads):\n    x0, _, x2 = inputs\n    gy, ggx0, ggx1 = grads\n\n    def func(x0, x1):\n        return chainer_chemistry.functions.mean_squared_error(x0, x1,\n                                                              ignore_nan=True)\n    gradient_check.check_double_backward(func, (x0, x2), gy, (ggx0, ggx1))\n\n\ndef check_double_backward_ignore_nan_with_nonnan_value(inputs, grads):\n    x0, x1, _ = inputs\n    gy, ggx0, ggx1 = grads\n\n    def func(x0, x1):\n        return chainer_chemistry.functions.mean_squared_error(x0, x1,\n                                                              ignore_nan=True)\n    gradient_check.check_double_backward(func, (x0, x1), gy, (ggx0, ggx1))\n\n\ndef test_double_backward_cpu(inputs, grads):\n    
check_double_backward(inputs, grads)\n    check_double_backward_ignore_nan(inputs, grads)\n    check_double_backward_ignore_nan_with_nonnan_value(inputs, grads)\n\n\n@pytest.mark.gpu\ndef test_double_backward_gpu(inputs, grads):\n    x0, x1, x2 = inputs\n    gy, ggx0, ggx1 = grads\n    check_double_backward((cuda.to_gpu(x0), cuda.to_gpu(x1), None),\n                          (cuda.to_gpu(gy), cuda.to_gpu(ggx0),\n                           cuda.to_gpu(ggx1)))\n    check_double_backward_ignore_nan((cuda.to_gpu(x0), None, cuda.to_gpu(x2)),\n                                     (cuda.to_gpu(gy), cuda.to_gpu(ggx0),\n                                      cuda.to_gpu(ggx1)))\n    check_double_backward_ignore_nan_with_nonnan_value((cuda.to_gpu(x0),\n                                                        cuda.to_gpu(x1),\n                                                        None),\n                                                       (cuda.to_gpu(gy),\n                                                        cuda.to_gpu(ggx0),\n                                                        cuda.to_gpu(ggx1)))\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/iterators_tests/test_balanced_serial_iterator.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer import serializer\n\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\nfrom chainer_chemistry.iterators.balanced_serial_iterator import BalancedSerialIterator  # NOQA\n\n\nclass DummySerializer(serializer.Serializer):\n\n    def __init__(self, target):\n        super(DummySerializer, self).__init__()\n        self.target = target\n\n    def __getitem__(self, key):\n        target_child = dict()\n        self.target[key] = target_child\n        return DummySerializer(target_child)\n\n    def __call__(self, key, value):\n        self.target[key] = value\n        return self.target[key]\n\n\nclass DummyDeserializer(serializer.Deserializer):\n\n    def __init__(self, target):\n        super(DummyDeserializer, self).__init__()\n        self.target = target\n\n    def __getitem__(self, key):\n        target_child = self.target[key]\n        return DummyDeserializer(target_child)\n\n    def __call__(self, key, value):\n        if value is None:\n            value = self.target[key]\n        elif isinstance(value, numpy.ndarray):\n            numpy.copyto(value, self.target[key])\n        else:\n            value = type(value)(numpy.asarray(self.target[key]))\n        return value\n\n\ndef test_balanced_serial_iterator():\n    _test_balanced_serial_iterator_no_batch_balancing()\n    _test_balanced_serial_iterator_with_batch_balancing()\n\n\ndef _test_balanced_serial_iterator_no_batch_balancing():\n    x = numpy.arange(8)\n    t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1])\n    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=9,\n                                      labels=t, ignore_labels=-1,\n                                      batch_balancing=False)\n    # In this case, we have 3 examples of label=1.\n    # When BalancedSerialIterator runs, all label examples are sampled 3 times\n    # in one epoch.\n    # Therefore, number of data is \"augmented\" as 9\n    # 3 
(number of label types) * 3 (number of maximum examples in one label)\n    expect_N_augmented = 9\n    assert iterator.N_augmented == expect_N_augmented\n    # iterator.show_label_stats()  # we can show label stats\n\n    batch = iterator.next()\n\n    assert len(batch) == 9\n    labels_batch = numpy.array([example[-1] for example in batch])\n\n    assert numpy.sum(labels_batch == 0) == 3\n    assert numpy.sum(labels_batch == 1) == 3\n    assert numpy.sum(labels_batch == 2) == 3\n\n\ndef _test_balanced_serial_iterator_with_batch_balancing():\n    x = numpy.arange(8)\n    t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1])\n    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=3,\n                                      labels=t, ignore_labels=-1,\n                                      batch_balancing=True)\n    expect_N_augmented = 9\n    assert iterator.N_augmented == expect_N_augmented\n    batch1 = iterator.next()\n    batch2 = iterator.next()\n    batch3 = iterator.next()\n    for batch in [batch1, batch2, batch3]:\n        assert len(batch) == 3\n        labels_batch = numpy.array([example[-1] for example in batch])\n        assert numpy.sum(labels_batch == 0) == 1\n        assert numpy.sum(labels_batch == 1) == 1\n        assert numpy.sum(labels_batch == 2) == 1\n\n\ndef test_balanced_serial_iterator_serialization():\n    _test_balanced_serial_iterator_serialization_no_batch_balancing()\n    _test_balanced_serial_iterator_serialization_with_batch_balancing()\n\n\ndef _test_balanced_serial_iterator_serialization_no_batch_balancing():\n    x = numpy.arange(8)\n    t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1])\n    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=9,\n                                      labels=t, ignore_labels=-1,\n                                      batch_balancing=False)\n    batch = iterator.next()  # NOQA\n\n    assert iterator.current_position == 0\n    assert iterator.epoch == 1\n    assert 
iterator.is_new_epoch\n\n    target = dict()\n    iterator.serialize(DummySerializer(target))\n    current_index_list_orig = dict()\n    current_pos_orig = dict()\n    for label, index_iterator in iterator.labels_iterator_dict.items():\n        ii_label = 'index_iterator_{}'.format(label)\n        current_index_list_orig[ii_label] = index_iterator.current_index_list\n        current_pos_orig[ii_label] = index_iterator.current_pos\n\n    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=9,\n                                      labels=t, ignore_labels=-1,\n                                      batch_balancing=False)\n    iterator.serialize(DummyDeserializer(target))\n    assert iterator.current_position == 0\n    assert iterator.epoch == 1\n    assert iterator.is_new_epoch\n    for label, index_iterator in iterator.labels_iterator_dict.items():\n        ii_label = 'index_iterator_{}'.format(label)\n        assert numpy.array_equal(index_iterator.current_index_list,\n                                 current_index_list_orig[ii_label])\n        assert index_iterator.current_pos == current_pos_orig[ii_label]\n\n\ndef _test_balanced_serial_iterator_serialization_with_batch_balancing():\n    x = numpy.arange(8)\n    t = numpy.asarray([0, 0, -1, 1, 1, 2, -1, 1])\n    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=3,\n                                      labels=t, ignore_labels=-1,\n                                      batch_balancing=True)\n    batch1 = iterator.next()  # NOQA\n    batch2 = iterator.next()  # NOQA\n    batch3 = iterator.next()  # NOQA\n\n    assert iterator.current_position == 0\n    assert iterator.epoch == 1\n    assert iterator.is_new_epoch\n\n    target = dict()\n    iterator.serialize(DummySerializer(target))\n    current_index_list_orig = dict()\n    current_pos_orig = dict()\n    for label, index_iterator in iterator.labels_iterator_dict.items():\n        ii_label = 'index_iterator_{}'.format(label)\n     
   current_index_list_orig[ii_label] = index_iterator.current_index_list\n        current_pos_orig[ii_label] = index_iterator.current_pos\n\n    iterator = BalancedSerialIterator(NumpyTupleDataset(x, t), batch_size=3,\n                                      labels=t, ignore_labels=-1,\n                                      batch_balancing=True)\n    iterator.serialize(DummyDeserializer(target))\n    assert iterator.current_position == 0\n    assert iterator.epoch == 1\n    assert iterator.is_new_epoch\n    for label, index_iterator in iterator.labels_iterator_dict.items():\n        ii_label = 'index_iterator_{}'.format(label)\n        assert numpy.array_equal(index_iterator.current_index_list,\n                                 current_index_list_orig[ii_label])\n        assert index_iterator.current_pos == current_pos_orig[ii_label]\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-s', '-v'])\n"
  },
  {
    "path": "tests/iterators_tests/test_index_iterator.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer import serializer\n\nfrom chainer_chemistry.iterators.index_iterator import IndexIterator\n\n\nclass DummySerializer(serializer.Serializer):\n\n    def __init__(self, target):\n        super(DummySerializer, self).__init__()\n        self.target = target\n\n    def __getitem__(self, key):\n        target_child = dict()\n        self.target[key] = target_child\n        return DummySerializer(target_child)\n\n    def __call__(self, key, value):\n        self.target[key] = value\n        return self.target[key]\n\n\nclass DummyDeserializer(serializer.Deserializer):\n\n    def __init__(self, target):\n        super(DummyDeserializer, self).__init__()\n        self.target = target\n\n    def __getitem__(self, key):\n        target_child = self.target[key]\n        return DummyDeserializer(target_child)\n\n    def __call__(self, key, value):\n        if value is None:\n            value = self.target[key]\n        elif isinstance(value, numpy.ndarray):\n            numpy.copyto(value, self.target[key])\n        else:\n            value = type(value)(numpy.asarray(self.target[key]))\n        return value\n\n\ndef test_index_iterator():\n    _test_index_iterator_no_shuffle()\n    _test_index_iterator_with_shuffle()\n\n\ndef _test_index_iterator_no_shuffle():\n    index_list = [1, 3, 5, 10]\n    ii = IndexIterator(index_list, shuffle=False, num=2)\n\n    indices1 = ii.get_next_indices(3)\n    indices2 = ii.get_next_indices(6)\n    indices3 = ii.__next__()\n\n    assert isinstance(indices1, numpy.ndarray)\n    assert len(indices1) == 3\n    assert isinstance(indices2, numpy.ndarray)\n    assert len(indices2) == 6\n    assert isinstance(indices3, numpy.ndarray)\n    assert len(indices3) == 2\n    assert indices1[0] == index_list[0]\n    assert indices1[1] == index_list[1]\n    assert indices1[2] == index_list[2]\n    assert indices2[0] == index_list[3]\n    assert indices2[1] == index_list[0]\n    assert indices2[2] 
== index_list[1]\n    assert indices2[3] == index_list[2]\n    assert indices2[4] == index_list[3]\n    assert indices2[5] == index_list[0]\n    assert indices3[0] == index_list[1]\n    assert indices3[1] == index_list[2]\n\n\ndef _test_index_iterator_with_shuffle():\n    index_list = [1, 3, 5, 10]\n    ii = IndexIterator(index_list, shuffle=True, num=2)\n\n    indices1 = ii.get_next_indices(3)\n    indices2 = ii.get_next_indices(6)\n    indices3 = ii.__next__()\n\n    assert isinstance(indices1, numpy.ndarray)\n    assert len(indices1) == 3\n    assert isinstance(indices2, numpy.ndarray)\n    assert len(indices2) == 6\n    assert isinstance(indices3, numpy.ndarray)\n    assert len(indices3) == 2\n    for indices in [indices1, indices2, indices3]:\n        for index in indices:\n            assert index in index_list\n\n\ndef test_index_iterator_serialization():\n    _test_index_iterator_serialization_no_shuffle()\n    _test_index_iterator_serialization_with_shuffle()\n\n\ndef _test_index_iterator_serialization_no_shuffle():\n    index_list = [1, 3, 5, 10]\n    ii = IndexIterator(index_list, shuffle=False, num=2)\n\n    indices1 = ii.get_next_indices(3)  # NOQA\n    indices2 = ii.get_next_indices(6)  # NOQA\n    indices3 = ii.__next__()  # NOQA\n\n    assert len(ii.current_index_list) == len(index_list)\n    assert numpy.array_equal(ii.current_index_list, numpy.asarray(index_list))\n    assert ii.current_pos == (3 + 6) % len(index_list) + 2\n\n    target = dict()\n    ii.serialize(DummySerializer(target))\n\n    ii = IndexIterator(index_list, shuffle=False, num=2)\n    ii.serialize(DummyDeserializer(target))\n    assert len(ii.current_index_list) == len(index_list)\n    assert numpy.array_equal(ii.current_index_list, numpy.asarray(index_list))\n    assert ii.current_pos == (3 + 6) % len(index_list) + 2\n\n\ndef _test_index_iterator_serialization_with_shuffle():\n    index_list = [1, 3, 5, 10]\n    ii = IndexIterator(index_list, shuffle=True, num=2)\n\n    indices1 
= ii.get_next_indices(3)  # NOQA\n    indices2 = ii.get_next_indices(6)  # NOQA\n    indices3 = ii.__next__()  # NOQA\n\n    assert len(ii.current_index_list) == len(index_list)\n    for index in ii.current_index_list:\n        assert index in index_list\n    assert ii.current_pos == (3 + 6) % len(index_list) + 2\n\n    target = dict()\n    ii.serialize(DummySerializer(target))\n    current_index_list_orig = ii.current_index_list\n\n    ii = IndexIterator(index_list, shuffle=True, num=2)\n    ii.serialize(DummyDeserializer(target))\n    assert numpy.array_equal(ii.current_index_list, current_index_list_orig)\n    assert ii.current_pos == (3 + 6) % len(index_list) + 2\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-s', '-v'])\n"
  },
  {
    "path": "tests/link_hooks_tests/test_variable_monitor_link_hook.py",
    "content": "import numpy\nimport pytest\n\nimport chainer\nfrom chainer import Variable, cuda  # NOQA\nfrom chainer.links import Linear\n\nfrom chainer_chemistry.link_hooks import is_link_hooks_available\nif is_link_hooks_available:\n    from chainer_chemistry.link_hooks import VariableMonitorLinkHook\n\n\nclass DummyModel(chainer.Chain):\n    def __init__(self):\n        super(DummyModel, self).__init__()\n        with self.init_scope():\n            self.l1 = Linear(\n                3, 1, initialW=numpy.array([[1, 3, 2]]),\n                nobias=True)\n        self.h = None\n\n    def forward(self, x):\n        self.h = self.l1(x)\n        out = self.h * 3\n        return out\n\n\n@pytest.fixture\ndef model():\n    return DummyModel()\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_variable_monitor_link_hook_pre(model):\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n    x = Variable(x)\n    pre_hook = VariableMonitorLinkHook(target_link=model.l1, timing='pre')\n    with pre_hook:\n        model(x)\n    var = pre_hook.get_variable()\n    assert var is x\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_variable_monitor_link_hook_post(model):\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n    x = Variable(x)\n    pre_hook = VariableMonitorLinkHook(target_link=model.l1, timing='post')\n    with pre_hook:\n        model(x)\n    var = pre_hook.get_variable()\n    assert var is model.h\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_variable_monitor_link_hook_process(model):\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n    x = Variable(x)\n    pre_hook = VariableMonitorLinkHook(target_link=model.l1, timing='post')\n\n    # Add process\n    def _process_zeros(hook, args, target_var):\n        xp = 
cuda.get_array_module(target_var.array)\n        target_var.array = xp.zeros(target_var.array.shape)\n    pre_hook.add_process('_process_zeros', _process_zeros)\n    with pre_hook:\n        model(x)\n\n    assert numpy.allclose(model.h.array, numpy.zeros(model.h.shape))\n    assert '_process_zeros' in pre_hook.process_fns.keys()\n\n    # Delete process\n    pre_hook.delete_process('_process_zeros')\n    assert '_process_zeros' not in pre_hook.process_fns.keys()\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_variable_monitor_link_hook_assert_raises(model):\n    with pytest.raises(TypeError):\n        # target_link must be chainer.Link\n        pre_hook = VariableMonitorLinkHook(target_link='hoge')\n\n    with pytest.raises(ValueError):\n        # check timing args\n        pre_hook = VariableMonitorLinkHook(target_link=model.l1, timing='hoge')  # NOQA\n\n    hook = VariableMonitorLinkHook(target_link=model.l1)\n\n    def _process(hook, args, target_var):\n        pass\n\n    with pytest.raises(TypeError):\n        # key is wrong\n        hook.add_process(1, _process)\n\n    with pytest.raises(TypeError):\n        # fn is wrong\n        hook.add_process('hoge', 'var')\n\n    hook.add_process('hoge', _process)\n    with pytest.raises(TypeError):\n        # key is wrong\n        hook.delete_process(1)\n    hook.delete_process('hoge')\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/array_tests/test_shape_transformer_to_2d.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.links.array.shape_transformer_to_2d import ShapeTransformerTo2D  # NOQA\n\n\n@pytest.mark.parametrize('axis', [0, 1, -1])\ndef test_shape_transformer_2d_2d_array(axis):\n    st = ShapeTransformerTo2D(axis=axis)\n    x = numpy.arange(6).reshape((2, 3))\n    xt = st.transform(x)\n    xit = st.inverse_transform(xt)\n    if axis == 0:\n        assert numpy.allclose(xt.array, numpy.array([[0, 3], [1, 4], [2, 5]]))\n    elif axis == 1 or axis == -1:\n        assert numpy.allclose(x, xt.array)\n\n    assert numpy.allclose(x, xit.array)\n\n\n@pytest.mark.parametrize('axis', [0, 1, 2, -1])\ndef test_shape_transformer_2d_3d_array(axis):\n    st = ShapeTransformerTo2D(axis=axis)\n    x = numpy.arange(12).reshape((2, 3, 2))\n    xt = st.transform(x)\n    xit = st.inverse_transform(xt)\n    if axis == 0:\n        assert numpy.allclose(\n            xt.array,\n            numpy.array([[0, 6], [1, 7], [2, 8], [3, 9], [4, 10], [5, 11]]))\n    elif axis == 1:\n        assert numpy.allclose(\n            xt.array,\n            numpy.array([[0, 2, 4], [1, 3, 5], [6, 8, 10], [7, 9, 11]]))\n    elif axis == 2 or axis == -1:\n        assert numpy.allclose(\n            xt.array, x.reshape(6, 2))\n    assert numpy.allclose(x, xit.array)\n\n\ndef test_shape_transformer_2d_error():\n    st = ShapeTransformerTo2D(axis=1)\n    x = numpy.arange(6).reshape(2, 3)\n    with pytest.raises(AttributeError):\n        # call `inverse_transform` before `transform`\n        xt = st.inverse_transform(x)  # NOQA\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/connection_tests/test_embed_atom_id.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry import links\n\nin_size = 3\natom_size = 5\nout_size = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef model():\n    l = links.EmbedAtomID(in_size=in_size, out_size=out_size)\n    l.cleargrads()\n    return l\n\n\n@pytest.fixture\ndef data():\n    x_data = numpy.random.randint(\n        in_size, size=(batch_size, atom_size)).astype(numpy.int32)\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, out_size)).astype(numpy.float32)\n    return x_data, y_grad\n\n\ndef check_forward(model, x_data):\n    def forward(W, x):\n        y = W[x]\n        return y\n\n    y_expect = forward(cuda.to_cpu(model.W.data),\n                       cuda.to_cpu(x_data))\n    y_actual = cuda.to_cpu(model(x_data).data)\n    numpy.testing.assert_equal(y_actual, y_expect)\n\n\ndef test_forward_cpu(model, data):\n    x_data = data[0]\n    check_forward(model, x_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    x_data = cuda.to_gpu(data[0])\n    model.to_gpu()\n    check_forward(model, x_data)\n\n\ndef test_backward_cpu(model, data):\n    x_data, y_grad = data\n    gradient_check.check_backward(model, x_data, y_grad, model.W,\n                                  atol=1e-3, rtol=1e-3)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(model, data):\n    x_data, y_grad = [cuda.to_gpu(d) for d in data]\n    model.to_gpu()\n    gradient_check.check_backward(model, x_data, y_grad, model.W,\n                                  atol=1e-3, rtol=1e-3)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/links_tests/connection_tests/test_graph_linear.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.links.connection.graph_linear import GraphLinear  # NOQA\n\nin_size = 3\natom_size = 5\nout_size = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef model():\n    l = GraphLinear(in_size=in_size, out_size=out_size)\n    l.cleargrads()\n    return l\n\n\n@pytest.fixture\ndef data():\n    x_data = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, in_size)).astype(numpy.float32)\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, out_size)).astype(numpy.float32)\n    return x_data, y_grad\n\n\ndef test_forward_cpu(model, data):\n    # only testing shape for now...\n    x_data = data[0]\n    y_actual = model(x_data)\n    assert y_actual.shape == (batch_size, atom_size, out_size)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    x_data = cuda.to_gpu(data[0])\n    model.to_gpu()\n    y_actual = model(x_data)\n    assert y_actual.shape == (batch_size, atom_size, out_size)\n\n\ndef test_backward_cpu(model, data):\n    x_data, y_grad = data\n    gradient_check.check_backward(model, x_data, y_grad, list(model.params()),\n                                  atol=1e-3, rtol=1e-3)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(model, data):\n    x_data, y_grad = [cuda.to_gpu(d) for d in data]\n    model.to_gpu()\n    gradient_check.check_backward(model, x_data, y_grad, list(model.params()),\n                                  atol=1e-3, rtol=1e-3)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/connection_tests/test_graph_mlp.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.links.connection.graph_mlp import GraphMLP  # NOQA\n\nin_size = 3\natom_size = 5\nout_size = 4\nchannels = [16, out_size]\nbatch_size = 2\n\n\n@pytest.fixture\ndef model():\n    l = GraphMLP(channels, in_channels=in_size)\n    l.cleargrads()\n    return l\n\n\n@pytest.fixture\ndef data():\n    x_data = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, in_size)).astype(numpy.float32)\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, out_size)).astype(numpy.float32)\n    return x_data, y_grad\n\n\ndef test_forward_cpu(model, data):\n    # only testing shape for now...\n    x_data = data[0]\n    y_actual = model(x_data)\n    assert y_actual.shape == (batch_size, atom_size, out_size)\n    assert len(model.layers) == len(channels)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    x_data = cuda.to_gpu(data[0])\n    model.to_gpu()\n    y_actual = model(x_data)\n    assert y_actual.shape == (batch_size, atom_size, out_size)\n    assert len(model.layers) == len(channels)\n\n\ndef test_backward_cpu(model, data):\n    x_data, y_grad = data\n    gradient_check.check_backward(model, x_data, y_grad, list(model.params()),\n                                  atol=1e-3, rtol=1e-3)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(model, data):\n    x_data, y_grad = [cuda.to_gpu(d) for d in data]\n    model.to_gpu()\n    gradient_check.check_backward(model, x_data, y_grad, list(model.params()),\n                                  atol=1e-3, rtol=1e-3)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/readout_tests/test_cgcnn_readout.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer import cuda\n\nfrom chainer_chemistry.links.readout.cgcnn_readout import CGCNNReadout\n\n\n# node_size_list means the first molecule has three nodes,\n# and the second molecule has five nodes\nnode_size_list = [3, 5]\nnode_feature_dim = 32\nout_dim = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef readout():\n    return CGCNNReadout(out_dim=out_dim)\n\n\n@pytest.fixture\ndef data():\n    if len(node_size_list) != batch_size:\n        raise ValueError(\"Invalid fixture data for CGCNN\")\n\n    numpy.random.seed(0)\n    total_node_size = sum(node_size_list)\n    # atom_feat\n    atom_feat = numpy.random.rand(\n        total_node_size, node_feature_dim).astype(numpy.float32)\n    # atom_idx\n    curr_idx = 0\n    atom_idx = []\n    for val in node_size_list:\n        atom_idx.append(numpy.arange(curr_idx, val))\n        curr_idx += val\n    atom_idx = numpy.asarray(atom_idx)\n\n    y_grad = numpy.random.uniform(-1, 1,\n                                  (batch_size, out_dim)).astype(numpy.float32)\n    return atom_feat, atom_idx, y_grad\n\n\ndef check_forward(readout, data):\n    y_actual = cuda.to_cpu(readout(*data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(readout, data):\n    atom_feat, atom_idx = data[:-1]\n    check_forward(readout, (atom_feat, atom_idx))\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(readout, data):\n    atom_feat, atom_idx, _ = data\n    # atom_idx is list format... 
use numpy array\n    input_data = (cuda.to_gpu(atom_feat), atom_idx)\n    readout.to_gpu()\n    check_forward(readout, tuple(input_data))\n\n\n# def test_backward_cpu(readout, data):\n#     input_data, y_grad = data[0:-1], data[-1]\n#     gradient_check.check_backward(readout, tuple(input_data), y_grad,\n#                                   atol=5e-1, rtol=1e-1)\n\n\n# @pytest.mark.gpu\n# def test_backward_gpu(readout, data):\n#     data = [cuda.to_gpu(d) for d in data]\n#     input_data, y_grad = data[0:-1], data[-1]\n#     readout.to_gpu()\n#     gradient_check.check_backward(readout, tuple(input_data), y_grad,\n#                                   atol=5e-1, rtol=1e-1)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/links_tests/readout_tests/test_general_readout.py",
    "content": "from chainer import cuda\nfrom chainer import functions\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.readout.general_readout import GeneralReadout\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nhidden_dim = 7\nbatch_size = 2\n\n\n@pytest.fixture\ndef readouts():\n    modes = ['sum', 'max', 'summax']\n    return (GeneralReadout(mode=mode) for mode in modes)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.uniform(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size, hidden_dim)\n    ).astype('f')\n    y_grad = numpy.random.uniform(-1, 1, (batch_size, hidden_dim)).astype('f')\n    return atom_data, y_grad\n\n\ndef check_forward(readout, atom_data):\n    y_actual = cuda.to_cpu(readout(atom_data).data)\n    if readout.mode == ('sum' and 'max'):\n        assert y_actual.shape == (batch_size, hidden_dim)\n    elif readout.mode == 'summax':\n        assert y_actual.shape == (batch_size, hidden_dim * 2)\n\n\ndef test_forward_cpu(readouts, data):\n    atom_data = data[0]\n    for readout in readouts:\n        check_forward(readout, atom_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(readouts, data):\n    atom_data = cuda.to_gpu(data[0])\n    for readout in readouts:\n        readout.to_gpu()\n        check_forward(readout, atom_data)\n\n\ndef test_forward_cpu_assert_raises(data):\n    atom_data = data[0]\n    readout = GeneralReadout(mode='invalid')\n    with pytest.raises(ValueError):\n        cuda.to_cpu(readout(atom_data).data)\n\n\ndef test_backward_cpu(readouts, data):\n    atom_data, y_grad = data\n    for readout in readouts:\n        if readout.mode == 'summax':\n            y_grad = functions.concat((y_grad, y_grad), axis=1).data\n        gradient_check.check_backward(\n            readout, atom_data, y_grad, atol=1e-2, rtol=1e-2)\n\n\n@pytest.mark.gpu\ndef 
test_backward_gpu(readouts, data):\n    atom_data, y_grad = map(cuda.to_gpu, data)\n    for readout in readouts:\n        readout.to_gpu()\n        if readout.mode == 'summax':\n            y_grad = functions.concat((y_grad, y_grad), axis=1).data\n        # TODO(nakago): check why tolerance is so high.\n        gradient_check.check_backward(\n            readout, atom_data, y_grad, atol=1e-1, rtol=1e-1)\n\n\ndef test_forward_cpu_graph_invariant(readouts, data):\n    atom_data = data[0]\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    for readout in readouts:\n        y_actual = cuda.to_cpu(readout(atom_data).data)\n        permute_y_actual = cuda.to_cpu(readout(permute_atom_data).data)\n        numpy.testing.assert_allclose(\n            y_actual, permute_y_actual, rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/readout_tests/test_ggnn_readout.py",
    "content": "from chainer import cuda\nfrom chainer import functions\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.readout.ggnn_readout import GGNNReadout\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nin_channels = 7\nout_dim = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef readout():\n    return GGNNReadout(out_dim=out_dim, in_channels=None)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.uniform(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size, in_channels)\n        ).astype('f')\n    atom_data0 = functions.copy(\n        atom_data, cuda.get_device_from_array(atom_data.data).id).data\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, out_dim)).astype('f')\n    return atom_data, atom_data0, y_grad\n\n\ndef check_forward(readout, atom_data, atom_data0):\n    y_actual = cuda.to_cpu(readout(atom_data, atom_data0).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(readout, data):\n    atom_data, atom_data0 = data[:2]\n    check_forward(readout, atom_data, atom_data0)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(readout, data):\n    atom_data, atom_data0 = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])\n    readout.to_gpu()\n    check_forward(readout, atom_data, atom_data0)\n\n\ndef test_backward_cpu(readout, data):\n    atom_data, atom_data0, y_grad = data\n    gradient_check.check_backward(\n        readout, (atom_data, atom_data0), y_grad, atol=1e-1, rtol=1e-1)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(readout, data):\n    atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]\n    readout.to_gpu()\n    gradient_check.check_backward(readout, (atom_data, adj_data), y_grad,\n                                  atol=1e-1, rtol=1e-1)\n\n\ndef test_forward_cpu_graph_invariant(readout, data):\n    atom_data, atom_data0 = data[:2]\n    
y_actual = cuda.to_cpu(readout(atom_data, atom_data0).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    permute_atom_data0 = permute_node(atom_data0, permutation_index, axis=1)\n    permute_y_actual = cuda.to_cpu(readout(\n        permute_atom_data, permute_atom_data0).data)\n    numpy.testing.assert_allclose(\n        y_actual, permute_y_actual, rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/readout_tests/test_megnet_readout.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.links.readout.megnet_readout import MEGNetReadout\n\n\nmax_node_num = 6\nmax_edge_num = 10\n# This value is the same as the atom and pair feature dimension\nin_channels = 10\nglobal_feature_dim = 5\nout_dim = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef readout():\n    return MEGNetReadout(in_channels=in_channels, out_dim=out_dim)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_feat = numpy.random.rand(batch_size, max_node_num,\n                                  in_channels).astype(numpy.float32)\n    pair_feat = numpy.random.rand(batch_size, max_edge_num,\n                                  in_channels).astype(numpy.float32)\n    global_feat = numpy.random.rand(batch_size,\n                                    global_feature_dim).astype(numpy.float32)\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, out_dim)).astype(numpy.float32)\n\n    return atom_feat, pair_feat, global_feat, y_grad\n\n\ndef check_forward(readout, data):\n    y_actual = cuda.to_cpu(readout(*data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(readout, data):\n    atom_feat, pair_feat, global_feat = data[:-1]\n    check_forward(readout, (atom_feat, pair_feat, global_feat))\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(readout, data):\n    input_data = [cuda.to_gpu(d) for d in data[:-1]]\n    readout.to_gpu()\n    check_forward(readout, tuple(input_data))\n\n\ndef test_backward_cpu(readout, data):\n    input_data, y_grad = data[0:-1], data[-1]\n    gradient_check.check_backward(readout, tuple(input_data), y_grad,\n                                  atol=5e-1, rtol=1e-1)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(readout, data):\n    data = [cuda.to_gpu(d) for d in data]\n    input_data, y_grad = data[0:-1], data[-1]\n    readout.to_gpu()\n    gradient_check.check_backward(readout, 
tuple(input_data), y_grad,\n                                  atol=5e-1, rtol=1e-1)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/readout_tests/test_mpnn_readout.py",
    "content": "from typing import Tuple  # NOQA\n\nimport numpy\nimport pytest\n\nfrom chainer import cuda\nfrom chainer import gradient_check\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.readout.mpnn_readout import MPNNReadout\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nin_channels = 7\nout_dim = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef readout():\n    # type: () -> MPNNReadout\n    return MPNNReadout(out_dim=out_dim, in_channels=in_channels, n_layers=2)\n\n\n@pytest.fixture\ndef data():\n    # type: () -> Tuple[numpy.ndarray, numpy.ndarray]\n    numpy.random.seed(0)\n    atom_data = numpy.random.uniform(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size,\n                                      in_channels)).astype('f')\n    y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype('f')\n    return atom_data, y_grad\n\n\ndef check_forward(readout, atom_data):\n    # type: (MPNNReadout, numpy.ndarray) -> None\n    y_actual = cuda.to_cpu(readout(atom_data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_foward_cpu(readout, data):\n    # type: (MPNNReadout, Tuple[numpy.ndarray, numpy.ndarray]) -> None\n    atom_data = data[0]\n    check_forward(readout, atom_data)\n\n\n@pytest.mark.gpu\ndef test_foward_gpu(readout, data):\n    # type: (MPNNReadout, Tuple[numpy.ndarray, numpy.ndarray]) -> None\n    atom_data = cuda.to_gpu(data[0])\n    readout.to_gpu()\n    check_forward(readout, atom_data)\n\n\ndef test_backward_cpu(readout, data):\n    # type: (MPNNReadout, Tuple[numpy.ndarray, numpy.ndarray]) -> None\n    atom_data, y_grad = data\n    gradient_check.check_backward(\n        readout, atom_data, y_grad, atol=1e-1, rtol=1e-1)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(readout, data):\n    # type: (MPNNReadout, Tuple[numpy.ndarray, numpy.ndarray]) -> None\n    atom_data, y_grad = map(cuda.to_gpu, data)\n    readout.to_gpu()\n    
gradient_check.check_backward(\n        readout, atom_data, y_grad, atol=1e-1, rtol=1e-1)\n\n\ndef test_foward_cpu_graph_invariant(readout, data):\n    # type: (MPNNReadout, Tuple[numpy.ndarray, numpy.ndarray]) -> None\n    atom_data = data[0]\n    y_actual = cuda.to_cpu(readout(atom_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    permute_y_actual = cuda.to_cpu(readout(permute_atom_data).data)\n    numpy.testing.assert_allclose(\n        y_actual, permute_y_actual, rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/readout_tests/test_nfp_readout.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.readout.nfp_readout import NFPReadout\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nhidden_dim = 7\nout_dim = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef readout():\n    return NFPReadout(in_channels=hidden_dim, out_dim=out_dim)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.uniform(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size, hidden_dim)\n    ).astype('f')\n    y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype('f')\n    return atom_data, y_grad\n\n\ndef check_forward(readout, atom_data):\n    y_actual = cuda.to_cpu(readout(atom_data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(readout, data):\n    atom_data = data[0]\n    check_forward(readout, atom_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(readout, data):\n    atom_data = cuda.to_gpu(data[0])\n    readout.to_gpu()\n    check_forward(readout, atom_data)\n\n\ndef test_backward_cpu(readout, data):\n    atom_data, y_grad = data\n    gradient_check.check_backward(\n        readout, atom_data, y_grad, atol=1e-3, rtol=1e-3)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(readout, data):\n    atom_data, y_grad = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])\n    readout.to_gpu()\n    gradient_check.check_backward(\n        readout, atom_data, y_grad, atol=1e-3, rtol=1e-3)\n\n\ndef test_forward_cpu_graph_invariant(readout, data):\n    atom_data = data[0]\n    y_actual = cuda.to_cpu(readout(atom_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    permute_y_actual = cuda.to_cpu(readout(permute_atom_data).data)\n    numpy.testing.assert_allclose(\n        y_actual, permute_y_actual, 
rtol=1e-3, atol=1e-3)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/readout_tests/test_schnet_readout.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.readout.schnet_readout import SchNetReadout\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nin_channels = 7\nout_dim = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef readout():\n    return SchNetReadout(out_dim=out_dim, in_channels=in_channels)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.uniform(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size, in_channels)\n    ).astype('f')\n    y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype('f')\n    return atom_data, y_grad\n\n\ndef check_forward(readout, atom_data):\n    y_actual = cuda.to_cpu(readout(atom_data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(readout, data):\n    atom_data = data[0]\n    check_forward(readout, atom_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(readout, data):\n    atom_data = cuda.to_gpu(data[0])\n    readout.to_gpu()\n    check_forward(readout, atom_data)\n\n\ndef test_backward_cpu(readout, data):\n    atom_data, y_grad = data\n    gradient_check.check_backward(\n        readout, atom_data, y_grad, atol=1e-1, rtol=1e-1)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(readout, data):\n    atom_data, y_grad = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])\n    readout.to_gpu()\n    gradient_check.check_backward(\n        readout, atom_data, y_grad, atol=1e-1, rtol=1e-1)\n\n\ndef test_forward_cpu_graph_invariant(readout, data):\n    atom_data = data[0]\n    y_actual = cuda.to_cpu(readout(atom_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    permute_y_actual = cuda.to_cpu(readout(permute_atom_data).data)\n    numpy.testing.assert_allclose(\n        y_actual, 
permute_y_actual, rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/readout_tests/test_set2set.py",
    "content": "from typing import Tuple  # NOQA\n\nimport numpy\nimport pytest\n\nfrom chainer import cuda\nfrom chainer import gradient_check\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.readout.set2set import Set2Set\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nin_channels = 7\nbatch_size = 2\n\n\n@pytest.fixture\ndef readout():\n    # type: () -> Set2Set\n    return Set2Set(in_channels=in_channels, n_layers=2)\n\n\n@pytest.fixture\ndef data():\n    # type: () -> Tuple[numpy.ndarray, numpy.ndarray]\n    numpy.random.seed(0)\n    atom_data = numpy.random.uniform(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size,\n                                      in_channels)).astype('f')\n    y_grad = numpy.random.uniform(-1, 1,\n                                  (batch_size, in_channels * 2)).astype('f')\n    return atom_data, y_grad\n\n\ndef check_forward(readout, atom_data):\n    # type: (Set2Set, numpy.ndarray) -> None\n    readout.reset_state()\n    y_actual = cuda.to_cpu(readout(atom_data).data)\n    assert y_actual.shape == (batch_size, in_channels * 2)\n\n\ndef test_forward_cpu(readout, data):\n    # type: (Set2Set, Tuple[numpy.ndarray, numpy.ndarray]) -> None\n    atom_data = data[0]\n    check_forward(readout, atom_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(readout, data):\n    # type: (Set2Set, Tuple[numpy.ndarray, numpy.ndarray]) -> None\n    atom_data = cuda.to_gpu(data[0])\n    readout.to_gpu()\n    check_forward(readout, atom_data)\n\n\ndef check_backward(readout, atom_data, y_grad):\n    # type: (Set2Set, numpy.ndarray, numpy.ndarray) -> None\n    \"\"\"Check gradient of Set2Set.\n\n    This function is different from other backward tests.\n    Because of LSTM, reset_state method has to be called explicitly\n    before gradient calculation.\n\n    Args:\n        readout:\n        atom_data:\n        y_grad:\n    \"\"\"\n    def f(atom_data):\n        
readout.reset_state()\n        return readout(atom_data),\n\n    gradient_check.check_backward(\n        f, (atom_data), y_grad, atol=1e-1, rtol=1e-1)\n\n\ndef test_backward_cpu(readout, data):\n    # type: (Set2Set, Tuple[numpy.ndarray, numpy.ndarray]) -> None\n    check_backward(readout, *data)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(readout, data):\n    # type: (Set2Set, Tuple[numpy.ndarray, numpy.ndarray]) -> None\n    atom_data, y_grad = [cuda.to_gpu(d) for d in data]\n    readout.to_gpu()\n    check_backward(readout, atom_data, y_grad)\n\n\ndef test_forward_cpu_graph_invariant(readout, data):\n    # type: (Set2Set, Tuple[numpy.ndarray, numpy.ndarray]) -> None\n    atom_data = data[0]\n    readout.reset_state()\n    y_actual = cuda.to_cpu(readout(atom_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    readout.reset_state()\n    permute_y_actual = cuda.to_cpu(readout(permute_atom_data).data)\n    numpy.testing.assert_allclose(\n        y_actual, permute_y_actual, rtol=1e-6, atol=1e-6)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/scaler_tests/test_flow_scaler.py",
    "content": "import os\n\nimport numpy\nimport pytest\nimport scipy.stats\nfrom chainer import serializers, Variable, cuda, testing  # NOQA\n\nfrom chainer_chemistry.links.scaler.flow_scaler import FlowScaler\n\n\n@testing.with_requires('chainer>=5.0.0')\n@pytest.mark.slow\ndef test_flow_scaler_transform_uniform():\n    x = numpy.random.uniform(50, 100, size=100).astype(numpy.float32)\n\n    scaler = FlowScaler(5)\n    scaler.fit(x)  # fit takes time\n    x_scaled = scaler.transform(x)\n\n    assert scipy.stats.kstest(x_scaled, 'norm').pvalue > 0.05\n\n\n@testing.with_requires('chainer>=5.0.0')\n@pytest.mark.slow\ndef test_flow_scaler_transform_mix_gaussian():\n    plus = numpy.random.binomial(n=1, p=0.6, size=100).astype(numpy.float32)\n    x = plus * numpy.random.normal(10, 5, size=100).astype(numpy.float32)\n    x += (1 - plus) * numpy.random.normal(\n        -10, 5, size=100).astype(numpy.float32)\n\n    scaler = FlowScaler(5)\n    scaler.fit(x)  # fit takes time\n    x_scaled = scaler.transform(x)\n\n    assert scipy.stats.kstest(x_scaled, 'norm').pvalue > 0.05\n\n\n@testing.with_requires('chainer>=5.0.0')\n@pytest.mark.slow\ndef test_flow_scaler_transform_variable():\n    x = numpy.random.uniform(50, 100, size=100).astype(numpy.float32)\n    xvar = Variable(x)\n    scaler = FlowScaler(5)\n    scaler.fit(xvar)  # fit takes time\n    x_scaled = scaler.transform(xvar)\n\n    assert isinstance(x_scaled, Variable)\n    assert scipy.stats.kstest(x_scaled.array, 'norm').pvalue > 0.05\n\n\n@testing.with_requires('chainer>=5.0.0')\n@pytest.mark.gpu\n@pytest.mark.slow\ndef test_flow_scaler_transform_gpu():\n    x = numpy.random.uniform(50, 100, size=100).astype(numpy.float32)\n\n    scaler = FlowScaler(5)\n    scaler.to_gpu()\n    x = cuda.to_gpu(x)\n    scaler.fit(x)  # fit takes time\n    x_scaled = scaler.transform(x)\n\n    assert isinstance(x_scaled, cuda.cupy.ndarray)\n    assert scipy.stats.kstest(cuda.to_cpu(x_scaled), 'norm').pvalue > 
0.05\n\n\n@testing.with_requires('chainer>=5.0.0')\n@pytest.mark.slow\ndef test_flow_scaler_serialize(tmpdir):\n    x = numpy.random.uniform(50, 100, size=100).astype(numpy.float32)\n    scaler = FlowScaler(5)\n    scaler.fit(x)  # fit takes time\n    x_scaled = scaler.transform(x)\n\n    scaler_filepath = os.path.join(str(tmpdir), 'scaler.npz')\n    serializers.save_npz(scaler_filepath, scaler)\n\n    scaler2 = FlowScaler(5)\n    serializers.load_npz(scaler_filepath, scaler2)\n    x_scaled2 = scaler2.transform(x)\n\n    assert numpy.allclose(scaler.W1.array, scaler2.W1.array)\n    assert numpy.allclose(scaler.b1.array, scaler2.b1.array)\n    assert numpy.allclose(scaler.W2.array, scaler2.W2.array)\n    assert numpy.allclose(scaler.b2.array, scaler2.b2.array)\n    assert numpy.allclose(x_scaled, x_scaled2)\n\n\n@testing.with_requires('chainer>=5.0.0')\ndef test_flow_scaler_pipeline():\n    # Only to test each method without fail, for fast testing.\n    x = numpy.random.uniform(50, 100, size=100).astype(numpy.float32)\n\n    scaler = FlowScaler(5)\n    scaler.fit(x, iteration=1)\n    x_scaled = scaler.transform(x)\n\n    assert x_scaled.shape == x.shape\n\n\n@testing.with_requires('chainer>=5.0.0')\n@pytest.mark.gpu\ndef test_flow_scaler_pipeline_gpu():\n    # Only to test each method without fail, for fast testing.\n    x = numpy.random.uniform(50, 100, size=100).astype(numpy.float32)\n    x = cuda.to_gpu(x)\n\n    scaler = FlowScaler(5)\n    scaler.to_gpu()\n    scaler.fit(x, iteration=1)\n    x_scaled = scaler.transform(x)\n\n    assert isinstance(x_scaled, cuda.cupy.ndarray)\n    assert x_scaled.shape == x.shape\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/scaler_tests/test_max_abs_scaler.py",
    "content": "import os\n\nimport numpy\nimport pytest\nfrom chainer import serializers, Variable, cuda  # NOQA\n\nfrom chainer_chemistry.links.scaler.max_abs_scaler import MaxAbsScaler\n\n\n@pytest.fixture\ndef data():\n    x = numpy.array(\n        [[0.1, 10., 0.3],\n         [0.2, 20., 0.1],\n         [-0.3, 30., 0.],\n         [0.4, -40., 0.]],\n        dtype=numpy.float32)\n    expect_x_scaled = numpy.array(\n        [[0.25, 0.25, 1.],\n         [0.5, 0.5, 0.3333333],\n         [-0.75, 0.75, 0.],\n         [1., -1., 0.]],\n        dtype=numpy.float32)\n    return x, expect_x_scaled\n\n\n@pytest.mark.parametrize('indices', [None, [0], [1, 2]])\ndef test_max_abs_scaler_transform(data, indices):\n    x, expect_x_scaled = data\n    scaler = MaxAbsScaler()\n    scaler.fit(x, indices=indices)\n\n    x_scaled = scaler.transform(x)\n\n    if indices is None:\n        indices = numpy.arange(x.shape[1])\n\n    numpy.allclose(scaler.max_abs, numpy.array([0.4, 40, 0.3])[indices])\n\n    for index in range(x.shape[1]):\n        if index in indices:\n            assert numpy.allclose(x_scaled[:, index],\n                                  expect_x_scaled[:, index])\n        else:\n            assert numpy.allclose(x_scaled[:, index], x[:, index])\n\n\ndef test_max_abs_scaler_transform_variable(data):\n    x, expect_x_scaled = data\n    xvar = Variable(x)\n    scaler = MaxAbsScaler()\n    scaler.fit(xvar)\n    x_scaled = scaler.transform(xvar)\n\n    assert isinstance(x_scaled, Variable)\n    assert numpy.allclose(x_scaled.array, expect_x_scaled)\n\n\n@pytest.mark.gpu\ndef test_max_abs_scaler_transform_gpu(data):\n    x, expect_x_scaled = data\n    scaler = MaxAbsScaler()\n    scaler.to_gpu()\n    x = cuda.to_gpu(x)\n    scaler.fit(x)\n    x_scaled = scaler.transform(x)\n\n    assert isinstance(x_scaled, cuda.cupy.ndarray)\n    assert numpy.allclose(cuda.to_cpu(x_scaled), expect_x_scaled)\n\n\n@pytest.mark.parametrize('indices', [None, [0], [1, 2]])\ndef 
test_max_abs_scaler_inverse_transform(data, indices):\n    x, expect_x_scaled = data\n    scaler = MaxAbsScaler()\n    scaler.fit(x, indices=indices)\n    x_inverse = scaler.inverse_transform(expect_x_scaled)\n\n    if indices is None:\n        indices = numpy.arange(x.shape[1])\n    for index in range(x.shape[1]):\n        if index in indices:\n            assert numpy.allclose(x_inverse[:, index], x[:, index])\n        else:\n            assert numpy.allclose(x_inverse[:, index],\n                                  expect_x_scaled[:, index])\n\n\n@pytest.mark.parametrize('axis', [1, 2])\ndef test_max_abs_scaler_3darray(data, axis):\n    x, expect_x_scaled = data\n    s0, s1 = x.shape\n    if axis == 1:\n        # feature axis is 1, insert other axis to 2nd axis\n        x = numpy.broadcast_to(x[:, :, None], (s0, s1, 2))\n        expect_x_scaled = numpy.broadcast_to(\n            expect_x_scaled[:, :, None], (s0, s1, 2))\n    elif axis == 2:\n        # feature axis is 2, insert other axis to 1st axis\n        x = numpy.broadcast_to(x[:, None, :], (s0, 3, s1))\n        expect_x_scaled = numpy.broadcast_to(\n            expect_x_scaled[:, None, :], (s0, 3, s1))\n    assert x.ndim == 3\n    indices = None\n    scaler = MaxAbsScaler()\n    scaler.fit(x, indices=indices, axis=axis)\n    x_scaled = scaler.transform(x, axis=axis)\n    assert x_scaled.shape == expect_x_scaled.shape\n    assert numpy.allclose(x_scaled, expect_x_scaled, atol=1e-7)\n\n    x_inverse = scaler.inverse_transform(expect_x_scaled, axis=axis)\n\n    for index in numpy.arange(x.shape[1]):\n        assert numpy.allclose(x_inverse[:, index], x[:, index], atol=1e-7)\n\n\ndef test_max_abs_scaler_fit_transform(data):\n    x, expect_x_scaled = data\n    scaler = MaxAbsScaler()\n    x_scaled = scaler.fit_transform(x)\n    assert numpy.allclose(x_scaled, expect_x_scaled)\n\n\n# TODO(nakago): fix Chainer serializer.\n# Behavior changed from numpy versioin 1.16.3.\n# allow_pickle=True must be passed to 
numpy.load function,\n# in order to load `None`.\n# For now, skip test for serialize `None`.\n# @pytest.mark.parametrize('indices', [None, [0]])\n@pytest.mark.parametrize('indices', [[0]])\ndef test_max_abs_scaler_serialize(tmpdir, data, indices):\n    x, expect_x_scaled = data\n    scaler = MaxAbsScaler()\n    scaler.fit(x, indices=indices)\n\n    scaler_filepath = os.path.join(str(tmpdir), 'scaler.npz')\n    serializers.save_npz(scaler_filepath, scaler)\n\n    scaler2 = MaxAbsScaler()\n    serializers.load_npz(scaler_filepath, scaler2)\n\n    # print('scaler2 attribs:', scaler2.max_abs, scaler2.indices)\n    assert numpy.allclose(scaler.max_abs, scaler2.max_abs)\n    assert scaler.indices == scaler2.indices\n\n\ndef test_max_abs_scaler_assert_raises():\n    x = numpy.array([[0.1, 0.2, 0.3], [0.5, 0.3, 0.1]],\n                    dtype=numpy.float32)\n    scaler = MaxAbsScaler()\n\n    # call transform before fit raises error\n    with pytest.raises(AttributeError):\n        scaler.transform(x)\n    with pytest.raises(AttributeError):\n        scaler.inverse_transform(x)\n\n\ndef test_max_abs_scaler_transform_zero_max():\n    x = numpy.array([[0, 2], [0, 2], [0, 2]], dtype=numpy.float32)\n    expect_x_scaled = numpy.array([[0, 1], [0, 1], [0, 1]],\n                                  dtype=numpy.float32)\n    scaler = MaxAbsScaler()\n    scaler.fit(x)\n    x_scaled = scaler.transform(x)\n    # print('max_abs', scaler.max_abs)\n    assert numpy.allclose(x_scaled, expect_x_scaled)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/scaler_tests/test_min_max_scaler.py",
    "content": "import os\n\nimport numpy\nimport pytest\nfrom chainer import serializers, Variable, cuda  # NOQA\n\nfrom chainer_chemistry.links.scaler.min_max_scaler import MinMaxScaler\n\n\n@pytest.fixture\ndef data():\n    x = numpy.array(\n        [[0.1, 10., 0.3],\n         [0.2, 20., 0.1],\n         [-0.3, 30., 0.],\n         [0.4, -40., 0.]],\n        dtype=numpy.float32)\n    expect_x_scaled = numpy.array(\n        [[0.57142854, 0.71428573, 1.],\n         [0.7142857, 0.85714287, 0.3333333],\n         [0., 1., 0.],\n         [1., 0., 0.]],\n        dtype=numpy.float32)\n    return x, expect_x_scaled\n\n\n@pytest.mark.parametrize('indices', [None, [0], [1, 2]])\ndef test_min_max_scaler_transform(data, indices):\n    x, expect_x_scaled = data\n    scaler = MinMaxScaler()\n    scaler.fit(x, indices=indices)\n\n    x_scaled = scaler.transform(x)\n\n    if indices is None:\n        indices = numpy.arange(x.shape[1])\n\n    numpy.allclose(scaler.max, numpy.array([0.4, 30, 0.3])[indices])\n    numpy.allclose(scaler.min, numpy.array([-0.3, -40, 0])[indices])\n\n    for index in range(x.shape[1]):\n        if index in indices:\n            assert numpy.allclose(x_scaled[:, index],\n                                  expect_x_scaled[:, index])\n        else:\n            assert numpy.allclose(x_scaled[:, index], x[:, index])\n\n\ndef test_min_max_scaler_transform_variable(data):\n    x, expect_x_scaled = data\n    xvar = Variable(x)\n    scaler = MinMaxScaler()\n    scaler.fit(xvar)\n    x_scaled = scaler.transform(xvar)\n\n    assert isinstance(x_scaled, Variable)\n    assert numpy.allclose(x_scaled.array, expect_x_scaled)\n\n\n@pytest.mark.gpu\ndef test_min_max_scaler_transform_gpu(data):\n    x, expect_x_scaled = data\n    scaler = MinMaxScaler()\n    scaler.to_gpu()\n    x = cuda.to_gpu(x)\n    scaler.fit(x)\n    x_scaled = scaler.transform(x)\n\n    assert isinstance(x_scaled, cuda.cupy.ndarray)\n    assert numpy.allclose(cuda.to_cpu(x_scaled), 
expect_x_scaled)\n\n\n@pytest.mark.parametrize('indices', [None, [0], [1, 2]])\ndef test_min_max_scaler_inverse_transform(data, indices):\n    x, expect_x_scaled = data\n    scaler = MinMaxScaler()\n    scaler.fit(x, indices=indices)\n    x_inverse = scaler.inverse_transform(expect_x_scaled)\n\n    if indices is None:\n        indices = numpy.arange(x.shape[1])\n    for index in range(x.shape[1]):\n        if index in indices:\n            assert numpy.allclose(x_inverse[:, index], x[:, index])\n        else:\n            assert numpy.allclose(x_inverse[:, index],\n                                  expect_x_scaled[:, index])\n\n\n@pytest.mark.parametrize('axis', [1, 2])\ndef test_min_max_scaler_3darray(data, axis):\n    x, expect_x_scaled = data\n    s0, s1 = x.shape\n    if axis == 1:\n        # feature axis is 1, insert other axis to 2nd axis\n        x = numpy.broadcast_to(x[:, :, None], (s0, s1, 2))\n        expect_x_scaled = numpy.broadcast_to(\n            expect_x_scaled[:, :, None], (s0, s1, 2))\n    elif axis == 2:\n        # feature axis is 2, insert other axis to 1st axis\n        x = numpy.broadcast_to(x[:, None, :], (s0, 3, s1))\n        expect_x_scaled = numpy.broadcast_to(\n            expect_x_scaled[:, None, :], (s0, 3, s1))\n    assert x.ndim == 3\n    indices = None\n    scaler = MinMaxScaler()\n    scaler.fit(x, indices=indices, axis=axis)\n    x_scaled = scaler.transform(x, axis=axis)\n    assert x_scaled.shape == expect_x_scaled.shape\n    assert numpy.allclose(x_scaled, expect_x_scaled, atol=1e-7)\n\n    x_inverse = scaler.inverse_transform(expect_x_scaled, axis=axis)\n\n    for index in numpy.arange(x.shape[1]):\n        assert numpy.allclose(x_inverse[:, index], x[:, index], atol=1e-7)\n\n\ndef test_min_max_scaler_fit_transform(data):\n    x, expect_x_scaled = data\n    scaler = MinMaxScaler()\n    x_scaled = scaler.fit_transform(x)\n    assert numpy.allclose(x_scaled, expect_x_scaled)\n\n\n# TODO(nakago): fix Chainer serializer.\n# 
Behavior changed from numpy versioin 1.16.3.\n# allow_pickle=True must be passed to numpy.load function,\n# in order to load `None`.\n# For now, skip test for serialize `None`.\n# @pytest.mark.parametrize('indices', [None, [0]])\n@pytest.mark.parametrize('indices', [[0]])\ndef test_min_max_scaler_serialize(tmpdir, data, indices):\n    x, expect_x_scaled = data\n    scaler = MinMaxScaler()\n    scaler.fit(x, indices=indices)\n\n    scaler_filepath = os.path.join(str(tmpdir), 'scaler.npz')\n    serializers.save_npz(scaler_filepath, scaler)\n\n    scaler2 = MinMaxScaler()\n    serializers.load_npz(scaler_filepath, scaler2)\n\n    # print('scaler2 attribs:', scaler2.min, scaler2.max, scaler2.indices)\n    assert numpy.allclose(scaler.min, scaler2.min)\n    assert numpy.allclose(scaler.max, scaler2.max)\n    assert scaler.indices == scaler2.indices\n\n\ndef test_min_max_scaler_assert_raises():\n    x = numpy.array([[0.1, 0.2, 0.3], [0.5, 0.3, 0.1]],\n                    dtype=numpy.float32)\n    scaler = MinMaxScaler()\n\n    # call transform before fit raises error\n    with pytest.raises(AttributeError):\n        scaler.transform(x)\n    with pytest.raises(AttributeError):\n        scaler.inverse_transform(x)\n\n\ndef test_min_max_scaler_transform_zero_max():\n    x = numpy.array([[0, 2], [0, 2], [0, 2]], dtype=numpy.float32)\n    expect_x_scaled = numpy.array([[0, 0], [0, 0], [0, 0]],\n                                  dtype=numpy.float32)\n    scaler = MinMaxScaler()\n    scaler.fit(x)\n    x_scaled = scaler.transform(x)\n    # print('min', scaler.min, 'max', scaler.max)\n    assert numpy.allclose(x_scaled, expect_x_scaled)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/scaler_tests/test_standard_scaler.py",
    "content": "import os\n\nimport chainer\nimport numpy\nimport pytest\nfrom chainer import serializers, Variable, cuda  # NOQA\n\nfrom chainer_chemistry.links.scaler.standard_scaler import StandardScaler\n\n\n@pytest.fixture\ndef data():\n    x = numpy.array(\n        [[0.1, 10., 0.3],\n         [0.2, 20., 0.1],\n         [0.3, 30., 0.],\n         [0.4, 40., 0.]],\n        dtype=numpy.float32)\n    expect_x_scaled = numpy.array(\n        [[-1.3416407, -1.3416408, 1.6329931],\n         [-0.44721353, -0.4472136, 0.],\n         [0.44721368, 0.4472136, -0.8164965],\n         [1.3416407, 1.3416408, -0.8164965]],\n        dtype=numpy.float32)\n    return x, expect_x_scaled\n\n\n@pytest.mark.parametrize('indices', [None, [0], [1, 2]])\ndef test_standard_scaler_transform(data, indices):\n    x, expect_x_scaled = data\n    scaler = StandardScaler()\n    scaler.fit(x, indices=indices)\n    x_scaled = scaler.transform(x)\n\n    if indices is None:\n        indices = numpy.arange(x.shape[1])\n    for index in range(x.shape[1]):\n        if index in indices:\n            assert numpy.allclose(x_scaled[:, index],\n                                  expect_x_scaled[:, index])\n        else:\n            assert numpy.allclose(x_scaled[:, index], x[:, index])\n\n\ndef test_standard_scaler_transform_variable(data):\n    x, expect_x_scaled = data\n    xvar = Variable(x)\n    scaler = StandardScaler()\n    scaler.fit(xvar)\n    x_scaled = scaler.transform(xvar)\n\n    assert isinstance(x_scaled, Variable)\n    assert numpy.allclose(x_scaled.array, expect_x_scaled)\n\n\n@pytest.mark.gpu\ndef test_standard_scaler_transform_gpu(data):\n    x, expect_x_scaled = data\n    scaler = StandardScaler()\n    scaler.to_gpu()\n    x = cuda.to_gpu(x)\n    scaler.fit(x)\n    x_scaled = scaler.transform(x)\n\n    assert isinstance(x_scaled, cuda.cupy.ndarray)\n    assert numpy.allclose(cuda.to_cpu(x_scaled), expect_x_scaled)\n\n\n@pytest.mark.parametrize('indices', [None, [0], [1, 2]])\ndef 
test_standard_scaler_inverse_transform(data, indices):\n    x, expect_x_scaled = data\n    scaler = StandardScaler()\n    scaler.fit(x, indices=indices)\n    x_inverse = scaler.inverse_transform(expect_x_scaled)\n\n    if indices is None:\n        indices = numpy.arange(x.shape[1])\n    for index in range(x.shape[1]):\n        if index in indices:\n            assert numpy.allclose(x_inverse[:, index], x[:, index])\n        else:\n            assert numpy.allclose(x_inverse[:, index],\n                                  expect_x_scaled[:, index])\n\n\n@pytest.mark.parametrize('axis', [1, 2])\ndef test_standard_scaler_3darray(data, axis):\n    x, expect_x_scaled = data\n    s0, s1 = x.shape\n    if axis == 1:\n        # feature axis is 1, insert other axis to 2nd axis\n        x = numpy.broadcast_to(x[:, :, None], (s0, s1, 2))\n        expect_x_scaled = numpy.broadcast_to(\n            expect_x_scaled[:, :, None], (s0, s1, 2))\n    elif axis == 2:\n        # feature axis is 2, insert other axis to 1st axis\n        x = numpy.broadcast_to(x[:, None, :], (s0, 3, s1))\n        expect_x_scaled = numpy.broadcast_to(\n            expect_x_scaled[:, None, :], (s0, 3, s1))\n    assert x.ndim == 3\n    indices = None\n    scaler = StandardScaler()\n    scaler.fit(x, indices=indices, axis=axis)\n    x_scaled = scaler.transform(x, axis=axis)\n    assert x_scaled.shape == expect_x_scaled.shape\n    assert numpy.allclose(x_scaled, expect_x_scaled, atol=1e-7)\n\n    x_inverse = scaler.inverse_transform(expect_x_scaled, axis=axis)\n\n    for index in numpy.arange(x.shape[1]):\n        assert numpy.allclose(x_inverse[:, index], x[:, index], atol=1e-7)\n\n\ndef test_standard_scaler_fit_transform(data):\n    x, expect_x_scaled = data\n    scaler = StandardScaler()\n    x_scaled = scaler.fit_transform(x)\n    assert numpy.allclose(x_scaled, expect_x_scaled)\n\n\n# TODO(nakago): fix Chainer serializer.\n# Behavior changed from numpy versioin 1.16.3.\n# allow_pickle=True must be passed 
to numpy.load function,\n# in order to load `None`.\n# For now, skip test for serialize `None`.\n# @pytest.mark.parametrize('indices', [None, [0]])\n@pytest.mark.parametrize('indices', [[0]])\ndef test_standard_scaler_serialize(tmpdir, data, indices):\n    x, expect_x_scaled = data\n    scaler = StandardScaler()\n    scaler.fit(x, indices=indices)\n\n    scaler_filepath = os.path.join(str(tmpdir), 'scaler.npz')\n    serializers.save_npz(scaler_filepath, scaler)\n\n    scaler2 = StandardScaler()\n    serializers.load_npz(scaler_filepath, scaler2)\n\n    # print('scaler2 attribs:', scaler2.mean, scaler2.std, scaler2.indices)\n    assert numpy.allclose(scaler.mean, scaler2.mean)\n    assert numpy.allclose(scaler.std, scaler2.std)\n    assert scaler.indices == scaler2.indices\n\n\ndef test_standard_scaler_assert_raises():\n    x = numpy.array([[0.1, 0.2, 0.3], [0.5, 0.3, 0.1]],\n                    dtype=numpy.float32)\n    scaler = StandardScaler()\n\n    # call transform before fit raises error\n    with pytest.raises(AttributeError):\n        scaler.transform(x)\n    with pytest.raises(AttributeError):\n        scaler.inverse_transform(x)\n\n\ndef test_standard_scaler_transform_zero_std():\n    x = numpy.array([[1, 2], [1, 2], [1, 2]], dtype=numpy.float32)\n    expect_x_scaled = numpy.array([[0, 0], [0, 0], [0, 0]],\n                                  dtype=numpy.float32)\n    scaler = StandardScaler()\n    scaler.fit(x)\n    x_scaled = scaler.transform(x)\n    assert numpy.allclose(x_scaled, expect_x_scaled)\n\n\ndef test_standard_scaler_forward(data):\n    # test `forward` and `__call__` method.\n    indices = [0]\n    x, expect_x_scaled = data\n    scaler = StandardScaler()\n    scaler.fit(x, indices=indices)\n    x_scaled_transform = scaler.transform(x)\n    x_scaled_forward = scaler.forward(x)\n\n    assert numpy.allclose(x_scaled_transform, x_scaled_forward)\n\n    if int(chainer.__version__.split('.')[0]) >= 5:\n        # `__call__` invokes `forward` method 
from version 5.\n        # Skip test for chainer v4.\n        x_scaled_call = scaler(x)\n        assert numpy.allclose(x_scaled_transform, x_scaled_call)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/update_tests/test_cgcnn_update.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer import cuda\n\nfrom chainer_chemistry.links.update.cgcnn_update import CGCNNUpdate\n\n\n# node_size_list means the first moleculae has three nodes,\n# and the seconde molecule has five nodes\nnode_size_list = [3, 5]\nmax_num_nbr = 6\nnode_feature_dim = 10\nedge_feature_dim = 15\nout_dim = node_feature_dim\nbatch_size = 2\n\n\n@pytest.fixture\ndef update():\n    return CGCNNUpdate(n_site_features=node_feature_dim)\n\n\n@pytest.fixture\ndef data():\n    if len(node_size_list) != batch_size:\n        raise ValueError(\"Invalid fixture data for CGCNN\")\n\n    numpy.random.seed(0)\n    total_node_size = sum(node_size_list)\n    atom_feat = numpy.random.rand(total_node_size,\n                                  node_feature_dim).astype(numpy.float32)\n    nbr_feat = numpy.random.rand(total_node_size, max_num_nbr,\n                                 edge_feature_dim).astype(numpy.float32)\n    # nbr_idx\n    curr_idx = 0\n    nbr_idx = []\n    for val in node_size_list:\n        for _ in range(val):\n            max_val = curr_idx + val\n            nbr_idx.append(numpy.random.randint(curr_idx,\n                                                max_val, max_num_nbr))\n        curr_idx += val\n    nbr_idx = numpy.array(nbr_idx, dtype=numpy.int32)\n\n    y_grad = numpy.random.uniform(-1, 1,\n                                  (batch_size, out_dim)).astype(numpy.float32)\n    return atom_feat, nbr_feat, nbr_idx, y_grad\n\n\ndef check_forward(update, data):\n    y_actual = cuda.to_cpu(update(*data).data)\n    assert y_actual.shape == (sum(node_size_list), out_dim)\n\n\ndef test_forward_cpu(update, data):\n    atom_feat, nbr_feat, nbr_idx = data[:-1]\n    check_forward(update, (atom_feat, nbr_feat, nbr_idx))\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(update, data):\n    input_data = [cuda.to_gpu(d) for d in data[:-1]]\n    update.to_gpu()\n    check_forward(update, tuple(input_data))\n\n\n# def test_backward_cpu(update, 
data):\n#     input_data, y_grad = data[0:-1], data[-1]\n#     gradient_check.check_backward(update, tuple(input_data), y_grad,\n#                                   atol=5e-1, rtol=1e-1)\n\n\n# @pytest.mark.gpu\n# def test_backward_gpu(update, data):\n#     atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]\n#     update.to_gpu()\n#     gradient_check.check_backward(update, (atom_data, adj_data), y_grad,\n#                                   atol=5e-1, rtol=1e-1)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/links_tests/update_tests/test_ggnn_update.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.update.ggnn_update import GGNNUpdate\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\nfrom chainer_chemistry.utils.sparse_utils import _convert_to_sparse\nfrom chainer_chemistry.utils.sparse_utils import convert_sparse_with_edge_type\nfrom chainer_chemistry.utils.sparse_utils import sparse_utils_available\n\natom_size = 5\nin_channels = 4\nhidden_channels = 7\nbatch_size = 2\nn_edge_types = 2\n\n\n@pytest.fixture\ndef update():\n    return GGNNUpdate(in_channels=in_channels, hidden_channels=hidden_channels,\n                      n_edge_types=n_edge_types)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)\n    ).astype('i')\n    adj_data = numpy.random.uniform(\n        0, high=2, size=(batch_size, n_edge_types, atom_size, atom_size)\n    ).astype('f')\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, hidden_channels)).astype('f')\n\n    embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels)\n    embed_atom_data = embed(atom_data).data\n    return embed_atom_data, adj_data, y_grad\n\n\n@pytest.mark.skipif(not sparse_utils_available())\ndef convert_to_sparse(dense_adj):\n    # auxiliary function\n    data, row, col, edge_type = _convert_to_sparse(dense_adj)\n    return convert_sparse_with_edge_type(data, row, col, atom_size,\n                                         edge_type, n_edge_types)\n\n\ndef check_forward(update, atom_data, adj_data):\n    update.reset_state()\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, atom_size, 
hidden_channels)\n    return y_actual\n\n\ndef test_forward_cpu(update, data):\n    atom_data, adj_data = data[:2]\n    y_dense = check_forward(update, atom_data, adj_data)\n\n    if sparse_utils_available():\n        sparse_adj = convert_to_sparse(adj_data)\n        y_sparse = check_forward(update, atom_data, sparse_adj)\n\n        # results for dense matrix and sparse matrix must be same\n        numpy.testing.assert_allclose(\n            y_dense, y_sparse, atol=1e-4, rtol=1e-4)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(update, data):\n    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])\n    update.to_gpu()\n    y_dense = check_forward(update, atom_data, adj_data)\n\n    if sparse_utils_available():\n        sparse_adj = convert_to_sparse(adj_data)\n        y_sparse = check_forward(update, atom_data, sparse_adj)\n\n        numpy.testing.assert_allclose(\n            cuda.to_cpu(y_dense), cuda.to_cpu(y_sparse), atol=1e-4, rtol=1e-4)\n\n\ndef check_backward(update, atom_data, adj_data, y_grad):\n    \"\"\"Check gradient of GGNNUpdate.\n\n    This function is different from other backward tests.\n    Because of GRU, reset_state method has to be called explicitly\n    before gradient calculation.\n\n    Args:\n        update (callable):\n        atom_data (numpy.ndarray):\n        adj_data (numpy.ndarray):\n        y_grad (numpy.ndarray):\n    \"\"\"\n    def f(atom_data):\n        # skip adj_data check.\n        update.reset_state()\n        return update(atom_data, adj_data)\n\n    gradient_check.check_backward(\n        f, (atom_data), y_grad, atol=1e-1, rtol=1e-1)\n\n\ndef test_backward_cpu(update, data):\n    atom_data, adj_data, y_grad = data\n    check_backward(update, atom_data, adj_data, y_grad)\n\n    if sparse_utils_available():\n        sparse_adj = convert_to_sparse(adj_data)\n        check_backward(update, atom_data, sparse_adj, y_grad)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(update, data):\n    update.to_gpu()\n    atom_data, 
adj_data, y_grad = map(cuda.to_gpu, data)\n    check_backward(update, atom_data, adj_data, y_grad)\n\n    if sparse_utils_available():\n        sparse_adj = convert_to_sparse(adj_data)\n        check_backward(update, atom_data, sparse_adj, y_grad)\n\n\ndef test_forward_cpu_graph_invariant(update, data):\n    permutation_index = numpy.random.permutation(atom_size)\n    atom_data, adj_data = data[:2]\n    update.reset_state()\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    update.reset_state()\n    permute_y_actual = cuda.to_cpu(update(\n        permute_atom_data, permute_adj_data).data)\n    numpy.testing.assert_allclose(\n        permute_node(y_actual, permutation_index, axis=1),\n        permute_y_actual, rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/update_tests/test_gin_update.py",
    "content": "from typing import Tuple  # NOQA\n\nimport chainer  # NOQA\nfrom chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.update.gin_update import GINUpdate\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nin_channels = 4\nhidden_channels = 6\nbatch_size = 3\nnum_edge_type = 7\n\n\n@pytest.fixture\ndef update():\n    # type: () -> GINUpdate\n    return GINUpdate(in_channels=in_channels, hidden_channels=hidden_channels,\n                     dropout_ratio=0)\n\n\n@pytest.fixture\ndef data():\n    # type: () -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, atom_size, atom_size)).astype('f')\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, hidden_channels)).astype('f')\n    embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels)\n    embed_atom_data = embed(atom_data).data\n    return embed_atom_data, adj_data, y_grad\n\n\n# Test Update Function\ndef check_forward(update, atom_data, adj_data):\n    # type: (GINUpdate, numpy.ndarray, numpy.ndarray) -> None\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, atom_size, hidden_channels)\n\n\ndef test_forward_cpu(update, data):\n    # type: (GINUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = data[:2]\n    check_forward(update, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(update, data):\n    # type: (GINUpdate, Tuple[numpy.ndarray, numpy.ndarray, 
numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = map(cuda.to_gpu, data[:2])\n    update.to_gpu()\n    check_forward(update, atom_data, adj_data)\n\n\ndef test_backward_cpu(update, data):\n    # type: (GINUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data, y_grad = data\n    gradient_check.check_backward(\n        update, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(update, data):\n    # type: (GINUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data, y_grad = map(cuda.to_gpu, data[:3])\n    update.to_gpu()\n    gradient_check.check_backward(\n        update, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3)\n\n\ndef test_forward_cpu_graph_invariant(update, data):\n    # type: (GINUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = data[:2]\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(\n        update(permute_atom_data, permute_adj_data).data)\n    numpy.testing.assert_allclose(\n        permute_node(y_actual, permutation_index, axis=1),\n        permute_y_actual,\n        rtol=1e-3,\n        atol=1e-3)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/update_tests/test_gnn_film_update.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.update.gnn_film_update import GNNFiLMUpdate\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nin_channels = 7\nhidden_channels = 7\nbatch_size = 2\nn_edge_types = 5\n\n\n@pytest.fixture\ndef update():\n    return GNNFiLMUpdate(hidden_channels=hidden_channels,\n                         n_edge_types=n_edge_types)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)\n    ).astype('i')\n    adj_data = numpy.random.uniform(\n        0, high=2, size=(batch_size, n_edge_types, atom_size, atom_size)\n    ).astype('f')\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, hidden_channels)).astype('f')\n\n    embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels)\n    embed_atom_data = embed(atom_data).data\n    adj_data = adj_data\n    return embed_atom_data, adj_data, y_grad\n\n\n# Test Update Function\ndef check_forward(update, atom_data, adj_data):\n    # type: (GNNFiLMUpdate, numpy.ndarray, numpy.ndarray) -> None\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, atom_size, hidden_channels)\n\n\ndef test_forward_cpu(update, data):\n    # type: (GNNFiLMUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = data[:2]\n    check_forward(update, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(update, data):\n    # type: (GNNFiLMUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = map(cuda.to_gpu, data[:2])\n    
update.to_gpu()\n    check_forward(update, atom_data, adj_data)\n\n\ndef test_backward_cpu(update, data):\n    # type: (GNNFiLMUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data, y_grad = data\n    gradient_check.check_backward(\n        update, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(update, data):\n    # type: (GNNFiLMUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data, y_grad = map(cuda.to_gpu, data[:3])\n    update.to_gpu()\n    # print(type(adj_data))\n    gradient_check.check_backward(\n        update, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)\n\n\ndef test_forward_cpu_graph_invariant(update, data):\n    # type: (GNNFiLMUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = data[:2]\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(\n        update(permute_atom_data, permute_adj_data).data)\n    numpy.testing.assert_allclose(\n        permute_node(y_actual, permutation_index, axis=1),\n        permute_y_actual,\n        rtol=1e-3,\n        atol=1e-3)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/update_tests/test_megnet_update.py",
    "content": "from chainer import cuda\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.links.update.megnet_update import MEGNetUpdate\n\n\n# node_size_list means the first moleculae has six nodes,\n# and the seconde molecule has four nodes\nnode_size_list = [6, 4]\n# edge_size_list means the first moleculae has eight edges,\n# and the seconde molecule has four edges\nedge_size_list = [8, 4]\nnode_feature_dim = 5\nedge_feature_dim = 10\nglobal_feature_dim = 2\nout_dim = 32\nbatch_size = 2\n\n\n@pytest.fixture\ndef update():\n    return MEGNetUpdate()\n\n\n@pytest.fixture\ndef data():\n    if len(node_size_list) != batch_size or len(edge_size_list) != batch_size:\n        raise ValueError(\"Invalid fixture for MEGNet\")\n\n    numpy.random.seed(0)\n    total_node_size = sum(node_size_list)\n    total_edge_size = sum(edge_size_list)\n    atom_feat = numpy.random.rand(total_node_size,\n                                  node_feature_dim).astype(numpy.float32)\n    pair_feat = numpy.random.rand(total_edge_size,\n                                  edge_feature_dim).astype(numpy.float32)\n    global_feat = numpy.random.rand(batch_size,\n                                    global_feature_dim).astype(numpy.float32)\n\n    # atom idx\n    atom_idx = numpy.hstack([[i] * node_size_list[i]\n                             for i in range(batch_size)]).astype(numpy.int32)\n    # pair idx\n    pair_idx = numpy.hstack([[i] * edge_size_list[i]\n                             for i in range(batch_size)]).astype(numpy.int32)\n    # create start and end idx\n    edge_idx = []\n    acc_node_size = [sum(node_size_list[:i+1]) for i in range(batch_size)]\n    low = numpy.roll(acc_node_size + [0], 1)[0:batch_size+1]\n    high = numpy.array(acc_node_size)\n    for i in range(batch_size):\n        idx = [numpy.random.choice(numpy.arange(low[i], high[i]), 2,\n                                   replace=False)\n               for _ in range(edge_size_list[i])]\n        edge_idx.extend(idx)\n\n 
   start_idx = numpy.array(edge_idx, dtype=numpy.int32)[:, 0]\n    end_idx = numpy.array(edge_idx, dtype=numpy.int32)[:, 1]\n\n    y_grad_atom = numpy.random.uniform(\n        -1, 1, (batch_size, out_dim)).astype(numpy.float32)\n    y_grad_pair = numpy.random.uniform(\n        -1, 1, (batch_size, out_dim)).astype(numpy.float32)\n    y_grad_global = numpy.random.uniform(\n        -1, 1, (batch_size, out_dim)).astype(numpy.float32)\n\n    return atom_feat, pair_feat, global_feat, \\\n        atom_idx, pair_idx, start_idx, end_idx, \\\n        y_grad_atom, y_grad_pair, y_grad_global\n\n\ndef check_forward(update, data):\n    y_actual = [cuda.to_cpu(d.data) for d in update(*data)]\n    atom_feat, pair_feat, global_feat = y_actual\n    assert atom_feat.shape == (sum(node_size_list), out_dim)\n    assert pair_feat.shape == (sum(edge_size_list), out_dim)\n    assert global_feat.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(update, data):\n    atom_feat, pair_feat, global_feat, \\\n        atom_idx, pair_idx, start_idx, end_idx = data[:-3]\n    check_forward(update, (atom_feat, pair_feat, global_feat, atom_idx,\n                           pair_idx, start_idx, end_idx))\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(update, data):\n    input_data = [cuda.to_gpu(d) for d in data[:-3]]\n    update.to_gpu()\n    check_forward(update, tuple(input_data))\n\n\n# def test_backward_cpu(update, data):\n#     input_data, y_grad = data[0:-3], data[-3:]\n#     gradient_check.check_backward(update, tuple(input_data), tuple(y_grad),\n#                                   atol=5e-1, rtol=1e-1)\n\n\n# @pytest.mark.gpu\n# def test_backward_gpu(update, data):\n#     data = [cuda.to_gpu(d) for d in data]\n#     input_data, y_grad = data[0:-3], data[-3:]\n#     update.to_gpu()\n#     gradient_check.check_backward(update, tuple(input_data), tuple(y_grad),\n#                                   atol=5e-1, rtol=1e-1)\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/update_tests/test_mpnn_update.py",
    "content": "from typing import Tuple  # NOQA\n\nimport chainer  # NOQA\nfrom chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.update.mpnn_update import EdgeNet\nfrom chainer_chemistry.links.update.mpnn_update import MPNNUpdate\n\natom_size = 5\nhidden_channels = 4\nbatch_size = 3\nnum_edge_type = 7\n\n\n@pytest.fixture\ndef message():\n    # type: () -> EdgeNet\n    return EdgeNet(out_channels=hidden_channels)\n\n\n@pytest.fixture\ndef update():\n    # type: () -> MPNNUpdate\n    return MPNNUpdate(hidden_channels=hidden_channels)\n\n\n@pytest.fixture\ndef data():\n    # type: () -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]  # NOQA\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, num_edge_type, atom_size,\n                         atom_size)).astype('f')\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, hidden_channels)).astype('f')\n    y_grad_ = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, hidden_channels)).astype('f')\n    embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=hidden_channels)\n    embed_atom_data = embed(atom_data).data\n    return embed_atom_data, adj_data, y_grad, y_grad_\n\n\n# Test Message Function\ndef check_message_forward(message, atom_data, adj_data):\n    # type: (EdgeNet, numpy.ndarray, numpy.ndarray) -> None\n    y_actual = cuda.to_cpu(message(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, atom_size, hidden_channels * 2)\n\n\ndef test_message_forward_cpu(message, data):\n    # type: (EdgeNet, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, 
adj_data = data[:2]\n    check_message_forward(message, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_message_forward_gpu(message, data):\n    # type: (EdgeNet, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = map(cuda.to_gpu, data[:2])\n    message.to_gpu()\n    check_message_forward(message, atom_data, adj_data)\n\n\ndef test_message_backward_cpu(message, data):\n    # type: (EdgeNet, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data, y_grad, y_grad_ = data\n    y_grad = numpy.concatenate([y_grad, y_grad_], axis=2)\n    gradient_check.check_backward(\n        message, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)\n\n\n@pytest.mark.gpu\ndef test_message_backward_gpu(message, data):\n    # type: (EdgeNet, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data, y_grad, y_grad_ = map(cuda.to_gpu, data)\n    xp = cuda.get_array_module(atom_data)\n    y_grad = xp.concatenate([y_grad, y_grad_], axis=2)\n    message.to_gpu()\n    gradient_check.check_backward(\n        message, (atom_data, adj_data), y_grad, atol=1e-1, rtol=1e-1)\n\n\n# Test Update Function\ndef check_forward(update, atom_data, adj_data):\n    # type: (MPNNUpdate, numpy.ndarray, numpy.ndarray) -> None\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, atom_size, hidden_channels)\n\n\ndef test_forward_cpu(update, data):\n    # type: (MPNNUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = data[:2]\n    check_forward(update, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(update, data):\n    # type: (MPNNUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = map(cuda.to_gpu, data[:2])\n    update.to_gpu()\n    
check_forward(update, atom_data, adj_data)\n\n\ndef check_backward(update, atom_data, adj_data, y_grad):\n    # type: (MPNNUpdate, numpy.ndarray, numpy.ndarray, numpy.ndarray) -> None\n    \"\"\"Check gradient of MPNNUpdate.\n\n    This function is different from other backward tests.\n    Because of GRU, reset_state method has to be called explicitly\n    before gradient calculation.\n\n    Args:\n        update (callable):\n        atom_data (numpy.ndarray):\n        adj_data (numpy.ndarray):\n        y_grad (numpy.ndarray):\n    \"\"\"\n    def f(*args, **kwargs):\n        update.reset_state()\n        return update(*args, **kwargs)\n    gradient_check.check_backward(\n        f, (atom_data, adj_data), y_grad, atol=1e-1, rtol=1e-1)\n\n\ndef test_backward_cpu(update, data):\n    # type: (MPNNUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data, y_grad = data[:3]\n    check_backward(update, atom_data, adj_data, y_grad)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(update, data):\n    # type: (MPNNUpdate, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data, y_grad = map(cuda.to_gpu, data[:3])\n    update.to_gpu()\n    # gradient_check.check_backward(update, (atom_data, adj_data), y_grad,\n    #                               atol=1e-1, rtol=1e-1)\n    check_backward(update, atom_data, adj_data, y_grad)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s', '-x'])\n"
  },
  {
    "path": "tests/links_tests/update_tests/test_nfp_update.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.update.nfp_update import NFPUpdate\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\n\natom_size = 5\nhidden_channels = 4\nbatch_size = 2\nnum_degree_type = 7\n\n\n@pytest.fixture\ndef update():\n    return NFPUpdate(in_channels=hidden_channels, out_channels=hidden_channels)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, atom_size, atom_size)).astype('f')\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, hidden_channels)).astype('f')\n\n    embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=hidden_channels)\n    embed_atom_data = embed(atom_data).data\n    degree_mat = numpy.sum(adj_data, axis=1)\n    deg_conds = numpy.array([numpy.broadcast_to(\n        ((degree_mat - degree) == 0)[:, :, None], embed_atom_data.shape)\n        for degree in range(1, num_degree_type + 1)])\n    return embed_atom_data, adj_data, deg_conds, y_grad\n\n\ndef check_forward(update, atom_data, adj_data, deg_conds):\n    y_actual = cuda.to_cpu(update(atom_data, adj_data, deg_conds).data)\n    assert y_actual.shape == (batch_size, atom_size, hidden_channels)\n\n\ndef test_forward_cpu(update, data):\n    atom_data, adj_data, deg_conds = data[:3]\n    check_forward(update, atom_data, adj_data, deg_conds)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(update, data):\n    atom_data, adj_data, deg_conds = map(cuda.to_gpu, data[:3])\n    update.to_gpu()\n    check_forward(update, atom_data, adj_data, deg_conds)\n\n\ndef 
test_backward_cpu(update, data):\n    atom_data, adj_data, deg_conds, y_grad = data\n    gradient_check.check_backward(\n        update, (atom_data, adj_data, deg_conds), y_grad, atol=1e-3, rtol=1e-3)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(update, data):\n    atom_data, adj_data, deg_conds, y_grad = map(cuda.to_gpu, data)\n    update.to_gpu()\n    gradient_check.check_backward(\n        update, (atom_data, adj_data, deg_conds), y_grad, atol=1e-3, rtol=1e-3)\n\n\ndef test_forward_cpu_graph_invariant(update, data):\n    atom_data, adj_data, deg_conds = data[:3]\n    y_actual = cuda.to_cpu(update(atom_data, adj_data, deg_conds).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    # atom_data: (batch_size, atom_size, hidden_channels)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    # deg_conds: (num_degree_type, batch_size, atom_size, hidden_channels)\n    permute_deg_conds = permute_node(deg_conds, permutation_index, axis=2)\n    permute_y_actual = cuda.to_cpu(update(\n        permute_atom_data, permute_adj_data, permute_deg_conds).data)\n    numpy.testing.assert_allclose(\n        permute_node(y_actual, permutation_index, axis=1), permute_y_actual,\n        rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/update_tests/test_relgat_update.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.update.relgat_update import RelGATUpdate\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\n\nin_channels = 3\nout_channels = 4\natom_size = 5\nbatch_size = 2\nnum_edge_type = 7\n\n\n@pytest.fixture\ndef update():\n    return RelGATUpdate(in_channels=in_channels,\n                        out_channels=out_channels,\n                        n_edge_types=num_edge_type)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, num_edge_type, atom_size, atom_size)\n    ).astype('f')\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, out_channels)).astype('f')\n\n    embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels)\n    embed_atom_data = embed(atom_data).data\n    return embed_atom_data, adj_data, y_grad\n\n\ndef check_forward(update, atom_data, adj_data):\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, atom_size, out_channels)\n\n\ndef test_forward_cpu(update, data):\n    atom_data, adj_data = data[:2]\n    check_forward(update, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(update, data):\n    atom_data, adj_data = map(cuda.to_gpu, data[:2])\n    update.to_gpu()\n    check_forward(update, atom_data, adj_data)\n\n\ndef test_backward_cpu(update, data):\n    atom_data, adj_data, y_grad = data\n    # gradient_check.check_backward(\n    #     update, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3)\n    params = 
tuple(update.params())  # NOQA\n    gradient_check.check_backward(update, (atom_data, adj_data), y_grad,\n                                  no_grads=[False, True],\n                                  atol=1e-1, rtol=1e-1)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(update, data):\n    atom_data, adj_data, y_grad = map(cuda.to_gpu, data)\n    update.to_gpu()\n    gradient_check.check_backward(\n        update, (atom_data, adj_data), y_grad, atol=1e-1, rtol=1e-1)\n\n\ndef test_forward_cpu_graph_invariant(update, data):\n    atom_data, adj_data = data[:2]\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(\n        update(permute_atom_data, permute_adj_data).data)\n    numpy.testing.assert_allclose(\n        permute_node(y_actual, permutation_index, axis=1), permute_y_actual,\n        rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/update_tests/test_relgcn_update.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.update.relgcn_update import RelGCNUpdate\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nin_channels = 3\nhidden_dim = 4\nbatch_size = 2\nn_edge_types = 7\n\n\n@pytest.fixture\ndef update():\n    return RelGCNUpdate(in_channels=in_channels, out_channels=hidden_dim,\n                        n_edge_types=n_edge_types)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, n_edge_types, atom_size, atom_size)\n    ).astype('f')\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, hidden_dim)).astype('f')\n\n    embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels)\n    embed_atom_data = embed(atom_data).data\n    return embed_atom_data, adj_data, y_grad\n\n\ndef check_forward(update, atom_data, adj_data):\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, atom_size, hidden_dim)\n\n\ndef test_forward_cpu(update, data):\n    atom_data, adj_data = data[:2]\n    check_forward(update, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(update, data):\n    atom_data, adj_data = map(cuda.to_gpu, data[:2])\n    update.to_gpu()\n    check_forward(update, atom_data, adj_data)\n\n\ndef test_backward_cpu(update, data):\n    atom_data, adj_data, y_grad = data\n    gradient_check.check_backward(\n        update, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(update, data):\n    atom_data, 
adj_data, y_grad = map(cuda.to_gpu, data)\n    update.to_gpu()\n    gradient_check.check_backward(\n        update, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)\n\n\ndef test_forward_cpu_graph_invariant(update, data):\n    atom_data, adj_data = data[:2]\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(\n        update(permute_atom_data, permute_adj_data).data)\n    numpy.testing.assert_allclose(\n        permute_node(y_actual, permutation_index, axis=1), permute_y_actual,\n        rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/update_tests/test_rsgcn_update.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.update.rsgcn_update import RSGCNUpdate\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nin_channels = 3\nhidden_dim = 4\nbatch_size = 2\nnum_edge_type = 7\n\n\n@pytest.fixture\ndef update():\n    return RSGCNUpdate(in_channels=in_channels, out_channels=hidden_dim)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, atom_size, atom_size)).astype('f')\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, hidden_dim)).astype('f')\n\n    embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels)\n    embed_atom_data = embed(atom_data).data\n    return embed_atom_data, adj_data, y_grad\n\n\ndef check_forward(update, atom_data, adj_data):\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, atom_size, hidden_dim)\n\n\ndef test_forward_cpu(update, data):\n    atom_data, adj_data = data[:2]\n    check_forward(update, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(update, data):\n    atom_data, adj_data = map(cuda.to_gpu, data[:2])\n    update.to_gpu()\n    check_forward(update, atom_data, adj_data)\n\n\ndef test_backward_cpu(update, data):\n    atom_data, adj_data, y_grad = data\n    gradient_check.check_backward(\n        update, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(update, data):\n    atom_data, adj_data, y_grad = map(cuda.to_gpu, data)\n    update.to_gpu()\n    
gradient_check.check_backward(\n        update, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3)\n\n\ndef test_forward_cpu_graph_invariant(update, data):\n    atom_data, adj_data = data[:2]\n    y_actual = cuda.to_cpu(update(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(\n        update(permute_atom_data, permute_adj_data).data)\n    numpy.testing.assert_allclose(\n        permute_node(y_actual, permutation_index, axis=1), permute_y_actual,\n        rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/links_tests/update_tests/test_schnet_update.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.connection.embed_atom_id import EmbedAtomID\nfrom chainer_chemistry.links.update.schnet_update import SchNetUpdate\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nin_channels = 4\nhidden_channels = in_channels  # must be same for now\nbatch_size = 2\n\n\n@pytest.fixture\ndef update():\n    return SchNetUpdate(hidden_channels=hidden_channels)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)\n    ).astype('i')\n    # symmetric matrix\n    dist_data = numpy.random.uniform(\n        0, high=30, size=(batch_size, atom_size, atom_size)).astype('f')\n    dist_data = (dist_data + dist_data.swapaxes(-1, -2)) / 2.\n\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, atom_size, hidden_channels)).astype('f')\n    embed = EmbedAtomID(in_size=MAX_ATOMIC_NUM, out_size=in_channels)\n    embed_atom_data = embed(atom_data).data\n    return embed_atom_data, dist_data, y_grad\n\n\ndef check_forward(update, atom_data, dist_data):\n    y_actual = cuda.to_cpu(update(atom_data, dist_data).data)\n    assert y_actual.shape == (batch_size, atom_size, hidden_channels)\n\n\ndef test_forward_cpu(update, data):\n    atom_data, dist_data = data[:2]\n    check_forward(update, atom_data, dist_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(update, data):\n    atom_data, dist_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])\n    update.to_gpu()\n    check_forward(update, atom_data, dist_data)\n\n\ndef test_backward_cpu(update, data):\n    atom_data, dist_data, y_grad = data\n    gradient_check.check_backward(\n        update, (atom_data, dist_data), y_grad, atol=1e-3, 
rtol=1e-3)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(update, data):\n    atom_data, dist_data, y_grad = map(cuda.to_gpu, data)\n    update.to_gpu()\n    gradient_check.check_backward(\n        update, (atom_data, dist_data), y_grad, atol=1e-3, rtol=1e-3)\n\n\ndef test_forward_cpu_graph_invariant(update, data):\n    atom_data, dist_data = data[:2]\n    y_actual = cuda.to_cpu(update(atom_data, dist_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index, axis=1)\n    permute_dist_data = permute_adj(dist_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(update(\n        permute_atom_data, permute_dist_data).data)\n    numpy.testing.assert_allclose(\n        permute_node(y_actual, permutation_index, axis=1),\n        permute_y_actual, rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/gwm_tests/test_gwm.py",
    "content": "from chainer import cuda\nfrom chainer import functions\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.models.gwm.gwm import GWM, WarpGateUnit, SuperNodeTransmitterUnit, GraphTransmitterUnit  # NOQA\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nhidden_dim = 4\nsupernode_dim = 7\nbatch_size = 2\nnum_edge_type = 2\n\n\n@pytest.fixture\ndef graph_warp_gate_unit():\n    return WarpGateUnit(output_type='graph', hidden_dim=hidden_dim)\n\n\n@pytest.fixture\ndef super_warp_gate_unit():\n    return WarpGateUnit(output_type='super', hidden_dim=supernode_dim)\n\n\n@pytest.fixture\ndef super_node_transmitter_unit():\n    return SuperNodeTransmitterUnit(hidden_dim_super=supernode_dim,\n                                    hidden_dim=hidden_dim)\n\n\n@pytest.fixture\ndef graph_transmitter_unit():\n    return GraphTransmitterUnit(hidden_dim_super=supernode_dim,\n                                hidden_dim=hidden_dim)\n\n\n@pytest.fixture\ndef gwm():\n    # relu is difficult to test\n    return GWM(hidden_dim=hidden_dim, hidden_dim_super=supernode_dim,\n               n_layers=2, activation=functions.identity,\n               wgu_activation=functions.identity,\n               gtu_activation=functions.identity)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    # too difficult to pass unit test by using EmbedAtomID\n    embed_atom_data = numpy.random.uniform(\n        -0.01, 0.01, (batch_size, atom_size, hidden_dim)).astype('f')\n    new_embed_atom_data = numpy.random.uniform(\n        -0.01, 0.01, (batch_size, atom_size, hidden_dim)).astype('f')\n    y_grad = numpy.random.uniform(\n        -0.01, 0.01, (batch_size, atom_size, hidden_dim)).astype('f')\n    supernode = numpy.random.uniform(\n        -0.01, 0.01, (batch_size, supernode_dim)).astype('f')\n    supernode_grad = numpy.random.uniform(\n        -0.01, 0.01, (batch_size, supernode_dim)).astype('f')\n\n    return 
embed_atom_data, new_embed_atom_data, supernode, y_grad,\\\n        supernode_grad\n\n\ndef test_graph_transmitter_unit_forward(graph_transmitter_unit, data):\n    embed_atom_data = data[0]\n    supernode = data[2]\n    h_trans = graph_transmitter_unit(embed_atom_data, supernode)\n    assert h_trans.array.shape == (batch_size, supernode_dim)\n\n\ndef test_graph_transmitter_unit_backward(graph_transmitter_unit, data):\n    embed_atom_data = data[0]\n    supernode = data[2]\n    supernode_grad = data[4]\n    gradient_check.check_backward(graph_transmitter_unit,\n                                  (embed_atom_data, supernode),\n                                  supernode_grad, eps=0.1)\n\n\ndef test_super_node_transmitter_unit_forward(super_node_transmitter_unit,\n                                             data):\n    supernode = data[2]\n    g_trans = super_node_transmitter_unit(supernode, atom_size)\n    assert g_trans.array.shape == (batch_size, atom_size, hidden_dim)\n\n\ndef test_super_node_transmitter_unit_backward(super_node_transmitter_unit,\n                                              data):\n    supernode = data[2]\n    y_grad = data[3]\n    gradient_check.check_backward(\n        lambda x: super_node_transmitter_unit(x, atom_size), supernode, y_grad)\n\n\ndef test_graph_warp_gate_unit_forward(graph_warp_gate_unit, data):\n    embed_atom_data = data[0]\n    new_embed_atom_data = data[1]\n    merged = graph_warp_gate_unit(embed_atom_data, new_embed_atom_data)\n    assert merged.array.shape == (batch_size, atom_size, hidden_dim)\n\n\ndef test_graph_warp_gate_unit_backward(graph_warp_gate_unit, data):\n    embed_atom_data = data[0]\n    new_embed_atom_data = data[1]\n    y_grad = data[3]\n    gradient_check.check_backward(graph_warp_gate_unit,\n                                  (embed_atom_data, new_embed_atom_data),\n                                  y_grad, eps=0.01)\n\n\ndef test_super_warp_gate_unit_forward(super_warp_gate_unit, data):\n    supernode = 
data[2]\n    merged = super_warp_gate_unit(supernode, supernode)\n    assert merged.array.shape == (batch_size, supernode_dim)\n\n\ndef test_super_warp_gate_unit_backward(super_warp_gate_unit, data):\n    supernode = data[2]\n    supernode_grad = data[4]\n    gradient_check.check_backward(super_warp_gate_unit,\n                                  (supernode, supernode),\n                                  supernode_grad, eps=0.01)\n\n\ndef check_forward(gwm, embed_atom_data, new_embed_atom_data, supernode):\n    gwm.reset_state()\n    h_actual, g_actual = gwm(embed_atom_data, new_embed_atom_data, supernode)\n    assert h_actual.array.shape == (batch_size, atom_size, hidden_dim)\n    assert g_actual.array.shape == (batch_size, supernode_dim)\n\n\ndef test_forward_cpu(gwm, data):\n    embed_atom_data, new_embed_atom_data, supernode = data[:3]\n    check_forward(gwm, embed_atom_data, new_embed_atom_data, supernode)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(gwm, data):\n    embed_atom_data, new_embed_atom_data, supernode = data[:3]\n    embed_atom_data = cuda.to_gpu(embed_atom_data)\n    new_embed_atom_data = cuda.to_gpu(new_embed_atom_data)\n    supernode = cuda.to_gpu(supernode)\n    gwm.to_gpu()\n    check_forward(gwm, embed_atom_data, new_embed_atom_data, supernode)\n\n\ndef check_backward(gwm, embed_atom_data, new_embed_atom_data, supernode,\n                   y_grad, supernode_grad):\n    gwm.reset_state()\n\n    # TODO(nakago): rtol is too high! 
GWM is too large to calculate\n    # numerical differentiation\n    gradient_check.check_backward(gwm, (embed_atom_data, new_embed_atom_data,\n                                        supernode), (y_grad, supernode_grad),\n                                  eps=0.1, rtol=1e1)\n\n\ndef test_backward_cpu(gwm, data):\n    check_backward(gwm, *data)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(gwm, data):\n    gwm.to_gpu()\n    check_backward(gwm, *map(cuda.to_gpu, data))\n\n\ndef test_forward_cpu_graph_invariant(gwm, data):\n    permutation_index = numpy.random.permutation(atom_size)\n    gwm.reset_state()\n    embed_atom_data, new_embed_atom_data, supernode = data[:3]\n    h_actual, g_actual = gwm(embed_atom_data, new_embed_atom_data, supernode)\n\n    permute_embed_atom_data = permute_node(\n        embed_atom_data, permutation_index, axis=1)\n    permute_new_embed_atom_data = permute_node(\n        new_embed_atom_data, permutation_index, axis=1)\n    gwm.reset_state()\n    permute_h_actual, permute_g_actual = gwm(\n        permute_embed_atom_data, permute_new_embed_atom_data, supernode)\n    numpy.testing.assert_allclose(\n        permute_node(h_actual.data, permutation_index, axis=1),\n        permute_h_actual.data, rtol=1e-5, atol=1e-5)\n\n    numpy.testing.assert_allclose(g_actual.data, permute_g_actual.data,\n                                  rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/gwm_tests/test_gwm_graph_conv_model.py",
    "content": "import itertools\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links.readout.general_readout import GeneralReadout\nfrom chainer_chemistry.links.readout.ggnn_readout import GGNNReadout\nfrom chainer_chemistry.links.readout.nfp_readout import NFPReadout\nfrom chainer_chemistry.links.readout.schnet_readout import SchNetReadout\nfrom chainer_chemistry.links.update.ggnn_update import GGNNUpdate\nfrom chainer_chemistry.links.update.gin_update import GINUpdate\nfrom chainer_chemistry.links.update.relgat_update import RelGATUpdate\nfrom chainer_chemistry.links.update.relgcn_update import RelGCNUpdate\nfrom chainer_chemistry.links.update.rsgcn_update import RSGCNUpdate\nfrom chainer_chemistry.models.gwm.gwm_graph_conv_model import GWMGraphConvModel\n\n\natom_size = 5\nsuper_dim = 7\nin_channels = 6\nout_dim = 4\nbatch_size = 2\nn_edge_types = 3\n\n# TODO(nakago): SchNetUpdate need `in_channels` kwargs, not supported.\nupdates_2dim = [GINUpdate, RSGCNUpdate]\n# TODO(nakago): Support MPNNUpdate.\nupdates_3dim = [GGNNUpdate, RelGATUpdate, RelGCNUpdate]\nupdates = updates_2dim + updates_3dim\n\n# TODO(nakago): MPNNReadout need to specify `in_channels` and not supported.\nreadouts = [GGNNReadout, NFPReadout, SchNetReadout]\nhidden_channels = [[6, 6, 6, 6], 6]\nuse_bn = [True, False]\nuse_weight_tying = [True, False]\n\nparams = list(itertools.product(\n    updates, readouts, hidden_channels, use_bn, use_weight_tying,\n))\n\n\n@pytest.fixture(params=params)\ndef plain_context(request):\n    update, readout, ch, bn, wt = request.param\n    if update in updates_3dim:\n        adj_type = 3\n    elif update in updates_2dim:\n        adj_type = 2\n    else:\n        raise ValueError\n    data = make_data(adj_type)\n    model = make_model(update, readout, ch, bn, wt)\n    return model, data\n\n\n@pytest.fixture(params=params)\ndef gwm_context(request):\n    update, readout, ch, bn, wt = request.param\n    if 
update in updates_3dim:\n        adj_type = 3\n    elif update in updates_2dim:\n        adj_type = 2\n    else:\n        raise ValueError\n    data = make_data(adj_type)\n    model = make_gwm_model(update, readout, ch, bn, wt)\n    return model, data\n\n\ndef make_model(update, readout, ch, bn, wt):\n    # print('update', update, 'readout', readout, 'ch', ch, 'bn', bn, 'wt', wt)\n    return GWMGraphConvModel(\n        update_layer=update, readout_layer=readout, n_update_layers=3,\n        hidden_channels=ch, n_edge_types=n_edge_types, weight_tying=wt,\n        out_dim=out_dim, with_gwm=False, use_batchnorm=bn)\n\n\ndef make_gwm_model(update, readout, ch, bn, wt):\n    return GWMGraphConvModel(\n        update_layer=update, readout_layer=readout, n_update_layers=3,\n        hidden_channels=ch, n_edge_types=n_edge_types, weight_tying=wt,\n        super_node_dim=super_dim, out_dim=out_dim, with_gwm=True,\n        use_batchnorm=bn)\n\n\ndef make_data(adj_type):\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)\n    ).astype(numpy.int32)\n    if adj_type == 2:\n        adj_data = numpy.random.randint(\n            0, high=2, size=(batch_size, atom_size, atom_size)\n        ).astype(numpy.float32)\n    elif adj_type == 3:\n        adj_data = numpy.random.randint(\n            0, high=2, size=(batch_size, n_edge_types, atom_size, atom_size)\n        ).astype(numpy.float32)\n    else:\n        raise ValueError\n    super_data = numpy.random.uniform(-1, 1, (batch_size, super_dim)\n                                      ).astype(numpy.float32)\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, out_dim)).astype(numpy.float32)\n    return atom_data, adj_data, super_data, y_grad\n\n\ndef test_plain_model_forward(plain_context):\n    model, data = plain_context\n    atom_array = data[0]\n    adj = data[1]\n    y_actual = model(atom_array, adj)\n    assert y_actual.shape == (batch_size, 
out_dim)\n    if model.weight_tying:\n        assert len(model.update_layers) == 1\n    else:\n        assert len(model.update_layers) == 3\n\n\ndef test_gwm_model_forward(gwm_context):\n    model, data = gwm_context\n    atom_array = data[0]\n    adj = data[1]\n    super_node = data[2]\n    y_actual = model(atom_array, adj, super_node)\n    assert y_actual.shape == (batch_size, out_dim)\n    if model.weight_tying:\n        assert len(model.update_layers) == 1\n    else:\n        assert len(model.update_layers) == 3\n\n\n# SchNet is not supported\nsp_params = list(itertools.product(\n    updates_2dim[:-1] + updates_3dim,\n    [[6, 6, 6, 6], [4, 4, 4, 4], [6, 5, 3, 4]],\n))\n\n\n@pytest.mark.parametrize(('update', 'ch'), sp_params)\ndef test_plain_model_forward_general_readout(\n        update, ch):\n    if update in updates_3dim:\n        adj_type = 3\n    elif update in updates_2dim:\n        adj_type = 2\n    else:\n        raise ValueError\n    data = make_data(adj_type)\n    model = GWMGraphConvModel(update_layer=update,\n                              readout_layer=GeneralReadout,\n                              hidden_channels=ch,\n                              out_dim=out_dim,\n                              n_edge_types=n_edge_types,\n                              with_gwm=False)\n    atom_array = data[0]\n    adj = data[1]\n    y_actual = model(atom_array, adj)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\n@pytest.mark.parametrize('update',\n                         updates_2dim[:-1] + updates_3dim)\ndef test_gwm_model_forward_general_readout(update):\n    if update in updates_3dim:\n        adj_type = 3\n    elif update in updates_2dim:\n        adj_type = 2\n    else:\n        raise ValueError\n    data = make_data(adj_type)\n    ch = [6, 6, 6, 6]\n    with pytest.raises(ValueError):\n        model = GWMGraphConvModel(update_layer=update,\n                                  readout_layer=GeneralReadout,\n                                  
hidden_channels=ch,\n                                  out_dim=out_dim,\n                                  n_edge_types=n_edge_types,\n                                  super_node_dim=super_dim,\n                                  with_gwm=True)\n    ch = [4, 4, 4, 4]\n    model = GWMGraphConvModel(update_layer=update,\n                              readout_layer=GeneralReadout,\n                              hidden_channels=ch,\n                              out_dim=out_dim,\n                              n_edge_types=n_edge_types,\n                              super_node_dim=super_dim,\n                              with_gwm=True)\n    atom_array = data[0]\n    adj = data[1]\n    super_node = data[2]\n    y_actual = model(atom_array, adj, super_node)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\np = list(itertools.product(updates_2dim[:-1] + updates_3dim, readouts,\n                           [True, False]))\n\n\n@pytest.mark.parametrize(('update', 'readout', 'gwm'), p)\ndef test_model_forward_general_weight_tying(update, readout, gwm):\n    if update in updates_3dim:\n        adj_type = 3\n    elif update in updates_2dim:\n        adj_type = 2\n    else:\n        raise ValueError\n    data = make_data(adj_type)\n    ch = [6, 7, 8, 6]\n    if gwm:\n        with pytest.raises(ValueError):\n            model = GWMGraphConvModel(update_layer=update,\n                                      readout_layer=GeneralReadout,\n                                      hidden_channels=ch,\n                                      out_dim=out_dim,\n                                      n_edge_types=n_edge_types,\n                                      super_node_dim=super_dim,\n                                      with_gwm=gwm)\n    else:\n        model = GWMGraphConvModel(update_layer=update,\n                                  readout_layer=GeneralReadout,\n                                  hidden_channels=ch,\n                                  out_dim=out_dim,\n          
                        n_edge_types=n_edge_types,\n                                  super_node_dim=super_dim,\n                                  with_gwm=gwm)\n        atom_array = data[0]\n        adj = data[1]\n        super_node = data[2]  # NOQA\n        y_actual = model(atom_array, adj)\n        assert y_actual.shape == (batch_size, out_dim)\n\n\n@pytest.mark.parametrize(('update', 'readout', 'gwm'), p)\ndef test_model_forward_general_concat_hidden(update, readout, gwm):\n    if update in updates_3dim:\n        adj_type = 3\n    elif update in updates_2dim:\n        adj_type = 2\n    else:\n        raise ValueError\n    data = make_data(adj_type)\n    ch = [6, 6, 6, 6]\n    model = GWMGraphConvModel(update_layer=update,\n                              readout_layer=readout,\n                              hidden_channels=ch,\n                              out_dim=out_dim,\n                              n_edge_types=n_edge_types,\n                              super_node_dim=super_dim,\n                              concat_hidden=True,\n                              with_gwm=gwm)\n    atom_array = data[0]\n    adj = data[1]\n    super_node = data[2]\n    y_actual = model(atom_array, adj, super_node)\n    assert y_actual.shape == (batch_size, out_dim * (len(ch) - 1))\n\n\n@pytest.mark.parametrize(('update', 'readout', 'gwm'), p)\ndef test_model_forward_general_sum_hidden(update, readout, gwm):\n    if update in updates_3dim:\n        adj_type = 3\n    elif update in updates_2dim:\n        adj_type = 2\n    else:\n        raise ValueError\n    data = make_data(adj_type)\n    ch = [6, 6, 6, 6]\n    model = GWMGraphConvModel(update_layer=update,\n                              readout_layer=readout,\n                              hidden_channels=ch,\n                              out_dim=out_dim,\n                              n_edge_types=n_edge_types,\n                              super_node_dim=super_dim,\n                              sum_hidden=True,\n         
                     with_gwm=gwm)\n    atom_array = data[0]\n    adj = data[1]\n    super_node = data[2]\n    y_actual = model(atom_array, adj, super_node)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\nif __name__ == '__main__':\n    # -x is to stop when first failed.\n    pytest.main([__file__, '-v', '-s', '-x'])\n"
  },
  {
    "path": "tests/models_tests/prediction_tests/test_base.py",
    "content": "import os\n\nimport chainer\nfrom chainer import cuda\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.models.prediction.base import BaseForwardModel\n\n\nclass DummyForwardModel(BaseForwardModel):\n\n    def __init__(self, device=-1, dummy_str='dummy'):\n        super(DummyForwardModel, self).__init__()\n        with self.init_scope():\n            self.l = chainer.links.Linear(3, 10)\n\n        self.dummy_str = dummy_str\n        self.initialize(device)\n\n    def __call__(self, x):\n        return self.l(x)\n\n\n# test `_forward` is done by `Classifier` and `Regressor` concrete class.\ndef _test_save_load_pickle(device, tmpdir):\n    model = DummyForwardModel(device=device, dummy_str='hoge')\n\n    filepath = os.path.join(str(tmpdir), 'model.pkl')\n    model.save_pickle(filepath)\n    model_load = DummyForwardModel.load_pickle(filepath, device=device)\n\n    # --- check model class ---\n    assert isinstance(model_load, DummyForwardModel)\n    # --- check model attribute is same ---\n    assert model_load.dummy_str == model.dummy_str\n    assert model_load.dummy_str == 'hoge'\n    assert model_load.device == chainer.get_device(device)\n\n    # --- check model parameter is same ---\n    params = model.namedparams()\n    params_load = dict(model_load.namedparams())\n    for k, v in params:\n        v_load = params_load[k]\n        assert cuda.get_device_from_array(v_load.data).id == device\n        assert numpy.allclose(cuda.to_cpu(v.data), cuda.to_cpu(v_load.data))\n\n\ndef test_save_load_pickle_cpu(tmpdir):\n    _test_save_load_pickle(device=-1, tmpdir=tmpdir)\n\n\n@pytest.mark.gpu\ndef test_save_load_pickle_gpu(tmpdir):\n    _test_save_load_pickle(device=0, tmpdir=tmpdir)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/prediction_tests/test_classifier.py",
    "content": "import mock\nimport numpy\nimport pytest\n\nimport chainer\nfrom chainer import cuda\nfrom chainer import functions\nfrom chainer import links\nfrom chainer import reporter\n\nfrom chainer_chemistry.models.prediction.classifier import Classifier\n\n\n# testing.parameterize takes a list of dictionaries.\n# Currently, we cannot set a function to the value of the dictionaries.\n# As a workaround, we wrap the function and invoke it in __call__ method.\n# See issue #1337 for detail.\nclass AccuracyWithIgnoreLabel(object):\n\n    def __call__(self, y, t):\n        return functions.accuracy(y, t, ignore_label=1)\n\n\nclass DummyPredictor(chainer.Chain):\n    def __call__(self, x):\n        return x\n\n\n@pytest.mark.parametrize(\n    'metrics_fun', [AccuracyWithIgnoreLabel(), None,\n                    {'user_key': AccuracyWithIgnoreLabel()}])\n@pytest.mark.parametrize('compute_metrics', [True, False])\nclass TestClassifier(object):\n\n    def setup_method(self, method):\n        self.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)\n        self.t = numpy.random.randint(3, size=5).astype(numpy.int32)\n        self.y = numpy.random.uniform(-1, 1, (5, 7)).astype(numpy.float32)\n\n    def check_call(\n            self, gpu, label_key, args, kwargs, model_args, model_kwargs,\n            metrics_fun, compute_metrics):\n        init_kwargs = {'label_key': label_key}\n        if metrics_fun is not None:\n            init_kwargs['metrics_fun'] = metrics_fun\n        link = Classifier(chainer.Link(), **init_kwargs)\n\n        if gpu:\n            xp = cuda.cupy\n            link.to_gpu()\n        else:\n            xp = numpy\n\n        link.compute_metrics = compute_metrics\n\n        y = chainer.Variable(self.y)\n        link.predictor = mock.MagicMock(return_value=y)\n\n        loss = link(*args, **kwargs)\n        link.predictor.assert_called_with(*model_args, **model_kwargs)\n\n        assert hasattr(link, 'y')\n        assert link.y is not 
None\n\n        assert hasattr(link, 'loss')\n        xp.testing.assert_allclose(link.loss.data, loss.data)\n\n        assert hasattr(link, 'metrics')\n        if compute_metrics:\n            assert link.metrics is not None\n        else:\n            assert link.metrics is None\n\n    def test_call_cpu(self, metrics_fun, compute_metrics):\n        self.check_call(\n            False, -1, (self.x, self.t), {}, (self.x,), {},\n            metrics_fun, compute_metrics)\n\n    def test_call_three_args_cpu(self, metrics_fun, compute_metrics):\n        self.check_call(\n            False, -1, (self.x, self.x, self.t), {}, (self.x, self.x), {},\n            metrics_fun, compute_metrics)\n\n    def test_call_positive_cpu(self, metrics_fun, compute_metrics):\n        self.check_call(\n            False, 2, (self.x, self.x, self.t), {}, (self.x, self.x), {},\n            metrics_fun, compute_metrics)\n\n    def test_call_kwargs_cpu(self, metrics_fun, compute_metrics):\n        self.check_call(\n            False, 't', (self.x,), {'t': self.t}, (self.x,), {},\n            metrics_fun, compute_metrics)\n\n    def test_call_no_arg_cpu(self, metrics_fun, compute_metrics):\n        self.check_call(\n            False, 0, (self.t,), {}, (), {},\n            metrics_fun, compute_metrics)\n\n    @pytest.mark.gpu\n    def test_call_gpu(self, metrics_fun, compute_metrics):\n        self.to_gpu()\n        self.check_call(\n            True, -1, (self.x, self.t), {}, (self.x,), {},\n            metrics_fun, compute_metrics)\n\n    @pytest.mark.gpu\n    def test_call_three_args_gpu(self, metrics_fun, compute_metrics):\n        self.to_gpu()\n        self.check_call(\n            True, -1, (self.x, self.x, self.t), {}, (self.x, self.x), {},\n            metrics_fun, compute_metrics)\n\n    @pytest.mark.gpu\n    def test_call_positive_gpu(self, metrics_fun, compute_metrics):\n        self.to_gpu()\n        self.check_call(\n            True, 2, (self.x, self.x, self.t), {}, (self.x, 
self.x), {},\n            metrics_fun, compute_metrics)\n\n    @pytest.mark.gpu\n    def test_call_kwargs_gpu(self, metrics_fun, compute_metrics):\n        self.to_gpu()\n        self.check_call(\n            True, 't', (self.x,), {'t': self.t}, (self.x,), {},\n            metrics_fun, compute_metrics)\n\n    @pytest.mark.gpu\n    def test_call_no_arg_gpu(self, metrics_fun, compute_metrics):\n        self.to_gpu()\n        self.check_call(\n            True, 0, (self.t,), {}, (), {}, metrics_fun, compute_metrics)\n\n    def to_gpu(self):\n        self.x = cuda.to_gpu(self.x)\n        self.t = cuda.to_gpu(self.t)\n        self.y = cuda.to_gpu(self.y)\n\n    def test_report_key(self, metrics_fun, compute_metrics):\n        repo = chainer.Reporter()\n\n        link = Classifier(predictor=DummyPredictor(),\n                          metrics_fun=metrics_fun)\n        link.compute_metrics = compute_metrics\n        repo.add_observer('target', link)\n        with repo:\n            observation = {}\n            with reporter.report_scope(observation):\n                link(self.x, self.t)\n\n        # print('observation ', observation)\n        actual_keys = set(observation.keys())\n        if compute_metrics:\n            if metrics_fun is None:\n                assert set(['target/loss']) == actual_keys\n            elif isinstance(metrics_fun, dict):\n                assert set(['target/loss', 'target/user_key']) == actual_keys\n            elif callable(metrics_fun):\n                assert set(['target/loss', 'target/accuracy']) == actual_keys\n            else:\n                raise TypeError()\n        else:\n            assert set(['target/loss']) == actual_keys\n\n\nclass TestInvalidArgument(object):\n\n    @classmethod\n    def setup_class(cls):\n        cls.link = Classifier(links.Linear(10, 3))\n        cls.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)\n\n    def check_invalid_argument(self):\n        x = 
chainer.Variable(self.link.xp.asarray(self.x))\n        with pytest.raises(TypeError):\n            # link.__call__ raises TypeError as the number of arguments\n            # is illegal\n            self.link(x)\n\n    def test_invalid_argument_cpu(self):\n        self.check_invalid_argument()\n\n    @pytest.mark.gpu\n    def test_invalid_argument_gpu(self):\n        self.link.to_gpu()\n        self.check_invalid_argument()\n\n\nclass TestInvalidLabelKey(object):\n\n    @classmethod\n    def setup_class(cls):\n        cls.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)\n\n    def test_invalid_label_key_type(self):\n        with pytest.raises(TypeError):\n            Classifier(links.Linear(10, 3), label_key=None)\n\n    def check_invalid_key(self, gpu, label_key):\n        link = Classifier(links.Linear(10, 3), label_key=label_key)\n        if gpu:\n            link.to_gpu()\n        x = chainer.Variable(link.xp.asarray(self.x))\n        with pytest.raises(ValueError):\n            link(x)\n\n    def test_invalid_index_cpu(self):\n        self.check_invalid_key(False, 1)\n\n    @pytest.mark.gpu\n    def test_invalid_argument_gpu(self):\n        self.check_invalid_key(True, 1)\n\n    def test_invalid_index_too_small_cpu(self):\n        self.check_invalid_key(False, -2)\n\n    @pytest.mark.gpu\n    def test_invalid_index_too_small_gpu(self):\n        self.check_invalid_key(True, -2)\n\n    def test_invalid_str_key_cpu(self):\n        self.check_invalid_key(False, 't')\n\n    @pytest.mark.gpu\n    def test_invalid_str_key_gpu(self):\n        self.check_invalid_key(True, 't')\n\n\nclass TestClassifierPrediction(object):\n\n    @classmethod\n    def setup_class(cls):\n        cls.predictor = DummyPredictor()\n        cls.x = numpy.array([[0., 1.], [-1., -2.], [4., 0.]],\n                            dtype=numpy.float32)\n        cls.t = numpy.array([1, 0, 0], dtype=numpy.int32)\n\n    def test_predict_cpu(self):\n        clf = Classifier(self.predictor)\n  
      actual_t = clf.predict(self.x)\n        assert actual_t.shape == (3,)\n        assert actual_t.dtype == numpy.int32\n        assert numpy.alltrue(actual_t == self.t)\n\n    @pytest.mark.gpu\n    def test_predict_gpu(self):\n        clf = Classifier(self.predictor, device=0)\n        actual_t = clf.predict(self.x)\n        assert numpy.alltrue(actual_t == self.t)\n\n    def check_predict_proba(self, device):\n        clf = Classifier(self.predictor, device=device)\n        actual_y = clf.predict_proba(self.x)\n        assert actual_y.shape == (3, 2)\n        assert actual_y.dtype == numpy.float32\n        assert numpy.alltrue(0 <= actual_y)\n        assert numpy.alltrue(actual_y <= 1.)\n\n        actual_t = numpy.argmax(actual_y, axis=1)\n        assert numpy.alltrue(actual_t == self.t)\n\n    def test_predict_proba_cpu(self):\n        self.check_predict_proba(-1)\n\n    @pytest.mark.gpu\n    def test_predict_proba_gpu(self):\n        self.check_predict_proba(0)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/prediction_tests/test_graph_conv_predictor.py",
    "content": "from typing import Tuple  # NOQA\n\nfrom chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.models.ggnn import GGNN\nfrom chainer_chemistry.models.mlp import MLP\nfrom chainer_chemistry.models.prediction import GraphConvPredictor\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nclass_num = 7\nn_unit = 11\nout_dim = 4\nbatch_size = 2\nn_edge_types = 3\n\n\n@pytest.fixture\ndef model():\n    # type: () -> GraphConvPredictor\n    mlp = MLP(out_dim=class_num, hidden_dim=n_unit)\n    ggnn = GGNN(\n        out_dim=out_dim, hidden_channels=n_unit, n_edge_types=n_edge_types)\n    return GraphConvPredictor(ggnn, mlp)\n\n\n@pytest.fixture\ndef data():\n    # type: () -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, n_edge_types, atom_size,\n                         atom_size)).astype('f')\n    y_grad = numpy.random.uniform(-1, 1, (batch_size, class_num)).astype('f')\n    return atom_data, adj_data, y_grad\n\n\ndef check_forward(model, atom_data, adj_data):\n    # type: (GraphConvPredictor, numpy.ndarray, numpy.ndarray) -> None\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, class_num)\n\n\ndef test_forward_cpu(model, data):\n    # type: (GraphConvPredictor, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    check_forward(model, *data[:2])\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    # type: (GraphConvPredictor, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = map(cuda.to_gpu, data[:2])\n    
model.to_gpu()\n    check_forward(model, atom_data, adj_data)\n\n\ndef test_backward_cpu(model, data):\n    # type: (GraphConvPredictor, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data, y_grad = data\n    gradient_check.check_backward(\n        model, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(model, data):\n    # type: (GraphConvPredictor, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data, y_grad = map(cuda.to_gpu, data)\n    model.to_gpu()\n    gradient_check.check_backward(\n        model, (atom_data, adj_data), y_grad, atol=1e-3, rtol=1e-3)\n\n\ndef test_forward_cpu_graph_invariant(model, data):\n    # type: (GraphConvPredictor, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = data[:2]\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(\n        model(permute_atom_data, permute_adj_data).data)\n    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s', '-x'])\n"
  },
  {
    "path": "tests/models_tests/prediction_tests/test_regressor.py",
    "content": "import mock\nimport numpy\nimport pytest\n\nimport chainer\nfrom chainer import cuda\nfrom chainer import links\nfrom chainer import reporter\n\nfrom chainer_chemistry.models.prediction.regressor import Regressor\n\n\nclass DummyPredictor(chainer.Chain):\n    def __call__(self, x):\n        return 2 * x\n\n\n@pytest.mark.parametrize(\n    'metrics_fun', [None, chainer.functions.mean_absolute_error,\n                    {'user_key': chainer.functions.mean_absolute_error}])\n@pytest.mark.parametrize('compute_metrics', [True, False])\nclass TestRegressor(object):\n\n    def setup_method(self, method):\n        self.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)\n        self.t = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)\n        self.y = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)\n\n    def check_call(\n            self, gpu, label_key, args, kwargs, model_args, model_kwargs,\n            metrics_fun, compute_metrics):\n        init_kwargs = {'label_key': label_key}\n        if metrics_fun is not None:\n            init_kwargs['metrics_fun'] = metrics_fun\n        link = Regressor(chainer.Link(), **init_kwargs)\n\n        if gpu:\n            xp = cuda.cupy\n            link.to_gpu()\n        else:\n            xp = numpy\n\n        link.compute_metrics = compute_metrics\n\n        y = chainer.Variable(self.y)\n        link.predictor = mock.MagicMock(return_value=y)\n\n        loss = link(*args, **kwargs)\n        link.predictor.assert_called_with(*model_args, **model_kwargs)\n\n        assert hasattr(link, 'y')\n        assert link.y is not None\n\n        assert hasattr(link, 'loss')\n        xp.testing.assert_allclose(link.loss.data, loss.data)\n\n        assert hasattr(link, 'metrics')\n        if compute_metrics:\n            assert link.metrics is not None\n        else:\n            assert link.metrics is None\n\n    def test_call_cpu(self, metrics_fun, compute_metrics):\n        
self.check_call(\n            False, -1, (self.x, self.t), {}, (self.x,), {},\n            metrics_fun, compute_metrics)\n\n    def test_call_three_args_cpu(self, metrics_fun, compute_metrics):\n        self.check_call(\n            False, -1, (self.x, self.x, self.t), {}, (self.x, self.x), {},\n            metrics_fun, compute_metrics)\n\n    def test_call_positive_cpu(self, metrics_fun, compute_metrics):\n        self.check_call(\n            False, 2, (self.x, self.x, self.t), {}, (self.x, self.x), {},\n            metrics_fun, compute_metrics)\n\n    def test_call_kwargs_cpu(self, metrics_fun, compute_metrics):\n        self.check_call(\n            False, 't', (self.x,), {'t': self.t}, (self.x,), {},\n            metrics_fun, compute_metrics)\n\n    def test_call_no_arg_cpu(self, metrics_fun, compute_metrics):\n        self.check_call(\n            False, 0, (self.t,), {}, (), {},\n            metrics_fun, compute_metrics)\n\n    @pytest.mark.gpu\n    def test_call_gpu(self, metrics_fun, compute_metrics):\n        self.to_gpu()\n        self.check_call(\n            True, -1, (self.x, self.t), {}, (self.x,), {},\n            metrics_fun, compute_metrics)\n\n    @pytest.mark.gpu\n    def test_call_three_args_gpu(self, metrics_fun, compute_metrics):\n        self.to_gpu()\n        self.check_call(\n            True, -1, (self.x, self.x, self.t), {}, (self.x, self.x), {},\n            metrics_fun, compute_metrics)\n\n    @pytest.mark.gpu\n    def test_call_positive_gpu(self, metrics_fun, compute_metrics):\n        self.to_gpu()\n        self.check_call(\n            True, 2, (self.x, self.x, self.t), {}, (self.x, self.x), {},\n            metrics_fun, compute_metrics)\n\n    @pytest.mark.gpu\n    def test_call_kwargs_gpu(self, metrics_fun, compute_metrics):\n        self.to_gpu()\n        self.check_call(\n            True, 't', (self.x,), {'t': self.t}, (self.x,), {},\n            metrics_fun, compute_metrics)\n\n    @pytest.mark.gpu\n    def 
test_call_no_arg_gpu(self, metrics_fun, compute_metrics):\n        self.to_gpu()\n        self.check_call(\n            True, 0, (self.t,), {}, (), {}, metrics_fun, compute_metrics)\n\n    def to_gpu(self):\n        self.x = cuda.to_gpu(self.x)\n        self.t = cuda.to_gpu(self.t)\n        self.y = cuda.to_gpu(self.y)\n\n    def test_report_key(self, metrics_fun, compute_metrics):\n        repo = chainer.Reporter()\n\n        link = Regressor(predictor=DummyPredictor(),\n                         metrics_fun=metrics_fun)\n        link.compute_metrics = compute_metrics\n        repo.add_observer('target', link)\n        with repo:\n            observation = {}\n            with reporter.report_scope(observation):\n                link(self.x, self.t)\n\n        # print('observation ', observation)\n        actual_keys = set(observation.keys())\n        if compute_metrics:\n            if metrics_fun is None:\n                assert set(['target/loss']) == actual_keys\n            elif isinstance(metrics_fun, dict):\n                assert set(['target/loss', 'target/user_key']) == actual_keys\n            elif callable(metrics_fun):\n                assert set(['target/loss', 'target/metrics']) == actual_keys\n            else:\n                raise TypeError()\n        else:\n            assert set(['target/loss']) == actual_keys\n\n\nclass TestInvalidArgument(object):\n\n    @classmethod\n    def setup_class(cls):\n        cls.link = Regressor(links.Linear(10, 3))\n        cls.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)\n\n    def check_invalid_argument(self):\n        x = chainer.Variable(self.link.xp.asarray(self.x))\n        with pytest.raises(TypeError):\n            # link.__call__ raises TypeError as the number of arguments\n            # is illegal\n            self.link(x)\n\n    def test_invalid_argument_cpu(self):\n        self.check_invalid_argument()\n\n    @pytest.mark.gpu\n    def test_invalid_argument_gpu(self):\n        
self.link.to_gpu()\n        self.check_invalid_argument()\n\n\nclass TestInvalidLabelKey(object):\n\n    @classmethod\n    def setup_class(cls):\n        cls.x = numpy.random.uniform(-1, 1, (5, 10)).astype(numpy.float32)\n\n    def test_invalid_label_key_type(self):\n        with pytest.raises(TypeError):\n            Regressor(links.Linear(10, 3), label_key=None)\n\n    def check_invalid_key(self, gpu, label_key):\n        link = Regressor(links.Linear(10, 3), label_key=label_key)\n        if gpu:\n            link.to_gpu()\n        x = chainer.Variable(link.xp.asarray(self.x))\n        with pytest.raises(ValueError):\n            link(x)\n\n    def test_invalid_index_cpu(self):\n        self.check_invalid_key(False, 1)\n\n    @pytest.mark.gpu\n    def test_invalid_argument_gpu(self):\n        self.check_invalid_key(True, 1)\n\n    def test_invalid_index_too_small_cpu(self):\n        self.check_invalid_key(False, -2)\n\n    @pytest.mark.gpu\n    def test_invalid_index_too_small_gpu(self):\n        self.check_invalid_key(True, -2)\n\n    def test_invalid_str_key_cpu(self):\n        self.check_invalid_key(False, 't')\n\n    @pytest.mark.gpu\n    def test_invalid_str_key_gpu(self):\n        self.check_invalid_key(True, 't')\n\n\nclass TestRegressorPrediction(object):\n\n    @classmethod\n    def setup_class(cls):\n        cls.predictor = DummyPredictor()\n        cls.x = numpy.array([[0., 1.], [-1., -2.], [4., 0.]],\n                            dtype=numpy.float32)\n        cls.t = cls.x * 2\n\n    def test_predict_cpu(self):\n        clf = Regressor(self.predictor)\n        actual_t = clf.predict(self.x)\n        assert actual_t.shape == (3, 2)\n        assert actual_t.dtype == numpy.float32\n        assert numpy.alltrue(actual_t == self.t)\n\n    @pytest.mark.gpu\n    def test_predict_gpu(self):\n        clf = Regressor(self.predictor, device=0)\n        actual_t = clf.predict(self.x)\n        assert numpy.alltrue(actual_t == self.t)\n\n\nif __name__ == 
'__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/prediction_tests/test_set_up_predictor.py",
    "content": "from typing import Dict  # NOQA\n\nimport chainer  # NOQA\nimport pytest\n\nfrom chainer_chemistry.models.ggnn import GGNN\nfrom chainer_chemistry.models.gin import GIN\nfrom chainer_chemistry.models.gnn_film import GNNFiLM\nfrom chainer_chemistry.models.nfp import NFP\nfrom chainer_chemistry.models.prediction.graph_conv_predictor import GraphConvPredictor  # NOQA\nfrom chainer_chemistry.models.prediction.set_up_predictor import set_up_predictor  # NOQA\nfrom chainer_chemistry.models.relgat import RelGAT\nfrom chainer_chemistry.models.relgcn import RelGCN\nfrom chainer_chemistry.models.rsgcn import RSGCN\nfrom chainer_chemistry.models.schnet import SchNet\nfrom chainer_chemistry.models.weavenet import WeaveNet\n\nfrom chainer_chemistry.models.gwm.gwm_net import GGNN_GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import GIN_GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import NFP_GWM  # NOQA\nfrom chainer_chemistry.models.gwm.gwm_net import RSGCN_GWM  # NOQA\n\nfrom chainer_chemistry.models.cwle.cwle_net import GGNN_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import RelGAT_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import RelGCN_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import GIN_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import NFP_CWLE  # NOQA\nfrom chainer_chemistry.models.cwle.cwle_net import RSGCN_CWLE  # NOQA\n\nfrom chainer_chemistry.models.gwle.gwle_net import GGNN_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import RelGAT_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import RelGCN_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import GIN_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import NFP_GWLE  # NOQA\nfrom chainer_chemistry.models.gwle.gwle_net import RSGCN_GWLE  # NOQA\n\n\nclass_num = 7\nn_unit = 11\nconv_layers = 3\n\n\n@pytest.fixture\ndef models_dict():\n    # type: () -> Dict[str, chainer.Link]\n    return 
{\n        'nfp': NFP,\n        'ggnn': GGNN,\n        'schnet': SchNet,\n        'weavenet': WeaveNet,\n        'rsgcn': RSGCN,\n        'relgcn': RelGCN,\n        'relgat': RelGAT,\n        'gin': GIN,\n        'nfp_gwm': NFP_GWM,\n        'ggnn_gwm': GGNN_GWM,\n        'rsgcn_gwm': RSGCN_GWM,\n        'gin_gwm': GIN_GWM,\n        'gnnfilm': GNNFiLM,\n        'nfp_wle': NFP,\n        'ggnn_wle': GGNN,\n        'relgat_wle': RelGAT,\n        'relgcn_wle': RelGCN,\n        'rsgcn_wle': RSGCN,\n        'gin_wle': GIN,\n        'nfp_cwle': NFP_CWLE,\n        'ggnn_cwle': GGNN_CWLE,\n        'relgat_cwle': RelGAT_CWLE,\n        'relgcn_cwle': RelGCN_CWLE,\n        'rsgcn_cwle': RSGCN_CWLE,\n        'gin_cwle': GIN_CWLE,\n        'nfp_gwle': NFP_GWLE,\n        'ggnn_gwle': GGNN_GWLE,\n        'relgat_gwle': RelGAT_GWLE,\n        'relgcn_gwle': RelGCN_GWLE,\n        'rsgcn_gwle': RSGCN_GWLE,\n        'gin_gwle': GIN_GWLE\n    }\n\n\ndef test_setup_predictor(models_dict):\n    # type: (Dict[str, chainer.Link]) -> None\n    for method, instance in models_dict.items():\n        predictor = set_up_predictor(\n            method=method,\n            n_unit=n_unit,\n            conv_layers=conv_layers,\n            class_num=class_num)\n        assert isinstance(predictor.graph_conv, instance)\n        assert isinstance(predictor, GraphConvPredictor)\n\n\ndef test_call_invalid_model():\n    # type: () -> None\n    with pytest.raises(ValueError):\n        set_up_predictor(\n            method='invalid',\n            n_unit=n_unit,\n            conv_layers=conv_layers,\n            class_num=class_num)\n\n\ndef test_set_up_predictor_with_conv_kwargs():\n    # type: () -> None\n    predictor = set_up_predictor(\n        method='nfp',\n        n_unit=n_unit,\n        conv_layers=conv_layers,\n        class_num=class_num,\n        conv_kwargs={\n            'max_degree': 4,\n            'concat_hidden': True\n        })\n    assert predictor.graph_conv.max_degree == 4\n    assert 
predictor.graph_conv.concat_hidden is True\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/test_cgcnn.py",
    "content": "from chainer import cuda\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.models.cgcnn import CGCNN\n\n\n# node_size_list means the first moleculae has three nodes,\n# and the seconde molecule has five nodes\nnode_size_list = [3, 5]\nmax_num_nbr = 6\nnode_feature_dim = 5\nedge_feature_dim = 10\nout_dim = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef model():\n    return CGCNN(out_dim=out_dim)\n\n\n@pytest.fixture\ndef data():\n    if len(node_size_list) != batch_size:\n        raise ValueError(\"Invalid fixture data for CGCNN\")\n\n    numpy.random.seed(0)\n    total_node_size = sum(node_size_list)\n    # one-hot vector\n    atom_feat = numpy.random.choice(\n        [0, 1], (total_node_size, node_feature_dim)).astype(numpy.float32)\n    nbr_feat = numpy.random.rand(total_node_size, max_num_nbr,\n                                 edge_feature_dim).astype(numpy.float32)\n    # atom_idx & nbr_idx\n    curr_idx = 0\n    atom_idx = []\n    nbr_idx = []\n    for val in node_size_list:\n        atom_idx.append(numpy.arange(curr_idx, val))\n        for _ in range(val):\n            max_val = curr_idx + val\n            nbr_idx.append(numpy.random.randint(curr_idx,\n                                                max_val, max_num_nbr))\n        curr_idx += val\n    atom_idx = numpy.asarray(atom_idx)\n    nbr_idx = numpy.array(nbr_idx, dtype=numpy.int32)\n\n    y_grad = numpy.random.uniform(-1, 1,\n                                  (batch_size, out_dim)).astype(numpy.float32)\n    return atom_feat, nbr_feat, atom_idx, nbr_idx, y_grad\n\n\ndef check_forward(model, data):\n    y_actual = cuda.to_cpu(model(*data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(model, data):\n    atom_feat, nbr_feat, atom_idx, nbr_idx = data[:-1]\n    check_forward(model, (atom_feat, nbr_feat, atom_idx, nbr_idx))\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    atom_feat, nbr_feat, atom_idx, nbr_idx = data[:-1]\n    # atom_idx 
is list format... use numpy array\n    input_data = (cuda.to_gpu(atom_feat), cuda.to_gpu(nbr_feat),\n                  atom_idx, cuda.to_gpu(nbr_idx))\n    model.to_gpu()\n    check_forward(model, tuple(input_data))\n\n\n# def test_backward_cpu(model, data):\n#     input_data, y_grad = data[0:-1], data[-1]\n#     gradient_check.check_backward(model, tuple(input_data), y_grad,\n#                                   atol=5e-1, rtol=1e-1)\n\n\n# @pytest.mark.gpu\n# def test_backward_gpu(model, data):\n#     atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]\n#     model.to_gpu()\n#     gradient_check.check_backward(model, (atom_data, adj_data), y_grad,\n#                                   atol=5e-1, rtol=1e-1)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/models_tests/test_ggnn.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.models.ggnn import GGNN\nfrom chainer_chemistry.models.ggnn import SparseGGNN\nfrom chainer_chemistry.utils.extend import extend_node, extend_adj  # NOQA\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\nfrom chainer_chemistry.utils.sparse_utils import _convert_to_sparse\nfrom chainer_chemistry.utils.sparse_utils import sparse_utils_available\n\natom_size = 5\nout_dim = 4\nbatch_size = 2\nn_edge_types = 3\n\n\n@pytest.fixture\ndef model():\n    numpy.random.seed(0)\n    return GGNN(out_dim=out_dim, n_edge_types=n_edge_types)\n\n\n@pytest.fixture\ndef sparse_model():\n    numpy.random.seed(0)\n    return SparseGGNN(out_dim=out_dim, n_edge_types=n_edge_types)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)\n    ).astype(numpy.int32)\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, n_edge_types, atom_size, atom_size)\n    ).astype(numpy.float32)\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, out_dim)).astype(numpy.float32)\n    return atom_data, adj_data, y_grad\n\n\ndef check_forward(model, *args):\n    numpy.random.seed(0)  # reset seed to initialize model params consistently\n    y_actual = cuda.to_cpu(model(*args).data)\n    assert y_actual.shape == (batch_size, out_dim)\n    return y_actual\n\n\ndef test_forward_cpu(model, sparse_model, data):\n    atom_data, adj_data = data[0], data[1]\n    y_dense = check_forward(model, atom_data, adj_data)\n    # test for sparse forward result is same with dense\n    if sparse_utils_available():\n        y_sparse = check_forward(sparse_model, atom_data,\n                                 
*_convert_to_sparse(adj_data))\n        numpy.testing.assert_allclose(\n            y_dense, y_sparse, atol=1e-4, rtol=1e-4)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, sparse_model, data):\n    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])\n    model.to_gpu()\n    check_forward(model, atom_data, adj_data)\n    if sparse_utils_available():\n        sparse_model.to_gpu()\n        check_forward(sparse_model, atom_data, *_convert_to_sparse(adj_data))\n\n\ndef test_backward_cpu(model, data):\n    atom_data, adj_data, y_grad = data\n    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,\n                                  atol=1e-3, rtol=1e-3)\n    # there is no backward test for sparse model, because there will be no\n    # gradient for input data.\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(model, data):\n    atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]\n    model.to_gpu()\n    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,\n                                  atol=1e-3, rtol=1e-3)\n\n\ndef test_forward_cpu_graph_invariant(model, data):\n    atom_data, adj_data = data[0], data[1]\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(model(\n        permute_atom_data, permute_adj_data).data)\n    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-5, atol=1e-6)\n\n\ndef test_forward_cpu_input_size_invariant(model, data):\n    atom_data, adj_data = data[0], data[1]\n    is_real_node = numpy.ones(atom_data.shape, dtype=numpy.float32)\n    y_actual = cuda.to_cpu(model(atom_array=atom_data, adj=adj_data,\n                                 is_real_node=is_real_node).data)\n\n    atom_data_ex = extend_node(atom_data, out_size=8)\n    adj_data_ex = 
extend_adj(adj_data, out_size=8)\n    is_real_node_ex = extend_node(is_real_node, out_size=8)\n    y_actual_ex = cuda.to_cpu(model(\n        atom_array=atom_data_ex, adj=adj_data_ex,\n        is_real_node=is_real_node_ex).data)\n    assert numpy.allclose(y_actual, y_actual_ex, rtol=1e-5, atol=1e-6)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/test_gin.py",
    "content": "from typing import Tuple  # NOQA\n\nimport chainer  # NOQA\nfrom chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.models.gin import GIN\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nout_dim = 4\nbatch_size = 3\n\n\n@pytest.fixture\ndef model():\n    # type: () -> GIN\n    return GIN(out_dim=out_dim, dropout_ratio=0)\n\n\n@pytest.fixture\ndef data():\n    # type: () -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, atom_size, atom_size)).astype('f')\n    y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype('f')\n    return atom_data, adj_data, y_grad\n\n\ndef check_forward(model, atom_data, adj_data):\n    # type: (GIN, numpy.ndarray, numpy.ndarray) -> None\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(model, data):\n    # type: (GIN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None\n    atom_data, adj_data = data[:2]\n    check_forward(model, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    # type: (GIN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None\n    atom_data, adj_data = map(cuda.to_gpu, data[:2])\n    model.to_gpu()\n    check_forward(model, atom_data, adj_data)\n\n\ndef test_backward_cpu(model, data):\n    # type: (GIN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None\n    atom_data, adj_data, y_grad = data\n    gradient_check.check_backward(\n        model, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)\n\n\n@pytest.mark.gpu\ndef 
test_backward_gpu(model, data):\n    # type: (GIN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None\n    atom_data, adj_data, y_grad = map(cuda.to_gpu, data)\n    model.to_gpu()\n    gradient_check.check_backward(\n        model, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)\n\n\ndef test_forward_cpu_graph_invariant(model, data):\n    # type: (GIN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None\n    atom_data, adj_data = data[:2]\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(\n        model(permute_atom_data, permute_adj_data).data)\n    numpy.testing.assert_allclose(\n        y_actual, permute_y_actual, rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/test_gnn_film.py",
    "content": "from typing import Tuple  # NOQA\n\nimport chainer  # NOQA\nfrom chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.models.gnn_film import GNNFiLM\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nout_dim = 4\nbatch_size = 3\nn_edge_types = 5\n\n\n@pytest.fixture\ndef model():\n    # type: () -> chainer.Chain\n    return GNNFiLM(out_dim=out_dim)\n\n\n@pytest.fixture\ndef data():\n    # type: () -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, n_edge_types, atom_size, atom_size)\n    ).astype('f')\n    y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype('f')\n    return atom_data, adj_data, y_grad\n\n\ndef check_forward(model, atom_data, adj_data):\n    # type: (chainer.Chain, numpy.ndarray, numpy.ndarray) -> None\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(model, data):\n    # type: (chainer.Chain, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = data[:2]\n    check_forward(model, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    # type: (chainer.Chain, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = map(cuda.to_gpu, data[:2])\n    model.to_gpu()\n    check_forward(model, atom_data, adj_data)\n\n\ndef test_backward_cpu(model, data):\n    # type: (chainer.Chain, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data, y_grad = data\n    
gradient_check.check_backward(\n        model, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(model, data):\n    # type: (chainer.Chain, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data, y_grad = map(cuda.to_gpu, data)\n    model.to_gpu()\n    gradient_check.check_backward(\n        model, (atom_data, adj_data), y_grad, atol=1e-2, rtol=1e-2)\n\n\ndef test_forward_cpu_graph_invariant(model, data):\n    # type: (chainer.Chain, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None  # NOQA\n    atom_data, adj_data = data[:2]\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(\n        model(permute_atom_data, permute_adj_data).data)\n    numpy.testing.assert_allclose(\n        y_actual, permute_y_actual, rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/test_megnet.py",
    "content": "from chainer import cuda\nimport numpy\nimport pytest\n\n\nfrom chainer_chemistry.models.megnet import MEGNet\n\n\n# node_size_list means the first molecule has six nodes,\n# and the second molecule has four nodes\nnode_size_list = [6, 4]\n# edge_size_list means the first molecule has eight edges,\n# and the second molecule has four edges\nedge_size_list = [8, 4]\nnode_feature_dim = 5\nedge_feature_dim = 10\nglobal_feature_dim = 2\nout_dim = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef model():\n    return MEGNet(out_dim=out_dim)\n\n\n@pytest.fixture\ndef data():\n    if len(node_size_list) != batch_size or len(edge_size_list) != batch_size:\n        raise ValueError(\"Invalid fixture data for MEGNet\")\n\n    numpy.random.seed(0)\n    total_node_size = sum(node_size_list)\n    total_edge_size = sum(edge_size_list)\n    atom_feat = numpy.random.rand(total_node_size,\n                                  node_feature_dim).astype(numpy.float32)\n    pair_feat = numpy.random.rand(total_edge_size,\n                                  edge_feature_dim).astype(numpy.float32)\n    global_feat = numpy.random.rand(batch_size,\n                                    global_feature_dim).astype(numpy.float32)\n\n    # atom idx\n    atom_idx = numpy.hstack([[i] * node_size_list[i]\n                             for i in range(batch_size)]).astype(numpy.int32)\n    # pair idx\n    pair_idx = numpy.hstack([[i] * edge_size_list[i]\n                             for i in range(batch_size)]).astype(numpy.int32)\n    # create start and end idx\n    edge_idx = []\n    acc_node_size = [sum(node_size_list[:i+1]) for i in range(batch_size)]\n    low = numpy.roll(acc_node_size + [0], 1)[0:batch_size+1]\n    high = numpy.array(acc_node_size)\n    for i in range(batch_size):\n        idx = [numpy.random.choice(numpy.arange(low[i], high[i]), 2,\n                                   replace=False)\n               for _ in range(edge_size_list[i])]\n        edge_idx.extend(idx)\n\n    
start_idx = numpy.array(edge_idx, dtype=numpy.int32)[:, 0]\n    end_idx = numpy.array(edge_idx, dtype=numpy.int32)[:, 1]\n\n    y_grad = numpy.random.uniform(-1, 1,\n                                  (batch_size, out_dim)).astype(numpy.float32)\n\n    return atom_feat, pair_feat, global_feat, \\\n        atom_idx, pair_idx, start_idx, end_idx, y_grad\n\n\ndef check_forward(model, data):\n    y_actual = cuda.to_cpu(model(*data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(model, data):\n    atom_feat, pair_feat, global_feat, \\\n        atom_idx, pair_idx, start_idx, end_idx = data[:-1]\n    check_forward(model, (atom_feat, pair_feat, global_feat, atom_idx,\n                          pair_idx, start_idx, end_idx))\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    input_data = [cuda.to_gpu(d) for d in data[:-1]]\n    model.to_gpu()\n    check_forward(model, tuple(input_data))\n\n\n# def test_backward_cpu(model, data):\n#     input_data, y_grad = data[0:-1], data[-1]\n#     gradient_check.check_backward(model, tuple(input_data), y_grad,\n#                                   atol=5e-1, rtol=1e-1)\n\n\n# @pytest.mark.gpu\n# def test_backward_gpu(model, data):\n#     data = [cuda.to_gpu(d) for d in data]\n#     input_data, y_grad = data[0:-1], data[-1]\n#     model.to_gpu()\n#     gradient_check.check_backward(model, tuple(input_data), y_grad,\n#                                   atol=5e-1, rtol=1e-1)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/models_tests/test_mlp.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.models.mlp import MLP\n\nbatch_size = 2\nhidden_dim = 16\nout_dim = 4\n\n\n@pytest.fixture\ndef model():\n    return MLP(out_dim=out_dim)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    hidden = numpy.random.rand(batch_size, hidden_dim).astype(numpy.float32)\n    y_grad = numpy.random.uniform(-1, 1, (batch_size, out_dim)).astype(\n        numpy.float32)\n    return hidden, y_grad\n\n\ndef check_forward(model, data):\n    y_actual = cuda.to_cpu(model(data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(model, data):\n    check_forward(model, data[0])\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    model.to_gpu()\n    check_forward(model, cuda.to_gpu(data[0]))\n\n\ndef test_mlp_assert_raises():\n    with pytest.raises(ValueError):\n        MLP(out_dim=out_dim, n_layers=-1)\n\n\ndef test_backward_cpu(model, data):\n    hidden, y_grad = data\n    gradient_check.check_backward(model, hidden, y_grad, atol=1e0, rtol=1e0)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(model, data):\n    hidden, y_grad = [cuda.to_gpu(d) for d in data]\n    model.to_gpu()\n    gradient_check.check_backward(model, hidden, y_grad, atol=1e0, rtol=1e0)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/models_tests/test_mpnn.py",
    "content": "from typing import List  # NOQA\nfrom typing import Tuple  # NOQA\n\nfrom chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.models.mpnn import MPNN\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nout_dim = 4\nbatch_size = 2\nnum_edge_type = 3\n\n\n@pytest.fixture(params=[('edgenet', 'set2set'), ('edgenet', 'ggnn'),\n                        ('ggnn', 'set2set'), ('ggnn', 'ggnn')])\ndef model(request):\n    # type: (pytest.fixture.SubRequest) -> MPNN\n    message_func, readout_func = request.param\n    return MPNN(\n        out_dim=out_dim,\n        n_edge_types=num_edge_type,\n        message_func=message_func,\n        readout_func=readout_func)\n\n\n@pytest.fixture\ndef data():\n    # type: () -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size,\n                                      atom_size)).astype(numpy.int32)\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, num_edge_type, atom_size,\n                         atom_size)).astype(numpy.float32)\n    y_grad = numpy.random.uniform(-1, 1,\n                                  (batch_size, out_dim)).astype(numpy.float32)\n    return atom_data, adj_data, y_grad\n\n\ndef check_forward(model, atom_data, adj_data):\n    # type: (MPNN, numpy.ndarray, numpy.ndarray) -> None\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(model, data):\n    # type: (MPNN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None\n    atom_data, adj_data = data[0], data[1]\n    check_forward(model, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    # type: 
(MPNN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None\n    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])\n    model.to_gpu()\n    check_forward(model, atom_data, adj_data)\n\n\ndef test_backward_cpu(model, data):\n    # type: (MPNN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None\n    atom_data, adj_data, y_grad = data\n    gradient_check.check_backward(\n        model, (atom_data, adj_data), y_grad, atol=1e-0, rtol=1e-0)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(model, data):\n    # type: (MPNN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None\n    atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]\n    model.to_gpu()\n    gradient_check.check_backward(\n        model, (atom_data, adj_data), y_grad, atol=1e-0, rtol=1e-0)\n\n\ndef test_forward_cpu_graph_invariant(model, data):\n    # type: (MPNN, Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]) -> None\n    if model.message_func == 'edgenet':\n        return\n        # Because EdgeNet uses NN for expanding edge vector dimension,\n        # graph invariant is not ensured.\n    atom_data, adj_data = data[0], data[1]\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(\n        model(permute_atom_data, permute_adj_data).data)\n    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-3, atol=1e-3)\n\n\ndef test_invalid_message_funcion():\n    # type: () -> None\n    with pytest.raises(ValueError):\n        MPNN(out_dim=out_dim, message_func='invalid')\n\n\ndef test_invalid_readout_funcion():\n    # type: () -> None\n    with pytest.raises(ValueError):\n        MPNN(out_dim=out_dim, readout_func='invalid')\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/test_nfp.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.models.nfp import NFP\nfrom chainer_chemistry.utils.extend import extend_adj, extend_node  # NOQA\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nout_dim = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef model():\n    return NFP(out_dim=out_dim)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)\n    ).astype(numpy.int32)\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, atom_size, atom_size)\n    ).astype(numpy.float32)\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, out_dim)).astype(numpy.float32)\n    return atom_data, adj_data, y_grad\n\n\ndef check_forward(model, atom_data, adj_data):\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(model, data):\n    atom_data, adj_data = data[0], data[1]\n    check_forward(model, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])\n    model.to_gpu()\n    check_forward(model, atom_data, adj_data)\n\n\n# TODO(nakago): check why tolerance is high\ndef test_backward_cpu(model, data):\n    atom_data, adj_data, y_grad = data\n    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,\n                                  atol=1e0, rtol=1e0)\n\n\n# TODO(nakago): check why tolerance is high\n@pytest.mark.gpu\ndef test_backward_gpu(model, data):\n    atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]\n    model.to_gpu()\n    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,\n                   
               atol=1e0, rtol=1e0)\n\n\ndef test_forward_cpu_graph_invariant(model, data):\n    atom_data, adj_data = data[0], data[1]\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(model(\n        permute_atom_data, permute_adj_data).data)\n    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-5, atol=1e-6)\n\n\ndef test_forward_cpu_input_size_invariant(model, data):\n    atom_data, adj_data = data[0], data[1]\n    is_real_node = numpy.ones(atom_data.shape, dtype=numpy.float32)\n    y_actual = cuda.to_cpu(model(\n        atom_array=atom_data, adj=adj_data,\n        is_real_node=is_real_node).data)\n\n    atom_data_ex = extend_node(atom_data, out_size=8)\n    adj_data_ex = extend_adj(adj_data, out_size=8)\n    is_real_node_ex = extend_node(is_real_node, out_size=8)\n\n    y_actual_ex = cuda.to_cpu(model(\n        atom_array=atom_data_ex, adj=adj_data_ex,\n        is_real_node=is_real_node_ex).data)\n    assert numpy.allclose(y_actual, y_actual_ex, rtol=1e-5, atol=1e-6)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/test_relgat.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.models.relgat import RelGAT\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nout_dim = 4\nbatch_size = 2\nnum_edge_type = 4\n\n\n@pytest.fixture(params=[True, False])\ndef model(request):\n    return RelGAT(out_dim=out_dim, concat_heads=request.param)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)\n    ).astype(numpy.int32)\n    adj_data = numpy.random.randint(\n        0, high=2, size=(batch_size, num_edge_type, atom_size, atom_size)\n    ).astype(numpy.float32)\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, out_dim)).astype(numpy.float32)\n    return atom_data, adj_data, y_grad\n\n\ndef check_forward(model, atom_data, adj_data):\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(model, data):\n    atom_data, adj_data = data[0], data[1]\n    check_forward(model, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])\n    model.to_gpu()\n    check_forward(model, atom_data, adj_data)\n\n\n# TODO(mottodora): check why tolerance is high\ndef test_backward_cpu(model, data):\n    atom_data, adj_data, y_grad = data\n    params = tuple(model.params())\n    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,\n                                  params=params, no_grads=[True, True],\n                                  atol=1e3, rtol=1e3)\n\n\n# TODO(nakago): check why tolerance is high\n@pytest.mark.gpu\ndef test_backward_gpu(model, data):\n    atom_data, adj_data, y_grad = 
[cuda.to_gpu(d) for d in data]\n    model.to_gpu()\n    params = tuple(model.params())\n    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,\n                                  params=params, no_grads=[True, True],\n                                  atol=1e3, rtol=1e3)\n\n\ndef test_forward_cpu_graph_invariant(model, data):\n    atom_data, adj_data = data[0], data[1]\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(model(\n        permute_atom_data, permute_adj_data).data)\n    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-5, atol=1e-6)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/test_relgcn.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.models.gwm.gwm_graph_conv_model import rescale_adj  # NOQA\nfrom chainer_chemistry.models.relgcn import RelGCN\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nout_ch = 4\nbatch_size = 2\nnum_edge_type = 4\n\n\n@pytest.fixture\ndef model():\n    return RelGCN(out_dim=out_ch)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)).astype('i')\n    adj_data = numpy.random.randint(\n        0, high=2,\n        size=(batch_size, num_edge_type, atom_size, atom_size)).astype('f')\n    y_grad = numpy.random.uniform(-1, 1, (batch_size, out_ch)).astype('f')\n    return atom_data, adj_data, y_grad\n\n\ndef check_forward(model, atom_data, adj_data):\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, out_ch)\n\n\ndef test_forward_cpu(model, data):\n    atom_data, adj_data = data[0], data[1]\n    check_forward(model, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])\n    model.to_gpu()\n    check_forward(model, atom_data, adj_data)\n\n\ndef test_backward_cpu(model, data):\n    atom_data, adj_data, y_grad = data\n    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,\n                                  atol=1e-3, rtol=1e-3)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(model, data):\n    atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]\n    model.to_gpu()\n    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,\n                                  atol=1e-3, rtol=1e-3)\n\n\ndef test_forward_cpu_invariant(model, 
data):\n    atom_data, adj_data = data[0], data[1]\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(model(\n        permute_atom_data, permute_adj_data).data)\n    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-5, atol=1e-5)\n\n\ndef test_rescale_adj(data):\n    adj = data[1]\n    numpy.testing.assert_allclose(rescale_adj(adj).data.sum(axis=(1, 2)),\n                                  numpy.ones((batch_size, atom_size)),\n                                  atol=1e-5, rtol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/test_rsgcn.py",
    "content": "import chainer\nfrom chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.links import NFPReadout\nfrom chainer_chemistry.models.rsgcn import RSGCN\nfrom chainer_chemistry.utils.extend import extend_adj\nfrom chainer_chemistry.utils.extend import extend_node\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nout_dim = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef model():\n    return RSGCN(out_dim=out_dim)\n\n\n@pytest.fixture\ndef model_no_dropout():\n    # To check backward gradient by `gradient_check`,\n    # we need to skip stochastic dropout function.\n    return RSGCN(out_dim=out_dim, dropout_ratio=0.)\n\n\n@pytest.fixture\ndef model_with_nfp():\n    return RSGCN(out_dim=out_dim,\n                 readout=NFPReadout(in_channels=out_dim, out_dim=out_dim))\n\n\n@pytest.fixture\ndef model_with_nfp_no_dropout():\n    return RSGCN(out_dim=out_dim,\n                 readout=NFPReadout(in_channels=out_dim, out_dim=out_dim),\n                 dropout_ratio=0.)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)\n    ).astype(numpy.int32)\n    # adj_data is symmetric matrix\n    adj_data = numpy.random.uniform(\n        0, high=1, size=(batch_size, atom_size, atom_size)\n    ).astype(numpy.float32)\n    adj_data = adj_data + adj_data.swapaxes(-1, -2)\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, out_dim)).astype(numpy.float32)\n    return atom_data, adj_data, y_grad\n\n\ndef check_forward(model, atom_data, adj_data):\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(model, data):\n    atom_data, adj_data = data[0], data[1]\n    
check_forward(model, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])\n    model.to_gpu()\n    check_forward(model, atom_data, adj_data)\n\n\ndef test_forward_cpu_with_nfp(model_with_nfp, data):\n    atom_data, adj_data = data[0], data[1]\n    check_forward(model_with_nfp, atom_data, adj_data)\n\n\ndef test_backward_cpu(model_no_dropout, data):\n    atom_data, adj_data, y_grad = data\n    if int(chainer.__version__[0]) <= 2:\n        # somehow the test fails with `params` when using chainer version 2...\n        # TODO(nakago): investigate why the test fails.\n        params = ()\n    else:\n        params = tuple(model_no_dropout.params())\n    # TODO(nakago): check why tolerance is high\n    gradient_check.check_backward(\n        model_no_dropout, (atom_data, adj_data), y_grad,\n        params=params,\n        atol=1e-1, rtol=1e-1, no_grads=[True, True])\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(model_no_dropout, data):\n    atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]\n    model_no_dropout.to_gpu()\n    if int(chainer.__version__[0]) <= 2:\n        # somehow the test fails with `params` when using chainer version 2...\n        # TODO(nakago): investigate why the test fails.\n        params = ()\n    else:\n        params = tuple(model_no_dropout.params())\n    # TODO(nakago): check why tolerance is high\n    gradient_check.check_backward(\n        model_no_dropout, (atom_data, adj_data), y_grad,\n        params=params,\n        atol=1e-1, rtol=1e-1, no_grads=[True, True])\n\n\ndef test_backward_cpu_with_nfp(model_with_nfp_no_dropout, data):\n    atom_data, adj_data, y_grad = data\n    if int(chainer.__version__[0]) <= 2:\n        params = ()\n    else:\n        params = tuple(model_with_nfp_no_dropout.params())\n    gradient_check.check_backward(\n        model_with_nfp_no_dropout, (atom_data, adj_data), y_grad,\n        params=params,\n        
atol=1e-4, rtol=1e-4, no_grads=[True, True])\n\n\ndef test_forward_cpu_graph_invariant(model, data):\n    # This RSGCN uses dropout, so we need to forward with test mode\n    # to remove stochastic calculation.\n    atom_data, adj_data = data[0], data[1]\n    with chainer.using_config('train', False):\n        y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    with chainer.using_config('train', False):\n        permute_y_actual = cuda.to_cpu(model(\n            permute_atom_data, permute_adj_data).data)\n    assert numpy.allclose(y_actual, permute_y_actual, rtol=1.e-4, atol=1.e-5)\n\n\ndef test_forward_cpu_input_size_invariant(model, data):\n    # This RSGCN uses dropout, so we need to forward with test mode\n    # to remove stochastic calculation.\n    atom_data, adj_data = data[0], data[1]\n    with chainer.using_config('train', False):\n        y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n\n    # Set bigger size than original `atom_size`.\n    atom_data_ex = extend_node(atom_data, out_size=8)\n    adj_data_ex = extend_adj(adj_data, out_size=8)\n    # print('size', atom_data.shape, adj_data.shape,\n    #       atom_data_ex.shape, adj_data_ex.shape)\n    with chainer.using_config('train', False):\n        y_actual_ex = cuda.to_cpu(model(\n            atom_data_ex, adj_data_ex).data)\n    assert numpy.allclose(y_actual, y_actual_ex, rtol=1.e-4, atol=1.e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/models_tests/test_schnet.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.models.schnet import SchNet\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nout_dim = 4\nbatch_size = 2\n\n\n@pytest.fixture\ndef model():\n    return SchNet(out_dim=out_dim)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)\n    ).astype(numpy.int32)\n    # symmetric matrix\n    adj_data = numpy.random.uniform(\n        0, high=30, size=(batch_size, atom_size, atom_size)\n    ).astype(numpy.float32)\n    adj_data = (adj_data + adj_data.swapaxes(-1, -2)) / 2.\n\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, out_dim)).astype(numpy.float32)\n    return atom_data, adj_data, y_grad\n\n\ndef check_forward(model, atom_data, adj_data):\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(model, data):\n    atom_data, adj_data = data[0], data[1]\n    check_forward(model, atom_data, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, data):\n    atom_data, adj_data = cuda.to_gpu(data[0]), cuda.to_gpu(data[1])\n    model.to_gpu()\n    check_forward(model, atom_data, adj_data)\n\n\ndef test_backward_cpu(model, data):\n    atom_data, adj_data, y_grad = data\n    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,\n                                  atol=5e-1, rtol=1e-1)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(model, data):\n    atom_data, adj_data, y_grad = [cuda.to_gpu(d) for d in data]\n    model.to_gpu()\n    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,\n                                  atol=5e-1, rtol=1e-1)\n\n\ndef 
test_forward_cpu_graph_invariant(model, data):\n    atom_data, adj_data = data[0], data[1]\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index)\n    permute_adj_data = permute_adj(adj_data, permutation_index)\n    permute_y_actual = cuda.to_cpu(model(\n        permute_atom_data, permute_adj_data).data)\n    assert numpy.allclose(y_actual, permute_y_actual, rtol=1e-5, atol=1e-5)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/models_tests/test_weavenet.py",
    "content": "from chainer import cuda\nfrom chainer import gradient_check\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.config import MAX_ATOMIC_NUM\nfrom chainer_chemistry.models.weavenet import WeaveNet\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\natom_size = 5\nweave_channels = [50, 50]\nbatch_size = 2\natom_feature_dim = 23\npair_feature_dim = 10\nout_dim = weave_channels[-1]\n\n\n@pytest.fixture\ndef model():\n    return WeaveNet(weave_channels=weave_channels, n_atom=atom_size)\n\n\n@pytest.fixture\ndef model_processed():\n    \"\"\"model to test `atom_data_processed` input\"\"\"\n    return WeaveNet(weave_channels=weave_channels, n_atom=atom_size)\n\n\n@pytest.fixture\ndef data():\n    numpy.random.seed(0)\n    atom_data_processed = numpy.random.uniform(\n        0, high=1, size=(batch_size, atom_size, atom_feature_dim)\n    ).astype(numpy.float32)\n\n    atom_data = numpy.random.randint(\n        0, high=MAX_ATOMIC_NUM, size=(batch_size, atom_size)\n    ).astype(numpy.int32)\n    adj_data = numpy.random.uniform(\n        0, high=1, size=(batch_size, pair_feature_dim, atom_size, atom_size)\n    ).astype(numpy.float32)\n    # adj_data is symmetric along pair of atoms\n    # adj_data = adj_data + adj_data.swapaxes(-1, -2)\n    adj_data = adj_data.transpose((0, 3, 2, 1)).reshape(\n        batch_size, atom_size * atom_size, pair_feature_dim\n    ).astype(numpy.float32)\n\n    y_grad = numpy.random.uniform(\n        -1, 1, (batch_size, out_dim)).astype(numpy.float32)\n    return atom_data_processed, atom_data, adj_data, y_grad\n\n\ndef check_forward(model, atom_data, adj_data):\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n    print('y_actual', y_actual.shape)\n    assert y_actual.shape == (batch_size, out_dim)\n\n\ndef test_forward_cpu(model, model_processed, data):\n    atom_data_processed, atom_data, adj_data = data[0:3]\n    check_forward(model, 
atom_data, adj_data)\n    check_forward(model_processed, atom_data_processed, adj_data)\n\n\n@pytest.mark.gpu\ndef test_forward_gpu(model, model_processed, data):\n    atom_data_processed, atom_data, adj_data = \\\n        [cuda.to_gpu(d) for d in data[0:3]]\n    model.to_gpu()\n    model_processed.to_gpu()\n    check_forward(model, atom_data, adj_data)\n    check_forward(model_processed, atom_data_processed, adj_data)\n\n\ndef test_backward_cpu(model, model_processed, data):\n    atom_data_processed, atom_data, adj_data, y_grad = data\n    gradient_check.check_backward(model, (atom_data, adj_data), y_grad,\n                                  atol=1e-1, rtol=1e-1)\n    gradient_check.check_backward(model_processed, (atom_data_processed,\n                                                    adj_data), y_grad,\n                                  atol=1e-1, rtol=1e-1)\n\n\n@pytest.mark.gpu\ndef test_backward_gpu(model, model_processed, data):\n    atom_data_processed, atom_data, adj_data, y_grad = \\\n        [cuda.to_gpu(d) for d in data]\n    model.to_gpu()\n    model_processed.to_gpu()\n    gradient_check.check_backward(\n        model, (atom_data, adj_data), y_grad, atol=1e-1, rtol=1e-1)\n    gradient_check.check_backward(\n        model_processed, (atom_data_processed, adj_data), y_grad,\n        atol=1e-1, rtol=1e-1)\n\n\ndef _test_forward_cpu_graph_invariant(\n        model, atom_data, adj_data, node_permute_axis=-1):\n    y_actual = cuda.to_cpu(model(atom_data, adj_data).data)\n\n    permutation_index = numpy.random.permutation(atom_size)\n    permute_atom_data = permute_node(atom_data, permutation_index,\n                                     axis=node_permute_axis)\n    permute_adj_data = adj_data.reshape(\n        batch_size, atom_size, atom_size, pair_feature_dim\n    ).astype(numpy.float32)\n    permute_adj_data = permute_adj(\n        permute_adj_data, permutation_index, axis=[1, 2])\n    permute_adj_data = permute_adj_data.reshape(\n        batch_size, 
atom_size * atom_size, pair_feature_dim\n    ).astype(numpy.float32)\n    permute_y_actual = cuda.to_cpu(model(\n        permute_atom_data, permute_adj_data).data)\n    assert numpy.allclose(y_actual, permute_y_actual, rtol=1.e-4, atol=1.e-6)\n\n\ndef test_forward_cpu_graph_invariant_embed(model, data):\n    atom_data, adj_data = data[1], data[2]\n    _test_forward_cpu_graph_invariant(\n        model, atom_data, adj_data, node_permute_axis=-1)\n\n\ndef test_forward_cpu_graph_invariant_processed(model_processed, data):\n    atom_data_processed, adj_data = data[0], data[2]\n    _test_forward_cpu_graph_invariant(\n        model_processed, atom_data_processed, adj_data, node_permute_axis=1)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/saliency_tests/calculator_tests/test_base_calculator.py",
    "content": "import numpy\nimport pytest\n\nimport chainer\nfrom chainer.links import Linear\n\nfrom chainer_chemistry.link_hooks import is_link_hooks_available\nif is_link_hooks_available:\n    from chainer_chemistry.link_hooks import VariableMonitorLinkHook\n    from chainer_chemistry.saliency.calculator.base_calculator import BaseCalculator  # NOQA\n    from chainer_chemistry.saliency.calculator import GaussianNoiseSampler\n\n    class DummyCalculator(BaseCalculator):\n        \"\"\"Dummy calculator which returns target_var\"\"\"\n\n        def _compute_core(self, *inputs):\n            self.model(*inputs)\n            return self.get_target_var(inputs)\n\n\nclass DummyModel(chainer.Chain):\n    def __init__(self):\n        super(DummyModel, self).__init__()\n        with self.init_scope():\n            self.l1 = Linear(\n                3, 1, initialW=numpy.array([[1, 3, 2]]),\n                nobias=True)\n        self.h = None\n\n    def forward(self, x):\n        self.h = self.l1(x)\n        out = self.h * 3\n        return out\n\n\n@pytest.fixture\ndef model():\n    return DummyModel()\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_base_calculator_compute(model):\n    calculator = DummyCalculator(model)\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n\n    saliency = calculator.compute(x)\n    # DummyCalculator returns `saliency` as input `x`.\n    assert numpy.allclose(saliency, x)\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_base_calculator_compute_noise_sampler(model):\n    calculator = DummyCalculator(model)\n\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n    saliency = calculator.compute(x, M=2, noise_sampler=GaussianNoiseSampler())\n    assert saliency.shape == (2, 3)\n    # noise is added, should be different from original input\n    assert not numpy.allclose(saliency[0], x)\n    
assert not numpy.allclose(saliency[1], x)\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_base_calculator_compute_target_extractor(model):\n    # It should extract `target_var` as after `l1`, which is `model.h`.\n    calculator = DummyCalculator(\n        model, target_extractor=VariableMonitorLinkHook(model.l1))\n\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n    saliency = calculator.compute(x)\n    assert numpy.allclose(saliency, model.h.array)\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_base_calculator_aggregate():\n    model = DummyModel()\n    calculator = DummyCalculator(model)\n\n    saliency = numpy.array([[-1, -1, -1], [2, 2, 2]], dtype=numpy.float32)\n    saliency_raw = calculator.aggregate(saliency, method='raw', ch_axis=None)\n    assert numpy.allclose(saliency_raw,\n                          numpy.array([[0.5, 0.5, 0.5]], dtype=numpy.float32))\n    saliency_abs = calculator.aggregate(saliency, method='abs', ch_axis=None)\n    assert numpy.allclose(saliency_abs,\n                          numpy.array([[1.5, 1.5, 1.5]], dtype=numpy.float32))\n    saliency_square = calculator.aggregate(saliency, method='square',\n                                           ch_axis=None)\n    assert numpy.allclose(saliency_square,\n                          numpy.array([[2.5, 2.5, 2.5]], dtype=numpy.float32))\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/saliency_tests/calculator_tests/test_calculator_utils.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.saliency.calculator.calculator_utils import GaussianNoiseSampler  # NOQA\n\n\n@pytest.mark.parametrize('mode', ['relative', 'absolute'])\ndef test_gaussian_noise_sampler(mode):\n    shape = (3, 4, 5)\n    target_array = numpy.random.uniform(0, 1, shape)\n    sampler = GaussianNoiseSampler(mode=mode, scale=0.15)\n    noise = sampler.sample(target_array)\n    assert noise.shape == shape\n\n\ndef test_gaussian_noise_sampler_assert_raises():\n    shape = (3, 4, 5)\n    target_array = numpy.random.uniform(0, 1, shape)\n    with pytest.raises(ValueError):\n        sampler = GaussianNoiseSampler(mode='invalid_mode', scale=0.15)\n        sampler.sample(target_array)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/saliency_tests/calculator_tests/test_gradient_calculator.py",
    "content": "import numpy\nimport pytest\n\nimport chainer\nfrom chainer.links import Linear\n\nfrom chainer_chemistry.link_hooks import is_link_hooks_available\nif is_link_hooks_available:\n    from chainer_chemistry.link_hooks import VariableMonitorLinkHook\n    from chainer_chemistry.saliency.calculator.gradient_calculator import GradientCalculator  # NOQA\n\n\nclass DummyModel(chainer.Chain):\n    def __init__(self):\n        super(DummyModel, self).__init__()\n        with self.init_scope():\n            self.l1 = Linear(\n                3, 1, initialW=numpy.array([[1, 3, 2]]),\n                nobias=True)\n\n    def forward(self, x):\n        return self.l1(x)\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_gradient_calculator():\n    model = DummyModel()\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n    calculator = GradientCalculator(model)\n    saliency = calculator.compute(x)\n    # Gradient is equal to `initialW` of DummyModel.\n    assert numpy.allclose(saliency, numpy.array([[1, 3, 2]]))\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_gradient_calculator_multiple_output():\n    model = DummyModel()\n    x = numpy.array([[1, 5, 8], [2, 3, 4]], dtype=numpy.float32)\n    calculator = GradientCalculator(model)\n    # even batchsize=2 sum is applied automatically inside `compute`,\n    # so gradient can be calculated.\n    saliency = calculator.compute(x)\n    # Gradient is equal to `initialW` of DummyModel.\n    assert numpy.allclose(saliency, numpy.array([[1, 3, 2]]))\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_gradient_calculator_multiply_target():\n    model = DummyModel()\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n    calculator = GradientCalculator(model, multiply_target=True)\n    saliency = 
calculator.compute(x)\n    # gradient * input\n    assert numpy.allclose(saliency, numpy.array([[1, 15, 16]]))\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_gradient_calculator_target_extractor():\n    model = DummyModel()\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n    calculator = GradientCalculator(\n        model,\n        target_extractor=VariableMonitorLinkHook(model.l1, timing='pre'))\n    saliency = calculator.compute(x)\n    # Gradient is equal to `initialW` of DummyModel.\n    assert numpy.allclose(saliency, numpy.array([[[1, 3, 2]]]))\n    assert saliency.shape == (1, 1, 3)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/saliency_tests/calculator_tests/test_integrated_gradient_calculator.py",
    "content": "import numpy\nimport pytest\n\nimport chainer\nfrom chainer.links import Linear\n\nfrom chainer_chemistry.link_hooks import is_link_hooks_available\nif is_link_hooks_available:\n    from chainer_chemistry.link_hooks import VariableMonitorLinkHook\n    from chainer_chemistry.saliency.calculator.integrated_gradients_calculator import IntegratedGradientsCalculator  # NOQA\n\n\nclass DummyModel(chainer.Chain):\n    def __init__(self):\n        super(DummyModel, self).__init__()\n        with self.init_scope():\n            self.l1 = Linear(\n                3, 1, initialW=numpy.array([[1, 3, 2]]),\n                nobias=True)\n\n    def forward(self, x):\n        return self.l1(x)\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_integrated_gradient_calculator():\n    model = DummyModel()\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n    calculator = IntegratedGradientsCalculator(model, steps=3)\n    saliency = calculator.compute(x)\n    # gradient is always [1, 3, 2] * (input - base) is [1, 5, 8]\n    assert numpy.allclose(saliency, numpy.array([[1, 15, 16]]))\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_integrated_gradient_calculator_target_extractor():\n    model = DummyModel()\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n    calculator = IntegratedGradientsCalculator(\n        model, steps=4,\n        target_extractor=VariableMonitorLinkHook(model.l1, timing='pre'))\n    saliency = calculator.compute(x)\n    # gradient is always [1, 3, 2] * (input - base) is [1, 5, 8]\n    assert numpy.allclose(saliency, numpy.array([[[1, 15, 16]]]))\n    assert saliency.shape == (1, 1, 3)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/saliency_tests/calculator_tests/test_occlusion_calculator.py",
    "content": "import numpy\nimport pytest\n\nimport chainer\nfrom chainer.links import Linear, Convolution2D  # NOQA\n\nfrom chainer_chemistry.link_hooks import is_link_hooks_available\nif is_link_hooks_available:\n    from chainer_chemistry.link_hooks import VariableMonitorLinkHook\n    from chainer_chemistry.saliency.calculator.occlusion_calculator import OcclusionCalculator  # NOQA\n\n\nclass DummyModel(chainer.Chain):\n    def __init__(self):\n        super(DummyModel, self).__init__()\n        with self.init_scope():\n            self.l1 = Linear(\n                3, 1, initialW=numpy.array([[1, 3, 2]]),\n                nobias=True)\n\n    def forward(self, x):\n        return self.l1(x)\n\n\nclass DummyCNNModel(chainer.Chain):\n    def __init__(self):\n        super(DummyCNNModel, self).__init__()\n        with self.init_scope():\n            self.l1 = Convolution2D(\n                1, 1, ksize=3,\n                initialW=numpy.ones((1, 1, 3, 3), numpy.float32), nobias=True)\n\n    def forward(self, x):\n        return self.l1(x)\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_occlusion_calculator():\n    model = DummyModel()\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n    calculator = OcclusionCalculator(model, slide_axis=1)\n    saliency = calculator.compute(x)\n    assert numpy.allclose(saliency, numpy.array([[[1, 15, 16]]]))\n    assert saliency.shape == (1, 1, 3)\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_occlusion_calculator_cnn():\n    model = DummyCNNModel()\n    # x (1, 1, 3, 3): (bs, ch, h, w)\n    x = numpy.array([[[[1, 5, 8], [2, 4, 1], [3, 2, 9]]]], dtype=numpy.float32)\n    calculator = OcclusionCalculator(model, slide_axis=(2, 3))\n    saliency = calculator.compute(x)\n    assert numpy.allclose(saliency, x)\n    assert saliency.shape == (1, 1, 1, 3, 3)  # (M, bs, ch, h, 
w)\n\n\n@pytest.mark.skipif(not is_link_hooks_available,\n                    reason='Link Hook is not available')\ndef test_occlusion_calculator_target_extractor():\n    model = DummyModel()\n    x = numpy.array([[1, 5, 8]], dtype=numpy.float32)\n    calculator = OcclusionCalculator(\n        model, slide_axis=1,\n        target_extractor=VariableMonitorLinkHook(model.l1, timing='pre'))\n    saliency = calculator.compute(x)\n    assert numpy.allclose(saliency, numpy.array([[[1, 15, 16]]]))\n    assert saliency.shape == (1, 1, 3)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/saliency_tests/visualizer_tests/test_image_visualizer.py",
    "content": "import os\nimport sys\n\nimport matplotlib.pyplot as plt\nimport numpy\nimport pytest\n\n\nfrom chainer_chemistry.saliency.visualizer.image_visualizer import ImageVisualizer  # NOQA\n\n\nis_python_version2 = sys.version_info[0] < 3\n\n\n@pytest.mark.skipif(is_python_version2,\n                    reason='matplotlib configuration is necessary with'\n                           'python version 2')\ndef test_image_visualizer(tmpdir):\n    # Only test file is saved without error\n    ch = 3\n    h = 32\n    w = 32\n    saliency = numpy.random.uniform(0, 1, (ch, h, w))\n    visualizer = ImageVisualizer()\n\n    # 1. test with setting save_filepath\n    save_filepath = os.path.join(str(tmpdir), 'tmp.png')\n    visualizer.visualize(saliency, save_filepath=save_filepath)\n    assert os.path.exists(save_filepath)\n\n    # 2. test with `save_filepath=None` runs without error\n    image = numpy.random.uniform(0, 1, (ch, h, w))\n    plt.ion()\n    visualizer.visualize(\n        saliency, save_filepath=None, image=image, show_colorbar=True)\n    plt.close()\n\n\ndef test_table_visualizer_assert_raises():\n    visualizer = ImageVisualizer()\n    with pytest.raises(ValueError):\n        # --- Invalid saliency shape ---\n        saliency_invalid = numpy.array([0.5, 0.3, 0.2])\n        visualizer.visualize(saliency_invalid)\n\n    ch = 3\n    h = 32\n    w = 32\n    saliency = numpy.random.uniform(0, 1, (ch, h, w))\n\n    with pytest.raises(ValueError):\n        # --- Invalid sort key ---\n        image_invalid = numpy.array([0.5, 0.3, 0.2])\n        visualizer.visualize(saliency, image=image_invalid)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/saliency_tests/visualizer_tests/test_mol_visualizer.py",
    "content": "import os\n\nimport numpy\nimport pytest\nfrom rdkit import Chem\n\nfrom chainer_chemistry.saliency.visualizer.mol_visualizer import MolVisualizer  # NOQA\nfrom chainer_chemistry.saliency.visualizer.mol_visualizer import SmilesVisualizer  # NOQA\n\n\ndef test_mol_visualizer(tmpdir):\n    # Only test file is saved without error\n    smiles = 'OCO'\n    mol = Chem.MolFromSmiles(smiles)\n    saliency = numpy.array([0.5, 0.3, 0.2])\n    visualizer = MolVisualizer()\n\n    # 1. test with setting save_filepath\n    save_filepath = os.path.join(str(tmpdir), 'tmp.svg')\n    svg = visualizer.visualize(saliency, mol, save_filepath=save_filepath)\n    assert isinstance(svg, str)\n    assert os.path.exists(save_filepath)\n\n    # 2. test with `save_filepath=None` runs without error\n    svg = visualizer.visualize(\n        saliency, mol, save_filepath=None, visualize_ratio=0.5,)\n    assert isinstance(svg, str)\n\n\ndef test_smiles_visualizer(tmpdir):\n    # Only test file is saved without error\n    smiles = 'OCO'\n    saliency = numpy.array([0.5, 0.3, 0.2])\n    visualizer = SmilesVisualizer()\n\n    # 1. test with setting save_filepath\n    save_filepath = os.path.join(str(tmpdir), 'tmp.svg')\n    svg = visualizer.visualize(saliency, smiles, save_filepath=save_filepath,\n                               add_Hs=False)\n    assert os.path.exists(save_filepath)\n    assert isinstance(svg, str)\n    save_filepath = os.path.join(str(tmpdir), 'tmp.png')\n    svg = visualizer.visualize(saliency, smiles, save_filepath=save_filepath,\n                               add_Hs=False)\n    assert isinstance(svg, str)\n    # TODO(nakago): support png save test.\n    # Do not test for now (cairosvg is necessary)\n    # assert os.path.exists(save_filepath)\n\n    # 2. 
test with `save_filepath=None` runs without error\n    svg = visualizer.visualize(\n        saliency, smiles, save_filepath=None, visualize_ratio=0.5,\n        add_Hs=False, use_canonical_smiles=True)\n    assert isinstance(svg, str)\n\n\ndef test_mol_visualizer_assert_raises(tmpdir):\n    visualizer = MolVisualizer()\n    smiles = 'OCO'\n    mol = Chem.MolFromSmiles(smiles)\n\n    with pytest.raises(ValueError):\n        # --- Invalid saliency shape ---\n        saliency = numpy.array([[0.5, 0.3, 0.2], [0.5, 0.3, 0.2]])\n        visualizer.visualize(saliency, mol)\n\n    with pytest.raises(ValueError):\n        # --- Invalid sort key ---\n        saliency = numpy.array([0.5, 0.3, 0.2])\n        invalid_ext_filepath = os.path.join(str(tmpdir), 'tmp.hoge')\n        visualizer.visualize(saliency, mol, save_filepath=invalid_ext_filepath)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/saliency_tests/visualizer_tests/test_table_visualizer.py",
    "content": "import os\nimport sys\n\nimport matplotlib.pyplot as plt\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.saliency.visualizer.table_visualizer import TableVisualizer  # NOQA\n\n\nis_python_version2 = sys.version_info[0] < 3\n\n\n@pytest.mark.skipif(is_python_version2,\n                    reason='matplotlib configuration is necessary with'\n                           'python version 2')\ndef test_table_visualizer(tmpdir):\n    # Only test file is saved without error\n    saliency = numpy.array([0.5, 0.3, 0.2])\n    visualizer = TableVisualizer()\n\n    # 1. test with setting save_filepath\n    save_filepath = os.path.join(str(tmpdir), 'tmp.png')\n    visualizer.visualize(saliency, save_filepath=save_filepath)\n    assert os.path.exists(save_filepath)\n    # 2. test with `save_filepath=None` runs without error\n    plt.ion()\n    visualizer.visualize(\n        saliency, save_filepath=None, feature_names=['hoge', 'huga', 'piyo'],\n        num_visualize=2)\n    plt.close()\n\n\ndef test_table_visualizer_assert_raises():\n    visualizer = TableVisualizer()\n    with pytest.raises(ValueError):\n        # --- Invalid saliency shape ---\n        saliency = numpy.array([[0.5, 0.3, 0.2], [0.5, 0.3, 0.2]])\n        visualizer.visualize(saliency)\n\n    with pytest.raises(ValueError):\n        # --- Invalid sort key ---\n        saliency = numpy.array([0.5, 0.3, 0.2])\n        visualizer.visualize(saliency, sort='invalidkey')\n\n    with pytest.raises(ValueError):\n        # --- Invalid feature_names key ---\n        saliency = numpy.array([0.5, 0.3, 0.2])\n        feature_names = ['a', 'b', 'c', 'd']\n        visualizer.visualize(saliency, feature_names=feature_names)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/saliency_tests/visualizer_tests/test_visualizer_utils.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.saliency.visualizer.visualizer_utils import abs_max_scaler  # NOQA\nfrom chainer_chemistry.saliency.visualizer.visualizer_utils import min_max_scaler  # NOQA\nfrom chainer_chemistry.saliency.visualizer.visualizer_utils import normalize_scaler  # NOQA\nfrom chainer_chemistry.saliency.visualizer.visualizer_utils import red_blue_cmap  # NOQA\n\n\ndef test_abs_max_scaler():\n    saliency = numpy.array([1., 2., 3.])\n    result = abs_max_scaler(saliency)\n    expected = numpy.array([1. / 3, 2. / 3., 1.])\n    assert numpy.allclose(result, expected)\n\n    # test with 0 arrays\n    saliency = numpy.array([0, 0, 0])\n    result = abs_max_scaler(saliency)\n    expected = numpy.array([0, 0, 0])\n    assert numpy.allclose(result, expected)\n\n\ndef test_min_max_scaler():\n    saliency = numpy.array([1., -3., 3.])\n    result = min_max_scaler(saliency)\n    expected = numpy.array([4. / 6, 0., 1.])\n    assert numpy.allclose(result, expected)\n\n    # test with 0 arrays\n    saliency = numpy.array([0, 0, 0])\n    result = min_max_scaler(saliency)\n    expected = numpy.array([0, 0, 0])\n    assert numpy.allclose(result, expected)\n\n\ndef test_normalize_scaler():\n    saliency = numpy.array([1., 2., 3.])\n    result = normalize_scaler(saliency)\n    expected = numpy.array([1./6., 2./6, 3./6.])\n    assert numpy.allclose(result, expected)\n\n    # test with 0 arrays\n    saliency = numpy.array([0, 0, 0])\n    result = normalize_scaler(saliency)\n    expected = numpy.array([0, 0, 0])\n    assert numpy.allclose(result, expected)\n\n\ndef test_red_blue_cmap():\n    assert red_blue_cmap(1) == (1., 0., 0.)  # Red\n    assert red_blue_cmap(0) == (1., 1., 1.)  # White\n    assert red_blue_cmap(-1) == (0., 0., 1.)  # Blue\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/test_init.py",
    "content": "import pkg_resources\n\nimport chainer_chemistry\nimport pytest\n\n\ndef test_version():\n    expect = pkg_resources.get_distribution('chainer_chemistry').version\n    actual = chainer_chemistry.__version__\n    assert expect == actual\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v'])\n"
  },
  {
    "path": "tests/training_tests/extensions_tests/test_auto_print_report.py",
    "content": "import tempfile\n\nimport pytest\n\nimport mock\n\nfrom chainer import testing\nfrom chainer.training import extensions\n\n\nclass TestAutoPrintReport(object):\n    def _setup(self, stream=None, delete_flush=False):\n        self.logreport = mock.MagicMock(spec=extensions.LogReport(\n            ['epoch'], trigger=(1, 'iteration'), log_name=None))\n        if stream is None:\n            self.stream = mock.MagicMock()\n            if delete_flush:\n                del self.stream.flush\n        else:\n            self.stream = stream\n        self.report = extensions.PrintReport(\n            ['epoch'], log_report=self.logreport, out=self.stream)\n\n        self.trainer = testing.get_trainer_with_mock_updater(\n            stop_trigger=(1, 'iteration'))\n        self.trainer.extend(self.logreport)\n        self.trainer.extend(self.report)\n        self.logreport.log = [{'epoch': 0}]\n\n    def test_stream_with_flush_is_flushed(self):\n        self._setup(delete_flush=False)\n        assert hasattr(self.stream, 'flush')\n        self.stream.flush.assert_not_called()\n        self.report(self.trainer)\n        self.stream.flush.assert_called_with()\n\n    def test_stream_without_flush_raises_no_exception(self):\n        self._setup(delete_flush=True)\n        assert not hasattr(self.stream, 'flush')\n        self.report(self.trainer)\n\n    def test_real_stream_raises_no_exception(self):\n        with tempfile.TemporaryFile(mode='w') as stream:\n            self._setup(stream=stream)\n            self.report(self.trainer)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/training_tests/extensions_tests/test_prc_auc_evaluator.py",
    "content": "\"\"\"\nPRCAUCEvaluator uses `sklearn.metrics.precision_recall_curve` and\n`sklearn.metrics.auc` internally.\nRefer: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.\\\nprc_auc_score.html\n\"\"\"\nimport numpy\nimport pytest\n\nimport chainer\nfrom chainer.iterators import SerialIterator\n\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset  # NOQA\nfrom chainer_chemistry.training.extensions.prc_auc_evaluator import PRCAUCEvaluator  # NOQA\n\n\n@pytest.fixture\ndef data0():\n    # `t` is correct label, `y` is dummy predict value by predictor\n    t = numpy.array([0, 0, 1, 1], dtype=numpy.int32)[:, None]\n    y = numpy.array([0.1, 0.4, 0.35, 0.8], dtype=numpy.float32)[:, None]\n    return y, t\n\n\n@pytest.fixture\ndef data1():\n    # `t` is correct label, `y` is dummy predict value by predictor\n    t = numpy.array([0, 1, -1, 0, 2, -1], dtype=numpy.int32)[:, None]\n    y = numpy.array([0.1, 0.35, 0.2, 0.4, 0.8, 0.35],\n                    dtype=numpy.float32)[:, None]\n    return y, t\n\n\n@pytest.fixture\ndef data2():\n    # Example of bad example case\n    # `t` only contains correct label, `y` is dummy predict value by predictor\n    t = numpy.array([0, 0, 0, 0], dtype=numpy.int32)[:, None]\n    y = numpy.array([0.1, 0.4, 0.35, 0.8], dtype=numpy.float32)[:, None]\n    return y, t\n\n\nclass DummyPredictor(chainer.Chain):\n\n    def __call__(self, y):\n        # it receives `y` and return `y` directly\n        return y\n\n\ndef test_prc_auc_evaluator(data0, data1):\n    _test_prc_auc_evaluator_default_args(data0)\n    _test_prc_auc_evaluator_with_labels(data1)\n\n\ndef _test_prc_auc_evaluator_default_args(data0):\n\n    predictor = DummyPredictor()\n    dataset = NumpyTupleDataset(*data0)\n\n    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)\n    evaluator = PRCAUCEvaluator(\n        iterator, predictor, name='train',\n        pos_labels=1, ignore_labels=None\n    )\n    repo = 
chainer.Reporter()\n    repo.add_observer('target', predictor)\n    with repo:\n        observation = evaluator.evaluate()\n\n    expected_prc_auc = 0.7916\n    pytest.approx(observation['target/prc_auc'], expected_prc_auc)\n\n    # --- test __call__ ---\n    result = evaluator()\n    pytest.approx(result['train/main/prc_auc'], expected_prc_auc)\n\n\ndef _test_prc_auc_evaluator_with_labels(data1):\n    \"\"\"test `pos_labels` and `ignore_labels` behavior\"\"\"\n\n    predictor = DummyPredictor()\n    dataset = NumpyTupleDataset(*data1)\n\n    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)\n    evaluator = PRCAUCEvaluator(\n        iterator, predictor, name='val',\n        pos_labels=[1, 2], ignore_labels=-1,\n    )\n\n    # --- test evaluate ---\n    repo = chainer.Reporter()\n    repo.add_observer('target', predictor)\n    with repo:\n        observation = evaluator.evaluate()\n\n    expected_prc_auc = 0.7916\n    pytest.approx(observation['target/prc_auc'], expected_prc_auc)\n\n    # --- test __call__ ---\n    result = evaluator()\n    pytest.approx(result['val/main/prc_auc'], expected_prc_auc)\n\n\ndef test_prc_auc_evaluator_raise_value_error(data2):\n    with pytest.raises(ValueError):\n        _test_prc_auc_evaluator_raise_error(data2, raise_value_error=True)\n\n    res = _test_prc_auc_evaluator_raise_error(data2, raise_value_error=False)\n    assert numpy.isnan(res)\n\n\ndef _test_prc_auc_evaluator_raise_error(data, raise_value_error=True):\n\n    predictor = DummyPredictor()\n    dataset = NumpyTupleDataset(*data)\n\n    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)\n    evaluator = PRCAUCEvaluator(\n        iterator, predictor, name='train',\n        pos_labels=1, ignore_labels=None,\n        raise_value_error=raise_value_error\n    )\n    repo = chainer.Reporter()\n    repo.add_observer('target', predictor)\n    with repo:\n        observation = evaluator.evaluate()\n\n    return 
observation['target/prc_auc']\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/training_tests/extensions_tests/test_r2_score_evaluator.py",
    "content": "import numpy\nimport pytest\n\nimport chainer\nfrom chainer import cuda\nfrom chainer.iterators import SerialIterator\n\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset\nfrom chainer_chemistry.training.extensions.r2_score_evaluator import R2ScoreEvaluator  # NOQA\n\n\n@pytest.fixture\ndef inputs():\n    numpy.random.seed(0)\n    x0 = numpy.random.uniform(-1, 1, (4, 3)).astype('f')\n    # Add sufficient margin to prevent computational error\n    diff = numpy.random.uniform(-1, 1, (4, 3)).astype('f')\n    diff[abs(diff) < 0.01] = 0.5\n    x1 = x0 + diff\n    x2 = numpy.asarray([[0.3, numpy.nan, 0.2],\n                        [numpy.nan, 0.1, 0.5],\n                        [0.9, 0.7, numpy.nan],\n                        [0.2, -0.3, 0.4]]).astype('f')\n    return x0, x1, x2\n\n\ndef r2_score(pred, true, sample_weight=None, multioutput=\"uniform_average\",\n             ignore_nan=False):\n    pred = cuda.to_cpu(pred)\n    true = cuda.to_cpu(true)\n    diff = pred - true\n    dev = true - numpy.mean(true, axis=0)\n    if ignore_nan:\n        diff[numpy.isnan(diff)] = 0.\n        dev[numpy.isnan(dev)] = 0.\n    SS_res = numpy.asarray(\n        numpy.sum(diff ** 2, axis=0))\n    SS_tot = numpy.asarray(\n        numpy.sum(dev ** 2, axis=0))\n\n    if multioutput == 'uniform_average':\n        if numpy.any(SS_tot == 0):\n            return 0.0\n        else:\n            return (1 - SS_res / SS_tot).mean()\n    elif multioutput == 'raw_values':\n        if numpy.any(SS_tot == 0):\n            # Assign dummy value to avoid zero-division\n            SS_tot_iszero = SS_tot == 0\n            SS_tot[SS_tot_iszero] = 1\n\n            return numpy.where(SS_tot_iszero, 0.0, 1 - SS_res / SS_tot)\n        else:\n            return 1 - SS_res / SS_tot\n\n\nclass DummyPredictor(chainer.Chain):\n\n    def __call__(self, y):\n        # it receives `y` and return `y` directly\n        return y\n\n\ndef test_r2_score_evaluator(inputs):\n    
_test_r2_score_evaluator(inputs)\n    _test_r2_score_evaluator_ignore_nan(inputs)\n    _test_r2_score_evaluator_ignore_nan_with_nonnan_value(inputs)\n    _test_r2_score_evaluator_raw_values(inputs)\n\n\n@pytest.mark.gpu\ndef test_r2_score_evaluator_gpu(inputs):\n    x0, x1, x2 = inputs\n    _test_r2_score_evaluator((cuda.to_gpu(x0), cuda.to_gpu(x1), None))\n    _test_r2_score_evaluator_ignore_nan(\n        (cuda.to_gpu(x0), None, cuda.to_gpu(x2)))\n    _test_r2_score_evaluator_ignore_nan_with_nonnan_value(\n        (cuda.to_gpu(x0), cuda.to_gpu(x1), None))\n    _test_r2_score_evaluator_raw_values(\n        (cuda.to_gpu(x0), cuda.to_gpu(x1), None))\n\n\ndef _test_r2_score_evaluator(inputs):\n    predictor = DummyPredictor()\n    x0, x1, _ = inputs\n    dataset = NumpyTupleDataset(x0, x1)\n\n    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)\n    evaluator = R2ScoreEvaluator(iterator, predictor, name='train')\n    repo = chainer.Reporter()\n    repo.add_observer('target', predictor)\n    with repo:\n        observation = evaluator.evaluate()\n\n    expected = r2_score(x0, x1)\n    pytest.approx(observation['target/r2_score'], expected)\n\n    # --- test __call__ ---\n    result = evaluator()\n    pytest.approx(result['train/main/r2_score'], expected)\n\n\ndef _test_r2_score_evaluator_ignore_nan(inputs):\n    predictor = DummyPredictor()\n    x0, _, x2 = inputs\n    dataset = NumpyTupleDataset(x0, x2)\n\n    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)\n    evaluator = R2ScoreEvaluator(\n        iterator, predictor, name='train', ignore_nan=True)\n    repo = chainer.Reporter()\n    repo.add_observer('target', predictor)\n    with repo:\n        observation = evaluator.evaluate()\n\n    expected = r2_score(x0, x2, ignore_nan=True)\n    pytest.approx(observation['target/r2_score'], expected)\n\n    # --- test __call__ ---\n    result = evaluator()\n    pytest.approx(result['train/main/r2_score'], expected)\n\n\ndef 
_test_r2_score_evaluator_ignore_nan_with_nonnan_value(inputs):\n    predictor = DummyPredictor()\n    x0, x1, _ = inputs\n    dataset = NumpyTupleDataset(x0, x1)\n\n    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)\n    evaluator = R2ScoreEvaluator(\n        iterator, predictor, name='train', ignore_nan=True)\n    repo = chainer.Reporter()\n    repo.add_observer('target', predictor)\n    with repo:\n        observation = evaluator.evaluate()\n\n    expected = r2_score(x0, x1, ignore_nan=True)\n    pytest.approx(observation['target/r2_score'], expected)\n\n    # --- test __call__ ---\n    result = evaluator()\n    pytest.approx(result['train/main/r2_score'], expected)\n\n\ndef _test_r2_score_evaluator_raw_values(inputs):\n    predictor = DummyPredictor()\n    x0, x1, _ = inputs\n    dataset = NumpyTupleDataset(x0, x1)\n\n    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)\n    evaluator = R2ScoreEvaluator(\n        iterator, predictor, name='train', multioutput='raw_values')\n    repo = chainer.Reporter()\n    repo.add_observer('target', predictor)\n    with repo:\n        observation = evaluator.evaluate()\n\n    expected = r2_score(x0, x1, multioutput='raw_values')\n    pytest.approx(observation['target/r2_score'], expected)\n\n    # --- test __call__ ---\n    result = evaluator()\n    pytest.approx(result['train/main/r2_score'], expected)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/training_tests/extensions_tests/test_roc_auc_evaluator.py",
    "content": "\"\"\"\nROCAUCEvaluator uses `sklearn.metrics.roc_auc_score` internally.\nRefer: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.\\\nroc_auc_score.html\n\"\"\"\nimport numpy\nimport pytest\n\nimport chainer\nfrom chainer.iterators import SerialIterator\n\nfrom chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset  # NOQA\nfrom chainer_chemistry.training.extensions.roc_auc_evaluator import ROCAUCEvaluator  # NOQA\n\n\n@pytest.fixture\ndef data0():\n    # `t` is correct label, `y` is dummy predict value by predictor\n    t = numpy.array([0, 0, 1, 1], dtype=numpy.int32)[:, None]\n    y = numpy.array([0.1, 0.4, 0.35, 0.8], dtype=numpy.float32)[:, None]\n    return y, t\n\n\n@pytest.fixture\ndef data1():\n    # `t` is correct label, `y` is dummy predict value by predictor\n    t = numpy.array([0, 1, -1, 0, 2, -1], dtype=numpy.int32)[:, None]\n    y = numpy.array([0.1, 0.35, 0.2, 0.4, 0.8, 0.35],\n                    dtype=numpy.float32)[:, None]\n    return y, t\n\n\n@pytest.fixture\ndef data2():\n    # Example of bad example case\n    # `t` only contains correct label, `y` is dummy predict value by predictor\n    t = numpy.array([0, 0, 0, 0], dtype=numpy.int32)[:, None]\n    y = numpy.array([0.1, 0.4, 0.35, 0.8], dtype=numpy.float32)[:, None]\n    return y, t\n\n\nclass DummyPredictor(chainer.Chain):\n\n    def __call__(self, y):\n        # it receives `y` and return `y` directly\n        return y\n\n\ndef test_roc_auc_evaluator(data0, data1):\n    _test_roc_auc_evaluator_default_args(data0)\n    _test_roc_auc_evaluator_with_labels(data1)\n\n\ndef _test_roc_auc_evaluator_default_args(data0):\n\n    predictor = DummyPredictor()\n    dataset = NumpyTupleDataset(*data0)\n\n    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)\n    evaluator = ROCAUCEvaluator(\n        iterator, predictor, name='train',\n        pos_labels=1, ignore_labels=None\n    )\n    repo = chainer.Reporter()\n    
repo.add_observer('target', predictor)\n    with repo:\n        observation = evaluator.evaluate()\n\n    expected_roc_auc = 0.75\n    # print('observation ', observation)\n    assert observation['target/roc_auc'] == expected_roc_auc\n\n    # --- test __call__ ---\n    result = evaluator()\n    # print('result ', result)\n    assert result['train/main/roc_auc'] == expected_roc_auc\n\n\ndef _test_roc_auc_evaluator_with_labels(data1):\n    \"\"\"test `pos_labels` and `ignore_labels` behavior\"\"\"\n\n    predictor = DummyPredictor()\n    dataset = NumpyTupleDataset(*data1)\n\n    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)\n    evaluator = ROCAUCEvaluator(\n        iterator, predictor, name='val',\n        pos_labels=[1, 2], ignore_labels=-1,\n    )\n\n    # --- test evaluate ---\n    repo = chainer.Reporter()\n    repo.add_observer('target', predictor)\n    with repo:\n        observation = evaluator.evaluate()\n\n    expected_roc_auc = 0.75\n    # print('observation ', observation)\n    assert observation['target/roc_auc'] == expected_roc_auc\n\n    # --- test __call__ ---\n    result = evaluator()\n    # print('result ', result)\n    assert result['val/main/roc_auc'] == expected_roc_auc\n\n\ndef test_roc_auc_evaluator_raise_value_error(data2):\n    with pytest.raises(ValueError):\n        _test_roc_auc_evaluator_raise_error(data2, raise_value_error=True)\n\n    res = _test_roc_auc_evaluator_raise_error(data2, raise_value_error=False)\n    assert numpy.isnan(res)\n\n\ndef _test_roc_auc_evaluator_raise_error(data, raise_value_error=True):\n\n    predictor = DummyPredictor()\n    dataset = NumpyTupleDataset(*data)\n\n    iterator = SerialIterator(dataset, 2, repeat=False, shuffle=False)\n    evaluator = ROCAUCEvaluator(\n        iterator, predictor, name='train',\n        pos_labels=1, ignore_labels=None,\n        raise_value_error=raise_value_error\n    )\n    repo = chainer.Reporter()\n    repo.add_observer('target', predictor)\n    with 
repo:\n        observation = evaluator.evaluate()\n\n    return observation['target/roc_auc']\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/utils_tests/test_extend.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.utils.extend import extend_node, extend_adj  # NOQA\n\n\nbatchsize = 2\nnum_node = 3\nch = 5\n\nx_2d = numpy.arange(batchsize * num_node).reshape(\n    (batchsize, num_node))\nx_3d = numpy.arange(batchsize * num_node * ch).reshape(\n    (batchsize, num_node, ch))\nadj_3d = numpy.arange(batchsize * num_node * num_node).reshape(\n    (batchsize, num_node, num_node))\n\n\n@pytest.mark.parametrize('x', [x_2d, x_2d.astype(numpy.float32)])\ndef test_extend_node_2d(x):\n    x_extended = extend_node(x, out_size=6)\n    x_expected = numpy.array([[0, 1, 2, 0, 0, 0],\n                              [3, 4, 5, 0, 0, 0]], dtype=x.dtype)\n\n    print('x type', x_extended.dtype)\n    assert x_extended.shape == (batchsize, 6)\n    assert x_extended.dtype == x.dtype\n    assert numpy.array_equal(x_extended, x_expected)\n\n\n@pytest.mark.parametrize('x', [x_3d, x_3d.astype(numpy.float32)])\n@pytest.mark.parametrize('axis', [-1, 2])\ndef test_extend_node_3d(x, axis):\n    x_extended = extend_node(x, out_size=6, axis=axis)\n    x_expected = numpy.array([\n        [[0, 1, 2, 3, 4, 0],\n         [5, 6, 7, 8, 9, 0],\n         [10, 11, 12, 13, 14, 0]],\n        [[15, 16, 17, 18, 19, 0],\n         [20, 21, 22, 23, 24, 0],\n         [25, 26, 27, 28, 29, 0]]])\n\n    assert x_extended.shape == (batchsize, num_node, 6)\n    assert x_extended.dtype == x.dtype\n    assert numpy.array_equal(x_extended, x_expected)\n\n\ndef test_extend_node_assert_raises():\n    with pytest.raises(ValueError):\n        extend_node(x_2d, out_size=1)\n\n\n@pytest.mark.parametrize('adj', [adj_3d, adj_3d.astype(numpy.float32)])\ndef test_extend_adj(adj):\n    adj_extended = extend_adj(adj, out_size=6)\n    assert adj_extended.shape == (batchsize, 6, 6)\n    assert adj_extended.dtype == adj.dtype\n    assert numpy.array_equal(adj_extended[:, :num_node, :num_node], adj)\n    assert numpy.alltrue(adj_extended[:, num_node:, :] == 0)\n    assert 
numpy.alltrue(adj_extended[:, :, num_node:] == 0)\n\n\ndef test_extend_adj_assert_raises():\n    with pytest.raises(ValueError):\n        extend_adj(adj_3d, out_size=1)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/utils_tests/test_json_utils.py",
    "content": "import os\n\nimport numpy\nimport pytest\n\nfrom chainer_chemistry.utils.json_utils import load_json\nfrom chainer_chemistry.utils.json_utils import save_json\n\nparams = {\n    'a_int': 1,\n    'b_str': 'string',\n    'c_list': [1, 2, 3],\n    'd_tuple': (1, 2),\n    'n_int_scalar': numpy.array(1),\n    'n_int_array': numpy.array([1]),\n    'n_float': numpy.array([[1.0, 2.0], [3.0, 4.0]]),\n}\ntry:\n    # pathlib is not available with python 2.7\n    from pathlib import Path\n    params['path'] = Path('/tmp/hoge')\n    _is_pathlib_available = True\nexcept ImportError:\n    _is_pathlib_available = False\n\n\nparams_invalid = {\n    'lambda_function': lambda x: x * 2,\n}\n\n\ndef test_save_json(tmpdir):\n    filepath = os.path.join(str(tmpdir), 'tmp.json')\n    save_json(filepath, params)\n    assert os.path.exists(filepath)\n\n\ndef test_save_json_ignore_error(tmpdir):\n    filepath = os.path.join(str(tmpdir), 'tmp.json')\n\n    # 1. should raise error when ignore_error=False\n    with pytest.raises(TypeError):\n        save_json(filepath, params_invalid, ignore_error=False)\n\n    # 2. should not raise error when ignore_error=False\n    save_json(filepath, params_invalid, ignore_error=True)\n\n\ndef test_load_json(tmpdir):\n    filepath = os.path.join(str(tmpdir), 'tmp.json')\n    # TODO(nakago): better to remove `save_json` dependency for unittest.\n    save_json(filepath, params)\n\n    params_load = load_json(filepath)\n    expected_params_load = {\n        'a_int': 1,\n        'b_str': 'string',\n        'c_list': [1, 2, 3],\n        'd_tuple': [1, 2],\n        'n_float': [[1.0, 2.0], [3.0, 4.0]],\n        'n_int_array': [1],\n        'n_int_scalar': 1,\n    }\n    if _is_pathlib_available:\n        # PurePath is converted to str\n        expected_params_load['path'] = '/tmp/hoge'\n    assert params_load == expected_params_load\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/utils_tests/test_permutation.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.utils.permutation import permute_adj\nfrom chainer_chemistry.utils.permutation import permute_node\n\n\nbatchsize = 1\nnum_node = 3\nch = 5\n\n\n@pytest.mark.parametrize('x', [\n    numpy.random.randint(10, size=(batchsize, num_node), dtype=numpy.int32),\n    numpy.random.random(size=(batchsize, num_node))\n])\ndef test_permute_node_2d(x):\n    perm = numpy.random.permutation(num_node)\n    x_perm = permute_node(x, perm)\n\n    assert x.shape == x_perm.shape\n    for i in range(num_node):\n        assert numpy.allclose(x[:, perm[i]], x_perm[:, i])\n\n\n@pytest.mark.parametrize('x', [\n    numpy.random.randint(10, size=(batchsize, num_node, ch),\n                         dtype=numpy.int32),\n    numpy.random.random(size=(batchsize, num_node, ch))\n])\n@pytest.mark.parametrize('axis', [-1, -2, 1, 2])\ndef test_permute_node_3d(x, axis):\n    perm = numpy.random.permutation(x.shape[axis])\n    x_perm = permute_node(x, perm, axis=axis)\n\n    assert x.shape == x_perm.shape\n    if axis == -1 or axis == 2:\n        for i in range(num_node):\n            assert numpy.allclose(x[:, :, perm[i]], x_perm[:, :, i])\n    else:\n        for i in range(num_node):\n            assert numpy.allclose(x[:, perm[i], :], x_perm[:, i, :])\n\n\n@pytest.mark.parametrize('adj', [\n    numpy.random.randint(10, size=(batchsize, num_node, num_node),\n                         dtype=numpy.int32),\n    numpy.random.randint(10, size=(batchsize, ch, num_node, num_node),\n                         dtype=numpy.int32)\n])\ndef test_permute_adj(adj):\n    perm = numpy.random.permutation(num_node)\n    adj_perm = permute_adj(adj, perm)\n\n    assert adj.shape == adj_perm.shape\n    for i in range(num_node):\n        for j in range(num_node):\n            assert numpy.array_equal(\n                adj[..., perm[i], perm[j]], adj_perm[..., i, j])\n\n\ndef test_permute_adj_axis12():\n    adj = numpy.random.randint(\n        10, 
size=(batchsize, num_node, num_node, ch), dtype=numpy.int32)\n    perm = numpy.random.permutation(num_node)\n    adj_perm = permute_adj(adj, perm, axis=[1, 2])\n\n    assert adj.shape == adj_perm.shape\n    for i in range(num_node):\n        for j in range(num_node):\n            assert numpy.allclose(\n                adj[:, perm[i], perm[j], :], adj_perm[:, i, j, :])\n\n\ndef test_permute_adj_error():\n    adj = numpy.random.randint(\n        10, size=(batchsize, ch, num_node, num_node), dtype=numpy.int32)\n    perm = numpy.random.permutation(num_node)\n    with pytest.raises(TypeError):\n        permute_adj(adj, perm, axis=1)\n\n    with pytest.raises(ValueError):\n        permute_adj(adj, perm, axis=[1, 2, 3])\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/utils_tests/test_sparse_utils.py",
    "content": "import numpy\nimport pytest\n\nfrom chainer_chemistry.utils.sparse_utils import convert_sparse_with_edge_type\nfrom chainer_chemistry.utils.sparse_utils import sparse_utils_available\n\nif not sparse_utils_available():\n    pytest.skip('sparse_utils is available if chainer>=5 and numpy>=1.16',\n                allow_module_level=True)\n\n\ndef naive_convert(data, row, col, edge_type, num_edge_type):\n    mb, length = data.shape\n    new_mb = mb * num_edge_type\n    new_data = [[] for _ in range(new_mb)]\n    new_row = [[] for _ in range(new_mb)]\n    new_col = [[] for _ in range(new_mb)]\n\n    for i in range(mb):\n        for j in range(length):\n            k = i * num_edge_type + edge_type[i, j]\n            new_data[k].append(data[i, j])\n            new_row[k].append(row[i, j])\n            new_col[k].append(col[i, j])\n\n    new_length = max(len(arr) for arr in new_data)\n\n    def pad(arr_2d, dtype=numpy.int32):\n        for arr in arr_2d:\n            arr.extend([0] * (new_length - len(arr)))\n        return numpy.array(arr_2d)\n\n    ret = []\n    for d, r, c in zip(pad(new_data, data.dtype),\n                       pad(new_row), pad(new_col)):\n        ret.append(list(sorted(zip(d, r, c))))\n    return ret\n\n\n@pytest.mark.parametrize('in_shape,num_edge_type', [\n    ((2, 4), 4),\n    ((5, 10), 2),\n    ((1, 1), 1),\n    ((10, 1), 10),\n    ((10, 10), 10),\n])\ndef test_convert_sparse_with_edge_type(in_shape, num_edge_type):\n    num_nodes = 10\n\n    data = numpy.random.uniform(size=in_shape).astype(numpy.float32)\n    row = numpy.random.randint(size=in_shape, low=0, high=num_nodes)\n    col = numpy.random.randint(size=in_shape, low=0, high=num_nodes)\n    edge_type = numpy.random.randint(size=in_shape, low=0, high=num_edge_type)\n\n    received = convert_sparse_with_edge_type(data, row, col, num_nodes,\n                                             edge_type, num_edge_type)\n    expected = naive_convert(data, row, col, edge_type, 
num_edge_type)\n\n    # check by minibatch-wise\n    for i, expected_batch in enumerate(expected):\n        d = received.data.data[i, :].tolist()\n        r = received.row[i, :].tolist()\n        c = received.col[i, :].tolist()\n\n        received_batch = list(sorted(zip(d, r, c)))\n\n        assert expected_batch == received_batch\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  },
  {
    "path": "tests/utils_tests/test_train_utils.py",
    "content": "import chainer\nimport numpy\nimport pytest\n\nfrom chainer.iterators import SerialIterator\nfrom chainer import links\nimport chainerx\n\nfrom chainer_chemistry.datasets import NumpyTupleDataset\nfrom chainer_chemistry.models import Regressor\nfrom chainer_chemistry.utils import run_train\n\n\ninput_dim = 5\noutput_dim = 7\ntrain_data_size = 9\nvalid_data_size = 8\nbatch_size = 4\n\n\n@pytest.fixture\ndef model():\n    return Regressor(links.Linear(None, output_dim))\n\n\n@pytest.fixture\ndef train_data():\n    x = numpy.random.uniform(\n        0, 1, (train_data_size, input_dim)).astype(numpy.float32)\n    y = numpy.random.uniform(\n        0, 1, (train_data_size, output_dim)).astype(numpy.float32)\n    return NumpyTupleDataset(x, y)\n\n\n@pytest.fixture\ndef valid_data():\n    x = numpy.random.uniform(\n        0, 1, (valid_data_size, input_dim)).astype(numpy.float32)\n    y = numpy.random.uniform(\n        0, 1, (valid_data_size, output_dim)).astype(numpy.float32)\n    return NumpyTupleDataset(x, y)\n\n\ndef test_run_train_cpu(model, train_data, valid_data):\n    run_train(model, train_data, valid=valid_data, epoch=1, batch_size=8)\n\n\ndef test_run_train_cpu_iterator(model, train_data, valid_data):\n    train_iter = SerialIterator(train_data, batch_size=4)\n    valid_iter = SerialIterator(valid_data, batch_size=4,\n                                shuffle=False, repeat=False)\n    run_train(model, train_iter, valid=valid_iter, epoch=1, batch_size=8,\n              extensions_list=[lambda t: None])\n\n\ndef test_run_train_invalid(model, train_data):\n    with pytest.raises(ValueError):\n        run_train(model, train_data, optimizer=1)\n\n\n@pytest.mark.gpu\ndef test_run_train_gpu(model, train_data, valid_data):\n    device = 0\n    model.to_gpu(device)\n    run_train(model, train_data, valid=valid_data, epoch=1, batch_size=8,\n              device=device)\n\n\n@pytest.mark.skipif(not chainerx.is_available(),\n                    reason='chainerx 
is not available')\ndef test_run_train_chainerx_native(model, train_data, valid_data):\n    device = chainer.get_device('native')\n    model.to_device(device)\n    run_train(model, train_data, valid=valid_data, epoch=1, batch_size=8,\n              device=device)\n\n\n@pytest.mark.gpu\n@pytest.mark.skipif(not chainerx.is_available(),\n                    reason='chainerx is not available')\ndef test_run_train_chainerx_cuda0(model, train_data, valid_data):\n    device = chainer.get_device('cuda:0')\n    model.to_device(device)\n    run_train(model, train_data, valid=valid_data, epoch=1, batch_size=8,\n              device=device)\n\n\nif __name__ == '__main__':\n    pytest.main([__file__, '-v', '-s'])\n"
  }
]