[
  {
    "path": ".clang-format",
    "content": "---\nLanguage:        Cpp\n# BasedOnStyle:  Google\nAccessModifierOffset: -1\nAlignAfterOpenBracket: Align\nAlignArrayOfStructures: None\nAlignConsecutiveMacros: None\nAlignConsecutiveAssignments: None\nAlignConsecutiveBitFields: None\nAlignConsecutiveDeclarations: None\nAlignEscapedNewlines: Left\nAlignOperands:   Align\nAlignTrailingComments: true\nAllowAllArgumentsOnNextLine: true\nAllowAllParametersOfDeclarationOnNextLine: true\nAllowShortEnumsOnASingleLine: true\nAllowShortBlocksOnASingleLine: Never\nAllowShortCaseLabelsOnASingleLine: false\nAllowShortFunctionsOnASingleLine: All\nAllowShortLambdasOnASingleLine: All\nAllowShortIfStatementsOnASingleLine: WithoutElse\nAllowShortLoopsOnASingleLine: true\nAlwaysBreakAfterDefinitionReturnType: None\nAlwaysBreakAfterReturnType: None\nAlwaysBreakBeforeMultilineStrings: true\nAlwaysBreakTemplateDeclarations: Yes\nAttributeMacros:\n  - __capability\nBinPackArguments: true\nBinPackParameters: true\nBraceWrapping:\n  AfterCaseLabel:  false\n  AfterClass:      false\n  AfterControlStatement: Never\n  AfterEnum:       false\n  AfterFunction:   false\n  AfterNamespace:  false\n  AfterObjCDeclaration: false\n  AfterStruct:     false\n  AfterUnion:      false\n  AfterExternBlock: false\n  BeforeCatch:     false\n  BeforeElse:      false\n  BeforeLambdaBody: false\n  BeforeWhile:     false\n  IndentBraces:    false\n  SplitEmptyFunction: true\n  SplitEmptyRecord: true\n  SplitEmptyNamespace: true\nBreakBeforeBinaryOperators: None\nBreakBeforeConceptDeclarations: true\nBreakBeforeBraces: Attach\nBreakBeforeInheritanceComma: false\nBreakInheritanceList: BeforeColon\nBreakBeforeTernaryOperators: true\nBreakConstructorInitializersBeforeComma: false\nBreakConstructorInitializers: BeforeColon\nBreakAfterJavaFieldAnnotations: false\nBreakStringLiterals: true\nColumnLimit:     160\nCommentPragmas:  '^ IWYU pragma:'\nQualifierAlignment: Leave\nCompactNamespaces: false\nConstructorInitializerIndentWidth: 
4\nContinuationIndentWidth: 4\nCpp11BracedListStyle: true\nDeriveLineEnding: true\nDerivePointerAlignment: true\nDisableFormat:   false\nEmptyLineAfterAccessModifier: Never\nEmptyLineBeforeAccessModifier: LogicalBlock\nExperimentalAutoDetectBinPacking: false\nPackConstructorInitializers: NextLine\nBasedOnStyle:    ''\nConstructorInitializerAllOnOneLineOrOnePerLine: false\nAllowAllConstructorInitializersOnNextLine: true\nFixNamespaceComments: true\nForEachMacros:\n  - foreach\n  - Q_FOREACH\n  - BOOST_FOREACH\nIfMacros:\n  - KJ_IF_MAYBE\nIncludeBlocks:   Regroup\nIncludeCategories:\n  - Regex:           '^<ext/.*\\.h>'\n    Priority:        2\n    SortPriority:    0\n    CaseSensitive:   false\n  - Regex:           '^<.*\\.h>'\n    Priority:        1\n    SortPriority:    0\n    CaseSensitive:   false\n  - Regex:           '^<.*'\n    Priority:        2\n    SortPriority:    0\n    CaseSensitive:   false\n  - Regex:           '.*'\n    Priority:        3\n    SortPriority:    0\n    CaseSensitive:   false\nIncludeIsMainRegex: '([-_](test|unittest))?$'\nIncludeIsMainSourceRegex: ''\nIndentAccessModifiers: false\nIndentCaseLabels: true\nIndentCaseBlocks: false\nIndentGotoLabels: true\nIndentPPDirectives: BeforeHash\nIndentExternBlock: AfterExternBlock\nIndentRequires:  false\nIndentWidth:     4\nIndentWrappedFunctionNames: false\nInsertTrailingCommas: None\nJavaScriptQuotes: Leave\nJavaScriptWrapImports: true\nKeepEmptyLinesAtTheStartOfBlocks: false\nLambdaBodyIndentation: Signature\nMacroBlockBegin: ''\nMacroBlockEnd:   ''\nMaxEmptyLinesToKeep: 1\nNamespaceIndentation: None\nObjCBinPackProtocolList: Never\nObjCBlockIndentWidth: 2\nObjCBreakBeforeNestedBlockParam: true\nObjCSpaceAfterProperty: false\nObjCSpaceBeforeProtocolList: true\nPenaltyBreakAssignment: 2\nPenaltyBreakBeforeFirstCallParameter: 1\nPenaltyBreakComment: 300\nPenaltyBreakFirstLessLess: 120\nPenaltyBreakOpenParenthesis: 0\nPenaltyBreakString: 1000\nPenaltyBreakTemplateDeclaration: 
10\nPenaltyExcessCharacter: 1000000\nPenaltyReturnTypeOnItsOwnLine: 200\nPenaltyIndentedWhitespace: 0\nPointerAlignment: Left\nPPIndentWidth:   -1\nRawStringFormats:\n  - Language:        Cpp\n    Delimiters:\n      - cc\n      - CC\n      - cpp\n      - Cpp\n      - CPP\n      - 'c++'\n      - 'C++'\n    CanonicalDelimiter: ''\n    BasedOnStyle:    google\n  - Language:        TextProto\n    Delimiters:\n      - pb\n      - PB\n      - proto\n      - PROTO\n    EnclosingFunctions:\n      - EqualsProto\n      - EquivToProto\n      - PARSE_PARTIAL_TEXT_PROTO\n      - PARSE_TEST_PROTO\n      - PARSE_TEXT_PROTO\n      - ParseTextOrDie\n      - ParseTextProtoOrDie\n      - ParseTestProto\n      - ParsePartialTestProto\n    CanonicalDelimiter: pb\n    BasedOnStyle:    google\nReferenceAlignment: Pointer\nReflowComments:  true\nRemoveBracesLLVM: false\nSeparateDefinitionBlocks: Leave\nShortNamespaceLines: 1\nSortIncludes:    CaseSensitive\nSortJavaStaticImport: Before\nSortUsingDeclarations: true\nSpaceAfterCStyleCast: false\nSpaceAfterLogicalNot: false\nSpaceAfterTemplateKeyword: true\nSpaceBeforeAssignmentOperators: true\nSpaceBeforeCaseColon: false\nSpaceBeforeCpp11BracedList: false\nSpaceBeforeCtorInitializerColon: true\nSpaceBeforeInheritanceColon: true\nSpaceBeforeParens: ControlStatements\nSpaceBeforeParensOptions:\n  AfterControlStatements: true\n  AfterForeachMacros: true\n  AfterFunctionDefinitionName: false\n  AfterFunctionDeclarationName: false\n  AfterIfMacros:   true\n  AfterOverloadedOperator: false\n  BeforeNonEmptyParentheses: false\nSpaceAroundPointerQualifiers: Default\nSpaceBeforeRangeBasedForLoopColon: true\nSpaceInEmptyBlock: false\nSpaceInEmptyParentheses: false\nSpacesBeforeTrailingComments: 2\nSpacesInAngles:  Never\nSpacesInConditionalStatement: false\nSpacesInContainerLiterals: true\nSpacesInCStyleCastParentheses: false\nSpacesInLineCommentPrefix:\n  Minimum:         1\n  Maximum:         -1\nSpacesInParentheses: false\nSpacesInSquareBrackets: 
false\nSpaceBeforeSquareBrackets: false\nBitFieldColonSpacing: Both\nStandard:        Auto\nStatementAttributeLikeMacros:\n  - Q_EMIT\nStatementMacros:\n  - Q_UNUSED\n  - QT_REQUIRE_VERSION\nTabWidth:        8\nUseCRLF:         false\nUseTab:          Never\nWhitespaceSensitiveMacros:\n  - STRINGIZE\n  - PP_STRINGIZE\n  - BOOST_PP_STRINGIZE\n  - NS_SWIFT_NAME\n  - CF_SWIFT_NAME\n...\n\n"
  },
  {
    "path": ".flake8",
    "content": "#########################\n# Flake8 Configuration  #\n# (.flake8)             #\n#########################\n[flake8]\nignore =\n    # first argument of a classmethod should be named 'cls'\n    N804\n    # line break before binary operator\n    W503\n    # whitespace before ':'\n    E203\nexclude =\n    .tox\n    .git,\n    __pycache__,\n    build,\n    *.pyc,\n    *third_party*,\n    scripts\nmax-line-length = 120\nmax-complexity = 25\nimport-order-style = pycharm\napplication-import-names =\n    marius\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.md",
    "content": "---\nname: Bug report\nabout: Create a report to help us improve\ntitle: ''\nlabels: bug\nassignees: ''\n\n---\n\n**Describe the bug**\nA clear and concise description of what the bug is.\n\n**To Reproduce**\nSteps to reproduce the behavior:\n1. Go to '...'\n2. Click on '....'\n3. Scroll down to '....'\n4. See error\n\n**Expected behavior**\nA clear and concise description of what you expected to happen.\n\n**Environment**\nList your operating system and dependency versions. You can obtain these by running `marius_env_info` from the command line.\n\n**Additional context**\nAdd any other context about the problem here.\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/documentation-improvement.md",
    "content": "---\nname: Documentation Improvement\nabout: 'Suggest improvements to the documentation '\ntitle: ''\nlabels: documentation\nassignees: ''\n\n---\n\n**What is the documentation lacking? Please describe.**\nA clear and concise description of what the problem is. \n\n**Describe the improvement you'd like**\nA clear and concise description of what you want to be added/fixed.\n\n**Additional context**\nProvide additional information and links to the relevant sections of the documentation (if applicable).\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.md",
    "content": "---\nname: Feature request\nabout: Suggest an idea for this project\ntitle: ''\nlabels: enhancement\nassignees: ''\n\n---\n\n**Is your feature request related to a problem? Please describe.**\nA clear and concise description of what the problem is. Ex. I'm always frustrated when [...]\n\n**Describe the solution you'd like**\nA clear and concise description of what you want to happen.\n\n**Describe alternatives you've considered**\nA clear and concise description of any alternative solutions or features you've considered.\n\n**Additional context**\nAdd any other context or screenshots about the feature request here.\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/general-question.md",
    "content": "---\nname: General Question\nabout: Ask a question\ntitle: ''\nlabels: question\nassignees: ''\n\n---\n\n\n"
  },
  {
    "path": ".github/PULL_REQUEST_TEMPLATE/pull_request_template.md",
    "content": "If there is no outstanding issue related to this change, please open an issue before submitting this pull request. For small and trivial changes, this step can be skipped.\n\n**Describe the pull request.**\nA clear and concise description of what the pull request contains. \n\n**How was this tested?**\nDescribe the tests that were added and any manual testing if applicable.\n\n**Please link the issue(s) this relates to.**\n\n**Additional context**\nAdd any other context or screenshots for the pull request here. Include notes on any follow-up work that may be required.\n"
  },
  {
    "path": ".github/workflows/build_and_test.yml",
    "content": "name: Build and Test\n\non:\n  push:\n    branches:\n      - main\n  pull_request:\n    branches:\n      - main\nenv:\n  BUILD_TYPE: Release\n\njobs:\n  build:\n    name: ${{ matrix.config.name }}\n    runs-on: ${{ matrix.config.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        config:\n        - {\n            name: \"Ubuntu 20.04 GCC\", artifact: \"Linux.7z\",\n            os: ubuntu-20.04,\n            cc: \"gcc\", cxx: \"g++\"\n          }\n    steps:\n    - uses: actions/checkout@v2\n\n    - name: Install dependencies\n      working-directory: ${{github.workspace}}\n      shell: bash\n      run:   |\n        \n        python3 --version\n      \n        sudo python3 -m pip install pyarrow\n\n        if [ \"$RUNNER_OS\" == \"Linux\" ]; then\n             sudo pip3 install torch --extra-index-url https://download.pytorch.org/whl/cpu\n        else\n             echo \"$RUNNER_OS not supported\"\n             exit 1\n        fi\n      \n    - name: Install Marius\n      working-directory: ${{github.workspace}}\n      shell: bash\n      run: |\n        sudo pip3 install .[tests] --verbose\n        marius_env_info\n\n    - name: Run Tests\n      working-directory: ${{github.workspace}}\n      shell: bash\n      run: OMP_NUM_THREADS=1 MARIUS_TEST_HOME=test/ python3 -m pytest test/python --verbose\n\n"
  },
  {
    "path": ".github/workflows/db2graph_test_postgres.yml",
    "content": "name: Testing DB2GRAPH using postgres\non:\n  push:\n    branches:\n      - main\n  pull_request:\n    branches:\n      - main\n\njobs:\n\n  db2graph:\n    runs-on: ubuntu-latest\n    container: ${{ matrix.python_container }}\n    strategy:\n      matrix:\n        python_container: [\"python:3.7\", \"python:3.8\", \"python:3.9\", \"python:3.10\"]\n\n    services:\n      postgres:\n        # Docker Hub image\n        image: postgres\n        # Provide the password for postgres\n        env:\n          POSTGRES_PASSWORD: postgres\n        # Set health checks to wait until postgres has started\n        options: >-\n          --health-cmd pg_isready\n          --health-interval 10s\n          --health-timeout 5s\n          --health-retries 5\n\n    steps:\n      # Downloads a copy of the code in your repository before running CI tests\n      - name: Check out repository code\n        uses: actions/checkout@v3\n\n      - name: Installing dependencies\n        run: MARIUS_NO_BINDINGS=1 python3 -m pip install .[db2graph,tests]\n\n      - name: Running pytest\n        run: MARIUS_NO_BINDINGS=1 pytest -s test/db2graph/test_postgres.py\n        # Environment variables used in the test\n        env:\n          # The hostname used to communicate with the PostgreSQL service container\n          POSTGRES_HOST: postgres\n          # The default PostgreSQL port - using default port\n          POSTGRES_PORT: 5432"
  },
  {
    "path": ".github/workflows/lint.yml",
    "content": "name: Lint\n\non: [push, pull_request]\n\njobs:\n  linting:\n    runs-on: ubuntu-latest\n    steps:\n    - uses: actions/checkout@v2\n    - name: Install Tox\n      run: pip3 install tox\n    - name: Update clang-format\n      run: pip3 install --upgrade pip; pip3 install clang-format\n    - name: Check linting with Flake8\n      run: tox -e check_lint\n"
  },
  {
    "path": ".gitignore",
    "content": "CMakeCache.txt\nCMakeFiles\nCMakeScripts\nTesting\nMakefile\ncmake_install.cmake\ninstall_manifest.txt\ncompile_commands.json\nCTestTestfile.cmake\n.idea/\ncmake-*/\nlogs/\ndata/\n!src/cpp/src/data\n!src/cpp/include/data\ntest/test_data/generated/\n*.dylib\n\n# Created by https://www.toptal.com/developers/gitignore/api/python\n# Edit at https://www.toptal.com/developers/gitignore?templates=python\n\n### Python ###\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndocs_build/\ndocs_html/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\npytestdebug.log\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\ndoc/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# poetry\n#poetry.lock\n\n# PEP 582; used by 
e.g. github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n# .env\n.env/\n.venv/\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\npythonenv*\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# operating system-related files\n# file properties cache/storage on macOS\n*.DS_Store\n# thumbnail cache on Windows\nThumbs.db\n\n# profiling data\n.prof\n\n\n# End of https://www.toptal.com/developers/gitignore/api/python\n"
  },
  {
    "path": ".gitmodules",
    "content": "[submodule \"src/cpp/third_party/pybind11\"]\n\tpath = src/cpp/third_party/pybind11\n\turl = https://github.com/pybind/pybind11.git\n[submodule \"src/cpp/third_party/spdlog\"]\n\tpath = src/cpp/third_party/spdlog\n\turl = https://github.com/gabime/spdlog.git\n[submodule \"src/cpp/third_party/googletest\"]\n\tpath = src/cpp/third_party/googletest\n\turl = https://github.com/google/googletest.git\n[submodule \"src/cpp/third_party/parallel-hashmap\"]\n\tpath = src/cpp/third_party/parallel-hashmap\n\turl = https://github.com/greg7mdp/parallel-hashmap.git\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "cmake_minimum_required(VERSION 3.12.2)\nset(CMAKE_CXX_STANDARD 17)\nset(CMAKE_CXX_STANDARD_REQUIRED ON)\ncmake_policy(SET CMP0048 NEW)\n\nproject(marius VERSION 0.1 LANGUAGES CXX)\n\ninclude(FindPackageHandleStandardArgs)\n\nadd_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0)\n\nset(CMAKE_CXX_VISIBILITY_PRESET default)\n\nif (\"${CMAKE_CXX_COMPILER_ID}\" MATCHES \"Clang\")\n    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0)\n        message(FATAL_ERROR \"Clang version must be at least 11!\")\n    endif()\n    set(CLANG TRUE)\nelseif (\"${CMAKE_CXX_COMPILER_ID}\" MATCHES \"GNU\")\n    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)\n        message(FATAL_ERROR \"GCC version must be at least 7.0!\")\n    endif()\n    set(GCC TRUE)\nelse ()\n    message(FATAL_ERROR \"Unknown compiler\")\nendif ()\n\nif (${CMAKE_SYSTEM_NAME} MATCHES \"Darwin\")\n    set(CMAKE_MACOSX_RPATH 1)\nendif ()\n\nif(${USE_CUDA})\n    add_definitions(-DMARIUS_CUDA=${USE_CUDA})\n    set(CMAKE_CUDA_STANDARD 14)\n    set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)\n    enable_language(CUDA)\n    set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr\")\nendif()\n\n# Find torch location\nexecute_process(\n        COMMAND python3 -c \"import torch; import os; print(os.path.dirname(torch.__file__), end='')\"\n        OUTPUT_VARIABLE TorchPath\n)\nlist(APPEND CMAKE_PREFIX_PATH ${TorchPath})\n\nexecute_process(\n        COMMAND python3 -c \"import torch; print(torch.__version__, end='')\"\n        OUTPUT_VARIABLE TorchVersion\n)\n\nmessage(STATUS \"Torch Version: ${TorchVersion}\")\n\n# Add the cmake folder so the FindSphinx module is found\n\nset(MARIUS_CPP_SOURCE ${CMAKE_CURRENT_LIST_DIR}/src/cpp)\nset(CMAKE_MODULE_PATH \"${MARIUS_CPP_SOURCE}/cmake\" ${CMAKE_MODULE_PATH})\nset(project_INCLUDE_DIR ${MARIUS_CPP_SOURCE}/include)\nset(project_SOURCE_DIR ${MARIUS_CPP_SOURCE}/src)\nset(project_CUDA_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/src/cuda/include)\nset(project_CUDA_SOURCE_DIR 
${CMAKE_CURRENT_LIST_DIR}/src/cuda/src)\nset(project_CUDA_THIRD_PARTY_DIR ${CMAKE_CURRENT_LIST_DIR}/src/cuda/third_party)\nset(project_TEST_DIR ${CMAKE_CURRENT_LIST_DIR}/test)\nset(project_DOCS_DIR ${CMAKE_CURRENT_LIST_DIR}/docs)\nset(project_BINDINGS_DIR ${MARIUS_CPP_SOURCE}/python_bindings)\nset(project_THIRD_PARTY_DIR ${MARIUS_CPP_SOURCE}/third_party)\n\nset(project_WORKING_DIR ${CMAKE_CURRENT_BINARY_DIR})\nadd_definitions(-DMARIUS_BASE_DIRECTORY=\"${CMAKE_CURRENT_LIST_DIR}\")\nadd_definitions(-DMARIUS_TEST_DIRECTORY=\"${project_TEST_DIR}\")\n\nif (EXISTS ${project_INCLUDE_DIR})\n    file(GLOB_RECURSE project_HEADERS ${project_HEADERS} ${project_INCLUDE_DIR}/*.h)\nendif ()\nif (EXISTS ${project_SOURCE_DIR})\n    file(GLOB_RECURSE project_SOURCES ${project_SOURCES} ${project_SOURCE_DIR}/*.cpp)\nendif ()\n\nif(${USE_CUDA})\n    if (EXISTS ${project_CUDA_INCLUDE_DIR})\n        file(GLOB_RECURSE project_CUDA_HEADERS ${project_CUDA_INCLUDE_DIR} ${project_CUDA_INCLUDE_DIR}/*.cuh)\n    endif ()\n    if (EXISTS ${project_CUDA_SOURCE_DIR})\n        file(GLOB_RECURSE project_CUDA_SOURCES ${project_CUDA_SOURCE_DIR} ${project_CUDA_SOURCE_DIR}/*.cu)\n    endif ()\n\n    if (EXISTS ${project_CUDA_THIRD_PARTY_DIR})\n        file(GLOB_RECURSE project_CUDA_THIRD_PARTY_HEADERS ${project_CUDA_THIRD_PARTY_DIR} ${project_CUDA_THIRD_PARTY_DIR}/*.cuh ${project_CUDA_THIRD_PARTY_DIR}/*.h)\n    endif ()\n    if (EXISTS ${project_CUDA_THIRD_PARTY_DIR})\n        file(GLOB_RECURSE project_CUDA_THIRD_PARTY_SOURCES ${project_CUDA_THIRD_PARTY_DIR} ${project_CUDA_THIRD_PARTY_DIR}/*.cu ${project_CUDA_THIRD_PARTY_DIR}/*.cpp)\n    endif ()\nendif ()\n\nmessage(STATUS \"project_CUDA_THIRD_PARTY_HEADERS ${project_CUDA_THIRD_PARTY_HEADERS}\")\nmessage(STATUS \"project_CUDA_THIRD_PARTY_SOURCES ${project_CUDA_THIRD_PARTY_SOURCES}\")\n\nfind_package(Python3 COMPONENTS Development Interpreter REQUIRED)\nfind_package(Torch REQUIRED)\n\nexecute_process(\n        COMMAND python3 -c \"import torch; 
print(torch._C._PYBIND11_COMPILER_TYPE, end='')\"\n        OUTPUT_VARIABLE _PYBIND11_COMPILER_TYPE\n)\nexecute_process(\n        COMMAND python3 -c \"import torch; print(torch._C._PYBIND11_STDLIB, end='')\"\n        OUTPUT_VARIABLE _PYBIND11_STDLIB\n)\nexecute_process(\n        COMMAND python3 -c \"import torch; print(torch._C._PYBIND11_BUILD_ABI, end='')\"\n        OUTPUT_VARIABLE _PYBIND11_BUILD_ABI\n)\n\nmessage(STATUS \"PYBIND11_COMPILER_TYPE:\" ${_PYBIND11_COMPILER_TYPE})\nmessage(STATUS \"PYBIND11_STDLIB:\" ${_PYBIND11_STDLIB})\nmessage(STATUS \"PYBIND11_BUILD_ABI:\" ${_PYBIND11_BUILD_ABI})\n\nadd_compile_definitions(PYBIND11_COMPILER_TYPE=\"${_PYBIND11_COMPILER_TYPE}\" PYBIND11_STDLIB=\"${_PYBIND11_STDLIB}\" PYBIND11_BUILD_ABI=\"${_PYBIND11_BUILD_ABI}\")\n\nset(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}\")\nmessage(STATUS \"CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}\")\n\nmessage(STATUS \"Python3_INCLUDE_DIRS ${Python3_INCLUDE_DIRS}\")\nadd_subdirectory(${project_THIRD_PARTY_DIR})\nset_property(TARGET spdlog PROPERTY POSITION_INDEPENDENT_CODE ON)\n\ninclude_directories(${Python3_INCLUDE_DIRS})\ninclude_directories(${project_INCLUDE_DIR})\ninclude_directories(${project_CUDA_INCLUDE_DIR})\ninclude_directories(${project_CUDA_THIRD_PARTY_DIR})\ninclude_directories(${TORCH_INCLUDE_DIRS})\ninclude_directories(${project_THIRD_PARTY_DIR}/parallel-hashmap/)\ninclude_directories(${project_BINDINGS})\n\nadd_library(${PROJECT_NAME}\n            SHARED\n            ${project_SOURCES}\n            ${project_HEADERS}\n            ${project_CUDA_HEADERS}\n            ${project_CUDA_SOURCES}\n            ${project_CUDA_THIRD_PARTY_HEADERS}\n            ${project_CUDA_THIRD_PARTY_SOURCES})\n\nif(NOT APPLE)\n    target_link_libraries(${PROJECT_NAME} ${Python3_LIBRARIES})\nelse()\n    set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS \"-undefined dynamic_lookup\")\nendif()\n\ntarget_link_libraries(${PROJECT_NAME} 
${TORCH_LIBRARIES})\ntarget_link_libraries(${PROJECT_NAME} spdlog)\nset_target_properties(${PROJECT_NAME} PROPERTIES PUBLIC_HEADER \"${project_HEADERS}\")\nset_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)\n\nif(${USE_CUDA})\n    set(NVCC_FLAGS \"{NVCC_FLAGS} --expt-relaxed-constexpr\")\nendif()\n\nif(${USE_OMP})\n    add_definitions(-DMARIUS_OMP=${USE_OMP})\n    if(APPLE)\n        if(CMAKE_C_COMPILER_ID MATCHES \"Clang\")\n            set(OpenMP_C \"${CMAKE_C_COMPILER}\")\n            set(OpenMP_C_FLAGS \"-Xpreprocessor -fopenmp\")\n            set(OpenMP_C_LIB_NAMES \"omp\")\n            set(OpenMP_omp_LIBRARY omp)\n        endif()\n        if(CMAKE_CXX_COMPILER_ID MATCHES \"Clang\")\n            set(OpenMP_CXX \"${CMAKE_CXX_COMPILER}\")\n            set(OpenMP_CXX_FLAGS \"-Xpreprocessor -fopenmp\")\n            set(OpenMP_CXX_LIB_NAMES \"omp\")\n            set(OpenMP_omp_LIBRARY omp)\n        endif()\n    endif()\n\n    if(\"${CMAKE_CXX_COMPILER_ID}\" MATCHES \"GNU\")\n        set(OpenMP_CXX \"${CMAKE_CXX_COMPILER}\")\n        set(OpenMP_CXX_FLAGS \"-fopenmp\")\n    endif()\n    find_package(OpenMP REQUIRED)\n    target_link_libraries(${PROJECT_NAME} OpenMP::OpenMP_CXX)\nendif()\n\nif (EXISTS ${project_INCLUDE_DIR})\n    target_include_directories(${PROJECT_NAME} PUBLIC ${project_INCLUDE_DIR})\nendif ()\nif (EXISTS ${project_SOURCE_DIR})\n    target_include_directories(${PROJECT_NAME} PRIVATE ${project_SOURCE_DIR})\nendif ()\n\nIF(CMAKE_BUILD_TYPE MATCHES Debug AND MARIUS_USE_TSAN)\n    message(\"Using address sanitizer\")\n    set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} -fsanitize=thread\")\n    set(CMAKE_MODULE_LINKER_FLAGS \"{$CMAKE_MODULE_LINKER_FLAGS} -fsanitize=thread\")\nENDIF(CMAKE_BUILD_TYPE MATCHES Debug AND MARIUS_USE_TSAN)\n\nIF(CMAKE_BUILD_TYPE MATCHES Debug AND MARIUS_USE_ASAN)\n    message(\"Using thread sanitizer\")\n    set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} -fsanitize=address -fsanitize=leak\")\n    
set(CMAKE_MODULE_LINKER_FLAGS \"{$CMAKE_MODULE_LINKER_FLAGS} -fsanitize=address -fsanitize=leak\")\nENDIF(CMAKE_BUILD_TYPE MATCHES Debug AND MARIUS_USE_ASAN)\n\n\nIF(BUILD_DOCS)\n    add_subdirectory(${project_DOCS_DIR})\nENDIF()\n\nif (EXISTS ${project_TEST_DIR})\n    enable_testing()\n    add_subdirectory(${project_TEST_DIR})\nendif ()\n\nadd_executable(marius_train ${project_SOURCE_DIR}/marius.cpp)\nadd_executable(marius_eval ${project_SOURCE_DIR}/marius.cpp)\ntarget_link_libraries(marius_train ${PROJECT_NAME})\ntarget_link_libraries(marius_eval ${PROJECT_NAME})\n\nfind_library(TORCH_PYTHON_LIBRARY torch_python PATHS \"${TORCH_INSTALL_PREFIX}/lib\")\nmessage(STATUS \"TORCH_PYTHON_LIBRARY: ${TORCH_PYTHON_LIBRARY}\")\n\nfile(GLOB_RECURSE CONFIG_BINDINGS ${project_BINDINGS} ${project_BINDINGS_DIR}/configuration/*.cpp)\npybind11_add_module(_config ${CONFIG_BINDINGS})\ntarget_link_libraries(_config PRIVATE ${PROJECT_NAME} ${TORCH_PYTHON_LIBRARY})\n\nfile(GLOB_RECURSE DATA_BINDINGS ${project_BINDINGS} ${project_BINDINGS_DIR}/data/*.cpp)\npybind11_add_module(_data ${DATA_BINDINGS})\ntarget_link_libraries(_data PRIVATE ${PROJECT_NAME} ${TORCH_PYTHON_LIBRARY})\n\nfile(GLOB_RECURSE NN_BINDINGS ${project_BINDINGS} ${project_BINDINGS_DIR}/nn/*.cpp)\npybind11_add_module(_nn ${NN_BINDINGS})\ntarget_link_libraries(_nn PRIVATE ${PROJECT_NAME} ${TORCH_PYTHON_LIBRARY})\n\nfile(GLOB_RECURSE MANAGER_BINDINGS ${project_BINDINGS} ${project_BINDINGS_DIR}/manager/*.cpp)\npybind11_add_module(_manager ${MANAGER_BINDINGS})\ntarget_link_libraries(_manager PRIVATE ${PROJECT_NAME} ${TORCH_PYTHON_LIBRARY})\n\nfile(GLOB_RECURSE PIPELINE_BINDINGS ${project_BINDINGS} ${project_BINDINGS_DIR}/pipeline/*.cpp)\npybind11_add_module(_pipeline ${PIPELINE_BINDINGS})\ntarget_link_libraries(_pipeline PRIVATE ${PROJECT_NAME} ${TORCH_PYTHON_LIBRARY})\n\nfile(GLOB_RECURSE REPORT_BINDINGS ${project_BINDINGS} ${project_BINDINGS_DIR}/reporting/*.cpp)\npybind11_add_module(_report 
${REPORT_BINDINGS})\ntarget_link_libraries(_report PRIVATE ${PROJECT_NAME} ${TORCH_PYTHON_LIBRARY})\n\nfile(GLOB_RECURSE STORAGE_BINDINGS ${project_BINDINGS} ${project_BINDINGS_DIR}/storage/*.cpp)\npybind11_add_module(_storage ${STORAGE_BINDINGS})\ntarget_link_libraries(_storage PRIVATE ${PROJECT_NAME} ${TORCH_PYTHON_LIBRARY})\n\nadd_custom_target(bindings)\nadd_dependencies(bindings _config _data _manager _nn _pipeline _report _storage)\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing to Marius\n\nAny contributions users wish to make to Marius are welcome. To name a few, here are some ways to contribute:\n\n\n- Adding new models\n- Adding new datasets and converters\n- Downstream inference examples\n- Documentation improvements\n- Bug fixes\n\n## Contributing Code\n\n1. Fork the Marius repository\n2. Clone the forked repo and create a new branch for your change  \n- `git clone https://github.com/<YourUsername>/marius`  \n- `git checkout -b <feature_branch>`\n   \n3. Add your changes to the feature branch \n\n4. Write tests for your changes  \n- C++ tests (gtest) are located under `test/cpp`\n- Python tests are located in `test/python`\n\n5. Run tests and verify nothing is broken.\nSee the testing README (`test/README.md`) for how to build and run the tests.\n\n## Submitting a Pull Request\n\nOnce your changes have been completed, or if you want to submit an in-progress pull request to get eyes on it, please follow these steps:\n\n1. Sync your feature branch with the main branch\n\n- `git remote add upstream https://github.com/marius-team/marius.git`\n\n- `git fetch upstream main`\n\n- `git merge upstream/main`\n\n2. Create and submit a pull request that follows the provided template. The pull request will be reviewed by the maintainers of Marius.\n\n3. Address the comments from the reviewer(s) and update your pull request accordingly. \n\n4. Once the review process is complete, your changes will be merged in!\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "MANIFEST.in",
    "content": "graft src\ninclude CMakeLists.txt\nglobal-exclude *.py[cod] __pycache__ *.so *.dylib .DS_Store *.gpickle"
  },
  {
    "path": "README.md",
    "content": "# Marius and MariusGNN #\n\nThis repository contains the code for the Marius and MariusGNN papers. \nWe have combined the two works into one unified system for training \ngraph embeddings and graph neural networks over large-scale graphs \non a single machine using the entire memory hierarchy.\n\nMarius ([OSDI '21 Paper](https://www.usenix.org/conference/osdi21/presentation/mohoney)) is designed to mitigate/reduce data movement overheads for graph embeddings using:\n- Pipelined training and IO\n- Partition caching and a buffer-aware data ordering to minimize IO for disk-based training (called BETA)\n\nMariusGNN ([EuroSys '23 Paper](https://dl.acm.org/doi/abs/10.1145/3552326.3567501)) \nutilizes the data movement optimizations from Marius and adds support for scalable graph neural network training through:\n- An optimized data structure for neighbor sampling and GNN aggregation (called DENSE)\n- An improved data ordering for disk-based training (called COMET) which minimizes IO and maximizes model accuracy (with COMET now subsuming BETA)\n\n## Build and Install ##\n\n### Requirements ###\n\n* CUDA >= 10.1\n* CuDNN >= 7 \n* PyTorch >= 1.8\n* Python >= 3.7\n* GCC >= 7 (On Linux) or Clang >= 11.0 (On MacOS)\n* CMake >= 3.12\n* Make >= 3.8\n\n### Docker Installation ###\nWe recommend using Docker for build and installation. 
\nWe provide a Dockerfile which installs all the necessary \nrequirements and provide end-to-end instructions in `examples/docker/`.\n\n\n### Pip Installation ###\nWith the required dependencies installed, Marius and MariusGNN can be built using Pip:  \n\n```\ngit clone https://github.com/marius-team/marius.git\ncd marius\npip3 install .\n```\n\n### Installation Result ###\n\nAfter installation, the Python API can be accessed with ``import marius``.\n\nThe following command line tools will also be installed:\n- marius_train: Train models using configuration files and the command line\n- marius_eval: Command line model evaluation\n- marius_preprocess: Built-in dataset downloading and preprocessing\n- marius_predict: Batch inference tool for link prediction or node classification\n\n## Command Line Interface ##\n\nThe command line interface supports performant in-memory and out-of-core \ntraining and evaluation of graph learning models. Experimental results \nfrom our papers can be reproduced using this interface (we also provide\nan exact experiment artifact for each paper in separate branches).\n\n### Quick Start: ###\n\nFirst make sure Marius is installed. 
\n\nPreprocess the FB15K_237 dataset with `marius_preprocess --dataset fb15k_237 --output_dir datasets/fb15k_237_example/`\n\nTrain using the example configuration file (assuming we are in the root directory of the repository) `marius_train examples/configuration/fb15k_237.yaml`\n\nAfter running this configuration file, the MRR output by the system should be about .25 after 10 epochs.\n\nPerform batch inference on the test set with `marius_predict --config examples/configuration/fb15k_237.yaml --metrics mrr --save_scores --save_ranks`\n\nSee the [full example](http://marius-project.org/marius/examples/config/lp_fb15k237.html#small-scale-link-prediction-fb15k-237) for details.\n\n## Python API ##\n\nThe Python API is currently experimental and can be used to perform in-memory training and evaluation of graph learning models. \n\nSee the [documentation](http://marius-project.org/marius/examples/python/index.html#) and `examples/python/` for Python API usage and examples.\n\n\n## Citing Marius or MariusGNN ##\nMarius (out-of-core graph embeddings)\n```\n@inproceedings{Marius,\n    author = {Jason Mohoney and Roger Waleffe and Henry Xu and Theodoros Rekatsinas and Shivaram Venkataraman},\n    title = {Marius: Learning Massive Graph Embeddings on a Single Machine},\n    booktitle = {15th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 21)},\n    year = {2021},\n    isbn = {9781939133229},\n    pages = {533--549},\n    url = {https://www.usenix.org/conference/osdi21/presentation/mohoney},\n    publisher = {{USENIX} Association}\n}\n```\n\nMariusGNN (out-of-core GNN training)\n```\n@inproceedings{MariusGNN, \n    author = {Roger Waleffe and Jason Mohoney and Theodoros Rekatsinas and Shivaram Venkataraman},\n    title = {MariusGNN: Resource-Efficient Out-of-Core Training of Graph Neural Networks}, \n    booktitle = {Proceedings of the Eighteenth European Conference on Computer Systems}, \n    year = {2023}, \n    isbn = {9781450394871}, \n    
pages = {144--161},\n    url = {https://doi.org/10.1145/3552326.3567501},\n    publisher = {Association for Computing Machinery}\n}\n```\n"
  },
  {
    "path": "docs/.nojekyll",
    "content": ""
  },
  {
    "path": "docs/CMakeLists.txt",
    "content": "# https://devblogs.microsoft.com/cppblog/clear-functional-c-documentation-with-sphinx-breathe-doxygen-cmake/\nfind_package(Doxygen REQUIRED)\nfind_package(Sphinx REQUIRED)\n\n# Find all the public headers\nfile(GLOB_RECURSE PROJECT_HEADERS ${project_INCLUDE_DIR}/*.h)\n\nset(DOXYGEN_INPUT_DIR ${PROJECT_SOURCE_DIR}/src/cpp/include)\nset(DOXYGEN_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/doxygen)\nset(DOXYGEN_INDEX_FILE ${DOXYGEN_OUTPUT_DIR}/xml/index.xml)\nset(DOXYFILE_IN ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in)\nset(DOXYFILE_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile)\n\n# Replace variables inside @@ with the current values\nconfigure_file(${DOXYFILE_IN} ${DOXYFILE_OUT} @ONLY)\n\n# Doxygen won't create this for us\nfile(MAKE_DIRECTORY ${DOXYGEN_OUTPUT_DIR})\n\n# Only regenerate Doxygen when the Doxyfile or public headers change\nadd_custom_command(OUTPUT ${DOXYGEN_INDEX_FILE}\n        DEPENDS ${PROJECT_HEADERS}\n        COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYFILE_OUT}\n        MAIN_DEPENDENCY ${DOXYFILE_OUT} ${DOXYFILE_IN}\n        COMMENT \"Generating docs\"\n        VERBATIM)\n\n# Nice named target so we can run the job easily\nadd_custom_target(Doxygen ALL DEPENDS ${DOXYGEN_INDEX_FILE})\n\nset(SPHINX_SOURCE ${CMAKE_CURRENT_SOURCE_DIR})\nset(SPHINX_BUILD ${CMAKE_CURRENT_BINARY_DIR}/html)\nset(SPHINX_INDEX_FILE ${SPHINX_BUILD}/index.html)\n\n# Only regenerate Sphinx when:\n# - Doxygen has rerun\n# - Our doc files have been updated\n# - The Sphinx config has been updated\nadd_custom_command(OUTPUT ${SPHINX_INDEX_FILE}\n        COMMAND\n        ${SPHINX_EXECUTABLE} -b html\n        # Tell Breathe where to find the Doxygen output\n        -Dbreathe_projects.Marius=${DOXYGEN_OUTPUT_DIR}/xml\n        ${SPHINX_SOURCE} ${SPHINX_BUILD}\n        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}\n        DEPENDS\n        # Other docs files you want to track should go here (or in some variable)\n        ${CMAKE_CURRENT_SOURCE_DIR}/index.rst\n        ${DOXYGEN_INDEX_FILE}\n 
       MAIN_DEPENDENCY ${SPHINX_SOURCE}/conf.py\n        COMMENT \"Generating documentation with Sphinx\")\n\n# Nice named target so we can run the job easily\nadd_custom_target(Sphinx ALL DEPENDS ${SPHINX_INDEX_FILE})\n\n# Add an install target to install the docs\ninclude(GNUInstallDirs)\ninstall(DIRECTORY ${SPHINX_BUILD}\n        DESTINATION ${CMAKE_INSTALL_DOCDIR})"
  },
  {
    "path": "docs/Doxyfile",
    "content": "# Doxyfile 1.8.20\n\n# This file describes the settings to be used by the documentation system\n# doxygen (www.doxygen.org) for a project.\n#\n# All text after a double hash (##) is considered a comment and is placed in\n# front of the TAG it is preceding.\n#\n# All text after a single hash (#) is considered a comment and will be ignored.\n# The format is:\n# TAG = value [value, ...]\n# For lists, items can also be appended using:\n# TAG += value [value, ...]\n# Values that contain spaces should be placed between quotes (\\\" \\\").\n\n#---------------------------------------------------------------------------\n# Project related configuration options\n#---------------------------------------------------------------------------\n\n# This tag specifies the encoding used for all characters in the configuration\n# file that follow. The default is UTF-8 which is also the encoding used for all\n# text before the first occurrence of this tag. Doxygen uses libiconv (or the\n# iconv built into libc) for the transcoding. See\n# https://www.gnu.org/software/libiconv/ for the list of possible encodings.\n# The default value is: UTF-8.\n\nDOXYFILE_ENCODING      = UTF-8\n\n# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by\n# double-quotes, unless you are using Doxywizard) that should identify the\n# project for which the documentation is generated. This name is used in the\n# title of most generated pages and in a few other places.\n# The default value is: My Project.\n\nPROJECT_NAME           = \"Marius\"\n\n# The PROJECT_NUMBER tag can be used to enter a project or revision number. This\n# could be handy for archiving the generated documentation or if some version\n# control system is used.\n\nPROJECT_NUMBER         =\n\n# Using the PROJECT_BRIEF tag one can provide an optional one line description\n# for a project that appears at the top of each page and should give viewer a\n# quick idea about the purpose of the project. 
Keep the description short.\n\nPROJECT_BRIEF          =\n\n# With the PROJECT_LOGO tag one can specify a logo or an icon that is included\n# in the documentation. The maximum height of the logo should not exceed 55\n# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy\n# the logo to the output directory.\n\nPROJECT_LOGO           =\n\n# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path\n# into which the generated documentation will be written. If a relative path is\n# entered, it will be relative to the location where doxygen was started. If\n# left blank the current directory will be used.\n\nOUTPUT_DIRECTORY       =\n\n# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-\n# directories (in 2 levels) under the output directory of each output format and\n# will distribute the generated files over these directories. Enabling this\n# option can be useful when feeding doxygen a huge amount of source files, where\n# putting all generated files in the same directory would otherwise causes\n# performance problems for the file system.\n# The default value is: NO.\n\nCREATE_SUBDIRS         = NO\n\n# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII\n# characters to appear in the names of generated files. If set to NO, non-ASCII\n# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode\n# U+3044.\n# The default value is: NO.\n\nALLOW_UNICODE_NAMES    = NO\n\n# The OUTPUT_LANGUAGE tag is used to specify the language in which all\n# documentation generated by doxygen is written. 
Doxygen will use this\n# information to generate all constant output in the proper language.\n# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,\n# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),\n# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,\n# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),\n# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,\n# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,\n# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,\n# Ukrainian and Vietnamese.\n# The default value is: English.\n\nOUTPUT_LANGUAGE        = English\n\n# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all\n# documentation generated by doxygen is written. Doxygen will use this\n# information to generate all generated output in the proper direction.\n# Possible values are: None, LTR, RTL and Context.\n# The default value is: None.\n\nOUTPUT_TEXT_DIRECTION  = None\n\n# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member\n# descriptions after the members that are listed in the file and class\n# documentation (similar to Javadoc). Set to NO to disable this.\n# The default value is: YES.\n\nBRIEF_MEMBER_DESC      = YES\n\n# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief\n# description of a member or function before the detailed description\n#\n# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the\n# brief descriptions will be completely suppressed.\n# The default value is: YES.\n\nREPEAT_BRIEF           = YES\n\n# This tag implements a quasi-intelligent brief description abbreviator that is\n# used to form the text in various listings. 
Each string in this list, if found\n# as the leading text of the brief description, will be stripped from the text\n# and the result, after processing the whole list, is used as the annotated\n# text. Otherwise, the brief description is used as-is. If left blank, the\n# following values are used ($name is automatically replaced with the name of\n# the entity):The $name class, The $name widget, The $name file, is, provides,\n# specifies, contains, represents, a, an and the.\n\nABBREVIATE_BRIEF       = \"The $name class\" \\\n                         \"The $name widget\" \\\n                         \"The $name file\" \\\n                         is \\\n                         provides \\\n                         specifies \\\n                         contains \\\n                         represents \\\n                         a \\\n                         an \\\n                         the\n\n# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then\n# doxygen will generate a detailed section even if there is only a brief\n# description.\n# The default value is: NO.\n\nALWAYS_DETAILED_SEC    = NO\n\n# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all\n# inherited members of a class in the documentation of that class as if those\n# members were ordinary class members. Constructors, destructors and assignment\n# operators of the base classes will not be shown.\n# The default value is: NO.\n\nINLINE_INHERITED_MEMB  = NO\n\n# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path\n# before files name in the file list and in the header files. If set to NO the\n# shortest path that makes the file name unique will be used\n# The default value is: YES.\n\nFULL_PATH_NAMES        = YES\n\n# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.\n# Stripping is only done if one of the specified strings matches the left-hand\n# part of the path. 
The tag can be used to show relative paths in the file list.\n# If left blank the directory from which doxygen is run is used as the path to\n# strip.\n#\n# Note that you can specify absolute paths here, but also relative paths, which\n# will be relative from the directory where doxygen is started.\n# This tag requires that the tag FULL_PATH_NAMES is set to YES.\n\nSTRIP_FROM_PATH        =\n\n# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the\n# path mentioned in the documentation of a class, which tells the reader which\n# header file to include in order to use a class. If left blank only the name of\n# the header file containing the class definition is used. Otherwise one should\n# specify the list of include paths that are normally passed to the compiler\n# using the -I flag.\n\nSTRIP_FROM_INC_PATH    =\n\n# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but\n# less readable) file names. This can be useful is your file systems doesn't\n# support long names like on DOS, Mac, or CD-ROM.\n# The default value is: NO.\n\nSHORT_NAMES            = NO\n\n# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the\n# first line (until the first dot) of a Javadoc-style comment as the brief\n# description. If set to NO, the Javadoc-style will behave just like regular Qt-\n# style comments (thus requiring an explicit @brief command for a brief\n# description.)\n# The default value is: NO.\n\nJAVADOC_AUTOBRIEF      = NO\n\n# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line\n# such as\n# /***************\n# as being the beginning of a Javadoc-style comment \"banner\". 
If set to NO, the\n# Javadoc-style will behave just like regular comments and it will not be\n# interpreted by doxygen.\n# The default value is: NO.\n\nJAVADOC_BANNER         = NO\n\n# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first\n# line (until the first dot) of a Qt-style comment as the brief description. If\n# set to NO, the Qt-style will behave just like regular Qt-style comments (thus\n# requiring an explicit \\brief command for a brief description.)\n# The default value is: NO.\n\nQT_AUTOBRIEF           = NO\n\n# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a\n# multi-line C++ special comment block (i.e. a block of //! or /// comments) as\n# a brief description. This used to be the default behavior. The new default is\n# to treat a multi-line C++ comment block as a detailed description. Set this\n# tag to YES if you prefer the old behavior instead.\n#\n# Note that setting this tag to YES also means that rational rose comments are\n# not recognized any more.\n# The default value is: NO.\n\nMULTILINE_CPP_IS_BRIEF = NO\n\n# By default Python docstrings are displayed as preformatted text and doxygen's\n# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the\n# doxygen's special commands can be used and the contents of the docstring\n# documentation blocks is shown as doxygen documentation.\n# The default value is: YES.\n\nPYTHON_DOCSTRING       = YES\n\n# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the\n# documentation from any documented member that it re-implements.\n# The default value is: YES.\n\nINHERIT_DOCS           = YES\n\n# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new\n# page for each member. 
If set to NO, the documentation of a member will be part\n# of the file/class/namespace that contains it.\n# The default value is: NO.\n\nSEPARATE_MEMBER_PAGES  = NO\n\n# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen\n# uses this value to replace tabs by spaces in code fragments.\n# Minimum value: 1, maximum value: 16, default value: 4.\n\nTAB_SIZE               = 4\n\n# This tag can be used to specify a number of aliases that act as commands in\n# the documentation. An alias has the form:\n# name=value\n# For example adding\n# \"sideeffect=@par Side Effects:\\n\"\n# will allow you to put the command \\sideeffect (or @sideeffect) in the\n# documentation, which will result in a user-defined paragraph with heading\n# \"Side Effects:\". You can put \\n's in the value part of an alias to insert\n# newlines (in the resulting output). You can put ^^ in the value part of an\n# alias to insert a newline as if a physical newline was in the original file.\n# When you need a literal { or } or , in the value part of an alias you have to\n# escape them by means of a backslash (\\), this can lead to conflicts with the\n# commands \\{ and \\} for these it is advised to use the version @{ and @} or use\n# a double escape (\\\\{ and \\\\})\n\nALIASES                =\n\n# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources\n# only. Doxygen will then generate output that is more tailored for C. For\n# instance, some of the names that are used will be different. The list of all\n# members will be omitted, etc.\n# The default value is: NO.\n\nOPTIMIZE_OUTPUT_FOR_C  = NO\n\n# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or\n# Python sources only. Doxygen will then generate output that is more tailored\n# for that language. 
For instance, namespaces will be presented as packages,\n# qualified scopes will look different, etc.\n# The default value is: NO.\n\nOPTIMIZE_OUTPUT_JAVA   = NO\n\n# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran\n# sources. Doxygen will then generate output that is tailored for Fortran.\n# The default value is: NO.\n\nOPTIMIZE_FOR_FORTRAN   = NO\n\n# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL\n# sources. Doxygen will then generate output that is tailored for VHDL.\n# The default value is: NO.\n\nOPTIMIZE_OUTPUT_VHDL   = NO\n\n# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice\n# sources only. Doxygen will then generate output that is more tailored for that\n# language. For instance, namespaces will be presented as modules, types will be\n# separated into more groups, etc.\n# The default value is: NO.\n\nOPTIMIZE_OUTPUT_SLICE  = NO\n\n# Doxygen selects the parser to use depending on the extension of the files it\n# parses. With this tag you can assign which parser to use for a given\n# extension. Doxygen has a built-in mapping, but you can override or extend it\n# using this tag. The format is ext=language, where ext is a file extension, and\n# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,\n# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL,\n# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:\n# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser\n# tries to guess whether the code is fixed or free formatted code, this is the\n# default for Fortran type files). 
For instance to make doxygen treat .inc files\n# as Fortran files (default is PHP), and .f files as C (default is Fortran),\n# use: inc=Fortran f=C.\n#\n# Note: For files without extension you can use no_extension as a placeholder.\n#\n# Note that for custom extensions you also need to set FILE_PATTERNS otherwise\n# the files are not read by doxygen.\n\nEXTENSION_MAPPING      =\n\n# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments\n# according to the Markdown format, which allows for more readable\n# documentation. See https://daringfireball.net/projects/markdown/ for details.\n# The output of markdown processing is further processed by doxygen, so you can\n# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in\n# case of backward compatibilities issues.\n# The default value is: YES.\n\nMARKDOWN_SUPPORT       = YES\n\n# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up\n# to that level are automatically included in the table of contents, even if\n# they do not have an id attribute.\n# Note: This feature currently applies only to Markdown headings.\n# Minimum value: 0, maximum value: 99, default value: 5.\n# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.\n\nTOC_INCLUDE_HEADINGS   = 5\n\n# When enabled doxygen tries to link words that correspond to documented\n# classes, or namespaces to their corresponding documentation. Such a link can\n# be prevented in individual cases by putting a % sign in front of the word or\n# globally by setting AUTOLINK_SUPPORT to NO.\n# The default value is: YES.\n\nAUTOLINK_SUPPORT       = YES\n\n# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want\n# to include (a tag file for) the STL sources as input, then you should set this\n# tag to YES in order to let doxygen match functions declarations and\n# definitions whose arguments contain STL classes (e.g. func(std::string);\n# versus func(std::string) {}). 
This also make the inheritance and collaboration\n# diagrams that involve STL classes more complete and accurate.\n# The default value is: NO.\n\nBUILTIN_STL_SUPPORT    = NO\n\n# If you use Microsoft's C++/CLI language, you should set this option to YES to\n# enable parsing support.\n# The default value is: NO.\n\nCPP_CLI_SUPPORT        = NO\n\n# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:\n# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen\n# will parse them like normal C++ but will assume all classes use public instead\n# of private inheritance when no explicit protection keyword is present.\n# The default value is: NO.\n\nSIP_SUPPORT            = NO\n\n# For Microsoft's IDL there are propget and propput attributes to indicate\n# getter and setter methods for a property. Setting this option to YES will make\n# doxygen to replace the get and set methods by a property in the documentation.\n# This will only work if the methods are indeed getting or setting a simple\n# type. If this is not the case, or you want to show the methods anyway, you\n# should set this option to NO.\n# The default value is: YES.\n\nIDL_PROPERTY_SUPPORT   = YES\n\n# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC\n# tag is set to YES then doxygen will reuse the documentation of the first\n# member in the group (if any) for the other members of the group. By default\n# all members of a group must be documented explicitly.\n# The default value is: NO.\n\nDISTRIBUTE_GROUP_DOC   = NO\n\n# If one adds a struct or class to a group and this option is enabled, then also\n# any nested class or struct is added to the same group. 
By default this option\n# is disabled and one has to add nested compounds explicitly via \\ingroup.\n# The default value is: NO.\n\nGROUP_NESTED_COMPOUNDS = NO\n\n# Set the SUBGROUPING tag to YES to allow class member groups of the same type\n# (for instance a group of public functions) to be put as a subgroup of that\n# type (e.g. under the Public Functions section). Set it to NO to prevent\n# subgrouping. Alternatively, this can be done per class using the\n# \\nosubgrouping command.\n# The default value is: YES.\n\nSUBGROUPING            = YES\n\n# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions\n# are shown inside the group in which they are included (e.g. using \\ingroup)\n# instead of on a separate page (for HTML and Man pages) or section (for LaTeX\n# and RTF).\n#\n# Note that this feature does not work in combination with\n# SEPARATE_MEMBER_PAGES.\n# The default value is: NO.\n\nINLINE_GROUPED_CLASSES = NO\n\n# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions\n# with only public data fields or simple typedef fields will be shown inline in\n# the documentation of the scope in which they are defined (i.e. file,\n# namespace, or group documentation), provided this scope is documented. If set\n# to NO, structs, classes, and unions are shown on a separate page (for HTML and\n# Man pages) or section (for LaTeX and RTF).\n# The default value is: NO.\n\nINLINE_SIMPLE_STRUCTS  = NO\n\n# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or\n# enum is documented as struct, union, or enum with the name of the typedef. So\n# typedef struct TypeS {} TypeT, will appear in the documentation as a struct\n# with name TypeT. When disabled the typedef will appear as a member of a file,\n# namespace, or class. And the struct will be named TypeS. 
This can typically be\n# useful for C code in case the coding convention dictates that all compound\n# types are typedef'ed and only the typedef is referenced, never the tag name.\n# The default value is: NO.\n\nTYPEDEF_HIDES_STRUCT   = NO\n\n# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This\n# cache is used to resolve symbols given their name and scope. Since this can be\n# an expensive process and often the same symbol appears multiple times in the\n# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small\n# doxygen will become slower. If the cache is too large, memory is wasted. The\n# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range\n# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536\n# symbols. At the end of a run doxygen will report the cache usage and suggest\n# the optimal cache size from a speed point of view.\n# Minimum value: 0, maximum value: 9, default value: 0.\n\nLOOKUP_CACHE_SIZE      = 0\n\n# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use\n# during processing. When set to 0 doxygen will base this on the number of\n# cores available in the system. You can set it explicitly to a value larger\n# than 0 to get more control over the balance between CPU load and processing\n# speed. At this moment only the input processing can be done using multiple\n# threads. Since this is still an experimental feature the default is set to 1,\n# which effectively disables parallel processing. Please report any issues you\n# encounter. 
Generating dot graphs in parallel is controlled by the\n# DOT_NUM_THREADS setting.\n# Minimum value: 0, maximum value: 32, default value: 1.\n\nNUM_PROC_THREADS       = 1\n\n#---------------------------------------------------------------------------\n# Build related configuration options\n#---------------------------------------------------------------------------\n\n# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in\n# documentation are documented, even if no documentation was available. Private\n# class members and static file members will be hidden unless the\n# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.\n# Note: This will also disable the warnings about undocumented members that are\n# normally produced when WARNINGS is set to YES.\n# The default value is: NO.\n\nEXTRACT_ALL            = YES\n\n# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will\n# be included in the documentation.\n# The default value is: NO.\n\nEXTRACT_PRIVATE        = YES\n\n# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual\n# methods of a class will be included in the documentation.\n# The default value is: NO.\n\nEXTRACT_PRIV_VIRTUAL   = YES\n\n# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal\n# scope will be included in the documentation.\n# The default value is: NO.\n\nEXTRACT_PACKAGE        = YES\n\n# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be\n# included in the documentation.\n# The default value is: NO.\n\nEXTRACT_STATIC         = YES\n\n# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined\n# locally in source files will be included in the documentation. If set to NO,\n# only classes defined in header files are included. Does not have any effect\n# for Java sources.\n# The default value is: YES.\n\nEXTRACT_LOCAL_CLASSES  = YES\n\n# This flag is only useful for Objective-C code. 
If set to YES, local methods,\n# which are defined in the implementation section but not in the interface are\n# included in the documentation. If set to NO, only methods in the interface are\n# included.\n# The default value is: NO.\n\nEXTRACT_LOCAL_METHODS  = NO\n\n# If this flag is set to YES, the members of anonymous namespaces will be\n# extracted and appear in the documentation as a namespace called\n# 'anonymous_namespace{file}', where file will be replaced with the base name of\n# the file that contains the anonymous namespace. By default anonymous namespaces\n# are hidden.\n# The default value is: NO.\n\nEXTRACT_ANON_NSPACES   = NO\n\n# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all\n# undocumented members inside documented classes or files. If set to NO these\n# members will be included in the various overviews, but no documentation\n# section is generated. This option has no effect if EXTRACT_ALL is enabled.\n# The default value is: NO.\n\nHIDE_UNDOC_MEMBERS     = NO\n\n# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all\n# undocumented classes that are normally visible in the class hierarchy. If set\n# to NO, these classes will be included in the various overviews. This option\n# has no effect if EXTRACT_ALL is enabled.\n# The default value is: NO.\n\nHIDE_UNDOC_CLASSES     = NO\n\n# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend\n# declarations. If set to NO, these declarations will be included in the\n# documentation.\n# The default value is: NO.\n\nHIDE_FRIEND_COMPOUNDS  = NO\n\n# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any\n# documentation blocks found inside the body of a function. If set to NO, these\n# blocks will be appended to the function's detailed documentation block.\n# The default value is: NO.\n\nHIDE_IN_BODY_DOCS      = NO\n\n# The INTERNAL_DOCS tag determines if documentation that is typed after a\n# \\internal command is included. 
If the tag is set to NO then the documentation\n# will be excluded. Set it to YES to include the internal documentation.\n# The default value is: NO.\n\nINTERNAL_DOCS          = NO\n\n# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file\n# names in lower-case letters. If set to YES, upper-case letters are also\n# allowed. This is useful if you have classes or files whose names only differ\n# in case and if your file system supports case sensitive file names. Windows\n# (including Cygwin) and Mac users are advised to set this option to NO.\n# The default value is: system dependent.\n\nCASE_SENSE_NAMES       = NO\n\n# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with\n# their full class and namespace scopes in the documentation. If set to YES, the\n# scope will be hidden.\n# The default value is: NO.\n\nHIDE_SCOPE_NAMES       = NO\n\n# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will\n# append additional text to a page's title, such as Class Reference. 
If set to\n# YES the compound reference will be hidden.\n# The default value is: NO.\n\nHIDE_COMPOUND_REFERENCE= NO\n\n# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of\n# the files that are included by a file in the documentation of that file.\n# The default value is: YES.\n\nSHOW_INCLUDE_FILES     = YES\n\n# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each\n# grouped member an include statement to the documentation, telling the reader\n# which file to include in order to use the member.\n# The default value is: NO.\n\nSHOW_GROUPED_MEMB_INC  = NO\n\n# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include\n# files with double quotes in the documentation rather than with sharp brackets.\n# The default value is: NO.\n\nFORCE_LOCAL_INCLUDES   = NO\n\n# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the\n# documentation for inline members.\n# The default value is: YES.\n\nINLINE_INFO            = YES\n\n# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the\n# (detailed) documentation of file and class members alphabetically by member\n# name. If set to NO, the members will appear in declaration order.\n# The default value is: YES.\n\nSORT_MEMBER_DOCS       = YES\n\n# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief\n# descriptions of file, namespace and class members alphabetically by member\n# name. If set to NO, the members will appear in declaration order. Note that\n# this will also influence the order of the classes in the class list.\n# The default value is: NO.\n\nSORT_BRIEF_DOCS        = NO\n\n# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the\n# (brief and detailed) documentation of class members so that constructors and\n# destructors are listed first. 
If set to NO the constructors will appear in the\n# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.\n# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief\n# member documentation.\n# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting\n# detailed member documentation.\n# The default value is: NO.\n\nSORT_MEMBERS_CTORS_1ST = NO\n\n# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy\n# of group names into alphabetical order. If set to NO the group names will\n# appear in their defined order.\n# The default value is: NO.\n\nSORT_GROUP_NAMES       = NO\n\n# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by\n# fully-qualified names, including namespaces. If set to NO, the class list will\n# be sorted only by class name, not including the namespace part.\n# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.\n# Note: This option applies only to the class list, not to the alphabetical\n# list.\n# The default value is: NO.\n\nSORT_BY_SCOPE_NAME     = NO\n\n# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper\n# type resolution of all parameters of a function it will reject a match between\n# the prototype and the implementation of a member function even if there is\n# only one candidate or it is obvious which candidate to choose by doing a\n# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still\n# accept a match between prototype and implementation in such cases.\n# The default value is: NO.\n\nSTRICT_PROTO_MATCHING  = NO\n\n# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo\n# list. This list is created by putting \\todo commands in the documentation.\n# The default value is: YES.\n\nGENERATE_TODOLIST      = YES\n\n# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test\n# list. 
This list is created by putting \\test commands in the documentation.\n# The default value is: YES.\n\nGENERATE_TESTLIST      = YES\n\n# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug\n# list. This list is created by putting \\bug commands in the documentation.\n# The default value is: YES.\n\nGENERATE_BUGLIST       = YES\n\n# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)\n# the deprecated list. This list is created by putting \\deprecated commands in\n# the documentation.\n# The default value is: YES.\n\nGENERATE_DEPRECATEDLIST= YES\n\n# The ENABLED_SECTIONS tag can be used to enable conditional documentation\n# sections, marked by \\if <section_label> ... \\endif and \\cond <section_label>\n# ... \\endcond blocks.\n\nENABLED_SECTIONS       =\n\n# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the\n# initial value of a variable or macro / define can have for it to appear in the\n# documentation. If the initializer consists of more lines than specified here\n# it will be hidden. Use a value of 0 to hide initializers completely. The\n# appearance of the value of individual variables and macros / defines can be\n# controlled using \\showinitializer or \\hideinitializer command in the\n# documentation regardless of this setting.\n# Minimum value: 0, maximum value: 10000, default value: 30.\n\nMAX_INITIALIZER_LINES  = 30\n\n# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at\n# the bottom of the documentation of classes and structs. If set to YES, the\n# list will mention the files that were used to generate the documentation.\n# The default value is: YES.\n\nSHOW_USED_FILES        = YES\n\n# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
This\n# will remove the Files entry from the Quick Index and from the Folder Tree View\n# (if specified).\n# The default value is: YES.\n\nSHOW_FILES             = YES\n\n# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces\n# page. This will remove the Namespaces entry from the Quick Index and from the\n# Folder Tree View (if specified).\n# The default value is: YES.\n\nSHOW_NAMESPACES        = YES\n\n# The FILE_VERSION_FILTER tag can be used to specify a program or script that\n# doxygen should invoke to get the current version for each file (typically from\n# the version control system). Doxygen will invoke the program by executing (via\n# popen()) the command command input-file, where command is the value of the\n# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided\n# by doxygen. Whatever the program writes to standard output is used as the file\n# version. For an example see the documentation.\n\nFILE_VERSION_FILTER    =\n\n# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed\n# by doxygen. The layout file controls the global structure of the generated\n# output files in an output format independent way. To create the layout file\n# that represents doxygen's defaults, run doxygen with the -l option. You can\n# optionally specify a file name after the option, if omitted DoxygenLayout.xml\n# will be used as the name of the layout file.\n#\n# Note that if you run doxygen from a directory containing a file called\n# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE\n# tag is left empty.\n\nLAYOUT_FILE            =\n\n# The CITE_BIB_FILES tag can be used to specify one or more bib files containing\n# the reference definitions. This must be a list of .bib files. The .bib\n# extension is automatically appended if omitted. This requires the bibtex tool\n# to be installed. 
See also https://en.wikipedia.org/wiki/BibTeX for more info.\n# For LaTeX the style of the bibliography can be controlled using\n# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the\n# search path. See also \\cite for info how to create references.\n\nCITE_BIB_FILES         =\n\n#---------------------------------------------------------------------------\n# Configuration options related to warning and progress messages\n#---------------------------------------------------------------------------\n\n# The QUIET tag can be used to turn on/off the messages that are generated to\n# standard output by doxygen. If QUIET is set to YES this implies that the\n# messages are off.\n# The default value is: NO.\n\nQUIET                  = NO\n\n# The WARNINGS tag can be used to turn on/off the warning messages that are\n# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES\n# this implies that the warnings are on.\n#\n# Tip: Turn warnings on while writing the documentation.\n# The default value is: YES.\n\nWARNINGS               = YES\n\n# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate\n# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag\n# will automatically be disabled.\n# The default value is: YES.\n\nWARN_IF_UNDOCUMENTED   = YES\n\n# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for\n# potential errors in the documentation, such as not documenting some parameters\n# in a documented function, or documenting parameters that don't exist or using\n# markup commands wrongly.\n# The default value is: YES.\n\nWARN_IF_DOC_ERROR      = YES\n\n# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that\n# are documented, but have no documentation for their parameters or return\n# value. If set to NO, doxygen will only warn about wrong or incomplete\n# parameter documentation, but not about the absence of documentation. 
If\n# EXTRACT_ALL is set to YES then this flag will automatically be disabled.\n# The default value is: NO.\n\nWARN_NO_PARAMDOC       = NO\n\n# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when\n# a warning is encountered.\n# The default value is: NO.\n\nWARN_AS_ERROR          = NO\n\n# The WARN_FORMAT tag determines the format of the warning messages that doxygen\n# can produce. The string should contain the $file, $line, and $text tags, which\n# will be replaced by the file and line number from which the warning originated\n# and the warning text. Optionally the format may contain $version, which will\n# be replaced by the version of the file (if it could be obtained via\n# FILE_VERSION_FILTER)\n# The default value is: $file:$line: $text.\n\nWARN_FORMAT            = \"$file:$line: $text\"\n\n# The WARN_LOGFILE tag can be used to specify a file to which warning and error\n# messages should be written. If left blank the output is written to standard\n# error (stderr).\n\nWARN_LOGFILE           =\n\n#---------------------------------------------------------------------------\n# Configuration options related to the input files\n#---------------------------------------------------------------------------\n\n# The INPUT tag is used to specify the files and/or directories that contain\n# documented source files. You may enter file names like myfile.cpp or\n# directories like /usr/src/myproject. Separate the files or directories with\n# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING\n# Note: If this tag is empty the current directory is searched.\n\nINPUT                  =\n\n# This tag can be used to specify the character encoding of the source files\n# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses\n# libiconv (or the iconv built into libc) for the transcoding. 
See the libiconv\n# documentation (see: https://www.gnu.org/software/libiconv/) for the list of\n# possible encodings.\n# The default value is: UTF-8.\n\nINPUT_ENCODING         = UTF-8\n\n# If the value of the INPUT tag contains directories, you can use the\n# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and\n# *.h) to filter out the source-files in the directories.\n#\n# Note that for custom extensions or not directly supported extensions you also\n# need to set EXTENSION_MAPPING for the extension otherwise the files are not\n# read by doxygen.\n#\n# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,\n# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,\n# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,\n# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),\n# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen\n# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd,\n# *.vhdl, *.ucf, *.qsf and *.ice.\n\nFILE_PATTERNS          = *.c \\\n                         *.cc \\\n                         *.cxx \\\n                         *.cpp \\\n                         *.c++ \\\n                         *.java \\\n                         *.ii \\\n                         *.ixx \\\n                         *.ipp \\\n                         *.i++ \\\n                         *.inl \\\n                         *.idl \\\n                         *.ddl \\\n                         *.odl \\\n                         *.h \\\n                         *.hh \\\n                         *.hxx \\\n                         *.hpp \\\n                         *.h++ \\\n                         *.cs \\\n                         *.d \\\n                         *.php \\\n                         *.php4 \\\n                         *.php5 \\\n                         *.phtml \\\n                         *.inc \\\n  
                       *.m \\\n                         *.markdown \\\n                         *.md \\\n                         *.mm \\\n                         *.dox \\\n                         *.doc \\\n                         *.txt \\\n                         *.py \\\n                         *.pyw \\\n                         *.f90 \\\n                         *.f95 \\\n                         *.f03 \\\n                         *.f08 \\\n                         *.f18 \\\n                         *.f \\\n                         *.for \\\n                         *.vhd \\\n                         *.vhdl \\\n                         *.ucf \\\n                         *.qsf \\\n                         *.ice\n\n# The RECURSIVE tag can be used to specify whether or not subdirectories should\n# be searched for input files as well.\n# The default value is: NO.\n\nRECURSIVE              = NO\n\n# The EXCLUDE tag can be used to specify files and/or directories that should be\n# excluded from the INPUT source files. This way you can easily exclude a\n# subdirectory from a directory tree whose root is specified with the INPUT tag.\n#\n# Note that relative paths are relative to the directory from which doxygen is\n# run.\n\nEXCLUDE                =\n\n# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or\n# directories that are symbolic links (a Unix file system feature) are excluded\n# from the input.\n# The default value is: NO.\n\nEXCLUDE_SYMLINKS       = NO\n\n# If the value of the INPUT tag contains directories, you can use the\n# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude\n# certain files from those directories.\n#\n# Note that the wildcards are matched against the file with absolute path, so to\n# exclude all test directories for example use the pattern */test/*\n\nEXCLUDE_PATTERNS       =\n\n# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names\n# (namespaces, classes, functions, etc.) 
that should be excluded from the\n# output. The symbol name can be a fully qualified name, a word, or if the\n# wildcard * is used, a substring. Examples: ANamespace, AClass,\n# AClass::ANamespace, ANamespace::*Test\n#\n# Note that the wildcards are matched against the file with absolute path, so to\n# exclude all test directories use the pattern */test/*\n\nEXCLUDE_SYMBOLS        =\n\n# The EXAMPLE_PATH tag can be used to specify one or more files or directories\n# that contain example code fragments that are included (see the \\include\n# command).\n\nEXAMPLE_PATH           =\n\n# If the value of the EXAMPLE_PATH tag contains directories, you can use the\n# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and\n# *.h) to filter out the source-files in the directories. If left blank all\n# files are included.\n\nEXAMPLE_PATTERNS       = *\n\n# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be\n# searched for input files to be used with the \\include or \\dontinclude commands\n# irrespective of the value of the RECURSIVE tag.\n# The default value is: NO.\n\nEXAMPLE_RECURSIVE      = NO\n\n# The IMAGE_PATH tag can be used to specify one or more files or directories\n# that contain images that are to be included in the documentation (see the\n# \\image command).\n\nIMAGE_PATH             =\n\n# The INPUT_FILTER tag can be used to specify a program that doxygen should\n# invoke to filter for each input file. Doxygen will invoke the filter program\n# by executing (via popen()) the command:\n#\n# <filter> <input-file>\n#\n# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the\n# name of an input file. Doxygen will then use the output that the filter\n# program writes to standard output. If FILTER_PATTERNS is specified, this tag\n# will be ignored.\n#\n# Note that the filter must not add or remove lines; it is applied before the\n# code is scanned, but not when the output code is generated. 
If lines are added\n# or removed, the anchors will not be placed correctly.\n#\n# Note that for custom extensions or not directly supported extensions you also\n# need to set EXTENSION_MAPPING for the extension otherwise the files are not\n# properly processed by doxygen.\n\nINPUT_FILTER           =\n\n# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern\n# basis. Doxygen will compare the file name with each pattern and apply the\n# filter if there is a match. The filters are a list of the form: pattern=filter\n# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how\n# filters are used. If the FILTER_PATTERNS tag is empty or if none of the\n# patterns match the file name, INPUT_FILTER is applied.\n#\n# Note that for custom extensions or not directly supported extensions you also\n# need to set EXTENSION_MAPPING for the extension otherwise the files are not\n# properly processed by doxygen.\n\nFILTER_PATTERNS        =\n\n# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using\n# INPUT_FILTER) will also be used to filter the input files that are used for\n# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).\n# The default value is: NO.\n\nFILTER_SOURCE_FILES    = NO\n\n# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file\n# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and\n# it is also possible to disable source filtering for a specific pattern using\n# *.ext= (so without naming a filter).\n# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.\n\nFILTER_SOURCE_PATTERNS =\n\n# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that\n# is part of the input, its contents will be placed on the main page\n# (index.html). 
This can be useful if you have a project on for instance GitHub\n# and want to reuse the introduction page also for the doxygen output.\n\nUSE_MDFILE_AS_MAINPAGE =\n\n#---------------------------------------------------------------------------\n# Configuration options related to source browsing\n#---------------------------------------------------------------------------\n\n# If the SOURCE_BROWSER tag is set to YES then a list of source files will be\n# generated. Documented entities will be cross-referenced with these sources.\n#\n# Note: To get rid of all source code in the generated output, make sure that\n# also VERBATIM_HEADERS is set to NO.\n# The default value is: NO.\n\nSOURCE_BROWSER         = NO\n\n# Setting the INLINE_SOURCES tag to YES will include the body of functions,\n# classes and enums directly into the documentation.\n# The default value is: NO.\n\nINLINE_SOURCES         = NO\n\n# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any\n# special comment blocks from generated source code fragments. Normal C, C++ and\n# Fortran comments will always remain visible.\n# The default value is: YES.\n\nSTRIP_CODE_COMMENTS    = YES\n\n# If the REFERENCED_BY_RELATION tag is set to YES then for each documented\n# entity all documented functions referencing it will be listed.\n# The default value is: NO.\n\nREFERENCED_BY_RELATION = NO\n\n# If the REFERENCES_RELATION tag is set to YES then for each documented function\n# all documented entities called/used by that function will be listed.\n# The default value is: NO.\n\nREFERENCES_RELATION    = NO\n\n# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set\n# to YES then the hyperlinks from functions in REFERENCES_RELATION and\n# REFERENCED_BY_RELATION lists will link to the source code. 
Otherwise they will\n# link to the documentation.\n# The default value is: YES.\n\nREFERENCES_LINK_SOURCE = YES\n\n# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the\n# source code will show a tooltip with additional information such as prototype,\n# brief description and links to the definition and documentation. Since this\n# will make the HTML file larger and loading of large files a bit slower, you\n# can opt to disable this feature.\n# The default value is: YES.\n# This tag requires that the tag SOURCE_BROWSER is set to YES.\n\nSOURCE_TOOLTIPS        = YES\n\n# If the USE_HTAGS tag is set to YES then the references to source code will\n# point to the HTML generated by the htags(1) tool instead of doxygen's built-in\n# source browser. The htags tool is part of GNU's global source tagging system\n# (see https://www.gnu.org/software/global/global.html). You will need version\n# 4.8.6 or higher.\n#\n# To use it do the following:\n# - Install the latest version of global\n# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file\n# - Make sure the INPUT points to the root of the source tree\n# - Run doxygen as normal\n#\n# Doxygen will invoke htags (and that will in turn invoke gtags), so these\n# tools must be available from the command line (i.e. in the search path).\n#\n# The result: instead of the source browser generated by doxygen, the links to\n# source code will now point to the output of htags.\n# The default value is: NO.\n# This tag requires that the tag SOURCE_BROWSER is set to YES.\n\nUSE_HTAGS              = NO\n\n# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a\n# verbatim copy of the header file for each class for which an include is\n# specified. 
Set to NO to disable this.\n# See also: Section \\class.\n# The default value is: YES.\n\nVERBATIM_HEADERS       = YES\n\n#---------------------------------------------------------------------------\n# Configuration options related to the alphabetical class index\n#---------------------------------------------------------------------------\n\n# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all\n# compounds will be generated. Enable this if the project contains a lot of\n# classes, structs, unions or interfaces.\n# The default value is: YES.\n\nALPHABETICAL_INDEX     = YES\n\n# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in\n# which the alphabetical index list will be split.\n# Minimum value: 1, maximum value: 20, default value: 5.\n# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.\n\nCOLS_IN_ALPHA_INDEX    = 5\n\n# In case all classes in a project start with a common prefix, all classes will\n# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag\n# can be used to specify a prefix (or a list of prefixes) that should be ignored\n# while generating the index headers.\n# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.\n\nIGNORE_PREFIX          =\n\n#---------------------------------------------------------------------------\n# Configuration options related to the HTML output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output\n# The default value is: YES.\n\nGENERATE_HTML          = YES\n\n# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
If a\n# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of\n# it.\n# The default directory is: html.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_OUTPUT            = html\n\n# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each\n# generated HTML page (for example: .htm, .php, .asp).\n# The default value is: .html.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_FILE_EXTENSION    = .html\n\n# The HTML_HEADER tag can be used to specify a user-defined HTML header file for\n# each generated HTML page. If the tag is left blank doxygen will generate a\n# standard header.\n#\n# To get valid HTML the header file that includes any scripts and style sheets\n# that doxygen needs, which is dependent on the configuration options used (e.g.\n# the setting GENERATE_TREEVIEW). It is highly recommended to start with a\n# default header using\n# doxygen -w html new_header.html new_footer.html new_stylesheet.css\n# YourConfigFile\n# and then modify the file new_header.html. See also section \"Doxygen usage\"\n# for information on how to generate the default header that doxygen normally\n# uses.\n# Note: The header is subject to change so you typically have to regenerate the\n# default header when upgrading to a newer version of doxygen. For a description\n# of the possible markers and block names see the documentation.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_HEADER            =\n\n# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each\n# generated HTML page. If the tag is left blank doxygen will generate a standard\n# footer. See HTML_HEADER for more information on how to generate a default\n# footer and what special commands can be used inside the footer. 
See also\n# section \"Doxygen usage\" for information on how to generate the default footer\n# that doxygen normally uses.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_FOOTER            =\n\n# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style\n# sheet that is used by each HTML page. It can be used to fine-tune the look of\n# the HTML output. If left blank doxygen will generate a default style sheet.\n# See also section \"Doxygen usage\" for information on how to generate the style\n# sheet that doxygen normally uses.\n# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as\n# it is more robust and this tag (HTML_STYLESHEET) will in the future become\n# obsolete.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_STYLESHEET        =\n\n# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined\n# cascading style sheets that are included after the standard style sheets\n# created by doxygen. Using this option one can overrule certain style aspects.\n# This is preferred over using HTML_STYLESHEET since it does not replace the\n# standard style sheet and is therefore more robust against future updates.\n# Doxygen will copy the style sheet files to the output directory.\n# Note: The order of the extra style sheet files is of importance (e.g. the last\n# style sheet in the list overrules the setting of the previous ones in the\n# list). For an example see the documentation.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_EXTRA_STYLESHEET  =\n\n# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or\n# other source files which should be copied to the HTML output directory. Note\n# that these files will be copied to the base HTML output directory. Use the\n# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these\n# files. In the HTML_STYLESHEET file, use the file name only. 
Also note that the\n# files will be copied as-is; there are no commands or markers available.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_EXTRA_FILES       =\n\n# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen\n# will adjust the colors in the style sheet and background images according to\n# this color. Hue is specified as an angle on a colorwheel, see\n# https://en.wikipedia.org/wiki/Hue for more information. For instance the value\n# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300\n# purple, and 360 is red again.\n# Minimum value: 0, maximum value: 359, default value: 220.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_COLORSTYLE_HUE    = 220\n\n# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors\n# in the HTML output. For a value of 0 the output will use grayscales only. A\n# value of 255 will produce the most vivid colors.\n# Minimum value: 0, maximum value: 255, default value: 100.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_COLORSTYLE_SAT    = 100\n\n# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the\n# luminance component of the colors in the HTML output. Values below 100\n# gradually make the output lighter, whereas values above 100 make the output\n# darker. The value divided by 100 is the actual gamma applied, so 80 represents\n# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not\n# change the gamma.\n# Minimum value: 40, maximum value: 240, default value: 80.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_COLORSTYLE_GAMMA  = 80\n\n# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML\n# page will contain the date and time when the page was generated. 
Setting this\n# to YES can help to show when doxygen was last run and thus if the\n# documentation is up to date.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_TIMESTAMP         = NO\n\n# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML\n# documentation will contain a main index with vertical navigation menus that\n# are dynamically created via JavaScript. If disabled, the navigation index will\n# consist of multiple levels of tabs that are statically embedded in every HTML\n# page. Disable this option to support browsers that do not have JavaScript,\n# like the Qt help browser.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_DYNAMIC_MENUS     = YES\n\n# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML\n# documentation will contain sections that can be hidden and shown after the\n# page has loaded.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_DYNAMIC_SECTIONS  = NO\n\n# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries\n# shown in the various tree structured indices initially; the user can expand\n# and collapse entries dynamically later on. Doxygen will expand the tree to\n# such a level that at most the specified number of entries are visible (unless\n# a fully collapsed tree already exceeds this amount). So setting the number of\n# entries to 1 will produce a fully collapsed tree by default. 
0 is a special value\n# representing an infinite number of entries and will result in a full expanded\n# tree by default.\n# Minimum value: 0, maximum value: 9999, default value: 100.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_INDEX_NUM_ENTRIES = 100\n\n# If the GENERATE_DOCSET tag is set to YES, additional index files will be\n# generated that can be used as input for Apple's Xcode 3 integrated development\n# environment (see: https://developer.apple.com/xcode/), introduced with OSX\n# 10.5 (Leopard). To create a documentation set, doxygen will generate a\n# Makefile in the HTML output directory. Running make will produce the docset in\n# that directory and running make install will install the docset in\n# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at\n# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy\n# genXcode/_index.html for more information.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nGENERATE_DOCSET        = NO\n\n# This tag determines the name of the docset feed. A documentation feed provides\n# an umbrella under which multiple documentation sets from a single provider\n# (such as a company or product suite) can be grouped.\n# The default value is: Doxygen generated docs.\n# This tag requires that the tag GENERATE_DOCSET is set to YES.\n\nDOCSET_FEEDNAME        = \"Doxygen generated docs\"\n\n# This tag specifies a string that should uniquely identify the documentation\n# set bundle. This should be a reverse domain-name style string, e.g.\n# com.mycompany.MyDocSet. Doxygen will append .docset to the name.\n# The default value is: org.doxygen.Project.\n# This tag requires that the tag GENERATE_DOCSET is set to YES.\n\nDOCSET_BUNDLE_ID       = org.doxygen.Project\n\n# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify\n# the documentation publisher. 
This should be a reverse domain-name style\n# string, e.g. com.mycompany.MyDocSet.documentation.\n# The default value is: org.doxygen.Publisher.\n# This tag requires that the tag GENERATE_DOCSET is set to YES.\n\nDOCSET_PUBLISHER_ID    = org.doxygen.Publisher\n\n# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.\n# The default value is: Publisher.\n# This tag requires that the tag GENERATE_DOCSET is set to YES.\n\nDOCSET_PUBLISHER_NAME  = Publisher\n\n# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three\n# additional HTML index files: index.hhp, index.hhc, and index.hhk. The\n# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop\n# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on\n# Windows.\n#\n# The HTML Help Workshop contains a compiler that can convert all HTML output\n# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML\n# files are now used as the Windows 98 help format, and will replace the old\n# Windows help format (.hlp) on all Windows platforms in the future. Compressed\n# HTML files also contain an index, a table of contents, and you can search for\n# words in the documentation. The HTML workshop also contains a viewer for\n# compressed HTML files.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nGENERATE_HTMLHELP      = NO\n\n# The CHM_FILE tag can be used to specify the file name of the resulting .chm\n# file. You can add a path in front of the file if the result should not be\n# written to the html output directory.\n# This tag requires that the tag GENERATE_HTMLHELP is set to YES.\n\nCHM_FILE               =\n\n# The HHC_LOCATION tag can be used to specify the location (absolute path\n# including file name) of the HTML help compiler (hhc.exe). 
If non-empty,\n# doxygen will try to run the HTML help compiler on the generated index.hhp.\n# The file has to be specified with full path.\n# This tag requires that the tag GENERATE_HTMLHELP is set to YES.\n\nHHC_LOCATION           =\n\n# The GENERATE_CHI flag controls if a separate .chi index file is generated\n# (YES) or that it should be included in the main .chm file (NO).\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTMLHELP is set to YES.\n\nGENERATE_CHI           = NO\n\n# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)\n# and project file content.\n# This tag requires that the tag GENERATE_HTMLHELP is set to YES.\n\nCHM_INDEX_ENCODING     =\n\n# The BINARY_TOC flag controls whether a binary table of contents is generated\n# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it\n# enables the Previous and Next buttons.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTMLHELP is set to YES.\n\nBINARY_TOC             = NO\n\n# The TOC_EXPAND flag can be set to YES to add extra items for group members to\n# the table of contents of the HTML help documentation and to the tree view.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTMLHELP is set to YES.\n\nTOC_EXPAND             = NO\n\n# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and\n# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that\n# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help\n# (.qch) of the generated HTML documentation.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nGENERATE_QHP           = NO\n\n# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify\n# the file name of the resulting .qch file. 
The path specified is relative to\n# the HTML output folder.\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQCH_FILE               =\n\n# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help\n# Project output. For more information please see Qt Help Project / Namespace\n# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).\n# The default value is: org.doxygen.Project.\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQHP_NAMESPACE          = org.doxygen.Project\n\n# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt\n# Help Project output. For more information please see Qt Help Project / Virtual\n# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-\n# folders).\n# The default value is: doc.\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQHP_VIRTUAL_FOLDER     = doc\n\n# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom\n# filter to add. For more information please see Qt Help Project / Custom\n# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-\n# filters).\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQHP_CUST_FILTER_NAME   =\n\n# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the\n# custom filter to add. For more information please see Qt Help Project / Custom\n# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-\n# filters).\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQHP_CUST_FILTER_ATTRS  =\n\n# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this\n# project's filter section matches. 
Qt Help Project / Filter Attributes (see:\n# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQHP_SECT_FILTER_ATTRS  =\n\n# The QHG_LOCATION tag can be used to specify the location of Qt's\n# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the\n# generated .qhp file.\n# This tag requires that the tag GENERATE_QHP is set to YES.\n\nQHG_LOCATION           =\n\n# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be\n# generated, together with the HTML files, they form an Eclipse help plugin. To\n# install this plugin and make it available under the help contents menu in\n# Eclipse, the contents of the directory containing the HTML and XML files needs\n# to be copied into the plugins directory of eclipse. The name of the directory\n# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.\n# After copying Eclipse needs to be restarted before the help appears.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nGENERATE_ECLIPSEHELP   = NO\n\n# A unique identifier for the Eclipse help plugin. When installing the plugin\n# the directory name containing the HTML and XML files should also have this\n# name. Each documentation set should have its own identifier.\n# The default value is: org.doxygen.Project.\n# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.\n\nECLIPSE_DOC_ID         = org.doxygen.Project\n\n# If you want full control over the layout of the generated HTML pages it might\n# be necessary to disable the index and replace it with your own. The\n# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top\n# of each HTML page. A value of NO enables the index and the value YES disables\n# it. 
Since the tabs in the index contain the same information as the navigation\n# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nDISABLE_INDEX          = NO\n\n# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index\n# structure should be generated to display hierarchical information. If the tag\n# value is set to YES, a side panel will be generated containing a tree-like\n# index structure (just like the one that is generated for HTML Help). For this\n# to work a browser that supports JavaScript, DHTML, CSS and frames is required\n# (i.e. any modern browser). Windows users are probably better off using the\n# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can\n# further fine-tune the look of the index. As an example, the default style\n# sheet generated by doxygen has an example that shows how to put an image at\n# the root of the tree instead of the PROJECT_NAME. 
Since the tree basically has\n# the same information as the tab index, you could consider setting\n# DISABLE_INDEX to YES when enabling this option.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nGENERATE_TREEVIEW      = NO\n\n# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that\n# doxygen will group on one line in the generated HTML documentation.\n#\n# Note that a value of 0 will completely suppress the enum values from appearing\n# in the overview section.\n# Minimum value: 0, maximum value: 20, default value: 4.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nENUM_VALUES_PER_LINE   = 4\n\n# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used\n# to set the initial width (in pixels) of the frame in which the tree is shown.\n# Minimum value: 0, maximum value: 1500, default value: 250.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nTREEVIEW_WIDTH         = 250\n\n# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to\n# external symbols imported via tag files in a separate window.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nEXT_LINKS_IN_WINDOW    = NO\n\n# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg\n# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see\n# https://inkscape.org) to generate formulas as SVG images instead of PNGs for\n# the HTML output. These images will generally look nicer at scaled resolutions.\n# Possible values are: png (the default) and svg (looks nicer but requires the\n# pdf2svg or inkscape tool).\n# The default value is: png.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nHTML_FORMULA_FORMAT    = png\n\n# Use this tag to change the font size of LaTeX formulas included as images in\n# the HTML documentation. 
When you change the font size after a successful\n# doxygen run you need to manually remove any form_*.png images from the HTML\n# output directory to force them to be regenerated.\n# Minimum value: 8, maximum value: 50, default value: 10.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nFORMULA_FONTSIZE       = 10\n\n# Use the FORMULA_TRANSPARENT tag to determine whether or not the images\n# generated for formulas are transparent PNGs. Transparent PNGs are not\n# supported properly for IE 6.0, but are supported on all modern browsers.\n#\n# Note that when changing this option you need to delete any form_*.png files in\n# the HTML output directory before the changes have effect.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nFORMULA_TRANSPARENT    = YES\n\n# The FORMULA_MACROFILE can contain LaTeX \\newcommand and \\renewcommand commands\n# to create new LaTeX commands to be used in formulas as building blocks. See\n# the section \"Including formulas\" for details.\n\nFORMULA_MACROFILE      =\n\n# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see\n# https://www.mathjax.org) which uses client side JavaScript for the rendering\n# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX\n# installed or if you want formulas to look prettier in the HTML output. When\n# enabled you may also need to install MathJax separately and configure the path\n# to it using the MATHJAX_RELPATH option.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nUSE_MATHJAX            = NO\n\n# When MathJax is enabled you can set the default output format to be used for\n# the MathJax output. See the MathJax site (see:\n# http://docs.mathjax.org/en/latest/output.html) for more details.\n# Possible values are: HTML-CSS (which is slower, but has the best\n# compatibility), NativeMML (i.e. 
MathML) and SVG.\n# The default value is: HTML-CSS.\n# This tag requires that the tag USE_MATHJAX is set to YES.\n\nMATHJAX_FORMAT         = HTML-CSS\n\n# When MathJax is enabled you need to specify the location relative to the HTML\n# output directory using the MATHJAX_RELPATH option. The destination directory\n# should contain the MathJax.js script. For instance, if the mathjax directory\n# is located at the same level as the HTML output directory, then\n# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax\n# Content Delivery Network so you can quickly see the result without installing\n# MathJax. However, it is strongly recommended to install a local copy of\n# MathJax from https://www.mathjax.org before deployment.\n# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2.\n# This tag requires that the tag USE_MATHJAX is set to YES.\n\nMATHJAX_RELPATH        = https://cdn.jsdelivr.net/npm/mathjax@2\n\n# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax\n# extension names that should be enabled during MathJax rendering. For example\n# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols\n# This tag requires that the tag USE_MATHJAX is set to YES.\n\nMATHJAX_EXTENSIONS     =\n\n# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces\n# of code that will be used on startup of the MathJax code. See the MathJax site\n# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an\n# example see the documentation.\n# This tag requires that the tag USE_MATHJAX is set to YES.\n\nMATHJAX_CODEFILE       =\n\n# When the SEARCHENGINE tag is enabled doxygen will generate a search box for\n# the HTML output. The underlying search engine uses javascript and DHTML and\n# should work on any modern browser. 
Note that when using HTML help\n# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)\n# there is already a search function so this one should typically be disabled.\n# For large projects the javascript based search engine can be slow, then\n# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to\n# search using the keyboard; to jump to the search box use <access key> + S\n# (what the <access key> is depends on the OS and browser, but it is typically\n# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down\n# key> to jump into the search results window, the results can be navigated\n# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel\n# the search. The filter options can be selected when the cursor is inside the\n# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>\n# to select a filter and <Enter> or <escape> to activate or cancel the filter\n# option.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_HTML is set to YES.\n\nSEARCHENGINE           = YES\n\n# When the SERVER_BASED_SEARCH tag is enabled the search engine will be\n# implemented using a web server instead of a web client using JavaScript. There\n# are two flavors of web server based searching depending on the EXTERNAL_SEARCH\n# setting. When disabled, doxygen will generate a PHP script for searching and\n# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing\n# and searching needs to be provided by external tools. See the section\n# \"External Indexing and Searching\" for details.\n# The default value is: NO.\n# This tag requires that the tag SEARCHENGINE is set to YES.\n\nSERVER_BASED_SEARCH    = NO\n\n# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP\n# script for searching. Instead the search results are written to an XML file\n# which needs to be processed by an external indexer. 
Doxygen will invoke an\n# external search engine pointed to by the SEARCHENGINE_URL option to obtain the\n# search results.\n#\n# Doxygen ships with an example indexer (doxyindexer) and search engine\n# (doxysearch.cgi) which are based on the open source search engine library\n# Xapian (see: https://xapian.org/).\n#\n# See the section \"External Indexing and Searching\" for details.\n# The default value is: NO.\n# This tag requires that the tag SEARCHENGINE is set to YES.\n\nEXTERNAL_SEARCH        = NO\n\n# The SEARCHENGINE_URL should point to a search engine hosted by a web server\n# which will return the search results when EXTERNAL_SEARCH is enabled.\n#\n# Doxygen ships with an example indexer (doxyindexer) and search engine\n# (doxysearch.cgi) which are based on the open source search engine library\n# Xapian (see: https://xapian.org/). See the section \"External Indexing and\n# Searching\" for details.\n# This tag requires that the tag SEARCHENGINE is set to YES.\n\nSEARCHENGINE_URL       =\n\n# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed\n# search data is written to a file for indexing by an external tool. With the\n# SEARCHDATA_FILE tag the name of this file can be specified.\n# The default file is: searchdata.xml.\n# This tag requires that the tag SEARCHENGINE is set to YES.\n\nSEARCHDATA_FILE        = searchdata.xml\n\n# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the\n# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is\n# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple\n# projects and redirect the results back to the right project.\n# This tag requires that the tag SEARCHENGINE is set to YES.\n\nEXTERNAL_SEARCH_ID     =\n\n# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen\n# projects other than the one defined by this configuration file, but that are\n# all added to the same external search index. 
Each project needs to have a\n# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps each id\n# to a relative location where the documentation can be found. The format is:\n# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...\n# This tag requires that the tag SEARCHENGINE is set to YES.\n\nEXTRA_SEARCH_MAPPINGS  =\n\n#---------------------------------------------------------------------------\n# Configuration options related to the LaTeX output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.\n# The default value is: YES.\n\nGENERATE_LATEX         = YES\n\n# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a\n# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of\n# it.\n# The default directory is: latex.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_OUTPUT           = latex\n\n# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be\n# invoked.\n#\n# Note that when not enabling USE_PDFLATEX the default is latex; when enabling\n# USE_PDFLATEX the default is pdflatex, and when in the latter case latex is\n# chosen this is overwritten by pdflatex. 
For specific output languages the\n# default can have been set differently, this depends on the implementation of\n# the output language.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_CMD_NAME         =\n\n# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate\n# index for LaTeX.\n# Note: This tag is used in the Makefile / make.bat.\n# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file\n# (.tex).\n# The default file is: makeindex.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nMAKEINDEX_CMD_NAME     = makeindex\n\n# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to\n# generate index for LaTeX. In case there is no backslash (\\) as first character\n# it will be automatically added in the LaTeX code.\n# Note: This tag is used in the generated output file (.tex).\n# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.\n# The default value is: makeindex.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_MAKEINDEX_CMD    = makeindex\n\n# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX\n# documents. This may be useful for small projects and may help to save some\n# trees in general.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nCOMPACT_LATEX          = NO\n\n# The PAPER_TYPE tag can be used to set the paper type that is used by the\n# printer.\n# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x\n# 14 inches) and executive (7.25 x 10.5 inches).\n# The default value is: a4.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nPAPER_TYPE             = a4\n\n# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names\n# that should be included in the LaTeX output. 
The package can be specified just\n# by its name or with the correct syntax as to be used with the LaTeX\n# \\usepackage command. To get the times font for instance you can specify :\n# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}\n# To use the option intlimits with the amsmath package you can specify:\n# EXTRA_PACKAGES=[intlimits]{amsmath}\n# If left blank no extra packages will be included.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nEXTRA_PACKAGES         =\n\n# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the\n# generated LaTeX document. The header should contain everything until the first\n# chapter. If it is left blank doxygen will generate a standard header. See\n# section \"Doxygen usage\" for information on how to let doxygen write the\n# default header to a separate file.\n#\n# Note: Only use a user-defined header if you know what you are doing! The\n# following commands have a special meaning inside the header: $title,\n# $datetime, $date, $doxygenversion, $projectname, $projectnumber,\n# $projectbrief, $projectlogo. Doxygen will replace $title with the empty\n# string, for the replacement values of the other commands the user is referred\n# to HTML_HEADER.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_HEADER           =\n\n# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the\n# generated LaTeX document. The footer should contain everything after the last\n# chapter. If it is left blank doxygen will generate a standard footer. 
See\n# LATEX_HEADER for more information on how to generate a default footer and what\n# special commands can be used inside the footer.\n#\n# Note: Only use a user-defined footer if you know what you are doing!\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_FOOTER           =\n\n# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined\n# LaTeX style sheets that are included after the standard style sheets created\n# by doxygen. Using this option one can overrule certain style aspects. Doxygen\n# will copy the style sheet files to the output directory.\n# Note: The order of the extra style sheet files is of importance (e.g. the last\n# style sheet in the list overrules the setting of the previous ones in the\n# list).\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_EXTRA_STYLESHEET =\n\n# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or\n# other source files which should be copied to the LATEX_OUTPUT output\n# directory. Note that the files will be copied as-is; there are no commands or\n# markers available.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_EXTRA_FILES      =\n\n# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is\n# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will\n# contain links (just like the HTML output) instead of page references. This\n# makes the output suitable for online browsing using a PDF viewer.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nPDF_HYPERLINKS         = YES\n\n# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as\n# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX\n# files. 
Set this option to YES, to get a higher quality PDF documentation.\n#\n# See also section LATEX_CMD_NAME for selecting the engine.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nUSE_PDFLATEX           = YES\n\n# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode\n# command to the generated LaTeX files. This will instruct LaTeX to keep running\n# if errors occur, instead of asking the user for help. This option is also used\n# when generating formulas in HTML.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_BATCHMODE        = NO\n\n# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the\n# index chapters (such as File Index, Compound Index, etc.) in the output.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_HIDE_INDICES     = NO\n\n# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source\n# code with syntax highlighting in the LaTeX output.\n#\n# Note that which sources are shown also depends on other settings such as\n# SOURCE_BROWSER.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_SOURCE_CODE      = NO\n\n# The LATEX_BIB_STYLE tag can be used to specify the style to use for the\n# bibliography, e.g. plainnat, or ieeetr. See\n# https://en.wikipedia.org/wiki/BibTeX and \\cite for more info.\n# The default value is: plain.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_BIB_STYLE        = plain\n\n# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated\n# page will contain the date and time when the page was generated. 
Setting this\n# to NO can help when comparing the output of multiple runs.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_TIMESTAMP        = NO\n\n# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)\n# path from which the emoji images will be read. If a relative path is entered,\n# it will be relative to the LATEX_OUTPUT directory. If left blank the\n# LATEX_OUTPUT directory will be used.\n# This tag requires that the tag GENERATE_LATEX is set to YES.\n\nLATEX_EMOJI_DIRECTORY  =\n\n#---------------------------------------------------------------------------\n# Configuration options related to the RTF output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The\n# RTF output is optimized for Word 97 and may not look too pretty with other RTF\n# readers/editors.\n# The default value is: NO.\n\nGENERATE_RTF           = NO\n\n# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a\n# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of\n# it.\n# The default directory is: rtf.\n# This tag requires that the tag GENERATE_RTF is set to YES.\n\nRTF_OUTPUT             = rtf\n\n# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF\n# documents. This may be useful for small projects and may help to save some\n# trees in general.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_RTF is set to YES.\n\nCOMPACT_RTF            = NO\n\n# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will\n# contain hyperlink fields. The RTF file will contain links (just like the HTML\n# output) instead of page references. 
This makes the output suitable for online\n# browsing using Word or some other Word compatible readers that support those\n# fields.\n#\n# Note: WordPad (write) and others do not support links.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_RTF is set to YES.\n\nRTF_HYPERLINKS         = NO\n\n# Load stylesheet definitions from file. Syntax is similar to doxygen's\n# configuration file, i.e. a series of assignments. You only have to provide\n# replacements, missing definitions are set to their default value.\n#\n# See also section \"Doxygen usage\" for information on how to generate the\n# default style sheet that doxygen normally uses.\n# This tag requires that the tag GENERATE_RTF is set to YES.\n\nRTF_STYLESHEET_FILE    =\n\n# Set optional variables used in the generation of an RTF document. Syntax is\n# similar to doxygen's configuration file. A template extensions file can be\n# generated using doxygen -e rtf extensionFile.\n# This tag requires that the tag GENERATE_RTF is set to YES.\n\nRTF_EXTENSIONS_FILE    =\n\n# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code\n# with syntax highlighting in the RTF output.\n#\n# Note that which sources are shown also depends on other settings such as\n# SOURCE_BROWSER.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_RTF is set to YES.\n\nRTF_SOURCE_CODE        = NO\n\n#---------------------------------------------------------------------------\n# Configuration options related to the man page output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for\n# classes and files.\n# The default value is: NO.\n\nGENERATE_MAN           = NO\n\n# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a\n# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of\n# it. 
A directory man3 will be created inside the directory specified by\n# MAN_OUTPUT.\n# The default directory is: man.\n# This tag requires that the tag GENERATE_MAN is set to YES.\n\nMAN_OUTPUT             = man\n\n# The MAN_EXTENSION tag determines the extension that is added to the generated\n# man pages. In case the manual section does not start with a number, the number\n# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is\n# optional.\n# The default value is: .3.\n# This tag requires that the tag GENERATE_MAN is set to YES.\n\nMAN_EXTENSION          = .3\n\n# The MAN_SUBDIR tag determines the name of the directory created within\n# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by\n# MAN_EXTENSION with the initial . removed.\n# This tag requires that the tag GENERATE_MAN is set to YES.\n\nMAN_SUBDIR             =\n\n# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it\n# will generate one additional man file for each entity documented in the real\n# man page(s). These additional files only source the real man page, but without\n# them the man command would be unable to find the correct page.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_MAN is set to YES.\n\nMAN_LINKS              = NO\n\n#---------------------------------------------------------------------------\n# Configuration options related to the XML output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that\n# captures the structure of the code including all documentation.\n# The default value is: NO.\n\nGENERATE_XML           = NO\n\n# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
If a\n# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of\n# it.\n# The default directory is: xml.\n# This tag requires that the tag GENERATE_XML is set to YES.\n\nXML_OUTPUT             = xml\n\n# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program\n# listings (including syntax highlighting and cross-referencing information) to\n# the XML output. Note that enabling this will significantly increase the size\n# of the XML output.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_XML is set to YES.\n\nXML_PROGRAMLISTING     = YES\n\n# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include\n# namespace members in file scope as well, matching the HTML output.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_XML is set to YES.\n\nXML_NS_MEMB_FILE_SCOPE = NO\n\n#---------------------------------------------------------------------------\n# Configuration options related to the DOCBOOK output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files\n# that can be used to generate PDF.\n# The default value is: NO.\n\nGENERATE_DOCBOOK       = NO\n\n# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.\n# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in\n# front of it.\n# The default directory is: docbook.\n# This tag requires that the tag GENERATE_DOCBOOK is set to YES.\n\nDOCBOOK_OUTPUT         = docbook\n\n# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the\n# program listings (including syntax highlighting and cross-referencing\n# information) to the DOCBOOK output. 
Note that enabling this will significantly\n# increase the size of the DOCBOOK output.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_DOCBOOK is set to YES.\n\nDOCBOOK_PROGRAMLISTING = NO\n\n#---------------------------------------------------------------------------\n# Configuration options for the AutoGen Definitions output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an\n# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures\n# the structure of the code including all documentation. Note that this feature\n# is still experimental and incomplete at the moment.\n# The default value is: NO.\n\nGENERATE_AUTOGEN_DEF   = NO\n\n#---------------------------------------------------------------------------\n# Configuration options related to the Perl module output\n#---------------------------------------------------------------------------\n\n# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module\n# file that captures the structure of the code including all documentation.\n#\n# Note that this feature is still experimental and incomplete at the moment.\n# The default value is: NO.\n\nGENERATE_PERLMOD       = NO\n\n# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary\n# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI\n# output from the Perl module output.\n# The default value is: NO.\n# This tag requires that the tag GENERATE_PERLMOD is set to YES.\n\nPERLMOD_LATEX          = NO\n\n# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely\n# formatted so it can be parsed by a human reader. This is useful if you want to\n# understand what is going on. 
On the other hand, if this tag is set to NO, the\n# size of the Perl module output will be much smaller and Perl will parse it\n# just the same.\n# The default value is: YES.\n# This tag requires that the tag GENERATE_PERLMOD is set to YES.\n\nPERLMOD_PRETTY         = YES\n\n# The names of the make variables in the generated doxyrules.make file are\n# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful\n# so different doxyrules.make files included by the same Makefile don't\n# overwrite each other's variables.\n# This tag requires that the tag GENERATE_PERLMOD is set to YES.\n\nPERLMOD_MAKEVAR_PREFIX =\n\n#---------------------------------------------------------------------------\n# Configuration options related to the preprocessor\n#---------------------------------------------------------------------------\n\n# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all\n# C-preprocessor directives found in the sources and include files.\n# The default value is: YES.\n\nENABLE_PREPROCESSING   = YES\n\n# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names\n# in the source code. If set to NO, only conditional compilation will be\n# performed. 
Macro expansion can be done in a controlled way by setting\n# EXPAND_ONLY_PREDEF to YES.\n# The default value is: NO.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nMACRO_EXPANSION        = NO\n\n# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then\n# the macro expansion is limited to the macros specified with the PREDEFINED and\n# EXPAND_AS_DEFINED tags.\n# The default value is: NO.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nEXPAND_ONLY_PREDEF     = NO\n\n# If the SEARCH_INCLUDES tag is set to YES, the include files in the\n# INCLUDE_PATH will be searched if a #include is found.\n# The default value is: YES.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nSEARCH_INCLUDES        = YES\n\n# The INCLUDE_PATH tag can be used to specify one or more directories that\n# contain include files that are not input files but should be processed by the\n# preprocessor.\n# This tag requires that the tag SEARCH_INCLUDES is set to YES.\n\nINCLUDE_PATH           =\n\n# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard\n# patterns (like *.h and *.hpp) to filter out the header-files in the\n# directories. If left blank, the patterns specified with FILE_PATTERNS will be\n# used.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nINCLUDE_FILE_PATTERNS  =\n\n# The PREDEFINED tag can be used to specify one or more macro names that are\n# defined before the preprocessor is started (similar to the -D option of e.g.\n# gcc). The argument of the tag is a list of macros of the form: name or\n# name=definition (no spaces). If the definition and the \"=\" are omitted, \"=1\"\n# is assumed. 
To prevent a macro definition from being undefined via #undef or\n# recursively expanded use the := operator instead of the = operator.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nPREDEFINED             =\n\n# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this\n# tag can be used to specify a list of macro names that should be expanded. The\n# macro definition that is found in the sources will be used. Use the PREDEFINED\n# tag if you want to use a different macro definition that overrules the\n# definition found in the source code.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nEXPAND_AS_DEFINED      =\n\n# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will\n# remove all references to function-like macros that are alone on a line, have\n# an all uppercase name, and do not end with a semicolon. Such function macros\n# are typically used for boiler-plate code, and will confuse the parser if not\n# removed.\n# The default value is: YES.\n# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.\n\nSKIP_FUNCTION_MACROS   = YES\n\n#---------------------------------------------------------------------------\n# Configuration options related to external references\n#---------------------------------------------------------------------------\n\n# The TAGFILES tag can be used to specify one or more tag files. For each tag\n# file the location of the external documentation should be added. The format of\n# a tag file without this location is as follows:\n# TAGFILES = file1 file2 ...\n# Adding location for the tag files is done as follows:\n# TAGFILES = file1=loc1 \"file2 = loc2\" ...\n# where loc1 and loc2 can be relative or absolute paths or URLs. See the\n# section \"Linking to external documentation\" for more information about the use\n# of tag files.\n# Note: Each tag file must have a unique name (where the name does NOT include\n# the path). 
If a tag file is not located in the directory in which doxygen is\n# run, you must also specify the path to the tagfile here.\n\nTAGFILES               =\n\n# When a file name is specified after GENERATE_TAGFILE, doxygen will create a\n# tag file that is based on the input files it reads. See section \"Linking to\n# external documentation\" for more information about the usage of tag files.\n\nGENERATE_TAGFILE       =\n\n# If the ALLEXTERNALS tag is set to YES, all external class will be listed in\n# the class index. If set to NO, only the inherited external classes will be\n# listed.\n# The default value is: NO.\n\nALLEXTERNALS           = NO\n\n# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed\n# in the modules index. If set to NO, only the current project's groups will be\n# listed.\n# The default value is: YES.\n\nEXTERNAL_GROUPS        = YES\n\n# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in\n# the related pages index. If set to NO, only the current project's pages will\n# be listed.\n# The default value is: YES.\n\nEXTERNAL_PAGES         = YES\n\n#---------------------------------------------------------------------------\n# Configuration options related to the dot tool\n#---------------------------------------------------------------------------\n\n# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram\n# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to\n# NO turns the diagrams off. Note that this option also works with HAVE_DOT\n# disabled, but it is recommended to install and use dot, since it yields more\n# powerful graphs.\n# The default value is: YES.\n\nCLASS_DIAGRAMS         = YES\n\n# You can include diagrams made with dia in doxygen documentation. Doxygen will\n# then run dia to produce the diagram and insert it in the documentation. 
The\n# DIA_PATH tag allows you to specify the directory where the dia binary resides.\n# If left empty dia is assumed to be found in the default search path.\n\nDIA_PATH               =\n\n# If set to YES the inheritance and collaboration graphs will hide inheritance\n# and usage relations if the target is undocumented or is not a class.\n# The default value is: YES.\n\nHIDE_UNDOC_RELATIONS   = YES\n\n# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is\n# available from the path. This tool is part of Graphviz (see:\n# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent\n# Bell Labs. The other options in this section have no effect if this option is\n# set to NO\n# The default value is: NO.\n\nHAVE_DOT               = NO\n\n# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed\n# to run in parallel. When set to 0 doxygen will base this on the number of\n# processors available in the system. You can set it explicitly to a value\n# larger than 0 to get control over the balance between CPU load and processing\n# speed.\n# Minimum value: 0, maximum value: 32, default value: 0.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_NUM_THREADS        = 0\n\n# When you want a differently looking font in the dot files that doxygen\n# generates you can specify the font name using DOT_FONTNAME. 
You need to make\n# sure dot is able to find the font, which can be done by putting it in a\n# standard location or by setting the DOTFONTPATH environment variable or by\n# setting DOT_FONTPATH to the directory containing the font.\n# The default value is: Helvetica.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_FONTNAME           = Helvetica\n\n# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of\n# dot graphs.\n# Minimum value: 4, maximum value: 24, default value: 10.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_FONTSIZE           = 10\n\n# By default doxygen will tell dot to use the default font as specified with\n# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set\n# the path where dot can find it using this tag.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_FONTPATH           =\n\n# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for\n# each documented class showing the direct and indirect inheritance relations.\n# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nCLASS_GRAPH            = YES\n\n# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a\n# graph for each documented class showing the direct and indirect implementation\n# dependencies (inheritance, containment, and class references variables) of the\n# class with other documented classes.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nCOLLABORATION_GRAPH    = YES\n\n# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for\n# groups, showing the direct groups dependencies.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nGROUP_GRAPHS           = YES\n\n# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and\n# 
collaboration diagrams in a style similar to the OMG's Unified Modeling\n# Language.\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nUML_LOOK               = NO\n\n# If the UML_LOOK tag is enabled, the fields and methods are shown inside the\n# class node. If there are many fields or methods and many nodes the graph may\n# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the\n# number of items for each type to make the size more manageable. Set this to 0\n# for no limit. Note that the threshold may be exceeded by 50% before the limit\n# is enforced. So when you set the threshold to 10, up to 15 fields may appear,\n# but if the number exceeds 15, the total amount of fields shown is limited to\n# 10.\n# Minimum value: 0, maximum value: 100, default value: 10.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nUML_LIMIT_NUM_FIELDS   = 10\n\n# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and\n# collaboration graphs will show the relations between templates and their\n# instances.\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nTEMPLATE_RELATIONS     = NO\n\n# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to\n# YES then doxygen will generate a graph for each documented file showing the\n# direct and indirect include dependencies of the file with other documented\n# files.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nINCLUDE_GRAPH          = YES\n\n# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are\n# set to YES then doxygen will generate a graph for each documented file showing\n# the direct and indirect include dependencies of the file with other documented\n# files.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nINCLUDED_BY_GRAPH      = YES\n\n# If the CALL_GRAPH tag is set to YES then 
doxygen will generate a call\n# dependency graph for every global function or class method.\n#\n# Note that enabling this option will significantly increase the time of a run.\n# So in most cases it will be better to enable call graphs for selected\n# functions only using the \\callgraph command. Disabling a call graph can be\n# accomplished by means of the command \\hidecallgraph.\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nCALL_GRAPH             = NO\n\n# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller\n# dependency graph for every global function or class method.\n#\n# Note that enabling this option will significantly increase the time of a run.\n# So in most cases it will be better to enable caller graphs for selected\n# functions only using the \\callergraph command. Disabling a caller graph can be\n# accomplished by means of the command \\hidecallergraph.\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nCALLER_GRAPH           = NO\n\n# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical\n# hierarchy of all classes instead of a textual one.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nGRAPHICAL_HIERARCHY    = YES\n\n# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the\n# dependencies a directory has on other directories in a graphical way. The\n# dependency relations are determined by the #include relations between the\n# files in the directories.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDIRECTORY_GRAPH        = YES\n\n# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images\n# generated by dot. 
For an explanation of the image formats see the section\n# output formats in the documentation of the dot tool (Graphviz (see:\n# http://www.graphviz.org/)).\n# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order\n# to make the SVG files visible in IE 9+ (other browsers do not have this\n# requirement).\n# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,\n# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and\n# png:gdiplus:gdiplus.\n# The default value is: png.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_IMAGE_FORMAT       = png\n\n# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to\n# enable generation of interactive SVG images that allow zooming and panning.\n#\n# Note that this requires a modern browser other than Internet Explorer. Tested\n# and working are Firefox, Chrome, Safari, and Opera.\n# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make\n# the SVG files visible. Older versions of IE do not have SVG support.\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nINTERACTIVE_SVG        = NO\n\n# The DOT_PATH tag can be used to specify the path where the dot tool can be\n# found. 
If left blank, it is assumed the dot tool can be found in the path.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_PATH               =\n\n# The DOTFILE_DIRS tag can be used to specify one or more directories that\n# contain dot files that are included in the documentation (see the \\dotfile\n# command).\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOTFILE_DIRS           =\n\n# The MSCFILE_DIRS tag can be used to specify one or more directories that\n# contain msc files that are included in the documentation (see the \\mscfile\n# command).\n\nMSCFILE_DIRS           =\n\n# The DIAFILE_DIRS tag can be used to specify one or more directories that\n# contain dia files that are included in the documentation (see the \\diafile\n# command).\n\nDIAFILE_DIRS           =\n\n# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the\n# path where java can find the plantuml.jar file. If left blank, it is assumed\n# PlantUML is not used or called during a preprocessing step. Doxygen will\n# generate a warning when it encounters a \\startuml command in this case and\n# will not generate output for the diagram.\n\nPLANTUML_JAR_PATH      =\n\n# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a\n# configuration file for plantuml.\n\nPLANTUML_CFG_FILE      =\n\n# When using plantuml, the specified paths are searched for files specified by\n# the !include statement in a plantuml block.\n\nPLANTUML_INCLUDE_PATH  =\n\n# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes\n# that will be shown in the graph. If the number of nodes in a graph becomes\n# larger than this value, doxygen will truncate the graph, which is visualized\n# by representing a node as a red box. Note that doxygen if the number of direct\n# children of the root node in a graph is already larger than\n# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. 
Also note that\n# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.\n# Minimum value: 0, maximum value: 10000, default value: 50.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_GRAPH_MAX_NODES    = 50\n\n# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs\n# generated by dot. A depth value of 3 means that only nodes reachable from the\n# root by following a path via at most 3 edges will be shown. Nodes that lay\n# further from the root node will be omitted. Note that setting this option to 1\n# or 2 may greatly reduce the computation time needed for large code bases. Also\n# note that the size of a graph can be further restricted by\n# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.\n# Minimum value: 0, maximum value: 1000, default value: 0.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nMAX_DOT_GRAPH_DEPTH    = 0\n\n# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent\n# background. This is disabled by default, because dot on Windows does not seem\n# to support this out of the box.\n#\n# Warning: Depending on the platform used, enabling this option may lead to\n# badly anti-aliased labels on the edges of a graph (i.e. they become hard to\n# read).\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_TRANSPARENT        = NO\n\n# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output\n# files in one run (i.e. multiple -o and -T options on the command line). 
This\n# makes dot run faster, but since only newer versions of dot (>1.8.10) support\n# this, this feature is disabled by default.\n# The default value is: NO.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_MULTI_TARGETS      = NO\n\n# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page\n# explaining the meaning of the various boxes and arrows in the dot generated\n# graphs.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nGENERATE_LEGEND        = YES\n\n# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot\n# files that are used to generate the various graphs.\n# The default value is: YES.\n# This tag requires that the tag HAVE_DOT is set to YES.\n\nDOT_CLEANUP            = YES\n"
  },
  {
    "path": "docs/Doxyfile.in",
    "content": "#...\nINPUT = \"@DOXYGEN_INPUT_DIR@\"\n#...\nOUTPUT_DIRECTORY = \"@DOXYGEN_OUTPUT_DIR@\"\n#...\nGENERATE_XML = YES\n#...\n"
  },
  {
    "path": "docs/README.md",
    "content": "# Building the Docs #\n\n1. Clone main repository: `git clone https://github.com/marius-team/marius.git`.\n\n2. Clone `gh-pages` branch into seperate directory `html`: `git clone -b gh-pages https://github.com/marius-team/marius.git html`\n\n3. Enter main repo: `cd marius`. Create build directory and run CMake with `BUILD_DOCS` enabled: `mkdir build; cd build; cmake ../ -DBUILD_DOCS=1`.\n\n2. Build the documentation with Sphinx `make Sphinx -j`\n\n3. Output html files will be generated in our `html` directory. Push changes to `gh-pages` for site to update at https://marius-project.org/marius/."
  },
  {
    "path": "docs/_static/css/marius_theme.css",
    "content": "@import url(\"theme.css\");\n\n.wy-nav-content {\n    max-width: 50vw;\n}\n\n:root {\n    /*--marius_purple: #180A5B;*/\n    /*--marius_offwhite: #FFFDF3;*/\n    --marius_offwhite: #FFFDFB;\n    --marius_lightblue: #7175a2;\n    --marius_lighterblue: #999ecd;\n}\n\nh1, h2, h3, h4, h5, h6 {\n    font-family: Avenir;\n    font-weight: 800;\n}\n\n.rst-content table.docutils caption, .rst-content table.field-list caption, .wy-table caption {\n    font-style: normal;\n    font-weight: 400;\n    color: black;\n}\n\n.wy-body-for-nav {\n    font-family: FreightSans, Helvetica Neue, Helvetica, Arial, sans-serif;\n    font-weight: 400;\n    color: black;\n}\n\n.rst-footer-buttons {\n    display: none;\n}\n\n.wy-side-nav-search, .wy-nav-top {\n    /*background: var(--marius_purple);*/\n    background: var(--marius_lightblue);\n}\n\n.wy-nav-side {\n    /* background: var(--marius_offwhite); */\n    background: var(--marius_lighterblue);\n    border-right: 1px solid #e1e4e5;\n    color: black;\n}\n\n.wy-menu-vertical a {\n    color: black;\n}\n\n.wy-menu-vertical li.current a {\n    border-right: None;\n    font-weight: 400;\n    color: var(--marius_offwhite);\n}\n\n.wy-menu-vertical li.current>a, .wy-menu-vertical li.on a {\n    /* background: var(--marius_offwhite); */\n    background: var(--marius_lighterblue);\n}\n\n.wy-nav-content {\n    background: var(--marius_offwhite);\n}\n\n.wy-nav-content-wrap {\n    background: var(--marius_offwhite);\n}\n\n.wy-table-responsive table td, .wy-table-responsive table th {\n    white-space: normal;\n}\n\n/*hide nested bullet points from toclist*/\n.rst-content .section ul li, .rst-content .toctree-wrapper ul li, .rst-content section ul li, .wy-plain-list-disc li, article ul li {\n    list-style: none;\n    margin: 0;\n    padding: 0;\n    font-weight: normal;\n}\n\n/*hide nested bullet points from toclist*/\n.rst-content .section ul li li, .rst-content .toctree-wrapper ul li li, .rst-content section ul li li, 
.wy-plain-list-disc li li, article ul li li {\n    list-style: none;\n    margin-left: 24px;\n    padding: 0;\n    font-weight: normal;\n}\n\n.wy-menu-vertical a:hover {\n    background: none;\n}\n\n.wy-menu-vertical li.current {\n    background: none;\n\n}\n\n.wy-menu-vertical li.current:hover {\n    background: none;\n}\n\n.wy-menu-vertical li.toctree-l2.current>a, .wy-menu-vertical li.toctree-l2.current li.toctree-l3>a {\n    background: none;\n}\n\n.wy-menu-vertical li.toctree-l2.current>a:hover, .wy-menu-vertical li.toctree-l2.current li.toctree-l3>a:hover {\n    background: none;\n}\n\n.wy-menu-vertical li.current a:hover {\n    background: none;\n}\n\n.wy-menu-vertical li.toctree-l3.current>a, .wy-menu-vertical li.toctree-l3.current li.toctree-l4>a {\n    background: none;\n}\n\n.wy-menu-vertical li.toctree-l3.current>a:hover, .wy-menu-vertical li.toctree-l3.current li.toctree-l4>a:hover {\n    background: none;\n}\n\n\n.wy-menu-vertical li.toctree-l1.current>a {\n    border-top: none;\n    border-bottom: none;\n}\n\n.wy-menu-vertical li.toctree-l1.current>a:hover {\n    background: none;\n}\n\n.wy-menu-vertical li.current>a {\n    border-top: none;\n    border-bottom: none;\n}\n\nhtml.writer-html4 .rst-content dl:not(.docutils) .property, html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .property {\n    display: inline;\n    padding-right: 8px;\n    max-width: 100%;\n}\n\ndiv.leftside {\n    width: 60%;\n    padding: 0px 10px 0px 0px;\n    float: left;\n}\n\ndiv.rightside {\n    margin-left: 10%;\n    /* float: right; */\n}"
  },
  {
    "path": "docs/_templates/layout.html",
    "content": "{% extends \"!layout.html\" %}\n  {% block menu %} {{ super() }}\n\n  <!-- <style>\n    a.gh-font {\n        font-weight: 800;\n        color: rgb(160, 45, 45);\n    }\n    </style> -->\n  <p>\n  <!-- <svg xmlns=\"http://www.w3.org/2000/svg\" viewBox=\"0 0 1000 1000\">\n    <path\n      d=\"M439.55 236.05L244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z\" />\n  </svg> -->\n  <a href=\"https://github.com/marius-team/marius\">GitHub</a>\n</p>\n{% endblock %}"
  },
  {
    "path": "docs/conf.py",
    "content": "# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common options. For a full\n# list see the documentation:\n# https://www.sphinx-doc.org/en/master/usage/configuration.html\n\n# -- Path setup --------------------------------------------------------------\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n#\n# import os\n# import sys\n# sys.path.insert(0, os.path.abspath('.'))\n\n\n# -- Project information -----------------------------------------------------\n\nproject = \"Marius\"\n# copyright = '2020, Jason Mohoney'\nauthor = \"Jason Mohoney\"\n\n# The full version, including alpha/beta/rc tags\nrelease = \"0.0.2\"\n\n\n# -- General configuration ---------------------------------------------------\n\n# Add any Sphinx extension module names here, as strings. They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = [\"breathe\", \"sphinx.ext.autodoc\", \"sphinx_autodoc_typehints\"]\n\n# Breathe Configuration\nbreathe_default_project = \"Marius\"\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = [\"_templates\"]\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\n# This pattern also affects html_static_path and html_extra_path.\nexclude_patterns = [\"_build\", \"Thumbs.db\", \".DS_Store\"]\n\nautodoc_typehints = \"description\"\nautodoc_member_order = \"bysource\"\n# -- Options for HTML output -------------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  
See the documentation for\n# a list of builtin themes.\n#\nhtml_theme = \"sphinx_rtd_theme\"\n\nhtml_style = \"css/marius_theme.css\"\n\nhtml_logo = \"marius_logo_scaled.png\"\n\nhtml_favicon = \"favicon.ico\"\n\nhtml_theme_options = {\"logo_only\": True}\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\nhtml_static_path = [\"_static\"]\n"
  },
  {
    "path": "docs/config_interface/configuration.rst",
    "content": "\nOverview\n======================\n\nThe configuration interface allows for high-performance training and evaluation of models without need for writing code.\n\nConfiguration files are defined in YAML format and are grouped up into four sections:\n\n- Model: Defines the architecture of the model, neighbor sampling configuration, loss, and optimizer(s)\n- Storage: Specifies the input dataset and how to store the graph, features, and embeddings.\n- Training: Sets options for the training procedure and hyperparameters. E.g. batch size, negative sampling.\n- Evaluation: Sets options for the evaluation procedure (if any). The options here are similar to those in the training section.\n\n\nLink Prediction Example\n-----------------------\n\nIn this example, we show how to define a configuration file for training a :doc:`3-layer GraphSage GNN <../examples/config/lp_fb15k237>` for link prediction on :doc:`fb15k_237 <../examples/config/lp_fb15k237>`.\n\nThis example assumes that marius has been installed with :doc:`pip <../build>` the dataset has been preprocessed with the following command:\n\n``marius_preprocess --dataset fb15k_237 --output_dir /home/data/datasets/fb15k_237/``\n\n\n1. Define the model:\n^^^^^^^^^^^^^^^^^^^^\n\n+-------------------------------------------+-----------------------------------------------+\n|                                           |                                               |\n|.. code-block:: yaml                       |.. 
image:: ../assets/configuration_lp.png      |\n|                                           |  :width: 700                                  |\n|   model:                                  |                                               |\n|     encoder:                              |                                               |\n|       train_neighbor_sampling:            |                                               |\n|         - type: ALL                       |                                               |\n|         - type: ALL                       |                                               |\n|         - type: ALL                       |                                               |\n|       layers:                             |                                               |\n|         - - type: EMBEDDING               |                                               |\n|             output_dim: 50                |                                               |\n|             bias: true                    |                                               |\n|                                           |                                               |\n|           - type: FEATURE                 |                                               |\n|             output_dim: 50                |                                               |\n|             bias: true                    |                                               |\n|                                           |                                               |\n|         - - type: REDUCTION               |                                               |\n|             input_dim: 100                |                                               |\n|             output_dim: 50                |                                               |\n|             bias: true                    |                                               |\n|             options:                      |        
                                       |\n|               type: LINEAR                |                                               |\n|                                           |                                               |\n|         - - type: GNN                     |                                               |\n|             options:                      |                                               |\n|             type: GRAPH_SAGE              |                                               |\n|             aggregator: MEAN              |                                               |\n|             input_dim: 50                 |                                               |\n|             output_dim: 50                |                                               |\n|             bias: true                    |                                               |\n|             init:                         |                                               |\n|               type: GLOROT_NORMAL         |                                               |\n|                                           |                                               |\n|         - - type: GNN                     |                                               |\n|             options:                      |                                               |\n|             type: GRAPH_SAGE              |                                               |\n|             aggregator: MEAN              |                                               |\n|             input_dim: 50                 |                                               |\n|             output_dim: 50                |                                               |\n|             bias: true                    |                                               |\n|             init:                         |                                               |\n|               type: GLOROT_NORMAL         |             
                                  |\n|                                           |                                               |\n|         - - type: GNN                     |                                               |\n|             options:                      |                                               |\n|             type: GRAPH_SAGE              |                                               |\n|             aggregator: MEAN              |                                               |\n|             input_dim: 50                 |                                               |\n|             output_dim: 50                |                                               |\n|             bias: true                    |                                               |\n|             init:                         |                                               |\n|               type: GLOROT_NORMAL         |                                               |\n|                                           |                                               |\n|     decoder:                              |                                               |\n|       type: DISTMULT                      |                                               |\n|     loss:                                 |                                               |\n|       type: SOFTMAX_CE                    |                                               |\n|       options:                            |                                               |\n|         reduction: SUM                    |                                               |\n|     dense_optimizer:                      |                                               |\n|       type: ADAM                          |                                               |\n|       options:                            |                                               |\n|         learning_rate: 0.01               |                  
                             |\n|     sparse_optimizer:                     |                                               |\n|       type: ADAGRAD                       |                                               |\n|       options:                            |                                               |\n|         learning_rate: 0.1                |                                               |\n|                                           |                                               |\n+-------------------------------------------+-----------------------------------------------+\n\nThe above model configuration has 5 stages in the encoder section, each stage separated by a `--`. The first stage has 2 layers, one embedding layer with output \ndimension 50 and another feature layer with output dimension of 50. The reduction layer in stage 2 takes as input the combined vector of dimension \n100 and outputs a 50 dimensional vector. It is followed by 3 stages of GNN layers. The output from the encoder is fed to the decoder of type DISTMULT. \nThe loss function being used is SoftmaxCrossEntropy with sum as the reduction method. The dense optimizer is for all model parameters except the node embeddings.\nNode embeddings are optimized by the sparse optimizer. \n\n2. Set storage and dataset:\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. code-block:: yaml\n\n   storage:\n     device_type: cpu\n     dataset:\n       dataset_dir: /home/data/datasets/fb15k_237/\n     edges:\n       type: DEVICE_MEMORY\n       options:\n         dtype: int\n     embeddings:\n       type: DEVICE_MEMORY\n       options:\n         dtype: float\n\nThe storage configuration provides information on the location and statistics of the pre-processed dataset. It also specifies where \nto store the embeddings and edges during training. The `device_type` is set to `cpu` here, `cuda` mode can be used for gpu training.\n`DEVICE_MEMORY` in this case states that the embeddings need to be stored in cpu memory.\n\n3. 
Configure training and evaluation\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. code-block:: yaml\n\n   training:\n     batch_size: 1000\n     negative_sampling:\n       num_chunks: 10\n       negatives_per_positive: 10\n       degree_fraction: 0\n       filtered: false\n     num_epochs: 10\n     pipeline:\n       sync: true\n     epochs_per_shuffle: 1\n     logs_per_epoch: 10\n   evaluation:\n     batch_size: 1000\n     negative_sampling:\n       filtered: true\n     epochs_per_eval: 1\n     pipeline:\n       sync: true\n\nThe training configuration specifies number of data samples in each batch and the total number of epochs to train the model for. \nMarius groups edges into chunks and reuses negative samples within the chunk. `num_chunks`*`negatives_per_positive` negative edges are \nsampled for each positive edge. Marius also uses pipelining to overlap data movement with training which introduces bounded staleness \nin the system. We can explicitly set sync to true if we want every minibatch to see the latest embeddings. \n\nNode Classification Example\n---------------------------\n\nIn this example, we show how to define a configuration file for training a :doc:`3-layer GAT GNN <../examples/config/nc_ogbn_arxiv>` for node classification on :doc:`ogbn_arxiv <../examples/config/nc_ogbn_arxiv>`.\n\nThis example assumes that marius has been installed with :doc:`pip <../build>` the dataset has been preprocessed with the following command:\n\n``marius_preprocess --dataset ogbn_arxiv --output_dir /home/data/datasets/ogbn_arxiv/``\n\n\n1. Define the model:\n^^^^^^^^^^^^^^^^^^^^\n\n+-------------------------------------------+-----------------------------------------------+\n|                                           |                                               |\n|.. code-block:: yaml                       |.. 
image:: ../assets/configuration_nc.png      |\n|                                           |                                               |\n|   model:                                  |                                               |\n|     learning_task: NODE_CLASSIFICATION    |                                               |\n|     encoder:                              |                                               |\n|       train_neighbor_sampling:            |                                               |\n|         - type: ALL                       |                                               |\n|       layers:                             |                                               |\n|         - - type: FEATURE                 |                                               |\n|             output_dim: 128               |                                               |\n|             bias: false                   |                                               |\n|             init:                         |                                               |\n|               type: GLOROT_NORMAL         |                                               |\n|         - - type: GNN                     |                                               |\n|             options:                      |                                               |\n|               type: GRAPH_SAGE            |                                               |\n|               aggregator: MEAN            |                                               |\n|             input_dim: 128                |                                               |\n|             output_dim: 40                |                                               |\n|             bias: true                    |                                               |\n|             init:                         |                                               |\n|               type: GLOROT_NORMAL         |        
                                       |\n|     decoder:                              |                                               |\n|       type: NODE                          |                                               |\n|     loss:                                 |                                               |\n|       type: CROSS_ENTROPY                 |                                               |\n|       options:                            |                                               |\n|         reduction: SUM                    |                                               |\n|     dense_optimizer:                      |                                               |\n|       type: ADAM                          |                                               |\n|       options:                            |                                               |\n|         learning_rate: 0.01               |                                               |\n|     sparse_optimizer:                     |                                               |\n|       type: ADAGRAD                       |                                               |\n|       options:                            |                                               |\n|         learning_rate: 0.1                |                                               |\n|                                           |                                               |\n+-------------------------------------------+-----------------------------------------------+\n\nThe above node classification example has 2 layers in the encoder section, one feature layer and another GNN layer. The number of\ntraining/evaluation sampling layers should be equal to the number of GNN stages in the model. The model has a decoder of type node\nclassification. The loss function being used is Cross Entropy with sum as the reduction method.\n\n2. Set storage and dataset:\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. 
code-block:: yaml\n\n   storage:\n     device_type: cuda\n     dataset:\n       dataset_dir: /home/data/datasets/ogbn_arxiv/\n     edges:\n       type: DEVICE_MEMORY\n     nodes:\n       type: DEVICE_MEMORY\n     features:\n       type: DEVICE_MEMORY\n     embeddings:\n       type: DEVICE_MEMORY\n       options:\n         dtype: float\n     prefetch: true\n     shuffle_input: true\n     full_graph_evaluation: true\n\nThe storage configuration here is very similar to the one shown above in Link Prediction.\n\n3. Configure training and evaluation\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. code-block:: yaml\n\n   training:\n     batch_size: 1000\n     num_epochs: 5\n     pipeline:\n       sync: true\n     epochs_per_shuffle: 1\n     logs_per_epoch: 1\n   evaluation:\n     batch_size: 1000\n     pipeline:\n       sync: true\n     epochs_per_eval: 1\n\nThe above training configuration has specifications for a training batch size of 1000 and total epochs of 5. The `logs_per_epoch` attribute \nsets how often to report progress during training. `epochs_per_eval` sets how often to evaluate the model. \n\nDefining Encoder Architectures\n------------------------------\n\nThe interface enables users to define complex model architectures. The layers field can be seen as a double-list, a list of stages wherein \neach stage is again a list of layers. We need to ensure that the total output dimension of a stage is equal to the net input dimension of \nthe next stage. We need to ensure that the following conditions are met while stacking layers of a model,\n\n#. Embedding/Feature layers have only output dimension. The `input_dim` is set to -1 by default\n#. A Reduction layer can have inputs from multiple layers in the previous stage and has a single output\n#. 
The number of training/evaluation sampling layers should be equal to the number of GNN stages in the model\n\nAdvanced Configuration\n----------------------\n\nPipeline\n^^^^^^^^\nMarius uses a pipelined training architecture that can interleave data access, transfer, and computation to achieve high utilization. This \nintroduces the possibility of a few mini-batches using stale parameters during training. If `sync` is set to true, the training becomes \nsynchronous and there is no staleness. Below is a sample configuration where the training is async and there is bounded staleness in the system.\n\n\n.. code-block:: yaml\n\n   pipeline:\n     sync: false\n     staleness_bound: 16\n     batch_host_queue_size: 4\n     batch_device_queue_size: 4\n     gradients_device_queue_size: 4\n     gradients_host_queue_size: 4\n     batch_loader_threads: 4\n     batch_transfer_threads: 2\n     compute_threads: 1\n     gradient_transfer_threads: 2\n     gradient_update_threads: 4\n\n\n.. image:: ../assets/marius_arch.png\n  :width: 700\n  :align: center\n\n\nMarius follows a 5-staged pipeline architecture, 4 of which are responsible for data movement and the other is for model computation \nand in-GPU parameter updates. The `pipeline` field has options for setting thread counts for each of these stages. `staleness_bound` \nsets the maximum number of minibatches that can be present in the pipeline at any time. It implies that after a set of node embedding \nupdates, at most 16 mini-batches use stale node embeddings. \n\nPartition Buffer\n^^^^^^^^^^^^^^^^\nOne of the storage backends supported for node embeddings is the `PARTITION_BUFFER` mode, where the nodes are bucketed into p partitions \nand every edge falls into one of the p^2 buckets. When pre-processed in the partitioned mode, the edges are ordered in a way that reduces\nthe number of node-embedding bucket swaps from the buffer. 
\n\nThe following command pre-processes the fb15k_237 dataset into 10 partitions as required by Marius for training in `PARTITION_BUFFER` mode.\n\n``marius_preprocess --dataset fb15k_237 --num_partitions 10 --output_dir /home/data/datasets/fb15k_237_partitioned/``\n\nNow, we can set the storage backend for node embeddings to `PARTITION_BUFFER` mode\n\n\n.. code-block:: yaml\n\n   embeddings:\n     type: PARTITION_BUFFER\n     options:\n       dtype: float\n       num_partitions: 10\n       buffer_capacity: 5\n       prefetching: true\n\n\n`num_partitions` should hold the same value that was earlier supplied to `marius_preprocess`. `buffer_capacity` states the maximum number of \nnode embedding buckets that can be present in the memory at any given time. Setting `prefetching` enables the system to prefetch partitions \nasynchronously leading to reduction in IO wait times and additional memory overheads. "
  },
  {
    "path": "docs/config_interface/full_schema.rst",
    "content": ".. _config_schema:\n\nConfiguration Schema\n=========================\n\n.. list-table:: MariusConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - model\n     - ModelConfig\n     - Defines model architecture, learning task, optimizers and loss function.\n     - Yes\n   * - storage\n     - StorageConfig\n     - Defines the input graph and how to store the graph (edges, features) and learned model (embeddings).\n     - Yes\n   * - training\n     - TrainingConfig\n     - Hyperparameters for training.\n     - Training\n   * - evaluation\n     - EvaluationConfig\n     - Hyperparameters for evaluation.\n     - Evaluation\n\nBelow is a sample end-to-end configuration file for link prediction on the `fb15k_237` dataset. The model consists of an embedding layer\nin the encoder phase which is directly fed to the `DISTMULT` decoder. Both embeddings and edges are stored in `cpu` memory. \n\n.. code-block:: yaml \n\n   model:\n     learning_task: LINK_PREDICTION\n     encoder:\n       layers:\n         - - type: EMBEDDING\n             output_dim: 50\n             bias: true\n             init:\n               type: GLOROT_NORMAL\n     decoder:\n       type: DISTMULT\n     loss:\n       type: SOFTMAX_CE\n       options:\n         reduction: SUM\n     dense_optimizer:\n       type: ADAM\n       options:\n         learning_rate: 0.01\n     sparse_optimizer:\n       type: ADAGRAD\n       options:\n         learning_rate: 0.1\n   storage:\n     full_graph_evaluation: true\n     device_type: cpu\n     dataset:\n       dataset_dir: /home/data/datasets/fb15k_237/\n     edges:\n       type: DEVICE_MEMORY\n       options:\n         dtype: int\n     embeddings:\n       type: DEVICE_MEMORY\n       options:\n         dtype: float\n   training:\n     batch_size: 1000\n     negative_sampling:\n       num_chunks: 10\n       negatives_per_positive: 10\n       degree_fraction: 0\n       filtered: false\n     
num_epochs: 10\n     pipeline:\n       sync: true\n     epochs_per_shuffle: 1\n     logs_per_epoch: 10\n     resume_training: false\n   evaluation:\n     batch_size: 1000\n     negative_sampling:\n       filtered: true\n     epochs_per_eval: 1\n     pipeline:\n       sync: true\n\n\nModel Configuration\n--------------------\n\n\n.. list-table:: ModelConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - random_seed\n     - Int\n     - Random seed used to initialize, train, and evaluate the model. If not given, a seed will be generated.\n     - No\n   * - learning_task\n     - String\n     - Learning task for which the model is used. Valid values are [\"LINK_PREDICTION\", \"NODE_CLASSIFICATION\"] (case insensitive). \"LP\" and \"NC\" can be used for shorthand.\n     - Yes\n   * - encoder\n     - :ref:`EncoderConfig<encoder-conf-section>`\n     - Defines the architecture of the encoder and configuration of neighbor samplers.\n     - Yes\n   * - decoder\n     - :ref:`DecoderConfig<decoder-conf-section>`\n     - Denotes the decoder to apply to the output of the encoder. The decoder is learning task specific.\n     - Yes\n   * - loss\n     - :ref:`LossConfig<loss-conf-section>`\n     - Loss function to apply over the output of the decoder.\n     - Required for training\n   * - dense_optimizer\n     - :ref:`OptimizerConfig<optimizer-conf-section>`\n     - Optimizer to use for dense model parameters, i.e. all parameters besides the node embeddings, which are handled by the sparse_optimizer.\n     - Required for training\n   * - sparse_optimizer\n     - :ref:`OptimizerConfig<optimizer-conf-section>`\n     - Optimizer to use for the node embedding parameters. Currently only ADAGRAD is supported.\n     - No\n\nBelow is a full view of the `model` attribute and the corresponding parameters that can be set in the model configuration. 
It consists\nof an embedding layer in the encoder phase and a `DISTMULT` decoder.\n\n.. code-block:: yaml\n\n   model:\n     random_seed: 456356765463\n     learning_task: LINK_PREDICTION\n     encoder:\n       layers:\n         - - type: EMBEDDING\n             output_dim: 50\n             bias: true\n             init:\n               type: GLOROT_NORMAL\n             optimizer:\n               type: DEFAULT\n               options:\n                 learning_rate: 0.1\n     decoder:\n       type: DISTMULT\n       options:\n         inverse_edges: true\n         use_relation_features: false\n         edge_decoder_method: CORRUPT_NODE\n       optimizer:\n         type: ADAGRAD\n         options:\n           learning_rate: 0.1\n     loss:\n       type: SOFTMAX_CE\n       options:\n         reduction: SUM\n     dense_optimizer:\n       type: ADAM\n       options:\n         learning_rate: 0.01\n     sparse_optimizer:\n       type: ADAGRAD\n       options:\n         learning_rate: 0.1\n\n.. _encoder-conf-section:\n\nEncoder Configuration\n^^^^^^^^^^^^^^^^^^^^^\n\n.. list-table:: EncoderConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - use_incoming_nbrs\n     - Boolean\n     - Whether to use incoming neighbors for the encoder. One of use_incoming_nbrs or use_outgoing_nbrs must be set to true.\n     - No\n   * - use_outgoing_nbrs\n     - Boolean\n     - Whether to use outgoing neighbors for the encoder. One of use_incoming_nbrs or use_outgoing_nbrs must be set to true.\n     - No\n   * - layers\n     - List[List[:ref:`LayerConfig<layer-conf-section>`]]\n     - Defines architecture of the encoder. 
Layers of the encoder are grouped into stages, where the layers within a stage are executed in parallel and the output of stage is the input to the successive stage.\n     - Yes\n   * - train_neighbor_sampling\n     - List[:ref:`NeighborSamplingConfig<neighbor-sampling-conf-section>`]\n     - Sets the neighbor sampling configuration for each GNN layer for training (and evaluation if eval_neighbor_sampling is not set). Defined as a list of neighbor sampling configurations, where the size of the list must match the number of GNN layers in the encoder.\n     - Only for GNNs\n   * - eval_neighbor_sampling\n     - List[:ref:`NeighborSamplingConfig<neighbor-sampling-conf-section>`]\n     - Sets the neighbor sampling configuration for each GNN layer for evaluation. Defined as a list of neighbor sampling configurations, where the size of the list must match the number of GNN layers in the encoder. If this field is not set then the sampling configuration used for training will be used for evaluation.\n     - No\n\nThe below example depicts a configuration where there is one embedding layer, followed by three GNN layers.  \n\n.. 
code-block:: yaml\n\n   encoder:\n     train_neighbor_sampling:\n       - type: ALL\n       - type: ALL\n       - type: ALL\n     eval_neighbor_sampling:\n       - type: ALL\n       - type: ALL\n       - type: ALL\n     layers:\n       - - type: EMBEDDING\n           output_dim: 10\n           bias: true\n           init:\n             type: GLOROT_NORMAL\n\n       - - type: GNN\n           options:\n             type: GAT\n           input_dim: 10\n           output_dim: 10\n           bias: true\n           init:\n             type: GLOROT_NORMAL\n\n       - - type: GNN\n           options:\n             type: GAT\n           input_dim: 10\n           output_dim: 10\n           bias: true\n           init:\n             type: GLOROT_NORMAL\n\n       - - type: GNN\n           options:\n             type: GAT\n           input_dim: 10\n           output_dim: 10\n           bias: true\n           init:\n             type: GLOROT_NORMAL\n\n\n.. _neighbor-sampling-conf-section:\n\n.. list-table:: NeighborSamplingConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - type\n     - String\n     - Denotes the type of the neighbor sampling layer. Options: [\"ALL\", \"UNIFORM\", \"DROPOUT\"].\n     - Yes\n   * - options\n     - NeighborSamplingOptions\n     - Specific options depending on the type of sampling layer.\n     - No\n\n\n.. list-table:: UniformSamplingOptions[NeighborSamplingOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - max_neighbors\n     - Int\n     - Number of neighbors to sample in a given uniform sampling layer.\n     - Yes\n\nThe below configuration might work for a graph configuration where there are 2 GNN layers. The configuration specifies that at most \n10 neighboring nodes will be sampled for any given node embedding during training.\n\n.. 
code-block:: yaml \n\n   train_neighbor_sampling:\n     - type: UNIFORM\n       options:\n         max_neighbors: 10\n     - type: UNIFORM\n       options:\n         max_neighbors: 10\n\n\n.. list-table:: DropoutSamplingOptions[NeighborSamplingOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - rate\n     - Float\n     - The dropout rate for a dropout layer.\n     - Yes\n\n`DROPOUT` mode neighbor sampling randomly drops `rate * 100` percent of neighbors during sampling. \n\n.. code-block:: yaml \n\n   train_neighbor_sampling:\n     - type: DROPOUT\n       options:\n         rate: 0.05\n\n\n.. _layer-conf-section:\n\nLayer Configuration\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n.. list-table:: LayerConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - type\n     - String\n     - Denotes the type of layer. Options: [\"EMBEDDING\", \"FEATURE\", \"GNN\", \"REDUCTION\"]\n     - Yes\n   * - options\n     - LayerOptions\n     - Layer specific options depending on the type.\n     - No\n   * - input_dim\n     - Int\n     - The dimension of the input to the layer.\n     - GNN and Reduction layers\n   * - output_dim\n     - Int\n     - The output dimension of the layer.\n     - Yes\n   * - init\n     - :ref:`InitConfig<init-conf-section>`\n     - Initialization method for the layer parameters. (Default GLOROT_UNIFORM).\n     - No\n   * - optimizer\n     - OptimizerConfig\n     - Optimizer to use for the parameters of this layer. If not given, the dense_optimizer is used.\n     - No\n   * - bias\n     - Bool\n     - Enable a bias to be applied to the output of the layer. (Default False)\n     - No\n   * - bias_init\n     - :ref:`InitConfig<init-conf-section>`\n     - Initialization method for the bias. 
The default initialization is zeroes.\n     - No\n   * - activation\n     - String\n     - Activation function to apply to the output of the layer. Options [\"RELU\", \"SIGMOID\", \"NONE\"]. (Default \"NONE\")\n     - No\n\nBelow is a configuration for creating an embedding layer with output dimension 50. Its bias is initialized with zeros and it has no activation \nset.\n\n.. code-block:: yaml\n\n   layers:\n   - - type: EMBEDDING\n       input_dim: -1\n       output_dim: 50\n       init:\n         type: GLOROT_NORMAL\n       optimizer:\n         type: DEFAULT\n         options:\n           learning_rate: 0.1\n       bias: true\n       bias_init:\n         type: ZEROS\n       activation: NONE\n\n\nA GNN layer of type GAT (Graph Attention) with input and output dimension of 50 is as follows.\n\n.. code-block:: yaml \n\n   layers:\n   - - type: GNN\n       options:\n         type: GAT\n       input_dim: 50\n       output_dim: 50\n       bias: true\n       init:\n         type: GLOROT_NORMAL\n\n\nA Reduction layer of type Linear, with input dimension of 100 and output dimension of 50 is as follows. \n\n.. code-block:: yaml\n\n   layers:\n   - - type: REDUCTION\n       input_dim: 100\n       output_dim: 50\n       bias: true\n       options:\n         type: LINEAR\n\n\nBelow is a simple Feature layer with output dimension of 50. The input dimension is set to -1 by default since both Feature and \nEmbedding layers do not have any input. \n\n.. code-block:: yaml\n\n   layers:\n   - - type: FEATURE\n       output_dim: 50\n       bias: true\n\n\nLayer Options\n\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n**GNN Layer Options**\n\n.. 
list-table:: GraphSageLayerOptions[LayerOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - type\n     - String\n     - The type of the GNN layer, for GraphSage, this must be equal to \"GRAPH_SAGE\".\n     - Yes\n   * - aggregator\n     - String\n     - Aggregation to use for graph sage, options are [\"GCN\", \"MEAN\"]. (Default \"MEAN\")\n     - No\n\nA GNN layer of type `GRAPH_SAGE` with aggregator set to `MEAN`. Another possible option is `GCN` (Graph Convolution).\n\n.. code-block:: yaml\n\n   - - type: GNN\n       options:\n         type: GRAPH_SAGE\n         aggregator: MEAN\n\n\n.. list-table:: GATLayerOptions[LayerOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - type\n     - String\n     - The type of the GNN layer, for GAT, this must be equal to \"GAT\".\n     - Yes\n   * - num_heads\n     - Int\n     - Number of attention heads to use. (Default 10)\n     - No\n   * - average_heads\n     - Bool\n     - If true, the attention heads will be averaged, otherwise they will be concatenated. (Default True)\n     - No\n   * - negative_slope\n     - Float\n     - Negative slope to use for LeakyReLU. (Default .2)\n     - No\n   * - input_dropout\n     - Float\n     - Dropout rate to apply to the input to the layer. (Default 0.0)\n     - No\n   * - attention_dropout\n     - Float\n     - Dropout rate to apply to the attention weights. (Default 0.0)\n     - No\n\nA GNN layer of type `GAT` (Graph Attention) with 50 attention heads. `input_dropout` is set to 0.1 implying that 10 percent of the \ninput tensor values will be randomly dropped.\n\n.. code-block:: yaml\n\n   - - type: GNN\n       options:\n         type: GAT\n         num_heads: 50\n         average_heads: True\n         input_dropout: 0.1\n\n\n**Reduction Layer Options**\n\n.. 
list-table:: ReductionLayerOptions[LayerOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - type\n     - String\n     - The type of the reduction layer. Options are: [\"CONCAT\", \"LINEAR\"]. (Default \"CONCAT\")\n     - Yes\n\nA reduction layer of type `LINEAR`. Another possible type for the reduction layer is `CONCAT`.\n\n.. code-block:: yaml\n\n   - - type: REDUCTION\n       options:\n         type: LINEAR\n\n\n.. _init-conf-section:\n\nInitialization Configuration\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n.. list-table:: InitConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - type\n     - String\n     - The type of the initialization. Options are: [\"GLOROT_UNIFORM\", \"GLOROT_NORMAL\", \"UNIFORM\", \"NORMAL\", \"ZEROES\", \"ONES\", \"CONSTANT\"]. Default \"GLOROT_UNIFORM\"\n     - Yes\n   * - options\n     - InitOptions\n     - Initialization specific options depending on the type.\n     - No\n\n.. code-block:: yaml\n\n   init:\n     type: GLOROT_NORMAL\n     options: {}\n\n\n**Uniform Init Options**\n\n.. list-table:: UniformInitOptions[InitOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - scale_factor\n     - Float\n     - The scale factor of the uniform distribution. (Default 1)\n     - No\n\nThe below configuration is used to initialize a layer with a uniform distribution of values ranging between [-scale_factor, +scale_factor]\n\n.. code-block:: yaml\n\n   init:\n     type: UNIFORM\n     options:\n       scale_factor: 1\n\n\n**Normal Init Options**\n\n.. list-table:: NormalInitOptions[InitOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - mean\n     - Float\n     - The mean of the distribution. 
(Default 0.0)\n     - No\n   * - std\n     - Float\n     - The standard deviation of the distribution. (Default 1.0)\n     - No\n\nThe below configuration is used to initialize a layer with values belonging to a normal distribution, with mean 0.5 and standard \ndeviation 0.1.\n\n.. code-block:: yaml\n\n   init:\n     type: NORMAL\n     options:\n       mean: 0.5\n       std: 0.1\n\n\n**Constant Init Options**\n\n.. list-table:: ConstantInitOptions[InitOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - constant\n     - Float\n     - The value to set all parameters. (Default 0.0)\n     - No\n\n`CONSTANT` initialization mode initializes all parameters of the layer to the specified constant value. \n\n.. code-block:: yaml\n\n   init:\n     type: CONSTANT\n     options:\n       constant: 0.4\n\n.. _decoder-conf-section:\n\nDecoder Configuration\n^^^^^^^^^^^^^^^^^^^^^\n\n.. list-table:: DecoderConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - type\n     - String\n     - Denotes the type of decoder. Options: [\"DISTMULT\", \"TRANSE\", \"COMPLEX\", \"NODE\"]. The first three are decoders for link prediction and the \"NODE\" decoder is used for node classification.\n     - Yes\n   * - options\n     - DecoderOptions\n     - Decoder specific options depending on the type.\n     - No\n   * - optimizer\n     - OptimizerConfig\n     - Optimizer to use for the parameters of the decoder (if any). If not given, the dense_optimizer is used.\n     - No\n\nBelow is a `DISTMULT` decoder with Adagrad Optimizer, that optimizes the loss function over edges as well as their inverses (dest->rel->src).\n\n.. 
code-block:: yaml\n\n   decoder:\n     type: DISTMULT\n     options:\n       inverse_edges: true\n     optimizer:\n       type: ADAGRAD\n       options:\n         learning_rate: 0.1\n\n\nDecoder Options\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n**Edge Decoder Options**\n\n.. list-table:: EdgeDecoderOptions[DecoderOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - inverse_edges\n     - Bool\n     - If true, the decoder will use two embeddings per edge-type (relation). Where one embedding is applied to the source node of an edge, and the other is applied to the destination node of an edge. Furthermore, the scores of the inverse of the edges will be computed (dst->rel->src) and used in the loss. (Default True)\n     - No\n   * - edge_decoder_method\n     - String\n     - Specifies how to apply the decoder to a given set of edges, and negatives. Options are [\"infer\", \"train\"]. (Default \"train\")\n     - No\n\n.. code-block:: yaml\n\n   decoder:\n     type: DISTMULT\n     options:\n       inverse_edges: true\n       edge_decoder_method: CORRUPT_NODE\n\n\n.. _loss-conf-section:\n\nLoss Configuration\n^^^^^^^^^^^^^^^^^^\n\n.. list-table:: LossConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - type\n     - String\n     - Denotes the type of the loss function. Options: [\"SOFTMAX_CE\", \"RANKING\", \"CROSS_ENTROPY\", \"BCE_AFTER_SIGMOID\", \"BCE_WITH_LOGITS\", \"MSE\", \"SOFTPLUS\"].\n     - Yes\n   * - options\n     - LossOptions\n     - Loss function specific options depending on the type.\n     - No\n\nBelow is the configuration for a `SOFTMAX_CE` loss function with `SUM` as the reduction method.\n\n.. code-block:: yaml\n\n   loss:\n     type: SOFTMAX_CE\n     options:\n       reduction: SUM\n\n\n**Loss Options**\n\n.. 
list-table:: LossOptions\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - reduction\n     - String\n     - The reduction to use for the loss. Options are [\"SUM\", \"MEAN\"]. (Default \"SUM\")\n     - No\n\nBelow is the configuration for a `SOFTMAX_CE` loss function with `MEAN` as the reduction method.\n\n.. code-block:: yaml\n\n   loss:\n     type: SOFTMAX_CE\n     options:\n       reduction: MEAN\n\n\n.. list-table:: RankingLossOptions[LossOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - reduction\n     - String\n     - The reduction to use for the loss. Options are [\"SUM\", \"MEAN\"]. (Default \"SUM\")\n     - No\n   * - margin\n     - Float\n     - The margin for the ranking loss function. (Default .1)\n     - No\n\nBelow is the configuration for a `RANKING` loss function with `margin` set to 1. \n\n.. code-block:: yaml\n\n   loss:\n     type: RANKING\n     options:\n       reduction: SUM\n       margin: 1\n\n\n.. _optimizer-conf-section:\n\nOptimizer Configuration\n^^^^^^^^^^^^^^^^^^^^^^^\n\n.. list-table:: OptimizerConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - type\n     - String\n     - Denotes the type of the optimizer. Options: [\"SGD\", \"ADAM\", \"ADAGRAD\"].\n     - Yes\n   * - options\n     - OptimizerOptions\n     - Optimizer specific options depending on the type.\n     - No\n\nThe configuration for an `ADAGRAD` optimizer with learning rate of 0.1 is as follows\n\n.. code-block:: yaml\n\n   optimizer:\n     type: ADAGRAD\n     options:\n       learning_rate: 0.1\n\n\n**SGD Options**\n\n.. list-table:: SGDOptions[OptimizerOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - learning_rate\n     - Float\n     - SGD learning rate. (Default .1)\n     - No\n\n.. 
code-block:: yaml\n\n   optimizer:\n     type: SGD\n     options:\n       learning_rate: 0.1\n\n\n**Adagrad Options**\n\n.. list-table:: AdagradOptions[OptimizerOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - learning_rate\n     - Float\n     - Adagrad learning rate. (Default .1)\n     - No\n   * - eps\n     - Float\n     - Term added to the denominator to improve numerical stability. (Default 1e-10)\n     - No\n   * - init_value\n     - Float\n     - Initial accumulator value. (Default 0.0)\n     - No\n   * - lr_decay\n     - Float\n     - Learning rate decay. (Default 0.0)\n     - No\n   * - weight_decay\n     - Float\n     - Weight decay (L2 penalty). (Default 0.0)\n     - No\n\nThe below configuration shows the options that can be set for `ADAGRAD` optimizer.\n\n.. code-block:: yaml\n\n   optimizer:\n     type: ADAGRAD\n     options:\n       learning_rate: 0.1\n       eps: 1.0e-10\n       init_value: 0.0\n       lr_decay: 0.0\n       weight_decay: 0.0\n\n\n**Adam Options**\n\n.. list-table:: AdamOptions[OptimizerOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - learning_rate\n     - Float\n     - Adam learning rate. (Default .1)\n     - No\n   * - amsgrad\n     - Bool\n     - Whether to use the AMSGrad variant of ADAM.\n     - No\n   * - beta_1\n     - Float\n     - Coefficient used for computing running averages of gradient and its square. (Default .9)\n     - No\n   * - beta_2\n     - Float\n     - Coefficient used for computing running averages of gradient and its square. (Default .999)\n     - No\n   * - eps\n     - Float\n     - Term added to the denominator to improve numerical stability. (Default 1e-8)\n     - No\n   * - weight_decay\n     - Float\n     - Weight decay (L2 penalty). (Default 0.0)\n     - No\n\nThe below configuration shows the options that can be set for `ADAM` optimizer.\n\n.. 
code-block:: yaml\n\n   optimizer:\n     type: ADAM\n     options:\n       learning_rate: 0.01\n       amsgrad: false\n       beta_1: 0.9\n       beta_2: 0.999\n       eps: 1.0e-08\n       weight_decay: 0.0\n\n\nStorage Configuration\n----------------------\n\n.. list-table:: StorageConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - device_type\n     - String\n     - Whether to use cpu or gpu training. Options are [\"CPU\", \"CUDA\"]. (Default \"CPU\")\n     - No\n   * - dataset\n     - DatasetConfig\n     - Contains information about the input dataset.\n     - Yes\n   * - edges\n     - StorageBackendConfig\n     - Storage backend of the edges. (Default edges.type = DEVICE_MEMORY, edges.options.dtype = int32)\n     - No\n   * - embeddings\n     - StorageBackendConfig\n     - Storage backend of the node embedding. (Default embeddings.type = DEVICE_MEMORY, embeddings.options.dtype = float32)\n     - No\n   * - features\n     - StorageBackendConfig\n     - Storage backend of the node features. (Default features.type DEVICE_MEMORY, features.options.dtype = float32)\n     - No\n   * - prefetch\n     - Bool\n     - If true and the nodes/features storage configuration uses a partition buffer, then node partitions and edge buckets will be prefetched. Note that this introduces additional memory overheads. (Default True)\n     - No\n   * - full_graph_evaluation\n     - Bool\n     - If true and the nodes/features storage configuration uses a partition buffer, evaluation will be performed with the full graph in memory (if there is enough memory). This is useful for fair comparisons across different storage configurations. (Default False)\n     - No\n   * - model_dir\n     - String\n     - Saves the model parameters in the given directory. If not specified, stores in `model_x` directory within the `dataset_dir` where x changes incrementally from 0 - 10. 
A maximum of 11 models are stored when `model_dir` is not specified, after which the contents of the `model_10/` directory are overwritten with the latest parameters.\n     - No\n\nBelow is a storage configuration that contains the path to the pre-processed data and specifies storage backends to be used for edges, features \nand embeddings.\n\n.. code-block:: yaml \n\n   storage:\n     device_type: cpu\n     dataset:\n       dataset_dir: /home/data/datasets/fb15k_237/\n     edges:\n       type: DEVICE_MEMORY\n       options:\n         dtype: int\n     nodes:\n       type: DEVICE_MEMORY\n       options:\n         dtype: int\n     embeddings:\n       type: DEVICE_MEMORY\n       options:\n         dtype: float\n     features:\n       type: DEVICE_MEMORY\n       options:\n         dtype: float\n     prefetch: true\n     shuffle_input: true\n     full_graph_evaluation: true\n     export_encoded_nodes: true\n     log_level: info\n\n\nDataset Configuration\n^^^^^^^^^^^^^^^^^^^^^\n\n.. list-table:: DatasetConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - dataset_dir\n     - String\n     - Directory containing the preprocessed dataset. Also used to store model parameters and embedding table.\n     - Yes\n   * - num_edges\n     - Int\n     - Number of edges in the input graph. If link prediction, this should be set to the number of training edges.\n     - No\n   * - num_nodes\n     - Int\n     - Number of nodes in the input graph.\n     - No\n   * - num_relations\n     - Int\n     - Number of relations (edge-types) in the input graph. (Default 1)\n     - No\n   * - num_train\n     - Int\n     - Number of training examples. In link prediction the examples are edges, in node classification they are nodes.\n     - No\n   * - num_valid\n     - Int\n     - Number of validation examples. If not given, no validation will be performed.\n     - No\n   * - num_test\n     - Int\n     - Number of test examples. 
If not given, only training will occur.\n     - No (Evaluation)\n   * - node_feature_dim\n     - Int\n     - Dimension of the node features, if any.\n     - No\n   * - num_classes\n     - Int\n     - Number of class labels.\n     - No (Node classification)\n\nFor Marius in-built datasets, the below numbers are retrieved from the output of `marius_preprocess`. For custom user datasets, a \nfile with the dataset statistics mentioned above should be present in the `dataset_dir`. Below is the configuration for the `fb15k_237` dataset. \n\n.. code-block:: yaml \n\n   storage:\n     dataset:\n       dataset_dir: /home/data/datasets/fb15k_237/\n\n\nStorage Backend Configuration\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. list-table:: StorageBackendConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - type\n     - String\n     - The type of storage backend to use. The valid options depend on the data being stored. For edges, the valid backends are [\"FLAT_FILE\", \"HOST_MEMORY\", \"DEVICE_MEMORY\"]. For embeddings and features, the valid choices are [\"PARTITION_BUFFER\", \"HOST_MEMORY\", \"DEVICE_MEMORY\"]\n     - Yes\n   * - options\n     - StorageOptions\n     - Storage backend options depending on the type of storage.\n     - No\n\nThe below configuration specifies that the edges be stored in `DEVICE_MEMORY`, i.e., CPU/GPU memory based on `device_type`.\n\n.. code-block:: yaml\n\n   edges:\n     type: DEVICE_MEMORY\n     options:\n       dtype: int\n\n\nStorage Backend Options\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n.. list-table:: StorageOptions\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - dtype\n     - String\n     - The datatype of the storage. Valid options [\"FLOAT\", \"FLOAT32\", \"DOUBLE\", \"FLOAT64\", \"INT\", \"INT32\", \"LONG\", \"INT64\"]. The default value depends on the data being stored. 
For edges, the default is \"INT32\", otherwise the default is \"FLOAT32\"\n     - No\n\nA configuration defining the datatype of the input edges as `int`.\n\n.. code-block:: yaml\n\n   edges:\n     options:\n       dtype: int\n\n\n.. list-table:: PartitionBufferOptions[StorageOptions]\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - dtype\n     - String\n     - The datatype of the storage. Valid options [\"FLOAT\", \"FLOAT32\", \"DOUBLE\", \"FLOAT64\"]. (Default \"FLOAT32\")\n     - No\n   * - num_partitions\n     - Int\n     - Number of node partitions.\n     - Yes\n   * - buffer_capacity\n     - Int\n     - Number of partitions which can fit in the buffer.\n     - Yes\n   * - prefetching\n     - Bool\n     - If true, partitions will be prefetched and written to storage asynchronously. This prevents IO wait times at the cost of additional memory overheads. (Default True)\n     - No\n\nBelow is a disk-based storage configuration, where at max of `buffer_capacity` embeddings buckets are stored in memory at any given time. \nThe dataset must be partitioned using `marius_preprocess` with `--num_partitions` set accordingly. \n\n.. code-block:: yaml\n\n   embeddings:\n     type: PARTITION_BUFFER\n     options:\n       dtype: float\n       num_partitions: 10\n       buffer_capacity: 5\n       prefetching: true\n\n\nTraining Configuration\n-----------------------\n\n.. list-table:: TrainingConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - batch_size\n     - Int\n     - Amount of training examples per batch. 
(Default 1000)\n     - No\n   * - negative_sampling\n     - NegativeSamplingConfig\n     - Negative sampling configuration for link prediction.\n     - Link Prediction\n   * - num_epochs\n     - Int\n     - Number of epochs to train.\n     - Yes\n   * - pipeline\n     - PipelineConfig\n     - Advanced configuration of the training pipeline. Defaults to synchronous training.\n     - No\n   * - epochs_per_shuffle\n     - Int\n     - Sets how often to shuffle the training data. (Default 1)\n     - No\n   * - logs_per_epoch\n     - Int\n     - Sets how often to report progress during an epoch. (Default 10)\n     - No\n   * - save_model\n     - Bool\n     - If true, the model will be saved at the end of training. (Default True)\n     - No\n   * - resume_training\n     - Bool\n     - If true, the training procedure will resume from the previous state and will train `num_epochs` further epochs.  (Default False)\n     - No\n   * - resume_from_checkpoint\n     - String\n     - If set, loads the model from the given directory and resumes the training procedure. Will train `num_epochs` further epochs and store the new model parameters in `model_dir`.\n     - No\n\nA training configuration with a batch size of 1000 and a total of 10 epochs is as follows. `pipeline.sync` is set to true, which ensures that \nthe training is synchronous and doesn't allow staleness. Marius groups edges into chunks and reuses negative samples within the chunk. \n`num_chunks`*`negatives_per_positive` negative edges are sampled for each positive edge.\n\n.. code-block:: yaml\n\n   training:\n     batch_size: 1000\n     negative_sampling:\n       num_chunks: 10\n       negatives_per_positive: 10\n       degree_fraction: 0.0\n       filtered: false\n     num_epochs: 10\n     pipeline:\n       sync: true\n     epochs_per_shuffle: 1\n     logs_per_epoch: 10\n     save_model: true\n     resume_training: false\n\n\nEvaluation Configuration\n-------------------------\n\n.. 
list-table:: EvaluationConfig\n   :widths: 15 10 50 15\n   :header-rows: 1\n\n   * - Key\n     - Type\n     - Description\n     - Required\n   * - batch_size\n     - Int\n     - Amount of evaluation examples per batch. (Default 1000)\n     - No\n   * - negative_sampling\n     - NegativeSamplingConfig\n     - Negative sampling configuration for link prediction.\n     - Link Prediction\n   * - pipeline\n     - PipelineConfig\n     - Advanced configuration of the evaluation pipeline. Defaults to synchronous evaluation.\n     - No\n   * - epochs_per_eval\n     - Int\n     - Sets how often to evaluate the model. (Default 1)\n     - No\n\nAn evaluation configuration with batchsize of 1000 is as follows. `num_chunks`*`negatives_per_positive` negative edges are sampled \nfor each positive edge.\n\n.. code-block:: yaml\n\n   evaluation:\n     batch_size: 1000\n     negative_sampling:\n       num_chunks: 1\n       negatives_per_positive: 1000\n       degree_fraction: 0.0\n       filtered: true\n     pipeline:\n       sync: true\n     epochs_per_eval: 1"
  },
  {
    "path": "docs/config_interface/index.rst",
    "content": "\nConfiguration Interface\n**************************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n    :caption: Contents\n\n    configuration\n    samples\n    full_schema\n\n\n"
  },
  {
    "path": "docs/config_interface/samples.rst",
    "content": "\nSample Files\n======================\n\nModel Configs\n-------------\n\nDistMult\n^^^^^^^^\n\n+-----------------------------------------------+---------------------------------------------+\n|                                               |                                             |\n|.. code-block:: yaml                           |.. image:: ../assets/samples_dismult.png     |\n|                                               |                                             |\n|   model:                                      |                                             |\n|     learning_task: LINK_PREDICTION            |                                             |\n|     encoder:                                  |                                             |\n|       layers:                                 |                                             |\n|         - - type: EMBEDDING                   |                                             |\n|             output_dim: 50                    |                                             |\n|             bias: true                        |                                             |\n|             init:                             |                                             |\n|               type: GLOROT_NORMAL             |                                             |\n|     decoder:                                  |                                             |\n|       type: DISTMULT                          |                                             |\n|     loss:                                     |                                             |\n|       type: SOFTMAX_CE                        |                                             |\n|       options:                                |                                             |\n|         reduction: SUM                        |                                             |\n|     dense_optimizer:                       
   |                                             |\n|       type: ADAM                              |                                             |\n|       options:                                |                                             |\n|         learning_rate: 0.01                   |                                             |\n|     sparse_optimizer:                         |                                             |\n|       type: ADAGRAD                           |                                             |\n|       options:                                |                                             |\n|         learning_rate: 0.1                    |                                             |\n|                                               |                                             |\n+-----------------------------------------------+---------------------------------------------+\n\n\nThe above configuration has a simple embedding layer whose output is fed to the decoder layer, which uses a SoftmaxCrossEntropy loss function to\noptimize the loss value. An Adagrad sparse optimizer is used for the node embeddings and Adam Optimizer for all other model parameters.\n\nGraph Sage (3-layer)\n^^^^^^^^^^^^^^^^^^^^\n\n+----------------------------------------+--------------------------------------+\n|                                        |                                      |\n|.. code-block:: yaml                    |.. 
image:: ../assets/samples_gs.png   |\n|                                        |  :width: 700                         |\n|   model:                               |                                      |\n|     learning_task: LINK_PREDICTION     |                                      |\n|     encoder:                           |                                      |\n|       train_neighbor_sampling:         |                                      |\n|         - type: ALL                    |                                      |\n|         - type: ALL                    |                                      |\n|         - type: ALL                    |                                      |\n|       layers:                          |                                      |\n|         - - type: EMBEDDING            |                                      |\n|             output_dim: 50             |                                      |\n|             bias: true                 |                                      |\n|             init:                      |                                      |\n|               type: GLOROT_NORMAL      |                                      |\n|         - - type: GNN                  |                                      |\n|             options:                   |                                      |\n|               type: GRAPH_SAGE         |                                      |\n|               aggregator: MEAN         |                                      |\n|             input_dim: 50              |                                      |\n|             output_dim: 50             |                                      |\n|             bias: true                 |                                      |\n|             init:                      |                                      |\n|               type: GLOROT_NORMAL      |                                      |\n|         - - type: GNN                  |           
                           |\n|             options:                   |                                      |\n|               type: GRAPH_SAGE         |                                      |\n|               aggregator: MEAN         |                                      |\n|             input_dim: 50              |                                      |\n|             output_dim: 50             |                                      |\n|             bias: true                 |                                      |\n|             init:                      |                                      |\n|               type: GLOROT_NORMAL      |                                      |\n|         - - type: GNN                  |                                      |\n|             options:                   |                                      |\n|               type: GRAPH_SAGE         |                                      |\n|               aggregator: MEAN         |                                      |\n|             input_dim: 50              |                                      |\n|             output_dim: 50             |                                      |\n|             bias: true                 |                                      |\n|             init:                      |                                      |\n|               type: GLOROT_NORMAL      |                                      |\n|     decoder:                           |                                      |\n|       type: DISTMULT                   |                                      |\n|     loss:                              |                                      |\n|       type: SOFTMAX_CE                 |                                      |\n|       options:                         |                                      |\n|         reduction: SUM                 |                                      |\n|     dense_optimizer:                   |                   
                   |\n|       type: ADAM                       |                                      |\n|       options:                         |                                      |\n|         learning_rate: 0.01            |                                      |\n|     sparse_optimizer:                  |                                      |\n|       type: ADAGRAD                    |                                      |\n|       options:                         |                                      |\n|         learning_rate: 0.1             |                                      |\n|                                        |                                      |\n+----------------------------------------+--------------------------------------+\n\n\nGraph Sage (3 layer) has an initial stage consisting of an embedding layer. It is connected to 3 stages of GraphSage GNN layers. \nThe number of training/evaluation neighbor sampling layers is equal to the GNN stages defined in the model. \n\nGAT (3-layer)\n^^^^^^^^^^^^^\n\n.. 
code-block:: yaml\n\n   model:\n     learning_task: LINK_PREDICTION\n     encoder:\n       train_neighbor_sampling:\n         - type: ALL\n         - type: ALL\n         - type: ALL\n       layers:\n         - - type: EMBEDDING\n             output_dim: 50\n             bias: true\n             init:\n               type: GLOROT_NORMAL\n         - - type: GNN\n             options:\n               type: GAT\n             input_dim: 50\n             output_dim: 50\n             bias: true\n             init:\n               type: GLOROT_NORMAL\n         - - type: GNN\n             options:\n               type: GAT\n             input_dim: 50\n             output_dim: 50\n             bias: true\n             init:\n               type: GLOROT_NORMAL\n         - - type: GNN\n             options:\n               type: GAT\n             input_dim: 50\n             output_dim: 50\n             bias: true\n             init:\n               type: GLOROT_NORMAL\n     decoder:\n       type: DISTMULT\n     loss:\n       type: SOFTMAX_CE\n       options:\n         reduction: SUM\n     dense_optimizer:\n       type: ADAM\n       options:\n         learning_rate: 0.01\n     sparse_optimizer:\n       type: ADAGRAD\n       options:\n         learning_rate: 0.1\n\nGAT (3 layer) has an initial stage consisting of an embedding layer. It is connected to 3 stages of GAT GNN layers. The number of \ntraining/evaluation neighbor sampling layers is equal to the GNN stages defined in the model. \n\nEmbeddings + Features + Edges\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nThe supported storage backends for embeddings and features are `PARTITION_BUFFER`, `DEVICE_MEMORY` and `HOST_MEMORY`. For edges, \nthe supported backends are `FLAT_FILE`, `DEVICE_MEMORY`, `HOST_MEMORY`.\n\nStorage Configs\n---------------\n\nGPU Memory\n^^^^^^^^^^\n.. 
code-block:: yaml\n\n   storage:\n     device_type: cuda\n     dataset:\n       dataset_dir: /home/data/datasets/fb15k_237/\n     edges:\n       type: DEVICE_MEMORY\n       options:\n         dtype: int\n     embeddings:\n       type: DEVICE_MEMORY\n       options:\n         dtype: float\n\nIn the above configuration, both edges and embeddings are stored in GPU memory. \n\nMixed CPU-GPU\n^^^^^^^^^^^^^\n\n.. code-block:: yaml\n\n   storage:\n     device_type: cuda\n     dataset:\n       dataset_dir: /home/data/datasets/fb15k_237/\n     edges:\n       type: HOST_MEMORY\n       options:\n         dtype: int\n     embeddings:\n       type: HOST_MEMORY\n       options:\n         dtype: float\n\nThis configuration places the edge data in the CPU memory and maintains the embedding data in GPU memory.\n\nDisk-Based\n^^^^^^^^^^\n\n.. code-block:: yaml\n\n   storage:\n     device_type: cuda\n     dataset:\n       dataset_dir: /home/data/datasets/fb15k_237/\n     edges:\n       type: FLAT_FILE\n       options:\n         dtype: int\n     embeddings:\n       type: DEVICE_MEMORY\n       options:\n         dtype: float\n\nIn this configuration, the edge data is stored in a flat file, on disk. FLAT_FILE storage backend is supported for edges alone,\nbecause there is no need for an index lookup. Instead, edges are traversed sequentially.\n\nMarius supports `PARTITION_BUFFER` mode to store embedding data, where all data is stored on disk and only necessary chunks are \nfetched and kept in the buffer. The edges are traversed in an order that minimizes bucket swaps in the buffer. It can be configured \nas follows:\n\n.. 
code-block:: yaml\n\n   storage:\n     device_type: cuda\n     dataset:\n       dataset_dir: /home/data/datasets/fb15k_237_partitioned/\n     edges:\n       type: FLAT_FILE\n       options:\n         dtype: int\n     embeddings:\n       type: PARTITION_BUFFER\n       options:\n         dtype: float\n         num_partitions: 10\n         buffer_capacity: 5\n\nThe above configuration states that at most 5 node embedding buckets can be present in memory at any given time. \n\nTraining Configs\n----------------\n\nSynchronous Training\n^^^^^^^^^^^^^^^^^^^^\n\nTo speed up training, Graph Learning systems use pipelined architecture and try to overlap data movement with computation. This\nintroduces bounded staleness in the system, wherein after a set of updates to the node embeddings, the existing mini-batches in the \npipeline use stale node embeddings. Marius provides an explicit option to turn off asynchronous training and ensure that every\nmini-batch sees the latest updated node embeddings. The following can be used to set training as synchronous:\n\n.. code-block:: yaml\n\n   training:\n     batch_size: 1000\n     negative_sampling:\n       num_chunks: 10\n       negatives_per_positive: 10\n       degree_fraction: 0\n       filtered: false\n     num_epochs: 10\n     pipeline:\n       sync: true\n\n\nPipelined Training\n^^^^^^^^^^^^^^^^^^\n\nMarius uses a pipelined training architecture that can interleave data access, transfer, and computation to achieve high utilization. This \nintroduces the possibility of a few mini-batches using stale parameters during training. Below is a sample configuration where the training \nis async, and the staleness is set to 16 i.e. at most 16 mini-batches use stale node embeddings after any set of node embeddings are updated.\n\n.. 
code-block:: yaml\n\n   pipeline:\n     sync: false\n     gpu_sync_interval: 16\n     gpu_model_average: true\n     staleness_bound: 16\n     batch_host_queue_size: 4\n     batch_device_queue_size: 4\n     gradients_device_queue_size: 4\n     gradients_host_queue_size: 4\n     batch_loader_threads: 4\n     batch_transfer_threads: 2\n     compute_threads: 1\n     gradient_transfer_threads: 2\n     gradient_update_threads: 4\n\nMarius follows a 5-staged pipeline architecture, 4 of which are responsible for data movement and the other is for model computation \nand in-GPU parameter updates. The `pipeline` field has options for setting thread counts for each of these stages.\n\nEvaluation Configs\n-------------------\n\nLink Prediction Filtered\n^^^^^^^^^^^^^^^^^^^^^^^^\n\nAn Evaluation configuration for Link Prediction with a batch size of 1000. When `filtered` is set to true, false negative sampled edges\nwill be filtered out. \n\n.. code-block:: yaml\n\n   evaluation:\n     batch_size: 1000\n     negative_sampling:\n       num_chunks: 1\n       negatives_per_positive: 1000\n       degree_fraction: 0.0\n       filtered: true\n     pipeline:\n       sync: true\n     epochs_per_eval: 1\n\nLink Prediction Unfiltered\n^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nUnfiltered Evaluation configuration for Link Prediction with a batch size of 1000. False negative sampled edges will not be filtered out.\n\n.. code-block:: yaml\n\n   evaluation:\n     batch_size: 1000\n     negative_sampling:\n       num_chunks: 10\n       negatives_per_positive: 100\n       filtered: false\n     pipeline:\n       sync: true\n     epochs_per_eval: 1\n\nNode Classification\n^^^^^^^^^^^^^^^^^^^\n\nSample Evaluation configuration for a Node Classification task.\n\n.. code-block:: yaml\n\n   evaluation:\n     batch_size: 1000\n     pipeline:\n       sync: true\n     epochs_per_eval: 1\n "
  },
  {
    "path": "docs/db2graph/db2graph.rst",
    "content": "Db2Graph: Database to Graph conversion tool\n============================================\n\nIntroduction\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n**Db2Graph** converts **relational databases** into **graphs as sets of triples** which can be used as **input datasets for Marius**, allowing streamlined preprocessing from database to Marius. Db2Graph comes with Marius but can be used as a standalone tool. Conversion with Db2Graph is achieved in the following steps: \n\n#. Users import/create the database locally\n\n#. Users define the configuration file and edge SQL SELECT queries\n\n#. Db2Graph executes the SQL SELECT queries\n\n#. Db2Graph transforms the result set of queries into sets of triples\n\nBelow we lay out the requirements, definitions, and steps for using Db2Graph, and a real example use case:\n\nRequirements\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nDb2Graph currently supports graph conversion from three relational database management systems: **MySQL**, **MariaDB**, and **PostgreSQL**. Db2Graph requires no additional installation as all the required python packages are part of Marius installation. Please refer to `mairus installation <https://github.com/marius-team/marius/blob/main/README.md>`_ for installing the required packages.\n\nSystem Design\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nDb2Graph classifies a graph into the following two types:\n\n* Entity Nodes: Nodes that are globally unique. Global uniqueness is ensured by appending ``table-name_col-name_val`` to the literal. In a graph, entity nodes either point to other entity nodes or are pointed to by other entity nodes.\n* Edges of Entity Node to Entity Node: Directed edges where both source and destination are entity nodes.\n\nDuring the conversion, we assume that all nodes are **case insensitive**. 
We ignore the following set of **invalid node names**: ``\"0\", None, \"\", 0, \"not reported\", \"None\", \"none\"``.\n\nDb2Graph outputs a set of triplets in the format of ``[source node] [edge] [destination node]`` where each element in the triplets is delimited by a single tab. This output format aligns with the input format of Marius, allowing streamlined preprocessing from database to using Marius.\n\nHow to Use\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nFirst, make sure marius is installed with the optional db2graph dependencies: ``python3 -m pip install .[db2graph]``.\n\nAssuming that a database has already been created, graph conversion with Db2Graph can be achieved in the following steps:\n\n#. | First, create a YAML configuration file ``config.yaml`` and a query definition file to contain SQL SELECT queries of type ``edges_queries``. Assume that the config file and query file are placed in a ``./conf/`` directory. \n\n    .. code-block:: bash\n    \n       $ ls -l .\n       conf/  \n         config.yaml                         # config file\n         edges_queries.txt             # defines edges_queries\n\n   | Define the configuration file in ``config.yaml``. Below is a sample configuration file. Note that all fields are required. An error would be thrown if the query files do not exist.\n    \n        .. code-block:: yaml\n        \n            db_server: postgre-sql\n            db_name: sample_db\n            db_user: sample_user\n            db_password: sample_password\n            db_host: localhost\n            edges_queries: conf/edges_queries.txt\n\n    .. list-table::\n       :widths: 15 10 50 15\n       :header-rows: 1\n    \n       * - Key\n         - Type\n         - Description\n         - Required\n       * - db_server\n         - String\n         - Denotes the RDBMS to use. 
Options: [\"maria-db\", \"postgre-sql\", \"my-sql\"].\n         - Yes\n       * - db_name\n         - String\n         - Denotes the name of the database.\n         - Yes\n       * - db_user\n         - String\n         - Denotes the user name to access the database.\n         - Yes\n       * - db_password\n         - String\n         - Password to access the database.\n         - Yes\n       * - db_host\n         - String\n         - Denotes the hostname of the database.\n         - Yes\n       * - edges_queries\n         - String\n         - Path to the text file that contains the SQL SELECT queries fetching edges from entity nodes to entity nodes.\n         - Yes\n\n#. | Next, define SQL SELECT queries. Assume the file ``conf/edges_queries.txt`` has been created. In it, define queries with the following format with no empty lines in-between lines. Each edge consists of two rows: A single ``relation_name`` followed by another row of SQL SELECT query. Note that you can include any SQL keyword after WHERE clause.\n    \n    .. code-block:: sql\n           \n           relation_name_A_to_B -- this is the name of the edge from A to B\n           SELECT table1_name.column_name_A, table2_name.column_name_B FROM table1_name, table2_name WHERE ...; -- this row represents an edge from source entity node A to destination entity node B\n           relation_name_B_to_C -- this is the name of the edge from B to C\n           SELECT table1_name.column_name_B, table2_name.column_name_C FROM table1_name, table2_name WHERE ...; -- this row represents an edge from source entity node B to destination entity node C\n\n   | The user can expand or shorten the list of queries in the above query definition file to query a certain subset of data from the database.\n\n   .. note:: \n       Db2Graph validates the correctness of format of each query. However, it does not validate the correctness of the queries. 
That is, it assumes that all column names and table names exist in the given database schema provided by the user. An error will be thrown in the event that the validation check fails.\n    \n   .. note:: \n       There cannot be ``AS`` alias within the queries. Any alias violates the correctness of the queries in Db2Graph.\n    \n#. | Lastly, execute Db2Graph with the following commands. Two flags are required. Note that prints will include both errors and general information, and those are also logged to ``./output_dir/output.log``:\n\n    .. code-block:: bash\n        \n           $ marius_db2graph --config_path conf/config.yaml --output_directory output_dir/\n           Starting marius_db2graph conversion tool for config: conf/config.yaml\n           ...\n           Edge file written to output_dir/edges.txt\n\n   | The  ``--config_path`` flag specifies where the configuration file created by the user is.\n\n   | The  ``--output_directory`` flag specifies where the data will be output and is set by the user. In this example, assume we have not created the output_dir directory. ``db2graph`` will create it for us. \n\n   | The conversion result will be written to ``edges.txt`` in a newly created directory named ``./output_dir``:\n    \n    .. 
code-block:: bash\n        \n           $ ls -l .\n           output_dir/\n             edges.txt                       # generated file with sets of triples\n             output.log                          # output log file\n           conf/  \n             config.yaml                         # config file\n             edges_queries.txt             # defines edges_queries    \n          $ cat output_dir/edges.txt\n          column_name_A    relation_name_A_to_B    column_name_B\n          column_name_B    relation_name_B_to_C    column_name_C\n    \nEnd-to-end Example Use Case\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\nWe use `the Sakila DVD store database <https://dev.mysql.com/doc/sakila/en/>`_ from MySQL to demonstrate an end-to-end example from converting a database into a graph using Db2Graph to preprocessing and training the dataset using Marius. For simplicity, we have provided a dockerfile and a bash script which install Marius along with Db2Graph and initialize the Sakila database for you. \n\n#. | First, download and place the provided ``dockerfile`` and ``run.sh`` in the current working directory. Create and run a docker container using the dockerfile. This dockerfile pre-installs Marius and all dependencies needed for using Marius in this end-to-end example. It also copies ``run.sh`` into the container. \n\n    .. code-block:: bash\n    \n       $ docker build -t db2graph_image . # Builds a docker image named db2graph_image\n       $ docker run --name db2graph_container -itd db2graph_image # Create the container named db2graph_container\n       $ docker exec -it db2graph_container bash # Run the container in interactive mode in bash\n\n   | In the root directory of the container, execute ``run.sh``. This script downloads and initializes the Sakila database. Note that the database name is ``sakila``, the username is set to ``sakila_user``, and the password is set to ``sakila_password``.\n    \n       .. 
code-block:: bash\n    \n        $ run.sh\n        $ cd marius/\n\n   | To verify that the database has been installed correctly:\n    \n       .. code-block:: bash\n    \n        $ mysql\n        mysql> USE sakila;\n        mysql> SHOW FULL tables;\n        +----------------------------+------------+\n        | Tables_in_sakila           | Table_type |\n        +----------------------------+------------+\n        | actor                      | BASE TABLE |\n        | actor_info                 | VIEW       |\n         ...\n        23 rows in set (0.01 sec)    \n\n    .. note::\n       \n       If you see any error of type ``ERROR 2002 (HY000): Can't connect to local MySQL server through socket '/var/run/mysqld/mysqld.sock' (111)``, run the command ``systemctl start mysql`` and retry.\n\n#. | Next, create the configuration file for using Db2Graph. Assuming we are in the ``marius/`` root directory, create & navigate to the ``datasets/sakila`` directory. Create the ``conf/config.yaml`` and ``conf/edges_queries.txt`` files if they have not been created. \n\n    .. code-block:: bash \n       \n       $ mkdir -p datasets/sakila/conf/\n       $ vi datasets/sakila/conf/config.yaml\n       $ vi datasets/sakila/conf/edges_queries.txt\n\n   | In ``datasets/sakila/conf/config.yaml``, define the following fields:\n    \n    .. code-block:: yaml\n        \n            db_server: my-sql\n            db_name: sakila\n            db_user: sakila_user\n            db_password: sakila_password\n            db_host: 127.0.0.1\n            edges_queries: datasets/sakila/conf/edges_queries.txt\n\n   | In ``datasets/sakila/conf/edges_queries.txt``, define the following queries. Note that we create three edges/relationships: An actor acted in a film; A film sold by a store; A film categorized as a category.\n    \n    .. 
code-block:: sql\n           \n           acted_in\n           SELECT actor.first_name, film.title FROM actor, film_actor, film WHERE actor.actor_id = film_actor.actor_id AND film_actor.film_id = film.film_id ORDER BY film.title ASC;\n           sold_by\n           SELECT film.title, address.address FROM film, inventory, store, address WHERE film.film_id = inventory.film_id AND inventory.store_id = store.store_id AND store.address_id = address.address_id ORDER BY film.title ASC;\n           categorized_as\n           SELECT film.title, category.name FROM film, film_category, category WHERE film.film_id = film_category.film_id AND film_category.category_id = category.category_id ORDER BY film.title ASC;  \n\n   | For simplicity, we limit the queries to focus on the film table. The user can expand or shorten the list of queries in each of the above query definition files to query a certain subset of data from the database. For the Sakila database structure, please refer to `this MySQL documentation <https://dev.mysql.com/doc/sakila/en/sakila-structure.html>`_.\n\n    .. note::\n       \n       The queries above have ``ORDER BY`` clause at the end, which is not compulsory (and can have performance impact). We have kept it for the example because it will ensure same output across multiple runs. For optimal performance remove the ``ORDER BY`` clause.\n   \n#. | Lastly, execute Db2Graph with the following script:\n\n    .. code-block:: bash\n        \n           $ marius_db2graph --config_path datasets/sakila/conf/config.yaml --output_directory datasets/sakila/\n           Starting marius_db2graph conversion tool for config: datasets/sakila/conf/config.yaml\n           ...\n           Total execution time: 0.382 seconds\n           Edge file written to datasets/sakila/edges.txt\n\n   | The conversion result was written to ``edges.txt`` in the specified directory ``datasets/sakila/``. 
In ``edges.txt``, there should be 7915 edges representing the three relationships we defined earlier:\n    \n    .. code-block:: bash\n        \n           $ ls -1 datasets/sakila/\n           edges.txt                       # generated file with sets of triples\n           marius_db2graph.log             # output log file\n           conf/  \n             ...    \n          $ cat datasets/sakila/edges.txt\n          actor_first_name_rock   acted_in        film_title_academy dinosaur\n          actor_first_name_mary   acted_in        film_title_academy dinosaur\n          actor_first_name_oprah  acted_in        film_title_academy dinosaur\n          ...\n\n    .. note::\n       \n       This concludes the example for using Db2Graph. For an end-to-end example of using Db2Graph with Marius, continue through the sections below. For example, for a custom link prediction example, follow `Custom Link Prediction example <https://github.com/marius-team/marius/blob/main/docs/examples/python/lp_custom.rst>`_ from the docs. Please refer to docs/examples to see all the examples.\n   \n#. | Preprocessing and training a custom dataset like the Sakila database is straightforward with the ``marius_preprocess`` and ``marius_train`` commands. These commands come with ``marius`` when ``marius`` is installed.\n\n    .. 
code-block:: bash\n        \n           $  marius_preprocess --output_dir datasets/sakila/ --edges datasets/sakila/edges.txt --dataset_split 0.8 0.1 0.1 --delim=\"\\t\"\n           Preprocess custom dataset\n           Reading edges\n           /usr/local/lib/python3.8/dist-packages/marius/tools/preprocess/converters/readers/pandas_readers.py:55: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n             train_edges_df = pd.read_csv(self.train_edges, delimiter=self.delim, skiprows=self.header_length, header=None)\n           Remapping Edges\n           Node mapping written to: datasets/sakila/nodes/node_mapping.txt\n           Relation mapping written to: datasets/sakila/edges/relation_mapping.txt\n           Splitting into: 0.8/0.1/0.1 fractions\n           Dataset statistics written to: datasets/sakila/dataset.yaml\n\n   | In the above command, we set ``dataset_split`` to a list of ``0.8 0.1 0.1``. Under the hood, this splits ``edges.txt`` into ``edges/train_edges.bin``, ``edges/validation_edges.bin`` and ``edges/test_edges.bin`` based on the given list of fractions.\n\n   | Note that ``edges.txt`` contains three columns delimited by tabs, so we set ``--delim=\"\\t\"``.\n\n   | The  ``--edges`` flag specifies the raw edge list file that ``marius_preprocess`` will preprocess (and train later).\n\n   | The  ``--output_directory`` flag specifies where the preprocessed graph will be output and is set by the user. In this example, assume we have not created the ``datasets/sakila/`` directory. ``marius_preprocess`` will create it for us. \n\n   | For detailed usages of  ``marius_preprocess``, please execute the following command:\n\n    .. 
code-block:: bash\n\n        $ marius_preprocess -h\n\n   | Let's check again what was created inside the ``datasets/sakila/`` directory:\n\n    .. code-block:: bash\n\n      $ ls -1 datasets/sakila/ \n      dataset.yaml                       # input dataset statistics                                \n      nodes/  \n        node_mapping.txt                 # mapping of raw node ids to integer uuids\n      edges/   \n        relation_mapping.txt             # mapping of relations to integer uuids\n        test_edges.bin                   # preprocessed testing edge list \n        train_edges.bin                  # preprocessed training edge list \n        validation_edges.bin             # preprocessed validation edge list \n      conf/                              # directory containing config files\n        ...  \n\n   | Let's check what is inside the generated ``dataset.yaml`` file:\n\n    .. code-block:: bash\n\n      $ cat datasets/sakila/dataset.yaml\n        dataset_dir: /marius/datasets/sakila/\n        num_edges: 6332\n        num_nodes: 1146\n        num_relations: 3\n        num_train: 6332\n        num_valid: 791\n        num_test: 792\n        node_feature_dim: -1\n        rel_feature_dim: -1\n        num_classes: -1\n        initialized: false\n\n    .. note:: \n      If the above ``marius_preprocess`` command fails due to any missing directory errors, please create the ``<output_directory>/edges`` and ``<output_directory>/nodes`` directories as a workaround.\n\n   | To train a model, we need to define a YAML configuration file based on information created from ``marius_preprocess``. An example YAML configuration file for the Sakila database (link prediction model with DistMult) is given in ``examples/configuration/sakila.yaml``. Note that the ``dataset_dir`` is set to the preprocessing output directory, in our example, ``datasets/sakila/``.\n   \n   | Let's create the same YAML configuration file for the Sakila database from scratch. 
We follow the structure of the configuration file and create each of the four sections one by one. In a YAML file, indentation is used to denote nesting and all parameters are in the format of key-value pairs. \n  \n    .. code-block:: bash\n\n      $ vi datasets/sakila/sakila.yaml \n\n    .. note:: \n      String values in the configuration file are case insensitive but we use capital letters for convention.\n\n   | First, we define the **model**. We begin by setting all required parameters. This includes ``learning_task``, ``encoder``, ``decoder``, and ``loss``. The rest of the configurations can be fine-tuned by the user.\n\n    .. code-block:: yaml\n    \n        model:\n          learning_task: LINK_PREDICTION # set the learning task to link prediction\n          encoder:\n            layers:\n              - - type: EMBEDDING # set the encoder to be an embedding table with 50-dimensional embeddings\n                  output_dim: 50\n          decoder:\n            type: DISTMULT # set the decoder to DistMult\n            options:\n              input_dim: 50\n          loss:\n            type: SOFTMAX_CE\n            options:\n              reduction: SUM\n          dense_optimizer: # optimizer to use for dense model parameters. In this case these are the DistMult relation (edge-type) embeddings\n              type: ADAM\n              options:\n                learning_rate: 0.1\n          sparse_optimizer: # optimizer to use for node embedding table\n              type: ADAGRAD\n              options:\n                learning_rate: 0.1\n        storage:\n          # omit\n        training:\n          # omit\n        evaluation:\n          # omit\n      \n   | Next, we set the **storage** and **dataset**. We begin by setting all required parameters. This includes ``dataset``. Here, the ``dataset_dir`` is set to ``datasets/sakila/``, which is the preprocessing output directory.\n\n    .. 
code-block:: yaml\n    \n        model:\n          # omit\n        storage:\n          device_type: cuda\n          dataset:\n            dataset_dir: /marius/datasets/sakila/\n          edges:\n            type: DEVICE_MEMORY\n          embeddings:\n            type: DEVICE_MEMORY\n          save_model: true\n        training:\n          # omit\n        evaluation:\n          # omit\n\n   | Lastly, we configure **training** and **evaluation**. We begin by setting all required parameters. This includes ``num_epochs`` and ``negative_sampling``. We set ``num_epochs=10`` (10 epochs to train) to demonstrate this example. Note that ``negative_sampling`` is required for link prediction.\n\n    .. code-block:: yaml\n    \n        model:\n          # omit\n        storage:\n          # omit\n        training:\n          batch_size: 1000\n          negative_sampling:\n            num_chunks: 10\n            negatives_per_positive: 500\n            degree_fraction: 0.0\n            filtered: false\n          num_epochs: 10\n          pipeline:\n            sync: true\n          epochs_per_shuffle: 1        \n        evaluation:\n          batch_size: 1000\n          negative_sampling:\n            filtered: true\n          pipeline:\n            sync: true   \n\n   | After defining our configuration file, training is run with ``marius_train <your_config.yaml>``.\n\n   | We can now train our example using the configuration file we just created by running the following command (assuming we are in the ``marius`` root directory):\n\n    .. 
code-block:: bash\n\n      $ marius_train datasets/sakila/sakila.yaml  \n      [2022-06-19 07:01:39.828] [info] [marius.cpp:44] Start initialization\n      [06/19/22 07:01:44.287] Initialization Complete: 4.458s\n      [06/19/22 07:01:44.292] ################ Starting training epoch 1 ################\n      [06/19/22 07:01:44.308] Edges processed: [1000/6332], 15.79%\n      [06/19/22 07:01:44.311] Edges processed: [2000/6332], 31.59%\n      [06/19/22 07:01:44.313] Edges processed: [3000/6332], 47.38%\n      [06/19/22 07:01:44.315] Edges processed: [4000/6332], 63.17%\n      [06/19/22 07:01:44.317] Edges processed: [5000/6332], 78.96%\n      [06/19/22 07:01:44.320] Edges processed: [6000/6332], 94.76%\n      [06/19/22 07:01:44.322] Edges processed: [6332/6332], 100.00%\n      [06/19/22 07:01:44.322] ################ Finished training epoch 1 ################\n      [06/19/22 07:01:44.322] Epoch Runtime: 29ms\n      [06/19/22 07:01:44.322] Edges per Second: 218344.83\n      [06/19/22 07:01:44.322] Evaluating validation set\n      [06/19/22 07:01:44.329]\n      =================================\n      Link Prediction: 1582 edges evaluated\n      Mean Rank: 548.639697\n      MRR: 0.005009\n      Hits@1: 0.000632\n      Hits@3: 0.001264\n      Hits@5: 0.001264\n      Hits@10: 0.001896\n      Hits@50: 0.034766\n      Hits@100: 0.075221\n      =================================\n      [06/19/22 07:01:44.330] Evaluating test set\n      [06/19/22 07:01:44.333]\n      =================================\n      Link Prediction: 1584 edges evaluated\n      Mean Rank: 525.809343\n      MRR: 0.006225\n      Hits@1: 0.000000\n      Hits@3: 0.001263\n      Hits@5: 0.004419\n      Hits@10: 0.005682\n      Hits@50: 0.046086\n      Hits@100: 0.107323\n      =================================\n\n   | After running this configuration for 10 epochs, we should see a result similar to below:\n\n    .. 
code-block:: bash\n\n      [06/19/22 07:01:44.524] ################ Starting training epoch 10 ################\n      [06/19/22 07:01:44.527] Edges processed: [1000/6332], 15.79%\n      [06/19/22 07:01:44.529] Edges processed: [2000/6332], 31.59%\n      [06/19/22 07:01:44.531] Edges processed: [3000/6332], 47.38%\n      [06/19/22 07:01:44.533] Edges processed: [4000/6332], 63.17%\n      [06/19/22 07:01:44.536] Edges processed: [5000/6332], 78.96%\n      [06/19/22 07:01:44.538] Edges processed: [6000/6332], 94.76%\n      [06/19/22 07:01:44.540] Edges processed: [6332/6332], 100.00%\n      [06/19/22 07:01:44.540] ################ Finished training epoch 10 ################\n      [06/19/22 07:01:44.540] Epoch Runtime: 16ms\n      [06/19/22 07:01:44.540] Edges per Second: 395749.97\n      [06/19/22 07:01:44.540] Evaluating validation set\n      [06/19/22 07:01:44.544]\n      =================================\n      Link Prediction: 1582 edges evaluated\n      Mean Rank: 469.225664\n      MRR: 0.047117\n      Hits@1: 0.030973\n      Hits@3: 0.044880\n      Hits@5: 0.051833\n      Hits@10: 0.071429\n      Hits@50: 0.136536\n      Hits@100: 0.197219\n      =================================\n      [06/19/22 07:01:44.544] Evaluating test set\n      [06/19/22 07:01:44.547]\n      =================================\n      Link Prediction: 1584 edges evaluated\n      Mean Rank: 456.828283\n      MRR: 0.041465\n      Hits@1: 0.023990\n      Hits@3: 0.040404\n      Hits@5: 0.051768\n      Hits@10: 0.068813\n      Hits@50: 0.147096\n      Hits@100: 0.210227\n      =================================\n   \n   | Let's check again what was added in the ``datasets/sakila/`` directory. For clarity, we only list the files that were created in training. Notice that several files have been created, including the trained model, the embedding table, a full configuration file, and output logs:\n\n    .. 
code-block:: bash\n\n      $ ls datasets/sakila/ \n      model_0/\n        embeddings.bin                   # trained node embeddings of the graph\n        embeddings_state.bin             # node embedding optimizer state\n        model.pt                         # contains the dense model parameters, embeddings of the edge-types\n        model_state.pt                   # optimizer state of the trained model parameters\n        node_mapping.txt                 # mapping of raw node ids to integer uuids\n        relation_mapping.txt             # mapping of relations to integer uuids\n        full_config.yaml                 # detailed config generated based on user-defined config\n        metadata.csv                     # information about metadata\n        logs/                            # logs containing output, error, debug information, etc.\n      nodes/  \n        ...\n      edges/   \n        ...\n      ...\n\n    .. note:: \n        ``model.pt`` contains the dense model parameters. For DistMult, this is the embeddings of the edge-types. For GNN encoders, this file will include the GNN parameters.\n      "
  },
  {
    "path": "docs/examples/config/index.rst",
    "content": ".. _configuration_examples:\n\n\nConfiguration Examples\n**************************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    lp_fb15k237\n    lp_paleobiology\n    lp_custom\n    nc_ogbn_arxiv\n    nc_custom\n\n\n"
  },
  {
    "path": "docs/examples/config/lp_custom.rst",
    "content": "Custom Dataset Link Prediction\n---------------------------------------------\n\nIn this tutorial, we use the **OGBN_Arxiv dataset** as an example to demonstrate a step-by-step walkthrough from preprocessing a **custom dataset** to defining the configuration file and to training **a link prediction model with the DistMult algorithm**.\n\n1. Preprocess Dataset\n^^^^^^^^^^^^^^^^^^^^^\n\nPreprocessing a custom dataset is straightforward with the ``marius_preprocess`` command. This command comes with ``marius`` when ``marius`` is installed. See (TODO link) for installation information.\n\nLet's start by downloading and extracting the OGBN_Arxiv dataset we will use in this example if it has not been downloaded (assuming we are in the ``marius`` root directory):\n \n.. code-block:: bash\n\n   $ wget http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip      # download original dataset\n   $ unzip arxiv.zip -d datasets/custom_lp_example/                     # extract downloaded dataset\n   $ gzip -dr datasets/custom_lp_example/arxiv/raw/                     # extract raw dataset files\n   $ gzip -dr datasets/custom_lp_example/arxiv/split/time/              # extract raw split files\n\nAfter the previous step, we should have the directory ``datasets/custom_lp_example/arxiv/raw/`` created containing the following raw files downloaded and extracted from the OGBN_Arxiv dataset:\n\n.. 
code-block:: bash\n\n   $ ls -1 datasets/custom_lp_example/arxiv/raw/ \n    edge.csv                        # raw edge list\n    node-feat.csv                   # raw node features\n    node-label.csv                  # raw node labels\n    node_year.csv  \n    num-edge-list.csv  \n    num-node-list.csv\n   $ head -5 arxiv/raw/edge.csv\n    104447,13091\n    15858,47283\n    107156,69161\n    107156,136440\n    107156,107366\n\nAssuming ``marius_preprocess`` has been built, we preprocess the OGBN_Arxiv dataset by running the following command (assuming we are in the ``marius`` root directory):\n\n.. code-block:: bash\n\n   $ marius_preprocess --output_dir datasets/custom_lp_example/ \n                        --edges datasets/custom_lp_example/arxiv/raw/edge.csv \n                        --dataset_split 0.8 0.1 0.1 --delim=\",\" --columns 0 1\n    Preprocess custom dataset\n    Reading edges\n    Remapping Edges\n    Node mapping written to: datasets/custom_lp/nodes/node_mapping.txt\n    Dataset statistics written to: datasets/custom_lp/dataset.yaml\n\nIn the above command, we set ``dataset_split`` to a list of ``0.8 0.1 0.1``. Under the hood, this splits ``edge.csv`` into ``edges/train_edges.bin``, ``edges/validation_edges.bin`` and ``edges/test_edges.bin`` based on the given list of fractions.\n\nNote that ``edge.csv`` contains two columns delimited by comma, so we set ``--columns 0 1`` and ``--delim=\",\"``.\n\nThe  ``--edges`` flag specifies the raw edge list file that ``marius_preprocess`` will preprocess (and train later).\n\nThe  ``--output_directory`` flag specifies where the preprocessed graph will be output and is set by the user. In this example, assume we have not created the ``datasets/custom_lp_example/`` directory. ``marius_preprocess`` will create it for us. \n\nFor detailed usages of  ``marius_preprocess``, please execute the following command:\n\n.. 
code-block:: bash\n\n   $ marius_preprocess -h\n\nLet's check again what was created inside the ``datasets/custom_lp_example/`` directory:\n\n.. code-block:: bash\n\n   $ ls -1 datasets/custom_lp_example/ \n   dataset.yaml                       # input dataset statistics                                \n   nodes/  \n     node_mapping.txt                 # mapping of raw node ids to integer uuids\n   edges/   \n     test_edges.bin                   # preprocessed testing edge list \n     train_edges.bin                  # preprocessed training edge list \n     validation_edges.bin             # preprocessed validation edge list \n   arxiv/                             # existing arxiv dir\n     ...  \n\nLet's check what is inside the generated ``dataset.yaml`` file:\n\n.. code-block:: bash\n\n   $ cat datasets/custom_lp_example/dataset.yaml\n    dataset_dir: /marius-internal/datasets/custom_lp_example/\n    num_edges: 932994\n    num_nodes: 169343\n    num_relations: 1\n    num_train: 932994\n    num_valid: 116624\n    num_test: 116625\n    node_feature_dim: -1\n    rel_feature_dim: -1\n    num_classes: -1\n    initialized: false\n\n.. note:: \n   If the above ``marius_preprocess`` command fails due to any missing directory errors, please create the ``<output_directory>/edges`` and ``<output_directory>/nodes`` directories as a workaround.\n\n2. Define Configuration File\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nTo train a model, we need to define a YAML configuration file based on information created from ``marius_preprocess``. \n\nThe configuration file contains information including but not limited to the inputs to the model, training procedure, and hyperparameters to optimize. Given a configuration file, marius assembles a model depending on the given parameters. 
The configuration file is grouped up into four sections:\n\n* Model: Defines the architecture of the model, neighbor sampling configuration, loss, and optimizer(s)\n* Storage: Specifies the input dataset and how to store the graph, features, and embeddings.\n* Training: Sets options for the training procedure and hyperparameters. E.g. batch size, negative sampling.\n* Evaluation: Sets options for the evaluation procedure (if any). The options here are similar to those in the training section.\n\nFor the full configuration schema, please refer to ``docs/config_interface``. \n\nAn example YAML configuration file for the OGBN_Arxiv dataset (link prediction model with DistMult) is given in ``examples/configuration/custom_lp.yaml``. Note that the ``dataset_dir`` is set to the preprocessing output directory, in our example, ``datasets/custom_lp_example/``.\n\nLet's create the same YAML configuration file for the OGBN_Arxiv dataset from scratch. We follow the structure of the configuration file and create each of the four sections one by one. In a YAML file, indentation is used to denote nesting and all parameters are in the format of key-value pairs. \n\n.. note:: \n   String values in the configuration file are case insensitive but we use capital letters for convention.\n\n#. First, we define the **model**. We begin by setting all required parameters. This includes ``learning_task``, ``encoder``, ``decoder``, and ``loss``. The rest of the configurations can be fine-tuned by the user.\n\n    .. 
code-block:: yaml\n    \n        model:\n          learning_task: LINK_PREDICTION # set the learning task to link prediction\n          encoder:\n            layers:\n              - - type: EMBEDDING # set the encoder to be an embedding table with 50-dimensional embeddings\n                  output_dim: 50\n          decoder:\n            type: DISTMULT # set the decoder to DistMult\n            options:\n              input_dim: 50\n          loss:\n            type: SOFTMAX_CE\n            options:\n              reduction: SUM\n          dense_optimizer: # optimizer to use for dense model parameters. In this case these are the DistMult relation (edge-type) embeddings\n              type: ADAM\n              options:\n                learning_rate: 0.1\n          sparse_optimizer: # optimizer to use for node embedding table\n              type: ADAGRAD\n              options:\n                learning_rate: 0.1\n        storage:\n          # omit\n        training:\n          # omit\n        evaluation:\n          # omit\n      \n#. Next, we set the **storage** and **dataset**. We begin by setting all required parameters. This includes ``dataset``. Here, the ``dataset_dir`` is set to ``datasets/custom_lp_example/``, which is the preprocessing output directory.\n\n    .. code-block:: yaml\n    \n        model:\n          # omit\n        storage:\n          device_type: cuda\n          dataset:\n            dataset_dir: datasets/custom_lp_example/\n          edges:\n            type: DEVICE_MEMORY\n          embeddings:\n            type: DEVICE_MEMORY\n          save_model: true\n        training:\n          # omit\n        evaluation:\n          # omit\n\n#. Lastly, we configure **training** and **evaluation**. We begin by setting all required parameters. This includes ``num_epochs`` and ``negative_sampling``. We set ``num_epochs=10`` (10 epochs to train) to demonstrate this example. 
Note that ``negative_sampling`` is required for link prediction.\n\n    .. code-block:: yaml\n    \n        model:\n          # omit\n        storage:\n          # omit\n        training:\n          batch_size: 1000\n          negative_sampling:\n            num_chunks: 10\n            negatives_per_positive: 500\n            degree_fraction: 0.0\n            filtered: false\n          num_epochs: 10\n          pipeline:\n            sync: true\n          epochs_per_shuffle: 1        \n        evaluation:\n          batch_size: 1000\n          negative_sampling:\n            filtered: true\n          pipeline:\n            sync: true   \n     \n3. Train Model\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nAfter defining our configuration file, training is run with ``marius_train <your_config.yaml>``.\n\nWe can now train our example using the configuration file we just created by running the following command (assuming we are in the ``marius`` root directory):\n\n.. code-block:: bash\n\n   $ marius_train datasets/custom_lp_example/custom_lp.yaml\n    [2022-04-04 17:11:53.029] [info] [marius.cpp:45] Start initialization\n    [04/04/22 17:11:57.581] Initialization Complete: 4.552s\n    [04/04/22 17:11:57.650] ################ Starting training epoch 1 ################\n    [04/04/22 17:11:57.824] Edges processed: [94000/932994], 10.08%\n    [04/04/22 17:11:57.988] Edges processed: [188000/932994], 20.15%\n    [04/04/22 17:11:58.153] Edges processed: [282000/932994], 30.23%\n    [04/04/22 17:11:58.317] Edges processed: [376000/932994], 40.30%\n    [04/04/22 17:11:58.484] Edges processed: [470000/932994], 50.38%\n    [04/04/22 17:11:58.650] Edges processed: [564000/932994], 60.45%\n    [04/04/22 17:11:58.817] Edges processed: [658000/932994], 70.53%\n    [04/04/22 17:11:59.008] Edges processed: [752000/932994], 80.60%\n    [04/04/22 17:11:59.200] Edges processed: [846000/932994], 90.68%\n    [04/04/22 17:11:59.408] Edges processed: [932994/932994], 100.00%\n    [04/04/22 
17:11:59.408] ################ Finished training epoch 1 ################\n    [04/04/22 17:11:59.408] Epoch Runtime: 1758ms\n    [04/04/22 17:11:59.408] Edges per Second: 530713.3\n    [04/04/22 17:11:59.408] Evaluating validation set\n    [04/04/22 17:12:00.444]\n    =================================\n    Link Prediction: 116624 edges evaluated\n    Mean Rank: 10927.984317\n    MRR: 0.088246\n    Hits@1: 0.043936\n    Hits@3: 0.091285\n    Hits@5: 0.123697\n    Hits@10: 0.176499\n    Hits@50: 0.337538\n    Hits@100: 0.414872\n    =================================\n    [04/04/22 17:12:00.444] Evaluating test set\n    [04/04/22 17:12:01.470]\n    =================================\n    Link Prediction: 116625 edges evaluated\n    Mean Rank: 10928.291687\n    MRR: 0.088237\n    Hits@1: 0.043798\n    Hits@3: 0.091670\n    Hits@5: 0.123190\n    Hits@10: 0.176377\n    Hits@50: 0.337749\n    Hits@100: 0.414697\n    =================================\n\nAfter running this configuration for 10 epochs, we should see a result similar to below:\n\n.. 
code-block:: bash\n\n    =================================\n    [04/04/22 17:12:32.312] ################ Starting training epoch 10 ################\n    [04/04/22 17:12:32.475] Edges processed: [94000/932994], 10.08%\n    [04/04/22 17:12:32.638] Edges processed: [188000/932994], 20.15%\n    [04/04/22 17:12:32.800] Edges processed: [282000/932994], 30.23%\n    [04/04/22 17:12:32.963] Edges processed: [376000/932994], 40.30%\n    [04/04/22 17:12:33.126] Edges processed: [470000/932994], 50.38%\n    [04/04/22 17:12:33.313] Edges processed: [564000/932994], 60.45%\n    [04/04/22 17:12:33.500] Edges processed: [658000/932994], 70.53%\n    [04/04/22 17:12:33.666] Edges processed: [752000/932994], 80.60%\n    [04/04/22 17:12:33.835] Edges processed: [846000/932994], 90.68%\n    [04/04/22 17:12:33.988] Edges processed: [932994/932994], 100.00%\n    [04/04/22 17:12:33.988] ################ Finished training epoch 10 ################\n    [04/04/22 17:12:33.988] Epoch Runtime: 1676ms\n    [04/04/22 17:12:33.988] Edges per Second: 556679\n    [04/04/22 17:12:33.988] Evaluating validation set\n    [04/04/22 17:12:35.010]\n    =================================\n    Link Prediction: 116624 edges evaluated\n    Mean Rank: 5765.685716\n    MRR: 0.132049\n    Hits@1: 0.048926\n    Hits@3: 0.149883\n    Hits@5: 0.210797\n    Hits@10: 0.304637\n    Hits@50: 0.536768\n    Hits@100: 0.626072\n    =================================\n    [04/04/22 17:12:35.011] Evaluating test set\n    [04/04/22 17:12:36.034]\n    =================================\n    Link Prediction: 116625 edges evaluated\n    Mean Rank: 5797.073741\n    MRR: 0.132749\n    Hits@1: 0.049406\n    Hits@3: 0.151588\n    Hits@5: 0.211944\n    Hits@10: 0.304437\n    Hits@50: 0.536549\n    Hits@100: 0.626006\n    =================================\n\n\nLet's check again what was added in the ``datasets/custom_lp_example/`` directory. For clarity, we only list the files that were created in training. 
Notice that several files have been created, including the trained model, the embedding table, a full configuration file, and output logs:\n\n.. code-block:: bash\n\n   $ ls datasets/custom_lp_example/ \n   model.pt                           # contains the dense model parameters, embeddings of the edge-types\n   model_state.pt                     # optimizer state of the trained model parameters\n   full_config.yaml                   # detailed config generated based on user-defined config\n   metadata.csv                       # information about metadata\n   logs/                              # logs containing output, error, debug information, and etc.\n   nodes/  \n     embeddings.bin                   # trained node embeddings of the graph\n     embeddings_state.bin             # node embedding optimizer state\n     ...\n   edges/   \n     ...\n   ...\n\n.. note:: \n    ``model.pt`` contains the dense model parameters. For DistMult, this is the embeddings of the edge-types. For GNN encoders, this file will include the GNN parameters.\n\n4. Inference\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n4.1 Command Line\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n4.2 Load Into Python\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n"
  },
  {
    "path": "docs/examples/config/lp_fb15k237.rst",
    "content": "Small Scale Link Prediction (FB15K-237)\n---------------------------------------------\n\nIn this tutorial, we use the **FB15K_237 knowledge graph** as an example to demonstrate a step-by-step walkthrough from preprocessing the dataset to defining the configuration file and to training **a link prediction model with the DistMult algorithm**.\n\n1. Preprocess Dataset\n^^^^^^^^^^^^^^^^^^^^^\n\nPreprocessing a dataset is straightforward with the ``marius_preprocess`` command. This command comes with ``marius`` when ``marius`` is installed. See (TODO link) for installation information.\n\nAssuming ``marius_preprocess`` has been built, we preprocess the FB15K_237 dataset by running the following command (assuming we are in the ``marius`` root directory):\n\n.. code-block:: bash\n\n   $ marius_preprocess --dataset fb15k_237 --output_directory datasets/fb15k_237_example/\n   Downloading FB15K-237.2.zip to datasets/fb15k_237_example/FB15K-237.2.zip\n   Reading edges\n   Remapping Edges\n   Node mapping written to: datasets/fb15k_237_example/nodes/node_mapping.txt\n   Relation mapping written to: datasets/fb15k_237_example/edges/relation_mapping.txt\n   Dataset statistics written to: datasets/fb15k_237_example/dataset.yaml\n\nThe  ``--dataset`` flag specifies which of the pre-set datasets ``marius_preprocess`` will preprocess and download.\n\nThe  ``--output_directory`` flag specifies where the preprocessed graph will be output and is set by the user. In this example, assume we have not created the datasets/fb15k_237_example repository. ``marius_preprocess`` will create it for us. \n\nFor detailed usages of  ``marius_preprocess``, please execute the following command:\n\n.. code-block:: bash\n\n   $ marius_preprocess -h\n\nLet's check what is inside the created directory:\n\n.. 
code-block:: bash\n\n   $ ls -l datasets/fb15k_237_example/ \n   dataset.yaml                       # input dataset statistics                                \n   nodes/  \n     node_mapping.txt                 # mapping of raw node ids to integer uuids\n   edges/   \n     relation_mapping.txt             # mapping of raw edge(relation) ids to integer uuids\n     test_edges.bin                   # preprocessed testing edge list \n     train_edges.bin                  # preprocessed training edge list \n     validation_edges.bin             # preprocessed validation edge list \n   train.txt                          # raw training edge list                                              \n   test.txt                           # raw testing edge list    \n   valid.txt                          # raw validation edge list    \n   text_cvsc.txt                      # relation triples as used in Toutanova and Chen CVSC-2015\n   text_emnlp.txt                     # relation triples as used in Toutanova et al. EMNLP-2015\n   README.txt                         # README of the downloaded FB15K-237 dataset\n\nLet's check what is inside the generated ``dataset.yaml`` file:\n\n.. code-block:: bash\n\n   $ cat datasets/fb15k_237_example/dataset.yaml\n   dataset_dir: /marius-internal/datasets/fb15k_237_example/\n   num_edges: 272115\n   num_nodes: 14541\n   num_relations: 237\n   num_train: 272115\n   num_valid: 17535\n   num_test: 20466\n   node_feature_dim: -1\n   rel_feature_dim: -1\n   num_classes: -1\n   initialized: false\n\n.. note:: \n   If the above ``marius_preprocess`` command fails due to any missing directory errors, please create the ``<output_directory>/edges`` and ``<output_directory>/nodes`` directories as a workaround.\n\n2. Define Configuration File\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nTo train a model, we need to define a YAML configuration file based on information created from marius_preprocess. 
\n\nThe configuration file contains information including but not limited to the inputs to the model, training procedure, and hyperparameters to optimize. Given a configuration file, marius assembles a model depending on the given parameters. The configuration file is grouped up into four sections:\n\n* Model: Defines the architecture of the model, neighbor sampling configuration, loss, and optimizer(s)\n* Storage: Specifies the input dataset and how to store the graph, features, and embeddings.\n* Training: Sets options for the training procedure and hyperparameters. E.g. batch size, negative sampling.\n* Evaluation: Sets options for the evaluation procedure (if any). The options here are similar to those in the training section.\n\nFor the full configuration schema, please refer to ``docs/config_interface``.\n\nAn example YAML configuration file for the FB15K_237 dataset is given in ``examples/configuration/fb15k_237.yaml``. Note that the ``dataset_dir`` is set to the preprocessing output directory, in our example, ``datasets/fb15k_237_example/``.\n\nLet's create the same YAML configuration file for the FB15K_237 dataset from scratch. We follow the structure of the configuration file and create each of the four sections one by one. In a YAML file, indentation is used to denote nesting and all parameters are in the format of key-value pairs. \n\n#. First, we define the **model**. We begin by setting all required parameters. This includes ``learning_task``, ``encoder``, ``decoder``, and ``loss``. The rest of the configurations can be fine-tuned by the user.\n\n    .. 
code-block:: yaml\n    \n        model:\n          learning_task: LINK_PREDICTION # set the learning task to link prediction\n          encoder:\n            layers:\n              - - type: EMBEDDING # set the encoder to be an embedding table with 50-dimensional embeddings\n                  output_dim: 50\n          decoder:\n            type: DISTMULT # set the decoder to DistMult\n            options:\n              input_dim: 50\n          loss:\n            type: SOFTMAX_CE\n            options:\n              reduction: SUM\n          dense_optimizer: # optimizer to use for dense model parameters. In this case these are the DistMult relation (edge-type) embeddings\n              type: ADAM\n              options:\n                learning_rate: 0.1\n          sparse_optimizer: # optimizer to use for node embedding table\n              type: ADAGRAD\n              options:\n                learning_rate: 0.1\n        storage:\n          # omit\n        training:\n          # omit\n        evaluation:\n          # omit\n      \n#. Next, we set the **storage** and **dataset**. We begin by setting all required parameters. This includes ``dataset``. Here, the ``dataset_dir`` is set to ``datasets/fb15k_237_example/``, which is the preprocessing output directory.\n\n    .. code-block:: yaml\n    \n        model:\n          # omit\n        storage:\n          device_type: cuda\n          dataset:\n            dataset_dir: datasets/fb15k_237_example/\n          edges:\n            type: DEVICE_MEMORY\n          embeddings:\n            type: DEVICE_MEMORY\n          save_model: true\n        training:\n          # omit\n        evaluation:\n          # omit\n\n#. Lastly, we configure **training** and **evaluation**. We begin by setting all required parameters. This includes ``num_epochs`` and ``negative_sampling``. We set ``num_epochs=10`` (10 epochs to train) to demonstrate this example. Note that ``negative_sampling`` is required for link prediction.\n\n    .. 
code-block:: yaml\n    \n        model:\n          # omit\n        storage:\n          # omit\n        training:\n          batch_size: 1000\n          negative_sampling:\n            num_chunks: 10\n            negatives_per_positive: 500\n            degree_fraction: 0.0\n            filtered: false\n          num_epochs: 10\n          pipeline:\n            sync: true\n          epochs_per_shuffle: 1        \n        evaluation:\n          batch_size: 1000\n          negative_sampling:\n            filtered: true\n          pipeline:\n            sync: true   \n     \n3. Train Model\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nAfter defining our configuration file, training is run with ``marius_train <your_config.yaml>``.\n\nWe can now train our example using the configuration file we just created by running the following command (assuming we are in the ``marius`` root directory):\n\n.. code-block:: bash\n\n   $ marius_train datasets/fb15k_237_example/fb15k_237.yaml\n    [2022-04-03 14:53:15.106] [info] [marius.cpp:45] Start initialization\n    [04/03/22 14:53:19.140] Initialization Complete: 4.034s\n    [04/03/22 14:53:19.147] ################ Starting training epoch 1 ################\n    [04/03/22 14:53:19.224] Edges processed: [28000/272115], 10.29%\n    [04/03/22 14:53:19.295] Edges processed: [56000/272115], 20.58%\n    [04/03/22 14:53:19.369] Edges processed: [84000/272115], 30.87%\n    [04/03/22 14:53:19.447] Edges processed: [112000/272115], 41.16%\n    [04/03/22 14:53:19.525] Edges processed: [140000/272115], 51.45%\n    [04/03/22 14:53:19.603] Edges processed: [168000/272115], 61.74%\n    [04/03/22 14:53:19.685] Edges processed: [196000/272115], 72.03%\n    [04/03/22 14:53:19.765] Edges processed: [224000/272115], 82.32%\n    [04/03/22 14:53:19.851] Edges processed: [252000/272115], 92.61%\n    [04/03/22 14:53:19.906] Edges processed: [272115/272115], 100.00%\n    [04/03/22 14:53:19.906] ################ Finished training epoch 1 ################\n    [04/03/22 
14:53:19.906] Epoch Runtime: 758ms\n    [04/03/22 14:53:19.906] Edges per Second: 358990.75\n    [04/03/22 14:53:19.906] Evaluating validation set\n    [04/03/22 14:53:19.972]\n    =================================\n    Link Prediction: 35070 edges evaluated\n    Mean Rank: 443.786313\n    MRR: 0.233709\n    Hits@1: 0.157998\n    Hits@3: 0.258597\n    Hits@5: 0.308640\n    Hits@10: 0.382407\n    Hits@50: 0.560137\n    Hits@100: 0.633191\n    =================================\n    [04/03/22 14:53:19.972] Evaluating test set\n    [04/03/22 14:53:20.043]\n    =================================\n    Link Prediction: 40932 edges evaluated\n    Mean Rank: 454.272940\n    MRR: 0.230645\n    Hits@1: 0.155282\n    Hits@3: 0.253103\n    Hits@5: 0.304065\n    Hits@10: 0.382073\n    Hits@50: 0.559758\n    Hits@100: 0.630192\n    =================================\n\nAfter running this configuration for 10 epochs, we should see a result similar to below with a MRR roughly equal to 0.25:\n\n.. code-block:: bash\n\n    =================================\n    [04/03/22 14:53:27.861] ################ Starting training epoch 10 ################\n    [04/03/22 14:53:27.944] Edges processed: [28000/272115], 10.29%\n    [04/03/22 14:53:28.023] Edges processed: [56000/272115], 20.58%\n    [04/03/22 14:53:28.115] Edges processed: [84000/272115], 30.87%\n    [04/03/22 14:53:28.220] Edges processed: [112000/272115], 41.16%\n    [04/03/22 14:53:28.315] Edges processed: [140000/272115], 51.45%\n    [04/03/22 14:53:28.410] Edges processed: [168000/272115], 61.74%\n    [04/03/22 14:53:28.506] Edges processed: [196000/272115], 72.03%\n    [04/03/22 14:53:28.602] Edges processed: [224000/272115], 82.32%\n    [04/03/22 14:53:28.699] Edges processed: [252000/272115], 92.61%\n    [04/03/22 14:53:28.772] Edges processed: [272115/272115], 100.00%\n    [04/03/22 14:53:28.772] ################ Finished training epoch 10 ################\n    [04/03/22 14:53:28.772] Epoch Runtime: 911ms\n    [04/03/22 
14:53:28.772] Edges per Second: 298699.22\n    [04/03/22 14:53:28.772] Evaluating validation set\n    [04/03/22 14:53:28.834]\n    =================================\n    Link Prediction: 35070 edges evaluated\n    Mean Rank: 303.712946\n    MRR: 0.259462\n    Hits@1: 0.173253\n    Hits@3: 0.286570\n    Hits@5: 0.348104\n    Hits@10: 0.434474\n    Hits@50: 0.626775\n    Hits@100: 0.706045\n    =================================\n    [04/03/22 14:53:28.835] Evaluating test set\n    [04/03/22 14:53:28.904]\n    =================================\n    Link Prediction: 40932 edges evaluated\n    Mean Rank: 317.841664\n    MRR: 0.255330\n    Hits@1: 0.169794\n    Hits@3: 0.281858\n    Hits@5: 0.341860\n    Hits@10: 0.429859\n    Hits@50: 0.625208\n    Hits@100: 0.703875\n    =================================\n\nLet's check again what was added in the ``datasets/fb15k_237_example/`` directory. For clarity, we only list the files that were created in training. Notice that several files have been created, including the trained model, the embedding table, a full configuration file, and output logs:\n\n.. code-block:: bash\n\n   $ ls datasets/fb15k_237_example/ \n   model.pt                           # contains the dense model parameters, embeddings of the edge-types\n   model_state.pt                     # optimizer state of the trained model parameters\n   full_config.yaml                   # detailed config generated based on user-defined config\n   metadata.csv                       # information about metadata\n   logs/                              # logs containing output, error, debug,  information\n   nodes/  \n     embeddings.bin                   # trained node embeddings of the graph\n     embeddings_state.bin             # node embedding optimizer state\n     ...\n   edges/   \n     ...\n   ...\n\n.. note::\n  ``model.pt`` contains the dense model parameters. For DistMult, this is the embeddings of the edge-types. 
For GNN encoders, this file will include the GNN parameters.\n\n4. Inference\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n4.1 Command Line\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n4.2 Load Into Python\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n"
  },
  {
    "path": "docs/examples/config/lp_paleobiology.rst",
    "content": ".. _lp_paleo:\n\nPaleobiology Dataset Link Prediction\n---------------------------------------------\nIn this tutorial, we will use Marius to perform link prediction on a paleobiology knowledge graph dataset, i.e. predicting the existence of an edge between two nodes in a graph. This will cover the end-to-end process of downloading a dataset, running Marius to learn the embeddings of all the nodes and edges in the graph, and then using these embeddings to infer new links and make discoveries.\n\nDataset Information\n^^^^^^^^^^^^^^^^^^^^^\nIn our example we will train on a graph-structured paleobiology dataset which contains information about fossils and their relations to Earth. In this knowledge graph dataset, nodes represent different types of entities and directed edges are relationships between them. This dataset contains 14,752 nodes and 107,247 edges, with 5 possible relation types.\n\nThe **nodes/entities** in this graph fall into 10 different types. They are:\n\n1. Country\n2. State\n3. County\n4. Lithology\n5. Formation\n6. Geological Group\n7. Member\n8. Taxon\n9. Environment\n10. Geological Interval\n\nThe directed edges/relations in this graph fall into 5 categories:\n\n1. Consist of\n2. Collected from\n3. Located in\n4. Assigned to\n5. Interpreted as\n\nEvery triplet in this knowledge graph dataset follows the structure of\n\n``<[Source Node], [Relation], [Target Node]>``\n\nFor example, the triplet\n\n``<47880_taxon, collected_from, Wisconsin_state>``\n\nsignifies a directed edge of type ``collected_from`` pointing from the taxon (i.e. biological group) node with an ID number of 47880 to the state node ``Wisconsin_state``. With our dataset comes a taxon ID lookup table, from which we can see that the ID of 47880 represents Mammuthus, or mammoths. 
Semantically, this triplet means that mammoth fossils have been collected from Wisconsin.\n\nThe goal of generating embeddings for the edges in this graph will be to use them to predict gaps in our knowledge base. After training our embeddings, we can fix a source node and relation type and predict potential target nodes. For example, we can fix our taxon node and relation type and make a query for potential target nodes, i.e. possible locations mammoths could be collected from which were not present in our knowledge base.\n\n``<47880_taxon, collected_from, ?>``\n\nUsing our embeddings, we can find the target node(s) with the highest probability of existing. If the probability is higher than some threshold, we can say that these predicted links should be considered true.\n\n1. Download the Dataset\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nTo download the dataset and helper files, clone the following repository into the directory of your choosing::\n\n    git clone https://github.com/marius-team/marius-examples.git\n\nEnter the link prediction example directory::\n\n    cd marius-examples/link-predict-example\n\nThis contains our paleobiology dataset, located in ``dataset/``, along with a sample configuration file ``paleo_config.yaml`` and link prediction Python script ``predict.py``.\n\n2. Train Model with Marius\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nWe assume Marius has been downloaded and installed with pip::\n\n    git clone https://github.com/marius-team/marius.git\n    cd marius; python3 -m pip install .\n\n**Step 1: Define a Configuration File**\n\nBefore we can run Marius we need to specify our model hyperparameters and path to our preprocessed graph dataset. The following configuration file, ``paleo_config.yaml``, is included in ``marius-examples/link-predict-example``. It contains standard options for link prediction. Note a few important parameters that might need to be changed based on your system:\n\n``storage.device_type`` : We assume training on GPU. 
If using CPU only, switch from ``cuda`` to ``cpu``.\n\n``training.num_epochs``: The number of epochs to train for. Adjust higher or lower based on the desired accuracy of the embeddings. Our default is ``100`` epochs.\n\n``storage.dataset.dataset_dir``: We assume we are in the link prediction example directory ``~/marius-examples/link-predict-example/`` when running the training process, so our default relative path is ``dataset/``. Change if running from another directory.\n\n``paleo_config.yaml``\n\n.. code-block:: yaml\n\n        model:\n            learning_task: LINK_PREDICTION\n            encoder:\n                layers:\n                - - type: EMBEDDING\n                    output_dim: 50\n            decoder:\n                type: COMPLEX\n            loss:\n                type: SOFTMAX_CE\n            sparse_optimizer:\n                type: ADAGRAD\n                options:\n                    learning_rate: 0.1\n        storage:\n            device_type: cuda\n            dataset:\n                dataset_dir: dataset/\n            edges:\n                type: DEVICE_MEMORY\n            embeddings:\n                type: DEVICE_MEMORY\n            save_model: true\n        training:\n            batch_size: 1000\n            negative_sampling:\n                num_chunks: 100\n                negatives_per_positive: 512\n                degree_fraction: 0.0\n                filtered: false\n            num_epochs: 100\n            pipeline:\n                sync: true\n            epochs_per_shuffle: 1\n        evaluation:\n            batch_size: 1000\n            negative_sampling:\n                filtered: true\n            pipeline:\n                sync: true\n\n**Step 2: Run Marius**\n\nNow that we have a configuration file and dataset ready, we simply need to run the training executable with our config file as the argument::\n\n    marius_train paleo_config.yaml\n\nThe output should appear similar to::\n\n    [info] [marius.cpp:45] Start 
initialization\n    Initialization Complete: 4.424s\n    ################ Starting training epoch 1 ################\n    Edges processed: [10000/96522], 10.36%\n    Edges processed: [20000/96522], 20.72%\n    Edges processed: [30000/96522], 31.08%\n    Edges processed: [40000/96522], 41.44%\n    Edges processed: [50000/96522], 51.80%\n    Edges processed: [60000/96522], 62.16%\n    Edges processed: [70000/96522], 72.52%\n    Edges processed: [80000/96522], 82.88%\n    Edges processed: [90000/96522], 93.24%\n    Edges processed: [96522/96522], 100.00%\n    ################ Finished training epoch 1 ################\n    Epoch Runtime: 527ms\n    Edges per Second: 183153.7\n    Evaluating validation set\n    =================================\n    Link Prediction: 10724 edges evaluated\n    Mean Rank: 1426.696568\n    MRR: 0.115575\n    Hits@1: 0.058653\n    Hits@3: 0.128683\n    Hits@5: 0.169153\n    Hits@10: 0.229952\n    Hits@50: 0.392111\n    Hits@100: 0.459437\n    =================================\n\nAfter this has finished, our output will be in our ``[dataset_dir]`` (using the provided config, this will be ``dataset/``).\n\nLet's check what was added in the ``dataset/`` directory. For clarity, we only list the files that were created in training. Notice that several files have been created, including the trained model, the embedding table, a full configuration file, and output logs:\n\n.. 
code-block:: bash\n\n   $ ls dataset/ \n   model.pt                           # contains the dense model parameters, embeddings of the edge-types\n   model_state.pt                     # optimizer state of the trained model parameters\n   full_config.yaml                   # detailed config generated based on user-defined config\n   metadata.csv                       # information about metadata\n   logs/                              # logs containing output, error, debug information, etc.\n   nodes/  \n     embeddings.bin                   # trained node embeddings of the graph\n     embeddings_state.bin             # node embedding optimizer state\n     ...\n   edges/   \n     ...\n   ...\n\n3. Inference with Python: Using Embeddings for Link Prediction\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nWe will use the Marius inference tool ``marius_predict`` to perform link prediction with our trained model. For more information about ``marius_predict``, see :ref:`marius_predict`.\n\nIn ``~/marius-examples/link-predict-example/``, run::\n\n    marius_predict --config paleo_config.yaml --output_dir results/ --metrics mrr mean_rank hits1 hits10 hits50 --save_scores --save_ranks\n\nThis tool takes our config, an output directory, and our desired metrics as input, and performs link prediction evaluation over the test set of edges provided in the config file. Metrics are saved to ``results/metrics.txt`` and scores and ranks for each test edge are saved to ``results/scores.csv``. \n\n"
  },
  {
    "path": "docs/examples/config/nc_custom.rst",
    "content": "Custom Dataset Node Classification\n---------------------------------------------\nIn this tutorial, we use the **Cora dataset** as an example to demonstrate a step-by-step walkthrough from preprocessing the dataset to defining the configuration file and to training **a node classification with 3-layer GraphSage model**.\n\n1. Preprocess Dataset\n^^^^^^^^^^^^^^^^^^^^^\n\nPreprocessing a custom dataset is straightforward with the help of Marius python API. Preprocessing using the Marius Python API requires creating a custom Dataset class of type ``NodeClassificationDataset`` or ``LinkPredictionDataset``. An example python script which preprocesses, trains, and evaluates the Cora dataset is provided in ``examples/python/custom_nc_graphsage.py``. For detailed steps, please refer to (link).\n\nLet's borrow the provided ``examples/python/custom_nc_graphsage.py`` and modify it to suit our purpose. We first ``download()`` the dataset to ``datasets/custom_nc_example/cora/``, then ``preprocess()``,. Note that the ``MYDATASET`` class is a child class of ``NodeClassificationDataset``: \n\n.. 
code-block:: python\n\n    import marius as m\n    import torch\n    from omegaconf import OmegaConf\n\n    import numpy as np\n    import pandas as pd\n\n    from pathlib import Path\n\n    from marius.tools.preprocess.dataset import NodeClassificationDataset\n    from marius.tools.preprocess.utils import download_url, extract_file\n    from marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\n    from marius.tools.configuration.constants import PathConstants\n    from marius.tools.preprocess.datasets.dataset_helpers import remap_nodes\n\n    def switch_to_num(row):\n        names = ['Neural_Networks', 'Rule_Learning', 'Reinforcement_Learning', 'Probabilistic_Methods',\\\n                'Theory', 'Genetic_Algorithms', 'Case_Based']\n        idx = 0\n        for i in range(len(names)):\n            if (row == names[i]):\n                idx = i\n                break\n        \n        return idx\n\n    class MYDATASET(NodeClassificationDataset):\n        \n        def __init__(self, output_directory: Path, spark=False):\n\n            super().__init__(output_directory, spark)\n\n            self.dataset_name = \"cora\"\n            self.dataset_url = \"http://www.cs.umd.edu/~sen/lbc-proj/data/cora.tgz\"\n        \n        def download(self, overwrite=False):\n\n            # These are the files that I want to make my the end of the the download\n            self.input_edge_list_file = self.output_directory / Path(\"edge.csv\")\n            self.input_node_feature_file = self.output_directory / Path(\"node-feat.csv\")\n            self.input_node_label_file = self.output_directory / Path(\"node-label.csv\")\n            self.input_train_nodes_file = self.output_directory / Path(\"train.csv\")\n            self.input_valid_nodes_file = self.output_directory / Path(\"valid.csv\")\n            self.input_test_nodes_file = self.output_directory / Path(\"test.csv\")\n\n            download = False\n            if not 
self.input_edge_list_file.exists():\n                download = True\n            if not self.input_node_feature_file.exists():\n                download = True\n            if not self.input_node_label_file.exists():\n                download = True\n            if not self.input_train_nodes_file.exists():\n                download = True\n            if not self.input_valid_nodes_file.exists():\n                download = True\n            if not self.input_test_nodes_file.exists():\n                download = True\n            \n            if download:\n                archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n                extract_file(archive_path, remove_input=False)\n\n                # Reading and processing the csv\n                df = pd.read_csv(dataset_dir / Path(\"cora/cora.content\"), sep=\"\\t\", header=None)\n                cols = df.columns[1:len(df.columns)-1]\n\n                # Getting all the indices\n                indices = np.array(range(len(df)))\n                np.random.shuffle(indices)\n                train_indices = indices[0:int(0.8*len(df))]\n                valid_indices = indices[int(0.8*len(df)):int(0.8*len(df))+int(0.1*len(df))]\n                test_indices = indices[int(0.8*len(df))+int(0.1*len(df)):]\n\n                np.savetxt(dataset_dir / Path(\"train.csv\"), train_indices, delimiter=\",\", fmt=\"%d\")\n                np.savetxt(dataset_dir / Path(\"valid.csv\"), valid_indices, delimiter=\",\", fmt=\"%d\")\n                np.savetxt(dataset_dir / Path(\"test.csv\"), test_indices, delimiter=\",\", fmt=\"%d\")\n\n\n                # Features\n                features = df[cols]\n                features.to_csv(index=False, sep=\",\", path_or_buf = dataset_dir / Path(\"node-feat.csv\"), header=False)\n\n                # Labels\n                labels = df[df.columns[len(df.columns)-1]]\n                labels = labels.apply(switch_to_num)\n                
labels.to_csv(index=False, sep=\",\", path_or_buf = dataset_dir / Path(\"node-label.csv\"), header=False)\n\n                # Edges\n                node_ids = df[df.columns[0]]\n                dict_reverse = node_ids.to_dict()\n                nodes_dict = {v: k for k, v in dict_reverse.items()}\n                df_edges = pd.read_csv(dataset_dir / Path(\"cora/cora.cites\"), sep=\"\\t\", header=None)\n                df_edges.replace({0: nodes_dict, 1: nodes_dict},inplace=True)\n                df_edges.to_csv(index=False, sep=\",\", path_or_buf = dataset_dir / Path(\"edge.csv\"), header=False)\n\n            \n        def preprocess(self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False):\n            train_nodes = np.genfromtxt(self.input_train_nodes_file, delimiter=\",\").astype(np.int32)\n            valid_nodes = np.genfromtxt(self.input_valid_nodes_file, delimiter=\",\").astype(np.int32)\n            test_nodes = np.genfromtxt(self.input_test_nodes_file, delimiter=\",\").astype(np.int32)\n\n            converter = TorchEdgeListConverter(\n                output_dir=self.output_directory,\n                train_edges=self.input_edge_list_file,\n                num_partitions=num_partitions,\n                src_column = 0,\n                dst_column = 1,\n                remap_ids=remap_ids,\n                sequential_train_nodes=sequential_train_nodes,\n                delim=\",\",\n                known_node_ids=[train_nodes, valid_nodes, test_nodes],\n                partitioned_evaluation=partitioned_eval\n            )\n            dataset_stats = converter.convert()\n\n            features = np.genfromtxt(self.input_node_feature_file, delimiter=\",\").astype(np.float32)\n            labels = np.genfromtxt(self.input_node_label_file, delimiter=\",\").astype(np.int32)\n\n            if remap_ids:\n                node_mapping = np.genfromtxt(self.output_directory / 
Path(PathConstants.node_mapping_path), delimiter=\",\")\n                train_nodes, valid_nodes, test_nodes, features, labels = remap_nodes(node_mapping, train_nodes, valid_nodes, test_nodes, features, labels)\n\n            with open(self.train_nodes_file, \"wb\") as f:\n                f.write(bytes(train_nodes))\n            with open(self.valid_nodes_file, \"wb\") as f:\n                f.write(bytes(valid_nodes))\n            with open(self.test_nodes_file, \"wb\") as f:\n                f.write(bytes(test_nodes))\n            with open(self.node_features_file, \"wb\") as f:\n                f.write(bytes(features))\n            with open(self.node_labels_file, \"wb\") as f:\n                f.write(bytes(labels))\n\n            # update dataset yaml\n            dataset_stats.num_train = train_nodes.shape[0]\n            dataset_stats.num_valid = valid_nodes.shape[0]\n            dataset_stats.num_test = test_nodes.shape[0]\n            dataset_stats.node_feature_dim = features.shape[1]\n            dataset_stats.num_classes = 40\n\n            dataset_stats.num_nodes = dataset_stats.num_train + dataset_stats.num_valid + dataset_stats.num_test\n\n            with open(self.output_directory / Path(\"dataset.yaml\"), \"w\") as f:\n                yaml_file = OmegaConf.to_yaml(dataset_stats)\n                f.writelines(yaml_file)\n\n            return\n\n    if __name__ == '__main__':\n        # initialize and preprocess dataset\n        dataset_dir = Path(\"datasets/custom_nc_example/cora/\") # note that we write to this directory\n        dataset = MYDATASET(dataset_dir)\n        if not (dataset_dir / Path(\"edges/train_edges.bin\")).exists():\n            dataset.download()\n            dataset.preprocess()\n\nWe preprocess the Cora dataset by running the following command (assuming we are in the ``marius`` root directory):\n\n.. 
code-block:: bash\n\n   $ python datasets/custom_nc_example/custom_nc_graphsage.py \n    Downloading cora.tgz to cora/cora.tgz\n    Reading edges\n    Remapping Edges\n    Node mapping written to: cora/nodes/node_mapping.txt\n    Dataset statistics written to: cora/dataset.yaml\n\nIn this example, if the ``datasets/custom_nc_example/cora/`` directory does not yet exist, ``custom_nc_graphsage.py`` will create it for us. \n\nFor detailed usages of the Marius Python API, please refer to (link).\n\nLet's check what is inside the created directory:\n\n.. code-block:: bash\n\n   $ ls -1 datasets/custom_nc_example/cora/\n   dataset.yaml                       # input dataset statistics                                \n   nodes/  \n     node_mapping.txt                 # mapping of raw node ids to integer uuids\n     features.bin                     # preprocessed features list\n     labels.bin                       # preprocessed labels list\n     test_nodes.bin                   # preprocessed testing nodes list\n     train_nodes.bin                  # preprocessed training nodes list\n     validation_nodes.bin             # preprocessed validation nodes list\n   edges/   \n     train_edges.bin                  # preprocessed training edge list\n   cora/                              # downloaded source files\n     ...\n   edge.csv                           # raw edge list\n   train.csv                          # raw training node list                                              \n   test.csv                           # raw testing node list    \n   valid.csv                          # raw validation node list    \n   node-feat.csv                      # node features\n   node-label.csv                     # node labels\n   cora.tgz                           # downloaded Cora dataset\n\n\nLet's check what is inside the generated ``dataset.yaml`` file:\n\n.. 
code-block:: bash\n\n   $ cat datasets/custom_nc_example/cora/dataset.yaml\n    dataset_dir: /marius-internal/datasets/custom_nc_example/cora/\n    num_edges: 5429\n    num_nodes: 2708\n    num_relations: 1\n    num_train: 2166\n    num_valid: 270\n    num_test: 272\n    node_feature_dim: 1433\n    rel_feature_dim: -1\n    num_classes: 40\n    initialized: false\n\n\n2. Define Configuration File\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nTo train a model, we need to define a YAML configuration file based on information created from the preprocessing Python script. \n\nThe configuration file contains information including but not limited to the inputs to the model, training procedure, and hyperparameters to optimize. Given a configuration file, Marius assembles a model depending on the given parameters. The configuration file is grouped up into four sections:\n\n* Model: Defines the architecture of the model, neighbor sampling configuration, loss, and optimizer(s)\n* Storage: Specifies the input dataset and how to store the graph, features, and embeddings.\n* Training: Sets options for the training procedure and hyperparameters. E.g. batch size, negative sampling.\n* Evaluation: Sets options for the evaluation procedure (if any). The options here are similar to those in the training section.\n\nFor the full configuration schema, please refer to ``docs/config_interface``.\n\nAn example YAML configuration file for the Cora dataset is given in ``examples/configuration/custom_nc.yaml``. Note that the ``dataset_dir`` is set to the preprocessing output directory, in our example, ``datasets/custom_nc_example/cora/``.\n\nLet's create the same YAML configuration file for the Cora dataset from scratch. We follow the structure of the configuration file and create each of the four sections one by one. In a YAML file, indentation is used to denote nesting and all parameters are in the format of key-value pairs. \n\n#. | First, we define the **model**. 
We begin by setting all required parameters. This includes ``learning_task``, ``encoder``, ``decoder``, and ``loss``.\n   | Note that the output of the encoder is the output label vector for a given node. (E.g. For node classification with 5 classes, the output label vector from the encoder might look like this: [.05, .2, .8, .01, .03]. In this case, an argmax will return a class label of 2 for the node.) The rest of the configurations can be fine-tuned by the user.\n\n    .. code-block:: yaml\n    \n        model:\n          learning_task: NODE_CLASSIFICATION # set the learning task to node classification\n          encoder:\n            train_neighbor_sampling:\n              - type: ALL\n              - type: ALL\n              - type: ALL\n            layers: # define three layers of GNN of type GRAPH_SAGE\n              - - type: FEATURE\n                  output_dim: 1433 # set to 1433 (to match \"node_feature_dim=1433\" in \"dataset.yaml\") for each layer except for the last\n                  bias: true\n              - - type: GNN\n                  options:\n                    type: GRAPH_SAGE\n                    aggregator: MEAN\n                  input_dim: 1433 # set to 1433 (to match \"node_feature_dim=1433\" in \"dataset.yaml\") for each layer except for the last\n                  output_dim: 1433\n                  bias: true\n              - - type: GNN\n                  options:\n                    type: GRAPH_SAGE\n                    aggregator: MEAN\n                  input_dim: 1433\n                  output_dim: 1433\n                  bias: true\n              - - type: GNN\n                  options:\n                    type: GRAPH_SAGE\n                    aggregator: MEAN\n                  input_dim: 1433\n                  output_dim: 40 # set \"output_dim\" to 40 (to match \"num_classes=40\") in \"dataset.yaml\" for the last layer\n                  bias: true\n          decoder:\n            type: NODE\n          loss:\n         
   type: CROSS_ENTROPY\n            options:\n              reduction: SUM\n          dense_optimizer:\n            type: ADAM\n            options:\n              learning_rate: 0.01\n        storage:\n          # omit\n        training:\n          # omit\n        evaluation:\n          # omit\n      \n#. | Next, we set the **storage** and **dataset**. We begin by setting all required parameters. This includes ``dataset``. Here, the ``dataset_dir`` is set to ``datasets/custom_nc_example/cora/``, which is the preprocessing output directory.\n\n    .. code-block:: yaml\n    \n        model:\n          # omit\n        storage:\n          device_type: cuda\n          dataset:\n            dataset_dir: datasets/custom_nc_example/cora/\n          edges:\n            type: DEVICE_MEMORY\n            options:\n              dtype: int\n          features:\n            type: DEVICE_MEMORY\n            options:\n              dtype: float\n        training:\n          # omit\n        evaluation:\n          # omit\n\n#. Lastly, we configure **training** and **evaluation**. We begin by setting all required parameters. This includes ``num_epochs``. We set ``num_epochs=10`` (10 epochs to train) to demonstrate this example. \n\n    .. code-block:: yaml\n    \n        model:\n          # omit\n        storage:\n          # omit\n        training:\n          batch_size: 1000\n          num_epochs: 10\n          pipeline:\n            sync: true\n        evaluation:\n          batch_size: 1000\n          pipeline:\n            sync: true\n     \n3. Train Model\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nAfter defining our configuration file, training is run with ``marius_train <your_config.yaml>``.\n\nWe can now train our example using the configuration file we just created by running the following command (assuming we are in the ``marius`` root directory):\n\n.. 
code-block:: bash\n\n   $ marius_train datasets/custom_nc_example/cora/custom_nc.yaml\n    [2022-04-05 18:41:44.987] [info] [marius.cpp:45] Start initialization\n    [04/05/22 18:41:49.122] Initialization Complete: 4.134s\n    [04/05/22 18:41:49.135] ################ Starting training epoch 1 ################\n    [04/05/22 18:41:49.161] Nodes processed: [1000/2166], 46.17%\n    [04/05/22 18:41:49.180] Nodes processed: [2000/2166], 92.34%\n    [04/05/22 18:41:49.199] Nodes processed: [2166/2166], 100.00%\n    [04/05/22 18:41:49.199] ################ Finished training epoch 1 ################\n    [04/05/22 18:41:49.199] Epoch Runtime: 63ms\n    [04/05/22 18:41:49.199] Nodes per Second: 34380.953\n    [04/05/22 18:41:49.199] Evaluating validation set\n    [04/05/22 18:41:49.213]\n    =================================\n    Node Classification: 270 nodes evaluated\n    Accuracy: 12.962963%\n    =================================\n    [04/05/22 18:41:49.213] Evaluating test set\n    [04/05/22 18:41:49.221]\n    =================================\n    Node Classification: 272 nodes evaluated\n    Accuracy: 16.176471%\n    =================================\n\nAfter running this configuration for 10 epochs, we should see a result similar to the one below, with accuracy roughly equal to 86%:\n\n.. 
code-block:: bash\n\n    =================================\n    [04/05/22 18:41:49.820] ################ Starting training epoch 10 ################\n    [04/05/22 18:41:49.833] Nodes processed: [1000/2166], 46.17%\n    [04/05/22 18:41:49.854] Nodes processed: [2000/2166], 92.34%\n    [04/05/22 18:41:49.872] Nodes processed: [2166/2166], 100.00%\n    [04/05/22 18:41:49.872] ################ Finished training epoch 10 ################\n    [04/05/22 18:41:49.872] Epoch Runtime: 51ms\n    [04/05/22 18:41:49.872] Nodes per Second: 42470.59\n    [04/05/22 18:41:49.872] Evaluating validation set\n    [04/05/22 18:41:49.883]\n    =================================\n    Node Classification: 270 nodes evaluated\n    Accuracy: 84.814815%\n    =================================\n    [04/05/22 18:41:49.883] Evaluating test set\n    [04/05/22 18:41:49.891]\n    =================================\n    Node Classification: 272 nodes evaluated\n    Accuracy: 88.970588%\n    =================================\n\nLet's check again what was added in the ``datasets/custom_nc_example/cora/`` directory. For clarity, we only list the files that were created in training. Notice that several files have been created, including the trained model, the embedding table, a full configuration file, and output logs:\n\n.. code-block:: bash\n\n   $ ls -1 datasets/custom_nc_example/cora/ \n   model.pt                           # contains the dense model parameters, including the GNN parameters\n   model_state.pt                     # optimizer state of the trained model parameters\n   full_config.yaml                   # detailed config generated based on user-defined config\n   metadata.csv                       # information about metadata\n   logs/                              # logs containing output, error, debug information, etc.\n   nodes/  \n     ...\n   edges/   \n     ...\n   ...\n\n.. note::\n  ``model.pt`` contains the dense model parameters. 
For GNN encoders, this file will include the GNN parameters.\n\n4. Inference\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n4.1 Command Line\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n4.2 Load Into Python\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n"
  },
  {
    "path": "docs/examples/config/nc_ogbn_arxiv.rst",
    "content": "Small Scale Node Classification (OGBN-Arxiv)\n---------------------------------------------\n\nIn this tutorial, we use the **OGBN-Arxiv dataset** as an example to demonstrate a step-by-step walkthrough from preprocessing the dataset to defining the configuration file and to training **a node classification with 3-layer GraphSage model**.\n\n1. Preprocess Dataset\n^^^^^^^^^^^^^^^^^^^^^\n\nPreprocessing a dataset is straightforward with the ``marius_preprocess`` command. This command comes with ``marius`` when ``marius`` is installed. See (TODO link) for installation information.\n\nAssuming ``marius_preprocess`` has been built, we preprocess the OGBN-Arxiv dataset by running the following command (assuming we are in the ``marius`` root directory):\n\n.. code-block:: bash\n\n   $ marius_preprocess --dataset ogbn_arxiv --output_directory datasets/ogbn_arxiv_example/\n   Downloading arxiv.zip to datasets/ogbn_arxiv_example/arxiv.zip\n   Reading edges\n   Remapping Edges\n   Node mapping written to: datasets/ogbn_arxiv_example/nodes/node_mapping.txt\n   Dataset statistics written to: datasets/ogbn_arxiv_example/dataset.yaml\n\nThe  ``--dataset`` flag specifies which of the pre-set datasets ``marius_preprocess`` will preprocess and download.\n\nThe  ``--output_directory`` flag specifies where the preprocessed graph will be output and is set by the user. In this example, assume we have not created the ``datasets/ogbn_arxiv_example/`` repository, ``marius_preprocess`` will create it for us. \n\nFor detailed usages of  ``marius_preprocess``, please execute the following command:\n\n.. code-block:: bash\n\n   $ marius_preprocess -h\n\nLet's check what is inside the created directory:\n\n.. 
code-block:: bash\n\n   $ ls -1 datasets/ogbn_arxiv_example/ \n   dataset.yaml                       # input dataset statistics                                \n   nodes/  \n     node_mapping.txt                 # mapping of raw node ids to integer uuids\n     features.bin                     # preprocessed features list\n     labels.bin                       # preprocessed labels list\n     test_nodes.bin                   # preprocessed testing nodes list\n     train_nodes.bin                  # preprocessed training nodes list\n     validation_nodes.bin             # preprocessed validation nodes list\n   edges/   \n     train_edges.bin                  # preprocessed training edge list\n   arxiv/                             # dir with provided source files of the downloaded OGBN-Arxiv dataset\n     RELEASE_v1.txt  \n     mapping/  \n     processed/  \n     raw/  \n     split/\n   edge.csv                           # raw edge list\n   train.csv                          # raw training edge list                                              \n   test.csv                           # raw testing edge list    \n   valid.csv                          # raw validation edge list    \n   node-feat.csv                      # node features\n   node-label.csv                     # node labels\n   README.txt                         # README of the downloaded OGBN-Arxiv dataset\n   arxiv.zip                          # downloaded OGBN-Arxiv dataset\n\n\nLet's check what is inside the generated ``dataset.yaml`` file:\n\n.. code-block:: bash\n\n   $ cat datasets/ogbn_arxiv_example/dataset.yaml\n   dataset_dir: /marius-internal/datasets/ogbn_arxiv_example/\n   num_edges: 1166243\n   num_nodes: 169343\n   num_relations: 1\n   num_train: 90941\n   num_valid: 29799\n   num_test: 48603\n   node_feature_dim: 128\n   rel_feature_dim: -1\n   num_classes: 40\n   initialized: false\n\n\n.. 
note:: \n   If the above ``marius_preprocess`` command fails due to any missing directory errors, please create the ``<output_directory>/edges`` and ``<output_directory>/nodes`` directories as a workaround.\n\n2. Define Configuration File\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nTo train a model, we need to define a YAML configuration file based on information created from marius_preprocess. \n\nThe configuration file contains information including but not limited to the inputs to the model, training procedure, and hyperparameters to optimize. Given a configuration file, marius assembles a model depending on the given parameters. The configuration file is grouped up into four sections:\n\n* Model: Defines the architecture of the model, neighbor sampling configuration, loss, and optimizer(s)\n* Storage: Specifies the input dataset and how to store the graph, features, and embeddings.\n* Training: Sets options for the training procedure and hyperparameters. E.g. batch size, negative sampling.\n* Evaluation: Sets options for the evaluation procedure (if any). The options here are similar to those in the training section.\n\nFor the full configuration schema, please refer to ``docs/config_interface``.\n\nAn example YAML configuration file for the OGBN_Arxiv dataset is given in ``examples/configuration/ogbn_arxiv.yaml``. Note that the ``dataset_dir`` is set to the preprocessing output directory, in our example, ``datasets/ogbn_arxiv_example/``.\n\nLet's create the same YAML configuration file for the OGBN_Arxiv dataset from scratch. We follow the structure of the configuration file and create each of the four sections one by one. In a YAML file, indentation is used to denote nesting and all parameters are in the format of key-value pairs. \n\n#. | First, we define the **model**. We begin by setting all required parameters. 
This includes ``learning_task``, ``encoder``, ``decoder``, and ``loss``.\n   | Note that the output of the encoder is the output label vector for a given node. (E.g. For node classification with 5 classes, the output label vector from the encoder might look like this: [.05, .2, .8, .01, .03]. In this case, an argmax will return a class label of 2 for the node.) The rest of the configurations can be fine-tuned by the user.\n\n    .. code-block:: yaml\n    \n        model:\n          learning_task: NODE_CLASSIFICATION # set the learning task to node classification\n          encoder:\n            train_neighbor_sampling:\n              - type: ALL\n              - type: ALL\n              - type: ALL\n            layers: # define three layers of GNN of type GRAPH_SAGE\n              - - type: FEATURE\n                  output_dim: 128 # set to 128 (to match \"node_feature_dim=128\" in \"dataset.yaml\") for each layer except for the last\n                  bias: true\n              - - type: GNN\n                  options:\n                    type: GRAPH_SAGE\n                    aggregator: MEAN\n                  input_dim: 128 # set to 128 (to match \"node_feature_dim=128\" in \"dataset.yaml\") for each layer except for the last\n                  output_dim: 128\n                  bias: true\n              - - type: GNN\n                  options:\n                    type: GRAPH_SAGE\n                    aggregator: MEAN\n                  input_dim: 128\n                  output_dim: 128\n                  bias: true\n              - - type: GNN\n                  options:\n                    type: GRAPH_SAGE\n                    aggregator: MEAN\n                  input_dim: 128\n                  output_dim: 40 # set \"output_dim\" to 40 (to match \"num_classes=40\") in \"dataset.yaml\" for the last layer\n                  bias: true\n          decoder:\n            type: NODE\n          loss:\n            type: CROSS_ENTROPY\n            options:\n         
     reduction: SUM\n          dense_optimizer:\n            type: ADAM\n            options:\n              learning_rate: 0.01\n        storage:\n          # omit\n        training:\n          # omit\n        evaluation:\n          # omit\n      \n#. | Next, we set the **storage** and **dataset**. We begin by setting all required parameters. This includes ``dataset``. Here, the ``dataset_dir`` is set to ``datasets/ogbn_arxiv_example/``, which is the preprocessing output directory.\n\n    .. code-block:: yaml\n    \n        model:\n          # omit\n        storage:\n          device_type: cuda\n          dataset:\n            dataset_dir: datasets/ogbn_arxiv_example/\n          edges:\n            type: DEVICE_MEMORY\n            options:\n              dtype: int\n          features:\n            type: DEVICE_MEMORY\n            options:\n              dtype: float\n        training:\n          # omit\n        evaluation:\n          # omit\n\n#. Lastly, we configure **training** and **evaluation**. We begin by setting all required parameters. This includes ``num_epochs``. We set ``num_epochs=10`` (10 epochs to train) to demonstrate this example. \n\n    .. code-block:: yaml\n    \n        model:\n          # omit\n        storage:\n          # omit\n        training:\n          batch_size: 1000\n          num_epochs: 10\n          pipeline:\n            sync: true\n        evaluation:\n          batch_size: 1000\n          pipeline:\n            sync: true\n     \n3. Train Model\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nAfter defining our configuration file, training is run with ``marius_train <your_config.yaml>``.\n\nWe can now train our example using the configuration file we just created by running the following command (assuming we are in the ``marius`` root directory):\n\n.. 
code-block:: bash\n\n   $ marius_train datasets/ogbn_arxiv_example/ogbn_arxiv.yaml\n    [2022-04-05 18:50:11.677] [info] [marius.cpp:45] Start initialization\n    [04/05/22 18:50:15.807] Initialization Complete: 4.13s\n    [04/05/22 18:50:15.877] ################ Starting training epoch 1 ################\n    [04/05/22 18:50:16.310] Nodes processed: [10000/90941], 11.00%\n    [04/05/22 18:50:16.753] Nodes processed: [20000/90941], 21.99%\n    [04/05/22 18:50:17.192] Nodes processed: [30000/90941], 32.99%\n    [04/05/22 18:50:17.641] Nodes processed: [40000/90941], 43.98%\n    [04/05/22 18:50:18.089] Nodes processed: [50000/90941], 54.98%\n    [04/05/22 18:50:18.538] Nodes processed: [60000/90941], 65.98%\n    [04/05/22 18:50:18.983] Nodes processed: [70000/90941], 76.97%\n    [04/05/22 18:50:19.424] Nodes processed: [80000/90941], 87.97%\n    [04/05/22 18:50:19.861] Nodes processed: [90000/90941], 98.97%\n    [04/05/22 18:50:19.904] Nodes processed: [90941/90941], 100.00%\n    [04/05/22 18:50:19.904] ################ Finished training epoch 1 ################\n    [04/05/22 18:50:19.904] Epoch Runtime: 4027ms\n    [04/05/22 18:50:19.904] Nodes per Second: 22582.816\n    [04/05/22 18:50:19.904] Evaluating validation set\n    [04/05/22 18:50:20.795]\n    =================================\n    Node Classification: 29799 nodes evaluated\n    Accuracy: 65.753884%\n    =================================\n    [04/05/22 18:50:20.795] Evaluating test set\n    [04/05/22 18:50:22.194]\n    =================================\n    Node Classification: 48603 nodes evaluated\n    Accuracy: 63.909635%\n    =================================\n\n\nAfter running this configuration for 10 epochs, we should see a result similar to below with accuracy roughly equal to 67%:\n\n.. 
code-block:: bash\n\n    =================================\n    [04/05/22 18:51:12.589] ################ Starting training epoch 10 ################\n    [04/05/22 18:51:13.024] Nodes processed: [10000/90941], 11.00%\n    [04/05/22 18:51:13.456] Nodes processed: [20000/90941], 21.99%\n    [04/05/22 18:51:13.889] Nodes processed: [30000/90941], 32.99%\n    [04/05/22 18:51:14.336] Nodes processed: [40000/90941], 43.98%\n    [04/05/22 18:51:14.789] Nodes processed: [50000/90941], 54.98%\n    [04/05/22 18:51:15.240] Nodes processed: [60000/90941], 65.98%\n    [04/05/22 18:51:15.678] Nodes processed: [70000/90941], 76.97%\n    [04/05/22 18:51:16.119] Nodes processed: [80000/90941], 87.97%\n    [04/05/22 18:51:16.556] Nodes processed: [90000/90941], 98.97%\n    [04/05/22 18:51:16.599] Nodes processed: [90941/90941], 100.00%\n    [04/05/22 18:51:16.599] ################ Finished training epoch 10 ################\n    [04/05/22 18:51:16.599] Epoch Runtime: 4010ms\n    [04/05/22 18:51:16.599] Nodes per Second: 22678.553\n    [04/05/22 18:51:16.599] Evaluating validation set\n    [04/05/22 18:51:17.485]\n    =================================\n    Node Classification: 29799 nodes evaluated\n    Accuracy: 69.445283%\n    =================================\n    [04/05/22 18:51:17.485] Evaluating test set\n    [04/05/22 18:51:18.882]\n    =================================\n    Node Classification: 48603 nodes evaluated\n    Accuracy: 68.078102%\n    =================================\n\n\nLet's check again what was added in the ``datasets/ogbn_arxiv_example/`` directory. For clarity, we only list the files that were created in training. Notice that several files have been created, including the trained model, the embedding table, a full configuration file, and output logs:\n\n.. 
code-block:: bash\n\n   $ ls -1 datasets/ogbn_arxiv_example/ \n   model.pt                           # contains the dense model parameters, including the GNN parameters\n   model_state.pt                     # optimizer state of the trained model parameters\n   full_config.yaml                   # detailed config generated based on user-defined config\n   metadata.csv                       # information about metadata\n   logs/                              # logs containing output, error, debug information, and etc.\n   nodes/  \n     ...\n   edges/   \n     ...\n   ...\n\n.. note::\n  ``model.pt`` contains the dense model parameters. For GNN encoders, this file will include the GNN parameters.\n\n4. Inference\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n4.1 Command Line\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n4.2 Load Into Python\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n"
  },
  {
    "path": "docs/examples/config/resume_training.rst",
    "content": "Resume Training (FB15K-237)\n---------------------------------------------\n\nIn this tutorial, we use the **FB15K_237 knowledge graph** as an example to demonstrate the resume training functionality available in Marius using the :doc:`fb15k_237 <../examples/config/lp_fb15k237>` example. \n\nUsing ``marius_preprocess``, we pre-process the data to make it available under path ``datasets/fb15k_237_rt``\n\n.. code-block:: bash\n\n   $ marius_preprocess --dataset fb15k_237 --output_directory datasets/fb15k_237_rt/\n   Downloading FB15K-237.2.zip to datasets/fb15k_237_rt/FB15K-237.2.zip\n   Reading edges\n   Remapping Edges\n   Node mapping written to: datasets/fb15k_237_rt/nodes/node_mapping.txt\n   Relation mapping written to: datasets/fb15k_237_rt/edges/relation_mapping.txt\n   Dataset statistics written to: datasets/fb15k_237_rt/dataset.yaml\n\nTrain the model at least once before trying to resume training.\n\n.. code-block:: bash\n\n   $ marius_train fb15k_237_config.yaml\n   [05/06/22 18:08:21.037] ################ Finished training epoch 10 ################\n   ...\n   $ ls datasets/fb15k_237_rt/\n   README.txt\n   dataset.yaml\n   edges\n   model_0\n   nodes\n\nThe current model parameters are present in ``datasets/fb15k_237_rt/model_0``\n\n\n1. Resume training and overwrite existing model\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssuming the model is trained at least once, ``training.resume_training`` can be set to ``true`` to train the previously saved model further for n epochs (default 10). \n\n.. code-block:: yaml\n\n   training:\n     batch_size: 1000\n     num_epochs: 10\n     resume_training: true\n\nRunning ``marius_train`` with the updated config will now overwrite the model parameters in ``datasets/fb15k_237_rt/model_0/``\n\n.. 
code-block:: bash\n\n   $ marius_train fb15k_237_config.yaml\n   [05/06/22 18:13:41.662] ################ Starting training epoch 11 ################\n   ...\n   [05/06/22 18:13:59.233] ################ Finished training epoch 20 ################\n   ...\n\n\n2. Resume training from given checkpoint\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n``training.resume_from_checkpoint`` can be set to preserve the existing checkpointed model and write the new model to a different directory. \nIf ``storage.model_dir`` is set, the new model will be written to the given directory, else a new directory of the pattern ``datasets/fb15k_237_rt/model_x``\nwill be created where `x` changes incrementally from 0-10 and will take the least non-existent value. \n\n.. code-block:: bash\n\n   $ ls datasets/fb15k_237_rt/\n   README.txt\n   dataset.yaml\n   edges\n   model_0\n   nodes\n\nResuming training from the above config with ``training.resume_from_checkpoint`` set will write the model to ``datasets/fb15k_237_rt/model_1`` if \n``storage.model_dir`` is not set. Since ``datasets/fb15k_237_rt/model_0`` now has a model trained for 20 epochs, the new model will further be \ntrained 10 epochs from there.\n\n.. code-block:: yaml\n\n   training:\n     batch_size: 1000\n     num_epochs: 10\n     resume_training: true\n     resume_from_checkpoint: datasets/fb15k_237_rt/model_0/\n\nRunning ``marius_train`` with the updated config will save the new model parameters to ``datasets/fb15k_237_rt/model_1/``\n\n.. code-block:: bash\n\n   $ marius_train fb15k_237_config.yaml\n   [05/06/22 18:13:41.662] ################ Starting training epoch 21 ################\n   ...\n   [05/06/22 18:13:59.233] ################ Finished training epoch 30 ################\n   ...\n   $ ls datasets/fb15k_237_rt/\n   README.txt\n   dataset.yaml\n   edges\n   model_0\n   model_1\n   nodes\n"
  },
  {
    "path": "docs/examples/index.rst",
    "content": "\nExamples\n**************************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    config/index\n    python/index\n\n\n"
  },
  {
    "path": "docs/examples/introduction.rst",
    "content": ".. _introduction:\n\nIntroduction\n********************"
  },
  {
    "path": "docs/examples/prediction/command_line.rst",
    "content": ""
  },
  {
    "path": "docs/examples/prediction/python.rst",
    "content": ""
  },
  {
    "path": "docs/examples/preprocessing/command_line.rst",
    "content": ""
  },
  {
    "path": "docs/examples/preprocessing/python.rst",
    "content": ""
  },
  {
    "path": "docs/examples/python/index.rst",
    "content": "\nPython Examples\n**************************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    lp_fb15k237\n    lp_custom\n    nc_ogbn_arxiv"
  },
  {
    "path": "docs/examples/python/lp_custom.rst",
    "content": "Custom Dataset Link Prediction\n---------------------------------------------\nThis example will demonstrate how to use Marius Python API to do a Link \nPrediction task on a small scale graph. In this example we will use ogbn-arxiv\ngraph. Ogbn-arxiv graph is a dataset that is not added in Marius by default. So \nwe have to write a custom dataset class before we can do the model training. In \nthis example we will explain both the process of defining a custom dataset class\nand how to make a model for link prediction using DistMult. Also it would be \nbeneficial to go through the lp_fb15k_237 example too because both custom and fb15k_237\nwill have similar model.\n\n*Example file location: examples/python/custom_lp.py*\n\nBy going through the example we aim for you to understand the following things:\n\n- How to make your own custom dataset class to preprocess data\n- How to define a model using the Python APIs and configure it as needed\n- How to add different reporting metrics for the accuracy\n- How to initialize dataloader objects for training and evaluation\n- And lastly how to do training and evaluation\n\nNote: This is a GPU example and we are setting the device to GPU at the start of the\nmain using the line::\n\n    device = torch.device(\"cuda\")\n\nIf you want to run CPU based training please change *cuda* to *cpu*.\n\n1. Create Dataset Class\n^^^^^^^^^^^^^^^^^^^^^^^\nThe dataset ogbn-arxiv is a custom dataset so for that we will need to make a new\ndataset class for preprocessing. This new dataset which in the example is called\n``MYDATASET`` is a child class of the parent class ``LinkPredictionDataset``.\nMaking a new dataset class requires writing two methods:\n\n- ``download()``: This method downloads the dataset from the source location and\n  extracts all the necessary files for preprocessing. In this example we are only\n  using the ``raw/edge.csv``. 
So in the download method we extract it properly.\n  We are doing it using the following method::\n\n        self.input_train_edges_file = self.output_directory / Path(\"edge.csv\")\n        download = False\n        if not self.input_train_edges_file.exists():\n            download = True\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=False)\n            extract_file(self.output_directory / Path(\"arxiv/raw/edge.csv.gz\"))\n            (self.output_directory / Path(\"arxiv/raw/edge.csv\")).rename(self.input_train_edges_file)\n\n  All that we are doing here is to download the file, extract the edge.csv file\n  and rename it to ensure that we can easily reference it in the preprocess function.\n  Note that marius has built in ``download_url`` and ``extract_file`` function if \n  you want to use it.\n\n- ``preprocess()``: The main job of this method is to call the ``convertor()`` function.\n  Marius supports two types of convertor. First is a torch based convertor and \n  the other is a spark based convertor. In this example we are only using \n  ``TorchEdgeListConverter``. 
For more details about both converters you can \n  find the class definition at location ``src/python/tools/preprocess/convertors``.\n  To use the converter class we need to define an object of the converter class and \n  after that we can call ``converter.convert()`` to generate the preprocessed files::\n\n        converter = TorchEdgeListConverter\n        splits = [0.8, 0.1, 0.1] # 80%-train, 10%-validation, 10%-test\n        converter = converter(\n            output_dir=self.output_directory,\n            train_edges=self.input_train_edges_file,\n            src_column = 0, # col 0 is src and col 1 dst node in input csv\n            dst_column = 1,\n            delim=\",\", # CSV delimitor is \",\"\n            splits = splits, # Splitting the data in train, valid and test\n            remap_ids=remap_ids # Remapping the raw entity ids into random integers\n        )\n        return converter.convert()\n\n  As shown above in the code, first we are defining a converter object. There are\n  many options in the converter object and you can find more details in the class \n  definition. In this example we are passing the following things:\n\n  - ``output_dir=self.output_directory``: For file output location\n  - ``train_edges=self.input_train_edges_file``: Input edges file to preprocess\n  - ``src_column = 0`` and ``dst_column = 1``: Specify which columns in edge.csv are source and destination\n  - ``delim=\",\"``: What delimiter is used in the csv\n  - ``splits = splits``: In this example we only have a single edge.csv file, so this gives the fractions used to split the data into train, valid and test\n  - ``remap_ids=remap_ids``: Remapping the raw entity ids to a random number\n\n  Lastly once the ``converter.convert()`` is called the input ``edge.csv`` is then \n  converted into ``edges.bin`` file. 
The file will be located at ``self.output_directory / Path(\"edge.csv\")``.\n  And Marius uses this file as input.\n\nOnce you have defined the class all you need to do is instansiate the base directory\nwhere you will store all the dataset and preprocessed files. And call the download\nand preprocess on the objects. As shown in the code.::\n\n    dataset_dir = Path(\"ogbn_arxiv_dataset/\")\n    dataset = MYDATASET(dataset_dir)\n    if not (dataset_dir / Path(\"edges/train_edges.bin\")).exists():\n        dataset.download()\n        dataset.preprocess()\n\nLastly, note that dataset preprocessing will return a ``dataset.yaml`` file which\nis needed for further tasks, so in the example we are reading it after ``preprocess()``.\n\nOnce you are done with preprocessing the dataset rest of the steps will be similar\nto the lp_fb15k_237 example.\n\n2. Create Model\n^^^^^^^^^^^^^^^\nNext step is to define a model for the task. In this example we are going to make\na model with *DistMult*. The model is defined in the function ``init_model``. \nThere are three steps to defining a model:\n\n1. Defining an encoder: In this example we are defining a single layer encoder.\nThe layer is an embedding layer::\n\n   embedding_layer = m.nn.layers.EmbeddingLayer(dimension=embedding_dim, \n                                                device=device)\n \nTo define a model all you need to do is call the ``GeneralEncoder(..)`` method with all\nthe layers as shown below::\n\n    encoder = m.encoders.GeneralEncoder(layers=[[embedding_layer]])\n\nIn this example we are only having a single layer in the encoder but you can have\nmore than one layer also. (See the node classification example for refer on how to\npass more than one layer to ``GeneralEncoder(..)`` method)\n\n2. 
Defining a decoder: In this example we are using *DistMult* as our decoder so\nwe are calling the following method::\n\n    decoder = m.nn.decoders.edge.DistMult(num_relations=num_relations,\n                                          embedding_dim=embedding_dim,\n                                          use_inverse_relations=True,\n                                          device=device,\n                                          dtype=dtype,\n                                          mode=\"train\")\n\n\n3. Defining a loss function: We are using *SoftmaxCrossEntropy* in this example. And defining\nit is just doing a function call::\n\n    loss = m.nn.SoftmaxCrossEntropy(reduction=\"sum\")\n\nThere are many other options available for encoder, decoder and loss functions.\nPlease refer to the API documentation for more details.\n\nIn addition to doing the above three tasks, which defines the model, we also need\nto provide details regarding which metrics we want to be reported. This is done through\nfollowing code::\n\n    reporter = m.report.LinkPredictionReporter()\n    reporter.add_metric(m.report.MeanReciprocalRank())\n    reporter.add_metric(m.report.MeanRank())\n    reporter.add_metric(m.report.Hitsk(1))\n    reporter.add_metric(m.report.Hitsk(10))\n\nNotice that you can add multiple metrics.\n\nOnce we have defined the encoder, decoder, loss function and the reporter, we can\ncreate a model object using the following method::\n\n    m.nn.Model(encoder, decoder, loss, reporter)\n\nAnd now this model can be passed to during training and evaluation.\n\nLastly if you want to add an optimizer to the function you can do it as follows::\n\n    model.optimizers = [m.nn.AdamOptimizer(model.named_parameters(), lr=.1)]\n\n3. Create Dataloader\n^^^^^^^^^^^^^^^^^^^^\nAfter defining the model we need to define two dataloader objects, one for training\nand the other for evaluation. Dataloader objects are used to handle all the data\nmovement required for training. 
Marius supports different types of storage backends\nlike complete InMemory, Partition Buffers, Flat_File, etc. Please refer to documentation\nand the original paper for more details.\n\nIn this example we are using an InMemory storage backend where all the data will reside\nin memory. This can be defined using the method ``tensor_from_file()``. To define \na dataloader object we need to do 3 things:\n\n- First is a simple method call to define which objects need to be read::\n\n    train_edges = m.storage.tensor_from_file(filename=dataset.train_edges_file, shape=[dataset_stats.num_train, -1], dtype=torch.int32, device=device)\n    \n- Second for this example we want to use a negative edge sampler so we define it\n  as follows::\n    \n    train_neg_sampler = m.data.samplers.CorruptNodeNegativeSampler(num_chunks=10, num_negatives=500, degree_fraction=0.0, filtered=False)\n\n- And last is to make the data loader object itself which will be used during training\n  to fetch the data and process batches::\n\n    train_dataloader = m.data.DataLoader(edges=train_edges,\n                                         node_embeddings=embeddings,\n                                         batch_size=1000,\n                                         neg_sampler=train_neg_sampler,\n                                         learning_task=\"lp\",\n                                         train=True)\n\nOnce done with this we have defined the dataloader for training task. Similarly in the\nexample we also define a dataloader for evaluation.\n\n4. Train Model\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nNow we have everything available to start the training. 
For training we run multiple\nepochs of training and evaluation in this example.\n\nFor training all we need is the following function::\n    \n    def train_epoch(model, dataloader):\n        dataloader.initializeBatches()\n\n        while dataloader.hasNextBatch():\n            batch = dataloader.getBatch()\n            model.train_batch(batch)\n            dataloader.updateEmbeddings(batch)\n\nAll we are doing in this function is as follows:\n\n- Initializing the batches before the start of the epoch\n- If there is a next batch available we fetch the next batch\n- We train the model on the fetched batch\n- And we update the embeddings\n\n5. Inference\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\nSimilar to training the evaluation is also pretty simple can be concluded easily\nusing the following function::\n\n    def eval_epoch(model, dataloader):\n        dataloader.initializeBatches()\n\n        while dataloader.hasNextBatch():\n            batch = dataloader.getBatch()\n            model.evaluate_batch(batch)\n        \n        model.reporter.report()\n\nThe function does the following:\n\n- Initialize the batches before the start of every epoch\n- Load if there is a next batch of data available\n- Evaluate the batch\n- Once all batches are done report the metrics we defined earlier in reporter\n\n6. Save Model\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\nWork in progress - More details will be added soon"
  },
  {
    "path": "docs/examples/python/lp_fb15k237.rst",
    "content": "Small Scale Link Prediction (FB15K-237)\n---------------------------------------------\nThis example will demonstrate how to use Marius Python API to do a Link \nPrediction task on a small scale graph. In this example we will use FB15K-237\ngraph. FB15K-237 is a graph that is supported by Marius already so you won't\nneed to write your own custom dataset class for preprocessing. If you want to \nuse a custom dataset which is not supported by marius please refer to lp_custom\nexample.\n\n*Example file location: examples/python/fb15k_237_gpu.py*\n\nBy going through the example we aim you will understand following things:\n\n- How to use Marius' internally supported in dataset to do preprocessing\n- How to define a model using the Python APIs and configure it as needed\n- How to add different reporting metrics for the accuracy\n- How to initialize data loading objects for training and evaluation\n- And lastly how to do training and evaluation\n\nNote: This is a GPU example and we are setting the device to GPU at the start of the\nmain using the line::\n\n    device = torch.device(\"cuda\")\n\nIf you want to run CPU based training please change *cuda* to *cpu*.\n\n1. Create Dataset Class\n^^^^^^^^^^^^^^^^^^^^^^^\nIn this example we are going to use a built in dataset class to do preprocessing\nfor FB15K-237 graph. Marius already has support for few graphs and you can use their\ndataset classes directly to preprocess the data.\n\nTo use a built in class you need to import it which is done using the following line::\n    \n    from marius.tools.preprocess.datasets.fb15k_237 import FB15K237\n\nOnce you imported the class all you need to do is instansiate the base directory\nwhere you will store all the dataset and preprocessed files. And call the download\nand preprocess on the objects. 
As shown in the code.::\n\n    dataset_dir = Path(\"fb15k_dataset/\")\n    dataset = FB15K237(dataset_dir)\n    if not (dataset_dir / Path(\"edges/train_edges.bin\")).exists():\n        dataset.download()\n        dataset.preprocess()\n\nLastly, note that dataset preprocessing will return a ``dataset.yaml`` file which\nis needed for further tasks, so we read it in the example code.\n\n2. Create Model\n^^^^^^^^^^^^^^^\nNext step is to define a model for the task. In this example we are going to make\na model with *DistMult*. The model is defined in the function ``init_model``. \nThere are three steps to defining a model:\n\n1. Defining an encoder: In this example we are defining a single layer encoder.\nThe layer is an embedding layer::\n\n   embedding_layer = m.nn.layers.EmbeddingLayer(dimension=embedding_dim, \n                                                device=device)\n \nTo define a model all you need to do is call the ``GeneralEncoder(..)`` method with all\nthe layers as shown below::\n\n    encoder = m.encoders.GeneralEncoder(layers=[[embedding_layer]])\n\nIn this example we are only having a single layer in the encoder but you can have\nmore than one layer also. (See the node classification example for refer on how to\npass more than one layer to ``GeneralEncoder(..)`` method)\n\n2. Defining a decoder: In this example we are using *DistMult* as our decoder so\nwe are calling the following method::\n\n    decoder = m.nn.decoders.edge.DistMult(num_relations=num_relations,\n                                          embedding_dim=embedding_dim,\n                                          use_inverse_relations=True,\n                                          device=device,\n                                          dtype=dtype,\n                                          mode=\"train\")\n\nNotice that we are using mode as ``train`` but there are other\noptions available. Please refer to API documentation for more details. \n\n3. 
Defining a loss function: We are using *SoftmaxCrossEntropy* in this example. And defining\nit is just doing a function call::\n\n    loss = m.nn.SoftmaxCrossEntropy(reduction=\"sum\")\n\nThere are many other options available for encoder, decoder and loss functions.\nPlease refer to the API documentation for more details.\n\nIn addition to doing the above three tasks, which defines the model, we also need\nto provide details regarding which metrics we want to be reported. This is done through\nfollowing code::\n\n    reporter = m.report.LinkPredictionReporter()\n    reporter.add_metric(m.report.MeanReciprocalRank())\n    reporter.add_metric(m.report.MeanRank())\n    reporter.add_metric(m.report.Hitsk(1))\n    reporter.add_metric(m.report.Hitsk(10))\n\nNotice that you can add multiple metrics.\n\nOnce we have defined the encoder, decoder, loss function and the reporter, we can\ncreate a model object using the following method::\n\n    m.nn.Model(encoder, decoder, loss, reporter)\n\nAnd now this model can be passed to during training and evaluation.\n\nLastly if you want to add an optimizer to the function you can do it as follows::\n\n    model.optimizers = [m.nn.AdamOptimizer(model.named_parameters(), lr=.1)]\n\n3. Create Dataloader\n^^^^^^^^^^^^^^^^^^^^\nAfter defining the model we need to define two dataloader objects, one for training\nand the other for evaluation. Dataloader objects are used to handle all the data\nmovement required for training. Marius supported different types of storage backends\nlike complete InMemory, Partition Buffers, Flat_File, etc. Please refer to documentation\nand the original paper for more details.\n\nIn this example we are using an InMemory storage backend where all the data will reside\nin memory. This can be defined using the method ``tensor_to_file()``. 
Do define \na dataloader object we need to do 3 things:\n\n- First is a simple method call to define which objects need to be read::\n\n    train_edges = m.storage.tensor_from_file(filename=dataset.train_edges_file, shape=[dataset_stats.num_train, -1], dtype=torch.int32, device=device)\n    \n- Second for this example we want to use a negative edge sampler so we define it\n  as follows::\n    \n    train_neg_sampler = m.data.samplers.CorruptNodeNegativeSampler(num_chunks=10, num_negatives=500, degree_fraction=0.0, filtered=False)\n\n- And last is to make the data loader object itself which will be used during training\n  to fetch the data and process batches::\n\n    train_dataloader = m.data.DataLoader(edges=train_edges,\n                                         node_embeddings=embeddings,\n                                         batch_size=1000,\n                                         neg_sampler=train_neg_sampler,\n                                         learning_task=\"lp\",\n                                         train=True)\n\nOnce done with this we have defined the dataloader for training task. Similarly in the\nexample we also define a dataloader for evaluation.\n\n4. Train Model\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nNow we have everything available to start the training. For training we run multiple\nepochs of training and evaluation in this example.\n\nFor training all we need is the following function::\n    \n    def train_epoch(model, dataloader):\n        dataloader.initializeBatches()\n\n        while dataloader.hasNextBatch():\n            batch = dataloader.getBatch()\n            model.train_batch(batch)\n            dataloader.updateEmbeddings(batch)\n\nAll we are doing in this function is as follows:\n\n- Initializing the batches before the start of the epoch\n- If there is a next batch available we fetch the next batch\n- We train the model on the fetched batch\n- And we update the embeddings\n\n5. 
Inference\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\nSimilar to training the evaluation is also pretty simple can be concluded easily\nusing the following function::\n\n    def eval_epoch(model, dataloader):\n        dataloader.initializeBatches()\n\n        while dataloader.hasNextBatch():\n            batch = dataloader.getBatch()\n            model.evaluate_batch(batch)\n        \n        model.reporter.report()\n\nThe function does the following:\n\n- Initialize the batches before the start of every epoch\n- Load if there is a next batch of data available\n- Evaluate the batch\n- Once all batches are done report the metrics we defined earlier in reporter\n\n6. Save Model\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\nWork in progress - More details will be added soon"
  },
  {
    "path": "docs/examples/python/nc_ogbn_arxiv.rst",
    "content": "Small Scale Node Classification (OGBN-Arxiv)\n---------------------------------------------\nOGBN-Arxiv is a built in dataset in Marius for node classification. In this example\nwe will use the dataset class for OGBN-Arxiv (already defined in Marius) and the \npython APIs to make a node classification example. This example will use GraphSage\nand will have a four layer encoder. The first layer will be the FEATURE layer and the\nremaining three will be GNN (GraphSage) layers.\n\n*Example file location: examples/python/ogbn_arxiv_nc.py*\n\nBy going through the example we aim for you to understand the following things:\n\n- How to use Marius' internally defined dataset to do preprocessing\n- How to define a GraphSage based node classification model using python APIs for\n  node classification\n- How to add reporting metrics for your task\n- How to initialize data loading objects for training and evaluation\n- And lastly how to do training and evaluation\n\nNote: This is a GPU example and we are setting the device to GPU at the start of the main using the line::\n\n    device = torch.device(\"cuda\")\n\nIf you want to run CPU based training please change *cuda* to *cpu*.\n\n1. Create Dataset Class\n^^^^^^^^^^^^^^^^^^^^^^^\nThe dataset in this example is OGBN-Arxiv. For this dataset we already have a preprocessing\ndataset class defined in Marius. So we can use that class directly. This will help us\navoid writing a new custom dataset class. 
To use the dataset class you need to import\nit using::\n\n    from marius.tools.preprocess.datasets.ogbn_arxiv import OGBNArxiv\n\nOnce the dataset class is imported you can easily do the preprocessing by calling the\ndownload and the preprocess methods::\n\n    dataset_dir = Path(\"ogbn_arxiv_nc_dataset/\")\n    dataset = OGBNArxiv(dataset_dir)\n    if not (dataset_dir / Path(\"edges/train_edges.bin\")).exists():\n        dataset.download()\n        dataset.preprocess()\n\nThis will return all the necessary data in properly formatted .bin files which can\nnow be used in Marius for training.\n\n2. Create Model\n^^^^^^^^^^^^^^^\nIn this example we are going to create a four layer model using GraphSage. While\ndefining a model you need to define three things. First is the encoder, second is\na decoder and lastly we need to define the loss function. This will set up the model.\nAdditionally to get the accuracy metrics we also need to set a reporter. In this\nsection we will discuss all of these things.\n\n**Encoder:**\n\nFor node classification we are going to define a four layer encoder. The first layer\nwill be a FEATURE layer. For the feature layer we do not need to define anything\ncomplicated so we can easily define the feature layer as::\n\n    feature_layer = m.nn.layers.FeatureLayer(dimension=feature_dim, device=device)\n\nThe remaining three layers are GraphSage layers. 
They can also be defined simply using\nthe following code::\n\n    graph_sage_layer1 = m.nn.layers.GraphSageLayer(input_dim=feature_dim,\n                                                   output_dim=feature_dim,\n                                                   device=device,\n                                                   bias=True)\n\n    graph_sage_layer2 = m.nn.layers.GraphSageLayer(input_dim=feature_dim,\n                                                   output_dim=feature_dim,\n                                                   device=device,\n                                                   bias=True)\n\n    graph_sage_layer3 = m.nn.layers.GraphSageLayer(input_dim=feature_dim,\n                                                   output_dim=num_classes,\n                                                   device=device,\n                                                   bias=True)\n\nNotice that the last layer has a different activation function than the other two. There are \nother options available for the activation function, please refer to documentation for\nmore details.\n\nOnce we have set up all of these layers we can call the encoder method to set up the encoder.\nNote that in the code below we are setting the feature layer first then the GraphSage layers::\n\n    encoder = m.encoders.GeneralEncoder(layers=[[feature_layer],\n                                                [graph_sage_layer1],\n                                                [graph_sage_layer2],\n                                                [graph_sage_layer3]])\n\n**Decoder**\n\nSetting up the decoder in this example is simple. All we are doing is setting up\nthe No Op Node Decoder::\n\n    decoder = m.nn.decoders.node.NoOpNodeDecoder()\n\n**Loss Function**\n\nFor the loss function we are using *Cross Entropy* with reduction as *SUM*. 
We are \nsetting it as follows::\n\n    loss = m.nn.CrossEntropyLoss(reduction=\"sum\")\n\n**Reporter**\n\nLastly we need to set the reporter for getting the results. In this example we are\ngoing to use ``CategoricalAccuracy`` as our metric::\n\n    reporter = m.report.NodeClassificationReporter()\n    reporter.add_metric(m.report.CategoricalAccuracy())\n\n**Defining the Model**\n\nOnce all these details are set up properly all we need to do is call the ``Model``\nmethod to initialize the model::\n\n    model = m.nn.Model(encoder, decoder, loss, reporter)\n\nLastly we are also adding an optimizer to the model and after that we are done with\nmodel creation::\n\n    model.optimizers = [m.nn.AdamOptimizer(model.named_parameters(), lr=.01)]\n\n3. Create Dataloader\n^^^^^^^^^^^^^^^^^^^^\nThe dataloader object is used for setting up the storage layer. We are going to use\n``tensor_from_file()`` method for defining the dataloader. This method stores all the data \nin memory.\n\nIn this example for training dataloader we first need to setup the storage for\nfour files which are: ``edges_all``, ``train_nodes``, ``features`` and ``labels``. 
All\nfour can be done easily using the following API calls::\n\n    edges_all = m.storage.tensor_from_file(filename=dataset.edge_list_file, shape=[dataset_stats.num_edges, -1], dtype=torch.int32, device=device)\n    train_nodes = m.storage.tensor_from_file(filename=dataset.train_nodes_file, shape=[dataset_stats.num_train], dtype=torch.int32, device=device)\n    features = m.storage.tensor_from_file(filename=dataset.node_features_file, shape=[dataset_stats.num_nodes, -1], dtype=torch.float32, device=device)\n    labels = m.storage.tensor_from_file(filename=dataset.node_labels_file, shape=[dataset_stats.num_nodes], dtype=torch.int32, device=device)\n\nIn the examples above we are passing the file which we got from the preprocessor with proper shape.\nThe details for shape can be fetched from the yaml file returned from the preprocessor.\n\nIn this example we are setting up a 3-hop neighbour sampler and we define this next::\n\n    nbr_sampler_3_hop = m.data.samplers.LayeredNeighborSampler(num_neighbors=[-1, -1, -1])\n\nAfter defining the 3-hop sampler we can define the dataloader class as follows::\n\n    train_dataloader = m.data.DataLoader(nodes=train_nodes,\n                                         edges=edges_all,\n                                         node_features=features,\n                                         node_labels=labels,\n                                         batch_size=1000,\n                                         nbr_sampler=nbr_sampler_3_hop,\n                                         learning_task=\"nc\",\n                                         train=True)\n\nThe things that we need to pass into the dataloader definition are all the file objects \nthat we defined, the batch size and the neighbour sampler that we want to use.\n\nSimilar to the ``train_dataloader``, we also define the ``eval_dataloader``. Please\nrefer to the example for more details. ``eval_dataloader`` definition is similar to the \ntrain.\n\n4. 
Train Model\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nNow we have defined both the model and the dataloaders so we can start with the training\ntask. To train an epoch all we need to do is call the following function::\n\n    def train_epoch(model, dataloader):\n        dataloader.initializeBatches()\n        while dataloader.hasNextBatch():\n            batch = dataloader.getBatch()\n            model.train_batch(batch)\n\nThis function does the following:\n\n- Initialize the batches for training.\n- Load the next batch (if it is there)\n- Train the model\n\n5. Inference\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\nSimilar to the training we can do the evaluation using the following function::\n\n    def eval_epoch(model, dataloader, device):\n        dataloader.initializeBatches()\n        \n        while dataloader.hasNextBatch():\n            batch = dataloader.getBatch()\n            model.evaluate_batch(batch)\n        \n        model.reporter.report()\n\nHere all we are doing is as follows:\n\n- Initialize the batches for evaluation\n- Load the next batch (if it is there)\n- Evaluate the batch\n- Call the report function on the model to get the metrics\n\n6. Save Model\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\nWork in progress - More details later\n"
  },
  {
    "path": "docs/export_and_inference/index.rst",
    "content": "\nModel Export and Inference\n**************************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    marius_predict\n    marius_postprocess\n\n\n"
  },
  {
    "path": "docs/export_and_inference/marius_postprocess.rst",
    "content": ".. _marius_postprocess\n\nModel exporting tool (marius_postprocess)\n==================================================\n\nThis page describes the marius_postprocess tool, which can convert and export trained models to csv or parquet formats.\n\nCurrently this tool is in memory only, thus the embedding table(s) must fit in CPU memory to perform the export.\n\nExample Usage\n##############################\n\nTrain the fb15k_237 example model:\n\n    .. code-block:: bash\n\n        marius_preprocess --dataset fb15k_237 --output_directory datasets/fb15k_237_example/\n        marius_train examples/configuration/fb15k_237.yaml\n\nThe trained model is located at: ``datasets/fb15k_237_example/model_0``\n\nExport the model to CSV format:\n\n    .. code-block:: bash\n\n        marius_postprocess --model_dir datasets/fb15k_237_example/model_0 --format csv --output_dir my_output_dir\n\nThe output of the command should look like:\n\n    .. code-block:: text\n\n        Wrote my_output_dir/embeddings.csv: shape (14541, 2)\n        Wrote my_output_dir/relation_embeddings.csv: shape (237, 2)\n        Wrote my_output_dir/inverse_relation_embeddings.csv: shape (237, 2)\n        Wrote my_output_dir/model.pt\n\nFrom the above we can see that the node and edge-type (relation) embeddings have been written to CSV files. model.pt is a pytorch model file which may contain additional model parameters (GNN weights).\n\nThe output files contain two columns (id, embedding). The number of rows corresponds to the number of nodes or number of edge-types.\n\nNote that the CSV format may not be ideal for exporting the embedding table(s), as the embedding vectors are converted to text representations. 
The model can be exported in parquet format by using ``--format parquet``, or copied in raw format with ``--format bin``.\n\nCommand line arguments\n##############################\n\nBelow is the help message for the tool, containing an overview of the tool's arguments and usage.\n\n    .. code-block:: text\n\n        $ marius_postprocess --help\n        usage: postprocess [-h] [--model_dir model_dir] [--format format] [--delim delim] [--output_dir output_dir] [--overwrite]\n\n        Convert trained embeddings to desired output format and output to specified directory.\n\n        Example usage:\n        marius_postprocess --model_dir foo --format csv --output_dir bar\n\n        optional arguments:\n          -h, --help            show this help message and exit\n          --model_dir model_dir\n                                Directory of the trained model\n          --format format, -f format\n                                Format of output embeddings. Choices are [csv, parquet, binary]\n          --delim delim         Delimiter to use for the output CSV\n          --output_dir output_dir\n                                Output directory, if not provided the model directory will be used.\n          --overwrite           If enabled, the output directory will be overwritten\n\n"
  },
  {
    "path": "docs/export_and_inference/marius_predict.rst",
    "content": ".. _marius_predict:\n\nBatch Inference (marius_predict)\n==================================================\n\nThis document contains an overview of the inference module for link prediction and node classification models trained using the configuration API. The module supports both in memory and disk-based inference.\n\nThe input test set can be be preprocessed, or can be in the raw input format and then preprocessed (partitioned, remapped and converted to binary format) before input to evaluation.\n\nLink Prediction\n##############################\n\nInput\n**********\n\n- A configuration file for a previously trained link prediction model\n\n- A set of test edges (preprocessed or unpreprocessed)\n\n- A list of metrics to compute (optional)\n\n- Negative sampling configuration (optional)\n\nOutput\n****************************\n\nText file containing a summary of metrics for the evaluation set: ``<output_dir>/metrics.txt`` (optional)\n\nCSV file where each row denotes an edge, and it’s corresponding score and link prediction rank ``<output_dir>/scores.csv`` (optional)\n\nExample Usage\n****************************\n\n    .. code-block:: bash\n\n        marius_predict --config configs/fb15k237.yaml --metrics mrr mr hits3 hits5 hits10 hits50 hits100 hits2129 --save_ranks --save_scores --output_dir results/`\n\nThis command takes in a trained configuration file, ``configs/fb15k237.yaml``, which defines a model that has been previous trained.\n\nThe list of metrics over the training set will be computed and output to results/metrics.txt. The ranks and scores for each edge are output to ``results/scores.csv``.\n\nContents of ``configs/fb15k237.yaml``. The test set here has been created during preprocessing and is stored in ``<storage.dataset.dataset_dir>/edges/test_edges.bin``\n\n    .. 
code-block:: yaml\n\n        model:\n          learning_task: LINK_PREDICTION\n          encoder:\n            layers:\n\n              - - type: EMBEDDING\n                  output_dim: 10\n                  bias: true\n                  init:\n                    type: GLOROT_NORMAL\n\n          decoder:\n            type: DISTMULT\n          loss:\n            type: SOFTMAX_CE\n            options:\n              reduction: SUM\n          dense_optimizer:\n            type: ADAM\n            options:\n              learning_rate: 0.01\n          sparse_optimizer:\n            type: ADAGRAD\n            options:\n              learning_rate: 0.1\n\n        storage:\n          device_type: cpu\n          dataset:\n            dataset_dir: ./fb15k_237_example/\n          edges:\n            type: HOST_MEMORY\n            options:\n              dtype: int\n          embeddings:\n            type: HOST_MEMORY\n            options:\n              dtype: float\n        training:\n          batch_size: 1000\n          negative_sampling:\n            num_chunks: 10\n            negatives_per_positive: 10\n            degree_fraction: 0\n            filtered: false\n          num_epochs: 10\n          pipeline:\n            sync: true\n        evaluation:\n          batch_size: 1000\n          negative_sampling:\n            filtered: true\n          pipeline:\n            sync: true\n\nSince ``storage.model_dir`` is not specified in the above configuration, ``marius_predict`` will use the latest trained model present in ``storage.dataset.dataset_dir``.\nWhen ``storage.model_dir`` is not specified, ``marius_train`` stores the model parameters in `model_x` directory within the `storage.dataset.dataset_dir`, where x changes \nincrementally from 0 - 10. A maximum of 11 models are stored when `model_dir` is not specified, post which the contents in `model_10/` directory are overwritten with the \nlatest parameters. 
``marius_predict`` will use the latest model for inference and save the files to that directory. If ``storage.model_dir`` is specified, the model \nparameters will be loaded from the given directory and the generated files will be saved to the same. \n\nExample output\n****************************\nTwo files are output by the above command:\n\n\nmetrics.txt\n    .. code-block:: text\n\n        Link Prediction: 40932 edges evaluated\n        MRR: 0.125147\n        Mean Rank: 426.079766\n        Hits@3: 0.156259\n        Hits@5: 0.207148\n        Hits@10: 0.285229\n        Hits@50: 0.510383\n        Hits@100: 0.598725\n        Hits@2129: 0.947987\n\n\nscores.csv\n    .. code-block:: text\n\n        src,rel,dst,rank,score\n        14469,149,11486,26,32.206722\n        8558,74,7904,2789,5.628761\n        3160,73,8048,282,7.548909\n        7240,168,4510,149,1.634745\n        2393,211,10586,2,96.834641\n        12773,198,5262,3136,9.098152\n        11469,88,8946,18,15.922592\n        2045,166,3344,289,0.407495\n\n\nInput a new test set\n****************************************\n\nIf the dataset does not have a predefined test set (e.g. ``storage.dataset.num_test == 0``), then users can specify a separate test set with ``--input_file <path_to_test_set>``. This test set can either be preprocessed and in binary format, or unpreprocessed.\n\nPreprocessed input_test set usage:\n\n    .. code-block:: bash\n\n        marius_predict --config configs/fb15k237.yaml --input_file test_edges.bin --metrics mrr --save_ranks --save_scores --output_dir results/\n\nUnpreprocessed input_test set usage:\n\nIf the input test set is unpreprocessed and in some raw input format, then the ``--preprocess_input`` flag can be given. Users will need to specify the format of their input with ``--input_format <format>``. Currently only delimited formats are supported.\n\n    .. 
code-block:: bash\n\n        marius_predict --config configs/fb15k237.yaml --input_file test_edges.csv --preprocess_input --input_format CSV --metrics mrr --save_ranks --save_scores --output_dir results/\n\n\nNode Classification\n##############################\n\nInput\n**********\n\nA configuration file for a previously trained node classification model\n\nA set of test nodes (preprocessed or unpreprocessed)\n\nA list of metrics to compute (optional)\n\nOutput\n**********\n\nText file containing a summary of metrics for the evaluation set: ``<output_dir>/metrics.txt`` (optional)\n\nCSV file where each row denotes a node, and its corresponding node classification label ``<output_dir>/labels.csv`` (optional)\n\nExample Usage\n********************\n\n\n    .. code-block:: bash\n\n        marius_predict --config configs/arxiv.yaml --metrics accuracy --save_labels --output_dir results/\n\nThis command takes in a trained configuration file, ``configs/arxiv.yaml``, which defines the previously trained model.\n\nThe list of metrics over the test set will be computed and output to ``results/metrics.txt``. The predicted labels for each node are output to ``results/labels.csv``.\n\n\nCommand line arguments\n##############################\n\nBelow is the help message for the tool, containing an overview of the tool's arguments and usage.\n\n\n    .. 
code-block:: text\n\n        $ marius_predict --help\n        usage: predict [-h] --config config [--output_dir output_dir] [--metrics [metrics ...]] [--save_labels] [--save_scores] [--save_ranks] [--batch_size batch_size] [--num_nbrs num_nbrs]\n                       [--num_negs num_negs] [--num_chunks num_chunks] [--deg_frac deg_frac] [--filtered filtered] [--input_file input_file] [--input_format input_format] [--preprocess_input preprocess_input]\n                       [--columns columns] [--header_length header_length] [--delim delim] [--dtype dtype]\n\n        Tool for performing link prediction or node classification inference with trained models.\n\n        Link prediction example usage:\n        marius_predict <trained_config> --output_dir results/ --metrics mrr mean_rank hits1 hits10 hits50 --save_scores --save_ranks\n        Assuming <trained_config> contains a link prediction model, this command will perform link prediction evaluation over the test set of edges provided in the config file. Metrics are saved to results/metrics.csv and scores and ranks for each test edge are saved to results/scores.csv\n\n        Node classification example usage:\n        marius_predict <trained_config> --output_dir results/ --metrics accuracy --save_labels\n        This command will perform node classification evaluation over the test set of nodes provided in the config file. Metrics are saved to results/metrics.csv and labels for each test node are saved to results/labels.csv\n\n        Custom inputs:\n        The test set can be directly specified setting --input_file <test_set_file>. If the test set has not been preprocessed, then --preprocess_input should be enabled. 
The default format is a binary file, but additional formats can be specified with --input_format.\n\n        optional arguments:\n          -h, --help            show this help message and exit\n          --config config       Configuration file for trained model\n          --output_dir output_dir\n                                Path to output directory\n          --metrics [metrics ...]\n                                List of metrics to report.\n          --save_labels         (Node Classification) If true, the node classification labels of each test node will be saved to <output_dir>/labels.csv\n          --save_scores         (Link Prediction) If true, the link prediction scores of each test edge will be saved to <output_dir>/scores.csv\n          --save_ranks          (Link Prediction) If true, the link prediction ranks of each test edge will be saved to <output_dir>/scores.csv\n          --batch_size batch_size\n                                Number of examples to evaluate at a time.\n          --num_nbrs num_nbrs   Number of neighbors to sample for each GNN layer. If not provided, then the module will check if the output of the encoder has been saved after training (see\n                                storage.export_encoded_nodes). If the encoder outputs exist, the the module will skip the encode step (incl. neighbor sampling) and only perform the decode over the saved\n                                inputs. If encoder outputs are not saved, model.encoder.eval_neighbor_sampling will be used for the neighbor sampling configuration. If model.encoder.eval_neighbor_sampling does\n                                not exist, then model.encoder.train_neighbor_sampling will be used.If none of the above are given, then the model is assumed to not require neighbor sampling.\n          --num_negs num_negs   (Link Prediction) Number of negatives to compare per positive edge for link prediction. If -1, then all nodes are used as negatives. 
Otherwise, num_neg*num_chunks nodes will be\n                                sampled and used as negatives. If not provided, the evaluation.negative_sampling configuration will be used.if evaluation.negative_sampling is not provided, then negative\n                                sampling will not occur and only the scores for the input edges will be computed, this means that any ranking metrics cannot be calculated.\n          --num_chunks num_chunks\n                                (Link Prediction) Specifies the amount of reuse of negative samples. A given set of num_neg sampled nodes will be reused to corrupt (batch_size // num_chunks) edges.\n          --deg_frac deg_frac   (Link Prediction) Specifies the fraction of the num_neg nodes sampled as negatives that should be sampled according to their degree. This sampling procedure approximates degree\n                                based sampling by sampling nodes that appear in the current batch of edges.\n          --filtered filtered   (Link Prediction) If true, then false negative samples will be filtered out. This is only supported when evaluating with all nodes.\n          --input_file input_file\n                                Path to input file containing the test set, if not provided then the test set described in the configuration file will be used.\n          --input_format input_format\n                                Format of the input file to test. Options are [BINARY, CSV, TSV, DELIMITED] files. If DELIMITED, then --delim must be specified.\n          --preprocess_input preprocess_input\n                                If true, the input file (if provided) will be preprocessed before evaluation.\n          --columns columns     List of column ids of input delimited file which denote the src node, edge-type, and dst node of edges.E.g. 
columns=[0, 2, 1] means that the source nodes are found in the first\n                                column of the file, the edge-types are found in the third column, and the destination nodes are found in the second column.For graphs without edge types, only the location node\n                                columns need to be provided. E.g. [0, 1]If the input file contains node ids rather than edges, then only a single id is needed. E.g. [2]\n          --header_length header_length\n                                Length of the header for input delimited file\n          --delim delim         Delimiter for input file\n          --dtype dtype         Datatype of input file elements. Defaults to the dataset specified in the configuration file."
  },
  {
    "path": "docs/graph_learning/decoders.rst",
    "content": "Decoders\n********************"
  },
  {
    "path": "docs/graph_learning/downstream_tasks.rst",
    "content": "Downstream Tasks and Applications\n*********************************\n\n- :ref:`lp_paleo`"
  },
  {
    "path": "docs/graph_learning/encoders.rst",
    "content": "Encoders\n********************"
  },
  {
    "path": "docs/graph_learning/index.rst",
    "content": "\nGraph Learning\n**************************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    intro\n    downstream_tasks\n\n\n"
  },
  {
    "path": "docs/graph_learning/intro.rst",
    "content": "Intro to Graph Embeddings\n***************************\n\nA brief overview of graph-structured data, graph embeddings, and their applications.\n\nGraph-Structured Data\n-----------------------\nA graph is a data structure consisting of nodes and edges connecting them. For example, a social media network can be modeled as a graph with users as the nodes and friendships as edges between them. Protein networks can be modeled as a graph, with proteins as the nodes and different edge types specifying the different biological interactions between them. In knowledge graphs, such as Wikidata, nodes represent different real-world concepts and edges the relations between them.\n\nGraph-structured data is different from other common data types such as images or text in that it is non-Euclidian: unlike the grid-like structure of these other data types, graphs have no clear \"start\" or \"end\" point and have a complex, arbitrary structure. This allows them to represent complex data in a rich and easily understandable way, but also makes it difficult to apply modern machine learning algorithms on them, which are usually built to handle vectorized data.\n\nGraph Embeddings\n-----------------------\nGraph embeddings are used to solve our aforementioned problem. The idea is to transform nodes, edges, and other graph features into vector representations, in which each embedding encodes some information about the structure of the graph. For example, if we were to embed the nodes of a graph, we would expect a good embedding output to be one in which the embeddings for nodes which are close together in the graph to also be similar in the embedding vector space.\n\nThe purpose of the Marius system is to quickly generate these embeddings for a graph, in which embeddings accurately reflect properties of the graph structure. 
Using data movement techniques, Marius can generate embeddings for massive graphs with billions of nodes and edges.\n\nGraph Learning Tasks\n-----------------------\nGraph embeddings make it easy to perform downstream graph analytics tasks. Two of the most common inference tasks are:\n\n**Node Classification:** Often we come across graphs in which some nodes have labels or categories attached to them while others do not. Node classification is the process of predicting these missing labels for unlabeled nodes. An example of using Marius for node classification can be found here.\n\n**Link Prediction:** Link prediction is the process of predicting whether an edge exists between two particular nodes. We can use node and relation embeddings to predict new or missing edges in a graph. For example, in a social media network, link prediction could amount to predicting new friend recommendations. An end-to-end example of using Marius for link prediction can be found here."
  },
  {
    "path": "docs/graph_learning/learning_tasks.rst",
    "content": "Learning Tasks\n********************"
  },
  {
    "path": "docs/index.rst",
    "content": ".. Marius documentation master file, created by\n    sphinx-quickstart on Tue Oct 20 13:17:05 2020.\n\nMarius\n********************\n\n.. toctree::\n    :maxdepth: 2\n\n    introduction\n    quickstart\n    examples/index\n    config_interface/index\n    python_api/index\n    preprocess_datasets/index\n    export_and_inference/index\n    graph_learning/index\n"
  },
  {
    "path": "docs/introduction.rst",
    "content": ".. _introduction\n\nIntroduction\n=========================\n\nMarius is a system for scaling graph learning on a single machine. Marius supports training and evaluation of GNNs and graph embedding models for link prediction or node classification. See our papers Marius and Marius++ for technical details.\n\nFeature Overview\n##############################\n\n\n* **Billion scale** link prediction and node classification training and evaluation\n* **High performance** configuration-file based execution\n* **PyTorch compatible** Python API for custom training and evaluation routines\n\n\n.. container:: twocol\n\n    .. container:: leftside\n\n        Define 3-layer GraphSage model in Python\n\n        ::\n\n            nbr_sampler = m.nn.LayeredNeighborSampler([-1, -1, -1])\n\n            feat_dim = 128\n            num_classes = 40\n\n            device = torch.device(\"cuda\")\n\n            feat_layer = m.nn.layers.FeatureLayer(dimension=feature_dim,\n                                                  device=device)\n\n            gs_layer1 = m.nn.layers.GraphSageLayer(input_dim=feature_dim,\n                                                   output_dim=feature_dim,\n                                                   device=device)\n\n            gs_layer2 = m.nn.layers.GraphSageLayer(input_dim=feature_dim,\n                                                   output_dim=feature_dim,\n                                                   device=device)\n\n            gs_layer3 = m.nn.layers.GraphSageLayer(input_dim=feature_dim,\n                                                   output_dim=num_classes,\n                                                   device=device)\n\n            encoder = m.encoders.GeneralEncoder(layers=[[feature_layer],\n                                                        [graph_sage_layer1],\n                                                        [graph_sage_layer2],\n                                                        
[graph_sage_layer3]])\n\n            decoder = m.nn.decoders.node.NoOpNodeDecoder()\n            loss = m.nn.CrossEntropyLoss(reduction=\"sum\")\n\n            model = m.nn.Model(encoder, decoder, loss)\n            model.optimizers = [m.nn.AdamOptimizer(model.named_parameters(),\n                                                   lr=.01)]\n\n    .. container:: rightside\n\n        or with YAML configuration\n        ::\n\n            model:\n              learning_task: node_classification\n              encoder:\n                train_neighbor_sampling:\n                  - type: all\n                  - type: all\n                  - type: all\n                layers:\n                  - - type: feature\n                      output_dim: 128\n                  - - type: gnn\n                      options:\n                        type: graph_sage\n                      input_dim: 128\n                      output_dim: 128\n                  - - type: GNN\n                      options:\n                        type: graph_sage\n                      input_dim: 128\n                      output_dim: 128\n                  - - type: gnn\n                      options:\n                        type: graph_sage\n                      input_dim: 128\n                      output_dim: 40\n              decoder:\n                type: node\n              loss:\n                type: cross_entropy\n                options:\n                  reduction: sum\n              dense_optimizer:\n                type: adam\n                options:\n                  learning_rate: 0.01\n\n\nPreprocessing\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n* Performant dataset preprocessing of raw datasets in CSV format\n* 13 built-in datasets for link prediction or node classification\n* Custom dataset support\n\nTraining & Evaluation\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n* CPU-GPU pipeline to mitigate data movement overheads\n* Optimized neighborhood sampling and 
datastructures for GNN aggregation\n* Scale beyond CPU memory with a partition buffer\n\nSupported Input Graphs\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n* Formats: CSV/TSVs, PyTorch tensors, Numpy arrays\n* Graphs with or without edge-types or node features\n* Scales to graphs with billions of edges and 100s of millions of nodes\n\nSupported Models\n\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\n\n* Tasks: Link prediction, node classification\n* GNN layers: GraphSage, GCN, RGCN, GAT\n* Link prediction decoders: ComplEx, DistMult, TransE\n\nUpcoming Features\n##############################\n\n* Configuration file optimizer and generator (in testing)\n* SQL database to graph conversion tool (in testing)\n* Multi-GPU training (in progress)\n* Model checkpointing (in progress)\n* KNN inference module\n* marius_preprocess parquet file support\n* Remote storage for graph data and embeddings\n* Additional encoder layers and decoder layers"
  },
  {
    "path": "docs/preprocess_datasets/built_in.rst",
    "content": ""
  },
  {
    "path": "docs/preprocess_datasets/command_line.rst",
    "content": "\nCommand Line Preprocessing\n================================\n\nThe preprocessing procedure takes datasets in their raw format and converts them to the input format required by Marius.\n\nBuilt-in datasets\n-----------------------\n\nPreprocessing the FB15K-237 knowledge graph\n\n.. code-block:: bash\n\n   $ marius_preprocess --dataset fb15k_237 --output_directory datasets/fb15k_237_example/\n   Downloading FB15K-237.2.zip to datasets/fb15k_237_example/FB15K-237.2.zip\n   Reading edges\n   Remapping Edges\n   Node mapping written to: datasets/fb15k_237_example/nodes/node_mapping.txt\n   Relation mapping written to: datasets/fb15k_237_example/edges/relation_mapping.txt\n   Dataset statistics written to: datasets/fb15k_237_example/dataset.yaml\n\nThe  ``--dataset`` flag specifies which of the built-in datasets ``marius_preprocess`` will preprocess and download.\n\nThe  ``--output_directory`` flag specifies where the preprocessed graph will be output and is set by the user. In this example, assume we have not created the datasets/fb15k_237_example repository. ``marius_preprocess`` will create it for us.\n\nSee `Usage`_ for detailed options.\n\nHere are the contents of the output directory after preprocessing\n\n.. 
code-block:: bash\n\n   $ ls -l datasets/fb15k_237_example/\n   dataset.yaml                       # input dataset statistics\n   nodes/\n     node_mapping.txt                 # mapping of raw node ids to integer uuids\n   edges/\n     relation_mapping.txt             # mapping of raw edge(relation) ids to integer uuids\n     test_edges.bin                   # preprocessed testing edge list\n     train_edges.bin                  # preprocessed training edge list\n     validation_edges.bin             # preprocessed validation edge list\n   train.txt                          # raw training edge list\n   test.txt                           # raw testing edge list\n   valid.txt                          # raw validation edge list\n   text_cvsc.txt                      # relation triples as used in Toutanova and Chen CVSM-2015\n   text_emnlp.txt                     # relation triples as used inToutanova et al. EMNLP-2015\n   README.txt                         # README of the downloaded FB15K-237 dataset\n\n\nList of built-in datasets\n\n.. code-block:: text\n\n    # node classification\n    ogbn_arxiv\n    ogbn_products\n    ogbn_papers100m\n    ogb_mag240m\n\n    # link prediction\n    fb15k\n    fb15k_237\n    livejournal\n    twitter\n    freebase86m\n    ogbl_wikikg2\n    ogbl_citation2\n    ogbl_ppa\n    ogb_wikikg90mv2\n\n\nCustom datasets\n-----------------------\n\n.. _custom_dataset_example: http://marius-project.org/marius/examples/config/lp_custom.html#preprocess-dataset\n\nDatasets in delimited file formats such as CSVs can be preprocessed with ``marius_preprocess``\n\nSee this `example <custom_dataset_example_>`_.\n\n\nUsage\n-----------------------\n\n.. 
code-block:: text\n\n    usage: marius_preprocess [-h] [--output_directory output_directory] [--edges edges [edges ...]] [--dataset dataset] [--num_partitions num_partitions] [--partitioned_eval] [--delim delim]\n                      [--dataset_split dataset_split [dataset_split ...]] [--overwrite] [--spark] [--no_remap_ids]\n\n    Preprocess built-in datasets and custom link prediction datasets\n\n    optional arguments:\n      -h, --help            show this help message and exit\n      --output_directory output_directory\n                            Directory to put graph data\n      --edges edges [edges ...]\n                            File(s) containing the edge list(s) for a custom dataset\n      --dataset dataset     Name of dataset to preprocess\n      --num_partitions num_partitions\n                            Number of node partitions\n      --partitioned_eval    If true, the validation and/or the test set will be partitioned.\n      --delim delim, -d delim\n                            Delimiter to use for delimited file inputs\n      --dataset_split dataset_split [dataset_split ...], -ds dataset_split [dataset_split ...]\n                            Split dataset into specified fractions\n      --overwrite           If true, the preprocessed dataset will be overwritten if it already exists\n      --spark               If true, pyspark will be used to perform the preprocessing\n      --no_remap_ids        If true, the node ids of the input dataset will not be remapped to random integer ids.\n      --columns [columns [columns ...]]\n                            List of column ids of input delimited files which\n                            denote the src node, edge-type, and dst node of edges.\n"
  },
  {
    "path": "docs/preprocess_datasets/index.rst",
    "content": "\nDatasets and Preprocessing\n**************************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n    :caption: Contents\n\n    command_line\n    python\n\n"
  },
  {
    "path": "docs/preprocess_datasets/python.rst",
    "content": ""
  },
  {
    "path": "docs/python_api/configuration/index.rst",
    "content": "\nmarius.config\n********************\n\n.. automodule:: marius.config\n    :members:\n    :undoc-members:\n    :imported-members:\n"
  },
  {
    "path": "docs/python_api/index.rst",
    "content": "\nPython API\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    configuration/index\n    data/index\n    manager/index\n    nn/index\n    pipeline/index\n    reporting/index\n    storage/index\n    tools/index\n\n"
  },
  {
    "path": "docs/python_api/manager/index.rst",
    "content": "\nmarius.manager\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n"
  },
  {
    "path": "docs/python_api/nn/activation.rst",
    "content": "Activation Functions\n=======================================\n\n.. function:: marius.nn.apply_activation(activation_function: marius._config.ActivationFunction, input: torch.Tensor) -> torch.Tensor\n\n"
  },
  {
    "path": "docs/python_api/nn/decoders/decoder.rst",
    "content": "Decoder\n=======================================\n\n.. autoclass:: marius.nn.decoders.Decoder\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n"
  },
  {
    "path": "docs/python_api/nn/decoders/edge/comparators.rst",
    "content": "Comparator\n=======================================\n\n.. autoclass:: marius.nn.decoders.edge.Comparator\n    :members:\n    :undoc-members:\n    :exclude-members: __init__\n\n    .. method:: __init__()\n\n.. autoclass:: marius.nn.decoders.edge.L2Compare\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.nn.decoders.edge.CosineCompare\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.nn.decoders.edge.DotCompare\n    :members:\n    :undoc-members:\n    :special-members: __init__\n"
  },
  {
    "path": "docs/python_api/nn/decoders/edge/complex.rst",
    "content": "ComplEx\n=======================================\n\n.. autoclass:: marius.nn.decoders.edge.ComplEx\n    :members:\n    :undoc-members:\n    :special-members: __init__\n"
  },
  {
    "path": "docs/python_api/nn/decoders/edge/distmult.rst",
    "content": "DistMult\n=======================================\n\n.. autoclass:: marius.nn.decoders.edge.DistMult\n    :members:\n    :undoc-members:\n    :special-members: __init__\n"
  },
  {
    "path": "docs/python_api/nn/decoders/edge/edge_decoder.rst",
    "content": "EdgeDecoder\n=======================================\n\n.. autoclass:: marius.nn.decoders.edge.EdgeDecoder\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, apply_relation, compute_scores, select_relations\n\n    .. method:: __init__()\n    \n    .. method:: apply_relation(self: marius._nn.decoders.edge.EdgeDecoder, nodes: torch.Tensor, relations: torch.Tensor) -> torch.Tensor\n    \n    .. method:: compute_scores(self: marius._nn.decoders.edge.EdgeDecoder, src: torch.Tensor, dst: torch.Tensor) -> torch.Tensor\n    \n    .. method:: select_relations(self: marius._nn.decoders.edge.EdgeDecoder, indices: torch.Tensor, inverse: bool = False) -> torch.Tensor\n"
  },
  {
    "path": "docs/python_api/nn/decoders/edge/index.rst",
    "content": "\nedge\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    comparators\n    complex\n    distmult\n    edge_decoder\n    relation_operators\n    transe"
  },
  {
    "path": "docs/python_api/nn/decoders/edge/relation_operators.rst",
    "content": "RelationOperator\n=======================================\n\n.. autoclass:: marius.nn.decoders.edge.RelationOperator\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n\n.. autoclass:: marius.nn.decoders.edge.HadamardOperator\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n\n.. autoclass:: marius.nn.decoders.edge.ComplexHadamardOperator\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n\n.. autoclass:: marius.nn.decoders.edge.TranslationOperator\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n\n.. autoclass:: marius.nn.decoders.edge.NoOp\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n"
  },
  {
    "path": "docs/python_api/nn/decoders/edge/transe.rst",
    "content": "TransE\n=======================================\n\n.. autoclass:: marius.nn.decoders.edge.TransE\n    :members:\n    :undoc-members:\n    :special-members: __init__\n"
  },
  {
    "path": "docs/python_api/nn/decoders/index.rst",
    "content": "\ndecoders\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    edge/index\n    node/index\n    decoder"
  },
  {
    "path": "docs/python_api/nn/decoders/node/index.rst",
    "content": "\nnode\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    node_decoder\n    noop_node_decoder"
  },
  {
    "path": "docs/python_api/nn/decoders/node/node_decoder.rst",
    "content": "NodeDecoder\n=======================================\n\n.. autoclass:: marius.nn.decoders.node.NodeDecoder\n    :members:\n    :undoc-members:\n    :special-members: __init__\n"
  },
  {
    "path": "docs/python_api/nn/decoders/node/noop_node_decoder.rst",
    "content": "NoOpNodeDecoder\n=======================================\n\n.. autoclass:: marius.nn.decoders.node.NoOpNodeDecoder\n    :members:\n    :undoc-members:\n    :special-members: __init__\n"
  },
  {
    "path": "docs/python_api/nn/encoders/general_encoder.rst",
    "content": "GeneralEncoder\n=======================================\n\n.. autoclass:: marius.nn.encoders.GeneralEncoder\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, forward\n\n    .. method:: __init__(self: marius._nn.encoders.GeneralEncoder, encoder_config: marius._config.EncoderConfig, device: torch.device, num_relations: int = 1) -> None\n    \n    .. method:: __init__(self: marius._nn.encoders.GeneralEncoder, layers: List[List[Layer]]) -> None\n    \n    .. method:: forward(self: marius._nn.encoders.GeneralEncoder, embeddings: Optional[torch.Tensor], features: Optional[torch.Tensor], dense_graph: marius._data.DENSEGraph, train: bool = True) -> torch.Tensor"
  },
  {
    "path": "docs/python_api/nn/encoders/index.rst",
    "content": "\nencoders\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    general_encoder\n"
  },
  {
    "path": "docs/python_api/nn/index.rst",
    "content": "\nmarius.nn\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    decoders/index\n    encoders/index\n    layers/index\n    activation\n    initialization\n    loss\n    model\n    optim\n"
  },
  {
    "path": "docs/python_api/nn/initialization.rst",
    "content": "Initialization\n=======================================\n\n.. autofunction:: marius.nn.compute_fans\n\n.. function:: marius.nn.glorot_uniform(shape: List[int], device: object, dtype: object, fans: Tuple[int, int] = (- 1, - 1)) -> torch.Tensor\n\n.. function:: marius.nn.glorot_normal(shape: List[int], device: object, dtype: object, fans: Tuple[int, int] = (- 1, - 1)) -> torch.Tensor\n\n.. function:: marius.nn.constant_init(shape: List[int], constant: float = 0, device: object, dtype: object) -> torch.Tensor\n\n.. function:: marius.nn.uniform_init(shape: List[int], scale_factor: float = 0.001, device: object, dtype: object) -> torch.Tensor\n\n.. function:: marius.nn.normal_init(shape: List[int], mean: float = 0, std: float = 1, device: object, dtype: object) -> torch.Tensor\n\n.. function:: marius.nn.initialize_tensor(init_config: marius._config.InitConfig, shape: List[int], device: object, dtype: object, fans: Tuple[int, int] = (- 1, - 1)) -> torch.Tensor\n\n.. function:: marius.nn.initialize_subtensor(init_config: marius._config.InitConfig, sub_shape: List[int], full_shape: List[int], device: object, dtype: object, fans: Tuple[int, int] = (- 1, - 1)) -> torch.Tensor\n"
  },
  {
    "path": "docs/python_api/nn/layers/embedding.rst",
    "content": "EmbeddingLayer\n=======================================\n\n.. autoclass:: marius.nn.layers.EmbeddingLayer\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, forward, init_embeddings\n\n    .. method:: __init__(self: marius._nn.layers.EmbeddingLayer, layer_config: marius._config.LayerConfig, device: torch.device, offset: int = 0) -> None\n    \n    .. method:: __init__(self: marius._nn.layers.EmbeddingLayer, dimension: int, device: torch.device, init: marius._config.InitConfig, bias: bool = False, bias_init: marius._config.InitConfig, activation: str = ‘none’, offset: int = 0) -> None\n    \n    .. method:: forward(self: marius._nn.layers.EmbeddingLayer, input: torch.Tensor) -> torch.Tensor\n    \n    .. method:: init_embeddings(self: marius._nn.layers.EmbeddingLayer, num_nodes: int) -> torch.Tensor"
  },
  {
    "path": "docs/python_api/nn/layers/feature.rst",
    "content": "FeatureLayer\n=======================================\n\n.. autoclass:: marius.nn.layers.FeatureLayer\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, forward\n\n    .. method:: __init__(self: marius._nn.layers.FeatureLayer, layer_config: marius._config.LayerConfig, device: torch.device, offset: int = 0) -> None\n    \n    .. method:: __init__(self: marius._nn.layers.FeatureLayer, dimension: int, device: torch.device, bias: bool = False, bias_init: marius._config.InitConfig, activation: str = ‘none’, offset: int = 0) -> None\n\n    .. method:: forward(self: marius._nn.layers.EmbeddingLayer, input: torch.Tensor) -> torch.Tensor"
  },
  {
    "path": "docs/python_api/nn/layers/gnn.rst",
    "content": "GNNLayer\n=======================================\n\n.. autoclass:: marius.nn.layers.GNNLayer\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, forward\n\n    .. method:: __init__()\n    \n    .. method:: forward(self: marius._nn.layers.GNNLayer, inputs: torch.Tensor, dense_graph: marius._data.DENSEGraph, train: bool) -> torch.Tensor\n\n.. autoclass:: marius.nn.layers.GraphSageLayer\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, forward\n\n    .. method:: __init__(self: marius._nn.layers.GraphSageLayer, layer_config: marius._config.LayerConfig, device: torch.device) -> None\n    \n    .. method:: __init__(self: marius._nn.layers.GraphSageLayer, input_dim: int, output_dim: int, device: Optional[torch.device] = None, aggregator: str = ‘mean’, init: marius._config.InitConfig, bias: bool = False, bias_init: marius._config.InitConfig, activation: str = ‘none’) -> None\n    \n    .. method:: forward(self: marius._nn.layers.GraphSageLayer, inputs: torch.Tensor, dense_graph: marius._data.DENSEGraph, train: bool = True) -> torch.Tensor\n\n.. autoclass:: marius.nn.layers.GATLayer\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, forward\n\n    .. method:: __init__(self: marius._nn.layers.GATLayer, layer_config: marius._config.LayerConfig, device: torch.device) -> None\n    \n    .. method:: __init__(self: marius._nn.layers.GATLayer, input_dim: int, output_dim: int, device: Optional[torch.device] = None, num_heads: int = 10, average_heads: bool = False, input_dropout: float = 0.0, attention_dropout: float = 0.0, negative_slope: float = 0.2, init: marius._config.InitConfig, bias: bool = False, bias_init: marius._config.InitConfig, activation: str = ‘none’) -> None\n    \n    .. method:: forward(self: marius._nn.layers.GATLayer, inputs: torch.Tensor, dense_graph: marius._data.DENSEGraph, train: bool = True) -> torch.Tensor\n\n.. 
autoclass:: marius.nn.layers.GCNLayer\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, forward\n\n    .. method:: __init__(self: marius._nn.layers.GCNLayer, layer_config: marius._config.LayerConfig, device: torch.device) -> None\n    \n    .. method:: __init__(self: marius._nn.layers.GCNLayer, input_dim: int, output_dim: int, device: Optional[torch.device] = None, init: marius._config.InitConfig, bias: bool = False, bias_init: marius._config.InitConfig, activation: str = ‘none’) -> None\n\n    .. method:: forward(self: marius._nn.layers.GCNLayer, inputs: torch.Tensor, dense_graph: marius._data.DENSEGraph, train: bool = True) -> torch.Tensor"
  },
  {
    "path": "docs/python_api/nn/layers/index.rst",
    "content": "\nlayers\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    embedding\n    feature\n    gnn\n    layer\n    reduction\n"
  },
  {
    "path": "docs/python_api/nn/layers/layer.rst",
    "content": "Layer\n********************\n\n.. autoclass:: marius.nn.layers.Layer\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, post_hook\n    \n    .. method:: __init__()\n    \n    .. method:: post_hook(self: marius._nn.layers.Layer, inputs: torch.Tensor) -> torch.Tensor"
  },
  {
    "path": "docs/python_api/nn/layers/reduction.rst",
    "content": "ReductionLayer\n=======================================\n\n.. autoclass:: marius.nn.layers.ReductionLayer\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, forward\n\n    .. method:: __init__()\n    \n    .. method:: forward(self: marius._nn.layers.ReductionLayer, inputs: List[torch.Tensor]) -> torch.Tensor\n\n.. autoclass:: marius.nn.layers.ConcatReduction\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, forward\n\n    .. method:: __init__(self: marius._nn.layers.ConcatReduction, layer_config: marius._config.LayerConfig, device: torch.device) -> None\n    \n    .. method:: __init__(self: marius._nn.layers.ConcatReduction, input_dim: int, output_dim: int, device: Optional[torch.device] = None, init: marius._config.InitConfig, bias: bool = False, bias_init: marius._config.InitConfig, activation: str = ‘none’) -> None\n    \n    .. method:: forward(self: marius._nn.layers.ConcatReduction, inputs: List[torch.Tensor]) -> torch.Tensor\n\n.. autoclass:: marius.nn.layers.LinearReduction\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, forward\n\n    .. method:: __init__(self: marius._nn.layers.LinearReduction, layer_config: marius._config.LayerConfig, device: torch.device) -> None\n    \n    .. method:: __init__(self: marius._nn.layers.LinearReduction, input_dim: int, output_dim: int, device: Optional[torch.device] = None, init: marius._config.InitConfig, bias: bool = False, bias_init: marius._config.InitConfig, activation: str = ‘none’) -> None\n    \n    .. method:: forward(self: marius._nn.layers.LinearReduction, inputs: List[torch.Tensor]) -> torch.Tensor"
  },
  {
    "path": "docs/python_api/nn/loss.rst",
    "content": "Loss Functions\n=======================================\n\n.. autoclass:: marius.nn.LossFunction\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.nn.SoftmaxCrossEntropy\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.nn.RankingLoss\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.nn.CrossEntropyLoss\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.nn.BCEAfterSigmoidLoss\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.nn.BCEWithLogitsLoss\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.nn.MSELoss\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.nn.SoftPlusLoss\n    :members:\n    :undoc-members:\n    :special-members: __init__\n"
  },
  {
    "path": "docs/python_api/nn/model.rst",
    "content": "Model\n********************\n\n.. autoclass:: marius.nn.Model\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, broadcast, forward_lp, forward_nc\n\n    .. method:: __init__(self: marius._nn.Model, arg0: GeneralEncoder, arg1: Decoder, arg2: marius._nn.LossFunction, arg3: Reporter) -> None\n    \n    .. method:: __init__(self: marius._nn.Model, encoder: GeneralEncoder, decoder: Decoder, loss: marius._nn.LossFunction = None, reporter: Reporter = None, sparse_lr: float = 0.1) -> None\n    \n    .. method:: broadcast(self: marius._nn.Model, devices: List[torch.device]) -> None\n    \n    .. method:: forward_lp(self: marius._nn.Model, batch: marius._data.Batch, train: bool) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]\n    \n    .. method:: forward_nc(self: marius._nn.Model, node_embeddings: Optional[torch.Tensor], node_features: Optional[torch.Tensor], dense_graph: marius._data.DENSEGraph, train: bool) -> torch.Tensor"
  },
  {
    "path": "docs/python_api/nn/optim.rst",
    "content": "Optimizers\n********************\n\n.. autoclass:: marius.nn.Optimizer\n    :members:\n    :undoc-members:\n    :exclude-members: __init__\n\n    .. method:: __init__()\n\n.. autoclass:: marius.nn.SGDOptimizer\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.nn.AdagradOptimizer\n    :members:\n    :undoc-members:\n    :exclude-members: __init__\n\n    .. method:: __init__(self: marius._nn.AdagradOptimizer, param_dict: torch._C.cpp.OrderedTensorDict, options: marius._config.AdagradOptions) -> None\n    \n    .. method:: __init__(self: marius._nn.AdagradOptimizer, param_dict: torch._C.cpp.OrderedTensorDict, lr: float = 0.1, eps: float = 1e-10, lr_decay: float = 0, init_value: float = 0, weight_decay: float = 0) -> None\n\n.. autoclass:: marius.nn.AdamOptimizer\n    :members:\n    :undoc-members:\n    :exclude-members: __init__\n\n    .. method:: __init__(self: marius._nn.AdamOptimizer, param_dict: torch._C.cpp.OrderedTensorDict, options: marius._config.AdamOptions) -> None\n    \n    .. method:: __init__(self: marius._nn.AdamOptimizer, param_dict: torch._C.cpp.OrderedTensorDict, lr: float = 0.1, eps: float = 1e-08, beta_1: float = 0.9, beta_2: float = 0.999, weight_decay: float = 0, amsgrad: bool = False) -> None"
  },
  {
    "path": "docs/python_api/pipeline/evaluator.rst",
    "content": "Evaluator\n=======================================\n\n.. autoclass:: marius.pipeline.Evaluator\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.pipeline.SynchronousEvaluator\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.pipeline.PipelineEvaluator\n    :members:\n    :undoc-members:\n    :special-members: __init__"
  },
  {
    "path": "docs/python_api/pipeline/graph_encoder.rst",
    "content": "GraphEncoder\n=======================================\n\n.. autoclass:: marius.pipeline.GraphEncoder\n    :members:\n    :undoc-members:\n    :exclude-members: __init__\n\n    .. method:: __init__()\n\n.. autoclass:: marius.pipeline.SynchronousEncoder\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.pipeline.PipelineEncoder\n    :members:\n    :undoc-members:\n    :special-members: __init__"
  },
  {
    "path": "docs/python_api/pipeline/index.rst",
    "content": "\nmarius.pipeline\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    evaluator\n    trainer\n    graph_encoder"
  },
  {
    "path": "docs/python_api/pipeline/trainer.rst",
    "content": "Trainer\n=======================================\n\n.. autoclass:: marius.pipeline.Trainer\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.pipeline.SynchronousTrainer\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.pipeline.PipelineTrainer\n    :members:\n    :undoc-members:\n    :special-members: __init__"
  },
  {
    "path": "docs/python_api/reporting/index.rst",
    "content": "\nmarius.report\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    reporters"
  },
  {
    "path": "docs/python_api/reporting/metrics.rst",
    "content": ""
  },
  {
    "path": "docs/python_api/reporting/reporters.rst",
    "content": "\nReporter\n=======================================\n\n.. autoclass:: marius.report.Reporter\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.report.LinkPredictionReporter\n    :members:\n    :undoc-members:\n    :special-members: __init__\n    :exclude-members: add_result, compute_ranks\n\n    .. method:: add_result(self: marius._report.LinkPredictionReporter, pos_scores: torch.Tensor, neg_scores: torch.Tensor, edges: torch.Tensor = None) -> None\n    \n    .. method:: compute_ranks(self: marius._report.LinkPredictionReporter, pos_scores: torch.Tensor, neg_scores: torch.Tensor) -> torch.Tensor\n\n.. autoclass:: marius.report.NodeClassificationReporter\n    :members:\n    :undoc-members:\n    :special-members: __init__\n    :exclude-members: add_result\n\n    .. method:: add_result(self: marius._report.NodeClassificationReporter, y_true: torch.Tensor, y_pred: torch.Tensor, node_ids: torch.Tensor = None) -> None\n\n.. autoclass:: marius.report.ProgressReporter\n    :members:\n    :undoc-members:\n    :special-members: __init__\n"
  },
  {
    "path": "docs/python_api/storage/graph_storage.rst",
    "content": "GraphStorage\n=======================================\n\n.. autoclass:: marius.storage.GraphModelStorage\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, getNodeEmbeddingState, getNodeEmbeddingStateRange, getNodeEmbeddings, getNodeEmbeddingsRange, getNodeFeatures, getNodeFeaturesRange,  getNodeIdsRange, getNodeLabels, getNodeLabelsRange, getRandomNodeIds, get_edges, get_edges_range, init_subgraph, setActiveEdges, setActiveNodes, setBufferOrdering, updateAddNodeEmbeddingState, updateAddNodeEmbeddings, updatePutNodeEmbeddingState, updatePutNodeEmbeddings\n\n    .. method:: __init__(self: marius._storage.GraphModelStorage, storage_ptrs: marius._storage.GraphModelStoragePtrs, storage_config: marius._config.StorageConfig) -> None\n    \n    .. method:: __init__(self: marius._storage.GraphModelStorage, edges: marius._storage.Storage, nodes: marius._storage.Storage = None, node_features: marius._storage.Storage = None, node_embeddings: marius._storage.Storage = None, node_optim_state: marius._storage.Storage = None, node_labels: marius._storage.Storage = None, filter_edges: List[marius._storage.Storage] = [], train: bool = False, prefetch: bool = False) -> None\n    \n    .. method:: getNodeEmbeddingState(self: marius._storage.GraphModelStorage, indices: torch.Tensor) -> torch.Tensor\n    \n    .. method:: getNodeEmbeddingStateRange(self: marius._storage.GraphModelStorage, start: int, size: int) -> torch.Tensor\n    \n    .. method:: getNodeEmbeddings(self: marius._storage.GraphModelStorage, indices: torch.Tensor) -> torch.Tensor\n    \n    .. method:: getNodeEmbeddingsRange(self: marius._storage.GraphModelStorage, start: int, size: int) -> torch.Tensor\n    \n    .. method:: getNodeFeatures(self: marius._storage.GraphModelStorage, indices: torch.Tensor) -> torch.Tensor\n    \n    .. method:: getNodeFeaturesRange(self: marius._storage.GraphModelStorage, start: int, size: int) -> torch.Tensor\n    \n    .. 
method:: getNodeIdsRange(self: marius._storage.GraphModelStorage, start: int, size: int) -> torch.Tensor\n    \n    .. method:: getNodeLabels(self: marius._storage.GraphModelStorage, indices: torch.Tensor) -> torch.Tensor\n    \n    .. method:: getNodeLabelsRange(self: marius._storage.GraphModelStorage, start: int, size: int) -> torch.Tensor\n\n    .. method:: getRandomNodeIds(self: marius._storage.GraphModelStorage, size: int) -> torch.Tensor\n    \n    .. method:: get_edges(self: marius._storage.GraphModelStorage, indices: torch.Tensor) -> torch.Tensor\n    \n    .. method:: get_edges_range(self: marius._storage.GraphModelStorage, start: int, size: int) -> torch.Tensor\n    \n    .. method:: init_subgraph(self: marius._storage.GraphModelStorage, buffer_state: torch.Tensor) -> None\n    \n    .. method:: setActiveEdges(self: marius._storage.GraphModelStorage, active_edges: torch.Tensor) -> None\n    \n    .. method:: setActiveNodes(self: marius._storage.GraphModelStorage, node_ids: torch.Tensor) -> None\n    \n    .. method:: setBufferOrdering(self: marius._storage.GraphModelStorage, buffer_states: List[torch.Tensor]) -> None\n    \n    .. method:: updateAddNodeEmbeddingState(self: marius._storage.GraphModelStorage, indices: torch.Tensor, values: torch.Tensor) -> None\n    \n    .. method:: updateAddNodeEmbeddings(self: marius._storage.GraphModelStorage, indices: torch.Tensor, values: torch.Tensor) -> None\n    \n    .. method:: updatePutNodeEmbeddingState(self: marius._storage.GraphModelStorage, indices: torch.Tensor, state: torch.Tensor) -> None\n    \n    .. method:: updatePutNodeEmbeddings(self: marius._storage.GraphModelStorage, indices: torch.Tensor, embeddings: torch.Tensor) -> None"
  },
  {
    "path": "docs/python_api/storage/index.rst",
    "content": "\nmarius.storage\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    graph_storage\n    storage\n"
  },
  {
    "path": "docs/python_api/storage/storage.rst",
    "content": "Storage\n=======================================\n\n.. function:: marius.storage.tensor_from_file(filename: str, shape: List[int], dtype: torch.dtype, device: torch.device) -> torch.Tensor\n\n.. autoclass:: marius.storage.Storage\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, indexAdd, indexPut, indexRead, range, rangePut\n\n    .. method:: __init__()\n\n    .. method:: indexAdd(self: marius._storage.Storage, indices: torch.Tensor, values: torch.Tensor) -> None\n\n    .. method:: indexPut(self: marius._storage.Storage, indices: torch.Tensor, values: torch.Tensor) -> None\n\n    .. method:: indexRead(self: marius._storage.Storage, indices: torch.Tensor) -> torch.Tensor\n\n    .. method:: range(self: marius._storage.Storage, offset: int, n: int) -> torch.Tensor\n\n    .. method:: rangePut(self: marius._storage.Storage, offset: int, n: int, values: torch.Tensor) -> None\n\n.. autoclass:: marius.storage.FlatFile\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, append\n\n    .. method:: __init__(self: marius._storage.FlatFile, filename: str, shape: List[int], dtype: torch.dtype, alloc: bool = False) -> None\n    \n    .. method:: __init__(self: marius._storage.FlatFile, filename: str, data: torch.Tensor) -> None\n    \n    .. method:: __init__(self: marius._storage.FlatFile, filename: str, dtype: torch.dtype) -> None\n    \n    .. method:: append(self: marius._storage.FlatFile, values: torch.Tensor) -> None\n\n.. autoclass:: marius.storage.PartitionBufferStorage\n    :members:\n    :undoc-members:\n    :exclude-members: __init__, getGlobalToLocalMap, setBufferOrdering\n\n    .. method:: __init__(self: marius._storage.PartitionBufferStorage, filename: str, dim0_size: int, dim1_size: int, options: marius._config.PartitionBufferOptions) -> None\n\n    .. 
method:: __init__(self: marius._storage.PartitionBufferStorage, filename: str, data: torch.Tensor, options: marius._config.PartitionBufferOptions) -> None\n    \n    .. method:: __init__(self: marius._storage.PartitionBufferStorage, filename: str, options: marius._config.PartitionBufferOptions) -> None\n    \n    .. method:: getGlobalToLocalMap(self: marius._storage.PartitionBufferStorage, get_current: bool = True) -> torch.Tensor\n\n    .. method:: setBufferOrdering(self: marius._storage.PartitionBufferStorage, buffer_states: List[torch.Tensor]) -> None\n\n.. autoclass:: marius.storage.InMemory\n    :members:\n    :undoc-members:\n    :exclude-members: __init__\n\n    .. method:: __init__(self: marius._storage.InMemory, filename: str, shape: List[int], dtype: torch.dtype, device: torch.device) -> None\n\n    .. method:: __init__(self: marius._storage.InMemory, filename: str, data: torch.Tensor, device: torch.device) -> None\n    \n    .. method:: __init__(self: marius._storage.InMemory, filename: str, dtype: torch.dtype) -> None\n"
  },
  {
    "path": "docs/python_api/tools/configuration/constants.rst",
    "content": "constants\n=======================================\n\n.. automodule:: marius.tools.configuration.constants\n    :members:\n    :undoc-members:"
  },
  {
    "path": "docs/python_api/tools/configuration/datatypes.rst",
    "content": "datatypes\n=======================================\n\n.. automodule:: marius.tools.configuration.datatypes\n    :members:\n    :undoc-members:\n"
  },
  {
    "path": "docs/python_api/tools/configuration/index.rst",
    "content": "\nconfiguration\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    constants\n    datatypes\n    marius_config\n"
  },
  {
    "path": "docs/python_api/tools/configuration/marius_config.rst",
    "content": "marius_config\n=======================================\n\n.. automodule:: marius.tools.configuration.marius_config\n    :members:\n    :undoc-members:\n"
  },
  {
    "path": "docs/python_api/tools/index.rst",
    "content": "\nmarius.tools\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    configuration/index\n    postprocess/index\n    predict/index\n    preprocess/index\n"
  },
  {
    "path": "docs/python_api/tools/preprocess/converters/index.rst",
    "content": "\nconverters\n********************\n\n.. automodule:: marius.tools.preprocess.converters\n    :members:\n    :undoc-members:\n    :imported-members:\n\n"
  },
  {
    "path": "docs/python_api/tools/preprocess/datasets/index.rst",
    "content": "\ndatasets\n********************\n\n.. autoclass:: marius.tools.preprocess.dataset.Dataset\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.fb15k.FB15K\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.fb15k_237.FB15K237\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.freebase86m.Freebase86m\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.friendster.Friendster\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.livejournal.Livejournal\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.ogb_mag240m.OGBMag240M\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.ogb_wikikg90mv2.OGBWikiKG90Mv2\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.ogbl_citation2.OGBLCitation2\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.ogbl_ppa.OGBLPpa\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.ogbl_wikikg2.OGBLWikiKG2\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.ogbn_arxiv.OGBNArxiv\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.ogbn_papers100m.OGBNPapers100M\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. 
autoclass:: marius.tools.preprocess.datasets.ogbn_products.OGBNProducts\n    :members:\n    :undoc-members:\n    :special-members: __init__\n\n.. autoclass:: marius.tools.preprocess.datasets.twitter.Twitter\n    :members:\n    :undoc-members:\n    :special-members: __init__\n"
  },
  {
    "path": "docs/python_api/tools/preprocess/index.rst",
    "content": "\npreprocess\n********************\n\n.. toctree::\n    :glob:\n    :maxdepth: 2\n\n    converters/index\n    datasets/index\n    partitioners/index\n    readers/index\n    writers/index\n    custom\n    dataset\n    utils\n"
  },
  {
    "path": "docs/python_api/tools/preprocess/partitioners/index.rst",
    "content": "\npartitioners\n********************\n\n.. automodule:: marius.tools.preprocess.converters.partitioners\n    :members:\n    :undoc-members:\n    :imported-members:\n"
  },
  {
    "path": "docs/python_api/tools/preprocess/readers/index.rst",
    "content": "\nreaders\n********************\n\n.. automodule:: marius.tools.preprocess.converters.readers\n    :members:\n    :undoc-members:\n    :imported-members:"
  },
  {
    "path": "docs/python_api/tools/preprocess/writers/index.rst",
    "content": "\nwriters\n********************\n\n.. automodule:: marius.tools.preprocess.converters.writers\n    :members:\n    :undoc-members:\n    :imported-members:"
  },
  {
    "path": "docs/quickstart.rst",
    "content": ".. _quickstart\n\nGetting Started\n=========================\n\nBuild and Install\n##############################\n\nRequirements\n****************************\n* CUDA >= 10.1\n* CuDNN >= 7\n* pytorch >= 1.8\n* python >= 3.7\n* GCC >= 7 (On Linux) or Clang 12.0 (On MacOS)\n* cmake >= 3.12\n* make >= 3.8\n\nPip installation\n****************************\n\nFirst check that the required software is installed (see above).\n\n    .. code-block:: bash\n\n        git clone https://github.com/marius-team/marius.git\n        pip3 install .\n\nThe Python API can be accessed with ``import marius``.\n\nThe following commands will be installed:\n- marius_train: Train models using configuration files and the command line\n- marius_eval: Command line model evaluation\n- marius_preprocess: Built-in dataset downloading and preprocessing\n- marius_predict: Batch inference tool for link prediction or node classification\n\n\nCMake build (No Python API)\n****************************\n\nThis does not build the Python API, but only the C++ sources and marius_train executable.\n\n    .. code-block:: bash\n\n        git clone https://github.com/marius-team/marius.git\n\n        # installs only marius.tools (required)\n        MARIUS_NO_BINDINGS=1 pip3 install .\n\n        mkdir build\n        cd build\n        cmake ../ -DUSE_CUDA=1\n        make marius_train -j\n        cd ..\n\n        # run with build/marius_train config.yaml\n\n\nConfiguration Interface\n##############################\n\n.. _config_examples_link: http://marius-project.org/marius/examples/config/index.html\n.. _schema_link: http://marius-project.org/marius/config_interface/full_schema.html\n\nSee configuration `examples <config_examples_link_>`_ for detailed examples and the `configuration schema <schema_link_>`_ for all options.\n\nPreprocess & Configuration\n****************************\n\n\nPreprocess dataset: this downloads and preprocesses the dataset into the arxiv_example/ directory\n\n\n    .. 
code-block:: bash\n\n        marius_preprocess --dataset ogbn_arxiv --output_dir arxiv_example/\n\n\n\n\nDefine configuration file: 1-layer GraphSage GNN\n\n    .. code-block:: yaml\n\n        model:\n          learning_task: NODE_CLASSIFICATION\n          encoder:\n            train_neighbor_sampling:\n              - type: ALL\n            layers:\n              - - type: FEATURE\n                  output_dim: 128\n              - - type: GNN\n                  options:\n                    type: GRAPH_SAGE\n                    aggregator: MEAN\n                  input_dim: 128\n                  output_dim: 40\n          decoder:\n            type: NODE\n          loss:\n            type: CROSS_ENTROPY\n            options:\n              reduction: SUM\n          dense_optimizer:\n            type: ADAM\n            options:\n              learning_rate: 0.01\n        storage:\n          device_type: cuda\n          dataset:\n            dataset_dir: arxiv_example/\n            num_edges: 1166243\n            num_train: 90941\n            num_nodes: 169343\n            num_relations: 1\n            num_valid: 29799\n            num_test: 48603\n            node_feature_dim: 128\n            num_classes: 40\n          edges:\n            type: DEVICE_MEMORY\n            options:\n              dtype: int\n          features:\n            type: DEVICE_MEMORY\n            options:\n              dtype: float\n        training:\n          batch_size: 1000\n          num_epochs: 10\n          pipeline:\n            sync: true\n        evaluation:\n          batch_size: 1000\n          pipeline:\n            sync: true\n\n\nTraining\n****************************\n\n\n    Train the model described in the configuration file for 10 epochs.\n\n    .. code-block:: bash\n\n        marius_train arxiv_config.yaml\n\n    The output will look similar to:\n\n    .. 
code-block:: text\n\n        [04/08/22 01:12:10.693] ################ Starting training epoch 1 ################\n        [04/08/22 01:12:10.721] Nodes processed: [10000/90941], 11.00%\n        [04/08/22 01:12:10.741] Nodes processed: [20000/90941], 21.99%\n        [04/08/22 01:12:10.762] Nodes processed: [30000/90941], 32.99%\n        [04/08/22 01:12:10.800] Nodes processed: [40000/90941], 43.98%\n        [04/08/22 01:12:10.820] Nodes processed: [50000/90941], 54.98%\n        [04/08/22 01:12:10.840] Nodes processed: [60000/90941], 65.98%\n        [04/08/22 01:12:10.863] Nodes processed: [70000/90941], 76.97%\n        [04/08/22 01:12:10.883] Nodes processed: [80000/90941], 87.97%\n        [04/08/22 01:12:10.916] Nodes processed: [90000/90941], 98.97%\n        [04/08/22 01:12:10.918] Nodes processed: [90941/90941], 100.00%\n        [04/08/22 01:12:10.918] ################ Finished training epoch 1 ################\n        [04/08/22 01:12:10.918] Epoch Runtime: 224ms\n        [04/08/22 01:12:10.918] Nodes per Second: 405986.6\n        [04/08/22 01:12:10.918] Evaluating validation set\n        [04/08/22 01:12:11.005]\n        =================================\n        Node Classification: 29799 nodes evaluated\n        Accuracy: 58.669754%\n        =================================\n        [04/08/22 01:12:11.005] Evaluating test set\n        [04/08/22 01:12:11.133]\n        =================================\n        Node Classification: 48603 nodes evaluated\n        Accuracy: 57.936753%\n        =================================\n        ...\n\n\n\nInference\n****************************\n\n    Evaluate the test set for the dataset after 10 epochs have completed.\n\n    .. code-block:: bash\n\n        marius_eval arxiv_config.yaml\n\n\n    Output:\n\n    .. 
code-block:: text\n\n        [04/08/22 02:06:25.330] Evaluating test set\n        [04/08/22 02:06:25.585]\n        =================================\n        Node Classification: 48603 nodes evaluated\n        Accuracy: 64.963068%\n        =================================\n\n\nPython API\n##############################\n\n\n.. _python_examples_link: http://marius-project.org/marius/examples/python/index.html\n\n.. _python_api_link: http://marius-project.org/marius/python_api/index.html\n\nSee the `Python examples <python_examples_link_>`_ and `API docs <python_api_link_>`_ (under construction) for more details.\n\nPreprocess Dataset and load graph data\n**************************************\n\nImport marius and preprocess ogbn_arxiv for node classification.\n\n    .. code-block:: python\n\n        import marius as m\n        import torch\n        from marius.tools.preprocess.datasets.ogbn_arxiv import OGBNArxiv\n\n        # initialize and preprocess dataset\n        dataset = OGBNArxiv(\"arxiv_example/\")\n        dataset.download()\n        dataset_stats = dataset.preprocess()\n\nLoad dataset tensors into GPU memory\n\n    .. 
code-block:: python\n\n        device = torch.device(\"cuda\")\n\n        edges = m.storage.tensor_from_file(filename=dataset.edge_list_file,\n                                           shape=[dataset_stats.num_edges, -1],\n                                           dtype=torch.int32,\n                                           device=device)\n        train_nodes = m.storage.tensor_from_file(filename=dataset.train_nodes_file,\n                                                 shape=[dataset_stats.num_train],\n                                                 dtype=torch.int32,\n                                                 device=device)\n        test_nodes = m.storage.tensor_from_file(filename=dataset.test_nodes_file,\n                                                shape=[dataset_stats.num_test],\n                                                dtype=torch.int32,\n                                                device=device)\n        features = m.storage.tensor_from_file(filename=dataset.node_features_file,\n                                              shape=[dataset_stats.num_nodes, -1],\n                                              dtype=torch.float32,\n                                              device=device)\n        labels = m.storage.tensor_from_file(filename=dataset.node_labels_file,\n                                            shape=[dataset_stats.num_nodes],\n                                            dtype=torch.int32,\n                                            device=device)\n\nDefine Model\n****************************\n\nDefine single layer graph sage model\n\n    .. 
code-block:: python\n\n        feature_dim = dataset_stats.node_feature_dim\n        num_classes = dataset_stats.num_classes\n\n        feature_layer = m.nn.layers.FeatureLayer(dimension=feature_dim,\n                                                 device=device)\n\n        graph_sage_layer = m.nn.layers.GraphSageLayer(input_dim=feature_dim,\n                                                      output_dim=num_classes,\n                                                      device=device)\n\n        encoder = m.encoders.GeneralEncoder(layers=[[feature_layer],\n                                                    [graph_sage_layer]])\n\n        decoder = m.nn.decoders.node.NoOpNodeDecoder()\n        loss = m.nn.CrossEntropyLoss(reduction=\"sum\")\n\n        reporter = m.report.NodeClassificationReporter()\n        reporter.add_metric(m.report.CategoricalAccuracy())\n\n        model = m.nn.Model(encoder, decoder, loss, reporter)\n        model.optimizers = [m.nn.AdamOptimizer(model.named_parameters(), lr=.01)]\n\n        nbr_sampler = m.data.samplers.LayeredNeighborSampler(num_neighbors=[-1])\n\nTraining and Evaluation\n****************************\n\nSetup training and evaluation dataloaders\n\n    .. code-block:: python\n\n        train_loader = m.data.DataLoader(edges=edges,\n                                         batch_size=1000,\n                                         nodes=train_nodes,\n                                         nbr_sampler=nbr_sampler,\n                                         learning_task=\"nc\")\n\n        eval_loader = m.data.DataLoader(edges=edges,\n                                        batch_size=1000,\n                                        nodes=test_nodes,\n                                        nbr_sampler=nbr_sampler,\n                                        learning_task=\"nc\")\n\n\nTrain 10 epochs\n\n    .. 
code-block:: python\n\n        num_epochs = 10\n        for i in range(num_epochs):\n\n            train_loader.initializeBatches()\n            while train_loader.hasNextBatch():\n                batch = train_loader.getBatch()\n                model.train_batch(batch)\n\nEvaluate Test Set\n\n    .. code-block:: python\n\n        eval_loader.initializeBatches()\n        while eval_loader.hasNextBatch():\n            batch = eval_loader.getBatch()\n            model.evaluate_batch(batch)\n\n        model.reporter.report()\n\n"
  },
  {
    "path": "examples/configuration/custom_lp.yaml",
    "content": "model:\n  learning_task: LINK_PREDICTION\n  encoder:\n    layers:\n      - - type: EMBEDDING\n          output_dim: 50\n  decoder:\n    type: DISTMULT\n    options:\n      input_dim: 50\n  loss:\n    type: SOFTMAX_CE\n    options:\n      reduction: SUM\n  dense_optimizer:\n      type: ADAM\n      options:\n        learning_rate: 0.1\n  sparse_optimizer:\n      type: ADAGRAD\n      options:\n        learning_rate: 0.1\nstorage:\n  device_type: cuda\n  dataset:\n    dataset_dir: /marius-internal/datasets/custom_lp_example/\n  edges:\n    type: DEVICE_MEMORY\n  embeddings:\n    type: DEVICE_MEMORY\n  save_model: true\ntraining:\n  batch_size: 1000\n  negative_sampling:\n    num_chunks: 10\n    negatives_per_positive: 500\n    degree_fraction: 0.0\n    filtered: false\n  num_epochs: 10\n  pipeline:\n    sync: true\n  epochs_per_shuffle: 1\nevaluation:\n  batch_size: 1000\n  negative_sampling:\n    filtered: true\n  pipeline:\n    sync: true"
  },
  {
    "path": "examples/configuration/custom_nc.yaml",
    "content": "model:\n  learning_task: NODE_CLASSIFICATION\n  encoder:\n    train_neighbor_sampling:\n      - type: ALL\n      - type: ALL\n      - type: ALL\n    layers:\n      - - type: FEATURE\n          output_dim: 1433\n          bias: true\n      - - type: GNN\n          options:\n            type: GRAPH_SAGE\n            aggregator: MEAN\n          input_dim: 1433\n          output_dim: 1433\n          bias: true\n      - - type: GNN\n          options:\n            type: GRAPH_SAGE\n            aggregator: MEAN\n          input_dim: 1433\n          output_dim: 1433\n          bias: true\n      - - type: GNN\n          options:\n            type: GRAPH_SAGE\n            aggregator: MEAN\n          input_dim: 1433\n          output_dim: 40\n          bias: true\n  decoder:\n    type: NODE\n  loss:\n    type: CROSS_ENTROPY\n    options:\n      reduction: SUM\n  dense_optimizer:\n    type: ADAM\n    options:\n      learning_rate: 0.01\nstorage:\n  device_type: cuda\n  dataset: \n    dataset_dir: datasets/custom_nc_example/cora/\n  edges:\n    type: DEVICE_MEMORY\n    options:\n      dtype: int\n  features:\n    type: DEVICE_MEMORY\n    options:\n      dtype: float\ntraining:\n  batch_size: 1000\n  num_epochs: 10\n  pipeline:\n    sync: true\nevaluation:\n  batch_size: 1000\n  pipeline:\n    sync: true "
  },
  {
    "path": "examples/configuration/fb15k_237.yaml",
    "content": "model:\n  learning_task: LINK_PREDICTION\n  encoder:\n    layers:\n      - - type: EMBEDDING\n          output_dim: 50\n  decoder:\n    type: DISTMULT\n    options:\n      input_dim: 50\n  loss:\n    type: SOFTMAX_CE\n    options:\n      reduction: SUM\n  dense_optimizer:\n      type: ADAM\n      options:\n        learning_rate: 0.1\n  sparse_optimizer:\n      type: ADAGRAD\n      options:\n        learning_rate: 0.1\nstorage:\n  device_type: cuda\n  dataset:\n    dataset_dir: ./datasets/fb15k_237_example/\n  edges:\n    type: DEVICE_MEMORY\n  embeddings:\n    type: DEVICE_MEMORY\n  save_model: true\ntraining:\n  batch_size: 1000\n  negative_sampling:\n    num_chunks: 10\n    negatives_per_positive: 500\n    degree_fraction: 0.0\n    filtered: false\n  num_epochs: 10\n  pipeline:\n    sync: true\n  epochs_per_shuffle: 1\nevaluation:\n  batch_size: 1000\n  negative_sampling:\n    filtered: true\n  pipeline:\n    sync: true"
  },
  {
    "path": "examples/configuration/ogbn_arxiv.yaml",
    "content": "model:\n  learning_task: NODE_CLASSIFICATION\n  encoder:\n    train_neighbor_sampling:\n      - type: ALL\n      - type: ALL\n      - type: ALL\n    layers:\n      - - type: FEATURE\n          output_dim: 128\n          bias: true\n      - - type: GNN\n          options:\n            type: GRAPH_SAGE\n            aggregator: MEAN\n          input_dim: 128\n          output_dim: 128\n          bias: true\n      - - type: GNN\n          options:\n            type: GRAPH_SAGE\n            aggregator: MEAN\n          input_dim: 128\n          output_dim: 128\n          bias: true\n      - - type: GNN\n          options:\n            type: GRAPH_SAGE\n            aggregator: MEAN\n          input_dim: 128\n          output_dim: 40\n          bias: true\n  decoder:\n    type: NODE\n  loss:\n    type: CROSS_ENTROPY\n    options:\n      reduction: SUM\n  dense_optimizer:\n    type: ADAM\n    options:\n      learning_rate: 0.01\nstorage:\n  device_type: cuda\n  dataset: \n    dataset_dir: datasets/ogbn_arxiv_example/\n  edges:\n    type: DEVICE_MEMORY\n    options:\n      dtype: int\n  features:\n    type: DEVICE_MEMORY\n    options:\n      dtype: float\ntraining:\n  batch_size: 1000\n  num_epochs: 10\n  pipeline:\n    sync: true\nevaluation:\n  batch_size: 1000\n  pipeline:\n    sync: true"
  },
  {
    "path": "examples/configuration/sakila.yaml",
    "content": "model:\n  learning_task: LINK_PREDICTION # set the learning task to link prediction\n  encoder:\n    layers:\n      - - type: EMBEDDING # set the encoder to be an embedding table with 50-dimensional embeddings\n          output_dim: 50\n  decoder:\n    type: DISTMULT # set the decoder to DistMult\n    options:\n      input_dim: 50\n  loss:\n    type: SOFTMAX_CE\n    options:\n      reduction: SUM\n  dense_optimizer: # optimizer to use for dense model parameters. In this case these are the DistMult relation (edge-type) embeddings\n      type: ADAM\n      options:\n        learning_rate: 0.1\n  sparse_optimizer: # optimizer to use for node embedding table\n      type: ADAGRAD\n      options:\n        learning_rate: 0.1\nstorage:\n  device_type: cuda\n  dataset:\n    dataset_dir: /marius/datasets/sakila/\n  edges:\n    type: DEVICE_MEMORY\n  embeddings:\n    type: DEVICE_MEMORY\n  save_model: true\ntraining:\n  batch_size: 1000\n  negative_sampling:\n    num_chunks: 10\n    negatives_per_positive: 500\n    degree_fraction: 0.0\n    filtered: false\n  num_epochs: 10\n  pipeline:\n    sync: true\n  epochs_per_shuffle: 1\nevaluation:\n  batch_size: 1000\n  negative_sampling:\n    filtered: true\n  pipeline:\n    sync: true\n"
  },
  {
    "path": "examples/db2graph/dockerfile",
    "content": "# setup for Marius\nFROM nvidia/cuda:11.4.2-cudnn8-devel-ubuntu20.04\n\nENV TZ=US\n\nRUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone\n\nRUN apt update\n\nRUN apt install -y g++ \\\n         make \\\n         wget \\\n         unzip \\\n         vim \\\n         git \\\n         python3-pip \\\n         build-essential \\\n         python-dev \\\n         libpq-dev\n\n# install cmake 3.20\nRUN wget https://github.com/Kitware/CMake/releases/download/v3.20.0/cmake-3.20.0-linux-x86_64.sh \\\n    && mkdir /opt/cmake \\\n    && sh cmake-3.20.0-linux-x86_64.sh --skip-license --prefix=/opt/cmake/ \\\n    && ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake\n\n# install pytorch\nRUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 && pip3 install docutils==0.17\n\n# install Marius\nRUN git clone https://github.com/marius-team/marius.git && cd marius && pip3 install .\n\n# install debconf-set-selections & systemctl\nRUN apt-get install debconf\n\nRUN apt-get install systemctl\n\n# install mysql-8\nRUN echo \"mysql-community-server mysql-community-server/root-pass password password\" | debconf-set-selections\n\nRUN echo \"mysql-community-server mysql-community-server/re-root-pass password password\" | debconf-set-selections\n\nRUN DEBIAN_FRONTEND=noninteractive apt-get -y install mysql-server\n\n# Adding a run.sh script to initialize things\nCOPY run.sh /usr/local/bin/run.sh\n\nRUN chmod +x usr/local/bin/run.sh\n"
  },
  {
    "path": "examples/db2graph/run.sh",
    "content": "#!/bin/sh\nsystemctl start mysql\nmkdir /db2graph_eg\nwget -O /db2graph_eg/sakila-db.tar.gz https://downloads.mysql.com/docs/sakila-db.tar.gz\ntar -xf /db2graph_eg/sakila-db.tar.gz -C /db2graph_eg/\nmysql -u root -p=password < /db2graph_eg/sakila-db/sakila-schema.sql\nmysql -u root -p=password < /db2graph_eg/sakila-db/sakila-data.sql\n## For creating a new user for accessing the data\nmysql -u root -p=password mysql -e \"CREATE USER 'sakila_user'@'localhost' IDENTIFIED BY 'sakila_password';\"\nmysql -u root -p=password mysql -e \"GRANT ALL PRIVILEGES ON *.* TO 'sakila_user'@'localhost';\"\nmysql -u root -p=password mysql -e \"FLUSH PRIVILEGES;\"\nservice mysql restart"
  },
  {
    "path": "examples/docker/README.md",
    "content": "# Docker Installation\n\nThe following instructions install the necessary dependencies and build\nthe system using Docker. We describe the installation for GPU-based machines, \nalthough Marius and MariusGNN can run on CPU only machines as well.\n\n### Build and Install Instructions ###\n1. Check if docker is installed (`which docker`) and if not install it: https://docs.docker.com/engine/install/\n2. Check if docker can access the GPUs by running `sudo docker run --gpus all nvidia/cuda:11.8.0-base-ubuntu22.04 nvidia-smi`. If this doesn't print the output of `nvidia-smi`, docker cannot access the CUDA driver on the host machine and you need to install the NVIDIA drivers for GPU support.\n3. Once the above succeeds, you should no longer need anything installed on the host machine.\n4. Create a docker image using the provided Dockerfile: `docker build -t image_name:image_tag gpu_ubuntu/.`\n5. Run the docker image: `docker run --gpus all -it image_name:image_tag bash`. It is often useful to link the current directory into the containers `/working_dir/` using the `-v` option (see below).\n6. Once the container is running, install and build the system:\n   ```\n   cd marius\n   pip3 install . --no-build-isolation\n   ```\n\n**Full List of Example Commands for GPU Installation**:\n\n```\nCURRENT_DIR=`pwd`\ngit clone https://github.com/marius-team/marius.git\ncd marius/examples/docker/\ndocker build -t marius:latest gpu_ubuntu/.\ndocker run --gpus all -it -v $CURRENT_DIR:/working_dir/ marius:latest bash\ncd marius\npip3 install . --no-build-isolation\n```\n\n**CPU Only Installation**: If your machine does not have a GPU, remove the `--gpus all` from the docker run command in the GPU installation instructions. \nYou can also optionally use the Dockerfile in `cpu_ubuntu/` rather than `gpu_ubuntu/`.\n\n**Installation Notes**:\n1. The installation requires Docker to have at least 8GB of memory to work with. 
This is generally satisfied by\n   default, but if not (often on Mac), the `docker build` command may throw an error code 137. See\n   [here](https://stackoverflow.com/questions/44533319/how-to-assign-more-memory-to-docker-container/44533437#44533437),\n   [here](https://stackoverflow.com/questions/34674325/error-build-process-returned-exit-code-137-during-docker-build-on-tutum), and\n   [here](https://stackoverflow.com/questions/57291806/docker-build-failed-after-pip-installed-requirements-with-exit-code-137)\n   for StackOverflow threads on how to increase Docker available memory or fix this issue. The `pip3 install .` command\n   may also cause Docker memory issues. Increase the memory available to Docker or decrease the number of threads used for building\n   MariusGNN (to decrease the number of threads change `-j{}` in line 45 of `setup.py` to `-j1` for example). One thread\n   should build with 8GB of memory but may take some time (~30mins)."
  },
  {
    "path": "examples/docker/cpu_ubuntu/dockerfile",
    "content": "FROM ubuntu:22.04\nRUN apt update\n\nRUN apt install -y g++ \\\n         make \\\n         wget \\\n         unzip \\\n         vim \\\n         git \\\n         dstat \\\n         python3-pip\n\n# install gcc-9\nRUN apt install -y software-properties-common\nRUN add-apt-repository -y ppa:ubuntu-toolchain-r/test\nRUN apt update\nRUN apt install -y gcc-9 g++-9\nRUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9\nRUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9\n\n# install cmake 3.20\nRUN wget https://github.com/Kitware/CMake/releases/download/v3.20.0/cmake-3.20.0-linux-x86_64.sh\nRUN mkdir /opt/cmake\nRUN sh cmake-3.20.0-linux-x86_64.sh --skip-license --prefix=/opt/cmake/\nRUN ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake\n\n# install pytorch\nRUN python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu\n\nRUN mkdir /working_dir\nWORKDIR /working_dir"
  },
  {
    "path": "examples/docker/gpu_ubuntu/dockerfile",
    "content": "FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04\nRUN apt update\n\nRUN apt install -y g++ \\\n         make \\\n         wget \\\n         unzip \\\n         vim \\\n         git \\\n         dstat \\\n         python3-pip\n\n# install gcc-9\nRUN apt install -y software-properties-common\nRUN add-apt-repository -y ppa:ubuntu-toolchain-r/test\nRUN apt update\nRUN apt install -y gcc-9 g++-9\nRUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9\nRUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9\n\n# install cmake 3.20\nRUN wget https://github.com/Kitware/CMake/releases/download/v3.20.0/cmake-3.20.0-linux-x86_64.sh\nRUN mkdir /opt/cmake\nRUN sh cmake-3.20.0-linux-x86_64.sh --skip-license --prefix=/opt/cmake/\nRUN ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake\n\n# install pytorch\nRUN python3 -m pip install torch==2.0.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html\n\nRUN mkdir /working_dir\nWORKDIR /working_dir"
  },
  {
    "path": "examples/preprocessing/custom_dataset.py",
    "content": ""
  },
  {
    "path": "examples/python/custom.py",
    "content": ""
  },
  {
    "path": "examples/python/custom_lp.py",
    "content": "from pathlib import Path\n\nfrom omegaconf import OmegaConf\n\nimport marius as m\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\nimport torch  # isort:skip\n\n\nclass MYDATASET(LinkPredictionDataset):\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"ogbn_arxiv\"\n        self.dataset_url = \"http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip\"\n\n    def download(self, overwrite=False):\n        self.input_train_edges_file = self.output_directory / Path(\"edge.csv\")\n\n        download = False\n        if not self.input_train_edges_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=False)\n\n            extract_file(self.output_directory / Path(\"arxiv/raw/edge.csv.gz\"))\n\n            (self.output_directory / Path(\"arxiv/raw/edge.csv\")).rename(self.input_train_edges_file)\n\n    def preprocess(self, remap_ids=True, splits=None):\n        converter = TorchEdgeListConverter\n        splits = [0.8, 0.1, 0.1]  # 80%-train, 10%-validation, 10%-test\n        converter = converter(\n            output_dir=self.output_directory,\n            train_edges=self.input_train_edges_file,\n            src_column=0,  # col 0 is src and col 1 dst node in input csv\n            dst_column=1,\n            delim=\",\",  # CSV delimitor is \",\"\n            splits=splits,  # Splitting the data in train, valid and test\n            remap_ids=remap_ids,  # Remapping the raw entity ids into random integers\n        )\n\n        return converter.convert()\n\n\ndef init_model(embedding_dim, num_nodes, num_relations, device, 
dtype):\n    # setup shallow embedding encoder\n    embedding_layer = m.nn.layers.EmbeddingLayer(dimension=embedding_dim, device=device)\n    encoder = m.encoders.GeneralEncoder(layers=[[embedding_layer]])\n\n    # initialize node embedding table\n    emb_table = embedding_layer.init_embeddings(num_nodes)\n\n    # initialize DistMult decoder\n    decoder = m.nn.decoders.edge.DistMult(\n        num_relations=num_relations,\n        embedding_dim=embedding_dim,\n        use_inverse_relations=True,\n        device=device,\n        dtype=dtype,\n        mode=\"train\",\n    )\n\n    loss = m.nn.SoftmaxCrossEntropy(reduction=\"sum\")\n\n    # metrics to compute during evaluation\n    reporter = m.report.LinkPredictionReporter()\n    reporter.add_metric(m.report.MeanReciprocalRank())\n    reporter.add_metric(m.report.MeanRank())\n    reporter.add_metric(m.report.Hitsk(1))\n    reporter.add_metric(m.report.Hitsk(10))\n\n    # sparse_lr sets the learning rate for the embedding parameters\n    model = m.nn.Model(encoder, decoder, loss, reporter, sparse_lr=0.1)\n\n    # set optimizer for dense model parameters. 
In this case this is the DistMult relation (edge-type) embeddings\n    model.optimizers = [m.nn.AdamOptimizer(model.named_parameters(), lr=0.1)]\n\n    return model, emb_table\n\n\ndef train_epoch(model, dataloader):\n    # need to reset dataloader state before each epoch\n    dataloader.initializeBatches()\n\n    counter = 0\n    while dataloader.hasNextBatch():\n        batch = dataloader.getBatch()\n        model.train_batch(batch)\n        dataloader.updateEmbeddings(batch)\n\n        counter += 1\n        if counter % 100 == 0:\n            print(\"Trained {} batches\".format(counter))\n\n    print(\"Trained {} batches\".format(counter))\n\n\ndef eval_epoch(model, dataloader):\n    # need to reset dataloader before state each epoch\n    dataloader.initializeBatches()\n\n    counter = 0\n    while dataloader.hasNextBatch():\n        batch = dataloader.getBatch()\n        model.evaluate_batch(batch)\n\n        counter += 1\n        if counter % 100 == 0:\n            print(\"Evaluated {} batches\".format(counter))\n\n    print(\"Evaluated {} batches\".format(counter))\n\n    model.reporter.report()\n\n\nif __name__ == \"__main__\":\n    # initialize and preprocess dataset\n    dataset_dir = Path(\"ogbn_arxiv_dataset/\")\n    dataset = MYDATASET(dataset_dir)\n    if not (dataset_dir / Path(\"edges/train_edges.bin\")).exists():\n        dataset.download()\n        dataset.preprocess()\n\n    dataset_stats = OmegaConf.load(dataset_dir / Path(\"dataset.yaml\"))\n\n    # create model\n    device = torch.device(\"cuda\")\n    dtype = torch.float32\n    embedding_dim = 50\n    model, embeddings = init_model(embedding_dim, dataset_stats.num_nodes, dataset_stats.num_relations, device, dtype)\n\n    # setup training dataloader\n    train_edges = m.storage.tensor_from_file(\n        filename=dataset.train_edges_file, shape=[dataset_stats.num_train, -1], dtype=torch.int32, device=device\n    )\n    train_neg_sampler = m.data.samplers.CorruptNodeNegativeSampler(\n        
num_chunks=10, num_negatives=500, degree_fraction=0.0, filtered=False\n    )\n\n    train_dataloader = m.data.DataLoader(\n        edges=train_edges,\n        node_embeddings=embeddings,\n        batch_size=1000,\n        neg_sampler=train_neg_sampler,\n        learning_task=\"lp\",\n        train=True,\n    )\n\n    # setup eval dataloader\n    valid_edges = m.storage.tensor_from_file(\n        filename=dataset.valid_edges_file, shape=[dataset_stats.num_valid, -1], dtype=torch.int32, device=device\n    )\n    test_edges = m.storage.tensor_from_file(\n        filename=dataset.test_edges_file, shape=[dataset_stats.num_test, -1], dtype=torch.int32, device=device\n    )\n    eval_neg_sampler = m.data.samplers.CorruptNodeNegativeSampler(filtered=True)\n\n    eval_dataloader = m.data.DataLoader(\n        edges=test_edges,\n        node_embeddings=embeddings,\n        batch_size=1000,\n        neg_sampler=eval_neg_sampler,\n        learning_task=\"lp\",\n        filter_edges=[train_edges, valid_edges],  # used to filter out false negatives in evaluation\n        train=False,\n    )\n\n    for i in range(5):\n        print(\"Train Epoch {}\".format(i))\n        print(\"-------------------------\")\n        train_epoch(model, train_dataloader)\n        print(\"-------------------------\")\n        print(\"Evaluating\")\n        eval_epoch(model, eval_dataloader)\n        print(\"-------------------------\")\n"
  },
  {
    "path": "examples/python/custom_nc_graphsage.py",
    "content": "from pathlib import Path\n\nimport numpy as np\nimport pandas as pd\nfrom omegaconf import OmegaConf\n\nimport marius as m\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import NodeClassificationDataset\nfrom marius.tools.preprocess.datasets.dataset_helpers import remap_nodes\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\nimport torch  # isort:skip\n\n\ndef switch_to_num(row):\n    names = [\n        \"Neural_Networks\",\n        \"Rule_Learning\",\n        \"Reinforcement_Learning\",\n        \"Probabilistic_Methods\",\n        \"Theory\",\n        \"Genetic_Algorithms\",\n        \"Case_Based\",\n    ]\n    idx = 0\n    for i in range(len(names)):\n        if row == names[i]:\n            idx = i\n            break\n\n    return idx\n\n\nclass MYDATASET(NodeClassificationDataset):\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"cora\"\n        self.dataset_url = \"http://www.cs.umd.edu/~sen/lbc-proj/data/cora.tgz\"\n\n    def download(self, overwrite=False):\n        # These are the files we want to make my the end of the the download\n        self.input_edge_list_file = self.output_directory / Path(\"edge.csv\")\n        self.input_node_feature_file = self.output_directory / Path(\"node-feat.csv\")\n        self.input_node_label_file = self.output_directory / Path(\"node-label.csv\")\n        self.input_train_nodes_file = self.output_directory / Path(\"train.csv\")\n        self.input_valid_nodes_file = self.output_directory / Path(\"valid.csv\")\n        self.input_test_nodes_file = self.output_directory / Path(\"test.csv\")\n\n        # If files already exist we don't need to do processing\n        download = False\n        if not self.input_edge_list_file.exists():\n            
download = True\n        if not self.input_node_feature_file.exists():\n            download = True\n        if not self.input_node_label_file.exists():\n            download = True\n        if not self.input_train_nodes_file.exists():\n            download = True\n        if not self.input_valid_nodes_file.exists():\n            download = True\n        if not self.input_test_nodes_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=False)\n\n            # Reading and processing the csv\n            df = pd.read_csv(dataset_dir / Path(\"cora/cora.content\"), sep=\"\\t\", header=None)\n            cols = df.columns[1 : len(df.columns) - 1]\n\n            # Getting all the indices\n            indices = np.array(range(len(df)))\n            np.random.shuffle(indices)\n            train_indices = indices[0 : int(0.8 * len(df))]\n            valid_indices = indices[int(0.8 * len(df)) : int(0.8 * len(df)) + int(0.1 * len(df))]\n            test_indices = indices[int(0.8 * len(df)) + int(0.1 * len(df)) :]\n\n            np.savetxt(dataset_dir / Path(\"train.csv\"), train_indices, delimiter=\",\", fmt=\"%d\")\n            np.savetxt(dataset_dir / Path(\"valid.csv\"), valid_indices, delimiter=\",\", fmt=\"%d\")\n            np.savetxt(dataset_dir / Path(\"test.csv\"), test_indices, delimiter=\",\", fmt=\"%d\")\n\n            # Features\n            features = df[cols]\n            features.to_csv(index=False, sep=\",\", path_or_buf=dataset_dir / Path(\"node-feat.csv\"), header=False)\n\n            # Labels\n            labels = df[df.columns[len(df.columns) - 1]]\n            labels = labels.apply(switch_to_num)\n            labels.to_csv(index=False, sep=\",\", path_or_buf=dataset_dir / Path(\"node-label.csv\"), header=False)\n\n            # Edges\n            node_ids = df[df.columns[0]]\n            
dict_reverse = node_ids.to_dict()\n            nodes_dict = {v: k for k, v in dict_reverse.items()}\n            df_edges = pd.read_csv(dataset_dir / Path(\"cora/cora.cites\"), sep=\"\\t\", header=None)\n            df_edges.replace({0: nodes_dict, 1: nodes_dict}, inplace=True)\n            df_edges.to_csv(index=False, sep=\",\", path_or_buf=dataset_dir / Path(\"edge.csv\"), header=False)\n\n    def preprocess(\n        self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False\n    ):\n        train_nodes = np.genfromtxt(self.input_train_nodes_file, delimiter=\",\").astype(np.int32)\n        valid_nodes = np.genfromtxt(self.input_valid_nodes_file, delimiter=\",\").astype(np.int32)\n        test_nodes = np.genfromtxt(self.input_test_nodes_file, delimiter=\",\").astype(np.int32)\n\n        # Calling the convert function to generate the preprocessed files\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=self.input_edge_list_file,\n            num_partitions=num_partitions,\n            src_column=0,\n            dst_column=1,\n            remap_ids=remap_ids,\n            sequential_train_nodes=sequential_train_nodes,\n            delim=\",\",\n            known_node_ids=[train_nodes, valid_nodes, test_nodes],\n            partitioned_evaluation=partitioned_eval,\n        )\n        dataset_stats = converter.convert()\n\n        features = np.genfromtxt(self.input_node_feature_file, delimiter=\",\").astype(np.float32)\n        labels = np.genfromtxt(self.input_node_label_file, delimiter=\",\").astype(np.int32)\n\n        # The remap in the convertor will only change the edge.csv so we need to manually\n        # remap rest of the *.csv files. 
We are doing that here\n        if remap_ids:\n            node_mapping = np.genfromtxt(self.output_directory / Path(PathConstants.node_mapping_path), delimiter=\",\")\n            train_nodes, valid_nodes, test_nodes, features, labels = remap_nodes(\n                node_mapping, train_nodes, valid_nodes, test_nodes, features, labels\n            )\n\n        # Writing the remapped files as bin files\n        with open(self.train_nodes_file, \"wb\") as f:\n            f.write(bytes(train_nodes))\n        with open(self.valid_nodes_file, \"wb\") as f:\n            f.write(bytes(valid_nodes))\n        with open(self.test_nodes_file, \"wb\") as f:\n            f.write(bytes(test_nodes))\n        with open(self.node_features_file, \"wb\") as f:\n            f.write(bytes(features))\n        with open(self.node_labels_file, \"wb\") as f:\n            f.write(bytes(labels))\n\n        # update dataset yaml\n        dataset_stats.num_train = train_nodes.shape[0]\n        dataset_stats.num_valid = valid_nodes.shape[0]\n        dataset_stats.num_test = test_nodes.shape[0]\n        dataset_stats.node_feature_dim = features.shape[1]\n        dataset_stats.num_classes = 40\n\n        dataset_stats.num_nodes = dataset_stats.num_train + dataset_stats.num_valid + dataset_stats.num_test\n\n        with open(self.output_directory / Path(\"dataset.yaml\"), \"w\") as f:\n            yaml_file = OmegaConf.to_yaml(dataset_stats)\n            f.writelines(yaml_file)\n\n        return\n\n\ndef init_model(feature_dim, num_classes, device):\n    feature_layer = m.nn.layers.FeatureLayer(dimension=feature_dim, device=device)\n\n    graph_sage_layer1 = m.nn.layers.GraphSageLayer(\n        input_dim=feature_dim, output_dim=feature_dim, device=device, bias=True\n    )\n\n    graph_sage_layer2 = m.nn.layers.GraphSageLayer(\n        input_dim=feature_dim, output_dim=feature_dim, device=device, bias=True\n    )\n\n    graph_sage_layer3 = m.nn.layers.GraphSageLayer(\n        input_dim=feature_dim, 
output_dim=num_classes, device=device, bias=True\n    )\n\n    encoder = m.encoders.GeneralEncoder(\n        layers=[[feature_layer], [graph_sage_layer1], [graph_sage_layer2], [graph_sage_layer3]]\n    )\n\n    # Setting up the decoder\n    decoder = m.nn.decoders.node.NoOpNodeDecoder()\n\n    # Loss Function\n    loss = m.nn.CrossEntropyLoss(reduction=\"sum\")\n\n    # Set reporter to track accuracy at evaluation\n    reporter = m.report.NodeClassificationReporter()\n    reporter.add_metric(m.report.CategoricalAccuracy())\n\n    # Making the model\n    model = m.nn.Model(encoder, decoder, loss, reporter)\n\n    # Set optimizer\n    model.optimizers = [m.nn.AdamOptimizer(model.named_parameters(), lr=0.01)]\n\n    return model\n\n\ndef train_epoch(model, dataloader):\n    # need to reset dataloader state before each epoch\n    dataloader.initializeBatches()\n\n    counter = 0\n    while dataloader.hasNextBatch():\n        batch = dataloader.getBatch()\n        model.train_batch(batch)\n\n        counter += 1\n        if counter % 50 == 0:\n            print(\"Trained {} batches\".format(counter))\n\n    print(\"Trained {} batches\".format(counter))\n\n\ndef eval_epoch(model, dataloader):\n    # need to reset dataloader before state each epoch\n    dataloader.initializeBatches()\n\n    counter = 0\n    while dataloader.hasNextBatch():\n        batch = dataloader.getBatch()\n        model.evaluate_batch(batch)\n\n        counter += 1\n        if counter % 50 == 0:\n            print(\"Evaluated {} batches\".format(counter))\n\n    print(\"Evaluated {} batches\".format(counter))\n\n    model.reporter.report()\n\n\nif __name__ == \"__main__\":\n    # Here we are initializing the cora dataset. 
Details regarding what this\n    # dataset class is doing can be found: [TODO add path location]\n\n    # initialize and preprocess dataset\n    dataset_dir = Path(\"cora/\")\n    dataset = MYDATASET(dataset_dir)\n    if not (dataset_dir / Path(\"edges/train_edges.bin\")).exists():\n        dataset.download()\n        dataset.preprocess()\n\n    dataset_stats = OmegaConf.load(dataset_dir / Path(\"dataset.yaml\"))\n\n    # Rest of the code (i.e. model, dataloader, training, etc) is same as nc_ogbn_arxiv example.\n    # Please refer to the documentation at docs/examples/python/nc_ogbn_arxiv.rst for details rest of the code\n\n    # Create the model\n    device = torch.device(\"cuda\")\n    dtype = torch.float32\n    feature_dim = dataset_stats.node_feature_dim\n    model = init_model(feature_dim, dataset_stats.num_classes, device)\n\n    # load training Data - Edges, Nodes, Features, labels\n    edges_all = m.storage.tensor_from_file(\n        filename=dataset.edge_list_file, shape=[dataset_stats.num_edges, -1], dtype=torch.int32, device=device\n    )\n    train_nodes = m.storage.tensor_from_file(\n        filename=dataset.train_nodes_file, shape=[dataset_stats.num_train], dtype=torch.int32, device=device\n    )\n    features = m.storage.tensor_from_file(\n        filename=dataset.node_features_file, shape=[dataset_stats.num_nodes, -1], dtype=torch.float32, device=device\n    )\n    labels = m.storage.tensor_from_file(\n        filename=dataset.node_labels_file, shape=[dataset_stats.num_nodes], dtype=torch.int32, device=device\n    )\n\n    nbr_sampler_3_hop = m.data.samplers.LayeredNeighborSampler(num_neighbors=[-1, -1, -1])\n    train_dataloader = m.data.DataLoader(\n        nodes=train_nodes,\n        edges=edges_all,\n        node_features=features,\n        node_labels=labels,\n        batch_size=1000,\n        nbr_sampler=nbr_sampler_3_hop,\n        learning_task=\"nc\",\n        train=True,\n    )\n\n    # Evaluation:\n    test_nodes = 
m.storage.tensor_from_file(\n        filename=dataset.test_nodes_file, shape=[dataset_stats.num_test], dtype=torch.int32, device=device\n    )\n    eval_dataloader = m.data.DataLoader(\n        nodes=test_nodes,\n        edges=edges_all,\n        node_labels=labels,\n        node_features=features,\n        batch_size=1000,\n        nbr_sampler=nbr_sampler_3_hop,\n        learning_task=\"nc\",\n        train=False,\n    )\n\n    # Doing the iterations\n    for i in range(5):\n        print(\"Train Epoch {}\".format(i))\n        print(\"-------------------------\")\n        train_epoch(model, train_dataloader)\n        print()\n        print(\"Evaluating\")\n        eval_epoch(model, eval_dataloader)\n\n        print(\"-------------------------\")\n"
  },
  {
    "path": "examples/python/fb15k_237.py",
    "content": "from pathlib import Path\n\nfrom omegaconf import OmegaConf\n\nimport marius as m\nfrom marius.tools.preprocess.datasets.fb15k_237 import FB15K237\n\nimport torch  # isort:skip\n\n\ndef init_model(embedding_dim, num_nodes, num_relations, device, dtype):\n    # setup shallow embedding encoder\n    embedding_layer = m.nn.layers.EmbeddingLayer(dimension=embedding_dim, device=device)\n    encoder = m.encoders.GeneralEncoder(layers=[[embedding_layer]])\n\n    # initialize node embedding table\n    emb_table = embedding_layer.init_embeddings(num_nodes)\n\n    # initialize DistMult decoder\n    decoder = m.nn.decoders.edge.DistMult(\n        num_relations=num_relations,\n        embedding_dim=embedding_dim,\n        use_inverse_relations=True,\n        device=device,\n        dtype=dtype,\n        mode=\"train\",\n    )\n\n    loss = m.nn.SoftmaxCrossEntropy(reduction=\"sum\")\n\n    # metrics to compute during evaluation\n    reporter = m.report.LinkPredictionReporter()\n    reporter.add_metric(m.report.MeanReciprocalRank())\n    reporter.add_metric(m.report.MeanRank())\n    reporter.add_metric(m.report.Hitsk(1))\n    reporter.add_metric(m.report.Hitsk(10))\n\n    # sparse_lr sets the learning rate for the embedding parameters\n    model = m.nn.Model(encoder, decoder, loss, reporter, sparse_lr=0.1)\n\n    # set optimizer for dense model parameters. 
In this case this is the DistMult relation (edge-type) embeddings\n    model.optimizers = [m.nn.AdamOptimizer(model.named_parameters(), lr=0.1)]\n\n    return model, emb_table\n\n\ndef train_epoch(model, dataloader):\n    # need to reset dataloader state before each epoch\n    dataloader.initializeBatches()\n\n    counter = 0\n    while dataloader.hasNextBatch():\n        batch = dataloader.getBatch()\n        model.train_batch(batch)\n        dataloader.updateEmbeddings(batch)\n\n        counter += 1\n        if counter % 100 == 0:\n            print(\"Trained {} batches\".format(counter))\n\n    print(\"Trained {} batches\".format(counter))\n\n\ndef eval_epoch(model, dataloader):\n    # need to reset dataloader before state each epoch\n    dataloader.initializeBatches()\n\n    counter = 0\n    while dataloader.hasNextBatch():\n        batch = dataloader.getBatch()\n        model.evaluate_batch(batch)\n\n        counter += 1\n        if counter % 100 == 0:\n            print(\"Evaluated {} batches\".format(counter))\n\n    print(\"Evaluated {} batches\".format(counter))\n\n    model.reporter.report()\n\n\nif __name__ == \"__main__\":\n    dataset_dir = Path(\"fb15k_dataset/\")\n    dataset = FB15K237(dataset_dir)\n    if not (dataset_dir / Path(\"edges/train_edges.bin\")).exists():\n        dataset.download()\n        dataset.preprocess()\n\n    dataset_stats = OmegaConf.load(dataset_dir / Path(\"dataset.yaml\"))\n\n    # create model\n    device = torch.device(\"cpu\")\n    dtype = torch.float32\n    embedding_dim = 50\n    model, embeddings = init_model(embedding_dim, dataset_stats.num_nodes, dataset_stats.num_relations, device, dtype)\n\n    # setup training dataloader\n    train_edges = m.storage.tensor_from_file(\n        filename=dataset.train_edges_file, shape=[dataset_stats.num_train, -1], dtype=torch.int32, device=device\n    )\n    train_neg_sampler = m.data.samplers.CorruptNodeNegativeSampler(\n        num_chunks=10, num_negatives=500, 
degree_fraction=0.0, filtered=False\n    )\n\n    train_dataloader = m.data.DataLoader(\n        edges=train_edges,\n        node_embeddings=embeddings,\n        batch_size=1000,\n        neg_sampler=train_neg_sampler,\n        learning_task=\"lp\",\n        train=True,\n    )\n\n    # setup eval dataloader\n    valid_edges = m.storage.tensor_from_file(\n        filename=dataset.valid_edges_file, shape=[dataset_stats.num_valid, -1], dtype=torch.int32, device=device\n    )\n    test_edges = m.storage.tensor_from_file(\n        filename=dataset.test_edges_file, shape=[dataset_stats.num_test, -1], dtype=torch.int32, device=device\n    )\n    eval_neg_sampler = m.data.samplers.CorruptNodeNegativeSampler(filtered=True)\n\n    eval_dataloader = m.data.DataLoader(\n        edges=test_edges,\n        node_embeddings=embeddings,\n        batch_size=1000,\n        neg_sampler=eval_neg_sampler,\n        learning_task=\"lp\",\n        filter_edges=[train_edges, valid_edges],  # used to filter out false negatives in evaluation\n        train=False,\n    )\n\n    for i in range(5):\n        print(\"Train Epoch {}\".format(i))\n        print(\"-------------------------\")\n        train_epoch(model, train_dataloader)\n        print(\"-------------------------\")\n        print(\"Evaluating\")\n        eval_epoch(model, eval_dataloader)\n        print(\"-------------------------\")\n"
  },
  {
    "path": "examples/python/fb15k_237_gpu.py",
    "content": "from pathlib import Path\n\nfrom omegaconf import OmegaConf\n\nimport marius as m\nfrom marius.tools.preprocess.datasets.fb15k_237 import FB15K237\n\nimport torch  # isort:skip\n\n\ndef init_model(embedding_dim, num_nodes, num_relations, device, dtype):\n    # setup shallow embedding encoder\n    embedding_layer = m.nn.layers.EmbeddingLayer(dimension=embedding_dim, device=device)\n    encoder = m.encoders.GeneralEncoder(layers=[[embedding_layer]])\n\n    # initialize node embedding table\n    emb_table = embedding_layer.init_embeddings(num_nodes)\n\n    # initialize DistMult decoder\n    decoder = m.nn.decoders.edge.DistMult(\n        num_relations=num_relations,\n        embedding_dim=embedding_dim,\n        use_inverse_relations=True,\n        device=device,\n        dtype=dtype,\n        mode=\"train\",\n    )\n\n    loss = m.nn.SoftmaxCrossEntropy(reduction=\"sum\")\n\n    # metrics to compute during evaluation\n    reporter = m.report.LinkPredictionReporter()\n    reporter.add_metric(m.report.MeanReciprocalRank())\n    reporter.add_metric(m.report.MeanRank())\n    reporter.add_metric(m.report.Hitsk(1))\n    reporter.add_metric(m.report.Hitsk(10))\n\n    # sparse_lr sets the learning rate for the embedding parameters\n    model = m.nn.Model(encoder, decoder, loss, reporter, sparse_lr=0.1)\n\n    # set optimizer for dense model parameters. 
In this case this is the DistMult relation (edge-type) embeddings\n    model.optimizers = [m.nn.AdamOptimizer(model.named_parameters(), lr=0.1)]\n\n    return model, emb_table\n\n\ndef train_epoch(model, dataloader):\n    # need to reset dataloader state before each epoch\n    dataloader.initializeBatches()\n\n    counter = 0\n    while dataloader.hasNextBatch():\n        batch = dataloader.getBatch()\n        model.train_batch(batch)\n        dataloader.updateEmbeddings(batch)\n\n        counter += 1\n        if counter % 100 == 0:\n            print(\"Trained {} batches\".format(counter))\n\n    print(\"Trained {} batches\".format(counter))\n\n\ndef eval_epoch(model, dataloader):\n    # need to reset dataloader before state each epoch\n    dataloader.initializeBatches()\n\n    counter = 0\n    while dataloader.hasNextBatch():\n        batch = dataloader.getBatch()\n        model.evaluate_batch(batch)\n\n        counter += 1\n        if counter % 100 == 0:\n            print(\"Evaluated {} batches\".format(counter))\n\n    print(\"Evaluated {} batches\".format(counter))\n\n    model.reporter.report()\n\n\nif __name__ == \"__main__\":\n    dataset_dir = Path(\"fb15k_dataset/\")\n    dataset = FB15K237(dataset_dir)\n    if not (dataset_dir / Path(\"edges/train_edges.bin\")).exists():\n        dataset.download()\n        dataset.preprocess()\n\n    dataset_stats = OmegaConf.load(dataset_dir / Path(\"dataset.yaml\"))\n\n    # create model\n    device = torch.device(\"cuda\")\n    dtype = torch.float32\n    embedding_dim = 50\n    model, embeddings = init_model(embedding_dim, dataset_stats.num_nodes, dataset_stats.num_relations, device, dtype)\n\n    # setup training dataloader\n    train_edges = m.storage.tensor_from_file(\n        filename=dataset.train_edges_file, shape=[dataset_stats.num_train, -1], dtype=torch.int32, device=device\n    )\n    train_neg_sampler = m.data.samplers.CorruptNodeNegativeSampler(\n        num_chunks=10, num_negatives=500, 
degree_fraction=0.0, filtered=False\n    )\n\n    train_dataloader = m.data.DataLoader(\n        edges=train_edges,\n        node_embeddings=embeddings,\n        batch_size=1000,\n        neg_sampler=train_neg_sampler,\n        learning_task=\"lp\",\n        train=True,\n    )\n\n    # setup eval dataloader\n    valid_edges = m.storage.tensor_from_file(\n        filename=dataset.valid_edges_file, shape=[dataset_stats.num_valid, -1], dtype=torch.int32, device=device\n    )\n    test_edges = m.storage.tensor_from_file(\n        filename=dataset.test_edges_file, shape=[dataset_stats.num_test, -1], dtype=torch.int32, device=device\n    )\n    eval_neg_sampler = m.data.samplers.CorruptNodeNegativeSampler(filtered=True)\n\n    eval_dataloader = m.data.DataLoader(\n        edges=test_edges,\n        node_embeddings=embeddings,\n        batch_size=1000,\n        neg_sampler=eval_neg_sampler,\n        learning_task=\"lp\",\n        filter_edges=[train_edges, valid_edges],  # used to filter out false negatives in evaluation\n        train=False,\n    )\n\n    for i in range(5):\n        print(\"Train Epoch {}\".format(i))\n        print(\"-------------------------\")\n        train_epoch(model, train_dataloader)\n        print(\"-------------------------\")\n        print(\"Evaluating\")\n        eval_epoch(model, eval_dataloader)\n        print(\"-------------------------\")\n"
  },
  {
    "path": "examples/python/ogbn_arxiv_nc.py",
    "content": "from pathlib import Path\n\nfrom omegaconf import OmegaConf\n\nimport marius as m\nfrom marius.tools.preprocess.datasets.ogbn_arxiv import OGBNArxiv\n\nimport torch  # isort:skip\n\n\ndef init_model(feature_dim, num_classes, device):\n    feature_layer = m.nn.layers.FeatureLayer(dimension=feature_dim, device=device)\n\n    graph_sage_layer1 = m.nn.layers.GraphSageLayer(\n        input_dim=feature_dim, output_dim=feature_dim, device=device, bias=True\n    )\n\n    graph_sage_layer2 = m.nn.layers.GraphSageLayer(\n        input_dim=feature_dim, output_dim=feature_dim, device=device, bias=True\n    )\n\n    graph_sage_layer3 = m.nn.layers.GraphSageLayer(\n        input_dim=feature_dim, output_dim=num_classes, device=device, bias=True\n    )\n\n    encoder = m.encoders.GeneralEncoder(\n        layers=[[feature_layer], [graph_sage_layer1], [graph_sage_layer2], [graph_sage_layer3]]\n    )\n\n    # Setting up the decoder\n    decoder = m.nn.decoders.node.NoOpNodeDecoder()\n\n    # Loss Function\n    loss = m.nn.CrossEntropyLoss(reduction=\"sum\")\n\n    # Set reporter to track accuracy at evaluation\n    reporter = m.report.NodeClassificationReporter()\n    reporter.add_metric(m.report.CategoricalAccuracy())\n\n    # Making the model\n    model = m.nn.Model(encoder, decoder, loss, reporter)\n\n    # Set optimizer\n    model.optimizers = [m.nn.AdamOptimizer(model.named_parameters(), lr=0.01)]\n\n    return model\n\n\ndef train_epoch(model, dataloader):\n    # need to reset dataloader state before each epoch\n    dataloader.initializeBatches()\n\n    counter = 0\n    while dataloader.hasNextBatch():\n        batch = dataloader.getBatch()\n        model.train_batch(batch)\n\n        counter += 1\n        if counter % 50 == 0:\n            print(\"Trained {} batches\".format(counter))\n\n    print(\"Trained {} batches\".format(counter))\n\n\ndef eval_epoch(model, dataloader):\n    # need to reset dataloader before state each epoch\n    
dataloader.initializeBatches()\n\n    counter = 0\n    while dataloader.hasNextBatch():\n        batch = dataloader.getBatch()\n        model.evaluate_batch(batch)\n\n        counter += 1\n        if counter % 50 == 0:\n            print(\"Evaluated {} batches\".format(counter))\n\n    print(\"Evaluated {} batches\".format(counter))\n\n    model.reporter.report()\n\n\nif __name__ == \"__main__\":\n    # initialize and preprocess dataset\n    dataset_dir = Path(\"ogbn_arxiv_nc_dataset/\")\n    dataset = OGBNArxiv(dataset_dir)\n    if not (dataset_dir / Path(\"edges/train_edges.bin\")).exists():\n        dataset.download()\n        dataset.preprocess()\n\n    dataset_stats = OmegaConf.load(dataset_dir / Path(\"dataset.yaml\"))\n\n    # Create the model\n    device = torch.device(\"cuda\")\n    dtype = torch.float32\n    feature_dim = dataset_stats.node_feature_dim\n    model = init_model(feature_dim, dataset_stats.num_classes, device)\n\n    # load training Data - Edges, Nodes, Features, labels\n    edges_all = m.storage.tensor_from_file(\n        filename=dataset.edge_list_file, shape=[dataset_stats.num_edges, -1], dtype=torch.int32, device=device\n    )\n    train_nodes = m.storage.tensor_from_file(\n        filename=dataset.train_nodes_file, shape=[dataset_stats.num_train], dtype=torch.int32, device=device\n    )\n    features = m.storage.tensor_from_file(\n        filename=dataset.node_features_file, shape=[dataset_stats.num_nodes, -1], dtype=torch.float32, device=device\n    )\n    labels = m.storage.tensor_from_file(\n        filename=dataset.node_labels_file, shape=[dataset_stats.num_nodes], dtype=torch.int32, device=device\n    )\n\n    nbr_sampler_3_hop = m.data.samplers.LayeredNeighborSampler(num_neighbors=[-1, -1, -1])\n    train_dataloader = m.data.DataLoader(\n        nodes=train_nodes,\n        edges=edges_all,\n        node_features=features,\n        node_labels=labels,\n        batch_size=1000,\n        nbr_sampler=nbr_sampler_3_hop,\n        
learning_task=\"nc\",\n        train=True,\n    )\n\n    # Evaluation:\n    test_nodes = m.storage.tensor_from_file(\n        filename=dataset.test_nodes_file, shape=[dataset_stats.num_test], dtype=torch.int32, device=device\n    )\n    eval_dataloader = m.data.DataLoader(\n        nodes=test_nodes,\n        edges=edges_all,\n        node_labels=labels,\n        node_features=features,\n        batch_size=1000,\n        nbr_sampler=nbr_sampler_3_hop,\n        learning_task=\"nc\",\n        train=False,\n    )\n\n    # Doing the iterations\n    for i in range(5):\n        print(\"Train Epoch {}\".format(i))\n        print(\"-------------------------\")\n        train_epoch(model, train_dataloader)\n        print()\n        print(\"Evaluating\")\n        eval_epoch(model, eval_dataloader)\n\n        print(\"-------------------------\")\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[tool.black]\nline-length = 120\n\n[tool.isort]\nprofile = \"black\"\nline_length = 120\n\n[tool.pytest.ini_options]\npythonpath = [\n    \".\"\n]\n\n[build-system]\nrequires = [\"setuptools==44.0\", \"torch>=1.7.1\"]\nbuild-backend = \"setuptools.build_meta\"\n"
  },
  {
    "path": "setup.cfg",
    "content": "[metadata]\nname = marius\nversion = 0.0.2\ndescription = A system for training embeddings for large scale graphs on a single machine\nlong_description = file: README.md\nlong_description_content_type = text/markdown\n\n# Links\nurl = https://github.com/marius-team/marius\n\n# Author information\nauthor = Jason Mohoney\nauthor_email = mohoney2@wisc.edu\nmaintainer = Jason Mohoney\nmaintainer_email = mohoney2@wisc.edu\n\n# License information\nlicense = Apache-2.0\nlicense_files = LICENSE\n\n[options.extras_require]\nspark =\n    pyspark>=3.0.0\ntests =\n    pytest==7.0.1\n    tox==3.25.1\ndocs =\n    sphinx-rtd-theme==1.0.0\n    sphinx-autodoc-typehints==1.17.0\n    breathe==4.30.0\ndb2graph = \n    psycopg2-binary\n    mysql-connector-python\n\n[options]\ninstall_requires =\n    numpy>=1.2\n    pandas>=1.1\n    torch>=1.7.1\n    omegaconf>=2.2\n    psutil>=5.9\n    GPUtil>=1.4\n    importlib_metadata>=4.0.0\n\nzip_safe = false\npython_requires = >=3.7\ninclude_package_data = true\n\npackages =\n    marius\n    marius.tools\n    marius.console_scripts\n\npackage_dir =\n    marius = src/python\n    marius.tools = src/python/tools\n    marius.console_scripts = src/python/console_scripts\n\n[options.entry_points]\nconsole_scripts =\n    marius_train = marius.console_scripts.marius_train:main\n    marius_eval = marius.console_scripts.marius_eval:main\n    marius_preprocess = marius.tools.marius_preprocess:main\n    marius_postprocess = marius.tools.marius_postprocess:main\n    marius_config_generator = marius.tools.marius_config_generator:main\n    marius_predict = marius.tools.marius_predict:main\n    marius_env_info = marius.distribution.marius_env_info:main\n    marius_db2graph = marius.tools.db2graph.marius_db2graph:main"
  },
  {
    "path": "setup.py",
    "content": "import os\nimport platform\nimport subprocess\nimport sys\n\nfrom setuptools import Extension, setup\nfrom setuptools.command.build_ext import build_ext\n\n\nclass CMakeExtension(Extension):\n    def __init__(self, name, sourcedir=\"\"):\n        Extension.__init__(self, name, sources=[])\n        self.sourcedir = os.path.abspath(sourcedir)\n\n\nclass CMakeBuild(build_ext):\n    def run(self):\n        try:\n            _ = subprocess.check_output([\"cmake\", \"--version\"])\n        except OSError:\n            raise RuntimeError(\n                \"CMake must be installed to build the following extensions: \"\n                + \", \".join(e.name for e in self.extensions)\n            )\n\n        if platform.system() == \"Windows\":\n            raise RuntimeError(\"Unsupported on Windows\")\n\n        for ext in self.extensions:\n            self.build_extension(ext)\n\n    def build_extension(self, ext):\n        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))\n        cmake_args = [\"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=\" + extdir, \"-DPYTHON_EXECUTABLE=\" + sys.executable]\n\n        cfg = \"Debug\" if self.debug else \"Release\"\n        build_args = [\"--config\", cfg]\n\n        if platform.system() == \"Windows\":\n            raise RuntimeError(\"Unsupported on Windows\")\n        else:\n            cmake_args += [\"-DCMAKE_BUILD_TYPE=\" + cfg]\n\n            num_threads = os.cpu_count()\n            build_args += [\"--\", \"-j{}\".format(num_threads)]\n\n        cmake_args += [\"-DCMAKE_BUILD_WITH_INSTALL_RPATH=TRUE\"]\n        cmake_args += [\"-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=TRUE\"]\n\n        try:\n            import torch\n\n            if torch.cuda.is_available():\n                cmake_args += [\"-DUSE_CUDA=TRUE\"]\n                cmake_args += [\"-DUSE_OMP=TRUE\"]\n        except ImportError:\n            raise ImportError(\"Pytorch not found. 
Please install pytorch first.\")\n\n        if sys.platform == \"darwin\":\n            cmake_args.append(\"-DCMAKE_INSTALL_RPATH=@loader_path\")\n        else:  # values: linux*, aix, freebsd, ... just as well win32 & cygwin\n            cmake_args.append(\"-DCMAKE_INSTALL_RPATH=$ORIGIN\")\n\n        env = os.environ.copy()\n        env[\"CXXFLAGS\"] = '{} -DVERSION_INFO=\\\\\"{}\\\\\"'.format(env.get(\"CXXFLAGS\", \"\"), self.distribution.get_version())\n\n        if not os.path.exists(self.build_temp):\n            os.makedirs(self.build_temp)\n\n        print(cmake_args)\n\n        subprocess.check_call([\"cmake\", ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)\n        subprocess.check_call([\"cmake\", \"--build\", \".\", \"--target\", \"bindings\"] + build_args, cwd=self.build_temp)\n        print()  # Add an empty line for cleaner output\n\n\nonly_python = os.environ.get(\"MARIUS_NO_BINDINGS\", None)\nif only_python:\n    setup()\nelse:\n    setup(\n        ext_modules=[\n            CMakeExtension(\"marius._config\"),\n            CMakeExtension(\"marius._data\"),\n            CMakeExtension(\"marius._manager\"),\n            CMakeExtension(\"marius._nn\"),\n            CMakeExtension(\"marius._pipeline\"),\n            CMakeExtension(\"marius._report\"),\n            CMakeExtension(\"marius._storage\"),\n        ],\n        cmdclass=dict(build_ext=CMakeBuild),\n    )\n"
  },
  {
    "path": "src/__init__.py",
    "content": ""
  },
  {
    "path": "src/cpp/cmake/FindSphinx.cmake",
    "content": "#Look for an executable called sphinx-build\nfind_program(SPHINX_EXECUTABLE\n        NAMES sphinx-build\n        DOC \"Path to sphinx-build executable\")\n\ninclude(FindPackageHandleStandardArgs)\n\n#Handle standard arguments to find_package like REQUIRED and QUIET\nfind_package_handle_standard_args(Sphinx\n        \"Failed to find sphinx-build executable\"\n        SPHINX_EXECUTABLE)"
  },
  {
    "path": "src/cpp/include/common/datatypes.h",
    "content": "//\n// Created by jasonmohoney on 10/19/19.\n//\n\n#ifndef MARIUS_DATATYPES_H\n#define MARIUS_DATATYPES_H\n\n#include <map>\n#include <string>\n#include <unordered_map>\n#include <vector>\n\n#include \"common/exception.h\"\n\n#pragma GCC diagnostic push\n#pragma GCC diagnostic ignored \"-Wunused-parameter\"\n#include \"torch/torch.h\"\n#pragma GCC diagnostic pop\n\nusing std::map;\nusing std::shared_ptr;\nusing std::string;\nusing std::unique_ptr;\n\n/** Program Constants */\n\n#define MAX_READ_SIZE 1E8\n\n/** Deployment configs */\n\n// Dummy CUDA objects so we don't break the CPU-only build\nclass DummyCudaEvent {\n   public:\n    DummyCudaEvent(int val) { (void)val; }\n\n    void start(){};\n\n    void record(){};\n\n    void synchronize(){};\n\n    int elapsed_time(DummyCudaEvent) { return 0; }\n};\n\nclass DummyCudaStream {\n   public:\n    DummyCudaStream() {}\n\n    void synchronize(){};\n};\n\nclass DummyCudaStreamGuard {\n   public:\n    DummyCudaStreamGuard(DummyCudaStream) {}\n};\n\n#ifdef MARIUS_CUDA\n    #include <ATen/cuda/CUDAContext.h>\n    #include <ATen/cuda/CUDAEvent.h>\n    #include <ATen/cuda/Exceptions.h>\n    #include <c10/cuda/CUDAGuard.h>\n    #include <c10/cuda/CUDAStream.h>\n    #include <c10/util/Exception.h>\n\ntypedef at::cuda::CUDAEvent CudaEvent;\ntypedef at::cuda::CUDAStream CudaStream;\ntypedef at::cuda::CUDAStreamGuard CudaStreamGuard;\n\nusing at::cuda::getStreamFromPool;\n\n#else\ntypedef DummyCudaEvent CudaEvent;\ntypedef DummyCudaStream CudaStream;\ntypedef DummyCudaStreamGuard CudaStreamGuard;\n\ninline CudaStream getStreamFromPool(bool = false, int = 0) { return CudaStream(); }\n#endif\n\n#ifndef IO_FLAGS\n    #define IO_FLAGS 0\n#endif\n\n/** Typedefs */\n\n/**\n * Tensor of edges in COO format with node and relation indices. Shape (n, 3)\n * First column -> src_idx\n * Second column -> rel_idx\n * Third column -> dst_idx\n */\ntypedef torch::Tensor EdgeList;\n\n/** 1D Tensor of indices. 
Shape (n) */\ntypedef torch::Tensor Indices;\n\n/** Tensor of gradients. Shape: (n, EMBEDDING_SIZE) */\ntypedef torch::Tensor Gradients;\n\n/** Tensor containing optimizer state for a selection of parameters. Shape: (n, FEATURE_SIZE) */\ntypedef torch::Tensor OptimizerState;\n\ntypedef std::chrono::time_point<std::chrono::steady_clock> Timestamp;\n\n#endif  // MARIUS_DATATYPES_H\n"
  },
  {
    "path": "src/cpp/include/common/exception.h",
    "content": "//\n// Created by Jason Mohoney on 2/4/22.\n//\n\n#ifndef MARIUS_EXCEPTION_H\n#define MARIUS_EXCEPTION_H\n\n#include <exception>\n\n#include \"torch/torch.h\"\n\nstruct MariusRuntimeException : public std::runtime_error {\n   public:\n    MariusRuntimeException(const std::string &message) : runtime_error(message) {}\n};\n\nstruct UndefinedTensorException : public MariusRuntimeException {\n   public:\n    UndefinedTensorException() : MariusRuntimeException(\"Tensor undefined\") {}\n};\n\nstruct NANTensorException : public MariusRuntimeException {\n   public:\n    NANTensorException() : MariusRuntimeException(\"Tensor contains NANs\") {}\n};\n\nstruct OOMTensorException : public MariusRuntimeException {\n   public:\n    OOMTensorException() : MariusRuntimeException(\"Tensor results in OOM\") {}\n};\n\nstruct TensorSizeMismatchException : public MariusRuntimeException {\n   public:\n    //    TensorSizeMismatchException(torch::Tensor input, std::string message) : MariusRuntimeException((std::stringstream(\"Tensor size mismatch. Size: \") <<\n    //    input.sizes() << \" \" << message).str()) {}\n    TensorSizeMismatchException(torch::Tensor input, std::string message) : MariusRuntimeException(message) {}\n};\n\nstruct UnexpectedNullPtrException : public MariusRuntimeException {\n   public:\n    UnexpectedNullPtrException(std::string message = \"\") : MariusRuntimeException(message) {}\n};\n\n#endif  // MARIUS_EXCEPTION_H\n"
  },
  {
    "path": "src/cpp/include/common/pybind_headers.h",
    "content": "//\n// Created by Jason Mohoney on 3/7/22.\n//\n\n#ifndef MARIUS_PYBIND_HEADERS_H\n#define MARIUS_PYBIND_HEADERS_H\n\n#include \"pybind11/embed.h\"\n#include \"torch/extension.h\"\n\nnamespace py = pybind11;\n\n#endif  // MARIUS_PYBIND_HEADERS_H\n"
  },
  {
    "path": "src/cpp/include/common/util.h",
    "content": "//\n// Created by Jason Mohoney on 7/30/20.\n//\n\n#ifndef MARIUS_UTIL_H\n#define MARIUS_UTIL_H\n\n#include \"datatypes.h\"\n\nclass Timer {\n   public:\n    bool gpu_;\n    std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;\n    std::chrono::time_point<std::chrono::high_resolution_clock> stop_time_;\n    CudaEvent *start_event_;\n    CudaEvent *end_event_;\n\n    Timer(bool gpu) {\n        start_event_ = new CudaEvent(0);\n        end_event_ = new CudaEvent(0);\n        gpu_ = gpu;\n    }\n\n    ~Timer() {\n        delete start_event_;\n        delete end_event_;\n    }\n\n    void start() {\n        start_time_ = std::chrono::high_resolution_clock::now();\n        if (gpu_) {\n            start_event_->record();\n        }\n    }\n\n    void stop() {\n        stop_time_ = std::chrono::high_resolution_clock::now();\n        if (gpu_) {\n            end_event_->record();\n        }\n    }\n\n    int64_t getDuration(bool ms = true) {\n        int64_t duration;\n        if (ms) {\n            duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time_ - start_time_).count();\n        } else {\n            duration = std::chrono::duration_cast<std::chrono::microseconds>(stop_time_ - start_time_).count();\n        }\n\n        if (gpu_) {\n            start_event_->synchronize();\n            end_event_->synchronize();\n            duration = start_event_->elapsed_time(*end_event_);\n        }\n        return duration;\n    }\n};\n\nbool has_nans(torch::Tensor values);\n\nvoid assert_no_nans(torch::Tensor values);\n\nvoid assert_no_neg(torch::Tensor values);\n\nvoid assert_in_range(torch::Tensor values, int64_t start, int64_t end);\n\nvoid process_mem_usage();\n\nvoid *memset_wrapper(void *ptr, int value, int64_t num);\n\nvoid *memcpy_wrapper(void *dest, const void *src, int64_t count);\n\nint64_t pread_wrapper(int fd, void *buf, int64_t count, int64_t offset);\n\nint64_t pwrite_wrapper(int fd, const void *buf, 
int64_t count, int64_t offset);\n\ntorch::Tensor transfer_tensor(torch::Tensor input, torch::Device device, CudaStream *compute_stream = nullptr, CudaStream *transfer_stream = nullptr);\n\nint64_t get_dtype_size_wrapper(torch::Dtype dtype_);\n\nstd::string get_directory(std::string path);\n\ntemplate <typename T1, typename T2>\nbool instance_of(std::shared_ptr<T1> instance) {\n    return (std::dynamic_pointer_cast<T2>(instance) != nullptr);\n}\n\nstd::tuple<torch::Tensor, std::vector<torch::Tensor>> map_tensors(std::vector<torch::Tensor> unmapped_tensors);\n\nstd::vector<torch::Tensor> apply_tensor_map(torch::Tensor map, std::vector<torch::Tensor> unmapped_tensors);\n#endif  // MARIUS_UTIL_H\n"
  },
  {
    "path": "src/cpp/include/configuration/config.h",
    "content": "//\n// Created by Jason Mohoney on 10/7/21.\n//\n\n#ifndef MARIUS_CONFIG_H\n#define MARIUS_CONFIG_H\n\n#include \"common/datatypes.h\"\n#include \"common/pybind_headers.h\"\n#include \"constants.h\"\n#include \"options.h\"\n\nusing pyobj = pybind11::object;\nusing std::shared_ptr;\n\nstruct NeighborSamplingConfig {\n    NeighborSamplingLayer type;\n    shared_ptr<NeighborSamplingOptions> options = nullptr;\n    bool use_hashmap_sets;\n};\n\nstruct OptimizerConfig {\n    OptimizerType type;\n    shared_ptr<OptimizerOptions> options = nullptr;\n};\n\nstruct InitConfig {\n    InitDistribution type;\n    shared_ptr<InitOptions> options = nullptr;\n\n    InitConfig(){};\n    InitConfig(InitDistribution type, shared_ptr<InitOptions> options) : type(type), options(options){};\n};\n\nstruct LossConfig {\n    LossFunctionType type;\n    shared_ptr<LossOptions> options = nullptr;\n};\n\nstruct LayerConfig {\n    LayerType type;\n    shared_ptr<LayerOptions> options = nullptr;\n    int input_dim;\n    int output_dim;\n    shared_ptr<InitConfig> init = nullptr;\n    shared_ptr<OptimizerConfig> optimizer = nullptr;\n    bool bias;\n    shared_ptr<InitConfig> bias_init = nullptr;\n    ActivationFunction activation;\n};\n\nstruct EncoderConfig {\n    bool use_incoming_nbrs;\n    bool use_outgoing_nbrs;\n    std::vector<std::vector<shared_ptr<LayerConfig>>> layers;\n    std::vector<shared_ptr<NeighborSamplingConfig>> train_neighbor_sampling;\n    std::vector<shared_ptr<NeighborSamplingConfig>> eval_neighbor_sampling;\n};\n\nstruct DecoderConfig {\n    DecoderType type;\n    shared_ptr<DecoderOptions> options = nullptr;\n    shared_ptr<OptimizerConfig> optimizer = nullptr;\n};\n\nstruct StorageBackendConfig {\n    StorageBackend type;\n    shared_ptr<StorageOptions> options = nullptr;\n};\n\nstruct DatasetConfig {\n    string dataset_dir;\n    int64_t num_edges;\n    int64_t num_nodes;\n    int64_t num_relations;\n    int64_t num_train;\n    int64_t num_valid;\n    
int64_t num_test;\n    int node_feature_dim;\n    int rel_feature_dim;\n    int num_classes;\n};\n\nstruct NegativeSamplingConfig {\n    int num_chunks;\n    int negatives_per_positive;\n    float degree_fraction;\n    bool filtered;\n    LocalFilterMode local_filter_mode;\n};\n\nstruct PipelineConfig {\n    bool sync;\n    int staleness_bound;\n    int gpu_sync_interval;\n    bool gpu_model_average;\n    int batch_host_queue_size;\n    int batch_device_queue_size;\n    int gradients_device_queue_size;\n    int gradients_host_queue_size;\n    int batch_loader_threads;\n    int batch_transfer_threads;\n    int compute_threads;\n    int gradient_transfer_threads;\n    int gradient_update_threads;\n};\n\nstruct CheckpointConfig {\n    // TODO: save the checkpoint which performs best on the valid/test set.\n    bool save_best;\n    int interval;\n    bool save_state;\n};\n\nstruct ModelConfig {\n    int random_seed;\n    LearningTask learning_task;\n    shared_ptr<EncoderConfig> encoder = nullptr;\n    shared_ptr<DecoderConfig> decoder = nullptr;\n    shared_ptr<LossConfig> loss = nullptr;\n    shared_ptr<OptimizerConfig> dense_optimizer = nullptr;\n    shared_ptr<OptimizerConfig> sparse_optimizer = nullptr;\n};\n\nstruct StorageConfig {\n    torch::Device device_type = torch::Device(\"cpu\");\n    std::vector<int> device_ids = {};\n    shared_ptr<DatasetConfig> dataset = nullptr;\n    shared_ptr<StorageBackendConfig> edges = nullptr;\n    shared_ptr<StorageBackendConfig> nodes = nullptr;\n    shared_ptr<StorageBackendConfig> embeddings = nullptr;\n    shared_ptr<StorageBackendConfig> features = nullptr;\n    bool prefetch;\n    bool shuffle_input;\n    bool full_graph_evaluation;\n    bool export_encoded_nodes;\n    std::string model_dir;\n    spdlog::level::level_enum log_level;\n    bool train_edges_pre_sorted;\n};\n\nstruct TrainingConfig {\n    int batch_size;\n    shared_ptr<NegativeSamplingConfig> negative_sampling = nullptr;\n    int num_epochs;\n    
shared_ptr<PipelineConfig> pipeline = nullptr;\n    int epochs_per_shuffle;\n    int logs_per_epoch;\n    bool save_model;\n    shared_ptr<CheckpointConfig> checkpoint = nullptr;\n    bool resume_training;\n    string resume_from_checkpoint;\n};\n\nstruct EvaluationConfig {\n    int batch_size;\n    shared_ptr<NegativeSamplingConfig> negative_sampling = nullptr;\n    shared_ptr<PipelineConfig> pipeline = nullptr;\n    int epochs_per_eval;\n    string checkpoint_dir;\n    bool full_graph_evaluation;\n};\n\nstruct MariusConfig {\n    shared_ptr<ModelConfig> model = nullptr;\n    shared_ptr<StorageConfig> storage = nullptr;\n    shared_ptr<TrainingConfig> training = nullptr;\n    shared_ptr<EvaluationConfig> evaluation = nullptr;\n};\n\nbool check_missing(pyobj python_object);\n\ntemplate <typename T>\nT cast_helper(pyobj python_object);\n\nPYBIND11_EXPORT shared_ptr<NeighborSamplingConfig> initNeighborSamplingConfig(pyobj python_object);\n\n// Lol at this name\nPYBIND11_EXPORT shared_ptr<InitConfig> initInitConfig(pyobj python_object);\n\nPYBIND11_EXPORT shared_ptr<OptimizerConfig> initOptimizerConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<DatasetConfig> initDatasetConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<LayerConfig> initLayerConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<EncoderConfig> initEncoderConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<DecoderConfig> initDecoderConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<LossConfig> initLossConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<StorageBackendConfig> initStorageBackendConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<NegativeSamplingConfig> initNegativeSamplingConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<PipelineConfig> initPipelineConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<CheckpointConfig> initCheckpointConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<ModelConfig> initModelConfig(pyobj 
python_config);\n\nPYBIND11_EXPORT shared_ptr<StorageConfig> initStorageConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<TrainingConfig> initTrainingConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<EvaluationConfig> initEvaluationConfig(pyobj python_config);\n\nPYBIND11_EXPORT shared_ptr<MariusConfig> initMariusConfig(pyobj python_config);\n\nshared_ptr<MariusConfig> loadConfig(string config_path, bool save = false);\n\n#endif  // MARIUS_CONFIG_H\n"
  },
  {
    "path": "src/cpp/include/configuration/constants.h",
    "content": "//\n// Created by Jason Mohoney on 2/18/20.\n//\n\n#ifndef MARIUS_CONSTANTS_H\n#define MARIUS_CONSTANTS_H\n\n#include <string>\n\n#include \"common/datatypes.h\"\n\n#define MISSING_STR \"???\"\n\n#define MAX_NODE_EMBEDDING_INIT_SIZE 1E7  // how many node embeddings to initialize at one time\n\nnamespace PathConstants {\nconst string model_file = \"model.pt\";\nconst string model_state_file = \"model_state.pt\";\nconst string model_config_file = \"model_config.yaml\";\n\nconst string training = \"train_\";\nconst string validation = \"validation_\";\nconst string test = \"test_\";\n\nconst string dst_sort = \"_dst_sort\";\n\nconst string edges_directory = \"edges/\";\nconst string edges_file = \"edges\";\nconst string edge_partition_offsets_file = \"partition_offsets.txt\";\n\nconst string node_mapping_file = \"node_mapping.txt\";\nconst string relation_mapping_file = \"relation_mapping.txt\";\n\nconst string nodes_directory = \"nodes/\";\nconst string nodes_file = \"nodes\";\nconst string features_file = \"features\";\nconst string labels_file = \"labels\";\nconst string embeddings_file = \"embeddings\";\nconst string encoded_nodes_file = \"encoded_nodes\";\nconst string embeddings_state_file = \"embeddings_state\";\n\nconst string file_ext = \".bin\";\nconst string checkpoint_metadata_file = \"metadata.csv\";\nconst string config_file = \"config.yaml\";\n\nconst string output_metrics_file = \"metrics.txt\";\nconst string output_scores_file = \"scores.csv\";\nconst string output_labels_file = \"labels.csv\";\n};  // namespace PathConstants\n\n#endif  // MARIUS_CONSTANTS_H\n"
  },
  {
    "path": "src/cpp/include/configuration/options.h",
    "content": "//\n// Created by Jason Mohoney on 10/7/21.\n//\n\n#ifndef MARIUS_OPTIONS_H\n#define MARIUS_OPTIONS_H\n\n#include \"common/datatypes.h\"\n#include \"reporting/logger.h\"\n\n// ENUM values\nenum class LearningTask { NODE_CLASSIFICATION, LINK_PREDICTION, ENCODE };\n\nLearningTask getLearningTask(std::string string_val);\n\nenum class InitDistribution { ZEROS, ONES, CONSTANT, UNIFORM, NORMAL, GLOROT_UNIFORM, GLOROT_NORMAL };\n\nInitDistribution getInitDistribution(std::string string_val);\n\nenum class LossFunctionType { SOFTMAX_CE, RANKING, CROSS_ENTROPY, BCE_AFTER_SIGMOID, BCE_WITH_LOGITS, MSE, SOFTPLUS };\n\nLossFunctionType getLossFunctionType(std::string string_val);\n\nenum class LossReduction { MEAN, SUM };\n\nLossReduction getLossReduction(std::string string_val);\n\nenum class ActivationFunction { RELU, SIGMOID, NONE };\n\nActivationFunction getActivationFunction(std::string string_val);\n\nenum class OptimizerType { SGD, ADAM, ADAGRAD, DEFAULT };\n\nOptimizerType getOptimizerType(std::string string_val);\n\nenum class ReductionLayerType {\n    NONE,\n    CONCAT,\n    LINEAR,\n};\n\nReductionLayerType getReductionLayerType(std::string string_val);\n\nenum class LayerType { NONE, EMBEDDING, FEATURE, GNN, DENSE, REDUCTION };\n\nLayerType getLayerType(std::string string_val);\n\nenum class DenseLayerType { NONE, LINEAR, CONV };\n\nDenseLayerType getDenseLayerType(std::string string_val);\n\nenum class GNNLayerType { NONE, GRAPH_SAGE, GCN, GAT, RGCN };\n\nGNNLayerType getGNNLayerType(std::string string_val);\n\nenum class GraphSageAggregator { GCN, MEAN };\n\nGraphSageAggregator getGraphSageAggregator(std::string string_val);\n\nenum class DecoderType { NODE, DISTMULT, TRANSE, COMPLEX };\n\nDecoderType getDecoderType(std::string string_val);\n\nenum class EdgeDecoderMethod { ONLY_POS, POS_AND_NEG, CORRUPT_NODE, CORRUPT_REL };\n\nEdgeDecoderMethod getEdgeDecoderMethod(std::string string_val);\n\nenum class StorageBackend { PARTITION_BUFFER, 
FLAT_FILE, HOST_MEMORY, DEVICE_MEMORY };\n\nStorageBackend getStorageBackend(std::string string_val);\n\nenum class EdgeBucketOrdering { OLD_BETA, NEW_BETA, ALL_BETA, COMET, CUSTOM };\n\nEdgeBucketOrdering getEdgeBucketOrderingEnum(std::string string_val);\n\nenum class NodePartitionOrdering { DISPERSED, SEQUENTIAL, CUSTOM };\n\nNodePartitionOrdering getNodePartitionOrderingEnum(std::string string_val);\n\nenum class NeighborSamplingLayer { ALL, UNIFORM, DROPOUT };\n\nNeighborSamplingLayer getNeighborSamplingLayer(std::string string_val);\n\nenum class LocalFilterMode { ALL, DEG };\n\nLocalFilterMode getLocalFilterMode(std::string string_val);\n\ntorch::Dtype getDtype(std::string string_val);\n\nspdlog::level::level_enum getLogLevel(std::string string_val);\n\nstruct InitOptions {\n    virtual ~InitOptions() = default;\n};\n\nstruct ConstantInitOptions : InitOptions {\n    float constant;\n    ConstantInitOptions(){};\n    ConstantInitOptions(float constant) : constant(constant){};\n};\n\nstruct UniformInitOptions : InitOptions {\n    float scale_factor;\n    UniformInitOptions(){};\n    UniformInitOptions(float scale_factor) : scale_factor(scale_factor){};\n};\n\nstruct NormalInitOptions : InitOptions {\n    float mean;\n    float std;\n    NormalInitOptions(){};\n    NormalInitOptions(float mean, float std) : mean(mean), std(std){};\n};\n\nstruct LossOptions {\n    LossReduction loss_reduction;\n\n    virtual ~LossOptions() = default;\n};\n\nstruct RankingLossOptions : LossOptions {\n    LossReduction loss_reduction;\n    float margin;\n};\n\nstruct OptimizerOptions {\n    float learning_rate;\n\n    virtual ~OptimizerOptions() = default;\n};\n\nstruct AdagradOptions : OptimizerOptions {\n    float eps;\n    float init_value;\n    float lr_decay;\n    float weight_decay;\n};\n\nstruct AdamOptions : OptimizerOptions {\n    bool amsgrad;\n    float beta_1;\n    float beta_2;\n    float eps;\n    float weight_decay;\n};\n\nstruct LayerOptions {\n    virtual 
~LayerOptions() = default;\n};\n\nstruct EmbeddingLayerOptions : LayerOptions {};\n\nstruct FeatureLayerOptions : LayerOptions {};\n\nstruct DenseLayerOptions : LayerOptions {\n    DenseLayerType type;\n};\n\nstruct ReductionLayerOptions : LayerOptions {\n    ReductionLayerType type;\n};\n\nstruct GNNLayerOptions : LayerOptions {\n    GNNLayerType type;\n    virtual ~GNNLayerOptions() = default;\n};\n\nstruct GraphSageLayerOptions : GNNLayerOptions {\n    GraphSageAggregator aggregator;\n};\n\nstruct GATLayerOptions : GNNLayerOptions {\n    int num_heads;\n    bool average_heads;\n    float negative_slope;\n    float input_dropout;\n    float attention_dropout;\n};\n\nstruct DecoderOptions {\n    virtual ~DecoderOptions() = default;\n};\n\nstruct EdgeDecoderOptions : DecoderOptions {\n    int input_dim;\n    bool inverse_edges;\n    EdgeDecoderMethod edge_decoder_method;\n};\n\nstruct StorageOptions {\n    torch::Dtype dtype;\n    virtual ~StorageOptions() = default;\n};\n\nstruct PartitionBufferOptions : StorageOptions {\n    int num_partitions;\n    int buffer_capacity;\n    bool prefetching;\n    int fine_to_coarse_ratio;\n    int num_cache_partitions;\n    EdgeBucketOrdering edge_bucket_ordering;\n    NodePartitionOrdering node_partition_ordering;\n    bool randomly_assign_edge_buckets;\n};\n\nstruct NeighborSamplingOptions {\n    virtual ~NeighborSamplingOptions() = default;\n};\n\nstruct UniformSamplingOptions : NeighborSamplingOptions {\n    int max_neighbors;\n};\n\nstruct DropoutSamplingOptions : NeighborSamplingOptions {\n    float rate;\n};\n\n#endif  // MARIUS_OPTIONS_H\n"
  },
  {
    "path": "src/cpp/include/configuration/util.h",
    "content": "//\n// Created by Jason Mohoney on 1/19/22.\n//\n\n#ifndef MARIUS_CONFIGURATION_UTIL_H\n#define MARIUS_CONFIGURATION_UTIL_H\n\n#include \"config.h\"\n\nstd::vector<torch::Device> devices_from_config(std::shared_ptr<StorageConfig> storage_config);\n\n#endif  // MARIUS_CONFIGURATION_UTIL_H\n"
  },
  {
    "path": "src/cpp/include/data/batch.h",
    "content": "//\n// Created by Jason Mohoney on 7/9/20.\n//\n\n#ifndef MARIUS_BATCH_H\n#define MARIUS_BATCH_H\n\n#include \"common/datatypes.h\"\n#include \"common/util.h\"\n#include \"graph.h\"\n\nusing std::vector;\n\n/**\n * Specifies location of the batch in the pipeline\n */\nenum class BatchStatus {\n    Waiting,\n    AccumulatedIndices,\n    LoadedEmbeddings,\n    TransferredToDevice,\n    PreparedForCompute,\n    ComputedGradients,\n    AccumulatedGradients,\n    TransferredToHost,\n    Done\n};\n\n/**\n * Contains metadata, edges and embeddings for a single batch.\n */\nclass Batch {\n   public:\n    int batch_id_;      /**< ID of the batch */\n    int64_t start_idx_; /**< Offset in the edges storage */\n    int batch_size_;    /**< Number of edges in the batch */\n    bool train_;        /**< If true, this batch is a training batch and requires gradient tracking */\n    int device_id_;     /**< ID of the device the batch is assigned to */\n\n    LearningTask task_;\n\n    Timestamp load_timestamp_;    /**< Timestamp of when the embeddings for the batch have been loaded from storage */\n    Timestamp compute_timestamp_; /**< Timestamp of when the gradients for the batch have been computed */\n    CudaEvent device_transfer_;   /**< Used as a sync point when transferring from host to device */\n    CudaEvent host_transfer_;     /**< Used as a sync point when transferring from device to host */\n    Timer timer_;                 /**< Timer used to track how long batch operations take */\n    BatchStatus status_;          /**< Tracks location of the batch in the pipeline */\n\n    Indices root_node_indices_;\n    Indices unique_node_indices_;         /**< Global node ids for each unique node in the batch. includes negative samples */\n    torch::Tensor node_embeddings_;       /**< Embedding tensor for each unique node in the the batch.  
*/\n    torch::Tensor node_gradients_;        /**< Gradients for each node embedding in the batch */\n    torch::Tensor node_embeddings_state_; /**< Optimizer state for each node embedding in the batch */\n    torch::Tensor node_state_update_;     /**< Updates to adjust the optimizer state */\n\n    torch::Tensor node_features_; /**< Feature vector for each unique node in the batch.  */\n    torch::Tensor node_labels_;   /**< Label for each unique node in the batch.  */\n\n    Indices src_neg_indices_mapping_; /**< Maps ids from the sampled nodes, which corrupt the source nodes of edges, to global node ids */\n    Indices dst_neg_indices_mapping_; /**< Maps ids from the sampled nodes, which corrupt the destination nodes of edges, to global node ids */\n\n    torch::Tensor edges_;\n\n    // Encoder\n    DENSEGraph dense_graph_;\n    torch::Tensor encoded_uniques_;\n\n    // Negative Sampling params\n    torch::Tensor neg_edges_;\n    Indices rel_neg_indices_; /**< Global relation ids for negative relations in the batch */\n    Indices src_neg_indices_; /**< Global node ids for the sampled nodes that are used to corrupt the source nodes of edges */\n    Indices dst_neg_indices_; /**< Global node ids for the sampled nodes that are used to corrupt the destination nodes of edges */\n\n    torch::Tensor src_neg_filter_; /**< Used to filter out false negatives for source corrupted negatives */\n    torch::Tensor dst_neg_filter_; /**< Used to filter out false negatives for destination corrupted negatives */\n\n    Batch(bool train); /**< Constructor */\n\n    ~Batch(); /**< Destructor */\n\n    void to(torch::Device device, CudaStream *compute_stream = nullptr); /**< Transfers embeddings, optimizer state, and indices to specified device */\n\n    void accumulateGradients(float learning_rate); /**< Accumulates gradients into the unique_node_gradients, and applies optimizer update rule to create the\n                                                      
unique_node_gradients2 tensor */\n\n    void embeddingsToHost(); /**< Transfers gradients and embedding updates to host */\n\n    void clear(); /**< Clears all tensor data in the batch */\n};\n#endif  // MARIUS_BATCH_H\n"
  },
  {
    "path": "src/cpp/include/data/dataloader.h",
    "content": "//\n// Created by jasonmohoney on 10/4/19.\n//\n\n#ifndef MARIUS_DATASET_H\n#define MARIUS_DATASET_H\n\n#include <map>\n#include <string>\n#include <tuple>\n#include <vector>\n\n#include \"batch.h\"\n#include \"common/datatypes.h\"\n#include \"configuration/config.h\"\n#include \"data/samplers/edge.h\"\n#include \"data/samplers/negative.h\"\n#include \"data/samplers/neighbor.h\"\n#include \"storage/graph_storage.h\"\n#include \"storage/storage.h\"\n\nclass DataLoader {\n   public:\n    bool train_;\n    int epochs_processed_;\n    int64_t batches_processed_;\n    int64_t current_edge_;\n    std::mutex *sampler_lock_;\n    vector<shared_ptr<Batch>> batches_;\n    int batch_size_;\n\n    bool single_dataset_;\n\n    int batch_id_offset_;\n    vector<shared_ptr<Batch>>::iterator batch_iterator_;\n    std::mutex *batch_lock_;\n    std::condition_variable *batch_cv_;\n    bool waiting_for_batches_;\n    int batches_left_;\n    int total_batches_processed_;\n    bool all_read_;\n\n    vector<torch::Tensor> buffer_states_;\n\n    // Link prediction\n    vector<torch::Tensor> edge_buckets_per_buffer_;\n    vector<torch::Tensor>::iterator edge_buckets_per_buffer_iterator_;\n\n    // Node classification\n    vector<torch::Tensor> node_ids_per_buffer_;\n    vector<torch::Tensor>::iterator node_ids_per_buffer_iterator_;\n\n    shared_ptr<NeighborSampler> training_neighbor_sampler_;\n    shared_ptr<NeighborSampler> evaluation_neighbor_sampler_;\n\n    shared_ptr<NegativeSampler> training_negative_sampler_;\n    shared_ptr<NegativeSampler> evaluation_negative_sampler_;\n\n    Timestamp timestamp_;\n\n    shared_ptr<GraphModelStorage> graph_storage_;\n\n    shared_ptr<EdgeSampler> edge_sampler_;\n    shared_ptr<NegativeSampler> negative_sampler_;\n    shared_ptr<NeighborSampler> neighbor_sampler_;\n\n    shared_ptr<TrainingConfig> training_config_;\n    shared_ptr<EvaluationConfig> evaluation_config_;\n    bool only_root_features_;\n\n    LearningTask 
learning_task_;\n\n    CudaStream *compute_stream_;\n\n    DataLoader(shared_ptr<GraphModelStorage> graph_storage, LearningTask learning_task, shared_ptr<TrainingConfig> training_config,\n               shared_ptr<EvaluationConfig> evaluation_config, shared_ptr<EncoderConfig> encoder_config);\n\n    DataLoader(shared_ptr<GraphModelStorage> graph_storage, LearningTask learning_task, int batch_size, shared_ptr<NegativeSampler> negative_sampler = nullptr,\n               shared_ptr<NeighborSampler> neighbor_sampler = nullptr, bool train = false);\n\n    ~DataLoader();\n\n    void setBufferOrdering();\n\n    void setActiveEdges();\n\n    void setActiveNodes();\n\n    void initializeBatches(bool prepare_encode = false);\n\n    void clearBatches();\n\n    /**\n     * Check to see whether another batch exists.\n     * @return True if batch exists, false if not\n     */\n    bool hasNextBatch();\n\n    shared_ptr<Batch> getNextBatch();\n\n    /**\n     * Notify that the batch has been completed. 
Used for concurrency control.\n     */\n    void finishedBatch();\n\n    /**\n     * Gets the next batch to be processed by the pipeline.\n     * Loads edges from storage\n     * Constructs negative edges\n     * Loads CPU embedding parameters\n     * @return The next batch\n     */\n    shared_ptr<Batch> getBatch(at::optional<torch::Device> device = c10::nullopt, bool perform_map = false, int worker_id = 0);\n\n    /**\n     * Loads edges and samples negatives to construct a batch\n     * @param batch: Batch object to load edges into.\n     */\n    void edgeSample(shared_ptr<Batch> batch, int worker_id = 0);\n\n    /**\n     * Creates a mapping from global node ids into batch local node ids\n     * @param batch: Batch to map\n     */\n    void mapEdges(shared_ptr<Batch> batch, bool use_negs, bool use_nbrs, bool set_map);\n\n    /**\n     * Loads edges and samples negatives to construct a batch\n     * @param batch: Batch object to load nodes into.\n     */\n    void nodeSample(shared_ptr<Batch> batch, int worker_id = 0);\n\n    /**\n     * Samples negatives for the batch using the dataloader's negative sampler\n     * @param batch: Batch object to load negative samples into.\n     */\n    void negativeSample(shared_ptr<Batch> batch);\n\n    /**\n     * Loads CPU parameters into batch\n     * @param batch: Batch object to load parameters into.\n     */\n    void loadCPUParameters(shared_ptr<Batch> batch);\n\n    /**\n     * Loads GPU parameters into batch\n     * @param batch Batch object to load parameters into.\n     */\n    void loadGPUParameters(shared_ptr<Batch> batch);\n\n    /**\n     * Applies gradient updates to underlying storage\n     * @param batch: Batch object to apply updates from.\n     * @param gpu: If true, only the gpu parameters will be updated.\n     */\n    void updateEmbeddings(shared_ptr<Batch> batch, bool gpu);\n\n    /**\n     * Notify that the epoch has been completed. 
Prepares dataset for a new epoch.\n     */\n    void nextEpoch();\n\n    /**\n     * Load graph from storage.\n     */\n    void loadStorage();\n\n    bool epochComplete() { return (batches_left_ == 0) && all_read_; }\n\n    /**\n     * Unload graph from storage.\n     * @param write Set to true to write embedding table state to disk\n     */\n    void unloadStorage(bool write = false) { graph_storage_->unload(write); }\n\n    /**\n     * Gets the number of edges from the graph storage.\n     * @return Number of edges in the graph\n     */\n    int64_t getNumEdges() { return graph_storage_->getNumEdges(); }\n\n    int64_t getEpochsProcessed() { return epochs_processed_; }\n\n    int64_t getBatchesProcessed() { return batches_processed_; }\n\n    bool isTrain() { return train_; }\n\n    /**\n     * Sets graph storage, negative sampler, and neighbor sampler to training set.\n     */\n    void setTrainSet() {\n        if (single_dataset_) {\n            throw MariusRuntimeException(\"This dataloader only has a single dataset and cannot switch\");\n        } else {\n            batch_size_ = training_config_->batch_size;\n            train_ = true;\n            graph_storage_->setTrainSet();\n            negative_sampler_ = training_negative_sampler_;\n            neighbor_sampler_ = training_neighbor_sampler_;\n            loadStorage();\n        }\n    }\n\n    /**\n     * Sets graph storage, negative sampler, and neighbor sampler to validation set.\n     */\n    void setValidationSet() {\n        if (single_dataset_) {\n            throw MariusRuntimeException(\"This dataloader only has a single dataset and cannot switch\");\n        } else {\n            batch_size_ = evaluation_config_->batch_size;\n            train_ = false;\n            graph_storage_->setValidationSet();\n            negative_sampler_ = evaluation_negative_sampler_;\n            neighbor_sampler_ = evaluation_neighbor_sampler_;\n            loadStorage();\n        }\n    }\n\n    void 
setTestSet() {\n        if (single_dataset_) {\n            throw MariusRuntimeException(\"This dataloader only has a single dataset and cannot switch\");\n        } else {\n            batch_size_ = evaluation_config_->batch_size;\n            train_ = false;\n            graph_storage_->setTestSet();\n            negative_sampler_ = evaluation_negative_sampler_;\n            neighbor_sampler_ = evaluation_neighbor_sampler_;\n            loadStorage();\n        }\n    }\n\n    void setEncode() {\n        if (single_dataset_) {\n            loadStorage();\n            initializeBatches(true);\n        } else {\n            batch_size_ = evaluation_config_->batch_size;\n            train_ = false;\n            graph_storage_->setTrainSet();\n            neighbor_sampler_ = evaluation_neighbor_sampler_;\n            loadStorage();\n            initializeBatches(true);\n        }\n    }\n};\n\n#endif  // MARIUS_DATASET_H\n"
  },
  {
    "path": "src/cpp/include/data/graph.h",
    "content": "//\n// Created by Jason Mohoney on 8/25/21.\n//\n\n#ifndef MARIUS_SRC_CPP_INCLUDE_GRAPH_H_\n#define MARIUS_SRC_CPP_INCLUDE_GRAPH_H_\n\n#include \"common/datatypes.h\"\n#include \"common/util.h\"\n#include \"configuration/config.h\"\n#include \"nn/layers/gnn/layer_helpers.h\"\n\n/**\n * Object to handle arbitrary in-memory graph/sub-graph.\n */\nclass MariusGraph {\n   public:\n    EdgeList src_sorted_edges_;           // easy access of outgoing neighbors\n    EdgeList dst_sorted_edges_;           // easy access of incoming neighbors\n    EdgeList active_in_memory_subgraph_;  // shuffled\n\n    int64_t num_nodes_in_memory_;\n    Indices node_ids_;\n    Indices out_sorted_uniques_;\n    Indices out_offsets_;\n    torch::Tensor out_num_neighbors_;\n    Indices in_sorted_uniques_;\n    Indices in_offsets_;\n    torch::Tensor in_num_neighbors_;\n\n    int max_out_num_neighbors_;\n    int max_in_num_neighbors_;\n\n    int num_hash_maps_;\n    std::vector<torch::Tensor> hash_maps_;\n\n    // used for filtering negatives\n    EdgeList all_src_sorted_edges_;\n    EdgeList all_dst_sorted_edges_;\n\n    MariusGraph();\n\n    MariusGraph(EdgeList edges);\n\n    MariusGraph(EdgeList src_sorted_edges, EdgeList dst_sorted_edges, int64_t num_nodes_in_memory, int num_hash_maps = 1);\n    // TODO: this change may affect some cpp and python tests\n\n    ~MariusGraph();\n\n    /**\n     * Get the node IDs from the graph.\n     * @return Tensor of node IDs\n     */\n    Indices getNodeIDs();\n\n    /**\n     * Get the edges from the graph.\n     * @param incoming Get incoming edges if true, outgoing edges if false\n     * @return Tensor of edge IDs\n     */\n    Indices getEdges(bool incoming = true);\n\n    /**\n     * Get the relation IDs from the graph.\n     * @param incoming Get incoming relation IDs if true, outgoing relation IDs if false\n     * @return Tensor of relation IDs\n     */\n    Indices getRelationIDs(bool incoming = true);\n\n    /**\n     * Get the 
offsets of the neighbors in the sorted edge list.\n     * @param incoming Get incoming neighbor offsets if true, outgoing neighbor offsets if false\n     * @return Tensor of neighbor offsets\n     */\n    Indices getNeighborOffsets(bool incoming = true);\n\n    /**\n     * Get the number of neighbors for each node in the graph.\n     * @param incoming Get number of incoming neighbors if true, number of outgoing neighbors if false\n     * @return Number of neighbors\n     */\n    torch::Tensor getNumNeighbors(bool incoming = true);\n\n    /**\n     * Get the neighbors for the specified node IDs.\n     * @param node_ids The node IDs to get neighbors from\n     * @param incoming Get incoming neighbors if true, outgoing if false\n     * @param neighbor_sampling_layer The neighbor sampling strategy to use\n     * @param max_neighbors_size The maximum number of neighbors to sample\n     * @return Neighbors of specified nodes\n     */\n    std::tuple<torch::Tensor, torch::Tensor> getNeighborsForNodeIds(torch::Tensor node_ids, bool incoming, NeighborSamplingLayer neighbor_sampling_layer,\n                                                                    int max_neighbors_size, float rate);\n\n    /**\n     * Clear the graph.\n     */\n    void clear();\n\n    void to(torch::Device device);\n\n    void sortAllEdges(EdgeList additional_edges);\n};\n\n/**\n * MariusGraph subclass, orders the CSR representation of the graph for fast GNN encoding.\n */\nclass DENSEGraph : public MariusGraph {\n   public:\n    Indices hop_offsets_;\n\n    Indices in_neighbors_mapping_;\n    Indices out_neighbors_mapping_;\n\n    std::vector<torch::Tensor> in_neighbors_vec_;\n    std::vector<torch::Tensor> out_neighbors_vec_;\n\n    torch::Tensor node_properties_;\n\n    int num_nodes_in_memory_;\n\n    DENSEGraph();\n\n    DENSEGraph(Indices hop_offsets, Indices node_ids, Indices in_offsets, std::vector<torch::Tensor> in_neighbors_vec, Indices in_neighbors_mapping,\n               Indices 
out_offsets, std::vector<torch::Tensor> out_neighbors_vec, Indices out_neighbors_mapping, int num_nodes_in_memory);\n\n    ~DENSEGraph();\n\n    /**\n     * Prepares GNN graph for next layer.\n     */\n    void prepareForNextLayer();\n\n    /**\n     * Gets the ids of the neighbors for the current layer.\n     * @param incoming Get incoming edges if true, outgoing edges if false\n     * @param global If false, return node IDs local to the batch. If true, return any global node IDs\n     * @return Tensor of edge IDs\n     */\n    Indices getNeighborIDs(bool incoming = true, bool global = false);\n\n    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> getCombinedNeighborIDs();\n\n    /**\n     * Gets the offset of the node ids in the outermost layer.\n     * @return Layer offset\n     */\n    int64_t getLayerOffset();\n\n    /**\n     * Maps local IDs to batch.\n     */\n    void performMap();\n\n    void setNodeProperties(torch::Tensor node_properties);\n\n    /**\n     * Clear the graph.\n     */\n    void clear();\n\n    void to(torch::Device device, CudaStream *compute_stream = nullptr, CudaStream *transfer_stream = nullptr);\n};\n\n#endif  // MARIUS_SRC_CPP_INCLUDE_GRAPH_H_\n"
  },
  {
    "path": "src/cpp/include/data/ordering.h",
    "content": "//\n// Created by Jason Mohoney on 7/17/20.\n//\n\n#ifndef MARIUS_ORDERING_H\n#define MARIUS_ORDERING_H\n\n#include \"batch.h\"\n\nusing std::pair;\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getEdgeBucketOrdering(EdgeBucketOrdering edge_bucket_ordering, int num_partitions, int buffer_capacity,\n                                                                               int fine_to_coarse_ratio, int num_cache_partitions,\n                                                                               bool randomly_assign_edge_buckets);\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> convertEdgeBucketOrderToTensors(vector<vector<int>> buffer_states,\n                                                                                         vector<vector<std::pair<int, int>>> edge_buckets_per_buffer);\n\nvector<vector<int>> getBetaOrderingHelper(int num_partitions, int buffer_capacity);\n\nvector<vector<std::pair<int, int>>> greedyAssignEdgeBucketsToBuffers(vector<vector<int>> buffer_states, int num_partitions);\n\nvector<vector<std::pair<int, int>>> randomlyAssignEdgeBucketsToBuffers(vector<vector<int>> buffer_states, int num_partitions);\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getTwoLevelBetaOrdering(int num_partitions, int buffer_capacity, int fine_to_coarse_ratio,\n                                                                                 int num_cache_partitions, bool randomly_assign_edge_buckets);\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getCustomEdgeBucketOrdering();\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getNodePartitionOrdering(NodePartitionOrdering node_partition_ordering, Indices train_nodes,\n                                                                                  int64_t total_num_nodes, int num_partitions, int buffer_capacity,\n                                                                                  int fine_to_coarse_ratio, 
int num_cache_partitions);\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getDispersedNodePartitionOrdering(Indices train_nodes, int64_t total_num_nodes, int num_partitions,\n                                                                                           int buffer_capacity, int fine_to_coarse_ratio,\n                                                                                           int num_cache_partitions);\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getSequentialNodePartitionOrdering(Indices train_nodes, int64_t total_num_nodes, int num_partitions,\n                                                                                            int buffer_capacity);\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getCustomNodePartitionOrdering();\n\n#endif  // MARIUS_ORDERING_H\n"
  },
  {
    "path": "src/cpp/include/data/samplers/edge.h",
    "content": "//\n// Created by Jason Mohoney on 2/8/22.\n//\n\n#ifndef MARIUS_EDGE_H\n#define MARIUS_EDGE_H\n\n#include \"storage/graph_storage.h\"\n\n/**\n * Samples the edges from a given batch.\n */\nclass EdgeSampler {\n   public:\n    shared_ptr<GraphModelStorage> graph_storage_;\n\n    virtual ~EdgeSampler(){};\n\n    /**\n     * Get edges for a given batch.\n     * @param batch Batch to sample into\n     * @return Edges sampled for the batch\n     */\n    virtual EdgeList getEdges(shared_ptr<Batch> batch) = 0;\n};\n\nclass RandomEdgeSampler : public EdgeSampler {\n   public:\n    bool without_replacement_;\n\n    RandomEdgeSampler(shared_ptr<GraphModelStorage> graph_storage, bool without_replacement = true);\n\n    EdgeList getEdges(shared_ptr<Batch> batch) override;\n};\n\n#endif  // MARIUS_EDGE_H\n"
  },
  {
    "path": "src/cpp/include/data/samplers/negative.h",
    "content": "//\n// Created by Jason Mohoney on 2/8/22.\n//\n\n#ifndef MARIUS_NEGATIVE_H\n#define MARIUS_NEGATIVE_H\n\n#include \"storage/graph_storage.h\"\n\nstd::tuple<torch::Tensor, torch::Tensor> batch_sample(torch::Tensor edges, int num_negatives, bool inverse = false);\n\ntorch::Tensor deg_negative_local_filter(torch::Tensor deg_sample_indices, torch::Tensor edges);\n\ntorch::Tensor compute_filter_corruption_cpu(shared_ptr<MariusGraph> graph, torch::Tensor edges, torch::Tensor corruption_nodes, bool inverse = false,\n                                            bool global = false, LocalFilterMode local_filter_mode = LocalFilterMode::ALL,\n                                            torch::Tensor deg_sample_indices = torch::Tensor());\n\ntorch::Tensor compute_filter_corruption_gpu(shared_ptr<MariusGraph> graph, torch::Tensor edges, torch::Tensor corruption_nodes, bool inverse = false,\n                                            bool global = false, LocalFilterMode local_filter_mode = LocalFilterMode::ALL,\n                                            torch::Tensor deg_sample_indices = torch::Tensor());\n\ntorch::Tensor compute_filter_corruption(shared_ptr<MariusGraph> graph, torch::Tensor edges, torch::Tensor corruption_nodes, bool inverse = false,\n                                        bool global = false, LocalFilterMode local_filter_mode = LocalFilterMode::ALL,\n                                        torch::Tensor deg_sample_indices = torch::Tensor());\n\ntorch::Tensor apply_score_filter(torch::Tensor scores, torch::Tensor filter);\n\n/**\n * Samples the negative edges from a given batch.\n */\nclass NegativeSampler {\n   public:\n    virtual ~NegativeSampler(){};\n\n    /**\n     * Get negative edges from the given batch.\n     * Return a tensor of node ids of shape [num_negs] or a [num_negs, 3] shaped tensor of negative edges.\n     * @param inverse Sample for inverse edges\n     * @return The negative nodes/edges sampled\n     */\n    virtual 
std::tuple<torch::Tensor, torch::Tensor> getNegatives(shared_ptr<MariusGraph> graph, torch::Tensor edges = torch::Tensor(),\n                                                                  bool inverse = false) = 0;\n};\n\nclass CorruptNodeNegativeSampler : public NegativeSampler {\n   public:\n    int num_chunks_;\n    int num_negatives_;\n    float degree_fraction_;\n    bool filtered_;\n    LocalFilterMode local_filter_mode_;\n\n    CorruptNodeNegativeSampler(int num_chunks, int num_negatives, float degree_fraction, bool filtered = false,\n                               LocalFilterMode local_filter_mode = LocalFilterMode::DEG);\n\n    std::tuple<torch::Tensor, torch::Tensor> getNegatives(shared_ptr<MariusGraph> graph, torch::Tensor edges = torch::Tensor(), bool inverse = false) override;\n};\n\nclass CorruptRelNegativeSampler : public NegativeSampler {\n   public:\n    int num_chunks_;\n    int num_negatives_;\n    bool filtered_;\n\n    CorruptRelNegativeSampler(int num_chunks, int num_negatives, bool filtered = false);\n\n    std::tuple<torch::Tensor, torch::Tensor> getNegatives(shared_ptr<MariusGraph> graph, torch::Tensor edges = torch::Tensor(), bool inverse = false) override;\n};\n\nclass NegativeEdgeSampler : public NegativeSampler {\n   public:\n    int num_chunks_;\n    int num_negatives_;\n\n    NegativeEdgeSampler(int num_chunks, int num_negatives, bool filtered = false);\n\n    std::tuple<torch::Tensor, torch::Tensor> getNegatives(shared_ptr<MariusGraph> graph, torch::Tensor edges = torch::Tensor(), bool inverse = false) override;\n};\n\n#endif  // MARIUS_NEGATIVE_H\n"
  },
  {
    "path": "src/cpp/include/data/samplers/neighbor.h",
    "content": "//\n// Created by Jason Mohoney on 2/8/22.\n//\n\n#ifndef MARIUS_NEIGHBOR_SAMPLER_H\n#define MARIUS_NEIGHBOR_SAMPLER_H\n\n#include \"configuration/config.h\"\n#include \"storage/graph_storage.h\"\n\nstd::tuple<torch::Tensor, torch::Tensor> sample_all_gpu(torch::Tensor edges, torch::Tensor global_offsets, torch::Tensor local_offsets,\n                                                        torch::Tensor num_neighbors);\n\nstd::tuple<torch::Tensor, torch::Tensor> sample_all_cpu(torch::Tensor edges, torch::Tensor global_offsets, torch::Tensor local_offsets,\n                                                        torch::Tensor num_neighbors, int64_t total_neighbors);\n\nstd::tuple<torch::Tensor, torch::Tensor> sample_uniform_gpu(torch::Tensor edges, torch::Tensor global_offsets, torch::Tensor local_offsets,\n                                                            torch::Tensor num_neighbors, int64_t max_neighbors, int64_t max_id);\n\nstd::tuple<torch::Tensor, torch::Tensor> sample_uniform_cpu(torch::Tensor edges, torch::Tensor global_offsets, torch::Tensor local_offsets,\n                                                            torch::Tensor num_neighbors, int64_t max_neighbors, int64_t total_neighbors);\n\nstd::tuple<torch::Tensor, torch::Tensor> sample_dropout_gpu(torch::Tensor edges, torch::Tensor global_offsets, torch::Tensor local_offsets,\n                                                            torch::Tensor num_neighbors, float rate);\n\nstd::tuple<torch::Tensor, torch::Tensor> sample_dropout_cpu(torch::Tensor edges, torch::Tensor global_offsets, torch::Tensor local_offsets,\n                                                            torch::Tensor num_neighbors, float rate, int64_t total_neighbors);\n\n/**\n * Samples the neighbors from a given batch given a neighbor sampling strategy.\n */\nclass NeighborSampler {\n   public:\n    shared_ptr<GraphModelStorage> storage_;\n    shared_ptr<MariusGraph> graph_;\n\n    virtual 
~NeighborSampler(){};\n\n    /**\n     * Get neighbors of provided nodes using given neighborhood sampling strategy.\n     * @param node_ids Nodes to get neighbors from\n     * @return The neighbors sampled using strategy\n     */\n    virtual DENSEGraph getNeighbors(torch::Tensor node_ids, shared_ptr<MariusGraph> graph = nullptr, int worker_id = 0) = 0;\n};\n\nclass LayeredNeighborSampler : public NeighborSampler {\n   public:\n    bool use_incoming_nbrs_;\n    bool use_outgoing_nbrs_;\n    std::vector<shared_ptr<NeighborSamplingConfig>> sampling_layers_;\n\n    bool use_hashmap_sets_;\n    bool use_bitmaps_;\n\n    // TODO: this change may affect test, docs, python examples\n    LayeredNeighborSampler(shared_ptr<GraphModelStorage> storage, std::vector<shared_ptr<NeighborSamplingConfig>> layer_configs, bool use_incoming_nbrs = true,\n                           bool use_outgoing_nbrs = true);\n\n    LayeredNeighborSampler(shared_ptr<MariusGraph> graph, std::vector<shared_ptr<NeighborSamplingConfig>> layer_configs, bool use_incoming_nbrs = true,\n                           bool use_outgoing_nbrs = true);\n\n    LayeredNeighborSampler(std::vector<shared_ptr<NeighborSamplingConfig>> layer_configs, bool use_incoming_nbrs = true, bool use_outgoing_nbrs = true);\n\n    void checkLayerConfigs();\n\n    DENSEGraph getNeighbors(torch::Tensor node_ids, shared_ptr<MariusGraph> graph = nullptr, int worker_id = 0) override;\n    // TODO this change may affect test_nn.py\n\n    torch::Tensor computeDeltaIdsHelperMethod1(torch::Tensor hash_map, torch::Tensor node_ids, torch::Tensor delta_incoming_edges,\n                                               torch::Tensor delta_outgoing_edges, int64_t num_nodes_in_memory);\n};\n\n#endif  // MARIUS_NEIGHBOR_SAMPLER_H\n"
  },
  {
    "path": "src/cpp/include/marius.h",
    "content": "#include \"configuration/config.h\"\n#include \"data/dataloader.h\"\n#include \"nn/model.h\"\n#include \"storage/graph_storage.h\"\n\nvoid encode_and_export(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model, shared_ptr<MariusConfig> marius_config);\n\nstd::tuple<shared_ptr<Model>, shared_ptr<GraphModelStorage>, shared_ptr<DataLoader> > marius_init(shared_ptr<MariusConfig> marius_config, bool train);\n\nvoid marius_train(shared_ptr<MariusConfig> marius_config);\n\nvoid marius_eval(shared_ptr<MariusConfig> marius_config);\n\nvoid marius(int argc, char *argv[]);\n\nint main(int argc, char *argv[]);\n"
  },
  {
    "path": "src/cpp/include/nn/activation.h",
    "content": "//\n// Created by Jason Mohoney on 10/7/21.\n//\n\n#ifndef MARIUS_ACTIVATION_H\n#define MARIUS_ACTIVATION_H\n\n#include \"common/datatypes.h\"\n#include \"configuration/config.h\"\n\ntorch::Tensor apply_activation(ActivationFunction activation_function, torch::Tensor input);\n\n#endif  // MARIUS_ACTIVATION_H\n"
  },
  {
    "path": "src/cpp/include/nn/decoders/decoder.h",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#ifndef MARIUS_DECODER_H\n#define MARIUS_DECODER_H\n\n#include <configuration/options.h>\n\n#include \"common/datatypes.h\"\n\nclass Decoder {\n   public:\n    LearningTask learning_task_;\n\n    virtual ~Decoder(){};\n};\n\n#endif  // MARIUS_DECODER_H\n"
  },
  {
    "path": "src/cpp/include/nn/decoders/edge/comparators.h",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#ifndef MARIUS_COMPARATOR_H\n#define MARIUS_COMPARATOR_H\n\n#include \"common/datatypes.h\"\n\ntorch::Tensor pad_and_reshape(torch::Tensor input, int num_chunks);\n\n// Embedding Comparator Functions\nclass Comparator {\n   public:\n    virtual ~Comparator(){};\n    virtual torch::Tensor operator()(torch::Tensor src, torch::Tensor dst) = 0;\n};\n\nclass L2Compare : public Comparator {\n   public:\n    L2Compare(){};\n\n    torch::Tensor operator()(torch::Tensor src, torch::Tensor dst) override;\n};\n\nclass CosineCompare : public Comparator {\n   public:\n    CosineCompare(){};\n\n    torch::Tensor operator()(torch::Tensor src, torch::Tensor dst) override;\n};\n\nclass DotCompare : public Comparator {\n   public:\n    DotCompare(){};\n\n    torch::Tensor operator()(torch::Tensor src, torch::Tensor dst) override;\n};\n\n#endif  // MARIUS_COMPARATOR_H\n"
  },
  {
    "path": "src/cpp/include/nn/decoders/edge/complex.h",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#ifndef MARIUS_COMPLEX_H\n#define MARIUS_COMPLEX_H\n\n#include \"nn/decoders/edge/edge_decoder.h\"\n\nclass ComplEx : public EdgeDecoder, public torch::nn::Cloneable<ComplEx> {\n   public:\n    ComplEx(int num_relations, int embedding_dim, torch::TensorOptions tensor_options = torch::TensorOptions(), bool use_inverse_relations = true,\n            EdgeDecoderMethod decoder_method = EdgeDecoderMethod::CORRUPT_NODE);\n\n    void reset() override;\n};\n\n#endif  // MARIUS_COMPLEX_H\n"
  },
  {
    "path": "src/cpp/include/nn/decoders/edge/decoder_methods.h",
    "content": "//\n// Created by Jason Mohoney on 3/31/22.\n//\n\n#ifndef MARIUS_DECODER_METHODS_H\n#define MARIUS_DECODER_METHODS_H\n\n#include \"common/datatypes.h\"\n#include \"nn/decoders/edge/edge_decoder.h\"\n\nstd::tuple<torch::Tensor, torch::Tensor> only_pos_forward(shared_ptr<EdgeDecoder> decoder, torch::Tensor positive_edges, torch::Tensor node_embeddings);\n\nstd::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> neg_and_pos_forward(shared_ptr<EdgeDecoder> decoder, torch::Tensor positive_edges,\n                                                                                           torch::Tensor negative_edges, torch::Tensor node_embeddings);\n\nstd::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> node_corrupt_forward(shared_ptr<EdgeDecoder> decoder, torch::Tensor positive_edges,\n                                                                                            torch::Tensor node_embeddings, torch::Tensor dst_negs,\n                                                                                            torch::Tensor src_negs = torch::Tensor());\n\nstd::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> rel_corrupt_forward(shared_ptr<EdgeDecoder> decoder, torch::Tensor positive_edges,\n                                                                                           torch::Tensor node_embeddings, torch::Tensor neg_rel_ids);\n\n#endif  // MARIUS_DECODER_METHODS_H\n"
  },
  {
    "path": "src/cpp/include/nn/decoders/edge/distmult.h",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#ifndef MARIUS_DISTMULT_H\n#define MARIUS_DISTMULT_H\n\n#include \"nn/decoders/edge/edge_decoder.h\"\n\nclass DistMult : public EdgeDecoder, public torch::nn::Cloneable<DistMult> {\n   public:\n    DistMult(int num_relations, int embedding_dim, torch::TensorOptions tensor_options = torch::TensorOptions(), bool use_inverse_relations = true,\n             EdgeDecoderMethod decoder_method = EdgeDecoderMethod::CORRUPT_NODE);\n\n    void reset() override;\n};\n\n#endif  // MARIUS_DISTMULT_H\n"
  },
  {
    "path": "src/cpp/include/nn/decoders/edge/edge_decoder.h",
    "content": "//\n// Created by Jason Mohoney on 2/6/22.\n//\n\n#ifndef MARIUS_EDGE_DECODER_H\n#define MARIUS_EDGE_DECODER_H\n\n#include \"common/datatypes.h\"\n#include \"nn/decoders/decoder.h\"\n#include \"nn/decoders/edge/comparators.h\"\n#include \"nn/decoders/edge/relation_operators.h\"\n\nclass EdgeDecoder : public Decoder {\n   public:\n    shared_ptr<Comparator> comparator_;\n    shared_ptr<RelationOperator> relation_operator_;\n    torch::Tensor relations_;\n    torch::Tensor inverse_relations_;\n    int num_relations_;\n    int embedding_size_;\n    torch::TensorOptions tensor_options_;\n    EdgeDecoderMethod decoder_method_;\n\n    bool use_inverse_relations_;\n\n    torch::Tensor apply_relation(torch::Tensor nodes, torch::Tensor relations);\n\n    torch::Tensor compute_scores(torch::Tensor src, torch::Tensor dst);\n\n    torch::Tensor select_relations(torch::Tensor indices, bool inverse = false);\n};\n#endif  // MARIUS_EDGE_DECODER_H\n"
  },
  {
    "path": "src/cpp/include/nn/decoders/edge/relation_operators.h",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#ifndef MARIUS_RELATION_OPERATOR_H\n#define MARIUS_RELATION_OPERATOR_H\n\n#include \"common/datatypes.h\"\n\n// Relation Operators\nclass RelationOperator {\n   public:\n    virtual ~RelationOperator(){};\n    virtual torch::Tensor operator()(const torch::Tensor &embs, const torch::Tensor &rels) = 0;\n};\n\nclass HadamardOperator : public RelationOperator {\n   public:\n    torch::Tensor operator()(const torch::Tensor &embs, const torch::Tensor &rels) override;\n};\n\nclass ComplexHadamardOperator : public RelationOperator {\n   public:\n    torch::Tensor operator()(const torch::Tensor &embs, const torch::Tensor &rels) override;\n};\n\nclass TranslationOperator : public RelationOperator {\n   public:\n    torch::Tensor operator()(const torch::Tensor &embs, const torch::Tensor &rels) override;\n};\n\nclass NoOp : public RelationOperator {\n   public:\n    torch::Tensor operator()(const torch::Tensor &embs, const torch::Tensor &rels) override;\n};\n\n#endif  // MARIUS_RELATION_OPERATOR_H\n"
  },
  {
    "path": "src/cpp/include/nn/decoders/edge/transe.h",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#ifndef MARIUS_TRANSE_H\n#define MARIUS_TRANSE_H\n\n#include \"nn/decoders/edge/edge_decoder.h\"\n\nclass TransE : public EdgeDecoder, public torch::nn::Cloneable<TransE> {\n   public:\n    TransE(int num_relations, int embedding_dim, torch::TensorOptions tensor_options = torch::TensorOptions(), bool use_inverse_relations = true,\n           EdgeDecoderMethod decoder_method = EdgeDecoderMethod::CORRUPT_NODE);\n\n    void reset() override;\n};\n\n#endif  // MARIUS_TRANSE_H\n"
  },
  {
    "path": "src/cpp/include/nn/decoders/node/node_decoder.h",
    "content": "//\n// Created by Jason Mohoney on 2/5/22.\n//\n\n#ifndef MARIUS_NODE_DECODER_H\n#define MARIUS_NODE_DECODER_H\n\n#include \"nn/decoders/decoder.h\"\n\nclass NodeDecoder : public Decoder {\n   public:\n    virtual torch::Tensor forward(torch::Tensor node_repr) = 0;\n};\n\n#endif  // MARIUS_NODE_DECODER_H\n"
  },
  {
    "path": "src/cpp/include/nn/decoders/node/noop_node_decoder.h",
    "content": "//\n// Created by Jason Mohoney on 2/7/22.\n//\n\n#ifndef MARIUS_NOOP_NODE_DECODER_H\n#define MARIUS_NOOP_NODE_DECODER_H\n\n#include \"nn/decoders/node/node_decoder.h\"\n\nclass NoOpNodeDecoder : public NodeDecoder, public torch::nn::Cloneable<NoOpNodeDecoder> {\n   public:\n    NoOpNodeDecoder() { learning_task_ = LearningTask::NODE_CLASSIFICATION; };\n\n    torch::Tensor forward(torch::Tensor node_repr) override;\n\n    void reset() override;\n};\n\n#endif  // MARIUS_NOOP_NODE_DECODER_H"
  },
  {
    "path": "src/cpp/include/nn/encoders/encoder.h",
    "content": "//\n// Created by Jason Mohoney on 10/7/21.\n//\n\n#ifndef MARIUS_ENCODER_H\n#define MARIUS_ENCODER_H\n\n#include \"configuration/config.h\"\n#include \"nn/layers/layer.h\"\n\nclass GeneralEncoder : public torch::nn::Cloneable<GeneralEncoder> {\n   public:\n    shared_ptr<EncoderConfig> encoder_config_;\n    int num_relations_;\n    torch::Device device_;\n    bool has_features_;\n    bool has_embeddings_;\n\n    std::vector<std::vector<shared_ptr<Layer>>> layers_;\n\n    GeneralEncoder(shared_ptr<EncoderConfig> encoder_config, torch::Device device, int num_relations = 1);\n\n    GeneralEncoder(std::vector<std::vector<shared_ptr<Layer>>> layers);\n\n    torch::Tensor forward(at::optional<torch::Tensor> embeddings, at::optional<torch::Tensor> features, DENSEGraph dense_graph, bool train = true);\n\n    void reset() override;\n\n    std::shared_ptr<Layer> initEmbeddingLayer(shared_ptr<LayerConfig> layer_config, int stage_id, int layer_id);\n\n    std::shared_ptr<Layer> initFeatureLayer(shared_ptr<LayerConfig> layer_config, int stage_id, int layer_id);\n\n    std::shared_ptr<Layer> initDenseLayer(shared_ptr<LayerConfig> layer_config, int stage_id, int layer_id);\n\n    std::shared_ptr<Layer> initGNNLayer(shared_ptr<LayerConfig> layer_config, int stage_id, int layer_id, int sampling_id);\n\n    std::shared_ptr<Layer> initReductionLayer(shared_ptr<LayerConfig> layer_config, int stage_id, int layer_id);\n};\n\n#endif  // MARIUS_ENCODER_H\n"
  },
  {
    "path": "src/cpp/include/nn/initialization.h",
    "content": "//\n// Created by Jason Mohoney on 10/7/21.\n//\n\n#ifndef MARIUS_INITIALIZATION_H\n#define MARIUS_INITIALIZATION_H\n\n#include \"common/datatypes.h\"\n#include \"configuration/config.h\"\n\nstd::tuple<int64_t, int64_t> compute_fans(std::vector<int64_t> shape);\n\ntorch::Tensor glorot_uniform(std::vector<int64_t> shape, std::tuple<int64_t, int64_t> fans, torch::TensorOptions options);\n\ntorch::Tensor glorot_normal(std::vector<int64_t> shape, std::tuple<int64_t, int64_t> fans, torch::TensorOptions options);\n\ntorch::Tensor constant_init(float constant, std::vector<int64_t> shape, torch::TensorOptions options);\n\ntorch::Tensor uniform_init(float scale_factor, std::vector<int64_t> shape, torch::TensorOptions options);\n\ntorch::Tensor normal_init(float mean, float std, std::vector<int64_t> shape, torch::TensorOptions options);\n\ntorch::Tensor initialize_tensor(shared_ptr<InitConfig> init_config, std::vector<int64_t> shape, torch::TensorOptions tensor_options,\n                                std::tuple<int64_t, int64_t> fans = {-1, -1});\n\n/** For initializing large tensors that won't fit in memory */\ntorch::Tensor initialize_subtensor(shared_ptr<InitConfig> init_config, std::vector<int64_t> sub_shape, std::vector<int64_t> full_shape,\n                                   torch::TensorOptions tensor_options, std::tuple<int64_t, int64_t> fans = {-1, -1});\n\n#endif  // MARIUS_INITIALIZATION_H\n"
  },
  {
    "path": "src/cpp/include/nn/layers/embedding/embedding.h",
    "content": "//\n// Created by Jason Mohoney on 2/1/22.\n//\n\n#ifndef MARIUS_EMBEDDING_H\n#define MARIUS_EMBEDDING_H\n\n#include \"common/datatypes.h\"\n#include \"nn/layers/layer.h\"\n#include \"storage/storage.h\"\n\nclass EmbeddingLayer : public Layer {\n   public:\n    int offset_;\n\n    EmbeddingLayer(shared_ptr<LayerConfig> layer_config, torch::Device device, int offset = 0);\n\n    torch::Tensor forward(torch::Tensor input);\n\n    torch::Tensor init_embeddings(int64_t num_nodes);\n\n    void reset() override;\n};\n\n#endif  // MARIUS_EMBEDDING_H\n"
  },
  {
    "path": "src/cpp/include/nn/layers/feature/feature.h",
    "content": "//\n// Created by Jason Mohoney on 2/1/22.\n//\n\n#ifndef MARIUS_FEATURE_H\n#define MARIUS_FEATURE_H\n\n#include \"common/datatypes.h\"\n#include \"nn/layers/layer.h\"\n\nclass FeatureLayer : public Layer {\n   public:\n    int offset_;\n\n    FeatureLayer(shared_ptr<LayerConfig> layer_config, torch::Device device, int offset = 0);\n\n    torch::Tensor forward(torch::Tensor input);\n\n    void reset() override;\n};\n\n#endif  // MARIUS_FEATURE_H\n"
  },
  {
    "path": "src/cpp/include/nn/layers/gnn/gat_layer.h",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#ifndef MARIUS_GAT_LAYER_H\n#define MARIUS_GAT_LAYER_H\n\n#include \"gnn_layer.h\"\n\nclass GATLayer : public GNNLayer {\n   public:\n    shared_ptr<GATLayerOptions> options_;\n    int head_dim_;\n    float input_dropout_;\n    float attention_dropout_;\n    torch::Tensor weight_matrices_;\n    torch::Tensor a_l_;\n    torch::Tensor a_r_;\n\n    GATLayer(shared_ptr<LayerConfig> layer_config, torch::Device device);\n\n    void reset() override;\n\n    torch::Tensor forward(torch::Tensor inputs, DENSEGraph dense_graph, bool train = true) override;\n};\n\n#endif  // MARIUS_GAT_LAYER_H\n"
  },
  {
    "path": "src/cpp/include/nn/layers/gnn/gcn_layer.h",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#ifndef MARIUS_GCN_LAYER_H\n#define MARIUS_GCN_LAYER_H\n\n#include \"gnn_layer.h\"\n\nclass GCNLayer : public GNNLayer {\n   public:\n    shared_ptr<GNNLayerOptions> options_;\n    bool use_incoming_;\n    bool use_outgoing_;\n    torch::Tensor w_;\n\n    GCNLayer(shared_ptr<LayerConfig> layer_config, torch::Device device);\n\n    void reset() override;\n\n    torch::Tensor forward(torch::Tensor inputs, DENSEGraph dense_graph, bool train = true) override;\n};\n\n#endif  // MARIUS_GCN_LAYER_H\n"
  },
  {
    "path": "src/cpp/include/nn/layers/gnn/gnn_layer.h",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#ifndef MARIUS_GNN_LAYER_H\n#define MARIUS_GNN_LAYER_H\n\n#include \"common/datatypes.h\"\n#include \"configuration/config.h\"\n#include \"data/graph.h\"\n#include \"nn/initialization.h\"\n#include \"nn/layers/layer.h\"\n\nclass GNNLayer : public Layer {\n   public:\n    int input_dim_;\n    int output_dim_;\n\n    virtual ~GNNLayer(){};\n\n    virtual torch::Tensor forward(torch::Tensor inputs, DENSEGraph dense_graph, bool train) { return torch::Tensor(); };\n};\n\n#endif  // MARIUS_GNN_LAYER_H\n"
  },
  {
    "path": "src/cpp/include/nn/layers/gnn/graph_sage_layer.h",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#ifndef MARIUS_GRAPH_SAGE_LAYER_H\n#define MARIUS_GRAPH_SAGE_LAYER_H\n\n#include \"gnn_layer.h\"\n\nclass GraphSageLayer : public GNNLayer {\n   public:\n    shared_ptr<GraphSageLayerOptions> options_;\n    torch::Tensor w1_;\n    torch::Tensor w2_;\n\n    GraphSageLayer(shared_ptr<LayerConfig> layer_config, torch::Device device);\n\n    void reset() override;\n\n    torch::Tensor forward(torch::Tensor inputs, DENSEGraph dense_graph, bool train = true) override;\n};\n\n#endif  // MARIUS_GRAPH_SAGE_LAYER_H\n"
  },
  {
    "path": "src/cpp/include/nn/layers/gnn/layer_helpers.h",
    "content": "//\n// Created by Jason Mohoney on 10/1/21.\n//\n\n#ifndef MARIUS_LAYER_HELPERS_H\n#define MARIUS_LAYER_HELPERS_H\n\n#include \"common/datatypes.h\"\n\ntorch::Tensor segment_ids_from_offsets(torch::Tensor offsets, int64_t input_size);\n\ntorch::Tensor segmented_sum(torch::Tensor tensor, torch::Tensor segment_ids, int64_t num_segments);\n\ntorch::Tensor segmented_sum_with_offsets(torch::Tensor tensor, torch::Tensor offsets);\n\ntorch::Tensor segmented_max_with_offsets(torch::Tensor tensor, torch::Tensor offsets);\n\nstd::tuple<torch::Tensor, torch::Tensor> attention_softmax(torch::Tensor neighbor_attention, torch::Tensor self_attention, torch::Tensor segment_offsets,\n                                                           torch::Tensor segment_ids, torch::Tensor num_nbrs);\n\n#endif  // MARIUS_LAYER_HELPERS_H\n"
  },
  {
    "path": "src/cpp/include/nn/layers/gnn/rgcn_layer.h",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#ifndef MARIUS_RGCN_LAYER_H\n#define MARIUS_RGCN_LAYER_H\n\n#include \"gnn_layer.h\"\n\nclass RGCNLayer : public GNNLayer {\n   public:\n    shared_ptr<GNNLayerOptions> options_;\n    int num_relations_;\n    torch::Tensor relation_matrices_;\n    torch::Tensor inverse_relation_matrices_;\n    torch::Tensor self_matrix_;\n\n    RGCNLayer(shared_ptr<LayerConfig> layer_config, int num_relations, torch::Device device);\n\n    void reset() override;\n\n    torch::Tensor forward(torch::Tensor inputs, DENSEGraph dense_graph, bool train = true) override;\n};\n\n#endif  // MARIUS_RGCN_LAYER_H\n"
  },
  {
    "path": "src/cpp/include/nn/layers/layer.h",
    "content": "//\n// Created by Jason Mohoney on 2/1/22.\n//\n\n#ifndef MARIUS_LAYER_H\n#define MARIUS_LAYER_H\n\n#include \"common/datatypes.h\"\n#include \"configuration/config.h\"\n#include \"data/graph.h\"\n#include \"nn/activation.h\"\n#include \"nn/initialization.h\"\n\nclass Layer : public torch::nn::Module {\n   public:\n    shared_ptr<LayerConfig> config_;\n    torch::Device device_;\n    torch::Tensor bias_;\n\n    Layer();\n\n    virtual ~Layer(){};\n\n    virtual void reset() = 0;\n\n    torch::Tensor post_hook(torch::Tensor input);\n\n    void init_bias();\n};\n\n#endif  // MARIUS_LAYER_H\n"
  },
  {
    "path": "src/cpp/include/nn/layers/reduction/concat.h",
    "content": "//\n// Created by Jason Mohoney on 12/10/21.\n//\n\n#ifndef MARIUS_CONCAT_H\n#define MARIUS_CONCAT_H\n\n#include \"common/datatypes.h\"\n#include \"reduction_layer.h\"\n\nclass ConcatReduction : public ReductionLayer {\n   public:\n    ConcatReduction(shared_ptr<LayerConfig> layer_config, torch::Device device);\n\n    torch::Tensor forward(std::vector<torch::Tensor> inputs) override;\n\n    void reset() override;\n};\n\n#endif  // MARIUS_CONCAT_H\n"
  },
  {
    "path": "src/cpp/include/nn/layers/reduction/linear.h",
    "content": "//\n// Created by Jason Mohoney on 12/10/21.\n//\n\n#ifndef MARIUS_LINEAR_H\n#define MARIUS_LINEAR_H\n\n#include \"common/datatypes.h\"\n#include \"reduction_layer.h\"\n\nclass LinearReduction : public ReductionLayer {\n   public:\n    torch::Tensor weight_matrix_;\n\n    LinearReduction(shared_ptr<LayerConfig> layer_config, torch::Device device);\n\n    torch::Tensor forward(std::vector<torch::Tensor> inputs) override;\n\n    void reset() override;\n};\n\n#endif  // MARIUS_LINEAR_H\n"
  },
  {
    "path": "src/cpp/include/nn/layers/reduction/reduction_layer.h",
    "content": "//\n// Created by Jason Mohoney on 8/25/21.\n//\n\n#ifndef MARIUS_FEATURIZER_H_\n#define MARIUS_FEATURIZER_H_\n\n#include \"common/datatypes.h\"\n#include \"configuration/config.h\"\n#include \"nn/layers/layer.h\"\n\n/**\n  Generates new embeddings for nodes by combining node features and their respective embeddings in order to emphasize individual node properties.\n*/\nclass ReductionLayer : public Layer {\n   public:\n    virtual ~ReductionLayer(){};\n\n    virtual torch::Tensor forward(std::vector<torch::Tensor> inputs) = 0;\n};\n\n#endif  // MARIUS_FEATURIZER_H_\n"
  },
  {
    "path": "src/cpp/include/nn/loss.h",
    "content": "//\n// Created by Jason Mohoney on 8/25/21.\n//\n\n#ifndef MARIUS_SRC_CPP_INCLUDE_LOSS_H_\n#define MARIUS_SRC_CPP_INCLUDE_LOSS_H_\n\n#include \"common/datatypes.h\"\n#include \"configuration/config.h\"\n\nvoid check_score_shapes(torch::Tensor pos_scores, torch::Tensor neg_scores);\n\nstd::tuple<torch::Tensor, torch::Tensor> scores_to_labels(torch::Tensor pos_scores, torch::Tensor neg_scores, bool one_hot);\n\ntorch::Tensor to_one_hot(torch::Tensor labels, int num_classes);\n\n// Loss Functions\n/**\n  Calculates loss for generated embeddings. Currently only supports link prediction losses. Node classification is hard-coded to use torch.cross_entropy.\n*/\nclass LossFunction {\n   public:\n    virtual ~LossFunction(){};\n    /**\n      Takes positive and negative scores and calculates loss.\n      @param pos_scores Positive scores\n      @param neg_scores Negative scores\n      @return Loss vector\n    */\n    virtual torch::Tensor operator()(torch::Tensor y_pred, torch::Tensor targets, bool scores) = 0;\n};\n\nclass SoftmaxCrossEntropy : public LossFunction {\n   private:\n    LossReduction reduction_type_;\n\n   public:\n    SoftmaxCrossEntropy(shared_ptr<LossOptions> options) { reduction_type_ = options->loss_reduction; };\n\n    torch::Tensor operator()(torch::Tensor y_pred, torch::Tensor targets, bool scores) override;\n};\n\nclass RankingLoss : public LossFunction {\n   private:\n    float margin_;\n    LossReduction reduction_type_;\n\n   public:\n    RankingLoss(shared_ptr<RankingLossOptions> options) {\n        margin_ = options->margin;\n        reduction_type_ = options->loss_reduction;\n    };\n\n    torch::Tensor operator()(torch::Tensor pos_scores, torch::Tensor neg_scores, bool scores = true) override;\n};\n\nclass CrossEntropyLoss : public LossFunction {\n   private:\n    LossReduction reduction_type_;\n\n   public:\n    CrossEntropyLoss(shared_ptr<LossOptions> options) { reduction_type_ = options->loss_reduction; };\n\n    
torch::Tensor operator()(torch::Tensor y_pred, torch::Tensor targets, bool scores) override;\n};\n\nclass BCEAfterSigmoidLoss : public LossFunction {\n   private:\n    LossReduction reduction_type_;\n\n   public:\n    BCEAfterSigmoidLoss(shared_ptr<LossOptions> options) { reduction_type_ = options->loss_reduction; };\n\n    torch::Tensor operator()(torch::Tensor y_pred, torch::Tensor targets, bool scores) override;\n};\n\nclass BCEWithLogitsLoss : public LossFunction {\n   private:\n    LossReduction reduction_type_;\n\n   public:\n    BCEWithLogitsLoss(shared_ptr<LossOptions> options) { reduction_type_ = options->loss_reduction; };\n\n    torch::Tensor operator()(torch::Tensor y_pred, torch::Tensor targets, bool scores) override;\n};\n\nclass MSELoss : public LossFunction {\n   private:\n    LossReduction reduction_type_;\n\n   public:\n    MSELoss(shared_ptr<LossOptions> options) { reduction_type_ = options->loss_reduction; };\n\n    torch::Tensor operator()(torch::Tensor y_pred, torch::Tensor targets, bool scores) override;\n};\n\nclass SoftPlusLoss : public LossFunction {\n   private:\n    LossReduction reduction_type_;\n\n   public:\n    SoftPlusLoss(shared_ptr<LossOptions> options) { reduction_type_ = options->loss_reduction; };\n\n    torch::Tensor operator()(torch::Tensor y_pred, torch::Tensor targets, bool scores) override;\n};\n\nshared_ptr<LossFunction> getLossFunction(shared_ptr<LossConfig> config);\n\n#endif  // MARIUS_SRC_CPP_INCLUDE_LOSS_H_\n"
  },
  {
    "path": "src/cpp/include/nn/model.h",
    "content": "//\n// Created by Jason Mohoney on 2/11/21.\n//\n\n#ifndef MARIUS_INCLUDE_MODEL_H_\n#define MARIUS_INCLUDE_MODEL_H_\n\n#include \"configuration/config.h\"\n#include \"data/batch.h\"\n#include \"decoders/decoder.h\"\n#include \"encoders/encoder.h\"\n#include \"loss.h\"\n#include \"optim.h\"\n#include \"reporting/reporting.h\"\n\nclass Model : public torch::nn::Module {\n   public:\n    shared_ptr<GeneralEncoder> encoder_;\n    shared_ptr<Decoder> decoder_;\n    shared_ptr<LossFunction> loss_function_;\n    shared_ptr<Reporter> reporter_;\n    std::vector<shared_ptr<Optimizer>> optimizers_;\n\n    torch::Device device_;\n    LearningTask learning_task_;\n    float sparse_lr_;\n\n    // Multi-GPU training\n    std::vector<shared_ptr<Model>> device_models_;\n\n    Model(shared_ptr<GeneralEncoder> encoder, shared_ptr<Decoder> decoder, shared_ptr<LossFunction> loss, shared_ptr<Reporter> reporter = nullptr,\n          std::vector<shared_ptr<Optimizer>> optimizers_ = {});\n\n    torch::Tensor forward_nc(at::optional<torch::Tensor> node_embeddings, at::optional<torch::Tensor> node_features, DENSEGraph dense_graph, bool train);\n\n    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> forward_lp(shared_ptr<Batch> batch, bool train);\n\n    void train_batch(shared_ptr<Batch> batch, bool call_step = true);\n\n    void evaluate_batch(shared_ptr<Batch> batch);\n\n    void clear_grad();\n\n    void clear_grad_all();\n\n    void step();\n\n    void step_all();\n\n    void save(string directory);\n\n    void load(string directory, bool train);\n\n    void broadcast(std::vector<torch::Device> devices);\n\n    void all_reduce();\n\n    void setup_optimizers(shared_ptr<ModelConfig> model_config);\n\n    int64_t get_base_embedding_dim();\n\n    bool has_embeddings();\n};\n\nshared_ptr<Model> initModelFromConfig(shared_ptr<ModelConfig> model_config, std::vector<torch::Device> devices, int num_relations, bool train);\n\n#endif  // 
MARIUS_INCLUDE_MODEL_H_\n"
  },
  {
    "path": "src/cpp/include/nn/model_helpers.h",
    "content": "//\n// Created by Jason Mohoney on 9/17/21.\n//\n\n#ifndef MARIUS_MODEL_HELPERS_H\n#define MARIUS_MODEL_HELPERS_H\n\n#include \"model.h\"\n#include \"nn/decoders/edge/complex.h\"\n#include \"nn/decoders/edge/distmult.h\"\n#include \"nn/decoders/edge/edge_decoder.h\"\n#include \"nn/decoders/edge/transe.h\"\n#include \"nn/decoders/node/noop_node_decoder.h\"\n\nstd::shared_ptr<Decoder> decoder_clone_helper(std::shared_ptr<Decoder> decoder, torch::Device device) {\n    return std::dynamic_pointer_cast<Decoder>(std::dynamic_pointer_cast<torch::nn::Module>(decoder)->clone(device));\n}\n\nstd::shared_ptr<GeneralEncoder> encoder_clone_helper(std::shared_ptr<GeneralEncoder> encoder, torch::Device device) {\n    return std::dynamic_pointer_cast<GeneralEncoder>(encoder->clone(device));\n}\n\nstd::shared_ptr<Decoder> get_edge_decoder(DecoderType decoder_type, EdgeDecoderMethod edge_decoder_method, int num_relations, int embedding_dim,\n                                          torch::TensorOptions tensor_options, bool use_inverse_relations) {\n    shared_ptr<EdgeDecoder> decoder;\n\n    if (decoder_type == DecoderType::DISTMULT) {\n        decoder = std::make_shared<DistMult>(num_relations, embedding_dim, tensor_options, use_inverse_relations, edge_decoder_method);\n    } else if (decoder_type == DecoderType::TRANSE) {\n        decoder = std::make_shared<TransE>(num_relations, embedding_dim, tensor_options, use_inverse_relations, edge_decoder_method);\n    } else if (decoder_type == DecoderType::COMPLEX) {\n        decoder = std::make_shared<ComplEx>(num_relations, embedding_dim, tensor_options, use_inverse_relations, edge_decoder_method);\n    } else {\n        throw std::runtime_error(\"Decoder not supported for learning task.\");\n    }\n\n    return decoder;\n}\n\nstd::shared_ptr<Decoder> get_node_decoder(DecoderType decoder_type) {\n    shared_ptr<NodeDecoder> decoder;\n\n    if (decoder_type == DecoderType::NODE) {\n        decoder = 
std::make_shared<NoOpNodeDecoder>();\n    } else {\n        throw std::runtime_error(\"Decoder not supported for learning task.\");\n    }\n\n    return decoder;\n}\n\n#endif  // MARIUS_MODEL_HELPERS_H\n"
  },
  {
    "path": "src/cpp/include/nn/optim.h",
    "content": "//\n// Created by Jason Mohoney on 12/9/21.\n//\n\n#ifndef MARIUS_OPTIM_H\n#define MARIUS_OPTIM_H\n\n#include \"common/datatypes.h\"\n#include \"configuration/config.h\"\n\nclass Optimizer {\n   public:\n    int64_t num_steps_;\n\n    torch::OrderedDict<std::string, torch::OrderedDict<std::string, torch::Tensor>> state_dict_;\n    torch::OrderedDict<std::string, torch::Tensor> param_dict_;\n\n    virtual ~Optimizer(){};\n\n    void save(torch::serialize::OutputArchive &output_archive);\n\n    void load(torch::serialize::InputArchive &input_archive);\n\n    void clear_grad();\n\n    virtual void reset_state() = 0;\n\n    virtual void step() = 0;\n\n    virtual std::shared_ptr<Optimizer> clone() = 0;\n};\n\nclass SGDOptimizer : public Optimizer {\n   public:\n    float learning_rate_;\n\n    SGDOptimizer(const SGDOptimizer &optim) {\n        param_dict_ = optim.param_dict_;\n        learning_rate_ = optim.learning_rate_;\n        reset_state();\n    }\n\n    SGDOptimizer(torch::OrderedDict<std::string, torch::Tensor> param_dict, float learning_rate);\n\n    void reset_state() override;\n\n    void step() override;\n\n    std::shared_ptr<Optimizer> clone() override;\n};\n\nclass AdagradOptimizer : public Optimizer {\n   public:\n    float learning_rate_;\n    float eps_;\n    float lr_decay_;\n    float weight_decay_;\n    float init_value_;\n\n    AdagradOptimizer(const AdagradOptimizer &optim) {\n        param_dict_ = optim.param_dict_;\n        learning_rate_ = optim.learning_rate_;\n        eps_ = optim.eps_;\n        lr_decay_ = optim.lr_decay_;\n        weight_decay_ = optim.weight_decay_;\n        init_value_ = optim.init_value_;\n        reset_state();\n    }\n\n    AdagradOptimizer(torch::OrderedDict<std::string, torch::Tensor> param_dict, std::shared_ptr<AdagradOptions> options);\n\n    void reset_state() override;\n\n    void step() override;\n\n    std::shared_ptr<Optimizer> clone() override;\n};\n\nclass AdamOptimizer : public Optimizer 
{\n   public:\n    float learning_rate_;\n    float eps_;\n    float beta_1_;\n    float beta_2_;\n    float weight_decay_;\n    bool amsgrad_;\n\n    AdamOptimizer(const AdamOptimizer &optim) {\n        param_dict_ = optim.param_dict_;\n        learning_rate_ = optim.learning_rate_;\n        eps_ = optim.eps_;\n        beta_1_ = optim.beta_1_;\n        beta_2_ = optim.beta_2_;\n        weight_decay_ = optim.weight_decay_;\n        amsgrad_ = optim.amsgrad_;\n        reset_state();\n    }\n\n    AdamOptimizer(torch::OrderedDict<std::string, torch::Tensor> param_dict, std::shared_ptr<AdamOptions> options);\n\n    void reset_state() override;\n\n    void step() override;\n\n    std::shared_ptr<Optimizer> clone() override;\n};\n\n#endif  // MARIUS_OPTIM_H\n"
  },
  {
    "path": "src/cpp/include/nn/regularizer.h",
    "content": "//\n// Created by Jason Mohoney on 8/25/21.\n//\n\n#ifndef MARIUS_SRC_CPP_INCLUDE_REGULARIZER_H_\n#define MARIUS_SRC_CPP_INCLUDE_REGULARIZER_H_\n\n#include \"common/datatypes.h\"\n\nclass Regularizer {\n   public:\n    virtual ~Regularizer(){};\n\n    virtual torch::Tensor operator()(torch::Tensor src_nodes_embs, torch::Tensor dst_node_embs) = 0;\n};\n\nclass NormRegularizer : public Regularizer {\n   private:\n    int norm_;\n    float coefficient_;\n\n   public:\n    NormRegularizer(int norm, float coefficient);\n\n    torch::Tensor operator()(torch::Tensor src_nodes_embs, torch::Tensor dst_node_embs) override;\n};\n\n#endif  // MARIUS_SRC_CPP_INCLUDE_REGULARIZER_H_\n"
  },
  {
    "path": "src/cpp/include/pipeline/evaluator.h",
    "content": "//\n// Created by Jason Mohoney on 2/28/20.\n//\n\n#ifndef MARIUS_EVALUATOR_H\n#define MARIUS_EVALUATOR_H\n\n#include <iostream>\n\n#include \"data/dataloader.h\"\n#include \"pipeline_cpu.h\"\n#include \"pipeline_gpu.h\"\n\n/**\n  The evaluator runs the evaluation process using the given model and dataset.\n*/\nclass Evaluator {\n   public:\n    shared_ptr<DataLoader> dataloader_;\n\n    virtual ~Evaluator(){};\n\n    /**\n      Runs evaluation process.\n      @param validation If true, evaluate on validation set. Otherwise evaluate on test set\n    */\n    virtual void evaluate(bool validation) = 0;\n};\n\nclass PipelineEvaluator : public Evaluator {\n    shared_ptr<Pipeline> pipeline_;\n\n   public:\n    PipelineEvaluator(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model, shared_ptr<PipelineConfig> pipeline_config);\n\n    void evaluate(bool validation) override;\n};\n\nclass SynchronousEvaluator : public Evaluator {\n    shared_ptr<Model> model_;\n\n   public:\n    SynchronousEvaluator(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model);\n\n    void evaluate(bool validation) override;\n};\n\n#endif  // MARIUS_EVALUATOR_H\n"
  },
  {
    "path": "src/cpp/include/pipeline/graph_encoder.h",
    "content": "//\n// Created by Jason Mohoney on 1/21/22.\n//\n\n#ifndef MARIUS_GRAPH_ENCODER_H\n#define MARIUS_GRAPH_ENCODER_H\n\n#include \"data/dataloader.h\"\n#include \"pipeline_cpu.h\"\n#include \"pipeline_gpu.h\"\n\nclass GraphEncoder {\n   public:\n    shared_ptr<DataLoader> dataloader_;\n    shared_ptr<ProgressReporter> progress_reporter_;\n\n    virtual ~GraphEncoder(){};\n    /**\n      Encodes all of the nodes in the graph\n      @param seperate_layers. If true, all the nodes at each layer will be encoded before moving onto the next layer.\n    */\n    virtual void encode(bool separate_layers = false) = 0;\n};\n\nclass PipelineGraphEncoder : public GraphEncoder {\n    shared_ptr<Pipeline> pipeline_;\n\n   public:\n    PipelineGraphEncoder(shared_ptr<DataLoader> sampler, std::shared_ptr<Model> model, shared_ptr<PipelineConfig> pipeline_config, int logs_per_epoch = 10);\n\n    void encode(bool separate_layers = false) override;\n};\n\nclass SynchronousGraphEncoder : public GraphEncoder {\n    std::shared_ptr<Model> model_;\n\n   public:\n    SynchronousGraphEncoder(shared_ptr<DataLoader> sampler, std::shared_ptr<Model> model, int logs_per_epoch = 10);\n\n    void encode(bool separate_layers = false) override;\n};\n\n#endif  // MARIUS_GRAPH_ENCODER_H\n"
  },
  {
    "path": "src/cpp/include/pipeline/pipeline.h",
    "content": "//\n// Created by Jason Mohoney on 2/29/20.\n//\n#ifndef MARIUS_PIPELINE_H\n#define MARIUS_PIPELINE_H\n\n#include <time.h>\n\n#include \"common/datatypes.h\"\n#include \"data/batch.h\"\n#include \"data/dataloader.h\"\n#include \"nn/model.h\"\n#include \"pipeline_constants.h\"\n#include \"queue.h\"\n\nclass Pipeline;\n\nclass Worker {\n   protected:\n    Pipeline *pipeline_;\n    struct timespec sleep_time_;\n    std::atomic<bool> paused_;\n    std::atomic<bool> done_;\n\n    std::thread thread_;\n\n   public:\n    explicit Worker(Pipeline *pipeline);\n\n    virtual void run() = 0;\n\n    void spawn() { thread_ = std::thread(&Worker::run, this); }\n\n    void start() { paused_ = false; }\n\n    void pause() { paused_ = true; }\n\n    void stop() {\n        paused_ = true;\n        done_ = true;\n\n        if (thread_.joinable()) {\n            thread_.join();\n        }\n    }\n};\n\nclass LoadBatchWorker : public Worker {\n   public:\n    int worker_id_;\n\n    LoadBatchWorker(Pipeline *pipeline, int worker_id) : Worker{pipeline}, worker_id_{worker_id} {};\n\n    void run() override;\n};\n\nclass UpdateBatchWorker : public Worker {\n   public:\n    UpdateBatchWorker(Pipeline *pipeline) : Worker{pipeline} {};\n\n    void run() override;\n};\n\nclass WriteNodesWorker : public Worker {\n   public:\n    WriteNodesWorker(Pipeline *pipeline) : Worker{pipeline} {}\n\n    void run() override;\n};\n\nclass Pipeline {\n   public:\n    shared_ptr<DataLoader> dataloader_;\n    shared_ptr<Model> model_;\n    shared_ptr<ProgressReporter> reporter_;\n    shared_ptr<PipelineConfig> pipeline_options_;\n\n    int staleness_bound_;\n    std::atomic<int> batches_in_flight_;\n    std::mutex *max_batches_lock_;\n    std::condition_variable *max_batches_cv_;\n    std::atomic<int64_t> edges_processed_;\n\n    shared_ptr<Queue<shared_ptr<Batch>>> loaded_batches_;\n    shared_ptr<Queue<shared_ptr<Batch>>> update_batches_;\n\n    std::mutex *pipeline_lock_;\n    
std::condition_variable pipeline_cv_;\n\n    std::atomic<int> admitted_batches_;\n\n    std::atomic<int> assign_id_;\n\n    bool encode_only_;\n    bool train_;\n\n    int64_t curr_pos_;\n\n    ~Pipeline();\n\n    shared_ptr<Worker> initWorkerOfType(int worker_type, int gpu_id = 0, int worker_id = 0);\n\n    virtual void addWorkersToPool(int pool_id, int worker_type, int num_workers, int num_gpus = 1) = 0;\n\n    bool isDone();\n\n    bool isTrain();\n\n    bool has_embeddings();\n\n    void waitComplete();\n\n    virtual void initialize() = 0;\n\n    virtual void start() = 0;\n\n    virtual void pauseAndFlush() = 0;\n\n    virtual void flushQueues() = 0;\n\n    virtual void setQueueExpectingData(bool expecting_data) = 0;\n\n    virtual void reportQueueStatus(){};\n\n    virtual void reportThreadStatus(){};\n};\n\n#endif  // MARIUS_PIPELINE_H\n"
  },
  {
    "path": "src/cpp/include/pipeline/pipeline_constants.h",
    "content": "//\n// Created by Jason Mohoney on 1/21/22.\n//\n\n#ifndef MARIUS_PIPELINE_CONSTANTS_H\n#define MARIUS_PIPELINE_CONSTANTS_H\n\n// CPU Pipeline worker IDs\n#define LOAD_BATCH_ID 0\n#define CPU_COMPUTE_ID 1\n#define UPDATE_BATCH_ID 2\n\n// GPU Pipeline worker IDs\n#define H2D_TRANSFER_ID 3\n#define GPU_COMPUTE_ID 4\n#define D2H_TRANSFER_ID 5\n\n// Encode Pipeline worker IDs\n#define CPU_ENCODE_ID 6\n#define GPU_ENCODE_ID 7\n#define NODE_WRITE_ID 8\n\n#define CPU_NUM_WORKER_TYPES 3\n#define GPU_NUM_WORKER_TYPES 5\n\n#define WAIT_TIME 100000  // 100 micro seconds\n#define NANOSECOND 1\n#define MICROSECOND 1000\n#define MILLISECOND 1000000\n\n#endif  // MARIUS_PIPELINE_CONSTANTS_H\n"
  },
  {
    "path": "src/cpp/include/pipeline/pipeline_cpu.h",
    "content": "//\n// Created by Jason Mohoney on 1/21/22.\n//\n\n#ifndef MARIUS_PIPELINE_CPU_H\n#define MARIUS_PIPELINE_CPU_H\n\n#include \"pipeline.h\"\n#include \"queue.h\"\n\nclass ComputeWorkerCPU : public Worker {\n   public:\n    ComputeWorkerCPU(Pipeline *pipeline) : Worker{pipeline} {}\n\n    void run() override;\n};\n\nclass EncodeNodesWorkerCPU : public Worker {\n   public:\n    int gpu_id_;\n\n    EncodeNodesWorkerCPU(Pipeline *pipeline) : Worker{pipeline} {}\n\n    void run() override;\n};\n\nclass PipelineCPU : public Pipeline {\n   public:\n    vector<shared_ptr<Worker>> pool_[CPU_NUM_WORKER_TYPES];\n\n    PipelineCPU(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model, bool train, shared_ptr<ProgressReporter> reporter,\n                shared_ptr<PipelineConfig> pipeline_config, bool encode_only = false);\n\n    ~PipelineCPU();\n\n    void addWorkersToPool(int pool_id, int worker_type, int num_workers, int num_gpus = 1) override;\n\n    void initialize() override;\n\n    void start() override;\n\n    void pauseAndFlush() override;\n\n    void flushQueues() override;\n\n    void setQueueExpectingData(bool expecting_data) override;\n};\n\n#endif  // MARIUS_PIPELINE_CPU_H\n"
  },
  {
    "path": "src/cpp/include/pipeline/pipeline_gpu.h",
    "content": "//\n// Created by Jason Mohoney on 1/21/22.\n//\n\n#ifndef MARIUS_PIPELINE_GPU_H\n#define MARIUS_PIPELINE_GPU_H\n\n#include \"pipeline.h\"\n#include \"queue.h\"\n\nclass BatchToDeviceWorker : public Worker {\n   public:\n    BatchToDeviceWorker(Pipeline *pipeline) : Worker{pipeline} {};\n\n    void run() override;\n};\n\nclass ComputeWorkerGPU : public Worker {\n   public:\n    int gpu_id_;\n\n    ComputeWorkerGPU(Pipeline *pipeline, int gpu_id) : Worker{pipeline}, gpu_id_{gpu_id} {}\n\n    void run() override;\n};\n\nclass EncodeNodesWorkerGPU : public Worker {\n   public:\n    int gpu_id_;\n\n    EncodeNodesWorkerGPU(Pipeline *pipeline, int gpu_id) : Worker{pipeline}, gpu_id_{gpu_id} {}\n\n    void run() override;\n};\n\nclass BatchToHostWorker : public Worker {\n   public:\n    int gpu_id_;\n\n    BatchToHostWorker(Pipeline *pipeline, int gpu_id) : Worker{pipeline}, gpu_id_{gpu_id} {};\n\n    void run() override;\n};\n\nclass PipelineGPU : public Pipeline {\n   public:\n    vector<shared_ptr<Worker>> pool_[GPU_NUM_WORKER_TYPES];\n\n    std::vector<shared_ptr<Queue<shared_ptr<Batch>>>> device_loaded_batches_;  // one queue per GPU\n    std::vector<shared_ptr<Queue<shared_ptr<Batch>>>> device_update_batches_;  // one queue per GPU\n\n    // these variables should only be accessed/updated when the model->lock is acquired\n    std::mutex *gpu_sync_lock_;\n    std::condition_variable *gpu_sync_cv_;\n    int batches_since_last_sync_;\n    int gpu_sync_interval_;\n\n    PipelineGPU(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model, bool train, shared_ptr<ProgressReporter> reporter,\n                shared_ptr<PipelineConfig> pipeline_config, bool encode_only = false);\n\n    ~PipelineGPU();\n\n    void addWorkersToPool(int pool_id, int worker_type, int num_workers, int num_gpus = 1) override;\n\n    void initialize() override;\n\n    void start() override;\n\n    void pauseAndFlush() override;\n\n    void flushQueues() override;\n\n    void 
setQueueExpectingData(bool expecting_data) override;\n};\n\n#endif  // MARIUS_PIPELINE_GPU_H\n"
  },
  {
    "path": "src/cpp/include/pipeline/pipeline_monitor.h",
    "content": "//\n// Created by Jason Mohoney on 1/21/22.\n//\n\n#ifndef MARIUS_PIPELINE_MONITOR_H\n#define MARIUS_PIPELINE_MONITOR_H\n\n#endif  // MARIUS_PIPELINE_MONITOR_H\n"
  },
  {
    "path": "src/cpp/include/pipeline/queue.h",
    "content": "//\n// Created by Jason Mohoney on 1/21/22.\n//\n\n#ifndef MARIUS_QUEUE_H\n#define MARIUS_QUEUE_H\n\ntemplate <class T>\nclass Queue {\n   private:\n    int max_size_;\n\n   public:\n    std::deque<T> queue_;\n    std::mutex *mutex_;\n    std::condition_variable *cv_;\n    std::atomic<bool> expecting_data_;\n\n    Queue<T>(int max_size) {\n        queue_ = std::deque<T>();\n        max_size_ = max_size;\n        mutex_ = new std::mutex();\n        cv_ = new std::condition_variable();\n        expecting_data_ = true;\n    }\n\n    bool push(T item) {\n        bool result = true;\n        if (isFull()) {\n            result = false;\n        } else {\n            queue_.push_back(item);\n        }\n        return result;\n    }\n\n    void blocking_push(T item) {\n        bool pushed = false;\n        while (!pushed) {\n            std::unique_lock lock(*mutex_);\n            pushed = push(item);\n            if (!pushed) {\n                cv_->wait(lock);\n            } else {\n                cv_->notify_all();\n            }\n            lock.unlock();\n        }\n    }\n\n    std::tuple<bool, T> pop() {\n        bool result = true;\n        T item;\n        if (isEmpty()) {\n            result = false;\n        } else {\n            item = queue_.front();\n            queue_.pop_front();\n        }\n        return std::forward_as_tuple(result, item);\n    }\n\n    std::tuple<bool, T> blocking_pop() {\n        bool popped = false;\n        T item;\n        while (!popped && expecting_data_) {\n            std::unique_lock lock(*mutex_);\n            auto tup = pop();\n            popped = std::get<0>(tup);\n            item = std::get<1>(tup);\n            if (!popped) {\n                cv_->wait(lock);\n            } else {\n                cv_->notify_all();\n            }\n            lock.unlock();\n        }\n        return std::forward_as_tuple(popped, item);\n    }\n\n    void lock() { mutex_->lock(); }\n\n    void unlock() { 
mutex_->unlock(); }\n\n    void flush() {\n        lock();\n        queue_ = std::deque<T>();\n        unlock();\n    }\n\n    int size() { return queue_.size(); }\n\n    bool isFull() { return queue_.size() == max_size_; }\n\n    bool isEmpty() { return queue_.size() == 0; }\n\n    int getMaxSize() { return max_size_; }\n\n    typedef typename std::deque<T> queue_type;\n\n    typedef typename queue_type::iterator iterator;\n    typedef typename queue_type::const_iterator const_iterator;\n\n    inline iterator begin() noexcept { return queue_.begin(); }\n\n    inline const_iterator cbegin() const noexcept { return queue_.cbegin(); }\n\n    inline iterator end() noexcept { return queue_.end(); }\n\n    inline const_iterator cend() const noexcept { return queue_.cend(); }\n};\n\n#endif  // MARIUS_QUEUE_H\n"
  },
  {
    "path": "src/cpp/include/pipeline/trainer.h",
    "content": "//\n// Created by Jason Mohoney on 2/28/20.\n//\n#ifndef MARIUS_TRAINER_H\n#define MARIUS_TRAINER_H\n\n#include \"data/dataloader.h\"\n#include \"pipeline_cpu.h\"\n#include \"pipeline_gpu.h\"\n\n/**\n  The trainer runs the training process using the given model for the specified number of epochs.\n*/\nclass Trainer {\n   public:\n    shared_ptr<DataLoader> dataloader_;\n    shared_ptr<ProgressReporter> progress_reporter_;\n    LearningTask learning_task_;\n\n    virtual ~Trainer(){};\n    /**\n      Runs training process for embeddings for specified number of epochs.\n      @param num_epochs The number of epochs to train for\n    */\n    virtual void train(int num_epochs = 1) = 0;\n};\n\nclass PipelineTrainer : public Trainer {\n    shared_ptr<Pipeline> pipeline_;\n\n   public:\n    PipelineTrainer(shared_ptr<DataLoader> dataloader, std::shared_ptr<Model> model, shared_ptr<PipelineConfig> pipeline_config, int logs_per_epoch = 10);\n\n    void train(int num_epochs = 1) override;\n};\n\nclass SynchronousTrainer : public Trainer {\n    std::shared_ptr<Model> model_;\n\n   public:\n    SynchronousTrainer(shared_ptr<DataLoader> dataloader, std::shared_ptr<Model> model, int logs_per_epoch = 10);\n\n    void train(int num_epochs = 1) override;\n};\n\n#endif  // MARIUS_TRAINER_H\n"
  },
  {
    "path": "src/cpp/include/reporting/logger.h",
    "content": "//\n// Created by Jason Mohoney on 7/2/20.\n//\n\n#ifndef MARIUS_LOGGER_H\n#define MARIUS_LOGGER_H\n#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE\n\n#include <spdlog/sinks/basic_file_sink.h>\n#include <spdlog/sinks/stdout_color_sinks.h>\n#include <spdlog/spdlog.h>\n\n#include <string>\n\nusing std::shared_ptr;\nusing std::string;\n\nclass MariusLogger {\n   private:\n    shared_ptr<spdlog::sinks::basic_file_sink_mt> trace_sink_;\n    shared_ptr<spdlog::sinks::basic_file_sink_mt> debug_sink_;\n    shared_ptr<spdlog::sinks::basic_file_sink_mt> info_sink_;\n    shared_ptr<spdlog::sinks::basic_file_sink_mt> warn_sink_;\n    shared_ptr<spdlog::sinks::basic_file_sink_mt> error_sink_;\n    shared_ptr<spdlog::sinks::stdout_color_sink_mt> console_sink_;\n\n   public:\n    shared_ptr<spdlog::logger> main_logger_;\n\n    MariusLogger(string model_dir) {\n        spdlog::drop_all();\n\n        console_sink_ = std::make_shared<spdlog::sinks::stdout_color_sink_mt>();\n        console_sink_->set_level(spdlog::level::info);\n        console_sink_->set_pattern(\"[%x %T.%e] %v\");\n\n        trace_sink_ = std::make_shared<spdlog::sinks::basic_file_sink_mt>(fmt::format(\"{}/logs/{}.log\", model_dir, \"trace\"), true);\n        trace_sink_->set_level(spdlog::level::trace);\n        trace_sink_->set_pattern(\"[%l] [%x %T.%e] [PID:%P TID:%t] [%s:%!:%#] %v\");\n\n        debug_sink_ = std::make_shared<spdlog::sinks::basic_file_sink_mt>(fmt::format(\"{}/logs/{}.log\", model_dir, \"debug\"), true);\n        debug_sink_->set_level(spdlog::level::debug);\n        debug_sink_->set_pattern(\"[%l] [%x %T.%e] [PID:%P TID:%t] [%s:%!:%#] %v\");\n\n        info_sink_ = std::make_shared<spdlog::sinks::basic_file_sink_mt>(fmt::format(\"{}/logs/{}.log\", model_dir, \"info\"), true);\n        info_sink_->set_level(spdlog::level::info);\n        info_sink_->set_pattern(\"[%l] [%x %T.%e] [PID:%P TID:%t] [%s:%!:%#] %v\");\n\n        warn_sink_ = 
std::make_shared<spdlog::sinks::basic_file_sink_mt>(fmt::format(\"{}/logs/{}.log\", model_dir, \"warn\"), true);\n        warn_sink_->set_level(spdlog::level::warn);\n        warn_sink_->set_pattern(\"[%l] [%x %T.%e] [PID:%P TID:%t] [%s:%!:%#] %v\");\n\n        error_sink_ = std::make_shared<spdlog::sinks::basic_file_sink_mt>(fmt::format(\"{}/logs/{}.log\", model_dir, \"error\"), true);\n        error_sink_->set_level(spdlog::level::err);\n        error_sink_->set_pattern(\"[%l] [%x %T.%e] [PID:%P TID:%t] [%g:%s:%!:%#] %v\");\n\n        spdlog::sinks_init_list sink_list = {error_sink_, warn_sink_, info_sink_, debug_sink_, trace_sink_, console_sink_};\n\n        main_logger_ = std::make_shared<spdlog::logger>(\"MariusLogger\", sink_list.begin(), sink_list.end());\n        main_logger_->set_level(spdlog::level::trace);\n        spdlog::register_logger(main_logger_);\n\n        spdlog::flush_every(std::chrono::seconds(1));\n    }\n\n    void setConsoleLogLevel(spdlog::level::level_enum level) { console_sink_->set_level(level); }\n};\n#endif  // MARIUS_LOGGER_H"
  },
  {
    "path": "src/cpp/include/reporting/reporting.h",
    "content": "//\n// Created by Jason Mohoney on 8/24/21.\n//\n\n#ifndef MARIUS_SRC_CPP_INCLUDE_REPORTING_H_\n#define MARIUS_SRC_CPP_INCLUDE_REPORTING_H_\n\n#include \"common/datatypes.h\"\n\nclass Metric {\n   public:\n    std::string name_;\n    std::string unit_;\n\n    virtual ~Metric(){};\n};\n\nclass RankingMetric : public Metric {\n   public:\n    virtual torch::Tensor computeMetric(torch::Tensor ranks) = 0;\n};\n\nclass HitskMetric : public RankingMetric {\n    int k_;\n\n   public:\n    HitskMetric(int k);\n\n    torch::Tensor computeMetric(torch::Tensor ranks);\n};\n\nclass MeanRankMetric : public RankingMetric {\n   public:\n    MeanRankMetric();\n\n    torch::Tensor computeMetric(torch::Tensor ranks);\n};\n\nclass MeanReciprocalRankMetric : public RankingMetric {\n   public:\n    MeanReciprocalRankMetric();\n\n    torch::Tensor computeMetric(torch::Tensor ranks);\n};\n\nclass ClassificationMetric : public Metric {\n   public:\n    virtual torch::Tensor computeMetric(torch::Tensor y_true, torch::Tensor y_pred) = 0;\n};\n\nclass CategoricalAccuracyMetric : public ClassificationMetric {\n   public:\n    CategoricalAccuracyMetric();\n\n    torch::Tensor computeMetric(torch::Tensor y_true, torch::Tensor y_pred) override;\n};\n\nclass Reporter {\n   private:\n    std::mutex *lock_;\n\n   public:\n    std::vector<shared_ptr<Metric>> metrics_;\n\n    Reporter() { lock_ = new std::mutex(); }\n\n    virtual ~Reporter();\n\n    void lock() { lock_->lock(); }\n\n    void unlock() { lock_->unlock(); }\n\n    void addMetric(shared_ptr<Metric> metric) { metrics_.emplace_back(metric); }\n\n    virtual void report() = 0;\n};\n\nclass LinkPredictionReporter : public Reporter {\n   public:\n    std::vector<torch::Tensor> per_batch_ranks_;\n    std::vector<torch::Tensor> per_batch_scores_;\n    std::vector<torch::Tensor> per_batch_edges_;\n    torch::Tensor all_ranks_;\n    torch::Tensor all_scores_;\n    torch::Tensor all_edges_;\n\n    LinkPredictionReporter();\n\n    
~LinkPredictionReporter();\n\n    void clear();\n\n    torch::Tensor computeRanks(torch::Tensor pos_scores, torch::Tensor neg_scores);\n\n    void addResult(torch::Tensor pos_scores, torch::Tensor neg_scores, torch::Tensor edges = torch::Tensor());\n\n    void report() override;\n\n    void save(string directory, bool scores, bool ranks);\n};\n\nclass NodeClassificationReporter : public Reporter {\n   public:\n    std::vector<torch::Tensor> per_batch_y_true_;\n    std::vector<torch::Tensor> per_batch_y_pred_;\n    std::vector<torch::Tensor> per_batch_nodes_;\n    torch::Tensor all_y_true_;\n    torch::Tensor all_y_pred_;\n    torch::Tensor all_nodes_;\n\n    NodeClassificationReporter();\n\n    ~NodeClassificationReporter();\n\n    void clear();\n\n    void addResult(torch::Tensor y_true, torch::Tensor y_pred, torch::Tensor node_ids = torch::Tensor());\n\n    void report() override;\n\n    void save(string directory, bool labels);\n};\n\nclass ProgressReporter : public Reporter {\n    std::string item_name_;\n    int64_t total_items_;\n    int64_t current_item_;\n    int total_reports_;\n    int64_t next_report_;\n    int64_t items_per_report_;\n\n   public:\n    ProgressReporter(std::string item_name, int64_t total_items, int total_reports);\n\n    ~ProgressReporter();\n\n    void clear();\n\n    void addResult(int64_t items_processed);\n\n    void report() override;\n};\n\n#endif  // MARIUS_SRC_CPP_INCLUDE_REPORTING_H_\n"
  },
  {
    "path": "src/cpp/include/storage/buffer.h",
    "content": "//\n// Created by Jason Mohoney on 5/26/20.\n//\n\n#ifndef MARIUS_BUFFER_H\n#define MARIUS_BUFFER_H\n\n#include \"common/datatypes.h\"\n#include \"data/batch.h\"\n\nclass Partition {\n   public:\n    std::mutex *lock_;            /**< Mutex lock to prevent race conditions */\n    std::condition_variable *cv_; /**< Condition variable for signaling */\n    void *data_ptr_;              /**< Pointer to partition in memory */\n    int partition_id_;            /**< ID of the partition */\n\n    bool present_; /**< If true this partition is present in the buffer */\n\n    int64_t partition_size_; /**< Number of embeddings in each partition, the last partition may have fewer embeddings than this */\n    int embedding_size_;     /**< Number of elements in each embedding */\n    torch::Dtype dtype_;     /**< Datatype of the embeddings */\n    int dtype_size_;         /**< Size in bytes of the datatype */\n    int64_t total_size_;     /**< Total size in bytes of the partition */\n\n    int64_t idx_offset_;  /**< Embedding ID offset of the partition */\n    int64_t file_offset_; /**< Offset in bytes of the partition in the embedding file */\n    int buffer_idx_;      /**< Buffer entry ID of the partition in the buffer */\n\n    torch::Tensor tensor_; /**< Tensor view of the partition */\n\n    bool evicting_;\n\n    Partition(int partition_id, int64_t partition_size, int embedding_size, torch::Dtype dtype, int64_t idx_offset, int64_t file_offset);\n\n    ~Partition();\n\n    torch::Tensor indexRead(Indices indices);\n};\n\nclass PartitionedFile {\n   public:\n    int num_partitions_;       /**< Number of partitions in the file */\n    int64_t partition_size_;   /**< Number of embeddings in each partition, the last partition may have fewer embeddings than this */\n    int embedding_size_;       /**< Number of elements in each embedding */\n    int64_t total_embeddings_; /**< Total number of embeddings */\n    torch::Dtype dtype_;       /**< Datatype of the 
embeddings */\n    int dtype_size_;           /**< Size in bytes of embedding element dtype */\n    string filename_;          /**< Name of the backing file */\n    int fd_;                   /**< File descriptor for the backing file */\n\n    /** Constructor */\n    PartitionedFile(string filename, int num_partitions, int64_t partition_size, int embedding_size, int64_t total_embeddings, torch::Dtype dtype);\n\n    /** Loads a partition of the specified id into addr, assumes that addr has been allocated with sufficient memory */\n    void readPartition(void *addr, Partition *partition);\n\n    /** Writes a partition from memory to the file */\n    void writePartition(Partition *partition, bool clear_mem = true);\n};\n\nclass LookaheadBlock {\n   private:\n    std::thread *thread_;\n    PartitionedFile *partitioned_file_;\n    std::vector<void *> mems_;\n\n    void run();\n\n   public:\n    int64_t total_size_;\n    std::atomic<bool> present_;\n    std::mutex *lock_;\n    std::condition_variable cv_;\n    std::vector<Partition *> partitions_;\n    std::atomic<bool> done_;\n\n    LookaheadBlock(int64_t total_size, PartitionedFile *partitioned_file, int num_per_lookahead);\n\n    ~LookaheadBlock();\n\n    void start(std::vector<Partition *> first_partitions);\n\n    void stop();\n\n    void move_to_buffer(std::vector<void *> buff_addrs, std::vector<int64_t> buffer_idxs, std::vector<Partition *> next_partitions);\n};\n\nclass AsyncWriteBlock {\n   private:\n    std::thread *thread_;\n    PartitionedFile *partitioned_file_;\n    std::vector<void *> mems_;\n\n    void run();\n\n   public:\n    int64_t total_size_;\n    std::atomic<bool> present_;\n    std::mutex *lock_;\n    std::condition_variable cv_;\n    std::vector<Partition *> partitions_;\n    std::atomic<bool> done_;\n\n    AsyncWriteBlock(int64_t total_size, PartitionedFile *partitioned_file, int num_per_evict);\n\n    ~AsyncWriteBlock();\n\n    void start();\n\n    void stop();\n\n    void 
async_write(std::vector<Partition *> partitions);\n};\n\nclass PartitionBuffer {\n   private:\n    std::atomic<int64_t> size_;\n    int capacity_;\n    int num_partitions_;     /**< Number of partitions in the file */\n    int64_t partition_size_; /**< Number of embeddings in each partition, the last partition may have fewer embeddings than this */\n    int embedding_size_;     /**< Number of elements in each embedding */\n    int fine_to_coarse_ratio_;\n    int64_t total_embeddings_;\n    torch::Dtype dtype_; /**< Datatype of the embeddings */\n    int dtype_size_;\n\n    void *buff_mem_;\n    bool loaded_;\n\n    Indices in_buffer_ids_;\n    torch::Tensor buffer_tensor_view_;\n    std::vector<Partition *> partition_table_;\n\n    bool prefetching_;\n    LookaheadBlock *lookahead_block_;\n    AsyncWriteBlock *async_write_block_;\n\n    string filename_;\n    PartitionedFile *partitioned_file_;\n\n    // order in which data is accessed\n    torch::Tensor buffer_state_;\n    std::vector<torch::Tensor> buffer_states_;\n    std::vector<torch::Tensor>::iterator buffer_state_iterator_;\n\n    torch::Tensor getBufferState();\n\n    void admit(std::vector<Partition *> admit_partitions, std::vector<int64_t> buffer_idxs);\n\n    void evict(std::vector<Partition *> evict_partitions);\n\n    void startThreads();\n\n    void stopThreads();\n\n   public:\n    PartitionBuffer(int capacity, int num_partitions, int fine_to_coarse_ratio, int64_t partition_size, int embedding_size, int64_t total_embeddings,\n                    torch::Dtype dtype, string filename, bool prefetching);\n\n    ~PartitionBuffer();\n\n    void load();\n\n    void write();\n\n    void unload(bool write);\n\n    std::vector<int> getNextAdmit();\n\n    std::vector<int> getNextEvict();\n\n    Indices getRandomIds(int64_t size);\n\n    torch::Tensor indexRead(torch::Tensor indices);\n\n    torch::Tensor getGlobalToLocalMap(bool get_current);\n\n    void indexAdd(torch::Tensor indices, torch::Tensor 
values);\n\n    void setBufferOrdering(std::vector<torch::Tensor> buffer_states);\n\n    bool hasSwap();\n\n    void performNextSwap();\n\n    void sync();\n\n    int64_t getNumInMemory() { return buffer_tensor_view_.size(0); }\n};\n\n#endif  // MARIUS_BUFFER_H\n"
  },
  {
    "path": "src/cpp/include/storage/checkpointer.h",
    "content": "//\n// Created by Jason Mohoney on 12/15/21.\n//\n\n#ifndef MARIUS_CHECKPOINTER_H\n#define MARIUS_CHECKPOINTER_H\n\n#include \"data/dataloader.h\"\n#include \"nn/model.h\"\n#include \"storage/storage.h\"\n\nstruct CheckpointMeta {\n    string name = \"checkpoint\";\n    int num_epochs = -1;\n    int checkpoint_id = -1;\n\n    bool link_prediction = true;\n    bool has_state = false;\n    bool has_encoded = false;\n    bool has_model = true;\n};\n\nclass Checkpointer {\n   public:\n    std::shared_ptr<Model> model_;\n    shared_ptr<GraphModelStorage> storage_;\n    std::shared_ptr<CheckpointConfig> config_;\n\n    Checkpointer(std::shared_ptr<Model> model, shared_ptr<GraphModelStorage> storage, std::shared_ptr<CheckpointConfig> config);\n\n    Checkpointer(){};\n\n    void saveMetadata(string directory, CheckpointMeta checkpoint_meta);\n\n    CheckpointMeta loadMetadata(string directory);\n\n    std::tuple<std::shared_ptr<Model>, shared_ptr<GraphModelStorage>, CheckpointMeta> load(string checkpoint_dir, std::shared_ptr<MariusConfig> marius_config,\n                                                                                           bool train);\n\n    void save(string checkpoint_dir, CheckpointMeta checkpoint_meta);\n\n    void create_checkpoint(string checkpoint_dir, CheckpointMeta checkpoint_meta, int epochs);\n};\n\n#endif  // MARIUS_CHECKPOINTER_H"
  },
  {
    "path": "src/cpp/include/storage/graph_storage.h",
    "content": "//\n// Created by Jason Mohoney on 6/18/21.\n//\n\n#ifndef MARIUS_SRC_CPP_INCLUDE_GRAPH_STORAGE_H_\n#define MARIUS_SRC_CPP_INCLUDE_GRAPH_STORAGE_H_\n\n#include \"configuration/constants.h\"\n#include \"nn/model.h\"\n#include \"storage/storage.h\"\n\nstruct GraphModelStoragePtrs {\n    shared_ptr<Storage> edges = nullptr;\n    shared_ptr<Storage> train_edges = nullptr;\n    shared_ptr<Storage> train_edges_dst_sort = nullptr;\n    shared_ptr<Storage> validation_edges = nullptr;\n    shared_ptr<Storage> test_edges = nullptr;\n    shared_ptr<Storage> nodes = nullptr;\n    shared_ptr<Storage> train_nodes = nullptr;\n    shared_ptr<Storage> valid_nodes = nullptr;\n    shared_ptr<Storage> test_nodes = nullptr;\n    shared_ptr<Storage> node_features = nullptr;\n    shared_ptr<Storage> node_labels = nullptr;\n    shared_ptr<Storage> relation_features = nullptr;\n    shared_ptr<Storage> relation_labels = nullptr;\n    shared_ptr<Storage> node_embeddings = nullptr;\n    shared_ptr<Storage> encoded_nodes = nullptr;\n    shared_ptr<Storage> node_optimizer_state = nullptr;\n    std::vector<shared_ptr<Storage>> filter_edges;\n};\n\nstruct InMemorySubgraphState {\n    EdgeList all_in_memory_edges_;\n    EdgeList all_in_memory_mapped_edges_;\n    torch::Tensor in_memory_partition_ids_;\n    torch::Tensor in_memory_edge_bucket_ids_;\n    torch::Tensor in_memory_edge_bucket_sizes_;\n    torch::Tensor in_memory_edge_bucket_starts_;\n    torch::Tensor global_to_local_index_map_;\n    shared_ptr<MariusGraph> in_memory_subgraph_;\n};\n\nclass GraphModelStorage {\n   private:\n    void _load(shared_ptr<Storage> storage);\n\n    void _unload(shared_ptr<Storage> storage, bool write);\n\n    int64_t num_nodes_;\n    int64_t num_edges_;\n\n   protected:\n    bool train_;\n\n    shared_ptr<InMemory> in_memory_embeddings_;\n    shared_ptr<InMemory> in_memory_features_;\n\n   public:\n    // In memory subgraph for partition buffer\n\n    EdgeList active_edges_;\n    Indices 
active_nodes_;\n\n    std::mutex *subgraph_lock_;\n    std::condition_variable *subgraph_cv_;\n    shared_ptr<InMemorySubgraphState> current_subgraph_state_;\n    shared_ptr<InMemorySubgraphState> next_subgraph_state_;\n    bool prefetch_;\n    bool prefetch_complete_;\n\n    GraphModelStoragePtrs storage_ptrs_;\n    bool full_graph_evaluation_;\n\n    GraphModelStorage(GraphModelStoragePtrs storage_ptrs, shared_ptr<StorageConfig> storage_config);\n\n    GraphModelStorage(GraphModelStoragePtrs storage_ptrs, bool prefetch = false);\n\n    ~GraphModelStorage();\n\n    void load();\n\n    void unload(bool write);\n\n    void initializeInMemorySubGraph(torch::Tensor buffer_state, int num_hash_maps = 1);\n\n    void updateInMemorySubGraph_(shared_ptr<InMemorySubgraphState> subgraph, std::pair<std::vector<int>, std::vector<int>> swap_ids);\n\n    void updateInMemorySubGraph();\n\n    void getNextSubGraph();\n\n    EdgeList merge_sorted_edge_buckets(EdgeList edges, torch::Tensor starts, int buffer_size, bool src);\n\n    void setEdgesStorage(shared_ptr<Storage> edge_storage);\n\n    void setNodesStorage(shared_ptr<Storage> node_storage);\n\n    EdgeList getEdges(Indices indices);\n\n    EdgeList getEdgesRange(int64_t start, int64_t size);\n\n    Indices getRandomNodeIds(int64_t size);\n\n    Indices getNodeIdsRange(int64_t start, int64_t size);\n\n    void shuffleEdges();\n\n    torch::Tensor getNodeEmbeddings(Indices indices);\n\n    torch::Tensor getNodeEmbeddingsRange(int64_t start, int64_t size);\n\n    torch::Tensor getNodeFeatures(Indices indices);\n\n    torch::Tensor getNodeFeaturesRange(int64_t start, int64_t size);\n\n    torch::Tensor getEncodedNodes(Indices indices);\n\n    torch::Tensor getEncodedNodesRange(int64_t start, int64_t size);\n\n    torch::Tensor getNodeLabels(Indices indices);\n\n    torch::Tensor getNodeLabelsRange(int64_t start, int64_t size);\n\n    void updatePutNodeEmbeddings(Indices indices, torch::Tensor values);\n\n    void 
updateAddNodeEmbeddings(Indices indices, torch::Tensor values);\n\n    void updatePutEncodedNodes(Indices indices, torch::Tensor values);\n\n    void updatePutEncodedNodesRange(int64_t start, int64_t size, torch::Tensor values);\n\n    OptimizerState getNodeEmbeddingState(Indices indices);\n\n    OptimizerState getNodeEmbeddingStateRange(int64_t start, int64_t size);\n\n    void updatePutNodeEmbeddingState(Indices indices, OptimizerState state);\n\n    void updateAddNodeEmbeddingState(Indices indices, torch::Tensor values);\n\n    bool embeddingsOffDevice();\n\n    void sortAllEdges();\n\n    int getNumPartitions() {\n        int num_partitions = 1;\n\n        if (useInMemorySubGraph()) {\n            if (instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_features)) {\n                num_partitions = std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_features)->options_->num_partitions;\n            }\n\n            // assumes both the node features and node embeddings have the same number of partitions\n            if (instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_embeddings)) {\n                num_partitions = std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_embeddings)->options_->num_partitions;\n            }\n        }\n\n        return num_partitions;\n    }\n\n    bool useInMemorySubGraph() {\n        bool embeddings_buffered = instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_embeddings);\n        bool features_buffered = instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_features);\n\n        return (embeddings_buffered || features_buffered) && (train_ || (!full_graph_evaluation_));\n    }\n\n    bool hasSwap() {\n        if (storage_ptrs_.node_embeddings != nullptr) {\n            return std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_embeddings)->hasSwap();\n        }\n\n        if (storage_ptrs_.node_features != nullptr) {\n            
return std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_features)->hasSwap();\n        }\n\n        return false;\n    }\n\n    std::pair<std::vector<int>, std::vector<int>> getNextSwapIds() {\n        std::vector<int> evict_ids;\n        std::vector<int> admit_ids;\n\n        if (storage_ptrs_.node_embeddings != nullptr && instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_embeddings)) {\n            evict_ids = std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_embeddings)->getNextEvict();\n            admit_ids = std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_embeddings)->getNextAdmit();\n        } else if (storage_ptrs_.node_features != nullptr && instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_features)) {\n            evict_ids = std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_features)->getNextEvict();\n            admit_ids = std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_features)->getNextAdmit();\n        }\n\n        return std::make_pair(evict_ids, admit_ids);\n    }\n\n    void performSwap() {\n        if (storage_ptrs_.node_embeddings != nullptr && instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_embeddings)) {\n            std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_embeddings)->performNextSwap();\n            if (storage_ptrs_.node_optimizer_state != nullptr && train_) {\n                std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_optimizer_state)->performNextSwap();\n            }\n        }\n\n        if (storage_ptrs_.node_features != nullptr && instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_features)) {\n            std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_features)->performNextSwap();\n        }\n    }\n\n    void setBufferOrdering(vector<torch::Tensor> buffer_states) {\n        if 
(storage_ptrs_.node_embeddings != nullptr && instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_embeddings)) {\n            std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_embeddings)->setBufferOrdering(buffer_states);\n            if (storage_ptrs_.node_optimizer_state != nullptr && !train_) {\n                std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_optimizer_state)->setBufferOrdering(buffer_states);\n            }\n        }\n        if (storage_ptrs_.node_features != nullptr && instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_features)) {\n            std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_features)->setBufferOrdering(buffer_states);\n        }\n    }\n\n    void setActiveEdges(torch::Tensor active_edges) { active_edges_ = active_edges; }\n\n    void setActiveNodes(torch::Tensor node_ids) { active_nodes_ = node_ids; }\n\n    int64_t getNumActiveEdges() {\n        if (active_edges_.defined()) {\n            return active_edges_.size(0);\n        } else {\n            return storage_ptrs_.edges->getDim0();\n        }\n    }\n\n    int64_t getNumActiveNodes() {\n        if (active_nodes_.defined()) {\n            return active_nodes_.size(0);\n        } else {\n            return storage_ptrs_.nodes->getDim0();\n        }\n    }\n\n    int64_t getNumEdges() { return storage_ptrs_.edges->getDim0(); }\n\n    int64_t getNumNodes() {\n        if (storage_ptrs_.node_embeddings != nullptr) {\n            return storage_ptrs_.node_embeddings->getDim0();\n        }\n\n        if (storage_ptrs_.node_features != nullptr) {\n            return storage_ptrs_.node_features->getDim0();\n        }\n\n        return num_nodes_;\n    }\n\n    int64_t getNumNodesInMemory() {\n        if (storage_ptrs_.node_embeddings != nullptr) {\n            if (useInMemorySubGraph()) {\n                return 
std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_embeddings)->getNumInMemory();\n            }\n        }\n\n        if (storage_ptrs_.node_features != nullptr) {\n            if (useInMemorySubGraph()) {\n                return std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_features)->getNumInMemory();\n            }\n        }\n\n        return getNumNodes();\n    }\n\n    void setTrainSet() {\n        train_ = true;\n\n        if (storage_ptrs_.train_edges != nullptr) {\n            setEdgesStorage(storage_ptrs_.train_edges);\n        }\n\n        if (storage_ptrs_.train_nodes != nullptr) {\n            setNodesStorage(storage_ptrs_.train_nodes);\n        }\n    }\n\n    void setValidationSet() {\n        train_ = false;\n\n        if (storage_ptrs_.validation_edges != nullptr) {\n            setEdgesStorage(storage_ptrs_.validation_edges);\n        }\n\n        if (storage_ptrs_.valid_nodes != nullptr) {\n            setNodesStorage(storage_ptrs_.valid_nodes);\n        }\n    }\n\n    void setTestSet() {\n        train_ = false;\n\n        if (storage_ptrs_.test_edges != nullptr) {\n            setEdgesStorage(storage_ptrs_.test_edges);\n        }\n\n        if (storage_ptrs_.test_nodes != nullptr) {\n            setNodesStorage(storage_ptrs_.test_nodes);\n        }\n    }\n\n    void setFilterEdges(std::vector<shared_ptr<Storage>> filter_edges) { storage_ptrs_.filter_edges = filter_edges; }\n\n    void addFilterEdges(shared_ptr<Storage> filter_edges) { storage_ptrs_.filter_edges.emplace_back(filter_edges); }\n};\n\n#endif  // MARIUS_SRC_CPP_INCLUDE_GRAPH_STORAGE_H_\n"
  },
  {
    "path": "src/cpp/include/storage/io.h",
    "content": "//\n// Created by jasonmohoney on 10/4/19.\n//\n\n#ifndef MARIUS_IO_H\n#define MARIUS_IO_H\n\n#include <sys/ioctl.h>\n#include <sys/mman.h>\n#include <sys/stat.h>\n\n#include <fstream>\n#include <iostream>\n#include <memory>\n#include <string>\n\n#include \"common/datatypes.h\"\n#include \"storage/graph_storage.h\"\n#include \"storage/storage.h\"\n\nstd::tuple<shared_ptr<Storage>, shared_ptr<Storage>, shared_ptr<Storage>, shared_ptr<Storage>> initializeEdges(shared_ptr<StorageConfig> storage_config,\n                                                                                                               LearningTask learning_task);\n\nstd::tuple<shared_ptr<Storage>, shared_ptr<Storage>> initializeNodeEmbeddings(std::shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config,\n                                                                              bool reinitialize, bool train, std::shared_ptr<InitConfig> init_config);\n\nstd::tuple<shared_ptr<Storage>, shared_ptr<Storage>, shared_ptr<Storage>> initializeNodeIds(shared_ptr<StorageConfig> storage_config);\n\nshared_ptr<Storage> initializeRelationFeatures(shared_ptr<StorageConfig> storage_config);\n\nshared_ptr<Storage> initializeNodeFeatures(std::shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config);\n\nshared_ptr<Storage> initializeNodeLabels(std::shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config);\n\nshared_ptr<GraphModelStorage> initializeStorageLinkPrediction(std::shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config, bool reinitialize,\n                                                              bool train, std::shared_ptr<InitConfig> init_config);\n\nshared_ptr<GraphModelStorage> initializeStorageNodeClassification(std::shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config, bool reinitialize,\n                                                                  bool train, std::shared_ptr<InitConfig> 
init_config);\n\nshared_ptr<GraphModelStorage> initializeStorage(std::shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config, bool reinitialize, bool train,\n                                                std::shared_ptr<InitConfig> init_config = nullptr);\n\n#endif  // MARIUS_IO_H\n"
  },
  {
    "path": "src/cpp/include/storage/storage.h",
    "content": "//\n// Created by Jason Mohoney on 4/21/20.\n//\n\n#ifndef MARIUS_STORAGE_H\n#define MARIUS_STORAGE_H\n\n#include <fstream>\n#include <string>\n#include <tuple>\n#include <vector>\n\n#include \"common/datatypes.h\"\n#include \"data/batch.h\"\n#include \"storage/buffer.h\"\n\nusing std::list;\nusing std::shared_ptr;\nusing std::string;\nusing std::unordered_map;\nusing std::vector;\n\n#define MAX_SHUFFLE_SIZE 4E8\n#define MAX_SORT_SIZE 4E8\n\nvoid renameFile(string old_filename, string new_filename);\n\nvoid copyFile(string src_filename, string dst_filename);\n\nbool fileExists(string filename);\n\nvoid createDir(string path, bool exist_ok);\n\n/** Abstract storage class */\nclass Storage {\n   public:\n    int64_t dim0_size_;\n    int64_t dim1_size_;\n    torch::Dtype dtype_;\n    bool initialized_;\n    vector<int64_t> edge_bucket_sizes_;\n    torch::Tensor data_;\n    torch::Device device_;\n    string filename_;\n\n    Storage();\n\n    virtual ~Storage(){};\n\n    virtual torch::Tensor indexRead(Indices indices) = 0;\n\n    virtual void indexAdd(Indices indices, torch::Tensor values) = 0;\n\n    virtual torch::Tensor range(int64_t offset, int64_t n) = 0;\n\n    virtual void indexPut(Indices indices, torch::Tensor values) = 0;\n\n    virtual void rangePut(int64_t offset, int64_t n, torch::Tensor values) = 0;\n\n    virtual void load() = 0;\n\n    virtual void write() = 0;\n\n    virtual void unload(bool write = false) = 0;\n\n    virtual void shuffle() = 0;\n\n    virtual void sort(bool src) = 0;\n\n    int64_t getDim0() { return dim0_size_; }\n\n    bool isInitialized() { return initialized_; }\n\n    void setInitialized(bool init) { initialized_ = init; }\n\n    void readPartitionSizes(string filename) {\n        std::ifstream partition_file(filename);\n        edge_bucket_sizes_.clear();\n        int64_t size;\n        while (partition_file >> size) {\n            edge_bucket_sizes_.push_back(size);\n        }\n    }\n\n    vector<int64_t> 
getEdgeBucketSizes() { return edge_bucket_sizes_; }\n};\n\n/** Storage which uses the partition buffer, used for node embeddings and optimizer state */\nclass PartitionBufferStorage : public Storage {\n   public:\n    bool loaded_;\n\n    PartitionBuffer *buffer_;\n\n    shared_ptr<PartitionBufferOptions> options_;\n\n    PartitionBufferStorage(string filename, int64_t dim0_size, int64_t dim1_size, shared_ptr<PartitionBufferOptions> options);\n\n    PartitionBufferStorage(string filename, torch::Tensor data, shared_ptr<PartitionBufferOptions> options);\n\n    PartitionBufferStorage(string filename, shared_ptr<PartitionBufferOptions> options);\n\n    ~PartitionBufferStorage();\n\n    void rangePut(int64_t offset, torch::Tensor values);\n\n    void append(torch::Tensor values);\n\n    void load() override;\n\n    void unload(bool perform_write) override;\n\n    void write() override;\n\n    torch::Tensor indexRead(Indices indices) override;\n\n    void indexAdd(Indices indices, torch::Tensor values) override;\n\n    torch::Tensor range(int64_t offset, int64_t n) override;\n\n    void indexPut(Indices indices, torch::Tensor values) override;\n\n    void rangePut(int64_t offset, int64_t n, torch::Tensor values) override;\n\n    void shuffle() override;\n\n    void sort(bool src) override;\n\n    Indices getRandomIds(int64_t size) { return buffer_->getRandomIds(size); }\n\n    bool hasSwap() { return buffer_->hasSwap(); }\n\n    void performNextSwap() { buffer_->performNextSwap(); }\n\n    torch::Tensor getGlobalToLocalMap(bool get_current) { return buffer_->getGlobalToLocalMap(get_current); }\n\n    void sync() { buffer_->sync(); }\n\n    void setBufferOrdering(vector<torch::Tensor> buffer_states) { buffer_->setBufferOrdering(buffer_states); }\n\n    std::vector<int> getNextAdmit() { return buffer_->getNextAdmit(); }\n\n    std::vector<int> getNextEvict() { return buffer_->getNextEvict(); }\n\n    int64_t getNumInMemory() { return buffer_->getNumInMemory(); 
}\n};\n\n/** Flat File storage used for data that only requires sequential access. Can be used to store and access large amounts of edges. */\nclass FlatFile : public Storage {\n   private:\n    int fd_;\n\n    bool loaded_;\n\n   public:\n    FlatFile(string filename, int64_t dim0_size, int64_t dim1_size, torch::Dtype dtype, bool alloc = false);\n\n    FlatFile(string filename, torch::Tensor data);\n\n    FlatFile(string filename, torch::Dtype dtype);\n\n    ~FlatFile(){};\n\n    void rangePut(int64_t offset, torch::Tensor values);\n\n    void append(torch::Tensor values);\n\n    void load() override;\n\n    void write() override;\n\n    void unload(bool perform_write) override;\n\n    torch::Tensor indexRead(Indices indices) override;\n\n    void indexAdd(Indices indices, torch::Tensor values) override;\n\n    torch::Tensor range(int64_t offset, int64_t n) override;\n\n    void indexPut(Indices indices, torch::Tensor values) override;\n\n    void rangePut(int64_t offset, int64_t n, torch::Tensor values) override;\n\n    void shuffle() override;\n\n    void sort(bool src) override;\n\n    void move(string new_filename);\n\n    void copy(string new_filename, bool rename);\n\n    void mem_load();\n\n    void mem_unload(bool write);\n};\n\n/** In memory storage for data which fits in either GPU or CPU memory. 
*/\nclass InMemory : public Storage {\n   private:\n    int fd_;\n\n    bool loaded_;\n\n   public:\n    InMemory(string filename, int64_t dim0_size, int64_t dim1_size, torch::Dtype dtype, torch::Device device);\n\n    InMemory(string filename, torch::Tensor data, torch::Device device);\n\n    InMemory(string filename, torch::Dtype dtype);\n\n    InMemory(torch::Tensor data);\n\n    ~InMemory(){};\n\n    void load() override;\n\n    void write() override;\n\n    void unload(bool perform_write) override;\n\n    torch::Tensor indexRead(Indices indices) override;\n\n    void indexAdd(Indices indices, torch::Tensor values) override;\n\n    torch::Tensor range(int64_t offset, int64_t n) override;\n\n    void indexPut(Indices indices, torch::Tensor values) override;\n\n    void rangePut(int64_t offset, int64_t n, torch::Tensor values) override;\n\n    void shuffle() override;\n\n    void sort(bool src) override;\n};\n\n#endif  // MARIUS_STORAGE_H\n"
  },
  {
    "path": "src/cpp/python_bindings/configuration/config_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"configuration/config.h\"\n\nvoid init_config(py::module &m) {\n    py::class_<NeighborSamplingConfig, std::shared_ptr<NeighborSamplingConfig>>(m, \"NeighborSamplingConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"type\", &NeighborSamplingConfig::type)\n        .def_readwrite(\"options\", &NeighborSamplingConfig::options)\n        .def_readwrite(\"use_hashmap_sets\", &NeighborSamplingConfig::use_hashmap_sets);\n\n    py::class_<OptimizerConfig, std::shared_ptr<OptimizerConfig>>(m, \"OptimizerConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"type\", &OptimizerConfig::type)\n        .def_readwrite(\"options\", &OptimizerConfig::options);\n\n    py::class_<InitConfig, std::shared_ptr<InitConfig>>(m, \"InitConfig\")\n        .def(py::init<InitDistribution, std::shared_ptr<InitOptions>>(), py::arg(\"distribution\"), py::arg(\"options\"))\n        .def_readwrite(\"type\", &InitConfig::type)\n        .def_readwrite(\"options\", &InitConfig::options);\n\n    py::class_<LossConfig, std::shared_ptr<LossConfig>>(m, \"LossConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"type\", &LossConfig::type)\n        .def_readwrite(\"options\", &LossConfig::options);\n\n    py::class_<LayerConfig, std::shared_ptr<LayerConfig>>(m, \"LayerConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"type\", &LayerConfig::type)\n        .def_readwrite(\"options\", &LayerConfig::options)\n        .def_readwrite(\"input_dim\", &LayerConfig::input_dim)\n        .def_readwrite(\"output_dim\", &LayerConfig::output_dim)\n        .def_readwrite(\"init\", &LayerConfig::init)\n        .def_readwrite(\"optimizer\", &LayerConfig::optimizer)\n        .def_readwrite(\"bias\", &LayerConfig::bias)\n        .def_readwrite(\"bias_init\", &LayerConfig::bias_init)\n        .def_readwrite(\"activation\", &LayerConfig::activation);\n\n    py::class_<EncoderConfig, std::shared_ptr<EncoderConfig>>(m, 
\"EncoderConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"layers\", &EncoderConfig::layers)\n        .def_readwrite(\"train_neighbor_sampling\", &EncoderConfig::train_neighbor_sampling)\n        .def_readwrite(\"eval_neighbor_sampling\", &EncoderConfig::eval_neighbor_sampling)\n        .def_readwrite(\"use_incoming_nbrs\", &EncoderConfig::use_incoming_nbrs)\n        .def_readwrite(\"use_outgoing_nbrs\", &EncoderConfig::use_outgoing_nbrs);\n\n    py::class_<DecoderConfig, std::shared_ptr<DecoderConfig>>(m, \"DecoderConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"type\", &DecoderConfig::type)\n        .def_readwrite(\"options\", &DecoderConfig::options)\n        .def_readwrite(\"optimizer\", &DecoderConfig::optimizer);\n\n    py::class_<StorageBackendConfig, std::shared_ptr<StorageBackendConfig>>(m, \"StorageBackendConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"type\", &StorageBackendConfig::type)\n        .def_readwrite(\"options\", &StorageBackendConfig::options);\n\n    py::class_<DatasetConfig, std::shared_ptr<DatasetConfig>>(m, \"DatasetConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"dataset_dir\", &DatasetConfig::dataset_dir)\n        .def_readwrite(\"num_edges\", &DatasetConfig::num_edges)\n        .def_readwrite(\"num_nodes\", &DatasetConfig::num_nodes)\n        .def_readwrite(\"num_relations\", &DatasetConfig::num_relations)\n        .def_readwrite(\"num_train\", &DatasetConfig::num_train)\n        .def_readwrite(\"num_valid\", &DatasetConfig::num_valid)\n        .def_readwrite(\"num_test\", &DatasetConfig::num_test)\n        .def_readwrite(\"node_feature_dim\", &DatasetConfig::node_feature_dim)\n        .def_readwrite(\"rel_feature_dim\", &DatasetConfig::rel_feature_dim)\n        .def_readwrite(\"num_classes\", &DatasetConfig::num_classes);\n\n    py::class_<NegativeSamplingConfig, std::shared_ptr<NegativeSamplingConfig>>(m, \"NegativeSamplingConfig\")\n        .def(py::init<>())\n        
.def_readwrite(\"num_chunks\", &NegativeSamplingConfig::num_chunks)\n        .def_readwrite(\"negatives_per_positive\", &NegativeSamplingConfig::negatives_per_positive)\n        .def_readwrite(\"degree_fraction\", &NegativeSamplingConfig::degree_fraction)\n        .def_readwrite(\"filtered\", &NegativeSamplingConfig::filtered);\n\n    py::class_<PipelineConfig, std::shared_ptr<PipelineConfig>>(m, \"PipelineConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"sync\", &PipelineConfig::sync)\n        .def_readwrite(\"staleness_bound\", &PipelineConfig::staleness_bound)\n        .def_readwrite(\"batch_host_queue_size\", &PipelineConfig::batch_host_queue_size)\n        .def_readwrite(\"batch_device_queue_size\", &PipelineConfig::batch_device_queue_size)\n        .def_readwrite(\"gradients_device_queue_size\", &PipelineConfig::gradients_device_queue_size)\n        .def_readwrite(\"gradients_host_queue_size\", &PipelineConfig::gradients_host_queue_size)\n        .def_readwrite(\"batch_loader_threads\", &PipelineConfig::batch_loader_threads)\n        .def_readwrite(\"batch_transfer_threads\", &PipelineConfig::batch_transfer_threads)\n        .def_readwrite(\"compute_threads\", &PipelineConfig::compute_threads)\n        .def_readwrite(\"gradient_transfer_threads\", &PipelineConfig::gradient_transfer_threads)\n        .def_readwrite(\"gradient_update_threads\", &PipelineConfig::gradient_update_threads);\n\n    py::class_<CheckpointConfig, std::shared_ptr<CheckpointConfig>>(m, \"CheckpointConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"save_best\", &CheckpointConfig::save_best)\n        .def_readwrite(\"interval\", &CheckpointConfig::interval)\n        .def_readwrite(\"save_state\", &CheckpointConfig::save_state);\n\n    py::class_<ModelConfig, std::shared_ptr<ModelConfig>>(m, \"ModelConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"random_seed\", &ModelConfig::random_seed)\n        .def_readwrite(\"learning_task\", 
&ModelConfig::learning_task)\n        .def_readwrite(\"encoder\", &ModelConfig::encoder)\n        .def_readwrite(\"decoder\", &ModelConfig::decoder)\n        .def_readwrite(\"loss\", &ModelConfig::loss)\n        .def_readwrite(\"dense_optimizer\", &ModelConfig::dense_optimizer)\n        .def_readwrite(\"sparse_optimizer\", &ModelConfig::sparse_optimizer);\n\n    py::class_<StorageConfig, std::shared_ptr<StorageConfig>>(m, \"StorageConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"device_type\", &StorageConfig::device_type)\n        .def_readwrite(\"device_ids\", &StorageConfig::device_ids)\n        .def_readwrite(\"dataset\", &StorageConfig::dataset)\n        .def_readwrite(\"edges\", &StorageConfig::edges)\n        .def_readwrite(\"nodes\", &StorageConfig::nodes)\n        .def_readwrite(\"embeddings\", &StorageConfig::embeddings)\n        .def_readwrite(\"features\", &StorageConfig::features)\n        .def_readwrite(\"prefetch\", &StorageConfig::prefetch)\n        .def_readwrite(\"shuffle_input\", &StorageConfig::shuffle_input)\n        .def_readwrite(\"full_graph_evaluation\", &StorageConfig::full_graph_evaluation)\n        .def_readwrite(\"model_dir\", &StorageConfig::model_dir)\n        .def_readwrite(\"export_encoded_nodes\", &StorageConfig::export_encoded_nodes);\n\n    py::class_<TrainingConfig, std::shared_ptr<TrainingConfig>>(m, \"TrainingConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"batch_size\", &TrainingConfig::batch_size)\n        .def_readwrite(\"negative_sampling\", &TrainingConfig::negative_sampling)\n        .def_readwrite(\"num_epochs\", &TrainingConfig::num_epochs)\n        .def_readwrite(\"pipeline\", &TrainingConfig::pipeline)\n        .def_readwrite(\"epochs_per_shuffle\", &TrainingConfig::epochs_per_shuffle)\n        .def_readwrite(\"logs_per_epoch\", &TrainingConfig::logs_per_epoch)\n        .def_readwrite(\"save_model\", &TrainingConfig::save_model)\n        .def_readwrite(\"checkpoint\", 
&TrainingConfig::checkpoint)\n        .def_readwrite(\"resume_training\", &TrainingConfig::resume_training)\n        .def_readwrite(\"resume_from_checkpoint\", &TrainingConfig::resume_from_checkpoint);\n\n    py::class_<EvaluationConfig, std::shared_ptr<EvaluationConfig>>(m, \"EvaluationConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"batch_size\", &EvaluationConfig::batch_size)\n        .def_readwrite(\"negative_sampling\", &EvaluationConfig::negative_sampling)\n        .def_readwrite(\"pipeline\", &EvaluationConfig::pipeline)\n        .def_readwrite(\"epochs_per_eval\", &EvaluationConfig::epochs_per_eval)\n        .def_readwrite(\"full_graph_evaluation\", &EvaluationConfig::full_graph_evaluation);\n\n    py::class_<MariusConfig, std::shared_ptr<MariusConfig>>(m, \"MariusConfig\")\n        .def(py::init<>())\n        .def_readwrite(\"model\", &MariusConfig::model)\n        .def_readwrite(\"storage\", &MariusConfig::storage)\n        .def_readwrite(\"training\", &MariusConfig::training)\n        .def_readwrite(\"evaluation\", &MariusConfig::evaluation);\n\n    m.def(\"loadConfig\", &loadConfig, py::arg(\"config_path\"), py::arg(\"save\") = false);\n}"
  },
  {
    "path": "src/cpp/python_bindings/configuration/options_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"configuration/options.h\"\n\nvoid init_options(py::module &m) {\n    py::enum_<LearningTask>(m, \"LearningTask\")\n        .value(\"NODE_CLASSIFICATION\", LearningTask::NODE_CLASSIFICATION)\n        .value(\"LINK_PREDICTION\", LearningTask::LINK_PREDICTION)\n        .value(\"ENCODE\", LearningTask::ENCODE);\n\n    m.def(\"getLearningTask\", &getLearningTask, py::arg(\"string_val\"));\n\n    py::enum_<EdgeDecoderMethod>(m, \"EdgeDecoderMethod\").value(\"ONLY_POS\", EdgeDecoderMethod::ONLY_POS).value(\"CORRUPT_NODE\", EdgeDecoderMethod::CORRUPT_NODE);\n\n    m.def(\"getEdgeDecoderMethod\", &getEdgeDecoderMethod, py::arg(\"string_val\"));\n\n    py::enum_<InitDistribution>(m, \"InitDistribution\")\n        .value(\"ZEROS\", InitDistribution::ZEROS)\n        .value(\"ONES\", InitDistribution::ONES)\n        .value(\"CONSTANT\", InitDistribution::CONSTANT)\n        .value(\"UNIFORM\", InitDistribution::UNIFORM)\n        .value(\"NORMAL\", InitDistribution::NORMAL)\n        .value(\"GLOROT_UNIFORM\", InitDistribution::GLOROT_UNIFORM)\n        .value(\"GLOROT_NORMAL\", InitDistribution::GLOROT_NORMAL);\n\n    m.def(\"getInitDistribution\", &getInitDistribution, py::arg(\"string_val\"));\n\n    py::enum_<LossFunctionType>(m, \"LossFunctionType\")\n        .value(\"SOFTMAX_CE\", LossFunctionType::SOFTMAX_CE)\n        .value(\"RANKING\", LossFunctionType::RANKING)\n        .value(\"BCE_AFTER_SIGMOID\", LossFunctionType::BCE_AFTER_SIGMOID)\n        .value(\"BCE_WITH_LOGITS\", LossFunctionType::BCE_WITH_LOGITS)\n        .value(\"MSE\", LossFunctionType::MSE)\n        .value(\"SOFTPLUS\", LossFunctionType::SOFTPLUS);\n\n    m.def(\"getLossFunctionType\", &getLossFunctionType, py::arg(\"string_val\"));\n\n    py::enum_<LossReduction>(m, \"LossReduction\").value(\"MEAN\", LossReduction::MEAN).value(\"SUM\", LossReduction::SUM);\n\n    m.def(\"getLossReduction\", &getLossReduction, py::arg(\"string_val\"));\n\n    
py::enum_<ActivationFunction>(m, \"ActivationFunction\")\n        .value(\"RELU\", ActivationFunction::RELU)\n        .value(\"SIGMOID\", ActivationFunction::SIGMOID)\n        .value(\"NONE\", ActivationFunction::NONE);\n\n    m.def(\"getActivationFunction\", &getActivationFunction, py::arg(\"string_val\"));\n\n    py::enum_<OptimizerType>(m, \"OptimizerType\")\n        .value(\"SGD\", OptimizerType::SGD)\n        .value(\"ADAM\", OptimizerType::ADAM)\n        .value(\"ADAGRAD\", OptimizerType::ADAGRAD)\n        .value(\"DEFAULT\", OptimizerType::DEFAULT);\n\n    m.def(\"getOptimizerType\", &getOptimizerType, py::arg(\"string_val\"));\n\n    py::enum_<ReductionLayerType>(m, \"ReductionLayerType\")\n        .value(\"NONE\", ReductionLayerType::NONE)\n        .value(\"CONCAT\", ReductionLayerType::CONCAT)\n        .value(\"LINEAR\", ReductionLayerType::LINEAR);\n\n    m.def(\"getReductionLayerType\", &getReductionLayerType, py::arg(\"string_val\"));\n\n    py::enum_<LayerType>(m, \"LayerType\")\n        .value(\"NONE\", LayerType::NONE)\n        .value(\"EMBEDDING\", LayerType::EMBEDDING)\n        .value(\"FEATURE\", LayerType::FEATURE)\n        .value(\"GNN\", LayerType::GNN)\n        .value(\"DENSE\", LayerType::DENSE)\n        .value(\"REDUCTION\", LayerType::REDUCTION);\n\n    m.def(\"getLayerType\", &getLayerType, py::arg(\"string_val\"));\n\n    py::enum_<DenseLayerType>(m, \"DenseLayerType\")\n        .value(\"NONE\", DenseLayerType::NONE)\n        .value(\"LINEAR\", DenseLayerType::LINEAR)\n        .value(\"CONV\", DenseLayerType::CONV);\n\n    m.def(\"getDenseLayerType\", &getDenseLayerType, py::arg(\"string_val\"));\n\n    py::enum_<GNNLayerType>(m, \"GNNLayerType\")\n        .value(\"NONE\", GNNLayerType::NONE)\n        .value(\"GRAPH_SAGE\", GNNLayerType::GRAPH_SAGE)\n        .value(\"GCN\", GNNLayerType::GCN)\n        .value(\"GAT\", GNNLayerType::GAT)\n        .value(\"RGCN\", GNNLayerType::RGCN);\n\n    m.def(\"getGNNLayerType\", &getGNNLayerType, 
py::arg(\"string_val\"));\n\n    py::enum_<GraphSageAggregator>(m, \"GraphSageAggregator\").value(\"GCN\", GraphSageAggregator::GCN).value(\"MEAN\", GraphSageAggregator::MEAN);\n\n    m.def(\"getGraphSageAggregator\", &getGraphSageAggregator, py::arg(\"string_val\"));\n\n    py::enum_<DecoderType>(m, \"DecoderType\")\n        .value(\"NODE\", DecoderType::NODE)\n        .value(\"DISTMULT\", DecoderType::DISTMULT)\n        .value(\"TRANSE\", DecoderType::TRANSE)\n        .value(\"COMPLEX\", DecoderType::COMPLEX);\n\n    m.def(\"getDecoderType\", &getDecoderType, py::arg(\"string_val\"));\n\n    py::enum_<StorageBackend>(m, \"StorageBackend\")\n        .value(\"PARTITION_BUFFER\", StorageBackend::PARTITION_BUFFER)\n        .value(\"FLAT_FILE\", StorageBackend::FLAT_FILE)\n        .value(\"HOST_MEMORY\", StorageBackend::HOST_MEMORY)\n        .value(\"DEVICE_MEMORY\", StorageBackend::DEVICE_MEMORY);\n\n    m.def(\"getStorageBackend\", &getStorageBackend, py::arg(\"string_val\"));\n\n    py::enum_<EdgeBucketOrdering>(m, \"EdgeBucketOrdering\")\n        .value(\"OLD_BETA\", EdgeBucketOrdering::OLD_BETA)\n        .value(\"NEW_BETA\", EdgeBucketOrdering::NEW_BETA)\n        .value(\"ALL_BETA\", EdgeBucketOrdering::ALL_BETA)\n        .value(\"COMET\", EdgeBucketOrdering::COMET)\n        .value(\"CUSTOM\", EdgeBucketOrdering::CUSTOM);\n\n    m.def(\"getEdgeBucketOrderingEnum\", &getEdgeBucketOrderingEnum, py::arg(\"string_val\"));\n\n    py::enum_<NodePartitionOrdering>(m, \"NodePartitionOrdering\")\n        .value(\"DISPERSED\", NodePartitionOrdering::DISPERSED)\n        .value(\"SEQUENTIAL\", NodePartitionOrdering::SEQUENTIAL)\n        .value(\"CUSTOM\", NodePartitionOrdering::CUSTOM);\n\n    m.def(\"getNodePartitionOrderingEnum\", &getNodePartitionOrderingEnum, py::arg(\"string_val\"));\n\n    py::enum_<NeighborSamplingLayer>(m, \"NeighborSamplingLayer\")\n        .value(\"ALL\", NeighborSamplingLayer::ALL)\n        .value(\"UNIFORM\", NeighborSamplingLayer::UNIFORM)\n     
   .value(\"DROPOUT\", NeighborSamplingLayer::DROPOUT);\n\n    m.def(\"getNeighborSamplingLayer\", &getNeighborSamplingLayer, py::arg(\"string_val\"));\n\n    m.def(\"getDtype\", &getDtype, py::arg(\"string_val\"));\n\n    py::class_<InitOptions, std::shared_ptr<InitOptions>>(m, \"InitOptions\").def(py::init<>());\n\n    py::class_<ConstantInitOptions, InitOptions, std::shared_ptr<ConstantInitOptions>>(m, \"ConstantInitOptions\")\n        .def(py::init<float>(), py::arg(\"constant\"))\n        .def_readwrite(\"constant\", &ConstantInitOptions::constant);\n\n    py::class_<UniformInitOptions, InitOptions, std::shared_ptr<UniformInitOptions>>(m, \"UniformInitOptions\")\n        .def(py::init<float>(), py::arg(\"scale_factor\"))\n        .def_readwrite(\"scale_factor\", &UniformInitOptions::scale_factor);\n\n    py::class_<NormalInitOptions, InitOptions, std::shared_ptr<NormalInitOptions>>(m, \"NormalInitOptions\")\n        .def(py::init<float, float>(), py::arg(\"mean\"), py::arg(\"std\"))\n        .def_readwrite(\"mean\", &NormalInitOptions::mean)\n        .def_readwrite(\"std\", &NormalInitOptions::std);\n\n    py::class_<LossOptions, std::shared_ptr<LossOptions>>(m, \"LossOptions\").def(py::init<>()).def_readwrite(\"loss_reduction\", &LossOptions::loss_reduction);\n\n    py::class_<RankingLossOptions, LossOptions, std::shared_ptr<RankingLossOptions>>(m, \"RankingLossOptions\")\n        .def(py::init<>())\n        .def_readwrite(\"loss_reduction\", &RankingLossOptions::loss_reduction)\n        .def_readwrite(\"margin\", &RankingLossOptions::margin);\n\n    py::class_<OptimizerOptions, std::shared_ptr<OptimizerOptions>>(m, \"OptimizerOptions\")\n        .def(py::init<>())\n        .def_readwrite(\"learning_rate\", &OptimizerOptions::learning_rate);\n\n    py::class_<AdagradOptions, OptimizerOptions, std::shared_ptr<AdagradOptions>>(m, \"AdagradOptions\")\n        .def(py::init<>())\n        .def_readwrite(\"eps\", &AdagradOptions::eps)\n        
.def_readwrite(\"init_value\", &AdagradOptions::init_value)\n        .def_readwrite(\"lr_decay\", &AdagradOptions::lr_decay)\n        .def_readwrite(\"weight_decay\", &AdagradOptions::weight_decay);\n\n    py::class_<AdamOptions, OptimizerOptions, std::shared_ptr<AdamOptions>>(m, \"AdamOptions\")\n        .def(py::init<>())\n        .def_readwrite(\"amsgrad\", &AdamOptions::amsgrad)\n        .def_readwrite(\"beta_1\", &AdamOptions::beta_1)\n        .def_readwrite(\"beta_2\", &AdamOptions::beta_2)\n        .def_readwrite(\"eps\", &AdamOptions::eps)\n        .def_readwrite(\"weight_decay\", &AdamOptions::weight_decay);\n\n    py::class_<LayerOptions, std::shared_ptr<LayerOptions>>(m, \"LayerOptions\").def(py::init<>());\n\n    py::class_<EmbeddingLayerOptions, LayerOptions, std::shared_ptr<EmbeddingLayerOptions>>(m, \"EmbeddingLayerOptions\").def(py::init<>());\n\n    py::class_<FeatureLayerOptions, LayerOptions, std::shared_ptr<FeatureLayerOptions>>(m, \"FeatureLayerOptions\").def(py::init<>());\n\n    py::class_<DenseLayerOptions, LayerOptions, std::shared_ptr<DenseLayerOptions>>(m, \"DenseLayerOptions\")\n        .def(py::init<>())\n        .def_readwrite(\"type\", &DenseLayerOptions::type);\n\n    py::class_<ReductionLayerOptions, LayerOptions, std::shared_ptr<ReductionLayerOptions>>(m, \"ReductionLayerOptions\")\n        .def(py::init<>())\n        .def_readwrite(\"type\", &ReductionLayerOptions::type);\n\n    py::class_<GNNLayerOptions, LayerOptions, std::shared_ptr<GNNLayerOptions>>(m, \"GNNLayerOptions\")\n        .def(py::init<>())\n        .def_readwrite(\"type\", &GNNLayerOptions::type);\n\n    py::class_<GraphSageLayerOptions, GNNLayerOptions, std::shared_ptr<GraphSageLayerOptions>>(m, \"GraphSageLayerOptions\")\n        .def(py::init<>())\n        .def_readwrite(\"aggregator\", &GraphSageLayerOptions::aggregator);\n\n    py::class_<GATLayerOptions, GNNLayerOptions, std::shared_ptr<GATLayerOptions>>(m, \"GATLayerOptions\")\n        .def(py::init<>())\n    
    .def_readwrite(\"num_heads\", &GATLayerOptions::num_heads)\n        .def_readwrite(\"average_heads\", &GATLayerOptions::average_heads)\n        .def_readwrite(\"negative_slope\", &GATLayerOptions::negative_slope)\n        .def_readwrite(\"input_dropout\", &GATLayerOptions::input_dropout)\n        .def_readwrite(\"attention_dropout\", &GATLayerOptions::attention_dropout);\n\n    py::class_<DecoderOptions, std::shared_ptr<DecoderOptions>>(m, \"DecoderOptions\").def(py::init<>());\n\n    py::class_<EdgeDecoderOptions, DecoderOptions, std::shared_ptr<EdgeDecoderOptions>>(m, \"EdgeDecoderOptions\")\n        .def(py::init<>())\n        .def_readwrite(\"inverse_edges\", &EdgeDecoderOptions::inverse_edges)\n        .def_readwrite(\"mode\", &EdgeDecoderOptions::edge_decoder_method)\n        .def_readwrite(\"input_dim\", &EdgeDecoderOptions::input_dim);\n\n    py::class_<StorageOptions, std::shared_ptr<StorageOptions>>(m, \"StorageOptions\").def(py::init<>()).def_readwrite(\"dtype\", &StorageOptions::dtype);\n\n    py::class_<PartitionBufferOptions, StorageOptions, std::shared_ptr<PartitionBufferOptions>>(m, \"PartitionBufferOptions\")\n        .def(py::init<>())\n        .def_readwrite(\"num_partitions\", &PartitionBufferOptions::num_partitions)\n        .def_readwrite(\"buffer_capacity\", &PartitionBufferOptions::buffer_capacity)\n        .def_readwrite(\"prefetching\", &PartitionBufferOptions::prefetching)\n        .def_readwrite(\"fine_to_coarse_ratio\", &PartitionBufferOptions::fine_to_coarse_ratio)\n        .def_readwrite(\"edge_bucket_ordering\", &PartitionBufferOptions::edge_bucket_ordering)\n        .def_readwrite(\"node_partition_ordering\", &PartitionBufferOptions::node_partition_ordering);\n\n    py::class_<NeighborSamplingOptions, std::shared_ptr<NeighborSamplingOptions>>(m, \"NeighborSamplingOptions\").def(py::init<>());\n\n    py::class_<UniformSamplingOptions, NeighborSamplingOptions, std::shared_ptr<UniformSamplingOptions>>(m, 
\"UniformSamplingOptions\")\n        .def(py::init<>())\n        .def_readwrite(\"max_neighbors\", &UniformSamplingOptions::max_neighbors);\n\n    py::class_<DropoutSamplingOptions, NeighborSamplingOptions, std::shared_ptr<DropoutSamplingOptions>>(m, \"DropoutSamplingOptions\")\n        .def(py::init<>())\n        .def_readwrite(\"rate\", &DropoutSamplingOptions::rate);\n}"
  },
  {
    "path": "src/cpp/python_bindings/configuration/wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n\n// configuration\nvoid init_config(py::module &);\nvoid init_options(py::module &);\n\nPYBIND11_MODULE(_config, m) {\n    m.doc() = \"Configuration and options for API objects.\";\n\n    // configuration\n    init_config(m);\n    init_options(m);\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/manager/marius_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 4/9/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"marius.h\"\n\nvoid init_marius(py::module &m) {\n    m.def(\"marius_train\", &marius_train, py::arg(\"config\"), py::call_guard<py::gil_scoped_release>());\n    m.def(\"marius_eval\", &marius_eval, py::arg(\"config\"), py::call_guard<py::gil_scoped_release>());\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/manager/wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n\nvoid init_marius(py::module &);\n\nPYBIND11_MODULE(_manager, m) {\n    m.doc() = \"High level execution management.\";\n\n    // manager\n    init_marius(m);\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/nn/activation_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"nn/activation.h\"\n\nnamespace py = pybind11;\n\nvoid init_activation(py::module &m) { m.def(\"apply_activation\", &apply_activation, py::arg(\"activation_function\"), py::arg(\"input\")); }"
  },
  {
    "path": "src/cpp/python_bindings/nn/decoders/decoder_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/30/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/decoders/decoder.h\"\n\nclass PyDecoder : Decoder {\n    using Decoder::Decoder;\n};\n\nvoid init_decoder(py::module &m) { py::class_<Decoder, PyDecoder, shared_ptr<Decoder>>(m, \"Decoder\").def_readwrite(\"learning_task\", &Decoder::learning_task_); }"
  },
  {
    "path": "src/cpp/python_bindings/nn/decoders/edge/comparators_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/30/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/decoders/edge/comparators.h\"\n\nclass PyComparator : Comparator {\n   public:\n    using Comparator::Comparator;\n    torch::Tensor operator()(torch::Tensor src, torch::Tensor dst) override {\n        PYBIND11_OVERRIDE_PURE_NAME(torch::Tensor, Comparator, \"__call__\", operator(), src, dst);\n    }\n};\n\nvoid init_comparators(py::module &m) {\n    py::class_<Comparator, PyComparator, shared_ptr<Comparator>>(m, \"Comparator\").def(\"__call__\", &Comparator::operator(), py::arg(\"src\"), py::arg(\"dst\"));\n\n    py::class_<L2Compare, Comparator, shared_ptr<L2Compare>>(m, \"L2Compare\").def(py::init<>());\n\n    py::class_<CosineCompare, Comparator, std::shared_ptr<CosineCompare>>(m, \"CosineCompare\").def(py::init<>());\n\n    py::class_<DotCompare, Comparator, std::shared_ptr<DotCompare>>(m, \"DotCompare\").def(py::init<>());\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/decoders/edge/complex_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/30/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/decoders/edge/complex.h\"\n\nvoid init_complex(py::module &m) {\n    py::class_<ComplEx, EdgeDecoder, std::shared_ptr<ComplEx>>(m, \"ComplEx\")\n        .def(py::init([](int num_relations, int embedding_dim, bool use_inverse_relations, py::object py_device, py::object py_dtype, string decoder_method) {\n                 torch::TensorOptions options;\n                 options = options.device(torch::python::detail::py_object_to_device(py_device)).dtype(torch::python::detail::py_object_to_dtype(py_dtype));\n                 return std::make_shared<ComplEx>(num_relations, embedding_dim, options, use_inverse_relations, getEdgeDecoderMethod(decoder_method));\n             }),\n             py::arg(\"num_relations\"), py::arg(\"embedding_dim\"), py::arg(\"use_inverse_relations\") = true, py::arg(\"device\") = py::none(),\n             py::arg(\"dtype\") = py::none(), py::arg(\"mode\") = \"train\")\n        .def(\"reset\", &ComplEx::reset);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/decoders/edge/distmult_wrap.cpp",
    "content": "\n#include \"common/pybind_headers.h\"\n#include \"nn/decoders/edge/distmult.h\"\n\nvoid init_distmult(py::module &m) {\n    py::class_<DistMult, EdgeDecoder, std::shared_ptr<DistMult>>(m, \"DistMult\")\n        .def(py::init([](int num_relations, int embedding_dim, bool use_inverse_relations, py::object py_device, py::object py_dtype, string decoder_method) {\n                 torch::TensorOptions options;\n                 options = options.device(torch::python::detail::py_object_to_device(py_device)).dtype(torch::python::detail::py_object_to_dtype(py_dtype));\n                 return std::make_shared<DistMult>(num_relations, embedding_dim, options, use_inverse_relations, getEdgeDecoderMethod(decoder_method));\n             }),\n             py::arg(\"num_relations\"), py::arg(\"embedding_dim\"), py::arg(\"use_inverse_relations\") = true, py::arg(\"device\") = py::none(),\n             py::arg(\"dtype\") = py::none(), py::arg(\"mode\") = \"train\")\n        .def(\"reset\", &DistMult::reset);\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/nn/decoders/edge/edge_decoder_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/15/22.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/decoders/edge/edge_decoder.h\"\n\nvoid init_edge_decoder(py::module &m) {\n    py::class_<EdgeDecoder, Decoder, std::shared_ptr<EdgeDecoder>>(m, \"EdgeDecoder\")\n        .def_readwrite(\"comparator\", &EdgeDecoder::comparator_)\n        .def_readwrite(\"relation_operator\", &EdgeDecoder::relation_operator_)\n        .def_readwrite(\"relations\", &EdgeDecoder::relations_)\n        .def_readwrite(\"inverse_relations\", &EdgeDecoder::inverse_relations_)\n        .def_readwrite(\"num_relations\", &EdgeDecoder::num_relations_)\n        .def_readwrite(\"embedding_size\", &EdgeDecoder::embedding_size_)\n        .def_readwrite(\"mode\", &EdgeDecoder::decoder_method_)\n        .def_readwrite(\"tensor_options\", &EdgeDecoder::tensor_options_)\n        .def_readwrite(\"use_inverse_relations\", &EdgeDecoder::use_inverse_relations_)\n        .def(\"apply_relation\", &EdgeDecoder::apply_relation, py::arg(\"nodes\"), py::arg(\"relations\"))\n        .def(\"compute_scores\", &EdgeDecoder::apply_relation, py::arg(\"src\"), py::arg(\"dst\"))\n        .def(\"select_relations\", &EdgeDecoder::apply_relation, py::arg(\"indices\"), py::arg(\"inverse\") = false);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/decoders/edge/relation_operators_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/30/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/decoders/edge/relation_operators.h\"\n\n// Trampoline classes\nclass PyRelationOperator : RelationOperator {\n   public:\n    using RelationOperator::RelationOperator;\n    torch::Tensor operator()(const torch::Tensor &embs, const torch::Tensor &rels) override {\n        PYBIND11_OVERRIDE_PURE_NAME(torch::Tensor, RelationOperator, \"__call__\", operator(), embs, rels);\n    }\n};\n\nvoid init_relation_operators(py::module &m) {\n    py::class_<RelationOperator, PyRelationOperator, std::shared_ptr<RelationOperator>>(m, \"RelationOperator\")\n        .def(py::init<>())\n        .def(\"__call__\", &RelationOperator::operator(), py::arg(\"embs\"), py::arg(\"rels\"));\n\n    py::class_<HadamardOperator, RelationOperator, std::shared_ptr<HadamardOperator>>(m, \"HadamardOperator\").def(py::init<>());\n\n    py::class_<ComplexHadamardOperator, RelationOperator, std::shared_ptr<ComplexHadamardOperator>>(m, \"ComplexHadamardOperator\").def(py::init<>());\n\n    py::class_<TranslationOperator, RelationOperator, std::shared_ptr<TranslationOperator>>(m, \"TranslationOperator\").def(py::init<>());\n\n    py::class_<NoOp, RelationOperator, std::shared_ptr<NoOp>>(m, \"NoOp\").def(py::init<>());\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/nn/decoders/edge/transe_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/30/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/decoders/edge/transe.h\"\n\nvoid init_transe(py::module &m) {\n    py::class_<TransE, EdgeDecoder, std::shared_ptr<TransE>>(m, \"TransE\")\n        .def(py::init([](int num_relations, int embedding_dim, bool use_inverse_relations, py::object py_device, py::object py_dtype, string decoder_method) {\n                 torch::TensorOptions options;\n                 options = options.device(torch::python::detail::py_object_to_device(py_device)).dtype(torch::python::detail::py_object_to_dtype(py_dtype));\n                 return std::make_shared<TransE>(num_relations, embedding_dim, options, use_inverse_relations, getEdgeDecoderMethod(decoder_method));\n             }),\n             py::arg(\"num_relations\"), py::arg(\"embedding_dim\"), py::arg(\"use_inverse_relations\") = true, py::arg(\"device\") = py::none(),\n             py::arg(\"dtype\") = py::none(), py::arg(\"mode\") = \"train\")\n        .def(\"reset\", &TransE::reset);\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/nn/decoders/node/node_decoder_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/15/22.\n//\n\n#include <common/pybind_headers.h>\n\n#include \"nn/decoders/node/node_decoder.h\"\n\nvoid init_node_decoder(py::module &m) {\n    py::class_<NodeDecoder, Decoder, shared_ptr<NodeDecoder>>(m, \"NodeDecoder\").def(\"forward\", &NodeDecoder::forward, py::arg(\"inputs\"));\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/nn/decoders/node/noop_node_decoder.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/15/22.\n//\n\n#include \"nn/decoders/node/noop_node_decoder.h\"\n\n#include <common/pybind_headers.h>\n\nvoid init_noop_node_decoder(py::module &m) {\n    py::class_<NoOpNodeDecoder, NodeDecoder, torch::nn::Module, shared_ptr<NoOpNodeDecoder>>(m, \"NoOpNodeDecoder\")\n        .def(py::init<>())\n        .def(\"compute_labels\", &NoOpNodeDecoder::forward, py::arg(\"nodes\"))\n        .def(\"reset\", &NoOpNodeDecoder::reset);\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/nn/encoders/encoder_wrap.cpp",
    "content": "\n#include \"common/pybind_headers.h\"\n#include \"nn/encoders/encoder.h\"\n\nvoid init_encoder(py::module &m) {\n    py::class_<GeneralEncoder, torch::nn::Module, std::shared_ptr<GeneralEncoder>>(m, \"GeneralEncoder\")\n        .def_readwrite(\"encoder_config\", &GeneralEncoder::encoder_config_)\n        .def_readwrite(\"num_relations\", &GeneralEncoder::num_relations_)\n        .def_readwrite(\"device\", &GeneralEncoder::device_)\n        .def_readwrite(\"layers\", &GeneralEncoder::layers_)\n        .def(py::init<shared_ptr<EncoderConfig>, torch::Device, int>(), py::arg(\"encoder_config\"), py::arg(\"device\"), py::arg(\"num_relations\") = 1)\n        .def(py::init<std::vector<std::vector<shared_ptr<Layer>>>>(), py::arg(\"layers\"))\n        .def(\"forward\", &GeneralEncoder::forward, py::arg(\"embeddings\"), py::arg(\"features\"), py::arg(\"dense_graph\"), py::arg(\"train\") = true)\n        .def(\"reset\", &GeneralEncoder::reset);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/initialization_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"nn/initialization.h\"\n\nvoid init_initialization(py::module &m) {\n    m.def(\"compute_fans\", &compute_fans, py::arg(\"shape\"));\n\n    m.def(\n        \"glorot_uniform\",\n        [](std::vector<int64_t> shape, py::object py_device, py::object py_dtype, std::tuple<int64_t, int64_t> fans) {\n            torch::TensorOptions options;\n            options = options.device(torch::python::detail::py_object_to_device(py_device)).dtype(torch::python::detail::py_object_to_dtype(py_dtype));\n            return glorot_uniform(shape, fans, options);\n        },\n        py::arg(\"shape\"), py::arg(\"device\"), py::arg(\"dtype\"), py::arg(\"fans\") = std::make_tuple(-1, -1));\n\n    m.def(\n        \"glorot_normal\",\n        [](std::vector<int64_t> shape, py::object py_device, py::object py_dtype, std::tuple<int64_t, int64_t> fans) {\n            torch::TensorOptions options;\n            options = options.device(torch::python::detail::py_object_to_device(py_device)).dtype(torch::python::detail::py_object_to_dtype(py_dtype));\n            return glorot_normal(shape, fans, options);\n        },\n        py::arg(\"shape\"), py::arg(\"device\"), py::arg(\"dtype\"), py::arg(\"fans\") = std::make_tuple(-1, -1));\n\n    m.def(\n        \"constant_init\",\n        [](std::vector<int64_t> shape, float constant, py::object py_device, py::object py_dtype) {\n            torch::TensorOptions options;\n            options = options.device(torch::python::detail::py_object_to_device(py_device)).dtype(torch::python::detail::py_object_to_dtype(py_dtype));\n            return constant_init(constant, shape, options);\n        },\n        py::arg(\"shape\"), py::arg(\"constant\") = 0, py::arg(\"device\"), py::arg(\"dtype\"));\n\n    m.def(\n        \"uniform_init\",\n        [](std::vector<int64_t> shape, float scale_factor, py::object py_device, py::object py_dtype) {\n            torch::TensorOptions options;\n            
options = options.device(torch::python::detail::py_object_to_device(py_device)).dtype(torch::python::detail::py_object_to_dtype(py_dtype));\n            return uniform_init(scale_factor, shape, options);\n        },\n        py::arg(\"shape\"), py::arg(\"scale_factor\") = .001, py::arg(\"device\"), py::arg(\"dtype\"));\n\n    m.def(\n        \"normal_init\",\n        [](std::vector<int64_t> shape, float mean, float std, py::object py_device, py::object py_dtype) {\n            torch::TensorOptions options;\n            options = options.device(torch::python::detail::py_object_to_device(py_device)).dtype(torch::python::detail::py_object_to_dtype(py_dtype));\n            return normal_init(mean, std, shape, options);\n        },\n        py::arg(\"shape\"), py::arg(\"mean\") = 0, py::arg(\"std\") = 1, py::arg(\"device\"), py::arg(\"dtype\"));\n\n    m.def(\n        \"initialize_tensor\",\n        [](shared_ptr<InitConfig> init_config, std::vector<int64_t> shape, py::object py_device, py::object py_dtype, std::tuple<int64_t, int64_t> fans) {\n            torch::TensorOptions options;\n            options = options.device(torch::python::detail::py_object_to_device(py_device)).dtype(torch::python::detail::py_object_to_dtype(py_dtype));\n            return initialize_tensor(init_config, shape, options, fans);\n        },\n        py::arg(\"init_config\"), py::arg(\"shape\"), py::arg(\"device\"), py::arg(\"dtype\"), py::arg(\"fans\") = std::make_tuple(-1, -1));\n\n    m.def(\n        \"initialize_subtensor\",\n        [](shared_ptr<InitConfig> init_config, std::vector<int64_t> sub_shape, std::vector<int64_t> full_shape, py::object py_device, py::object py_dtype,\n           std::tuple<int64_t, int64_t> fans) {\n            torch::TensorOptions options;\n            options = options.device(torch::python::detail::py_object_to_device(py_device)).dtype(torch::python::detail::py_object_to_dtype(py_dtype));\n            return initialize_subtensor(init_config, sub_shape, 
full_shape, options, fans);\n        },\n        py::arg(\"init_config\"), py::arg(\"sub_shape\"), py::arg(\"full_shape\"), py::arg(\"device\"), py::arg(\"dtype\"), py::arg(\"fans\") = std::make_tuple(-1, -1));\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/layers/embedding/embedding_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"nn/layers/embedding/embedding.h\"\n\nvoid init_embedding_layer(py::module &m) {\n    py::class_<EmbeddingLayer, Layer, std::shared_ptr<EmbeddingLayer>>(m, \"EmbeddingLayer\")\n        .def_readwrite(\"offset\", &EmbeddingLayer::offset_)\n        .def(py::init<shared_ptr<LayerConfig>, torch::Device, int>(), py::arg(\"layer_config\"), py::arg(\"device\"), py::arg(\"offset\") = 0)\n        .def(py::init([](int dimension, torch::Device device, InitConfig init, bool bias, InitConfig bias_init, string activation, int offset) {\n                 auto layer_config = std::make_shared<LayerConfig>();\n                 layer_config->input_dim = -1;\n                 layer_config->output_dim = dimension;\n                 layer_config->type = LayerType::EMBEDDING;\n                 layer_config->init = std::make_shared<InitConfig>(init);\n                 layer_config->bias = bias;\n                 layer_config->bias_init = std::make_shared<InitConfig>(bias_init);\n                 layer_config->optimizer = nullptr;\n                 layer_config->activation = getActivationFunction(activation);\n\n                 return std::make_shared<EmbeddingLayer>(layer_config, device, offset);\n             }),\n             py::arg(\"dimension\"), py::arg(\"device\"), py::arg(\"init\") = InitConfig(InitDistribution::GLOROT_UNIFORM, nullptr), py::arg(\"bias\") = false,\n             py::arg(\"bias_init\") = InitConfig(InitDistribution::ZEROS, nullptr), py::arg(\"activation\") = \"none\", py::arg(\"offset\") = 0)\n        .def(\"init_embeddings\", &EmbeddingLayer::init_embeddings, py::arg(\"num_nodes\"))\n        .def(\"forward\", &EmbeddingLayer::forward, py::arg(\"input\"))\n        .def(\"reset\", &EmbeddingLayer::reset);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/layers/feature/feature_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/15/22.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/layers/feature/feature.h\"\n\nvoid init_feature_layer(py::module &m) {\n    py::class_<FeatureLayer, Layer, std::shared_ptr<FeatureLayer>>(m, \"FeatureLayer\")\n        .def_readwrite(\"offset\", &FeatureLayer::offset_)\n        .def(py::init<shared_ptr<LayerConfig>, torch::Device, int>(), py::arg(\"layer_config\"), py::arg(\"device\"), py::arg(\"offset\") = 0)\n        .def(py::init([](int dimension, torch::Device device, bool bias, InitConfig bias_init, string activation, int offset) {\n                 auto layer_config = std::make_shared<LayerConfig>();\n                 layer_config->input_dim = -1;\n                 layer_config->output_dim = dimension;\n                 layer_config->type = LayerType::FEATURE;\n                 layer_config->init = nullptr;\n                 layer_config->bias = bias;\n                 layer_config->bias_init = std::make_shared<InitConfig>(bias_init);\n                 layer_config->optimizer = nullptr;\n                 layer_config->activation = getActivationFunction(activation);\n\n                 return std::make_shared<FeatureLayer>(layer_config, device, offset);\n             }),\n             py::arg(\"dimension\"), py::arg(\"device\"), py::arg(\"bias\") = false, py::arg(\"bias_init\") = InitConfig(InitDistribution::ZEROS, nullptr),\n             py::arg(\"activation\") = \"none\", py::arg(\"offset\") = 0)\n        .def(\"forward\", &FeatureLayer::forward, py::arg(\"input\"))\n        .def(\"reset\", &FeatureLayer::reset);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/layers/gnn/gat_layer_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/30/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/layers/gnn/gat_layer.h\"\n\nvoid init_gat_layer(py::module &m) {\n    py::class_<GATLayer, GNNLayer, shared_ptr<GATLayer>>(m, \"GATLayer\")\n        .def_readwrite(\"options\", &GATLayer::options_)\n        .def_readwrite(\"head_dim\", &GATLayer::head_dim_)\n        .def_readwrite(\"input_dropout\", &GATLayer::input_dropout_)\n        .def_readwrite(\"attention_dropout\", &GATLayer::attention_dropout_)\n        .def_readwrite(\"weight_matrices\", &GATLayer::weight_matrices_)\n        .def_readwrite(\"a_l\", &GATLayer::a_l_)\n        .def_readwrite(\"a_r\", &GATLayer::a_r_)\n        .def(py::init<shared_ptr<LayerConfig>, torch::Device>(), py::arg(\"layer_config\"), py::arg(\"device\"))\n        .def(py::init([](int input_dim, int output_dim, std::optional<torch::Device> device, int num_heads, bool average_heads, float input_dropout,\n                         float attention_dropout, float negative_slope, InitConfig init, bool bias, InitConfig bias_init, string activation) {\n                 auto layer_config = std::make_shared<LayerConfig>();\n                 layer_config->input_dim = input_dim;\n                 layer_config->output_dim = output_dim;\n                 layer_config->type = LayerType::GNN;\n\n                 auto layer_options = std::make_shared<GATLayerOptions>();\n                 layer_options->input_dropout = input_dropout;\n                 layer_options->attention_dropout = attention_dropout;\n                 layer_options->num_heads = num_heads;\n                 layer_options->negative_slope = negative_slope;\n                 layer_options->average_heads = average_heads;\n                 layer_config->options = layer_options;\n\n                 layer_config->init = std::make_shared<InitConfig>(init);\n                 layer_config->bias = bias;\n                 layer_config->bias_init = 
std::make_shared<InitConfig>(bias_init);\n                 layer_config->optimizer = nullptr;\n                 layer_config->activation = getActivationFunction(activation);\n\n                 torch::Device torch_device = torch::kCPU;\n                 if (device.has_value()) {\n                     torch_device = device.value();\n                 }\n\n                 return std::make_shared<GATLayer>(layer_config, torch_device);\n             }),\n             py::arg(\"input_dim\"), py::arg(\"output_dim\"), py::arg(\"device\") = py::none(), py::arg(\"num_heads\") = 10, py::arg(\"average_heads\") = false,\n             py::arg(\"input_dropout\") = 0.0, py::arg(\"attention_dropout\") = 0.0, py::arg(\"negative_slope\") = .2,\n             py::arg(\"init\") = InitConfig(InitDistribution::GLOROT_UNIFORM, nullptr), py::arg(\"bias\") = false,\n             py::arg(\"bias_init\") = InitConfig(InitDistribution::ZEROS, nullptr), py::arg(\"activation\") = \"none\")\n        .def(\"reset\", &GATLayer::reset)\n        .def(\"forward\", &GATLayer::forward, py::arg(\"inputs\"), py::arg(\"dense_graph\"), py::arg(\"train\") = true);\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/nn/layers/gnn/gcn_layer_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/30/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/layers/gnn/gcn_layer.h\"\n\nvoid init_gcn_layer(py::module &m) {\n    py::class_<GCNLayer, GNNLayer, shared_ptr<GCNLayer>>(m, \"GCNLayer\")\n        .def_readwrite(\"options\", &GCNLayer::options_)\n        .def_readwrite(\"w_\", &GCNLayer::w_)\n        .def(py::init<shared_ptr<LayerConfig>, torch::Device>(), py::arg(\"layer_config\"), py::arg(\"device\"))\n        .def(py::init(\n                 [](int input_dim, int output_dim, std::optional<torch::Device> device, InitConfig init, bool bias, InitConfig bias_init, string activation) {\n                     auto layer_config = std::make_shared<LayerConfig>();\n                     layer_config->input_dim = input_dim;\n                     layer_config->output_dim = output_dim;\n                     layer_config->type = LayerType::GNN;\n\n                     auto layer_options = std::make_shared<GNNLayerOptions>();\n                     layer_config->options = layer_options;\n\n                     layer_config->init = std::make_shared<InitConfig>(init);\n                     layer_config->bias = bias;\n                     layer_config->bias_init = std::make_shared<InitConfig>(bias_init);\n                     layer_config->optimizer = nullptr;\n                     layer_config->activation = getActivationFunction(activation);\n\n                     torch::Device torch_device = torch::kCPU;\n                     if (device.has_value()) {\n                         torch_device = device.value();\n                     }\n\n                     return std::make_shared<GCNLayer>(layer_config, torch_device);\n                 }),\n             py::arg(\"input_dim\"), py::arg(\"output_dim\"), py::arg(\"device\") = py::none(),\n             py::arg(\"init\") = InitConfig(InitDistribution::GLOROT_UNIFORM, nullptr), py::arg(\"bias\") = false,\n             py::arg(\"bias_init\") = 
InitConfig(InitDistribution::ZEROS, nullptr), py::arg(\"activation\") = \"none\")\n        .def(\"reset\", &GCNLayer::reset)\n        .def(\"forward\", &GCNLayer::forward, py::arg(\"inputs\"), py::arg(\"dense_graph\"), py::arg(\"train\") = true);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/layers/gnn/gnn_layer_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/30/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/layers/gnn/gnn_layer.h\"\n\nclass PyGNNLayer : GNNLayer {\n   public:\n    using GNNLayer::GNNLayer;\n    torch::Tensor forward(torch::Tensor inputs, DENSEGraph dense_graph, bool train) override {\n        PYBIND11_OVERRIDE_PURE(torch::Tensor, GNNLayer, forward, inputs, dense_graph, train);\n    }\n};\n\nvoid init_gnn_layer(py::module &m) {\n    py::class_<GNNLayer, PyGNNLayer, Layer, shared_ptr<GNNLayer>>(m, \"GNNLayer\")\n        .def_readwrite(\"input_dim\", &GNNLayer::input_dim_)\n        .def_readwrite(\"output_dim\", &GNNLayer::output_dim_)\n        .def(\"forward\", &GNNLayer::forward, py::arg(\"inputs\"), py::arg(\"dense_graph\"), py::arg(\"train\"));\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/layers/gnn/graph_sage_layer_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/30/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/layers/gnn/graph_sage_layer.h\"\n\nvoid init_graph_sage_layer(py::module &m) {\n    py::class_<GraphSageLayer, GNNLayer, shared_ptr<GraphSageLayer>>(m, \"GraphSageLayer\")\n        .def_readwrite(\"options\", &GraphSageLayer::options_)\n        .def_readwrite(\"w1\", &GraphSageLayer::w1_)\n        .def_readwrite(\"w2_\", &GraphSageLayer::w2_)\n        .def(py::init<shared_ptr<LayerConfig>, torch::Device>(), py::arg(\"layer_config\"), py::arg(\"device\"))\n        .def(py::init([](int input_dim, int output_dim, std::optional<torch::Device> device, std::string aggregator, InitConfig init, bool bias,\n                         InitConfig bias_init, string activation) {\n                 auto layer_config = std::make_shared<LayerConfig>();\n                 layer_config->input_dim = input_dim;\n                 layer_config->output_dim = output_dim;\n                 layer_config->type = LayerType::GNN;\n\n                 auto layer_options = std::make_shared<GraphSageLayerOptions>();\n                 layer_options->aggregator = getGraphSageAggregator(aggregator);\n                 layer_config->options = layer_options;\n\n                 layer_config->init = std::make_shared<InitConfig>(init);\n                 layer_config->bias = bias;\n                 layer_config->bias_init = std::make_shared<InitConfig>(bias_init);\n                 layer_config->optimizer = nullptr;\n                 layer_config->activation = getActivationFunction(activation);\n\n                 torch::Device torch_device = torch::kCPU;\n                 if (device.has_value()) {\n                     torch_device = device.value();\n                 }\n\n                 return std::make_shared<GraphSageLayer>(layer_config, torch_device);\n             }),\n             py::arg(\"input_dim\"), py::arg(\"output_dim\"), py::arg(\"device\") = py::none(), 
py::arg(\"aggregator\") = \"mean\",\n             py::arg(\"init\") = InitConfig(InitDistribution::GLOROT_UNIFORM, nullptr), py::arg(\"bias\") = false,\n             py::arg(\"bias_init\") = InitConfig(InitDistribution::ZEROS, nullptr), py::arg(\"activation\") = \"none\")\n        .def(\"reset\", &GraphSageLayer::reset)\n        .def(\"forward\", &GraphSageLayer::forward, py::arg(\"inputs\"), py::arg(\"dense_graph\"), py::arg(\"train\") = true);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/layers/gnn/layer_helpers_wrap.cpp",
    "content": "\n#include \"common/pybind_headers.h\"\n#include \"nn/layers/gnn/layer_helpers.h\"\n\nvoid init_layer_helpers(py::module &m) {\n    m.def(\"segment_ids_from_offsets\", &segment_ids_from_offsets, py::arg(\"offsets\"), py::arg(\"input_size\"));\n\n    m.def(\"segmented_sum\", &segmented_sum, py::arg(\"tensor\"), py::arg(\"segment_ids\"), py::arg(\"num_segments\"));\n\n    m.def(\"segmented_sum_with_offsets\", &segmented_sum_with_offsets, py::arg(\"tensor\"), py::arg(\"offsets\"));\n\n    m.def(\"segmented_max_with_offsets\", &segmented_max_with_offsets, py::arg(\"tensor\"), py::arg(\"offsets\"));\n\n    m.def(\"attention_softmax\", &attention_softmax, py::arg(\"neighbor_attention\"), py::arg(\"self_attention\"), py::arg(\"segment_offsets\"), py::arg(\"segment_ids\"),\n          py::arg(\"num_nbrs\"));\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/layers/gnn/rgcn_layer_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/30/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/layers/gnn/rgcn_layer.h\"\n\nvoid init_rgcn_layer(py::module &m) {\n    py::class_<RGCNLayer, GNNLayer, shared_ptr<RGCNLayer>>(m, \"RGCNLayer\")\n        .def_readwrite(\"options\", &RGCNLayer::options_)\n        .def_readwrite(\"num_relations\", &RGCNLayer::num_relations_)\n        .def_readwrite(\"relation_matrices_\", &RGCNLayer::relation_matrices_)\n        .def_readwrite(\"inverse_relation_matrices_\", &RGCNLayer::inverse_relation_matrices_)\n        .def_readwrite(\"self_matrix_\", &RGCNLayer::self_matrix_)\n        .def(py::init<shared_ptr<LayerConfig>, int, torch::Device>(), py::arg(\"layer_config\"), py::arg(\"num_relations\"), py::arg(\"device\"))\n        .def(py::init([](int input_dim, int output_dim, int num_relations, std::optional<torch::Device> device, InitConfig init, bool bias,\n                         InitConfig bias_init, string activation) {\n                 auto layer_config = std::make_shared<LayerConfig>();\n                 layer_config->input_dim = input_dim;\n                 layer_config->output_dim = output_dim;\n                 layer_config->type = LayerType::GNN;\n\n                 auto layer_options = std::make_shared<GNNLayerOptions>();\n                 layer_config->options = layer_options;\n\n                 layer_config->init = std::make_shared<InitConfig>(init);\n                 layer_config->bias = bias;\n                 layer_config->bias_init = std::make_shared<InitConfig>(bias_init);\n                 layer_config->optimizer = nullptr;\n                 layer_config->activation = getActivationFunction(activation);\n\n                 torch::Device torch_device = torch::kCPU;\n                 if (device.has_value()) {\n                     torch_device = device.value();\n                 }\n\n                 return std::make_shared<RGCNLayer>(layer_config, num_relations, torch_device);\n             
}),\n             py::arg(\"input_dim\"), py::arg(\"output_dim\"), py::arg(\"num_relations\"), py::arg(\"device\") = py::none(),\n             py::arg(\"init\") = InitConfig(InitDistribution::GLOROT_UNIFORM, nullptr), py::arg(\"bias\") = false,\n             py::arg(\"bias_init\") = InitConfig(InitDistribution::ZEROS, nullptr), py::arg(\"activation\") = \"none\")\n        .def(\"reset\", &RGCNLayer::reset)\n        .def(\"forward\", &RGCNLayer::forward, py::arg(\"inputs\"), py::arg(\"dense_graph\"), py::arg(\"train\") = true);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/layers/layer_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/30/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/layers/reduction/reduction_layer.h\"\n\nnamespace py = pybind11;\n\nclass PyLayer : Layer {\n   public:\n    using Layer::Layer;\n};\n\nvoid init_layer(py::module &m) {\n    py::class_<Layer, PyLayer, torch::nn::Module, std::shared_ptr<Layer>>(m, \"Layer\")\n        .def_readwrite(\"config\", &Layer::config_)\n        .def_readwrite(\"device\", &Layer::device_)\n        .def_readwrite(\"bias\", &Layer::bias_)\n        .def(\"post_hook\", &ReductionLayer::post_hook, py::arg(\"inputs\"))\n        .def(\"init_bias\", &ReductionLayer::init_bias);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/layers/reduction/concat_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/15/22.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/layers/reduction/concat.h\"\n\nvoid init_concat_reduction_layer(py::module &m) {\n    py::class_<ConcatReduction, ReductionLayer, std::shared_ptr<ConcatReduction>>(m, \"ConcatReduction\")\n        .def(py::init<shared_ptr<LayerConfig>, torch::Device>(), py::arg(\"layer_config\"), py::arg(\"device\"))\n        .def(py::init(\n                 [](int input_dim, int output_dim, std::optional<torch::Device> device, InitConfig init, bool bias, InitConfig bias_init, string activation) {\n                     auto layer_config = std::make_shared<LayerConfig>();\n                     layer_config->input_dim = input_dim;\n                     layer_config->output_dim = output_dim;\n                     layer_config->type = LayerType::GNN;\n\n                     auto layer_options = std::make_shared<ReductionLayerOptions>();\n                     layer_options->type = ReductionLayerType::CONCAT;\n                     layer_config->options = layer_options;\n\n                     layer_config->init = std::make_shared<InitConfig>(init);\n                     layer_config->bias = bias;\n                     layer_config->bias_init = std::make_shared<InitConfig>(bias_init);\n                     layer_config->optimizer = nullptr;\n                     layer_config->activation = getActivationFunction(activation);\n\n                     torch::Device torch_device = torch::kCPU;\n                     if (device.has_value()) {\n                         torch_device = device.value();\n                     }\n\n                     return std::make_shared<ConcatReduction>(layer_config, torch_device);\n                 }),\n             py::arg(\"input_dim\"), py::arg(\"output_dim\"), py::arg(\"device\") = py::none(),\n             py::arg(\"init\") = InitConfig(InitDistribution::GLOROT_UNIFORM, nullptr), py::arg(\"bias\") = false,\n             
py::arg(\"bias_init\") = InitConfig(InitDistribution::ZEROS, nullptr), py::arg(\"activation\") = \"none\")\n        .def(\"forward\", &ConcatReduction::forward, py::arg(\"input\"))\n        .def(\"reset\", &ConcatReduction::reset);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/layers/reduction/linear_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/15/22.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/layers/reduction/linear.h\"\n\nvoid init_linear_reduction_layer(py::module &m) {\n    py::class_<LinearReduction, ReductionLayer, std::shared_ptr<LinearReduction>>(m, \"LinearReduction\")\n        .def_readwrite(\"weight_matrix\", &LinearReduction::weight_matrix_)\n        .def(py::init<shared_ptr<LayerConfig>, torch::Device>(), py::arg(\"layer_config\"), py::arg(\"device\"))\n        .def(py::init(\n                 [](int input_dim, int output_dim, std::optional<torch::Device> device, InitConfig init, bool bias, InitConfig bias_init, string activation) {\n                     auto layer_config = std::make_shared<LayerConfig>();\n                     layer_config->input_dim = input_dim;\n                     layer_config->output_dim = output_dim;\n                     layer_config->type = LayerType::GNN;\n\n                     auto layer_options = std::make_shared<ReductionLayerOptions>();\n                     layer_options->type = ReductionLayerType::LINEAR;\n                     layer_config->options = layer_options;\n\n                     layer_config->init = std::make_shared<InitConfig>(init);\n                     layer_config->bias = bias;\n                     layer_config->bias_init = std::make_shared<InitConfig>(bias_init);\n                     layer_config->optimizer = nullptr;\n                     layer_config->activation = getActivationFunction(activation);\n\n                     torch::Device torch_device = torch::kCPU;\n                     if (device.has_value()) {\n                         torch_device = device.value();\n                     }\n\n                     return std::make_shared<LinearReduction>(layer_config, torch_device);\n                 }),\n             py::arg(\"input_dim\"), py::arg(\"output_dim\"), py::arg(\"device\") = py::none(),\n             py::arg(\"init\") = 
InitConfig(InitDistribution::GLOROT_UNIFORM, nullptr), py::arg(\"bias\") = false,\n             py::arg(\"bias_init\") = InitConfig(InitDistribution::ZEROS, nullptr), py::arg(\"activation\") = \"none\")\n        .def(\"forward\", &LinearReduction::forward, py::arg(\"input\"))\n        .def(\"reset\", &LinearReduction::reset);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/layers/reduction/reduction_layer_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/30/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"nn/layers/reduction/reduction_layer.h\"\n\nclass PyReductionLayer : ReductionLayer {\n   public:\n    using ReductionLayer::ReductionLayer;\n    torch::Tensor forward(std::vector<torch::Tensor> inputs) override { PYBIND11_OVERRIDE_PURE(torch::Tensor, ReductionLayer, forward, inputs); }\n};\n\nvoid init_reduction_layer(py::module &m) {\n    py::class_<ReductionLayer, PyReductionLayer, Layer, std::shared_ptr<ReductionLayer>>(m, \"ReductionLayer\")\n        .def(\"forward\", &ReductionLayer::forward, py::arg(\"inputs\"));\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/loss_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"nn/loss.h\"\n\nnamespace py = pybind11;\n\nclass PyLossFunction : LossFunction {\n   public:\n    using LossFunction::LossFunction;\n    torch::Tensor operator()(torch::Tensor y_pred, torch::Tensor targets, bool scores) override {\n        PYBIND11_OVERRIDE_PURE_NAME(torch::Tensor, LossFunction, \"__call__\", operator(), y_pred, targets, scores);\n    }\n};\n\nvoid init_loss(py::module &m) {\n    py::class_<LossFunction, PyLossFunction, shared_ptr<LossFunction>>(m, \"LossFunction\")\n        .def(py::init<>())\n        .def(\"__call__\", &LossFunction::operator(), py::arg(\"y_pred\"), py::arg(\"targets\"), py::arg(\"scores\"));\n\n    py::class_<SoftmaxCrossEntropy, LossFunction, shared_ptr<SoftmaxCrossEntropy>>(m, \"SoftmaxCrossEntropy\")\n        .def(py::init([](string reduction) {\n                 auto options = std::make_shared<LossOptions>();\n                 options->loss_reduction = getLossReduction(reduction);\n                 return std::make_shared<SoftmaxCrossEntropy>(options);\n             }),\n             py::arg(\"reduction\") = \"sum\");\n\n    py::class_<RankingLoss, LossFunction, shared_ptr<RankingLoss>>(m, \"RankingLoss\")\n        .def(py::init([](string reduction, float margin) {\n                 auto options = std::make_shared<RankingLossOptions>();\n                 options->loss_reduction = getLossReduction(reduction);\n                 options->margin = margin;\n                 return std::make_shared<RankingLoss>(options);\n             }),\n             py::arg(\"reduction\") = \"sum\", py::arg(\"margin\") = 1.0);\n\n    py::class_<CrossEntropyLoss, LossFunction, shared_ptr<CrossEntropyLoss>>(m, \"CrossEntropyLoss\")\n        .def(py::init([](string reduction) {\n                 auto options = std::make_shared<LossOptions>();\n                 options->loss_reduction = getLossReduction(reduction);\n                 return 
std::make_shared<CrossEntropyLoss>(options);\n             }),\n             py::arg(\"reduction\") = \"sum\");\n\n    py::class_<BCEAfterSigmoidLoss, LossFunction, shared_ptr<BCEAfterSigmoidLoss>>(m, \"BCEAfterSigmoidLoss\")\n        .def(py::init([](string reduction) {\n                 auto options = std::make_shared<LossOptions>();\n                 options->loss_reduction = getLossReduction(reduction);\n                 return std::make_shared<BCEAfterSigmoidLoss>(options);\n             }),\n             py::arg(\"reduction\") = \"sum\");\n\n    py::class_<BCEWithLogitsLoss, LossFunction, shared_ptr<BCEWithLogitsLoss>>(m, \"BCEWithLogitsLoss\")\n        .def(py::init([](string reduction) {\n                 auto options = std::make_shared<LossOptions>();\n                 options->loss_reduction = getLossReduction(reduction);\n                 return std::make_shared<BCEWithLogitsLoss>(options);\n             }),\n             py::arg(\"reduction\") = \"sum\");\n\n    py::class_<MSELoss, LossFunction, shared_ptr<MSELoss>>(m, \"MSELoss\")\n        .def(py::init([](string reduction) {\n                 auto options = std::make_shared<LossOptions>();\n                 options->loss_reduction = getLossReduction(reduction);\n                 return std::make_shared<MSELoss>(options);\n             }),\n             py::arg(\"reduction\") = \"sum\");\n\n    py::class_<SoftPlusLoss, LossFunction, shared_ptr<SoftPlusLoss>>(m, \"SoftPlusLoss\")\n        .def(py::init([](string reduction) {\n                 auto options = std::make_shared<LossOptions>();\n                 options->loss_reduction = getLossReduction(reduction);\n                 return std::make_shared<SoftPlusLoss>(options);\n             }),\n             py::arg(\"reduction\") = \"sum\");\n\n    m.def(\"getLossFunction\", &getLossFunction, py::arg(\"config\"));\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/model_wrap.cpp",
    "content": "//\n// Created by Jason Mohoney on 3/23/21.\n//\n\n#include \"common/pybind_headers.h\"\n#include \"configuration/config.h\"\n#include \"configuration/util.h\"\n#include \"nn/model.h\"\n\nclass PyModel : Model {\n   public:\n    using Model::Model;\n};\n\nvoid init_model(py::module &m) {\n    py::class_<Model, PyModel, torch::nn::Module, shared_ptr<Model>>(m, \"Model\", py::dynamic_attr())\n        .def_readwrite(\"encoder\", &Model::encoder_)\n        .def_readwrite(\"decoder\", &Model::decoder_)\n        .def_readwrite(\"optimizers\", &Model::optimizers_)\n        .def_readwrite(\"loss_function\", &Model::loss_function_)\n        .def_readwrite(\"reporter\", &Model::reporter_)\n        .def_readwrite(\"device\", &Model::device_)\n        .def_readwrite(\"learning_task\", &Model::learning_task_)\n        .def_readwrite(\"sparse_lr\", &Model::sparse_lr_)\n        .def_readwrite(\"device_models\", &Model::device_models_)\n        .def(py::init<shared_ptr<GeneralEncoder>, shared_ptr<Decoder>, shared_ptr<LossFunction>, shared_ptr<Reporter>>())\n        .def(py::init([](shared_ptr<GeneralEncoder> encoder, shared_ptr<Decoder> decoder, shared_ptr<LossFunction> loss, shared_ptr<Reporter> reporter,\n                         float sparse_lr) {\n                 auto model = std::make_shared<Model>(encoder, decoder, loss, reporter);\n                 model->sparse_lr_ = sparse_lr;\n                 return model;\n             }),\n             py::arg(\"encoder\"), py::arg(\"decoder\"), py::arg(\"loss\") = nullptr, py::arg(\"reporter\") = nullptr, py::arg(\"sparse_lr\") = .1)\n\n        .def(\"forward_nc\", &Model::forward_nc, py::arg(\"node_embeddings\"), py::arg(\"node_features\"), py::arg(\"dense_graph\"), py::arg(\"train\"),\n             py::call_guard<py::gil_scoped_release>())\n        .def(\"forward_lp\", &Model::forward_lp, py::arg(\"batch\"), py::arg(\"train\"), py::call_guard<py::gil_scoped_release>())\n        .def(\"train_batch\", 
&Model::train_batch, py::arg(\"batch\"), py::arg(\"call_step\") = true, py::call_guard<py::gil_scoped_release>())\n        .def(\"evaluate_batch\", &Model::evaluate_batch, py::arg(\"batch\"), py::call_guard<py::gil_scoped_release>())\n        .def(\"clear_grad\", &Model::clear_grad)\n        .def(\"clear_grad_all\", &Model::clear_grad_all)\n        .def(\"step\", &Model::step)\n        .def(\"step_all\", &Model::step_all)\n        .def(\"save\", &Model::save, py::arg(\"directory\"))\n        .def(\"load\", &Model::load, py::arg(\"directory\"), py::arg(\"train\"))\n        .def(\"broadcast\", &Model::broadcast, py::arg(\"devices\"))\n        .def(\"all_reduce\", &Model::all_reduce);\n\n    m.def(\n        \"initModelFromConfig\",\n        [](pyobj python_config, pybind11::list devices_pylist, int num_relations, bool train) {\n            std::vector<torch::Device> devices = {};\n\n            for (auto py_id : devices_pylist) {\n                pyobj id_object = pybind11::reinterpret_borrow<pyobj>(py_id);\n                devices.emplace_back(torch::python::detail::py_object_to_device(id_object));\n            }\n\n            shared_ptr<ModelConfig> model_config = initModelConfig(python_config);\n\n            return initModelFromConfig(model_config, devices, num_relations, train);\n        },\n        py::arg(\"model_config\"), py::arg(\"devices\"), py::arg(\"num_relations\"), py::arg(\"train\"), py::return_value_policy::move);\n\n    m.def(\n        \"load_from_file\",\n        [](string config_path, bool train) {\n            auto config = loadConfig(config_path, false);\n            auto devices = devices_from_config(config->storage);\n            auto model = initModelFromConfig(config->model, devices, config->storage->dataset->num_relations, train);\n            model->load(config->storage->model_dir, train);\n            return model;\n        },\n        py::arg(\"config_path\"), py::arg(\"train\"), py::return_value_policy::move);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/optim_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"nn/optim.h\"\n\nnamespace py = pybind11;\n\nclass PyOptimizer : Optimizer {\n   public:\n    using Optimizer::Optimizer;\n\n    void reset_state() override { PYBIND11_OVERRIDE_PURE_NAME(void, Optimizer, \"reset_state\", reset_state); }\n\n    void step() override { PYBIND11_OVERRIDE_PURE_NAME(void, Optimizer, \"step\", step); }\n};\n\nvoid init_optim(py::module &m) {\n    py::class_<Optimizer, PyOptimizer, std::shared_ptr<Optimizer>>(m, \"Optimizer\")\n        .def_readwrite(\"num_steps\", &Optimizer::num_steps_)\n        //        .def_readwrite(\"state_dict\", &Optimizer::state_dict_)\n        //        .def_readwrite(\"param_dict\", &Optimizer::param_dict_)\n        // TODO need to provide bindings for torch::serialize::InputArchive and torch::serialize::OutputArchive\n        //        .def(\"save\", &Optimizer::save, py::arg(\"output_archive\"))\n        //        .def(\"load\", &Optimizer::load, py::arg(\"input_archive\"))\n        .def(\"clear_grad\", &Optimizer::clear_grad)\n        .def(\"reset_state\", &Optimizer::reset_state)\n        .def(\"step\", &Optimizer::step);\n\n    py::class_<SGDOptimizer, Optimizer, std::shared_ptr<SGDOptimizer>>(m, \"SGDOptimizer\")\n        .def_readwrite(\"learning_rate\", &SGDOptimizer::learning_rate_)\n        .def(py::init<torch::OrderedDict<std::string, torch::Tensor>, float>(), py::arg(\"param_dict\"), py::arg(\"learning_rate\"));\n\n    py::class_<AdagradOptimizer, Optimizer, std::shared_ptr<AdagradOptimizer>>(m, \"AdagradOptimizer\")\n        .def_readwrite(\"learning_rate\", &AdagradOptimizer::learning_rate_)\n        .def_readwrite(\"eps\", &AdagradOptimizer::eps_)\n        .def_readwrite(\"lr_decay\", &AdagradOptimizer::lr_decay_)\n        .def_readwrite(\"weight_decay\", &AdagradOptimizer::weight_decay_)\n        .def_readwrite(\"init_value\", &AdagradOptimizer::init_value_)\n        .def(py::init<torch::OrderedDict<std::string, torch::Tensor>, 
std::shared_ptr<AdagradOptions>>(), py::arg(\"param_dict\"), py::arg(\"options\"))\n        .def(py::init([](torch::OrderedDict<std::string, torch::Tensor> param_dict, float learning_rate, float eps, float lr_decay, float init_value,\n                         float weight_decay) {\n                 auto options = std::make_shared<AdagradOptions>();\n                 options->learning_rate = learning_rate;\n                 options->eps = eps;\n                 options->lr_decay = lr_decay;\n                 options->init_value = init_value;\n                 options->weight_decay = weight_decay;\n\n                 return std::make_shared<AdagradOptimizer>(param_dict, options);\n             }),\n             py::arg(\"param_dict\"), py::arg(\"lr\") = .1, py::arg(\"eps\") = 1e-10, py::arg(\"lr_decay\") = 0, py::arg(\"init_value\") = 0,\n             py::arg(\"weight_decay\") = 0);\n\n    py::class_<AdamOptimizer, Optimizer, std::shared_ptr<AdamOptimizer>>(m, \"AdamOptimizer\")\n        .def_readwrite(\"learning_rate\", &AdamOptimizer::learning_rate_)\n        .def_readwrite(\"eps\", &AdamOptimizer::eps_)\n        .def_readwrite(\"beta_1\", &AdamOptimizer::beta_1_)\n        .def_readwrite(\"beta_2\", &AdamOptimizer::beta_2_)\n        .def_readwrite(\"weight_decay\", &AdamOptimizer::weight_decay_)\n        .def_readwrite(\"amsgrad\", &AdamOptimizer::amsgrad_)\n        .def(py::init<torch::OrderedDict<std::string, torch::Tensor>, std::shared_ptr<AdamOptions>>(), py::arg(\"param_dict\"), py::arg(\"options\"))\n        .def(py::init([](torch::OrderedDict<std::string, torch::Tensor> param_dict, float learning_rate, float eps, float beta_1, float beta_2,\n                         float weight_decay, bool amsgrad) {\n                 auto options = std::make_shared<AdamOptions>();\n                 options->learning_rate = learning_rate;\n                 options->eps = eps;\n                 options->beta_1 = beta_1;\n                 options->beta_2 = beta_2;\n           
      options->weight_decay = weight_decay;\n                 options->amsgrad = amsgrad;\n\n                 return std::make_shared<AdamOptimizer>(param_dict, options);\n             }),\n             py::arg(\"param_dict\"), py::arg(\"lr\") = .1, py::arg(\"eps\") = 1e-8, py::arg(\"beta_1\") = .9, py::arg(\"beta_2\") = .999, py::arg(\"weight_decay\") = 0,\n             py::arg(\"amsgrad\") = false);\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/regularizer_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"nn/regularizer.h\"\n\nnamespace py = pybind11;\n\nclass PyRegularizer : Regularizer {\n   public:\n    using Regularizer::Regularizer;\n    torch::Tensor operator()(torch::Tensor src_nodes_embs, torch::Tensor dst_node_embs) override {\n        PYBIND11_OVERRIDE_PURE_NAME(torch::Tensor, Regularizer, \"__call__\", operator(), src_nodes_embs, dst_node_embs);\n    }\n};\n\nvoid init_regularizer(py::module &m) {\n    py::class_<Regularizer, PyRegularizer>(m, \"Regularizer\")\n        .def(py::init<>())\n        .def(\"__call__\", &Regularizer::operator(), py::arg(\"src_nodes_embs\"), py::arg(\"dst_node_embs\"));\n\n    py::class_<NormRegularizer, Regularizer>(m, \"NormRegularizer\").def(py::init<int, float>(), py::arg(\"norm\"), py::arg(\"coefficient\"));\n}"
  },
  {
    "path": "src/cpp/python_bindings/nn/wrap.cpp",
    "content": "#include <pybind11/embed.h>\n\n#include \"common/pybind_headers.h\"\n\nnamespace py = pybind11;\n\n// nn\nvoid init_activation(py::module &);\nvoid init_initialization(py::module &);\nvoid init_loss(py::module &);\nvoid init_model(py::module &);\nvoid init_optim(py::module &);\nvoid init_regularizer(py::module &);\n\n// nn/decoders\nvoid init_decoder(py::module &);\n\n// nn/decoders/edge\nvoid init_comparators(py::module &);\nvoid init_complex(py::module &);\nvoid init_distmult(py::module &);\nvoid init_edge_decoder(py::module &);\nvoid init_relation_operators(py::module &);\nvoid init_transe(py::module &);\n\n// nn/decoders/node\nvoid init_node_decoder(py::module &);\nvoid init_noop_node_decoder(py::module &);\n\n// nn/encoders\nvoid init_encoder(py::module &);\n\n// nn/layers\nvoid init_layer(py::module &);\n\n// nn/layers/dense\n\n// nn/layers/embedding\nvoid init_embedding_layer(py::module &);\n\n// nn/layers/feature\nvoid init_feature_layer(py::module &);\n\n// nn/layers/gnn\nvoid init_gat_layer(py::module &);\nvoid init_gcn_layer(py::module &);\nvoid init_gnn_layer(py::module &);\nvoid init_graph_sage_layer(py::module &);\nvoid init_layer_helpers(py::module &);\nvoid init_rgcn_layer(py::module &);\n\n// nn/layers/reduction\nvoid init_concat_reduction_layer(py::module &);\nvoid init_linear_reduction_layer(py::module &);\nvoid init_reduction_layer(py::module &);\n\nPYBIND11_MODULE(_nn, m) {\n    m.doc() = \"Contains model encoders, decoders and layers.\";\n\n    // nn\n    init_activation(m);\n    init_initialization(m);\n    init_loss(m);\n    init_model(m);\n    init_optim(m);\n    init_regularizer(m);\n\n    // nn/decoders\n    auto decoders_m = m.def_submodule(\"decoders\");\n    decoders_m.doc() = \"Decoder models\";\n\n    init_decoder(decoders_m);\n\n    // nn/decoders/edge\n    auto edge_m = decoders_m.def_submodule(\"edge\");\n    edge_m.doc() = \"Decoders for link prediction\";\n\n    init_edge_decoder(edge_m);\n    
init_comparators(edge_m);\n    init_complex(edge_m);\n    init_distmult(edge_m);\n    init_relation_operators(edge_m);\n    init_transe(edge_m);\n\n    // nn/decoders/node\n    auto node_m = decoders_m.def_submodule(\"node\");\n    node_m.doc() = \"Decoders for node classification\";\n\n    init_node_decoder(node_m);\n    init_noop_node_decoder(node_m);\n\n    // nn/encoders\n    auto encoders_m = m.def_submodule(\"encoders\");\n    encoders_m.doc() = \"Model encoders\";\n\n    init_encoder(encoders_m);\n\n    // nn/layers\n    auto layers_m = m.def_submodule(\"layers\");\n    layers_m.doc() = \"Layers for encoders\";\n    init_layer(layers_m);\n\n    // nn/layers/dense\n\n    // nn/layers/embedding\n    init_embedding_layer(layers_m);\n\n    // nn/layers/feature\n    init_feature_layer(layers_m);\n\n    // nn/layers/gnn\n    init_gnn_layer(layers_m);\n    init_gat_layer(layers_m);\n    init_gcn_layer(layers_m);\n    init_graph_sage_layer(layers_m);\n    init_layer_helpers(layers_m);\n    init_rgcn_layer(layers_m);\n\n    // nn/layers/reduction\n    init_reduction_layer(layers_m);\n    init_concat_reduction_layer(layers_m);\n    init_linear_reduction_layer(layers_m);\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/pipeline/evaluator_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"pipeline/evaluator.h\"\n\nnamespace py = pybind11;\n\n// Trampoline class\nclass PyEvaluator : Evaluator {\n   public:\n    using Evaluator::Evaluator;\n    void evaluate(bool validation) override { PYBIND11_OVERRIDE_PURE(void, Evaluator, evaluate, validation); }\n};\n\nvoid init_evaluator(py::module &m) {\n    py::class_<Evaluator, PyEvaluator, shared_ptr<Evaluator>>(m, \"Evaluator\")\n        .def(py::init<>())\n        .def_readwrite(\"dataloader\", &Evaluator::dataloader_)\n        .def(\"evaluate\", &Evaluator::evaluate, py::arg(\"validation\"));\n\n    py::class_<SynchronousEvaluator, Evaluator, shared_ptr<SynchronousEvaluator>>(m, \"SynchronousEvaluator\")\n        .def(py::init<shared_ptr<DataLoader>, shared_ptr<Model>>(), py::arg(\"dataloader\"), py::arg(\"model\"));\n\n    py::class_<PipelineEvaluator, Evaluator, shared_ptr<PipelineEvaluator>>(m, \"PipelineEvaluator\")\n        .def(py::init<shared_ptr<DataLoader>, shared_ptr<Model>, shared_ptr<PipelineConfig>>(), py::arg(\"dataloader\"), py::arg(\"model\"),\n             py::arg(\"pipeline_config\"));\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/pipeline/graph_encoder_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"pipeline/graph_encoder.h\"\n\nnamespace py = pybind11;\n\n// Trampoline class\nclass PyGraphEncoder : GraphEncoder {\n   public:\n    using GraphEncoder::GraphEncoder;\n    void encode(bool separate_layers) override { PYBIND11_OVERRIDE_PURE(void, GraphEncoder, encode, separate_layers); }\n};\n\nvoid init_graph_encoder(py::module &m) {\n    py::class_<GraphEncoder, PyGraphEncoder, std::shared_ptr<GraphEncoder>>(m, \"GraphEncoder\")\n        .def_readwrite(\"dataloader\", &GraphEncoder::dataloader_)\n        .def_readwrite(\"progress_reporter\", &GraphEncoder::progress_reporter_)\n        .def(\"encode\", &GraphEncoder::encode, py::arg(\"separate_layers\") = false);\n\n    py::class_<SynchronousGraphEncoder, GraphEncoder, std::shared_ptr<SynchronousGraphEncoder>>(m, \"SynchronousEncoder\")\n        .def(py::init<shared_ptr<DataLoader>, std::shared_ptr<Model>>(), py::arg(\"dataloader\"), py::arg(\"model\"));\n\n    py::class_<PipelineGraphEncoder, GraphEncoder, std::shared_ptr<PipelineGraphEncoder>>(m, \"PipelineEncoder\")\n        .def(py::init<shared_ptr<DataLoader>, std::shared_ptr<Model>, std::shared_ptr<PipelineConfig>>(), py::arg(\"dataloader\"), py::arg(\"model\"),\n             py::arg(\"pipeline_config\"));\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/pipeline/trainer_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"pipeline/trainer.h\"\n\nnamespace py = pybind11;\n\n// Trampoline class\nclass PyTrainer : Trainer {\n   public:\n    using Trainer::Trainer;\n    void train(int num_epochs = 1) override { PYBIND11_OVERRIDE_PURE(void, Trainer, train, num_epochs); }\n};\n\nvoid init_trainer(py::module &m) {\n    py::class_<Trainer, PyTrainer, shared_ptr<Trainer>>(m, \"Trainer\")\n        .def(py::init<>())\n        .def_readwrite(\"dataloader\", &Trainer::dataloader_)\n        .def_readwrite(\"progress_reporter\", &Trainer::progress_reporter_)\n        .def_readwrite(\"learning_task\", &Trainer::learning_task_)\n        .def(\"train\", &Trainer::train, py::arg(\"num_epochs\") = 1);\n\n    py::class_<SynchronousTrainer, Trainer, shared_ptr<SynchronousTrainer>>(m, \"SynchronousTrainer\")\n        .def(py::init<shared_ptr<DataLoader>, shared_ptr<Model>, int>(), py::arg(\"dataloader\"), py::arg(\"model\"), py::arg(\"logs_per_epoch\") = 10);\n\n    py::class_<PipelineTrainer, Trainer, shared_ptr<PipelineTrainer>>(m, \"PipelineTrainer\")\n        .def(py::init<shared_ptr<DataLoader>, shared_ptr<Model>, shared_ptr<PipelineConfig>, int>(), py::arg(\"dataloader\"), py::arg(\"model\"),\n             py::arg(\"pipeline_config\"), py::arg(\"logs_per_epoch\") = 10);\n}"
  },
  {
    "path": "src/cpp/python_bindings/pipeline/wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n\n// pipeline\nvoid init_evaluator(py::module &);\nvoid init_graph_encoder(py::module &);\nvoid init_trainer(py::module &);\n\nPYBIND11_MODULE(_pipeline, m) {\n    m.doc() = \"Training and Evaluation pipelines.\";\n\n    // pipeline\n    init_evaluator(m);\n    init_graph_encoder(m);\n    init_trainer(m);\n}"
  },
  {
    "path": "src/cpp/python_bindings/reporting/reporting_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"reporting/reporting.h\"\n\nclass PyReporter : Reporter {\n   public:\n    using Reporter::Reporter;\n    void report() override { PYBIND11_OVERRIDE_PURE_NAME(void, Reporter, \"report\", report); }\n};\n\nclass PyMetric : Metric {\n   public:\n    using Metric::Metric;\n};\n\nvoid init_reporting(py::module &m) {\n    py::class_<Metric, PyMetric, std::shared_ptr<Metric>>(m, \"Metric\").def_readwrite(\"name\", &Metric::name_).def_readwrite(\"unit\", &Metric::unit_);\n\n    py::class_<RankingMetric, Metric, std::shared_ptr<RankingMetric>>(m, \"RankingMetric\")\n        .def(\"compute_metric\", &RankingMetric::computeMetric, py::arg(\"ranks\"));\n    py::class_<HitskMetric, RankingMetric, std::shared_ptr<HitskMetric>>(m, \"Hitsk\")\n        .def(py::init<int>(), py::arg(\"k\"))\n        .def(\"compute_metric\", &HitskMetric::computeMetric, py::arg(\"ranks\"));\n    py::class_<MeanRankMetric, RankingMetric, std::shared_ptr<MeanRankMetric>>(m, \"MeanRank\")\n        .def(py::init<>())\n        .def(\"compute_metric\", &MeanRankMetric::computeMetric, py::arg(\"ranks\"));\n    py::class_<MeanReciprocalRankMetric, RankingMetric, std::shared_ptr<MeanReciprocalRankMetric>>(m, \"MeanReciprocalRank\")\n        .def(py::init<>())\n        .def(\"compute_metric\", &MeanReciprocalRankMetric::computeMetric, py::arg(\"ranks\"));\n\n    py::class_<ClassificationMetric, Metric, std::shared_ptr<ClassificationMetric>>(m, \"ClassificationMetric\")\n        .def(\"compute_metric\", &ClassificationMetric::computeMetric, py::arg(\"y_true\"), py::arg(\"y_pred\"));\n    py::class_<CategoricalAccuracyMetric, ClassificationMetric, std::shared_ptr<CategoricalAccuracyMetric>>(m, \"CategoricalAccuracy\")\n        .def(py::init<>())\n        .def(\"compute_metric\", &CategoricalAccuracyMetric::computeMetric, py::arg(\"y_true\"), py::arg(\"y_pred\"));\n\n    py::class_<Reporter, PyReporter, std::shared_ptr<Reporter>>(m, \"Reporter\")\n  
      .def_readwrite(\"metrics\", &Reporter::metrics_)\n        .def(py::init<>())\n        .def(\"add_metric\", &Reporter::addMetric, py::arg(\"metric\"))\n        .def(\"report\", &Reporter::report);\n\n    py::class_<LinkPredictionReporter, Reporter, std::shared_ptr<LinkPredictionReporter>>(m, \"LinkPredictionReporter\")\n        .def(py::init<>())\n        .def(\"clear\", &LinkPredictionReporter::clear)\n        .def(\"compute_ranks\", &LinkPredictionReporter::computeRanks, py::arg(\"pos_scores\"), py::arg(\"neg_scores\"))\n        .def(\"add_result\", &LinkPredictionReporter::addResult, py::arg(\"pos_scores\"), py::arg(\"neg_scores\"), py::arg(\"edges\") = torch::Tensor())\n        .def(\"save\", &LinkPredictionReporter::save, py::arg(\"directory\"), py::arg(\"scores\") = false, py::arg(\"ranks\") = false);\n\n    py::class_<NodeClassificationReporter, Reporter, std::shared_ptr<NodeClassificationReporter>>(m, \"NodeClassificationReporter\")\n        .def(py::init<>())\n        .def(\"clear\", &NodeClassificationReporter::clear)\n        .def(\"add_result\", &NodeClassificationReporter::addResult, py::arg(\"y_true\"), py::arg(\"y_pred\"), py::arg(\"node_ids\") = torch::Tensor())\n        .def(\"save\", &NodeClassificationReporter::save, py::arg(\"directory\"), py::arg(\"labels\") = false);\n\n    py::class_<ProgressReporter, Reporter, std::shared_ptr<ProgressReporter>>(m, \"ProgressReporter\")\n        .def(py::init<std::string, int64_t, int>(), py::arg(\"item_name\"), py::arg(\"total_items\"), py::arg(\"total_reports\"))\n        .def(\"clear\", &ProgressReporter::clear)\n        .def(\"add_result\", &ProgressReporter::addResult, py::arg(\"items_processed\"));\n}"
  },
  {
    "path": "src/cpp/python_bindings/reporting/wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n\n// reporting\nvoid init_reporting(py::module &);\n\nPYBIND11_MODULE(_report, m) {\n    m.doc() = \"Training and evaluation metrics.\";\n\n    // reporting\n    init_reporting(m);\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/storage/graph_storage_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"storage/graph_storage.h\"\n\nvoid init_graph_storage(py::module &m) {\n    py::class_<GraphModelStoragePtrs, std::shared_ptr<GraphModelStoragePtrs>>(m, \"GraphModelStoragePtrs\")\n        .def_readwrite(\"edges\", &GraphModelStoragePtrs::edges)\n        .def_readwrite(\"train_edges\", &GraphModelStoragePtrs::train_edges)\n        .def_readwrite(\"validation_edges\", &GraphModelStoragePtrs::validation_edges)\n        .def_readwrite(\"test_edges\", &GraphModelStoragePtrs::test_edges)\n        .def_readwrite(\"nodes\", &GraphModelStoragePtrs::nodes)\n        .def_readwrite(\"train_nodes\", &GraphModelStoragePtrs::train_nodes)\n        .def_readwrite(\"valid_nodes\", &GraphModelStoragePtrs::valid_nodes)\n        .def_readwrite(\"test_nodes\", &GraphModelStoragePtrs::test_nodes)\n        .def_readwrite(\"node_features\", &GraphModelStoragePtrs::node_features)\n        .def_readwrite(\"node_labels\", &GraphModelStoragePtrs::node_labels)\n        .def_readwrite(\"relation_features\", &GraphModelStoragePtrs::relation_features)\n        .def_readwrite(\"relation_labels\", &GraphModelStoragePtrs::relation_labels)\n        .def_readwrite(\"node_embeddings\", &GraphModelStoragePtrs::node_embeddings)\n        .def_readwrite(\"node_optimizer_state\", &GraphModelStoragePtrs::node_optimizer_state);\n\n    py::class_<InMemorySubgraphState, std::shared_ptr<InMemorySubgraphState>>(m, \"InMemorySubgraphState\")\n        .def_readwrite(\"all_in_memory_edges\", &InMemorySubgraphState::all_in_memory_edges_)\n        .def_readwrite(\"all_in_memory_mapped_edges\", &InMemorySubgraphState::all_in_memory_mapped_edges_)\n        .def_readwrite(\"in_memory_partition_ids\", &InMemorySubgraphState::in_memory_partition_ids_)\n        .def_readwrite(\"in_memory_edge_bucket_ids\", &InMemorySubgraphState::in_memory_edge_bucket_ids_)\n        .def_readwrite(\"in_memory_edge_bucket_starts\", 
&InMemorySubgraphState::in_memory_edge_bucket_starts_)\n        .def_readwrite(\"in_memory_edge_bucket_sizes\", &InMemorySubgraphState::in_memory_edge_bucket_sizes_)\n        .def_readwrite(\"global_to_local_index_map\", &InMemorySubgraphState::global_to_local_index_map_)\n        .def_readwrite(\"in_memory_subgraph\", &InMemorySubgraphState::in_memory_subgraph_);\n\n    py::class_<GraphModelStorage, std::shared_ptr<GraphModelStorage>>(m, \"GraphModelStorage\")\n        .def_readwrite(\"active_edges\", &GraphModelStorage::active_edges_)\n        .def_readwrite(\"active_nodes\", &GraphModelStorage::active_nodes_)\n        .def_readwrite(\"storage_ptrs\", &GraphModelStorage::storage_ptrs_)\n        .def_readwrite(\"full_graph_evaluation\", &GraphModelStorage::full_graph_evaluation_)\n        .def_readwrite(\"current_subgraph_state\", &GraphModelStorage::current_subgraph_state_)\n        .def_readwrite(\"next_subgraph_state\", &GraphModelStorage::next_subgraph_state_)\n\n        .def(py::init<GraphModelStoragePtrs, shared_ptr<StorageConfig>>(), py::arg(\"storage_ptrs\"), py::arg(\"storage_config\"))\n        .def(py::init([](shared_ptr<Storage> edges, shared_ptr<Storage> nodes, shared_ptr<Storage> node_features, shared_ptr<Storage> node_embeddings,\n                         shared_ptr<Storage> node_optimizer_state, shared_ptr<Storage> node_labels, std::vector<shared_ptr<Storage>> filter_edges, bool train,\n                         bool prefetch) {\n                 GraphModelStoragePtrs ptrs;\n                 ptrs.edges = edges;\n                 ptrs.nodes = nodes;\n                 ptrs.node_features = node_features;\n                 ptrs.node_embeddings = node_embeddings;\n                 ptrs.node_optimizer_state = node_optimizer_state;\n                 ptrs.node_labels = node_labels;\n                 ptrs.filter_edges = filter_edges;\n\n                 // initialize optimizer state if needed\n                 if (train && node_optimizer_state == nullptr && 
node_embeddings != nullptr) {\n                     string optimizer_state_filename = get_directory(node_embeddings->filename_);\n\n                     shared_ptr<FlatFile> init_optimizer_state_storage = std::make_shared<FlatFile>(optimizer_state_filename, node_embeddings->dtype_);\n\n                     int64_t curr_num_nodes = 0;\n                     int64_t offset = 0;\n                     int64_t num_nodes = node_embeddings->getDim0();\n\n                     while (offset < num_nodes) {\n                         if (num_nodes - offset < MAX_NODE_EMBEDDING_INIT_SIZE) {\n                             curr_num_nodes = num_nodes - offset;\n                         } else {\n                             curr_num_nodes = MAX_NODE_EMBEDDING_INIT_SIZE;\n                         }\n\n                         OptimizerState emb_state = torch::zeros({curr_num_nodes, node_embeddings->dim1_size_}, node_embeddings->dtype_);\n                         init_optimizer_state_storage->append(emb_state);\n\n                         offset += curr_num_nodes;\n                     }\n\n                     if (instance_of<Storage, InMemory>(node_embeddings)) {\n                         ptrs.node_optimizer_state = std::make_shared<InMemory>(optimizer_state_filename, node_embeddings->dtype_);\n                     } else if (instance_of<Storage, PartitionBufferStorage>(node_embeddings)) {\n                         ptrs.node_optimizer_state = std::make_shared<PartitionBufferStorage>(\n                             optimizer_state_filename, std::dynamic_pointer_cast<PartitionBufferStorage>(node_embeddings)->options_);\n                     } else {\n                         throw MariusRuntimeException(\"Unsupported storage backend for embeddings\");\n                     }\n                 }\n\n                 return std::make_shared<GraphModelStorage>(ptrs, prefetch);\n             }),\n             py::arg(\"edges\"), py::arg(\"nodes\") = shared_ptr<Storage>(nullptr), 
py::arg(\"node_features\") = shared_ptr<Storage>(nullptr),\n             py::arg(\"node_embeddings\") = shared_ptr<Storage>(nullptr), py::arg(\"node_optim_state\") = shared_ptr<Storage>(nullptr),\n             py::arg(\"node_labels\") = shared_ptr<Storage>(nullptr), py::arg(\"filter_edges\") = std::vector<shared_ptr<Storage>>(), py::arg(\"train\") = false,\n             py::arg(\"prefetch\") = false)\n\n        .def(\"load\", &GraphModelStorage::load)\n        .def(\"unload\", &GraphModelStorage::unload, py::arg(\"write\"))\n        .def(\"init_subgraph\", &GraphModelStorage::initializeInMemorySubGraph, py::arg(\"buffer_state\"), py::arg(\"num_hash_maps\") = 1)\n        .def(\"update_subgraph\", &GraphModelStorage::updateInMemorySubGraph)\n        .def(\"sort_all_edges\", &GraphModelStorage::sortAllEdges)\n        .def(\"set_edge_storage\", &GraphModelStorage::setEdgesStorage, py::arg(\"edge_storage\"))\n        .def(\"set_node_storage\", &GraphModelStorage::setNodesStorage, py::arg(\"node_storage\"))\n        .def(\"get_edges\", &GraphModelStorage::getEdges, py::arg(\"indices\"))\n        .def(\"get_edges_range\", &GraphModelStorage::getEdgesRange, py::arg(\"start\"), py::arg(\"size\"))\n        .def(\"getRandomNodeIds\", &GraphModelStorage::getRandomNodeIds, py::arg(\"size\"))\n        .def(\"getNodeIdsRange\", &GraphModelStorage::getNodeIdsRange, py::arg(\"start\"), py::arg(\"size\"))\n        .def(\"shuffleEdges\", &GraphModelStorage::shuffleEdges)\n        .def(\"getNodeEmbeddings\", &GraphModelStorage::getNodeEmbeddings, py::arg(\"indices\"))\n        .def(\"getNodeEmbeddingsRange\", &GraphModelStorage::getNodeEmbeddingsRange, py::arg(\"start\"), py::arg(\"size\"))\n        .def(\"getNodeFeatures\", &GraphModelStorage::getNodeFeatures, py::arg(\"indices\"))\n        .def(\"getNodeFeaturesRange\", &GraphModelStorage::getNodeFeaturesRange, py::arg(\"start\"), py::arg(\"size\"))\n        .def(\"getNodeLabels\", &GraphModelStorage::getNodeLabels, 
py::arg(\"indices\"))\n        .def(\"getNodeLabelsRange\", &GraphModelStorage::getNodeLabelsRange, py::arg(\"start\"), py::arg(\"size\"))\n        .def(\"updatePutNodeEmbeddings\", &GraphModelStorage::updatePutNodeEmbeddings, py::arg(\"indices\"), py::arg(\"embeddings\"))\n        .def(\"updateAddNodeEmbeddings\", &GraphModelStorage::updateAddNodeEmbeddings, py::arg(\"indices\"), py::arg(\"values\"))\n        .def(\"getNodeEmbeddingState\", &GraphModelStorage::getNodeEmbeddingState, py::arg(\"indices\"))\n        .def(\"getNodeEmbeddingStateRange\", &GraphModelStorage::getNodeEmbeddingStateRange, py::arg(\"start\"), py::arg(\"size\"))\n        .def(\"updatePutNodeEmbeddingState\", &GraphModelStorage::updatePutNodeEmbeddingState, py::arg(\"indices\"), py::arg(\"state\"))\n        .def(\"updateAddNodeEmbeddingState\", &GraphModelStorage::updateAddNodeEmbeddingState, py::arg(\"indices\"), py::arg(\"values\"))\n        .def(\"embeddingsOffDevice\", &GraphModelStorage::embeddingsOffDevice)\n        .def(\"getNumPartitions\", &GraphModelStorage::getNumPartitions)\n        .def(\"useInMemorySubGraph\", &GraphModelStorage::useInMemorySubGraph)\n        .def(\"hasSwap\", &GraphModelStorage::hasSwap)\n        .def(\"performSwap\", &GraphModelStorage::performSwap)\n        .def(\"setBufferOrdering\", &GraphModelStorage::setBufferOrdering, py::arg(\"buffer_states\"))\n        .def(\"setActiveEdges\", &GraphModelStorage::setActiveEdges, py::arg(\"active_edges\"))\n        .def(\"setActiveNodes\", &GraphModelStorage::setActiveNodes, py::arg(\"node_ids\"))\n        .def(\"getNumActiveEdges\", &GraphModelStorage::getNumActiveEdges)\n        .def(\"getNumActiveNodes\", &GraphModelStorage::getNumActiveNodes)\n        .def(\"getNumEdges\", &GraphModelStorage::getNumEdges)\n        .def(\"getNumNodes\", &GraphModelStorage::getNumNodes)\n        .def(\"getNumNodesInMemory\", &GraphModelStorage::getNumNodesInMemory)\n        .def(\"setTrainSet\", &GraphModelStorage::setTrainSet)\n      
  .def(\"setValidationSet\", &GraphModelStorage::setValidationSet)\n        .def(\"setTestSet\", &GraphModelStorage::setTestSet)\n        .def(\"setFilterEdges\", &GraphModelStorage::setFilterEdges)\n        .def(\"addFilterEdges\", &GraphModelStorage::addFilterEdges);\n}"
  },
  {
    "path": "src/cpp/python_bindings/storage/io_wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n#include \"configuration/config.h\"\n#include \"configuration/util.h\"\n#include \"data/dataloader.h\"\n#include \"storage/io.h\"\n\nvoid init_io(py::module &m) {\n    m.def(\n        \"load_model\",\n        [](string filename, bool train) {\n            shared_ptr<MariusConfig> marius_config = loadConfig(filename, train);\n\n            std::vector<torch::Device> devices = devices_from_config(marius_config->storage);\n\n            shared_ptr<Model> model = initModelFromConfig(marius_config->model, devices, marius_config->storage->dataset->num_relations, train);\n            model->load(marius_config->storage->model_dir, train);\n\n            return model;\n        },\n        py::arg(\"filename\"), py::arg(\"train\"));\n\n    m.def(\n        \"load_storage\",\n        [](string filename, bool train) {\n            shared_ptr<MariusConfig> marius_config = loadConfig(filename, train);\n\n            std::vector<torch::Device> devices = devices_from_config(marius_config->storage);\n\n            shared_ptr<Model> model = initModelFromConfig(marius_config->model, devices, marius_config->storage->dataset->num_relations, train);\n\n            shared_ptr<GraphModelStorage> graph_model_storage = initializeStorage(model, marius_config->storage, false, train);\n\n            return graph_model_storage;\n        },\n        py::arg(\"filename\"), py::arg(\"train\"));\n\n    m.def(\n        \"init_from_config\",\n        [](string filename, bool train, bool load_storage) {\n            shared_ptr<MariusConfig> marius_config = loadConfig(filename, train);\n\n            std::vector<torch::Device> devices = devices_from_config(marius_config->storage);\n\n            shared_ptr<Model> model = initModelFromConfig(marius_config->model, devices, marius_config->storage->dataset->num_relations, train);\n\n            shared_ptr<GraphModelStorage> graph_model_storage = initializeStorage(model, marius_config->storage, true, 
train);\n\n            shared_ptr<DataLoader> dataloader = std::make_shared<DataLoader>(graph_model_storage, model->learning_task_, marius_config->training,\n                                                                             marius_config->evaluation, marius_config->model->encoder);\n\n            if (train) {\n                dataloader->setTrainSet();\n            } else {\n                dataloader->setTestSet();\n            }\n\n            dataloader->loadStorage();\n\n            return std::make_tuple(model, dataloader);\n        },\n        py::arg(\"filename\"), py::arg(\"train\"), py::arg(\"load_storage\") = true);\n}\n"
  },
  {
    "path": "src/cpp/python_bindings/storage/storage_wrap.cpp",
    "content": "#include <sys/stat.h>\n\n#include \"common/pybind_headers.h\"\n#include \"storage/storage.h\"\n\n// Trampoline class\nclass PyStorage : Storage {\n   public:\n    using Storage::Storage;\n\n    torch::Tensor indexRead(torch::Tensor indices) override { PYBIND11_OVERRIDE_PURE(torch::Tensor, Storage, indexRead, indices); }\n\n    void indexAdd(torch::Tensor indices, torch::Tensor values) override { PYBIND11_OVERRIDE_PURE(void, Storage, indexAdd, indices, values); }\n\n    torch::Tensor range(int64_t offset, int64_t n) override { PYBIND11_OVERRIDE_PURE(torch::Tensor, Storage, range, offset, n); }\n\n    void indexPut(torch::Tensor indices, torch::Tensor values) override { PYBIND11_OVERRIDE_PURE(void, Storage, indexPut, indices, values); }\n\n    void rangePut(int64_t offset, int64_t n, torch::Tensor values) override { PYBIND11_OVERRIDE_PURE(void, Storage, rangePut, offset, n, values); }\n\n    void load() override { PYBIND11_OVERRIDE_PURE(void, Storage, load); }\n\n    void write() override { PYBIND11_OVERRIDE_PURE(void, Storage, write); }\n\n    void unload(bool write) override { PYBIND11_OVERRIDE_PURE(void, Storage, unload, write); }\n\n    void shuffle() override { PYBIND11_OVERRIDE_PURE(void, Storage, shuffle); }\n\n    void sort(bool src) override { PYBIND11_OVERRIDE_PURE(void, Storage, sort, src); }\n};\n\nvoid init_storage(py::module &m) {\n    py::class_<Storage, PyStorage, std::shared_ptr<Storage>>(m, \"Storage\")\n        .def_readwrite(\"dim0_size\", &Storage::dim0_size_)\n        .def_readwrite(\"dim1_size\", &Storage::dim1_size_)\n        .def_readwrite(\"dtype\", &Storage::dtype_)\n        .def_readwrite(\"initialized\", &Storage::initialized_)\n        .def_readwrite(\"edge_bucket_sizes\", &Storage::edge_bucket_sizes_)\n        .def_readwrite(\"data\", &Storage::data_)\n        .def_readwrite(\"device\", &Storage::device_)\n        .def_readwrite(\"filename\", &Storage::filename_)\n        .def(\"indexRead\", &Storage::indexRead, 
py::arg(\"indices\"))\n        .def(\"indexAdd\", &Storage::indexAdd, py::arg(\"indices\"), py::arg(\"values\"))\n        .def(\"range\", &Storage::range, py::arg(\"offset\"), py::arg(\"n\"))\n        .def(\"indexPut\", &Storage::indexPut, py::arg(\"indices\"), py::arg(\"values\"))\n        .def(\"rangePut\", &Storage::rangePut, py::arg(\"offset\"), py::arg(\"n\"), py::arg(\"values\"))\n        .def(\"load\", &Storage::load)\n        .def(\"write\", &Storage::write)\n        .def(\"unload\", &Storage::unload, py::arg(\"write\"))\n        .def(\"shuffle\", &Storage::shuffle)\n        .def(\"sort\", &Storage::sort, py::arg(\"src\"))\n        .def(\"read_edge_bucket_sizes\", &Storage::readPartitionSizes, py::arg(\"filename\"));\n\n    py::class_<PartitionBufferStorage, Storage, std::shared_ptr<PartitionBufferStorage>>(m, \"PartitionBufferStorage\")\n        .def_readwrite(\"filename\", &PartitionBufferStorage::filename_)\n        .def_readwrite(\"loaded\", &PartitionBufferStorage::loaded_)\n        .def_readwrite(\"options\", &PartitionBufferStorage::options_)\n        .def(py::init<string, int64_t, int64_t, shared_ptr<PartitionBufferOptions>>(), py::arg(\"filename\"), py::arg(\"dim0_size\"), py::arg(\"dim1_size\"),\n             py::arg(\"options\"))\n        .def(py::init<string, torch::Tensor, shared_ptr<PartitionBufferOptions>>(), py::arg(\"filename\"), py::arg(\"data\"), py::arg(\"options\"))\n        .def(py::init<string, shared_ptr<PartitionBufferOptions>>(), py::arg(\"filename\"), py::arg(\"options\"))\n        .def(\"hasSwap\", &PartitionBufferStorage::hasSwap)\n        .def(\"performNextSwap\", &PartitionBufferStorage::performNextSwap)\n        .def(\"getGlobalToLocalMap\", &PartitionBufferStorage::getGlobalToLocalMap, py::arg(\"get_current\") = true)\n        .def(\"sync\", &PartitionBufferStorage::sync)\n        .def(\"setBufferOrdering\", &PartitionBufferStorage::setBufferOrdering, py::arg(\"buffer_states\"))\n        .def(\"getNextAdmit\", 
&PartitionBufferStorage::getNextAdmit)\n        .def(\"getNextEvict\", &PartitionBufferStorage::getNextEvict)\n        .def(\"getNumInMemory\", &PartitionBufferStorage::getNumInMemory);\n\n    py::class_<FlatFile, Storage, std::shared_ptr<FlatFile>>(m, \"FlatFile\")\n        .def(py::init([](std::string filename, std::vector<int64_t> shape, py::object py_dtype, bool alloc) {\n                 int64_t dim0_size;\n                 int64_t dim1_size;\n\n                 if (shape.size() > 2 || shape.empty()) {\n                     throw MariusRuntimeException(\"Tensor shape must be 1 or 2 dimensional.\");\n                 } else if (shape.size() == 2) {\n                     dim0_size = shape[0];\n                     dim1_size = shape[1];\n                 } else {\n                     dim0_size = shape[0];\n                     dim1_size = 1;\n                 }\n\n                 torch::Dtype dtype = torch::python::detail::py_object_to_dtype(py_dtype);\n\n                 return std::make_shared<FlatFile>(filename, dim0_size, dim1_size, dtype, alloc);\n             }),\n             py::arg(\"filename\"), py::arg(\"shape\"), py::arg(\"dtype\"), py::arg(\"alloc\") = false)\n\n        .def(py::init<string, torch::Tensor>(), py::arg(\"filename\"), py::arg(\"data\"))\n\n        .def(py::init([](std::string filename, py::object py_dtype) {\n                 torch::Dtype dtype = torch::python::detail::py_object_to_dtype(py_dtype);\n\n                 return std::make_shared<FlatFile>(filename, dtype);\n             }),\n             py::arg(\"filename\"), py::arg(\"dtype\"))\n\n        .def(\"append\", &FlatFile::append, py::arg(\"values\"))\n        .def(\"move\", &FlatFile::move, py::arg(\"new_filename\"))\n        .def(\"copy\", &FlatFile::copy, py::arg(\"new_filename\"), py::arg(\"rename\"))\n        .def(\"mem_load\", &FlatFile::mem_load)\n        .def(\"mem_unload\", &FlatFile::mem_unload, py::arg(\"write\"));\n\n    py::class_<InMemory, Storage, 
std::shared_ptr<InMemory>>(m, \"InMemory\")\n        .def(py::init([](std::string filename, std::vector<int64_t> shape, py::object py_dtype, torch::Device device) {\n                 int64_t dim0_size;\n                 int64_t dim1_size;\n\n                 if (shape.size() > 2 || shape.empty()) {\n                     throw MariusRuntimeException(\"Tensor shape must be 1 or 2 dimensional.\");\n                 } else if (shape.size() == 2) {\n                     dim0_size = shape[0];\n                     dim1_size = shape[1];\n                 } else {\n                     dim0_size = shape[0];\n                     dim1_size = 1;\n                 }\n\n                 torch::Dtype dtype = torch::python::detail::py_object_to_dtype(py_dtype);\n\n                 return std::make_shared<InMemory>(filename, dim0_size, dim1_size, dtype, device);\n             }),\n             py::arg(\"filename\"), py::arg(\"shape\"), py::arg(\"dtype\"), py::arg(\"device\"))\n\n        .def(py::init<string, torch::Tensor, torch::Device>(), py::arg(\"filename\"), py::arg(\"data\"), py::arg(\"device\"))\n\n        .def(py::init([](std::string filename, py::object py_dtype) {\n                 torch::Dtype dtype = torch::python::detail::py_object_to_dtype(py_dtype);\n\n                 return std::make_shared<InMemory>(filename, dtype);\n             }),\n             py::arg(\"filename\"), py::arg(\"dtype\"))\n\n        .def(py::init<torch::Tensor>(), py::arg(\"data\"));\n\n    m.def(\n        \"tensor_from_file\",\n        [](py::object py_filename, std::vector<int64_t> shape, py::object py_dtype, py::object py_device) {\n            std::string filename = py::str(((py::object)py_filename.attr(\"__str__\"))());\n\n            torch::Dtype dtype = torch::python::detail::py_object_to_dtype(py_dtype);\n            torch::Device device = torch::python::detail::py_object_to_device(py_device);\n            int dtype_size = get_dtype_size_wrapper(dtype);\n\n            struct stat 
stat_buf;\n            int rc = stat(filename.c_str(), &stat_buf);\n            int64_t file_size = rc == 0 ? stat_buf.st_size : -1;\n\n            if (file_size == -1) {\n                throw MariusRuntimeException(\"Cannot get size of file: \" + filename);\n            }\n\n            int64_t dim0_size = file_size / dtype_size;\n            int64_t dim1_size = 1;\n\n            auto storage = std::make_shared<InMemory>(filename, dim0_size, dim1_size, dtype, device);\n            storage->load();\n            return storage->data_.clone().reshape(shape);\n        },\n        py::arg(\"filename\"), py::arg(\"shape\"), py::arg(\"dtype\"), py::arg(\"device\"));\n}"
  },
  {
    "path": "src/cpp/python_bindings/storage/wrap.cpp",
    "content": "#include \"common/pybind_headers.h\"\n\n// storage\nvoid init_graph_storage(py::module &);\nvoid init_io(py::module &);\nvoid init_storage(py::module &);\n\nPYBIND11_MODULE(_storage, m) {\n    m.doc() = \"Storage objects for arbitrary backends.\";\n\n    // storage\n    init_storage(m);\n    init_graph_storage(m);\n    init_io(m);\n}\n"
  },
  {
    "path": "src/cpp/src/common/util.cpp",
    "content": "//\n// Created by Jason Mohoney on 7/30/20.\n//\n\n#include \"common/util.h\"\n\n#include <unistd.h>\n\n#include <fstream>\n#include <iostream>\n\n#include \"reporting/logger.h\"\n\nvoid assert_no_nans(torch::Tensor values) {\n    if (torch::isnan(values).any().item<bool>()) {\n        throw MariusRuntimeException(\"Tensor contains Nans\");\n    }\n}\n\nvoid assert_no_neg(torch::Tensor values) {\n    if ((values.le(-1)).any().item<bool>()) {\n        throw MariusRuntimeException(\"Tensor contains negative values\");\n    }\n}\n\nvoid assert_in_range(torch::Tensor values, int64_t start, int64_t end) {\n    if ((values.ge(start) & values.le(end)).any().item<bool>()) {\n        throw MariusRuntimeException(\"Tensor contains is not in range: \" + std::to_string(start) + \"-\" + std::to_string(end));\n    }\n}\n\nvoid process_mem_usage() {\n    double vm_usage = 0.0;\n    double resident_set = 0.0;\n\n    // the two fields we want\n    unsigned long vsize;\n    long rss;\n    {\n        std::string ignore;\n        std::ifstream ifs(\"/proc/self/stat\", std::ios_base::in);\n        ifs >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >>\n            ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> vsize >> rss;\n    }\n\n    long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024;  // in case x86-64 is configured to use 2MB pages\n    vm_usage = vsize / 1024.0;\n    resident_set = rss * page_size_kb;\n\n    SPDLOG_DEBUG(\"VM Usage: {}GB. 
RSS: {}GB\", vm_usage / pow(2, 20), resident_set / pow(2, 20));\n}\n\nvoid *memset_wrapper(void *ptr, int value, int64_t num) {\n    int64_t curr_bytes = 0;\n    int64_t local_offset = 0;\n\n    while (local_offset < num) {\n        curr_bytes = num - local_offset;\n        if (curr_bytes > 1e9) {\n            curr_bytes = 1e9;\n        }\n\n        memset((char *)ptr + local_offset, value, curr_bytes);\n\n        local_offset += curr_bytes;\n    }\n\n    return ptr;\n}\n\nvoid *memcpy_wrapper(void *dest, const void *src, int64_t count) {\n    int64_t curr_bytes = 0;\n    int64_t local_offset = 0;\n\n    while (local_offset < count) {\n        curr_bytes = count - local_offset;\n        if (curr_bytes > 1e9) {\n            curr_bytes = 1e9;\n        }\n\n        memcpy((char *)dest + local_offset, (char *)src + local_offset, curr_bytes);\n\n        local_offset += curr_bytes;\n    }\n\n    return dest;\n}\n\nint64_t pread_wrapper(int fd, void *buf, int64_t count, int64_t offset) {\n    int64_t curr_bytes = 0;\n    int64_t local_offset = 0;\n\n    while (local_offset < count) {\n        curr_bytes = count - local_offset;\n        if (curr_bytes > 1e9) {\n            curr_bytes = 1e9;\n        }\n\n        if (pread(fd, (char *)buf + local_offset, curr_bytes, offset + local_offset) == -1) {\n            return -1;\n        }\n\n        local_offset += curr_bytes;\n    }\n\n    return count;\n}\n\nint64_t pwrite_wrapper(int fd, const void *buf, int64_t count, int64_t offset) {\n    int64_t curr_bytes = 0;\n    int64_t local_offset = 0;\n\n    while (local_offset < count) {\n        curr_bytes = count - local_offset;\n        if (curr_bytes > 1e9) {\n            curr_bytes = 1e9;\n        }\n\n        if (pwrite(fd, (char *)buf + local_offset, curr_bytes, offset + local_offset) == -1) {\n            return -1;\n        }\n\n        local_offset += curr_bytes;\n    }\n\n    return count;\n}\n\ntorch::Tensor transfer_tensor(torch::Tensor input, torch::Device device, 
CudaStream *compute_stream, CudaStream *transfer_stream) {\n    if (input.defined()) {\n        if (device.is_cuda() && input.device().is_cpu()) {\n            input = input.pin_memory();\n        }\n        input = input.to(device, false);\n\n#ifdef MARIUS_CUDA\n        if (device.is_cuda() || input.device().is_cuda()) {\n            if (compute_stream != nullptr) input.record_stream(*compute_stream);\n            if (transfer_stream != nullptr) input.record_stream(*transfer_stream);\n        }\n#endif\n    }\n\n    return input;\n}\n\nint64_t get_dtype_size_wrapper(torch::Dtype dtype_) {\n    if (dtype_ == torch::kFloat64) {\n        return 8;\n    }\n    if (dtype_ == torch::kFloat32) {\n        return 4;\n    }\n    if (dtype_ == torch::kFloat16) {\n        return 2;\n    }\n    if (dtype_ == torch::kInt64) {\n        return 8;\n    }\n    if (dtype_ == torch::kInt32) {\n        return 4;\n    }\n\n    SPDLOG_ERROR(\"Unable to determine dtype_size_ for given dtype_ {}\", dtype_);\n    throw std::runtime_error(\"\");\n}\n\nstd::string get_directory(std::string filename) {\n    assert(!filename.empty());\n\n    string directory;\n    const size_t last_slash_idx = filename.rfind('/');\n    if (std::string::npos != last_slash_idx) {\n        directory = filename.substr(0, last_slash_idx);\n    }\n\n    return directory;\n}\n\nstd::tuple<torch::Tensor, std::vector<torch::Tensor>> map_tensors(std::vector<torch::Tensor> unmapped_tensors) {\n    for (auto tensor : unmapped_tensors) {\n        if (tensor.sizes().size() > 1) {\n            throw MariusRuntimeException(\"Input tensors must be 1D\");\n        }\n    }\n\n    torch::Tensor all_ids = torch::cat(unmapped_tensors);\n\n    auto unique_tup = torch::_unique2(all_ids, true, true, false);\n\n    torch::Tensor map = std::get<0>(unique_tup);\n    torch::Tensor mapped_all_ids = std::get<1>(unique_tup);\n\n    std::vector<torch::Tensor> mapped_tensors;\n\n    int64_t offset = 0;\n    int64_t size;\n    for (auto tensor 
: unmapped_tensors) {\n        size = tensor.size(0);\n        mapped_tensors.emplace_back(mapped_all_ids.narrow(0, offset, size));\n        offset += size;\n    }\n\n    return std::forward_as_tuple(map, mapped_tensors);\n}\n\n// TODO this function uses a searchsorted to find the approriate value in the map tensor\n// this can be made faster on the cpu by using an std::map to perform lookups\nstd::vector<torch::Tensor> apply_tensor_map(torch::Tensor map, std::vector<torch::Tensor> unmapped_tensors) {\n    for (auto tensor : unmapped_tensors) {\n        if (tensor.sizes().size() > 1) {\n            throw MariusRuntimeException(\"Input tensors must be 1D\");\n        }\n    }\n\n    std::vector<torch::Tensor> mapped_tensors;\n\n    for (auto tensor : unmapped_tensors) {\n        mapped_tensors.emplace_back(torch::searchsorted(map, tensor));\n    }\n\n    return mapped_tensors;\n}"
  },
  {
    "path": "src/cpp/src/configuration/config.cpp",
    "content": "//\n// Created by Jason Mohoney on 10/8/21.\n//\n\n#include \"configuration/config.h\"\n\n#include <common/pybind_headers.h>\n#include <stdlib.h>\n\nbool check_missing(pyobj python_object) {\n    bool missing = false;\n    try {\n        string string_val = pybind11::cast<string>(python_object);\n\n        if (string_val == MISSING_STR) {\n            missing = true;\n        }\n    } catch (pybind11::cast_error) {\n    }\n\n    return missing;\n}\n\ntemplate <typename T>\nT cast_helper(pyobj python_object) {\n    bool missing = check_missing(python_object);\n\n    if (missing) {\n        T default_value;\n        return default_value;\n    } else {\n        return pybind11::cast<T>(python_object);\n    }\n}\n\nshared_ptr<NeighborSamplingConfig> initNeighborSamplingConfig(pyobj python_object) {\n    shared_ptr<NeighborSamplingConfig> ret_config = std::make_shared<NeighborSamplingConfig>();\n\n    ret_config->type = getNeighborSamplingLayer(cast_helper<string>(python_object.attr(\"type\")));\n\n    pyobj py_options = python_object.attr(\"options\");\n\n    if (ret_config->type == NeighborSamplingLayer::UNIFORM) {\n        auto uniform_options = std::make_shared<UniformSamplingOptions>();\n        uniform_options->max_neighbors = cast_helper<int>(py_options.attr(\"max_neighbors\"));\n        ret_config->options = uniform_options;\n    } else if (ret_config->type == NeighborSamplingLayer::DROPOUT) {\n        auto dropout_options = std::make_shared<DropoutSamplingOptions>();\n        dropout_options->rate = cast_helper<float>(py_options.attr(\"rate\"));\n        ret_config->options = dropout_options;\n    } else {\n        auto options = std::make_shared<NeighborSamplingOptions>();\n        ret_config->options = options;\n    }\n\n    ret_config->use_hashmap_sets = cast_helper<bool>(python_object.attr(\"use_hashmap_sets\"));\n\n    return ret_config;\n}\n\nshared_ptr<InitConfig> initInitConfig(pyobj python_object) {\n    shared_ptr<InitConfig> 
ret_config = std::make_shared<InitConfig>();\n\n    ret_config->type = getInitDistribution(cast_helper<string>(python_object.attr(\"type\")));\n\n    pyobj py_options = python_object.attr(\"options\");\n\n    if (ret_config->type == InitDistribution::CONSTANT) {\n        auto constant_options = std::make_shared<ConstantInitOptions>();\n        constant_options->constant = cast_helper<float>(py_options.attr(\"constant\"));\n        ret_config->options = constant_options;\n    } else if (ret_config->type == InitDistribution::UNIFORM) {\n        auto uniform_options = std::make_shared<UniformInitOptions>();\n        uniform_options->scale_factor = cast_helper<float>(py_options.attr(\"scale_factor\"));\n        ret_config->options = uniform_options;\n    } else if (ret_config->type == InitDistribution::NORMAL) {\n        auto normal_options = std::make_shared<NormalInitOptions>();\n        normal_options->mean = cast_helper<float>(py_options.attr(\"mean\"));\n        normal_options->std = cast_helper<float>(py_options.attr(\"std\"));\n        ret_config->options = normal_options;\n    } else {\n        auto options = std::make_shared<InitOptions>();\n        ret_config->options = options;\n    }\n\n    return ret_config;\n}\n\nshared_ptr<OptimizerConfig> initOptimizerConfig(pyobj python_config) {\n    shared_ptr<OptimizerConfig> ret_config = std::make_shared<OptimizerConfig>();\n\n    if (check_missing(python_config)) {\n        return nullptr;\n    }\n\n    ret_config->type = getOptimizerType(cast_helper<string>(python_config.attr(\"type\")));\n\n    if (ret_config->type == OptimizerType::DEFAULT) {\n        return nullptr;\n    }\n\n    pyobj py_options = python_config.attr(\"options\");\n\n    if (ret_config->type == OptimizerType::ADAGRAD) {\n        auto adagrad_options = std::make_shared<AdagradOptions>();\n        adagrad_options->weight_decay = cast_helper<float>(py_options.attr(\"weight_decay\"));\n        adagrad_options->lr_decay = 
cast_helper<float>(py_options.attr(\"lr_decay\"));\n        adagrad_options->init_value = cast_helper<float>(py_options.attr(\"init_value\"));\n        adagrad_options->eps = cast_helper<float>(py_options.attr(\"eps\"));\n        ret_config->options = adagrad_options;\n    } else if (ret_config->type == OptimizerType::ADAM) {\n        auto adam_options = std::make_shared<AdamOptions>();\n        adam_options->weight_decay = cast_helper<float>(py_options.attr(\"weight_decay\"));\n        adam_options->amsgrad = cast_helper<bool>(py_options.attr(\"amsgrad\"));\n        adam_options->beta_1 = cast_helper<float>(py_options.attr(\"beta_1\"));\n        adam_options->beta_2 = cast_helper<float>(py_options.attr(\"beta_2\"));\n        adam_options->eps = cast_helper<float>(py_options.attr(\"eps\"));\n        ret_config->options = adam_options;\n    } else {\n        auto options = std::make_shared<OptimizerOptions>();\n        ret_config->options = options;\n    }\n\n    ret_config->options->learning_rate = cast_helper<float>(py_options.attr(\"learning_rate\"));\n\n    return ret_config;\n}\n\nshared_ptr<DatasetConfig> initDatasetConfig(pyobj python_config) {\n    shared_ptr<DatasetConfig> ret_config = std::make_shared<DatasetConfig>();\n\n    ret_config->dataset_dir = cast_helper<string>(python_config.attr(\"dataset_dir\"));\n    ret_config->num_train = cast_helper<int64_t>(python_config.attr(\"num_train\"));\n    ret_config->num_valid = cast_helper<int64_t>(python_config.attr(\"num_valid\"));\n    ret_config->num_test = cast_helper<int64_t>(python_config.attr(\"num_test\"));\n    ret_config->num_edges = cast_helper<int64_t>(python_config.attr(\"num_edges\"));\n    ret_config->num_nodes = cast_helper<int64_t>(python_config.attr(\"num_nodes\"));\n    ret_config->num_relations = cast_helper<int64_t>(python_config.attr(\"num_relations\"));\n    ret_config->node_feature_dim = cast_helper<int>(python_config.attr(\"node_feature_dim\"));\n    ret_config->rel_feature_dim = 
cast_helper<int>(python_config.attr(\"rel_feature_dim\"));\n    ret_config->num_classes = cast_helper<int>(python_config.attr(\"num_classes\"));\n\n    return ret_config;\n}\n\nshared_ptr<LayerConfig> initLayerConfig(pyobj python_config) {\n    if (check_missing(python_config)) {\n        return nullptr;\n    }\n\n    shared_ptr<LayerConfig> ret_config = std::make_shared<LayerConfig>();\n\n    ret_config->type = getLayerType(cast_helper<string>(python_config.attr(\"type\")));\n\n    if (ret_config->type == LayerType::EMBEDDING) {\n        ret_config->options = nullptr;\n        ret_config->input_dim = -1;\n        ret_config->output_dim = cast_helper<int>(python_config.attr(\"output_dim\"));\n        ret_config->init = initInitConfig(python_config.attr(\"init\"));\n        ret_config->optimizer = initOptimizerConfig(python_config.attr(\"optimizer\"));\n        ret_config->bias = cast_helper<bool>(python_config.attr(\"bias\"));\n        ret_config->bias_init = initInitConfig(python_config.attr(\"bias_init\"));\n        ret_config->activation = getActivationFunction(cast_helper<string>(python_config.attr(\"activation\")));\n    } else if (ret_config->type == LayerType::FEATURE) {\n        ret_config->options = nullptr;\n        ret_config->input_dim = -1;\n        ret_config->output_dim = cast_helper<int>(python_config.attr(\"output_dim\"));\n        ret_config->init = nullptr;\n        ret_config->optimizer = nullptr;\n        ret_config->bias = cast_helper<bool>(python_config.attr(\"bias\"));\n        ret_config->bias_init = initInitConfig(python_config.attr(\"bias_init\"));\n        ret_config->activation = getActivationFunction(cast_helper<string>(python_config.attr(\"activation\")));\n    } else if (ret_config->type == LayerType::GNN) {\n        pyobj py_options = python_config.attr(\"options\");\n        auto options = std::make_shared<GNNLayerOptions>();\n        options->type = getGNNLayerType(cast_helper<string>(py_options.attr(\"type\")));\n        
ret_config->input_dim = cast_helper<int>(python_config.attr(\"input_dim\"));\n        ret_config->output_dim = cast_helper<int>(python_config.attr(\"output_dim\"));\n        ret_config->init = initInitConfig(python_config.attr(\"init\"));\n        ret_config->optimizer = initOptimizerConfig(python_config.attr(\"optimizer\"));\n        ret_config->bias = cast_helper<bool>(python_config.attr(\"bias\"));\n        ret_config->bias_init = initInitConfig(python_config.attr(\"bias_init\"));\n        ret_config->activation = getActivationFunction(cast_helper<string>(python_config.attr(\"activation\")));\n\n        if (options->type == GNNLayerType::GRAPH_SAGE) {\n            auto graph_sage_options = std::make_shared<GraphSageLayerOptions>();\n            graph_sage_options->type = GNNLayerType::GRAPH_SAGE;\n            graph_sage_options->aggregator = getGraphSageAggregator(cast_helper<string>(py_options.attr(\"aggregator\")));\n            ret_config->options = graph_sage_options;\n        } else if (options->type == GNNLayerType::GAT) {\n            auto gat_options = std::make_shared<GATLayerOptions>();\n            gat_options->type = GNNLayerType::GAT;\n            gat_options->num_heads = cast_helper<int>(py_options.attr(\"num_heads\"));\n            gat_options->negative_slope = cast_helper<float>(py_options.attr(\"negative_slope\"));\n            gat_options->average_heads = cast_helper<bool>(py_options.attr(\"average_heads\"));\n            gat_options->input_dropout = cast_helper<float>(py_options.attr(\"input_dropout\"));\n            gat_options->attention_dropout = cast_helper<float>(py_options.attr(\"attention_dropout\"));\n            ret_config->options = gat_options;\n        } else {\n            ret_config->options = std::make_shared<GNNLayerOptions>();\n        }\n\n    } else if (ret_config->type == LayerType::DENSE) {\n        pyobj py_options = python_config.attr(\"options\");\n        auto options = std::make_shared<DenseLayerOptions>();\n        
options->type = getDenseLayerType(cast_helper<string>(py_options.attr(\"type\")));\n        ret_config->options = options;\n        ret_config->input_dim = cast_helper<int>(python_config.attr(\"input_dim\"));\n        ret_config->output_dim = cast_helper<int>(python_config.attr(\"output_dim\"));\n        ret_config->init = initInitConfig(python_config.attr(\"init\"));\n        ret_config->optimizer = initOptimizerConfig(python_config.attr(\"optimizer\"));\n        ret_config->bias = cast_helper<bool>(python_config.attr(\"bias\"));\n        ret_config->bias_init = initInitConfig(python_config.attr(\"bias_init\"));\n        ret_config->activation = getActivationFunction(cast_helper<string>(python_config.attr(\"activation\")));\n    } else if (ret_config->type == LayerType::REDUCTION) {\n        pyobj py_options = python_config.attr(\"options\");\n        auto options = std::make_shared<ReductionLayerOptions>();\n        options->type = getReductionLayerType(cast_helper<string>(py_options.attr(\"type\")));\n        ret_config->options = options;\n        ret_config->input_dim = cast_helper<int>(python_config.attr(\"input_dim\"));\n        ret_config->output_dim = cast_helper<int>(python_config.attr(\"output_dim\"));\n        ret_config->init = initInitConfig(python_config.attr(\"init\"));\n        ret_config->optimizer = initOptimizerConfig(python_config.attr(\"optimizer\"));\n        ret_config->bias = cast_helper<bool>(python_config.attr(\"bias\"));\n        ret_config->bias_init = initInitConfig(python_config.attr(\"bias_init\"));\n        ret_config->activation = getActivationFunction(cast_helper<string>(python_config.attr(\"activation\")));\n    }\n    return ret_config;\n}\n\nshared_ptr<EncoderConfig> initEncoderConfig(pyobj python_config) {\n    if (check_missing(python_config)) {\n        return nullptr;\n    }\n\n    shared_ptr<EncoderConfig> ret_config = std::make_shared<EncoderConfig>();\n\n    pybind11::list stage_python_obj = 
cast_helper<pybind11::list>(python_config.attr(\"layers\"));\n    pybind11::list train_sample_python_obj = cast_helper<pybind11::list>(python_config.attr(\"train_neighbor_sampling\"));\n    pybind11::list eval_sample_python_obj = cast_helper<pybind11::list>(python_config.attr(\"eval_neighbor_sampling\"));\n\n    auto layer_vec = std::vector<std::vector<shared_ptr<LayerConfig>>>();\n    auto train_sample_vec = std::vector<shared_ptr<NeighborSamplingConfig>>();\n    auto eval_sample_vec = std::vector<shared_ptr<NeighborSamplingConfig>>();\n\n    for (auto py_stage : stage_python_obj) {\n        pybind11::list stage_obj = cast_helper<pybind11::list>(pybind11::reinterpret_borrow<pyobj>(py_stage));\n\n        auto stage_vec = std::vector<shared_ptr<LayerConfig>>();\n\n        for (auto py_layer : stage_obj) {\n            pyobj layer_object = pybind11::reinterpret_borrow<pyobj>(py_layer);\n            stage_vec.emplace_back(initLayerConfig(layer_object));\n        }\n        layer_vec.emplace_back(stage_vec);\n    }\n\n    for (auto py_layer : train_sample_python_obj) {\n        pyobj layer_object = pybind11::reinterpret_borrow<pyobj>(py_layer);\n        train_sample_vec.emplace_back(initNeighborSamplingConfig(layer_object));\n    }\n\n    for (auto py_layer : eval_sample_python_obj) {\n        pyobj layer_object = pybind11::reinterpret_borrow<pyobj>(py_layer);\n        eval_sample_vec.emplace_back(initNeighborSamplingConfig(layer_object));\n    }\n\n    ret_config->layers = layer_vec;\n    ret_config->train_neighbor_sampling = train_sample_vec;\n    ret_config->eval_neighbor_sampling = eval_sample_vec;\n    ret_config->use_incoming_nbrs = cast_helper<bool>(python_config.attr(\"use_incoming_nbrs\"));\n    ret_config->use_outgoing_nbrs = cast_helper<bool>(python_config.attr(\"use_outgoing_nbrs\"));\n\n    return ret_config;\n}\n\nshared_ptr<DecoderConfig> initDecoderConfig(pyobj python_config) {\n    if (check_missing(python_config)) {\n        return nullptr;\n    }\n\n 
   shared_ptr<DecoderConfig> ret_config = std::make_shared<DecoderConfig>();\n\n    ret_config->type = getDecoderType(cast_helper<string>(python_config.attr(\"type\")));\n    ret_config->optimizer = initOptimizerConfig(python_config.attr(\"optimizer\"));\n\n    if (ret_config->type != DecoderType::NODE) {\n        pyobj py_options = python_config.attr(\"options\");\n        auto options = std::make_shared<EdgeDecoderOptions>();\n        options->inverse_edges = cast_helper<bool>(py_options.attr(\"inverse_edges\"));\n        options->edge_decoder_method = getEdgeDecoderMethod(cast_helper<string>(py_options.attr(\"edge_decoder_method\")));\n        ret_config->options = options;\n    } else {\n        auto options = std::make_shared<DecoderOptions>();\n        ret_config->options = options;\n    }\n\n    return ret_config;\n}\n\nshared_ptr<LossConfig> initLossConfig(pyobj python_config) {\n    if (check_missing(python_config)) {\n        return nullptr;\n    }\n\n    shared_ptr<LossConfig> ret_config = std::make_shared<LossConfig>();\n\n    ret_config->type = getLossFunctionType(cast_helper<string>(python_config.attr(\"type\")));\n\n    pyobj py_options = python_config.attr(\"options\");\n\n    if (ret_config->type == LossFunctionType::RANKING) {\n        auto ranking_options = std::make_shared<RankingLossOptions>();\n        ranking_options->margin = cast_helper<float>(py_options.attr(\"margin\"));\n        ranking_options->loss_reduction = getLossReduction(cast_helper<string>(py_options.attr(\"reduction\")));\n        ret_config->options = ranking_options;\n    } else {\n        auto options = std::make_shared<LossOptions>();\n        options->loss_reduction = getLossReduction(cast_helper<string>(py_options.attr(\"reduction\")));\n        ret_config->options = options;\n    }\n\n    return ret_config;\n}\n\nshared_ptr<StorageBackendConfig> initStorageBackendConfig(pyobj python_config) {\n    if (check_missing(python_config)) {\n        return nullptr;\n    }\n\n    
shared_ptr<StorageBackendConfig> ret_config = std::make_shared<StorageBackendConfig>();\n\n    ret_config->type = getStorageBackend(cast_helper<string>(python_config.attr(\"type\")));\n\n    pyobj py_options = python_config.attr(\"options\");\n\n    if (ret_config->type == StorageBackend::PARTITION_BUFFER) {\n        auto buffer_options = std::make_shared<PartitionBufferOptions>();\n        buffer_options->num_partitions = cast_helper<int>(py_options.attr(\"num_partitions\"));\n        buffer_options->buffer_capacity = cast_helper<int>(py_options.attr(\"buffer_capacity\"));\n        buffer_options->prefetching = cast_helper<bool>(py_options.attr(\"prefetching\"));\n        buffer_options->fine_to_coarse_ratio = cast_helper<int>(py_options.attr(\"fine_to_coarse_ratio\"));\n        buffer_options->num_cache_partitions = cast_helper<int>(py_options.attr(\"num_cache_partitions\"));\n        buffer_options->edge_bucket_ordering = getEdgeBucketOrderingEnum(cast_helper<string>(py_options.attr(\"edge_bucket_ordering\")));\n        buffer_options->node_partition_ordering = getNodePartitionOrderingEnum(cast_helper<string>(py_options.attr(\"node_partition_ordering\")));\n        buffer_options->randomly_assign_edge_buckets = cast_helper<bool>(py_options.attr(\"randomly_assign_edge_buckets\"));\n        buffer_options->dtype = getDtype(cast_helper<string>(py_options.attr(\"dtype\")));\n        ret_config->options = buffer_options;\n    } else {\n        auto options = std::make_shared<StorageOptions>();\n        options->dtype = getDtype(cast_helper<string>(py_options.attr(\"dtype\")));\n        ret_config->options = options;\n    }\n\n    return ret_config;\n}\n\nshared_ptr<NegativeSamplingConfig> initNegativeSamplingConfig(pyobj python_config) {\n    if (check_missing(python_config)) {\n        return nullptr;\n    }\n\n    shared_ptr<NegativeSamplingConfig> ret_config = std::make_shared<NegativeSamplingConfig>();\n\n    ret_config->filtered = 
cast_helper<bool>(python_config.attr(\"filtered\"));\n    if (!ret_config->filtered) {\n        ret_config->negatives_per_positive = cast_helper<int>(python_config.attr(\"negatives_per_positive\"));\n        ret_config->num_chunks = cast_helper<int>(python_config.attr(\"num_chunks\"));\n        ret_config->degree_fraction = cast_helper<float>(python_config.attr(\"degree_fraction\"));\n        ret_config->local_filter_mode = getLocalFilterMode(cast_helper<std::string>(python_config.attr(\"local_filter_mode\")));\n    } else {\n        ret_config->num_chunks = 1;\n        ret_config->degree_fraction = 0.0;\n        ret_config->negatives_per_positive = -1;  // This is set to the proper value by the graph_batcher\n        ret_config->local_filter_mode = LocalFilterMode::DEG;\n    }\n\n    return ret_config;\n}\n\nshared_ptr<PipelineConfig> initPipelineConfig(pyobj python_config) {\n    shared_ptr<PipelineConfig> ret_config = std::make_shared<PipelineConfig>();\n\n    ret_config->sync = cast_helper<bool>(python_config.attr(\"sync\"));\n    if (!ret_config->sync) {\n        ret_config->staleness_bound = cast_helper<int>(python_config.attr(\"staleness_bound\"));\n        ret_config->gpu_sync_interval = cast_helper<int>(python_config.attr(\"gpu_sync_interval\"));\n        ret_config->gpu_model_average = cast_helper<bool>(python_config.attr(\"gpu_model_average\"));\n        ret_config->batch_host_queue_size = cast_helper<int>(python_config.attr(\"batch_host_queue_size\"));\n        ret_config->batch_device_queue_size = cast_helper<int>(python_config.attr(\"batch_device_queue_size\"));\n        ret_config->gradients_device_queue_size = cast_helper<int>(python_config.attr(\"gradients_device_queue_size\"));\n        ret_config->gradients_host_queue_size = cast_helper<int>(python_config.attr(\"gradients_host_queue_size\"));\n        ret_config->batch_loader_threads = cast_helper<int>(python_config.attr(\"batch_loader_threads\"));\n        ret_config->batch_transfer_threads = 
cast_helper<int>(python_config.attr(\"batch_transfer_threads\"));\n        ret_config->compute_threads = cast_helper<int>(python_config.attr(\"compute_threads\"));\n        ret_config->gradient_transfer_threads = cast_helper<int>(python_config.attr(\"gradient_transfer_threads\"));\n        ret_config->gradient_update_threads = cast_helper<int>(python_config.attr(\"gradient_update_threads\"));\n    }\n\n    return ret_config;\n}\n\nshared_ptr<CheckpointConfig> initCheckpointConfig(pyobj python_config) {\n    if (check_missing(python_config)) {\n        return nullptr;\n    }\n\n    shared_ptr<CheckpointConfig> ret_config = std::make_shared<CheckpointConfig>();\n\n    ret_config->save_best = cast_helper<bool>(python_config.attr(\"save_best\"));\n    ret_config->interval = cast_helper<int>(python_config.attr(\"interval\"));\n    ret_config->save_state = cast_helper<bool>(python_config.attr(\"save_state\"));\n    return ret_config;\n}\n\nshared_ptr<ModelConfig> initModelConfig(pyobj python_config) {\n    shared_ptr<ModelConfig> ret_config = std::make_shared<ModelConfig>();\n\n    ret_config->random_seed = cast_helper<int64_t>(python_config.attr(\"random_seed\"));\n    ret_config->learning_task = getLearningTask(cast_helper<std::string>(python_config.attr(\"learning_task\")));\n    ret_config->encoder = initEncoderConfig(python_config.attr(\"encoder\"));\n    ret_config->decoder = initDecoderConfig(python_config.attr(\"decoder\"));\n    ret_config->loss = initLossConfig(python_config.attr(\"loss\"));\n    ret_config->dense_optimizer = initOptimizerConfig(python_config.attr(\"dense_optimizer\"));\n    ret_config->sparse_optimizer = initOptimizerConfig(python_config.attr(\"sparse_optimizer\"));\n\n    return ret_config;\n}\n\nshared_ptr<StorageConfig> initStorageConfig(pyobj python_config) {\n    if (check_missing(python_config)) {\n        return nullptr;\n    }\n\n    shared_ptr<StorageConfig> ret_config = std::make_shared<StorageConfig>();\n\n    
ret_config->device_type = torch::Device(cast_helper<string>(python_config.attr(\"device_type\")));\n    ret_config->edges = initStorageBackendConfig(python_config.attr(\"edges\"));\n    ret_config->nodes = initStorageBackendConfig(python_config.attr(\"nodes\"));\n    ret_config->embeddings = initStorageBackendConfig(python_config.attr(\"embeddings\"));\n    ret_config->features = initStorageBackendConfig(python_config.attr(\"features\"));\n    ret_config->dataset = initDatasetConfig(python_config.attr(\"dataset\"));\n    ret_config->prefetch = cast_helper<bool>(python_config.attr(\"prefetch\"));\n    ret_config->shuffle_input = cast_helper<bool>(python_config.attr(\"shuffle_input\"));\n    ret_config->model_dir = cast_helper<string>(python_config.attr(\"model_dir\"));\n\n    pybind11::list device_ids_pylist = cast_helper<pybind11::list>(python_config.attr(\"device_ids\"));\n\n    ret_config->device_ids = {};\n\n    for (auto py_id : device_ids_pylist) {\n        pyobj id_object = pybind11::reinterpret_borrow<pyobj>(py_id);\n        ret_config->device_ids.emplace_back(cast_helper<int>(id_object));\n    }\n\n    ret_config->full_graph_evaluation = cast_helper<bool>(python_config.attr(\"full_graph_evaluation\"));\n    ret_config->export_encoded_nodes = cast_helper<bool>(python_config.attr(\"export_encoded_nodes\"));\n\n    ret_config->log_level = getLogLevel(cast_helper<string>(python_config.attr(\"log_level\")));\n    ret_config->train_edges_pre_sorted = cast_helper<bool>(python_config.attr(\"train_edges_pre_sorted\"));\n    return ret_config;\n}\n\nshared_ptr<TrainingConfig> initTrainingConfig(pyobj python_config) {\n    shared_ptr<TrainingConfig> ret_config = std::make_shared<TrainingConfig>();\n\n    ret_config->batch_size = cast_helper<int>(python_config.attr(\"batch_size\"));\n    ret_config->negative_sampling = initNegativeSamplingConfig(python_config.attr(\"negative_sampling\"));\n    ret_config->pipeline = 
initPipelineConfig(python_config.attr(\"pipeline\"));\n    ret_config->logs_per_epoch = cast_helper<int>(python_config.attr(\"logs_per_epoch\"));\n    ret_config->num_epochs = cast_helper<int>(python_config.attr(\"num_epochs\"));\n    ret_config->save_model = cast_helper<bool>(python_config.attr(\"save_model\"));\n    ret_config->checkpoint = initCheckpointConfig(python_config.attr(\"checkpoint\"));\n    ret_config->resume_training = cast_helper<bool>(python_config.attr(\"resume_training\"));\n    ret_config->resume_from_checkpoint = cast_helper<string>(python_config.attr(\"resume_from_checkpoint\"));\n\n    return ret_config;\n}\n\nshared_ptr<EvaluationConfig> initEvaluationConfig(pyobj python_config) {\n    shared_ptr<EvaluationConfig> ret_config = std::make_shared<EvaluationConfig>();\n\n    ret_config->batch_size = cast_helper<int>(python_config.attr(\"batch_size\"));\n    ret_config->negative_sampling = initNegativeSamplingConfig(python_config.attr(\"negative_sampling\"));\n    ret_config->pipeline = initPipelineConfig(python_config.attr(\"pipeline\"));\n    ret_config->epochs_per_eval = cast_helper<int>(python_config.attr(\"epochs_per_eval\"));\n    ret_config->checkpoint_dir = cast_helper<string>(python_config.attr(\"checkpoint_dir\"));\n    return ret_config;\n}\n\nshared_ptr<MariusConfig> initMariusConfig(pyobj python_config) {\n    shared_ptr<MariusConfig> ret_config = std::make_shared<MariusConfig>();\n\n    ret_config->model = initModelConfig(python_config.attr(\"model\"));\n    ret_config->storage = initStorageConfig(python_config.attr(\"storage\"));\n    ret_config->training = initTrainingConfig(python_config.attr(\"training\"));\n    ret_config->evaluation = initEvaluationConfig(python_config.attr(\"evaluation\"));\n\n    return ret_config;\n}\n\nshared_ptr<MariusConfig> loadConfig(string config_path, bool save) {\n    string module_name = \"marius.tools.configuration.marius_config\";\n    shared_ptr<MariusConfig> ret;\n    if (Py_IsInitialized() != 
0) {\n        pyobj config_module = pybind11::module::import(module_name.c_str());\n        pyobj python_config = config_module.attr(\"load_config\")(config_path, save);\n\n        ret = initMariusConfig(python_config);\n    } else {\n        setenv(\"MARIUS_NO_BINDINGS\", \"1\", true);\n\n        pybind11::scoped_interpreter guard{};\n\n        pyobj config_module = pybind11::module::import(module_name.c_str());\n        pyobj python_config = config_module.attr(\"load_config\")(config_path, save);\n\n        ret = initMariusConfig(python_config);\n    }\n\n    return ret;\n}"
  },
  {
    "path": "src/cpp/src/configuration/options.cpp",
    "content": "//\n// Created by Jason Mohoney on 10/8/21.\n//\n\n#include \"configuration/options.h\"\n\nLearningTask getLearningTask(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"NODE_CLASSIFICATION\" || string_val == \"NC\") {\n        return LearningTask::NODE_CLASSIFICATION;\n    } else if (string_val == \"LINK_PREDICTION\" || string_val == \"LP\") {\n        return LearningTask::LINK_PREDICTION;\n    } else {\n        throw std::runtime_error(\"Unrecognized learning task string\");\n    }\n}\n\nInitDistribution getInitDistribution(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"ZEROS\") {\n        return InitDistribution::ZEROS;\n    } else if (string_val == \"ONES\") {\n        return InitDistribution::ONES;\n    } else if (string_val == \"CONSTANT\") {\n        return InitDistribution::CONSTANT;\n    } else if (string_val == \"UNIFORM\") {\n        return InitDistribution::UNIFORM;\n    } else if (string_val == \"NORMAL\") {\n        return InitDistribution::NORMAL;\n    } else if (string_val == \"GLOROT_UNIFORM\") {\n        return InitDistribution::GLOROT_UNIFORM;\n    } else if (string_val == \"GLOROT_NORMAL\") {\n        return InitDistribution::GLOROT_NORMAL;\n    } else {\n        throw std::runtime_error(\"Unrecognized init distribution string\");\n    }\n}\n\nLossFunctionType getLossFunctionType(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"SOFTMAX_CE\") {\n        return LossFunctionType::SOFTMAX_CE;\n    } else if (string_val == \"RANKING\") {\n        return LossFunctionType::RANKING;\n    } else if (string_val == \"CROSS_ENTROPY\") {\n        return LossFunctionType::CROSS_ENTROPY;\n    } else if (string_val == \"BCE_AFTER_SIGMOID\") {\n        return LossFunctionType::BCE_AFTER_SIGMOID;\n    } else if (string_val == \"BCE_WITH_LOGITS\") {\n        return 
LossFunctionType::BCE_WITH_LOGITS;\n    } else if (string_val == \"MSE\") {\n        return LossFunctionType::MSE;\n    } else if (string_val == \"SOFTPLUS\") {\n        return LossFunctionType::SOFTPLUS;\n    } else {\n        throw std::runtime_error(\"Unrecognized loss function type string\");\n    }\n}\n\nLossReduction getLossReduction(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"MEAN\") {\n        return LossReduction::MEAN;\n    } else if (string_val == \"SUM\") {\n        return LossReduction::SUM;\n    } else {\n        throw std::runtime_error(\"Unrecognized loss reduction string\");\n    }\n}\n\nActivationFunction getActivationFunction(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"RELU\") {\n        return ActivationFunction::RELU;\n    } else if (string_val == \"SIGMOID\") {\n        return ActivationFunction::SIGMOID;\n    } else if (string_val == \"NONE\") {\n        return ActivationFunction::NONE;\n    } else {\n        throw std::runtime_error(\"Unrecognized activation function string\");\n    }\n}\n\nOptimizerType getOptimizerType(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"SGD\") {\n        return OptimizerType::SGD;\n    } else if (string_val == \"ADAM\") {\n        return OptimizerType::ADAM;\n    } else if (string_val == \"ADAGRAD\") {\n        return OptimizerType::ADAGRAD;\n    } else if (string_val == \"DEFAULT\") {\n        return OptimizerType::DEFAULT;\n    } else {\n        throw std::runtime_error(\"Unrecognized optimizer string\");\n    }\n}\n\nReductionLayerType getReductionLayerType(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"NONE\") {\n        return ReductionLayerType::NONE;\n    } else if (string_val == \"CONCAT\") {\n        return ReductionLayerType::CONCAT;\n    } else if (string_val == \"LINEAR\") {\n      
  return ReductionLayerType::LINEAR;\n    } else {\n        throw std::runtime_error(\"Unrecognized reduction type string\");\n    }\n}\n\nDenseLayerType getDenseLayerType(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"NONE\") {\n        return DenseLayerType::NONE;\n    } else if (string_val == \"LINEAR\") {\n        return DenseLayerType::LINEAR;\n    } else if (string_val == \"CONV\") {\n        return DenseLayerType::CONV;\n    } else {\n        throw std::runtime_error(\"Unrecognized dense layer string\");\n    }\n}\n\nLayerType getLayerType(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"NONE\") {\n        return LayerType::NONE;\n    } else if (string_val == \"EMBEDDING\") {\n        return LayerType::EMBEDDING;\n    } else if (string_val == \"FEATURE\") {\n        return LayerType::FEATURE;\n    } else if (string_val == \"GNN\") {\n        return LayerType::GNN;\n    } else if (string_val == \"DENSE\") {\n        return LayerType::DENSE;\n    } else if (string_val == \"REDUCTION\") {\n        return LayerType::REDUCTION;\n    } else {\n        throw std::runtime_error(\"Unrecognized layer type string\");\n    }\n}\n\nGNNLayerType getGNNLayerType(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"NONE\") {\n        return GNNLayerType::NONE;\n    } else if (string_val == \"GRAPH_SAGE\") {\n        return GNNLayerType::GRAPH_SAGE;\n    } else if (string_val == \"GCN\") {\n        return GNNLayerType::GCN;\n    } else if (string_val == \"GAT\") {\n        return GNNLayerType::GAT;\n    } else if (string_val == \"RGCN\") {\n        return GNNLayerType::RGCN;\n    } else {\n        throw std::runtime_error(\"Unrecognized gnn layer type string\");\n    }\n}\n\nGraphSageAggregator getGraphSageAggregator(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"GCN\") 
{\n        return GraphSageAggregator::GCN;\n    } else if (string_val == \"MEAN\") {\n        return GraphSageAggregator::MEAN;\n    } else {\n        throw std::runtime_error(\"Unrecognized graph sage aggregator string\");\n    }\n}\n\nDecoderType getDecoderType(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"NODE\") {\n        return DecoderType::NODE;\n    } else if (string_val == \"DISTMULT\") {\n        return DecoderType::DISTMULT;\n    } else if (string_val == \"TRANSE\") {\n        return DecoderType::TRANSE;\n    } else if (string_val == \"COMPLEX\") {\n        return DecoderType::COMPLEX;\n    } else {\n        throw std::runtime_error(\"Unrecognized decoder type string\");\n    }\n}\n\nEdgeDecoderMethod getEdgeDecoderMethod(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"ONLY_POS\") {\n        return EdgeDecoderMethod::ONLY_POS;\n    } else if (string_val == \"POS_AND_NEG\") {\n        return EdgeDecoderMethod::POS_AND_NEG;\n    } else if (string_val == \"CORRUPT_NODE\") {\n        return EdgeDecoderMethod::CORRUPT_NODE;\n    } else if (string_val == \"CORRUPT_REL\") {\n        return EdgeDecoderMethod::CORRUPT_REL;\n    } else if (string_val == \"TRAIN\") {\n        return EdgeDecoderMethod::CORRUPT_NODE;\n    } else if (string_val == \"INFER\") {\n        return EdgeDecoderMethod::ONLY_POS;\n    } else {\n        throw std::runtime_error(\"Unrecognized edge decoder type string\");\n    }\n}\n\nStorageBackend getStorageBackend(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"PARTITION_BUFFER\") {\n        return StorageBackend::PARTITION_BUFFER;\n    } else if (string_val == \"FLAT_FILE\") {\n        return StorageBackend::FLAT_FILE;\n    } else if (string_val == \"HOST_MEMORY\") {\n        return StorageBackend::HOST_MEMORY;\n    } else if (string_val == \"DEVICE_MEMORY\") {\n        return 
StorageBackend::DEVICE_MEMORY;\n    } else {\n        throw std::runtime_error(\"Unrecognized storage backend string\");\n    }\n}\n\nEdgeBucketOrdering getEdgeBucketOrderingEnum(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"OLD_BETA\") {\n        return EdgeBucketOrdering::OLD_BETA;\n    } else if (string_val == \"NEW_BETA\") {\n        return EdgeBucketOrdering::NEW_BETA;\n    } else if (string_val == \"ALL_BETA\") {\n        return EdgeBucketOrdering::ALL_BETA;\n    } else if (string_val == \"COMET\") {\n        return EdgeBucketOrdering::COMET;\n    } else if (string_val == \"CUSTOM\") {\n        return EdgeBucketOrdering::CUSTOM;\n    } else {\n        throw std::runtime_error(\"Unrecognized edge bucket ordering string\");\n    }\n}\n\nNodePartitionOrdering getNodePartitionOrderingEnum(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"DISPERSED\") {\n        return NodePartitionOrdering::DISPERSED;\n    } else if (string_val == \"SEQUENTIAL\") {\n        return NodePartitionOrdering::SEQUENTIAL;\n    } else if (string_val == \"CUSTOM\") {\n        return NodePartitionOrdering::CUSTOM;\n    } else {\n        throw std::runtime_error(\"Unrecognized node partition ordering string\");\n    }\n}\n\nNeighborSamplingLayer getNeighborSamplingLayer(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"ALL\") {\n        return NeighborSamplingLayer::ALL;\n    } else if (string_val == \"UNIFORM\") {\n        return NeighborSamplingLayer::UNIFORM;\n    } else if (string_val == \"DROPOUT\") {\n        return NeighborSamplingLayer::DROPOUT;\n    } else {\n        throw std::runtime_error(\"Unrecognized neighbor sampling layer string\");\n    }\n}\n\nLocalFilterMode getLocalFilterMode(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"ALL\") {\n        return 
LocalFilterMode::ALL;\n    } else if (string_val == \"DEG\") {\n        return LocalFilterMode::DEG;\n    } else {\n        throw std::runtime_error(\"Unrecognized local filter mode string\");\n    }\n}\n\ntorch::Dtype getDtype(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"INT\" || string_val == \"INT32\") {\n        return torch::kInt32;\n    } else if (string_val == \"INT64\" || string_val == \"LONG\") {\n        return torch::kInt64;\n    } else if (string_val == \"FLOAT\" || string_val == \"FLOAT32\") {\n        return torch::kFloat32;\n    } else if (string_val == \"DOUBLE\" || string_val == \"FLOAT64\") {\n        return torch::kFloat64;\n    } else {\n        throw std::runtime_error(\"Unrecognized dtype string\");\n    }\n}\n\nspdlog::level::level_enum getLogLevel(std::string string_val) {\n    for (auto& c : string_val) c = toupper(c);\n\n    if (string_val == \"ERROR\" || string_val == \"E\") {\n        return spdlog::level::err;\n    } else if (string_val == \"WARN\" || string_val == \"W\") {\n        return spdlog::level::warn;\n    } else if (string_val == \"INFO\" || string_val == \"I\") {\n        return spdlog::level::info;\n    } else if (string_val == \"DEBUG\" || string_val == \"D\") {\n        return spdlog::level::debug;\n    } else if (string_val == \"TRACE\" || string_val == \"T\") {\n        return spdlog::level::trace;\n    } else {\n        throw std::runtime_error(\"Unrecognized log level string\");\n    }\n}"
  },
  {
    "path": "src/cpp/src/configuration/util.cpp",
    "content": "//\n// Created by Jason Mohoney on 1/19/22.\n//\n\n#include \"configuration/util.h\"\n\nstd::vector<torch::Device> devices_from_config(std::shared_ptr<StorageConfig> storage_config) {\n    std::vector<torch::Device> devices;\n\n    if (storage_config->device_type == torch::kCUDA) {\n        for (int i = 0; i < storage_config->device_ids.size(); i++) {\n            devices.emplace_back(torch::Device(torch::kCUDA, storage_config->device_ids[i]));\n        }\n        if (devices.empty()) {\n            devices.emplace_back(torch::Device(torch::kCUDA, 0));\n        }\n    } else {\n        devices.emplace_back(torch::kCPU);\n    }\n\n    return devices;\n}\n"
  },
  {
    "path": "src/cpp/src/data/batch.cpp",
    "content": "//\n// Created by Jason Mohoney on 7/9/20.\n//\n\n#include \"data/batch.h\"\n\n#include \"configuration/constants.h\"\n#include \"reporting/logger.h\"\n\nusing std::get;\n\nBatch::Batch(bool train) : device_transfer_(0), host_transfer_(0), timer_(false) {\n    status_ = BatchStatus::Waiting;\n    train_ = train;\n    device_id_ = -1;\n    clear();\n}\n\nBatch::~Batch() { clear(); }\n\nvoid Batch::to(torch::Device device, CudaStream *compute_stream) {\n    CudaStream transfer_stream = getStreamFromPool(false, device.index());\n    CudaStreamGuard stream_guard(transfer_stream);\n\n    if (device.is_cuda()) {\n        host_transfer_ = CudaEvent(device.index());\n    }\n\n    edges_ = transfer_tensor(edges_, device, compute_stream, &transfer_stream);\n\n    neg_edges_ = transfer_tensor(neg_edges_, device, compute_stream, &transfer_stream);\n\n    root_node_indices_ = transfer_tensor(root_node_indices_, device, compute_stream, &transfer_stream);\n\n    unique_node_indices_ = transfer_tensor(unique_node_indices_, device, compute_stream, &transfer_stream);\n\n    node_labels_ = transfer_tensor(node_labels_, device, compute_stream, &transfer_stream);\n\n    src_neg_indices_mapping_ = transfer_tensor(src_neg_indices_mapping_, device, compute_stream, &transfer_stream);\n\n    dst_neg_indices_mapping_ = transfer_tensor(dst_neg_indices_mapping_, device, compute_stream, &transfer_stream);\n\n    src_neg_filter_ = transfer_tensor(src_neg_filter_, device, compute_stream, &transfer_stream);\n\n    dst_neg_filter_ = transfer_tensor(dst_neg_filter_, device, compute_stream, &transfer_stream);\n\n    node_embeddings_ = transfer_tensor(node_embeddings_, device, compute_stream, &transfer_stream);\n\n    node_embeddings_state_ = transfer_tensor(node_embeddings_state_, device, compute_stream, &transfer_stream);\n\n    node_features_ = transfer_tensor(node_features_, device, compute_stream, &transfer_stream);\n\n    encoded_uniques_ = transfer_tensor(encoded_uniques_, 
device, compute_stream, &transfer_stream);\n\n    if (dense_graph_.node_ids_.defined()) {\n        dense_graph_.to(device, compute_stream, &transfer_stream);\n    }\n\n    status_ = BatchStatus::TransferredToDevice;\n}\n\nvoid Batch::accumulateGradients(float learning_rate) {\n    if (node_embeddings_.defined()) {\n        node_gradients_ = node_embeddings_.grad();\n        SPDLOG_TRACE(\"Batch: {} accumulated node gradients\", batch_id_);\n\n        node_state_update_ = node_gradients_.pow(2);\n        node_embeddings_state_.add_(node_state_update_);\n        node_gradients_ = -learning_rate * (node_gradients_ / (node_embeddings_state_.sqrt().add_(1e-10)));\n\n        SPDLOG_TRACE(\"Batch: {} adjusted gradients\", batch_id_);\n    }\n\n    node_embeddings_state_ = torch::Tensor();\n\n    SPDLOG_TRACE(\"Batch: {} cleared gpu embeddings and gradients\", batch_id_);\n\n    status_ = BatchStatus::AccumulatedGradients;\n}\n\nvoid Batch::embeddingsToHost() {\n    if (node_gradients_.defined() && node_gradients_.device().is_cuda()) {\n        auto grad_opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU).pinned_memory(true);\n        Gradients temp_grads = torch::empty(node_gradients_.sizes(), grad_opts);\n        temp_grads.copy_(node_gradients_, true);\n        Gradients temp_grads2 = torch::empty(node_state_update_.sizes(), grad_opts);\n        temp_grads2.copy_(node_state_update_, true);\n        node_gradients_ = temp_grads;\n        node_state_update_ = temp_grads2;\n    }\n\n    if (unique_node_indices_.defined()) {\n        unique_node_indices_ = unique_node_indices_.to(torch::kCPU);\n    }\n\n    if (encoded_uniques_.defined()) {\n        encoded_uniques_ = encoded_uniques_.to(torch::kCPU);\n    }\n\n    host_transfer_.record();\n    host_transfer_.synchronize();\n    status_ = BatchStatus::TransferredToHost;\n}\n\nvoid Batch::clear() {\n    root_node_indices_ = torch::Tensor();\n    unique_node_indices_ = torch::Tensor();\n    
node_embeddings_ = torch::Tensor();\n    node_gradients_ = torch::Tensor();\n    node_state_update_ = torch::Tensor();\n    node_embeddings_state_ = torch::Tensor();\n\n    node_features_ = torch::Tensor();\n    node_labels_ = torch::Tensor();\n\n    src_neg_indices_mapping_ = torch::Tensor();\n    dst_neg_indices_mapping_ = torch::Tensor();\n\n    edges_ = torch::Tensor();\n    neg_edges_ = torch::Tensor();\n    src_neg_indices_ = torch::Tensor();\n    dst_neg_indices_ = torch::Tensor();\n\n    dense_graph_.clear();\n    encoded_uniques_ = torch::Tensor();\n\n    src_neg_filter_ = torch::Tensor();\n    dst_neg_filter_ = torch::Tensor();\n}"
  },
  {
    "path": "src/cpp/src/data/dataloader.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/7/20.\n//\n\n#include \"data/dataloader.h\"\n\n#include \"common/util.h\"\n#include \"data/ordering.h\"\n\nDataLoader::DataLoader(shared_ptr<GraphModelStorage> graph_storage, LearningTask learning_task, shared_ptr<TrainingConfig> training_config,\n                       shared_ptr<EvaluationConfig> evaluation_config, shared_ptr<EncoderConfig> encoder_config) {\n    current_edge_ = 0;\n    train_ = true;\n    epochs_processed_ = 0;\n    batches_processed_ = 0;\n    sampler_lock_ = new std::mutex();\n    batch_lock_ = new std::mutex;\n    batch_cv_ = new std::condition_variable;\n    waiting_for_batches_ = false;\n\n    single_dataset_ = false;\n\n    graph_storage_ = graph_storage;\n    learning_task_ = learning_task;\n    training_config_ = training_config;\n    evaluation_config_ = evaluation_config;\n    only_root_features_ = false;\n\n    edge_sampler_ = std::make_shared<RandomEdgeSampler>(graph_storage_);\n\n    if (learning_task_ == LearningTask::LINK_PREDICTION) {\n        training_negative_sampler_ = std::make_shared<CorruptNodeNegativeSampler>(\n            training_config_->negative_sampling->num_chunks, training_config_->negative_sampling->negatives_per_positive,\n            training_config_->negative_sampling->degree_fraction, training_config_->negative_sampling->filtered,\n            training_config_->negative_sampling->local_filter_mode);\n\n        evaluation_negative_sampler_ = std::make_shared<CorruptNodeNegativeSampler>(\n            evaluation_config_->negative_sampling->num_chunks, evaluation_config_->negative_sampling->negatives_per_positive,\n            evaluation_config_->negative_sampling->degree_fraction, evaluation_config_->negative_sampling->filtered,\n            evaluation_config_->negative_sampling->local_filter_mode);\n    } else {\n        training_negative_sampler_ = nullptr;\n        evaluation_negative_sampler_ = nullptr;\n    }\n\n    if (encoder_config != nullptr) {\n        
if (!encoder_config->train_neighbor_sampling.empty()) {\n            training_neighbor_sampler_ = std::make_shared<LayeredNeighborSampler>(graph_storage_, encoder_config->train_neighbor_sampling,\n                                                                                  encoder_config->use_incoming_nbrs, encoder_config->use_outgoing_nbrs);\n\n            if (!encoder_config->eval_neighbor_sampling.empty()) {\n                evaluation_neighbor_sampler_ = std::make_shared<LayeredNeighborSampler>(graph_storage_, encoder_config->eval_neighbor_sampling,\n                                                                                        encoder_config->use_incoming_nbrs, encoder_config->use_outgoing_nbrs);\n            } else {\n                evaluation_neighbor_sampler_ = training_neighbor_sampler_;\n            }\n\n        } else {\n            training_neighbor_sampler_ = nullptr;\n            evaluation_neighbor_sampler_ = nullptr;\n        }\n    } else {\n        training_neighbor_sampler_ = nullptr;\n        evaluation_neighbor_sampler_ = nullptr;\n    }\n\n    compute_stream_ = nullptr;\n}\n\nDataLoader::DataLoader(shared_ptr<GraphModelStorage> graph_storage, LearningTask learning_task, int batch_size, shared_ptr<NegativeSampler> negative_sampler,\n                       shared_ptr<NeighborSampler> neighbor_sampler, bool train) {\n    current_edge_ = 0;\n    train_ = train;\n    epochs_processed_ = 0;\n    batches_processed_ = 0;\n    sampler_lock_ = new std::mutex();\n    batch_lock_ = new std::mutex;\n    batch_cv_ = new std::condition_variable;\n    waiting_for_batches_ = false;\n\n    batch_size_ = batch_size;\n    single_dataset_ = true;\n\n    graph_storage_ = graph_storage;\n    learning_task_ = learning_task;\n    only_root_features_ = false;\n\n    edge_sampler_ = std::make_shared<RandomEdgeSampler>(graph_storage_);\n    negative_sampler_ = negative_sampler;\n    neighbor_sampler_ = neighbor_sampler;\n\n    training_config_ = nullptr;\n 
   evaluation_config_ = nullptr;\n\n    training_negative_sampler_ = nullptr;\n    evaluation_negative_sampler_ = nullptr;\n\n    training_neighbor_sampler_ = nullptr;\n    evaluation_neighbor_sampler_ = nullptr;\n\n    loadStorage();\n}\n\nDataLoader::~DataLoader() {\n    delete sampler_lock_;\n    delete batch_lock_;\n    delete batch_cv_;\n}\n\nvoid DataLoader::nextEpoch() {\n    batch_id_offset_ = 0;\n    total_batches_processed_ = 0;\n    epochs_processed_++;\n\n    if (graph_storage_->useInMemorySubGraph()) {\n        unloadStorage();\n    }\n}\n\nvoid DataLoader::setActiveEdges() {\n    EdgeList active_edges;\n\n    if (graph_storage_->useInMemorySubGraph()) {\n        torch::Tensor edge_bucket_ids = *edge_buckets_per_buffer_iterator_;\n        edge_buckets_per_buffer_iterator_++;\n\n        int num_partitions = graph_storage_->getNumPartitions();\n\n        edge_bucket_ids = edge_bucket_ids.select(1, 0) * num_partitions + edge_bucket_ids.select(1, 1);\n        torch::Tensor in_memory_edge_bucket_idx = torch::empty({edge_bucket_ids.size(0)}, edge_bucket_ids.options());\n        torch::Tensor edge_bucket_sizes = torch::empty({edge_bucket_ids.size(0)}, edge_bucket_ids.options());\n\n        auto edge_bucket_ids_accessor = edge_bucket_ids.accessor<int64_t, 1>();\n        auto in_memory_edge_bucket_idx_accessor = in_memory_edge_bucket_idx.accessor<int64_t, 1>();\n        auto edge_bucket_sizes_accessor = edge_bucket_sizes.accessor<int64_t, 1>();\n\n        auto all_edge_bucket_sizes_accessor = graph_storage_->current_subgraph_state_->in_memory_edge_bucket_sizes_.accessor<int64_t, 1>();\n        auto all_edge_bucket_starts_accessor = graph_storage_->current_subgraph_state_->in_memory_edge_bucket_starts_.accessor<int64_t, 1>();\n\n        auto tup = torch::sort(graph_storage_->current_subgraph_state_->in_memory_edge_bucket_ids_);\n        torch::Tensor sorted_in_memory_ids = std::get<0>(tup);\n        torch::Tensor in_memory_id_indices = std::get<1>(tup);\n        
auto in_memory_id_indices_accessor = in_memory_id_indices.accessor<int64_t, 1>();\n\n#pragma omp parallel for\n        for (int i = 0; i < in_memory_edge_bucket_idx.size(0); i++) {\n            int64_t edge_bucket_id = edge_bucket_ids_accessor[i];\n            int64_t idx = torch::searchsorted(sorted_in_memory_ids, edge_bucket_id).item<int64_t>();\n            idx = in_memory_id_indices_accessor[idx];\n            int64_t edge_bucket_size = all_edge_bucket_sizes_accessor[idx];\n\n            in_memory_edge_bucket_idx_accessor[i] = idx;\n            edge_bucket_sizes_accessor[i] = edge_bucket_size;\n        }\n\n        torch::Tensor local_offsets = edge_bucket_sizes.cumsum(0);\n        int64_t total_size = local_offsets[-1].item<int64_t>();\n        local_offsets = local_offsets - edge_bucket_sizes;\n\n        auto local_offsets_accessor = local_offsets.accessor<int64_t, 1>();\n\n        active_edges = torch::empty({total_size, graph_storage_->storage_ptrs_.edges->dim1_size_},\n                                    graph_storage_->current_subgraph_state_->all_in_memory_mapped_edges_.options());\n\n#pragma omp parallel for\n        for (int i = 0; i < in_memory_edge_bucket_idx.size(0); i++) {\n            int64_t idx = in_memory_edge_bucket_idx_accessor[i];\n            int64_t edge_bucket_size = edge_bucket_sizes_accessor[i];\n            int64_t edge_bucket_start = all_edge_bucket_starts_accessor[idx];\n            int64_t local_offset = local_offsets_accessor[i];\n\n            active_edges.narrow(0, local_offset, edge_bucket_size) =\n                graph_storage_->current_subgraph_state_->all_in_memory_mapped_edges_.narrow(0, edge_bucket_start, edge_bucket_size);\n        }\n\n    } else {\n        active_edges = graph_storage_->storage_ptrs_.edges->range(0, graph_storage_->storage_ptrs_.edges->getDim0());\n    }\n\n    auto opts = torch::TensorOptions().dtype(torch::kInt64).device(active_edges.device());\n    active_edges = (active_edges.index_select(0, 
torch::randperm(active_edges.size(0), opts)));\n    graph_storage_->setActiveEdges(active_edges);\n}\n\nvoid DataLoader::setActiveNodes() {\n    torch::Tensor node_ids;\n\n    if (graph_storage_->useInMemorySubGraph()) {\n        node_ids = *node_ids_per_buffer_iterator_++;\n    } else {\n        node_ids = graph_storage_->storage_ptrs_.nodes->range(0, graph_storage_->storage_ptrs_.nodes->getDim0());\n        if (node_ids.sizes().size() == 2) {\n            node_ids = node_ids.flatten(0, 1);\n        }\n    }\n\n    auto opts = torch::TensorOptions().dtype(torch::kInt64).device(node_ids.device());\n    node_ids = (node_ids.index_select(0, torch::randperm(node_ids.size(0), opts)));\n    graph_storage_->setActiveNodes(node_ids);\n}\n\nvoid DataLoader::initializeBatches(bool prepare_encode) {\n    int64_t batch_id = 0;\n    int64_t start_idx = 0;\n\n    clearBatches();\n\n    all_read_ = false;\n    int64_t num_items;\n\n    if (prepare_encode) {\n        num_items = graph_storage_->getNumNodes();\n    } else {\n        if (learning_task_ == LearningTask::LINK_PREDICTION) {\n            setActiveEdges();\n            num_items = graph_storage_->getNumActiveEdges();\n        } else {\n            setActiveNodes();\n            num_items = graph_storage_->getNumActiveNodes();\n        }\n    }\n\n    int64_t batch_size = batch_size_;\n    vector<shared_ptr<Batch>> batches;\n    while (start_idx < num_items) {\n        if (num_items - (start_idx + batch_size) < 0) {\n            batch_size = num_items - start_idx;\n        }\n        shared_ptr<Batch> curr_batch = std::make_shared<Batch>(train_);\n        curr_batch->batch_id_ = batch_id + batch_id_offset_;\n        curr_batch->start_idx_ = start_idx;\n        curr_batch->batch_size_ = batch_size;\n\n        if (prepare_encode) {\n            curr_batch->task_ = LearningTask::ENCODE;\n        } else {\n            curr_batch->task_ = learning_task_;\n        }\n\n        batches.emplace_back(curr_batch);\n        
batch_id++;\n        start_idx += batch_size;\n    }\n    batches_ = batches;\n\n    batches_left_ = batches_.size();\n    batch_iterator_ = batches_.begin();\n}\n\nvoid DataLoader::setBufferOrdering() {\n    shared_ptr<PartitionBufferOptions> options;\n\n    if (instance_of<Storage, PartitionBufferStorage>(graph_storage_->storage_ptrs_.node_embeddings)) {\n        options = std::dynamic_pointer_cast<PartitionBufferStorage>(graph_storage_->storage_ptrs_.node_embeddings)->options_;\n    } else if (instance_of<Storage, PartitionBufferStorage>(graph_storage_->storage_ptrs_.node_features)) {\n        options = std::dynamic_pointer_cast<PartitionBufferStorage>(graph_storage_->storage_ptrs_.node_features)->options_;\n    }\n\n    if (learning_task_ == LearningTask::LINK_PREDICTION) {\n        if (graph_storage_->useInMemorySubGraph()) {\n            auto tup = getEdgeBucketOrdering(options->edge_bucket_ordering, options->num_partitions, options->buffer_capacity, options->fine_to_coarse_ratio,\n                                             options->num_cache_partitions, options->randomly_assign_edge_buckets);\n            buffer_states_ = std::get<0>(tup);\n            edge_buckets_per_buffer_ = std::get<1>(tup);\n\n            edge_buckets_per_buffer_iterator_ = edge_buckets_per_buffer_.begin();\n\n            graph_storage_->setBufferOrdering(buffer_states_);\n        }\n    } else {\n        if (graph_storage_->useInMemorySubGraph()) {\n            graph_storage_->storage_ptrs_.train_nodes->load();\n            int64_t num_train_nodes = graph_storage_->storage_ptrs_.nodes->getDim0();\n            auto tup = getNodePartitionOrdering(\n                options->node_partition_ordering, graph_storage_->storage_ptrs_.train_nodes->range(0, num_train_nodes).flatten(0, 1),\n                graph_storage_->getNumNodes(), options->num_partitions, options->buffer_capacity, options->fine_to_coarse_ratio, options->num_cache_partitions);\n            buffer_states_ = 
std::get<0>(tup);\n            node_ids_per_buffer_ = std::get<1>(tup);\n\n            node_ids_per_buffer_iterator_ = node_ids_per_buffer_.begin();\n\n            graph_storage_->setBufferOrdering(buffer_states_);\n        }\n    }\n}\n\nvoid DataLoader::clearBatches() { batches_ = std::vector<shared_ptr<Batch>>(); }\n\nshared_ptr<Batch> DataLoader::getNextBatch() {\n    std::unique_lock batch_lock(*batch_lock_);\n    batch_cv_->wait(batch_lock, [this] { return !waiting_for_batches_; });\n\n    shared_ptr<Batch> batch;\n    if (batch_iterator_ != batches_.end()) {\n        batch = *batch_iterator_;\n        batch_iterator_++;\n\n        // check if all batches have been read\n        if (batch_iterator_ == batches_.end()) {\n            if (graph_storage_->useInMemorySubGraph()) {\n                if (!graph_storage_->hasSwap()) {\n                    all_read_ = true;\n                }\n            } else {\n                all_read_ = true;\n            }\n        }\n    } else {\n        batch = nullptr;\n        if (graph_storage_->useInMemorySubGraph()) {\n            if (graph_storage_->hasSwap()) {\n                // wait for all batches to finish before swapping\n                waiting_for_batches_ = true;\n                batch_cv_->wait(batch_lock, [this] { return batches_left_ == 0; });\n                waiting_for_batches_ = false;\n\n                graph_storage_->updateInMemorySubGraph();\n\n                initializeBatches();\n                batch = *batch_iterator_;\n                batch_iterator_++;\n\n                // check if all batches have been read\n                if (batch_iterator_ == batches_.end()) {\n                    if (graph_storage_->useInMemorySubGraph()) {\n                        if (!graph_storage_->hasSwap()) {\n                            all_read_ = true;\n                        }\n                    } else {\n                        all_read_ = true;\n                    }\n                }\n            } else 
{\n                all_read_ = true;\n            }\n        } else {\n            all_read_ = true;\n        }\n    }\n    batch_lock.unlock();\n    batch_cv_->notify_all();\n    return batch;\n}\n\nbool DataLoader::hasNextBatch() {\n    batch_lock_->lock();\n    bool ret = !all_read_;\n    batch_lock_->unlock();\n    return ret;\n}\n\nvoid DataLoader::finishedBatch() {\n    batch_lock_->lock();\n    batches_left_--;\n    total_batches_processed_++;\n    batch_lock_->unlock();\n    batch_cv_->notify_all();\n}\n\nshared_ptr<Batch> DataLoader::getBatch(at::optional<torch::Device> device, bool perform_map, int worker_id) {\n    shared_ptr<Batch> batch = getNextBatch();\n    if (batch == nullptr) {\n        return batch;\n    }\n\n    if (batch->task_ == LearningTask::LINK_PREDICTION) {\n        edgeSample(batch, worker_id);\n    } else if (batch->task_ == LearningTask::NODE_CLASSIFICATION || batch->task_ == LearningTask::ENCODE) {\n        nodeSample(batch, worker_id);\n    }\n\n    loadCPUParameters(batch);\n\n    if (device.has_value()) {\n        if (device.value().is_cuda()) {\n            batch->to(device.value());\n            loadGPUParameters(batch);\n            //            batch->dense_graph_.performMap();\n        }\n    }\n\n    if (perform_map) {\n        batch->dense_graph_.performMap();\n    }\n\n    return batch;\n}\n\nvoid DataLoader::edgeSample(shared_ptr<Batch> batch, int worker_id) {\n    if (!batch->edges_.defined()) {\n        batch->edges_ = edge_sampler_->getEdges(batch);\n    }\n\n    if (negative_sampler_ != nullptr) {\n        negativeSample(batch);\n    }\n\n    std::vector<torch::Tensor> all_ids = {batch->edges_.select(1, 0), batch->edges_.select(1, -1)};\n\n    if (batch->src_neg_indices_.defined()) {\n        all_ids.emplace_back(batch->src_neg_indices_.flatten(0, 1));\n    }\n\n    if (batch->dst_neg_indices_.defined()) {\n        all_ids.emplace_back(batch->dst_neg_indices_.flatten(0, 1));\n    }\n\n    torch::Tensor src_mapping;\n  
  torch::Tensor dst_mapping;\n    torch::Tensor src_neg_mapping;\n    torch::Tensor dst_neg_mapping;\n\n    std::vector<torch::Tensor> mapped_tensors;\n\n    if (neighbor_sampler_ != nullptr) {\n        // get unique nodes in edges and negatives\n        batch->root_node_indices_ = std::get<0>(torch::_unique(torch::cat(all_ids)));\n\n        // sample neighbors and get unique nodes\n        batch->dense_graph_ =\n            neighbor_sampler_->getNeighbors(batch->root_node_indices_, graph_storage_->current_subgraph_state_->in_memory_subgraph_, worker_id);\n        batch->unique_node_indices_ = batch->dense_graph_.getNodeIDs();\n\n        // map edges and negatives to their corresponding index in unique_node_indices_\n        auto tup = torch::sort(batch->unique_node_indices_);\n        torch::Tensor sorted_map = std::get<0>(tup);\n        torch::Tensor map_to_unsorted = std::get<1>(tup);\n\n        mapped_tensors = apply_tensor_map(sorted_map, all_ids);\n\n        int64_t num_nbrs_sampled = batch->dense_graph_.hop_offsets_[-2].item<int64_t>();\n\n        src_mapping = map_to_unsorted.index_select(0, mapped_tensors[0]) - num_nbrs_sampled;\n        dst_mapping = map_to_unsorted.index_select(0, mapped_tensors[1]) - num_nbrs_sampled;\n\n        if (batch->src_neg_indices_.defined()) {\n            src_neg_mapping = map_to_unsorted.index_select(0, mapped_tensors[2]).reshape(batch->src_neg_indices_.sizes()) - num_nbrs_sampled;\n        }\n\n        if (batch->dst_neg_indices_.defined()) {\n            dst_neg_mapping = map_to_unsorted.index_select(0, mapped_tensors[3]).reshape(batch->dst_neg_indices_.sizes()) - num_nbrs_sampled;\n        }\n    } else {\n        // map edges and negatives to their corresponding index in unique_node_indices_\n        auto tup = map_tensors(all_ids);\n        batch->unique_node_indices_ = std::get<0>(tup);\n        mapped_tensors = std::get<1>(tup);\n\n        src_mapping = mapped_tensors[0];\n        dst_mapping = mapped_tensors[1];\n\n   
     if (batch->src_neg_indices_.defined()) {\n            src_neg_mapping = mapped_tensors[2].reshape(batch->src_neg_indices_.sizes());\n        }\n\n        if (batch->dst_neg_indices_.defined()) {\n            dst_neg_mapping = mapped_tensors[3].reshape(batch->dst_neg_indices_.sizes());\n        }\n    }\n\n    if (batch->edges_.size(1) == 2) {\n        batch->edges_ = torch::stack({src_mapping, dst_mapping}).transpose(0, 1);\n    } else if (batch->edges_.size(1) == 3) {\n        batch->edges_ = torch::stack({src_mapping, batch->edges_.select(1, 1), dst_mapping}).transpose(0, 1);\n    } else {\n        throw TensorSizeMismatchException(batch->edges_, \"Edge list must be a 3 or 2 column tensor\");\n    }\n\n    batch->src_neg_indices_mapping_ = src_neg_mapping;\n    batch->dst_neg_indices_mapping_ = dst_neg_mapping;\n}\n\nvoid DataLoader::nodeSample(shared_ptr<Batch> batch, int worker_id) {\n    if (batch->task_ == LearningTask::ENCODE) {\n        torch::TensorOptions node_opts = torch::TensorOptions().dtype(torch::kInt64).device(graph_storage_->storage_ptrs_.edges->device_);\n        batch->root_node_indices_ = torch::arange(batch->start_idx_, batch->start_idx_ + batch->batch_size_, node_opts);\n    } else {\n        batch->root_node_indices_ = graph_storage_->getNodeIdsRange(batch->start_idx_, batch->batch_size_).to(torch::kInt64);\n    }\n\n    if (graph_storage_->storage_ptrs_.node_labels != nullptr) {\n        batch->node_labels_ = graph_storage_->getNodeLabels(batch->root_node_indices_).flatten(0, 1);\n    }\n\n    if (graph_storage_->current_subgraph_state_->global_to_local_index_map_.defined()) {\n        batch->root_node_indices_ = graph_storage_->current_subgraph_state_->global_to_local_index_map_.index_select(0, batch->root_node_indices_);\n    }\n\n    if (neighbor_sampler_ != nullptr) {\n        batch->dense_graph_ =\n            neighbor_sampler_->getNeighbors(batch->root_node_indices_, graph_storage_->current_subgraph_state_->in_memory_subgraph_, 
worker_id);\n        batch->unique_node_indices_ = batch->dense_graph_.getNodeIDs();\n    } else {\n        batch->unique_node_indices_ = batch->root_node_indices_;\n    }\n}\n\nvoid DataLoader::negativeSample(shared_ptr<Batch> batch) {\n    std::tie(batch->src_neg_indices_, batch->src_neg_filter_) =\n        negative_sampler_->getNegatives(graph_storage_->current_subgraph_state_->in_memory_subgraph_, batch->edges_, true);\n    std::tie(batch->dst_neg_indices_, batch->dst_neg_filter_) =\n        negative_sampler_->getNegatives(graph_storage_->current_subgraph_state_->in_memory_subgraph_, batch->edges_, false);\n}\n\nvoid DataLoader::loadCPUParameters(shared_ptr<Batch> batch) {\n    if (graph_storage_->storage_ptrs_.node_embeddings != nullptr) {\n        if (graph_storage_->storage_ptrs_.node_embeddings->device_ != torch::kCUDA) {\n            batch->node_embeddings_ = graph_storage_->getNodeEmbeddings(batch->unique_node_indices_);\n            if (train_) {\n                batch->node_embeddings_state_ = graph_storage_->getNodeEmbeddingState(batch->unique_node_indices_);\n            }\n        }\n    }\n\n    if (graph_storage_->storage_ptrs_.node_features != nullptr) {\n        if (graph_storage_->storage_ptrs_.node_features->device_ != torch::kCUDA) {\n            if (only_root_features_) {\n                batch->node_features_ = graph_storage_->getNodeFeatures(batch->root_node_indices_);\n            } else {\n                batch->node_features_ = graph_storage_->getNodeFeatures(batch->unique_node_indices_);\n            }\n        }\n    }\n\n    batch->status_ = BatchStatus::LoadedEmbeddings;\n    batch->load_timestamp_ = timestamp_;\n}\n\nvoid DataLoader::loadGPUParameters(shared_ptr<Batch> batch) {\n    if (graph_storage_->storage_ptrs_.node_embeddings != nullptr) {\n        if (graph_storage_->storage_ptrs_.node_embeddings->device_ == torch::kCUDA) {\n            batch->node_embeddings_ = 
graph_storage_->getNodeEmbeddings(batch->unique_node_indices_);\n            if (train_) {\n                batch->node_embeddings_state_ = graph_storage_->getNodeEmbeddingState(batch->unique_node_indices_);\n            }\n        }\n    }\n\n    if (graph_storage_->storage_ptrs_.node_features != nullptr) {\n        if (graph_storage_->storage_ptrs_.node_features->device_ == torch::kCUDA) {\n            if (only_root_features_) {\n                batch->node_features_ = graph_storage_->getNodeFeatures(batch->root_node_indices_);\n            } else {\n                batch->node_features_ = graph_storage_->getNodeFeatures(batch->unique_node_indices_);\n            }\n        }\n    }\n}\n\nvoid DataLoader::updateEmbeddings(shared_ptr<Batch> batch, bool gpu) {\n    if (gpu) {\n        if (graph_storage_->storage_ptrs_.node_embeddings->device_ == torch::kCUDA) {\n            graph_storage_->updateAddNodeEmbeddings(batch->unique_node_indices_, batch->node_gradients_);\n            graph_storage_->updateAddNodeEmbeddingState(batch->unique_node_indices_, batch->node_state_update_);\n        }\n    } else {\n        batch->host_transfer_.synchronize();\n        if (graph_storage_->storage_ptrs_.node_embeddings->device_ != torch::kCUDA) {\n            graph_storage_->updateAddNodeEmbeddings(batch->unique_node_indices_, batch->node_gradients_);\n            graph_storage_->updateAddNodeEmbeddingState(batch->unique_node_indices_, batch->node_state_update_);\n        }\n        batch->clear();\n    }\n}\n\nvoid DataLoader::loadStorage() {\n    setBufferOrdering();\n    graph_storage_->load();\n\n    batch_id_offset_ = 0;\n    batches_left_ = 0;\n    total_batches_processed_ = 0;\n    all_read_ = false;\n\n    int num_hash_maps = 1;\n    if (train_) {\n        if (training_config_ != nullptr && !training_config_->pipeline->sync) {\n            num_hash_maps = training_config_->pipeline->batch_loader_threads;\n        }\n    } else {\n        if (evaluation_config_ != nullptr 
&& !evaluation_config_->pipeline->sync) {\n            num_hash_maps = evaluation_config_->pipeline->batch_loader_threads;\n        }\n    }\n\n    if (!buffer_states_.empty()) {\n        graph_storage_->initializeInMemorySubGraph(buffer_states_[0], num_hash_maps);\n    } else {\n        graph_storage_->initializeInMemorySubGraph(torch::empty({}), num_hash_maps);\n    }\n\n    if (negative_sampler_ != nullptr) {\n        if (instance_of<NegativeSampler, CorruptNodeNegativeSampler>(negative_sampler_)) {\n            if (std::dynamic_pointer_cast<CorruptNodeNegativeSampler>(negative_sampler_)->filtered_) {\n                graph_storage_->sortAllEdges();\n            }\n        }\n    }\n}"
  },
  {
    "path": "src/cpp/src/data/graph.cpp",
    "content": "//\n// Created by Jason Mohoney on 8/25/21.\n//\n\n#include \"data/graph.h\"\n\n#include \"common/util.h\"\n#include \"data/samplers/neighbor.h\"\n\n#ifdef MARIUS_OMP\n    #include \"omp.h\"\n#endif\n\nMariusGraph::MariusGraph(){};\n\nMariusGraph::MariusGraph(EdgeList src_sorted_edges, EdgeList dst_sorted_edges, int64_t num_nodes_in_memory, int num_hash_maps) {\n    num_nodes_in_memory_ = num_nodes_in_memory;\n\n    src_sorted_edges_ = src_sorted_edges;\n    dst_sorted_edges_ = dst_sorted_edges;\n\n    auto contiguous_src = src_sorted_edges_.select(1, 0).contiguous();\n    auto contiguous_dst = dst_sorted_edges_.select(1, -1).contiguous();\n    torch::Tensor arange_tensor = torch::arange(0, num_nodes_in_memory_, contiguous_src.device());\n\n    out_offsets_ = torch::searchsorted(contiguous_src, arange_tensor);\n    torch::Tensor end = torch::tensor({contiguous_src.size(0)}, contiguous_src.options());\n    out_num_neighbors_ = torch::cat({out_offsets_, end}).narrow(0, 1, out_offsets_.size(0)) - out_offsets_;\n\n    in_offsets_ = torch::searchsorted(contiguous_dst, arange_tensor);\n    end = torch::tensor({contiguous_dst.size(0)}, contiguous_dst.options());\n    in_num_neighbors_ = torch::cat({in_offsets_, end}).narrow(0, 1, in_offsets_.size(0)) - in_offsets_;\n\n    max_out_num_neighbors_ = torch::max(out_num_neighbors_).item<int>();\n    max_in_num_neighbors_ = torch::max(in_num_neighbors_).item<int>();\n\n    num_hash_maps_ = num_hash_maps;\n    if (num_hash_maps_ > 0) {\n        auto bool_device_options = torch::TensorOptions().dtype(torch::kBool).device(contiguous_src.device());\n        for (int i = 0; i < num_hash_maps_; i++) {\n            hash_maps_.emplace_back(torch::zeros({num_nodes_in_memory}, bool_device_options));\n        }\n    }\n}\n\nMariusGraph::MariusGraph(EdgeList edges) {\n    EdgeList src_sorted_edges = edges.index_select(0, edges.select(1, 0).argsort());\n    EdgeList dst_sorted_edges = edges.index_select(0, edges.select(1, 
-1).argsort());\n    int64_t num_nodes_in_memory = std::get<0>(torch::_unique(torch::cat({edges.select(1, 0), edges.select(1, -1)}))).size(0);\n\n    MariusGraph(src_sorted_edges, dst_sorted_edges, num_nodes_in_memory, 1);\n}\n\nMariusGraph::~MariusGraph() { clear(); }\n\nIndices MariusGraph::getNodeIDs() { return node_ids_; }\n\nIndices MariusGraph::getEdges(bool incoming) {\n    if (incoming) {\n        return dst_sorted_edges_;\n    } else {\n        return src_sorted_edges_;\n    }\n}\n\nIndices MariusGraph::getRelationIDs(bool incoming) {\n    if (src_sorted_edges_.size(1) == 2) {\n        return torch::Tensor();\n    } else {\n        if (incoming) {\n            return dst_sorted_edges_.select(1, 1);\n        } else {\n            return src_sorted_edges_.select(1, 1);\n        }\n    }\n}\n\nIndices MariusGraph::getNeighborOffsets(bool incoming) {\n    if (incoming) {\n        return in_offsets_;\n    } else {\n        return out_offsets_;\n    }\n}\n\nIndices MariusGraph::getNumNeighbors(bool incoming) {\n    if (incoming) {\n        return in_num_neighbors_;\n    } else {\n        return out_num_neighbors_;\n    }\n}\n\nvoid MariusGraph::clear() {\n    node_ids_ = torch::Tensor();\n    src_sorted_edges_ = torch::Tensor();\n    dst_sorted_edges_ = torch::Tensor();\n    all_src_sorted_edges_ = torch::Tensor();\n    all_dst_sorted_edges_ = torch::Tensor();\n    active_in_memory_subgraph_ = torch::Tensor();\n    out_sorted_uniques_ = torch::Tensor();\n    out_offsets_ = torch::Tensor();\n    out_num_neighbors_ = torch::Tensor();\n    in_sorted_uniques_ = torch::Tensor();\n    in_offsets_ = torch::Tensor();\n    in_num_neighbors_ = torch::Tensor();\n    all_src_sorted_edges_ = torch::Tensor();\n    all_dst_sorted_edges_ = torch::Tensor();\n\n    for (int i; i < hash_maps_.size(); i++) {\n        hash_maps_[i] = torch::Tensor();\n    }\n    hash_maps_ = {};\n}\n\nvoid MariusGraph::to(torch::Device device) {\n    node_ids_ = node_ids_.to(device);\n    
src_sorted_edges_ = src_sorted_edges_.to(device);\n    dst_sorted_edges_ = dst_sorted_edges_.to(device);\n    out_sorted_uniques_ = out_sorted_uniques_.to(device);\n    out_offsets_ = out_offsets_.to(device);\n    out_num_neighbors_ = out_num_neighbors_.to(device);\n    in_sorted_uniques_ = in_sorted_uniques_.to(device);\n    in_offsets_ = in_offsets_.to(device);\n}\n\n// 1 hop sampler\nstd::tuple<torch::Tensor, torch::Tensor> MariusGraph::getNeighborsForNodeIds(torch::Tensor node_ids, bool incoming,\n                                                                             NeighborSamplingLayer neighbor_sampling_layer, int max_neighbors_size,\n                                                                             float rate) {\n    int gpu = 0;\n\n    if (node_ids.is_cuda()) {\n        gpu = 1;\n    }\n\n    //    auto device_options = torch::TensorOptions().dtype(torch::kInt64).device(node_ids.device());\n\n    Indices in_memory_ids;\n    torch::Tensor mask;\n    torch::Tensor num_neighbors = torch::empty_like(node_ids);\n    Indices global_offsets = torch::empty_like(node_ids);\n\n    if (incoming) {\n        if (gpu) {\n            num_neighbors = in_num_neighbors_.index_select(0, node_ids);\n            global_offsets = in_offsets_.index_select(0, node_ids);\n        } else {\n            auto in_num_neighbors_accessor = in_num_neighbors_.accessor<int64_t, 1>();\n            auto in_offsets_accessor = in_offsets_.accessor<int64_t, 1>();\n\n            auto num_neighbors_accessor = num_neighbors.accessor<int64_t, 1>();\n            auto global_offsets_accessor = global_offsets.accessor<int64_t, 1>();\n            auto node_ids_accessor = node_ids.accessor<int64_t, 1>();\n\n#pragma omp parallel for\n            for (int64_t i = 0; i < node_ids.size(0); i++) {\n                num_neighbors_accessor[i] = in_num_neighbors_accessor[node_ids_accessor[i]];\n                global_offsets_accessor[i] = in_offsets_accessor[node_ids_accessor[i]];\n            
}\n        }\n    } else {\n        if (gpu) {\n            num_neighbors = out_num_neighbors_.index_select(0, node_ids);\n            global_offsets = out_offsets_.index_select(0, node_ids);\n        } else {\n            auto out_num_neighbors_accessor = out_num_neighbors_.accessor<int64_t, 1>();\n            auto out_offsets_accessor = out_offsets_.accessor<int64_t, 1>();\n\n            auto num_neighbors_accessor = num_neighbors.accessor<int64_t, 1>();\n            auto global_offsets_accessor = global_offsets.accessor<int64_t, 1>();\n            auto node_ids_accessor = node_ids.accessor<int64_t, 1>();\n\n#pragma omp parallel for\n            for (int64_t i = 0; i < node_ids.size(0); i++) {\n                num_neighbors_accessor[i] = out_num_neighbors_accessor[node_ids_accessor[i]];\n                global_offsets_accessor[i] = out_offsets_accessor[node_ids_accessor[i]];\n            }\n        }\n    }\n\n    torch::Tensor summed_num_neighbors;\n    Indices local_offsets = torch::empty_like(node_ids);\n    int64_t total_neighbors;\n    if (neighbor_sampling_layer != NeighborSamplingLayer::UNIFORM or gpu) {\n        summed_num_neighbors = num_neighbors.cumsum(0);\n        local_offsets = summed_num_neighbors - num_neighbors;\n        total_neighbors = summed_num_neighbors[-1].item<int64_t>();\n    }\n\n    std::tuple<torch::Tensor, torch::Tensor> ret;\n\n    torch::Tensor edges;\n    int64_t max_id;\n\n    if (incoming) {\n        edges = dst_sorted_edges_;\n        max_id = max_in_num_neighbors_;\n    } else {\n        edges = src_sorted_edges_;\n        max_id = max_out_num_neighbors_;\n    }\n\n    switch (neighbor_sampling_layer) {\n        case NeighborSamplingLayer::ALL: {\n            if (gpu) {\n                ret = sample_all_gpu(edges, global_offsets, local_offsets, num_neighbors);\n            } else {\n                ret = sample_all_cpu(edges, global_offsets, local_offsets, num_neighbors, total_neighbors);\n            }\n            break;\n   
     }\n        case NeighborSamplingLayer::UNIFORM: {\n            if (gpu) {\n                ret = sample_uniform_gpu(edges, global_offsets, local_offsets, num_neighbors, max_neighbors_size, max_id);\n            } else {\n                ret = sample_uniform_cpu(edges, global_offsets, local_offsets, num_neighbors, max_neighbors_size, total_neighbors);\n            }\n            break;\n        }\n        case NeighborSamplingLayer::DROPOUT: {\n            if (gpu) {\n                ret = sample_dropout_gpu(edges, global_offsets, local_offsets, num_neighbors, rate);\n            } else {\n                ret = sample_dropout_cpu(edges, global_offsets, local_offsets, num_neighbors, rate, total_neighbors);\n            }\n            break;\n        }\n    }\n    return ret;\n}\n\nvoid MariusGraph::sortAllEdges(EdgeList all_edges) {\n    all_src_sorted_edges_ = all_edges.index_select(0, all_edges.select(1, 0).argsort(0, false)).to(torch::kInt64);\n    all_dst_sorted_edges_ = all_edges.index_select(0, all_edges.select(1, -1).argsort(0, false)).to(torch::kInt64);\n}\n\nDENSEGraph::DENSEGraph(){};\n\nDENSEGraph::DENSEGraph(Indices hop_offsets, Indices node_ids, Indices in_offsets, std::vector<torch::Tensor> in_neighbors_vec, Indices in_neighbors_mapping,\n                       Indices out_offsets, std::vector<torch::Tensor> out_neighbors_vec, Indices out_neighbors_mapping, int num_nodes_in_memory) {\n    hop_offsets_ = hop_offsets;\n    node_ids_ = node_ids;\n    in_offsets_ = in_offsets;\n    in_neighbors_vec_ = in_neighbors_vec;\n    in_neighbors_mapping_ = in_neighbors_mapping;\n    out_offsets_ = out_offsets;\n    out_neighbors_vec_ = out_neighbors_vec;\n    out_neighbors_mapping_ = out_neighbors_mapping;\n    num_nodes_in_memory_ = num_nodes_in_memory;\n}\n\nDENSEGraph::~DENSEGraph() { clear(); }\n\nvoid DENSEGraph::clear() {\n    MariusGraph::clear();\n\n    hop_offsets_ = torch::Tensor();\n\n    in_neighbors_mapping_ = torch::Tensor();\n    
out_neighbors_mapping_ = torch::Tensor();\n\n    in_neighbors_vec_ = {};\n    out_neighbors_vec_ = {};\n\n    node_properties_ = torch::Tensor();\n}\n\nvoid DENSEGraph::to(torch::Device device, CudaStream *compute_stream, CudaStream *transfer_stream) {\n    node_ids_ = transfer_tensor(node_ids_, device, compute_stream, transfer_stream);\n    hop_offsets_ = transfer_tensor(hop_offsets_, device, compute_stream, transfer_stream);\n\n    out_offsets_ = transfer_tensor(out_offsets_, device, compute_stream, transfer_stream);\n\n    in_offsets_ = transfer_tensor(in_offsets_, device, compute_stream, transfer_stream);\n\n    for (int i = 0; i < in_neighbors_vec_.size(); i++) {\n        in_neighbors_vec_[i] = transfer_tensor(in_neighbors_vec_[i], device, compute_stream, transfer_stream);\n    }\n\n    for (int i = 0; i < out_neighbors_vec_.size(); i++) {\n        out_neighbors_vec_[i] = transfer_tensor(out_neighbors_vec_[i], device, compute_stream, transfer_stream);\n    }\n\n    node_properties_ = transfer_tensor(node_properties_, device, compute_stream, transfer_stream);\n}\n\nint64_t DENSEGraph::getLayerOffset() { return hop_offsets_[1].item<int64_t>(); }\n\nvoid DENSEGraph::prepareForNextLayer() {\n    int64_t num_nodes_to_remove = (hop_offsets_[1] - hop_offsets_[0]).item<int64_t>();\n    int64_t num_finished_nodes = (hop_offsets_[2] - hop_offsets_[1]).item<int64_t>();\n\n    if (src_sorted_edges_.size(0) > 0) {\n        if (num_finished_nodes == out_offsets_.size(0)) {\n            return;\n        }\n        int64_t finished_out_neighbors = out_offsets_[num_finished_nodes].item<int64_t>();\n        src_sorted_edges_ = src_sorted_edges_.narrow(0, finished_out_neighbors, src_sorted_edges_.size(0) - finished_out_neighbors);\n        out_neighbors_mapping_ =\n            out_neighbors_mapping_.narrow(0, finished_out_neighbors, out_neighbors_mapping_.size(0) - finished_out_neighbors) - num_nodes_to_remove;\n        out_offsets_ = out_offsets_.narrow(0, num_finished_nodes, 
out_offsets_.size(0) - num_finished_nodes) - finished_out_neighbors;\n    }\n    out_num_neighbors_ = out_num_neighbors_.narrow(0, num_finished_nodes, out_num_neighbors_.size(0) - num_finished_nodes);\n\n    if (dst_sorted_edges_.size(0) > 0) {\n        if (num_finished_nodes == in_offsets_.size(0)) {\n            return;\n        }\n        int64_t finished_in_neighbors = in_offsets_[num_finished_nodes].item<int64_t>();\n        dst_sorted_edges_ = dst_sorted_edges_.narrow(0, finished_in_neighbors, dst_sorted_edges_.size(0) - finished_in_neighbors);\n        in_neighbors_mapping_ =\n            in_neighbors_mapping_.narrow(0, finished_in_neighbors, in_neighbors_mapping_.size(0) - finished_in_neighbors) - num_nodes_to_remove;\n        in_offsets_ = in_offsets_.narrow(0, num_finished_nodes, in_offsets_.size(0) - num_finished_nodes) - finished_in_neighbors;\n    }\n    in_num_neighbors_ = in_num_neighbors_.narrow(0, num_finished_nodes, in_num_neighbors_.size(0) - num_finished_nodes);\n\n    node_ids_ = node_ids_.narrow(0, num_nodes_to_remove, node_ids_.size(0) - num_nodes_to_remove);\n    hop_offsets_ = hop_offsets_.narrow(0, 1, hop_offsets_.size(0) - 1) - num_nodes_to_remove;\n}\n\nIndices DENSEGraph::getNeighborIDs(bool incoming, bool global_ids) {\n    if (global_ids) {\n        // return global node ids\n        if (incoming) {\n            return dst_sorted_edges_.select(1, 0);\n        } else {\n            return src_sorted_edges_.select(1, -1);\n        }\n    } else {\n        // return node ids local to the batch\n        if (incoming) {\n            return in_neighbors_mapping_;\n        } else {\n            return out_neighbors_mapping_;\n        }\n    }\n}\n\nstd::tuple<torch::Tensor, torch::Tensor, torch::Tensor> DENSEGraph::getCombinedNeighborIDs() {\n    torch::Tensor new_offsets = in_offsets_ + out_offsets_;\n    torch::Tensor new_num_neighbors = in_num_neighbors_ + out_num_neighbors_;\n    torch::Tensor new_mapping = 
torch::empty(in_neighbors_mapping_.size(0) + out_neighbors_mapping_.size(0), in_neighbors_mapping_.options());\n\n    torch::Tensor repeated_starts = new_offsets.repeat_interleave(in_num_neighbors_);\n    torch::Tensor repeated_offsets = in_offsets_.repeat_interleave(in_num_neighbors_);\n    torch::Tensor arange = torch::arange(repeated_offsets.size(0), repeated_offsets.options());\n    torch::Tensor incoming_indices = repeated_starts + arange - repeated_offsets;\n\n    torch::Tensor global_offsets = new_offsets + in_num_neighbors_;\n    repeated_starts = global_offsets.repeat_interleave(out_num_neighbors_);\n    repeated_offsets = out_offsets_.repeat_interleave(out_num_neighbors_);\n    arange = torch::arange(repeated_offsets.size(0), repeated_offsets.options());\n    torch::Tensor outgoing_indices = repeated_starts + arange - repeated_offsets;\n\n    new_mapping.index_copy_(0, incoming_indices, in_neighbors_mapping_);\n    new_mapping.index_copy_(0, outgoing_indices, out_neighbors_mapping_);\n\n    return std::forward_as_tuple(new_offsets, new_num_neighbors, new_mapping);\n}\n\nvoid DENSEGraph::performMap() {\n    if (!node_ids_.defined()) {\n        return;\n    }\n\n    auto device_options = torch::TensorOptions().dtype(torch::kInt64).device(node_ids_.device());\n\n    torch::Tensor local_id_to_batch_map = torch::zeros({num_nodes_in_memory_}, device_options);\n\n    local_id_to_batch_map.index_copy_(0, node_ids_, torch::arange(node_ids_.size(0), device_options));\n\n    if (out_neighbors_vec_.size() > 0) {\n        src_sorted_edges_ = torch::cat({out_neighbors_vec_}, 0);\n        out_neighbors_mapping_ = local_id_to_batch_map.gather(0, src_sorted_edges_.select(1, -1));\n\n        out_neighbors_vec_ = {};\n\n        torch::Tensor tmp_out_offsets = torch::cat({out_offsets_, torch::tensor({src_sorted_edges_.size(0)}, out_offsets_.device())});\n        out_num_neighbors_ = tmp_out_offsets.narrow(0, 1, out_offsets_.size(0)) - tmp_out_offsets.narrow(0, 0, 
out_offsets_.size(0));\n    } else {\n        out_num_neighbors_ = torch::zeros({node_ids_.size(0)}, device_options);\n    }\n\n    if (in_neighbors_vec_.size() > 0) {\n        dst_sorted_edges_ = torch::cat({in_neighbors_vec_}, 0);\n        in_neighbors_mapping_ = local_id_to_batch_map.gather(0, dst_sorted_edges_.select(1, 0));\n\n        in_neighbors_vec_ = {};\n\n        torch::Tensor tmp_in_offsets = torch::cat({in_offsets_, torch::tensor({dst_sorted_edges_.size(0)}, in_offsets_.device())});\n        in_num_neighbors_ = tmp_in_offsets.narrow(0, 1, in_offsets_.size(0)) - tmp_in_offsets.narrow(0, 0, in_offsets_.size(0));\n    } else {\n        in_num_neighbors_ = torch::zeros({node_ids_.size(0)}, device_options);\n    }\n\n    // only works for torch > 1.8\n    //    in_num_neighbors_ = torch::diff(in_offsets_, 1, 0, {}, torch::tensor({dst_sorted_edges_.size(0)}, in_offsets_.device()));\n    //    out_num_neighbors_ = torch::diff(out_offsets_, 1, 0, {}, torch::tensor({src_sorted_edges_.size(0)}, out_offsets_.device()));\n}\n\nvoid DENSEGraph::setNodeProperties(torch::Tensor node_properties) {\n    assert(node_properties.size(0) == node_ids_.size(0));\n    node_properties_ = node_properties;\n}\n"
  },
  {
    "path": "src/cpp/src/data/ordering.cpp",
    "content": "//\n// Created by Jason Mohoney on 7/17/20.\n//\n#ifdef MARIUS_OMP\n    #include \"omp.h\"\n#endif\n\n#include \"common/datatypes.h\"\n#include \"data/ordering.h\"\n#include \"reporting/logger.h\"\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getEdgeBucketOrdering(EdgeBucketOrdering edge_bucket_ordering, int num_partitions, int buffer_capacity,\n                                                                               int fine_to_coarse_ratio, int num_cache_partitions,\n                                                                               bool randomly_assign_edge_buckets) {\n    switch (edge_bucket_ordering) {\n        case EdgeBucketOrdering::OLD_BETA:\n            SPDLOG_INFO(\"Generating Old Beta Ordering\");\n            return getTwoLevelBetaOrdering(num_partitions, buffer_capacity, 1, 0, false);\n        case EdgeBucketOrdering::NEW_BETA:\n            SPDLOG_INFO(\"Generating New Beta Ordering\");\n            return getTwoLevelBetaOrdering(num_partitions, buffer_capacity, 1, 0, true);\n        case EdgeBucketOrdering::ALL_BETA:\n            return getCustomEdgeBucketOrdering();\n        case EdgeBucketOrdering::COMET:\n            SPDLOG_INFO(\"Generating COMET Ordering\");\n            return getTwoLevelBetaOrdering(num_partitions, buffer_capacity, fine_to_coarse_ratio, num_cache_partitions, randomly_assign_edge_buckets);\n        case EdgeBucketOrdering::CUSTOM:\n            return getCustomEdgeBucketOrdering();\n        default:\n            SPDLOG_ERROR(\"Not implemented\");\n            std::tuple<vector<torch::Tensor>, vector<torch::Tensor>> ret;\n            return ret;\n    }\n}\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getNodePartitionOrdering(NodePartitionOrdering node_partition_ordering, Indices train_nodes,\n                                                                                  int64_t total_num_nodes, int num_partitions, int buffer_capacity,\n                               
                                                   int fine_to_coarse_ratio, int num_cache_partitions) {\n    switch (node_partition_ordering) {\n        case NodePartitionOrdering::DISPERSED:\n            SPDLOG_INFO(\"Generating Dispersed Ordering\");\n            return getDispersedNodePartitionOrdering(train_nodes, total_num_nodes, num_partitions, buffer_capacity, fine_to_coarse_ratio, num_cache_partitions);\n        case NodePartitionOrdering::SEQUENTIAL:\n            SPDLOG_INFO(\"Generating Sequential Ordering\");\n            return getSequentialNodePartitionOrdering(train_nodes, total_num_nodes, num_partitions, buffer_capacity);\n        case NodePartitionOrdering::CUSTOM:\n            return getCustomNodePartitionOrdering();\n        default:\n            SPDLOG_ERROR(\"Not implemented\");\n            std::tuple<vector<torch::Tensor>, vector<torch::Tensor>> ret;\n            return ret;\n    }\n}\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> convertEdgeBucketOrderToTensors(vector<vector<int>> buffer_states,\n                                                                                         vector<vector<std::pair<int, int>>> edge_buckets_per_buffer) {\n    vector<torch::Tensor> ret_buffer_states;\n    vector<torch::Tensor> ret_edge_buckets_per_buffer;\n\n    for (auto b : buffer_states) {\n        ret_buffer_states.emplace_back(torch::tensor(b, torch::kInt64));\n    }\n\n    for (auto edge_buckets : edge_buckets_per_buffer) {\n        torch::Tensor tmp = torch::zeros({(int64_t)edge_buckets.size(), 2}, torch::kInt64);\n\n        for (int i = 0; i < edge_buckets.size(); i++) {\n            tmp[i][0] = std::get<0>(edge_buckets[i]);\n            tmp[i][1] = std::get<1>(edge_buckets[i]);\n        }\n\n        ret_edge_buckets_per_buffer.emplace_back(tmp);\n    }\n\n    return std::forward_as_tuple(ret_buffer_states, ret_edge_buckets_per_buffer);\n}\n\nvector<vector<int>> getBetaOrderingHelper(int num_partitions, int buffer_capacity) {\n    
vector<vector<int>> buffer_states;\n    Indices all_partitions = torch::randperm(num_partitions, torch::kInt32);\n\n    // get all buffer states\n    Indices in_buffer = all_partitions.index_select(0, torch::arange(buffer_capacity));\n\n    Indices combined = torch::cat({all_partitions, in_buffer});\n    auto uniques = torch::_unique2(combined, true, false, true);\n    auto vals = std::get<0>(uniques);\n    auto counts = std::get<2>(uniques);\n    Indices on_disk = vals.masked_select(counts == 1);\n\n    int *data_ptr_ = (int *)in_buffer.data_ptr();\n    buffer_states.emplace_back(vector<int>(data_ptr_, data_ptr_ + in_buffer.size(0)));\n\n    while (on_disk.size(0) >= 1) {\n        in_buffer = in_buffer.index_select(0, torch::randperm(in_buffer.size(0), torch::kInt64));\n        on_disk = on_disk.index_select(0, torch::randperm(on_disk.size(0), torch::kInt64));\n\n        for (int i = 0; i < on_disk.size(0); i++) {\n            auto admit_id = on_disk[i].clone();\n\n            on_disk[i] = in_buffer[-1];\n\n            in_buffer[-1] = admit_id;\n\n            data_ptr_ = (int *)in_buffer.data_ptr();\n            buffer_states.emplace_back(vector<int>(data_ptr_, data_ptr_ + in_buffer.size(0)));\n        }\n\n        on_disk = on_disk.index_select(0, torch::randperm(on_disk.size(0), torch::kInt64));\n\n        int num_replaced = 0;\n        for (int i = 0; i < buffer_capacity - 1; i++) {\n            if (i >= on_disk.size(0)) {\n                break;\n            }\n            num_replaced++;\n            in_buffer[i] = on_disk[i];\n\n            data_ptr_ = (int *)in_buffer.data_ptr();\n            buffer_states.emplace_back(vector<int>(data_ptr_, data_ptr_ + in_buffer.size(0)));\n        }\n        on_disk = on_disk.narrow(0, num_replaced, on_disk.size(0) - num_replaced);\n    }\n\n    return buffer_states;\n}\n\nvector<vector<std::pair<int, int>>> greedyAssignEdgeBucketsToBuffers(vector<vector<int>> buffer_states, int num_partitions) {\n    
vector<vector<std::pair<int, int>>> edge_buckets_per_buffer(buffer_states.size());\n    torch::Tensor interacted = torch::zeros({num_partitions, num_partitions}, torch::kInt32);\n    auto interacted_accessor = interacted.accessor<int32_t, 2>();\n\n    for (int i = 0; i < buffer_states.size(); i++) {\n        for (int j = 0; j < buffer_states[i].size(); j++) {\n            for (int k = 0; k < buffer_states[i].size(); k++) {\n                int32_t src_part = buffer_states[i][j];\n                int32_t dst_part = buffer_states[i][k];\n                if (interacted_accessor[src_part][dst_part] == 1) {\n                    continue;\n                }\n                interacted_accessor[src_part][dst_part] = 1;\n                edge_buckets_per_buffer[i].emplace_back(std::make_pair(src_part, dst_part));\n            }\n        }\n    }\n\n    return edge_buckets_per_buffer;\n}\n\nvector<vector<std::pair<int, int>>> randomlyAssignEdgeBucketsToBuffers(vector<vector<int>> buffer_states, int num_partitions) {\n    // get edge buckets from buffer states\n    Indices all_partitions = torch::arange(num_partitions, torch::kInt32);\n    torch::Tensor left_col = all_partitions.repeat_interleave(num_partitions);\n    torch::Tensor right_col = all_partitions.repeat({num_partitions});\n    torch::Tensor all_buckets = torch::stack({left_col, right_col}, 1);\n    auto all_buckets_accessor = all_buckets.accessor<int32_t, 2>();\n\n    int num_buffers = buffer_states.size();\n    int buffer_size = buffer_states[0].size();\n    int num_buckets = all_buckets.size(0);\n\n    torch::Tensor choices = torch::zeros({num_buckets, num_buffers}, torch::kInt32);\n    int32_t *choices_mem = choices.data_ptr<int32_t>();\n\n#pragma omp parallel for\n    for (int i = 0; i < num_buffers; i++) {\n        for (int j = 0; j < buffer_size; j++) {\n            for (int k = j; k < buffer_size; k++) {\n                int src_part = buffer_states[i][j];\n                int dst_part = 
buffer_states[i][k];\n                *(choices_mem + (src_part * num_partitions + dst_part) * num_buffers + i) = 1;\n                *(choices_mem + (dst_part * num_partitions + src_part) * num_buffers + i) = 1;\n            }\n        }\n    }\n\n    torch::Tensor pick = torch::zeros({num_buckets}, torch::kInt32);\n    torch::Tensor pick_one_hot = torch::zeros({num_buckets, num_buffers}, torch::kInt32);\n    int32_t *pick_mem = pick.data_ptr<int32_t>();\n    int32_t *pick_one_hot_mem = pick_one_hot.data_ptr<int32_t>();\n    auto pick_accessor = pick.accessor<int32_t, 1>();\n\n    // setup seeds\n    unsigned int num_threads = 1;\n#ifdef MARIUS_OMP\n    #pragma omp parallel\n    {\n    #pragma omp single\n        num_threads = omp_get_num_threads();\n    }\n#endif\n    std::vector<unsigned int> tid_seeds(num_threads);\n\n    for (int i = 0; i < num_threads; i++) {\n        tid_seeds[i] = rand();\n    }\n\n#pragma omp parallel\n    {\n#ifdef MARIUS_OMP\n        unsigned int seed = tid_seeds[omp_get_thread_num()];\n#else\n        unsigned int seed = tid_seeds[0];\n#endif\n\n#pragma omp for\n        for (int i = 0; i < num_buckets; i++) {\n            torch::Tensor buffer_choices = torch::nonzero(choices[i]);\n            buffer_choices = torch::reshape(buffer_choices, {buffer_choices.size(0)});\n            int32_t buffer_choice = buffer_choices[rand_r(&seed) % buffer_choices.size(0)].item<int32_t>();\n\n            int32_t src_part = all_buckets_accessor[i][0];\n            int32_t dst_part = all_buckets_accessor[i][1];\n            *(pick_mem + (src_part * num_partitions + dst_part)) = buffer_choice;\n            *(pick_one_hot_mem + (src_part * num_partitions + dst_part) * num_buffers + buffer_choice) = 1;\n        }\n    }\n\n    torch::Tensor num_edge_buckets_per_buffer = torch::sum(pick_one_hot, 0);\n\n    vector<vector<std::pair<int, int>>> edge_buckets_per_buffer(num_buffers);\n    for (int i = 0; i < num_buffers; i++) {\n        edge_buckets_per_buffer[i] = 
vector<std::pair<int, int>>(num_edge_buckets_per_buffer[i].item<int>());\n    }\n\n    vector<int> indices(num_buffers, 0);\n    for (int i = 0; i < num_buckets; i++) {\n        int32_t src_part = all_buckets_accessor[i][0];\n        int32_t dst_part = all_buckets_accessor[i][1];\n        std::pair<int, int> pair = std::make_pair(src_part, dst_part);\n\n        int32_t buffer_choice = pick_accessor[i];\n\n        edge_buckets_per_buffer[buffer_choice][indices[buffer_choice]] = pair;\n        indices[buffer_choice] += 1;\n    }\n\n    return edge_buckets_per_buffer;\n}\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getTwoLevelBetaOrdering(int num_partitions, int buffer_capacity, int fine_to_coarse_ratio,\n                                                                                 int num_cache_partitions, bool randomly_assign_edge_buckets) {\n    int coarse_num_partitions = num_partitions / fine_to_coarse_ratio;\n    int coarse_buffer_capacity = buffer_capacity / fine_to_coarse_ratio;\n\n    coarse_num_partitions = coarse_num_partitions - num_cache_partitions;\n    coarse_buffer_capacity = coarse_buffer_capacity - num_cache_partitions;\n\n    vector<vector<int>> coarse_buffer_states = getBetaOrderingHelper(coarse_num_partitions, coarse_buffer_capacity);\n\n    int cached_fine_partitions = num_cache_partitions * fine_to_coarse_ratio;\n    torch::Tensor fine_to_coarse_map = torch::arange(cached_fine_partitions, torch::kInt32);\n    fine_to_coarse_map = torch::cat({fine_to_coarse_map, torch::randperm(num_partitions - cached_fine_partitions, torch::kInt32) + cached_fine_partitions});\n    int *data_ptr_ = (int *)fine_to_coarse_map.data_ptr();\n\n    for (int i = 0; i < coarse_buffer_states.size(); i++) {\n        for (int j = 0; j < coarse_buffer_states[i].size(); j++) {\n            coarse_buffer_states[i][j] += num_cache_partitions;\n        }\n        for (int j = 0; j < num_cache_partitions; j++) {\n            
coarse_buffer_states[i].emplace_back(j);\n        }\n    }\n\n    // convert to fine buffer states\n    vector<vector<int>> buffer_states;\n\n    for (int i = 0; i < coarse_buffer_states.size(); i++) {\n        vector<int> fine_buffer_state(buffer_capacity, 0);\n        for (int j = 0; j < coarse_buffer_states[i].size(); j++) {\n            int *start = (int *)data_ptr_ + coarse_buffer_states[i][j] * fine_to_coarse_ratio;\n            int *end = (int *)data_ptr_ + (coarse_buffer_states[i][j] + 1) * fine_to_coarse_ratio;\n            vector<int> fine_partitions = vector<int>(start, end);\n\n            for (int k = j * fine_to_coarse_ratio; k < (j + 1) * fine_to_coarse_ratio; k++) {\n                fine_buffer_state[k] = fine_partitions[k - j * fine_to_coarse_ratio];\n            }\n        }\n\n        buffer_states.emplace_back(fine_buffer_state);\n    }\n\n    // assign edge buckets\n    vector<vector<std::pair<int, int>>> edge_buckets_per_buffer;\n    if (randomly_assign_edge_buckets) {\n        edge_buckets_per_buffer = randomlyAssignEdgeBucketsToBuffers(buffer_states, num_partitions);\n    } else {\n        edge_buckets_per_buffer = greedyAssignEdgeBucketsToBuffers(buffer_states, num_partitions);\n    }\n\n    return convertEdgeBucketOrderToTensors(buffer_states, edge_buckets_per_buffer);\n}\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getDispersedNodePartitionOrdering(Indices train_nodes, int64_t total_num_nodes, int num_partitions,\n                                                                                           int buffer_capacity, int fine_to_coarse_ratio,\n                                                                                           int num_cache_partitions) {\n    int coarse_num_partitions = num_partitions / fine_to_coarse_ratio;\n    int coarse_buffer_capacity = buffer_capacity / fine_to_coarse_ratio;\n\n    coarse_num_partitions = coarse_num_partitions - num_cache_partitions;\n    coarse_buffer_capacity = 
coarse_buffer_capacity - num_cache_partitions;\n\n    // create coarse buffer states\n    vector<torch::Tensor> coarse_buffer_states;\n    Indices all_coarse_partitions = torch::randperm(coarse_num_partitions, torch::kInt32);\n    Indices in_buffer = all_coarse_partitions.narrow(0, 0, coarse_buffer_capacity);\n    Indices on_disk = all_coarse_partitions.narrow(0, coarse_buffer_capacity, coarse_num_partitions - coarse_buffer_capacity);\n    coarse_buffer_states.emplace_back(in_buffer);\n\n    while (on_disk.size(0) > 0) {\n        in_buffer = in_buffer.index_select(0, torch::randperm(in_buffer.size(0), torch::kInt64));\n        on_disk = on_disk.index_select(0, torch::randperm(on_disk.size(0), torch::kInt64));\n\n        in_buffer[-1] = on_disk[0];\n        coarse_buffer_states.emplace_back(in_buffer);\n        on_disk = on_disk.narrow(0, 1, on_disk.size(0) - 1);\n    }\n\n    for (int i = 0; i < coarse_buffer_states.size(); i++) {\n        coarse_buffer_states[i] =\n            torch::cat({coarse_buffer_states[i] + num_cache_partitions, torch::arange(num_cache_partitions, coarse_buffer_states[i].options())});\n    }\n\n    // convert to fine buffer states\n    torch::Tensor fine_to_coarse_map = torch::randperm(num_partitions, torch::kInt32);\n    int *data_ptr_ = (int *)fine_to_coarse_map.data_ptr();\n\n    vector<torch::Tensor> buffer_states;\n\n    for (int i = 0; i < coarse_buffer_states.size(); i++) {\n        vector<int> fine_buffer_state(buffer_capacity, 0);\n        torch::Tensor coarse_buffer_state = coarse_buffer_states[i];\n        auto coarse_buffer_state_accessor = coarse_buffer_state.accessor<int32_t, 1>();\n\n        for (int j = 0; j < coarse_buffer_state.size(0); j++) {\n            int *start = (int *)data_ptr_ + coarse_buffer_state_accessor[j] * fine_to_coarse_ratio;\n            int *end = (int *)data_ptr_ + (coarse_buffer_state_accessor[j] + 1) * fine_to_coarse_ratio;\n            vector<int> fine_partitions = vector<int>(start, end);\n\n        
    for (int k = j * fine_to_coarse_ratio; k < (j + 1) * fine_to_coarse_ratio; k++) {\n                fine_buffer_state[k] = fine_partitions[k - j * fine_to_coarse_ratio];\n            }\n        }\n\n        buffer_states.emplace_back(torch::from_blob(fine_buffer_state.data(), {(int)fine_buffer_state.size()}, torch::kInt32).clone());\n    }\n\n    // randomly assign train nodes to buffers\n\n    int64_t partition_size = ceil((double)total_num_nodes / num_partitions);\n    torch::Tensor train_nodes_partition = train_nodes.divide(partition_size, \"trunc\");\n\n    std::vector<std::vector<int>> partition_buffer_states(num_partitions);\n\n    for (int i = 0; i < num_partitions; i++) {\n        for (int j = 0; j < buffer_states.size(); j++) {\n            bool partition_in_buffer = false;\n            auto buffer_state_accessor = buffer_states[j].accessor<int32_t, 1>();\n\n            for (int k = 0; k < buffer_capacity; k++) {\n                if (buffer_state_accessor[k] == i) {\n                    partition_in_buffer = true;\n                    break;\n                }\n            }\n            if (partition_in_buffer) {\n                partition_buffer_states[i].emplace_back(j);\n            }\n        }\n    }\n\n    torch::Tensor train_nodes_buffer_choice = torch::zeros_like(train_nodes);\n    std::vector<torch::Tensor> train_nodes_per_buffer(buffer_states.size());\n    auto train_nodes_partition_accessor = train_nodes_partition.accessor<int32_t, 1>();  // todo\n\n    for (int i = 0; i < train_nodes.size(0); i++) {\n        int partition_id = train_nodes_partition_accessor[i];\n        int rand_id = rand() % partition_buffer_states[partition_id].size();\n        train_nodes_buffer_choice[i] = partition_buffer_states[partition_id][rand_id];\n    }\n\n    for (int i = 0; i < buffer_states.size(); i++) {\n        train_nodes_per_buffer[i] = train_nodes.masked_select(train_nodes_buffer_choice == i);\n    }\n\n    return std::forward_as_tuple(buffer_states, 
train_nodes_per_buffer);\n}\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getSequentialNodePartitionOrdering(Indices train_nodes, int64_t total_num_nodes, int num_partitions,\n                                                                                            int buffer_capacity) {\n    int64_t partition_size = ceil((double)total_num_nodes / num_partitions);\n    torch::Tensor train_nodes_partition = train_nodes.divide(partition_size, \"trunc\");\n\n    int32_t max_train_partition = torch::max(train_nodes_partition).item<int32_t>();\n    int32_t num_train_partitions = max_train_partition + 1;\n    SPDLOG_INFO(\"Num Train Partitions: {}\", num_train_partitions);\n\n    vector<torch::Tensor> buffer_states;\n    Indices in_buffer = torch::arange(num_train_partitions, torch::kInt32);\n    Indices on_disk = torch::arange(num_train_partitions, num_partitions, torch::kInt32);\n    on_disk = on_disk.index_select(0, torch::randperm(on_disk.size(0), torch::kInt64));\n    on_disk = on_disk.narrow(0, 0, buffer_capacity - num_train_partitions);\n\n    buffer_states.emplace_back(torch::cat({in_buffer, on_disk}));\n\n    std::vector<torch::Tensor> train_nodes_per_buffer;\n    train_nodes_per_buffer.emplace_back(train_nodes.clone());\n\n    return std::forward_as_tuple(buffer_states, train_nodes_per_buffer);\n}\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getCustomNodePartitionOrdering() {\n    SPDLOG_ERROR(\"Not implemented\");\n    std::tuple<vector<torch::Tensor>, vector<torch::Tensor>> ret;\n    return ret;\n}\n\nstd::tuple<vector<torch::Tensor>, vector<torch::Tensor>> getCustomEdgeBucketOrdering() {\n    SPDLOG_ERROR(\"Not implemented\");\n    std::tuple<vector<torch::Tensor>, vector<torch::Tensor>> ret;\n    return ret;\n}\n"
  },
  {
    "path": "src/cpp/src/data/samplers/edge.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/8/22.\n//\n\n#include \"data/samplers/edge.h\"\n\nRandomEdgeSampler::RandomEdgeSampler(shared_ptr<GraphModelStorage> graph_storage, bool without_replacement) {\n    graph_storage_ = graph_storage;\n    without_replacement_ = without_replacement;\n}\n\nEdgeList RandomEdgeSampler::getEdges(shared_ptr<Batch> batch) {\n    return graph_storage_->getEdgesRange(batch->start_idx_, batch->batch_size_).clone().to(torch::kInt64);\n}\n"
  },
  {
    "path": "src/cpp/src/data/samplers/negative.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/8/22.\n//\n\n#include \"data/samplers/negative.h\"\n\nstd::tuple<torch::Tensor, torch::Tensor> batch_sample(torch::Tensor edges, int num_negatives, bool inverse) {\n    auto device = edges.device();\n    int64_t batch_size = edges.size(0);\n    Indices sample_edge_id = torch::randint(0, batch_size, {num_negatives}, device).to(torch::kInt64);\n    torch::Tensor edge_sample;\n\n    if (inverse) {\n        edge_sample = edges.index_select(0, sample_edge_id).select(1, 0);\n    } else {\n        edge_sample = edges.index_select(0, sample_edge_id).select(1, -1);\n    }\n    return std::forward_as_tuple(edge_sample, sample_edge_id);\n}\n\ntorch::Tensor deg_negative_local_filter(torch::Tensor deg_sample_indices, torch::Tensor edges) {\n    if (!deg_sample_indices.defined()) {\n        torch::TensorOptions ind_opts = torch::TensorOptions().dtype(torch::kInt64).device(edges.device());\n        return torch::empty({0, 2}, ind_opts);\n    }\n\n    int64_t num_chunks = deg_sample_indices.size(0);\n    int64_t chunk_size = ceil((double)edges.size(0) / num_chunks);\n    int64_t num_deg_negs = deg_sample_indices.size(1);\n\n    torch::Tensor chunk_ids = deg_sample_indices.div(chunk_size, \"trunc\");\n    torch::Tensor inv_mask = chunk_ids - torch::arange(0, num_chunks, deg_sample_indices.device()).view({num_chunks, -1});\n    torch::Tensor mask = (inv_mask == 0);\n    torch::Tensor temp_idx = torch::nonzero(mask);\n    torch::Tensor id_offsets = deg_sample_indices.flatten(0, 1).index_select(0, temp_idx.select(1, 0) * num_deg_negs + temp_idx.select(1, 1));\n\n    torch::Tensor filter = torch::stack({id_offsets, temp_idx.select(1, 1)}).transpose(0, 1);\n    return filter;\n}\n\ntorch::Tensor compute_filter_corruption(shared_ptr<MariusGraph> graph, torch::Tensor edges, torch::Tensor corruption_nodes, bool inverse, bool global,\n                                        LocalFilterMode local_filter_mode, torch::Tensor 
deg_sample_indices) {\n    if (edges.is_cuda()) {\n        return compute_filter_corruption_gpu(graph, edges, corruption_nodes, inverse, global, local_filter_mode, deg_sample_indices);\n    } else {\n        return compute_filter_corruption_cpu(graph, edges, corruption_nodes, inverse, global, local_filter_mode, deg_sample_indices);\n    }\n}\n\ntorch::Tensor compute_filter_corruption_cpu(shared_ptr<MariusGraph> graph, torch::Tensor edges, torch::Tensor corruption_nodes, bool inverse, bool global,\n                                            LocalFilterMode local_filter_mode, torch::Tensor deg_sample_indices) {\n    if (local_filter_mode == LocalFilterMode::DEG && !global) {\n        return deg_negative_local_filter(deg_sample_indices, edges);\n    }\n\n    bool has_relations;\n\n    if (edges.dim() == 3) {\n        edges = edges.flatten(0, 1);\n    } else if (edges.dim() != 2) {\n        throw TensorSizeMismatchException(edges, \"Edge list must have three (if chunked) or two dimensions\");\n    }\n\n    if (edges.size(-1) == 3) {\n        has_relations = true;\n    } else if (edges.size(-1) == 2) {\n        has_relations = false;\n    } else {\n        throw TensorSizeMismatchException(edges, \"Edge list tensor must have 3 or 2 columns.\");\n    }\n\n    int64_t num_chunks = corruption_nodes.size(0);\n    int64_t num_edges = edges.size(0);\n    int64_t chunk_size = ceil((double)num_edges / num_chunks);\n\n    torch::Tensor all_sorted_edges;\n    torch::Tensor all_sorted_nodes;\n    torch::Tensor nodes;\n    int tup_id;\n    int corrupt_id;\n\n    if (inverse) {\n        if (has_relations) {\n            tup_id = 2;\n        } else {\n            tup_id = 1;\n        }\n\n        corrupt_id = 0;\n\n        nodes = edges.select(1, tup_id).contiguous();\n\n        if (global) {\n            if (graph->all_dst_sorted_edges_.defined()) {\n                all_sorted_edges = graph->all_dst_sorted_edges_;\n            } else {\n                all_sorted_edges = 
graph->dst_sorted_edges_;\n            }\n\n        } else {\n            all_sorted_edges = edges.index_select(0, nodes.argsort());\n        }\n\n        all_sorted_nodes = all_sorted_edges.select(1, tup_id).contiguous();\n\n    } else {\n        tup_id = 0;\n\n        if (has_relations) {\n            corrupt_id = 2;\n        } else {\n            corrupt_id = 1;\n        }\n\n        nodes = edges.select(1, tup_id).contiguous();\n\n        if (global) {\n            if (graph->all_src_sorted_edges_.defined()) {\n                all_sorted_edges = graph->all_src_sorted_edges_;\n            } else {\n                all_sorted_edges = graph->src_sorted_edges_;\n            }\n        } else {\n            all_sorted_edges = edges.index_select(0, nodes.argsort());\n        }\n\n        all_sorted_nodes = all_sorted_edges.select(1, tup_id).contiguous();\n    }\n\n    std::vector<std::vector<int64_t>> filters(num_edges);\n\n    torch::Tensor starts = torch::searchsorted(all_sorted_nodes, nodes);\n    torch::Tensor ends = torch::searchsorted(all_sorted_nodes, nodes + 1);\n\n    auto edges_accessor = edges.accessor<int64_t, 2>();\n    auto starts_accessor = starts.accessor<int64_t, 1>();\n    auto ends_accessor = ends.accessor<int64_t, 1>();\n    auto sorted_edges_accessor = all_sorted_edges.accessor<int64_t, 2>();\n    auto negs_accessor = corruption_nodes.accessor<int64_t, 2>();\n\n    if (global) {\n#pragma omp parallel for\n        for (int64_t edge_id = 0; edge_id < nodes.size(0); edge_id++) {\n            int64_t curr_start = starts_accessor[edge_id];\n            int64_t curr_end = ends_accessor[edge_id];\n\n            for (int64_t curr = curr_start; curr < curr_end; curr++) {\n                if ((has_relations && sorted_edges_accessor[curr][1] == edges_accessor[edge_id][1]) || !has_relations) {\n                    filters[edge_id].emplace_back(sorted_edges_accessor[curr][corrupt_id]);\n                }\n            }\n        }\n    } else {\n#pragma omp 
parallel for\n        for (int64_t edge_id = 0; edge_id < nodes.size(0); edge_id++) {\n            int64_t curr_start = starts_accessor[edge_id];\n            int64_t curr_end = ends_accessor[edge_id];\n\n            int chunk_id = edge_id / chunk_size;\n\n            for (int64_t neg_id = 0; neg_id < corruption_nodes.size(1); neg_id++) {\n                int64_t neg_node = negs_accessor[chunk_id][neg_id];\n\n                for (int64_t curr = curr_start; curr < curr_end; curr++) {\n                    if (sorted_edges_accessor[curr][corrupt_id] == neg_node) {\n                        if ((has_relations && sorted_edges_accessor[curr][1] == edges_accessor[edge_id][1]) || !has_relations) {\n                            filters[edge_id].emplace_back(neg_id);\n                            break;\n                        }\n                    }\n                }\n            }\n        }\n    }\n\n    int64_t num_filt = 0;\n\n    for (int64_t edge_id = 0; edge_id < nodes.size(0); edge_id++) {\n        num_filt += filters[edge_id].size();\n    }\n\n    torch::Tensor filter = torch::empty({num_filt, 2}, torch::kInt64);\n\n    auto filter_accessor = filter.accessor<int64_t, 2>();\n\n    int64_t offset = 0;\n    for (int64_t edge_id = 0; edge_id < nodes.size(0); edge_id++) {\n        for (int64_t j = 0; j < filters[edge_id].size(); j++) {\n            filter_accessor[offset][0] = edge_id;\n            filter_accessor[offset][1] = filters[edge_id][j];\n            offset++;\n        }\n    }\n    return filter;\n}\n\ntorch::Tensor compute_filter_corruption_gpu(shared_ptr<MariusGraph> graph, torch::Tensor edges, torch::Tensor corruption_nodes, bool inverse, bool global,\n                                            LocalFilterMode local_filter_mode, torch::Tensor deg_sample_indices) {\n    if (local_filter_mode == LocalFilterMode::DEG && !global) {\n        return deg_negative_local_filter(deg_sample_indices, edges);\n    }\n\n    bool has_relations;\n\n    if (edges.dim() == 
3) {\n        edges = edges.flatten(0, 1);\n    } else if (edges.dim() != 2) {\n        throw TensorSizeMismatchException(edges, \"Edge list must have three (if chunked) or two dimensions\");\n    }\n\n    if (edges.size(-1) == 3) {\n        has_relations = true;\n    } else if (edges.size(-1) == 2) {\n        has_relations = false;\n    } else {\n        throw TensorSizeMismatchException(edges, \"Edge list tensor must have 3 or 2 columns.\");\n    }\n\n    int64_t num_chunks = corruption_nodes.size(0);\n    int64_t num_edges = edges.size(0);\n    int64_t chunk_size = ceil((double)num_edges / num_chunks);\n\n    int64_t negs_per_pos = corruption_nodes.size(1);\n\n    torch::Tensor filter;\n    torch::Tensor all_sorted_edges;\n    torch::Tensor all_sorted_nodes;\n    torch::Tensor nodes;\n    int tup_id;\n    int corrupt_id;\n\n    if (inverse) {\n        if (has_relations) {\n            tup_id = 2;\n        } else {\n            tup_id = 1;\n        }\n\n        corrupt_id = 0;\n\n        nodes = edges.select(1, tup_id).contiguous();\n\n        if (global) {\n            all_sorted_edges = graph->all_dst_sorted_edges_;\n        } else {\n            all_sorted_edges = edges.index_select(0, nodes.argsort());\n        }\n\n        all_sorted_nodes = all_sorted_edges.select(1, tup_id).contiguous();\n    } else {\n        tup_id = 0;\n\n        if (has_relations) {\n            corrupt_id = 2;\n        } else {\n            corrupt_id = 1;\n        }\n\n        nodes = edges.select(1, tup_id).contiguous();\n\n        if (global) {\n            all_sorted_edges = graph->all_src_sorted_edges_;\n        } else {\n            all_sorted_edges = edges.index_select(0, nodes.argsort());\n        }\n\n        all_sorted_nodes = all_sorted_edges.select(1, tup_id).contiguous();\n    }\n\n    torch::Tensor starts = torch::searchsorted(all_sorted_nodes, nodes);\n    torch::Tensor ends = torch::searchsorted(all_sorted_nodes, nodes + 1);\n    torch::Tensor num_neighbors = ends - 
starts;\n\n    torch::Tensor summed_num_neighbors = num_neighbors.cumsum(0);\n    Indices local_offsets = summed_num_neighbors - num_neighbors;\n\n    if (global) {\n        torch::Tensor repeated_starts = starts.repeat_interleave(num_neighbors);\n        torch::Tensor repeated_offsets = local_offsets.repeat_interleave(num_neighbors);\n        torch::Tensor arange = torch::arange(repeated_offsets.size(0), edges.options());\n        torch::Tensor sorted_list_idx = repeated_starts + arange - repeated_offsets;\n\n        torch::Tensor batch_neighbors = all_sorted_edges.index_select(0, sorted_list_idx);\n        torch::Tensor edge_ids = torch::arange(edges.size(0), edges.options()).repeat_interleave(num_neighbors);\n\n        if (has_relations) {\n            torch::Tensor filter_tmp_ids =\n                torch::cat({edge_ids.view({-1, 1}), batch_neighbors.select(1, 1).view({-1, 1}), batch_neighbors.select(1, corrupt_id).view({-1, 1})}, 1);\n            torch::Tensor rel_ids = edges.select(1, 1).repeat_interleave(num_neighbors);\n            torch::Tensor mask = filter_tmp_ids.select(1, 1) == rel_ids;\n            filter_tmp_ids = filter_tmp_ids.index_select(0, torch::arange(filter_tmp_ids.size(0), filter_tmp_ids.options()).masked_select(mask));\n            filter = torch::cat({filter_tmp_ids.select(1, 0).view({-1, 1}), filter_tmp_ids.select(1, 2).view({-1, 1})}, 1);\n        } else {\n            filter = torch::cat({edge_ids.view({-1, 1}), batch_neighbors.select(1, corrupt_id).view({-1, 1})}, 1);\n        }\n    } else {\n        // TODO implement local filtering on the GPU, filter needs to be an int64, shape [*, 2], unit tests for this would be good\n        // like above when edges are int32 the filter may end up as int32\n        //        torch::TensorOptions ind_opts = torch::TensorOptions().dtype(torch::kInt64).device(edges.device());\n        //        filter = torch::empty({0, 2}, ind_opts);\n        throw MariusRuntimeException(\"Local filtering against 
all edges in the batch not yet supported on GPU.\");\n    }\n    return filter;\n}\n\ntorch::Tensor apply_score_filter(torch::Tensor scores, torch::Tensor filter) {\n    if (filter.defined()) {\n        scores.index_put_({filter.select(1, 0), filter.select(1, 1)}, -1e9);\n    }\n    return scores;\n}\n\nCorruptNodeNegativeSampler::CorruptNodeNegativeSampler(int num_chunks, int num_negatives, float degree_fraction, bool filtered,\n                                                       LocalFilterMode local_filter_mode) {\n    num_chunks_ = num_chunks;\n    num_negatives_ = num_negatives;\n    degree_fraction_ = degree_fraction;\n    filtered_ = filtered;\n    local_filter_mode_ = local_filter_mode;\n\n    if (filtered_) {\n        num_chunks_ = 1;\n        num_negatives_ = -1;\n        degree_fraction_ = 0.0;\n    }\n}\n\nstd::tuple<torch::Tensor, torch::Tensor> CorruptNodeNegativeSampler::getNegatives(shared_ptr<MariusGraph> graph, torch::Tensor edges, bool inverse) {\n    vector<Indices> ret_indices(num_chunks_);\n    vector<Indices> deg_sample_indices_vec(num_chunks_);\n\n    int64_t num_nodes = graph->num_nodes_in_memory_;\n\n    int num_batch = (int)(num_negatives_ * degree_fraction_);\n    int num_uni = num_negatives_ - num_batch;\n\n    torch::TensorOptions ind_opts = torch::TensorOptions().dtype(torch::kInt64).device(edges.device());\n\n    // sample uniform nodes\n    for (int j = 0; j < num_chunks_; j++) {\n        if (num_negatives_ != -1) {\n            ret_indices[j] = torch::randint(num_nodes, {num_uni}, ind_opts);\n\n            if (degree_fraction_ > 0) {\n                auto tup = batch_sample(edges, num_batch, inverse);\n                torch::Tensor deg_sample = std::get<0>(tup);\n                ret_indices[j] = torch::cat({deg_sample, ret_indices[j]});\n\n                if (local_filter_mode_ == LocalFilterMode::DEG) {\n                    torch::Tensor sample_edge_id = std::get<1>(tup);\n                    deg_sample_indices_vec[j] = 
sample_edge_id;\n                }\n            }\n        } else {\n            ret_indices[j] = torch::arange(num_nodes, ind_opts);\n        }\n    }\n\n    torch::Tensor output_ids = torch::stack(ret_indices);\n    torch::Tensor deg_sample_indices;\n    if (degree_fraction_ > 0 && local_filter_mode_ == LocalFilterMode::DEG) {\n        deg_sample_indices = torch::stack(deg_sample_indices_vec);\n    }\n    torch::Tensor score_filter = compute_filter_corruption(graph, edges, output_ids, inverse, filtered_, local_filter_mode_, deg_sample_indices);\n    return std::forward_as_tuple(output_ids, score_filter);\n}"
  },
  {
    "path": "src/cpp/src/data/samplers/neighbor.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/8/22.\n//\n\n#include \"data/samplers/neighbor.h\"\n\n#include <parallel_hashmap/phmap.h>\n\nstd::tuple<torch::Tensor, torch::Tensor> sample_all_gpu(torch::Tensor edges, torch::Tensor global_offsets, torch::Tensor local_offsets,\n                                                        torch::Tensor num_neighbors) {\n    torch::Tensor repeated_starts = global_offsets.repeat_interleave(num_neighbors);\n    torch::Tensor repeated_offsets = local_offsets.repeat_interleave(num_neighbors);\n    torch::Tensor arange = torch::arange(repeated_offsets.size(0), edges.options());\n    torch::Tensor sorted_list_idx = repeated_starts + arange - repeated_offsets;\n\n    return std::forward_as_tuple(edges.index_select(0, sorted_list_idx), local_offsets);\n}\n\nstd::tuple<torch::Tensor, torch::Tensor> sample_all_cpu(torch::Tensor edges, torch::Tensor global_offsets, torch::Tensor local_offsets,\n                                                        torch::Tensor num_neighbors, int64_t total_neighbors) {\n    auto global_offsets_accessor = global_offsets.accessor<int64_t, 1>();\n    auto local_offsets_accessor = local_offsets.accessor<int64_t, 1>();\n    auto num_neighbors_accessor = num_neighbors.accessor<int64_t, 1>();\n\n    int num_columns = edges.size(1);\n\n    auto options = edges.options();\n#ifdef MARIUS_CUDA\n    options = options.pinned_memory(true);\n#endif\n\n    Indices ret_neighbor_id_edges = torch::empty({total_neighbors, num_columns}, options);\n    int64_t *ret_neighbor_id_edges_mem = ret_neighbor_id_edges.data_ptr<int64_t>();\n\n    int64_t *sorted_list_ptr = edges.data_ptr<int64_t>();\n\n    if (num_columns == 3) {\n#pragma omp parallel\n        {\n#pragma omp for\n            for (int i = 0; i < local_offsets.size(0); i++) {\n                int64_t local_offset = local_offsets_accessor[i];\n                int64_t global_offset = global_offsets_accessor[i];\n                int64_t num_edges = 
num_neighbors_accessor[i];\n\n                int count = 0;\n\n                // can this be optimized even further?\n                for (int64_t j = global_offset; j < global_offset + num_edges; j++) {\n                    *(ret_neighbor_id_edges_mem + (3 * (local_offset + count))) = *(sorted_list_ptr + (3 * j));\n                    *(ret_neighbor_id_edges_mem + (3 * (local_offset + count)) + 1) = *(sorted_list_ptr + (3 * j) + 1);\n                    *(ret_neighbor_id_edges_mem + (3 * (local_offset + count)) + 2) = *(sorted_list_ptr + (3 * j) + 2);\n                    count++;\n                }\n            }\n        }\n    } else {\n#pragma omp parallel\n        {\n#pragma omp for\n            for (int i = 0; i < local_offsets.size(0); i++) {\n                int64_t local_offset = local_offsets_accessor[i];\n                int64_t global_offset = global_offsets_accessor[i];\n                int64_t num_edges = num_neighbors_accessor[i];\n\n                int count = 0;\n\n                // can this be optimized even further?\n                for (int64_t j = global_offset; j < global_offset + num_edges; j++) {\n                    *(ret_neighbor_id_edges_mem + (2 * (local_offset + count))) = *(sorted_list_ptr + (2 * j));\n                    *(ret_neighbor_id_edges_mem + (2 * (local_offset + count)) + 1) = *(sorted_list_ptr + (2 * j) + 1);\n                    count++;\n                }\n            }\n        }\n    }\n    return std::forward_as_tuple(ret_neighbor_id_edges, local_offsets);\n}\n\nstd::tuple<torch::Tensor, torch::Tensor> sample_uniform_gpu(torch::Tensor edges, torch::Tensor global_offsets, torch::Tensor local_offsets,\n                                                            torch::Tensor num_neighbors, int64_t max_neighbors, int64_t max_id) {\n    torch::Tensor mask = num_neighbors > max_neighbors;\n\n    torch::Tensor capped_num_neighbors = num_neighbors.masked_fill(mask, max_neighbors);\n    local_offsets = 
capped_num_neighbors.cumsum(0) - capped_num_neighbors;\n\n    torch::Tensor repeated_starts = global_offsets.repeat_interleave(capped_num_neighbors);\n    torch::Tensor repeated_offsets = local_offsets.repeat_interleave(capped_num_neighbors);\n    torch::Tensor arange = torch::arange(repeated_offsets.size(0), edges.options());\n    torch::Tensor ranged_sorted_list_idx = repeated_starts + arange - repeated_offsets;\n\n    torch::Tensor repeated_num_neighbors = num_neighbors.repeat_interleave(capped_num_neighbors);\n    torch::Tensor rand_samples = torch::randint(max_id, repeated_offsets.sizes(), edges.options());\n\n    rand_samples.fmod_(repeated_num_neighbors);\n    torch::Tensor sampled_sorted_list_idx = repeated_starts + rand_samples;\n\n    mask = mask.repeat_interleave(capped_num_neighbors);\n    torch::Tensor sorted_list_idx = torch::where(mask, sampled_sorted_list_idx, ranged_sorted_list_idx);\n\n    return std::forward_as_tuple(edges.index_select(0, sorted_list_idx), local_offsets);\n}\n\nstd::tuple<torch::Tensor, torch::Tensor> sample_uniform_cpu(torch::Tensor edges, torch::Tensor global_offsets, torch::Tensor local_offsets,\n                                                            torch::Tensor num_neighbors, int64_t max_neighbors, int64_t total_neighbors) {\n    auto global_offsets_accessor = global_offsets.accessor<int64_t, 1>();\n    auto num_neighbors_accessor = num_neighbors.accessor<int64_t, 1>();\n\n    auto capped_num_neighbors = num_neighbors.clone();\n    auto capped_num_neighbors_accessor = capped_num_neighbors.accessor<int64_t, 1>();\n    int64_t *capped_num_neighbors_mem = capped_num_neighbors.data_ptr<int64_t>();\n\n#pragma omp parallel for schedule(runtime)\n    for (int i = 0; i < local_offsets.size(0); i++) {\n        if (capped_num_neighbors_accessor[i] > max_neighbors) {\n            *(capped_num_neighbors_mem + i) = max_neighbors;\n        }\n    }\n\n    int num_columns = edges.size(1);\n\n    torch::Tensor summed_num_neighbors = 
capped_num_neighbors.cumsum(0);\n    local_offsets = summed_num_neighbors - capped_num_neighbors;\n    total_neighbors = summed_num_neighbors[-1].item<int64_t>();\n\n    auto local_offsets_accessor = local_offsets.accessor<int64_t, 1>();\n\n    auto options = edges.options();\n#ifdef MARIUS_CUDA\n    options = options.pinned_memory(true);\n#endif\n    Indices ret_neighbor_id_edges = torch::empty({total_neighbors, num_columns}, options);\n    int64_t *ret_neighbor_id_edges_mem = ret_neighbor_id_edges.data_ptr<int64_t>();\n\n    int64_t *sorted_list_ptr = edges.data_ptr<int64_t>();\n\n    // setup seeds\n    unsigned int num_threads = 1;\n\n#ifdef MARIUS_OMP\n    #pragma omp parallel\n    {\n    #pragma omp single\n        num_threads = omp_get_num_threads();\n    }\n#endif\n\n    std::vector<unsigned int> tid_seeds(num_threads);\n\n    for (int i = 0; i < num_threads; i++) {\n        tid_seeds[i] = rand();\n    }\n\n    if (num_columns == 3) {\n#pragma omp parallel default(none) shared(tid_seeds, local_offsets_accessor, local_offsets, global_offsets_accessor, global_offsets, num_neighbors_accessor, \\\n                                              num_neighbors, max_neighbors, sorted_list_ptr, edges, ret_neighbor_id_edges_mem, ret_neighbor_id_edges)\n        {\n#ifdef MARIUS_OMP\n            unsigned int seed = tid_seeds[omp_get_thread_num()];\n#else\n            unsigned int seed = tid_seeds[0];\n#endif\n\n#pragma omp for schedule(runtime)\n            for (int i = 0; i < local_offsets.size(0); i++) {\n                int64_t local_offset = local_offsets_accessor[i];\n                int64_t global_offset = global_offsets_accessor[i];\n                int64_t num_edges = num_neighbors_accessor[i];\n\n                if (num_edges > max_neighbors) {\n                    int count = 0;\n                    int64_t rand_id = 0;\n#pragma unroll\n                    for (int64_t j = 0; j < max_neighbors; j++) {\n                        rand_id = 3 * (global_offset + 
(rand_r(&seed) % num_edges));\n\n                        *(ret_neighbor_id_edges_mem + (3 * (local_offset + count))) = *(sorted_list_ptr + rand_id);\n                        *(ret_neighbor_id_edges_mem + (3 * (local_offset + count)) + 1) = *(sorted_list_ptr + rand_id + 1);\n                        *(ret_neighbor_id_edges_mem + (3 * (local_offset + count)) + 2) = *(sorted_list_ptr + rand_id + 2);\n                        count++;\n                    }\n                } else {\n                    int count = 0;\n#pragma unroll\n                    for (int64_t j = global_offset; j < global_offset + num_edges; j++) {\n                        *(ret_neighbor_id_edges_mem + (3 * (local_offset + count))) = *(sorted_list_ptr + (3 * j));\n                        *(ret_neighbor_id_edges_mem + (3 * (local_offset + count)) + 1) = *(sorted_list_ptr + (3 * j) + 1);\n                        *(ret_neighbor_id_edges_mem + (3 * (local_offset + count)) + 2) = *(sorted_list_ptr + (3 * j) + 2);\n                        count++;\n                    }\n                }\n            }\n        }\n    } else {\n#pragma omp parallel default(none) shared(tid_seeds, local_offsets_accessor, local_offsets, global_offsets_accessor, global_offsets, num_neighbors_accessor, \\\n                                              num_neighbors, max_neighbors, sorted_list_ptr, edges, ret_neighbor_id_edges_mem, ret_neighbor_id_edges)\n        {\n#ifdef MARIUS_OMP\n            unsigned int seed = tid_seeds[omp_get_thread_num()];\n#else\n            unsigned int seed = tid_seeds[0];\n#endif\n\n#pragma omp for schedule(runtime)\n            for (int i = 0; i < local_offsets.size(0); i++) {\n                int64_t local_offset = local_offsets_accessor[i];\n                int64_t global_offset = global_offsets_accessor[i];\n                int64_t num_edges = num_neighbors_accessor[i];\n\n                if (num_edges > max_neighbors) {\n                    int count = 0;\n                    int64_t 
rand_id = 0;\n#pragma unroll\n                    for (int64_t j = 0; j < max_neighbors; j++) {\n                        rand_id = 2 * (global_offset + (rand_r(&seed) % num_edges));\n\n                        *(ret_neighbor_id_edges_mem + (2 * (local_offset + count))) = *(sorted_list_ptr + rand_id);\n                        *(ret_neighbor_id_edges_mem + (2 * (local_offset + count)) + 1) = *(sorted_list_ptr + rand_id + 1);\n                        count++;\n                    }\n                } else {\n                    int count = 0;\n#pragma unroll\n                    for (int64_t j = global_offset; j < global_offset + num_edges; j++) {\n                        *(ret_neighbor_id_edges_mem + (2 * (local_offset + count))) = *(sorted_list_ptr + (2 * j));\n                        *(ret_neighbor_id_edges_mem + (2 * (local_offset + count)) + 1) = *(sorted_list_ptr + (2 * j) + 1);\n                        count++;\n                    }\n                }\n            }\n        }\n    }\n    return std::forward_as_tuple(ret_neighbor_id_edges, local_offsets);\n}\n\nstd::tuple<torch::Tensor, torch::Tensor> sample_dropout_gpu(torch::Tensor edges, torch::Tensor global_offsets, torch::Tensor local_offsets,\n                                                            torch::Tensor num_neighbors, float rate) {\n    torch::Tensor repeated_starts = global_offsets.repeat_interleave(num_neighbors);\n    torch::Tensor repeated_offsets = local_offsets.repeat_interleave(num_neighbors);\n    torch::Tensor arange = torch::arange(repeated_offsets.size(0), edges.options());\n    torch::Tensor sorted_list_idx = repeated_starts + arange - repeated_offsets;\n\n    torch::Tensor keep_mask = torch::rand(sorted_list_idx.size(0), torch::TensorOptions().device(edges.device()));\n    keep_mask = torch::ge(keep_mask, rate);\n    sorted_list_idx = sorted_list_idx.masked_select(keep_mask);\n\n    torch::Tensor capped_num_neighbors = segmented_sum_with_offsets(keep_mask.to(torch::kInt64), 
local_offsets);\n\n    torch::Tensor summed_num_neighbors = capped_num_neighbors.cumsum(0);\n    local_offsets = summed_num_neighbors - capped_num_neighbors;\n\n    return std::forward_as_tuple(edges.index_select(0, sorted_list_idx), local_offsets);\n}\n\nstd::tuple<torch::Tensor, torch::Tensor> sample_dropout_cpu(torch::Tensor edges, torch::Tensor global_offsets, torch::Tensor local_offsets,\n                                                            torch::Tensor num_neighbors, float rate, int64_t total_neighbors) {\n    auto global_offsets_accessor = global_offsets.accessor<int64_t, 1>();\n    auto local_offsets_accessor = local_offsets.accessor<int64_t, 1>();\n    auto num_neighbors_accessor = num_neighbors.accessor<int64_t, 1>();\n\n    auto capped_num_neighbors = num_neighbors.clone();\n    int64_t *capped_num_neighbors_mem = capped_num_neighbors.data_ptr<int64_t>();\n\n    torch::Tensor keep_mask = torch::rand(total_neighbors, edges.device());\n    auto keep_mask_accessor = keep_mask.accessor<float, 1>();\n\n    int num_columns = edges.size(1);\n\n#pragma omp parallel\n    {\n#pragma omp for\n        for (int i = 0; i < local_offsets.size(0); i++) {\n            int64_t local_offset = local_offsets_accessor[i];\n            int64_t num_edges = num_neighbors_accessor[i];\n\n            int count = 0;\n            for (int j = local_offset; j < local_offset + num_edges; j++) {\n                if (keep_mask_accessor[j] >= rate) {\n                    count++;\n                }\n            }\n            *(capped_num_neighbors_mem + i) = count;\n        }\n    }\n\n    torch::Tensor summed_num_neighbors = capped_num_neighbors.cumsum(0);\n    Indices new_local_offsets = summed_num_neighbors - capped_num_neighbors;\n    total_neighbors = summed_num_neighbors[-1].item<int64_t>();\n\n    auto new_local_offsets_accessor = new_local_offsets.accessor<int64_t, 1>();\n\n    auto options = edges.options();\n#ifdef MARIUS_CUDA\n    options = 
options.pinned_memory(true);\n#endif\n    Indices ret_neighbor_id_edges = torch::empty({total_neighbors, 3}, options);\n    int64_t *ret_neighbor_id_edges_mem = ret_neighbor_id_edges.data_ptr<int64_t>();\n\n    int64_t *sorted_list_ptr = edges.data_ptr<int64_t>();\n\n    if (num_columns == 3) {\n#pragma omp parallel\n        {\n#pragma omp for\n            for (int i = 0; i < local_offsets.size(0); i++) {\n                int64_t old_local_offset = local_offsets_accessor[i];\n                int64_t local_offset = new_local_offsets_accessor[i];\n                int64_t global_offset = global_offsets_accessor[i];\n                int64_t num_edges = num_neighbors_accessor[i];\n\n                int local_count = 0;\n                int global_count = 0;\n\n                // can this be optimized even further?\n                for (int64_t j = global_offset; j < global_offset + num_edges; j++) {\n                    if (keep_mask_accessor[old_local_offset + global_count] >= rate) {\n                        *(ret_neighbor_id_edges_mem + (3 * (local_offset + local_count))) = *(sorted_list_ptr + (3 * j));\n                        *(ret_neighbor_id_edges_mem + (3 * (local_offset + local_count)) + 1) = *(sorted_list_ptr + (3 * j) + 1);\n                        *(ret_neighbor_id_edges_mem + (3 * (local_offset + local_count)) + 2) = *(sorted_list_ptr + (3 * j) + 2);\n                        local_count++;\n                    }\n                    global_count++;\n                }\n            }\n        }\n    } else {\n#pragma omp parallel\n        {\n#pragma omp for\n            for (int i = 0; i < local_offsets.size(0); i++) {\n                int64_t old_local_offset = local_offsets_accessor[i];\n                int64_t local_offset = new_local_offsets_accessor[i];\n                int64_t global_offset = global_offsets_accessor[i];\n                int64_t num_edges = num_neighbors_accessor[i];\n\n                int local_count = 0;\n                int 
global_count = 0;\n\n                // can this be optimized even further?\n                for (int64_t j = global_offset; j < global_offset + num_edges; j++) {\n                    if (keep_mask_accessor[old_local_offset + global_count] >= rate) {\n                        *(ret_neighbor_id_edges_mem + (2 * (local_offset + local_count))) = *(sorted_list_ptr + (2 * j));\n                        *(ret_neighbor_id_edges_mem + (2 * (local_offset + local_count)) + 1) = *(sorted_list_ptr + (2 * j) + 1);\n                        local_count++;\n                    }\n                    global_count++;\n                }\n            }\n        }\n    }\n    return std::forward_as_tuple(ret_neighbor_id_edges, new_local_offsets);\n}\n\nLayeredNeighborSampler::LayeredNeighborSampler(shared_ptr<GraphModelStorage> storage, std::vector<shared_ptr<NeighborSamplingConfig>> layer_configs,\n                                               bool use_incoming_nbrs, bool use_outgoing_nbrs) {\n    storage_ = storage;\n    graph_ = nullptr;\n    sampling_layers_ = layer_configs;\n    use_incoming_nbrs_ = use_incoming_nbrs;\n    use_outgoing_nbrs_ = use_outgoing_nbrs;\n\n    checkLayerConfigs();\n}\n\nLayeredNeighborSampler::LayeredNeighborSampler(shared_ptr<MariusGraph> graph, std::vector<shared_ptr<NeighborSamplingConfig>> layer_configs,\n                                               bool use_incoming_nbrs, bool use_outgoing_nbrs) {\n    graph_ = graph;\n    storage_ = nullptr;\n    sampling_layers_ = layer_configs;\n    use_incoming_nbrs_ = use_incoming_nbrs;\n    use_outgoing_nbrs_ = use_outgoing_nbrs;\n\n    checkLayerConfigs();\n}\n\nLayeredNeighborSampler::LayeredNeighborSampler(std::vector<shared_ptr<NeighborSamplingConfig>> layer_configs, bool use_incoming_nbrs, bool use_outgoing_nbrs) {\n    graph_ = nullptr;\n    storage_ = nullptr;\n    sampling_layers_ = layer_configs;\n    use_incoming_nbrs_ = use_incoming_nbrs;\n    use_outgoing_nbrs_ = use_outgoing_nbrs;\n\n    
checkLayerConfigs();\n}\n\nvoid LayeredNeighborSampler::checkLayerConfigs() {\n    use_hashmap_sets_ = false;\n    use_bitmaps_ = false;\n\n    for (int i = 0; i < sampling_layers_.size(); i++) {\n        if (use_bitmaps_ && sampling_layers_[i]->use_hashmap_sets) {\n            throw std::runtime_error(\"Layers with use_hashmap_sets equal to true must come before those set to false.\");\n        }\n        if (sampling_layers_[i]->use_hashmap_sets) {\n            use_hashmap_sets_ = true;\n        } else {\n            use_bitmaps_ = true;\n        }\n    }\n}\n\nDENSEGraph LayeredNeighborSampler::getNeighbors(torch::Tensor node_ids, shared_ptr<MariusGraph> graph, int worker_id) {\n    Indices hop_offsets;\n    torch::Tensor incoming_edges;\n    Indices incoming_offsets;\n    Indices in_neighbors_mapping;\n    torch::Tensor outgoing_edges;\n    Indices outgoing_offsets;\n    Indices out_neighbors_mapping;\n\n    std::vector<torch::Tensor> incoming_edges_vec;\n    std::vector<torch::Tensor> outgoing_edges_vec;\n\n    auto device_options = torch::TensorOptions().dtype(torch::kInt64).device(node_ids.device());\n    hop_offsets = torch::zeros({1}, device_options);\n    Indices delta_ids = node_ids;\n\n    int gpu = 0;\n    if (node_ids.is_cuda()) {\n        gpu = 1;\n    }\n\n    if (graph == nullptr) {\n        if (storage_ != nullptr) {\n            graph = storage_->current_subgraph_state_->in_memory_subgraph_;\n        } else if (graph_ != nullptr) {\n            graph = graph_;\n        } else {\n            throw MariusRuntimeException(\"Graph to sample from is undefined\");\n        }\n    }\n\n    int64_t num_nodes_in_memory = graph->num_nodes_in_memory_;\n\n    // data structures for calculating the delta_ids\n    torch::Tensor hash_map;\n    //    void *hash_map_mem;\n    auto bool_device_options = torch::TensorOptions().dtype(torch::kBool).device(node_ids.device());\n\n    phmap::flat_hash_set<int64_t> seen_unique_nodes;\n    
phmap::flat_hash_set<int64_t>::const_iterator found;\n    vector<int64_t> delta_ids_vec;\n\n    if (gpu) {\n        hash_map = torch::zeros({num_nodes_in_memory}, bool_device_options);\n    } else {\n        if (use_bitmaps_) {\n            hash_map = graph->hash_maps_[worker_id];\n        }\n        if (use_hashmap_sets_) {\n            seen_unique_nodes.reserve(node_ids.size(0));\n        }\n    }\n\n    for (int i = 0; i < sampling_layers_.size(); i++) {\n        torch::Tensor delta_incoming_edges;\n        Indices delta_incoming_offsets;\n        torch::Tensor delta_outgoing_edges;\n        Indices delta_outgoing_offsets;\n\n        NeighborSamplingLayer layer_type = sampling_layers_[i]->type;\n        auto options = sampling_layers_[i]->options;\n\n        int max_neighbors = -1;\n        float rate = 0.0;\n        if (layer_type == NeighborSamplingLayer::UNIFORM) {\n            max_neighbors = std::dynamic_pointer_cast<UniformSamplingOptions>(options)->max_neighbors;\n        } else if (layer_type == NeighborSamplingLayer::DROPOUT) {\n            rate = std::dynamic_pointer_cast<DropoutSamplingOptions>(options)->rate;\n        }\n\n        if (delta_ids.size(0) > 0) {\n            if (use_incoming_nbrs_) {\n                auto tup = graph->getNeighborsForNodeIds(delta_ids, true, layer_type, max_neighbors, rate);\n                delta_incoming_edges = std::get<0>(tup);\n                delta_incoming_offsets = std::get<1>(tup);\n            }\n\n            if (use_outgoing_nbrs_) {\n                auto tup = graph->getNeighborsForNodeIds(delta_ids, false, layer_type, max_neighbors, rate);\n                delta_outgoing_edges = std::get<0>(tup);\n                delta_outgoing_offsets = std::get<1>(tup);\n            }\n        }\n\n        if (incoming_offsets.defined()) {\n            if (delta_incoming_offsets.size(0) > 0) {\n                incoming_offsets = incoming_offsets + delta_incoming_edges.size(0);\n                incoming_offsets = 
torch::cat({delta_incoming_offsets, incoming_offsets}, 0);\n            }\n        } else {\n            incoming_offsets = delta_incoming_offsets;\n        }\n        if (delta_incoming_edges.size(0) > 0) {\n            incoming_edges_vec.emplace(incoming_edges_vec.begin(), delta_incoming_edges);\n        }\n\n        if (outgoing_offsets.defined()) {\n            if (delta_outgoing_offsets.size(0) > 0) {\n                outgoing_offsets = outgoing_offsets + delta_outgoing_edges.size(0);\n                outgoing_offsets = torch::cat({delta_outgoing_offsets, outgoing_offsets}, 0);\n            }\n        } else {\n            outgoing_offsets = delta_outgoing_offsets;\n        }\n        if (delta_outgoing_edges.size(0) > 0) {\n            outgoing_edges_vec.emplace(outgoing_edges_vec.begin(), delta_outgoing_edges);\n        }\n\n        // calculate delta_ids\n        if (node_ids.device().is_cuda()) {\n            if (i > 0) {\n                hash_map = 0 * hash_map;\n            }\n\n            if (delta_incoming_edges.size(0) > 0) {\n                hash_map.index_fill_(0, delta_incoming_edges.select(1, 0), 1);\n            }\n            if (delta_outgoing_edges.size(0) > 0) {\n                hash_map.index_fill_(0, delta_outgoing_edges.select(1, -1), 1);\n            }\n            hash_map.index_fill_(0, node_ids, 0);\n\n            delta_ids = hash_map.nonzero().flatten(0, 1);\n        } else {\n            if (!sampling_layers_[i]->use_hashmap_sets) {\n                delta_ids = computeDeltaIdsHelperMethod1(hash_map, node_ids, delta_incoming_edges, delta_outgoing_edges, num_nodes_in_memory);\n            } else {\n                delta_ids_vec.clear();\n\n                if (i == 0) {\n                    auto nodes_accessor = node_ids.accessor<int64_t, 1>();\n                    for (int j = 0; j < node_ids.size(0); j++) {\n                        seen_unique_nodes.emplace(nodes_accessor[j]);\n                    }\n                }\n\n             
   if (delta_incoming_edges.size(0) > 0) {\n                    auto incoming_accessor = delta_incoming_edges.accessor<int64_t, 2>();\n                    for (int j = 0; j < delta_incoming_edges.size(0); j++) {\n                        found = seen_unique_nodes.find(incoming_accessor[j][0]);\n                        if (found == seen_unique_nodes.end()) {\n                            delta_ids_vec.emplace_back(incoming_accessor[j][0]);\n                            seen_unique_nodes.emplace(incoming_accessor[j][0]);\n                        }\n                    }\n                }\n\n                if (delta_outgoing_edges.size(0) > 0) {\n                    int column_idx = delta_outgoing_edges.size(1) - 1;  // RW: -1 has some weird bug for accessor\n                    auto outgoing_accessor = delta_outgoing_edges.accessor<int64_t, 2>();\n                    for (int j = 0; j < delta_outgoing_edges.size(0); j++) {\n                        found = seen_unique_nodes.find(outgoing_accessor[j][column_idx]);\n                        if (found == seen_unique_nodes.end()) {\n                            delta_ids_vec.emplace_back(outgoing_accessor[j][column_idx]);\n                            seen_unique_nodes.emplace(outgoing_accessor[j][column_idx]);\n                        }\n                    }\n                }\n\n                delta_ids = torch::from_blob(delta_ids_vec.data(), {(int)delta_ids_vec.size()}, torch::kInt64);\n            }\n        }\n\n        hop_offsets = hop_offsets + delta_ids.size(0);\n        hop_offsets = torch::cat({torch::zeros({1}, device_options), hop_offsets});\n\n        if (delta_ids.size(0) > 0) {\n            node_ids = torch::cat({delta_ids, node_ids}, 0);\n        }\n    }\n    hop_offsets = torch::cat({hop_offsets, torch::tensor({node_ids.size(0)}, device_options)});\n\n    DENSEGraph ret = DENSEGraph(hop_offsets, node_ids, incoming_offsets, incoming_edges_vec, in_neighbors_mapping, outgoing_offsets, outgoing_edges_vec,\n  
                              out_neighbors_mapping, num_nodes_in_memory);\n\n    //    if (!gpu and use_bitmaps_) {\n    //        free(hash_map_mem);\n    //    }\n\n    return ret;\n}\n\ntorch::Tensor LayeredNeighborSampler::computeDeltaIdsHelperMethod1(torch::Tensor hash_map, torch::Tensor node_ids, torch::Tensor delta_incoming_edges,\n                                                                   torch::Tensor delta_outgoing_edges, int64_t num_nodes_in_memory) {\n    unsigned int num_threads = 1;\n#ifdef MARIUS_OMP\n    #pragma omp parallel\n    {\n    #pragma omp single\n        num_threads = omp_get_num_threads();\n    }\n#endif\n\n    int64_t chunk_size = ceil((double)num_nodes_in_memory / num_threads);\n\n    auto hash_map_accessor = hash_map.accessor<bool, 1>();\n    auto nodes_accessor = node_ids.accessor<int64_t, 1>();\n\n#pragma omp parallel default(none) shared(delta_incoming_edges, delta_outgoing_edges, hash_map_accessor, hash_map, node_ids, nodes_accessor)\n    {\n        if (delta_incoming_edges.size(0) > 0) {\n            auto incoming_accessor = delta_incoming_edges.accessor<int64_t, 2>();\n\n#pragma omp for  // nowait -> can't have this because of the below if statement skipping directly to node ids for loop\n            for (int64_t j = 0; j < delta_incoming_edges.size(0); j++) {\n                if (!hash_map_accessor[incoming_accessor[j][0]]) {\n                    hash_map_accessor[incoming_accessor[j][0]] = 1;\n                }\n            }\n        }\n\n        if (delta_outgoing_edges.size(0) > 0) {\n            auto outgoing_accessor = delta_outgoing_edges.accessor<int64_t, 2>();\n            int column_idx = delta_outgoing_edges.size(1) - 1;  // RW: -1 has some weird bug for accessor\n\n#pragma omp for\n            for (int64_t j = 0; j < delta_outgoing_edges.size(0); j++) {\n                if (!hash_map_accessor[outgoing_accessor[j][column_idx]]) {\n                    hash_map_accessor[outgoing_accessor[j][column_idx]] = 1;\n  
              }\n            }\n        }\n\n#pragma omp for\n        for (int64_t j = 0; j < node_ids.size(0); j++) {\n            if (hash_map_accessor[nodes_accessor[j]]) {\n                hash_map_accessor[nodes_accessor[j]] = 0;\n            }\n        }\n    }\n\n    auto device_options = torch::TensorOptions().dtype(torch::kInt64).device(node_ids.device());\n    std::vector<torch::Tensor> sub_deltas = std::vector<torch::Tensor>(num_threads);\n    int64_t upper_bound = (int64_t)(delta_incoming_edges.size(0) + delta_outgoing_edges.size(0)) / num_threads + 1;\n\n    std::vector<int> sub_counts = std::vector<int>(num_threads, 0);\n    std::vector<int> sub_offsets = std::vector<int>(num_threads, 0);\n\n#pragma omp parallel\n    {\n#ifdef MARIUS_OMP\n        int tid = omp_get_thread_num();\n#else\n        int tid = 0;\n#endif\n\n        sub_deltas[tid] = torch::empty({upper_bound}, device_options);\n        auto delta_ids_accessor = sub_deltas[tid].accessor<int64_t, 1>();\n\n        int64_t start = chunk_size * tid;\n        int64_t end = start + chunk_size;\n\n        if (end > num_nodes_in_memory) {\n            end = num_nodes_in_memory;\n        }\n\n        int private_count = 0;\n        int grow_count = 0;\n\n#pragma unroll\n        for (int64_t j = start; j < end; j++) {\n            if (hash_map_accessor[j]) {\n                delta_ids_accessor[private_count++] = j;\n                hash_map_accessor[j] = 0;\n                grow_count++;\n\n                if (grow_count == upper_bound) {\n                    sub_deltas[tid] = torch::cat({sub_deltas[tid], torch::empty({upper_bound}, device_options)}, 0);\n                    delta_ids_accessor = sub_deltas[tid].accessor<int64_t, 1>();\n                    grow_count = 0;\n                }\n            }\n        }\n        sub_counts[tid] = private_count;\n    }\n\n    int count = 0;\n    for (auto c : sub_counts) {\n        count += c;\n    }\n\n    for (int k = 0; k < num_threads - 1; k++) {\n       
 sub_offsets[k + 1] = sub_offsets[k] + sub_counts[k];\n    }\n\n    torch::Tensor delta_ids = torch::empty({count}, device_options);\n\n#pragma omp parallel for\n    for (int k = 0; k < num_threads; k++) {\n        delta_ids.narrow(0, sub_offsets[k], sub_counts[k]) = sub_deltas[k].narrow(0, 0, sub_counts[k]);\n    }\n\n    return delta_ids;\n}"
  },
  {
    "path": "src/cpp/src/marius.cpp",
    "content": "\n#include \"marius.h\"\n\n#include <stdexcept>\n\n#include \"common/util.h\"\n#include \"configuration/util.h\"\n#include \"pipeline/evaluator.h\"\n#include \"pipeline/graph_encoder.h\"\n#include \"pipeline/trainer.h\"\n#include \"reporting/logger.h\"\n#include \"storage/checkpointer.h\"\n#include \"storage/io.h\"\n\n// Runs the graph encoder with its output directed to a fresh encoded-nodes FlatFile in the model directory.\n// A file already present at that path is deleted first.\nvoid encode_and_export(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model, shared_ptr<MariusConfig> marius_config) {\n    shared_ptr<GraphEncoder> graph_encoder;\n    if (marius_config->evaluation->pipeline->sync) {\n        graph_encoder = std::make_shared<SynchronousGraphEncoder>(dataloader, model);\n    } else {\n        graph_encoder = std::make_shared<PipelineGraphEncoder>(dataloader, model, marius_config->evaluation->pipeline);\n    }\n\n    string filename = marius_config->storage->model_dir + PathConstants::encoded_nodes_file + PathConstants::file_ext;\n\n    if (fileExists(filename)) {\n        remove(filename.c_str());\n    }\n\n    int64_t num_nodes = marius_config->storage->dataset->num_nodes;\n\n    // The stored width is the output dimension of the last layer of the last encoder stage.\n    int last_stage = marius_config->model->encoder->layers.size() - 1;\n    int last_layer = marius_config->model->encoder->layers[last_stage].size() - 1;\n    int64_t dim = marius_config->model->encoder->layers[last_stage][last_layer]->output_dim;\n\n    dataloader->graph_storage_->storage_ptrs_.encoded_nodes = std::make_shared<FlatFile>(filename, num_nodes, dim, torch::kFloat32, true);\n\n    graph_encoder->encode();\n}\n\n// Sets up logging and random seeds, creates or restores the model and its storage, and wraps them in a DataLoader.\n// With train == true a new model is initialized unless resuming was requested; in every other case a checkpoint is loaded\n// and the dataloader continues from the checkpoint's epoch count.\n// Returns (model, graph model storage, dataloader).\nstd::tuple<shared_ptr<Model>, shared_ptr<GraphModelStorage>, shared_ptr<DataLoader> > marius_init(shared_ptr<MariusConfig> marius_config, bool train) {\n    Timer initialization_timer = Timer(false);\n    initialization_timer.start();\n    SPDLOG_INFO(\"Start initialization\");\n\n    MariusLogger marius_logger = MariusLogger(marius_config->storage->model_dir);\n    spdlog::set_default_logger(marius_logger.main_logger_);\n    marius_logger.setConsoleLogLevel(marius_config->storage->log_level);\n\n    torch::manual_seed(marius_config->model->random_seed);\n    srand(marius_config->model->random_seed);\n\n    std::vector<torch::Device> devices = devices_from_config(marius_config->storage);\n\n    shared_ptr<Model> model;\n    shared_ptr<GraphModelStorage> graph_model_storage;\n\n    int epochs_processed = 0;\n\n    if (train) {\n        // initialize new model\n        if (!marius_config->training->resume_training && marius_config->training->resume_from_checkpoint.empty()) {\n            model = initModelFromConfig(marius_config->model, devices, marius_config->storage->dataset->num_relations, true);\n            graph_model_storage = initializeStorage(model, marius_config->storage, !marius_config->training->resume_training, true);\n        } else {\n            auto checkpoint_loader = std::make_shared<Checkpointer>();\n\n            string checkpoint_dir = marius_config->storage->model_dir;\n            if (!marius_config->training->resume_from_checkpoint.empty()) {\n                checkpoint_dir = marius_config->training->resume_from_checkpoint;\n            }\n\n            auto tup = checkpoint_loader->load(checkpoint_dir, marius_config, true);\n            model = std::get<0>(tup);\n            graph_model_storage = std::get<1>(tup);\n\n            CheckpointMeta checkpoint_meta = std::get<2>(tup);\n            epochs_processed = checkpoint_meta.num_epochs;\n        }\n    } else {\n        auto checkpoint_loader = std::make_shared<Checkpointer>();\n\n        string checkpoint_dir = marius_config->storage->model_dir;\n        if (!marius_config->evaluation->checkpoint_dir.empty()) {\n            checkpoint_dir = marius_config->evaluation->checkpoint_dir;\n        }\n        auto tup = checkpoint_loader->load(checkpoint_dir, marius_config, false);\n        model = std::get<0>(tup);\n        graph_model_storage = std::get<1>(tup);\n\n        CheckpointMeta checkpoint_meta = std::get<2>(tup);\n        epochs_processed = checkpoint_meta.num_epochs;\n    }\n\n    shared_ptr<DataLoader> dataloader = std::make_shared<DataLoader>(graph_model_storage, model->learning_task_, marius_config->training,\n                                                                     marius_config->evaluation, marius_config->model->encoder);\n\n    dataloader->epochs_processed_ = epochs_processed;\n\n    initialization_timer.stop();\n    int64_t initialization_time = initialization_timer.getDuration();\n\n    SPDLOG_INFO(\"Initialization Complete: {}s\", (double)initialization_time / 1000);\n\n    return std::forward_as_tuple(model, graph_model_storage, dataloader);\n}\n\n// Trains for training->num_epochs epochs, evaluating every evaluation->epochs_per_eval epochs and writing an intermediate\n// checkpoint every training->checkpoint->interval epochs. When training->save_model is set the final model is saved and,\n// if requested, the encoded nodes are exported.\nvoid marius_train(shared_ptr<MariusConfig> marius_config) {\n    auto tup = marius_init(marius_config, true);\n    auto model = std::get<0>(tup);\n    auto graph_model_storage = std::get<1>(tup);\n    auto dataloader = std::get<2>(tup);\n\n    shared_ptr<Trainer> trainer;\n    shared_ptr<Evaluator> evaluator;\n\n    shared_ptr<Checkpointer> model_saver;\n    CheckpointMeta metadata;\n    if (marius_config->training->save_model) {\n        model_saver = std::make_shared<Checkpointer>(model, graph_model_storage, marius_config->training->checkpoint);\n        metadata.has_state = true;\n        metadata.has_encoded = marius_config->storage->export_encoded_nodes;\n        metadata.has_model = true;\n        metadata.link_prediction = marius_config->model->learning_task == LearningTask::LINK_PREDICTION;\n    }\n\n    if (marius_config->training->pipeline->sync) {\n        trainer = std::make_shared<SynchronousTrainer>(dataloader, model, marius_config->training->logs_per_epoch);\n    } else {\n        trainer = std::make_shared<PipelineTrainer>(dataloader, model, marius_config->training->pipeline, marius_config->training->logs_per_epoch);\n    }\n\n    if (marius_config->evaluation->pipeline->sync) {\n        evaluator = std::make_shared<SynchronousEvaluator>(dataloader, model);\n    } else {\n        evaluator = std::make_shared<PipelineEvaluator>(dataloader, model, marius_config->evaluation->pipeline);\n    }\n\n    int checkpoint_interval = marius_config->training->checkpoint->interval;\n    for (int epoch = 0; epoch < marius_config->training->num_epochs; epoch++) {\n        trainer->train(1);\n\n        // A non-positive epochs_per_eval disables periodic evaluation and must never reach the modulo.\n        if (marius_config->evaluation->epochs_per_eval > 0 && (epoch + 1) % marius_config->evaluation->epochs_per_eval == 0) {\n            if (marius_config->storage->dataset->num_valid != -1) {\n                evaluator->evaluate(true);\n            }\n\n            if (marius_config->storage->dataset->num_test != -1) {\n                evaluator->evaluate(false);\n            }\n        }\n\n        metadata.num_epochs = dataloader->epochs_processed_;\n        // model_saver is only created when save_model is set. No intermediate checkpoint is written after the last epoch.\n        if (model_saver != nullptr && checkpoint_interval > 0 && (epoch + 1) % checkpoint_interval == 0 && epoch + 1 < marius_config->training->num_epochs) {\n            model_saver->create_checkpoint(marius_config->storage->model_dir, metadata, dataloader->epochs_processed_);\n        }\n    }\n\n    if (marius_config->training->save_model) {\n        model_saver->save(marius_config->storage->model_dir, metadata);\n\n        if (marius_config->storage->export_encoded_nodes) {\n            encode_and_export(dataloader, model, marius_config);\n        }\n    }\n}\n\n// Restores a trained model, runs evaluator->evaluate(false) when evaluation->epochs_per_eval is positive, and exports the\n// encoded nodes when storage->export_encoded_nodes is set.\nvoid marius_eval(shared_ptr<MariusConfig> marius_config) {\n    auto tup = marius_init(marius_config, false);\n    auto model = std::get<0>(tup);\n    auto graph_model_storage = std::get<1>(tup);\n    auto dataloader = std::get<2>(tup);\n\n    shared_ptr<Evaluator> evaluator;\n\n    if (marius_config->evaluation->epochs_per_eval > 0) {\n        if (marius_config->evaluation->pipeline->sync) {\n            evaluator = std::make_shared<SynchronousEvaluator>(dataloader, model);\n        } else {\n            evaluator = std::make_shared<PipelineEvaluator>(dataloader, model, marius_config->evaluation->pipeline);\n        }\n        evaluator->evaluate(false);\n    }\n\n    if (marius_config->storage->export_encoded_nodes) {\n        encode_and_export(dataloader, model, marius_config);\n    }\n}\n\n// Command-line entry point: argv[0] picks the mode (a binary named marius_eval evaluates, any other name trains) and\n// argv[1] is the path of the configuration file.\n// Throws std::invalid_argument when the configuration path is missing.\nvoid marius(int argc, char *argv[]) {\n    if (argc < 2) {\n        throw std::invalid_argument(\"Missing configuration file: expected <program> <config_file>\");\n    }\n\n    string command_path = string(argv[0]);\n    string config_path = string(argv[1]);\n    string command_name = command_path.substr(command_path.find_last_of(\"/\\\\\") + 1);\n    bool train = command_name != \"marius_eval\";\n\n    shared_ptr<MariusConfig> marius_config = loadConfig(config_path, true);\n\n    if (train) {\n        marius_train(marius_config);\n    } else {\n        marius_eval(marius_config);\n    }\n}\n\nint main(int argc, char *argv[]) { marius(argc, argv); }"
  },
  {
    "path": "src/cpp/src/nn/activation.cpp",
    "content": "//\n// Created by Jason Mohoney on 10/7/21.\n//\n\n#include \"nn/activation.h\"\n\n// Applies the requested activation to input and returns the result; NONE hands the input back unchanged.\n// Throws UndefinedTensorException for an undefined input and MariusRuntimeException for any other activation value.\ntorch::Tensor apply_activation(ActivationFunction activation_function, torch::Tensor input) {\n    if (!input.defined()) {\n        throw UndefinedTensorException();\n    }\n\n    switch (activation_function) {\n        case ActivationFunction::RELU:\n            return torch::relu(input);\n        case ActivationFunction::SIGMOID:\n            return torch::sigmoid(input);\n        case ActivationFunction::NONE:\n            return input;\n        default:\n            throw MariusRuntimeException(\"Unsupported activation function\");\n    }\n}\n"
  },
  {
    "path": "src/cpp/src/nn/decoders/edge/comparators.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#include \"nn/decoders/edge/comparators.h\"\n\n// Pads input with extra trailing rows until its row count is a multiple of num_chunks, then views it as\n// {num_chunks, rows_per_chunk, input.size(1)}. Assumes a 2-D input.\ntorch::Tensor pad_and_reshape(torch::Tensor input, int num_chunks) {\n    int num_pos = input.size(0);\n    int num_per_chunk = (int)ceil((float)num_pos / num_chunks);\n\n    if (num_per_chunk != num_pos / num_chunks) {\n        int64_t new_size = num_per_chunk * num_chunks;\n        torch::nn::functional::PadFuncOptions options({0, 0, 0, new_size - num_pos});\n        input = torch::nn::functional::pad(input, options);\n    }\n\n    input = input.view({num_chunks, num_per_chunk, input.size(1)});\n\n    return input;\n}\n\n// L2 distance between src and dst. Same-shaped inputs use torch::pairwise_distance; otherwise src is chunked to match\n// dst.size(0) and the distance to every row of the matching dst chunk is computed.\ntorch::Tensor L2Compare::operator()(torch::Tensor src, torch::Tensor dst) {\n    if (!src.defined() || !dst.defined()) {\n        throw UndefinedTensorException();\n    }\n\n    if (src.sizes() == dst.sizes()) {\n        return torch::pairwise_distance(src, dst);\n    } else {\n        src = pad_and_reshape(src, dst.size(0));\n\n        torch::Tensor x2 = (src.pow(2)).sum(2).unsqueeze(2);\n        torch::Tensor y2 = (dst.pow(2)).sum(2).unsqueeze(1);\n        torch::Tensor xy = torch::matmul(src, dst.transpose(1, 2));\n\n        // Lower bound applied before the square root so rounding can never produce a negative operand.\n        double tol = 1e-8;\n\n        // (x - y)^2 = x^2 + y^2 - 2*x*y\n        return torch::sqrt(torch::clamp_min(x2 + y2 - 2 * xy, tol)).flatten(0, 1).clone();\n    }\n}\n\n// Cosine similarity between src and dst.\n// Both inputs are L2-normalized along the last dimension (norms are clamped to 1e-10 to avoid dividing by zero) and then\n// compared exactly like DotCompare: element-wise for same-shaped inputs, batched against the dst chunks otherwise.\ntorch::Tensor CosineCompare::operator()(torch::Tensor src, torch::Tensor dst) {\n    if (!src.defined() || !dst.defined()) {\n        throw UndefinedTensorException();\n    }\n\n    torch::Tensor src_norm = src.norm(2, -1);\n    torch::Tensor dst_norm = dst.norm(2, -1);\n\n    torch::Tensor normalized_src = src * src_norm.clamp_min(1e-10).reciprocal().unsqueeze(-1);\n    torch::Tensor normalized_dst = dst * dst_norm.clamp_min(1e-10).reciprocal().unsqueeze(-1);\n\n    if (src.sizes() == dst.sizes()) {\n        return (normalized_src * normalized_dst).sum(-1);\n    } else {\n        // Normalize before padding so the rows added by pad_and_reshape are not rescaled.\n        normalized_src = pad_and_reshape(normalized_src, dst.size(0));\n        return normalized_src.bmm(normalized_dst.transpose(-1, -2)).flatten(0, 1);\n    }\n}\n\n// Dot product between src and dst: element-wise product summed over the last dimension for same-shaped inputs,\n// otherwise a batched matrix product of the chunked src against dst.\ntorch::Tensor DotCompare::operator()(torch::Tensor src, torch::Tensor dst) {\n    if (!src.defined() || !dst.defined()) {\n        throw UndefinedTensorException();\n    }\n\n    if (src.sizes() == dst.sizes()) {\n        return (src * dst).sum(-1);\n    } else {\n        src = pad_and_reshape(src, dst.size(0));\n        return src.bmm(dst.transpose(-1, -2)).flatten(0, 1);\n    }\n}"
  },
  {
    "path": "src/cpp/src/nn/decoders/edge/complex.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#include \"nn/decoders/edge/complex.h\"\n\n// Builds a ComplEx link-prediction decoder: dot-product comparator combined with the complex Hadamard relation operator.\n// The relation parameters are created by reset().\nComplEx::ComplEx(int num_relations, int embedding_size, torch::TensorOptions tensor_options, bool use_inverse_relations, EdgeDecoderMethod decoder_method) {\n    comparator_ = std::make_shared<DotCompare>();\n    relation_operator_ = std::make_shared<ComplexHadamardOperator>();\n    num_relations_ = num_relations;\n    embedding_size_ = embedding_size;\n    use_inverse_relations_ = use_inverse_relations;\n    tensor_options_ = tensor_options;\n    decoder_method_ = decoder_method;\n\n    learning_task_ = LearningTask::LINK_PREDICTION;\n\n    ComplEx::reset();\n}\n\n// Creates and registers the relation parameters. Every embedding starts with ones in its first embedding_size_ / 2\n// columns and zeros in the remaining columns.\nvoid ComplEx::reset() {\n    relations_ = torch::zeros({num_relations_, embedding_size_}, tensor_options_);\n    relations_.narrow(1, 0, (embedding_size_ / 2)).fill_(1);\n    relations_ = register_parameter(\"relation_embeddings\", relations_).set_requires_grad(true);\n    if (use_inverse_relations_) {\n        inverse_relations_ = torch::zeros({num_relations_, embedding_size_}, tensor_options_);\n        // Initialize through the narrowed view, but request gradients on the registered parameter itself rather than on\n        // the temporary view.\n        inverse_relations_.narrow(1, 0, (embedding_size_ / 2)).fill_(1);\n        inverse_relations_ = register_parameter(\"inverse_relation_embeddings\", inverse_relations_).set_requires_grad(true);\n    }\n}"
  },
  {
    "path": "src/cpp/src/nn/decoders/edge/decoder_methods.cpp",
    "content": "//\n// Created by Jason Mohoney on 3/31/22.\n//\n\n#include \"nn/decoders/edge/decoder_methods.h\"\n\nnamespace {\n\n// Returns true for a 3 column edge list (relation ids in column 1) and false for a 2 column one.\n// Any other column count raises TensorSizeMismatchException.\nbool edges_have_relations(torch::Tensor edges) {\n    const int64_t num_columns = edges.size(1);\n    if (num_columns == 3) {\n        return true;\n    }\n    if (num_columns == 2) {\n        return false;\n    }\n    throw TensorSizeMismatchException(edges, \"Edge list must be a 3 or 2 column tensor\");\n}\n\n}  // namespace\n\n// Scores the given edges with the decoder. Returns (scores, inverse_scores); inverse_scores stays undefined unless the\n// edge list has a relation column and the decoder uses inverse relations.\nstd::tuple<torch::Tensor, torch::Tensor> only_pos_forward(shared_ptr<EdgeDecoder> decoder, torch::Tensor edges, torch::Tensor node_embeddings) {\n    const bool has_relations = edges_have_relations(edges);\n\n    // Column 0 holds the source node index, the last column the destination node index.\n    torch::Tensor src = node_embeddings.index_select(0, edges.select(1, 0));\n    torch::Tensor dst = node_embeddings.index_select(0, edges.select(1, -1));\n\n    torch::Tensor pos_scores;\n    torch::Tensor inv_pos_scores;\n\n    if (!has_relations) {\n        pos_scores = decoder->compute_scores(src, dst);\n        return std::make_tuple(pos_scores, inv_pos_scores);\n    }\n\n    torch::Tensor rel_ids = edges.select(1, 1);\n    torch::Tensor adjusted_src = decoder->apply_relation(src, decoder->select_relations(rel_ids));\n    pos_scores = decoder->compute_scores(adjusted_src, dst);\n\n    if (decoder->use_inverse_relations_) {\n        torch::Tensor adjusted_dst = decoder->apply_relation(dst, decoder->select_relations(rel_ids, true));\n        inv_pos_scores = decoder->compute_scores(adjusted_dst, src);\n    }\n\n    return std::make_tuple(pos_scores, inv_pos_scores);\n}\n\n// Scores explicit positive and negative edge lists independently with only_pos_forward.\n// Returns (pos_scores, neg_scores, inv_pos_scores, inv_neg_scores).\nstd::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> neg_and_pos_forward(shared_ptr<EdgeDecoder> decoder, torch::Tensor positive_edges,\n                                                                                           torch::Tensor negative_edges, torch::Tensor node_embeddings) {\n    auto positive = only_pos_forward(decoder, positive_edges, node_embeddings);\n    auto negative = only_pos_forward(decoder, negative_edges, node_embeddings);\n\n    return std::make_tuple(std::get<0>(positive), std::get<0>(negative), std::get<1>(positive), std::get<1>(negative));\n}\n\n// Scores positive edges against corrupted nodes. dst_negs holds the negative destination node indices and is always\n// used; src_negs is only read when the edges carry relation ids and the decoder uses inverse relations.\n// When the score counts differ, pos_scores (and inv_pos_scores when defined) are padded along the last dimension to the\n// length of neg_scores. Returns (pos_scores, neg_scores, inv_pos_scores, inv_neg_scores).\nstd::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> node_corrupt_forward(shared_ptr<EdgeDecoder> decoder, torch::Tensor positive_edges,\n                                                                                            torch::Tensor node_embeddings, torch::Tensor dst_negs,\n                                                                                            torch::Tensor src_negs) {\n    const bool has_relations = edges_have_relations(positive_edges);\n\n    torch::Tensor src = node_embeddings.index_select(0, positive_edges.select(1, 0));\n    torch::Tensor dst = node_embeddings.index_select(0, positive_edges.select(1, -1));\n    torch::Tensor dst_neg_embs = node_embeddings.index_select(0, dst_negs.flatten(0, 1)).reshape({dst_negs.size(0), dst_negs.size(1), -1});\n\n    torch::Tensor pos_scores;\n    torch::Tensor inv_pos_scores;\n    torch::Tensor neg_scores;\n    torch::Tensor inv_neg_scores;\n\n    if (!has_relations) {\n        pos_scores = decoder->compute_scores(src, dst);\n        neg_scores = decoder->compute_scores(src, dst_neg_embs);\n    } else {\n        torch::Tensor rel_ids = positive_edges.select(1, 1);\n        torch::Tensor adjusted_src = decoder->apply_relation(src, decoder->select_relations(rel_ids));\n\n        pos_scores = decoder->compute_scores(adjusted_src, dst);\n        neg_scores = decoder->compute_scores(adjusted_src, dst_neg_embs);\n\n        if (decoder->use_inverse_relations_) {\n            torch::Tensor adjusted_dst = decoder->apply_relation(dst, decoder->select_relations(rel_ids, true));\n            torch::Tensor src_neg_embs = node_embeddings.index_select(0, src_negs.flatten(0, 1)).reshape({src_negs.size(0), src_negs.size(1), -1});\n\n            inv_pos_scores = decoder->compute_scores(adjusted_dst, src);\n            inv_neg_scores = decoder->compute_scores(adjusted_dst, src_neg_embs);\n        }\n    }\n\n    const int64_t num_padding = neg_scores.size(0) - pos_scores.size(0);\n    if (num_padding != 0) {\n        torch::nn::functional::PadFuncOptions options({0, num_padding});\n        pos_scores = torch::nn::functional::pad(pos_scores, options);\n\n        if (inv_pos_scores.defined()) {\n            inv_pos_scores = torch::nn::functional::pad(inv_pos_scores, options);\n        }\n    }\n\n    return std::make_tuple(pos_scores, neg_scores, inv_pos_scores, inv_neg_scores);\n}\n\n// Scores positive edges against the same edges with their relation replaced by neg_rel_ids.\n// Requires a 3 column edge list. Returns (pos_scores, neg_scores, inv_pos_scores, inv_neg_scores); the inverse scores\n// stay undefined unless the decoder uses inverse relations.\nstd::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> rel_corrupt_forward(shared_ptr<EdgeDecoder> decoder, torch::Tensor positive_edges,\n                                                                                           torch::Tensor node_embeddings, torch::Tensor neg_rel_ids) {\n    if (positive_edges.size(1) != 3) {\n        throw TensorSizeMismatchException(positive_edges, \"Edge list must be a 3 column tensor\");\n    }\n\n    torch::Tensor src = node_embeddings.index_select(0, positive_edges.select(1, 0));\n    torch::Tensor dst = node_embeddings.index_select(0, positive_edges.select(1, -1));\n    torch::Tensor rel_ids = positive_edges.select(1, 1);\n\n    torch::Tensor rels = decoder->select_relations(rel_ids);\n    torch::Tensor neg_rels = decoder->select_relations(neg_rel_ids);\n\n    torch::Tensor pos_scores = decoder->compute_scores(decoder->apply_relation(src, rels), dst);\n    torch::Tensor neg_scores = decoder->compute_scores(decoder->apply_relation(src, neg_rels), dst);\n\n    torch::Tensor inv_pos_scores;\n    torch::Tensor inv_neg_scores;\n\n    if (decoder->use_inverse_relations_) {\n        torch::Tensor inv_rels = decoder->select_relations(rel_ids, true);\n        torch::Tensor inv_neg_rels = decoder->select_relations(neg_rel_ids, true);\n\n        inv_pos_scores = decoder->compute_scores(decoder->apply_relation(dst, inv_rels), src);\n        inv_neg_scores = decoder->compute_scores(decoder->apply_relation(dst, inv_neg_rels), src);\n    }\n\n    return std::make_tuple(pos_scores, neg_scores, inv_pos_scores, inv_neg_scores);\n}"
  },
  {
    "path": "src/cpp/src/nn/decoders/edge/distmult.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#include \"nn/decoders/edge/distmult.h\"\n\n// Builds a DistMult link-prediction decoder: dot-product comparator combined with the Hadamard relation operator.\n// The relation parameters are created by reset().\nDistMult::DistMult(int num_relations, int embedding_size, torch::TensorOptions tensor_options, bool use_inverse_relations, EdgeDecoderMethod decoder_method) {\n    // Scoring components.\n    comparator_ = std::make_shared<DotCompare>();\n    relation_operator_ = std::make_shared<HadamardOperator>();\n\n    // Shape and options of the relation parameters.\n    num_relations_ = num_relations;\n    embedding_size_ = embedding_size;\n    tensor_options_ = tensor_options;\n    use_inverse_relations_ = use_inverse_relations;\n\n    decoder_method_ = decoder_method;\n    learning_task_ = LearningTask::LINK_PREDICTION;\n\n    DistMult::reset();\n}\n\n// Creates and registers the relation parameters, all initialized to ones.\nvoid DistMult::reset() {\n    relations_ = torch::ones({num_relations_, embedding_size_}, tensor_options_);\n    relations_.set_requires_grad(true);\n    relations_ = register_parameter(\"relation_embeddings\", relations_);\n\n    if (!use_inverse_relations_) {\n        return;\n    }\n\n    inverse_relations_ = torch::ones({num_relations_, embedding_size_}, tensor_options_);\n    inverse_relations_.set_requires_grad(true);\n    inverse_relations_ = register_parameter(\"inverse_relation_embeddings\", inverse_relations_);\n}"
  },
  {
    "path": "src/cpp/src/nn/decoders/edge/edge_decoder.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/6/22.\n//\n\n#include \"nn/decoders/edge/edge_decoder.h\"\n\n// Applies this decoder's relation operator to the node embeddings.\ntorch::Tensor EdgeDecoder::apply_relation(torch::Tensor nodes, torch::Tensor relations) { return (*relation_operator_)(nodes, relations); }\n\n// Scores src against dst with this decoder's comparator.\ntorch::Tensor EdgeDecoder::compute_scores(torch::Tensor src, torch::Tensor dst) { return (*comparator_)(src, dst); }\n\n// Gathers the relation embeddings at the given indices, taken from the inverse table when inverse is set.\n// Throws UndefinedTensorException when inverse relations are requested but were never created.\ntorch::Tensor EdgeDecoder::select_relations(torch::Tensor indices, bool inverse) {\n    if (!inverse) {\n        return relations_.index_select(0, indices);\n    }\n\n    if (!inverse_relations_.defined()) {\n        throw UndefinedTensorException();\n    }\n    return inverse_relations_.index_select(0, indices);\n}\n"
  },
  {
    "path": "src/cpp/src/nn/decoders/edge/relation_operators.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#include <nn/decoders/edge/relation_operators.h>\n\n// Element-wise product of embeddings and relations; undefined relations leave the embeddings untouched.\ntorch::Tensor HadamardOperator::operator()(const torch::Tensor &embs, const torch::Tensor &rels) {\n    if (rels.defined()) {\n        return embs * rels;\n    }\n    return embs;\n}\n\n// Complex multiplication of embeddings and relations, treating the first dim / 2 columns as the real part and the\n// remaining columns as the imaginary part. Undefined relations leave the embeddings untouched.\ntorch::Tensor ComplexHadamardOperator::operator()(const torch::Tensor &embs, const torch::Tensor &rels) {\n    if (!rels.defined()) {\n        return embs;\n    }\n\n    const int dim = embs.size(1);\n    const int real_len = dim / 2;\n    const int imag_len = dim - real_len;\n\n    torch::Tensor real_emb = embs.narrow(1, 0, real_len);\n    torch::Tensor real_rel = rels.narrow(1, 0, real_len);\n    torch::Tensor imag_emb = embs.narrow(1, real_len, imag_len);\n    torch::Tensor imag_rel = rels.narrow(1, real_len, imag_len);\n\n    // Write both halves in place into a zero tensor shaped like the input.\n    torch::Tensor out = torch::zeros_like(embs);\n    out.narrow(1, 0, real_len).copy_((real_emb * real_rel) - (imag_emb * imag_rel));\n    out.narrow(1, real_len, imag_len).copy_((real_emb * imag_rel) + (imag_emb * real_rel));\n\n    return out;\n}\n\n// Adds the relations to the embeddings; undefined relations leave the embeddings untouched.\ntorch::Tensor TranslationOperator::operator()(const torch::Tensor &embs, const torch::Tensor &rels) {\n    if (rels.defined()) {\n        return embs + rels;\n    }\n    return embs;\n}\n\n// Ignores the relations and returns the embeddings unchanged.\ntorch::Tensor NoOp::operator()(const torch::Tensor &embs, const torch::Tensor &rels) {\n    (void)rels;  // intentionally unused\n    return embs;\n}\n"
  },
  {
    "path": "src/cpp/src/nn/decoders/edge/transe.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#include \"nn/decoders/edge/transe.h\"\n\n// Builds a TransE link-prediction decoder: L2 comparator combined with the translation relation operator.\n// The relation parameters are created by reset().\nTransE::TransE(int num_relations, int embedding_size, torch::TensorOptions tensor_options, bool use_inverse_relations, EdgeDecoderMethod decoder_method) {\n    // Scoring components.\n    comparator_ = std::make_shared<L2Compare>();\n    relation_operator_ = std::make_shared<TranslationOperator>();\n\n    // Shape and options of the relation parameters.\n    num_relations_ = num_relations;\n    embedding_size_ = embedding_size;\n    tensor_options_ = tensor_options;\n    use_inverse_relations_ = use_inverse_relations;\n\n    decoder_method_ = decoder_method;\n    learning_task_ = LearningTask::LINK_PREDICTION;\n\n    TransE::reset();\n}\n\n// Creates and registers the relation parameters, all initialized to zeros.\nvoid TransE::reset() {\n    relations_ = torch::zeros({num_relations_, embedding_size_}, tensor_options_);\n    relations_.set_requires_grad(true);\n    relations_ = register_parameter(\"relation_embeddings\", relations_);\n\n    if (!use_inverse_relations_) {\n        return;\n    }\n\n    inverse_relations_ = torch::zeros({num_relations_, embedding_size_}, tensor_options_);\n    inverse_relations_.set_requires_grad(true);\n    inverse_relations_ = register_parameter(\"inverse_relation_embeddings\", inverse_relations_);\n}\n"
  },
  {
    "path": "src/cpp/src/nn/decoders/node/noop_node_decoder.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/7/22.\n//\n#include \"nn/decoders/node/noop_node_decoder.h\"\n\n// Identity decoder: hands the node tensor back unchanged.\ntorch::Tensor NoOpNodeDecoder::forward(torch::Tensor nodes) {\n    return nodes;\n}\n\n// Nothing to reset.\nvoid NoOpNodeDecoder::reset() {}\n"
  },
  {
    "path": "src/cpp/src/nn/encoders/encoder.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#include \"nn/encoders/encoder.h\"\n\n#include \"nn/activation.h\"\n#include \"nn/layers/embedding/embedding.h\"\n#include \"nn/layers/feature/feature.h\"\n#include \"nn/layers/gnn/gat_layer.h\"\n#include \"nn/layers/gnn/gcn_layer.h\"\n#include \"nn/layers/gnn/graph_sage_layer.h\"\n#include \"nn/layers/gnn/rgcn_layer.h\"\n#include \"nn/layers/reduction/concat.h\"\n#include \"nn/layers/reduction/linear.h\"\n#include \"nn/layers/reduction/reduction_layer.h\"\n\nGeneralEncoder::GeneralEncoder(shared_ptr<EncoderConfig> encoder_config, torch::Device device, int num_relations) : device_(torch::kCPU) {\n    encoder_config_ = encoder_config;\n    num_relations_ = num_relations;\n    device_ = device;\n\n    has_features_ = false;\n    has_embeddings_ = false;\n\n    reset();\n}\n\nGeneralEncoder::GeneralEncoder(std::vector<std::vector<shared_ptr<Layer>>> layers) : device_(torch::kCPU) {\n    layers_ = layers;\n    device_ = layers_[0][0]->device_;\n\n    int stage_id = 0;\n    for (auto stage : layers_) {\n        int layer_id = 0;\n        for (auto layer : stage) {\n            if (layer->device_ != device_) {\n                throw MariusRuntimeException(\"All layers of the encoder must use the same device.\");\n            }\n\n            // TODO unify with initLayer functions\n            string name;\n            if (instance_of<Layer, EmbeddingLayer>(layer)) {\n                name = \"embedding:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n                register_module<EmbeddingLayer>(name, std::dynamic_pointer_cast<EmbeddingLayer>(layer));\n            } else if (instance_of<Layer, FeatureLayer>(layer)) {\n                name = \"feature:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n                register_module<FeatureLayer>(name, std::dynamic_pointer_cast<FeatureLayer>(layer));\n            } else if (instance_of<Layer, ReductionLayer>(layer)) {\n   
             if (instance_of<Layer, LinearReduction>(layer)) {\n                    name = \"linear_reduction:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n                    register_module<LinearReduction>(name, std::dynamic_pointer_cast<LinearReduction>(layer));\n                } else if (instance_of<Layer, ConcatReduction>(layer)) {\n                    name = \"concat_reduction:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n                    register_module<ConcatReduction>(name, std::dynamic_pointer_cast<ConcatReduction>(layer));\n                } else {\n                    throw std::runtime_error(\"Unrecognized reduction layer type\");\n                }\n            } else if (instance_of<Layer, GNNLayer>(layer)) {\n                if (instance_of<Layer, GraphSageLayer>(layer)) {\n                    string name = \"graph_sage_layer:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n                    register_module<GraphSageLayer>(name, std::dynamic_pointer_cast<GraphSageLayer>(layer));\n                } else if (instance_of<Layer, GATLayer>(layer)) {\n                    string name = \"gat_layer:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n                    register_module<GATLayer>(name, std::dynamic_pointer_cast<GATLayer>(layer));\n                } else if (instance_of<Layer, GCNLayer>(layer)) {\n                    string name = \"gcn_layer:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n                    register_module<GCNLayer>(name, std::dynamic_pointer_cast<GCNLayer>(layer));\n                } else if (instance_of<Layer, RGCNLayer>(layer)) {\n                    string name = \"rgcn_layer:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n                    register_module<RGCNLayer>(name, std::dynamic_pointer_cast<RGCNLayer>(layer));\n                } else {\n                    throw 
std::runtime_error(\"Unrecognized GNN layer type\");\n                }\n            } else {\n                throw std::runtime_error(\"Unsupported layer type\");\n            }\n            layer_id++;\n        }\n        stage_id++;\n    }\n    encoder_config_ = nullptr;\n}\n\nshared_ptr<Layer> GeneralEncoder::initEmbeddingLayer(shared_ptr<LayerConfig> layer_config, int stage_id, int layer_id) {\n    string name = \"embedding:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n    shared_ptr<Layer> layer = std::make_shared<EmbeddingLayer>(layer_config, device_);\n    register_module<EmbeddingLayer>(name, std::dynamic_pointer_cast<EmbeddingLayer>(layer));\n    has_embeddings_ = true;\n    return layer;\n}\n\nshared_ptr<Layer> GeneralEncoder::initFeatureLayer(shared_ptr<LayerConfig> layer_config, int stage_id, int layer_id) {\n    string name = \"feature:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n    shared_ptr<Layer> layer = std::make_shared<FeatureLayer>(layer_config, device_);\n    register_module<FeatureLayer>(name, std::dynamic_pointer_cast<FeatureLayer>(layer));\n    has_features_ = true;\n    return layer;\n}\n\nshared_ptr<Layer> GeneralEncoder::initReductionLayer(shared_ptr<LayerConfig> layer_config, int stage_id, int layer_id) {\n    auto options = std::dynamic_pointer_cast<ReductionLayerOptions>(layer_config->options);\n\n    if (options->type == ReductionLayerType::LINEAR) {\n        string name = \"linear_reduction:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n        shared_ptr<Layer> layer = std::make_shared<LinearReduction>(layer_config, device_);\n        register_module<LinearReduction>(name, std::dynamic_pointer_cast<LinearReduction>(layer));\n        return layer;\n    } else if (options->type == ReductionLayerType::CONCAT) {\n        string name = \"concat_reduction:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n        shared_ptr<Layer> layer = 
std::make_shared<ConcatReduction>(layer_config, device_);\n        register_module<ConcatReduction>(name, std::dynamic_pointer_cast<ConcatReduction>(layer));\n        return layer;\n    } else {\n        throw std::runtime_error(\"Unrecognized reduction layer type\");\n    }\n}\n\nshared_ptr<Layer> GeneralEncoder::initGNNLayer(std::shared_ptr<LayerConfig> layer_config, int stage_id, int layer_id, int sampling_id) {\n    auto options = std::dynamic_pointer_cast<GNNLayerOptions>(layer_config->options);\n\n    std::shared_ptr<Layer> layer;\n\n    if (options->type == GNNLayerType::GRAPH_SAGE) {\n        string name = \"graph_sage_layer:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n        layer = std::make_shared<GraphSageLayer>(layer_config, device_);\n        register_module<GraphSageLayer>(name, std::dynamic_pointer_cast<GraphSageLayer>(layer));\n    } else if (options->type == GNNLayerType::GAT) {\n        string name = \"gat_layer:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n        layer = std::make_shared<GATLayer>(layer_config, device_);\n        register_module<GATLayer>(name, std::dynamic_pointer_cast<GATLayer>(layer));\n    } else if (options->type == GNNLayerType::GCN) {\n        string name = \"gcn_layer:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n        layer = std::make_shared<GCNLayer>(layer_config, device_);\n        register_module<GCNLayer>(name, std::dynamic_pointer_cast<GCNLayer>(layer));\n    } else if (options->type == GNNLayerType::RGCN) {\n        string name = \"rgcn_layer:\" + std::to_string(stage_id) + \"_\" + std::to_string(layer_id);\n        layer = std::make_shared<RGCNLayer>(layer_config, num_relations_, device_);\n        register_module<RGCNLayer>(name, std::dynamic_pointer_cast<RGCNLayer>(layer));\n    } else {\n        throw std::runtime_error(\"Unrecognized GNN layer type\");\n    }\n\n    return layer;\n}\n\nvoid GeneralEncoder::reset() {\n    if (encoder_config_ 
!= nullptr) {\n        layers_.clear();\n\n        int num_sampling_layers = encoder_config_->train_neighbor_sampling.size();\n\n        if (num_sampling_layers == 0) {\n            num_sampling_layers = encoder_config_->eval_neighbor_sampling.size();\n        }\n        int curr_sampling_layer = 0;\n\n        int stage_id = 0;\n        for (auto stage_config : encoder_config_->layers) {\n            std::vector<std::shared_ptr<Layer>> stage_layer;\n\n            int layer_id = 0;\n            for (auto layer_config : stage_config) {\n                std::shared_ptr<Layer> layer;\n\n                if (layer_config->type == LayerType::EMBEDDING) {\n                    layer = initEmbeddingLayer(layer_config, stage_id, layer_id);\n                } else if (layer_config->type == LayerType::FEATURE) {\n                    layer = initFeatureLayer(layer_config, stage_id, layer_id);\n                } else if (layer_config->type == LayerType::REDUCTION) {\n                    layer = initReductionLayer(layer_config, stage_id, layer_id);\n                } else if (layer_config->type == LayerType::GNN) {\n                    assert(curr_sampling_layer < num_sampling_layers);\n                    layer = initGNNLayer(layer_config, stage_id, layer_id, curr_sampling_layer);\n                    curr_sampling_layer++;\n                } else {\n                    throw std::runtime_error(\"Unsupported layer type\");\n                }\n\n                stage_layer.push_back(layer);\n                layer_id++;\n            }\n            layers_.push_back(stage_layer);\n            stage_id++;\n        }\n    } else {\n        for (auto stage : layers_) {\n            for (auto layer : stage) {\n                layer->reset();\n            }\n        }\n    }\n}\n\ntorch::Tensor GeneralEncoder::forward(at::optional<torch::Tensor> embeddings, at::optional<torch::Tensor> features, DENSEGraph dense_graph, bool train) {\n    dense_graph.performMap();\n\n    
std::vector<torch::Tensor> outputs = {};\n\n    for (int i = 0; i < layers_.size(); i++) {\n        bool use_sample = false;\n        bool added_output = false;\n\n        int64_t output_size;\n        if (embeddings.has_value() && embeddings.value().defined()) {\n            output_size = embeddings.value().size(0);\n        } else if (features.has_value() && features.value().defined()) {\n            output_size = features.value().size(0);\n        } else {\n            throw MariusRuntimeException(\"Encoder requires embeddings and/or features as input\");\n        }\n\n        for (int j = 0; j < layers_[i].size(); j++) {\n            if (instance_of<Layer, GNNLayer>(layers_[i][j])) {\n                output_size = dense_graph.node_ids_.size(0) - (dense_graph.hop_offsets_[1].item<int64_t>() - dense_graph.hop_offsets_[0].item<int64_t>());\n            }\n        }\n\n        std::vector<torch::Tensor> max_outputs(layers_[i].size());\n        for (int j = 0; j < layers_[i].size(); j++) {\n            if (instance_of<Layer, EmbeddingLayer>(layers_[i][j])) {\n                max_outputs[j] = std::dynamic_pointer_cast<EmbeddingLayer>(layers_[i][j])->forward(embeddings.value().narrow(0, 0, output_size));\n                max_outputs[j] = layers_[i][j]->post_hook(max_outputs[j]);\n                added_output = true;\n            } else if (instance_of<Layer, FeatureLayer>(layers_[i][j])) {\n                max_outputs[j] = std::dynamic_pointer_cast<FeatureLayer>(layers_[i][j])->forward(features.value().narrow(0, 0, output_size));\n                max_outputs[j] = layers_[i][j]->post_hook(max_outputs[j]);\n                added_output = true;\n            } else if (instance_of<Layer, ReductionLayer>(layers_[i][j])) {\n                std::vector<torch::Tensor> new_outputs(1);\n                new_outputs[0] = std::dynamic_pointer_cast<ReductionLayer>(layers_[i][j])->forward(outputs);\n                new_outputs[0] = layers_[i][j]->post_hook(new_outputs[0]);\n         
       outputs = new_outputs;\n            } else if (instance_of<Layer, GNNLayer>(layers_[i][j])) {\n                outputs[j] = std::dynamic_pointer_cast<GNNLayer>(layers_[i][j])->forward(outputs[j], dense_graph, train);\n                outputs[j] = layers_[i][j]->post_hook(outputs[j]);\n                use_sample = true;\n            } else {\n                throw std::runtime_error(\"Unsupported layer type\");\n            }\n        }\n        // added embedding / features in this stage\n        if (added_output) {\n            for (int j = outputs.size(); j < max_outputs.size(); j++) {\n                outputs.emplace_back(max_outputs[j]);\n            }\n        }\n\n        // used GNN layer at this stage\n        if (use_sample && i < layers_.size() - 1) {\n            dense_graph.prepareForNextLayer();\n        }\n    }\n\n    assert(outputs.size() == 1);\n\n    return outputs[0];\n}"
  },
  {
    "path": "src/cpp/src/nn/initialization.cpp",
    "content": "//\n// Created by Jason Mohoney on 10/7/21.\n//\n\n#include \"nn/initialization.h\"\n\nstd::tuple<int64_t, int64_t> compute_fans(std::vector<int64_t> shape) {\n    int64_t fan_in = 0;\n    int64_t fan_out = 0;\n\n    if (shape.size() < 1) {\n        fan_in = fan_out = 1;\n    } else if (shape.size() == 1) {\n        fan_in = fan_out = shape[0];\n    } else if (shape.size() == 2) {\n        fan_in = shape[0];\n        fan_out = shape[1];\n    } else {\n        fan_in = shape[shape.size() - 2];\n        fan_out = shape[shape.size() - 1];\n    }\n\n    return std::forward_as_tuple(fan_in, fan_out);\n}\n\ntorch::Tensor glorot_uniform(std::vector<int64_t> shape, std::tuple<int64_t, int64_t> fans, torch::TensorOptions options) {\n    int64_t fan_in = std::get<0>(fans);\n    int64_t fan_out = std::get<1>(fans);\n\n    if (fan_in == -1 || fan_out == -1) {\n        auto tup = compute_fans(shape);\n        fan_in = std::get<0>(tup);\n        fan_out = std::get<1>(tup);\n    }\n\n    float limit = sqrt(6.0 / (fan_in + fan_out));\n    torch::Tensor ret = torch::rand(shape, options);\n    ret = 2 * limit * (ret - .5);\n\n    return ret;\n}\n\ntorch::Tensor glorot_normal(std::vector<int64_t> shape, std::tuple<int64_t, int64_t> fans, torch::TensorOptions options) {\n    int64_t fan_in = std::get<0>(fans);\n    int64_t fan_out = std::get<1>(fans);\n\n    if (fan_in == -1 || fan_out == -1) {\n        auto tup = compute_fans(shape);\n        fan_in = std::get<0>(tup);\n        fan_out = std::get<1>(tup);\n    }\n\n    float std = sqrt(2.0 / (fan_in + fan_out));\n\n    return torch::randn(shape, options).mul_(std);\n}\n\ntorch::Tensor uniform_init(float scale_factor, std::vector<int64_t> shape, torch::TensorOptions options) {\n    return (2 * torch::rand(shape, options) - 1).mul_(scale_factor);\n}\ntorch::Tensor normal_init(float mean, float std, std::vector<int64_t> shape, torch::TensorOptions options) {\n    return torch::randn(shape, options).mul_(std) + 
mean;\n}\n\ntorch::Tensor constant_init(float constant, std::vector<int64_t> shape, torch::TensorOptions options) { return torch::ones(shape, options) * constant; }\n\ntorch::Tensor initialize_tensor(shared_ptr<InitConfig> init_config, std::vector<int64_t> shape, torch::TensorOptions tensor_options,\n                                std::tuple<int64_t, int64_t> fans) {\n    InitDistribution init_distribution = init_config->type;\n    shared_ptr<InitOptions> init_options = init_config->options;\n\n    torch::Tensor ret;\n\n    if (init_distribution == InitDistribution::GLOROT_NORMAL) {\n        ret = glorot_normal(shape, fans, tensor_options);\n    } else if (init_distribution == InitDistribution::GLOROT_UNIFORM) {\n        ret = glorot_uniform(shape, fans, tensor_options);\n    } else if (init_distribution == InitDistribution::UNIFORM) {\n        float scale_factor = std::dynamic_pointer_cast<UniformInitOptions>(init_options)->scale_factor;\n        ret = uniform_init(scale_factor, shape, tensor_options);\n    } else if (init_distribution == InitDistribution::NORMAL) {\n        float mean = std::dynamic_pointer_cast<NormalInitOptions>(init_options)->mean;\n        float std = std::dynamic_pointer_cast<NormalInitOptions>(init_options)->std;\n        ret = normal_init(mean, std, shape, tensor_options);\n    } else if (init_distribution == InitDistribution::ZEROS) {\n        ret = torch::zeros(shape, tensor_options);\n    } else if (init_distribution == InitDistribution::ONES) {\n        ret = torch::ones(shape, tensor_options);\n    } else if (init_distribution == InitDistribution::CONSTANT) {\n        float constant = std::dynamic_pointer_cast<ConstantInitOptions>(init_options)->constant;\n        ret = constant_init(constant, shape, tensor_options);\n    } else {\n        throw std::runtime_error(\"Unimplemented initialization distribution\");\n    }\n\n    return ret;\n}\n\n// Allows for initialization of small pieces of a larger tensor, for initialization methods 
which scale based on the tensor size\ntorch::Tensor initialize_subtensor(shared_ptr<InitConfig> init_config, std::vector<int64_t> sub_shape, std::vector<int64_t> full_shape,\n                                   torch::TensorOptions tensor_options, std::tuple<int64_t, int64_t> fans) {\n    InitDistribution init_distribution = init_config->type;\n    torch::Tensor ret;\n\n    if (init_distribution == InitDistribution::GLOROT_NORMAL) {\n        if (std::get<0>(fans) == -1 || std::get<1>(fans) == -1) {\n            fans = compute_fans(full_shape);\n        }\n        ret = glorot_normal(sub_shape, fans, tensor_options);\n    } else if (init_distribution == InitDistribution::GLOROT_UNIFORM) {\n        if (std::get<0>(fans) == -1 || std::get<1>(fans) == -1) {\n            fans = compute_fans(full_shape);\n        }\n        ret = glorot_uniform(sub_shape, fans, tensor_options);\n    } else {\n        ret = initialize_tensor(init_config, sub_shape, tensor_options);\n    }\n\n    return ret;\n}"
  },
  {
    "path": "src/cpp/src/nn/layers/embedding/embedding.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/1/22.\n//\n\n#include \"nn/layers/embedding/embedding.h\"\n\n#include \"nn/initialization.h\"\n\nEmbeddingLayer::EmbeddingLayer(shared_ptr<LayerConfig> layer_config, torch::Device device, int offset) {\n    config_ = layer_config;\n    offset_ = offset;\n    device_ = device;\n\n    reset();\n}\n\ntorch::Tensor EmbeddingLayer::forward(torch::Tensor input) { return input.narrow(1, offset_, config_->output_dim); }\n\ntorch::Tensor EmbeddingLayer::init_embeddings(int64_t num_nodes) {\n    auto options = torch::TensorOptions().device(device_).dtype(torch::kFloat32);\n    torch::Tensor embs = initialize_tensor(config_->init, {num_nodes, config_->output_dim}, options);\n\n    return embs;\n}\n\nvoid EmbeddingLayer::reset() {\n    if (config_->bias) {\n        init_bias();\n    }\n}"
  },
  {
    "path": "src/cpp/src/nn/layers/feature/feature.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/1/22.\n//\n\n#include \"nn/layers/feature/feature.h\"\n\nFeatureLayer::FeatureLayer(shared_ptr<LayerConfig> layer_config, torch::Device device, int offset) {\n    config_ = layer_config;\n    offset_ = offset;\n    device_ = device;\n\n    reset();\n}\n\ntorch::Tensor FeatureLayer::forward(torch::Tensor input) { return input.narrow(1, offset_, config_->output_dim); }\n\nvoid FeatureLayer::reset() {\n    if (config_->bias) {\n        init_bias();\n    }\n}"
  },
  {
    "path": "src/cpp/src/nn/layers/gnn/gat_layer.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#include \"nn/layers/gnn/gat_layer.h\"\n\n#include \"nn/layers/gnn/layer_helpers.h\"\n\nGATLayer::GATLayer(shared_ptr<LayerConfig> layer_config, torch::Device device) {\n    config_ = layer_config;\n    options_ = std::dynamic_pointer_cast<GATLayerOptions>(layer_config->options);\n    input_dim_ = layer_config->input_dim;\n    output_dim_ = layer_config->output_dim;\n\n    device_ = device;\n\n    input_dropout_ = options_->input_dropout;\n    attention_dropout_ = options_->attention_dropout;\n\n    if (options_->average_heads) {\n        head_dim_ = output_dim_;\n    } else {\n        assert(output_dim_ % options_->num_heads == 0);\n        head_dim_ = output_dim_ / options_->num_heads;\n    }\n\n    reset();\n}\n\nvoid GATLayer::reset() {\n    auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32).device(device_);\n\n    torch::Tensor weight_matrices =\n        initialize_tensor(config_->init, {head_dim_ * options_->num_heads, input_dim_}, tensor_options, {input_dim_, head_dim_}).set_requires_grad(true);\n\n    torch::Tensor a_l = initialize_tensor(config_->init, {options_->num_heads, 1, head_dim_}, tensor_options, {head_dim_, 1}).set_requires_grad(true);\n\n    torch::Tensor a_r = initialize_tensor(config_->init, {options_->num_heads, 1, head_dim_}, tensor_options, {head_dim_, 1}).set_requires_grad(true);\n\n    weight_matrices_ = register_parameter(\"weight_matrices\", weight_matrices);\n    a_l_ = register_parameter(\"a_l\", a_l);\n    a_r_ = register_parameter(\"a_r\", a_r);\n\n    if (config_->bias) {\n        init_bias();\n    }\n}\n\ntorch::Tensor GATLayer::forward(torch::Tensor inputs, DENSEGraph dense_graph, bool train) {\n    auto relu_opts = torch::nn::LeakyReLUOptions();\n    relu_opts.negative_slope(options_->negative_slope);\n    auto leaky_relu = torch::nn::LeakyReLU(relu_opts);\n\n    Indices neighbors;\n    Indices neighbor_offsets;\n    Indices total_neighbors;\n\n   
 if (dense_graph.out_neighbors_mapping_.defined() && dense_graph.in_neighbors_mapping_.defined()) {\n        auto tup = dense_graph.getCombinedNeighborIDs();\n        neighbors = std::get<2>(tup);\n        neighbor_offsets = std::get<0>(tup);\n        total_neighbors = std::get<1>(tup);\n    } else if (dense_graph.in_neighbors_mapping_.defined()) {\n        neighbors = dense_graph.getNeighborIDs(true, false);\n        neighbor_offsets = dense_graph.getNeighborOffsets(true);\n        total_neighbors = dense_graph.getNumNeighbors(true);\n    } else if (dense_graph.out_neighbors_mapping_.defined()) {\n        neighbors = dense_graph.getNeighborIDs(false, false);\n        neighbor_offsets = dense_graph.getNeighborOffsets(false);\n        total_neighbors = dense_graph.getNumNeighbors(false);\n    } else {\n        throw MariusRuntimeException(\"No neighbors defined.\");\n    }\n\n    int64_t layer_offset = dense_graph.getLayerOffset();\n    torch::Tensor parent_ids = torch::arange(inputs.size(0) - layer_offset, total_neighbors.options()).repeat_interleave(total_neighbors);\n\n    if (train && input_dropout_ > 0) {\n        auto dropout_opts = torch::nn::DropoutOptions().p(input_dropout_).inplace(false);\n        auto dropout = torch::nn::Dropout(dropout_opts);\n        inputs = dropout(inputs);\n    }\n\n    torch::Tensor nbr_embeddings = inputs.index_select(0, neighbors);\n    torch::Tensor nbr_transforms = torch::matmul(weight_matrices_, nbr_embeddings.transpose(0, 1));\n    nbr_transforms = nbr_transforms.reshape({options_->num_heads, head_dim_, -1});\n\n    // free memory as this tensor can become large with large numbers of neighbors\n    nbr_embeddings = torch::Tensor();\n\n    torch::Tensor self_embs = inputs.narrow(0, layer_offset, inputs.size(0) - layer_offset);\n    torch::Tensor self_transforms = torch::matmul(weight_matrices_, self_embs.transpose(0, 1));\n    self_transforms = self_transforms.reshape({options_->num_heads, head_dim_, -1});\n    torch::Tensor 
self_transforms_l = torch::matmul(a_l_, self_transforms);\n\n    torch::Tensor self_atn_weights = self_transforms_l + torch::matmul(a_r_, self_transforms);\n    self_atn_weights = leaky_relu(self_atn_weights);\n\n    self_transforms_l = self_transforms_l.index_select(-1, parent_ids);\n    torch::Tensor nbr_atn_weights = self_transforms_l + torch::matmul(a_r_, nbr_transforms);\n    nbr_atn_weights = leaky_relu(nbr_atn_weights);\n\n    nbr_atn_weights = nbr_atn_weights.transpose(0, 2);    // [total_num_nbrs, 1, num_heads_]\n    self_atn_weights = self_atn_weights.transpose(0, 2);  // [num_to_encode, 1, num_heads_]\n\n    std::tie(nbr_atn_weights, self_atn_weights) = attention_softmax(nbr_atn_weights, self_atn_weights, neighbor_offsets, parent_ids, total_neighbors);\n\n    nbr_atn_weights = nbr_atn_weights.transpose(0, 2);\n    self_atn_weights = self_atn_weights.transpose(0, 2);\n\n    if (train && attention_dropout_ > 0) {\n        auto dropout_opts = torch::nn::DropoutOptions().p(attention_dropout_).inplace(false);\n        auto dropout = torch::nn::Dropout(dropout_opts);\n\n        nbr_atn_weights = dropout(nbr_atn_weights);\n        self_atn_weights = dropout(self_atn_weights);\n    }\n\n    nbr_atn_weights = nbr_atn_weights.repeat({1, head_dim_, 1});\n\n    torch::Tensor tmp = (nbr_transforms * nbr_atn_weights).transpose(0, 2);\n    torch::Tensor h_i = segmented_sum(tmp, parent_ids, total_neighbors.size(0));\n    h_i = h_i.transpose(0, 2);\n\n    tmp = self_transforms * self_atn_weights;\n    h_i = h_i + tmp;\n\n    if (options_->average_heads) {\n        h_i = torch::mean(h_i, 0);\n    } else {\n        h_i = h_i.reshape({output_dim_, -1});\n    }\n\n    h_i = h_i.transpose(0, 1);\n\n    // this has been moved to the post_hook\n    //    if (config_->bias) {\n    //        h_i = h_i + bias_;\n    //    }\n\n    return h_i;\n}"
  },
  {
    "path": "src/cpp/src/nn/layers/gnn/gcn_layer.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#include \"nn/layers/gnn/gcn_layer.h\"\n\n#include \"nn/layers/gnn/layer_helpers.h\"\n\nGCNLayer::GCNLayer(shared_ptr<LayerConfig> layer_config, torch::Device device) {\n    config_ = layer_config;\n    options_ = std::dynamic_pointer_cast<GNNLayerOptions>(config_->options);\n    input_dim_ = config_->output_dim;\n    output_dim_ = config_->input_dim;\n    device_ = device;\n\n    reset();\n}\n\nvoid GCNLayer::reset() {\n    auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32).device(device_);\n    torch::Tensor edge_mat = initialize_tensor(config_->init, {output_dim_, input_dim_}, tensor_options).set_requires_grad(true);\n\n    w_ = register_parameter(\"w\", edge_mat);\n    if (config_->bias) {\n        init_bias();\n    }\n}\n\ntorch::Tensor GCNLayer::forward(torch::Tensor inputs, DENSEGraph dense_graph, bool train) {\n    torch::Tensor total_num_neighbors;\n    torch::Tensor a_i;\n\n    if (dense_graph.out_neighbors_mapping_.defined()) {\n        Indices outgoing_neighbors = dense_graph.getNeighborIDs(false, false);\n        Indices outgoing_neighbor_offsets = dense_graph.getNeighborOffsets(false);\n        torch::Tensor outgoing_num = dense_graph.getNumNeighbors(false);\n        total_num_neighbors = outgoing_num;\n\n        torch::Tensor outgoing_embeddings = inputs.index_select(0, outgoing_neighbors);\n        torch::Tensor outgoing_normalization = torch::sqrt(dense_graph.node_properties_.index_select(0, outgoing_neighbors) + 1).unsqueeze(-1);\n        outgoing_embeddings = outgoing_embeddings / outgoing_normalization;\n        a_i = segmented_sum_with_offsets(outgoing_embeddings, outgoing_neighbor_offsets);\n    }\n\n    if (dense_graph.in_neighbors_mapping_.defined()) {\n        Indices incoming_neighbors = dense_graph.getNeighborIDs(true, false);\n        Indices incoming_neighbor_offsets = dense_graph.getNeighborOffsets(true);\n        torch::Tensor incoming_num = 
dense_graph.getNumNeighbors(true);\n\n        if (total_num_neighbors.defined()) {\n            total_num_neighbors = total_num_neighbors + incoming_num;\n        } else {\n            total_num_neighbors = incoming_num;\n        }\n\n        torch::Tensor incoming_embeddings = inputs.index_select(0, incoming_neighbors);\n        torch::Tensor incoming_normalization = torch::sqrt(dense_graph.node_properties_.index_select(0, incoming_neighbors) + 1).unsqueeze(-1);\n        incoming_embeddings = incoming_embeddings / incoming_normalization;\n\n        if (a_i.defined()) {\n            a_i = a_i + segmented_sum_with_offsets(incoming_embeddings, incoming_neighbor_offsets);\n        } else {\n            a_i = segmented_sum_with_offsets(incoming_embeddings, incoming_neighbor_offsets);\n        }\n    }\n\n    int64_t layer_offset = dense_graph.getLayerOffset();\n    torch::Tensor self_embs = inputs.narrow(0, layer_offset, inputs.size(0) - layer_offset);\n\n    a_i = a_i + self_embs / torch::sqrt((total_num_neighbors + 1)).unsqueeze(-1);\n    a_i = a_i / torch::sqrt((total_num_neighbors + 1)).unsqueeze(-1);\n    torch::Tensor outputs = torch::matmul(w_, a_i.transpose(0, -1)).transpose(0, -1);\n    return outputs + bias_;\n}"
  },
  {
    "path": "src/cpp/src/nn/layers/gnn/graph_sage_layer.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#include \"nn/layers/gnn/graph_sage_layer.h\"\n\n#include \"nn/layers/gnn/layer_helpers.h\"\n#include \"reporting/logger.h\"\n\nGraphSageLayer::GraphSageLayer(shared_ptr<LayerConfig> layer_config, torch::Device device) {\n    config_ = layer_config;\n    options_ = std::dynamic_pointer_cast<GraphSageLayerOptions>(config_->options);\n    input_dim_ = config_->input_dim;\n    output_dim_ = config_->output_dim;\n    device_ = device;\n\n    reset();\n}\n\nvoid GraphSageLayer::reset() {\n    auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32).device(device_);\n\n    // Note: need to multiply the fans by 1/2 to match DGL's initialization\n    torch::Tensor edge_mat = initialize_tensor(config_->init, {output_dim_, input_dim_}, tensor_options).set_requires_grad(true);\n    w1_ = register_parameter(\"w1\", edge_mat);\n\n    if (options_->aggregator == GraphSageAggregator::MEAN) {\n        edge_mat = initialize_tensor(config_->init, {output_dim_, input_dim_}, tensor_options).set_requires_grad(true);\n        w2_ = register_parameter(\"w2\", edge_mat);\n    }\n\n    if (config_->bias) {\n        init_bias();\n    }\n}\n\ntorch::Tensor GraphSageLayer::forward(torch::Tensor inputs, DENSEGraph dense_graph, bool train) {\n    torch::Tensor total_num_neighbors;\n    torch::Tensor a_i;\n    //\n    //    return in_neighbors_mapping_;\n    //    return out_neighbors_mapping_;\n\n    if (dense_graph.out_neighbors_mapping_.defined()) {\n        Indices outgoing_neighbors = dense_graph.getNeighborIDs(false, false);\n        Indices outgoing_neighbor_offsets = dense_graph.getNeighborOffsets(false);\n        torch::Tensor outgoing_num = dense_graph.getNumNeighbors(false);\n\n        torch::Tensor outgoing_embeddings = inputs.index_select(0, outgoing_neighbors);\n\n        total_num_neighbors = outgoing_num;\n        a_i = segmented_sum_with_offsets(outgoing_embeddings, outgoing_neighbor_offsets);\n    
}\n\n    if (dense_graph.in_neighbors_mapping_.defined()) {\n        Indices incoming_neighbors = dense_graph.getNeighborIDs(true, false);\n        Indices incoming_neighbor_offsets = dense_graph.getNeighborOffsets(true);\n        torch::Tensor incoming_num = dense_graph.getNumNeighbors(true);\n\n        if (total_num_neighbors.defined()) {\n            total_num_neighbors = total_num_neighbors + incoming_num;\n        } else {\n            total_num_neighbors = incoming_num;\n        }\n\n        torch::Tensor incoming_embeddings = inputs.index_select(0, incoming_neighbors);\n\n        if (a_i.defined()) {\n            a_i = a_i + segmented_sum_with_offsets(incoming_embeddings, incoming_neighbor_offsets);\n        } else {\n            a_i = segmented_sum_with_offsets(incoming_embeddings, incoming_neighbor_offsets);\n        }\n    }\n\n    int64_t layer_offset = dense_graph.getLayerOffset();\n    torch::Tensor self_embs = inputs.narrow(0, layer_offset, inputs.size(0) - layer_offset);\n\n    torch::Tensor outputs;\n    if (options_->aggregator == GraphSageAggregator::GCN) {\n        a_i = a_i + self_embs;\n        a_i = a_i / (total_num_neighbors + 1).unsqueeze(-1);\n        outputs = torch::matmul(w1_, a_i.transpose(0, -1)).transpose(0, -1);\n    } else if (options_->aggregator == GraphSageAggregator::MEAN) {\n        if (total_num_neighbors.defined()) {\n            torch::Tensor denominator = torch::where(torch::not_equal(total_num_neighbors, 0), total_num_neighbors, 1).to(a_i.dtype()).unsqueeze(-1);\n            a_i = a_i / denominator;\n            outputs = (torch::matmul(w1_, self_embs.transpose(0, -1)) + torch::matmul(w2_, a_i.transpose(0, -1))).transpose(0, -1);\n        } else {\n            outputs = torch::matmul(w1_, self_embs.transpose(0, -1)).transpose(0, -1);\n        }\n\n    } else {\n        throw std::runtime_error(\"Unrecognized aggregator\");\n    }\n\n    return outputs;\n}"
  },
  {
    "path": "src/cpp/src/nn/layers/gnn/layer_helpers.cpp",
    "content": "//\n// Created by Jason Mohoney on 10/1/21.\n//\n\n#include \"nn/layers/gnn/layer_helpers.h\"\n\n#ifdef MARIUS_CUDA\n    #include \"pytorch_scatter/segment_max.h\"\n#endif\n\ntorch::Tensor segment_ids_from_offsets(torch::Tensor segment_offsets, int64_t input_size) {\n    torch::Tensor segment_ids = torch::zeros({input_size + 1}, segment_offsets.options());\n    torch::Tensor ones_tensor = torch::ones({segment_offsets.size(0)}, segment_offsets.options());\n    segment_ids.index_add_(0, segment_offsets, ones_tensor);\n    segment_ids = segment_ids.cumsum(0) - 1;\n    return segment_ids.narrow(0, 0, segment_ids.size(0) - 1);\n}\n\ntorch::Tensor segmented_sum(torch::Tensor tensor, torch::Tensor segment_ids, int64_t num_segments) {\n    auto shape = tensor.sizes().vec();\n    shape[0] = num_segments;\n    torch::Tensor segsum = torch::zeros(shape, tensor.options());\n    segsum.index_add_(0, segment_ids, tensor);\n    return segsum;\n}\n\ntorch::Tensor segmented_sum_with_offsets(torch::Tensor tensor, torch::Tensor segment_offsets) {\n    torch::Tensor segment_ids = segment_ids_from_offsets(segment_offsets, tensor.size(0));\n    return segmented_sum(tensor, segment_ids, segment_offsets.size(0));\n}\n\ntorch::Tensor segmented_max_with_offsets(torch::Tensor tensor, torch::Tensor segment_offsets) {\n    auto shape = tensor.sizes().vec();\n    shape[0] = segment_offsets.size(0);\n    torch::Tensor out = torch::zeros(shape, tensor.options());\n\n#ifdef MARIUS_CUDA\n    return std::get<0>(segment_max_csr(tensor, torch::cat({segment_offsets, torch::tensor({tensor.size(0)}, segment_offsets.options())}), out));\n#else\n    return torch::Tensor();\n#endif\n}\n\nstd::tuple<torch::Tensor, torch::Tensor> attention_softmax(torch::Tensor neighbor_attention, torch::Tensor self_attention, torch::Tensor segment_offsets,\n                                                           torch::Tensor segment_ids, torch::Tensor num_nbrs) {\n    torch::Tensor has_nbrs_mask = 
torch::not_equal(num_nbrs, 0);\n    has_nbrs_mask = has_nbrs_mask.reshape({-1, 1, 1});\n\n    torch::Tensor seg_max = segmented_max_with_offsets(neighbor_attention, segment_offsets);\n    torch::Tensor attention_max = torch::where(has_nbrs_mask, torch::maximum(seg_max, self_attention), self_attention);\n\n    self_attention = torch::exp(self_attention - attention_max);\n\n    attention_max = attention_max.index_select(0, segment_ids);\n    neighbor_attention = torch::exp(neighbor_attention - attention_max);\n\n    torch::Tensor seg_sum = segmented_sum(neighbor_attention, segment_ids, segment_offsets.size(0));\n    torch::Tensor attention_sum = seg_sum + self_attention;\n\n    self_attention = self_attention / attention_sum;\n\n    attention_sum = attention_sum.index_select(0, segment_ids);\n    neighbor_attention = neighbor_attention / attention_sum;\n\n    return std::forward_as_tuple(neighbor_attention, self_attention);\n}"
  },
  {
    "path": "src/cpp/src/nn/layers/gnn/rgcn_layer.cpp",
    "content": "//\n// Created by Jason Mohoney on 9/29/21.\n//\n\n#include \"nn/layers/gnn/rgcn_layer.h\"\n\n#include \"nn/layers/gnn/layer_helpers.h\"\n\nRGCNLayer::RGCNLayer(shared_ptr<LayerConfig> layer_config, int num_relations, torch::Device device) {\n    config_ = layer_config;\n    options_ = std::dynamic_pointer_cast<GNNLayerOptions>(config_->options);\n    num_relations_ = num_relations;\n    input_dim_ = layer_config->input_dim;\n    output_dim_ = layer_config->output_dim;\n    device_ = device;\n\n    reset();\n}\n\nvoid RGCNLayer::reset() {\n    auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32).device(device_);\n\n    torch::Tensor rel_mats = initialize_tensor(config_->init, {num_relations_, output_dim_, input_dim_}, tensor_options).set_requires_grad(true);\n    relation_matrices_ = register_parameter(\"relation_matrices\", rel_mats);\n\n    //    if (use_incoming_) {\n    //        torch::Tensor inverse_rel_mats = initialize_tensor(config_->init,\n    //                                                          {num_relations_, output_dim_, input_dim_},\n    //                                                           tensor_options).set_requires_grad(true);\n    //        inverse_relation_matrices_ = register_parameter(\"inverse_relation_matrices_\", inverse_rel_mats);\n    //    }\n\n    torch::Tensor self_mat = initialize_tensor(config_->init, {output_dim_, input_dim_}, tensor_options).set_requires_grad(true);\n    self_matrix_ = register_parameter(\"self_matrix\", self_mat);\n\n    if (config_->bias) {\n        init_bias();\n    }\n}\n\ntorch::Tensor RGCNLayer::forward(torch::Tensor inputs, DENSEGraph dense_graph, bool train) {\n    Indices outgoing_neighbors = dense_graph.getNeighborIDs(false, false);\n    Indices outgoing_relations = dense_graph.getRelationIDs(false);\n    Indices outgoing_neighbor_offsets = dense_graph.getNeighborOffsets(false);\n    torch::Tensor outgoing_num = dense_graph.getNumNeighbors(false);\n    
torch::Tensor total_num_neighbors = outgoing_num;\n\n    if (!outgoing_relations.defined()) {\n        outgoing_relations = torch::zeros_like(outgoing_neighbors);\n    }\n\n    torch::Tensor outgoing_embeddings = inputs.index_select(0, outgoing_neighbors);\n    torch::Tensor outgoing_relation_matrices = relation_matrices_.index_select(0, outgoing_relations);\n    outgoing_embeddings = torch::bmm(outgoing_relation_matrices, outgoing_embeddings.unsqueeze(2)).flatten(1, 2);\n\n    torch::Tensor a_i = segmented_sum_with_offsets(outgoing_embeddings, outgoing_neighbor_offsets);\n\n    //    if (dense_graph.in_neighbors_mapping_.defined()) {\n    //        Indices incoming_neighbors = dense_graph.getNeighborIDs(true, false);\n    //        Indices incoming_relations = dense_graph.getRelationIDs(true);\n    //        Indices incoming_neighbor_offsets = dense_graph.getNeighborOffsets(true);\n    //        torch::Tensor incoming_num = dense_graph.getNumNeighbors(true);\n    //        total_num_neighbors = total_num_neighbors + incoming_num;\n    //\n    //        if (!incoming_relations.defined()) {\n    //            incoming_relations = torch::zeros_like(incoming_neighbors);\n    //        }\n    //\n    //        torch::Tensor incoming_embeddings = inputs.index_select(0, incoming_neighbors);\n    //        torch::Tensor incoming_relation_matrices = inverse_relation_matrices_.index_select(0, incoming_relations);\n    //        incoming_embeddings = torch::bmm(incoming_relation_matrices, incoming_embeddings.unsqueeze(2)).flatten(1, 2);\n    //\n    //        a_i = a_i + segmented_sum_with_offsets(incoming_embeddings, incoming_neighbor_offsets);\n    //    }\n    torch::Tensor denominator = torch::where(torch::not_equal(total_num_neighbors, 0), total_num_neighbors, 1).to(a_i.dtype()).unsqueeze(-1);\n    a_i = a_i / denominator;\n\n    int64_t layer_offset = dense_graph.getLayerOffset();\n    torch::Tensor self_embs = inputs.narrow(0, layer_offset, inputs.size(0) - 
layer_offset);\n\n    // clone might be needed for async parameter updates\n    self_embs = torch::matmul(self_matrix_, self_embs.transpose(0, -1)).transpose(0, -1);\n\n    // clone might be needed for async parameter updates\n    torch::Tensor outputs = a_i + self_embs + bias_;\n\n    return outputs;\n}"
  },
  {
    "path": "src/cpp/src/nn/layers/layer.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/1/22.\n//\n\n#include \"nn/layers/layer.h\"\n\nLayer::Layer() : device_(torch::kCPU) {}\n\ntorch::Tensor Layer::post_hook(torch::Tensor input) {\n    if (config_->bias) {\n        input = input + bias_;\n    }\n    input = apply_activation(config_->activation, input);\n\n    return input;\n}\n\nvoid Layer::init_bias() {\n    auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32).device(device_);\n    torch::Tensor bias = initialize_tensor(config_->bias_init, {config_->output_dim}, tensor_options).set_requires_grad(true);\n    bias_ = register_parameter(\"bias\", bias);\n}"
  },
  {
    "path": "src/cpp/src/nn/layers/reduction/concat.cpp",
    "content": "//\n// Created by Jason Mohoney on 12/10/21.\n//\n\n#include \"nn/layers/reduction/concat.h\"\n\nConcatReduction::ConcatReduction(shared_ptr<LayerConfig> layer_config, torch::Device device) {\n    config_ = layer_config;\n    device_ = device;\n}\n\ntorch::Tensor ConcatReduction::forward(std::vector<torch::Tensor> inputs) { return torch::cat(inputs, 1); }\n\nvoid ConcatReduction::reset() {\n    if (config_->bias) {\n        init_bias();\n    }\n}"
  },
  {
    "path": "src/cpp/src/nn/layers/reduction/linear.cpp",
    "content": "//\n// Created by Jason Mohoney on 12/10/21.\n//\n\n#include \"nn/layers/reduction/linear.h\"\n\n#include \"nn/initialization.h\"\n\nLinearReduction::LinearReduction(shared_ptr<LayerConfig> layer_config, torch::Device device) {\n    config_ = layer_config;\n    device_ = device;\n    reset();\n}\n\ntorch::Tensor LinearReduction::forward(std::vector<torch::Tensor> inputs) {\n    torch::Tensor tmp = torch::cat(inputs, 1).transpose(0, -1);\n    torch::Tensor outputs = torch::matmul(weight_matrix_, tmp);\n    return outputs.transpose(0, -1);\n}\n\nvoid LinearReduction::reset() {\n    auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32).device(device_);\n\n    torch::Tensor weight_mat = initialize_tensor(config_->init, {config_->output_dim, config_->input_dim}, tensor_options).set_requires_grad(true);\n\n    weight_matrix_ = register_parameter(\"weight_matrix\", weight_mat);\n\n    if (config_->bias) {\n        init_bias();\n    }\n}"
  },
  {
    "path": "src/cpp/src/nn/loss.cpp",
    "content": "//\n// Created by Jason Mohoney on 8/25/21.\n//\n\n#include \"nn/loss.h\"\n\nvoid check_score_shapes(torch::Tensor pos_scores, torch::Tensor neg_scores) {\n    if (!pos_scores.defined()) {\n        throw UndefinedTensorException();\n    }\n\n    if (!neg_scores.defined()) {\n        throw UndefinedTensorException();\n    }\n\n    if (pos_scores.sizes().size() != 1) {\n        throw TensorSizeMismatchException(pos_scores, \"Positive scores should be 1-dimensional\");\n    }\n\n    if (neg_scores.sizes().size() != 2) {\n        throw TensorSizeMismatchException(neg_scores, \"Negative scores should be 2-dimensional\");\n    }\n\n    if (pos_scores.size(0) != neg_scores.size(0)) {\n        //        throw TensorSizeMismatchException(pos_scores, (std::stringstream(\"Size: \") << neg_scores.size(1) << \" First dimension of pos_scores and\n        //        neg_scores should match.\").str());\n        throw TensorSizeMismatchException(pos_scores, \"First dimension of pos_scores and neg_scores should match.\");\n    }\n}\n\ntorch::Tensor to_one_hot(torch::Tensor labels, int num_classes) {\n    torch::Tensor one_hot_encodings = torch::zeros({labels.size(0), num_classes}, torch::kInt64);\n    one_hot_encodings.index_fill_(1, labels.to(torch::kInt64), 1);\n    return one_hot_encodings.to(torch::kFloat32);\n}\n\nstd::tuple<torch::Tensor, torch::Tensor> scores_to_labels(torch::Tensor pos_scores, torch::Tensor neg_scores, bool one_hot) {\n    torch::Tensor y_pred = torch::cat({pos_scores, neg_scores}, -1);\n    torch::Tensor labels;\n    if (one_hot) {\n        labels = torch::cat({torch::ones_like(pos_scores), torch::zeros_like(neg_scores)}, -1);\n    } else {\n        auto options = torch::TensorOptions().dtype(torch::kInt64).device(pos_scores.device());\n        labels = torch::zeros({pos_scores.size(0)}, options);\n    }\n\n    return std::forward_as_tuple(y_pred, labels);\n}\n\ntorch::Tensor SoftmaxCrossEntropy::operator()(torch::Tensor y_pred, torch::Tensor 
labels, bool scores) {\n    if (!scores) {\n        throw MariusRuntimeException(\n            \"Input to SoftmaxCrossEntropy loss function must be scores. SoftmaxCrossEntropy is currently unsupported for classification.\");\n    }\n\n    check_score_shapes(y_pred, labels);\n    std::tie(y_pred, labels) = scores_to_labels(y_pred.unsqueeze(1), labels.logsumexp(1, true), false);\n\n    torch::nn::functional::CrossEntropyFuncOptions options;\n    if (reduction_type_ == LossReduction::MEAN) {\n        options.reduction(torch::kMean);\n    } else if (reduction_type_ == LossReduction::SUM) {\n        options.reduction(torch::kSum);\n    }\n\n    return torch::nn::functional::cross_entropy(y_pred, labels, options);\n}\n\ntorch::Tensor RankingLoss::operator()(torch::Tensor pos_scores, torch::Tensor neg_scores, bool scores) {\n    // does this loss make sense?\n\n    if (!scores) {\n        throw MariusRuntimeException(\"Input to ranking loss function must be scores. This loss function is unsupported for classification.\");\n    }\n\n    auto device_options = torch::TensorOptions().dtype(torch::kInt64).device(pos_scores.device());\n    torch::nn::functional::MarginRankingLossFuncOptions options;\n    if (reduction_type_ == LossReduction::MEAN) {\n        options.reduction(torch::kMean);\n    } else if (reduction_type_ == LossReduction::SUM) {\n        options.reduction(torch::kSum);\n    }\n    options.margin(margin_);\n\n    return torch::nn::functional::margin_ranking_loss(neg_scores, pos_scores.unsqueeze(1), pos_scores.new_full({1, 1}, -1, device_options), options);\n}\n\ntorch::Tensor CrossEntropyLoss::operator()(torch::Tensor y_pred, torch::Tensor labels, bool scores) {\n    if (scores) {\n        check_score_shapes(y_pred, labels);\n        std::tie(y_pred, labels) = scores_to_labels(y_pred.unsqueeze(1), labels, false);\n    }\n\n    torch::nn::functional::CrossEntropyFuncOptions options;\n    if (reduction_type_ == LossReduction::MEAN) {\n        
options.reduction(torch::kMean);\n    } else if (reduction_type_ == LossReduction::SUM) {\n        options.reduction(torch::kSum);\n    }\n\n    return torch::nn::functional::cross_entropy(y_pred, labels, options);\n}\n\ntorch::Tensor BCEAfterSigmoidLoss::operator()(torch::Tensor y_pred, torch::Tensor labels, bool scores) {\n    if (scores) {\n        check_score_shapes(y_pred, labels);\n        std::tie(y_pred, labels) = scores_to_labels(y_pred, labels.flatten(0, 1), true);\n    } else {\n        labels = to_one_hot(labels, y_pred.size(-1));\n    }\n\n    torch::nn::functional::BinaryCrossEntropyFuncOptions options;\n    if (reduction_type_ == LossReduction::MEAN) {\n        options.reduction(torch::kMean);\n    } else if (reduction_type_ == LossReduction::SUM) {\n        options.reduction(torch::kSum);\n    }\n\n    return torch::nn::functional::binary_cross_entropy(y_pred.sigmoid(), labels, options);\n}\n\ntorch::Tensor BCEWithLogitsLoss::operator()(torch::Tensor y_pred, torch::Tensor labels, bool scores) {\n    if (scores) {\n        check_score_shapes(y_pred, labels);\n        std::tie(y_pred, labels) = scores_to_labels(y_pred, labels.flatten(0, 1), true);\n    } else {\n        labels = to_one_hot(labels, y_pred.size(-1));\n    }\n\n    torch::nn::functional::BinaryCrossEntropyWithLogitsFuncOptions options;\n    if (reduction_type_ == LossReduction::MEAN) {\n        options.reduction(torch::kMean);\n    } else if (reduction_type_ == LossReduction::SUM) {\n        options.reduction(torch::kSum);\n    }\n\n    return torch::nn::functional::binary_cross_entropy_with_logits(y_pred, labels, options);\n}\n\ntorch::Tensor MSELoss::operator()(torch::Tensor y_pred, torch::Tensor labels, bool scores) {\n    if (scores) {\n        check_score_shapes(y_pred, labels);\n        std::tie(y_pred, labels) = scores_to_labels(y_pred, labels.flatten(0, 1), true);\n    } else {\n        labels = to_one_hot(labels, y_pred.size(-1));\n    }\n\n    
torch::nn::functional::MSELossFuncOptions options;\n    if (reduction_type_ == LossReduction::MEAN) {\n        options.reduction(torch::kMean);\n    } else if (reduction_type_ == LossReduction::SUM) {\n        options.reduction(torch::kSum);\n    }\n\n    return torch::nn::functional::mse_loss(y_pred, labels, options);\n}\n\ntorch::Tensor SoftPlusLoss::operator()(torch::Tensor y_pred, torch::Tensor labels, bool scores) {\n    if (scores) {\n        check_score_shapes(y_pred, labels);\n        std::tie(y_pred, labels) = scores_to_labels(y_pred, labels.flatten(0, 1), true);\n    } else {\n        labels = to_one_hot(labels, y_pred.size(-1));\n    }\n\n    labels = 2 * labels - 1;\n    auto loss = torch::nn::functional::softplus(((-1) * labels * y_pred));\n    if (reduction_type_ == LossReduction::MEAN) {\n        loss = loss.mean();\n    } else if (reduction_type_ == LossReduction::SUM) {\n        loss = loss.sum();\n    }\n\n    return loss;\n}\n\nstd::shared_ptr<LossFunction> getLossFunction(shared_ptr<LossConfig> config) {\n    if (config == nullptr) {\n        throw UnexpectedNullPtrException();\n    }\n\n    if (config->type == LossFunctionType::SOFTMAX_CE) {\n        return std::make_shared<SoftmaxCrossEntropy>(config->options);\n    } else if (config->type == LossFunctionType::RANKING) {\n        return std::make_shared<RankingLoss>(std::dynamic_pointer_cast<RankingLossOptions>(config->options));\n    } else if (config->type == LossFunctionType::CROSS_ENTROPY) {\n        return std::make_shared<CrossEntropyLoss>(config->options);\n    } else if (config->type == LossFunctionType::BCE_AFTER_SIGMOID) {\n        return std::make_shared<BCEAfterSigmoidLoss>(config->options);\n    } else if (config->type == LossFunctionType::BCE_WITH_LOGITS) {\n        return std::make_shared<BCEWithLogitsLoss>(config->options);\n    } else if (config->type == LossFunctionType::MSE) {\n        return std::make_shared<MSELoss>(config->options);\n    } else if (config->type == 
LossFunctionType::SOFTPLUS) {\n        return std::make_shared<SoftPlusLoss>(config->options);\n    } else {\n        throw std::runtime_error(\"Unsupported loss function type\");\n    }\n}"
  },
  {
    "path": "src/cpp/src/nn/model.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/12/21.\n//\n\n#include \"nn/model.h\"\n\n#ifdef MARIUS_CUDA\n    #include <torch/csrc/cuda/nccl.h>\n#endif\n\n#include \"configuration/constants.h\"\n#include \"data/samplers/negative.h\"\n#include \"nn/decoders/edge/decoder_methods.h\"\n#include \"nn/layers/embedding/embedding.h\"\n#include \"nn/model_helpers.h\"\n#include \"reporting/logger.h\"\n\nModel::Model(shared_ptr<GeneralEncoder> encoder, shared_ptr<Decoder> decoder, shared_ptr<LossFunction> loss, shared_ptr<Reporter> reporter,\n             std::vector<shared_ptr<Optimizer>> optimizers)\n    : device_(torch::Device(torch::kCPU)) {\n    encoder_ = encoder;\n    decoder_ = decoder;\n    loss_function_ = loss;\n    reporter_ = reporter;\n    optimizers_ = optimizers;\n    learning_task_ = decoder_->learning_task_;\n\n    if (reporter_ == nullptr) {\n        if (learning_task_ == LearningTask::LINK_PREDICTION) {\n            reporter_ = std::make_shared<LinkPredictionReporter>();\n            reporter_->addMetric(std::make_shared<MeanRankMetric>());\n            reporter_->addMetric(std::make_shared<MeanReciprocalRankMetric>());\n            reporter_->addMetric(std::make_shared<HitskMetric>(1));\n            reporter_->addMetric(std::make_shared<HitskMetric>(3));\n            reporter_->addMetric(std::make_shared<HitskMetric>(5));\n            reporter_->addMetric(std::make_shared<HitskMetric>(10));\n            reporter_->addMetric(std::make_shared<HitskMetric>(50));\n            reporter_->addMetric(std::make_shared<HitskMetric>(100));\n        } else if (learning_task_ == LearningTask::NODE_CLASSIFICATION) {\n            reporter_ = std::make_shared<NodeClassificationReporter>();\n            reporter_->addMetric(std::make_shared<CategoricalAccuracyMetric>());\n        } else {\n            throw MariusRuntimeException(\"Reporter must be specified for this learning task.\");\n        }\n    }\n\n    if (encoder_ != nullptr) {\n        
register_module(\"encoder\", std::dynamic_pointer_cast<torch::nn::Module>(encoder_));\n    }\n\n    if (decoder_ != nullptr) {\n        register_module(\"decoder\", std::dynamic_pointer_cast<torch::nn::Module>(decoder_));\n    }\n}\n\nvoid Model::clear_grad() {\n#pragma omp parallel for\n    for (int i = 0; i < optimizers_.size(); i++) {\n        optimizers_[i]->clear_grad();\n    }\n}\n\nvoid Model::clear_grad_all() {\n    for (int i = 0; i < device_models_.size(); i++) {\n        device_models_[i]->clear_grad();\n    }\n}\n\nvoid Model::step() {\n#pragma omp parallel for\n    for (int i = 0; i < optimizers_.size(); i++) {\n        optimizers_[i]->step();\n    }\n}\n\nvoid Model::step_all() {\n    for (int i = 0; i < device_models_.size(); i++) {\n        device_models_[i]->step();\n    }\n}\n\nvoid Model::save(std::string directory) {\n    string model_filename = directory + PathConstants::model_file;\n    string model_state_filename = directory + PathConstants::model_state_file;\n    string model_meta_filename = directory + PathConstants::model_config_file;\n\n    torch::serialize::OutputArchive model_archive;\n    torch::serialize::OutputArchive state_archive;\n\n    std::dynamic_pointer_cast<torch::nn::Module>(encoder_)->save(model_archive);\n\n    if (decoder_ != nullptr) {\n        std::dynamic_pointer_cast<torch::nn::Module>(decoder_)->save(model_archive);\n    }\n\n    // Outputs each optimizer as a <K, V> pair, where key is the loop counter and value\n    // is the optimizer itself. 
in Model::load, Optimizer::load is called on each key.\n    for (int i = 0; i < optimizers_.size(); i++) {\n        torch::serialize::OutputArchive optim_archive;\n        optimizers_[i]->save(optim_archive);\n        state_archive.write(std::to_string(i), optim_archive);\n    }\n\n    model_archive.save_to(model_filename);\n    state_archive.save_to(model_state_filename);\n}\n\nvoid Model::load(std::string directory, bool train) {\n    string model_filename = directory + PathConstants::model_file;\n    string model_state_filename = directory + PathConstants::model_state_file;\n\n    torch::serialize::InputArchive model_archive;\n    torch::serialize::InputArchive state_archive;\n\n    model_archive.load_from(model_filename);\n\n    if (train) {\n        state_archive.load_from(model_state_filename);\n    }\n\n    int optimizer_idx = 0;\n    for (auto key : state_archive.keys()) {\n        torch::serialize::InputArchive tmp_state_archive;\n        state_archive.read(key, tmp_state_archive);\n        // optimizers have already been created as part of initModelFromConfig\n        optimizers_[optimizer_idx++]->load(tmp_state_archive);\n    }\n\n    std::dynamic_pointer_cast<torch::nn::Module>(encoder_)->load(model_archive);\n\n    if (decoder_ != nullptr) {\n        std::dynamic_pointer_cast<torch::nn::Module>(decoder_)->load(model_archive);\n    }\n}\n\nvoid Model::all_reduce() {\n    torch::NoGradGuard no_grad;\n    int num_gpus = device_models_.size();\n\n    for (int i = 0; i < named_parameters().keys().size(); i++) {\n        string key = named_parameters().keys()[i];\n\n        std::vector<torch::Tensor> input_gradients(num_gpus);\n        for (int j = 0; j < num_gpus; j++) {\n            if (!device_models_[j]->named_parameters()[key].mutable_grad().defined()) {\n                device_models_[j]->named_parameters()[key].mutable_grad() = torch::zeros_like(device_models_[j]->named_parameters()[key]);\n            }\n\n            input_gradients[j] = 
(device_models_[j]->named_parameters()[key].mutable_grad());\n        }\n\n#ifdef MARIUS_CUDA\n        torch::cuda::nccl::all_reduce(input_gradients, input_gradients);\n#endif\n    }\n\n    step_all();\n    clear_grad_all();\n}\n\nvoid Model::setup_optimizers(shared_ptr<ModelConfig> model_config) {\n    if (model_config->dense_optimizer == nullptr) {\n        throw UnexpectedNullPtrException();\n    }\n\n    // need to assign named parameters to each optimizer\n    torch::OrderedDict<shared_ptr<OptimizerConfig>, torch::OrderedDict<std::string, torch::Tensor>> param_map;\n\n    {\n        torch::OrderedDict<std::string, torch::Tensor> empty_dict;\n        param_map.insert(model_config->dense_optimizer, empty_dict);\n    }\n\n    // get optimizers we need to keep track of for the encoder\n    for (auto module_name : encoder_->named_modules().keys()) {\n        if (module_name.empty()) {\n            continue;\n        }\n        auto layer = std::dynamic_pointer_cast<Layer>(encoder_->named_modules()[module_name]);\n        if (layer->config_->optimizer == nullptr) {\n            for (auto param_name : layer->named_parameters().keys()) {\n                param_map[model_config->dense_optimizer].insert(module_name + \"_\" + param_name, layer->named_parameters()[param_name]);\n            }\n        } else {\n            if (!param_map.contains(layer->config_->optimizer)) {\n                torch::OrderedDict<std::string, torch::Tensor> empty_dict;\n                param_map.insert(layer->config_->optimizer, empty_dict);\n            }\n\n            for (auto param_name : layer->named_parameters().keys()) {\n                param_map[layer->config_->optimizer].insert(module_name + \"_\" + param_name, layer->named_parameters()[param_name]);\n            }\n        }\n    }\n\n    for (auto key : std::dynamic_pointer_cast<torch::nn::Module>(decoder_)->named_parameters().keys()) {\n        param_map[model_config->dense_optimizer].insert(key, 
std::dynamic_pointer_cast<torch::nn::Module>(decoder_)->named_parameters()[key]);\n    }\n\n    for (auto key : param_map.keys()) {\n        switch (key->type) {\n            case OptimizerType::SGD: {\n                optimizers_.emplace_back(std::make_shared<SGDOptimizer>(param_map[key], key->options->learning_rate));\n                break;\n            }\n            case OptimizerType::ADAGRAD: {\n                optimizers_.emplace_back(std::make_shared<AdagradOptimizer>(param_map[key], std::dynamic_pointer_cast<AdagradOptions>(key->options)));\n                break;\n            }\n            case OptimizerType::ADAM: {\n                optimizers_.emplace_back(std::make_shared<AdamOptimizer>(param_map[key], std::dynamic_pointer_cast<AdamOptions>(key->options)));\n                break;\n            }\n            default:\n                throw std::invalid_argument(\"Unrecognized optimizer type\");\n        }\n    }\n}\n\nint64_t Model::get_base_embedding_dim() {\n    int max_offset = 0;\n    int size = 0;\n\n    for (auto stage : encoder_->layers_) {\n        for (auto layer : stage) {\n            if (layer->config_->type == LayerType::EMBEDDING) {\n                int offset = std::dynamic_pointer_cast<EmbeddingLayer>(layer)->offset_;\n\n                if (size == 0) {\n                    size = layer->config_->output_dim;\n                }\n\n                if (offset > max_offset) {\n                    max_offset = offset;\n                    size = layer->config_->output_dim;\n                }\n            }\n        }\n    }\n\n    return max_offset + size;\n}\n\nbool Model::has_embeddings() { return encoder_->has_embeddings_; }\n\ntorch::Tensor Model::forward_nc(at::optional<torch::Tensor> node_embeddings, at::optional<torch::Tensor> node_features, DENSEGraph dense_graph, bool train) {\n    torch::Tensor encoded_nodes = encoder_->forward(node_embeddings, node_features, dense_graph, train);\n    torch::Tensor y_pred = 
std::dynamic_pointer_cast<NodeDecoder>(decoder_)->forward(encoded_nodes);\n    return y_pred;\n}\n\nstd::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> Model::forward_lp(shared_ptr<Batch> batch, bool train) {\n    torch::Tensor encoded_nodes = encoder_->forward(batch->node_embeddings_, batch->node_features_, batch->dense_graph_, train);\n\n    // call proper decoder\n    torch::Tensor pos_scores;\n    torch::Tensor neg_scores;\n    torch::Tensor inv_pos_scores;\n    torch::Tensor inv_neg_scores;\n\n    auto edge_decoder = std::dynamic_pointer_cast<EdgeDecoder>(decoder_);\n\n    if (edge_decoder->decoder_method_ == EdgeDecoderMethod::ONLY_POS) {\n        std::tie(pos_scores, inv_pos_scores) = only_pos_forward(edge_decoder, batch->edges_, encoded_nodes);\n    } else if (edge_decoder->decoder_method_ == EdgeDecoderMethod::POS_AND_NEG) {\n        throw MariusRuntimeException(\"Decoder method currently unsupported.\");\n        std::tie(pos_scores, neg_scores, inv_pos_scores, inv_neg_scores) = neg_and_pos_forward(edge_decoder, batch->edges_, batch->neg_edges_, encoded_nodes);\n    } else if (edge_decoder->decoder_method_ == EdgeDecoderMethod::CORRUPT_NODE) {\n        std::tie(pos_scores, neg_scores, inv_pos_scores, inv_neg_scores) =\n            node_corrupt_forward(edge_decoder, batch->edges_, encoded_nodes, batch->dst_neg_indices_mapping_, batch->src_neg_indices_mapping_);\n    } else if (edge_decoder->decoder_method_ == EdgeDecoderMethod::CORRUPT_REL) {\n        throw MariusRuntimeException(\"Decoder method currently unsupported.\");\n        std::tie(pos_scores, neg_scores, inv_pos_scores, inv_neg_scores) =\n            rel_corrupt_forward(edge_decoder, batch->edges_, encoded_nodes, batch->rel_neg_indices_);\n    } else {\n        throw MariusRuntimeException(\"Unsupported encoder method\");\n    }\n\n    if (neg_scores.defined()) {\n        neg_scores = apply_score_filter(neg_scores, batch->dst_neg_filter_);\n    }\n\n    if 
(inv_neg_scores.defined()) {\n        inv_neg_scores = apply_score_filter(inv_neg_scores, batch->src_neg_filter_);\n    }\n\n    return std::forward_as_tuple(pos_scores, neg_scores, inv_pos_scores, inv_neg_scores);\n}\n\nvoid Model::train_batch(shared_ptr<Batch> batch, bool call_step) {\n    if (call_step) {\n        clear_grad();\n    }\n\n    if (batch->node_embeddings_.defined()) {\n        batch->node_embeddings_.requires_grad_();\n    }\n\n    torch::Tensor loss;\n\n    if (learning_task_ == LearningTask::LINK_PREDICTION) {\n        auto all_scores = forward_lp(batch, true);\n\n        torch::Tensor pos_scores = std::get<0>(all_scores);\n        torch::Tensor neg_scores = std::get<1>(all_scores);\n        torch::Tensor inv_pos_scores = std::get<2>(all_scores);\n        torch::Tensor inv_neg_scores = std::get<3>(all_scores);\n\n        if (inv_neg_scores.defined()) {\n            torch::Tensor rhs_loss = loss_function_->operator()(pos_scores, neg_scores, true);\n            torch::Tensor lhs_loss = loss_function_->operator()(inv_pos_scores, inv_neg_scores, true);\n            loss = lhs_loss + rhs_loss;\n        } else {\n            loss = (*loss_function_)(pos_scores, neg_scores, true);\n        }\n\n    } else if (learning_task_ == LearningTask::NODE_CLASSIFICATION) {\n        torch::Tensor y_pred = forward_nc(batch->node_embeddings_, batch->node_features_, batch->dense_graph_, true);\n        loss = (*loss_function_)(y_pred, batch->node_labels_.to(torch::kInt64), false);\n    } else {\n        throw MariusRuntimeException(\"Unsupported learning task for training\");\n    }\n\n    loss.backward();\n\n    if (call_step) {\n        step();\n    }\n\n    if (batch->node_embeddings_.defined()) {\n        batch->accumulateGradients(sparse_lr_);\n    }\n}\n\nvoid Model::evaluate_batch(shared_ptr<Batch> batch) {\n    if (learning_task_ == LearningTask::LINK_PREDICTION) {\n        auto all_scores = forward_lp(batch, true);\n        torch::Tensor pos_scores = 
std::get<0>(all_scores);\n        torch::Tensor neg_scores = std::get<1>(all_scores);\n        torch::Tensor inv_pos_scores = std::get<2>(all_scores);\n        torch::Tensor inv_neg_scores = std::get<3>(all_scores);\n\n        if (neg_scores.defined()) {\n            std::dynamic_pointer_cast<LinkPredictionReporter>(reporter_)->addResult(pos_scores, neg_scores);\n        }\n\n        if (inv_neg_scores.defined()) {\n            std::dynamic_pointer_cast<LinkPredictionReporter>(reporter_)->addResult(inv_pos_scores, inv_neg_scores);\n        }\n    } else if (learning_task_ == LearningTask::NODE_CLASSIFICATION) {\n        torch::Tensor y_pred = forward_nc(batch->node_embeddings_, batch->node_features_, batch->dense_graph_, true);\n        torch::Tensor labels = batch->node_labels_;\n\n        std::dynamic_pointer_cast<NodeClassificationReporter>(reporter_)->addResult(labels, y_pred);\n\n    } else {\n        throw MariusRuntimeException(\"Unsupported learning task for evaluation\");\n    }\n}\n\nvoid Model::broadcast(std::vector<torch::Device> devices) {\n    int i = 0;\n    for (auto device : devices) {\n        SPDLOG_INFO(\"Broadcast to GPU {}\", device.index());\n        if (device != device_) {\n            shared_ptr<GeneralEncoder> encoder = encoder_clone_helper(encoder_, device);\n            shared_ptr<Decoder> decoder = decoder_clone_helper(decoder_, device);\n            device_models_[i] = std::make_shared<Model>(encoder, decoder, loss_function_, reporter_);\n\n            for (auto optim : optimizers_) {\n                device_models_[i]->optimizers_.emplace_back(optim->clone());\n                device_models_[i]->sparse_lr_ = sparse_lr_;\n            }\n        } else {\n            device_models_[i] = std::dynamic_pointer_cast<Model>(shared_from_this());\n        }\n        i++;\n    }\n}\n\nshared_ptr<Model> initModelFromConfig(shared_ptr<ModelConfig> model_config, std::vector<torch::Device> devices, int num_relations, bool train) {\n    
shared_ptr<GeneralEncoder> encoder = nullptr;\n    shared_ptr<Decoder> decoder = nullptr;\n    shared_ptr<LossFunction> loss = nullptr;\n    shared_ptr<Model> model;\n\n    if (model_config->encoder == nullptr) {\n        throw UnexpectedNullPtrException(\"Encoder config undefined\");\n    }\n\n    if (model_config->decoder == nullptr) {\n        throw UnexpectedNullPtrException(\"Decoder config undefined\");\n    }\n\n    if (model_config->loss == nullptr) {\n        throw UnexpectedNullPtrException(\"Loss config undefined\");\n    }\n\n    auto tensor_options = torch::TensorOptions().device(devices[0]).dtype(torch::kFloat32);\n\n    encoder = std::make_shared<GeneralEncoder>(model_config->encoder, devices[0], num_relations);\n\n    if (model_config->learning_task == LearningTask::LINK_PREDICTION) {\n        shared_ptr<EdgeDecoderOptions> decoder_options = std::dynamic_pointer_cast<EdgeDecoderOptions>(model_config->decoder->options);\n\n        int last_stage = model_config->encoder->layers.size() - 1;\n        int last_layer = model_config->encoder->layers[last_stage].size() - 1;\n        int64_t dim = model_config->encoder->layers[last_stage][last_layer]->output_dim;\n\n        decoder = get_edge_decoder(model_config->decoder->type, decoder_options->edge_decoder_method, num_relations, dim, tensor_options,\n                                   decoder_options->inverse_edges);\n    } else {\n        decoder = get_node_decoder(model_config->decoder->type);\n    }\n\n    loss = getLossFunction(model_config->loss);\n\n    model = std::make_shared<Model>(encoder, decoder, loss);\n    model->device_ = devices[0];\n    model->device_models_ = std::vector<shared_ptr<Model>>(devices.size());\n\n    if (train) {\n        model->setup_optimizers(model_config);\n\n        if (model_config->sparse_optimizer != nullptr) {\n            model->sparse_lr_ = model_config->sparse_optimizer->options->learning_rate;\n        } else {\n            model->sparse_lr_ = 
model_config->dense_optimizer->options->learning_rate;\n        }\n    }\n\n    if (devices.size() > 1) {\n        SPDLOG_INFO(\"Broadcasting model to: {} GPUs\", devices.size());\n        model->broadcast(devices);\n    } else {\n        model->device_models_[0] = model;\n    }\n\n    return model;\n}"
  },
  {
    "path": "src/cpp/src/nn/optim.cpp",
    "content": "//\n// Created by Jason Mohoney on 12/9/21.\n//\n\n#include \"nn/optim.h\"\n\nvoid Optimizer::load(torch::serialize::InputArchive &input_archive) {\n    torch::IValue tmp;\n    input_archive.read(\"num_steps\", tmp);\n    num_steps_ = tmp.toInt();\n\n    for (auto itr = state_dict_.begin(); itr != state_dict_.end(); itr++) {\n        std::string key = itr->key();\n        torch::OrderedDict<std::string, torch::Tensor> param_state = torch::OrderedDict<std::string, torch::Tensor>();\n\n        torch::serialize::InputArchive tmp_archive;\n        input_archive.read(key, tmp_archive);\n\n        for (auto itr2 = itr->value().begin(); itr2 != itr->value().end(); itr2++) {\n            tmp_archive.read(itr2->key(), state_dict_[key][itr2->key()]);\n        }\n    }\n}\n\nvoid Optimizer::save(torch::serialize::OutputArchive &output_archive) {\n    output_archive.write(\"num_steps\", num_steps_);\n\n    for (auto itr = state_dict_.begin(); itr != state_dict_.end(); itr++) {\n        std::string key = itr->key();\n        torch::OrderedDict<std::string, torch::Tensor> param_state = torch::OrderedDict<std::string, torch::Tensor>();\n\n        torch::serialize::OutputArchive tmp_archive;\n\n        for (auto itr2 = itr->value().begin(); itr2 != itr->value().end(); itr2++) {\n            tmp_archive.write(itr2->key(), itr2->value());\n        }\n\n        output_archive.write(key, tmp_archive);\n    }\n}\n\nvoid Optimizer::clear_grad() {\n    auto param_items = param_dict_.items();\n#pragma omp parallel for\n    for (int i = 0; i < param_dict_.size(); i++) {\n        param_items[i].value().mutable_grad() = torch::Tensor();\n    }\n}\n\nSGDOptimizer::SGDOptimizer(torch::OrderedDict<std::string, torch::Tensor> param_dict, float learning_rate) {\n    param_dict_ = param_dict;\n    learning_rate_ = learning_rate;\n\n    reset_state();\n}\n\nvoid SGDOptimizer::reset_state() { num_steps_ = 0; }\n\nvoid SGDOptimizer::step() {\n    num_steps_++;\n\n    auto param_items 
= param_dict_.items();\n#pragma omp parallel for\n    for (int i = 0; i < param_dict_.size(); i++) {\n        torch::NoGradGuard no_grad;\n\n        std::string key = param_items[i].key();\n        torch::Tensor param = param_items[i].value();\n        torch::Tensor param_grad = param.grad();\n\n        if (!param_grad.defined()) {\n            continue;\n        }\n\n        double learning_rate = learning_rate_;\n\n        param.data().add_(-learning_rate * param_grad);\n    }\n}\n\nstd::shared_ptr<Optimizer> SGDOptimizer::clone() { return std::make_shared<SGDOptimizer>(*this); }\n\nAdagradOptimizer::AdagradOptimizer(torch::OrderedDict<std::string, torch::Tensor> param_dict, std::shared_ptr<AdagradOptions> options) {\n    param_dict_ = param_dict;\n\n    learning_rate_ = options->learning_rate;\n    eps_ = options->eps;\n    lr_decay_ = options->lr_decay;\n    weight_decay_ = options->weight_decay;\n    init_value_ = options->init_value;\n\n    reset_state();\n}\n\nvoid AdagradOptimizer::reset_state() {\n    num_steps_ = 0;\n    state_dict_ = torch::OrderedDict<std::string, torch::OrderedDict<std::string, torch::Tensor>>();\n\n    for (auto itr = param_dict_.begin(); itr != param_dict_.end(); itr++) {\n        std::string key = itr->key();\n\n        torch::OrderedDict<std::string, torch::Tensor> param_state = torch::OrderedDict<std::string, torch::Tensor>();\n\n        torch::Tensor sum_state = torch::zeros_like(itr->value());\n\n        if (init_value_ != 0) {\n            sum_state.fill_(init_value_);\n        }\n        param_state.insert(\"sum\", sum_state);\n        state_dict_.insert(key, param_state);\n    }\n}\n\nvoid AdagradOptimizer::step() {\n    auto param_items = param_dict_.items();\n#pragma omp parallel for\n    for (int i = 0; i < param_dict_.size(); i++) {\n        torch::NoGradGuard no_grad;\n\n        std::string key = param_items[i].key();\n        torch::Tensor param = param_items[i].value();\n        torch::Tensor param_grad = 
param.grad();\n\n        if (!param_grad.defined()) {\n            continue;\n        }\n\n        torch::Tensor sum_state = state_dict_[key][\"sum\"];\n\n        if (weight_decay_ != 0) {\n            param_grad = param_grad.add(param, weight_decay_);\n        }\n\n        double learning_rate = learning_rate_;\n        if (lr_decay_ != 0) {\n            learning_rate = learning_rate / (1 + num_steps_ * lr_decay_);\n        }\n\n        sum_state.addcmul_(param_grad, param_grad, 1.0);\n        const auto std = sum_state.sqrt().add_(eps_);\n        param.data().addcdiv_(param_grad, std, -learning_rate);\n    }\n\n    num_steps_++;\n}\n\nstd::shared_ptr<Optimizer> AdagradOptimizer::clone() { return std::make_shared<AdagradOptimizer>(*this); }\n\nAdamOptimizer::AdamOptimizer(torch::OrderedDict<std::string, torch::Tensor> param_dict, std::shared_ptr<AdamOptions> options) {\n    param_dict_ = param_dict;\n\n    learning_rate_ = options->learning_rate;\n    eps_ = options->eps;\n    beta_1_ = options->beta_1;\n    beta_2_ = options->beta_2;\n    weight_decay_ = options->weight_decay;\n    amsgrad_ = options->amsgrad;\n\n    reset_state();\n}\n\nvoid AdamOptimizer::reset_state() {\n    num_steps_ = 0;\n    state_dict_ = torch::OrderedDict<std::string, torch::OrderedDict<std::string, torch::Tensor>>();\n\n    for (auto itr = param_dict_.begin(); itr != param_dict_.end(); itr++) {\n        std::string key = itr->key();\n\n        torch::OrderedDict<std::string, torch::Tensor> param_state = torch::OrderedDict<std::string, torch::Tensor>();\n\n        torch::Tensor exp_avg_state = torch::zeros_like(itr->value());\n        torch::Tensor exp_avg_sq_state = torch::zeros_like(itr->value());\n\n        param_state.insert(\"exp_avg\", exp_avg_state);\n        param_state.insert(\"exp_avg_sq\", exp_avg_sq_state);\n\n        if (amsgrad_) {\n            torch::Tensor max_exp_avg_sq_state = torch::zeros_like(itr->value());\n            param_state.insert(\"max_exp_avg_sq\", 
max_exp_avg_sq_state);\n        }\n\n        state_dict_.insert(key, param_state);\n    }\n}\n\nvoid AdamOptimizer::step() {\n    auto param_items = param_dict_.items();\n#pragma omp parallel for\n    for (int i = 0; i < param_dict_.size(); i++) {\n        torch::NoGradGuard no_grad;\n\n        std::string key = param_items[i].key();\n        torch::Tensor param = param_items[i].value();\n        torch::Tensor param_grad = param.grad();\n\n        if (!param_grad.defined()) {\n            continue;\n        }\n\n        torch::Tensor exp_avg_state = state_dict_[key][\"exp_avg\"];\n        torch::Tensor exp_avg_sq_state = state_dict_[key][\"exp_avg_sq\"];\n\n        float bias_correction1 = 1 - std::pow(beta_1_, num_steps_ + 1);\n        float bias_correction2 = 1 - std::pow(beta_2_, num_steps_ + 1);\n\n        if (weight_decay_ != 0) {\n            param_grad = param_grad.add(param, weight_decay_);\n        }\n\n        // Decay the first and second moment running average coefficient\n        exp_avg_state.mul_(beta_1_).add_(param_grad, 1 - beta_1_);\n        exp_avg_sq_state.mul_(beta_2_).addcmul_(param_grad, param_grad, 1 - beta_2_);\n\n        torch::Tensor denom;\n        if (amsgrad_) {\n            torch::Tensor max_exp_avg_sq_state = state_dict_[key][\"max_exp_avg_sq\"];\n            // Maintains the maximum of all 2nd moment running avg. till now\n            torch::max_out(max_exp_avg_sq_state, exp_avg_sq_state, max_exp_avg_sq_state);\n\n            // Use the max. for normalizing running avg. 
of gradient\n            denom = (max_exp_avg_sq_state.sqrt() / sqrt(bias_correction2)).add_(eps_);\n        } else {\n            denom = (exp_avg_sq_state.sqrt() / sqrt(bias_correction2)).add_(eps_);\n        }\n\n        auto step_size = learning_rate_ / bias_correction1;\n\n        param.data().addcdiv_(exp_avg_state, denom, -step_size);\n    }\n\n    num_steps_++;\n}\n\nstd::shared_ptr<Optimizer> AdamOptimizer::clone() { return std::make_shared<AdamOptimizer>(*this); }\n"
  },
  {
    "path": "src/cpp/src/nn/regularizer.cpp",
    "content": "//\n// Created by Jason Mohoney on 8/25/21.\n//\n#include \"nn/regularizer.h\"\n\nNormRegularizer::NormRegularizer(int norm, float coefficient) {\n    norm_ = norm;\n    coefficient_ = coefficient;\n}\n\ntorch::Tensor NormRegularizer::operator()(torch::Tensor src_nodes_embs, torch::Tensor dst_node_embs) {\n    return coefficient_ / 2 * torch::sum((torch::norm(src_nodes_embs, norm_, 0) + torch::norm(dst_node_embs, norm_, 0)));\n}"
  },
  {
    "path": "src/cpp/src/pipeline/evaluator.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/28/20.\n//\n\n#include \"pipeline/evaluator.h\"\n\n#include \"configuration/constants.h\"\n#include \"reporting/logger.h\"\n\nPipelineEvaluator::PipelineEvaluator(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model, shared_ptr<PipelineConfig> pipeline_config) {\n    dataloader_ = dataloader;\n\n    if (model->device_.is_cuda()) {\n        pipeline_ = std::make_shared<PipelineGPU>(dataloader, model, false, nullptr, pipeline_config);\n    } else {\n        pipeline_ = std::make_shared<PipelineCPU>(dataloader, model, false, nullptr, pipeline_config);\n    }\n\n    pipeline_->initialize();\n}\n\nvoid PipelineEvaluator::evaluate(bool validation) {\n    if (!dataloader_->single_dataset_) {\n        if (validation) {\n            SPDLOG_INFO(\"Evaluating validation set\");\n            dataloader_->setValidationSet();\n        } else {\n            SPDLOG_INFO(\"Evaluating test set\");\n            dataloader_->setTestSet();\n        }\n    }\n\n    dataloader_->initializeBatches(false);\n\n    if (dataloader_->evaluation_negative_sampler_ != nullptr) {\n        if (dataloader_->evaluation_config_->negative_sampling->filtered) {\n            dataloader_->graph_storage_->sortAllEdges();\n        }\n    }\n\n    Timer timer = Timer(false);\n    timer.start();\n    pipeline_->start();\n    pipeline_->waitComplete();\n    pipeline_->pauseAndFlush();\n    pipeline_->model_->reporter_->report();\n    timer.stop();\n\n    int64_t epoch_time = timer.getDuration();\n    SPDLOG_INFO(\"Evaluation complete: {}ms\", epoch_time);\n}\n\nSynchronousEvaluator::SynchronousEvaluator(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model) {\n    dataloader_ = dataloader;\n    model_ = model;\n}\n\nvoid SynchronousEvaluator::evaluate(bool validation) {\n    if (!dataloader_->single_dataset_) {\n        if (validation) {\n            SPDLOG_INFO(\"Evaluating validation set\");\n            dataloader_->setValidationSet();\n        
} else {\n            SPDLOG_INFO(\"Evaluating test set\");\n            dataloader_->setTestSet();\n        }\n    }\n\n    dataloader_->initializeBatches(false);\n\n    if (dataloader_->evaluation_negative_sampler_ != nullptr) {\n        if (dataloader_->evaluation_config_->negative_sampling->filtered) {\n            dataloader_->graph_storage_->sortAllEdges();\n        }\n    }\n\n    Timer timer = Timer(false);\n    timer.start();\n    int num_batches = 0;\n\n    while (dataloader_->hasNextBatch()) {\n        shared_ptr<Batch> batch = dataloader_->getBatch();\n        if (dataloader_->graph_storage_->embeddingsOffDevice()) {\n            batch->to(model_->device_);\n        }\n        dataloader_->loadGPUParameters(batch);\n\n        model_->evaluate_batch(batch);\n\n        dataloader_->finishedBatch();\n        batch->clear();\n        num_batches++;\n    }\n    timer.stop();\n\n    model_->reporter_->report();\n}"
  },
  {
    "path": "src/cpp/src/pipeline/graph_encoder.cpp",
    "content": "//\n// Created by Jason Mohoney on 1/22/22.\n//\n\n#include \"pipeline/graph_encoder.h\"\n\n#include \"reporting/logger.h\"\n\nusing std::get;\nusing std::tie;\n\nPipelineGraphEncoder::PipelineGraphEncoder(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model, shared_ptr<PipelineConfig> pipeline_config,\n                                           int logs_per_epoch) {\n    dataloader_ = dataloader;\n\n    std::string item_name = \"Nodes\";\n    int64_t num_items = dataloader_->graph_storage_->getNumNodes();\n\n    progress_reporter_ = std::make_shared<ProgressReporter>(item_name, num_items, logs_per_epoch);\n\n    if (model->device_.is_cuda()) {\n        pipeline_ = std::make_shared<PipelineGPU>(dataloader, model, true, progress_reporter_, pipeline_config, true);\n    } else {\n        pipeline_ = std::make_shared<PipelineCPU>(dataloader, model, true, progress_reporter_, pipeline_config, true);\n    }\n}\n\nvoid PipelineGraphEncoder::encode(bool separate_layers) {\n    Timer timer = Timer(false);\n    timer.start();\n\n    pipeline_->start();\n    pipeline_->waitComplete();\n    pipeline_->pauseAndFlush();\n    progress_reporter_->clear();\n\n    timer.stop();\n\n    std::string item_name = \"Nodes\";\n    int64_t num_items = dataloader_->graph_storage_->getNumNodes();\n\n    int64_t epoch_time = timer.getDuration();\n    float items_per_second = (float)num_items / ((float)epoch_time / 1000);\n    SPDLOG_INFO(\"Encode took: {}ms\", epoch_time);\n    SPDLOG_INFO(\"{} per Second: {}\", item_name, items_per_second);\n}\n\nSynchronousGraphEncoder::SynchronousGraphEncoder(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model, int logs_per_epoch) {\n    dataloader_ = dataloader;\n    model_ = model;\n\n    std::string item_name = \"Nodes\";\n    int64_t num_items = dataloader_->graph_storage_->getNumNodes();\n\n    progress_reporter_ = std::make_shared<ProgressReporter>(item_name, num_items, logs_per_epoch);\n}\n\nvoid 
SynchronousGraphEncoder::encode(bool separate_layers) {\n    dataloader_->setEncode();\n    Timer timer = Timer(false);\n    timer.start();\n    SPDLOG_INFO(\"Start full graph encode\");\n\n    while (dataloader_->hasNextBatch()) {\n        shared_ptr<Batch> batch = dataloader_->getBatch();\n        batch->to(model_->device_);\n        dataloader_->loadGPUParameters(batch);\n\n        torch::Tensor encoded_nodes = model_->encoder_->forward(batch->node_embeddings_, batch->node_features_, batch->dense_graph_, false);\n        batch->clear();\n\n        encoded_nodes = encoded_nodes.contiguous().to(torch::kCPU);\n\n        if (model_->device_.is_cuda()) {\n            torch::cuda::synchronize();\n        }\n\n        dataloader_->graph_storage_->updatePutEncodedNodesRange(batch->start_idx_, batch->batch_size_, encoded_nodes);\n        dataloader_->finishedBatch();\n    }\n\n    timer.stop();\n    SPDLOG_INFO(\"Encode Complete: {}s\", (double)timer.getDuration() / 1000);\n}\n"
  },
  {
    "path": "src/cpp/src/pipeline/pipeline.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/29/20.\n//\n\n#include \"pipeline/pipeline.h\"\n\n#include \"pipeline/pipeline_cpu.h\"\n#include \"pipeline/pipeline_gpu.h\"\n#include \"reporting/logger.h\"\n\nWorker::Worker(Pipeline *pipeline) {\n    pipeline_ = pipeline;\n    sleep_time_.tv_sec = 0;\n    sleep_time_.tv_nsec = WAIT_TIME;\n    paused_ = true;\n    done_ = false;\n}\n\nvoid LoadBatchWorker::run() {\n    while (!done_) {\n        while (!paused_) {\n            // Check that 1) the total number of batches in the pipeline does not exceed the capacity\n            // And 2) that the epoch has a batch left to process\n            std::unique_lock lock(*pipeline_->max_batches_lock_);\n            if ((pipeline_->batches_in_flight_ < pipeline_->staleness_bound_) && pipeline_->dataloader_->hasNextBatch()) {\n                pipeline_->admitted_batches_++;\n                pipeline_->batches_in_flight_++;\n                lock.unlock();\n\n                shared_ptr<Batch> batch = pipeline_->dataloader_->getBatch(c10::nullopt, false, worker_id_);\n\n                if (batch == nullptr) {\n                    break;\n                }\n\n                if (pipeline_->model_->device_.is_cuda()) {\n                    ((PipelineGPU *)pipeline_)->loaded_batches_->blocking_push(batch);\n                } else {\n                    ((PipelineCPU *)pipeline_)->loaded_batches_->blocking_push(batch);\n                }\n            } else {\n                // wait until we can try to grab a batch again\n                pipeline_->max_batches_cv_->wait(lock);\n                lock.unlock();\n            }\n        }\n        nanosleep(&sleep_time_, NULL);  // wait until std::thread is not paused\n    }\n}\n\nvoid UpdateBatchWorker::run() {\n    while (!done_) {\n        while (!paused_) {\n            auto tup = ((PipelineGPU *)pipeline_)->update_batches_->blocking_pop();\n            bool popped = std::get<0>(tup);\n            shared_ptr<Batch> batch = 
std::get<1>(tup);\n\n            if (!popped) {\n                break;\n            }\n\n            // transfer gradients and update parameters\n            if (batch->node_embeddings_.defined()) {\n                pipeline_->dataloader_->updateEmbeddings(batch, false);\n            }\n\n            pipeline_->reporter_->addResult(batch->batch_size_);\n            pipeline_->batches_in_flight_--;\n            pipeline_->dataloader_->finishedBatch();\n            pipeline_->max_batches_cv_->notify_one();\n            pipeline_->edges_processed_ += batch->batch_size_;\n\n            SPDLOG_TRACE(\"Completed: {}\", batch->batch_id_);\n        }\n        nanosleep(&sleep_time_, NULL);\n    }\n}\n\nvoid WriteNodesWorker::run() {\n    while (!done_) {\n        while (!paused_) {\n            shared_ptr<Batch> batch;\n            bool popped = false;\n            if (pipeline_->model_->device_.is_cuda()) {\n                auto tup = ((PipelineGPU *)pipeline_)->update_batches_->blocking_pop();\n                popped = std::get<0>(tup);\n                batch = std::get<1>(tup);\n            } else {\n                auto tup = ((PipelineCPU *)pipeline_)->update_batches_->blocking_pop();\n                popped = std::get<0>(tup);\n                batch = std::get<1>(tup);\n            }\n\n            if (!popped) {\n                break;\n            }\n\n            pipeline_->dataloader_->graph_storage_->updatePutEncodedNodesRange(batch->start_idx_, batch->batch_size_, batch->encoded_uniques_);\n            pipeline_->reporter_->addResult(batch->batch_size_);\n            pipeline_->batches_in_flight_--;\n            pipeline_->dataloader_->finishedBatch();\n            pipeline_->max_batches_cv_->notify_one();\n            pipeline_->edges_processed_ += batch->batch_size_;\n\n            SPDLOG_TRACE(\"Completed: {}\", batch->batch_id_);\n        }\n        nanosleep(&sleep_time_, NULL);\n    }\n}\n\nPipeline::~Pipeline() {\n    delete max_batches_cv_;\n    delete 
max_batches_lock_;\n    delete pipeline_lock_;\n}\n\nshared_ptr<Worker> Pipeline::initWorkerOfType(int worker_type, int gpu_id, int worker_id) {\n    shared_ptr<Worker> worker;\n\n    if (worker_type == LOAD_BATCH_ID) {\n        worker = std::make_shared<LoadBatchWorker>(this, worker_id);\n    } else if (worker_type == H2D_TRANSFER_ID) {\n        worker = std::make_shared<BatchToDeviceWorker>(this);\n    } else if (worker_type == CPU_COMPUTE_ID) {\n        worker = std::make_shared<ComputeWorkerCPU>(this);\n    } else if (worker_type == GPU_COMPUTE_ID) {\n        worker = std::make_shared<ComputeWorkerGPU>(this, gpu_id);\n    } else if (worker_type == D2H_TRANSFER_ID) {\n        worker = std::make_shared<BatchToHostWorker>(this, gpu_id);\n    } else if (worker_type == UPDATE_BATCH_ID) {\n        worker = std::make_shared<UpdateBatchWorker>(this);\n    } else if (worker_type == CPU_ENCODE_ID) {\n        worker = std::make_shared<EncodeNodesWorkerCPU>(this);\n    } else if (worker_type == GPU_ENCODE_ID) {\n        worker = std::make_shared<EncodeNodesWorkerGPU>(this, gpu_id);\n    } else if (worker_type == NODE_WRITE_ID) {\n        worker = std::make_shared<WriteNodesWorker>(this);\n    }\n\n    worker->spawn();\n    return worker;\n}\n"
  },
  {
    "path": "src/cpp/src/pipeline/pipeline_cpu.cpp",
    "content": "//\n// Created by Jason Mohoney on 1/21/22.\n//\n\n#include \"pipeline/pipeline_cpu.h\"\n\n#include \"pipeline/queue.h\"\n#include \"reporting/logger.h\"\n\nvoid ComputeWorkerCPU::run() {\n    while (!done_) {\n        while (!paused_) {\n            shared_ptr<Queue<shared_ptr<Batch>>> pop_queue = ((PipelineCPU *)pipeline_)->loaded_batches_;\n            auto tup = pop_queue->blocking_pop();\n            bool popped = std::get<0>(tup);\n            shared_ptr<Batch> batch = std::get<1>(tup);\n            if (!popped) {\n                break;\n            }\n            if (pipeline_->isTrain()) {\n                pipeline_->model_->train_batch(batch);\n                batch->status_ = BatchStatus::ComputedGradients;\n                shared_ptr<Queue<shared_ptr<Batch>>> push_queue = ((PipelineCPU *)pipeline_)->update_batches_;\n\n                push_queue->blocking_push(batch);\n            } else {\n                pipeline_->model_->evaluate_batch(batch);\n                pipeline_->batches_in_flight_--;\n                pipeline_->dataloader_->finishedBatch();\n                pipeline_->max_batches_cv_->notify_one();\n                batch->clear();\n            }\n        }\n        nanosleep(&sleep_time_, NULL);\n    }\n}\n\nvoid EncodeNodesWorkerCPU::run() {\n    while (!done_) {\n        while (!paused_) {\n            shared_ptr<Queue<shared_ptr<Batch>>> pop_queue = ((PipelineCPU *)pipeline_)->loaded_batches_;\n            auto tup = pop_queue->blocking_pop();\n            bool popped = std::get<0>(tup);\n            shared_ptr<Batch> batch = std::get<1>(tup);\n            if (!popped) {\n                break;\n            }\n\n            torch::Tensor encoded = pipeline_->model_->encoder_->forward(batch->node_embeddings_, batch->node_features_, batch->dense_graph_, false);\n            batch->clear();\n            batch->encoded_uniques_ = encoded.contiguous();\n\n            shared_ptr<Queue<shared_ptr<Batch>>> push_queue = 
((PipelineCPU *)pipeline_)->update_batches_;\n            push_queue->blocking_push(batch);\n        }\n        nanosleep(&sleep_time_, NULL);\n    }\n}\n\nPipelineCPU::PipelineCPU(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model, bool train, shared_ptr<ProgressReporter> reporter,\n                         shared_ptr<PipelineConfig> pipeline_config, bool encode_only) {\n    dataloader_ = dataloader;\n    model_ = model;\n    reporter_ = reporter;\n    train_ = train;\n    edges_processed_ = 0;\n    pipeline_options_ = pipeline_config;\n    assign_id_ = 0;\n    encode_only_ = encode_only;\n\n    if (train_) {\n        loaded_batches_ = std::make_shared<Queue<shared_ptr<Batch>>>(pipeline_options_->batch_host_queue_size);\n        update_batches_ = std::make_shared<Queue<shared_ptr<Batch>>>(pipeline_options_->gradients_host_queue_size);\n    } else {\n        loaded_batches_ = std::make_shared<Queue<shared_ptr<Batch>>>(pipeline_options_->batch_host_queue_size);\n    }\n\n    staleness_bound_ = pipeline_options_->staleness_bound;\n    pipeline_lock_ = new std::mutex();\n    max_batches_lock_ = new std::mutex();\n    max_batches_cv_ = new std::condition_variable();\n    batches_in_flight_ = 0;\n    admitted_batches_ = 0;\n    curr_pos_ = 0;\n\n    PipelineCPU::initialize();\n}\n\nPipelineCPU::~PipelineCPU() {\n    for (int i = 0; i < CPU_NUM_WORKER_TYPES; i++) {\n        for (int j = 0; j < pool_[i].size(); j++) {\n            pool_[i][j]->stop();\n        }\n    }\n\n    pool_->clear();\n\n    if (train_) {\n        loaded_batches_ = nullptr;\n        update_batches_ = nullptr;\n    } else {\n        loaded_batches_ = nullptr;\n    }\n}\n\nbool Pipeline::isDone() { return (batches_in_flight_ <= 0) && dataloader_->epochComplete(); }\n\nbool Pipeline::isTrain() { return train_; }\n\nbool Pipeline::has_embeddings() { return model_->has_embeddings(); }\n\nvoid Pipeline::waitComplete() {\n    timespec sleep_time{};\n    sleep_time.tv_sec = 0;\n    
sleep_time.tv_nsec = MILLISECOND;  // check every 1 millisecond\n    while (!isDone()) {\n        nanosleep(&sleep_time, NULL);\n    }\n}\n\nvoid PipelineCPU::addWorkersToPool(int pool_id, int worker_type, int num_workers, int gpu_id) {\n    for (int i = 0; i < num_workers; i++) {\n        pool_[pool_id].emplace_back(initWorkerOfType(worker_type, gpu_id, i));\n    }\n}\n\nvoid PipelineCPU::initialize() {\n    if (encode_only_) {\n        addWorkersToPool(0, LOAD_BATCH_ID, pipeline_options_->batch_loader_threads);\n        addWorkersToPool(1, CPU_ENCODE_ID, pipeline_options_->compute_threads);\n        addWorkersToPool(2, NODE_WRITE_ID, pipeline_options_->gradient_update_threads);\n    } else {\n        if (train_) {\n            addWorkersToPool(0, LOAD_BATCH_ID, pipeline_options_->batch_loader_threads);\n            addWorkersToPool(1, CPU_COMPUTE_ID, pipeline_options_->compute_threads);\n            addWorkersToPool(2, UPDATE_BATCH_ID, pipeline_options_->gradient_update_threads);\n        } else {\n            addWorkersToPool(0, LOAD_BATCH_ID, pipeline_options_->batch_loader_threads);\n            addWorkersToPool(1, CPU_COMPUTE_ID, pipeline_options_->compute_threads);\n        }\n    }\n}\n\nvoid PipelineCPU::start() {\n    batches_in_flight_ = 0;\n    admitted_batches_ = 0;\n    assign_id_ = 0;\n    setQueueExpectingData(true);\n\n    for (int i = 0; i < CPU_NUM_WORKER_TYPES; i++) {\n        for (int j = 0; j < pool_[i].size(); j++) {\n            pool_[i][j]->start();\n        }\n    }\n}\n\nvoid PipelineCPU::pauseAndFlush() {\n    waitComplete();\n    setQueueExpectingData(false);\n\n    for (int i = 0; i < CPU_NUM_WORKER_TYPES; i++) {\n        for (int j = 0; j < pool_[i].size(); j++) {\n            pool_[i][j]->pause();\n        }\n    }\n\n    max_batches_cv_->notify_all();\n    SPDLOG_INFO(\"Pipeline flush complete\");\n    edges_processed_ = 0;\n}\n\nvoid PipelineCPU::flushQueues() {\n    if (train_) {\n        loaded_batches_->flush();\n        
update_batches_->flush();\n    } else {\n        loaded_batches_->flush();\n    }\n}\n\nvoid PipelineCPU::setQueueExpectingData(bool expecting_data) {\n    if (train_) {\n        loaded_batches_->expecting_data_ = expecting_data;\n        loaded_batches_->cv_->notify_all();\n        update_batches_->expecting_data_ = expecting_data;\n        update_batches_->cv_->notify_all();\n    } else {\n        loaded_batches_->expecting_data_ = expecting_data;\n        loaded_batches_->cv_->notify_all();\n    }\n}"
  },
  {
    "path": "src/cpp/src/pipeline/pipeline_gpu.cpp",
    "content": "//\n// Created by Jason Mohoney on 1/21/22.\n//\n\n#include \"pipeline/pipeline_gpu.h\"\n\n#include \"pipeline/queue.h\"\n#include \"reporting/logger.h\"\n\nvoid BatchToDeviceWorker::run() {\n    unsigned int rand_seed = rand();\n\n    int assign_id = 0;\n\n    while (!done_) {\n        while (!paused_) {\n            auto tup = ((PipelineGPU *)pipeline_)->loaded_batches_->blocking_pop();\n            bool popped = std::get<0>(tup);\n            shared_ptr<Batch> batch = std::get<1>(tup);\n            if (!popped) {\n                break;\n            }\n            int queue_choice = pipeline_->assign_id_++ % ((PipelineGPU *)pipeline_)->device_loaded_batches_.size();\n\n            batch->to(pipeline_->model_->device_models_[queue_choice]->device_, pipeline_->dataloader_->compute_stream_);\n\n            ((PipelineGPU *)pipeline_)->device_loaded_batches_[queue_choice]->blocking_push(batch);\n        }\n        nanosleep(&sleep_time_, NULL);\n    }\n}\n\nvoid ComputeWorkerGPU::run() {\n    CudaStream compute_stream = getStreamFromPool(true, 0);\n    if (pipeline_->dataloader_->learning_task_ == LearningTask::NODE_CLASSIFICATION) {\n        pipeline_->dataloader_->compute_stream_ = &compute_stream;\n    }\n    // TODO: streams for LP need a bit more work\n\n    while (!done_) {\n        while (!paused_) {\n            auto tup = ((PipelineGPU *)pipeline_)->device_loaded_batches_[gpu_id_]->blocking_pop();\n            bool popped = std::get<0>(tup);\n            shared_ptr<Batch> batch = std::get<1>(tup);\n            if (!popped) {\n                break;\n            }\n\n            pipeline_->dataloader_->loadGPUParameters(batch);\n\n            if (pipeline_->isTrain()) {\n                bool will_sync = false;\n                if (pipeline_->model_->device_models_.size() > 1) {\n                    ((PipelineGPU *)pipeline_)->gpu_sync_lock_->lock();\n                    ((PipelineGPU *)pipeline_)->batches_since_last_sync_++;\n\n                
    if (((PipelineGPU *)pipeline_)->batches_since_last_sync_ == ((PipelineGPU *)pipeline_)->gpu_sync_interval_) {\n                        will_sync = true;\n                    }\n\n                    // only release the lock if we don't need to synchronize the GPUs\n                    if (!will_sync) {\n                        ((PipelineGPU *)pipeline_)->gpu_sync_lock_->unlock();\n                    }\n                }\n\n                if (pipeline_->dataloader_->compute_stream_ != nullptr) {\n                    CudaStreamGuard stream_guard(compute_stream);\n                    pipeline_->model_->device_models_[gpu_id_].get()->train_batch(batch, ((PipelineGPU *)pipeline_)->pipeline_options_->gpu_model_average);\n                } else {\n                    pipeline_->model_->device_models_[gpu_id_].get()->train_batch(batch, ((PipelineGPU *)pipeline_)->pipeline_options_->gpu_model_average);\n                }\n\n                if (will_sync) {\n                    // we already have the lock acquired, it is safe to sync?\n                    pipeline_->model_->all_reduce();\n\n                    ((PipelineGPU *)pipeline_)->batches_since_last_sync_ = 0;\n                    ((PipelineGPU *)pipeline_)->gpu_sync_lock_->unlock();\n                }\n\n                if (!pipeline_->has_embeddings()) {\n                    batch->clear();\n                    pipeline_->reporter_->addResult(batch->batch_size_);\n                    pipeline_->batches_in_flight_--;\n                    pipeline_->dataloader_->finishedBatch();\n                    pipeline_->max_batches_cv_->notify_one();\n                    pipeline_->edges_processed_ += batch->batch_size_;\n                } else {\n                    pipeline_->dataloader_->updateEmbeddings(batch, true);\n                    ((PipelineGPU *)pipeline_)->device_update_batches_[gpu_id_]->blocking_push(batch);\n                }\n            } else {\n                
pipeline_->model_->device_models_[gpu_id_]->evaluate_batch(batch);\n\n                pipeline_->batches_in_flight_--;\n                pipeline_->max_batches_cv_->notify_one();\n                pipeline_->dataloader_->finishedBatch();\n                batch->clear();\n            }\n        }\n        nanosleep(&sleep_time_, NULL);\n    }\n}\n\nvoid EncodeNodesWorkerGPU::run() {\n    while (!done_) {\n        while (!paused_) {\n            auto tup = ((PipelineGPU *)pipeline_)->device_loaded_batches_[gpu_id_]->blocking_pop();\n            bool popped = std::get<0>(tup);\n            shared_ptr<Batch> batch = std::get<1>(tup);\n            if (!popped) {\n                break;\n            }\n\n            pipeline_->dataloader_->loadGPUParameters(batch);\n\n            torch::Tensor encoded =\n                pipeline_->model_->device_models_[gpu_id_].get()->encoder_->forward(batch->node_embeddings_, batch->node_features_, batch->dense_graph_, false);\n            batch->clear();\n            batch->encoded_uniques_ = encoded.contiguous();\n\n            ((PipelineGPU *)pipeline_)->device_update_batches_[gpu_id_]->blocking_push(batch);\n        }\n        nanosleep(&sleep_time_, NULL);\n    }\n}\n\nvoid BatchToHostWorker::run() {\n    while (!done_) {\n        while (!paused_) {\n            auto tup = ((PipelineGPU *)pipeline_)->device_update_batches_[gpu_id_]->blocking_pop();\n            bool popped = std::get<0>(tup);\n            shared_ptr<Batch> batch = std::get<1>(tup);\n            if (!popped) {\n                break;\n            }\n\n            batch->embeddingsToHost();\n\n            ((PipelineGPU *)pipeline_)->update_batches_->blocking_push(batch);\n        }\n        nanosleep(&sleep_time_, NULL);\n    }\n}\n\nPipelineGPU::PipelineGPU(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model, bool train, shared_ptr<ProgressReporter> reporter,\n                         shared_ptr<PipelineConfig> pipeline_config, bool encode_only) {\n    
dataloader_ = dataloader;\n    model_ = model;\n    reporter_ = reporter;\n    train_ = train;\n    edges_processed_ = 0;\n    pipeline_options_ = pipeline_config;\n    gpu_sync_lock_ = new std::mutex();\n    batches_since_last_sync_ = 0;\n    gpu_sync_interval_ = pipeline_options_->gpu_sync_interval;\n    assign_id_ = 0;\n    encode_only_ = encode_only;\n\n    if (train_) {\n        loaded_batches_ = std::make_shared<Queue<shared_ptr<Batch>>>(pipeline_options_->batch_host_queue_size);\n        for (int i = 0; i < model_->device_models_.size(); i++) {\n            device_loaded_batches_.emplace_back(std::make_shared<Queue<shared_ptr<Batch>>>(pipeline_options_->batch_device_queue_size));\n            if (model_->has_embeddings()) {\n                device_update_batches_.emplace_back(std::make_shared<Queue<shared_ptr<Batch>>>(pipeline_options_->gradients_device_queue_size));\n            }\n        }\n        if (model_->has_embeddings()) {\n            update_batches_ = std::make_shared<Queue<shared_ptr<Batch>>>(pipeline_options_->gradients_host_queue_size);\n        }\n    } else {\n        loaded_batches_ = std::make_shared<Queue<shared_ptr<Batch>>>(pipeline_options_->batch_host_queue_size);\n        device_loaded_batches_.emplace_back(std::make_shared<Queue<shared_ptr<Batch>>>(pipeline_options_->batch_device_queue_size));\n    }\n\n    pipeline_lock_ = new std::mutex();\n    max_batches_lock_ = new std::mutex();\n    max_batches_cv_ = new std::condition_variable();\n\n    staleness_bound_ = pipeline_options_->staleness_bound;\n    batches_in_flight_ = 0;\n    admitted_batches_ = 0;\n    curr_pos_ = 0;\n\n    PipelineGPU::initialize();\n}\n\nPipelineGPU::~PipelineGPU() {\n    for (int i = 0; i < GPU_NUM_WORKER_TYPES; i++) {\n        for (int j = 0; j < pool_[i].size(); j++) {\n            pool_[i][j]->stop();\n        }\n    }\n\n    pool_->clear();\n\n    delete gpu_sync_lock_;\n\n    loaded_batches_ = nullptr;\n    device_loaded_batches_ = {};\n\n    if 
(train_) {\n        if (model_->has_embeddings()) {\n            device_update_batches_ = {};\n        }\n\n        if (model_->has_embeddings()) {\n            update_batches_ = nullptr;\n        }\n    }\n}\n\nvoid PipelineGPU::addWorkersToPool(int pool_id, int worker_type, int num_workers, int num_gpus) {\n    for (int i = 0; i < num_workers; i++) {\n        for (int j = 0; j < num_gpus; j++) {\n            pool_[pool_id].emplace_back(initWorkerOfType(worker_type, j, i));\n        }\n    }\n}\n\nvoid PipelineGPU::initialize() {\n    if (encode_only_) {\n        addWorkersToPool(0, LOAD_BATCH_ID, pipeline_options_->batch_loader_threads);\n        addWorkersToPool(1, H2D_TRANSFER_ID, pipeline_options_->batch_transfer_threads);\n        addWorkersToPool(2, GPU_ENCODE_ID, 1, model_->device_models_.size());  // Only one std::thread manages GPU\n        if (model_->has_embeddings()) {\n            addWorkersToPool(3, D2H_TRANSFER_ID, pipeline_options_->gradient_transfer_threads, model_->device_models_.size());\n            addWorkersToPool(4, NODE_WRITE_ID, pipeline_options_->gradient_update_threads);\n        }\n    } else {\n        if (train_) {\n            addWorkersToPool(0, LOAD_BATCH_ID, pipeline_options_->batch_loader_threads);\n            addWorkersToPool(1, H2D_TRANSFER_ID, pipeline_options_->batch_transfer_threads);\n            addWorkersToPool(2, GPU_COMPUTE_ID, 1, model_->device_models_.size());  // Only one std::thread manages GPU\n            if (model_->has_embeddings()) {\n                addWorkersToPool(3, D2H_TRANSFER_ID, pipeline_options_->gradient_transfer_threads, model_->device_models_.size());\n                addWorkersToPool(4, UPDATE_BATCH_ID, pipeline_options_->gradient_update_threads);\n            }\n        } else {\n            addWorkersToPool(0, LOAD_BATCH_ID, pipeline_options_->batch_loader_threads);\n            addWorkersToPool(1, H2D_TRANSFER_ID, pipeline_options_->batch_transfer_threads);\n            addWorkersToPool(2, 
GPU_COMPUTE_ID, 1, model_->device_models_.size());\n        }\n    }\n}\n\nvoid PipelineGPU::start() {\n    batches_in_flight_ = 0;\n    admitted_batches_ = 0;\n    assign_id_ = 0;\n    setQueueExpectingData(true);\n\n    for (int i = 0; i < GPU_NUM_WORKER_TYPES; i++) {\n        for (int j = 0; j < pool_[i].size(); j++) {\n            pool_[i][j]->start();\n        }\n    }\n}\n\nvoid PipelineGPU::pauseAndFlush() {\n    waitComplete();\n    setQueueExpectingData(false);\n\n    for (int i = 0; i < GPU_NUM_WORKER_TYPES; i++) {\n        for (int j = 0; j < pool_[i].size(); j++) {\n            pool_[i][j]->pause();\n        }\n    }\n    max_batches_cv_->notify_all();\n\n    SPDLOG_INFO(\"Pipeline flush complete\");\n    edges_processed_ = 0;\n}\n\nvoid PipelineGPU::flushQueues() {\n    if (train_) {\n        loaded_batches_->flush();\n        for (auto d : device_loaded_batches_) {\n            d->flush();\n        }\n\n        if (model_->has_embeddings()) {\n            for (auto d : device_update_batches_) {\n                d->flush();\n            }\n        }\n\n        if (model_->has_embeddings()) {\n            update_batches_->flush();\n        }\n    } else {\n        loaded_batches_->flush();\n        for (auto d : device_loaded_batches_) {\n            d->flush();\n        }\n    }\n}\n\nvoid PipelineGPU::setQueueExpectingData(bool expecting_data) {\n    if (train_) {\n        loaded_batches_->expecting_data_ = expecting_data;\n        loaded_batches_->cv_->notify_all();\n        for (auto d : device_loaded_batches_) {\n            d->expecting_data_ = expecting_data;\n            d->cv_->notify_all();\n        }\n\n        if (model_->has_embeddings()) {\n            for (auto d : device_update_batches_) {\n                d->expecting_data_ = expecting_data;\n                d->cv_->notify_all();\n            }\n        }\n\n        if (model_->has_embeddings()) {\n            update_batches_->expecting_data_ = expecting_data;\n            
update_batches_->cv_->notify_all();\n        }\n    } else {\n        loaded_batches_->expecting_data_ = expecting_data;\n        loaded_batches_->cv_->notify_all();\n        for (auto d : device_loaded_batches_) {\n            d->expecting_data_ = expecting_data;\n            d->cv_->notify_all();\n        }\n    }\n}\n"
  },
  {
    "path": "src/cpp/src/pipeline/trainer.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/28/20.\n//\n\n#include \"pipeline/trainer.h\"\n\n#include <string>\n#include <tuple>\n\n#include \"reporting/logger.h\"\n\nusing std::get;\nusing std::tie;\n\nnamespace {\n\n// Returns the display name of the training items (Edges or Nodes) and the dimension-0 size of the matching training\n// storage. The name stays empty and the count zero for any other learning task.\nstd::tuple<std::string, int64_t> getTrainingItems(const shared_ptr<DataLoader> &dataloader, LearningTask learning_task) {\n    std::string item_name;\n    int64_t num_items = 0;\n    if (learning_task == LearningTask::LINK_PREDICTION) {\n        item_name = \"Edges\";\n        num_items = dataloader->graph_storage_->storage_ptrs_.train_edges->getDim0();\n    } else if (learning_task == LearningTask::NODE_CLASSIFICATION) {\n        item_name = \"Nodes\";\n        num_items = dataloader->graph_storage_->storage_ptrs_.train_nodes->getDim0();\n    }\n    return std::make_tuple(item_name, num_items);\n}\n\n// Logs how long an epoch took and the throughput derived from it; epoch_time is divided by 1000 to obtain seconds.\nvoid logEpochThroughput(const std::string &item_name, int64_t num_items, int64_t epoch_time) {\n    float items_per_second = (float)num_items / ((float)epoch_time / 1000);\n    SPDLOG_INFO(\"Epoch Runtime: {}ms\", epoch_time);\n    SPDLOG_INFO(\"{} per Second: {}\", item_name, items_per_second);\n}\n\n}  // namespace\n\n// Builds the progress reporter for the training set and the pipeline that will run the epochs.\nPipelineTrainer::PipelineTrainer(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model, shared_ptr<PipelineConfig> pipeline_config, int logs_per_epoch) {\n    dataloader_ = dataloader;\n    learning_task_ = dataloader_->learning_task_;\n\n    std::string item_name;\n    int64_t num_items = 0;\n    std::tie(item_name, num_items) = getTrainingItems(dataloader_, learning_task_);\n\n    progress_reporter_ = std::make_shared<ProgressReporter>(item_name, num_items, logs_per_epoch);\n\n    // models on a CUDA device use the GPU pipeline, all others the CPU pipeline\n    if (model->device_.is_cuda()) {\n        pipeline_ = std::make_shared<PipelineGPU>(dataloader, model, true, progress_reporter_, pipeline_config);\n    } else {\n        pipeline_ = std::make_shared<PipelineCPU>(dataloader, model, true, progress_reporter_, pipeline_config);\n    }\n}\n\n// Runs num_epochs training epochs through the pipeline, logging the runtime and throughput of each one.\nvoid PipelineTrainer::train(int num_epochs) {\n    if (!dataloader_->single_dataset_) {\n        dataloader_->setTrainSet();\n    }\n\n    dataloader_->initializeBatches(false);\n\n    Timer timer = Timer(false);\n    for (int epoch = 0; epoch < num_epochs; epoch++) {\n        timer.start();\n        SPDLOG_INFO(\"################ Starting training epoch {} ################\", dataloader_->getEpochsProcessed() + 1);\n        pipeline_->start();\n        pipeline_->waitComplete();\n        pipeline_->pauseAndFlush();\n        SPDLOG_INFO(\"################ Finished training epoch {} ################\", dataloader_->getEpochsProcessed() + 1);\n\n        // all_reduce() is only invoked when the model holds more than one device replica\n        if (pipeline_->model_->device_models_.size() > 1) {\n            pipeline_->model_->all_reduce();\n        }\n\n        dataloader_->nextEpoch();\n        progress_reporter_->clear();\n        timer.stop();\n\n        // the item count is queried again after nextEpoch(), exactly where the per-epoch summary is produced\n        std::string item_name;\n        int64_t num_items = 0;\n        std::tie(item_name, num_items) = getTrainingItems(dataloader_, learning_task_);\n        logEpochThroughput(item_name, num_items, timer.getDuration());\n    }\n}\n\n// Builds the progress reporter for the training set; batches are processed on the calling thread in train().\nSynchronousTrainer::SynchronousTrainer(shared_ptr<DataLoader> dataloader, shared_ptr<Model> model, int logs_per_epoch) {\n    dataloader_ = dataloader;\n    model_ = model;\n    learning_task_ = dataloader_->learning_task_;\n\n    std::string item_name;\n    int64_t num_items = 0;\n    std::tie(item_name, num_items) = getTrainingItems(dataloader_, learning_task_);\n\n    progress_reporter_ = std::make_shared<ProgressReporter>(item_name, num_items, logs_per_epoch);\n}\n\n// Runs num_epochs training epochs one batch at a time, logging the runtime and throughput of each one.\nvoid SynchronousTrainer::train(int num_epochs) {\n    if (!dataloader_->single_dataset_) {\n        dataloader_->setTrainSet();\n    }\n\n    dataloader_->initializeBatches(false);\n\n    Timer timer = Timer(false);\n\n    for (int epoch = 0; epoch < num_epochs; epoch++) {\n        timer.start();\n        SPDLOG_INFO(\"################ Starting training epoch {} ################\", dataloader_->getEpochsProcessed() + 1);\n        while (dataloader_->hasNextBatch()) {\n            // gets data and parameters for the next batch\n            shared_ptr<Batch> batch = dataloader_->getBatch();\n\n            if (dataloader_->graph_storage_->embeddingsOffDevice()) {\n                // transfers batch to the GPU\n                batch->to(model_->device_);\n            } else {\n                dataloader_->loadGPUParameters(batch);\n            }\n\n            // compute forward and backward pass of the model\n            model_->train_batch(batch);\n\n            // transfer gradients and update parameters\n            if (batch->node_embeddings_.defined()) {\n                if (dataloader_->graph_storage_->embeddingsOffDevice()) {\n                    batch->embeddingsToHost();\n                } else {\n                    dataloader_->updateEmbeddings(batch, true);\n                }\n\n                dataloader_->updateEmbeddings(batch, false);\n            }\n\n            // remember the batch size before clear() so progress reporting does not read a cleared batch\n            auto batch_size = batch->batch_size_;\n            batch->clear();\n\n            // notify that the batch has been completed\n            dataloader_->finishedBatch();\n\n            // log progress\n            progress_reporter_->addResult(batch_size);\n        }\n        SPDLOG_INFO(\"################ Finished training epoch {} ################\", dataloader_->getEpochsProcessed() + 1);\n\n        // notify that the epoch has been completed\n        dataloader_->nextEpoch();\n        progress_reporter_->clear();\n        timer.stop();\n\n        std::string item_name;\n        int64_t num_items = 0;\n        std::tie(item_name, num_items) = getTrainingItems(dataloader_, learning_task_);\n        logEpochThroughput(item_name, num_items, timer.getDuration());\n    }\n}\n"
  },
  {
    "path": "src/cpp/src/reporting/reporting.cpp",
    "content": "//\n// Created by Jason Mohoney on 8/24/21.\n//\n#include \"reporting/reporting.h\"\n\n#include <fstream>\n\n#include \"configuration/constants.h\"\n#include \"reporting/logger.h\"\n\nnamespace {\n\n// Folds the per-batch tensors into all (moved to the CPU) and empties the per-batch list. torch::cat rejects an empty\n// list, so when nothing new was added the previously consolidated tensor is left untouched.\ntemplate <typename TensorVector>\nvoid consolidate(TensorVector &per_batch, torch::Tensor &all) {\n    if (!per_batch.empty()) {\n        all = torch::cat(per_batch).to(torch::kCPU);\n        per_batch.clear();\n    }\n}\n\n// Number of consolidated results; zero while nothing has been consolidated.\nint64_t numResults(const torch::Tensor &all) { return all.defined() ? all.size(0) : 0; }\n\n}  // namespace\n\nHitskMetric::HitskMetric(int k) {\n    k_ = k;\n    name_ = \"Hits@\" + std::to_string(k_);\n    unit_ = \"\";\n}\n\n// Fraction of ranks that are less than or equal to k.\ntorch::Tensor HitskMetric::computeMetric(torch::Tensor ranks) { return torch::tensor((double)ranks.le(k_).nonzero().size(0) / ranks.size(0), torch::kFloat64); }\n\nMeanRankMetric::MeanRankMetric() {\n    name_ = \"Mean Rank\";\n    unit_ = \"\";\n}\n\ntorch::Tensor MeanRankMetric::computeMetric(torch::Tensor ranks) { return ranks.to(torch::kFloat64).mean(); }\n\nMeanReciprocalRankMetric::MeanReciprocalRankMetric() {\n    name_ = \"MRR\";\n    unit_ = \"\";\n}\n\ntorch::Tensor MeanReciprocalRankMetric::computeMetric(torch::Tensor ranks) { return ranks.to(torch::kFloat32).reciprocal().mean(); }\n\nCategoricalAccuracyMetric::CategoricalAccuracyMetric() {\n    name_ = \"Accuracy\";\n    unit_ = \"%\";\n}\n\n// Percentage of positions where y_true equals y_pred.\ntorch::Tensor CategoricalAccuracyMetric::computeMetric(torch::Tensor y_true, torch::Tensor y_pred) {\n    return 100 * torch::tensor({(double)(y_true == y_pred).nonzero().size(0) / y_true.size(0)}, torch::kFloat64);\n}\n\nReporter::~Reporter() { delete lock_; }\n\nLinkPredictionReporter::LinkPredictionReporter() {}\n\nLinkPredictionReporter::~LinkPredictionReporter() { clear(); }\n\n// Drops every accumulated result, including the evaluated edges, so the next evaluation starts from a clean state.\nvoid LinkPredictionReporter::clear() {\n    all_ranks_ = torch::Tensor();\n    all_scores_ = torch::Tensor();\n    all_edges_ = torch::Tensor();\n    per_batch_ranks_ = {};\n    per_batch_scores_ = {};\n    // the edges have to be reset together with ranks and scores, otherwise save() pairs stale edges with new results\n    per_batch_edges_ = {};\n}\n\n// Rank of each positive score among its negatives: one plus the number of negatives scoring at least as high.\ntorch::Tensor LinkPredictionReporter::computeRanks(torch::Tensor pos_scores, torch::Tensor neg_scores) {\n    return (neg_scores >= pos_scores.unsqueeze(1)).sum(1) + 1;\n}\n\n// Records the results of one batch. Ranks are only recorded when neg_scores is defined; scores and edges are only\n// recorded (on the CPU) when edges is defined.\nvoid LinkPredictionReporter::addResult(torch::Tensor pos_scores, torch::Tensor neg_scores, torch::Tensor edges) {\n    lock();\n\n    if (neg_scores.defined()) {\n        per_batch_ranks_.emplace_back(computeRanks(pos_scores, neg_scores));\n    }\n\n    if (edges.defined()) {\n        per_batch_scores_.emplace_back(pos_scores.to(torch::kCPU));\n        per_batch_edges_.emplace_back(edges.to(torch::kCPU));\n    }\n    unlock();\n}\n\n// Logs the number of evaluated edges and the value of every registered metric.\nvoid LinkPredictionReporter::report() {\n    // ranks, scores and edges are consolidated together so they always describe the same set of batches\n    consolidate(per_batch_ranks_, all_ranks_);\n    consolidate(per_batch_scores_, all_scores_);\n    consolidate(per_batch_edges_, all_edges_);\n\n    std::string report_string = \"\";\n    std::string header = \"\\n=================================\\nLink Prediction: \" + std::to_string(numResults(all_ranks_)) + \" edges evaluated\\n\";\n    report_string = report_string + header;\n\n    // ranking metrics need at least one batch of ranks\n    if (all_ranks_.defined()) {\n        std::string tmp;\n        for (auto m : metrics_) {\n            torch::Tensor result = std::dynamic_pointer_cast<RankingMetric>(m)->computeMetric(all_ranks_);\n            tmp = m->name_ + \": \" + std::to_string(result.item<double>()) + m->unit_ + \"\\n\";\n            report_string = report_string + tmp;\n        }\n    }\n    std::string footer = \"=================================\";\n    report_string = report_string + footer;\n\n    SPDLOG_INFO(report_string);\n}\n\n// Writes the metrics file (when metrics are registered) and, when scores or ranks is set, a csv file with one row per\n// evaluated edge. Throws MariusRuntimeException when the data needed for the requested columns was never reported.\nvoid LinkPredictionReporter::save(string directory, bool scores, bool ranks) {\n    consolidate(per_batch_ranks_, all_ranks_);\n    consolidate(per_batch_scores_, all_scores_);\n    consolidate(per_batch_edges_, all_edges_);\n\n    if (!metrics_.empty()) {\n        std::string report_string = \"\";\n        std::string header = \"Link Prediction: \" + std::to_string(numResults(all_ranks_)) + \" edges evaluated\\n\";\n        report_string = report_string + header;\n\n        // ranking metrics need at least one batch of ranks\n        if (all_ranks_.defined()) {\n            std::string tmp;\n            for (auto m : metrics_) {\n                torch::Tensor result = std::dynamic_pointer_cast<RankingMetric>(m)->computeMetric(all_ranks_);\n                tmp = m->name_ + \": \" + std::to_string(result.item<double>()) + m->unit_ + \"\\n\";\n                report_string = report_string + tmp;\n            }\n        }\n\n        string metrics_file = directory + PathConstants::output_metrics_file;\n\n        std::ofstream metrics_stream;\n        metrics_stream.open(metrics_file);\n\n        metrics_stream << report_string;\n        metrics_stream.close();\n    }\n\n    if (ranks || scores) {\n        if (!all_edges_.defined()) {\n            throw MariusRuntimeException(\"To save scores or ranks, the evaluated edges must be provided to addResult()\");\n        }\n\n        if (ranks && !all_ranks_.defined()) {\n            throw MariusRuntimeException(\"To save ranks, the negative scores must be provided to addResult()\");\n        }\n\n        int64_t num_rows = all_edges_.size(0);\n        int64_t num_edge_cols = all_edges_.size(1);\n\n        string output_scores_file = directory + PathConstants::output_scores_file;\n        std::ofstream scores_stream;\n        scores_stream.open(output_scores_file);\n\n        string header_string = \"\";\n        if (num_edge_cols == 3) {\n            header_string = \"src,rel,dst\";\n        } else {\n            header_string = \"src,dst\";\n        }\n\n        // ids and ranks are written from int64 tensors: float32 cannot represent every integer above 2^24, so routing\n        // them through a float tensor and an int cast corrupts large ids\n        torch::Tensor edges_out = all_edges_.to(torch::kInt64);\n\n        // empty placeholders keep the accessors below valid when a column is not requested\n        torch::Tensor ranks_out = torch::empty({0}, torch::kInt64);\n        torch::Tensor scores_out = torch::empty({0}, torch::kFloat32);\n\n        if (ranks) {\n            ranks_out = all_ranks_.narrow(0, 0, num_rows).to(torch::kInt64);\n            header_string = header_string + \",rank\";\n        }\n\n        if (scores) {\n            scores_out = all_scores_.narrow(0, 0, num_rows).to(torch::kFloat32);\n            header_string = header_string + \",score\";\n        }\n\n        scores_stream << header_string << \"\\n\";\n        auto edges_accessor = edges_out.accessor<int64_t, 2>();\n        auto ranks_accessor = ranks_out.accessor<int64_t, 1>();\n        auto scores_accessor = scores_out.accessor<float, 1>();\n\n        for (int64_t row = 0; row < num_rows; row++) {\n            string row_string = \"\";\n            for (int64_t col = 0; col < num_edge_cols; col++) {\n                if (col > 0) {\n                    row_string = row_string + \",\";\n                }\n                row_string = row_string + std::to_string(edges_accessor[row][col]);\n            }\n\n            if (ranks) {\n                row_string = row_string + \",\" + std::to_string(ranks_accessor[row]);\n            }\n\n            if (scores) {\n                row_string = row_string + \",\" + std::to_string(scores_accessor[row]);\n            }\n\n            scores_stream << row_string << \"\\n\";\n        }\n        scores_stream.close();\n    }\n}\n\nNodeClassificationReporter::NodeClassificationReporter() {}\n\nNodeClassificationReporter::~NodeClassificationReporter() { clear(); }\n\n// Drops every accumulated result, including the evaluated node ids, so the next evaluation starts from a clean state.\nvoid NodeClassificationReporter::clear() {\n    all_y_true_ = torch::Tensor();\n    all_y_pred_ = torch::Tensor();\n    all_nodes_ = torch::Tensor();\n    per_batch_y_true_ = {};\n    per_batch_y_pred_ = {};\n    // the node ids have to be reset together with the labels, otherwise save() pairs stale ids with new labels\n    per_batch_nodes_ = {};\n}\n\n// Records the labels of one batch together with the argmax over dimension 1 of y_pred, plus the node ids if defined.\nvoid NodeClassificationReporter::addResult(torch::Tensor y_true, torch::Tensor y_pred, torch::Tensor node_ids) {\n    lock();\n    per_batch_y_true_.emplace_back(y_true);\n    per_batch_y_pred_.emplace_back(y_pred.argmax(1));\n\n    if (node_ids.defined()) {\n        per_batch_nodes_.emplace_back(node_ids);\n    }\n    unlock();\n}\n\n// Logs the number of evaluated nodes and the value of every registered metric.\nvoid NodeClassificationReporter::report() {\n    // labels, predictions and node ids are consolidated together so they always describe the same set of batches\n    consolidate(per_batch_y_true_, all_y_true_);\n    consolidate(per_batch_y_pred_, all_y_pred_);\n    consolidate(per_batch_nodes_, all_nodes_);\n\n    std::string report_string = \"\";\n    std::string header = \"\\n=================================\\nNode Classification: \" + std::to_string(numResults(all_y_true_)) + \" nodes evaluated\\n\";\n    report_string = report_string + header;\n\n    // classification metrics need at least one batch of labels\n    if (all_y_true_.defined()) {\n        std::string tmp;\n        for (auto m : metrics_) {\n            torch::Tensor result = std::dynamic_pointer_cast<ClassificationMetric>(m)->computeMetric(all_y_true_, all_y_pred_);\n            tmp = m->name_ + \": \" + std::to_string(result.item<double>()) + m->unit_ + \"\\n\";\n            report_string = report_string + tmp;\n        }\n    }\n    std::string footer = \"=================================\";\n    report_string = report_string + footer;\n\n    SPDLOG_INFO(report_string);\n}\n\n// Writes the metrics file (when metrics are registered) and, when labels is set, a csv file with one row per evaluated\n// node. Throws MariusRuntimeException when node ids were not reported for the evaluated nodes.\nvoid NodeClassificationReporter::save(string directory, bool labels) {\n    consolidate(per_batch_y_true_, all_y_true_);\n    consolidate(per_batch_y_pred_, all_y_pred_);\n    consolidate(per_batch_nodes_, all_nodes_);\n\n    if (!metrics_.empty()) {\n        std::string report_string = \"\";\n        std::string header = \"\\n=================================\\nNode Classification: \" + std::to_string(numResults(all_y_true_)) + \" nodes evaluated\\n\";\n        report_string = report_string + header;\n\n        // classification metrics need at least one batch of labels\n        if (all_y_true_.defined()) {\n            std::string tmp;\n            for (auto m : metrics_) {\n                torch::Tensor result = std::dynamic_pointer_cast<ClassificationMetric>(m)->computeMetric(all_y_true_, all_y_pred_);\n                tmp = m->name_ + \": \" + std::to_string(result.item<double>()) + m->unit_ + \"\\n\";\n                report_string = report_string + tmp;\n            }\n        }\n        std::string footer = \"=================================\";\n        report_string = report_string + footer;\n\n        string metrics_file = directory + PathConstants::output_metrics_file;\n\n        std::ofstream metrics_stream;\n        metrics_stream.open(metrics_file);\n\n        metrics_stream << report_string;\n        metrics_stream.close();\n    }\n\n    if (labels) {\n        if (!all_nodes_.defined()) {\n            throw MariusRuntimeException(\"To save labels, the evaluated node ids must be provided to addResult()\");\n        }\n\n        int64_t num_rows = all_nodes_.size(0);\n        if (numResults(all_y_pred_) != num_rows || numResults(all_y_true_) != num_rows) {\n            throw MariusRuntimeException(\"To save labels, node ids must be provided to addResult() for every evaluated batch\");\n        }\n\n        string output_labels_file = directory + PathConstants::output_labels_file;\n        std::ofstream labels_stream;\n        labels_stream.open(output_labels_file);\n\n        string header_string = \"id,y_pred,y_true\";\n\n        // ids and labels are written from int64 tensors: float32 cannot represent every integer above 2^24, so routing\n        // them through a float tensor and an int cast corrupts large ids\n        torch::Tensor nodes_out = all_nodes_.to(torch::kInt64);\n        torch::Tensor y_pred_out = all_y_pred_.to(torch::kInt64);\n        torch::Tensor y_true_out = all_y_true_.to(torch::kInt64);\n\n        labels_stream << header_string << \"\\n\";\n        auto nodes_accessor = nodes_out.accessor<int64_t, 1>();\n        auto y_pred_accessor = y_pred_out.accessor<int64_t, 1>();\n        auto y_true_accessor = y_true_out.accessor<int64_t, 1>();\n\n        for (int64_t row = 0; row < num_rows; row++) {\n            string row_string = std::to_string(nodes_accessor[row]) + \",\" + std::to_string(y_pred_accessor[row]) + \",\" + std::to_string(y_true_accessor[row]) + \"\\n\";\n\n            labels_stream << row_string;\n        }\n        labels_stream.close();\n    }\n}\n\n// Reports progress roughly total_reports times while total_items items are processed.\nProgressReporter::ProgressReporter(std::string item_name, int64_t total_items, int total_reports) {\n    item_name_ = item_name;\n    total_items_ = total_items;\n    current_item_ = 0;\n    total_reports_ = total_reports;\n    // a non-positive report count would divide by zero; fall back to a single report once every item is processed\n    items_per_report_ = total_reports_ > 0 ? total_items_ / total_reports_ : total_items_;\n    next_report_ = items_per_report_;\n}\n\nProgressReporter::~ProgressReporter() { clear(); }\n\nvoid ProgressReporter::clear() {\n    current_item_ = 0;\n    next_report_ = items_per_report_;\n}\n\n// Adds items_processed to the running count and logs a progress line each time the next threshold is reached.\nvoid ProgressReporter::addResult(int64_t items_processed) {\n    lock();\n    current_item_ += items_processed;\n    if (current_item_ >= next_report_) {\n        report();\n        next_report_ = std::min({current_item_ + items_per_report_, total_items_});\n    }\n    unlock();\n}\n\nvoid ProgressReporter::report() {\n    // with no items at all the percentage is reported as zero instead of dividing by zero\n    double percent = total_items_ > 0 ? 100 * (double)current_item_ / total_items_ : 0.0;\n    std::string report_string = item_name_ + \" processed: [\" + std::to_string(current_item_) + \"/\" + std::to_string(total_items_) + \"], \" +\n                                fmt::format(\"{:.2f}\", percent) + \"%\";\n    SPDLOG_INFO(report_string);\n}\n"
  },
  {
    "path": "src/cpp/src/storage/buffer.cpp",
    "content": "//\n// Created by Jason Mohoney on 6/3/20.\n//\n\n#include \"storage/buffer.h\"\n\n#include <common/util.h>\n#include <fcntl.h>\n#include <unistd.h>\n\n#include <fstream>\n#include <functional>\n#include <future>\n#include <iostream>\n#include <shared_mutex>\n\n#include \"configuration/constants.h\"\n#include \"reporting/logger.h\"\n\nPartition::Partition(int partition_id, int64_t partition_size, int embedding_size, torch::Dtype dtype, int64_t idx_offset, int64_t file_offset) {\n    lock_ = new std::mutex();\n    cv_ = new std::condition_variable();\n    data_ptr_ = nullptr;\n    partition_id_ = partition_id;\n\n    present_ = false;\n\n    partition_size_ = partition_size;\n    embedding_size_ = embedding_size;\n    dtype_ = dtype;\n    dtype_size_ = get_dtype_size_wrapper(dtype_);\n    total_size_ = partition_size_ * embedding_size_ * dtype_size_;\n\n    idx_offset_ = idx_offset;\n    file_offset_ = file_offset;\n    buffer_idx_ = -1;\n\n    tensor_ = torch::Tensor();\n\n    evicting_ = false;\n}\n\nPartition::~Partition() {\n    delete lock_;\n    delete cv_;\n    tensor_ = torch::Tensor();\n}\n\ntorch::Tensor Partition::indexRead(Indices indices) {\n    if (indices.sizes().size() != 1) {\n        // TODO: throw invalid input to func exception\n        throw std::runtime_error(\"\");\n    }\n\n    lock_->lock();\n\n    torch::Tensor ret = tensor_.index_select(0, indices - idx_offset_);\n\n    lock_->unlock();\n    cv_->notify_all();\n\n    return ret;\n}\n\nPartitionedFile::PartitionedFile(string filename, int num_partitions, int64_t partition_size, int embedding_size, int64_t total_embeddings,\n                                 torch::Dtype dtype) {\n    num_partitions_ = num_partitions;\n    partition_size_ = partition_size;\n    embedding_size_ = embedding_size;\n    total_embeddings_ = total_embeddings;\n    dtype_ = dtype;\n    dtype_size_ = get_dtype_size_wrapper(dtype_);\n\n    filename_ = filename;\n\n    int flags = O_RDWR | IO_FLAGS;\n   
 fd_ = open(filename_.c_str(), flags);\n    if (fd_ == -1) {\n        SPDLOG_ERROR(\"Unable to open {}\\nError: {}\", filename_, errno);\n        throw std::runtime_error(\"\");\n    }\n}\n\nvoid PartitionedFile::readPartition(void *addr, Partition *partition) {\n    if (addr == NULL || partition == NULL) {\n        // TODO: throw null ptr exception\n        throw std::runtime_error(\"\");\n    }\n\n    memset_wrapper(addr, 0, partition->total_size_);\n    if (pread_wrapper(fd_, addr, partition->total_size_, partition->file_offset_) == -1) {\n        SPDLOG_ERROR(\"Unable to read Block: {}\\nError: {}\", partition->partition_id_, errno);\n        throw std::runtime_error(\"\");\n    }\n    partition->data_ptr_ = addr;\n    partition->tensor_ = torch::from_blob(addr, {partition->partition_size_, embedding_size_}, dtype_);\n}\n\n// writePartition accesses data pointed to by p->data_ptr_. Address p->data_ptr_ is expected to contain\n// same data as that of p->tensor_.\nvoid PartitionedFile::writePartition(Partition *partition, bool clear_mem) {\n    if (partition == NULL || partition->data_ptr_ == nullptr) {\n        // TODO: throw null ptr exception\n        throw std::runtime_error(\"\");\n    }\n\n    if (pwrite_wrapper(fd_, partition->data_ptr_, partition->total_size_, partition->file_offset_) == -1) {\n        throw MariusRuntimeException(fmt::format(\"Unable to write partition: {}\\nError: {}\", partition->partition_id_, errno));\n    }\n\n    if (clear_mem) {\n        memset_wrapper(partition->data_ptr_, 0, partition->total_size_);\n        partition->data_ptr_ = nullptr;\n        partition->tensor_ = torch::Tensor();\n    }\n}\n\nLookaheadBlock::LookaheadBlock(int64_t total_size, PartitionedFile *partitioned_file, int num_per_lookahead) {\n    total_size_ = total_size;\n    partitioned_file_ = partitioned_file;\n    partitions_ = {};\n    lock_ = new std::mutex();\n\n    mems_ = std::vector<void *>(num_per_lookahead);\n\n    for (int i = 0; i < 
num_per_lookahead; i++) {\n        if (posix_memalign(&mems_[i], 4096, total_size_)) {\n            SPDLOG_ERROR(\"Unable to allocate lookahead memory\\nError: {}\", errno);\n            throw std::runtime_error(\"\");\n        }\n        memset_wrapper(mems_[i], 0, total_size_);\n    }\n\n    done_ = false;\n    present_ = false;\n    thread_ = nullptr;\n}\n\nLookaheadBlock::~LookaheadBlock() {\n    delete lock_;\n\n    for (void *mem : mems_) {\n        free(mem);\n    }\n}\n\nvoid LookaheadBlock::run() {\n    while (!done_) {\n        // wait until block is empty\n        std::unique_lock lock(*lock_);\n        cv_.wait(lock, [this] { return present_ == false; });\n\n        if (partitions_.empty()) {\n            break;\n        }\n\n#pragma omp parallel for\n        for (int i = 0; i < partitions_.size(); i++) {\n            Partition *partition = partitions_[i];\n            std::unique_lock partition_lock(*partition->lock_);\n            partition->cv_->wait(partition_lock, [partition] { return partition->evicting_ == false; });\n            partitioned_file_->readPartition(mems_[i], partition);\n            partition_lock.unlock();\n            partition->cv_->notify_all();\n        }\n\n        present_ = true;\n        lock.unlock();\n        cv_.notify_all();\n    }\n}\n\nvoid LookaheadBlock::start(std::vector<Partition *> first_partitions) {\n    partitions_ = first_partitions;\n    if (thread_ == nullptr) {\n        thread_ = new std::thread(&LookaheadBlock::run, this);\n    }\n}\n\nvoid LookaheadBlock::stop() {\n    if (thread_ != nullptr) {\n        if (thread_->joinable()) {\n            done_ = true;\n            present_ = false;\n            cv_.notify_all();\n            thread_->join();\n        }\n        delete thread_;\n    }\n}\n\nvoid LookaheadBlock::move_to_buffer(std::vector<void *> buff_addrs, std::vector<int64_t> buffer_idxs, std::vector<Partition *> next_partitions) {\n    if (partitions_.size() > buff_addrs.size() || 
partitions_.size() > buffer_idxs.size()) {\n        // TODO: throw invalid inputs for function exception\n        throw std::runtime_error(\"\");\n    }\n    // wait until block is populated\n    std::unique_lock lock(*lock_);\n    cv_.wait(lock, [this] { return present_ == true; });\n\n#pragma omp parallel for\n    for (int i = 0; i < partitions_.size(); i++) {\n        Partition *partition = partitions_[i];\n        void *addr = buff_addrs[i];\n        int64_t buffer_idx = buffer_idxs[i];\n        memcpy_wrapper(addr, mems_[i], partition->total_size_);\n        memset_wrapper(mems_[i], 0, partition->total_size_);\n\n        partition->data_ptr_ = addr;\n        partition->tensor_ = torch::from_blob(partition->data_ptr_, {partition->partition_size_, partition->embedding_size_}, partition->dtype_);\n        partition->buffer_idx_ = buffer_idx;\n        partition->present_ = true;\n    }\n\n    // next partition will be prefetched automatically\n    partitions_ = next_partitions;\n    present_ = false;\n    lock.unlock();\n    cv_.notify_all();\n}\n\nAsyncWriteBlock::AsyncWriteBlock(int64_t total_size, PartitionedFile *partitioned_file, int num_per_evict) {\n    total_size_ = total_size;\n    partitioned_file_ = partitioned_file;\n\n    lock_ = new std::mutex();\n\n    mems_ = std::vector<void *>(num_per_evict);\n\n    for (int i = 0; i < num_per_evict; i++) {\n        if (posix_memalign(&mems_[i], 4096, total_size_)) {\n            SPDLOG_ERROR(\"Unable to allocate lookahead memory\\nError: {}\", errno);\n            throw std::runtime_error(\"\");\n        }\n        memset_wrapper(mems_[i], 0, total_size_);\n    }\n\n    done_ = false;\n    present_ = false;\n    thread_ = nullptr;\n}\n\nAsyncWriteBlock::~AsyncWriteBlock() {\n    delete lock_;\n\n    for (void *mem : mems_) {\n        free(mem);\n    }\n}\n\nvoid AsyncWriteBlock::run() {\n    while (!done_) {\n        // wait until block is empty\n        std::unique_lock lock(*lock_);\n        cv_.wait(lock, 
[this] { return present_ == true; });\n\n        if (done_) {\n            return;\n        }\n\n#pragma omp parallel for\n        for (int i = 0; i < partitions_.size(); i++) {\n            Partition *partition = partitions_[i];\n            partitioned_file_->writePartition(partition);\n            partition->present_ = false;\n            partition->evicting_ = false;\n            partition->cv_->notify_all();\n        }\n\n        present_ = false;\n        lock.unlock();\n        cv_.notify_all();\n    }\n}\n\nvoid AsyncWriteBlock::start() {\n    if (thread_ == nullptr) {\n        thread_ = new std::thread(&AsyncWriteBlock::run, this);\n    }\n}\n\nvoid AsyncWriteBlock::stop() {\n    if (thread_ != nullptr) {\n        if (thread_->joinable()) {\n            done_ = true;\n            present_ = true;\n            cv_.notify_all();\n            thread_->join();\n        }\n        delete thread_;\n    }\n}\n\nvoid AsyncWriteBlock::async_write(std::vector<Partition *> partitions) {\n    if (partitions.size() > mems_.size()) {\n        // TODO: throw invalid inputs for function exception\n        throw std::runtime_error(\"\");\n    }\n\n    // wait until block is empty\n    std::unique_lock lock(*lock_);\n    cv_.wait(lock, [this] { return present_ == false; });\n\n    partitions_ = partitions;\n\n#pragma omp parallel for\n    for (int i = 0; i < partitions_.size(); i++) {\n        void *mem = mems_[i];\n        Partition *partition = partitions_[i];\n\n        memcpy_wrapper(mem, partition->data_ptr_, total_size_);\n        memset_wrapper(partition->data_ptr_, 0, total_size_);\n\n        partition->data_ptr_ = mem;\n        partition->evicting_ = true;\n    }\n\n    present_ = true;\n\n    lock.unlock();\n    cv_.notify_all();\n}\n\nPartitionBuffer::PartitionBuffer(int capacity, int num_partitions, int fine_to_coarse_ratio, int64_t partition_size, int embedding_size,\n                                 int64_t total_embeddings, torch::Dtype dtype, string 
filename, bool prefetching) {\n    capacity_ = capacity;\n    size_ = 0;\n    num_partitions_ = num_partitions;\n    partition_size_ = partition_size;\n    fine_to_coarse_ratio_ = fine_to_coarse_ratio;\n    dtype_ = dtype;\n    dtype_size_ = get_dtype_size_wrapper(dtype_);\n    embedding_size_ = embedding_size;\n    total_embeddings_ = total_embeddings;\n    filename_ = filename;\n    partition_table_ = std::vector<Partition *>();\n\n    prefetching_ = prefetching;\n\n    int64_t curr_idx_offset = 0;\n    int64_t curr_file_offset = 0;\n    int64_t curr_partition_size = partition_size_;\n    int64_t curr_total_size = curr_partition_size * embedding_size_ * dtype_size_;\n    for (int64_t i = 0; i < num_partitions_; i++) {\n        // the last partition might be slightly smaller\n        if (i == num_partitions_ - 1) {\n            curr_partition_size = total_embeddings_ - curr_idx_offset;\n            curr_total_size = curr_partition_size * embedding_size_ * dtype_size_;\n        }\n\n        Partition *curr_part = new Partition(i, curr_partition_size, embedding_size_, dtype_, curr_idx_offset, curr_file_offset);\n        partition_table_.push_back(curr_part);\n\n        curr_file_offset += curr_total_size;\n        curr_idx_offset += curr_partition_size;\n    }\n\n    filename_ = filename;\n    partitioned_file_ = new PartitionedFile(filename_, num_partitions_, partition_size_, embedding_size_, total_embeddings_, dtype_);\n\n    loaded_ = false;\n}\n\nPartitionBuffer::~PartitionBuffer() {\n    unload(true);\n\n    delete partitioned_file_;\n    for (int64_t i = 0; i < num_partitions_; i++) {\n        delete partition_table_[i];\n    }\n}\n\nvoid PartitionBuffer::load() {\n    if (!loaded_) {\n        if (posix_memalign(&buff_mem_, 4096, capacity_ * partition_size_ * embedding_size_ * dtype_size_)) {\n            SPDLOG_ERROR(\"Unable to allocate buffer memory\\nError: {}\", errno);\n            throw std::runtime_error(\"\");\n        }\n        
memset_wrapper(buff_mem_, 0, capacity_ * partition_size_ * embedding_size_ * dtype_size_);\n        buffer_tensor_view_ = torch::from_blob(buff_mem_, {capacity_ * partition_size_, embedding_size_}, dtype_);\n\n        // initialize buffer\n        int partition_id;\n\n        int64_t num_nodes = 0;\n\n        for (int i = 0; i < buffer_state_.size(0); i++) {\n            partition_id = buffer_state_[i].item<int>();\n            Partition *partition = partition_table_[partition_id];\n            void *buff_addr = (char *)buff_mem_ + (i * partition_size_ * embedding_size_ * dtype_size_);\n            partitioned_file_->readPartition(buff_addr, partition);\n            partition->present_ = true;\n            partition->buffer_idx_ = i;\n            num_nodes += partition->partition_size_;\n        }\n\n        in_buffer_ids_ = torch::empty({num_nodes}, torch::kInt64);\n        //        int64_t offset = 0;\n        //        for (int i = 0; i < buffer_state_.size(0); i++) {\n        //            partition_id = buffer_state_[i].item<int>();\n        //            Partition *partition = partition_table_[partition_id];\n        //            int64_t partition_offset = partition->idx_offset_;\n        //\n        //            in_buffer_ids_.slice(0, offset, offset + partition->partition_size_) = torch::arange(partition_offset, partition_offset +\n        //            partition->partition_size_); offset += partition->partition_size_;\n        //        }\n\n        if (prefetching_) {\n            lookahead_block_ = new LookaheadBlock(partition_size_ * embedding_size_ * dtype_size_, partitioned_file_, fine_to_coarse_ratio_);\n            async_write_block_ = new AsyncWriteBlock(partition_size_ * embedding_size_ * dtype_size_, partitioned_file_, fine_to_coarse_ratio_);\n            startThreads();\n        }\n\n        loaded_ = true;\n    }\n}\n\nvoid PartitionBuffer::unload(bool write) {\n    if (loaded_) {\n        if (write) {\n            sync();\n        }\n       
 buffer_tensor_view_ = torch::Tensor();\n        free(buff_mem_);\n        buff_mem_ = nullptr;\n\n        if (prefetching_) {\n            stopThreads();\n            delete lookahead_block_;\n            delete async_write_block_;\n        }\n\n        size_ = 0;\n        loaded_ = false;\n    }\n}\n\ntorch::Tensor PartitionBuffer::getBufferState() { return buffer_state_; }\n\n// indices a relative to the local node ids\ntorch::Tensor PartitionBuffer::indexRead(torch::Tensor indices) {\n    if (indices.sizes().size() != 1) {\n        // TODO: throw invalid input to func exception\n        throw std::runtime_error(\"\");\n    }\n\n    auto out_options = torch::TensorOptions().dtype(torch::kFloat32);\n#ifdef MARIUS_CUDA\n    out_options = out_options.pinned_memory(true);\n#endif\n    torch::Tensor out = torch::empty({indices.size(0), buffer_tensor_view_.size(1)}, out_options);\n    torch::index_select_out(out, buffer_tensor_view_, 0, indices);\n\n    return out;\n}\n\nIndices PartitionBuffer::getRandomIds(int64_t size) { return torch::randint(in_buffer_ids_.size(0), size, torch::kInt64); }\n\n// indices must contain unique values, else there is a possibility of a race condition\nvoid PartitionBuffer::indexAdd(torch::Tensor indices, torch::Tensor values) {\n    if (!values.defined() || indices.sizes().size() != 1 || indices.size(0) != values.size(0) || buffer_tensor_view_.size(1) != values.size(1)) {\n        // TODO: throw invalid inputs for function error\n        throw std::runtime_error(\"\");\n    }\n    // buffer_tensor_view_.index_add_(0, indices, values);\n\n    // assumes this operation is only used on float valued data, and this op takes place on the CPU\n    auto data_accessor = buffer_tensor_view_.accessor<float, 2>();\n    auto ids_accessor = indices.accessor<int64_t, 1>();\n    auto values_accessor = values.accessor<float, 2>();\n\n    int d = values.size(1);\n    int64_t size = indices.size(0);\n#pragma omp parallel for\n    for (int64_t i = 0; i < 
size; i++) {\n        for (int j = 0; j < d; j++) {\n            data_accessor[ids_accessor[i]][j] += values_accessor[i][j];\n        }\n    }\n}\n\nvoid PartitionBuffer::setBufferOrdering(std::vector<torch::Tensor> buffer_states) {\n    buffer_states_ = buffer_states;\n    buffer_state_iterator_ = buffer_states_.begin();\n    buffer_state_ = *buffer_state_iterator_++;\n\n    if (loaded_) {\n        unload(true);\n        load();\n    }\n}\n\nbool PartitionBuffer::hasSwap() { return buffer_state_iterator_ != buffer_states_.end(); }\n\nvoid PartitionBuffer::performNextSwap() {\n    if (!buffer_state_.defined() || buffer_state_iterator_ == buffer_states_.end()) {\n        return;\n    }\n\n    // get evicted and admitted partitions\n    std::vector<int> evict_ids = getNextEvict();\n    std::vector<int> admit_ids = getNextAdmit();\n\n    std::vector<Partition *> admit_partitions;\n    std::vector<Partition *> evict_partitions;\n    std::vector<int64_t> evict_buffer_idxs;\n    for (int admit_id : admit_ids) {\n        admit_partitions.emplace_back(partition_table_[admit_id]);\n    }\n    for (int evict_id : evict_ids) {\n        evict_partitions.emplace_back(partition_table_[evict_id]);\n        evict_buffer_idxs.emplace_back(partition_table_[evict_id]->buffer_idx_);\n    }\n\n    buffer_state_ = *buffer_state_iterator_++;\n\n    // evict partition\n    evict(evict_partitions);\n    // admit partition\n    admit(admit_partitions, evict_buffer_idxs);\n\n    int64_t num_nodes = 0;\n\n    int partition_id;\n    for (int i = 0; i < buffer_state_.size(0); i++) {\n        partition_id = buffer_state_[i].item<int>();\n        num_nodes += partition_table_[partition_id]->partition_size_;\n    }\n\n    in_buffer_ids_ = torch::empty({num_nodes}, torch::kInt64);\n\n    //    int64_t offset = 0;\n    //    for (int i = 0; i < buffer_state_.size(0); i++) {\n    //        partition_id = buffer_state_[i].item<int>();\n    //        Partition *partition = 
partition_table_[partition_id];\n    //        int64_t partition_offset = partition->idx_offset_;\n    //\n    //        in_buffer_ids_.slice(0, offset, offset + partition->partition_size_) = torch::arange(partition_offset, partition_offset +\n    //        partition->partition_size_); offset += partition->partition_size_;\n    //    }\n}\n\nstd::vector<int> PartitionBuffer::getNextAdmit() {\n    std::vector<int> admit_ids;\n    bool admitted;\n\n    if (buffer_state_iterator_ != buffer_states_.end()) {\n        for (int i = 0; i < buffer_state_iterator_->size(0); i++) {\n            admitted = true;\n            for (int j = 0; j < buffer_state_.size(0); j++) {\n                if ((*buffer_state_iterator_)[i].item<int>() == (buffer_state_)[j].item<int>()) {\n                    admitted = false;\n                }\n            }\n            if (admitted) {\n                admit_ids.emplace_back((*buffer_state_iterator_)[i].item<int>());\n            }\n        }\n    }\n    return admit_ids;\n}\n\nstd::vector<int> PartitionBuffer::getNextEvict() {\n    std::vector<int> evict_ids;\n    bool evicted;\n\n    for (int i = 0; i < buffer_state_.size(0); i++) {\n        evicted = true;\n        for (int j = 0; j < buffer_state_iterator_->size(0); j++) {\n            if ((*buffer_state_iterator_)[j].item<int>() == buffer_state_[i].item<int>()) {\n                evicted = false;\n            }\n        }\n        if (evicted) {\n            evict_ids.emplace_back(buffer_state_[i].item<int>());\n        }\n    }\n    return evict_ids;\n}\n\ntorch::Tensor PartitionBuffer::getGlobalToLocalMap(bool get_current) {\n    torch::Tensor buffer_index_map = -torch::ones({total_embeddings_}, torch::kInt64);\n\n    torch::Tensor buffer_state;\n\n    if (get_current) {\n        buffer_state = buffer_state_;\n\n#pragma omp parallel for\n        for (int i = 0; i < buffer_state.size(0); i++) {\n            int partition_id = buffer_state[i].item<int>();\n            Partition 
*partition = partition_table_[partition_id];\n            int64_t partition_offset = partition->idx_offset_;\n            int64_t buffer_offset = partition->buffer_idx_ * partition_size_;\n            buffer_index_map.slice(0, partition_offset, partition_offset + partition->partition_size_) =\n                torch::arange(buffer_offset, buffer_offset + partition->partition_size_);\n        }\n\n    } else {\n        // get mapping for next swap\n        buffer_state = *buffer_state_iterator_;\n\n        // get evicted and admitted partitions\n        std::vector<int> evict_ids = getNextEvict();\n        std::vector<int> admit_ids = getNextAdmit();\n\n        // get mapping for the partitions that will still be in the buffer\n#pragma omp parallel for\n        for (int i = 0; i < buffer_state.size(0); i++) {\n            int partition_id = buffer_state[i].item<int>();\n            Partition *partition = partition_table_[partition_id];\n            int64_t partition_offset = partition->idx_offset_;\n\n            if (partition->buffer_idx_ != -1) {\n                int64_t buffer_offset = partition->buffer_idx_ * partition_size_;\n                buffer_index_map.slice(0, partition_offset, partition_offset + partition->partition_size_) =\n                    torch::arange(buffer_offset, buffer_offset + partition->partition_size_);\n            }\n        }\n\n// get mapping for the partitions that will be admitted\n#pragma omp parallel for\n        for (int i = 0; i < evict_ids.size(); i++) {\n            Partition *admit_partition = partition_table_[admit_ids[i]];\n            Partition *evict_partition = partition_table_[evict_ids[i]];\n            int64_t partition_offset = admit_partition->idx_offset_;\n            int64_t buffer_offset = evict_partition->buffer_idx_ * partition_size_;\n            buffer_index_map.slice(0, partition_offset, partition_offset + admit_partition->partition_size_) =\n                torch::arange(buffer_offset, buffer_offset + 
admit_partition->partition_size_);\n        }\n    }\n    return buffer_index_map;\n}\n\nvoid PartitionBuffer::evict(std::vector<Partition *> evict_partitions) {\n    if (prefetching_) {\n        async_write_block_->async_write(evict_partitions);\n    } else {\n#pragma omp parallel for\n        for (int i = 0; i < evict_partitions.size(); i++) {\n            partitioned_file_->writePartition(evict_partitions[i]);\n        }\n    }\n\n#pragma omp parallel for\n    for (int i = 0; i < evict_partitions.size(); i++) {\n        evict_partitions[i]->present_ = false;\n    }\n}\n\nvoid PartitionBuffer::admit(std::vector<Partition *> admit_partitions, std::vector<int64_t> buffer_idxs) {\n    if (admit_partitions.size() > buffer_idxs.size()) {\n        // TODO: throw invalid inputs for function error\n        throw std::runtime_error(\"\");\n    }\n\n    std::vector<void *> buff_addrs(buffer_idxs.size());\n\n#pragma omp parallel for\n    for (int i = 0; i < buffer_idxs.size(); i++) {\n        void *buff_addr = (char *)buff_mem_ + (buffer_idxs[i] * partition_size_ * embedding_size_ * dtype_size_);\n        buff_addrs[i] = buff_addr;\n    }\n\n    if (prefetching_) {\n        std::vector<int> next_admit_ids = getNextAdmit();\n        std::vector<Partition *> next_partitions;\n        if (!next_admit_ids.empty()) {\n            for (int admit_id : next_admit_ids) {\n                next_partitions.emplace_back(partition_table_[admit_id]);\n            }\n        }\n        lookahead_block_->move_to_buffer(buff_addrs, buffer_idxs, next_partitions);\n    } else {\n#pragma omp parallel for\n        for (int i = 0; i < admit_partitions.size(); i++) {\n            Partition *partition = admit_partitions[i];\n            partitioned_file_->readPartition(buff_addrs[i], partition);\n            partition->present_ = true;\n            partition->buffer_idx_ = buffer_idxs[i];\n        }\n    }\n}\n\nvoid PartitionBuffer::sync() {\n    SPDLOG_DEBUG(\"Synchronizing buffer\");\n    
Partition *curr_partition;\n    for (int i = 0; i < num_partitions_; i++) {\n        curr_partition = partition_table_[i];\n        if (curr_partition->present_) {\n            partitioned_file_->writePartition(curr_partition, true);\n            curr_partition->present_ = false;\n            curr_partition->buffer_idx_ = -1;\n        }\n    }\n}\n\nvoid PartitionBuffer::startThreads() {\n    SPDLOG_DEBUG(\"Starting prefetching threads\");\n    std::vector<Partition *> partitions;\n    std::vector<int> admit_ids = getNextAdmit();\n    for (int admit_id : admit_ids) {\n        partitions.emplace_back(partition_table_[admit_id]);\n    }\n    lookahead_block_->start(partitions);\n    async_write_block_->start();\n}\n\nvoid PartitionBuffer::stopThreads() {\n    SPDLOG_DEBUG(\"Stopping prefetching threads\");\n    lookahead_block_->stop();\n    async_write_block_->stop();\n}\n"
  },
  {
    "path": "src/cpp/src/storage/checkpointer.cpp",
    "content": "//\n// Created by Jason Mohoney on 12/15/21.\n//\n\n#include \"storage/checkpointer.h\"\n\n#include <fstream>\n#include <sstream>\n#include <stdexcept>\n#include <string>\n#include <tuple>\n\n#include \"configuration/util.h\"\n#include \"reporting/logger.h\"\n#include \"storage/io.h\"\n#include \"storage/storage.h\"\n\n// Keeps shared handles to the model, the graph storage and the checkpoint configuration.\nCheckpointer::Checkpointer(std::shared_ptr<Model> model, shared_ptr<GraphModelStorage> storage, std::shared_ptr<CheckpointConfig> config) {\n    model_ = model;\n    storage_ = storage;\n    config_ = config;\n}\n\n// Creates the directory checkpoint_<epochs>/ inside checkpoint_dir.\n// Everything is written to checkpoint_<epochs>_tmp/ first; the rename to the final name is the last step.\n// checkpoint_dir is concatenated directly with file names, so it must already end with a path separator.\nvoid Checkpointer::create_checkpoint(string checkpoint_dir, CheckpointMeta checkpoint_meta, int epochs) {\n    string tmp_checkpoint_dir = checkpoint_dir + \"checkpoint_\" + std::to_string(epochs) + \"_tmp/\";\n    createDir(tmp_checkpoint_dir, false);\n\n    std::string new_embeddings_file = tmp_checkpoint_dir + PathConstants::embeddings_file + PathConstants::file_ext;\n    std::string new_embeddings_state_file = tmp_checkpoint_dir + PathConstants::embeddings_state_file + PathConstants::file_ext;\n\n    std::string embeddings_file = checkpoint_dir + PathConstants::embeddings_file + PathConstants::file_ext;\n    std::string embeddings_state_file = checkpoint_dir + PathConstants::embeddings_state_file + PathConstants::file_ext;\n\n    // copy the embeddings file (and the optimizer state file when save_state is set) into the temporary directory\n    if (fileExists(embeddings_file)) {\n        copyFile(embeddings_file, new_embeddings_file);\n        if (this->config_->save_state) copyFile(embeddings_state_file, new_embeddings_state_file);\n    }\n\n    this->save(tmp_checkpoint_dir, checkpoint_meta);\n\n    string final_checkpoint_dir = checkpoint_dir + \"checkpoint_\" + std::to_string(epochs) + \"/\";\n    renameFile(tmp_checkpoint_dir, final_checkpoint_dir);\n}\n\n// Writes the state selected by checkpoint_meta and then the metadata file.\n//   has_model: write() the node embeddings storage (if any) and save the model into checkpoint_dir\n//   has_state: write() the node optimizer state storage (if any)\n// Throws std::runtime_error when the metadata file cannot be opened for writing.\nvoid Checkpointer::save(string checkpoint_dir, CheckpointMeta checkpoint_meta) {\n    if (checkpoint_meta.has_model) {\n        if (storage_->storage_ptrs_.node_embeddings != nullptr) {\n            storage_->storage_ptrs_.node_embeddings->write();\n        }\n        model_->save(checkpoint_dir);\n    }\n\n    if (checkpoint_meta.has_state) {\n        if (storage_->storage_ptrs_.node_optimizer_state != nullptr) {\n            storage_->storage_ptrs_.node_optimizer_state->write();\n        }\n    }\n\n    saveMetadata(checkpoint_dir, checkpoint_meta);\n}\n\n// Restores a model and its storage from checkpoint_dir.\n// Returns the model, the storage built by initializeStorage and the metadata read from the checkpoint.\n// Throws std::runtime_error when the metadata file cannot be opened.\nstd::tuple<std::shared_ptr<Model>, shared_ptr<GraphModelStorage>, CheckpointMeta> Checkpointer::load(string checkpoint_dir,\n                                                                                                     std::shared_ptr<MariusConfig> marius_config, bool train) {\n    CheckpointMeta checkpoint_meta = loadMetadata(checkpoint_dir);\n\n    std::vector<torch::Device> devices = devices_from_config(marius_config->storage);\n    std::shared_ptr<Model> model = initModelFromConfig(marius_config->model, devices, marius_config->storage->dataset->num_relations, train);\n    model->load(checkpoint_dir, train);\n\n    // the learning task is taken from the checkpoint metadata\n    if (checkpoint_meta.link_prediction) {\n        model->learning_task_ = LearningTask::LINK_PREDICTION;\n    } else {\n        model->learning_task_ = LearningTask::NODE_CLASSIFICATION;\n    }\n\n    shared_ptr<GraphModelStorage> storage = initializeStorage(model, marius_config->storage, false, train);\n\n    // make_tuple yields a tuple of values; forward_as_tuple would build a tuple of references to these locals\n    return std::make_tuple(model, storage, checkpoint_meta);\n}\n\n// Reads the metadata file in directory. The fields are stored one per line, in the order written by saveMetadata:\n// name, num_epochs, checkpoint_id, link_prediction, has_state, has_encoded, has_model.\n// Throws std::runtime_error when the file cannot be opened.\nCheckpointMeta Checkpointer::loadMetadata(string directory) {\n    CheckpointMeta ret_meta;\n\n    std::string metadata_path = directory + PathConstants::checkpoint_metadata_file;\n    std::ifstream input_file(metadata_path);\n    if (!input_file.is_open()) {\n        // fail here with the offending path instead of letting std::stoi fail on an empty line below\n        throw std::runtime_error(\"Unable to open checkpoint metadata file for reading: \" + metadata_path);\n    }\n\n    std::string line;\n    std::getline(input_file, line);\n    ret_meta.name = line;\n\n    std::getline(input_file, line);\n    ret_meta.num_epochs = std::stoi(line);\n\n    std::getline(input_file, line);\n    ret_meta.checkpoint_id = std::stoi(line);\n\n    std::getline(input_file, line);\n    std::istringstream(line) >> ret_meta.link_prediction;\n\n    std::getline(input_file, line);\n    std::istringstream(line) >> ret_meta.has_state;\n\n    std::getline(input_file, line);\n    std::istringstream(line) >> ret_meta.has_encoded;\n\n    std::getline(input_file, line);\n    std::istringstream(line) >> ret_meta.has_model;\n\n    return ret_meta;\n}\n\n// Writes checkpoint_meta to the metadata file in directory, one field per line, in the order loadMetadata reads them.\n// Throws std::runtime_error when the file cannot be opened.\nvoid Checkpointer::saveMetadata(string directory, CheckpointMeta checkpoint_meta) {\n    std::string metadata_path = directory + PathConstants::checkpoint_metadata_file;\n    std::ofstream output_file(metadata_path);\n    if (!output_file.is_open()) {\n        // a checkpoint without metadata cannot be loaded again, so do not continue silently\n        throw std::runtime_error(\"Unable to open checkpoint metadata file for writing: \" + metadata_path);\n    }\n\n    output_file << checkpoint_meta.name << \"\\n\";\n    output_file << checkpoint_meta.num_epochs << \"\\n\";\n    output_file << checkpoint_meta.checkpoint_id << \"\\n\";\n    output_file << checkpoint_meta.link_prediction << \"\\n\";\n    output_file << checkpoint_meta.has_state << \"\\n\";\n    output_file << checkpoint_meta.has_encoded << \"\\n\";\n    output_file << checkpoint_meta.has_model << \"\\n\";\n}"
  },
  {
    "path": "src/cpp/src/storage/graph_storage.cpp",
    "content": "//\n// Created by Jason Mohoney on 6/18/21.\n//\n\n#include \"storage/graph_storage.h\"\n\n#include <algorithm>\n#include <random>\n\n#include \"data/ordering.h\"\n#include \"reporting/logger.h\"\n\nGraphModelStorage::GraphModelStorage(GraphModelStoragePtrs storage_ptrs, shared_ptr<StorageConfig> storage_config) {\n    storage_ptrs_ = storage_ptrs;\n    train_ = true;\n    full_graph_evaluation_ = storage_config->full_graph_evaluation;\n\n    prefetch_ = storage_config->prefetch;\n    prefetch_complete_ = false;\n    subgraph_lock_ = new std::mutex();\n    subgraph_cv_ = new std::condition_variable();\n\n    current_subgraph_state_ = nullptr;\n    next_subgraph_state_ = nullptr;\n    in_memory_embeddings_ = nullptr;\n    in_memory_features_ = nullptr;\n\n    num_nodes_ = storage_config->dataset->num_nodes;\n    num_edges_ = storage_config->dataset->num_edges;\n\n    if (full_graph_evaluation_) {\n        if (storage_ptrs_.node_embeddings != nullptr) {\n            if (instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_embeddings)) {\n                string node_embedding_filename = storage_config->model_dir + PathConstants::embeddings_file + PathConstants::file_ext;\n\n                in_memory_embeddings_ =\n                    std::make_shared<InMemory>(node_embedding_filename, storage_ptrs_.node_embeddings->dim0_size_, storage_ptrs_.node_embeddings->dim1_size_,\n                                               storage_ptrs_.node_embeddings->dtype_, torch::kCPU);\n            }\n        }\n\n        if (storage_ptrs_.node_features != nullptr) {\n            if (instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_features)) {\n                string node_feature_filename =\n                    storage_config->dataset->dataset_dir + PathConstants::nodes_directory + PathConstants::features_file + PathConstants::file_ext;\n\n                in_memory_features_ = std::make_shared<InMemory>(node_feature_filename, 
storage_ptrs_.node_features->dim0_size_,\n                                                                 storage_ptrs_.node_features->dim1_size_, storage_ptrs_.node_features->dtype_, torch::kCPU);\n            }\n        }\n    }\n}\n\nGraphModelStorage::GraphModelStorage(GraphModelStoragePtrs storage_ptrs, bool prefetch) {\n    storage_ptrs_ = storage_ptrs;\n    train_ = true;\n    full_graph_evaluation_ = false;\n\n    prefetch_ = prefetch;\n    prefetch_complete_ = false;\n    subgraph_lock_ = new std::mutex();\n    subgraph_cv_ = new std::condition_variable();\n\n    current_subgraph_state_ = nullptr;\n    next_subgraph_state_ = nullptr;\n    in_memory_embeddings_ = nullptr;\n    in_memory_features_ = nullptr;\n\n    if (storage_ptrs_.node_embeddings != nullptr) {\n        num_nodes_ = storage_ptrs_.node_embeddings->getDim0();\n    } else if (storage_ptrs_.node_features != nullptr) {\n        num_nodes_ = storage_ptrs_.node_features->getDim0();\n    } else {\n        throw MariusRuntimeException(\"The input graph must have node features and/or node embeddings\");\n    }\n    num_edges_ = storage_ptrs_.edges->getDim0();\n}\n\nGraphModelStorage::~GraphModelStorage() {\n    unload(false);\n\n    delete subgraph_lock_;\n    delete subgraph_cv_;\n}\n\nvoid GraphModelStorage::_load(shared_ptr<Storage> storage) {\n    if (storage != nullptr) {\n        storage->load();\n    }\n}\n\nvoid GraphModelStorage::_unload(shared_ptr<Storage> storage, bool write) {\n    if (storage != nullptr) {\n        storage->unload(write);\n    }\n}\n\nvoid GraphModelStorage::load() {\n    _load(storage_ptrs_.edges);\n    _load(storage_ptrs_.train_edges);\n    _load(storage_ptrs_.train_edges_dst_sort);\n    _load(storage_ptrs_.nodes);\n\n    if (train_) {\n        _load(storage_ptrs_.node_embeddings);\n        _load(storage_ptrs_.node_optimizer_state);\n        _load(storage_ptrs_.node_features);\n    } else {\n        if (storage_ptrs_.node_embeddings != nullptr) {\n            if 
(instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_embeddings) && full_graph_evaluation_) {\n                _load(in_memory_embeddings_);\n            } else {\n                _load(storage_ptrs_.node_embeddings);\n            }\n        }\n\n        if (storage_ptrs_.node_features != nullptr) {\n            if (instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_features) && full_graph_evaluation_) {\n                _load(in_memory_features_);\n            } else {\n                _load(storage_ptrs_.node_features);\n            }\n        }\n    }\n\n    _load(storage_ptrs_.encoded_nodes);\n\n    _load(storage_ptrs_.node_labels);\n    _load(storage_ptrs_.relation_features);\n}\n\nvoid GraphModelStorage::unload(bool write) {\n    _unload(storage_ptrs_.edges, false);\n    _unload(storage_ptrs_.train_edges, false);\n    _unload(storage_ptrs_.train_edges_dst_sort, false);\n    _unload(storage_ptrs_.validation_edges, false);\n    _unload(storage_ptrs_.test_edges, false);\n    _unload(storage_ptrs_.nodes, false);\n    _unload(storage_ptrs_.train_nodes, false);\n    _unload(storage_ptrs_.valid_nodes, false);\n    _unload(storage_ptrs_.test_nodes, false);\n    _unload(storage_ptrs_.node_embeddings, write);\n    _unload(storage_ptrs_.encoded_nodes, write);\n    _unload(storage_ptrs_.node_optimizer_state, write);\n    _unload(storage_ptrs_.node_features, false);\n    _unload(storage_ptrs_.relation_features, false);\n\n    _unload(in_memory_embeddings_, false);\n    _unload(in_memory_features_, false);\n\n    for (auto f_edges : storage_ptrs_.filter_edges) {\n        _unload(f_edges, false);\n    }\n\n    active_edges_ = torch::Tensor();\n    active_nodes_ = torch::Tensor();\n}\n\nvoid GraphModelStorage::setEdgesStorage(shared_ptr<Storage> edge_storage) { storage_ptrs_.edges = edge_storage; }\n\nvoid GraphModelStorage::setNodesStorage(shared_ptr<Storage> node_storage) { storage_ptrs_.nodes = node_storage; }\n\nEdgeList 
GraphModelStorage::getEdges(Indices indices) {\n    if (active_edges_.defined()) {\n        return active_edges_.index_select(0, indices);\n    } else {\n        return storage_ptrs_.edges->indexRead(indices);\n    }\n}\n\nEdgeList GraphModelStorage::getEdgesRange(int64_t start, int64_t size) {\n    if (active_edges_.defined()) {\n        return active_edges_.narrow(0, start, size);\n    } else {\n        return storage_ptrs_.edges->range(start, size);\n    }\n}\n\nvoid GraphModelStorage::shuffleEdges() { storage_ptrs_.edges->shuffle(); }\n\nIndices GraphModelStorage::getRandomNodeIds(int64_t size) {\n    torch::TensorOptions ind_opts = torch::TensorOptions().dtype(torch::kInt64).device(storage_ptrs_.edges->device_);\n\n    Indices ret;\n    if (useInMemorySubGraph()) {\n        if (storage_ptrs_.node_embeddings != nullptr) {\n            ret = std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_embeddings)->getRandomIds(size);\n        } else {\n            ret = std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_features)->getRandomIds(size);\n        }\n    } else {\n        ret = torch::randint(getNumNodesInMemory(), {size}, ind_opts);\n    }\n\n    return ret;\n}\n\nIndices GraphModelStorage::getNodeIdsRange(int64_t start, int64_t size) {\n    if (active_nodes_.defined()) {\n        return active_nodes_.narrow(0, start, size);\n    } else {\n        return storage_ptrs_.nodes->range(start, size).flatten(0, 1);\n    }\n}\n\ntorch::Tensor GraphModelStorage::getNodeEmbeddings(Indices indices) {\n    if (!train_ && instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_embeddings) && full_graph_evaluation_) {\n        if (in_memory_embeddings_ != nullptr) {\n            return in_memory_embeddings_->indexRead(indices);\n        } else {\n            return torch::Tensor();\n        }\n    } else {\n        if (storage_ptrs_.node_embeddings != nullptr) {\n            return 
storage_ptrs_.node_embeddings->indexRead(indices);\n        } else {\n            return torch::Tensor();\n        }\n    }\n}\n\ntorch::Tensor GraphModelStorage::getNodeEmbeddingsRange(int64_t start, int64_t size) {\n    if (storage_ptrs_.node_embeddings != nullptr) {\n        return storage_ptrs_.node_embeddings->range(start, size);\n    } else {\n        return torch::Tensor();\n    }\n}\n\ntorch::Tensor GraphModelStorage::getEncodedNodes(Indices indices) {\n    if (storage_ptrs_.encoded_nodes != nullptr) {\n        return storage_ptrs_.encoded_nodes->indexRead(indices);\n    } else {\n        return torch::Tensor();\n    }\n}\n\ntorch::Tensor GraphModelStorage::getEncodedNodesRange(int64_t start, int64_t size) {\n    if (storage_ptrs_.encoded_nodes != nullptr) {\n        return storage_ptrs_.encoded_nodes->range(start, size);\n    } else {\n        return torch::Tensor();\n    }\n}\n\ntorch::Tensor GraphModelStorage::getNodeFeatures(Indices indices) {\n    if (!train_ && instance_of<Storage, PartitionBufferStorage>(storage_ptrs_.node_features) && full_graph_evaluation_) {\n        if (in_memory_features_ != nullptr) {\n            return in_memory_features_->indexRead(indices);\n\n        } else {\n            return torch::Tensor();\n        }\n    } else {\n        if (storage_ptrs_.node_features != nullptr) {\n            return storage_ptrs_.node_features->indexRead(indices);\n        } else {\n            return torch::Tensor();\n        }\n    }\n}\n\ntorch::Tensor GraphModelStorage::getNodeFeaturesRange(int64_t start, int64_t size) {\n    if (storage_ptrs_.node_features != nullptr) {\n        return storage_ptrs_.node_features->range(start, size);\n    } else {\n        return torch::Tensor();\n    }\n}\n\ntorch::Tensor GraphModelStorage::getNodeLabels(Indices indices) {\n    if (storage_ptrs_.node_labels != nullptr) {\n        return storage_ptrs_.node_labels->indexRead(indices);\n    } else {\n        return torch::Tensor();\n    }\n}\n\ntorch::Tensor 
GraphModelStorage::getNodeLabelsRange(int64_t start, int64_t size) {\n    if (storage_ptrs_.node_labels != nullptr) {\n        return storage_ptrs_.node_labels->range(start, size);\n    } else {\n        return torch::Tensor();\n    }\n}\n\nvoid GraphModelStorage::updatePutNodeEmbeddings(Indices indices, torch::Tensor embeddings) { storage_ptrs_.node_embeddings->indexPut(indices, embeddings); }\n\nvoid GraphModelStorage::updateAddNodeEmbeddings(Indices indices, torch::Tensor values) { storage_ptrs_.node_embeddings->indexAdd(indices, values); }\n\nvoid GraphModelStorage::updatePutEncodedNodes(Indices indices, torch::Tensor values) { storage_ptrs_.encoded_nodes->indexPut(indices, values); }\n\nvoid GraphModelStorage::updatePutEncodedNodesRange(int64_t start, int64_t size, torch::Tensor values) {\n    storage_ptrs_.encoded_nodes->rangePut(start, size, values);\n}\n\nOptimizerState GraphModelStorage::getNodeEmbeddingState(Indices indices) {\n    if (storage_ptrs_.node_optimizer_state != nullptr) {\n        return storage_ptrs_.node_optimizer_state->indexRead(indices);\n    } else {\n        return torch::Tensor();\n    }\n}\n\nOptimizerState GraphModelStorage::getNodeEmbeddingStateRange(int64_t start, int64_t size) {\n    if (storage_ptrs_.node_optimizer_state != nullptr) {\n        return storage_ptrs_.node_optimizer_state->range(start, size);\n    } else {\n        return torch::Tensor();\n    }\n}\n\nvoid GraphModelStorage::updatePutNodeEmbeddingState(Indices indices, OptimizerState state) {\n    if (storage_ptrs_.node_optimizer_state != nullptr) {\n        storage_ptrs_.node_optimizer_state->indexPut(indices, state);\n    }\n}\n\nvoid GraphModelStorage::updateAddNodeEmbeddingState(Indices indices, torch::Tensor values) {\n    if (storage_ptrs_.node_optimizer_state != nullptr) {\n        storage_ptrs_.node_optimizer_state->indexAdd(indices, values);\n    }\n}\n\nbool GraphModelStorage::embeddingsOffDevice() {\n    if (storage_ptrs_.node_embeddings != nullptr) {\n    
    return storage_ptrs_.node_embeddings->device_ != torch::kCUDA;\n    } else if (storage_ptrs_.node_features != nullptr) {\n        return storage_ptrs_.node_features->device_ != torch::kCUDA;\n    } else {\n        return false;\n    }\n}\n\nvoid GraphModelStorage::initializeInMemorySubGraph(torch::Tensor buffer_state, int num_hash_maps) {\n    if (useInMemorySubGraph()) {\n        current_subgraph_state_ = std::make_shared<InMemorySubgraphState>();\n\n        buffer_state = buffer_state.to(torch::kInt64);\n\n        int buffer_size = buffer_state.size(0);\n        int num_edge_buckets_in_mem = buffer_size * buffer_size;\n        int num_partitions = getNumPartitions();\n\n        torch::Tensor new_in_mem_partition_ids = buffer_state;\n        auto new_in_mem_partition_ids_accessor = new_in_mem_partition_ids.accessor<int64_t, 1>();\n\n        torch::Tensor in_mem_edge_bucket_ids = torch::zeros({num_edge_buckets_in_mem}, torch::kInt64);\n        torch::Tensor in_mem_edge_bucket_sizes = torch::zeros({num_edge_buckets_in_mem}, torch::kInt64);\n        torch::Tensor global_edge_bucket_starts = torch::zeros({num_edge_buckets_in_mem}, torch::kInt64);\n\n        auto in_mem_edge_bucket_ids_accessor = in_mem_edge_bucket_ids.accessor<int64_t, 1>();\n        auto in_mem_edge_bucket_sizes_accessor = in_mem_edge_bucket_sizes.accessor<int64_t, 1>();\n        auto global_edge_bucket_starts_accessor = global_edge_bucket_starts.accessor<int64_t, 1>();\n\n        // TODO we don't need to do this every time\n        std::vector<int64_t> edge_bucket_sizes_ = storage_ptrs_.edges->getEdgeBucketSizes();\n        torch::Tensor edge_bucket_sizes = torch::from_blob(edge_bucket_sizes_.data(), {(int)edge_bucket_sizes_.size()}, torch::kInt64);\n        torch::Tensor edge_bucket_ends_disk = edge_bucket_sizes.cumsum(0);\n        torch::Tensor edge_bucket_starts_disk = edge_bucket_ends_disk - edge_bucket_sizes;\n        auto edge_bucket_sizes_accessor = edge_bucket_sizes.accessor<int64_t, 
1>();\n        auto edge_bucket_starts_disk_accessor = edge_bucket_starts_disk.accessor<int64_t, 1>();\n\n#pragma omp parallel for\n        for (int i = 0; i < buffer_size; i++) {\n            for (int j = 0; j < buffer_size; j++) {\n                int64_t edge_bucket_id = new_in_mem_partition_ids_accessor[i] * num_partitions + new_in_mem_partition_ids_accessor[j];\n                int64_t edge_bucket_size = edge_bucket_sizes_accessor[edge_bucket_id];\n                int64_t edge_bucket_start = edge_bucket_starts_disk_accessor[edge_bucket_id];\n\n                int idx = i * buffer_size + j;\n                in_mem_edge_bucket_ids_accessor[idx] = edge_bucket_id;\n                in_mem_edge_bucket_sizes_accessor[idx] = edge_bucket_size;\n                global_edge_bucket_starts_accessor[idx] = edge_bucket_start;\n            }\n        }\n\n        torch::Tensor in_mem_edge_bucket_starts = in_mem_edge_bucket_sizes.cumsum(0);\n        int64_t total_size = in_mem_edge_bucket_starts[-1].item<int64_t>();\n        in_mem_edge_bucket_starts = in_mem_edge_bucket_starts - in_mem_edge_bucket_sizes;\n\n        auto in_mem_edge_bucket_starts_accessor = in_mem_edge_bucket_starts.accessor<int64_t, 1>();\n\n        current_subgraph_state_->all_in_memory_edges_ = torch::empty({total_size, storage_ptrs_.edges->dim1_size_}, torch::kInt64);\n\n#pragma omp parallel for\n        for (int i = 0; i < num_edge_buckets_in_mem; i++) {\n            int64_t edge_bucket_size = in_mem_edge_bucket_sizes_accessor[i];\n            int64_t edge_bucket_start = global_edge_bucket_starts_accessor[i];\n            int64_t local_offset = in_mem_edge_bucket_starts_accessor[i];\n\n            current_subgraph_state_->all_in_memory_edges_.narrow(0, local_offset, edge_bucket_size) =\n                storage_ptrs_.edges->range(edge_bucket_start, edge_bucket_size);\n        }\n\n        if (storage_ptrs_.node_embeddings != nullptr) {\n            current_subgraph_state_->global_to_local_index_map_ =\n    
            std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_embeddings)->getGlobalToLocalMap(true);\n        } else if (storage_ptrs_.node_features != nullptr) {\n            current_subgraph_state_->global_to_local_index_map_ =\n                std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_features)->getGlobalToLocalMap(true);\n        }\n\n        torch::Tensor mapped_edges;\n        torch::Tensor mapped_edges_dst_sort;\n        if (storage_ptrs_.edges->dim1_size_ == 3) {\n            mapped_edges =\n                torch::stack({current_subgraph_state_->global_to_local_index_map_.index_select(0, current_subgraph_state_->all_in_memory_edges_.select(1, 0)),\n                              current_subgraph_state_->all_in_memory_edges_.select(1, 1),\n                              current_subgraph_state_->global_to_local_index_map_.index_select(0, current_subgraph_state_->all_in_memory_edges_.select(1, -1))})\n                    .transpose(0, 1);\n        } else if (storage_ptrs_.edges->dim1_size_ == 2) {\n            mapped_edges =\n                torch::stack({current_subgraph_state_->global_to_local_index_map_.index_select(0, current_subgraph_state_->all_in_memory_edges_.select(1, 0)),\n                              current_subgraph_state_->global_to_local_index_map_.index_select(0, current_subgraph_state_->all_in_memory_edges_.select(1, -1))})\n                    .transpose(0, 1);\n        } else {\n            // TODO use a function for logging errors and throwing expections\n            SPDLOG_ERROR(\"Unexpected number of edge columns\");\n            std::runtime_error(\"Unexpected number of edge columns\");\n        }\n\n        current_subgraph_state_->all_in_memory_mapped_edges_ = mapped_edges;\n\n        mapped_edges = merge_sorted_edge_buckets(mapped_edges, in_mem_edge_bucket_starts, buffer_size, true);\n        mapped_edges_dst_sort = merge_sorted_edge_buckets(mapped_edges, in_mem_edge_bucket_starts, 
buffer_size, false);\n\n        mapped_edges = mapped_edges.to(torch::kInt64);\n        mapped_edges_dst_sort = mapped_edges_dst_sort.to(torch::kInt64);\n\n        if (current_subgraph_state_->in_memory_subgraph_ != nullptr) {\n            current_subgraph_state_->in_memory_subgraph_ = nullptr;\n        }\n\n        current_subgraph_state_->in_memory_subgraph_ = std::make_shared<MariusGraph>(mapped_edges, mapped_edges_dst_sort, getNumNodesInMemory(), num_hash_maps);\n\n        current_subgraph_state_->in_memory_partition_ids_ = new_in_mem_partition_ids;\n        current_subgraph_state_->in_memory_edge_bucket_ids_ = in_mem_edge_bucket_ids;\n        current_subgraph_state_->in_memory_edge_bucket_sizes_ = in_mem_edge_bucket_sizes;\n        current_subgraph_state_->in_memory_edge_bucket_starts_ = in_mem_edge_bucket_starts;\n\n        if (prefetch_) {\n            if (hasSwap()) {\n                // update next_subgraph_state_ in background\n                getNextSubGraph();\n            }\n        }\n    } else {\n        // Either nothing buffered (in memory training) or eval and doing full graph evaluation\n        current_subgraph_state_ = std::make_shared<InMemorySubgraphState>();\n\n        bool should_sort = false;\n\n        EdgeList src_sort;\n        EdgeList dst_sort;\n        if (storage_ptrs_.train_edges != nullptr) {\n            src_sort = storage_ptrs_.train_edges->range(0, storage_ptrs_.train_edges->getDim0()).to(torch::kInt64);\n            if (storage_ptrs_.train_edges_dst_sort != nullptr) {\n                dst_sort = storage_ptrs_.train_edges_dst_sort->range(0, storage_ptrs_.train_edges_dst_sort->getDim0()).to(torch::kInt64);\n            } else {\n                dst_sort = storage_ptrs_.train_edges->range(0, storage_ptrs_.train_edges->getDim0()).to(torch::kInt64);\n                should_sort = true;\n            }\n        } else {\n            src_sort = storage_ptrs_.edges->range(0, storage_ptrs_.edges->getDim0()).to(torch::kInt64);\n         
   dst_sort = storage_ptrs_.edges->range(0, storage_ptrs_.edges->getDim0()).to(torch::kInt64);\n            should_sort = true;\n        }\n\n        if (should_sort) {\n            src_sort = src_sort.index_select(0, torch::argsort(src_sort.select(1, 0))).to(torch::kInt64);\n            dst_sort = dst_sort.index_select(0, torch::argsort(dst_sort.select(1, -1))).to(torch::kInt64);\n        }\n\n        current_subgraph_state_->in_memory_subgraph_ = std::make_shared<MariusGraph>(src_sort, dst_sort, getNumNodesInMemory(), num_hash_maps);\n    }\n}\n\nvoid GraphModelStorage::updateInMemorySubGraph() {\n    if (prefetch_) {\n        // wait until the prefetching has been completed\n        std::unique_lock lock(*subgraph_lock_);\n        subgraph_cv_->wait(lock, [this] { return prefetch_complete_ == true; });\n        // need to wait for the subgraph to be prefetched to perform the swap, otherwise the prefetched buffer_index_map may be incorrect\n        performSwap();\n        // free previous subgraph\n        current_subgraph_state_->in_memory_subgraph_ = nullptr;\n        current_subgraph_state_ = nullptr;\n\n        current_subgraph_state_ = next_subgraph_state_;\n        next_subgraph_state_ = nullptr;\n        prefetch_complete_ = false;\n\n        if (hasSwap()) {\n            // update next_subgraph_state_ in background\n            getNextSubGraph();\n        }\n    } else {\n        std::pair<std::vector<int>, std::vector<int>> current_swap_ids = getNextSwapIds();\n        performSwap();\n        updateInMemorySubGraph_(current_subgraph_state_, current_swap_ids);\n    }\n}\n\nvoid GraphModelStorage::getNextSubGraph() {\n    std::pair<std::vector<int>, std::vector<int>> next_swap_ids = getNextSwapIds();\n    next_subgraph_state_ = std::make_shared<InMemorySubgraphState>();\n    next_subgraph_state_->in_memory_subgraph_ = nullptr;\n    std::thread(&GraphModelStorage::updateInMemorySubGraph_, this, next_subgraph_state_, next_swap_ids).detach();\n}\n\nvoid 
GraphModelStorage::updateInMemorySubGraph_(shared_ptr<InMemorySubgraphState> subgraph, std::pair<std::vector<int>, std::vector<int>> swap_ids) {\n    if (prefetch_) {\n        subgraph_lock_->lock();\n    }\n\n    std::vector<int> evict_partition_ids = std::get<0>(swap_ids);\n    std::vector<int> admit_partition_ids = std::get<1>(swap_ids);\n\n    torch::Tensor admit_ids_tensor = torch::tensor(admit_partition_ids, torch::kCPU);\n\n    int buffer_size = current_subgraph_state_->in_memory_partition_ids_.size(0);\n    int num_edge_buckets_in_mem = current_subgraph_state_->in_memory_edge_bucket_ids_.size(0);\n    int num_partitions = getNumPartitions();\n    int num_swap_partitions = evict_partition_ids.size();\n    int num_remaining_partitions = buffer_size - num_swap_partitions;\n\n    // get edge buckets that will be kept in memory\n    torch::Tensor keep_mask = torch::ones({num_edge_buckets_in_mem}, torch::kBool);\n    auto accessor_keep_mask = keep_mask.accessor<bool, 1>();\n    auto accessor_in_memory_edge_bucket_ids_ = current_subgraph_state_->in_memory_edge_bucket_ids_.accessor<int64_t, 1>();\n\n#pragma omp parallel for\n    for (int i = 0; i < num_edge_buckets_in_mem; i++) {\n        int64_t edge_bucket_id = accessor_in_memory_edge_bucket_ids_[i];\n        int64_t src_partition = edge_bucket_id / num_partitions;\n        int64_t dst_partition = edge_bucket_id % num_partitions;\n\n        for (int j = 0; j < num_swap_partitions; j++) {\n            if (src_partition == evict_partition_ids[j] || dst_partition == evict_partition_ids[j]) {\n                accessor_keep_mask[i] = false;\n            }\n        }\n    }\n\n    torch::Tensor in_mem_edge_bucket_ids = current_subgraph_state_->in_memory_edge_bucket_ids_.masked_select(keep_mask);\n    torch::Tensor in_mem_edge_bucket_sizes = current_subgraph_state_->in_memory_edge_bucket_sizes_.masked_select(keep_mask);\n    torch::Tensor local_or_global_edge_bucket_starts = 
current_subgraph_state_->in_memory_edge_bucket_starts_.masked_select(keep_mask);\n\n    // get new in memory partition ids\n    keep_mask = torch::ones({buffer_size}, torch::kBool);\n    accessor_keep_mask = keep_mask.accessor<bool, 1>();\n    auto accessor_in_memory_partition_ids_ = current_subgraph_state_->in_memory_partition_ids_.accessor<int64_t, 1>();\n\n#pragma omp parallel for\n    for (int i = 0; i < buffer_size; i++) {\n        int64_t partition_id = accessor_in_memory_partition_ids_[i];\n\n        for (int j = 0; j < num_swap_partitions; j++) {\n            if (partition_id == evict_partition_ids[j]) {\n                accessor_keep_mask[i] = false;\n                break;\n            }\n        }\n    }\n\n    torch::Tensor old_in_mem_partition_ids = current_subgraph_state_->in_memory_partition_ids_.masked_select(keep_mask);\n    torch::Tensor new_in_mem_partition_ids = current_subgraph_state_->in_memory_partition_ids_.masked_scatter(~keep_mask, admit_ids_tensor);\n    auto old_in_mem_partition_ids_accessor = old_in_mem_partition_ids.accessor<int64_t, 1>();\n    auto new_in_mem_partition_ids_accessor = new_in_mem_partition_ids.accessor<int64_t, 1>();\n\n    // get new incoming edge buckets\n    int num_new_edge_buckets = num_swap_partitions * (num_remaining_partitions + buffer_size);\n\n    torch::Tensor new_edge_bucket_ids = torch::zeros({num_new_edge_buckets}, torch::kInt64);\n    torch::Tensor new_edge_bucket_sizes = torch::zeros({num_new_edge_buckets}, torch::kInt64);\n    torch::Tensor new_global_edge_bucket_starts = torch::zeros({num_new_edge_buckets}, torch::kInt64);\n\n    auto new_edge_bucket_ids_accessor = new_edge_bucket_ids.accessor<int64_t, 1>();\n    auto new_edge_bucket_sizes_accessor = new_edge_bucket_sizes.accessor<int64_t, 1>();\n    auto new_global_edge_bucket_starts_accessor = new_global_edge_bucket_starts.accessor<int64_t, 1>();\n\n    // TODO we don't need to do this every time\n    std::vector<int64_t> edge_bucket_sizes_ = 
storage_ptrs_.edges->getEdgeBucketSizes();\n    torch::Tensor edge_bucket_sizes = torch::from_blob(edge_bucket_sizes_.data(), {(int)edge_bucket_sizes_.size()}, torch::kInt64);\n    torch::Tensor edge_bucket_ends_disk = edge_bucket_sizes.cumsum(0);\n    torch::Tensor edge_bucket_starts_disk = edge_bucket_ends_disk - edge_bucket_sizes;\n    auto edge_bucket_sizes_accessor = edge_bucket_sizes.accessor<int64_t, 1>();\n    auto edge_bucket_starts_disk_accessor = edge_bucket_starts_disk.accessor<int64_t, 1>();\n\n#pragma omp parallel for\n    for (int i = 0; i < num_remaining_partitions; i++) {\n        for (int j = 0; j < num_swap_partitions; j++) {\n            int64_t edge_bucket_id = old_in_mem_partition_ids_accessor[i] * num_partitions + admit_partition_ids[j];\n            int64_t edge_bucket_size = edge_bucket_sizes_accessor[edge_bucket_id];\n            int64_t edge_bucket_start = edge_bucket_starts_disk_accessor[edge_bucket_id];\n\n            int idx = i * num_swap_partitions + j;\n            new_edge_bucket_ids_accessor[idx] = edge_bucket_id;\n            new_edge_bucket_sizes_accessor[idx] = edge_bucket_size;\n            new_global_edge_bucket_starts_accessor[idx] = edge_bucket_start;\n        }\n    }\n\n    int offset = num_swap_partitions * num_remaining_partitions;\n\n#pragma omp parallel for\n    for (int i = 0; i < buffer_size; i++) {\n        for (int j = 0; j < num_swap_partitions; j++) {\n            int64_t edge_bucket_id = admit_partition_ids[j] * num_partitions + new_in_mem_partition_ids_accessor[i];\n            int64_t edge_bucket_size = edge_bucket_sizes_accessor[edge_bucket_id];\n            int64_t edge_bucket_start = edge_bucket_starts_disk_accessor[edge_bucket_id];\n\n            int idx = offset + i * num_swap_partitions + j;\n            new_edge_bucket_ids_accessor[idx] = edge_bucket_id;\n            new_edge_bucket_sizes_accessor[idx] = edge_bucket_size;\n            new_global_edge_bucket_starts_accessor[idx] = edge_bucket_start;\n   
     }\n    }\n\n    // concatenate old and new\n    in_mem_edge_bucket_ids = torch::cat({in_mem_edge_bucket_ids, new_edge_bucket_ids});\n    in_mem_edge_bucket_sizes = torch::cat({in_mem_edge_bucket_sizes, new_edge_bucket_sizes});\n    local_or_global_edge_bucket_starts = torch::cat({local_or_global_edge_bucket_starts, new_global_edge_bucket_starts});\n\n    torch::Tensor in_mem_mask = torch::ones({num_edge_buckets_in_mem - num_new_edge_buckets}, torch::kBool);\n    in_mem_mask = torch::cat({in_mem_mask, torch::zeros({num_new_edge_buckets}, torch::kBool)});\n\n    // put the ids in the correct order so the mapped edges remain sorted\n    torch::Tensor src_ids_order = torch::zeros({num_edge_buckets_in_mem}, torch::kInt64);\n    auto src_ids_order_accessor = src_ids_order.accessor<int64_t, 1>();\n\n#pragma omp parallel for\n    for (int i = 0; i < buffer_size; i++) {\n        for (int j = 0; j < buffer_size; j++) {\n            int64_t edge_bucket_id = new_in_mem_partition_ids_accessor[i] * num_partitions + new_in_mem_partition_ids_accessor[j];\n\n            int idx = i * buffer_size + j;\n            src_ids_order_accessor[idx] = edge_bucket_id;\n        }\n    }\n\n    // TODO: all these argsorts can be done with one omp for loop, probably faster, same with masked_selects above\n    torch::Tensor arg_sort = torch::argsort(in_mem_edge_bucket_ids);\n    arg_sort = (arg_sort.index_select(0, torch::argsort(torch::argsort(src_ids_order))));\n    in_mem_edge_bucket_ids = (in_mem_edge_bucket_ids.index_select(0, arg_sort));\n    in_mem_edge_bucket_sizes = (in_mem_edge_bucket_sizes.index_select(0, arg_sort));\n    local_or_global_edge_bucket_starts = (local_or_global_edge_bucket_starts.index_select(0, arg_sort));\n    in_mem_mask = (in_mem_mask.index_select(0, arg_sort));\n\n    // with everything in order grab the edge buckets\n    torch::Tensor in_mem_edge_bucket_starts = in_mem_edge_bucket_sizes.cumsum(0);\n    int64_t total_size = 
in_mem_edge_bucket_starts[-1].item<int64_t>();\n    in_mem_edge_bucket_starts = in_mem_edge_bucket_starts - in_mem_edge_bucket_sizes;\n\n    auto in_mem_edge_bucket_sizes_accessor = in_mem_edge_bucket_sizes.accessor<int64_t, 1>();\n    auto local_or_global_edge_bucket_starts_accessor = local_or_global_edge_bucket_starts.accessor<int64_t, 1>();\n    auto in_mem_mask_accessor = in_mem_mask.accessor<bool, 1>();\n    auto in_mem_edge_bucket_starts_accessor = in_mem_edge_bucket_starts.accessor<int64_t, 1>();\n\n    torch::Tensor new_all_in_memory_edges = torch::empty({total_size, storage_ptrs_.edges->dim1_size_}, torch::kInt64);\n\n// get the edges\n#pragma omp parallel for\n    for (int i = 0; i < num_edge_buckets_in_mem; i++) {\n        int64_t edge_bucket_size = in_mem_edge_bucket_sizes_accessor[i];\n        int64_t edge_bucket_start = local_or_global_edge_bucket_starts_accessor[i];\n        bool in_mem = in_mem_mask_accessor[i];\n        int64_t local_offset = in_mem_edge_bucket_starts_accessor[i];\n\n        if (in_mem) {\n            new_all_in_memory_edges.narrow(0, local_offset, edge_bucket_size) =\n                current_subgraph_state_->all_in_memory_edges_.narrow(0, edge_bucket_start, edge_bucket_size);\n        } else {\n            new_all_in_memory_edges.narrow(0, local_offset, edge_bucket_size) = storage_ptrs_.edges->range(edge_bucket_start, edge_bucket_size);\n        }\n    }\n\n    subgraph->all_in_memory_edges_ = new_all_in_memory_edges;\n\n    if (storage_ptrs_.node_embeddings != nullptr) {\n        subgraph->global_to_local_index_map_ =\n            std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_embeddings)->getGlobalToLocalMap(!prefetch_);\n    } else if (storage_ptrs_.node_features != nullptr) {\n        subgraph->global_to_local_index_map_ = std::dynamic_pointer_cast<PartitionBufferStorage>(storage_ptrs_.node_features)->getGlobalToLocalMap(!prefetch_);\n    }\n\n    torch::Tensor mapped_edges;\n    torch::Tensor 
mapped_edges_dst_sort;\n    if (storage_ptrs_.edges->dim1_size_ == 3) {\n        mapped_edges = torch::stack({subgraph->global_to_local_index_map_.index_select(0, subgraph->all_in_memory_edges_.select(1, 0)),\n                                     subgraph->all_in_memory_edges_.select(1, 1),\n                                     subgraph->global_to_local_index_map_.index_select(0, subgraph->all_in_memory_edges_.select(1, -1))})\n                           .transpose(0, 1);\n    } else if (storage_ptrs_.edges->dim1_size_ == 2) {\n        mapped_edges = torch::stack({subgraph->global_to_local_index_map_.index_select(0, subgraph->all_in_memory_edges_.select(1, 0)),\n                                     subgraph->global_to_local_index_map_.index_select(0, subgraph->all_in_memory_edges_.select(1, -1))})\n                           .transpose(0, 1);\n    } else {\n        // TODO use a function for logging errors and throwing expections\n        SPDLOG_ERROR(\"Unexpected number of edge columns\");\n        std::runtime_error(\"Unexpected number of edge columns\");\n    }\n\n    //    assert((mapped_edges == -1).nonzero().size(0) == 0);\n    //    assert((mapped_edges_dst_sort == -1).nonzero().size(0) == 0);\n\n    subgraph->all_in_memory_mapped_edges_ = mapped_edges;\n\n    mapped_edges = merge_sorted_edge_buckets(mapped_edges, in_mem_edge_bucket_starts, buffer_size, true);\n    mapped_edges_dst_sort = merge_sorted_edge_buckets(mapped_edges, in_mem_edge_bucket_starts, buffer_size, false);\n\n    mapped_edges = mapped_edges.to(torch::kInt64);\n    mapped_edges_dst_sort = mapped_edges_dst_sort.to(torch::kInt64);\n\n    int num_hash_maps = current_subgraph_state_->in_memory_subgraph_->num_hash_maps_;\n\n    if (subgraph->in_memory_subgraph_ != nullptr) {\n        subgraph->in_memory_subgraph_ = nullptr;\n    }\n\n    subgraph->in_memory_subgraph_ = std::make_shared<MariusGraph>(mapped_edges, mapped_edges_dst_sort, getNumNodesInMemory(), num_hash_maps);\n\n    // update 
state\n    subgraph->in_memory_partition_ids_ = new_in_mem_partition_ids;\n    subgraph->in_memory_edge_bucket_ids_ = in_mem_edge_bucket_ids;\n    subgraph->in_memory_edge_bucket_sizes_ = in_mem_edge_bucket_sizes;\n    subgraph->in_memory_edge_bucket_starts_ = in_mem_edge_bucket_starts;\n\n    if (prefetch_) {\n        prefetch_complete_ = true;\n        subgraph_lock_->unlock();\n        subgraph_cv_->notify_all();\n    }\n}\n\nEdgeList GraphModelStorage::merge_sorted_edge_buckets(EdgeList edges, torch::Tensor starts, int buffer_size, bool src) {\n    int sort_dim = 0;\n    if (!src) {\n        sort_dim = -1;\n    }\n    return edges.index_select(0, torch::argsort(edges.select(1, sort_dim)));\n}\n\nvoid GraphModelStorage::sortAllEdges() {\n    if (!useInMemorySubGraph()) {\n        std::vector<EdgeList> additional_edges = {};\n\n        if (storage_ptrs_.train_edges != nullptr) {\n            storage_ptrs_.train_edges->load();\n            additional_edges.emplace_back(storage_ptrs_.train_edges->range(0, storage_ptrs_.train_edges->getDim0()));\n        }\n\n        if (storage_ptrs_.validation_edges != nullptr) {\n            storage_ptrs_.validation_edges->load();\n            additional_edges.emplace_back(storage_ptrs_.validation_edges->range(0, storage_ptrs_.validation_edges->getDim0()));\n        }\n\n        if (storage_ptrs_.test_edges != nullptr) {\n            storage_ptrs_.test_edges->load();\n            additional_edges.emplace_back(storage_ptrs_.test_edges->range(0, storage_ptrs_.test_edges->getDim0()));\n        }\n\n        for (auto f_edges : storage_ptrs_.filter_edges) {\n            f_edges->load();\n            additional_edges.emplace_back(f_edges->range(0, f_edges->getDim0()));\n        }\n\n        current_subgraph_state_->in_memory_subgraph_->sortAllEdges(torch::cat(additional_edges));\n\n        for (auto f_edges : storage_ptrs_.filter_edges) {\n            f_edges->unload();\n        }\n    } else {\n        
current_subgraph_state_->in_memory_subgraph_->sortAllEdges(current_subgraph_state_->in_memory_subgraph_->src_sorted_edges_);\n    }\n}"
  },
  {
    "path": "src/cpp/src/storage/io.cpp",
    "content": "//\n// Created by jasonmohoney on 10/4/19.\n//\n\n#include \"storage/io.h\"\n\n#include \"configuration/constants.h\"\n#include \"nn/initialization.h\"\n#include \"nn/model.h\"\n#include \"reporting/logger.h\"\n\nstd::tuple<shared_ptr<Storage>, shared_ptr<Storage>, shared_ptr<Storage>, shared_ptr<Storage>> initializeEdges(shared_ptr<StorageConfig> storage_config,\n                                                                                                               LearningTask learning_task) {\n    string train_filename =\n        storage_config->dataset->dataset_dir + PathConstants::edges_directory + PathConstants::training + PathConstants::edges_file + PathConstants::file_ext;\n    string valid_filename =\n        storage_config->dataset->dataset_dir + PathConstants::edges_directory + PathConstants::validation + PathConstants::edges_file + PathConstants::file_ext;\n    string test_filename =\n        storage_config->dataset->dataset_dir + PathConstants::edges_directory + PathConstants::test + PathConstants::edges_file + PathConstants::file_ext;\n\n    string train_dst_sort_filename = storage_config->dataset->dataset_dir + PathConstants::edges_directory + PathConstants::training +\n                                     PathConstants::edges_file + PathConstants::dst_sort + PathConstants::file_ext;\n\n    shared_ptr<Storage> train_edge_storage = nullptr;\n    shared_ptr<Storage> train_edge_storage_dst_sort = nullptr;\n    shared_ptr<Storage> valid_edge_storage = nullptr;\n    shared_ptr<Storage> test_edge_storage = nullptr;\n\n    int64_t num_train = 0;\n    int64_t num_valid = 0;\n    int64_t num_test = 0;\n    if (learning_task == LearningTask::LINK_PREDICTION) {\n        num_train = storage_config->dataset->num_train;\n        num_valid = storage_config->dataset->num_valid;\n        num_test = storage_config->dataset->num_test;\n    } else if (learning_task == LearningTask::NODE_CLASSIFICATION) {\n        num_train = 
storage_config->dataset->num_edges;\n    }\n\n    torch::Dtype dtype = storage_config->edges->options->dtype;\n\n    int num_columns = 3;\n    if (storage_config->dataset->num_relations == 1) {\n        num_columns = 2;\n    }\n\n    switch (storage_config->edges->type) {\n        case StorageBackend::PARTITION_BUFFER: {\n            SPDLOG_ERROR(\"Backend type not available for edges.\");\n            throw std::runtime_error(\"\");\n        }\n        case StorageBackend::FLAT_FILE: {\n            if (num_train != -1) {\n                train_edge_storage = std::make_shared<FlatFile>(train_filename, num_train, num_columns, dtype);\n            }\n            if (num_valid != -1) {\n                valid_edge_storage = std::make_shared<FlatFile>(valid_filename, num_valid, num_columns, dtype);\n            }\n            if (num_test != -1) {\n                test_edge_storage = std::make_shared<FlatFile>(test_filename, num_test, num_columns, dtype);\n            }\n            break;\n        }\n        case StorageBackend::HOST_MEMORY: {\n            if (num_train != -1) {\n                train_edge_storage = std::make_shared<InMemory>(train_filename, num_train, num_columns, dtype, torch::kCPU);\n                if (!storage_config->train_edges_pre_sorted) {\n                    copyFile(train_filename, train_dst_sort_filename);\n                }\n                train_edge_storage_dst_sort = std::make_shared<InMemory>(train_dst_sort_filename, num_train, num_columns, dtype, torch::kCPU);\n            }\n            if (num_valid != -1) {\n                valid_edge_storage = std::make_shared<InMemory>(valid_filename, num_valid, num_columns, dtype, torch::kCPU);\n            }\n            if (num_test != -1) {\n                test_edge_storage = std::make_shared<InMemory>(test_filename, num_test, num_columns, dtype, torch::kCPU);\n            }\n            break;\n        }\n        case StorageBackend::DEVICE_MEMORY: {\n            if (num_train != -1) {\n   
             train_edge_storage = std::make_shared<InMemory>(train_filename, num_train, num_columns, dtype, storage_config->device_type);\n                if (!storage_config->train_edges_pre_sorted) {\n                    copyFile(train_filename, train_dst_sort_filename);\n                }\n                train_edge_storage_dst_sort = std::make_shared<InMemory>(train_dst_sort_filename, num_train, num_columns, dtype, storage_config->device_type);\n            }\n            if (num_valid != -1) {\n                valid_edge_storage = std::make_shared<InMemory>(valid_filename, num_valid, num_columns, dtype, storage_config->device_type);\n            }\n            if (num_test != -1) {\n                test_edge_storage = std::make_shared<InMemory>(test_filename, num_test, num_columns, dtype, storage_config->device_type);\n            }\n            break;\n        }\n    }\n\n    bool use_buffer = false;\n\n    if (storage_config->embeddings != nullptr) {\n        if (storage_config->embeddings->type == StorageBackend::PARTITION_BUFFER) {\n            use_buffer = true;\n        }\n    }\n\n    if (storage_config->features != nullptr) {\n        if (storage_config->features->type == StorageBackend::PARTITION_BUFFER) {\n            use_buffer = true;\n        }\n    }\n\n    if (use_buffer) {\n        string train_edges_partitions =\n            storage_config->dataset->dataset_dir + PathConstants::edges_directory + PathConstants::training + PathConstants::edge_partition_offsets_file;\n\n        string validation_edges_partitions =\n            storage_config->dataset->dataset_dir + PathConstants::edges_directory + PathConstants::validation + PathConstants::edge_partition_offsets_file;\n\n        string test_edges_partitions =\n            storage_config->dataset->dataset_dir + PathConstants::edges_directory + PathConstants::test + PathConstants::edge_partition_offsets_file;\n\n        if (train_edge_storage != nullptr) {\n            
train_edge_storage->readPartitionSizes(train_edges_partitions);\n        }\n\n        if (valid_edge_storage != nullptr) {\n            valid_edge_storage->readPartitionSizes(validation_edges_partitions);\n        }\n\n        if (test_edge_storage != nullptr) {\n            test_edge_storage->readPartitionSizes(test_edges_partitions);\n        }\n    } else {\n        if (train_edge_storage != nullptr) {\n            if (!storage_config->train_edges_pre_sorted) {\n                train_edge_storage->sort(true);\n                train_edge_storage_dst_sort->sort(false);\n            }\n        }\n    }\n\n    if (storage_config->shuffle_input) {\n        if (valid_edge_storage != nullptr) {\n            valid_edge_storage->shuffle();\n        }\n        if (test_edge_storage != nullptr) {\n            test_edge_storage->shuffle();\n        }\n    }\n\n    return std::forward_as_tuple(train_edge_storage, train_edge_storage_dst_sort, valid_edge_storage, test_edge_storage);\n}\n\nstd::tuple<shared_ptr<Storage>, shared_ptr<Storage>> initializeNodeEmbeddings(shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config,\n                                                                              bool reinitialize, bool train, shared_ptr<InitConfig> init_config) {\n    string node_embedding_filename = storage_config->model_dir + PathConstants::embeddings_file + PathConstants::file_ext;\n    string optimizer_state_filename = storage_config->model_dir + PathConstants::embeddings_state_file + PathConstants::file_ext;\n\n    if (storage_config->embeddings == nullptr || !model->has_embeddings()) {\n        return std::forward_as_tuple(nullptr, nullptr);\n    }\n\n    int64_t num_nodes = storage_config->dataset->num_nodes;\n    int embedding_dim = model->get_base_embedding_dim();\n    torch::Dtype dtype = storage_config->embeddings->options->dtype;\n\n    if (reinitialize) {\n        shared_ptr<FlatFile> init_node_embeddings = 
std::make_shared<FlatFile>(node_embedding_filename, dtype);\n        shared_ptr<FlatFile> init_optimizer_state_storage = std::make_shared<FlatFile>(optimizer_state_filename, dtype);\n\n        int64_t curr_num_nodes = 0;\n        int64_t offset = 0;\n\n        while (offset < num_nodes) {\n            if (num_nodes - offset < MAX_NODE_EMBEDDING_INIT_SIZE) {\n                curr_num_nodes = num_nodes - offset;\n            } else {\n                curr_num_nodes = MAX_NODE_EMBEDDING_INIT_SIZE;\n            }\n\n            torch::Tensor weights = initialize_subtensor(init_config, {curr_num_nodes, embedding_dim}, {num_nodes, embedding_dim}, torch::TensorOptions());\n            OptimizerState emb_state = torch::zeros_like(weights);\n            init_node_embeddings->append(weights);\n            init_optimizer_state_storage->append(emb_state);\n\n            offset += curr_num_nodes;\n        }\n    }\n\n    shared_ptr<Storage> node_embeddings = nullptr;\n    shared_ptr<Storage> optimizer_state_storage = nullptr;\n\n    switch (storage_config->embeddings->type) {\n        case StorageBackend::PARTITION_BUFFER: {\n            node_embeddings = std::make_shared<PartitionBufferStorage>(node_embedding_filename, num_nodes, embedding_dim,\n                                                                       std::dynamic_pointer_cast<PartitionBufferOptions>(storage_config->embeddings->options));\n            if (train) {\n                optimizer_state_storage = std::make_shared<PartitionBufferStorage>(\n                    optimizer_state_filename, num_nodes, embedding_dim, std::dynamic_pointer_cast<PartitionBufferOptions>(storage_config->embeddings->options));\n            }\n            break;\n        }\n        case StorageBackend::FLAT_FILE: {\n            SPDLOG_ERROR(\"Backend type not available for embeddings.\");\n            throw std::runtime_error(\"\");\n        }\n        case StorageBackend::HOST_MEMORY: {\n            node_embeddings = 
std::make_shared<InMemory>(node_embedding_filename, num_nodes, embedding_dim, dtype, torch::kCPU);\n            if (train) {\n                optimizer_state_storage = std::make_shared<InMemory>(optimizer_state_filename, num_nodes, embedding_dim, dtype, torch::kCPU);\n            }\n            break;\n        }\n        case StorageBackend::DEVICE_MEMORY: {\n            node_embeddings = std::make_shared<InMemory>(node_embedding_filename, num_nodes, embedding_dim, dtype, storage_config->device_type);\n            if (train) {\n                optimizer_state_storage = std::make_shared<InMemory>(optimizer_state_filename, num_nodes, embedding_dim, dtype, storage_config->device_type);\n            }\n            break;\n        }\n    }\n\n    return std::forward_as_tuple(node_embeddings, optimizer_state_storage);\n}\n\nstd::tuple<shared_ptr<Storage>, shared_ptr<Storage>, shared_ptr<Storage>> initializeNodeIds(shared_ptr<StorageConfig> storage_config) {\n    string train_filename =\n        storage_config->dataset->dataset_dir + PathConstants::nodes_directory + PathConstants::training + PathConstants::nodes_file + PathConstants::file_ext;\n    string valid_filename =\n        storage_config->dataset->dataset_dir + PathConstants::nodes_directory + PathConstants::validation + PathConstants::nodes_file + PathConstants::file_ext;\n    string test_filename =\n        storage_config->dataset->dataset_dir + PathConstants::nodes_directory + PathConstants::test + PathConstants::nodes_file + PathConstants::file_ext;\n\n    int64_t num_train = storage_config->dataset->num_train;\n    int64_t num_valid = storage_config->dataset->num_valid;\n    int64_t num_test = storage_config->dataset->num_test;\n    torch::Dtype dtype = storage_config->nodes->options->dtype;\n\n    shared_ptr<Storage> train_node_storage = nullptr;\n    shared_ptr<Storage> valid_node_storage = nullptr;\n    shared_ptr<Storage> test_node_storage = nullptr;\n\n    switch (storage_config->nodes->type) {\n        
case StorageBackend::PARTITION_BUFFER: {\n            SPDLOG_ERROR(\"Backend type not available for nodes.\");\n            throw std::runtime_error(\"\");\n        }\n        case StorageBackend::FLAT_FILE: {\n            SPDLOG_ERROR(\"Backend type not available for nodes.\");\n            throw std::runtime_error(\"\");\n        }\n        case StorageBackend::HOST_MEMORY: {\n            if (num_train != -1) {\n                train_node_storage = std::make_shared<InMemory>(train_filename, num_train, 1, dtype, torch::kCPU);\n            }\n            if (num_valid != -1) {\n                valid_node_storage = std::make_shared<InMemory>(valid_filename, num_valid, 1, dtype, torch::kCPU);\n            }\n            if (num_test != -1) {\n                test_node_storage = std::make_shared<InMemory>(test_filename, num_test, 1, dtype, torch::kCPU);\n            }\n\n            break;\n        }\n        case StorageBackend::DEVICE_MEMORY: {\n            if (num_train != -1) {\n                train_node_storage = std::make_shared<InMemory>(train_filename, num_train, 1, dtype, storage_config->device_type);\n            }\n            if (num_valid != -1) {\n                valid_node_storage = std::make_shared<InMemory>(valid_filename, num_valid, 1, dtype, storage_config->device_type);\n            }\n            if (num_test != -1) {\n                test_node_storage = std::make_shared<InMemory>(test_filename, num_test, 1, dtype, storage_config->device_type);\n            }\n            break;\n        }\n    }\n\n    if (storage_config->shuffle_input) {\n        if (train_node_storage != nullptr) {\n            train_node_storage->shuffle();\n        }\n        if (valid_node_storage != nullptr) {\n            valid_node_storage->shuffle();\n        }\n        if (test_node_storage != nullptr) {\n            test_node_storage->shuffle();\n        }\n    }\n\n    return std::forward_as_tuple(train_node_storage, valid_node_storage, 
test_node_storage);\n}\n\nshared_ptr<Storage> initializeRelationFeatures(shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config) {\n    string rel_features_file = storage_config->model_dir + PathConstants::features_file + PathConstants::file_ext;\n\n    int64_t num_relations = storage_config->dataset->num_relations;\n    int64_t rel_feature_dim = storage_config->dataset->rel_feature_dim;\n\n    if (rel_feature_dim == -1 || num_relations == -1 || model->decoder_ == nullptr) {\n        return nullptr;\n    }\n\n    shared_ptr<Storage> rel_features =\n        std::make_shared<InMemory>(rel_features_file, num_relations, rel_feature_dim, torch::kFloat32, storage_config->device_type);\n    rel_features->load();\n\n    return rel_features;\n}\n\nshared_ptr<Storage> initializeNodeFeatures(shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config) {\n    string node_features_file = storage_config->dataset->dataset_dir + PathConstants::nodes_directory + PathConstants::features_file + PathConstants::file_ext;\n\n    shared_ptr<Storage> node_features;\n\n    int64_t num_nodes = storage_config->dataset->num_nodes;\n    int64_t node_feature_dim = storage_config->dataset->node_feature_dim;\n\n    if (storage_config->features == nullptr || node_feature_dim == -1) {\n        return nullptr;\n    }\n    torch::Dtype dtype = storage_config->features->options->dtype;\n\n    switch (storage_config->features->type) {\n        case StorageBackend::PARTITION_BUFFER: {\n            node_features = std::make_shared<PartitionBufferStorage>(node_features_file, num_nodes, node_feature_dim,\n                                                                     std::dynamic_pointer_cast<PartitionBufferOptions>(storage_config->features->options));\n            break;\n        }\n        case StorageBackend::FLAT_FILE: {\n            SPDLOG_ERROR(\"Backend type not available for features.\");\n            throw std::runtime_error(\"\");\n        }\n        case 
StorageBackend::HOST_MEMORY: {\n            node_features = std::make_shared<InMemory>(node_features_file, num_nodes, node_feature_dim, dtype, torch::kCPU);\n            break;\n        }\n        case StorageBackend::DEVICE_MEMORY: {\n            node_features = std::make_shared<InMemory>(node_features_file, num_nodes, node_feature_dim, dtype, storage_config->device_type);\n            break;\n        }\n    }\n\n    return node_features;\n}\n\nshared_ptr<Storage> initializeNodeLabels(shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config) {\n    string node_labels_file = storage_config->dataset->dataset_dir + PathConstants::nodes_directory + PathConstants::labels_file + PathConstants::file_ext;\n\n    shared_ptr<Storage> node_labels;\n\n    int64_t num_nodes = storage_config->dataset->num_nodes;\n    int64_t num_classes = storage_config->dataset->num_classes;\n    torch::Dtype dtype = torch::kInt32;\n\n    switch (storage_config->nodes->type) {\n        case StorageBackend::PARTITION_BUFFER: {\n            SPDLOG_ERROR(\"Backend type not available for nodes/labels.\");\n            throw std::runtime_error(\"\");\n        }\n        case StorageBackend::FLAT_FILE: {\n            SPDLOG_ERROR(\"Backend type not available for nodes/labels.\");\n            throw std::runtime_error(\"\");\n        }\n        case StorageBackend::HOST_MEMORY: {\n            node_labels = std::make_shared<InMemory>(node_labels_file, num_nodes, 1, dtype, torch::kCPU);\n            break;\n        }\n        case StorageBackend::DEVICE_MEMORY: {\n            node_labels = std::make_shared<InMemory>(node_labels_file, num_nodes, 1, dtype, storage_config->device_type);\n            break;\n        }\n    }\n\n    return node_labels;\n}\n\nshared_ptr<GraphModelStorage> initializeStorageLinkPrediction(shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config, bool reinitialize, bool train,\n                                                              
shared_ptr<InitConfig> init_config) {\n    std::tuple<shared_ptr<Storage>, shared_ptr<Storage>, shared_ptr<Storage>, shared_ptr<Storage>> edge_storages =\n        initializeEdges(storage_config, model->learning_task_);\n    std::tuple<shared_ptr<Storage>, shared_ptr<Storage>> node_embeddings = initializeNodeEmbeddings(model, storage_config, reinitialize, train, init_config);\n\n    GraphModelStoragePtrs storage_ptrs = {};\n\n    storage_ptrs.train_edges = std::get<0>(edge_storages);\n    storage_ptrs.train_edges_dst_sort = std::get<1>(edge_storages);\n    storage_ptrs.validation_edges = std::get<2>(edge_storages);\n    storage_ptrs.test_edges = std::get<3>(edge_storages);\n\n    storage_ptrs.node_features = initializeNodeFeatures(model, storage_config);\n    storage_ptrs.node_embeddings = std::get<0>(node_embeddings);\n    storage_ptrs.node_optimizer_state = std::get<1>(node_embeddings);\n\n    storage_ptrs.relation_features = initializeRelationFeatures(model, storage_config);\n\n    shared_ptr<GraphModelStorage> graph_model_storage = std::make_shared<GraphModelStorage>(storage_ptrs, storage_config);\n\n    return graph_model_storage;\n}\n\nshared_ptr<GraphModelStorage> initializeStorageNodeClassification(shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config, bool reinitialize,\n                                                                  bool train, shared_ptr<InitConfig> init_config) {\n    std::tuple<shared_ptr<Storage>, shared_ptr<Storage>, shared_ptr<Storage>, shared_ptr<Storage>> edge_storages =\n        initializeEdges(storage_config, model->learning_task_);\n    std::tuple<shared_ptr<Storage>, shared_ptr<Storage>, shared_ptr<Storage>> node_id_storages = initializeNodeIds(storage_config);\n    shared_ptr<Storage> node_features = initializeNodeFeatures(model, storage_config);\n    shared_ptr<Storage> node_labels = initializeNodeLabels(model, storage_config);\n\n    GraphModelStoragePtrs storage_ptrs = {};\n\n    storage_ptrs.train_edges = 
std::get<0>(edge_storages);\n    storage_ptrs.train_edges_dst_sort = std::get<1>(edge_storages);\n    storage_ptrs.edges = storage_ptrs.train_edges;\n\n    storage_ptrs.train_nodes = std::get<0>(node_id_storages);\n    storage_ptrs.valid_nodes = std::get<1>(node_id_storages);\n    storage_ptrs.test_nodes = std::get<2>(node_id_storages);\n\n    storage_ptrs.nodes = storage_ptrs.train_nodes;\n    storage_ptrs.node_features = node_features;\n    storage_ptrs.node_labels = node_labels;\n\n    std::tuple<shared_ptr<Storage>, shared_ptr<Storage>> node_embeddings = initializeNodeEmbeddings(model, storage_config, reinitialize, train, init_config);\n    storage_ptrs.node_embeddings = std::get<0>(node_embeddings);\n    storage_ptrs.node_optimizer_state = std::get<1>(node_embeddings);\n\n    shared_ptr<GraphModelStorage> graph_model_storage = std::make_shared<GraphModelStorage>(storage_ptrs, storage_config);\n\n    return graph_model_storage;\n}\n\nshared_ptr<GraphModelStorage> initializeStorage(shared_ptr<Model> model, shared_ptr<StorageConfig> storage_config, bool reinitialize, bool train,\n                                                shared_ptr<InitConfig> init_config) {\n    if (init_config == nullptr) {\n        init_config = std::make_shared<InitConfig>();\n        init_config->type = InitDistribution::GLOROT_UNIFORM;\n    }\n\n    if (model->learning_task_ == LearningTask::LINK_PREDICTION) {\n        return initializeStorageLinkPrediction(model, storage_config, reinitialize, train, init_config);\n    } else if (model->learning_task_ == LearningTask::NODE_CLASSIFICATION) {\n        return initializeStorageNodeClassification(model, storage_config, reinitialize, train, init_config);\n    } else {\n        SPDLOG_ERROR(\"Unsupported Learning Task\");\n        throw std::runtime_error(\"\");\n    }\n}"
  },
  {
    "path": "src/cpp/src/storage/storage.cpp",
    "content": "//\n// Created by Jason Mohoney on 4/21/20.\n//\n\n#include \"storage/storage.h\"\n\n#include <fcntl.h>\n#include <unistd.h>\n\n#include <iostream>\n\n#include \"common/util.h\"\n#include \"configuration/constants.h\"\n#include \"reporting/logger.h\"\n\nusing std::ios;\nusing std::ios_base;\n\nvoid renameFile(string old_filename, string new_filename) {\n    int result = rename(old_filename.c_str(), new_filename.c_str());\n    if (result != 0) {\n        SPDLOG_ERROR(\"Unable to rename {}\\nError: {}\", old_filename, errno);\n        throw std::runtime_error(\"\");\n    }\n}\n\nvoid copyFile(string src_file, string dst_file) {\n    std::ifstream src;\n    std::ofstream dst;\n\n    src.open(src_file, ios::in | ios::binary);\n    dst.open(dst_file, ios::out | ios::binary);\n\n    dst << src.rdbuf();\n\n    src.close();\n    dst.close();\n}\n\nbool fileExists(string filename) {\n    if (FILE *file = fopen(filename.c_str(), \"r\")) {\n        fclose(file);\n        return true;\n    } else {\n        return false;\n    }\n}\n\nvoid createDir(string path, bool exist_ok) {\n    if (mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) == -1) {\n        if (errno == EEXIST) {\n            if (exist_ok) {\n                SPDLOG_DEBUG(\"{} directory already exists\", path);\n            } else {\n                SPDLOG_ERROR(\"{} directory already exists\", path);\n                throw std::runtime_error(\"\");\n            }\n        } else {\n            SPDLOG_ERROR(\"Failed to create {}\\nError: {}\", path, errno);\n            throw std::runtime_error(\"\");\n        }\n    }\n}\n\nStorage::Storage() : device_(torch::kCPU) {}\n\nPartitionBufferStorage::PartitionBufferStorage(string filename, int64_t dim0_size, int64_t dim1_size, shared_ptr<PartitionBufferOptions> options) {\n    filename_ = filename;\n    dim0_size_ = dim0_size;\n    dim1_size_ = dim1_size;\n    options_ = options;\n    dtype_ = options_->dtype;\n    initialized_ = true;\n    
loaded_ = false;\n    int64_t partition_size = ceil((double)dim0_size_ / options_->num_partitions);\n    device_ = torch::kCPU;\n\n    buffer_ = new PartitionBuffer(options_->buffer_capacity, options_->num_partitions, options_->fine_to_coarse_ratio, partition_size, dim1_size_, dim0_size_,\n                                  dtype_, filename_, options_->prefetching);\n}\n\nPartitionBufferStorage::PartitionBufferStorage(string filename, torch::Tensor data, shared_ptr<PartitionBufferOptions> options) {\n    filename_ = filename;\n    dim0_size_ = 0;\n    dim1_size_ = data.size(1);\n    options_ = options;\n    dtype_ = options_->dtype;\n    append(data);\n    initialized_ = true;\n    loaded_ = false;\n    int64_t partition_size = ceil((double)dim0_size_ / options_->num_partitions);\n    device_ = torch::kCPU;\n\n    buffer_ = new PartitionBuffer(options_->buffer_capacity, options_->num_partitions, options_->fine_to_coarse_ratio, partition_size, dim1_size_, dim0_size_,\n                                  dtype_, filename_, options_->prefetching);\n}\n\nPartitionBufferStorage::PartitionBufferStorage(string filename, shared_ptr<PartitionBufferOptions> options) {\n    filename_ = filename;\n    dim0_size_ = 0;\n    initialized_ = false;\n    loaded_ = false;\n    options_ = options;\n    dtype_ = options_->dtype;\n    int64_t partition_size = ceil((double)dim0_size_ / options_->num_partitions);\n    device_ = torch::kCPU;\n\n    buffer_ = new PartitionBuffer(options_->buffer_capacity, options_->num_partitions, options_->fine_to_coarse_ratio, partition_size, dim1_size_, dim0_size_,\n                                  dtype_, filename_, options_->prefetching);\n}\n\nvoid PartitionBufferStorage::rangePut(int64_t offset, torch::Tensor values) {\n    int fd = open(filename_.c_str(), O_RDWR | IO_FLAGS);\n    if (fd == -1) {\n        SPDLOG_ERROR(\"Unable to open {}\\nError: {}\", filename_, errno);\n        throw std::runtime_error(\"\");\n    }\n\n    int64_t dtype_size = 
get_dtype_size_wrapper(dtype_);\n    int64_t ptr_offset = offset * dim1_size_ * dtype_size;\n\n    if (pwrite_wrapper(fd, values.data_ptr(), values.size(0) * dim1_size_ * dtype_size, ptr_offset) == -1) {\n        SPDLOG_ERROR(\"Unable to write {}\\nError: {}\", filename_, errno);\n        throw std::runtime_error(\"\");\n    }\n\n    close(fd);\n}\n\nvoid PartitionBufferStorage::append(torch::Tensor values) {\n    ios::openmode flags;\n\n    if (dim0_size_ == 0) {\n        flags = ios::trunc | ios::binary;\n    } else {\n        flags = ios::binary | ios_base::app;\n    }\n\n    dim0_size_ += values.size(0);\n    dim1_size_ = values.size(1);\n    dtype_ = values.scalar_type();\n\n    std::ofstream outfile(filename_, flags);\n\n    int dtype_size = get_dtype_size_wrapper(dtype_);\n\n    outfile.write((char *)values.data_ptr(), values.size(0) * values.size(1) * dtype_size);\n\n    outfile.close();\n}\n\nPartitionBufferStorage::~PartitionBufferStorage() { delete buffer_; }\n\nvoid PartitionBufferStorage::load() {\n    if (!loaded_ && initialized_) {\n        buffer_->load();\n        loaded_ = true;\n    }\n}\n\nvoid PartitionBufferStorage::write() {\n    if (loaded_) {\n        buffer_->sync();\n    }\n}\n\nvoid PartitionBufferStorage::unload(bool perform_write) {\n    if (loaded_) {\n        buffer_->unload(perform_write);\n        loaded_ = false;\n    }\n}\n\ntorch::Tensor PartitionBufferStorage::indexRead(Indices indices) { return buffer_->indexRead(indices); }\n\nvoid PartitionBufferStorage::indexAdd(Indices indices, torch::Tensor values) { return buffer_->indexAdd(indices, values); }\n\ntorch::Tensor PartitionBufferStorage::range(int64_t offset, int64_t n) {\n    SPDLOG_ERROR(\"Unsupported operation for PartitionBufferStorage\");\n    throw std::runtime_error(\"\");\n}\n\nvoid PartitionBufferStorage::indexPut(Indices indices, torch::Tensor values) {\n    SPDLOG_ERROR(\"Unsupported operation for PartitionBufferStorage\");\n    throw 
std::runtime_error(\"\");\n}\n\nvoid PartitionBufferStorage::rangePut(int64_t offset, int64_t n, torch::Tensor values) {\n    SPDLOG_ERROR(\"Unsupported operation for PartitionBufferStorage\");\n    throw std::runtime_error(\"\");\n}\n\nvoid PartitionBufferStorage::shuffle() {\n    SPDLOG_ERROR(\"Shuffle not supported for PartitionBufferStorage\");\n    throw std::runtime_error(\"\");\n};\n\nvoid PartitionBufferStorage::sort(bool src) {\n    SPDLOG_ERROR(\"Sort not supported for PartitionBufferStorage\");\n    throw std::runtime_error(\"\");\n};\n\nFlatFile::FlatFile(string filename, int64_t dim0_size, int64_t dim1_size, torch::Dtype dtype, bool alloc) {\n    filename_ = filename;\n    dim0_size_ = dim0_size;\n    dim1_size_ = dim1_size;\n    dtype_ = dtype;\n    initialized_ = true;\n    loaded_ = false;\n    device_ = torch::kCPU;\n\n    if (alloc) {\n        int64_t dtype_size = get_dtype_size_wrapper(dtype_);\n\n        std::ofstream ofs(filename_, std::ios::binary | std::ios::out);\n        ofs.seekp(dim0_size_ * dim1_size_ * dtype_size - 1);\n        ofs.write(\"\", 1);\n        ofs.close();\n    }\n}\n\nFlatFile::FlatFile(string filename, torch::Tensor data) {\n    filename_ = filename;\n    dim0_size_ = 0;\n    dim1_size_ = data.size(1);\n    dtype_ = data.scalar_type();\n    loaded_ = false;\n    append(data);\n    initialized_ = true;\n    device_ = torch::kCPU;\n}\n\nFlatFile::FlatFile(string filename, torch::Dtype dtype) {\n    filename_ = filename;\n    dim0_size_ = 0;\n    initialized_ = false;\n    loaded_ = false;\n    dtype_ = dtype;\n    device_ = torch::kCPU;\n}\n\nvoid FlatFile::rangePut(int64_t offset, torch::Tensor values) {\n    if (!values.defined() || (dim0_size_ != 0 && (values.size(0) + offset > dim0_size_ || values.size(1) != dim1_size_))) {\n        // TODO: throw invalid inputs for function error\n        throw std::runtime_error(\"\");\n    }\n\n    int64_t dtype_size = get_dtype_size_wrapper(dtype_);\n\n    int64_t ptr_offset = 
offset * dim1_size_ * dtype_size;\n\n    if (pwrite_wrapper(fd_, values.data_ptr(), values.size(0) * dim1_size_ * dtype_size, ptr_offset) == -1) {\n        SPDLOG_ERROR(\"Unable to write {}\\nError: {}\", filename_, errno);\n        throw std::runtime_error(\"\");\n    }\n}\n\nvoid FlatFile::append(torch::Tensor values) {\n    ios::openmode flags = dim0_size_ == 0 ? ios::trunc | ios::binary : ios::binary | ios_base::app;\n\n    dim0_size_ += values.size(0);\n    dim1_size_ = values.size(1);\n    dtype_ = values.scalar_type();\n\n    std::ofstream outfile(filename_, flags);\n\n    int64_t dtype_size = get_dtype_size_wrapper(dtype_);\n\n    outfile.write((char *)values.data_ptr(), values.size(0) * values.size(1) * dtype_size);\n    outfile.close();\n}\n\nvoid FlatFile::load() {\n    if (!loaded_ && initialized_) {\n        fd_ = open(filename_.c_str(), O_RDWR | IO_FLAGS);\n        if (fd_ == -1) {\n            SPDLOG_DEBUG(\"Unable to open {}\\nError: {}\", filename_, errno);\n            return;\n        }\n        loaded_ = true;\n    }\n}\n\nvoid FlatFile::write() { return; }\n\nvoid FlatFile::unload(bool perform_write) {\n    (void)perform_write;\n    if (loaded_) {\n        close(fd_);\n        loaded_ = false;\n    }\n}\n\ntorch::Tensor FlatFile::indexRead(Indices indices) {\n    SPDLOG_ERROR(\"Unsupported operation for FlatFile, only sequential access is supported\");\n    throw std::runtime_error(\"\");\n}\n\nvoid FlatFile::indexAdd(Indices indices, torch::Tensor values) {\n    SPDLOG_ERROR(\"Unsupported operation for FlatFile, only sequential access is supported\");\n    throw std::runtime_error(\"\");\n}\n\nvoid FlatFile::indexPut(Indices indices, torch::Tensor values) {\n    SPDLOG_ERROR(\"Unsupported operation for FlatFile, only sequential access is supported\");\n    throw std::runtime_error(\"\");\n}\n\nvoid FlatFile::move(string new_filename) {\n    unload(false);\n\n    renameFile(filename_, new_filename);\n\n    load();\n}\n\nvoid 
FlatFile::copy(string new_filename, bool rename) {\n    unload(false);\n\n    copyFile(filename_, new_filename);\n\n    if (rename) {\n        filename_ = new_filename;\n    }\n    load();\n}\n\ntorch::Tensor FlatFile::range(int64_t offset, int64_t n) {\n    if (n + offset > dim0_size_) {\n        // TODO: throw invalid inputs for function error\n        throw std::runtime_error(\"\");\n    }\n    int dtype_size = get_dtype_size_wrapper(dtype_);\n\n    int64_t ptr_offset = offset * dim1_size_ * dtype_size;\n\n    torch::Tensor output_tensor = torch::empty({n, dim1_size_}, dtype_);\n    if (pread_wrapper(fd_, output_tensor.data_ptr(), n * dim1_size_ * dtype_size, ptr_offset) == -1) {\n        SPDLOG_ERROR(\"Unable to read {}\\nError: {}\", filename_, errno);\n        throw std::runtime_error(\"\");\n    }\n    return output_tensor;\n}\n\nvoid FlatFile::rangePut(int64_t offset, int64_t n, torch::Tensor values) {\n    int dtype_size = get_dtype_size_wrapper(dtype_);\n\n    int64_t ptr_offset = offset * dim1_size_ * dtype_size;\n\n    if (pwrite_wrapper(fd_, values.data_ptr(), n * dim1_size_ * dtype_size, ptr_offset) == -1) {\n        SPDLOG_ERROR(\"Unable to write {}\\nError: {}\", filename_, errno);\n        throw std::runtime_error(\"\");\n    }\n}\n\nvoid FlatFile::shuffle() {\n    bool loaded = loaded_;\n    if (!loaded) {\n        load();\n    }\n    if (edge_bucket_sizes_.empty()) {\n        int64_t offset = 0;\n        int64_t curr_size = 0;\n        while (offset < dim0_size_) {\n            if (dim0_size_ - offset < MAX_SHUFFLE_SIZE) {\n                curr_size = dim0_size_ - offset;\n            } else {\n                curr_size = MAX_SHUFFLE_SIZE;\n            }\n            torch::Tensor chunk = range(offset, curr_size);\n            auto opts = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCPU);\n            chunk.copy_(chunk.index_select(0, torch::randperm(chunk.size(0), opts)));\n            rangePut(offset, chunk);\n            offset 
+= curr_size;\n        }\n    } else {\n        int64_t offset = 0;\n        auto opts = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCPU);\n        for (auto itr = edge_bucket_sizes_.begin(); itr != edge_bucket_sizes_.end(); itr++) {\n            torch::Tensor edge_bucket = range(offset, *itr);\n            edge_bucket.copy_(edge_bucket.index_select(0, torch::randperm(edge_bucket.size(0), opts)));\n            rangePut(offset, edge_bucket);\n            offset += *itr;\n        }\n    }\n    if (!loaded) {\n        unload(true);\n    }\n}\n\nvoid FlatFile::sort(bool src) {\n    // function for sorting flat file storing edges\n    int sort_dim = 0;\n    if (!src) {\n        sort_dim = -1;\n    }\n\n    bool loaded = loaded_;\n    if (!loaded) {\n        load();\n    }\n    if (edge_bucket_sizes_.empty()) {\n        int64_t offset = 0;\n        int64_t curr_size = 0;\n        while (offset < dim0_size_) {\n            if (dim0_size_ - offset < MAX_SORT_SIZE) {\n                curr_size = dim0_size_ - offset;\n            } else {\n                curr_size = MAX_SORT_SIZE;\n            }\n\n            torch::Tensor chunk = range(offset, curr_size);\n            // auto opts = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCPU);\n            chunk.copy_(chunk.index_select(0, torch::argsort(chunk.select(1, sort_dim))));\n            rangePut(offset, chunk);\n            offset += curr_size;\n        }\n    } else {\n        int64_t offset = 0;\n        // auto opts = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCPU);\n        for (auto itr = edge_bucket_sizes_.begin(); itr != edge_bucket_sizes_.end(); itr++) {\n            torch::Tensor edge_bucket = range(offset, *itr);\n            edge_bucket.copy_(edge_bucket.index_select(0, torch::argsort(edge_bucket.select(1, sort_dim))));\n            rangePut(offset, edge_bucket);\n            offset += *itr;\n        }\n    }\n    if (!loaded) {\n        unload(true);\n    
}\n}\n\nvoid FlatFile::mem_load() {\n    if (!loaded_) {\n        fd_ = open((filename_).c_str(), O_RDWR);\n        if (fd_ == -1) {\n            SPDLOG_ERROR(\"Unable to open {}\\nError: {}\", filename_, errno);\n            throw std::runtime_error(\"\");\n        }\n\n        int64_t dtype_size = get_dtype_size_wrapper(dtype_);\n\n        data_ = torch::empty({dim0_size_, dim1_size_}, dtype_);\n        SPDLOG_DEBUG(\"Initialized memory edges\");\n        process_mem_usage();\n\n        int64_t offset = 0;\n        int64_t read_size = dim0_size_ * dim1_size_ * dtype_size;\n\n        if (pread_wrapper(fd_, data_.data_ptr(), read_size, offset) == -1) {\n            SPDLOG_ERROR(\"Unable to read {}\\nError: {}\", filename_, errno);\n            throw std::runtime_error(\"\");\n        }\n\n        SPDLOG_DEBUG(\"Read edges from disk\");\n        process_mem_usage();\n\n        loaded_ = true;\n    }\n}\n\nvoid FlatFile::mem_unload(bool write) {\n    if (loaded_) {\n        int64_t dtype_size = get_dtype_size_wrapper(dtype_);\n\n        int64_t offset = 0;\n        int64_t read_size = dim0_size_ * dim1_size_ * dtype_size;\n\n        if (write) {\n            if (pwrite_wrapper(fd_, data_.data_ptr(), read_size, offset) == -1) {\n                SPDLOG_ERROR(\"Unable to write {}\\nError: {}\", filename_, errno);\n                throw std::runtime_error(\"\");\n            }\n        }\n\n        close(fd_);\n\n        SPDLOG_DEBUG(\"Edges written\");\n        process_mem_usage();\n        loaded_ = false;\n        process_mem_usage();\n        data_ = torch::Tensor();\n        SPDLOG_DEBUG(\"Nulled tensor and pointer\");\n        process_mem_usage();\n    }\n}\n\nInMemory::InMemory(string filename, int64_t dim0_size, int64_t dim1_size, torch::Dtype dtype, torch::Device device) {\n    filename_ = filename;\n    dim0_size_ = dim0_size;\n    dim1_size_ = dim1_size;\n    dtype_ = dtype;\n    initialized_ = true;\n    loaded_ = false;\n    device_ = 
device;\n}\n\nInMemory::InMemory(string filename, torch::Tensor data, torch::Device device) {\n    filename_ = filename;\n    dim0_size_ = data.size(0);\n    dim1_size_ = data.size(1);\n    dtype_ = data.scalar_type();\n    device_ = device;\n    loaded_ = false;\n\n    torch::Tensor temp = data.to(torch::kCPU);\n\n    std::ofstream outfile(filename_, ios::out | ios::binary);\n\n    int64_t dtype_size = get_dtype_size_wrapper(dtype_);\n\n    outfile.write((char *)temp.data_ptr(), data.size(0) * data.size(1) * dtype_size);\n\n    outfile.close();\n}\n\nInMemory::InMemory(string filename, torch::Dtype dtype) {\n    filename_ = filename;\n    dim0_size_ = 0;\n    dim1_size_ = 0;\n    initialized_ = false;\n    dtype_ = dtype;\n    device_ = torch::kCPU;\n    loaded_ = false;\n}\n\nInMemory::InMemory(torch::Tensor data) {\n    if (data.sizes().size() == 2) {\n        dim0_size_ = data.size(0);\n        dim1_size_ = data.size(1);\n    } else if (data.sizes().size() == 1) {\n        dim0_size_ = data.size(0);\n        dim1_size_ = 1;\n    } else {\n        throw MariusRuntimeException(\"Tensor must have 1 or two dimensions\");\n    }\n\n    filename_ = \"\";\n    data_ = data.reshape({dim0_size_, dim1_size_});\n\n    initialized_ = true;\n    dtype_ = data.dtype().toScalarType();\n    device_ = data.device();\n    loaded_ = true;\n}\n\nvoid InMemory::load() {\n    if (!loaded_ && !filename_.empty()) {\n        fd_ = open((filename_).c_str(), O_RDWR);\n        if (fd_ == -1) {\n            SPDLOG_DEBUG(\"Unable to open {}\\nError: {}\", filename_, errno);\n            return;\n        }\n\n        int64_t dtype_size = get_dtype_size_wrapper(dtype_);\n\n        data_ = torch::empty({dim0_size_, dim1_size_}, dtype_);\n\n        int64_t offset = 0;\n        int64_t read_size = dim0_size_ * dim1_size_ * dtype_size;\n\n        if (pread_wrapper(fd_, data_.data_ptr(), read_size, offset) == -1) {\n            SPDLOG_ERROR(\"Unable to read {}\\nError: {}\", filename_, errno);\n   
         throw std::runtime_error(\"\");\n        }\n\n        if (device_ == torch::kCUDA) {\n            data_ = data_.to(device_);\n        }\n\n        loaded_ = true;\n    }\n}\n\nvoid InMemory::write() {\n    if (loaded_ && !filename_.empty()) {\n        int64_t dtype_size = get_dtype_size_wrapper(dtype_);\n\n        torch::Tensor data = data_;\n        if (device_ == torch::kCUDA) {\n            data = data_.to(torch::kCPU);\n        }\n\n        int64_t offset = 0;\n        int64_t read_size = dim0_size_ * dim1_size_ * dtype_size;\n\n        if (pwrite_wrapper(fd_, data.data_ptr(), read_size, offset) == -1) {\n            SPDLOG_ERROR(\"Unable to read {}\\nError: {}\", filename_, errno);\n            throw std::runtime_error(\"\");\n        }\n    }\n}\n\nvoid InMemory::unload(bool perform_write) {\n    if (loaded_ && !filename_.empty()) {\n        if (perform_write) {\n            write();\n        }\n\n        close(fd_);\n        loaded_ = false;\n        data_ = torch::Tensor();\n    }\n}\n\ntorch::Tensor InMemory::indexRead(Indices indices) {\n    if (indices.sizes().size() != 1) {\n        // TODO: throw invalid input to func exception\n        throw std::runtime_error(\"\");\n    }\n\n    if (data_.defined()) {\n        if (data_.device().is_cuda()) {\n            return data_.index_select(0, indices.to(device_));\n        } else {\n            torch::Tensor out;\n\n            if (dtype_ == torch::kFloat32) {\n                auto out_options = torch::TensorOptions().dtype(torch::kFloat32);\n#ifdef MARIUS_CUDA\n                out_options = out_options.pinned_memory(true);\n#endif\n                out = torch::empty({indices.size(0), dim1_size_}, out_options);\n                torch::index_select_out(out, data_, 0, indices);\n            } else if (dtype_ == torch::kInt64) {\n                auto out_options = torch::TensorOptions().dtype(torch::kInt64);\n#ifdef MARIUS_CUDA\n                out_options = out_options.pinned_memory(true);\n#endif\n    
            out = torch::empty({indices.size(0), dim1_size_}, out_options);\n                torch::index_select_out(out, data_, 0, indices);\n            } else if (dtype_ == torch::kInt32) {\n                auto out_options = torch::TensorOptions().dtype(torch::kInt32);\n#ifdef MARIUS_CUDA\n                out_options = out_options.pinned_memory(true);\n#endif\n                out = torch::empty({indices.size(0), dim1_size_}, out_options);\n                torch::index_select_out(out, data_, 0, indices);\n            } else {\n                SPDLOG_ERROR(\"Not yet implemented\");\n                throw std::runtime_error(\"\");\n            }\n\n            return out;\n        }\n    } else {\n        return torch::Tensor();\n    }\n}\n\nvoid InMemory::indexAdd(Indices indices, torch::Tensor values) {\n    if (!values.defined() || indices.sizes().size() != 1 || indices.size(0) != values.size(0) || data_.size(1) != values.size(1)) {\n        // TODO: throw invalid input to func exception\n        throw std::runtime_error(\"\");\n    }\n    if (values.device().is_cuda()) {\n        data_.index_add_(0, indices, values);\n    } else {\n        // assumes this operation is only used on float valued data.\n        auto data_accessor = data_.accessor<float, 2>();\n        auto ids_accessor = indices.accessor<int64_t, 1>();\n        auto values_accessor = values.accessor<float, 2>();\n\n        int d = values.size(1);\n        int64_t size = indices.size(0);\n#pragma omp parallel for\n        for (int64_t i = 0; i < size; i++) {\n            for (int j = 0; j < d; j++) {\n                data_accessor[ids_accessor[i]][j] += values_accessor[i][j];\n            }\n        }\n    }\n}\n\nvoid InMemory::indexPut(Indices indices, torch::Tensor values) {\n    if (!values.defined() || indices.sizes().size() != 1 || indices.size(0) != values.size(0) || data_.size(1) != values.size(1)) {\n        // TODO: throw invalid input to func exception\n        throw 
std::runtime_error(\"\");\n    }\n    if (values.device().is_cuda()) {\n        data_[indices] = values;\n    } else {\n        // assumes this operation is only used on float valued data.\n        auto data_accessor = data_.accessor<float, 2>();\n        auto ids_accessor = indices.accessor<int64_t, 1>();\n        auto values_accessor = values.accessor<float, 2>();\n\n        int d = values.size(1);\n        int64_t size = indices.size(0);\n#pragma omp parallel for\n        for (int64_t i = 0; i < size; i++) {\n            for (int j = 0; j < d; j++) {\n                data_accessor[ids_accessor[i]][j] = values_accessor[i][j];\n            }\n        }\n    }\n}\n\ntorch::Tensor InMemory::range(int64_t offset, int64_t n) {\n    if (n + offset > dim0_size_) {\n        // TODO: throw invalid inputs for function error\n        throw std::runtime_error(\"\");\n    }\n    return data_.narrow(0, offset, n);\n}\n\nvoid InMemory::rangePut(int64_t offset, int64_t n, torch::Tensor values) { data_.narrow(0, offset, n).copy_(values); }\n\nvoid InMemory::shuffle() {\n    bool loaded = loaded_;\n    if (!loaded) {\n        load();\n\n        // may cause silent failures\n        if (!loaded_) {\n            return;\n        }\n    }\n\n    // full shuffle\n    if (edge_bucket_sizes_.empty()) {\n        auto opts = torch::TensorOptions().dtype(torch::kInt64).device(data_.device());\n        data_.copy_(data_.index_select(0, torch::randperm(dim0_size_, opts)));\n    }\n    // shuffle within edge buckets\n    else {\n        int64_t start = 0;\n        auto opts = torch::TensorOptions().dtype(torch::kInt64).device(data_.device());\n        for (auto itr = edge_bucket_sizes_.begin(); itr != edge_bucket_sizes_.end(); itr++) {\n            torch::Tensor edge_bucket = data_.narrow(0, start, *itr);\n            data_.narrow(0, start, *itr) = (edge_bucket.index_select(0, torch::randperm(edge_bucket.size(0), opts)));\n            start += *itr;\n        }\n    }\n    if (!loaded) {\n     
   unload(true);\n    }\n}\n\nvoid InMemory::sort(bool src) {\n    // function for sorting in memory edges\n    int sort_dim = 0;\n    if (!src) {\n        sort_dim = -1;\n    }\n\n    bool loaded = loaded_;\n    if (!loaded) {\n        load();\n\n        // may cause silent failures\n        if (!loaded_) {\n            return;\n        }\n    }\n\n    // full sort\n    if (edge_bucket_sizes_.empty()) {\n        // auto opts = torch::TensorOptions().dtype(torch::kInt64).device(data_.device());\n        data_.copy_(data_.index_select(0, torch::argsort(data_.select(1, sort_dim))));\n    }\n    // sort within edge buckets\n    else {\n        int64_t start = 0;\n        // auto opts = torch::TensorOptions().dtype(torch::kInt64).device(data_.device());\n        for (auto itr = edge_bucket_sizes_.begin(); itr != edge_bucket_sizes_.end(); itr++) {\n            torch::Tensor edge_bucket = data_.narrow(0, start, *itr);\n            data_.narrow(0, start, *itr) = (edge_bucket.index_select(0, torch::argsort(edge_bucket.select(1, sort_dim))));\n            start += *itr;\n        }\n    }\n    if (!loaded) {\n        unload(true);\n    }\n}\n"
  },
  {
    "path": "src/cpp/third_party/CMakeLists.txt",
    "content": "function(initialize_submodule DIRECTORY)\n    if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/.git)\n        find_package(Git QUIET REQUIRED)\n        message(STATUS \"${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}/.git does not exist. Initializing ${DIRECTORY} submodule ...\")\n        execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init ${DIRECTORY}\n                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}\n                RESULT_VARIABLE GIT_EXIT_CODE)\n        if(NOT GIT_EXIT_CODE EQUAL \"0\")\n            message(FATAL_ERROR \"${GIT_EXECUTABLE} submodule update --init dependencies/${DIRECTORY} failed with exit code ${GIT_EXIT_CODE}, please checkout submodules\")\n        endif()\n    endif()\nendfunction(initialize_submodule)\n\ninitialize_submodule(pybind11)\ninitialize_submodule(spdlog)\ninitialize_submodule(googletest)\ninitialize_submodule(parallel-hashmap)\n\nadd_subdirectory(googletest EXCLUDE_FROM_ALL)\nadd_subdirectory(pybind11 EXCLUDE_FROM_ALL)\nadd_subdirectory(spdlog EXCLUDE_FROM_ALL)\n"
  },
  {
    "path": "src/cuda/third_party/pytorch_scatter/atomics.cuh",
    "content": "#pragma once\n\n#define ATOMIC(NAME)                                                           \\\n  template <typename scalar, size_t size> struct Atomic##NAME##IntegerImpl;    \\\n                                                                               \\\n  template <typename scalar> struct Atomic##NAME##IntegerImpl<scalar, 1> {     \\\n    inline __device__ void operator()(scalar *address, scalar val) {           \\\n      uint32_t *address_as_ui = (uint32_t *)(address - ((size_t)address & 3)); \\\n      uint32_t old = *address_as_ui;                                           \\\n      uint32_t shift = ((size_t)address & 3) * 8;                              \\\n      uint32_t sum;                                                            \\\n      uint32_t assumed;                                                        \\\n                                                                               \\\n      do {                                                                     \\\n        assumed = old;                                                         \\\n        sum = OP(val, scalar((old >> shift) & 0xff));                          \\\n        old = (old & ~(0x000000ff << shift)) | (sum << shift);                 \\\n        old = atomicCAS(address_as_ui, assumed, old);                          \\\n      } while (assumed != old);                                                \\\n    }                                                                          \\\n  };                                                                           \\\n                                                                               \\\n  template <typename scalar> struct Atomic##NAME##IntegerImpl<scalar, 2> {     \\\n    inline __device__ void operator()(scalar *address, scalar val) {           \\\n      uint32_t *address_as_ui =                                                \\\n          (uint32_t *)((char *)address - ((size_t)address 
& 2));               \\\n      uint32_t old = *address_as_ui;                                           \\\n      uint32_t sum;                                                            \\\n      uint32_t newval;                                                         \\\n      uint32_t assumed;                                                        \\\n                                                                               \\\n      do {                                                                     \\\n        assumed = old;                                                         \\\n        sum = OP(val, (size_t)address & 2 ? scalar(old >> 16)                  \\\n                                          : scalar(old & 0xffff));             \\\n        newval = (size_t)address & 2 ? (old & 0xffff) | (sum << 16)            \\\n                                     : (old & 0xffff0000) | sum;               \\\n        old = atomicCAS(address_as_ui, assumed, newval);                       \\\n      } while (assumed != old);                                                \\\n    }                                                                          \\\n  };                                                                           \\\n                                                                               \\\n  template <typename scalar> struct Atomic##NAME##IntegerImpl<scalar, 4> {     \\\n    inline __device__ void operator()(scalar *address, scalar val) {           \\\n      uint32_t *address_as_ui = (uint32_t *)address;                           \\\n      uint32_t old = *address_as_ui;                                           \\\n      uint32_t assumed;                                                        \\\n                                                                               \\\n      do {                                                                     \\\n        assumed = old;                                            
             \\\n        old = atomicCAS(address_as_ui, assumed, OP(val, (scalar)old));         \\\n      } while (assumed != old);                                                \\\n    }                                                                          \\\n  };                                                                           \\\n                                                                               \\\n  template <typename scalar> struct Atomic##NAME##IntegerImpl<scalar, 8> {     \\\n    inline __device__ void operator()(scalar *address, scalar val) {           \\\n      unsigned long long *address_as_ull = (unsigned long long *)address;      \\\n      unsigned long long old = *address_as_ull;                                \\\n      unsigned long long assumed;                                              \\\n                                                                               \\\n      do {                                                                     \\\n        assumed = old;                                                         \\\n        old = atomicCAS(address_as_ull, assumed, OP(val, (scalar)old));        \\\n      } while (assumed != old);                                                \\\n    }                                                                          \\\n  };                                                                           \\\n                                                                               \\\n  template <typename scalar, size_t size> struct Atomic##NAME##DecimalImpl;    \\\n                                                                               \\\n  template <typename scalar> struct Atomic##NAME##DecimalImpl<scalar, 2> {     \\\n    inline __device__ void operator()(scalar *address, scalar val) {           \\\n      unsigned int *address_as_ui =                                            \\\n          (unsigned int *)((char *)address - ((size_t)address & 2));      
     \\\n      unsigned int old = *address_as_ui;                                       \\\n      unsigned int assumed;                                                    \\\n                                                                               \\\n      do {                                                                     \\\n        assumed = old;                                                         \\\n        at::Half hsum;                                                         \\\n        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);           \\\n        hsum = OP(hsum, val);                                                  \\\n        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16)            \\\n                                  : (old & 0xffff0000) | hsum.x;               \\\n        old = atomicCAS(address_as_ui, assumed, old);                          \\\n      } while (assumed != old);                                                \\\n    }                                                                          \\\n  };                                                                           \\\n                                                                               \\\n  template <typename scalar> struct Atomic##NAME##DecimalImpl<scalar, 4> {     \\\n    inline __device__ void operator()(scalar *address, scalar val) {           \\\n      int *address_as_i = (int *)address;                                      \\\n      int old = *address_as_i;                                                 \\\n      int assumed;                                                             \\\n                                                                               \\\n      do {                                                                     \\\n        assumed = old;                                                         \\\n        old = atomicCAS(address_as_i, assumed,                                 
\\\n                        __float_as_int(OP(val, __int_as_float(assumed))));     \\\n      } while (assumed != old);                                                \\\n    }                                                                          \\\n  };                                                                           \\\n                                                                               \\\n  template <typename scalar> struct Atomic##NAME##DecimalImpl<scalar, 8> {     \\\n    inline __device__ void operator()(scalar *address, scalar val) {           \\\n      unsigned long long int *address_as_ull =                                 \\\n          (unsigned long long int *)address;                                   \\\n      unsigned long long int old = *address_as_ull;                            \\\n      unsigned long long int assumed;                                          \\\n                                                                               \\\n      do {                                                                     \\\n        assumed = old;                                                         \\\n        old = atomicCAS(                                                       \\\n            address_as_ull, assumed,                                           \\\n            __double_as_longlong(OP(val, __longlong_as_double(assumed))));     \\\n      } while (assumed != old);                                                \\\n    }                                                                          \\\n  };\n\n#define OP(X, Y) Y + X\nATOMIC(Add)\n#undef OP\nstatic inline __device__ void atomAdd(uint8_t *address, uint8_t val) {\n    AtomicAddIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);\n}\nstatic inline __device__ void atomAdd(int8_t *address, int8_t val) {\n    AtomicAddIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);\n}\nstatic inline __device__ void atomAdd(int16_t *address, int16_t val) {\n    
AtomicAddIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);\n}\nstatic inline __device__ void atomAdd(int32_t *address, int32_t val) {\n    atomicAdd(address, val);\n}\nstatic inline __device__ void atomAdd(int64_t *address, int64_t val) {\n    AtomicAddIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);\n}\n#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700 || CUDA_VERSION < 10000)\nstatic inline __device__ void atomAdd(at::Half *address, at::Half val) {\n  AtomicAddDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);\n}\n#else\nstatic inline __device__ void atomAdd(at::Half *address, at::Half val) {\n    atomicAdd(reinterpret_cast<__half *>(address), val);\n}\n#endif\nstatic inline __device__ void atomAdd(float *address, float val) {\n    atomicAdd(address, val);\n}\n#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)\nstatic inline __device__ void atomAdd(double *address, double val) {\n  AtomicAddDecimalImpl<double, sizeof(double)>()(address, val);\n}\n#else\nstatic inline __device__ void atomAdd(double *address, double val) {\n    atomicAdd(address, val);\n}\n#endif\n\n#define OP(X, Y) Y *X\nATOMIC(Mul)\n#undef OP\nstatic inline __device__ void atomMul(uint8_t *address, uint8_t val) {\n    AtomicMulIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);\n}\nstatic inline __device__ void atomMul(int8_t *address, int8_t val) {\n    AtomicMulIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);\n}\nstatic inline __device__ void atomMul(int16_t *address, int16_t val) {\n    AtomicMulIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);\n}\nstatic inline __device__ void atomMul(int32_t *address, int32_t val) {\n    AtomicMulIntegerImpl<int32_t, sizeof(int32_t)>()(address, val);\n}\nstatic inline __device__ void atomMul(int64_t *address, int64_t val) {\n    AtomicMulIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);\n}\nstatic inline __device__ void atomMul(float *address, float val) {\n    AtomicMulDecimalImpl<float, 
sizeof(float)>()(address, val);\n}\nstatic inline __device__ void atomMul(at::Half *address, at::Half val) {\n    AtomicMulDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);\n}\nstatic inline __device__ void atomMul(double *address, double val) {\n    AtomicMulDecimalImpl<double, sizeof(double)>()(address, val);\n}\n\n#define OP(X, Y) Y / X\nATOMIC(Div)\n#undef OP\nstatic inline __device__ void atomDiv(uint8_t *address, uint8_t val) {\n    AtomicDivIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);\n}\nstatic inline __device__ void atomDiv(int8_t *address, int8_t val) {\n    AtomicDivIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);\n}\nstatic inline __device__ void atomDiv(int16_t *address, int16_t val) {\n    AtomicDivIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);\n}\nstatic inline __device__ void atomDiv(int32_t *address, int32_t val) {\n    AtomicDivIntegerImpl<int32_t, sizeof(int32_t)>()(address, val);\n}\nstatic inline __device__ void atomDiv(int64_t *address, int64_t val) {\n    AtomicDivIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);\n}\nstatic inline __device__ void atomDiv(float *address, float val) {\n    AtomicDivDecimalImpl<float, sizeof(float)>()(address, val);\n}\nstatic inline __device__ void atomDiv(double *address, double val) {\n    AtomicDivDecimalImpl<double, sizeof(double)>()(address, val);\n}\n\n#define OP(X, Y) max(Y, X)\nATOMIC(Max)\n#undef OP\nstatic inline __device__ void atomMax(uint8_t *address, uint8_t val) {\n    AtomicMaxIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);\n}\nstatic inline __device__ void atomMax(int8_t *address, int8_t val) {\n    AtomicMaxIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);\n}\nstatic inline __device__ void atomMax(int16_t *address, int16_t val) {\n    AtomicMaxIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);\n}\nstatic inline __device__ void atomMax(int32_t *address, int32_t val) {\n    atomicMax(address, val);\n}\nstatic inline __device__ void atomMax(int64_t 
*address, int64_t val) {\n    AtomicMaxIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);\n}\nstatic inline __device__ void atomMax(float *address, float val) {\n    AtomicMaxDecimalImpl<float, sizeof(float)>()(address, val);\n}\nstatic inline __device__ void atomMax(double *address, double val) {\n    AtomicMaxDecimalImpl<double, sizeof(double)>()(address, val);\n}\n\n#define OP(X, Y) min(Y, X)\nATOMIC(Min)\n#undef OP\nstatic inline __device__ void atomMin(uint8_t *address, uint8_t val) {\n    AtomicMinIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);\n}\nstatic inline __device__ void atomMin(int8_t *address, int8_t val) {\n    AtomicMinIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);\n}\nstatic inline __device__ void atomMin(int16_t *address, int16_t val) {\n    AtomicMinIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);\n}\nstatic inline __device__ void atomMin(int32_t *address, int32_t val) {\n    atomicMin(address, val);\n}\nstatic inline __device__ void atomMin(int64_t *address, int64_t val) {\n    AtomicMinIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);\n}\nstatic inline __device__ void atomMin(float *address, float val) {\n    AtomicMinDecimalImpl<float, sizeof(float)>()(address, val);\n}\nstatic inline __device__ void atomMin(double *address, double val) {\n    AtomicMinDecimalImpl<double, sizeof(double)>()(address, val);\n}\n\n"
  },
  {
    "path": "src/cuda/third_party/pytorch_scatter/index_info.cuh",
    "content": "\n#pragma once\n\n#include <ATen/cuda/detail/TensorInfo.cuh>\n\n// We need our own `IndexToOffset` implementation since we do not want to\n// access the last element of the `indexptr`.\ntemplate <typename scalar_t> struct IndexPtrToOffset {\n    static inline __host__ __device__ int\n    get(int idx, const at::cuda::detail::TensorInfo<scalar_t, int> &info) {\n        int offset = idx % (info.sizes[info.dims - 1] - 1);\n        offset *= info.strides[info.dims - 1];\n        idx /= info.sizes[info.dims - 1] - 1;\n        for (int i = info.dims - 2; i >= 0; --i) {\n            offset += (idx % info.sizes[i]) * info.strides[i];\n            idx /= info.sizes[i];\n        }\n        return offset;\n    }\n};"
  },
  {
    "path": "src/cuda/third_party/pytorch_scatter/reducer.cuh",
    "content": "#pragma once\n\n#include <limits>\n#include <map>\n\n#include \"atomics.cuh\"\n\nenum SegmentReductionType { SUM, MEAN, MUL, DIV, MIN, MAX };\n\nconst std::map<std::string, SegmentReductionType> reduce2REDUCE = {\n        {\"sum\", SUM}, {\"mean\", MEAN}, {\"mul\", MUL},\n        {\"div\", DIV}, {\"min\", MIN},   {\"max\", MAX},\n};\n\n#define AT_DISPATCH_REDUCTION_TYPES(reduce, ...)                               \\\n  [&] {                                                                        \\\n    switch (reduce2REDUCE.at(reduce)) {                                        \\\n    case SUM: {                                                                \\\n      const SegmentReductionType REDUCE = SUM;                                        \\\n      return __VA_ARGS__();                                                    \\\n    }                                                                          \\\n    case MEAN: {                                                               \\\n      const SegmentReductionType REDUCE = MEAN;                                       \\\n      return __VA_ARGS__();                                                    \\\n    }                                                                          \\\n    case MUL: {                                                                \\\n      const SegmentReductionType REDUCE = MUL;                                        \\\n      return __VA_ARGS__();                                                    \\\n    }                                                                          \\\n    case DIV: {                                                                \\\n      const SegmentReductionType REDUCE = DIV;                                        \\\n      return __VA_ARGS__();                                                    \\\n    }                                                                          \\\n    case MIN: {                          
                                      \\\n      const SegmentReductionType REDUCE = MIN;                                        \\\n      return __VA_ARGS__();                                                    \\\n    }                                                                          \\\n    case MAX: {                                                                \\\n      const SegmentReductionType REDUCE = MAX;                                        \\\n      return __VA_ARGS__();                                                    \\\n    }                                                                          \\\n    }                                                                          \\\n  }()\n\ntemplate <typename scalar_t, SegmentReductionType REDUCE> struct Reducer {\n    static inline __host__ __device__ scalar_t init() {\n        if (REDUCE == MUL || REDUCE == DIV)\n            return (scalar_t)1;\n        else if (REDUCE == MIN)\n            return std::numeric_limits<scalar_t>::max();\n        else if (REDUCE == MAX)\n            return std::numeric_limits<scalar_t>::lowest();\n        else\n            return (scalar_t)0;\n    }\n\n    static inline __host__ __device__ void update(scalar_t *val,\n                                                  scalar_t new_val) {\n        if (REDUCE == SUM || REDUCE == MEAN)\n            *val = *val + new_val;\n        else if (REDUCE == MUL)\n            *val = *val * new_val;\n        else if (REDUCE == DIV)\n            *val = *val / new_val;\n        else if ((REDUCE == MIN && new_val < *val) ||\n                 (REDUCE == MAX && new_val > *val)) {\n            *val = new_val;\n        }\n    }\n\n    static inline __host__ __device__ void update(scalar_t *val, scalar_t new_val,\n                                                  int64_t *arg, int64_t new_arg) {\n        if (REDUCE == SUM || REDUCE == MEAN)\n            *val = *val + new_val;\n        else if (REDUCE == MUL)\n            *val = 
*val * new_val;\n        else if (REDUCE == DIV)\n            *val = *val / new_val;\n        else if ((REDUCE == MIN && new_val < *val) ||\n                 (REDUCE == MAX && new_val > *val)) {\n            *val = new_val;\n            *arg = new_arg;\n        }\n    }\n\n    static inline __host__ __device__ void write(scalar_t *address, scalar_t val,\n                                                 int64_t *arg_address,\n                                                 int64_t arg, int count) {\n        if (REDUCE == SUM || REDUCE == MUL || REDUCE == DIV)\n            *address = val;\n        else if (REDUCE == MEAN)\n            *address = val / (scalar_t)(count > 0 ? count : 1);\n        else if (REDUCE == MIN || REDUCE == MAX) {\n            if (count > 0) {\n                *address = val;\n                *arg_address = arg;\n            } else\n                *address = (scalar_t)0;\n        }\n    }\n\n    static inline __device__ void atomic_write(scalar_t *address, scalar_t val) {\n        if (REDUCE == SUM || REDUCE == MEAN)\n            atomAdd(address, val);\n        else if (REDUCE == MUL)\n            atomMul(address, val);\n        else if (REDUCE == DIV)\n            atomDiv(address, val);\n        else if (REDUCE == MIN)\n            atomMin(address, val);\n        else if (REDUCE == MAX)\n            atomMax(address, val);\n    }\n};"
  },
  {
    "path": "src/cuda/third_party/pytorch_scatter/segment_csr_cuda.cu",
    "content": "#include \"segment_csr_cuda.h\"\n\n#include <ATen/cuda/CUDAContext.h>\n#include <ATen/cuda/detail/IndexUtils.cuh>\n#include <ATen/cuda/detail/TensorInfo.cuh>\n\n#include \"index_info.cuh\"\n#include \"reducer.cuh\"\n#include \"utils.cuh\"\n\n#define THREADS 256\n#define BLOCKS(TB, N) (TB * N + THREADS - 1) / THREADS\n#define FULL_MASK 0xffffffff\n\ntemplate<typename T>\n__device__ __forceinline__ T ldg(const T* ptr) {\n#if __CUDA_ARCH__ >= 350\n    return __ldg(ptr);\n#else\n    return *ptr;\n#endif\n}\n\ntemplate <typename scalar_t, SegmentReductionType REDUCE, int TB>\n__global__ void\nsegment_csr_kernel(const scalar_t *src_data,\n                   const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,\n                   scalar_t *out_data, int64_t *arg_out_data, size_t N,\n                   size_t E) {\n\n    // Each warp processes exactly `32/TB` rows and aggregates all row values\n    // via a parallel reduction.\n\n    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;\n    int row_idx = thread_idx / TB;\n    int lane_idx = thread_idx & (TB - 1);\n\n    if (row_idx < N) {\n        int offset = IndexPtrToOffset<int64_t>::get(row_idx, indptr_info);\n        int64_t row_start = ldg(indptr_info.data + offset);\n        int64_t row_end = ldg(indptr_info.data + offset +\n                                indptr_info.strides[indptr_info.dims - 1]);\n\n        scalar_t val = Reducer<scalar_t, REDUCE>::init();\n        int64_t arg, arg_tmp;\n\n        offset = (row_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E;\n        for (int64_t src_idx = row_start + lane_idx; src_idx < row_end;\n             src_idx += TB) {\n            Reducer<scalar_t, REDUCE>::update(&val, src_data[offset + src_idx], &arg,\n                                              src_idx);\n        }\n\n#pragma unroll\n        for (int i = TB / 2; i > 0; i /= 2) {\n            // Parallel reduction inside a single warp.\n            if (REDUCE == MIN || REDUCE == 
MAX)\n                arg_tmp = __shfl_down_sync(FULL_MASK, arg, i);\n            Reducer<scalar_t, REDUCE>::update(\n                    &val, __shfl_down_sync(FULL_MASK, val, i), &arg, arg_tmp);\n        }\n\n        if (lane_idx == 0) {\n            Reducer<scalar_t, REDUCE>::write(out_data + row_idx, val,\n                                             arg_out_data + row_idx, arg,\n                                             row_end - row_start);\n        }\n    }\n}\n\ntemplate <typename scalar_t, SegmentReductionType REDUCE>\n__global__ void segment_csr_broadcast_kernel(\n        const scalar_t *src_data,\n        const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,\n        scalar_t *out_data, int64_t *arg_out_data, size_t N, size_t K, size_t E) {\n\n    // Each thread processes exactly one row. It turned out that is more\n    // efficient than using shared memory due to avoiding synchronization\n    // barriers.\n\n    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;\n    int row_idx = thread_idx / K;\n    int lane_idx = thread_idx % K;\n\n    if (thread_idx < N * K) {\n        int offset = IndexPtrToOffset<int64_t>::get(row_idx, indptr_info);\n        int64_t row_start = ldg(indptr_info.data + offset);\n        int64_t row_end = ldg(indptr_info.data + offset +\n                                indptr_info.strides[indptr_info.dims - 1]);\n\n        scalar_t val = Reducer<scalar_t, REDUCE>::init();\n        int64_t arg;\n\n        offset = (row_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E * K;\n        for (int64_t src_idx = row_start; src_idx < row_end; src_idx++) {\n            Reducer<scalar_t, REDUCE>::update(\n                    &val, src_data[offset + K * src_idx + lane_idx], &arg, src_idx);\n        }\n\n        Reducer<scalar_t, REDUCE>::write(out_data + thread_idx, val,\n                                         arg_out_data + thread_idx, arg,\n                                         row_end - row_start);\n    
}\n}\n\nstd::tuple<torch::Tensor, torch::optional<torch::Tensor>>\nsegment_csr_cuda(torch::Tensor src, torch::Tensor indptr,\n                 torch::optional<torch::Tensor> optional_out,\n                 std::string reduce) {\n    CHECK_CUDA(src);\n    CHECK_CUDA(indptr);\n    if (optional_out.has_value())\n        CHECK_CUDA(optional_out.value());\n    cudaSetDevice(src.get_device());\n\n    CHECK_INPUT(src.dim() >= indptr.dim());\n\n    auto sizes = indptr.sizes().vec();\n    for (auto i = 0; i < indptr.dim() - 1; i++)\n        sizes[i] = src.size(i);\n    indptr = indptr.expand(sizes);\n\n    auto dim = indptr.dim() - 1;\n\n    src = src.contiguous();\n\n    torch::Tensor out;\n    if (optional_out.has_value()) {\n        out = optional_out.value().contiguous();\n        for (int i = 0; i < out.dim(); i++)\n            if (i != dim)\n                CHECK_INPUT(src.size(i) == out.size(i));\n        CHECK_INPUT(src.numel() == 0 || out.size(dim) == indptr.size(dim) - 1);\n    } else {\n        sizes = src.sizes().vec();\n        sizes[dim] = std::max<int64_t>(indptr.size(dim) - 1, 0);\n        out = torch::empty(sizes, src.options());\n    }\n\n    torch::optional<torch::Tensor> arg_out = torch::nullopt;\n    int64_t *arg_out_data = nullptr;\n    if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {\n        arg_out = torch::full(out.sizes(), src.size(dim), indptr.options());\n        arg_out_data = arg_out.value().data_ptr<int64_t>();\n    }\n\n    if (src.numel() == 0) {\n        if (!optional_out.has_value())\n            out.fill_(0);\n        return std::make_tuple(out, arg_out);\n    }\n\n    auto N = out.size(dim) * (indptr.numel() / indptr.size(-1));\n    auto K = out.numel() / N;\n    auto E = src.size(dim);\n\n    auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);\n    auto stream = at::cuda::getCurrentCUDAStream();\n    AT_DISPATCH_ALL_TYPES(src.scalar_type(), \"_\", [&] {\n        auto src_data = 
src.data_ptr<scalar_t>();\n        auto out_data = out.data_ptr<scalar_t>();\n\n        AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {\n            if (K == 1) {\n                segment_csr_kernel<scalar_t, REDUCE, 1>\n                <<<BLOCKS(32, N), THREADS, 0, stream>>>(\n                        src_data, indptr_info, out_data, arg_out_data, N, E);\n            } else {\n                segment_csr_broadcast_kernel<scalar_t, REDUCE>\n                <<<BLOCKS(1, N * K), THREADS, 0, stream>>>(\n                        src_data, indptr_info, out_data, arg_out_data, N, K, E);\n            }\n        });\n    });\n\n    return std::make_tuple(out, arg_out);\n}\n\ntemplate <typename scalar_t, int TB>\n__global__ void\ngather_csr_kernel(const scalar_t *src_data,\n                  const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,\n                  scalar_t *out_data, size_t N, size_t E) {\n\n    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;\n    int row_idx = thread_idx / TB;\n    int lane_idx = thread_idx % TB;\n\n    if (row_idx < N) {\n        int offset = IndexPtrToOffset<int64_t>::get(row_idx, indptr_info);\n        int row_start = ldg(indptr_info.data + offset);\n        int row_end = ldg(indptr_info.data + offset +\n                            indptr_info.strides[indptr_info.dims - 1]);\n        scalar_t val = ldg(src_data + row_idx);\n\n        offset = (row_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E;\n        for (int out_idx = row_start + lane_idx; out_idx < row_end; out_idx += TB) {\n            out_data[offset + out_idx] = val; // \"Mostly\" coalesced.\n        }\n    }\n}\n\ntemplate <typename scalar_t>\n__global__ void gather_csr_broadcast_kernel(\n        const scalar_t *src_data,\n        const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,\n        scalar_t *out_data, size_t N, size_t K, size_t E) {\n\n    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;\n    int row_idx = thread_idx / K;\n    int 
lane_idx = thread_idx % K;\n\n    if (thread_idx < N * K) {\n        int offset = IndexPtrToOffset<int64_t>::get(row_idx, indptr_info);\n        int row_start = ldg(indptr_info.data + offset);\n        int row_end = ldg(indptr_info.data + offset +\n                            indptr_info.strides[indptr_info.dims - 1]);\n\n        scalar_t val = src_data[thread_idx]; // Coalesced.\n\n        offset = (row_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E * K;\n        for (int out_idx = row_start; out_idx < row_end; out_idx++) {\n            out_data[offset + K * out_idx + lane_idx] = val; // \"Mostly\" coalesced.\n        }\n    }\n}\n\ntorch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,\n                              torch::optional<torch::Tensor> optional_out) {\n    CHECK_CUDA(src);\n    CHECK_CUDA(indptr);\n    if (optional_out.has_value())\n        CHECK_CUDA(optional_out.value());\n    cudaSetDevice(src.get_device());\n\n    CHECK_INPUT(src.dim() >= indptr.dim());\n\n    auto sizes = indptr.sizes().vec();\n    for (auto i = 0; i < indptr.dim() - 1; i++)\n        sizes[i] = src.size(i);\n    indptr = indptr.expand(sizes);\n\n    auto dim = indptr.dim() - 1;\n    CHECK_INPUT(src.size(dim) == 0 || src.size(dim) == indptr.size(dim) - 1);\n\n    src = src.contiguous();\n\n    torch::Tensor out;\n    if (optional_out.has_value()) {\n        out = optional_out.value().contiguous();\n        for (auto i = 0; i < out.dim(); i++)\n            if (i != dim)\n                CHECK_INPUT(src.size(i) == out.size(i));\n    } else {\n        auto sizes = src.sizes().vec();\n        if (src.numel() > 0) {\n            sizes[dim] = indptr.flatten()[-1].cpu().data_ptr<int64_t>()[0];\n        } else {\n            sizes[dim] = 0;\n        }\n        out = torch::empty(sizes, src.options());\n    }\n\n    if (src.numel() == 0) {\n        if (!optional_out.has_value())\n            out.fill_(0);\n        return out;\n    }\n\n    auto N = src.size(dim) * 
(indptr.numel() / indptr.size(-1));\n    auto K = src.numel() / N;\n    auto E = out.size(dim);\n\n    auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);\n    auto stream = at::cuda::getCurrentCUDAStream();\n    AT_DISPATCH_ALL_TYPES(src.scalar_type(), \"_\", [&] {\n        auto src_data = src.data_ptr<scalar_t>();\n        auto out_data = out.data_ptr<scalar_t>();\n\n        if (K == 1)\n            gather_csr_kernel<scalar_t, 4><<<BLOCKS(1, 4 * N), THREADS, 0, stream>>>(\n                    src_data, indptr_info, out_data, N, E);\n        else\n            gather_csr_broadcast_kernel<scalar_t>\n            <<<BLOCKS(1, N * K), THREADS, 0, stream>>>(src_data, indptr_info,\n                                                       out_data, N, K, E);\n    });\n\n    return out;\n}"
  },
  {
    "path": "src/cuda/third_party/pytorch_scatter/segment_csr_cuda.h",
    "content": "#pragma once\n\n#include \"common/datatypes.h\"\n\nstd::tuple<torch::Tensor, torch::optional<torch::Tensor>>\nsegment_csr_cuda(torch::Tensor src, torch::Tensor indptr,\n                 torch::optional<torch::Tensor> optional_out,\n                 std::string reduce);\n\ntorch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,\n                              torch::optional<torch::Tensor> optional_out);"
  },
  {
    "path": "src/cuda/third_party/pytorch_scatter/segment_max.cpp",
    "content": "#include <torch/script.h>\n#include \"segment_csr_cuda.h\"\n\ninline std::vector<int64_t> list2vec(const c10::List<int64_t> list) {\n    std::vector<int64_t> result;\n    result.reserve(list.size());\n    for (size_t i = 0; i < list.size(); i++)\n        result.push_back(list[i]);\n    return result;\n}\n\nusing torch::autograd::AutogradContext;\nusing torch::autograd::Variable;\nusing torch::autograd::variable_list;\n\nclass SegmentMaxCSR : public torch::autograd::Function<SegmentMaxCSR> {\npublic:\n    static variable_list forward(AutogradContext *ctx, Variable src,\n                                 Variable indptr,\n                                 torch::optional<Variable> optional_out) {\n        ctx->saved_data[\"src_shape\"] = src.sizes();\n        auto result = segment_csr_cuda(src, indptr, optional_out, \"max\");\n        auto out = std::get<0>(result);\n        auto arg_out = std::get<1>(result).value();\n        ctx->save_for_backward({indptr, arg_out});\n        ctx->mark_non_differentiable({arg_out});\n        if (optional_out.has_value())\n            ctx->mark_dirty({optional_out.value()});\n        return {out, arg_out};\n    }\n\n    static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {\n        auto grad_out = grad_outs[0];\n        auto saved = ctx->get_saved_variables();\n        auto indptr = saved[0];\n        auto arg_out = saved[1];\n        auto src_shape = list2vec(ctx->saved_data[\"src_shape\"].toIntList());\n        src_shape[indptr.dim() - 1] += 1;\n        auto grad_in = torch::zeros(src_shape, grad_out.options());\n        grad_in.scatter_(indptr.dim() - 1, arg_out, grad_out);\n        grad_in =\n                grad_in.narrow(indptr.dim() - 1, 0, src_shape[indptr.dim() - 1] - 1);\n        return {grad_in, Variable(), Variable()};\n    }\n};\n\nstd::tuple<torch::Tensor, torch::Tensor>\nsegment_max_csr(torch::Tensor src, torch::Tensor indptr,\n                torch::optional<torch::Tensor> 
optional_out) {\n    auto result = SegmentMaxCSR::apply(src, indptr, optional_out);\n    return std::make_tuple(result[0], result[1]);\n}\n"
  },
  {
    "path": "src/cuda/third_party/pytorch_scatter/segment_max.h",
    "content": "#pragma once\n\n#include <common/datatypes.h>\n\nstd::tuple<torch::Tensor, torch::Tensor>\nsegment_max_csr(torch::Tensor src, torch::Tensor indptr,\n                torch::optional<torch::Tensor> optional_out);"
  },
  {
    "path": "src/cuda/third_party/pytorch_scatter/utils.cuh",
    "content": "#pragma once\n\n#define CHECK_CUDA(x)                                                          \\\n  AT_ASSERTM(x.device().is_cuda(), #x \" must be CUDA tensor\")\n#define CHECK_INPUT(x) AT_ASSERTM(x, \"Input mismatch\")\n"
  },
  {
    "path": "src/python/__init__.py",
    "content": "# isort: skip_file\nimport os\nimport sys\n\n# import torch # this import here causes a GIL error\nonly_python = os.environ.get(\"MARIUS_NO_BINDINGS\", None)\n\nif not only_python:\n    try:\n        # import torch  # noqa F401\n\n        # load main modules\n        from . import _config as config  # RW: import first due to marius/torch omp linking\n        import torch  # noqa F401\n\n        # from . import _config as config\n        from . import _data as data\n        from . import _manager as manager\n        from . import _nn as nn\n        from . import _pipeline as pipeline\n        from . import _report as report\n        from . import _storage as storage\n\n        # load submodules\n        from ._data import samplers as samplers\n        from ._nn import decoders as decoders\n        from ._nn import encoders as encoders\n        from ._nn import layers as layers\n        from ._nn.decoders import edge as edge\n        from ._nn.decoders import node as node\n\n        sys.modules[f\"{__name__}.config\"] = config\n        sys.modules[f\"{__name__}.data\"] = data\n        sys.modules[f\"{__name__}.data.samplers\"] = samplers\n        sys.modules[f\"{__name__}.manager\"] = manager\n        sys.modules[f\"{__name__}.nn\"] = nn\n        sys.modules[f\"{__name__}.nn.encoders\"] = encoders\n        sys.modules[f\"{__name__}.nn.decoders\"] = decoders\n        sys.modules[f\"{__name__}.nn.decoders.edge\"] = edge\n        sys.modules[f\"{__name__}.nn.decoders.node\"] = node\n        sys.modules[f\"{__name__}.nn.layers\"] = layers\n        sys.modules[f\"{__name__}.pipeline\"] = pipeline\n        sys.modules[f\"{__name__}.report\"] = report\n        sys.modules[f\"{__name__}.storage\"] = storage\n\n        __all__ = [\"config\", \"data\", \"manager\", \"nn\", \"pipeline\", \"report\", \"storage\"]\n\n    except ModuleNotFoundError:\n        print(\"Bindings not installed\")\n"
  },
  {
    "path": "src/python/console_scripts/__init__.py",
    "content": ""
  },
  {
    "path": "src/python/console_scripts/marius_eval.py",
    "content": "import argparse\nimport sys\n\nimport marius as m\n\n\ndef main():\n    parser = argparse.ArgumentParser(description=\"Configuration file based evaluation\", prog=\"eval\")\n\n    parser.add_argument(\n        \"config\",\n        metavar=\"config\",\n        type=str,\n        help=(\n            \"Path to YAML configuration file that describes the evaluation process. See documentation\"\n            \" docs/config_interface for more details.\"\n        ),\n    )\n\n    args = parser.parse_args()\n    config = m.config.loadConfig(args.config, save=True)\n    m.manager.marius_eval(config)\n\n\nif __name__ == \"__main__\":\n    sys.exit(main())\n"
  },
  {
    "path": "src/python/console_scripts/marius_train.py",
    "content": "import argparse\nimport sys\n\nimport marius as m\n\n\ndef main():\n    parser = argparse.ArgumentParser(description=\"Configuration file based training\", prog=\"train\")\n\n    parser.add_argument(\n        \"config\",\n        metavar=\"config\",\n        type=str,\n        help=(\n            \"Path to YAML configuration file that describes the training process. See documentation\"\n            \" docs/config_interface for more details.\"\n        ),\n    )\n\n    args = parser.parse_args()\n    config = m.config.loadConfig(args.config, save=True)\n    m.manager.marius_train(config)\n\n\nif __name__ == \"__main__\":\n    sys.exit(main())\n"
  },
  {
    "path": "src/python/distribution/generate_stubs.py",
    "content": "import os\n\nfrom pybind11_stubgen import ModuleStubsGenerator\n\n\ndef generate_stubs(output_dir, module_name):\n    module = ModuleStubsGenerator(module_name)\n\n    module.parse()\n\n    module.write_setup_py = False\n\n    module_name = module_name.split(\".\")[-1]\n\n    os.makedirs(output_dir, exist_ok=True)\n\n    with open(\"{}/{}.pyi\".format(output_dir, module_name), \"w\") as fp:\n        fp.write(\"#\\n# AUTOMATICALLY GENERATED FILE\\n#\\n\\n\")\n        fp.write(\"import torch\\n\")\n        fp.write(\"\\n\".join(module.to_lines()))\n\n\ndef gen_all_stubs(output_dir):\n    generate_stubs(output_dir, \"marius.config\")\n\n    generate_stubs(output_dir, \"marius._data.samplers\")\n    generate_stubs(output_dir, \"marius.data\")\n\n    generate_stubs(output_dir, \"marius.manager\")\n\n    generate_stubs(output_dir, \"marius._nn.decoders.edge\")\n    generate_stubs(output_dir, \"marius._nn.decoders.node\")\n    generate_stubs(output_dir, \"marius._nn.decoders\")\n    generate_stubs(output_dir, \"marius._nn.encoders\")\n    generate_stubs(output_dir, \"marius._nn.layers\")\n    generate_stubs(output_dir, \"marius.nn\")\n\n    generate_stubs(output_dir, \"marius.pipeline\")\n    generate_stubs(output_dir, \"marius.report\")\n    generate_stubs(output_dir, \"marius.storage\")\n\n    generate_stubs(output_dir, \"marius\")\n\n\nif __name__ == \"__main__\":\n    gen_all_stubs(\"tmp\")\n# generate_stubs(\"tmp\", \"marius\")\n# generate_stubs(\"tmp\", \"marius.config\")\n# generate_stubs(\"tmp\", \"marius.data\")\n# generate_stubs(\"tmp\", \"marius._data.samplers\")\n# generate_stubs(\"tmp\", \"marius.manager\")\n# generate_stubs(\"tmp\", \"marius.nn\")\n# generate_stubs(\"tmp\", \"marius._nn.decoders\")\n# generate_stubs(\"tmp\", \"marius._nn.encoders\")\n# generate_stubs(\"tmp\", \"marius._nn.layers\")\n# generate_stubs(\"tmp\", \"marius.pipeline\")\n# generate_stubs(\"tmp\", \"marius.report\")\n# generate_stubs(\"tmp\", \"marius.storage\")\n\n# 
generate_stubs(\"tmp\", \"marius\")\n# generate_stubs(\"tmp\", \"marius.config\")\n# generate_stubs(\"tmp\", \"marius.data\")\n# generate_stubs(\"tmp\", \"marius.data.samplers\")\n# generate_stubs(\"tmp\", \"marius.manager\")\n# generate_stubs(\"tmp\", \"marius.nn\")\n# generate_stubs(\"tmp\", \"marius.nn.decoders\")\n# generate_stubs(\"tmp\", \"marius.nn.encoders\")\n# generate_stubs(\"tmp\", \"marius.nn.layers\")\n# generate_stubs(\"tmp\", \"marius.pipeline\")\n# generate_stubs(\"tmp\", \"marius.report\")\n# generate_stubs(\"tmp\", \"marius.storage\")\n"
  },
  {
    "path": "src/python/distribution/marius_env_info.py",
    "content": "import os\nimport platform\nimport re\nimport sys\n\nimport yaml\nfrom importlib_metadata import version\n\n\nclass MyDumper(yaml.Dumper):\n    def increase_indent(self, flow=False, indentless=False):\n        return super(MyDumper, self).increase_indent(flow, False)\n\n\ndef get_os_info():\n    os_info = {\"platform\": platform.platform()}\n    return os_info\n\n\ndef get_cpu_info():\n    cpu_info = {\"num_cpus\": \"N/A\", \"total_memory\": \"N/A\"}\n\n    try:\n        import psutil\n    except ImportError:\n        return cpu_info\n\n    cpu_info[\"num_cpus\"] = psutil.cpu_count()\n    cpu_info[\"total_memory\"] = \"{}GB\".format(psutil.virtual_memory().total >> 30)\n    return cpu_info\n\n\ndef get_gpu_info():\n    gpu_info = \"N/A\"\n\n    try:\n        import GPUtil\n    except RuntimeError:\n        return gpu_info\n\n    gpus = GPUtil.getGPUs()\n    gpu_info = []\n    for gpu in gpus:\n        gpu_info.append({\"name\": gpu.name, \"memory\": \"{}GB\".format(int(gpu.memoryTotal) >> 10)})\n\n    return gpu_info\n\n\ndef get_python_info():\n    py_deps = [\n        \"numpy\",\n        \"pandas\",\n        \"tox\",\n        \"pytest\",\n        \"torch\",\n        \"omegaconf\",\n        \"pyspark\",\n        \"pip\",\n    ]\n    py_deps_version = {}\n    for dep in py_deps:\n        try:\n            imported_dep = __import__(dep)\n            py_deps_version[dep + \"_version\"] = imported_dep.__version__\n        except ModuleNotFoundError:\n            py_deps_version[dep + \"_version\"] = \"N/A\"\n\n    pytorch_info = {\n        \"version\": sys.version,\n        \"deps\": py_deps_version,\n    }\n    return pytorch_info\n\n\ndef get_cuda_info():\n    cuda_info = {\"version\": \"N/A\"}\n    try:\n        import torch\n    except ImportError:\n        return cuda_info\n\n    if torch.has_cuda:\n        cuda_info[\"version\"] = torch.version.cuda\n    return cuda_info\n\n\ndef get_openmp_info():\n    openmp_info = {\"version\": \"N/A\"}\n\n    
openmp_output = os.popen(\"echo | cpp -fopenmp -dM | grep -i open\").read()\n    version_pattern = re.compile(r\"#define\\s_OPENMP.*\\s([0-9.]+)\")\n    openmp_version = re.search(version_pattern, openmp_output)\n    if openmp_version is not None:\n        openmp_info[\"version\"] = openmp_version.group(1)\n\n    return openmp_info\n\n\ndef get_pytorch_info():\n    pytorch_info = {\"version\": \"N/A\", \"install_path\": \"N/A\"}\n    try:\n        import torch\n    except ImportError:\n        return pytorch_info\n\n    pytorch_info[\"version\"] = torch.__version__\n    pytorch_info[\"install_path\"] = os.path.dirname(torch.__file__)\n\n    return pytorch_info\n\n\ndef get_marius_info():\n    marius_info = {\"version\": \"N/A\", \"install_path\": \"N/A\", \"bindings_installed\": False}\n    try:\n        import marius\n\n        marius_info[\"install_path\"] = marius.__path__[0]\n        marius_info[\"version\"] = version(\"marius\")\n    except ImportError:\n        return marius_info\n\n    try:\n        import marius.nn\n\n        marius_info[\"bindings_installed\"] = True\n    except ImportError:\n        pass\n\n    return marius_info\n\n\ndef get_pybind_info():\n    pybind_info = {\"PYBIND11_COMPILER_TYPE\": \"N/A\", \"PYBIND11_STDLIB\": \"N/A\", \"PYBIND11_BUILD_ABI\": \"N/A\"}\n\n    try:\n        import torch\n    except ImportError:\n        return pybind_info\n\n    pybind_info[\"PYBIND11_COMPILER_TYPE\"] = torch._C._PYBIND11_COMPILER_TYPE\n    pybind_info[\"PYBIND11_STDLIB\"] = torch._C._PYBIND11_STDLIB\n    pybind_info[\"PYBIND11_BUILD_ABI\"] = torch._C._PYBIND11_BUILD_ABI\n    return pybind_info\n\n\ndef get_cmake_info():\n    cmake_info = {\"version\": \"N/A\"}\n\n    cmake_output = os.popen(\"cmake --version\").read().split(\"\\n\")[0]\n    version_pattern = re.compile(r\".*\\s([0-9.]+)\")\n    cmake_version = re.search(version_pattern, cmake_output)\n    if cmake_version is not None:\n        cmake_info[\"version\"] = cmake_version.group(1)\n\n    
return cmake_info\n\n\ndef main():\n    env_info = {}\n\n    env_info[\"operating_system\"] = get_os_info()\n    env_info[\"cpu_info\"] = get_cpu_info()\n    env_info[\"gpu_info\"] = get_gpu_info()\n    env_info[\"python\"] = get_python_info()\n    env_info[\"pytorch\"] = get_pytorch_info()\n    env_info[\"cuda\"] = get_cuda_info()\n    env_info[\"marius\"] = get_marius_info()\n    env_info[\"pybind\"] = get_pybind_info()\n    env_info[\"cmake\"] = get_cmake_info()\n    env_info[\"openmp\"] = get_openmp_info()\n\n    print(yaml.dump(env_info, Dumper=MyDumper, default_flow_style=False))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "src/python/tools/__init__.py",
    "content": ""
  },
  {
    "path": "src/python/tools/configuration/__init__.py",
    "content": ""
  },
  {
    "path": "src/python/tools/configuration/constants.py",
    "content": "from dataclasses import dataclass\n\n\n@dataclass\nclass PathConstants:\n    model_file: str = \"model.pt\"\n    model_state_file: str = \"model_state.pt\"\n    edges_directory: str = \"edges/\"\n    nodes_directory: str = \"nodes/\"\n    training_file_prefix: str = \"train_\"\n    validation_file_prefix: str = \"validation_\"\n    test_file_prefix: str = \"test_\"\n    partition_offsets_file: str = \"partition_offsets.txt\"\n    node_mapping_file: str = \"node_mapping.txt\"\n    relation_mapping_file: str = \"relation_mapping.txt\"\n    edge_file_name: str = \"edges\"\n    edge_weight_file_name: str = \"edges_weights\"\n    node_file_name: str = \"nodes\"\n    features_file_name: str = \"features\"\n    labels_file_name: str = \"labels\"\n    node_embeddings_file_name: str = \"embeddings\"\n    node_embeddings_state_file_name: str = \"embeddings_state\"\n    saved_full_config_file_name: str = \"full_config.yaml\"\n    file_ext: str = \".bin\"\n\n    train_edges_path: str = edges_directory + training_file_prefix + edge_file_name + file_ext\n    train_edges_weights_path: str = edges_directory + training_file_prefix + edge_weight_file_name + file_ext\n\n    valid_edges_path: str = edges_directory + validation_file_prefix + edge_file_name + file_ext\n    valid_edges_weights_path: str = edges_directory + validation_file_prefix + edge_weight_file_name + file_ext\n\n    test_edges_path: str = edges_directory + test_file_prefix + edge_file_name + file_ext\n    test_edges_weights_path: str = edges_directory + test_file_prefix + edge_weight_file_name + file_ext\n\n    train_edge_buckets_path: str = edges_directory + training_file_prefix + partition_offsets_file\n    valid_edge_buckets_path: str = edges_directory + validation_file_prefix + partition_offsets_file\n    test_edge_buckets_path: str = edges_directory + test_file_prefix + partition_offsets_file\n\n    node_features_path: str = nodes_directory + features_file_name + file_ext\n    
relation_features_path: str = edges_directory + features_file_name + file_ext\n    labels_path: str = nodes_directory + labels_file_name + file_ext\n\n    train_nodes_path: str = nodes_directory + training_file_prefix + node_file_name + file_ext\n    valid_nodes_path: str = nodes_directory + validation_file_prefix + node_file_name + file_ext\n    test_nodes_path: str = nodes_directory + test_file_prefix + node_file_name + file_ext\n\n    node_mapping_path: str = nodes_directory + node_mapping_file\n    relation_mapping_path: str = edges_directory + relation_mapping_file\n"
  },
  {
    "path": "src/python/tools/configuration/datatypes.py",
    "content": "from dataclasses import dataclass\n\n# This file contains enums and detailed option settings for each enum value, where applicable\n\n\n# options dataclasses\n@dataclass\nclass InitOptions:\n    pass\n\n\n@dataclass\nclass UniformInitOptions(InitOptions):\n    scale_factor: float = 1\n\n    def __post_init__(self):\n        if self.scale_factor <= 0:\n            raise ValueError(\"scale_factor must be positive\")\n\n\n@dataclass\nclass NormalInitOptions(InitOptions):\n    mean: float = 0\n    std: float = 1\n\n    def __post_init__(self):\n        if self.std <= 0:\n            raise ValueError(\"std must be positive\")\n\n\n@dataclass\nclass ConstantInitOptions(InitOptions):\n    constant: float = 0\n\n\n@dataclass\nclass LossOptions:\n    reduction: str = \"SUM\"\n\n\n@dataclass\nclass RankingLossOptions(LossOptions):\n    margin: float = 0.1\n\n\n@dataclass\nclass OptimizerOptions:\n    learning_rate: float = 0.1\n\n    def __post_init__(self):\n        if self.learning_rate <= 0:\n            raise ValueError(\"learning_rate must be positive\")\n\n\n@dataclass\nclass AdagradOptions(OptimizerOptions):\n    learning_rate = 0.1\n    eps: float = 1e-10\n    init_value: float = 0\n    lr_decay: float = 0\n    weight_decay: float = 0\n\n    def __post_init__(self):\n        if self.init_value < 0:\n            raise ValueError(\"init_value for AdaGradOptimizer must be non-negative\")\n        # is this the case??\n        if self.lr_decay < 0:\n            raise ValueError(\"lr_decay for AdaGradOptimizer must be non-negative\")\n        if self.weight_decay < 0:\n            raise ValueError(\"weight_decay for AdaGradOptimizer must be non-negative\")\n\n\n@dataclass\nclass AdamOptions(OptimizerOptions):\n    learning_rate = 0.1\n    amsgrad: bool = False\n    beta_1: float = 0.9\n    beta_2: float = 0.999\n    eps: float = 1e-8\n    weight_decay: float = 0\n\n    def __post_init__(self):\n        if self.beta_1 < 0:\n            raise 
ValueError(\"beta_1 for AdamOptimizer must be non-negative\")\n        # is this the case??\n        if self.beta_2 < 0:\n            raise ValueError(\"beta_2 for AdamOptimizer must be non-negative\")\n        if self.weight_decay < 0:\n            raise ValueError(\"weight_decay for AdamOptimizer  must be non-negative\")\n\n\n@dataclass\nclass LayerOptions:\n    pass\n\n\n@dataclass\nclass EmbeddingLayerOptions(LayerOptions):\n    pass\n\n\n@dataclass\nclass FeatureLayerOptions(LayerOptions):\n    pass\n\n\n@dataclass\nclass DenseLayerOptions(LayerOptions):\n    type: str = \"LINEAR\"\n\n\n@dataclass\nclass ReductionLayerOptions(LayerOptions):\n    type: str = \"CONCAT\"\n\n\n@dataclass\nclass GNNLayerOptions(LayerOptions):\n    type: str\n    pass\n\n\n@dataclass\nclass GraphSageLayerOptions(GNNLayerOptions):\n    type: str = \"GRAPH_SAGE\"\n    aggregator: str = \"GCN\"\n\n\n@dataclass\nclass GATLayerOptions(GNNLayerOptions):\n    type: str = \"GAT\"\n    num_heads: int = 10\n    average_heads: bool = True\n    negative_slope: float = 0.2\n    input_dropout: float = 0.0\n    attention_dropout: float = 0.0\n\n    def __post_init__(self):\n        if self.num_heads <= 0:\n            raise ValueError(\"num_heads must be positive\")\n\n\n@dataclass\nclass DecoderOptions:\n    pass\n\n\n@dataclass\nclass EdgeDecoderOptions(DecoderOptions):\n    inverse_edges: bool = True\n    use_relation_features: bool = False\n    edge_decoder_method: str = \"CORRUPT_NODE\"\n\n\n@dataclass\nclass StorageOptions:\n    dtype: str = \"float\"\n\n\n@dataclass\nclass PartitionBufferOptions(StorageOptions):\n    num_partitions: int = 16\n    buffer_capacity: int = 8\n    prefetching: bool = True\n    fine_to_coarse_ratio: int = 1\n    num_cache_partitions: int = 0\n    edge_bucket_ordering: str = \"COMET\"\n    node_partition_ordering: str = \"DISPERSED\"\n    randomly_assign_edge_buckets: bool = True\n\n    def __post_init__(self):\n        if self.num_partitions < 2:\n            
raise ValueError(\n                \"There must be at least two partitions to use the partition buffer, got: {}\".format(self.num_partitions)\n            )\n        if self.buffer_capacity < 2:\n            raise ValueError(\n                \"The partition buffer must have capacity of at least 2, got: {}\".format(self.buffer_capacity)\n            )\n\n        # no need to have a buffer capacity larger than the number of partitions\n        if self.num_partitions < self.buffer_capacity:\n            self.buffer_capacity = self.num_partitions\n\n\n@dataclass\nclass NeighborSamplingOptions:\n    pass\n\n\n@dataclass\nclass UniformSamplingOptions(NeighborSamplingOptions):\n    max_neighbors: int = 10\n\n    def __post_init__(self):\n        if self.max_neighbors <= 0:\n            raise ValueError(\"max_neighbors must be positive\")\n\n\n@dataclass\nclass DropoutSamplingOptions(NeighborSamplingOptions):\n    rate: float = 0.0\n\n    def __post_init__(self):\n        if self.rate < 0 or self.rate >= 1:\n            raise ValueError(\"rate must be in [0, 1)\")\n"
  },
  {
    "path": "src/python/tools/configuration/marius_config.py",
    "content": "import os\nimport random\nimport re\nimport shutil\nimport sys\nfrom dataclasses import dataclass, field\nfrom distutils import dir_util\nfrom pathlib import Path\nfrom typing import List\n\nfrom omegaconf import MISSING, DictConfig, OmegaConf\n\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.configuration.datatypes import (\n    AdagradOptions,\n    AdamOptions,\n    ConstantInitOptions,\n    DecoderOptions,\n    DenseLayerOptions,\n    DropoutSamplingOptions,\n    EdgeDecoderOptions,\n    GATLayerOptions,\n    GNNLayerOptions,\n    GraphSageLayerOptions,\n    InitOptions,\n    LayerOptions,\n    LossOptions,\n    NeighborSamplingOptions,\n    NormalInitOptions,\n    OptimizerOptions,\n    PartitionBufferOptions,\n    RankingLossOptions,\n    ReductionLayerOptions,\n    StorageOptions,\n    UniformInitOptions,\n    UniformSamplingOptions,\n)\nfrom marius.tools.configuration.validation import (\n    check_encoder_layer_dimensions,\n    check_full_graph_evaluation,\n    check_gnn_layers_alignment,\n    validate_dataset_config,\n    validate_storage_config,\n)\n\n\ndef get_model_dir_path(dataset_dir):\n    # will support storing upto 11 different model params when model_dir is not specified.\n    # post that, it will overwrite in <dataset_dir>/model_10 directory.\n    for i in range(11):\n        model_dir = \"{}/model_{}\".format(dataset_dir, i)\n        model_dir_path = Path(model_dir)\n        if not model_dir_path.exists():\n            return str(model_dir_path)\n\n    return str(model_dir_path)\n\n\n@dataclass\nclass NeighborSamplingConfig:\n    type: str = \"ALL\"\n    options: NeighborSamplingOptions = NeighborSamplingOptions()\n    use_hashmap_sets: bool = False\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured 
output config\n        \"\"\"\n\n        self.type = input_config.type.upper()\n\n        new_options = NeighborSamplingOptions()\n\n        if self.type == \"UNIFORM\":\n            new_options = UniformSamplingOptions()\n\n        if self.type == \"DROPOUT\":\n            new_options = DropoutSamplingOptions()\n\n        if \"options\" in input_config.keys():\n            for key in new_options.__dict__.keys():\n                if key in input_config.options.keys():\n                    val = input_config.options.__getattr__(key)\n                    new_options.__setattr__(key, val)\n\n        self.options = new_options\n\n        if \"use_hashmap_sets\" in input_config.keys():\n            self.use_hashmap_sets = input_config.use_hashmap_sets\n\n\n@dataclass\nclass OptimizerConfig:\n    type: str = \"ADAGRAD\"\n    options: OptimizerOptions = AdagradOptions()\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        self.type = input_config.type.upper()\n\n        new_options = OptimizerOptions()\n\n        if self.type == \"DEFAULT\":\n            self.options = new_options\n            return\n\n        if self.type == \"ADAGRAD\":\n            new_options = AdagradOptions()\n\n        if self.type == \"ADAM\":\n            new_options = AdamOptions()\n\n        for key in new_options.__dict__.keys():\n            if key in input_config.options.keys():\n                val = input_config.options.__getattr__(key)\n                new_options.__setattr__(key, val)\n\n        self.options = new_options\n\n\n@dataclass\nclass InitConfig:\n    type: str = \"GLOROT_UNIFORM\"\n    options: InitOptions = InitOptions()\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the 
current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        self.type = input_config.type.upper()\n\n        new_options = InitOptions()\n\n        if self.type == \"CONSTANT\":\n            new_options = ConstantInitOptions()\n\n        if self.type == \"UNIFORM\":\n            new_options = UniformInitOptions()\n\n        if self.type == \"NORMAL\":\n            new_options = NormalInitOptions()\n\n        for key in new_options.__dict__.keys():\n            if key in input_config.options.keys():\n                val = input_config.options.__getattr__(key)\n                new_options.__setattr__(key, val)\n\n        self.options = new_options\n\n\n@dataclass\nclass LossConfig:\n    type: str = \"SOFTMAX_CE\"\n    options: LossOptions = LossOptions()\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        self.type = input_config.type.upper()\n\n        new_options = LossOptions()\n\n        if self.type == \"RANKING\":\n            new_options = RankingLossOptions()\n\n        if \"options\" in input_config.keys():\n            for key in new_options.__dict__.keys():\n                if key in input_config.options.keys():\n                    val = input_config.options.__getattr__(key)\n                    new_options.__setattr__(key, val)\n\n        self.options = new_options\n\n\n@dataclass\nclass LayerConfig:\n    type: str = None\n    options: LayerOptions = LayerOptions()\n    input_dim: int = -1\n    output_dim: int = -1\n    init: InitConfig = InitConfig(type=\"GLOROT_UNIFORM\")\n    optimizer: OptimizerConfig = OptimizerConfig(type=\"DEFAULT\")\n    bias: bool = False\n    bias_init: InitConfig = 
InitConfig(type=\"ZEROS\")\n    activation: str = \"NONE\"\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        self.type = input_config.type.upper()\n\n        if \"init\" in input_config.keys():\n            self.init.merge(input_config.init)\n\n        if \"options\" in input_config.keys():\n            new_options = LayerOptions()\n\n            if self.type == \"GNN\":\n                new_options = GNNLayerOptions(type=\"NONE\")\n                if input_config.options.type.upper() == \"GRAPH_SAGE\":\n                    new_options = GraphSageLayerOptions()\n                elif input_config.options.type.upper() == \"GAT\":\n                    new_options = GATLayerOptions()\n\n            if self.type == \"DENSE\":\n                new_options = DenseLayerOptions()\n\n            if self.type == \"REDUCTION\":\n                new_options = ReductionLayerOptions()\n\n            for key in new_options.__dict__.keys():\n                if key in input_config.options.keys():\n                    val = input_config.options.__getattr__(key)\n                    new_options.__setattr__(key, val)\n\n            self.options = new_options\n\n        if \"activation\" in input_config.keys():\n            self.activation = input_config.activation.upper()\n\n        if \"bias\" in input_config.keys():\n            self.bias = input_config.bias\n\n        if \"bias_init\" in input_config.keys():\n            self.bias_init.merge(input_config.bias_init)\n\n        if \"optimizer\" in input_config.keys():\n            if self.optimizer is MISSING:\n                self.optimizer = OptimizerConfig()\n            self.optimizer.merge(input_config.optimizer)\n\n        if \"input_dim\" in input_config.keys():\n            self.input_dim = 
input_config.input_dim\n\n        if \"output_dim\" in input_config.keys():\n            self.output_dim = input_config.output_dim\n\n\n@dataclass\nclass EncoderConfig:\n    use_incoming_nbrs: bool = True\n    use_outgoing_nbrs: bool = True\n    layers: List[List[LayerConfig]] = field(default_factory=list)\n    train_neighbor_sampling: List[NeighborSamplingConfig] = field(default_factory=list)\n    eval_neighbor_sampling: List[NeighborSamplingConfig] = field(default_factory=list)\n    embedding_dim: int = -1\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n        if \"use_incoming_nbrs\" in input_config.keys():\n            self.use_incoming_nbrs = input_config.use_incoming_nbrs\n\n        if \"use_outgoing_nbrs\" in input_config.keys():\n            self.use_outgoing_nbrs = input_config.use_outgoing_nbrs\n\n        new_layers = []\n        if \"layers\" in input_config.keys():\n            for stage in input_config.layers:\n                new_stages = []\n                for layer_config in stage:\n                    base_layer = LayerConfig()\n                    base_layer.merge(layer_config)\n                    new_stages.append(base_layer)\n                    if base_layer.type == \"EMBEDDING\":\n                        self.embedding_dim = base_layer.output_dim\n\n                new_layers.append(new_stages)\n\n        self.layers = new_layers\n\n        new_train = []\n        if \"train_neighbor_sampling\" in input_config.keys():\n            for layer_config in input_config.train_neighbor_sampling:\n                base_layer = NeighborSamplingConfig()\n                base_layer.merge(layer_config)\n                new_train.append(base_layer)\n\n        self.train_neighbor_sampling = new_train\n\n        new_eval = []\n    
    if \"eval_neighbor_sampling\" in input_config.keys():\n            for layer_config in input_config.eval_neighbor_sampling:\n                base_layer = NeighborSamplingConfig()\n                base_layer.merge(layer_config)\n                new_eval.append(base_layer)\n\n        self.eval_neighbor_sampling = new_eval\n\n\n@dataclass\nclass DecoderConfig:\n    type: str = \"DISTMULT\"\n    options: DecoderOptions = DecoderOptions()\n    optimizer: OptimizerConfig = OptimizerConfig()\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        self.type = input_config.type.upper()\n\n        new_options = DecoderOptions()\n\n        if self.type != \"NODE\":\n            new_options = EdgeDecoderOptions()\n\n        if \"options\" in input_config.keys():\n            for key in new_options.__dict__.keys():\n                if key in input_config.options.keys():\n                    val = input_config.options.__getattr__(key)\n                    new_options.__setattr__(key, val)\n\n        self.options = new_options\n\n        if \"optimizer\" in input_config.keys():\n            self.optimizer.merge(input_config.optimizer)\n\n\n@dataclass\nclass ModelConfig:\n    random_seed: int = MISSING\n    learning_task: str = MISSING\n    encoder: EncoderConfig = MISSING\n    decoder: DecoderConfig = MISSING\n    loss: LossConfig = MISSING\n    dense_optimizer: OptimizerConfig = OptimizerConfig()\n    sparse_optimizer: OptimizerConfig = OptimizerConfig()\n\n    def __post_init__(self):\n        if self.random_seed is MISSING:\n            self.random_seed = random.randint(0, sys.maxsize)\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n       
 :param input_config: The input configuration dictionary\n        :return:\n        \"\"\"\n\n        if \"random_seed\" in input_config.keys():\n            self.random_seed = input_config.random_seed\n\n        if \"learning_task\" in input_config.keys():\n            self.learning_task = input_config.learning_task.upper()\n\n        if \"encoder\" in input_config.keys():\n            if self.encoder is MISSING:\n                self.encoder = EncoderConfig()\n            self.encoder.merge(input_config.encoder)\n\n        if \"decoder\" in input_config.keys():\n            if self.decoder is MISSING:\n                self.decoder = DecoderConfig()\n            self.decoder.merge(input_config.decoder)\n\n        if \"loss\" in input_config.keys():\n            if self.loss is MISSING:\n                self.loss = LossConfig()\n            self.loss.merge(input_config.loss)\n\n        if \"dense_optimizer\" in input_config.keys():\n            self.dense_optimizer.merge(input_config.dense_optimizer)\n\n        if \"sparse_optimizer\" in input_config.keys():\n            self.sparse_optimizer.merge(input_config.sparse_optimizer)\n\n        self.__post_init__()\n\n\n@dataclass\nclass StorageBackendConfig:\n    type: str = \"DEVICE_MEMORY\"\n    options: StorageOptions = StorageOptions(dtype=\"float\")\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        self.type = input_config.type.upper()\n\n        new_options = self.options\n\n        if self.type == \"PARTITION_BUFFER\":\n            new_options = PartitionBufferOptions()\n\n        if \"options\" in input_config.keys():\n            for key in new_options.__dict__.keys():\n                if key in input_config.options.keys():\n                    val = 
input_config.options.__getattr__(key)\n                    new_options.__setattr__(key, val)\n\n        self.options = new_options\n\n\n@dataclass\nclass DatasetConfig:\n    dataset_dir: str = MISSING\n    num_edges: int = MISSING\n    num_nodes: int = MISSING\n    num_relations: int = 1\n    num_train: int = MISSING\n    num_valid: int = -1\n    num_test: int = -1\n    node_feature_dim: int = -1\n    rel_feature_dim: int = -1\n    num_classes: int = -1\n    initialized: bool = False\n\n    def __post_init__(self):\n        if not self.initialized:\n            return\n\n        edges_path = Path(self.dataset_dir) / Path(\"edges\")\n        if not edges_path.exists():\n            raise ValueError(\"{} does not exist\".format(str(edges_path)))\n\n        train_edges_filepath = edges_path / Path(\"train_edges.bin\")\n        if not train_edges_filepath.exists():\n            raise ValueError(\"{} does not exist\".format(str(train_edges_filepath)))\n\n        nodes_path = Path(self.dataset_dir) / Path(\"nodes\")\n        node_mapping_filepath = nodes_path / Path(\"node_mapping.txt\")\n        if node_mapping_filepath.exists():\n            num_lines = int(os.popen(\"wc -l {}\".format(node_mapping_filepath)).read().lstrip().split(\" \")[0])\n            if num_lines != self.num_nodes:\n                raise ValueError(\n                    \"Expected to see {} lines in file {}, but found {}\".format(\n                        self.num_nodes, str(node_mapping_filepath), num_lines\n                    )\n                )\n\n        relation_mapping_filepath = edges_path / Path(\"relation_mapping.txt\")\n        if relation_mapping_filepath.exists():\n            num_lines = int(os.popen(\"wc -l {}\".format(relation_mapping_filepath)).read().lstrip().split(\" \")[0])\n            if num_lines != self.num_relations:\n                raise ValueError(\n                    \"Expected to see {} lines in file {}, but found {}\".format(\n                        
self.num_relations, str(relation_mapping_filepath), num_lines\n                    )\n                )\n\n    def populate_dataset_stats(self):\n        if self.dataset_dir is MISSING:\n            raise ValueError(\"Path to pre-processed dataset directory <dataset_dir> not found\")\n\n        dataset_dir_path = Path(self.dataset_dir)\n        if not dataset_dir_path.exists():\n            raise ValueError(\"Path specified as dataset_dir ({}) does not exist\".format(str(dataset_dir_path)))\n\n        dataset_stats_path = Path(self.dataset_dir) / Path(\"dataset.yaml\")\n        dataset_stats_path = dataset_stats_path.absolute()\n        if not dataset_stats_path.exists():\n            raise ValueError(\n                \"{} does not exist, expected to see dataset.yaml file in {} generated by marius_preprocess\".format(\n                    str(dataset_stats_path), self.dataset_dir\n                )\n            )\n\n        dataset_cfg = OmegaConf.load(dataset_stats_path)\n\n        keys = self.__dict__.keys()\n        for key in dataset_cfg.keys():\n            if key in keys:\n                val = dataset_cfg.__getattr__(key)\n                self.__setattr__(key, val)\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        self.initialized = True\n        for key in self.__dict__.keys():\n            if key in input_config.keys():\n                val = input_config.__getattr__(key)\n                self.__setattr__(key, val)\n\n        self.populate_dataset_stats()\n\n        self.__post_init__()\n\n\n@dataclass\nclass StorageConfig:\n    device_type: str = \"cpu\"\n    device_ids: List[int] = field(default_factory=list)\n    dataset: DatasetConfig = DatasetConfig()\n    edges: StorageBackendConfig = 
StorageBackendConfig(options=StorageOptions(dtype=\"int\"))\n    nodes: StorageBackendConfig = StorageBackendConfig(options=StorageOptions(dtype=\"int\"))\n    embeddings: StorageBackendConfig = StorageBackendConfig(options=StorageOptions(dtype=\"float\"))\n    features: StorageBackendConfig = StorageBackendConfig(options=StorageOptions(dtype=\"float\"))\n    prefetch: bool = True\n    shuffle_input: bool = True\n    full_graph_evaluation: bool = True\n    export_encoded_nodes: bool = False\n    model_dir: str = MISSING\n    log_level: str = \"info\"\n    train_edges_pre_sorted: bool = False\n\n    SUPPORTED_EMBEDDING_BACKENDS = [\"PARTITION_BUFFER\", \"DEVICE_MEMORY\", \"HOST_MEMORY\"]\n    SUPPORTED_EDGE_BACKENDS = [\"FLAT_FILE\", \"DEVICE_MEMORY\", \"HOST_MEMORY\"]\n    SUPPORTED_NODE_BACKENDS = [\"DEVICE_MEMORY\", \"HOST_MEMORY\"]\n\n    def __post_init__(self):\n        if self.embeddings.type not in self.SUPPORTED_EMBEDDING_BACKENDS:\n            raise ValueError(\n                \"Storage type for embeddings should be one of PARTITION_BUFFER, DEVICE_MEMORY or HOST_MEMORY\"\n            )\n\n        if self.edges.type not in self.SUPPORTED_EDGE_BACKENDS:\n            raise ValueError(\"Storage type for edges should be one of FLAT_FILE, DEVICE_MEMORY or HOST_MEMORY\")\n\n        if self.nodes.type not in self.SUPPORTED_NODE_BACKENDS:\n            raise ValueError(\"Storage type for nodes should be one of DEVICE_MEMORY or HOST_MEMORY\")\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        if \"device_type\" in input_config.keys():\n            self.device_type = input_config.device_type\n\n        if \"device_ids\" in input_config.keys():\n            self.device_ids = input_config.device_ids\n\n        if \"dataset\" in 
input_config.keys():\n            self.dataset.merge(input_config.dataset)\n\n        if \"model_dir\" in input_config.keys():\n            self.model_dir = input_config.model_dir\n        else:\n            self.model_dir = get_model_dir_path(self.dataset.dataset_dir)\n\n        if \"edges\" in input_config.keys():\n            self.edges.merge(input_config.edges)\n\n        if \"nodes\" in input_config.keys():\n            if self.nodes is MISSING:\n                self.nodes = StorageBackendConfig(options=StorageOptions(dtype=\"int\"))\n            self.nodes.merge(input_config.nodes)\n\n        if \"embeddings\" in input_config.keys():\n            if self.embeddings is MISSING:\n                self.embeddings = StorageBackendConfig(options=StorageOptions(dtype=\"float\"))\n            self.embeddings.merge(input_config.embeddings)\n\n        if \"features\" in input_config.keys():\n            if self.features is MISSING:\n                self.features = StorageBackendConfig(options=StorageOptions(dtype=\"float\"))\n            self.features.merge(input_config.features)\n\n        if \"prefetch\" in input_config.keys():\n            self.prefetch = input_config.prefetch\n\n        if \"shuffle_input\" in input_config.keys():\n            self.shuffle_input = input_config.shuffle_input\n\n        if \"full_graph_evaluation\" in input_config.keys():\n            self.full_graph_evaluation = input_config.full_graph_evaluation\n\n        if \"export_encoded_nodes\" in input_config.keys():\n            self.export_encoded_nodes = input_config.export_encoded_nodes\n\n        self.__post_init__()\n\n        if \"log_level\" in input_config.keys():\n            self.log_level = input_config.log_level\n\n        if \"train_edges_pre_sorted\" in input_config.keys():\n            self.train_edges_pre_sorted = input_config.train_edges_pre_sorted\n\n\n@dataclass\nclass NegativeSamplingConfig:\n    num_chunks: int = 1\n    negatives_per_positive: int = 1000\n    
degree_fraction: float = 0\n    filtered: bool = False\n    local_filter_mode: str = \"DEG\"\n\n    def __post_init__(self):\n        # for filtered mrr, the sampling class members should be ignored\n        if self.num_chunks <= 0:\n            raise ValueError(\"num_chunks must be positive\")\n        if self.negatives_per_positive <= 0 and self.negatives_per_positive != -1:\n            raise ValueError(\"negatives_per_positive must be positive or -1 if using all nodes\")\n        if self.degree_fraction < 0:\n            raise ValueError(\"degree_fraction must not be negative\")\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        if \"num_chunks\" in input_config.keys():\n            self.num_chunks = input_config.num_chunks\n\n        if \"negatives_per_positive\" in input_config.keys():\n            self.negatives_per_positive = input_config.negatives_per_positive\n\n        if \"degree_fraction\" in input_config.keys():\n            self.degree_fraction = input_config.degree_fraction\n\n        if \"filtered\" in input_config.keys():\n            self.filtered = input_config.filtered\n\n        if \"local_filter_mode\" in input_config.keys():\n            self.local_filter_mode = input_config.local_filter_mode\n\n        self.__post_init__()\n\n\n@dataclass\nclass CheckpointConfig:\n    save_best: bool = False\n    interval: int = -1\n    save_state: bool = False\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        if \"save_best\" in input_config.keys():\n            self.save_best = 
input_config.save_best\n\n        if \"interval\" in input_config.keys():\n            self.interval = input_config.interval\n\n        if \"save_state\" in input_config.keys():\n            self.save_state = input_config.save_state\n\n\n@dataclass\nclass PipelineConfig:\n    sync: bool = True\n    gpu_sync_interval: int = 16\n    gpu_model_average: bool = True\n    staleness_bound: int = 16\n    batch_host_queue_size: int = 4\n    batch_device_queue_size: int = 4\n    gradients_device_queue_size: int = 4\n    gradients_host_queue_size: int = 4\n    batch_loader_threads: int = 4\n    batch_transfer_threads: int = 2\n    compute_threads: int = 1\n    gradient_transfer_threads: int = 2\n    gradient_update_threads: int = 4\n\n    def __post_init__(self):\n        # for the sync setting, pipeline values can be ignored\n        if not self.sync:\n            if self.staleness_bound <= 0:\n                raise ValueError(\"staleness_bound must be positive\")\n            if self.batch_host_queue_size <= 0:\n                raise ValueError(\"batch_host_queue_size must be positive\")\n            if self.batch_device_queue_size <= 0:\n                raise ValueError(\"batch_device_queue_size must be positive\")\n            if self.gradients_device_queue_size <= 0:\n                raise ValueError(\"gradients_device_queue_size must be positive\")\n            if self.batch_loader_threads <= 0:\n                raise ValueError(\"batch_loader_threads must be positive\")\n            if self.batch_transfer_threads <= 0:\n                raise ValueError(\"batch_transfer_threads must be positive\")\n            if self.compute_threads <= 0:\n                raise ValueError(\"compute_threads must be positive\")\n            if self.gradient_transfer_threads <= 0:\n                raise ValueError(\"gradient_transfer_threads must be positive\")\n            if self.gradient_update_threads <= 0:\n                raise ValueError(\"gradient_update_threads must be 
positive\")\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        for key in self.__dict__.keys():\n            if key in input_config.keys():\n                val = input_config.__getattr__(key)\n                self.__setattr__(key, val)\n\n        self.__post_init__()\n\n\n@dataclass\nclass TrainingConfig:\n    batch_size: int = 1000\n    negative_sampling: NegativeSamplingConfig = MISSING\n    num_epochs: int = 10\n    pipeline: PipelineConfig = PipelineConfig()\n    epochs_per_shuffle: int = 1\n    logs_per_epoch: int = 10\n    save_model: bool = True\n    checkpoint: CheckpointConfig = CheckpointConfig()\n    resume_training: bool = False\n    resume_from_checkpoint: str = \"\"\n\n    def __post_init__(self):\n        if self.batch_size <= 0:\n            raise ValueError(\"batch_size must be positive\")\n        if self.num_epochs <= 0:\n            raise ValueError(\"num_epochs must be positive\")\n        if self.epochs_per_shuffle <= 0:\n            raise ValueError(\"epochs_per_shuffle must be positive\")\n        if self.logs_per_epoch < 0:\n            raise ValueError(\"logs_per_epoch must not be negative\")\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        for key in self.__dict__.keys():\n            if key in input_config.keys():\n                if input_config.get(key, None) is not None:\n                    if key == \"negative_sampling\":\n                        val = input_config.get(\"negative_sampling\", MISSING)\n                        if val is not MISSING:\n               
             if self.negative_sampling is MISSING:\n                                self.negative_sampling = NegativeSamplingConfig()\n                            self.negative_sampling.merge(val)\n                    elif key == \"pipeline\":\n                        if self.pipeline is MISSING:\n                            self.pipeline = PipelineConfig()\n                        self.pipeline.merge(input_config.pipeline)\n                    elif key == \"checkpoint\":\n                        self.checkpoint.merge(input_config.checkpoint)\n                    else:\n                        val = input_config.__getattr__(key)\n                        self.__setattr__(key, val)\n\n        self.__post_init__()\n\n\n@dataclass\nclass EvaluationConfig:\n    batch_size: int = 1000\n    negative_sampling: NegativeSamplingConfig = MISSING\n    pipeline: PipelineConfig = PipelineConfig()\n    epochs_per_eval: int = 1\n    checkpoint_dir: str = \"\"\n\n    def __post_init__(self):\n        if self.batch_size <= 0:\n            raise ValueError(\"batch_size must be positive\")\n\n    def merge(self, input_config: DictConfig):\n        \"\"\"\n        Merges under specified dictionary config into the current configuration object\n        :param input_config: The input configuration dictionary\n        :return: Structured output config\n        \"\"\"\n\n        for key in self.__dict__.keys():\n            if key in input_config.keys():\n                if key == \"negative_sampling\":\n                    val = input_config.get(\"negative_sampling\", MISSING)\n                    if val is not MISSING:\n                        if self.negative_sampling is MISSING:\n                            self.negative_sampling = NegativeSamplingConfig()\n                        self.negative_sampling.merge(val)\n                elif key == \"pipeline\":\n                    self.pipeline.merge(input_config.pipeline)\n                else:\n                    val = 
input_config.__getattr__(key)\n                    self.__setattr__(key, val)\n\n        self.__post_init__()\n\n\n@dataclass\nclass MariusConfig:\n    model: ModelConfig = ModelConfig()\n    storage: StorageConfig = StorageConfig()\n    training: TrainingConfig = TrainingConfig()\n    evaluation: EvaluationConfig = EvaluationConfig()\n\n    # defining this constructor prevents from re-use of old attribute values during testing.\n    def __init__(self):\n        self.model = ModelConfig()\n        self.storage = StorageConfig()\n        self.training = TrainingConfig()\n        self.evaluation = EvaluationConfig()\n\n    def __post_init__(self):\n        if self.model.learning_task == \"NODE_CLASSIFICATION\":\n            # do node classification specific validation\n            pass\n\n        elif self.model.learning_task == \"LINK_PREDICTION\":\n            # do link prediction specific validation\n            pass\n\n\ndef type_safe_merge(base_config: MariusConfig, input_config: DictConfig):\n    \"\"\"\n    Merges under specified dictionary config into the current configuration object\n    :param base_config: The default configuration\n    :param input_config: The input configuration dictionary\n    :return: Structured output config\n    \"\"\"\n\n    if \"model\" in input_config.keys():\n        base_config.model.merge(input_config.model)\n\n    if \"storage\" in input_config.keys():\n        base_config.storage.merge(input_config.storage)\n\n    if \"training\" in input_config.keys():\n        base_config.training.merge(input_config.training)\n\n    if \"evaluation\" in input_config.keys():\n        base_config.evaluation.merge(input_config.evaluation)\n\n    base_config.__post_init__()\n\n    return base_config\n\n\ndef initialize_model_dir(output_config):\n    relation_mapping_filepath = (\n        Path(output_config.storage.dataset.dataset_dir) / Path(\"edges\") / Path(\"relation_mapping.txt\")\n    )\n    if relation_mapping_filepath.exists():\n        
shutil.copy(\n            str(relation_mapping_filepath), \"{}/{}\".format(output_config.storage.model_dir, \"relation_mapping.txt\")\n        )\n\n    node_mapping_filepath = Path(output_config.storage.dataset.dataset_dir) / Path(\"nodes\") / Path(\"node_mapping.txt\")\n    if node_mapping_filepath.exists():\n        shutil.copy(str(node_mapping_filepath), \"{}/{}\".format(output_config.storage.model_dir, \"node_mapping.txt\"))\n\n\ndef infer_model_dir(output_config):\n    # if `output_config.storage.model_dir` points to a path which contains saved model params file, then just use that.\n    model_dir_path = Path(output_config.storage.model_dir)\n    model_file_path = model_dir_path / Path(\"model.pt\")\n    if model_dir_path.exists() and model_file_path.exists():\n        return\n\n    # if model_dir is of the form `model_x/`, where x belong to [0, 10], then set model_dir to the largest\n    # existing directory. If model_dir is user specified, the control would never reach here.\n    # the below regex check is an additional validation step.\n    if re.fullmatch(\n        \"{}model_[0-9]+/\".format(output_config.storage.dataset.dataset_dir), output_config.storage.model_dir\n    ):\n        match_result = re.search(r\".*/model_([0-9]+)/$\", output_config.storage.model_dir)\n        last_model_id = -1\n        if len(match_result.groups()) == 1:\n            last_model_id = int(match_result.groups()[0]) - 1\n\n        if last_model_id >= 0:\n            output_config.storage.model_dir = \"{}model_{}/\".format(\n                output_config.storage.dataset.dataset_dir, last_model_id\n            )\n\n\ndef load_config(input_config_path, save=False):\n    \"\"\"\n    This function loads an input user specified configuration file and creates a full configuration file with all\n    defaults set based on the input\n    :param input_config_path: path to the input configuration file\n    :param save: If true, the full configuration file will be saved to 
<dir_of_input_config>/full_config.yaml\n    :return: config dict object\n    \"\"\"\n    input_config_path = Path(input_config_path).absolute()\n    input_cfg = OmegaConf.load(input_config_path)\n\n    # merge the underspecified input configuration with the fully specified default configuration\n    base_config = MariusConfig()\n    output_config = type_safe_merge(base_config, input_cfg)\n\n    if output_config.storage.dataset.dataset_dir[-1] != \"/\":\n        output_config.storage.dataset.dataset_dir = output_config.storage.dataset.dataset_dir + \"/\"\n\n    if output_config.storage.model_dir[-1] != \"/\":\n        output_config.storage.model_dir += \"/\"\n\n    if output_config.training.resume_from_checkpoint != \"\" and output_config.training.resume_from_checkpoint[-1] != \"/\":\n        output_config.training.resume_from_checkpoint += \"/\"\n\n    if save and (output_config.training.resume_from_checkpoint != \"\" or not output_config.training.resume_training):\n        # create model_dir when\n        # 1. training from scratch [NOT resuming training]\n        # 2. 
resume_training mode, with resume_from_checkpoint specified.\n        Path(output_config.storage.model_dir).mkdir(parents=True, exist_ok=True)\n        initialize_model_dir(output_config)\n\n        OmegaConf.save(output_config, output_config.storage.model_dir + PathConstants.saved_full_config_file_name)\n\n        # incase of resuming training, copy files from resume_from_checkpoint to the new folder.\n        if output_config.training.resume_from_checkpoint != \"\":\n            dir_util.copy_tree(output_config.training.resume_from_checkpoint, output_config.storage.model_dir)\n\n    else:\n        # this path is taken in test cases where random configs are passed to this function for parsing.\n        # could also be taken when marius_predict is run or marius_train is run with resume_training set to true,\n        # but resume_from_checkpoint isn't specified (it will then overwrite the model_dir with new model)\n        infer_model_dir(output_config)\n\n    # we can then perform validation, and optimization over the fully specified configuration file here before returning\n    validate_dataset_config(output_config)\n    validate_storage_config(output_config)\n    check_encoder_layer_dimensions(output_config)\n    check_gnn_layers_alignment(output_config)\n    check_full_graph_evaluation(output_config)\n\n    return output_config\n"
  },
  {
    "path": "src/python/tools/configuration/validation.py",
    "content": "import os\nfrom pathlib import Path\n\nimport psutil\nfrom omegaconf import MISSING\n\nlong_dtype_list = [\"long\", \"int64\"]\n\n\ndef get_lines_in_file(filepath):\n    return int(os.popen(\"wc -l {}\".format(filepath)).read().lstrip().split(\" \")[0])\n\n\ndef validate_dataset_config(output_config):\n    dataset_config = output_config.storage.dataset\n\n    if dataset_config.initialized is False:\n        return\n\n    if output_config.model.learning_task == \"LINK_PREDICTION\":\n        num_cols = 2 if dataset_config.num_relations == 1 else 3\n        edges_dtype_size = 8 if output_config.storage.edges.options.dtype in long_dtype_list else 4\n        edges_path = Path(dataset_config.dataset_dir + \"edges\")\n        train_edges_filepath = edges_path / Path(\"train_edges.bin\")\n        assert (\n            os.path.getsize(train_edges_filepath) == dataset_config.num_train * num_cols * edges_dtype_size\n        ), \"Expected size for {} is {}, got {}\".format(\n            str(train_edges_filepath),\n            dataset_config.num_train * num_cols * edges_dtype_size,\n            os.path.getsize(train_edges_filepath),\n        )\n\n        test_edges_filepath = edges_path / Path(\"test_edges.bin\")\n        if dataset_config.num_test is not MISSING and dataset_config.num_test != -1:\n            if not test_edges_filepath.exists():\n                raise ValueError(\"{} does not exist\".format(str(test_edges_filepath)))\n\n            assert (\n                os.path.getsize(test_edges_filepath) == dataset_config.num_test * num_cols * edges_dtype_size\n            ), \"Expected size for {} is {}, got {}\".format(\n                str(test_edges_filepath),\n                dataset_config.num_test * num_cols * edges_dtype_size,\n                os.path.getsize(test_edges_filepath),\n            )\n\n            test_edges_partitions_filepath = edges_path / Path(\"test_partition_offsets.txt\")\n            if (\n                not 
output_config.storage.full_graph_evaluation\n                and not test_edges_partitions_filepath.exists()\n                and output_config.storage.embeddings.type == \"PARTITION_BUFFER\"\n            ):\n                raise ValueError(\n                    \"{} does not exist, required for partitioned eval\".format(test_edges_partitions_filepath)\n                )\n\n        validation_edges_filepath = edges_path / Path(\"validation_edges.bin\")\n        if dataset_config.num_valid is not MISSING and dataset_config.num_valid != -1:\n            if not validation_edges_filepath.exists():\n                raise ValueError(\"{} does not exist\".format(str(validation_edges_filepath)))\n\n            assert (\n                os.path.getsize(validation_edges_filepath) == dataset_config.num_valid * num_cols * edges_dtype_size\n            ), \"Expected size for {} is {}, got {}\".format(\n                str(validation_edges_filepath),\n                dataset_config.num_valid * num_cols * edges_dtype_size,\n                os.path.getsize(validation_edges_filepath),\n            )\n\n            valid_edges_partitions_filepath = edges_path / Path(\"validation_partition_offsets.txt\")\n            if (\n                not output_config.storage.full_graph_evaluation\n                and not valid_edges_partitions_filepath.exists()\n                and output_config.storage.embeddings.type == \"PARTITION_BUFFER\"\n            ):\n                raise ValueError(\n                    \"{} does not exist, required for partitioned eval\".format(valid_edges_partitions_filepath)\n                )\n\n        relation_mapping_filepath = edges_path / Path(\"relation_mapping.txt\")\n        if dataset_config.num_relations > 1:\n            if not relation_mapping_filepath.exists():\n                raise ValueError(\"{} does not exist\".format(str(relation_mapping_filepath)))\n\n            num_lines = get_lines_in_file(relation_mapping_filepath)\n            if 
num_lines != dataset_config.num_relations:\n                raise ValueError(\n                    \"Expected {} lines in file {}, but found {}\".format(\n                        dataset_config.num_relations, str(relation_mapping_filepath), num_lines\n                    )\n                )\n\n    if output_config.model.learning_task == \"NODE_CLASSIFICATION\":\n        nodes_dtype_size = 8 if output_config.storage.nodes.options.dtype in long_dtype_list else 4\n        nodes_path = Path(dataset_config.dataset_dir + \"nodes\")\n        train_nodes_filepath = nodes_path / Path(\"train_nodes.bin\")\n        if not train_nodes_filepath.exists():\n            raise ValueError(\"{} does not exist\".format(str(train_nodes_filepath)))\n\n        assert (\n            os.path.getsize(train_nodes_filepath) == dataset_config.num_train * nodes_dtype_size\n        ), \"Expected size for {} is {}, got {}\".format(\n            str(train_nodes_filepath),\n            dataset_config.num_train * nodes_dtype_size,\n            os.path.getsize(train_nodes_filepath),\n        )\n\n        test_nodes_filepath = nodes_path / Path(\"test_nodes.bin\")\n        if dataset_config.num_test is not MISSING and dataset_config.num_test != -1:\n            if not test_nodes_filepath.exists():\n                raise ValueError(\"{} does not exist\".format(str(test_nodes_filepath)))\n\n            assert (\n                os.path.getsize(test_nodes_filepath) == dataset_config.num_test * nodes_dtype_size\n            ), \"Expected size for {} is {}, got {}\".format(\n                str(test_nodes_filepath),\n                dataset_config.num_test * nodes_dtype_size,\n                os.path.getsize(test_nodes_filepath),\n            )\n\n        valid_nodes_filepath = nodes_path / Path(\"validation_nodes.bin\")\n        if dataset_config.num_valid is not MISSING and dataset_config.num_valid != -1:\n            if not valid_nodes_filepath.exists():\n                raise ValueError(\"{} does not 
exist\".format(str(valid_nodes_filepath)))\n\n            assert (\n                os.path.getsize(valid_nodes_filepath) == dataset_config.num_valid * nodes_dtype_size\n            ), \"Expected size for {} is {}, got {}\".format(\n                str(valid_nodes_filepath),\n                dataset_config.num_valid * nodes_dtype_size,\n                os.path.getsize(valid_nodes_filepath),\n            )\n\n\ndef validate_storage_config(output_config):\n    storage_config = output_config.storage\n    dataset_config = storage_config.dataset\n\n    if dataset_config.initialized is False:\n        return\n\n    if storage_config.embeddings.type != \"PARTITION_BUFFER\" and storage_config.features.type != \"PARTITION_BUFFER\":\n        return\n\n    edges_path = Path(dataset_config.dataset_dir + \"edges\")\n    train_edges_partitions_filepath = edges_path / Path(\"train_partition_offsets.txt\")\n    if not train_edges_partitions_filepath.exists():\n        raise ValueError(\n            \"{} does not exist, required for PARTITION_BUFFER mode\".format(str(train_edges_partitions_filepath))\n        )\n\n    num_lines = get_lines_in_file(train_edges_partitions_filepath)\n    num_partitions = storage_config.embeddings.options.num_partitions\n    assert num_lines == num_partitions**2, (\n        \"Expected to see {} lines in {}, but found {} lines\\n\"\n        \"marius_preprocess was likely run with sqrt({}) partitions, \"\n        \"but config file has {} partitions\".format(\n            num_partitions**2, str(train_edges_partitions_filepath), num_lines, num_lines, num_partitions\n        )\n    )\n\n    test_edges_partitions_filepath = edges_path / Path(\"test_partition_offsets.txt\")\n    if test_edges_partitions_filepath.exists():\n        num_lines = get_lines_in_file(test_edges_partitions_filepath)\n        assert num_lines == num_partitions**2, (\n            \"Expected to see {} lines in {}, but found {} lines\\n\"\n            \"marius_preprocess was likely run 
with sqrt({}) partitions, \"\n            \"but config file has {} partitions\".format(\n                num_partitions**2, str(test_edges_partitions_filepath), num_lines, num_lines, num_partitions\n            )\n        )\n\n    valid_edges_partitions_filepath = edges_path / Path(\"validation_partition_offsets.txt\")\n    if valid_edges_partitions_filepath.exists():\n        num_lines = get_lines_in_file(valid_edges_partitions_filepath)\n        assert num_lines == num_partitions**2, (\n            \"Expected to see {} lines in {}, but found {} lines\\n\"\n            \"marius_preprocess was likely run with sqrt({}) partitions, \"\n            \"but config file has {} partitions\".format(\n                num_partitions**2, str(valid_edges_partitions_filepath), num_lines, num_lines, num_partitions\n            )\n        )\n\n    return\n\n\ndef check_encoder_layer_dimensions(output_config):\n    if output_config.model.encoder is MISSING or output_config.model.encoder == -1:\n        raise ValueError(\"No Encoder layer found. 
Expected to see at least 1 layer\")\n    embeddings_output_dim = -1\n    features_output_dim = -1\n    layers = output_config.model.encoder.layers\n    # ensure that each layer has correct number of inputs and outputs\n    for stage_idx, layer_list in enumerate(layers):\n        for layer_idx, layer in enumerate(layer_list):\n            if layer.type == \"EMBEDDING\":\n                assert (\n                    layer.input_dim == -1\n                ), \"Expected Embedding layer to have no input, but found input dim as {}\".format(layer.input_dim)\n                assert (\n                    layer.output_dim > 0\n                ), \"Expected output dimension for Embedding layer to be > 0, but found {}\".format(layer.output_dim)\n                embeddings_output_dim = layer.output_dim if embeddings_output_dim == -1 else embeddings_output_dim\n                assert (\n                    embeddings_output_dim == layer.output_dim\n                ), \"All Embedding Layers must have the same output dimension, found {} and {}\".format(\n                    embeddings_output_dim, layer.output_dim\n                )\n                continue\n\n            if layer.type == \"FEATURE\":\n                assert (\n                    layer.input_dim == -1\n                ), \"Expected Feature layer to have no input, but found input dim as {}\".format(layer.input_dim)\n                assert (\n                    layer.output_dim > 0\n                ), \"Expected output dimension for Feature layer to be > 0, but found {}\".format(layer.output_dim)\n                features_output_dim = layer.output_dim if features_output_dim == -1 else features_output_dim\n                assert (\n                    features_output_dim == layer.output_dim\n                ), \"All Feature Layers must have the same output dimension, found {} and {}\".format(\n                    features_output_dim, layer.output_dim\n                )\n                continue\n\n            if 
layer.type == \"GNN\":\n                # should have one input and one output\n                assert stage_idx > 0, \"GNN Layer found in Stage 0\"\n                assert (\n                    len(layers[stage_idx - 1]) > layer_idx\n                ), \"Corresponding previous Layer for GNN Layer in Stage {} not found\".format(stage_idx)\n                assert layers[stage_idx - 1][layer_idx].output_dim == layer.input_dim, (\n                    \"GNN Layer in Stage {} has input dimension of {}, \"\n                    \"but output dimension of previous layers is {}\".format(\n                        stage_idx, layer.input_dim, layers[stage_idx - 1][layer_idx].output_dim\n                    )\n                )\n                continue\n\n            if layer.type == \"REDUCTION\" or layer.type == \"DENSE\":\n                # no constraints on input and output dim\n                continue\n\n            raise ValueError(\"Unsupported layer type\\nShould be one of EMBEDDING, FEATURE, REDUCTION, GNN, DENSE\")\n\n    # ensure that output dimension of a stage is equal to the input dimension of the next one\n    for i in range(1, len(layers)):\n        prev_stage_output_dim_sum = sum([layer.output_dim for layer in layers[i - 1]])\n        cur_stage_input_dim_sum = sum([layer.input_dim for layer in layers[i]])\n        if prev_stage_output_dim_sum != cur_stage_input_dim_sum:\n            raise ValueError(\n                \"Encoder layers dimension mismatch.\\n\"\n                \"Output dimension of stage {} = {}\\n\"\n                \"Input dimension of stage {} = {}\".format(i - 1, prev_stage_output_dim_sum, i, cur_stage_input_dim_sum)\n            )\n\n\ndef check_gnn_layers_alignment(output_config):\n    # we now know that there will be at least one layer as check_encoder_layer_dimensions was called before\n    layers = output_config.model.encoder.layers\n    gnn_stage_count = 0\n    for i in range(len(layers)):\n        for layer in layers[i]:\n            
if layer.type == \"GNN\":\n                gnn_stage_count += 1\n                break\n\n    neighbor_sampling_layers = output_config.model.encoder.train_neighbor_sampling\n    assert gnn_stage_count == len(\n        neighbor_sampling_layers\n    ), \"#GNN Stages != #train_neighbor_sampling layers\\nGNN Stages = {}, train_neighbor_sampling layers = {}\".format(\n        gnn_stage_count, len(neighbor_sampling_layers)\n    )\n\n\n# will remove this once AnzeXie's pr is merged\ndef retrieve_memory_info():\n    mem = psutil.virtual_memory()\n    return mem.total\n\n\n# will remove this once AnzeXie's pr is merged\ndef get_storage_overheads(output_config):\n    num_nodes = output_config.storage.dataset.num_nodes\n    num_edges = output_config.storage.dataset.num_edges\n    num_relations = output_config.storage.dataset.num_relations\n    embedding_dim = 0 if output_config.model.encoder.embedding_dim == -1 else output_config.model.encoder.embedding_dim\n    edge_dtype_size = 8 if output_config.storage.edges.options.dtype in long_dtype_list else 4\n    node_dtype_size = 8 if output_config.storage.nodes.options.dtype in long_dtype_list else 4\n    feature_mem_overhead = 0\n\n    if output_config.storage.dataset.node_feature_dim != -1:\n        feature_dtype_size = 8 if output_config.storage.features.options.dtype in long_dtype_list else 4\n        feature_dim = output_config.storage.dataset.node_feature_dim\n        feature_mem_overhead = feature_dim * num_nodes * feature_dtype_size\n\n    node_mem_overhead = 2 * num_nodes * embedding_dim * node_dtype_size + feature_mem_overhead\n    rel_mem_overhead = 2 * num_relations * embedding_dim * edge_dtype_size\n    edge_mem_overhead = (\n        num_edges * 2 * edge_dtype_size * 2 if num_relations == 1 else num_edges * 3 * edge_dtype_size * 2\n    )\n\n    return node_mem_overhead, rel_mem_overhead, edge_mem_overhead\n\n\ndef check_full_graph_evaluation(output_config):\n    if output_config.storage.dataset.initialized is False:\n 
       return\n\n    full_graph_evaluation = output_config.storage.full_graph_evaluation\n    if not full_graph_evaluation:\n        return\n\n    # replace these function call\n    mem_available = retrieve_memory_info()\n    node_mem_overhead, rel_mem_overhead, _ = get_storage_overheads(output_config)\n    if node_mem_overhead + rel_mem_overhead > mem_available:\n        raise ValueError(\n            \"full_graph_evaluation set to true, but not enough memory available for storing node and relation\"\n            \" embeddings\\nRequired memory = {} bytes, Available memory = {} bytes\".format(\n                str(node_mem_overhead + rel_mem_overhead), str(mem_available)\n            )\n        )\n"
  },
  {
    "path": "src/python/tools/db2graph/marius_db2graph.py",
    "content": "import argparse\nimport logging\nimport re\nimport sys\nimport time\nfrom pathlib import Path\n\nimport mysql.connector\nimport pandas as pd\nimport psutil\nimport psycopg2\nfrom mysql.connector import errorcode\nfrom omegaconf import OmegaConf\n\nINVALID_ENTRY_LIST = [\"0\", None, \"\", 0, \"not reported\", \"None\", \"none\"]\nFETCH_SIZE = int(1e4)\nMAX_FETCH_SIZE = int(1e9)\nOUTPUT_FILE_NAME = \"edges.txt\"\n\n\ndef set_args():\n    \"\"\"\n    Builds the command line argument parser of the db2graph tool.\n\n    :return parser: argparse.ArgumentParser accepting --config_path and --output_directory\n    \"\"\"\n    parser = argparse.ArgumentParser(\n        description=(\n            \"Db2Graph is tool to generate graphs from relational database using SQL queries.                See\"\n            \" documentation docs/db2graph for more details.\"\n        ),\n        prog=\"db2graph\",\n    )\n\n    parser.add_argument(\n        \"--config_path\",\n        metavar=\"config_path\",\n        type=str,\n        default=\"\",\n        help=\"Path to the config file. See documentation docs/db2graph for more details.\",\n    )\n\n    parser.add_argument(\n        \"--output_directory\",\n        metavar=\"output_directory\",\n        type=str,\n        default=\"./\",\n        help=\"Directory to put output data and log file. See documentation docs/db2graph for more details.\",\n    )\n    return parser\n\n\ndef config_parser_fn(config_name):\n    \"\"\"\n    Takes the input yaml config file's name (& relative path). Returns all the extracted data\n\n    :param config_name: file name (& relative path) for the YAML config file\n    :returns:\n        - db_server: string denoting database server (maria-db, my-sql or postgre-sql)\n        - db_name: name of the database you need to pull from\n        - db_user: user name used to access the database (None if not defined)\n        - db_password: password used to access the database (None if not defined)\n        - db_host: hostname of the database (None if not defined)\n        - edges_queries_list: list of sql queries to define edges of type entity nodes to entity nodes\n        - edge_rel_list: list of the names of the edges, one per query in edges_queries_list\n    :raises ValueError: if the file referenced by edges_queries does not exist\n    \"\"\"\n    input_cfg = None\n    input_config_path = Path(config_name).absolute()\n\n    input_cfg = OmegaConf.load(input_config_path)\n\n    # db_server used to distinguish between different databases\n    db_server = None\n    if \"db_server\" in input_cfg.keys():\n        db_server = input_cfg[\"db_server\"]\n    else:\n        logging.error(\"ERROR: db_server is not defined\")\n        exit(1)\n\n    # db_name is the name of the database to pull the data from\n    db_name = None\n    if \"db_name\" in input_cfg.keys():\n        db_name = input_cfg[\"db_name\"]\n    else:\n        logging.error(\"ERROR: db_name is not defined\")\n        exit(1)\n\n    # db_user is the user name used to access the database\n    db_user = None\n    if \"db_user\" in input_cfg.keys():\n        db_user = input_cfg[\"db_user\"]\n    else:\n        logging.error(\"ERROR: db_user is not defined\")\n\n    # db_password is the password used to access the database\n    db_password = None\n    if \"db_password\" in input_cfg.keys():\n        db_password = input_cfg[\"db_password\"]\n    else:\n        logging.error(\"ERROR: db_password is not defined\")\n\n    # db_host is the hostname of the database\n    db_host = None\n    if \"db_host\" in input_cfg.keys():\n        db_host = input_cfg[\"db_host\"]\n    else:\n        logging.error(\"ERROR: db_host is not defined\")\n\n    # Getting all edge queries for edge type entity node to entity node\n    edges_queries_list = list()\n    edge_rel_list = list()\n    if \"edges_queries\" in input_cfg.keys():\n        query_filepath = input_cfg[\"edges_queries\"]\n\n        if not Path(query_filepath).exists():\n            raise ValueError(\"{} does not exist\".format(str(query_filepath)))\n\n        # Context manager so the file handle is always released\n        with open(query_filepath, \"r\") as edge_queries_file:\n            read_lines = edge_queries_file.readlines()\n        for i in range(len(read_lines)):\n            # strip() also removes the trailing newline of each line\n            read_lines[i] = read_lines[i].strip()\n            if read_lines[i] == \"\":\n                logging.error(\"Error: Empty lines are not allowed in edges_query file. \" + \"Please remove them\")\n                exit(1)\n\n            # Adding the line to rel_list if even else its a query\n            if i % 2 == 0:\n                edge_rel_list.append(read_lines[i])\n            else:\n                edges_queries_list.append(read_lines[i])\n    else:\n        logging.error(\"ERROR: edges_queries is not defined\")\n        exit(1)\n\n    return db_server, db_name, db_user, db_password, db_host, edges_queries_list, edge_rel_list\n\n\ndef connect_to_db(db_server, db_name, db_user, db_password, db_host):\n    \"\"\"\n    Tries to connect to the database and returns a connection object which can be used to create cursors\n    and execute queries.\n\n    :param db_server: The name of the backend database application used for accessing data\n    :param db_name: The name of the database where the data resides\n    :param db_user: The user name used to access the database\n    :param db_password: The password used to access the database\n    :param db_host: The hostname of the database\n    :return cnx: connection object that can be used to execute the database queries\n    :raises RuntimeError: if db_server is not supported or the connection could not be established\n    \"\"\"\n    cnx = None\n    if db_server == \"maria-db\" or db_server == \"my-sql\":\n        try:\n            cnx = mysql.connector.connect(user=db_user, password=db_password, host=db_host, database=db_name)\n        except mysql.connector.Error as err:\n            if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:\n                logging.error(f\"Incorrect user name or password\\n{err}\")\n            elif err.errno == errorcode.ER_BAD_DB_ERROR:\n                logging.error(f\"Non-existing database\\n{err}\")\n            else:\n                logging.error(err)\n\n    elif db_server == \"postgre-sql\":\n        try:\n            cnx = psycopg2.connect(user=db_user, password=db_password, host=db_host, database=db_name)\n        except psycopg2.Error as err:\n            logging.error(f\"Error\\n{err}\")\n\n    else:\n        logging.error(\"Other databases are currently not supported.\")\n\n    if cnx is None:\n        # The cause was logged above; fail explicitly instead of returning an unbound local\n        raise RuntimeError(f\"Unable to connect to database {db_name} using db_server {db_server}\")\n\n    return cnx\n\n\ndef validation_check_edge_entity_entity_queries(edges_queries_list):\n    \"\"\"\n    Ensures that the edge_entity_entity_queries are correctly formatted.\n    Logs an error and exits with code 1 on the first incorrectly formatted query.\n\n    :param edges_queries_list: List of all the queries defining edges from entity nodes to entity nodes\n    :return new_query_list: These are updated queries with necessary changes if any\n    \"\"\"\n    # Format: SELECT table1_name.col1_name, table2_name.col2_name FROM ____ WHERE ____ (and so on);\n    logging.info(\"\\nValidating queries for proper formatting\")\n    new_query_list = list()\n    for q in range(len(edges_queries_list)):\n        logging.info(f\"Checking query[{q}]\")\n        qry_split = edges_queries_list[q].strip().split()\n\n        if \"AS\" in qry_split or \"as\" in qry_split:\n            logging.error(\"Error: Cannot use AS keyword in query. Please update\" + \" the query\")\n            exit(1)\n\n        # The checks below index the first four tokens (SELECT, col1, col2, FROM)\n        if len(qry_split) < 4:\n            logging.error(\n                \"Error: Incorrect edge entity node - entity node formatting, \"\n                + \"expected SELECT table1_name.col1_name, table2_name.col2_name FROM ...\"\n            )\n            exit(1)\n\n        check_var = qry_split[0].lower()\n        if check_var != \"select\":\n            logging.error(\"Error: Incorrect edge entity node - entity node formatting, \" + \"not starting with SELECT\")\n            exit(1)\n\n        check_split = qry_split[1].split(\".\")\n        if len(check_split) != 2:\n            logging.error(\n                \"Error: Incorrect edge entity node - entity node formatting, \"\n                + \"table1_name.col1_name not correctly formatted\"\n            )\n            exit(1)\n        # endswith() also covers an empty column name, where indexing [-1] would fail\n        if not check_split[1].endswith(\",\"):\n            logging.error(\n                \"Error: Incorrect edge entity node - entity node formatting, \"\n                + \"missing ',' at the end of table1_name.col1_name\"\n            )\n            exit(1)\n\n        check_split = qry_split[2].split(\".\")\n        if len(check_split) != 2:\n            logging.error(\n                \"Error: Incorrect edge entity node - entity node formatting, \"\n                + \"table2_name.col2_name not correctly formatted\"\n            )\n            exit(1)\n\n        check_var = qry_split[3].lower()\n        if check_var != \"from\":\n            logging.error(\n                \"Error: Incorrect edge entity node - entity node formatting, \"\n                + \"extra elements after table2_name.col2_name\"\n            )\n            exit(1)\n\n        new_query_list.append(edges_queries_list[q])\n\n    return new_query_list\n\n\ndef clean_token(token):\n    \"\"\"\n    Helper to clean a dataframe, can be used by applying this function to a dataframe\n\n    :param token: elements to clean\n    :return token: cleaned token\n    \"\"\"\n    token = str(token)\n    token = token.strip().strip(\"\\t.'\\\" \")\n    return token.lower()\n\n\ndef get_init_fetch_size():\n    \"\"\"\n    In an initial pass, estimates the optimal maximum possible fetch_size\n    for given query based on memory usage report of virtual_memory()\n\n    :return limit_fetch_size: the optimal maximum possible fetch_size for database engine\n    :return mem_copy_used: memory in use when the estimate was taken, baseline for get_fetch_size\n    \"\"\"\n    mem_copy = psutil.virtual_memory()\n    mem_copy_used = mem_copy.used\n    limit_fetch_size = min(mem_copy.available / 2, MAX_FETCH_SIZE)  # max fetch_size limited to MAX_FETCH_SIZE\n    return limit_fetch_size, mem_copy_used\n\n\ndef get_fetch_size(fetch_size, limit_fetch_size, mem_copy_used):\n    \"\"\"\n    Calculates the optimal maximum fetch_size based on the current snapshot of virtual_memory()\n    Increase fetch_size if the amount of memory used is less than half of machine's total available memory\n    The size of fetch_size is limited between FETCH_SIZE and limit_fetch_size\n\n    :param fetch_size: fetch_size used for the batch that was just processed\n    :param limit_fetch_size: the optimal maximum possible fetch_size\n    :param mem_copy_used: memory in use before the batch was processed, as returned by get_init_fetch_size\n    :return fetch_size: updated fetch_size passed into database engine\n    \"\"\"\n    delta = (\n        psutil.virtual_memory().used - mem_copy_used\n    )  # delta between two virtual_memory(), i.e. mem used for curr fetch_size\n    est_fetch_size = limit_fetch_size / (delta + 1) * fetch_size  # estimated optimal fetch_size\n    if est_fetch_size > limit_fetch_size:\n        fetch_size = int(limit_fetch_size)\n    elif FETCH_SIZE < est_fetch_size and est_fetch_size <= limit_fetch_size:\n        fetch_size = int(est_fetch_size)\n    else:\n        fetch_size = FETCH_SIZE\n    return fetch_size\n\n\ndef get_cursor(cnx, db_server, cursor_name):\n    \"\"\"\n    Gets the cursor for the database connection\n\n    :param cnx: database connection\n    :param db_server: database server\n    :param cursor_name: name of the cursor (needed for postgre-sql)\n    :return cursor: cursor for database connection\n    \"\"\"\n    cursor = []\n    if db_server == \"maria-db\" or db_server == \"my-sql\":\n        cursor = cnx.cursor()\n    elif db_server == \"postgre-sql\":\n        cursor = cnx.cursor(name=cursor_name)\n    return cursor\n\n\ndef post_processing(output_dir, cnx, edges_queries_list, edge_rel_list, db_server):\n    \"\"\"\n    Executes the given queries_list one by one, cleanses the data by removing duplicates,\n    then append the entity nodes with tableName_colName which works as Unique Identifier,\n    and store the final result in a dataframe/.txt file\n\n    :param output_dir: Directory to put output file\n    :param cnx: Database connection object\n    :param edges_queries_list: List of all the queries defining edges from entity nodes to entity nodes\n    :param edge_rel_list: List of all the relationships defining edges from entity nodes to entity nodes\n    :param db_server: database server name\n    :return: None, exits with code 1 if the number of queries and relationships differ\n    \"\"\"\n    if len(edges_queries_list) != len(edge_rel_list):\n        logging.error(\"Number of queries in edges_queries_list must match number of edges in edge_rel_list\")\n        exit(1)\n\n    open(output_dir / Path(OUTPUT_FILE_NAME), \"w\").close()  # Clearing the output file\n    logging.info(\"\\nProcessing queries to generate edges\")\n\n    fetch_size = FETCH_SIZE\n    # generating edges entity node to entity nodes\n    for i in range(len(edges_queries_list)):\n        start_time2 = time.time()\n        first_pass = True\n\n        # Executing the query and timing it\n        query = edges_queries_list[i]\n        cursor_name = \"edge_entity_entity_cursor\" + str(\n            i\n        )  # Name imp because: https://www.psycopg.org/docs/usage.html#server-side-cursors\n        cursor = get_cursor(cnx, db_server, cursor_name)\n        cursor.execute(query)\n\n        # Getting Basic Details\n        # Split on any run of whitespace, like the validation step does, so that repeated spaces\n        # or tabs between tokens do not shift the token positions\n        table_name_list = re.split(r\"\\s+\", query.strip())  # tokens of the query to execute\n        table_name1 = table_name_list[1].split(\".\")[0]  # src table\n        col_name1 = table_name_list[1].split(\".\")[1][:-1]  # src column, (note last character ',' is removed)\n        table_name2 = table_name_list[2].split(\".\")[0]  # dst/target table\n        col_name2 = table_name_list[2].split(\".\")[1]  # dst/target column\n\n        # Processing each batch of cursor on client\n        rows_completed = 0\n\n        # In an initial sample pass, estimates the optimal maximum possible fetch_size for\n        # given query based on memory usage report of virtual_memory()\n        # process data with fetch_size=10000, record the amount of memory used,\n        # increase fetch_size if the amount of memory used is less than half of machine's total available memory,\n        # Note: fetch_size is limited between FETCH_SIZE and MAX_FETCH_SIZE\n        if first_pass:\n            limit_fetch_size, mem_copy_used = get_init_fetch_size()\n\n        # Potential issue: There might be duplicates now possible as drop_duplicates over smaller range\n        # expected that user db does not have duplicates\n        while True:  # Looping till all rows are completed and processed\n            result = cursor.fetchmany(fetch_size)\n            result = pd.DataFrame(result)\n            if result.shape[0] == 0:\n                break\n\n            # Cleaning Part\n            result = result.applymap(clean_token)  # strip tokens and lower case strings\n            result = result[~result.iloc[:, 1].isin(INVALID_ENTRY_LIST)]  # clean invalid data\n            result = result[~result.iloc[:, 0].isin(INVALID_ENTRY_LIST)]\n            result = result.drop_duplicates()  # remove duplicated rows\n\n            result.iloc[:, 0] = table_name1 + \"_\" + col_name1 + \"_\" + result.iloc[:, 0]  # src\n            result.iloc[:, 1] = table_name2 + \"_\" + col_name2 + \"_\" + result.iloc[:, 1]  # dst/target\n            result.insert(1, \"rel\", edge_rel_list[i])  # rel\n            result.columns = [\"src\", \"rel\", \"dst\"]\n\n            # storing the output\n            result.to_csv(\n                output_dir / Path(OUTPUT_FILE_NAME), sep=\"\\t\", header=False, index=False, mode=\"a\"\n            )  # Appending the output to disk\n            del result\n            rows_completed += fetch_size\n\n            # update fetch_size based on current snapshot of the machine's memory usage\n            if first_pass:\n                fetch_size = get_fetch_size(fetch_size, limit_fetch_size, mem_copy_used)\n                first_pass = False\n\n        # Releasing the cursor once the query is fully consumed\n        cursor.close()\n        logging.info(f\"Finished processing query[{i}] in {time.time() - start_time2:.3f} seconds\")\n\n\ndef main():\n    \"\"\"\n    Entry point: parses the arguments and the config file, connects to the database\n    and writes the edge file to the output directory.\n    \"\"\"\n    total_time = time.time()\n    parser = set_args()\n    args = parser.parse_args()\n\n    # Logging is configured before anything can call logging.error(): the first module-level logging\n    # call on an unconfigured root logger installs a default handler, after which basicConfig() is a\n    # no-op and the log file would never be created\n    output_dir = Path(args.output_directory)\n    output_dir.mkdir(parents=True, exist_ok=True)\n    logging.basicConfig(\n        filename=output_dir / Path(\"marius_db2graph.log\"), level=logging.INFO, filemode=\"w\"\n    )  # set filemode='w' if want to start a fresh log file\n    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))  # add handler to print to console\n\n    ret_data = config_parser_fn(args.config_path)\n    db_server = ret_data[0]\n    db_name = ret_data[1]\n    db_user = ret_data[2]\n    db_password = ret_data[3]\n    db_host = ret_data[4]\n    edges_queries_list = ret_data[5]\n    edge_rel_list = ret_data[6]\n\n    cnx = None\n    try:\n        logging.info(f\"\\nStarting marius_db2graph conversion tool for config: {args.config_path}\")\n\n        cnx = connect_to_db(db_server, db_name, db_user, db_password, db_host)\n\n        # Generating edges\n        edges_queries_list = validation_check_edge_entity_entity_queries(edges_queries_list)\n        post_processing(output_dir, cnx, edges_queries_list, edge_rel_list, db_server)\n\n        logging.info(f\"\\nTotal execution time: {time.time()-total_time:.3f} seconds\")\n        logging.info(\"\\nEdge file written to \" + str(output_dir / Path(OUTPUT_FILE_NAME)))\n    except Exception as e:\n        logging.error(e)\n        logging.info(f\"\\nTotal execution time: {time.time()-total_time:.3f} seconds\")\n    finally:\n        # Close the connection on every path, including failures and exit() calls\n        if cnx is not None:\n            cnx.close()\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "src/python/tools/marius_config_generator.py",
    "content": "import argparse\nimport os\nfrom pathlib import Path\n\nimport pandas as pd\n\nHERE = os.path.abspath(os.path.dirname(__file__))\nDEFAULT_CONFIG_FILE = os.path.join(HERE, \"config_templates\", \"default_configs.txt\")\nDATASET_STATS = os.path.join(HERE, \"dataset_stats\", \"dataset_stats.tsv\")\n\n\ndef output_config(config_dict, output_dir):\n    device = config_dict.get(\"device\")\n    if config_dict.get(\"dataset\") is None:\n        ds_name = \"custom\"\n    else:\n        ds_name = config_dict.get(\"dataset\")\n\n    file = Path(output_dir) / Path(str(ds_name) + \"_\" + device.lower() + \".ini\")\n    all_sections = [\n        \"general\",\n        \"model\",\n        \"storage\",\n        \"training_sampling\",\n        \"training\",\n        \"training_pipeline\",\n        \"evaluation\",\n        \"evaluation_pipeline\",\n        \"path\",\n        \"reporting\",\n    ]\n    opts = list(config_dict.keys())\n    section_to_print = []\n\n    for sec in all_sections:\n        for key in opts:\n            if key.split(\".\")[0] == sec:\n                if sec not in section_to_print:\n                    section_to_print.append(sec)\n\n    with open(file, \"w+\") as f:\n        for sec in section_to_print:\n            f.write(\"[\" + sec + \"]\\n\")\n            for key in opts:\n                if key.split(\".\")[0] == sec:\n                    f.write(key.split(\".\")[1] + \"=\" + str(config_dict.get(key)) + \"\\n\")\n            f.write(\"\\n\")\n\n\ndef read_template(file):\n    with open(file, \"r\") as f:\n        lines = f.readlines()\n\n    keys = []\n    values = []\n    valid_dict = {}\n    for line in lines:\n        line = line.split(\"=\")\n        line[1] = line[1].rstrip()\n        keys.append(line[0])\n        sub_line = line[1].split(\"*\")\n        values.append(sub_line[0])\n        if len(sub_line) > 1:\n            valid_dict.update({line[0]: sub_line[1:]})\n    config_dict = dict(zip(keys, values))\n\n    return 
config_dict, valid_dict\n\n\ndef set_up_files(output_directory):\n    try:\n        if not Path(output_directory).exists():\n            Path(output_directory).mkdir(parents=False, exist_ok=False)\n    except FileExistsError:\n        print(\"Directory already exists.\")\n    except FileNotFoundError:\n        print(\"Incorrect parent path given for output directory.\")\n\n\ndef update_dataset_stats(dataset, arg_dict, config_dict):\n    datasets_stats = pd.read_csv(DATASET_STATS, sep=\"\\t\")\n    stats_row = datasets_stats[datasets_stats[\"dataset\"] == dataset]\n    if not stats_row.empty:\n        stats_list = stats_row.iloc[0][[\"num_nodes\", \"num_train\", \"num_relations\", \"num_valid\", \"num_test\"]].tolist()\n        arg_dict = update_stats(stats_list, arg_dict, config_dict, opt=\"stats_dataset\")\n    else:\n        raise RuntimeError(\"Unrecognized dataset\")\n\n    return arg_dict\n\n\ndef update_stats(stats, arg_dict, config_dict, opt=\"stats\"):\n    keys_common = [\"general.num_nodes\", \"general.num_train\"]\n    for i in range(len(keys_common)):\n        k = keys_common[i]\n        if arg_dict.get(k) is None and config_dict.get(k) != stats[i]:\n            arg_dict.update({k: stats[i]})\n\n    if opt == \"stats_dataset\":\n        keys = [\"general.num_relations\", \"general.num_valid\", \"general.num_test\"]\n        for i in range(len(keys)):\n            k = keys[i]\n            if arg_dict.get(k) is None and config_dict.get(k) != stats[i + 2]:\n                arg_dict.update({k: stats[i + 2]})\n    else:\n        if arg_dict.get(\"general.num_edges\") is None and config_dict.get(\"general.num_edges\") != stats[2]:\n            arg_dict.update({\"general.num_edges\": str(int(stats[2]))})\n\n    return arg_dict\n\n\ndef update_data_path(dir, arg_dict):\n    dir = Path(dir)\n\n    if arg_dict.get(\"path.train_edges\") is None:\n        arg_dict.update({\"path.train_edges\": str(dir / Path(\"train_edges.pt\"))})\n\n    if 
arg_dict.get(\"custom_ordering\"):\n        arg_dict.update({\"path.custom_ordering\": str(dir / Path(\"custom_ordering.txt\"))})\n\n    if arg_dict.get(\"partitions_train\"):\n        arg_dict.update({\"path.train_edges_paritions\": str(dir / Path(\"train_edges_partitions.txt\"))})\n\n    if arg_dict.get(\"partitions_valid\") and arg_dict.get(\"general.num_valid\") != \"0\":\n        arg_dict.update({\"path.validation_edges_paritions\": str(dir / Path(\"validation_edges_partitions.txt\"))})\n\n    if arg_dict.get(\"partitions_test\") and arg_dict.get(\"general.num_test\") != \"0\":\n        arg_dict.update({\"path.test_edges_paritions\": str(dir / Path(\"test_edges_partitions.txt\"))})\n\n    if arg_dict.get(\"general.learning_task\") is None:\n        if arg_dict.get(\"general.num_valid\") != \"0\" and arg_dict.get(\"path.validation_edges\") is None:\n            arg_dict.update({\"path.validation_edges\": str(dir / Path(\"valid_edges.pt\"))})\n\n        if arg_dict.get(\"general.num_test\") != \"0\" and arg_dict.get(\"path.test_edges\") is None:\n            arg_dict.update({\"path.test_edges\": str(dir / Path(\"test_edges.pt\"))})\n\n        if arg_dict.get(\"path.node_ids\") is None:\n            arg_dict.update({\"path.node_ids\": str(dir / Path(\"node_mapping.txt\"))})\n\n        if arg_dict.get(\"general.num_relations\") != \"1\" and arg_dict.get(\"path.relation_ids\") is None:\n            arg_dict.update({\"path.relation_ids\": str(dir / Path(\"rel_mapping.txt\"))})\n    else:\n        if arg_dict.get(\"path.train_nodes\") is None:\n            arg_dict.update({\"path.train_nodes\": str(dir / Path(\"train_nodes.pt\"))})\n\n        if arg_dict.get(\"path.node_features\") is None:\n            arg_dict.update({\"path.node_features\": str(dir / Path(\"features.pt\"))})\n\n        if arg_dict.get(\"path.node_labels\") is None:\n            arg_dict.update({\"path.node_labels\": str(dir / Path(\"labels.pt\"))})\n\n        if arg_dict.get(\"general.num_valid\") 
!= \"0\" and arg_dict.get(\"path.valid_nodes\") is None:\n            arg_dict.update({\"path.valid_nodes\": str(dir / Path(\"valid_nodes.pt\"))})\n\n        if arg_dict.get(\"general.num_test\") != \"0\" and arg_dict.get(\"path.test_nodes\") is None:\n            arg_dict.update({\"path.test_nodes\": str(dir / Path(\"test_nodes.pt\"))})\n\n    return arg_dict\n\n\ndef set_args():\n    parser = argparse.ArgumentParser(\n        description=\"Generate configs\",\n        prog=\"config_generator\",\n        formatter_class=argparse.RawTextHelpFormatter,\n        epilog=(\"Specify certain config (optional): \" + \"[--<section>.<key>=<value>]\"),\n    )\n    mode = parser.add_mutually_exclusive_group()\n    parser.add_argument(\n        \"output_directory\",\n        metavar=\"output_directory\",\n        type=str,\n        help=\"Directory to put configs \\nAlso \"\n        + \"assumed to be the default directory of preprocessed\"\n        + \" data if --data_directory is not specified\",\n    )\n    parser.add_argument(\n        \"--data_directory\", metavar=\"data_directory\", type=str, help=\"Directory of the preprocessed data\"\n    )\n    mode.add_argument(\"--dataset\", \"-d\", metavar=\"dataset\", type=str, help=\"Dataset to preprocess\")\n    mode.add_argument(\n        \"--stats\",\n        \"-s\",\n        metavar=(\"num_nodes\", \"num_train\"),\n        nargs=2,\n        help=\"Dataset statistics.\\n\"\n        + \"Enter in order of num_nodes, num_train.\\n\"\n        + \"This option will be overwritten if general.num_nodes \"\n        + \"and/or general.num_train are/is specified.\",\n    )\n    mode.add_argument(\n        \"--stats_nc\",\n        \"-s_nc\",\n        metavar=(\"num_nodes\", \"num_train\", \"num_edges\"),\n        nargs=3,\n        help=\"Enter in order of num_nodes, num_train, \"\n        + \"num_edges if the learning task is node \"\n        + \"classification.\\n\"\n        + \"This option will be overwritten if general.num_edges\"\n     
   + \", general.num_nodes, general.num_train is specified.\",\n    )\n    parser.add_argument(\n        \"--device\",\n        \"-dev\",\n        metavar=\"generate_config\",\n        choices=[\"GPU\", \"CPU\", \"multi-GPU\"],\n        nargs=\"?\",\n        default=\"GPU\",\n        help=(\n            \"Generates configs for a single-GPU/multi-CPU\"\n            + \"/multi-GPU training configuration file by \"\n            + \"default. \\nValid options (default to GPU): \"\n            + \"[GPU, CPU, multi-GPU]\\n\"\n            + \"This option will be overwritten if \"\n            + \"general.device is specified.\"\n        ),\n    )\n    parser.add_argument(\n        \"--custom_ordering\",\n        \"-co\",\n        action=\"store_true\",\n        help=\"If stated, will add default custom_ordering \"\n        + \"file path to configuration file.\\n\"\n        + \"This option will be overwritten if \"\n        + \"path.custom_ordering is specified.\",\n    )\n    parser.add_argument(\n        \"--partitions_train\",\n        action=\"store_true\",\n        help=\"If stated, will add default training edges \"\n        + \"partitions file to configuration file.\\n\"\n        + \"This option will be overwritten if \"\n        + \"path.train_edges_partitions is specified.\",\n    )\n    parser.add_argument(\n        \"--partitions_valid\",\n        action=\"store_true\",\n        help=\"If stated, will add default valid edges \"\n        + \"partitions file to configuration file.\\n\"\n        + \"This option will be overwritten if \"\n        + \"path.validation_edges_partitions is specified.\",\n    )\n    parser.add_argument(\n        \"--partitions_test\",\n        action=\"store_true\",\n        help=\"If stated, will add default test edges \"\n        + \"partitions file to configuration file.\\n\"\n        + \"This option will be overwritten if \"\n        + \"path.test_edges_partitions is specified.\",\n    )\n\n    config_dict, valid_dict = 
read_template(DEFAULT_CONFIG_FILE)\n\n    for key in list(config_dict.keys()):\n        if valid_dict.get(key) is not None:\n            parser.add_argument(\n                str(\"--\" + key),\n                metavar=key,\n                type=str,\n                choices=valid_dict.get(key),\n                default=config_dict.get(key),\n                help=argparse.SUPPRESS,\n            )\n        else:\n            parser.add_argument(\n                str(\"--\" + key), metavar=key, type=str, default=config_dict.get(key), help=argparse.SUPPRESS\n            )\n\n    return parser, config_dict\n\n\ndef parse_args(args, config_dict):\n    arg_dict = vars(args)\n    set_up_files(args.output_directory)\n\n    for key in list(config_dict.keys()):\n        if arg_dict.get(key) == config_dict.get(key):\n            arg_dict.pop(key)\n\n    if arg_dict.get(\"general.device\") is None:\n        if arg_dict.get(\"device\") != config_dict.get(\"general.device\"):\n            arg_dict.update({\"general.device\": arg_dict.get(\"device\")})\n\n    if arg_dict.get(\"dataset\") is not None:\n        arg_dict = update_dataset_stats(arg_dict.get(\"dataset\"), arg_dict, config_dict)\n    elif arg_dict.get(\"stats\") is not None:\n        arg_dict = update_stats(arg_dict.get(\"stats\"), arg_dict)\n    elif arg_dict.get(\"stats_nc\") is not None:\n        arg_dict = update_stats(arg_dict.get(\"stats_nc\"), arg_dict, config_dict, \"nodeclassification\")\n    else:\n        raise RuntimeError(\"Must specify either dataset or dataset stats.\")\n\n    dir = args.output_directory\n    if args.data_directory is None:\n        arg_dict = update_data_path(dir, arg_dict)\n    else:\n        arg_dict = update_data_path(args.data_directory, arg_dict)\n\n    return arg_dict\n\n\ndef main():\n    parser, config_dict = set_args()\n    args = parser.parse_args()\n    config_dict = parse_args(args, config_dict)\n    output_config(config_dict, args.output_directory)\n\n\nif __name__ == 
\"__main__\":\n    main()\n"
  },
  {
    "path": "src/python/tools/marius_postprocess.py",
    "content": "import argparse\nfrom argparse import RawDescriptionHelpFormatter\nfrom pathlib import Path\n\nfrom marius.tools.postprocess.in_memory_exporter import InMemoryExporter\n\n# from marius.tools.postprocess.spark_exporter import SparkExporter\n\n\ndef set_args():\n    parser = argparse.ArgumentParser(\n        description=(\n            \"Convert trained embeddings to desired output format and output to specified directory.\\n\\n\"\n            \"Example usage:\\n\"\n            \"marius_postprocess --model_dir foo --format csv --output_dir bar\"\n        ),\n        prog=\"postprocess\",\n        formatter_class=RawDescriptionHelpFormatter,\n    )\n    parser.add_argument(\"--model_dir\", metavar=\"model_dir\", type=str, help=\"Directory of the trained model\")\n    parser.add_argument(\n        \"--format\",\n        \"-f\",\n        metavar=\"format\",\n        default=\"CSV\",\n        help=\"Format of output embeddings. Choices are [csv, parquet, binary]\",\n    )\n    parser.add_argument(\"--delim\", metavar=\"delim\", default=\",\", help=\"Delimiter to use for the output CSV\")\n    # parser.add_argument('--spark',\n    #                     action='store_true',\n    #                     default=False,\n    #                     help='If true, pyspark will be used to perform the postprocessing')\n    parser.add_argument(\n        \"--output_dir\",\n        metavar=\"output_dir\",\n        type=str,\n        default=None,\n        help=\"Output directory, if not provided the model directory will be used.\",\n    )\n    parser.add_argument(\n        \"--overwrite\", action=\"store_true\", default=False, help=\"If enabled, the output directory will be overwritten\"\n    )\n\n    return parser\n\n\ndef main():\n    parser = set_args()\n    args = parser.parse_args()\n    model_dir = Path(args.model_dir)\n    fmt = args.format.upper()\n    delim = args.delim\n    output_dir = args.output_dir\n\n    if output_dir is None:\n        output_dir = 
model_dir\n    else:\n        output_dir = Path(output_dir)\n\n    exporter = InMemoryExporter(model_dir, fmt=fmt, delim=delim, overwrite=args.overwrite)\n    exporter.export(output_dir)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "src/python/tools/marius_predict.py",
    "content": "import argparse\nimport os\nimport pathlib\nfrom argparse import RawDescriptionHelpFormatter\n\nimport numpy as np\nimport pandas as pd\n\nimport marius as m\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.prediction.link_prediction import infer_lp\nfrom marius.tools.prediction.node_classification import infer_nc\nfrom marius.tools.preprocess.converters.partitioners.torch_partitioner import partition_edges\nfrom marius.tools.preprocess.converters.readers.pandas_readers import PandasDelimitedFileReader\nfrom marius.tools.preprocess.converters.torch_converter import (\n    SUPPORTED_DELIM_FORMATS,\n    apply_mapping1d,\n    apply_mapping_edges,\n    dataframe_to_tensor,\n)\n\nimport torch  # isort:skip\n\n\ndef str2bool(v):\n    if isinstance(v, bool):\n        return v\n    if v.lower() in (\"yes\", \"true\", \"t\", \"y\", \"1\"):\n        return True\n    elif v.lower() in (\"no\", \"false\", \"f\", \"n\", \"0\"):\n        return False\n    else:\n        raise argparse.ArgumentTypeError(\"Boolean value expected.\")\n\n\ndef set_args():\n    parser = argparse.ArgumentParser(\n        description=(\n            \"Tool for performing link prediction or node classification inference with trained models.\\n\\nLink\"\n            \" prediction example usage: \\nmarius_predict <trained_config> --output_dir results/ --metrics mrr mean_rank\"\n            \" hits1 hits10 hits50 --save_scores --save_ranks \\nAssuming <trained_config> contains a link prediction\"\n            \" model, this command will perform link prediction evaluation over the test set of edges provided in the\"\n            \" config file. 
Metrics are saved to results/metrics.txt and scores and ranks for each test edge are saved\"\n            \" to results/scores.csv \\n\\nNode classification example usage: \\nmarius_predict <trained_config>\"\n            \" --output_dir results/ --metrics accuracy --save_labels \\nThis command will perform node classification\"\n            \" evaluation over the test set of nodes provided in the config file. Metrics are saved to\"\n            \" results/metrics.txt and labels for each test node are saved to results/labels.csv \\n\\nCustom inputs:\"\n            \" \\nThe test set can be directly specified setting --input_file <test_set_file>. If the test set has not\"\n            \" been preprocessed, then --preprocess_input should be enabled. The default format is a binary file, but\"\n            \" additional formats can be specified with --input_format.\"\n        ),\n        prog=\"predict\",\n        formatter_class=RawDescriptionHelpFormatter,\n    )\n    parser.add_argument(\n        \"--config\", metavar=\"config\", required=True, type=str, help=\"Configuration file for trained model\"\n    )\n\n    parser.add_argument(\"--output_dir\", metavar=\"output_dir\", type=str, default=\"\", help=\"Path to output directory\")\n\n    parser.add_argument(\n        \"--metrics\", metavar=\"metrics\", type=str, nargs=\"*\", default=[], help=\"List of metrics to report\"\n    )\n\n    parser.add_argument(\n        \"--save_labels\",\n        action=\"store_true\",\n        default=False,\n        help=(\n            \"(Node Classification) If true, the node classification labels of each test node will be saved to\"\n            \" <output_dir>/labels.csv\"\n        ),\n    )\n\n    parser.add_argument(\n        \"--save_scores\",\n        action=\"store_true\",\n        default=False,\n        help=(\n            \"(Link Prediction) If true, the link prediction scores of each test edge will be saved to\"\n            \" <output_dir>/scores.csv\"\n        ),\n    
)\n\n    parser.add_argument(\n        \"--save_ranks\",\n        action=\"store_true\",\n        default=False,\n        help=(\n            \"(Link Prediction) If true, the link prediction ranks of each test edge will be saved to\"\n            \" <output_dir>/scores.csv\"\n        ),\n    )\n\n    parser.add_argument(\n        \"--batch_size\", metavar=\"batch_size\", type=int, default=10000, help=\"Number of examples to evaluate at a time.\"\n    )\n\n    parser.add_argument(\n        \"--num_nbrs\",\n        metavar=\"num_nbrs\",\n        type=list,\n        default=None,\n        help=(\n            \"Number of neighbors to sample for each GNN layer. If not provided, then the module will check if the output\"\n            \" of the encoder has been saved after training (see storage.export_encoded_nodes). If the encoder outputs\"\n            \" exist, then the module will skip the encode step (incl. neighbor sampling) and only perform the decode\"\n            \" over the saved inputs. If encoder outputs are not saved, model.encoder.eval_neighbor_sampling will be used\"\n            \" for the neighbor sampling configuration. If model.encoder.eval_neighbor_sampling does not exist, then\"\n            \" model.encoder.train_neighbor_sampling will be used. If none of the above are given, then the model is\"\n            \" assumed to not require neighbor sampling.\"\n        ),\n    )\n\n    parser.add_argument(\n        \"--num_negs\",\n        metavar=\"num_negs\",\n        type=int,\n        default=None,\n        help=(\n            \"(Link Prediction) Number of negatives to compare per positive edge for link prediction. If -1, then all\"\n            \" nodes are used as negatives. 
Otherwise, num_neg*num_chunks nodes will be sampled and used as negatives. If\"\n            \" not provided, the evaluation.negative_sampling configuration will be used. If evaluation.negative_sampling\"\n            \" is not provided, then negative sampling will not occur and only the scores for the input edges will be\"\n            \" computed, this means that any ranking metrics cannot be calculated.\"\n        ),\n    )\n\n    parser.add_argument(\n        \"--num_chunks\",\n        metavar=\"num_chunks\",\n        type=int,\n        default=1,\n        help=(\n            \"(Link Prediction) Specifies the amount of reuse of negative samples. \"\n            \"A given set of num_neg sampled nodes will be reused to corrupt (batch_size // num_chunks) edges.\"\n        ),\n    )\n\n    parser.add_argument(\n        \"--deg_frac\",\n        metavar=\"deg_frac\",\n        type=float,\n        default=0.0,\n        help=(\n            \"(Link Prediction) Specifies the fraction of the num_neg nodes sampled as negatives that should be sampled\"\n            \" according to their degree. This sampling procedure approximates degree based sampling by sampling nodes\"\n            \" that appear in the current batch of edges.\"\n        ),\n    )\n\n    parser.add_argument(\n        \"--filtered\",\n        metavar=\"filtered\",\n        type=str2bool,\n        default=True,\n        help=(\n            \"(Link Prediction) If true, then false negative samples will be filtered out. 
\"\n            \"This is only supported when evaluating with all nodes.\"\n        ),\n    )\n\n    parser.add_argument(\n        \"--input_file\",\n        metavar=\"input_file\",\n        type=str,\n        default=\"\",\n        help=(\n            \"Path to input file containing the test set, \"\n            \"if not provided then the test set described in the configuration file will be used.\"\n        ),\n    )\n\n    parser.add_argument(\n        \"--input_format\",\n        metavar=\"input_format\",\n        type=str,\n        default=\"binary\",\n        help=(\n            \"Format of the input file to test. \"\n            \"Options are [BINARY, CSV, TSV, DELIMITED] files. If DELIMITED, then --delim must be specified.\"\n        ),\n    )\n\n    parser.add_argument(\n        \"--preprocess_input\",\n        metavar=\"preprocess_input\",\n        type=str2bool,\n        default=False,\n        help=\"If true, the input file (if provided) will be preprocessed before evaluation.\",\n    )\n\n    parser.add_argument(\n        \"--columns\",\n        metavar=\"columns\",\n        type=list,\n        default=[],\n        help=(\n            \"List of column ids of input delimited file which denote the src node, edge-type, and dst node of\"\n            \" edges.E.g. columns=[0, 2, 1] means that the source nodes are found in the first column of the file, the\"\n            \" edge-types are found in the third column, and the destination nodes are found in the second column.For\"\n            \" graphs without edge types, only the location node columns need to be provided. E.g. [0, 1]If the input\"\n            \" file contains node ids rather than edges, then only a single id is needed. E.g. 
[2]\"\n        ),\n    )\n\n    parser.add_argument(\n        \"--header_length\",\n        metavar=\"header_length\",\n        type=int,\n        default=0,\n        help=\"Length of the header for input delimited file\",\n    )\n\n    parser.add_argument(\"--delim\", metavar=\"delim\", type=str, default=None, help=\"Delimiter for input file\")\n\n    parser.add_argument(\n        \"--dtype\",\n        metavar=\"dtype\",\n        type=str,\n        default=\"\",\n        help=\"Datatype of input file elements. Defaults to the dataset specified in the configuration file.\",\n    )\n\n    return parser\n\n\ndef get_metrics(config, args):\n    metrics = []\n    if config.model.learning_task == m.config.LearningTask.LINK_PREDICTION:\n        # setup metrics\n        for metric in args.metrics:\n            metric = metric.upper()\n\n            if metric == \"MRR\" or metric == \"MEANRECIPROCALRANK\" or metric == \"MEAN_RECIPROCAL_RANK\":\n                metrics.append(m.report.MeanReciprocalRank())\n            elif metric == \"MR\" or metric == \"MEANRANK\" or metric == \"MEAN_RANK\":\n                metrics.append(m.report.MeanRank())\n            elif metric.startswith(\"HITS\"):\n                str_offset = 4\n                if metric.startswith(\"HITS@K\"):\n                    str_offset = 6\n                try:\n                    k = int(metric[str_offset:])\n                    metrics.append(m.report.Hitsk(k))\n                except RuntimeError as err:\n                    raise RuntimeWarning(\n                        \"Unable to parse k value for hits@k metric: \" + metric + \"\\nError: \" + err.__str__()\n                    )\n\n            else:\n                raise RuntimeWarning(\"Unsupported metric for link prediction: \" + metric)\n\n    elif config.model.learning_task == m.config.LearningTask.NODE_CLASSIFICATION:\n        for metric in args.metrics:\n            metric = metric.upper()\n\n            if (\n                metric == 
\"ACC\"\n                or metric == \"ACCURACY\"\n                or metric == \"CATEGORICAL_ACCURACY\"\n                or metric == \"CATEGORICALACCURACY\"\n            ):\n                metrics.append(m.report.CategoricalAccuracy())\n            else:\n                raise RuntimeWarning(\"Unsupported metric for node classification: \" + metric)\n\n    else:\n        raise RuntimeError(\"Unsupported learning task.\")\n\n    return metrics\n\n\ndef get_dtype(storage_backend, args):\n    str_dtype = args.dtype.lower()\n    if str_dtype == \"\":\n        if storage_backend.dtype == torch.int32:\n            numpy_dtype = np.int32\n            str_dtype = \"int32\"\n        else:\n            numpy_dtype = np.int64\n            str_dtype = \"int64\"\n    else:\n        if str_dtype == \"int32\" or str_dtype == \"int\":\n            numpy_dtype = np.int32\n        elif str_dtype == \"int64\" or str_dtype == \"long\":\n            numpy_dtype = np.int64\n        else:\n            raise RuntimeError(\"Unsupported datatype for input file.\")\n\n    return str_dtype, numpy_dtype\n\n\ndef get_columns(config, args):\n    is_edges = config.model.learning_task == m.config.LearningTask.LINK_PREDICTION\n\n    columns = args.columns\n    if len(columns) == 0:\n        if is_edges:\n            if config.storage.dataset.num_relations > 1:\n                columns = [0, 1, 2]\n            else:\n                columns = [0, 1]\n        else:\n            columns = [0]\n    else:\n        if is_edges:\n            if config.storage.dataset.num_relations > 1:\n                assert len(columns) == 3\n            else:\n                assert len(columns) == 2\n        else:\n            assert len(columns) == 1\n    return columns\n\n\ndef infer_input_shape(config, args):\n    is_edges = config.model.learning_task == m.config.LearningTask.LINK_PREDICTION\n\n    if args.input_format.upper() == \"BINARY\" or args.input_format.upper() == \"BIN\":\n        if is_edges:\n        
    storage_backend = config.storage.edges\n\n            file_size = os.stat(args.input_file).st_size\n            _, numpy_dtype = get_dtype(storage_backend, args)\n\n            if config.storage.dataset.num_relations > 1:\n                shape = [file_size // (numpy_dtype.itemsize * 3), 3]\n            else:\n                shape = [file_size // (numpy_dtype.itemsize * 2), 2]\n\n            assert shape[0] * shape[1] * numpy_dtype.itemsize == file_size\n        else:\n            storage_backend = config.storage.nodes\n\n            file_size = os.stat(args.input_file).st_size\n            _, numpy_dtype = get_dtype(storage_backend, args)\n            shape = [file_size // numpy_dtype.itemsize]\n            assert shape[0] * numpy_dtype.itemsize == file_size\n\n    elif args.input_format.upper() in SUPPORTED_DELIM_FORMATS:\n        line_count = None\n        with open(args.input_format) as f:\n            line_count = sum(1 for _ in f)\n\n        if is_edges:\n            if config.storage.dataset.num_relations > 1:\n                shape = [line_count, 3]\n            else:\n                shape = [line_count, 2]\n        else:\n            if config.storage.dataset.num_relations > 1:\n                shape = [line_count, 3]\n            else:\n                shape = [line_count, 2]\n    else:\n        raise RuntimeError(\"Unsupported input format. 
\")\n\n    return shape\n\n\ndef get_nbrs_config(config, args):\n    nbrs = args.num_nbrs\n    if nbrs is None:\n        if config.storage.export_encoded_nodes and config.model.learning_task == m.config.LearningTask.LINK_PREDICTION:\n            return None\n\n        nbrs = []\n        if config.model.encoder.eval_neighbor_sampling is not None:\n            for layer in config.model.encoder.eval_neighbor_sampling:\n                if layer.type == m.config.NeighborSamplingLayer.ALL:\n                    nbrs.append(-1)\n                else:\n                    nbrs.append(layer.options.num_neighbors)\n\n            return nbrs\n\n        if config.model.encoder.train_neighbor_sampling is not None:\n            for layer in config.model.encoder.train_neighbor_sampling:\n                if layer.type == m.config.NeighborSamplingLayer.ALL:\n                    nbrs.append(-1)\n                else:\n                    nbrs.append(layer.options.num_neighbors)\n\n            return nbrs\n\n    return nbrs\n\n\ndef get_neg_config(config, args):\n    if args.num_negs is None:\n        num_negs = config.evaluation.negative_sampling.negatives_per_positive\n        num_chunks = config.evaluation.negative_sampling.num_chunks\n        deg_frac = config.evaluation.negative_sampling.degree_fraction\n        filtered = config.evaluation.negative_sampling.filtered\n        return num_negs, num_chunks, deg_frac, filtered\n    else:\n        return args.num_negs, args.num_chunks, args.deg_frac, args.filtered\n\n\ndef preprocess_input_file(config, args):\n    assert args.preprocess_input\n    assert pathlib.Path(args.input_file).exists()\n\n    is_edges = config.model.learning_task == m.config.LearningTask.LINK_PREDICTION\n\n    if is_edges:\n        storage_backend = config.storage.edges\n    else:\n        storage_backend = config.storage.nodes\n\n    shape = infer_input_shape(config, args)\n    str_dtype, numpy_dtype = get_dtype(storage_backend, args)\n\n    node_mapping_file 
= config.storage.dataset.dataset_dir + PathConstants.node_mapping_path\n    rel_mapping_file = config.storage.dataset.dataset_dir + PathConstants.relation_mapping_path\n\n    node_mapping_df = None\n    rel_mapping_df = None\n\n    if pathlib.Path(node_mapping_file).exists():\n        node_mapping_df = pd.read_csv(node_mapping_file, sep=\",\", header=None)\n\n    if pathlib.Path(rel_mapping_file).exists():\n        rel_mapping_df = pd.read_csv(rel_mapping_file, sep=\",\", header=None)\n\n    if args.input_format.upper() == \"BINARY\" or args.input_format.upper() == \"BIN\":\n        input_tensor = torch.from_file(np.fromfile(args.filename, numpy_dtype)).resize(shape)\n\n        if node_mapping_df is not None:\n            if len(input_tensor.shape) == 2:\n                input_tensor = apply_mapping_edges(input_tensor, node_mapping_df, rel_mapping_df)\n            else:\n                input_tensor = apply_mapping1d(input_tensor, node_mapping_df)\n    else:\n        columns = get_columns(config, args)\n\n        delim = args.delim\n\n        if delim is None:\n            if args.input_format.upper() == \"CSV\":\n                delim = \",\"\n            elif args.input_format.upper() == \"TSV\":\n                delim = \"\\t\"\n            else:\n                raise RuntimeError(\"Delimiter must be specified.\")\n\n        reader = PandasDelimitedFileReader(\n            args.input_file, columns=columns, header_length=args.header_length, delim=delim, dtype=str_dtype\n        )\n\n        input_df, _, _ = reader.read()\n\n        if node_mapping_df is not None:\n            if len(input_df.shape) == 2:\n                input_df = apply_mapping_edges(input_df, node_mapping_df, rel_mapping_df)\n            else:\n                input_df = apply_mapping1d(input_df, node_mapping_df)\n\n        input_tensor = dataframe_to_tensor(input_df)\n\n    # TODO probably not a great way to name the preprocessed file\n    input_file = \"preproc_\" + 
args.input_file.split(\".\")[-2] + \".bin\"\n    input_file_offsets = None\n\n    num_partitions = 1\n    if (\n        config.storage.embeddings is not None\n        and config.storage.embeddings.type == m.config.StorageBackend.PARTITION_BUFFER\n    ):\n        num_partitions = config.storage.embeddings.options.num_partitions\n    elif (\n        config.storage.features is not None and config.storage.features.type == m.config.StorageBackend.PARTITION_BUFFER\n    ):\n        num_partitions = config.storage.features.options.num_partitions\n\n    if num_partitions > 1 and len(input_tensor.shape) == 2:\n        input_file_offsets = args.input_file.split(\".\")[-2] + \"_offsets.txt\"\n        input_tensor, offsets = partition_edges(input_tensor, config.storage.dataset.num_nodes, num_partitions)\n\n        with open(config.storage.dataset.dataset_dir + input_file_offsets, \"w\") as f:\n            f.writelines([str(o) + \"\\n\" for o in offsets])\n\n    with open(config.storage.dataset.dataset_dir + input_file, \"wb\") as f:\n        f.write(bytes(input_tensor.numpy()))\n\n    return input_file, input_file_offsets, storage_backend, shape\n\n\ndef get_input_file_storage(config, args):\n    assert pathlib.Path(args.input_file).exists()\n\n    if args.preprocess_input:\n        input_file, input_file_offsets, storage_backend, shape = preprocess_input_file(config, args)\n    else:\n        input_file = args.input_file\n        input_file_offsets = None\n\n        is_edges = config.model.learning_task == m.config.LearningTask.LINK_PREDICTION\n        if is_edges:\n            storage_backend = config.storage.edges\n        else:\n            storage_backend = config.storage.nodes\n\n        shape = infer_input_shape(config, args)\n\n    if storage_backend.type is m.config.StorageBackend.DEVICE_MEMORY:\n        input_storage = m.storage.InMemory(input_file, shape, storage_backend.dtype, config.storage.device)\n    elif storage_backend.type is 
m.config.StorageBackend.HOST_MEMORY:\n        input_storage = m.storage.InMemory(input_file, shape, storage_backend.dtype, torch.device(\"cpu\"))\n    elif storage_backend.type is m.config.StorageBackend.FLAT_FILE:\n        input_storage = m.storage.FlatFile(input_file, shape, storage_backend.dtype)\n    else:\n        raise RuntimeError(\"Unexpected storage backend for input_file.\")\n\n    if input_file_offsets is not None:\n        input_storage.read_edge_bucket_sizes(input_file_offsets)\n\n\ndef run_predict(args):\n    config = m.config.loadConfig(args.config)\n    metrics = get_metrics(config, args)\n\n    model_dir_path = pathlib.Path(config.storage.model_dir)\n    if not model_dir_path.exists():\n        raise RuntimeError(\"Path {} with model params doesn't exist.\".format(str(model_dir_path)))\n\n    model: m.nn.Model = m.storage.load_model(args.config, train=False)\n    graph_storage: m.storage.GraphModelStorage = m.storage.load_storage(args.config, train=False)\n\n    if args.input_file != \"\":\n        input_storage = get_input_file_storage(config, args)\n\n        if config.model.learning_task == m.config.LearningTask.LINK_PREDICTION:\n            graph_storage.storage_ptrs.edges = input_storage\n        elif config.model.learning_task == m.config.LearningTask.NODE_CLASSIFICATION:\n            graph_storage.storage_ptrs.nodes = input_storage\n        else:\n            raise RuntimeError(\"Unsupported learning task for inference.\")\n    else:\n        graph_storage.setTestSet()\n\n    output_dir = args.output_dir\n    if output_dir == \"\":\n        output_dir = config.storage.model_dir\n\n    nbrs = get_nbrs_config(config, args)\n\n    if config.model.learning_task == m.config.LearningTask.LINK_PREDICTION:\n        num_negs, num_chunks, deg_frac, filtered = get_neg_config(config, args)\n        infer_lp(\n            model=model,\n            graph_storage=graph_storage,\n            output_dir=output_dir,\n            metrics=metrics,\n            
save_scores=args.save_scores,\n            save_ranks=args.save_ranks,\n            batch_size=args.batch_size,\n            num_nbrs=nbrs,\n            num_negs=num_negs,\n            num_chunks=num_chunks,\n            deg_frac=deg_frac,\n            filtered=filtered,\n        )\n\n    elif config.model.learning_task == m.config.LearningTask.NODE_CLASSIFICATION:\n        infer_nc(\n            model=model,\n            graph_storage=graph_storage,\n            output_dir=output_dir,\n            metrics=metrics,\n            save_labels=args.save_labels,\n            batch_size=args.batch_size,\n            num_nbrs=nbrs,\n        )\n    else:\n        raise RuntimeError(\"Unsupported learning task for inference.\")\n\n    print(\"Results output to: {}\".format(output_dir))\n\n\ndef main():\n    parser = set_args()\n    args = parser.parse_args()\n    run_predict(args)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "src/python/tools/marius_preprocess.py",
    "content": "import argparse\nimport shutil\nfrom pathlib import Path\n\nfrom marius.tools.preprocess import custom\nfrom marius.tools.preprocess.datasets import (\n    fb15k,\n    fb15k_237,\n    freebase86m,\n    livejournal,\n    ogb_mag240m,\n    ogb_wikikg90mv2,\n    ogbl_citation2,\n    ogbl_collab,\n    ogbl_ppa,\n    ogbl_wikikg2,\n    ogbn_arxiv,\n    ogbn_papers100m,\n    ogbn_products,\n    twitter,\n)\n\n\ndef set_args():\n    parser = argparse.ArgumentParser(description=\"Preprocess Datasets\", prog=\"preprocess\")\n\n    parser.add_argument(\n        \"--output_directory\", metavar=\"output_directory\", type=str, default=\"\", help=\"Directory to put graph data\"\n    )\n\n    parser.add_argument(\n        \"--edges\", metavar=\"edges\", nargs=\"+\", type=str, help=\"File(s) containing the edge list(s) for a custom dataset\"\n    )\n\n    parser.add_argument(\n        \"--dataset\", metavar=\"dataset\", type=str, default=\"custom\", help=\"Name of dataset to preprocess\"\n    )\n\n    parser.add_argument(\n        \"--num_partitions\",\n        metavar=\"num_partitions\",\n        required=False,\n        type=int,\n        default=1,\n        help=\"Number of node partitions\",\n    )\n\n    parser.add_argument(\n        \"--partitioned_eval\",\n        action=\"store_true\",\n        default=False,\n        help=\"If true, the validation and/or the test set will be partitioned.\",\n    )\n\n    parser.add_argument(\n        \"--delim\", \"-d\", metavar=\"delim\", type=str, default=\"\\t\", help=\"Delimiter to use for delimited file inputs\"\n    )\n\n    parser.add_argument(\n        \"--dataset_split\",\n        \"-ds\",\n        metavar=\"dataset_split\",\n        nargs=\"+\",\n        type=float,\n        default=None,\n        help=\"Split dataset into specified fractions\",\n    )\n\n    parser.add_argument(\n        \"--overwrite\",\n        action=\"store_true\",\n        default=False,\n        help=\"If true, the preprocessed dataset 
will be overwritten if it already exists\",\n    )\n\n    parser.add_argument(\n        \"--spark\", action=\"store_true\", default=False, help=\"If true, pyspark will be used to perform the preprocessing\"\n    )\n\n    parser.add_argument(\n        \"--no_remap_ids\",\n        action=\"store_true\",\n        default=False,\n        help=\"If true, the node ids of the input dataset will not be remapped to random integer ids\",\n    )\n\n    parser.add_argument(\n        \"--sequential_train_nodes\",\n        action=\"store_true\",\n        default=False,\n        help=\"If true, the train nodes will be given ids 0 to num train nodes\",\n    )\n\n    parser.add_argument(\n        \"--src_column\",\n        metavar=\"src_column\",\n        required=False,\n        type=int,\n        default=None,\n        help=\"The column id of the src column\",\n    )\n\n    parser.add_argument(\n        \"--dst_column\",\n        metavar=\"dst_column\",\n        required=False,\n        type=int,\n        default=None,\n        help=\"The column id of the dst column\",\n    )\n\n    parser.add_argument(\n        \"--edge_type_column\",\n        metavar=\"edge_type_column\",\n        required=False,\n        type=int,\n        default=None,\n        help=\"The column id which denotes the edge type column\",\n    )\n\n    parser.add_argument(\n        \"--edge_weight_column\",\n        metavar=\"edge_weight_column\",\n        required=False,\n        type=int,\n        default=None,\n        help=\"The column id which denotes the edge weight column\",\n    )\n\n    return parser\n\n\ndef main():\n    parser = set_args()\n    args = parser.parse_args()\n    if args.dataset == \"custom\" and (args.src_column is None or args.dst_column is None):\n        parser.error(\"When using a custom dataset, src column and dst column must be specified\")\n\n    if args.output_directory == \"\":\n        args.output_directory = args.dataset\n\n    if args.overwrite and 
Path(args.output_directory).exists():\n        shutil.rmtree(args.output_directory)\n\n    dataset_dict = {\n        \"FB15K\": fb15k.FB15K,\n        \"FB15K_237\": fb15k_237.FB15K237,\n        \"LIVEJOURNAL\": livejournal.Livejournal,\n        \"TWITTER\": twitter.Twitter,\n        \"FREEBASE86M\": freebase86m.Freebase86m,\n        \"OGBL_WIKIKG2\": ogbl_wikikg2.OGBLWikiKG2,\n        \"OGBL_CITATION2\": ogbl_citation2.OGBLCitation2,\n        \"OGBL_PPA\": ogbl_ppa.OGBLPpa,\n        \"OGBN_ARXIV\": ogbn_arxiv.OGBNArxiv,\n        \"OGBN_PRODUCTS\": ogbn_products.OGBNProducts,\n        \"OGBN_PAPERS100M\": ogbn_papers100m.OGBNPapers100M,\n        \"OGB_WIKIKG90MV2\": ogb_wikikg90mv2.OGBWikiKG90Mv2,\n        \"OGB_MAG240M\": ogb_mag240m.OGBMag240M,\n        \"OGBL_COLLAB\": ogbl_collab.OGBLCollab,\n    }\n\n    dataset = dataset_dict.get(args.dataset.upper())\n    if dataset is not None:\n        print(\"Using existing dataset of\", args.dataset.upper())\n        dataset = dataset(args.output_directory, spark=args.spark)\n        dataset.download(args.overwrite)\n        dataset.preprocess(\n            num_partitions=args.num_partitions,\n            remap_ids=not args.no_remap_ids,\n            splits=args.dataset_split,\n            sequential_train_nodes=args.sequential_train_nodes,\n            partitioned_eval=args.partitioned_eval,\n        )\n\n    else:\n        print(\"Preprocess custom dataset\")\n\n        # custom link prediction dataset\n        dataset = custom.CustomLinkPredictionDataset(\n            output_directory=args.output_directory,\n            files=args.edges,\n            delim=args.delim,\n            dataset_name=args.dataset,\n            spark=args.spark,\n        )\n        dataset.preprocess(\n            num_partitions=args.num_partitions,\n            remap_ids=not args.no_remap_ids,\n            splits=args.dataset_split,\n            partitioned_eval=args.partitioned_eval,\n            
sequential_train_nodes=args.sequential_train_nodes,\n            src_column=args.src_column,\n            dst_column=args.dst_column,\n            edge_type_column=args.edge_type_column,\n            edge_weight_column=args.edge_weight_column,\n        )\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "src/python/tools/postprocess/__init__.py",
    "content": ""
  },
  {
    "path": "src/python/tools/postprocess/in_memory_exporter.py",
    "content": "import shutil\nfrom pathlib import Path\n\nimport numpy as np\nimport pandas as pd\nfrom omegaconf import OmegaConf\n\nfrom marius.tools.configuration.constants import PathConstants\n\nimport torch  # isort:skip\n\nSUPPORTED_FORMATS = [\"CSV\", \"PARQUET\", \"BINARY\", \"BIN\"]\n\n\ndef get_ordered_raw_ids(mapping_path):\n    assert mapping_path.exists()\n\n    mapping = pd.read_csv(mapping_path, header=None)\n    raw_id = mapping.iloc[:, 0]\n    mapped_id = mapping.iloc[:, 1]\n\n    sorted_args = np.argsort(mapped_id)\n    raw_id = raw_id[sorted_args]\n\n    return raw_id\n\n\ndef save_df(output_df: pd.DataFrame, output_dir: Path, name: str, fmt: str, delim: str = \",\", overwrite: bool = False):\n    output_path = output_dir / Path(f\"{name}.{fmt.lower()}\")\n\n    if output_path.exists() and not overwrite:\n        raise RuntimeError(f\"{output_path} already exists. Enable overwrite mode or delete/move the file to save.\")\n\n    if fmt == \"CSV\":\n        with np.printoptions(linewidth=10000):\n            output_df.to_csv(output_path, sep=delim, index=False, encoding=\"utf8\")\n    elif fmt == \"PARQUET\":\n        output_df.to_parquet(output_path)\n    else:\n        raise RuntimeError(f\"Unimplemented format: {fmt}\")\n\n    print(f\"Wrote {output_path}: shape {output_df.shape}\")\n\n\nclass InMemoryExporter(object):\n    def __init__(self, model_dir: Path, fmt: str = \"CSV\", delim: str = \",\", overwrite: bool = False):\n        fmt = fmt.upper()\n\n        if not model_dir.exists():\n            raise RuntimeError(f\"Model directory not found {model_dir}\")\n\n        if fmt not in SUPPORTED_FORMATS:\n            raise RuntimeError(f\"Unsupported format {fmt}, must be one of {SUPPORTED_FORMATS}\")\n\n        self.model_dir = model_dir\n        self.fmt = fmt\n        self.delim = delim\n        self.overwrite = overwrite\n        self.config = OmegaConf.load(model_dir / PathConstants.saved_full_config_file_name)\n\n    def 
export_node_embeddings(self, output_dir: Path):\n        num_nodes = self.config.storage.dataset.num_nodes\n        node_embedding_path = self.model_dir / \"embeddings.bin\"\n        node_mapping_path = self.model_dir / PathConstants.node_mapping_file\n\n        if node_embedding_path.exists():\n            raw_id = get_ordered_raw_ids(node_mapping_path)\n        else:\n            raw_id = np.arange(num_nodes)\n\n        if node_embedding_path.exists():\n            save_df(\n                pd.DataFrame(\n                    np.array(\n                        [raw_id, list(np.fromfile(node_embedding_path, np.float32).reshape(num_nodes, -1))],\n                        dtype=object,\n                    ).T,\n                    columns=[\"id\", \"embedding\"],\n                ),\n                output_dir,\n                \"embeddings\",\n                self.fmt,\n                self.delim,\n                self.overwrite,\n            )\n\n        encoded_nodes_path = self.model_dir / \"encoded_nodes.bin\"\n        if encoded_nodes_path.exists():\n            save_df(\n                pd.DataFrame(\n                    np.array(\n                        [raw_id, list(np.fromfile(encoded_nodes_path, np.float32).reshape(num_nodes, -1))], dtype=object\n                    ).T,\n                    columns=[\"id\", \"embedding\"],\n                ),\n                output_dir,\n                \"encoded_nodes\",\n                self.fmt,\n                self.delim,\n                self.overwrite,\n            )\n\n    def export_rel_embeddings(self, output_dir: Path):\n        num_rels = self.config.storage.dataset.num_relations\n        model = torch.jit.load(self.model_dir / PathConstants.model_file).to(\"cpu\")\n        rel_mapping_path = self.model_dir / PathConstants.relation_mapping_path\n\n        if rel_mapping_path.exists():\n            raw_id = get_ordered_raw_ids(rel_mapping_path)\n        else:\n            raw_id = np.arange(num_rels)\n\n      
  model_param_dict = dict(model.named_parameters(recurse=True))\n\n        if \"relation_embeddings\" in model_param_dict.keys():\n            save_df(\n                pd.DataFrame(\n                    np.array([raw_id, list(model_param_dict[\"relation_embeddings\"].detach().numpy())], dtype=object).T,\n                    columns=[\"id\", \"embedding\"],\n                ),\n                output_dir,\n                \"relation_embeddings\",\n                self.fmt,\n                self.delim,\n                self.overwrite,\n            )\n\n        if \"inverse_relation_embeddings\" in model_param_dict.keys():\n            save_df(\n                pd.DataFrame(\n                    np.array(\n                        [raw_id, list(model_param_dict[\"inverse_relation_embeddings\"].detach().numpy())], dtype=object\n                    ).T,\n                    columns=[\"id\", \"embedding\"],\n                ),\n                output_dir,\n                \"inverse_relation_embeddings\",\n                self.fmt,\n                self.delim,\n                self.overwrite,\n            )\n\n    def export_model(self, output_dir: Path):\n        model_path = self.model_dir / PathConstants.model_file\n        output_path = Path(f\"{output_dir}/model.pt\")\n\n        if model_path != output_path:\n            if output_dir.__str__().startswith(\"s3://\"):\n                import s3fs\n\n                s3 = s3fs.S3FileSystem()\n                s3.put(model_path, output_path)\n            else:\n                if output_path.exists() and not self.overwrite:\n                    raise RuntimeError(\n                        f\"{output_path} already exists. 
Enable overwrite mode or delete/move the file to save.\"\n                    )\n                shutil.copy(model_path, output_path)\n                print(f\"Wrote {output_path}\")\n\n    def copy_model(self, output_dir: Path):\n        if self.model_dir != output_dir:\n            if output_dir.__str__().startswith(\"s3://\"):\n                import s3fs\n\n                s3 = s3fs.S3FileSystem()\n                s3.put(self.model_dir, output_dir)\n            else:\n                shutil.copytree(self.model_dir, output_dir, dirs_exist_ok=self.overwrite)\n\n    def export(self, output_dir: Path):\n        if self.fmt.startswith(\"BIN\"):\n            self.copy_model(output_dir)\n        else:\n            if not output_dir.__str__().startswith(\"s3://\"):\n                output_dir.mkdir(parents=True, exist_ok=True)\n            self.export_node_embeddings(output_dir)\n            self.export_rel_embeddings(output_dir)\n            self.export_model(output_dir)\n"
  },
  {
    "path": "src/python/tools/prediction/link_prediction.py",
    "content": "import marius as m\n\n\ndef infer_lp(\n    model: m.nn.Model,\n    graph_storage: m.storage.GraphModelStorage,\n    output_dir: str,\n    metrics: list = None,\n    save_scores: bool = False,\n    save_ranks: bool = False,\n    batch_size: int = 10000,\n    num_nbrs: list = None,\n    num_negs: int = None,\n    num_chunks: int = 1,\n    deg_frac: float = 0.0,\n    filtered: bool = True,\n):\n    reporter = m.report.LinkPredictionReporter()\n\n    for metric in metrics:\n        reporter.add_metric(metric)\n\n    neg_sampler = None\n    if num_negs is None:\n        for metric in metrics:\n            if isinstance(metric, m.report.RankingMetric):\n                raise RuntimeError(\"Ranking metrics require the negative sampling configuration to be provided.\")\n\n        # Set the decoder to only compute scores for the positives\n        model.decoder.mode = m.config.EdgeDecoderMethod.ONLY_POS\n    else:\n        model.decoder.mode = m.config.EdgeDecoderMethod.CORRUPT_NODE\n        neg_sampler = m.samplers.CorruptNodeNegativeSampler(num_chunks, num_negs, deg_frac, filtered)\n\n    nbr_sampler = None\n    if num_nbrs is not None:\n        nbr_sampler = m.samplers.LayeredNeighborSampler(graph_storage, num_nbrs)\n    # if not graph_storage.has_encoded() and num_nbrs is not None:\n    #     nbr_sampler = m.samplers.LayeredNeighborSampler(graph_storage, num_nbrs)\n\n    dataloader = m.data.DataLoader(\n        graph_storage=graph_storage,\n        neg_sampler=neg_sampler,\n        nbr_sampler=nbr_sampler,\n        batch_size=batch_size,\n        learning_task=\"lp\",\n    )\n\n    dataloader.initializeBatches()\n\n    while dataloader.hasNextBatch():\n        batch = dataloader.getBatch(model.device)\n\n        pos, neg, inv_pos, inv_neg = model.forward_lp(batch, train=False)\n\n        # if graph_storage.has_encoded():\n        #     # batch.node_embeddings contains saved encoder outputs\n        #     pos, neg, inv_pos, inv_neg = 
model.decoder.forward(batch.edges, batch.node_embeddings)\n        # else:\n        #     pos, neg, inv_pos, inv_neg = model.forward_lp(batch, train=False)\n\n        reporter.add_result(pos, neg, batch.edges)\n        if inv_pos is not None:\n            reporter.add_result(inv_pos, inv_neg, batch.edges)\n\n        batch.clear()\n        dataloader.finishedBatch()\n\n    reporter.save(output_dir, save_scores, save_ranks)\n"
  },
  {
    "path": "src/python/tools/prediction/node_classification.py",
    "content": "import marius as m\n\n\ndef infer_nc(\n    model: m.nn.Model,\n    graph_storage: m.storage.GraphModelStorage,\n    output_dir: str,\n    metrics: list = None,\n    save_labels: bool = False,\n    batch_size: int = 1000,\n    num_nbrs: list = None,\n):\n    reporter = m.report.NodeClassificationReporter()\n\n    for metric in metrics:\n        reporter.add_metric(metric)\n\n    nbr_sampler = None\n    if num_nbrs is not None:\n        nbr_sampler = m.samplers.LayeredNeighborSampler(graph_storage, num_nbrs)\n\n    dataloader = m.data.DataLoader(\n        graph_storage=graph_storage, nbr_sampler=nbr_sampler, batch_size=batch_size, learning_task=\"nc\"\n    )\n\n    dataloader.initializeBatches()\n\n    while dataloader.hasNextBatch():\n        batch = dataloader.getBatch(model.device)\n        labels = model.forward_nc(batch.node_embeddings, batch.node_features, batch.dense_graph, train=False)\n        reporter.add_result(labels)\n        batch.clear()\n        dataloader.finishedBatch()\n\n    reporter.save(output_dir, save_labels)\n"
  },
  {
    "path": "src/python/tools/preprocess/__init__.py",
    "content": ""
  },
  {
    "path": "src/python/tools/preprocess/converters/__init__.py",
    "content": ""
  },
  {
    "path": "src/python/tools/preprocess/converters/partitioners/__init__.py",
    "content": ""
  },
  {
    "path": "src/python/tools/preprocess/converters/partitioners/partitioner.py",
    "content": "from abc import ABC\n\n\nclass Partitioner(ABC):\n    def __init__(self):\n        pass\n"
  },
  {
    "path": "src/python/tools/preprocess/converters/partitioners/spark_partitioner.py",
    "content": "import math\n\nfrom pyspark.sql.dataframe import DataFrame\nfrom pyspark.sql.functions import floor\n\nfrom marius.tools.preprocess.converters.partitioners.partitioner import Partitioner\nfrom marius.tools.preprocess.converters.spark_constants import DST_EDGE_BUCKET_COL, INDEX_COL, SRC_EDGE_BUCKET_COL\nfrom marius.tools.preprocess.utils import get_df_count\n\n\ndef get_partition_size(nodes, num_partitions):\n    partition_size = math.ceil(get_df_count(nodes, INDEX_COL) / num_partitions)\n    return partition_size\n\n\ndef get_edge_buckets(edges_df: DataFrame, partition_size):\n    partitioned_edges = edges_df.withColumn(SRC_EDGE_BUCKET_COL, floor(edges_df.src / partition_size)).withColumn(\n        DST_EDGE_BUCKET_COL, floor(edges_df.dst / partition_size)\n    )\n    return partitioned_edges\n\n\nclass SparkPartitioner(Partitioner):\n    def __init__(self, spark, partitioned_evaluation):\n        super().__init__()\n\n        self.spark = spark\n        self.partitioned_evaluation = partitioned_evaluation\n\n    def partition_edges(self, train_edges_df, valid_edges_df, test_edges_df, nodes_df, num_partitions):\n        \"\"\" \"\"\"\n        partition_size = get_partition_size(nodes_df, num_partitions)\n        train_edges_df = get_edge_buckets(train_edges_df, partition_size)\n\n        if self.partitioned_evaluation:\n            if valid_edges_df is not None:\n                valid_edges_df = get_edge_buckets(valid_edges_df, partition_size)\n\n            if test_edges_df is not None:\n                test_edges_df = get_edge_buckets(test_edges_df, partition_size)\n\n        return train_edges_df, valid_edges_df, test_edges_df\n"
  },
  {
    "path": "src/python/tools/preprocess/converters/partitioners/torch_partitioner.py",
    "content": "import numpy as np\n\nfrom marius.tools.preprocess.converters.partitioners.partitioner import Partitioner\n\nimport torch  # isort:skip\n\n\ndef dataframe_to_tensor(df):\n    return torch.tensor(df.to_numpy())\n\n\ndef partition_edges(edges, num_nodes, num_partitions, edge_weights=None):\n    partition_size = int(np.ceil(num_nodes / num_partitions))\n\n    src_partitions = torch.div(edges[:, 0], partition_size, rounding_mode=\"trunc\")\n    dst_partitions = torch.div(edges[:, -1], partition_size, rounding_mode=\"trunc\")\n\n    _, dst_args = torch.sort(dst_partitions, stable=True)\n    _, src_args = torch.sort(src_partitions[dst_args], stable=True)\n    sort_order = dst_args[src_args]\n\n    edges = edges[sort_order]\n    if edge_weights is not None:\n        edge_weights = edge_weights[sort_order]\n\n    edge_bucket_ids = torch.div(edges, partition_size, rounding_mode=\"trunc\")\n    offsets = np.zeros([num_partitions, num_partitions], dtype=int)\n    unique_src, num_source = torch.unique_consecutive(edge_bucket_ids[:, 0], return_counts=True)\n\n    num_source_offsets = torch.cumsum(num_source, 0) - num_source\n\n    curr_src_unique = 0\n    for i in range(num_partitions):\n        if curr_src_unique < unique_src.size(0) and unique_src[curr_src_unique] == i:\n            offset = num_source_offsets[curr_src_unique]\n            num_edges = num_source[curr_src_unique]\n            dst_ids = edge_bucket_ids[offset : offset + num_edges, -1]\n\n            unique_dst, num_dst = torch.unique_consecutive(dst_ids, return_counts=True)\n\n            offsets[i][unique_dst] = num_dst\n            curr_src_unique += 1\n\n    offsets = list(offsets.flatten())\n\n    return edges, offsets, edge_weights\n\n\nclass TorchPartitioner(Partitioner):\n    def __init__(self, partitioned_evaluation):\n        super().__init__()\n\n        self.partitioned_evaluation = partitioned_evaluation\n\n    def partition_edges(\n        self, train_edges_tens, valid_edges_tens, 
test_edges_tens, num_nodes, num_partitions, edge_weights=None\n    ):\n        # Extract the edge weights\n        train_edge_weights, valid_edge_weights, test_edge_weights = None, None, None\n        if edge_weights is not None:\n            train_edge_weights, valid_edge_weights, test_edge_weights = (\n                edge_weights[0],\n                edge_weights[1],\n                edge_weights[2],\n            )\n\n        train_edges_tens, train_offsets, train_edge_weights = partition_edges(\n            train_edges_tens, num_nodes, num_partitions, edge_weights=train_edge_weights\n        )\n\n        valid_offsets = None\n        test_offsets = None\n\n        if self.partitioned_evaluation:\n            if valid_edges_tens is not None:\n                valid_edges_tens, valid_offsets, valid_edge_weights = partition_edges(\n                    valid_edges_tens, num_nodes, num_partitions, edge_weights=valid_edge_weights\n                )\n\n            if test_edges_tens is not None:\n                test_edges_tens, test_offsets, test_edge_weights = partition_edges(\n                    test_edges_tens, num_nodes, num_partitions, edge_weights=test_edge_weights\n                )\n\n        return (\n            train_edges_tens,\n            train_offsets,\n            valid_edges_tens,\n            valid_offsets,\n            test_edges_tens,\n            test_offsets,\n            [train_edge_weights, valid_edge_weights, test_edge_weights],\n        )\n"
  },
  {
    "path": "src/python/tools/preprocess/converters/readers/__init__.py",
    "content": ""
  },
  {
    "path": "src/python/tools/preprocess/converters/readers/pandas_readers.py",
    "content": "from pathlib import Path\n\nimport pandas as pd\n\nfrom marius.tools.preprocess.converters.readers.reader import Reader\nfrom marius.tools.preprocess.converters.torch_constants import TorchConverterColumnKeys as ColNames\n\n\nclass PandasDelimitedFileReader(Reader):\n    def __init__(\n        self,\n        train_edges: Path,\n        valid_edges: Path = None,\n        test_edges: Path = None,\n        columns: dict = {},\n        header_length: int = 0,\n        delim: str = \"\\t\",\n    ):\n        \"\"\"\n        This class converts an input dataset from a delimited file format, into the format required for input to Marius\n\n        :param train_edges:                 The path to the raw training edge list [REQUIRED]\n        :param valid_edges:                 The path to the raw validation edge list\n        :param test_edges:                  The path to the raw test edge list\n                                            it is the train/valid/test split. The sum of this list must be 1.\n        :param columns:                     A dict containing the columns we want to extract and the names we want\n                                            to assing them. 
The key should be the name we want to assign the column\n                                            and the value is the column id.\n                                            Any columns with a None id are ignored.\n        :param header_length:               The length of the header of the input edge lists\n        :param delim:                       The delimiter used between columns of the input edge lists\n        \"\"\"\n\n        super().__init__()\n\n        assert train_edges is not None\n        self.train_edges = train_edges\n        self.valid_edges = valid_edges\n        self.test_edges = test_edges\n        self.header_length = header_length\n        self.columns = columns\n        self.delim = delim\n\n    def read_single_file(self, file_path):\n        if file_path is None:\n            return None\n\n        # Determine the columns to read\n        cols_to_keeps = []\n        id_to_name_mapping = {}\n        for col_name, col_id in self.columns.items():\n            if col_id is not None:\n                cols_to_keeps.append(col_id)\n                id_to_name_mapping[col_id] = col_name.value\n\n        # Read the file and extract the columns we need\n        file_data = pd.read_csv(file_path, delimiter=self.delim, skiprows=self.header_length, header=None)\n        file_data = file_data[cols_to_keeps]\n        file_data = file_data.rename(columns=id_to_name_mapping)\n\n        # Make sure we got the src and dst columns\n        columns_read = list(file_data.columns)\n        assert \"src_column\" in columns_read\n        assert \"dst_column\" in columns_read\n\n        # Ensure that data is in the proper order\n        cols_order = [ColNames.SRC_COL.value, ColNames.DST_COL.value]\n        if \"edge_type_column\" in columns_read:\n            cols_order.insert(len(cols_order) - 1, ColNames.EDGE_TYPE_COL.value)\n\n        if \"edge_weight_column\" in columns_read:\n            cols_order.insert(len(cols_order), 
ColNames.EDGE_WEIGHT_COL.value)\n\n        file_data = file_data[cols_order]\n        return file_data\n\n    def read(self):\n        return (\n            self.read_single_file(self.train_edges),\n            self.read_single_file(self.valid_edges),\n            self.read_single_file(self.test_edges),\n        )\n"
  },
  {
    "path": "src/python/tools/preprocess/converters/readers/reader.py",
    "content": "from abc import ABC, abstractmethod\n\n\nclass Reader(ABC):\n    def __init__(self):\n        pass\n\n    @abstractmethod\n    def read(self):\n        \"\"\"\n        This function reads a set of input data and converts it to either torch tensors or pyspark dataframes\n        \"\"\"\n        pass\n"
  },
  {
    "path": "src/python/tools/preprocess/converters/readers/spark_readers.py",
    "content": "from pathlib import Path\n\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.dataframe import DataFrame\n\nfrom marius.tools.preprocess.converters.readers.reader import Reader\n\n\nclass SparkDelimitedFileReader(Reader):\n    def __init__(\n        self,\n        spark: SparkSession,\n        train_edges: Path,\n        valid_edges: Path = None,\n        test_edges: Path = None,\n        columns: list = [0, 1, 2],\n        header_length: int = 0,\n        delim: str = \"\\t\",\n        dtype: str = \"int32\",\n    ):\n        \"\"\"\n        This class converts an input dataset from a delimited file format, into the format required for input to Marius\n\n        :param spark:                       The spark session to use [REQUIRED]\n        :param train_edges:                 The path to the raw training edge list [REQUIRED]\n        :param valid_edges:                 The path to the raw validation edge list\n        :param test_edges:                  The path to the raw test edge list\n                                            it is the train/valid/test split. The sum of this list must be 1.\n        :param columns:                     Denotes the columns to extract for the edges. The default is [0, 1, 2],\n                                            where the first index is the column id of the src nodes, the second the\n                                            relations (edge-types), and the third the dst nodes. 
For graphs without\n                                            edge types, only two ids should be provided.\n        :param header_length:               The length of the header of the input edge lists\n        :param delim:                       The delimiter used between columns of the input edge lists\n        :param dtype:                       The datatype of the integer ids assigned to each entity\n        \"\"\"\n\n        super().__init__()\n\n        self.spark = spark\n\n        self.train_edges = train_edges\n        self.valid_edges = valid_edges\n        self.test_edges = test_edges\n        self.columns = columns\n        self.header_length = header_length\n\n        self.header = False\n\n        if self.header_length > 1:\n            raise RuntimeError(\"Spark reader unable to support files with multiline headers\")\n        elif self.header_length == 1:\n            self.header = True\n\n        self.delim = delim\n        self.dtype = dtype\n\n        if len(self.columns) == 2:\n            self.has_rels = False\n        elif len(self.columns) == 3:\n            self.has_rels = True\n        else:\n            raise RuntimeError(\n                \"Incorrect number of columns specified, expected length 2 or 3, received {}\".format(len(self.columns))\n            )\n\n    def read(self):\n        all_edges_df: DataFrame = None\n        train_edges_df: DataFrame = None\n        valid_edges_df: DataFrame = None\n        test_edges_df: DataFrame = None\n\n        if self.valid_edges is None and self.test_edges is None:\n            # no validation or test edges supplied\n\n            # read in training edge list\n            all_edges_df = self.spark.read.option(\"header\", self.header).csv(self.train_edges.__str__(), sep=self.delim)\n\n            column_order = []\n            for i in self.columns:\n                column_order.append(all_edges_df.columns[i])\n\n            all_edges_df = all_edges_df.select(column_order)\n        else:\n          
  # predefined valid and test edges.\n            all_edges_df = self.spark.read.option(\"header\", self.header).csv(\n                [self.train_edges.__str__(), self.valid_edges.__str__(), self.test_edges.__str__()], sep=self.delim\n            )\n\n            train_edges_df = self.spark.read.option(\"header\", self.header).csv(\n                self.train_edges.__str__(), sep=self.delim\n            )\n\n            valid_edges_df = self.spark.read.option(\"header\", self.header).csv(\n                self.valid_edges.__str__(), sep=self.delim\n            )\n\n            test_edges_df = self.spark.read.option(\"header\", self.header).csv(self.test_edges.__str__(), sep=self.delim)\n\n            column_order = []\n            for i in self.columns:\n                column_order.append(all_edges_df.columns[i])\n\n            all_edges_df = all_edges_df.select(column_order)\n            train_edges_df = train_edges_df.select(column_order)\n            valid_edges_df = valid_edges_df.select(column_order)\n            test_edges_df = test_edges_df.select(column_order)\n\n        return all_edges_df, train_edges_df, valid_edges_df, test_edges_df\n"
  },
  {
    "path": "src/python/tools/preprocess/converters/spark_constants.py",
    "content": "SRC_COL = \"src\"\nREL_COL = \"rel\"\nDST_COL = \"dst\"\nINDEX_COL = \"index\"\nSRC_EDGE_BUCKET_COL = \"src_part\"\nDST_EDGE_BUCKET_COL = \"dst_part\"\nPARTITION_ID = \"partition_id\"\nNODE_LABEL = \"node_label\"\nRELATION_LABEL = \"relation_label\"\nTMP_DATA_DIRECTORY = \"tmp_pyspark\"\nSPARK_APP_NAME = \"marius_edge_converter\"\nEDGES_INDEX_COL = \"edge_index\"\nREL_INDEX_COL = \"rel_index\"\n"
  },
  {
    "path": "src/python/tools/preprocess/converters/spark_converter.py",
    "content": "import glob\nimport os\nfrom pathlib import Path\n\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import col, monotonically_increasing_id, rand, row_number\nfrom pyspark.sql.window import Window\n\nfrom marius.tools.preprocess.converters.partitioners.spark_partitioner import SparkPartitioner\nfrom marius.tools.preprocess.converters.readers.spark_readers import SparkDelimitedFileReader\nfrom marius.tools.preprocess.converters.spark_constants import (\n    DST_COL,\n    EDGES_INDEX_COL,\n    INDEX_COL,\n    NODE_LABEL,\n    REL_COL,\n    REL_INDEX_COL,\n    RELATION_LABEL,\n    SPARK_APP_NAME,\n    SRC_COL,\n    TMP_DATA_DIRECTORY,\n)\nfrom marius.tools.preprocess.converters.writers.spark_writer import SparkWriter\n\nSUPPORTED_DELIM_FORMATS = [\"CSV\", \"TSV\", \"TXT\", \"DELIM\", \"DELIMITED\"]\nSUPPORTED_NON_DELIM_FILE_FORMATS = [\"PARQUET\"]\n\n\ndef remap_columns(df, has_rels):\n    columns = [SRC_COL, REL_COL, DST_COL]\n    if not has_rels:\n        columns = [SRC_COL, DST_COL]\n    return df.toDF(*columns)\n\n\ndef get_nodes_df(edges_df):\n    nodes = (\n        edges_df.select(col(SRC_COL).alias(NODE_LABEL))\n        .union(edges_df.select(col(DST_COL).alias(NODE_LABEL)))\n        .distinct()\n        .repartition(1)\n        .orderBy(rand())\n        .cache()\n    )\n    nodes = assign_ids(nodes, INDEX_COL)\n    return nodes\n\n\ndef get_relations_df(edges_df):\n    rels = (\n        edges_df.drop(SRC_COL, DST_COL)\n        .distinct()\n        .repartition(1)\n        .orderBy(rand())\n        .withColumnRenamed(REL_COL, RELATION_LABEL)\n        .cache()\n    )\n    rels = assign_ids(rels, REL_INDEX_COL)\n    return rels\n\n\ndef assign_ids(df, col_id):\n    if df is None:\n        return None\n    return df.withColumn(col_id, row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)\n\n\ndef remap_edges(edges_df, nodes, rels):\n    if rels is not None:\n        remapped_edges_df = (\n            
edges_df.join(nodes.hint(\"merge\"), edges_df.src == nodes.node_label)\n            .drop(NODE_LABEL, SRC_COL)\n            .withColumnRenamed(INDEX_COL, SRC_COL)\n            .join(rels.hint(\"merge\"), edges_df.rel == rels.relation_label)\n            .drop(RELATION_LABEL, REL_COL)\n            .withColumnRenamed(INDEX_COL, REL_COL)\n            .join(nodes.hint(\"merge\"), edges_df.dst == nodes.node_label)\n            .drop(NODE_LABEL, DST_COL)\n            .withColumnRenamed(INDEX_COL, DST_COL)\n        )\n    else:\n        remapped_edges_df = (\n            edges_df.join(nodes.hint(\"merge\"), edges_df.src == nodes.node_label)\n            .drop(NODE_LABEL, SRC_COL)\n            .withColumnRenamed(INDEX_COL, SRC_COL)\n            .join(nodes.hint(\"merge\"), edges_df.dst == nodes.node_label)\n            .drop(NODE_LABEL, DST_COL)\n            .withColumnRenamed(INDEX_COL, DST_COL)\n        )\n\n    return remapped_edges_df\n\n\ndef write_df_to_csv(df, output_filename):\n    df.write.csv(TMP_DATA_DIRECTORY, mode=\"overwrite\", sep=\"\\t\")\n    tmp_file = glob.glob(\"{}/*.csv\".format(TMP_DATA_DIRECTORY))[0]\n    os.system(\"mv {} {}\".format(tmp_file, output_filename))\n    os.system(\"rm -rf {}\".format(TMP_DATA_DIRECTORY))\n\n\nclass SparkEdgeListConverter(object):\n    def __init__(\n        self,\n        output_dir: Path,\n        train_edges: Path,\n        valid_edges: Path = None,\n        test_edges: Path = None,\n        columns: list = [0, 1, 2],\n        header_length: int = 0,\n        format: str = \"csv\",\n        delim: str = \"\\t\",\n        dtype: str = \"int32\",\n        num_partitions: int = 1,\n        splits: list = None,\n        partitioned_evaluation: bool = False,\n        remap_ids: bool = True,\n        spark_driver_memory: str = \"32g\",\n        spark_executor_memory: str = \"4g\",\n    ):\n        self.output_dir = output_dir\n\n        self.spark = (\n            SparkSession.builder.appName(SPARK_APP_NAME)\n            
.config(\"spark.driver.memory\", spark_driver_memory)\n            .config(\"spark.executor.memory\", spark_executor_memory)\n            .config(\"spark.logConf\", False)\n            .getOrCreate()\n        )\n\n        self.spark.sparkContext.setLogLevel(\"OFF\")\n\n        if format.upper() in SUPPORTED_DELIM_FORMATS:\n            self.reader = SparkDelimitedFileReader(\n                self.spark, train_edges, valid_edges, test_edges, columns, header_length, delim, dtype\n            )\n        else:\n            raise RuntimeError(\"Unsupported input format\")\n\n        self.num_partitions = num_partitions\n\n        if self.num_partitions > 1:\n            self.partitioner = SparkPartitioner(self.spark, partitioned_evaluation)\n        else:\n            self.partitioner = None\n\n        self.writer = SparkWriter(self.spark, self.output_dir, partitioned_evaluation)\n\n        self.train_split = None\n        self.valid_split = None\n        self.test_split = None\n\n        if splits is not None:\n            if len(splits) == 2:\n                self.train_split = splits[0]\n                self.test_split = splits[1]\n\n                assert (self.train_split + self.test_split) == 1\n            if len(splits) == 3:\n                self.train_split = splits[0]\n                self.valid_split = splits[1]\n                self.test_split = splits[2]\n\n                assert (self.train_split + self.valid_split + self.test_split) == 1\n\n        self.has_rels = False\n        if len(columns) == 3:\n            self.has_rels = True\n\n    def convert(self):\n        print(\"Reading edges\")\n        all_edges_df, train_edges_df, valid_edges_df, test_edges_df = self.reader.read()\n\n        all_edges_df = remap_columns(all_edges_df, self.has_rels)\n\n        if train_edges_df is not None:\n            train_edges_df = remap_columns(train_edges_df, self.has_rels)\n\n        if valid_edges_df is not None:\n            valid_edges_df = 
remap_columns(valid_edges_df, self.has_rels)\n\n        if test_edges_df is not None:\n            test_edges_df = remap_columns(test_edges_df, self.has_rels)\n\n        print(\"Assigning unique IDs\")\n\n        # get node and relation labels and assign indices\n        nodes_df = get_nodes_df(all_edges_df)\n\n        if self.has_rels:\n            rels_df = get_relations_df(all_edges_df)\n        else:\n            rels_df = None\n\n        print(\"Remapping edges\")\n\n        # replace node and relation labels with indices\n        if train_edges_df is not None:\n            train_edges_df = remap_edges(train_edges_df, nodes_df, rels_df)\n        if valid_edges_df is not None:\n            valid_edges_df = remap_edges(valid_edges_df, nodes_df, rels_df)\n        if test_edges_df is not None:\n            test_edges_df = remap_edges(test_edges_df, nodes_df, rels_df)\n\n        if train_edges_df is None:\n            all_edges_df = remap_edges(all_edges_df, nodes_df, rels_df)\n            if self.test_split is not None:\n                # check if a dataset split is needed\n                if self.valid_split is not None:\n                    print(\n                        \"Splitting into: {}/{}/{} fractions\".format(self.train_split, self.valid_split, self.test_split)\n                    )\n\n                    # split into train/valid/test\n                    train_edges_df, valid_edges_df, test_edges_df = all_edges_df.randomSplit(\n                        [self.train_split, self.valid_split, self.test_split]\n                    )\n                else:\n                    print(\"Splitting into: {}/{} fractions\".format(self.train_split, self.test_split))\n                    # split into train/test\n                    train_edges_df, test_edges_df = all_edges_df.randomSplit([self.train_split, self.test_split])\n            else:\n                train_edges_df = all_edges_df\n        all_edges_df, train_edges_df, valid_edges_df, test_edges_df = (\n     
       assign_ids(all_edges_df, EDGES_INDEX_COL),\n            assign_ids(train_edges_df, EDGES_INDEX_COL),\n            assign_ids(valid_edges_df, EDGES_INDEX_COL),\n            assign_ids(test_edges_df, EDGES_INDEX_COL),\n        )\n\n        if self.partitioner is not None:\n            print(\"Partition nodes into {} partitions\".format(self.num_partitions))\n            train_edges_df, valid_edges_df, test_edges_df = self.partitioner.partition_edges(\n                train_edges_df, valid_edges_df, test_edges_df, nodes_df, self.num_partitions\n            )\n\n        return self.writer.write_to_binary(\n            train_edges_df, valid_edges_df, test_edges_df, nodes_df, rels_df, self.num_partitions\n        )\n"
  },
  {
    "path": "src/python/tools/preprocess/converters/torch_constants.py",
    "content": "from enum import Enum, unique\n\n\n@unique\nclass TorchConverterColumnKeys(Enum):\n    SRC_COL = \"src_column\"\n    DST_COL = \"dst_column\"\n    EDGE_TYPE_COL = \"edge_type_column\"\n    EDGE_WEIGHT_COL = \"edge_weight_column\"\n\n    def __hash__(self) -> int:\n        return hash(self.name)\n"
  },
  {
    "path": "src/python/tools/preprocess/converters/torch_converter.py",
    "content": "import os\nfrom pathlib import Path\n\nimport numpy as np\nimport pandas as pd\n\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.preprocess.converters.partitioners.torch_partitioner import TorchPartitioner\nfrom marius.tools.preprocess.converters.readers.pandas_readers import PandasDelimitedFileReader\nfrom marius.tools.preprocess.converters.torch_constants import TorchConverterColumnKeys as ColNames\nfrom marius.tools.preprocess.converters.writers.torch_writer import TorchWriter\n\nimport torch  # isort:skip\n\nSUPPORTED_DELIM_FORMATS = [\"CSV\", \"TSV\", \"TXT\", \"DELIM\", \"DELIMITED\"]\nSUPPORTED_IN_MEMORY_FORMATS = [\"NUMPY\", \"NP\", \"PYTORCH\", \"TORCH\"]\n\n\ndef dataframe_to_tensor(df):\n    return torch.tensor(df.to_numpy())\n\n\ndef apply_mapping_edges(input_edges, node_mapping_df, rel_mapping_df=None):\n    if isinstance(input_edges, torch.Tensor):\n        assert len(input_edges.shape) == 2\n\n        src = input_edges[:, 0]\n        dst = input_edges[:, -1]\n\n        src = apply_mapping1d(src, node_mapping_df)\n        dst = apply_mapping1d(dst, node_mapping_df)\n\n        stack_tens = []\n        if rel_mapping_df is None:\n            assert input_edges.shape[1] == 2\n            stack_tens = [src, dst]\n        else:\n            assert input_edges.shape[1] == 3\n            rel = input_edges[:, 1]\n            rel = apply_mapping1d(rel, rel_mapping_df)\n            stack_tens = [src, rel, dst]\n\n        return torch.stack(stack_tens, dim=1)\n\n    elif isinstance(input_edges, pd.DataFrame):\n        src = input_edges.iloc[:, 0]\n        dst = input_edges.iloc[:, -1]\n\n        src = apply_mapping1d(src, node_mapping_df)\n        dst = apply_mapping1d(dst, node_mapping_df)\n\n        concat_df = []\n        if rel_mapping_df is None:\n            assert input_edges.shape[1] == 2\n\n            concat_df = [src, dst]\n        else:\n            assert input_edges.shape[1] == 3\n            rel = 
input_edges[:, 1]\n            rel = apply_mapping1d(rel, rel_mapping_df)\n\n            concat_df = [src, rel, dst]\n\n        return pd.concat(concat_df, axis=1)\n    else:\n        raise RuntimeError(\"Unsupported datatype for input. Must be a pandas.DataFrame or a 2D torch.Tensor\")\n\n\ndef apply_mapping1d(input_ids, mapping_df):\n    if isinstance(input_ids, torch.Tensor):\n        assert len(input_ids.shape) == 1\n        mapping = dataframe_to_tensor(mapping_df)\n        return mapping[:, 1][input_ids]\n    elif isinstance(input_ids, pd.Series):\n        return input_ids.map(mapping_df.iloc[:, 1])\n    else:\n        raise RuntimeError(\"Unsupported datatype for input. Must be a pandas.Series or a 1D torch.Tensor\")\n\n\ndef extract_tensors_from_df(df, column_mappings):\n    if df is None:\n        return None, None\n\n    edge_weight_tensor = None\n    edge_weight_column_num = column_mappings[ColNames.EDGE_WEIGHT_COL]\n    edge_weight_column_name = ColNames.EDGE_WEIGHT_COL.value\n\n    if edge_weight_column_num is not None:\n        assert edge_weight_column_name in list(df.columns)\n        edge_weight_tensor = torch.tensor(df[edge_weight_column_name].values)\n        df = df.drop(columns=[edge_weight_column_name])\n\n    edges_tensor = dataframe_to_tensor(df)\n    return edges_tensor, edge_weight_tensor\n\n\ndef map_edge_list_dfs(\n    edge_lists: list,\n    known_node_ids=None,\n    sequential_train_nodes=False,\n    sequential_deg_nodes=0,\n    column_mappings: dict = {},\n):\n    if sequential_train_nodes or sequential_deg_nodes > 0:\n        raise RuntimeError(\"sequential_train_nodes not yet supported for map_edge_list_dfs\")\n\n    # Combine all the non null dfs\n    combined_dfs = []\n    has_rels = column_mappings[ColNames.EDGE_TYPE_COL] is not None\n    for edge_df in edge_lists:\n        if edge_df is not None:\n            # Convert all columns to str\n            edge_df[ColNames.SRC_COL.value] = edge_df[ColNames.SRC_COL.value].astype(str)\n  
          edge_df[ColNames.DST_COL.value] = edge_df[ColNames.DST_COL.value].astype(str)\n            if has_rels:\n                edge_df[ColNames.EDGE_TYPE_COL.value] = edge_df[ColNames.EDGE_TYPE_COL.value].astype(str)\n            combined_dfs.append(edge_df)\n\n    # Get the unique nodes\n    all_edges_df = pd.concat(combined_dfs)\n    unique_src = all_edges_df[ColNames.SRC_COL.value].unique().astype(str)\n    unique_dst = all_edges_df[ColNames.DST_COL.value].unique().astype(str)\n\n    unique_list = [unique_src, unique_dst]\n    if known_node_ids is not None:\n        for n in known_node_ids:\n            unique_list.append(n.numpy().astype(str))\n\n    unique_nodes = np.unique(np.concatenate(unique_list, axis=None))\n    num_nodes = unique_nodes.shape[0]\n    mapped_node_ids = np.random.permutation(num_nodes)\n    nodes_dict = dict(zip(list(unique_nodes), list(mapped_node_ids)))\n\n    unique_rels = torch.empty([0])\n    mapped_rel_ids = torch.empty([0])\n    rels_dict = None\n\n    if has_rels:\n        unique_rels = all_edges_df[ColNames.EDGE_TYPE_COL.value].unique()\n        num_rels = unique_rels.shape[0]\n        mapped_rel_ids = np.random.permutation(num_rels)\n        rels_dict = dict(zip(list(unique_rels), list(mapped_rel_ids)))\n\n    output_edge_lists, output_edge_weights = [], []\n    for edge_list in edge_lists:\n        if edge_list is None:\n            output_edge_lists.append(None)\n            output_edge_weights.append(None)\n            continue\n\n        # Map the src and dst values\n        edge_list[ColNames.SRC_COL.value] = edge_list[ColNames.SRC_COL.value].map(nodes_dict)\n        assert edge_list[ColNames.SRC_COL.value].isna().sum() == 0\n\n        edge_list[ColNames.DST_COL.value] = edge_list[ColNames.DST_COL.value].map(nodes_dict)\n        assert edge_list[ColNames.DST_COL.value].isna().sum() == 0\n\n        if has_rels:\n            edge_list[ColNames.EDGE_TYPE_COL.value] = edge_list[ColNames.EDGE_TYPE_COL.value].map(rels_dict)\n  
          assert edge_list[ColNames.EDGE_TYPE_COL.value].isna().sum() == 0\n\n        edge_tensor, edge_weights = extract_tensors_from_df(edge_list, column_mappings)\n        output_edge_lists.append(edge_tensor)\n        output_edge_weights.append(edge_weights)\n\n    node_mapping = np.stack([unique_nodes, mapped_node_ids], axis=1)\n    rel_mapping = None\n    if has_rels:\n        rel_mapping = np.stack([unique_rels, mapped_rel_ids], axis=1)\n\n    return output_edge_lists, node_mapping, rel_mapping, output_edge_weights\n\n\ndef extract_tensor_from_tens(edges_tensor, column_mappings):\n    if edges_tensor is None:\n        return None, None\n\n    edge_weights_column = column_mappings[ColNames.EDGE_WEIGHT_COL]\n    cols_to_keep = [column_mappings[ColNames.SRC_COL], column_mappings[ColNames.DST_COL]]\n    if column_mappings[ColNames.EDGE_TYPE_COL] is not None:\n        cols_to_keep.insert(len(cols_to_keep) - 1, column_mappings[ColNames.EDGE_TYPE_COL])\n\n    converted_tensor = edges_tensor[:, cols_to_keep]\n    converted_weights = None\n    if edge_weights_column is not None:\n        converted_weights = edges_tensor[:, edge_weights_column]\n\n    return converted_tensor, converted_weights\n\n\ndef map_edge_lists(\n    edge_lists: list,\n    perform_unique=True,\n    known_node_ids=None,\n    sequential_train_nodes=False,\n    sequential_deg_nodes=0,\n    column_mappings: dict = {},\n):\n    print(\"Remapping node ids\")\n\n    # Ensure that we extract the edge weights as well that edge_lists are in [src, dst] or in [src, type, dst] order\n    edge_weights_list = [None] * len(edge_lists)\n    has_rels = column_mappings[ColNames.EDGE_TYPE_COL] is not None\n    all_edges = []\n    if isinstance(edge_lists[0], pd.DataFrame):\n        first_df = edge_lists[0]\n        if any(col_dtype != np.number for col_dtype in first_df.dtypes):\n            # need to take uniques using pandas for string datatypes, since torch doesn't support strings\n            return 
map_edge_list_dfs(\n                edge_lists,\n                known_node_ids,\n                sequential_train_nodes,\n                sequential_deg_nodes,\n                column_mappings=column_mappings,\n            )\n\n        for idx in range(len(edge_lists)):\n            edge_tensors, edge_weights = extract_tensors_from_df(edge_lists[idx], column_mappings)\n            edge_lists[idx] = edge_tensors\n            edge_weights_list[idx] = edge_weights\n            if edge_tensors is not None:\n                all_edges.append(edge_tensors)\n    else:\n        # Determine the order of tensors to keep\n        for idx in range(len(edge_lists)):\n            curr_edges = edge_lists[idx]\n            if curr_edges is None:\n                continue\n\n            converted_edges, converted_weights = extract_tensor_from_tens(curr_edges, column_mappings)\n            edge_lists[idx] = converted_edges\n            all_edges.append(converted_edges)\n            edge_weights_list[idx] = converted_weights\n\n    all_edges = torch.cat(all_edges)\n    num_rels = 1\n    unique_rels = torch.empty([0])\n    mapped_rel_ids = torch.empty([0])\n    output_dtype = torch.int32\n\n    if perform_unique:\n        unique_src = torch.unique(all_edges[:, 0])\n        unique_dst = torch.unique(all_edges[:, -1])\n        if known_node_ids is None:\n            unique_nodes = torch.unique(torch.cat([unique_src, unique_dst]), sorted=True)\n        else:\n            unique_nodes = torch.unique(torch.cat([unique_src, unique_dst] + known_node_ids), sorted=True)\n\n        num_nodes = unique_nodes.size(0)\n        if has_rels:\n            unique_rels = torch.unique(all_edges[:, 1], sorted=True)\n            num_rels = unique_rels.size(0)\n\n    else:\n        num_nodes = torch.max(all_edges[:, 0])[0]\n        unique_nodes = torch.arange(num_nodes).to(output_dtype)\n\n        if has_rels:\n            num_rels = torch.max(all_edges[:, 1])[0]\n            unique_rels = 
torch.arange(num_rels).to(output_dtype)\n\n    if has_rels:\n        min_rel_val = unique_rels[0].to(torch.int64)\n\n    if sequential_train_nodes or sequential_deg_nodes > 0:\n        print(\"inside sequential mode because\", sequential_train_nodes, sequential_deg_nodes)\n        seq_nodes = None\n\n        if sequential_train_nodes and sequential_deg_nodes <= 0:\n            print(\"Sequential Train Nodes\")\n            seq_nodes = known_node_ids[0]\n        else:\n            out_degrees = torch.zeros(\n                [\n                    num_nodes,\n                ],\n                dtype=torch.int32,\n            )\n            out_degrees = torch.scatter_add(\n                out_degrees,\n                0,\n                torch.squeeze(edge_lists[0][:, 0]).to(torch.int64),\n                torch.ones(\n                    [\n                        edge_lists[0].shape[0],\n                    ],\n                    dtype=torch.int32,\n                ),\n            )\n\n            in_degrees = torch.zeros(\n                [\n                    num_nodes,\n                ],\n                dtype=torch.int32,\n            )\n            in_degrees = torch.scatter_add(\n                in_degrees,\n                0,\n                torch.squeeze(edge_lists[0][:, -1]).to(torch.int64),\n                torch.ones(\n                    [\n                        edge_lists[0].shape[0],\n                    ],\n                    dtype=torch.int32,\n                ),\n            )\n\n            degrees = in_degrees + out_degrees\n\n            deg_argsort = torch.argsort(degrees, dim=0, descending=True)\n            high_degree_nodes = deg_argsort[:sequential_deg_nodes]\n\n            print(\"High Deg Nodes Degree Sum:\", torch.sum(degrees[high_degree_nodes]).numpy())\n\n            if sequential_train_nodes and sequential_deg_nodes > 0:\n                print(\"Sequential Train and High Deg Nodes\")\n                seq_nodes = 
torch.unique(torch.cat([high_degree_nodes, known_node_ids[0]]))\n                seq_nodes = seq_nodes.index_select(0, torch.randperm(seq_nodes.size(0), dtype=torch.int64))\n                print(\"Total Seq Nodes: \", seq_nodes.shape[0])\n            else:\n                print(\"Sequential High Deg Nodes\")\n                seq_nodes = high_degree_nodes\n\n        seq_mask = torch.zeros(num_nodes, dtype=torch.bool)\n        seq_mask[seq_nodes.to(torch.int64)] = True\n        all_other_nodes = torch.arange(num_nodes, dtype=seq_nodes.dtype)\n        all_other_nodes = all_other_nodes[~seq_mask]\n\n        mapped_node_ids = -1 * torch.ones(num_nodes, dtype=output_dtype)\n        mapped_node_ids[seq_nodes.to(torch.int64)] = torch.arange(seq_nodes.shape[0], dtype=output_dtype)\n        mapped_node_ids[all_other_nodes.to(torch.int64)] = seq_nodes.shape[0] + torch.randperm(\n            num_nodes - seq_nodes.shape[0], dtype=output_dtype\n        )\n    else:\n        mapped_node_ids = torch.randperm(num_nodes, dtype=output_dtype)\n\n    if has_rels:\n        mapped_rel_ids = torch.randperm(num_rels, dtype=output_dtype)\n\n    # TODO may use too much memory if the max id is very large\n    # Needed to support indexing w/ the remap\n    if torch.max(unique_nodes) + 1 > num_nodes:\n        extended_map = torch.zeros(torch.max(unique_nodes) + 1, dtype=output_dtype)\n        extended_map[unique_nodes] = mapped_node_ids\n    else:\n        extended_map = mapped_node_ids\n\n    all_edges = None  # can safely free this tensor\n\n    output_edge_lists = []\n    for idx, edge_list in enumerate(edge_lists):\n        if edge_list is None:\n            output_edge_lists.append(None)\n            continue\n\n        new_src = extended_map[edge_list[:, 0].to(torch.int64)]\n        new_dst = extended_map[edge_list[:, -1].to(torch.int64)]\n        curr_row = [new_src, new_dst]\n\n        if has_rels:\n            new_rel = mapped_rel_ids[edge_list[:, 1].to(torch.int64) - min_rel_val]\n  
          curr_row.insert(len(curr_row) - 1, new_rel)\n        output_edge_lists.append(torch.stack(curr_row, dim=1))\n\n    node_mapping = np.stack([unique_nodes.numpy(), mapped_node_ids.numpy()], axis=1)\n    rel_mapping = None\n    if has_rels:\n        rel_mapping = np.stack([unique_rels.numpy(), mapped_rel_ids.numpy()], axis=1)\n\n    return output_edge_lists, node_mapping, rel_mapping, edge_weights_list\n\n\ndef split_edges(edges, edges_weights, splits):\n    train_edges_tens, train_edges_weights = None, None\n    valid_edges_tens, valid_edges_weights = None, None\n    test_edges_tens, test_edges_weights = None, None\n\n    total_split_edges = int(sum(splits) * edges.shape[0])\n    num_total_edges = edges.shape[0]\n    rand_perm = torch.randperm(num_total_edges)\n\n    if len(splits) == 3:\n        train_split = splits[0]\n        valid_split = splits[1]\n        test_split = splits[2]\n        print(\"Splitting into: {}/{}/{} fractions\".format(train_split, valid_split, test_split))\n\n        num_train = int(num_total_edges * train_split)\n        num_valid = int(num_total_edges * valid_split)\n\n        train_edges_tens = edges[rand_perm[:num_train]]\n        valid_edges_tens = edges[rand_perm[num_train : num_train + num_valid]]\n        test_edges_tens = edges[rand_perm[num_train + num_valid : total_split_edges]]\n\n        if edges_weights is not None:\n            train_edges_weights = edges_weights[rand_perm[:num_train]]\n            valid_edges_weights = edges_weights[rand_perm[num_train : num_train + num_valid]]\n            test_edges_weights = edges_weights[rand_perm[num_train + num_valid : total_split_edges]]\n\n    elif len(splits) == 2:\n        train_split = splits[0]\n        test_split = splits[1]\n        print(\"Splitting into: {}/{} fractions\".format(train_split, test_split))\n\n        num_train = int(num_total_edges * train_split)\n\n        train_edges_tens = edges[rand_perm[:num_train]]\n        test_edges_tens = 
edges[rand_perm[num_train:total_split_edges]]\n\n        if edges_weights is not None:\n            train_edges_weights = edges_weights[rand_perm[:num_train]]\n            test_edges_weights = edges_weights[rand_perm[num_train:total_split_edges]]\n\n    else:\n        raise RuntimeError(\"Splits must be length 2 or 3\")\n\n    return (\n        train_edges_tens,\n        train_edges_weights,\n        valid_edges_tens,\n        valid_edges_weights,\n        test_edges_tens,\n        test_edges_weights,\n    )\n\n\nclass TorchEdgeListConverter(object):\n    def __init__(\n        self,\n        output_dir: Path,\n        train_edges: Path,\n        valid_edges: Path = None,\n        test_edges: Path = None,\n        splits: list = None,\n        format: str = \"csv\",\n        header_length: int = 0,\n        delim: str = \"\\t\",\n        dtype: str = \"int32\",\n        num_partitions: int = 1,\n        partitioned_evaluation: bool = False,\n        src_column: int = None,\n        dst_column: int = None,\n        edge_type_column: int = None,\n        edge_weight_column: int = None,\n        remap_ids: bool = True,\n        sequential_train_nodes: bool = False,\n        sequential_deg_nodes: int = 0,\n        num_nodes: int = None,\n        num_rels: int = None,\n        known_node_ids: list = None,\n    ):\n        \"\"\"\n        This converter is used to preprocess input edge lists which fit in memory. Pandas, numpy and pytorch are used to convert input edge lists that are\n        stored as delimited files, numpy arrays, or pytorch tensors into the input format required by Marius.\n\n        Steps of conversion process:\n        1. Read in input dataset and convert to a pytorch tensor\n        2. Remap node and relation ids to randomly assigned integer ids (optional). Write mappings to the output directory.\n        3. Perform data set splitting into train/valid/test sets (optional)\n        4. 
Reorder/partition edge list(s) according to their edge buckets (optional)\n        5. Write contents of the edge list(s) tensors to a file in the specified output directory\n\n        Output format:\n        The output format is as follows\n\n        <output_dir>/\n            edges/\n                train_edges.bin                                 Binary file of size num_train * 2 * sizeof(dtype) or num_train * 3 * sizeof(dtype)\n                train_partition_offsets.txt     (optional)      List of training edge bucket sizes in sequential order (0, 0), (0, 1) ... (1, 0), ... (n-1, n-1)\n                valid_edges.bin                 (optional)      Binary file of size num_valid * 2 * sizeof(dtype) or num_valid * 3 * sizeof(dtype) or num_valid * 3 * sizeof(dtype)\n                                                                The ordering of the data is as follows based on dataset breakdown:\n                                                                    Both edge weights and edge types present: [src, type, weight, dst]\n                                                                    Neither edge weight nor edge type present: [src, dst]\n                                                                    Only edge weight present: [src, weight, dst]\n                                                                    Only edge type present: [src, type, dst]\n                valid_partition_offsets.txt     (optional)      List of validation edge bucket sizes in sequential order (0, 0), (0, 1) ... (1, 0), ... (n-1, n-1)\n                test_edges.bin                  (optional)      Binary file of size num_test * 2 * sizeof(dtype) or num_test * 3 * sizeof(dtype)\n                test_partition_offsets.txt      (optional)      List of test edge bucket sizes in sequential order (0, 0), (0, 1) ... (1, 0), ... 
(n-1, n-1)\n                relation_mapping.txt            (optional)      Two column CSV containing a mapping of raw relation/edge-type ids (1st column) to randomly assigned integer ids (2nd column).\n            nodes/\n                node_mapping.txt                (optional)      Two column CSV containing a mapping of raw node ids (1st column) to randomly assigned integer ids (2nd column).\n            dataset.yaml                                        Output dataset statistics in YAML format.\n\n        :param output_dir:   (required)         Directory which will contain the preprocessed dataset\n        :param train_edges:  (required)         Raw input training edges, can be a delimited file, a numpy array, or pytorch tensor\n        :param valid_edges:                     Raw input validation edges, can be a delimited file, a numpy array, or pytorch tensor (optional)\n        :param test_edges:                      Raw input test edges, can be a delimited file, a numpy array, or pytorch tensor (optional)\n        :param splits:                          Train/valid/test split to use for the input\n        :param format:                          Format of the input dataset, can be a delimited file (CSV, TSV, TXT) or a numpy array or a pytorch tensor.\n        :param src_column:                      The column storing the src nodes.\n        :param dst_column:                      The column storing the dst nodes.\n        :param edge_type_column:                The column storing the edge type.\n        :param edge_weight_column:              The column storing the edge weights.\n        :param header_length:                   Length of the header for input delimited files\n        :param delim:                           Delimiter of the input delimited files\n        :param dtype:                           Datatype of the node ids in the output preprocessed datasets. 
Unless you have over 2 billion nodes, this should\n                                                stay as int32.\n        :param num_partitions:                  Number of node partitions which will be used to train the model with the partition buffer. Setting this will\n                                                reorder the output edge list(s) according to the num_partitions^2 edge buckets in a sequential order.\n                                                E.g. edge bucket (0,0) will be first, then (0, 1), (0, 2) ... (0, n-1), (1, 0) .... (1, n-1), ... (n-1, 0) ... (n-1, n-1).\n                                                The sizes of the edge buckets are stored in <output_dir>/edges/<type>_partition_offsets.txt\n        :param partitioned_evaluation:          If true, the edge buckets for the validation and test sets will be computed and the edge lists will be reordered.\n        :param remap_ids:                       If true, then the raw entity ids of the input edge lists will be remapped to random integer ids. The mapping of\n                                                the node ids is stored as a two column CSV in <output_dir>/nodes/node_mapping.txt\n        :param sequential_train_nodes:          If true, the train nodes will be given ids 0 to num train nodes. Applicable to node classification datasets. If set,\n                                                remap_ids must also be set.\n        :param sequential_deg_nodes:            If greater than zero, this number of the highest degree nodes based on the train edges will be given ids 0 to this number. If\n                                                greater than zero, remap_ids must also be set. 
Can be mixed with sequential_train_nodes, in which case train and high deg nodes\n                                                are given ids starting from 0.\n        :param num_nodes:                       Number of nodes in the dataset, this is required when remap_ids is set to false.\n        :param num_rels:                        Number of relations (edge types) in the dataset, this is required when remap_ids is set to false and the dataset has edge_types\n        :param known_node_ids:                  List of node id arrays or tensors which contain known node ids for the dataset. Used for generating node id mappings\n                                                when some nodes may not be present in the edge list.\n        \"\"\"  # noqa: E501\n\n        # Read in the src and dst column\n        if src_column is None:\n            raise ValueError(\"Src column must be specified with a non None value\")\n\n        if dst_column is None:\n            raise ValueError(\"Dst column must be specified with a non None value\")\n\n        # Save these variables\n        self.output_dir = output_dir\n        self.num_nodes = num_nodes\n        self.num_rels = num_rels\n        self.column_mappings = {\n            ColNames.SRC_COL: src_column,\n            ColNames.DST_COL: dst_column,\n            ColNames.EDGE_TYPE_COL: edge_type_column,\n            ColNames.EDGE_WEIGHT_COL: edge_weight_column,\n        }\n\n        if format.upper() in SUPPORTED_DELIM_FORMATS:\n            assert isinstance(train_edges, str) or isinstance(train_edges, Path)\n            self.reader = PandasDelimitedFileReader(\n                train_edges, valid_edges, test_edges, self.column_mappings, header_length, delim\n            )\n\n        elif format.upper() in SUPPORTED_IN_MEMORY_FORMATS:\n            self.reader = None\n            if format.upper() == \"NUMPY\" or format.upper() == \"NP\":\n                assert isinstance(train_edges, np.ndarray)\n                self.train_edges_tens = 
torch.from_numpy(train_edges)\n                self.valid_edges_tens = None\n                self.test_edges_tens = None\n\n                if valid_edges is not None:\n                    assert isinstance(valid_edges, np.ndarray)\n                    self.valid_edges_tens = torch.from_numpy(valid_edges)\n\n                if test_edges is not None:\n                    assert isinstance(test_edges, np.ndarray)\n                    self.test_edges_tens = torch.from_numpy(test_edges)\n\n            elif format.upper() == \"PYTORCH\" or format.upper() == \"TORCH\":\n                assert isinstance(train_edges, torch.Tensor)\n                self.train_edges_tens = train_edges\n                self.valid_edges_tens = valid_edges\n                self.test_edges_tens = test_edges\n\n                if valid_edges is not None:\n                    assert isinstance(valid_edges, torch.Tensor)\n\n                if test_edges is not None:\n                    assert isinstance(test_edges, torch.Tensor)\n        else:\n            raise RuntimeError(\"Unsupported input format\")\n        self.num_partitions = num_partitions\n\n        if self.num_partitions > 1:\n            self.partitioner = TorchPartitioner(partitioned_evaluation)\n        else:\n            self.partitioner = None\n\n        self.writer = TorchWriter(self.output_dir, partitioned_evaluation)\n        self.splits = splits\n\n        # Determine if this has edge types\n        self.has_rels = self.column_mappings[ColNames.EDGE_TYPE_COL] is not None\n        if dtype.upper() == \"INT32\" or dtype.upper() == \"INT\":\n            self.dtype = torch.int32\n            self.weight_dtype = torch.float32\n        elif dtype.upper() == \"INT64\" or dtype.upper() == \"LONG\":\n            self.dtype = torch.int64\n            self.weight_dtype = torch.float64\n        else:\n            raise RuntimeError(\"Unrecognized datatype\")\n\n        self.remap_ids = remap_ids\n\n        if self.num_nodes is None and 
not self.remap_ids:\n            raise RuntimeError(\n                \"Must specify num_nodes and num_rels (if applicable) to the converter when remap_ids=False\"\n            )\n\n        if self.num_rels is None and not self.remap_ids and self.has_rels:\n            raise RuntimeError(\n                \"Must specify num_nodes and num_rels (if applicable) to the converter when remap_ids=False\"\n            )\n\n        self.sequential_train_nodes = sequential_train_nodes\n\n        if self.sequential_train_nodes is True and self.remap_ids is False:\n            raise RuntimeError(\"remap_ids must be true when sequential_train_nodes is true\")\n\n        self.sequential_deg_nodes = sequential_deg_nodes\n\n        if self.sequential_deg_nodes > 0 and self.remap_ids is False:\n            raise RuntimeError(\"remap_ids must be true when sequential_deg_nodes is greater than zero\")\n\n        if known_node_ids is not None:\n            self.known_node_ids = []\n            for node_id in known_node_ids:\n                if node_id is not None:\n                    if isinstance(node_id, np.ndarray):\n                        node_id = torch.from_numpy(node_id)\n\n                    assert isinstance(node_id, torch.Tensor)\n                    self.known_node_ids.append(node_id)\n        else:\n            self.known_node_ids = None\n\n    # flake8: noqa: C901\n    def convert(self):\n        train_edges_tens, train_edge_weights = None, None\n        valid_edges_tens, valid_edge_weights = None, None\n        test_edges_tens, test_edge_weights = None, None\n\n        os.makedirs(self.output_dir / Path(\"nodes\"), exist_ok=True)\n        os.makedirs(self.output_dir / Path(\"edges\"), exist_ok=True)\n\n        if self.reader is not None:\n            print(\"Reading edges\")\n            train_edges_df, valid_edges_df, test_edges_df = self.reader.read()\n\n            if self.remap_ids:\n                all_edge_lists, node_mapping, rel_mapping, all_edge_weights = 
map_edge_lists(\n                    [train_edges_df, valid_edges_df, test_edges_df],\n                    known_node_ids=self.known_node_ids,\n                    sequential_train_nodes=self.sequential_train_nodes,\n                    sequential_deg_nodes=self.sequential_deg_nodes,\n                    column_mappings=self.column_mappings,\n                )\n\n                self.num_nodes = node_mapping.shape[0]\n\n                if rel_mapping is None:\n                    self.num_rels = 1\n                else:\n                    self.num_rels = rel_mapping.shape[0]\n\n                train_edges_tens = all_edge_lists[0]\n                if len(all_edge_lists) == 2:\n                    test_edges_tens = all_edge_lists[1]\n                elif len(all_edge_lists) == 3:\n                    valid_edges_tens = all_edge_lists[1]\n                    test_edges_tens = all_edge_lists[2]\n\n                train_edge_weights = all_edge_weights[0]\n                valid_edge_weights = all_edge_weights[1]\n                test_edge_weights = all_edge_weights[2]\n\n                print(\n                    \"Node mapping written to: {}\".format(\n                        (self.output_dir / Path(PathConstants.node_mapping_path)).__str__()\n                    )\n                )\n                np.savetxt(\n                    (self.output_dir / Path(PathConstants.node_mapping_path)).__str__(),\n                    node_mapping,\n                    fmt=\"%s\",\n                    delimiter=\",\",\n                )\n\n                if self.num_rels > 1:\n                    print(\n                        \"Relation mapping written to: {}\".format(\n                            (self.output_dir / Path(PathConstants.relation_mapping_path)).__str__()\n                        )\n                    )\n                    np.savetxt(\n                        (self.output_dir / Path(PathConstants.relation_mapping_path)).__str__(),\n                        
rel_mapping,\n                        fmt=\"%s\",\n                        delimiter=\",\",\n                    )\n            else:\n                # Determine which columns to keep\n                print(\"Not remapping node ids\")\n\n                # Extract all the tensors and weights\n                train_edges_tens, train_edge_weights = extract_tensors_from_df(train_edges_df, self.column_mappings)\n                valid_edges_tens, valid_edge_weights = extract_tensors_from_df(valid_edges_df, self.column_mappings)\n                test_edges_tens, test_edge_weights = extract_tensors_from_df(test_edges_df, self.column_mappings)\n\n        else:\n            print(\"Using in memory data\")\n            train_edges_tens = self.train_edges_tens\n            valid_edges_tens = self.valid_edges_tens\n            test_edges_tens = self.test_edges_tens\n\n            if self.remap_ids:\n                all_edges_list, node_mapping, rel_mapping, all_edge_weights = map_edge_lists(\n                    [train_edges_tens, valid_edges_tens, test_edges_tens],\n                    known_node_ids=self.known_node_ids,\n                    sequential_train_nodes=self.sequential_train_nodes,\n                    sequential_deg_nodes=self.sequential_deg_nodes,\n                    column_mappings=self.column_mappings,\n                )\n\n                self.num_nodes = node_mapping.shape[0]\n                if rel_mapping is None:\n                    self.num_rels = 1\n                else:\n                    self.num_rels = rel_mapping.shape[0]\n\n                train_edges_tens = all_edges_list[0]\n                if len(all_edges_list) == 2:\n                    test_edges_tens = all_edges_list[1]\n                elif len(all_edges_list) == 3:\n                    valid_edges_tens = all_edges_list[1]\n                    test_edges_tens = all_edges_list[2]\n\n                train_edge_weights = all_edge_weights[0]\n                valid_edge_weights = 
all_edge_weights[1]\n                test_edge_weights = all_edge_weights[2]\n\n                print(\n                    \"Node mapping written to: {}\".format(\n                        (self.output_dir / Path(PathConstants.node_mapping_path)).__str__()\n                    )\n                )\n                np.savetxt(\n                    (self.output_dir / Path(PathConstants.node_mapping_path)).__str__(),\n                    node_mapping,\n                    fmt=\"%s\",\n                    delimiter=\",\",\n                )\n\n                if self.num_rels > 1:\n                    print(\n                        \"Relation mapping written to: {}\".format(\n                            (self.output_dir / Path(PathConstants.relation_mapping_path)).__str__()\n                        )\n                    )\n                    np.savetxt(\n                        (self.output_dir / Path(PathConstants.relation_mapping_path)).__str__(),\n                        rel_mapping,\n                        fmt=\"%s\",\n                        delimiter=\",\",\n                    )\n\n            else:\n                train_edges_tens, train_edge_weights = extract_tensor_from_tens(train_edges_tens, self.column_mappings)\n                test_edges_tens, test_edge_weights = extract_tensor_from_tens(test_edges_tens, self.column_mappings)\n                valid_edges_tens, valid_edge_weights = extract_tensor_from_tens(valid_edges_tens, self.column_mappings)\n\n        # Split the edges\n        if self.splits is not None:\n            (\n                train_edges_tens,\n                train_edge_weights,\n                valid_edges_tens,\n                valid_edge_weights,\n                test_edges_tens,\n                test_edge_weights,\n            ) = split_edges(train_edges_tens, train_edge_weights, self.splits)\n\n        # Cast to the correct dtype\n        def perform_cast(edge_tensor, weights_tensor, edge_dtype, weights_dtype):\n            if 
edge_tensor is None:\n                return edge_tensor, weights_tensor\n\n            if weights_tensor is not None:\n                weights_tensor = weights_tensor.to(weights_dtype)\n            return edge_tensor.to(edge_dtype), weights_tensor\n\n        train_edges_tens, train_edge_weights = perform_cast(\n            train_edges_tens, train_edge_weights, self.dtype, self.weight_dtype\n        )\n        valid_edges_tens, valid_edge_weights = perform_cast(\n            valid_edges_tens, valid_edge_weights, self.dtype, self.weight_dtype\n        )\n        test_edges_tens, test_edge_weights = perform_cast(\n            test_edges_tens, test_edge_weights, self.dtype, self.weight_dtype\n        )\n\n        # Resolve all the null counts\n        if self.num_nodes is None:\n            combined_nodes = [train_edges_tens[:, [0, -1]]]\n            if test_edges_tens is not None:\n                combined_nodes.append(test_edges_tens[:, [0, -1]])\n            if valid_edges_tens is not None:\n                combined_nodes.append(valid_edges_tens[:, [0, -1]])\n\n            combined_tensor = torch.unique(combined_nodes, sorted=False)\n            self.num_nodes = torch.numel(combined_tensor)\n\n        if self.num_rels is None:\n            self.num_rels = 1\n\n        all_edge_weights = [train_edge_weights, valid_edge_weights, test_edge_weights]\n        if self.partitioner is not None:\n            print(\"Partition nodes into {} partitions\".format(self.num_partitions))\n            (\n                train_edges_tens,\n                train_edges_offsets,\n                valid_edges_tens,\n                valid_edges_offsets,\n                test_edges_tens,\n                test_edges_offsets,\n                all_edge_weights,\n            ) = self.partitioner.partition_edges(\n                train_edges_tens,\n                valid_edges_tens,\n                test_edges_tens,\n                self.num_nodes,\n                self.num_partitions,\n         
       edge_weights=all_edge_weights,\n            )\n\n            return self.writer.write_to_binary(\n                train_edges_tens,\n                valid_edges_tens,\n                test_edges_tens,\n                self.num_nodes,\n                self.num_rels,\n                self.num_partitions,\n                train_edges_offsets,\n                valid_edges_offsets,\n                test_edges_offsets,\n                edge_weights=all_edge_weights,\n            )\n        else:\n            return self.writer.write_to_binary(\n                train_edges_tens,\n                valid_edges_tens,\n                test_edges_tens,\n                self.num_nodes,\n                self.num_rels,\n                self.num_partitions,\n                edge_weights=all_edge_weights,\n            )\n"
  },
  {
    "path": "src/python/tools/preprocess/converters/writers/__init__.py",
    "content": ""
  },
  {
    "path": "src/python/tools/preprocess/converters/writers/spark_writer.py",
    "content": "import glob\nimport os\nimport re\nimport sys\nfrom pathlib import Path\nfrom random import randint\n\nimport numpy as np\nimport pandas as pd\nfrom omegaconf import OmegaConf\n\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.configuration.marius_config import DatasetConfig\nfrom marius.tools.preprocess.converters.spark_constants import (\n    DST_EDGE_BUCKET_COL,\n    EDGES_INDEX_COL,\n    INDEX_COL,\n    REL_INDEX_COL,\n    SRC_EDGE_BUCKET_COL,\n    TMP_DATA_DIRECTORY,\n)\nfrom marius.tools.preprocess.utils import get_df_count\n\n\n# TODO can this be made faster? Pandas is pretty slow and not parallel\ndef convert_to_binary(input_filename, output_filename):\n    assert input_filename != output_filename\n    with open(output_filename, \"wb\") as output_file:\n        for chunk in pd.read_csv(input_filename, header=None, chunksize=10**8, sep=\"\\t\", dtype=int):\n            chunk_array = chunk.to_numpy(dtype=np.int32)\n            output_file.write(bytes(chunk_array))\n\n    os.system(\"rm {}\".format(input_filename))\n\n\n# TODO we can make this faster by using the cat bash command to combine these files super fast\ndef merge_csvs(input_directory, output_file):\n    all_csvs = []\n    for filename in glob.iglob(input_directory + \"/**/*.csv\", recursive=True):\n        all_csvs.append(filename)\n\n    print(\"Merging CSVs from {} to {}\".format(input_directory, output_file))\n    os.system(\"rm -rf {}\".format(output_file))\n    for source_file in all_csvs:\n        os.system(\"cat {} >> {}\".format(source_file, output_file))\n\n    os.system(\"rm -rf {}\".format(input_directory))\n\n\ndef write_df_to_csv(df, output_filename):\n    tmp_dir = TMP_DATA_DIRECTORY + str(randint(0, sys.maxsize))\n    df.write.csv(tmp_dir, mode=\"overwrite\", sep=\"\\t\")\n    merge_csvs(tmp_dir, output_filename)\n\n\ndef write_partitioned_df_to_csv(partition_triples, num_partitions, output_filename):\n    bucket_counts = 
partition_triples.groupBy([SRC_EDGE_BUCKET_COL, DST_EDGE_BUCKET_COL]).count()\n\n    print(partition_triples.rdd.getNumPartitions())\n\n    # for edges, the order needs to be maintained. all edges that belong to bucket [i, j]\n    # should appear before [i, j+1] and that of [i, j+1] should appear before [i+1, j].\n    # repartitionByRange makes sure that all edges belonging to src bucket i, fall in the\n    # same partition. Also, this function will output at most `num_partitions` partitions.\n    partition_triples.repartitionByRange(num_partitions, SRC_EDGE_BUCKET_COL).sortWithinPartitions(\n        SRC_EDGE_BUCKET_COL, DST_EDGE_BUCKET_COL\n    ).drop(DST_EDGE_BUCKET_COL, SRC_EDGE_BUCKET_COL).write.csv(\n        TMP_DATA_DIRECTORY + \"_edges\", mode=\"overwrite\", sep=\"\\t\"\n    )\n\n    # for partition offset counts, the ordering of dst_buckets does not matter since we\n    # read the value before setting the offset in line number 92.\n    # dst_buckets = counts.iloc[:, 0].values.\n    # we make use of partitionBy to parallelize writes.\n    bucket_counts.write.partitionBy(SRC_EDGE_BUCKET_COL).csv(TMP_DATA_DIRECTORY + \"_counts\", mode=\"overwrite\", sep=\"\\t\")\n\n    partition_offsets = []\n\n    os.system(\"rm -rf {}\".format(output_filename))\n    for i in range(num_partitions):\n        # looks like there is no way in glob to restrict to the pattern [0]*{i}- alone.\n        # it matches things like part-00004-sdfvf0-sdf.csv when given part-[0]*0-*.csv\n        tmp_edges_files = glob.glob(\"{}/part-[0]*{}-*.csv\".format(TMP_DATA_DIRECTORY + \"_edges\", str(i)))\n\n        tmp_counts_files = glob.glob(\n            \"{}/{}={}/*.csv\".format(TMP_DATA_DIRECTORY + \"_counts\", SRC_EDGE_BUCKET_COL, str(i))\n        )\n\n        edges_bucket_counts = np.zeros(num_partitions, dtype=np.int)\n        edge_file_pattern = re.compile(r\"{}/part-[0]*{}-.*\\.csv\".format(TMP_DATA_DIRECTORY + \"_edges\", str(i)))\n        for tmp_edges_file in tmp_edges_files:\n          
  if edge_file_pattern.match(tmp_edges_file):\n                os.system(\"cat {} >> {}\".format(tmp_edges_file, output_filename))\n\n        for tmp_counts_file in tmp_counts_files:\n            counts = pd.read_csv(tmp_counts_file, sep=\"\\t\", header=None)\n\n            dst_buckets = counts.iloc[:, 0].values\n            dst_counts = counts.iloc[:, 1].values\n\n            edges_bucket_counts[dst_buckets] = dst_counts\n\n        partition_offsets.append(edges_bucket_counts)\n\n    os.system(\"rm -rf {}\".format(TMP_DATA_DIRECTORY + \"_edges\"))\n    os.system(\"rm -rf {}\".format(TMP_DATA_DIRECTORY + \"_counts\"))\n\n    return np.concatenate(partition_offsets)\n\n\nclass SparkWriter(object):\n    def __init__(self, spark, output_dir, partitioned_evaluation):\n        super().__init__()\n\n        self.spark = spark\n        self.output_dir = output_dir\n        self.partitioned_evaluation = partitioned_evaluation\n\n    def write_to_csv(self, train_edges_df, valid_edges_df, test_edges_df, nodes_df, rels_df, num_partitions):\n        dataset_stats = DatasetConfig()\n        dataset_stats.dataset_dir = Path(self.output_dir).absolute().__str__()\n\n        dataset_stats.num_edges = get_df_count(train_edges_df, EDGES_INDEX_COL)\n        train_edges_df = train_edges_df.drop(EDGES_INDEX_COL)\n        dataset_stats.num_train = dataset_stats.num_edges\n\n        if valid_edges_df is not None:\n            dataset_stats.num_valid = get_df_count(valid_edges_df, EDGES_INDEX_COL)\n            valid_edges_df = valid_edges_df.drop(EDGES_INDEX_COL)\n        if test_edges_df is not None:\n            dataset_stats.num_test = get_df_count(test_edges_df, EDGES_INDEX_COL)\n            test_edges_df = test_edges_df.drop(EDGES_INDEX_COL)\n\n        dataset_stats.num_nodes = get_df_count(nodes_df, INDEX_COL)\n\n        if rels_df is None:\n            dataset_stats.num_relations = 1\n        else:\n            dataset_stats.num_relations = get_df_count(rels_df, REL_INDEX_COL)\n\n   
     with open(self.output_dir / Path(\"dataset.yaml\"), \"w\") as f:\n            print(\"Dataset statistics written to: {}\".format((self.output_dir / Path(\"dataset.yaml\")).__str__()))\n            yaml_file = OmegaConf.to_yaml(dataset_stats)\n            f.writelines(yaml_file)\n\n        write_df_to_csv(nodes_df, self.output_dir / Path(PathConstants.node_mapping_path))\n\n        if rels_df is not None:\n            write_df_to_csv(rels_df, self.output_dir / Path(PathConstants.relation_mapping_path))\n\n        if num_partitions > 1:\n            offsets = write_partitioned_df_to_csv(\n                train_edges_df, num_partitions, self.output_dir / Path(PathConstants.train_edges_path)\n            )\n\n            with open(self.output_dir / Path(PathConstants.train_edge_buckets_path), \"w\") as f:\n                f.writelines([str(o) + \"\\n\" for o in offsets])\n\n            if self.partitioned_evaluation:\n                if valid_edges_df is not None:\n                    offsets = write_partitioned_df_to_csv(\n                        valid_edges_df, num_partitions, self.output_dir / Path(PathConstants.valid_edges_path)\n                    )\n\n                    with open(self.output_dir / Path(PathConstants.valid_edge_buckets_path), \"w\") as f:\n                        f.writelines([str(o) + \"\\n\" for o in offsets])\n\n                if test_edges_df is not None:\n                    offsets = write_partitioned_df_to_csv(\n                        test_edges_df, num_partitions, self.output_dir / Path(PathConstants.test_edges_path)\n                    )\n                    with open(self.output_dir / Path(PathConstants.test_edge_buckets_path), \"w\") as f:\n                        f.writelines([str(o) + \"\\n\" for o in offsets])\n\n            else:\n                if valid_edges_df is not None:\n                    write_df_to_csv(valid_edges_df, self.output_dir / Path(PathConstants.valid_edges_path))\n\n                if test_edges_df is 
not None:\n                    write_df_to_csv(test_edges_df, self.output_dir / Path(PathConstants.test_edges_path))\n\n        else:\n            write_df_to_csv(train_edges_df, self.output_dir / Path(PathConstants.train_edges_path))\n\n            if valid_edges_df is not None:\n                write_df_to_csv(valid_edges_df, self.output_dir / Path(PathConstants.valid_edges_path))\n\n            if test_edges_df is not None:\n                write_df_to_csv(test_edges_df, self.output_dir / Path(PathConstants.test_edges_path))\n\n        return dataset_stats\n\n    def write_to_binary(self, train_edges_df, valid_edges_df, test_edges_df, nodes_df, rels_df, num_partitions):\n        print(\"Writing to CSV\")\n        dataset_stats = self.write_to_csv(\n            train_edges_df, valid_edges_df, test_edges_df, nodes_df, rels_df, num_partitions\n        )\n\n        train_file = self.output_dir / Path(PathConstants.train_edges_path)\n        valid_file = self.output_dir / Path(PathConstants.valid_edges_path)\n        test_file = self.output_dir / Path(PathConstants.test_edges_path)\n\n        tmp_train_file = TMP_DATA_DIRECTORY + \"tmp_train_edges.tmp\"\n        tmp_valid_file = TMP_DATA_DIRECTORY + \"tmp_valid_edges.tmp\"\n        tmp_test_file = TMP_DATA_DIRECTORY + \"tmp_test_edges.tmp\"\n\n        print(\"Converting to binary\")\n        os.rename(train_file, tmp_train_file)\n        convert_to_binary(tmp_train_file, train_file)\n\n        if valid_edges_df is not None:\n            os.rename(valid_file, tmp_valid_file)\n            convert_to_binary(tmp_valid_file, valid_file)\n\n        if test_edges_df is not None:\n            os.rename(test_file, tmp_test_file)\n            convert_to_binary(tmp_test_file, test_file)\n\n        return dataset_stats\n"
  },
  {
    "path": "src/python/tools/preprocess/converters/writers/torch_writer.py",
    "content": "from pathlib import Path\n\nimport numpy as np\nfrom omegaconf import OmegaConf\n\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.configuration.marius_config import DatasetConfig\n\n\nclass TorchWriter(object):\n    def __init__(self, output_dir, partitioned_evaluation):\n        super().__init__()\n\n        self.output_dir = output_dir\n        self.partitioned_evaluation = partitioned_evaluation\n\n    def write_to_binary(\n        self,\n        train_edges_tens,\n        valid_edges_tens,\n        test_edges_tens,\n        num_nodes,\n        num_rels,\n        num_partitions,\n        train_edges_offsets=None,\n        valid_edges_offsets=None,\n        test_edges_offsets=None,\n        edge_weights=None,\n    ):\n        dataset_stats = DatasetConfig()\n        dataset_stats.dataset_dir = Path(self.output_dir).absolute().__str__() + \"/\"\n\n        dataset_stats.num_edges = train_edges_tens.size(0)\n        dataset_stats.num_train = train_edges_tens.size(0)\n\n        if valid_edges_tens is not None:\n            dataset_stats.num_valid = valid_edges_tens.size(0)\n        if test_edges_tens is not None:\n            dataset_stats.num_test = test_edges_tens.size(0)\n\n        dataset_stats.num_nodes = num_nodes\n        dataset_stats.num_relations = num_rels\n\n        with open(self.output_dir / Path(\"dataset.yaml\"), \"w\") as f:\n            print(\"Dataset statistics written to: {}\".format((self.output_dir / Path(\"dataset.yaml\")).__str__()))\n            yaml_file = OmegaConf.to_yaml(dataset_stats)\n            f.writelines(yaml_file)\n\n        # Read the edge weights\n        train_edges_weights, valid_edges_weights, test_edges_weights = None, None, None\n        if edge_weights is not None:\n            train_edges_weights, valid_edges_weights, test_edges_weights = (\n                edge_weights[0],\n                edge_weights[1],\n                edge_weights[2],\n            )\n\n        with 
open(self.output_dir / Path(PathConstants.train_edges_path), \"wb\") as f:\n            print(\"Train edges written to:\", PathConstants.train_edges_path)\n            f.write(bytes(train_edges_tens.numpy()))\n\n        if train_edges_weights is not None:\n            train_weights_save_path = self.output_dir / Path(PathConstants.train_edges_weights_path)\n            print(\"Train edges weights written to:\", train_weights_save_path)\n            train_weights_arr = train_edges_weights.numpy().flatten().astype(np.float32)\n            train_weights_arr.tofile(train_weights_save_path)\n\n        if valid_edges_tens is not None:\n            print(\"Valid edges written to:\", PathConstants.valid_edges_path)\n            with open(self.output_dir / Path(PathConstants.valid_edges_path), \"wb\") as f:\n                f.write(bytes(valid_edges_tens.numpy()))\n\n            if valid_edges_weights is not None:\n                valid_weights_save_path = self.output_dir / Path(PathConstants.valid_edges_weights_path)\n                print(\"Valid edges weights written to:\", PathConstants.valid_edges_weights_path)\n                valid_weights_arr = valid_edges_weights.numpy().flatten().astype(np.float32)\n                valid_weights_arr.tofile(valid_weights_save_path)\n\n        if test_edges_tens is not None:\n            print(\"Test edges written to:\", PathConstants.test_edges_path)\n            with open(self.output_dir / Path(PathConstants.test_edges_path), \"wb\") as f:\n                f.write(bytes(test_edges_tens.numpy()))\n\n            if test_edges_weights is not None:\n                test_weights_save_path = self.output_dir / Path(PathConstants.test_edges_weights_path)\n                print(\"Test edge weights written to:\", PathConstants.test_edges_weights_path)\n                test_weights_arr = test_edges_weights.numpy().flatten().astype(np.float32)\n                test_weights_arr.tofile(test_weights_save_path)\n\n        if num_partitions > 1:\n  
          with open(self.output_dir / Path(PathConstants.train_edge_buckets_path), \"w\") as f:\n                print(\"Train partition offsets written to:\", PathConstants.train_edge_buckets_path)\n                f.writelines([str(o) + \"\\n\" for o in train_edges_offsets])\n\n            if valid_edges_offsets is not None:\n                print(\"Valid partition offsets written to:\", PathConstants.valid_edge_buckets_path)\n                with open(self.output_dir / Path(PathConstants.valid_edge_buckets_path), \"w\") as f:\n                    f.writelines([str(o) + \"\\n\" for o in valid_edges_offsets])\n\n            if test_edges_offsets is not None:\n                print(\"Test partition offsets written to:\", PathConstants.test_edge_buckets_path)\n                with open(self.output_dir / Path(PathConstants.test_edge_buckets_path), \"w\") as f:\n                    f.writelines([str(o) + \"\\n\" for o in test_edges_offsets])\n\n        return dataset_stats\n"
  },
  {
    "path": "src/python/tools/preprocess/converters/writers/writer.py",
    "content": ""
  },
  {
    "path": "src/python/tools/preprocess/custom.py",
    "content": "import importlib\nfrom pathlib import Path\n\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\n\npyspark_loader = importlib.find_loader(\"pyspark\")\npyspark_found = pyspark_loader is not None\n\nif pyspark_found:\n    from marius.tools.preprocess.converters.spark_converter import SparkEdgeListConverter\n\n\nclass CustomLinkPredictionDataset(LinkPredictionDataset):\n    def __init__(\n        self, output_directory: Path, files: list, delim: str = \"\\t\", dataset_name: str = \"custom\", spark: bool = False\n    ):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = dataset_name\n        self.output_directory = output_directory\n\n        if len(files) == 1:\n            self.train_edges_file = files[0]\n            self.valid_edges_file = None\n            self.test_edges_file = None\n\n        if len(files) == 3:\n            self.train_edges_file = files[0]\n            self.valid_edges_file = files[1]\n            self.test_edges_file = files[2]\n\n        self.delim = delim\n        self.spark = spark\n\n    def download(self, overwrite=False):\n        pass\n\n    def preprocess(\n        self,\n        num_partitions=1,\n        remap_ids=True,\n        splits=[0.9, 0.05, 0.05],\n        partitioned_eval=False,\n        sequential_train_nodes=False,\n        src_column=None,\n        dst_column=None,\n        edge_type_column=None,\n        edge_weight_column=None,\n    ):\n        if self.spark and pyspark_found:\n            converter_class = SparkEdgeListConverter\n        else:\n            converter_class = TorchEdgeListConverter\n\n        converter = converter_class(\n            output_dir=self.output_directory,\n            train_edges=self.train_edges_file,\n            valid_edges=self.valid_edges_file,\n            test_edges=self.test_edges_file,\n            delim=self.delim,\n            
src_column=src_column,\n            dst_column=dst_column,\n            edge_type_column=edge_type_column,\n            edge_weight_column=edge_weight_column,\n            num_partitions=num_partitions,\n            sequential_train_nodes=sequential_train_nodes,\n            splits=splits,\n            remap_ids=remap_ids,\n            partitioned_evaluation=partitioned_eval,\n        )\n\n        return converter.convert()\n"
  },
  {
    "path": "src/python/tools/preprocess/dataset.py",
    "content": "import os\nfrom abc import ABC, abstractmethod\nfrom pathlib import Path\n\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.configuration.marius_config import DatasetConfig\n\n\nclass Dataset(ABC):\n    \"\"\"\n    Abstract dataset class\n    \"\"\"\n\n    edge_list_file: Path\n    edge_features_file: Path\n\n    node_mapping_file: Path\n    node_features_file: Path\n\n    relation_mapping_file: Path\n    relation_features_file: Path\n\n    node_type_file: Path\n    node_features_file: Path\n\n    dataset_name: str\n    dataset_url: str\n    output_directory: Path\n\n    spark: bool\n\n    def __init__(self, output_directory, spark=False):\n        self.output_directory = output_directory\n        self.spark = spark\n        os.makedirs(self.output_directory / Path(PathConstants.edges_directory), exist_ok=True)\n        os.makedirs(self.output_directory / Path(PathConstants.nodes_directory), exist_ok=True)\n\n        self.edge_list_file = self.output_directory / Path(PathConstants.train_edges_path)\n        self.edge_buckets_file = self.output_directory / Path(PathConstants.train_edge_buckets_path)\n\n        self.node_features_file = self.output_directory / Path(PathConstants.node_features_path)\n        self.relation_features_file = self.output_directory / Path(PathConstants.relation_features_path)\n\n    @abstractmethod\n    def download(self, overwrite=False):\n        pass\n\n    @abstractmethod\n    def preprocess(self) -> DatasetConfig:\n        pass\n\n\nclass NodeClassificationDataset(Dataset):\n    def __init__(self, output_directory, spark):\n        super().__init__(output_directory, spark)\n\n        self.train_nodes_file = output_directory / Path(PathConstants.train_nodes_path)\n        self.valid_nodes_file = output_directory / Path(PathConstants.valid_nodes_path)\n        self.test_nodes_file = output_directory / Path(PathConstants.test_nodes_path)\n\n        self.node_labels_file = output_directory 
/ Path(PathConstants.labels_path)\n\n\nclass LinkPredictionDataset(Dataset):\n    def __init__(self, output_directory, spark):\n        super().__init__(output_directory, spark)\n\n        self.train_edges_file = output_directory / Path(PathConstants.train_edges_path)\n        self.train_edge_buckets_file = self.output_directory / Path(PathConstants.train_edge_buckets_path)\n\n        self.valid_edges_file = output_directory / Path(PathConstants.valid_edges_path)\n        self.valid_edge_buckets_file = self.output_directory / Path(PathConstants.valid_edge_buckets_path)\n\n        self.test_edges_file = output_directory / Path(PathConstants.test_edges_path)\n        self.test_edge_buckets_file = self.output_directory / Path(PathConstants.test_edge_buckets_path)\n\n\nclass GraphClassificationDataset(Dataset):\n    pass\n"
  },
  {
    "path": "src/python/tools/preprocess/dataset_stats.tsv",
    "content": "dataset\tnum_nodes\tnum_edges\tnum_relations\tnum_train\tnum_valid\tnum_test\nlive_journal\t4847571\t68993773\t1\t62094395\t3449689\t3449689\nfb15k\t14951\t592213\t1345\t483142\t50000\t59071\nfb15k_237\t114541\t310116\t237\t272115\t17535\t20466\nwn18\t40943\t151442\t18\t141442\t5000\t5000\nwn18rr\t40943\t93003\t11\t86835\t3034\t3134\ncodex_s\t2034\t36543\t42\t32888\t1827\t1828\ncodex_m\t17050\t206205\t51\t185584\t10310\t10311\ncodex_l\t77951\t612437\t69\t551193\t30622\t30622\ndrkg\t97238\t5874261\t107\t5286834\t293713\t293714\nhetionet\t45160\t2250198\t25\t2025178\t112510\t112510\nfreebase86m\t86054151\t338586276\t14824\t304727650\t16929318\t16929308\nkinships\t24\t112\t12\t100\t6\t6\nogbl_ppa\t576289\t30326273\t1\t21231931\t6062562\t3031780\nogbl_ddi\t4267\t1334889\t1\t1067911\t133489\t133489\nogbl_collab\t235868\t1285465\t1\t1179052\t60084\t46329\nogbl_biokg\t45085\t5088434\t51\t4762678\t162886\t162870\nogbn_arxiv\t169341\t1166243\t1\t378578\t247936\t539729\nogbn_proteins\t132534\t39561254\t1\t23798109\t5980116\t9783027\nogbn_products\t2400608\t61859140\t1\t28266324\t2379825\t31212991\nopenbiolink_hq\t184635\t4563405\t28\t4192002\t188394\t193009\nopenbiolink_lq\t486942\t27320889\t32\t25508954\t1132001\t679934"
  },
  {
    "path": "src/python/tools/preprocess/datasets/__init__.py",
    "content": "__all__ = [\n    \"fb15k\",\n    \"livejournal\",\n    \"freebase86m\",\n    \"ogbl_wikikg2\",\n    \"ogbl_citation2\",\n    \"ogbl_ppa\",\n    \"ogbn_arxiv\",\n    \"ogbn_products\",\n    \"ogbn_papers100m\",\n    \"twitter\",\n    \"fb15k_237\",\n    \"ogb_wikikg90mv2\",\n    \"ogb_mag240m\",\n    \"ogbl_collab\",\n]\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/dataset_helpers.py",
    "content": "import numpy as np\n\n\ndef remap_nodes(node_mapping, train_nodes, valid_nodes, test_nodes, features, labels):\n    num_nodes = node_mapping.shape[0]\n\n    random_map = node_mapping[:, 1]\n    random_map = random_map.astype(train_nodes.dtype)\n    random_map_argsort = np.argsort(random_map)\n\n    train_nodes = random_map[train_nodes]\n    valid_nodes = random_map[valid_nodes]\n    test_nodes = random_map[test_nodes]\n\n    features = features[random_map_argsort]\n\n    if labels.shape[0] != num_nodes:\n        labels = np.concatenate((labels, -np.ones([num_nodes - labels.shape[0]], dtype=np.int32)))\n\n    labels = labels[random_map_argsort]\n\n    return train_nodes, valid_nodes, test_nodes, features, labels\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/fb15k.py",
    "content": "from pathlib import Path\n\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\n\nclass FB15K(LinkPredictionDataset):\n    \"\"\"\n    Freebase 15k\n\n    The FB15k dataset contains knowledge base relation triples and textual\n    mentions of Freebase entity pairs. It has a total of 592,213 triplets\n    with 14,951 entities and 1,345 relationships.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"fb15k\"\n        self.dataset_url = \"https://dl.fbaipublicfiles.com/starspace/fb15k.tgz\"\n\n    def download(self, overwrite=False):\n        self.input_train_edges_file = self.output_directory / Path(\"freebase_mtr100_mte100-train.txt\")\n        self.input_valid_edges_file = self.output_directory / Path(\"freebase_mtr100_mte100-valid.txt\")\n        self.input_test_edges_file = self.output_directory / Path(\"freebase_mtr100_mte100-test.txt\")\n\n        download = False\n        if not self.input_train_edges_file.exists():\n            download = True\n        if not self.input_valid_edges_file.exists():\n            download = True\n        if not self.input_test_edges_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=True)\n\n            for file in (self.output_directory / Path(\"FB15k\")).iterdir():\n                file.rename(self.output_directory / Path(file.name))\n\n            (self.output_directory / Path(\"FB15k\")).rmdir()\n\n    def preprocess(\n        self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False\n    ):\n        converter = 
TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=self.input_train_edges_file,\n            valid_edges=self.input_valid_edges_file,\n            test_edges=self.input_test_edges_file,\n            num_partitions=num_partitions,\n            remap_ids=remap_ids,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n            partitioned_evaluation=partitioned_eval,\n        )\n\n        return converter.convert()\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/fb15k_237.py",
    "content": "from pathlib import Path\n\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\n\nclass FB15K237(LinkPredictionDataset):\n    \"\"\"\n    Freebase 15k 237\n\n    The FB15k dataset contains knowledge base relation triples and textual mentions\n    of Freebase entity pairs. It has a total of 592,213 triplets with 14,951 entities\n    and 1,345 relationships. FB15K-237 is a variant of the original dataset where\n    inverse relations are removed, since it was found that a large number of test triplets\n    could be obtained by inverting triplets in the training set.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"fb15k237\"\n        self.dataset_url = \"https://data.deepai.org/FB15K-237.2.zip\"\n\n    def download(self, overwrite=False):\n        self.input_train_edges_file = self.output_directory / Path(\"train.txt\")\n        self.input_valid_edges_file = self.output_directory / Path(\"valid.txt\")\n        self.input_test_edges_file = self.output_directory / Path(\"test.txt\")\n\n        download = False\n        if not self.input_train_edges_file.exists():\n            download = True\n        if not self.input_valid_edges_file.exists():\n            download = True\n        if not self.input_test_edges_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=True)\n\n            for file in (self.output_directory / Path(\"Release\")).iterdir():\n                file.rename(self.output_directory / Path(file.name))\n\n            (self.output_directory / Path(\"Release\")).rmdir()\n\n    def preprocess(\n        
self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False\n    ):\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=self.input_train_edges_file,\n            valid_edges=self.input_valid_edges_file,\n            test_edges=self.input_test_edges_file,\n            num_partitions=num_partitions,\n            remap_ids=remap_ids,\n            partitioned_evaluation=partitioned_eval,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        return converter.convert()\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/freebase86m.py",
    "content": "from pathlib import Path\n\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\n\nclass Freebase86m(LinkPredictionDataset):\n    \"\"\"\n    Freebase\n\n    The full Freebase dataset. 86054151 nodes, 338586276 edges, 14824 relations.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"freebase86m\"\n        self.dataset_url = \"https://data.dgl.ai/dataset/Freebase.zip\"\n\n    def download(self, overwrite=False):\n        self.input_train_edges_file = self.output_directory / Path(\"train.txt\")\n        self.input_valid_edges_file = self.output_directory / Path(\"valid.txt\")\n        self.input_test_edges_file = self.output_directory / Path(\"test.txt\")\n\n        download = False\n        if not self.input_train_edges_file.exists():\n            download = True\n        if not self.input_valid_edges_file.exists():\n            download = True\n        if not self.input_test_edges_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=True)\n\n            for file in (self.output_directory / Path(\"Freebase\")).iterdir():\n                file.rename(self.output_directory / Path(file.name))\n\n            (self.output_directory / Path(\"Freebase\")).rmdir()\n\n    def preprocess(\n        self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False\n    ):\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=self.input_train_edges_file,\n            valid_edges=self.input_valid_edges_file,\n            
test_edges=self.input_test_edges_file,\n            num_partitions=num_partitions,\n            src_column=0,\n            dst_column=1,\n            edge_type_column=2,\n            remap_ids=remap_ids,\n            partitioned_evaluation=partitioned_eval,\n        )\n\n        return converter.convert()\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/friendster.py",
    "content": "from pathlib import Path\n\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\nfrom marius.tools.preprocess.utils import download_url, extract_file, strip_header\n\n\nclass Friendster(LinkPredictionDataset):\n    \"\"\"\n    Friendster\n\n    Friendster is an on-line gaming network.\n    Before re-launching as a game website, Friendster was a social networking site where\n    users could form friendship edges with each other. The Friendster social network also allowed\n    users to form a group which other members could then join. We consider such user-defined\n    groups as ground-truth communities. For the social network, we take the induced subgraph\n    of the nodes that either belong to at least one community or are connected to other nodes\n    that belong to at least one community. 65,608,366 nodes, 1,806,067,135 edges.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"friendster\"\n        self.dataset_url = \"https://snap.stanford.edu/data/bigdata/communities/com-friendster.ungraph.txt.gz\"\n\n    def download(self, overwrite=False):\n        self.input_edges = self.output_directory / Path(\"com-friendster.ungraph.txt\")\n\n        if not self.input_edges.exists():\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=True)\n            strip_header(self.input_edges, num_lines=4)\n\n    def preprocess(\n        self,\n        num_partitions=1,\n        remap_ids=True,\n        splits=None,\n        sequential_train_nodes=False,\n        generate_random_features=False,\n        node_feature_dim=32,\n        num_classes=50,\n        node_splits=[0.1, 0.05, 0.05],\n        partitioned_eval=False,\n    ):\n        converter = TorchEdgeListConverter(\n            
output_dir=self.output_directory,\n            train_edges=self.input_edges,\n            delim=\"\\t\",\n            src_column=0,\n            dst_column=1,\n            header_length=0,\n            num_partitions=num_partitions,\n            splits=splits,\n            remap_ids=remap_ids,\n            partitioned_evaluation=partitioned_eval,\n        )\n\n        return converter.convert()\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/livejournal.py",
    "content": "from pathlib import Path\n\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\nfrom marius.tools.preprocess.utils import download_url, extract_file, strip_header\n\n\nclass Livejournal(LinkPredictionDataset):\n    \"\"\"\n    Livejournal\n\n    LiveJournal is a free on-line community with almost 10 million members;\n    a significant fraction of these members are highly active.\n    (For example, roughly 300,000 update their content in any given 24-hour period.)\n    LiveJournal allows members to maintain journals, individual and group blogs,\n    and it allows people to declare which other members are their friends.\n    4,847,571 nodes, 68,993,773 edges.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"livejournal\"\n        self.dataset_url = \"https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz\"\n\n    def download(self, overwrite=False):\n        self.input_edges = self.output_directory / Path(\"soc-LiveJournal1.txt\")\n\n        if not self.input_edges.exists():\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=True)\n            strip_header(self.input_edges, num_lines=4)\n\n    def preprocess(\n        self,\n        num_partitions=1,\n        remap_ids=True,\n        splits=[0.9, 0.05, 0.05],\n        sequential_train_nodes=False,\n        partitioned_eval=False,\n    ):\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=self.input_edges,\n            delim=\"\\t\",\n            src_column=0,\n            dst_column=1,\n            header_length=0,\n            num_partitions=num_partitions,\n            splits=splits,\n            remap_ids=remap_ids,\n       
     partitioned_evaluation=partitioned_eval,\n        )\n\n        return converter.convert()\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/ogb_mag240m.py",
    "content": "import os\nfrom pathlib import Path\n\nimport numpy as np\nfrom omegaconf import OmegaConf\n\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import NodeClassificationDataset\nfrom marius.tools.preprocess.datasets.dataset_helpers import remap_nodes\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\nimport torch  # isort:skip\n\n\nclass OGBMag240M(NodeClassificationDataset):\n    \"\"\"\n    Open Graph Benchmark: mag\n\n    The ogbn-mag dataset is a heterogeneous network composed of a subset of the Microsoft Academic Graph (MAG).\n    It contains four types of entities—papers, authors, institutions,\n    and fields of study—as well as four types of directed relations connecting two types of entities—an author\n    is “affiliated with” an institution, an author “writes” a paper, a paper “cites” a paper,\n    and a paper “has a topic of” a field of study. 
Similar to ogbn-arxiv,\n    each paper is associated with a 128-dimensional word2vec feature vector,\n    and all the other types of entities are not associated with input node features.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"ogb_mag240m\"\n        self.dataset_url = \"https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/mag240m_kddcup2021.zip\"\n\n    def download(self, overwrite=False):\n        self.input_cites_edge_list_file = self.output_directory / Path(\"cites_edge_index.npy\")\n        self.input_splits_file = self.output_directory / Path(\"split_dict.pt\")\n        self.input_node_feature_file = self.output_directory / Path(\"node_feat.npy\")\n        self.input_node_label_file = self.output_directory / Path(\"node_label.npy\")\n\n        download = False\n        if not self.input_cites_edge_list_file.exists():\n            download = True\n        if not self.input_splits_file.exists():\n            download = True\n        if not self.input_node_feature_file.exists():\n            download = True\n        if not self.input_node_label_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=False)\n\n            (self.output_directory / Path(\"mag240m_kddcup2021/processed/paper___cites___paper/edge_index.npy\")).rename(\n                self.input_cites_edge_list_file\n            )\n            (self.output_directory / Path(\"mag240m_kddcup2021/split_dict.pt\")).rename(self.input_splits_file)\n            (self.output_directory / Path(\"mag240m_kddcup2021/processed/paper/node_feat.npy\")).rename(\n                self.input_node_feature_file\n            )\n            (self.output_directory / Path(\"mag240m_kddcup2021/processed/paper/node_label.npy\")).rename(\n           
     self.input_node_label_file\n            )\n\n    def preprocess(\n        self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False\n    ):\n        citation_edges = np.load(self.input_cites_edge_list_file).astype(np.int32).transpose()\n\n        split_dict = torch.load(self.input_splits_file)\n\n        train_nodes = split_dict[\"train\"].astype(np.int32)\n        valid_nodes = split_dict[\"valid\"].astype(np.int32)\n        # test_nodes = split_dict['test'].astype(np.int32)\n        test_nodes = valid_nodes\n\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=citation_edges,\n            num_partitions=num_partitions,\n            remap_ids=remap_ids,\n            sequential_train_nodes=sequential_train_nodes,\n            format=\"numpy\",\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n            known_node_ids=[\n                train_nodes,\n                valid_nodes,\n                test_nodes,\n                np.arange(121751666, dtype=np.int32),\n            ],  # not all nodes appear in the edges\n            num_nodes=121751666,\n            num_rels=1,\n            partitioned_evaluation=partitioned_eval,\n        )\n\n        dataset_stats = converter.convert()\n\n        features = np.load(self.input_node_feature_file)\n        labels = np.load(self.input_node_label_file)\n        labels[np.isnan(labels)] = -1\n        labels = labels.astype(np.int32)\n\n        if remap_ids:\n            node_mapping = np.genfromtxt(self.output_directory / Path(PathConstants.node_mapping_path), delimiter=\",\")\n            train_nodes, valid_nodes, test_nodes, features, labels = remap_nodes(\n                node_mapping, train_nodes, valid_nodes, test_nodes, features, labels\n            )\n\n        # convert to float32 in chunks, tested on ~500 GB RAM, need at least ~375GB minimum for float32 
features\n        num_nodes = features.shape[0]\n        feat_dim = features.shape[1]\n        np.save(self.output_directory / Path(\"temp.npy\"), features)\n        features = np.zeros((num_nodes, feat_dim), np.float32)\n        chunk_size = int(2e7)\n        start = 0\n        while start < num_nodes:\n            float16_features = np.load(self.output_directory / Path(\"temp.npy\"), mmap_mode=\"r\")[\n                start : start + chunk_size\n            ]\n            features[start : start + chunk_size] = float16_features.astype(np.float32)\n            start += chunk_size\n        os.remove(self.output_directory / Path(\"temp.npy\"))\n\n        with open(self.train_nodes_file, \"wb\") as f:\n            f.write(bytes(train_nodes))\n        with open(self.valid_nodes_file, \"wb\") as f:\n            f.write(bytes(valid_nodes))\n        with open(self.test_nodes_file, \"wb\") as f:\n            f.write(bytes(test_nodes))\n        with open(self.node_features_file, \"wb\") as f:\n            chunk_size = int(1e7)\n            start = 0\n            while start < num_nodes:\n                f.write(bytes(features[start : start + chunk_size]))\n                start += chunk_size\n        with open(self.node_labels_file, \"wb\") as f:\n            f.write(bytes(labels))\n\n        # update dataset yaml\n        dataset_stats.num_train = train_nodes.shape[0]\n        dataset_stats.num_valid = valid_nodes.shape[0]\n        dataset_stats.num_test = test_nodes.shape[0]\n        dataset_stats.feature_dim = features.shape[1]\n        dataset_stats.num_classes = 153\n\n        dataset_stats.num_nodes = labels.shape[0]\n\n        with open(self.output_directory / Path(\"dataset.yaml\"), \"w\") as f:\n            yaml_file = OmegaConf.to_yaml(dataset_stats)\n            f.writelines(yaml_file)\n\n        return dataset_stats\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/ogb_wikikg90mv2.py",
    "content": "from pathlib import Path\n\nimport numpy as np\nfrom omegaconf import OmegaConf\n\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\n\nclass OGBWikiKG90Mv2(LinkPredictionDataset):\n    \"\"\"\n    Open Graph Benchmark: wikikg2\n\n    The ogbl-wikikg2 dataset is a Knowledge Graph (KG) extracted from the Wikidata knowledge base.\n    It contains a set of triplet edges (head, relation, tail),\n    capturing the different types of relations between entities in the world, e.g.,\n    (Canada, citizen, Hinton). We retrieve all the relational statements in Wikidata and filter out rare entities.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"ogb_wikikg90mv2\"\n        self.dataset_url = \"https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/wikikg90m-v2.zip\"\n\n    def download(self, overwrite=False):\n        self.input_train_edges_file = self.output_directory / Path(\"train_hrt.npy\")\n        self.input_valid_edges_sr_file = self.output_directory / Path(\"val_hr.npy\")\n        self.input_valid_edges_d_file = self.output_directory / Path(\"val_t.npy\")\n        # self.input_test_edges_file = self.output_directory / Path(\"test-dev_hr.npy\")\n        # self.input_test_edges_file = self.output_directory / Path(\"test-challenge_hr.npy\")\n\n        self.input_node_feature_file = self.output_directory / Path(\"entity_feat.npy\")\n        self.input_rel_feature_file = self.output_directory / Path(\"relation_feat.npy\")\n\n        download = False\n        if not self.input_train_edges_file.exists():\n            download = True\n        if not self.input_valid_edges_sr_file.exists():\n            download 
= True\n        if not self.input_valid_edges_d_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=True)\n\n            for file in (self.output_directory / Path(\"wikikg90m-v2/processed/\")).iterdir():\n                file.rename(self.output_directory / Path(file.name))\n\n    def preprocess(\n        self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False\n    ):\n        train_edges = np.load(self.input_train_edges_file).astype(np.int32)\n        valid_edges_sr = np.load(self.input_valid_edges_sr_file)\n        valid_edges_d = np.load(self.input_valid_edges_d_file)\n\n        valid_edges = np.concatenate((valid_edges_sr, np.reshape(valid_edges_d, (-1, 1))), axis=1).astype(np.int32)\n\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=train_edges,\n            valid_edges=valid_edges,\n            test_edges=valid_edges,\n            num_partitions=num_partitions,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n            remap_ids=remap_ids,\n            sequential_train_nodes=sequential_train_nodes,\n            format=\"numpy\",\n            partitioned_evaluation=partitioned_eval,\n        )\n\n        dataset_stats = converter.convert()\n\n        node_features = np.load(self.input_node_feature_file).astype(np.float32)\n        rel_features = np.load(self.input_rel_feature_file).astype(np.float32)\n\n        if remap_ids:\n            node_mapping = np.genfromtxt(self.output_directory / Path(PathConstants.node_mapping_path), delimiter=\",\")\n            random_node_map = node_mapping[:, 1].astype(np.int32)\n            random_node_map_argsort = np.argsort(random_node_map)\n\n            with open(self.node_features_file, \"wb\") as f:\n       
         chunk_size = 1e7\n                num_chunks = np.ceil(node_mapping.shape[0] / chunk_size)\n\n                offset = 0\n\n                for chunk_id in range(num_chunks):\n                    if offset + chunk_size >= node_mapping.shape[0]:\n                        chunk_size = node_mapping.shape[0] - offset\n                    f.write(bytes(node_features[random_node_map_argsort[offset : offset + chunk_size]]))\n\n            rel_mapping = np.genfromtxt(\n                self.output_directory / Path(PathConstants.relation_mapping_path), delimiter=\",\"\n            )\n            random_rel_map = rel_mapping[:, 1].astype(np.int32)\n            random_rel_map_argsort = np.argsort(random_rel_map)\n            rel_features = rel_features[random_rel_map_argsort]\n        else:\n            with open(self.node_features_file, \"wb\") as f:\n                f.write(bytes(node_features))\n\n        with open(self.relation_features_file, \"wb\") as f:\n            f.write(bytes(rel_features))\n\n        # update dataset yaml\n        dataset_stats.node_feature_dim = node_features.shape[1]\n        dataset_stats.rel_feature_dim = rel_features.shape[1]\n\n        with open(self.output_directory / Path(\"dataset.yaml\"), \"w\") as f:\n            yaml_file = OmegaConf.to_yaml(dataset_stats)\n            f.writelines(yaml_file)\n\n        return dataset_stats\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/ogbl_citation2.py",
    "content": "from pathlib import Path\n\nimport numpy as np\n\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\nimport torch  # isort:skip\n\n\nclass OGBLCitation2(LinkPredictionDataset):\n    \"\"\"\n    Open Graph Benchmark: citation2\n\n    The ogbl-citation2 dataset is a directed graph, representing the citation network between\n    a subset of papers extracted from MAG. Each node is a paper with 128-dimensional\n    word2vec features that summarizes its title and abstract, and each directed edge\n    indicates that one paper cites another. All nodes also come with meta-information\n    indicating the year the corresponding paper was published.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"ogbl_citation2\"\n        self.dataset_url = \"http://snap.stanford.edu/ogb/data/linkproppred/citation-v2.zip\"\n\n    def download(self, overwrite=False):\n        self.input_train_edges_file = self.output_directory / Path(\"train.pt\")\n        self.input_valid_edges_file = self.output_directory / Path(\"valid.pt\")\n        self.input_test_edges_file = self.output_directory / Path(\"test.pt\")\n\n        download = False\n        if not self.input_train_edges_file.exists():\n            download = True\n        if not self.input_valid_edges_file.exists():\n            download = True\n        if not self.input_test_edges_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=False)\n\n            for file in (self.output_directory / Path(\"citation-v2/split/time\")).iterdir():\n                file.rename(self.output_directory / 
Path(file.name))\n\n    def preprocess(\n        self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False\n    ):\n        train_idx = torch.load(self.input_train_edges_file)\n        valid_idx = torch.load(self.input_valid_edges_file)\n        test_idx = torch.load(self.input_test_edges_file)\n\n        train_list = np.array([train_idx.get(\"source_node\"), train_idx.get(\"target_node\")]).T\n        valid_list = np.array([valid_idx.get(\"source_node\"), valid_idx.get(\"target_node\")]).T\n        test_list = np.array([test_idx.get(\"source_node\"), test_idx.get(\"target_node\")]).T\n\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=train_list,\n            valid_edges=valid_list,\n            test_edges=test_list,\n            num_partitions=num_partitions,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n            remap_ids=remap_ids,\n            known_node_ids=[\n                torch.arange(2927963)\n            ],  # not all nodes appear in the edges, need to supply all node ids for the mapping to be correct\n            format=\"numpy\",\n            partitioned_evaluation=partitioned_eval,\n        )\n\n        return converter.convert()\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/ogbl_collab.py",
    "content": "from pathlib import Path\n\nimport pandas as pd\n\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\nimport torch  # isort:skip\n\n\nclass OGBLCollab(LinkPredictionDataset):\n    \"\"\"\n    Open Graph Benchmark: collab\n\n    The ogbl-collab dataset is a weighted directed graph, representing a subset of the collaboration network\n    between authors indexed by MAG. Each node represents an author and edges indicate the collaboration between\n    authors. All nodes come with 128-dimensional features, obtained by averaging the word embeddings of papers\n    that are published by the authors. All edges are associated with two pieces of meta-information: the year and the\n    edge weight, representing the number of co-authored papers published in that year. The graph can be viewed\n    as a dynamic multi-graph since there can be multiple edges between two nodes if they collaborate in more\n    than one year.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False, include_edge_type=True, include_edge_weight=True):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"ogbl_collab\"\n        self.dataset_url = \"http://snap.stanford.edu/ogb/data/linkproppred/collab.zip\"\n        self.node_ids = None\n        self.include_edge_type = include_edge_type\n        self.include_edge_weight = include_edge_weight\n\n    def download(self, overwrite=False):\n        self.input_train_edges_file = self.output_directory / Path(\"train.pt\")\n        self.input_valid_edges_file = self.output_directory / Path(\"valid.pt\")\n        self.input_test_edges_file = self.output_directory / Path(\"test.pt\")\n\n        download = False\n        if overwrite:\n            download = True\n        elif not self.input_train_edges_file.exists():\n            
download = True\n        elif not self.input_valid_edges_file.exists():\n            download = True\n        elif not self.input_test_edges_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=False)\n\n            for file in (self.output_directory / Path(\"collab/split/time\")).iterdir():\n                file.rename(self.output_directory / Path(file.name))\n\n        # Read in the nodes\n        nodes_path = Path(self.output_directory).joinpath(\"collab\", \"raw\", \"num-node-list.csv.gz\")\n        df = pd.read_csv(nodes_path, compression=\"gzip\", header=None)\n        self.num_nodes = df.iloc[0][0]\n\n    def preprocess(\n        self,\n        num_partitions=1,\n        remap_ids=True,\n        splits=None,\n        sequential_train_nodes=False,\n        partitioned_eval=False,\n    ):\n        # Read in the training data\n        train_idx = torch.load(self.input_train_edges_file)\n        train_edges = torch.from_numpy(train_idx.get(\"edge\"))\n\n        # Read in the valid data\n        valid_idx = torch.load(self.input_valid_edges_file)\n        valid_edges = torch.from_numpy(valid_idx.get(\"edge\"))\n\n        # Read in the test data\n        test_idx = torch.load(self.input_test_edges_file)\n        test_edges = torch.from_numpy(test_idx.get(\"edge\"))\n\n        edge_type_column, edge_weight_column = None, None\n        if self.include_edge_type:\n            # Add in the year information\n            train_year = torch.from_numpy(train_idx.get(\"year\").reshape(-1, 1))\n            train_edges = torch.cat((train_edges, train_year), dim=1)\n\n            valid_year = torch.from_numpy(valid_idx.get(\"year\").reshape(-1, 1))\n            valid_edges = torch.cat((valid_edges, valid_year), dim=1)\n\n            test_year = torch.from_numpy(test_idx.get(\"year\").reshape(-1, 1))\n            
test_edges = torch.cat((test_edges, test_year), dim=1)\n\n            edge_type_column = 2\n\n        if self.include_edge_weight:\n            # Add in the weights\n            train_weight = torch.from_numpy(train_idx.get(\"weight\").reshape(-1, 1))\n            train_edges = torch.cat((train_edges, train_weight), dim=1)\n\n            valid_weight = torch.from_numpy(valid_idx.get(\"weight\").reshape(-1, 1))\n            valid_edges = torch.cat((valid_edges, valid_weight), dim=1)\n\n            test_weight = torch.from_numpy(test_idx.get(\"weight\").reshape(-1, 1))\n            test_edges = torch.cat((test_edges, test_weight), dim=1)\n\n            edge_weight_column = 3\n\n        # Add in the edge type information\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=train_edges,\n            valid_edges=valid_edges,\n            test_edges=test_edges,\n            num_partitions=num_partitions,\n            remap_ids=remap_ids,\n            known_node_ids=[torch.arange(self.num_nodes)],\n            format=\"pytorch\",\n            splits=splits,\n            sequential_train_nodes=sequential_train_nodes,\n            src_column=0,\n            dst_column=1,\n            edge_type_column=edge_type_column,\n            edge_weight_column=edge_weight_column,\n            partitioned_evaluation=partitioned_eval,\n        )\n\n        converter.convert()\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/ogbl_ppa.py",
    "content": "from pathlib import Path\n\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\nimport torch  # isort:skip\n\n\nclass OGBLPpa(LinkPredictionDataset):\n    \"\"\"\n    Open Graph Benchmark: ppa\n\n    The ogbl-ppa dataset is an undirected, unweighted graph.\n    Nodes represent proteins from 58 different species, and edges indicate biologically meaningful\n    associations between proteins, e.g., physical interactions, co-expression, homology or genomic neighborhood.\n    Each node contains a 58-dimensional one-hot feature vector that indicates the species that\n    the corresponding protein comes from.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"ogbl_ppa\"\n        self.dataset_url = \"http://snap.stanford.edu/ogb/data/linkproppred/ppassoc.zip\"\n\n    def download(self, overwrite=False, remap_ids=True):\n        self.input_train_edges_file = self.output_directory / Path(\"train.pt\")\n        self.input_valid_edges_file = self.output_directory / Path(\"valid.pt\")\n        self.input_test_edges_file = self.output_directory / Path(\"test.pt\")\n\n        download = False\n        if not self.input_train_edges_file.exists():\n            download = True\n        if not self.input_valid_edges_file.exists():\n            download = True\n        if not self.input_test_edges_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=False)\n\n            for file in (self.output_directory / Path(\"ppassoc/split/throughput\")).iterdir():\n                file.rename(self.output_directory / Path(file.name))\n\n    def 
preprocess(\n        self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False\n    ):\n        train_idx = torch.load(self.input_train_edges_file).get(\"edge\")\n        valid_idx = torch.load(self.input_valid_edges_file).get(\"edge\")\n        test_idx = torch.load(self.input_test_edges_file).get(\"edge\")\n\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=train_idx,\n            valid_edges=valid_idx,\n            test_edges=test_idx,\n            num_partitions=num_partitions,\n            remap_ids=remap_ids,\n            format=\"numpy\",\n            partitioned_evaluation=partitioned_eval,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        return converter.convert()\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/ogbl_wikikg2.py",
    "content": "from pathlib import Path\n\nimport numpy as np\n\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\nimport torch  # isort:skip\n\n\nclass OGBLWikiKG2(LinkPredictionDataset):\n    \"\"\"\n    Open Graph Benchmark: wikikg2\n\n    The ogbl-wikikg2 dataset is a Knowledge Graph (KG) extracted from the Wikidata knowledge base.\n    It contains a set of triplet edges (head, relation, tail), capturing the different\n    types of relations between entities in the world, e.g., (Canada, citizen, Hinton).\n    We retrieve all the relational statements in Wikidata and filter out rare entities.\n     Our KG contains 2,500,604 entities and 535 relation types.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"ogbl_wikikg2\"\n        self.dataset_url = \"http://snap.stanford.edu/ogb/data/linkproppred/wikikg-v2.zip\"\n\n    def download(self, overwrite=False):\n        self.input_train_edges_file = self.output_directory / Path(\"train.pt\")\n        self.input_valid_edges_file = self.output_directory / Path(\"valid.pt\")\n        self.input_test_edges_file = self.output_directory / Path(\"test.pt\")\n\n        download = False\n        if not self.input_train_edges_file.exists():\n            download = True\n        if not self.input_valid_edges_file.exists():\n            download = True\n        if not self.input_test_edges_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=False)\n\n            for file in (self.output_directory / Path(\"wikikg-v2/split/time\")).iterdir():\n                file.rename(self.output_directory 
/ Path(file.name))\n\n    def preprocess(\n        self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False\n    ):\n        train_idx = torch.load(self.input_train_edges_file)\n        valid_idx = torch.load(self.input_valid_edges_file)\n        test_idx = torch.load(self.input_test_edges_file)\n\n        train_list = np.array([train_idx.get(\"head\"), train_idx.get(\"relation\"), train_idx.get(\"tail\")]).T\n        valid_list = np.array([valid_idx.get(\"head\"), valid_idx.get(\"relation\"), valid_idx.get(\"tail\")]).T\n        test_list = np.array([test_idx.get(\"head\"), test_idx.get(\"relation\"), test_idx.get(\"tail\")]).T\n\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=train_list.astype(\"int32\"),\n            valid_edges=valid_list.astype(\"int32\"),\n            test_edges=test_list.astype(\"int32\"),\n            num_partitions=num_partitions,\n            format=\"numpy\",\n            remap_ids=remap_ids,\n            partitioned_evaluation=partitioned_eval,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        return converter.convert()\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/ogbn_arxiv.py",
    "content": "from pathlib import Path\n\nimport numpy as np\nfrom omegaconf import OmegaConf\n\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import NodeClassificationDataset\nfrom marius.tools.preprocess.datasets.dataset_helpers import remap_nodes\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\n\nclass OGBNArxiv(NodeClassificationDataset):\n    \"\"\"\n    Open Graph Benchmark: arxiv\n\n    The ogbn-arxiv dataset is a directed graph,\n    representing the citation network between all Computer Science (CS) arXiv papers indexed by MAG.\n    Each node is an arXiv paper and each directed edge indicates that one paper cites another one.\n    Each paper comes with a 128-dimensional feature vector obtained by averaging the embeddings of words\n    in its title and abstract.\n    The embeddings of individual words are computed by running the skip-gram model over the MAG corpus.\n    We also provide the mapping from MAG paper IDs into the raw texts of titles and abstracts here.\n    In addition, all papers are also associated with the year that the corresponding paper was published.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"ogbn_arxiv\"\n        self.dataset_url = \"http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip\"\n\n    def download(self, overwrite=False):\n        self.input_edge_list_file = self.output_directory / Path(\"edge.csv\")\n        self.input_node_feature_file = self.output_directory / Path(\"node-feat.csv\")\n        self.input_node_label_file = self.output_directory / Path(\"node-label.csv\")\n        self.input_train_nodes_file = self.output_directory / Path(\"train.csv\")\n        self.input_valid_nodes_file = self.output_directory / Path(\"valid.csv\")\n        
self.input_test_nodes_file = self.output_directory / Path(\"test.csv\")\n\n        download = False\n        if not self.input_edge_list_file.exists():\n            download = True\n        if not self.input_node_feature_file.exists():\n            download = True\n        if not self.input_node_label_file.exists():\n            download = True\n        if not self.input_train_nodes_file.exists():\n            download = True\n        if not self.input_valid_nodes_file.exists():\n            download = True\n        if not self.input_test_nodes_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=False)\n\n            extract_file(self.output_directory / Path(\"arxiv/raw/edge.csv.gz\"))\n            extract_file(self.output_directory / Path(\"arxiv/raw/node-feat.csv.gz\"))\n            extract_file(self.output_directory / Path(\"arxiv/raw/node-label.csv.gz\"))\n\n            (self.output_directory / Path(\"arxiv/raw/edge.csv\")).rename(self.input_edge_list_file)\n            (self.output_directory / Path(\"arxiv/raw/node-feat.csv\")).rename(self.input_node_feature_file)\n            (self.output_directory / Path(\"arxiv/raw/node-label.csv\")).rename(self.input_node_label_file)\n\n            for file in (self.output_directory / Path(\"arxiv/split/time\")).iterdir():\n                extract_file(file)\n\n            for file in (self.output_directory / Path(\"arxiv/split/time\")).iterdir():\n                file.rename(self.output_directory / Path(file.name))\n\n    def preprocess(\n        self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False\n    ):\n        train_nodes = np.genfromtxt(self.input_train_nodes_file, delimiter=\",\").astype(np.int32)\n        valid_nodes = np.genfromtxt(self.input_valid_nodes_file, delimiter=\",\").astype(np.int32)\n        
test_nodes = np.genfromtxt(self.input_test_nodes_file, delimiter=\",\").astype(np.int32)\n\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=self.input_edge_list_file,\n            num_partitions=num_partitions,\n            src_column=0,\n            dst_column=1,\n            remap_ids=remap_ids,\n            sequential_train_nodes=sequential_train_nodes,\n            delim=\",\",\n            known_node_ids=[train_nodes, valid_nodes, test_nodes],\n            partitioned_evaluation=partitioned_eval,\n        )\n        dataset_stats = converter.convert()\n\n        features = np.genfromtxt(self.input_node_feature_file, delimiter=\",\").astype(np.float32)\n        labels = np.genfromtxt(self.input_node_label_file, delimiter=\",\").astype(np.int32)\n\n        if remap_ids:\n            node_mapping = np.genfromtxt(self.output_directory / Path(PathConstants.node_mapping_path), delimiter=\",\")\n            train_nodes, valid_nodes, test_nodes, features, labels = remap_nodes(\n                node_mapping, train_nodes, valid_nodes, test_nodes, features, labels\n            )\n\n        with open(self.train_nodes_file, \"wb\") as f:\n            f.write(bytes(train_nodes))\n        with open(self.valid_nodes_file, \"wb\") as f:\n            f.write(bytes(valid_nodes))\n        with open(self.test_nodes_file, \"wb\") as f:\n            f.write(bytes(test_nodes))\n        with open(self.node_features_file, \"wb\") as f:\n            f.write(bytes(features))\n        with open(self.node_labels_file, \"wb\") as f:\n            f.write(bytes(labels))\n\n        # update dataset yaml\n        dataset_stats.num_train = train_nodes.shape[0]\n        dataset_stats.num_valid = valid_nodes.shape[0]\n        dataset_stats.num_test = test_nodes.shape[0]\n        dataset_stats.node_feature_dim = features.shape[1]\n        dataset_stats.num_classes = 40\n\n        dataset_stats.num_nodes = dataset_stats.num_train + 
dataset_stats.num_valid + dataset_stats.num_test\n\n        with open(self.output_directory / Path(\"dataset.yaml\"), \"w\") as f:\n            yaml_file = OmegaConf.to_yaml(dataset_stats)\n            f.writelines(yaml_file)\n\n        return dataset_stats\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/ogbn_papers100m.py",
    "content": "from pathlib import Path\n\nimport numpy as np\nfrom omegaconf import OmegaConf\n\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import NodeClassificationDataset\nfrom marius.tools.preprocess.datasets.dataset_helpers import remap_nodes\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\nimport torch  # isort:skip\n\n\nclass OGBNPapers100M(NodeClassificationDataset):\n    \"\"\"\n    Open Graph Benchmark: ogbn-papers100m\n\n    Directed citation graph of 111 million papers indexed by MAG.\n    Its graph structure and node features are constructed in the same way as ogbn-arxiv.\n    Among its node set, approximately 1.5 million of them are arXiv papers,\n    each of which is manually labeled with one of arXiv’s subject areas.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"ogbn_papers100M\"\n        self.dataset_url = \"http://snap.stanford.edu/ogb/data/nodeproppred/papers100M-bin.zip\"\n\n    def download(self, overwrite=False):\n        self.input_edge_list_file = self.output_directory / Path(\"data.npz\")  # key: edge_index\n        self.input_node_feature_file = self.output_directory / Path(\"data.npz\")  # key: node_feat\n        self.input_node_label_file = self.output_directory / Path(\"node-label.npz\")\n        self.input_train_nodes_file = self.output_directory / Path(\"train.csv\")\n        self.input_valid_nodes_file = self.output_directory / Path(\"valid.csv\")\n        self.input_test_nodes_file = self.output_directory / Path(\"test.csv\")\n\n        download = False\n        if not self.input_edge_list_file.exists():\n            download = True\n        if not self.input_node_feature_file.exists():\n            download = True\n        if not 
self.input_node_label_file.exists():\n            download = True\n        if not self.input_train_nodes_file.exists():\n            download = True\n        if not self.input_valid_nodes_file.exists():\n            download = True\n        if not self.input_test_nodes_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=False)\n\n            (self.output_directory / Path(\"papers100M-bin/raw/data.npz\")).rename(self.input_node_feature_file)\n            (self.output_directory / Path(\"papers100M-bin/raw/node-label.npz\")).rename(self.input_node_label_file)\n\n            for file in (self.output_directory / Path(\"papers100M-bin/split/time\")).iterdir():\n                extract_file(file)\n\n            for file in (self.output_directory / Path(\"papers100M-bin/split/time\")).iterdir():\n                file.rename(self.output_directory / Path(file.name))\n\n    def preprocess(\n        self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False\n    ):\n        data_dict = np.load(self.input_edge_list_file)\n\n        input_edges = torch.from_numpy(data_dict[\"edge_index\"].astype(np.int32).transpose())\n        train_nodes = np.genfromtxt(self.input_train_nodes_file, delimiter=\",\").astype(np.int32)\n        valid_nodes = np.genfromtxt(self.input_valid_nodes_file, delimiter=\",\").astype(np.int32)\n        test_nodes = np.genfromtxt(self.input_test_nodes_file, delimiter=\",\").astype(np.int32)\n\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=input_edges,\n            num_partitions=num_partitions,\n            remap_ids=remap_ids,\n            sequential_train_nodes=sequential_train_nodes,\n            format=\"pytorch\",\n            known_node_ids=[train_nodes, valid_nodes, 
test_nodes],\n            partitioned_evaluation=partitioned_eval,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        dataset_stats = converter.convert()\n\n        features = data_dict[\"node_feat\"].astype(np.float32)\n        labels = np.load(self.input_node_label_file)[\"node_label\"].astype(np.int32)\n        labels[np.isnan(labels)] = -1\n\n        if remap_ids:\n            node_mapping = np.genfromtxt(self.output_directory / Path(PathConstants.node_mapping_path), delimiter=\",\")\n            train_nodes, valid_nodes, test_nodes, features, labels = remap_nodes(\n                node_mapping, train_nodes, valid_nodes, test_nodes, features, labels\n            )\n\n        with open(self.train_nodes_file, \"wb\") as f:\n            f.write(bytes(train_nodes))\n        with open(self.valid_nodes_file, \"wb\") as f:\n            f.write(bytes(valid_nodes))\n        with open(self.test_nodes_file, \"wb\") as f:\n            f.write(bytes(test_nodes))\n        with open(self.node_features_file, \"wb\") as f:\n            f.write(bytes(features))\n        with open(self.node_labels_file, \"wb\") as f:\n            f.write(bytes(labels))\n\n        # update dataset yaml\n        dataset_stats.num_train = train_nodes.shape[0]\n        dataset_stats.num_valid = valid_nodes.shape[0]\n        dataset_stats.num_test = test_nodes.shape[0]\n        dataset_stats.node_feature_dim = features.shape[1]\n        dataset_stats.num_classes = 172\n\n        dataset_stats.num_nodes = labels.shape[0]\n\n        with open(self.output_directory / Path(\"dataset.yaml\"), \"w\") as f:\n            yaml_file = OmegaConf.to_yaml(dataset_stats)\n            f.writelines(yaml_file)\n\n        return dataset_stats\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/ogbn_products.py",
    "content": "from pathlib import Path\n\nimport numpy as np\nfrom omegaconf import OmegaConf\n\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import NodeClassificationDataset\nfrom marius.tools.preprocess.datasets.dataset_helpers import remap_nodes\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\n\nclass OGBNProducts(NodeClassificationDataset):\n    \"\"\"\n    Open Graph Benchmark: products\n\n    The ogbn-products dataset is an undirected and unweighted graph,\n    representing an Amazon product co-purchasing network.\n    Nodes represent products sold in Amazon,\n    and edges between two products indicate that the products are purchased together.\n    We follow to process node features and target categories.\n    Specifically, node features are generated by extracting bag-of-words features from the product\n    descriptions followed by a Principal Component Analysis to reduce the dimension to 100.\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"ogbn_products\"\n        self.dataset_url = \"http://snap.stanford.edu/ogb/data/nodeproppred/products.zip\"\n\n    def download(self, overwrite=False):\n        self.input_edge_list_file = self.output_directory / Path(\"edge.csv\")\n        self.input_node_feature_file = self.output_directory / Path(\"node-feat.csv\")\n        self.input_node_label_file = self.output_directory / Path(\"node-label.csv\")\n        self.input_train_nodes_file = self.output_directory / Path(\"train.csv\")\n        self.input_valid_nodes_file = self.output_directory / Path(\"valid.csv\")\n        self.input_test_nodes_file = self.output_directory / Path(\"test.csv\")\n\n        download = False\n        if not self.input_edge_list_file.exists():\n            download 
= True\n        if not self.input_node_feature_file.exists():\n            download = True\n        if not self.input_node_label_file.exists():\n            download = True\n        if not self.input_train_nodes_file.exists():\n            download = True\n        if not self.input_valid_nodes_file.exists():\n            download = True\n        if not self.input_test_nodes_file.exists():\n            download = True\n\n        if download:\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=False)\n\n            extract_file(self.output_directory / Path(\"products/raw/edge.csv.gz\"))\n            extract_file(self.output_directory / Path(\"products/raw/node-feat.csv.gz\"))\n            extract_file(self.output_directory / Path(\"products/raw/node-label.csv.gz\"))\n\n            (self.output_directory / Path(\"products/raw/edge.csv\")).rename(self.input_edge_list_file)\n            (self.output_directory / Path(\"products/raw/node-feat.csv\")).rename(self.input_node_feature_file)\n            (self.output_directory / Path(\"products/raw/node-label.csv\")).rename(self.input_node_label_file)\n\n            for file in (self.output_directory / Path(\"products/split/sales_ranking\")).iterdir():\n                extract_file(file)\n\n            for file in (self.output_directory / Path(\"products/split/sales_ranking\")).iterdir():\n                file.rename(self.output_directory / Path(file.name))\n\n    def preprocess(\n        self, num_partitions=1, remap_ids=True, splits=None, sequential_train_nodes=False, partitioned_eval=False\n    ):\n        train_nodes = np.genfromtxt(self.input_train_nodes_file, delimiter=\",\").astype(np.int32)\n        valid_nodes = np.genfromtxt(self.input_valid_nodes_file, delimiter=\",\").astype(np.int32)\n        test_nodes = np.genfromtxt(self.input_test_nodes_file, delimiter=\",\").astype(np.int32)\n\n        converter = 
TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=self.input_edge_list_file,\n            num_partitions=num_partitions,\n            columns=[0, 1],\n            src_column=0,\n            dst_column=1,\n            remap_ids=remap_ids,\n            sequential_train_nodes=sequential_train_nodes,\n            delim=\",\",\n            known_node_ids=[train_nodes, valid_nodes, test_nodes],\n            partitioned_evaluation=partitioned_eval,\n        )\n\n        dataset_stats = converter.convert()\n\n        features = np.genfromtxt(self.input_node_feature_file, delimiter=\",\").astype(np.float32)\n        labels = np.genfromtxt(self.input_node_label_file, delimiter=\",\").astype(np.int32)\n\n        if remap_ids:\n            node_mapping = np.genfromtxt(self.output_directory / Path(PathConstants.node_mapping_path), delimiter=\",\")\n            train_nodes, valid_nodes, test_nodes, features, labels = remap_nodes(\n                node_mapping, train_nodes, valid_nodes, test_nodes, features, labels\n            )\n\n        with open(self.train_nodes_file, \"wb\") as f:\n            f.write(bytes(train_nodes))\n        with open(self.valid_nodes_file, \"wb\") as f:\n            f.write(bytes(valid_nodes))\n        with open(self.test_nodes_file, \"wb\") as f:\n            f.write(bytes(test_nodes))\n        with open(self.node_features_file, \"wb\") as f:\n            f.write(bytes(features))\n        with open(self.node_labels_file, \"wb\") as f:\n            f.write(bytes(labels))\n\n        # update dataset yaml\n        dataset_stats.num_train = train_nodes.shape[0]\n        dataset_stats.num_valid = valid_nodes.shape[0]\n        dataset_stats.num_test = test_nodes.shape[0]\n        dataset_stats.node_feature_dim = features.shape[1]\n        dataset_stats.num_classes = 47\n\n        dataset_stats.num_nodes = dataset_stats.num_train + dataset_stats.num_valid + dataset_stats.num_test\n\n        with 
open(self.output_directory / Path(\"dataset.yaml\"), \"w\") as f:\n            yaml_file = OmegaConf.to_yaml(dataset_stats)\n            f.writelines(yaml_file)\n\n        return dataset_stats\n"
  },
  {
    "path": "src/python/tools/preprocess/datasets/twitter.py",
    "content": "from pathlib import Path\n\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\nfrom marius.tools.preprocess.dataset import LinkPredictionDataset\nfrom marius.tools.preprocess.utils import download_url, extract_file\n\n\nclass Twitter(LinkPredictionDataset):\n    \"\"\"\n    Twitter\n\n    467 million Twitter posts from 20 million users covering a 7 month period from\n    June 1 2009 to December 31 2009. Estimated 20-30% of all public tweets published\n    on Twitter during the particular time frame. For each public tweet the following\n    information is available: Author, Time, Content\n    \"\"\"\n\n    def __init__(self, output_directory: Path, spark=False):\n        super().__init__(output_directory, spark)\n\n        self.dataset_name = \"twitter\"\n        self.dataset_url = \"https://snap.stanford.edu/data/twitter-2010.txt.gz\"\n\n    def download(self, overwrite=False):\n        self.input_edges = self.output_directory / Path(\"twitter-2010.txt\")\n\n        if not self.input_edges.exists():\n            archive_path = download_url(self.dataset_url, self.output_directory, overwrite)\n            extract_file(archive_path, remove_input=True)\n\n    def preprocess(\n        self,\n        num_partitions=1,\n        remap_ids=True,\n        splits=[0.9, 0.05, 0.05],\n        sequential_train_nodes=False,\n        partitioned_eval=False,\n    ):\n        converter = TorchEdgeListConverter(\n            output_dir=self.output_directory,\n            train_edges=self.input_edges,\n            delim=\" \",\n            src_column=0,\n            dst_column=1,\n            num_partitions=num_partitions,\n            splits=splits,\n            remap_ids=remap_ids,\n            partitioned_evaluation=partitioned_eval,\n        )\n\n        return converter.convert()\n"
  },
  {
    "path": "src/python/tools/preprocess/utils.py",
    "content": "import gzip\nimport os\nimport shutil\nimport tarfile\nimport zipfile\nfrom pathlib import Path\nfrom urllib.parse import urlparse\nfrom urllib.request import urlretrieve\nfrom zipfile import ZipFile\n\n\ndef get_df_count(df, col):\n    if df is None:\n        return None\n    return df.agg({col: \"max\"}).collect()[0][0] + 1\n\n\ndef download_url(url, output_dir, overwrite):\n    output_dir = Path(output_dir)\n\n    url_components = urlparse(url)\n    filename = Path(url_components.path + url_components.query).name\n    filepath = output_dir / filename\n\n    if filepath.is_file() and not overwrite:\n        print(f\"File already exists: {filepath}\")\n    else:\n        try:\n            print(f\"Downloading {filename} to {filepath}\")\n            urlretrieve(url, str(filepath))\n        except OSError:\n            raise RuntimeError(f\"Failed to download {filename}\")\n\n    return filepath\n\n\ndef extract_file(filepath, remove_input=True):\n    try:\n        if tarfile.is_tarfile(str(filepath)):\n            if str(filepath).endswith(\".gzip\") or str(filepath).endswith(\".gz\"):\n                with tarfile.open(filepath, \"r:gz\") as tar:\n                    tar.extractall(path=filepath.parent)\n            elif str(filepath).endswith(\".tar.gz\") or str(filepath).endswith(\".tgz\"):\n                with tarfile.open(filepath, \"r:gz\") as tar:\n                    tar.extractall(path=filepath.parent)\n            elif str(filepath).endswith(\".tar\"):\n                with tarfile.open(filepath, \"r:\") as tar:\n                    tar.extractall(path=filepath.parent)\n            elif str(filepath).endswith(\".bz2\"):\n                with tarfile.open(filepath, \"r:bz2\") as tar:\n                    tar.extractall(path=filepath.parent)\n            else:\n                try:\n                    with tarfile.open(filepath, \"r:gz\") as tar:\n                        tar.extractall(path=filepath.parent)\n                except 
tarfile.TarError:\n                    raise RuntimeError(\n                        \"Unrecognized file format, may need to perform extraction manually with a custom dataset.\"\n                    )\n        elif zipfile.is_zipfile(str(filepath)):\n            with ZipFile(filepath, \"r\") as zip:\n                zip.extractall(filepath.parent)\n        else:\n            try:\n                with filepath.with_suffix(\"\").open(\"wb\") as output_f, gzip.GzipFile(filepath) as gzip_f:\n                    shutil.copyfileobj(gzip_f, output_f)\n            except gzip.BadGzipFile:\n                raise RuntimeError(\"Undefined file format.\")\n    except EOFError:\n        raise RuntimeError(\"Dataset file isn't complete. Try downloading again.\")\n\n    if filepath.exists() and remove_input:\n        filepath.unlink()\n\n    return filepath.parent\n\n\ndef strip_header(filepath, num_lines):\n    cmd = \"tail -n +{} {} > tmp.txt\".format(num_lines + 1, filepath)\n    os.system(cmd)\n\n    cmd = \"mv tmp.txt {}\".format(filepath)\n    os.system(cmd)\n"
  },
  {
    "path": "test/CMakeLists.txt",
    "content": "file(COPY ${PROJECT_SOURCE_DIR}/test/test_data DESTINATION ${CMAKE_CURRENT_BINARY_DIR})\nfile(COPY ${PROJECT_SOURCE_DIR}/test/test_configs DESTINATION ${CMAKE_CURRENT_BINARY_DIR})\n\nset(MARIUS_TEST_HOME ${CMAKE_CURRENT_BINARY_DIR})\n\nadd_subdirectory(cpp)\n"
  },
  {
    "path": "test/README.md",
    "content": "# Testing Marius #\n\nMarius uses GTest for testing C++ code, and uses tox and pytest for testing python code.\n\nCurrently only a simple set of end to end tests are written for C++ and Python. \n\n### C++ Tests ###\n\nTests must be built before they can be run.\n\nBuilding the tests (working from `<INSTALL_DIR>/marius`):  \n```\ncd build\nmake end_to_end -j\n```\n\nRunning the tests: \n```\ncd build/test/cpp/end_to_end\n./end_to_end\n```\n\n### Python Tests ###\n\nRunning the tests (working from `<INSTALL_DIR>/marius`):\n```\ntox\n```\n"
  },
  {
    "path": "test/__init__.py",
    "content": ""
  },
  {
    "path": "test/cpp/CMakeLists.txt",
    "content": "find_package(Threads REQUIRED)\n\ninclude_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})\n\n## Add links to test files below\nadd_subdirectory(end_to_end)\nadd_subdirectory(integration)\nadd_subdirectory(performance)\nadd_subdirectory(unit)\n"
  },
  {
    "path": "test/cpp/end_to_end/CMakeLists.txt",
    "content": "file(GLOB SRCS *.cpp)\n\nADD_EXECUTABLE(end_to_end ${SRCS})\n\nTARGET_LINK_LIBRARIES(end_to_end\n        ${PROJECT_NAME}\n        gtest gtest_main\n        )\n\nadd_test(NAME end_to_end COMMAND end_to_end WORKING_DIRECTORY ${project_WORKING_DIR})"
  },
  {
    "path": "test/cpp/end_to_end/main.cpp",
    "content": "//\n// Created by Jason Mohoney on 10/18/20.\n//\n\n#include \"gtest/gtest.h\"\n\nint main(int argc, char **argv) {\n    ::testing::InitGoogleTest(&argc, argv);\n    int ret = RUN_ALL_TESTS();\n    return ret;\n}"
  },
  {
    "path": "test/cpp/end_to_end/test_main.cpp",
    "content": "//\n// Created by Jason Mohoney on 3/28/21.\n//\n\n#include <gtest/gtest.h>\n#include <marius.h>\n\n#include <string>\n\n/**\n * Runs marius training on a default test configuration\n */\nTEST(TestMain, TestLinkPred) {\n    std::string conf_str = std::string(MARIUS_TEST_DIRECTORY) + \"/test_configs/fb15k_237_e2e.yaml\";\n    const char* conf = conf_str.c_str();\n    int num_args = 2;\n    const char* n_argv[] = {\"marius_train\", conf};\n    marius(num_args, (char**)(n_argv));\n}\n"
  },
  {
    "path": "test/cpp/integration/CMakeLists.txt",
    "content": "file(GLOB SRCS *.cpp)\n\nADD_EXECUTABLE(integration ${SRCS})\n\nTARGET_LINK_LIBRARIES(integration\n        ${PROJECT_NAME}\n        gtest gtest_main\n        )\n\nadd_test(NAME integration COMMAND integration WORKING_DIRECTORY ${project_WORKING_DIR})"
  },
  {
    "path": "test/cpp/integration/main.cpp",
    "content": "//\n// Created by Jason Mohoney on 10/18/20.\n//\n\n#include \"gtest/gtest.h\"\n\nint main(int argc, char **argv) {\n    ::testing::InitGoogleTest(&argc, argv);\n    int ret = RUN_ALL_TESTS();\n    return ret;\n}"
  },
  {
    "path": "test/cpp/performance/CMakeLists.txt",
    "content": "file(GLOB SRCS *.cpp)\n\nADD_EXECUTABLE(performance ${SRCS})\n\nTARGET_LINK_LIBRARIES(performance\n        ${PROJECT_NAME}\n        gtest gtest_main\n        )\n\nadd_test(NAME performance COMMAND performance WORKING_DIRECTORY ${project_WORKING_DIR})"
  },
  {
    "path": "test/cpp/performance/main.cpp",
    "content": "//\n// Created by Jason Mohoney on 10/18/20.\n//\n\n#include \"gtest/gtest.h\"\n\nint main(int argc, char **argv) {\n    ::testing::InitGoogleTest(&argc, argv);\n    int ret = RUN_ALL_TESTS();\n    return ret;\n}"
  },
  {
    "path": "test/cpp/unit/CMakeLists.txt",
    "content": "file(GLOB TEST_HEADERS *.h)\nfile(GLOB SRCS *.cpp)\n\nadd_library(${TEST_LIB}\n        SHARED\n        ${TEST_HEADERS}\n        ${SRCS})\ntarget_link_libraries(${TEST_LIB} ${PROJECT_NAME})\ntarget_link_libraries(${TEST_LIB} gtest)\ntarget_link_libraries(${TEST_LIB} gtest_main)\n\nADD_EXECUTABLE(unit ${SRCS})\n\nTARGET_LINK_LIBRARIES(unit ${TEST_LIB} ${PROJECT_NAME} gtest gtest_main)\n\nadd_test(NAME unit COMMAND unit WORKING_DIRECTORY ${project_WORKING_DIR})"
  },
  {
    "path": "test/cpp/unit/main.cpp",
    "content": "//\n// Created by Jason Mohoney on 10/18/20.\n//\n\n#include \"gtest/gtest.h\"\n\nint main(int argc, char **argv) {\n    ::testing::InitGoogleTest(&argc, argv);\n    int ret = RUN_ALL_TESTS();\n    return ret;\n}"
  },
  {
    "path": "test/cpp/unit/nn/test_activation.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/4/22.\n//\n\n#include <gtest/gtest.h>\n#include <nn/activation.h>\n\ntorch::Tensor tensor1 = torch::tensor({-1.0, -5.2, 1.2, 3.5, 5.5, 7.0}, torch::kFloat32);\ntorch::Tensor tensor2 = torch::tensor({1.0, 5.2, 1.2, 3.5, 5.5, 7.0}, torch::kFloat32);\ntorch::Tensor tensor3 = torch::tensor({1.0}, torch::kFloat32);\ntorch::Tensor tensor4 = torch::tensor({-1.0}, torch::kFloat32);\ntorch::Tensor tensor5 = torch::tensor({{-1.0, -5.2, 1.2, 3.5, 5.5, 7.0}, {1.0, 5.2, 1.2, 3.5, 5.5, 7.0}}, torch::kFloat32);\ntorch::Tensor empty_tensor;\n\nTEST(TestActivation, TestRelu) {\n    ActivationFunction activation = ActivationFunction::RELU;\n\n    torch::Tensor expected_tensor1 = torch::tensor({0.0, 0.0, 1.2, 3.5, 5.5, 7.0}, torch::kFloat32);\n    torch::Tensor expected_tensor2 = torch::tensor({1.0, 5.2, 1.2, 3.5, 5.5, 7.0}, torch::kFloat32);\n    torch::Tensor expected_tensor3 = torch::tensor({1.0}, torch::kFloat32);\n    torch::Tensor expected_tensor4 = torch::tensor({0.0}, torch::kFloat32);\n    torch::Tensor expected_tensor5 = torch::tensor({{0.0, 0.0, 1.2, 3.5, 5.5, 7.0}, {1.0, 5.2, 1.2, 3.5, 5.5, 7.0}}, torch::kFloat32);\n\n    ASSERT_TRUE(apply_activation(activation, tensor1).eq(expected_tensor1).all().item<bool>());\n    ASSERT_TRUE(apply_activation(activation, tensor2).eq(expected_tensor2).all().item<bool>());\n    ASSERT_TRUE(apply_activation(activation, tensor3).eq(expected_tensor3).all().item<bool>());\n    ASSERT_TRUE(apply_activation(activation, tensor4).eq(expected_tensor4).all().item<bool>());\n    ASSERT_TRUE(apply_activation(activation, tensor5).eq(expected_tensor5).all().item<bool>());\n\n    ASSERT_THROW(apply_activation(activation, empty_tensor), UndefinedTensorException);\n}\n\nTEST(TestActivation, TestSigmoid) {\n    ActivationFunction activation = ActivationFunction::SIGMOID;\n\n    torch::Tensor expected_tensor1 = torch::sigmoid(tensor1);\n    torch::Tensor expected_tensor2 = torch::sigmoid(tensor2);\n    
torch::Tensor expected_tensor3 = torch::sigmoid(tensor3);\n    torch::Tensor expected_tensor4 = torch::sigmoid(tensor4);\n    torch::Tensor expected_tensor5 = torch::sigmoid(tensor5);\n\n    ASSERT_TRUE(apply_activation(activation, tensor1).eq(expected_tensor1).all().item<bool>());\n    ASSERT_TRUE(apply_activation(activation, tensor2).eq(expected_tensor2).all().item<bool>());\n    ASSERT_TRUE(apply_activation(activation, tensor3).eq(expected_tensor3).all().item<bool>());\n    ASSERT_TRUE(apply_activation(activation, tensor4).eq(expected_tensor4).all().item<bool>());\n    ASSERT_TRUE(apply_activation(activation, tensor5).eq(expected_tensor5).all().item<bool>());\n\n    ASSERT_THROW(apply_activation(activation, empty_tensor), UndefinedTensorException);\n}\n\nTEST(TestActivation, TestNone) {\n    ActivationFunction activation = ActivationFunction::NONE;\n\n    ASSERT_TRUE(apply_activation(activation, tensor1).eq(tensor1).all().item<bool>());\n    ASSERT_TRUE(apply_activation(activation, tensor2).eq(tensor2).all().item<bool>());\n    ASSERT_TRUE(apply_activation(activation, tensor3).eq(tensor3).all().item<bool>());\n    ASSERT_TRUE(apply_activation(activation, tensor4).eq(tensor4).all().item<bool>());\n    ASSERT_TRUE(apply_activation(activation, tensor5).eq(tensor5).all().item<bool>());\n\n    ASSERT_THROW(apply_activation(activation, empty_tensor), UndefinedTensorException);\n}\n"
  },
  {
    "path": "test/cpp/unit/nn/test_initialization.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/4/22.\n//\n\n#include <gtest/gtest.h>\n#include <nn/initialization.h>\n\nauto f16_options = torch::TensorOptions().dtype(torch::kFloat16);\nauto f32_options = torch::TensorOptions().dtype(torch::kFloat32);\nauto f64_options = torch::TensorOptions().dtype(torch::kFloat64);\n\nstd::vector<int64_t> shape1 = {5};\nstd::vector<int64_t> shape2 = {5, 3};\nstd::vector<int64_t> shape3 = {5, 3, 2};\n\nTEST(TestInitialization, TestUniform) {\n    float scale_factor1 = 1.0;\n    float scale_factor2 = 2.0;\n    float scale_factor3 = .25;\n\n    // test scale factor\n    torch::Tensor tensor = uniform_init(scale_factor1, shape2, f32_options);\n    ASSERT_TRUE((tensor.ge(-scale_factor1) & tensor.le(scale_factor1)).all().item<bool>());\n\n    tensor = uniform_init(scale_factor2, shape2, f32_options);\n    ASSERT_TRUE((tensor.ge(-scale_factor2) & tensor.le(scale_factor2)).all().item<bool>());\n\n    tensor = uniform_init(scale_factor3, shape2, f32_options);\n    ASSERT_TRUE((tensor.ge(-scale_factor3) & tensor.le(scale_factor3)).all().item<bool>());\n\n    // test shape\n    tensor = uniform_init(scale_factor1, shape1, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape1);\n\n    tensor = uniform_init(scale_factor1, shape2, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape2);\n\n    tensor = uniform_init(scale_factor1, shape3, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape3);\n\n    // test tensor options\n    tensor = uniform_init(scale_factor1, shape2, f16_options);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat16);\n\n    tensor = uniform_init(scale_factor1, shape2, f32_options);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n\n    tensor = uniform_init(scale_factor1, shape2, f64_options);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat64);\n}\n\nTEST(TestInitialization, TestNormal) {\n    std::vector<int64_t> shape_large = {500, 500};  // large shape used to get better estimate of mean and std for 
normal distribution\n\n    float mean1 = 0.0;\n    float mean2 = -.5;\n    float mean3 = 2.0;\n\n    float std1 = 1.0;\n    float std2 = 2.5;\n    float std3 = 5.0;\n\n    // test mean/std\n    torch::Tensor tensor = normal_init(mean1, std1, shape_large, f32_options);\n    ASSERT_NEAR(tensor.mean().item<float>(), mean1, .1);\n    ASSERT_NEAR(tensor.std().item<float>(), std1, .1);\n\n    tensor = normal_init(mean2, std2, shape_large, f32_options);\n    ASSERT_NEAR(tensor.mean().item<float>(), mean2, .1);\n    ASSERT_NEAR(tensor.std().item<float>(), std2, .1);\n\n    tensor = normal_init(mean3, std3, shape_large, f32_options);\n    ASSERT_NEAR(tensor.mean().item<float>(), mean3, .1);\n    ASSERT_NEAR(tensor.std().item<float>(), std3, .1);\n\n    // test shape\n    tensor = normal_init(mean1, std1, shape1, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape1);\n\n    tensor = normal_init(mean1, std1, shape2, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape2);\n\n    tensor = normal_init(mean1, std1, shape3, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape3);\n\n    // test tensor options\n    tensor = normal_init(mean1, std1, shape1, f16_options);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat16);\n\n    tensor = normal_init(mean1, std1, shape1, f32_options);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n\n    tensor = normal_init(mean1, std1, shape1, f64_options);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat64);\n}\n\nTEST(TestInitialization, TestConstant) {\n    float val1 = 0.0;\n    float val2 = -.5;\n    float val3 = 2.0;\n\n    torch::Tensor tensor = constant_init(val1, shape2, f32_options);\n    ASSERT_TRUE(tensor.eq(val1).all().item<bool>());\n\n    tensor = constant_init(val2, shape2, f32_options);\n    ASSERT_TRUE(tensor.eq(val2).all().item<bool>());\n\n    tensor = constant_init(val3, shape2, f32_options);\n    ASSERT_TRUE(tensor.eq(val3).all().item<bool>());\n\n    // test shape\n    tensor = constant_init(val1, shape1, 
f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape1);\n\n    tensor = constant_init(val1, shape2, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape2);\n\n    tensor = constant_init(val1, shape3, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape3);\n\n    // test tensor options\n    tensor = constant_init(val1, shape2, f16_options);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat16);\n\n    tensor = constant_init(val1, shape2, f32_options);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n\n    tensor = constant_init(val1, shape2, f64_options);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat64);\n}\n\nTEST(TestInitialization, TestComputeFans) {\n    std::tuple<int64_t, int64_t> output;\n\n    // dims = 0\n    std::vector<int64_t> shape = {};\n    output = compute_fans(shape);\n\n    ASSERT_EQ(std::get<0>(output), 1);\n    ASSERT_EQ(std::get<1>(output), 1);\n\n    // dims = 1\n    shape = {1};\n    output = compute_fans(shape);\n    ASSERT_EQ(std::get<0>(output), shape[0]);\n    ASSERT_EQ(std::get<1>(output), shape[0]);\n\n    shape = {10};\n    output = compute_fans(shape);\n    ASSERT_EQ(std::get<0>(output), shape[0]);\n    ASSERT_EQ(std::get<1>(output), shape[0]);\n\n    // dims = 2\n    shape = {1, 1};\n    output = compute_fans(shape);\n    ASSERT_EQ(std::get<0>(output), shape[0]);\n    ASSERT_EQ(std::get<1>(output), shape[1]);\n\n    shape = {10, 5};\n    output = compute_fans(shape);\n    ASSERT_EQ(std::get<0>(output), shape[0]);\n    ASSERT_EQ(std::get<1>(output), shape[1]);\n\n    // dims > 2\n    shape = {1, 1, 1};\n    output = compute_fans(shape);\n    ASSERT_EQ(std::get<0>(output), shape[1]);\n    ASSERT_EQ(std::get<1>(output), shape[2]);\n\n    shape = {10, 5, 3};\n    output = compute_fans(shape);\n    ASSERT_EQ(std::get<0>(output), shape[1]);\n    ASSERT_EQ(std::get<1>(output), shape[2]);\n\n    shape = {15, 3, 9};\n    output = compute_fans(shape);\n    ASSERT_EQ(std::get<0>(output), shape[1]);\n    ASSERT_EQ(std::get<1>(output), 
shape[2]);\n\n    shape = {2, 4, 6, 8, 10};\n    output = compute_fans(shape);\n    ASSERT_EQ(std::get<0>(output), shape[3]);\n    ASSERT_EQ(std::get<1>(output), shape[4]);\n}\n\nTEST(TestInitialization, TestGlorotUniform) {\n    std::tuple<int64_t, int64_t> compute_fans = {-1, -1};\n    std::tuple<int64_t, int64_t> given_fans = {1, 1};\n\n    // dims = 0\n    std::vector<int64_t> shape = {};\n    torch::Tensor tensor = glorot_uniform(shape, compute_fans, f32_options);\n    float limit = sqrt(6.0 / (1 + 1));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    tensor = glorot_uniform(shape, given_fans, f32_options);\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    given_fans = {2, 2};\n    tensor = glorot_uniform(shape, given_fans, f32_options);\n    limit = sqrt(6.0 / (2 + 2));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    // dims = 1\n    shape = {10};\n    tensor = glorot_uniform(shape, compute_fans, f32_options);\n    limit = sqrt(6.0 / (10 + 10));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    given_fans = {10, 10};\n    tensor = glorot_uniform(shape, given_fans, f32_options);\n    limit = sqrt(6.0 / (10 + 10));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    given_fans = {20, 20};\n    tensor = glorot_uniform(shape, given_fans, f32_options);\n    limit = sqrt(6.0 / (20 + 20));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    // dims = 2\n    shape = {10, 20};\n    tensor = glorot_uniform(shape, compute_fans, f32_options);\n    limit = sqrt(6.0 / (10 + 20));\n    ASSERT_TRUE((tensor.ge(-limit) & 
tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    given_fans = {10, 20};\n    tensor = glorot_uniform(shape, given_fans, f32_options);\n    limit = sqrt(6.0 / (10 + 20));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    given_fans = {20, 20};\n    tensor = glorot_uniform(shape, given_fans, f32_options);\n    limit = sqrt(6.0 / (20 + 20));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    // dims > 2\n    shape = {10, 20, 10};\n    tensor = glorot_uniform(shape, compute_fans, f32_options);\n    limit = sqrt(6.0 / (20 + 10));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    given_fans = {20, 10};\n    tensor = glorot_uniform(shape, given_fans, f32_options);\n    limit = sqrt(6.0 / (20 + 10));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    given_fans = {10, 10};\n    tensor = glorot_uniform(shape, given_fans, f32_options);\n    limit = sqrt(6.0 / (10 + 10));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    shape = {10, 20, 10, 5, 3};\n    tensor = glorot_uniform(shape, compute_fans, f32_options);\n    limit = sqrt(6.0 / (5 + 3));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    given_fans = {5, 3};\n    tensor = glorot_uniform(shape, given_fans, f32_options);\n    limit = sqrt(6.0 / (5 + 3));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    given_fans = {100, 50};\n    tensor = glorot_uniform(shape, given_fans, f32_options);\n    limit = sqrt(6.0 / (100 + 50));\n    
ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n    ASSERT_TRUE(tensor.sizes() == shape);\n}\n\nTEST(TestInitialization, TestGlorotNormal) {\n    std::tuple<int64_t, int64_t> compute_fans = {-1, -1};\n    std::tuple<int64_t, int64_t> given_fans = {1, 1};\n\n    // only checking shape since it's non-trivial to check if a tensor with few elements comes from the normal distribution\n    // dims = 0\n    std::vector<int64_t> shape = {};\n    torch::Tensor tensor = glorot_normal(shape, compute_fans, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    // dims = 1\n    shape = {10};\n    tensor = glorot_normal(shape, compute_fans, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    // dims = 2\n    shape = {10, 20};\n    tensor = glorot_normal(shape, compute_fans, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    // dims > 2\n    shape = {10, 20, 10};\n    tensor = glorot_uniform(shape, compute_fans, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape);\n\n    shape = {10, 20, 10, 5, 3};\n    tensor = glorot_uniform(shape, compute_fans, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape);\n}\n\nTEST(TestInitialization, TestTensorInit) {\n    torch::Tensor tensor;\n    shared_ptr<InitConfig> init_config = std::make_shared<InitConfig>();\n\n    init_config->type = InitDistribution::GLOROT_NORMAL;\n    tensor = initialize_tensor(init_config, shape2, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape2);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n\n    init_config->type = InitDistribution::GLOROT_UNIFORM;\n    tensor = initialize_tensor(init_config, shape2, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape2);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n    float limit = sqrt(6.0 / (shape2[0] + shape2[1]));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n\n    std::tuple<int64_t, int64_t> fans = {10, 25};\n    tensor = initialize_tensor(init_config, 
shape2, f32_options, fans);\n    ASSERT_TRUE(tensor.sizes() == shape2);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n    limit = sqrt(6.0 / (10 + 25));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n\n    init_config->type = InitDistribution::UNIFORM;\n    limit = .25;\n    auto options = std::make_shared<UniformInitOptions>();\n    options->scale_factor = limit;\n    init_config->options = options;\n    tensor = initialize_tensor(init_config, shape2, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape2);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n\n    init_config->type = InitDistribution::NORMAL;\n    auto normal_options = std::make_shared<NormalInitOptions>();\n    init_config->options = normal_options;\n    tensor = initialize_tensor(init_config, shape2, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape2);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n\n    init_config->type = InitDistribution::ZEROS;\n    tensor = initialize_tensor(init_config, shape2, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape2);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n    ASSERT_TRUE(tensor.eq(0).all().item<bool>());\n\n    init_config->type = InitDistribution::ONES;\n    tensor = initialize_tensor(init_config, shape2, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape2);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n    ASSERT_TRUE(tensor.eq(1).all().item<bool>());\n\n    init_config->type = InitDistribution::CONSTANT;\n    auto const_options = std::make_shared<ConstantInitOptions>();\n    const_options->constant = .35;\n    init_config->options = const_options;\n    tensor = initialize_tensor(init_config, shape2, f32_options);\n    ASSERT_TRUE(tensor.sizes() == shape2);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n    ASSERT_TRUE(tensor.eq(.35).all().item<bool>());\n}\n\nTEST(TestInitialization, 
TestSubtensorInit) {\n    torch::Tensor tensor;\n    shared_ptr<InitConfig> init_config = std::make_shared<InitConfig>();\n\n    std::vector<int64_t> sub_shape = {2, 3};\n    std::vector<int64_t> full_shape = {4, 3};\n\n    init_config->type = InitDistribution::GLOROT_NORMAL;\n    tensor = initialize_subtensor(init_config, sub_shape, full_shape, f32_options);\n    ASSERT_TRUE(tensor.sizes() == sub_shape);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n\n    init_config->type = InitDistribution::GLOROT_UNIFORM;\n    tensor = initialize_subtensor(init_config, sub_shape, full_shape, f32_options);\n    ASSERT_TRUE(tensor.sizes() == sub_shape);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n    float limit = sqrt(6.0 / (sub_shape[0] + sub_shape[1]));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n\n    std::tuple<int64_t, int64_t> fans = {10, 25};\n    tensor = initialize_subtensor(init_config, sub_shape, full_shape, f32_options, fans);\n    ASSERT_TRUE(tensor.sizes() == sub_shape);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n    limit = sqrt(6.0 / (10 + 25));\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n\n    init_config->type = InitDistribution::UNIFORM;\n    limit = .25;\n    auto options = std::make_shared<UniformInitOptions>();\n    options->scale_factor = limit;\n    init_config->options = options;\n    tensor = initialize_subtensor(init_config, sub_shape, full_shape, f32_options);\n    ASSERT_TRUE(tensor.sizes() == sub_shape);\n    ASSERT_TRUE(tensor.dtype() == torch::kFloat32);\n    ASSERT_TRUE((tensor.ge(-limit) & tensor.le(limit)).all().item<bool>());\n}"
  },
  {
    "path": "test/cpp/unit/nn/test_loss.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/4/22.\n//\n\n#include <gtest/gtest.h>\n#include <nn/loss.h>\n\ntorch::Tensor test_pos1 = torch::tensor({500.0}, torch::kFloat32);\ntorch::Tensor test_pos2 = torch::tensor({.1}, torch::kFloat32);\ntorch::Tensor test_pos3 = torch::tensor({-500.0}, torch::kFloat32);\ntorch::Tensor test_pos4 = torch::tensor({.5, 2.5, 5.0, 7.5, 100.0, 250.0}, torch::kFloat32);\n\ntorch::Tensor test_neg1 = torch::tensor({{150.0, 100.0, 50.0, 25.0, 10.0}}, torch::kFloat32);\ntorch::Tensor test_neg2 = torch::tensor({{.001, -.001, -.005, -.1, -10.0}}, torch::kFloat32);\ntorch::Tensor test_neg3 = torch::tensor({{-150.0, -100.0, -50.0, -25.0, 10.0}}, torch::kFloat32);\ntorch::Tensor test_neg4 = torch::tensor({{.5, 10.0}, {2.5, -1.0}, {5.0, 1.0}, {7.5, -5.0}, {100.0, 20.0}, {250.0, 10.0}}, torch::kFloat32);\n\ntorch::Tensor test_y_pred1 = torch::tensor({{.75, .25}, {.5, .5}, {3.0, .25}}, torch::kFloat32);\ntorch::Tensor test_y_label1 = torch::tensor({0, 1, 0}, torch::kInt64);\n// torch::Tensor test_y_label1 = torch::tensor({{.75, .25}, {.5, .5}, {.9, .1}}, torch::kFloat32);\n\ntorch::Tensor test_y_pred2 = torch::tensor({{.75, .25, .1}, {.5, .5, .9}, {3.0, .25, 5.0}}, torch::kFloat32);\ntorch::Tensor test_y_label2 = torch::tensor({0, 2, 2}, torch::kInt64);\n// torch::Tensor test_y_label2 = torch::tensor({{.75, .20, .05}, {.2, .2, .6}, {.35, .05, .6}}, torch::kFloat32);\n\ntorch::Tensor invalid_tensor = torch::tensor({{{0.0}}}, torch::kFloat32);\n\ntorch::Tensor undef_tensor;\n\nTEST(TestLoss, TestShapeMismatch) {\n    // check undefined pos\n    EXPECT_THROW(check_score_shapes(undef_tensor, test_neg4), UndefinedTensorException);\n\n    // check undefined neg\n    EXPECT_THROW(check_score_shapes(test_pos1, undef_tensor), UndefinedTensorException);\n\n    // check invalid pos\n    EXPECT_THROW(check_score_shapes(test_neg4, test_neg4), TensorSizeMismatchException);\n    EXPECT_THROW(check_score_shapes(invalid_tensor, test_neg4), 
TensorSizeMismatchException);\n\n    // check invalid neg\n    EXPECT_THROW(check_score_shapes(test_pos4, test_pos4), TensorSizeMismatchException);\n    EXPECT_THROW(check_score_shapes(test_pos4, invalid_tensor), TensorSizeMismatchException);\n\n    // check neg mismatch\n    EXPECT_THROW(check_score_shapes(test_pos1, test_neg4), TensorSizeMismatchException);\n\n    // check valid\n    EXPECT_NO_THROW(check_score_shapes(test_pos1, test_neg1));\n    EXPECT_NO_THROW(check_score_shapes(test_pos2, test_neg2));\n    EXPECT_NO_THROW(check_score_shapes(test_pos3, test_neg3));\n    EXPECT_NO_THROW(check_score_shapes(test_pos4, test_neg4));\n}\n\nTEST(TestLoss, TestSoftmaxCrossEntropy) {\n    auto loss_options_mean = std::make_shared<LossOptions>();\n    loss_options_mean->loss_reduction = LossReduction::MEAN;\n\n    auto loss_options_sum = std::make_shared<LossOptions>();\n    loss_options_sum->loss_reduction = LossReduction::SUM;\n\n    auto *loss_fn_mean = new SoftmaxCrossEntropy(loss_options_mean);\n    auto *loss_fn_sum = new SoftmaxCrossEntropy(loss_options_sum);\n\n    // test mean reduction\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos4, test_neg4, true));\n    ASSERT_THROW(loss_fn_mean->operator()(test_y_pred1, test_y_label1, false), MariusRuntimeException);\n    ASSERT_THROW(loss_fn_mean->operator()(test_y_pred2, test_y_label2, false), MariusRuntimeException);\n\n    // test sum reduction\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos4, test_neg4, true));\n    
ASSERT_THROW(loss_fn_sum->operator()(test_y_pred1, test_y_label1, false), MariusRuntimeException);\n    ASSERT_THROW(loss_fn_sum->operator()(test_y_pred2, test_y_label2, false), MariusRuntimeException);\n\n    auto mean_loss = loss_fn_mean->operator()(test_pos4, test_neg4, true);\n    auto sum_loss = loss_fn_sum->operator()(test_pos4, test_neg4, true);\n\n    ASSERT_TRUE(((sum_loss / test_pos4.size(0)) == mean_loss).all().item<bool>());\n\n    delete loss_fn_mean;\n    delete loss_fn_sum;\n}\n\nTEST(TestLoss, TestRankingLoss) {\n    auto loss_options_mean = std::make_shared<RankingLossOptions>();\n    loss_options_mean->loss_reduction = LossReduction::MEAN;\n    loss_options_mean->margin = 0.0;\n\n    auto loss_options_sum = std::make_shared<RankingLossOptions>();\n    loss_options_sum->loss_reduction = LossReduction::SUM;\n    loss_options_sum->margin = 0.0;\n\n    auto *loss_fn_mean = new RankingLoss(loss_options_mean);\n    auto *loss_fn_sum = new RankingLoss(loss_options_sum);\n\n    // test mean reduction\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos4, test_neg4, true));\n    ASSERT_THROW(loss_fn_mean->operator()(test_y_pred1, test_y_label1, false), MariusRuntimeException);\n\n    // test sum reduction\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos4, test_neg4, true));\n    ASSERT_THROW(loss_fn_sum->operator()(test_y_pred1, test_y_label1, false), MariusRuntimeException);\n\n    auto mean_loss = loss_fn_mean->operator()(test_pos4, test_neg4);\n    auto sum_loss = loss_fn_sum->operator()(test_pos4, 
test_neg4);\n\n    ASSERT_TRUE(((sum_loss / (test_pos4.size(0) * 2)) == mean_loss).all().item<bool>());\n\n    // test margin\n    float margin1 = -10.0;\n    float margin2 = 5.0;\n    float margin3 = 10.0;\n\n    loss_options_sum->margin = margin1;\n    auto *loss_fn_sum1 = new RankingLoss(loss_options_sum);\n\n    auto loss1 = loss_fn_sum1->operator()(test_pos4, test_neg4);\n\n    loss_options_sum->margin = margin2;\n    auto *loss_fn_sum2 = new RankingLoss(loss_options_sum);\n\n    auto loss2 = loss_fn_sum2->operator()(test_pos4, test_neg4);\n\n    loss_options_sum->margin = margin3;\n    auto *loss_fn_sum3 = new RankingLoss(loss_options_sum);\n\n    auto loss3 = loss_fn_sum3->operator()(test_pos4, test_neg4);\n\n    ASSERT_TRUE((loss1 < loss2).all().item<bool>());\n    ASSERT_TRUE((loss2 < loss3).all().item<bool>());\n\n    delete loss_fn_mean;\n    delete loss_fn_sum;\n    delete loss_fn_sum1;\n    delete loss_fn_sum2;\n    delete loss_fn_sum3;\n}\n\nTEST(TestLoss, TestCrossEntropyLoss) {\n    auto loss_options_mean = std::make_shared<LossOptions>();\n    loss_options_mean->loss_reduction = LossReduction::MEAN;\n\n    auto loss_options_sum = std::make_shared<LossOptions>();\n    loss_options_sum->loss_reduction = LossReduction::SUM;\n\n    auto *loss_fn_mean = new CrossEntropyLoss(loss_options_mean);\n    auto *loss_fn_sum = new CrossEntropyLoss(loss_options_sum);\n\n    // test mean reduction\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos4, test_neg4, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_y_pred1, test_y_label1, false));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_y_pred2, test_y_label2, false));\n\n    // test sum reduction\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos1, test_neg1, true));\n 
   ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos4, test_neg4, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_y_pred1, test_y_label1, false));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_y_pred2, test_y_label2, false));\n\n    auto mean_loss = loss_fn_mean->operator()(test_pos4, test_neg4, true);\n    auto sum_loss = loss_fn_sum->operator()(test_pos4, test_neg4, true);\n\n    ASSERT_TRUE(((sum_loss / (test_pos4.size(0))) == mean_loss).all().item<bool>());\n\n    delete loss_fn_mean;\n    delete loss_fn_sum;\n}\n\nTEST(TestLoss, TestBCEAfterSigmoid) {\n    auto loss_options_mean = std::make_shared<LossOptions>();\n    loss_options_mean->loss_reduction = LossReduction::MEAN;\n\n    auto loss_options_sum = std::make_shared<LossOptions>();\n    loss_options_sum->loss_reduction = LossReduction::SUM;\n\n    auto *loss_fn_mean = new BCEAfterSigmoidLoss(loss_options_mean);\n    auto *loss_fn_sum = new BCEAfterSigmoidLoss(loss_options_sum);\n\n    // test mean reduction\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos4, test_neg4, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_y_pred1, test_y_label1, false));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_y_pred2, test_y_label2, false));\n\n    // test sum reduction\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos4, test_neg4, true));\n    
ASSERT_NO_THROW(loss_fn_sum->operator()(test_y_pred1, test_y_label1, false));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_y_pred2, test_y_label2, false));\n\n    auto mean_loss = loss_fn_mean->operator()(test_pos4, test_neg4, true);\n    auto sum_loss = loss_fn_sum->operator()(test_pos4, test_neg4, true);\n\n    ASSERT_TRUE(((sum_loss / (3 * test_pos4.size(0))) == mean_loss).all().item<bool>());\n\n    delete loss_fn_mean;\n    delete loss_fn_sum;\n}\n\nTEST(TestLoss, TestBCEWithLogits) {\n    auto loss_options_mean = std::make_shared<LossOptions>();\n    loss_options_mean->loss_reduction = LossReduction::MEAN;\n\n    auto loss_options_sum = std::make_shared<LossOptions>();\n    loss_options_sum->loss_reduction = LossReduction::SUM;\n\n    auto *loss_fn_mean = new BCEWithLogitsLoss(loss_options_mean);\n    auto *loss_fn_sum = new BCEWithLogitsLoss(loss_options_sum);\n\n    // test mean reduction\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos4, test_neg4, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_y_pred1, test_y_label1, false));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_y_pred2, test_y_label2, false));\n\n    // test sum reduction\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos4, test_neg4, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_y_pred1, test_y_label1, false));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_y_pred2, test_y_label2, false));\n\n    auto mean_loss = loss_fn_mean->operator()(test_pos4, test_neg4, true);\n    auto sum_loss = 
loss_fn_sum->operator()(test_pos4, test_neg4, true);\n\n    ASSERT_TRUE(((sum_loss / (3 * test_pos4.size(0))) == mean_loss).all().item<bool>());\n\n    delete loss_fn_mean;\n    delete loss_fn_sum;\n}\n\nTEST(TestLoss, TestMSE) {\n    auto loss_options_mean = std::make_shared<LossOptions>();\n    loss_options_mean->loss_reduction = LossReduction::MEAN;\n\n    auto loss_options_sum = std::make_shared<LossOptions>();\n    loss_options_sum->loss_reduction = LossReduction::SUM;\n\n    auto *loss_fn_mean = new MSELoss(loss_options_mean);\n    auto *loss_fn_sum = new MSELoss(loss_options_sum);\n\n    // test mean reduction\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos4, test_neg4, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_y_pred1, test_y_label1, false));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_y_pred2, test_y_label2, false));\n\n    // test sum reduction\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos4, test_neg4, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_y_pred1, test_y_label1, false));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_y_pred2, test_y_label2, false));\n\n    auto mean_loss = loss_fn_mean->operator()(test_pos4, test_neg4, true);\n    auto sum_loss = loss_fn_sum->operator()(test_pos4, test_neg4, true);\n\n    ASSERT_TRUE(((sum_loss / (3 * test_pos4.size(0))) == mean_loss).all().item<bool>());\n\n    delete loss_fn_mean;\n    delete loss_fn_sum;\n}\n\nTEST(TestLoss, TestSoftPlus) {\n    auto loss_options_mean = std::make_shared<LossOptions>();\n    
loss_options_mean->loss_reduction = LossReduction::MEAN;\n\n    auto loss_options_sum = std::make_shared<LossOptions>();\n    loss_options_sum->loss_reduction = LossReduction::SUM;\n\n    auto *loss_fn_mean = new SoftPlusLoss(loss_options_mean);\n    auto *loss_fn_sum = new SoftPlusLoss(loss_options_sum);\n\n    // test mean reduction\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_pos4, test_neg4, true));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_y_pred1, test_y_label1, false));\n    ASSERT_NO_THROW(loss_fn_mean->operator()(test_y_pred2, test_y_label2, false));\n\n    // test sum reduction\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos1, test_neg1, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos2, test_neg2, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos3, test_neg3, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_pos4, test_neg4, true));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_y_pred1, test_y_label1, false));\n    ASSERT_NO_THROW(loss_fn_sum->operator()(test_y_pred2, test_y_label2, false));\n\n    auto mean_loss = loss_fn_mean->operator()(test_pos4, test_neg4, true);\n    auto sum_loss = loss_fn_sum->operator()(test_pos4, test_neg4, true);\n\n    ASSERT_TRUE(((sum_loss / (3 * test_pos4.size(0))) == mean_loss).all().item<bool>());\n\n    delete loss_fn_mean;\n    delete loss_fn_sum;\n}\n\nTEST(TestLoss, TestGetLossFunction) {\n    // test nullptr\n    shared_ptr<LossConfig> loss_config = nullptr;\n    EXPECT_THROW(getLossFunction(loss_config), UnexpectedNullPtrException);\n\n    auto loss_options_mean = std::make_shared<LossOptions>();\n    loss_options_mean->loss_reduction = LossReduction::MEAN;\n\n    auto ranking_loss_options_mean = 
std::make_shared<RankingLossOptions>();\n    ranking_loss_options_mean->loss_reduction = LossReduction::MEAN;\n    ranking_loss_options_mean->margin = 1.0;\n\n    // test softmax\n    loss_config = std::make_shared<LossConfig>();\n    loss_config->type = LossFunctionType::SOFTMAX_CE;\n    loss_config->options = loss_options_mean;\n\n    auto softmax_loss = new SoftmaxCrossEntropy(loss_options_mean);\n    auto ret_loss = getLossFunction(loss_config);\n    ASSERT_EQ(softmax_loss->operator()(test_pos4, test_neg4, true).item<float>(), ret_loss->operator()(test_pos4, test_neg4, true).item<float>());\n\n    delete softmax_loss;\n\n    // test ranking\n    loss_config = std::make_shared<LossConfig>();\n    loss_config->type = LossFunctionType::RANKING;\n    loss_config->options = ranking_loss_options_mean;\n\n    auto ranking_loss = new RankingLoss(ranking_loss_options_mean);\n    ret_loss = getLossFunction(loss_config);\n    ASSERT_EQ(ranking_loss->operator()(test_pos4, test_neg4, true).item<float>(), ret_loss->operator()(test_pos4, test_neg4, true).item<float>());\n\n    delete ranking_loss;\n\n    // test bce sigmoid\n    loss_config = std::make_shared<LossConfig>();\n    loss_config->type = LossFunctionType::BCE_AFTER_SIGMOID;\n    loss_config->options = loss_options_mean;\n\n    auto bce_sigmoid_loss = new BCEAfterSigmoidLoss(loss_options_mean);\n    ret_loss = getLossFunction(loss_config);\n    ASSERT_EQ(bce_sigmoid_loss->operator()(test_pos4, test_neg4, true).item<float>(), ret_loss->operator()(test_pos4, test_neg4, true).item<float>());\n\n    delete bce_sigmoid_loss;\n\n    // test bce logits\n    loss_config = std::make_shared<LossConfig>();\n    loss_config->type = LossFunctionType::BCE_WITH_LOGITS;\n    loss_config->options = loss_options_mean;\n\n    auto bce_logits_loss = new BCEWithLogitsLoss(loss_options_mean);\n    ret_loss = getLossFunction(loss_config);\n    ASSERT_EQ(bce_logits_loss->operator()(test_pos4, test_neg4, true).item<float>(), 
ret_loss->operator()(test_pos4, test_neg4, true).item<float>());\n\n    delete bce_logits_loss;\n\n    // test mse\n    loss_config = std::make_shared<LossConfig>();\n    loss_config->type = LossFunctionType::MSE;\n    loss_config->options = loss_options_mean;\n\n    auto mse_loss = new MSELoss(loss_options_mean);\n    ret_loss = getLossFunction(loss_config);\n    ASSERT_EQ(mse_loss->operator()(test_pos4, test_neg4, true).item<float>(), ret_loss->operator()(test_pos4, test_neg4, true).item<float>());\n\n    delete mse_loss;\n\n    // test softplus\n    loss_config = std::make_shared<LossConfig>();\n    loss_config->type = LossFunctionType::SOFTPLUS;\n    loss_config->options = loss_options_mean;\n\n    auto softplus_loss = new SoftPlusLoss(loss_options_mean);\n    ret_loss = getLossFunction(loss_config);\n    ASSERT_EQ(softplus_loss->operator()(test_pos4, test_neg4, true).item<float>(), ret_loss->operator()(test_pos4, test_neg4, true).item<float>());\n\n    delete softplus_loss;\n}"
  },
  {
    "path": "test/cpp/unit/nn/test_model.cpp",
    "content": "//\n// Created by Jason Mohoney on 2/5/22.\n//\n\n#include <gtest/gtest.h>\n#include <nn/decoders/edge/corrupt_node_decoder.h>\n#include <nn/decoders/edge/corrupt_rel_decoder.h>\n#include <nn/decoders/edge/distmult.h>\n#include <nn/decoders/edge/transe.h>\n#include <nn/decoders/node/node_decoder.h>\n#include <nn/decoders/node/node_decoder_model.h>\n#include <nn/decoders/node/noop_node_decoder.h>\n#include <nn/layers/embedding/embedding.h>\n#include <nn/layers/feature/feature.h>\n#include <nn/model.h>\n\nTEST(TestModel, TestInitModelFromConfigLP) {\n    int embedding_dim = 50;\n    int random_seed = 100;\n    int num_relations = 10;\n\n    auto model_config = std::make_shared<ModelConfig>();\n    auto encoder_config = std::make_shared<EncoderConfig>();\n    auto decoder_config = std::make_shared<DecoderConfig>();\n    auto loss_config = std::make_shared<LossConfig>();\n    auto dense_optimizer = std::make_shared<OptimizerConfig>();\n    auto sparse_optimizer = std::make_shared<OptimizerConfig>();\n\n    dense_optimizer->type = OptimizerType::SGD;\n    sparse_optimizer->type = OptimizerType::SGD;\n\n    auto optimizer_options = std::make_shared<OptimizerOptions>();\n    optimizer_options->learning_rate = .1;\n\n    dense_optimizer->options = optimizer_options;\n    sparse_optimizer->options = optimizer_options;\n\n    auto layer_config = std::make_shared<LayerConfig>();\n    layer_config->type = LayerType::EMBEDDING;\n    layer_config->output_dim = embedding_dim;\n\n    std::vector<shared_ptr<LayerConfig>> stage;\n    stage.emplace_back(layer_config);\n    encoder_config->layers.emplace_back(stage);\n\n    decoder_config->type = DecoderType::DISTMULT;\n    auto decoder_options = std::make_shared<EdgeDecoderOptions>();\n    decoder_options->inverse_edges = true;\n    decoder_options->edge_decoder_method = EdgeDecoderMethod::CORRUPT_NODE;\n    decoder_config->options = decoder_options;\n\n    loss_config->type = LossFunctionType::SOFTMAX_CE;\n    auto 
loss_options = std::make_shared<LossOptions>();\n    loss_options->loss_reduction = LossReduction::SUM;\n    loss_config->options = loss_options;\n\n    model_config->random_seed = random_seed;\n    model_config->learning_task = LearningTask::LINK_PREDICTION;\n\n    // check missing encoder config\n    ASSERT_THROW(initModelFromConfig(model_config, {torch::kCPU}, num_relations, true), UnexpectedNullPtrException);\n    model_config->encoder = encoder_config;\n\n    // check missing decoder config\n    ASSERT_THROW(initModelFromConfig(model_config, {torch::kCPU}, num_relations, true), UnexpectedNullPtrException);\n    model_config->decoder = decoder_config;\n\n    // check missing loss\n    ASSERT_THROW(initModelFromConfig(model_config, {torch::kCPU}, num_relations, true), UnexpectedNullPtrException);\n    model_config->loss = loss_config;\n\n    // check missing dense optimizer\n    ASSERT_THROW(initModelFromConfig(model_config, {torch::kCPU}, num_relations, true), UnexpectedNullPtrException);\n    model_config->dense_optimizer = dense_optimizer;\n    model_config->sparse_optimizer = sparse_optimizer;\n\n    ASSERT_NO_THROW(initModelFromConfig(model_config, {torch::kCPU}, num_relations, true));\n    shared_ptr<Model> model = initModelFromConfig(model_config, {torch::kCPU}, num_relations, true);\n\n    // check learning task\n    ASSERT_EQ(model->learning_task_, LearningTask::LINK_PREDICTION);\n\n    // test encoder\n    ASSERT_EQ(model->encoder_->encoder_config_->train_neighbor_sampling.size(), 0);\n    ASSERT_EQ(model->encoder_->encoder_config_->eval_neighbor_sampling.size(), 0);\n    ASSERT_EQ(model->encoder_->layers_.size(), 1);\n    ASSERT_EQ(model->encoder_->layers_[0].size(), 1);\n\n    bool is_instance = instance_of<Layer, EmbeddingLayer>(model->encoder_->layers_[0][0]);\n    bool is_not_instance = !instance_of<Layer, FeatureLayer>(model->encoder_->layers_[0][0]);\n\n    ASSERT_TRUE(is_instance);\n    ASSERT_TRUE(is_not_instance);\n\n    // test link 
prediction model decoder\n    is_instance = instance_of<DecoderModel, DistMult>(model->decoder_->model_);\n    is_not_instance = !instance_of<DecoderModel, TransE>(model->decoder_->model_);\n\n    ASSERT_TRUE(is_instance);\n    ASSERT_TRUE(is_not_instance);\n\n    is_instance = instance_of<Decoder, CorruptNodeDecoder>(model->decoder_);\n    is_not_instance = !instance_of<Decoder, CorruptRelDecoder>(model->decoder_);\n\n    ASSERT_TRUE(is_instance);\n    ASSERT_TRUE(is_not_instance);\n\n    // test loss\n    is_instance = instance_of<LossFunction, SoftmaxCrossEntropy>(model->loss_function_);\n    is_not_instance = !instance_of<LossFunction, SoftPlusLoss>(model->loss_function_);\n\n    ASSERT_TRUE(is_instance);\n    ASSERT_TRUE(is_not_instance);\n\n    // check optimizers set properly\n    ASSERT_EQ(model->optimizers_.size(), 1);\n\n    is_instance = instance_of<Optimizer, SGDOptimizer>(model->optimizers_[0]);\n    is_not_instance = !instance_of<Optimizer, AdagradOptimizer>(model->optimizers_[0]);\n\n    ASSERT_TRUE(is_instance);\n    ASSERT_TRUE(is_not_instance);\n}\n\nTEST(TestModel, TestInitModelFromConfigNC) {\n    int feature_dim = 50;\n    int random_seed = 100;\n\n    auto model_config = std::make_shared<ModelConfig>();\n    auto encoder_config = std::make_shared<EncoderConfig>();\n    auto decoder_config = std::make_shared<DecoderConfig>();\n    auto loss_config = std::make_shared<LossConfig>();\n    auto dense_optimizer = std::make_shared<OptimizerConfig>();\n    auto sparse_optimizer = std::make_shared<OptimizerConfig>();\n\n    dense_optimizer->type = OptimizerType::SGD;\n    sparse_optimizer->type = OptimizerType::SGD;\n\n    auto optimizer_options = std::make_shared<OptimizerOptions>();\n    optimizer_options->learning_rate = .1;\n\n    dense_optimizer->options = optimizer_options;\n    sparse_optimizer->options = optimizer_options;\n\n    auto layer_config = std::make_shared<LayerConfig>();\n    layer_config->type = LayerType::FEATURE;\n    
layer_config->output_dim = feature_dim;\n\n    std::vector<shared_ptr<LayerConfig>> stage;\n    stage.emplace_back(layer_config);\n    encoder_config->layers.emplace_back(stage);\n\n    decoder_config->type = DecoderType::NODE;\n\n    loss_config->type = LossFunctionType::SOFTMAX_CE;\n    auto loss_options = std::make_shared<LossOptions>();\n    loss_options->loss_reduction = LossReduction::SUM;\n    loss_config->options = loss_options;\n\n    model_config->random_seed = random_seed;\n    model_config->learning_task = LearningTask::NODE_CLASSIFICATION;\n\n    // check missing encoder config\n    ASSERT_THROW(initModelFromConfig(model_config, {torch::kCPU}, -1, true), UnexpectedNullPtrException);\n    model_config->encoder = encoder_config;\n\n    // check missing decoder config\n    ASSERT_THROW(initModelFromConfig(model_config, {torch::kCPU}, -1, true), UnexpectedNullPtrException);\n    model_config->decoder = decoder_config;\n\n    // check missing loss\n    ASSERT_THROW(initModelFromConfig(model_config, {torch::kCPU}, -1, true), UnexpectedNullPtrException);\n    model_config->loss = loss_config;\n\n    // check missing dense optimizer\n    ASSERT_THROW(initModelFromConfig(model_config, {torch::kCPU}, -1, true), UnexpectedNullPtrException);\n    model_config->dense_optimizer = dense_optimizer;\n    model_config->sparse_optimizer = sparse_optimizer;\n\n    ASSERT_NO_THROW(initModelFromConfig(model_config, {torch::kCPU}, -1, true));\n    shared_ptr<Model> model = initModelFromConfig(model_config, {torch::kCPU}, -1, true);\n\n    // check learning task\n    ASSERT_EQ(model->learning_task_, LearningTask::NODE_CLASSIFICATION);\n\n    bool is_instance = instance_of<Layer, FeatureLayer>(model->encoder_->layers_[0][0]);\n    bool is_not_instance = !instance_of<Layer, EmbeddingLayer>(model->encoder_->layers_[0][0]);\n\n    ASSERT_TRUE(is_instance);\n    ASSERT_TRUE(is_not_instance);\n\n    // test node classification model decoder\n    is_instance = instance_of<DecoderModel, 
NoOpNodeDecoder>(model->decoder_->model_);\n    is_not_instance = !instance_of<DecoderModel, TransE>(model->decoder_->model_);\n\n    ASSERT_TRUE(is_instance);\n    ASSERT_TRUE(is_not_instance);\n\n    is_instance = instance_of<Decoder, NodeDecoder>(model->decoder_);\n    is_not_instance = !instance_of<Decoder, CorruptNodeDecoder>(model->decoder_);\n\n    ASSERT_TRUE(is_instance);\n    ASSERT_TRUE(is_not_instance);\n\n    // test loss\n    is_instance = instance_of<LossFunction, SoftmaxCrossEntropy>(model->loss_function_);\n    is_not_instance = !instance_of<LossFunction, SoftPlusLoss>(model->loss_function_);\n\n    ASSERT_TRUE(is_instance);\n    ASSERT_TRUE(is_not_instance);\n\n    // check optimizers set properly\n    ASSERT_EQ(model->optimizers_.size(), 1);\n\n    is_instance = instance_of<Optimizer, SGDOptimizer>(model->optimizers_[0]);\n    is_not_instance = !instance_of<Optimizer, AdagradOptimizer>(model->optimizers_[0]);\n\n    ASSERT_TRUE(is_instance);\n    ASSERT_TRUE(is_not_instance);\n}"
  },
  {
    "path": "test/cpp/unit/test_buffer.cpp",
    "content": "#include <fcntl.h>\n#include <unistd.h>\n\n#include <string>\n\n#include \"gtest/gtest.h\"\n#include \"storage/storage.h\"\n#include \"testing_util.h\"\n#include \"util.h\"\n\n#define tryNextSwapAndAssert(pb, admits_, evicts_) \\\n    ASSERT_EQ(pb->hasSwap(), true);                \\\n    admits = pb->getNextAdmit();                   \\\n    evicts = pb->getNextEvict();                   \\\n    ASSERT_EQ(admits, admits_);                    \\\n    ASSERT_EQ(evicts, evicts_);                    \\\n    pb->performNextSwap();\n\nclass PartitionBufferTest : public ::testing::Test {\n   protected:\n    string filename;\n    int fd;\n    torch::Tensor rand_tensor_float32;\n    int64_t rows;\n    int64_t cols;\n    int capacity;\n    int num_partitions;\n    int fine_to_coarse_ratio;\n    int64_t partition_size;\n    int embedding_size;\n    int64_t total_embeddings;\n    torch::Dtype dtype;\n    int dtype_size;\n    vector<torch::Tensor> buffer_states;\n    PartitionBuffer *pb;\n\n    PartitionBufferTest() {\n        total_embeddings = 45;\n        rows = 100;\n        cols = 100;\n        capacity = 2;\n        num_partitions = 5;\n        partition_size = 10;\n        embedding_size = rows * cols;\n        dtype = torch::kFloat32;\n        dtype_size = get_dtype_size_wrapper(dtype);\n        fine_to_coarse_ratio = 2;\n    }\n\n    ~PartitionBufferTest() {}\n\n    void SetUp() override {\n        filename = testing::TempDir() + \"embeddings_data.txt\";\n        fd = createTmpFile(filename);\n        ASSERT_NE(fd, -1);\n\n        int64_t tensor_size = rows * cols * dtype_size;\n        ASSERT_EQ(genRandTensorAndWriteToFile(rand_tensor_float32, total_embeddings, embedding_size, dtype, fd), total_embeddings * tensor_size);\n\n        for (int i = 1; i < 5; i++) {\n            torch::Tensor state = torch::zeros({2}, torch::kInt64);\n            state[1] = i;\n            buffer_states.push_back(state);\n        }\n        for (int i = 4; i >= 2; i--) {\n 
           torch::Tensor state = torch::ones({2}, torch::kInt64);\n            state[1] = i;\n            buffer_states.push_back(state);\n        }\n        {\n            torch::Tensor state = torch::zeros({2}, torch::kInt64);\n            state[0] = 2;\n            state[1] = 3;\n            buffer_states.push_back(state);\n        }\n        {\n            torch::Tensor state = torch::zeros({2}, torch::kInt64);\n            state[0] = 2;\n            state[1] = 4;\n            buffer_states.push_back(state);\n        }\n        {\n            torch::Tensor state = torch::zeros({2}, torch::kInt64);\n            state[0] = 3;\n            state[1] = 4;\n            buffer_states.push_back(state);\n        }\n    }\n\n    void initializePartitionBuffer(bool prefetch) {\n        pb = new PartitionBuffer(capacity, num_partitions, fine_to_coarse_ratio, partition_size, embedding_size, total_embeddings, dtype, filename, prefetch);\n        pb->setBufferOrdering(buffer_states);\n        pb->load();\n    }\n\n    void TearDown() override {\n        close(fd);\n        remove(filename.c_str());\n        delete (pb);\n    }\n};\n\nclass PartitionedFileTest : public ::testing::Test {\n   protected:\n    int num_partitions;\n    int64_t partition_size;\n    int embedding_size;\n    int64_t total_embeddings;\n    torch::Dtype dtype;\n    int dtype_size;\n    string filename;\n    int fd;\n    PartitionedFile *pf;\n    torch::Tensor rand_tensor_float32;\n\n    PartitionedFileTest() {\n        num_partitions = 5;\n        partition_size = 10;\n        embedding_size = 10;\n        total_embeddings = 45;\n        dtype = torch::kFloat32;\n        dtype_size = get_dtype_size_wrapper(dtype);\n    }\n\n    ~PartitionedFileTest() {}\n\n    void SetUp() override {\n        filename = testing::TempDir() + \"partitioned_file.txt\";\n        fd = createTmpFile(filename);\n        ASSERT_NE(fd, -1);\n        ASSERT_EQ(genRandTensorAndWriteToFile(rand_tensor_float32, total_embeddings, 
embedding_size, dtype, fd),\n                  total_embeddings * embedding_size * dtype_size);\n        pf = new PartitionedFile(filename, num_partitions, partition_size, embedding_size, total_embeddings, dtype);\n    }\n\n    void TearDown() override {\n        remove(filename.c_str());\n        delete (pf);\n        close(fd);\n    }\n};\n\nclass LookaheadBlockTest : public ::testing::Test {\n   protected:\n    int total_size;\n    PartitionedFile *pf;\n    int total_embeddings;\n    int num_per_lookahead;\n    int num_partitions;\n    int partition_size;\n    int embedding_size;\n    int dtype_size;\n    torch::Dtype dtype;\n    string filename;\n    int fd;\n    torch::Tensor rand_tensor_float32;\n    std::vector<Partition *> partitions;\n    std::vector<Partition *> returned_partitions;\n\n    LookaheadBlockTest() {\n        total_embeddings = 45;\n        num_partitions = 5;\n        partition_size = 10;\n        embedding_size = 10000;\n        dtype = torch::kFloat32;\n        dtype_size = get_dtype_size_wrapper(dtype);\n        total_size = partition_size * embedding_size * dtype_size;\n        num_per_lookahead = 2;\n    }\n\n    ~LookaheadBlockTest() {}\n\n    void SetUp() override {\n        filename = testing::TempDir() + \"lookahead_buffer.txt\";\n        fd = createTmpFile(filename);\n        ASSERT_NE(fd, -1);\n        ASSERT_EQ(genRandTensorAndWriteToFile(rand_tensor_float32, total_embeddings, embedding_size, dtype, fd),\n                  total_embeddings * embedding_size * dtype_size);\n        pf = new PartitionedFile(filename, num_partitions, partition_size, embedding_size, total_embeddings, dtype);\n    }\n\n    void TearDown() override {\n        remove(filename.c_str());\n        delete (pf);\n        close(fd);\n        for (int i = 0; i < partitions.size(); i++) {\n            delete (partitions[i]);\n            delete (returned_partitions[i]);\n        }\n    }\n};\n\nclass AsyncWriteBlockTest : public ::testing::Test {\n   protected:\n 
   int total_size;\n    PartitionedFile *pf;\n    int total_embeddings;\n    int num_per_evict;\n    int num_partitions;\n    int partition_size;\n    int embedding_size;\n    int dtype_size;\n    torch::Dtype dtype;\n    string filename;\n    int fd;\n    torch::Tensor rand_tensor_float32;\n    std::vector<Partition *> partitions;\n\n    AsyncWriteBlockTest() {\n        total_embeddings = 45;\n        num_partitions = 5;\n        partition_size = 10;\n        embedding_size = 10000;\n        dtype = torch::kFloat32;\n        dtype_size = get_dtype_size_wrapper(dtype);\n        total_size = partition_size * embedding_size * dtype_size;\n        num_per_evict = 5;\n    }\n\n    ~AsyncWriteBlockTest() {}\n\n    void SetUp() override {\n        filename = testing::TempDir() + \"asyn_write_block.txt\";\n        fd = createTmpFile(filename);\n        ASSERT_NE(fd, -1);\n        ASSERT_EQ(genRandTensorAndWriteToFile(rand_tensor_float32, total_embeddings, embedding_size, dtype, fd),\n                  total_embeddings * embedding_size * dtype_size);\n        pf = new PartitionedFile(filename, num_partitions, partition_size, embedding_size, total_embeddings, dtype);\n    }\n\n    void TearDown() override {\n        remove(filename.c_str());\n        delete (pf);\n        close(fd);\n        for (int i = 0; i < partitions.size(); i++) {\n            delete (partitions[i]);\n        }\n    }\n};\n\nTEST_F(PartitionBufferTest, TestPartitionBufferOrdering) {\n    initializePartitionBuffer(false);\n    pb->unload(false);\n    pb->load();\n    std::vector<int> admits, evicts;\n    for (int i = 2; i <= 4; i++) {\n        tryNextSwapAndAssert(pb, std::vector<int>(1, i), std::vector<int>(1, i - 1))\n    }\n    tryNextSwapAndAssert(pb, std::vector<int>(1, 1), std::vector<int>(1, 0));\n    for (int i = 4; i >= 3; i--) {\n        tryNextSwapAndAssert(pb, std::vector<int>(1, i - 1), std::vector<int>(1, i));\n    }\n    tryNextSwapAndAssert(pb, std::vector<int>(1, 3), 
std::vector<int>(1, 1));\n    tryNextSwapAndAssert(pb, std::vector<int>(1, 4), std::vector<int>(1, 3));\n    tryNextSwapAndAssert(pb, std::vector<int>(1, 3), std::vector<int>(1, 2));\n    ASSERT_EQ(pb->hasSwap(), false);\n}\n\nTEST_F(PartitionBufferTest, TestPartitionBufferPrefetch) {\n    initializePartitionBuffer(true);\n    std::vector<int> admits, evicts;\n    for (int i = 2; i <= 4; i++) {\n        tryNextSwapAndAssert(pb, std::vector<int>(1, i), std::vector<int>(1, i - 1))\n    }\n    tryNextSwapAndAssert(pb, std::vector<int>(1, 1), std::vector<int>(1, 0));\n    for (int i = 4; i >= 3; i--) {\n        tryNextSwapAndAssert(pb, std::vector<int>(1, i - 1), std::vector<int>(1, i));\n    }\n    tryNextSwapAndAssert(pb, std::vector<int>(1, 3), std::vector<int>(1, 1));\n    tryNextSwapAndAssert(pb, std::vector<int>(1, 4), std::vector<int>(1, 3));\n    tryNextSwapAndAssert(pb, std::vector<int>(1, 3), std::vector<int>(1, 2));\n    ASSERT_EQ(pb->hasSwap(), false);\n}\n\nTEST_F(PartitionBufferTest, TestPartitionBufferIndexRead) {\n    initializePartitionBuffer(false);\n    torch::Tensor indices = pb->getRandomIds(20);\n    torch::Tensor expected = rand_tensor_float32.index_select(0, indices);\n    ASSERT_EQ(expected.equal(pb->indexRead(indices)), true);\n\n    // indexRead should take in only 1d tensors.\n    ASSERT_THROW(pb->indexRead(torch::randint(1000, {10, 10}, torch::kInt64)), std::runtime_error);\n}\n\nTEST_F(PartitionBufferTest, TestPartitionBufferIndexAdd) {\n    initializePartitionBuffer(false);\n    torch::Tensor indices = std::get<0>(at::_unique(pb->getRandomIds(1000)));\n    torch::Tensor rand_values = torch::randint(1000, {indices.size(0), rows * cols}, torch::kFloat32);\n    torch::Tensor updated_values = rand_tensor_float32.index_add_(0, indices, rand_values).index_select(0, indices);\n    pb->indexAdd(indices, rand_values);\n    ASSERT_EQ(updated_values.equal(pb->indexRead(indices)), true);\n\n    // indexAdd should check tensor dims\n    
ASSERT_THROW(pb->indexAdd(indices, torch::randint(1000, {indices.size(0) + 1, rows * cols}, torch::kFloat32)), std::runtime_error);\n    ASSERT_THROW(pb->indexAdd(indices, torch::randint(1000, {indices.size(0), rows * cols + 1}, torch::kFloat32)), std::runtime_error);\n    ASSERT_THROW(pb->indexAdd(torch::randint(1000, {10, 10}, torch::kInt64), rand_values), std::runtime_error);\n}\n\nTEST_F(PartitionBufferTest, TestPartitionBufferSync) {\n    initializePartitionBuffer(false);\n    torch::Tensor rand;\n    ASSERT_EQ(genRandTensorAndWriteToFile(rand, 2, embedding_size, dtype, fd), 2 * embedding_size * dtype_size);\n    pb->unload(true);\n    rand = torch::Tensor();\n    rand = torch::randn({total_embeddings, embedding_size}, dtype);\n    ASSERT_EQ(pread_wrapper(fd, (void *)rand.data_ptr(), total_embeddings * embedding_size * dtype_size, 0), total_embeddings * embedding_size * dtype_size);\n    ASSERT_EQ(rand_tensor_float32.equal(rand), true);\n}\n\nTEST_F(PartitionBufferTest, TestPartitionBufferGlobalMap) {\n    initializePartitionBuffer(false);\n    torch::Tensor exp_map = -torch::ones({total_embeddings}, torch::kInt64);\n    exp_map.slice(0, 0, 20) = torch::arange(0, 20);\n    ASSERT_EQ(exp_map.equal(pb->getGlobalToLocalMap(true)), true);\n    exp_map.slice(0, 10, 20) = -torch::ones({10}, torch::kInt64);\n    exp_map.slice(0, 20, 30) = torch::arange(10, 20);\n    ASSERT_EQ(exp_map.equal(pb->getGlobalToLocalMap(false)), true);\n}\n\nTEST_F(PartitionedFileTest, TestReadPartition) {\n    int idx_offset = (num_partitions - 1) * partition_size;\n    Partition p(num_partitions - 1, std::min(partition_size, total_embeddings - idx_offset), embedding_size, dtype, idx_offset,\n                idx_offset * embedding_size * dtype_size);\n    torch::Tensor rand_tensor = torch::randint(1000, {partition_size, embedding_size}, torch::kFloat32);\n    void *addr = rand_tensor.data_ptr();\n    pf->readPartition(addr, &p);\n    torch::Tensor indices = at::randint(idx_offset, 
total_embeddings, 10, torch::kInt64);\n    torch::Tensor returned_values = p.indexRead(indices);\n    ASSERT_EQ(returned_values.equal(rand_tensor_float32.index_select(0, indices)), true);\n\n    // check for exceptions on passing null ptrs.\n    ASSERT_THROW(pf->readPartition(addr, NULL), std::runtime_error);\n    ASSERT_THROW(pf->readPartition(NULL, &p), std::runtime_error);\n}\n\nTEST_F(PartitionedFileTest, TestWritePartition) {\n    int idx_offset = (num_partitions - 1) * partition_size;\n    Partition p(num_partitions - 1, std::min(partition_size, total_embeddings - idx_offset), embedding_size, dtype, idx_offset,\n                idx_offset * embedding_size * dtype_size);\n    void *addr = (void *)((char *)rand_tensor_float32.data_ptr() + idx_offset * embedding_size * dtype_size);\n    pf->readPartition(addr, &p);\n\n    // write random data to PartitionedFile\n    torch::Tensor rand;\n    ASSERT_EQ(genRandTensorAndWriteToFile(rand, total_embeddings, embedding_size, dtype, fd), total_embeddings * embedding_size * dtype_size);\n\n    // write prev data\n    pf->writePartition(&p, false);\n    Partition p_(num_partitions - 1, std::min(partition_size, total_embeddings - idx_offset), embedding_size, dtype, idx_offset,\n                 idx_offset * embedding_size * dtype_size);\n    addr = (void *)((char *)rand.data_ptr() + idx_offset * embedding_size * dtype_size);\n    pf->readPartition(addr, &p_);\n    ASSERT_EQ(p.tensor_.equal(p_.tensor_), true);\n\n    // writePartition after clearmem should throw an exception\n    pf->writePartition(&p, true);\n    ASSERT_THROW(pf->writePartition(&p, true), std::runtime_error);\n\n    // writePartition should throw error on passing NULL;\n    ASSERT_THROW(pf->writePartition(NULL, true), std::runtime_error);\n}\n\nTEST_F(LookaheadBlockTest, TestMoveToBuffer) {\n    for (int i = 0; i < num_partitions; i++) {\n        int idx_offset = i * partition_size;\n        partitions.push_back(new Partition(i, std::min(partition_size, 
total_embeddings - idx_offset), embedding_size, dtype, idx_offset,\n                                           idx_offset * embedding_size * dtype_size));\n        returned_partitions.push_back(new Partition(i, std::min(partition_size, total_embeddings - idx_offset), embedding_size, dtype, idx_offset,\n                                                    idx_offset * embedding_size * dtype_size));\n    }\n\n    LookaheadBlock lb(total_size, pf, num_per_lookahead);\n    vector<Partition *> curr_partitions(2);\n    for (int i = 0; i < 2; i++) curr_partitions[i] = partitions[i];\n    lb.start(curr_partitions);\n\n    vector<void *> buff_mem(num_per_lookahead);\n    for (int i = 0; i < buff_mem.size(); i++) buff_mem[i] = malloc(partition_size * embedding_size * dtype_size);\n    vector<int64_t> buff_ids;\n    buff_ids.push_back(0);\n    buff_ids.push_back(1);\n    for (int i = 2; i < 4; i++) curr_partitions[i - 2] = partitions[i];\n    lb.move_to_buffer(buff_mem, buff_ids, curr_partitions);\n    for (int i = 0; i < 2; i++) {\n        pf->readPartition(buff_mem[i], returned_partitions[i]);\n        ASSERT_EQ(returned_partitions[i]->tensor_.equal(partitions[i]->tensor_), true);\n    }\n\n    curr_partitions.pop_back();\n    curr_partitions[0] = partitions[4];\n    lb.move_to_buffer(buff_mem, buff_ids, curr_partitions);\n    for (int i = 2; i < 4; i++) {\n        pf->readPartition(buff_mem[i - 2], returned_partitions[i]);\n        ASSERT_EQ(returned_partitions[i]->tensor_.equal(partitions[i]->tensor_), true);\n    }\n\n    buff_ids.pop_back();\n    curr_partitions.pop_back();\n\n    // move_to_buffer should throw an exception when buffer size is less than the existing partitions\n    ASSERT_THROW(lb.move_to_buffer(vector<void *>(), vector<int64_t>(), curr_partitions), std::runtime_error);\n\n    lb.move_to_buffer(buff_mem, buff_ids, curr_partitions);\n    pf->readPartition(buff_mem[0], returned_partitions[4]);\n    
ASSERT_EQ(returned_partitions[4]->tensor_.equal(partitions[4]->tensor_), true);\n    lb.stop();\n    for (int i = 0; i < buff_mem.size(); i++) free(buff_mem[i]);\n}\n\nTEST_F(AsyncWriteBlockTest, TestAsyncWrite) {\n    for (int i = 0; i < num_partitions; i++) {\n        int idx_offset = i * partition_size;\n        partitions.push_back(new Partition(i, std::min(partition_size, total_embeddings - idx_offset), embedding_size, dtype, idx_offset,\n                                           idx_offset * embedding_size * dtype_size));\n    }\n\n    LookaheadBlock lb(total_size, pf, num_per_evict);\n    lb.start(partitions);\n    AsyncWriteBlock awb(total_size, pf, num_per_evict);\n    awb.start();\n\n    vector<void *> buff_mem(num_per_evict);\n    vector<int64_t> buff_ids(num_per_evict);\n    for (int i = 0; i < buff_mem.size(); i++) {\n        buff_mem[i] = malloc(partition_size * embedding_size * dtype_size);\n        buff_ids[i] = i;\n    }\n    lb.move_to_buffer(buff_mem, buff_ids, vector<Partition *>());\n    torch::Tensor rand_tensor = torch::randn({total_embeddings, embedding_size}, dtype);\n    for (int i = 0; i < partitions.size(); i++) {\n        int idx_offset = i * partition_size;\n        void *addr = (void *)((char *)rand_tensor.data_ptr() + idx_offset * embedding_size * dtype_size);\n        memcpy_wrapper(partitions[i]->data_ptr_, addr, partitions[i]->partition_size_ * embedding_size * dtype_size);\n        partitions[i]->tensor_ = torch::from_blob(partitions[i]->data_ptr_, {partitions[i]->partition_size_, embedding_size}, dtype);\n    }\n\n    awb.async_write(partitions);\n    // wait until the write happens\n    std::unique_lock lock(*(awb.lock_));\n    awb.cv_.wait(lock, [&awb] { return awb.present_ == false; });\n    lock.unlock();\n    awb.cv_.notify_all();\n\n    pread_wrapper(fd, (void *)rand_tensor_float32.data_ptr(), total_embeddings * embedding_size * dtype_size, 0);\n    ASSERT_EQ(rand_tensor_float32.equal(rand_tensor), true);\n\n    // 
async_write should throw an exception when the passed in partitions has a length greater than the mem block\n    partitions.push_back(partitions[0]);\n    ASSERT_THROW(awb.async_write(partitions), std::runtime_error);\n    partitions.pop_back();\n\n    for (int i = 0; i < buff_mem.size(); i++) free(buff_mem[i]);\n    lb.stop();\n    awb.stop();\n}"
  },
  {
    "path": "test/cpp/unit/test_storage.cpp",
    "content": "#include <fcntl.h>\n#include <unistd.h>\n\n#include \"configuration/config.h\"\n#include \"gtest/gtest.h\"\n#include \"storage/storage.h\"\n#include \"testing_util.h\"\n\n#define tryNextSwapAndAssert(pbs, admits_, evicts_) \\\n    ASSERT_EQ(pbs.hasSwap(), true);                 \\\n    admits = pbs.getNextAdmit();                    \\\n    evicts = pbs.getNextEvict();                    \\\n    ASSERT_EQ(admits, admits_);                     \\\n    ASSERT_EQ(evicts, evicts_);                     \\\n    pbs.performNextSwap();\n\nclass StorageTest : public ::testing::Test {\n   protected:\n    vector<std::string> filenames_array;\n    vector<int> fd_array;\n    int64_t dim0_size;\n    int64_t dim1_size;\n    vector<torch::Dtype> dtype_array;\n    vector<int> dtype_size_array;\n    vector<torch::Tensor> rand_tensors_array;\n\n    StorageTest() {\n        dim0_size = 46;\n        dim1_size = 1000;\n        dtype_array = {torch::kInt32, torch::kInt64, torch::kFloat16, torch::kFloat32, torch::kFloat64};\n        for (int i = 0; i < dtype_array.size(); i++) dtype_size_array.push_back(get_dtype_size_wrapper(dtype_array[i]));\n    }\n\n    ~StorageTest() {}\n\n    void SetUp() override {\n        for (int i = 0; i < dtype_array.size(); i++) {\n            filenames_array.push_back(testing::TempDir() + \"storage_\" + std::to_string(i) + \".txt\");\n            fd_array.push_back(createTmpFile(filenames_array[i]));\n            ASSERT_NE(fd_array[i], -1);\n            torch::Tensor rand_tensor = getRandTensor(dim0_size, dim1_size, dtype_array[i]);\n            rand_tensors_array.push_back(rand_tensor);\n        }\n    }\n\n    void TearDown() override {\n        for (int i = 0; i < dtype_array.size(); i++) {\n            close(fd_array[i]);\n            remove(filenames_array[i].c_str());\n        }\n    }\n};\n\nclass FlatFileTest : public StorageTest {\n   protected:\n    int num_edges;\n    int num_cols;\n    int num_nodes;\n    std::string 
edges_data_path;\n    std::string edges_bucket_partition_path;\n    FlatFileTest() {\n        num_edges = 1000;\n        num_cols = 3;\n        num_nodes = 100;\n        edges_data_path = testing::TempDir() + \"flat_file_edges.bin\";\n        edges_bucket_partition_path = testing::TempDir() + \"flat_file_edge_partitions.txt\";\n    }\n\n    ~FlatFileTest() {}\n};\n\nclass InMemoryTest : public StorageTest {\n   protected:\n    int num_edges;\n    int num_cols;\n    int num_nodes;\n    std::string edges_data_path;\n    std::string edges_bucket_partition_path;\n    InMemoryTest() {\n        num_edges = 1000;\n        num_cols = 3;\n        num_nodes = 100;\n        edges_data_path = testing::TempDir() + \"in_memory_edges.bin\";\n        edges_bucket_partition_path = testing::TempDir() + \"in_memory_edge_partitions.txt\";\n    }\n\n    ~InMemoryTest() {}\n};\n\nclass PartitionBufferStorageTest : public StorageTest {\n   protected:\n    shared_ptr<PartitionBufferOptions> options;\n    int capacity;\n    int num_partitions;\n    int64_t partition_size;\n    int fine_to_coarse_ratio;\n    vector<torch::Tensor> buffer_states;\n\n    PartitionBufferStorageTest() {\n        shared_ptr<MariusConfig> p = loadConfig(std::string(MARIUS_TEST_DIRECTORY) + \"/test_configs/fb15k_237.yaml\");\n        options = std::dynamic_pointer_cast<PartitionBufferOptions>(p->storage->embeddings->options);\n\n        capacity = 2;\n        num_partitions = 5;\n        partition_size = 10;\n        fine_to_coarse_ratio = 2;\n\n        for (int i = 1; i < 5; i++) {\n            torch::Tensor state = torch::zeros({2}, torch::kInt64);\n            state[1] = i;\n            buffer_states.push_back(state);\n        }\n        for (int i = 4; i >= 2; i--) {\n            torch::Tensor state = torch::ones({2}, torch::kInt64);\n            state[1] = i;\n            buffer_states.push_back(state);\n        }\n        {\n            torch::Tensor state = torch::zeros({2}, torch::kInt64);\n            
state[0] = 2;\n            state[1] = 3;\n            buffer_states.push_back(state);\n        }\n        {\n            torch::Tensor state = torch::zeros({2}, torch::kInt64);\n            state[0] = 2;\n            state[1] = 4;\n            buffer_states.push_back(state);\n        }\n        {\n            torch::Tensor state = torch::zeros({2}, torch::kInt64);\n            state[0] = 3;\n            state[1] = 4;\n            buffer_states.push_back(state);\n        }\n    }\n\n    ~PartitionBufferStorageTest() {}\n};\n\nTEST_F(FlatFileTest, TestFlatFileWrite) {\n    for (int i = 0; i < dtype_size_array.size(); i++) {\n        FlatFile flat_file(filenames_array[i], 0, dim1_size, dtype_array[i]);\n        flat_file.append(rand_tensors_array[i]);\n        torch::Tensor rand_tensor = getRandTensor(dim0_size, dim1_size, dtype_array[i]);\n\n        ASSERT_EQ(pread_wrapper(fd_array[i], (void *)rand_tensor.data_ptr(), dim0_size * dim1_size * dtype_size_array[i], 0),\n                  dim0_size * dim1_size * dtype_size_array[i]);\n        ASSERT_EQ(rand_tensors_array[i].equal(rand_tensor), true);\n\n        flat_file.load();\n        flat_file.load();\n        flat_file.unload(true);\n        ASSERT_THROW(flat_file.rangePut(17, rand_tensor.index_select(0, torch::arange(17, 37))), std::runtime_error);\n\n        flat_file.load();\n        torch::Tensor rand_sub_tensor = getRandTensor(20, dim1_size, dtype_array[i]);\n        rand_tensor.slice(0, 17, 37) = rand_sub_tensor;\n        flat_file.rangePut(17, rand_tensor.index_select(0, torch::arange(17, 37)));\n        ASSERT_EQ(rand_tensor.index_select(0, torch::arange(17, 37)).equal(flat_file.range(17, 20)), true);\n\n        ASSERT_THROW(flat_file.rangePut(37, rand_tensors_array[i].index_select(0, torch::arange(17, 37))), std::runtime_error);\n        ASSERT_THROW(flat_file.range(17, 30), std::runtime_error);\n        ASSERT_THROW(flat_file.rangePut(17, torch::Tensor()), std::runtime_error);\n    
}\n}\n\nTEST_F(FlatFileTest, TestFlatFileCopy) {\n    for (int i = 0; i < dtype_size_array.size(); i++) {\n        FlatFile flat_file(filenames_array[i], 0, dim1_size, dtype_array[i]);\n        flat_file.append(rand_tensors_array[i]);\n\n        std::string new_filename = testing::TempDir() + \"new_storage.txt\";\n        int new_fd = createTmpFile(new_filename);\n        torch::Tensor rand_tensor;\n        ASSERT_EQ(genRandTensorAndWriteToFile(rand_tensor, dim0_size, dim1_size, dtype_array[i], new_fd), dim0_size * dim1_size * dtype_size_array[i]);\n        ASSERT_NE(new_fd, -1);\n\n        flat_file.copy(new_filename, false);\n        ASSERT_EQ(pread_wrapper(new_fd, (void *)rand_tensor.data_ptr(), dim0_size * dim1_size * dtype_size_array[i], 0),\n                  dim0_size * dim1_size * dtype_size_array[i]);\n        ASSERT_EQ(rand_tensors_array[i].equal(rand_tensor), true);\n\n        close(new_fd);\n        remove(new_filename.c_str());\n    }\n}\n\nTEST_F(FlatFileTest, TestFlatFileShuffle) {\n    for (int i = 0; i < dtype_size_array.size(); i++) {\n        FlatFile flat_file(filenames_array[i], 0, dim1_size, dtype_array[i]);\n        flat_file.append(rand_tensors_array[i]);\n        torch::Tensor rand_tensor = getRandTensor(dim0_size, dim1_size, dtype_array[i]);\n\n        rand_tensor.slice(0, 0, dim0_size) = rand_tensors_array[i].slice(0, 0, dim0_size);\n        flat_file.shuffle();\n        ASSERT_THROW(flat_file.range(0, dim0_size), std::runtime_error);\n\n        flat_file.load();\n        rand_tensors_array[i] = flat_file.range(0, dim0_size);\n        ASSERT_EQ(checkPermOf2dTensor(rand_tensors_array[i], rand_tensor), true);\n    }\n}\n\nTEST_F(FlatFileTest, TestFlatFileSort) {\n    for (int i = 0; i < dtype_size_array.size(); i++) {\n        FlatFile flat_file(filenames_array[i], 0, dim1_size, dtype_array[i]);\n        flat_file.append(rand_tensors_array[i]);\n        torch::Tensor rand_tensor = getRandTensor(dim0_size, dim1_size, dtype_array[i]);\n\n     
   rand_tensor.slice(0, 0, dim0_size) = rand_tensors_array[i].slice(0, 0, dim0_size);\n        flat_file.sort(true);\n        ASSERT_THROW(flat_file.range(0, dim0_size), std::runtime_error);\n\n        flat_file.load();\n        rand_tensors_array[i] = flat_file.range(0, dim0_size);\n        rand_tensor.copy_(rand_tensor.index_select(0, torch::argsort(rand_tensor.select(1, 0))));\n        ASSERT_EQ(rand_tensor.equal(rand_tensors_array[i]), true);\n    }\n}\n\nTEST_F(FlatFileTest, TestFlatFileSortEdges) {\n    torch::Tensor rand_tensor = getRandTensor(num_edges, num_cols, torch::kInt32, num_nodes);\n    createTmpFile(edges_data_path);\n\n    FlatFile flat_file(edges_data_path, 0, num_cols, torch::kInt32);\n    flat_file.append(rand_tensor);\n\n    vector<int64_t> partition_sizes = partitionEdges(rand_tensor, 3, num_nodes + 1);\n    createTmpFile(edges_bucket_partition_path);\n    {\n        std::ofstream ostrm;\n        ostrm.open(edges_bucket_partition_path, std::ios::out | std::ios::trunc);\n        for (int i = 0; i < partition_sizes.size(); i++) ostrm << partition_sizes[i] << \"\\n\";\n        ostrm.close();\n    }\n    flat_file.readPartitionSizes(edges_bucket_partition_path);\n    flat_file.load();\n\n    torch::Tensor rand_tensor_1 = getRandTensor(num_edges, num_cols, torch::kInt32);\n    torch::Tensor rand_tensor_2 = getRandTensor(num_edges, num_cols, torch::kInt32);\n    rand_tensor_1.copy_(flat_file.range(0, num_edges));\n    rand_tensor_2.copy_(rand_tensor_1);\n\n    flat_file.sort(true);\n    vector<int64_t> edge_bucket_sizes = flat_file.getEdgeBucketSizes();\n    sortWithinEdgeBuckets(rand_tensor_2, edge_bucket_sizes);\n    rand_tensor_1 = flat_file.range(0, num_edges);\n    ASSERT_EQ(rand_tensor_1.equal(rand_tensor_2), true);\n\n    flat_file.sort(false);\n    sortWithinEdgeBuckets(rand_tensor_2, edge_bucket_sizes, -1);\n    rand_tensor_1 = flat_file.range(0, num_edges);\n    ASSERT_EQ(rand_tensor_1.equal(rand_tensor_2), true);\n\n    
remove(edges_bucket_partition_path.c_str());\n    remove(edges_data_path.c_str());\n}\n\nTEST_F(InMemoryTest, TestIndexRead) {\n    for (int i = 0; i < dtype_size_array.size(); i++) {\n        InMemory in_memory(filenames_array[i], rand_tensors_array[i], torch::kCPU);\n        in_memory.load();\n        torch::Tensor rand_tensor = getRandTensor(10, dim1_size, dtype_array[i]);\n        ASSERT_EQ(pread_wrapper(fd_array[i], rand_tensor.data_ptr(), 10 * dim1_size * dtype_size_array[i], 10 * dim1_size * dtype_size_array[i]),\n                  10 * dim1_size * dtype_size_array[i]);\n        ASSERT_EQ(in_memory.indexRead(torch::arange(10, 20)).equal(rand_tensor), true);\n\n        ASSERT_THROW(in_memory.indexRead(torch::randint(100, {10, 10}, dtype_array[i])), std::runtime_error);\n        ASSERT_THROW(in_memory.range(17, 30), std::runtime_error);\n    }\n}\n\nTEST_F(InMemoryTest, TestIndexAdd) {\n    // just iterate over torch::kFloat32, indexAdd doesn't support any other dtype\n    for (int i = 3; i < 4; i++) {\n        InMemory in_memory(filenames_array[i], rand_tensors_array[i], torch::kCPU);\n        in_memory.load();\n        torch::Tensor indices = torch::arange(0, dim0_size);\n        torch::Tensor rand_values = torch::randint(1000, {indices.size(0), dim1_size}, dtype_array[i]);\n        torch::Tensor updated_values = rand_tensors_array[i].index_add_(0, indices, rand_values).index_select(0, indices);\n        in_memory.indexAdd(indices, rand_values);\n        ASSERT_EQ(updated_values.equal(in_memory.indexRead(indices)), true);\n\n        // indexAdd should check tensor dims\n        ASSERT_THROW(in_memory.indexAdd(indices, torch::randn({indices.size(0) + 1, dim1_size}, dtype_array[i])), std::runtime_error);\n        ASSERT_THROW(in_memory.indexAdd(indices, torch::randn({indices.size(0), dim1_size + 1}, dtype_array[i])), std::runtime_error);\n        ASSERT_THROW(in_memory.indexAdd(torch::randint(1000, {10, 10}, torch::kInt64), rand_values), std::runtime_error);\n 
       rand_values = torch::Tensor();\n        ASSERT_THROW(in_memory.indexAdd(indices, rand_values), std::runtime_error);\n    }\n}\n\nTEST_F(InMemoryTest, TestIndexPut) {\n    // just iterate over torch::kFloat32, indexPut doesn't support any other dtype\n    for (int i = 3; i < 4; i++) {\n        InMemory in_memory(filenames_array[i], rand_tensors_array[i], torch::kCPU);\n        in_memory.load();\n        torch::Tensor indices = std::get<0>(at::_unique(torch::randint(dim0_size, dim1_size, torch::kInt64)));\n        torch::Tensor rand_values = getRandTensor(indices.size(0), dim1_size, dtype_array[i]);\n\n        in_memory.indexPut(indices, rand_values);\n        ASSERT_EQ(rand_values.equal(in_memory.indexRead(indices)), true);\n\n        // indexPut should check tensor dims\n        rand_values = getRandTensor(indices.size(0) + 1, dim1_size, dtype_array[i]);\n        ASSERT_THROW(in_memory.indexPut(indices, rand_values), std::runtime_error);\n        rand_values = getRandTensor(indices.size(0), dim1_size + 1, dtype_array[i]);\n        ASSERT_THROW(in_memory.indexPut(indices, rand_values), std::runtime_error);\n        rand_values = getRandTensor(indices.size(0), dim1_size, dtype_array[i]);\n        ASSERT_THROW(in_memory.indexPut(torch::randint(1000, {10, 10}, torch::kInt64), rand_values), std::runtime_error);\n\n        rand_values = torch::Tensor();\n        ASSERT_THROW(in_memory.indexPut(indices, rand_values), std::runtime_error);\n    }\n}\n\nTEST_F(InMemoryTest, TestInMemoryShuffle) {\n    for (int i = 0; i < dtype_array.size(); i++) {\n        InMemory in_memory(filenames_array[i], rand_tensors_array[i], torch::kCPU);\n        in_memory.load();\n        torch::Tensor rand_tensor = getRandTensor(dim0_size, dim1_size, dtype_array[i]);\n\n        rand_tensor.slice(0, 0, dim0_size) = rand_tensors_array[i].slice(0, 0, dim0_size);\n        in_memory.shuffle();\n        rand_tensors_array[i] = in_memory.range(0, dim0_size);\n        
ASSERT_EQ(checkPermOf2dTensor(rand_tensors_array[i], rand_tensor), true);\n    }\n}\n\nTEST_F(InMemoryTest, TestInMemorySort) {\n    for (int i = 0; i < dtype_array.size(); i++) {\n        InMemory in_memory(filenames_array[i], rand_tensors_array[i], torch::kCPU);\n        in_memory.load();\n        torch::Tensor rand_tensor = getRandTensor(dim0_size, dim1_size, dtype_array[i]);\n\n        rand_tensor.slice(0, 0, dim0_size) = rand_tensors_array[i].slice(0, 0, dim0_size);\n        in_memory.sort(true);\n        rand_tensors_array[i] = in_memory.range(0, dim0_size);\n        rand_tensor.copy_(rand_tensor.index_select(0, torch::argsort(rand_tensor.select(1, 0))));\n        ASSERT_EQ(rand_tensor.equal(rand_tensors_array[i]), true);\n    }\n}\n\nTEST_F(InMemoryTest, TestInMemorySortEdges) {\n    torch::Tensor rand_tensor = getRandTensor(num_edges, num_cols, torch::kInt32, num_nodes);\n    createTmpFile(edges_data_path);\n\n    InMemory in_memory(edges_data_path, num_edges, num_cols, torch::kInt32, torch::kCPU);\n    vector<int64_t> partition_sizes = partitionEdges(rand_tensor, 3, num_nodes + 1);\n    createTmpFile(edges_bucket_partition_path);\n    {\n        std::ofstream ostrm;\n        ostrm.open(edges_bucket_partition_path, std::ios::out | std::ios::trunc);\n        for (int i = 0; i < partition_sizes.size(); i++) ostrm << partition_sizes[i] << \"\\n\";\n        ostrm.close();\n    }\n    in_memory.readPartitionSizes(edges_bucket_partition_path);\n    in_memory.load();\n\n    torch::Tensor rand_tensor_1 = getRandTensor(num_edges, num_cols, torch::kInt32, 10000);\n    torch::Tensor rand_tensor_2 = getRandTensor(num_edges, num_cols, torch::kInt32);\n    rand_tensor_1.copy_(in_memory.range(0, num_edges));\n    rand_tensor_2.copy_(rand_tensor_1);\n\n    in_memory.sort(true);\n    vector<int64_t> edge_bucket_sizes = in_memory.getEdgeBucketSizes();\n    sortWithinEdgeBuckets(rand_tensor_2, edge_bucket_sizes);\n    rand_tensor_1 = in_memory.range(0, num_edges);\n    
ASSERT_EQ(rand_tensor_1.equal(rand_tensor_2), true);\n\n    in_memory.sort(false);\n    sortWithinEdgeBuckets(rand_tensor_2, edge_bucket_sizes, -1);\n    rand_tensor_1 = in_memory.range(0, num_edges);\n    ASSERT_EQ(rand_tensor_1.equal(rand_tensor_2), true);\n\n    remove(edges_bucket_partition_path.c_str());\n    remove(edges_data_path.c_str());\n}\n\nTEST_F(PartitionBufferStorageTest, TestBufferOrdering) {\n    for (int i = 0; i < dtype_size_array.size(); i++) {\n        PartitionBufferStorage pbs(filenames_array[i], rand_tensors_array[i], options);\n        pbs.setBufferOrdering(buffer_states);\n        pbs.load();\n\n        std::vector<int> admits, evicts;\n        for (int i = 2; i <= 4; i++) {\n            tryNextSwapAndAssert(pbs, std::vector<int>(1, i), std::vector<int>(1, i - 1))\n        }\n        tryNextSwapAndAssert(pbs, std::vector<int>(1, 1), std::vector<int>(1, 0));\n        for (int i = 4; i >= 3; i--) {\n            tryNextSwapAndAssert(pbs, std::vector<int>(1, i - 1), std::vector<int>(1, i));\n        }\n        tryNextSwapAndAssert(pbs, std::vector<int>(1, 3), std::vector<int>(1, 1));\n        tryNextSwapAndAssert(pbs, std::vector<int>(1, 4), std::vector<int>(1, 3));\n        tryNextSwapAndAssert(pbs, std::vector<int>(1, 3), std::vector<int>(1, 2));\n        ASSERT_EQ(pbs.hasSwap(), false);\n    }\n}\n\nTEST_F(PartitionBufferStorageTest, TestIndexRead) {\n    for (int i = 0; i < dtype_size_array.size(); i++) {\n        PartitionBufferStorage pbs(filenames_array[i], rand_tensors_array[i], options);\n        pbs.setBufferOrdering(buffer_states);\n        pbs.load();\n\n        torch::Tensor indices = pbs.getRandomIds(20);\n        torch::Tensor expected = rand_tensors_array[i].index_select(0, indices);\n        ASSERT_EQ(expected.equal(pbs.indexRead(indices)), true);\n\n        // indexRead should take in only 1d tensors.\n        ASSERT_THROW(pbs.indexRead(torch::randint(1000, {10, 10}, torch::kInt64)), std::runtime_error);\n    
}\n}\n\nTEST_F(PartitionBufferStorageTest, TestRangePut) {\n    for (int i = 0; i < dtype_size_array.size(); i++) {\n        PartitionBufferStorage pbs(filenames_array[i], rand_tensors_array[i], options);\n        pbs.setBufferOrdering(buffer_states);\n        pbs.load();\n\n        torch::Tensor rand_sub_tensor = getRandTensor(20, dim1_size, dtype_array[i]);\n        pbs.rangePut(0, rand_sub_tensor);\n        pbs.unload(false);\n        pbs.load();\n        ASSERT_EQ(rand_sub_tensor.equal(pbs.indexRead(torch::arange(0, 20))), true);\n\n        pbs.performNextSwap();\n        rand_sub_tensor = getRandTensor(10, dim1_size, dtype_array[i]);\n        pbs.rangePut(20, rand_sub_tensor);\n        pbs.unload(false);\n        pbs.setBufferOrdering(buffer_states);\n        pbs.load();\n        pbs.performNextSwap();\n        ASSERT_EQ(rand_sub_tensor.equal(pbs.indexRead(torch::arange(10, 20))), true);\n    }\n}\n\nTEST_F(PartitionBufferStorageTest, TestIndexAdd) {\n    // just iterate over torch::kFloat32, indexAdd doesn't support any other dtype\n    for (int i = 3; i < 4; i++) {\n        PartitionBufferStorage pbs(filenames_array[i], rand_tensors_array[i], options);\n        pbs.setBufferOrdering(buffer_states);\n        pbs.load();\n\n        torch::Tensor indices = std::get<0>(at::_unique(pbs.getRandomIds(20)));\n        torch::Tensor rand_values = getRandTensor(indices.size(0), dim1_size, dtype_array[i]);\n        torch::Tensor updated_values = rand_tensors_array[i].index_add_(0, indices, rand_values).index_select(0, indices);\n        pbs.indexAdd(indices, rand_values);\n        ASSERT_EQ(updated_values.equal(pbs.indexRead(indices)), true);\n\n        ASSERT_EQ(pbs.getNumInMemory(), 20);\n\n        // indexAdd should check tensor dims\n        rand_values = getRandTensor(indices.size(0) + 1, dim1_size, dtype_array[i]);\n        ASSERT_THROW(pbs.indexAdd(indices, rand_values), std::runtime_error);\n        rand_values = getRandTensor(indices.size(0), dim1_size + 1, 
dtype_array[i]);\n        ASSERT_THROW(pbs.indexAdd(indices, rand_values), std::runtime_error);\n        rand_values = getRandTensor(indices.size(0), dim1_size, dtype_array[i]);\n        ASSERT_THROW(pbs.indexAdd(torch::randint(1000, {10, 10}, torch::kInt64), rand_values), std::runtime_error);\n        rand_values = torch::Tensor();\n        ASSERT_THROW(pbs.indexAdd(indices, rand_values), std::runtime_error);\n    }\n}"
  },
  {
    "path": "test/cpp/unit/testing_util.cpp",
    "content": "#include \"testing_util.h\"\n\n#include \"gtest/gtest.h\"\n#include \"util.h\"\n\nint createTmpFile(std::string &filename) { return open(filename.c_str(), O_RDWR | O_CREAT, 0777); }\n\ntorch::Tensor getRandTensor(int dim0_size, int dim1_size, torch::Dtype dtype, int max_val) {\n    if (dtype == torch::kInt32 || dtype == torch::kInt64) {\n        return torch::randint(max_val, {dim0_size, dim1_size}, dtype);\n    }\n    return torch::randn({dim0_size, dim1_size}, dtype);\n}\n\nint genRandTensorAndWriteToFile(torch::Tensor &rand_tensor, int total_embeddings, int embedding_size, torch::Dtype dtype, int fd) {\n    rand_tensor = getRandTensor(total_embeddings, embedding_size, dtype);\n    int tensor_size = embedding_size * get_dtype_size_wrapper(dtype);\n    return pwrite_wrapper(fd, rand_tensor.data_ptr(), total_embeddings * tensor_size, 0);\n}\n\nbool checkPermOf2dTensor(torch::Tensor &a, torch::Tensor &b) {\n    if (a.sizes().size() != b.sizes().size() || a.sizes().size() != 2) return false;\n    vector<int> has_seen_count(a.size(0), 0);\n    for (int i = 0; i < a.size(0); i++) {\n        for (int j = 0; j < b.size(0); j++) {\n            if (a[i].equal(b[j])) {\n                has_seen_count[i] += 1;\n            }\n        }\n    }\n    for (int i = 0; i < a.size(0); i++)\n        if (has_seen_count[i] != 1) return false;\n    return true;\n}\n\nvoid sortWithinEdgeBuckets(torch::Tensor &rand_tensor, vector<int64_t> &edge_bucket_sizes, int sort_dim) {\n    int64_t offset = 0;\n    for (auto itr = edge_bucket_sizes.begin(); itr != edge_bucket_sizes.end(); itr++) {\n        torch::Tensor edge_bucket = rand_tensor.slice(0, offset, offset + *itr);\n        edge_bucket.copy_(edge_bucket.index_select(0, torch::argsort(edge_bucket.select(1, sort_dim))));\n        rand_tensor.slice(0, offset, offset + *itr) = edge_bucket;\n        edge_bucket = torch::Tensor();\n        offset += *itr;\n    }\n}\n\nbool sortEdgesSrcDest(vector<int> &edge1, vector<int> 
&edge2) {\n    if (edge1[0] != edge2[0]) return edge1[0] < edge2[0];\n    if (edge1[2] != edge2[2]) return edge1[2] < edge2[2];\n    return false;\n}\n\nvector<int64_t> partitionEdges(torch::Tensor &edges, int num_partitions, int num_nodes) {\n    vector<vector<int>> edges_vec(edges.size(0), vector<int>(edges.size(1)));\n    for (int i = 0; i < edges_vec.size(); i++) {\n        for (int j = 0; j < edges_vec[i].size(); j++) {\n            edges_vec[i][j] = edges[i][j].item<int>();\n        }\n    }\n    sort(edges_vec.begin(), edges_vec.end(), sortEdgesSrcDest);\n    int partition_size = ceil(((double)num_nodes) / num_partitions);\n    std::pair<int, int> prev(edges_vec[0][0] / partition_size, edges_vec[0][2] / partition_size), cur;\n    int count = 1;\n    vector<int64_t> partition_sizes_;\n    for (int i = 1; i < edges_vec.size(); i++) {\n        cur = std::pair<int, int>(edges_vec[i][0] / partition_size, edges_vec[i][2] / partition_size);\n        if (cur == prev) {\n            count++;\n            continue;\n        }\n        partition_sizes_.push_back(count);\n        count = 1;\n        prev = cur;\n    }\n    partition_sizes_.push_back(count);\n    return partition_sizes_;\n}"
  },
  {
    "path": "test/cpp/unit/testing_util.h",
    "content": "#include <fcntl.h>\n#include <torch/torch.h>\n#include <unistd.h>\n\n#include <string>\n\n#include \"storage/storage.h\"\n\nint createTmpFile(std::string &filename);\n\ntorch::Tensor getRandTensor(int dim0_size, int dim1_size, torch::Dtype dtype, int max_val = 1000);\n\nint genRandTensorAndWriteToFile(torch::Tensor &rand_tensor, int total_embeddings, int embedding_size, torch::Dtype dtype, int fd);\n\nbool checkPermOf2dTensor(torch::Tensor &a, torch::Tensor &b);\n\nvoid sortWithinEdgeBuckets(torch::Tensor &rand_tensor, vector<int64_t> &edge_bucket_sizes, int sort_dim = 0);\n\nvector<int64_t> partitionEdges(torch::Tensor &edges, int num_partitions, int num_nodes);"
  },
  {
    "path": "test/db2graph/test_postgres.py",
    "content": "import os\nimport random\nfrom pathlib import Path\n\nimport psycopg2\n\nfrom src.python.tools.db2graph.marius_db2graph import connect_to_db, post_processing\n\n\nclass TestConnector:\n    database = \"postgres\"\n    user = \"postgres\"\n    password = \"postgres\"\n    host = os.environ.get(\"POSTGRES_HOST\")\n    port = os.environ.get(\"POSTGRES_PORT\")\n\n    customer_names = [\"sofia\", \"lukas\", \"rajesh\", \"daiyu\", \"hina\", \"lorenzo\", \"donghai\", \"shuchang\", \"johnny\"]\n    country_names = [\"spain\", \"germany\", \"india\", \"china\", \"japan\", \"italy\", \"china\", \"china\", \"usa\"]\n    item_names = [\n        \"fenugreek\",\n        \"soy sauce\",\n        \"oregano\",\n        \"tomato\",\n        \"cumin\",\n        \"soy sauce\",\n        \"eggs\",\n        \"onions\",\n        \"onions\",\n        \"wasabi\",\n        \"rice\",\n        \"chicken breast\",\n        \"salmon\",\n        \"sourdough bread\",\n        \"meatballs\",\n        \"root beer\",\n        \"croissant\",\n        \"taco sauce\",\n    ]\n\n    def fill_db(self):\n        \"\"\"\n        Filling the database with data for testing things\n        \"\"\"\n        conn = psycopg2.connect(\n            database=self.database, user=self.user, password=self.password, host=self.host, port=self.port\n        )\n        cur = conn.cursor()\n\n        # DROP TABLE IF EXISTS\n        cur.execute(\"DROP TABLE IF EXISTS ORDERS;\")\n        cur.execute(\"DROP TABLE IF EXISTS CUSTOMERS;\")\n        conn.commit()\n\n        # Create two tables - First Customers and second Orders\n        cur.execute(\n            \"\"\"CREATE TABLE CUSTOMERS\n                        (ID INT PRIMARY KEY NOT NULL,\n                        CUSTOMERNAME TEXT NOT NULL,\n                        COUNTRY TEXT NOT NULL,\n                        PHONE VARCHAR(10) NOT NULL);\"\"\"\n        )\n        conn.commit()\n        cur.execute(\n            \"\"\"CREATE TABLE ORDERS\n                    
    (ID INT PRIMARY KEY NOT NULL,\n                        CUSTOMERID INT NOT NULL,\n                        AMOUNT INT NOT NULL,\n                        ITEM TEXT NOT NULL,\n                        CONSTRAINT fk_customer\n                            FOREIGN KEY(CUSTOMERID)\n                                REFERENCES CUSTOMERS(ID));\"\"\"\n        )\n        conn.commit()\n\n        # Insert some data\n        # Inserting Customers\n        cur.execute(\n            f\"INSERT INTO CUSTOMERS (ID,CUSTOMERNAME,COUNTRY,PHONE)             VALUES (1, '{self.customer_names[0]}',\"\n            f\" '{self.country_names[0]}', '6081237654')\"\n        )\n        cur.execute(\n            f\"INSERT INTO CUSTOMERS (ID,CUSTOMERNAME,COUNTRY,PHONE)             VALUES (2, '{self.customer_names[1]}',\"\n            f\" '{self.country_names[1]}', '6721576540')\"\n        )\n        cur.execute(\n            f\"INSERT INTO CUSTOMERS (ID,CUSTOMERNAME,COUNTRY,PHONE)             VALUES (3, '{self.customer_names[2]}',\"\n            f\" '{self.country_names[2]}', '5511234567')\"\n        )\n        cur.execute(\n            f\"INSERT INTO CUSTOMERS (ID,CUSTOMERNAME,COUNTRY,PHONE)             VALUES (4, '{self.customer_names[3]}',\"\n            f\" '{self.country_names[3]}', '3211248173')\"\n        )\n        cur.execute(\n            f\"INSERT INTO CUSTOMERS (ID,CUSTOMERNAME,COUNTRY,PHONE)             VALUES (5, '{self.customer_names[4]}',\"\n            f\" '{self.country_names[4]}', '6667890001')\"\n        )\n        cur.execute(\n            f\"INSERT INTO CUSTOMERS (ID,CUSTOMERNAME,COUNTRY,PHONE)             VALUES (6, '{self.customer_names[5]}',\"\n            f\" '{self.country_names[5]}', '6260001111')\"\n        )\n        cur.execute(\n            f\"INSERT INTO CUSTOMERS (ID,CUSTOMERNAME,COUNTRY,PHONE)             VALUES (7, '{self.customer_names[6]}',\"\n            f\" '{self.country_names[6]}', '7874561234')\"\n        )\n        cur.execute(\n            f\"INSERT INTO 
CUSTOMERS (ID,CUSTOMERNAME,COUNTRY,PHONE)             VALUES (8, '{self.customer_names[7]}',\"\n            f\" '{self.country_names[7]}', '4041015059')\"\n        )\n        cur.execute(\n            f\"INSERT INTO CUSTOMERS (ID,CUSTOMERNAME,COUNTRY,PHONE)             VALUES (9, '{self.customer_names[8]}',\"\n            f\" '{self.country_names[8]}', '5647525398')\"\n        )\n        conn.commit()\n\n        # Inserting Orders\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (1, 3, 5, '{self.item_names[0]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (2, 7, 7, '{self.item_names[1]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (3, 6, 2, '{self.item_names[2]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (4, 1, 3, '{self.item_names[3]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (5, 3, 5, '{self.item_names[4]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (6, 5, 7, '{self.item_names[5]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (7, 2, 1, '{self.item_names[6]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (8, 9, 3, '{self.item_names[7]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (9, 4, 3, '{self.item_names[8]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (10, 5, 15, '{self.item_names[9]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO 
ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (11, 8, 9, '{self.item_names[10]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (12, 4, 12, '{self.item_names[11]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (13, 5, 20, '{self.item_names[12]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (14, 6, 11, '{self.item_names[13]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (15, 2, 8, '{self.item_names[14]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (16, 9, 2, '{self.item_names[15]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (17, 2, 6, '{self.item_names[16]}')\"\n        )\n        cur.execute(\n            f\"INSERT INTO ORDERS (ID,CUSTOMERID,AMOUNT,ITEM)             VALUES (18, 1, 4, '{self.item_names[17]}')\"\n        )\n        conn.commit()\n\n        conn.close()\n        return\n\n    def test_connect_to_db(self):\n        \"\"\"\n        Basic connecter to db test. 
Just checking if connection established\n        and corrected values are fetched\n        \"\"\"\n        # Filling database with data for testing\n        conn = psycopg2.connect(\n            database=self.database, user=self.user, password=self.password, host=self.host, port=self.port\n        )\n\n        # Create table\n        cur = conn.cursor()\n        cur.execute(\n            \"\"\"CREATE TABLE COMPANY\n            (ID INT PRIMARY KEY     NOT NULL,\n            NAME           TEXT    NOT NULL,\n            AGE            INT     NOT NULL);\"\"\"\n        )\n        conn.commit()\n\n        # Insert some data\n        num_data_to_insert = 5\n        self.name = []\n        self.age = []\n        for i in range(num_data_to_insert):\n            self.name.append(\"name\" + str(i))\n            self.age.append(random.randint(1, 100))\n\n        for i in range(num_data_to_insert):\n            cur.execute(\n                f\"INSERT INTO COMPANY (ID,NAME,AGE)                 VALUES ({i}, '{self.name[i]}', {self.age[i]})\"\n            )\n        conn.commit()\n        conn.close()\n\n        # Setting the connect function to test\n        conn = connect_to_db(\n            db_server=\"postgre-sql\",\n            db_name=self.database,\n            db_user=self.user,\n            db_password=self.password,\n            db_host=self.host,\n        )\n        cur = conn.cursor()\n        cur.execute(\"SELECT id, name, age from COMPANY\")\n        rows = cur.fetchall()\n        index = 0\n        for row in rows:\n            assert row[0] == index\n            assert row[1] == self.name[index]\n            assert row[2] == self.age[index]\n            index += 1\n        conn.close()\n\n    def test_edges_entity_entity(self):\n        \"\"\"\n        Testing edges_entity_entity type of queries which generate edges\n        \"\"\"\n        self.fill_db()  # Filling database with data for testing\n\n        # Getting all the inputs for the function\n        
output_dir = Path(\"output_dir_edges_entity_entity/\")\n        output_dir.mkdir(parents=True, exist_ok=True)\n\n        db_server = \"postgre-sql\"\n\n        conn = psycopg2.connect(\n            database=self.database, user=self.user, password=self.password, host=self.host, port=self.port\n        )\n\n        edge_entity_entity_queries_list = []\n        edge_entity_entity_queries_list.append(\n            \"SELECT customers.customername, customers.country FROM customers ORDER BY customers.customername ASC;\"\n        )\n        edge_entity_entity_queries_list.append(\n            \"SELECT orders.item, customers.country FROM orders, customers WHERE orders.customerid = customers.id ORDER\"\n            \" BY orders.item ASC;\"\n        )\n        edge_entity_entity_rel_list = [\"lives_in\", \"ordered_by_people_from_country\"]\n\n        # Testing the function\n        post_processing(output_dir, conn, edge_entity_entity_queries_list, edge_entity_entity_rel_list, db_server)\n\n        # Asserting the correctionness of the output\n        # Predefined correct output for the input queries\n        correct_output = []\n\n        # expected outputs for query 1\n        correct_output.append(\"customers_customername_daiyu\\tlives_in\\tcustomers_country_china\\n\")\n        correct_output.append(\"customers_customername_donghai\\tlives_in\\tcustomers_country_china\\n\")\n        correct_output.append(\"customers_customername_hina\\tlives_in\\tcustomers_country_japan\\n\")\n        correct_output.append(\"customers_customername_johnny\\tlives_in\\tcustomers_country_usa\\n\")\n        correct_output.append(\"customers_customername_lorenzo\\tlives_in\\tcustomers_country_italy\\n\")\n        correct_output.append(\"customers_customername_lukas\\tlives_in\\tcustomers_country_germany\\n\")\n        correct_output.append(\"customers_customername_rajesh\\tlives_in\\tcustomers_country_india\\n\")\n        
correct_output.append(\"customers_customername_shuchang\\tlives_in\\tcustomers_country_china\\n\")\n        correct_output.append(\"customers_customername_sofia\\tlives_in\\tcustomers_country_spain\\n\")\n\n        # expected outputs for query 2\n        correct_output.append(\"orders_item_chicken breast\\tordered_by_people_from_country\\tcustomers_country_china\\n\")\n        correct_output.append(\"orders_item_croissant\\tordered_by_people_from_country\\tcustomers_country_germany\\n\")\n        correct_output.append(\"orders_item_cumin\\tordered_by_people_from_country\\tcustomers_country_india\\n\")\n        correct_output.append(\"orders_item_eggs\\tordered_by_people_from_country\\tcustomers_country_germany\\n\")\n        correct_output.append(\"orders_item_fenugreek\\tordered_by_people_from_country\\tcustomers_country_india\\n\")\n        correct_output.append(\"orders_item_meatballs\\tordered_by_people_from_country\\tcustomers_country_germany\\n\")\n        correct_output.append(\"orders_item_onions\\tordered_by_people_from_country\\tcustomers_country_usa\\n\")\n        correct_output.append(\"orders_item_onions\\tordered_by_people_from_country\\tcustomers_country_china\\n\")\n        correct_output.append(\"orders_item_oregano\\tordered_by_people_from_country\\tcustomers_country_italy\\n\")\n        correct_output.append(\"orders_item_rice\\tordered_by_people_from_country\\tcustomers_country_china\\n\")\n        correct_output.append(\"orders_item_root beer\\tordered_by_people_from_country\\tcustomers_country_usa\\n\")\n        correct_output.append(\"orders_item_salmon\\tordered_by_people_from_country\\tcustomers_country_japan\\n\")\n        correct_output.append(\"orders_item_sourdough bread\\tordered_by_people_from_country\\tcustomers_country_italy\\n\")\n        correct_output.append(\"orders_item_soy sauce\\tordered_by_people_from_country\\tcustomers_country_japan\\n\")\n        correct_output.append(\"orders_item_soy 
sauce\\tordered_by_people_from_country\\tcustomers_country_china\\n\")\n        correct_output.append(\"orders_item_taco sauce\\tordered_by_people_from_country\\tcustomers_country_spain\\n\")\n        correct_output.append(\"orders_item_tomato\\tordered_by_people_from_country\\tcustomers_country_spain\\n\")\n        correct_output.append(\"orders_item_wasabi\\tordered_by_people_from_country\\tcustomers_country_japan\\n\")\n        with open(output_dir / \"edges.txt\", \"r\") as file:\n            for line in file:\n                assert line in correct_output\n\n        return\n"
  },
  {
    "path": "test/python/bindings/end_to_end/test_fb15k_acc.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\n\nimport pytest\n\n\nclass TestFB15K(unittest.TestCase):\n    @classmethod\n    def setUp(self):\n        if not Path(TMP_TEST_DIR).exists():\n            Path(TMP_TEST_DIR).mkdir()\n\n    @classmethod\n    def tearDown(self):\n        pass\n        if Path(TMP_TEST_DIR).exists():\n            shutil.rmtree(Path(TMP_TEST_DIR))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_one_epoch(self):\n        pass\n"
  },
  {
    "path": "test/python/bindings/end_to_end/test_interval_checkpointing.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\nfrom test.test_configs.generate_test_configs import generate_configs_for_dataset\nfrom test.test_data.generate import generate_random_dataset\n\nimport marius as m\n\n\ndef replace_string_in_file(filepath, before, after):\n    os.system(\"sed -i -E 's@{}@{}@g' {}\".format(before, after, filepath.__str__()))\n\n\ndef get_line_in_file(filepath, line_num):\n    return os.popen(\"sed '{}!d' {}\".format(line_num, filepath.__str__())).read().lstrip()\n\n\ndef run_config(config_file, enable_checkpointing, checkpoint_interval, save_state):\n    config = m.config.loadConfig(config_file.__str__(), True)\n    config.training.num_epochs = 6\n    if enable_checkpointing:\n        config.training.checkpoint.interval = checkpoint_interval\n        config.training.checkpoint.save_state = save_state\n    m.manager.marius_train(config)\n\n\nclass TestIntervalCheckpointing(unittest.TestCase):\n    base_dir = None\n    config_file = None\n\n    @classmethod\n    def setUp(self):\n        if not Path(TMP_TEST_DIR).exists():\n            Path(TMP_TEST_DIR).mkdir()\n        self.base_dir = TMP_TEST_DIR\n\n    @classmethod\n    def tearDown(self):\n        if Path(TMP_TEST_DIR).exists():\n            shutil.rmtree(Path(TMP_TEST_DIR))\n\n    def init_dataset_dir(self, name):\n        num_nodes = 100\n        num_rels = 10\n        num_edges = 1000\n\n        generate_random_dataset(\n            output_dir=Path(self.base_dir) / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            Path(self.base_dir) / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            
evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        for filename in os.listdir(Path(self.base_dir) / Path(name)):\n            if filename.startswith(\"M-\"):\n                self.config_file = Path(self.base_dir) / Path(name) / Path(filename)\n\n    def test_checkpointing_with_state(self):\n        name = \"test_checkpointing_with_state\"\n        self.init_dataset_dir(name)\n\n        # runs for a total of 6 epochs, checkpoints every 2 epochs. so checkpoint_2 & checkpoint_4 should exist\n        # checkpoint 6 shouldn't exist\n        run_config(self.config_file, True, 2, True)\n\n        config = m.config.loadConfig(self.config_file.__str__(), False)\n        checkpoint_2_path = Path(config.storage.model_dir) / Path(\"checkpoint_2\")\n        checkpoint_4_path = Path(config.storage.model_dir) / Path(\"checkpoint_4\")\n        checkpoint_6_path = Path(config.storage.model_dir) / Path(\"checkpoint_6\")\n        assert checkpoint_2_path.exists(), \"Expected to see checkpointed model and params in {}, but not found\".format(\n            str(checkpoint_2_path)\n        )\n        assert checkpoint_4_path.exists(), \"Expected to see checkpointed model and params in {}, but not found\".format(\n            str(checkpoint_4_path)\n        )\n        assert not checkpoint_6_path.exists(), \"{} shouldn't have been created\".format(str(checkpoint_6_path))\n\n        checkpoint_files = [\"model.pt\", \"model_state.pt\", \"embeddings.bin\", \"embeddings_state.bin\"]\n        for checkpoint_id in [\"checkpoint_2\", \"checkpoint_4\"]:\n            for f in checkpoint_files:\n                file_path_ = Path(config.storage.model_dir) / Path(checkpoint_id) / Path(f)\n                assert file_path_.exists(), \"Expected to see checkpointed file {}, but not found\".format(\n                    str(file_path_)\n                )\n\n        # resume training from checkpoint_4 and further train 5 epochs with checkpoint disabled.\n        # so the model 
stored would have ideally been trained for 9 epochs.\n        full_config_path = Path(config.storage.model_dir) / Path(\"full_config.yaml\")\n        replace_string_in_file(\n            full_config_path,\n            \"resume_from_checkpoint:.*\",\n            \"resume_from_checkpoint: {}/checkpoint_4\".format(config.storage.model_dir),\n        )\n        replace_string_in_file(full_config_path, \"model_dir:.*\", \"\")\n        run_config(full_config_path, False, -1, False)\n\n        config = m.config.loadConfig(self.config_file.__str__(), False)\n        metadata_file_path = Path(config.storage.model_dir) / Path(\"metadata.csv\")\n\n        trained_epochs = int(get_line_in_file(metadata_file_path, 2))\n        assert trained_epochs == 10, \"Expected to see trained epochs as {} in {}, but found {}\".format(\n            10, str(metadata_file_path), trained_epochs\n        )\n\n    def test_checkpointing_wo_state(self):\n        name = \"test_checkpointing_wo_state\"\n        self.init_dataset_dir(name)\n\n        # runs for a total of 6 epochs, checkpoints every 3 epochs. 
so checkpoint_3 alone should exist\n        run_config(self.config_file, True, 3, False)\n\n        config = m.config.loadConfig(self.config_file.__str__(), False)\n        checkpoint_3_path = Path(config.storage.model_dir) / Path(\"checkpoint_3\")\n        checkpoint_6_path = Path(config.storage.model_dir) / Path(\"checkpoint_6\")\n        assert checkpoint_3_path.exists(), \"Expected to see checkpointed model and params in {}, but not found\".format(\n            str(checkpoint_3_path)\n        )\n        assert not checkpoint_6_path.exists(), \"{} shouldn't have been created\".format(str(checkpoint_6_path))\n\n        checkpoint_files = [\"model.pt\", \"model_state.pt\", \"embeddings.bin\"]\n        for checkpoint_id in [\"checkpoint_3\"]:\n            for f in checkpoint_files:\n                file_path_ = Path(config.storage.model_dir) / Path(checkpoint_id) / Path(f)\n                assert file_path_.exists(), \"Expected to see checkpointed file {}, but not found\".format(\n                    str(file_path_)\n                )\n"
  },
  {
    "path": "test/python/bindings/end_to_end/test_lp_basic.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\nfrom test.test_configs.generate_test_configs import generate_configs_for_dataset\nfrom test.test_data.generate import generate_random_dataset\n\nimport pytest\n\nimport marius as m\n\n\ndef run_configs(directory, partitioned_eval=False):\n    for filename in os.listdir(directory):\n        if filename.startswith(\"M-\"):\n            config_file = directory / Path(filename)\n            print(\"|||||||||||||||| RUNNING CONFIG ||||||||||||||||\")\n            print(config_file)\n            config = m.config.loadConfig(config_file.__str__(), True)\n\n            if partitioned_eval:\n                config.storage.full_graph_evaluation = False\n\n            m.manager.marius_train(config)\n\n\nclass TestLP(unittest.TestCase):\n    output_dir = TMP_TEST_DIR / Path(\"relations\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        num_nodes = 100\n        num_rels = 10\n        num_edges = 1000\n\n        name = \"test_graph\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            task=\"lp\",\n        )\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_dm(self):\n        name = \"dm\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"in_memory\"],\n            
training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs(self):\n        name = \"gs\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs_uniform(self):\n        name = \"gs_uniform\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_uniform\", \"gs_3_layer_uniform\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skip(\"GAT only supported for GPU\")\n    def test_gat(self):\n        name = \"gat\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gat_1_layer\", \"gat_3_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / 
Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_sync_training(self):\n        name = \"sync_training\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\", \"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync_deg\", \"sync_filtered\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async_training(self):\n        name = \"async_training\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\", \"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"async\", \"async_deg\", \"async_filtered\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_sync_eval(self):\n        name = \"sync_eval\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\", \"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\", \"sync_deg\", 
\"sync_filtered\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async_eval(self):\n        name = \"async_eval\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\", \"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"async\", \"async_deg\", \"async_filtered\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n\nclass TestLPNoRelations(unittest.TestCase):\n    output_dir = TMP_TEST_DIR / Path(\"no_relations\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        num_nodes = 100\n        num_rels = 1\n        num_edges = 1000\n\n        name = \"test_graph\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            task=\"lp\",\n        )\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_dm(self):\n        name = \"dm\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            
storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs(self):\n        name = \"gs\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs_uniform(self):\n        name = \"gs_uniform\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_uniform\", \"gs_3_layer_uniform\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skip(\"GAT only supported for GPU\")\n    def test_gat(self):\n        name = \"gat\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gat_1_layer\", \"gat_3_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        
)\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_sync_training(self):\n        name = \"sync_training\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\", \"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync_deg\", \"sync_filtered\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async_training(self):\n        name = \"async_training\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\", \"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"async\", \"async_deg\", \"async_filtered\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_sync_eval(self):\n        name = \"sync_eval\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\", \"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            
evaluation_names=[\"sync\", \"sync_deg\", \"sync_filtered\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async_eval(self):\n        name = \"async_eval\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\", \"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"async\", \"async_deg\", \"async_filtered\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n"
  },
  {
    "path": "test/python/bindings/end_to_end/test_lp_buffer.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\nfrom test.test_configs.generate_test_configs import generate_configs_for_dataset\nfrom test.test_data.generate import generate_random_dataset\n\nimport pytest\n\nimport marius as m\n\n\ndef run_configs(directory, partitioned_eval=False):\n    for filename in os.listdir(directory):\n        if filename.startswith(\"M-\"):\n            config_file = directory / Path(filename)\n            print(\"|||||||||||||||| RUNNING CONFIG ||||||||||||||||\")\n            print(config_file)\n            config = m.config.loadConfig(config_file.__str__(), True)\n\n            if partitioned_eval:\n                config.storage.full_graph_evaluation = False\n\n            m.manager.marius_train(config)\n\n\n# @pytest.mark.skip(\"Buffer tests currently flakey with python API\")\nclass TestLPBuffer(unittest.TestCase):\n    output_dir = TMP_TEST_DIR / Path(\"buffer\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        num_nodes = 100\n        num_rels = 10\n        num_edges = 1000\n\n        name = \"test_graph\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            num_partitions=8,\n            splits=[0.9, 0.05, 0.05],\n            task=\"lp\",\n        )\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_dm(self):\n        name = \"basic_dm\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir 
/ Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skip(\"Known issue with GNN + buffer\")\n    def test_gs(self):\n        name = \"basic_gs\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skip(\"Known issue with GNN + buffer\")\n    def test_gs_uniform(self):\n        name = \"basic_gs_uniform\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_uniform\", \"gs_3_layer_uniform\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skip(\"GAT only supported for GPU\")\n    def test_gat(self):\n        name = \"basic_gat\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gat_1_layer\", \"gat_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir 
/ Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"This test can be flakey: periodically hangs for some reason.\")\n    def test_sync_training(self):\n        name = \"sync_training\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync_deg\", \"sync_filtered\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async_training(self):\n        name = \"async_training\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"async\", \"async_deg\", \"async_filtered\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_sync_eval(self):\n        name = \"sync_eval\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n         
   evaluation_names=[\"sync\", \"sync_deg\", \"sync_filtered\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async_eval(self):\n        name = \"async_eval\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"async\", \"async_deg\", \"async_filtered\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_partitioned_eval(self):\n        num_nodes = 100\n        num_rels = 10\n        num_edges = 1000\n\n        name = \"partitioned_eval\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            num_partitions=8,\n            partitioned_eval=True,\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],  # , \"async\", \"async_deg\", \"async_filtered\"], # RW: async test currently flakey\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=True)\n\n\nclass TestLPBufferNoRelations(unittest.TestCase):\n   
 output_dir = TMP_TEST_DIR / Path(\"buffer_no_relations\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        num_nodes = 100\n        num_rels = 1\n        num_edges = 1000\n\n        name = \"test_graph\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            num_partitions=8,\n            splits=[0.9, 0.05, 0.05],\n            task=\"lp\",\n        )\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_dm(self):\n        name = \"dm\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Known issue with GNN + buffer\")\n    def test_gs(self):\n        name = \"gs\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        
run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Known issue with GNN + buffer\")\n    def test_gs_uniform(self):\n        name = \"gs_uniform\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_uniform\", \"gs_3_layer_uniform\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skip(\"GAT only supported for GPU\")\n    def test_gat(self):\n        name = \"gat\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gat_1_layer\", \"gat_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"This test can be flakey: periodically hangs for some reason.\")\n    def test_sync_training(self):\n        name = \"sync_training\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync_deg\", \"sync_filtered\"],\n            evaluation_names=[\"sync\"],\n           
 task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async_training(self):\n        name = \"async_training\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"async\", \"async_deg\", \"async_filtered\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_sync_eval(self):\n        name = \"sync_eval\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\", \"sync_deg\", \"sync_filtered\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async_eval(self):\n        name = \"async_eval\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            
training_names=[\"sync\"],\n            evaluation_names=[\"async\", \"async_deg\", \"async_filtered\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_partitioned_eval(self):\n        num_nodes = 100\n        num_rels = 1\n        num_edges = 1000\n\n        name = \"partitioned_eval\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            num_partitions=8,\n            partitioned_eval=True,\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],  # , \"async\", \"async_deg\", \"async_filtered\"], # RW: async test currently flakey\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=True)\n"
  },
  {
    "path": "test/python/bindings/end_to_end/test_lp_storage.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\nfrom test.test_configs.generate_test_configs import generate_configs_for_dataset\nfrom test.test_data.generate import generate_random_dataset\n\nimport pytest\n\nimport marius as m\n\n\ndef run_configs(directory, partitioned_eval=False):\n    for filename in os.listdir(directory):\n        if filename.startswith(\"M-\"):\n            config_file = directory / Path(filename)\n            print(\"|||||||||||||||| RUNNING CONFIG ||||||||||||||||\")\n            print(config_file)\n            config = m.config.loadConfig(config_file.__str__(), True)\n\n            if partitioned_eval:\n                config.storage.full_graph_evaluation = False\n\n            m.manager.marius_train(config)\n\n\nclass TestLPStorage(unittest.TestCase):\n    output_dir = TMP_TEST_DIR / Path(\"storage\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_no_valid(self):\n        num_nodes = 100\n        num_rels = 10\n        num_edges = 1000\n\n        name = \"no_valid\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.1],\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        
run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_only_train(self):\n        num_nodes = 100\n        num_rels = 10\n        num_edges = 1000\n\n        name = \"only_train\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_no_valid_no_relations(self):\n        num_nodes = 100\n        num_rels = 1\n        num_edges = 1000\n\n        name = \"no_valid_no_relations\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.1],\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_only_train_no_relations(self):\n        num_nodes = 100\n        num_rels = 1\n        num_edges 
= 1000\n\n        name = \"only_train_no_relations\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_no_valid_buffer(self):\n        num_nodes = 100\n        num_rels = 10\n        num_edges = 1000\n\n        name = \"no_valid_buffer\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.1],\n            num_partitions=8,\n            partitioned_eval=True,\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=True)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_only_train_buffer(self):\n        num_nodes = 100\n        num_rels = 10\n        num_edges = 1000\n\n        name = \"only_train_buffer\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n          
  num_edges=num_edges,\n            num_rels=num_rels,\n            num_partitions=8,\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_no_valid_buffer_no_relations(self):\n        num_nodes = 100\n        num_rels = 1\n        num_edges = 1000\n\n        name = \"no_valid_buffer_no_relations\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.1],\n            num_partitions=8,\n            partitioned_eval=True,\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=True)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_only_train_buffer_no_relations(self):\n        num_nodes = 100\n        num_rels = 1\n        num_edges = 1000\n\n        name = \"only_train_buffer_no_relations\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            num_partitions=8,\n            
task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n"
  },
  {
    "path": "test/python/bindings/end_to_end/test_model_dir.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\nfrom test.test_configs.generate_test_configs import generate_configs_for_dataset\nfrom test.test_data.generate import generate_random_dataset\n\nimport pytest\n\nimport marius as m\n\n\ndef run_configs(directory, model_dir=None, partitioned_eval=False, sequential_train_nodes=False):\n    for filename in os.listdir(directory):\n        if filename.startswith(\"M-\"):\n            config_file = directory / Path(filename)\n            print(\"|||||||||||||||| RUNNING CONFIG ||||||||||||||||\")\n            print(config_file)\n            config = m.config.loadConfig(config_file.__str__(), True)\n\n            if model_dir is not None:\n                config.storage.model_dir = model_dir + \"/\"\n                relation_mapping_filepath = (\n                    Path(config.storage.dataset.dataset_dir) / Path(\"edges\") / Path(\"relation_mapping.txt\")\n                )\n                if relation_mapping_filepath.exists():\n                    shutil.copy(\n                        str(relation_mapping_filepath), \"{}/{}\".format(config.storage.model_dir, \"relation_mapping.txt\")\n                    )\n\n                node_mapping_filepath = (\n                    Path(config.storage.dataset.dataset_dir) / Path(\"nodes\") / Path(\"node_mapping.txt\")\n                )\n                if node_mapping_filepath.exists():\n                    shutil.copy(\n                        str(node_mapping_filepath), \"{}/{}\".format(config.storage.model_dir, \"node_mapping.txt\")\n                    )\n\n            if partitioned_eval:\n                config.storage.full_graph_evaluation = False\n\n            if sequential_train_nodes:\n                config.storage.embeddings.options.node_partition_ordering = m.config.NodePartitionOrdering.SEQUENTIAL\n                config.storage.features.options.node_partition_ordering = 
m.config.NodePartitionOrdering.SEQUENTIAL\n\n            m.manager.marius_train(config)\n\n\ndef has_model_params(model_dir_path, task=\"lp\", has_embeddings=False, has_relations=True):\n    if not model_dir_path.exists():\n        return False, \"{} directory with model params not found\".format(model_dir_path)\n\n    model_file = model_dir_path / Path(\"model.pt\")\n    if not model_file.exists():\n        return False, \"{} not found\".format(model_file)\n\n    model_state_file = model_dir_path / Path(\"model_state.pt\")\n    if not model_state_file.exists():\n        return False, \"{} not found\".format(model_state_file)\n\n    node_mapping_file = model_dir_path / Path(\"node_mapping.txt\")\n    if not node_mapping_file.exists():\n        return False, \"{} not found\".format(node_mapping_file)\n\n    if has_relations:\n        relation_mapping_file = model_dir_path / Path(\"relation_mapping.txt\")\n        if not relation_mapping_file.exists():\n            return False, \"{} not found\".format(relation_mapping_file)\n\n    if task == \"lp\" or has_embeddings:\n        embeddings_file = model_dir_path / Path(\"embeddings.bin\")\n        if not embeddings_file.exists():\n            return False, \"{} not found\".format(embeddings_file)\n\n        embeddings_state_file = model_dir_path / Path(\"embeddings_state.bin\")\n        if not embeddings_state_file.exists():\n            return False, \"{} not found\".format(embeddings_state_file)\n\n    return True, \"\"\n\n\nclass TestLP(unittest.TestCase):\n    output_dir = TMP_TEST_DIR / Path(\"relations\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        num_nodes = 100\n        num_rels = 10\n        num_edges = 1000\n\n        name = \"test_graph\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            
num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            task=\"lp\",\n        )\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_dm(self):\n        name = \"dm\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n        model_dir_path = self.output_dir / Path(\"test_graph\") / Path(\"model_0\")\n        ret, err = has_model_params(model_dir_path)\n        assert ret is True, err\n\n        run_configs(self.output_dir / Path(name))\n        model_dir_path = self.output_dir / Path(\"test_graph\") / Path(\"model_1\")\n        ret, err = has_model_params(model_dir_path)\n        assert ret is True, err\n\n        for i in range(2, 11):\n            model_dir_path = self.output_dir / Path(\"test_graph\") / Path(\"model_{}\".format(i))\n            model_dir_path.mkdir(parents=True, exist_ok=True)\n\n        model_dir_path = self.output_dir / Path(\"test_graph\") / Path(\"model_10\")\n        ret, err = has_model_params(model_dir_path)\n        assert ret is False, err\n\n        run_configs(self.output_dir / Path(name))\n        ret, err = has_model_params(model_dir_path)\n        assert ret is True, err\n\n        model_dir_path = self.output_dir / Path(name)\n        run_configs(self.output_dir / Path(name), str(model_dir_path))\n        ret, err = has_model_params(model_dir_path)\n        assert ret is True, err\n\n\nclass TestNC(unittest.TestCase):\n    
output_dir = TMP_TEST_DIR / Path(\"relations\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        num_nodes = 500\n        num_rels = 10\n        num_edges = 10000\n\n        name = \"test_graph\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs(self):\n        name = \"gs\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n        model_dir_path = self.output_dir / Path(name)\n        run_configs(self.output_dir / Path(name), str(model_dir_path))\n        ret, err = has_model_params(model_dir_path, \"nc\")\n        assert ret is True, err\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async(self):\n        name = \"async\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            
model_names=[\"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"async\"],\n            evaluation_names=[\"async\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n        model_dir_path = self.output_dir / Path(name)\n        run_configs(self.output_dir / Path(name), str(model_dir_path))\n        ret, err = has_model_params(model_dir_path, \"nc\")\n        assert ret is True, err\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_emb(self):\n        name = \"emb\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\", \"gs_3_layer_emb\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n        model_dir_path = self.output_dir / Path(name)\n        run_configs(self.output_dir / Path(name), str(model_dir_path))\n        ret, err = has_model_params(model_dir_path, \"nc\", True)\n        assert ret is True, err\n\n\nclass TestLPBufferNoRelations(unittest.TestCase):\n    output_dir = TMP_TEST_DIR / Path(\"buffer_no_relations\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        num_nodes = 100\n        num_rels = 1\n        num_edges = 1000\n\n        name = \"test_graph\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            num_partitions=8,\n            splits=[0.9, 0.05, 0.05],\n            task=\"lp\",\n        )\n\n    
@classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_dm(self):\n        name = \"dm\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n        model_dir_path = self.output_dir / Path(name)\n        run_configs(self.output_dir / Path(name), str(model_dir_path))\n        ret, err = has_model_params(model_dir_path, \"lp\", has_relations=False)\n        assert ret is True, err\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_partitioned_eval(self):\n        num_nodes = 100\n        num_rels = 1\n        num_edges = 1000\n\n        name = \"partitioned_eval\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            num_partitions=8,\n            partitioned_eval=True,\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],  # , \"async\", \"async_deg\", \"async_filtered\"], # RW: async test currently flakey\n            task=\"lp\",\n        )\n\n        
run_configs(self.output_dir / Path(name), partitioned_eval=True)\n\n        model_dir_path = self.output_dir / Path(name)\n        run_configs(self.output_dir / Path(name), str(model_dir_path))\n        ret, err = has_model_params(model_dir_path, \"lp\", has_relations=False)\n        assert ret is True, err\n\n\nclass TestNCBuffer(unittest.TestCase):\n    output_dir = TMP_TEST_DIR / Path(\"buffer\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        num_nodes = 500\n        num_rels = 10\n        num_edges = 10000\n\n        name = \"test_graph\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            num_partitions=8,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs(self):\n        name = \"gs\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n        model_dir_path = self.output_dir / Path(name)\n        run_configs(self.output_dir / Path(name), str(model_dir_path))\n        ret, err = has_model_params(model_dir_path, \"nc\")\n        assert ret is True, err\n\n    # 
@pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async(self):\n        name = \"async\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"async\"],\n            evaluation_names=[\"async\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n        model_dir_path = self.output_dir / Path(name)\n        run_configs(self.output_dir / Path(name), str(model_dir_path))\n        ret, err = has_model_params(model_dir_path, \"nc\")\n        assert ret is True, err\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_emb(self):\n        name = \"emb\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\", \"gs_3_layer_emb\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n        model_dir_path = self.output_dir / Path(name)\n        run_configs(self.output_dir / Path(name), str(model_dir_path))\n        ret, err = has_model_params(model_dir_path, \"nc\", True)\n        assert ret is True, err\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_partitioned_eval(self):\n        num_nodes = 500\n        num_rels = 10\n        num_edges 
= 10000\n\n        name = \"partitioned_eval\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            num_partitions=8,\n            partitioned_eval=True,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\", \"gs_3_layer_emb\", \"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=True)\n\n        model_dir_path = self.output_dir / Path(name)\n        run_configs(self.output_dir / Path(name), str(model_dir_path))\n        ret, err = has_model_params(model_dir_path, \"nc\", True)\n        assert ret is True, err\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Sequential ordering tests currently flakey at small scale\")\n    def test_sequential(self):\n        num_nodes = 500\n        num_rels = 10\n        num_edges = 10000\n\n        name = \"sequential_ordering\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.1, 0.05, 0.05],\n            num_partitions=8,\n            partitioned_eval=True,\n            sequential_train_nodes=True,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\", \"gs_3_layer_emb\", \"gs_1_layer\", 
\"gs_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=True, sequential_train_nodes=True)\n\n        model_dir_path = self.output_dir / Path(name)\n        run_configs(self.output_dir / Path(name), str(model_dir_path))\n        ret, err = has_model_params(model_dir_path, \"nc\", True)\n        assert ret is True, err\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=False, sequential_train_nodes=True)\n\n        model_dir_path = self.output_dir / Path(name) / Path(\"_1\")\n        run_configs(self.output_dir / Path(name), str(model_dir_path))\n        ret, err = has_model_params(model_dir_path, \"nc\", True)\n        assert ret is True, err\n"
  },
  {
    "path": "test/python/bindings/end_to_end/test_nc_basic.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\nfrom test.test_configs.generate_test_configs import generate_configs_for_dataset\nfrom test.test_data.generate import generate_random_dataset\n\nimport pytest\n\nimport marius as m\n\n\ndef run_configs(directory, partitioned_eval=False):\n    for filename in os.listdir(directory):\n        if filename.startswith(\"M-\"):\n            config_file = directory / Path(filename)\n            print(\"|||||||||||||||| RUNNING CONFIG ||||||||||||||||\")\n            print(config_file)\n            config = m.config.loadConfig(config_file.__str__(), True)\n\n            if partitioned_eval:\n                config.storage.full_graph_evaluation = False\n\n            m.manager.marius_train(config)\n\n\nclass TestNC(unittest.TestCase):\n    output_dir = TMP_TEST_DIR / Path(\"relations\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        num_nodes = 500\n        num_rels = 10\n        num_edges = 10000\n\n        name = \"test_graph\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs(self):\n        name = \"gs\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\", \"gs_3_layer\"],\n         
   storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs_uniform(self):\n        name = \"gs_uniform\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_uniform\", \"gs_3_layer_uniform\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skip(\"GAT only supported for GPU\")\n    def test_gat(self):\n        name = \"gat\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gat_1_layer\", \"gat_3_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async(self):\n        name = \"async\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"async\"],\n            
evaluation_names=[\"async\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_emb(self):\n        name = \"emb\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\", \"gs_3_layer_emb\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n\nclass TestNCNoRelations(unittest.TestCase):\n    output_dir = TMP_TEST_DIR / Path(\"no_relations\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        num_nodes = 500\n        num_rels = 1\n        num_edges = 10000\n\n        name = \"test_graph\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs(self):\n        name = \"gs\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"in_memory\"],\n            
training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs_uniform(self):\n        name = \"gs_uniform\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_uniform\", \"gs_3_layer_uniform\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skip(\"GAT only supported for GPU\")\n    def test_gat(self):\n        name = \"gat\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gat_1_layer\", \"gat_3_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async(self):\n        name = \"async\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"async\"],\n            evaluation_names=[\"async\"],\n            task=\"nc\",\n 
       )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_emb(self):\n        name = \"emb\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\", \"gs_3_layer_emb\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n"
  },
  {
    "path": "test/python/bindings/end_to_end/test_nc_buffer.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\nfrom test.test_configs.generate_test_configs import generate_configs_for_dataset\nfrom test.test_data.generate import generate_random_dataset\n\nimport pytest\n\nimport marius as m\n\n\ndef run_configs(directory, partitioned_eval=False, sequential_train_nodes=False):\n    for filename in os.listdir(directory):\n        if filename.startswith(\"M-\"):\n            config_file = directory / Path(filename)\n            print(\"|||||||||||||||| RUNNING CONFIG ||||||||||||||||\")\n            print(config_file)\n            config = m.config.loadConfig(config_file.__str__(), True)\n\n            if partitioned_eval:\n                config.storage.full_graph_evaluation = False\n\n            if sequential_train_nodes:\n                config.storage.embeddings.options.node_partition_ordering = m.config.NodePartitionOrdering.SEQUENTIAL\n                config.storage.features.options.node_partition_ordering = m.config.NodePartitionOrdering.SEQUENTIAL\n\n            m.manager.marius_train(config)\n\n\nclass TestNCBuffer(unittest.TestCase):\n    output_dir = TMP_TEST_DIR / Path(\"buffer\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        num_nodes = 500\n        num_rels = 10\n        num_edges = 10000\n\n        name = \"test_graph\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            num_partitions=8,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == 
\"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs(self):\n        name = \"gs\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs_uniform(self):\n        name = \"gs_uniform\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_uniform\", \"gs_3_layer_uniform\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skip(\"GAT only supported for GPU\")\n    def test_gat(self):\n        name = \"gat\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gat_1_layer\", \"gat_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async(self):\n  
      name = \"async\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"async\"],\n            evaluation_names=[\"async\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_emb(self):\n        name = \"emb\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\", \"gs_3_layer_emb\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_partitioned_eval(self):\n        num_nodes = 500\n        num_rels = 10\n        num_edges = 10000\n\n        name = \"partitioned_eval\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            num_partitions=8,\n            partitioned_eval=True,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\", \"gs_3_layer_emb\", \"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            
training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=True)\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Sequential ordering tests currently flakey at small scale\")\n    def test_sequential(self):\n        num_nodes = 500\n        num_rels = 10\n        num_edges = 10000\n\n        name = \"sequential_ordering\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.1, 0.05, 0.05],\n            num_partitions=8,\n            partitioned_eval=True,\n            sequential_train_nodes=True,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\", \"gs_3_layer_emb\", \"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=True, sequential_train_nodes=True)\n        run_configs(self.output_dir / Path(name), partitioned_eval=False, sequential_train_nodes=True)\n\n\nclass TestNCBufferNoRelations(unittest.TestCase):\n    output_dir = TMP_TEST_DIR / Path(\"buffer_no_relations\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        num_nodes = 500\n        num_rels = 1\n        num_edges = 10000\n\n        name = \"test_graph\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            
num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            num_partitions=8,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs(self):\n        name = \"gs\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_gs_uniform(self):\n        name = \"gs_uniform\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_uniform\", \"gs_3_layer_uniform\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skip(\"GAT only supported for GPU\")\n    def test_gat(self):\n        name = \"gat\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gat_1_layer\", \"gat_3_layer\"],\n            
storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Async test currently flakey.\")\n    def test_async(self):\n        name = \"async\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"async\"],\n            evaluation_names=[\"async\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_emb(self):\n        name = \"emb\"\n        shutil.copytree(self.output_dir / Path(\"test_graph\"), self.output_dir / Path(name))\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\", \"gs_3_layer_emb\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_partitioned_eval(self):\n        num_nodes = 500\n        num_rels = 1\n        num_edges = 10000\n\n        name = \"partitioned_eval\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n    
        splits=[0.9, 0.05, 0.05],\n            num_partitions=8,\n            partitioned_eval=True,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\", \"gs_3_layer_emb\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=True)\n\n    # @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    @pytest.mark.skip(\"Sequential ordering tests currently flakey at small scale\")\n    def test_sequential(self):\n        num_nodes = 500\n        num_rels = 1\n        num_edges = 10000\n\n        name = \"sequential_ordering\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.1, 0.05, 0.05],\n            num_partitions=8,\n            partitioned_eval=True,\n            sequential_train_nodes=True,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\", \"gs_3_layer_emb\", \"gs_1_layer\", \"gs_3_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=True, sequential_train_nodes=True)\n        run_configs(self.output_dir / Path(name), partitioned_eval=False, sequential_train_nodes=True)\n"
  },
  {
    "path": "test/python/bindings/end_to_end/test_nc_storage.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\nfrom test.test_configs.generate_test_configs import generate_configs_for_dataset\nfrom test.test_data.generate import generate_random_dataset\n\nimport pytest\n\nimport marius as m\n\n\ndef run_configs(directory, partitioned_eval=False):\n    for filename in os.listdir(directory):\n        if filename.startswith(\"M-\"):\n            config_file = directory / Path(filename)\n            print(\"|||||||||||||||| RUNNING CONFIG ||||||||||||||||\")\n            print(config_file)\n            config = m.config.loadConfig(config_file.__str__(), True)\n\n            if partitioned_eval:\n                config.storage.full_graph_evaluation = False\n\n            m.manager.marius_train(config)\n\n\nclass TestNCStorage(unittest.TestCase):\n    output_dir = TMP_TEST_DIR / Path(\"storage\")\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_no_valid(self):\n        num_nodes = 500\n        num_rels = 10\n        num_edges = 10000\n\n        name = \"no_valid\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.1],\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            
task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_only_train(self):\n        num_nodes = 500\n        num_rels = 10\n        num_edges = 10000\n\n        name = \"only_train\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_no_valid_no_relations(self):\n        num_nodes = 500\n        num_rels = 1\n        num_edges = 10000\n\n        name = \"no_valid_no_relations\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.1],\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def 
test_only_train_no_relations(self):\n        num_nodes = 500\n        num_rels = 1\n        num_edges = 10000\n\n        name = \"only_train_no_relations\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_no_valid_buffer(self):\n        num_nodes = 500\n        num_rels = 10\n        num_edges = 10000\n\n        name = \"no_valid_buffer\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.1],\n            feature_dim=10,\n            num_partitions=8,\n            partitioned_eval=True,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=True)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_only_train_buffer(self):\n        num_nodes = 500\n        num_rels = 10\n        num_edges = 10000\n\n   
     name = \"only_train_buffer\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            feature_dim=10,\n            num_partitions=8,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_no_valid_buffer_no_relations(self):\n        num_nodes = 500\n        num_rels = 1\n        num_edges = 10000\n\n        name = \"no_valid_buffer_no_relations\"\n        generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.1],\n            num_partitions=8,\n            partitioned_eval=True,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name), partitioned_eval=True)\n\n    @pytest.mark.skipif(os.environ.get(\"MARIUS_NO_BINDINGS\", None) == \"TRUE\", reason=\"Requires building the bindings\")\n    def test_only_train_buffer_no_relations(self):\n        num_nodes = 500\n        num_rels = 1\n        num_edges = 10000\n\n        name = \"only_train_buffer_no_relations\"\n    
    generate_random_dataset(\n            output_dir=self.output_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            num_partitions=8,\n            feature_dim=10,\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            self.output_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        run_configs(self.output_dir / Path(name))\n"
  },
  {
    "path": "test/python/bindings/end_to_end/test_resume_training.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\nfrom test.test_configs.generate_test_configs import generate_configs_for_dataset\nfrom test.test_data.generate import generate_random_dataset\n\nimport marius as m\n\n\ndef replace_string_in_file(filepath, before, after):\n    os.system(\"sed -i -E 's@{}@{}@g' {}\".format(before, after, filepath.__str__()))\n\n\ndef get_line_in_file(filepath, line_num):\n    return os.popen(\"sed '{}!d' {}\".format(line_num, filepath.__str__())).read().lstrip()\n\n\ndef run_config(config_file):\n    config = m.config.loadConfig(config_file.__str__(), True)\n    m.manager.marius_train(config)\n\n\nclass TestResumeTraining(unittest.TestCase):\n    base_dir = None\n    config_file = None\n\n    @classmethod\n    def setUp(self):\n        if not Path(TMP_TEST_DIR).exists():\n            Path(TMP_TEST_DIR).mkdir()\n        self.base_dir = TMP_TEST_DIR\n\n    @classmethod\n    def tearDown(self):\n        if Path(TMP_TEST_DIR).exists():\n            shutil.rmtree(Path(TMP_TEST_DIR))\n\n    def init_dataset_dir(self, name):\n        num_nodes = 100\n        num_rels = 10\n        num_edges = 1000\n\n        generate_random_dataset(\n            output_dir=Path(self.base_dir) / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            Path(self.base_dir) / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        for filename in os.listdir(Path(self.base_dir) / Path(name)):\n            if filename.startswith(\"M-\"):\n                self.config_file = Path(self.base_dir) / Path(name) / Path(filename)\n          
      config = m.config.loadConfig(self.config_file.__str__(), True)\n                m.manager.marius_train(config)\n\n    def test_resume_training_model_dir(self):\n        name = \"model_dir\"\n        self.init_dataset_dir(name)\n\n        config = m.config.loadConfig(self.config_file.__str__(), False)\n        metadata_file_path = Path(config.storage.model_dir) / Path(\"metadata.csv\")\n\n        trained_epochs = int(get_line_in_file(metadata_file_path, 2))\n        assert trained_epochs == 2, \"Expected to see trained epochs as {} in {}, but found {}\".format(\n            2, str(metadata_file_path), trained_epochs\n        )\n\n        full_config_path = Path(config.storage.model_dir) / Path(\"full_config.yaml\")\n        replace_string_in_file(full_config_path, \"resume_training: false\", \"resume_training: true\")\n        replace_string_in_file(full_config_path, \"model_dir:.*\", \"\")\n        run_config(full_config_path)\n\n        # overwrites the model_0 directory with new model data\n        trained_epochs = int(get_line_in_file(metadata_file_path, 2))\n        assert trained_epochs == 4, \"Expected to see trained epochs as {} in {}, but found {}\".format(\n            4, str(metadata_file_path), trained_epochs\n        )\n\n        # creates model_1\n        config = m.config.loadConfig(self.config_file.__str__(), False)\n        run_config(self.config_file)\n\n        full_config_path = Path(config.storage.model_dir) / Path(\"full_config.yaml\")\n        replace_string_in_file(full_config_path, \"resume_training: false\", \"resume_training: true\")\n        new_model_path = Path(TMP_TEST_DIR) / Path(name) / Path(\"custom_model_dir\")\n        replace_string_in_file(full_config_path, \"model_dir:.*\", \"model_dir: {}\".format(str(new_model_path)))\n\n        # creates new model dir\n        run_config(full_config_path)\n\n        config = m.config.loadConfig(full_config_path.__str__(), False)\n        metadata_file_path = 
Path(config.storage.model_dir) / Path(\"metadata.csv\")\n        trained_epochs = int(get_line_in_file(metadata_file_path, 2))\n        assert trained_epochs == 4, \"Expected to see trained epochs as {} in {}, but found {}\".format(\n            4, str(metadata_file_path), trained_epochs\n        )\n\n    def test_resume_training_checkpoint_dir(self):\n        self.init_dataset_dir(\"checkpoint_dir\")\n\n        config = m.config.loadConfig(self.config_file.__str__(), False)\n        metadata_file_path = Path(config.storage.model_dir) / Path(\"metadata.csv\")\n\n        trained_epochs = int(get_line_in_file(metadata_file_path, 2))\n        assert trained_epochs == 2, \"Expected to see trained epochs as {} in {}, but found {}\".format(\n            2, str(metadata_file_path), trained_epochs\n        )\n\n        full_config_path = Path(config.storage.model_dir) / Path(\"full_config.yaml\")\n        replace_string_in_file(full_config_path, \"resume_training: false\", \"resume_training: true\")\n        replace_string_in_file(full_config_path, \"model_dir:.*\", \"\")\n        replace_string_in_file(\n            full_config_path, \"resume_from_checkpoint:.*\", \"resume_from_checkpoint: {}\".format(config.storage.model_dir)\n        )\n\n        # creates model_1 directory with model data\n        run_config(full_config_path)\n\n        config = m.config.loadConfig(full_config_path.__str__(), False)\n        metadata_file_path = Path(config.storage.model_dir) / Path(\"metadata.csv\")\n        trained_epochs = int(get_line_in_file(metadata_file_path, 2))\n        assert trained_epochs == 4, \"Expected to see trained epochs as {} in {}, but found {}\".format(\n            4, str(metadata_file_path), trained_epochs\n        )\n"
  },
  {
    "path": "test/python/bindings/integration/test_config.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\nfrom test.test_configs.generate_test_configs import generate_configs_for_dataset\n\nfrom omegaconf import OmegaConf\n\nimport marius.tools.configuration.marius_config\nfrom marius.config import loadConfig\n\n\nclass TestConfig(unittest.TestCase):\n    \"\"\"\n    Basic tests for loadConfig and the returned MariusConfig object.\n    \"\"\"\n\n    output_dir = TMP_TEST_DIR / Path(\"config\")\n\n    ds_config = marius.tools.configuration.marius_config.DatasetConfig()\n    ds_config.dataset_dir = output_dir.__str__()\n    ds_config.num_edges = 1000\n    ds_config.num_nodes = 100\n    ds_config.num_relations = 1\n    ds_config.num_train = 100\n    ds_config.num_valid = 10\n    ds_config.num_test = 10\n    ds_config.initialized = False\n\n    @classmethod\n    def setUp(self):\n        if not self.output_dir.exists():\n            os.makedirs(self.output_dir)\n\n        OmegaConf.save(self.ds_config, self.output_dir / Path(\"dataset.yaml\"))\n\n    @classmethod\n    def tearDown(self):\n        if self.output_dir.exists():\n            shutil.rmtree(self.output_dir)\n\n    def test_missing_config(self):\n        try:\n            loadConfig(\"foo.yaml\")\n            raise RuntimeError(\"Exception not thrown\")\n        except Exception as e:\n            assert \"No such file or directory\" in e.__str__()\n\n    def test_missing_dataset_yaml(self):\n        generate_configs_for_dataset(\n            self.output_dir,\n            model_names=[\"distmult\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        os.system(\"rm {}\".format(self.output_dir / Path(\"dataset.yaml\")))\n        for filename in os.listdir(self.output_dir):\n            if filename.startswith(\"M-\"):\n                try:\n                    
config_file = self.output_dir / Path(filename)\n                    _ = loadConfig(config_file.__str__(), save=True)\n                    raise RuntimeError(\"Exception not thrown\")\n                except Exception as e:\n                    assert \"expected to see dataset.yaml file\" in e.__str__()\n\n        shutil.rmtree(self.output_dir)\n        os.makedirs(self.output_dir)\n        OmegaConf.save(self.ds_config, self.output_dir / Path(\"dataset.yaml\"))\n\n        generate_configs_for_dataset(\n            self.output_dir,\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        os.system(\"rm {}\".format(self.output_dir / Path(\"dataset.yaml\")))\n        for filename in os.listdir(self.output_dir):\n            if filename.startswith(\"M-\"):\n                try:\n                    config_file = self.output_dir / Path(filename)\n                    _ = loadConfig(config_file.__str__(), save=True)\n                    raise RuntimeError(\"Exception not thrown\")\n                except Exception as e:\n                    assert \"expected to see dataset.yaml file\" in e.__str__()\n\n    def test_load_config(self):\n        generate_configs_for_dataset(\n            self.output_dir,\n            model_names=[\"distmult, gs_1_layer, gs_3_layer, gat_1_layer, gat_3_layer\"],\n            storage_names=[\"in_memory, part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        # check that each generated config can be parsed and it's members accessed.\n        for filename in os.listdir(self.output_dir):\n            if filename.startswith(\"M-\"):\n                config_file = self.output_dir / Path(filename)\n\n                config = loadConfig(config_file.__str__(), save=True)\n                
loaded_full_config = loadConfig((config.storage.model_dir / Path(\"full_config.yaml\")).__str__())\n                assert loaded_full_config.model.random_seed == config.model.random_seed\n\n                assert config.model is not None\n                assert config.storage is not None\n                assert config.training is not None\n                assert config.evaluation is not None\n\n                assert config.model.encoder is not None\n                assert config.model.decoder is not None\n\n                assert config.storage.dataset.dataset_dir.rstrip(\"/\") == self.output_dir.__str__()\n                assert config.storage.dataset.num_edges == 1000\n                assert config.storage.dataset.num_nodes == 100\n                assert config.storage.dataset.num_relations == 1\n                assert config.storage.dataset.num_train == 100\n                assert config.storage.dataset.num_valid == 10\n                assert config.storage.dataset.num_test == 10\n\n                assert config.training is not None\n                assert config.evaluation is not None\n\n                config.model.random_seed = 0\n                assert config.model.random_seed == 0\n\n        # reset directory\n        shutil.rmtree(self.output_dir)\n        os.makedirs(self.output_dir)\n        OmegaConf.save(self.ds_config, self.output_dir / Path(\"dataset.yaml\"))\n\n        generate_configs_for_dataset(\n            self.output_dir,\n            model_names=[\"gs_1_layer\", \"gs_3_layer\", \"gat_1_layer\", \"gat_3_layer\"],\n            storage_names=[\"in_memory\", \"part_buffer\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        # check that each generated config can be parsed and it's members accessed.\n        for filename in os.listdir(self.output_dir):\n            if filename.startswith(\"M-\"):\n                config_file = self.output_dir / Path(filename)\n\n    
            config = loadConfig(config_file.__str__(), save=True)\n                loaded_full_config = loadConfig((config.storage.model_dir / Path(\"full_config.yaml\")).__str__())\n                assert loaded_full_config.model.random_seed == config.model.random_seed\n\n                assert config.model is not None\n                assert config.storage is not None\n                assert config.training is not None\n                assert config.evaluation is not None\n\n                assert config.model.encoder is not None\n                assert config.model.decoder is not None\n\n                assert config.storage.dataset.dataset_dir.rstrip(\"/\") == self.output_dir.__str__()\n                assert config.storage.dataset.num_edges == 1000\n                assert config.storage.dataset.num_nodes == 100\n                assert config.storage.dataset.num_relations == 1\n                assert config.storage.dataset.num_train == 100\n                assert config.storage.dataset.num_valid == 10\n                assert config.storage.dataset.num_test == 10\n\n                assert config.training is not None\n                assert config.evaluation is not None\n\n                config.model.random_seed = 0\n                assert config.model.random_seed == 0\n"
  },
  {
    "path": "test/python/bindings/integration/test_data.py",
    "content": "import unittest\n\nfrom marius.data import Batch, DataLoader\nfrom marius.data.samplers import CorruptNodeNegativeSampler, LayeredNeighborSampler\n\nimport torch  # isort:skip\n\n\nclass TestBatch(unittest.TestCase):\n    \"\"\"\n    Batch binding tests\n    \"\"\"\n\n    def test_construction(self):\n        b1 = Batch(train=False)\n\n        assert b1.node_embeddings is None\n        assert b1.train is False\n        assert b1.device_id == -1\n\n        rand_tens = torch.randn([10])\n        b1.node_embeddings = rand_tens\n\n        assert torch.all(torch.eq(b1.node_embeddings, rand_tens)).item() is True\n        b2 = Batch(train=True)\n\n        assert b2.node_embeddings is None\n        assert b2.train is True\n        assert b2.device_id == -1\n\n        b2.node_embeddings = rand_tens\n        assert torch.all(torch.eq(b2.node_embeddings, rand_tens)).item() is True\n\n    def test_accumulate_gradients(self):\n        b = Batch(train=True)\n\n        b.node_embeddings = torch.tensor([2.0, 4.0])\n        b.node_embeddings.grad = torch.tensor([0.5, -1.0])\n        b.node_embeddings_state = torch.tensor([0.0, 0.0])\n\n        b.accumulateGradients(learning_rate=1.0)\n\n        assert b.node_embeddings_state is None\n        assert torch.all(torch.eq(b.node_state_update, b.node_embeddings.grad.pow(2))).item() is True\n\n        expected = -1.0 * (b.node_embeddings.grad / (b.node_state_update.sqrt().add_(1e-10)))\n        assert torch.all(torch.eq(b.node_gradients, expected)).item() is True\n\n    def test_clear(self):\n        b = Batch(train=True)\n\n        b.node_embeddings = torch.tensor([2.0, 4.0])\n        b.node_embeddings.grad = torch.tensor([0.5, -1.0])\n        b.node_embeddings_state = torch.tensor([0.0, 0.0])\n\n        assert b.node_embeddings is not None\n        assert b.node_embeddings is not None\n        assert b.node_embeddings_state is not None\n\n        b.clear()\n\n        assert b.node_embeddings is None\n        assert 
b.node_embeddings is None\n        assert b.node_embeddings_state is None\n\n\nclass TestDataloader(unittest.TestCase):\n    def test_lp_only_edges(self):\n        num_edges = 100\n        num_nodes = 10\n        d = 5\n\n        batch_size = 10\n\n        edges = torch.randint(0, num_nodes, size=(num_edges, 2))\n        embeddings = torch.randn(size=(num_nodes, d))\n        features = torch.randn(size=(num_nodes, d))\n\n        neg_sampler = None\n        nbr_sampler = None\n\n        # constructor for in memory objects and tensors\n        dataloader = DataLoader(\n            edges=edges,\n            node_embeddings=embeddings,\n            node_features=features,\n            batch_size=batch_size,\n            neg_sampler=neg_sampler,\n            nbr_sampler=nbr_sampler,\n            learning_task=\"lp\",\n            train=False,\n        )\n\n        dataloader.initializeBatches()\n\n        count = 0\n        while dataloader.hasNextBatch():\n            b = dataloader.getBatch()\n\n            assert b.edges.shape[0] == batch_size\n            assert b.unique_node_indices.shape[0] == b.node_embeddings.shape[0]\n            assert b.unique_node_indices.shape[0] == b.node_features.shape[0]\n\n            count += 1\n\n        assert count == (num_edges / batch_size)\n\n    def test_lp_negs(self):\n        num_edges = 100\n        num_nodes = 10\n        d = 5\n\n        batch_size = 10\n\n        edges = torch.randint(0, num_nodes, size=(num_edges, 2))\n        embeddings = torch.randn(size=(num_nodes, d))\n        features = torch.randn(size=(num_nodes, d))\n\n        num_chunks = 2\n        num_negatives = 4\n        degree_fraction = 0.5\n\n        neg_sampler = CorruptNodeNegativeSampler(\n            num_chunks=num_chunks, num_negatives=num_negatives, degree_fraction=degree_fraction, filtered=False\n        )\n        nbr_sampler = None\n\n        # constructor for in memory objects and tensors\n        dataloader = DataLoader(\n            
edges=edges,\n            node_embeddings=embeddings,\n            node_features=features,\n            batch_size=batch_size,\n            neg_sampler=neg_sampler,\n            nbr_sampler=nbr_sampler,\n            learning_task=\"lp\",\n            train=False,\n        )\n\n        dataloader.initializeBatches()\n\n        count = 0\n        while dataloader.hasNextBatch():\n            b = dataloader.getBatch()\n\n            assert b.edges.shape[0] == batch_size\n            assert b.unique_node_indices.shape[0] == b.node_embeddings.shape[0]\n            assert b.unique_node_indices.shape[0] == b.node_features.shape[0]\n\n            assert b.src_neg_indices.shape[0] == num_chunks\n            assert b.src_neg_indices.shape[1] == num_negatives\n            assert b.dst_neg_indices.shape[0] == num_chunks\n            assert b.dst_neg_indices.shape[1] == num_negatives\n\n            assert b.src_neg_indices_mapping.shape[0] == num_chunks\n            assert b.src_neg_indices_mapping.shape[1] == num_negatives\n            assert b.dst_neg_indices_mapping.shape[0] == num_chunks\n            assert b.dst_neg_indices_mapping.shape[1] == num_negatives\n\n            count += 1\n\n        assert count == (num_edges / batch_size)\n\n    def test_lp_negs_nbrs(self):\n        num_edges = 100\n        num_nodes = 10\n        d = 5\n\n        batch_size = 10\n\n        edges = torch.randint(0, num_nodes, size=(num_edges, 2))\n        embeddings = torch.randn(size=(num_nodes, d))\n        features = torch.randn(size=(num_nodes, d))\n\n        num_chunks = 2\n        num_negatives = 4\n        degree_fraction = 0.5\n\n        neg_sampler = CorruptNodeNegativeSampler(\n            num_chunks=num_chunks, num_negatives=num_negatives, degree_fraction=degree_fraction, filtered=False\n        )\n        nbr_sampler = LayeredNeighborSampler([-1])\n\n        # constructor for in memory objects and tensors\n        dataloader = DataLoader(\n            edges=edges,\n            
node_embeddings=embeddings,\n            node_features=features,\n            batch_size=batch_size,\n            neg_sampler=neg_sampler,\n            nbr_sampler=nbr_sampler,\n            learning_task=\"lp\",\n            train=False,\n        )\n\n        dataloader.initializeBatches()\n\n        count = 0\n        while dataloader.hasNextBatch():\n            b = dataloader.getBatch()\n\n            assert b.edges.shape[0] == batch_size\n            assert b.unique_node_indices.shape[0] == b.node_embeddings.shape[0]\n            assert b.unique_node_indices.shape[0] == b.node_features.shape[0]\n\n            assert b.src_neg_indices.shape[0] == num_chunks\n            assert b.src_neg_indices.shape[1] == num_negatives\n            assert b.dst_neg_indices.shape[0] == num_chunks\n            assert b.dst_neg_indices.shape[1] == num_negatives\n\n            assert b.src_neg_indices_mapping.shape[0] == num_chunks\n            assert b.src_neg_indices_mapping.shape[1] == num_negatives\n            assert b.dst_neg_indices_mapping.shape[0] == num_chunks\n            assert b.dst_neg_indices_mapping.shape[1] == num_negatives\n\n            assert torch.all(torch.eq(b.unique_node_indices, b.dense_graph.node_ids)).item() is True\n\n            count += 1\n\n        assert count == (num_edges / batch_size)\n\n    def test_lp_nbrs(self):\n        num_edges = 100\n        num_nodes = 10\n        d = 5\n\n        batch_size = 10\n\n        edges = torch.randint(0, num_nodes, size=(num_edges, 2))\n        embeddings = torch.randn(size=(num_nodes, d))\n        features = torch.randn(size=(num_nodes, d))\n\n        neg_sampler = None\n        nbr_sampler = LayeredNeighborSampler([-1])\n\n        # constructor for in memory objects and tensors\n        dataloader = DataLoader(\n            edges=edges,\n            node_embeddings=embeddings,\n            node_features=features,\n            batch_size=batch_size,\n            neg_sampler=neg_sampler,\n            
nbr_sampler=nbr_sampler,\n            learning_task=\"lp\",\n            train=False,\n        )\n\n        dataloader.initializeBatches()\n\n        count = 0\n        while dataloader.hasNextBatch():\n            b = dataloader.getBatch()\n\n            assert b.edges.shape[0] == batch_size\n            assert b.unique_node_indices.shape[0] == b.node_embeddings.shape[0]\n            assert b.unique_node_indices.shape[0] == b.node_features.shape[0]\n\n            assert torch.all(torch.eq(b.unique_node_indices, b.dense_graph.node_ids)).item() is True\n\n            count += 1\n\n        assert count == (num_edges / batch_size)\n\n    def test_nc_nbrs(self):\n        num_edges = 100\n        num_nodes = 50\n        d = 5\n\n        num_train = 25\n        batch_size = 5\n\n        edges = torch.randint(0, num_nodes, size=(num_edges, 2))\n        embeddings = torch.randn(size=(num_nodes, d))\n        features = torch.randn(size=(num_nodes, d))\n\n        nodes = torch.arange(0, num_train)\n\n        nbr_sampler = LayeredNeighborSampler([-1])\n\n        # constructor for in memory objects and tensors\n        dataloader = DataLoader(\n            edges=edges,\n            nodes=nodes,\n            node_embeddings=embeddings,\n            node_features=features,\n            batch_size=batch_size,\n            nbr_sampler=nbr_sampler,\n            learning_task=\"nc\",\n            train=False,\n        )\n\n        dataloader.initializeBatches()\n\n        count = 0\n        while dataloader.hasNextBatch():\n            b = dataloader.getBatch()\n\n            assert b.unique_node_indices.shape[0] == b.node_embeddings.shape[0]\n            assert b.unique_node_indices.shape[0] == b.node_features.shape[0]\n\n            assert torch.all(torch.eq(b.unique_node_indices, b.dense_graph.node_ids)).item() is True\n\n            count += 1\n\n        assert count == (num_train / batch_size)\n\n    def test_nc_no_nbrs(self):\n        num_edges = 100\n        num_nodes = 50\n  
      d = 5\n\n        num_train = 25\n        batch_size = 5\n\n        edges = torch.randint(0, num_nodes, size=(num_edges, 2))\n        embeddings = torch.randn(size=(num_nodes, d))\n        features = torch.randn(size=(num_nodes, d))\n\n        nodes = torch.arange(0, num_train)\n\n        # constructor for in memory objects and tensors\n        dataloader = DataLoader(\n            edges=edges,\n            nodes=nodes,\n            node_embeddings=embeddings,\n            node_features=features,\n            batch_size=batch_size,\n            learning_task=\"nc\",\n            train=False,\n        )\n\n        dataloader.initializeBatches()\n\n        count = 0\n        while dataloader.hasNextBatch():\n            b = dataloader.getBatch()\n\n            assert b.unique_node_indices.shape[0] == b.node_embeddings.shape[0]\n            assert b.unique_node_indices.shape[0] == b.node_features.shape[0]\n\n            assert torch.all(torch.eq(b.unique_node_indices, b.root_node_indices)).item() is True\n\n            count += 1\n\n        assert count == (num_train / batch_size)\n"
  },
  {
    "path": "test/python/bindings/integration/test_nn.py",
    "content": "import unittest\n\nfrom marius.config import LearningTask, LossOptions, LossReduction\nfrom marius.data import Batch, DENSEGraph, MariusGraph\nfrom marius.data.samplers import LayeredNeighborSampler\nfrom marius.nn import CrossEntropyLoss, Model, SGDOptimizer, SoftmaxCrossEntropy\nfrom marius.nn.decoders.edge import DistMult\nfrom marius.nn.decoders.node import NoOpNodeDecoder\nfrom marius.nn.encoders import GeneralEncoder\nfrom marius.nn.layers import EmbeddingLayer\nfrom marius.report import LinkPredictionReporter, NodeClassificationReporter\n\nimport torch  # isort:skip\n\nedge_list = torch.tensor([[0, 0, 1], [0, 0, 2], [1, 1, 4], [2, 0, 3], [3, 1, 0], [4, 0, 1]])\nbatch_edges = torch.tensor(\n    [\n        [0, 0, 1],\n        [2, 0, 3],\n        [3, 1, 0],\n    ]\n)\n\nnode_ids = torch.tensor([0, 1, 2, 3])\nnode_embeddings = torch.tensor([[1.5, 2.5], [2.5, 3.5], [4.25, 1.0], [-1.0, 0.5]])\n\nfull_graph = MariusGraph(edge_list, edge_list[torch.argsort(edge_list[:, -1])], 5)\nsampler = LayeredNeighborSampler(full_graph, [-1])\ndense_graph = sampler.getNeighbors(node_ids)\n\nnum_relations = 2\nembedding_dim = 2\n\n\ndef get_test_model_lp():\n    device = torch.device(\"cpu\")\n    dtype = torch.float32\n\n    embedding_layer = EmbeddingLayer(dimension=embedding_dim, device=device, bias=True)\n    layers = [[embedding_layer]]\n    encoder = GeneralEncoder(layers=layers)\n\n    decoder = DistMult(\n        num_relations=num_relations,\n        embedding_dim=embedding_dim,\n        use_inverse_relations=False,\n        device=device,\n        dtype=dtype,\n        mode=\"infer\",\n    )\n\n    loss = SoftmaxCrossEntropy(reduction=\"sum\")\n\n    reporter = LinkPredictionReporter()\n\n    return Model(encoder, decoder, loss, reporter)\n\n\ndef get_test_model_lp_neg():\n    device = torch.device(\"cpu\")\n    dtype = torch.float32\n\n    embedding_layer = EmbeddingLayer(dimension=embedding_dim, device=device, bias=True)\n    layers = 
[[embedding_layer]]\n    encoder = GeneralEncoder(layers=layers)\n\n    decoder = DistMult(\n        num_relations=num_relations,\n        embedding_dim=embedding_dim,\n        use_inverse_relations=False,\n        device=device,\n        dtype=dtype,\n        mode=\"train\",\n    )\n\n    loss = SoftmaxCrossEntropy(reduction=\"sum\")\n\n    reporter = LinkPredictionReporter()\n\n    return Model(encoder, decoder, loss, reporter)\n\n\ndef get_test_model_nc():\n    device = torch.device(\"cpu\")\n\n    embedding_layer = EmbeddingLayer(dimension=embedding_dim, device=device, bias=True)\n    layers = [[embedding_layer]]\n    encoder = GeneralEncoder(layers=layers)\n\n    decoder = NoOpNodeDecoder()\n\n    loss = CrossEntropyLoss(reduction=\"sum\")\n\n    reporter = NodeClassificationReporter()\n\n    return Model(encoder, decoder, loss, reporter)\n\n\nclass CustomModelBasic(Model):\n    def __init__(self, encoder, decoder):\n        if decoder.learning_task == LearningTask.LINK_PREDICTION:\n            reporter = LinkPredictionReporter()\n        else:\n            reporter = NodeClassificationReporter()\n\n        loss_options = LossOptions()\n        loss_options.loss_reduction = LossReduction.SUM\n        loss = SoftmaxCrossEntropy(reduction=\"sum\")\n\n        super().__init__(encoder, decoder, loss, reporter)\n\n\nclass CustomModelOverrideForward(Model):\n    def __init__(self, encoder, decoder):\n        if decoder.learning_task == LearningTask.LINK_PREDICTION:\n            reporter = LinkPredictionReporter()\n        else:\n            reporter = NodeClassificationReporter()\n\n        loss = SoftmaxCrossEntropy(reduction=\"sum\")\n\n        super().__init__(encoder, decoder, loss, reporter)\n\n    def forward_lp(self, batch, train):\n        pos = torch.ones([batch.edges.shape[0]])\n        negs = torch.unsqueeze(torch.ones([batch.edges.shape[0]]), 0)\n        return 3 * pos, 2 * negs, pos, 0 * negs\n\n\nclass TestModel(unittest.TestCase):\n    \"\"\"\n    
Model binding test\n    \"\"\"\n\n    def test_construction_lp(self):\n        get_test_model_lp()\n\n    def test_construction_nc(self):\n        get_test_model_nc()\n\n    def test_forward_nc(self):\n        model = get_test_model_nc()\n        output = model.forward_nc(\n            node_embeddings=node_embeddings, node_features=torch.empty([]), dense_graph=DENSEGraph(), train=False\n        )\n        assert torch.all(torch.eq(output, node_embeddings)).item() is True\n\n    def test_forward_lp(self):\n        model = get_test_model_lp()\n\n        batch = Batch(False)\n\n        batch.node_embeddings = node_embeddings\n        batch.edges = batch_edges\n\n        scores, _, _, _ = model.forward_lp(batch=batch, train=False)\n\n        expected_scores = torch.tensor([12.5, -3.75, -0.25])\n\n        assert torch.all(torch.eq(scores, expected_scores)).item() is True\n\n    def test_train_batch(self):\n        model_lp = get_test_model_lp_neg()\n\n        batch = Batch(True)\n\n        batch.node_embeddings = node_embeddings\n        batch.node_embeddings_state = torch.zeros_like(node_embeddings)\n        batch.edges = batch_edges\n        batch.dst_neg_indices_mapping = torch.tensor([[2, 0], [0, 1], [1, 0]])\n\n        model_lp.train_batch(batch, True)\n\n        model_nc = get_test_model_nc()\n\n        batch = Batch(True)\n\n        batch.node_embeddings = node_embeddings\n        batch.node_embeddings_state = torch.zeros_like(node_embeddings)\n        batch.edges = batch_edges\n        batch.node_labels = torch.tensor([0, 1, 0, 1], dtype=torch.long)\n\n        model_nc.train_batch(batch, True)\n\n    def test_clear_grad(self):\n        model = get_test_model_nc()\n\n        model.optimizers = [SGDOptimizer(model.named_parameters(), 0.1)]\n\n        grad = torch.tensor([-1.0, -2.0])\n        model.parameters()[0].grad = grad\n\n        assert torch.all(torch.eq(model.parameters()[0].grad, grad)).item() is True\n        model.clear_grad()\n        assert 
model.parameters()[0].grad is None\n\n    def test_step(self):\n        model = get_test_model_nc()\n        learning_rate = 0.1\n        model.optimizers = [SGDOptimizer(model.named_parameters(), learning_rate)]\n\n        grad = torch.tensor([-1.0, -2.0])\n        model.parameters()[0].grad = grad\n\n        assert torch.all(torch.eq(model.parameters()[0].grad, grad)).item() is True\n        model.step()\n        assert torch.all(torch.eq(model.parameters()[0], -grad * learning_rate)).item() is True\n\n    def test_save(self):\n        pass\n\n    def test_load(self):\n        pass\n\n    def test_custom_model_basic(self):\n        tmp_model = get_test_model_lp()\n        model = CustomModelBasic(tmp_model.encoder, tmp_model.decoder)\n\n        batch = Batch(False)\n\n        batch.node_embeddings = node_embeddings\n        batch.edges = batch_edges\n\n        scores, _, _, _ = model.forward_lp(batch=batch, train=False)\n\n        expected_scores = torch.tensor([12.5, -3.75, -0.25])\n\n        assert torch.all(torch.eq(scores, expected_scores)).item() is True\n\n    def test_custom_model_forward_override(self):\n        tmp_model = get_test_model_lp_neg()\n        model = CustomModelOverrideForward(tmp_model.encoder, tmp_model.decoder)\n\n        batch = Batch(False)\n\n        batch.node_embeddings = node_embeddings\n        batch.edges = batch_edges\n\n        scores1, scores2, scores3, scores4 = model.forward_lp(batch=batch, train=False)\n\n        assert torch.all(torch.eq(scores1, 3 * torch.ones_like(scores1))).item() is True\n        assert torch.all(torch.eq(scores2, 2 * torch.ones_like(scores1))).item() is True\n        assert torch.all(torch.eq(scores3, 1 * torch.ones_like(scores1))).item() is True\n        assert torch.all(torch.eq(scores4, 0 * torch.ones_like(scores1))).item() is True\n\n    def test_init_from_config(self):\n        pass\n"
  },
  {
    "path": "test/python/constants.py",
    "content": "import os\n\nTESTING_CONFIG_DIR = os.environ.get(\"MARIUS_TEST_HOME\", \"\") + \"test_configs/\"\nTESTING_DATA_DIR = os.environ.get(\"MARIUS_TEST_HOME\", \"\") + \"test_data/\"\nTMP_TEST_DIR = os.environ.get(\"MARIUS_TEST_HOME\", \"\") + \"TMP_PYTHON_TEST_DIR/\"\n"
  },
  {
    "path": "test/python/helpers.py",
    "content": "import random\nfrom pathlib import Path\n\n\ndef dataset_generator(\n    train_file,\n    valid_file,\n    test_file,\n    train_len=1000,\n    valid_len=100,\n    test_len=100,\n    delim=\"\\t\",\n    start_col=0,\n    num_line_skip=0,\n):\n    with open(str(Path(train_file)), \"w\") as f:\n        for i in range(num_line_skip):\n            f.write(\"This is a line needs to be skipped.\\n\")\n        for i in range(train_len):\n            src = random.randint(1, 100)\n            dst = random.randint(1, 100)\n            rel = random.randint(101, 110)\n            for j in range(start_col):\n                f.write(\"col_\" + str(j) + delim)\n            f.write(str(src) + delim + str(rel) + delim + str(dst) + \"\\n\")\n    f.close()\n\n    with open(str(Path(valid_file)), \"w\") as f:\n        for i in range(num_line_skip):\n            f.write(\"This is a line needs to be skipped.\\n\")\n        for i in range(valid_len):\n            src = random.randint(1, 100)\n            dst = random.randint(1, 100)\n            rel = random.randint(101, 110)\n            for j in range(start_col):\n                f.write(\"col_\" + str(j) + delim)\n            f.write(str(src) + delim + str(rel) + delim + str(dst) + \"\\n\")\n    f.close()\n\n    with open(str(Path(test_file)), \"w\") as f:\n        for i in range(num_line_skip):\n            f.write(\"This is a line needs to be skipped.\\n\")\n        for i in range(test_len):\n            src = random.randint(1, 100)\n            dst = random.randint(1, 100)\n            rel = random.randint(101, 110)\n            for j in range(start_col):\n                f.write(\"col_\" + str(j) + delim)\n            f.write(str(src) + delim + str(rel) + delim + str(dst) + \"\\n\")\n    f.close()\n"
  },
  {
    "path": "test/python/postprocessing/test_in_memory_exporter.py",
    "content": "import glob\nimport os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\nfrom test.test_configs.generate_test_configs import generate_configs_for_dataset\nfrom test.test_data.generate import generate_random_dataset\n\nimport pandas as pd\n\nimport marius as m\nfrom marius.tools.postprocess.in_memory_exporter import InMemoryExporter\n\n\ndef check_output(output_dir, fmt, has_rels=False):\n    # check created\n    assert (output_dir / (\"embeddings.\" + fmt)).exists()\n    assert (output_dir / (\"encoded_nodes.\" + fmt)).exists()\n    assert (output_dir / \"model.pt\").exists()\n\n    # check embeddings\n    if fmt == \"csv\":\n        base_embeddings_df = pd.read_csv(\n            output_dir / (\"embeddings.\" + fmt),\n            header=0,\n            converters={\"embedding\": lambda x: x.strip(\"[]\").strip().split()},\n        )\n\n        assert base_embeddings_df.shape[0] == 100  # check matches number of nodes\n        assert base_embeddings_df.shape[1] == 2  # has two columns\n        assert len(base_embeddings_df[\"embedding\"][0]) == 10\n    elif fmt == \"parquet\":\n        base_embeddings_df = pd.read_parquet(output_dir / (\"embeddings.\" + fmt))\n        assert base_embeddings_df.shape[0] == 100  # check matches number of nodes\n        assert base_embeddings_df.shape[1] == 2  # has two columns\n        assert len(base_embeddings_df.iloc[0, 1]) == 10  # the second column has list values for the embeddings\n    else:\n        raise RuntimeError(\"Unknown format\")\n\n    if fmt == \"csv\":\n        encoded_nodes_df = pd.read_csv(\n            output_dir / (\"encoded_nodes.\" + fmt),\n            header=0,\n            converters={\"embedding\": lambda x: x.strip(\"[]\").strip().split()},\n        )\n\n        assert encoded_nodes_df.shape[0] == 100\n        assert encoded_nodes_df.shape[1] == 2\n        assert len(encoded_nodes_df[\"embedding\"][0]) == 10\n    elif fmt == 
\"parquet\":\n        encoded_nodes_df = pd.read_parquet(output_dir / (\"encoded_nodes.\" + fmt))\n        assert encoded_nodes_df.shape[0] == 100\n        assert encoded_nodes_df.shape[1] == 2\n        assert len(encoded_nodes_df.iloc[0, 1]) == 10\n\n    if has_rels:\n        if fmt == \"csv\":\n            rel_embs_df = pd.read_csv(\n                output_dir / (\"relation_embeddings.\" + fmt),\n                header=0,\n                converters={\"embedding\": lambda x: x.strip(\"[]\").strip().split()},\n            )\n\n            assert rel_embs_df.shape[0] == 10\n            assert rel_embs_df.shape[1] == 2\n            assert len(rel_embs_df[\"embedding\"][0]) == 10\n\n            rel_embs_df = pd.read_csv(\n                output_dir / (\"inverse_relation_embeddings.\" + fmt),\n                header=0,\n                converters={\"embedding\": lambda x: x.strip(\"[]\").strip().split()},\n            )\n\n            assert rel_embs_df.shape[0] == 10\n            assert rel_embs_df.shape[1] == 2\n            assert len(rel_embs_df[\"embedding\"][0]) == 10\n        elif fmt == \"parquet\":\n            rel_embs_df = pd.read_parquet(output_dir / (\"relation_embeddings.\" + fmt))\n            assert rel_embs_df.shape[0] == 10\n            assert rel_embs_df.shape[1] == 2\n            assert len(rel_embs_df.iloc[0, 1]) == 10\n\n            rel_embs_df = pd.read_parquet(output_dir / (\"inverse_relation_embeddings.\" + fmt))\n            assert rel_embs_df.shape[0] == 10\n            assert rel_embs_df.shape[1] == 2\n            assert len(rel_embs_df.iloc[0, 1]) == 10\n\n\nclass TestLP(unittest.TestCase):\n    config_file = None\n\n    @classmethod\n    def setUp(self):\n        if not Path(TMP_TEST_DIR).exists():\n            Path(TMP_TEST_DIR).mkdir()\n\n        base_dir = TMP_TEST_DIR\n\n        num_nodes = 100\n        num_rels = 10\n        num_edges = 1000\n\n        name = \"export_lp\"\n        generate_random_dataset(\n            
output_dir=base_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            base_dir / Path(name),\n            model_names=[\"gs_1_layer\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        self.model_dir = Path(base_dir) / name / \"model_0\"\n\n        for filename in os.listdir(base_dir / Path(name)):\n            if filename.startswith(\"M-\"):\n                self.config_file = base_dir / Path(name) / Path(filename)\n\n        config = m.config.loadConfig(self.config_file.__str__(), True)\n        config.storage.export_encoded_nodes = True\n        m.manager.marius_train(config)\n\n    @classmethod\n    def tearDown(self):\n        if Path(TMP_TEST_DIR).exists():\n            shutil.rmtree(Path(TMP_TEST_DIR))\n\n    def test_export_csv(self):\n        assert self.model_dir.exists()\n        exporter = InMemoryExporter(self.model_dir, fmt=\"csv\")\n\n        exporter.export(self.model_dir)\n        check_output(self.model_dir, fmt=\"csv\")\n\n        exporter.export(self.model_dir.parent / \"model_tmp\")\n        check_output(self.model_dir.parent / \"model_tmp\", fmt=\"csv\", has_rels=True)\n\n    def test_export_binary(self):\n        assert self.model_dir.exists()\n        exporter = InMemoryExporter(self.model_dir, fmt=\"bin\")\n\n        # nothing new should be created since the output dir matches input dir\n        exporter.export(self.model_dir)\n\n        # copies full model directory\n        exporter.export(self.model_dir.parent / \"model_tmp\")\n\n        # check contents of input match contents of output\n        input_files = glob.glob(self.model_dir.__str__() + \"/*\")\n        output_files = glob.glob((self.model_dir.parent / 
\"model_tmp\").__str__() + \"/*\")\n        assert len(input_files) == len(output_files)\n\n    def test_export_parquet(self):\n        assert self.model_dir.exists()\n        exporter = InMemoryExporter(self.model_dir, fmt=\"parquet\")\n\n        exporter.export(self.model_dir)\n        check_output(self.model_dir, fmt=\"parquet\")\n\n        exporter.export(self.model_dir.parent / \"model_tmp\")\n        check_output(self.model_dir.parent / \"model_tmp\", fmt=\"parquet\", has_rels=True)\n\n    # TODO add testing support for s3. Need a test bucket somewhere\n    # def test_export_s3(self):\n    #     assert self.model_dir.exists()\n    #     exporter = InMemoryExporter(self.model_dir)\n    #     exporter.export(s3_path)\n\n    def test_export_no_model(self):\n        try:\n            InMemoryExporter(Path(\"TEST_NOT_A_DIR\"))\n            raise RuntimeError(\"Exception not thrown\")\n        except RuntimeError:\n            pass\n\n    def test_export_overwrite(self):\n        test_dir = self.model_dir.parent / \"model_tmp\"\n        exporter = InMemoryExporter(self.model_dir, fmt=\"csv\", overwrite=False)\n        exporter.export(test_dir)\n        check_output(test_dir, fmt=\"csv\", has_rels=True)\n\n        try:\n            exporter.export(test_dir)\n            raise RuntimeError(\"Exception not thrown\")\n        except RuntimeError:\n            pass\n\n        exporter.overwrite = True\n        exporter.export(test_dir)\n        check_output(test_dir, fmt=\"csv\", has_rels=True)\n\n\nclass TestNC(unittest.TestCase):\n    config_file = None\n\n    @classmethod\n    def setUp(self):\n        if not Path(TMP_TEST_DIR).exists():\n            Path(TMP_TEST_DIR).mkdir()\n\n        base_dir = TMP_TEST_DIR\n\n        num_nodes = 100\n        num_rels = 1\n        num_edges = 1000\n\n        name = \"nc_export\"\n        generate_random_dataset(\n            output_dir=base_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n    
        num_rels=num_rels,\n            feature_dim=10,\n            splits=[0.9, 0.05, 0.05],\n            task=\"nc\",\n        )\n\n        generate_configs_for_dataset(\n            base_dir / Path(name),\n            model_names=[\"gs_1_layer_emb\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"nc\",\n        )\n\n        self.model_dir = Path(base_dir) / name / \"model_0\"\n\n        for filename in os.listdir(base_dir / Path(name)):\n            if filename.startswith(\"M-\"):\n                self.config_file = base_dir / Path(name) / Path(filename)\n\n        config = m.config.loadConfig(self.config_file.__str__(), True)\n        config.storage.export_encoded_nodes = True\n        m.manager.marius_train(config)\n\n    @classmethod\n    def tearDown(self):\n        if Path(TMP_TEST_DIR).exists():\n            shutil.rmtree(Path(TMP_TEST_DIR))\n\n    def test_export_csv(self):\n        assert self.model_dir.exists()\n        exporter = InMemoryExporter(self.model_dir, fmt=\"csv\")\n\n        exporter.export(self.model_dir)\n        check_output(self.model_dir, fmt=\"csv\")\n\n        exporter.export(self.model_dir.parent / \"model_tmp\")\n        check_output(self.model_dir.parent / \"model_tmp\", fmt=\"csv\")\n\n    def test_export_binary(self):\n        assert self.model_dir.exists()\n        exporter = InMemoryExporter(self.model_dir, fmt=\"bin\")\n\n        # nothing new should be created since the output dir matches input dir\n        exporter.export(self.model_dir)\n\n        # copies full model directory\n        exporter.export(self.model_dir.parent / \"model_tmp\")\n\n        # check contents of input match contents of output\n        input_files = glob.glob(self.model_dir.__str__() + \"/*\")\n        output_files = glob.glob((self.model_dir.parent / \"model_tmp\").__str__() + \"/*\")\n        assert len(input_files) == len(output_files)\n\n    def 
test_export_parquet(self):\n        assert self.model_dir.exists()\n        exporter = InMemoryExporter(self.model_dir, fmt=\"parquet\")\n\n        exporter.export(self.model_dir)\n        check_output(self.model_dir, fmt=\"parquet\")\n\n        exporter.export(self.model_dir.parent / \"model_tmp\")\n        check_output(self.model_dir.parent / \"model_tmp\", fmt=\"parquet\")\n\n    # TODO add testing support for s3. Need a test bucket somewhere\n    # def test_export_s3(self):\n    #     assert self.model_dir.exists()\n    #     exporter = InMemoryExporter(self.model_dir)\n    #     exporter.export(s3_path)\n\n    def test_export_no_model(self):\n        try:\n            InMemoryExporter(Path(\"TEST_NOT_A_DIR\"))\n            raise RuntimeError(\"Exception not thrown\")\n        except RuntimeError:\n            pass\n\n    def test_export_overwrite(self):\n        test_dir = self.model_dir.parent / \"model_tmp\"\n        exporter = InMemoryExporter(self.model_dir, fmt=\"csv\", overwrite=False)\n        exporter.export(test_dir)\n        check_output(test_dir, fmt=\"csv\")\n\n        try:\n            exporter.export(test_dir)\n            raise RuntimeError(\"Exception not thrown\")\n        except RuntimeError:\n            pass\n\n        exporter.overwrite = True\n        exporter.export(test_dir)\n        check_output(test_dir, fmt=\"csv\")\n"
  },
  {
    "path": "test/python/predict/test_predict.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TMP_TEST_DIR\nfrom test.test_configs.generate_test_configs import generate_configs_for_dataset\nfrom test.test_data.generate import generate_random_dataset\n\nimport pandas as pd\n\nimport marius as m\nfrom marius.tools.marius_predict import run_predict, set_args\n\n\ndef validate_metrics(config, metrics, num_items, output_dir=None):\n    if output_dir is None:\n        metrics_file = Path(config.storage.model_dir) / Path(\"metrics.txt\")\n\n    else:\n        metrics_file = Path(output_dir) / Path(\"metrics.txt\")\n\n    assert metrics_file.exists()\n\n    if config.model.learning_task == m.config.LearningTask.LINK_PREDICTION:\n        task = \"Link Prediction:\"\n\n        if config.model.decoder.options.inverse_edges:\n            factor = 2\n        else:\n            factor = 1\n    else:\n        task = \"Node Classification:\"\n        factor = 1\n\n    found = []\n    for _ in metrics:\n        found.append(False)\n\n    report_items = -1\n    with open(metrics_file) as f:\n        for line in f.readlines():\n            if line.startswith(task):\n                report_items = int(line.split(\": \")[-1].split(\" \")[0])\n            else:\n                for i, metric in enumerate(metrics):\n                    if line.upper().startswith(metric.upper()):\n                        found[i] = True\n\n    # Check that all metrics have been found in the report\n    for f in found:\n        assert f\n\n    # Check the report contains the correct amount of evaluation edges/nodes\n    assert report_items == factor * num_items\n\n\ndef validate_scores(config, num_edges, save_scores, save_ranks, output_dir=None):\n    if output_dir is None:\n        scores_file = Path(config.storage.model_dir) / Path(\"scores.csv\")\n    else:\n        scores_file = Path(output_dir) / Path(\"scores.csv\")\n\n    assert scores_file.exists()\n\n    scores_df = 
pd.read_csv(scores_file, delimiter=\"\", header=None)\n\n    if config.storage.dataset.num_relations > 1:\n        num_cols = 3\n    else:\n        num_cols = 2\n\n    if save_scores:\n        num_cols += 1\n\n    if save_ranks:\n        num_cols += 1\n\n    assert scores_df.shape[0] == num_edges\n    assert scores_df.shape[1] == num_cols\n\n\ndef validate_labels(config, num_nodes, output_dir=None):\n    if output_dir is None:\n        labels_file = Path(config.storage.model_dir) / Path(\"labels.csv\")\n    else:\n        labels_file = Path(output_dir) / Path(\"labels.csv\")\n\n    assert labels_file.exists()\n\n    labels_df = pd.read_csv(labels_file, delimiter=\"\", header=None)\n    num_cols = 2\n\n    assert labels_df.shape[0] == num_nodes\n    assert labels_df.shape[1] == num_cols\n\n\nclass TestPredictLP(unittest.TestCase):\n    config_file = None\n\n    @classmethod\n    def setUp(self):\n        if not Path(TMP_TEST_DIR).exists():\n            Path(TMP_TEST_DIR).mkdir()\n\n        base_dir = TMP_TEST_DIR\n\n        num_nodes = 100\n        num_rels = 10\n        num_edges = 1000\n\n        name = \"basic_lp\"\n        generate_random_dataset(\n            output_dir=base_dir / Path(name),\n            num_nodes=num_nodes,\n            num_edges=num_edges,\n            num_rels=num_rels,\n            splits=[0.9, 0.05, 0.05],\n            task=\"lp\",\n        )\n\n        generate_configs_for_dataset(\n            base_dir / Path(name),\n            model_names=[\"distmult\"],\n            storage_names=[\"in_memory\"],\n            training_names=[\"sync\"],\n            evaluation_names=[\"sync\"],\n            task=\"lp\",\n        )\n\n        for filename in os.listdir(base_dir / Path(name)):\n            if filename.startswith(\"M-\"):\n                self.config_file = base_dir / Path(name) / Path(filename)\n\n        config = m.config.loadConfig(self.config_file.__str__(), True)\n        m.manager.marius_train(config)\n\n    @classmethod\n    def 
tearDown(self):\n        if Path(TMP_TEST_DIR).exists():\n            shutil.rmtree(Path(TMP_TEST_DIR))\n\n    def test_basic_lp(self):\n        parser = set_args()\n        args = parser.parse_args([\"--config\", self.config_file.__str__(), \"--metrics\", \"mrr\"])\n        run_predict(args)\n\n        config = m.config.loadConfig(self.config_file.__str__(), save=False)\n        validate_metrics(config, [\"MRR\"], config.storage.dataset.num_test)\n\n    def test_lp_metrics(self):\n        parser = set_args()\n        args = parser.parse_args(\n            [\n                \"--config\",\n                self.config_file.__str__(),\n                \"--metrics\",\n                \"mrr\",\n                \"mr\",\n                \"hits1\",\n                \"hits2\",\n                \"hits3\",\n                \"hits4\",\n                \"hits5\",\n                \"hits10\",\n                \"hits20\",\n            ]\n        )\n        run_predict(args)\n\n        config = m.config.loadConfig(self.config_file.__str__(), save=False)\n        validate_metrics(\n            config,\n            [\"MRR\", \"MEAN RANK\", \"HITS@1\", \"HITS@2\", \"HITS@3\", \"HITS@4\", \"HITS@5\", \"HITS@10\", \"HITS@20\"],\n            config.storage.dataset.num_test,\n        )\n\n    def test_predict_model_dir(self):\n        # 1st prediction pass, only model_0/ exists in this case and prediction uses the same directory.\n        parser = set_args()\n        args = parser.parse_args(\n            [\n                \"--config\",\n                self.config_file.__str__(),\n                \"--metrics\",\n                \"mrr\",\n                \"mr\",\n                \"hits1\",\n                \"hits2\",\n                \"hits3\",\n                \"hits4\",\n                \"hits5\",\n                \"hits10\",\n                \"hits20\",\n            ]\n        )\n        run_predict(args)\n\n        config = m.config.loadConfig(self.config_file.__str__(), 
save=False)\n\n        prediction_out_dir = config.storage.dataset.dataset_dir + \"model_0/\"\n        assert config.storage.model_dir == prediction_out_dir, \"Prediction should have used {} directory\".format(\n            prediction_out_dir\n        )\n        validate_metrics(\n            config,\n            [\"MRR\", \"MEAN RANK\", \"HITS@1\", \"HITS@2\", \"HITS@3\", \"HITS@4\", \"HITS@5\", \"HITS@10\", \"HITS@20\"],\n            config.storage.dataset.num_test,\n        )\n\n        # 2st prediction pass, model_0/ and model_1/ exist in this case and prediction uses model_1/ directory.\n        config = m.config.loadConfig(self.config_file.__str__(), True)\n        m.manager.marius_train(config)\n        run_predict(args)\n\n        config = m.config.loadConfig(self.config_file.__str__(), save=False)\n\n        prediction_out_dir = config.storage.dataset.dataset_dir + \"model_1/\"\n        assert config.storage.model_dir == prediction_out_dir, \"Prediction should have used {} directory\".format(\n            prediction_out_dir\n        )\n        validate_metrics(\n            config,\n            [\"MRR\", \"MEAN RANK\", \"HITS@1\", \"HITS@2\", \"HITS@3\", \"HITS@4\", \"HITS@5\", \"HITS@10\", \"HITS@20\"],\n            config.storage.dataset.num_test,\n        )\n\n        # specify model_dir path in the config. 
in this case, we set it to model_1/.\n        # even when you train another model which ends up getting stored in model_2/,\n        # model_predict will still use model_1/ because `model_dir` is explicitly specified in the config.\n        full_config_file = Path(config.storage.model_dir) / Path(\"full_config.yaml\")\n        config = m.config.loadConfig(self.config_file.__str__(), True)\n\n        model_2_path = Path(config.storage.dataset.dataset_dir) / Path(\"model_2\")\n        assert model_2_path.exists() is True, \"{} should have been created\".format(str(model_2_path))\n\n        m.manager.marius_train(config)\n\n        config = m.config.loadConfig(self.config_file.__str__(), True)\n        model_3_path = Path(config.storage.dataset.dataset_dir) / Path(\"model_3\")\n        assert model_3_path.exists() is True, \"{} should have been created\".format(str(model_3_path))\n\n        # run predict specifying model_dir as model_1\n        args = parser.parse_args(\n            [\n                \"--config\",\n                full_config_file.__str__(),\n                \"--metrics\",\n                \"mrr\",\n                \"mr\",\n                \"hits1\",\n                \"hits2\",\n                \"hits3\",\n                \"hits4\",\n                \"hits5\",\n                \"hits10\",\n                \"hits20\",\n            ]\n        )\n        run_predict(args)\n\n        config = m.config.loadConfig(full_config_file.__str__(), save=False)\n\n        assert config.storage.model_dir == prediction_out_dir, \"Prediction should have used {} directory\".format(\n            prediction_out_dir\n        )\n        validate_metrics(\n            config,\n            [\"MRR\", \"MEAN RANK\", \"HITS@1\", \"HITS@2\", \"HITS@3\", \"HITS@4\", \"HITS@5\", \"HITS@10\", \"HITS@20\"],\n            config.storage.dataset.num_test,\n        )\n\n    def test_lp_save_ranks(self):\n        pass\n\n    def test_lp_save_scores(self):\n        pass\n"
  },
  {
    "path": "test/python/preprocessing/test_spark_converter.py",
    "content": "import shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TESTING_DATA_DIR, TMP_TEST_DIR\nfrom test.python.preprocessing.test_torch_converter import validate_output_dir, validate_partitioned_output_dir\n\nimport numpy as np\nimport pandas as pd\nimport pytest\n\ntest_files = [\"train_edges.txt\", \"valid_edges.txt\", \"test_edges.txt\"]\n\ntry:\n    from marius.tools.configuration.marius_config import DatasetConfig\n    from marius.tools.preprocess.converters.spark_converter import SparkEdgeListConverter\n\n    pyspark_imported = True\nexcept ImportError:\n    pyspark_imported = False\n\n\n@pytest.mark.skipif(not pyspark_imported, reason=\"Pyspark must be installed to run these tests.\")\nclass TestSparkConverter(unittest.TestCase):\n    @classmethod\n    def setUp(self):\n        if not Path(TMP_TEST_DIR).exists():\n            Path(TMP_TEST_DIR).mkdir()\n\n        for test_file in test_files:\n            shutil.copy(str(Path(TESTING_DATA_DIR) / Path(test_file)), str(Path(TMP_TEST_DIR) / Path(test_file)))\n        pass\n\n    @classmethod\n    def tearDown(self):\n        pass\n        if Path(TMP_TEST_DIR).exists():\n            shutil.rmtree(Path(TMP_TEST_DIR))\n\n    def make_directory_tree(self, dir_path):\n        output_dir = Path(TMP_TEST_DIR) / Path(dir_path)\n        output_dir.mkdir()\n        nodes_out_dir = output_dir / Path(\"nodes\")\n        edges_out_dir = output_dir / Path(\"edges\")\n        nodes_out_dir.mkdir()\n        edges_out_dir.mkdir()\n        return output_dir\n\n    def test_delimited_defaults(self):\n        output_dir = self.make_directory_tree(\"test_delim_default\")\n\n        converter = SparkEdgeListConverter(\n            output_dir=output_dir, train_edges=Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), delim=\" \"\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        
expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    def test_delimited_str_ids(self):\n        output_dir = self.make_directory_tree(\"test_delimited_str_ids\")\n\n        tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n\n        tmp[0] = tmp[0].map(str) + \"_test\"\n        tmp[1] = tmp[1].map(str) + \"_test\"\n        tmp[2] = tmp[2].map(str) + \"_test\"\n\n        tmp.to_csv(Path(TMP_TEST_DIR) / Path(\"str_train_edges.txt\"), header=None, sep=\" \", index=False)\n\n        converter = SparkEdgeListConverter(\n            output_dir=output_dir, train_edges=Path(TMP_TEST_DIR) / Path(\"str_train_edges.txt\"), delim=\" \"\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    # randomSplit doesn't split the df in the exact ratio, outputs a close one though. 
skippping this one.\n\n    def test_columns(self):\n        output_dir = self.make_directory_tree(\"test_columns\")\n\n        converter = SparkEdgeListConverter(\n            output_dir=output_dir, train_edges=Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), delim=\" \", columns=[0, 2]\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 1\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    def test_header(self):\n        output_dir = self.make_directory_tree(\"test_header\")\n\n        tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n        tmp.to_csv(\n            Path(TMP_TEST_DIR) / Path(\"header_train_edges.txt\"), header=[\"src\", \"rel\", \"dst\"], sep=\" \", index=False\n        )\n\n        converter = SparkEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=Path(TMP_TEST_DIR) / Path(\"header_train_edges.txt\"),\n            delim=\" \",\n            header_length=1,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    def test_delim(self):\n        output_dir = self.make_directory_tree(\"test_delim\")\n\n        tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n        tmp.to_csv(Path(TMP_TEST_DIR) / Path(\"delim_train_edges.txt\"), header=None, 
sep=\",\", index=False)\n\n        converter = SparkEdgeListConverter(\n            output_dir=output_dir, train_edges=Path(TMP_TEST_DIR) / Path(\"delim_train_edges.txt\"), delim=\",\"\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    def test_partitions(self):\n        output_dir = self.make_directory_tree(\"test_partitions\")\n\n        converter = SparkEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"),\n            delim=\" \",\n            num_partitions=10,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_partitioned_output_dir(\n            output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, num_partitions=10\n        )\n\n        converter = SparkEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"),\n            delim=\" \",\n            num_partitions=100,\n        )\n\n        converter.convert()\n\n        validate_partitioned_output_dir(\n            output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, num_partitions=100\n        )\n"
  },
  {
    "path": "test/python/preprocessing/test_torch_converter.py",
    "content": "import os\nimport shutil\nimport unittest\nfrom pathlib import Path\nfrom test.python.constants import TESTING_DATA_DIR, TMP_TEST_DIR\n\nimport numpy as np\nimport pandas as pd\nfrom omegaconf import MISSING, OmegaConf\n\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.configuration.marius_config import DatasetConfig\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter\n\nimport torch  # isort:skip\n\ntest_files = [\"train_edges.txt\", \"train_edges_weights.txt\", \"valid_edges.txt\", \"test_edges.txt\"]\n\n\ndef validate_partitioned_output_dir(\n    output_dir: Path,\n    expected_stats: DatasetConfig,\n    num_partitions,\n    dtype=np.int32,\n    weight_dtype=np.float32,\n    partitioned_eval=False,\n    has_weights=False,\n):\n    print(\"Validate partioned called with value\", has_weights)\n    validate_output_dir(output_dir, expected_stats, dtype, remap_ids=True)\n\n    train_edges_path = output_dir / Path(PathConstants.train_edges_path)\n    train_edge_buckets_path = output_dir / Path(PathConstants.train_edge_buckets_path)\n\n    assert train_edge_buckets_path.exists()\n    with open(train_edge_buckets_path, \"r\") as f:\n        train_buckets_sizes = f.readlines()\n        assert len(train_buckets_sizes) == num_partitions**2\n\n    train_edges = np.fromfile(train_edges_path, dtype).reshape(expected_stats.num_train, -1)\n\n    offset = 0\n    node_partition_size = np.ceil(expected_stats.num_nodes / num_partitions)\n    for i in range(num_partitions):\n        src_lower_bound = i * node_partition_size\n        src_upper_bound = src_lower_bound + node_partition_size\n\n        for j in range(num_partitions):\n            bucket_size = int(train_buckets_sizes[i * num_partitions + j])\n\n            if bucket_size != 0:\n                edge_bucket = train_edges[offset : offset + bucket_size]\n\n                dst_lower_bound = j * node_partition_size\n                
dst_upper_bound = dst_lower_bound + node_partition_size\n\n                src_col = edge_bucket[:, 0]\n                dst_col = edge_bucket[:, -1]\n\n                assert np.all((src_col >= src_lower_bound) & (src_col < src_upper_bound))\n                assert np.all((dst_col >= dst_lower_bound) & (dst_col < dst_upper_bound))\n\n                offset += bucket_size\n\n    assert offset == expected_stats.num_train\n\n    print(\"Checking with has_weight of\", has_weights)\n    if has_weights:\n        weights_file_path = output_dir / Path(PathConstants.train_edges_weights_path)\n        assert weights_file_path.exists()\n        values = np.fromfile(weights_file_path, dtype=weight_dtype)\n        values = np.sort(values)\n        for i in range(len(values)):\n            assert values[i] == float(i)\n\n\ndef validate_output_dir(\n    output_dir: Path,\n    expected_stats: DatasetConfig,\n    dtype=np.int32,\n    remap_ids=True,\n    has_weights=False,\n    weight_dtype=np.float32,\n):\n    assert output_dir.exists()\n    assert (output_dir / Path(\"edges\")).exists()\n    assert (output_dir / Path(\"nodes\")).exists()\n\n    dataset_stats = OmegaConf.load(output_dir / Path(\"dataset.yaml\"))\n\n    assert Path(dataset_stats.dataset_dir).absolute().__str__() == Path(expected_stats.dataset_dir).absolute().__str__()\n    assert dataset_stats.num_edges == expected_stats.num_edges\n    assert dataset_stats.num_relations == expected_stats.num_relations\n    assert dataset_stats.num_nodes == expected_stats.num_nodes\n    assert dataset_stats.num_train == expected_stats.num_train\n    assert dataset_stats.get(\"num_valid\", MISSING) == expected_stats.num_valid\n    assert dataset_stats.get(\"num_test\", MISSING) == expected_stats.num_test\n\n    num_columns = 3\n    if dataset_stats.num_relations == 1:\n        num_columns = 2\n\n    train_edges_path = output_dir / Path(PathConstants.train_edges_path)\n    valid_edges_path = output_dir / 
Path(PathConstants.valid_edges_path)\n    test_edges_path = output_dir / Path(PathConstants.test_edges_path)\n\n    dtype_size = 4\n    if dtype == np.int64:\n        dtype_size = 8\n\n    assert train_edges_path.exists()\n    assert os.path.getsize(train_edges_path) == dataset_stats.num_train * num_columns * dtype_size\n    train_edges = np.fromfile(train_edges_path, dtype)\n    assert train_edges.reshape(dataset_stats.num_train, -1).shape[1] == num_columns\n\n    if dataset_stats.get(\"num_valid\", MISSING) != MISSING and dataset_stats.get(\"num_valid\", MISSING) != -1:\n        assert valid_edges_path.exists()\n        valid_edges = np.fromfile(valid_edges_path, dtype)\n        assert valid_edges.reshape(dataset_stats.num_valid, -1).shape[1] == num_columns\n    else:\n        assert not valid_edges_path.exists()\n\n    if dataset_stats.get(\"num_test\", MISSING) != MISSING and dataset_stats.get(\"num_test\", MISSING) != -1:\n        assert test_edges_path.exists()\n        test_edges = np.fromfile(test_edges_path, dtype)\n        assert test_edges.reshape(dataset_stats.num_test, -1).shape[1] == num_columns\n    else:\n        assert not test_edges_path.exists()\n\n    node_mapping_path = output_dir / Path(PathConstants.node_mapping_path)\n    relation_mapping_path = output_dir / Path(PathConstants.relation_mapping_path)\n    if remap_ids:\n        assert node_mapping_path.exists()\n        node_mapping_df = pd.read_csv(node_mapping_path, sep=\",\", header=None)\n        assert node_mapping_df.shape[0] == dataset_stats.num_nodes\n        if num_columns == 3:\n            assert relation_mapping_path.exists()\n            relation_mapping_df = pd.read_csv(relation_mapping_path, sep=\",\", header=None)\n            assert relation_mapping_df.shape[0] == dataset_stats.num_relations\n        else:\n            assert not relation_mapping_path.exists()\n    else:\n        assert not node_mapping_path.exists()\n        assert not relation_mapping_path.exists()\n\n    
print(\"Checking with has_weight of\", has_weights)\n    if has_weights:\n        weights_file_path = output_dir / Path(PathConstants.train_edges_weights_path)\n        assert weights_file_path.exists()\n        values = np.fromfile(weights_file_path, dtype=weight_dtype)\n        for i in range(len(values)):\n            assert values[i] == float(i)\n\n\nclass TestTorchConverter(unittest.TestCase):\n    \"\"\"\n    Tests for the general preprocessor\n    \"\"\"\n\n    @classmethod\n    def setUp(self):\n        if not Path(TMP_TEST_DIR).exists():\n            Path(TMP_TEST_DIR).mkdir()\n\n        for test_file in test_files:\n            shutil.copy(str(Path(TESTING_DATA_DIR) / Path(test_file)), str(Path(TMP_TEST_DIR) / Path(test_file)))\n        pass\n\n    @classmethod\n    def tearDown(self):\n        if Path(TMP_TEST_DIR).exists():\n            shutil.rmtree(Path(TMP_TEST_DIR))\n\n    def test_delimited_defaults(self):\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_delim_default\")\n        output_dir.mkdir()\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"),\n            delim=\" \",\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    def test_delimited_str_ids(self):\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_delim_str_ids\")\n        output_dir.mkdir()\n\n        tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n\n        tmp[0] = 
tmp[0].map(str) + \"_test\"\n        tmp[1] = tmp[1].map(str) + \"_test\"\n        tmp[2] = tmp[2].map(str) + \"_test\"\n\n        tmp.to_csv(Path(TMP_TEST_DIR) / Path(\"str_train_edges.txt\"), header=None, sep=\" \", index=False)\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=Path(TMP_TEST_DIR) / Path(\"str_train_edges.txt\"),\n            delim=\" \",\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    def test_numpy_defaults(self):\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_numpy_defaults\")\n        output_dir.mkdir()\n\n        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n\n        train_edges = train_edges_df.to_numpy()\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges,\n            format=\"numpy\",\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    def test_pytorch_defaults(self):\n        output_dir = Path(TMP_TEST_DIR) / 
Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n\n        train_edges = torch.tensor(train_edges_df.to_numpy())\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges,\n            format=\"pytorch\",\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    def test_splits(self):\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_splits\")\n        output_dir.mkdir()\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"),\n            delim=\" \",\n            splits=[0.9, 0.05, 0.05],\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 900\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 900\n        expected_stats.num_valid = 50\n        expected_stats.num_test = 50\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    def test_columns(self):\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_columns\")\n        output_dir.mkdir()\n\n        converter = 
TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"),\n            delim=\" \",\n            src_column=0,\n            dst_column=2,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 1\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    def test_header(self):\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_header\")\n        output_dir.mkdir()\n\n        tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n        tmp.to_csv(\n            Path(TMP_TEST_DIR) / Path(\"header_train_edges.txt\"), header=[\"src\", \"rel\", \"dst\"], sep=\" \", index=False\n        )\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=Path(TMP_TEST_DIR) / Path(\"header_train_edges.txt\"),\n            delim=\" \",\n            header_length=1,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    def test_delim(self):\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_delim\")\n        output_dir.mkdir()\n\n        tmp = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n        
tmp.to_csv(Path(TMP_TEST_DIR) / Path(\"delim_train_edges.txt\"), header=None, sep=\",\", index=False)\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=Path(TMP_TEST_DIR) / Path(\"delim_train_edges.txt\"),\n            delim=\",\",\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=True)\n\n    def test_dtype(self):\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_dtype\")\n        output_dir.mkdir()\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"),\n            delim=\" \",\n            dtype=\"int64\",\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(\n            output_dir=output_dir,\n            expected_stats=expected_stats,\n            dtype=np.int64,\n            weight_dtype=np.float64,\n            remap_ids=True,\n        )\n\n    def test_partitions(self):\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_partitions\")\n        output_dir.mkdir()\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            
train_edges=Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"),\n            delim=\" \",\n            num_partitions=10,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_partitioned_output_dir(\n            output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, num_partitions=10\n        )\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"),\n            delim=\" \",\n            num_partitions=100,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        converter.convert()\n\n        validate_partitioned_output_dir(\n            output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, num_partitions=100\n        )\n\n    def test_no_remap(self):\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_dtype\")\n        output_dir.mkdir()\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"),\n            delim=\" \",\n            remap_ids=False,\n            num_nodes=100,\n            num_rels=10,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n        )\n\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        
validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=False)\n\n    def test_torch_no_relation_no_remap(self):\n        remap_val = False\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n        train_edges = torch.tensor(train_edges_df.to_numpy())\n\n        num_rows = train_edges.size(0)\n        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges,\n            remap_ids=remap_val,\n            src_column=0,\n            dst_column=2,\n            num_nodes=100,\n            format=\"pytorch\",\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 1\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val)\n\n    def test_pandas_no_relation_no_remap(self):\n        remap_val = False\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_file = Path(TMP_TEST_DIR) / Path(\"train_edges_weights.txt\")\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges_file,\n            delim=\" \",\n            remap_ids=remap_val,\n            src_column=0,\n            dst_column=2,\n            num_nodes=100,\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n   
     expected_stats.num_nodes = 100\n        expected_stats.num_relations = 1\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val)\n\n    def test_torch_no_relation_remap(self):\n        remap_val = True\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n        train_edges = torch.tensor(train_edges_df.to_numpy())\n\n        num_rows = train_edges.size(0)\n        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges,\n            remap_ids=remap_val,\n            src_column=0,\n            dst_column=2,\n            num_nodes=100,\n            format=\"pytorch\",\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 1\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val)\n\n    def test_pandas_no_relation_remap(self):\n        remap_val = True\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_file = Path(TMP_TEST_DIR) / Path(\"train_edges_weights.txt\")\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges_file,\n            delim=\" \",\n            remap_ids=remap_val,\n            src_column=0,\n            dst_column=2,\n            num_nodes=100,\n        )\n        converter.convert()\n\n        
expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 1\n        expected_stats.num_train = 1000\n\n        validate_output_dir(output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val)\n\n    def test_torch_only_weights_no_remap(self):\n        remap_val = False\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n        train_edges = torch.tensor(train_edges_df.to_numpy())\n\n        num_rows = train_edges.size(0)\n        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges,\n            remap_ids=remap_val,\n            src_column=0,\n            dst_column=2,\n            edge_weight_column=3,\n            num_nodes=100,\n            format=\"pytorch\",\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 1\n        expected_stats.num_train = 1000\n\n        validate_output_dir(\n            output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True\n        )\n\n    def test_pandas_only_weights_no_remap(self):\n        remap_val = False\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_file = Path(TMP_TEST_DIR) / Path(\"train_edges_weights.txt\")\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            
train_edges=train_edges_file,\n            delim=\" \",\n            remap_ids=remap_val,\n            src_column=0,\n            dst_column=2,\n            edge_weight_column=3,\n            num_nodes=100,\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 1\n        expected_stats.num_train = 1000\n\n        validate_output_dir(\n            output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True\n        )\n\n    def test_torch_only_weights_remap(self):\n        remap_val = True\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n        train_edges = torch.tensor(train_edges_df.to_numpy())\n\n        num_rows = train_edges.size(0)\n        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges,\n            remap_ids=remap_val,\n            src_column=0,\n            dst_column=2,\n            edge_weight_column=3,\n            num_nodes=100,\n            format=\"pytorch\",\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 1\n        expected_stats.num_train = 1000\n\n        validate_output_dir(\n            output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True\n        )\n\n    def test_pandas_only_weights_remap(self):\n        remap_val = 
True\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_file = Path(TMP_TEST_DIR) / Path(\"train_edges_weights.txt\")\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges_file,\n            delim=\" \",\n            remap_ids=remap_val,\n            src_column=0,\n            dst_column=2,\n            edge_weight_column=3,\n            num_nodes=100,\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 1\n        expected_stats.num_train = 1000\n\n        validate_output_dir(\n            output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True\n        )\n\n    def test_torch_relationship_weights_no_remap(self):\n        remap_val = False\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n        train_edges = torch.tensor(train_edges_df.to_numpy())\n\n        num_rows = train_edges.size(0)\n        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges,\n            remap_ids=remap_val,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n            edge_weight_column=3,\n            num_nodes=100,\n            num_rels=10,\n            format=\"pytorch\",\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n 
       expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(\n            output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True\n        )\n\n    def test_pandas_relationship_weights_no_remap(self):\n        remap_val = False\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_file = Path(TMP_TEST_DIR) / Path(\"train_edges_weights.txt\")\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges_file,\n            delim=\" \",\n            remap_ids=remap_val,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n            edge_weight_column=3,\n            num_nodes=100,\n            num_rels=10,\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(\n            output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True\n        )\n\n    def test_torch_relationship_weights_remap(self):\n        remap_val = True\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n        train_edges = torch.tensor(train_edges_df.to_numpy())\n\n        num_rows = train_edges.size(0)\n        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges,\n    
        remap_ids=remap_val,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n            edge_weight_column=3,\n            num_nodes=100,\n            format=\"pytorch\",\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(\n            output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True\n        )\n\n    def test_pandas_relationship_weights_remap(self):\n        remap_val = True\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_file = Path(TMP_TEST_DIR) / Path(\"train_edges_weights.txt\")\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges_file,\n            delim=\" \",\n            remap_ids=remap_val,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n            edge_weight_column=3,\n            num_nodes=100,\n            num_rels=10,\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_output_dir(\n            output_dir=output_dir, expected_stats=expected_stats, dtype=np.int32, remap_ids=remap_val, has_weights=True\n        )\n\n    def test_torch_relationship_weights_remap_partioned(self):\n        num_paritions = 10\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        
train_edges_df = pd.read_csv(Path(TMP_TEST_DIR) / Path(\"train_edges.txt\"), header=None, sep=\" \")\n        train_edges = torch.tensor(train_edges_df.to_numpy())\n\n        num_rows = train_edges.size(0)\n        train_edges = torch.column_stack((train_edges, torch.arange(num_rows)))\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges,\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n            edge_weight_column=3,\n            num_partitions=num_paritions,\n            format=\"pytorch\",\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_partitioned_output_dir(\n            output_dir=output_dir,\n            expected_stats=expected_stats,\n            dtype=np.int32,\n            num_partitions=num_paritions,\n            has_weights=True,\n        )\n\n    def test_pandas_relationship_weights_remap_partioned(self):\n        num_paritions = 10\n        output_dir = Path(TMP_TEST_DIR) / Path(\"test_torch_defaults\")\n        output_dir.mkdir()\n\n        train_edges_file = Path(TMP_TEST_DIR) / Path(\"train_edges_weights.txt\")\n\n        converter = TorchEdgeListConverter(\n            output_dir=output_dir,\n            train_edges=train_edges_file,\n            delim=\" \",\n            src_column=0,\n            dst_column=2,\n            edge_type_column=1,\n            edge_weight_column=3,\n            num_partitions=num_paritions,\n        )\n        converter.convert()\n\n        expected_stats = DatasetConfig()\n        expected_stats.dataset_dir = output_dir.__str__()\n        expected_stats.num_edges = 1000\n        expected_stats.num_nodes = 100\n        
expected_stats.num_relations = 10\n        expected_stats.num_train = 1000\n\n        validate_partitioned_output_dir(\n            output_dir=output_dir,\n            expected_stats=expected_stats,\n            dtype=np.int32,\n            num_partitions=num_paritions,\n            has_weights=True,\n        )\n"
  },
  {
    "path": "test/test_configs/generate_test_configs.py",
    "content": "import itertools\nimport os\nfrom pathlib import Path\nfrom test.python.constants import TESTING_CONFIG_DIR\n\nfrom omegaconf import OmegaConf\n\nfrom marius.tools.configuration.marius_config import MariusConfig\n\n\ndef get_config(model_config_path, storage_config_path, train_config_path, eval_config_path):\n    model_config = OmegaConf.load(model_config_path)\n    storage_config = OmegaConf.load(storage_config_path)\n    train_config = OmegaConf.load(train_config_path)\n    eval_config = OmegaConf.load(eval_config_path)\n\n    base_config = config_from_sub_configs(model_config, storage_config, train_config, eval_config)\n\n    return base_config\n\n\ndef set_dataset_config(base_config, dataset_dir):\n    dataset_config_path = dataset_dir / Path(\"dataset.yaml\")\n    dataset_config = OmegaConf.load(dataset_config_path)\n\n    # the below attributes need not be manually set as they will be automatically retrieved from dataset_config_path\n    dataset_config.num_edges = -1\n    dataset_config.num_nodes = -1\n    dataset_config.num_relations = -1\n    dataset_config.num_train = -1\n    dataset_config.num_valid = -1\n    dataset_config.num_test = -1\n    dataset_config.initialized = False\n\n    base_config.storage.dataset = dataset_config\n\n\ndef config_from_sub_configs(model_config, storage_config, train_config, eval_config):\n    base_config = MariusConfig()\n\n    base_config.model = model_config\n    base_config.storage = storage_config\n    base_config.training = train_config\n    base_config.evaluation = eval_config\n\n    return base_config\n\n\ndef get_cartesian_product_of_configs(config_directory, model_names, storage_names, training_names, evaluation_names):\n    model_paths = []\n    storage_paths = []\n    train_paths = []\n    evaluation_paths = []\n\n    for filename in os.listdir(config_directory / Path(\"model\")):\n        if len(model_names) > 0:\n            for name in model_names:\n                if name == 
filename.split(\".\")[0]:\n                    model_paths.append(config_directory / Path(\"model\") / Path(filename))\n        else:\n            model_paths.append(config_directory / Path(\"model\") / Path(filename))\n\n    for filename in os.listdir(config_directory / Path(\"storage\")):\n        if len(storage_names) > 0:\n            for name in storage_names:\n                if name == filename.split(\".\")[0]:\n                    storage_paths.append(config_directory / Path(\"storage\") / Path(filename))\n        else:\n            storage_paths.append(config_directory / Path(\"storage\") / Path(filename))\n\n    for filename in os.listdir(config_directory / Path(\"training\")):\n        if len(training_names) > 0:\n            for name in training_names:\n                if name == filename.split(\".\")[0]:\n                    train_paths.append(config_directory / Path(\"training\") / Path(filename))\n        else:\n            train_paths.append(config_directory / Path(\"training\") / Path(filename))\n\n    for filename in os.listdir(config_directory / Path(\"evaluation\")):\n        if len(evaluation_names) > 0:\n            for name in evaluation_names:\n                if name == filename.split(\".\")[0]:\n                    evaluation_paths.append(config_directory / Path(\"evaluation\") / Path(filename))\n        else:\n            evaluation_paths.append(config_directory / Path(\"evaluation\") / Path(filename))\n\n    config_paths = itertools.product(model_paths, storage_paths, train_paths, evaluation_paths)\n\n    configs = []\n    config_names = []\n    for config_path in config_paths:\n        config_name = \"M-{}-S-{}-T-{}-E-{}.yaml\".format(\n            config_path[0].__str__().split(\"/\")[-1].split(\".\")[0],\n            config_path[1].__str__().split(\"/\")[-1].split(\".\")[0],\n            config_path[2].__str__().split(\"/\")[-1].split(\".\")[0],\n            config_path[3].__str__().split(\"/\")[-1].split(\".\")[0],\n        )\n       
 print(config_name)\n\n        configs.append(get_config(config_path[0], config_path[1], config_path[2], config_path[3]))\n        config_names.append(config_name)\n\n    return configs, config_names\n\n\ndef get_all_configs_for_dataset(\n    dataset_dir, model_names=[], storage_names=[], training_names=[], evaluation_names=[], task=\"lp\"\n):\n    assert (task == \"lp\") or (task == \"nc\")\n\n    config_directory = Path(TESTING_CONFIG_DIR) / Path(task)\n    configs, config_names = get_cartesian_product_of_configs(\n        config_directory, model_names, storage_names, training_names, evaluation_names\n    )\n\n    for config in configs:\n        set_dataset_config(config, dataset_dir)\n\n    return configs, config_names\n\n\ndef generate_configs_for_dataset(\n    dataset_dir, model_names=[], storage_names=[], training_names=[], evaluation_names=[], task=\"lp\"\n):\n    configs, config_names = get_all_configs_for_dataset(\n        dataset_dir, model_names, storage_names, training_names, evaluation_names, task\n    )\n\n    for i, config in enumerate(configs):\n        OmegaConf.save(config, Path(dataset_dir) / Path(config_names[i]))\n"
  },
  {
    "path": "test/test_configs/lp/evaluation/async.yaml",
    "content": "batch_size: 20\nnegative_sampling:\n  num_chunks: 2\n  negatives_per_positive: 10\n  degree_fraction: 0.0\n  filtered: false\npipeline:\n  sync: false"
  },
  {
    "path": "test/test_configs/lp/evaluation/async_deg.yaml",
    "content": "batch_size: 20\nnegative_sampling:\n  num_chunks: 2\n  negatives_per_positive: 10\n  degree_fraction: 0.5\n  filtered: false\npipeline:\n  sync: false"
  },
  {
    "path": "test/test_configs/lp/evaluation/async_filtered.yaml",
    "content": "batch_size: 20\nnegative_sampling:\n  filtered: true\npipeline:\n  sync: false"
  },
  {
    "path": "test/test_configs/lp/evaluation/sync.yaml",
    "content": "batch_size: 20\nnegative_sampling:\n  num_chunks: 2\n  negatives_per_positive: 10\n  degree_fraction: 0.0\n  filtered: false\npipeline:\n  sync: true"
  },
  {
    "path": "test/test_configs/lp/evaluation/sync_deg.yaml",
    "content": "batch_size: 20\nnegative_sampling:\n  num_chunks: 2\n  negatives_per_positive: 10\n  degree_fraction: 0.5\n  filtered: false\npipeline:\n  sync: true"
  },
  {
    "path": "test/test_configs/lp/evaluation/sync_filtered.yaml",
    "content": "batch_size: 20\nnegative_sampling:\n  filtered: true\npipeline:\n  sync: true"
  },
  {
    "path": "test/test_configs/lp/model/distmult.yaml",
    "content": "learning_task: LINK_PREDICTION\nencoder:\n  layers:\n\n    - - type: EMBEDDING\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: DISTMULT\nloss:\n  type: SOFTMAX_CE\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/lp/model/distmult_feat.yaml",
    "content": "learning_task: LINK_PREDICTION\nencoder:\n  layers:\n\n    - - type: EMBEDDING\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n      - type: FEATURE\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: REDUCTION\n        input_dim: 100\n        ouptut_dim: 10\n        bias: true\n        options:\n          type: LINEAR\n\ndecoder:\n  type: DISTMULT\nloss:\n  type: SOFTMAX_CE\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/lp/model/gat_1_layer.yaml",
    "content": "learning_task: LINK_PREDICTION\nencoder:\n  train_neighbor_sampling:\n    - type: ALL\n      use_incoming_nbrs: true\n      use_outgoing_nbrs: false\n  eval_neighbor_sampling:\n    - type: ALL\n      use_incoming_nbrs: true\n      use_outgoing_nbrs: false\n  layers:\n\n    - - type: EMBEDDING\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GAT\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: DISTMULT\nloss:\n  type: SOFTMAX_CE\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1"
  },
  {
    "path": "test/test_configs/lp/model/gat_3_layer.yaml",
    "content": "learning_task: LINK_PREDICTION\nencoder:\n  train_neighbor_sampling:\n    - type: ALL\n    - type: ALL\n    - type: ALL\n  eval_neighbor_sampling:\n    - type: ALL\n    - type: ALL\n    - type: ALL\n  layers:\n\n    - - type: EMBEDDING\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GAT\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GAT\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GAT\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: DISTMULT\nloss:\n  type: SOFTMAX_CE\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/lp/model/gs_1_layer.yaml",
    "content": "learning_task: LINK_PREDICTION\nencoder:\n  train_neighbor_sampling:\n    - type: ALL\n      use_incoming_nbrs: true\n      use_outgoing_nbrs: true\n  eval_neighbor_sampling:\n    - type: ALL\n      use_incoming_nbrs: true\n      use_outgoing_nbrs: true\n  layers:\n\n    - - type: EMBEDDING\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: DISTMULT\nloss:\n  type: SOFTMAX_CE\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/lp/model/gs_1_layer_feat.yaml",
    "content": "learning_task: LINK_PREDICTION\nencoder:\n  train_neighbor_sampling:\n    - type: ALL\n  eval_neighbor_sampling:\n    - type: ALL\n  layers:\n\n    - - type: EMBEDDING\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n      - type: FEATURE\n        output_dim: 10\n        bias: true\n\n    - - type: REDUCTION\n        input_dim: 100\n        ouptut_dim: 10\n        bias: true\n        options:\n          type: LINEAR\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: DISTMULT\nloss:\n  type: SOFTMAX_CE\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/lp/model/gs_1_layer_uniform.yaml",
    "content": "learning_task: LINK_PREDICTION\nencoder:\n  train_neighbor_sampling:\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n  eval_neighbor_sampling:\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n  layers:\n\n    - - type: EMBEDDING\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: DISTMULT\nloss:\n  type: SOFTMAX_CE\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/lp/model/gs_3_layer.yaml",
    "content": "learning_task: LINK_PREDICTION\nencoder:\n  train_neighbor_sampling:\n    - type: ALL\n    - type: ALL\n    - type: ALL\n  eval_neighbor_sampling:\n    - type: ALL\n    - type: ALL\n    - type: ALL\n  layers:\n\n    - - type: EMBEDDING\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: DISTMULT\nloss:\n  type: SOFTMAX_CE\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/lp/model/gs_3_layer_feat.yaml",
    "content": "learning_task: LINK_PREDICTION\nencoder:\n  train_neighbor_sampling:\n    - type: ALL\n    - type: ALL\n    - type: ALL\n  eval_neighbor_sampling:\n    - type: ALL\n    - type: ALL\n    - type: ALL\n  layers:\n\n    - - type: EMBEDDING\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n      - type: FEATURE\n        output_dim: 10\n        bias: true\n\n    - - type: REDUCTION\n        input_dim: 100\n        ouptut_dim: 10\n        bias: true\n        options:\n          type: LINEAR\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: DISTMULT\nloss:\n  type: SOFTMAX_CE\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/lp/model/gs_3_layer_uniform.yaml",
    "content": "learning_task: LINK_PREDICTION\nencoder:\n  train_neighbor_sampling:\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n  eval_neighbor_sampling:\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n  layers:\n\n    - - type: EMBEDDING\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: DISTMULT\nloss:\n  type: SOFTMAX_CE\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/lp/storage/edges_disk.yaml",
    "content": "device_type: cpu\ndataset: ???\nedges:\n  type: FLAT_FILE\n  options:\n    dtype: int\nembeddings:\n  type: HOST_MEMORY\n  options:\n    dtype: float\nfeatures:\n  type: HOST_MEMORY\n  options:\n    dtype: float\nsave_model: true"
  },
  {
    "path": "test/test_configs/lp/storage/in_memory.yaml",
    "content": "device_type: cpu\ndataset: ???\nedges:\n  type: HOST_MEMORY\n  options:\n    dtype: int\nembeddings:\n  type: HOST_MEMORY\n  options:\n    dtype: float\nfeatures:\n  type: HOST_MEMORY\n  options:\n    dtype: float\nsave_model: true"
  },
  {
    "path": "test/test_configs/lp/storage/part_buffer.yaml",
    "content": "device_type: cpu\ndataset: ???\nedges:\n  type: FLAT_FILE\n  options:\n    dtype: int\nembeddings:\n  type: PARTITION_BUFFER\n  options:\n    num_partitions: 8\n    buffer_capacity: 4\n    prefetching: false\n    fine_to_coarse_ratio: 2\n    num_cache_partitions: 0\n    edge_bucket_ordering: COMET\n    randomly_assign_edge_buckets: true\nfeatures:\n  type: PARTITION_BUFFER\n  options:\n    num_partitions: 8\n    buffer_capacity: 4\n    prefetching: false\n    fine_to_coarse_ratio: 2\n    num_cache_partitions: 0\n    edge_bucket_ordering: COMET\n    randomly_assign_edge_buckets: true\nsave_model: true"
  },
  {
    "path": "test/test_configs/lp/training/async.yaml",
    "content": "batch_size: 100\nnum_epochs: 2\nnegative_sampling:\n  num_chunks: 5\n  negatives_per_positive: 20\n  degree_fraction: 0.0\n  filtered: false\npipeline:\n  sync: false"
  },
  {
    "path": "test/test_configs/lp/training/async_deg.yaml",
    "content": "batch_size: 100\nnum_epochs: 2\nnegative_sampling:\n  num_chunks: 5\n  negatives_per_positive: 20\n  degree_fraction: 0.5\n  filtered: false\npipeline:\n  sync: false"
  },
  {
    "path": "test/test_configs/lp/training/async_filtered.yaml",
    "content": "batch_size: 100\nnum_epochs: 2\nnegative_sampling:\n  filtered: true\npipeline:\n  sync: false"
  },
  {
    "path": "test/test_configs/lp/training/sync.yaml",
    "content": "batch_size: 100\nnum_epochs: 2\nnegative_sampling:\n  num_chunks: 5\n  negatives_per_positive: 20\n  degree_fraction: 0.0\n  filtered: false\npipeline:\n  sync: true"
  },
  {
    "path": "test/test_configs/lp/training/sync_deg.yaml",
    "content": "batch_size: 100\nnum_epochs: 2\nnegative_sampling:\n  num_chunks: 5\n  negatives_per_positive: 20\n  degree_fraction: 0.5\n  filtered: false\npipeline:\n  sync: false"
  },
  {
    "path": "test/test_configs/lp/training/sync_filtered.yaml",
    "content": "batch_size: 100\nnum_epochs: 2\nnegative_sampling:\n  filtered: true\npipeline:\n  sync: true"
  },
  {
    "path": "test/test_configs/nc/evaluation/async.yaml",
    "content": "batch_size: 10\npipeline:\n  sync: false"
  },
  {
    "path": "test/test_configs/nc/evaluation/sync.yaml",
    "content": "batch_size: 10\npipeline:\n  sync: true"
  },
  {
    "path": "test/test_configs/nc/model/gat_1_layer.yaml",
    "content": "learning_task: NODE_CLASSIFICATION\nencoder:\n  train_neighbor_sampling:\n    - type: ALL\n      use_incoming_nbrs: true\n      use_outgoing_nbrs: false\n  eval_neighbor_sampling:\n    - type: ALL\n      use_incoming_nbrs: true\n      use_outgoing_nbrs: false\n  layers:\n\n    - - type: FEATURE\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GAT\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: NODE\nloss:\n  type: CROSS_ENTROPY\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1"
  },
  {
    "path": "test/test_configs/nc/model/gat_3_layer.yaml",
    "content": "learning_task: NODE_CLASSIFICATION\nencoder:\n  train_neighbor_sampling:\n    - type: ALL\n    - type: ALL\n    - type: ALL\n  eval_neighbor_sampling:\n    - type: ALL\n    - type: ALL\n    - type: ALL\n  layers:\n\n    - - type: FEATURE\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GAT\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GAT\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GAT\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: NODE\nloss:\n  type: CROSS_ENTROPY\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/nc/model/gs_1_layer.yaml",
    "content": "learning_task: NODE_CLASSIFICATION\nencoder:\n  train_neighbor_sampling:\n    - type: ALL\n      use_incoming_nbrs: true\n      use_outgoing_nbrs: true\n  eval_neighbor_sampling:\n    - type: ALL\n      use_incoming_nbrs: true\n      use_outgoing_nbrs: true\n  layers:\n\n    - - type: FEATURE\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: NODE\nloss:\n  type: CROSS_ENTROPY\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/nc/model/gs_1_layer_emb.yaml",
    "content": "learning_task: NODE_CLASSIFICATION\nencoder:\n  train_neighbor_sampling:\n    - type: ALL\n  eval_neighbor_sampling:\n    - type: ALL\n  layers:\n\n    - - type: EMBEDDING\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n      - type: FEATURE\n        output_dim: 10\n        bias: true\n\n    - - type: REDUCTION\n        input_dim: 20\n        output_dim: 10\n        bias: true\n        options:\n          type: LINEAR\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: NODE\nloss:\n  type: CROSS_ENTROPY\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/nc/model/gs_1_layer_uniform.yaml",
    "content": "learning_task: NODE_CLASSIFICATION\nencoder:\n  train_neighbor_sampling:\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n  eval_neighbor_sampling:\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n  layers:\n\n    - - type: FEATURE\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: NODE\nloss:\n  type: CROSS_ENTROPY\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/nc/model/gs_3_layer.yaml",
    "content": "learning_task: NODE_CLASSIFICATION\nencoder:\n  train_neighbor_sampling:\n    - type: ALL\n    - type: ALL\n    - type: ALL\n  eval_neighbor_sampling:\n    - type: ALL\n    - type: ALL\n    - type: ALL\n  layers:\n\n    - - type: FEATURE\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: NODE\nloss:\n  type: CROSS_ENTROPY\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/nc/model/gs_3_layer_emb.yaml",
    "content": "learning_task: NODE_CLASSIFICATION\nencoder:\n  train_neighbor_sampling:\n    - type: ALL\n    - type: ALL\n    - type: ALL\n  eval_neighbor_sampling:\n    - type: ALL\n    - type: ALL\n    - type: ALL\n  layers:\n\n    - - type: EMBEDDING\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n      - type: FEATURE\n        output_dim: 10\n        bias: true\n\n    - - type: REDUCTION\n        input_dim: 20\n        output_dim: 10\n        bias: true\n        options:\n          type: LINEAR\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: NODE\nloss:\n  type: CROSS_ENTROPY\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/nc/model/gs_3_layer_uniform.yaml",
    "content": "learning_task: NODE_CLASSIFICATION\nencoder:\n  train_neighbor_sampling:\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n  eval_neighbor_sampling:\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n    - type: UNIFORM\n      options:\n        max_neighbors: 10\n  layers:\n\n    - - type: FEATURE\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\n    - - type: GNN\n        options:\n          type: GRAPH_SAGE\n          aggregator: MEAN\n        input_dim: 10\n        output_dim: 10\n        bias: true\n        init:\n          type: GLOROT_NORMAL\n\ndecoder:\n  type: NODE\nloss:\n  type: CROSS_ENTROPY\n  options:\n    reduction: SUM\ndense_optimizer:\n  type: ADAM\n  options:\n    learning_rate: 0.01\nsparse_optimizer:\n  type: ADAGRAD\n  options:\n    learning_rate: 0.1\n"
  },
  {
    "path": "test/test_configs/nc/storage/in_memory.yaml",
    "content": "device_type: cpu\ndataset: ???\nedges:\n  type: HOST_MEMORY\n  options:\n    dtype: int\nembeddings:\n  type: HOST_MEMORY\n  options:\n    dtype: float\nfeatures:\n  type: HOST_MEMORY\n  options:\n    dtype: float"
  },
  {
    "path": "test/test_configs/nc/storage/part_buffer.yaml",
    "content": "device_type: cpu\ndataset: ???\nedges:\n  type: FLAT_FILE\n  options:\n    dtype: int\nembeddings:\n  type: PARTITION_BUFFER\n  options:\n    num_partitions: 8\n    buffer_capacity: 4\n    prefetching: false\n    fine_to_coarse_ratio: 2\n    num_cache_partitions: 0\n    node_partition_ordering: DISPERSED\n    randomly_assign_edge_buckets: true\nfeatures:\n  type: PARTITION_BUFFER\n  options:\n    num_partitions: 8\n    buffer_capacity: 4\n    prefetching: false\n    fine_to_coarse_ratio: 2\n    num_cache_partitions: 0\n    node_partition_ordering: DISPERSED\n    randomly_assign_edge_buckets: true"
  },
  {
    "path": "test/test_configs/nc/training/async.yaml",
    "content": "batch_size: 10\nnum_epochs: 2\npipeline:\n  sync: false"
  },
  {
    "path": "test/test_configs/nc/training/sync.yaml",
    "content": "batch_size: 10\nnum_epochs: 2\npipeline:\n  sync: true"
  },
  {
    "path": "test/test_data/generate.py",
    "content": "import os\nfrom pathlib import Path\n\nimport numpy as np\nimport pandas as pd\nfrom omegaconf import OmegaConf\n\nfrom marius.tools.configuration.constants import PathConstants\nfrom marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter, split_edges\n\n\ndef get_random_graph(num_nodes, num_edges, num_rels=1):\n    src_nodes = np.random.randint(0, num_nodes, size=[num_edges])\n    dst_nodes = np.random.randint(0, num_nodes, size=[num_edges])\n\n    if num_rels > 1:\n        rels = np.random.randint(0, num_rels, size=[num_edges])\n        edges = np.stack([src_nodes, rels, dst_nodes], axis=1)\n    else:\n        edges = np.stack([src_nodes, dst_nodes], axis=1)\n\n    return edges\n\n\ndef generate_features(num_nodes, feature_dim):\n    return np.random.randn(num_nodes, feature_dim).astype(np.float32)\n\n\ndef generate_labels(num_nodes, num_classes):\n    return np.random.randint(0, num_classes - 1, size=[num_nodes]).astype(np.int32)\n\n\ndef shuffle_with_map(values, node_mapping):\n    random_map = node_mapping[:, 1].astype(values.dtype)\n    random_map_argsort = np.argsort(random_map)\n    return values[random_map_argsort]\n\n\ndef apply_mapping(values, node_mapping):\n    random_map = node_mapping[:, 1].astype(values.dtype)\n    return random_map[values]\n\n\ndef remap_nc(output_dir, train_nodes, labels, num_nodes, valid_nodes=None, test_nodes=None, features=None):\n    node_mapping = np.genfromtxt(output_dir / Path(PathConstants.node_mapping_path), delimiter=\",\")\n\n    train_nodes = apply_mapping(train_nodes, node_mapping)\n\n    if valid_nodes is not None:\n        valid_nodes = apply_mapping(valid_nodes, node_mapping)\n\n    if test_nodes is not None:\n        test_nodes = apply_mapping(test_nodes, node_mapping)\n\n    if features is not None:\n        features = shuffle_with_map(features, node_mapping)\n\n    if labels.shape[0] != num_nodes:\n        labels = np.concatenate((labels, -np.ones([num_nodes - 
labels.shape[0]], dtype=np.int32)))\n\n    labels = shuffle_with_map(labels, node_mapping)\n\n    return train_nodes, labels, valid_nodes, test_nodes, features\n\n\ndef remap_lp(output_dir, features=None):\n    node_mapping = np.genfromtxt(output_dir / Path(PathConstants.node_mapping_path), delimiter=\",\")\n    features = shuffle_with_map(features, node_mapping)\n\n    return features\n\n\ndef generate_random_dataset_nc(\n    output_dir,\n    num_nodes,\n    num_edges,\n    num_rels=1,\n    splits=None,\n    num_partitions=1,\n    partitioned_eval=False,\n    sequential_train_nodes=False,\n    remap_ids=True,\n    feature_dim=-1,\n    num_classes=10,\n):\n    edges = get_random_graph(num_nodes, num_edges, num_rels)\n    edges_df = pd.DataFrame(data=edges)\n\n    src_col, dst_col, edge_type_col = None, None, None\n    if edges.shape[1] == 3:\n        src_col, dst_col, edge_type_col = 0, 2, 1\n    else:\n        src_col, dst_col = 0, 1\n\n    raw_edges_filename = output_dir / Path(\"raw_edges.csv\")\n    edges_df.to_csv(raw_edges_filename, \",\", header=False, index=False)\n\n    all_nodes = np.arange(0, num_nodes, dtype=np.int32)\n    train_nodes = all_nodes\n\n    valid_nodes = None\n    test_nodes = None\n    if splits is not None:\n        train_nodes, train_weights, valid_nodes, valid_weights, test_nodes, test_weights = split_edges(\n            all_nodes, None, splits\n        )\n\n    converter = TorchEdgeListConverter(\n        output_dir,\n        train_edges=Path(raw_edges_filename),\n        delim=\",\",\n        remap_ids=remap_ids,\n        num_partitions=num_partitions,\n        partitioned_evaluation=partitioned_eval,\n        sequential_train_nodes=sequential_train_nodes,\n        known_node_ids=[train_nodes, valid_nodes, test_nodes],\n        format=\"CSV\",\n        src_column=src_col,\n        dst_column=dst_col,\n        edge_type_column=edge_type_col,\n    )\n\n    dataset_stats = converter.convert()\n\n    features = None\n    if feature_dim != 
-1:\n        features = generate_features(num_nodes, feature_dim)\n\n    labels = generate_labels(num_nodes, num_classes)\n\n    train_nodes, labels, valid_nodes, test_nodes, features = remap_nc(\n        output_dir, train_nodes, labels, num_nodes, valid_nodes, test_nodes, features\n    )\n\n    if features is not None:\n        node_features_file = output_dir / Path(PathConstants.node_features_path)\n        with open(node_features_file, \"wb\") as f:\n            f.write(bytes(features))\n\n    labels_file = output_dir / Path(PathConstants.labels_path)\n    with open(labels_file, \"wb\") as f:\n        f.write(bytes(labels))\n\n    if train_nodes is not None:\n        train_nodes_file = output_dir / Path(PathConstants.train_nodes_path)\n        with open(train_nodes_file, \"wb\") as f:\n            f.write(bytes(train_nodes))\n\n    if valid_nodes is not None:\n        valid_nodes_file = output_dir / Path(PathConstants.valid_nodes_path)\n        with open(valid_nodes_file, \"wb\") as f:\n            f.write(bytes(valid_nodes))\n\n    if test_nodes is not None:\n        test_nodes_file = output_dir / Path(PathConstants.test_nodes_path)\n        with open(test_nodes_file, \"wb\") as f:\n            f.write(bytes(test_nodes))\n\n    # update dataset yaml\n    dataset_stats.num_train = train_nodes.shape[0]\n\n    if valid_nodes is not None:\n        dataset_stats.num_valid = valid_nodes.shape[0]\n    else:\n        dataset_stats.num_valid = -1\n\n    if test_nodes is not None:\n        dataset_stats.num_test = test_nodes.shape[0]\n    else:\n        dataset_stats.num_test = -1\n\n    if features is not None:\n        dataset_stats.node_feature_dim = features.shape[1]\n    else:\n        dataset_stats.node_feature_dim = -1\n\n    dataset_stats.num_classes = num_classes\n\n    dataset_stats.num_nodes = num_nodes\n\n    with open(output_dir / Path(\"dataset.yaml\"), \"w\") as f:\n        yaml_file = OmegaConf.to_yaml(dataset_stats)\n        
f.writelines(yaml_file)\n\n\ndef generate_random_dataset_lp(\n    output_dir,\n    num_nodes,\n    num_edges,\n    num_rels=1,\n    splits=None,\n    num_partitions=1,\n    partitioned_eval=False,\n    sequential_train_nodes=False,\n    remap_ids=True,\n    feature_dim=-1,\n):\n    edges = get_random_graph(num_nodes, num_edges, num_rels)\n    edges_df = pd.DataFrame(data=edges)\n\n    src_col, dst_col, edge_type_col = None, None, None\n    if edges.shape[1] == 3:\n        src_col, dst_col, edge_type_col = 0, 2, 1\n    else:\n        src_col, dst_col = 0, 1\n\n    raw_edges_filename = output_dir / Path(\"raw_edges.csv\")\n\n    edges_df.to_csv(raw_edges_filename, \",\", header=False, index=False)\n\n    converter = TorchEdgeListConverter(\n        output_dir,\n        train_edges=raw_edges_filename,\n        delim=\",\",\n        splits=splits,\n        num_partitions=num_partitions,\n        remap_ids=remap_ids,\n        partitioned_evaluation=partitioned_eval,\n        sequential_train_nodes=sequential_train_nodes,\n        format=\"CSV\",\n        src_column=src_col,\n        dst_column=dst_col,\n        edge_type_column=edge_type_col,\n    )\n\n    dataset_stats = converter.convert()\n\n    if feature_dim != -1:\n        features = generate_features(num_nodes, feature_dim)\n\n        if remap_ids:\n            features = remap_lp(output_dir, features)\n\n        node_features_file = output_dir / Path(PathConstants.node_features_path)\n        with open(node_features_file, \"wb\") as f:\n            f.write(bytes(features))\n\n        dataset_stats.node_feature_dim = feature_dim\n        with open(output_dir / Path(\"dataset.yaml\"), \"w\") as f:\n            yaml_file = OmegaConf.to_yaml(dataset_stats)\n            f.writelines(yaml_file)\n\n\ndef generate_random_dataset(\n    output_dir,\n    num_nodes,\n    num_edges,\n    num_rels=1,\n    splits=None,\n    num_partitions=1,\n    partitioned_eval=False,\n    sequential_train_nodes=False,\n    remap_ids=True,\n 
   feature_dim=-1,\n    num_classes=10,\n    task=\"lp\",\n):\n    os.makedirs(output_dir, exist_ok=True)\n\n    if task == \"lp\":\n        generate_random_dataset_lp(\n            output_dir,\n            num_nodes,\n            num_edges,\n            num_rels,\n            splits,\n            num_partitions,\n            partitioned_eval,\n            sequential_train_nodes,\n            remap_ids,\n            feature_dim,\n        )\n    elif task == \"nc\":\n        generate_random_dataset_nc(\n            output_dir,\n            num_nodes,\n            num_edges,\n            num_rels,\n            splits,\n            num_partitions,\n            partitioned_eval,\n            sequential_train_nodes,\n            remap_ids,\n            feature_dim,\n            num_classes,\n        )\n    else:\n        raise RuntimeError(\"Unsupported dataset type for generator.\")\n"
  },
  {
    "path": "test/test_data/test_edges.txt",
    "content": "43 8 81\n55 7 34\n93 2 75\n24 1 13\n92 5 16\n6 7 17\n93 9 91\n94 2 21\n29 5 79\n32 5 35\n47 8 62\n1 2 23\n23 5 18\n81 1 78\n12 6 7\n90 6 16\n54 5 66\n96 2 43\n88 1 94\n18 7 78\n68 8 4\n49 1 3\n1 8 61\n5 6 16\n79 6 82\n29 0 42\n36 0 55\n14 8 19\n41 1 69\n71 0 23\n14 8 90\n78 4 14\n32 1 4\n46 5 30\n61 0 99\n79 0 11\n30 2 87\n17 7 61\n93 1 72\n89 1 59\n80 4 25\n3 0 32\n80 9 77\n86 3 69\n1 7 18\n95 7 67\n80 4 74\n81 8 17\n3 3 35\n99 6 74\n36 9 16\n70 8 19\n92 8 99\n47 4 4\n32 8 51\n4 5 91\n1 5 88\n54 8 35\n34 0 78\n78 8 43\n65 6 44\n67 0 42\n89 4 69\n42 6 31\n70 2 61\n23 8 59\n90 2 43\n92 7 23\n87 3 90\n50 0 89\n80 0 64\n6 9 53\n97 7 45\n41 0 40\n28 2 14\n24 4 39\n9 1 71\n28 7 11\n43 2 53\n61 4 48\n0 7 22\n93 1 94\n41 7 8\n70 8 67\n96 6 21\n88 3 16\n36 6 68\n27 8 3\n97 4 33\n51 3 29\n78 1 92\n18 1 85\n88 1 84\n72 1 15\n32 2 96\n36 0 64\n34 2 50\n71 3 61\n11 5 96\n42 1 95"
  },
  {
    "path": "test/test_data/train_edges.txt",
    "content": "80 6 73\n83 8 2\n50 8 66\n64 5 42\n31 5 91\n40 8 92\n18 2 32\n21 5 64\n47 8 19\n71 2 71\n12 5 11\n76 6 58\n12 6 24\n69 9 11\n12 3 55\n77 4 14\n12 8 8\n29 5 14\n46 8 8\n30 0 60\n46 6 7\n51 6 69\n0 2 52\n81 9 26\n50 0 78\n59 9 93\n62 5 12\n93 0 14\n72 7 31\n46 2 12\n44 3 67\n45 1 46\n0 2 56\n68 7 49\n51 2 21\n66 9 99\n93 2 74\n59 9 9\n12 6 3\n26 6 11\n8 7 13\n46 8 70\n50 8 2\n10 8 5\n20 1 3\n43 3 46\n51 5 70\n73 4 74\n95 7 50\n59 8 12\n46 4 99\n20 3 55\n39 3 24\n28 8 8\n31 5 22\n84 3 95\n48 3 50\n81 4 10\n66 7 4\n15 2 78\n68 6 23\n55 0 0\n58 0 48\n75 4 50\n9 4 20\n48 9 87\n97 3 94\n44 9 83\n87 8 37\n74 6 33\n10 9 8\n81 3 18\n42 0 7\n74 3 37\n37 7 33\n35 7 47\n19 4 8\n19 4 78\n87 4 2\n39 2 21\n79 8 74\n21 1 24\n25 5 33\n24 2 33\n12 6 98\n47 8 6\n30 2 94\n61 1 78\n80 0 83\n91 1 97\n24 6 5\n32 8 82\n40 7 34\n68 6 98\n76 8 19\n90 3 40\n90 1 77\n11 4 49\n10 3 82\n39 9 2\n15 0 85\n85 3 81\n67 2 80\n0 8 58\n77 9 48\n93 6 20\n67 4 62\n51 9 36\n74 3 76\n7 4 94\n23 9 46\n2 5 32\n48 7 49\n35 6 19\n52 2 33\n31 1 2\n54 3 26\n63 3 85\n40 1 43\n57 7 51\n74 3 59\n11 1 82\n13 9 23\n70 5 83\n6 2 25\n86 7 59\n71 0 62\n77 0 82\n63 8 88\n4 7 10\n36 6 73\n77 5 58\n6 2 4\n89 8 84\n8 8 80\n27 3 32\n22 1 96\n58 0 45\n62 8 19\n10 4 67\n5 7 21\n18 7 3\n59 0 96\n17 6 49\n82 3 39\n41 2 24\n43 0 22\n4 5 79\n76 7 29\n13 7 3\n2 9 52\n65 9 37\n46 1 65\n72 0 67\n42 8 83\n92 3 72\n46 4 97\n7 1 35\n10 5 23\n39 4 28\n78 3 7\n23 0 94\n86 1 22\n13 6 47\n15 4 8\n63 4 73\n63 7 54\n51 8 22\n74 7 90\n55 9 68\n55 8 89\n95 4 86\n70 8 34\n11 1 42\n74 8 32\n90 9 33\n25 8 65\n60 1 59\n34 9 45\n59 8 53\n1 2 75\n8 5 63\n79 9 30\n21 9 32\n4 5 2\n40 1 94\n3 2 20\n20 5 11\n52 9 77\n60 4 38\n22 8 68\n64 4 26\n44 7 32\n82 7 62\n58 8 55\n7 3 18\n15 6 53\n21 6 62\n99 0 22\n37 1 51\n1 6 46\n68 5 78\n34 0 92\n9 4 41\n8 5 46\n43 1 87\n96 5 78\n84 7 43\n72 3 60\n59 7 57\n28 0 83\n93 5 34\n78 2 36\n15 2 89\n68 3 71\n51 1 26\n67 2 67\n68 2 79\n85 3 66\n68 3 74\n21 3 28\n25 8 87\n82 3 67\n36 5 2\n38 9 12\n30 1 
25\n89 7 45\n31 1 7\n22 8 72\n30 4 56\n14 7 60\n26 4 74\n74 0 1\n42 3 70\n91 0 85\n74 5 87\n83 0 0\n14 0 33\n48 4 18\n47 7 3\n34 8 74\n91 7 3\n13 6 56\n5 6 19\n43 5 80\n45 5 68\n41 2 29\n88 3 83\n39 4 42\n31 1 4\n51 6 13\n49 0 59\n0 0 37\n28 6 41\n58 0 94\n86 1 86\n96 0 22\n11 7 91\n61 2 5\n93 6 55\n17 5 63\n47 2 17\n93 6 42\n96 5 4\n73 1 35\n41 6 46\n8 3 69\n5 7 9\n38 3 27\n7 9 61\n10 9 75\n55 9 37\n53 1 18\n9 8 19\n58 1 56\n10 7 90\n15 2 13\n47 3 45\n74 6 60\n38 5 40\n32 4 30\n9 2 74\n85 5 37\n74 9 13\n4 5 37\n17 5 20\n88 8 11\n5 5 70\n71 2 74\n88 7 4\n71 4 89\n50 7 50\n3 2 77\n8 6 83\n30 9 74\n87 7 3\n58 3 32\n48 4 1\n93 5 99\n15 4 48\n59 6 18\n13 5 14\n42 0 4\n97 0 55\n41 7 7\n45 1 70\n47 1 49\n72 9 73\n73 6 18\n12 4 57\n65 6 2\n7 9 52\n76 3 78\n60 4 70\n69 2 17\n65 9 25\n44 7 7\n59 9 15\n39 4 7\n91 9 26\n82 9 51\n70 2 28\n29 3 38\n52 9 35\n22 5 83\n5 7 5\n61 7 98\n12 9 65\n44 7 89\n62 9 6\n87 4 26\n66 4 10\n84 9 49\n68 9 39\n56 0 52\n26 6 22\n42 6 64\n61 9 90\n78 5 39\n71 7 19\n1 0 89\n87 4 23\n23 0 52\n94 4 57\n7 0 85\n98 4 89\n87 7 39\n94 8 4\n45 1 93\n99 8 45\n21 1 79\n65 9 97\n85 5 14\n45 0 65\n41 5 12\n58 3 27\n88 4 86\n13 1 8\n71 4 39\n66 2 22\n89 6 53\n13 7 66\n61 5 91\n99 0 73\n76 3 3\n7 3 51\n61 6 93\n63 0 13\n33 7 96\n6 2 69\n68 2 65\n76 4 9\n66 0 37\n4 4 63\n76 2 26\n28 5 63\n92 9 82\n1 1 49\n43 5 20\n34 0 18\n38 7 2\n74 3 72\n71 5 76\n53 8 58\n61 7 45\n57 9 55\n79 7 87\n55 5 95\n10 1 54\n83 1 32\n74 6 61\n50 1 1\n89 5 87\n54 7 40\n83 7 48\n20 1 76\n57 2 80\n18 7 54\n56 2 13\n9 4 15\n76 7 48\n20 8 29\n34 3 95\n80 0 85\n79 4 17\n94 2 23\n46 2 94\n13 8 70\n31 2 28\n63 8 49\n83 2 97\n51 6 28\n64 0 5\n19 9 52\n69 0 27\n80 7 4\n39 9 81\n98 9 82\n28 9 81\n73 9 58\n68 7 40\n72 4 48\n9 2 65\n34 3 35\n62 0 3\n73 8 54\n13 2 38\n50 0 29\n81 2 96\n48 3 4\n58 5 97\n22 1 91\n41 7 14\n47 1 0\n44 8 58\n77 6 92\n65 6 73\n8 8 61\n74 0 2\n21 0 83\n80 9 92\n53 0 34\n85 8 55\n53 3 83\n32 6 33\n52 3 14\n34 1 14\n45 0 55\n93 5 79\n33 9 65\n79 7 27\n5 9 4\n99 7 26\n26 2 
78\n36 4 9\n56 6 92\n82 7 21\n82 9 46\n99 2 90\n57 6 25\n97 4 4\n66 7 53\n79 3 23\n56 5 16\n23 8 88\n61 9 36\n27 1 51\n7 1 93\n27 7 38\n15 1 60\n83 1 5\n58 2 6\n14 4 95\n33 3 90\n45 8 88\n96 5 24\n42 5 94\n46 6 80\n31 2 65\n59 6 4\n16 4 13\n10 2 41\n81 3 73\n83 0 68\n11 0 26\n52 2 11\n75 3 81\n89 5 29\n75 9 66\n87 4 15\n73 3 10\n4 9 67\n76 2 35\n15 0 43\n37 5 93\n37 2 55\n61 4 12\n2 2 81\n4 0 69\n1 8 95\n7 4 72\n9 1 16\n25 8 88\n8 2 74\n65 3 30\n83 3 67\n42 4 1\n36 3 30\n19 1 23\n76 5 90\n83 8 13\n31 6 79\n87 6 36\n7 1 74\n0 6 69\n30 1 52\n57 0 89\n0 2 62\n55 8 25\n28 8 13\n50 9 20\n44 1 33\n48 2 77\n93 5 56\n29 6 97\n93 3 21\n4 2 94\n26 7 43\n20 0 28\n76 6 63\n15 5 66\n59 1 60\n29 4 7\n41 7 27\n40 4 97\n10 2 43\n44 6 76\n73 9 38\n88 4 89\n44 9 21\n73 9 17\n8 5 21\n9 0 85\n84 0 48\n36 3 89\n58 2 25\n27 5 5\n13 1 90\n50 3 51\n3 8 41\n79 3 69\n73 5 75\n71 6 32\n95 4 65\n65 0 98\n12 1 46\n93 8 60\n81 7 95\n48 5 30\n8 8 14\n83 1 47\n38 8 37\n58 7 12\n52 1 89\n86 0 0\n36 1 69\n20 0 56\n71 3 2\n94 6 92\n20 7 14\n53 2 1\n50 2 77\n91 6 57\n28 1 15\n26 9 97\n52 5 73\n19 7 32\n5 7 63\n27 7 73\n5 7 13\n48 9 89\n13 5 84\n48 8 11\n12 5 66\n13 8 39\n10 5 35\n30 0 79\n41 8 79\n72 9 70\n82 2 93\n49 9 5\n85 7 48\n95 4 22\n58 6 7\n45 5 87\n81 8 46\n69 7 99\n34 0 29\n57 3 57\n65 0 84\n29 3 78\n12 4 10\n93 7 5\n74 9 99\n53 0 77\n26 3 87\n62 0 99\n12 3 73\n58 3 92\n42 7 46\n98 7 15\n33 5 82\n51 3 66\n39 0 18\n23 0 14\n64 8 22\n31 9 42\n96 0 91\n73 0 21\n69 5 15\n46 7 47\n82 6 87\n96 3 79\n1 8 69\n31 7 5\n16 3 90\n45 7 94\n58 2 82\n51 0 44\n43 7 34\n2 3 26\n99 1 48\n17 8 45\n37 1 38\n12 5 81\n79 9 35\n69 3 76\n13 8 21\n8 5 67\n41 5 30\n74 2 53\n56 9 70\n86 6 8\n47 8 44\n46 9 82\n0 4 14\n80 1 47\n20 8 18\n83 2 22\n75 9 82\n71 8 55\n0 5 46\n93 7 11\n65 3 22\n26 8 88\n4 8 18\n23 5 6\n32 6 22\n26 3 94\n40 2 16\n4 0 77\n82 2 71\n2 8 74\n90 0 9\n92 4 98\n48 8 44\n47 2 53\n58 9 2\n97 9 12\n5 5 67\n24 9 56\n99 2 85\n19 1 14\n88 2 47\n95 2 49\n14 6 57\n56 7 94\n84 5 31\n5 6 96\n94 0 0\n33 0 
38\n24 0 83\n77 5 62\n73 2 28\n53 4 21\n4 0 46\n30 5 34\n9 6 4\n11 3 31\n1 1 3\n86 5 42\n31 1 13\n73 4 13\n36 9 13\n27 4 2\n5 2 48\n60 9 19\n96 2 52\n69 9 96\n17 2 2\n73 8 67\n71 9 58\n31 1 54\n38 5 82\n3 0 67\n69 3 25\n50 6 98\n93 9 4\n48 7 47\n19 3 13\n40 5 77\n21 2 42\n42 1 23\n14 3 29\n42 4 38\n76 0 34\n85 6 0\n91 1 79\n75 8 58\n60 1 44\n29 2 4\n88 0 37\n53 8 28\n88 8 10\n54 6 24\n25 6 56\n26 8 79\n76 2 87\n36 9 84\n38 3 68\n84 7 50\n60 6 84\n60 3 24\n86 3 49\n52 7 56\n59 1 77\n26 4 19\n92 8 94\n18 3 6\n40 2 56\n38 2 49\n60 6 11\n35 9 30\n4 9 17\n24 5 51\n33 5 2\n3 7 82\n99 8 57\n61 9 28\n11 7 28\n31 6 73\n67 4 68\n43 5 56\n49 6 57\n78 2 87\n94 6 93\n85 2 47\n65 1 99\n98 1 63\n47 3 2\n50 8 4\n42 5 30\n77 0 85\n67 9 65\n26 3 65\n59 1 24\n36 0 76\n68 3 95\n34 6 96\n61 5 7\n44 0 59\n30 7 15\n81 2 14\n78 4 30\n20 3 65\n85 6 42\n41 7 43\n51 2 6\n26 7 25\n92 5 49\n90 0 61\n11 8 15\n77 2 31\n30 9 48\n88 9 93\n90 5 70\n57 5 17\n18 9 23\n56 2 82\n25 7 34\n26 1 9\n91 9 30\n49 8 99\n96 8 88\n93 2 65\n36 8 67\n40 5 76\n8 2 31\n92 4 66\n92 4 28\n13 2 73\n4 1 30\n83 4 6\n96 0 3\n12 9 45\n85 5 29\n34 0 39\n51 7 97\n3 9 85\n19 5 73\n92 2 38\n51 5 83\n71 9 79\n83 4 60\n62 8 77\n0 9 32\n70 7 95\n72 6 0\n69 4 95\n3 1 43\n62 9 20\n76 9 85\n84 4 79\n21 1 3\n20 5 83\n91 2 22\n83 3 21\n75 6 25\n56 1 74\n31 2 30\n66 8 3\n19 9 37\n19 5 11\n81 5 93\n68 4 38\n37 2 39\n56 8 97\n82 5 58\n81 2 65\n98 5 40\n78 6 53\n18 5 45\n42 9 29\n75 9 93\n99 8 14\n97 7 35\n33 4 41\n36 8 85\n42 3 54\n58 7 50\n3 7 53\n64 3 80\n0 7 23\n98 5 30\n71 8 86\n37 8 11\n90 2 12\n5 9 41\n54 9 58\n14 4 96\n16 5 97\n1 9 15\n41 4 9\n32 5 17\n96 7 71\n83 4 61\n21 3 81\n28 9 31\n96 8 39\n90 5 46\n65 6 63\n50 4 7\n43 7 21\n23 9 76\n54 0 47\n39 8 11\n71 4 90\n47 8 99\n46 5 71\n90 4 57\n81 4 89\n43 1 90\n32 1 72\n0 4 70\n47 5 34\n43 1 28\n13 1 69\n49 4 9\n36 7 38\n94 0 24\n64 4 11\n53 7 12\n17 5 12\n96 2 69\n99 7 75\n70 4 85\n93 6 64\n61 7 2\n47 2 50\n50 1 58\n3 4 18\n41 2 31\n45 2 49\n98 2 83\n88 2 40\n34 2 59\n86 2 99\n49 
4 28\n20 0 24\n98 0 0\n51 4 78\n66 8 50\n37 2 77\n62 5 53\n97 1 20\n84 2 15\n48 3 95\n18 4 17\n20 5 9\n56 0 24\n90 8 64\n13 7 5\n80 5 19\n49 1 33\n20 2 12\n92 8 4\n25 7 28\n47 7 24\n84 4 61\n2 7 84\n0 0 25\n13 9 62\n17 4 4\n1 0 96\n59 6 6\n50 5 76\n69 1 60\n64 0 82\n37 0 96\n57 0 77\n60 5 89\n83 3 1\n23 5 86\n54 5 87\n83 8 76\n12 4 15\n13 6 86\n89 7 97\n12 8 2\n26 0 13\n64 4 48\n3 1 12\n86 1 68\n78 8 4\n96 3 14\n64 7 71\n51 7 72\n66 5 73\n86 4 17\n1 1 82\n91 9 71\n50 5 88\n60 6 81\n57 5 45\n30 7 6\n50 1 11\n84 7 30\n66 6 86\n39 4 47\n29 8 1\n82 7 30\n82 2 54\n35 3 74\n38 9 9\n64 8 88\n74 6 51\n58 4 30\n8 4 6\n72 3 63\n81 4 44\n90 4 1\n91 3 62\n19 4 53\n2 9 78\n70 0 84\n89 1 74\n66 3 0\n95 5 73\n44 9 94\n18 3 87\n6 5 90\n42 9 45\n17 1 41\n81 6 70\n72 0 42\n45 8 43\n16 5 31\n61 5 69\n87 3 6\n80 7 33"
  },
  {
    "path": "test/test_data/train_edges_weights.txt",
    "content": "80 6 73 0\n83 8 2 1\n50 8 66 2\n64 5 42 3\n31 5 91 4\n40 8 92 5\n18 2 32 6\n21 5 64 7\n47 8 19 8\n71 2 71 9\n12 5 11 10\n76 6 58 11\n12 6 24 12\n69 9 11 13\n12 3 55 14\n77 4 14 15\n12 8 8 16\n29 5 14 17\n46 8 8 18\n30 0 60 19\n46 6 7 20\n51 6 69 21\n0 2 52 22\n81 9 26 23\n50 0 78 24\n59 9 93 25\n62 5 12 26\n93 0 14 27\n72 7 31 28\n46 2 12 29\n44 3 67 30\n45 1 46 31\n0 2 56 32\n68 7 49 33\n51 2 21 34\n66 9 99 35\n93 2 74 36\n59 9 9 37\n12 6 3 38\n26 6 11 39\n8 7 13 40\n46 8 70 41\n50 8 2 42\n10 8 5 43\n20 1 3 44\n43 3 46 45\n51 5 70 46\n73 4 74 47\n95 7 50 48\n59 8 12 49\n46 4 99 50\n20 3 55 51\n39 3 24 52\n28 8 8 53\n31 5 22 54\n84 3 95 55\n48 3 50 56\n81 4 10 57\n66 7 4 58\n15 2 78 59\n68 6 23 60\n55 0 0 61\n58 0 48 62\n75 4 50 63\n9 4 20 64\n48 9 87 65\n97 3 94 66\n44 9 83 67\n87 8 37 68\n74 6 33 69\n10 9 8 70\n81 3 18 71\n42 0 7 72\n74 3 37 73\n37 7 33 74\n35 7 47 75\n19 4 8 76\n19 4 78 77\n87 4 2 78\n39 2 21 79\n79 8 74 80\n21 1 24 81\n25 5 33 82\n24 2 33 83\n12 6 98 84\n47 8 6 85\n30 2 94 86\n61 1 78 87\n80 0 83 88\n91 1 97 89\n24 6 5 90\n32 8 82 91\n40 7 34 92\n68 6 98 93\n76 8 19 94\n90 3 40 95\n90 1 77 96\n11 4 49 97\n10 3 82 98\n39 9 2 99\n15 0 85 100\n85 3 81 101\n67 2 80 102\n0 8 58 103\n77 9 48 104\n93 6 20 105\n67 4 62 106\n51 9 36 107\n74 3 76 108\n7 4 94 109\n23 9 46 110\n2 5 32 111\n48 7 49 112\n35 6 19 113\n52 2 33 114\n31 1 2 115\n54 3 26 116\n63 3 85 117\n40 1 43 118\n57 7 51 119\n74 3 59 120\n11 1 82 121\n13 9 23 122\n70 5 83 123\n6 2 25 124\n86 7 59 125\n71 0 62 126\n77 0 82 127\n63 8 88 128\n4 7 10 129\n36 6 73 130\n77 5 58 131\n6 2 4 132\n89 8 84 133\n8 8 80 134\n27 3 32 135\n22 1 96 136\n58 0 45 137\n62 8 19 138\n10 4 67 139\n5 7 21 140\n18 7 3 141\n59 0 96 142\n17 6 49 143\n82 3 39 144\n41 2 24 145\n43 0 22 146\n4 5 79 147\n76 7 29 148\n13 7 3 149\n2 9 52 150\n65 9 37 151\n46 1 65 152\n72 0 67 153\n42 8 83 154\n92 3 72 155\n46 4 97 156\n7 1 35 157\n10 5 23 158\n39 4 28 159\n78 3 7 160\n23 0 94 161\n86 1 22 162\n13 6 47 
163\n15 4 8 164\n63 4 73 165\n63 7 54 166\n51 8 22 167\n74 7 90 168\n55 9 68 169\n55 8 89 170\n95 4 86 171\n70 8 34 172\n11 1 42 173\n74 8 32 174\n90 9 33 175\n25 8 65 176\n60 1 59 177\n34 9 45 178\n59 8 53 179\n1 2 75 180\n8 5 63 181\n79 9 30 182\n21 9 32 183\n4 5 2 184\n40 1 94 185\n3 2 20 186\n20 5 11 187\n52 9 77 188\n60 4 38 189\n22 8 68 190\n64 4 26 191\n44 7 32 192\n82 7 62 193\n58 8 55 194\n7 3 18 195\n15 6 53 196\n21 6 62 197\n99 0 22 198\n37 1 51 199\n1 6 46 200\n68 5 78 201\n34 0 92 202\n9 4 41 203\n8 5 46 204\n43 1 87 205\n96 5 78 206\n84 7 43 207\n72 3 60 208\n59 7 57 209\n28 0 83 210\n93 5 34 211\n78 2 36 212\n15 2 89 213\n68 3 71 214\n51 1 26 215\n67 2 67 216\n68 2 79 217\n85 3 66 218\n68 3 74 219\n21 3 28 220\n25 8 87 221\n82 3 67 222\n36 5 2 223\n38 9 12 224\n30 1 25 225\n89 7 45 226\n31 1 7 227\n22 8 72 228\n30 4 56 229\n14 7 60 230\n26 4 74 231\n74 0 1 232\n42 3 70 233\n91 0 85 234\n74 5 87 235\n83 0 0 236\n14 0 33 237\n48 4 18 238\n47 7 3 239\n34 8 74 240\n91 7 3 241\n13 6 56 242\n5 6 19 243\n43 5 80 244\n45 5 68 245\n41 2 29 246\n88 3 83 247\n39 4 42 248\n31 1 4 249\n51 6 13 250\n49 0 59 251\n0 0 37 252\n28 6 41 253\n58 0 94 254\n86 1 86 255\n96 0 22 256\n11 7 91 257\n61 2 5 258\n93 6 55 259\n17 5 63 260\n47 2 17 261\n93 6 42 262\n96 5 4 263\n73 1 35 264\n41 6 46 265\n8 3 69 266\n5 7 9 267\n38 3 27 268\n7 9 61 269\n10 9 75 270\n55 9 37 271\n53 1 18 272\n9 8 19 273\n58 1 56 274\n10 7 90 275\n15 2 13 276\n47 3 45 277\n74 6 60 278\n38 5 40 279\n32 4 30 280\n9 2 74 281\n85 5 37 282\n74 9 13 283\n4 5 37 284\n17 5 20 285\n88 8 11 286\n5 5 70 287\n71 2 74 288\n88 7 4 289\n71 4 89 290\n50 7 50 291\n3 2 77 292\n8 6 83 293\n30 9 74 294\n87 7 3 295\n58 3 32 296\n48 4 1 297\n93 5 99 298\n15 4 48 299\n59 6 18 300\n13 5 14 301\n42 0 4 302\n97 0 55 303\n41 7 7 304\n45 1 70 305\n47 1 49 306\n72 9 73 307\n73 6 18 308\n12 4 57 309\n65 6 2 310\n7 9 52 311\n76 3 78 312\n60 4 70 313\n69 2 17 314\n65 9 25 315\n44 7 7 316\n59 9 15 317\n39 4 7 318\n91 9 26 319\n82 9 
51 320\n70 2 28 321\n29 3 38 322\n52 9 35 323\n22 5 83 324\n5 7 5 325\n61 7 98 326\n12 9 65 327\n44 7 89 328\n62 9 6 329\n87 4 26 330\n66 4 10 331\n84 9 49 332\n68 9 39 333\n56 0 52 334\n26 6 22 335\n42 6 64 336\n61 9 90 337\n78 5 39 338\n71 7 19 339\n1 0 89 340\n87 4 23 341\n23 0 52 342\n94 4 57 343\n7 0 85 344\n98 4 89 345\n87 7 39 346\n94 8 4 347\n45 1 93 348\n99 8 45 349\n21 1 79 350\n65 9 97 351\n85 5 14 352\n45 0 65 353\n41 5 12 354\n58 3 27 355\n88 4 86 356\n13 1 8 357\n71 4 39 358\n66 2 22 359\n89 6 53 360\n13 7 66 361\n61 5 91 362\n99 0 73 363\n76 3 3 364\n7 3 51 365\n61 6 93 366\n63 0 13 367\n33 7 96 368\n6 2 69 369\n68 2 65 370\n76 4 9 371\n66 0 37 372\n4 4 63 373\n76 2 26 374\n28 5 63 375\n92 9 82 376\n1 1 49 377\n43 5 20 378\n34 0 18 379\n38 7 2 380\n74 3 72 381\n71 5 76 382\n53 8 58 383\n61 7 45 384\n57 9 55 385\n79 7 87 386\n55 5 95 387\n10 1 54 388\n83 1 32 389\n74 6 61 390\n50 1 1 391\n89 5 87 392\n54 7 40 393\n83 7 48 394\n20 1 76 395\n57 2 80 396\n18 7 54 397\n56 2 13 398\n9 4 15 399\n76 7 48 400\n20 8 29 401\n34 3 95 402\n80 0 85 403\n79 4 17 404\n94 2 23 405\n46 2 94 406\n13 8 70 407\n31 2 28 408\n63 8 49 409\n83 2 97 410\n51 6 28 411\n64 0 5 412\n19 9 52 413\n69 0 27 414\n80 7 4 415\n39 9 81 416\n98 9 82 417\n28 9 81 418\n73 9 58 419\n68 7 40 420\n72 4 48 421\n9 2 65 422\n34 3 35 423\n62 0 3 424\n73 8 54 425\n13 2 38 426\n50 0 29 427\n81 2 96 428\n48 3 4 429\n58 5 97 430\n22 1 91 431\n41 7 14 432\n47 1 0 433\n44 8 58 434\n77 6 92 435\n65 6 73 436\n8 8 61 437\n74 0 2 438\n21 0 83 439\n80 9 92 440\n53 0 34 441\n85 8 55 442\n53 3 83 443\n32 6 33 444\n52 3 14 445\n34 1 14 446\n45 0 55 447\n93 5 79 448\n33 9 65 449\n79 7 27 450\n5 9 4 451\n99 7 26 452\n26 2 78 453\n36 4 9 454\n56 6 92 455\n82 7 21 456\n82 9 46 457\n99 2 90 458\n57 6 25 459\n97 4 4 460\n66 7 53 461\n79 3 23 462\n56 5 16 463\n23 8 88 464\n61 9 36 465\n27 1 51 466\n7 1 93 467\n27 7 38 468\n15 1 60 469\n83 1 5 470\n58 2 6 471\n14 4 95 472\n33 3 90 473\n45 8 88 474\n96 5 24 475\n42 5 94 
476\n46 6 80 477\n31 2 65 478\n59 6 4 479\n16 4 13 480\n10 2 41 481\n81 3 73 482\n83 0 68 483\n11 0 26 484\n52 2 11 485\n75 3 81 486\n89 5 29 487\n75 9 66 488\n87 4 15 489\n73 3 10 490\n4 9 67 491\n76 2 35 492\n15 0 43 493\n37 5 93 494\n37 2 55 495\n61 4 12 496\n2 2 81 497\n4 0 69 498\n1 8 95 499\n7 4 72 500\n9 1 16 501\n25 8 88 502\n8 2 74 503\n65 3 30 504\n83 3 67 505\n42 4 1 506\n36 3 30 507\n19 1 23 508\n76 5 90 509\n83 8 13 510\n31 6 79 511\n87 6 36 512\n7 1 74 513\n0 6 69 514\n30 1 52 515\n57 0 89 516\n0 2 62 517\n55 8 25 518\n28 8 13 519\n50 9 20 520\n44 1 33 521\n48 2 77 522\n93 5 56 523\n29 6 97 524\n93 3 21 525\n4 2 94 526\n26 7 43 527\n20 0 28 528\n76 6 63 529\n15 5 66 530\n59 1 60 531\n29 4 7 532\n41 7 27 533\n40 4 97 534\n10 2 43 535\n44 6 76 536\n73 9 38 537\n88 4 89 538\n44 9 21 539\n73 9 17 540\n8 5 21 541\n9 0 85 542\n84 0 48 543\n36 3 89 544\n58 2 25 545\n27 5 5 546\n13 1 90 547\n50 3 51 548\n3 8 41 549\n79 3 69 550\n73 5 75 551\n71 6 32 552\n95 4 65 553\n65 0 98 554\n12 1 46 555\n93 8 60 556\n81 7 95 557\n48 5 30 558\n8 8 14 559\n83 1 47 560\n38 8 37 561\n58 7 12 562\n52 1 89 563\n86 0 0 564\n36 1 69 565\n20 0 56 566\n71 3 2 567\n94 6 92 568\n20 7 14 569\n53 2 1 570\n50 2 77 571\n91 6 57 572\n28 1 15 573\n26 9 97 574\n52 5 73 575\n19 7 32 576\n5 7 63 577\n27 7 73 578\n5 7 13 579\n48 9 89 580\n13 5 84 581\n48 8 11 582\n12 5 66 583\n13 8 39 584\n10 5 35 585\n30 0 79 586\n41 8 79 587\n72 9 70 588\n82 2 93 589\n49 9 5 590\n85 7 48 591\n95 4 22 592\n58 6 7 593\n45 5 87 594\n81 8 46 595\n69 7 99 596\n34 0 29 597\n57 3 57 598\n65 0 84 599\n29 3 78 600\n12 4 10 601\n93 7 5 602\n74 9 99 603\n53 0 77 604\n26 3 87 605\n62 0 99 606\n12 3 73 607\n58 3 92 608\n42 7 46 609\n98 7 15 610\n33 5 82 611\n51 3 66 612\n39 0 18 613\n23 0 14 614\n64 8 22 615\n31 9 42 616\n96 0 91 617\n73 0 21 618\n69 5 15 619\n46 7 47 620\n82 6 87 621\n96 3 79 622\n1 8 69 623\n31 7 5 624\n16 3 90 625\n45 7 94 626\n58 2 82 627\n51 0 44 628\n43 7 34 629\n2 3 26 630\n99 1 48 631\n17 8 45 
632\n37 1 38 633\n12 5 81 634\n79 9 35 635\n69 3 76 636\n13 8 21 637\n8 5 67 638\n41 5 30 639\n74 2 53 640\n56 9 70 641\n86 6 8 642\n47 8 44 643\n46 9 82 644\n0 4 14 645\n80 1 47 646\n20 8 18 647\n83 2 22 648\n75 9 82 649\n71 8 55 650\n0 5 46 651\n93 7 11 652\n65 3 22 653\n26 8 88 654\n4 8 18 655\n23 5 6 656\n32 6 22 657\n26 3 94 658\n40 2 16 659\n4 0 77 660\n82 2 71 661\n2 8 74 662\n90 0 9 663\n92 4 98 664\n48 8 44 665\n47 2 53 666\n58 9 2 667\n97 9 12 668\n5 5 67 669\n24 9 56 670\n99 2 85 671\n19 1 14 672\n88 2 47 673\n95 2 49 674\n14 6 57 675\n56 7 94 676\n84 5 31 677\n5 6 96 678\n94 0 0 679\n33 0 38 680\n24 0 83 681\n77 5 62 682\n73 2 28 683\n53 4 21 684\n4 0 46 685\n30 5 34 686\n9 6 4 687\n11 3 31 688\n1 1 3 689\n86 5 42 690\n31 1 13 691\n73 4 13 692\n36 9 13 693\n27 4 2 694\n5 2 48 695\n60 9 19 696\n96 2 52 697\n69 9 96 698\n17 2 2 699\n73 8 67 700\n71 9 58 701\n31 1 54 702\n38 5 82 703\n3 0 67 704\n69 3 25 705\n50 6 98 706\n93 9 4 707\n48 7 47 708\n19 3 13 709\n40 5 77 710\n21 2 42 711\n42 1 23 712\n14 3 29 713\n42 4 38 714\n76 0 34 715\n85 6 0 716\n91 1 79 717\n75 8 58 718\n60 1 44 719\n29 2 4 720\n88 0 37 721\n53 8 28 722\n88 8 10 723\n54 6 24 724\n25 6 56 725\n26 8 79 726\n76 2 87 727\n36 9 84 728\n38 3 68 729\n84 7 50 730\n60 6 84 731\n60 3 24 732\n86 3 49 733\n52 7 56 734\n59 1 77 735\n26 4 19 736\n92 8 94 737\n18 3 6 738\n40 2 56 739\n38 2 49 740\n60 6 11 741\n35 9 30 742\n4 9 17 743\n24 5 51 744\n33 5 2 745\n3 7 82 746\n99 8 57 747\n61 9 28 748\n11 7 28 749\n31 6 73 750\n67 4 68 751\n43 5 56 752\n49 6 57 753\n78 2 87 754\n94 6 93 755\n85 2 47 756\n65 1 99 757\n98 1 63 758\n47 3 2 759\n50 8 4 760\n42 5 30 761\n77 0 85 762\n67 9 65 763\n26 3 65 764\n59 1 24 765\n36 0 76 766\n68 3 95 767\n34 6 96 768\n61 5 7 769\n44 0 59 770\n30 7 15 771\n81 2 14 772\n78 4 30 773\n20 3 65 774\n85 6 42 775\n41 7 43 776\n51 2 6 777\n26 7 25 778\n92 5 49 779\n90 0 61 780\n11 8 15 781\n77 2 31 782\n30 9 48 783\n88 9 93 784\n90 5 70 785\n57 5 17 786\n18 9 23 787\n56 2 82 
788\n25 7 34 789\n26 1 9 790\n91 9 30 791\n49 8 99 792\n96 8 88 793\n93 2 65 794\n36 8 67 795\n40 5 76 796\n8 2 31 797\n92 4 66 798\n92 4 28 799\n13 2 73 800\n4 1 30 801\n83 4 6 802\n96 0 3 803\n12 9 45 804\n85 5 29 805\n34 0 39 806\n51 7 97 807\n3 9 85 808\n19 5 73 809\n92 2 38 810\n51 5 83 811\n71 9 79 812\n83 4 60 813\n62 8 77 814\n0 9 32 815\n70 7 95 816\n72 6 0 817\n69 4 95 818\n3 1 43 819\n62 9 20 820\n76 9 85 821\n84 4 79 822\n21 1 3 823\n20 5 83 824\n91 2 22 825\n83 3 21 826\n75 6 25 827\n56 1 74 828\n31 2 30 829\n66 8 3 830\n19 9 37 831\n19 5 11 832\n81 5 93 833\n68 4 38 834\n37 2 39 835\n56 8 97 836\n82 5 58 837\n81 2 65 838\n98 5 40 839\n78 6 53 840\n18 5 45 841\n42 9 29 842\n75 9 93 843\n99 8 14 844\n97 7 35 845\n33 4 41 846\n36 8 85 847\n42 3 54 848\n58 7 50 849\n3 7 53 850\n64 3 80 851\n0 7 23 852\n98 5 30 853\n71 8 86 854\n37 8 11 855\n90 2 12 856\n5 9 41 857\n54 9 58 858\n14 4 96 859\n16 5 97 860\n1 9 15 861\n41 4 9 862\n32 5 17 863\n96 7 71 864\n83 4 61 865\n21 3 81 866\n28 9 31 867\n96 8 39 868\n90 5 46 869\n65 6 63 870\n50 4 7 871\n43 7 21 872\n23 9 76 873\n54 0 47 874\n39 8 11 875\n71 4 90 876\n47 8 99 877\n46 5 71 878\n90 4 57 879\n81 4 89 880\n43 1 90 881\n32 1 72 882\n0 4 70 883\n47 5 34 884\n43 1 28 885\n13 1 69 886\n49 4 9 887\n36 7 38 888\n94 0 24 889\n64 4 11 890\n53 7 12 891\n17 5 12 892\n96 2 69 893\n99 7 75 894\n70 4 85 895\n93 6 64 896\n61 7 2 897\n47 2 50 898\n50 1 58 899\n3 4 18 900\n41 2 31 901\n45 2 49 902\n98 2 83 903\n88 2 40 904\n34 2 59 905\n86 2 99 906\n49 4 28 907\n20 0 24 908\n98 0 0 909\n51 4 78 910\n66 8 50 911\n37 2 77 912\n62 5 53 913\n97 1 20 914\n84 2 15 915\n48 3 95 916\n18 4 17 917\n20 5 9 918\n56 0 24 919\n90 8 64 920\n13 7 5 921\n80 5 19 922\n49 1 33 923\n20 2 12 924\n92 8 4 925\n25 7 28 926\n47 7 24 927\n84 4 61 928\n2 7 84 929\n0 0 25 930\n13 9 62 931\n17 4 4 932\n1 0 96 933\n59 6 6 934\n50 5 76 935\n69 1 60 936\n64 0 82 937\n37 0 96 938\n57 0 77 939\n60 5 89 940\n83 3 1 941\n23 5 86 942\n54 5 87 943\n83 8 76 
944\n12 4 15 945\n13 6 86 946\n89 7 97 947\n12 8 2 948\n26 0 13 949\n64 4 48 950\n3 1 12 951\n86 1 68 952\n78 8 4 953\n96 3 14 954\n64 7 71 955\n51 7 72 956\n66 5 73 957\n86 4 17 958\n1 1 82 959\n91 9 71 960\n50 5 88 961\n60 6 81 962\n57 5 45 963\n30 7 6 964\n50 1 11 965\n84 7 30 966\n66 6 86 967\n39 4 47 968\n29 8 1 969\n82 7 30 970\n82 2 54 971\n35 3 74 972\n38 9 9 973\n64 8 88 974\n74 6 51 975\n58 4 30 976\n8 4 6 977\n72 3 63 978\n81 4 44 979\n90 4 1 980\n91 3 62 981\n19 4 53 982\n2 9 78 983\n70 0 84 984\n89 1 74 985\n66 3 0 986\n95 5 73 987\n44 9 94 988\n18 3 87 989\n6 5 90 990\n42 9 45 991\n17 1 41 992\n81 6 70 993\n72 0 42 994\n45 8 43 995\n16 5 31 996\n61 5 69 997\n87 3 6 998\n80 7 33 999\n"
  },
  {
    "path": "test/test_data/valid_edges.txt",
    "content": "85 3 62\n2 6 94\n10 0 17\n18 8 57\n66 4 38\n41 9 82\n42 8 63\n59 2 36\n62 8 39\n2 1 50\n31 7 12\n61 4 64\n23 9 65\n32 8 69\n7 0 62\n81 2 68\n22 7 35\n17 5 94\n23 8 37\n99 8 77\n40 0 87\n95 9 69\n91 8 29\n35 5 27\n74 3 31\n59 8 29\n92 3 22\n26 4 49\n6 8 36\n89 5 42\n42 3 67\n95 1 61\n11 7 58\n67 2 9\n28 6 73\n6 7 43\n2 3 86\n39 3 77\n7 3 30\n30 5 73\n77 5 20\n8 9 0\n25 0 4\n61 5 11\n80 2 24\n48 9 40\n97 3 42\n70 2 44\n96 2 7\n34 4 75\n83 4 84\n52 9 3\n72 5 6\n62 5 5\n62 3 92\n24 3 95\n5 9 52\n83 8 64\n88 2 39\n4 9 66\n41 7 75\n86 7 46\n54 0 86\n13 2 13\n1 1 7\n72 4 14\n0 9 82\n90 8 39\n76 4 21\n9 9 45\n74 7 94\n44 0 39\n15 5 34\n41 1 58\n61 6 82\n7 3 7\n32 0 48\n28 2 98\n96 4 37\n9 5 89\n40 1 25\n69 3 79\n6 8 73\n75 4 61\n67 4 5\n73 5 33\n79 1 89\n98 4 50\n81 2 66\n40 8 68\n26 1 34\n66 7 90\n78 1 35\n58 2 84\n59 8 1\n79 5 14\n70 3 31\n46 9 55\n37 8 68\n19 7 45"
  },
  {
    "path": "tox.ini",
    "content": "[flake8]\nextend-ignore = I001\n\n[tox]\nenvlist = check_lint\nskipsdist = true\n\n[testenv:clean_build]\nskip_install = true\ncommands =\n    rm -rf docs_build\n    rm -rf docs_html\n    rm -rf build\ndescription = Clean up build directories\nallowlist_externals =\n    /bin/rm\n    /usr/bin/rm\n\n[testenv:build_docs]\ndeps =\n    sphinx_autodoc_typehints\n    sphinx_rtd_theme\n    breathe\n    torch\nskip_install = true\ncommands =\n    mkdir -p docs_build\n    cmake -Bdocs_build -S./ -DBUILD_DOCS=1\n    cmake --build docs_build --target Sphinx\n    rm -rf docs_html\n    mv docs_build/docs/html docs_html\n    rm -rf docs_build\ndescription = Build docs with sphinx\nallowlist_externals =\n    /bin/cd\n    /bin/mv\n    /bin/mkdir\n    /bin/rm\n    /usr/bin/cd\n    /usr/bin/mv\n    /usr/bin/mkdir\n    /usr/bin/rm\n    /usr/bin/cmake\n    /usr/local/bin/cmake\n\n[testenv:autoformat]\ndeps =\n    black==22.6.0\n    isort==5.10.1\nskip_install = true\ncommands =\n    black --extend-exclude third_party --preview .\n    isort src/python/ examples/ test/ setup.py\n    /bin/bash -c 'find src/cpp/include src/cpp/src src/cpp/python_bindings test/cpp -iname *.h -o -iname *.cpp | xargs clang-format -i'\ndescription = Run black, isort, and clang-format autoformatters.\nallowlist_externals =\n    /bin/bash\n    /bin/find\n    /bin/xargs\n    /usr/bin/find\n    /usr/bin/xargs\n    /usr/bin/bash\n    /usr/bin/clang-format\n    /usr/local/bin/clang-format\n    /usr/local/bin/bash\n\n\n[testenv:check_lint]\nskip_install = true\ndeps =\n    black==22.6.0\n    flake8==4.0.1\n    flake8-black==0.3.0\n    flake8-isort==4.1.1\n    pep8-naming==0.13.0\ncommands =\n    flake8\n    /bin/bash -c 'find src/cpp/include src/cpp/src src/cpp/python_bindings test/cpp -iname *.h -o -iname *.cpp | xargs clang-format --dry-run -Werror'\ndescription = Run flake8 and clang-tidy to check formatting\nallowlist_externals =\n    /bin/find\n    /bin/xargs\n    /bin/bash\n    /usr/bin/find\n   
 /usr/bin/xargs\n    /usr/bin/bash\n    /usr/bin/clang-format\n    /usr/local/bin/clang-format\n    /usr/local/bin/bash\n    "
  }
]