[
  {
    "path": ".clang-format",
    "content": "---\n# ---------------------------------------------------------------------------- #\n#                                CLANG-FORMAT                                  #\n#                                   v6.0.x                                     #\n# ---------------------------------------------------------------------------- #\n# For documentation: https://clang.llvm.org/docs/ClangFormatStyleOptions.html\n#\nLanguage:        Cpp\n# BasedOnStyle:  Google\nAccessModifierOffset: -4\nAlignAfterOpenBracket: Align\nAlignConsecutiveAssignments: true\nAlignConsecutiveDeclarations: false\nAlignEscapedNewlinesLeft: true\nAlignOperands:   true\nAlignTrailingComments: true\nAllowAllParametersOfDeclarationOnNextLine: true\nAllowShortBlocksOnASingleLine: false\nAllowShortCaseLabelsOnASingleLine: false\nAllowShortFunctionsOnASingleLine: All\nAllowShortIfStatementsOnASingleLine: false\nAllowShortLoopsOnASingleLine: false\nAlwaysBreakAfterDefinitionReturnType: None\nAlwaysBreakAfterReturnType: None\nAlwaysBreakBeforeMultilineStrings: true\nAlwaysBreakTemplateDeclarations: true\nBinPackArguments: true\nBinPackParameters: true\nBraceWrapping:\n  AfterClass:      false\n  AfterControlStatement: false\n  AfterEnum:       false\n  AfterFunction:   false\n  AfterNamespace:  false\n  AfterObjCDeclaration: false\n  AfterStruct:     false\n  AfterUnion:      false\n  BeforeCatch:     false\n  BeforeElse:      false\n  IndentBraces:    false\nBreakBeforeBinaryOperators: None\nBreakBeforeBraces: Attach\nBreakBeforeTernaryOperators: true\nBreakConstructorInitializersBeforeComma: false\nColumnLimit:     120\nCommentPragmas:  '^ IWYU pragma:'\nConstructorInitializerAllOnOneLineOrOnePerLine: true\nConstructorInitializerIndentWidth: 4\nContinuationIndentWidth: 4\nCpp11BracedListStyle: true\nDerivePointerAlignment: true\nDisableFormat:   false\nExperimentalAutoDetectBinPacking: false\nForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]\nIncludeCategories:\n  - Regex:           
'^<.*\\.h>'\n    Priority:        1\n  - Regex:           '^<.*'\n    Priority:        2\n  - Regex:           '.*'\n    Priority:        3\nIndentCaseLabels: true\nIndentWidth:     4\nIndentWrappedFunctionNames: false\nKeepEmptyLinesAtTheStartOfBlocks: false\nMacroBlockBegin: ''\nMacroBlockEnd:   ''\nMaxEmptyLinesToKeep: 1\nNamespaceIndentation: None\nObjCBlockIndentWidth: 2\nObjCSpaceAfterProperty: false\nObjCSpaceBeforeProtocolList: false\nPenaltyBreakBeforeFirstCallParameter: 1\nPenaltyBreakComment: 300\nPenaltyBreakFirstLessLess: 120\nPenaltyBreakString: 1000\nPenaltyExcessCharacter: 1000000\nPenaltyReturnTypeOnItsOwnLine: 200\nPointerAlignment: Left\nReflowComments:  true\nSortIncludes:    true\nSpaceAfterCStyleCast: true\nSpaceBeforeAssignmentOperators: true\nSpaceBeforeParens: ControlStatements\nSpaceInEmptyParentheses: false\nSpacesBeforeTrailingComments: 2\nSpacesInAngles:  false\nSpacesInContainerLiterals: true\nSpacesInCStyleCastParentheses: false\nSpacesInParentheses: false\nSpacesInSquareBrackets: false\nStandard:        Cpp11\nTabWidth:        8\nUseTab:          Never\n...\n"
  },
  {
    "path": ".gitignore",
    "content": "# Created by https://www.gitignore.io/api/vim,osx,c++\n/data\n### C++ ###\n# Prerequisites\n*.d\n\n# Compiled Object files\n*.slo\n*.lo\n*.o\n*.obj\n\n# Precompiled Headers\n*.gch\n*.pch\n\n# Compiled Dynamic libraries\n*.so\n*.dylib\n*.dll\n\n# Fortran module files\n*.mod\n*.smod\n\n# Compiled Static libraries\n*.lai\n*.la\n*.a\n*.lib\n\n# Executables\n*.exe\n*.out\n*.app\n\n### OSX ###\n*.DS_Store\n.AppleDouble\n.LSOverride\n\n# Icon must end with two \\r\nIcon\n\n# Thumbnails\n._*\n\n# Files that might appear in the root of a volume\n.DocumentRevisions-V100\n.fseventsd\n.Spotlight-V100\n.TemporaryItems\n.Trashes\n.VolumeIcon.icns\n.com.apple.timemachine.donotpresent\n\n# Directories potentially created on remote AFP share\n.AppleDB\n.AppleDesktop\nNetwork Trash Folder\nTemporary Items\n.apdisk\n\n### Vim ###\n# swap\n[._]*.s[a-v][a-z]\n[._]*.sw[a-p]\n[._]s[a-v][a-z]\n[._]sw[a-p]\n# session\nSession.vim\n# temporary\n.netrwhist\n*~\n# auto-generated tag files\ntags\n\n# End of https://www.gitignore.io/api/vim,osx,c++\n\n# Ignore build directory\nbuild\nfiles\n\n# ignore remote sync config file\n.remote-sync.json\n\n# Ignore frames\nframes\n\n# ignore helper MATLAB script to display point clouds\nshow_pcl.m\n\n# ignore output folder\nout"
  },
  {
    "path": ".gitlab-ci.yml",
    "content": "# ---------------------------------------------------------------------------- #\n#                               GITLAB-RUNNER-CI                               #\n# ---------------------------------------------------------------------------- #\n\n# Version: v9.5.4\n# Running on graphic01.doc.ic.ac.uk\n\n# ---------------------------------------------------------------------------- #\n#                                  PIPELINE                                    #\n# ---------------------------------------------------------------------------- #\n# The CI pipeline is divided into the following stages:\n# Lint: Performs linter checks on the repo's code\n# Build: Builds the project\n# Test: Runs the project's tests\n# Deploy: Deploys the project to a given directory\n# Cleanup: Removes all artifacts required for the CI stages\nstages:\n  - Setup\n  - Lint\n  - Build\n  - Test\n  - Deploy\n  - Cleanup\n\n# env_setup defines a number of environment variables which can be called by\n# various before_scripts of the project.\n# 1) Exporting the PATH is necessary to enable linking of the custom installed\n#    binaries in the group's shared directory.\n# 2) setup.sh makes sure all Cuda related binaries are setup correctly.\n#    This includes, but isn't limited to, CUDA itself and OpenGL.\n.env_setup: &env_setup |\n  source setup.sh --set-paths\n\n# Variables for use in script\nvariables:\n  CONTAINER_IMAGE: g1736211/dynfu\n  DEV_IMAGE: g1736211/dynfu:dev\n  TAG_IMAGE: g1736211/dynfu:${CI_COMMIT_REF_SLUG}\n# ---------------------------------------------------------------------------- #\n#                                     SETUP                                    #\n# ---------------------------------------------------------------------------- #\n# The Setup stage sets all the nevironment variables and paths using cmake.\n\n# > cmake <\n# Cmake will generate a Makefile for the project linking all needed dependencies\n# Cmake stage is required to not fail, 
in which case it would fail the pipeline.\ncmake:\n  stage: Setup\n\n  # Before attempting to cmake env variables must be set to link the binaries\n  # Call .setup.sh which creates build and setup the dependencies\n  # Cmake will also build inside the build/ directory\n  before_script:\n    - rm -rf build || true\n    - source setup.sh --set-paths\n\n  # Build/ directory is cached for a given commit SHA to make sure all the tasks\n  # within a same pipeline will have access to the correctly built project\n  cache:\n    key: \"$CI_COMMIT_SHA\"\n    paths:\n      - build/\n\n  # Cmake requires two additional flags:\n  # 1) CMAKE_PREFIX enables it to find the custom built binaries\n  # 2) CMAKE_CXX_FLAGS are necessary to be set as otherwise BASH doesn't seem to\n  #    set the c++ standard correctly (i.e. will throw a number of errors)\n  # 3) [Others]\n  script:\n    - source setup.sh --cmake -i\n\n# ---------------------------------------------------------------------------- #\n#                                     LINT                                     #\n# ---------------------------------------------------------------------------- #\n# The Lint stage runs the linter on the project's files.\n\n# > clang_format <\n# Clang formatting will ensure that the latest files have been formatted with\n# clang-styling. 
This step does not fix linter errors, it will just point them\n# out to the pipeline, failing it.\nclang_format:\n  stage: Lint\n\n  # Set env variables before testing should we need to access CUDA of OpenGL\n  # headers.\n  before_script:\n    - *env_setup\n\n  # Files are considered to be source only if *.cpp or *.hpp.\n  # Excluded files or directories must be set manually in .lintignore\n  script:\n     # Print out the configuration file variables\n     - /vol/project/2017/362/g1736211/bin/clang-format -dump-config\n     # Attempt to check if clang-format had been run before saving\n     - ./scripts/.format_bash\n\n# > lint <\n# The linter used for the project is clang-tidy.\n# 1) Linting stage must run after build due to the numerous dependencies created\n#    by Cmake. All these dependencies are logged into compile_commands.json and\n#    are referenced by clang-tidy when linting (i.e. to scan headers).\n# 2) The run-clang-tidy python script must be linked to the relevant clang-tidy\n#    executable as it cannot be found in it's canonical place (i.e /usr/bin).\n# 3) run-clang-tidy.py will accept all clang-tidy flags, it has extra value as\n#    it can use the compile_commands.json information.\n#\n# USED FLAGS:\n# 1) -list-checks : Lists all the enabled checks.\n# 2) -checks= : Sets different checkstyles for the source files\n# 3) -p <DIR> : Sets the path for the build directory containing the\n#               compile_commands.json information.\n# 4) -warnings-as-errors=* : Treats all warnings as errors.\n#\n# Used checks can be found in .lintflags\nclang_tidy:\n  stage: Lint\n\n  # Set env variables before testing should we need to access CUDA of OpenGL\n  # headers.\n  before_script:\n    - *env_setup\n\n  # Linting must occur on the newly built build/ binaries.\n  cache:\n    key: \"$CI_COMMIT_SHA\"\n    paths:\n      - build/\n    policy: pull\n\n  # Files are considered to be source only if *.cpp or *.hpp.\n  # Excluded files or directories must be set 
manually in .lintignore\n  script:\n     # Print out the list of enabled checks\n     - /vol/project/2017/362/g1736211/bin/clang-tidy `./scripts/.flags_bash` -list-checks\n     # Print out the list of files which will be checked for linting\n     - ./scripts/.find_bash\n     # Perform the linting with the above configuratiions\n     - /vol/project/2017/362/g1736211/bin/clang-tidy -p build `./scripts/.diff_tidy_bash` `./scripts/.flags_bash` `./scripts/.find_bash`\n\n# ---------------------------------------------------------------------------- #\n#                                    BUILD                                     #\n# ---------------------------------------------------------------------------- #\n# The build project actually generates the binaries for the project.\n# According to the flags set during the CMake stage, tests are built as well in\n# their respective directories.\n\n# > make <\n# The make stage will run the Makefile generated by cmake in the build/ dir.\n# The make stage is required not to fail.\nmake:\n  stage: Build\n\n  # Before attempting to make env variables must be set to make sure all\n  # binaries referenced by the Makefile can be accessed.\n  # Make will also build the executables inside the build/ directory\n  before_script:\n    - *env_setup\n\n  # Build/ directory is cached for a given commit SHA to make sure all the tasks\n  # within a same pipeline will have access to the correctly built project\n  cache:\n    key: \"$CI_COMMIT_SHA\"\n    paths:\n      - build/\n\n  # Run simple make from the Makefile on four processes.\n  script:\n    - source setup.sh --make\n\n# > docker_image_build <\n# The docker_image_build stage creates the Docker image for an EC2 instance \n# on a docker-compatible runner\ndocker_image_build:\n  stage: Build\n  tags:\n    - docker\n  script:\n    - docker build --build-arg CUDA_GENERATION=Maxwell -t $TAG_IMAGE .\n  except:\n    - master\n\n# > docker_image_build_master <\n# The docker_image_build_master 
stage creates the Docker image for all architectures\n# on a docker-compatible runner\ndocker_image_build_master:\n  stage: Build\n  tags:\n    - docker\n  script:\n    - docker build --build-arg CUDA_GENERATION=Auto -t $TAG_IMAGE .\n  only:\n    - master\n\n# ---------------------------------------------------------------------------- #\n#                                    TEST                                      #\n# ---------------------------------------------------------------------------- #\n# The Test stage will perform a number of tests on the Project.\n\n# > gtest <\n# The gtest stage runs the GTest suite present within the library.\n# GTest binaries can be found in the group shared directory.\ngtest:\n  stage: Test\n\n  # Set env variables before testing should we need to access CUDA of OpenGL\n  # headers.\n  before_script:\n    - *env_setup\n\n  # Testing must occur on the newly built build/ binaries.\n  cache:\n    key: \"$CI_COMMIT_SHA\"\n    paths:\n      - build/\n    policy: pull\n\n  # Run the gtests\n  script:\n    - ./build/bin/dynfu_test\n\n# > coverage <\n# The coverage stage runs gcov (and on top of that lcov) to generate a coverage\n# report of the test for the source files.\ncoverage:\n  stage: Test\n\n  # Set env variables before testing should we need to access CUDA of OpenGL\n  # headers.\n  before_script:\n    - *env_setup\n\n  # Coverage report is constructed for the newly build binaries\n  cache:\n    key: \"$CI_COMMIT_SHA\"\n    paths:\n      - build/\n    policy: pull\n\n  # Generate the coverage report\n  script:\n    - source setup.sh --coverage\n\n# ---------------------------------------------------------------------------- #\n#                                   DEPLOY                                     #\n# ---------------------------------------------------------------------------- #\n# The Deploy stage deploys newly built and tested binaries to a given location\n# on disk.\n\n# > deploy <\n# The deploy stage will deploy 
the newly built project binaries\n# to /data/dynfu.\ndeploy:\n  stage: Deploy\n\n  # The latest build build/ must be retrieved from cache\n  cache:\n    key: \"$CI_COMMIT_SHA\"\n    paths:\n      - build/\n    policy: pull\n\n  # Clean the output distributed library directory to clone new data into it\n  before_script:\n    - rm -rf /data/dynfu\n    - mkdir /data/dynfu\n\n  # Copy the latest build BINARIES to /data/\n  script:\n    - cp -R build/bin/* /data/dynfu\n\n# > docker_image__deploy_dev <\n# Deploys to Docker Hub\ndocker_image_deploy_dev:\n  stage: Deploy\n  tags:\n    - docker\n  script:\n    - docker login -u $DOCKER_HUB_LOGIN -p $DOCKER_HUB_PASSWORD\n    - docker tag $TAG_IMAGE $DEV_IMAGE\n    - docker push $DEV_IMAGE\n  only:\n    - dev\n\n# > docker_image_deploy <\n# Deploys to Docker Hub\ndocker_image_deploy:\n  stage: Deploy\n  tags:\n    - docker\n  script:\n    - docker login -u $DOCKER_HUB_LOGIN -p $DOCKER_HUB_PASSWORD\n    - docker tag $TAG_IMAGE $CONTAINER_IMAGE\n    - docker push $CONTAINER_IMAGE\n  only:\n    - master\n\n# > test_dynfu_on_ec2 <\n# Trigger testing on ec2 instance\ntest_dynfu_on_ec2:\n  stage: Deploy\n  tags:\n    - docker\n  before_script:\n    - pip install awscli\n    - export PATH=\"${PATH}:~/.local/bin\"\n  script:\n    - AWS_ACCESS_KEY_ID=${UPDATE_LAMBDA_KEY} AWS_SECRET_ACCESS_KEY=${UPDATE_LAMBDA_PASS} AWS_DEFAULT_REGION=eu-west-1 aws lambda invoke --function-name dynamic-fusion-trigger-testing /tmp/lambda-out\n  only:\n    - dev\n# ---------------------------------------------------------------------------- #\n#                                   CLEANUP                                    #\n# ---------------------------------------------------------------------------- #\n# The Cleanup stage makes sure the cache storage is cleaned for a given\n# pipeline in the runners directories.\n\n# > cache_cleanup <\n# It runs regardless of failure or success, and thus has the aim of cleaning\n# up the testing environment. 
In particular this script must remove unused\n# cache artifacts that were stored within one pipeline.\ncache_cleanup:\n  stage: Cleanup\n  script:\n    - echo \"Present Cache:\" && ls /home/gitlab-runner/cache/g1736211/dynfu/\n    - rm -rf \"/home/gitlab-runner/cache/g1736211/dynfu/$CI_COMMIT_SHA\"\n  when: always\n"
  },
  {
    "path": ".lintflags",
    "content": "# ---------------------------------------------------------------------------- #\n#                                   FILE FORMAT                                #\n# ---------------------------------------------------------------------------- #\n# Available checks can be found at http://clang.llvm.org/extra/clang-tidy/     #\n# To add a new check add it normally                                           #\n# To PREVENT the linter from using a check, prefix with '-'                    #\n# Extra links: https://clang.llvm.org/extra/clang-tidy/checks/list.html        #\n# Clang-tidy docs: https://clang.llvm.org/docs/ClangFormat.html                #\n# ---------------------------------------------------------------------------- #\n\n# Enforce google standard\ngoogle-*\n\n# Use High Integrity C++ Coding Standard\nhicpp-*\n\n# Use LLVM Coding Standard\nllvm-*\n\n# Use all miscellaneous flags\nmisc-*\n\n# Apply readability constraints\nreadability-*\n\n# Apply bugprone checks\nbugprone-*\n\n# ---------------------------------------------------------------------------- #\n# The following are check DISABLES for certain behaviours\n\n# Allow un-use of use auto\n-hicpp-use-auto\n\n# Allow string comparison .compare and not only via == or !=\n-misc-string-compare\n\n# Allow warnings for signed bitwise operators\n-hicpp-signed-bitwise\n\n# Allow printfs calls in C style\n-hicpp-vararg\n\n# Allow implicit bool casts and conversions\n-readability-implicit-bool-cast\n-readability-implicit-bool-conversion\n\n# Prevent conflicting standards\n-llvm-include-order\n\n# Deprecated checks\n-hicpp-special-member-functions\n-hicpp-use-equals-delete\n-hicpp-no-array-decay\n-readability-delete-null-pointer\n-hicpp-member-init\n"
  },
  {
    "path": ".lintignore",
    "content": "# Ignore the test directory when linting\n./tests/*\n\n# Ignore all git files when linting\n./.git/*\n\n# Ignore all scripts\n./scripts/*\n\n# Ignore the cmake directory\n./cmake/*\n\n# Ignore all contetnts of build directory\n./build/*\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "# ---------------------------------------------------------------------------- #\n# CMAKE INIT CONFIGURATIONS\n# ---------------------------------------------------------------------------- #\ncmake_minimum_required(VERSION 2.8.8)\n\n# ---------------------------------------------------------------------------- #\n# SET THE PROJECT\n# ---------------------------------------------------------------------------- #\nproject(kfusion C CXX)\n\n# ---------------------------------------------------------------------------- #\n# UTILITY FUNCTIONS (MACROS) AND PATHS\n# ---------------------------------------------------------------------------- #\nLIST(APPEND CMAKE_MODULE_PATH \"${CMAKE_SOURCE_DIR}/cmake/\")\ninclude(Utils)\ninclude(Targets)\n\n# ---------------------------------------------------------------------------- #\n# CMAKE SETTINGS\n# ---------------------------------------------------------------------------- #\n# Use both Debug and Release versions\nSET(CMAKE_CONFIGURATION_TYPES \"Debug;Release\")\n\n# Set CXX to compile with C++11 standard\nSET(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} -std=c++11 -MP -MD -pthread -fpermissive\")\n\n# ---------------------------------------------------------------------------- #\n# DEPENDENCIES\n# ---------------------------------------------------------------------------- #\nfind_package(Boost 1.58.0 REQUIRED COMPONENTS filesystem program_options)\nfind_package(CUDA REQUIRED)\nfind_package(OpenCV REQUIRED COMPONENTS core viz highgui calib3d)\nfind_package(PCL 1.8 REQUIRED)\nfind_package(VTK REQUIRED)\n\n# ---------------------------------------------------------------------------- #\n# CUDA SETTINGS\n# ---------------------------------------------------------------------------- #\nSET(HAVE_CUDA 1)\nLIST(APPEND CUDA_NVCC_FLAGS\n   \"-gencode;arch=compute_61,code=sm_61;\n    --ftz=true;\n    --prec-div=false;\n    --prec-sqrt=false;\n    --expt-relaxed-constexpr\"\n)\n\nif(UNIX OR APPLE)\n  LIST(APPEND CUDA_NVCC_FLAGS 
\"-Xcompiler;-fPIC\")\nendif()\n\n# ---------------------------------------------------------------------------- #\n# SET INCLUDE PATHS\n# ---------------------------------------------------------------------------- #\ninclude_directories(include\n  ${Boost_INCLUDE_DIRS}\n  ${CUDA_INCLUDE_DIRS}\n  ${OpenCV_INCLUDE_DIRS}\n  ${PCL_INCLUDE_DIRS}\n  ${VTK_INCLUDE_DIRS}\n)\n\n# ---------------------------------------------------------------------------- #\n# BUILD SRC DIRECTORY\n# ---------------------------------------------------------------------------- #\nadd_subdirectory(src)\n\n# ---------------------------------------------------------------------------- #\n# BUILD TESTS\n# ---------------------------------------------------------------------------- #\nif(BUILD_TESTS)\n  enable_testing()\n  add_subdirectory(test)\nendif()\n"
  },
  {
    "path": "LICENSE.md",
    "content": "Copyright (c) 2012, Anatoly Baksheev\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above copyright notice, this\n  list of conditions and the following disclaimer.\n\n* Redistributions in binary form must reproduce the above copyright notice,\n  this list of conditions and the following disclaimer in the documentation\n  and/or other materials provided with the distribution.\n\n* Neither the name of the {organization} nor the names of its\n  contributors may be used to endorse or promote products derived from\n  this software without specific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
  },
  {
    "path": "README.md",
    "content": "sobfu\n============\nsoftware for 3d reconstruction of non-rigidly deforming scenes using depth data, based on sobolevfusion\n\ndependencies\n-----------\n* boost\n* nvidia gpu with cuda >8.0\n* opencv\n* pcl 1.8.1\n* vtk\n\ninstallation\n------------\nto install, run `source setup.sh -a`\n\nusage\n------------\n`./build/bin/app /path/to/data /path/to/params <--enable-viz> <--enable-log> <--verbose>`\n\n* the data directory must include folders `color` and `depth`\n* the folder `params` has files with recommended parameters for some of the scenes from volumedeform and killingfusion datasets\n\n### options\n* with `--enable-viz`, screenshots from the pcl viewer will be logged in .png format to `/path/to/data/screenshots`\n* with `--enable-log`, meshes computed from the model tsdf's via marching cubes will be logged in .vtk format to `/path/to/data/meshes`\n* `--verbose` and `--vverbose` control verbosity of the solver\n\nexample reconstructions\n------------\nreconstructions of some of the scenes from the volumedeform and killingfusion projects can be viewed on our [youtube playlist](http://bit.ly/sobfu-yt)\n\nthere is a large trade-off between frame rate and reconstruction quality--sample reconstructions ran at approx. 2fps\n\nissues\n------------\n\n* drift in longer scenes due to less than perfect registration\n* topological changes\n\nreferences \n------------\n```\n@InProceedings{Slavcheva_2018_CVPR,\nauthor = {Slavcheva, Miroslava and Baust, Maximilian and Ilic, Slobodan},\ntitle = {SobolevFusion: 3D Reconstruction of Scenes Undergoing Free Non-Rigid Motion},\nbooktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},\nmonth = {June},\nyear = {2018}\n}\n```\n"
  },
  {
    "path": "cmake/Targets.cmake",
    "content": "################################################################################################\n# short command to setup source group\nfunction(kf_source_group group)\n  cmake_parse_arguments(VW_SOURCE_GROUP \"\" \"\" \"GLOB\" ${ARGN})\n  file(GLOB srcs ${VW_SOURCE_GROUP_GLOB})\n  #list(LENGTH ${srcs} ___size)\n  #if (___size GREATER 0)\n    source_group(${group} FILES ${srcs})\n  #endif()\nendfunction()\n\n\n################################################################################################\n# short command for declaring includes from other modules\nmacro(declare_deps_includes)\n  foreach(__arg ${ARGN})\n    get_filename_component(___path \"${CMAKE_SOURCE_DIR}/modules/${__arg}/include\" ABSOLUTE)\n    if (EXISTS ${___path})\n      include_directories(${___path})\n    endif()\n  endforeach()\n\n  unset(___path)\n  unset(__arg)\nendmacro()\n\n\n################################################################################################\n# short command for setting defeault target properties\nfunction(default_properties target)\n  set_target_properties(${target} PROPERTIES\n    DEBUG_POSTFIX \"d\"\n    ARCHIVE_OUTPUT_DIRECTORY \"${CMAKE_BINARY_DIR}/lib\"\n    RUNTIME_OUTPUT_DIRECTORY \"${CMAKE_BINARY_DIR}/bin\")\n\n    if (NOT ${target} MATCHES \"^test_\")\n      install(TARGETS ${the_target} RUNTIME DESTINATION \".\")\n    endif()\nendfunction()\n\nfunction(test_props target)\n  #os_project_label(${target} \"[test]\")\n  if(USE_PROJECT_FOLDERS)\n    set_target_properties(${target} PROPERTIES FOLDER \"Tests\")\n  endif()\nendfunction()\n\nfunction(app_props target)\n  #os_project_label(${target} \"[app]\")\n  if(USE_PROJECT_FOLDERS)\n    set_target_properties(${target} PROPERTIES FOLDER \"Apps\")\n  endif()\nendfunction()\n\n\n################################################################################################\n# short command for setting defeault target properties\nfunction(default_properties target)\n  
set_target_properties(${target} PROPERTIES\n    DEBUG_POSTFIX \"d\"\n    ARCHIVE_OUTPUT_DIRECTORY \"${CMAKE_BINARY_DIR}/lib\"\n    RUNTIME_OUTPUT_DIRECTORY \"${CMAKE_BINARY_DIR}/bin\")\n\n    if (NOT ${target} MATCHES \"^test_\")\n      install(TARGETS ${the_target} RUNTIME DESTINATION \".\")\n    endif()\nendfunction()\n\n\n################################################################################################\n# short command for adding library module\nmacro(add_module_library name)\n  set(module_name ${name})\n\n  FILE(GLOB_RECURSE sources *.cpp *.cu)\n\n  cuda_add_library(${module_name} STATIC ${sources})\n\n  if(MSVC)\n    set_target_properties(${module_name} PROPERTIES DEFINE_SYMBOL KFUSION_API_EXPORTS)\n  else()\n    add_definitions(-DKFUSION_API_EXPORTS)\n  endif()\n\n  default_properties(${module_name})\nendmacro()\n\n\n################################################################################################\n# short command for adding application module\nmacro(add_application target)\n  FILE(GLOB_RECURSE sources *.cpp)\n  add_executable(${target} ${sources})\n  default_properties(${target})\nendmacro()\n\n\n################################################################################################\n# short command for adding test target\nmacro(CREATE_TEST target)\n  FILE(GLOB SRCS *.cpp)\n  ADD_EXECUTABLE(${target} ${SRCS})\n  TARGET_LINK_LIBRARIES(${target} libgtest libgmock)\n  add_test(NAME ${target} COMMAND ${target})\n  default_properties(${target})\nendmacro()\n"
  },
  {
    "path": "cmake/Utils.cmake",
    "content": "################################################################################################\n# short alias for CMake logging\nfunction(dmsg)\n  message(STATUS \"<<${ARGN}\")\nendfunction()\n\n################################################################################################\n# Command checking if current module has cuda souces to compile\nmacro(check_cuda var)\n  file(GLOB cuda src/cuda/*.cu)\n  list(LENGTH cuda ___size)  \n  if (HAVE_CUDA AND ___size GREATER 0)  \n    set(${var} ON)\n  else()  \n    set(${var} OFF) \n  endif()\n  unset(___size)\n  unset(cuda)\nendmacro()\n\n################################################################################################\n# short command for adding library module\nmacro(warnings_disable)\n  if(NOT ENABLE_NOISY_WARNINGS)\n    set(_flag_vars \"\")\n    set(_msvc_warnings \"\")\n    set(_gxx_warnings \"\")\n    foreach(arg ${ARGN})\n      if(arg MATCHES \"^CMAKE_\")\n        list(APPEND _flag_vars ${arg})\n      elseif(arg MATCHES \"^/wd\")\n        list(APPEND _msvc_warnings ${arg})\n      elseif(arg MATCHES \"^-W\")\n        list(APPEND _gxx_warnings ${arg})\n      endif()\n    endforeach()\n    if(MSVC AND _msvc_warnings AND _flag_vars)\n      foreach(var ${_flag_vars})\n        foreach(warning ${_msvc_warnings})\n          set(${var} \"${${var}} ${warning}\")\n        endforeach()\n      endforeach()\n    elseif(CV_COMPILER_IS_GNU AND _gxx_warnings AND _flag_vars)\n      foreach(var ${_flag_vars})\n        foreach(warning ${_gxx_warnings})\n          if(NOT warning MATCHES \"^-Wno-\")\n            string(REPLACE \"${warning}\" \"\" ${var} \"${${var}}\")\n            string(REPLACE \"-W\" \"-Wno-\" warning \"${warning}\")\n          endif()\n          ocv_check_flag_support(${var} \"${warning}\" _varname)\n          if(${_varname})\n            set(${var} \"${${var}} ${warning}\")\n          endif()\n        endforeach()\n      endforeach()\n    endif()\n    unset(_flag_vars)\n    
unset(_msvc_warnings)\n    unset(_gxx_warnings)\n  endif(NOT ENABLE_NOISY_WARNINGS)\nendmacro()\n\n################################################################################################\n# Command for asstion options with some preconditions\nmacro(kf_option variable description value)\n  set(__value ${value})\n  set(__condition \"\")\n  set(__varname \"__value\")\n  foreach(arg ${ARGN})\n    if(arg STREQUAL \"IF\" OR arg STREQUAL \"if\")\n      set(__varname \"__condition\")\n    else()\n      list(APPEND ${__varname} ${arg})\n    endif()\n  endforeach()\n  unset(__varname)\n  if(\"${__condition}\" STREQUAL \"\")\n    set(__condition 2 GREATER 1)\n  endif()\n\n  if(${__condition})\n    if(\"${__value}\" MATCHES \";\")\n      if(${__value})\n        option(${variable} \"${description}\" ON)\n      else()\n        option(${variable} \"${description}\" OFF)\n      endif()\n    elseif(DEFINED ${__value})\n      if(${__value})\n        option(${variable} \"${description}\" ON)\n      else()\n        option(${variable} \"${description}\" OFF)\n      endif()\n    else()\n      option(${variable} \"${description}\" ${__value})\n    endif()\n  else()\n    unset(${variable} CACHE)\n  endif()\n  unset(__condition)\n  unset(__value)\nendmacro()\n"
  },
  {
    "path": "include/kfusion/cuda/device.hpp",
    "content": "#pragma once\n\n#include <cuda_fp16.h>\n#include <kfusion/cuda/temp_utils.hpp>\n#include <kfusion/precomp.hpp>\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// TsdfVolume\n\n// __kf_device__ kfusion::device::TsdfVolume::TsdfVolume(float2 *_data, int3 _dims, float3 _voxel_size,\n//                                                       float _trunc_dist, float _max_weight)\n//     : data(_data), dims(_dims), voxel_size(_voxel_size), trunc_dist(_trunc_dist), max_weight(_max_weight) {}\n//\n// __kf_device__ kfusion::device::TsdfVolume::TsdfVolume(const TsdfVolume &other)\n//     : data(other.data),\n//       dims(other.dims),\n//       voxel_size(other.voxel_size),\n//       trunc_dist(other.trunc_dist),\n//       max_weight(other.max_weight) {}\n\n__kf_device__ float2 *kfusion::device::TsdfVolume::operator()(int x, int y, int z) {\n    return data + x + y * dims.x + z * dims.y * dims.x;\n}\n\n__kf_device__ const float2 *kfusion::device::TsdfVolume::operator()(int x, int y, int z) const {\n    return data + x + y * dims.x + z * dims.y * dims.x;\n}\n\n__kf_device__ float2 *kfusion::device::TsdfVolume::beg(int x, int y) const { return data + x + dims.x * y; }\n\n__kf_device__ float2 *kfusion::device::TsdfVolume::zstep(float2 *const ptr) const { return ptr + dims.x * dims.y; }\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Projector\n\n__kf_device__ float2 kfusion::device::Projector::operator()(const float3 &p) const {\n    float2 coo;\n    coo.x = __fmaf_rn(f.x, __fdividef(p.x, p.z), c.x);\n    coo.y = __fmaf_rn(f.y, __fdividef(p.y, p.z), c.y);\n    return coo;\n}\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Reprojector\n\n__kf_device__ float3 kfusion::device::Reprojector::operator()(int u, int v, float z) const {\n    float x = z * (u - 
c.x) * finv.x;\n    float y = z * (v - c.y) * finv.y;\n    return make_float3(x, y, z);\n}\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Utility\n\nnamespace kfusion {\nnamespace device {\n__kf_device__ Vec3f operator*(const Mat3f &m, const Vec3f &v) {\n    return make_float3(dot(m.data[0], v), dot(m.data[1], v), dot(m.data[2], v));\n}\n\n__kf_device__ Vec3f operator*(const Aff3f &a, const Vec3f &v) { return a.R * v + a.t; }\n\n/* operator for multiplication of an affine transform by a float */\n__kf_device__ Aff3f operator*(const float &s, const Aff3f &a) {\n    Aff3f b;\n    b.R = a.R;\n    b.t = s * a.t;\n    return b;\n}\n\n/* 3-d matrix multiplication */\n__kf_device__ Mat3f operator*(const Mat3f &m, const Mat3f &n) {\n    Mat3f r;\n    r.data[0] = m.data[0] * make_float3(n.data[0].x, n.data[1].x, n.data[2].x);\n    r.data[1] = m.data[1] * make_float3(n.data[0].y, n.data[1].y, n.data[2].y);\n    r.data[2] = m.data[2] * make_float3(n.data[0].z, n.data[1].z, n.data[2].z);\n    return r;\n}\n\n/* affine transform composition */\n__kf_device__ Aff3f operator*(const Aff3f &a, const Aff3f &b) {\n    Aff3f c;\n    c.R = a.R * b.R;\n    c.t = a.t + b.t;\n    return c;\n}\n\n__kf_device__ Vec3f tr(const float4 &v) { return make_float3(v.x, v.y, v.z); }\n\nstruct plus {\n    __kf_device__ float operator()(float l, float r) const { return l + r; }\n    __kf_device__ double operator()(double l, double r) const { return l + r; }\n};\n}  // namespace device\n}  // namespace kfusion\n"
  },
  {
    "path": "include/kfusion/cuda/device_array.hpp",
    "content": "#pragma once\n\n#include <kfusion/cuda/device_memory.hpp>\n#include <kfusion/exports.hpp>\n\n#include <vector>\n\nnamespace kfusion {\nnamespace cuda {\n//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/** \\brief @b DeviceArray class\n *\n * \\note Typed container for GPU memory with reference counting.\n *\n * \\author Anatoly Baksheev\n */\ntemplate <class T>\nclass KF_EXPORTS DeviceArray : public DeviceMemory {\npublic:\n    /** \\brief Element type. */\n    typedef T type;\n\n    /** \\brief Element size. */\n    enum { elem_size = sizeof(T) };\n\n    /** \\brief Empty constructor. */\n    DeviceArray();\n\n    /** \\brief Allocates internal buffer in GPU memory\n     * \\param size_t: number of elements to allocate\n     * */\n    DeviceArray(size_t size);\n\n    /** \\brief Initializes with user allocated buffer. Reference counting is\n     * disabled in this case. \\param ptr: pointer to buffer \\param size: elemens\n     * number\n     * */\n    DeviceArray(T *ptr, size_t size);\n\n    /** \\brief Copy constructor. Just increments reference counter. */\n    DeviceArray(const DeviceArray &other);\n\n    /** \\brief Assigment operator. Just increments reference counter. */\n    DeviceArray &operator=(const DeviceArray &other);\n\n    /** \\brief Allocates internal buffer in GPU memory. If internal buffer was\n     * created before the function recreates it with new size. If new and old\n     * sizes are equal it does nothing. \\param size: elemens number\n     * */\n    void create(size_t size);\n\n    /** \\brief Decrements reference counter and releases internal buffer if\n     * needed. */\n    void release();\n\n    /** \\brief Performs data copying. If destination size differs it will be\n     * reallocated. \\param other_arg: destination container\n     * */\n    void copyTo(DeviceArray &other) const;\n\n    /** \\brief Uploads data to internal buffer in GPU memory. 
It calls create()\n     * inside to ensure that intenal buffer size is enough. \\param host_ptr_arg:\n     * pointer to buffer to upload \\param size: elemens number\n     * */\n    void upload(const T *host_ptr, size_t size);\n\n    /** \\brief Downloads data from internal buffer to CPU memory\n     * \\param host_ptr_arg: pointer to buffer to download\n     * */\n    void download(T *host_ptr) const;\n\n    /** \\brief Uploads data to internal buffer in GPU memory. It calls create()\n     * inside to ensure that intenal buffer size is enough. \\param data: host\n     * vector to upload from\n     * */\n    template <class A>\n    void upload(const std::vector<T, A> &data);\n\n    /** \\brief Downloads data from internal buffer to CPU memory\n     * \\param data:  host vector to download to\n     * */\n    template <typename A>\n    void download(std::vector<T, A> &data) const;\n\n    /** \\brief Performs swap of data pointed with another device array.\n     * \\param other: device array to swap with\n     * */\n    void swap(DeviceArray &other_arg);\n\n    /** \\brief Returns pointer for internal buffer in GPU memory. */\n    T *ptr();\n\n    /** \\brief Returns const pointer for internal buffer in GPU memory. */\n    const T *ptr() const;\n\n    // using DeviceMemory::ptr;\n\n    /** \\brief Returns pointer for internal buffer in GPU memory. */\n    operator T *();\n\n    /** \\brief Returns const pointer for internal buffer in GPU memory. */\n    operator const T *() const;\n\n    /** \\brief Returns size in elements. */\n    size_t size() const;\n};\n\n//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/** \\brief @b DeviceArray2D class\n *\n * \\note Typed container for pitched GPU memory with reference counting.\n *\n * \\author Anatoly Baksheev\n */\ntemplate <class T>\nclass KF_EXPORTS DeviceArray2D : public DeviceMemory2D {\npublic:\n    /** \\brief Element type. 
*/\n    typedef T type;\n\n    /** \\brief Element size. */\n    enum { elem_size = sizeof(T) };\n\n    /** \\brief Empty constructor. */\n    DeviceArray2D();\n\n    /** \\brief Allocates internal buffer in GPU memory\n     * \\param rows: number of rows to allocate\n     * \\param cols: number of elements in each row\n     * */\n    DeviceArray2D(int rows, int cols);\n\n    /** \\brief Initializes with user allocated buffer. Reference counting is\n     * disabled in this case. \\param rows: number of rows \\param cols: number of\n     * elements in each row \\param data: pointer to buffer \\param stepBytes:\n     * stride between two consecutive rows in bytes\n     * */\n    DeviceArray2D(int rows, int cols, void *data, size_t stepBytes);\n\n    /** \\brief Copy constructor. Just increments reference counter. */\n    DeviceArray2D(const DeviceArray2D &other);\n\n    /** \\brief Assigment operator. Just increments reference counter. */\n    DeviceArray2D &operator=(const DeviceArray2D &other);\n\n    /** \\brief Allocates internal buffer in GPU memory. If internal buffer was\n     * created before the function recreates it with new size. If new and old\n     * sizes are equal it does nothing. \\param rows: number of rows to allocate\n     * \\param cols: number of elements in each row\n     * */\n    void create(int rows, int cols);\n\n    /** \\brief Decrements reference counter and releases internal buffer if\n     * needed. */\n    void release();\n\n    /** \\brief Performs data copying. If destination size differs it will be\n     * reallocated. \\param other: destination container\n     * */\n    void copyTo(DeviceArray2D &other) const;\n\n    /** \\brief Uploads data to internal buffer in GPU memory. It calls create()\n     * inside to ensure that intenal buffer size is enough. 
\\param host_ptr:\n     * pointer to host buffer to upload \\param host_step: stride between two\n     * consecutive rows in bytes for host buffer \\param rows: number of rows to\n     * upload \\param cols: number of elements in each row\n     * */\n    void upload(const void *host_ptr, size_t host_step, int rows, int cols);\n\n    /** \\brief Downloads data from internal buffer to CPU memory. User is\n     * resposible for correct host buffer size. \\param host_ptr: pointer to host\n     * buffer to download \\param host_step: stride between two consecutive rows in\n     * bytes for host buffer\n     * */\n    void download(void *host_ptr, size_t host_step) const;\n\n    /** \\brief Performs swap of data pointed with another device array.\n     * \\param other: device array to swap with\n     * */\n    void swap(DeviceArray2D &other_arg);\n\n    /** \\brief Uploads data to internal buffer in GPU memory. It calls create()\n     * inside to ensure that intenal buffer size is enough. \\param data: host\n     * vector to upload from \\param cols: stride in elements between two\n     * consecutive rows for host buffer\n     * */\n    template <class A>\n    void upload(const std::vector<T, A> &data, int cols);\n\n    /** \\brief Downloads data from internal buffer to CPU memory\n     * \\param data: host vector to download to\n     * \\param cols: Output stride in elements between two consecutive rows for\n     * host vector.\n     * */\n    template <class A>\n    void download(std::vector<T, A> &data, int &cols) const;\n\n    /** \\brief Returns pointer to given row in internal buffer.\n     * \\param y_arg: row index\n     * */\n    T *ptr(int y = 0);\n\n    /** \\brief Returns const pointer to given row in internal buffer.\n     * \\param y_arg: row index\n     * */\n    const T *ptr(int y = 0) const;\n\n    // using DeviceMemory2D::ptr;\n\n    /** \\brief Returns pointer for internal buffer in GPU memory. 
*/\n    operator T *();\n\n    /** \\brief Returns const pointer for internal buffer in GPU memory. */\n    operator const T *() const;\n\n    /** \\brief Returns number of elements in each row. */\n    int cols() const;\n\n    /** \\brief Returns number of rows. */\n    int rows() const;\n\n    /** \\brief Returns step in elements. */\n    size_t elem_step() const;\n};\n}  // namespace cuda\n\nnamespace device {\nusing kfusion::cuda::DeviceArray;\nusing kfusion::cuda::DeviceArray2D;\n}  // namespace device\n}  // namespace kfusion\n\n/////////////////////  Inline implementations of DeviceArray\n///////////////////////////////////////////////\n\ntemplate <class T>\ninline kfusion::cuda::DeviceArray<T>::DeviceArray() {}\ntemplate <class T>\ninline kfusion::cuda::DeviceArray<T>::DeviceArray(size_t size) : DeviceMemory(size * elem_size) {}\ntemplate <class T>\ninline kfusion::cuda::DeviceArray<T>::DeviceArray(T *ptr, size_t size) : DeviceMemory(ptr, size * elem_size) {}\ntemplate <class T>\ninline kfusion::cuda::DeviceArray<T>::DeviceArray(const DeviceArray &other) : DeviceMemory(other) {}\ntemplate <class T>\ninline kfusion::cuda::DeviceArray<T> &kfusion::cuda::DeviceArray<T>::operator=(const DeviceArray &other) {\n    DeviceMemory::operator=(other);\n    return *this;\n}\n\ntemplate <class T>\ninline void kfusion::cuda::DeviceArray<T>::create(size_t size) {\n    DeviceMemory::create(size * elem_size);\n}\ntemplate <class T>\ninline void kfusion::cuda::DeviceArray<T>::release() {\n    DeviceMemory::release();\n}\n\ntemplate <class T>\ninline void kfusion::cuda::DeviceArray<T>::copyTo(DeviceArray &other) const {\n    DeviceMemory::copyTo(other);\n}\ntemplate <class T>\ninline void kfusion::cuda::DeviceArray<T>::upload(const T *host_ptr, size_t size) {\n    DeviceMemory::upload(host_ptr, size * elem_size);\n}\ntemplate <class T>\ninline void kfusion::cuda::DeviceArray<T>::download(T *host_ptr) const {\n    DeviceMemory::download(host_ptr);\n}\n\ntemplate <class 
T>\nvoid kfusion::cuda::DeviceArray<T>::swap(DeviceArray &other_arg) {\n    DeviceMemory::swap(other_arg);\n}\n\ntemplate <class T>\ninline kfusion::cuda::DeviceArray<T>::operator T *() {\n    return ptr();\n}\ntemplate <class T>\ninline kfusion::cuda::DeviceArray<T>::operator const T *() const {\n    return ptr();\n}\ntemplate <class T>\ninline size_t kfusion::cuda::DeviceArray<T>::size() const {\n    return sizeBytes() / elem_size;\n}\n\ntemplate <class T>\ninline T *kfusion::cuda::DeviceArray<T>::ptr() {\n    return DeviceMemory::ptr<T>();\n}\ntemplate <class T>\ninline const T *kfusion::cuda::DeviceArray<T>::ptr() const {\n    return DeviceMemory::ptr<T>();\n}\n\ntemplate <class T>\ntemplate <class A>\ninline void kfusion::cuda::DeviceArray<T>::upload(const std::vector<T, A> &data) {\n    upload(&data[0], data.size());\n}\ntemplate <class T>\ntemplate <class A>\ninline void kfusion::cuda::DeviceArray<T>::download(std::vector<T, A> &data) const {\n    data.resize(size());\n    if (!data.empty())\n        download(&data[0]);\n}\n\n/////////////////////  Inline implementations of DeviceArray2D\n///////////////////////////////////////////////\n\ntemplate <class T>\ninline kfusion::cuda::DeviceArray2D<T>::DeviceArray2D() {}\ntemplate <class T>\ninline kfusion::cuda::DeviceArray2D<T>::DeviceArray2D(int rows, int cols) : DeviceMemory2D(rows, cols * elem_size) {}\ntemplate <class T>\ninline kfusion::cuda::DeviceArray2D<T>::DeviceArray2D(int rows, int cols, void *data, size_t stepBytes)\n    : DeviceMemory2D(rows, cols * elem_size, data, stepBytes) {}\ntemplate <class T>\ninline kfusion::cuda::DeviceArray2D<T>::DeviceArray2D(const DeviceArray2D &other) : DeviceMemory2D(other) {}\ntemplate <class T>\ninline kfusion::cuda::DeviceArray2D<T> &kfusion::cuda::DeviceArray2D<T>::operator=(const DeviceArray2D &other) {\n    DeviceMemory2D::operator=(other);\n    return *this;\n}\n\ntemplate <class T>\ninline void kfusion::cuda::DeviceArray2D<T>::create(int rows, int cols) {\n    
DeviceMemory2D::create(rows, cols * elem_size);\n}\ntemplate <class T>\ninline void kfusion::cuda::DeviceArray2D<T>::release() {\n    DeviceMemory2D::release();\n}\n\ntemplate <class T>\ninline void kfusion::cuda::DeviceArray2D<T>::copyTo(DeviceArray2D &other) const {\n    DeviceMemory2D::copyTo(other);\n}\ntemplate <class T>\ninline void kfusion::cuda::DeviceArray2D<T>::upload(const void *host_ptr, size_t host_step, int rows, int cols) {\n    DeviceMemory2D::upload(host_ptr, host_step, rows, cols * elem_size);\n}\ntemplate <class T>\ninline void kfusion::cuda::DeviceArray2D<T>::download(void *host_ptr, size_t host_step) const {\n    DeviceMemory2D::download(host_ptr, host_step);\n}\n\ntemplate <class T>\ntemplate <class A>\ninline void kfusion::cuda::DeviceArray2D<T>::upload(const std::vector<T, A> &data, int cols) {\n    upload(&data[0], cols * elem_size, data.size() / cols, cols);\n}\n\ntemplate <class T>\ntemplate <class A>\ninline void kfusion::cuda::DeviceArray2D<T>::download(std::vector<T, A> &data, int &elem_step) const {\n    elem_step = cols();\n    data.resize(cols() * rows());\n    if (!data.empty())\n        download(&data[0], colsBytes());\n}\n\ntemplate <class T>\nvoid kfusion::cuda::DeviceArray2D<T>::swap(DeviceArray2D &other_arg) {\n    DeviceMemory2D::swap(other_arg);\n}\n\ntemplate <class T>\ninline T *kfusion::cuda::DeviceArray2D<T>::ptr(int y) {\n    return DeviceMemory2D::ptr<T>(y);\n}\ntemplate <class T>\ninline const T *kfusion::cuda::DeviceArray2D<T>::ptr(int y) const {\n    return DeviceMemory2D::ptr<T>(y);\n}\n\ntemplate <class T>\ninline kfusion::cuda::DeviceArray2D<T>::operator T *() {\n    return ptr();\n}\ntemplate <class T>\ninline kfusion::cuda::DeviceArray2D<T>::operator const T *() const {\n    return ptr();\n}\n\ntemplate <class T>\ninline int kfusion::cuda::DeviceArray2D<T>::cols() const {\n    return DeviceMemory2D::colsBytes() / elem_size;\n}\ntemplate <class T>\ninline int kfusion::cuda::DeviceArray2D<T>::rows() const {\n    
return DeviceMemory2D::rows();\n}\n\ntemplate <class T>\ninline size_t kfusion::cuda::DeviceArray2D<T>::elem_step() const {\n    return DeviceMemory2D::step() / elem_size;\n}\n"
  },
  {
    "path": "include/kfusion/cuda/device_memory.hpp",
    "content": "#pragma once\n\n#include <kfusion/cuda/kernel_containers.hpp>\n#include <kfusion/exports.hpp>\n\nnamespace kfusion {\nnamespace cuda {\n/** \\brief Error handler. All GPU functions from this subsystem call the\n * function to report an error. For internal use only */\nKF_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func = \"\");\n\n//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/** \\brief @b DeviceMemory class\n *\n * \\note This is a BLOB container class with reference counting for GPU memory.\n *\n * \\author Anatoly Baksheev\n */\n\nclass KF_EXPORTS DeviceMemory {\npublic:\n    /** \\brief Empty constructor. */\n    DeviceMemory();\n\n    /** \\brief Destructor. */\n    ~DeviceMemory();\n\n    /** \\brief Allocates internal buffer in GPU memory\n     * \\param sizeBytes_arg: amount of memory to allocate\n     * */\n    DeviceMemory(size_t sizeBytes_arg);\n\n    /** \\brief Initializes with user allocated buffer. Reference counting is\n     * disabled in this case. \\param ptr_arg: pointer to buffer \\param\n     * sizeBytes_arg: buffer size\n     * */\n    DeviceMemory(void *ptr_arg, size_t sizeBytes_arg);\n\n    /** \\brief Copy constructor. Just increments reference counter. */\n    DeviceMemory(const DeviceMemory &other_arg);\n\n    /** \\brief Assigment operator. Just increments reference counter. */\n    DeviceMemory &operator=(const DeviceMemory &other_arg);\n\n    /** \\brief Allocates internal buffer in GPU memory. If internal buffer was\n     * created before the function recreates it with new size. If new and old\n     * sizes are equal it does nothing. \\param sizeBytes_arg: buffer size\n     * */\n    void create(size_t sizeBytes_arg);\n\n    /** \\brief Decrements reference counter and releases internal buffer if\n     * needed. */\n    void release();\n\n    /** \\brief Performs data copying. 
If destination size differs it will be\n     * reallocated. \\param other_arg: destination container\n     * */\n    void copyTo(DeviceMemory &other) const;\n\n    /** \\brief Uploads data to internal buffer in GPU memory. It calls create()\n     * inside to ensure that intenal buffer size is enough. \\param host_ptr_arg:\n     * pointer to buffer to upload \\param sizeBytes_arg: buffer size\n     * */\n    void upload(const void *host_ptr_arg, size_t sizeBytes_arg);\n\n    /** \\brief Downloads data from internal buffer to CPU memory\n     * \\param host_ptr_arg: pointer to buffer to download\n     * */\n    void download(void *host_ptr_arg) const;\n\n    /** \\brief Performs swap of data pointed with another device memory.\n     * \\param other: device memory to swap with\n     * */\n    void swap(DeviceMemory &other_arg);\n\n    /** \\brief Returns pointer for internal buffer in GPU memory. */\n    template <class T>\n    T *ptr();\n\n    /** \\brief Returns constant pointer for internal buffer in GPU memory. */\n    template <class T>\n    const T *ptr() const;\n\n    /** \\brief Conversion to PtrSz for passing to kernel functions. */\n    template <class U>\n    operator PtrSz<U>() const;\n\n    /** \\brief Returns true if unallocated otherwise false. */\n    bool empty() const;\n\n    size_t sizeBytes() const;\n\nprivate:\n    /** \\brief Device pointer. */\n    void *data_;\n\n    /** \\brief Allocated size in bytes. */\n    size_t sizeBytes_;\n\n    /** \\brief Pointer to reference counter in CPU memory. */\n    int *refcount_;\n};\n\n//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/** \\brief @b DeviceMemory2D class\n *\n * \\note This is a BLOB container class with reference counting for pitched GPU\n * memory.\n *\n * \\author Anatoly Baksheev\n */\n\nclass KF_EXPORTS DeviceMemory2D {\npublic:\n    /** \\brief Empty constructor. 
*/\n    DeviceMemory2D();\n\n    /** \\brief Destructor. */\n    ~DeviceMemory2D();\n\n    /** \\brief Allocates internal buffer in GPU memory\n     * \\param rows_arg: number of rows to allocate\n     * \\param colsBytes_arg: width of the buffer in bytes\n     * */\n    DeviceMemory2D(int rows_arg, int colsBytes_arg);\n\n    /** \\brief Initializes with user allocated buffer. Reference counting is\n     * disabled in this case. \\param rows_arg: number of rows \\param\n     * colsBytes_arg: width of the buffer in bytes \\param data_arg: pointer to\n     * buffer \\param stepBytes_arg: stride between two consecutive rows in bytes\n     * */\n    DeviceMemory2D(int rows_arg, int colsBytes_arg, void *data_arg, size_t step_arg);\n\n    /** \\brief Copy constructor. Just increments reference counter. */\n    DeviceMemory2D(const DeviceMemory2D &other_arg);\n\n    /** \\brief Assigment operator. Just increments reference counter. */\n    DeviceMemory2D &operator=(const DeviceMemory2D &other_arg);\n\n    /** \\brief Allocates internal buffer in GPU memory. If internal buffer was\n     * created before the function recreates it with new size. If new and old\n     * sizes are equal it does nothing. \\param ptr_arg: number of rows to allocate\n     * \\param sizeBytes_arg: width of the buffer in bytes\n     * */\n    void create(int rows_arg, int colsBytes_arg);\n\n    /** \\brief Decrements reference counter and releases internal buffer if\n     * needed. */\n    void release();\n\n    /** \\brief Performs data copying. If destination size differs it will be\n     * reallocated. \\param other_arg: destination container\n     * */\n    void copyTo(DeviceMemory2D &other) const;\n\n    /** \\brief Uploads data to internal buffer in GPU memory. It calls create()\n     * inside to ensure that intenal buffer size is enough. 
\\param host_ptr_arg:\n     * pointer to host buffer to upload \\param host_step_arg: stride between two\n     * consecutive rows in bytes for host buffer \\param rows_arg: number of rows\n     * to upload \\param sizeBytes_arg: width of host buffer in bytes\n     * */\n    void upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg, int colsBytes_arg);\n\n    /** \\brief Downloads data from internal buffer to CPU memory. User is\n     * resposible for correct host buffer size. \\param host_ptr_arg: pointer to\n     * host buffer to download \\param host_step_arg: stride between two\n     * consecutive rows in bytes for host buffer\n     * */\n    void download(void *host_ptr_arg, size_t host_step_arg) const;\n\n    /** \\brief Performs swap of data pointed with another device memory.\n     * \\param other: device memory to swap with\n     * */\n    void swap(DeviceMemory2D &other_arg);\n\n    /** \\brief Returns pointer to given row in internal buffer.\n     * \\param y_arg: row index\n     * */\n    template <class T>\n    T *ptr(int y_arg = 0);\n\n    /** \\brief Returns constant pointer to given row in internal buffer.\n     * \\param y_arg: row index\n     * */\n    template <class T>\n    const T *ptr(int y_arg = 0) const;\n\n    /** \\brief Conversion to PtrStep for passing to kernel functions. */\n    template <class U>\n    operator PtrStep<U>() const;\n\n    /** \\brief Conversion to PtrStepSz for passing to kernel functions. */\n    template <class U>\n    operator PtrStepSz<U>() const;\n\n    /** \\brief Returns true if unallocated otherwise false. */\n    bool empty() const;\n\n    /** \\brief Returns number of bytes in each row. */\n    int colsBytes() const;\n\n    /** \\brief Returns number of rows. */\n    int rows() const;\n\n    /** \\brief Returns stride between two consecutive rows in bytes for internal\n     * buffer. Step is stored always and everywhere in bytes!!! 
*/\n    size_t step() const;\n\nprivate:\n    /** \\brief Device pointer. */\n    void *data_;\n\n    /** \\brief Stride between two consecutive rows in bytes for internal buffer.\n     * Step is stored always and everywhere in bytes!!! */\n    size_t step_;\n\n    /** \\brief Width of the buffer in bytes. */\n    int colsBytes_;\n\n    /** \\brief Number of rows. */\n    int rows_;\n\n    /** \\brief Pointer to reference counter in CPU memory. */\n    int *refcount_;\n};\n}  // namespace cuda\n\nnamespace device {\nusing kfusion::cuda::DeviceMemory;\nusing kfusion::cuda::DeviceMemory2D;\n}  // namespace device\n}  // namespace kfusion\n\n/////////////////////  Inline implementations of DeviceMemory\n///////////////////////////////////////////////\n\ntemplate <class T>\ninline T *kfusion::cuda::DeviceMemory::ptr() {\n    return (T *) data_;\n}\ntemplate <class T>\ninline const T *kfusion::cuda::DeviceMemory::ptr() const {\n    return (const T *) data_;\n}\n\ntemplate <class U>\ninline kfusion::cuda::DeviceMemory::operator kfusion::cuda::PtrSz<U>() const {\n    PtrSz<U> result;\n    result.data = (U *) ptr<U>();\n    result.size = sizeBytes_ / sizeof(U);\n    return result;\n}\n\n/////////////////////  Inline implementations of DeviceMemory2D\n///////////////////////////////////////////////\n\ntemplate <class T>\nT *kfusion::cuda::DeviceMemory2D::ptr(int y_arg) {\n    return (T *) ((char *) data_ + y_arg * step_);\n}\ntemplate <class T>\nconst T *kfusion::cuda::DeviceMemory2D::ptr(int y_arg) const {\n    return (const T *) ((const char *) data_ + y_arg * step_);\n}\n\ntemplate <class U>\nkfusion::cuda::DeviceMemory2D::operator kfusion::cuda::PtrStep<U>() const {\n    PtrStep<U> result;\n    result.data = (U *) ptr<U>();\n    result.step = step_;\n    return result;\n}\n\ntemplate <class U>\nkfusion::cuda::DeviceMemory2D::operator kfusion::cuda::PtrStepSz<U>() const {\n    PtrStepSz<U> result;\n    result.data = (U *) ptr<U>();\n    result.step = step_;\n    
result.cols = colsBytes_ / sizeof(U);\n    result.rows = rows_;\n    return result;\n}\n"
  },
  {
    "path": "include/kfusion/cuda/imgproc.hpp",
    "content": "#pragma once\n\n/* kinfu includes */\n#include <kfusion/types.hpp>\n\n/* pcl includes */\n#include <pcl/PolygonMesh.h>\n\nnamespace kfusion {\nnamespace cuda {\nKF_EXPORTS void depthBilateralFilter(const Depth &in, Depth &out, int ksz, float sigma_spatial, float sigma_depth);\n\nKF_EXPORTS void depthTruncation(Depth &depth, float threshold);\n\nKF_EXPORTS void depthBuildPyramid(const Depth &depth, Depth &pyramid, float sigma_depth);\n\nKF_EXPORTS void computeNormalsAndMaskDepth(const Intr &intr, Depth &depth, Normals &normals);\n\nKF_EXPORTS void computePointNormals(const Intr &intr, const Depth &depth, Cloud &points, Normals &normals);\n\nKF_EXPORTS void computeDists(const Depth &depth, Dists &dists, const Intr &intr);\n\nKF_EXPORTS void resizeDepthNormals(const Depth &depth, const Normals &normals, Depth &depth_out, Normals &normals_out);\n\nKF_EXPORTS void resizePointsNormals(const Cloud &points, const Normals &normals, Cloud &points_out,\n                                    Normals &normals_out);\n\nKF_EXPORTS void waitAllDefaultStream();\n\nKF_EXPORTS void renderTangentColors(const Normals &normals, Image &image);\n\n/** \\brief rasterises the canonical surface\n * \\param[in] intr camera intrinsics\n * \\param[in] cam2vol camera -> volume coordinate transform\n * \\param[in] surface surface to rasterise\n * \\param[out] points_out frame with the currently visible geometry\n * \\param[out] normals_out frame with the normals of the currently visible geometry\n */\nKF_EXPORTS void rasteriseSurface(const Intr &intr, const Affine3f &vol2cam, const Surface &s, Cloud &vertices_out,\n                                 Normals &normals_out);\n\nKF_EXPORTS void renderImage(const Depth &depth, const Normals &normals, const Intr &intr, const Vec3f &light_pose,\n                            Image &image);\n\nKF_EXPORTS void renderImage(const Cloud &points, const Normals &normals, const Intr &intr, const Vec3f &light_pose,\n                            Image 
&image);\n}  // namespace cuda\n}  // namespace kfusion\n"
  },
  {
    "path": "include/kfusion/cuda/kernel_containers.hpp",
    "content": "#pragma once\n\n#if defined(__CUDACC__)\n#define __kf_hdevice__ __host__ __device__ __forceinline__\n#define __kf_device__ __device__ __forceinline__\n#else\n#define __kf_hdevice__\n#define __kf_device__\n#endif\n\n#include <cstddef>\n\nnamespace kfusion {\nnamespace cuda {\ntemplate <typename T>\nstruct DevPtr {\n    typedef T elem_type;\n    const static size_t elem_size = sizeof(elem_type);\n\n    T *data;\n\n    __kf_hdevice__ DevPtr() : data(0) {}\n    __kf_hdevice__ DevPtr(T *data_arg) : data(data_arg) {}\n\n    __kf_hdevice__ size_t elemSize() const { return elem_size; }\n    __kf_hdevice__ operator T *() { return data; }\n    __kf_hdevice__ operator const T *() const { return data; }\n};\n\ntemplate <typename T>\nstruct PtrSz : public DevPtr<T> {\n    __kf_hdevice__ PtrSz() : size(0) {}\n    __kf_hdevice__ PtrSz(T *data_arg, size_t size_arg) : DevPtr<T>(data_arg), size(size_arg) {}\n\n    size_t size;\n};\n\ntemplate <typename T>\nstruct PtrStep : public DevPtr<T> {\n    __kf_hdevice__ PtrStep() : step(0) {}\n    __kf_hdevice__ PtrStep(T *data_arg, size_t step_arg) : DevPtr<T>(data_arg), step(step_arg) {}\n\n    /** \\brief stride between two consecutive rows in bytes. Step is stored always\n     * and everywhere in bytes!!! 
*/\n    size_t step;\n\n    __kf_hdevice__ T *ptr(int y = 0) { return (T *) ((char *) DevPtr<T>::data + y * step); }\n    __kf_hdevice__ const T *ptr(int y = 0) const { return (const T *) ((const char *) DevPtr<T>::data + y * step); }\n\n    __kf_hdevice__ T &operator()(int y, int x) { return ptr(y)[x]; }\n    __kf_hdevice__ const T &operator()(int y, int x) const { return ptr(y)[x]; }\n};\n\ntemplate <typename T>\nstruct PtrStepSz : public PtrStep<T> {\n    __kf_hdevice__ PtrStepSz() : cols(0), rows(0) {}\n    __kf_hdevice__ PtrStepSz(int rows_arg, int cols_arg, T *data_arg, size_t step_arg)\n        : PtrStep<T>(data_arg, step_arg), cols(cols_arg), rows(rows_arg) {}\n\n    int cols;\n    int rows;\n};\n}  // namespace cuda\n\nnamespace device {\nusing kfusion::cuda::PtrStep;\nusing kfusion::cuda::PtrStepSz;\nusing kfusion::cuda::PtrSz;\n}  // namespace device\n}  // namespace kfusion\n\nnamespace kf = kfusion;\n"
  },
  {
    "path": "include/kfusion/cuda/marching_cubes.hpp",
    "content": "#pragma once\n\n/* eigen includes */\n#include <Eigen/Core>\n\n/* kinfu includes */\n#include <kfusion/cuda/device_array.hpp>\n#include <kfusion/cuda/tsdf_volume.hpp>\n\n/* pcl includes */\n#include <pcl/point_types.h>\n\nnamespace kfusion {\nnamespace cuda {\n\n/** \\brief MarchingCubes implements MarchingCubes functionality for TSDF volume on GPU\n * \\author Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com)\n */\nclass KF_EXPORTS MarchingCubes {\npublic:\n    /** \\brief Default size for triangles buffer */\n    enum { POINTS_PER_TRIANGLE = 3, DEFAULT_TRIANGLES_BUFFER_SIZE = 2 * 1000 * 1000 * POINTS_PER_TRIANGLE };\n\n    /** \\brief Smart pointer. */\n    typedef boost::shared_ptr<MarchingCubes> Ptr;\n\n    /** \\brief Default constructor */\n    MarchingCubes();\n    /** \\brief Destructor */\n    ~MarchingCubes();\n\n    /* set volume pose */\n    void setPose(const cv::Affine3f& pose);\n\n    /* run mc to extract the zero level set */\n    Surface run(const TsdfVolume& volume, DeviceArray<pcl::PointXYZ>& vertices_buffer,\n                DeviceArray<pcl::Normal>& normals_buffer);\n\nprivate:\n    /** \\brief Edge table for marching cubes  */\n    DeviceArray<int> edgeTable_;\n\n    /** \\brief Number of vertextes table for marching cubes  */\n    DeviceArray<int> numVertsTable_;\n\n    /** \\brief Triangles table for marching cubes  */\n    DeviceArray<int> triTable_;\n\n    /** \\brief Temporary buffer used by marching cubes (first row stores occuped voxes id, second number of vetexes,\n     * third poits offsets */\n    DeviceArray2D<int> occupied_voxels_buffer_;\n\n    /* volume pose */\n    cv::Affine3f pose;\n};\n}  // namespace cuda\n}  // namespace kfusion\n"
  },
  {
    "path": "include/kfusion/cuda/projective_icp.hpp",
    "content": "#pragma once\n\n#include <kfusion/types.hpp>\n\nnamespace kfusion {\nnamespace cuda {\nclass ProjectiveICP {\npublic:\n    enum { MAX_PYRAMID_LEVELS = 4 };\n\n    typedef std::vector<Depth> DepthPyr;\n    typedef std::vector<Cloud> PointsPyr;\n    typedef std::vector<Normals> NormalsPyr;\n\n    ProjectiveICP();\n    virtual ~ProjectiveICP();\n\n    float getDistThreshold() const;\n    void setDistThreshold(float distance);\n\n    float getAngleThreshold() const;\n    void setAngleThreshold(float angle);\n\n    void setIterationsNum(const std::vector<int> &iters);\n    int getUsedLevelsNum() const;\n\n    virtual bool estimateTransform(Affine3f &affine, const Intr &intr, const Frame &curr, const Frame &prev);\n\n    /** The function takes masked depth, i.e. it assumes for performance reasons\n     * that \"if depth(y,x) is not zero, then normals(y,x) surely is not qnan\" */\n    virtual bool estimateTransform(Affine3f &affine, const Intr &intr, const DepthPyr &dcurr, const NormalsPyr ncurr,\n                                   const DepthPyr dprev, const NormalsPyr nprev);\n    virtual bool estimateTransform(Affine3f &affine, const Intr &intr, const PointsPyr &vcurr, const NormalsPyr ncurr,\n                                   const PointsPyr vprev, const NormalsPyr nprev);\n\n    // static Vec3f rodrigues2(const Mat3f& matrix);\nprivate:\n    std::vector<int> iters_;\n    float angle_thres_;\n    float dist_thres_;\n    DeviceArray2D<float> buffer_;\n\n    struct StreamHelper;\n    cv::Ptr<StreamHelper> shelp_;\n};\n}  // namespace cuda\n}  // namespace kfusion\n"
  },
  {
    "path": "include/kfusion/cuda/temp_utils.hpp",
    "content": "#pragma once\n\n#include <cuda.h>\n#include <kfusion/cuda/kernel_containers.hpp>\n\n#define FULL_MASK 0xffffffff\n\nnamespace kfusion {\nnamespace device {\ntemplate <class T>\n__kf_hdevice__ void swap(T &a, T &b) {\n    T c(a);\n    a = b;\n    b = c;\n}\n\ntemplate <typename T>\nstruct numeric_limits;\n\ntemplate <>\nstruct numeric_limits<float> {\n    __kf_device__ static float quiet_NaN() { return __int_as_float(0x7fffffff); /*CUDART_NAN_F*/ };\n    __kf_device__ static float epsilon() { return 1.192092896e-07f /*FLT_EPSILON*/; };\n    __kf_device__ static float min() { return 1.175494351e-38f /*FLT_MIN*/; };\n    __kf_device__ static float max() { return 3.402823466e+38f /*FLT_MAX*/; };\n};\n\ntemplate <>\nstruct numeric_limits<unsigned short> {\n    __kf_device__ static unsigned short max() { return USHRT_MAX; };\n};\n\n__kf_device__ float dot(const float3 &v1, const float3 &v2) {\n    return __fmaf_rn(v1.x, v2.x, __fmaf_rn(v1.y, v2.y, v1.z * v2.z));\n}\n\n__kf_device__ float3 &operator+=(float3 &vec, const float &v) {\n    vec.x += v;\n    vec.y += v;\n    vec.z += v;\n    return vec;\n}\n\n__kf_device__ float3 &operator+=(float3 &v1, const float3 &v2) {\n    v1.x += v2.x;\n    v1.y += v2.y;\n    v1.z += v2.z;\n    return v1;\n}\n\n__kf_device__ float3 operator+(const float3 &v1, const float3 &v2) {\n    return make_float3(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z);\n}\n\n__kf_device__ float3 operator*(const float3 &v1, const float3 &v2) {\n    return make_float3(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z);\n}\n\n__kf_hdevice__ float3 operator*(const float3 &v1, const int3 &v2) {\n    return make_float3(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z);\n}\n\n__kf_device__ float3 operator/(const float3 &v1, const float3 &v2) {\n    return make_float3(v1.x / v2.x, v1.y / v2.y, v1.z / v2.z);\n}\n\n__kf_hdevice__ float3 operator/(const float &v, const float3 &vec) {\n    return make_float3(v / vec.x, v / vec.y, v / vec.z);\n}\n\n__kf_device__ float3 
&operator*=(float3 &vec, const float &v) {\n    vec.x *= v;\n    vec.y *= v;\n    vec.z *= v;\n    return vec;\n}\n\n__kf_hdevice__ float3 operator-(const float3 &v1, const float3 &v2) {\n    return make_float3(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z);\n}\n\n__kf_hdevice__ float3 operator*(const float3 &v1, const float &v) { return make_float3(v1.x * v, v1.y * v, v1.z * v); }\n\n__kf_hdevice__ float3 operator*(const float &v, const float3 &v1) { return make_float3(v1.x * v, v1.y * v, v1.z * v); }\n\n__kf_device__ float norm(const float3 &v) { return sqrt(dot(v, v)); }\n\n__kf_device__ float norm_sqr(const float3 &v) { return dot(v, v); }\n\n__kf_device__ float3 normalized(const float3 &v) { return v * rsqrt(dot(v, v)); }\n\n__kf_hdevice__ float3 cross(const float3 &v1, const float3 &v2) {\n    return make_float3(v1.y * v2.z - v1.z * v2.y, v1.z * v2.x - v1.x * v2.z, v1.x * v2.y - v1.y * v2.x);\n}\n\n__kf_device__ void computeRoots2(const float &b, const float &c, float3 &roots) {\n    roots.x = 0.f;\n    float d = b * b - 4.f * c;\n    if (d < 0.f)  // no real roots!!!! 
THIS SHOULD NOT HAPPEN!\n        d = 0.f;\n\n    float sd = sqrtf(d);\n\n    roots.z = 0.5f * (b + sd);\n    roots.y = 0.5f * (b - sd);\n}\n\n__kf_device__ void computeRoots3(float c0, float c1, float c2, float3 &roots) {\n    if (fabsf(c0) < numeric_limits<float>::epsilon())  // one root is 0 -> quadratic equation\n    {\n        computeRoots2(c2, c1, roots);\n    } else {\n        const float s_inv3  = 1.f / 3.f;\n        const float s_sqrt3 = sqrtf(3.f);\n        // Construct the parameters used in classifying the roots of the equation\n        // and in solving the equation for the roots in closed form.\n        float c2_over_3 = c2 * s_inv3;\n        float a_over_3  = (c1 - c2 * c2_over_3) * s_inv3;\n        if (a_over_3 > 0.f)\n            a_over_3 = 0.f;\n\n        float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));\n\n        float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;\n        if (q > 0.f)\n            q = 0.f;\n\n        // Compute the eigenvalues by solving for the roots of the polynomial.\n        float rho       = sqrtf(-a_over_3);\n        float theta     = atan2f(sqrtf(-q), half_b) * s_inv3;\n        float cos_theta = __cosf(theta);\n        float sin_theta = __sinf(theta);\n        roots.x         = c2_over_3 + 2.f * rho * cos_theta;\n        roots.y         = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);\n        roots.z         = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);\n\n        // Sort in increasing order.\n        if (roots.x >= roots.y)\n            swap(roots.x, roots.y);\n\n        if (roots.y >= roots.z) {\n            swap(roots.y, roots.z);\n\n            if (roots.x >= roots.y)\n                swap(roots.x, roots.y);\n        }\n        if (roots.x <= 0)  // eigenvalue for symmetric positive semi-definite matrix can\n                           // not be negative! 
Set it to 0\n            computeRoots2(c2, c1, roots);\n    }\n}\n\nstruct Eigen33 {\npublic:\n    template <int Rows>\n    struct MiniMat {\n        float3 data[Rows];\n        __kf_hdevice__ float3 &operator[](int i) { return data[i]; }\n        __kf_hdevice__ const float3 &operator[](int i) const { return data[i]; }\n    };\n    typedef MiniMat<3> Mat33;\n    typedef MiniMat<4> Mat43;\n\n    static __kf_device__ float3 unitOrthogonal(const float3 &src) {\n        float3 perp;\n        /* Let us compute the crossed product of *this with a vector\n         * that is not too close to being colinear to *this.\n         */\n\n        /* unless the x and y coords are both close to zero, we can\n         * simply take ( -y, x, 0 ) and normalize it.\n         */\n        if (!isMuchSmallerThan(src.x, src.z) || !isMuchSmallerThan(src.y, src.z)) {\n            float invnm = rsqrtf(src.x * src.x + src.y * src.y);\n            perp.x      = -src.y * invnm;\n            perp.y      = src.x * invnm;\n            perp.z      = 0.0f;\n        }\n        /* if both x and y are close to zero, then the vector is close\n         * to the z-axis, so it's far from colinear to the x-axis for instance.\n         * So we take the crossed product with (1,0,0) and normalize it.\n         */\n        else {\n            float invnm = rsqrtf(src.z * src.z + src.y * src.y);\n            perp.x      = 0.0f;\n            perp.y      = -src.z * invnm;\n            perp.z      = src.y * invnm;\n        }\n\n        return perp;\n    }\n\n    __kf_device__ Eigen33(volatile float *mat_pkg_arg) : mat_pkg(mat_pkg_arg) {}\n    __kf_device__ void compute(Mat33 &tmp, Mat33 &vec_tmp, Mat33 &evecs, float3 &evals) {\n        // Scale the matrix so its entries are in [-1,1].  
The scaling is applied\n        // only when at least one matrix entry has magnitude larger than 1.\n\n        float max01 = fmaxf(fabsf(mat_pkg[0]), fabsf(mat_pkg[1]));\n        float max23 = fmaxf(fabsf(mat_pkg[2]), fabsf(mat_pkg[3]));\n        float max45 = fmaxf(fabsf(mat_pkg[4]), fabsf(mat_pkg[5]));\n        float m0123 = fmaxf(max01, max23);\n        float scale = fmaxf(max45, m0123);\n\n        if (scale <= numeric_limits<float>::min())\n            scale = 1.f;\n\n        mat_pkg[0] /= scale;\n        mat_pkg[1] /= scale;\n        mat_pkg[2] /= scale;\n        mat_pkg[3] /= scale;\n        mat_pkg[4] /= scale;\n        mat_pkg[5] /= scale;\n\n        // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0.  The\n        // eigenvalues are the roots to this equation, all guaranteed to be\n        // real-valued, because the matrix is symmetric.\n        float c0 = m00() * m11() * m22() + 2.f * m01() * m02() * m12() - m00() * m12() * m12() - m11() * m02() * m02() -\n                   m22() * m01() * m01();\n        float c1 = m00() * m11() - m01() * m01() + m00() * m22() - m02() * m02() + m11() * m22() - m12() * m12();\n        float c2 = m00() + m11() + m22();\n\n        computeRoots3(c0, c1, c2, evals);\n\n        if (evals.z - evals.x <= numeric_limits<float>::epsilon()) {\n            evecs[0] = make_float3(1.f, 0.f, 0.f);\n            evecs[1] = make_float3(0.f, 1.f, 0.f);\n            evecs[2] = make_float3(0.f, 0.f, 1.f);\n        } else if (evals.y - evals.x <= numeric_limits<float>::epsilon()) {\n            // first and second equal\n            tmp[0] = row0();\n            tmp[1] = row1();\n            tmp[2] = row2();\n            tmp[0].x -= evals.z;\n            tmp[1].y -= evals.z;\n            tmp[2].z -= evals.z;\n\n            vec_tmp[0] = cross(tmp[0], tmp[1]);\n            vec_tmp[1] = cross(tmp[0], tmp[2]);\n            vec_tmp[2] = cross(tmp[1], tmp[2]);\n\n            float len1 = dot(vec_tmp[0], vec_tmp[0]);\n            float 
len2 = dot(vec_tmp[1], vec_tmp[1]);\n            float len3 = dot(vec_tmp[2], vec_tmp[2]);\n\n            if (len1 >= len2 && len1 >= len3) {\n                evecs[2] = vec_tmp[0] * rsqrtf(len1);\n            } else if (len2 >= len1 && len2 >= len3) {\n                evecs[2] = vec_tmp[1] * rsqrtf(len2);\n            } else {\n                evecs[2] = vec_tmp[2] * rsqrtf(len3);\n            }\n\n            evecs[1] = unitOrthogonal(evecs[2]);\n            evecs[0] = cross(evecs[1], evecs[2]);\n        } else if (evals.z - evals.y <= numeric_limits<float>::epsilon()) {\n            // second and third equal\n            tmp[0] = row0();\n            tmp[1] = row1();\n            tmp[2] = row2();\n            tmp[0].x -= evals.x;\n            tmp[1].y -= evals.x;\n            tmp[2].z -= evals.x;\n\n            vec_tmp[0] = cross(tmp[0], tmp[1]);\n            vec_tmp[1] = cross(tmp[0], tmp[2]);\n            vec_tmp[2] = cross(tmp[1], tmp[2]);\n\n            float len1 = dot(vec_tmp[0], vec_tmp[0]);\n            float len2 = dot(vec_tmp[1], vec_tmp[1]);\n            float len3 = dot(vec_tmp[2], vec_tmp[2]);\n\n            if (len1 >= len2 && len1 >= len3) {\n                evecs[0] = vec_tmp[0] * rsqrtf(len1);\n            } else if (len2 >= len1 && len2 >= len3) {\n                evecs[0] = vec_tmp[1] * rsqrtf(len2);\n            } else {\n                evecs[0] = vec_tmp[2] * rsqrtf(len3);\n            }\n\n            evecs[1] = unitOrthogonal(evecs[0]);\n            evecs[2] = cross(evecs[0], evecs[1]);\n        } else {\n            tmp[0] = row0();\n            tmp[1] = row1();\n            tmp[2] = row2();\n            tmp[0].x -= evals.z;\n            tmp[1].y -= evals.z;\n            tmp[2].z -= evals.z;\n\n            vec_tmp[0] = cross(tmp[0], tmp[1]);\n            vec_tmp[1] = cross(tmp[0], tmp[2]);\n            vec_tmp[2] = cross(tmp[1], tmp[2]);\n\n            float len1 = dot(vec_tmp[0], vec_tmp[0]);\n            float len2 = dot(vec_tmp[1], 
vec_tmp[1]);\n            float len3 = dot(vec_tmp[2], vec_tmp[2]);\n\n            float mmax[3];\n\n            unsigned int min_el = 2;\n            unsigned int max_el = 2;\n            if (len1 >= len2 && len1 >= len3) {\n                mmax[2]  = len1;\n                evecs[2] = vec_tmp[0] * rsqrtf(len1);\n            } else if (len2 >= len1 && len2 >= len3) {\n                mmax[2]  = len2;\n                evecs[2] = vec_tmp[1] * rsqrtf(len2);\n            } else {\n                mmax[2]  = len3;\n                evecs[2] = vec_tmp[2] * rsqrtf(len3);\n            }\n\n            tmp[0] = row0();\n            tmp[1] = row1();\n            tmp[2] = row2();\n            tmp[0].x -= evals.y;\n            tmp[1].y -= evals.y;\n            tmp[2].z -= evals.y;\n\n            vec_tmp[0] = cross(tmp[0], tmp[1]);\n            vec_tmp[1] = cross(tmp[0], tmp[2]);\n            vec_tmp[2] = cross(tmp[1], tmp[2]);\n\n            len1 = dot(vec_tmp[0], vec_tmp[0]);\n            len2 = dot(vec_tmp[1], vec_tmp[1]);\n            len3 = dot(vec_tmp[2], vec_tmp[2]);\n\n            if (len1 >= len2 && len1 >= len3) {\n                mmax[1]  = len1;\n                evecs[1] = vec_tmp[0] * rsqrtf(len1);\n                min_el   = len1 <= mmax[min_el] ? 1 : min_el;\n                max_el   = len1 > mmax[max_el] ? 1 : max_el;\n            } else if (len2 >= len1 && len2 >= len3) {\n                mmax[1]  = len2;\n                evecs[1] = vec_tmp[1] * rsqrtf(len2);\n                min_el   = len2 <= mmax[min_el] ? 1 : min_el;\n                max_el   = len2 > mmax[max_el] ? 1 : max_el;\n            } else {\n                mmax[1]  = len3;\n                evecs[1] = vec_tmp[2] * rsqrtf(len3);\n                min_el   = len3 <= mmax[min_el] ? 1 : min_el;\n                max_el   = len3 > mmax[max_el] ? 
1 : max_el;\n            }\n\n            tmp[0] = row0();\n            tmp[1] = row1();\n            tmp[2] = row2();\n            tmp[0].x -= evals.x;\n            tmp[1].y -= evals.x;\n            tmp[2].z -= evals.x;\n\n            vec_tmp[0] = cross(tmp[0], tmp[1]);\n            vec_tmp[1] = cross(tmp[0], tmp[2]);\n            vec_tmp[2] = cross(tmp[1], tmp[2]);\n\n            len1 = dot(vec_tmp[0], vec_tmp[0]);\n            len2 = dot(vec_tmp[1], vec_tmp[1]);\n            len3 = dot(vec_tmp[2], vec_tmp[2]);\n\n            if (len1 >= len2 && len1 >= len3) {\n                mmax[0]  = len1;\n                evecs[0] = vec_tmp[0] * rsqrtf(len1);\n                min_el   = len3 <= mmax[min_el] ? 0 : min_el;\n                max_el   = len3 > mmax[max_el] ? 0 : max_el;\n            } else if (len2 >= len1 && len2 >= len3) {\n                mmax[0]  = len2;\n                evecs[0] = vec_tmp[1] * rsqrtf(len2);\n                min_el   = len3 <= mmax[min_el] ? 0 : min_el;\n                max_el   = len3 > mmax[max_el] ? 0 : max_el;\n            } else {\n                mmax[0]  = len3;\n                evecs[0] = vec_tmp[2] * rsqrtf(len3);\n                min_el   = len3 <= mmax[min_el] ? 0 : min_el;\n                max_el   = len3 > mmax[max_el] ? 
0 : max_el;\n            }\n\n            unsigned mid_el = 3 - min_el - max_el;\n            evecs[min_el]   = normalized(cross(evecs[(min_el + 1) % 3], evecs[(min_el + 2) % 3]));\n            evecs[mid_el]   = normalized(cross(evecs[(mid_el + 1) % 3], evecs[(mid_el + 2) % 3]));\n        }\n        // Rescale back to the original size.\n        evals *= scale;\n    }\n\nprivate:\n    volatile float *mat_pkg;\n\n    __kf_device__ float m00() const { return mat_pkg[0]; }\n    __kf_device__ float m01() const { return mat_pkg[1]; }\n    __kf_device__ float m02() const { return mat_pkg[2]; }\n    __kf_device__ float m10() const { return mat_pkg[1]; }\n    __kf_device__ float m11() const { return mat_pkg[3]; }\n    __kf_device__ float m12() const { return mat_pkg[4]; }\n    __kf_device__ float m20() const { return mat_pkg[2]; }\n    __kf_device__ float m21() const { return mat_pkg[4]; }\n    __kf_device__ float m22() const { return mat_pkg[5]; }\n\n    __kf_device__ float3 row0() const { return make_float3(m00(), m01(), m02()); }\n    __kf_device__ float3 row1() const { return make_float3(m10(), m11(), m12()); }\n    __kf_device__ float3 row2() const { return make_float3(m20(), m21(), m22()); }\n\n    __kf_device__ static bool isMuchSmallerThan(float x, float y) {\n        // copied from <eigen>/include/Eigen/src/Core/NumTraits.h\n        const float prec_sqr = numeric_limits<float>::epsilon() * numeric_limits<float>::epsilon();\n        return x * x <= prec_sqr * y * y;\n    }\n};\n\nstruct Warp {\n    enum { LOG_WARP_SIZE = 5, WARP_SIZE = 1 << LOG_WARP_SIZE, STRIDE = WARP_SIZE };\n\n    /** \\brief Returns the warp lane ID of the calling thread. 
*/\n    static __kf_device__ unsigned int laneId() {\n        unsigned int ret;\n        asm(\"mov.u32 %0, %laneid;\" : \"=r\"(ret));\n        return ret;\n    }\n\n    static __kf_device__ unsigned int id() {\n        int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;\n        return tid >> LOG_WARP_SIZE;\n    }\n\n    static __kf_device__ int laneMaskLt() {\n#if (__CUDA_ARCH__ >= 200)\n        unsigned int ret;\n        asm(\"mov.u32 %0, %lanemask_lt;\" : \"=r\"(ret));\n        return ret;\n#else\n        return 0xFFFFFFFF >> (32 - laneId());\n#endif\n    }\n    static __kf_device__ int binaryExclScan(int ballot_mask) { return __popc(Warp::laneMaskLt() & ballot_mask); }\n};\n\nstruct Block {\n    static __kf_device__ unsigned int stride() { return blockDim.x * blockDim.y * blockDim.z; }\n\n    static __kf_device__ int flattenedThreadId() {\n        return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;\n    }\n\n    template <int CTA_SIZE, typename T, class BinOp>\n    static __kf_device__ void reduce(volatile T *buffer, BinOp op) {\n        int tid = flattenedThreadId();\n        T val   = buffer[tid];\n\n        if (CTA_SIZE >= 1024) {\n            if (tid < 512)\n                buffer[tid] = val = op(val, buffer[tid + 512]);\n            __syncthreads();\n        }\n        if (CTA_SIZE >= 512) {\n            if (tid < 256)\n                buffer[tid] = val = op(val, buffer[tid + 256]);\n            __syncthreads();\n        }\n        if (CTA_SIZE >= 256) {\n            if (tid < 128)\n                buffer[tid] = val = op(val, buffer[tid + 128]);\n            __syncthreads();\n        }\n        if (CTA_SIZE >= 128) {\n            if (tid < 64)\n                buffer[tid] = val = op(val, buffer[tid + 64]);\n            __syncthreads();\n        }\n\n        if (tid < 32) {\n            if (CTA_SIZE >= 64) {\n                buffer[tid] = val = op(val, buffer[tid + 32]);\n            
}\n            if (CTA_SIZE >= 32) {\n                buffer[tid] = val = op(val, buffer[tid + 16]);\n            }\n            if (CTA_SIZE >= 16) {\n                buffer[tid] = val = op(val, buffer[tid + 8]);\n            }\n            if (CTA_SIZE >= 8) {\n                buffer[tid] = val = op(val, buffer[tid + 4]);\n            }\n            if (CTA_SIZE >= 4) {\n                buffer[tid] = val = op(val, buffer[tid + 2]);\n            }\n            if (CTA_SIZE >= 2) {\n                buffer[tid] = val = op(val, buffer[tid + 1]);\n            }\n        }\n    }\n\n    template <int CTA_SIZE, typename T, class BinOp>\n    static __kf_device__ T reduce(volatile T *buffer, T init, BinOp op) {\n        int tid = flattenedThreadId();\n        T val = buffer[tid] = init;\n        __syncthreads();\n\n        if (CTA_SIZE >= 1024) {\n            if (tid < 512)\n                buffer[tid] = val = op(val, buffer[tid + 512]);\n            __syncthreads();\n        }\n        if (CTA_SIZE >= 512) {\n            if (tid < 256)\n                buffer[tid] = val = op(val, buffer[tid + 256]);\n            __syncthreads();\n        }\n        if (CTA_SIZE >= 256) {\n            if (tid < 128)\n                buffer[tid] = val = op(val, buffer[tid + 128]);\n            __syncthreads();\n        }\n        if (CTA_SIZE >= 128) {\n            if (tid < 64)\n                buffer[tid] = val = op(val, buffer[tid + 64]);\n            __syncthreads();\n        }\n\n        if (tid < 32) {\n#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 && 0\n            if (CTA_SIZE >= 64)\n                val = op(val, buffer[tid + 32]);\n            if (CTA_SIZE >= 32)\n                val = op(val, __shfl_xor(val, 16));\n            if (CTA_SIZE >= 16)\n                val = op(val, __shfl_xor(val, 8));\n            if (CTA_SIZE >= 8)\n                val = op(val, __shfl_xor(val, 4));\n            if (CTA_SIZE >= 4)\n                val = op(val, __shfl_xor(val, 2));\n            
if (CTA_SIZE >= 2)\n                buffer[tid] = op(val, __shfl_xor(val, 1));\n#else\n            if (CTA_SIZE >= 64) {\n                buffer[tid] = val = op(val, buffer[tid + 32]);\n            }\n            if (CTA_SIZE >= 32) {\n                buffer[tid] = val = op(val, buffer[tid + 16]);\n            }\n            if (CTA_SIZE >= 16) {\n                buffer[tid] = val = op(val, buffer[tid + 8]);\n            }\n            if (CTA_SIZE >= 8) {\n                buffer[tid] = val = op(val, buffer[tid + 4]);\n            }\n            if (CTA_SIZE >= 4) {\n                buffer[tid] = val = op(val, buffer[tid + 2]);\n            }\n            if (CTA_SIZE >= 2) {\n                buffer[tid] = val = op(val, buffer[tid + 1]);\n            }\n#endif\n        }\n        __syncthreads();\n        return buffer[0];\n    }\n};\n\nstruct Emulation {\n    static __kf_device__ int warp_reduce(volatile int *ptr, const unsigned int tid) {\n        const unsigned int lane = tid & 31;  // index of thread in warp (0..31)\n\n        if (lane < 16) {\n            int partial = ptr[tid];\n\n            ptr[tid] = partial = partial + ptr[tid + 16];\n            ptr[tid] = partial = partial + ptr[tid + 8];\n            ptr[tid] = partial = partial + ptr[tid + 4];\n            ptr[tid] = partial = partial + ptr[tid + 2];\n            ptr[tid] = partial = partial + ptr[tid + 1];\n        }\n        return ptr[tid - lane];\n    }\n\n    static __kf_device__ int Ballot(int predicate, volatile int *cta_buffer) {\n#if __CUDA_ARCH__ >= 200\n        (void) cta_buffer;\n        return __ballot_sync(FULL_MASK, predicate);\n#else\n        int tid         = Block::flattenedThreadId();\n        cta_buffer[tid] = predicate ? 
(1 << (tid & 31)) : 0;\n        return warp_reduce(cta_buffer, tid);\n#endif\n    }\n\n    static __kf_device__ bool All(int predicate, volatile int *cta_buffer) {\n#if __CUDA_ARCH__ >= 200\n        (void) cta_buffer;\n        return __all_sync(FULL_MASK, predicate);\n#else\n        int tid         = Block::flattenedThreadId();\n        cta_buffer[tid] = predicate ? 1 : 0;\n        return warp_reduce(cta_buffer, tid) == 32;\n#endif\n    }\n};\n}  // namespace device\n}  // namespace kfusion\n"
  },
  {
    "path": "include/kfusion/cuda/texture_binder.hpp",
    "content": "#pragma once\n\n#include <kfusion/cuda/device_array.hpp>\n#include <kfusion/safe_call.hpp>\n\nnamespace kfusion {\nnamespace cuda {\nclass TextureBinder {\npublic:\n    template <class T, enum cudaTextureReadMode readMode>\n    TextureBinder(const DeviceArray2D<T> &arr, const struct texture<T, 2, readMode> &tex) : texref(&tex) {\n        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\n        cudaSafeCall(cudaBindTexture2D(0, tex, arr.ptr(), desc, arr.cols(), arr.rows(), arr.step()));\n    }\n\n    template <class T, enum cudaTextureReadMode readMode>\n    TextureBinder(const DeviceArray<T> &arr, const struct texture<T, 1, readMode> &tex) : texref(&tex) {\n        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\n        cudaSafeCall(cudaBindTexture(0, tex, arr.ptr(), desc, arr.sizeBytes()));\n    }\n\n    template <class T, enum cudaTextureReadMode readMode>\n    TextureBinder(const PtrStepSz<T> &arr, const struct texture<T, 2, readMode> &tex) : texref(&tex) {\n        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\n        cudaSafeCall(cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step));\n    }\n\n    template <class A, class T, enum cudaTextureReadMode readMode>\n    TextureBinder(const A &arr, const struct texture<T, 2, readMode> &tex, const cudaChannelFormatDesc &desc)\n        : texref(&tex) {\n        cudaSafeCall(cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step));\n    }\n\n    template <class T, enum cudaTextureReadMode readMode>\n    TextureBinder(const PtrSz<T> &arr, const struct texture<T, 1, readMode> &tex) : texref(&tex) {\n        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();\n        cudaSafeCall(cudaBindTexture(0, tex, arr.data, desc, arr.size * arr.elemSize()));\n    }\n\n    ~TextureBinder() { cudaSafeCall(cudaUnbindTexture(texref)); }\n\nprivate:\n    const struct textureReference *texref;\n};\n}  // namespace cuda\n\nnamespace device {\nusing 
kfusion::cuda::TextureBinder;\n}\n}  // namespace kfusion\n"
  },
  {
    "path": "include/kfusion/cuda/tsdf_volume.hpp",
    "content": "#pragma once\n\n/* kinfu includes */\n#include <kfusion/types.hpp>\n\n/* pcl includes */\n#include <pcl/common/geometry.h>\n\n/* sobfu includes */\n#include <sobfu/params.hpp>\n\n/* sys headers */\n#include <memory>\n\nnamespace kfusion {\nnamespace cuda {\nclass KF_EXPORTS TsdfVolume {\npublic:\n    TsdfVolume(const Params &params);\n    virtual ~TsdfVolume();\n\n    void create(const Vec3i &dims);\n\n    Vec3i getDims() const;\n    Vec3f getVoxelSize() const;\n\n    const CudaData data() const;\n    CudaData data();\n\n    Vec3f getSize() const;\n    void setSize(const Vec3f &size);\n\n    float getTruncDist() const;\n    void setTruncDist(float &distance);\n\n    float getEta() const;\n    void setEta(float &eta);\n\n    float getMaxWeight() const;\n    void setMaxWeight(float &weight);\n\n    Affine3f getPose() const;\n    void setPose(const Affine3f &pose);\n\n    float getRaycastStepFactor() const;\n    void setRaycastStepFactor(float &factor);\n\n    float getGradientDeltaFactor() const;\n    void setGradientDeltaFactor(float &factor);\n\n    Vec3i getGridOrigin() const;\n    void setGridOrigin(const Vec3i &origin);\n\n    virtual void clear();\n    void swap(CudaData &data);\n\n    virtual void applyAffine(const Affine3f &affine);\n\n    virtual void integrate(const TsdfVolume &phi_n_psi);\n    virtual void integrate(const Dists &dists, const Affine3f &camera_pose, const Intr &intr);\n\n    virtual void initBox(const float3 &b);\n    virtual void initEllipsoid(const float3 &r);\n    virtual void initPlane(const float &z);\n    virtual void initSphere(const float3 &centre, const float &radius);\n    virtual void initTorus(const float2 &t);\n\n    void print_sdf_values();\n\n    struct Entry {\n        typedef unsigned short half;\n\n        half tsdf;\n        int weight;\n\n        static float half2float(half value);\n        static half float2half(float value);\n    };\n\nprivate:\n    CudaData data_;\n\n    float trunc_dist_;\n    float 
eta_;\n    float max_weight_;\n    Vec3i dims_;\n    Vec3f size_;\n    Affine3f pose_;\n\n    float gradient_delta_factor_;\n    float raycast_step_factor_;\n};\n}  // namespace cuda\n}  // namespace kfusion\n"
  },
  {
    "path": "include/kfusion/exports.hpp",
    "content": "#pragma once\n\n#if (defined WIN32 || defined _WIN32 || defined WINCE) && defined KFUSION_API_EXPORTS\n#define KF_EXPORTS __declspec(dllexport)\n#else\n#define KF_EXPORTS\n#endif\n"
  },
  {
    "path": "include/kfusion/internal.hpp",
    "content": "#pragma once\n\n/* kinfu includes */\n#include <kfusion/cuda/device_array.hpp>\n#include <kfusion/safe_call.hpp>\n\n/* pcl includes */\n#include <pcl/point_types.h>\n\n//#define USE_DEPTH\n\nstruct Mat4f {\n    float4 data[4];\n};\n\nnamespace kfusion {\nnamespace device {\ntypedef float4 Normal;\ntypedef float4 Point;\n\ntypedef unsigned short ushort;\ntypedef unsigned char uchar;\n\ntypedef PtrStepSz<float> Dists;\ntypedef DeviceArray2D<ushort> Depth;\ntypedef DeviceArray2D<Normal> Normals;\ntypedef DeviceArray2D<Point> Points;\ntypedef DeviceArray2D<uchar4> Image;\n\ntypedef float3 *Displacements;\n\ntypedef DeviceArray<float4> Vertices;\ntypedef Vertices Norms;\n\n/*\n * struct which holds a surface as triangles\n */\nstruct Surface {\n    Vertices vertices;\n    Norms normals;\n};\n\nstruct float8 {\n    float x, y, z, w, c1, c2, c3, c4;\n};\n\ntypedef float4 PointType;\n\ntypedef int3 Vec3i;\ntypedef float3 Vec3f;\nstruct Mat3f {\n    float3 data[3];\n};\nstruct Aff3f {\n    Mat3f R;\n    Vec3f t;\n};\n\nstruct TsdfVolume {\npublic:\n    float2 *const data;\n    const int3 dims;\n    const float3 voxel_size;\n    const float trunc_dist;\n    const float eta;\n    const float max_weight;\n\n    TsdfVolume(float2 *const data, int3 dims, float3 voxel_size, float trunc_dist, float eta, float max_weight);\n    // TsdfVolume(const TsdfVolume &);\n\n    TsdfVolume &operator=(const TsdfVolume &);\n\n    __kf_device__ float2 *operator()(int x, int y, int z);\n    __kf_device__ const float2 *operator()(int x, int y, int z) const;\n\n    __kf_device__ float2 *beg(int x, int y) const;\n    __kf_device__ float2 *zstep(float2 *const ptr) const;\n};\n\nstruct Projector {\n    float2 f, c;\n    Projector() {}\n    Projector(float fx, float fy, float cx, float cy);\n    __kf_device__ float2 operator()(const float3 &p) const;\n};\n\nstruct Reprojector {\n    Reprojector() {}\n    Reprojector(float fx, float fy, float cx, float cy);\n    float2 finv, c;\n    
__kf_device__ float3 operator()(int x, int y, float z) const;\n};\n\nstruct CubeIndexEstimator {\n    CubeIndexEstimator(const TsdfVolume &vol) : volume(vol) { isoValue = 0.f; }\n\n    const TsdfVolume volume;\n    float isoValue;\n\n    __kf_device__ int computeCubeIndex(int x, int y, int z, float f[8]) const;\n    __kf_device__ void readTsdf(int x, int y, int z, float &f, float &weight) const;\n};\n\nstruct OccupiedVoxels : public CubeIndexEstimator {\n    OccupiedVoxels(const TsdfVolume &vol) : CubeIndexEstimator({vol}) {}\n\n    enum {\n        CTA_SIZE_X = 32,\n        CTA_SIZE_Y = 8,\n        CTA_SIZE   = CTA_SIZE_X * CTA_SIZE_Y,\n\n        LOG_WARP_SIZE = 5,\n        WARP_SIZE     = 1 << LOG_WARP_SIZE,\n        WARPS_COUNT   = CTA_SIZE / WARP_SIZE,\n    };\n\n    mutable int *voxels_indices;\n    mutable int *vertices_number;\n    int max_size;\n\n    __kf_device__ void operator()() const;\n};\n\nstruct TrianglesGenerator : public CubeIndexEstimator {\n    TrianglesGenerator(const TsdfVolume &vol) : CubeIndexEstimator({vol}) {}\n\n    enum { CTA_SIZE = 256, MAX_GRID_SIZE_X = 65536 };\n\n    const int *occupied_voxels;\n    const int *vertex_ofssets;\n    int voxels_count;\n    float3 cell_size;\n\n    Aff3f pose;\n\n    mutable device::PointType *outputVertices;\n    mutable device::PointType *outputNormals;\n\n    __kf_device__ void operator()() const;\n\n    __kf_device__ void store_point(float4 *ptr, int index, const float3 &vertex) const;\n\n    __kf_device__ float3 get_node_coo(int x, int y, int z) const;\n    __kf_device__ float3 vertex_interp(float3 p0, float3 p1, float f0, float f1) const;\n\n};  // namespace device\n\nstruct ComputeIcpHelper {\n    struct Policy;\n    struct PageLockHelper {\n        float *data;\n        PageLockHelper();\n        ~PageLockHelper();\n    };\n\n    float min_cosine;\n    float dist2_thres;\n\n    Aff3f aff;\n\n    float rows, cols;\n    float2 f, c, finv;\n\n    PtrStep<ushort> dcurr;\n    PtrStep<Normal> ncurr;\n   
 PtrStep<Point> vcurr;\n\n    ComputeIcpHelper(float dist_thres, float angle_thres);\n    void setLevelIntr(int level_index, float fx, float fy, float cx, float cy);\n\n    void operator()(const Depth &dprev, const Normals &nprev, DeviceArray2D<float> &buffer, float *data,\n                    cudaStream_t stream);\n    void operator()(const Points &vprev, const Normals &nprev, DeviceArray2D<float> &buffer, float *data,\n                    cudaStream_t stream);\n\n    static void allocate_buffer(DeviceArray2D<float> &buffer, int partials_count = -1);\n\n    // private:\n    __kf_device__ int find_coresp(int x, int y, float3 &n, float3 &d, float3 &s) const;\n    __kf_device__ void partial_reduce(const float row[7], PtrStep<float> &partial_buffer) const;\n    __kf_device__ float2 proj(const float3 &p) const;\n    __kf_device__ float3 reproj(float x, float y, float z) const;\n};\n\n/*\n * TSDF FUNCTIONS\n */\n\nvoid clear_volume(TsdfVolume &volume);\n\nvoid integrate(TsdfVolume &phi_global, TsdfVolume &phi_n_psi);\nvoid integrate(const Dists &depth, TsdfVolume &volume, const Aff3f &aff, const Projector &proj);\n\nvoid init_box(TsdfVolume &volume, const float3 &b);\nvoid init_ellipsoid(TsdfVolume &volume, const float3 &r);\nvoid init_plane(TsdfVolume &volume, const float &z);\nvoid init_sphere(TsdfVolume &volume, const float3 &centre, const float &radius);\nvoid init_torus(TsdfVolume &volume, const float2 &t);\n\n/*\n * MARCHING CUBES FUNCTIONS\n */\n\n/** \\brief binds marching cubes tables to texture references */\nvoid bindTextures(const int *edgeBuf, const int *triBuf, const int *numVertsBuf);\n\n/** \\brief unbinds */\nvoid unbindTextures();\n\n/** \\brief scans TSDF volume and retrieves occupied voxels\n * \\param[in] volume TSDF volume\n * \\param[out] occupied_voxels buffer for occupied voxels; the function fills the first row with voxel ids and second\n * row with the no. 
of vertices\n * \\return number of voxels in the buffer\n */\nint getOccupiedVoxels(const TsdfVolume &volume, DeviceArray2D<int> &occupied_voxels);\n\n/** \\brief computes total number of vertices for all voxels and offsets of vertices in final triangle array\n * \\param[out] occupied_voxels buffer with occupied voxels; the function fills 3rd only with offsets\n * \\return total number of vertexes\n */\nint computeOffsetsAndTotalVertices(DeviceArray2D<int> &occupied_voxels);\n\n/** \\brief generates final triangle array\n * \\param[in] volume TSDF volume\n * \\param[in] occupied_voxels occupied voxel ids (1st row), number of vertices (2nd row), offsets (3rd row).\n * \\param[in] volume_size volume size in meters\n * \\param[out] output triangle array\n */\nvoid generateTriangles(const TsdfVolume &volume, const DeviceArray2D<int> &occupied_voxels, const float3 &volume_size,\n                       const Aff3f &pose, DeviceArray<PointType> &outputVertices,\n                       DeviceArray<PointType> &outputNormals);\n\n/*\n *  IMAGE PROCESSING FUNCTIONS\n */\nvoid compute_dists(const Depth &depth, Dists dists, float2 f, float2 c);\n\nvoid truncateDepth(Depth &depth, float max_dist /*meters*/);\nvoid bilateralFilter(const Depth &src, Depth &dst, int kernel_size, float sigma_spatial, float sigma_depth);\nvoid depthPyr(const Depth &source, Depth &pyramid, float sigma_depth);\n\nvoid resizeDepthNormals(const Depth &depth, const Normals &normals, Depth &depth_out, Normals &normals_out);\nvoid resizePointsNormals(const Points &points, const Normals &normals, Points &points_out, Normals &normals_out);\n\nvoid computeNormalsAndMaskDepth(const Reprojector &reproj, Depth &depth, Normals &normals);\nvoid computePointNormals(const Reprojector &reproj, const Depth &depth, Points &points, Normals &normals);\n\n/** \\brief rasterises the surface extracted via marching cubes\n *  \\param[in] proj\n *  \\param[in] surface\n *  \\param[out] vertices_out\n *  \\paraim[out] 
normals_out\n */\nvoid rasteriseSurface(const Projector &proj, const Aff3f &vol2cam, const Surface &surface, Points &vertices_out,\n                      Normals &normals_out);\n\n}  // namespace device\n}  // namespace kfusion\n"
  },
  {
    "path": "include/kfusion/kinfu.hpp",
    "content": "#pragma once\n\n/* boost includes */\n#include <boost/filesystem.hpp>\n\n/* kinfu includes */\n#include <kfusion/cuda/marching_cubes.hpp>\n#include <kfusion/cuda/projective_icp.hpp>\n#include <kfusion/cuda/tsdf_volume.hpp>\n#include <kfusion/types.hpp>\n\n/* pcl includes */\n#include <pcl/PCLPointCloud2.h>\n#include <pcl/PolygonMesh.h>\n#include <pcl/conversions.h>\n#include <pcl/point_cloud.h>\n#include <pcl/point_types.h>\n\n/* sys headers */\n#include <string>\n#include <vector>\n\nnamespace kfusion {\nnamespace cuda {\nKF_EXPORTS int getCudaEnabledDeviceCount();\nKF_EXPORTS void setDevice(int device);\nKF_EXPORTS std::string getDeviceName(int device);\nKF_EXPORTS bool checkIfPreFermiGPU(int device);\nKF_EXPORTS void printCudaDeviceInfo(int device);\nKF_EXPORTS void printShortCudaDeviceInfo(int device);\n}  // namespace cuda\n\nstruct KF_EXPORTS KinFuParams {\n    static KinFuParams default_params();\n\n    int cols;  // pixels\n    int rows;  // pixels\n\n    Intr intr;  // Camera parameters\n\n    Vec3i volume_dims;     // number of voxels\n    Vec3f volume_size;     // meters\n    Affine3f volume_pose;  // meters, inital pose\n\n    float bilateral_sigma_depth;    // meters\n    float bilateral_sigma_spatial;  // pixels\n    int bilateral_kernel_size;      // pixels\n\n    float icp_truncate_depth_dist;  // meters\n    float icp_dist_thres;           // meters\n    float icp_angle_thres;          // radians\n    std::vector<int> icp_iter_num;  // iterations for level index 0,1,..,3\n\n    float tsdf_min_camera_movement;  // meters, integrate only if exceedes\n    float tsdf_trunc_dist;           // meters;\n    float tsdf_max_weight;           // frames\n\n    float raycast_step_factor;    // in voxel sizes\n    float gradient_delta_factor;  // in voxel sizes\n\n    Vec3f light_pose;  // meters\n};\n\nclass KF_EXPORTS KinFu {\npublic:\n    typedef cv::Ptr<KinFu> Ptr;\n\n    KinFu(const KinFuParams &params);\n\n    const KinFuParams &params() 
const;\n    KinFuParams &params();\n\n    const cuda::TsdfVolume &tsdf() const;\n    cuda::TsdfVolume &tsdf();\n\n    const cuda::ProjectiveICP &icp() const;\n    cuda::ProjectiveICP &icp();\n\n    const cuda::MarchingCubes &mc() const;\n    cuda::MarchingCubes &mc();\n\n    void reset();\n\n    bool operator()(const cuda::Depth &depth, const cuda::Image &image = cuda::Image());\n\n    void renderImage(cuda::Image &image, int flag = 0);\n    void renderImage(cuda::Image &image, const Affine3f &pose, int flag = 0);\n\n    Affine3f getCameraPose(int time = -1) const;\n\nprotected:\n    void allocate_buffers();\n\n    int frame_counter_;\n    KinFuParams params_;\n\n    std::vector<Affine3f> poses_;\n    cuda::Dists dists_;\n    cuda::Frame curr_, prev_;\n\n    cuda::Cloud points_;\n    cuda::Normals normals_;\n    cuda::Depth depths_;\n\n    cv::Ptr<cuda::TsdfVolume> volume_;\n    cv::Ptr<cuda::ProjectiveICP> icp_;\n    cv::Ptr<cuda::MarchingCubes> mc_;\n};\n}  // namespace kfusion\n"
  },
  {
    "path": "include/kfusion/precomp.hpp",
    "content": "#pragma once\n\n/* cuda includes */\n#include <vector_functions.h>\n\n/* kinfu includes */\n#include <kfusion/cuda/imgproc.hpp>\n#include <kfusion/cuda/marching_cubes.hpp>\n#include <kfusion/cuda/projective_icp.hpp>\n#include <kfusion/cuda/tsdf_volume.hpp>\n#include <kfusion/internal.hpp>\n#include <kfusion/kinfu.hpp>\n#include <kfusion/types.hpp>\n\n/* sys headers */\n#include <iostream>\n\nnamespace kfusion {\ntemplate <typename D, typename S>\ninline D device_cast(const S &source) {\n    return *reinterpret_cast<const D *>(source.val);\n}\n\ntemplate <>\ninline device::Aff3f device_cast<device::Aff3f, Affine3f>(const Affine3f &source) {\n    device::Aff3f aff;\n    Mat3f R = source.rotation();\n    Vec3f t = source.translation();\n    aff.R   = device_cast<device::Mat3f>(R);\n    aff.t   = device_cast<device::Vec3f>(t);\n    return aff;\n}\n}  // namespace kfusion\n"
  },
  {
    "path": "include/kfusion/safe_call.hpp",
    "content": "#pragma once\n\n#include <cuda_runtime_api.h>\n\nnamespace kfusion {\nnamespace cuda {\nvoid error(const char *error_string, const char *file, const int line, const char *func);\n}\n}  // namespace kfusion\n\n#if defined(__GNUC__)\n#define cudaSafeCall(expr) kfusion::cuda::___cudaSafeCall(expr, __FILE__, __LINE__, __func__)\n#else /* defined(__CUDACC__) || defined(__MSVC__) */\n#define cudaSafeCall(expr) kfusion::cuda::___cudaSafeCall(expr, __FILE__, __LINE__)\n#endif\n\nnamespace kfusion {\nnamespace cuda {\nstatic inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = \"\") {\n    if (cudaSuccess != err)\n        error(cudaGetErrorString(err), file, line, func);\n}\n\nstatic inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }\n}  // namespace cuda\n\nnamespace device {\nusing kfusion::cuda::divUp;\n}\n}  // namespace kfusion\n"
  },
  {
    "path": "include/kfusion/types.hpp",
    "content": "#pragma once\n\n/* cuda includes */\n#include <vector_functions.h>\n\n/* kinfu includes */\n#include <kfusion/cuda/device_array.hpp>\n\n/* pcl includes */\n#include <pcl/point_types.h>\n\n/* opencv includes */\n#include <opencv2/core/affine.hpp>\n#include <opencv2/core/core.hpp>\n#include <opencv2/viz/vizcore.hpp>\n\n/* sys headers */\n#include <iosfwd>\n\nstruct CUevent_st;\n\nnamespace kfusion {\ntypedef cv::Matx33f Mat3f;\ntypedef cv::Vec3f Vec3f;\ntypedef cv::Vec3i Vec3i;\ntypedef cv::Affine3f Affine3f;\n\nstruct KF_EXPORTS Intr {\n    float fx, fy, cx, cy;\n\n    Intr();\n    Intr(float fx, float fy, float cx, float cy);\n    Intr operator()(int level_index) const;\n};\n\nKF_EXPORTS std::ostream &operator<<(std::ostream &os, const Intr &intr);\n\nstruct Point {\n    union {\n        float data[4];\n        struct {\n            float x, y, z;\n        };\n    };\n\n    Point() = default;\n\n    Point(float x, float y, float z) {\n        this->x = x;\n        this->y = y;\n        this->z = z;\n    }\n};\n\ntypedef Point Normal;\n\nstruct RGB {\n    union {\n        struct {\n            unsigned char b, g, r;\n        };\n        int bgra;\n    };\n};\n\nstruct PixelRGB {\n    unsigned char r, g, b;\n};\n\nnamespace cuda {\ntypedef cuda::DeviceMemory CudaData;\ntypedef cuda::CudaData Displacements;\ntypedef cuda::DeviceArray2D<unsigned short> Depth;\ntypedef cuda::DeviceArray2D<float> Dists;\ntypedef cuda::DeviceArray2D<RGB> Image;\ntypedef cuda::DeviceArray2D<Normal> Normals;\ntypedef cuda::DeviceArray2D<Point> Cloud;\n\ntypedef cuda::DeviceArray<pcl::PointXYZ> Vertices;\ntypedef cuda::DeviceArray<pcl::Normal> Norms;\n\n/*\n * struct used to hold the surface as triangles\n */\nstruct Surface {\n    Vertices vertices;\n    Norms normals;\n};\n\nstruct Frame {\n    bool use_points;\n\n    std::vector<Depth> depth_pyr;\n    std::vector<Cloud> points_pyr;\n    std::vector<Normals> normals_pyr;\n};\n}  // namespace cuda\n\ninline float 
deg2rad(float alpha) { return alpha * 0.017453293f; }\n\nstruct KF_EXPORTS ScopeTime {\n    const char *name;\n    double start;\n    ScopeTime(const char *name);\n    ~ScopeTime();\n};\n\nstruct KF_EXPORTS SampledScopeTime {\npublic:\n    enum { EACH = 34 };\n    SampledScopeTime(double &time_ms);\n    ~SampledScopeTime();\n\nprivate:\n    double getTime();\n    SampledScopeTime(const SampledScopeTime &);\n    SampledScopeTime &operator=(const SampledScopeTime &);\n\n    double &time_ms_;\n    double start;\n};\n\n}  // namespace kfusion\n"
  },
  {
    "path": "include/sobfu/cuda/utils.hpp",
    "content": "#pragma once\n\n/* cuda includes */\n#include <cuda.h>\n#include <math_constants.h>\n\n/* kinfu includes */\n#include <kfusion/internal.hpp>\n\n/* utility class used to avoid linker errors with extern unsized shared memory arrays with templated type */\ntemplate <class T>\nstruct SharedMemory {\n    __device__ inline operator T *() {\n        extern __shared__ int __smem[];\n        return (T *) __smem;\n    }\n\n    __device__ inline operator const T *() const {\n        extern __shared__ int __smem[];\n        return (T *) __smem;\n    }\n};\n\n/* get 1d index from 3d coordinates */\n__device__ __forceinline__ int get_global_idx(int x, int y, int z, int dim_x, int dim_y) {\n    return z * dim_y * dim_y + y * dim_x + x;\n}\n\n/*\n * LERP\n */\n\ntemplate <typename T>\n__host__ __device__ __forceinline__ T lerp(T v0, T v1, T t) {\n    return fma(t, v0, fma(-t, v1, v1));\n}\n\n__host__ __device__ __forceinline__ float3 lerp3f(float3 v0, float3 v1, float t) {\n    return make_float3(lerp<float>(v0.x, v1.x, t), lerp<float>(v0.y, v1.y, t), lerp<float>(v0.z, v1.z, t));\n}\n\n__host__ __device__ __forceinline__ float4 lerp4f(float4 v0, float4 v1, float t) {\n    return make_float4(lerp<float>(v0.x, v1.x, t), lerp<float>(v0.y, v1.y, t), lerp<float>(v0.z, v1.z, t), 0.f);\n}\n\n/*\n * INTERPOLATION\n */\n\ntemplate <typename Vol>\n__device__ __forceinline__ float2 interpolate_tsdf(const Vol &volume, const float3 &p_voxels) {\n    float3 cf = p_voxels;\n\n    cf.x = fminf(fmaxf(0.f, cf.x), (float) volume.dims.x - 1);\n    cf.y = fminf(fmaxf(0.f, cf.y), (float) volume.dims.y - 1);\n    cf.z = fminf(fmaxf(0.f, cf.z), (float) volume.dims.z - 1);\n\n    /* rounding to negative infinity */\n    int3 g = make_int3(__float2int_rd(cf.x), __float2int_rd(cf.y), __float2int_rd(cf.z));\n\n    int idx_x_1 = g.x + 1;\n    if (cf.x == 0.f || cf.x == (float) volume.dims.x - 1) {\n        idx_x_1--;\n    }\n    int idx_y_1 = g.y + 1;\n    if (cf.y == 0.f || cf.y == (float) 
volume.dims.y - 1) {\n        idx_y_1--;\n    }\n    int idx_z_1 = g.z + 1;\n    if (cf.z == 0.f || cf.z == (float) volume.dims.z - 1) {\n        idx_z_1--;\n    }\n\n    float a = cf.x - g.x;\n    float b = cf.y - g.y;\n    float c = cf.z - g.z;\n\n    float tsdf   = lerp(lerp(lerp((*volume(idx_x_1, idx_y_1, idx_z_1)).x, (*volume(idx_x_1, idx_y_1, g.z + 0)).x, c),\n                           lerp((*volume(idx_x_1, g.y + 0, idx_z_1)).x, (*volume(idx_x_1, g.y + 0, g.z + 0)).x, c), b),\n                      lerp(lerp((*volume(g.x + 0, idx_y_1, idx_z_1)).x, (*volume(g.x + 0, idx_y_1, g.z + 0)).x, c),\n                           lerp((*volume(g.x + 0, g.y + 0, idx_z_1)).x, (*volume(g.x + 0, g.y + 0, g.z + 0)).x, c), b),\n                      a);\n    float weight = (*volume(g.x, g.y, g.z)).y;\n\n    return make_float2(tsdf, weight);\n}\n\ntemplate <typename Field>\n__device__ __forceinline__ float4 interpolate_field(const Field &field, const float3 &p_voxels) {\n    float3 cf = p_voxels;\n\n    cf.x = fminf(fmaxf(0.f, cf.x), (float) field.dims.x - 1);\n    cf.y = fminf(fmaxf(0.f, cf.y), (float) field.dims.y - 1);\n    cf.z = fminf(fmaxf(0.f, cf.z), (float) field.dims.z - 1);\n\n    /* rounding to negative infinity */\n    int3 g = make_int3(__float2int_rd(cf.x), __float2int_rd(cf.y), __float2int_rd(cf.z));\n\n    int idx_x_1 = g.x + 1;\n    if (cf.x == 0.f || cf.x == (float) field.dims.x - 1) {\n        idx_x_1--;\n    }\n    int idx_y_1 = g.y + 1;\n    if (cf.y == 0.f || cf.y == (float) field.dims.y - 1) {\n        idx_y_1--;\n    }\n    int idx_z_1 = g.z + 1;\n    if (cf.z == 0.f || cf.z == (float) field.dims.z - 1) {\n        idx_z_1--;\n    }\n\n    float a = cf.x - g.x;\n    float b = cf.y - g.y;\n    float c = cf.z - g.z;\n\n    float4 result = lerp4f(lerp4f(lerp4f(*field(idx_x_1, idx_y_1, idx_z_1), *field(idx_x_1, idx_y_1, g.z + 0), c),\n                                  lerp4f(*field(idx_x_1, g.y + 0, idx_z_1), *field(idx_x_1, g.y + 0, g.z + 0), c), b),\n     
                      lerp4f(lerp4f(*field(g.x + 0, idx_y_1, idx_z_1), *field(g.x + 0, idx_y_1, g.z + 0), c),\n                                  lerp4f(*field(g.x + 0, g.y + 0, idx_z_1), *field(g.x + 0, g.y + 0, g.z + 0), c), b),\n                           a);\n    return result;\n}\n\ntemplate <typename Field>\n__device__ __forceinline__ float4 interpolate_field_inv(const Field &field, const float3 &p_voxels) {\n    float3 cf = p_voxels;\n\n    cf.x = fminf(fmaxf(0.f, cf.x), (float) field.dims.x - 1);\n    cf.y = fminf(fmaxf(0.f, cf.y), (float) field.dims.y - 1);\n    cf.z = fminf(fmaxf(0.f, cf.z), (float) field.dims.z - 1);\n\n    /* rounding to negative infinity */\n    int3 g = make_int3(__float2int_rd(cf.x), __float2int_rd(cf.y), __float2int_rd(cf.z));\n\n    int idx_x_1 = g.x + 1;\n    if (cf.x == 0.f || cf.x == (float) field.dims.x - 1) {\n        idx_x_1--;\n    }\n    int idx_y_1 = g.y + 1;\n    if (cf.y == 0.f || cf.y == (float) field.dims.y - 1) {\n        idx_y_1--;\n    }\n    int idx_z_1 = g.z + 1;\n    if (cf.z == 0.f || cf.z == (float) field.dims.z - 1) {\n        idx_z_1--;\n    }\n\n    float a = cf.x - g.x;\n    float b = cf.y - g.y;\n    float c = cf.z - g.z;\n\n    float4 result = lerp4f(lerp4f(lerp4f(field.get_displacement(idx_x_1, idx_y_1, idx_z_1),\n                                         field.get_displacement(idx_x_1, idx_y_1, g.z + 0), c),\n                                  lerp4f(field.get_displacement(idx_x_1, g.y + 0, idx_z_1),\n                                         field.get_displacement(idx_x_1, g.y + 0, g.z + 0), c),\n                                  b),\n                           lerp4f(lerp4f(field.get_displacement(g.x + 0, idx_y_1, idx_z_1),\n                                         field.get_displacement(g.x + 0, idx_y_1, g.z + 0), c),\n                                  lerp4f(field.get_displacement(g.x + 0, g.y + 0, idx_z_1),\n                                         field.get_displacement(g.x + 0, g.y + 0, g.z + 0), 
c),\n                                  b),\n                           a);\n    return result;\n}\n\n__device__ __forceinline__ float4 interpolate_field(const float4 *field, const float3 &p_voxels, int dim_x, int dim_y,\n                                                    int dim_z) {\n    float3 cf = p_voxels;\n\n    cf.x = fminf(fmaxf(0.f, cf.x), (float) dim_x - 1);\n    cf.y = fminf(fmaxf(0.f, cf.y), (float) dim_y - 1);\n    cf.z = fminf(fmaxf(0.f, cf.z), (float) dim_z - 1);\n\n    /* rounding to negative infinity */\n    int3 g = make_int3(__float2int_rd(cf.x), __float2int_rd(cf.y), __float2int_rd(cf.z));\n\n    int idx_x_1 = g.x + 1;\n    if (cf.x == 0.f || cf.x == (float) dim_x - 1) {\n        idx_x_1--;\n    }\n    int idx_y_1 = g.y + 1;\n    if (cf.y == 0.f || cf.y == (float) dim_y - 1) {\n        idx_y_1--;\n    }\n    int idx_z_1 = g.z + 1;\n    if (cf.z == 0.f || cf.z == (float) dim_z - 1) {\n        idx_z_1--;\n    }\n\n    float a = cf.x - g.x;\n    float b = cf.y - g.y;\n    float c = cf.z - g.z;\n\n    float4 result = lerp4f(lerp4f(lerp4f(field[get_global_idx(idx_x_1, idx_y_1, idx_z_1, dim_x, dim_y)],\n                                         field[get_global_idx(idx_x_1, idx_y_1, g.z + 0, dim_x, dim_y)], c),\n                                  lerp4f(field[get_global_idx(idx_x_1, g.y + 0, idx_z_1, dim_x, dim_y)],\n                                         field[get_global_idx(idx_x_1, g.y + 0, g.z + 0, dim_x, dim_y)], c),\n                                  b),\n                           lerp4f(lerp4f(field[get_global_idx(g.x + 0, idx_y_1, idx_z_1, dim_x, dim_y)],\n                                         field[get_global_idx(g.x + 0, idx_y_1, g.z + 0, dim_x, dim_y)], c),\n                                  lerp4f(field[get_global_idx(g.x + 0, g.y + 0, idx_z_1, dim_x, dim_y)],\n                                         field[get_global_idx(g.x + 0, g.y + 0, g.z + 0, dim_x, dim_y)], c),\n                                  b),\n                           
a);\n    return result;\n}\n\n/*\n * FLOAT2\n */\n\n__device__ __forceinline__ float norm(float2 v) {\n    return __fsqrt_rn(__fadd_rn(__fmul_rn(v.x, v.x), __fmul_rn(v.y, v.y)));\n}\n\n/*\n * FLOAT3\n */\n\n__host__ __device__ __forceinline__ float3 &operator-=(float3 &v1, const float3 &v2) {\n    v1.x -= v2.x;\n    v1.y -= v2.y;\n    v1.z -= v2.z;\n    return v1;\n}\n\n__device__ __forceinline__ float3 operator/(const float3 &v, const float &d) {\n    return make_float3(__fdividef(v.x, d), __fdividef(v.y, d), __fdividef(v.z, d));\n}\n\n__device__ __forceinline__ float n(const float3 &v) {\n    return __fsqrt_rd(__fmul_rn(v.x, v.x) + __fmul_rn(v.y, v.y) + __fmul_rn(v.z, v.z));\n}\n\n__device__ __forceinline__ float3 sq(const float3 &v) {\n    return make_float3(__fmul_rn(v.x, v.x), __fmul_rn(v.y, v.y), __fmul_rn(v.z, v.z));\n}\n\n__device__ __forceinline__ float rms_epsilon(const float3 &v) { return __fsqrt_rn(v.x + v.y + v.z + 1e-5f); }\n\n/*\n * FLOAT4\n */\n\n__device__ __forceinline__ float4 operator+(const float4 &v1, const float4 &v2) {\n    return make_float4(__fadd_rn(v1.x, v2.x), __fadd_rn(v1.y, v2.y), __fadd_rn(v1.z, v2.z), 0.f);\n}\n\n__device__ __forceinline__ float4 operator-(const float4 &v1, const float4 &v2) {\n    return make_float4(__fadd_rn(v1.x, -v2.x), __fadd_rn(v1.y, -v2.y), __fadd_rn(v1.z, -v2.z), 0.f);\n}\n\n__host__ __device__ __forceinline__ float4 &operator+=(float4 &v1, const float4 &v2) {\n    v1.x += v2.x;\n    v1.y += v2.y;\n    v1.z += v2.z;\n    return v1;\n}\n\n__host__ __device__ __forceinline__ float4 &operator-=(float4 &v1, const float4 &v2) {\n    v1.x -= v2.x;\n    v1.y -= v2.y;\n    v1.z -= v2.z;\n    return v1;\n}\n\n__device__ __forceinline__ float4 operator*(const float4 &v, const float &m) {\n    return make_float4(__fmul_rn(v.x, m), __fmul_rn(v.y, m), __fmul_rn(v.z, m), 0.f);\n}\n\n__device__ __forceinline__ float4 operator*(const float &m, const float4 &v) { return v * m; }\n\n__device__ __forceinline__ float4 
operator/(const float4 &v, const float &d) {\n    return make_float4(__fdividef(v.x, d), __fdividef(v.y, d), __fdividef(v.z, d), 0.f);\n}\n\n__device__ __forceinline__ float4 operator/(const float &d, const float4 &v) { return v / d; }\n\n__device__ __forceinline__ float norm(const float4 &v) {\n    return __fsqrt_rd(__fmul_rn(v.x, v.x) + __fmul_rn(v.y, v.y) + __fmul_rn(v.z, v.z));\n}\n\n__device__ __forceinline__ float norm_sq(const float4 &v) {\n    return __fmul_rn(v.x, v.x) + __fmul_rn(v.y, v.y) + __fmul_rn(v.z, v.z);\n}\n\n__device__ __forceinline__ float4 normalised(const float4 &v) {\n    float n = norm(v);\n    return (n > 1e-5f) ? v / n : v;\n}\n\n__host__ __device__ __forceinline__ float3 trunc(const float4 &f) { return make_float3(f.x, f.y, f.z); }\n\n__device__ __forceinline__ float4 sq(const float4 &v) {\n    return make_float4(__fmul_rn(v.x, v.x), __fmul_rn(v.y, v.y), __fmul_rn(v.z, v.z), 0.f);\n}\n\n__device__ __forceinline__ float rms_epsilon(const float4 &v) {\n    return __fsqrt_rd(__fadd_rn(__fadd_rn(v.x, __fadd_rn(v.y, v.z)), 1e-5f));\n}\n\n/*\n * MAT3F\n */\n\n__device__ __forceinline__ kfusion::device::Mat3f operator+(kfusion::device::Mat3f M1, kfusion::device::Mat3f M2) {\n    kfusion::device::Mat3f N;\n\n    N.data[0].x = __fadd_rn(M1.data[0].x, M2.data[0].x);\n    N.data[0].y = __fadd_rn(M1.data[0].y, M2.data[0].y);\n    N.data[0].z = __fadd_rn(M1.data[0].z, M2.data[0].z);\n\n    N.data[1].x = __fadd_rn(M1.data[1].x, M2.data[1].x);\n    N.data[1].y = __fadd_rn(M1.data[1].y, M2.data[1].y);\n    N.data[1].z = __fadd_rn(M1.data[1].z, M2.data[1].z);\n\n    N.data[2].x = __fadd_rn(M1.data[2].x, M2.data[2].x);\n    N.data[2].y = __fadd_rn(M1.data[2].y, M2.data[2].y);\n    N.data[2].z = __fadd_rn(M1.data[2].z, M2.data[2].z);\n\n    return N;\n}\n\n__device__ __forceinline__ kfusion::device::Mat3f operator-(kfusion::device::Mat3f M1, kfusion::device::Mat3f M2) {\n    kfusion::device::Mat3f N;\n\n    N.data[0].x = __fsub_rn(M1.data[0].x, 
M2.data[0].x);\n    N.data[0].y = __fsub_rn(M1.data[0].y, M2.data[0].y);\n    N.data[0].z = __fsub_rn(M1.data[0].z, M2.data[0].z);\n\n    N.data[1].x = __fsub_rn(M1.data[1].x, M2.data[1].x);\n    N.data[1].y = __fsub_rn(M1.data[1].y, M2.data[1].y);\n    N.data[1].z = __fsub_rn(M1.data[1].z, M2.data[1].z);\n\n    N.data[2].x = __fsub_rn(M1.data[2].x, M2.data[2].x);\n    N.data[2].y = __fsub_rn(M1.data[2].y, M2.data[2].y);\n    N.data[2].z = __fsub_rn(M1.data[2].z, M2.data[2].z);\n\n    return N;\n}\n\n__device__ __forceinline__ kfusion::device::Mat3f operator*(kfusion::device::Mat3f M, float m) {\n    kfusion::device::Mat3f N;\n\n    N.data[0].x = __fmul_rn(M.data[0].x, m);\n    N.data[0].y = __fmul_rn(M.data[0].y, m);\n    N.data[0].z = __fmul_rn(M.data[0].z, m);\n\n    N.data[1].x = __fmul_rn(M.data[1].x, m);\n    N.data[1].y = __fmul_rn(M.data[1].y, m);\n    N.data[1].z = __fmul_rn(M.data[1].z, m);\n\n    N.data[2].x = __fmul_rn(M.data[2].x, m);\n    N.data[2].y = __fmul_rn(M.data[2].y, m);\n    N.data[2].z = __fmul_rn(M.data[2].z, m);\n\n    return N;\n}\n\n__device__ __forceinline__ kfusion::device::Mat3f operator/(kfusion::device::Mat3f M, float d) { return M * 1.f / d; }\n\n/*\n * MAT4F\n */\n\n__device__ __forceinline__ float4 operator*(Mat4f M, float4 v) {\n    return make_float4(\n        __fadd_rn(__fadd_rn(__fmul_rn(M.data[0].x, v.x), __fmul_rn(M.data[0].y, v.y)), __fmul_rn(M.data[0].z, v.z)),\n        __fadd_rn(__fadd_rn(__fmul_rn(M.data[1].x, v.x), __fmul_rn(M.data[1].y, v.y)), __fmul_rn(M.data[1].z, v.z)),\n        __fadd_rn(__fadd_rn(__fmul_rn(M.data[2].x, v.x), __fmul_rn(M.data[2].y, v.y)), __fmul_rn(M.data[2].z, v.z)),\n        0.f);\n}\n\n__device__ __forceinline__ float det(Mat4f M) {\n    return __fsub_rn(__fadd_rn(__fadd_rn(__fmul_rn(__fmul_rn(M.data[0].x, M.data[1].y), M.data[2].z),\n                                         __fmul_rn(__fmul_rn(M.data[1].x, M.data[2].y), M.data[0].z)),\n                               
__fmul_rn(__fmul_rn(M.data[2].x, M.data[0].y), M.data[1].z)),\n                     __fadd_rn(__fmul_rn(__fadd_rn(__fmul_rn(M.data[0].z, M.data[1].y), M.data[2].x),\n                                         __fmul_rn(__fmul_rn(M.data[1].z, M.data[2].y), M.data[0].x)),\n                               __fmul_rn(__fmul_rn(M.data[2].z, M.data[0].y), M.data[1].x)));\n}\n\n/*\n * OTHER\n */\n\n__host__ __device__ __forceinline__ float sign(float a) {\n    if (a > 0.f) {\n        return 1.f;\n    } else if (a < 0.f) {\n        return -1.f;\n    } else {\n        return 0.f;\n    }\n}\n\n__host__ __device__ __forceinline__ bool is_truncated(float tsdf) {\n    if (fabs(tsdf) >= 1.f) {\n        return true;\n    }\n\n    return false;\n}\n\n__host__ __device__ __forceinline__ float heaviside_smooth(float phi, float epsilon) {\n    return 1.f / CUDART_PI_F * (epsilon / (epsilon * epsilon + phi * phi));\n}\n\n__host__ __device__ __forceinline__ void vec(kfusion::device::Mat3f J, float *v) {\n    v[0] = J.data[0].x;\n    v[1] = J.data[1].x;\n    v[2] = J.data[2].x;\n\n    v[3] = J.data[0].y;\n    v[4] = J.data[1].y;\n    v[5] = J.data[2].y;\n\n    v[6] = J.data[0].z;\n    v[7] = J.data[1].z;\n    v[8] = J.data[2].z;\n}\n\n__host__ __device__ __forceinline__ kfusion::device::Mat3f transpose(kfusion::device::Mat3f J) {\n    kfusion::device::Mat3f J_T;\n\n    float3 row_1  = J.data[0];\n    J_T.data[0].x = row_1.x;\n    J_T.data[1].x = row_1.y;\n    J_T.data[2].x = row_1.z;\n\n    float3 row_2  = J.data[1];\n    J_T.data[0].y = row_2.x;\n    J_T.data[1].y = row_2.y;\n    J_T.data[2].y = row_2.z;\n\n    float3 row_3  = J.data[2];\n    J_T.data[0].z = row_3.x;\n    J_T.data[1].z = row_3.y;\n    J_T.data[2].z = row_3.z;\n\n    return J_T;\n}\n"
  },
  {
    "path": "include/sobfu/params.hpp",
    "content": "#pragma once\n\n/* kinfu includes */\n#include <kfusion/types.hpp>\n\n/* sobfu parameters */\nstruct Params {\n    int cols = 640;\n    int rows = 480; /* no. of rows and columns in the frames */\n\n    cv::Vec3i volume_dims; /* volume dimensions in voxels */\n    cv::Vec3f volume_size; /* volume size in metres */\n\n    cv::Affine3f volume_pose;\n    kfusion::Intr intr; /* camera intrinsics */\n\n    float icp_truncate_depth_dist; /* depth truncation distance */\n\n    float bilateral_sigma_depth, bilateral_sigma_spatial;\n    int bilateral_kernel_size;\n\n    float tsdf_trunc_dist, eta; /* tsdf truncation distance and expected object thickness */\n    float tsdf_max_weight;      /* tsdf max. weight */\n\n    float gradient_delta_factor;\n\n    int start_frame = 0; /* frame when to start registration */\n    int verbosity   = 0; /* solver verbosity */\n\n    int s /* filter size */, max_iter /* max. no of iterations of the solver */;\n    float max_update_norm /* max. update norm */, lambda /* filter parameter */, alpha /* gradient descent step size */,\n        w_reg /* weight of the regularisation term */;\n\n    cv::Vec3f voxel_sizes() {\n        return cv::Vec3f(volume_size[0] / volume_dims[0], volume_size[1] / volume_dims[1],\n                         volume_size[2] / volume_dims[2]);\n    }\n};\n"
  },
  {
    "path": "include/sobfu/precomp.hpp",
    "content": "#pragma once\n\n/* cuda includes */\n#include <cuda.h>\n\n/* kinfu includes */\n#include <kfusion/internal.hpp>\n\n/* checks if x is a power of 2 */\nbool isPow2(unsigned int x);\n\n/* computes the nearest power of 2 larger than x */\nint nextPow2(int x);\n\n/* compute the number of threads and blocks to use for the given reduction kernel; we set threads\n * block to the minimum of maxThreads and n/2; we observe the maximum specified number of blocks, because each thread\n * in the kernel can process a variable number of elements */\nvoid get_num_blocks_and_threads(int n, int maxBlocks, int maxThreads, int &blocks, int &threads);\n"
  },
  {
    "path": "include/sobfu/reductor.hpp",
    "content": "#pragma once\n\n/* kinfu includes */\n#include <kfusion/cuda/tsdf_volume.hpp>\n\n/* sobfu incldues */\n#include <sobfu/precomp.hpp>\n#include <sobfu/vector_fields.hpp>\n\n/* thrust includes */\n#include <thrust/device_vector.h>\n#include <thrust/execution_policy.h>\n#include <thrust/extrema.h>\n#include <thrust/iterator/transform_iterator.h>\n#include <thrust/transform.h>\n\nnamespace sobfu {\nnamespace device {\n\n/*\n * REDUCTOR\n */\n\nstruct Reductor {\n    /* constructor */\n    Reductor(int3 dims_, float vsz_, float trunc_dist_);\n    /* destructor */\n    ~Reductor();\n\n    /* energy functional */\n    float data_energy(float2 *phi_global_data, float2 *phi_n_data);\n    float reg_energy_sobolev(Mat4f *J_data);\n\n    float2 max_update_norm();\n    float2 voxel_max_energy(float2 *phi_global_data, float2 *phi_n_data, Mat4f *J_data, float w_reg);\n\n    int3 dims;\n    float vsz, trunc_dist;\n    int no_voxels;\n\n    int blocks,\n        threads; /* no. of blocks and threads for the reductions used to calculate value of the energy functional */\n\n    float4 *updates;\n\n    float *h_data_out, *d_data_out;\n    float *h_reg_out, *d_reg_out;\n\n    float2 *h_max_out, *d_max_out;\n};\n\n/* kernels */\ntemplate <unsigned int blockSize, bool nIsPow2>\n__global__ void reduce_data_kernel(float2 *g_idata_global, float2 *g_idata_n, float *g_odata, unsigned int n);\ntemplate <unsigned int blockSize, bool nIsPow2>\n__global__ void reduce_reg_sobolev_kernel(Mat4f *g_idata, float *g_odata, unsigned int n);\n\ntemplate <unsigned int blockSize, bool nIsPow2>\n__global__ void reduce_voxel_max_energy_kernel(float2 *d_idata_global, float2 *d_idata_n, Mat4f *d_idata_reg,\n                                               float2 *d_o_data, float w_reg, unsigned int n);\ntemplate <unsigned int blockSize, bool nIsPow2>\n__global__ void reduce_max_kernel(float4 *updates, float2 *g_o_max_data, unsigned int n);\n\n/* host methods */\nvoid reduce_data(int size, int 
threads, int blocks, float2 *d_idata_global, float2 *d_idata_n, float *d_odata);\nvoid reduce_reg_sobolev(int size, int threads, int blocks, Mat4f *d_idata, float *d_odata);\n\nvoid reduce_voxel_max_energy(int size, int threads, int blocks, float2 *d_idata_global, float2 *d_idata_n,\n                             Mat4f *d_idata_reg, float w_reg, float2 *d_odata);\nvoid reduce_max(int size, int threads, int blocks, float4 *updates, float2 *d_o_max_data);\n\n/* final reduce on the cpu */\nfloat final_reduce(float *h_odata, float *d_odata, int numBlocks);\nfloat2 final_reduce_max(float2 *h_o_max_data, float2 *d_o_max_data, int numBlocks, int3 dims);\n\n}  // namespace device\n}  // namespace sobfu\n"
  },
  {
    "path": "include/sobfu/scalar_fields.hpp",
    "content": "#pragma once\n\n/* sobfu includes */\n#include <sobfu/params.hpp>\n#include <sobfu/precomp.hpp>\n\n/* kinfu includes */\n#include <kfusion/internal.hpp>\n#include <kfusion/precomp.hpp>\n#include <kfusion/types.hpp>\n\nnamespace sobfu {\nnamespace cuda {\n\n/*\n * SCALAR FIELD\n */\n\nclass ScalarField {\npublic:\n    /* constructor */\n    ScalarField(cv::Vec3i dims_);\n    /* destructor */\n    ~ScalarField();\n\n    /* get field data*/\n    kfusion::cuda::CudaData get_data();\n    const kfusion::cuda::CudaData get_data() const;\n\n    /* get dims */\n    int3 get_dims();\n\n    /* clear field */\n    void clear();\n    /* sum values in the field */\n    float sum();\n    /* print field */\n    void print();\n\nprivate:\n    kfusion::cuda::CudaData data; /* field data */\n    int3 dims;                    /* field dimensions */\n};\n\n}  // namespace cuda\n\nnamespace device {\n\n/*\n * SCALAR FIELD\n */\n\nstruct ScalarField {\n    /* constructor */\n    ScalarField(float *const data_, int3 dims_) : data(data_), dims(dims_) {}\n\n    __device__ __forceinline__ float *beg(int x, int y) const;\n    __device__ __forceinline__ float *zstep(float *const ptr) const;\n\n    __device__ __forceinline__ float *operator()(int idx) const;\n    __device__ __forceinline__ float *operator()(int x, int y, int z) const;\n\n    float *const data; /* field data */\n    const int3 dims;\n};\n\n/* clear */\n__global__ void clear_kernel(sobfu::device::ScalarField field);\nvoid clear(ScalarField &field);\n\n/* sum */\nfloat reduce_sum(sobfu::device::ScalarField &field);\n\ntemplate <unsigned int blockSize, bool nIsPow2>\n__global__ void reduce_sum_kernel(float *g_idata, float *g_odata, unsigned int n);\nfloat final_reduce_sum(float *d_odata, int numBlocks);\n\n}  // namespace device\n}  // namespace sobfu\n"
  },
  {
    "path": "include/sobfu/sob_fusion.hpp",
    "content": "#pragma once\n\n/* sobfu includes */\n#include <sobfu/params.hpp>\n#include <sobfu/solver.hpp>\n\n/* kinfu includes */\n#include <kfusion/internal.hpp>\n#include <kfusion/kinfu.hpp>\n#include <kfusion/precomp.hpp>\n\n/* pcl includes */\n#include <pcl/PolygonMesh.h>\n#include <pcl/point_types.h>\n\n/* sys headers */\n#include <math.h>\n#include <thread>\n\n/* */\nclass SobFusion {\npublic:\n    /* default constructor */\n    SobFusion(const Params &params);\n    /* default destructor */\n    ~SobFusion();\n\n    /* get sobfu params */\n    Params &getParams();\n\n    /* get the canonical model vertices stored as pcl triangle mesh */\n    pcl::PolygonMesh::Ptr get_phi_global_mesh();\n    /* get the canonical model mesh warped to live */\n    pcl::PolygonMesh::Ptr get_phi_global_psi_inv_mesh();\n    /* get the live model mesh */\n    pcl::PolygonMesh::Ptr get_phi_n_mesh();\n    /* get the live model mesh warped with psi */\n    pcl::PolygonMesh::Ptr get_phi_n_psi_mesh();\n\n    /* get the deformation field */\n    std::shared_ptr<sobfu::cuda::DeformationField> getDeformationField();\n\n    /* run algorithm on all frames */\n    bool operator()(const kfusion::cuda::Depth &depth, const kfusion::cuda::Image &image = kfusion::cuda::Image());\n\nprivate:\n    std::vector<cv::Affine3f> poses_;\n    kfusion::cuda::Dists dists_;\n    kfusion::cuda::Frame curr_, prev_;\n\n    kfusion::cuda::Cloud points_;\n    kfusion::cuda::Normals normals_;\n    kfusion::cuda::Depth depths_;\n\n    int frame_counter_;\n\n    /* system parameters */\n    Params params;\n\n    /* tsdf's */\n    cv::Ptr<kfusion::cuda::TsdfVolume> phi_global, phi_global_psi_inv, phi_n, phi_n_psi;\n    /* deformation field warps phi_n to phi_global */\n    std::shared_ptr<sobfu::cuda::DeformationField> psi, psi_inv;\n    /* solver */\n    std::shared_ptr<sobfu::cuda::Solver> solver;\n\n    /* marching cubes */\n    cv::Ptr<kfusion::cuda::MarchingCubes> mc;\n\n    /* run marching cubes on vol */\n  
  pcl::PolygonMesh::Ptr get_mesh(cv::Ptr<kfusion::cuda::TsdfVolume> vol);\n    /* convert the canonical model to pcl polygon mesh */\n    static pcl::PolygonMesh::Ptr convert_to_mesh(const kfusion::cuda::DeviceArray<pcl::PointXYZ> &triangles);\n};\n"
  },
  {
    "path": "include/sobfu/solver.hpp",
    "content": "#pragma once\n\n/* kinfu incldues */\n#include <kfusion/cuda/tsdf_volume.hpp>\n#include <kfusion/safe_call.hpp>\n\n/* sobfu includes */\n#include <sobfu/reductor.hpp>\n#include <sobfu/scalar_fields.hpp>\n#include <sobfu/vector_fields.hpp>\n\n/*\n * SOLVER PARAMETERS\n */\n\nstruct SolverParams {\n    int verbosity, max_iter, s;\n    float max_update_norm, lambda, alpha, w_reg;\n};\n\n/*\n * SDF'S USED IN THE SOLVER\n */\n\nstruct SDFs {\n    SDFs(kfusion::device::TsdfVolume &phi_global_, kfusion::device::TsdfVolume &phi_global_psi_inv_,\n         kfusion::device::TsdfVolume &phi_n_, kfusion::device::TsdfVolume &phi_n_psi_)\n        : phi_global(phi_global_), phi_global_psi_inv(phi_global_psi_inv_), phi_n(phi_n_), phi_n_psi(phi_n_psi_) {}\n\n    kfusion::device::TsdfVolume phi_global, phi_global_psi_inv, phi_n, phi_n_psi;\n};\n\n/*\n * SPATIAL DIFFERENTIATORS\n */\n\nstruct Differentiators {\n    Differentiators(sobfu::device::TsdfDifferentiator &tsdf_diff_, sobfu::device::Differentiator &diff_,\n                    sobfu::device::Differentiator &diff_inv_,\n                    sobfu::device::SecondOrderDifferentiator &second_order_diff_)\n\n        : tsdf_diff(tsdf_diff_), diff(diff_), diff_inv(diff_inv_), second_order_diff(second_order_diff_) {}\n\n    sobfu::device::TsdfDifferentiator tsdf_diff;\n    sobfu::device::Differentiator diff, diff_inv;\n    sobfu::device::SecondOrderDifferentiator second_order_diff;\n};\n\nnamespace sobfu {\nnamespace cuda {\n\n/*\n * SOLVER\n */\n\nclass Solver {\npublic:\n    /* constructor */\n    Solver(Params &params);\n    /* destructor */\n    ~Solver();\n\n    void estimate_psi(const cv::Ptr<kfusion::cuda::TsdfVolume> phi_global,\n                      cv::Ptr<kfusion::cuda::TsdfVolume> phi_global_psi_inv,\n                      const cv::Ptr<kfusion::cuda::TsdfVolume> phi_n, cv::Ptr<kfusion::cuda::TsdfVolume> phi_n_psi,\n                      std::shared_ptr<sobfu::cuda::DeformationField> psi,\n                  
    std::shared_ptr<sobfu::cuda::DeformationField> psi_inv);\n\nprivate:\n    /* volume params */\n    int3 dims;\n    float3 voxel_sizes;\n\n    int no_voxels;\n    float trunc_dist, eta, max_weight;\n\n    /* solver params */\n    SolverParams solver_params;\n\n    /* gradients */\n    sobfu::cuda::SpatialGradients *spatial_grads;\n    sobfu::device::SpatialGradients *spatial_grads_device;\n\n    sobfu::device::TsdfGradient *nabla_phi_n, *nabla_phi_n_o_psi;\n    sobfu::device::Jacobian *J, *J_inv;\n    sobfu::device::Laplacian *L, *L_o_psi_inv;\n    sobfu::device::PotentialGradient *nabla_U, *nabla_U_S;\n\n    /* used to calculate value of the energy functional */\n    sobfu::device::Reductor *r;\n\n    /* sobolev filter */\n    float *h_S_i, *d_S_i;\n};\n\n/* get 3d sobolev filter */\nstatic void get_3d_sobolev_filter(SolverParams &params, float *h_S_i);\n/* calculate 1d filters from a separable 3d filter */\nstatic void decompose_sobolev_filter(SolverParams &params, float *h_S_i);\n\n}  // namespace cuda\n\nnamespace device {\n\n/* potential gradient */\n__global__ void calculate_potential_gradient_kernel(float2 *phi_n_psi, float2 *phi_global, float4 *nabla_phi_n_o_psi,\n                                                    float4 *L, float4 *nabla_U, float w_reg, int dim_x, int dim_y,\n                                                    int dim_z);\nvoid calculate_potential_gradient(kfusion::device::TsdfVolume &phi_n_psi, kfusion::device::TsdfVolume &phi_global,\n                                  sobfu::device::TsdfGradient &nabla_phi_n_o_psi, sobfu::device::Laplacian &L,\n                                  sobfu::device::PotentialGradient &nabla_U, float w_reg);\n\n/* estimate psi */\nvoid estimate_psi(SDFs &sdfs, sobfu::device::DeformationField &psi, sobfu::device::DeformationField &psi_inv,\n                  sobfu::device::SpatialGradients *spatial_grads, Differentiators &diffs, float *d_S_i,\n                  sobfu::device::Reductor *r, SolverParams 
&params);\n\n/* DEFORMATION FIELD UPDATES */\n__global__ void update_psi_kernel(float4 *psi, float4 *nabla_U_S, float4 *updates, float alpha, int dim_x, int dim_y,\n                                  int dim_z);\nvoid update_psi(sobfu::device::DeformationField &psi, sobfu::device::PotentialGradient &nabla_U_S, float4 *updates,\n                float alpha);\n\n/*\n * CONVOLUTIONS\n */\n\nvoid set_convolution_kernel(float *d_kernel);\n\n__global__ void convolution_rows_kernel(float4 *d_dst, float4 *d_src, int image_w, int image_h, int image_d);\n__global__ void convolution_columns_kernel(float4 *d_dst, float4 *d_src, int image_w, int image_h, int image_d);\n__global__ void convolution_depth_kernel(float4 *d_dst, float4 *d_src, int image_w, int image_h, int image_d);\n\nvoid convolution_rows(float4 *d_dst, float4 *updates, int image_w, int image_h, int image_d);\nvoid convolution_columns(float4 *d_dst, float4 *updates, int image_w, int image_h, int image_d);\nvoid convolution_depth(float4 *d_dst, float4 *updates, int image_w, int image_h, int image_d);\n\n}  // namespace device\n}  // namespace sobfu\n"
  },
  {
    "path": "include/sobfu/vector_fields.hpp",
    "content": "#pragma once\n\n/* sobfu includes */\n#include <sobfu/params.hpp>\n#include <sobfu/precomp.hpp>\n\n/* kinfu includes */\n#include <kfusion/cuda/tsdf_volume.hpp>\n#include <kfusion/internal.hpp>\n#include <kfusion/precomp.hpp>\n#include <kfusion/types.hpp>\n\nnamespace sobfu {\nnamespace cuda {\n\n/*\n * VECTOR FIELD\n */\n\nclass VectorField {\npublic:\n    /* constructor */\n    VectorField(cv::Vec3i dims_);\n    /* destructor */\n    ~VectorField();\n\n    /* get dimensions of the field */\n    cv::Vec3i get_dims() const;\n\n    /* get the data in the field */\n    kfusion::cuda::CudaData get_data();\n    const kfusion::cuda::CudaData get_data() const;\n    /* set the data in the field */\n    void set_data(kfusion::cuda::CudaData &data);\n\n    /* clear field */\n    void clear();\n    /* print field */\n    void print();\n    /* get no. of nan's in the field */\n    int get_no_nans();\n\nprotected:\n    kfusion::cuda::CudaData data; /* field data */\n    cv::Vec3i dims;               /* field dimensions */\n};\n\n/*\n * DEFORMATION FIELD\n */\n\nclass DeformationField : public VectorField {\npublic:\n    /* constructor */\n    DeformationField(cv::Vec3i dims_);\n    /* destructor */\n    ~DeformationField();\n\n    /* init field to identity */\n    void clear();\n\n    /* apply the field to an sdf */\n    void apply(const cv::Ptr<kfusion::cuda::TsdfVolume> phi, cv::Ptr<kfusion::cuda::TsdfVolume> phi_psi);\n    /* approximate the inverse of the field */\n    void get_inverse(sobfu::cuda::DeformationField &psi_inv);\n};\n\n/*\n * TSDF GRADIENT, LAPLACIAN, POTENTIAL GRADIENT\n */\n\ntypedef VectorField TsdfGradient;\ntypedef VectorField Laplacian;\ntypedef VectorField PotentialGradient;\n\n/*\n * JACOBIAN\n */\n\nclass Jacobian {\npublic:\n    /* constructor */\n    Jacobian(cv::Vec3i dims_);\n    /* destructor */\n    ~Jacobian();\n\n    kfusion::cuda::CudaData get_data();\n    const kfusion::cuda::CudaData get_data() const;\n\n    /* clear 
jacobian */\n    void clear();\n\nprivate:\n    kfusion::cuda::CudaData data;\n    cv::Vec3i dims;\n};\n\n/*\n * SPATIAL GRADIENTS\n */\n\nstruct SpatialGradients {\n    /* constructor */\n    SpatialGradients(cv::Vec3i dims_);\n    /* destructor */\n    ~SpatialGradients();\n\n    TsdfGradient *nabla_phi_n, *nabla_phi_n_o_psi;\n    Jacobian *J, *J_inv;\n    Laplacian *L, *L_o_psi_inv;\n    PotentialGradient *nabla_U, *nabla_U_S;\n};\n\n}  // namespace cuda\n}  // namespace sobfu\n\nnamespace sobfu {\nnamespace device {\n\n/*\n * VECTOR FIELD\n */\n\nstruct VectorField {\n    /* constructor */\n    VectorField(float4 *const data_, const int3 dims_) : data(data_), dims(dims_) {}\n\n    __device__ __forceinline__ float4 *beg(int x, int y) const;\n    __device__ __forceinline__ float4 *zstep(float4 *const ptr) const;\n    __device__ __forceinline__ float4 *operator()(int x, int y, int z) const;\n\n    __device__ __forceinline__ float4 get_displacement(int x, int y, int z) const;\n\n    float4 *const data; /* vector field data */\n    const int3 dims;    /* vector field dimensions */\n};\n\n/* clear field */\n__global__ void clear_kernel(VectorField field);\nvoid clear(VectorField &field);\n\n/*\n * DEFORMATION FIELD\n */\n\ntypedef VectorField DeformationField;\n\n/* set deformation field to identity */\n__global__ void init_identity_kernel(DeformationField field);\nvoid init_identity(DeformationField &field);\n/* apply psi to an sdf */\n__global__ void apply_kernel(const kfusion::device::TsdfVolume phi, kfusion::device::TsdfVolume phi_warped,\n                             const DeformationField psi);\nvoid apply(const kfusion::device::TsdfVolume &phi, kfusion::device::TsdfVolume &phi_warped,\n           const sobfu::device::DeformationField &psi);\n/* approximate the inverse of a deformation field */\n__global__ void estimate_inverse_kernel(sobfu::device::DeformationField psi, sobfu::device::DeformationField psi_inv);\nvoid 
estimate_inverse(sobfu::device::DeformationField &psi, sobfu::device::DeformationField &psi_inv);\n\n/*\n * TSDF GRADIENT\n */\n\ntypedef VectorField TsdfGradient;\n\nstruct TsdfDifferentiator {\n    /* constructor */\n    TsdfDifferentiator(kfusion::device::TsdfVolume &vol_) : vol(vol_) {}\n\n    /* calculate the gradient */\n    __device__ void operator()(sobfu::device::TsdfGradient &grad) const;\n    void calculate(sobfu::device::TsdfGradient &grad);\n\n    kfusion::device::TsdfVolume vol;\n};\n\n/* estimate tsdf gradient */\n__global__ void estimate_gradient_kernel(const sobfu::device::TsdfDifferentiator diff,\n                                         sobfu::device::TsdfGradient grad);\n/* interpolate tsdf gradient */\n__global__ void interpolate_gradient_kernel(sobfu::device::TsdfGradient nabla_phi_n_psi,\n                                            sobfu::device::TsdfGradient nabla_phi_n_psi_t,\n                                            sobfu::device::DeformationField psi);\nvoid interpolate_gradient(sobfu::device::TsdfGradient &nabla_phi_n_psi, sobfu::device::TsdfGradient &nabla_phi_n_psi_t,\n                          sobfu::device::DeformationField &psi);\n\n/*\n * LAPLACIAN\n */\n\ntypedef VectorField Laplacian;\n\nstruct SecondOrderDifferentiator {\n    /* constructor */\n    SecondOrderDifferentiator(sobfu::device::DeformationField &psi_) : psi(psi_) {}\n\n    __device__ void laplacian(sobfu::device::Laplacian &L) const;\n    void calculate(sobfu::device::Laplacian &L);\n\n    sobfu::device::DeformationField psi;\n};\n\n/* estimate laplacian */\n__global__ void estimate_laplacian_kernel(const sobfu::device::SecondOrderDifferentiator diff,\n                                          sobfu::device::Laplacian L);\n/* interpolate laplacian */\n__global__ void interpolate_laplacian_kernel(sobfu::device::Laplacian L, sobfu::device::Laplacian L_o_psi,\n                                             sobfu::device::DeformationField psi);\nvoid 
interpolate_laplacian(sobfu::device::Laplacian &L, sobfu::device::Laplacian &L_o_psi,\n                           sobfu::device::DeformationField &psi);\n\n/*\n * POTENTIAL GRADIENT\n */\n\ntypedef VectorField PotentialGradient;\n\n/*\n * JACOBIAN\n */\n\nstruct Jacobian {\n    /* constructor */\n    Jacobian(Mat4f *const data_, int3 dims_) : data(data_), dims(dims_) {}\n\n    __device__ __forceinline__ Mat4f *beg(int x, int y) const;\n    __device__ __forceinline__ Mat4f *zstep(Mat4f *const ptr) const;\n    __device__ __forceinline__ Mat4f *operator()(int x, int y, int z) const;\n\n    Mat4f *const data; /* jacobian data */\n    const int3 dims;\n};\n\nstruct Differentiator {\n    /* constructor */\n    Differentiator(sobfu::device::DeformationField &psi_) : psi(psi_) {}\n\n    /* calculate jacobian */\n    __device__ void operator()(sobfu::device::Jacobian &J, int mode) const;\n    void calculate(sobfu::device::Jacobian &J);\n    void calculate_deformation_jacobian(sobfu::device::Jacobian &J);\n\n    sobfu::device::DeformationField psi;\n};\n\n/* clear jacobian */\n__global__ void clear_jacobian_kernel(Jacobian J);\nvoid clear(Jacobian &J);\n/* estimate jacobian */\n__global__ void estimate_jacobian_kernel(const sobfu::device::Differentiator diff, sobfu::device::Jacobian J);\n__global__ void estimate_deformation_jacobian_kernel(const sobfu::device::Differentiator diff,\n                                                     sobfu::device::Jacobian J);\n\n/*\n * SPATIAL GRADIENTS\n */\n\nstruct SpatialGradients {\n    SpatialGradients(sobfu::device::TsdfGradient *nabla_phi_n_, sobfu::device::TsdfGradient *nabla_phi_n_o_psi_,\n                     sobfu::device::Jacobian *J_, sobfu::device::Jacobian *J_inv_, sobfu::device::Laplacian *L_,\n                     sobfu::device::Laplacian *L_o_psi_inv_, sobfu::device::PotentialGradient *nabla_U_,\n                     sobfu::device::PotentialGradient *nabla_U_S_)\n        : nabla_phi_n(nabla_phi_n_),\n          
nabla_phi_n_o_psi(nabla_phi_n_o_psi_),\n          J(J_),\n          J_inv(J_inv_),\n          L(L_),\n          L_o_psi_inv(L_o_psi_inv_),\n          nabla_U(nabla_U_),\n          nabla_U_S(nabla_U_S_) {}\n    ~SpatialGradients();\n\n    sobfu::device::TsdfGradient *nabla_phi_n, *nabla_phi_n_o_psi;\n    sobfu::device::Jacobian *J, *J_inv;\n    sobfu::device::Laplacian *L, *L_o_psi_inv;\n    sobfu::device::PotentialGradient *nabla_U, *nabla_U_S;\n};\n\n}  // namespace device\n}  // namespace sobfu\n"
  },
  {
    "path": "params/params_advent.ini",
    "content": "# TSDF\nVOL_DIMS_X=64\nVOL_DIMS_Y=64\nVOL_DIMS_Z=64\n\nVOL_SIZE_X=0.5\nVOL_SIZE_Y=0.5\nVOL_SIZE_Z=0.5\n\nTSDF_TRUNC_DIST=5\nETA=2\nTSDF_MAX_WEIGHT=128\n\nGRADIENT_DELTA_FACTOR=0.5\n\n# CAMERA\nINTR_FX=570.342\nINTR_FY=570.342\nINTR_CX=320.0\nINTR_CY=240.0\n\nTRUNC_DEPTH=1.5\n\nVOL_POSE_T_Z=0.5\n\n# BILATERAL FILTER\nBILATERAL_SIGMA_DEPTH=0.005\nBILATERAL_SIGMA_SPATIAL=4.5\nBILATERAL_KERNEL_SIZE=7\n\n# SOLVER\nMAX_ITER=8192\nMAX_UPDATE_NORM=5e-4\n\nS=7\nLAMBDA=0.1\n\nALPHA=0.1\nW_REG=0.2\n"
  },
  {
    "path": "params/params_boxing.ini",
    "content": "# TSDF\nVOL_DIMS_X=128\nVOL_DIMS_Y=128\nVOL_DIMS_Z=128\n\nVOL_SIZE_X=0.75\nVOL_SIZE_Y=0.75\nVOL_SIZE_Z=0.75\n\nTSDF_TRUNC_DIST=48\nETA=3\nTSDF_MAX_WEIGHT=128\n\nGRADIENT_DELTA_FACTOR=0.5\n\n# CAMERA\nINTR_FX=570.342\nINTR_FY=570.342\nINTR_CX=320.0\nINTR_CY=240.0\n\nTRUNC_DEPTH=1.0\n\nVOL_POSE_T_Z=0.1\n\n# BILATERAL FILTER\nBILATERAL_SIGMA_DEPTH=0.005\nBILATERAL_SIGMA_SPATIAL=4.5\nBILATERAL_KERNEL_SIZE=7\n\n# SOLVER\nSTART_FRAME=1\n\nMAX_ITER=4096\nMAX_UPDATE_NORM=1e-10\n\nS=7\nLAMBDA=0.1\n\nRHO_0=1.0\nALPHA=0.001\nW_REG=0.6\n"
  },
  {
    "path": "params/params_hat.ini",
    "content": "# TSDF\nVOL_DIMS_X=128\nVOL_DIMS_Y=128\nVOL_DIMS_Z=128\n\nVOL_SIZE_X=1.20\nVOL_SIZE_Y=1.20\nVOL_SIZE_Z=1.20\n\nTSDF_TRUNC_DIST=10\nETA=2\nTSDF_MAX_WEIGHT=128\n\nGRADIENT_DELTA_FACTOR=0.5\n\n# CAMERA\nINTR_FX=517.0\nINTR_FY=517.0\nINTR_CX=320.0\nINTR_CY=240.0\n\nTRUNC_DEPTH=2.0\n\nVOL_POSE_T_Z=0.75\n\n# BILATERAL FILTER\nBILATERAL_SIGMA_DEPTH=0.02\nBILATERAL_SIGMA_SPATIAL=4.5\nBILATERAL_KERNEL_SIZE=7\n\n# SOLVER\nMAX_ITER=2048\nMAX_UPDATE_NORM=1e-3\n\nS=7\nLAMBDA=0.1\n\nALPHA=0.05\nW_REG=0.1\n"
  },
  {
    "path": "params/params_ours.ini",
    "content": "# intrinsic parameters for our intel realsense sr300\n# ---rgb---\n# kinfuParams.intr = kfusion::Intr(622.011f, 622.012f, 320.331f, 235.664f);\n# ---depth---\n# kinfuParams.intr = kfusion::Intr(474.567f, 474.567f, 310.63f, 246.018f);\n#\n# ----- DOU -----\n# --- depth ---\n# kinfuParams.intr = kfusion::Intr(520.f, 520.f, 320.f, 240.f);\n\n\n"
  },
  {
    "path": "params/params_snoopy.ini",
    "content": "# TSDF\nVOL_DIMS_X=128\nVOL_DIMS_Y=128\nVOL_DIMS_Z=128\n\nVOL_SIZE_X=0.9\nVOL_SIZE_Y=0.9\nVOL_SIZE_Z=0.9\n\nTSDF_TRUNC_DIST=10\nETA=5\nTSDF_MAX_WEIGHT=128\n\nGRADIENT_DELTA_FACTOR=0.5\n\n# CAMERA\nINTR_FX=517.0\nINTR_FY=517.0\nINTR_CX=320.0\nINTR_CY=240.0\n\nTRUNC_DEPTH=3.0\n\nVOL_POSE_T_Z=0.05\n\n# BILATERAL FILTER\nBILATERAL_SIGMA_DEPTH=0.01\nBILATERAL_SIGMA_SPATIAL=4.5\nBILATERAL_KERNEL_SIZE=7\n\n# SOLVER\nSTART_FRAME=4\n\nMAX_ITER=2048\nMAX_UPDATE_NORM=1e-3\n\nS=7\nLAMBDA=0.1\n\nALPHA=0.1\nW_REG=0.2\n"
  },
  {
    "path": "params/params_umbrella.ini",
    "content": "# TSDF\nVOL_DIMS_X=128\nVOL_DIMS_Y=128\nVOL_DIMS_Z=128\n\nVOL_SIZE_X=1.0\nVOL_SIZE_Y=1.0\nVOL_SIZE_Z=1.0\n\nTSDF_TRUNC_DIST=8\nETA=3\nTSDF_MAX_WEIGHT=128\n\nGRADIENT_DELTA_FACTOR=0.5\n\n# CAMERA\nINTR_FX=570.342\nINTR_FY=570.342\nINTR_CX=320.0\nINTR_CY=240.0\n\nTRUNC_DEPTH=1.5\n\nVOL_POSE_T_Z=0.3\n\n# BILATERAL FILTER\nBILATERAL_SIGMA_DEPTH=0.04\nBILATERAL_SIGMA_SPATIAL=4.5\nBILATERAL_KERNEL_SIZE=7\n\n# SOLVER\nSTART_FRAME=1\n\nMAX_ITER=2048\nMAX_UPDATE_NORM=1e-10\n\nS=7\nLAMBDA=0.1\n\nALPHA=0.001\nW_REG=0.2\n"
  },
  {
    "path": "setup.sh",
    "content": "#!/bin/sh\n\nRUN_CMAKE=false\nRUN_MAKE=false\nRUN_MAKE_TESTS=false\n\nusage() {\n    echo \"USAGE: source setup.sh [options]\"\n    echo \"OPTIONS:\"\n    echo \"\\t--help                 -h: Display help\"\n    echo \"\\t--cmake                -c: Run CMake\"\n    echo \"\\t--make                 -m: Run make\"\n    echo \"\\t--make-tests           -t: Make tests\"\n    echo \"\\t--all                  -a: Run all the settings\"\n}\n\n# Parse through the arguments and check if any relavant flag exists\nwhile [ \"$1\" != \"\" ]; do\n    PARAM=`echo $1 | awk -F= '{print $1}'`\n    case $PARAM in\n        -h | --help)\n            usage\n            exit\n            ;;\n        -c | --cmake)\n            RUN_CMAKE=true\n            ;;\n        -m | --make)\n            RUN_MAKE=true\n            ;;\n        -t | --make-tests)\n            RUN_MAKE_TESTS=true\n            ;;\n        -a | --all)\n            RUN_CMAKE=true\n            RUN_MAKE=true\n            ;;\n        *)\n            echo \"ERROR: unknown parameter \\\"$PARAM\\\"\"\n            usage\n            return 1\n            ;;\n    esac\n    shift\ndone\n\n# If build does not exist create one\nmkdir -p build\ncd build\n\nif $RUN_CMAKE\nthen\n    if $RUN_MAKE_TESTS\n    then\n    \techo \"running cmake...\"\n    \tcmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DBUILD_TESTS=ON .. || (cd ../ && return 1)\n    else\n\techo \"running cmake...\"\n\tcmake -DCMAKE_EXPORT_COMPILE_COMMNADS=ON -DBUILD_TESTS=OFF .. || (cd ../ && return 1)\n    fi\nfi\n\nif $RUN_MAKE\nthen\n    echo \"running make...\"\n    make -j4 || (cd ../ && return 1)\n    echo \"make complete!\"\nfi\n\ncd ..\n"
  },
  {
    "path": "src/CMakeLists.txt",
    "content": "# ---------------------------------------------------------------------------- #\n# BUILD ALL SUBPROJECTS\n# ---------------------------------------------------------------------------- #\nadd_subdirectory(kfusion)\nadd_subdirectory(sobfu)\nadd_subdirectory(apps)\n\n"
  },
  {
    "path": "src/apps/CMakeLists.txt",
    "content": "# Link source files to executables\nadd_application(app)\n\n# Link to all libraries\ntarget_link_libraries(app\n    sobfu\n    kfusion\n    png\n    Boost::program_options\n    ${Boost_LIBRARIES}\n    ${CUDA_CUDA_LIBRARY}\n    ${CUDA_CUDART_LIBRARY}\n    ${OpenCV_LIBS}\n    ${PCL_LIBRARIES}\n    ${VTK_LIBRARIES}\n)\n"
  },
  {
    "path": "src/apps/demo.cpp",
    "content": "/* sobfu includes a*/\n#include <sobfu/sob_fusion.hpp>\n\n/* boost includes */\n#include <boost/program_options.hpp>\n\n/* opencv includes */\n#include <opencv2/highgui/highgui.hpp>\n#include <opencv2/imgproc/imgproc.hpp>\n\n/* pcl includes */\n#include <pcl/io/pcd_io.h>\n#include <pcl/io/vtk_io.h>\n#include <pcl/visualization/pcl_visualizer.h>\n\n/* sys headers */\n#include <iostream>\n\n/* vtk includes */\n#include <vtkArrowSource.h>\n#include <vtkCellArray.h>\n#include <vtkGlyph2D.h>\n#include <vtkImageData.h>\n#include <vtkPointData.h>\n#include <vtkPoints.h>\n#include <vtkRenderWindow.h>\n#include <vtkSmartPointer.h>\n#include <vtkXMLImageDataWriter.h>\n\nstruct SobFuApp {\n    SobFuApp(std::string file_path, std::string params_path, bool logger, bool visualizer, bool visualizer_detailed,\n             bool verbose, bool vverbose)\n        : exit_(false),\n          file_path_(file_path),\n          params_path_(params_path),\n          logger_(logger),\n          visualizer_(visualizer),\n          visualizer_detailed_(visualizer_detailed),\n          verbose_(verbose),\n          vverbose_(vverbose) {\n        /*\n         * initialise parameters\n         */\n\n        Params params;\n\n        if (verbose_) {\n            params.verbosity = 1;\n        } else if (vverbose_) {\n            params.verbosity = 2;\n        }\n\n        /*\n         * declare parameters to read from .ini\n         */\n\n        boost::program_options::options_description desc(\"parameters\");\n        declare_parameters(desc, params);\n\n        /*\n         * read parameters from .ini\n         */\n\n        boost::program_options::variables_map vm;\n        read_parameters(desc, vm);\n\n        /*\n         * parse parameters stored in units of voxels\n         */\n\n        params.tsdf_trunc_dist = vm[\"TSDF_TRUNC_DIST\"].as<float>() * params.voxel_sizes()[0];\n        params.eta             = vm[\"ETA\"].as<float>() * params.voxel_sizes()[0];\n        
params.volume_pose     = cv::Affine3f().translate(\n            cv::Vec3f(-params.volume_size[0] / 2.f, -params.volume_size[1] / 2.f, vm[\"VOL_POSE_T_Z\"].as<float>()));\n\n        /*\n         * initialise sobolevfusion\n         */\n\n        sobfu = std::make_shared<SobFusion>(params);\n    }\n\n    /*\n     * declare sobfu parameters\n     */\n\n    void declare_parameters(boost::program_options::options_description &desc, Params &params) {\n        /*\n         * tsdf\n         */\n\n        desc.add_options()(\"VOL_DIMS_X\", boost::program_options::value<int>(&params.volume_dims[0]),\n                           \"no. of voxels along x axis\");\n        desc.add_options()(\"VOL_DIMS_Y\", boost::program_options::value<int>(&params.volume_dims[1]),\n                           \"no. of voxels along y axis\");\n        desc.add_options()(\"VOL_DIMS_Z\", boost::program_options::value<int>(&params.volume_dims[2]),\n                           \"no. of voxels along z axis\");\n\n        desc.add_options()(\"VOL_SIZE_X\", boost::program_options::value<float>(&params.volume_size[0]),\n                           \"vol. size along x axis (metres)\");\n        desc.add_options()(\"VOL_SIZE_Y\", boost::program_options::value<float>(&params.volume_size[1]),\n                           \"vol. size along y axis (metres)\");\n        desc.add_options()(\"VOL_SIZE_Z\", boost::program_options::value<float>(&params.volume_size[2]),\n                           \"vol. size along z axis (metres)\");\n\n        desc.add_options()(\"TSDF_TRUNC_DIST\", boost::program_options::value<float>(), \"truncation distance (voxels)\");\n        desc.add_options()(\"ETA\", boost::program_options::value<float>(), \"expected object thickness (voxels)\");\n        desc.add_options()(\"TSDF_MAX_WEIGHT\", boost::program_options::value<float>(&params.tsdf_max_weight),\n                           \"max. 
tsdf weight\");\n\n        desc.add_options()(\"GRADIENT_DELTA_FACTOR\", boost::program_options::value<float>(&params.gradient_delta_factor),\n                           \"delta factor when calculating tsdf gradient (voxels)\");\n\n        /*\n         * camera\n         */\n\n        desc.add_options()(\"INTR_FX\", boost::program_options::value<float>(&params.intr.fx), \"focal length x\");\n        desc.add_options()(\"INTR_FY\", boost::program_options::value<float>(&params.intr.fy), \"focal length y\");\n        desc.add_options()(\"INTR_CX\", boost::program_options::value<float>(&params.intr.cx), \"principal point x\");\n        desc.add_options()(\"INTR_CY\", boost::program_options::value<float>(&params.intr.cy), \"principal point y\");\n\n        desc.add_options()(\"TRUNC_DEPTH\", boost::program_options::value<float>(&params.icp_truncate_depth_dist),\n                           \"depth map truncation distance (metres)\");\n        desc.add_options()(\"VOL_POSE_T_Z\", boost::program_options::value<float>(),\n                           \"camera to volume translation along z axis\");\n\n        /*\n         * bilateral filter\n         */\n\n        desc.add_options()(\"BILATERAL_SIGMA_DEPTH\", boost::program_options::value<float>(&params.bilateral_sigma_depth),\n                           \"bilateral filter sigma z\");\n        desc.add_options()(\"BILATERAL_SIGMA_SPATIAL\",\n                           boost::program_options::value<float>(&params.bilateral_sigma_spatial),\n                           \"bilateral filter sigma x-y\");\n        desc.add_options()(\"BILATERAL_KERNEL_SIZE\", boost::program_options::value<int>(&params.bilateral_kernel_size),\n                           \"bilateral filter kernel size\");\n\n        /*\n         * solver\n         */\n        desc.add_options()(\"START_FRAME\", boost::program_options::value<int>(&params.start_frame),\n                           \"frame when to start registration\");\n\n        
desc.add_options()(\"MAX_ITER\", boost::program_options::value<int>(&params.max_iter),\n                           \"max. no. of iterations of the solver\");\n        desc.add_options()(\"MAX_UPDATE_NORM\", boost::program_options::value<float>(&params.max_update_norm),\n                           \"max. update norm when running the solver\");\n\n        /* SOBOLEV */\n        desc.add_options()(\"S\", boost::program_options::value<int>(&params.s),\n                           \"Sobolev kernel size (currently only supports s=7\");\n        desc.add_options()(\"LAMBDA\", boost::program_options::value<float>(&params.lambda),\n                           \"Sobolev filter parameter (currently only supports 0.05, 0.1, 0.2, and 0.4\");\n\n        /* FASTFUSION */\n        desc.add_options()(\"ALPHA\", boost::program_options::value<float>(&params.alpha), \"gradient descent step size\");\n        desc.add_options()(\"W_REG\", boost::program_options::value<float>(&params.w_reg), \"regularisation weight\");\n    }\n\n    /*\n     * read parameters from params.ini\n     */\n\n    void read_parameters(boost::program_options::options_description &desc, boost::program_options::variables_map &vm) {\n        std::ifstream settings_file(params_path_);\n\n        boost::program_options::store(boost::program_options::parse_config_file(settings_file, desc), vm);\n        boost::program_options::notify(vm);\n    }\n\n    /*\n     * load colour and depth images\n     */\n\n    void load_files(std::vector<cv::String> *depths, std::vector<cv::String> *images, std::vector<cv::String> *masks) {\n        if (!boost::filesystem::exists(file_path_)) {\n            std::cerr << \"error: directory '\" << file_path_ << \"' does not exist. 
exiting\" << std::endl;\n            exit(EXIT_FAILURE);\n        }\n\n        if (!boost::filesystem::exists(file_path_ + \"/depth\") || !boost::filesystem::exists(file_path_ + \"/color\")) {\n            std::cerr << \"error: source directory should contain 'color' and 'depth' folders. exiting...\" << std::endl;\n            exit(EXIT_FAILURE);\n        }\n\n        cv::glob(file_path_ + \"/depth\", *depths);\n        cv::glob(file_path_ + \"/color\", *images);\n\n        std::sort((*depths).begin(), (*depths).end());\n        std::sort((*images).begin(), (*images).end());\n\n        if (boost::filesystem::exists(file_path_ + \"/omask\")) {\n            cv::glob(file_path_ + \"/omask\", *masks);\n            std::sort((*masks).begin(), (*masks).end());\n        }\n    }\n\n    /*\n     * create a new directory out if not already present inside the input folder\n     */\n\n    void create_output_directory() {\n        out_path_ = file_path_ + \"/meshes\";\n        boost::filesystem::path dir_meshes(out_path_);\n\n        if (boost::filesystem::create_directory(dir_meshes)) {\n            std::cout << \"created output directory for meshes\" << std::endl;\n        }\n\n        if (visualizer_ || visualizer_detailed_) {\n            out_screenshot_path_ = file_path_ + \"/screenshots\";\n            boost::filesystem::path dir_screenshots(out_screenshot_path_);\n\n            if (boost::filesystem::create_directory(dir_screenshots)) {\n                std::cout << \"created output directory for screenshots\" << std::endl;\n            }\n        }\n    }\n\n    /*\n     * get no. of vertices in a mesh\n     */\n\n    int get_no_vertices(pcl::PolygonMesh::Ptr mesh) {\n        pcl::PointCloud<pcl::PointXYZ> pc;\n        pcl::fromPCLPointCloud2(mesh->cloud, pc);\n\n        return pc.size();\n    }\n\n    /*\n     * save output meshes in .vtk format\n     */\n\n    void save_mesh(pcl::PolygonMesh::Ptr mesh, int i, std::string name) {\n        /* pad frame no. 
with 0's */\n        std::stringstream ss;\n        ss << std::setw(6) << std::setfill('0') << i;\n        std::string frameNum = ss.str();\n\n        /* save to vtk */\n        pcl::io::saveVTKFile(out_path_ + \"/\" + name + \"_\" + frameNum + \".vtk\", *mesh);\n        std::cout << \"saved \" + name + \"_\" + frameNum + \".vtk\" << std::endl;\n    }\n\n    /*\n     * save 3d deformation field in .vtk format\n     */\n\n    void save_field(std::shared_ptr<SobFusion> sobfu, int i) {\n        /* get parameters */\n        Params params = sobfu->getParams();\n\n        /* create vtk image */\n        vtkSmartPointer<vtkImageData> image = vtkSmartPointer<vtkImageData>::New();\n        /* specify size of image */\n        image->SetDimensions(params.volume_dims[0], params.volume_dims[1], params.volume_dims[2]);\n\n        /* specify size of image data */\n        image->AllocateScalars(VTK_FLOAT, 4);\n        int *dims = image->GetDimensions();\n\n        /* copy vector field data */\n        std::shared_ptr<sobfu::cuda::DeformationField> psi = sobfu->getDeformationField();\n        kfusion::cuda::CudaData displacement_data          = psi->get_data();\n        displacement_data.download(image->GetScalarPointer());\n\n        /* pad frame no. 
with 0's */\n        std::stringstream ss;\n        ss << std::setw(6) << std::setfill('0') << i;\n        std::string frameNum = ss.str();\n        std::string fileName = out_path_ + \"/field_\" + frameNum + \".vti\";\n\n        /* save file */\n        vtkSmartPointer<vtkXMLImageDataWriter> writer = vtkSmartPointer<vtkXMLImageDataWriter>::New();\n        writer->SetFileName(fileName.c_str());\n        writer->SetInputData(image);\n        writer->Write();\n\n        std::cout << \"saved the vector field to .vti\" << std::endl;\n    }\n\n    bool execute() {\n        /* sobfu app */\n        sobfu = sobfu;\n\n        /* images */\n        cv::Mat depth, image, mask;\n        std::vector<cv::String> depths, images, masks;\n\n        load_files(&depths, &images, &masks);\n\n        /* output */\n        create_output_directory();\n\n        /* pipeline */\n        double time_ms = 0;\n        bool has_image = false;\n        bool has_masks = masks.size() > 0;\n\n        int v1(0);\n        int v2(0);\n\n        int v3(0);\n        int v4(0);\n        int v5(0);\n\n        for (size_t i = 0; i < depths.size(); ++i) {\n            depth = cv::imread(depths[i], CV_LOAD_IMAGE_ANYDEPTH);\n            image = cv::imread(images[i], CV_LOAD_IMAGE_COLOR);\n\n            cv::Mat depth_masked = cv::Mat::zeros(depth.size(), depth.type());\n            if (has_masks) {\n                mask = cv::imread(masks[i], CV_8U);\n                depth.copyTo(depth_masked, mask);\n            }\n\n            if (!image.data || !depth.data) {\n                std::cerr << \"error: image could not be read; check for improper\"\n                          << \" permissions or invalid formats. 
exiting...\" << std::endl;\n                exit(EXIT_FAILURE);\n            }\n\n            if (has_masks) {\n                depth_device_.upload(depth_masked.data, depth_masked.step, depth_masked.rows, depth_masked.cols);\n            } else {\n                depth_device_.upload(depth.data, depth.step, depth.rows, depth.cols);\n            }\n\n            {\n                kfusion::SampledScopeTime fps(time_ms);\n                has_image = (*(sobfu))(depth_device_);\n            }\n\n            /* get meshes */\n            pcl::PolygonMesh::Ptr mesh_global, mesh_global_psi_inv;\n            pcl::PolygonMesh::Ptr mesh_n, mesh_n_psi;\n\n            if (visualizer_ || visualizer_detailed_ || logger_) {\n                mesh_global               = sobfu->get_phi_global_mesh();\n                int no_vertices_canonical = get_no_vertices(mesh_global);\n                std::cout << \"no. of point-normal pairs in the canonical model: \" << no_vertices_canonical << std::endl;\n\n                if (i >= 1) {\n                    mesh_global_psi_inv                      = sobfu->get_phi_global_psi_inv_mesh();\n                    int no_vertices_canonical_warped_to_live = get_no_vertices(mesh_global_psi_inv);\n                    std::cout << \"no. 
of point-normal pairs in the canonical model warped to live: \"\n                              << no_vertices_canonical_warped_to_live << std::endl;\n                }\n\n                if (visualizer_detailed_) {\n                    if (i == 1) {\n                        mesh_n     = sobfu->get_phi_n_mesh();\n                        mesh_n_psi = sobfu->get_phi_n_psi_mesh();\n                    }\n                    if (i >= 2) {\n                        mesh_n     = sobfu->get_phi_n_mesh();\n                        mesh_n_psi = sobfu->get_phi_n_psi_mesh();\n                    }\n                }\n            }\n\n            if (logger_) {\n                save_mesh(mesh_global, i, \"canonical_mesh\");\n                if (i >= 1) {\n                    save_mesh(mesh_global_psi_inv, i, \"canonical_warped_to_live_mesh\");\n                }\n\n                // save_field(sobfu, i);\n            }\n\n            if (visualizer_ || visualizer_detailed_) {\n                if (i == 0) {\n                    viewer = std::make_shared<pcl::visualization::PCLVisualizer>(\"meshes\");\n                }\n\n                /*\n                 * BASIC VISUALISER\n                 *\n                 */\n\n                if (visualizer_) {\n                    if (i == 0) {\n                        /* create view ports */\n                        viewer->createViewPort(0.0, 0.0, 0.5, 1.0, v1);\n                        viewer->createViewPort(0.5, 0.0, 1.0, 1.0, v2);\n\n                        /* add labels */\n                        viewer->addText(\"frame no. 
\" + std::to_string(i), 20, 512, 15, 1.0, 1.0, 1.0, \"frame_num\", v1);\n\n                        viewer->addText(\"canonical model\", 15, 15, 15, 1.0, 1.0, 1.0, \"v1 text\", v1);\n                        viewer->addText(\"canonical model warped to live\", 15, 15, 15, 1.0, 1.0, 1.0, \"v2 text\", v2);\n\n                        /* add bounding box */\n                        cv::Vec3f vol_size    = sobfu->getParams().volume_size;\n                        cv::Affine3f vol_pose = sobfu->getParams().volume_pose;\n\n                        cv::Vec3f min = vol_pose * cv::Vec3f(0.f, 0.f, 0.f);\n                        cv::Vec3f max = vol_pose * vol_size;\n\n                        viewer->setCameraPosition(0.0, 0.0, max[2] + 3.0, 0.0, 0.0, 0.0);\n\n                        /* display meshes */\n                        viewer->addPolygonMesh(*mesh_global, \"mesh canonical\", v1);\n                        viewer->spinOnce(5000);\n\n                        /* save screenshot */\n                        std::stringstream ss;\n                        ss << std::setw(6) << std::setfill('0') << i;\n                        std::string frame_num = ss.str();\n\n                        viewer->saveScreenshot(out_screenshot_path_ + \"/\" + frame_num + \".png\");\n                    } else if (i >= 1) {\n                        /* update label */\n                        viewer->updateText(\"frame no. 
\" + std::to_string(i), 20, 512, 15, 1.0, 1.0, 1.0, \"frame_num\");\n\n                        /* display updated meshes */\n                        viewer->updatePolygonMesh(*mesh_global, \"mesh canonical\");\n                        if (i == 1) {\n                            viewer->addPolygonMesh(*mesh_global_psi_inv, \"mesh canonical warped to live\", v2);\n                        } else {\n                            viewer->updatePolygonMesh(*mesh_global_psi_inv, \"mesh canonical warped to live\");\n                        }\n                        viewer->spinOnce(50);\n\n                        /* save screenshot */\n                        std::stringstream ss;\n                        ss << std::setw(6) << std::setfill('0') << i;\n                        std::string frame_num = ss.str();\n\n                        viewer->saveScreenshot(out_screenshot_path_ + \"/\" + frame_num + \".png\");\n                    }\n                }\n\n                /*\n                 * DETAILED VISUALISER\n                 *\n                 */\n\n                else if (visualizer_detailed_) {\n                    if (i == 0) {\n                        /* create view ports */\n                        viewer->createViewPort(0.0, 0.5, 0.5, 1.0, v1);\n                        viewer->createViewPort(0.5, 0.5, 1.0, 1.0, v2);\n\n                        viewer->createViewPort(0.0, 0.0, 0.5, 0.5, v3);\n                        viewer->createViewPort(0.5, 0.0, 1.0, 0.5, v4);\n\n                        /* add labels */\n                        viewer->addText(\"frame no. 
\" + std::to_string(i), 20, 244, 15, 1.0, 1.0, 1.0, \"frame_num\", v1);\n\n                        viewer->addText(\"phi_n\", 15, 15, 15, 1.0, 1.0, 1.0, \"v1 text\", v1);\n                        viewer->addText(\"phi_global(psi_inv)\", 15, 15, 15, 1.0, 1.0, 1.0, \"v2 text\", v2);\n\n                        viewer->addText(\"phi_global\", 15, 15, 15, 1.0, 1.0, 1.0, \"v3 text\", v3);\n                        viewer->addText(\"phi_n(psi)\", 15, 15, 15, 1.0, 1.0, 1.0, \"v4 text\", v4);\n\n                        /* add bounding box */\n                        cv::Vec3f vol_size    = sobfu->getParams().volume_size;\n                        cv::Affine3f vol_pose = sobfu->getParams().volume_pose;\n\n                        cv::Vec3f min = vol_pose * cv::Vec3f(0.f, 0.f, 0.f);\n                        cv::Vec3f max = vol_pose * vol_size;\n\n                        viewer->setCameraPosition(0.0, 0.0, max[2] + 3.0, 0.0, 0.0, 0.0);\n\n                        /* display meshes */\n                        viewer->addPolygonMesh(*mesh_global, \"mesh canonical\", v3);\n                        viewer->spinOnce(10000);\n\n                        /* save screenshot */\n                        std::stringstream ss;\n                        ss << std::setw(6) << std::setfill('0') << i;\n                        std::string frame_num = ss.str();\n\n                        viewer->saveScreenshot(out_screenshot_path_ + \"/\" + frame_num + \".png\");\n                    }\n                    if (i >= 1) {\n                        /* update labels */\n                        viewer->updateText(\"frame no. 
\" + std::to_string(i), 20, 244, 15, 1.0, 1.0, 1.0, \"frame_num\");\n                        /* display meshes */\n                        viewer->updatePolygonMesh(*mesh_global, \"mesh canonical\");\n\n                        if (i == 1) {\n                            viewer->addPolygonMesh(*mesh_n, \"mesh n\", v1);\n                            viewer->addPolygonMesh(*mesh_global_psi_inv, \"mesh canonical warped to live\", v2);\n                            viewer->addPolygonMesh(*mesh_n_psi, \"mesh n psi\", v4);\n                        }\n                        if (i >= 2) {\n                            viewer->updatePolygonMesh(*mesh_n, \"mesh n\");\n                            viewer->updatePolygonMesh(*mesh_global_psi_inv, \"mesh canonical warped to live\");\n                            viewer->updatePolygonMesh(*mesh_n_psi, \"mesh n psi\");\n                        }\n                        viewer->spinOnce(50);\n\n                        /* save screenshot */\n                        std::stringstream ss;\n                        ss << std::setw(6) << std::setfill('0') << i;\n                        std::string frame_num = ss.str();\n\n                        viewer->saveScreenshot(out_screenshot_path_ + \"/\" + frame_num + \".png\");\n                    }\n                }\n            }\n        }\n\n        return true;\n    }\n\n    kfusion::cuda::Depth depth_device_;\n\n    std::shared_ptr<SobFusion> sobfu;\n    std::shared_ptr<pcl::visualization::PCLVisualizer> viewer;\n\n    bool exit_, logger_, visualizer_, visualizer_detailed_, off_screen_, verbose_, vverbose_;\n    std::string file_path_, params_path_, out_path_, out_screenshot_path_;\n};\n\n/*\n * parse the input flag and determine the file path and whether or not to enable visualizer\n * all flags will be matched and the last argument which does not match the flag will be\n * treated as filepath\n */\nvoid parse_flags(std::vector<std::string> args, std::string *file_path, std::string 
*params_path, bool *logger,\n                 bool *visualizer, bool *visualizer_detailed, bool *verbose, bool *vverbose) {\n    std::vector<std::string> flags = {\"-h\",           \"--help\",    \"--enable-viz\", \"--enable-viz-detailed\",\n                                      \"--enable-log\", \"--verbose\", \"--vverbose\"};\n\n    int idx = 0;\n    for (auto arg : args) {\n        if (std::find(std::begin(flags), std::end(flags), arg) != std::end(flags)) {\n            if (arg == \"-h\" || arg == \"--help\") {\n                std::cout << \"USAGE: sobfu [OPTIONS] <file path> <ini path>\" << std::endl;\n                std::cout << \"\\t--help -h:    display help\" << std::endl;\n                std::cout << \"\\t--enable-viz: enable visualizer\" << std::endl;\n                std::cout << \"\\t--enable-viz-detailed: enable visualizer with additional meshes for debugging\"\n                          << std::endl;\n                std::cout << \"\\t--enable-log: log output meshes\" << std::endl;\n                std::cout << \"\\t--verbose: low verbosity\" << std::endl;\n                std::cout << \"\\t--vverbose: high verbosity\" << std::endl;\n                std::exit(EXIT_SUCCESS);\n            }\n\n            if (arg == \"--enable-log\") {\n                *logger = true;\n            }\n            if (arg == \"--enable-viz\") {\n                *visualizer = true;\n            }\n            if (arg == \"--enable-viz-detailed\") {\n                *visualizer_detailed = true;\n            }\n            if (arg == \"--verbose\") {\n                *verbose = true;\n            }\n            if (arg == \"--vverbose\") {\n                *vverbose = true;\n            }\n        } else if (idx == 0) {\n            *file_path = arg;\n            idx++;\n        } else if (idx == 1) {\n            *params_path = arg;\n        }\n    }\n}\n\nint main(int argc, char *argv[]) {\n    int device = 0;\n\n    kfusion::cuda::setDevice(device);\n    
kfusion::cuda::printShortCudaDeviceInfo(device);\n\n    if (kfusion::cuda::checkIfPreFermiGPU(device)) {\n        std::cout << std::endl\n                  << \"kinfu does not support pre-fermi gpu architectures, and is not built for them by \"\n                     \"default; exiting...\"\n                  << std::endl;\n        return 1;\n    }\n\n    /* program requires at least one argument--the path to the directory where the source files are */\n    if (argc < 3) {\n        return std::cerr << \"error: incorrect number of arguments; please supply path to source data and .ini file; \"\n                            \"exiting...\"\n                         << std::endl,\n               -1;\n    }\n\n    std::vector<std::string> args(argv + 1, argv + argc);\n    std::string file_path, params_path;\n\n    bool logger              = false;\n    bool visualizer          = false;\n    bool visualizer_detailed = false;\n    bool off_screen          = false;\n    bool verbose             = false;\n    bool vverbose            = false;\n\n    parse_flags(args, &file_path, &params_path, &logger, &visualizer, &visualizer_detailed, &verbose, &vverbose);\n\n    /* disable the visualiser when running over SSH */\n    if (((visualizer || visualizer_detailed) && !off_screen) && getenv(\"SSH_CLIENT\")) {\n        return std::cerr << \"error: cannot run visualiser while running over ssh; please run locally or disable the\"\n                            \"visualiser. exiting...\"\n                         << std::endl,\n               -1;\n    }\n\n    SobFuApp app(file_path, params_path, logger, visualizer, visualizer_detailed, verbose, vverbose);\n\n    /* execute */\n    app.execute();\n\n    return 0;\n}\n"
  },
  {
    "path": "src/kfusion/CMakeLists.txt",
    "content": "# Build all kfusion source files into library kfusion\nadd_module_library(kfusion)\n\n# Link to kfusion library\ntarget_link_libraries(kfusion\n  ${CUDA_CUDART_LIBRARY}\n  ${PCL_LIBRARIES}\n)\n"
  },
  {
    "path": "src/kfusion/core.cpp",
    "content": "#include <kfusion/kinfu.hpp>\n#include <kfusion/safe_call.hpp>\n\n#include <cuda.h>\n#include <cstdio>\n#include <iostream>\n\nint kf::cuda::getCudaEnabledDeviceCount() {\n    int count;\n    cudaError_t error = cudaGetDeviceCount(&count);\n\n    if (error == cudaErrorInsufficientDriver)\n        return -1;\n\n    if (error == cudaErrorNoDevice)\n        return 0;\n\n    cudaSafeCall(error);\n    return count;\n}\n\nvoid kf::cuda::setDevice(int device) { cudaSafeCall(cudaSetDevice(device)); }\n\nstd::string kf::cuda::getDeviceName(int device) {\n    cudaDeviceProp prop;\n    cudaSafeCall(cudaGetDeviceProperties(&prop, device));\n\n    return prop.name;\n}\n\nbool kf::cuda::checkIfPreFermiGPU(int device) {\n    if (device < 0)\n        cudaSafeCall(cudaGetDevice(&device));\n\n    cudaDeviceProp prop;\n    cudaSafeCall(cudaGetDeviceProperties(&prop, device));\n    return prop.major < 2;  // CC == 1.x\n}\n\nnamespace {\ntemplate <class T>\ninline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) {\n    *attribute     = T();\n    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);\n    if (CUDA_SUCCESS == error)\n        return;\n\n    printf(\"Driver API error = %04d\\n\", error);\n    kfusion::cuda::error(\"driver API error\", __FILE__, __LINE__);\n}\n\ninline int convertSMVer2Cores(int major, int minor) {\n    // Defines for GPU Architecture types (using the SM version to determine the #\n    // of cores per SM\n    typedef struct {\n        int SM;  // 0xMm (hexidecimal notation), M = SM Major version, and m = SM\n                 // minor version\n        int Cores;\n    } SMtoCores;\n\n    SMtoCores gpuArchCoresPerSM[] = {{0x10, 8},   {0x11, 8},   {0x12, 8},   {0x13, 8},   {0x20, 32},  {0x21, 48},\n                                     {0x30, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128}, {0x52, 128}, {0x60, 64},\n                                     {0x61, 128}, {0x70, 64},  {0x75, 64},  
{-1, -1}};\n\n    int index = 0;\n    while (gpuArchCoresPerSM[index].SM != -1) {\n        if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor))\n            return gpuArchCoresPerSM[index].Cores;\n        index++;\n    }\n    printf(\"\\nCan't determine number of cores. Unknown SM version %d.%d!\\n\", major, minor);\n    return 0;\n}\n}  // namespace\n\nvoid kf::cuda::printCudaDeviceInfo(int device) {\n    int count  = getCudaEnabledDeviceCount();\n    bool valid = (device >= 0) && (device < count);\n\n    int beg = valid ? device : 0;\n    int end = valid ? device + 1 : count;\n\n    printf(\n        \"*** CUDA Device Query (Runtime API) version (CUDART static linking) \"\n        \"*** \\n\\n\");\n    printf(\"Device count: %d\\n\", count);\n\n    int driverVersion = 0, runtimeVersion = 0;\n    cudaSafeCall(cudaDriverGetVersion(&driverVersion));\n    cudaSafeCall(cudaRuntimeGetVersion(&runtimeVersion));\n\n    const char *computeMode[] = {\n        \"Default (multiple host threads can use ::cudaSetDevice() with device \"\n        \"simultaneously)\",\n        \"Exclusive (only one host thread in one process is able to use \"\n        \"::cudaSetDevice() with this device)\",\n        \"Prohibited (no host thread can use ::cudaSetDevice() with this device)\",\n        \"Exclusive Process (many threads in one process is able to use \"\n        \"::cudaSetDevice() with this device)\",\n        \"Unknown\",\n        NULL};\n\n    for (int dev = beg; dev < end; ++dev) {\n        cudaDeviceProp prop;\n        cudaSafeCall(cudaGetDeviceProperties(&prop, dev));\n\n        int sm_cores = convertSMVer2Cores(prop.major, prop.minor);\n\n        printf(\"\\nDevice %d: \\\"%s\\\"\\n\", dev, prop.name);\n        printf(\"  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\\n\", driverVersion / 1000,\n               driverVersion % 100, runtimeVersion / 1000, runtimeVersion % 100);\n        printf(\"  CUDA Capability Major/Minor version number:    %d.%d\\n\", 
prop.major, prop.minor);\n        printf(\n            \"  Total amount of global memory:                 %.0f MBytes (%llu \"\n            \"bytes)\\n\",\n            (float) prop.totalGlobalMem / 1048576.0f, (unsigned long long) prop.totalGlobalMem);\n        printf(\"  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\\n\", prop.multiProcessorCount, sm_cores,\n               sm_cores * prop.multiProcessorCount);\n        printf(\"  GPU Clock Speed:                               %.2f GHz\\n\", prop.clockRate * 1e-6f);\n\n#if (CUDART_VERSION >= 4000)\n        // This is not available in the CUDA Runtime API, so we make the necessary\n        // calls the driver API to support this for output\n        int memoryClock, memBusWidth, L2CacheSize;\n        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);\n        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);\n        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);\n\n        printf(\"  Memory Clock rate:                             %.2f Mhz\\n\", memoryClock * 1e-3f);\n        printf(\"  Memory Bus Width:                              %d-bit\\n\", memBusWidth);\n        if (L2CacheSize)\n            printf(\"  L2 Cache Size:                                 %d bytes\\n\", L2CacheSize);\n\n        printf(\n            \"  Max Texture Dimension Size (x,y,z)             1D=(%d), \"\n            \"2D=(%d,%d), 3D=(%d,%d,%d)\\n\",\n            prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], prop.maxTexture3D[0], prop.maxTexture3D[1],\n            prop.maxTexture3D[2]);\n        printf(\n            \"  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, \"\n            \"2D=(%d,%d) x %d\\n\",\n            prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], prop.maxTexture2DLayered[0],\n            prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);\n#endif\n        printf(\"  
Total amount of constant memory:               %u bytes\\n\", (int) prop.totalConstMem);\n        printf(\"  Total amount of shared memory per block:       %u bytes\\n\", (int) prop.sharedMemPerBlock);\n        printf(\"  Total number of registers available per block: %d\\n\", prop.regsPerBlock);\n        printf(\"  Warp size:                                     %d\\n\", prop.warpSize);\n        printf(\"  Maximum number of threads per block:           %d\\n\", prop.maxThreadsPerBlock);\n        printf(\"  Maximum sizes of each dimension of a block:    %d x %d x %d\\n\", prop.maxThreadsDim[0],\n               prop.maxThreadsDim[1], prop.maxThreadsDim[2]);\n        printf(\"  Maximum sizes of each dimension of a grid:     %d x %d x %d\\n\", prop.maxGridSize[0],\n               prop.maxGridSize[1], prop.maxGridSize[2]);\n        printf(\"  Maximum memory pitch:                          %u bytes\\n\", (int) prop.memPitch);\n        printf(\"  Texture alignment:                             %u bytes\\n\", (int) prop.textureAlignment);\n\n#if CUDART_VERSION >= 4000\n        printf(\n            \"  Concurrent copy and execution:                 %s with %d copy \"\n            \"engine(s)\\n\",\n            (prop.deviceOverlap ? \"Yes\" : \"No\"), prop.asyncEngineCount);\n#else\n        printf(\"  Concurrent copy and execution:                 %s\\n\", prop.deviceOverlap ? \"Yes\" : \"No\");\n#endif\n        printf(\"  Run time limit on kernels:                     %s\\n\", prop.kernelExecTimeoutEnabled ? \"Yes\" : \"No\");\n        printf(\"  Integrated GPU sharing Host Memory:            %s\\n\", prop.integrated ? \"Yes\" : \"No\");\n        printf(\"  Support host page-locked memory mapping:       %s\\n\", prop.canMapHostMemory ? \"Yes\" : \"No\");\n\n        printf(\"  Concurrent kernel execution:                   %s\\n\", prop.concurrentKernels ? \"Yes\" : \"No\");\n        printf(\"  Alignment requirement for Surfaces:            %s\\n\", prop.surfaceAlignment ? 
\"Yes\" : \"No\");\n        printf(\"  Device has ECC support enabled:                %s\\n\", prop.ECCEnabled ? \"Yes\" : \"No\");\n        printf(\"  Device is using TCC driver mode:               %s\\n\", prop.tccDriver ? \"Yes\" : \"No\");\n#if CUDART_VERSION >= 4000\n        printf(\"  Device supports Unified Addressing (UVA):      %s\\n\", prop.unifiedAddressing ? \"Yes\" : \"No\");\n        printf(\"  Device PCI Bus ID / PCI location ID:           %d / %d\\n\", prop.pciBusID, prop.pciDeviceID);\n#endif\n        printf(\"  Compute Mode:\\n\");\n        printf(\"      %s \\n\", computeMode[prop.computeMode]);\n    }\n\n    printf(\"\\n\");\n    printf(\"deviceQuery, CUDA Driver = CUDART\");\n    printf(\", CUDA Driver Version  = %d.%d\", driverVersion / 1000, driverVersion % 100);\n    printf(\", CUDA Runtime Version = %d.%d\", runtimeVersion / 1000, runtimeVersion % 100);\n    printf(\", NumDevs = %d\\n\\n\", count);\n    fflush(stdout);\n}\n\nvoid kf::cuda::printShortCudaDeviceInfo(int device) {\n    int count  = getCudaEnabledDeviceCount();\n    bool valid = (device >= 0) && (device < count);\n\n    int beg = valid ? device : 0;\n    int end = valid ? device + 1 : count;\n\n    int driverVersion = 0, runtimeVersion = 0;\n    cudaSafeCall(cudaDriverGetVersion(&driverVersion));\n    cudaSafeCall(cudaRuntimeGetVersion(&runtimeVersion));\n\n    for (int dev = beg; dev < end; ++dev) {\n        cudaDeviceProp prop;\n        cudaSafeCall(cudaGetDeviceProperties(&prop, dev));\n\n        const char *arch_str = prop.major < 2 ? 
\" (pre-Fermi)\" : \"\";\n        printf(\"Device %d:  \\\"%s\\\"  %.0fMb\", dev, prop.name, (float) prop.totalGlobalMem / 1048576.0f);\n        printf(\", sm_%d%d%s, %d cores\", prop.major, prop.minor, arch_str,\n               convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);\n        printf(\", Driver/Runtime ver.%d.%d/%d.%d\\n\", driverVersion / 1000, driverVersion % 100, runtimeVersion / 1000,\n               runtimeVersion % 100);\n    }\n    fflush(stdout);\n}\n\nkf::SampledScopeTime::SampledScopeTime(double &time_ms) : time_ms_(time_ms) { start = (double) cv::getTickCount(); }\nkf::SampledScopeTime::~SampledScopeTime() {\n    static int i_ = 0;\n    time_ms_ += getTime();\n    if (i_ % EACH == 0 && i_) {\n        std::cout << \"avg. frame time = \" << time_ms_ / EACH << \"ms (\" << 1000.f * EACH / time_ms_ << \"fps)\"\n                  << std::endl;\n        time_ms_ = 0.0;\n    }\n    ++i_;\n}\n\ndouble kf::SampledScopeTime::getTime() {\n    return ((double) cv::getTickCount() - start) * 1000.0 / cv::getTickFrequency();\n}\n\nkf::ScopeTime::ScopeTime(const char *name_) : name(name_) { start = (double) cv::getTickCount(); }\nkf::ScopeTime::~ScopeTime() {\n    double time_ms = ((double) cv::getTickCount() - start) * 1000.0 / cv::getTickFrequency();\n    std::cout << \"Time(\" << name << \") = \" << time_ms << \"ms\" << std::endl;\n}\n"
  },
  {
    "path": "src/kfusion/cuda/imgproc.cu",
    "content": "#include <kfusion/cuda/device.hpp>\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Depth bilateral filter\n\nnamespace kfusion {\nnamespace device {\n__global__ void bilateral_kernel(const PtrStepSz<ushort> src, PtrStep<ushort> dst, const int ksz,\n                                 const float sigma_spatial2_inv_half, const float sigma_depth2_inv_half) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x >= src.cols || y >= src.rows)\n        return;\n\n    int value = src(y, x);\n\n    int tx = min(x - ksz / 2 + ksz, src.cols - 1);\n    int ty = min(y - ksz / 2 + ksz, src.rows - 1);\n\n    float sum1 = 0;\n    float sum2 = 0;\n\n    for (int cy = max(y - ksz / 2, 0); cy < ty; ++cy) {\n        for (int cx = max(x - ksz / 2, 0); cx < tx; ++cx) {\n            int depth = src(cy, cx);\n\n            float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);\n            float color2 = (value - depth) * (value - depth);\n\n            float weight = __expf(-(space2 * sigma_spatial2_inv_half + color2 * sigma_depth2_inv_half));\n\n            sum1 += depth * weight;\n            sum2 += weight;\n        }\n    }\n    dst(y, x) = __float2int_rn(sum1 / sum2);\n}\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::bilateralFilter(const Depth& src, Depth& dst, int kernel_size, float sigma_spatial,\n                                      float sigma_depth) {\n    sigma_depth *= 1000;  // meters -> mm\n\n    dim3 block(64, 16);\n    dim3 grid(divUp(src.cols(), block.x), divUp(src.rows(), block.y));\n\n    cudaSafeCall(cudaFuncSetCacheConfig(bilateral_kernel, cudaFuncCachePreferL1));\n    bilateral_kernel<<<grid, block>>>(src, dst, kernel_size, 0.5f / (sigma_spatial * sigma_spatial),\n                                      0.5f / (sigma_depth * sigma_depth));\n    
cudaSafeCall(cudaGetLastError());\n};\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Depth truncation\n\nnamespace kfusion {\nnamespace device {\n__global__ void truncate_depth_kernel(PtrStepSz<ushort> depth, ushort max_dist /*mm*/) {\n    int x = blockIdx.x * blockDim.x + threadIdx.x;\n    int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n    if (x < depth.cols && y < depth.rows)\n        if (depth(y, x) > max_dist)\n            depth(y, x) = 0;\n}\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::truncateDepth(Depth& depth, float max_dist /*meters*/) {\n    dim3 block(64, 16);\n    dim3 grid(divUp(depth.cols(), block.x), divUp(depth.rows(), block.y));\n\n    truncate_depth_kernel<<<grid, block>>>(depth, static_cast<ushort>(max_dist * 1000.f));\n    cudaSafeCall(cudaGetLastError());\n}\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Build depth pyramid\n\nnamespace kfusion {\nnamespace device {\n__global__ void pyramid_kernel(const PtrStepSz<ushort> src, PtrStepSz<ushort> dst, float sigma_depth_mult3) {\n    int x = blockIdx.x * blockDim.x + threadIdx.x;\n    int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n    if (x >= dst.cols || y >= dst.rows)\n        return;\n\n    const int D = 5;\n    int center  = src(2 * y, 2 * x);\n\n    int tx = min(2 * x - D / 2 + D, src.cols - 1);\n    int ty = min(2 * y - D / 2 + D, src.rows - 1);\n    int cy = max(0, 2 * y - D / 2);\n\n    int sum   = 0;\n    int count = 0;\n\n    for (; cy < ty; ++cy)\n        for (int cx = max(0, 2 * x - D / 2); cx < tx; ++cx) {\n            int val = src(cy, cx);\n            if (abs(val - center) < sigma_depth_mult3) {\n                sum += val;\n                ++count;\n            }\n        }\n    dst(y, x) = (count == 0) ? 
0 : sum / count;\n}\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::depthPyr(const Depth& source, Depth& pyramid, float sigma_depth) {\n    sigma_depth *= 1000;  // meters -> mm\n\n    dim3 block(64, 16);\n    dim3 grid(divUp(pyramid.cols(), block.x), divUp(pyramid.rows(), block.y));\n\n    pyramid_kernel<<<grid, block>>>(source, pyramid, sigma_depth * 3);\n    cudaSafeCall(cudaGetLastError());\n}\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Compute normals\n\nnamespace kfusion {\nnamespace device {\n__global__ void compute_normals_kernel(const PtrStepSz<ushort> depth, const Reprojector reproj,\n                                       PtrStep<Normal> normals) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x >= depth.cols || y >= depth.rows)\n        return;\n\n    const float qnan = numeric_limits<float>::quiet_NaN();\n\n    Normal n_out = make_float4(qnan, qnan, qnan, 0.f);\n\n    if (x < depth.cols - 1 && y < depth.rows - 1) {\n        // mm -> meters\n        float z00 = depth(y, x) * 0.001f;\n        float z01 = depth(y, x + 1) * 0.001f;\n        float z10 = depth(y + 1, x) * 0.001f;\n\n        if (z00 * z01 * z10 != 0) {\n            float3 v00 = reproj(x, y, z00);\n            float3 v01 = reproj(x + 1, y, z01);\n            float3 v10 = reproj(x, y + 1, z10);\n\n            float3 n = normalized(cross(v01 - v00, v10 - v00));\n            n_out    = make_float4(-n.x, -n.y, -n.z, 0.f);\n        }\n    }\n    normals(y, x) = n_out;\n}\n\n__global__ void mask_depth_kernel(const PtrStep<Normal> normals, PtrStepSz<ushort> depth) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x < depth.cols || y < depth.rows) {\n        float4 n = normals(y, x);\n        if (isnan(n.x))\n            depth(y, x) = 0;\n    }\n}\n}  // namespace 
device\n}  // namespace kfusion\n\nvoid kfusion::device::computeNormalsAndMaskDepth(const Reprojector& reproj, Depth& depth, Normals& normals) {\n    dim3 block(64, 16);\n    dim3 grid(divUp(depth.cols(), block.x), divUp(depth.rows(), block.y));\n\n    compute_normals_kernel<<<grid, block>>>(depth, reproj, normals);\n    cudaSafeCall(cudaGetLastError());\n\n    mask_depth_kernel<<<grid, block>>>(normals, depth);\n    cudaSafeCall(cudaGetLastError());\n}\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Compute computePointNormals\n\nnamespace kfusion {\nnamespace device {\n__global__ void points_normals_kernel(const Reprojector reproj, const PtrStepSz<ushort> depth, PtrStep<Point> points,\n                                      PtrStep<Normal> normals) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x >= depth.cols || y >= depth.rows)\n        return;\n\n    const float qnan = numeric_limits<float>::quiet_NaN();\n    points(y, x) = normals(y, x) = make_float4(qnan, qnan, qnan, qnan);\n\n    if (x >= depth.cols - 1 || y >= depth.rows - 1)\n        return;\n\n    // mm -> meters\n    float z00 = depth(y, x) * 0.001f;\n    float z01 = depth(y, x + 1) * 0.001f;\n    float z10 = depth(y + 1, x) * 0.001f;\n\n    if (z00 * z01 * z10 != 0) {\n        float3 v00 = reproj(x, y, z00);\n        float3 v01 = reproj(x + 1, y, z01);\n        float3 v10 = reproj(x, y + 1, z10);\n\n        float3 n      = normalized(cross(v01 - v00, v10 - v00));\n        normals(y, x) = make_float4(-n.x, -n.y, -n.z, 0.f);\n        points(y, x)  = make_float4(v00.x, v00.y, v00.z, 0.f);\n    }\n}\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::computePointNormals(const Reprojector& reproj, const Depth& depth, Points& points,\n                                          Normals& normals) {\n    dim3 block(64, 16);\n    dim3 
grid(divUp(depth.cols(), block.x), divUp(depth.rows(), block.y));\n\n    points_normals_kernel<<<grid, block>>>(reproj, depth, points, normals);\n    cudaSafeCall(cudaGetLastError());\n}\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Compute dists\n\nnamespace kfusion {\nnamespace device {\n__global__ void compute_dists_kernel(const PtrStepSz<ushort> depth, Dists dists, float2 finv, float2 c) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x < depth.cols || y < depth.rows) {\n        float xl     = (x - c.x) * finv.x;\n        float yl     = (y - c.y) * finv.y;\n        float lambda = sqrtf(xl * xl + yl * yl + 1);\n\n        dists(y, x) = depth(y, x) * lambda * 0.001f;  // meters\n    }\n}\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::compute_dists(const Depth& depth, Dists dists, float2 f, float2 c) {\n    dim3 block(64, 16);\n    dim3 grid(divUp(depth.cols(), block.x), divUp(depth.rows(), block.y));\n\n    compute_dists_kernel<<<grid, block>>>(depth, dists, make_float2(1.f / f.x, 1.f / f.y), c);\n    cudaSafeCall(cudaGetLastError());\n}\n\nnamespace kfusion {\nnamespace device {\n__global__ void resize_depth_normals_kernel(const PtrStep<ushort> dsrc, const PtrStep<float4> nsrc,\n                                            PtrStepSz<ushort> ddst, PtrStep<float4> ndst) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x >= ddst.cols || y >= ddst.rows)\n        return;\n\n    const float qnan = numeric_limits<float>::quiet_NaN();\n\n    ushort d = 0;\n    float4 n = make_float4(qnan, qnan, qnan, qnan);\n\n    int xs = x * 2;\n    int ys = y * 2;\n\n    int d00 = dsrc(ys + 0, xs + 0);\n    int d01 = dsrc(ys + 0, xs + 1);\n    int d10 = dsrc(ys + 1, xs + 0);\n    int d11 = dsrc(ys + 1, xs + 1);\n\n    if (d00 * d01 != 0 && d10 * d11 
!= 0) {\n        d = (d00 + d01 + d10 + d11) / 4;\n\n        float4 n00 = nsrc(ys + 0, xs + 0);\n        float4 n01 = nsrc(ys + 0, xs + 1);\n        float4 n10 = nsrc(ys + 1, xs + 0);\n        float4 n11 = nsrc(ys + 1, xs + 1);\n\n        n.x = (n00.x + n01.x + n10.x + n11.x) * 0.25;\n        n.y = (n00.y + n01.y + n10.y + n11.y) * 0.25;\n        n.z = (n00.z + n01.z + n10.z + n11.z) * 0.25;\n    }\n    ddst(y, x) = d;\n    ndst(y, x) = n;\n}\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::resizeDepthNormals(const Depth& depth, const Normals& normals, Depth& depth_out,\n                                         Normals& normals_out) {\n    int in_cols = depth.cols();\n    int in_rows = depth.rows();\n\n    int out_cols = in_cols / 2;\n    int out_rows = in_rows / 2;\n\n    dim3 block(64, 16);\n    dim3 grid(divUp(out_cols, block.x), divUp(out_rows, block.y));\n\n    resize_depth_normals_kernel<<<grid, block>>>(depth, normals, depth_out, normals_out);\n    cudaSafeCall(cudaGetLastError());\n}\n\nnamespace kfusion {\nnamespace device {\n__global__ void resize_points_normals_kernel(const PtrStep<Point> vsrc, const PtrStep<Normal> nsrc,\n                                             PtrStepSz<Point> vdst, PtrStep<Normal> ndst) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x >= vdst.cols || y >= vdst.rows)\n        return;\n\n    const float qnan = numeric_limits<float>::quiet_NaN();\n    vdst(y, x) = ndst(y, x) = make_float4(qnan, qnan, qnan, 0.f);\n\n    int xs = x * 2;\n    int ys = y * 2;\n\n    float3 d00 = tr(vsrc(ys + 0, xs + 0));\n    float3 d01 = tr(vsrc(ys + 0, xs + 1));\n    float3 d10 = tr(vsrc(ys + 1, xs + 0));\n    float3 d11 = tr(vsrc(ys + 1, xs + 1));\n\n    if (!isnan(d00.x * d01.x * d10.x * d11.x)) {\n        float3 d   = (d00 + d01 + d10 + d11) * 0.25f;\n        vdst(y, x) = make_float4(d.x, d.y, d.z, 0.f);\n\n        float3 n00 = tr(nsrc(ys + 0, xs + 0));\n 
       float3 n01 = tr(nsrc(ys + 0, xs + 1));\n        float3 n10 = tr(nsrc(ys + 1, xs + 0));\n        float3 n11 = tr(nsrc(ys + 1, xs + 1));\n\n        float3 n   = (n00 + n01 + n10 + n11) * 0.25f;\n        ndst(y, x) = make_float4(n.x, n.y, n.z, 0.f);\n    }\n}\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::resizePointsNormals(const Points& points, const Normals& normals, Points& points_out,\n                                          Normals& normals_out) {\n    int out_cols = points.cols() / 2;\n    int out_rows = points.rows() / 2;\n\n    dim3 block(64, 16);\n    dim3 grid(divUp(out_cols, block.x), divUp(out_rows, block.y));\n\n    resize_points_normals_kernel<<<grid, block>>>(points, normals, points_out, normals_out);\n    cudaSafeCall(cudaGetLastError());\n}\n\nnamespace kfusion {\nnamespace device {\n/* calculate for the projected triangle the bounding box in the image domain */\n__host__ __device__ void get_bounding_box(float2 v1, float2 v2, float2 v3, int2& min, int2& max) {\n    min.x = static_cast<int>(fmin(v1.x, fmin(v2.x, v3.x)));\n    min.y = static_cast<int>(fmin(v1.y, fmin(v2.y, v3.y)));\n\n    max.x = static_cast<int>(fmax(v1.x, fmax(v2.x, v3.x)));\n    max.y = static_cast<int>(fmax(v1.y, fmax(v2.y, v3.y)));\n}\n\n__host__ __device__ __forceinline__ float edge_function(const float2& a, const float2& b, const float2& c) {\n    return (c.x - a.x) * (b.y - a.y) - (c.y - a.y) * (b.x - a.x);\n}\n\n/* rasterise surface triangles */\n__global__ void rasterise_surface_kernel(const Projector proj, const Aff3f vol2cam, const PtrSz<Point> vsrc,\n                                         const PtrSz<Normal> nsrc, PtrStepSz<Point> points_out,\n                                         PtrStep<Normal> normals_out) {\n    int x = (threadIdx.x + blockIdx.x * blockDim.x) * 3;\n\n    if ((x + 2) >= vsrc.size)\n        return;\n\n    /* get vertices and normals */\n    float3 v1 = vol2cam * tr(vsrc[x]);\n    float3 v2 = vol2cam * tr(vsrc[x + 
1]);\n    float3 v3 = vol2cam * tr(vsrc[x + 2]);\n\n    /* project vertices onto the image plane */\n    float2 coos1 = proj(v1);\n    float2 coos2 = proj(v2);\n    float2 coos3 = proj(v3);\n\n    /* get 2-d triangle bounding box */\n    int2 min;\n    int2 max;\n    get_bounding_box(coos1, coos2, coos3, min, max);\n\n    /* check for validity of coordinates */\n    if (min.x < 0 || min.y < 0 || max.x >= points_out.cols || max.y >= points_out.rows)\n        return;\n\n    /* used for smooth interpoation */\n    float area = edge_function(coos1, coos2, coos3);\n\n    /* shade pixels */\n    for (int i = min.x; i < max.x; i++) {\n        for (int j = min.y; j < max.y; j++) {\n            /* coordinates of the centre of the pixel */\n            float2 p = make_float2(i + 0.5f, j + 0.5f);\n\n            float w0 = edge_function(coos2, coos3, p) / area;\n            float w1 = edge_function(coos3, coos1, p) / area;\n            float w2 = edge_function(coos1, coos2, p) / area;\n\n            float z = w0 * v1.z + w1 * v2.z + w2 * v3.z;\n\n            if (z < points_out(j, i).z || fabs(points_out(j, i).z) < 1e-7f) {\n                points_out(j, i) =\n                    make_float4(w0 * v1.x + w1 * v2.x + w2 * v3.x, w0 * v1.y + w1 * v2.y + w2 * v3.y, z, 0.f);\n            }\n        }\n    }\n\n    for (int i = min.x; i < max.x; i++) {\n        for (int j = min.y; j < max.y; j++) {\n            float3 v0 = tr(points_out(j, i));\n            float3 v1 = tr(points_out(j + 1, i));\n            float3 v2 = tr(points_out(j, i + 1));\n\n            float3 n          = normalized(cross(v1 - v0, v2 - v0));\n            normals_out(j, i) = make_float4(n.x, n.y, n.z, 1.f);\n        }\n    }\n}\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::rasteriseSurface(const Projector& proj, const Aff3f& vol2cam, const Surface& surface,\n                                       Points& points_out, Normals& normals_out) {\n    dim3 block(256);\n    dim3 
grid(divUp(surface.vertices.size() / 3, block.x));\n\n    rasterise_surface_kernel<<<grid, block>>>(proj, vol2cam, surface.vertices, surface.normals, points_out,\n                                              normals_out);\n    cudaSafeCall(cudaGetLastError());\n}\n"
  },
  {
    "path": "src/kfusion/cuda/marching_cubes.cu",
    "content": "#include <kfusion/cuda/device.hpp>\n\n#include <thrust/device_ptr.h>\n#include <thrust/scan.h>\n\n#define FULL_MASK 0xffffffff\n\nnamespace kfusion {\nnamespace device {\n// texture<int, 1, cudaReadModeElementType> edgeTex;\ntexture<int, 1, cudaReadModeElementType> triTex;\ntexture<int, 1, cudaReadModeElementType> numVertsTex;\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::bindTextures(const int* /*edgeBuf*/, const int* triBuf, const int* numVertsBuf) {\n    cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();\n    // cudaSafeCall(cudaBindTexture(0, edgeTex, edgeBuf, desc) );\n    cudaSafeCall(cudaBindTexture(0, triTex, triBuf, desc));\n    cudaSafeCall(cudaBindTexture(0, numVertsTex, numVertsBuf, desc));\n}\nvoid kfusion::device::unbindTextures() {\n    // cudaSafeCall( cudaUnbindTexture(edgeTex) );\n    cudaSafeCall(cudaUnbindTexture(numVertsTex));\n    cudaSafeCall(cudaUnbindTexture(triTex));\n}\n\nnamespace kfusion {\nnamespace device {\n__device__ int global_count = 0;\n__device__ int output_count;\n__device__ unsigned int blocks_done = 0;\n\n__kf_device__ void kfusion::device::CubeIndexEstimator::readTsdf(int x, int y, int z, float& f, float& weight) const {\n    float2 aux = *volume(x, y, z);\n    f          = aux.x;\n    weight     = aux.y;\n}\n\n__kf_device__ int kfusion::device::CubeIndexEstimator::computeCubeIndex(int x, int y, int z, float f[8]) const {\n    float weight;\n    readTsdf(x, y, z, f[0], weight);\n    if (weight == 0.f)\n        return 0;\n    readTsdf(x + 1, y, z, f[1], weight);\n    if (weight == 0.f)\n        return 0;\n    readTsdf(x + 1, y + 1, z, f[2], weight);\n    if (weight == 0.f)\n        return 0;\n    readTsdf(x, y + 1, z, f[3], weight);\n    if (weight == 0.f)\n        return 0;\n    readTsdf(x, y, z + 1, f[4], weight);\n    if (weight == 0.f)\n        return 0;\n    readTsdf(x + 1, y, z + 1, f[5], weight);\n    if (weight == 0.f)\n        return 0;\n    readTsdf(x + 1, y + 1, z 
+ 1, f[6], weight);\n    if (weight == 0.f)\n        return 0;\n    readTsdf(x, y + 1, z + 1, f[7], weight);\n    if (weight == 0.f)\n        return 0;\n\n    // calculate flag indicating if each vertex is inside or outside isosurface\n    int cubeindex = 0;\n    cubeindex     = int(f[0] < isoValue);\n    cubeindex += int(f[1] < isoValue) * 2;\n    cubeindex += int(f[2] < isoValue) * 4;\n    cubeindex += int(f[3] < isoValue) * 8;\n    cubeindex += int(f[4] < isoValue) * 16;\n    cubeindex += int(f[5] < isoValue) * 32;\n    cubeindex += int(f[6] < isoValue) * 64;\n    cubeindex += int(f[7] < isoValue) * 128;\n\n    return cubeindex;\n}\n\n__kf_device__ void kfusion::device::OccupiedVoxels::operator()() const {\n    int x = threadIdx.x + blockIdx.x * CTA_SIZE_X;\n    int y = threadIdx.y + blockIdx.y * CTA_SIZE_Y;\n\n    if (__all_sync(FULL_MASK, x >= volume.dims.x) || __all_sync(FULL_MASK, y >= volume.dims.y)) {\n        return;\n    }\n\n    int ftid    = Block::flattenedThreadId();\n    int warp_id = Warp::id();\n    int lane_id = Warp::laneId();\n\n    volatile __shared__ int warps_buffer[WARPS_COUNT];\n\n    for (int z = 0; z < volume.dims.z - 1; z++) {\n        int numVerts = 0;\n        ;\n        if (x + 1 < volume.dims.x && y + 1 < volume.dims.y) {\n            float field[8];\n            int cubeindex = computeCubeIndex(x, y, z, field);\n\n            // read number of vertices from texture\n            numVerts = (cubeindex == 0 || cubeindex == 255) ? 
0 : tex1Dfetch(numVertsTex, cubeindex);\n        }\n\n        int total = __popc(__ballot_sync(FULL_MASK, numVerts > 0));\n\n        if (total == 0)\n            continue;\n\n        if (lane_id == 0) {\n            int old               = atomicAdd(&global_count, total);\n            warps_buffer[warp_id] = old;\n        }\n        int old_global_voxels_count = warps_buffer[warp_id];\n\n        int offs = Warp::binaryExclScan(__ballot_sync(FULL_MASK, numVerts > 0));\n\n        if (old_global_voxels_count + offs < max_size && numVerts > 0) {\n            voxels_indices[old_global_voxels_count + offs]  = volume.dims.y * volume.dims.x * z + volume.dims.x * y + x;\n            vertices_number[old_global_voxels_count + offs] = numVerts;\n        }\n\n        bool full = old_global_voxels_count + total >= max_size;\n\n        if (full)\n            break;\n\n    } /* for(int z = 0; z < 128 - 1; z++) */\n\n    /////////////////////////\n    // prepare for future scans\n    if (ftid == 0) {\n        unsigned int total_blocks = gridDim.x * gridDim.y * gridDim.z;\n        unsigned int value        = atomicInc(&blocks_done, total_blocks);\n\n        // last block\n        if (value == total_blocks - 1) {\n            output_count = min(max_size, global_count);\n            blocks_done  = 0;\n            global_count = 0;\n        }\n    }\n} /* operator () */\n\n__global__ void getOccupiedVoxelsKernel(const OccupiedVoxels ov) { ov(); }\n\nint getOccupiedVoxels(const TsdfVolume& volume, DeviceArray2D<int>& occupied_voxels) {\n    OccupiedVoxels ov(volume);\n\n    ov.voxels_indices  = occupied_voxels.ptr(0);\n    ov.vertices_number = occupied_voxels.ptr(1);\n    ov.max_size        = occupied_voxels.cols();\n\n    dim3 block(OccupiedVoxels::CTA_SIZE_X, OccupiedVoxels::CTA_SIZE_Y);\n    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));\n\n    getOccupiedVoxelsKernel<<<grid, block>>>(ov);\n    cudaSafeCall(cudaGetLastError());\n    
cudaSafeCall(cudaDeviceSynchronize());\n\n    int size;\n    cudaSafeCall(cudaMemcpyFromSymbol(&size, output_count, sizeof(size)));\n    return size;\n}\n\nint computeOffsetsAndTotalVertices(DeviceArray2D<int>& occupied_voxels) {\n    thrust::device_ptr<int> beg(occupied_voxels.ptr(1));\n    thrust::device_ptr<int> end = beg + occupied_voxels.cols();\n\n    thrust::device_ptr<int> out(occupied_voxels.ptr(2));\n    thrust::exclusive_scan(beg, end, out);\n\n    int lastElement, lastScanElement;\n\n    DeviceArray<int> last_elem(occupied_voxels.ptr(1) + occupied_voxels.cols() - 1, 1);\n    DeviceArray<int> last_scan(occupied_voxels.ptr(2) + occupied_voxels.cols() - 1, 1);\n\n    last_elem.download(&lastElement);\n    last_scan.download(&lastScanElement);\n\n    return lastElement + lastScanElement;\n}\n\n__kf_device__ float3 kfusion::device::TrianglesGenerator::get_node_coo(int x, int y, int z) const {\n    float3 coo = make_float3(x, y, z);\n    coo += 0.5f;  // shift to volume cell center;\n\n    coo.x *= cell_size.x;\n    coo.y *= cell_size.y;\n    coo.z *= cell_size.z;\n\n    return coo;\n}\n\n__kf_device__ float3 kfusion::device::TrianglesGenerator::vertex_interp(float3 p0, float3 p1, float f0,\n                                                                        float f1) const {\n    float t = (isoValue - f0) / (f1 - f0 + 1e-15f);\n    float x = p0.x + t * (p1.x - p0.x);\n    float y = p0.y + t * (p1.y - p0.y);\n    float z = p0.z + t * (p1.z - p0.z);\n    return make_float3(x, y, z);\n}\n\n__kf_device__ void kfusion::device::TrianglesGenerator::operator()() const {\n    int tid = threadIdx.x;\n    int idx = (blockIdx.y * MAX_GRID_SIZE_X + blockIdx.x) * CTA_SIZE + tid;\n\n    if (idx >= voxels_count)\n        return;\n\n    int voxel = occupied_voxels[idx];\n\n    int z = voxel / (volume.dims.x * volume.dims.y);\n    int y = (voxel - z * volume.dims.x * volume.dims.y) / volume.dims.x;\n    int x = (voxel - z * volume.dims.x * volume.dims.y) - y * 
volume.dims.x;\n\n    float f[8];\n    int cubeindex = computeCubeIndex(x, y, z, f);\n\n    /* calculate cell vertex positions */\n    float3 v[8];\n    v[0] = get_node_coo(x, y, z);\n    v[1] = get_node_coo(x + 1, y, z);\n    v[2] = get_node_coo(x + 1, y + 1, z);\n    v[3] = get_node_coo(x, y + 1, z);\n    v[4] = get_node_coo(x, y, z + 1);\n    v[5] = get_node_coo(x + 1, y, z + 1);\n    v[6] = get_node_coo(x + 1, y + 1, z + 1);\n    v[7] = get_node_coo(x, y + 1, z + 1);\n\n    /* find vertices where surface intersects the cube; use shared memory to avoid using local */\n    __shared__ float3 vertlist[12][CTA_SIZE];\n\n    vertlist[0][tid]  = vertex_interp(v[0], v[1], f[0], f[1]);\n    vertlist[1][tid]  = vertex_interp(v[1], v[2], f[1], f[2]);\n    vertlist[2][tid]  = vertex_interp(v[2], v[3], f[2], f[3]);\n    vertlist[3][tid]  = vertex_interp(v[3], v[0], f[3], f[0]);\n    vertlist[4][tid]  = vertex_interp(v[4], v[5], f[4], f[5]);\n    vertlist[5][tid]  = vertex_interp(v[5], v[6], f[5], f[6]);\n    vertlist[6][tid]  = vertex_interp(v[6], v[7], f[6], f[7]);\n    vertlist[7][tid]  = vertex_interp(v[7], v[4], f[7], f[4]);\n    vertlist[8][tid]  = vertex_interp(v[0], v[4], f[0], f[4]);\n    vertlist[9][tid]  = vertex_interp(v[1], v[5], f[1], f[5]);\n    vertlist[10][tid] = vertex_interp(v[2], v[6], f[2], f[6]);\n    vertlist[11][tid] = vertex_interp(v[3], v[7], f[3], f[7]);\n    __syncthreads();\n\n    /* output triangle vertices and normals */\n    int numVerts = tex1Dfetch(numVertsTex, cubeindex);\n\n    for (int i = 0; i < numVerts; i += 3) {\n        int index = vertex_ofssets[idx] + i;\n\n        int v1 = tex1Dfetch(triTex, (cubeindex * 16) + i + 0);\n        int v2 = tex1Dfetch(triTex, (cubeindex * 16) + i + 1);\n        int v3 = tex1Dfetch(triTex, (cubeindex * 16) + i + 2);\n\n        /* NOTE (dig15): the surface could be smoother if the normal weren't the same for each vertex of the triangle */\n        float3 n = normalized(cross(vertlist[v3][tid] - 
vertlist[v1][tid], vertlist[v2][tid] - vertlist[v1][tid]));\n\n        store_point(outputVertices, index + 0, pose * vertlist[v1][tid]);\n        store_point(outputNormals, index + 0, n);\n\n        store_point(outputVertices, index + 1, pose * vertlist[v2][tid]);\n        store_point(outputNormals, index + 1, n);\n\n        store_point(outputVertices, index + 2, pose * vertlist[v3][tid]);\n        store_point(outputNormals, index + 2, n);\n    }\n}\n\n__kf_device__ void kfusion::device::TrianglesGenerator::store_point(float4* ptr, int index,\n                                                                    const float3& vertex) const {\n    ptr[index] = make_float4(vertex.x, -vertex.y, -vertex.z, 1.f);\n}\n\n__global__ void trianglesGeneratorKernel(const TrianglesGenerator tg) { tg(); }\n\nvoid generateTriangles(const TsdfVolume& volume, const DeviceArray2D<int>& occupied_voxels, const float3& volume_size,\n                       const Aff3f& pose, DeviceArray<PointType>& outputVertices,\n                       DeviceArray<PointType>& outputNormals) {\n    int device;\n    cudaSafeCall(cudaGetDevice(&device));\n\n    cudaDeviceProp prop;\n    cudaSafeCall(cudaGetDeviceProperties(&prop, device));\n\n    typedef TrianglesGenerator Tg;\n    Tg tg(volume);\n\n    tg.occupied_voxels = occupied_voxels.ptr(0);\n    tg.vertex_ofssets  = occupied_voxels.ptr(2);\n    tg.voxels_count    = occupied_voxels.cols();\n    tg.cell_size.x     = volume_size.x / volume.dims.x;\n    tg.cell_size.y     = volume_size.y / volume.dims.y;\n    tg.cell_size.z     = volume_size.z / volume.dims.z;\n    tg.outputVertices  = outputVertices;\n    tg.outputNormals   = outputNormals;\n\n    tg.pose = pose;\n\n    int block_size = 256;\n    int blocks_num = divUp(tg.voxels_count, block_size);\n\n    dim3 block(block_size);\n    dim3 grid(min(blocks_num, Tg::MAX_GRID_SIZE_X), divUp(blocks_num, Tg::MAX_GRID_SIZE_X));\n\n    trianglesGeneratorKernel<<<grid, block>>>(tg);\n    
cudaSafeCall(cudaGetLastError());\n    cudaSafeCall(cudaDeviceSynchronize());\n}\n}  // namespace device\n}  // namespace kfusion\n"
  },
  {
    "path": "src/kfusion/cuda/proj_icp.cu",
    "content": "#include <kfusion/cuda/device.hpp>\n#include <kfusion/cuda/texture_binder.hpp>\n\nnamespace kfusion {\nnamespace device {\ntexture<ushort, 2> dprev_tex;\ntexture<Normal, 2> nprev_tex;\ntexture<Point, 2> vprev_tex;\n\nstruct ComputeIcpHelper::Policy {\n    enum {\n        CTA_SIZE_X = 32,\n        CTA_SIZE_Y = 8,\n        CTA_SIZE   = CTA_SIZE_X * CTA_SIZE_Y,\n\n        B              = 6,\n        COLS           = 6,\n        ROWS           = 6,\n        DIAG           = 6,\n        UPPER_DIAG_MAT = (COLS * ROWS - DIAG) / 2 + DIAG,\n        TOTAL          = UPPER_DIAG_MAT + B,\n\n        FINAL_REDUCE_CTA_SIZE = 256,\n        FINAL_REDUCE_STRIDE   = FINAL_REDUCE_CTA_SIZE\n    };\n};\n\n__kf_device__ float2 ComputeIcpHelper::proj(const float3& p) const {\n    float2 coo;\n    coo.x = __fmaf_rn(f.x, __fdividef(p.x, p.z), c.x);\n    coo.y = __fmaf_rn(f.y, __fdividef(p.y, p.z), c.y);\n    return coo;\n}\n\n__kf_device__ float3 ComputeIcpHelper::reproj(float u, float v, float z) const {\n    float x = z * (u - c.x) * finv.x;\n    float y = z * (v - c.y) * finv.y;\n    return make_float3(x, y, z);\n}\n\n#if defined USE_DEPTH\n__kf_device__ int ComputeIcpHelper::find_coresp(int x, int y, float3& nd, float3& d, float3& s) const {\n    int src_z = dcurr(y, x);\n    if (src_z == 0)\n        return 40;\n\n    s = aff * reproj(x, y, src_z * 0.001f);\n\n    float2 coo = proj(s);\n    if (s.z <= 0 || coo.x < 0 || coo.y < 0 || coo.x >= cols || coo.y >= rows)\n        return 80;\n\n    int dst_z = tex2D(dprev_tex, coo.x, coo.y);\n    if (dst_z == 0)\n        return 120;\n\n    d = reproj(coo.x, coo.y, dst_z * 0.001f);\n\n    float dist2 = norm_sqr(s - d);\n    if (dist2 > dist2_thres)\n        return 160;\n\n    float3 ns = aff.R * tr(ncurr(y, x));\n    nd        = tr(tex2D(nprev_tex, coo.x, coo.y));\n\n    float cosine = fabs(dot(ns, nd));\n    if (cosine < min_cosine)\n        return 200;\n    return 0;\n}\n#else\n__kf_device__ int 
ComputeIcpHelper::find_coresp(int x, int y, float3& nd, float3& d, float3& s) const {\n    s = tr(vcurr(y, x));\n    if (isnan(s.x))\n        return 40;\n\n    s = aff * s;\n\n    float2 coo = proj(s);\n    if (s.z <= 0 || coo.x < 0 || coo.y < 0 || coo.x >= cols || coo.y >= rows)\n        return 80;\n\n    d = tr(tex2D(vprev_tex, coo.x, coo.y));\n    if (isnan(d.x))\n        return 120;\n\n    float dist2 = norm_sqr(s - d);\n    if (dist2 > dist2_thres)\n        return 160;\n\n    float3 ns = aff.R * tr(ncurr(y, x));\n    nd        = tr(tex2D(nprev_tex, coo.x, coo.y));\n\n    float cosine = fabs(dot(ns, nd));\n    if (cosine < min_cosine)\n        return 200;\n    return 0;\n}\n#endif\n\n__kf_device__ void ComputeIcpHelper::partial_reduce(const float row[7], PtrStep<float>& partial_buf) const {\n    volatile __shared__ float smem[Policy::CTA_SIZE];\n    int tid = Block::flattenedThreadId();\n\n    float* pos  = partial_buf.data + blockIdx.x + gridDim.x * blockIdx.y;\n    size_t step = partial_buf.step / sizeof(float);\n\n#define STOR            \\\n    if (tid == 0) {     \\\n        *pos = smem[0]; \\\n        pos += step;    \\\n    }\n\n    __syncthreads();\n    smem[tid] = row[0] * row[0];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[0] * row[1];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[0] * row[2];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[0] * row[3];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[0] * row[4];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR __syncthreads();\n    smem[tid] = row[0] * row[5];\n    __syncthreads();\n\n    
Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[0] * row[6];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n    ////////////////////////////////\n\n    __syncthreads();\n    smem[tid] = row[1] * row[1];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[1] * row[2];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[1] * row[3];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[1] * row[4];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[1] * row[5];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[1] * row[6];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n    ////////////////////////////////\n\n    __syncthreads();\n    smem[tid] = row[2] * row[2];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[2] * row[3];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[2] * row[4];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[2] * row[5];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[2] * row[6];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n    ////////////////////////////////\n\n    __syncthreads();\n    smem[tid] = row[3] * row[3];\n    
__syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[3] * row[4];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[3] * row[5];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[3] * row[6];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n    ///////////////////////////////////////////////////\n\n    __syncthreads();\n    smem[tid] = row[4] * row[4];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[4] * row[5];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[4] * row[6];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    ///////////////////////////////////////////////////\n\n    __syncthreads();\n    smem[tid] = row[5] * row[5];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n\n    __syncthreads();\n    smem[tid] = row[5] * row[6];\n    __syncthreads();\n\n    Block::reduce<Policy::CTA_SIZE>(smem, plus());\n\n    STOR\n}\n\n__global__ void icp_helper_kernel(const ComputeIcpHelper helper, PtrStep<float> partial_buf) {\n    int x = threadIdx.x + blockIdx.x * ComputeIcpHelper::Policy::CTA_SIZE_X;\n    int y = threadIdx.y + blockIdx.y * ComputeIcpHelper::Policy::CTA_SIZE_Y;\n\n    float3 n, d, s;\n    int filtered = (x < helper.cols && y < helper.rows) ? 
helper.find_coresp(x, y, n, d, s) : 1;\n    // if (x < helper.cols && y < helper.rows) mask(y, x) = filtered;\n\n    float row[7];\n\n    if (!filtered) {\n        *(float3*) &row[0] = cross(s, n);\n        *(float3*) &row[3] = n;\n        row[6]             = dot(n, d - s);\n    } else\n        row[0] = row[1] = row[2] = row[3] = row[4] = row[5] = row[6] = 0.f;\n\n    helper.partial_reduce(row, partial_buf);\n}\n\n__global__ void icp_final_reduce_kernel(const PtrStep<float> partial_buf, const int length, float* final_buf) {\n    const float* beg = partial_buf.ptr(blockIdx.x);\n    const float* end = beg + length;\n\n    int tid = threadIdx.x;\n\n    float sum = 0.f;\n    for (const float* t = beg + tid; t < end; t += ComputeIcpHelper::Policy::FINAL_REDUCE_STRIDE)\n        sum += *t;\n\n    __shared__ float smem[ComputeIcpHelper::Policy::FINAL_REDUCE_CTA_SIZE];\n\n    smem[tid] = sum;\n\n    __syncthreads();\n\n    Block::reduce<ComputeIcpHelper::Policy::FINAL_REDUCE_CTA_SIZE>(smem, plus());\n\n    if (tid == 0)\n        final_buf[blockIdx.x] = smem[0];\n}\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::ComputeIcpHelper::operator()(const Depth& dprev, const Normals& nprev,\n                                                   DeviceArray2D<float>& buffer, float* data, cudaStream_t s) {\n    dprev_tex.filterMode = cudaFilterModePoint;\n    nprev_tex.filterMode = cudaFilterModePoint;\n    TextureBinder dprev_binder(dprev, dprev_tex);\n    TextureBinder nprev_binder(nprev, nprev_tex);\n\n    dim3 block(Policy::CTA_SIZE_X, Policy::CTA_SIZE_Y);\n    dim3 grid(divUp((int) cols, block.x), divUp((int) rows, block.y));\n\n    int partials_count = (int) (grid.x * grid.y);\n    allocate_buffer(buffer, partials_count);\n\n    icp_helper_kernel<<<grid, block, 0, s>>>(*this, buffer);\n    cudaSafeCall(cudaGetLastError());\n\n    int b = Policy::FINAL_REDUCE_CTA_SIZE;\n    int g = Policy::TOTAL;\n    icp_final_reduce_kernel<<<g, b, 0, s>>>(buffer, 
partials_count, buffer.ptr(Policy::TOTAL));\n    cudaSafeCall(cudaGetLastError());\n\n    cudaSafeCall(\n        cudaMemcpyAsync(data, buffer.ptr(Policy::TOTAL), Policy::TOTAL * sizeof(float), cudaMemcpyDeviceToHost, s));\n    cudaSafeCall(cudaGetLastError());\n}\n\nvoid kfusion::device::ComputeIcpHelper::operator()(const Points& vprev, const Normals& nprev,\n                                                   DeviceArray2D<float>& buffer, float* data, cudaStream_t s) {\n    dprev_tex.filterMode = cudaFilterModePoint;\n    nprev_tex.filterMode = cudaFilterModePoint;\n    TextureBinder vprev_binder(vprev, vprev_tex);\n    TextureBinder nprev_binder(nprev, nprev_tex);\n\n    dim3 block(Policy::CTA_SIZE_X, Policy::CTA_SIZE_Y);\n    dim3 grid(divUp((int) cols, block.x), divUp((int) rows, block.y));\n\n    int partials_count = (int) (grid.x * grid.y);\n    allocate_buffer(buffer, partials_count);\n\n    icp_helper_kernel<<<grid, block, 0, s>>>(*this, buffer);\n    cudaSafeCall(cudaGetLastError());\n\n    int b = Policy::FINAL_REDUCE_CTA_SIZE;\n    int g = Policy::TOTAL;\n    icp_final_reduce_kernel<<<g, b, 0, s>>>(buffer, partials_count, buffer.ptr(Policy::TOTAL));\n    cudaSafeCall(cudaGetLastError());\n\n    cudaSafeCall(\n        cudaMemcpyAsync(data, buffer.ptr(Policy::TOTAL), Policy::TOTAL * sizeof(float), cudaMemcpyDeviceToHost, s));\n    cudaSafeCall(cudaGetLastError());\n}\n\nvoid kfusion::device::ComputeIcpHelper::allocate_buffer(DeviceArray2D<float>& buffer, int partials_count) {\n    if (partials_count < 0) {\n        const int input_cols = 640;\n        const int input_rows = 480;\n\n        int gx = divUp(input_cols, Policy::CTA_SIZE_X);\n        int gy = divUp(input_rows, Policy::CTA_SIZE_Y);\n\n        partials_count = gx * gy;\n    }\n\n    int min_rows = Policy::TOTAL + 1;\n    int min_cols = max(partials_count, Policy::TOTAL);\n\n    if (buffer.rows() < min_rows || buffer.cols() < min_cols)\n        buffer.create(min_rows, 
min_cols);\n}\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// ComputeIcpHelper::PageLockHelper\n\nkfusion::device::ComputeIcpHelper::PageLockHelper::PageLockHelper() : data(0) {\n    cudaSafeCall(cudaMallocHost((void**) &data, Policy::TOTAL * sizeof(float)));\n}\n\nkfusion::device::ComputeIcpHelper::PageLockHelper::~PageLockHelper() {\n    cudaSafeCall(cudaFreeHost(data));\n    data = 0;\n}\n"
  },
  {
    "path": "src/kfusion/cuda/tsdf_volume.cu",
    "content": "/* kfusion includes */\n#include <kfusion/cuda/device.hpp>\n#include <kfusion/cuda/texture_binder.hpp>\n\n/* sobfu includes */\n#include <sobfu/cuda/utils.hpp>\n\n/* cuda includes */\n#include <curand.h>\n#include <curand_kernel.h>\n\n/* thrust includes */\n#include <thrust/device_vector.h>\n#include <thrust/extrema.h>\n\nusing namespace kfusion::device;\n\n///////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Volume initialization\n\nnamespace kfusion {\nnamespace device {\n__global__ void clear_volume_kernel(TsdfVolume tsdf) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x < tsdf.dims.x && y < tsdf.dims.y) {\n        float2* beg = tsdf.beg(x, y);\n        float2* end = beg + tsdf.dims.x * tsdf.dims.y * tsdf.dims.z;\n\n        for (float2* pos = beg; pos != end; pos = tsdf.zstep(pos))\n            *pos = make_float2(0.f, 0.f);\n    }\n}\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::clear_volume(TsdfVolume& volume) {\n    dim3 block(64, 16);\n    dim3 grid(1, 1, 1);\n    grid.x = divUp(volume.dims.x, block.x);\n    grid.y = divUp(volume.dims.y, block.y);\n\n    clear_volume_kernel<<<grid, block>>>(volume);\n    cudaSafeCall(cudaGetLastError());\n}\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Volume integration\n\nnamespace kfusion {\nnamespace device {\ntexture<float, 2> dists_tex(0, cudaFilterModePoint, cudaAddressModeBorder,\n                            cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat));\n\nstruct TsdfIntegrator {\n    Projector proj;\n    Aff3f vol2cam;\n\n    int2 dists_size;\n\n    __kf_device__ void operator()(TsdfVolume& volume) const {\n        int x = blockIdx.x * blockDim.x + threadIdx.x;\n        int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n        if (x >= volume.dims.x 
|| y >= volume.dims.y) {\n            return;\n        }\n\n        float3 vc     = make_float3(x * volume.voxel_size.x + volume.voxel_size.x / 2.f,\n                                y * volume.voxel_size.y + volume.voxel_size.y / 2.f, volume.voxel_size.z / 2.f);\n        float3 vc_cam = vol2cam * vc; /* transfrom from volume coos to camera coos */\n\n        float3 zstep = make_float3(0.f, 0.f, volume.voxel_size.z);\n        float2* vptr = volume.beg(x, y);\n        for (int i = 0; i <= volume.dims.z - 1; ++i, vc_cam += zstep, vptr = volume.zstep(vptr)) {\n            /* project the voxel centre onto the depth map */\n            float2 coo = proj(vc_cam);\n            if (coo.x < 0 || coo.y < 0 || coo.x >= dists_size.x || coo.y >= dists_size.y) {\n                continue;\n            }\n\n            float Dp = tex2D(dists_tex, coo.x, coo.y);\n            if (Dp <= 0.f || vc_cam.z <= 0) {\n                continue;\n            }\n\n            /* get the psdf value */\n            float psdf = Dp - vc_cam.z;\n            /* get the weight */\n            float weight = (psdf > -volume.eta) ? 
1.f : 0.f;\n\n            if (psdf >= volume.trunc_dist) {\n                *vptr = make_float2(1.f, weight);\n            } else if (psdf <= -volume.trunc_dist) {\n                *vptr = make_float2(-1.f, weight);\n            } else {\n                *vptr = make_float2(__fdividef(psdf, volume.trunc_dist), weight);\n            }\n        }\n    }\n\n    __kf_device__ void operator()(TsdfVolume& phi_global, TsdfVolume& phi_n_psi) const {\n        int x = blockIdx.x * blockDim.x + threadIdx.x;\n        int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n        if (x >= phi_global.dims.x || y >= phi_global.dims.y) {\n            return;\n        }\n\n        float2* pos_global = phi_global.beg(x, y);\n        float2* pos_n_psi  = phi_n_psi.beg(x, y);\n\n        for (int i = 0; i <= phi_global.dims.z - 1;\n             ++i, pos_global = phi_global.zstep(pos_global), pos_n_psi = phi_n_psi.zstep(pos_n_psi)) {\n            float2 tsdf = *pos_n_psi;\n\n            if (tsdf.y == 0.f || (tsdf.y == 1.f && (tsdf.x == 0.f || tsdf.x == -1.f))) {\n                continue;\n            }\n\n            float2 tsdf_prev = *pos_global;\n\n            float tsdf_new   = __fdividef(__fmaf_rn(tsdf_prev.y, tsdf_prev.x, tsdf.x), tsdf_prev.y + 1.f);\n            float weight_new = fminf(tsdf_prev.y + 1.f, (float) phi_global.max_weight);\n\n            /* pack and write */\n            *pos_global = make_float2(tsdf_new, weight_new);\n        }\n    }\n};  // namespace device\n\n__global__ void integrate_kernel(const TsdfIntegrator integrator, TsdfVolume volume) { integrator(volume); }\n\n__global__ void integrate_kernel(const TsdfIntegrator integrator, TsdfVolume phi_global, TsdfVolume phi_n_psi) {\n    integrator(phi_global, phi_n_psi);\n};\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::integrate(const PtrStepSz<float>& dists, TsdfVolume& volume, const Aff3f& aff,\n                                const Projector& proj) {\n    /* init tsdf */\n    
TsdfIntegrator ti;\n    ti.dists_size = make_int2(dists.cols, dists.rows);\n    ti.vol2cam    = aff;\n    ti.proj       = proj;\n\n    dists_tex.filterMode     = cudaFilterModePoint;\n    dists_tex.addressMode[0] = cudaAddressModeBorder;\n    dists_tex.addressMode[1] = cudaAddressModeBorder;\n    dists_tex.addressMode[2] = cudaAddressModeBorder;\n    TextureBinder binder(dists, dists_tex, cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat));\n    (void) binder;\n\n    dim3 block(64, 16);\n    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));\n\n    integrate_kernel<<<grid, block>>>(ti, volume);\n    cudaSafeCall(cudaGetLastError());\n    cudaSafeCall(cudaDeviceSynchronize());\n}\n\nvoid kfusion::device::integrate(TsdfVolume& phi_global, TsdfVolume& phi_n_psi) {\n    TsdfIntegrator ti;\n\n    dim3 block(64, 16);\n    dim3 grid(divUp(phi_global.dims.x, block.x), divUp(phi_global.dims.y, block.y));\n\n    integrate_kernel<<<grid, block>>>(ti, phi_global, phi_n_psi);\n    cudaSafeCall(cudaGetLastError());\n    cudaSafeCall(cudaDeviceSynchronize());\n}\n\nnamespace kfusion {\nnamespace device {\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// signed distance fields for various primitives\n\n__global__ void init_box_kernel(TsdfVolume volume, const float3 b) {\n    int x = blockIdx.x * blockDim.x + threadIdx.x;\n    int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n    if (x >= volume.dims.x || y >= volume.dims.y)\n        return;\n\n    /* centering */\n    float3 c = make_float3(volume.dims.x / 2.f * volume.voxel_size.x, volume.dims.y / 2.f * volume.voxel_size.y,\n                           volume.dims.z / 2.f * volume.voxel_size.z);\n\n    float3 vc = make_float3(x * volume.voxel_size.x + volume.voxel_size.x / 2.f,\n                            y * volume.voxel_size.y + volume.voxel_size.y / 2.f, volume.voxel_size.z / 2.f) -\n                c;\n    float3 zstep = 
make_float3(0.f, 0.f, volume.voxel_size.z);\n\n    float2* vptr = volume.beg(x, y);\n    for (int i = 0; i < volume.dims.z; vc += zstep, vptr = volume.zstep(vptr), ++i) {\n        float3 d = make_float3(fabs(vc.x), fabs(vc.y), fabs(vc.z)) - b;\n\n        float sdf =\n            fmin(fmax(d.x, fmax(d.y, d.z)), 0.f) + norm(make_float3(fmax(d.x, 0.f), fmax(d.y, 0.f), fmax(d.z, 0.f)));\n        float weight = 1.f;\n\n        if (sdf >= volume.trunc_dist) {\n            *vptr = make_float2(1.f, weight);\n        } else if (sdf <= -volume.trunc_dist) {\n            *vptr = make_float2(-1.f, weight);\n        } else {\n            *vptr = make_float2(__fdividef(sdf, volume.trunc_dist), weight);\n        }\n    }\n}\n\n__global__ void init_ellipsoid_kernel(TsdfVolume volume, const float3 r) {\n    int x = blockIdx.x * blockDim.x + threadIdx.x;\n    int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n    if (x >= volume.dims.x || y >= volume.dims.y)\n        return;\n\n    /* centering */\n    float3 c = make_float3(volume.dims.x / 2.f * volume.voxel_size.x, volume.dims.y / 2.f * volume.voxel_size.y,\n                           volume.dims.z / 2.f * volume.voxel_size.z);\n\n    float3 vc = make_float3(x * volume.voxel_size.x + volume.voxel_size.x / 2.f,\n                            y * volume.voxel_size.y + volume.voxel_size.y / 2.f, volume.voxel_size.z / 2.f) -\n                c;\n    float3 zstep = make_float3(0.f, 0.f, volume.voxel_size.z);\n\n    float2* vptr = volume.beg(x, y);\n    for (int i = 0; i < volume.dims.z; vc += zstep, vptr = volume.zstep(vptr), ++i) {\n        float k0 = norm(make_float3(vc.x / r.x, vc.y / r.y, vc.z / r.z));\n        float k1 = norm(make_float3(vc.x / (r.x * r.x), vc.y / (r.y * r.y), vc.z / (r.z * r.z)));\n\n        float sdf    = k0 * (k0 - 1.f) / k1;\n        float weight = 1.f;\n\n        if (sdf >= volume.trunc_dist) {\n            *vptr = make_float2(1.f, weight);\n        } else if (sdf <= -volume.trunc_dist) {\n            *vptr = 
make_float2(-1.f, weight);\n        } else {\n            *vptr = make_float2(__fdividef(sdf, volume.trunc_dist), weight);\n        }\n    }\n}\n\n__global__ void init_sphere_kernel(TsdfVolume volume, float3 centre, float radius) {\n    int x = blockIdx.x * blockDim.x + threadIdx.x;\n    int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n    if (x >= volume.dims.x || y >= volume.dims.y)\n        return;\n\n    float3 vc    = make_float3(x * volume.voxel_size.x + volume.voxel_size.x / 2.f,\n                            y * volume.voxel_size.y + volume.voxel_size.y / 2.f, volume.voxel_size.z / 2.f);\n    float3 zstep = make_float3(0.f, 0.f, volume.voxel_size.z);\n\n    float2* vptr = volume.beg(x, y);\n    for (int i = 0; i < volume.dims.z; vc += zstep, vptr = volume.zstep(vptr), ++i) {\n        float d = sqrtf(powf(vc.x - centre.x, 2) + powf(vc.y - centre.y, 2) + powf(vc.z - centre.z, 2));\n\n        float sdf    = d - radius;\n        float weight = (sdf > -volume.eta) ? 1.f : 0.f;\n\n        if (sdf >= volume.trunc_dist) {\n            *vptr = make_float2(1.f, weight);\n        } else if (sdf <= -volume.trunc_dist) {\n            *vptr = make_float2(-1.f, weight);\n        } else {\n            *vptr = make_float2(__fdividef(sdf, volume.trunc_dist), weight);\n        }\n    }\n}\n\n__global__ void init_plane_kernel(TsdfVolume volume, float z) {\n    int x = blockIdx.x * blockDim.x + threadIdx.x;\n    int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n    if (x >= volume.dims.x || y >= volume.dims.y)\n        return;\n\n    float3 vc    = make_float3(x * volume.voxel_size.x + volume.voxel_size.x / 2.f,\n                            y * volume.voxel_size.y + volume.voxel_size.y / 2.f, volume.voxel_size.z / 2.f);\n    float3 zstep = make_float3(0.f, 0.f, volume.voxel_size.z);\n\n    float2* vptr = volume.beg(x, y);\n    for (int i = 0; i < volume.dims.z; vc += zstep, vptr = volume.zstep(vptr), ++i) {\n        float sdf    = vc.z - z;\n        float weight = 1.f;\n\n      
  if (sdf >= volume.trunc_dist) {\n            *vptr = make_float2(1.f, weight);\n        } else if (sdf <= -volume.trunc_dist) {\n            *vptr = make_float2(-1.f, weight);\n        } else {\n            *vptr = make_float2(__fdividef(sdf, volume.trunc_dist), weight);\n        }\n    }\n}\n\n__global__ void init_torus_kernel(TsdfVolume volume, const float2 t) {\n    int x = blockIdx.x * blockDim.x + threadIdx.x;\n    int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n    if (x >= volume.dims.x || y >= volume.dims.y)\n        return;\n\n    /* centering */\n    float3 c = make_float3(volume.dims.x / 2.f * volume.voxel_size.x, volume.dims.y / 2.f * volume.voxel_size.y,\n                           volume.dims.z / 2.f * volume.voxel_size.z);\n\n    float3 vc = make_float3(x * volume.voxel_size.x + volume.voxel_size.x / 2.f,\n                            y * volume.voxel_size.y + volume.voxel_size.y / 2.f, volume.voxel_size.z / 2.f) -\n                c;\n    float3 zstep = make_float3(0.f, 0.f, volume.voxel_size.z);\n\n    float2* vptr = volume.beg(x, y);\n    for (int i = 0; i < volume.dims.z; vc += zstep, vptr = volume.zstep(vptr), ++i) {\n        float2 q = make_float2(norm(make_float2(vc.x, vc.z)) - t.x, vc.y);\n\n        float sdf    = norm(q) - t.y;\n        float weight = 1.f;\n\n        if (sdf >= volume.trunc_dist) {\n            *vptr = make_float2(1.f, weight);\n        } else if (sdf <= -volume.trunc_dist) {\n            *vptr = make_float2(-1.f, weight);\n        } else {\n            *vptr = make_float2(__fdividef(sdf, volume.trunc_dist), weight);\n        }\n    }\n}\n\n}  // namespace device\n}  // namespace kfusion\n\nvoid kfusion::device::init_box(TsdfVolume& volume, const float3& b) {\n    dim3 block(64, 16);\n    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));\n\n    init_box_kernel<<<grid, block>>>(volume, b);\n    cudaSafeCall(cudaGetLastError());\n    cudaSafeCall(cudaDeviceSynchronize());\n}\n\nvoid 
kfusion::device::init_ellipsoid(TsdfVolume& volume, const float3& r) {\n    dim3 block(64, 16);\n    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));\n\n    init_ellipsoid_kernel<<<grid, block>>>(volume, r);\n    cudaSafeCall(cudaGetLastError());\n    cudaSafeCall(cudaDeviceSynchronize());\n}\n\nvoid kfusion::device::init_plane(TsdfVolume& volume, const float& z) {\n    dim3 block(64, 16);\n    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));\n\n    init_plane_kernel<<<grid, block>>>(volume, z);\n    cudaSafeCall(cudaGetLastError());\n    cudaSafeCall(cudaDeviceSynchronize());\n}\n\nvoid kfusion::device::init_sphere(TsdfVolume& volume, const float3& centre, const float& radius) {\n    dim3 block(64, 16);\n    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));\n\n    init_sphere_kernel<<<grid, block>>>(volume, centre, radius);\n    cudaSafeCall(cudaGetLastError());\n    cudaSafeCall(cudaDeviceSynchronize());\n}\n\nvoid kfusion::device::init_torus(TsdfVolume& volume, const float2& t) {\n    dim3 block(64, 16);\n    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));\n\n    init_torus_kernel<<<grid, block>>>(volume, t);\n    cudaSafeCall(cudaGetLastError());\n    cudaSafeCall(cudaDeviceSynchronize());\n}\n"
  },
  {
    "path": "src/kfusion/device_memory.cpp",
    "content": "#include <cassert>\n#include <cstdlib>\n#include <iostream>\n#include <kfusion/cuda/device_memory.hpp>\n#include <kfusion/safe_call.hpp>\n\nvoid kfusion::cuda::error(const char *error_string, const char *file, const int line, const char *func) {\n    std::cout << \"error: \" << error_string << \"\\t\" << file << \":\" << line << std::endl;\n    exit(0);\n}\n\n//////////////////////////    XADD    ///////////////////////////////\n\n#ifdef __GNUC__\n\n#if __GNUC__ * 10 + __GNUC_MINOR__ >= 42\n\n#if !defined WIN32 && (defined __i486__ || defined __i586__ || defined __i686__ || defined __MMX__ || \\\n                       defined __SSE__ || defined __ppc__)\n#define CV_XADD __sync_fetch_and_add\n#else\n#include <ext/atomicity.h>\n#define CV_XADD __gnu_cxx::__exchange_and_add\n#endif\n#else\n#include <bits/atomicity.h>\n#if __GNUC__ * 10 + __GNUC_MINOR__ >= 34\n#define CV_XADD __gnu_cxx::__exchange_and_add\n#else\n#define CV_XADD __exchange_and_add\n#endif\n#endif\n\n#elif defined WIN32 || defined _WIN32\n#include <intrin.h>\n#define CV_XADD(addr, delta) _InterlockedExchangeAdd((long volatile *) (addr), (delta))\n#else\n\ntemplate <typename _Tp>\nstatic inline _Tp CV_XADD(_Tp *addr, _Tp delta) {\n    int tmp = *addr;\n    *addr += delta;\n    return tmp;\n}\n\n#endif\n\n////////////////////////    DeviceArray    /////////////////////////////\n\nkfusion::cuda::DeviceMemory::DeviceMemory() : data_(0), sizeBytes_(0), refcount_(0) {}\nkfusion::cuda::DeviceMemory::DeviceMemory(void *ptr_arg, size_t sizeBytes_arg)\n    : data_(ptr_arg), sizeBytes_(sizeBytes_arg), refcount_(0) {}\nkfusion::cuda::DeviceMemory::DeviceMemory(size_t sizeBtes_arg) : data_(0), sizeBytes_(0), refcount_(0) {\n    create(sizeBtes_arg);\n}\nkfusion::cuda::DeviceMemory::~DeviceMemory() { release(); }\n\nkfusion::cuda::DeviceMemory::DeviceMemory(const DeviceMemory &other_arg)\n    : data_(other_arg.data_), sizeBytes_(other_arg.sizeBytes_), refcount_(other_arg.refcount_) {\n    if 
(refcount_)\n        CV_XADD(refcount_, 1);\n}\n\nkfusion::cuda::DeviceMemory &kfusion::cuda::DeviceMemory::operator=(const kfusion::cuda::DeviceMemory &other_arg) {\n    if (this != &other_arg) {\n        if (other_arg.refcount_)\n            CV_XADD(other_arg.refcount_, 1);\n        release();\n\n        data_      = other_arg.data_;\n        sizeBytes_ = other_arg.sizeBytes_;\n        refcount_  = other_arg.refcount_;\n    }\n    return *this;\n}\n\nvoid kfusion::cuda::DeviceMemory::create(size_t sizeBytes_arg) {\n    if (sizeBytes_arg == sizeBytes_)\n        return;\n\n    if (sizeBytes_arg > 0) {\n        if (data_)\n            release();\n\n        sizeBytes_ = sizeBytes_arg;\n\n        cudaSafeCall(cudaMalloc(&data_, sizeBytes_));\n\n        // refcount_ = (int*)cv::fastMalloc(sizeof(*refcount_));\n        refcount_  = new int;\n        *refcount_ = 1;\n    }\n}\n\nvoid kfusion::cuda::DeviceMemory::copyTo(DeviceMemory &other) const {\n    if (empty())\n        other.release();\n    else {\n        other.create(sizeBytes_);\n        cudaSafeCall(cudaMemcpy(other.data_, data_, sizeBytes_, cudaMemcpyDeviceToDevice));\n    }\n}\n\nvoid kfusion::cuda::DeviceMemory::release() {\n    if (refcount_ && CV_XADD(refcount_, -1) == 1) {\n        // cv::fastFree(refcount);\n        delete refcount_;\n        cudaSafeCall(cudaFree(data_));\n    }\n    data_      = 0;\n    sizeBytes_ = 0;\n    refcount_  = 0;\n}\n\nvoid kfusion::cuda::DeviceMemory::upload(const void *host_ptr_arg, size_t sizeBytes_arg) {\n    create(sizeBytes_arg);\n    cudaSafeCall(cudaMemcpy(data_, host_ptr_arg, sizeBytes_, cudaMemcpyHostToDevice));\n}\n\nvoid kfusion::cuda::DeviceMemory::download(void *host_ptr_arg) const {\n    cudaSafeCall(cudaMemcpy(host_ptr_arg, data_, sizeBytes_, cudaMemcpyDeviceToHost));\n}\n\nvoid kfusion::cuda::DeviceMemory::swap(DeviceMemory &other_arg) {\n    std::swap(data_, other_arg.data_);\n    std::swap(sizeBytes_, other_arg.sizeBytes_);\n    std::swap(refcount_, 
other_arg.refcount_);\n}\n\nbool kfusion::cuda::DeviceMemory::empty() const { return !data_; }\nsize_t kfusion::cuda::DeviceMemory::sizeBytes() const { return sizeBytes_; }\n\n////////////////////////    DeviceArray2D    /////////////////////////////\n\nkfusion::cuda::DeviceMemory2D::DeviceMemory2D() : data_(0), step_(0), colsBytes_(0), rows_(0), refcount_(0) {}\n\nkfusion::cuda::DeviceMemory2D::DeviceMemory2D(int rows_arg, int colsBytes_arg)\n    : data_(0), step_(0), colsBytes_(0), rows_(0), refcount_(0) {\n    create(rows_arg, colsBytes_arg);\n}\n\nkfusion::cuda::DeviceMemory2D::DeviceMemory2D(int rows_arg, int colsBytes_arg, void *data_arg, size_t step_arg)\n    : data_(data_arg), step_(step_arg), colsBytes_(colsBytes_arg), rows_(rows_arg), refcount_(0) {}\n\nkfusion::cuda::DeviceMemory2D::~DeviceMemory2D() { release(); }\n\nkfusion::cuda::DeviceMemory2D::DeviceMemory2D(const DeviceMemory2D &other_arg)\n    : data_(other_arg.data_),\n      step_(other_arg.step_),\n      colsBytes_(other_arg.colsBytes_),\n      rows_(other_arg.rows_),\n      refcount_(other_arg.refcount_) {\n    if (refcount_)\n        CV_XADD(refcount_, 1);\n}\n\nkfusion::cuda::DeviceMemory2D &kfusion::cuda::DeviceMemory2D::operator=(\n    const kfusion::cuda::DeviceMemory2D &other_arg) {\n    if (this != &other_arg) {\n        if (other_arg.refcount_)\n            CV_XADD(other_arg.refcount_, 1);\n        release();\n\n        colsBytes_ = other_arg.colsBytes_;\n        rows_      = other_arg.rows_;\n        data_      = other_arg.data_;\n        step_      = other_arg.step_;\n\n        refcount_ = other_arg.refcount_;\n    }\n    return *this;\n}\n\nvoid kfusion::cuda::DeviceMemory2D::create(int rows_arg, int colsBytes_arg) {\n    if (colsBytes_ == colsBytes_arg && rows_ == rows_arg)\n        return;\n\n    if (rows_arg > 0 && colsBytes_arg > 0) {\n        if (data_)\n            release();\n\n        colsBytes_ = colsBytes_arg;\n        rows_      = rows_arg;\n\n        
cudaSafeCall(cudaMallocPitch((void **) &data_, &step_, colsBytes_, rows_));\n\n        // refcount = (int*)cv::fastMalloc(sizeof(*refcount));\n        refcount_  = new int;\n        *refcount_ = 1;\n    }\n}\n\nvoid kfusion::cuda::DeviceMemory2D::release() {\n    if (refcount_ && CV_XADD(refcount_, -1) == 1) {\n        // cv::fastFree(refcount);\n        delete refcount_;\n        cudaSafeCall(cudaFree(data_));\n    }\n\n    colsBytes_ = 0;\n    rows_      = 0;\n    data_      = 0;\n    step_      = 0;\n    refcount_  = 0;\n}\n\nvoid kfusion::cuda::DeviceMemory2D::copyTo(DeviceMemory2D &other) const {\n    if (empty())\n        other.release();\n    else {\n        other.create(rows_, colsBytes_);\n        cudaSafeCall(cudaMemcpy2D(other.data_, other.step_, data_, step_, colsBytes_, rows_, cudaMemcpyDeviceToDevice));\n    }\n}\n\nvoid kfusion::cuda::DeviceMemory2D::upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg,\n                                           int colsBytes_arg) {\n    create(rows_arg, colsBytes_arg);\n    cudaSafeCall(cudaMemcpy2D(data_, step_, host_ptr_arg, host_step_arg, colsBytes_, rows_, cudaMemcpyHostToDevice));\n}\n\nvoid kfusion::cuda::DeviceMemory2D::download(void *host_ptr_arg, size_t host_step_arg) const {\n    cudaSafeCall(cudaMemcpy2D(host_ptr_arg, host_step_arg, data_, step_, colsBytes_, rows_, cudaMemcpyDeviceToHost));\n}\n\nvoid kfusion::cuda::DeviceMemory2D::swap(DeviceMemory2D &other_arg) {\n    std::swap(data_, other_arg.data_);\n    std::swap(step_, other_arg.step_);\n\n    std::swap(colsBytes_, other_arg.colsBytes_);\n    std::swap(rows_, other_arg.rows_);\n    std::swap(refcount_, other_arg.refcount_);\n}\n\nbool kfusion::cuda::DeviceMemory2D::empty() const { return !data_; }\nint kfusion::cuda::DeviceMemory2D::colsBytes() const { return colsBytes_; }\nint kfusion::cuda::DeviceMemory2D::rows() const { return rows_; }\nsize_t kfusion::cuda::DeviceMemory2D::step() const { return step_; }\n"
  },
  {
    "path": "src/kfusion/imgproc.cpp",
    "content": "#include <kfusion/precomp.hpp>\n\nvoid kfusion::cuda::depthBilateralFilter(const Depth &in, Depth &out, int kernel_size, float sigma_spatial,\n                                         float sigma_depth) {\n    out.create(in.rows(), in.cols());\n    device::bilateralFilter(in, out, kernel_size, sigma_spatial, sigma_depth);\n}\n\nvoid kfusion::cuda::depthTruncation(Depth &depth, float threshold) { device::truncateDepth(depth, threshold); }\n\nvoid kfusion::cuda::depthBuildPyramid(const Depth &depth, Depth &pyramid, float sigma_depth) {\n    pyramid.create(depth.rows() / 2, depth.cols() / 2);\n    device::depthPyr(depth, pyramid, sigma_depth);\n}\n\nvoid kfusion::cuda::waitAllDefaultStream() { cudaSafeCall(cudaDeviceSynchronize()); }\n\nvoid kfusion::cuda::computeNormalsAndMaskDepth(const Intr &intr, Depth &depth, Normals &normals) {\n    normals.create(depth.rows(), depth.cols());\n\n    device::Reprojector reproj(intr.fx, intr.fy, intr.cx, intr.cy);\n\n    device::Normals &n = (device::Normals &) normals;\n    device::computeNormalsAndMaskDepth(reproj, depth, n);\n}\n\nvoid kfusion::cuda::computePointNormals(const Intr &intr, const Depth &depth, Cloud &points, Normals &normals) {\n    points.create(depth.rows(), depth.cols());\n    normals.create(depth.rows(), depth.cols());\n\n    device::Reprojector reproj(intr.fx, intr.fy, intr.cx, intr.cy);\n\n    device::Points &p  = (device::Points &) points;\n    device::Normals &n = (device::Normals &) normals;\n    device::computePointNormals(reproj, depth, p, n);\n}\n\nvoid kfusion::cuda::computeDists(const Depth &depth, Dists &dists, const Intr &intr) {\n    dists.create(depth.rows(), depth.cols());\n    device::compute_dists(depth, dists, make_float2(intr.fx, intr.fy), make_float2(intr.cx, intr.cy));\n}\n\nvoid kfusion::cuda::resizeDepthNormals(const Depth &depth, const Normals &normals, Depth &depth_out,\n                                       Normals &normals_out) {\n    depth_out.create(depth.rows() / 
2, depth.cols() / 2);\n    normals_out.create(normals.rows() / 2, normals.cols() / 2);\n\n    device::Normals &nsrc = (device::Normals &) normals;\n    device::Normals &ndst = (device::Normals &) normals_out;\n\n    device::resizeDepthNormals(depth, nsrc, depth_out, ndst);\n}\n\nvoid kfusion::cuda::resizePointsNormals(const Cloud &points, const Normals &normals, Cloud &points_out,\n                                        Normals &normals_out) {\n    points_out.create(points.rows() / 2, points.cols() / 2);\n    normals_out.create(normals.rows() / 2, normals.cols() / 2);\n\n    device::Points &pi  = (device::Points &) points;\n    device::Normals &ni = (device::Normals &) normals;\n\n    device::Points &po  = (device::Points &) points_out;\n    device::Normals &no = (device::Normals &) normals_out;\n\n    device::resizePointsNormals(pi, ni, po, no);\n}\n\nvoid kfusion::cuda::rasteriseSurface(const Intr &intr, const Affine3f &vol2cam, const Surface &s, Cloud &vertices_out,\n                                     Normals &normals_out) {\n    device::Projector proj(intr.fx, intr.fy, intr.cx, intr.cy);\n    device::Aff3f dev_vol2cam = device_cast<device::Aff3f>(vol2cam);\n\n    /* convert warped triangle vertices & normals to point clouds */\n    pcl::PointCloud<pcl::PointXYZ> vertices_warped;\n    s.vertices.download(vertices_warped.points);\n\n    pcl::PointCloud<pcl::Normal> normals_warped;\n    s.normals.download(normals_warped.points);\n\n    /* lay out the warped triangle vertices and normals in memory */\n    std::vector<pcl::PointXYZ>\n        triangle_vertices; /* the index of each entry in the vector, modulo 3, stores the 1st, 2nd, and 3rd vertex of a\n                             triangle */\n    std::vector<pcl::Normal> triangle_normals;\n    for (size_t i = 0; i < vertices_warped.size() / 3; i++) {\n        triangle_vertices.emplace_back(vertices_warped[i * 3 + 0]);\n        triangle_vertices.emplace_back(vertices_warped[i * 3 + 1]);\n        
triangle_vertices.emplace_back(vertices_warped[i * 3 + 2]);\n\n        triangle_normals.emplace_back(normals_warped[i * 3 + 0]);\n        triangle_normals.emplace_back(normals_warped[i * 3 + 1]);\n        triangle_normals.emplace_back(normals_warped[i * 3 + 2]);\n    }\n\n    kfusion::cuda::Vertices vertices;\n    vertices.upload(triangle_vertices);\n\n    kfusion::cuda::Norms normals;\n    normals.upload(triangle_normals);\n\n    kfusion::cuda::Surface surface;\n    surface.vertices = vertices;\n    surface.normals  = normals;\n\n    /* convert to classes that can be used in cuda */\n    const device::Surface &dev_surface = (const device::Surface &) surface;\n    device::Points &vo                 = (device::Points &) vertices_out;\n    device::Normals &no                = (device::Normals &) normals_out;\n\n    device::rasteriseSurface(proj, dev_vol2cam, dev_surface, vo, no);\n    waitAllDefaultStream();\n}\n"
  },
  {
    "path": "src/kfusion/kinfu.cpp",
    "content": "#include <kfusion/internal.hpp>\n#include <kfusion/precomp.hpp>\n\nusing namespace std;\nusing namespace kfusion;\nusing namespace kfusion::cuda;\n\nstatic inline float deg2rad(float alpha) { return alpha * 0.017453293f; }\n\nkfusion::KinFuParams kfusion::KinFuParams::default_params() {\n    const int iters[] = {10, 5, 4, 0};\n    const int levels  = sizeof(iters) / sizeof(iters[0]);\n\n    KinFuParams p;\n\n    p.cols = 640;  // pixels\n    p.rows = 480;  // pixels\n    p.intr = Intr(525.f, 525.f, p.cols / 2 - 0.5f, p.rows / 2 - 0.5f);\n\n    p.volume_dims = Vec3i::all(512);  // number of voxels\n    p.volume_size = Vec3f::all(3.f);  // meters\n    p.volume_pose = Affine3f().translate(Vec3f(-p.volume_size[0] / 2, -p.volume_size[1] / 2, 0.5f));\n\n    p.bilateral_sigma_depth   = 0.04f;  // meter\n    p.bilateral_sigma_spatial = 4.5;    // pixels\n    p.bilateral_kernel_size   = 7;      // pixels\n\n    p.icp_truncate_depth_dist = 0.f;            // meters, disabled\n    p.icp_dist_thres          = 0.1f;           // meters\n    p.icp_angle_thres         = deg2rad(30.f);  // radians\n    p.icp_iter_num.assign(iters, iters + levels);\n\n    p.tsdf_min_camera_movement = 0.f;    // meters, disabled\n    p.tsdf_trunc_dist          = 0.04f;  // meters;\n    p.tsdf_max_weight          = 64;     // frames\n\n    p.raycast_step_factor   = 0.75f;  // in voxel sizes\n    p.gradient_delta_factor = 0.5f;   // in voxel sizes\n\n    // p.light_pose = p.volume_pose.translation()/4; //meters\n    p.light_pose = Vec3f::all(0.f);  // meters\n\n    return p;\n}\n\nconst kfusion::KinFuParams &kfusion::KinFu::params() const { return params_; }\n\nkfusion::KinFuParams &kfusion::KinFu::params() { return params_; }\n\nconst kfusion::cuda::TsdfVolume &kfusion::KinFu::tsdf() const { return *volume_; }\n\nkfusion::cuda::TsdfVolume &kfusion::KinFu::tsdf() { return *volume_; }\n\nconst kfusion::cuda::ProjectiveICP &kfusion::KinFu::icp() const { return *icp_; 
}\n\nkfusion::cuda::ProjectiveICP &kfusion::KinFu::icp() { return *icp_; }\n\nconst kfusion::cuda::MarchingCubes &kfusion::KinFu::mc() const { return *mc_; }\n\nkfusion::cuda::MarchingCubes &kfusion::KinFu::mc() { return *mc_; }\n\nvoid kfusion::KinFu::allocate_buffers() {\n    const int LEVELS = cuda::ProjectiveICP::MAX_PYRAMID_LEVELS;\n\n    int cols = params_.cols;\n    int rows = params_.rows;\n\n    dists_.create(rows, cols);\n\n    curr_.depth_pyr.resize(LEVELS);\n    curr_.normals_pyr.resize(LEVELS);\n\n    prev_.depth_pyr.resize(LEVELS);\n    prev_.normals_pyr.resize(LEVELS);\n\n    curr_.points_pyr.resize(LEVELS);\n    prev_.points_pyr.resize(LEVELS);\n\n    for (int i = 0; i < LEVELS; ++i) {\n        curr_.depth_pyr[i].create(rows, cols);\n        curr_.normals_pyr[i].create(rows, cols);\n\n        prev_.depth_pyr[i].create(rows, cols);\n        prev_.normals_pyr[i].create(rows, cols);\n\n        curr_.points_pyr[i].create(rows, cols);\n        prev_.points_pyr[i].create(rows, cols);\n\n        cols /= 2;\n        rows /= 2;\n    }\n\n    depths_.create(params_.rows, params_.cols);\n    normals_.create(params_.rows, params_.cols);\n    points_.create(params_.rows, params_.cols);\n}\n\nvoid kfusion::KinFu::reset() {\n    if (frame_counter_)\n        cout << \"Reset\" << endl;\n\n    frame_counter_ = 0;\n    poses_.clear();\n    poses_.reserve(30000);\n    poses_.push_back(Affine3f::Identity());\n    volume_->clear();\n}\n\nkfusion::Affine3f kfusion::KinFu::getCameraPose(int time) const {\n    if ((time > static_cast<int>(poses_.size())) || (time < 0)) {\n        time = static_cast<int>(poses_.size()) - 1;\n    }\n\n    return poses_[time];\n}\n"
  },
  {
    "path": "src/kfusion/marching_cubes.cpp",
    "content": "/* kinfu includes */\n#include <kfusion/cuda/marching_cubes.hpp>\n#include <kfusion/internal.hpp>\n#include <kfusion/kinfu.hpp>\n#include <kfusion/precomp.hpp>\n\n/* sys headers */\n#include <iostream>\n\nextern const int edgeTable[256];\nextern const int triTable[256][16];\nextern const int numVertsTable[256];\n\nkfusion::cuda::MarchingCubes::MarchingCubes() {\n    edgeTable_.upload(edgeTable, 256);\n    numVertsTable_.upload(numVertsTable, 256);\n    triTable_.upload(&triTable[0][0], 256 * 16);\n}\n\nkfusion::cuda::MarchingCubes::~MarchingCubes() = default;\n\nvoid kfusion::cuda::MarchingCubes::setPose(const cv::Affine3f& pose_) { this->pose = pose_; }\n\nkfusion::cuda::Surface kfusion::cuda::MarchingCubes::run(const kfusion::cuda::TsdfVolume& volume,\n                                                         kfusion::cuda::DeviceArray<pcl::PointXYZ>& vertices_buffer,\n                                                         kfusion::cuda::DeviceArray<pcl::Normal>& normals_buffer) {\n    if (vertices_buffer.empty()) {\n        vertices_buffer.create(DEFAULT_TRIANGLES_BUFFER_SIZE);\n    }\n    if (normals_buffer.empty()) {\n        normals_buffer.create(DEFAULT_TRIANGLES_BUFFER_SIZE);\n    }\n\n    occupied_voxels_buffer_.create(3, static_cast<int>(vertices_buffer.size() / 3));\n\n    kfusion::device::bindTextures(edgeTable_, triTable_, numVertsTable_);\n\n    int3 dims         = device_cast<int3>(volume.getDims());\n    float3 voxel_size = device_cast<float3>(volume.getVoxelSize());\n    float trunc_dist  = volume.getTruncDist();\n    float eta         = volume.getEta();\n    float max_weight  = volume.getMaxWeight();\n\n    kfusion::device::TsdfVolume vol(volume.data().ptr<float2>(), dims, voxel_size, trunc_dist, eta, max_weight);\n\n    int active_voxels = kfusion::device::getOccupiedVoxels(vol, occupied_voxels_buffer_);\n    std::cout << \"no. 
of active voxels: \" << active_voxels << std::endl;\n\n    if (!active_voxels) {\n        kfusion::device::unbindTextures();\n        return kfusion::cuda::Surface();\n    }\n\n    kfusion::cuda::DeviceArray2D<int> occupied_voxels(3, active_voxels, occupied_voxels_buffer_.ptr(),\n                                                      occupied_voxels_buffer_.step());\n\n    int total_vertices = kfusion::device::computeOffsetsAndTotalVertices(occupied_voxels);\n\n    float3 volume_size              = device_cast<float3>(volume.getSize());\n    const device::Aff3f pose_device = device_cast<device::Aff3f>(pose);\n\n    kfusion::device::generateTriangles(vol, occupied_voxels, volume_size, pose_device,\n                                       (kfusion::cuda::DeviceArray<kfusion::device::PointType>&) vertices_buffer,\n                                       (kfusion::cuda::DeviceArray<kfusion::device::PointType>&) normals_buffer);\n\n    kfusion::device::unbindTextures();\n\n    kfusion::cuda::DeviceArray<pcl::PointXYZ> v(vertices_buffer.ptr(), total_vertices);\n    kfusion::cuda::DeviceArray<pcl::Normal> n(normals_buffer.ptr(), total_vertices);\n\n    kfusion::cuda::Surface surface;\n    surface.vertices = v;\n    surface.normals  = n;\n\n    return surface;\n}\n\n/* edge table maps 8-bit flag representing which cube vertices are inside the isosurface to 12-bit number indicating\n * which edges are intersected */\nconst int edgeTable[256] = {\n    0x0,   0x109, 0x203, 0x30a, 0x406, 0x50f, 0x605, 0x70c, 0x80c, 0x905, 0xa0f, 0xb06, 0xc0a, 0xd03, 0xe09, 0xf00,\n    0x190, 0x99,  0x393, 0x29a, 0x596, 0x49f, 0x795, 0x69c, 0x99c, 0x895, 0xb9f, 0xa96, 0xd9a, 0xc93, 0xf99, 0xe90,\n    0x230, 0x339, 0x33,  0x13a, 0x636, 0x73f, 0x435, 0x53c, 0xa3c, 0xb35, 0x83f, 0x936, 0xe3a, 0xf33, 0xc39, 0xd30,\n    0x3a0, 0x2a9, 0x1a3, 0xaa,  0x7a6, 0x6af, 0x5a5, 0x4ac, 0xbac, 0xaa5, 0x9af, 0x8a6, 0xfaa, 0xea3, 0xda9, 0xca0,\n    0x460, 0x569, 0x663, 0x76a, 0x66,  0x16f, 0x265, 0x36c, 0xc6c, 
0xd65, 0xe6f, 0xf66, 0x86a, 0x963, 0xa69, 0xb60,\n    0x5f0, 0x4f9, 0x7f3, 0x6fa, 0x1f6, 0xff,  0x3f5, 0x2fc, 0xdfc, 0xcf5, 0xfff, 0xef6, 0x9fa, 0x8f3, 0xbf9, 0xaf0,\n    0x650, 0x759, 0x453, 0x55a, 0x256, 0x35f, 0x55,  0x15c, 0xe5c, 0xf55, 0xc5f, 0xd56, 0xa5a, 0xb53, 0x859, 0x950,\n    0x7c0, 0x6c9, 0x5c3, 0x4ca, 0x3c6, 0x2cf, 0x1c5, 0xcc,  0xfcc, 0xec5, 0xdcf, 0xcc6, 0xbca, 0xac3, 0x9c9, 0x8c0,\n    0x8c0, 0x9c9, 0xac3, 0xbca, 0xcc6, 0xdcf, 0xec5, 0xfcc, 0xcc,  0x1c5, 0x2cf, 0x3c6, 0x4ca, 0x5c3, 0x6c9, 0x7c0,\n    0x950, 0x859, 0xb53, 0xa5a, 0xd56, 0xc5f, 0xf55, 0xe5c, 0x15c, 0x55,  0x35f, 0x256, 0x55a, 0x453, 0x759, 0x650,\n    0xaf0, 0xbf9, 0x8f3, 0x9fa, 0xef6, 0xfff, 0xcf5, 0xdfc, 0x2fc, 0x3f5, 0xff,  0x1f6, 0x6fa, 0x7f3, 0x4f9, 0x5f0,\n    0xb60, 0xa69, 0x963, 0x86a, 0xf66, 0xe6f, 0xd65, 0xc6c, 0x36c, 0x265, 0x16f, 0x66,  0x76a, 0x663, 0x569, 0x460,\n    0xca0, 0xda9, 0xea3, 0xfaa, 0x8a6, 0x9af, 0xaa5, 0xbac, 0x4ac, 0x5a5, 0x6af, 0x7a6, 0xaa,  0x1a3, 0x2a9, 0x3a0,\n    0xd30, 0xc39, 0xf33, 0xe3a, 0x936, 0x83f, 0xb35, 0xa3c, 0x53c, 0x435, 0x73f, 0x636, 0x13a, 0x33,  0x339, 0x230,\n    0xe90, 0xf99, 0xc93, 0xd9a, 0xa96, 0xb9f, 0x895, 0x99c, 0x69c, 0x795, 0x49f, 0x596, 0x29a, 0x393, 0x99,  0x190,\n    0xf00, 0xe09, 0xd03, 0xc0a, 0xb06, 0xa0f, 0x905, 0x80c, 0x70c, 0x605, 0x50f, 0x406, 0x30a, 0x203, 0x109, 0x0};\n\n/* triangle table maps same cube vertex index to a list of up to 5 triangles which are built from the interpolated edge\n * vertices */\nconst int triTable[256][16] = {{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 1, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 8, 3, 9, 8, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 2, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 8, 3, 1, 2, 
10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 2, 10, 0, 2, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {2, 8, 3, 2, 10, 8, 10, 9, 8, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 11, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 11, 2, 8, 11, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 9, 0, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 11, 2, 1, 9, 11, 9, 8, 11, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 10, 1, 11, 10, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 10, 1, 0, 8, 10, 8, 11, 10, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 9, 0, 3, 11, 9, 11, 10, 9, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 8, 10, 10, 8, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {4, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {4, 3, 0, 7, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 1, 9, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {4, 1, 9, 4, 7, 1, 7, 3, 1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 2, 10, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 4, 7, 3, 0, 4, 1, 2, 10, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 2, 10, 9, 0, 2, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1},\n                               {2, 10, 9, 2, 9, 7, 2, 7, 3, 7, 9, 4, -1, -1, -1, -1},\n                               {8, 4, 7, 3, 11, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {11, 4, 7, 11, 2, 4, 2, 0, 4, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 0, 1, 8, 4, 7, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1},\n                  
             {4, 7, 11, 9, 4, 11, 9, 11, 2, 9, 2, 1, -1, -1, -1, -1},\n                               {3, 10, 1, 3, 11, 10, 7, 8, 4, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 11, 10, 1, 4, 11, 1, 0, 4, 7, 11, 4, -1, -1, -1, -1},\n                               {4, 7, 8, 9, 0, 11, 9, 11, 10, 11, 0, 3, -1, -1, -1, -1},\n                               {4, 7, 11, 4, 11, 9, 9, 11, 10, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 5, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 5, 4, 0, 8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 5, 4, 1, 5, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {8, 5, 4, 8, 3, 5, 3, 1, 5, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 2, 10, 9, 5, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 0, 8, 1, 2, 10, 4, 9, 5, -1, -1, -1, -1, -1, -1, -1},\n                               {5, 2, 10, 5, 4, 2, 4, 0, 2, -1, -1, -1, -1, -1, -1, -1},\n                               {2, 10, 5, 3, 2, 5, 3, 5, 4, 3, 4, 8, -1, -1, -1, -1},\n                               {9, 5, 4, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 11, 2, 0, 8, 11, 4, 9, 5, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 5, 4, 0, 1, 5, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1},\n                               {2, 1, 5, 2, 5, 8, 2, 8, 11, 4, 8, 5, -1, -1, -1, -1},\n                               {10, 3, 11, 10, 1, 3, 9, 5, 4, -1, -1, -1, -1, -1, -1, -1},\n                               {4, 9, 5, 0, 8, 1, 8, 10, 1, 8, 11, 10, -1, -1, -1, -1},\n                               {5, 4, 0, 5, 0, 11, 5, 11, 10, 11, 0, 3, -1, -1, -1, -1},\n                               {5, 4, 8, 5, 8, 10, 10, 8, 11, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 7, 8, 5, 7, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                    
           {9, 3, 0, 9, 5, 3, 5, 7, 3, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 7, 8, 0, 1, 7, 1, 5, 7, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 5, 3, 3, 5, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 7, 8, 9, 5, 7, 10, 1, 2, -1, -1, -1, -1, -1, -1, -1},\n                               {10, 1, 2, 9, 5, 0, 5, 3, 0, 5, 7, 3, -1, -1, -1, -1},\n                               {8, 0, 2, 8, 2, 5, 8, 5, 7, 10, 5, 2, -1, -1, -1, -1},\n                               {2, 10, 5, 2, 5, 3, 3, 5, 7, -1, -1, -1, -1, -1, -1, -1},\n                               {7, 9, 5, 7, 8, 9, 3, 11, 2, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 5, 7, 9, 7, 2, 9, 2, 0, 2, 7, 11, -1, -1, -1, -1},\n                               {2, 3, 11, 0, 1, 8, 1, 7, 8, 1, 5, 7, -1, -1, -1, -1},\n                               {11, 2, 1, 11, 1, 7, 7, 1, 5, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 5, 8, 8, 5, 7, 10, 1, 3, 10, 3, 11, -1, -1, -1, -1},\n                               {5, 7, 0, 5, 0, 9, 7, 11, 0, 1, 0, 10, 11, 10, 0, -1},\n                               {11, 10, 0, 11, 0, 3, 10, 5, 0, 8, 0, 7, 5, 7, 0, -1},\n                               {11, 10, 5, 7, 11, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {10, 6, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 8, 3, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 0, 1, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 8, 3, 1, 9, 8, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 6, 5, 2, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 6, 5, 1, 2, 6, 3, 0, 8, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 6, 5, 9, 0, 6, 0, 2, 6, -1, -1, -1, -1, -1, -1, -1},\n                               {5, 9, 8, 
5, 8, 2, 5, 2, 6, 3, 2, 8, -1, -1, -1, -1},\n                               {2, 3, 11, 10, 6, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {11, 0, 8, 11, 2, 0, 10, 6, 5, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 1, 9, 2, 3, 11, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1},\n                               {5, 10, 6, 1, 9, 2, 9, 11, 2, 9, 8, 11, -1, -1, -1, -1},\n                               {6, 3, 11, 6, 5, 3, 5, 1, 3, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 8, 11, 0, 11, 5, 0, 5, 1, 5, 11, 6, -1, -1, -1, -1},\n                               {3, 11, 6, 0, 3, 6, 0, 6, 5, 0, 5, 9, -1, -1, -1, -1},\n                               {6, 5, 9, 6, 9, 11, 11, 9, 8, -1, -1, -1, -1, -1, -1, -1},\n                               {5, 10, 6, 4, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {4, 3, 0, 4, 7, 3, 6, 5, 10, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 9, 0, 5, 10, 6, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1},\n                               {10, 6, 5, 1, 9, 7, 1, 7, 3, 7, 9, 4, -1, -1, -1, -1},\n                               {6, 1, 2, 6, 5, 1, 4, 7, 8, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 2, 5, 5, 2, 6, 3, 0, 4, 3, 4, 7, -1, -1, -1, -1},\n                               {8, 4, 7, 9, 0, 5, 0, 6, 5, 0, 2, 6, -1, -1, -1, -1},\n                               {7, 3, 9, 7, 9, 4, 3, 2, 9, 5, 9, 6, 2, 6, 9, -1},\n                               {3, 11, 2, 7, 8, 4, 10, 6, 5, -1, -1, -1, -1, -1, -1, -1},\n                               {5, 10, 6, 4, 7, 2, 4, 2, 0, 2, 7, 11, -1, -1, -1, -1},\n                               {0, 1, 9, 4, 7, 8, 2, 3, 11, 5, 10, 6, -1, -1, -1, -1},\n                               {9, 2, 1, 9, 11, 2, 9, 4, 11, 7, 11, 4, 5, 10, 6, -1},\n                               {8, 4, 7, 3, 11, 5, 3, 5, 1, 5, 11, 6, -1, -1, -1, -1},\n                               {5, 1, 11, 5, 11, 6, 1, 0, 11, 7, 11, 4, 0, 4, 11, 
-1},\n                               {0, 5, 9, 0, 6, 5, 0, 3, 6, 11, 6, 3, 8, 4, 7, -1},\n                               {6, 5, 9, 6, 9, 11, 4, 7, 9, 7, 11, 9, -1, -1, -1, -1},\n                               {10, 4, 9, 6, 4, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {4, 10, 6, 4, 9, 10, 0, 8, 3, -1, -1, -1, -1, -1, -1, -1},\n                               {10, 0, 1, 10, 6, 0, 6, 4, 0, -1, -1, -1, -1, -1, -1, -1},\n                               {8, 3, 1, 8, 1, 6, 8, 6, 4, 6, 1, 10, -1, -1, -1, -1},\n                               {1, 4, 9, 1, 2, 4, 2, 6, 4, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 0, 8, 1, 2, 9, 2, 4, 9, 2, 6, 4, -1, -1, -1, -1},\n                               {0, 2, 4, 4, 2, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {8, 3, 2, 8, 2, 4, 4, 2, 6, -1, -1, -1, -1, -1, -1, -1},\n                               {10, 4, 9, 10, 6, 4, 11, 2, 3, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 8, 2, 2, 8, 11, 4, 9, 10, 4, 10, 6, -1, -1, -1, -1},\n                               {3, 11, 2, 0, 1, 6, 0, 6, 4, 6, 1, 10, -1, -1, -1, -1},\n                               {6, 4, 1, 6, 1, 10, 4, 8, 1, 2, 1, 11, 8, 11, 1, -1},\n                               {9, 6, 4, 9, 3, 6, 9, 1, 3, 11, 6, 3, -1, -1, -1, -1},\n                               {8, 11, 1, 8, 1, 0, 11, 6, 1, 9, 1, 4, 6, 4, 1, -1},\n                               {3, 11, 6, 3, 6, 0, 0, 6, 4, -1, -1, -1, -1, -1, -1, -1},\n                               {6, 4, 8, 11, 6, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {7, 10, 6, 7, 8, 10, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 7, 3, 0, 10, 7, 0, 9, 10, 6, 7, 10, -1, -1, -1, -1},\n                               {10, 6, 7, 1, 10, 7, 1, 7, 8, 1, 8, 0, -1, -1, -1, -1},\n                               {10, 6, 7, 10, 7, 1, 1, 7, 3, -1, -1, -1, -1, -1, -1, -1},\n                               
{1, 2, 6, 1, 6, 8, 1, 8, 9, 8, 6, 7, -1, -1, -1, -1},\n                               {2, 6, 9, 2, 9, 1, 6, 7, 9, 0, 9, 3, 7, 3, 9, -1},\n                               {7, 8, 0, 7, 0, 6, 6, 0, 2, -1, -1, -1, -1, -1, -1, -1},\n                               {7, 3, 2, 6, 7, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {2, 3, 11, 10, 6, 8, 10, 8, 9, 8, 6, 7, -1, -1, -1, -1},\n                               {2, 0, 7, 2, 7, 11, 0, 9, 7, 6, 7, 10, 9, 10, 7, -1},\n                               {1, 8, 0, 1, 7, 8, 1, 10, 7, 6, 7, 10, 2, 3, 11, -1},\n                               {11, 2, 1, 11, 1, 7, 10, 6, 1, 6, 7, 1, -1, -1, -1, -1},\n                               {8, 9, 6, 8, 6, 7, 9, 1, 6, 11, 6, 3, 1, 3, 6, -1},\n                               {0, 9, 1, 11, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {7, 8, 0, 7, 0, 6, 3, 11, 0, 11, 6, 0, -1, -1, -1, -1},\n                               {7, 11, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {7, 6, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 0, 8, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 1, 9, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {8, 1, 9, 8, 3, 1, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1},\n                               {10, 1, 2, 6, 11, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 2, 10, 3, 0, 8, 6, 11, 7, -1, -1, -1, -1, -1, -1, -1},\n                               {2, 9, 0, 2, 10, 9, 6, 11, 7, -1, -1, -1, -1, -1, -1, -1},\n                               {6, 11, 7, 2, 10, 3, 10, 8, 3, 10, 9, 8, -1, -1, -1, -1},\n                               {7, 2, 3, 6, 2, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {7, 0, 8, 7, 6, 0, 6, 2, 0, -1, -1, -1, -1, -1, -1, -1},\n                               {2, 7, 6, 2, 3, 7, 
0, 1, 9, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 6, 2, 1, 8, 6, 1, 9, 8, 8, 7, 6, -1, -1, -1, -1},\n                               {10, 7, 6, 10, 1, 7, 1, 3, 7, -1, -1, -1, -1, -1, -1, -1},\n                               {10, 7, 6, 1, 7, 10, 1, 8, 7, 1, 0, 8, -1, -1, -1, -1},\n                               {0, 3, 7, 0, 7, 10, 0, 10, 9, 6, 10, 7, -1, -1, -1, -1},\n                               {7, 6, 10, 7, 10, 8, 8, 10, 9, -1, -1, -1, -1, -1, -1, -1},\n                               {6, 8, 4, 11, 8, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 6, 11, 3, 0, 6, 0, 4, 6, -1, -1, -1, -1, -1, -1, -1},\n                               {8, 6, 11, 8, 4, 6, 9, 0, 1, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 4, 6, 9, 6, 3, 9, 3, 1, 11, 3, 6, -1, -1, -1, -1},\n                               {6, 8, 4, 6, 11, 8, 2, 10, 1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 2, 10, 3, 0, 11, 0, 6, 11, 0, 4, 6, -1, -1, -1, -1},\n                               {4, 11, 8, 4, 6, 11, 0, 2, 9, 2, 10, 9, -1, -1, -1, -1},\n                               {10, 9, 3, 10, 3, 2, 9, 4, 3, 11, 3, 6, 4, 6, 3, -1},\n                               {8, 2, 3, 8, 4, 2, 4, 6, 2, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 4, 2, 4, 6, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 9, 0, 2, 3, 4, 2, 4, 6, 4, 3, 8, -1, -1, -1, -1},\n                               {1, 9, 4, 1, 4, 2, 2, 4, 6, -1, -1, -1, -1, -1, -1, -1},\n                               {8, 1, 3, 8, 6, 1, 8, 4, 6, 6, 10, 1, -1, -1, -1, -1},\n                               {10, 1, 0, 10, 0, 6, 6, 0, 4, -1, -1, -1, -1, -1, -1, -1},\n                               {4, 6, 3, 4, 3, 8, 6, 10, 3, 0, 3, 9, 10, 9, 3, -1},\n                               {10, 9, 4, 6, 10, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {4, 9, 5, 7, 6, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1},\n                               {0, 8, 3, 4, 9, 5, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1},\n                               {5, 0, 1, 5, 4, 0, 7, 6, 11, -1, -1, -1, -1, -1, -1, -1},\n                               {11, 7, 6, 8, 3, 4, 3, 5, 4, 3, 1, 5, -1, -1, -1, -1},\n                               {9, 5, 4, 10, 1, 2, 7, 6, 11, -1, -1, -1, -1, -1, -1, -1},\n                               {6, 11, 7, 1, 2, 10, 0, 8, 3, 4, 9, 5, -1, -1, -1, -1},\n                               {7, 6, 11, 5, 4, 10, 4, 2, 10, 4, 0, 2, -1, -1, -1, -1},\n                               {3, 4, 8, 3, 5, 4, 3, 2, 5, 10, 5, 2, 11, 7, 6, -1},\n                               {7, 2, 3, 7, 6, 2, 5, 4, 9, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 5, 4, 0, 8, 6, 0, 6, 2, 6, 8, 7, -1, -1, -1, -1},\n                               {3, 6, 2, 3, 7, 6, 1, 5, 0, 5, 4, 0, -1, -1, -1, -1},\n                               {6, 2, 8, 6, 8, 7, 2, 1, 8, 4, 8, 5, 1, 5, 8, -1},\n                               {9, 5, 4, 10, 1, 6, 1, 7, 6, 1, 3, 7, -1, -1, -1, -1},\n                               {1, 6, 10, 1, 7, 6, 1, 0, 7, 8, 7, 0, 9, 5, 4, -1},\n                               {4, 0, 10, 4, 10, 5, 0, 3, 10, 6, 10, 7, 3, 7, 10, -1},\n                               {7, 6, 10, 7, 10, 8, 5, 4, 10, 4, 8, 10, -1, -1, -1, -1},\n                               {6, 9, 5, 6, 11, 9, 11, 8, 9, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 6, 11, 0, 6, 3, 0, 5, 6, 0, 9, 5, -1, -1, -1, -1},\n                               {0, 11, 8, 0, 5, 11, 0, 1, 5, 5, 6, 11, -1, -1, -1, -1},\n                               {6, 11, 3, 6, 3, 5, 5, 3, 1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 2, 10, 9, 5, 11, 9, 11, 8, 11, 5, 6, -1, -1, -1, -1},\n                               {0, 11, 3, 0, 6, 11, 0, 9, 6, 5, 6, 9, 1, 2, 10, -1},\n                               {11, 8, 5, 11, 5, 6, 8, 0, 5, 10, 5, 2, 0, 2, 5, -1},\n                               {6, 11, 3, 6, 3, 5, 2, 10, 3, 
10, 5, 3, -1, -1, -1, -1},\n                               {5, 8, 9, 5, 2, 8, 5, 6, 2, 3, 8, 2, -1, -1, -1, -1},\n                               {9, 5, 6, 9, 6, 0, 0, 6, 2, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 5, 8, 1, 8, 0, 5, 6, 8, 3, 8, 2, 6, 2, 8, -1},\n                               {1, 5, 6, 2, 1, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 3, 6, 1, 6, 10, 3, 8, 6, 5, 6, 9, 8, 9, 6, -1},\n                               {10, 1, 0, 10, 0, 6, 9, 5, 0, 5, 6, 0, -1, -1, -1, -1},\n                               {0, 3, 8, 5, 6, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {10, 5, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {11, 5, 10, 7, 5, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {11, 5, 10, 11, 7, 5, 8, 3, 0, -1, -1, -1, -1, -1, -1, -1},\n                               {5, 11, 7, 5, 10, 11, 1, 9, 0, -1, -1, -1, -1, -1, -1, -1},\n                               {10, 7, 5, 10, 11, 7, 9, 8, 1, 8, 3, 1, -1, -1, -1, -1},\n                               {11, 1, 2, 11, 7, 1, 7, 5, 1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 8, 3, 1, 2, 7, 1, 7, 5, 7, 2, 11, -1, -1, -1, -1},\n                               {9, 7, 5, 9, 2, 7, 9, 0, 2, 2, 11, 7, -1, -1, -1, -1},\n                               {7, 5, 2, 7, 2, 11, 5, 9, 2, 3, 2, 8, 9, 8, 2, -1},\n                               {2, 5, 10, 2, 3, 5, 3, 7, 5, -1, -1, -1, -1, -1, -1, -1},\n                               {8, 2, 0, 8, 5, 2, 8, 7, 5, 10, 2, 5, -1, -1, -1, -1},\n                               {9, 0, 1, 5, 10, 3, 5, 3, 7, 3, 10, 2, -1, -1, -1, -1},\n                               {9, 8, 2, 9, 2, 1, 8, 7, 2, 10, 2, 5, 7, 5, 2, -1},\n                               {1, 3, 5, 3, 7, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 8, 7, 0, 7, 1, 1, 7, 5, -1, -1, -1, -1, -1, -1, -1},\n              
                 {9, 0, 3, 9, 3, 5, 5, 3, 7, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 8, 7, 5, 9, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {5, 8, 4, 5, 10, 8, 10, 11, 8, -1, -1, -1, -1, -1, -1, -1},\n                               {5, 0, 4, 5, 11, 0, 5, 10, 11, 11, 3, 0, -1, -1, -1, -1},\n                               {0, 1, 9, 8, 4, 10, 8, 10, 11, 10, 4, 5, -1, -1, -1, -1},\n                               {10, 11, 4, 10, 4, 5, 11, 3, 4, 9, 4, 1, 3, 1, 4, -1},\n                               {2, 5, 1, 2, 8, 5, 2, 11, 8, 4, 5, 8, -1, -1, -1, -1},\n                               {0, 4, 11, 0, 11, 3, 4, 5, 11, 2, 11, 1, 5, 1, 11, -1},\n                               {0, 2, 5, 0, 5, 9, 2, 11, 5, 4, 5, 8, 11, 8, 5, -1},\n                               {9, 4, 5, 2, 11, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {2, 5, 10, 3, 5, 2, 3, 4, 5, 3, 8, 4, -1, -1, -1, -1},\n                               {5, 10, 2, 5, 2, 4, 4, 2, 0, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 10, 2, 3, 5, 10, 3, 8, 5, 4, 5, 8, 0, 1, 9, -1},\n                               {5, 10, 2, 5, 2, 4, 1, 9, 2, 9, 4, 2, -1, -1, -1, -1},\n                               {8, 4, 5, 8, 5, 3, 3, 5, 1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 4, 5, 1, 0, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {8, 4, 5, 8, 5, 3, 9, 0, 5, 0, 3, 5, -1, -1, -1, -1},\n                               {9, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {4, 11, 7, 4, 9, 11, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 8, 3, 4, 9, 7, 9, 11, 7, 9, 10, 11, -1, -1, -1, -1},\n                               {1, 10, 11, 1, 11, 4, 1, 4, 0, 7, 4, 11, -1, -1, -1, -1},\n                               {3, 1, 4, 3, 4, 8, 1, 10, 4, 7, 4, 11, 10, 11, 4, -1},\n                               {4, 11, 7, 9, 11, 
4, 9, 2, 11, 9, 1, 2, -1, -1, -1, -1},\n                               {9, 7, 4, 9, 11, 7, 9, 1, 11, 2, 11, 1, 0, 8, 3, -1},\n                               {11, 7, 4, 11, 4, 2, 2, 4, 0, -1, -1, -1, -1, -1, -1, -1},\n                               {11, 7, 4, 11, 4, 2, 8, 3, 4, 3, 2, 4, -1, -1, -1, -1},\n                               {2, 9, 10, 2, 7, 9, 2, 3, 7, 7, 4, 9, -1, -1, -1, -1},\n                               {9, 10, 7, 9, 7, 4, 10, 2, 7, 8, 7, 0, 2, 0, 7, -1},\n                               {3, 7, 10, 3, 10, 2, 7, 4, 10, 1, 10, 0, 4, 0, 10, -1},\n                               {1, 10, 2, 8, 7, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {4, 9, 1, 4, 1, 7, 7, 1, 3, -1, -1, -1, -1, -1, -1, -1},\n                               {4, 9, 1, 4, 1, 7, 0, 8, 1, 8, 7, 1, -1, -1, -1, -1},\n                               {4, 0, 3, 7, 4, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {4, 8, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 10, 8, 10, 11, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 0, 9, 3, 9, 11, 11, 9, 10, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 1, 10, 0, 10, 8, 8, 10, 11, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 1, 10, 11, 3, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 2, 11, 1, 11, 9, 9, 11, 8, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 0, 9, 3, 9, 11, 1, 2, 9, 2, 11, 9, -1, -1, -1, -1},\n                               {0, 2, 11, 8, 0, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {3, 2, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {2, 3, 8, 2, 8, 10, 10, 8, 9, -1, -1, -1, -1, -1, -1, -1},\n                               {9, 10, 2, 0, 9, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {2, 3, 8, 2, 8, 10, 
0, 1, 8, 1, 10, 8, -1, -1, -1, -1},\n                               {1, 10, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {1, 3, 8, 9, 1, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 9, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {0, 3, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},\n                               {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}};\n\n/* number of vertices for each case above */\nconst int numVertsTable[256] = {\n    0,  3,  3,  6,  3,  6,  6,  9,  3,  6,  6,  9,  6,  9,  9,  6,  3,  6,  6,  9,  6,  9,  9,  12, 6,  9,  9,  12, 9,\n    12, 12, 9,  3,  6,  6,  9,  6,  9,  9,  12, 6,  9,  9,  12, 9,  12, 12, 9,  6,  9,  9,  6,  9,  12, 12, 9,  9,  12,\n    12, 9,  12, 15, 15, 6,  3,  6,  6,  9,  6,  9,  9,  12, 6,  9,  9,  12, 9,  12, 12, 9,  6,  9,  9,  12, 9,  12, 12,\n    15, 9,  12, 12, 15, 12, 15, 15, 12, 6,  9,  9,  12, 9,  12, 6,  9,  9,  12, 12, 15, 12, 15, 9,  6,  9,  12, 12, 9,\n    12, 15, 9,  6,  12, 15, 15, 12, 15, 6,  12, 3,  3,  6,  6,  9,  6,  9,  9,  12, 6,  9,  9,  12, 9,  12, 12, 9,  6,\n    9,  9,  12, 9,  12, 12, 15, 9,  6,  12, 9,  12, 9,  15, 6,  6,  9,  9,  12, 9,  12, 12, 15, 9,  12, 12, 15, 12, 15,\n    15, 12, 9,  12, 12, 9,  12, 15, 15, 12, 12, 9,  15, 6,  15, 12, 6,  3,  6,  9,  9,  12, 9,  12, 12, 15, 9,  12, 12,\n    15, 6,  9,  9,  6,  9,  12, 12, 15, 12, 15, 15, 6,  12, 9,  15, 12, 9,  6,  12, 3,  9,  12, 12, 15, 12, 15, 9,  12,\n    12, 15, 15, 6,  9,  12, 6,  3,  6,  9,  9,  6,  9,  12, 6,  3,  9,  6,  12, 3,  6,  3,  3,  0,\n};\n"
  },
  {
    "path": "src/kfusion/precomp.cpp",
    "content": "#include <kfusion/internal.hpp>\n#include <kfusion/precomp.hpp>\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Kinfu/types implementation\n\nkfusion::Intr::Intr() {}\nkfusion::Intr::Intr(float fx_, float fy_, float cx_, float cy_) : fx(fx_), fy(fy_), cx(cx_), cy(cy_) {}\n\nkfusion::Intr kfusion::Intr::operator()(int level_index) const {\n    int div = 1 << level_index;\n    return (Intr(fx / div, fy / div, cx / div, cy / div));\n}\n\nstd::ostream &operator<<(std::ostream &os, const kfusion::Intr &intr) {\n    return os << \"([f = \" << intr.fx << \", \" << intr.fy << \"] [cp = \" << intr.cx << \", \" << intr.cy << \"])\";\n}\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// TsdfVolume host implementation\n\nkfusion::device::TsdfVolume::TsdfVolume(float2 *data, int3 dims, float3 voxel_size, float trunc_dist, float eta,\n                                        float max_weight)\n    : data(data), dims(dims), voxel_size(voxel_size), trunc_dist(trunc_dist), eta(eta), max_weight(max_weight) {}\n\n// kfusion::device::TsdfVolume::float2*\n// kfusionl::device::TsdfVolume::operator()(int x, int y, int z) { return data +\n// x + y*dims.x + z*dims.y*dims.x; }\n//\n// const kfusion::device::TsdfVolume::float2*\n// kfusionl::device::TsdfVolume::operator() (int x, int y, int z) const { return\n// data + x + y*dims.x + z*dims.y*dims.x; }\n//\n// kfusion::device::TsdfVolume::float2* kfusionl::device::TsdfVolume::beg(int\n// x, int y) const { return data + x + dims.x * y; }\n//\n// kfusion::device::TsdfVolume::float2*\n// kfusionl::device::TsdfVolume::zstep(float2 *const ptr) const { return data\n//+ dims.x * dims.y; }\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Projector host implementation\n\nkfusion::device::Projector::Projector(float fx, float 
fy, float cx, float cy)\n    : f(make_float2(fx, fy)), c(make_float2(cx, cy)) {}\n\n// float2 kfusion::device::Projector::operator()(const float3& p) const\n//{\n//  float2 coo;\n//  coo.x = p.x * f.x / p.z + c.x;\n//  coo.y = p.y * f.y / p.z + c.y;\n//  return coo;\n//}\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Reprojector host implementation\n\nkfusion::device::Reprojector::Reprojector(float fx, float fy, float cx, float cy)\n    : finv(make_float2(1.f / fx, 1.f / fy)), c(make_float2(cx, cy)) {}\n\n// float3 kfusion::device::Reprojector::operator()(int u, int v, float z) const\n//{\n//  float x = z * (u - c.x) * finv.x;\n//  float y = z * (v - c.y) * finv.y;\n//  return make_float3(x, y, z);\n//}\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// Host implementation of packing/unpacking tsdf volume element\n\n// float2 kfusion::device::pack_tsdf(float tsdf, int weight) { throw \"Not\n// implemented\"; return float2(); }  float kfusion::device::unpack_tsdf(float2\n// value, int& weight) { throw \"Not implemented\"; return 0; }  float\n// kfusion::device::unpack_tsdf(float2 value) { throw \"Not implemented\"; return\n// 0; }\n"
  },
  {
    "path": "src/kfusion/projective_icp.cpp",
    "content": "#include <kfusion/precomp.hpp>\n\nusing namespace kfusion;\nusing namespace kfusion::cuda;\n\nusing namespace std;\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// ComputeIcpHelper\n\nkfusion::device::ComputeIcpHelper::ComputeIcpHelper(float dist_thres, float angle_thres) {\n    min_cosine  = cos(angle_thres);\n    dist2_thres = dist_thres * dist_thres;\n}\n\nvoid kfusion::device::ComputeIcpHelper::setLevelIntr(int level_index, float fx, float fy, float cx, float cy) {\n    int div = 1 << level_index;\n    f       = make_float2(fx / div, fy / div);\n    c       = make_float2(cx / div, cy / div);\n    finv    = make_float2(1.f / f.x, 1.f / f.y);\n}\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// ProjectiveICP::StreamHelper\n\nstruct kfusion::cuda::ProjectiveICP::StreamHelper {\n    typedef device::ComputeIcpHelper::PageLockHelper PageLockHelper;\n    typedef cv::Matx66f Mat6f;\n    typedef cv::Vec6f Vec6f;\n\n    cudaStream_t stream;\n    PageLockHelper locked_buffer;\n\n    StreamHelper() { cudaSafeCall(cudaStreamCreate(&stream)); }\n    ~StreamHelper() { cudaSafeCall(cudaStreamDestroy(stream)); }\n\n    operator float *() { return locked_buffer.data; }\n    operator cudaStream_t() { return stream; }\n\n    Mat6f get(Vec6f &b) {\n        cudaSafeCall(cudaStreamSynchronize(stream));\n\n        Mat6f A;\n        float *data_A = A.val;\n        float *data_b = b.val;\n\n        int shift = 0;\n        for (int i = 0; i < 6; ++i)      // rows\n            for (int j = i; j < 7; ++j)  // cols + b\n            {\n                float value = locked_buffer.data[shift++];\n                if (j == 6)  // vector b\n                    data_b[i] = value;\n                else\n                    data_A[j * 6 + i] = data_A[i * 6 + j] = value;\n            }\n        return A;\n    
}\n};\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// ProjectiveICP\n\nkfusion::cuda::ProjectiveICP::ProjectiveICP() : angle_thres_(deg2rad(20.f)), dist_thres_(0.1f) {\n    const int iters[] = {10, 5, 4, 0};\n    std::vector<int> vector_iters(iters, iters + 4);\n    setIterationsNum(vector_iters);\n    device::ComputeIcpHelper::allocate_buffer(buffer_);\n\n    shelp_ = cv::Ptr<StreamHelper>(new StreamHelper());\n}\n\nkfusion::cuda::ProjectiveICP::~ProjectiveICP() {}\n\nfloat kfusion::cuda::ProjectiveICP::getDistThreshold() const { return dist_thres_; }\n\nvoid kfusion::cuda::ProjectiveICP::setDistThreshold(float distance) { dist_thres_ = distance; }\n\nfloat kfusion::cuda::ProjectiveICP::getAngleThreshold() const { return angle_thres_; }\n\nvoid kfusion::cuda::ProjectiveICP::setAngleThreshold(float angle) { angle_thres_ = angle; }\n\nvoid kfusion::cuda::ProjectiveICP::setIterationsNum(const std::vector<int> &iters) {\n    if (iters.size() >= MAX_PYRAMID_LEVELS)\n        iters_.assign(iters.begin(), iters.begin() + MAX_PYRAMID_LEVELS);\n    else {\n        iters_ = vector<int>(MAX_PYRAMID_LEVELS, 0);\n        copy(iters.begin(), iters.end(), iters_.begin());\n    }\n}\n\nint kfusion::cuda::ProjectiveICP::getUsedLevelsNum() const {\n    int i = MAX_PYRAMID_LEVELS - 1;\n    for (; i >= 0 && !iters_[i]; --i)\n        ;\n    return i + 1;\n}\n\nbool kfusion::cuda::ProjectiveICP::estimateTransform(Affine3f & /*affine*/, const Intr & /*intr*/,\n                                                     const Frame & /*curr*/, const Frame & /*prev*/) {\n    //    bool has_depth = !curr.depth_pyr.empty() && !prev.depth_pyr.empty();\n    //    bool has_points = !curr.points_pyr.empty() && !prev.points_pyr.empty();\n\n    //    if (has_depth)\n    //        return estimateTransform(affine, intr, curr.depth_pyr,\n    //        curr.normals_pyr, prev.depth_pyr, prev.normals_pyr);\n    //    else if(has_points)\n 
   //         return estimateTransform(affine, intr, curr.points_pyr,\n    //         curr.normals_pyr, prev.points_pyr, prev.normals_pyr);\n    //    else\n    //        CV_Assert(!\"Wrong parameters passed to estimateTransform\");\n    CV_Assert(!\"Not implemented\");\n    return false;\n}\n\nbool kfusion::cuda::ProjectiveICP::estimateTransform(Affine3f &affine, const Intr &intr, const DepthPyr &dcurr,\n                                                     const NormalsPyr ncurr, const DepthPyr dprev,\n                                                     const NormalsPyr nprev) {\n    const int LEVELS = getUsedLevelsNum();\n    StreamHelper &sh = *shelp_;\n\n    device::ComputeIcpHelper helper(dist_thres_, angle_thres_);\n    affine = Affine3f::Identity();\n\n    for (int level_index = LEVELS - 1; level_index >= 0; --level_index) {\n        const device::Normals &n = (const device::Normals &) nprev[level_index];\n\n        helper.rows = (float) n.rows();\n        helper.cols = (float) n.cols();\n        helper.setLevelIntr(level_index, intr.fx, intr.fy, intr.cx, intr.cy);\n        helper.dcurr = dcurr[level_index];\n        helper.ncurr = ncurr[level_index];\n\n        for (int iter = 0; iter < iters_[level_index]; ++iter) {\n            helper.aff = device_cast<device::Aff3f>(affine);\n            helper(dprev[level_index], n, buffer_, sh, sh);\n\n            StreamHelper::Vec6f b;\n            StreamHelper::Mat6f A = sh.get(b);\n\n            // checking nullspace\n            double det = cv::determinant(A);\n\n            if (fabs(det) < 1e-15 || cv::viz::isNan(det)) {\n                if (cv::viz::isNan(det))\n                    cout << \"qnan\" << endl;\n                return false;\n            }\n\n            StreamHelper::Vec6f r;\n            cv::solve(A, b, r, cv::DECOMP_SVD);\n            Affine3f Tinc(Vec3f(r.val), Vec3f(r.val + 3));\n            affine = Tinc * affine;\n        }\n    }\n    return true;\n}\n\nbool 
kfusion::cuda::ProjectiveICP::estimateTransform(Affine3f &affine, const Intr &intr, const PointsPyr &vcurr,\n                                                     const NormalsPyr ncurr, const PointsPyr vprev,\n                                                     const NormalsPyr nprev) {\n    const int LEVELS = getUsedLevelsNum();\n    StreamHelper &sh = *shelp_;\n\n    device::ComputeIcpHelper helper(dist_thres_, angle_thres_);\n    affine = Affine3f::Identity();\n\n    for (int level_index = LEVELS - 1; level_index >= 0; --level_index) {\n        const device::Normals &n = (const device::Normals &) nprev[level_index];\n        const device::Points &v  = (const device::Points &) vprev[level_index];\n\n        helper.rows = (float) n.rows();\n        helper.cols = (float) n.cols();\n        helper.setLevelIntr(level_index, intr.fx, intr.fy, intr.cx, intr.cy);\n        helper.vcurr = vcurr[level_index];\n        helper.ncurr = ncurr[level_index];\n\n        for (int iter = 0; iter < iters_[level_index]; ++iter) {\n            helper.aff = device_cast<device::Aff3f>(affine);\n            helper(v, n, buffer_, sh, sh);\n\n            StreamHelper::Vec6f b;\n            StreamHelper::Mat6f A = sh.get(b);\n\n            // checking nullspace\n            double det = cv::determinant(A);\n\n            if (fabs(det) < 1e-15 || cv::viz::isNan(det)) {\n                if (cv::viz::isNan(det))\n                    cout << \"qnan\" << endl;\n                return false;\n            }\n\n            StreamHelper::Vec6f r;\n            cv::solve(A, b, r, cv::DECOMP_SVD);\n\n            Affine3f Tinc(Vec3f(r.val), Vec3f(r.val + 3));\n            affine = Tinc * affine;\n        }\n    }\n    return true;\n}\n"
  },
  {
    "path": "src/kfusion/tsdf_volume.cpp",
    "content": "#include <kfusion/precomp.hpp>\n\nusing namespace kfusion;\nusing namespace kfusion::cuda;\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// TsdfVolume::Entry\n\nfloat kfusion::cuda::TsdfVolume::Entry::half2float(half) { throw \"Not implemented\"; }\n\nkfusion::cuda::TsdfVolume::Entry::half kfusion::cuda::TsdfVolume::Entry::float2half(float value) {\n    throw \"Not implemented\";\n}\n\n////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n/// TsdfVolume\n\nkfusion::cuda::TsdfVolume::TsdfVolume(const Params &params) {\n    dims_ = params.volume_dims;\n    size_ = params.volume_size;\n    pose_ = params.volume_pose;\n\n    trunc_dist_ = params.tsdf_trunc_dist;\n    eta_        = params.eta;\n    max_weight_ = params.tsdf_max_weight;\n\n    gradient_delta_factor_ = params.gradient_delta_factor;\n\n    create(dims_);\n}\n\nkfusion::cuda::TsdfVolume::~TsdfVolume() {}\n\nvoid kfusion::cuda::TsdfVolume::create(const Vec3i &dims) {\n    dims_ = dims;\n\n    int no_voxels = dims_[0] * dims_[1] * dims_[2];\n    data_.create(no_voxels * 2 * sizeof(float));\n\n    clear();\n}\n\nVec3i kfusion::cuda::TsdfVolume::getDims() const { return dims_; }\n\nVec3f kfusion::cuda::TsdfVolume::getVoxelSize() const {\n    return Vec3f(size_[0] / dims_[0], size_[1] / dims_[1], size_[2] / dims_[2]);\n}\n\nconst CudaData kfusion::cuda::TsdfVolume::data() const { return data_; }\nCudaData kfusion::cuda::TsdfVolume::data() { return data_; }\n\nVec3f kfusion::cuda::TsdfVolume::getSize() const { return size_; }\nvoid kfusion::cuda::TsdfVolume::setSize(const Vec3f &size) { size_ = size; }\n\nfloat kfusion::cuda::TsdfVolume::getTruncDist() const { return trunc_dist_; }\nvoid kfusion::cuda::TsdfVolume::setTruncDist(float &distance) { trunc_dist_ = distance; }\n\nfloat kfusion::cuda::TsdfVolume::getEta() const { return eta_; }\nvoid 
kfusion::cuda::TsdfVolume::setEta(float &eta) { eta_ = eta; }\n\nfloat kfusion::cuda::TsdfVolume::getMaxWeight() const { return max_weight_; }\nvoid kfusion::cuda::TsdfVolume::setMaxWeight(float &weight) { max_weight_ = weight; }\n\nAffine3f kfusion::cuda::TsdfVolume::getPose() const { return pose_; }\nvoid kfusion::cuda::TsdfVolume::setPose(const Affine3f &pose) { pose_ = pose; }\n\nfloat kfusion::cuda::TsdfVolume::getRaycastStepFactor() const { return raycast_step_factor_; }\nvoid kfusion::cuda::TsdfVolume::setRaycastStepFactor(float &factor) { raycast_step_factor_ = factor; }\n\nfloat kfusion::cuda::TsdfVolume::getGradientDeltaFactor() const { return gradient_delta_factor_; }\nvoid kfusion::cuda::TsdfVolume::setGradientDeltaFactor(float &factor) { gradient_delta_factor_ = factor; }\n\nvoid kfusion::cuda::TsdfVolume::clear() {\n    device::Vec3i dims = device_cast<device::Vec3i>(dims_);\n    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());\n\n    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);\n    device::clear_volume(volume);\n}\n\nvoid kfusion::cuda::TsdfVolume::swap(CudaData &data) { data_.swap(data); }\n\nvoid kfusion::cuda::TsdfVolume::applyAffine(const Affine3f &affine) { pose_ = affine * pose_; }\n\nvoid kfusion::cuda::TsdfVolume::integrate(const TsdfVolume &phi_n_psi) {\n    device::Vec3i dims = device_cast<device::Vec3i>(dims_);\n    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());\n\n    device::TsdfVolume phi_global_device(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);\n    device::TsdfVolume phi_n_psi_device(phi_n_psi.data().ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);\n\n    device::integrate(phi_global_device, phi_n_psi_device);\n}\n\nvoid kfusion::cuda::TsdfVolume::integrate(const Dists &dists, const Affine3f &camera_pose, const Intr &intr) {\n    Affine3f vol2cam  = camera_pose.inv() * pose_;\n    device::Aff3f aff = 
device_cast<device::Aff3f>(vol2cam);\n\n    device::Projector proj(intr.fx, intr.fy, intr.cx, intr.cy);\n\n    device::Vec3i dims = device_cast<device::Vec3i>(dims_);\n    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());\n    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);\n\n    device::integrate(dists, volume, aff, proj);\n}\n\nvoid kfusion::cuda::TsdfVolume::initBox(const float3 &b) {\n    device::Vec3i dims = device_cast<device::Vec3i>(dims_);\n    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());\n    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);\n\n    device::init_box(volume, b);\n}\n\nvoid kfusion::cuda::TsdfVolume::initEllipsoid(const float3 &r) {\n    device::Vec3i dims = device_cast<device::Vec3i>(dims_);\n    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());\n    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);\n\n    device::init_ellipsoid(volume, r);\n}\n\nvoid kfusion::cuda::TsdfVolume::initPlane(const float &z) {\n    device::Vec3i dims = device_cast<device::Vec3i>(dims_);\n    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());\n    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);\n\n    device::init_plane(volume, z);\n}\n\nvoid kfusion::cuda::TsdfVolume::initSphere(const float3 &centre, const float &radius) {\n    device::Vec3i dims = device_cast<device::Vec3i>(dims_);\n    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());\n    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);\n\n    device::init_sphere(volume, centre, radius);\n}\n\nvoid kfusion::cuda::TsdfVolume::initTorus(const float2 &t) {\n    device::Vec3i dims = device_cast<device::Vec3i>(dims_);\n    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());\n    device::TsdfVolume 
volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);\n\n    device::init_torus(volume, t);\n}\n\nvoid kfusion::cuda::TsdfVolume::print_sdf_values() {\n    cv::Vec3i dims = getDims();\n\n    float2 *phi_data_ptr = new float2[dims[0] * dims[1] * dims[2]];\n    data().download(phi_data_ptr);\n    for (int i = 0; i < dims[0]; i++) {\n        for (int j = 0; j < dims[1]; j++) {\n            for (int k = 0; k < dims[2]; k++) {\n                float2 val = phi_data_ptr[k * dims[1] * dims[0] + j * dims[0] + i];\n                if (val.x != 0.f) {\n                    std::cout << val.x << std::endl;\n                }\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/sobfu/CMakeLists.txt",
    "content": "# Build all sobfu source files into library dynfy\nadd_module_library(sobfu)\n\n# Link to sobfu library\ntarget_link_libraries(sobfu\n    kfusion\n    gcov\n    ${Boost_LIBRARIES}\n    ${CUDA_CUDART_LIBRARY}\n    ${PCL_LIBRARIES}\n)\n"
  },
  {
    "path": "src/sobfu/cuda/reductor.cu",
    "content": "/* sobfu includes */\n#include <sobfu/cuda/utils.hpp>\n#include <sobfu/reductor.hpp>\n\n#define FULL_MASK 0xffffffff\n\n/*\n * OWN KERNELS\n */\n\ntemplate <unsigned int blockSize, bool nIsPow2>\n__global__ void sobfu::device::reduce_data_kernel(float2 *g_idata_global, float2 *g_idata_n, float *g_odata,\n                                                  unsigned int n) {\n    float *sdata = SharedMemory<float>();\n\n    /* perform first level of reduction, reading from global memory, writing to shared memory */\n    unsigned int tid      = threadIdx.x;\n    unsigned int i        = blockIdx.x * blockSize * 2 + threadIdx.x;\n    unsigned int gridSize = blockSize * 2 * gridDim.x;\n\n    float mySum = 0.f;\n\n    /* we reduce multiple elements per thread;  the number is determined by the umber of active thread blocks (via\n     * gridDim); more blocks will result in a larger gridSize and therefore fewer elements per thread */\n    while (i < n) {\n        mySum += (g_idata_global[i].x - g_idata_n[i].x) * (g_idata_global[i].x - g_idata_n[i].x);\n\n        /* ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays */\n        if (nIsPow2 || i + blockSize < n) {\n            mySum += (g_idata_global[i + blockSize].x - g_idata_n[i + blockSize].x) *\n                     (g_idata_global[i + blockSize].x - g_idata_n[i + blockSize].x);\n        }\n\n        i += gridSize;\n    }\n\n    /* each thread puts its local sum into shared memory */\n    sdata[tid] = mySum;\n    __syncthreads();\n\n    /* do reduction in shared mem */\n    if ((blockSize >= 512) && (tid < 256)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 256];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 256) && (tid < 128)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 128];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 128) && (tid < 64)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 64];\n    }\n\n    __syncthreads();\n\n#if 
(__CUDA_ARCH__ >= 300)\n    if (tid < 32) {\n        /* fetch final intermediate sum from 2nd warp */\n        if (blockSize >= 64)\n            mySum += sdata[tid + 32];\n        /* reduce final warp using shuffle */\n        for (int offset = warpSize / 2; offset > 0; offset /= 2) {\n            mySum += __shfl_down_sync(FULL_MASK, mySum, offset);\n        }\n    }\n#else\n    /* fully unroll reduction within a single warp */\n    if ((blockSize >= 64) && (tid < 32)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 32];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 32) && (tid < 16)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 16];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 16) && (tid < 8)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 8];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 8) && (tid < 4)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 4];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 4) && (tid < 2)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 2];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 2) && (tid < 1)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 1];\n    }\n\n    __syncthreads();\n#endif\n\n    /* write result for this block to global mem */\n    if (tid == 0)\n        g_odata[blockIdx.x] = mySum;\n}\n\ntemplate <unsigned int blockSize, bool nIsPow2>\n__global__ void sobfu::device::reduce_reg_sobolev_kernel(Mat4f *g_idata, float *g_odata, unsigned int n) {\n    float *sdata = SharedMemory<float>();\n\n    /* perform first level of reduction, reading from global memory, writing to shared memory */\n    unsigned int tid      = threadIdx.x;\n    unsigned int i        = blockIdx.x * blockSize * 2 + threadIdx.x;\n    unsigned int gridSize = blockSize * 2 * gridDim.x;\n\n    float mySum = 0.f;\n\n    /* we reduce multiple elements per thread;  the number is determined by the number of active thread blocks (via\n     * gridDim); more blocks will result in a 
larger gridSize and therefore fewer elements per thread */\n    while (i < n) {\n        mySum += norm_sq(g_idata[i].data[0]) + norm_sq(g_idata[i].data[1]) + norm_sq(g_idata[i].data[2]);\n\n        /* ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays */\n        if (nIsPow2 || i + blockSize < n) {\n            mySum += norm_sq(g_idata[i + blockSize].data[0]) + norm_sq(g_idata[i + blockSize].data[1]) +\n                     norm_sq(g_idata[i + blockSize].data[2]);\n        }\n\n        i += gridSize;\n    }\n\n    /* each thread puts its local sum into shared memory */\n    sdata[tid] = mySum;\n    __syncthreads();\n\n    /* do reduction in shared mem */\n    if ((blockSize >= 512) && (tid < 256)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 256];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 256) && (tid < 128)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 128];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 128) && (tid < 64)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 64];\n    }\n\n    __syncthreads();\n\n#if (__CUDA_ARCH__ >= 300)\n    if (tid < 32) {\n        /* fetch final intermediate sum from 2nd warp */\n        if (blockSize >= 64)\n            mySum += sdata[tid + 32];\n        /* reduce final warp using shuffle */\n        for (int offset = warpSize / 2; offset > 0; offset /= 2) {\n            mySum += __shfl_down_sync(FULL_MASK, mySum, offset);\n        }\n    }\n#else\n    /* fully unroll reduction within a single warp */\n    if ((blockSize >= 64) && (tid < 32)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 32];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 32) && (tid < 16)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 16];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 16) && (tid < 8)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 8];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 8) && (tid < 4)) {\n        sdata[tid] = 
mySum = mySum + sdata[tid + 4];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 4) && (tid < 2)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 2];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 2) && (tid < 1)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 1];\n    }\n\n    __syncthreads();\n#endif\n\n    /* write result for this block to global mem */\n    if (tid == 0)\n        g_odata[blockIdx.x] = mySum;\n}\n\ntemplate <unsigned int blockSize, bool nIsPow2>\n__global__ void sobfu::device::reduce_voxel_max_energy_kernel(float2 *d_idata_global, float2 *d_idata_n,\n                                                              Mat4f *d_idata_reg, float2 *d_o_data, float w_reg,\n                                                              unsigned int n) {\n    float2 *sdata = SharedMemory<float2>();\n\n    /* perform first level of reduction, reading from global memory, writing to shared memory */\n    unsigned int tid      = threadIdx.x;\n    unsigned int i        = blockIdx.x * blockSize * 2 + threadIdx.x;\n    unsigned int gridSize = blockSize * 2 * gridDim.x;\n\n    float2 local_max;\n    local_max.x = 0.f;\n    local_max.y = 0.f;\n\n    /* we reduce multiple elements per thread;  the number is determined by the number of active thread blocks (via\n     * gridDim); more blocks will result in a larger gridSize and therefore fewer elements per thread */\n    while (i < n) {\n        float temp = (d_idata_global[i].x - d_idata_n[i].x) * (d_idata_global[i].x - d_idata_n[i].x) +\n                     w_reg * (norm_sq(d_idata_reg[i].data[0]) + norm_sq(d_idata_reg[i].data[1]) +\n                              norm_sq(d_idata_reg[i].data[2]));\n\n        if (temp > local_max.x) {\n            local_max.x = temp;\n            local_max.y = (float) i;\n        }\n\n        /* ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays */\n        if (nIsPow2 || i + blockSize < n) {\n            temp = 
(d_idata_global[i + blockSize].x - d_idata_n[i + blockSize].x) *\n                       (d_idata_global[i + blockSize].x - d_idata_n[i + blockSize].x) +\n                   w_reg * (norm_sq(d_idata_reg[i + blockSize].data[0]) + norm_sq(d_idata_reg[i + blockSize].data[1]) +\n                            norm_sq(d_idata_reg[i + blockSize].data[2]));\n\n            if (temp > local_max.x) {\n                local_max.x = temp;\n                local_max.y = (float) i + blockSize;\n            }\n        }\n\n        i += gridSize;\n    }\n\n    /* each thread puts its local sum into shared memory */\n    sdata[tid] = local_max;\n    __syncthreads();\n\n    /* do reduction in shared mem */\n    if ((blockSize >= 512) && (tid < 256)) {\n        if (sdata[tid + 256].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 256];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 256) && (tid < 128)) {\n        if (sdata[tid + 128].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 128];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 128) && (tid < 64)) {\n        if (sdata[tid + 64].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 64];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 64) && (tid < 32)) {\n        if (sdata[tid + 32].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 32];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 32) && (tid < 16)) {\n        if (sdata[tid + 16].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 16];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 16) && (tid < 8)) {\n        if (sdata[tid + 8].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 8];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 8) && (tid < 4)) {\n        if (sdata[tid + 4].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 4];\n        
}\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 4) && (tid < 2)) {\n        if (sdata[tid + 2].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 2];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 2) && (tid < 1)) {\n        if (sdata[tid + 1].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 1];\n        }\n    }\n\n    __syncthreads();\n\n    /* write result for this block to global mem */\n    if (tid == 0) {\n        d_o_data[blockIdx.x] = local_max;\n    }\n}\n\ntemplate <unsigned int blockSize, bool nIsPow2>\n__global__ void sobfu::device::reduce_max_kernel(float4 *updates, float2 *g_o_max_data, unsigned int n) {\n    float2 *sdata = SharedMemory<float2>();\n\n    /* perform first level of reduction, reading from global memory, writing to shared memory */\n    unsigned int tid      = threadIdx.x;\n    unsigned int i        = blockIdx.x * blockSize * 2 + threadIdx.x;\n    unsigned int gridSize = blockSize * 2 * gridDim.x;\n\n    float2 local_max;\n    local_max.x = 0.f;\n    local_max.y = 0.f;\n\n    /* we reduce multiple elements per thread;  the number is determined by the number of active thread blocks (via\n     * gridDim); more blocks will result in a larger gridSize and therefore fewer elements per thread */\n    while (i < n) {\n        if (norm(updates[i]) > local_max.x) {\n            local_max.x = norm(updates[i]);\n            local_max.y = (float) i;\n        }\n\n        /* ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays */\n        if (nIsPow2 || i + blockSize < n) {\n            if (norm(updates[i + blockSize]) > local_max.x) {\n                local_max.x = norm(updates[i + blockSize]);\n                local_max.y = (float) i + blockSize;\n            }\n        }\n\n        i += gridSize;\n    }\n\n    /* each thread puts its local sum into shared memory */\n    sdata[tid] = local_max;\n    __syncthreads();\n\n    /* do reduction in shared 
mem */\n    if ((blockSize >= 512) && (tid < 256)) {\n        if (sdata[tid + 256].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 256];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 256) && (tid < 128)) {\n        if (sdata[tid + 128].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 128];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 128) && (tid < 64)) {\n        if (sdata[tid + 64].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 64];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 64) && (tid < 32)) {\n        if (sdata[tid + 32].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 32];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 32) && (tid < 16)) {\n        if (sdata[tid + 16].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 16];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 16) && (tid < 8)) {\n        if (sdata[tid + 8].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 8];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 8) && (tid < 4)) {\n        if (sdata[tid + 4].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 4];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 4) && (tid < 2)) {\n        if (sdata[tid + 2].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 2];\n        }\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 2) && (tid < 1)) {\n        if (sdata[tid + 1].x > local_max.x) {\n            sdata[tid] = local_max = sdata[tid + 1];\n        }\n    }\n\n    __syncthreads();\n\n    /* write result for this block to global mem */\n    if (tid == 0) {\n        g_o_max_data[blockIdx.x] = local_max;\n    }\n}\n\n/* wrapper function for kernel launch */\nvoid sobfu::device::reduce_data(int size, int threads, int blocks, float2 *d_idata_global, float2 
*d_idata_n,\n                                float *d_odata) {\n    dim3 dimBlock(threads, 1, 1);\n    dim3 dimGrid(blocks, 1, 1);\n\n    /* when there is only one warp per block, we need to allocate two warps worth of shared memory so that we don't\n     * index shared memory out of bounds */\n    int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float);\n\n    if (isPow2(size)) {\n        switch (threads) {\n            case 512:\n                reduce_data_kernel<512, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 256:\n                reduce_data_kernel<256, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 128:\n                reduce_data_kernel<128, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 64:\n                reduce_data_kernel<64, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 32:\n                reduce_data_kernel<32, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 16:\n                reduce_data_kernel<16, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 8:\n                reduce_data_kernel<8, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 4:\n                reduce_data_kernel<4, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 2:\n                reduce_data_kernel<2, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, 
d_odata, size);\n                break;\n\n            case 1:\n                reduce_data_kernel<1, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n        }\n    } else {\n        switch (threads) {\n            case 512:\n                reduce_data_kernel<512, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 256:\n                reduce_data_kernel<256, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 128:\n                reduce_data_kernel<128, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 64:\n                reduce_data_kernel<64, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 32:\n                reduce_data_kernel<32, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 16:\n                reduce_data_kernel<16, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 8:\n                reduce_data_kernel<8, false><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 4:\n                reduce_data_kernel<4, false><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 2:\n                reduce_data_kernel<2, false><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n\n            case 1:\n                reduce_data_kernel<1, 
false><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);\n                break;\n        }\n    }\n}\n\nvoid sobfu::device::reduce_reg_sobolev(int size, int threads, int blocks, Mat4f *d_idata, float *d_odata) {\n    dim3 dimBlock(threads, 1, 1);\n    dim3 dimGrid(blocks, 1, 1);\n\n    /* when there is only one warp per block, we need to allocate two warps worth of shared memory so that we don't\n     * index shared memory out of bounds */\n    int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float);\n\n    if (isPow2(size)) {\n        switch (threads) {\n            case 512:\n                reduce_reg_sobolev_kernel<512, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 256:\n                reduce_reg_sobolev_kernel<256, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 128:\n                reduce_reg_sobolev_kernel<128, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 64:\n                reduce_reg_sobolev_kernel<64, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 32:\n                reduce_reg_sobolev_kernel<32, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 16:\n                reduce_reg_sobolev_kernel<16, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 8:\n                reduce_reg_sobolev_kernel<8, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 4:\n                reduce_reg_sobolev_kernel<4, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 2:\n                reduce_reg_sobolev_kernel<2, true><<<dimGrid, dimBlock, 
smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 1:\n                reduce_reg_sobolev_kernel<1, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n        }\n    } else {\n        switch (threads) {\n            case 512:\n                reduce_reg_sobolev_kernel<512, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 256:\n                reduce_reg_sobolev_kernel<256, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 128:\n                reduce_reg_sobolev_kernel<128, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 64:\n                reduce_reg_sobolev_kernel<64, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 32:\n                reduce_reg_sobolev_kernel<32, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 16:\n                reduce_reg_sobolev_kernel<16, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 8:\n                reduce_reg_sobolev_kernel<8, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 4:\n                reduce_reg_sobolev_kernel<4, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 2:\n                reduce_reg_sobolev_kernel<2, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n\n            case 1:\n                reduce_reg_sobolev_kernel<1, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);\n                break;\n        }\n    }\n}\n\nvoid sobfu::device::reduce_voxel_max_energy(int size, int threads, int blocks, float2 *d_idata_global,\n     
                                       float2 *d_idata_n, Mat4f *d_idata_reg, float w_reg, float2 *d_odata) {\n    dim3 dimBlock(threads, 1, 1);\n    dim3 dimGrid(blocks, 1, 1);\n\n    /* when there is only one warp per block, we need to allocate two warps worth of shared memory so that we don't\n     * index shared memory out of bounds */\n    int smemSize = (threads <= 32) ? 2 * threads * sizeof(float2) : threads * sizeof(float2);\n\n    if (isPow2(size)) {\n        switch (threads) {\n            case 512:\n                reduce_voxel_max_energy_kernel<512, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 256:\n                reduce_voxel_max_energy_kernel<256, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 128:\n                reduce_voxel_max_energy_kernel<128, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 64:\n                reduce_voxel_max_energy_kernel<64, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 32:\n                reduce_voxel_max_energy_kernel<32, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 16:\n                reduce_voxel_max_energy_kernel<16, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 8:\n                reduce_voxel_max_energy_kernel<8, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, 
d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 4:\n                reduce_voxel_max_energy_kernel<4, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 2:\n                reduce_voxel_max_energy_kernel<2, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 1:\n                reduce_voxel_max_energy_kernel<1, true>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n        }\n    } else {\n        switch (threads) {\n            case 512:\n                reduce_voxel_max_energy_kernel<512, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 256:\n                reduce_voxel_max_energy_kernel<256, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 128:\n                reduce_voxel_max_energy_kernel<128, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 64:\n                reduce_voxel_max_energy_kernel<64, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 32:\n                reduce_voxel_max_energy_kernel<32, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 16:\n                reduce_voxel_max_energy_kernel<16, 
false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 8:\n                reduce_voxel_max_energy_kernel<8, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 4:\n                reduce_voxel_max_energy_kernel<4, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 2:\n                reduce_voxel_max_energy_kernel<2, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n\n            case 1:\n                reduce_voxel_max_energy_kernel<1, false>\n                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);\n                break;\n        }\n    }\n}\n\nvoid sobfu::device::reduce_max(int size, int threads, int blocks, float4 *updates, float2 *d_o_max_data) {\n    dim3 dimBlock(threads, 1, 1);\n    dim3 dimGrid(blocks, 1, 1);\n\n    /* when there is only one warp per block, we need to allocate two warps worth of shared memory so that we don't\n     * index shared memory out of bounds */\n    int smemSize = (threads <= 32) ? 
2 * threads * sizeof(float2) : threads * sizeof(float2);\n\n    if (isPow2(size)) {\n        switch (threads) {\n            case 512:\n                reduce_max_kernel<512, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 256:\n                reduce_max_kernel<256, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 128:\n                reduce_max_kernel<128, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 64:\n                reduce_max_kernel<64, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 32:\n                reduce_max_kernel<32, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 16:\n                reduce_max_kernel<16, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 8:\n                reduce_max_kernel<8, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 4:\n                reduce_max_kernel<4, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 2:\n                reduce_max_kernel<2, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 1:\n                reduce_max_kernel<1, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n        }\n    } else {\n        switch (threads) {\n            case 512:\n                reduce_max_kernel<512, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 256:\n                reduce_max_kernel<256, false><<<dimGrid, dimBlock, smemSize>>>(updates, 
d_o_max_data, size);\n                break;\n\n            case 128:\n                reduce_max_kernel<128, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 64:\n                reduce_max_kernel<64, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 32:\n                reduce_max_kernel<32, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 16:\n                reduce_max_kernel<16, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 8:\n                reduce_max_kernel<8, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 4:\n                reduce_max_kernel<4, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 2:\n                reduce_max_kernel<2, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n\n            case 1:\n                reduce_max_kernel<1, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);\n                break;\n        }\n    }\n}\n"
  },
  {
    "path": "src/sobfu/cuda/scalar_fields.cu",
    "content": "/* sobfu includes */\n#include <sobfu/cuda/utils.hpp>\n#include <sobfu/scalar_fields.hpp>\n\n#define FULL_MASK 0xffffffff\n\n/*\n * SCALAR FIELD\n */\n\n__device__ __forceinline__ float* sobfu::device::ScalarField::beg(int x, int y) const { return data + y * dims.x + x; }\n\n__device__ __forceinline__ float* sobfu::device::ScalarField::zstep(float* const ptr) const {\n    return ptr + dims.x * dims.y;\n}\n\n__device__ __forceinline__ float* sobfu::device::ScalarField::operator()(int idx) const { return data + idx; }\n\n__device__ __forceinline__ float* sobfu::device::ScalarField::operator()(int x, int y, int z) const {\n    return data + z * dims.y * dims.x + y * dims.x + x;\n}\n\n__global__ void sobfu::device::clear_kernel(sobfu::device::ScalarField field) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > field.dims.x - 1 || y > field.dims.y - 1) {\n        return;\n    }\n\n    float* pos = field.beg(x, y);\n    for (int i = 0; i <= field.dims.z - 1; pos = field.zstep(pos), ++i) {\n        *pos = 0.f;\n    }\n}\n\nvoid sobfu::device::clear(sobfu::device::ScalarField& field) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(field.dims.x, block.x), kfusion::device::divUp(field.dims.y, block.y));\n\n    clear_kernel<<<grid, block>>>(field);\n    cudaSafeCall(cudaGetLastError());\n}\n\ntemplate <unsigned int blockSize, bool nIsPow2>\n__global__ void sobfu::device::reduce_sum_kernel(float* g_idata, float* g_odata, unsigned int n) {\n    float* sdata = SharedMemory<float>();\n\n    /* perform first level of reduction, reading from global memory, writing to shared memory */\n    unsigned int tid      = threadIdx.x;\n    unsigned int i        = blockIdx.x * blockSize * 2 + threadIdx.x;\n    unsigned int gridSize = blockSize * 2 * gridDim.x;\n\n    float mySum = 0.f;\n\n    /* we reduce multiple elements per thread;  the number is determined by the umber of active thread 
blocks (via\n     * gridDim); more blocks will result in a larger gridSize and therefore fewer elements per thread */\n    while (i < n) {\n        mySum += g_idata[i];\n\n        /* ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays */\n        if (nIsPow2 || i + blockSize < n) {\n            mySum += g_idata[i + blockSize];\n        }\n\n        i += gridSize;\n    }\n\n    /* each thread puts its local sum into shared memory */\n    sdata[tid] = mySum;\n    __syncthreads();\n\n    /* do reduction in shared mem */\n    if ((blockSize >= 512) && (tid < 256)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 256];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 256) && (tid < 128)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 128];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 128) && (tid < 64)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 64];\n    }\n\n    __syncthreads();\n\n#if (__CUDA_ARCH__ >= 300)\n    if (tid < 32) {\n        /* fetch final intermediate sum from 2nd warp */\n        if (blockSize >= 64)\n            mySum += sdata[tid + 32];\n        /* reduce final warp using shuffle */\n        for (int offset = warpSize / 2; offset > 0; offset /= 2) {\n            mySum += __shfl_down_sync(FULL_MASK, mySum, offset);\n        }\n    }\n#else\n    /* fully unroll reduction within a single warp */\n    if ((blockSize >= 64) && (tid < 32)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 32];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 32) && (tid < 16)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 16];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 16) && (tid < 8)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 8];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 8) && (tid < 4)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 4];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 4) && (tid < 2)) {\n        sdata[tid] = mySum = mySum 
+ sdata[tid + 2];\n    }\n\n    __syncthreads();\n\n    if ((blockSize >= 2) && (tid < 1)) {\n        sdata[tid] = mySum = mySum + sdata[tid + 1];\n    }\n\n    __syncthreads();\n#endif\n\n    /* write result for this block to global mem */\n    if (tid == 0)\n        g_odata[blockIdx.x] = mySum;\n}\n\nfloat sobfu::device::final_reduce_sum(float* d_odata, int numBlocks) {\n    /* copy result from device to host */\n    float* h_odata = new float[numBlocks];\n    cudaSafeCall(cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(float), cudaMemcpyDeviceToHost));\n\n    float result = 0.f;\n    for (int i = 0; i < numBlocks; i++) {\n        result += h_odata[i];\n    }\n\n    delete h_odata;\n    return result;\n}\n\nfloat sobfu::device::reduce_sum(sobfu::device::ScalarField& field) {\n    int no_voxels = field.dims.x * field.dims.y * field.dims.z;\n    int blocks, threads;\n\n    get_num_blocks_and_threads(no_voxels, 65536, 512, blocks, threads);\n\n    dim3 dimBlock(threads, 1, 1);\n    dim3 dimGrid(blocks, 1, 1);\n\n    /* when there is only one warp per block, we need to allocate two warps worth of shared memory so that we don't\n     * index shared memory out of bounds */\n    int smemSize = (threads <= 32) ? 
2 * threads * sizeof(float) : threads * sizeof(float);\n\n    float* d_odata;\n    cudaSafeCall(cudaMalloc((void**) &d_odata, blocks * sizeof(float)));\n\n    if (isPow2(no_voxels)) {\n        switch (threads) {\n            case 512:\n                reduce_sum_kernel<512, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 256:\n                reduce_sum_kernel<256, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 128:\n                reduce_sum_kernel<128, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 64:\n                reduce_sum_kernel<64, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 32:\n                reduce_sum_kernel<32, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 16:\n                reduce_sum_kernel<16, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 8:\n                reduce_sum_kernel<8, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 4:\n                reduce_sum_kernel<4, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 2:\n                reduce_sum_kernel<2, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 1:\n                reduce_sum_kernel<1, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n        }\n    } else {\n        switch (threads) {\n            case 512:\n                reduce_sum_kernel<512, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n            
    break;\n\n            case 256:\n                reduce_sum_kernel<256, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 128:\n                reduce_sum_kernel<128, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 64:\n                reduce_sum_kernel<64, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 32:\n                reduce_sum_kernel<32, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 16:\n                reduce_sum_kernel<16, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 8:\n                reduce_sum_kernel<8, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 4:\n                reduce_sum_kernel<4, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 2:\n                reduce_sum_kernel<2, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n\n            case 1:\n                reduce_sum_kernel<1, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);\n                break;\n        }\n    }\n\n    float result = final_reduce_sum(d_odata, blocks);\n    cudaSafeCall(cudaFree(d_odata));\n    return result;\n}\n"
  },
  {
    "path": "src/sobfu/cuda/solver.cu",
    "content": "/* sobfu includes */\n#include <sobfu/cuda/utils.hpp>\n#include <sobfu/solver.hpp>\n\n/* kinfu includes */\n#include <kfusion/cuda/device.hpp>\n\nusing namespace kfusion;\nusing namespace kfusion::device;\n\n/*\n * POTENTIAL GRADIENT\n */\n\n__global__ void sobfu::device::calculate_potential_gradient_kernel(float2* phi_n_psi, float2* phi_global,\n                                                                   float4* nabla_phi_n_o_psi, float4* L,\n                                                                   float4* nabla_U, float w_reg, int dim_x, int dim_y,\n                                                                   int dim_z) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > dim_x - 1 || y > dim_y - 1) {\n        return;\n    }\n\n    int idx = y * dim_x + x;\n    for (int i = 0; i <= dim_z - 1; idx += dim_y * dim_x, ++i) {\n        float tsdf_n_psi  = phi_n_psi[idx].x;\n        float tsdf_global = phi_global[idx].x;\n\n        nabla_U[idx] = (tsdf_n_psi - tsdf_global) * nabla_phi_n_o_psi[idx] + w_reg * L[idx];\n    }\n}\n\nvoid sobfu::device::calculate_potential_gradient(kfusion::device::TsdfVolume& phi_n_psi,\n                                                 kfusion::device::TsdfVolume& phi_global,\n                                                 sobfu::device::TsdfGradient& nabla_phi_n_o_psi,\n                                                 sobfu::device::Laplacian& L, sobfu::device::PotentialGradient& nabla_U,\n                                                 float w_reg) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(phi_n_psi.dims.x, block.x), kfusion::device::divUp(phi_n_psi.dims.y, block.y));\n\n    calculate_potential_gradient_kernel<<<grid, block>>>(phi_n_psi.data, phi_global.data, nabla_phi_n_o_psi.data,\n                                                         L.data, nabla_U.data, w_reg, phi_n_psi.dims.x,\n                       
                                  phi_n_psi.dims.y, phi_n_psi.dims.z);\n    cudaSafeCall(cudaGetLastError());\n}\n\n/*\n * DEFORMATION FIELD\n */\n\n__global__ void sobfu::device::update_psi_kernel(float4* psi, float4* nabla_U_S, float4* updates, float alpha,\n                                                 int dim_x, int dim_y, int dim_z) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > dim_x - 1 || y > dim_y - 1) {\n        return;\n    }\n\n    int global_idx = y * dim_x + x;\n    for (int i = 0; i <= dim_z - 1; global_idx += dim_y * dim_x, ++i) {\n        float4 update = alpha * nabla_U_S[global_idx];\n\n        updates[global_idx] = update;\n        psi[global_idx] -= update;\n    }\n}\n\nvoid sobfu::device::update_psi(sobfu::device::DeformationField& psi, sobfu::device::PotentialGradient& nabla_U_S,\n                               float4* updates, float alpha) {\n    /* integrate in time */\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(psi.dims.x, block.x), kfusion::device::divUp(psi.dims.y, block.y));\n\n    update_psi_kernel<<<grid, block>>>(psi.data, nabla_U_S.data, updates, alpha, psi.dims.x, psi.dims.y, psi.dims.z);\n    cudaSafeCall(cudaGetLastError());\n}\n\n/*\n * PIPELINE\n */\n\nvoid sobfu::device::estimate_psi(SDFs& sdfs, sobfu::device::DeformationField& psi,\n                                 sobfu::device::DeformationField& psi_inv,\n                                 sobfu::device::SpatialGradients* spatial_grads, Differentiators& differentiators,\n                                 float* d_S_i, sobfu::device::Reductor* r, SolverParams& params) {\n    /* copy sobolev filter to constant memory */\n    set_convolution_kernel(d_S_i);\n\n    /* create cuda streams */\n    int no_streams = 3;\n    cudaStream_t streams[no_streams];\n    for (int i = 0; i < no_streams; i++) {\n        cudaSafeCall(cudaStreamCreate(&streams[i]));\n    }\n\n    /* calculate no. 
of blocks and no. of threads per block */\n    int3 dims = psi.dims;\n\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(dims.x, block.x), kfusion::device::divUp(dims.y, block.y));\n\n    /* apply psi to phi_n */\n    apply_kernel<<<grid, block, 0, streams[0]>>>(sdfs.phi_n, sdfs.phi_n_psi, psi);\n    cudaSafeCall(cudaGetLastError());\n\n    /* run gradient descent */\n    float2 curr_max_update_norm;\n    float e_curr = std::numeric_limits<float>::infinity();\n\n    int iter = 1;\n    while (iter <= params.max_iter) {\n        if (iter == 1 || iter % 50 == 0) {\n            std::cout << \"iter. no. \" << iter << std::endl;\n        }\n\n        /* calculate the gradient of phi_n */\n        estimate_gradient_kernel<<<grid, block, 0, streams[0]>>>(differentiators.tsdf_diff,\n                                                                 *(spatial_grads->nabla_phi_n_o_psi));\n        cudaSafeCall(cudaGetLastError());\n        /* calculate the jacobian of psi */\n        estimate_deformation_jacobian_kernel<<<grid, block, 0, streams[1]>>>(differentiators.diff, *(spatial_grads->J));\n        cudaSafeCall(cudaGetLastError());\n        /* calculate the laplacian of psi */\n        estimate_laplacian_kernel<<<grid, block, 0, streams[2]>>>(differentiators.second_order_diff,\n                                                                  *(spatial_grads->L));\n        cudaSafeCall(cudaGetLastError());\n\n        /* calculate current value of the energy functional */\n        if ((params.verbosity == 1 && (iter == 1 || iter % 50 == 0 || iter == params.max_iter) ||\n             params.verbosity == 2)) {\n            /* data term */\n            float e_data = r->data_energy(sdfs.phi_global.data, sdfs.phi_n_psi.data);\n            /* regularisation term */\n            float e_reg = r->reg_energy_sobolev(spatial_grads->J->data);\n\n            e_curr = e_data + params.w_reg * e_reg;\n            std::cout << \"data energy + w_reg * reg energy = \" << e_data 
<< \" + \" << params.w_reg << \" * \" << e_reg\n                      << \" = \" << e_curr << std::endl;\n        }\n\n        /*\n         * PDE'S\n         */\n\n        /* calculate gradient of the potential */\n        sobfu::device::calculate_potential_gradient(sdfs.phi_n_psi, sdfs.phi_global,\n                                                    *(spatial_grads->nabla_phi_n_o_psi), *(spatial_grads->L),\n                                                    *(spatial_grads->nabla_U), params.w_reg);\n        cudaSafeCall(cudaGetLastError());\n\n        /* convolve gradient of the potential with a sobolev kernel */\n        sobfu::device::convolution_rows((*(spatial_grads->nabla_U_S)).data, (*(spatial_grads->nabla_U)).data, dims.x,\n                                        dims.y, dims.z);\n        sobfu::device::convolution_columns((*(spatial_grads->nabla_U_S)).data, (*(spatial_grads->nabla_U)).data, dims.x,\n                                           dims.y, dims.z);\n        sobfu::device::convolution_depth((*(spatial_grads->nabla_U_S)).data, (*(spatial_grads->nabla_U)).data, dims.x,\n                                         dims.y, dims.z);\n\n        /* update psi */\n        update_psi_kernel<<<grid, block, 0, streams[0]>>>(psi.data, (*(spatial_grads->nabla_U_S)).data, r->updates,\n                                                          params.alpha, dims.x, dims.y, dims.z);\n        cudaSafeCall(cudaGetLastError());\n\n        /* apply psi to phi_n */\n        apply_kernel<<<grid, block, 0, streams[0]>>>(sdfs.phi_n, sdfs.phi_n_psi, psi);\n        cudaSafeCall(cudaGetLastError());\n\n        /* get value of the max. 
update norm at the current iteration of the solver */\n        curr_max_update_norm = r->max_update_norm();\n        if ((params.verbosity == 1 && (iter == 1 || iter % 50 == 0 || iter == params.max_iter) ||\n             params.verbosity == 2)) {\n            int idx_x = curr_max_update_norm.y / (psi.dims.x * psi.dims.y);\n            int idx_y = (curr_max_update_norm.y - idx_x * psi.dims.x * psi.dims.y) / psi.dims.x;\n            int idx_z = curr_max_update_norm.y - psi.dims.x * (idx_y + psi.dims.y * idx_x);\n\n            std::cout << \"max. update norm \" << curr_max_update_norm.x << \" at voxel (\" << idx_z << \", \" << idx_y\n                      << \", \" << idx_x << \")\" << std::endl;\n        }\n\n        if (curr_max_update_norm.x <= params.max_update_norm) {\n            std::cout << \"SOLVER CONVERGED AFTER \" << iter << \" ITERATIONS\" << std::endl;\n            break;\n        }\n\n        if (iter == params.max_iter) {\n            std::cout << \"SOLVER REACHED MAX. NO. OF ITERATIONS WITHOUT CONVERGING\" << std::endl;\n        }\n\n        iter++;\n    }\n\n    /* iteratively estimate the inverse deformation field */\n    sobfu::device::init_identity(psi_inv);\n    sobfu::device::estimate_inverse(psi, psi_inv);\n    /* apply psi_inv to phi_global */\n    apply_kernel<<<grid, block>>>(sdfs.phi_global, sdfs.phi_global_psi_inv, psi_inv);\n    cudaSafeCall(cudaGetLastError());\n\n    for (int i = 0; i < no_streams; i++) {\n        cudaSafeCall(cudaStreamDestroy(streams[i]));\n    }\n}\n\n/*\n * CONVOLUTIONS\n */\n\n#define KERNEL_RADIUS 3\n#define KERNEL_LENGTH (2 * KERNEL_RADIUS + 1)\n\n#define ROWS_BLOCKDIM_X 4\n#define ROWS_BLOCKDIM_Y 64\n#define ROWS_RESULT_STEPS 4\n#define ROWS_HALO_STEPS 1\n\n#define COLUMNS_BLOCKDIM_X 64\n#define COLUMNS_BLOCKDIM_Y 4\n#define COLUMNS_RESULT_STEPS 4\n#define COLUMNS_HALO_STEPS 1\n\n#define DEPTH_BLOCKDIM_X 64\n#define DEPTH_BLOCKDIM_Z 4\n#define DEPTH_RESULT_STEPS 4\n#define DEPTH_HALO_STEPS 1\n\n__constant__ 
float S[KERNEL_LENGTH];\n\nvoid sobfu::device::set_convolution_kernel(float* d_kernel) {\n    cudaSafeCall(cudaMemcpyToSymbol(S, d_kernel, KERNEL_LENGTH * sizeof(float), 0, cudaMemcpyDeviceToDevice));\n    cudaSafeCall(cudaGetLastError());\n}\n\n/*** ROW CONVOLUTION ***/\n__global__ void sobfu::device::convolution_rows_kernel(float4* d_dst, float4* d_src, int image_w, int image_h,\n                                                       int image_d) {\n    __shared__ float4 s_data[ROWS_BLOCKDIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X];\n\n    /* offset to the left halo edge */\n    const int base_x = (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + threadIdx.x;\n    const int base_y = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y;\n    const int base_z = blockIdx.z;\n\n    const int first_pixel_in_line = ROWS_BLOCKDIM_X * ROWS_HALO_STEPS - threadIdx.x;\n    const int last_pixel_in_line  = image_w - base_x - 1;\n\n    d_dst += base_z * image_h * image_w + base_y * image_w + base_x;\n    d_src += base_z * image_h * image_w + base_y * image_w + base_x;\n\n    /* load main data */\n#pragma unroll\n    for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) {\n        s_data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =\n            (image_w - base_x > i * ROWS_BLOCKDIM_X) ? d_src[i * ROWS_BLOCKDIM_X] : d_src[last_pixel_in_line];\n    }\n\n    /* load left halo */\n#pragma unroll\n    for (int i = 0; i < ROWS_HALO_STEPS; i++) {\n        s_data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =\n            (base_x >= -i * ROWS_BLOCKDIM_X) ? d_src[i * ROWS_BLOCKDIM_X] : d_src[first_pixel_in_line];\n    }\n\n    /* load right halo */\n#pragma unroll\n    for (int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++) {\n        s_data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =\n            (image_w - base_x > i * ROWS_BLOCKDIM_X) ? 
d_src[i * ROWS_BLOCKDIM_X] : d_src[last_pixel_in_line];\n    }\n\n    /* compute and store results */\n    __syncthreads();\n\n    /* this pixel is not part of the iamge and doesn't need to be convolved */\n    if (base_y >= image_h) {\n        return;\n    }\n#pragma unroll\n    for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) {\n        if (image_w - base_x > i * ROWS_BLOCKDIM_X) {\n            float4 sum = make_float4(0.f, 0.f, 0.f, 0.f);\n\n#pragma unroll\n            for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {\n                sum += S[KERNEL_RADIUS - j] * s_data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j];\n            }\n\n            d_dst[i * ROWS_BLOCKDIM_X] = sum;\n        }\n    }\n}\n\nvoid sobfu::device::convolution_rows(float4* d_dst, float4* d_src, int image_w, int image_h, int image_d) {\n    int blocks_x =\n        image_w / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X) + min(1, image_w % (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X));\n    int blocks_y = image_h / ROWS_BLOCKDIM_Y + min(1, image_h % ROWS_BLOCKDIM_Y);\n    int blocks_z = image_d;\n\n    dim3 blocks(blocks_x, blocks_y, blocks_z);\n    dim3 threads(ROWS_BLOCKDIM_X, ROWS_BLOCKDIM_Y, 1);\n\n    convolution_rows_kernel<<<blocks, threads>>>(d_dst, d_src, image_w, image_h, image_d);\n    cudaSafeCall(cudaGetLastError());\n}\n\n/*** COLUMMN CONVOLUTION ***/\n__global__ void sobfu::device::convolution_columns_kernel(float4* d_dst, float4* d_src, int image_w, int image_h,\n                                                          int image_d) {\n    __shared__ float4\n        s_data[COLUMNS_BLOCKDIM_X][(COLUMNS_RESULT_STEPS + 2 * COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + 1];\n\n    /* offset to the upper halo edge */\n    const int base_x = blockIdx.x * COLUMNS_BLOCKDIM_X + threadIdx.x;\n    const int base_y = (blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + threadIdx.y;\n    const int base_z = blockIdx.z;\n\n    const int 
first_pixel_in_line = (COLUMNS_BLOCKDIM_Y * COLUMNS_HALO_STEPS - threadIdx.y) * image_w;\n    const int last_pixel_in_line  = (image_h - base_y - 1) * image_w;\n\n    d_dst += base_z * image_h * image_w + base_y * image_w + base_x;\n    d_src += base_z * image_h * image_w + base_y * image_w + base_x;\n\n    /* main data */\n#pragma unroll\n    for (int i = COLUMNS_HALO_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) {\n        s_data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = (image_h - base_y > i * COLUMNS_BLOCKDIM_Y)\n                                                                        ? d_src[i * COLUMNS_BLOCKDIM_Y * image_w]\n                                                                        : d_src[last_pixel_in_line];\n    }\n\n    /* upper halo */\n#pragma unroll\n    for (int i = 0; i < COLUMNS_HALO_STEPS; i++) {\n        s_data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =\n            (base_y >= -i * COLUMNS_BLOCKDIM_Y) ? d_src[i * COLUMNS_BLOCKDIM_Y * image_w] : d_src[first_pixel_in_line];\n    }\n\n    /* lower halo */\n#pragma unroll\n    for (int i = COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS;\n         i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS; i++) {\n        s_data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = (image_h - base_y > i * COLUMNS_BLOCKDIM_Y)\n                                                                        ? 
d_src[i * COLUMNS_BLOCKDIM_Y * image_w]\n                                                                        : d_src[last_pixel_in_line];\n    }\n\n    /* compute and store results */\n    __syncthreads();\n\n    /* this pixel isn't part of hte image and doesn't need to be convolved */\n    if (base_x >= image_w) {\n        return;\n    }\n\n#pragma unroll\n    for (int i = COLUMNS_HALO_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) {\n        if (image_h - base_y > i * COLUMNS_BLOCKDIM_Y) {\n            float4 sum = make_float4(0.f, 0.f, 0.f, 0.f);\n#pragma unroll\n            for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {\n                sum += S[KERNEL_RADIUS - j] * s_data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j];\n            }\n\n            d_dst[i * COLUMNS_BLOCKDIM_Y * image_w] += sum;\n        }\n    }\n}\n\nvoid sobfu::device::convolution_columns(float4* d_dst, float4* d_src, int image_w, int image_h, int image_d) {\n    int blocks_x = image_w / COLUMNS_BLOCKDIM_X + min(1, image_w % COLUMNS_BLOCKDIM_X);\n    int blocks_y = image_h / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y) +\n                   min(1, image_h % (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y));\n    int blocks_z = image_d;\n\n    dim3 blocks(blocks_x, blocks_y, blocks_z);\n    dim3 threads(COLUMNS_BLOCKDIM_X, COLUMNS_BLOCKDIM_Y, 1);\n\n    convolution_columns_kernel<<<blocks, threads>>>(d_dst, d_src, image_w, image_h, image_d);\n    cudaSafeCall(cudaGetLastError());\n}\n\n/*** DEPTH CONVOLUTION ***/\n__global__ void sobfu::device::convolution_depth_kernel(float4* d_dst, float4* d_src, int image_w, int image_h,\n                                                        int image_d) {\n    /* here it is [x][z] as we leave out y bc it has a size of 1 */\n    __shared__ float4 s_data[DEPTH_BLOCKDIM_X][(DEPTH_RESULT_STEPS + 2 * DEPTH_HALO_STEPS) * DEPTH_BLOCKDIM_Z + 1];\n\n    /* offset to the upper halo edge */\n    const int base_x = blockIdx.x * 
DEPTH_BLOCKDIM_X + threadIdx.x;\n    const int base_y = blockIdx.y;\n    const int base_z = (blockIdx.z * DEPTH_RESULT_STEPS - DEPTH_HALO_STEPS) * DEPTH_BLOCKDIM_Z + threadIdx.z;\n\n    const int first_pixel_in_line = (DEPTH_BLOCKDIM_Z * DEPTH_HALO_STEPS - threadIdx.z) * image_w * image_h;\n    const int last_pixel_in_line  = (image_d - base_z - 1) * image_w * image_h;\n\n    d_dst += base_z * image_h * image_w + base_y * image_w + base_x;\n    d_src += base_z * image_h * image_w + base_y * image_w + base_x;\n\n    /* main data */\n#pragma unroll\n    for (int i = DEPTH_HALO_STEPS; i < DEPTH_HALO_STEPS + DEPTH_RESULT_STEPS; i++) {\n        s_data[threadIdx.x][threadIdx.z + i * DEPTH_BLOCKDIM_Z] = (image_d - base_z > i * DEPTH_BLOCKDIM_Z)\n                                                                      ? d_src[i * DEPTH_BLOCKDIM_Z * image_w * image_h]\n                                                                      : d_src[last_pixel_in_line];\n    }\n\n    /* upper halo */\n#pragma unroll\n    for (int i = 0; i < DEPTH_HALO_STEPS; i++) {\n        s_data[threadIdx.x][threadIdx.z + i * DEPTH_BLOCKDIM_Z] = (base_z >= -i * DEPTH_BLOCKDIM_Z)\n                                                                      ? d_src[i * DEPTH_BLOCKDIM_Z * image_w * image_h]\n                                                                      : d_src[first_pixel_in_line];\n    }\n\n    /* lower halo */\n#pragma unroll\n    for (int i = DEPTH_HALO_STEPS + DEPTH_RESULT_STEPS; i < DEPTH_HALO_STEPS + DEPTH_RESULT_STEPS + DEPTH_HALO_STEPS;\n         i++) {\n        s_data[threadIdx.x][threadIdx.z + i * DEPTH_BLOCKDIM_Z] = (image_d - base_z > i * DEPTH_BLOCKDIM_Z)\n                                                                      ? 
d_src[i * DEPTH_BLOCKDIM_Z * image_w * image_h]\n                                                                      : d_src[last_pixel_in_line];\n    }\n\n    /* compute and store results */\n    __syncthreads();\n\n    /* this pixel is not part of the image and doesn't need to be convolved */\n    if (base_x >= image_w) {\n        return;\n    }\n\n#pragma unroll\n    for (int i = DEPTH_HALO_STEPS; i < DEPTH_HALO_STEPS + DEPTH_RESULT_STEPS; i++) {\n        if (image_d - base_z > i * DEPTH_BLOCKDIM_Z) {\n            float4 sum = make_float4(0.f, 0.f, 0.f, 0.f);\n#pragma unroll\n            for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {\n                sum += S[KERNEL_RADIUS - j] * s_data[threadIdx.x][threadIdx.z + i * DEPTH_BLOCKDIM_Z + j];\n            }\n\n            d_dst[i * DEPTH_BLOCKDIM_Z * image_w * image_h] += sum;\n        }\n    }\n}\n\nvoid sobfu::device::convolution_depth(float4* d_dst, float4* d_src, int image_w, int image_h, int image_d) {\n    int blocks_x = image_w / DEPTH_BLOCKDIM_X + min(1, image_w % DEPTH_BLOCKDIM_X);\n    int blocks_y = image_h;\n    int blocks_z =\n        image_d / (DEPTH_RESULT_STEPS * DEPTH_BLOCKDIM_Z) + min(1, image_d % (DEPTH_RESULT_STEPS * DEPTH_BLOCKDIM_Z));\n\n    dim3 blocks(blocks_x, blocks_y, blocks_z);\n    dim3 threads(DEPTH_BLOCKDIM_X, 1, DEPTH_BLOCKDIM_Z);\n\n    convolution_depth_kernel<<<blocks, threads>>>(d_dst, d_src, image_w, image_h, image_d);\n    cudaSafeCall(cudaGetLastError());\n}\n"
  },
  {
    "path": "src/sobfu/cuda/vector_fields.cu",
    "content": "/* sobfu includes */\n#include <sobfu/cuda/utils.hpp>\n#include <sobfu/vector_fields.hpp>\n\n/* kinfu includes */\n#include <kfusion/cuda/device.hpp>\n\nusing namespace kfusion::device;\n\n/*\n * VECTOR FIELD\n */\n\n__device__ __forceinline__ float4* sobfu::device::VectorField::beg(int x, int y) const { return data + x + dims.x * y; }\n\n__device__ __forceinline__ float4* sobfu::device::VectorField::zstep(float4* const ptr) const {\n    return ptr + dims.x * dims.y;\n}\n\n__device__ __forceinline__ float4* sobfu::device::VectorField::operator()(int x, int y, int z) const {\n    return data + x + y * dims.x + z * dims.y * dims.x;\n}\n\n__device__ __forceinline__ float4 sobfu::device::VectorField::get_displacement(int x, int y, int z) const {\n    return *(data + z * dims.y * dims.x + y * dims.x + x) - make_float4((float) x, (float) y, (float) z, 0.f);\n}\n\nvoid sobfu::device::clear(VectorField& field) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(field.dims.x, block.x), kfusion::device::divUp(field.dims.y, block.y));\n\n    clear_kernel<<<grid, block>>>(field);\n    cudaSafeCall(cudaGetLastError());\n}\n\n__global__ void sobfu::device::clear_kernel(sobfu::device::VectorField field) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > field.dims.x - 1 || y > field.dims.y - 1) {\n        return;\n    }\n\n    float4* beg = field.beg(x, y);\n    float4* end = beg + field.dims.x * field.dims.y * field.dims.z;\n\n    for (float4* pos = beg; pos != end; pos = field.zstep(pos)) {\n        *pos = make_float4(0.f, 0.f, 0.f, 0.f);\n    }\n}\n\n/*\n * DEFORMATION FIELD\n */\n\nvoid sobfu::device::init_identity(sobfu::device::DeformationField& psi) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(psi.dims.x, block.x), kfusion::device::divUp(psi.dims.y, block.y));\n\n    init_identity_kernel<<<grid, block>>>(psi);\n    
cudaSafeCall(cudaGetLastError());\n}\n\n__global__ void sobfu::device::init_identity_kernel(sobfu::device::DeformationField psi) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > psi.dims.x - 1 || y > psi.dims.y - 1) {\n        return;\n    }\n\n    float4 idx   = make_float4((float) x, (float) y, 0.f, 0.f);\n    float4 zstep = make_float4(0.f, 0.f, 1.f, 0.f);\n\n    float4* pos = psi.beg(x, y);\n    for (int i = 0; i <= psi.dims.z - 1; idx += zstep, pos = psi.zstep(pos), ++i) {\n        *pos = idx;\n    }\n}\n\n__global__ void sobfu::device::apply_kernel(const kfusion::device::TsdfVolume phi,\n                                            kfusion::device::TsdfVolume phi_warped,\n                                            const sobfu::device::DeformationField psi) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > phi_warped.dims.x - 1 || y > phi_warped.dims.y - 1) {\n        return;\n    }\n\n    float4* psi_ptr        = psi.beg(x, y);\n    float2* phi_warped_ptr = phi_warped.beg(x, y);\n    for (int i   = 0; i <= phi_warped.dims.z - 1;\n         psi_ptr = psi.zstep(psi_ptr), phi_warped_ptr = phi_warped.zstep(phi_warped_ptr), ++i) {\n        float4 psi_val = *psi_ptr;\n\n        float2 tsdf_deformed = interpolate_tsdf(phi, trunc(psi_val));\n        *phi_warped_ptr      = tsdf_deformed;\n    }\n}\n\nvoid sobfu::device::apply(const kfusion::device::TsdfVolume& phi, kfusion::device::TsdfVolume& phi_warped,\n                          const sobfu::device::DeformationField& psi) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(phi.dims.x, block.x), kfusion::device::divUp(phi.dims.y, block.y));\n\n    apply_kernel<<<grid, block>>>(phi, phi_warped, psi);\n    cudaSafeCall(cudaGetLastError());\n}\n\n__global__ void sobfu::device::estimate_inverse_kernel(sobfu::device::DeformationField psi,\n                              
                         sobfu::device::DeformationField psi_inv) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > psi_inv.dims.x - 1 || y > psi_inv.dims.y - 1) {\n        return;\n    }\n\n    float4* psi_inv_ptr = psi_inv.beg(x, y);\n    for (int i = 0; i <= psi_inv.dims.z - 1; psi_inv_ptr = psi_inv.zstep(psi_inv_ptr), ++i) {\n        float4 psi_inv_val = *psi_inv_ptr;\n        *psi_inv_ptr =\n            make_float4((float) x, (float) y, (float) i, 0.f) - 1.f * interpolate_field_inv(psi, trunc(psi_inv_val));\n    }\n}\n\nvoid sobfu::device::estimate_inverse(sobfu::device::DeformationField& psi,\n                                     sobfu::device::DeformationField& psi_inverse) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(psi_inverse.dims.x, block.x), kfusion::device::divUp(psi_inverse.dims.y, block.y));\n\n    /* estimate inverse */\n    for (int iter = 0; iter < 48; ++iter) {\n        estimate_inverse_kernel<<<grid, block>>>(psi, psi_inverse);\n        cudaSafeCall(cudaGetLastError());\n    }\n}\n\n/*\n * TSDF DIFFERENTIATOR METHODS\n */\n\n__global__ void sobfu::device::estimate_gradient_kernel(const sobfu::device::TsdfDifferentiator diff,\n                                                        sobfu::device::TsdfGradient grad) {\n    diff(grad);\n}\n\nvoid sobfu::device::TsdfDifferentiator::calculate(sobfu::device::TsdfGradient& grad) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(grad.dims.x, block.x), kfusion::device::divUp(grad.dims.y, block.y));\n\n    estimate_gradient_kernel<<<grid, block>>>(*this, grad);\n    cudaSafeCall(cudaGetLastError());\n}\n\n__device__ __forceinline__ void sobfu::device::TsdfDifferentiator::operator()(sobfu::device::TsdfGradient& grad) const {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > vol.dims.x - 1 || y > vol.dims.y - 1) {\n        return;\n   
 }\n\n    int idx_x_1 = x + 1;\n    int idx_x_2 = x - 1;\n    if (x == 0) {\n        idx_x_2 = x + 1;\n    } else if (x == vol.dims.x - 1) {\n        idx_x_1 = x - 1;\n    }\n\n    int idx_y_1 = y + 1;\n    int idx_y_2 = y - 1;\n    if (y == 0) {\n        idx_y_2 = y + 1;\n    } else if (y == vol.dims.y - 1) {\n        idx_y_1 = y - 1;\n    }\n\n    float4* grad_ptr = grad.beg(x, y);\n\n#pragma unroll\n    for (int i = 0; i <= vol.dims.z - 1; grad_ptr = grad.zstep(grad_ptr), ++i) {\n        int idx_z_1 = i + 1;\n        int idx_z_2 = i - 1;\n        if (i == 0) {\n            idx_z_2 = i + 1;\n        } else if (i == vol.dims.z - 1) {\n            idx_z_1 = i - 1;\n        }\n\n        float Fx1 = (*vol(idx_x_1, y, i)).x;\n        float Fx2 = (*vol(idx_x_2, y, i)).x;\n        float n_x = __fdividef(Fx1 - Fx2, 2.f);\n\n        float Fy1 = (*vol(x, idx_y_1, i)).x;\n        float Fy2 = (*vol(x, idx_y_2, i)).x;\n        float n_y = __fdividef(Fy1 - Fy2, 2.f);\n\n        float Fz1 = (*vol(x, y, idx_z_1)).x;\n        float Fz2 = (*vol(x, y, idx_z_2)).x;\n        float n_z = __fdividef(Fz1 - Fz2, 2.f);\n\n        float4 n  = make_float4(n_x, n_y, n_z, 0.f);\n        *grad_ptr = n;\n    }\n}\n\n__global__ void sobfu::device::interpolate_gradient_kernel(sobfu::device::TsdfGradient nabla_phi_n_psi,\n                                                           sobfu::device::TsdfGradient nabla_phi_n_psi_t,\n                                                           sobfu::device::DeformationField psi) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > psi.dims.x - 1 || y > psi.dims.y - 1) {\n        return;\n    }\n\n    float3 idx   = make_float3(x, y, 0.f);\n    float3 zstep = make_float3(0.f, 0.f, 1.f);\n\n    int global_idx = y * nabla_phi_n_psi.dims.x + x;\n\n    float4* nabla_phi_n_psi_t_ptr = nabla_phi_n_psi_t.beg(x, y);\n    for (int i = 0; i <= psi.dims.z - 1; nabla_phi_n_psi_t_ptr = 
nabla_phi_n_psi_t.zstep(nabla_phi_n_psi_t_ptr),\n             global_idx += nabla_phi_n_psi.dims.x * nabla_phi_n_psi.dims.y, idx += zstep, ++i) {\n        float4 psi_val         = psi.data[global_idx];\n        *nabla_phi_n_psi_t_ptr = interpolate_field(nabla_phi_n_psi, trunc(psi_val));\n    }\n}\n\nvoid sobfu::device::interpolate_gradient(sobfu::device::TsdfGradient& nabla_phi_n_psi,\n                                         sobfu::device::TsdfGradient& nabla_phi_n_psi_t,\n                                         sobfu::device::DeformationField& psi) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(psi.dims.x, block.x), kfusion::device::divUp(psi.dims.y, block.y));\n\n    interpolate_gradient_kernel<<<grid, block>>>(nabla_phi_n_psi, nabla_phi_n_psi_t, psi);\n    cudaSafeCall(cudaGetLastError());\n}\n\n/*\n * LAPLACIAN\n */\n\n__global__ void sobfu::device::interpolate_laplacian_kernel(sobfu::device::Laplacian L,\n                                                            sobfu::device::Laplacian L_o_psi,\n                                                            sobfu::device::DeformationField psi) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > psi.dims.x - 1 || y > psi.dims.y - 1) {\n        return;\n    }\n\n    float4* psi_ptr     = psi.beg(x, y);\n    float4* L_o_psi_ptr = L_o_psi.beg(x, y);\n    for (int i = 0; i <= psi.dims.z - 1; psi_ptr = psi.zstep(psi_ptr), L_o_psi_ptr = L_o_psi.zstep(L_o_psi_ptr), ++i) {\n        float4 psi_val = *psi_ptr;\n        *L_o_psi_ptr   = interpolate_field(L, trunc(psi_val));\n    }\n}\n\nvoid sobfu::device::interpolate_laplacian(sobfu::device::Laplacian& L, sobfu::device::Laplacian& L_o_psi,\n                                          sobfu::device::DeformationField& psi) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(psi.dims.x, block.x), kfusion::device::divUp(psi.dims.y, block.y));\n\n    
interpolate_laplacian_kernel<<<grid, block>>>(L, L_o_psi, psi);\n    cudaSafeCall(cudaGetLastError());\n}\n\n/*\n * SECOND ORDER DIFFERENTIATOR METHODS\n */\n\nvoid sobfu::device::SecondOrderDifferentiator::calculate(sobfu::device::Laplacian& L) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(L.dims.x, block.x), kfusion::device::divUp(L.dims.y, block.y));\n\n    estimate_laplacian_kernel<<<grid, block>>>(*this, L);\n    cudaSafeCall(cudaGetLastError());\n}\n\n__global__ void sobfu::device::estimate_laplacian_kernel(const sobfu::device::SecondOrderDifferentiator diff,\n                                                         sobfu::device::Laplacian L) {\n    diff.laplacian(L);\n}\n\n__device__ __forceinline__ void sobfu::device::SecondOrderDifferentiator::laplacian(sobfu::device::Laplacian& L) const {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > L.dims.x - 1 || y > L.dims.y - 1) {\n        return;\n    }\n\n    int idx_x_1 = x + 1;\n    int idx_x_2 = x - 1;\n    if (x == 0) {\n        idx_x_1 = x;\n        idx_x_2 = x;\n    } else if (x == L.dims.x - 1) {\n        idx_x_1 = x;\n        idx_x_2 = x;\n    }\n\n    int idx_y_1 = y + 1;\n    int idx_y_2 = y - 1;\n    if (y == 0) {\n        idx_y_1 = y;\n        idx_y_2 = y;\n    } else if (y == L.dims.y - 1) {\n        idx_y_1 = y;\n        idx_y_2 = y;\n    }\n\n    float4* L_ptr = L.beg(x, y);\n\n#pragma unroll\n    for (int i = 0; i <= L.dims.z - 1; L_ptr = L.zstep(L_ptr), ++i) {\n        int idx_z_1 = i + 1;\n        int idx_z_2 = i - 1;\n        if (i == 0) {\n            idx_z_1 = i;\n            idx_z_2 = i;\n        } else if (i == L.dims.z - 1) {\n            idx_z_1 = i;\n            idx_z_2 = i;\n        }\n\n        float4 L_val = -6.f * *psi(x, y, i) + *psi(idx_x_1, y, i) + *psi(idx_x_2, y, i) + *psi(x, idx_y_1, i) +\n                       *psi(x, idx_y_2, i) + *psi(x, y, idx_z_1) + *psi(x, y, idx_z_2);\n        
*L_ptr = -1.f * L_val;\n    }\n}\n\n/*\n * JACOBIAN\n */\n\n__device__ __forceinline__ Mat4f* sobfu::device::Jacobian::beg(int x, int y) const { return data + x + dims.x * y; }\n\n__device__ __forceinline__ Mat4f* sobfu::device::Jacobian::zstep(Mat4f* const ptr) const {\n    return ptr + dims.x * dims.y;\n}\n\n__device__ __forceinline__ Mat4f* sobfu::device::Jacobian::operator()(int x, int y, int z) const {\n    return data + x + y * dims.x + z * dims.y * dims.x;\n}\n\nvoid sobfu::device::clear(Jacobian& J) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(J.dims.x, block.x), kfusion::device::divUp(J.dims.y, block.y));\n\n    clear_jacobian_kernel<<<grid, block>>>(J);\n    cudaSafeCall(cudaGetLastError());\n    cudaSafeCall(cudaDeviceSynchronize());\n}\n\n__global__ void sobfu::device::clear_jacobian_kernel(sobfu::device::Jacobian J) {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > J.dims.x - 1 || y > J.dims.y - 1) {\n        return;\n    }\n\n    Mat4f* beg = J.beg(x, y);\n    Mat4f* end = beg + J.dims.x * J.dims.y * J.dims.z;\n\n    for (Mat4f* pos = beg; pos != end; pos = J.zstep(pos)) {\n        float4 g = make_float4(0.f, 0.f, 0.f, 0.f);\n\n        Mat4f val;\n        val.data[0] = g;\n        val.data[1] = g;\n        val.data[2] = g;\n\n        *pos = val;\n    }\n}\n\n/*\n * DIFFERENTIATOR METHODS\n */\n\nvoid sobfu::device::Differentiator::calculate(sobfu::device::Jacobian& J) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(J.dims.x, block.x), kfusion::device::divUp(J.dims.y, block.y));\n\n    estimate_jacobian_kernel<<<grid, block>>>(*this, J);\n    cudaSafeCall(cudaGetLastError());\n}\n\nvoid sobfu::device::Differentiator::calculate_deformation_jacobian(sobfu::device::Jacobian& J) {\n    dim3 block(64, 16);\n    dim3 grid(kfusion::device::divUp(J.dims.x, block.x), kfusion::device::divUp(J.dims.y, block.y));\n\n    
estimate_deformation_jacobian_kernel<<<grid, block>>>(*this, J);\n    cudaSafeCall(cudaGetLastError());\n}\n\n__global__ void sobfu::device::estimate_jacobian_kernel(const sobfu::device::Differentiator diff,\n                                                        sobfu::device::Jacobian J) {\n    diff(J, 0);\n}\n\n__global__ void sobfu::device::estimate_deformation_jacobian_kernel(const sobfu::device::Differentiator diff,\n                                                                    sobfu::device::Jacobian J) {\n    diff(J, 1);\n}\n\n__device__ __forceinline__ void sobfu::device::Differentiator::operator()(sobfu::device::Jacobian& J, int mode) const {\n    int x = threadIdx.x + blockIdx.x * blockDim.x;\n    int y = threadIdx.y + blockIdx.y * blockDim.y;\n\n    if (x > psi.dims.x - 1 || y > psi.dims.y - 1) {\n        return;\n    }\n\n    int idx_x_1 = x + 1;\n    int idx_x_2 = x - 1;\n    if (x == 0) {\n        idx_x_2 = x + 1;\n    } else if (x == psi.dims.x - 1) {\n        idx_x_1 = x - 1;\n    }\n\n    int idx_y_1 = y + 1;\n    int idx_y_2 = y - 1;\n    if (y == 0) {\n        idx_y_2 = y + 1;\n    } else if (y == psi.dims.y - 1) {\n        idx_y_1 = y - 1;\n    }\n\n    Mat4f* J_ptr = J.beg(x, y);\n\n#pragma unroll\n    for (int i = 0; i <= psi.dims.z - 1; J_ptr = J.zstep(J_ptr), ++i) {\n        int idx_z_1 = i + 1;\n        int idx_z_2 = i - 1;\n        if (i == 0) {\n            idx_z_2 = i + 1;\n        } else if (i == psi.dims.z - 1) {\n            idx_z_1 = i - 1;\n        }\n\n        float4 J_x;\n        float4 J_y;\n        float4 J_z;\n\n        if (mode == 0) {\n            J_x = (*psi(idx_x_1, y, i) - *psi(idx_x_2, y, i)) / 2.f;\n            J_y = (*psi(x, idx_y_1, i) - *psi(x, idx_y_2, i)) / 2.f;\n            J_z = (*psi(x, y, idx_z_1) - *psi(x, y, idx_z_2)) / 2.f;\n        } else if (mode == 1) {\n            J_x = (psi.get_displacement(idx_x_1, y, i) - psi.get_displacement(idx_x_2, y, i)) / 2.f;\n            J_y = (psi.get_displacement(x, 
idx_y_1, i) - psi.get_displacement(x, idx_y_2, i)) / 2.f;\n            J_z = (psi.get_displacement(x, y, idx_z_1) - psi.get_displacement(x, y, idx_z_2)) / 2.f;\n        }\n\n        Mat4f val;\n        val.data[0] = make_float4(J_x.x, J_y.x, J_z.x, 0.f);\n        val.data[1] = make_float4(J_x.y, J_y.y, J_z.y, 0.f);\n        val.data[2] = make_float4(J_x.z, J_y.z, J_z.z, 0.f);\n\n        *J(x, y, i) = val;\n    }\n}\n"
  },
  {
    "path": "src/sobfu/precomp.cpp",
    "content": "/* sobfu includes */\n#include <sobfu/precomp.hpp>\n\n/* checks if x is a power of 2 */\nbool isPow2(unsigned int x) { return ((x & (x - 1)) == 0); }\n\n/* computes the nearest power of 2 larger than x */\nint nextPow2(int x) {\n    if (x < 0)\n        return 0;\n    --x;\n    x |= x >> 1;\n    x |= x >> 2;\n    x |= x >> 4;\n    x |= x >> 8;\n    x |= x >> 16;\n    return x + 1;\n}\n\nvoid get_num_blocks_and_threads(int n, int maxBlocks, int maxThreads, int &blocks, int &threads) {\n    /* get device capability, to avoid block/grid size exceed the upper bound */\n    cudaDeviceProp prop;\n    int device;\n    cudaSafeCall(cudaGetDevice(&device));\n    cudaSafeCall(cudaGetDeviceProperties(&prop, device));\n\n    threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;\n    blocks  = (n + (threads * 2 - 1)) / (threads * 2);\n\n    if ((float) threads * blocks > (float) prop.maxGridSize[0] * prop.maxThreadsPerBlock) {\n        printf(\"n is too large, please choose a smaller number!\\n\");\n    }\n\n    if (blocks > prop.maxGridSize[0]) {\n        printf(\"Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)\\n\", blocks,\n               prop.maxGridSize[0], threads * 2, threads);\n\n        blocks /= 2;\n        threads *= 2;\n    }\n\n    blocks = std::min(maxBlocks, blocks);\n}\n"
  },
  {
    "path": "src/sobfu/reductor.cpp",
    "content": "/* sobfu incldues */\n#include <sobfu/reductor.hpp>\n\n/*\n * REDUCTOR\n */\n\nsobfu::device::Reductor::Reductor(int3 dims_, float vsz_, float trunc_dist_) {\n    dims       = dims_;\n    vsz        = vsz_;\n    trunc_dist = trunc_dist_;\n\n    no_voxels = dims.x * dims.y * dims.z;\n\n    /* own kernels */\n    get_num_blocks_and_threads(no_voxels, 65536, 512, blocks, threads);\n\n    h_data_out = new float[blocks];\n    h_reg_out  = new float[blocks];\n    h_max_out  = new float2[blocks];\n\n    cudaSafeCall(cudaMalloc((void **) &d_data_out, blocks * sizeof(float)));\n    cudaSafeCall(cudaMalloc((void **) &d_reg_out, blocks * sizeof(float)));\n    cudaSafeCall(cudaMalloc((void **) &d_max_out, blocks * sizeof(float2)));\n\n    cudaSafeCall(cudaMalloc((void **) &updates, no_voxels * sizeof(float4)));\n}\n\nsobfu::device::Reductor::~Reductor() {\n    delete h_data_out, h_reg_out, h_max_out;\n\n    cudaSafeCall(cudaFree(d_data_out));\n    cudaSafeCall(cudaFree(d_reg_out));\n    cudaSafeCall(cudaFree(d_max_out));\n    cudaSafeCall(cudaFree(updates));\n}\n\nfloat sobfu::device::Reductor::data_energy(float2 *phi_global_data, float2 *phi_n_data) {\n    reduce_data(no_voxels, threads, blocks, phi_global_data, phi_n_data, d_data_out);\n    cudaSafeCall(cudaGetLastError());\n\n    return 0.5f * final_reduce(h_data_out, d_data_out, blocks);\n}\n\nfloat sobfu::device::Reductor::reg_energy_sobolev(Mat4f *J_data) {\n    reduce_reg_sobolev(no_voxels, threads, blocks, J_data, d_reg_out);\n    cudaSafeCall(cudaGetLastError());\n\n    return 0.5f * final_reduce(h_reg_out, d_reg_out, blocks);\n}\n\nfloat2 sobfu::device::Reductor::max_update_norm() {\n    reduce_max(no_voxels, threads, blocks, updates, d_max_out);\n    cudaSafeCall(cudaGetLastError());\n\n    return final_reduce_max(h_max_out, d_max_out, blocks, dims);\n}\n\nfloat2 sobfu::device::Reductor::voxel_max_energy(float2 *phi_global_data, float2 *phi_n_data, Mat4f *J_data,\n                                     
            float w_reg) {\n    reduce_voxel_max_energy(no_voxels, threads, blocks, phi_global_data, phi_n_data, J_data, w_reg, d_max_out);\n    cudaSafeCall(cudaGetLastError());\n\n    return final_reduce_max(h_max_out, d_max_out, blocks, dims);\n}\n\n/* sum partial sums from each block on cpu */\nfloat sobfu::device::final_reduce(float *h_odata, float *d_odata, int numBlocks) {\n    float result = 0.f;\n\n    /* copy result from device to host */\n    cudaSafeCall(cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(float), cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < numBlocks; i++) {\n        result += h_odata[i];\n    }\n\n    return result;\n}\n\nfloat2 sobfu::device::final_reduce_max(float2 *h_o_max_data, float2 *d_o_max_data, int numBlocks, int3 dims) {\n    float2 result = make_float2(0.f, 0.f);\n\n    /* copy result from device to host */\n    cudaSafeCall(cudaMemcpy(h_o_max_data, d_o_max_data, numBlocks * sizeof(float2), cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < numBlocks; i++) {\n        if (h_o_max_data[i].x > result.x) {\n            result = h_o_max_data[i];\n        }\n    }\n\n    return result;\n}\n"
  },
  {
    "path": "src/sobfu/scalar_fields.cpp",
    "content": "#include <sobfu/scalar_fields.hpp>\n\n/*\n * SCALAR FIELD\n */\n\nsobfu::cuda::ScalarField::ScalarField(cv::Vec3i dims_) {\n    dims = make_int3(dims_[0], dims_[1], dims_[2]);\n\n    int no_voxels = dims.x * dims.y * dims.z;\n    data.create(no_voxels * sizeof(float));\n    clear();\n}\n\nsobfu::cuda::ScalarField::~ScalarField() = default;\n\nkfusion::cuda::CudaData sobfu::cuda::ScalarField::get_data() { return data; }\n\nconst kfusion::cuda::CudaData sobfu::cuda::ScalarField::get_data() const { return data; }\n\nint3 sobfu::cuda::ScalarField::get_dims() { return dims; }\n\nvoid sobfu::cuda::ScalarField::clear() {\n    sobfu::device::ScalarField field(data.ptr<float>(), dims);\n    sobfu::device::clear(field);\n}\n\nfloat sobfu::cuda::ScalarField::sum() {\n    sobfu::device::ScalarField field(data.ptr<float>(), dims);\n\n    float result = sobfu::device::reduce_sum(field);\n    return result;\n}\n\nvoid sobfu::cuda::ScalarField::print() {\n    int sizes[3] = {dims.x, dims.y, dims.z};\n\n    cv::Mat* mat = new cv::Mat(3, sizes, CV_32FC1);\n    data.download(mat->ptr<float>());\n\n    std::cout << \"--- FIELD ---\" << std::endl;\n    for (int i = 0; i < dims.x; i++) {\n        for (int j = 0; j < dims.y; j++) {\n            for (int k = 0; k < dims.z; k++) {\n                float val = mat->at<float>(k, j, i);\n\n                std::cout << \"(x,y,z)=(\" << i << \", \" << j << \", \" << k << \"), val=(\" << val << std::endl;\n            }\n        }\n    }\n\n    delete mat;\n}\n"
  },
  {
    "path": "src/sobfu/sob_fusion.cpp",
    "content": "#include <sobfu/sob_fusion.hpp>\n\nSobFusion::SobFusion(const Params &params) : frame_counter_(0), params(params) {\n    int cols = params.cols;\n    int rows = params.rows;\n\n    dists_.create(rows, cols);\n\n    curr_.depth_pyr.resize(1);\n    curr_.normals_pyr.resize(1);\n\n    prev_.depth_pyr.resize(1);\n    prev_.normals_pyr.resize(1);\n\n    curr_.points_pyr.resize(1);\n    prev_.points_pyr.resize(1);\n\n    curr_.depth_pyr[0].create(rows, cols);\n    curr_.normals_pyr[0].create(rows, cols);\n\n    prev_.depth_pyr[0].create(rows, cols);\n    prev_.normals_pyr[0].create(rows, cols);\n\n    curr_.points_pyr[0].create(rows, cols);\n    prev_.points_pyr[0].create(rows, cols);\n\n    depths_.create(rows, cols);\n    normals_.create(rows, cols);\n    points_.create(rows, cols);\n\n    poses_.clear();\n    poses_.reserve(4096);\n    poses_.push_back(cv::Affine3f::Identity());\n\n    mc = cv::Ptr<kfusion::cuda::MarchingCubes>(new kfusion::cuda::MarchingCubes());\n    mc->setPose(params.volume_pose);\n}\n\nSobFusion::~SobFusion() = default;\n\nParams &SobFusion::getParams() { return params; }\n\npcl::PolygonMesh::Ptr SobFusion::get_phi_global_mesh() { return get_mesh(phi_global); }\n\npcl::PolygonMesh::Ptr SobFusion::get_phi_global_psi_inv_mesh() { return get_mesh(phi_global_psi_inv); }\n\npcl::PolygonMesh::Ptr SobFusion::get_phi_n_mesh() { return get_mesh(phi_n); }\n\npcl::PolygonMesh::Ptr SobFusion::get_phi_n_psi_mesh() { return get_mesh(phi_n_psi); }\n\nstd::shared_ptr<sobfu::cuda::DeformationField> SobFusion::getDeformationField() { return this->psi; }\n\n/* PIPELINE\n *\n * --- frame 0 ---\n *\n * 1. bilateral filter\n * 2. depth truncation\n * 3. initialisation of phi_global and phi_n\n *\n * --- frames n + 1 ---\n * 1. bilateral filter\n * 2. depth truncation\n * 3. initialisation of phi_n\n * 4. estimation of psi\n * 5. fusion of phi_n(psi)\n * 6. 
warp of phi_global with psi^-1\n *\n */\n\nbool SobFusion::operator()(const kfusion::cuda::Depth &depth, const kfusion::cuda::Image & /*image*/) {\n    std::cout << \"--- FRAME NO. \" << frame_counter_ << \" ---\" << std::endl;\n\n    /*\n     *  bilateral filter\n     */\n\n    kfusion::cuda::depthBilateralFilter(depth, curr_.depth_pyr[0], params.bilateral_kernel_size,\n                                        params.bilateral_sigma_spatial, params.bilateral_sigma_depth);\n\n    /*\n     * depth truncation\n     */\n\n    kfusion::cuda::depthTruncation(curr_.depth_pyr[0], params.icp_truncate_depth_dist);\n\n    /*\n     *  compute distances using depth map\n     */\n\n    kfusion::cuda::computeDists(curr_.depth_pyr[0], dists_, params.intr);\n\n    if (frame_counter_ == 0) {\n        /*\n         * INITIALISATION OF PHI_GLOBAL\n         */\n\n        phi_global = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n        phi_global->integrate(dists_, poses_.back(), params.intr);\n\n        /*\n         * INITIALISATION OF PHI_GLOBAL(PSI_INV), PHI_N, AND PHI_N(PSI)\n         */\n\n        phi_global_psi_inv = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n        phi_n              = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n        phi_n_psi          = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n\n        /*\n         * INITIALISATION OF PSI AND PSI_INV\n         */\n\n        this->psi     = std::make_shared<sobfu::cuda::DeformationField>(params.volume_dims);\n        this->psi_inv = std::make_shared<sobfu::cuda::DeformationField>(params.volume_dims);\n\n        /*\n         * INITIALISATION OF THE SOLVER\n         */\n\n        this->solver = std::make_shared<sobfu::cuda::Solver>(params);\n\n        return ++frame_counter_, true;\n    }\n\n    /*\n     * UPDATE OF PHI_N\n     */\n\n    phi_n->clear();\n    this->phi_n->integrate(dists_, 
poses_.back(), params.intr);\n\n    /*\n     * ESTIMATION OF DEFORMATION FIELD AND SURFACE FUSION\n     */\n\n    if (frame_counter_ < params.start_frame) {\n        this->phi_global->integrate(*phi_n);\n        return ++frame_counter_, true;\n    }\n\n    solver->estimate_psi(phi_global, phi_global_psi_inv, phi_n, phi_n_psi, psi, psi_inv);\n    this->phi_global->integrate(*phi_n_psi);\n\n    return ++frame_counter_, true;\n}\n\npcl::PolygonMesh::Ptr SobFusion::get_mesh(cv::Ptr<kfusion::cuda::TsdfVolume> vol) {\n    kfusion::device::DeviceArray<pcl::PointXYZ> vertices_buffer_device;\n    kfusion::device::DeviceArray<pcl::Normal> normals_buffer_device;\n\n    /* run marching cubes */\n    std::shared_ptr<kfusion::cuda::Surface> model =\n        std::make_shared<kfusion::cuda::Surface>(mc->run(*vol, vertices_buffer_device, normals_buffer_device));\n    kfusion::cuda::waitAllDefaultStream();\n\n    pcl::PolygonMesh::Ptr mesh = convert_to_mesh(model->vertices);\n    return mesh;\n}\n\npcl::PolygonMesh::Ptr SobFusion::convert_to_mesh(const kfusion::cuda::DeviceArray<pcl::PointXYZ> &triangles) {\n    if (triangles.empty()) {\n        return pcl::PolygonMesh::Ptr(new pcl::PolygonMesh());\n    }\n\n    pcl::PointCloud<pcl::PointXYZ> cloud;\n    cloud.width  = static_cast<int>(triangles.size());\n    cloud.height = 1;\n    triangles.download(cloud.points);\n\n    pcl::PolygonMesh::Ptr mesh(new pcl::PolygonMesh());\n    pcl::toPCLPointCloud2(cloud, mesh->cloud);\n\n    mesh->polygons.resize(triangles.size() / 3);\n    for (size_t i = 0; i < mesh->polygons.size(); ++i) {\n        pcl::Vertices v;\n        v.vertices.push_back(i * 3 + 0);\n        v.vertices.push_back(i * 3 + 1);\n        v.vertices.push_back(i * 3 + 2);\n        mesh->polygons[i] = v;\n    }\n\n    return mesh;\n}\n"
  },
  {
    "path": "src/sobfu/solver.cpp",
    "content": "#include <sobfu/solver.hpp>\n\n/*\n * SOLVER\n */\n\nsobfu::cuda::Solver::Solver(Params& params) {\n    /*\n     * PARAMETERS\n     */\n\n    /* volume */\n    cv::Vec3i d = params.volume_dims;\n    dims        = make_int3(d[0], d[1], d[2]);\n    no_voxels   = dims.x * dims.y * dims.z;\n\n    cv::Vec3f vsz = params.voxel_sizes();\n    voxel_sizes   = make_float3(vsz[0], vsz[1], vsz[2]);\n\n    trunc_dist = params.tsdf_trunc_dist;\n    eta        = params.eta;\n    max_weight = params.tsdf_max_weight;\n\n    /* solver */\n    solver_params.verbosity = params.verbosity;\n\n    solver_params.max_iter        = params.max_iter;\n    solver_params.max_update_norm = params.max_update_norm;\n\n    solver_params.lambda = params.lambda;\n    solver_params.s      = params.s;\n\n    solver_params.alpha = params.alpha;\n    solver_params.w_reg = params.w_reg;\n\n    /*\n     * SOLVER HELPER CLASSES\n     */\n\n    /* gradients */\n    spatial_grads = new sobfu::cuda::SpatialGradients(d);\n\n    nabla_phi_n = new sobfu::device::TsdfGradient(spatial_grads->nabla_phi_n->get_data().ptr<float4>(), dims);\n    nabla_phi_n_o_psi =\n        new sobfu::device::TsdfGradient(spatial_grads->nabla_phi_n_o_psi->get_data().ptr<float4>(), dims);\n    J           = new sobfu::device::Jacobian(spatial_grads->J->get_data().ptr<Mat4f>(), dims);\n    J_inv       = new sobfu::device::Jacobian(spatial_grads->J_inv->get_data().ptr<Mat4f>(), dims);\n    L           = new sobfu::device::Laplacian(spatial_grads->L->get_data().ptr<float4>(), dims);\n    L_o_psi_inv = new sobfu::device::Laplacian(spatial_grads->L_o_psi_inv->get_data().ptr<float4>(), dims);\n    nabla_U     = new sobfu::device::PotentialGradient(spatial_grads->nabla_U->get_data().ptr<float4>(), dims);\n    nabla_U_S   = new sobfu::device::PotentialGradient(spatial_grads->nabla_U_S->get_data().ptr<float4>(), dims);\n\n    spatial_grads_device = new sobfu::device::SpatialGradients(nabla_phi_n, nabla_phi_n_o_psi, J, J_inv, L, 
L_o_psi_inv,\n                                                               nabla_U, nabla_U_S);\n\n    /* reductor */\n    r = new sobfu::device::Reductor(dims, voxel_sizes.x, trunc_dist);\n\n    /* sobolev filter */\n    h_S_i = new float[solver_params.s];\n    cudaSafeCall(cudaMalloc((void**) &d_S_i, solver_params.s * sizeof(float)));\n\n    decompose_sobolev_filter(solver_params, h_S_i);\n    cudaSafeCall(cudaMemcpy(d_S_i, h_S_i, solver_params.s * sizeof(float), cudaMemcpyHostToDevice));\n}\n\nsobfu::cuda::Solver::~Solver() = default;\n\nvoid sobfu::cuda::Solver::estimate_psi(const cv::Ptr<kfusion::cuda::TsdfVolume> phi_global,\n                                       cv::Ptr<kfusion::cuda::TsdfVolume> phi_global_psi_inv,\n                                       const cv::Ptr<kfusion::cuda::TsdfVolume> phi_n,\n                                       cv::Ptr<kfusion::cuda::TsdfVolume> phi_n_psi,\n                                       std::shared_ptr<sobfu::cuda::DeformationField> psi,\n                                       std::shared_ptr<sobfu::cuda::DeformationField> psi_inv) {\n    /* DEVICE CLASSES */\n    sobfu::device::DeformationField psi_device(psi->get_data().ptr<float4>(), dims);\n    sobfu::device::DeformationField psi_inv_device(psi_inv->get_data().ptr<float4>(), dims);\n\n    kfusion::device::TsdfVolume phi_global_device(phi_global->data().ptr<float2>(), dims, voxel_sizes, trunc_dist, eta,\n                                                  max_weight);\n    kfusion::device::TsdfVolume phi_global_psi_inv_device(phi_global_psi_inv->data().ptr<float2>(), dims, voxel_sizes,\n                                                          trunc_dist, eta, max_weight);\n\n    kfusion::device::TsdfVolume phi_n_device(phi_n->data().ptr<float2>(), dims, voxel_sizes, trunc_dist, eta,\n                                             max_weight);\n    kfusion::device::TsdfVolume phi_n_psi_device(phi_n_psi->data().ptr<float2>(), dims, voxel_sizes, trunc_dist, eta,\n      
                                           max_weight);\n\n    SDFs sdfs(phi_global_device, phi_global_psi_inv_device, phi_n_device, phi_n_psi_device);\n\n    sobfu::device::TsdfDifferentiator tsdf_diff(phi_n_psi_device);\n    sobfu::device::Differentiator diff(psi_device);\n    sobfu::device::Differentiator diff_inv(psi_inv_device);\n    sobfu::device::SecondOrderDifferentiator second_order_diff(psi_device);\n\n    Differentiators differentiators(tsdf_diff, diff, diff_inv, second_order_diff);\n\n    /* run the solver */\n    sobfu::device::estimate_psi(sdfs, psi_device, psi_inv_device, spatial_grads_device, differentiators, d_S_i, r,\n                                solver_params);\n}\n\n/*\n * SOBOLEV FILTER\n */\n\nstatic void sobfu::cuda::get_3d_sobolev_filter(SolverParams& params, float* h_S_i) {\n    int s3 = params.s * params.s * params.s;\n\n    /* init identity and laplacian matrices */\n    cv::Mat Id    = cv::Mat::eye(s3, s3, CV_32FC1);\n    cv::Mat L_mat = -6.f * cv::Mat::eye(s3, s3, CV_32FC1);\n\n    /* calculate laplacian matrix */\n    for (int i = 0; i <= static_cast<int>(pow(params.s, 3)) - 1; ++i) {\n        int idx_z = i / (params.s * params.s);\n        int idx_y = (i - idx_z * params.s * params.s) / params.s;\n        int idx_x = i - params.s * (idx_y + params.s * idx_z);\n\n        if (idx_x + 1 < params.s) {\n            int pos                 = (idx_x + 1) + idx_y * params.s + idx_z * params.s * params.s;\n            L_mat.at<float>(i, pos) = 1.f;\n        }\n        if (idx_x - 1 >= 0) {\n            int pos                 = (idx_x - 1) + idx_y * params.s + idx_z * params.s * params.s;\n            L_mat.at<float>(i, pos) = 1.f;\n        }\n\n        if (idx_y + 1 < params.s) {\n            int pos                 = idx_x + (idx_y + 1) * params.s + idx_z * params.s * params.s;\n            L_mat.at<float>(i, pos) = 1.f;\n        }\n        if (idx_y - 1 >= 0) {\n            int pos                 = idx_x + (idx_y - 1) * params.s + idx_z 
* params.s * params.s;\n            L_mat.at<float>(i, pos) = 1.f;\n        }\n\n        if (idx_z + 1 < params.s) {\n            int pos                 = idx_x + idx_y * params.s + (idx_z + 1) * params.s * params.s;\n            L_mat.at<float>(i, pos) = 1.f;\n        }\n        if (idx_z - 1 >= 0) {\n            int pos                 = idx_x + idx_y * params.s + (idx_z - 1) * params.s * params.s;\n            L_mat.at<float>(i, pos) = 1.f;\n        }\n    }\n\n    /* init one-hot vector v */\n    cv::Mat v                       = cv::Mat::zeros(s3, 1, CV_32FC1);\n    v.at<float>(floor(s3 / 2.f), 0) = 1.f;\n\n    /* init sobolev filter S */\n    cv::Mat S = cv::Mat::zeros(s3, 1, CV_32FC1);\n    /* solve for S */\n    cv::solve((Id - params.lambda * L_mat), v, S, cv::DECOMP_SVD);\n\n    std::cout << S << std::endl;\n}\n\nstatic void sobfu::cuda::decompose_sobolev_filter(SolverParams& params, float* h_S_i) {\n    if (params.s == 3) {\n        if (params.lambda == 0.1f) {\n            h_S_i[0] = 0.06537f;\n            h_S_i[1] = 0.99572f;\n            h_S_i[2] = h_S_i[0];\n        }\n    }\n\n    if (params.s == 7) {\n        if (params.lambda == 0.05f) {\n            h_S_i[0] = 0.00006f;\n            h_S_i[1] = 0.00015f;\n            h_S_i[2] = 0.03917f;\n            h_S_i[3] = 0.99846f;\n            h_S_i[4] = h_S_i[2];\n            h_S_i[5] = h_S_i[1];\n            h_S_i[6] = h_S_i[0];\n        }\n\n        if (params.lambda == 0.1f) {\n            h_S_i[0] = 0.00030f;\n            h_S_i[1] = 0.00441f;\n            h_S_i[2] = 0.06571f;\n            h_S_i[3] = 0.99565f;\n            h_S_i[4] = h_S_i[2];\n            h_S_i[5] = h_S_i[1];\n            h_S_i[6] = h_S_i[0];\n        }\n\n        if (params.lambda == 0.2f) {\n            h_S_i[0] = 0.00120f;\n            h_S_i[1] = 0.01094f;\n            h_S_i[2] = 0.10204f;\n            h_S_i[3] = 0.98941f;\n            h_S_i[4] = h_S_i[2];\n            h_S_i[5] = h_S_i[1];\n            h_S_i[6] = h_S_i[0];\n        
}\n\n        if (params.lambda == 0.4f) {\n            h_S_i[0] = 0.00169f;\n            h_S_i[1] = 0.01312f;\n            h_S_i[2] = 0.10927f;\n            h_S_i[3] = 0.98781f;\n            h_S_i[4] = h_S_i[2];\n            h_S_i[5] = h_S_i[1];\n            h_S_i[6] = h_S_i[0];\n        }\n    }\n\n    if (params.s == 9) {\n        if (params.lambda == 0.05f) {\n            h_S_i[0] = 0.000003f;\n            h_S_i[1] = 0.00006f;\n            h_S_i[2] = 0.00155f;\n            h_S_i[3] = 0.03917f;\n            h_S_i[4] = 0.99846f;\n            h_S_i[5] = 0.03917f;\n            h_S_i[6] = 0.00155f;\n            h_S_i[7] = 0.00006f;\n            h_S_i[8] = 0.000003f;\n        }\n\n        if (params.lambda == 0.1f) {\n            h_S_i[0] = 0.00002f;\n            h_S_i[1] = 0.00030f;\n            h_S_i[2] = 0.00441f;\n            h_S_i[3] = 0.06571f;\n            h_S_i[4] = 0.99565f;\n            h_S_i[5] = h_S_i[3];\n            h_S_i[6] = h_S_i[2];\n            h_S_i[7] = h_S_i[1];\n            h_S_i[8] = h_S_i[0];\n        }\n    }\n\n    if (params.s == 11) {\n        if (params.lambda == 0.1f) {\n            h_S_i[0]  = 0.0000015f;\n            h_S_i[1]  = 0.00002f;\n            h_S_i[2]  = 0.00030f;\n            h_S_i[3]  = 0.00441f;\n            h_S_i[4]  = 0.06571f;\n            h_S_i[5]  = 0.99565f;\n            h_S_i[6]  = h_S_i[4];\n            h_S_i[7]  = h_S_i[3];\n            h_S_i[8]  = h_S_i[2];\n            h_S_i[9]  = h_S_i[1];\n            h_S_i[10] = h_S_i[0];\n        }\n    }\n\n    /* normalise filter to unit sum */\n    float sum = 0.f;\n    for (int i = 0; i < params.s; ++i) {\n        sum += h_S_i[i];\n    }\n\n    for (int i = 0; i < params.s; ++i) {\n        h_S_i[i] /= sum;\n    }\n}\n"
  },
  {
    "path": "src/sobfu/vector_fields.cpp",
    "content": "/* sobfu includes */\n#include <sobfu/vector_fields.hpp>\n\n/*\n * VECTOR FIELD\n */\n\nsobfu::cuda::VectorField::VectorField(cv::Vec3i dims_) : dims(dims_) {\n    int no_voxels = dims[0] * dims[1] * dims[2];\n    data.create(no_voxels * sizeof(float4));\n    clear();\n}\n\nsobfu::cuda::VectorField::~VectorField() = default;\n\ncv::Vec3i sobfu::cuda::VectorField::get_dims() const { return dims; }\n\nkfusion::cuda::CudaData sobfu::cuda::VectorField::get_data() { return data; }\n\nconst kfusion::cuda::CudaData sobfu::cuda::VectorField::get_data() const { return data; }\n\nvoid sobfu::cuda::VectorField::set_data(kfusion::cuda::CudaData& data) { data = data; }\n\nvoid sobfu::cuda::VectorField::clear() {\n    int3 d = make_int3(dims[0], dims[1], dims[2]);\n\n    sobfu::device::VectorField field(data.ptr<float4>(), d);\n    sobfu::device::clear(field);\n}\n\nvoid sobfu::cuda::VectorField::print() {\n    int sizes[3] = {dims[0], dims[1], dims[2]};\n\n    cv::Mat* mat = new cv::Mat(3, sizes, CV_32FC4);\n    get_data().download(mat->ptr<float4>());\n\n    std::cout << \"--- FIELD ---\" << std::endl;\n    for (int i = 0; i < dims[0]; i++) {\n        for (int j = 0; j < dims[1]; j++) {\n            for (int k = 0; k < dims[2]; k++) {\n                float u = mat->at<float4>(k, j, i).x;\n                float v = mat->at<float4>(k, j, i).y;\n                float w = mat->at<float4>(k, j, i).z;\n\n                if (fabs(u) > 1e-5f || fabs(v) > 1e-5f || fabs(w) > 1e-5f) {\n                    std::cout << \"(x,y,z)=(\" << i << \", \" << j << \", \" << k << \"), (u,v,w)=(\" << u << \", \" << v << \",\"\n                              << w << \")\" << std::endl;\n                }\n            }\n        }\n    }\n\n    delete mat;\n}\n\nint sobfu::cuda::VectorField::get_no_nans() {\n    int sizes[3] = {dims[0], dims[1], dims[2]};\n\n    cv::Mat* mat = new cv::Mat(3, sizes, CV_32FC4);\n    get_data().download(mat->ptr<float4>());\n\n    int no_nan = 0;\n    for 
(int i = 0; i < dims[0]; i++) {\n        for (int j = 0; j < dims[1]; j++) {\n            for (int k = 0; k < dims[2]; k++) {\n                float u = mat->at<float4>(k, j, i).x;\n                float v = mat->at<float4>(k, j, i).y;\n                float w = mat->at<float4>(k, j, i).z;\n\n                if (std::isnan(u) || std::isnan(v) || std::isnan(w)) {\n                    no_nan++;\n                }\n            }\n        }\n    }\n\n    delete mat;\n    return no_nan;\n}\n\n/*\n * DEFORMATION FIELD\n */\n\nsobfu::cuda::DeformationField::DeformationField(cv::Vec3i dims_) : VectorField(dims_) { clear(); }\n\nsobfu::cuda::DeformationField::~DeformationField() = default;\n\nvoid sobfu::cuda::DeformationField::clear() {\n    cv::Vec3f dims = get_dims();\n    int3 d         = make_int3(dims[0], dims[1], dims[2]);\n\n    sobfu::device::DeformationField psi(get_data().ptr<float4>(), d);\n    sobfu::device::init_identity(psi);\n}\n\nvoid sobfu::cuda::DeformationField::get_inverse(sobfu::cuda::DeformationField& psi_inv) {\n    kfusion::device::Vec3i d = kfusion::device_cast<kfusion::device::Vec3i>(get_dims());\n\n    sobfu::device::DeformationField psi_device(get_data().ptr<float4>(), d);\n    sobfu::device::DeformationField psi_inverse_device(psi_inv.get_data().ptr<float4>(), d);\n\n    sobfu::device::estimate_inverse(psi_device, psi_inverse_device);\n}\n\nvoid sobfu::cuda::DeformationField::apply(const cv::Ptr<kfusion::cuda::TsdfVolume> phi_n,\n                                          cv::Ptr<kfusion::cuda::TsdfVolume> phi_n_psi) {\n    kfusion::device::Vec3i d   = kfusion::device_cast<kfusion::device::Vec3i>(phi_n->getDims());\n    kfusion::device::Vec3f vsz = kfusion::device_cast<kfusion::device::Vec3f>(phi_n->getVoxelSize());\n\n    float3 voxel_sizes = kfusion::device_cast<float3>(phi_n->getVoxelSize());\n    float trunc_dist   = phi_n->getTruncDist();\n    float eta          = phi_n->getEta();\n    float max_weight   = phi_n->getMaxWeight();\n\n    
kfusion::device::TsdfVolume phi_device(phi_n->data().ptr<float2>(), d, vsz, trunc_dist, eta, max_weight);\n    kfusion::device::TsdfVolume phi_warped_device(phi_n_psi->data().ptr<float2>(), d, vsz, trunc_dist, eta, max_weight);\n\n    sobfu::device::DeformationField psi_device(get_data().ptr<float4>(), d);\n\n    sobfu::device::apply(phi_device, phi_warped_device, psi_device);\n    kfusion::cuda::waitAllDefaultStream();\n}\n\n/*\n * JACOBIAN\n */\n\nsobfu::cuda::Jacobian::Jacobian(cv::Vec3i dims_) : dims(dims_) {\n    int no_voxels = dims[0] * dims[1] * dims[2];\n    data.create(no_voxels * sizeof(Mat4f));\n    clear();\n}\n\nsobfu::cuda::Jacobian::~Jacobian() = default;\n\nkfusion::cuda::CudaData sobfu::cuda::Jacobian::get_data() { return data; }\n\nconst kfusion::cuda::CudaData sobfu::cuda::Jacobian::get_data() const { return data; }\n\nvoid sobfu::cuda::Jacobian::clear() {\n    int3 d = make_int3(dims[0], dims[1], dims[2]);\n\n    sobfu::device::Jacobian J(data.ptr<Mat4f>(), d);\n    sobfu::device::clear(J);\n}\n\n/*\n * SPATIAL GRADIENTS\n */\n\nsobfu::cuda::SpatialGradients::SpatialGradients(cv::Vec3i dims_) {\n    nabla_phi_n       = new sobfu::cuda::TsdfGradient(dims_);\n    nabla_phi_n_o_psi = new sobfu::cuda::TsdfGradient(dims_);\n    J                 = new sobfu::cuda::Jacobian(dims_);\n    J_inv             = new sobfu::cuda::Jacobian(dims_);\n    L                 = new sobfu::cuda::Laplacian(dims_);\n    L_o_psi_inv       = new sobfu::cuda::Laplacian(dims_);\n    nabla_U           = new sobfu::cuda::PotentialGradient(dims_);\n    nabla_U_S         = new sobfu::cuda::PotentialGradient(dims_);\n}\n\nsobfu::cuda::SpatialGradients::~SpatialGradients() {\n    delete nabla_phi_n, nabla_phi_n_o_psi, J, J_inv, L, L_o_psi_inv, nabla_U, nabla_U_S;\n}\n"
  },
  {
    "path": "test/CMakeLists.txt",
    "content": "# ---------------------------------------------------------------------------- #\n# GOOGLE TEST DEPENDENCIES\n# ---------------------------------------------------------------------------- #\n\n# Implementation heavily based upon:\n# https://github.com/kaizouman/gtest-cmake-example\n\n# Enable ExternalProject CMake module\ninclude(ExternalProject)\n\n# Download and install GoogleTest\nExternalProject_Add(\n    gtest\n    URL https://github.com/google/googletest/archive/release-1.8.1.zip\n    PREFIX ${CMAKE_CURRENT_BINARY_DIR}/gtest\n    # Disable install step\n    INSTALL_COMMAND \"\"\n)\n\n# Get GTest source and binary directories from CMake project\nExternalProject_Get_Property(gtest source_dir binary_dir)\n\n# Create a libgtest target to be used as a dependency by test programs\nadd_library(libgtest IMPORTED STATIC GLOBAL)\nadd_dependencies(libgtest gtest)\n\n# Set libgtest properties\nset_target_properties(libgtest PROPERTIES\n    \"IMPORTED_LOCATION\" \"${binary_dir}/googlemock/gtest/libgtest.a\"\n    \"IMPORTED_LINK_INTERFACE_LIBRARIES\" \"${CMAKE_THREAD_LIBS_INIT}\"\n)\n\n# Create a libgmock target to be used as a dependency by test programs\nadd_library(libgmock IMPORTED STATIC GLOBAL)\nadd_dependencies(libgmock gtest)\n\n# Set libgmock properties\nset_target_properties(libgmock PROPERTIES\n    \"IMPORTED_LOCATION\" \"${binary_dir}/googlemock/libgmock.a\"\n    \"IMPORTED_LINK_INTERFACE_LIBRARIES\" \"${CMAKE_THREAD_LIBS_INIT}\"\n)\n\n\n# ---------------------------------------------------------------------------- #\n# TEST INCLUDE DIRECTORIES\n# ---------------------------------------------------------------------------- #\ninclude_directories(\"${source_dir}/googletest/include\"\n                    \"${source_dir}/googlemock/include\")\n\n# ---------------------------------------------------------------------------- #\n# CREATE TESTS\n# ---------------------------------------------------------------------------- #\n# Force test 
build\nset(gtest_build_samples ON)\n\n# Add quaternion Test\nCREATE_TEST(sobfu_test)\n\n# Link all libraries for testing\ntarget_link_libraries(sobfu_test\n  sobfu\n  ${CUDA_CUDART_LIBRARY}\n  ${OpenCV_LIBS}\n  ${PCL_LIBRARIES}\n)\n"
  },
  {
    "path": "test/deformation_field_test.cpp",
    "content": "/* gtest includes */\n#include <gtest/gtest.h>\n\n/* sobfu includes */\n#include <sobfu/params.hpp>\n#include <sobfu/solver.hpp>\n\n/* kinfu includes */\n#include <kfusion/internal.hpp>\n#include <kfusion/precomp.hpp>\n#include <kfusion/types.hpp>\n\n/* sys headers */\n#include <cmath>\n#include <ctgmath>\n#include <iostream>\n#include <memory>\n\nclass DeformationFieldTest : public ::testing::Test {\nprotected:\n    /* You can remove any or all of the following functions if its body\n     * is empty. */\n\n    /* You can do set-up work for each test here. */\n    DeformationFieldTest() = default;\n\n    /* You can do clean-up work that doesn't throw exceptions here. */\n    ~DeformationFieldTest() override = default;\n\n    /* If the constructor and destructor are not enough for setting up\n     * and cleaning up each test, you can define the following methods: */\n\n    /* Code here will be called immediately after the constructor (right\n     * before each test). */\n    void SetUp() override {\n        /* volume params */\n        params.volume_dims = cv::Vec3i::all(64);\n        params.volume_size = cv::Vec3f::all(0.25f);\n\n        params.tsdf_trunc_dist       = 10.f * params.volume_size[0] / static_cast<float>(params.volume_dims[0]);\n        params.gradient_delta_factor = 0.1f;\n\n        params.eta = 2.f * params.volume_size[0] / static_cast<float>(params.volume_dims[0]);\n\n        /* camera params */\n        params.intr = kfusion::Intr(1.f, 1.f, 0.f, 0.f);\n\n        dims        = kfusion::device_cast<kfusion::device::Vec3i>(params.volume_dims);\n        voxel_sizes = kfusion::device_cast<kfusion::device::Vec3f>(params.voxel_sizes());\n\n        no_voxels = dims.x * dims.y * dims.z;\n\n        /* init tsdf's */\n        phi_global         = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n        phi_global_psi_inv = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n        phi_n       
       = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n        phi_n_psi          = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n\n        /* init psi */\n        psi     = std::make_shared<sobfu::cuda::DeformationField>(params.volume_dims);\n        psi_inv = std::make_shared<sobfu::cuda::DeformationField>(params.volume_dims);\n    }\n\n    /* Code here will be called immediately after each test (right\n     * before the destructor). */\n    void TearDown() override {}\n\n    /* Objects declared here can be used by all tests in the test case for Solver. */\n\n    /* sobfu params */\n    Params params;\n\n    int3 dims;\n    float3 voxel_sizes;\n\n    int no_voxels;\n\n    /* global and live tsdf's */\n    cv::Ptr<kfusion::cuda::TsdfVolume> phi_global, phi_global_psi_inv, phi_n, phi_n_psi;\n    /* deformation field */\n    std::shared_ptr<sobfu::cuda::DeformationField> psi, psi_inv;\n    /* solver */\n    std::shared_ptr<sobfu::cuda::Solver> solver;\n\n    /* max. 
error */\n    float max_error = 1e-1f;\n    /* floating point precision */\n    const float epsilon = 1e-5f;\n};\n\n/* test that the vector field is correctly initialised to 0 */\nTEST_F(DeformationFieldTest, ClearTest) {\n    kfusion::cuda::CudaData data = psi->get_data();\n\n    int sizes[3]    = {dims.x, dims.y, dims.z};\n    cv::Mat *matrix = new cv::Mat(3, sizes, CV_32FC4);\n    data.download(matrix->ptr<float4>());\n\n    for (int i = 0; i < sizes[0]; i++) {\n        for (int j = 0; j < sizes[1]; j++) {\n            for (int k = 0; k < sizes[2]; k++) {\n                ASSERT_NEAR(matrix->at<float4>(k, j, i).x, (float) i, epsilon);\n                ASSERT_NEAR(matrix->at<float4>(k, j, i).y, (float) j, epsilon);\n                ASSERT_NEAR(matrix->at<float4>(k, j, i).z, (float) k, epsilon);\n            }\n        }\n    }\n}\n\n/* test that the norm of the gradient of an sdf is 1 */\nTEST_F(DeformationFieldTest, TsdfGradientTest) {\n    /* init tsdf */\n    float3 c = make_float3(0.16f, 0.16f, 0.16f);\n    float r  = 0.01f;\n    phi_global->initSphere(c, r);\n\n    kfusion::device::TsdfVolume phi_global_device(phi_global->data().ptr<float2>(), dims, voxel_sizes,\n                                                  params.tsdf_trunc_dist, params.eta, params.tsdf_max_weight);\n\n    /* init gradient */\n    kfusion::cuda::CudaData grad_data;\n    grad_data.create(no_voxels * sizeof(float4));\n    sobfu::device::TsdfGradient gradient_device(grad_data.ptr<float4>(), dims);\n\n    sobfu::device::TsdfDifferentiator diff(phi_global_device);\n    diff.calculate(gradient_device);\n\n    /* test that the gradient magnitude is 0 in the truncated regions and close to 1 elsewhere */\n    float2 *tsdf_ptr = new float2[no_voxels];\n    phi_global->data().download(tsdf_ptr);\n\n    float4 *grad_ptr = new float4[no_voxels];\n    grad_data.download(grad_ptr);\n\n    int sizes[3] = {dims.x, dims.y, dims.z};\n    for (int i = 1; i < sizes[0] - 1; i++) {\n        for (int j = 1; j 
< sizes[1] - 1; j++) {\n            for (int k = 1; k < sizes[2] - 1; k++) {\n                float tsdf_val  = (*(tsdf_ptr + i + j * sizes[0] + k * sizes[1] * sizes[0])).x;\n                float4 grad_val = *(grad_ptr + i + j * sizes[0] + k * sizes[1] * sizes[0]);\n\n                float norm = sqrtf(grad_val.x * grad_val.x + grad_val.y * grad_val.y + grad_val.z * grad_val.z);\n                if (fabs(tsdf_val) < 1.f) { /* only check for non-truncated voxels */\n                    ASSERT_NEAR(norm, voxel_sizes.x / params.tsdf_trunc_dist, 0.15f);\n                }\n            }\n        }\n    }\n}\n\n/* test that the jacobian of a uniform vector field is null */\nTEST_F(DeformationFieldTest, UniformFieldJacobianTest) {\n    /* init psi to a uniform field */\n    int sizes[3] = {dims.x, dims.y, dims.z};\n\n    cv::Mat *matrix = new cv::Mat(3, sizes, CV_32FC4);\n    for (int i = 0; i < sizes[0]; i++) {\n        for (int j = 0; j < sizes[1]; j++) {\n            for (int k = 0; k < sizes[2]; k++) {\n                matrix->at<float4>(k, j, i) = make_float4(1.f, 1.f, 1.f, 0.f);\n            }\n        }\n    }\n\n    kfusion::cuda::CudaData psi_data;\n    psi_data.create(no_voxels * sizeof(float4));\n    psi_data.upload(matrix->ptr<float4>(), no_voxels * sizeof(float4));\n    sobfu::device::DeformationField psi_device(psi_data.ptr<float4>(), dims);\n\n    /* init & clear the jacobian */\n    kfusion::cuda::CudaData jacobian_data;\n    jacobian_data.create(no_voxels * sizeof(Mat4f));\n    sobfu::device::Jacobian J(jacobian_data.ptr<Mat4f>(), dims);\n\n    /* calculate the jacobian */\n    sobfu::device::Differentiator diff(psi_device);\n    diff.calculate(J);\n\n    /* test that the jacobian is null */\n    Mat4f *ptr = new Mat4f[no_voxels];\n    jacobian_data.download(ptr);\n\n    for (int i = 0; i < sizes[0]; i++) {\n        for (int j = 0; j < sizes[1]; j++) {\n            for (int k = 0; k < sizes[2]; k++) {\n                Mat4f J_val = *(ptr + i + j * 
sizes[0] + k * sizes[1] * sizes[0]);\n\n                for (int r = 0; r < 3; r++) {\n                    ASSERT_NEAR(J_val.data[r].x, 0.f, epsilon);\n                    ASSERT_NEAR(J_val.data[r].y, 0.f, epsilon);\n                    ASSERT_NEAR(J_val.data[r].z, 0.f, epsilon);\n                }\n            }\n        }\n    }\n}\n\n/* test the calculation of the jacobian of a simple vector field */\nTEST_F(DeformationFieldTest, JacobianTestSimple) {\n    /* init psi */\n    int sizes[3] = {dims.x, dims.y, dims.z};\n\n    cv::Mat *matrix = new cv::Mat(3, sizes, CV_32FC4);\n    for (int i = 0; i < sizes[0]; i++) {\n        for (int j = 0; j < sizes[1]; j++) {\n            for (int k = 0; k < sizes[2]; k++) {\n                matrix->at<float4>(k, j, i) = make_float4(i, j, k, 0.f);\n            }\n        }\n    }\n\n    kfusion::cuda::CudaData psi_data;\n    psi_data.create(no_voxels * sizeof(float4));\n    psi_data.upload(matrix->ptr<float4>(), no_voxels * sizeof(float4));\n    sobfu::device::DeformationField psi_device(psi_data.ptr<float4>(), dims);\n\n    /* init the jacobian */\n    kfusion::cuda::CudaData jacobian_data;\n    jacobian_data.create(no_voxels * sizeof(Mat4f));\n    sobfu::device::Jacobian J(jacobian_data.ptr<Mat4f>(), dims);\n\n    /* calculate the jacobian */\n    sobfu::device::Differentiator diff(psi_device);\n    diff.calculate(J);\n\n    /* test that the jacobian values are correct */\n    Mat4f *ptr = new Mat4f[no_voxels];\n    jacobian_data.download(ptr);\n\n    for (int i = 1; i < sizes[0] - 1; i++) {\n        for (int j = 1; j < sizes[1] - 1; j++) {\n            for (int k = 1; k < sizes[2] - 1; k++) {\n                Mat4f J_val = *(ptr + i + j * sizes[0] + k * sizes[1] * sizes[0]);\n\n                ASSERT_NEAR(J_val.data[0].x, 1.f, epsilon);\n                ASSERT_NEAR(J_val.data[0].y, 0.f, epsilon);\n                ASSERT_NEAR(J_val.data[0].z, 0.f, epsilon);\n\n                ASSERT_NEAR(J_val.data[1].x, 0.f, epsilon);\n       
         ASSERT_NEAR(J_val.data[1].y, 1.f, epsilon);\n                ASSERT_NEAR(J_val.data[1].z, 0.f, epsilon);\n\n                ASSERT_NEAR(J_val.data[2].x, 0.f, epsilon);\n                ASSERT_NEAR(J_val.data[2].y, 0.f, epsilon);\n                ASSERT_NEAR(J_val.data[2].z, 1.f, epsilon);\n            }\n        }\n    }\n}\n\n/* test the computation of the jacobian and laplacian of a vector field */\nTEST_F(DeformationFieldTest, JacobianLaplacianTestComplicated) {\n    /* init psi */\n    int sizes[3] = {dims.x, dims.y, dims.z};\n\n    cv::Mat *matrix = new cv::Mat(3, sizes, CV_32FC4);\n    for (int i = 0; i < sizes[0]; i++) {\n        for (int j = 0; j < sizes[1]; j++) {\n            for (int k = 0; k < sizes[2]; k++) {\n                matrix->at<float4>(k, j, i) = make_float4(i * (1.f - j), exp(-k) + j, k, 0.f);\n            }\n        }\n    }\n\n    kfusion::cuda::CudaData psi_data;\n    psi_data.create(no_voxels * sizeof(float4));\n    psi_data.upload(matrix->ptr<float4>(), no_voxels * sizeof(float4));\n    sobfu::device::DeformationField psi_device(psi_data.ptr<float4>(), dims);\n\n    /* init the jacobian */\n    kfusion::cuda::CudaData J_data;\n    J_data.create(no_voxels * sizeof(Mat4f));\n    sobfu::device::Jacobian J(J_data.ptr<Mat4f>(), dims);\n\n    /* calculate the jacobian */\n    sobfu::device::Differentiator diff(psi_device);\n    diff.calculate(J);\n\n    /* test that the jacobian values are correct */\n    Mat4f *ptr = new Mat4f[no_voxels];\n    J_data.download(ptr);\n\n    /*\t       (1-y  -x      0    )\n     * J_psi = ( 0    1  -exp(-z) )\n     * \t       ( 0    0      1    )\n     */\n\n    for (int i = 1; i < sizes[0] - 1; i++) {\n        for (int j = 1; j < sizes[1] - 1; j++) {\n            for (int k = 1; k < sizes[2] - 1; k++) {\n                Mat4f J_val = *(ptr + i + j * sizes[0] + k * sizes[1] * sizes[0]);\n\n                ASSERT_NEAR(J_val.data[0].x, 1.f - j, max_error);\n                ASSERT_NEAR(J_val.data[0].y, -i, 
max_error);\n                ASSERT_NEAR(J_val.data[0].z, 0.f, max_error);\n\n                ASSERT_NEAR(J_val.data[1].x, 0, max_error);\n                ASSERT_NEAR(J_val.data[1].y, 1.f, max_error);\n                ASSERT_NEAR(J_val.data[1].z, -exp(-k), max_error);\n\n                ASSERT_NEAR(J_val.data[2].x, 0.f, max_error);\n                ASSERT_NEAR(J_val.data[2].y, 0.f, max_error);\n                ASSERT_NEAR(J_val.data[2].z, 1.f, max_error);\n            }\n        }\n    }\n\n    /* init the laplacian */\n    kfusion::cuda::CudaData laplacian_data;\n    laplacian_data.create(no_voxels * sizeof(float4));\n    sobfu::device::Laplacian L(laplacian_data.ptr<float4>(), dims);\n\n    /* calculate the laplacian */\n    sobfu::device::SecondOrderDifferentiator secondOrderDiff(psi_device);\n    secondOrderDiff.calculate(L);\n\n    /* test that the laplacian values are correct */\n    float4 *L_ptr = new float4[no_voxels];\n    laplacian_data.download(L_ptr);\n\n    /*\n     * expecting -(0  exp(-z)  0)--we calculate the negative laplacian for simplicity\n     */\n\n    for (int i = 1; i < sizes[0] - 1; i++) {\n        for (int j = 1; j < sizes[1] - 1; j++) {\n            for (int k = 1; k < sizes[2] - 1; k++) {\n                float4 L_val = *(L_ptr + i + j * sizes[0] + k * sizes[1] * sizes[0]);\n\n                ASSERT_NEAR(L_val.x, 0.f, max_error);\n                ASSERT_NEAR(L_val.y, -exp(-k), max_error);\n                ASSERT_NEAR(L_val.z, 0.f, max_error);\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "test/main.cpp",
    "content": "#include \"gtest/gtest.h\"\n\nint main(int argc, char **argv) {\n    ::testing::InitGoogleTest(&argc, argv);\n    return RUN_ALL_TESTS();\n}\n"
  },
  {
    "path": "test/reductions_test.cpp",
    "content": "/* gtest includes */\n#include <gtest/gtest.h>\n\n/* sobfu includes */\n#include <sobfu/params.hpp>\n#include <sobfu/solver.hpp>\n\n/* kinfu includes */\n#include <kfusion/internal.hpp>\n#include <kfusion/precomp.hpp>\n#include <kfusion/types.hpp>\n\n/* sys headers */\n#include <cmath>\n#include <ctgmath>\n#include <iostream>\n#include <memory>\n\nclass ReductionsTest : public ::testing::Test {\nprotected:\n    /* You can remove any or all of the following functions if its body\n     * is empty. */\n\n    /* You can do set-up work for each test here. */\n    ReductionsTest() = default;\n\n    /* You can do clean-up work that doesn't throw exceptions here. */\n    ~ReductionsTest() override = default;\n\n    /* If the constructor and destructor are not enough for setting up\n     * and cleaning up each test, you can define the following methods: */\n\n    /* Code here will be called immediately after the constructor (right\n     * before each test). */\n    void SetUp() override {\n        /* verbosity */\n        params.verbosity = 2;\n\n        /* volume params */\n        params.volume_dims = cv::Vec3i::all(64);\n        params.volume_size = cv::Vec3f::all(0.25f);\n\n        params.tsdf_trunc_dist = 5.f * params.volume_size[0] / static_cast<float>(params.volume_dims[0]);\n        params.eta             = 2.f * params.volume_size[0] / static_cast<float>(params.volume_dims[0]);\n\n        dims        = kfusion::device_cast<kfusion::device::Vec3i>(params.volume_dims);\n        voxel_sizes = kfusion::device_cast<kfusion::device::Vec3f>(params.voxel_sizes());\n\n        no_voxels = dims.x * dims.y * dims.z;\n\n        /* init tsdf's */\n        phi_global = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n        phi_n      = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n\n        /* init reduction class */\n        reductor = new sobfu::device::Reductor(dims, voxel_sizes.x, 
params.tsdf_trunc_dist);\n    }\n\n    /* Code here will be called immediately after each test (right\n     * before the destructor). */\n    void TearDown() override {}\n\n    /* Objects declared here can be used by all tests in the test case for Reductions. */\n\n    /* sobfu params */\n    Params params;\n\n    int3 dims;\n    float3 voxel_sizes;\n\n    int no_voxels;\n\n    /* global and live tsdf's */\n    cv::Ptr<kfusion::cuda::TsdfVolume> phi_global, phi_n;\n\n    /* reduction class */\n    sobfu::device::Reductor* reductor;\n\n    /* max. error */\n    float max_error = 1e-1f;\n    /* floating point precision */\n    const float epsilon = 1e-5f;\n};\n\n/* test calculation of the energy functional data term */\nTEST_F(ReductionsTest, DataTermTest) {\n    /* init tsdf's */\n    kfusion::device::TsdfVolume phi_n_device(phi_n->data().ptr<float2>(), dims, voxel_sizes, params.tsdf_trunc_dist,\n                                             params.eta, params.tsdf_max_weight);\n    kfusion::device::clear_volume(phi_n_device); /* will contain all 0's */\n\n    float3 c = make_float3(5.f, 5.f, 5.f);\n    float r  = 0.01f;\n    phi_global->initSphere(c, r); /* will contain all 1's */\n\n    kfusion::device::TsdfVolume phi_global_device(phi_global->data().ptr<float2>(), dims, voxel_sizes,\n                                                  params.tsdf_trunc_dist, params.eta, params.tsdf_max_weight);\n\n    float data_energy = reductor->data_energy(phi_global->data().ptr<float2>(), phi_n->data().ptr<float2>());\n    ASSERT_NEAR(data_energy, 0.5f * static_cast<float>(no_voxels) * 1.f, max_error);\n}\n"
  },
  {
    "path": "test/solver_test.cpp",
    "content": "/* gtest includes */\n#include <gtest/gtest.h>\n\n/* sobfu includes */\n#include <sobfu/params.hpp>\n#include <sobfu/solver.hpp>\n\n/* kinfu includes */\n#include <kfusion/internal.hpp>\n#include <kfusion/precomp.hpp>\n#include <kfusion/types.hpp>\n\n/* sys headers */\n#include <cmath>\n#include <ctgmath>\n#include <iostream>\n#include <memory>\n\nclass SolverTest : public ::testing::Test {\nprotected:\n    /* You can remove any or all of the following functions if its body\n     * is empty. */\n\n    /* You can do set-up work for each test here. */\n    SolverTest() = default;\n\n    /* You can do clean-up work that doesn't throw exceptions here. */\n    ~SolverTest() override = default;\n\n    /* If the constructor and destructor are not enough for setting up\n     * and cleaning up each test, you can define the following methods: */\n\n    /* Code here will be called immediately after the constructor (right\n     * before each test). */\n    void SetUp() override {\n        /* verbosity */\n        params.verbosity = 1;\n\n        /* volume params */\n        params.volume_dims = cv::Vec3i::all(64);\n        params.volume_size = cv::Vec3f::all(0.25f);\n\n        params.tsdf_trunc_dist       = 10.f * params.volume_size[0] / static_cast<float>(params.volume_dims[0]);\n        params.gradient_delta_factor = 0.1f;\n\n        params.eta = 2.f * params.volume_size[0] / static_cast<float>(params.volume_dims[0]);\n\n        /* camera params */\n        params.intr = kfusion::Intr(1.f, 1.f, 0.f, 0.f);\n\n        /* solver params */\n        params.max_iter        = 2048;\n        params.max_update_norm = -1.f;\n\n        params.s      = 7;\n        params.lambda = 0.1f;\n\n        params.alpha = 0.001f;\n        params.w_reg = 0.4f;\n\n        dims        = kfusion::device_cast<kfusion::device::Vec3i>(params.volume_dims);\n        voxel_sizes = kfusion::device_cast<kfusion::device::Vec3f>(params.voxel_sizes());\n\n        no_voxels = dims.x * dims.y * 
dims.z;\n\n        /* init tsdf's */\n        phi_global         = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n        phi_global_psi_inv = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n        phi_n              = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n        phi_n_psi          = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));\n\n        mc = cv::Ptr<kfusion::cuda::MarchingCubes>(new kfusion::cuda::MarchingCubes());\n\n        /* init psi */\n        psi     = std::make_shared<sobfu::cuda::DeformationField>(params.volume_dims);\n        psi_inv = std::make_shared<sobfu::cuda::DeformationField>(params.volume_dims);\n    }\n\n    /* Code here will be called immediately after each test (right\n     * before the destructor). */\n    void TearDown() override {}\n\n    /* Objects declared here can be used by all tests in the test case for Solver. */\n\n    /* sobfu params */\n    Params params;\n\n    int3 dims;\n    float3 voxel_sizes;\n\n    int no_voxels;\n\n    /* global and live tsdf's */\n    cv::Ptr<kfusion::cuda::TsdfVolume> phi_global, phi_global_psi_inv, phi_n, phi_n_psi;\n    /* marching cubes--for visualisation purposes */\n    cv::Ptr<kfusion::cuda::MarchingCubes> mc;\n    /* deformation field */\n    std::shared_ptr<sobfu::cuda::DeformationField> psi, psi_inv;\n    /* solver */\n    std::shared_ptr<sobfu::cuda::Solver> solver;\n\n    /* max. 
error */\n    float max_error = 1e-1f;\n    /* floating point precision */\n    const float epsilon = 1e-5f;\n};\n\n/* test the alignment of 2 spheres */\nTEST_F(SolverTest, AlignmentTestSphereTranslation) {\n    /* set parameters */\n    params.max_iter = 2048;\n\n    params.alpha = 0.01f;\n    params.w_reg = 0.4f;\n\n    /* init solver */\n    solver = std::make_shared<sobfu::cuda::Solver>(params);\n\n    /* init phi_global and phi_n */\n    int sizes[3] = {dims.x, dims.y, dims.z};\n\n    float3 c = make_float3(0.13f, 0.13f, 0.13f);\n    float r  = 0.012f;\n    phi_global->initSphere(c, r);\n\n    c = make_float3(0.125f, 0.13f, 0.13f);\n    phi_n->initSphere(c, r);\n    phi_n_psi->initSphere(c, r);\n\n    /* calculate psi and apply it to phi_n */\n    solver->estimate_psi(phi_global, phi_global_psi_inv, phi_n, phi_n_psi, psi, psi_inv);\n}\n\n/* test the alignment of a sphere translating and expanding */\nTEST_F(SolverTest, AlignmentTestSphereExpanding) {\n    /* set parameters */\n    params.max_iter = 2048;\n\n    params.alpha = 0.005f;\n    params.w_reg = 0.4f;\n\n    /* init solver */\n    solver = std::make_shared<sobfu::cuda::Solver>(params);\n\n    /* init phi_global and phi_n */\n    int sizes[3] = {dims.x, dims.y, dims.z};\n\n    float3 c = make_float3(0.13f, 0.13f, 0.13f);\n    float r  = 0.012f;\n    phi_global->initSphere(c, r);\n\n    c = make_float3(0.125f, 0.13f, 0.13f);\n    r = 0.0145f;\n    phi_n->initSphere(c, r);\n    phi_n_psi->initSphere(c, r);\n\n    /* calculate psi and apply it to phi_n */\n    solver->estimate_psi(phi_global, phi_global_psi_inv, phi_n, phi_n_psi, psi, psi_inv);\n}\n\n/* test the alignment of 2 spheres over 3 frames */\nTEST_F(SolverTest, SerialAlignmentTest) {\n    /* set parameters */\n    params.max_iter = 2048;\n\n    params.alpha = 0.005f;\n    params.w_reg = 0.4f;\n\n    /* init solver */\n    solver = std::make_shared<sobfu::cuda::Solver>(params);\n\n    /* init phi_global and phi_n */\n    int sizes[3] = {dims.x, 
dims.y, dims.z};\n\n    float3 c = make_float3(0.13f, 0.13f, 0.13f);\n    float r  = 0.02f;\n    phi_global->initSphere(c, r);\n\n    c = make_float3(0.125f, 0.13f, 0.132f);\n    phi_n->initSphere(c, r);\n    phi_n_psi->initSphere(c, r);\n\n    /*\n     * FRAME 1 to FRAME 0\n     *\n     */\n\n    std::cout << \"\\nFRAME 1\\n\" << std::endl;\n\n    /* calculate psi and apply it to phi_n */\n    solver->estimate_psi(phi_global, phi_global_psi_inv, phi_n, phi_n_psi, psi, psi_inv);\n\n    /*\n     * FRAME 2 to FRAME 0\n     *\n     */\n\n    std::cout << \"\\nFRAME 2\\n\" << std::endl;\n\n    phi_n->clear();\n    phi_n_psi->clear();\n\n    c = make_float3(0.123f, 0.13f, 0.132f);\n    phi_n->initSphere(c, r);\n    psi->apply(phi_n, phi_n_psi);\n\n    solver->estimate_psi(phi_global, phi_global_psi_inv, phi_n, phi_n_psi, psi, psi_inv);\n}\n"
  }
]