Repository: dgrzech/sobfu
Branch: master
Commit: c83646582a14
Files: 73
Total size: 409.5 KB

Directory structure:
gitextract_kh2b_hh_/

├── .clang-format
├── .gitignore
├── .gitlab-ci.yml
├── .lintflags
├── .lintignore
├── CMakeLists.txt
├── LICENSE.md
├── README.md
├── cmake/
│   ├── Targets.cmake
│   └── Utils.cmake
├── include/
│   ├── kfusion/
│   │   ├── cuda/
│   │   │   ├── device.hpp
│   │   │   ├── device_array.hpp
│   │   │   ├── device_memory.hpp
│   │   │   ├── imgproc.hpp
│   │   │   ├── kernel_containers.hpp
│   │   │   ├── marching_cubes.hpp
│   │   │   ├── projective_icp.hpp
│   │   │   ├── temp_utils.hpp
│   │   │   ├── texture_binder.hpp
│   │   │   └── tsdf_volume.hpp
│   │   ├── exports.hpp
│   │   ├── internal.hpp
│   │   ├── kinfu.hpp
│   │   ├── precomp.hpp
│   │   ├── safe_call.hpp
│   │   └── types.hpp
│   └── sobfu/
│       ├── cuda/
│       │   └── utils.hpp
│       ├── params.hpp
│       ├── precomp.hpp
│       ├── reductor.hpp
│       ├── scalar_fields.hpp
│       ├── sob_fusion.hpp
│       ├── solver.hpp
│       └── vector_fields.hpp
├── params/
│   ├── params_advent.ini
│   ├── params_boxing.ini
│   ├── params_hat.ini
│   ├── params_ours.ini
│   ├── params_snoopy.ini
│   └── params_umbrella.ini
├── setup.sh
├── src/
│   ├── CMakeLists.txt
│   ├── apps/
│   │   ├── CMakeLists.txt
│   │   └── demo.cpp
│   ├── kfusion/
│   │   ├── CMakeLists.txt
│   │   ├── core.cpp
│   │   ├── cuda/
│   │   │   ├── imgproc.cu
│   │   │   ├── marching_cubes.cu
│   │   │   ├── proj_icp.cu
│   │   │   └── tsdf_volume.cu
│   │   ├── device_memory.cpp
│   │   ├── imgproc.cpp
│   │   ├── kinfu.cpp
│   │   ├── marching_cubes.cpp
│   │   ├── precomp.cpp
│   │   ├── projective_icp.cpp
│   │   └── tsdf_volume.cpp
│   └── sobfu/
│       ├── CMakeLists.txt
│       ├── cuda/
│       │   ├── reductor.cu
│       │   ├── scalar_fields.cu
│       │   ├── solver.cu
│       │   └── vector_fields.cu
│       ├── precomp.cpp
│       ├── reductor.cpp
│       ├── scalar_fields.cpp
│       ├── sob_fusion.cpp
│       ├── solver.cpp
│       └── vector_fields.cpp
└── test/
    ├── CMakeLists.txt
    ├── deformation_field_test.cpp
    ├── main.cpp
    ├── reductions_test.cpp
    └── solver_test.cpp

================================================
FILE CONTENTS
================================================

================================================
FILE: .clang-format
================================================
---
# ---------------------------------------------------------------------------- #
#                                CLANG-FORMAT                                  #
#                                   v6.0.x                                     #
# ---------------------------------------------------------------------------- #
# For documentation: https://clang.llvm.org/docs/ClangFormatStyleOptions.html
#
Language:        Cpp
# BasedOnStyle:  Google
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands:   true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
  AfterClass:      false
  AfterControlStatement: false
  AfterEnum:       false
  AfterFunction:   false
  AfterNamespace:  false
  AfterObjCDeclaration: false
  AfterStruct:     false
  AfterUnion:      false
  BeforeCatch:     false
  BeforeElse:      false
  IndentBraces:    false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
ColumnLimit:     120
CommentPragmas:  '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: true
DisableFormat:   false
ExperimentalAutoDetectBinPacking: false
ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
IncludeCategories:
  - Regex:           '^<.*\.h>'
    Priority:        1
  - Regex:           '^<.*'
    Priority:        2
  - Regex:           '.*'
    Priority:        3
IndentCaseLabels: true
IndentWidth:     4
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd:   ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
ReflowComments:  true
SortIncludes:    true
SpaceAfterCStyleCast: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles:  false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard:        Cpp11
TabWidth:        8
UseTab:          Never
...


================================================
FILE: .gitignore
================================================
# Created by https://www.gitignore.io/api/vim,osx,c++
/data
### C++ ###
# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

### OSX ###
*.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### Vim ###
# swap
[._]*.s[a-v][a-z]
[._]*.sw[a-p]
[._]s[a-v][a-z]
[._]sw[a-p]
# session
Session.vim
# temporary
.netrwhist
*~
# auto-generated tag files
tags

# End of https://www.gitignore.io/api/vim,osx,c++

# Ignore build directory
build
files

# ignore remote sync config file
.remote-sync.json

# Ignore frames
frames

# ignore helper MATLAB script to display point clouds
show_pcl.m

# ignore output folder
out

================================================
FILE: .gitlab-ci.yml
================================================
# ---------------------------------------------------------------------------- #
#                               GITLAB-RUNNER-CI                               #
# ---------------------------------------------------------------------------- #

# Version: v9.5.4
# Running on graphic01.doc.ic.ac.uk

# ---------------------------------------------------------------------------- #
#                                  PIPELINE                                    #
# ---------------------------------------------------------------------------- #
# The CI pipeline is divided into the following stages:
# Lint: Performs linter checks on the repo's code
# Build: Builds the project
# Test: Runs the project's tests
# Deploy: Deploys the project to a given directory
# Cleanup: Removes all artifacts required for the CI stages
stages:
  - Setup
  - Lint
  - Build
  - Test
  - Deploy
  - Cleanup

# env_setup defines a number of environment variables which can be called by
# various before_scripts of the project.
# 1) Exporting the PATH is necessary to enable linking of the custom installed
#    binaries in the group's shared directory.
# 2) setup.sh makes sure all Cuda related binaries are setup correctly.
#    This includes, but isn't limited to, CUDA itself and OpenGL.
.env_setup: &env_setup |
  source setup.sh --set-paths

# Variables for use in script
variables:
  CONTAINER_IMAGE: g1736211/dynfu
  DEV_IMAGE: g1736211/dynfu:dev
  TAG_IMAGE: g1736211/dynfu:${CI_COMMIT_REF_SLUG}
# ---------------------------------------------------------------------------- #
#                                     SETUP                                    #
# ---------------------------------------------------------------------------- #
# The Setup stage sets all the nevironment variables and paths using cmake.

# > cmake <
# Cmake will generate a Makefile for the project linking all needed dependencies
# Cmake stage is required to not fail, in which case it would fail the pipeline.
cmake:
  stage: Setup

  # Before attempting to cmake env variables must be set to link the binaries
  # Call .setup.sh which creates build and setup the dependencies
  # Cmake will also build inside the build/ directory
  before_script:
    - rm -rf build || true
    - source setup.sh --set-paths

  # Build/ directory is cached for a given commit SHA to make sure all the tasks
  # within a same pipeline will have access to the correctly built project
  cache:
    key: "$CI_COMMIT_SHA"
    paths:
      - build/

  # Cmake requires two additional flags:
  # 1) CMAKE_PREFIX enables it to find the custom built binaries
  # 2) CMAKE_CXX_FLAGS are necessary to be set as otherwise BASH doesn't seem to
  #    set the c++ standard correctly (i.e. will throw a number of errors)
  # 3) [Others]
  script:
    - source setup.sh --cmake -i

# ---------------------------------------------------------------------------- #
#                                     LINT                                     #
# ---------------------------------------------------------------------------- #
# The Lint stage runs the linter on the project's files.

# > clang_format <
# Clang formatting will ensure that the latest files have been formatted with
# clang-styling. This step does not fix linter errors, it will just point them
# out to the pipeline, failing it.
clang_format:
  stage: Lint

  # Set env variables before testing should we need to access CUDA of OpenGL
  # headers.
  before_script:
    - *env_setup

  # Files are considered to be source only if *.cpp or *.hpp.
  # Excluded files or directories must be set manually in .lintignore
  script:
     # Print out the configuration file variables
     - /vol/project/2017/362/g1736211/bin/clang-format -dump-config
     # Attempt to check if clang-format had been run before saving
     - ./scripts/.format_bash

# > lint <
# The linter used for the project is clang-tidy.
# 1) Linting stage must run after build due to the numerous dependencies created
#    by Cmake. All these dependencies are logged into compile_commands.json and
#    are referenced by clang-tidy when linting (i.e. to scan headers).
# 2) The run-clang-tidy python script must be linked to the relevant clang-tidy
#    executable as it cannot be found in it's canonical place (i.e /usr/bin).
# 3) run-clang-tidy.py will accept all clang-tidy flags, it has extra value as
#    it can use the compile_commands.json information.
#
# USED FLAGS:
# 1) -list-checks : Lists all the enabled checks.
# 2) -checks= : Sets different checkstyles for the source files
# 3) -p <DIR> : Sets the path for the build directory containing the
#               compile_commands.json information.
# 4) -warnings-as-errors=* : Treats all warnings as errors.
#
# Used checks can be found in .lintflags
clang_tidy:
  stage: Lint

  # Set env variables before testing should we need to access CUDA of OpenGL
  # headers.
  before_script:
    - *env_setup

  # Linting must occur on the newly built build/ binaries.
  cache:
    key: "$CI_COMMIT_SHA"
    paths:
      - build/
    policy: pull

  # Files are considered to be source only if *.cpp or *.hpp.
  # Excluded files or directories must be set manually in .lintignore
  script:
     # Print out the list of enabled checks
     - /vol/project/2017/362/g1736211/bin/clang-tidy `./scripts/.flags_bash` -list-checks
     # Print out the list of files which will be checked for linting
     - ./scripts/.find_bash
     # Perform the linting with the above configuratiions
     - /vol/project/2017/362/g1736211/bin/clang-tidy -p build `./scripts/.diff_tidy_bash` `./scripts/.flags_bash` `./scripts/.find_bash`

# ---------------------------------------------------------------------------- #
#                                    BUILD                                     #
# ---------------------------------------------------------------------------- #
# The build project actually generates the binaries for the project.
# According to the flags set during the CMake stage, tests are built as well in
# their respective directories.

# > make <
# The make stage will run the Makefile generated by cmake in the build/ dir.
# The make stage is required not to fail.
make:
  stage: Build

  # Before attempting to make env variables must be set to make sure all
  # binaries referenced by the Makefile can be accessed.
  # Make will also build the executables inside the build/ directory
  before_script:
    - *env_setup

  # Build/ directory is cached for a given commit SHA to make sure all the tasks
  # within a same pipeline will have access to the correctly built project
  cache:
    key: "$CI_COMMIT_SHA"
    paths:
      - build/

  # Run simple make from the Makefile on four processes.
  script:
    - source setup.sh --make

# > docker_image_build <
# The docker_image_build stage creates the Docker image for an EC2 instance 
# on a docker-compatible runner
docker_image_build:
  stage: Build
  tags:
    - docker
  script:
    - docker build --build-arg CUDA_GENERATION=Maxwell -t $TAG_IMAGE .
  except:
    - master

# > docker_image_build_master <
# The docker_image_build_master stage creates the Docker image for all architectures
# on a docker-compatible runner
docker_image_build_master:
  stage: Build
  tags:
    - docker
  script:
    - docker build --build-arg CUDA_GENERATION=Auto -t $TAG_IMAGE .
  only:
    - master

# ---------------------------------------------------------------------------- #
#                                    TEST                                      #
# ---------------------------------------------------------------------------- #
# The Test stage will perform a number of tests on the Project.

# > gtest <
# The gtest stage runs the GTest suite present within the library.
# GTest binaries can be found in the group shared directory.
gtest:
  stage: Test

  # Set env variables before testing should we need to access CUDA of OpenGL
  # headers.
  before_script:
    - *env_setup

  # Testing must occur on the newly built build/ binaries.
  cache:
    key: "$CI_COMMIT_SHA"
    paths:
      - build/
    policy: pull

  # Run the gtests
  script:
    - ./build/bin/dynfu_test

# > coverage <
# The coverage stage runs gcov (and on top of that lcov) to generate a coverage
# report of the test for the source files.
coverage:
  stage: Test

  # Set env variables before testing should we need to access CUDA of OpenGL
  # headers.
  before_script:
    - *env_setup

  # Coverage report is constructed for the newly build binaries
  cache:
    key: "$CI_COMMIT_SHA"
    paths:
      - build/
    policy: pull

  # Generate the coverage report
  script:
    - source setup.sh --coverage

# ---------------------------------------------------------------------------- #
#                                   DEPLOY                                     #
# ---------------------------------------------------------------------------- #
# The Deploy stage deploys newly built and tested binaries to a given location
# on disk.

# > deploy <
# The deploy stage will deploy the newly built project binaries
# to /data/dynfu.
deploy:
  stage: Deploy

  # The latest build build/ must be retrieved from cache
  cache:
    key: "$CI_COMMIT_SHA"
    paths:
      - build/
    policy: pull

  # Clean the output distributed library directory to clone new data into it
  before_script:
    - rm -rf /data/dynfu
    - mkdir /data/dynfu

  # Copy the latest build BINARIES to /data/
  script:
    - cp -R build/bin/* /data/dynfu

# > docker_image__deploy_dev <
# Deploys to Docker Hub
docker_image_deploy_dev:
  stage: Deploy
  tags:
    - docker
  script:
    - docker login -u $DOCKER_HUB_LOGIN -p $DOCKER_HUB_PASSWORD
    - docker tag $TAG_IMAGE $DEV_IMAGE
    - docker push $DEV_IMAGE
  only:
    - dev

# > docker_image_deploy <
# Deploys to Docker Hub
docker_image_deploy:
  stage: Deploy
  tags:
    - docker
  script:
    - docker login -u $DOCKER_HUB_LOGIN -p $DOCKER_HUB_PASSWORD
    - docker tag $TAG_IMAGE $CONTAINER_IMAGE
    - docker push $CONTAINER_IMAGE
  only:
    - master

# > test_dynfu_on_ec2 <
# Trigger testing on ec2 instance
test_dynfu_on_ec2:
  stage: Deploy
  tags:
    - docker
  before_script:
    - pip install awscli
    - export PATH="${PATH}:~/.local/bin"
  script:
    - AWS_ACCESS_KEY_ID=${UPDATE_LAMBDA_KEY} AWS_SECRET_ACCESS_KEY=${UPDATE_LAMBDA_PASS} AWS_DEFAULT_REGION=eu-west-1 aws lambda invoke --function-name dynamic-fusion-trigger-testing /tmp/lambda-out
  only:
    - dev
# ---------------------------------------------------------------------------- #
#                                   CLEANUP                                    #
# ---------------------------------------------------------------------------- #
# The Cleanup stage makes sure the cache storage is cleaned for a given
# pipeline in the runners directories.

# > cache_cleanup <
# It runs regardless of failure or success, and thus has the aim of cleaning
# up the testing environment. In particular this script must remove unused
# cache artifacts that were stored within one pipeline.
cache_cleanup:
  stage: Cleanup
  script:
    - echo "Present Cache:" && ls /home/gitlab-runner/cache/g1736211/dynfu/
    - rm -rf "/home/gitlab-runner/cache/g1736211/dynfu/$CI_COMMIT_SHA"
  when: always


================================================
FILE: .lintflags
================================================
# ---------------------------------------------------------------------------- #
#                                   FILE FORMAT                                #
# ---------------------------------------------------------------------------- #
# Available checks can be found at http://clang.llvm.org/extra/clang-tidy/     #
# To add a new check add it normally                                           #
# To PREVENT the linter from using a check, prefix with '-'                    #
# Extra links: https://clang.llvm.org/extra/clang-tidy/checks/list.html        #
# Clang-tidy docs: https://clang.llvm.org/docs/ClangFormat.html                #
# ---------------------------------------------------------------------------- #

# Enforce google standard
google-*

# Use High Integrity C++ Coding Standard
hicpp-*

# Use LLVM Coding Standard
llvm-*

# Use all miscellaneous flags
misc-*

# Apply readability constraints
readability-*

# Apply bugprone checks
bugprone-*

# ---------------------------------------------------------------------------- #
# The following are check DISABLES for certain behaviours

# Allow un-use of use auto
-hicpp-use-auto

# Allow string comparison .compare and not only via == or !=
-misc-string-compare

# Allow warnings for signed bitwise operators
-hicpp-signed-bitwise

# Allow printfs calls in C style
-hicpp-vararg

# Allow implicit bool casts and conversions
-readability-implicit-bool-cast
-readability-implicit-bool-conversion

# Prevent conflicting standards
-llvm-include-order

# Deprecated checks
-hicpp-special-member-functions
-hicpp-use-equals-delete
-hicpp-no-array-decay
-readability-delete-null-pointer
-hicpp-member-init


================================================
FILE: .lintignore
================================================
# Ignore the test directory when linting
./tests/*

# Ignore all git files when linting
./.git/*

# Ignore all scripts
./scripts/*

# Ignore the cmake directory
./cmake/*

# Ignore all contetnts of build directory
./build/*


================================================
FILE: CMakeLists.txt
================================================
# ---------------------------------------------------------------------------- #
# CMAKE INIT CONFIGURATIONS
# ---------------------------------------------------------------------------- #
cmake_minimum_required(VERSION 2.8.8)

# ---------------------------------------------------------------------------- #
# SET THE PROJECT
# ---------------------------------------------------------------------------- #
project(kfusion C CXX)

# ---------------------------------------------------------------------------- #
# UTILITY FUNCTIONS (MACROS) AND PATHS
# ---------------------------------------------------------------------------- #
LIST(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/")
include(Utils)
include(Targets)

# ---------------------------------------------------------------------------- #
# CMAKE SETTINGS
# ---------------------------------------------------------------------------- #
# Use both Debug and Release versions
SET(CMAKE_CONFIGURATION_TYPES "Debug;Release")

# Set CXX to compile with C++11 standard
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -MP -MD -pthread -fpermissive")

# ---------------------------------------------------------------------------- #
# DEPENDENCIES
# ---------------------------------------------------------------------------- #
find_package(Boost 1.58.0 REQUIRED COMPONENTS filesystem program_options)
find_package(CUDA REQUIRED)
find_package(OpenCV REQUIRED COMPONENTS core viz highgui calib3d)
find_package(PCL 1.8 REQUIRED)
find_package(VTK REQUIRED)

# ---------------------------------------------------------------------------- #
# CUDA SETTINGS
# ---------------------------------------------------------------------------- #
SET(HAVE_CUDA 1)
LIST(APPEND CUDA_NVCC_FLAGS
   "-gencode;arch=compute_61,code=sm_61;
    --ftz=true;
    --prec-div=false;
    --prec-sqrt=false;
    --expt-relaxed-constexpr"
)

if(UNIX OR APPLE)
  LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler;-fPIC")
endif()

# ---------------------------------------------------------------------------- #
# SET INCLUDE PATHS
# ---------------------------------------------------------------------------- #
include_directories(include
  ${Boost_INCLUDE_DIRS}
  ${CUDA_INCLUDE_DIRS}
  ${OpenCV_INCLUDE_DIRS}
  ${PCL_INCLUDE_DIRS}
  ${VTK_INCLUDE_DIRS}
)

# ---------------------------------------------------------------------------- #
# BUILD SRC DIRECTORY
# ---------------------------------------------------------------------------- #
add_subdirectory(src)

# ---------------------------------------------------------------------------- #
# BUILD TESTS
# ---------------------------------------------------------------------------- #
if(BUILD_TESTS)
  enable_testing()
  add_subdirectory(test)
endif()


================================================
FILE: LICENSE.md
================================================
Copyright (c) 2012, Anatoly Baksheev
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the {organization} nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


================================================
FILE: README.md
================================================
sobfu
============
software for 3d reconstruction of non-rigidly deforming scenes using depth data, based on sobolevfusion

dependencies
-----------
* boost
* nvidia gpu with cuda >8.0
* opencv
* pcl 1.8.1
* vtk

installation
------------
to install, run `source setup.sh -a`

usage
------------
`./build/bin/app /path/to/data /path/to/params <--enable-viz> <--enable-log> <--verbose>`

* the data directory must include folders `color` and `depth`
* the folder `params` has files with recommended parameters for some of the scenes from volumedeform and killingfusion datasets

### options
* with `--enable-viz`, screenshots from the pcl viewer will be logged in .png format to `/path/to/data/screenshots`
* with `--enable-log`, meshes computed from the model tsdf's via marching cubes will be logged in .vtk format to `/path/to/data/meshes`
* `--verbose` and `--vverbose` control verbosity of the solver

example reconstructions
------------
reconstructions of some of the scenes from the volumedeform and killingfusion projects can be viewed on our [youtube playlist](http://bit.ly/sobfu-yt)

there is a large trade-off between frame rate and reconstruction quality--sample reconstructions ran at approx. 2fps

issues
------------

* drift in longer scenes due to less than perfect registration
* topological changes

references 
------------
```
@InProceedings{Slavcheva_2018_CVPR,
author = {Slavcheva, Miroslava and Baust, Maximilian and Ilic, Slobodan},
title = {SobolevFusion: 3D Reconstruction of Scenes Undergoing Free Non-Rigid Motion},
booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {June},
year = {2018}
}
```


================================================
FILE: cmake/Targets.cmake
================================================
################################################################################################
# short command to setup source group
function(kf_source_group group)
  cmake_parse_arguments(VW_SOURCE_GROUP "" "" "GLOB" ${ARGN})
  file(GLOB srcs ${VW_SOURCE_GROUP_GLOB})
  #list(LENGTH ${srcs} ___size)
  #if (___size GREATER 0)
    source_group(${group} FILES ${srcs})
  #endif()
endfunction()


################################################################################################
# short command for declaring includes from other modules
macro(declare_deps_includes)
  foreach(__arg ${ARGN})
    get_filename_component(___path "${CMAKE_SOURCE_DIR}/modules/${__arg}/include" ABSOLUTE)
    if (EXISTS ${___path})
      include_directories(${___path})
    endif()
  endforeach()

  unset(___path)
  unset(__arg)
endmacro()


################################################################################################
# short command for setting defeault target properties
function(default_properties target)
  set_target_properties(${target} PROPERTIES
    DEBUG_POSTFIX "d"
    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")

    if (NOT ${target} MATCHES "^test_")
      install(TARGETS ${the_target} RUNTIME DESTINATION ".")
    endif()
endfunction()

function(test_props target)
  #os_project_label(${target} "[test]")
  if(USE_PROJECT_FOLDERS)
    set_target_properties(${target} PROPERTIES FOLDER "Tests")
  endif()
endfunction()

function(app_props target)
  #os_project_label(${target} "[app]")
  if(USE_PROJECT_FOLDERS)
    set_target_properties(${target} PROPERTIES FOLDER "Apps")
  endif()
endfunction()


################################################################################################
# short command for setting defeault target properties
function(default_properties target)
  set_target_properties(${target} PROPERTIES
    DEBUG_POSTFIX "d"
    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")

    if (NOT ${target} MATCHES "^test_")
      install(TARGETS ${the_target} RUNTIME DESTINATION ".")
    endif()
endfunction()


################################################################################################
# short command for adding library module
macro(add_module_library name)
  set(module_name ${name})

  FILE(GLOB_RECURSE sources *.cpp *.cu)

  cuda_add_library(${module_name} STATIC ${sources})

  if(MSVC)
    set_target_properties(${module_name} PROPERTIES DEFINE_SYMBOL KFUSION_API_EXPORTS)
  else()
    add_definitions(-DKFUSION_API_EXPORTS)
  endif()

  default_properties(${module_name})
endmacro()


################################################################################################
# short command for adding application module
macro(add_application target)
  FILE(GLOB_RECURSE sources *.cpp)
  add_executable(${target} ${sources})
  default_properties(${target})
endmacro()


################################################################################################
# short command for adding test target
macro(CREATE_TEST target)
  FILE(GLOB SRCS *.cpp)
  ADD_EXECUTABLE(${target} ${SRCS})
  TARGET_LINK_LIBRARIES(${target} libgtest libgmock)
  add_test(NAME ${target} COMMAND ${target})
  default_properties(${target})
endmacro()


================================================
FILE: cmake/Utils.cmake
================================================
################################################################################################
# short alias for CMake logging
function(dmsg)
  message(STATUS "<<${ARGN}")
endfunction()

################################################################################################
# Command checking if current module has cuda souces to compile
macro(check_cuda var)
  file(GLOB cuda src/cuda/*.cu)
  list(LENGTH cuda ___size)  
  if (HAVE_CUDA AND ___size GREATER 0)  
    set(${var} ON)
  else()  
    set(${var} OFF) 
  endif()
  unset(___size)
  unset(cuda)
endmacro()

################################################################################################
# short command for adding library module
macro(warnings_disable)
  if(NOT ENABLE_NOISY_WARNINGS)
    set(_flag_vars "")
    set(_msvc_warnings "")
    set(_gxx_warnings "")
    foreach(arg ${ARGN})
      if(arg MATCHES "^CMAKE_")
        list(APPEND _flag_vars ${arg})
      elseif(arg MATCHES "^/wd")
        list(APPEND _msvc_warnings ${arg})
      elseif(arg MATCHES "^-W")
        list(APPEND _gxx_warnings ${arg})
      endif()
    endforeach()
    if(MSVC AND _msvc_warnings AND _flag_vars)
      foreach(var ${_flag_vars})
        foreach(warning ${_msvc_warnings})
          set(${var} "${${var}} ${warning}")
        endforeach()
      endforeach()
    elseif(CV_COMPILER_IS_GNU AND _gxx_warnings AND _flag_vars)
      foreach(var ${_flag_vars})
        foreach(warning ${_gxx_warnings})
          if(NOT warning MATCHES "^-Wno-")
            string(REPLACE "${warning}" "" ${var} "${${var}}")
            string(REPLACE "-W" "-Wno-" warning "${warning}")
          endif()
          ocv_check_flag_support(${var} "${warning}" _varname)
          if(${_varname})
            set(${var} "${${var}} ${warning}")
          endif()
        endforeach()
      endforeach()
    endif()
    unset(_flag_vars)
    unset(_msvc_warnings)
    unset(_gxx_warnings)
  endif(NOT ENABLE_NOISY_WARNINGS)
endmacro()

################################################################################################
# Command for asstion options with some preconditions
macro(kf_option variable description value)
  set(__value ${value})
  set(__condition "")
  set(__varname "__value")
  foreach(arg ${ARGN})
    if(arg STREQUAL "IF" OR arg STREQUAL "if")
      set(__varname "__condition")
    else()
      list(APPEND ${__varname} ${arg})
    endif()
  endforeach()
  unset(__varname)
  if("${__condition}" STREQUAL "")
    set(__condition 2 GREATER 1)
  endif()

  if(${__condition})
    if("${__value}" MATCHES ";")
      if(${__value})
        option(${variable} "${description}" ON)
      else()
        option(${variable} "${description}" OFF)
      endif()
    elseif(DEFINED ${__value})
      if(${__value})
        option(${variable} "${description}" ON)
      else()
        option(${variable} "${description}" OFF)
      endif()
    else()
      option(${variable} "${description}" ${__value})
    endif()
  else()
    unset(${variable} CACHE)
  endif()
  unset(__condition)
  unset(__value)
endmacro()


================================================
FILE: include/kfusion/cuda/device.hpp
================================================
#pragma once

#include <cuda_fp16.h>
#include <kfusion/cuda/temp_utils.hpp>
#include <kfusion/precomp.hpp>

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TsdfVolume

// __kf_device__ kfusion::device::TsdfVolume::TsdfVolume(float2 *_data, int3 _dims, float3 _voxel_size,
//                                                       float _trunc_dist, float _max_weight)
//     : data(_data), dims(_dims), voxel_size(_voxel_size), trunc_dist(_trunc_dist), max_weight(_max_weight) {}
//
// __kf_device__ kfusion::device::TsdfVolume::TsdfVolume(const TsdfVolume &other)
//     : data(other.data),
//       dims(other.dims),
//       voxel_size(other.voxel_size),
//       trunc_dist(other.trunc_dist),
//       max_weight(other.max_weight) {}

__kf_device__ float2 *kfusion::device::TsdfVolume::operator()(int x, int y, int z) {
    return data + x + y * dims.x + z * dims.y * dims.x;
}

__kf_device__ const float2 *kfusion::device::TsdfVolume::operator()(int x, int y, int z) const {
    return data + x + y * dims.x + z * dims.y * dims.x;
}

__kf_device__ float2 *kfusion::device::TsdfVolume::beg(int x, int y) const { return data + x + dims.x * y; }

__kf_device__ float2 *kfusion::device::TsdfVolume::zstep(float2 *const ptr) const { return ptr + dims.x * dims.y; }

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Projector

__kf_device__ float2 kfusion::device::Projector::operator()(const float3 &p) const {
    float2 coo;
    coo.x = __fmaf_rn(f.x, __fdividef(p.x, p.z), c.x);
    coo.y = __fmaf_rn(f.y, __fdividef(p.y, p.z), c.y);
    return coo;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Reprojector

__kf_device__ float3 kfusion::device::Reprojector::operator()(int u, int v, float z) const {
    float x = z * (u - c.x) * finv.x;
    float y = z * (v - c.y) * finv.y;
    return make_float3(x, y, z);
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Utility

namespace kfusion {
namespace device {
__kf_device__ Vec3f operator*(const Mat3f &m, const Vec3f &v) {
    return make_float3(dot(m.data[0], v), dot(m.data[1], v), dot(m.data[2], v));
}

__kf_device__ Vec3f operator*(const Aff3f &a, const Vec3f &v) { return a.R * v + a.t; }

/* operator for multiplication of an affine transform by a float */
__kf_device__ Aff3f operator*(const float &s, const Aff3f &a) {
    Aff3f b;
    b.R = a.R;
    b.t = s * a.t;
    return b;
}

/* 3-d matrix multiplication */
__kf_device__ Mat3f operator*(const Mat3f &m, const Mat3f &n) {
    Mat3f r;
    r.data[0] = m.data[0] * make_float3(n.data[0].x, n.data[1].x, n.data[2].x);
    r.data[1] = m.data[1] * make_float3(n.data[0].y, n.data[1].y, n.data[2].y);
    r.data[2] = m.data[2] * make_float3(n.data[0].z, n.data[1].z, n.data[2].z);
    return r;
}

/* affine transform composition */
__kf_device__ Aff3f operator*(const Aff3f &a, const Aff3f &b) {
    Aff3f c;
    c.R = a.R * b.R;
    c.t = a.t + b.t;
    return c;
}

__kf_device__ Vec3f tr(const float4 &v) { return make_float3(v.x, v.y, v.z); }

struct plus {
    __kf_device__ float operator()(float l, float r) const { return l + r; }
    __kf_device__ double operator()(double l, double r) const { return l + r; }
};
}  // namespace device
}  // namespace kfusion


================================================
FILE: include/kfusion/cuda/device_array.hpp
================================================
#pragma once

#include <kfusion/cuda/device_memory.hpp>
#include <kfusion/exports.hpp>

#include <vector>

namespace kfusion {
namespace cuda {
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/** \brief @b DeviceArray class
 *
 * \note Typed container for GPU memory with reference counting.
 *
 * \author Anatoly Baksheev
 */
template <class T>
class KF_EXPORTS DeviceArray : public DeviceMemory {
public:
    /** \brief Element type. */
    typedef T type;

    /** \brief Element size. */
    enum { elem_size = sizeof(T) };

    /** \brief Empty constructor. */
    DeviceArray();

    /** \brief Allocates internal buffer in GPU memory
     * \param size_t: number of elements to allocate
     * */
    DeviceArray(size_t size);

    /** \brief Initializes with user allocated buffer. Reference counting is
     * disabled in this case. \param ptr: pointer to buffer \param size: elemens
     * number
     * */
    DeviceArray(T *ptr, size_t size);

    /** \brief Copy constructor. Just increments reference counter. */
    DeviceArray(const DeviceArray &other);

    /** \brief Assigment operator. Just increments reference counter. */
    DeviceArray &operator=(const DeviceArray &other);

    /** \brief Allocates internal buffer in GPU memory. If internal buffer was
     * created before the function recreates it with new size. If new and old
     * sizes are equal it does nothing. \param size: elemens number
     * */
    void create(size_t size);

    /** \brief Decrements reference counter and releases internal buffer if
     * needed. */
    void release();

    /** \brief Performs data copying. If destination size differs it will be
     * reallocated. \param other_arg: destination container
     * */
    void copyTo(DeviceArray &other) const;

    /** \brief Uploads data to internal buffer in GPU memory. It calls create()
     * inside to ensure that intenal buffer size is enough. \param host_ptr_arg:
     * pointer to buffer to upload \param size: elemens number
     * */
    void upload(const T *host_ptr, size_t size);

    /** \brief Downloads data from internal buffer to CPU memory
     * \param host_ptr_arg: pointer to buffer to download
     * */
    void download(T *host_ptr) const;

    /** \brief Uploads data to internal buffer in GPU memory. It calls create()
     * inside to ensure that intenal buffer size is enough. \param data: host
     * vector to upload from
     * */
    template <class A>
    void upload(const std::vector<T, A> &data);

    /** \brief Downloads data from internal buffer to CPU memory
     * \param data:  host vector to download to
     * */
    template <typename A>
    void download(std::vector<T, A> &data) const;

    /** \brief Performs swap of data pointed with another device array.
     * \param other: device array to swap with
     * */
    void swap(DeviceArray &other_arg);

    /** \brief Returns pointer for internal buffer in GPU memory. */
    T *ptr();

    /** \brief Returns const pointer for internal buffer in GPU memory. */
    const T *ptr() const;

    // using DeviceMemory::ptr;

    /** \brief Returns pointer for internal buffer in GPU memory. */
    operator T *();

    /** \brief Returns const pointer for internal buffer in GPU memory. */
    operator const T *() const;

    /** \brief Returns size in elements. */
    size_t size() const;
};

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/** \brief @b DeviceArray2D class
 *
 * \note Typed container for pitched GPU memory with reference counting.
 *
 * \author Anatoly Baksheev
 */
template <class T>
class KF_EXPORTS DeviceArray2D : public DeviceMemory2D {
public:
    /** \brief Element type. */
    typedef T type;

    /** \brief Element size. */
    enum { elem_size = sizeof(T) };

    /** \brief Empty constructor. */
    DeviceArray2D();

    /** \brief Allocates internal buffer in GPU memory
     * \param rows: number of rows to allocate
     * \param cols: number of elements in each row
     * */
    DeviceArray2D(int rows, int cols);

    /** \brief Initializes with user allocated buffer. Reference counting is
     * disabled in this case. \param rows: number of rows \param cols: number of
     * elements in each row \param data: pointer to buffer \param stepBytes:
     * stride between two consecutive rows in bytes
     * */
    DeviceArray2D(int rows, int cols, void *data, size_t stepBytes);

    /** \brief Copy constructor. Just increments reference counter. */
    DeviceArray2D(const DeviceArray2D &other);

    /** \brief Assigment operator. Just increments reference counter. */
    DeviceArray2D &operator=(const DeviceArray2D &other);

    /** \brief Allocates internal buffer in GPU memory. If internal buffer was
     * created before the function recreates it with new size. If new and old
     * sizes are equal it does nothing. \param rows: number of rows to allocate
     * \param cols: number of elements in each row
     * */
    void create(int rows, int cols);

    /** \brief Decrements reference counter and releases internal buffer if
     * needed. */
    void release();

    /** \brief Performs data copying. If destination size differs it will be
     * reallocated. \param other: destination container
     * */
    void copyTo(DeviceArray2D &other) const;

    /** \brief Uploads data to internal buffer in GPU memory. It calls create()
     * inside to ensure that intenal buffer size is enough. \param host_ptr:
     * pointer to host buffer to upload \param host_step: stride between two
     * consecutive rows in bytes for host buffer \param rows: number of rows to
     * upload \param cols: number of elements in each row
     * */
    void upload(const void *host_ptr, size_t host_step, int rows, int cols);

    /** \brief Downloads data from internal buffer to CPU memory. User is
     * resposible for correct host buffer size. \param host_ptr: pointer to host
     * buffer to download \param host_step: stride between two consecutive rows in
     * bytes for host buffer
     * */
    void download(void *host_ptr, size_t host_step) const;

    /** \brief Performs swap of data pointed with another device array.
     * \param other: device array to swap with
     * */
    void swap(DeviceArray2D &other_arg);

    /** \brief Uploads data to internal buffer in GPU memory. It calls create()
     * inside to ensure that intenal buffer size is enough. \param data: host
     * vector to upload from \param cols: stride in elements between two
     * consecutive rows for host buffer
     * */
    template <class A>
    void upload(const std::vector<T, A> &data, int cols);

    /** \brief Downloads data from internal buffer to CPU memory
     * \param data: host vector to download to
     * \param cols: Output stride in elements between two consecutive rows for
     * host vector.
     * */
    template <class A>
    void download(std::vector<T, A> &data, int &cols) const;

    /** \brief Returns pointer to given row in internal buffer.
     * \param y_arg: row index
     * */
    T *ptr(int y = 0);

    /** \brief Returns const pointer to given row in internal buffer.
     * \param y_arg: row index
     * */
    const T *ptr(int y = 0) const;

    // using DeviceMemory2D::ptr;

    /** \brief Returns pointer for internal buffer in GPU memory. */
    operator T *();

    /** \brief Returns const pointer for internal buffer in GPU memory. */
    operator const T *() const;

    /** \brief Returns number of elements in each row. */
    int cols() const;

    /** \brief Returns number of rows. */
    int rows() const;

    /** \brief Returns step in elements. */
    size_t elem_step() const;
};
}  // namespace cuda

namespace device {
using kfusion::cuda::DeviceArray;
using kfusion::cuda::DeviceArray2D;
}  // namespace device
}  // namespace kfusion

/////////////////////  Inline implementations of DeviceArray
///////////////////////////////////////////////

template <class T>
inline kfusion::cuda::DeviceArray<T>::DeviceArray() {}
template <class T>
inline kfusion::cuda::DeviceArray<T>::DeviceArray(size_t size) : DeviceMemory(size * elem_size) {}
template <class T>
inline kfusion::cuda::DeviceArray<T>::DeviceArray(T *ptr, size_t size) : DeviceMemory(ptr, size * elem_size) {}
template <class T>
inline kfusion::cuda::DeviceArray<T>::DeviceArray(const DeviceArray &other) : DeviceMemory(other) {}
template <class T>
inline kfusion::cuda::DeviceArray<T> &kfusion::cuda::DeviceArray<T>::operator=(const DeviceArray &other) {
    DeviceMemory::operator=(other);
    return *this;
}

template <class T>
inline void kfusion::cuda::DeviceArray<T>::create(size_t size) {
    DeviceMemory::create(size * elem_size);
}
template <class T>
inline void kfusion::cuda::DeviceArray<T>::release() {
    DeviceMemory::release();
}

template <class T>
inline void kfusion::cuda::DeviceArray<T>::copyTo(DeviceArray &other) const {
    DeviceMemory::copyTo(other);
}
template <class T>
inline void kfusion::cuda::DeviceArray<T>::upload(const T *host_ptr, size_t size) {
    DeviceMemory::upload(host_ptr, size * elem_size);
}
template <class T>
inline void kfusion::cuda::DeviceArray<T>::download(T *host_ptr) const {
    DeviceMemory::download(host_ptr);
}

template <class T>
void kfusion::cuda::DeviceArray<T>::swap(DeviceArray &other_arg) {
    DeviceMemory::swap(other_arg);
}

template <class T>
inline kfusion::cuda::DeviceArray<T>::operator T *() {
    return ptr();
}
template <class T>
inline kfusion::cuda::DeviceArray<T>::operator const T *() const {
    return ptr();
}
template <class T>
inline size_t kfusion::cuda::DeviceArray<T>::size() const {
    return sizeBytes() / elem_size;
}

template <class T>
inline T *kfusion::cuda::DeviceArray<T>::ptr() {
    return DeviceMemory::ptr<T>();
}
template <class T>
inline const T *kfusion::cuda::DeviceArray<T>::ptr() const {
    return DeviceMemory::ptr<T>();
}

template <class T>
template <class A>
inline void kfusion::cuda::DeviceArray<T>::upload(const std::vector<T, A> &data) {
    upload(&data[0], data.size());
}
template <class T>
template <class A>
inline void kfusion::cuda::DeviceArray<T>::download(std::vector<T, A> &data) const {
    data.resize(size());
    if (!data.empty())
        download(&data[0]);
}

/////////////////////  Inline implementations of DeviceArray2D
///////////////////////////////////////////////

template <class T>
inline kfusion::cuda::DeviceArray2D<T>::DeviceArray2D() {}
template <class T>
inline kfusion::cuda::DeviceArray2D<T>::DeviceArray2D(int rows, int cols) : DeviceMemory2D(rows, cols * elem_size) {}
template <class T>
inline kfusion::cuda::DeviceArray2D<T>::DeviceArray2D(int rows, int cols, void *data, size_t stepBytes)
    : DeviceMemory2D(rows, cols * elem_size, data, stepBytes) {}
template <class T>
inline kfusion::cuda::DeviceArray2D<T>::DeviceArray2D(const DeviceArray2D &other) : DeviceMemory2D(other) {}
template <class T>
inline kfusion::cuda::DeviceArray2D<T> &kfusion::cuda::DeviceArray2D<T>::operator=(const DeviceArray2D &other) {
    DeviceMemory2D::operator=(other);
    return *this;
}

template <class T>
inline void kfusion::cuda::DeviceArray2D<T>::create(int rows, int cols) {
    DeviceMemory2D::create(rows, cols * elem_size);
}
template <class T>
inline void kfusion::cuda::DeviceArray2D<T>::release() {
    DeviceMemory2D::release();
}

template <class T>
inline void kfusion::cuda::DeviceArray2D<T>::copyTo(DeviceArray2D &other) const {
    DeviceMemory2D::copyTo(other);
}
template <class T>
inline void kfusion::cuda::DeviceArray2D<T>::upload(const void *host_ptr, size_t host_step, int rows, int cols) {
    DeviceMemory2D::upload(host_ptr, host_step, rows, cols * elem_size);
}
template <class T>
inline void kfusion::cuda::DeviceArray2D<T>::download(void *host_ptr, size_t host_step) const {
    DeviceMemory2D::download(host_ptr, host_step);
}

template <class T>
template <class A>
inline void kfusion::cuda::DeviceArray2D<T>::upload(const std::vector<T, A> &data, int cols) {
    upload(&data[0], cols * elem_size, data.size() / cols, cols);
}

template <class T>
template <class A>
inline void kfusion::cuda::DeviceArray2D<T>::download(std::vector<T, A> &data, int &elem_step) const {
    elem_step = cols();
    data.resize(cols() * rows());
    if (!data.empty())
        download(&data[0], colsBytes());
}

template <class T>
void kfusion::cuda::DeviceArray2D<T>::swap(DeviceArray2D &other_arg) {
    DeviceMemory2D::swap(other_arg);
}

template <class T>
inline T *kfusion::cuda::DeviceArray2D<T>::ptr(int y) {
    return DeviceMemory2D::ptr<T>(y);
}
template <class T>
inline const T *kfusion::cuda::DeviceArray2D<T>::ptr(int y) const {
    return DeviceMemory2D::ptr<T>(y);
}

template <class T>
inline kfusion::cuda::DeviceArray2D<T>::operator T *() {
    return ptr();
}
template <class T>
inline kfusion::cuda::DeviceArray2D<T>::operator const T *() const {
    return ptr();
}

template <class T>
inline int kfusion::cuda::DeviceArray2D<T>::cols() const {
    return DeviceMemory2D::colsBytes() / elem_size;
}
template <class T>
inline int kfusion::cuda::DeviceArray2D<T>::rows() const {
    return DeviceMemory2D::rows();
}

template <class T>
inline size_t kfusion::cuda::DeviceArray2D<T>::elem_step() const {
    return DeviceMemory2D::step() / elem_size;
}


================================================
FILE: include/kfusion/cuda/device_memory.hpp
================================================
#pragma once

#include <kfusion/cuda/kernel_containers.hpp>
#include <kfusion/exports.hpp>

namespace kfusion {
namespace cuda {
/** \brief Error handler. All GPU functions from this subsystem call the
 * function to report an error. For internal use only */
KF_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func = "");

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/** \brief @b DeviceMemory class
 *
 * \note This is a BLOB container class with reference counting for GPU memory.
 *
 * \author Anatoly Baksheev
 */

class KF_EXPORTS DeviceMemory {
public:
    /** \brief Empty constructor. */
    DeviceMemory();

    /** \brief Destructor. */
    ~DeviceMemory();

    /** \brief Allocates internal buffer in GPU memory
     * \param sizeBytes_arg: amount of memory to allocate
     * */
    DeviceMemory(size_t sizeBytes_arg);

    /** \brief Initializes with user allocated buffer. Reference counting is
     * disabled in this case. \param ptr_arg: pointer to buffer \param
     * sizeBytes_arg: buffer size
     * */
    DeviceMemory(void *ptr_arg, size_t sizeBytes_arg);

    /** \brief Copy constructor. Just increments reference counter. */
    DeviceMemory(const DeviceMemory &other_arg);

    /** \brief Assigment operator. Just increments reference counter. */
    DeviceMemory &operator=(const DeviceMemory &other_arg);

    /** \brief Allocates internal buffer in GPU memory. If internal buffer was
     * created before the function recreates it with new size. If new and old
     * sizes are equal it does nothing. \param sizeBytes_arg: buffer size
     * */
    void create(size_t sizeBytes_arg);

    /** \brief Decrements reference counter and releases internal buffer if
     * needed. */
    void release();

    /** \brief Performs data copying. If destination size differs it will be
     * reallocated. \param other_arg: destination container
     * */
    void copyTo(DeviceMemory &other) const;

    /** \brief Uploads data to internal buffer in GPU memory. It calls create()
     * inside to ensure that intenal buffer size is enough. \param host_ptr_arg:
     * pointer to buffer to upload \param sizeBytes_arg: buffer size
     * */
    void upload(const void *host_ptr_arg, size_t sizeBytes_arg);

    /** \brief Downloads data from internal buffer to CPU memory
     * \param host_ptr_arg: pointer to buffer to download
     * */
    void download(void *host_ptr_arg) const;

    /** \brief Performs swap of data pointed with another device memory.
     * \param other: device memory to swap with
     * */
    void swap(DeviceMemory &other_arg);

    /** \brief Returns pointer for internal buffer in GPU memory. */
    template <class T>
    T *ptr();

    /** \brief Returns constant pointer for internal buffer in GPU memory. */
    template <class T>
    const T *ptr() const;

    /** \brief Conversion to PtrSz for passing to kernel functions. */
    template <class U>
    operator PtrSz<U>() const;

    /** \brief Returns true if unallocated otherwise false. */
    bool empty() const;

    size_t sizeBytes() const;

private:
    /** \brief Device pointer. */
    void *data_;

    /** \brief Allocated size in bytes. */
    size_t sizeBytes_;

    /** \brief Pointer to reference counter in CPU memory. */
    int *refcount_;
};

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/** \brief @b DeviceMemory2D class
 *
 * \note This is a BLOB container class with reference counting for pitched GPU
 * memory.
 *
 * \author Anatoly Baksheev
 */

class KF_EXPORTS DeviceMemory2D {
public:
    /** \brief Empty constructor. */
    DeviceMemory2D();

    /** \brief Destructor. */
    ~DeviceMemory2D();

    /** \brief Allocates internal buffer in GPU memory
     * \param rows_arg: number of rows to allocate
     * \param colsBytes_arg: width of the buffer in bytes
     * */
    DeviceMemory2D(int rows_arg, int colsBytes_arg);

    /** \brief Initializes with user allocated buffer. Reference counting is
     * disabled in this case. \param rows_arg: number of rows \param
     * colsBytes_arg: width of the buffer in bytes \param data_arg: pointer to
     * buffer \param stepBytes_arg: stride between two consecutive rows in bytes
     * */
    DeviceMemory2D(int rows_arg, int colsBytes_arg, void *data_arg, size_t step_arg);

    /** \brief Copy constructor. Just increments reference counter. */
    DeviceMemory2D(const DeviceMemory2D &other_arg);

    /** \brief Assigment operator. Just increments reference counter. */
    DeviceMemory2D &operator=(const DeviceMemory2D &other_arg);

    /** \brief Allocates internal buffer in GPU memory. If internal buffer was
     * created before the function recreates it with new size. If new and old
     * sizes are equal it does nothing. \param ptr_arg: number of rows to allocate
     * \param sizeBytes_arg: width of the buffer in bytes
     * */
    void create(int rows_arg, int colsBytes_arg);

    /** \brief Decrements reference counter and releases internal buffer if
     * needed. */
    void release();

    /** \brief Performs data copying. If destination size differs it will be
     * reallocated. \param other_arg: destination container
     * */
    void copyTo(DeviceMemory2D &other) const;

    /** \brief Uploads data to internal buffer in GPU memory. It calls create()
     * inside to ensure that intenal buffer size is enough. \param host_ptr_arg:
     * pointer to host buffer to upload \param host_step_arg: stride between two
     * consecutive rows in bytes for host buffer \param rows_arg: number of rows
     * to upload \param sizeBytes_arg: width of host buffer in bytes
     * */
    void upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg, int colsBytes_arg);

    /** \brief Downloads data from internal buffer to CPU memory. User is
     * resposible for correct host buffer size. \param host_ptr_arg: pointer to
     * host buffer to download \param host_step_arg: stride between two
     * consecutive rows in bytes for host buffer
     * */
    void download(void *host_ptr_arg, size_t host_step_arg) const;

    /** \brief Performs swap of data pointed with another device memory.
     * \param other: device memory to swap with
     * */
    void swap(DeviceMemory2D &other_arg);

    /** \brief Returns pointer to given row in internal buffer.
     * \param y_arg: row index
     * */
    template <class T>
    T *ptr(int y_arg = 0);

    /** \brief Returns constant pointer to given row in internal buffer.
     * \param y_arg: row index
     * */
    template <class T>
    const T *ptr(int y_arg = 0) const;

    /** \brief Conversion to PtrStep for passing to kernel functions. */
    template <class U>
    operator PtrStep<U>() const;

    /** \brief Conversion to PtrStepSz for passing to kernel functions. */
    template <class U>
    operator PtrStepSz<U>() const;

    /** \brief Returns true if unallocated otherwise false. */
    bool empty() const;

    /** \brief Returns number of bytes in each row. */
    int colsBytes() const;

    /** \brief Returns number of rows. */
    int rows() const;

    /** \brief Returns stride between two consecutive rows in bytes for internal
     * buffer. Step is stored always and everywhere in bytes!!! */
    size_t step() const;

private:
    /** \brief Device pointer. */
    void *data_;

    /** \brief Stride between two consecutive rows in bytes for internal buffer.
     * Step is stored always and everywhere in bytes!!! */
    size_t step_;

    /** \brief Width of the buffer in bytes. */
    int colsBytes_;

    /** \brief Number of rows. */
    int rows_;

    /** \brief Pointer to reference counter in CPU memory. */
    int *refcount_;
};
}  // namespace cuda

namespace device {
using kfusion::cuda::DeviceMemory;
using kfusion::cuda::DeviceMemory2D;
}  // namespace device
}  // namespace kfusion

/////////////////////  Inline implementations of DeviceMemory
///////////////////////////////////////////////

template <class T>
inline T *kfusion::cuda::DeviceMemory::ptr() {
    return (T *) data_;
}
template <class T>
inline const T *kfusion::cuda::DeviceMemory::ptr() const {
    return (const T *) data_;
}

template <class U>
inline kfusion::cuda::DeviceMemory::operator kfusion::cuda::PtrSz<U>() const {
    PtrSz<U> result;
    result.data = (U *) ptr<U>();
    result.size = sizeBytes_ / sizeof(U);
    return result;
}

/////////////////////  Inline implementations of DeviceMemory2D
///////////////////////////////////////////////

template <class T>
T *kfusion::cuda::DeviceMemory2D::ptr(int y_arg) {
    return (T *) ((char *) data_ + y_arg * step_);
}
template <class T>
const T *kfusion::cuda::DeviceMemory2D::ptr(int y_arg) const {
    return (const T *) ((const char *) data_ + y_arg * step_);
}

template <class U>
kfusion::cuda::DeviceMemory2D::operator kfusion::cuda::PtrStep<U>() const {
    PtrStep<U> result;
    result.data = (U *) ptr<U>();
    result.step = step_;
    return result;
}

template <class U>
kfusion::cuda::DeviceMemory2D::operator kfusion::cuda::PtrStepSz<U>() const {
    PtrStepSz<U> result;
    result.data = (U *) ptr<U>();
    result.step = step_;
    result.cols = colsBytes_ / sizeof(U);
    result.rows = rows_;
    return result;
}


================================================
FILE: include/kfusion/cuda/imgproc.hpp
================================================
#pragma once

/* kinfu includes */
#include <kfusion/types.hpp>

/* pcl includes */
#include <pcl/PolygonMesh.h>

namespace kfusion {
namespace cuda {
KF_EXPORTS void depthBilateralFilter(const Depth &in, Depth &out, int ksz, float sigma_spatial, float sigma_depth);

KF_EXPORTS void depthTruncation(Depth &depth, float threshold);

KF_EXPORTS void depthBuildPyramid(const Depth &depth, Depth &pyramid, float sigma_depth);

KF_EXPORTS void computeNormalsAndMaskDepth(const Intr &intr, Depth &depth, Normals &normals);

KF_EXPORTS void computePointNormals(const Intr &intr, const Depth &depth, Cloud &points, Normals &normals);

KF_EXPORTS void computeDists(const Depth &depth, Dists &dists, const Intr &intr);

KF_EXPORTS void resizeDepthNormals(const Depth &depth, const Normals &normals, Depth &depth_out, Normals &normals_out);

KF_EXPORTS void resizePointsNormals(const Cloud &points, const Normals &normals, Cloud &points_out,
                                    Normals &normals_out);

KF_EXPORTS void waitAllDefaultStream();

KF_EXPORTS void renderTangentColors(const Normals &normals, Image &image);

/** \brief rasterises the canonical surface
 * \param[in] intr camera intrinsics
 * \param[in] cam2vol camera -> volume coordinate transform
 * \param[in] surface surface to rasterise
 * \param[out] points_out frame with the currently visible geometry
 * \param[out] normals_out frame with the normals of the currently visible geometry
 */
KF_EXPORTS void rasteriseSurface(const Intr &intr, const Affine3f &vol2cam, const Surface &s, Cloud &vertices_out,
                                 Normals &normals_out);

KF_EXPORTS void renderImage(const Depth &depth, const Normals &normals, const Intr &intr, const Vec3f &light_pose,
                            Image &image);

KF_EXPORTS void renderImage(const Cloud &points, const Normals &normals, const Intr &intr, const Vec3f &light_pose,
                            Image &image);
}  // namespace cuda
}  // namespace kfusion


================================================
FILE: include/kfusion/cuda/kernel_containers.hpp
================================================
#pragma once

#if defined(__CUDACC__)
#define __kf_hdevice__ __host__ __device__ __forceinline__
#define __kf_device__ __device__ __forceinline__
#else
#define __kf_hdevice__
#define __kf_device__
#endif

#include <cstddef>

namespace kfusion {
namespace cuda {
template <typename T>
struct DevPtr {
    typedef T elem_type;
    const static size_t elem_size = sizeof(elem_type);

    T *data;

    __kf_hdevice__ DevPtr() : data(0) {}
    __kf_hdevice__ DevPtr(T *data_arg) : data(data_arg) {}

    __kf_hdevice__ size_t elemSize() const { return elem_size; }
    __kf_hdevice__ operator T *() { return data; }
    __kf_hdevice__ operator const T *() const { return data; }
};

template <typename T>
struct PtrSz : public DevPtr<T> {
    __kf_hdevice__ PtrSz() : size(0) {}
    __kf_hdevice__ PtrSz(T *data_arg, size_t size_arg) : DevPtr<T>(data_arg), size(size_arg) {}

    size_t size;
};

template <typename T>
struct PtrStep : public DevPtr<T> {
    __kf_hdevice__ PtrStep() : step(0) {}
    __kf_hdevice__ PtrStep(T *data_arg, size_t step_arg) : DevPtr<T>(data_arg), step(step_arg) {}

    /** \brief stride between two consecutive rows in bytes. Step is stored always
     * and everywhere in bytes!!! */
    size_t step;

    __kf_hdevice__ T *ptr(int y = 0) { return (T *) ((char *) DevPtr<T>::data + y * step); }
    __kf_hdevice__ const T *ptr(int y = 0) const { return (const T *) ((const char *) DevPtr<T>::data + y * step); }

    __kf_hdevice__ T &operator()(int y, int x) { return ptr(y)[x]; }
    __kf_hdevice__ const T &operator()(int y, int x) const { return ptr(y)[x]; }
};

template <typename T>
struct PtrStepSz : public PtrStep<T> {
    __kf_hdevice__ PtrStepSz() : cols(0), rows(0) {}
    __kf_hdevice__ PtrStepSz(int rows_arg, int cols_arg, T *data_arg, size_t step_arg)
        : PtrStep<T>(data_arg, step_arg), cols(cols_arg), rows(rows_arg) {}

    int cols;
    int rows;
};
}  // namespace cuda

namespace device {
using kfusion::cuda::PtrStep;
using kfusion::cuda::PtrStepSz;
using kfusion::cuda::PtrSz;
}  // namespace device
}  // namespace kfusion

namespace kf = kfusion;


================================================
FILE: include/kfusion/cuda/marching_cubes.hpp
================================================
#pragma once

/* eigen includes */
#include <Eigen/Core>

/* kinfu includes */
#include <kfusion/cuda/device_array.hpp>
#include <kfusion/cuda/tsdf_volume.hpp>

/* pcl includes */
#include <pcl/point_types.h>

namespace kfusion {
namespace cuda {

/** \brief MarchingCubes implements MarchingCubes functionality for TSDF volume on GPU
 * \author Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com)
 */
class KF_EXPORTS MarchingCubes {
public:
    /** \brief Default size for triangles buffer */
    enum { POINTS_PER_TRIANGLE = 3, DEFAULT_TRIANGLES_BUFFER_SIZE = 2 * 1000 * 1000 * POINTS_PER_TRIANGLE };

    /** \brief Smart pointer. */
    typedef boost::shared_ptr<MarchingCubes> Ptr;

    /** \brief Default constructor */
    MarchingCubes();
    /** \brief Destructor */
    ~MarchingCubes();

    /* set volume pose */
    void setPose(const cv::Affine3f& pose);

    /* run mc to extract the zero level set */
    Surface run(const TsdfVolume& volume, DeviceArray<pcl::PointXYZ>& vertices_buffer,
                DeviceArray<pcl::Normal>& normals_buffer);

private:
    /** \brief Edge table for marching cubes  */
    DeviceArray<int> edgeTable_;

    /** \brief Number of vertextes table for marching cubes  */
    DeviceArray<int> numVertsTable_;

    /** \brief Triangles table for marching cubes  */
    DeviceArray<int> triTable_;

    /** \brief Temporary buffer used by marching cubes (first row stores occuped voxes id, second number of vetexes,
     * third poits offsets */
    DeviceArray2D<int> occupied_voxels_buffer_;

    /* volume pose */
    cv::Affine3f pose;
};
}  // namespace cuda
}  // namespace kfusion


================================================
FILE: include/kfusion/cuda/projective_icp.hpp
================================================
#pragma once

#include <kfusion/types.hpp>

namespace kfusion {
namespace cuda {
class ProjectiveICP {
public:
    enum { MAX_PYRAMID_LEVELS = 4 };

    typedef std::vector<Depth> DepthPyr;
    typedef std::vector<Cloud> PointsPyr;
    typedef std::vector<Normals> NormalsPyr;

    ProjectiveICP();
    virtual ~ProjectiveICP();

    float getDistThreshold() const;
    void setDistThreshold(float distance);

    float getAngleThreshold() const;
    void setAngleThreshold(float angle);

    void setIterationsNum(const std::vector<int> &iters);
    int getUsedLevelsNum() const;

    virtual bool estimateTransform(Affine3f &affine, const Intr &intr, const Frame &curr, const Frame &prev);

    /** The function takes masked depth, i.e. it assumes for performance reasons
     * that "if depth(y,x) is not zero, then normals(y,x) surely is not qnan" */
    virtual bool estimateTransform(Affine3f &affine, const Intr &intr, const DepthPyr &dcurr, const NormalsPyr ncurr,
                                   const DepthPyr dprev, const NormalsPyr nprev);
    virtual bool estimateTransform(Affine3f &affine, const Intr &intr, const PointsPyr &vcurr, const NormalsPyr ncurr,
                                   const PointsPyr vprev, const NormalsPyr nprev);

    // static Vec3f rodrigues2(const Mat3f& matrix);
private:
    std::vector<int> iters_;
    float angle_thres_;
    float dist_thres_;
    DeviceArray2D<float> buffer_;

    struct StreamHelper;
    cv::Ptr<StreamHelper> shelp_;
};
}  // namespace cuda
}  // namespace kfusion


================================================
FILE: include/kfusion/cuda/temp_utils.hpp
================================================
#pragma once

#include <cuda.h>
#include <kfusion/cuda/kernel_containers.hpp>

#define FULL_MASK 0xffffffff

namespace kfusion {
namespace device {
template <class T>
__kf_hdevice__ void swap(T &a, T &b) {
    T c(a);
    a = b;
    b = c;
}

template <typename T>
struct numeric_limits;

template <>
struct numeric_limits<float> {
    __kf_device__ static float quiet_NaN() { return __int_as_float(0x7fffffff); /*CUDART_NAN_F*/ };
    __kf_device__ static float epsilon() { return 1.192092896e-07f /*FLT_EPSILON*/; };
    __kf_device__ static float min() { return 1.175494351e-38f /*FLT_MIN*/; };
    __kf_device__ static float max() { return 3.402823466e+38f /*FLT_MAX*/; };
};

template <>
struct numeric_limits<unsigned short> {
    __kf_device__ static unsigned short max() { return USHRT_MAX; };
};

__kf_device__ float dot(const float3 &v1, const float3 &v2) {
    return __fmaf_rn(v1.x, v2.x, __fmaf_rn(v1.y, v2.y, v1.z * v2.z));
}

__kf_device__ float3 &operator+=(float3 &vec, const float &v) {
    vec.x += v;
    vec.y += v;
    vec.z += v;
    return vec;
}

__kf_device__ float3 &operator+=(float3 &v1, const float3 &v2) {
    v1.x += v2.x;
    v1.y += v2.y;
    v1.z += v2.z;
    return v1;
}

__kf_device__ float3 operator+(const float3 &v1, const float3 &v2) {
    return make_float3(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z);
}

__kf_device__ float3 operator*(const float3 &v1, const float3 &v2) {
    return make_float3(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z);
}

__kf_hdevice__ float3 operator*(const float3 &v1, const int3 &v2) {
    return make_float3(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z);
}

__kf_device__ float3 operator/(const float3 &v1, const float3 &v2) {
    return make_float3(v1.x / v2.x, v1.y / v2.y, v1.z / v2.z);
}

__kf_hdevice__ float3 operator/(const float &v, const float3 &vec) {
    return make_float3(v / vec.x, v / vec.y, v / vec.z);
}

__kf_device__ float3 &operator*=(float3 &vec, const float &v) {
    vec.x *= v;
    vec.y *= v;
    vec.z *= v;
    return vec;
}

__kf_hdevice__ float3 operator-(const float3 &v1, const float3 &v2) {
    return make_float3(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z);
}

__kf_hdevice__ float3 operator*(const float3 &v1, const float &v) { return make_float3(v1.x * v, v1.y * v, v1.z * v); }

__kf_hdevice__ float3 operator*(const float &v, const float3 &v1) { return make_float3(v1.x * v, v1.y * v, v1.z * v); }

__kf_device__ float norm(const float3 &v) { return sqrt(dot(v, v)); }

__kf_device__ float norm_sqr(const float3 &v) { return dot(v, v); }

__kf_device__ float3 normalized(const float3 &v) { return v * rsqrt(dot(v, v)); }

__kf_hdevice__ float3 cross(const float3 &v1, const float3 &v2) {
    return make_float3(v1.y * v2.z - v1.z * v2.y, v1.z * v2.x - v1.x * v2.z, v1.x * v2.y - v1.y * v2.x);
}

__kf_device__ void computeRoots2(const float &b, const float &c, float3 &roots) {
    roots.x = 0.f;
    float d = b * b - 4.f * c;
    if (d < 0.f)  // no real roots!!!! THIS SHOULD NOT HAPPEN!
        d = 0.f;

    float sd = sqrtf(d);

    roots.z = 0.5f * (b + sd);
    roots.y = 0.5f * (b - sd);
}

__kf_device__ void computeRoots3(float c0, float c1, float c2, float3 &roots) {
    if (fabsf(c0) < numeric_limits<float>::epsilon())  // one root is 0 -> quadratic equation
    {
        computeRoots2(c2, c1, roots);
    } else {
        const float s_inv3  = 1.f / 3.f;
        const float s_sqrt3 = sqrtf(3.f);
        // Construct the parameters used in classifying the roots of the equation
        // and in solving the equation for the roots in closed form.
        float c2_over_3 = c2 * s_inv3;
        float a_over_3  = (c1 - c2 * c2_over_3) * s_inv3;
        if (a_over_3 > 0.f)
            a_over_3 = 0.f;

        float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));

        float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;
        if (q > 0.f)
            q = 0.f;

        // Compute the eigenvalues by solving for the roots of the polynomial.
        float rho       = sqrtf(-a_over_3);
        float theta     = atan2f(sqrtf(-q), half_b) * s_inv3;
        float cos_theta = __cosf(theta);
        float sin_theta = __sinf(theta);
        roots.x         = c2_over_3 + 2.f * rho * cos_theta;
        roots.y         = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);
        roots.z         = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);

        // Sort in increasing order.
        if (roots.x >= roots.y)
            swap(roots.x, roots.y);

        if (roots.y >= roots.z) {
            swap(roots.y, roots.z);

            if (roots.x >= roots.y)
                swap(roots.x, roots.y);
        }
        if (roots.x <= 0)  // eigenval for symetric positive semi-definite matrix can
                           // not be negative! Set it to 0
            computeRoots2(c2, c1, roots);
    }
}

struct Eigen33 {
public:
    template <int Rows>
    struct MiniMat {
        float3 data[Rows];
        __kf_hdevice__ float3 &operator[](int i) { return data[i]; }
        __kf_hdevice__ const float3 &operator[](int i) const { return data[i]; }
    };
    typedef MiniMat<3> Mat33;
    typedef MiniMat<4> Mat43;

    static __kf_device__ float3 unitOrthogonal(const float3 &src) {
        float3 perp;
        /* Let us compute the crossed product of *this with a vector
         * that is not too close to being colinear to *this.
         */

        /* unless the x and y coords are both close to zero, we can
         * simply take ( -y, x, 0 ) and normalize it.
         */
        if (!isMuchSmallerThan(src.x, src.z) || !isMuchSmallerThan(src.y, src.z)) {
            float invnm = rsqrtf(src.x * src.x + src.y * src.y);
            perp.x      = -src.y * invnm;
            perp.y      = src.x * invnm;
            perp.z      = 0.0f;
        }
        /* if both x and y are close to zero, then the vector is close
         * to the z-axis, so it's far from colinear to the x-axis for instance.
         * So we take the crossed product with (1,0,0) and normalize it.
         */
        else {
            float invnm = rsqrtf(src.z * src.z + src.y * src.y);
            perp.x      = 0.0f;
            perp.y      = -src.z * invnm;
            perp.z      = src.y * invnm;
        }

        return perp;
    }

    __kf_device__ Eigen33(volatile float *mat_pkg_arg) : mat_pkg(mat_pkg_arg) {}
    __kf_device__ void compute(Mat33 &tmp, Mat33 &vec_tmp, Mat33 &evecs, float3 &evals) {
        // Scale the matrix so its entries are in [-1,1].  The scaling is applied
        // only when at least one matrix entry has magnitude larger than 1.

        float max01 = fmaxf(fabsf(mat_pkg[0]), fabsf(mat_pkg[1]));
        float max23 = fmaxf(fabsf(mat_pkg[2]), fabsf(mat_pkg[3]));
        float max45 = fmaxf(fabsf(mat_pkg[4]), fabsf(mat_pkg[5]));
        float m0123 = fmaxf(max01, max23);
        float scale = fmaxf(max45, m0123);

        if (scale <= numeric_limits<float>::min())
            scale = 1.f;

        mat_pkg[0] /= scale;
        mat_pkg[1] /= scale;
        mat_pkg[2] /= scale;
        mat_pkg[3] /= scale;
        mat_pkg[4] /= scale;
        mat_pkg[5] /= scale;

        // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0.  The
        // eigenvalues are the roots to this equation, all guaranteed to be
        // real-valued, because the matrix is symmetric.
        float c0 = m00() * m11() * m22() + 2.f * m01() * m02() * m12() - m00() * m12() * m12() - m11() * m02() * m02() -
                   m22() * m01() * m01();
        float c1 = m00() * m11() - m01() * m01() + m00() * m22() - m02() * m02() + m11() * m22() - m12() * m12();
        float c2 = m00() + m11() + m22();

        computeRoots3(c0, c1, c2, evals);

        if (evals.z - evals.x <= numeric_limits<float>::epsilon()) {
            evecs[0] = make_float3(1.f, 0.f, 0.f);
            evecs[1] = make_float3(0.f, 1.f, 0.f);
            evecs[2] = make_float3(0.f, 0.f, 1.f);
        } else if (evals.y - evals.x <= numeric_limits<float>::epsilon()) {
            // first and second equal
            tmp[0] = row0();
            tmp[1] = row1();
            tmp[2] = row2();
            tmp[0].x -= evals.z;
            tmp[1].y -= evals.z;
            tmp[2].z -= evals.z;

            vec_tmp[0] = cross(tmp[0], tmp[1]);
            vec_tmp[1] = cross(tmp[0], tmp[2]);
            vec_tmp[2] = cross(tmp[1], tmp[2]);

            float len1 = dot(vec_tmp[0], vec_tmp[0]);
            float len2 = dot(vec_tmp[1], vec_tmp[1]);
            float len3 = dot(vec_tmp[2], vec_tmp[2]);

            if (len1 >= len2 && len1 >= len3) {
                evecs[2] = vec_tmp[0] * rsqrtf(len1);
            } else if (len2 >= len1 && len2 >= len3) {
                evecs[2] = vec_tmp[1] * rsqrtf(len2);
            } else {
                evecs[2] = vec_tmp[2] * rsqrtf(len3);
            }

            evecs[1] = unitOrthogonal(evecs[2]);
            evecs[0] = cross(evecs[1], evecs[2]);
        } else if (evals.z - evals.y <= numeric_limits<float>::epsilon()) {
            // second and third equal
            tmp[0] = row0();
            tmp[1] = row1();
            tmp[2] = row2();
            tmp[0].x -= evals.x;
            tmp[1].y -= evals.x;
            tmp[2].z -= evals.x;

            vec_tmp[0] = cross(tmp[0], tmp[1]);
            vec_tmp[1] = cross(tmp[0], tmp[2]);
            vec_tmp[2] = cross(tmp[1], tmp[2]);

            float len1 = dot(vec_tmp[0], vec_tmp[0]);
            float len2 = dot(vec_tmp[1], vec_tmp[1]);
            float len3 = dot(vec_tmp[2], vec_tmp[2]);

            if (len1 >= len2 && len1 >= len3) {
                evecs[0] = vec_tmp[0] * rsqrtf(len1);
            } else if (len2 >= len1 && len2 >= len3) {
                evecs[0] = vec_tmp[1] * rsqrtf(len2);
            } else {
                evecs[0] = vec_tmp[2] * rsqrtf(len3);
            }

            evecs[1] = unitOrthogonal(evecs[0]);
            evecs[2] = cross(evecs[0], evecs[1]);
        } else {
            tmp[0] = row0();
            tmp[1] = row1();
            tmp[2] = row2();
            tmp[0].x -= evals.z;
            tmp[1].y -= evals.z;
            tmp[2].z -= evals.z;

            vec_tmp[0] = cross(tmp[0], tmp[1]);
            vec_tmp[1] = cross(tmp[0], tmp[2]);
            vec_tmp[2] = cross(tmp[1], tmp[2]);

            float len1 = dot(vec_tmp[0], vec_tmp[0]);
            float len2 = dot(vec_tmp[1], vec_tmp[1]);
            float len3 = dot(vec_tmp[2], vec_tmp[2]);

            float mmax[3];

            unsigned int min_el = 2;
            unsigned int max_el = 2;
            if (len1 >= len2 && len1 >= len3) {
                mmax[2]  = len1;
                evecs[2] = vec_tmp[0] * rsqrtf(len1);
            } else if (len2 >= len1 && len2 >= len3) {
                mmax[2]  = len2;
                evecs[2] = vec_tmp[1] * rsqrtf(len2);
            } else {
                mmax[2]  = len3;
                evecs[2] = vec_tmp[2] * rsqrtf(len3);
            }

            tmp[0] = row0();
            tmp[1] = row1();
            tmp[2] = row2();
            tmp[0].x -= evals.y;
            tmp[1].y -= evals.y;
            tmp[2].z -= evals.y;

            vec_tmp[0] = cross(tmp[0], tmp[1]);
            vec_tmp[1] = cross(tmp[0], tmp[2]);
            vec_tmp[2] = cross(tmp[1], tmp[2]);

            len1 = dot(vec_tmp[0], vec_tmp[0]);
            len2 = dot(vec_tmp[1], vec_tmp[1]);
            len3 = dot(vec_tmp[2], vec_tmp[2]);

            if (len1 >= len2 && len1 >= len3) {
                mmax[1]  = len1;
                evecs[1] = vec_tmp[0] * rsqrtf(len1);
                min_el   = len1 <= mmax[min_el] ? 1 : min_el;
                max_el   = len1 > mmax[max_el] ? 1 : max_el;
            } else if (len2 >= len1 && len2 >= len3) {
                mmax[1]  = len2;
                evecs[1] = vec_tmp[1] * rsqrtf(len2);
                min_el   = len2 <= mmax[min_el] ? 1 : min_el;
                max_el   = len2 > mmax[max_el] ? 1 : max_el;
            } else {
                mmax[1]  = len3;
                evecs[1] = vec_tmp[2] * rsqrtf(len3);
                min_el   = len3 <= mmax[min_el] ? 1 : min_el;
                max_el   = len3 > mmax[max_el] ? 1 : max_el;
            }

            tmp[0] = row0();
            tmp[1] = row1();
            tmp[2] = row2();
            tmp[0].x -= evals.x;
            tmp[1].y -= evals.x;
            tmp[2].z -= evals.x;

            vec_tmp[0] = cross(tmp[0], tmp[1]);
            vec_tmp[1] = cross(tmp[0], tmp[2]);
            vec_tmp[2] = cross(tmp[1], tmp[2]);

            len1 = dot(vec_tmp[0], vec_tmp[0]);
            len2 = dot(vec_tmp[1], vec_tmp[1]);
            len3 = dot(vec_tmp[2], vec_tmp[2]);

            if (len1 >= len2 && len1 >= len3) {
                mmax[0]  = len1;
                evecs[0] = vec_tmp[0] * rsqrtf(len1);
                min_el   = len3 <= mmax[min_el] ? 0 : min_el;
                max_el   = len3 > mmax[max_el] ? 0 : max_el;
            } else if (len2 >= len1 && len2 >= len3) {
                mmax[0]  = len2;
                evecs[0] = vec_tmp[1] * rsqrtf(len2);
                min_el   = len3 <= mmax[min_el] ? 0 : min_el;
                max_el   = len3 > mmax[max_el] ? 0 : max_el;
            } else {
                mmax[0]  = len3;
                evecs[0] = vec_tmp[2] * rsqrtf(len3);
                min_el   = len3 <= mmax[min_el] ? 0 : min_el;
                max_el   = len3 > mmax[max_el] ? 0 : max_el;
            }

            unsigned mid_el = 3 - min_el - max_el;
            evecs[min_el]   = normalized(cross(evecs[(min_el + 1) % 3], evecs[(min_el + 2) % 3]));
            evecs[mid_el]   = normalized(cross(evecs[(mid_el + 1) % 3], evecs[(mid_el + 2) % 3]));
        }
        // Rescale back to the original size.
        evals *= scale;
    }

private:
    volatile float *mat_pkg;

    __kf_device__ float m00() const { return mat_pkg[0]; }
    __kf_device__ float m01() const { return mat_pkg[1]; }
    __kf_device__ float m02() const { return mat_pkg[2]; }
    __kf_device__ float m10() const { return mat_pkg[1]; }
    __kf_device__ float m11() const { return mat_pkg[3]; }
    __kf_device__ float m12() const { return mat_pkg[4]; }
    __kf_device__ float m20() const { return mat_pkg[2]; }
    __kf_device__ float m21() const { return mat_pkg[4]; }
    __kf_device__ float m22() const { return mat_pkg[5]; }

    __kf_device__ float3 row0() const { return make_float3(m00(), m01(), m02()); }
    __kf_device__ float3 row1() const { return make_float3(m10(), m11(), m12()); }
    __kf_device__ float3 row2() const { return make_float3(m20(), m21(), m22()); }

    __kf_device__ static bool isMuchSmallerThan(float x, float y) {
        // copied from <eigen>/include/Eigen/src/Core/NumTraits.h
        const float prec_sqr = numeric_limits<float>::epsilon() * numeric_limits<float>::epsilon();
        return x * x <= prec_sqr * y * y;
    }
};

struct Warp {
    enum { LOG_WARP_SIZE = 5, WARP_SIZE = 1 << LOG_WARP_SIZE, STRIDE = WARP_SIZE };

    /** \brief Returns the warp lane ID of the calling thread. */
    static __kf_device__ unsigned int laneId() {
        unsigned int ret;
        asm("mov.u32 %0, %laneid;" : "=r"(ret));
        return ret;
    }

    static __kf_device__ unsigned int id() {
        int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
        return tid >> LOG_WARP_SIZE;
    }

    static __kf_device__ int laneMaskLt() {
#if (__CUDA_ARCH__ >= 200)
        unsigned int ret;
        asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret));
        return ret;
#else
        return 0xFFFFFFFF >> (32 - laneId());
#endif
    }
    static __kf_device__ int binaryExclScan(int ballot_mask) { return __popc(Warp::laneMaskLt() & ballot_mask); }
};

struct Block {
    static __kf_device__ unsigned int stride() { return blockDim.x * blockDim.y * blockDim.z; }

    static __kf_device__ int flattenedThreadId() {
        return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
    }

    template <int CTA_SIZE, typename T, class BinOp>
    static __kf_device__ void reduce(volatile T *buffer, BinOp op) {
        int tid = flattenedThreadId();
        T val   = buffer[tid];

        if (CTA_SIZE >= 1024) {
            if (tid < 512)
                buffer[tid] = val = op(val, buffer[tid + 512]);
            __syncthreads();
        }
        if (CTA_SIZE >= 512) {
            if (tid < 256)
                buffer[tid] = val = op(val, buffer[tid + 256]);
            __syncthreads();
        }
        if (CTA_SIZE >= 256) {
            if (tid < 128)
                buffer[tid] = val = op(val, buffer[tid + 128]);
            __syncthreads();
        }
        if (CTA_SIZE >= 128) {
            if (tid < 64)
                buffer[tid] = val = op(val, buffer[tid + 64]);
            __syncthreads();
        }

        if (tid < 32) {
            if (CTA_SIZE >= 64) {
                buffer[tid] = val = op(val, buffer[tid + 32]);
            }
            if (CTA_SIZE >= 32) {
                buffer[tid] = val = op(val, buffer[tid + 16]);
            }
            if (CTA_SIZE >= 16) {
                buffer[tid] = val = op(val, buffer[tid + 8]);
            }
            if (CTA_SIZE >= 8) {
                buffer[tid] = val = op(val, buffer[tid + 4]);
            }
            if (CTA_SIZE >= 4) {
                buffer[tid] = val = op(val, buffer[tid + 2]);
            }
            if (CTA_SIZE >= 2) {
                buffer[tid] = val = op(val, buffer[tid + 1]);
            }
        }
    }

    template <int CTA_SIZE, typename T, class BinOp>
    static __kf_device__ T reduce(volatile T *buffer, T init, BinOp op) {
        int tid = flattenedThreadId();
        T val = buffer[tid] = init;
        __syncthreads();

        if (CTA_SIZE >= 1024) {
            if (tid < 512)
                buffer[tid] = val = op(val, buffer[tid + 512]);
            __syncthreads();
        }
        if (CTA_SIZE >= 512) {
            if (tid < 256)
                buffer[tid] = val = op(val, buffer[tid + 256]);
            __syncthreads();
        }
        if (CTA_SIZE >= 256) {
            if (tid < 128)
                buffer[tid] = val = op(val, buffer[tid + 128]);
            __syncthreads();
        }
        if (CTA_SIZE >= 128) {
            if (tid < 64)
                buffer[tid] = val = op(val, buffer[tid + 64]);
            __syncthreads();
        }

        if (tid < 32) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 && 0
            if (CTA_SIZE >= 64)
                val = op(val, buffer[tid + 32]);
            if (CTA_SIZE >= 32)
                val = op(val, __shfl_xor(val, 16));
            if (CTA_SIZE >= 16)
                val = op(val, __shfl_xor(val, 8));
            if (CTA_SIZE >= 8)
                val = op(val, __shfl_xor(val, 4));
            if (CTA_SIZE >= 4)
                val = op(val, __shfl_xor(val, 2));
            if (CTA_SIZE >= 2)
                buffer[tid] = op(val, __shfl_xor(val, 1));
#else
            if (CTA_SIZE >= 64) {
                buffer[tid] = val = op(val, buffer[tid + 32]);
            }
            if (CTA_SIZE >= 32) {
                buffer[tid] = val = op(val, buffer[tid + 16]);
            }
            if (CTA_SIZE >= 16) {
                buffer[tid] = val = op(val, buffer[tid + 8]);
            }
            if (CTA_SIZE >= 8) {
                buffer[tid] = val = op(val, buffer[tid + 4]);
            }
            if (CTA_SIZE >= 4) {
                buffer[tid] = val = op(val, buffer[tid + 2]);
            }
            if (CTA_SIZE >= 2) {
                buffer[tid] = val = op(val, buffer[tid + 1]);
            }
#endif
        }
        __syncthreads();
        return buffer[0];
    }
};

struct Emulation {
    static __kf_device__ int warp_reduce(volatile int *ptr, const unsigned int tid) {
        const unsigned int lane = tid & 31;  // index of thread in warp (0..31)

        if (lane < 16) {
            int partial = ptr[tid];

            ptr[tid] = partial = partial + ptr[tid + 16];
            ptr[tid] = partial = partial + ptr[tid + 8];
            ptr[tid] = partial = partial + ptr[tid + 4];
            ptr[tid] = partial = partial + ptr[tid + 2];
            ptr[tid] = partial = partial + ptr[tid + 1];
        }
        return ptr[tid - lane];
    }

    static __kf_device__ int Ballot(int predicate, volatile int *cta_buffer) {
#if __CUDA_ARCH__ >= 200
        (void) cta_buffer;
        return __ballot_sync(FULL_MASK, predicate);
#else
        int tid         = Block::flattenedThreadId();
        cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
        return warp_reduce(cta_buffer, tid);
#endif
    }

    static __kf_device__ bool All(int predicate, volatile int *cta_buffer) {
#if __CUDA_ARCH__ >= 200
        (void) cta_buffer;
        return __all_sync(FULL_MASK, predicate);
#else
        int tid         = Block::flattenedThreadId();
        cta_buffer[tid] = predicate ? 1 : 0;
        return warp_reduce(cta_buffer, tid) == 32;
#endif
    }
};
}  // namespace device
}  // namespace kfusion


================================================
FILE: include/kfusion/cuda/texture_binder.hpp
================================================
#pragma once

#include <kfusion/cuda/device_array.hpp>
#include <kfusion/safe_call.hpp>

namespace kfusion {
namespace cuda {
class TextureBinder {
public:
    template <class T, enum cudaTextureReadMode readMode>
    TextureBinder(const DeviceArray2D<T> &arr, const struct texture<T, 2, readMode> &tex) : texref(&tex) {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
        cudaSafeCall(cudaBindTexture2D(0, tex, arr.ptr(), desc, arr.cols(), arr.rows(), arr.step()));
    }

    template <class T, enum cudaTextureReadMode readMode>
    TextureBinder(const DeviceArray<T> &arr, const struct texture<T, 1, readMode> &tex) : texref(&tex) {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
        cudaSafeCall(cudaBindTexture(0, tex, arr.ptr(), desc, arr.sizeBytes()));
    }

    template <class T, enum cudaTextureReadMode readMode>
    TextureBinder(const PtrStepSz<T> &arr, const struct texture<T, 2, readMode> &tex) : texref(&tex) {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
        cudaSafeCall(cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step));
    }

    template <class A, class T, enum cudaTextureReadMode readMode>
    TextureBinder(const A &arr, const struct texture<T, 2, readMode> &tex, const cudaChannelFormatDesc &desc)
        : texref(&tex) {
        cudaSafeCall(cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step));
    }

    template <class T, enum cudaTextureReadMode readMode>
    TextureBinder(const PtrSz<T> &arr, const struct texture<T, 1, readMode> &tex) : texref(&tex) {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
        cudaSafeCall(cudaBindTexture(0, tex, arr.data, desc, arr.size * arr.elemSize()));
    }

    ~TextureBinder() { cudaSafeCall(cudaUnbindTexture(texref)); }

private:
    const struct textureReference *texref;
};
}  // namespace cuda

namespace device {
using kfusion::cuda::TextureBinder;
}
}  // namespace kfusion


================================================
FILE: include/kfusion/cuda/tsdf_volume.hpp
================================================
#pragma once

/* kinfu includes */
#include <kfusion/types.hpp>

/* pcl includes */
#include <pcl/common/geometry.h>

/* sobfu includes */
#include <sobfu/params.hpp>

/* sys headers */
#include <memory>

namespace kfusion {
namespace cuda {
class KF_EXPORTS TsdfVolume {
public:
    TsdfVolume(const Params &params);
    virtual ~TsdfVolume();

    void create(const Vec3i &dims);

    Vec3i getDims() const;
    Vec3f getVoxelSize() const;

    const CudaData data() const;
    CudaData data();

    Vec3f getSize() const;
    void setSize(const Vec3f &size);

    float getTruncDist() const;
    void setTruncDist(float &distance);

    float getEta() const;
    void setEta(float &eta);

    float getMaxWeight() const;
    void setMaxWeight(float &weight);

    Affine3f getPose() const;
    void setPose(const Affine3f &pose);

    float getRaycastStepFactor() const;
    void setRaycastStepFactor(float &factor);

    float getGradientDeltaFactor() const;
    void setGradientDeltaFactor(float &factor);

    Vec3i getGridOrigin() const;
    void setGridOrigin(const Vec3i &origin);

    virtual void clear();
    void swap(CudaData &data);

    virtual void applyAffine(const Affine3f &affine);

    virtual void integrate(const TsdfVolume &phi_n_psi);
    virtual void integrate(const Dists &dists, const Affine3f &camera_pose, const Intr &intr);

    virtual void initBox(const float3 &b);
    virtual void initEllipsoid(const float3 &r);
    virtual void initPlane(const float &z);
    virtual void initSphere(const float3 &centre, const float &radius);
    virtual void initTorus(const float2 &t);

    void print_sdf_values();

    struct Entry {
        typedef unsigned short half;

        half tsdf;
        int weight;

        static float half2float(half value);
        static half float2half(float value);
    };

private:
    CudaData data_;

    float trunc_dist_;
    float eta_;
    float max_weight_;
    Vec3i dims_;
    Vec3f size_;
    Affine3f pose_;

    float gradient_delta_factor_;
    float raycast_step_factor_;
};
}  // namespace cuda
}  // namespace kfusion


================================================
FILE: include/kfusion/exports.hpp
================================================
#pragma once

#if (defined WIN32 || defined _WIN32 || defined WINCE) && defined KFUSION_API_EXPORTS
#define KF_EXPORTS __declspec(dllexport)
#else
#define KF_EXPORTS
#endif


================================================
FILE: include/kfusion/internal.hpp
================================================
#pragma once

/* kinfu includes */
#include <kfusion/cuda/device_array.hpp>
#include <kfusion/safe_call.hpp>

/* pcl includes */
#include <pcl/point_types.h>

//#define USE_DEPTH

struct Mat4f {
    float4 data[4];
};

namespace kfusion {
namespace device {
typedef float4 Normal;
typedef float4 Point;

typedef unsigned short ushort;
typedef unsigned char uchar;

typedef PtrStepSz<float> Dists;
typedef DeviceArray2D<ushort> Depth;
typedef DeviceArray2D<Normal> Normals;
typedef DeviceArray2D<Point> Points;
typedef DeviceArray2D<uchar4> Image;

typedef float3 *Displacements;

typedef DeviceArray<float4> Vertices;
typedef Vertices Norms;

/*
 * struct which holds a surface as triangles
 */
struct Surface {
    Vertices vertices;
    Norms normals;
};

struct float8 {
    float x, y, z, w, c1, c2, c3, c4;
};

typedef float4 PointType;

typedef int3 Vec3i;
typedef float3 Vec3f;
struct Mat3f {
    float3 data[3];
};
struct Aff3f {
    Mat3f R;
    Vec3f t;
};

struct TsdfVolume {
public:
    float2 *const data;
    const int3 dims;
    const float3 voxel_size;
    const float trunc_dist;
    const float eta;
    const float max_weight;

    TsdfVolume(float2 *const data, int3 dims, float3 voxel_size, float trunc_dist, float eta, float max_weight);
    // TsdfVolume(const TsdfVolume &);

    TsdfVolume &operator=(const TsdfVolume &);

    __kf_device__ float2 *operator()(int x, int y, int z);
    __kf_device__ const float2 *operator()(int x, int y, int z) const;

    __kf_device__ float2 *beg(int x, int y) const;
    __kf_device__ float2 *zstep(float2 *const ptr) const;
};

struct Projector {
    float2 f, c;
    Projector() {}
    Projector(float fx, float fy, float cx, float cy);
    __kf_device__ float2 operator()(const float3 &p) const;
};

struct Reprojector {
    Reprojector() {}
    Reprojector(float fx, float fy, float cx, float cy);
    float2 finv, c;
    __kf_device__ float3 operator()(int x, int y, float z) const;
};

struct CubeIndexEstimator {
    CubeIndexEstimator(const TsdfVolume &vol) : volume(vol) { isoValue = 0.f; }

    const TsdfVolume volume;
    float isoValue;

    __kf_device__ int computeCubeIndex(int x, int y, int z, float f[8]) const;
    __kf_device__ void readTsdf(int x, int y, int z, float &f, float &weight) const;
};

struct OccupiedVoxels : public CubeIndexEstimator {
    OccupiedVoxels(const TsdfVolume &vol) : CubeIndexEstimator({vol}) {}

    enum {
        CTA_SIZE_X = 32,
        CTA_SIZE_Y = 8,
        CTA_SIZE   = CTA_SIZE_X * CTA_SIZE_Y,

        LOG_WARP_SIZE = 5,
        WARP_SIZE     = 1 << LOG_WARP_SIZE,
        WARPS_COUNT   = CTA_SIZE / WARP_SIZE,
    };

    mutable int *voxels_indices;
    mutable int *vertices_number;
    int max_size;

    __kf_device__ void operator()() const;
};

struct TrianglesGenerator : public CubeIndexEstimator {
    TrianglesGenerator(const TsdfVolume &vol) : CubeIndexEstimator({vol}) {}

    enum { CTA_SIZE = 256, MAX_GRID_SIZE_X = 65536 };

    const int *occupied_voxels;
    const int *vertex_ofssets;
    int voxels_count;
    float3 cell_size;

    Aff3f pose;

    mutable device::PointType *outputVertices;
    mutable device::PointType *outputNormals;

    __kf_device__ void operator()() const;

    __kf_device__ void store_point(float4 *ptr, int index, const float3 &vertex) const;

    __kf_device__ float3 get_node_coo(int x, int y, int z) const;
    __kf_device__ float3 vertex_interp(float3 p0, float3 p1, float f0, float f1) const;

};  // namespace device

struct ComputeIcpHelper {
    struct Policy;
    struct PageLockHelper {
        float *data;
        PageLockHelper();
        ~PageLockHelper();
    };

    float min_cosine;
    float dist2_thres;

    Aff3f aff;

    float rows, cols;
    float2 f, c, finv;

    PtrStep<ushort> dcurr;
    PtrStep<Normal> ncurr;
    PtrStep<Point> vcurr;

    ComputeIcpHelper(float dist_thres, float angle_thres);
    void setLevelIntr(int level_index, float fx, float fy, float cx, float cy);

    void operator()(const Depth &dprev, const Normals &nprev, DeviceArray2D<float> &buffer, float *data,
                    cudaStream_t stream);
    void operator()(const Points &vprev, const Normals &nprev, DeviceArray2D<float> &buffer, float *data,
                    cudaStream_t stream);

    static void allocate_buffer(DeviceArray2D<float> &buffer, int partials_count = -1);

    // private:
    __kf_device__ int find_coresp(int x, int y, float3 &n, float3 &d, float3 &s) const;
    __kf_device__ void partial_reduce(const float row[7], PtrStep<float> &partial_buffer) const;
    __kf_device__ float2 proj(const float3 &p) const;
    __kf_device__ float3 reproj(float x, float y, float z) const;
};

/*
 * TSDF FUNCTIONS
 */

void clear_volume(TsdfVolume &volume);

void integrate(TsdfVolume &phi_global, TsdfVolume &phi_n_psi);
void integrate(const Dists &depth, TsdfVolume &volume, const Aff3f &aff, const Projector &proj);

void init_box(TsdfVolume &volume, const float3 &b);
void init_ellipsoid(TsdfVolume &volume, const float3 &r);
void init_plane(TsdfVolume &volume, const float &z);
void init_sphere(TsdfVolume &volume, const float3 &centre, const float &radius);
void init_torus(TsdfVolume &volume, const float2 &t);

/*
 * MARCHING CUBES FUNCTIONS
 */

/** \brief binds marching cubes tables to texture references */
void bindTextures(const int *edgeBuf, const int *triBuf, const int *numVertsBuf);

/** \brief unbinds */
void unbindTextures();

/** \brief scans TSDF volume and retrieves occuped voxes
 * \param[in] volume TSDF volume
 * \param[out] occupied_voxels buffer for occupied voxels; the function fills the first row with voxel id's and second
 * row with the no. of vertices
 * \return number of voxels in the buffer
 */
int getOccupiedVoxels(const TsdfVolume &volume, DeviceArray2D<int> &occupied_voxels);

/** \brief computes total number of vertices for all voxels and offsets of vertices in final triangle array
 * \param[out] occupied_voxels buffer with occupied voxels; the function fills 3rd only with offsets
 * \return total number of vertexes
 */
int computeOffsetsAndTotalVertices(DeviceArray2D<int> &occupied_voxels);

/** \brief generates final triangle array
 * \param[in] volume TSDF volume
 * \param[in] occupied_voxels occupied voxel ids (1st row), number of vertices (2nd row), offsets (3rd row).
 * \param[in] volume_size volume size in meters
 * \param[out] output triangle array
 */
void generateTriangles(const TsdfVolume &volume, const DeviceArray2D<int> &occupied_voxels, const float3 &volume_size,
                       const Aff3f &pose, DeviceArray<PointType> &outputVertices,
                       DeviceArray<PointType> &outputNormals);

/*
 *  IMAGE PROCESSING FUNCTIONS
 */
void compute_dists(const Depth &depth, Dists dists, float2 f, float2 c);

void truncateDepth(Depth &depth, float max_dist /*meters*/);
void bilateralFilter(const Depth &src, Depth &dst, int kernel_size, float sigma_spatial, float sigma_depth);
void depthPyr(const Depth &source, Depth &pyramid, float sigma_depth);

void resizeDepthNormals(const Depth &depth, const Normals &normals, Depth &depth_out, Normals &normals_out);
void resizePointsNormals(const Points &points, const Normals &normals, Points &points_out, Normals &normals_out);

void computeNormalsAndMaskDepth(const Reprojector &reproj, Depth &depth, Normals &normals);
void computePointNormals(const Reprojector &reproj, const Depth &depth, Points &points, Normals &normals);

/** \brief rasterises the surface extracted via marching cubes
 *  \param[in] proj
 *  \param[in] surface
 *  \param[out] vertices_out
 *  \paraim[out] normals_out
 */
void rasteriseSurface(const Projector &proj, const Aff3f &vol2cam, const Surface &surface, Points &vertices_out,
                      Normals &normals_out);

}  // namespace device
}  // namespace kfusion


================================================
FILE: include/kfusion/kinfu.hpp
================================================
#pragma once

/* boost includes */
#include <boost/filesystem.hpp>

/* kinfu includes */
#include <kfusion/cuda/marching_cubes.hpp>
#include <kfusion/cuda/projective_icp.hpp>
#include <kfusion/cuda/tsdf_volume.hpp>
#include <kfusion/types.hpp>

/* pcl includes */
#include <pcl/PCLPointCloud2.h>
#include <pcl/PolygonMesh.h>
#include <pcl/conversions.h>
#include <pcl/point_cloud.h>
#include <pcl/point_types.h>

/* sys headers */
#include <string>
#include <vector>

namespace kfusion {
namespace cuda {
KF_EXPORTS int getCudaEnabledDeviceCount();
KF_EXPORTS void setDevice(int device);
KF_EXPORTS std::string getDeviceName(int device);
KF_EXPORTS bool checkIfPreFermiGPU(int device);
KF_EXPORTS void printCudaDeviceInfo(int device);
KF_EXPORTS void printShortCudaDeviceInfo(int device);
}  // namespace cuda

struct KF_EXPORTS KinFuParams {
    static KinFuParams default_params();

    int cols;  // pixels
    int rows;  // pixels

    Intr intr;  // Camera parameters

    Vec3i volume_dims;     // number of voxels
    Vec3f volume_size;     // meters
    Affine3f volume_pose;  // meters, inital pose

    float bilateral_sigma_depth;    // meters
    float bilateral_sigma_spatial;  // pixels
    int bilateral_kernel_size;      // pixels

    float icp_truncate_depth_dist;  // meters
    float icp_dist_thres;           // meters
    float icp_angle_thres;          // radians
    std::vector<int> icp_iter_num;  // iterations for level index 0,1,..,3

    float tsdf_min_camera_movement;  // meters, integrate only if exceedes
    float tsdf_trunc_dist;           // meters;
    float tsdf_max_weight;           // frames

    float raycast_step_factor;    // in voxel sizes
    float gradient_delta_factor;  // in voxel sizes

    Vec3f light_pose;  // meters
};

class KF_EXPORTS KinFu {
public:
    typedef cv::Ptr<KinFu> Ptr;

    KinFu(const KinFuParams &params);

    const KinFuParams &params() const;
    KinFuParams &params();

    const cuda::TsdfVolume &tsdf() const;
    cuda::TsdfVolume &tsdf();

    const cuda::ProjectiveICP &icp() const;
    cuda::ProjectiveICP &icp();

    const cuda::MarchingCubes &mc() const;
    cuda::MarchingCubes &mc();

    void reset();

    bool operator()(const cuda::Depth &depth, const cuda::Image &image = cuda::Image());

    void renderImage(cuda::Image &image, int flag = 0);
    void renderImage(cuda::Image &image, const Affine3f &pose, int flag = 0);

    Affine3f getCameraPose(int time = -1) const;

protected:
    void allocate_buffers();

    int frame_counter_;
    KinFuParams params_;

    std::vector<Affine3f> poses_;
    cuda::Dists dists_;
    cuda::Frame curr_, prev_;

    cuda::Cloud points_;
    cuda::Normals normals_;
    cuda::Depth depths_;

    cv::Ptr<cuda::TsdfVolume> volume_;
    cv::Ptr<cuda::ProjectiveICP> icp_;
    cv::Ptr<cuda::MarchingCubes> mc_;
};
}  // namespace kfusion


================================================
FILE: include/kfusion/precomp.hpp
================================================
#pragma once

/* cuda includes */
#include <vector_functions.h>

/* kinfu includes */
#include <kfusion/cuda/imgproc.hpp>
#include <kfusion/cuda/marching_cubes.hpp>
#include <kfusion/cuda/projective_icp.hpp>
#include <kfusion/cuda/tsdf_volume.hpp>
#include <kfusion/internal.hpp>
#include <kfusion/kinfu.hpp>
#include <kfusion/types.hpp>

/* sys headers */
#include <iostream>

namespace kfusion {
template <typename D, typename S>
inline D device_cast(const S &source) {
    return *reinterpret_cast<const D *>(source.val);
}

template <>
inline device::Aff3f device_cast<device::Aff3f, Affine3f>(const Affine3f &source) {
    device::Aff3f aff;
    Mat3f R = source.rotation();
    Vec3f t = source.translation();
    aff.R   = device_cast<device::Mat3f>(R);
    aff.t   = device_cast<device::Vec3f>(t);
    return aff;
}
}  // namespace kfusion


================================================
FILE: include/kfusion/safe_call.hpp
================================================
#pragma once

#include <cuda_runtime_api.h>

namespace kfusion {
namespace cuda {
void error(const char *error_string, const char *file, const int line, const char *func);
}
}  // namespace kfusion

#if defined(__GNUC__)
#define cudaSafeCall(expr) kfusion::cuda::___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) kfusion::cuda::___cudaSafeCall(expr, __FILE__, __LINE__)
#endif

namespace kfusion {
namespace cuda {
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") {
    if (cudaSuccess != err)
        error(cudaGetErrorString(err), file, line, func);
}

static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
}  // namespace cuda

namespace device {
using kfusion::cuda::divUp;
}
}  // namespace kfusion


================================================
FILE: include/kfusion/types.hpp
================================================
#pragma once

/* cuda includes */
#include <vector_functions.h>

/* kinfu includes */
#include <kfusion/cuda/device_array.hpp>

/* pcl includes */
#include <pcl/point_types.h>

/* opencv includes */
#include <opencv2/core/affine.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/viz/vizcore.hpp>

/* sys headers */
#include <iosfwd>

struct CUevent_st;

namespace kfusion {
typedef cv::Matx33f Mat3f;
typedef cv::Vec3f Vec3f;
typedef cv::Vec3i Vec3i;
typedef cv::Affine3f Affine3f;

struct KF_EXPORTS Intr {
    float fx, fy, cx, cy;

    Intr();
    Intr(float fx, float fy, float cx, float cy);
    Intr operator()(int level_index) const;
};

KF_EXPORTS std::ostream &operator<<(std::ostream &os, const Intr &intr);

struct Point {
    union {
        float data[4];
        struct {
            float x, y, z;
        };
    };

    Point() = default;

    Point(float x, float y, float z) {
        this->x = x;
        this->y = y;
        this->z = z;
    }
};

typedef Point Normal;

struct RGB {
    union {
        struct {
            unsigned char b, g, r;
        };
        int bgra;
    };
};

struct PixelRGB {
    unsigned char r, g, b;
};

namespace cuda {
typedef cuda::DeviceMemory CudaData;
typedef cuda::CudaData Displacements;
typedef cuda::DeviceArray2D<unsigned short> Depth;
typedef cuda::DeviceArray2D<float> Dists;
typedef cuda::DeviceArray2D<RGB> Image;
typedef cuda::DeviceArray2D<Normal> Normals;
typedef cuda::DeviceArray2D<Point> Cloud;

typedef cuda::DeviceArray<pcl::PointXYZ> Vertices;
typedef cuda::DeviceArray<pcl::Normal> Norms;

/*
 * struct used to hold the surface as triangles
 */
struct Surface {
    Vertices vertices;
    Norms normals;
};

struct Frame {
    bool use_points;

    std::vector<Depth> depth_pyr;
    std::vector<Cloud> points_pyr;
    std::vector<Normals> normals_pyr;
};
}  // namespace cuda

inline float deg2rad(float alpha) { return alpha * 0.017453293f; }

struct KF_EXPORTS ScopeTime {
    const char *name;
    double start;
    ScopeTime(const char *name);
    ~ScopeTime();
};

struct KF_EXPORTS SampledScopeTime {
public:
    enum { EACH = 34 };
    SampledScopeTime(double &time_ms);
    ~SampledScopeTime();

private:
    double getTime();
    SampledScopeTime(const SampledScopeTime &);
    SampledScopeTime &operator=(const SampledScopeTime &);

    double &time_ms_;
    double start;
};

}  // namespace kfusion


================================================
FILE: include/sobfu/cuda/utils.hpp
================================================
#pragma once

/* cuda includes */
#include <cuda.h>
#include <math_constants.h>

/* kinfu includes */
#include <kfusion/internal.hpp>

/* utility class used to avoid linker errors with extern unsized shared memory arrays with templated type */
template <class T>
struct SharedMemory {
    __device__ inline operator T *() {
        extern __shared__ int __smem[];
        return (T *) __smem;
    }

    __device__ inline operator const T *() const {
        extern __shared__ int __smem[];
        return (T *) __smem;
    }
};

/* get 1d index from 3d coordinates */
__device__ __forceinline__ int get_global_idx(int x, int y, int z, int dim_x, int dim_y) {
    return z * dim_y * dim_y + y * dim_x + x;
}

/*
 * LERP
 */

template <typename T>
__host__ __device__ __forceinline__ T lerp(T v0, T v1, T t) {
    return fma(t, v0, fma(-t, v1, v1));
}

__host__ __device__ __forceinline__ float3 lerp3f(float3 v0, float3 v1, float t) {
    return make_float3(lerp<float>(v0.x, v1.x, t), lerp<float>(v0.y, v1.y, t), lerp<float>(v0.z, v1.z, t));
}

__host__ __device__ __forceinline__ float4 lerp4f(float4 v0, float4 v1, float t) {
    return make_float4(lerp<float>(v0.x, v1.x, t), lerp<float>(v0.y, v1.y, t), lerp<float>(v0.z, v1.z, t), 0.f);
}

/*
 * INTERPOLATION
 */

template <typename Vol>
__device__ __forceinline__ float2 interpolate_tsdf(const Vol &volume, const float3 &p_voxels) {
    float3 cf = p_voxels;

    cf.x = fminf(fmaxf(0.f, cf.x), (float) volume.dims.x - 1);
    cf.y = fminf(fmaxf(0.f, cf.y), (float) volume.dims.y - 1);
    cf.z = fminf(fmaxf(0.f, cf.z), (float) volume.dims.z - 1);

    /* rounding to negative infinity */
    int3 g = make_int3(__float2int_rd(cf.x), __float2int_rd(cf.y), __float2int_rd(cf.z));

    int idx_x_1 = g.x + 1;
    if (cf.x == 0.f || cf.x == (float) volume.dims.x - 1) {
        idx_x_1--;
    }
    int idx_y_1 = g.y + 1;
    if (cf.y == 0.f || cf.y == (float) volume.dims.y - 1) {
        idx_y_1--;
    }
    int idx_z_1 = g.z + 1;
    if (cf.z == 0.f || cf.z == (float) volume.dims.z - 1) {
        idx_z_1--;
    }

    float a = cf.x - g.x;
    float b = cf.y - g.y;
    float c = cf.z - g.z;

    float tsdf   = lerp(lerp(lerp((*volume(idx_x_1, idx_y_1, idx_z_1)).x, (*volume(idx_x_1, idx_y_1, g.z + 0)).x, c),
                           lerp((*volume(idx_x_1, g.y + 0, idx_z_1)).x, (*volume(idx_x_1, g.y + 0, g.z + 0)).x, c), b),
                      lerp(lerp((*volume(g.x + 0, idx_y_1, idx_z_1)).x, (*volume(g.x + 0, idx_y_1, g.z + 0)).x, c),
                           lerp((*volume(g.x + 0, g.y + 0, idx_z_1)).x, (*volume(g.x + 0, g.y + 0, g.z + 0)).x, c), b),
                      a);
    float weight = (*volume(g.x, g.y, g.z)).y;

    return make_float2(tsdf, weight);
}

template <typename Field>
__device__ __forceinline__ float4 interpolate_field(const Field &field, const float3 &p_voxels) {
    float3 cf = p_voxels;

    cf.x = fminf(fmaxf(0.f, cf.x), (float) field.dims.x - 1);
    cf.y = fminf(fmaxf(0.f, cf.y), (float) field.dims.y - 1);
    cf.z = fminf(fmaxf(0.f, cf.z), (float) field.dims.z - 1);

    /* rounding to negative infinity */
    int3 g = make_int3(__float2int_rd(cf.x), __float2int_rd(cf.y), __float2int_rd(cf.z));

    int idx_x_1 = g.x + 1;
    if (cf.x == 0.f || cf.x == (float) field.dims.x - 1) {
        idx_x_1--;
    }
    int idx_y_1 = g.y + 1;
    if (cf.y == 0.f || cf.y == (float) field.dims.y - 1) {
        idx_y_1--;
    }
    int idx_z_1 = g.z + 1;
    if (cf.z == 0.f || cf.z == (float) field.dims.z - 1) {
        idx_z_1--;
    }

    float a = cf.x - g.x;
    float b = cf.y - g.y;
    float c = cf.z - g.z;

    float4 result = lerp4f(lerp4f(lerp4f(*field(idx_x_1, idx_y_1, idx_z_1), *field(idx_x_1, idx_y_1, g.z + 0), c),
                                  lerp4f(*field(idx_x_1, g.y + 0, idx_z_1), *field(idx_x_1, g.y + 0, g.z + 0), c), b),
                           lerp4f(lerp4f(*field(g.x + 0, idx_y_1, idx_z_1), *field(g.x + 0, idx_y_1, g.z + 0), c),
                                  lerp4f(*field(g.x + 0, g.y + 0, idx_z_1), *field(g.x + 0, g.y + 0, g.z + 0), c), b),
                           a);
    return result;
}

template <typename Field>
__device__ __forceinline__ float4 interpolate_field_inv(const Field &field, const float3 &p_voxels) {
    float3 cf = p_voxels;

    cf.x = fminf(fmaxf(0.f, cf.x), (float) field.dims.x - 1);
    cf.y = fminf(fmaxf(0.f, cf.y), (float) field.dims.y - 1);
    cf.z = fminf(fmaxf(0.f, cf.z), (float) field.dims.z - 1);

    /* rounding to negative infinity */
    int3 g = make_int3(__float2int_rd(cf.x), __float2int_rd(cf.y), __float2int_rd(cf.z));

    int idx_x_1 = g.x + 1;
    if (cf.x == 0.f || cf.x == (float) field.dims.x - 1) {
        idx_x_1--;
    }
    int idx_y_1 = g.y + 1;
    if (cf.y == 0.f || cf.y == (float) field.dims.y - 1) {
        idx_y_1--;
    }
    int idx_z_1 = g.z + 1;
    if (cf.z == 0.f || cf.z == (float) field.dims.z - 1) {
        idx_z_1--;
    }

    float a = cf.x - g.x;
    float b = cf.y - g.y;
    float c = cf.z - g.z;

    float4 result = lerp4f(lerp4f(lerp4f(field.get_displacement(idx_x_1, idx_y_1, idx_z_1),
                                         field.get_displacement(idx_x_1, idx_y_1, g.z + 0), c),
                                  lerp4f(field.get_displacement(idx_x_1, g.y + 0, idx_z_1),
                                         field.get_displacement(idx_x_1, g.y + 0, g.z + 0), c),
                                  b),
                           lerp4f(lerp4f(field.get_displacement(g.x + 0, idx_y_1, idx_z_1),
                                         field.get_displacement(g.x + 0, idx_y_1, g.z + 0), c),
                                  lerp4f(field.get_displacement(g.x + 0, g.y + 0, idx_z_1),
                                         field.get_displacement(g.x + 0, g.y + 0, g.z + 0), c),
                                  b),
                           a);
    return result;
}

__device__ __forceinline__ float4 interpolate_field(const float4 *field, const float3 &p_voxels, int dim_x, int dim_y,
                                                    int dim_z) {
    float3 cf = p_voxels;

    cf.x = fminf(fmaxf(0.f, cf.x), (float) dim_x - 1);
    cf.y = fminf(fmaxf(0.f, cf.y), (float) dim_y - 1);
    cf.z = fminf(fmaxf(0.f, cf.z), (float) dim_z - 1);

    /* rounding to negative infinity */
    int3 g = make_int3(__float2int_rd(cf.x), __float2int_rd(cf.y), __float2int_rd(cf.z));

    int idx_x_1 = g.x + 1;
    if (cf.x == 0.f || cf.x == (float) dim_x - 1) {
        idx_x_1--;
    }
    int idx_y_1 = g.y + 1;
    if (cf.y == 0.f || cf.y == (float) dim_y - 1) {
        idx_y_1--;
    }
    int idx_z_1 = g.z + 1;
    if (cf.z == 0.f || cf.z == (float) dim_z - 1) {
        idx_z_1--;
    }

    float a = cf.x - g.x;
    float b = cf.y - g.y;
    float c = cf.z - g.z;

    float4 result = lerp4f(lerp4f(lerp4f(field[get_global_idx(idx_x_1, idx_y_1, idx_z_1, dim_x, dim_y)],
                                         field[get_global_idx(idx_x_1, idx_y_1, g.z + 0, dim_x, dim_y)], c),
                                  lerp4f(field[get_global_idx(idx_x_1, g.y + 0, idx_z_1, dim_x, dim_y)],
                                         field[get_global_idx(idx_x_1, g.y + 0, g.z + 0, dim_x, dim_y)], c),
                                  b),
                           lerp4f(lerp4f(field[get_global_idx(g.x + 0, idx_y_1, idx_z_1, dim_x, dim_y)],
                                         field[get_global_idx(g.x + 0, idx_y_1, g.z + 0, dim_x, dim_y)], c),
                                  lerp4f(field[get_global_idx(g.x + 0, g.y + 0, idx_z_1, dim_x, dim_y)],
                                         field[get_global_idx(g.x + 0, g.y + 0, g.z + 0, dim_x, dim_y)], c),
                                  b),
                           a);
    return result;
}

/*
 * FLOAT2
 */

__device__ __forceinline__ float norm(float2 v) {
    return __fsqrt_rn(__fadd_rn(__fmul_rn(v.x, v.x), __fmul_rn(v.y, v.y)));
}

/*
 * FLOAT3
 */

__host__ __device__ __forceinline__ float3 &operator-=(float3 &v1, const float3 &v2) {
    v1.x -= v2.x;
    v1.y -= v2.y;
    v1.z -= v2.z;
    return v1;
}

__device__ __forceinline__ float3 operator/(const float3 &v, const float &d) {
    return make_float3(__fdividef(v.x, d), __fdividef(v.y, d), __fdividef(v.z, d));
}

__device__ __forceinline__ float n(const float3 &v) {
    return __fsqrt_rd(__fmul_rn(v.x, v.x) + __fmul_rn(v.y, v.y) + __fmul_rn(v.z, v.z));
}

__device__ __forceinline__ float3 sq(const float3 &v) {
    return make_float3(__fmul_rn(v.x, v.x), __fmul_rn(v.y, v.y), __fmul_rn(v.z, v.z));
}

__device__ __forceinline__ float rms_epsilon(const float3 &v) { return __fsqrt_rn(v.x + v.y + v.z + 1e-5f); }

/*
 * FLOAT4
 */

__device__ __forceinline__ float4 operator+(const float4 &v1, const float4 &v2) {
    return make_float4(__fadd_rn(v1.x, v2.x), __fadd_rn(v1.y, v2.y), __fadd_rn(v1.z, v2.z), 0.f);
}

__device__ __forceinline__ float4 operator-(const float4 &v1, const float4 &v2) {
    return make_float4(__fadd_rn(v1.x, -v2.x), __fadd_rn(v1.y, -v2.y), __fadd_rn(v1.z, -v2.z), 0.f);
}

__host__ __device__ __forceinline__ float4 &operator+=(float4 &v1, const float4 &v2) {
    v1.x += v2.x;
    v1.y += v2.y;
    v1.z += v2.z;
    return v1;
}

__host__ __device__ __forceinline__ float4 &operator-=(float4 &v1, const float4 &v2) {
    v1.x -= v2.x;
    v1.y -= v2.y;
    v1.z -= v2.z;
    return v1;
}

__device__ __forceinline__ float4 operator*(const float4 &v, const float &m) {
    return make_float4(__fmul_rn(v.x, m), __fmul_rn(v.y, m), __fmul_rn(v.z, m), 0.f);
}

__device__ __forceinline__ float4 operator*(const float &m, const float4 &v) { return v * m; }

__device__ __forceinline__ float4 operator/(const float4 &v, const float &d) {
    return make_float4(__fdividef(v.x, d), __fdividef(v.y, d), __fdividef(v.z, d), 0.f);
}

__device__ __forceinline__ float4 operator/(const float &d, const float4 &v) { return v / d; }

__device__ __forceinline__ float norm(const float4 &v) {
    return __fsqrt_rd(__fmul_rn(v.x, v.x) + __fmul_rn(v.y, v.y) + __fmul_rn(v.z, v.z));
}

__device__ __forceinline__ float norm_sq(const float4 &v) {
    return __fmul_rn(v.x, v.x) + __fmul_rn(v.y, v.y) + __fmul_rn(v.z, v.z);
}

__device__ __forceinline__ float4 normalised(const float4 &v) {
    float n = norm(v);
    return (n > 1e-5f) ? v / n : v;
}

__host__ __device__ __forceinline__ float3 trunc(const float4 &f) { return make_float3(f.x, f.y, f.z); }

__device__ __forceinline__ float4 sq(const float4 &v) {
    return make_float4(__fmul_rn(v.x, v.x), __fmul_rn(v.y, v.y), __fmul_rn(v.z, v.z), 0.f);
}

__device__ __forceinline__ float rms_epsilon(const float4 &v) {
    return __fsqrt_rd(__fadd_rn(__fadd_rn(v.x, __fadd_rn(v.y, v.z)), 1e-5f));
}

/*
 * MAT3F
 */

__device__ __forceinline__ kfusion::device::Mat3f operator+(kfusion::device::Mat3f M1, kfusion::device::Mat3f M2) {
    kfusion::device::Mat3f N;

    N.data[0].x = __fadd_rn(M1.data[0].x, M2.data[0].x);
    N.data[0].y = __fadd_rn(M1.data[0].y, M2.data[0].y);
    N.data[0].z = __fadd_rn(M1.data[0].z, M2.data[0].z);

    N.data[1].x = __fadd_rn(M1.data[1].x, M2.data[1].x);
    N.data[1].y = __fadd_rn(M1.data[1].y, M2.data[1].y);
    N.data[1].z = __fadd_rn(M1.data[1].z, M2.data[1].z);

    N.data[2].x = __fadd_rn(M1.data[2].x, M2.data[2].x);
    N.data[2].y = __fadd_rn(M1.data[2].y, M2.data[2].y);
    N.data[2].z = __fadd_rn(M1.data[2].z, M2.data[2].z);

    return N;
}

__device__ __forceinline__ kfusion::device::Mat3f operator-(kfusion::device::Mat3f M1, kfusion::device::Mat3f M2) {
    kfusion::device::Mat3f N;

    N.data[0].x = __fsub_rn(M1.data[0].x, M2.data[0].x);
    N.data[0].y = __fsub_rn(M1.data[0].y, M2.data[0].y);
    N.data[0].z = __fsub_rn(M1.data[0].z, M2.data[0].z);

    N.data[1].x = __fsub_rn(M1.data[1].x, M2.data[1].x);
    N.data[1].y = __fsub_rn(M1.data[1].y, M2.data[1].y);
    N.data[1].z = __fsub_rn(M1.data[1].z, M2.data[1].z);

    N.data[2].x = __fsub_rn(M1.data[2].x, M2.data[2].x);
    N.data[2].y = __fsub_rn(M1.data[2].y, M2.data[2].y);
    N.data[2].z = __fsub_rn(M1.data[2].z, M2.data[2].z);

    return N;
}

__device__ __forceinline__ kfusion::device::Mat3f operator*(kfusion::device::Mat3f M, float m) {
    kfusion::device::Mat3f N;

    N.data[0].x = __fmul_rn(M.data[0].x, m);
    N.data[0].y = __fmul_rn(M.data[0].y, m);
    N.data[0].z = __fmul_rn(M.data[0].z, m);

    N.data[1].x = __fmul_rn(M.data[1].x, m);
    N.data[1].y = __fmul_rn(M.data[1].y, m);
    N.data[1].z = __fmul_rn(M.data[1].z, m);

    N.data[2].x = __fmul_rn(M.data[2].x, m);
    N.data[2].y = __fmul_rn(M.data[2].y, m);
    N.data[2].z = __fmul_rn(M.data[2].z, m);

    return N;
}

__device__ __forceinline__ kfusion::device::Mat3f operator/(kfusion::device::Mat3f M, float d) { return M * 1.f / d; }

/*
 * MAT4F
 */

__device__ __forceinline__ float4 operator*(Mat4f M, float4 v) {
    return make_float4(
        __fadd_rn(__fadd_rn(__fmul_rn(M.data[0].x, v.x), __fmul_rn(M.data[0].y, v.y)), __fmul_rn(M.data[0].z, v.z)),
        __fadd_rn(__fadd_rn(__fmul_rn(M.data[1].x, v.x), __fmul_rn(M.data[1].y, v.y)), __fmul_rn(M.data[1].z, v.z)),
        __fadd_rn(__fadd_rn(__fmul_rn(M.data[2].x, v.x), __fmul_rn(M.data[2].y, v.y)), __fmul_rn(M.data[2].z, v.z)),
        0.f);
}

__device__ __forceinline__ float det(Mat4f M) {
    return __fsub_rn(__fadd_rn(__fadd_rn(__fmul_rn(__fmul_rn(M.data[0].x, M.data[1].y), M.data[2].z),
                                         __fmul_rn(__fmul_rn(M.data[1].x, M.data[2].y), M.data[0].z)),
                               __fmul_rn(__fmul_rn(M.data[2].x, M.data[0].y), M.data[1].z)),
                     __fadd_rn(__fmul_rn(__fadd_rn(__fmul_rn(M.data[0].z, M.data[1].y), M.data[2].x),
                                         __fmul_rn(__fmul_rn(M.data[1].z, M.data[2].y), M.data[0].x)),
                               __fmul_rn(__fmul_rn(M.data[2].z, M.data[0].y), M.data[1].x)));
}

/*
 * OTHER
 */

__host__ __device__ __forceinline__ float sign(float a) {
    if (a > 0.f) {
        return 1.f;
    } else if (a < 0.f) {
        return -1.f;
    } else {
        return 0.f;
    }
}

__host__ __device__ __forceinline__ bool is_truncated(float tsdf) {
    if (fabs(tsdf) >= 1.f) {
        return true;
    }

    return false;
}

__host__ __device__ __forceinline__ float heaviside_smooth(float phi, float epsilon) {
    return 1.f / CUDART_PI_F * (epsilon / (epsilon * epsilon + phi * phi));
}

__host__ __device__ __forceinline__ void vec(kfusion::device::Mat3f J, float *v) {
    v[0] = J.data[0].x;
    v[1] = J.data[1].x;
    v[2] = J.data[2].x;

    v[3] = J.data[0].y;
    v[4] = J.data[1].y;
    v[5] = J.data[2].y;

    v[6] = J.data[0].z;
    v[7] = J.data[1].z;
    v[8] = J.data[2].z;
}

__host__ __device__ __forceinline__ kfusion::device::Mat3f transpose(kfusion::device::Mat3f J) {
    kfusion::device::Mat3f J_T;

    float3 row_1  = J.data[0];
    J_T.data[0].x = row_1.x;
    J_T.data[1].x = row_1.y;
    J_T.data[2].x = row_1.z;

    float3 row_2  = J.data[1];
    J_T.data[0].y = row_2.x;
    J_T.data[1].y = row_2.y;
    J_T.data[2].y = row_2.z;

    float3 row_3  = J.data[2];
    J_T.data[0].z = row_3.x;
    J_T.data[1].z = row_3.y;
    J_T.data[2].z = row_3.z;

    return J_T;
}


================================================
FILE: include/sobfu/params.hpp
================================================
#pragma once

/* kinfu includes */
#include <kfusion/types.hpp>

/* sobfu parameters */
struct Params {
    int cols = 640;
    int rows = 480; /* no. of rows and columns in the frames */

    cv::Vec3i volume_dims; /* volume dimensions in voxels */
    cv::Vec3f volume_size; /* volume size in metres */

    cv::Affine3f volume_pose;
    kfusion::Intr intr; /* camera intrinsics */

    float icp_truncate_depth_dist; /* depth truncation distance */

    float bilateral_sigma_depth, bilateral_sigma_spatial;
    int bilateral_kernel_size;

    float tsdf_trunc_dist, eta; /* tsdf truncation distance and expected object thickness */
    float tsdf_max_weight;      /* tsdf max. weight */

    float gradient_delta_factor;

    int start_frame = 0; /* frame when to start registration */
    int verbosity   = 0; /* solver verbosity */

    int s /* filter size */, max_iter /* max. no of iterations of the solver */;
    float max_update_norm /* max. update norm */, lambda /* filter parameter */, alpha /* gradient descent step size */,
        w_reg /* weight of the regularisation term */;

    cv::Vec3f voxel_sizes() {
        return cv::Vec3f(volume_size[0] / volume_dims[0], volume_size[1] / volume_dims[1],
                         volume_size[2] / volume_dims[2]);
    }
};


================================================
FILE: include/sobfu/precomp.hpp
================================================
#pragma once

/* cuda includes */
#include <cuda.h>

/* kinfu includes */
#include <kfusion/internal.hpp>

/* checks if x is a power of 2 */
bool isPow2(unsigned int x);

/* computes the nearest power of 2 larger than x */
int nextPow2(int x);

/* compute the number of threads and blocks to use for the given reduction kernel; we set threads
 * block to the minimum of maxThreads and n/2; we observe the maximum specified number of blocks, because each thread
 * in the kernel can process a variable number of elements */
void get_num_blocks_and_threads(int n, int maxBlocks, int maxThreads, int &blocks, int &threads);


================================================
FILE: include/sobfu/reductor.hpp
================================================
#pragma once

/* kinfu includes */
#include <kfusion/cuda/tsdf_volume.hpp>

/* sobfu incldues */
#include <sobfu/precomp.hpp>
#include <sobfu/vector_fields.hpp>

/* thrust includes */
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/extrema.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/transform.h>

namespace sobfu {
namespace device {

/*
 * REDUCTOR
 */

struct Reductor {
    /* constructor */
    Reductor(int3 dims_, float vsz_, float trunc_dist_);
    /* destructor */
    ~Reductor();

    /* energy functional */
    float data_energy(float2 *phi_global_data, float2 *phi_n_data);
    float reg_energy_sobolev(Mat4f *J_data);

    float2 max_update_norm();
    float2 voxel_max_energy(float2 *phi_global_data, float2 *phi_n_data, Mat4f *J_data, float w_reg);

    int3 dims;
    float vsz, trunc_dist;
    int no_voxels;

    int blocks,
        threads; /* no. of blocks and threads for the reductions used to calculate value of the energy functional */

    float4 *updates;

    float *h_data_out, *d_data_out;
    float *h_reg_out, *d_reg_out;

    float2 *h_max_out, *d_max_out;
};

/* kernels */
template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce_data_kernel(float2 *g_idata_global, float2 *g_idata_n, float *g_odata, unsigned int n);
template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce_reg_sobolev_kernel(Mat4f *g_idata, float *g_odata, unsigned int n);

template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce_voxel_max_energy_kernel(float2 *d_idata_global, float2 *d_idata_n, Mat4f *d_idata_reg,
                                               float2 *d_o_data, float w_reg, unsigned int n);
template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce_max_kernel(float4 *updates, float2 *g_o_max_data, unsigned int n);

/* host methods */
void reduce_data(int size, int threads, int blocks, float2 *d_idata_global, float2 *d_idata_n, float *d_odata);
void reduce_reg_sobolev(int size, int threads, int blocks, Mat4f *d_idata, float *d_odata);

void reduce_voxel_max_energy(int size, int threads, int blocks, float2 *d_idata_global, float2 *d_idata_n,
                             Mat4f *d_idata_reg, float w_reg, float2 *d_odata);
void reduce_max(int size, int threads, int blocks, float4 *updates, float2 *d_o_max_data);

/* final reduce on the cpu */
float final_reduce(float *h_odata, float *d_odata, int numBlocks);
float2 final_reduce_max(float2 *h_o_max_data, float2 *d_o_max_data, int numBlocks, int3 dims);

}  // namespace device
}  // namespace sobfu


================================================
FILE: include/sobfu/scalar_fields.hpp
================================================
#pragma once

/* sobfu includes */
#include <sobfu/params.hpp>
#include <sobfu/precomp.hpp>

/* kinfu includes */
#include <kfusion/internal.hpp>
#include <kfusion/precomp.hpp>
#include <kfusion/types.hpp>

namespace sobfu {
namespace cuda {

/*
 * SCALAR FIELD
 */

class ScalarField {
public:
    /* constructor */
    ScalarField(cv::Vec3i dims_);
    /* destructor */
    ~ScalarField();

    /* get field data*/
    kfusion::cuda::CudaData get_data();
    const kfusion::cuda::CudaData get_data() const;

    /* get dims */
    int3 get_dims();

    /* clear field */
    void clear();
    /* sum values in the field */
    float sum();
    /* print field */
    void print();

private:
    kfusion::cuda::CudaData data; /* field data */
    int3 dims;                    /* field dimensions */
};

}  // namespace cuda

namespace device {

/*
 * SCALAR FIELD
 */

struct ScalarField {
    /* constructor */
    ScalarField(float *const data_, int3 dims_) : data(data_), dims(dims_) {}

    __device__ __forceinline__ float *beg(int x, int y) const;
    __device__ __forceinline__ float *zstep(float *const ptr) const;

    __device__ __forceinline__ float *operator()(int idx) const;
    __device__ __forceinline__ float *operator()(int x, int y, int z) const;

    float *const data; /* field data */
    const int3 dims;
};

/* clear */
__global__ void clear_kernel(sobfu::device::ScalarField field);
void clear(ScalarField &field);

/* sum */
float reduce_sum(sobfu::device::ScalarField &field);

template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce_sum_kernel(float *g_idata, float *g_odata, unsigned int n);
float final_reduce_sum(float *d_odata, int numBlocks);

}  // namespace device
}  // namespace sobfu


================================================
FILE: include/sobfu/sob_fusion.hpp
================================================
#pragma once

/* sobfu includes */
#include <sobfu/params.hpp>
#include <sobfu/solver.hpp>

/* kinfu includes */
#include <kfusion/internal.hpp>
#include <kfusion/kinfu.hpp>
#include <kfusion/precomp.hpp>

/* pcl includes */
#include <pcl/PolygonMesh.h>
#include <pcl/point_types.h>

/* sys headers */
#include <math.h>
#include <thread>

/* */
class SobFusion {
public:
    /* default constructor */
    SobFusion(const Params &params);
    /* default destructor */
    ~SobFusion();

    /* get sobfu params */
    Params &getParams();

    /* get the canonical model vertices stored as pcl triangle mesh */
    pcl::PolygonMesh::Ptr get_phi_global_mesh();
    /* get the canonical model mesh warped to live */
    pcl::PolygonMesh::Ptr get_phi_global_psi_inv_mesh();
    /* get the live model mesh */
    pcl::PolygonMesh::Ptr get_phi_n_mesh();
    /* get the live model mesh warped with psi */
    pcl::PolygonMesh::Ptr get_phi_n_psi_mesh();

    /* get the deformation field */
    std::shared_ptr<sobfu::cuda::DeformationField> getDeformationField();

    /* run algorithm on all frames */
    bool operator()(const kfusion::cuda::Depth &depth, const kfusion::cuda::Image &image = kfusion::cuda::Image());

private:
    std::vector<cv::Affine3f> poses_;
    kfusion::cuda::Dists dists_;
    kfusion::cuda::Frame curr_, prev_;

    kfusion::cuda::Cloud points_;
    kfusion::cuda::Normals normals_;
    kfusion::cuda::Depth depths_;

    int frame_counter_;

    /* system parameters */
    Params params;

    /* tsdf's */
    cv::Ptr<kfusion::cuda::TsdfVolume> phi_global, phi_global_psi_inv, phi_n, phi_n_psi;
    /* deformation field warps phi_n to phi_global */
    std::shared_ptr<sobfu::cuda::DeformationField> psi, psi_inv;
    /* solver */
    std::shared_ptr<sobfu::cuda::Solver> solver;

    /* marching cubes */
    cv::Ptr<kfusion::cuda::MarchingCubes> mc;

    /* run marching cubes on vol */
    pcl::PolygonMesh::Ptr get_mesh(cv::Ptr<kfusion::cuda::TsdfVolume> vol);
    /* convert the canonical model to pcl polygon mesh */
    static pcl::PolygonMesh::Ptr convert_to_mesh(const kfusion::cuda::DeviceArray<pcl::PointXYZ> &triangles);
};


================================================
FILE: include/sobfu/solver.hpp
================================================
#pragma once

/* kinfu incldues */
#include <kfusion/cuda/tsdf_volume.hpp>
#include <kfusion/safe_call.hpp>

/* sobfu includes */
#include <sobfu/reductor.hpp>
#include <sobfu/scalar_fields.hpp>
#include <sobfu/vector_fields.hpp>

/*
 * SOLVER PARAMETERS
 */

struct SolverParams {
    int verbosity, max_iter, s;
    float max_update_norm, lambda, alpha, w_reg;
};

/*
 * SDF'S USED IN THE SOLVER
 */

struct SDFs {
    SDFs(kfusion::device::TsdfVolume &phi_global_, kfusion::device::TsdfVolume &phi_global_psi_inv_,
         kfusion::device::TsdfVolume &phi_n_, kfusion::device::TsdfVolume &phi_n_psi_)
        : phi_global(phi_global_), phi_global_psi_inv(phi_global_psi_inv_), phi_n(phi_n_), phi_n_psi(phi_n_psi_) {}

    kfusion::device::TsdfVolume phi_global, phi_global_psi_inv, phi_n, phi_n_psi;
};

/*
 * SPATIAL DIFFERENTIATORS
 */

struct Differentiators {
    Differentiators(sobfu::device::TsdfDifferentiator &tsdf_diff_, sobfu::device::Differentiator &diff_,
                    sobfu::device::Differentiator &diff_inv_,
                    sobfu::device::SecondOrderDifferentiator &second_order_diff_)

        : tsdf_diff(tsdf_diff_), diff(diff_), diff_inv(diff_inv_), second_order_diff(second_order_diff_) {}

    sobfu::device::TsdfDifferentiator tsdf_diff;
    sobfu::device::Differentiator diff, diff_inv;
    sobfu::device::SecondOrderDifferentiator second_order_diff;
};

namespace sobfu {
namespace cuda {

/*
 * SOLVER
 */

class Solver {
public:
    /* constructor */
    Solver(Params &params);
    /* destructor */
    ~Solver();

    void estimate_psi(const cv::Ptr<kfusion::cuda::TsdfVolume> phi_global,
                      cv::Ptr<kfusion::cuda::TsdfVolume> phi_global_psi_inv,
                      const cv::Ptr<kfusion::cuda::TsdfVolume> phi_n, cv::Ptr<kfusion::cuda::TsdfVolume> phi_n_psi,
                      std::shared_ptr<sobfu::cuda::DeformationField> psi,
                      std::shared_ptr<sobfu::cuda::DeformationField> psi_inv);

private:
    /* volume params */
    int3 dims;
    float3 voxel_sizes;

    int no_voxels;
    float trunc_dist, eta, max_weight;

    /* solver params */
    SolverParams solver_params;

    /* gradients */
    sobfu::cuda::SpatialGradients *spatial_grads;
    sobfu::device::SpatialGradients *spatial_grads_device;

    sobfu::device::TsdfGradient *nabla_phi_n, *nabla_phi_n_o_psi;
    sobfu::device::Jacobian *J, *J_inv;
    sobfu::device::Laplacian *L, *L_o_psi_inv;
    sobfu::device::PotentialGradient *nabla_U, *nabla_U_S;

    /* used to calculate value of the energy functional */
    sobfu::device::Reductor *r;

    /* sobolev filter */
    float *h_S_i, *d_S_i;
};

/* get 3d sobolev filter */
static void get_3d_sobolev_filter(SolverParams &params, float *h_S_i);
/* calculate 1d filters from a separable 3d filter */
static void decompose_sobolev_filter(SolverParams &params, float *h_S_i);

}  // namespace cuda

namespace device {

/* potential gradient */
__global__ void calculate_potential_gradient_kernel(float2 *phi_n_psi, float2 *phi_global, float4 *nabla_phi_n_o_psi,
                                                    float4 *L, float4 *nabla_U, float w_reg, int dim_x, int dim_y,
                                                    int dim_z);
void calculate_potential_gradient(kfusion::device::TsdfVolume &phi_n_psi, kfusion::device::TsdfVolume &phi_global,
                                  sobfu::device::TsdfGradient &nabla_phi_n_o_psi, sobfu::device::Laplacian &L,
                                  sobfu::device::PotentialGradient &nabla_U, float w_reg);

/* estimate psi */
void estimate_psi(SDFs &sdfs, sobfu::device::DeformationField &psi, sobfu::device::DeformationField &psi_inv,
                  sobfu::device::SpatialGradients *spatial_grads, Differentiators &diffs, float *d_S_i,
                  sobfu::device::Reductor *r, SolverParams &params);

/* DEFORMATION FIELD UPDATES */
__global__ void update_psi_kernel(float4 *psi, float4 *nabla_U_S, float4 *updates, float alpha, int dim_x, int dim_y,
                                  int dim_z);
void update_psi(sobfu::device::DeformationField &psi, sobfu::device::PotentialGradient &nabla_U_S, float4 *updates,
                float alpha);

/*
 * CONVOLUTIONS
 */

void set_convolution_kernel(float *d_kernel);

__global__ void convolution_rows_kernel(float4 *d_dst, float4 *d_src, int image_w, int image_h, int image_d);
__global__ void convolution_columns_kernel(float4 *d_dst, float4 *d_src, int image_w, int image_h, int image_d);
__global__ void convolution_depth_kernel(float4 *d_dst, float4 *d_src, int image_w, int image_h, int image_d);

void convolution_rows(float4 *d_dst, float4 *updates, int image_w, int image_h, int image_d);
void convolution_columns(float4 *d_dst, float4 *updates, int image_w, int image_h, int image_d);
void convolution_depth(float4 *d_dst, float4 *updates, int image_w, int image_h, int image_d);

}  // namespace device
}  // namespace sobfu


================================================
FILE: include/sobfu/vector_fields.hpp
================================================
#pragma once

/* sobfu includes */
#include <sobfu/params.hpp>
#include <sobfu/precomp.hpp>

/* kinfu includes */
#include <kfusion/cuda/tsdf_volume.hpp>
#include <kfusion/internal.hpp>
#include <kfusion/precomp.hpp>
#include <kfusion/types.hpp>

namespace sobfu {
namespace cuda {

/*
 * VECTOR FIELD
 */

class VectorField {
public:
    /* constructor */
    VectorField(cv::Vec3i dims_);
    /* destructor */
    ~VectorField();

    /* get dimensions of the field */
    cv::Vec3i get_dims() const;

    /* get the data in the field */
    kfusion::cuda::CudaData get_data();
    const kfusion::cuda::CudaData get_data() const;
    /* set the data in the field */
    void set_data(kfusion::cuda::CudaData &data);

    /* clear field */
    void clear();
    /* print field */
    void print();
    /* get no. of nan's in the field */
    int get_no_nans();

protected:
    kfusion::cuda::CudaData data; /* field data */
    cv::Vec3i dims;               /* field dimensions */
};

/*
 * DEFORMATION FIELD
 */

class DeformationField : public VectorField {
public:
    /* constructor */
    DeformationField(cv::Vec3i dims_);
    /* destructor */
    ~DeformationField();

    /* init field to identity */
    void clear();

    /* apply the field to an sdf */
    void apply(const cv::Ptr<kfusion::cuda::TsdfVolume> phi, cv::Ptr<kfusion::cuda::TsdfVolume> phi_psi);
    /* approximate the inverse of the field */
    void get_inverse(sobfu::cuda::DeformationField &psi_inv);
};

/*
 * TSDF GRADIENT, LAPLACIAN, POTENTIAL GRADIENT
 */

typedef VectorField TsdfGradient;
typedef VectorField Laplacian;
typedef VectorField PotentialGradient;

/*
 * JACOBIAN
 */

class Jacobian {
public:
    /* constructor */
    Jacobian(cv::Vec3i dims_);
    /* destructor */
    ~Jacobian();

    kfusion::cuda::CudaData get_data();
    const kfusion::cuda::CudaData get_data() const;

    /* clear jacobian */
    void clear();

private:
    kfusion::cuda::CudaData data;
    cv::Vec3i dims;
};

/*
 * SPATIAL GRADIENTS
 */

struct SpatialGradients {
    /* constructor */
    SpatialGradients(cv::Vec3i dims_);
    /* destructor */
    ~SpatialGradients();

    TsdfGradient *nabla_phi_n, *nabla_phi_n_o_psi;
    Jacobian *J, *J_inv;
    Laplacian *L, *L_o_psi_inv;
    PotentialGradient *nabla_U, *nabla_U_S;
};

}  // namespace cuda
}  // namespace sobfu

namespace sobfu {
namespace device {

/*
 * VECTOR FIELD
 */

struct VectorField {
    /* constructor */
    VectorField(float4 *const data_, const int3 dims_) : data(data_), dims(dims_) {}

    __device__ __forceinline__ float4 *beg(int x, int y) const;
    __device__ __forceinline__ float4 *zstep(float4 *const ptr) const;
    __device__ __forceinline__ float4 *operator()(int x, int y, int z) const;

    __device__ __forceinline__ float4 get_displacement(int x, int y, int z) const;

    float4 *const data; /* vector field data */
    const int3 dims;    /* vector field dimensions */
};

/* clear field */
__global__ void clear_kernel(VectorField field);
void clear(VectorField &field);

/*
 * DEFORMATION FIELD
 */

typedef VectorField DeformationField;

/* set deformation field to identity */
__global__ void init_identity_kernel(DeformationField field);
void init_identity(DeformationField &field);
/* apply psi to an sdf */
__global__ void apply_kernel(const kfusion::device::TsdfVolume phi, kfusion::device::TsdfVolume phi_warped,
                             const DeformationField psi);
void apply(const kfusion::device::TsdfVolume &phi, kfusion::device::TsdfVolume &phi_warped,
           const sobfu::device::DeformationField &psi);
/* approximate the inverse of a deformation field */
__global__ void estimate_inverse_kernel(sobfu::device::DeformationField psi, sobfu::device::DeformationField psi_inv);
void estimate_inverse(sobfu::device::DeformationField &psi, sobfu::device::DeformationField &psi_inv);

/*
 * TSDF GRADIENT
 */

typedef VectorField TsdfGradient;

struct TsdfDifferentiator {
    /* constructor */
    TsdfDifferentiator(kfusion::device::TsdfVolume &vol_) : vol(vol_) {}

    /* calculate the gradient */
    __device__ void operator()(sobfu::device::TsdfGradient &grad) const;
    void calculate(sobfu::device::TsdfGradient &grad);

    kfusion::device::TsdfVolume vol;
};

/* estimate tsdf gradient */
__global__ void estimate_gradient_kernel(const sobfu::device::TsdfDifferentiator diff,
                                         sobfu::device::TsdfGradient grad);
/* interpolate tsdf gradient */
__global__ void interpolate_gradient_kernel(sobfu::device::TsdfGradient nabla_phi_n_psi,
                                            sobfu::device::TsdfGradient nabla_phi_n_psi_t,
                                            sobfu::device::DeformationField psi);
void interpolate_gradient(sobfu::device::TsdfGradient &nabla_phi_n_psi, sobfu::device::TsdfGradient &nabla_phi_n_psi_t,
                          sobfu::device::DeformationField &psi);

/*
 * LAPLACIAN
 */

typedef VectorField Laplacian;

struct SecondOrderDifferentiator {
    /* constructor */
    SecondOrderDifferentiator(sobfu::device::DeformationField &psi_) : psi(psi_) {}

    __device__ void laplacian(sobfu::device::Laplacian &L) const;
    void calculate(sobfu::device::Laplacian &L);

    sobfu::device::DeformationField psi;
};

/* estimate laplacian */
__global__ void estimate_laplacian_kernel(const sobfu::device::SecondOrderDifferentiator diff,
                                          sobfu::device::Laplacian L);
/* interpolate laplacian */
__global__ void interpolate_laplacian_kernel(sobfu::device::Laplacian L, sobfu::device::Laplacian L_o_psi,
                                             sobfu::device::DeformationField psi);
void interpolate_laplacian(sobfu::device::Laplacian &L, sobfu::device::Laplacian &L_o_psi,
                           sobfu::device::DeformationField &psi);

/*
 * POTENTIAL GRADIENT
 */

typedef VectorField PotentialGradient;

/*
 * JACOBIAN
 */

struct Jacobian {
    /* constructor */
    Jacobian(Mat4f *const data_, int3 dims_) : data(data_), dims(dims_) {}

    __device__ __forceinline__ Mat4f *beg(int x, int y) const;
    __device__ __forceinline__ Mat4f *zstep(Mat4f *const ptr) const;
    __device__ __forceinline__ Mat4f *operator()(int x, int y, int z) const;

    Mat4f *const data; /* jacobian data */
    const int3 dims;
};

struct Differentiator {
    /* constructor */
    Differentiator(sobfu::device::DeformationField &psi_) : psi(psi_) {}

    /* calculate jacobian */
    __device__ void operator()(sobfu::device::Jacobian &J, int mode) const;
    void calculate(sobfu::device::Jacobian &J);
    void calculate_deformation_jacobian(sobfu::device::Jacobian &J);

    sobfu::device::DeformationField psi;
};

/* clear jacobian */
__global__ void clear_jacobian_kernel(Jacobian J);
void clear(Jacobian &J);
/* estimate jacobian */
__global__ void estimate_jacobian_kernel(const sobfu::device::Differentiator diff, sobfu::device::Jacobian J);
__global__ void estimate_deformation_jacobian_kernel(const sobfu::device::Differentiator diff,
                                                     sobfu::device::Jacobian J);

/*
 * SPATIAL GRADIENTS
 */

struct SpatialGradients {
    SpatialGradients(sobfu::device::TsdfGradient *nabla_phi_n_, sobfu::device::TsdfGradient *nabla_phi_n_o_psi_,
                     sobfu::device::Jacobian *J_, sobfu::device::Jacobian *J_inv_, sobfu::device::Laplacian *L_,
                     sobfu::device::Laplacian *L_o_psi_inv_, sobfu::device::PotentialGradient *nabla_U_,
                     sobfu::device::PotentialGradient *nabla_U_S_)
        : nabla_phi_n(nabla_phi_n_),
          nabla_phi_n_o_psi(nabla_phi_n_o_psi_),
          J(J_),
          J_inv(J_inv_),
          L(L_),
          L_o_psi_inv(L_o_psi_inv_),
          nabla_U(nabla_U_),
          nabla_U_S(nabla_U_S_) {}
    ~SpatialGradients();

    sobfu::device::TsdfGradient *nabla_phi_n, *nabla_phi_n_o_psi;
    sobfu::device::Jacobian *J, *J_inv;
    sobfu::device::Laplacian *L, *L_o_psi_inv;
    sobfu::device::PotentialGradient *nabla_U, *nabla_U_S;
};

}  // namespace device
}  // namespace sobfu


================================================
FILE: params/params_advent.ini
================================================
# TSDF
VOL_DIMS_X=64
VOL_DIMS_Y=64
VOL_DIMS_Z=64

VOL_SIZE_X=0.5
VOL_SIZE_Y=0.5
VOL_SIZE_Z=0.5

TSDF_TRUNC_DIST=5
ETA=2
TSDF_MAX_WEIGHT=128

GRADIENT_DELTA_FACTOR=0.5

# CAMERA
INTR_FX=570.342
INTR_FY=570.342
INTR_CX=320.0
INTR_CY=240.0

TRUNC_DEPTH=1.5

VOL_POSE_T_Z=0.5

# BILATERAL FILTER
BILATERAL_SIGMA_DEPTH=0.005
BILATERAL_SIGMA_SPATIAL=4.5
BILATERAL_KERNEL_SIZE=7

# SOLVER
MAX_ITER=8192
MAX_UPDATE_NORM=5e-4

S=7
LAMBDA=0.1

ALPHA=0.1
W_REG=0.2


================================================
FILE: params/params_boxing.ini
================================================
# TSDF
VOL_DIMS_X=128
VOL_DIMS_Y=128
VOL_DIMS_Z=128

VOL_SIZE_X=0.75
VOL_SIZE_Y=0.75
VOL_SIZE_Z=0.75

TSDF_TRUNC_DIST=48
ETA=3
TSDF_MAX_WEIGHT=128

GRADIENT_DELTA_FACTOR=0.5

# CAMERA
INTR_FX=570.342
INTR_FY=570.342
INTR_CX=320.0
INTR_CY=240.0

TRUNC_DEPTH=1.0

VOL_POSE_T_Z=0.1

# BILATERAL FILTER
BILATERAL_SIGMA_DEPTH=0.005
BILATERAL_SIGMA_SPATIAL=4.5
BILATERAL_KERNEL_SIZE=7

# SOLVER
START_FRAME=1

MAX_ITER=4096
MAX_UPDATE_NORM=1e-10

S=7
LAMBDA=0.1

RHO_0=1.0
ALPHA=0.001
W_REG=0.6


================================================
FILE: params/params_hat.ini
================================================
# TSDF
VOL_DIMS_X=128
VOL_DIMS_Y=128
VOL_DIMS_Z=128

VOL_SIZE_X=1.20
VOL_SIZE_Y=1.20
VOL_SIZE_Z=1.20

TSDF_TRUNC_DIST=10
ETA=2
TSDF_MAX_WEIGHT=128

GRADIENT_DELTA_FACTOR=0.5

# CAMERA
INTR_FX=517.0
INTR_FY=517.0
INTR_CX=320.0
INTR_CY=240.0

TRUNC_DEPTH=2.0

VOL_POSE_T_Z=0.75

# BILATERAL FILTER
BILATERAL_SIGMA_DEPTH=0.02
BILATERAL_SIGMA_SPATIAL=4.5
BILATERAL_KERNEL_SIZE=7

# SOLVER
MAX_ITER=2048
MAX_UPDATE_NORM=1e-3

S=7
LAMBDA=0.1

ALPHA=0.05
W_REG=0.1


================================================
FILE: params/params_ours.ini
================================================
# intrinsic parameters for our intel realsense sr300
# ---rgb---
# kinfuParams.intr = kfusion::Intr(622.011f, 622.012f, 320.331f, 235.664f);
# ---depth---
# kinfuParams.intr = kfusion::Intr(474.567f, 474.567f, 310.63f, 246.018f);
#
# ----- DOU -----
# --- depth ---
# kinfuParams.intr = kfusion::Intr(520.f, 520.f, 320.f, 240.f);


================================================
FILE: params/params_snoopy.ini
================================================
# TSDF
VOL_DIMS_X=128
VOL_DIMS_Y=128
VOL_DIMS_Z=128

VOL_SIZE_X=0.9
VOL_SIZE_Y=0.9
VOL_SIZE_Z=0.9

TSDF_TRUNC_DIST=10
ETA=5
TSDF_MAX_WEIGHT=128

GRADIENT_DELTA_FACTOR=0.5

# CAMERA
INTR_FX=517.0
INTR_FY=517.0
INTR_CX=320.0
INTR_CY=240.0

TRUNC_DEPTH=3.0

VOL_POSE_T_Z=0.05

# BILATERAL FILTER
BILATERAL_SIGMA_DEPTH=0.01
BILATERAL_SIGMA_SPATIAL=4.5
BILATERAL_KERNEL_SIZE=7

# SOLVER
START_FRAME=4

MAX_ITER=2048
MAX_UPDATE_NORM=1e-3

S=7
LAMBDA=0.1

ALPHA=0.1
W_REG=0.2


================================================
FILE: params/params_umbrella.ini
================================================
# TSDF
VOL_DIMS_X=128
VOL_DIMS_Y=128
VOL_DIMS_Z=128

VOL_SIZE_X=1.0
VOL_SIZE_Y=1.0
VOL_SIZE_Z=1.0

TSDF_TRUNC_DIST=8
ETA=3
TSDF_MAX_WEIGHT=128

GRADIENT_DELTA_FACTOR=0.5

# CAMERA
INTR_FX=570.342
INTR_FY=570.342
INTR_CX=320.0
INTR_CY=240.0

TRUNC_DEPTH=1.5

VOL_POSE_T_Z=0.3

# BILATERAL FILTER
BILATERAL_SIGMA_DEPTH=0.04
BILATERAL_SIGMA_SPATIAL=4.5
BILATERAL_KERNEL_SIZE=7

# SOLVER
START_FRAME=1

MAX_ITER=2048
MAX_UPDATE_NORM=1e-10

S=7
LAMBDA=0.1

ALPHA=0.001
W_REG=0.2


================================================
FILE: setup.sh
================================================
#!/bin/sh

RUN_CMAKE=false
RUN_MAKE=false
RUN_MAKE_TESTS=false

usage() {
    echo "USAGE: source setup.sh [options]"
    echo "OPTIONS:"
    echo "\t--help                 -h: Display help"
    echo "\t--cmake                -c: Run CMake"
    echo "\t--make                 -m: Run make"
    echo "\t--make-tests           -t: Make tests"
    echo "\t--all                  -a: Run all the settings"
}

# Parse through the arguments and check if any relavant flag exists
while [ "$1" != "" ]; do
    PARAM=`echo $1 | awk -F= '{print $1}'`
    case $PARAM in
        -h | --help)
            usage
            exit
            ;;
        -c | --cmake)
            RUN_CMAKE=true
            ;;
        -m | --make)
            RUN_MAKE=true
            ;;
        -t | --make-tests)
            RUN_MAKE_TESTS=true
            ;;
        -a | --all)
            RUN_CMAKE=true
            RUN_MAKE=true
            ;;
        *)
            echo "ERROR: unknown parameter \"$PARAM\""
            usage
            return 1
            ;;
    esac
    shift
done

# If build does not exist create one
mkdir -p build
cd build

if $RUN_CMAKE
then
    if $RUN_MAKE_TESTS
    then
    	echo "running cmake..."
    	cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DBUILD_TESTS=ON .. || (cd ../ && return 1)
    else
	echo "running cmake..."
	cmake -DCMAKE_EXPORT_COMPILE_COMMNADS=ON -DBUILD_TESTS=OFF .. || (cd ../ && return 1)
    fi
fi

if $RUN_MAKE
then
    echo "running make..."
    make -j4 || (cd ../ && return 1)
    echo "make complete!"
fi

cd ..


================================================
FILE: src/CMakeLists.txt
================================================
# ---------------------------------------------------------------------------- #
# BUILD ALL SUBPROJECTS
# ---------------------------------------------------------------------------- #
add_subdirectory(kfusion)
add_subdirectory(sobfu)
add_subdirectory(apps)


================================================
FILE: src/apps/CMakeLists.txt
================================================
# Link source files to executables
add_application(app)

# Link to all libraries
target_link_libraries(app
    sobfu
    kfusion
    png
    Boost::program_options
    ${Boost_LIBRARIES}
    ${CUDA_CUDA_LIBRARY}
    ${CUDA_CUDART_LIBRARY}
    ${OpenCV_LIBS}
    ${PCL_LIBRARIES}
    ${VTK_LIBRARIES}
)


================================================
FILE: src/apps/demo.cpp
================================================
/* sobfu includes a*/
#include <sobfu/sob_fusion.hpp>

/* boost includes */
#include <boost/program_options.hpp>

/* opencv includes */
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

/* pcl includes */
#include <pcl/io/pcd_io.h>
#include <pcl/io/vtk_io.h>
#include <pcl/visualization/pcl_visualizer.h>

/* sys headers */
#include <iostream>

/* vtk includes */
#include <vtkArrowSource.h>
#include <vtkCellArray.h>
#include <vtkGlyph2D.h>
#include <vtkImageData.h>
#include <vtkPointData.h>
#include <vtkPoints.h>
#include <vtkRenderWindow.h>
#include <vtkSmartPointer.h>
#include <vtkXMLImageDataWriter.h>

struct SobFuApp {
    SobFuApp(std::string file_path, std::string params_path, bool logger, bool visualizer, bool visualizer_detailed,
             bool verbose, bool vverbose)
        : exit_(false),
          file_path_(file_path),
          params_path_(params_path),
          logger_(logger),
          visualizer_(visualizer),
          visualizer_detailed_(visualizer_detailed),
          verbose_(verbose),
          vverbose_(vverbose) {
        /*
         * initialise parameters
         */

        Params params;

        if (verbose_) {
            params.verbosity = 1;
        } else if (vverbose_) {
            params.verbosity = 2;
        }

        /*
         * declare parameters to read from .ini
         */

        boost::program_options::options_description desc("parameters");
        declare_parameters(desc, params);

        /*
         * read parameters from .ini
         */

        boost::program_options::variables_map vm;
        read_parameters(desc, vm);

        /*
         * parse parameters stored in units of voxels
         */

        params.tsdf_trunc_dist = vm["TSDF_TRUNC_DIST"].as<float>() * params.voxel_sizes()[0];
        params.eta             = vm["ETA"].as<float>() * params.voxel_sizes()[0];
        params.volume_pose     = cv::Affine3f().translate(
            cv::Vec3f(-params.volume_size[0] / 2.f, -params.volume_size[1] / 2.f, vm["VOL_POSE_T_Z"].as<float>()));

        /*
         * initialise sobolevfusion
         */

        sobfu = std::make_shared<SobFusion>(params);
    }

    /*
     * declare sobfu parameters
     */

    void declare_parameters(boost::program_options::options_description &desc, Params &params) {
        /*
         * tsdf
         */

        desc.add_options()("VOL_DIMS_X", boost::program_options::value<int>(&params.volume_dims[0]),
                           "no. of voxels along x axis");
        desc.add_options()("VOL_DIMS_Y", boost::program_options::value<int>(&params.volume_dims[1]),
                           "no. of voxels along y axis");
        desc.add_options()("VOL_DIMS_Z", boost::program_options::value<int>(&params.volume_dims[2]),
                           "no. of voxels along z axis");

        desc.add_options()("VOL_SIZE_X", boost::program_options::value<float>(&params.volume_size[0]),
                           "vol. size along x axis (metres)");
        desc.add_options()("VOL_SIZE_Y", boost::program_options::value<float>(&params.volume_size[1]),
                           "vol. size along y axis (metres)");
        desc.add_options()("VOL_SIZE_Z", boost::program_options::value<float>(&params.volume_size[2]),
                           "vol. size along z axis (metres)");

        desc.add_options()("TSDF_TRUNC_DIST", boost::program_options::value<float>(), "truncation distance (voxels)");
        desc.add_options()("ETA", boost::program_options::value<float>(), "expected object thickness (voxels)");
        desc.add_options()("TSDF_MAX_WEIGHT", boost::program_options::value<float>(&params.tsdf_max_weight),
                           "max. tsdf weight");

        desc.add_options()("GRADIENT_DELTA_FACTOR", boost::program_options::value<float>(&params.gradient_delta_factor),
                           "delta factor when calculating tsdf gradient (voxels)");

        /*
         * camera
         */

        desc.add_options()("INTR_FX", boost::program_options::value<float>(&params.intr.fx), "focal length x");
        desc.add_options()("INTR_FY", boost::program_options::value<float>(&params.intr.fy), "focal length y");
        desc.add_options()("INTR_CX", boost::program_options::value<float>(&params.intr.cx), "principal point x");
        desc.add_options()("INTR_CY", boost::program_options::value<float>(&params.intr.cy), "principal point y");

        desc.add_options()("TRUNC_DEPTH", boost::program_options::value<float>(&params.icp_truncate_depth_dist),
                           "depth map truncation distance (metres)");
        desc.add_options()("VOL_POSE_T_Z", boost::program_options::value<float>(),
                           "camera to volume translation along z axis");

        /*
         * bilateral filter
         */

        desc.add_options()("BILATERAL_SIGMA_DEPTH", boost::program_options::value<float>(&params.bilateral_sigma_depth),
                           "bilateral filter sigma z");
        desc.add_options()("BILATERAL_SIGMA_SPATIAL",
                           boost::program_options::value<float>(&params.bilateral_sigma_spatial),
                           "bilateral filter sigma x-y");
        desc.add_options()("BILATERAL_KERNEL_SIZE", boost::program_options::value<int>(&params.bilateral_kernel_size),
                           "bilateral filter kernel size");

        /*
         * solver
         */
        desc.add_options()("START_FRAME", boost::program_options::value<int>(&params.start_frame),
                           "frame when to start registration");

        desc.add_options()("MAX_ITER", boost::program_options::value<int>(&params.max_iter),
                           "max. no. of iterations of the solver");
        desc.add_options()("MAX_UPDATE_NORM", boost::program_options::value<float>(&params.max_update_norm),
                           "max. update norm when running the solver");

        /* SOBOLEV */
        desc.add_options()("S", boost::program_options::value<int>(&params.s),
                           "Sobolev kernel size (currently only supports s=7");
        desc.add_options()("LAMBDA", boost::program_options::value<float>(&params.lambda),
                           "Sobolev filter parameter (currently only supports 0.05, 0.1, 0.2, and 0.4");

        /* FASTFUSION */
        desc.add_options()("ALPHA", boost::program_options::value<float>(&params.alpha), "gradient descent step size");
        desc.add_options()("W_REG", boost::program_options::value<float>(&params.w_reg), "regularisation weight");
    }

    /*
     * read parameters from params.ini
     */

    void read_parameters(boost::program_options::options_description &desc, boost::program_options::variables_map &vm) {
        std::ifstream settings_file(params_path_);

        boost::program_options::store(boost::program_options::parse_config_file(settings_file, desc), vm);
        boost::program_options::notify(vm);
    }

    /*
     * load colour and depth images
     */

    void load_files(std::vector<cv::String> *depths, std::vector<cv::String> *images, std::vector<cv::String> *masks) {
        if (!boost::filesystem::exists(file_path_)) {
            std::cerr << "error: directory '" << file_path_ << "' does not exist. exiting" << std::endl;
            exit(EXIT_FAILURE);
        }

        if (!boost::filesystem::exists(file_path_ + "/depth") || !boost::filesystem::exists(file_path_ + "/color")) {
            std::cerr << "error: source directory should contain 'color' and 'depth' folders. exiting..." << std::endl;
            exit(EXIT_FAILURE);
        }

        cv::glob(file_path_ + "/depth", *depths);
        cv::glob(file_path_ + "/color", *images);

        std::sort((*depths).begin(), (*depths).end());
        std::sort((*images).begin(), (*images).end());

        if (boost::filesystem::exists(file_path_ + "/omask")) {
            cv::glob(file_path_ + "/omask", *masks);
            std::sort((*masks).begin(), (*masks).end());
        }
    }

    /*
     * create a new directory out if not already present inside the input folder
     */

    void create_output_directory() {
        out_path_ = file_path_ + "/meshes";
        boost::filesystem::path dir_meshes(out_path_);

        if (boost::filesystem::create_directory(dir_meshes)) {
            std::cout << "created output directory for meshes" << std::endl;
        }

        if (visualizer_ || visualizer_detailed_) {
            out_screenshot_path_ = file_path_ + "/screenshots";
            boost::filesystem::path dir_screenshots(out_screenshot_path_);

            if (boost::filesystem::create_directory(dir_screenshots)) {
                std::cout << "created output directory for screenshots" << std::endl;
            }
        }
    }

    /*
     * get no. of vertices in a mesh
     */

    int get_no_vertices(pcl::PolygonMesh::Ptr mesh) {
        pcl::PointCloud<pcl::PointXYZ> pc;
        pcl::fromPCLPointCloud2(mesh->cloud, pc);

        return pc.size();
    }

    /*
     * save output meshes in .vtk format
     */

    void save_mesh(pcl::PolygonMesh::Ptr mesh, int i, std::string name) {
        /* pad frame no. with 0's */
        std::stringstream ss;
        ss << std::setw(6) << std::setfill('0') << i;
        std::string frameNum = ss.str();

        /* save to vtk */
        pcl::io::saveVTKFile(out_path_ + "/" + name + "_" + frameNum + ".vtk", *mesh);
        std::cout << "saved " + name + "_" + frameNum + ".vtk" << std::endl;
    }

    /*
     * save 3d deformation field in .vtk format
     */

    void save_field(std::shared_ptr<SobFusion> sobfu, int i) {
        /* get parameters */
        Params params = sobfu->getParams();

        /* create vtk image */
        vtkSmartPointer<vtkImageData> image = vtkSmartPointer<vtkImageData>::New();
        /* specify size of image */
        image->SetDimensions(params.volume_dims[0], params.volume_dims[1], params.volume_dims[2]);

        /* specify size of image data */
        image->AllocateScalars(VTK_FLOAT, 4);
        int *dims = image->GetDimensions();

        /* copy vector field data */
        std::shared_ptr<sobfu::cuda::DeformationField> psi = sobfu->getDeformationField();
        kfusion::cuda::CudaData displacement_data          = psi->get_data();
        displacement_data.download(image->GetScalarPointer());

        /* pad frame no. with 0's */
        std::stringstream ss;
        ss << std::setw(6) << std::setfill('0') << i;
        std::string frameNum = ss.str();
        std::string fileName = out_path_ + "/field_" + frameNum + ".vti";

        /* save file */
        vtkSmartPointer<vtkXMLImageDataWriter> writer = vtkSmartPointer<vtkXMLImageDataWriter>::New();
        writer->SetFileName(fileName.c_str());
        writer->SetInputData(image);
        writer->Write();

        std::cout << "saved the vector field to .vti" << std::endl;
    }

    bool execute() {
        /* sobfu app */
        sobfu = sobfu;

        /* images */
        cv::Mat depth, image, mask;
        std::vector<cv::String> depths, images, masks;

        load_files(&depths, &images, &masks);

        /* output */
        create_output_directory();

        /* pipeline */
        double time_ms = 0;
        bool has_image = false;
        bool has_masks = masks.size() > 0;

        int v1(0);
        int v2(0);

        int v3(0);
        int v4(0);
        int v5(0);

        for (size_t i = 0; i < depths.size(); ++i) {
            depth = cv::imread(depths[i], CV_LOAD_IMAGE_ANYDEPTH);
            image = cv::imread(images[i], CV_LOAD_IMAGE_COLOR);

            cv::Mat depth_masked = cv::Mat::zeros(depth.size(), depth.type());
            if (has_masks) {
                mask = cv::imread(masks[i], CV_8U);
                depth.copyTo(depth_masked, mask);
            }

            if (!image.data || !depth.data) {
                std::cerr << "error: image could not be read; check for improper"
                          << " permissions or invalid formats. exiting..." << std::endl;
                exit(EXIT_FAILURE);
            }

            if (has_masks) {
                depth_device_.upload(depth_masked.data, depth_masked.step, depth_masked.rows, depth_masked.cols);
            } else {
                depth_device_.upload(depth.data, depth.step, depth.rows, depth.cols);
            }

            {
                kfusion::SampledScopeTime fps(time_ms);
                has_image = (*(sobfu))(depth_device_);
            }

            /* get meshes */
            pcl::PolygonMesh::Ptr mesh_global, mesh_global_psi_inv;
            pcl::PolygonMesh::Ptr mesh_n, mesh_n_psi;

            if (visualizer_ || visualizer_detailed_ || logger_) {
                mesh_global               = sobfu->get_phi_global_mesh();
                int no_vertices_canonical = get_no_vertices(mesh_global);
                std::cout << "no. of point-normal pairs in the canonical model: " << no_vertices_canonical << std::endl;

                if (i >= 1) {
                    mesh_global_psi_inv                      = sobfu->get_phi_global_psi_inv_mesh();
                    int no_vertices_canonical_warped_to_live = get_no_vertices(mesh_global_psi_inv);
                    std::cout << "no. of point-normal pairs in the canonical model warped to live: "
                              << no_vertices_canonical_warped_to_live << std::endl;
                }

                if (visualizer_detailed_) {
                    if (i == 1) {
                        mesh_n     = sobfu->get_phi_n_mesh();
                        mesh_n_psi = sobfu->get_phi_n_psi_mesh();
                    }
                    if (i >= 2) {
                        mesh_n     = sobfu->get_phi_n_mesh();
                        mesh_n_psi = sobfu->get_phi_n_psi_mesh();
                    }
                }
            }

            if (logger_) {
                save_mesh(mesh_global, i, "canonical_mesh");
                if (i >= 1) {
                    save_mesh(mesh_global_psi_inv, i, "canonical_warped_to_live_mesh");
                }

                // save_field(sobfu, i);
            }

            if (visualizer_ || visualizer_detailed_) {
                if (i == 0) {
                    viewer = std::make_shared<pcl::visualization::PCLVisualizer>("meshes");
                }

                /*
                 * BASIC VISUALISER
                 *
                 */

                if (visualizer_) {
                    if (i == 0) {
                        /* create view ports */
                        viewer->createViewPort(0.0, 0.0, 0.5, 1.0, v1);
                        viewer->createViewPort(0.5, 0.0, 1.0, 1.0, v2);

                        /* add labels */
                        viewer->addText("frame no. " + std::to_string(i), 20, 512, 15, 1.0, 1.0, 1.0, "frame_num", v1);

                        viewer->addText("canonical model", 15, 15, 15, 1.0, 1.0, 1.0, "v1 text", v1);
                        viewer->addText("canonical model warped to live", 15, 15, 15, 1.0, 1.0, 1.0, "v2 text", v2);

                        /* add bounding box */
                        cv::Vec3f vol_size    = sobfu->getParams().volume_size;
                        cv::Affine3f vol_pose = sobfu->getParams().volume_pose;

                        cv::Vec3f min = vol_pose * cv::Vec3f(0.f, 0.f, 0.f);
                        cv::Vec3f max = vol_pose * vol_size;

                        viewer->setCameraPosition(0.0, 0.0, max[2] + 3.0, 0.0, 0.0, 0.0);

                        /* display meshes */
                        viewer->addPolygonMesh(*mesh_global, "mesh canonical", v1);
                        viewer->spinOnce(5000);

                        /* save screenshot */
                        std::stringstream ss;
                        ss << std::setw(6) << std::setfill('0') << i;
                        std::string frame_num = ss.str();

                        viewer->saveScreenshot(out_screenshot_path_ + "/" + frame_num + ".png");
                    } else if (i >= 1) {
                        /* update label */
                        viewer->updateText("frame no. " + std::to_string(i), 20, 512, 15, 1.0, 1.0, 1.0, "frame_num");

                        /* display updated meshes */
                        viewer->updatePolygonMesh(*mesh_global, "mesh canonical");
                        if (i == 1) {
                            viewer->addPolygonMesh(*mesh_global_psi_inv, "mesh canonical warped to live", v2);
                        } else {
                            viewer->updatePolygonMesh(*mesh_global_psi_inv, "mesh canonical warped to live");
                        }
                        viewer->spinOnce(50);

                        /* save screenshot */
                        std::stringstream ss;
                        ss << std::setw(6) << std::setfill('0') << i;
                        std::string frame_num = ss.str();

                        viewer->saveScreenshot(out_screenshot_path_ + "/" + frame_num + ".png");
                    }
                }

                /*
                 * DETAILED VISUALISER
                 *
                 */

                else if (visualizer_detailed_) {
                    if (i == 0) {
                        /* create view ports */
                        viewer->createViewPort(0.0, 0.5, 0.5, 1.0, v1);
                        viewer->createViewPort(0.5, 0.5, 1.0, 1.0, v2);

                        viewer->createViewPort(0.0, 0.0, 0.5, 0.5, v3);
                        viewer->createViewPort(0.5, 0.0, 1.0, 0.5, v4);

                        /* add labels */
                        viewer->addText("frame no. " + std::to_string(i), 20, 244, 15, 1.0, 1.0, 1.0, "frame_num", v1);

                        viewer->addText("phi_n", 15, 15, 15, 1.0, 1.0, 1.0, "v1 text", v1);
                        viewer->addText("phi_global(psi_inv)", 15, 15, 15, 1.0, 1.0, 1.0, "v2 text", v2);

                        viewer->addText("phi_global", 15, 15, 15, 1.0, 1.0, 1.0, "v3 text", v3);
                        viewer->addText("phi_n(psi)", 15, 15, 15, 1.0, 1.0, 1.0, "v4 text", v4);

                        /* add bounding box */
                        cv::Vec3f vol_size    = sobfu->getParams().volume_size;
                        cv::Affine3f vol_pose = sobfu->getParams().volume_pose;

                        cv::Vec3f min = vol_pose * cv::Vec3f(0.f, 0.f, 0.f);
                        cv::Vec3f max = vol_pose * vol_size;

                        viewer->setCameraPosition(0.0, 0.0, max[2] + 3.0, 0.0, 0.0, 0.0);

                        /* display meshes */
                        viewer->addPolygonMesh(*mesh_global, "mesh canonical", v3);
                        viewer->spinOnce(10000);

                        /* save screenshot */
                        std::stringstream ss;
                        ss << std::setw(6) << std::setfill('0') << i;
                        std::string frame_num = ss.str();

                        viewer->saveScreenshot(out_screenshot_path_ + "/" + frame_num + ".png");
                    }
                    if (i >= 1) {
                        /* update labels */
                        viewer->updateText("frame no. " + std::to_string(i), 20, 244, 15, 1.0, 1.0, 1.0, "frame_num");
                        /* display meshes */
                        viewer->updatePolygonMesh(*mesh_global, "mesh canonical");

                        if (i == 1) {
                            viewer->addPolygonMesh(*mesh_n, "mesh n", v1);
                            viewer->addPolygonMesh(*mesh_global_psi_inv, "mesh canonical warped to live", v2);
                            viewer->addPolygonMesh(*mesh_n_psi, "mesh n psi", v4);
                        }
                        if (i >= 2) {
                            viewer->updatePolygonMesh(*mesh_n, "mesh n");
                            viewer->updatePolygonMesh(*mesh_global_psi_inv, "mesh canonical warped to live");
                            viewer->updatePolygonMesh(*mesh_n_psi, "mesh n psi");
                        }
                        viewer->spinOnce(50);

                        /* save screenshot */
                        std::stringstream ss;
                        ss << std::setw(6) << std::setfill('0') << i;
                        std::string frame_num = ss.str();

                        viewer->saveScreenshot(out_screenshot_path_ + "/" + frame_num + ".png");
                    }
                }
            }
        }

        return true;
    }

    kfusion::cuda::Depth depth_device_;

    std::shared_ptr<SobFusion> sobfu;
    std::shared_ptr<pcl::visualization::PCLVisualizer> viewer;

    bool exit_, logger_, visualizer_, visualizer_detailed_, off_screen_, verbose_, vverbose_;
    std::string file_path_, params_path_, out_path_, out_screenshot_path_;
};

/*
 * parse the input flag and determine the file path and whether or not to enable visualizer
 * all flags will be matched and the last argument which does not match the flag will be
 * treated as filepath
 */
void parse_flags(std::vector<std::string> args, std::string *file_path, std::string *params_path, bool *logger,
                 bool *visualizer, bool *visualizer_detailed, bool *verbose, bool *vverbose) {
    std::vector<std::string> flags = {"-h",           "--help",    "--enable-viz", "--enable-viz-detailed",
                                      "--enable-log", "--verbose", "--vverbose"};

    int idx = 0;
    for (auto arg : args) {
        if (std::find(std::begin(flags), std::end(flags), arg) != std::end(flags)) {
            if (arg == "-h" || arg == "--help") {
                std::cout << "USAGE: sobfu [OPTIONS] <file path> <ini path>" << std::endl;
                std::cout << "\t--help -h:    display help" << std::endl;
                std::cout << "\t--enable-viz: enable visualizer" << std::endl;
                std::cout << "\t--enable-viz-detailed: enable visualizer with additional meshes for debugging"
                          << std::endl;
                std::cout << "\t--enable-log: log output meshes" << std::endl;
                std::cout << "\t--verbose: low verbosity" << std::endl;
                std::cout << "\t--vverbose: high verbosity" << std::endl;
                std::exit(EXIT_SUCCESS);
            }

            if (arg == "--enable-log") {
                *logger = true;
            }
            if (arg == "--enable-viz") {
                *visualizer = true;
            }
            if (arg == "--enable-viz-detailed") {
                *visualizer_detailed = true;
            }
            if (arg == "--verbose") {
                *verbose = true;
            }
            if (arg == "--vverbose") {
                *vverbose = true;
            }
        } else if (idx == 0) {
            *file_path = arg;
            idx++;
        } else if (idx == 1) {
            *params_path = arg;
        }
    }
}

int main(int argc, char *argv[]) {
    int device = 0;

    kfusion::cuda::setDevice(device);
    kfusion::cuda::printShortCudaDeviceInfo(device);

    if (kfusion::cuda::checkIfPreFermiGPU(device)) {
        std::cout << std::endl
                  << "kinfu does not support pre-fermi gpu architectures, and is not built for them by "
                     "default; exiting..."
                  << std::endl;
        return 1;
    }

    /* program requires at least one argument--the path to the directory where the source files are */
    if (argc < 3) {
        return std::cerr << "error: incorrect number of arguments; please supply path to source data and .ini file; "
                            "exiting..."
                         << std::endl,
               -1;
    }

    std::vector<std::string> args(argv + 1, argv + argc);
    std::string file_path, params_path;

    bool logger              = false;
    bool visualizer          = false;
    bool visualizer_detailed = false;
    bool off_screen          = false;
    bool verbose             = false;
    bool vverbose            = false;

    parse_flags(args, &file_path, &params_path, &logger, &visualizer, &visualizer_detailed, &verbose, &vverbose);

    /* disable the visualiser when running over SSH */
    if (((visualizer || visualizer_detailed) && !off_screen) && getenv("SSH_CLIENT")) {
        return std::cerr << "error: cannot run visualiser while running over ssh; please run locally or disable the"
                            "visualiser. exiting..."
                         << std::endl,
               -1;
    }

    SobFuApp app(file_path, params_path, logger, visualizer, visualizer_detailed, verbose, vverbose);

    /* execute */
    app.execute();

    return 0;
}


================================================
FILE: src/kfusion/CMakeLists.txt
================================================
# Build all kfusion source files into library kfusion
add_module_library(kfusion)

# Link to kfusion library
target_link_libraries(kfusion
  ${CUDA_CUDART_LIBRARY}
  ${PCL_LIBRARIES}
)


================================================
FILE: src/kfusion/core.cpp
================================================
#include <kfusion/kinfu.hpp>
#include <kfusion/safe_call.hpp>

#include <cuda.h>
#include <cstdio>
#include <iostream>

int kf::cuda::getCudaEnabledDeviceCount() {
    int count;
    cudaError_t error = cudaGetDeviceCount(&count);

    if (error == cudaErrorInsufficientDriver)
        return -1;

    if (error == cudaErrorNoDevice)
        return 0;

    cudaSafeCall(error);
    return count;
}

void kf::cuda::setDevice(int device) { cudaSafeCall(cudaSetDevice(device)); }

std::string kf::cuda::getDeviceName(int device) {
    cudaDeviceProp prop;
    cudaSafeCall(cudaGetDeviceProperties(&prop, device));

    return prop.name;
}

bool kf::cuda::checkIfPreFermiGPU(int device) {
    if (device < 0)
        cudaSafeCall(cudaGetDevice(&device));

    cudaDeviceProp prop;
    cudaSafeCall(cudaGetDeviceProperties(&prop, device));
    return prop.major < 2;  // CC == 1.x
}

namespace {
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) {
    *attribute     = T();
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
    if (CUDA_SUCCESS == error)
        return;

    printf("Driver API error = %04d\n", error);
    kfusion::cuda::error("driver API error", __FILE__, __LINE__);
}

inline int convertSMVer2Cores(int major, int minor) {
    // Defines for GPU Architecture types (using the SM version to determine the #
    // of cores per SM
    typedef struct {
        int SM;  // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
                 // minor version
        int Cores;
    } SMtoCores;

    SMtoCores gpuArchCoresPerSM[] = {{0x10, 8},   {0x11, 8},   {0x12, 8},   {0x13, 8},   {0x20, 32},  {0x21, 48},
                                     {0x30, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128}, {0x52, 128}, {0x60, 64},
                                     {0x61, 128}, {0x70, 64},  {0x75, 64},  {-1, -1}};

    int index = 0;
    while (gpuArchCoresPerSM[index].SM != -1) {
        if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor))
            return gpuArchCoresPerSM[index].Cores;
        index++;
    }
    printf("\nCan't determine number of cores. Unknown SM version %d.%d!\n", major, minor);
    return 0;
}
}  // namespace

void kf::cuda::printCudaDeviceInfo(int device) {
    int count  = getCudaEnabledDeviceCount();
    bool valid = (device >= 0) && (device < count);

    int beg = valid ? device : 0;
    int end = valid ? device + 1 : count;

    printf(
        "*** CUDA Device Query (Runtime API) version (CUDART static linking) "
        "*** \n\n");
    printf("Device count: %d\n", count);

    int driverVersion = 0, runtimeVersion = 0;
    cudaSafeCall(cudaDriverGetVersion(&driverVersion));
    cudaSafeCall(cudaRuntimeGetVersion(&runtimeVersion));

    const char *computeMode[] = {
        "Default (multiple host threads can use ::cudaSetDevice() with device "
        "simultaneously)",
        "Exclusive (only one host thread in one process is able to use "
        "::cudaSetDevice() with this device)",
        "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
        "Exclusive Process (many threads in one process is able to use "
        "::cudaSetDevice() with this device)",
        "Unknown",
        NULL};

    for (int dev = beg; dev < end; ++dev) {
        cudaDeviceProp prop;
        cudaSafeCall(cudaGetDeviceProperties(&prop, dev));

        int sm_cores = convertSMVer2Cores(prop.major, prop.minor);

        printf("\nDevice %d: \"%s\"\n", dev, prop.name);
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion / 1000,
               driverVersion % 100, runtimeVersion / 1000, runtimeVersion % 100);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", prop.major, prop.minor);
        printf(
            "  Total amount of global memory:                 %.0f MBytes (%llu "
            "bytes)\n",
            (float) prop.totalGlobalMem / 1048576.0f, (unsigned long long) prop.totalGlobalMem);
        printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n", prop.multiProcessorCount, sm_cores,
               sm_cores * prop.multiProcessorCount);
        printf("  GPU Clock Speed:                               %.2f GHz\n", prop.clockRate * 1e-6f);

#if (CUDART_VERSION >= 4000)
        // This is not available in the CUDA Runtime API, so we make the necessary
        // calls the driver API to support this for output
        int memoryClock, memBusWidth, L2CacheSize;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        printf("  Memory Clock rate:                             %.2f Mhz\n", memoryClock * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        if (L2CacheSize)
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);

        printf(
            "  Max Texture Dimension Size (x,y,z)             1D=(%d), "
            "2D=(%d,%d), 3D=(%d,%d,%d)\n",
            prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], prop.maxTexture3D[0], prop.maxTexture3D[1],
            prop.maxTexture3D[2]);
        printf(
            "  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, "
            "2D=(%d,%d) x %d\n",
            prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], prop.maxTexture2DLayered[0],
            prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
#endif
        printf("  Total amount of constant memory:               %u bytes\n", (int) prop.totalConstMem);
        printf("  Total amount of shared memory per block:       %u bytes\n", (int) prop.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", prop.regsPerBlock);
        printf("  Warp size:                                     %d\n", prop.warpSize);
        printf("  Maximum number of threads per block:           %d\n", prop.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", prop.maxThreadsDim[0],
               prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", prop.maxGridSize[0],
               prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %u bytes\n", (int) prop.memPitch);
        printf("  Texture alignment:                             %u bytes\n", (int) prop.textureAlignment);

#if CUDART_VERSION >= 4000
        printf(
            "  Concurrent copy and execution:                 %s with %d copy "
            "engine(s)\n",
            (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
#else
        printf("  Concurrent copy and execution:                 %s\n", prop.deviceOverlap ? "Yes" : "No");
#endif
        printf("  Run time limit on kernels:                     %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated GPU sharing Host Memory:            %s\n", prop.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n", prop.canMapHostMemory ? "Yes" : "No");

        printf("  Concurrent kernel execution:                   %s\n", prop.concurrentKernels ? "Yes" : "No");
        printf("  Alignment requirement for Surfaces:            %s\n", prop.surfaceAlignment ? "Yes" : "No");
        printf("  Device has ECC support enabled:                %s\n", prop.ECCEnabled ? "Yes" : "No");
        printf("  Device is using TCC driver mode:               %s\n", prop.tccDriver ? "Yes" : "No");
#if CUDART_VERSION >= 4000
        printf("  Device supports Unified Addressing (UVA):      %s\n", prop.unifiedAddressing ? "Yes" : "No");
        printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n", prop.pciBusID, prop.pciDeviceID);
#endif
        printf("  Compute Mode:\n");
        printf("      %s \n", computeMode[prop.computeMode]);
    }

    printf("\n");
    printf("deviceQuery, CUDA Driver = CUDART");
    printf(", CUDA Driver Version  = %d.%d", driverVersion / 1000, driverVersion % 100);
    printf(", CUDA Runtime Version = %d.%d", runtimeVersion / 1000, runtimeVersion % 100);
    printf(", NumDevs = %d\n\n", count);
    fflush(stdout);
}

void kf::cuda::printShortCudaDeviceInfo(int device) {
    int count  = getCudaEnabledDeviceCount();
    bool valid = (device >= 0) && (device < count);

    int beg = valid ? device : 0;
    int end = valid ? device + 1 : count;

    int driverVersion = 0, runtimeVersion = 0;
    cudaSafeCall(cudaDriverGetVersion(&driverVersion));
    cudaSafeCall(cudaRuntimeGetVersion(&runtimeVersion));

    for (int dev = beg; dev < end; ++dev) {
        cudaDeviceProp prop;
        cudaSafeCall(cudaGetDeviceProperties(&prop, dev));

        const char *arch_str = prop.major < 2 ? " (pre-Fermi)" : "";
        printf("Device %d:  \"%s\"  %.0fMb", dev, prop.name, (float) prop.totalGlobalMem / 1048576.0f);
        printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str,
               convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);
        printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion / 1000, driverVersion % 100, runtimeVersion / 1000,
               runtimeVersion % 100);
    }
    fflush(stdout);
}

kf::SampledScopeTime::SampledScopeTime(double &time_ms) : time_ms_(time_ms) { start = (double) cv::getTickCount(); }
kf::SampledScopeTime::~SampledScopeTime() {
    static int i_ = 0;
    time_ms_ += getTime();
    if (i_ % EACH == 0 && i_) {
        std::cout << "avg. frame time = " << time_ms_ / EACH << "ms (" << 1000.f * EACH / time_ms_ << "fps)"
                  << std::endl;
        time_ms_ = 0.0;
    }
    ++i_;
}

double kf::SampledScopeTime::getTime() {
    return ((double) cv::getTickCount() - start) * 1000.0 / cv::getTickFrequency();
}

kf::ScopeTime::ScopeTime(const char *name_) : name(name_) { start = (double) cv::getTickCount(); }
kf::ScopeTime::~ScopeTime() {
    double time_ms = ((double) cv::getTickCount() - start) * 1000.0 / cv::getTickFrequency();
    std::cout << "Time(" << name << ") = " << time_ms << "ms" << std::endl;
}


================================================
FILE: src/kfusion/cuda/imgproc.cu
================================================
#include <kfusion/cuda/device.hpp>

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Depth bilateral filter

namespace kfusion {
namespace device {
__global__ void bilateral_kernel(const PtrStepSz<ushort> src, PtrStep<ushort> dst, const int ksz,
                                 const float sigma_spatial2_inv_half, const float sigma_depth2_inv_half) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x >= src.cols || y >= src.rows)
        return;

    int value = src(y, x);

    int tx = min(x - ksz / 2 + ksz, src.cols - 1);
    int ty = min(y - ksz / 2 + ksz, src.rows - 1);

    float sum1 = 0;
    float sum2 = 0;

    for (int cy = max(y - ksz / 2, 0); cy < ty; ++cy) {
        for (int cx = max(x - ksz / 2, 0); cx < tx; ++cx) {
            int depth = src(cy, cx);

            float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
            float color2 = (value - depth) * (value - depth);

            float weight = __expf(-(space2 * sigma_spatial2_inv_half + color2 * sigma_depth2_inv_half));

            sum1 += depth * weight;
            sum2 += weight;
        }
    }
    dst(y, x) = __float2int_rn(sum1 / sum2);
}
}  // namespace device
}  // namespace kfusion

void kfusion::device::bilateralFilter(const Depth& src, Depth& dst, int kernel_size, float sigma_spatial,
                                      float sigma_depth) {
    sigma_depth *= 1000;  // meters -> mm

    dim3 block(64, 16);
    dim3 grid(divUp(src.cols(), block.x), divUp(src.rows(), block.y));

    cudaSafeCall(cudaFuncSetCacheConfig(bilateral_kernel, cudaFuncCachePreferL1));
    bilateral_kernel<<<grid, block>>>(src, dst, kernel_size, 0.5f / (sigma_spatial * sigma_spatial),
                                      0.5f / (sigma_depth * sigma_depth));
    cudaSafeCall(cudaGetLastError());
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Depth truncation

namespace kfusion {
namespace device {
__global__ void truncate_depth_kernel(PtrStepSz<ushort> depth, ushort max_dist /*mm*/) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < depth.cols && y < depth.rows)
        if (depth(y, x) > max_dist)
            depth(y, x) = 0;
}
}  // namespace device
}  // namespace kfusion

void kfusion::device::truncateDepth(Depth& depth, float max_dist /*meters*/) {
    dim3 block(64, 16);
    dim3 grid(divUp(depth.cols(), block.x), divUp(depth.rows(), block.y));

    truncate_depth_kernel<<<grid, block>>>(depth, static_cast<ushort>(max_dist * 1000.f));
    cudaSafeCall(cudaGetLastError());
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Build depth pyramid

namespace kfusion {
namespace device {
__global__ void pyramid_kernel(const PtrStepSz<ushort> src, PtrStepSz<ushort> dst, float sigma_depth_mult3) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= dst.cols || y >= dst.rows)
        return;

    const int D = 5;
    int center  = src(2 * y, 2 * x);

    int tx = min(2 * x - D / 2 + D, src.cols - 1);
    int ty = min(2 * y - D / 2 + D, src.rows - 1);
    int cy = max(0, 2 * y - D / 2);

    int sum   = 0;
    int count = 0;

    for (; cy < ty; ++cy)
        for (int cx = max(0, 2 * x - D / 2); cx < tx; ++cx) {
            int val = src(cy, cx);
            if (abs(val - center) < sigma_depth_mult3) {
                sum += val;
                ++count;
            }
        }
    dst(y, x) = (count == 0) ? 0 : sum / count;
}
}  // namespace device
}  // namespace kfusion

void kfusion::device::depthPyr(const Depth& source, Depth& pyramid, float sigma_depth) {
    sigma_depth *= 1000;  // meters -> mm

    dim3 block(64, 16);
    dim3 grid(divUp(pyramid.cols(), block.x), divUp(pyramid.rows(), block.y));

    pyramid_kernel<<<grid, block>>>(source, pyramid, sigma_depth * 3);
    cudaSafeCall(cudaGetLastError());
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Compute normals

namespace kfusion {
namespace device {
__global__ void compute_normals_kernel(const PtrStepSz<ushort> depth, const Reprojector reproj,
                                       PtrStep<Normal> normals) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x >= depth.cols || y >= depth.rows)
        return;

    const float qnan = numeric_limits<float>::quiet_NaN();

    Normal n_out = make_float4(qnan, qnan, qnan, 0.f);

    if (x < depth.cols - 1 && y < depth.rows - 1) {
        // mm -> meters
        float z00 = depth(y, x) * 0.001f;
        float z01 = depth(y, x + 1) * 0.001f;
        float z10 = depth(y + 1, x) * 0.001f;

        if (z00 * z01 * z10 != 0) {
            float3 v00 = reproj(x, y, z00);
            float3 v01 = reproj(x + 1, y, z01);
            float3 v10 = reproj(x, y + 1, z10);

            float3 n = normalized(cross(v01 - v00, v10 - v00));
            n_out    = make_float4(-n.x, -n.y, -n.z, 0.f);
        }
    }
    normals(y, x) = n_out;
}

__global__ void mask_depth_kernel(const PtrStep<Normal> normals, PtrStepSz<ushort> depth) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x < depth.cols || y < depth.rows) {
        float4 n = normals(y, x);
        if (isnan(n.x))
            depth(y, x) = 0;
    }
}
}  // namespace device
}  // namespace kfusion

void kfusion::device::computeNormalsAndMaskDepth(const Reprojector& reproj, Depth& depth, Normals& normals) {
    dim3 block(64, 16);
    dim3 grid(divUp(depth.cols(), block.x), divUp(depth.rows(), block.y));

    compute_normals_kernel<<<grid, block>>>(depth, reproj, normals);
    cudaSafeCall(cudaGetLastError());

    mask_depth_kernel<<<grid, block>>>(normals, depth);
    cudaSafeCall(cudaGetLastError());
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Compute computePointNormals

namespace kfusion {
namespace device {
__global__ void points_normals_kernel(const Reprojector reproj, const PtrStepSz<ushort> depth, PtrStep<Point> points,
                                      PtrStep<Normal> normals) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x >= depth.cols || y >= depth.rows)
        return;

    const float qnan = numeric_limits<float>::quiet_NaN();
    points(y, x) = normals(y, x) = make_float4(qnan, qnan, qnan, qnan);

    if (x >= depth.cols - 1 || y >= depth.rows - 1)
        return;

    // mm -> meters
    float z00 = depth(y, x) * 0.001f;
    float z01 = depth(y, x + 1) * 0.001f;
    float z10 = depth(y + 1, x) * 0.001f;

    if (z00 * z01 * z10 != 0) {
        float3 v00 = reproj(x, y, z00);
        float3 v01 = reproj(x + 1, y, z01);
        float3 v10 = reproj(x, y + 1, z10);

        float3 n      = normalized(cross(v01 - v00, v10 - v00));
        normals(y, x) = make_float4(-n.x, -n.y, -n.z, 0.f);
        points(y, x)  = make_float4(v00.x, v00.y, v00.z, 0.f);
    }
}
}  // namespace device
}  // namespace kfusion

void kfusion::device::computePointNormals(const Reprojector& reproj, const Depth& depth, Points& points,
                                          Normals& normals) {
    dim3 block(64, 16);
    dim3 grid(divUp(depth.cols(), block.x), divUp(depth.rows(), block.y));

    points_normals_kernel<<<grid, block>>>(reproj, depth, points, normals);
    cudaSafeCall(cudaGetLastError());
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Compute dists

namespace kfusion {
namespace device {
__global__ void compute_dists_kernel(const PtrStepSz<ushort> depth, Dists dists, float2 finv, float2 c) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x < depth.cols || y < depth.rows) {
        float xl     = (x - c.x) * finv.x;
        float yl     = (y - c.y) * finv.y;
        float lambda = sqrtf(xl * xl + yl * yl + 1);

        dists(y, x) = depth(y, x) * lambda * 0.001f;  // meters
    }
}
}  // namespace device
}  // namespace kfusion

void kfusion::device::compute_dists(const Depth& depth, Dists dists, float2 f, float2 c) {
    dim3 block(64, 16);
    dim3 grid(divUp(depth.cols(), block.x), divUp(depth.rows(), block.y));

    compute_dists_kernel<<<grid, block>>>(depth, dists, make_float2(1.f / f.x, 1.f / f.y), c);
    cudaSafeCall(cudaGetLastError());
}

namespace kfusion {
namespace device {
__global__ void resize_depth_normals_kernel(const PtrStep<ushort> dsrc, const PtrStep<float4> nsrc,
                                            PtrStepSz<ushort> ddst, PtrStep<float4> ndst) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x >= ddst.cols || y >= ddst.rows)
        return;

    const float qnan = numeric_limits<float>::quiet_NaN();

    ushort d = 0;
    float4 n = make_float4(qnan, qnan, qnan, qnan);

    int xs = x * 2;
    int ys = y * 2;

    int d00 = dsrc(ys + 0, xs + 0);
    int d01 = dsrc(ys + 0, xs + 1);
    int d10 = dsrc(ys + 1, xs + 0);
    int d11 = dsrc(ys + 1, xs + 1);

    if (d00 * d01 != 0 && d10 * d11 != 0) {
        d = (d00 + d01 + d10 + d11) / 4;

        float4 n00 = nsrc(ys + 0, xs + 0);
        float4 n01 = nsrc(ys + 0, xs + 1);
        float4 n10 = nsrc(ys + 1, xs + 0);
        float4 n11 = nsrc(ys + 1, xs + 1);

        n.x = (n00.x + n01.x + n10.x + n11.x) * 0.25;
        n.y = (n00.y + n01.y + n10.y + n11.y) * 0.25;
        n.z = (n00.z + n01.z + n10.z + n11.z) * 0.25;
    }
    ddst(y, x) = d;
    ndst(y, x) = n;
}
}  // namespace device
}  // namespace kfusion

void kfusion::device::resizeDepthNormals(const Depth& depth, const Normals& normals, Depth& depth_out,
                                         Normals& normals_out) {
    int in_cols = depth.cols();
    int in_rows = depth.rows();

    int out_cols = in_cols / 2;
    int out_rows = in_rows / 2;

    dim3 block(64, 16);
    dim3 grid(divUp(out_cols, block.x), divUp(out_rows, block.y));

    resize_depth_normals_kernel<<<grid, block>>>(depth, normals, depth_out, normals_out);
    cudaSafeCall(cudaGetLastError());
}

namespace kfusion {
namespace device {
__global__ void resize_points_normals_kernel(const PtrStep<Point> vsrc, const PtrStep<Normal> nsrc,
                                             PtrStepSz<Point> vdst, PtrStep<Normal> ndst) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x >= vdst.cols || y >= vdst.rows)
        return;

    const float qnan = numeric_limits<float>::quiet_NaN();
    vdst(y, x) = ndst(y, x) = make_float4(qnan, qnan, qnan, 0.f);

    int xs = x * 2;
    int ys = y * 2;

    float3 d00 = tr(vsrc(ys + 0, xs + 0));
    float3 d01 = tr(vsrc(ys + 0, xs + 1));
    float3 d10 = tr(vsrc(ys + 1, xs + 0));
    float3 d11 = tr(vsrc(ys + 1, xs + 1));

    if (!isnan(d00.x * d01.x * d10.x * d11.x)) {
        float3 d   = (d00 + d01 + d10 + d11) * 0.25f;
        vdst(y, x) = make_float4(d.x, d.y, d.z, 0.f);

        float3 n00 = tr(nsrc(ys + 0, xs + 0));
        float3 n01 = tr(nsrc(ys + 0, xs + 1));
        float3 n10 = tr(nsrc(ys + 1, xs + 0));
        float3 n11 = tr(nsrc(ys + 1, xs + 1));

        float3 n   = (n00 + n01 + n10 + n11) * 0.25f;
        ndst(y, x) = make_float4(n.x, n.y, n.z, 0.f);
    }
}
}  // namespace device
}  // namespace kfusion

void kfusion::device::resizePointsNormals(const Points& points, const Normals& normals, Points& points_out,
                                          Normals& normals_out) {
    int out_cols = points.cols() / 2;
    int out_rows = points.rows() / 2;

    dim3 block(64, 16);
    dim3 grid(divUp(out_cols, block.x), divUp(out_rows, block.y));

    resize_points_normals_kernel<<<grid, block>>>(points, normals, points_out, normals_out);
    cudaSafeCall(cudaGetLastError());
}

namespace kfusion {
namespace device {
/* calculate for the projected triangle the bounding box in the image domain */
__host__ __device__ void get_bounding_box(float2 v1, float2 v2, float2 v3, int2& min, int2& max) {
    min.x = static_cast<int>(fmin(v1.x, fmin(v2.x, v3.x)));
    min.y = static_cast<int>(fmin(v1.y, fmin(v2.y, v3.y)));

    max.x = static_cast<int>(fmax(v1.x, fmax(v2.x, v3.x)));
    max.y = static_cast<int>(fmax(v1.y, fmax(v2.y, v3.y)));
}

__host__ __device__ __forceinline__ float edge_function(const float2& a, const float2& b, const float2& c) {
    return (c.x - a.x) * (b.y - a.y) - (c.y - a.y) * (b.x - a.x);
}

/* rasterise surface triangles */
__global__ void rasterise_surface_kernel(const Projector proj, const Aff3f vol2cam, const PtrSz<Point> vsrc,
                                         const PtrSz<Normal> nsrc, PtrStepSz<Point> points_out,
                                         PtrStep<Normal> normals_out) {
    int x = (threadIdx.x + blockIdx.x * blockDim.x) * 3;

    if ((x + 2) >= vsrc.size)
        return;

    /* get vertices and normals */
    float3 v1 = vol2cam * tr(vsrc[x]);
    float3 v2 = vol2cam * tr(vsrc[x + 1]);
    float3 v3 = vol2cam * tr(vsrc[x + 2]);

    /* project vertices onto the image plane */
    float2 coos1 = proj(v1);
    float2 coos2 = proj(v2);
    float2 coos3 = proj(v3);

    /* get 2-d triangle bounding box */
    int2 min;
    int2 max;
    get_bounding_box(coos1, coos2, coos3, min, max);

    /* check for validity of coordinates */
    if (min.x < 0 || min.y < 0 || max.x >= points_out.cols || max.y >= points_out.rows)
        return;

    /* used for smooth interpoation */
    float area = edge_function(coos1, coos2, coos3);

    /* shade pixels */
    for (int i = min.x; i < max.x; i++) {
        for (int j = min.y; j < max.y; j++) {
            /* coordinates of the centre of the pixel */
            float2 p = make_float2(i + 0.5f, j + 0.5f);

            float w0 = edge_function(coos2, coos3, p) / area;
            float w1 = edge_function(coos3, coos1, p) / area;
            float w2 = edge_function(coos1, coos2, p) / area;

            float z = w0 * v1.z + w1 * v2.z + w2 * v3.z;

            if (z < points_out(j, i).z || fabs(points_out(j, i).z) < 1e-7f) {
                points_out(j, i) =
                    make_float4(w0 * v1.x + w1 * v2.x + w2 * v3.x, w0 * v1.y + w1 * v2.y + w2 * v3.y, z, 0.f);
            }
        }
    }

    for (int i = min.x; i < max.x; i++) {
        for (int j = min.y; j < max.y; j++) {
            float3 v0 = tr(points_out(j, i));
            float3 v1 = tr(points_out(j + 1, i));
            float3 v2 = tr(points_out(j, i + 1));

            float3 n          = normalized(cross(v1 - v0, v2 - v0));
            normals_out(j, i) = make_float4(n.x, n.y, n.z, 1.f);
        }
    }
}
}  // namespace device
}  // namespace kfusion

void kfusion::device::rasteriseSurface(const Projector& proj, const Aff3f& vol2cam, const Surface& surface,
                                       Points& points_out, Normals& normals_out) {
    dim3 block(256);
    dim3 grid(divUp(surface.vertices.size() / 3, block.x));

    rasterise_surface_kernel<<<grid, block>>>(proj, vol2cam, surface.vertices, surface.normals, points_out,
                                              normals_out);
    cudaSafeCall(cudaGetLastError());
}


================================================
FILE: src/kfusion/cuda/marching_cubes.cu
================================================
#include <kfusion/cuda/device.hpp>

#include <thrust/device_ptr.h>
#include <thrust/scan.h>

#define FULL_MASK 0xffffffff

namespace kfusion {
namespace device {
// texture<int, 1, cudaReadModeElementType> edgeTex;
texture<int, 1, cudaReadModeElementType> triTex;
texture<int, 1, cudaReadModeElementType> numVertsTex;
}  // namespace device
}  // namespace kfusion

void kfusion::device::bindTextures(const int* /*edgeBuf*/, const int* triBuf, const int* numVertsBuf) {
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
    // cudaSafeCall(cudaBindTexture(0, edgeTex, edgeBuf, desc) );
    cudaSafeCall(cudaBindTexture(0, triTex, triBuf, desc));
    cudaSafeCall(cudaBindTexture(0, numVertsTex, numVertsBuf, desc));
}
void kfusion::device::unbindTextures() {
    // cudaSafeCall( cudaUnbindTexture(edgeTex) );
    cudaSafeCall(cudaUnbindTexture(numVertsTex));
    cudaSafeCall(cudaUnbindTexture(triTex));
}

namespace kfusion {
namespace device {
__device__ int global_count = 0;
__device__ int output_count;
__device__ unsigned int blocks_done = 0;

__kf_device__ void kfusion::device::CubeIndexEstimator::readTsdf(int x, int y, int z, float& f, float& weight) const {
    float2 aux = *volume(x, y, z);
    f          = aux.x;
    weight     = aux.y;
}

__kf_device__ int kfusion::device::CubeIndexEstimator::computeCubeIndex(int x, int y, int z, float f[8]) const {
    float weight;
    readTsdf(x, y, z, f[0], weight);
    if (weight == 0.f)
        return 0;
    readTsdf(x + 1, y, z, f[1], weight);
    if (weight == 0.f)
        return 0;
    readTsdf(x + 1, y + 1, z, f[2], weight);
    if (weight == 0.f)
        return 0;
    readTsdf(x, y + 1, z, f[3], weight);
    if (weight == 0.f)
        return 0;
    readTsdf(x, y, z + 1, f[4], weight);
    if (weight == 0.f)
        return 0;
    readTsdf(x + 1, y, z + 1, f[5], weight);
    if (weight == 0.f)
        return 0;
    readTsdf(x + 1, y + 1, z + 1, f[6], weight);
    if (weight == 0.f)
        return 0;
    readTsdf(x, y + 1, z + 1, f[7], weight);
    if (weight == 0.f)
        return 0;

    // calculate flag indicating if each vertex is inside or outside isosurface
    int cubeindex = 0;
    cubeindex     = int(f[0] < isoValue);
    cubeindex += int(f[1] < isoValue) * 2;
    cubeindex += int(f[2] < isoValue) * 4;
    cubeindex += int(f[3] < isoValue) * 8;
    cubeindex += int(f[4] < isoValue) * 16;
    cubeindex += int(f[5] < isoValue) * 32;
    cubeindex += int(f[6] < isoValue) * 64;
    cubeindex += int(f[7] < isoValue) * 128;

    return cubeindex;
}

__kf_device__ void kfusion::device::OccupiedVoxels::operator()() const {
    int x = threadIdx.x + blockIdx.x * CTA_SIZE_X;
    int y = threadIdx.y + blockIdx.y * CTA_SIZE_Y;

    if (__all_sync(FULL_MASK, x >= volume.dims.x) || __all_sync(FULL_MASK, y >= volume.dims.y)) {
        return;
    }

    int ftid    = Block::flattenedThreadId();
    int warp_id = Warp::id();
    int lane_id = Warp::laneId();

    volatile __shared__ int warps_buffer[WARPS_COUNT];

    for (int z = 0; z < volume.dims.z - 1; z++) {
        int numVerts = 0;
        ;
        if (x + 1 < volume.dims.x && y + 1 < volume.dims.y) {
            float field[8];
            int cubeindex = computeCubeIndex(x, y, z, field);

            // read number of vertices from texture
            numVerts = (cubeindex == 0 || cubeindex == 255) ? 0 : tex1Dfetch(numVertsTex, cubeindex);
        }

        int total = __popc(__ballot_sync(FULL_MASK, numVerts > 0));

        if (total == 0)
            continue;

        if (lane_id == 0) {
            int old               = atomicAdd(&global_count, total);
            warps_buffer[warp_id] = old;
        }
        int old_global_voxels_count = warps_buffer[warp_id];

        int offs = Warp::binaryExclScan(__ballot_sync(FULL_MASK, numVerts > 0));

        if (old_global_voxels_count + offs < max_size && numVerts > 0) {
            voxels_indices[old_global_voxels_count + offs]  = volume.dims.y * volume.dims.x * z + volume.dims.x * y + x;
            vertices_number[old_global_voxels_count + offs] = numVerts;
        }

        bool full = old_global_voxels_count + total >= max_size;

        if (full)
            break;

    } /* for(int z = 0; z < 128 - 1; z++) */

    /////////////////////////
    // prepare for future scans
    if (ftid == 0) {
        unsigned int total_blocks = gridDim.x * gridDim.y * gridDim.z;
        unsigned int value        = atomicInc(&blocks_done, total_blocks);

        // last block
        if (value == total_blocks - 1) {
            output_count = min(max_size, global_count);
            blocks_done  = 0;
            global_count = 0;
        }
    }
} /* operator () */

__global__ void getOccupiedVoxelsKernel(const OccupiedVoxels ov) { ov(); }

int getOccupiedVoxels(const TsdfVolume& volume, DeviceArray2D<int>& occupied_voxels) {
    OccupiedVoxels ov(volume);

    ov.voxels_indices  = occupied_voxels.ptr(0);
    ov.vertices_number = occupied_voxels.ptr(1);
    ov.max_size        = occupied_voxels.cols();

    dim3 block(OccupiedVoxels::CTA_SIZE_X, OccupiedVoxels::CTA_SIZE_Y);
    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));

    getOccupiedVoxelsKernel<<<grid, block>>>(ov);
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaDeviceSynchronize());

    int size;
    cudaSafeCall(cudaMemcpyFromSymbol(&size, output_count, sizeof(size)));
    return size;
}

int computeOffsetsAndTotalVertices(DeviceArray2D<int>& occupied_voxels) {
    thrust::device_ptr<int> beg(occupied_voxels.ptr(1));
    thrust::device_ptr<int> end = beg + occupied_voxels.cols();

    thrust::device_ptr<int> out(occupied_voxels.ptr(2));
    thrust::exclusive_scan(beg, end, out);

    int lastElement, lastScanElement;

    DeviceArray<int> last_elem(occupied_voxels.ptr(1) + occupied_voxels.cols() - 1, 1);
    DeviceArray<int> last_scan(occupied_voxels.ptr(2) + occupied_voxels.cols() - 1, 1);

    last_elem.download(&lastElement);
    last_scan.download(&lastScanElement);

    return lastElement + lastScanElement;
}

__kf_device__ float3 kfusion::device::TrianglesGenerator::get_node_coo(int x, int y, int z) const {
    float3 coo = make_float3(x, y, z);
    coo += 0.5f;  // shift to volume cell center;

    coo.x *= cell_size.x;
    coo.y *= cell_size.y;
    coo.z *= cell_size.z;

    return coo;
}

__kf_device__ float3 kfusion::device::TrianglesGenerator::vertex_interp(float3 p0, float3 p1, float f0,
                                                                        float f1) const {
    float t = (isoValue - f0) / (f1 - f0 + 1e-15f);
    float x = p0.x + t * (p1.x - p0.x);
    float y = p0.y + t * (p1.y - p0.y);
    float z = p0.z + t * (p1.z - p0.z);
    return make_float3(x, y, z);
}

__kf_device__ void kfusion::device::TrianglesGenerator::operator()() const {
    int tid = threadIdx.x;
    int idx = (blockIdx.y * MAX_GRID_SIZE_X + blockIdx.x) * CTA_SIZE + tid;

    if (idx >= voxels_count)
        return;

    int voxel = occupied_voxels[idx];

    int z = voxel / (volume.dims.x * volume.dims.y);
    int y = (voxel - z * volume.dims.x * volume.dims.y) / volume.dims.x;
    int x = (voxel - z * volume.dims.x * volume.dims.y) - y * volume.dims.x;

    float f[8];
    int cubeindex = computeCubeIndex(x, y, z, f);

    /* calculate cell vertex positions */
    float3 v[8];
    v[0] = get_node_coo(x, y, z);
    v[1] = get_node_coo(x + 1, y, z);
    v[2] = get_node_coo(x + 1, y + 1, z);
    v[3] = get_node_coo(x, y + 1, z);
    v[4] = get_node_coo(x, y, z + 1);
    v[5] = get_node_coo(x + 1, y, z + 1);
    v[6] = get_node_coo(x + 1, y + 1, z + 1);
    v[7] = get_node_coo(x, y + 1, z + 1);

    /* find vertices where surface intersects the cube; use shared memory to avoid using local */
    __shared__ float3 vertlist[12][CTA_SIZE];

    vertlist[0][tid]  = vertex_interp(v[0], v[1], f[0], f[1]);
    vertlist[1][tid]  = vertex_interp(v[1], v[2], f[1], f[2]);
    vertlist[2][tid]  = vertex_interp(v[2], v[3], f[2], f[3]);
    vertlist[3][tid]  = vertex_interp(v[3], v[0], f[3], f[0]);
    vertlist[4][tid]  = vertex_interp(v[4], v[5], f[4], f[5]);
    vertlist[5][tid]  = vertex_interp(v[5], v[6], f[5], f[6]);
    vertlist[6][tid]  = vertex_interp(v[6], v[7], f[6], f[7]);
    vertlist[7][tid]  = vertex_interp(v[7], v[4], f[7], f[4]);
    vertlist[8][tid]  = vertex_interp(v[0], v[4], f[0], f[4]);
    vertlist[9][tid]  = vertex_interp(v[1], v[5], f[1], f[5]);
    vertlist[10][tid] = vertex_interp(v[2], v[6], f[2], f[6]);
    vertlist[11][tid] = vertex_interp(v[3], v[7], f[3], f[7]);
    __syncthreads();

    /* output triangle vertices and normals */
    int numVerts = tex1Dfetch(numVertsTex, cubeindex);

    for (int i = 0; i < numVerts; i += 3) {
        int index = vertex_ofssets[idx] + i;

        int v1 = tex1Dfetch(triTex, (cubeindex * 16) + i + 0);
        int v2 = tex1Dfetch(triTex, (cubeindex * 16) + i + 1);
        int v3 = tex1Dfetch(triTex, (cubeindex * 16) + i + 2);

        /* NOTE (dig15): the surface could be smoother if the normal weren't the same for each vertex of the triangle */
        float3 n = normalized(cross(vertlist[v3][tid] - vertlist[v1][tid], vertlist[v2][tid] - vertlist[v1][tid]));

        store_point(outputVertices, index + 0, pose * vertlist[v1][tid]);
        store_point(outputNormals, index + 0, n);

        store_point(outputVertices, index + 1, pose * vertlist[v2][tid]);
        store_point(outputNormals, index + 1, n);

        store_point(outputVertices, index + 2, pose * vertlist[v3][tid]);
        store_point(outputNormals, index + 2, n);
    }
}

__kf_device__ void kfusion::device::TrianglesGenerator::store_point(float4* ptr, int index,
                                                                    const float3& vertex) const {
    ptr[index] = make_float4(vertex.x, -vertex.y, -vertex.z, 1.f);
}

__global__ void trianglesGeneratorKernel(const TrianglesGenerator tg) { tg(); }

void generateTriangles(const TsdfVolume& volume, const DeviceArray2D<int>& occupied_voxels, const float3& volume_size,
                       const Aff3f& pose, DeviceArray<PointType>& outputVertices,
                       DeviceArray<PointType>& outputNormals) {
    int device;
    cudaSafeCall(cudaGetDevice(&device));

    cudaDeviceProp prop;
    cudaSafeCall(cudaGetDeviceProperties(&prop, device));

    typedef TrianglesGenerator Tg;
    Tg tg(volume);

    tg.occupied_voxels = occupied_voxels.ptr(0);
    tg.vertex_ofssets  = occupied_voxels.ptr(2);
    tg.voxels_count    = occupied_voxels.cols();
    tg.cell_size.x     = volume_size.x / volume.dims.x;
    tg.cell_size.y     = volume_size.y / volume.dims.y;
    tg.cell_size.z     = volume_size.z / volume.dims.z;
    tg.outputVertices  = outputVertices;
    tg.outputNormals   = outputNormals;

    tg.pose = pose;

    int block_size = 256;
    int blocks_num = divUp(tg.voxels_count, block_size);

    dim3 block(block_size);
    dim3 grid(min(blocks_num, Tg::MAX_GRID_SIZE_X), divUp(blocks_num, Tg::MAX_GRID_SIZE_X));

    trianglesGeneratorKernel<<<grid, block>>>(tg);
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaDeviceSynchronize());
}
}  // namespace device
}  // namespace kfusion


================================================
FILE: src/kfusion/cuda/proj_icp.cu
================================================
#include <kfusion/cuda/device.hpp>
#include <kfusion/cuda/texture_binder.hpp>

namespace kfusion {
namespace device {
texture<ushort, 2> dprev_tex;
texture<Normal, 2> nprev_tex;
texture<Point, 2> vprev_tex;

struct ComputeIcpHelper::Policy {
    enum {
        CTA_SIZE_X = 32,
        CTA_SIZE_Y = 8,
        CTA_SIZE   = CTA_SIZE_X * CTA_SIZE_Y,

        B              = 6,
        COLS           = 6,
        ROWS           = 6,
        DIAG           = 6,
        UPPER_DIAG_MAT = (COLS * ROWS - DIAG) / 2 + DIAG,
        TOTAL          = UPPER_DIAG_MAT + B,

        FINAL_REDUCE_CTA_SIZE = 256,
        FINAL_REDUCE_STRIDE   = FINAL_REDUCE_CTA_SIZE
    };
};

__kf_device__ float2 ComputeIcpHelper::proj(const float3& p) const {
    float2 coo;
    coo.x = __fmaf_rn(f.x, __fdividef(p.x, p.z), c.x);
    coo.y = __fmaf_rn(f.y, __fdividef(p.y, p.z), c.y);
    return coo;
}

__kf_device__ float3 ComputeIcpHelper::reproj(float u, float v, float z) const {
    float x = z * (u - c.x) * finv.x;
    float y = z * (v - c.y) * finv.y;
    return make_float3(x, y, z);
}

#if defined USE_DEPTH
__kf_device__ int ComputeIcpHelper::find_coresp(int x, int y, float3& nd, float3& d, float3& s) const {
    int src_z = dcurr(y, x);
    if (src_z == 0)
        return 40;

    s = aff * reproj(x, y, src_z * 0.001f);

    float2 coo = proj(s);
    if (s.z <= 0 || coo.x < 0 || coo.y < 0 || coo.x >= cols || coo.y >= rows)
        return 80;

    int dst_z = tex2D(dprev_tex, coo.x, coo.y);
    if (dst_z == 0)
        return 120;

    d = reproj(coo.x, coo.y, dst_z * 0.001f);

    float dist2 = norm_sqr(s - d);
    if (dist2 > dist2_thres)
        return 160;

    float3 ns = aff.R * tr(ncurr(y, x));
    nd        = tr(tex2D(nprev_tex, coo.x, coo.y));

    float cosine = fabs(dot(ns, nd));
    if (cosine < min_cosine)
        return 200;
    return 0;
}
#else
__kf_device__ int ComputeIcpHelper::find_coresp(int x, int y, float3& nd, float3& d, float3& s) const {
    s = tr(vcurr(y, x));
    if (isnan(s.x))
        return 40;

    s = aff * s;

    float2 coo = proj(s);
    if (s.z <= 0 || coo.x < 0 || coo.y < 0 || coo.x >= cols || coo.y >= rows)
        return 80;

    d = tr(tex2D(vprev_tex, coo.x, coo.y));
    if (isnan(d.x))
        return 120;

    float dist2 = norm_sqr(s - d);
    if (dist2 > dist2_thres)
        return 160;

    float3 ns = aff.R * tr(ncurr(y, x));
    nd        = tr(tex2D(nprev_tex, coo.x, coo.y));

    float cosine = fabs(dot(ns, nd));
    if (cosine < min_cosine)
        return 200;
    return 0;
}
#endif

__kf_device__ void ComputeIcpHelper::partial_reduce(const float row[7], PtrStep<float>& partial_buf) const {
    volatile __shared__ float smem[Policy::CTA_SIZE];
    int tid = Block::flattenedThreadId();

    float* pos  = partial_buf.data + blockIdx.x + gridDim.x * blockIdx.y;
    size_t step = partial_buf.step / sizeof(float);

#define STOR            \
    if (tid == 0) {     \
        *pos = smem[0]; \
        pos += step;    \
    }

    __syncthreads();
    smem[tid] = row[0] * row[0];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[0] * row[1];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[0] * row[2];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[0] * row[3];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[0] * row[4];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR __syncthreads();
    smem[tid] = row[0] * row[5];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[0] * row[6];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR
    ////////////////////////////////

    __syncthreads();
    smem[tid] = row[1] * row[1];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[1] * row[2];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[1] * row[3];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[1] * row[4];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[1] * row[5];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[1] * row[6];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR
    ////////////////////////////////

    __syncthreads();
    smem[tid] = row[2] * row[2];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[2] * row[3];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[2] * row[4];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[2] * row[5];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[2] * row[6];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR
    ////////////////////////////////

    __syncthreads();
    smem[tid] = row[3] * row[3];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[3] * row[4];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[3] * row[5];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[3] * row[6];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR
    ///////////////////////////////////////////////////

    __syncthreads();
    smem[tid] = row[4] * row[4];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[4] * row[5];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[4] * row[6];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    ///////////////////////////////////////////////////

    __syncthreads();
    smem[tid] = row[5] * row[5];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR

    __syncthreads();
    smem[tid] = row[5] * row[6];
    __syncthreads();

    Block::reduce<Policy::CTA_SIZE>(smem, plus());

    STOR
}

__global__ void icp_helper_kernel(const ComputeIcpHelper helper, PtrStep<float> partial_buf) {
    int x = threadIdx.x + blockIdx.x * ComputeIcpHelper::Policy::CTA_SIZE_X;
    int y = threadIdx.y + blockIdx.y * ComputeIcpHelper::Policy::CTA_SIZE_Y;

    float3 n, d, s;
    int filtered = (x < helper.cols && y < helper.rows) ? helper.find_coresp(x, y, n, d, s) : 1;
    // if (x < helper.cols && y < helper.rows) mask(y, x) = filtered;

    float row[7];

    if (!filtered) {
        *(float3*) &row[0] = cross(s, n);
        *(float3*) &row[3] = n;
        row[6]             = dot(n, d - s);
    } else
        row[0] = row[1] = row[2] = row[3] = row[4] = row[5] = row[6] = 0.f;

    helper.partial_reduce(row, partial_buf);
}

__global__ void icp_final_reduce_kernel(const PtrStep<float> partial_buf, const int length, float* final_buf) {
    const float* beg = partial_buf.ptr(blockIdx.x);
    const float* end = beg + length;

    int tid = threadIdx.x;

    float sum = 0.f;
    for (const float* t = beg + tid; t < end; t += ComputeIcpHelper::Policy::FINAL_REDUCE_STRIDE)
        sum += *t;

    __shared__ float smem[ComputeIcpHelper::Policy::FINAL_REDUCE_CTA_SIZE];

    smem[tid] = sum;

    __syncthreads();

    Block::reduce<ComputeIcpHelper::Policy::FINAL_REDUCE_CTA_SIZE>(smem, plus());

    if (tid == 0)
        final_buf[blockIdx.x] = smem[0];
}
}  // namespace device
}  // namespace kfusion

void kfusion::device::ComputeIcpHelper::operator()(const Depth& dprev, const Normals& nprev,
                                                   DeviceArray2D<float>& buffer, float* data, cudaStream_t s) {
    dprev_tex.filterMode = cudaFilterModePoint;
    nprev_tex.filterMode = cudaFilterModePoint;
    TextureBinder dprev_binder(dprev, dprev_tex);
    TextureBinder nprev_binder(nprev, nprev_tex);

    dim3 block(Policy::CTA_SIZE_X, Policy::CTA_SIZE_Y);
    dim3 grid(divUp((int) cols, block.x), divUp((int) rows, block.y));

    int partials_count = (int) (grid.x * grid.y);
    allocate_buffer(buffer, partials_count);

    icp_helper_kernel<<<grid, block, 0, s>>>(*this, buffer);
    cudaSafeCall(cudaGetLastError());

    int b = Policy::FINAL_REDUCE_CTA_SIZE;
    int g = Policy::TOTAL;
    icp_final_reduce_kernel<<<g, b, 0, s>>>(buffer, partials_count, buffer.ptr(Policy::TOTAL));
    cudaSafeCall(cudaGetLastError());

    cudaSafeCall(
        cudaMemcpyAsync(data, buffer.ptr(Policy::TOTAL), Policy::TOTAL * sizeof(float), cudaMemcpyDeviceToHost, s));
    cudaSafeCall(cudaGetLastError());
}

void kfusion::device::ComputeIcpHelper::operator()(const Points& vprev, const Normals& nprev,
                                                   DeviceArray2D<float>& buffer, float* data, cudaStream_t s) {
    dprev_tex.filterMode = cudaFilterModePoint;
    nprev_tex.filterMode = cudaFilterModePoint;
    TextureBinder vprev_binder(vprev, vprev_tex);
    TextureBinder nprev_binder(nprev, nprev_tex);

    dim3 block(Policy::CTA_SIZE_X, Policy::CTA_SIZE_Y);
    dim3 grid(divUp((int) cols, block.x), divUp((int) rows, block.y));

    int partials_count = (int) (grid.x * grid.y);
    allocate_buffer(buffer, partials_count);

    icp_helper_kernel<<<grid, block, 0, s>>>(*this, buffer);
    cudaSafeCall(cudaGetLastError());

    int b = Policy::FINAL_REDUCE_CTA_SIZE;
    int g = Policy::TOTAL;
    icp_final_reduce_kernel<<<g, b, 0, s>>>(buffer, partials_count, buffer.ptr(Policy::TOTAL));
    cudaSafeCall(cudaGetLastError());

    cudaSafeCall(
        cudaMemcpyAsync(data, buffer.ptr(Policy::TOTAL), Policy::TOTAL * sizeof(float), cudaMemcpyDeviceToHost, s));
    cudaSafeCall(cudaGetLastError());
}

void kfusion::device::ComputeIcpHelper::allocate_buffer(DeviceArray2D<float>& buffer, int partials_count) {
    if (partials_count < 0) {
        const int input_cols = 640;
        const int input_rows = 480;

        int gx = divUp(input_cols, Policy::CTA_SIZE_X);
        int gy = divUp(input_rows, Policy::CTA_SIZE_Y);

        partials_count = gx * gy;
    }

    int min_rows = Policy::TOTAL + 1;
    int min_cols = max(partials_count, Policy::TOTAL);

    if (buffer.rows() < min_rows || buffer.cols() < min_cols)
        buffer.create(min_rows, min_cols);
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// ComputeIcpHelper::PageLockHelper

kfusion::device::ComputeIcpHelper::PageLockHelper::PageLockHelper() : data(0) {
    cudaSafeCall(cudaMallocHost((void**) &data, Policy::TOTAL * sizeof(float)));
}

kfusion::device::ComputeIcpHelper::PageLockHelper::~PageLockHelper() {
    cudaSafeCall(cudaFreeHost(data));
    data = 0;
}


================================================
FILE: src/kfusion/cuda/tsdf_volume.cu
================================================
/* kfusion includes */
#include <kfusion/cuda/device.hpp>
#include <kfusion/cuda/texture_binder.hpp>

/* sobfu includes */
#include <sobfu/cuda/utils.hpp>

/* cuda includes */
#include <curand.h>
#include <curand_kernel.h>

/* thrust includes */
#include <thrust/device_vector.h>
#include <thrust/extrema.h>

using namespace kfusion::device;

///////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Volume initialization

namespace kfusion {
namespace device {
__global__ void clear_volume_kernel(TsdfVolume tsdf) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x < tsdf.dims.x && y < tsdf.dims.y) {
        float2* beg = tsdf.beg(x, y);
        float2* end = beg + tsdf.dims.x * tsdf.dims.y * tsdf.dims.z;

        for (float2* pos = beg; pos != end; pos = tsdf.zstep(pos))
            *pos = make_float2(0.f, 0.f);
    }
}
}  // namespace device
}  // namespace kfusion

void kfusion::device::clear_volume(TsdfVolume& volume) {
    dim3 block(64, 16);
    dim3 grid(1, 1, 1);
    grid.x = divUp(volume.dims.x, block.x);
    grid.y = divUp(volume.dims.y, block.y);

    clear_volume_kernel<<<grid, block>>>(volume);
    cudaSafeCall(cudaGetLastError());
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Volume integration

namespace kfusion {
namespace device {
texture<float, 2> dists_tex(0, cudaFilterModePoint, cudaAddressModeBorder,
                            cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat));

struct TsdfIntegrator {
    Projector proj;
    Aff3f vol2cam;

    int2 dists_size;

    __kf_device__ void operator()(TsdfVolume& volume) const {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x >= volume.dims.x || y >= volume.dims.y) {
            return;
        }

        float3 vc     = make_float3(x * volume.voxel_size.x + volume.voxel_size.x / 2.f,
                                y * volume.voxel_size.y + volume.voxel_size.y / 2.f, volume.voxel_size.z / 2.f);
        float3 vc_cam = vol2cam * vc; /* transfrom from volume coos to camera coos */

        float3 zstep = make_float3(0.f, 0.f, volume.voxel_size.z);
        float2* vptr = volume.beg(x, y);
        for (int i = 0; i <= volume.dims.z - 1; ++i, vc_cam += zstep, vptr = volume.zstep(vptr)) {
            /* project the voxel centre onto the depth map */
            float2 coo = proj(vc_cam);
            if (coo.x < 0 || coo.y < 0 || coo.x >= dists_size.x || coo.y >= dists_size.y) {
                continue;
            }

            float Dp = tex2D(dists_tex, coo.x, coo.y);
            if (Dp <= 0.f || vc_cam.z <= 0) {
                continue;
            }

            /* get the psdf value */
            float psdf = Dp - vc_cam.z;
            /* get the weight */
            float weight = (psdf > -volume.eta) ? 1.f : 0.f;

            if (psdf >= volume.trunc_dist) {
                *vptr = make_float2(1.f, weight);
            } else if (psdf <= -volume.trunc_dist) {
                *vptr = make_float2(-1.f, weight);
            } else {
                *vptr = make_float2(__fdividef(psdf, volume.trunc_dist), weight);
            }
        }
    }

    __kf_device__ void operator()(TsdfVolume& phi_global, TsdfVolume& phi_n_psi) const {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x >= phi_global.dims.x || y >= phi_global.dims.y) {
            return;
        }

        float2* pos_global = phi_global.beg(x, y);
        float2* pos_n_psi  = phi_n_psi.beg(x, y);

        for (int i = 0; i <= phi_global.dims.z - 1;
             ++i, pos_global = phi_global.zstep(pos_global), pos_n_psi = phi_n_psi.zstep(pos_n_psi)) {
            float2 tsdf = *pos_n_psi;

            if (tsdf.y == 0.f || (tsdf.y == 1.f && (tsdf.x == 0.f || tsdf.x == -1.f))) {
                continue;
            }

            float2 tsdf_prev = *pos_global;

            float tsdf_new   = __fdividef(__fmaf_rn(tsdf_prev.y, tsdf_prev.x, tsdf.x), tsdf_prev.y + 1.f);
            float weight_new = fminf(tsdf_prev.y + 1.f, (float) phi_global.max_weight);

            /* pack and write */
            *pos_global = make_float2(tsdf_new, weight_new);
        }
    }
};  // namespace device

__global__ void integrate_kernel(const TsdfIntegrator integrator, TsdfVolume volume) { integrator(volume); }

__global__ void integrate_kernel(const TsdfIntegrator integrator, TsdfVolume phi_global, TsdfVolume phi_n_psi) {
    integrator(phi_global, phi_n_psi);
};
}  // namespace device
}  // namespace kfusion

void kfusion::device::integrate(const PtrStepSz<float>& dists, TsdfVolume& volume, const Aff3f& aff,
                                const Projector& proj) {
    /* init tsdf */
    TsdfIntegrator ti;
    ti.dists_size = make_int2(dists.cols, dists.rows);
    ti.vol2cam    = aff;
    ti.proj       = proj;

    dists_tex.filterMode     = cudaFilterModePoint;
    dists_tex.addressMode[0] = cudaAddressModeBorder;
    dists_tex.addressMode[1] = cudaAddressModeBorder;
    dists_tex.addressMode[2] = cudaAddressModeBorder;
    TextureBinder binder(dists, dists_tex, cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat));
    (void) binder;

    dim3 block(64, 16);
    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));

    integrate_kernel<<<grid, block>>>(ti, volume);
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaDeviceSynchronize());
}

void kfusion::device::integrate(TsdfVolume& phi_global, TsdfVolume& phi_n_psi) {
    TsdfIntegrator ti;

    dim3 block(64, 16);
    dim3 grid(divUp(phi_global.dims.x, block.x), divUp(phi_global.dims.y, block.y));

    integrate_kernel<<<grid, block>>>(ti, phi_global, phi_n_psi);
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaDeviceSynchronize());
}

namespace kfusion {
namespace device {

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// signed distance fields for various primitives

__global__ void init_box_kernel(TsdfVolume volume, const float3 b) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= volume.dims.x || y >= volume.dims.y)
        return;

    /* centering */
    float3 c = make_float3(volume.dims.x / 2.f * volume.voxel_size.x, volume.dims.y / 2.f * volume.voxel_size.y,
                           volume.dims.z / 2.f * volume.voxel_size.z);

    float3 vc = make_float3(x * volume.voxel_size.x + volume.voxel_size.x / 2.f,
                            y * volume.voxel_size.y + volume.voxel_size.y / 2.f, volume.voxel_size.z / 2.f) -
                c;
    float3 zstep = make_float3(0.f, 0.f, volume.voxel_size.z);

    float2* vptr = volume.beg(x, y);
    for (int i = 0; i < volume.dims.z; vc += zstep, vptr = volume.zstep(vptr), ++i) {
        float3 d = make_float3(fabs(vc.x), fabs(vc.y), fabs(vc.z)) - b;

        float sdf =
            fmin(fmax(d.x, fmax(d.y, d.z)), 0.f) + norm(make_float3(fmax(d.x, 0.f), fmax(d.y, 0.f), fmax(d.z, 0.f)));
        float weight = 1.f;

        if (sdf >= volume.trunc_dist) {
            *vptr = make_float2(1.f, weight);
        } else if (sdf <= -volume.trunc_dist) {
            *vptr = make_float2(-1.f, weight);
        } else {
            *vptr = make_float2(__fdividef(sdf, volume.trunc_dist), weight);
        }
    }
}

__global__ void init_ellipsoid_kernel(TsdfVolume volume, const float3 r) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= volume.dims.x || y >= volume.dims.y)
        return;

    /* centering */
    float3 c = make_float3(volume.dims.x / 2.f * volume.voxel_size.x, volume.dims.y / 2.f * volume.voxel_size.y,
                           volume.dims.z / 2.f * volume.voxel_size.z);

    float3 vc = make_float3(x * volume.voxel_size.x + volume.voxel_size.x / 2.f,
                            y * volume.voxel_size.y + volume.voxel_size.y / 2.f, volume.voxel_size.z / 2.f) -
                c;
    float3 zstep = make_float3(0.f, 0.f, volume.voxel_size.z);

    float2* vptr = volume.beg(x, y);
    for (int i = 0; i < volume.dims.z; vc += zstep, vptr = volume.zstep(vptr), ++i) {
        float k0 = norm(make_float3(vc.x / r.x, vc.y / r.y, vc.z / r.z));
        float k1 = norm(make_float3(vc.x / (r.x * r.x), vc.y / (r.y * r.y), vc.z / (r.z * r.z)));

        float sdf    = k0 * (k0 - 1.f) / k1;
        float weight = 1.f;

        if (sdf >= volume.trunc_dist) {
            *vptr = make_float2(1.f, weight);
        } else if (sdf <= -volume.trunc_dist) {
            *vptr = make_float2(-1.f, weight);
        } else {
            *vptr = make_float2(__fdividef(sdf, volume.trunc_dist), weight);
        }
    }
}

__global__ void init_sphere_kernel(TsdfVolume volume, float3 centre, float radius) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= volume.dims.x || y >= volume.dims.y)
        return;

    float3 vc    = make_float3(x * volume.voxel_size.x + volume.voxel_size.x / 2.f,
                            y * volume.voxel_size.y + volume.voxel_size.y / 2.f, volume.voxel_size.z / 2.f);
    float3 zstep = make_float3(0.f, 0.f, volume.voxel_size.z);

    float2* vptr = volume.beg(x, y);
    for (int i = 0; i < volume.dims.z; vc += zstep, vptr = volume.zstep(vptr), ++i) {
        float d = sqrtf(powf(vc.x - centre.x, 2) + powf(vc.y - centre.y, 2) + powf(vc.z - centre.z, 2));

        float sdf    = d - radius;
        float weight = (sdf > -volume.eta) ? 1.f : 0.f;

        if (sdf >= volume.trunc_dist) {
            *vptr = make_float2(1.f, weight);
        } else if (sdf <= -volume.trunc_dist) {
            *vptr = make_float2(-1.f, weight);
        } else {
            *vptr = make_float2(__fdividef(sdf, volume.trunc_dist), weight);
        }
    }
}

__global__ void init_plane_kernel(TsdfVolume volume, float z) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= volume.dims.x || y >= volume.dims.y)
        return;

    float3 vc    = make_float3(x * volume.voxel_size.x + volume.voxel_size.x / 2.f,
                            y * volume.voxel_size.y + volume.voxel_size.y / 2.f, volume.voxel_size.z / 2.f);
    float3 zstep = make_float3(0.f, 0.f, volume.voxel_size.z);

    float2* vptr = volume.beg(x, y);
    for (int i = 0; i < volume.dims.z; vc += zstep, vptr = volume.zstep(vptr), ++i) {
        float sdf    = vc.z - z;
        float weight = 1.f;

        if (sdf >= volume.trunc_dist) {
            *vptr = make_float2(1.f, weight);
        } else if (sdf <= -volume.trunc_dist) {
            *vptr = make_float2(-1.f, weight);
        } else {
            *vptr = make_float2(__fdividef(sdf, volume.trunc_dist), weight);
        }
    }
}

__global__ void init_torus_kernel(TsdfVolume volume, const float2 t) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= volume.dims.x || y >= volume.dims.y)
        return;

    /* centering */
    float3 c = make_float3(volume.dims.x / 2.f * volume.voxel_size.x, volume.dims.y / 2.f * volume.voxel_size.y,
                           volume.dims.z / 2.f * volume.voxel_size.z);

    float3 vc = make_float3(x * volume.voxel_size.x + volume.voxel_size.x / 2.f,
                            y * volume.voxel_size.y + volume.voxel_size.y / 2.f, volume.voxel_size.z / 2.f) -
                c;
    float3 zstep = make_float3(0.f, 0.f, volume.voxel_size.z);

    float2* vptr = volume.beg(x, y);
    for (int i = 0; i < volume.dims.z; vc += zstep, vptr = volume.zstep(vptr), ++i) {
        float2 q = make_float2(norm(make_float2(vc.x, vc.z)) - t.x, vc.y);

        float sdf    = norm(q) - t.y;
        float weight = 1.f;

        if (sdf >= volume.trunc_dist) {
            *vptr = make_float2(1.f, weight);
        } else if (sdf <= -volume.trunc_dist) {
            *vptr = make_float2(-1.f, weight);
        } else {
            *vptr = make_float2(__fdividef(sdf, volume.trunc_dist), weight);
        }
    }
}

}  // namespace device
}  // namespace kfusion

void kfusion::device::init_box(TsdfVolume& volume, const float3& b) {
    dim3 block(64, 16);
    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));

    init_box_kernel<<<grid, block>>>(volume, b);
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaDeviceSynchronize());
}

void kfusion::device::init_ellipsoid(TsdfVolume& volume, const float3& r) {
    dim3 block(64, 16);
    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));

    init_ellipsoid_kernel<<<grid, block>>>(volume, r);
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaDeviceSynchronize());
}

void kfusion::device::init_plane(TsdfVolume& volume, const float& z) {
    dim3 block(64, 16);
    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));

    init_plane_kernel<<<grid, block>>>(volume, z);
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaDeviceSynchronize());
}

void kfusion::device::init_sphere(TsdfVolume& volume, const float3& centre, const float& radius) {
    dim3 block(64, 16);
    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));

    init_sphere_kernel<<<grid, block>>>(volume, centre, radius);
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaDeviceSynchronize());
}

void kfusion::device::init_torus(TsdfVolume& volume, const float2& t) {
    dim3 block(64, 16);
    dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y));

    init_torus_kernel<<<grid, block>>>(volume, t);
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaDeviceSynchronize());
}


================================================
FILE: src/kfusion/device_memory.cpp
================================================
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <kfusion/cuda/device_memory.hpp>
#include <kfusion/safe_call.hpp>

void kfusion::cuda::error(const char *error_string, const char *file, const int line, const char *func) {
    std::cout << "error: " << error_string << "\t" << file << ":" << line << std::endl;
    exit(0);
}

//////////////////////////    XADD    ///////////////////////////////

#ifdef __GNUC__

#if __GNUC__ * 10 + __GNUC_MINOR__ >= 42

#if !defined WIN32 && (defined __i486__ || defined __i586__ || defined __i686__ || defined __MMX__ || \
                       defined __SSE__ || defined __ppc__)
#define CV_XADD __sync_fetch_and_add
#else
#include <ext/atomicity.h>
#define CV_XADD __gnu_cxx::__exchange_and_add
#endif
#else
#include <bits/atomicity.h>
#if __GNUC__ * 10 + __GNUC_MINOR__ >= 34
#define CV_XADD __gnu_cxx::__exchange_and_add
#else
#define CV_XADD __exchange_and_add
#endif
#endif

#elif defined WIN32 || defined _WIN32
#include <intrin.h>
#define CV_XADD(addr, delta) _InterlockedExchangeAdd((long volatile *) (addr), (delta))
#else

template <typename _Tp>
static inline _Tp CV_XADD(_Tp *addr, _Tp delta) {
    int tmp = *addr;
    *addr += delta;
    return tmp;
}

#endif

////////////////////////    DeviceArray    /////////////////////////////

kfusion::cuda::DeviceMemory::DeviceMemory() : data_(0), sizeBytes_(0), refcount_(0) {}
kfusion::cuda::DeviceMemory::DeviceMemory(void *ptr_arg, size_t sizeBytes_arg)
    : data_(ptr_arg), sizeBytes_(sizeBytes_arg), refcount_(0) {}
kfusion::cuda::DeviceMemory::DeviceMemory(size_t sizeBtes_arg) : data_(0), sizeBytes_(0), refcount_(0) {
    create(sizeBtes_arg);
}
kfusion::cuda::DeviceMemory::~DeviceMemory() { release(); }

kfusion::cuda::DeviceMemory::DeviceMemory(const DeviceMemory &other_arg)
    : data_(other_arg.data_), sizeBytes_(other_arg.sizeBytes_), refcount_(other_arg.refcount_) {
    if (refcount_)
        CV_XADD(refcount_, 1);
}

kfusion::cuda::DeviceMemory &kfusion::cuda::DeviceMemory::operator=(const kfusion::cuda::DeviceMemory &other_arg) {
    if (this != &other_arg) {
        if (other_arg.refcount_)
            CV_XADD(other_arg.refcount_, 1);
        release();

        data_      = other_arg.data_;
        sizeBytes_ = other_arg.sizeBytes_;
        refcount_  = other_arg.refcount_;
    }
    return *this;
}

void kfusion::cuda::DeviceMemory::create(size_t sizeBytes_arg) {
    if (sizeBytes_arg == sizeBytes_)
        return;

    if (sizeBytes_arg > 0) {
        if (data_)
            release();

        sizeBytes_ = sizeBytes_arg;

        cudaSafeCall(cudaMalloc(&data_, sizeBytes_));

        // refcount_ = (int*)cv::fastMalloc(sizeof(*refcount_));
        refcount_  = new int;
        *refcount_ = 1;
    }
}

void kfusion::cuda::DeviceMemory::copyTo(DeviceMemory &other) const {
    if (empty())
        other.release();
    else {
        other.create(sizeBytes_);
        cudaSafeCall(cudaMemcpy(other.data_, data_, sizeBytes_, cudaMemcpyDeviceToDevice));
    }
}

void kfusion::cuda::DeviceMemory::release() {
    if (refcount_ && CV_XADD(refcount_, -1) == 1) {
        // cv::fastFree(refcount);
        delete refcount_;
        cudaSafeCall(cudaFree(data_));
    }
    data_      = 0;
    sizeBytes_ = 0;
    refcount_  = 0;
}

void kfusion::cuda::DeviceMemory::upload(const void *host_ptr_arg, size_t sizeBytes_arg) {
    create(sizeBytes_arg);
    cudaSafeCall(cudaMemcpy(data_, host_ptr_arg, sizeBytes_, cudaMemcpyHostToDevice));
}

void kfusion::cuda::DeviceMemory::download(void *host_ptr_arg) const {
    cudaSafeCall(cudaMemcpy(host_ptr_arg, data_, sizeBytes_, cudaMemcpyDeviceToHost));
}

void kfusion::cuda::DeviceMemory::swap(DeviceMemory &other_arg) {
    std::swap(data_, other_arg.data_);
    std::swap(sizeBytes_, other_arg.sizeBytes_);
    std::swap(refcount_, other_arg.refcount_);
}

bool kfusion::cuda::DeviceMemory::empty() const { return !data_; }
size_t kfusion::cuda::DeviceMemory::sizeBytes() const { return sizeBytes_; }

////////////////////////    DeviceArray2D    /////////////////////////////

kfusion::cuda::DeviceMemory2D::DeviceMemory2D() : data_(0), step_(0), colsBytes_(0), rows_(0), refcount_(0) {}

kfusion::cuda::DeviceMemory2D::DeviceMemory2D(int rows_arg, int colsBytes_arg)
    : data_(0), step_(0), colsBytes_(0), rows_(0), refcount_(0) {
    create(rows_arg, colsBytes_arg);
}

kfusion::cuda::DeviceMemory2D::DeviceMemory2D(int rows_arg, int colsBytes_arg, void *data_arg, size_t step_arg)
    : data_(data_arg), step_(step_arg), colsBytes_(colsBytes_arg), rows_(rows_arg), refcount_(0) {}

kfusion::cuda::DeviceMemory2D::~DeviceMemory2D() { release(); }

kfusion::cuda::DeviceMemory2D::DeviceMemory2D(const DeviceMemory2D &other_arg)
    : data_(other_arg.data_),
      step_(other_arg.step_),
      colsBytes_(other_arg.colsBytes_),
      rows_(other_arg.rows_),
      refcount_(other_arg.refcount_) {
    if (refcount_)
        CV_XADD(refcount_, 1);
}

kfusion::cuda::DeviceMemory2D &kfusion::cuda::DeviceMemory2D::operator=(
    const kfusion::cuda::DeviceMemory2D &other_arg) {
    if (this != &other_arg) {
        if (other_arg.refcount_)
            CV_XADD(other_arg.refcount_, 1);
        release();

        colsBytes_ = other_arg.colsBytes_;
        rows_      = other_arg.rows_;
        data_      = other_arg.data_;
        step_      = other_arg.step_;

        refcount_ = other_arg.refcount_;
    }
    return *this;
}

void kfusion::cuda::DeviceMemory2D::create(int rows_arg, int colsBytes_arg) {
    if (colsBytes_ == colsBytes_arg && rows_ == rows_arg)
        return;

    if (rows_arg > 0 && colsBytes_arg > 0) {
        if (data_)
            release();

        colsBytes_ = colsBytes_arg;
        rows_      = rows_arg;

        cudaSafeCall(cudaMallocPitch((void **) &data_, &step_, colsBytes_, rows_));

        // refcount = (int*)cv::fastMalloc(sizeof(*refcount));
        refcount_  = new int;
        *refcount_ = 1;
    }
}

void kfusion::cuda::DeviceMemory2D::release() {
    if (refcount_ && CV_XADD(refcount_, -1) == 1) {
        // cv::fastFree(refcount);
        delete refcount_;
        cudaSafeCall(cudaFree(data_));
    }

    colsBytes_ = 0;
    rows_      = 0;
    data_      = 0;
    step_      = 0;
    refcount_  = 0;
}

void kfusion::cuda::DeviceMemory2D::copyTo(DeviceMemory2D &other) const {
    if (empty())
        other.release();
    else {
        other.create(rows_, colsBytes_);
        cudaSafeCall(cudaMemcpy2D(other.data_, other.step_, data_, step_, colsBytes_, rows_, cudaMemcpyDeviceToDevice));
    }
}

void kfusion::cuda::DeviceMemory2D::upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg,
                                           int colsBytes_arg) {
    create(rows_arg, colsBytes_arg);
    cudaSafeCall(cudaMemcpy2D(data_, step_, host_ptr_arg, host_step_arg, colsBytes_, rows_, cudaMemcpyHostToDevice));
}

void kfusion::cuda::DeviceMemory2D::download(void *host_ptr_arg, size_t host_step_arg) const {
    cudaSafeCall(cudaMemcpy2D(host_ptr_arg, host_step_arg, data_, step_, colsBytes_, rows_, cudaMemcpyDeviceToHost));
}

void kfusion::cuda::DeviceMemory2D::swap(DeviceMemory2D &other_arg) {
    std::swap(data_, other_arg.data_);
    std::swap(step_, other_arg.step_);

    std::swap(colsBytes_, other_arg.colsBytes_);
    std::swap(rows_, other_arg.rows_);
    std::swap(refcount_, other_arg.refcount_);
}

bool kfusion::cuda::DeviceMemory2D::empty() const { return !data_; }
int kfusion::cuda::DeviceMemory2D::colsBytes() const { return colsBytes_; }
int kfusion::cuda::DeviceMemory2D::rows() const { return rows_; }
size_t kfusion::cuda::DeviceMemory2D::step() const { return step_; }


================================================
FILE: src/kfusion/imgproc.cpp
================================================
#include <kfusion/precomp.hpp>

void kfusion::cuda::depthBilateralFilter(const Depth &in, Depth &out, int kernel_size, float sigma_spatial,
                                         float sigma_depth) {
    out.create(in.rows(), in.cols());
    device::bilateralFilter(in, out, kernel_size, sigma_spatial, sigma_depth);
}

void kfusion::cuda::depthTruncation(Depth &depth, float threshold) { device::truncateDepth(depth, threshold); }

void kfusion::cuda::depthBuildPyramid(const Depth &depth, Depth &pyramid, float sigma_depth) {
    pyramid.create(depth.rows() / 2, depth.cols() / 2);
    device::depthPyr(depth, pyramid, sigma_depth);
}

void kfusion::cuda::waitAllDefaultStream() { cudaSafeCall(cudaDeviceSynchronize()); }

void kfusion::cuda::computeNormalsAndMaskDepth(const Intr &intr, Depth &depth, Normals &normals) {
    normals.create(depth.rows(), depth.cols());

    device::Reprojector reproj(intr.fx, intr.fy, intr.cx, intr.cy);

    device::Normals &n = (device::Normals &) normals;
    device::computeNormalsAndMaskDepth(reproj, depth, n);
}

void kfusion::cuda::computePointNormals(const Intr &intr, const Depth &depth, Cloud &points, Normals &normals) {
    points.create(depth.rows(), depth.cols());
    normals.create(depth.rows(), depth.cols());

    device::Reprojector reproj(intr.fx, intr.fy, intr.cx, intr.cy);

    device::Points &p  = (device::Points &) points;
    device::Normals &n = (device::Normals &) normals;
    device::computePointNormals(reproj, depth, p, n);
}

void kfusion::cuda::computeDists(const Depth &depth, Dists &dists, const Intr &intr) {
    dists.create(depth.rows(), depth.cols());
    device::compute_dists(depth, dists, make_float2(intr.fx, intr.fy), make_float2(intr.cx, intr.cy));
}

void kfusion::cuda::resizeDepthNormals(const Depth &depth, const Normals &normals, Depth &depth_out,
                                       Normals &normals_out) {
    depth_out.create(depth.rows() / 2, depth.cols() / 2);
    normals_out.create(normals.rows() / 2, normals.cols() / 2);

    device::Normals &nsrc = (device::Normals &) normals;
    device::Normals &ndst = (device::Normals &) normals_out;

    device::resizeDepthNormals(depth, nsrc, depth_out, ndst);
}

void kfusion::cuda::resizePointsNormals(const Cloud &points, const Normals &normals, Cloud &points_out,
                                        Normals &normals_out) {
    points_out.create(points.rows() / 2, points.cols() / 2);
    normals_out.create(normals.rows() / 2, normals.cols() / 2);

    device::Points &pi  = (device::Points &) points;
    device::Normals &ni = (device::Normals &) normals;

    device::Points &po  = (device::Points &) points_out;
    device::Normals &no = (device::Normals &) normals_out;

    device::resizePointsNormals(pi, ni, po, no);
}

void kfusion::cuda::rasteriseSurface(const Intr &intr, const Affine3f &vol2cam, const Surface &s, Cloud &vertices_out,
                                     Normals &normals_out) {
    device::Projector proj(intr.fx, intr.fy, intr.cx, intr.cy);
    device::Aff3f dev_vol2cam = device_cast<device::Aff3f>(vol2cam);

    /* convert warped triangle vertices & normals to point clouds */
    pcl::PointCloud<pcl::PointXYZ> vertices_warped;
    s.vertices.download(vertices_warped.points);

    pcl::PointCloud<pcl::Normal> normals_warped;
    s.normals.download(normals_warped.points);

    /* lay out the warped triangle vertices and normals in memory */
    std::vector<pcl::PointXYZ>
        triangle_vertices; /* the index of each entry in the vector, modulo 3, stores the 1st, 2nd, and 3rd vertex of a
                             triangle */
    std::vector<pcl::Normal> triangle_normals;
    for (size_t i = 0; i < vertices_warped.size() / 3; i++) {
        triangle_vertices.emplace_back(vertices_warped[i * 3 + 0]);
        triangle_vertices.emplace_back(vertices_warped[i * 3 + 1]);
        triangle_vertices.emplace_back(vertices_warped[i * 3 + 2]);

        triangle_normals.emplace_back(normals_warped[i * 3 + 0]);
        triangle_normals.emplace_back(normals_warped[i * 3 + 1]);
        triangle_normals.emplace_back(normals_warped[i * 3 + 2]);
    }

    kfusion::cuda::Vertices vertices;
    vertices.upload(triangle_vertices);

    kfusion::cuda::Norms normals;
    normals.upload(triangle_normals);

    kfusion::cuda::Surface surface;
    surface.vertices = vertices;
    surface.normals  = normals;

    /* convert to classes that can be used in cuda */
    const device::Surface &dev_surface = (const device::Surface &) surface;
    device::Points &vo                 = (device::Points &) vertices_out;
    device::Normals &no                = (device::Normals &) normals_out;

    device::rasteriseSurface(proj, dev_vol2cam, dev_surface, vo, no);
    waitAllDefaultStream();
}


================================================
FILE: src/kfusion/kinfu.cpp
================================================
#include <kfusion/internal.hpp>
#include <kfusion/precomp.hpp>

using namespace std;
using namespace kfusion;
using namespace kfusion::cuda;

static inline float deg2rad(float alpha) { return alpha * 0.017453293f; }

kfusion::KinFuParams kfusion::KinFuParams::default_params() {
    const int iters[] = {10, 5, 4, 0};
    const int levels  = sizeof(iters) / sizeof(iters[0]);

    KinFuParams p;

    p.cols = 640;  // pixels
    p.rows = 480;  // pixels
    p.intr = Intr(525.f, 525.f, p.cols / 2 - 0.5f, p.rows / 2 - 0.5f);

    p.volume_dims = Vec3i::all(512);  // number of voxels
    p.volume_size = Vec3f::all(3.f);  // meters
    p.volume_pose = Affine3f().translate(Vec3f(-p.volume_size[0] / 2, -p.volume_size[1] / 2, 0.5f));

    p.bilateral_sigma_depth   = 0.04f;  // meter
    p.bilateral_sigma_spatial = 4.5;    // pixels
    p.bilateral_kernel_size   = 7;      // pixels

    p.icp_truncate_depth_dist = 0.f;            // meters, disabled
    p.icp_dist_thres          = 0.1f;           // meters
    p.icp_angle_thres         = deg2rad(30.f);  // radians
    p.icp_iter_num.assign(iters, iters + levels);

    p.tsdf_min_camera_movement = 0.f;    // meters, disabled
    p.tsdf_trunc_dist          = 0.04f;  // meters;
    p.tsdf_max_weight          = 64;     // frames

    p.raycast_step_factor   = 0.75f;  // in voxel sizes
    p.gradient_delta_factor = 0.5f;   // in voxel sizes

    // p.light_pose = p.volume_pose.translation()/4; //meters
    p.light_pose = Vec3f::all(0.f);  // meters

    return p;
}

const kfusion::KinFuParams &kfusion::KinFu::params() const { return params_; }

kfusion::KinFuParams &kfusion::KinFu::params() { return params_; }

const kfusion::cuda::TsdfVolume &kfusion::KinFu::tsdf() const { return *volume_; }

kfusion::cuda::TsdfVolume &kfusion::KinFu::tsdf() { return *volume_; }

const kfusion::cuda::ProjectiveICP &kfusion::KinFu::icp() const { return *icp_; }

kfusion::cuda::ProjectiveICP &kfusion::KinFu::icp() { return *icp_; }

const kfusion::cuda::MarchingCubes &kfusion::KinFu::mc() const { return *mc_; }

kfusion::cuda::MarchingCubes &kfusion::KinFu::mc() { return *mc_; }

void kfusion::KinFu::allocate_buffers() {
    const int LEVELS = cuda::ProjectiveICP::MAX_PYRAMID_LEVELS;

    int cols = params_.cols;
    int rows = params_.rows;

    dists_.create(rows, cols);

    curr_.depth_pyr.resize(LEVELS);
    curr_.normals_pyr.resize(LEVELS);

    prev_.depth_pyr.resize(LEVELS);
    prev_.normals_pyr.resize(LEVELS);

    curr_.points_pyr.resize(LEVELS);
    prev_.points_pyr.resize(LEVELS);

    for (int i = 0; i < LEVELS; ++i) {
        curr_.depth_pyr[i].create(rows, cols);
        curr_.normals_pyr[i].create(rows, cols);

        prev_.depth_pyr[i].create(rows, cols);
        prev_.normals_pyr[i].create(rows, cols);

        curr_.points_pyr[i].create(rows, cols);
        prev_.points_pyr[i].create(rows, cols);

        cols /= 2;
        rows /= 2;
    }

    depths_.create(params_.rows, params_.cols);
    normals_.create(params_.rows, params_.cols);
    points_.create(params_.rows, params_.cols);
}

void kfusion::KinFu::reset() {
    if (frame_counter_)
        cout << "Reset" << endl;

    frame_counter_ = 0;
    poses_.clear();
    poses_.reserve(30000);
    poses_.push_back(Affine3f::Identity());
    volume_->clear();
}

kfusion::Affine3f kfusion::KinFu::getCameraPose(int time) const {
    if ((time > static_cast<int>(poses_.size())) || (time < 0)) {
        time = static_cast<int>(poses_.size()) - 1;
    }

    return poses_[time];
}


================================================
FILE: src/kfusion/marching_cubes.cpp
================================================
/* kinfu includes */
#include <kfusion/cuda/marching_cubes.hpp>
#include <kfusion/internal.hpp>
#include <kfusion/kinfu.hpp>
#include <kfusion/precomp.hpp>

/* sys headers */
#include <iostream>

extern const int edgeTable[256];
extern const int triTable[256][16];
extern const int numVertsTable[256];

kfusion::cuda::MarchingCubes::MarchingCubes() {
    edgeTable_.upload(edgeTable, 256);
    numVertsTable_.upload(numVertsTable, 256);
    triTable_.upload(&triTable[0][0], 256 * 16);
}

kfusion::cuda::MarchingCubes::~MarchingCubes() = default;

void kfusion::cuda::MarchingCubes::setPose(const cv::Affine3f& pose_) { this->pose = pose_; }

kfusion::cuda::Surface kfusion::cuda::MarchingCubes::run(const kfusion::cuda::TsdfVolume& volume,
                                                         kfusion::cuda::DeviceArray<pcl::PointXYZ>& vertices_buffer,
                                                         kfusion::cuda::DeviceArray<pcl::Normal>& normals_buffer) {
    if (vertices_buffer.empty()) {
        vertices_buffer.create(DEFAULT_TRIANGLES_BUFFER_SIZE);
    }
    if (normals_buffer.empty()) {
        normals_buffer.create(DEFAULT_TRIANGLES_BUFFER_SIZE);
    }

    occupied_voxels_buffer_.create(3, static_cast<int>(vertices_buffer.size() / 3));

    kfusion::device::bindTextures(edgeTable_, triTable_, numVertsTable_);

    int3 dims         = device_cast<int3>(volume.getDims());
    float3 voxel_size = device_cast<float3>(volume.getVoxelSize());
    float trunc_dist  = volume.getTruncDist();
    float eta         = volume.getEta();
    float max_weight  = volume.getMaxWeight();

    kfusion::device::TsdfVolume vol(volume.data().ptr<float2>(), dims, voxel_size, trunc_dist, eta, max_weight);

    int active_voxels = kfusion::device::getOccupiedVoxels(vol, occupied_voxels_buffer_);
    std::cout << "no. of active voxels: " << active_voxels << std::endl;

    if (!active_voxels) {
        kfusion::device::unbindTextures();
        return kfusion::cuda::Surface();
    }

    kfusion::cuda::DeviceArray2D<int> occupied_voxels(3, active_voxels, occupied_voxels_buffer_.ptr(),
                                                      occupied_voxels_buffer_.step());

    int total_vertices = kfusion::device::computeOffsetsAndTotalVertices(occupied_voxels);

    float3 volume_size              = device_cast<float3>(volume.getSize());
    const device::Aff3f pose_device = device_cast<device::Aff3f>(pose);

    kfusion::device::generateTriangles(vol, occupied_voxels, volume_size, pose_device,
                                       (kfusion::cuda::DeviceArray<kfusion::device::PointType>&) vertices_buffer,
                                       (kfusion::cuda::DeviceArray<kfusion::device::PointType>&) normals_buffer);

    kfusion::device::unbindTextures();

    kfusion::cuda::DeviceArray<pcl::PointXYZ> v(vertices_buffer.ptr(), total_vertices);
    kfusion::cuda::DeviceArray<pcl::Normal> n(normals_buffer.ptr(), total_vertices);

    kfusion::cuda::Surface surface;
    surface.vertices = v;
    surface.normals  = n;

    return surface;
}

/* edge table maps 8-bit flag representing which cube vertices are inside the isosurface to 12-bit number indicating
 * which edges are intersected */
const int edgeTable[256] = {
    0x0,   0x109, 0x203, 0x30a, 0x406, 0x50f, 0x605, 0x70c, 0x80c, 0x905, 0xa0f, 0xb06, 0xc0a, 0xd03, 0xe09, 0xf00,
    0x190, 0x99,  0x393, 0x29a, 0x596, 0x49f, 0x795, 0x69c, 0x99c, 0x895, 0xb9f, 0xa96, 0xd9a, 0xc93, 0xf99, 0xe90,
    0x230, 0x339, 0x33,  0x13a, 0x636, 0x73f, 0x435, 0x53c, 0xa3c, 0xb35, 0x83f, 0x936, 0xe3a, 0xf33, 0xc39, 0xd30,
    0x3a0, 0x2a9, 0x1a3, 0xaa,  0x7a6, 0x6af, 0x5a5, 0x4ac, 0xbac, 0xaa5, 0x9af, 0x8a6, 0xfaa, 0xea3, 0xda9, 0xca0,
    0x460, 0x569, 0x663, 0x76a, 0x66,  0x16f, 0x265, 0x36c, 0xc6c, 0xd65, 0xe6f, 0xf66, 0x86a, 0x963, 0xa69, 0xb60,
    0x5f0, 0x4f9, 0x7f3, 0x6fa, 0x1f6, 0xff,  0x3f5, 0x2fc, 0xdfc, 0xcf5, 0xfff, 0xef6, 0x9fa, 0x8f3, 0xbf9, 0xaf0,
    0x650, 0x759, 0x453, 0x55a, 0x256, 0x35f, 0x55,  0x15c, 0xe5c, 0xf55, 0xc5f, 0xd56, 0xa5a, 0xb53, 0x859, 0x950,
    0x7c0, 0x6c9, 0x5c3, 0x4ca, 0x3c6, 0x2cf, 0x1c5, 0xcc,  0xfcc, 0xec5, 0xdcf, 0xcc6, 0xbca, 0xac3, 0x9c9, 0x8c0,
    0x8c0, 0x9c9, 0xac3, 0xbca, 0xcc6, 0xdcf, 0xec5, 0xfcc, 0xcc,  0x1c5, 0x2cf, 0x3c6, 0x4ca, 0x5c3, 0x6c9, 0x7c0,
    0x950, 0x859, 0xb53, 0xa5a, 0xd56, 0xc5f, 0xf55, 0xe5c, 0x15c, 0x55,  0x35f, 0x256, 0x55a, 0x453, 0x759, 0x650,
    0xaf0, 0xbf9, 0x8f3, 0x9fa, 0xef6, 0xfff, 0xcf5, 0xdfc, 0x2fc, 0x3f5, 0xff,  0x1f6, 0x6fa, 0x7f3, 0x4f9, 0x5f0,
    0xb60, 0xa69, 0x963, 0x86a, 0xf66, 0xe6f, 0xd65, 0xc6c, 0x36c, 0x265, 0x16f, 0x66,  0x76a, 0x663, 0x569, 0x460,
    0xca0, 0xda9, 0xea3, 0xfaa, 0x8a6, 0x9af, 0xaa5, 0xbac, 0x4ac, 0x5a5, 0x6af, 0x7a6, 0xaa,  0x1a3, 0x2a9, 0x3a0,
    0xd30, 0xc39, 0xf33, 0xe3a, 0x936, 0x83f, 0xb35, 0xa3c, 0x53c, 0x435, 0x73f, 0x636, 0x13a, 0x33,  0x339, 0x230,
    0xe90, 0xf99, 0xc93, 0xd9a, 0xa96, 0xb9f, 0x895, 0x99c, 0x69c, 0x795, 0x49f, 0x596, 0x29a, 0x393, 0x99,  0x190,
    0xf00, 0xe09, 0xd03, 0xc0a, 0xb06, 0xa0f, 0x905, 0x80c, 0x70c, 0x605, 0x50f, 0x406, 0x30a, 0x203, 0x109, 0x0};

/* triangle table maps same cube vertex index to a list of up to 5 triangles which are built from the interpolated edge
 * vertices */
const int triTable[256][16] = {{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 1, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 8, 3, 9, 8, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 2, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 8, 3, 1, 2, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {9, 2, 10, 0, 2, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {2, 8, 3, 2, 10, 8, 10, 9, 8, -1, -1, -1, -1, -1, -1, -1},
                               {3, 11, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 11, 2, 8, 11, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 9, 0, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 11, 2, 1, 9, 11, 9, 8, 11, -1, -1, -1, -1, -1, -1, -1},
                               {3, 10, 1, 11, 10, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 10, 1, 0, 8, 10, 8, 11, 10, -1, -1, -1, -1, -1, -1, -1},
                               {3, 9, 0, 3, 11, 9, 11, 10, 9, -1, -1, -1, -1, -1, -1, -1},
                               {9, 8, 10, 10, 8, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {4, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {4, 3, 0, 7, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 1, 9, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {4, 1, 9, 4, 7, 1, 7, 3, 1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 2, 10, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {3, 4, 7, 3, 0, 4, 1, 2, 10, -1, -1, -1, -1, -1, -1, -1},
                               {9, 2, 10, 9, 0, 2, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1},
                               {2, 10, 9, 2, 9, 7, 2, 7, 3, 7, 9, 4, -1, -1, -1, -1},
                               {8, 4, 7, 3, 11, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {11, 4, 7, 11, 2, 4, 2, 0, 4, -1, -1, -1, -1, -1, -1, -1},
                               {9, 0, 1, 8, 4, 7, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1},
                               {4, 7, 11, 9, 4, 11, 9, 11, 2, 9, 2, 1, -1, -1, -1, -1},
                               {3, 10, 1, 3, 11, 10, 7, 8, 4, -1, -1, -1, -1, -1, -1, -1},
                               {1, 11, 10, 1, 4, 11, 1, 0, 4, 7, 11, 4, -1, -1, -1, -1},
                               {4, 7, 8, 9, 0, 11, 9, 11, 10, 11, 0, 3, -1, -1, -1, -1},
                               {4, 7, 11, 4, 11, 9, 9, 11, 10, -1, -1, -1, -1, -1, -1, -1},
                               {9, 5, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {9, 5, 4, 0, 8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 5, 4, 1, 5, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {8, 5, 4, 8, 3, 5, 3, 1, 5, -1, -1, -1, -1, -1, -1, -1},
                               {1, 2, 10, 9, 5, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {3, 0, 8, 1, 2, 10, 4, 9, 5, -1, -1, -1, -1, -1, -1, -1},
                               {5, 2, 10, 5, 4, 2, 4, 0, 2, -1, -1, -1, -1, -1, -1, -1},
                               {2, 10, 5, 3, 2, 5, 3, 5, 4, 3, 4, 8, -1, -1, -1, -1},
                               {9, 5, 4, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 11, 2, 0, 8, 11, 4, 9, 5, -1, -1, -1, -1, -1, -1, -1},
                               {0, 5, 4, 0, 1, 5, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1},
                               {2, 1, 5, 2, 5, 8, 2, 8, 11, 4, 8, 5, -1, -1, -1, -1},
                               {10, 3, 11, 10, 1, 3, 9, 5, 4, -1, -1, -1, -1, -1, -1, -1},
                               {4, 9, 5, 0, 8, 1, 8, 10, 1, 8, 11, 10, -1, -1, -1, -1},
                               {5, 4, 0, 5, 0, 11, 5, 11, 10, 11, 0, 3, -1, -1, -1, -1},
                               {5, 4, 8, 5, 8, 10, 10, 8, 11, -1, -1, -1, -1, -1, -1, -1},
                               {9, 7, 8, 5, 7, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {9, 3, 0, 9, 5, 3, 5, 7, 3, -1, -1, -1, -1, -1, -1, -1},
                               {0, 7, 8, 0, 1, 7, 1, 5, 7, -1, -1, -1, -1, -1, -1, -1},
                               {1, 5, 3, 3, 5, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {9, 7, 8, 9, 5, 7, 10, 1, 2, -1, -1, -1, -1, -1, -1, -1},
                               {10, 1, 2, 9, 5, 0, 5, 3, 0, 5, 7, 3, -1, -1, -1, -1},
                               {8, 0, 2, 8, 2, 5, 8, 5, 7, 10, 5, 2, -1, -1, -1, -1},
                               {2, 10, 5, 2, 5, 3, 3, 5, 7, -1, -1, -1, -1, -1, -1, -1},
                               {7, 9, 5, 7, 8, 9, 3, 11, 2, -1, -1, -1, -1, -1, -1, -1},
                               {9, 5, 7, 9, 7, 2, 9, 2, 0, 2, 7, 11, -1, -1, -1, -1},
                               {2, 3, 11, 0, 1, 8, 1, 7, 8, 1, 5, 7, -1, -1, -1, -1},
                               {11, 2, 1, 11, 1, 7, 7, 1, 5, -1, -1, -1, -1, -1, -1, -1},
                               {9, 5, 8, 8, 5, 7, 10, 1, 3, 10, 3, 11, -1, -1, -1, -1},
                               {5, 7, 0, 5, 0, 9, 7, 11, 0, 1, 0, 10, 11, 10, 0, -1},
                               {11, 10, 0, 11, 0, 3, 10, 5, 0, 8, 0, 7, 5, 7, 0, -1},
                               {11, 10, 5, 7, 11, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {10, 6, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 8, 3, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {9, 0, 1, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 8, 3, 1, 9, 8, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1},
                               {1, 6, 5, 2, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 6, 5, 1, 2, 6, 3, 0, 8, -1, -1, -1, -1, -1, -1, -1},
                               {9, 6, 5, 9, 0, 6, 0, 2, 6, -1, -1, -1, -1, -1, -1, -1},
                               {5, 9, 8, 5, 8, 2, 5, 2, 6, 3, 2, 8, -1, -1, -1, -1},
                               {2, 3, 11, 10, 6, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {11, 0, 8, 11, 2, 0, 10, 6, 5, -1, -1, -1, -1, -1, -1, -1},
                               {0, 1, 9, 2, 3, 11, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1},
                               {5, 10, 6, 1, 9, 2, 9, 11, 2, 9, 8, 11, -1, -1, -1, -1},
                               {6, 3, 11, 6, 5, 3, 5, 1, 3, -1, -1, -1, -1, -1, -1, -1},
                               {0, 8, 11, 0, 11, 5, 0, 5, 1, 5, 11, 6, -1, -1, -1, -1},
                               {3, 11, 6, 0, 3, 6, 0, 6, 5, 0, 5, 9, -1, -1, -1, -1},
                               {6, 5, 9, 6, 9, 11, 11, 9, 8, -1, -1, -1, -1, -1, -1, -1},
                               {5, 10, 6, 4, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {4, 3, 0, 4, 7, 3, 6, 5, 10, -1, -1, -1, -1, -1, -1, -1},
                               {1, 9, 0, 5, 10, 6, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1},
                               {10, 6, 5, 1, 9, 7, 1, 7, 3, 7, 9, 4, -1, -1, -1, -1},
                               {6, 1, 2, 6, 5, 1, 4, 7, 8, -1, -1, -1, -1, -1, -1, -1},
                               {1, 2, 5, 5, 2, 6, 3, 0, 4, 3, 4, 7, -1, -1, -1, -1},
                               {8, 4, 7, 9, 0, 5, 0, 6, 5, 0, 2, 6, -1, -1, -1, -1},
                               {7, 3, 9, 7, 9, 4, 3, 2, 9, 5, 9, 6, 2, 6, 9, -1},
                               {3, 11, 2, 7, 8, 4, 10, 6, 5, -1, -1, -1, -1, -1, -1, -1},
                               {5, 10, 6, 4, 7, 2, 4, 2, 0, 2, 7, 11, -1, -1, -1, -1},
                               {0, 1, 9, 4, 7, 8, 2, 3, 11, 5, 10, 6, -1, -1, -1, -1},
                               {9, 2, 1, 9, 11, 2, 9, 4, 11, 7, 11, 4, 5, 10, 6, -1},
                               {8, 4, 7, 3, 11, 5, 3, 5, 1, 5, 11, 6, -1, -1, -1, -1},
                               {5, 1, 11, 5, 11, 6, 1, 0, 11, 7, 11, 4, 0, 4, 11, -1},
                               {0, 5, 9, 0, 6, 5, 0, 3, 6, 11, 6, 3, 8, 4, 7, -1},
                               {6, 5, 9, 6, 9, 11, 4, 7, 9, 7, 11, 9, -1, -1, -1, -1},
                               {10, 4, 9, 6, 4, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {4, 10, 6, 4, 9, 10, 0, 8, 3, -1, -1, -1, -1, -1, -1, -1},
                               {10, 0, 1, 10, 6, 0, 6, 4, 0, -1, -1, -1, -1, -1, -1, -1},
                               {8, 3, 1, 8, 1, 6, 8, 6, 4, 6, 1, 10, -1, -1, -1, -1},
                               {1, 4, 9, 1, 2, 4, 2, 6, 4, -1, -1, -1, -1, -1, -1, -1},
                               {3, 0, 8, 1, 2, 9, 2, 4, 9, 2, 6, 4, -1, -1, -1, -1},
                               {0, 2, 4, 4, 2, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {8, 3, 2, 8, 2, 4, 4, 2, 6, -1, -1, -1, -1, -1, -1, -1},
                               {10, 4, 9, 10, 6, 4, 11, 2, 3, -1, -1, -1, -1, -1, -1, -1},
                               {0, 8, 2, 2, 8, 11, 4, 9, 10, 4, 10, 6, -1, -1, -1, -1},
                               {3, 11, 2, 0, 1, 6, 0, 6, 4, 6, 1, 10, -1, -1, -1, -1},
                               {6, 4, 1, 6, 1, 10, 4, 8, 1, 2, 1, 11, 8, 11, 1, -1},
                               {9, 6, 4, 9, 3, 6, 9, 1, 3, 11, 6, 3, -1, -1, -1, -1},
                               {8, 11, 1, 8, 1, 0, 11, 6, 1, 9, 1, 4, 6, 4, 1, -1},
                               {3, 11, 6, 3, 6, 0, 0, 6, 4, -1, -1, -1, -1, -1, -1, -1},
                               {6, 4, 8, 11, 6, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {7, 10, 6, 7, 8, 10, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1},
                               {0, 7, 3, 0, 10, 7, 0, 9, 10, 6, 7, 10, -1, -1, -1, -1},
                               {10, 6, 7, 1, 10, 7, 1, 7, 8, 1, 8, 0, -1, -1, -1, -1},
                               {10, 6, 7, 10, 7, 1, 1, 7, 3, -1, -1, -1, -1, -1, -1, -1},
                               {1, 2, 6, 1, 6, 8, 1, 8, 9, 8, 6, 7, -1, -1, -1, -1},
                               {2, 6, 9, 2, 9, 1, 6, 7, 9, 0, 9, 3, 7, 3, 9, -1},
                               {7, 8, 0, 7, 0, 6, 6, 0, 2, -1, -1, -1, -1, -1, -1, -1},
                               {7, 3, 2, 6, 7, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {2, 3, 11, 10, 6, 8, 10, 8, 9, 8, 6, 7, -1, -1, -1, -1},
                               {2, 0, 7, 2, 7, 11, 0, 9, 7, 6, 7, 10, 9, 10, 7, -1},
                               {1, 8, 0, 1, 7, 8, 1, 10, 7, 6, 7, 10, 2, 3, 11, -1},
                               {11, 2, 1, 11, 1, 7, 10, 6, 1, 6, 7, 1, -1, -1, -1, -1},
                               {8, 9, 6, 8, 6, 7, 9, 1, 6, 11, 6, 3, 1, 3, 6, -1},
                               {0, 9, 1, 11, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {7, 8, 0, 7, 0, 6, 3, 11, 0, 11, 6, 0, -1, -1, -1, -1},
                               {7, 11, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {7, 6, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {3, 0, 8, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 1, 9, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {8, 1, 9, 8, 3, 1, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1},
                               {10, 1, 2, 6, 11, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 2, 10, 3, 0, 8, 6, 11, 7, -1, -1, -1, -1, -1, -1, -1},
                               {2, 9, 0, 2, 10, 9, 6, 11, 7, -1, -1, -1, -1, -1, -1, -1},
                               {6, 11, 7, 2, 10, 3, 10, 8, 3, 10, 9, 8, -1, -1, -1, -1},
                               {7, 2, 3, 6, 2, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {7, 0, 8, 7, 6, 0, 6, 2, 0, -1, -1, -1, -1, -1, -1, -1},
                               {2, 7, 6, 2, 3, 7, 0, 1, 9, -1, -1, -1, -1, -1, -1, -1},
                               {1, 6, 2, 1, 8, 6, 1, 9, 8, 8, 7, 6, -1, -1, -1, -1},
                               {10, 7, 6, 10, 1, 7, 1, 3, 7, -1, -1, -1, -1, -1, -1, -1},
                               {10, 7, 6, 1, 7, 10, 1, 8, 7, 1, 0, 8, -1, -1, -1, -1},
                               {0, 3, 7, 0, 7, 10, 0, 10, 9, 6, 10, 7, -1, -1, -1, -1},
                               {7, 6, 10, 7, 10, 8, 8, 10, 9, -1, -1, -1, -1, -1, -1, -1},
                               {6, 8, 4, 11, 8, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {3, 6, 11, 3, 0, 6, 0, 4, 6, -1, -1, -1, -1, -1, -1, -1},
                               {8, 6, 11, 8, 4, 6, 9, 0, 1, -1, -1, -1, -1, -1, -1, -1},
                               {9, 4, 6, 9, 6, 3, 9, 3, 1, 11, 3, 6, -1, -1, -1, -1},
                               {6, 8, 4, 6, 11, 8, 2, 10, 1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 2, 10, 3, 0, 11, 0, 6, 11, 0, 4, 6, -1, -1, -1, -1},
                               {4, 11, 8, 4, 6, 11, 0, 2, 9, 2, 10, 9, -1, -1, -1, -1},
                               {10, 9, 3, 10, 3, 2, 9, 4, 3, 11, 3, 6, 4, 6, 3, -1},
                               {8, 2, 3, 8, 4, 2, 4, 6, 2, -1, -1, -1, -1, -1, -1, -1},
                               {0, 4, 2, 4, 6, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 9, 0, 2, 3, 4, 2, 4, 6, 4, 3, 8, -1, -1, -1, -1},
                               {1, 9, 4, 1, 4, 2, 2, 4, 6, -1, -1, -1, -1, -1, -1, -1},
                               {8, 1, 3, 8, 6, 1, 8, 4, 6, 6, 10, 1, -1, -1, -1, -1},
                               {10, 1, 0, 10, 0, 6, 6, 0, 4, -1, -1, -1, -1, -1, -1, -1},
                               {4, 6, 3, 4, 3, 8, 6, 10, 3, 0, 3, 9, 10, 9, 3, -1},
                               {10, 9, 4, 6, 10, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {4, 9, 5, 7, 6, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 8, 3, 4, 9, 5, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1},
                               {5, 0, 1, 5, 4, 0, 7, 6, 11, -1, -1, -1, -1, -1, -1, -1},
                               {11, 7, 6, 8, 3, 4, 3, 5, 4, 3, 1, 5, -1, -1, -1, -1},
                               {9, 5, 4, 10, 1, 2, 7, 6, 11, -1, -1, -1, -1, -1, -1, -1},
                               {6, 11, 7, 1, 2, 10, 0, 8, 3, 4, 9, 5, -1, -1, -1, -1},
                               {7, 6, 11, 5, 4, 10, 4, 2, 10, 4, 0, 2, -1, -1, -1, -1},
                               {3, 4, 8, 3, 5, 4, 3, 2, 5, 10, 5, 2, 11, 7, 6, -1},
                               {7, 2, 3, 7, 6, 2, 5, 4, 9, -1, -1, -1, -1, -1, -1, -1},
                               {9, 5, 4, 0, 8, 6, 0, 6, 2, 6, 8, 7, -1, -1, -1, -1},
                               {3, 6, 2, 3, 7, 6, 1, 5, 0, 5, 4, 0, -1, -1, -1, -1},
                               {6, 2, 8, 6, 8, 7, 2, 1, 8, 4, 8, 5, 1, 5, 8, -1},
                               {9, 5, 4, 10, 1, 6, 1, 7, 6, 1, 3, 7, -1, -1, -1, -1},
                               {1, 6, 10, 1, 7, 6, 1, 0, 7, 8, 7, 0, 9, 5, 4, -1},
                               {4, 0, 10, 4, 10, 5, 0, 3, 10, 6, 10, 7, 3, 7, 10, -1},
                               {7, 6, 10, 7, 10, 8, 5, 4, 10, 4, 8, 10, -1, -1, -1, -1},
                               {6, 9, 5, 6, 11, 9, 11, 8, 9, -1, -1, -1, -1, -1, -1, -1},
                               {3, 6, 11, 0, 6, 3, 0, 5, 6, 0, 9, 5, -1, -1, -1, -1},
                               {0, 11, 8, 0, 5, 11, 0, 1, 5, 5, 6, 11, -1, -1, -1, -1},
                               {6, 11, 3, 6, 3, 5, 5, 3, 1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 2, 10, 9, 5, 11, 9, 11, 8, 11, 5, 6, -1, -1, -1, -1},
                               {0, 11, 3, 0, 6, 11, 0, 9, 6, 5, 6, 9, 1, 2, 10, -1},
                               {11, 8, 5, 11, 5, 6, 8, 0, 5, 10, 5, 2, 0, 2, 5, -1},
                               {6, 11, 3, 6, 3, 5, 2, 10, 3, 10, 5, 3, -1, -1, -1, -1},
                               {5, 8, 9, 5, 2, 8, 5, 6, 2, 3, 8, 2, -1, -1, -1, -1},
                               {9, 5, 6, 9, 6, 0, 0, 6, 2, -1, -1, -1, -1, -1, -1, -1},
                               {1, 5, 8, 1, 8, 0, 5, 6, 8, 3, 8, 2, 6, 2, 8, -1},
                               {1, 5, 6, 2, 1, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 3, 6, 1, 6, 10, 3, 8, 6, 5, 6, 9, 8, 9, 6, -1},
                               {10, 1, 0, 10, 0, 6, 9, 5, 0, 5, 6, 0, -1, -1, -1, -1},
                               {0, 3, 8, 5, 6, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {10, 5, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {11, 5, 10, 7, 5, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {11, 5, 10, 11, 7, 5, 8, 3, 0, -1, -1, -1, -1, -1, -1, -1},
                               {5, 11, 7, 5, 10, 11, 1, 9, 0, -1, -1, -1, -1, -1, -1, -1},
                               {10, 7, 5, 10, 11, 7, 9, 8, 1, 8, 3, 1, -1, -1, -1, -1},
                               {11, 1, 2, 11, 7, 1, 7, 5, 1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 8, 3, 1, 2, 7, 1, 7, 5, 7, 2, 11, -1, -1, -1, -1},
                               {9, 7, 5, 9, 2, 7, 9, 0, 2, 2, 11, 7, -1, -1, -1, -1},
                               {7, 5, 2, 7, 2, 11, 5, 9, 2, 3, 2, 8, 9, 8, 2, -1},
                               {2, 5, 10, 2, 3, 5, 3, 7, 5, -1, -1, -1, -1, -1, -1, -1},
                               {8, 2, 0, 8, 5, 2, 8, 7, 5, 10, 2, 5, -1, -1, -1, -1},
                               {9, 0, 1, 5, 10, 3, 5, 3, 7, 3, 10, 2, -1, -1, -1, -1},
                               {9, 8, 2, 9, 2, 1, 8, 7, 2, 10, 2, 5, 7, 5, 2, -1},
                               {1, 3, 5, 3, 7, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 8, 7, 0, 7, 1, 1, 7, 5, -1, -1, -1, -1, -1, -1, -1},
                               {9, 0, 3, 9, 3, 5, 5, 3, 7, -1, -1, -1, -1, -1, -1, -1},
                               {9, 8, 7, 5, 9, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {5, 8, 4, 5, 10, 8, 10, 11, 8, -1, -1, -1, -1, -1, -1, -1},
                               {5, 0, 4, 5, 11, 0, 5, 10, 11, 11, 3, 0, -1, -1, -1, -1},
                               {0, 1, 9, 8, 4, 10, 8, 10, 11, 10, 4, 5, -1, -1, -1, -1},
                               {10, 11, 4, 10, 4, 5, 11, 3, 4, 9, 4, 1, 3, 1, 4, -1},
                               {2, 5, 1, 2, 8, 5, 2, 11, 8, 4, 5, 8, -1, -1, -1, -1},
                               {0, 4, 11, 0, 11, 3, 4, 5, 11, 2, 11, 1, 5, 1, 11, -1},
                               {0, 2, 5, 0, 5, 9, 2, 11, 5, 4, 5, 8, 11, 8, 5, -1},
                               {9, 4, 5, 2, 11, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {2, 5, 10, 3, 5, 2, 3, 4, 5, 3, 8, 4, -1, -1, -1, -1},
                               {5, 10, 2, 5, 2, 4, 4, 2, 0, -1, -1, -1, -1, -1, -1, -1},
                               {3, 10, 2, 3, 5, 10, 3, 8, 5, 4, 5, 8, 0, 1, 9, -1},
                               {5, 10, 2, 5, 2, 4, 1, 9, 2, 9, 4, 2, -1, -1, -1, -1},
                               {8, 4, 5, 8, 5, 3, 3, 5, 1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 4, 5, 1, 0, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {8, 4, 5, 8, 5, 3, 9, 0, 5, 0, 3, 5, -1, -1, -1, -1},
                               {9, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {4, 11, 7, 4, 9, 11, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1},
                               {0, 8, 3, 4, 9, 7, 9, 11, 7, 9, 10, 11, -1, -1, -1, -1},
                               {1, 10, 11, 1, 11, 4, 1, 4, 0, 7, 4, 11, -1, -1, -1, -1},
                               {3, 1, 4, 3, 4, 8, 1, 10, 4, 7, 4, 11, 10, 11, 4, -1},
                               {4, 11, 7, 9, 11, 4, 9, 2, 11, 9, 1, 2, -1, -1, -1, -1},
                               {9, 7, 4, 9, 11, 7, 9, 1, 11, 2, 11, 1, 0, 8, 3, -1},
                               {11, 7, 4, 11, 4, 2, 2, 4, 0, -1, -1, -1, -1, -1, -1, -1},
                               {11, 7, 4, 11, 4, 2, 8, 3, 4, 3, 2, 4, -1, -1, -1, -1},
                               {2, 9, 10, 2, 7, 9, 2, 3, 7, 7, 4, 9, -1, -1, -1, -1},
                               {9, 10, 7, 9, 7, 4, 10, 2, 7, 8, 7, 0, 2, 0, 7, -1},
                               {3, 7, 10, 3, 10, 2, 7, 4, 10, 1, 10, 0, 4, 0, 10, -1},
                               {1, 10, 2, 8, 7, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {4, 9, 1, 4, 1, 7, 7, 1, 3, -1, -1, -1, -1, -1, -1, -1},
                               {4, 9, 1, 4, 1, 7, 0, 8, 1, 8, 7, 1, -1, -1, -1, -1},
                               {4, 0, 3, 7, 4, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {4, 8, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {9, 10, 8, 10, 11, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {3, 0, 9, 3, 9, 11, 11, 9, 10, -1, -1, -1, -1, -1, -1, -1},
                               {0, 1, 10, 0, 10, 8, 8, 10, 11, -1, -1, -1, -1, -1, -1, -1},
                               {3, 1, 10, 11, 3, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 2, 11, 1, 11, 9, 9, 11, 8, -1, -1, -1, -1, -1, -1, -1},
                               {3, 0, 9, 3, 9, 11, 1, 2, 9, 2, 11, 9, -1, -1, -1, -1},
                               {0, 2, 11, 8, 0, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {3, 2, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {2, 3, 8, 2, 8, 10, 10, 8, 9, -1, -1, -1, -1, -1, -1, -1},
                               {9, 10, 2, 0, 9, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {2, 3, 8, 2, 8, 10, 0, 1, 8, 1, 10, 8, -1, -1, -1, -1},
                               {1, 10, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {1, 3, 8, 9, 1, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 9, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {0, 3, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
                               {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}};

/* number of vertices for each case above */
const int numVertsTable[256] = {
    0,  3,  3,  6,  3,  6,  6,  9,  3,  6,  6,  9,  6,  9,  9,  6,  3,  6,  6,  9,  6,  9,  9,  12, 6,  9,  9,  12, 9,
    12, 12, 9,  3,  6,  6,  9,  6,  9,  9,  12, 6,  9,  9,  12, 9,  12, 12, 9,  6,  9,  9,  6,  9,  12, 12, 9,  9,  12,
    12, 9,  12, 15, 15, 6,  3,  6,  6,  9,  6,  9,  9,  12, 6,  9,  9,  12, 9,  12, 12, 9,  6,  9,  9,  12, 9,  12, 12,
    15, 9,  12, 12, 15, 12, 15, 15, 12, 6,  9,  9,  12, 9,  12, 6,  9,  9,  12, 12, 15, 12, 15, 9,  6,  9,  12, 12, 9,
    12, 15, 9,  6,  12, 15, 15, 12, 15, 6,  12, 3,  3,  6,  6,  9,  6,  9,  9,  12, 6,  9,  9,  12, 9,  12, 12, 9,  6,
    9,  9,  12, 9,  12, 12, 15, 9,  6,  12, 9,  12, 9,  15, 6,  6,  9,  9,  12, 9,  12, 12, 15, 9,  12, 12, 15, 12, 15,
    15, 12, 9,  12, 12, 9,  12, 15, 15, 12, 12, 9,  15, 6,  15, 12, 6,  3,  6,  9,  9,  12, 9,  12, 12, 15, 9,  12, 12,
    15, 6,  9,  9,  6,  9,  12, 12, 15, 12, 15, 15, 6,  12, 9,  15, 12, 9,  6,  12, 3,  9,  12, 12, 15, 12, 15, 9,  12,
    12, 15, 15, 6,  9,  12, 6,  3,  6,  9,  9,  6,  9,  12, 6,  3,  9,  6,  12, 3,  6,  3,  3,  0,
};


================================================
FILE: src/kfusion/precomp.cpp
================================================
#include <kfusion/internal.hpp>
#include <kfusion/precomp.hpp>

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Kinfu/types implementation

kfusion::Intr::Intr() {}
kfusion::Intr::Intr(float fx_, float fy_, float cx_, float cy_) : fx(fx_), fy(fy_), cx(cx_), cy(cy_) {}

kfusion::Intr kfusion::Intr::operator()(int level_index) const {
    int div = 1 << level_index;
    return (Intr(fx / div, fy / div, cx / div, cy / div));
}

std::ostream &operator<<(std::ostream &os, const kfusion::Intr &intr) {
    return os << "([f = " << intr.fx << ", " << intr.fy << "] [cp = " << intr.cx << ", " << intr.cy << "])";
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TsdfVolume host implementation

kfusion::device::TsdfVolume::TsdfVolume(float2 *data, int3 dims, float3 voxel_size, float trunc_dist, float eta,
                                        float max_weight)
    : data(data), dims(dims), voxel_size(voxel_size), trunc_dist(trunc_dist), eta(eta), max_weight(max_weight) {}

// kfusion::device::TsdfVolume::float2*
// kfusionl::device::TsdfVolume::operator()(int x, int y, int z) { return data +
// x + y*dims.x + z*dims.y*dims.x; }
//
// const kfusion::device::TsdfVolume::float2*
// kfusionl::device::TsdfVolume::operator() (int x, int y, int z) const { return
// data + x + y*dims.x + z*dims.y*dims.x; }
//
// kfusion::device::TsdfVolume::float2* kfusionl::device::TsdfVolume::beg(int
// x, int y) const { return data + x + dims.x * y; }
//
// kfusion::device::TsdfVolume::float2*
// kfusionl::device::TsdfVolume::zstep(float2 *const ptr) const { return data
//+ dims.x * dims.y; }

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Projector host implementation

kfusion::device::Projector::Projector(float fx, float fy, float cx, float cy)
    : f(make_float2(fx, fy)), c(make_float2(cx, cy)) {}

// float2 kfusion::device::Projector::operator()(const float3& p) const
//{
//  float2 coo;
//  coo.x = p.x * f.x / p.z + c.x;
//  coo.y = p.y * f.y / p.z + c.y;
//  return coo;
//}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Reprojector host implementation

kfusion::device::Reprojector::Reprojector(float fx, float fy, float cx, float cy)
    : finv(make_float2(1.f / fx, 1.f / fy)), c(make_float2(cx, cy)) {}

// float3 kfusion::device::Reprojector::operator()(int u, int v, float z) const
//{
//  float x = z * (u - c.x) * finv.x;
//  float y = z * (v - c.y) * finv.y;
//  return make_float3(x, y, z);
//}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Host implementation of packing/unpacking tsdf volume element

// float2 kfusion::device::pack_tsdf(float tsdf, int weight) { throw "Not
// implemented"; return float2(); }  float kfusion::device::unpack_tsdf(float2
// value, int& weight) { throw "Not implemented"; return 0; }  float
// kfusion::device::unpack_tsdf(float2 value) { throw "Not implemented"; return
// 0; }


================================================
FILE: src/kfusion/projective_icp.cpp
================================================
#include <kfusion/precomp.hpp>

using namespace kfusion;
using namespace kfusion::cuda;

using namespace std;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// ComputeIcpHelper

kfusion::device::ComputeIcpHelper::ComputeIcpHelper(float dist_thres, float angle_thres) {
    min_cosine  = cos(angle_thres);
    dist2_thres = dist_thres * dist_thres;
}

void kfusion::device::ComputeIcpHelper::setLevelIntr(int level_index, float fx, float fy, float cx, float cy) {
    int div = 1 << level_index;
    f       = make_float2(fx / div, fy / div);
    c       = make_float2(cx / div, cy / div);
    finv    = make_float2(1.f / f.x, 1.f / f.y);
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// ProjectiveICP::StreamHelper

struct kfusion::cuda::ProjectiveICP::StreamHelper {
    typedef device::ComputeIcpHelper::PageLockHelper PageLockHelper;
    typedef cv::Matx66f Mat6f;
    typedef cv::Vec6f Vec6f;

    cudaStream_t stream;
    PageLockHelper locked_buffer;

    StreamHelper() { cudaSafeCall(cudaStreamCreate(&stream)); }
    ~StreamHelper() { cudaSafeCall(cudaStreamDestroy(stream)); }

    operator float *() { return locked_buffer.data; }
    operator cudaStream_t() { return stream; }

    Mat6f get(Vec6f &b) {
        cudaSafeCall(cudaStreamSynchronize(stream));

        Mat6f A;
        float *data_A = A.val;
        float *data_b = b.val;

        int shift = 0;
        for (int i = 0; i < 6; ++i)      // rows
            for (int j = i; j < 7; ++j)  // cols + b
            {
                float value = locked_buffer.data[shift++];
                if (j == 6)  // vector b
                    data_b[i] = value;
                else
                    data_A[j * 6 + i] = data_A[i * 6 + j] = value;
            }
        return A;
    }
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// ProjectiveICP

kfusion::cuda::ProjectiveICP::ProjectiveICP() : angle_thres_(deg2rad(20.f)), dist_thres_(0.1f) {
    const int iters[] = {10, 5, 4, 0};
    std::vector<int> vector_iters(iters, iters + 4);
    setIterationsNum(vector_iters);
    device::ComputeIcpHelper::allocate_buffer(buffer_);

    shelp_ = cv::Ptr<StreamHelper>(new StreamHelper());
}

kfusion::cuda::ProjectiveICP::~ProjectiveICP() {}

float kfusion::cuda::ProjectiveICP::getDistThreshold() const { return dist_thres_; }

void kfusion::cuda::ProjectiveICP::setDistThreshold(float distance) { dist_thres_ = distance; }

float kfusion::cuda::ProjectiveICP::getAngleThreshold() const { return angle_thres_; }

void kfusion::cuda::ProjectiveICP::setAngleThreshold(float angle) { angle_thres_ = angle; }

void kfusion::cuda::ProjectiveICP::setIterationsNum(const std::vector<int> &iters) {
    if (iters.size() >= MAX_PYRAMID_LEVELS)
        iters_.assign(iters.begin(), iters.begin() + MAX_PYRAMID_LEVELS);
    else {
        iters_ = vector<int>(MAX_PYRAMID_LEVELS, 0);
        copy(iters.begin(), iters.end(), iters_.begin());
    }
}

int kfusion::cuda::ProjectiveICP::getUsedLevelsNum() const {
    int i = MAX_PYRAMID_LEVELS - 1;
    for (; i >= 0 && !iters_[i]; --i)
        ;
    return i + 1;
}

bool kfusion::cuda::ProjectiveICP::estimateTransform(Affine3f & /*affine*/, const Intr & /*intr*/,
                                                     const Frame & /*curr*/, const Frame & /*prev*/) {
    //    bool has_depth = !curr.depth_pyr.empty() && !prev.depth_pyr.empty();
    //    bool has_points = !curr.points_pyr.empty() && !prev.points_pyr.empty();

    //    if (has_depth)
    //        return estimateTransform(affine, intr, curr.depth_pyr,
    //        curr.normals_pyr, prev.depth_pyr, prev.normals_pyr);
    //    else if(has_points)
    //         return estimateTransform(affine, intr, curr.points_pyr,
    //         curr.normals_pyr, prev.points_pyr, prev.normals_pyr);
    //    else
    //        CV_Assert(!"Wrong parameters passed to estimateTransform");
    CV_Assert(!"Not implemented");
    return false;
}

bool kfusion::cuda::ProjectiveICP::estimateTransform(Affine3f &affine, const Intr &intr, const DepthPyr &dcurr,
                                                     const NormalsPyr ncurr, const DepthPyr dprev,
                                                     const NormalsPyr nprev) {
    const int LEVELS = getUsedLevelsNum();
    StreamHelper &sh = *shelp_;

    device::ComputeIcpHelper helper(dist_thres_, angle_thres_);
    affine = Affine3f::Identity();

    for (int level_index = LEVELS - 1; level_index >= 0; --level_index) {
        const device::Normals &n = (const device::Normals &) nprev[level_index];

        helper.rows = (float) n.rows();
        helper.cols = (float) n.cols();
        helper.setLevelIntr(level_index, intr.fx, intr.fy, intr.cx, intr.cy);
        helper.dcurr = dcurr[level_index];
        helper.ncurr = ncurr[level_index];

        for (int iter = 0; iter < iters_[level_index]; ++iter) {
            helper.aff = device_cast<device::Aff3f>(affine);
            helper(dprev[level_index], n, buffer_, sh, sh);

            StreamHelper::Vec6f b;
            StreamHelper::Mat6f A = sh.get(b);

            // checking nullspace
            double det = cv::determinant(A);

            if (fabs(det) < 1e-15 || cv::viz::isNan(det)) {
                if (cv::viz::isNan(det))
                    cout << "qnan" << endl;
                return false;
            }

            StreamHelper::Vec6f r;
            cv::solve(A, b, r, cv::DECOMP_SVD);
            Affine3f Tinc(Vec3f(r.val), Vec3f(r.val + 3));
            affine = Tinc * affine;
        }
    }
    return true;
}

bool kfusion::cuda::ProjectiveICP::estimateTransform(Affine3f &affine, const Intr &intr, const PointsPyr &vcurr,
                                                     const NormalsPyr ncurr, const PointsPyr vprev,
                                                     const NormalsPyr nprev) {
    const int LEVELS = getUsedLevelsNum();
    StreamHelper &sh = *shelp_;

    device::ComputeIcpHelper helper(dist_thres_, angle_thres_);
    affine = Affine3f::Identity();

    for (int level_index = LEVELS - 1; level_index >= 0; --level_index) {
        const device::Normals &n = (const device::Normals &) nprev[level_index];
        const device::Points &v  = (const device::Points &) vprev[level_index];

        helper.rows = (float) n.rows();
        helper.cols = (float) n.cols();
        helper.setLevelIntr(level_index, intr.fx, intr.fy, intr.cx, intr.cy);
        helper.vcurr = vcurr[level_index];
        helper.ncurr = ncurr[level_index];

        for (int iter = 0; iter < iters_[level_index]; ++iter) {
            helper.aff = device_cast<device::Aff3f>(affine);
            helper(v, n, buffer_, sh, sh);

            StreamHelper::Vec6f b;
            StreamHelper::Mat6f A = sh.get(b);

            // checking nullspace
            double det = cv::determinant(A);

            if (fabs(det) < 1e-15 || cv::viz::isNan(det)) {
                if (cv::viz::isNan(det))
                    cout << "qnan" << endl;
                return false;
            }

            StreamHelper::Vec6f r;
            cv::solve(A, b, r, cv::DECOMP_SVD);

            Affine3f Tinc(Vec3f(r.val), Vec3f(r.val + 3));
            affine = Tinc * affine;
        }
    }
    return true;
}


================================================
FILE: src/kfusion/tsdf_volume.cpp
================================================
#include <kfusion/precomp.hpp>

using namespace kfusion;
using namespace kfusion::cuda;

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TsdfVolume::Entry

float kfusion::cuda::TsdfVolume::Entry::half2float(half) { throw "Not implemented"; }

kfusion::cuda::TsdfVolume::Entry::half kfusion::cuda::TsdfVolume::Entry::float2half(float value) {
    throw "Not implemented";
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TsdfVolume

kfusion::cuda::TsdfVolume::TsdfVolume(const Params &params) {
    dims_ = params.volume_dims;
    size_ = params.volume_size;
    pose_ = params.volume_pose;

    trunc_dist_ = params.tsdf_trunc_dist;
    eta_        = params.eta;
    max_weight_ = params.tsdf_max_weight;

    gradient_delta_factor_ = params.gradient_delta_factor;

    create(dims_);
}

kfusion::cuda::TsdfVolume::~TsdfVolume() {}

void kfusion::cuda::TsdfVolume::create(const Vec3i &dims) {
    dims_ = dims;

    int no_voxels = dims_[0] * dims_[1] * dims_[2];
    data_.create(no_voxels * 2 * sizeof(float));

    clear();
}

Vec3i kfusion::cuda::TsdfVolume::getDims() const { return dims_; }

Vec3f kfusion::cuda::TsdfVolume::getVoxelSize() const {
    return Vec3f(size_[0] / dims_[0], size_[1] / dims_[1], size_[2] / dims_[2]);
}

const CudaData kfusion::cuda::TsdfVolume::data() const { return data_; }
CudaData kfusion::cuda::TsdfVolume::data() { return data_; }

Vec3f kfusion::cuda::TsdfVolume::getSize() const { return size_; }
void kfusion::cuda::TsdfVolume::setSize(const Vec3f &size) { size_ = size; }

float kfusion::cuda::TsdfVolume::getTruncDist() const { return trunc_dist_; }
void kfusion::cuda::TsdfVolume::setTruncDist(float &distance) { trunc_dist_ = distance; }

float kfusion::cuda::TsdfVolume::getEta() const { return eta_; }
void kfusion::cuda::TsdfVolume::setEta(float &eta) { eta_ = eta; }

float kfusion::cuda::TsdfVolume::getMaxWeight() const { return max_weight_; }
void kfusion::cuda::TsdfVolume::setMaxWeight(float &weight) { max_weight_ = weight; }

Affine3f kfusion::cuda::TsdfVolume::getPose() const { return pose_; }
void kfusion::cuda::TsdfVolume::setPose(const Affine3f &pose) { pose_ = pose; }

float kfusion::cuda::TsdfVolume::getRaycastStepFactor() const { return raycast_step_factor_; }
void kfusion::cuda::TsdfVolume::setRaycastStepFactor(float &factor) { raycast_step_factor_ = factor; }

float kfusion::cuda::TsdfVolume::getGradientDeltaFactor() const { return gradient_delta_factor_; }
void kfusion::cuda::TsdfVolume::setGradientDeltaFactor(float &factor) { gradient_delta_factor_ = factor; }

void kfusion::cuda::TsdfVolume::clear() {
    device::Vec3i dims = device_cast<device::Vec3i>(dims_);
    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());

    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);
    device::clear_volume(volume);
}

void kfusion::cuda::TsdfVolume::swap(CudaData &data) { data_.swap(data); }

void kfusion::cuda::TsdfVolume::applyAffine(const Affine3f &affine) { pose_ = affine * pose_; }

void kfusion::cuda::TsdfVolume::integrate(const TsdfVolume &phi_n_psi) {
    device::Vec3i dims = device_cast<device::Vec3i>(dims_);
    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());

    device::TsdfVolume phi_global_device(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);
    device::TsdfVolume phi_n_psi_device(phi_n_psi.data().ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);

    device::integrate(phi_global_device, phi_n_psi_device);
}

void kfusion::cuda::TsdfVolume::integrate(const Dists &dists, const Affine3f &camera_pose, const Intr &intr) {
    Affine3f vol2cam  = camera_pose.inv() * pose_;
    device::Aff3f aff = device_cast<device::Aff3f>(vol2cam);

    device::Projector proj(intr.fx, intr.fy, intr.cx, intr.cy);

    device::Vec3i dims = device_cast<device::Vec3i>(dims_);
    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());
    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);

    device::integrate(dists, volume, aff, proj);
}

void kfusion::cuda::TsdfVolume::initBox(const float3 &b) {
    device::Vec3i dims = device_cast<device::Vec3i>(dims_);
    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());
    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);

    device::init_box(volume, b);
}

void kfusion::cuda::TsdfVolume::initEllipsoid(const float3 &r) {
    device::Vec3i dims = device_cast<device::Vec3i>(dims_);
    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());
    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);

    device::init_ellipsoid(volume, r);
}

void kfusion::cuda::TsdfVolume::initPlane(const float &z) {
    device::Vec3i dims = device_cast<device::Vec3i>(dims_);
    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());
    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);

    device::init_plane(volume, z);
}

void kfusion::cuda::TsdfVolume::initSphere(const float3 &centre, const float &radius) {
    device::Vec3i dims = device_cast<device::Vec3i>(dims_);
    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());
    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);

    device::init_sphere(volume, centre, radius);
}

void kfusion::cuda::TsdfVolume::initTorus(const float2 &t) {
    device::Vec3i dims = device_cast<device::Vec3i>(dims_);
    device::Vec3f vsz  = device_cast<device::Vec3f>(getVoxelSize());
    device::TsdfVolume volume(data_.ptr<float2>(), dims, vsz, trunc_dist_, eta_, max_weight_);

    device::init_torus(volume, t);
}

void kfusion::cuda::TsdfVolume::print_sdf_values() {
    cv::Vec3i dims = getDims();

    float2 *phi_data_ptr = new float2[dims[0] * dims[1] * dims[2]];
    data().download(phi_data_ptr);
    for (int i = 0; i < dims[0]; i++) {
        for (int j = 0; j < dims[1]; j++) {
            for (int k = 0; k < dims[2]; k++) {
                float2 val = phi_data_ptr[k * dims[1] * dims[0] + j * dims[0] + i];
                if (val.x != 0.f) {
                    std::cout << val.x << std::endl;
                }
            }
        }
    }
}


================================================
FILE: src/sobfu/CMakeLists.txt
================================================
# Build all sobfu source files into library dynfy
add_module_library(sobfu)

# Link to sobfu library
target_link_libraries(sobfu
    kfusion
    gcov
    ${Boost_LIBRARIES}
    ${CUDA_CUDART_LIBRARY}
    ${PCL_LIBRARIES}
)


================================================
FILE: src/sobfu/cuda/reductor.cu
================================================
/* sobfu includes */
#include <sobfu/cuda/utils.hpp>
#include <sobfu/reductor.hpp>

#define FULL_MASK 0xffffffff

/*
 * OWN KERNELS
 */

template <unsigned int blockSize, bool nIsPow2>
__global__ void sobfu::device::reduce_data_kernel(float2 *g_idata_global, float2 *g_idata_n, float *g_odata,
                                                  unsigned int n) {
    float *sdata = SharedMemory<float>();

    /* perform first level of reduction, reading from global memory, writing to shared memory */
    unsigned int tid      = threadIdx.x;
    unsigned int i        = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    float mySum = 0.f;

    /* we reduce multiple elements per thread;  the number is determined by the umber of active thread blocks (via
     * gridDim); more blocks will result in a larger gridSize and therefore fewer elements per thread */
    while (i < n) {
        mySum += (g_idata_global[i].x - g_idata_n[i].x) * (g_idata_global[i].x - g_idata_n[i].x);

        /* ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays */
        if (nIsPow2 || i + blockSize < n) {
            mySum += (g_idata_global[i + blockSize].x - g_idata_n[i + blockSize].x) *
                     (g_idata_global[i + blockSize].x - g_idata_n[i + blockSize].x);
        }

        i += gridSize;
    }

    /* each thread puts its local sum into shared memory */
    sdata[tid] = mySum;
    __syncthreads();

    /* do reduction in shared mem */
    if ((blockSize >= 512) && (tid < 256)) {
        sdata[tid] = mySum = mySum + sdata[tid + 256];
    }

    __syncthreads();

    if ((blockSize >= 256) && (tid < 128)) {
        sdata[tid] = mySum = mySum + sdata[tid + 128];
    }

    __syncthreads();

    if ((blockSize >= 128) && (tid < 64)) {
        sdata[tid] = mySum = mySum + sdata[tid + 64];
    }

    __syncthreads();

#if (__CUDA_ARCH__ >= 300)
    if (tid < 32) {
        /* fetch final intermediate sum from 2nd warp */
        if (blockSize >= 64)
            mySum += sdata[tid + 32];
        /* reduce final warp using shuffle */
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            mySum += __shfl_down_sync(FULL_MASK, mySum, offset);
        }
    }
#else
    /* fully unroll reduction within a single warp */
    if ((blockSize >= 64) && (tid < 32)) {
        sdata[tid] = mySum = mySum + sdata[tid + 32];
    }

    __syncthreads();

    if ((blockSize >= 32) && (tid < 16)) {
        sdata[tid] = mySum = mySum + sdata[tid + 16];
    }

    __syncthreads();

    if ((blockSize >= 16) && (tid < 8)) {
        sdata[tid] = mySum = mySum + sdata[tid + 8];
    }

    __syncthreads();

    if ((blockSize >= 8) && (tid < 4)) {
        sdata[tid] = mySum = mySum + sdata[tid + 4];
    }

    __syncthreads();

    if ((blockSize >= 4) && (tid < 2)) {
        sdata[tid] = mySum = mySum + sdata[tid + 2];
    }

    __syncthreads();

    if ((blockSize >= 2) && (tid < 1)) {
        sdata[tid] = mySum = mySum + sdata[tid + 1];
    }

    __syncthreads();
#endif

    /* write result for this block to global mem */
    if (tid == 0)
        g_odata[blockIdx.x] = mySum;
}

template <unsigned int blockSize, bool nIsPow2>
__global__ void sobfu::device::reduce_reg_sobolev_kernel(Mat4f *g_idata, float *g_odata, unsigned int n) {
    float *sdata = SharedMemory<float>();

    /* perform first level of reduction, reading from global memory, writing to shared memory */
    unsigned int tid      = threadIdx.x;
    unsigned int i        = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    float mySum = 0.f;

    /* we reduce multiple elements per thread;  the number is determined by the umber of active thread blocks (via
     * gridDim); more blocks will result in a larger gridSize and therefore fewer elements per thread */
    while (i < n) {
        mySum += norm_sq(g_idata[i].data[0]) + norm_sq(g_idata[i].data[1]) + norm_sq(g_idata[i].data[2]);

        /* ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays */
        if (nIsPow2 || i + blockSize < n) {
            mySum += norm_sq(g_idata[i + blockSize].data[0]) + norm_sq(g_idata[i + blockSize].data[1]) +
                     norm_sq(g_idata[i + blockSize].data[2]);
        }

        i += gridSize;
    }

    /* each thread puts its local sum into shared memory */
    sdata[tid] = mySum;
    __syncthreads();

    /* do reduction in shared mem */
    if ((blockSize >= 512) && (tid < 256)) {
        sdata[tid] = mySum = mySum + sdata[tid + 256];
    }

    __syncthreads();

    if ((blockSize >= 256) && (tid < 128)) {
        sdata[tid] = mySum = mySum + sdata[tid + 128];
    }

    __syncthreads();

    if ((blockSize >= 128) && (tid < 64)) {
        sdata[tid] = mySum = mySum + sdata[tid + 64];
    }

    __syncthreads();

#if (__CUDA_ARCH__ >= 300)
    if (tid < 32) {
        /* fetch final intermediate sum from 2nd warp */
        if (blockSize >= 64)
            mySum += sdata[tid + 32];
        /* reduce final warp using shuffle */
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            mySum += __shfl_down_sync(FULL_MASK, mySum, offset);
        }
    }
#else
    /* fully unroll reduction within a single warp */
    if ((blockSize >= 64) && (tid < 32)) {
        sdata[tid] = mySum = mySum + sdata[tid + 32];
    }

    __syncthreads();

    if ((blockSize >= 32) && (tid < 16)) {
        sdata[tid] = mySum = mySum + sdata[tid + 16];
    }

    __syncthreads();

    if ((blockSize >= 16) && (tid < 8)) {
        sdata[tid] = mySum = mySum + sdata[tid + 8];
    }

    __syncthreads();

    if ((blockSize >= 8) && (tid < 4)) {
        sdata[tid] = mySum = mySum + sdata[tid + 4];
    }

    __syncthreads();

    if ((blockSize >= 4) && (tid < 2)) {
        sdata[tid] = mySum = mySum + sdata[tid + 2];
    }

    __syncthreads();

    if ((blockSize >= 2) && (tid < 1)) {
        sdata[tid] = mySum = mySum + sdata[tid + 1];
    }

    __syncthreads();
#endif

    /* write result for this block to global mem */
    if (tid == 0)
        g_odata[blockIdx.x] = mySum;
}

template <unsigned int blockSize, bool nIsPow2>
__global__ void sobfu::device::reduce_voxel_max_energy_kernel(float2 *d_idata_global, float2 *d_idata_n,
                                                              Mat4f *d_idata_reg, float2 *d_o_data, float w_reg,
                                                              unsigned int n) {
    float2 *sdata = SharedMemory<float2>();

    /* perform first level of reduction, reading from global memory, writing to shared memory */
    unsigned int tid      = threadIdx.x;
    unsigned int i        = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    float2 local_max;
    local_max.x = 0.f;
    local_max.y = 0.f;

    /* we reduce multiple elements per thread;  the number is determined by the umber of active thread blocks (via
     * gridDim); more blocks will result in a larger gridSize and therefore fewer elements per thread */
    while (i < n) {
        float temp = (d_idata_global[i].x - d_idata_n[i].x) * (d_idata_global[i].x - d_idata_n[i].x) +
                     w_reg * (norm_sq(d_idata_reg[i].data[0]) + norm_sq(d_idata_reg[i].data[1]) +
                              norm_sq(d_idata_reg[i].data[2]));

        if (temp > local_max.x) {
            local_max.x = temp;
            local_max.y = (float) i;
        }

        /* ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays */
        if (nIsPow2 || i + blockSize < n) {
            temp = (d_idata_global[i + blockSize].x - d_idata_n[i + blockSize].x) *
                       (d_idata_global[i + blockSize].x - d_idata_n[i + blockSize].x) +
                   w_reg * (norm_sq(d_idata_reg[i + blockSize].data[0]) + norm_sq(d_idata_reg[i + blockSize].data[1]) +
                            norm_sq(d_idata_reg[i + blockSize].data[2]));

            if (temp > local_max.x) {
                local_max.x = temp;
                local_max.y = (float) i + blockSize;
            }
        }

        i += gridSize;
    }

    /* each thread puts its local sum into shared memory */
    sdata[tid] = local_max;
    __syncthreads();

    /* do reduction in shared mem */
    if ((blockSize >= 512) && (tid < 256)) {
        if (sdata[tid + 256].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 256];
        }
    }

    __syncthreads();

    if ((blockSize >= 256) && (tid < 128)) {
        if (sdata[tid + 128].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 128];
        }
    }

    __syncthreads();

    if ((blockSize >= 128) && (tid < 64)) {
        if (sdata[tid + 64].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 64];
        }
    }

    __syncthreads();

    if ((blockSize >= 64) && (tid < 32)) {
        if (sdata[tid + 32].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 32];
        }
    }

    __syncthreads();

    if ((blockSize >= 32) && (tid < 16)) {
        if (sdata[tid + 16].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 16];
        }
    }

    __syncthreads();

    if ((blockSize >= 16) && (tid < 8)) {
        if (sdata[tid + 8].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 8];
        }
    }

    __syncthreads();

    if ((blockSize >= 8) && (tid < 4)) {
        if (sdata[tid + 4].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 4];
        }
    }

    __syncthreads();

    if ((blockSize >= 4) && (tid < 2)) {
        if (sdata[tid + 2].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 2];
        }
    }

    __syncthreads();

    if ((blockSize >= 2) && (tid < 1)) {
        if (sdata[tid + 1].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 1];
        }
    }

    __syncthreads();

    /* write result for this block to global mem */
    if (tid == 0) {
        d_o_data[blockIdx.x] = local_max;
    }
}

template <unsigned int blockSize, bool nIsPow2>
__global__ void sobfu::device::reduce_max_kernel(float4 *updates, float2 *g_o_max_data, unsigned int n) {
    float2 *sdata = SharedMemory<float2>();

    /* perform first level of reduction, reading from global memory, writing to shared memory */
    unsigned int tid      = threadIdx.x;
    unsigned int i        = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    float2 local_max;
    local_max.x = 0.f;
    local_max.y = 0.f;

    /* we reduce multiple elements per thread;  the number is determined by the umber of active thread blocks (via
     * gridDim); more blocks will result in a larger gridSize and therefore fewer elements per thread */
    while (i < n) {
        if (norm(updates[i]) > local_max.x) {
            local_max.x = norm(updates[i]);
            local_max.y = (float) i;
        }

        /* ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays */
        if (nIsPow2 || i + blockSize < n) {
            if (norm(updates[i + blockSize]) > local_max.x) {
                local_max.x = norm(updates[i + blockSize]);
                local_max.y = (float) i + blockSize;
            }
        }

        i += gridSize;
    }

    /* each thread puts its local sum into shared memory */
    sdata[tid] = local_max;
    __syncthreads();

    /* do reduction in shared mem */
    if ((blockSize >= 512) && (tid < 256)) {
        if (sdata[tid + 256].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 256];
        }
    }

    __syncthreads();

    if ((blockSize >= 256) && (tid < 128)) {
        if (sdata[tid + 128].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 128];
        }
    }

    __syncthreads();

    if ((blockSize >= 128) && (tid < 64)) {
        if (sdata[tid + 64].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 64];
        }
    }

    __syncthreads();

    if ((blockSize >= 64) && (tid < 32)) {
        if (sdata[tid + 32].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 32];
        }
    }

    __syncthreads();

    if ((blockSize >= 32) && (tid < 16)) {
        if (sdata[tid + 16].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 16];
        }
    }

    __syncthreads();

    if ((blockSize >= 16) && (tid < 8)) {
        if (sdata[tid + 8].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 8];
        }
    }

    __syncthreads();

    if ((blockSize >= 8) && (tid < 4)) {
        if (sdata[tid + 4].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 4];
        }
    }

    __syncthreads();

    if ((blockSize >= 4) && (tid < 2)) {
        if (sdata[tid + 2].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 2];
        }
    }

    __syncthreads();

    if ((blockSize >= 2) && (tid < 1)) {
        if (sdata[tid + 1].x > local_max.x) {
            sdata[tid] = local_max = sdata[tid + 1];
        }
    }

    __syncthreads();

    /* write result for this block to global mem */
    if (tid == 0) {
        g_o_max_data[blockIdx.x] = local_max;
    }
}

/* wrapper function for kernel launch */
void sobfu::device::reduce_data(int size, int threads, int blocks, float2 *d_idata_global, float2 *d_idata_n,
                                float *d_odata) {
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);

    /* when there is only one warp per block, we need to allocate two warps worth of shared memory so that we don't
     * index shared memory out of bounds */
    int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float);

    if (isPow2(size)) {
        switch (threads) {
            case 512:
                reduce_data_kernel<512, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 256:
                reduce_data_kernel<256, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 128:
                reduce_data_kernel<128, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 64:
                reduce_data_kernel<64, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 32:
                reduce_data_kernel<32, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 16:
                reduce_data_kernel<16, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 8:
                reduce_data_kernel<8, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 4:
                reduce_data_kernel<4, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 2:
                reduce_data_kernel<2, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 1:
                reduce_data_kernel<1, true><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;
        }
    } else {
        switch (threads) {
            case 512:
                reduce_data_kernel<512, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 256:
                reduce_data_kernel<256, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 128:
                reduce_data_kernel<128, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 64:
                reduce_data_kernel<64, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 32:
                reduce_data_kernel<32, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 16:
                reduce_data_kernel<16, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 8:
                reduce_data_kernel<8, false><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 4:
                reduce_data_kernel<4, false><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 2:
                reduce_data_kernel<2, false><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;

            case 1:
                reduce_data_kernel<1, false><<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_odata, size);
                break;
        }
    }
}

void sobfu::device::reduce_reg_sobolev(int size, int threads, int blocks, Mat4f *d_idata, float *d_odata) {
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);

    /* when there is only one warp per block, we need to allocate two warps worth of shared memory so that we don't
     * index shared memory out of bounds */
    int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float);

    if (isPow2(size)) {
        switch (threads) {
            case 512:
                reduce_reg_sobolev_kernel<512, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 256:
                reduce_reg_sobolev_kernel<256, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 128:
                reduce_reg_sobolev_kernel<128, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 64:
                reduce_reg_sobolev_kernel<64, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 32:
                reduce_reg_sobolev_kernel<32, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 16:
                reduce_reg_sobolev_kernel<16, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 8:
                reduce_reg_sobolev_kernel<8, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 4:
                reduce_reg_sobolev_kernel<4, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 2:
                reduce_reg_sobolev_kernel<2, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 1:
                reduce_reg_sobolev_kernel<1, true><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;
        }
    } else {
        switch (threads) {
            case 512:
                reduce_reg_sobolev_kernel<512, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 256:
                reduce_reg_sobolev_kernel<256, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 128:
                reduce_reg_sobolev_kernel<128, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 64:
                reduce_reg_sobolev_kernel<64, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 32:
                reduce_reg_sobolev_kernel<32, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 16:
                reduce_reg_sobolev_kernel<16, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 8:
                reduce_reg_sobolev_kernel<8, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 4:
                reduce_reg_sobolev_kernel<4, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 2:
                reduce_reg_sobolev_kernel<2, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;

            case 1:
                reduce_reg_sobolev_kernel<1, false><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
                break;
        }
    }
}

void sobfu::device::reduce_voxel_max_energy(int size, int threads, int blocks, float2 *d_idata_global,
                                            float2 *d_idata_n, Mat4f *d_idata_reg, float w_reg, float2 *d_odata) {
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);

    /* when there is only one warp per block, we need to allocate two warps worth of shared memory so that we don't
     * index shared memory out of bounds */
    int smemSize = (threads <= 32) ? 2 * threads * sizeof(float2) : threads * sizeof(float2);

    if (isPow2(size)) {
        switch (threads) {
            case 512:
                reduce_voxel_max_energy_kernel<512, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 256:
                reduce_voxel_max_energy_kernel<256, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 128:
                reduce_voxel_max_energy_kernel<128, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 64:
                reduce_voxel_max_energy_kernel<64, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 32:
                reduce_voxel_max_energy_kernel<32, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 16:
                reduce_voxel_max_energy_kernel<16, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 8:
                reduce_voxel_max_energy_kernel<8, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 4:
                reduce_voxel_max_energy_kernel<4, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 2:
                reduce_voxel_max_energy_kernel<2, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 1:
                reduce_voxel_max_energy_kernel<1, true>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;
        }
    } else {
        switch (threads) {
            case 512:
                reduce_voxel_max_energy_kernel<512, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 256:
                reduce_voxel_max_energy_kernel<256, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 128:
                reduce_voxel_max_energy_kernel<128, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 64:
                reduce_voxel_max_energy_kernel<64, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 32:
                reduce_voxel_max_energy_kernel<32, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 16:
                reduce_voxel_max_energy_kernel<16, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 8:
                reduce_voxel_max_energy_kernel<8, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 4:
                reduce_voxel_max_energy_kernel<4, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 2:
                reduce_voxel_max_energy_kernel<2, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;

            case 1:
                reduce_voxel_max_energy_kernel<1, false>
                    <<<dimGrid, dimBlock, smemSize>>>(d_idata_global, d_idata_n, d_idata_reg, d_odata, w_reg, size);
                break;
        }
    }
}

void sobfu::device::reduce_max(int size, int threads, int blocks, float4 *updates, float2 *d_o_max_data) {
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);

    /* when there is only one warp per block, we need to allocate two warps worth of shared memory so that we don't
     * index shared memory out of bounds */
    int smemSize = (threads <= 32) ? 2 * threads * sizeof(float2) : threads * sizeof(float2);

    if (isPow2(size)) {
        switch (threads) {
            case 512:
                reduce_max_kernel<512, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 256:
                reduce_max_kernel<256, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 128:
                reduce_max_kernel<128, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 64:
                reduce_max_kernel<64, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 32:
                reduce_max_kernel<32, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 16:
                reduce_max_kernel<16, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 8:
                reduce_max_kernel<8, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 4:
                reduce_max_kernel<4, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 2:
                reduce_max_kernel<2, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 1:
                reduce_max_kernel<1, true><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;
        }
    } else {
        switch (threads) {
            case 512:
                reduce_max_kernel<512, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 256:
                reduce_max_kernel<256, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 128:
                reduce_max_kernel<128, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 64:
                reduce_max_kernel<64, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 32:
                reduce_max_kernel<32, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 16:
                reduce_max_kernel<16, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 8:
                reduce_max_kernel<8, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 4:
                reduce_max_kernel<4, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 2:
                reduce_max_kernel<2, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;

            case 1:
                reduce_max_kernel<1, false><<<dimGrid, dimBlock, smemSize>>>(updates, d_o_max_data, size);
                break;
        }
    }
}


================================================
FILE: src/sobfu/cuda/scalar_fields.cu
================================================
/* sobfu includes */
#include <sobfu/cuda/utils.hpp>
#include <sobfu/scalar_fields.hpp>

#define FULL_MASK 0xffffffff

/*
 * SCALAR FIELD
 */

__device__ __forceinline__ float* sobfu::device::ScalarField::beg(int x, int y) const { return data + y * dims.x + x; }

__device__ __forceinline__ float* sobfu::device::ScalarField::zstep(float* const ptr) const {
    return ptr + dims.x * dims.y;
}

__device__ __forceinline__ float* sobfu::device::ScalarField::operator()(int idx) const { return data + idx; }

__device__ __forceinline__ float* sobfu::device::ScalarField::operator()(int x, int y, int z) const {
    return data + z * dims.y * dims.x + y * dims.x + x;
}

__global__ void sobfu::device::clear_kernel(sobfu::device::ScalarField field) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > field.dims.x - 1 || y > field.dims.y - 1) {
        return;
    }

    float* pos = field.beg(x, y);
    for (int i = 0; i <= field.dims.z - 1; pos = field.zstep(pos), ++i) {
        *pos = 0.f;
    }
}

void sobfu::device::clear(sobfu::device::ScalarField& field) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(field.dims.x, block.x), kfusion::device::divUp(field.dims.y, block.y));

    clear_kernel<<<grid, block>>>(field);
    cudaSafeCall(cudaGetLastError());
}

template <unsigned int blockSize, bool nIsPow2>
__global__ void sobfu::device::reduce_sum_kernel(float* g_idata, float* g_odata, unsigned int n) {
    float* sdata = SharedMemory<float>();

    /* perform first level of reduction, reading from global memory, writing to shared memory */
    unsigned int tid      = threadIdx.x;
    unsigned int i        = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    float mySum = 0.f;

    /* we reduce multiple elements per thread;  the number is determined by the umber of active thread blocks (via
     * gridDim); more blocks will result in a larger gridSize and therefore fewer elements per thread */
    while (i < n) {
        mySum += g_idata[i];

        /* ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays */
        if (nIsPow2 || i + blockSize < n) {
            mySum += g_idata[i + blockSize];
        }

        i += gridSize;
    }

    /* each thread puts its local sum into shared memory */
    sdata[tid] = mySum;
    __syncthreads();

    /* do reduction in shared mem */
    if ((blockSize >= 512) && (tid < 256)) {
        sdata[tid] = mySum = mySum + sdata[tid + 256];
    }

    __syncthreads();

    if ((blockSize >= 256) && (tid < 128)) {
        sdata[tid] = mySum = mySum + sdata[tid + 128];
    }

    __syncthreads();

    if ((blockSize >= 128) && (tid < 64)) {
        sdata[tid] = mySum = mySum + sdata[tid + 64];
    }

    __syncthreads();

#if (__CUDA_ARCH__ >= 300)
    if (tid < 32) {
        /* fetch final intermediate sum from 2nd warp */
        if (blockSize >= 64)
            mySum += sdata[tid + 32];
        /* reduce final warp using shuffle */
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            mySum += __shfl_down_sync(FULL_MASK, mySum, offset);
        }
    }
#else
    /* fully unroll reduction within a single warp */
    if ((blockSize >= 64) && (tid < 32)) {
        sdata[tid] = mySum = mySum + sdata[tid + 32];
    }

    __syncthreads();

    if ((blockSize >= 32) && (tid < 16)) {
        sdata[tid] = mySum = mySum + sdata[tid + 16];
    }

    __syncthreads();

    if ((blockSize >= 16) && (tid < 8)) {
        sdata[tid] = mySum = mySum + sdata[tid + 8];
    }

    __syncthreads();

    if ((blockSize >= 8) && (tid < 4)) {
        sdata[tid] = mySum = mySum + sdata[tid + 4];
    }

    __syncthreads();

    if ((blockSize >= 4) && (tid < 2)) {
        sdata[tid] = mySum = mySum + sdata[tid + 2];
    }

    __syncthreads();

    if ((blockSize >= 2) && (tid < 1)) {
        sdata[tid] = mySum = mySum + sdata[tid + 1];
    }

    __syncthreads();
#endif

    /* write result for this block to global mem */
    if (tid == 0)
        g_odata[blockIdx.x] = mySum;
}

float sobfu::device::final_reduce_sum(float* d_odata, int numBlocks) {
    /* copy result from device to host */
    float* h_odata = new float[numBlocks];
    cudaSafeCall(cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(float), cudaMemcpyDeviceToHost));

    float result = 0.f;
    for (int i = 0; i < numBlocks; i++) {
        result += h_odata[i];
    }

    delete h_odata;
    return result;
}

float sobfu::device::reduce_sum(sobfu::device::ScalarField& field) {
    int no_voxels = field.dims.x * field.dims.y * field.dims.z;
    int blocks, threads;

    get_num_blocks_and_threads(no_voxels, 65536, 512, blocks, threads);

    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);

    /* when there is only one warp per block, we need to allocate two warps worth of shared memory so that we don't
     * index shared memory out of bounds */
    int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float);

    float* d_odata;
    cudaSafeCall(cudaMalloc((void**) &d_odata, blocks * sizeof(float)));

    if (isPow2(no_voxels)) {
        switch (threads) {
            case 512:
                reduce_sum_kernel<512, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 256:
                reduce_sum_kernel<256, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 128:
                reduce_sum_kernel<128, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 64:
                reduce_sum_kernel<64, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 32:
                reduce_sum_kernel<32, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 16:
                reduce_sum_kernel<16, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 8:
                reduce_sum_kernel<8, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 4:
                reduce_sum_kernel<4, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 2:
                reduce_sum_kernel<2, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 1:
                reduce_sum_kernel<1, true><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;
        }
    } else {
        switch (threads) {
            case 512:
                reduce_sum_kernel<512, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 256:
                reduce_sum_kernel<256, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 128:
                reduce_sum_kernel<128, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 64:
                reduce_sum_kernel<64, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 32:
                reduce_sum_kernel<32, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 16:
                reduce_sum_kernel<16, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 8:
                reduce_sum_kernel<8, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 4:
                reduce_sum_kernel<4, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 2:
                reduce_sum_kernel<2, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;

            case 1:
                reduce_sum_kernel<1, false><<<dimGrid, dimBlock, smemSize>>>(field.data, d_odata, no_voxels);
                break;
        }
    }

    float result = final_reduce_sum(d_odata, blocks);
    cudaSafeCall(cudaFree(d_odata));
    return result;
}


================================================
FILE: src/sobfu/cuda/solver.cu
================================================
/* sobfu includes */
#include <sobfu/cuda/utils.hpp>
#include <sobfu/solver.hpp>

/* kinfu includes */
#include <kfusion/cuda/device.hpp>

using namespace kfusion;
using namespace kfusion::device;

/*
 * POTENTIAL GRADIENT
 */

__global__ void sobfu::device::calculate_potential_gradient_kernel(float2* phi_n_psi, float2* phi_global,
                                                                   float4* nabla_phi_n_o_psi, float4* L,
                                                                   float4* nabla_U, float w_reg, int dim_x, int dim_y,
                                                                   int dim_z) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > dim_x - 1 || y > dim_y - 1) {
        return;
    }

    int idx = y * dim_x + x;
    for (int i = 0; i <= dim_z - 1; idx += dim_y * dim_x, ++i) {
        float tsdf_n_psi  = phi_n_psi[idx].x;
        float tsdf_global = phi_global[idx].x;

        nabla_U[idx] = (tsdf_n_psi - tsdf_global) * nabla_phi_n_o_psi[idx] + w_reg * L[idx];
    }
}

void sobfu::device::calculate_potential_gradient(kfusion::device::TsdfVolume& phi_n_psi,
                                                 kfusion::device::TsdfVolume& phi_global,
                                                 sobfu::device::TsdfGradient& nabla_phi_n_o_psi,
                                                 sobfu::device::Laplacian& L, sobfu::device::PotentialGradient& nabla_U,
                                                 float w_reg) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(phi_n_psi.dims.x, block.x), kfusion::device::divUp(phi_n_psi.dims.y, block.y));

    calculate_potential_gradient_kernel<<<grid, block>>>(phi_n_psi.data, phi_global.data, nabla_phi_n_o_psi.data,
                                                         L.data, nabla_U.data, w_reg, phi_n_psi.dims.x,
                                                         phi_n_psi.dims.y, phi_n_psi.dims.z);
    cudaSafeCall(cudaGetLastError());
}

/*
 * DEFORMATION FIELD
 */

__global__ void sobfu::device::update_psi_kernel(float4* psi, float4* nabla_U_S, float4* updates, float alpha,
                                                 int dim_x, int dim_y, int dim_z) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > dim_x - 1 || y > dim_y - 1) {
        return;
    }

    int global_idx = y * dim_x + x;
    for (int i = 0; i <= dim_z - 1; global_idx += dim_y * dim_x, ++i) {
        float4 update = alpha * nabla_U_S[global_idx];

        updates[global_idx] = update;
        psi[global_idx] -= update;
    }
}

void sobfu::device::update_psi(sobfu::device::DeformationField& psi, sobfu::device::PotentialGradient& nabla_U_S,
                               float4* updates, float alpha) {
    /* integrate in time */
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(psi.dims.x, block.x), kfusion::device::divUp(psi.dims.y, block.y));

    update_psi_kernel<<<grid, block>>>(psi.data, nabla_U_S.data, updates, alpha, psi.dims.x, psi.dims.y, psi.dims.z);
    cudaSafeCall(cudaGetLastError());
}

/*
 * PIPELINE
 */

void sobfu::device::estimate_psi(SDFs& sdfs, sobfu::device::DeformationField& psi,
                                 sobfu::device::DeformationField& psi_inv,
                                 sobfu::device::SpatialGradients* spatial_grads, Differentiators& differentiators,
                                 float* d_S_i, sobfu::device::Reductor* r, SolverParams& params) {
    /* copy sobolev filter to constant memory */
    set_convolution_kernel(d_S_i);

    /* create cuda streams */
    int no_streams = 3;
    cudaStream_t streams[no_streams];
    for (int i = 0; i < no_streams; i++) {
        cudaSafeCall(cudaStreamCreate(&streams[i]));
    }

    /* calculate no. of blocks and no. of threads per block */
    int3 dims = psi.dims;

    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(dims.x, block.x), kfusion::device::divUp(dims.y, block.y));

    /* apply psi to phi_n */
    apply_kernel<<<grid, block, 0, streams[0]>>>(sdfs.phi_n, sdfs.phi_n_psi, psi);
    cudaSafeCall(cudaGetLastError());

    /* run gradient descent */
    float2 curr_max_update_norm;
    float e_curr = std::numeric_limits<float>::infinity();

    int iter = 1;
    while (iter <= params.max_iter) {
        if (iter == 1 || iter % 50 == 0) {
            std::cout << "iter. no. " << iter << std::endl;
        }

        /* calculate the gradient of phi_n */
        estimate_gradient_kernel<<<grid, block, 0, streams[0]>>>(differentiators.tsdf_diff,
                                                                 *(spatial_grads->nabla_phi_n_o_psi));
        cudaSafeCall(cudaGetLastError());
        /* calculate the jacobian of psi */
        estimate_deformation_jacobian_kernel<<<grid, block, 0, streams[1]>>>(differentiators.diff, *(spatial_grads->J));
        cudaSafeCall(cudaGetLastError());
        /* calculate the laplacian of psi */
        estimate_laplacian_kernel<<<grid, block, 0, streams[2]>>>(differentiators.second_order_diff,
                                                                  *(spatial_grads->L));
        cudaSafeCall(cudaGetLastError());

        /* calculate current value of the energy functional */
        if ((params.verbosity == 1 && (iter == 1 || iter % 50 == 0 || iter == params.max_iter) ||
             params.verbosity == 2)) {
            /* data term */
            float e_data = r->data_energy(sdfs.phi_global.data, sdfs.phi_n_psi.data);
            /* regularisation term */
            float e_reg = r->reg_energy_sobolev(spatial_grads->J->data);

            e_curr = e_data + params.w_reg * e_reg;
            std::cout << "data energy + w_reg * reg energy = " << e_data << " + " << params.w_reg << " * " << e_reg
                      << " = " << e_curr << std::endl;
        }

        /*
         * PDE'S
         */

        /* calculate gradient of the potential */
        sobfu::device::calculate_potential_gradient(sdfs.phi_n_psi, sdfs.phi_global,
                                                    *(spatial_grads->nabla_phi_n_o_psi), *(spatial_grads->L),
                                                    *(spatial_grads->nabla_U), params.w_reg);
        cudaSafeCall(cudaGetLastError());

        /* convolve gradient of the potential with a sobolev kernel */
        sobfu::device::convolution_rows((*(spatial_grads->nabla_U_S)).data, (*(spatial_grads->nabla_U)).data, dims.x,
                                        dims.y, dims.z);
        sobfu::device::convolution_columns((*(spatial_grads->nabla_U_S)).data, (*(spatial_grads->nabla_U)).data, dims.x,
                                           dims.y, dims.z);
        sobfu::device::convolution_depth((*(spatial_grads->nabla_U_S)).data, (*(spatial_grads->nabla_U)).data, dims.x,
                                         dims.y, dims.z);

        /* update psi */
        update_psi_kernel<<<grid, block, 0, streams[0]>>>(psi.data, (*(spatial_grads->nabla_U_S)).data, r->updates,
                                                          params.alpha, dims.x, dims.y, dims.z);
        cudaSafeCall(cudaGetLastError());

        /* apply psi to phi_n */
        apply_kernel<<<grid, block, 0, streams[0]>>>(sdfs.phi_n, sdfs.phi_n_psi, psi);
        cudaSafeCall(cudaGetLastError());

        /* get value of the max. update norm at the current iteration of the solver */
        curr_max_update_norm = r->max_update_norm();
        if ((params.verbosity == 1 && (iter == 1 || iter % 50 == 0 || iter == params.max_iter) ||
             params.verbosity == 2)) {
            int idx_x = curr_max_update_norm.y / (psi.dims.x * psi.dims.y);
            int idx_y = (curr_max_update_norm.y - idx_x * psi.dims.x * psi.dims.y) / psi.dims.x;
            int idx_z = curr_max_update_norm.y - psi.dims.x * (idx_y + psi.dims.y * idx_x);

            std::cout << "max. update norm " << curr_max_update_norm.x << " at voxel (" << idx_z << ", " << idx_y
                      << ", " << idx_x << ")" << std::endl;
        }

        if (curr_max_update_norm.x <= params.max_update_norm) {
            std::cout << "SOLVER CONVERGED AFTER " << iter << " ITERATIONS" << std::endl;
            break;
        }

        if (iter == params.max_iter) {
            std::cout << "SOLVER REACHED MAX. NO. OF ITERATIONS WITHOUT CONVERGING" << std::endl;
        }

        iter++;
    }

    /* iteratively estimate the inverse deformation field */
    sobfu::device::init_identity(psi_inv);
    sobfu::device::estimate_inverse(psi, psi_inv);
    /* apply psi_inv to phi_global */
    apply_kernel<<<grid, block>>>(sdfs.phi_global, sdfs.phi_global_psi_inv, psi_inv);
    cudaSafeCall(cudaGetLastError());

    for (int i = 0; i < no_streams; i++) {
        cudaSafeCall(cudaStreamDestroy(streams[i]));
    }
}

/*
 * CONVOLUTIONS
 */

#define KERNEL_RADIUS 3
#define KERNEL_LENGTH (2 * KERNEL_RADIUS + 1)

#define ROWS_BLOCKDIM_X 4
#define ROWS_BLOCKDIM_Y 64
#define ROWS_RESULT_STEPS 4
#define ROWS_HALO_STEPS 1

#define COLUMNS_BLOCKDIM_X 64
#define COLUMNS_BLOCKDIM_Y 4
#define COLUMNS_RESULT_STEPS 4
#define COLUMNS_HALO_STEPS 1

#define DEPTH_BLOCKDIM_X 64
#define DEPTH_BLOCKDIM_Z 4
#define DEPTH_RESULT_STEPS 4
#define DEPTH_HALO_STEPS 1

__constant__ float S[KERNEL_LENGTH];

void sobfu::device::set_convolution_kernel(float* d_kernel) {
    cudaSafeCall(cudaMemcpyToSymbol(S, d_kernel, KERNEL_LENGTH * sizeof(float), 0, cudaMemcpyDeviceToDevice));
    cudaSafeCall(cudaGetLastError());
}

/*** ROW CONVOLUTION ***/
__global__ void sobfu::device::convolution_rows_kernel(float4* d_dst, float4* d_src, int image_w, int image_h,
                                                       int image_d) {
    __shared__ float4 s_data[ROWS_BLOCKDIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X];

    /* offset to the left halo edge */
    const int base_x = (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + threadIdx.x;
    const int base_y = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y;
    const int base_z = blockIdx.z;

    const int first_pixel_in_line = ROWS_BLOCKDIM_X * ROWS_HALO_STEPS - threadIdx.x;
    const int last_pixel_in_line  = image_w - base_x - 1;

    d_dst += base_z * image_h * image_w + base_y * image_w + base_x;
    d_src += base_z * image_h * image_w + base_y * image_w + base_x;

    /* load main data */
#pragma unroll
    for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) {
        s_data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
            (image_w - base_x > i * ROWS_BLOCKDIM_X) ? d_src[i * ROWS_BLOCKDIM_X] : d_src[last_pixel_in_line];
    }

    /* load left halo */
#pragma unroll
    for (int i = 0; i < ROWS_HALO_STEPS; i++) {
        s_data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
            (base_x >= -i * ROWS_BLOCKDIM_X) ? d_src[i * ROWS_BLOCKDIM_X] : d_src[first_pixel_in_line];
    }

    /* load right halo */
#pragma unroll
    for (int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++) {
        s_data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
            (image_w - base_x > i * ROWS_BLOCKDIM_X) ? d_src[i * ROWS_BLOCKDIM_X] : d_src[last_pixel_in_line];
    }

    /* compute and store results */
    __syncthreads();

    /* this pixel is not part of the iamge and doesn't need to be convolved */
    if (base_y >= image_h) {
        return;
    }
#pragma unroll
    for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) {
        if (image_w - base_x > i * ROWS_BLOCKDIM_X) {
            float4 sum = make_float4(0.f, 0.f, 0.f, 0.f);

#pragma unroll
            for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {
                sum += S[KERNEL_RADIUS - j] * s_data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j];
            }

            d_dst[i * ROWS_BLOCKDIM_X] = sum;
        }
    }
}

void sobfu::device::convolution_rows(float4* d_dst, float4* d_src, int image_w, int image_h, int image_d) {
    int blocks_x =
        image_w / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X) + min(1, image_w % (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X));
    int blocks_y = image_h / ROWS_BLOCKDIM_Y + min(1, image_h % ROWS_BLOCKDIM_Y);
    int blocks_z = image_d;

    dim3 blocks(blocks_x, blocks_y, blocks_z);
    dim3 threads(ROWS_BLOCKDIM_X, ROWS_BLOCKDIM_Y, 1);

    convolution_rows_kernel<<<blocks, threads>>>(d_dst, d_src, image_w, image_h, image_d);
    cudaSafeCall(cudaGetLastError());
}

/*** COLUMMN CONVOLUTION ***/
__global__ void sobfu::device::convolution_columns_kernel(float4* d_dst, float4* d_src, int image_w, int image_h,
                                                          int image_d) {
    __shared__ float4
        s_data[COLUMNS_BLOCKDIM_X][(COLUMNS_RESULT_STEPS + 2 * COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + 1];

    /* offset to the upper halo edge */
    const int base_x = blockIdx.x * COLUMNS_BLOCKDIM_X + threadIdx.x;
    const int base_y = (blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + threadIdx.y;
    const int base_z = blockIdx.z;

    const int first_pixel_in_line = (COLUMNS_BLOCKDIM_Y * COLUMNS_HALO_STEPS - threadIdx.y) * image_w;
    const int last_pixel_in_line  = (image_h - base_y - 1) * image_w;

    d_dst += base_z * image_h * image_w + base_y * image_w + base_x;
    d_src += base_z * image_h * image_w + base_y * image_w + base_x;

    /* main data */
#pragma unroll
    for (int i = COLUMNS_HALO_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) {
        s_data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = (image_h - base_y > i * COLUMNS_BLOCKDIM_Y)
                                                                        ? d_src[i * COLUMNS_BLOCKDIM_Y * image_w]
                                                                        : d_src[last_pixel_in_line];
    }

    /* upper halo */
#pragma unroll
    for (int i = 0; i < COLUMNS_HALO_STEPS; i++) {
        s_data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =
            (base_y >= -i * COLUMNS_BLOCKDIM_Y) ? d_src[i * COLUMNS_BLOCKDIM_Y * image_w] : d_src[first_pixel_in_line];
    }

    /* lower halo */
#pragma unroll
    for (int i = COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS;
         i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS; i++) {
        s_data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = (image_h - base_y > i * COLUMNS_BLOCKDIM_Y)
                                                                        ? d_src[i * COLUMNS_BLOCKDIM_Y * image_w]
                                                                        : d_src[last_pixel_in_line];
    }

    /* compute and store results */
    __syncthreads();

    /* this pixel isn't part of hte image and doesn't need to be convolved */
    if (base_x >= image_w) {
        return;
    }

#pragma unroll
    for (int i = COLUMNS_HALO_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) {
        if (image_h - base_y > i * COLUMNS_BLOCKDIM_Y) {
            float4 sum = make_float4(0.f, 0.f, 0.f, 0.f);
#pragma unroll
            for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {
                sum += S[KERNEL_RADIUS - j] * s_data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j];
            }

            d_dst[i * COLUMNS_BLOCKDIM_Y * image_w] += sum;
        }
    }
}

void sobfu::device::convolution_columns(float4* d_dst, float4* d_src, int image_w, int image_h, int image_d) {
    int blocks_x = image_w / COLUMNS_BLOCKDIM_X + min(1, image_w % COLUMNS_BLOCKDIM_X);
    int blocks_y = image_h / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y) +
                   min(1, image_h % (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y));
    int blocks_z = image_d;

    dim3 blocks(blocks_x, blocks_y, blocks_z);
    dim3 threads(COLUMNS_BLOCKDIM_X, COLUMNS_BLOCKDIM_Y, 1);

    convolution_columns_kernel<<<blocks, threads>>>(d_dst, d_src, image_w, image_h, image_d);
    cudaSafeCall(cudaGetLastError());
}

/*** DEPTH CONVOLUTION ***/
__global__ void sobfu::device::convolution_depth_kernel(float4* d_dst, float4* d_src, int image_w, int image_h,
                                                        int image_d) {
    /* here it is [x][z] as we leave out y bc it has a size of 1 */
    __shared__ float4 s_data[DEPTH_BLOCKDIM_X][(DEPTH_RESULT_STEPS + 2 * DEPTH_HALO_STEPS) * DEPTH_BLOCKDIM_Z + 1];

    /* offset to the upper halo edge */
    const int base_x = blockIdx.x * DEPTH_BLOCKDIM_X + threadIdx.x;
    const int base_y = blockIdx.y;
    const int base_z = (blockIdx.z * DEPTH_RESULT_STEPS - DEPTH_HALO_STEPS) * DEPTH_BLOCKDIM_Z + threadIdx.z;

    const int first_pixel_in_line = (DEPTH_BLOCKDIM_Z * DEPTH_HALO_STEPS - threadIdx.z) * image_w * image_h;
    const int last_pixel_in_line  = (image_d - base_z - 1) * image_w * image_h;

    d_dst += base_z * image_h * image_w + base_y * image_w + base_x;
    d_src += base_z * image_h * image_w + base_y * image_w + base_x;

    /* main data */
#pragma unroll
    for (int i = DEPTH_HALO_STEPS; i < DEPTH_HALO_STEPS + DEPTH_RESULT_STEPS; i++) {
        s_data[threadIdx.x][threadIdx.z + i * DEPTH_BLOCKDIM_Z] = (image_d - base_z > i * DEPTH_BLOCKDIM_Z)
                                                                      ? d_src[i * DEPTH_BLOCKDIM_Z * image_w * image_h]
                                                                      : d_src[last_pixel_in_line];
    }

    /* upper halo */
#pragma unroll
    for (int i = 0; i < DEPTH_HALO_STEPS; i++) {
        s_data[threadIdx.x][threadIdx.z + i * DEPTH_BLOCKDIM_Z] = (base_z >= -i * DEPTH_BLOCKDIM_Z)
                                                                      ? d_src[i * DEPTH_BLOCKDIM_Z * image_w * image_h]
                                                                      : d_src[first_pixel_in_line];
    }

    /* lower halo */
#pragma unroll
    for (int i = DEPTH_HALO_STEPS + DEPTH_RESULT_STEPS; i < DEPTH_HALO_STEPS + DEPTH_RESULT_STEPS + DEPTH_HALO_STEPS;
         i++) {
        s_data[threadIdx.x][threadIdx.z + i * DEPTH_BLOCKDIM_Z] = (image_d - base_z > i * DEPTH_BLOCKDIM_Z)
                                                                      ? d_src[i * DEPTH_BLOCKDIM_Z * image_w * image_h]
                                                                      : d_src[last_pixel_in_line];
    }

    /* compute and store results */
    __syncthreads();

    /* this pixel is not part of the image and doesn't need to be convolved */
    if (base_x >= image_w) {
        return;
    }

#pragma unroll
    for (int i = DEPTH_HALO_STEPS; i < DEPTH_HALO_STEPS + DEPTH_RESULT_STEPS; i++) {
        if (image_d - base_z > i * DEPTH_BLOCKDIM_Z) {
            float4 sum = make_float4(0.f, 0.f, 0.f, 0.f);
#pragma unroll
            for (int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) {
                sum += S[KERNEL_RADIUS - j] * s_data[threadIdx.x][threadIdx.z + i * DEPTH_BLOCKDIM_Z + j];
            }

            d_dst[i * DEPTH_BLOCKDIM_Z * image_w * image_h] += sum;
        }
    }
}

void sobfu::device::convolution_depth(float4* d_dst, float4* d_src, int image_w, int image_h, int image_d) {
    int blocks_x = image_w / DEPTH_BLOCKDIM_X + min(1, image_w % DEPTH_BLOCKDIM_X);
    int blocks_y = image_h;
    int blocks_z =
        image_d / (DEPTH_RESULT_STEPS * DEPTH_BLOCKDIM_Z) + min(1, image_d % (DEPTH_RESULT_STEPS * DEPTH_BLOCKDIM_Z));

    dim3 blocks(blocks_x, blocks_y, blocks_z);
    dim3 threads(DEPTH_BLOCKDIM_X, 1, DEPTH_BLOCKDIM_Z);

    convolution_depth_kernel<<<blocks, threads>>>(d_dst, d_src, image_w, image_h, image_d);
    cudaSafeCall(cudaGetLastError());
}


================================================
FILE: src/sobfu/cuda/vector_fields.cu
================================================
/* sobfu includes */
#include <sobfu/cuda/utils.hpp>
#include <sobfu/vector_fields.hpp>

/* kinfu includes */
#include <kfusion/cuda/device.hpp>

using namespace kfusion::device;

/*
 * VECTOR FIELD
 */

__device__ __forceinline__ float4* sobfu::device::VectorField::beg(int x, int y) const { return data + x + dims.x * y; }

__device__ __forceinline__ float4* sobfu::device::VectorField::zstep(float4* const ptr) const {
    return ptr + dims.x * dims.y;
}

__device__ __forceinline__ float4* sobfu::device::VectorField::operator()(int x, int y, int z) const {
    return data + x + y * dims.x + z * dims.y * dims.x;
}

__device__ __forceinline__ float4 sobfu::device::VectorField::get_displacement(int x, int y, int z) const {
    return *(data + z * dims.y * dims.x + y * dims.x + x) - make_float4((float) x, (float) y, (float) z, 0.f);
}

void sobfu::device::clear(VectorField& field) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(field.dims.x, block.x), kfusion::device::divUp(field.dims.y, block.y));

    clear_kernel<<<grid, block>>>(field);
    cudaSafeCall(cudaGetLastError());
}

__global__ void sobfu::device::clear_kernel(sobfu::device::VectorField field) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > field.dims.x - 1 || y > field.dims.y - 1) {
        return;
    }

    float4* beg = field.beg(x, y);
    float4* end = beg + field.dims.x * field.dims.y * field.dims.z;

    for (float4* pos = beg; pos != end; pos = field.zstep(pos)) {
        *pos = make_float4(0.f, 0.f, 0.f, 0.f);
    }
}

/*
 * DEFORMATION FIELD
 */

void sobfu::device::init_identity(sobfu::device::DeformationField& psi) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(psi.dims.x, block.x), kfusion::device::divUp(psi.dims.y, block.y));

    init_identity_kernel<<<grid, block>>>(psi);
    cudaSafeCall(cudaGetLastError());
}

__global__ void sobfu::device::init_identity_kernel(sobfu::device::DeformationField psi) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > psi.dims.x - 1 || y > psi.dims.y - 1) {
        return;
    }

    float4 idx   = make_float4((float) x, (float) y, 0.f, 0.f);
    float4 zstep = make_float4(0.f, 0.f, 1.f, 0.f);

    float4* pos = psi.beg(x, y);
    for (int i = 0; i <= psi.dims.z - 1; idx += zstep, pos = psi.zstep(pos), ++i) {
        *pos = idx;
    }
}

__global__ void sobfu::device::apply_kernel(const kfusion::device::TsdfVolume phi,
                                            kfusion::device::TsdfVolume phi_warped,
                                            const sobfu::device::DeformationField psi) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > phi_warped.dims.x - 1 || y > phi_warped.dims.y - 1) {
        return;
    }

    float4* psi_ptr        = psi.beg(x, y);
    float2* phi_warped_ptr = phi_warped.beg(x, y);
    for (int i   = 0; i <= phi_warped.dims.z - 1;
         psi_ptr = psi.zstep(psi_ptr), phi_warped_ptr = phi_warped.zstep(phi_warped_ptr), ++i) {
        float4 psi_val = *psi_ptr;

        float2 tsdf_deformed = interpolate_tsdf(phi, trunc(psi_val));
        *phi_warped_ptr      = tsdf_deformed;
    }
}

void sobfu::device::apply(const kfusion::device::TsdfVolume& phi, kfusion::device::TsdfVolume& phi_warped,
                          const sobfu::device::DeformationField& psi) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(phi.dims.x, block.x), kfusion::device::divUp(phi.dims.y, block.y));

    apply_kernel<<<grid, block>>>(phi, phi_warped, psi);
    cudaSafeCall(cudaGetLastError());
}

__global__ void sobfu::device::estimate_inverse_kernel(sobfu::device::DeformationField psi,
                                                       sobfu::device::DeformationField psi_inv) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > psi_inv.dims.x - 1 || y > psi_inv.dims.y - 1) {
        return;
    }

    float4* psi_inv_ptr = psi_inv.beg(x, y);
    for (int i = 0; i <= psi_inv.dims.z - 1; psi_inv_ptr = psi_inv.zstep(psi_inv_ptr), ++i) {
        float4 psi_inv_val = *psi_inv_ptr;
        *psi_inv_ptr =
            make_float4((float) x, (float) y, (float) i, 0.f) - 1.f * interpolate_field_inv(psi, trunc(psi_inv_val));
    }
}

void sobfu::device::estimate_inverse(sobfu::device::DeformationField& psi,
                                     sobfu::device::DeformationField& psi_inverse) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(psi_inverse.dims.x, block.x), kfusion::device::divUp(psi_inverse.dims.y, block.y));

    /* estimate inverse */
    for (int iter = 0; iter < 48; ++iter) {
        estimate_inverse_kernel<<<grid, block>>>(psi, psi_inverse);
        cudaSafeCall(cudaGetLastError());
    }
}

/*
 * TSDF DIFFERENTIATOR METHODS
 */

__global__ void sobfu::device::estimate_gradient_kernel(const sobfu::device::TsdfDifferentiator diff,
                                                        sobfu::device::TsdfGradient grad) {
    diff(grad);
}

void sobfu::device::TsdfDifferentiator::calculate(sobfu::device::TsdfGradient& grad) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(grad.dims.x, block.x), kfusion::device::divUp(grad.dims.y, block.y));

    estimate_gradient_kernel<<<grid, block>>>(*this, grad);
    cudaSafeCall(cudaGetLastError());
}

__device__ __forceinline__ void sobfu::device::TsdfDifferentiator::operator()(sobfu::device::TsdfGradient& grad) const {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > vol.dims.x - 1 || y > vol.dims.y - 1) {
        return;
    }

    int idx_x_1 = x + 1;
    int idx_x_2 = x - 1;
    if (x == 0) {
        idx_x_2 = x + 1;
    } else if (x == vol.dims.x - 1) {
        idx_x_1 = x - 1;
    }

    int idx_y_1 = y + 1;
    int idx_y_2 = y - 1;
    if (y == 0) {
        idx_y_2 = y + 1;
    } else if (y == vol.dims.y - 1) {
        idx_y_1 = y - 1;
    }

    float4* grad_ptr = grad.beg(x, y);

#pragma unroll
    for (int i = 0; i <= vol.dims.z - 1; grad_ptr = grad.zstep(grad_ptr), ++i) {
        int idx_z_1 = i + 1;
        int idx_z_2 = i - 1;
        if (i == 0) {
            idx_z_2 = i + 1;
        } else if (i == vol.dims.z - 1) {
            idx_z_1 = i - 1;
        }

        float Fx1 = (*vol(idx_x_1, y, i)).x;
        float Fx2 = (*vol(idx_x_2, y, i)).x;
        float n_x = __fdividef(Fx1 - Fx2, 2.f);

        float Fy1 = (*vol(x, idx_y_1, i)).x;
        float Fy2 = (*vol(x, idx_y_2, i)).x;
        float n_y = __fdividef(Fy1 - Fy2, 2.f);

        float Fz1 = (*vol(x, y, idx_z_1)).x;
        float Fz2 = (*vol(x, y, idx_z_2)).x;
        float n_z = __fdividef(Fz1 - Fz2, 2.f);

        float4 n  = make_float4(n_x, n_y, n_z, 0.f);
        *grad_ptr = n;
    }
}

__global__ void sobfu::device::interpolate_gradient_kernel(sobfu::device::TsdfGradient nabla_phi_n_psi,
                                                           sobfu::device::TsdfGradient nabla_phi_n_psi_t,
                                                           sobfu::device::DeformationField psi) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > psi.dims.x - 1 || y > psi.dims.y - 1) {
        return;
    }

    float3 idx   = make_float3(x, y, 0.f);
    float3 zstep = make_float3(0.f, 0.f, 1.f);

    int global_idx = y * nabla_phi_n_psi.dims.x + x;

    float4* nabla_phi_n_psi_t_ptr = nabla_phi_n_psi_t.beg(x, y);
    for (int i = 0; i <= psi.dims.z - 1; nabla_phi_n_psi_t_ptr = nabla_phi_n_psi_t.zstep(nabla_phi_n_psi_t_ptr),
             global_idx += nabla_phi_n_psi.dims.x * nabla_phi_n_psi.dims.y, idx += zstep, ++i) {
        float4 psi_val         = psi.data[global_idx];
        *nabla_phi_n_psi_t_ptr = interpolate_field(nabla_phi_n_psi, trunc(psi_val));
    }
}

void sobfu::device::interpolate_gradient(sobfu::device::TsdfGradient& nabla_phi_n_psi,
                                         sobfu::device::TsdfGradient& nabla_phi_n_psi_t,
                                         sobfu::device::DeformationField& psi) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(psi.dims.x, block.x), kfusion::device::divUp(psi.dims.y, block.y));

    interpolate_gradient_kernel<<<grid, block>>>(nabla_phi_n_psi, nabla_phi_n_psi_t, psi);
    cudaSafeCall(cudaGetLastError());
}

/*
 * LAPLACIAN
 */

__global__ void sobfu::device::interpolate_laplacian_kernel(sobfu::device::Laplacian L,
                                                            sobfu::device::Laplacian L_o_psi,
                                                            sobfu::device::DeformationField psi) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > psi.dims.x - 1 || y > psi.dims.y - 1) {
        return;
    }

    float4* psi_ptr     = psi.beg(x, y);
    float4* L_o_psi_ptr = L_o_psi.beg(x, y);
    for (int i = 0; i <= psi.dims.z - 1; psi_ptr = psi.zstep(psi_ptr), L_o_psi_ptr = L_o_psi.zstep(L_o_psi_ptr), ++i) {
        float4 psi_val = *psi_ptr;
        *L_o_psi_ptr   = interpolate_field(L, trunc(psi_val));
    }
}

void sobfu::device::interpolate_laplacian(sobfu::device::Laplacian& L, sobfu::device::Laplacian& L_o_psi,
                                          sobfu::device::DeformationField& psi) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(psi.dims.x, block.x), kfusion::device::divUp(psi.dims.y, block.y));

    interpolate_laplacian_kernel<<<grid, block>>>(L, L_o_psi, psi);
    cudaSafeCall(cudaGetLastError());
}

/*
 * SECOND ORDER DIFFERENTIATOR METHODS
 */

void sobfu::device::SecondOrderDifferentiator::calculate(sobfu::device::Laplacian& L) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(L.dims.x, block.x), kfusion::device::divUp(L.dims.y, block.y));

    estimate_laplacian_kernel<<<grid, block>>>(*this, L);
    cudaSafeCall(cudaGetLastError());
}

__global__ void sobfu::device::estimate_laplacian_kernel(const sobfu::device::SecondOrderDifferentiator diff,
                                                         sobfu::device::Laplacian L) {
    diff.laplacian(L);
}

__device__ __forceinline__ void sobfu::device::SecondOrderDifferentiator::laplacian(sobfu::device::Laplacian& L) const {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > L.dims.x - 1 || y > L.dims.y - 1) {
        return;
    }

    int idx_x_1 = x + 1;
    int idx_x_2 = x - 1;
    if (x == 0) {
        idx_x_1 = x;
        idx_x_2 = x;
    } else if (x == L.dims.x - 1) {
        idx_x_1 = x;
        idx_x_2 = x;
    }

    int idx_y_1 = y + 1;
    int idx_y_2 = y - 1;
    if (y == 0) {
        idx_y_1 = y;
        idx_y_2 = y;
    } else if (y == L.dims.y - 1) {
        idx_y_1 = y;
        idx_y_2 = y;
    }

    float4* L_ptr = L.beg(x, y);

#pragma unroll
    for (int i = 0; i <= L.dims.z - 1; L_ptr = L.zstep(L_ptr), ++i) {
        int idx_z_1 = i + 1;
        int idx_z_2 = i - 1;
        if (i == 0) {
            idx_z_1 = i;
            idx_z_2 = i;
        } else if (i == L.dims.z - 1) {
            idx_z_1 = i;
            idx_z_2 = i;
        }

        float4 L_val = -6.f * *psi(x, y, i) + *psi(idx_x_1, y, i) + *psi(idx_x_2, y, i) + *psi(x, idx_y_1, i) +
                       *psi(x, idx_y_2, i) + *psi(x, y, idx_z_1) + *psi(x, y, idx_z_2);
        *L_ptr = -1.f * L_val;
    }
}

/*
 * JACOBIAN
 */

__device__ __forceinline__ Mat4f* sobfu::device::Jacobian::beg(int x, int y) const { return data + x + dims.x * y; }

__device__ __forceinline__ Mat4f* sobfu::device::Jacobian::zstep(Mat4f* const ptr) const {
    return ptr + dims.x * dims.y;
}

__device__ __forceinline__ Mat4f* sobfu::device::Jacobian::operator()(int x, int y, int z) const {
    return data + x + y * dims.x + z * dims.y * dims.x;
}

void sobfu::device::clear(Jacobian& J) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(J.dims.x, block.x), kfusion::device::divUp(J.dims.y, block.y));

    clear_jacobian_kernel<<<grid, block>>>(J);
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaDeviceSynchronize());
}

__global__ void sobfu::device::clear_jacobian_kernel(sobfu::device::Jacobian J) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > J.dims.x - 1 || y > J.dims.y - 1) {
        return;
    }

    Mat4f* beg = J.beg(x, y);
    Mat4f* end = beg + J.dims.x * J.dims.y * J.dims.z;

    for (Mat4f* pos = beg; pos != end; pos = J.zstep(pos)) {
        float4 g = make_float4(0.f, 0.f, 0.f, 0.f);

        Mat4f val;
        val.data[0] = g;
        val.data[1] = g;
        val.data[2] = g;

        *pos = val;
    }
}

/*
 * DIFFERENTIATOR METHODS
 */

void sobfu::device::Differentiator::calculate(sobfu::device::Jacobian& J) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(J.dims.x, block.x), kfusion::device::divUp(J.dims.y, block.y));

    estimate_jacobian_kernel<<<grid, block>>>(*this, J);
    cudaSafeCall(cudaGetLastError());
}

void sobfu::device::Differentiator::calculate_deformation_jacobian(sobfu::device::Jacobian& J) {
    dim3 block(64, 16);
    dim3 grid(kfusion::device::divUp(J.dims.x, block.x), kfusion::device::divUp(J.dims.y, block.y));

    estimate_deformation_jacobian_kernel<<<grid, block>>>(*this, J);
    cudaSafeCall(cudaGetLastError());
}

__global__ void sobfu::device::estimate_jacobian_kernel(const sobfu::device::Differentiator diff,
                                                        sobfu::device::Jacobian J) {
    diff(J, 0);
}

__global__ void sobfu::device::estimate_deformation_jacobian_kernel(const sobfu::device::Differentiator diff,
                                                                    sobfu::device::Jacobian J) {
    diff(J, 1);
}

__device__ __forceinline__ void sobfu::device::Differentiator::operator()(sobfu::device::Jacobian& J, int mode) const {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x > psi.dims.x - 1 || y > psi.dims.y - 1) {
        return;
    }

    int idx_x_1 = x + 1;
    int idx_x_2 = x - 1;
    if (x == 0) {
        idx_x_2 = x + 1;
    } else if (x == psi.dims.x - 1) {
        idx_x_1 = x - 1;
    }

    int idx_y_1 = y + 1;
    int idx_y_2 = y - 1;
    if (y == 0) {
        idx_y_2 = y + 1;
    } else if (y == psi.dims.y - 1) {
        idx_y_1 = y - 1;
    }

    Mat4f* J_ptr = J.beg(x, y);

#pragma unroll
    for (int i = 0; i <= psi.dims.z - 1; J_ptr = J.zstep(J_ptr), ++i) {
        int idx_z_1 = i + 1;
        int idx_z_2 = i - 1;
        if (i == 0) {
            idx_z_2 = i + 1;
        } else if (i == psi.dims.z - 1) {
            idx_z_1 = i - 1;
        }

        float4 J_x;
        float4 J_y;
        float4 J_z;

        if (mode == 0) {
            J_x = (*psi(idx_x_1, y, i) - *psi(idx_x_2, y, i)) / 2.f;
            J_y = (*psi(x, idx_y_1, i) - *psi(x, idx_y_2, i)) / 2.f;
            J_z = (*psi(x, y, idx_z_1) - *psi(x, y, idx_z_2)) / 2.f;
        } else if (mode == 1) {
            J_x = (psi.get_displacement(idx_x_1, y, i) - psi.get_displacement(idx_x_2, y, i)) / 2.f;
            J_y = (psi.get_displacement(x, idx_y_1, i) - psi.get_displacement(x, idx_y_2, i)) / 2.f;
            J_z = (psi.get_displacement(x, y, idx_z_1) - psi.get_displacement(x, y, idx_z_2)) / 2.f;
        }

        Mat4f val;
        val.data[0] = make_float4(J_x.x, J_y.x, J_z.x, 0.f);
        val.data[1] = make_float4(J_x.y, J_y.y, J_z.y, 0.f);
        val.data[2] = make_float4(J_x.z, J_y.z, J_z.z, 0.f);

        *J(x, y, i) = val;
    }
}


================================================
FILE: src/sobfu/precomp.cpp
================================================
/* sobfu includes */
#include <sobfu/precomp.hpp>

/* checks if x is a power of 2 */
bool isPow2(unsigned int x) { return ((x & (x - 1)) == 0); }

/* computes the nearest power of 2 larger than x */
int nextPow2(int x) {
    if (x < 0)
        return 0;
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return x + 1;
}

void get_num_blocks_and_threads(int n, int maxBlocks, int maxThreads, int &blocks, int &threads) {
    /* get device capability, to avoid block/grid size exceed the upper bound */
    cudaDeviceProp prop;
    int device;
    cudaSafeCall(cudaGetDevice(&device));
    cudaSafeCall(cudaGetDeviceProperties(&prop, device));

    threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
    blocks  = (n + (threads * 2 - 1)) / (threads * 2);

    if ((float) threads * blocks > (float) prop.maxGridSize[0] * prop.maxThreadsPerBlock) {
        printf("n is too large, please choose a smaller number!\n");
    }

    if (blocks > prop.maxGridSize[0]) {
        printf("Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)\n", blocks,
               prop.maxGridSize[0], threads * 2, threads);

        blocks /= 2;
        threads *= 2;
    }

    blocks = std::min(maxBlocks, blocks);
}


================================================
FILE: src/sobfu/reductor.cpp
================================================
/* sobfu incldues */
#include <sobfu/reductor.hpp>

/*
 * REDUCTOR
 */

sobfu::device::Reductor::Reductor(int3 dims_, float vsz_, float trunc_dist_) {
    dims       = dims_;
    vsz        = vsz_;
    trunc_dist = trunc_dist_;

    no_voxels = dims.x * dims.y * dims.z;

    /* own kernels */
    get_num_blocks_and_threads(no_voxels, 65536, 512, blocks, threads);

    h_data_out = new float[blocks];
    h_reg_out  = new float[blocks];
    h_max_out  = new float2[blocks];

    cudaSafeCall(cudaMalloc((void **) &d_data_out, blocks * sizeof(float)));
    cudaSafeCall(cudaMalloc((void **) &d_reg_out, blocks * sizeof(float)));
    cudaSafeCall(cudaMalloc((void **) &d_max_out, blocks * sizeof(float2)));

    cudaSafeCall(cudaMalloc((void **) &updates, no_voxels * sizeof(float4)));
}

sobfu::device::Reductor::~Reductor() {
    delete h_data_out, h_reg_out, h_max_out;

    cudaSafeCall(cudaFree(d_data_out));
    cudaSafeCall(cudaFree(d_reg_out));
    cudaSafeCall(cudaFree(d_max_out));
    cudaSafeCall(cudaFree(updates));
}

float sobfu::device::Reductor::data_energy(float2 *phi_global_data, float2 *phi_n_data) {
    reduce_data(no_voxels, threads, blocks, phi_global_data, phi_n_data, d_data_out);
    cudaSafeCall(cudaGetLastError());

    return 0.5f * final_reduce(h_data_out, d_data_out, blocks);
}

float sobfu::device::Reductor::reg_energy_sobolev(Mat4f *J_data) {
    reduce_reg_sobolev(no_voxels, threads, blocks, J_data, d_reg_out);
    cudaSafeCall(cudaGetLastError());

    return 0.5f * final_reduce(h_reg_out, d_reg_out, blocks);
}

float2 sobfu::device::Reductor::max_update_norm() {
    reduce_max(no_voxels, threads, blocks, updates, d_max_out);
    cudaSafeCall(cudaGetLastError());

    return final_reduce_max(h_max_out, d_max_out, blocks, dims);
}

float2 sobfu::device::Reductor::voxel_max_energy(float2 *phi_global_data, float2 *phi_n_data, Mat4f *J_data,
                                                 float w_reg) {
    reduce_voxel_max_energy(no_voxels, threads, blocks, phi_global_data, phi_n_data, J_data, w_reg, d_max_out);
    cudaSafeCall(cudaGetLastError());

    return final_reduce_max(h_max_out, d_max_out, blocks, dims);
}

/* sum partial sums from each block on cpu */
float sobfu::device::final_reduce(float *h_odata, float *d_odata, int numBlocks) {
    float result = 0.f;

    /* copy result from device to host */
    cudaSafeCall(cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i = 0; i < numBlocks; i++) {
        result += h_odata[i];
    }

    return result;
}

float2 sobfu::device::final_reduce_max(float2 *h_o_max_data, float2 *d_o_max_data, int numBlocks, int3 dims) {
    float2 result = make_float2(0.f, 0.f);

    /* copy result from device to host */
    cudaSafeCall(cudaMemcpy(h_o_max_data, d_o_max_data, numBlocks * sizeof(float2), cudaMemcpyDeviceToHost));

    for (int i = 0; i < numBlocks; i++) {
        if (h_o_max_data[i].x > result.x) {
            result = h_o_max_data[i];
        }
    }

    return result;
}


================================================
FILE: src/sobfu/scalar_fields.cpp
================================================
#include <sobfu/scalar_fields.hpp>

/*
 * SCALAR FIELD
 */

sobfu::cuda::ScalarField::ScalarField(cv::Vec3i dims_) {
    dims = make_int3(dims_[0], dims_[1], dims_[2]);

    int no_voxels = dims.x * dims.y * dims.z;
    data.create(no_voxels * sizeof(float));
    clear();
}

sobfu::cuda::ScalarField::~ScalarField() = default;

kfusion::cuda::CudaData sobfu::cuda::ScalarField::get_data() { return data; }

const kfusion::cuda::CudaData sobfu::cuda::ScalarField::get_data() const { return data; }

int3 sobfu::cuda::ScalarField::get_dims() { return dims; }

void sobfu::cuda::ScalarField::clear() {
    sobfu::device::ScalarField field(data.ptr<float>(), dims);
    sobfu::device::clear(field);
}

float sobfu::cuda::ScalarField::sum() {
    sobfu::device::ScalarField field(data.ptr<float>(), dims);

    float result = sobfu::device::reduce_sum(field);
    return result;
}

void sobfu::cuda::ScalarField::print() {
    int sizes[3] = {dims.x, dims.y, dims.z};

    cv::Mat* mat = new cv::Mat(3, sizes, CV_32FC1);
    data.download(mat->ptr<float>());

    std::cout << "--- FIELD ---" << std::endl;
    for (int i = 0; i < dims.x; i++) {
        for (int j = 0; j < dims.y; j++) {
            for (int k = 0; k < dims.z; k++) {
                float val = mat->at<float>(k, j, i);

                std::cout << "(x,y,z)=(" << i << ", " << j << ", " << k << "), val=(" << val << std::endl;
            }
        }
    }

    delete mat;
}


================================================
FILE: src/sobfu/sob_fusion.cpp
================================================
#include <sobfu/sob_fusion.hpp>

SobFusion::SobFusion(const Params &params) : frame_counter_(0), params(params) {
    int cols = params.cols;
    int rows = params.rows;

    dists_.create(rows, cols);

    curr_.depth_pyr.resize(1);
    curr_.normals_pyr.resize(1);

    prev_.depth_pyr.resize(1);
    prev_.normals_pyr.resize(1);

    curr_.points_pyr.resize(1);
    prev_.points_pyr.resize(1);

    curr_.depth_pyr[0].create(rows, cols);
    curr_.normals_pyr[0].create(rows, cols);

    prev_.depth_pyr[0].create(rows, cols);
    prev_.normals_pyr[0].create(rows, cols);

    curr_.points_pyr[0].create(rows, cols);
    prev_.points_pyr[0].create(rows, cols);

    depths_.create(rows, cols);
    normals_.create(rows, cols);
    points_.create(rows, cols);

    poses_.clear();
    poses_.reserve(4096);
    poses_.push_back(cv::Affine3f::Identity());

    mc = cv::Ptr<kfusion::cuda::MarchingCubes>(new kfusion::cuda::MarchingCubes());
    mc->setPose(params.volume_pose);
}

SobFusion::~SobFusion() = default;

Params &SobFusion::getParams() { return params; }

pcl::PolygonMesh::Ptr SobFusion::get_phi_global_mesh() { return get_mesh(phi_global); }

pcl::PolygonMesh::Ptr SobFusion::get_phi_global_psi_inv_mesh() { return get_mesh(phi_global_psi_inv); }

pcl::PolygonMesh::Ptr SobFusion::get_phi_n_mesh() { return get_mesh(phi_n); }

pcl::PolygonMesh::Ptr SobFusion::get_phi_n_psi_mesh() { return get_mesh(phi_n_psi); }

std::shared_ptr<sobfu::cuda::DeformationField> SobFusion::getDeformationField() { return this->psi; }

/* PIPELINE
 *
 * --- frame 0 ---
 *
 * 1. bilateral filter
 * 2. depth truncation
 * 3. initailisation of phi_global and phi_n
 *
 * --- frames n + 1 ---
 * 1. bilateral filter
 * 2. depth truncation
 * 3. initialisation of phi_n
 * 4. estimation of psi
 * 5. fusion of phi_n(psi)
 * 6. warp of phi_global with psi^-1
 *
 */

bool SobFusion::operator()(const kfusion::cuda::Depth &depth, const kfusion::cuda::Image & /*image*/) {
    std::cout << "--- FRAME NO. " << frame_counter_ << " ---" << std::endl;

    /*
     *  bilateral filter
     */

    kfusion::cuda::depthBilateralFilter(depth, curr_.depth_pyr[0], params.bilateral_kernel_size,
                                        params.bilateral_sigma_spatial, params.bilateral_sigma_depth);

    /*
     * depth truncation
     */

    kfusion::cuda::depthTruncation(curr_.depth_pyr[0], params.icp_truncate_depth_dist);

    /*
     *  compute distances using depth map
     */

    kfusion::cuda::computeDists(curr_.depth_pyr[0], dists_, params.intr);

    if (frame_counter_ == 0) {
        /*
         * INITIALISATION OF PHI_GLOBAL
         */

        phi_global = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));
        phi_global->integrate(dists_, poses_.back(), params.intr);

        /*
         * INITIALISATION OF PHI_GLOBAL(PSI_INV), PHI_N, AND PHI_N(PSI)
         */

        phi_global_psi_inv = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));
        phi_n              = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));
        phi_n_psi          = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));

        /*
         * INITIALISATION OF PSI AND PSI_INV
         */

        this->psi     = std::make_shared<sobfu::cuda::DeformationField>(params.volume_dims);
        this->psi_inv = std::make_shared<sobfu::cuda::DeformationField>(params.volume_dims);

        /*
         * INITIALISATION OF THE SOLVER
         */

        this->solver = std::make_shared<sobfu::cuda::Solver>(params);

        return ++frame_counter_, true;
    }

    /*
     * UPDATE OF PHI_N
     */

    phi_n->clear();
    this->phi_n->integrate(dists_, poses_.back(), params.intr);

    /*
     * ESTIMATION OF DEFORMATION FIELD AND SURFACE FUSION
     */

    if (frame_counter_ < params.start_frame) {
        this->phi_global->integrate(*phi_n);
        return ++frame_counter_, true;
    }

    solver->estimate_psi(phi_global, phi_global_psi_inv, phi_n, phi_n_psi, psi, psi_inv);
    this->phi_global->integrate(*phi_n_psi);

    return ++frame_counter_, true;
}

pcl::PolygonMesh::Ptr SobFusion::get_mesh(cv::Ptr<kfusion::cuda::TsdfVolume> vol) {
    kfusion::device::DeviceArray<pcl::PointXYZ> vertices_buffer_device;
    kfusion::device::DeviceArray<pcl::Normal> normals_buffer_device;

    /* run marching cubes */
    std::shared_ptr<kfusion::cuda::Surface> model =
        std::make_shared<kfusion::cuda::Surface>(mc->run(*vol, vertices_buffer_device, normals_buffer_device));
    kfusion::cuda::waitAllDefaultStream();

    pcl::PolygonMesh::Ptr mesh = convert_to_mesh(model->vertices);
    return mesh;
}

pcl::PolygonMesh::Ptr SobFusion::convert_to_mesh(const kfusion::cuda::DeviceArray<pcl::PointXYZ> &triangles) {
    if (triangles.empty()) {
        return pcl::PolygonMesh::Ptr(new pcl::PolygonMesh());
    }

    pcl::PointCloud<pcl::PointXYZ> cloud;
    cloud.width  = static_cast<int>(triangles.size());
    cloud.height = 1;
    triangles.download(cloud.points);

    pcl::PolygonMesh::Ptr mesh(new pcl::PolygonMesh());
    pcl::toPCLPointCloud2(cloud, mesh->cloud);

    mesh->polygons.resize(triangles.size() / 3);
    for (size_t i = 0; i < mesh->polygons.size(); ++i) {
        pcl::Vertices v;
        v.vertices.push_back(i * 3 + 0);
        v.vertices.push_back(i * 3 + 1);
        v.vertices.push_back(i * 3 + 2);
        mesh->polygons[i] = v;
    }

    return mesh;
}


================================================
FILE: src/sobfu/solver.cpp
================================================
#include <sobfu/solver.hpp>

/*
 * SOLVER
 */

sobfu::cuda::Solver::Solver(Params& params) {
    /*
     * PARAMETERS
     */

    /* volume */
    cv::Vec3i d = params.volume_dims;
    dims        = make_int3(d[0], d[1], d[2]);
    no_voxels   = dims.x * dims.y * dims.z;

    cv::Vec3f vsz = params.voxel_sizes();
    voxel_sizes   = make_float3(vsz[0], vsz[1], vsz[2]);

    trunc_dist = params.tsdf_trunc_dist;
    eta        = params.eta;
    max_weight = params.tsdf_max_weight;

    /* solver */
    solver_params.verbosity = params.verbosity;

    solver_params.max_iter        = params.max_iter;
    solver_params.max_update_norm = params.max_update_norm;

    solver_params.lambda = params.lambda;
    solver_params.s      = params.s;

    solver_params.alpha = params.alpha;
    solver_params.w_reg = params.w_reg;

    /*
     * SOLVER HELPER CLASSES
     */

    /* gradients */
    spatial_grads = new sobfu::cuda::SpatialGradients(d);

    nabla_phi_n = new sobfu::device::TsdfGradient(spatial_grads->nabla_phi_n->get_data().ptr<float4>(), dims);
    nabla_phi_n_o_psi =
        new sobfu::device::TsdfGradient(spatial_grads->nabla_phi_n_o_psi->get_data().ptr<float4>(), dims);
    J           = new sobfu::device::Jacobian(spatial_grads->J->get_data().ptr<Mat4f>(), dims);
    J_inv       = new sobfu::device::Jacobian(spatial_grads->J_inv->get_data().ptr<Mat4f>(), dims);
    L           = new sobfu::device::Laplacian(spatial_grads->L->get_data().ptr<float4>(), dims);
    L_o_psi_inv = new sobfu::device::Laplacian(spatial_grads->L_o_psi_inv->get_data().ptr<float4>(), dims);
    nabla_U     = new sobfu::device::PotentialGradient(spatial_grads->nabla_U->get_data().ptr<float4>(), dims);
    nabla_U_S   = new sobfu::device::PotentialGradient(spatial_grads->nabla_U_S->get_data().ptr<float4>(), dims);

    spatial_grads_device = new sobfu::device::SpatialGradients(nabla_phi_n, nabla_phi_n_o_psi, J, J_inv, L, L_o_psi_inv,
                                                               nabla_U, nabla_U_S);

    /* reductor */
    r = new sobfu::device::Reductor(dims, voxel_sizes.x, trunc_dist);

    /* sobolev filter */
    h_S_i = new float[solver_params.s];
    cudaSafeCall(cudaMalloc((void**) &d_S_i, solver_params.s * sizeof(float)));

    decompose_sobolev_filter(solver_params, h_S_i);
    cudaSafeCall(cudaMemcpy(d_S_i, h_S_i, solver_params.s * sizeof(float), cudaMemcpyHostToDevice));
}

sobfu::cuda::Solver::~Solver() = default;

void sobfu::cuda::Solver::estimate_psi(const cv::Ptr<kfusion::cuda::TsdfVolume> phi_global,
                                       cv::Ptr<kfusion::cuda::TsdfVolume> phi_global_psi_inv,
                                       const cv::Ptr<kfusion::cuda::TsdfVolume> phi_n,
                                       cv::Ptr<kfusion::cuda::TsdfVolume> phi_n_psi,
                                       std::shared_ptr<sobfu::cuda::DeformationField> psi,
                                       std::shared_ptr<sobfu::cuda::DeformationField> psi_inv) {
    /* DEVICE CLASSES */
    sobfu::device::DeformationField psi_device(psi->get_data().ptr<float4>(), dims);
    sobfu::device::DeformationField psi_inv_device(psi_inv->get_data().ptr<float4>(), dims);

    kfusion::device::TsdfVolume phi_global_device(phi_global->data().ptr<float2>(), dims, voxel_sizes, trunc_dist, eta,
                                                  max_weight);
    kfusion::device::TsdfVolume phi_global_psi_inv_device(phi_global_psi_inv->data().ptr<float2>(), dims, voxel_sizes,
                                                          trunc_dist, eta, max_weight);

    kfusion::device::TsdfVolume phi_n_device(phi_n->data().ptr<float2>(), dims, voxel_sizes, trunc_dist, eta,
                                             max_weight);
    kfusion::device::TsdfVolume phi_n_psi_device(phi_n_psi->data().ptr<float2>(), dims, voxel_sizes, trunc_dist, eta,
                                                 max_weight);

    SDFs sdfs(phi_global_device, phi_global_psi_inv_device, phi_n_device, phi_n_psi_device);

    sobfu::device::TsdfDifferentiator tsdf_diff(phi_n_psi_device);
    sobfu::device::Differentiator diff(psi_device);
    sobfu::device::Differentiator diff_inv(psi_inv_device);
    sobfu::device::SecondOrderDifferentiator second_order_diff(psi_device);

    Differentiators differentiators(tsdf_diff, diff, diff_inv, second_order_diff);

    /* run the solver */
    sobfu::device::estimate_psi(sdfs, psi_device, psi_inv_device, spatial_grads_device, differentiators, d_S_i, r,
                                solver_params);
}

/*
 * SOBOLEV FILTER
 */

static void sobfu::cuda::get_3d_sobolev_filter(SolverParams& params, float* h_S_i) {
    int s3 = params.s * params.s * params.s;

    /* init identity and laplacian matrices */
    cv::Mat Id    = cv::Mat::eye(s3, s3, CV_32FC1);
    cv::Mat L_mat = -6.f * cv::Mat::eye(s3, s3, CV_32FC1);

    /* calculate laplacian matrix */
    for (int i = 0; i <= static_cast<int>(pow(params.s, 3)) - 1; ++i) {
        int idx_z = i / (params.s * params.s);
        int idx_y = (i - idx_z * params.s * params.s) / params.s;
        int idx_x = i - params.s * (idx_y + params.s * idx_z);

        if (idx_x + 1 < params.s) {
            int pos                 = (idx_x + 1) + idx_y * params.s + idx_z * params.s * params.s;
            L_mat.at<float>(i, pos) = 1.f;
        }
        if (idx_x - 1 >= 0) {
            int pos                 = (idx_x - 1) + idx_y * params.s + idx_z * params.s * params.s;
            L_mat.at<float>(i, pos) = 1.f;
        }

        if (idx_y + 1 < params.s) {
            int pos                 = idx_x + (idx_y + 1) * params.s + idx_z * params.s * params.s;
            L_mat.at<float>(i, pos) = 1.f;
        }
        if (idx_y - 1 >= 0) {
            int pos                 = idx_x + (idx_y - 1) * params.s + idx_z * params.s * params.s;
            L_mat.at<float>(i, pos) = 1.f;
        }

        if (idx_z + 1 < params.s) {
            int pos                 = idx_x + idx_y * params.s + (idx_z + 1) * params.s * params.s;
            L_mat.at<float>(i, pos) = 1.f;
        }
        if (idx_z - 1 >= 0) {
            int pos                 = idx_x + idx_y * params.s + (idx_z - 1) * params.s * params.s;
            L_mat.at<float>(i, pos) = 1.f;
        }
    }

    /* init one-hot vector v */
    cv::Mat v                       = cv::Mat::zeros(s3, 1, CV_32FC1);
    v.at<float>(floor(s3 / 2.f), 0) = 1.f;

    /* init sobolev filter S */
    cv::Mat S = cv::Mat::zeros(s3, 1, CV_32FC1);
    /* solve for S */
    cv::solve((Id - params.lambda * L_mat), v, S, cv::DECOMP_SVD);

    std::cout << S << std::endl;
}

static void sobfu::cuda::decompose_sobolev_filter(SolverParams& params, float* h_S_i) {
    if (params.s == 3) {
        if (params.lambda == 0.1f) {
            h_S_i[0] = 0.06537f;
            h_S_i[1] = 0.99572f;
            h_S_i[2] = h_S_i[0];
        }
    }

    if (params.s == 7) {
        if (params.lambda == 0.05f) {
            h_S_i[0] = 0.00006f;
            h_S_i[1] = 0.00015f;
            h_S_i[2] = 0.03917f;
            h_S_i[3] = 0.99846f;
            h_S_i[4] = h_S_i[2];
            h_S_i[5] = h_S_i[1];
            h_S_i[6] = h_S_i[0];
        }

        if (params.lambda == 0.1f) {
            h_S_i[0] = 0.00030f;
            h_S_i[1] = 0.00441f;
            h_S_i[2] = 0.06571f;
            h_S_i[3] = 0.99565f;
            h_S_i[4] = h_S_i[2];
            h_S_i[5] = h_S_i[1];
            h_S_i[6] = h_S_i[0];
        }

        if (params.lambda == 0.2f) {
            h_S_i[0] = 0.00120f;
            h_S_i[1] = 0.01094f;
            h_S_i[2] = 0.10204f;
            h_S_i[3] = 0.98941f;
            h_S_i[4] = h_S_i[2];
            h_S_i[5] = h_S_i[1];
            h_S_i[6] = h_S_i[0];
        }

        if (params.lambda == 0.4f) {
            h_S_i[0] = 0.00169f;
            h_S_i[1] = 0.01312f;
            h_S_i[2] = 0.10927f;
            h_S_i[3] = 0.98781f;
            h_S_i[4] = h_S_i[2];
            h_S_i[5] = h_S_i[1];
            h_S_i[6] = h_S_i[0];
        }
    }

    if (params.s == 9) {
        if (params.lambda == 0.05f) {
            h_S_i[0] = 0.000003f;
            h_S_i[1] = 0.00006f;
            h_S_i[2] = 0.00155f;
            h_S_i[3] = 0.03917f;
            h_S_i[4] = 0.99846f;
            h_S_i[5] = 0.03917f;
            h_S_i[6] = 0.00155f;
            h_S_i[7] = 0.00006f;
            h_S_i[8] = 0.000003f;
        }

        if (params.lambda == 0.1f) {
            h_S_i[0] = 0.00002f;
            h_S_i[1] = 0.00030f;
            h_S_i[2] = 0.00441f;
            h_S_i[3] = 0.06571f;
            h_S_i[4] = 0.99565f;
            h_S_i[5] = h_S_i[3];
            h_S_i[6] = h_S_i[2];
            h_S_i[7] = h_S_i[1];
            h_S_i[8] = h_S_i[0];
        }
    }

    if (params.s == 11) {
        if (params.lambda == 0.1f) {
            h_S_i[0]  = 0.0000015f;
            h_S_i[1]  = 0.00002f;
            h_S_i[2]  = 0.00030f;
            h_S_i[3]  = 0.00441f;
            h_S_i[4]  = 0.06571f;
            h_S_i[5]  = 0.99565f;
            h_S_i[6]  = h_S_i[4];
            h_S_i[7]  = h_S_i[3];
            h_S_i[8]  = h_S_i[2];
            h_S_i[9]  = h_S_i[1];
            h_S_i[10] = h_S_i[0];
        }
    }

    /* normalise filter to unit sum */
    float sum = 0.f;
    for (int i = 0; i < params.s; ++i) {
        sum += h_S_i[i];
    }

    for (int i = 0; i < params.s; ++i) {
        h_S_i[i] /= sum;
    }
}


================================================
FILE: src/sobfu/vector_fields.cpp
================================================
/* sobfu includes */
#include <sobfu/vector_fields.hpp>

/*
 * VECTOR FIELD
 */

sobfu::cuda::VectorField::VectorField(cv::Vec3i dims_) : dims(dims_) {
    int no_voxels = dims[0] * dims[1] * dims[2];
    data.create(no_voxels * sizeof(float4));
    clear();
}

sobfu::cuda::VectorField::~VectorField() = default;

cv::Vec3i sobfu::cuda::VectorField::get_dims() const { return dims; }

kfusion::cuda::CudaData sobfu::cuda::VectorField::get_data() { return data; }

const kfusion::cuda::CudaData sobfu::cuda::VectorField::get_data() const { return data; }

void sobfu::cuda::VectorField::set_data(kfusion::cuda::CudaData& data) { data = data; }

void sobfu::cuda::VectorField::clear() {
    int3 d = make_int3(dims[0], dims[1], dims[2]);

    sobfu::device::VectorField field(data.ptr<float4>(), d);
    sobfu::device::clear(field);
}

void sobfu::cuda::VectorField::print() {
    int sizes[3] = {dims[0], dims[1], dims[2]};

    cv::Mat* mat = new cv::Mat(3, sizes, CV_32FC4);
    get_data().download(mat->ptr<float4>());

    std::cout << "--- FIELD ---" << std::endl;
    for (int i = 0; i < dims[0]; i++) {
        for (int j = 0; j < dims[1]; j++) {
            for (int k = 0; k < dims[2]; k++) {
                float u = mat->at<float4>(k, j, i).x;
                float v = mat->at<float4>(k, j, i).y;
                float w = mat->at<float4>(k, j, i).z;

                if (fabs(u) > 1e-5f || fabs(v) > 1e-5f || fabs(w) > 1e-5f) {
                    std::cout << "(x,y,z)=(" << i << ", " << j << ", " << k << "), (u,v,w)=(" << u << ", " << v << ","
                              << w << ")" << std::endl;
                }
            }
        }
    }

    delete mat;
}

int sobfu::cuda::VectorField::get_no_nans() {
    int sizes[3] = {dims[0], dims[1], dims[2]};

    cv::Mat* mat = new cv::Mat(3, sizes, CV_32FC4);
    get_data().download(mat->ptr<float4>());

    int no_nan = 0;
    for (int i = 0; i < dims[0]; i++) {
        for (int j = 0; j < dims[1]; j++) {
            for (int k = 0; k < dims[2]; k++) {
                float u = mat->at<float4>(k, j, i).x;
                float v = mat->at<float4>(k, j, i).y;
                float w = mat->at<float4>(k, j, i).z;

                if (std::isnan(u) || std::isnan(v) || std::isnan(w)) {
                    no_nan++;
                }
            }
        }
    }

    delete mat;
    return no_nan;
}

/*
 * DEFORMATION FIELD
 */

sobfu::cuda::DeformationField::DeformationField(cv::Vec3i dims_) : VectorField(dims_) { clear(); }

sobfu::cuda::DeformationField::~DeformationField() = default;

void sobfu::cuda::DeformationField::clear() {
    cv::Vec3f dims = get_dims();
    int3 d         = make_int3(dims[0], dims[1], dims[2]);

    sobfu::device::DeformationField psi(get_data().ptr<float4>(), d);
    sobfu::device::init_identity(psi);
}

void sobfu::cuda::DeformationField::get_inverse(sobfu::cuda::DeformationField& psi_inv) {
    kfusion::device::Vec3i d = kfusion::device_cast<kfusion::device::Vec3i>(get_dims());

    sobfu::device::DeformationField psi_device(get_data().ptr<float4>(), d);
    sobfu::device::DeformationField psi_inverse_device(psi_inv.get_data().ptr<float4>(), d);

    sobfu::device::estimate_inverse(psi_device, psi_inverse_device);
}

void sobfu::cuda::DeformationField::apply(const cv::Ptr<kfusion::cuda::TsdfVolume> phi_n,
                                          cv::Ptr<kfusion::cuda::TsdfVolume> phi_n_psi) {
    kfusion::device::Vec3i d   = kfusion::device_cast<kfusion::device::Vec3i>(phi_n->getDims());
    kfusion::device::Vec3f vsz = kfusion::device_cast<kfusion::device::Vec3f>(phi_n->getVoxelSize());

    float3 voxel_sizes = kfusion::device_cast<float3>(phi_n->getVoxelSize());
    float trunc_dist   = phi_n->getTruncDist();
    float eta          = phi_n->getEta();
    float max_weight   = phi_n->getMaxWeight();

    kfusion::device::TsdfVolume phi_device(phi_n->data().ptr<float2>(), d, vsz, trunc_dist, eta, max_weight);
    kfusion::device::TsdfVolume phi_warped_device(phi_n_psi->data().ptr<float2>(), d, vsz, trunc_dist, eta, max_weight);

    sobfu::device::DeformationField psi_device(get_data().ptr<float4>(), d);

    sobfu::device::apply(phi_device, phi_warped_device, psi_device);
    kfusion::cuda::waitAllDefaultStream();
}

/*
 * JACOBIAN
 */

sobfu::cuda::Jacobian::Jacobian(cv::Vec3i dims_) : dims(dims_) {
    int no_voxels = dims[0] * dims[1] * dims[2];
    data.create(no_voxels * sizeof(Mat4f));
    clear();
}

sobfu::cuda::Jacobian::~Jacobian() = default;

kfusion::cuda::CudaData sobfu::cuda::Jacobian::get_data() { return data; }

const kfusion::cuda::CudaData sobfu::cuda::Jacobian::get_data() const { return data; }

void sobfu::cuda::Jacobian::clear() {
    int3 d = make_int3(dims[0], dims[1], dims[2]);

    sobfu::device::Jacobian J(data.ptr<Mat4f>(), d);
    sobfu::device::clear(J);
}

/*
 * SPATIAL GRADIENTS
 */

sobfu::cuda::SpatialGradients::SpatialGradients(cv::Vec3i dims_) {
    nabla_phi_n       = new sobfu::cuda::TsdfGradient(dims_);
    nabla_phi_n_o_psi = new sobfu::cuda::TsdfGradient(dims_);
    J                 = new sobfu::cuda::Jacobian(dims_);
    J_inv             = new sobfu::cuda::Jacobian(dims_);
    L                 = new sobfu::cuda::Laplacian(dims_);
    L_o_psi_inv       = new sobfu::cuda::Laplacian(dims_);
    nabla_U           = new sobfu::cuda::PotentialGradient(dims_);
    nabla_U_S         = new sobfu::cuda::PotentialGradient(dims_);
}

sobfu::cuda::SpatialGradients::~SpatialGradients() {
    delete nabla_phi_n, nabla_phi_n_o_psi, J, J_inv, L, L_o_psi_inv, nabla_U, nabla_U_S;
}


================================================
FILE: test/CMakeLists.txt
================================================
# ---------------------------------------------------------------------------- #
# GOOGLE TEST DEPENDECIES
# ---------------------------------------------------------------------------- #

# Implementation heavily based upon:
# https://github.com/kaizouman/gtest-cmake-example

# Enable ExternalProject CMake module
include(ExternalProject)

# Download and install GoogleTest
ExternalProject_Add(
    gtest
    URL https://github.com/google/googletest/archive/release-1.8.1.zip
    PREFIX ${CMAKE_CURRENT_BINARY_DIR}/gtest
    # Disable install step
    INSTALL_COMMAND ""
)

# Get GTest source and binary directories from CMake project
ExternalProject_Get_Property(gtest source_dir binary_dir)

# Create a libgtest target to be used as a dependency by test programs
add_library(libgtest IMPORTED STATIC GLOBAL)
add_dependencies(libgtest gtest)

# Set libgtest properties
set_target_properties(libgtest PROPERTIES
    "IMPORTED_LOCATION" "${binary_dir}/googlemock/gtest/libgtest.a"
    "IMPORTED_LINK_INTERFACE_LIBRARIES" "${CMAKE_THREAD_LIBS_INIT}"
)

# Create a libgmock target to be used as a dependency by test programs
add_library(libgmock IMPORTED STATIC GLOBAL)
add_dependencies(libgmock gtest)

# Set libgmock properties
set_target_properties(libgmock PROPERTIES
    "IMPORTED_LOCATION" "${binary_dir}/googlemock/libgmock.a"
    "IMPORTED_LINK_INTERFACE_LIBRARIES" "${CMAKE_THREAD_LIBS_INIT}"
)


# ---------------------------------------------------------------------------- #
# TEST INCLUDE DIRECTORIES
# ---------------------------------------------------------------------------- #
include_directories("${source_dir}/googletest/include"
                    "${source_dir}/googlemock/include")

# ---------------------------------------------------------------------------- #
# CREATE TESTS
# ---------------------------------------------------------------------------- #
# Force test build
set(gtest_build_samples ON)

# Add quaternion Test
CREATE_TEST(sobfu_test)

# Link all libraries for testing
target_link_libraries(sobfu_test
  sobfu
  ${CUDA_CUDART_LIBRARY}
  ${OpenCV_LIBS}
  ${PCL_LIBRARIES}
)


================================================
FILE: test/deformation_field_test.cpp
================================================
/* gtest includes */
#include <gtest/gtest.h>

/* sobfu includes */
#include <sobfu/params.hpp>
#include <sobfu/solver.hpp>

/* kinfu includes */
#include <kfusion/internal.hpp>
#include <kfusion/precomp.hpp>
#include <kfusion/types.hpp>

/* sys headers */
#include <cmath>
#include <ctgmath>
#include <iostream>
#include <memory>

class DeformationFieldTest : public ::testing::Test {
protected:
    /* You can remove any or all of the following functions if its body
     * is empty. */

    /* You can do set-up work for each test here. */
    DeformationFieldTest() = default;

    /* You can do clean-up work that doesn't throw exceptions here. */
    ~DeformationFieldTest() override = default;

    /* If the constructor and destructor are not enough for setting up
     * and cleaning up each test, you can define the following methods: */

    /* Code here will be called immediately after the constructor (right
     * before each test). */
    void SetUp() override {
        /* volume params */
        params.volume_dims = cv::Vec3i::all(64);
        params.volume_size = cv::Vec3f::all(0.25f);

        params.tsdf_trunc_dist       = 10.f * params.volume_size[0] / static_cast<float>(params.volume_dims[0]);
        params.gradient_delta_factor = 0.1f;

        params.eta = 2.f * params.volume_size[0] / static_cast<float>(params.volume_dims[0]);

        /* camera params */
        params.intr = kfusion::Intr(1.f, 1.f, 0.f, 0.f);

        dims        = kfusion::device_cast<kfusion::device::Vec3i>(params.volume_dims);
        voxel_sizes = kfusion::device_cast<kfusion::device::Vec3f>(params.voxel_sizes());

        no_voxels = dims.x * dims.y * dims.z;

        /* init tsdf's */
        phi_global         = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));
        phi_global_psi_inv = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));
        phi_n              = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));
        phi_n_psi          = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));

        /* init psi */
        psi     = std::make_shared<sobfu::cuda::DeformationField>(params.volume_dims);
        psi_inv = std::make_shared<sobfu::cuda::DeformationField>(params.volume_dims);
    }

    /* Code here will be called immediately after each test (right
     * before the destructor). */
    void TearDown() override {}

    /* Objects declared here can be used by all tests in the test case for Solver. */

    /* sobfu params */
    Params params;

    int3 dims;
    float3 voxel_sizes;

    int no_voxels;

    /* global and live tsdf's */
    cv::Ptr<kfusion::cuda::TsdfVolume> phi_global, phi_global_psi_inv, phi_n, phi_n_psi;
    /* deformation field */
    std::shared_ptr<sobfu::cuda::DeformationField> psi, psi_inv;
    /* solver */
    std::shared_ptr<sobfu::cuda::Solver> solver;

    /* max. error */
    float max_error = 1e-1f;
    /* floating point precision */
    const float epsilon = 1e-5f;
};

/* test that the vector field is correctly initialised to 0 */
TEST_F(DeformationFieldTest, ClearTest) {
    kfusion::cuda::CudaData data = psi->get_data();

    int sizes[3]    = {dims.x, dims.y, dims.z};
    cv::Mat *matrix = new cv::Mat(3, sizes, CV_32FC4);
    data.download(matrix->ptr<float4>());

    for (int i = 0; i < sizes[0]; i++) {
        for (int j = 0; j < sizes[1]; j++) {
            for (int k = 0; k < sizes[2]; k++) {
                ASSERT_NEAR(matrix->at<float4>(k, j, i).x, (float) i, epsilon);
                ASSERT_NEAR(matrix->at<float4>(k, j, i).y, (float) j, epsilon);
                ASSERT_NEAR(matrix->at<float4>(k, j, i).z, (float) k, epsilon);
            }
        }
    }
}

/* test that the norm of the gradient of an sdf is 1 */
TEST_F(DeformationFieldTest, TsdfGradientTest) {
    /* init tsdf */
    float3 c = make_float3(0.16f, 0.16f, 0.16f);
    float r  = 0.01f;
    phi_global->initSphere(c, r);

    kfusion::device::TsdfVolume phi_global_device(phi_global->data().ptr<float2>(), dims, voxel_sizes,
                                                  params.tsdf_trunc_dist, params.eta, params.tsdf_max_weight);

    /* init gradient */
    kfusion::cuda::CudaData grad_data;
    grad_data.create(no_voxels * sizeof(float4));
    sobfu::device::TsdfGradient gradient_device(grad_data.ptr<float4>(), dims);

    sobfu::device::TsdfDifferentiator diff(phi_global_device);
    diff.calculate(gradient_device);

    /* test that the gradient magnitude is 0 in the truncated regions and close to 1 elsewhere */
    float2 *tsdf_ptr = new float2[no_voxels];
    phi_global->data().download(tsdf_ptr);

    float4 *grad_ptr = new float4[no_voxels];
    grad_data.download(grad_ptr);

    int sizes[3] = {dims.x, dims.y, dims.z};
    for (int i = 1; i < sizes[0] - 1; i++) {
        for (int j = 1; j < sizes[1] - 1; j++) {
            for (int k = 1; k < sizes[2] - 1; k++) {
                float tsdf_val  = (*(tsdf_ptr + i + j * sizes[0] + k * sizes[1] * sizes[0])).x;
                float4 grad_val = *(grad_ptr + i + j * sizes[0] + k * sizes[1] * sizes[0]);

                float norm = sqrtf(grad_val.x * grad_val.x + grad_val.y * grad_val.y + grad_val.z * grad_val.z);
                if (fabs(tsdf_val) < 1.f) { /* only check for non-truncated voxels */
                    ASSERT_NEAR(norm, voxel_sizes.x / params.tsdf_trunc_dist, 0.15f);
                }
            }
        }
    }
}

/* test that the jacobian of a uniform vector field is null */
TEST_F(DeformationFieldTest, UniformFieldJacobianTest) {
    /* init psi to a uniform field */
    int sizes[3] = {dims.x, dims.y, dims.z};

    cv::Mat *matrix = new cv::Mat(3, sizes, CV_32FC4);
    for (int i = 0; i < sizes[0]; i++) {
        for (int j = 0; j < sizes[1]; j++) {
            for (int k = 0; k < sizes[2]; k++) {
                matrix->at<float4>(k, j, i) = make_float4(1.f, 1.f, 1.f, 0.f);
            }
        }
    }

    kfusion::cuda::CudaData psi_data;
    psi_data.create(no_voxels * sizeof(float4));
    psi_data.upload(matrix->ptr<float4>(), no_voxels * sizeof(float4));
    sobfu::device::DeformationField psi_device(psi_data.ptr<float4>(), dims);

    /* init & clear the jacobian */
    kfusion::cuda::CudaData jacobian_data;
    jacobian_data.create(no_voxels * sizeof(Mat4f));
    sobfu::device::Jacobian J(jacobian_data.ptr<Mat4f>(), dims);

    /* calculate the jacobian */
    sobfu::device::Differentiator diff(psi_device);
    diff.calculate(J);

    /* test that the jacobian is null */
    Mat4f *ptr = new Mat4f[no_voxels];
    jacobian_data.download(ptr);

    for (int i = 0; i < sizes[0]; i++) {
        for (int j = 0; j < sizes[1]; j++) {
            for (int k = 0; k < sizes[2]; k++) {
                Mat4f J_val = *(ptr + i + j * sizes[0] + k * sizes[1] * sizes[0]);

                for (int r = 0; r < 3; r++) {
                    ASSERT_NEAR(J_val.data[r].x, 0.f, epsilon);
                    ASSERT_NEAR(J_val.data[r].y, 0.f, epsilon);
                    ASSERT_NEAR(J_val.data[r].z, 0.f, epsilon);
                }
            }
        }
    }
}

/* test the calculation of the jacobian of a simple vector field */
TEST_F(DeformationFieldTest, JacobianTestSimple) {
    /* init psi */
    int sizes[3] = {dims.x, dims.y, dims.z};

    cv::Mat *matrix = new cv::Mat(3, sizes, CV_32FC4);
    for (int i = 0; i < sizes[0]; i++) {
        for (int j = 0; j < sizes[1]; j++) {
            for (int k = 0; k < sizes[2]; k++) {
                matrix->at<float4>(k, j, i) = make_float4(i, j, k, 0.f);
            }
        }
    }

    kfusion::cuda::CudaData psi_data;
    psi_data.create(no_voxels * sizeof(float4));
    psi_data.upload(matrix->ptr<float4>(), no_voxels * sizeof(float4));
    sobfu::device::DeformationField psi_device(psi_data.ptr<float4>(), dims);

    /* init the jacobian */
    kfusion::cuda::CudaData jacobian_data;
    jacobian_data.create(no_voxels * sizeof(Mat4f));
    sobfu::device::Jacobian J(jacobian_data.ptr<Mat4f>(), dims);

    /* calculate the jacobian */
    sobfu::device::Differentiator diff(psi_device);
    diff.calculate(J);

    /* test that the jacobian values are correct */
    Mat4f *ptr = new Mat4f[no_voxels];
    jacobian_data.download(ptr);

    for (int i = 1; i < sizes[0] - 1; i++) {
        for (int j = 1; j < sizes[1] - 1; j++) {
            for (int k = 1; k < sizes[2] - 1; k++) {
                Mat4f J_val = *(ptr + i + j * sizes[0] + k * sizes[1] * sizes[0]);

                ASSERT_NEAR(J_val.data[0].x, 1.f, epsilon);
                ASSERT_NEAR(J_val.data[0].y, 0.f, epsilon);
                ASSERT_NEAR(J_val.data[0].z, 0.f, epsilon);

                ASSERT_NEAR(J_val.data[1].x, 0.f, epsilon);
                ASSERT_NEAR(J_val.data[1].y, 1.f, epsilon);
                ASSERT_NEAR(J_val.data[1].z, 0.f, epsilon);

                ASSERT_NEAR(J_val.data[2].x, 0.f, epsilon);
                ASSERT_NEAR(J_val.data[2].y, 0.f, epsilon);
                ASSERT_NEAR(J_val.data[2].z, 1.f, epsilon);
            }
        }
    }
}

/* test the computation of the jacobian and laplacian of a vector field */
TEST_F(DeformationFieldTest, JacobianLaplacianTestComplicated) {
    /* init psi */
    int sizes[3] = {dims.x, dims.y, dims.z};

    cv::Mat *matrix = new cv::Mat(3, sizes, CV_32FC4);
    for (int i = 0; i < sizes[0]; i++) {
        for (int j = 0; j < sizes[1]; j++) {
            for (int k = 0; k < sizes[2]; k++) {
                matrix->at<float4>(k, j, i) = make_float4(i * (1.f - j), exp(-k) + j, k, 0.f);
            }
        }
    }

    kfusion::cuda::CudaData psi_data;
    psi_data.create(no_voxels * sizeof(float4));
    psi_data.upload(matrix->ptr<float4>(), no_voxels * sizeof(float4));
    sobfu::device::DeformationField psi_device(psi_data.ptr<float4>(), dims);

    /* init the jacobian */
    kfusion::cuda::CudaData J_data;
    J_data.create(no_voxels * sizeof(Mat4f));
    sobfu::device::Jacobian J(J_data.ptr<Mat4f>(), dims);

    /* calculate the jacobian */
    sobfu::device::Differentiator diff(psi_device);
    diff.calculate(J);

    /* test that the jacobian values are correct */
    Mat4f *ptr = new Mat4f[no_voxels];
    J_data.download(ptr);

    /*	       (1-y  -x      0    )
     * J_psi = ( 0    1  -exp(-z) )
     * 	       ( 0    0      1    )
     */

    for (int i = 1; i < sizes[0] - 1; i++) {
        for (int j = 1; j < sizes[1] - 1; j++) {
            for (int k = 1; k < sizes[2] - 1; k++) {
                Mat4f J_val = *(ptr + i + j * sizes[0] + k * sizes[1] * sizes[0]);

                ASSERT_NEAR(J_val.data[0].x, 1.f - j, max_error);
                ASSERT_NEAR(J_val.data[0].y, -i, max_error);
                ASSERT_NEAR(J_val.data[0].z, 0.f, max_error);

                ASSERT_NEAR(J_val.data[1].x, 0, max_error);
                ASSERT_NEAR(J_val.data[1].y, 1.f, max_error);
                ASSERT_NEAR(J_val.data[1].z, -exp(-k), max_error);

                ASSERT_NEAR(J_val.data[2].x, 0.f, max_error);
                ASSERT_NEAR(J_val.data[2].y, 0.f, max_error);
                ASSERT_NEAR(J_val.data[2].z, 1.f, max_error);
            }
        }
    }

    /* init the laplacian */
    kfusion::cuda::CudaData laplacian_data;
    laplacian_data.create(no_voxels * sizeof(float4));
    sobfu::device::Laplacian L(laplacian_data.ptr<float4>(), dims);

    /* calculate the laplacian */
    sobfu::device::SecondOrderDifferentiator secondOrderDiff(psi_device);
    secondOrderDiff.calculate(L);

    /* test that the laplacian values are correct */
    float4 *L_ptr = new float4[no_voxels];
    laplacian_data.download(L_ptr);

    /*
     * expecting -(0  exp(-z)  0)--we calculate the negative laplacian for simplicity
     */

    for (int i = 1; i < sizes[0] - 1; i++) {
        for (int j = 1; j < sizes[1] - 1; j++) {
            for (int k = 1; k < sizes[2] - 1; k++) {
                float4 L_val = *(L_ptr + i + j * sizes[0] + k * sizes[1] * sizes[0]);

                ASSERT_NEAR(L_val.x, 0.f, max_error);
                ASSERT_NEAR(L_val.y, -exp(-k), max_error);
                ASSERT_NEAR(L_val.z, 0.f, max_error);
            }
        }
    }
}


================================================
FILE: test/main.cpp
================================================
#include "gtest/gtest.h"

int main(int argc, char **argv) {
    ::testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}


================================================
FILE: test/reductions_test.cpp
================================================
/* gtest includes */
#include <gtest/gtest.h>

/* sobfu includes */
#include <sobfu/params.hpp>
#include <sobfu/solver.hpp>

/* kinfu includes */
#include <kfusion/internal.hpp>
#include <kfusion/precomp.hpp>
#include <kfusion/types.hpp>

/* sys headers */
#include <cmath>
#include <ctgmath>
#include <iostream>
#include <memory>

class ReductionsTest : public ::testing::Test {
protected:
    /* You can remove any or all of the following functions if its body
     * is empty. */

    /* You can do set-up work for each test here. */
    ReductionsTest() = default;

    /* You can do clean-up work that doesn't throw exceptions here. */
    ~ReductionsTest() override = default;

    /* If the constructor and destructor are not enough for setting up
     * and cleaning up each test, you can define the following methods: */

    /* Code here will be called immediately after the constructor (right
     * before each test). */
    void SetUp() override {
        /* verbosity */
        params.verbosity = 2;

        /* volume params */
        params.volume_dims = cv::Vec3i::all(64);
        params.volume_size = cv::Vec3f::all(0.25f);

        params.tsdf_trunc_dist = 5.f * params.volume_size[0] / static_cast<float>(params.volume_dims[0]);
        params.eta             = 2.f * params.volume_size[0] / static_cast<float>(params.volume_dims[0]);

        dims        = kfusion::device_cast<kfusion::device::Vec3i>(params.volume_dims);
        voxel_sizes = kfusion::device_cast<kfusion::device::Vec3f>(params.voxel_sizes());

        no_voxels = dims.x * dims.y * dims.z;

        /* init tsdf's */
        phi_global = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));
        phi_n      = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));

        /* init reduction class */
        reductor = new sobfu::device::Reductor(dims, voxel_sizes.x, params.tsdf_trunc_dist);
    }

    /* Code here will be called immediately after each test (right
     * before the destructor). */
    void TearDown() override {}

    /* Objects declared here can be used by all tests in the test case for Solver. */

    /* sobfu params */
    Params params;

    int3 dims;
    float3 voxel_sizes;

    int no_voxels;

    /* global and live tsdf's */
    cv::Ptr<kfusion::cuda::TsdfVolume> phi_global, phi_n;

    /* reduction class */
    sobfu::device::Reductor* reductor;

    /* max. error */
    float max_error = 1e-1f;
    /* floating point precision */
    const float epsilon = 1e-5f;
};

/* test calculation of the energy functional data term */
TEST_F(ReductionsTest, DataTermTest) {
    /* init tsdf's */
    kfusion::device::TsdfVolume phi_n_device(phi_n->data().ptr<float2>(), dims, voxel_sizes, params.tsdf_trunc_dist,
                                             params.eta, params.tsdf_max_weight);
    kfusion::device::clear_volume(phi_n_device); /* will contain all 0's */

    float3 c = make_float3(5.f, 5.f, 5.f);
    float r  = 0.01f;
    phi_global->initSphere(c, r); /* will contain all 1's */

    kfusion::device::TsdfVolume phi_global_device(phi_global->data().ptr<float2>(), dims, voxel_sizes,
                                                  params.tsdf_trunc_dist, params.eta, params.tsdf_max_weight);

    float data_energy = reductor->data_energy(phi_global->data().ptr<float2>(), phi_n->data().ptr<float2>());
    ASSERT_NEAR(data_energy, 0.5f * static_cast<float>(no_voxels) * 1.f, max_error);
}


================================================
FILE: test/solver_test.cpp
================================================
/* gtest includes */
#include <gtest/gtest.h>

/* sobfu includes */
#include <sobfu/params.hpp>
#include <sobfu/solver.hpp>

/* kinfu includes */
#include <kfusion/internal.hpp>
#include <kfusion/precomp.hpp>
#include <kfusion/types.hpp>

/* sys headers */
#include <cmath>
#include <ctgmath>
#include <iostream>
#include <memory>

class SolverTest : public ::testing::Test {
protected:
    /* You can remove any or all of the following functions if its body
     * is empty. */

    /* You can do set-up work for each test here. */
    SolverTest() = default;

    /* You can do clean-up work that doesn't throw exceptions here. */
    ~SolverTest() override = default;

    /* If the constructor and destructor are not enough for setting up
     * and cleaning up each test, you can define the following methods: */

    /* Code here will be called immediately after the constructor (right
     * before each test). */
    void SetUp() override {
        /* verbosity */
        params.verbosity = 1;

        /* volume params */
        params.volume_dims = cv::Vec3i::all(64);
        params.volume_size = cv::Vec3f::all(0.25f);

        params.tsdf_trunc_dist       = 10.f * params.volume_size[0] / static_cast<float>(params.volume_dims[0]);
        params.gradient_delta_factor = 0.1f;

        params.eta = 2.f * params.volume_size[0] / static_cast<float>(params.volume_dims[0]);

        /* camera params */
        params.intr = kfusion::Intr(1.f, 1.f, 0.f, 0.f);

        /* solver params */
        params.max_iter        = 2048;
        params.max_update_norm = -1.f;

        params.s      = 7;
        params.lambda = 0.1f;

        params.alpha = 0.001f;
        params.w_reg = 0.4f;

        dims        = kfusion::device_cast<kfusion::device::Vec3i>(params.volume_dims);
        voxel_sizes = kfusion::device_cast<kfusion::device::Vec3f>(params.voxel_sizes());

        no_voxels = dims.x * dims.y * dims.z;

        /* init tsdf's */
        phi_global         = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));
        phi_global_psi_inv = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));
        phi_n              = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));
        phi_n_psi          = cv::Ptr<kfusion::cuda::TsdfVolume>(new kfusion::cuda::TsdfVolume(params));

        mc = cv::Ptr<kfusion::cuda::MarchingCubes>(new kfusion::cuda::MarchingCubes());

        /* init psi */
        psi     = std::make_shared<sobfu::cuda::DeformationField>(params.volume_dims);
        psi_inv = std::make_shared<sobfu::cuda::DeformationField>(params.volume_dims);
    }

    /* Code here will be called immediately after each test (right
     * before the destructor). */
    void TearDown() override {}

    /* Objects declared here can be used by all tests in the test case for Solver. */

    /* sobfu params */
    Params params;

    int3 dims;
    float3 voxel_sizes;

    int no_voxels;

    /* global and live tsdf's */
    cv::Ptr<kfusion::cuda::TsdfVolume> phi_global, phi_global_psi_inv, phi_n, phi_n_psi;
    /* marching cubes--for visualisation purposes */
    cv::Ptr<kfusion::cuda::MarchingCubes> mc;
    /* deformation field */
    std::shared_ptr<sobfu::cuda::DeformationField> psi, psi_inv;
    /* solver */
    std::shared_ptr<sobfu::cuda::Solver> solver;

    /* max. error */
    float max_error = 1e-1f;
    /* floating point precision */
    const float epsilon = 1e-5f;
};

/* test the alignment of 2 spheres */
TEST_F(SolverTest, AlignmentTestSphereTranslation) {
    /* set parameters */
    params.max_iter = 2048;

    params.alpha = 0.01f;
    params.w_reg = 0.4f;

    /* init solver */
    solver = std::make_shared<sobfu::cuda::Solver>(params);

    /* init phi_global and phi_n */
    int sizes[3] = {dims.x, dims.y, dims.z};

    float3 c = make_float3(0.13f, 0.13f, 0.13f);
    float r  = 0.012f;
    phi_global->initSphere(c, r);

    c = make_float3(0.125f, 0.13f, 0.13f);
    phi_n->initSphere(c, r);
    phi_n_psi->initSphere(c, r);

    /* calculate psi and apply it to phi_n */
    solver->estimate_psi(phi_global, phi_global_psi_inv, phi_n, phi_n_psi, psi, psi_inv);
}

/* test the alignment of a sphere translating and expanding */
TEST_F(SolverTest, AlignmentTestSphereExpanding) {
    /* set parameters */
    params.max_iter = 2048;

    params.alpha = 0.005f;
    params.w_reg = 0.4f;

    /* init solver */
    solver = std::make_shared<sobfu::cuda::Solver>(params);

    /* init phi_global and phi_n */
    int sizes[3] = {dims.x, dims.y, dims.z};

    float3 c = make_float3(0.13f, 0.13f, 0.13f);
    float r  = 0.012f;
    phi_global->initSphere(c, r);

    c = make_float3(0.125f, 0.13f, 0.13f);
    r = 0.0145f;
    phi_n->initSphere(c, r);
    phi_n_psi->initSphere(c, r);

    /* calculate psi and apply it to phi_n */
    solver->estimate_psi(phi_global, phi_global_psi_inv, phi_n, phi_n_psi, psi, psi_inv);
}

/* test the alignment of 2 spheres over 3 frames */
TEST_F(SolverTest, SerialAlignmentTest) {
    /* set parameters */
    params.max_iter = 2048;

    params.alpha = 0.005f;
    params.w_reg = 0.4f;

    /* init solver */
    solver = std::make_shared<sobfu::cuda::Solver>(params);

    /* init phi_global and phi_n */
    int sizes[3] = {dims.x, dims.y, dims.z};

    float3 c = make_float3(0.13f, 0.13f, 0.13f);
    float r  = 0.02f;
    phi_global->initSphere(c, r);

    c = make_float3(0.125f, 0.13f, 0.132f);
    phi_n->initSphere(c, r);
    phi_n_psi->initSphere(c, r);

    /*
     * FRAME 1 to FRAME 0
     *
     */

    std::cout << "\nFRAME 1\n" << std::endl;

    /* calculate psi and apply it to phi_n */
    solver->estimate_psi(phi_global, phi_global_psi_inv, phi_n, phi_n_psi, psi, psi_inv);

    /*
     * FRAME 2 to FRAME 0
     *
     */

    std::cout << "\nFRAME 2\n" << std::endl;

    phi_n->clear();
    phi_n_psi->clear();

    c = make_float3(0.123f, 0.13f, 0.132f);
    phi_n->initSphere(c, r);
    psi->apply(phi_n, phi_n_psi);

    solver->estimate_psi(phi_global, phi_global_psi_inv, phi_n, phi_n_psi, psi, psi_inv);
}