Repository: ACDSLab/MPPI-Generic Branch: main Commit: b5c8daab6a91 Files: 295 Total size: 2.3 MB Directory structure: gitextract_ky3rcz46/ ├── .clang-format ├── .gitattributes ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── VERSION ├── cmake/ │ ├── Config.cmake.in │ ├── MPPIGenericToolsConfig.cmake │ └── Modules/ │ ├── CMakeLists.txt.gtest.in │ └── cmake_uninstall.cmake.in ├── doc/ │ └── feedback.md ├── examples/ │ ├── CMakeLists.txt │ ├── cartpole_example.cu │ ├── double_integrator_CORL2020.cu │ └── double_integrator_example.cu ├── include/ │ └── mppi/ │ ├── controllers/ │ │ ├── ColoredMPPI/ │ │ │ ├── colored_mppi_controller.cu │ │ │ └── colored_mppi_controller.cuh │ │ ├── MPPI/ │ │ │ ├── mppi_controller.cu │ │ │ └── mppi_controller.cuh │ │ ├── Primitives/ │ │ │ ├── primitives_controller.cu │ │ │ └── primitives_controller.cuh │ │ ├── R-MPPI/ │ │ │ ├── robust_mppi_controller.cu │ │ │ └── robust_mppi_controller.cuh │ │ ├── Tube-MPPI/ │ │ │ ├── tube_mppi_controller.cu │ │ │ └── tube_mppi_controller.cuh │ │ ├── controller.cu │ │ └── controller.cuh │ ├── core/ │ │ ├── base_plant.hpp │ │ ├── buffer.hpp │ │ ├── buffered_plant.hpp │ │ ├── mppi_common.cu │ │ ├── mppi_common.cuh │ │ ├── rmppi_kernels.cu │ │ └── rmppi_kernels.cuh │ ├── cost_functions/ │ │ ├── autorally/ │ │ │ ├── ar_robust_cost.cu │ │ │ ├── ar_robust_cost.cuh │ │ │ ├── ar_standard_cost.cu │ │ │ └── ar_standard_cost.cuh │ │ ├── cartpole/ │ │ │ ├── cartpole_quadratic_cost.cu │ │ │ └── cartpole_quadratic_cost.cuh │ │ ├── cost.cu │ │ ├── cost.cuh │ │ ├── double_integrator/ │ │ │ ├── double_integrator_circle_cost.cu │ │ │ ├── double_integrator_circle_cost.cuh │ │ │ ├── double_integrator_robust_cost.cu │ │ │ └── double_integrator_robust_cost.cuh │ │ ├── quadratic_cost/ │ │ │ ├── quadratic_cost.cu │ │ │ └── quadratic_cost.cuh │ │ └── quadrotor/ │ │ ├── quadrotor_map_cost.cu │ │ ├── quadrotor_map_cost.cuh │ │ ├── quadrotor_quadratic_cost.cu │ │ └── quadrotor_quadratic_cost.cuh │ ├── 
ddp/ │ │ ├── boxqp.h │ │ ├── ddp.h │ │ ├── ddp_costs.h │ │ ├── ddp_dynamics.h │ │ ├── ddp_model_wrapper.h │ │ ├── ddp_tracking_costs.h │ │ ├── result.h │ │ └── util.h │ ├── dynamics/ │ │ ├── autorally/ │ │ │ ├── ar_nn_model.cu │ │ │ └── ar_nn_model.cuh │ │ ├── bicycle_slip/ │ │ │ ├── bicycle_slip_parametric.cu │ │ │ └── bicycle_slip_parametric.cuh │ │ ├── cartpole/ │ │ │ ├── cartpole_dynamics.cu │ │ │ └── cartpole_dynamics.cuh │ │ ├── double_integrator/ │ │ │ ├── di_dynamics.cu │ │ │ └── di_dynamics.cuh │ │ ├── dubins/ │ │ │ ├── dubins.cu │ │ │ └── dubins.cuh │ │ ├── dynamics.cu │ │ ├── dynamics.cuh │ │ ├── linear/ │ │ │ ├── linear.cu │ │ │ └── linear.cuh │ │ ├── quadrotor/ │ │ │ ├── quadrotor_dynamics.cu │ │ │ └── quadrotor_dynamics.cuh │ │ ├── racer_dubins/ │ │ │ ├── racer_dubins.cu │ │ │ ├── racer_dubins.cuh │ │ │ ├── racer_dubins_elevation.cu │ │ │ ├── racer_dubins_elevation.cuh │ │ │ ├── racer_dubins_elevation_lstm_steering.cu │ │ │ ├── racer_dubins_elevation_lstm_steering.cuh │ │ │ ├── racer_dubins_elevation_lstm_unc.cu │ │ │ ├── racer_dubins_elevation_lstm_unc.cuh │ │ │ ├── racer_dubins_elevation_suspension_lstm.cu │ │ │ └── racer_dubins_elevation_suspension_lstm.cuh │ │ └── racer_suspension/ │ │ ├── racer_suspension.cu │ │ └── racer_suspension.cuh │ ├── feedback_controllers/ │ │ ├── CCM/ │ │ │ └── ccm.h │ │ ├── DDP/ │ │ │ ├── ddp.cu │ │ │ └── ddp.cuh │ │ ├── feedback.cu │ │ └── feedback.cuh │ ├── instantiations/ │ │ ├── autorally_mppi/ │ │ │ └── autorally_mppi.cuh │ │ ├── cartpole_mppi/ │ │ │ └── cartpole_mppi.cuh │ │ ├── double_integrator_mppi/ │ │ │ └── double_integrator_mppi.cuh │ │ └── quadrotor_mppi/ │ │ └── quadrotor_mppi.cuh │ ├── sampling_distributions/ │ │ ├── colored_noise/ │ │ │ ├── colored_noise.cu │ │ │ └── colored_noise.cuh │ │ ├── gaussian/ │ │ │ ├── gaussian.cu │ │ │ └── gaussian.cuh │ │ ├── nln/ │ │ │ ├── nln.cu │ │ │ └── nln.cuh │ │ ├── piecewise_linear/ │ │ │ └── piecewise_linear_noise.cuh │ │ ├── sampling_distribution.cu │ │ ├── 
sampling_distribution.cuh │ │ └── smooth-MPPI/ │ │ ├── smooth-MPPI.cu │ │ └── smooth-MPPI.cuh │ ├── shaping_functions/ │ │ ├── CEM/ │ │ │ ├── cem_shaping_function.cu │ │ │ └── cem_shaping_function.cuh │ │ ├── shaping_function.cu │ │ └── shaping_function.cuh │ ├── utils/ │ │ ├── activation_functions.cuh │ │ ├── angle_utils.cuh │ │ ├── cuda_math_utils.cuh │ │ ├── eigen_type_conversions.h │ │ ├── file_utils.h │ │ ├── gpu_err_chk.cuh │ │ ├── logger.hpp │ │ ├── managed.cuh │ │ ├── math_utils.h │ │ ├── matrix_mult_utils.cuh │ │ ├── nn_helpers/ │ │ │ ├── fnn_helper.cu │ │ │ ├── fnn_helper.cuh │ │ │ ├── lstm_helper.cu │ │ │ ├── lstm_helper.cuh │ │ │ ├── lstm_lstm_helper.cu │ │ │ ├── lstm_lstm_helper.cuh │ │ │ └── meta_math.h │ │ ├── numerical_integration.h │ │ ├── parallel_utils.cuh │ │ ├── risk_utils.cu │ │ ├── risk_utils.cuh │ │ ├── test_helper.h │ │ ├── texture_helpers/ │ │ │ ├── texture_helper.cu │ │ │ ├── texture_helper.cuh │ │ │ ├── three_d_texture_helper.cu │ │ │ ├── three_d_texture_helper.cuh │ │ │ ├── two_d_texture_helper.cu │ │ │ └── two_d_texture_helper.cuh │ │ └── type_printing.h │ └── version.h.in ├── resources/ │ ├── PF_1000_cell_init.npz │ ├── PF_1000_hidden_init.npz │ ├── PF_1000_lstm.npz │ ├── PF_1000_output.npz │ ├── ackerman_test.npz │ ├── autorally_nnet_09_12_2018.npz │ ├── bicycle_slip_hybrid.npz │ ├── bicycle_slip_hybrid_test.npz │ ├── bicycle_slip_kinematic_test.npz │ ├── bicycle_slip_test.npz │ ├── body_loss_bicycle_slip_kinematic_2023_03_10-12_54_39.npz │ ├── ccrf_track.npz │ ├── cell_init.npz │ ├── hidden_init.npz │ ├── lstm.npz │ ├── lstm_lstm_ackerman.npz │ ├── lstm_lstm_bicycle_slip_kinematic.npz │ ├── lstm_lstm_steering.npz │ ├── lstm_lstm_steering_accel.npz │ ├── lstm_lstm_test.npz │ ├── network_rde_2024_03_22-19_00_26.npz │ ├── network_rde_test.npz │ ├── output.npz │ ├── sim_PF_200_cell_init.npz │ ├── sim_PF_200_hidden_init.npz │ ├── sim_PF_200_lstm.npz │ └── sim_PF_200_output.npz ├── scripts/ │ ├── autorally/ │ │ ├── lstm_converter.py │ │ 
└── test/ │ │ ├── generateTestMaps.py │ │ └── generateTestNetwork.py │ ├── colored_noise.py │ ├── double_integrator/ │ │ ├── generate_free_energy_video.py │ │ ├── generate_trajectory_video.py │ │ ├── plot_DI_test_trajectories.py │ │ ├── plot_DI_test_trajectories_tube_only.py │ │ └── plot_stuff.py │ └── run_autoformatter.sh ├── src/ │ ├── CMakeLists.txt │ └── controllers/ │ ├── CMakeLists.txt │ ├── autorally/ │ │ ├── CMakeLists.txt │ │ └── autorally_mppi.cu │ ├── cartpole/ │ │ ├── CMakeLists.txt │ │ └── cartpole_mppi.cu │ ├── double_integrator/ │ │ ├── CMakeLists.txt │ │ └── double_integrator_mppi.cu │ └── quadrotor/ │ ├── CMakeLists.txt │ └── quadrotor_mppi.cu └── tests/ ├── CMakeLists.txt ├── controllers/ │ ├── CMakeLists.txt │ ├── controller_generic_tests.cu │ ├── controller_kernel_testing.cu │ ├── rmppi_test.cu │ ├── tube_mppi_test.cu │ └── vanilla_mppi_test.cu ├── cost_functions/ │ ├── CMakeLists.txt │ ├── autorally_robust_cost_test.cu │ ├── autorally_standard_cost_test.cu │ ├── cartpole_quadratic_cost_test.cu │ ├── general_cost_test.cu │ ├── general_quadratic_costs.cu │ ├── quadrotor_map_cost_test.cu │ └── quadrotor_quadratic_cost_test.cu ├── dynamics/ │ ├── CMakeLists.txt │ ├── angle_utils_test.cu │ ├── ar_dynamics_nn_test.cu │ ├── bicycle_slip_parametric_model_test.cu │ ├── cartpole_dynamics_tests.cu │ ├── dubins_dynamics_tests.cu │ ├── dynamics_generic_tests.cu │ ├── linear_dynamics_tests.cu │ ├── quadrotor_dynamics_tests.cu │ ├── racer_dubins_elevation_lstm_steering_model_test.cu │ ├── racer_dubins_elevation_lstm_uncertainty_model_test.cu │ ├── racer_dubins_elevation_model_test.cu │ ├── racer_dubins_elevation_suspension_test.cu │ ├── racer_dubins_model_test.cu │ └── racer_suspension_model_test.cu ├── feedback_controllers/ │ ├── CMakeLists.txt │ ├── ddp_test.cu │ └── generic_feedback_controller_test.cu ├── include/ │ ├── kernel_tests/ │ │ ├── core/ │ │ │ ├── normexp_kernel_test.cu │ │ │ ├── normexp_kernel_test.cuh │ │ │ ├── rmppi_kernel_test.cu │ │ │ ├── 
rmppi_kernel_test.cuh │ │ │ ├── rollout_kernel_test.cu │ │ │ ├── rollout_kernel_test.cuh │ │ │ ├── weightedreduction_kernel_test.cu │ │ │ └── weightedreduction_kernel_test.cuh │ │ ├── cost_functions/ │ │ │ ├── autorally/ │ │ │ │ ├── ar_robust_cost_kernel_test.cu │ │ │ │ ├── ar_robust_cost_kernel_test.cuh │ │ │ │ ├── ar_standard_cost_kernel_test.cu │ │ │ │ └── ar_standard_cost_kernel_test.cuh │ │ │ ├── cartpole/ │ │ │ │ ├── cartpole_quadratic_cost_kernel_test.cu │ │ │ │ └── cartpole_quadratic_cost_kernel_test.cuh │ │ │ ├── cost_generic_kernel_tests.cu │ │ │ └── cost_generic_kernel_tests.cuh │ │ ├── dynamics/ │ │ │ ├── autorally/ │ │ │ │ ├── ar_nn_dynamics_kernel_test.cu │ │ │ │ └── ar_nn_dynamics_kernel_test.cuh │ │ │ ├── cartpole/ │ │ │ │ ├── cartpole_dynamics_kernel_test.cu │ │ │ │ └── cartpole_dynamics_kernel_test.cuh │ │ │ ├── dynamics_generic_kernel_tests.cu │ │ │ └── dynamics_generic_kernel_tests.cuh │ │ ├── shaping_functions/ │ │ │ ├── shaping_function_kernels_tests.cu │ │ │ └── shaping_function_kernels_tests.cuh │ │ └── utils/ │ │ ├── network_helper_kernel_test.cuh │ │ └── texture_test_kernels.cuh │ └── mppi_test/ │ └── mock_classes/ │ ├── mock_classes.h │ ├── mock_controller.h │ ├── mock_costs.h │ ├── mock_dynamics.h │ ├── mock_feedback.h │ └── mock_sampler.h ├── integration/ │ ├── CMakeLists.txt │ └── integrator_tests.cu ├── math_utils/ │ ├── CMakeLists.txt │ ├── cuda_math_utils_tests.cu │ └── math_utils_test.cu ├── misc/ │ ├── CMakeLists.txt │ ├── di_dynamics_kernel_tests.cu │ ├── di_dynamics_kernel_tests.cuh │ └── miscellaneous_tests.cu ├── mppi_core/ │ ├── CCM_tests.cu │ ├── CMakeLists.txt │ ├── base_plant_tester.cu │ ├── buffered_plant_tester.cu │ ├── normexp_kernel_tests.cu │ ├── rmppi_kernel_tests.cu │ ├── rollout_kernel_tests.cu │ ├── viz_kernels_test.cu │ └── weightedreduction_kernel_tests.cu ├── nn_helpers/ │ ├── CMakeLists.txt │ ├── activation_functions_tests.cu │ ├── fnn_helper_test.cu │ ├── lstm_helper_test.cu │ └── lstm_lstm_helper_test.cu ├── 
sampling_distributions/ │ ├── CMakeLists.txt │ ├── colored_noise_tests.cu │ ├── gaussian_noise_tests.cu │ └── generic_sampling_distribution_tests.cu ├── shaping_functions/ │ ├── CMakeLists.txt │ ├── cem_shaping_function_test.cu │ └── shaping_function_test.cu ├── templated_headers/ │ ├── autorally_test_map.h.in │ ├── autorally_test_network.h.in │ ├── racer_test_networks.h.in │ └── test_networks.h.in ├── test_main.cpp └── texture_helpers/ ├── CMakeLists.txt ├── texture_helper_test.cu ├── three_d_texture_helper_test.cu └── two_d_texture_helper_test.cu ================================================ FILE CONTENTS ================================================ ================================================ FILE: .clang-format ================================================ --- # roscpp style file taken from https://github.com/PickNikRobotics/roscpp_code_format.git BasedOnStyle: Google AccessModifierOffset: -2 ConstructorInitializerIndentWidth: 2 AlignEscapedNewlinesLeft: false AlignTrailingComments: true AllowAllParametersOfDeclarationOnNextLine: false AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AllowShortFunctionsOnASingleLine: None AlwaysBreakTemplateDeclarations: true AlwaysBreakBeforeMultilineStrings: true BreakBeforeBinaryOperators: false BreakBeforeTernaryOperators: false BreakConstructorInitializersBeforeComma: true BinPackParameters: true ColumnLimit: 120 ConstructorInitializerAllOnOneLineOrOnePerLine: true DerivePointerBinding: false PointerBindsToType: true ExperimentalAutoDetectBinPacking: false IndentCaseLabels: true MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCSpaceBeforeProtocolList: true PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 60 PenaltyBreakString: 1 PenaltyBreakFirstLessLess: 1000 PenaltyExcessCharacter: 1000 PenaltyReturnTypeOnItsOwnLine: 90 SpacesBeforeTrailingComments: 2 Cpp11BracedListStyle: false Standard: Auto IndentWidth: 2 TabWidth: 2 UseTab: Never 
IndentFunctionDeclarationAfterType: false SpacesInParentheses: false SpacesInAngles: false SpaceInEmptyParentheses: false SpacesInCStyleCastParentheses: false SpaceAfterControlStatementKeyword: true SpaceBeforeAssignmentOperators: true ContinuationIndentWidth: 4 SortIncludes: false SpaceAfterCStyleCast: false # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: { AfterClass: 'true' AfterControlStatement: 'true' AfterEnum : 'true' AfterFunction : 'true' AfterNamespace : 'true' AfterStruct : 'true' AfterUnion : 'true' BeforeCatch : 'true' BeforeElse : 'true' IndentBraces : 'false' } ... ================================================ FILE: .gitattributes ================================================ *.npz filter=lfs diff=lfs merge=lfs -text ================================================ FILE: .gitignore ================================================ # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # clion ide .idea cmake-build-debug* cmake-build-release* .clion* # vscode ide .vscode # Build directories build # clangd build .clangd compile_commands.json .cache # resouces folder resource # video files *.mp4 *.mpeg ================================================ FILE: .gitmodules ================================================ [submodule "submodules/cnpy"] path = submodules/cnpy url = https://github.com/ACDSLab/cnpy.git ================================================ FILE: CMakeLists.txt ================================================ if (CMAKE_VERSION VERSION_LESS 3.12) cmake_minimum_required(VERSION 3.8) else() cmake_minimum_required(VERSION 3.8...3.24) endif() # https://cmake.org/cmake/help/latest/policy/CMP0104.html if (CMAKE_VERSION VERSION_LESS 3.24 AND 
POLICY CMP0104) cmake_policy(SET CMP0104 OLD) endif() # Get Version from file file(READ "${CMAKE_CURRENT_SOURCE_DIR}/VERSION" MPPI_VERSION_FROM_FILE) # Clean up trailng whitespace and newlines string(STRIP "${MPPI_VERSION_FROM_FILE}" MPPI_VERSION) project(MPPI-Generic VERSION ${MPPI_VERSION} LANGUAGES C CXX CUDA) #################################### # Set up options for configuration # #################################### option(MPPI_BUILD_TESTS "Build unit tests." OFF) option(MPPI_BUILD_EXAMPLES "Build example code." OFF) option(MPPI_USE_CCACHE "Use ccache (when available) to potentially speed up repeated builds" ON) option(MPPI_USE_CUDA_BARRIERS "Compile MPPI-Generic with cuda barrier support. Turn off for GPUs older than the 20 series." ON) option(MPPI_USE_CUDA_BARRIERS_COST "Compile MPPI-Generic with cuda barriers for cost-only kernels when general barrier support is enabled." OFF) option(MPPI_USE_CUDA_BARRIERS_DYN "Compile MPPI-Generic with cuda barriers for dynamics-only kernels when general barrier support is enabled." ON) option(MPPI_USE_CUDA_BARRIERS_ROLLOUT "Compile MPPI-Generic with cuda barriers for combined kernels when general barrier support is enabled." ON) option(MPPI_EXPORT_PACKAGE "Export this folder as the MPPI installation to the global CMake package repository" OFF) set(MPPI_CUDA_ARCH_LIST "" CACHE STRING "Specific CUDA Architectures to build for. 
Leave empty for automatic selection.") # Configure CMake Cuda Flags for MPPI set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${CMAKE_CURRENT_SOURCE_DIR}/cmake") include(MPPIGenericToolsConfig) set(BUILD_FLAGS "-Wuninitialized -Wall") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BUILD_FLAGS}") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w") # Allow the lib location to be overwritten from command line if (NOT CMAKE_LIBRARY_OUTPUT_DIRECTORY) get_filename_component(PROJECT_LIBS_DIR ${PROJECT_BINARY_DIR}/lib ABSOLUTE) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) else() get_filename_component(PROJECT_LIBS_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} ABSOLUTE) endif() # set up cnpy # Don't install cnpy to user package registry set(TMP_PACKAGE_REGISTRY_EXPORT ${CMAKE_EXPORT_NO_PACKAGE_REGISTRY}) set(CMAKE_EXPORT_NO_PACKAGE_REGISTRY ON) add_subdirectory(submodules/cnpy) # Set the no package registry back to default set(CMAKE_EXPORT_NO_PACKAGE_REGISTRY ${TMP_PACKAGE_REGISTRY_EXPORT}) # Find Eigen find_package(Eigen3 REQUIRED) # REQUIRED for CUDA to correctly align Eigen member variables inside structures/classes # https://listengine.tuxfamily.org/lists.tuxfamily.org/eigen/2016/06/msg00006.html # add_definitions(-DEIGEN_MAX_STATIC_ALIGN_BYTES=0) # Generate Version Header File from Project Version configure_file("${PROJECT_SOURCE_DIR}/include/mppi/version.h.in" "${PROJECT_BINARY_DIR}/include/mppi/version.h") # Create a Header-only MPPI Library add_library(${MPPI_HEADER_LIBRARY_NAME} INTERFACE) add_library(MPPI::MPPI ALIAS ${MPPI_HEADER_LIBRARY_NAME}) target_include_directories(${MPPI_HEADER_LIBRARY_NAME} INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "$" "$" "$" ) target_link_libraries(${MPPI_HEADER_LIBRARY_NAME} INTERFACE ${MPPI_GENERIC_CUDA_EXTRA_LIBS} ${CUDA_LIBRARIES} cnpy Eigen3::Eigen ) target_include_directories(${MPPI_HEADER_LIBRARY_NAME} INTERFACE "$" "$" ) # --display-error-number was added as a compilation flag in CUDA 11.2 if (${CUDA_VERSION} 
VERSION_GREATER_EQUAL "11.2") target_compile_options(${MPPI_HEADER_LIBRARY_NAME} INTERFACE $<$:--display-error-number> ) endif() # Allow the ability to turn off cuda barriers through CMake. Necessary for GPUs older than # the 20 series if (MPPI_USE_CUDA_BARRIERS) target_compile_definitions(${MPPI_HEADER_LIBRARY_NAME} INTERFACE CMAKE_USE_CUDA_BARRIERS) if (MPPI_USE_CUDA_BARRIERS_DYN) target_compile_definitions(${MPPI_HEADER_LIBRARY_NAME} INTERFACE CMAKE_USE_CUDA_BARRIERS_DYN) endif() if (MPPI_USE_CUDA_BARRIERS_ROLLOUT) target_compile_definitions(${MPPI_HEADER_LIBRARY_NAME} INTERFACE CMAKE_USE_CUDA_BARRIERS_ROLLOUT) endif() if (MPPI_USE_CUDA_BARRIERS_COST) target_compile_definitions(${MPPI_HEADER_LIBRARY_NAME} INTERFACE CMAKE_USE_CUDA_BARRIERS_COST) endif() endif() if (MPPI_BUILD_TESTS OR MPPI_BUILD_EXAMPLES) # find yaml-cpp find_package(yaml-cpp REQUIRED) include_directories(${YAML_CPP_INCLUDE_DIR}) endif() # Install all library header files install( DIRECTORY ${PROJECT_SOURCE_DIR}/include/mppi DESTINATION include PATTERN "*.in" EXCLUDE ) # Install generated header file version.h install( DIRECTORY ${PROJECT_BINARY_DIR}/include/mppi DESTINATION include ) ########################################### # Set up CMake configuration installation # ########################################### set(CMAKE_CONFIG_DEST "lib/cmake/${PROJECT_NAME}") install( TARGETS ${MPPI_HEADER_LIBRARY_NAME} EXPORT ${PROJECT_NAME}-targets ) install( EXPORT ${PROJECT_NAME}-targets NAMESPACE MPPI:: DESTINATION ${CMAKE_CONFIG_DEST} FILE ${PROJECT_NAME}Targets.cmake ) install( EXPORT ${PROJECT_NAME}-targets DESTINATION ${CMAKE_CONFIG_DEST} FILE ${PROJECT_NAME}TargetsNoNamespace.cmake ) # Set up MPPI to export the header library as MPPI in the MPPI namespace set_target_properties (${MPPI_HEADER_LIBRARY_NAME} PROPERTIES EXPORT_NAME MPPI) export (TARGETS ${MPPI_HEADER_LIBRARY_NAME} NAMESPACE MPPI:: FILE ${PROJECT_BINARY_DIR}/${PROJECT_NAME}Targets.cmake ) # Export the package for use from the build-tree 
# (this registers the build-tree with a global CMake-registry ~/.cmake) if (MPPI_EXPORT_PACKAGE) export(PACKAGE ${PROJECT_NAME}) endif() ################################################################################# # Create CMake Config and Version files and install in the appropriate location # ################################################################################# # Create a CMake variable of the appropriate include directory depending on # whether it is consider from the build tree or the install tree set(TMP_MPPI_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/include") set(TMP_MPPI_TARGET_CMAKE_FILE "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Targets.cmake") # Set up build tree config file configure_package_config_file( ${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in ${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake INSTALL_DESTINATION ${PROJECT_BINARY_DIR} PATH_VARS TMP_MPPI_INCLUDE_DIRS TMP_MPPI_TARGET_CMAKE_FILE ) # Set up install tree config file set(TMP_MPPI_INCLUDE_DIRS "include/") set(TMP_MPPI_TARGET_CMAKE_FILE "${CMAKE_CONFIG_DEST}/${PROJECT_NAME}Targets.cmake") configure_package_config_file( ${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in ${PROJECT_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/${PROJECT_NAME}Config.cmake INSTALL_DESTINATION ${CMAKE_CONFIG_DEST} PATH_VARS TMP_MPPI_INCLUDE_DIRS TMP_MPPI_TARGET_CMAKE_FILE ) write_basic_package_version_file( ${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake VERSION ${PROJECT_VERSION} COMPATIBILITY SameMinorVersion ) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/MPPIGenericToolsConfig.cmake ${PROJECT_BINARY_DIR}/MPPIGenericToolsConfig.cmake COPYONLY ) install(FILES ${PROJECT_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/${PROJECT_NAME}Config.cmake ${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake ${CMAKE_CURRENT_SOURCE_DIR}/cmake/MPPIGenericToolsConfig.cmake DESTINATION ${CMAKE_CONFIG_DEST} ) if (MPPI_BUILD_TESTS OR MPPI_BUILD_EXAMPLES) add_subdirectory(src) endif() if (MPPI_BUILD_EXAMPLES) add_subdirectory(examples) 
endif() # Add custom cmake finds set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules") # Use CMake to download gtest as part of the configure step ################################################################### # Add gtest ################################################################### if (NOT DEFINED CMAKE_TOOLCHAIN_FILE AND MPPI_BUILD_TESTS) message(STATUS "Building MPPI Generic tests") enable_testing() ############################################################ # copied from # https://github.com/google/googletest/tree/master/googletest#incorporating-into-an-existing-cmake-project ############################################################ # Download and unpack googletest at configure time if (NOT TARGET gtest_main) list(GET CMAKE_MODULE_PATH -1 MPPI_GENERIC_MODULES) configure_file(${MPPI_GENERIC_MODULES}/CMakeLists.txt.gtest.in ${PROJECT_BINARY_DIR}/googletest-download/CMakeLists.txt) execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . RESULT_VARIABLE result WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/googletest-download ) if(result) message(FATAL_ERROR "CMake step for googletest failed: ${result}") endif() execute_process(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/googletest-download ) if(result) message(FATAL_ERROR "Build step for googletest failed: ${result}") endif() # Prevent overriding the parent project's compiler/linker # settings on Windows set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) # Prevent googletest installation set(INSTALL_GTEST OFF) # Add googletest directly to our build. This defines # the gtest and gtest_main targets. 
add_subdirectory(${PROJECT_BINARY_DIR}/googletest-src ${PROJECT_BINARY_DIR}/googletest-build) endif() include(GoogleTest) add_subdirectory(tests) endif() # Uninstall if(NOT TARGET uninstall) configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/cmake_uninstall.cmake.in" "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" IMMEDIATE @ONLY) add_custom_target(uninstall COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) endif() ================================================ FILE: LICENSE ================================================ BSD 2-Clause License Copyright (c) 2020, Georgia Institute of Technology Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
================================================ FILE: README.md ================================================ # MPPI-Generic MPPI-Generic is a C++/CUDA header-only library implementation of Model Predictive Path Integral Control (MPPI) by [Williams et al.](https://ieeexplore.ieee.org/document/8558663) ## Citation If you use this library for research purposes, please cite the following [paper](https://arxiv.org/abs/2409.07563): ``` @misc{vlahov2024mppi, title={MPPI-Generic: A CUDA Library for Stochastic Trajectory Optimization}, author={Bogdan Vlahov and Jason Gibson and Manan Gandhi and Evangelos A. Theodorou}, year={2024}, eprint={2409.07563}, archivePrefix={arXiv}, primaryClass={cs.MS}, url={https://arxiv.org/abs/2409.07563}, } ``` ## Requirements MPPI-Generic relies on the following: * An NVIDIA GPU * GCC/G++ * CUDA 10 or newer (CUDA 11.7+ is recommended but our library is compatible back to CUDA 10) * [Eigen](https://eigen.tuxfamily.org/index.php?title=Main_Page) * [CMake](https://cmake.org/) 3.10 or newer * git and git-lfs ### Unit tests requirements * [yaml-cpp](https://github.com/jbeder/yaml-cpp) * python-pil ## Prerequisite Setup (Ubuntu) 1. Follow the instructions to install CUDA provided [here](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html). 2. Install all the other prerequisites through `apt-get`: ```bash sudo apt-get install libeigen3-dev git git-lfs cmake gcc # Setup git lfs if it is the first time you have installed it git lfs install # extra installs if you want to build unit tests sudo apt-get install libyaml-cpp-dev python3-pil ``` Note: If using Pop!\_OS you can `sudo apt install system76-cuda` instead of installing CUDA manually. ## Download the repo ```bash cd /path/to/repos git clone https://github.com/ACDSLab/MPPI-Generic.git cd MPPI-Generic git submodule update --init --recursive ``` ## Building MPPI-Generic with tests The default is to build the library with tests OFF.
If you would like to turn on the tests when building, pass the flag `-DMPPI_BUILD_TESTS=ON` when configuring cmake. ```bash mkdir build cd build cmake -DMPPI_BUILD_TESTS=ON .. make make test ``` ## Acknowledgements Approved for Public Release, Distribution Unlimited. ================================================ FILE: VERSION ================================================ 0.9.0 ================================================ FILE: cmake/Config.cmake.in ================================================ # - Config file for the MPPI-Generic package # It defines the following variables # MPPI_INCLUDE_DIRS - include directories for MPPI-Generic # MPPI_INCLUDE_DIR - include directories for MPPI-Generic # MPPI_LIBRARIES - libraries to link against @PACKAGE_INIT@ include(CMakeFindDependencyMacro) set(MPPI_LIBRARIES "MPPI::MPPI") set_and_check(MPPI_INCLUDE_DIRS "@PACKAGE_TMP_MPPI_INCLUDE_DIRS@") set_and_check(MPPI_TARGET_CMAKE_FILE "@PACKAGE_TMP_MPPI_TARGET_CMAKE_FILE@") # Find dependencies of MPPI-Generic find_dependency(cnpy REQUIRED HINTS ${PACKAGE_PREFIX_DIR}) find_dependency(Eigen3 REQUIRED) # Set up cmake targets and autodetection for CUDA architectures include(${MPPI_TARGET_CMAKE_FILE}) include("${CMAKE_CURRENT_LIST_DIR}/MPPIGenericToolsConfig.cmake") set(MPPI_INCLUDE_DIR ${MPPI_INCLUDE_DIRS}) check_required_components(@PROJECT_NAME@) ================================================ FILE: cmake/MPPIGenericToolsConfig.cmake ================================================ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type options: Release, RelWithDebInfo, Debug" FORCE) message(STATUS "Setting Build Type to ${CMAKE_BUILD_TYPE} by default") endif() if(NOT DEFINED CMAKE_CXX_STANDARD) if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13") # Cuda's libcu++ requires C++17 on CUDA 13+ set(CMAKE_CXX_STANDARD 17) else() set(CMAKE_CXX_STANDARD 11) endif() endif() if (NOT DEFINED CMAKE_CUDA_STANDARD) set(CMAKE_CUDA_STANDARD 
${CMAKE_CXX_STANDARD}) endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # Look for ccache to potentially speed up repeated compilations find_program(CCACHE_PROGRAM ccache) if (CCACHE_PROGRAM AND MPPI_USE_CCACHE) message(STATUS "Using ccache to speed up repeated builds") set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) elseif(NOT CCACHE_PROGRAM AND MPPI_USE_CCACHE) message(STATUS "ccache not found. Using ccache can speed up repeated builds.") endif() # Add debug flags so cuda-gdb can be used to stop inside a kernel. # NOTE: You may have to run make multiple times for it to compile successfully. set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G --keep") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "${CMAKE_CUDA_FLAGS_RELWITHDEBINFO} --generate-line-info") # Generate variable for all the extra cuda libraries we use set(MPPI_GENERIC_CUDA_EXTRA_LIBS "") if (CMAKE_VERSION VERSION_LESS 3.24) # required for curand on some systems find_package(CUDA REQUIRED) if(${CUDA_curand_LIBRARY} MATCHES "NOTFOUND") message(ERROR "cuRAND library not found.") endif() list(APPEND MPPI_GENERIC_CUDA_EXTRA_LIBS ${CUDA_curand_LIBRARY} ${CUDA_CUFFT_LIBRARIES}) else() find_package(CUDAToolkit REQUIRED) set(CUDA_VERSION ${CUDAToolkit_VERSION}) list(APPEND MPPI_GENERIC_CUDA_EXTRA_LIBS CUDA::curand CUDA::cufft) endif() # Generate name for MPPI header library set(MPPI_HEADER_LIBRARY_NAME mppi_header_only_lib) set(CUDA_PROPAGATE_HOST_FLAGS OFF) ################################################################# # Autodetect Cuda Architecture on system and add to executables # ################################################################# # CMake 3.24 added '-arch=native' support so until that version, we need to use the old method of autodetection if (CMAKE_VERSION VERSION_LESS 3.24) # Don't rerun autodetection when used
as a submodule if (NOT DEFINED MPPI_ARCH_FLAGS) # More info for autodetection: # https://stackoverflow.com/questions/35485087/determining-which-gencode-compute-arch-values-i-need-for-nvcc-within-cmak CUDA_SELECT_NVCC_ARCH_FLAGS(MPPI_ARCH_FLAGS ${MPPI_CUDA_ARCH_LIST}) if (MPPI_ARCH_FLAGS STREQUAL "") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -maxrregcount=32 -arch=sm_35") else() string(REGEX REPLACE "-gencode;arch" "-gencode=arch" MPPI_ARCH_FLAGS "${MPPI_ARCH_FLAGS}") string(REPLACE ";" " " MPPI_ARCH_FLAGS "${MPPI_ARCH_FLAGS}") # string(REGEX REPLACE "^-gencode=arch" "-arch" MPPI_ARCH_FLAGS "${MPPI_ARCH_FLAGS}") message(STATUS "CUDA Architecture(s): ${MPPI_ARCH_FLAGS}") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${MPPI_ARCH_FLAGS}") endif() else() message(STATUS "Autodetection already ran and found ${MPPI_ARCH_FLAGS}.") endif() else() # Don't rerun autodetection when used as a submodule if (NOT DEFINED MPPI_ARCH_FLAGS) if ("${MPPI_CUDA_ARCH_LIST}" STREQUAL "") set(CMAKE_CUDA_ARCHITECTURES native) else() set(CMAKE_CUDA_ARCHITECTURES "${MPPI_CUDA_ARCH_LIST}") endif() set(MPPI_ARCH_FLAGS "${CMAKE_CUDA_ARCHITECTURES}") message(STATUS "CUDA Architecture(s): ${CMAKE_CUDA_ARCHITECTURES}") else() message(STATUS "Autodetection already ran and found ${MPPI_ARCH_FLAGS}.") endif() endif() ================================================ FILE: cmake/Modules/CMakeLists.txt.gtest.in ================================================ cmake_minimum_required(VERSION 2.8.2) project(googletest-download NONE) include(ExternalProject) ExternalProject_Add(googletest GIT_REPOSITORY https://github.com/google/googletest.git GIT_TAG release-1.12.1 SOURCE_DIR "${PROJECT_BINARY_DIR}/googletest-src" BINARY_DIR "${PROJECT_BINARY_DIR}/googletest-build" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" TEST_COMMAND "" ) ================================================ FILE: cmake/Modules/cmake_uninstall.cmake.in ================================================ # 
============================================================================ # Copyright (c) 2011-2012 University of Pennsylvania # Copyright (c) 2013-2014 Carnegie Mellon University # Copyright (c) 2013-2016 Andreas Schuh # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # See COPYING file for license information or visit # https://cmake-basis.github.io/download.html#license # ============================================================================ ############################################################################## # @file cmake_uninstall.cmake # @brief Uninstallation script based on install_manifest*.txt files. 
# # @ingroup CMakeTools ############################################################################## cmake_minimum_required (VERSION 2.8.12 FATAL_ERROR) # ---------------------------------------------------------------------------- # set the install prefix if (NOT DEFINED CMAKE_INSTALL_PREFIX) set (CMAKE_INSTALL_PREFIX "@CMAKE_INSTALL_PREFIX@") endif () # ---------------------------------------------------------------------------- # set the install configuration name if (NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) if (BUILD_TYPE) string (REGEX REPLACE "^[^A-Za-z0-9_]+" "" CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}") else () set (CMAKE_INSTALL_CONFIG_NAME "@CMAKE_BUILD_TYPE@") endif () message (STATUS "Uninstall configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"") endif () # ---------------------------------------------------------------------------- # set the component getting uninstalled if (NOT CMAKE_INSTALL_COMPONENT) if (COMPONENT) message (STATUS "Uninstall component: \"${COMPONENT}\"") set (CMAKE_INSTALL_COMPONENT "${COMPONENT}") else () set (CMAKE_INSTALL_COMPONENT) endif () endif () # ---------------------------------------------------------------------------- # read manifest file if (MANIFEST_FILE) if (NOT EXISTS "${MANIFEST_FILE}") message (FATAL_ERROR "Manifest file ${MANIFEST_FILE} does not exist!") endif () set (MANIFEST_FILES "${MANIFEST_FILE}") else () file (GLOB MANIFEST_FILES "${CMAKE_CURRENT_LIST_DIR}/@PROJECT_PACKAGE_CONFIG_PREFIX@*InstallManifest.txt") if (NOT MANIFEST_FILES) if (CMAKE_INSTALL_COMPONENT) set (MANIFEST_FILE "${CMAKE_CURRENT_LIST_DIR}/install_manifest_${CMAKE_INSTALL_COMPONENT}.txt") else () set (MANIFEST_FILE "${CMAKE_CURRENT_LIST_DIR}/install_manifest.txt") endif () if (NOT EXISTS "${MANIFEST_FILE}") message ("No manifest file found.") return () endif () set (MANIFEST_FILES "${MANIFEST_FILE}") endif () endif () set (MANIFEST) foreach (MANIFEST_FILE IN LISTS MANIFEST_FILES) file (READ "${MANIFEST_FILE}" _MANIFEST) string (REGEX REPLACE 
"\n" ";" _MANIFEST "${_MANIFEST}") list (REVERSE _MANIFEST) list (APPEND MANIFEST "${_MANIFEST}") endforeach () # ---------------------------------------------------------------------------- # remove package from CMake package registry set (REGISTERED "@BASIS_REGISTER@") if (WIN32 AND REGISTERED) set (PKGUID "@TOPLEVEL_PROJECT_PACKAGE_UID@") execute_process ( COMMAND reg delete "HKCU\\Software\\Kitware\\CMake\\Packages\\@PROJECT_PACKAGE_CONFIG_PREFIX@" /v "${PKGUID}" /f RESULT_VARIABLE RT ERROR_VARIABLE ERR ) if (RT EQUAL 0) message (STATUS "Deregister: Removed HKCU\\Software\\Kitware\\CMake\\Packages\\@PROJECT_PACKAGE_CONFIG_PREFIX@\\${PKGUID}") else () string (STRIP "${ERR}" ERR) message (STATUS "Deregister: Failed to remove package from registry: ${ERR}") endif () endif () # ---------------------------------------------------------------------------- # remove installed files foreach (F ${MANIFEST}) # skip empty entries, i.e., blank lines set (F "$ENV{DESTDIR}${F}") # support change of root if (EXISTS "${F}") set (FILE_IN_USE FALSE) if (NOT FILE_IN_USE) message (STATUS "Uninstalling: ${F}") execute_process (COMMAND "${CMAKE_COMMAND}" -E remove -f "${F}" RESULT_VARIABLE RT) if (NOT RT EQUAL 0) set (OK FALSE) message (STATUS "Failed to uninstall ${F}") endif () # remove .pyc files of .py files if (F MATCHES "\\.py$" AND EXISTS "${F}c") message (STATUS "Uninstalling: ${F}c") execute_process (COMMAND "${CMAKE_COMMAND}" -E remove -f "${F}c" RESULT_VARIABLE RT) if (NOT RT EQUAL 0) message (STATUS "Failed to uninstall ${F}c") endif () endif () else () message (STATUS "File-in-use: ${F}") endif () else () message (STATUS "Non-existent: ${F}") endif () endforeach () foreach (MANIFEST_FILE IN LISTS MANIFEST_FILES) if (EXISTS "${MANIFEST_FILE}") execute_process (COMMAND "${CMAKE_COMMAND}" -E remove -f "${MANIFEST_FILE}") endif () endforeach () # ---------------------------------------------------------------------------- # remove empty directories list (APPEND EXCLUDE_DIRS 
"/" "/usr" "/usr/local" "/opt" "/opt/local" "$ENV{HOME}" "$ENV{HOME}/local" # these should anyway never be used as installation prefix without subdirectory "/bin" "/boot" "/dev" "/etc" "/home" "/lib" "/lib32" "/lib64" "/media" "/mnt" "/root" "/proc" "/sys" "/var" "/tmp" "/lost+found" "/cdrom" ) if (WIN32) get_filename_component (PROGRAM_FILES_DIR "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion;ProgramFilesDir]" ABSOLUTE) if (NOT PROGRAM_FILES_DIR OR PROGRAM_FILES_DIR MATCHES "/registry") set (PROGRAM_FILES_DIR "C:/Program Files") endif () list (APPEND EXCLUDE_DIRS "${PROGRAM_FILES_DIR}") string (REPLACE "/" "\\" PROGRAM_FILES_DIR "${PROGRAM_FILES_DIR}") list (APPEND EXCLUDE_DIRS "${PROGRAM_FILES_DIR}") endif () # stop removing directories at root installation directory # the subdirectory will be still removed if it is not in the # list of excluded system directories get_filename_component (D "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}" PATH) list (APPEND EXCLUDE_DIRS "${D}") string (REPLACE "." "\\." CMAKE_INSTALL_PREFIX_RE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}") string (REPLACE "." "\\." 
CMAKE_REGISTRY_PREFIX_RE "$ENV{HOME}/.cmake") foreach (F ${MANIFEST}) # skip empty entries, i.e., blank lines # remove directories only if file was installed inside the installation root # or the CMake package registration on Unix if (F MATCHES "^${CMAKE_INSTALL_PREFIX_RE}" OR (UNIX AND F MATCHES "^${CMAKE_REGISTRY_PREFIX_RE}")) get_filename_component (D "$ENV{DESTDIR}${F}" PATH) while (D) # skip directory if we removed it already if (NOT EXISTS "${D}" OR NOT IS_DIRECTORY "${D}") if ("${D}" STREQUAL "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}") return () # we are done, the installation root has been removed endif () break () endif () # skip directory if it is in list of excluded directories list (FIND EXCLUDE_DIRS "${D}" IDX) if (NOT IDX EQUAL -1) break () endif () # glob files in directory to make sure it is empty file (GLOB FILES "${D}/*") if (NOT FILES) # remove directory message (STATUS "Uninstalling: ${D}") execute_process (COMMAND "${CMAKE_COMMAND}" -E remove_directory "${D}" RESULT_VARIABLE RT) if (NOT RT EQUAL 0) set (OK FALSE) message (STATUS "Failed to remove ${D}") endif () endif () if ("${D}" STREQUAL "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}") # we reached the root installation direcory break () endif () # procede with parent directory get_filename_component (D "${D}" PATH) endwhile () endif () endforeach () ================================================ FILE: doc/feedback.md ================================================ # Overview The overall structure for the Feedback Controller is a bit confusing but I will try to lay out the basic idea here. There are two halves of the Feedback Controller, the GPU portion and the CPU portion. These are separated into different classes as we need to do different things on the different devices. The GPU portion just needs to provide feedback control while the CPU portion needs to do that and have the ability to update the feedback controller itself. 
# Steps to creating a new FeedbackController

## Create a new FeedbackParams struct

This should be where the necessary data for the CPU portion of the controller resides. This does not get copied over to the GPU so the struct can have Eigen matrices in it. For example, this struct has the Q, Q_f, and R matrices for DDP and could have the desired poles for a PID controller.

## Create a new FeedbackState struct

This is the data structure that will contain everything necessary to actually compute feedback on the GPU and CPU. This struct does get copied to the GPU so try to use float arrays of known sizes to prevent a need to rewrite the copy to GPU code. It should also inherit from the GPUState struct in `feedback.cuh` as that has a necessary variable SHARED_MEM_SIZE that is used to allocate shared memory on the GPU for the feedback controller to use. This space would be needed for keeping track of PID integrated errors for each sample of MPPI as an example. For DDP, this is where the trajectory of feedback gains is stored, and for a PID controller, this would be where you put your P, I, and D gains.

## Create a GPUFeedbackController Class

This should inherit from the GPUFeedbackController template in `feedback.cuh` as that provides a lot of methods already. You at minimum need to write the `void k(x_act, x_goal, t, theta, control_output)` method to return the feedback control (`control_output`) you would expect given the current state (`x_act`), the goal state (`x_goal`), the timestep you are on (`t`), and some potential scratch space (`theta`). For DDP, this ends up being essentially `feedback_gain_traj[t] * (x_act - x_goal)` (no use for `theta`). This class also has methods you can override for copyToDevice() and copyFromDevice() if you have some fancy GPUFeedback or want to get diagnostic information back from the GPU.
## Create a FeedbackController Class

This should inherit from the FeedbackController template in `feedback.cuh` which requires the GPUFeedback Class and the FeedbackParams struct you made earlier. There are 3 methods at minimum you need to override for this class.

1. `void initTrackingController()` is where you can do some setup that might not be done in the constructor (DDP uses this to set up the DDP optimizer using the current params)
2. `void computeFeedback(init_state, goal_state_traj, control_traj)` is where you can write how to update your feedback controller. For DDP, this is where the optimization occurs and the new feedback gains trajectory is put into the DDPFeedbackState struct.
3. `control_array k_(x_act, x_goal, t, fb_state)` is the CPU version of calculating the feedback control. It has the FeedbackState passed in as there are times that the feedback control needs to be calculated from a different place than the internal FeedbackState (the internal state might be in the middle of updating for example). The resulting control needs to be the same as the GPU Controller output given the same FeedbackState.

# Conclusion

After creating all of the components, you can then use this controller class as the feedback controller template argument in MPPI, Tube, RMPPI, and it "should" plug in without any problems. The DDP implementation is the only example at the moment so use that as a guiding resource if you get lost.
================================================ FILE: examples/CMakeLists.txt ================================================ # add_executable(cartpole_example cartpole_example.cpp) # target_include_directories(cartpole_example PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # target_link_libraries(cartpole_example # cnpy # cartpole_mppi) file(GLOB CUDA_TARGET_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cu) foreach(T_FILE IN LISTS CUDA_TARGET_SRCS) # Get filename without extension to use as target name get_filename_component(T_NAME ${T_FILE} NAME_WE) add_executable(${T_NAME} ${T_FILE}) target_include_directories(${T_NAME} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) target_link_libraries(${T_NAME} ${MPPI_HEADER_LIBRARY_NAME} ) # set_target_properties(${T_NAME} PROPERTIES FOLDER test) endforeach() ================================================ FILE: examples/cartpole_example.cu ================================================ #include #include #include using SAMPLER_T = mppi::sampling_distributions::GaussianDistribution; int main(int argc, char** argv) { auto model = new CartpoleDynamics(1.0, 1.0, 1.0); auto cost = new CartpoleQuadraticCost; model->control_rngs_->x = -5; model->control_rngs_->y = 5; CartpoleQuadraticCostParams new_params; new_params.cart_position_coeff = 50; new_params.pole_angle_coeff = 200; new_params.cart_velocity_coeff = 10; new_params.pole_angular_velocity_coeff = 1; new_params.control_cost_coeff[0] = 0; new_params.terminal_cost_coeff = 0; new_params.desired_terminal_state[0] = 20; new_params.desired_terminal_state[1] = 0; new_params.desired_terminal_state[2] = M_PI; new_params.desired_terminal_state[3] = 0; cost->setParams(new_params); float dt = 0.02; int max_iter = 1; float lambda = 0.25; float alpha = 0.0; const int num_timesteps = 100; // Set up Gaussian Distribution auto sampler_params = SAMPLER_T::SAMPLING_PARAMS_T(); for (int i = 0; i < CartpoleDynamics::CONTROL_DIM; i++) { sampler_params.std_dev[i] = 5.0; } auto sampler = 
new SAMPLER_T(sampler_params); // Feedback Controller auto fb_controller = new DDPFeedback(model, dt); auto CartpoleController = new VanillaMPPIController, num_timesteps, 2048>(model, cost, fb_controller, sampler, dt, max_iter, lambda, alpha); auto controller_params = CartpoleController->getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 4, 1); controller_params.cost_rollout_dim_ = dim3(64, 4, 1); CartpoleController->setParams(controller_params); CartpoleDynamics::state_array current_state = model->getZeroState(); CartpoleDynamics::state_array next_state = model->getZeroState(); CartpoleDynamics::output_array output = CartpoleDynamics::output_array::Zero(); int time_horizon = 5000; CartpoleDynamics::state_array xdot = model->getZeroState(); auto time_start = std::chrono::system_clock::now(); for (int i = 0; i < time_horizon; ++i) { // Compute the control CartpoleController->computeControl(current_state, 1); // Increment the state CartpoleDynamics::control_array control; control = CartpoleController->getControlSeq().block(0, 0, CartpoleDynamics::CONTROL_DIM, 1); model->enforceConstraints(current_state, control); model->step(current_state, next_state, xdot, control, output, i, dt); current_state = next_state; if (i % 50 == 0) { printf("Current Time: %f ", i * dt); printf("Current Baseline Cost: %f ", CartpoleController->getBaselineCost()); model->printState(current_state.data()); // std::cout << control << std::endl; } // Slide the controls down before calling the optimizer again CartpoleController->slideControlSequence(1); } auto time_end = std::chrono::system_clock::now(); auto diff = std::chrono::duration(time_end - time_start); printf("The elapsed time is: %f milliseconds\n", diff.count()); // std::cout << "The current control at timestep " << i << " is: " << CartpoleController.get_control_seq()[i] << // std::endl; // cost->freeCudaMem(); delete (CartpoleController); delete (cost); delete (model); delete (fb_controller); delete sampler; return 0; } 
================================================ FILE: examples/double_integrator_CORL2020.cu ================================================ #include #include #include #include #include #include #include #include #include // Used to generate random noise for control trajectories bool tubeFailure(float* s) { float inner_path_radius2 = 1.675 * 1.675; float outer_path_radius2 = 2.325 * 2.325; float radial_position = s[0] * s[0] + s[1] * s[1]; if ((radial_position < inner_path_radius2) || (radial_position > outer_path_radius2)) { return true; } else { return false; } } using Dyn = DoubleIntegratorDynamics; using SCost = DoubleIntegratorCircleCost; using RCost = DoubleIntegratorRobustCost; const int num_timesteps = 50; // Optimization time horizon const int total_time_horizon = 5000; using Feedback = DDPFeedback; using Sampler = mppi::sampling_distributions::GaussianDistribution; // Problem setup const float dt = 0.02; // Timestep of dynamics propagation const int max_iter = 1; // Maximum running iterations of optimization const float lambda = 2; // Learning rate parameter const float alpha = 0.0; typedef Eigen::Matrix state_trajectory; void saveTraj(const Eigen::Ref& traj, int t, std::vector& vec) { for (int i = 0; i < num_timesteps; i++) { for (int j = 0; j < Dyn::STATE_DIM; j++) { vec[t * num_timesteps * Dyn::STATE_DIM + i * Dyn::STATE_DIM + j] = traj(j, i); } } } void saveState(const Eigen::Ref& state, int t, std::vector& vec) { for (int j = 0; j < Dyn::STATE_DIM; j++) { vec[t * Dyn::STATE_DIM + j] = state(j); } } void runVanilla(const Eigen::Ref>& noise) { // Set the initial state Dyn::state_array x; x << 2, 0, 0, 1; Dyn::state_array xdot; // control variance Sampler::SAMPLING_PARAMS_T sampler_params; for (int i = 0; i < Dyn::CONTROL_DIM; i++) { sampler_params.std_dev[i] = 1; } // Save actual trajectories, nominal_trajectory, free energy std::vector van_trajectory(Dyn::STATE_DIM * total_time_horizon, 0); std::vector van_nominal_traj(Dyn::STATE_DIM * num_timesteps 
* total_time_horizon, 0); std::vector van_free_energy(total_time_horizon, 0); // Initialize the controllers Dyn model; SCost cost; Sampler sampler(sampler_params); // DDP cost parameters Feedback fb_controller(&model, dt); auto fb_params = fb_controller.getParams(); fb_params.Q.diagonal() << 500, 500, 100, 100; fb_controller.setParams(fb_params); auto controller = VanillaMPPIController( &model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); auto controller_params = controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 1, 1); controller_params.cost_rollout_dim_ = dim3(64, 1, 1); controller.setParams(controller_params); controller.initFeedback(); // Start the loop for (int t = 0; t < total_time_horizon; ++t) { /********************** Vanilla **********************/ // Compute the control controller.computeControl(x, 1); // Compute the feedback gains controller.computeFeedback(x); // Propagate the feedback trajectory controller.computeFeedbackPropagatedStateSeq(); auto nominal_trajectory = controller.getTargetStateSeq(); auto nominal_control = controller.getControlSeq(); auto fe_stat = controller.getFreeEnergyStatistics(); // Save everything saveState(x, t, van_trajectory); saveTraj(nominal_trajectory, t, van_nominal_traj); van_free_energy[t] = fe_stat.real_sys.freeEnergyMean; // Get the open loop control DoubleIntegratorDynamics::control_array current_control = nominal_control.col(0); // Apply the feedback given the current state Dyn::control_array fb_control = controller.getFeedbackControl(x, nominal_trajectory.col(0), 0); current_control += fb_control; // Propagate the state forward model.computeDynamics(x, current_control, xdot); model.updateState(x, xdot, dt); // Add disturbance x += noise.col(t) * sqrt(model.getParams().system_noise) * dt; // Slide the control sequence controller.slideControlSequence(1); } /************* Save CNPY *********************/ cnpy::npy_save("vanilla_state_trajectory.npy", van_trajectory.data(), { 
total_time_horizon, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("vanilla_nominal_trajectory.npy", van_nominal_traj.data(), { total_time_horizon, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("vanilla_free_energy.npy", van_free_energy.data(), { total_time_horizon }, "w"); } void runVanillaLarge(const Eigen::Ref>& noise) { // Set the initial state DoubleIntegratorDynamics::state_array x; x << 2, 0, 0, 1; DoubleIntegratorDynamics::state_array xdot; // control variance Sampler::SAMPLING_PARAMS_T sampler_params; for (int i = 0; i < Dyn::CONTROL_DIM; i++) { sampler_params.std_dev[i] = 1; } // Save actual trajectories, nominal_trajectory, free energy std::vector van_large_trajectory(Dyn::STATE_DIM * total_time_horizon, 0); std::vector van_large_nominal_traj(Dyn::STATE_DIM * num_timesteps * total_time_horizon, 0); std::vector van_large_free_energy(total_time_horizon, 0); // Initialize the controllers Dyn model(100); SCost cost; Sampler sampler(sampler_params); // DDP cost parameters Feedback fb_controller(&model, dt); auto fb_params = fb_controller.getParams(); fb_params.Q.diagonal() << 500, 500, 100, 100; fb_controller.setParams(fb_params); auto controller = VanillaMPPIController( &model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); auto controller_params = controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 1, 1); controller_params.cost_rollout_dim_ = dim3(64, 1, 1); controller.setParams(controller_params); controller.initFeedback(); // Start the loop for (int t = 0; t < total_time_horizon; ++t) { /********************** Vanilla Large **********************/ // Compute the control controller.computeControl(x, 1); // Compute the feedback gains controller.computeFeedback(x); // Propagate the feedback trajectory controller.computeFeedbackPropagatedStateSeq(); auto nominal_trajectory = controller.getTargetStateSeq(); auto nominal_control = controller.getControlSeq(); auto fe_stat = 
controller.getFreeEnergyStatistics(); // Save everything saveState(x, t, van_large_trajectory); saveTraj(nominal_trajectory, t, van_large_nominal_traj); van_large_free_energy[t] = fe_stat.real_sys.freeEnergyMean; // Get the open loop control DoubleIntegratorDynamics::control_array current_control = nominal_control.col(0); // Apply the feedback given the current state Dyn::control_array fb_control = controller.getFeedbackControl(x, nominal_trajectory.col(0), 0); current_control += fb_control; // Propagate the state forward model.computeDynamics(x, current_control, xdot); model.updateState(x, xdot, dt); // Add disturbance x += noise.col(t) * sqrt(model.getParams().system_noise) * dt; // Slide the control sequence controller.slideControlSequence(1); } /************* Save CNPY *********************/ cnpy::npy_save("vanilla_large_state_trajectory.npy", van_large_trajectory.data(), { total_time_horizon, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("vanilla_large_nominal_trajectory.npy", van_large_nominal_traj.data(), { total_time_horizon, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("vanilla_large_free_energy.npy", van_large_free_energy.data(), { total_time_horizon }, "w"); } void runVanillaLargeRC(const Eigen::Ref>& noise) { // Set the initial state DoubleIntegratorDynamics::state_array x; x << 2, 0, 0, 1; DoubleIntegratorDynamics::state_array xdot; // control variance Sampler::SAMPLING_PARAMS_T sampler_params; for (int i = 0; i < Dyn::CONTROL_DIM; i++) { sampler_params.std_dev[i] = 1; } // Save actual trajectories, nominal_trajectory, free energy std::vector van_large_trajectory(Dyn::STATE_DIM * total_time_horizon, 0); std::vector van_large_nominal_traj(Dyn::STATE_DIM * num_timesteps * total_time_horizon, 0); std::vector van_large_free_energy(total_time_horizon, 0); // Initialize the controllers Dyn model(100); RCost cost; Sampler sampler(sampler_params); auto params = cost.getParams(); params.crash_cost = 100; 
cost.setParams(params); // DDP cost parameters Feedback fb_controller(&model, dt); auto fb_params = fb_controller.getParams(); fb_params.Q.diagonal() << 500, 500, 100, 100; fb_controller.setParams(fb_params); auto controller = VanillaMPPIController( &model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); auto controller_params = controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 1, 1); controller_params.cost_rollout_dim_ = dim3(64, 1, 1); controller.setParams(controller_params); controller.initFeedback(); // Start the loop for (int t = 0; t < total_time_horizon; ++t) { /********************** Vanilla Large **********************/ // Compute the control controller.computeControl(x, 1); // Compute the feedback gains controller.computeFeedback(x); // Propagate the feedback trajectory controller.computeFeedbackPropagatedStateSeq(); auto nominal_trajectory = controller.getTargetStateSeq(); auto nominal_control = controller.getControlSeq(); auto fe_stat = controller.getFreeEnergyStatistics(); // Save everything saveState(x, t, van_large_trajectory); saveTraj(nominal_trajectory, t, van_large_nominal_traj); van_large_free_energy[t] = fe_stat.real_sys.freeEnergyMean; // Get the open loop control DoubleIntegratorDynamics::control_array current_control = nominal_control.col(0); // Apply the feedback given the current state Dyn::control_array fb_control = controller.getFeedbackControl(x, nominal_trajectory.col(0), 0); current_control += fb_control; // Propagate the state forward model.computeDynamics(x, current_control, xdot); model.updateState(x, xdot, dt); // Add disturbance x += noise.col(t) * sqrt(model.getParams().system_noise) * dt; // Slide the control sequence controller.slideControlSequence(1); } /************* Save CNPY *********************/ cnpy::npy_save("vanilla_large_robust_state_trajectory.npy", van_large_trajectory.data(), { total_time_horizon, DoubleIntegratorDynamics::STATE_DIM }, "w"); 
cnpy::npy_save("vanilla_large_robust_nominal_trajectory.npy", van_large_nominal_traj.data(), { total_time_horizon, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("vanilla_large_robust_free_energy.npy", van_large_free_energy.data(), { total_time_horizon }, "w"); } void runTube(const Eigen::Ref>& noise) { // Set the initial state DoubleIntegratorDynamics::state_array x; x << 2, 0, 0, 1; DoubleIntegratorDynamics::state_array xdot; // control variance Sampler::SAMPLING_PARAMS_T sampler_params; for (int i = 0; i < Dyn::CONTROL_DIM; i++) { sampler_params.std_dev[i] = 1; } // Save actual trajectories, nominal_trajectory, free energy std::vector tube_trajectory(Dyn::STATE_DIM * total_time_horizon, 0); std::vector tube_nominal_traj(Dyn::STATE_DIM * num_timesteps * total_time_horizon, 0); std::vector tube_nominal_free_energy(total_time_horizon, 0); std::vector tube_real_free_energy(total_time_horizon, 0); std::vector tube_nominal_state_used(total_time_horizon, 0); // Initialize the controllers Dyn model(100); SCost cost; Sampler sampler(sampler_params); // DDP cost parameters Feedback fb_controller(&model, dt); auto fb_params = fb_controller.getParams(); fb_params.Q.diagonal() << 500, 500, 100, 100; fb_controller.setParams(fb_params); auto controller = TubeMPPIController( &model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); auto controller_params = controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 1, 1); controller_params.cost_rollout_dim_ = dim3(64, 1, 1); controller.setParams(controller_params); controller.setNominalThreshold(20); // Start the loop for (int t = 0; t < total_time_horizon; ++t) { /********************** Tube **********************/ // Compute the control controller.computeControl(x, 1); // Compute the feedback gains controller.computeFeedback(x); // Propagate the feedback trajectory controller.computeFeedbackPropagatedStateSeq(); auto nominal_trajectory = controller.getTargetStateSeq(); 
auto nominal_control = controller.getControlSeq(); auto fe_stat = controller.getFreeEnergyStatistics(); // Save everything saveState(x, t, tube_trajectory); saveTraj(nominal_trajectory, t, tube_nominal_traj); tube_nominal_free_energy[t] = fe_stat.nominal_sys.freeEnergyMean; tube_real_free_energy[t] = fe_stat.real_sys.freeEnergyMean; tube_nominal_state_used[t] = fe_stat.nominal_state_used; // Get the open loop control DoubleIntegratorDynamics::control_array current_control = nominal_control.col(0); // Apply the feedback given the current state Dyn::control_array fb_control = controller.getFeedbackControl(x, nominal_trajectory.col(0), 0); current_control += fb_control; // Propagate the state forward model.computeDynamics(x, current_control, xdot); model.updateState(x, xdot, dt); controller.updateNominalState(current_control); // Add disturbance x += noise.col(t) * sqrt(model.getParams().system_noise) * dt; // Slide the control sequence controller.slideControlSequence(1); } /************* Save CNPY *********************/ cnpy::npy_save("tube_state_trajectory.npy", tube_trajectory.data(), { total_time_horizon, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("tube_nominal_trajectory.npy", tube_nominal_traj.data(), { total_time_horizon, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("tube_nominal_free_energy.npy", tube_nominal_free_energy.data(), { total_time_horizon }, "w"); cnpy::npy_save("tube_real_free_energy.npy", tube_real_free_energy.data(), { total_time_horizon }, "w"); cnpy::npy_save("tube_nominal_state_used.npy", tube_nominal_state_used.data(), { total_time_horizon }, "w"); } void runTubeRC(const Eigen::Ref>& noise) { // Set the initial state DoubleIntegratorDynamics::state_array x; x << 2, 0, 0, 1; DoubleIntegratorDynamics::state_array xdot; // control variance Sampler::SAMPLING_PARAMS_T sampler_params; for (int i = 0; i < Dyn::CONTROL_DIM; i++) { sampler_params.std_dev[i] = 1; } // Save actual trajectories, 
nominal_trajectory, free energy std::vector tube_trajectory(Dyn::STATE_DIM * total_time_horizon, 0); std::vector tube_nominal_traj(Dyn::STATE_DIM * num_timesteps * total_time_horizon, 0); std::vector tube_nominal_free_energy(total_time_horizon, 0); std::vector tube_real_free_energy(total_time_horizon, 0); std::vector tube_nominal_state_used(total_time_horizon, 0); // Initialize the controllers Dyn model(100); RCost cost; Sampler sampler(sampler_params); auto params = cost.getParams(); params.crash_cost = 100; cost.setParams(params); // DDP cost parameters Feedback fb_controller(&model, dt); auto fb_params = fb_controller.getParams(); fb_params.Q.diagonal() << 500, 500, 100, 100; fb_controller.setParams(fb_params); auto controller = TubeMPPIController( &model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); auto controller_params = controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 1, 1); controller_params.cost_rollout_dim_ = dim3(64, 1, 1); controller.setParams(controller_params); controller.setNominalThreshold(2); // Start the loop for (int t = 0; t < total_time_horizon; ++t) { /********************** Tube **********************/ // Compute the control controller.computeControl(x, 1); // Compute the feedback gains controller.computeFeedback(x); // Propagate the feedback trajectory controller.computeFeedbackPropagatedStateSeq(); auto nominal_trajectory = controller.getTargetStateSeq(); auto nominal_control = controller.getControlSeq(); auto fe_stat = controller.getFreeEnergyStatistics(); // Save everything saveState(x, t, tube_trajectory); saveTraj(nominal_trajectory, t, tube_nominal_traj); tube_nominal_free_energy[t] = fe_stat.nominal_sys.freeEnergyMean; tube_real_free_energy[t] = fe_stat.real_sys.freeEnergyMean; tube_nominal_state_used[t] = fe_stat.nominal_state_used; // Get the open loop control DoubleIntegratorDynamics::control_array current_control = nominal_control.col(0); // Apply the feedback given the current state 
Dyn::control_array fb_control = controller.getFeedbackControl(x, nominal_trajectory.col(0), 0); current_control += fb_control; // Propagate the state forward model.computeDynamics(x, current_control, xdot); model.updateState(x, xdot, dt); controller.updateNominalState(current_control); // Add disturbance x += noise.col(t) * sqrt(model.getParams().system_noise) * dt; // Slide the control sequence controller.slideControlSequence(1); } /************* Save CNPY *********************/ cnpy::npy_save("tube_robust_state_trajectory.npy", tube_trajectory.data(), { total_time_horizon, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("tube_robust_nominal_trajectory.npy", tube_nominal_traj.data(), { total_time_horizon, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("tube_robust_nominal_free_energy.npy", tube_nominal_free_energy.data(), { total_time_horizon }, "w"); cnpy::npy_save("tube_robust_real_free_energy.npy", tube_real_free_energy.data(), { total_time_horizon }, "w"); cnpy::npy_save("tube_robust_nominal_state_used.npy", tube_nominal_state_used.data(), { total_time_horizon }, "w"); } void runRobustSc(const Eigen::Ref>& noise) { // Set the initial state DoubleIntegratorDynamics::state_array x; x << 2, 0, 0, 1; DoubleIntegratorDynamics::state_array xdot; // control variance Sampler::SAMPLING_PARAMS_T sampler_params; for (int i = 0; i < Dyn::CONTROL_DIM; i++) { sampler_params.std_dev[i] = 1; } // Save actual trajectories, nominal_trajectory, free energy std::vector robust_sc_trajectory(Dyn::STATE_DIM * total_time_horizon, 0); std::vector robust_sc_nominal_traj(Dyn::STATE_DIM * num_timesteps * total_time_horizon, 0); std::vector robust_sc_nominal_free_energy(total_time_horizon, 0); std::vector robust_sc_real_free_energy(total_time_horizon, 0); std::vector robust_sc_nominal_free_energy_bound(total_time_horizon, 0); std::vector robust_sc_real_free_energy_bound(total_time_horizon, 0); std::vector 
robust_sc_real_free_energy_growth_bound(total_time_horizon, 0); std::vector robust_sc_nominal_state_used(total_time_horizon, 0); // Initialize the controllers Dyn model(100); SCost cost; Sampler sampler(sampler_params); // DDP cost parameters Feedback fb_controller(&model, dt); auto fb_params = fb_controller.getParams(); fb_params.Q.diagonal() << 500, 500, 100, 100; fb_controller.setParams(fb_params); // Value function threshold float value_function_threshold = 20.0; auto controller = RobustMPPIController( &model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha, value_function_threshold); auto controller_params = controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 1, 1); controller_params.cost_rollout_dim_ = dim3(64, 1, 1); controller.setParams(controller_params); // Start the loop for (int t = 0; t < total_time_horizon; ++t) { /********************** Vanilla **********************/ // Compute the control controller.updateImportanceSamplingControl(x, 1); controller.computeControl(x, 1); // Compute the feedback gains controller.computeFeedback(x); // Propagate the feedback trajectory controller.computeFeedbackPropagatedStateSeq(); auto nominal_trajectory = controller.getTargetStateSeq(); auto nominal_control = controller.getControlSeq(); auto fe_stat = controller.getFreeEnergyStatistics(); // Save everything saveState(x, t, robust_sc_trajectory); saveTraj(nominal_trajectory, t, robust_sc_nominal_traj); robust_sc_nominal_free_energy[t] = fe_stat.nominal_sys.freeEnergyMean; robust_sc_real_free_energy[t] = fe_stat.real_sys.freeEnergyMean; robust_sc_nominal_free_energy_bound[t] = value_function_threshold + 2 * fe_stat.nominal_sys.freeEnergyModifiedVariance; robust_sc_real_free_energy_bound[t] = 0; robust_sc_real_free_energy_growth_bound[t] = 0; robust_sc_nominal_state_used[t] = fe_stat.nominal_state_used; // Get the open loop control DoubleIntegratorDynamics::control_array current_control = nominal_control.col(0); // Apply the 
feedback given the current state Dyn::control_array fb_control = controller.getFeedbackControl(x, nominal_trajectory.col(0), 0); current_control += fb_control; // Propagate the state forward model.computeDynamics(x, current_control, xdot); model.updateState(x, xdot, dt); // Add disturbance x += noise.col(t) * sqrt(model.getParams().system_noise) * dt; // Slide the control sequence controller.slideControlSequence(1); } /************* Save CNPY *********************/ cnpy::npy_save("robust_sc_state_trajectory.npy", robust_sc_trajectory.data(), { total_time_horizon, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("robust_sc_nominal_trajectory.npy", robust_sc_nominal_traj.data(), { total_time_horizon, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("robust_sc_nominal_free_energy.npy", robust_sc_nominal_free_energy.data(), { total_time_horizon }, "w"); cnpy::npy_save("robust_sc_real_free_energy.npy", robust_sc_real_free_energy.data(), { total_time_horizon }, "w"); cnpy::npy_save("robust_sc_nominal_state_used.npy", robust_sc_nominal_state_used.data(), { total_time_horizon }, "w"); cnpy::npy_save("robust_sc_real_free_energy_bound.npy", robust_sc_nominal_free_energy_bound.data(), { total_time_horizon }, "w"); cnpy::npy_save("robust_sc_nominal_free_energy_bound.npy", robust_sc_real_free_energy_bound.data(), { total_time_horizon }, "w"); cnpy::npy_save("robust_sc_real_free_energy_growth_bound.npy", robust_sc_real_free_energy_growth_bound.data(), { total_time_horizon }, "w"); } void runRobustRc(const Eigen::Ref>& noise) { // Set the initial state DoubleIntegratorDynamics::state_array x; x << 2, 0, 0, 1; DoubleIntegratorDynamics::state_array xdot; // control variance Sampler::SAMPLING_PARAMS_T sampler_params; for (int i = 0; i < Dyn::CONTROL_DIM; i++) { sampler_params.std_dev[i] = 1; } // Save actual trajectories, nominal_trajectory, free energy std::vector robust_rc_trajectory(Dyn::STATE_DIM * total_time_horizon, 0); std::vector 
robust_rc_nominal_traj(Dyn::STATE_DIM * num_timesteps * total_time_horizon, 0); std::vector robust_rc_nominal_free_energy(total_time_horizon, 0); std::vector robust_rc_real_free_energy(total_time_horizon, 0); std::vector robust_rc_nominal_free_energy_bound(total_time_horizon, 0); std::vector robust_rc_real_free_energy_bound(total_time_horizon, 0); std::vector robust_rc_real_free_energy_growth_bound(total_time_horizon, 0); std::vector robust_rc_nominal_free_energy_growth(total_time_horizon, 0); std::vector robust_rc_real_free_energy_growth(total_time_horizon, 0); std::vector robust_rc_nominal_state_used(total_time_horizon, 0); // Initialize the controllers Dyn model(100); RCost cost; auto params = cost.getParams(); params.crash_cost = 100; cost.setParams(params); Sampler sampler(sampler_params); // DDP cost parameters Feedback fb_controller(&model, dt); auto fb_params = fb_controller.getParams(); fb_params.Q.diagonal() << 500, 500, 100, 100; fb_controller.setParams(fb_params); // Value function threshold float value_function_threshold = 20.0; auto controller = RobustMPPIController( &model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha, value_function_threshold); auto controller_params = controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 1, 1); controller_params.cost_rollout_dim_ = dim3(64, 1, 1); controller.setParams(controller_params); // Start the loop for (int t = 0; t < total_time_horizon; ++t) { /********************** Robust Robust Cost **********************/ // Compute the control controller.updateImportanceSamplingControl(x, 1); controller.computeControl(x, 1); // Compute the feedback gains controller.computeFeedback(x); // Propagate the feedback trajectory controller.computeFeedbackPropagatedStateSeq(); auto nominal_trajectory = controller.getTargetStateSeq(); auto nominal_control = controller.getControlSeq(); auto fe_stat = controller.getFreeEnergyStatistics(); // Save everything saveState(x, t, 
robust_rc_trajectory); saveTraj(nominal_trajectory, t, robust_rc_nominal_traj); robust_rc_nominal_free_energy[t] = fe_stat.nominal_sys.freeEnergyMean; robust_rc_real_free_energy[t] = fe_stat.real_sys.freeEnergyMean; robust_rc_nominal_free_energy_bound[t] = value_function_threshold + 2 * fe_stat.nominal_sys.freeEnergyModifiedVariance; robust_rc_real_free_energy_bound[t] = fe_stat.nominal_sys.freeEnergyMean + cost.getLipshitzConstantCost() * 1 * (x - nominal_trajectory.col(0)).norm(); robust_rc_real_free_energy_growth_bound[t] = (value_function_threshold - fe_stat.nominal_sys.freeEnergyMean) + cost.getLipshitzConstantCost() * 8 * 20 * controller.computeDF() + 2 * fe_stat.nominal_sys.freeEnergyModifiedVariance; robust_rc_nominal_free_energy_growth[t] = fe_stat.nominal_sys.increase; robust_rc_real_free_energy_growth[t] = fe_stat.real_sys.increase; robust_rc_nominal_state_used[t] = fe_stat.nominal_state_used; // Get the open loop control DoubleIntegratorDynamics::control_array current_control = nominal_control.col(0); // Apply the feedback given the current state Dyn::control_array fb_control = controller.getFeedbackControl(x, nominal_trajectory.col(0), 0); current_control += fb_control; // Propagate the state forward model.computeDynamics(x, current_control, xdot); model.updateState(x, xdot, dt); // Add disturbance x += noise.col(t) * sqrt(model.getParams().system_noise) * dt; // Slide the control sequence controller.slideControlSequence(1); } /************* Save CNPY *********************/ cnpy::npy_save("robust_rc_state_trajectory.npy", robust_rc_trajectory.data(), { total_time_horizon, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("robust_rc_nominal_trajectory.npy", robust_rc_nominal_traj.data(), { total_time_horizon, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("robust_rc_nominal_free_energy.npy", robust_rc_nominal_free_energy.data(), { total_time_horizon }, "w"); cnpy::npy_save("robust_rc_real_free_energy.npy", 
robust_rc_real_free_energy.data(), { total_time_horizon }, "w"); cnpy::npy_save("robust_rc_nominal_state_used.npy", robust_rc_nominal_state_used.data(), { total_time_horizon }, "w"); cnpy::npy_save("robust_rc_real_free_energy_bound.npy", robust_rc_real_free_energy_bound.data(), { total_time_horizon }, "w"); cnpy::npy_save("robust_rc_nominal_free_energy_bound.npy", robust_rc_nominal_free_energy_bound.data(), { total_time_horizon }, "w"); cnpy::npy_save("robust_rc_real_free_energy_growth_bound.npy", robust_rc_real_free_energy_growth_bound.data(), { total_time_horizon }, "w"); cnpy::npy_save("robust_rc_real_free_energy_growth.npy", robust_rc_real_free_energy_growth.data(), { total_time_horizon }, "w"); cnpy::npy_save("robust_rc_nominal_free_energy_growth.npy", robust_rc_nominal_free_energy_growth.data(), { total_time_horizon }, "w"); } int main() { // Run the double integrator example on all the controllers with the SAME noise 20 times. // Create a random number generator // Random number generator for system noise std::mt19937 gen; // Standard mersenne_twister_engine which will be seeded std::normal_distribution normal_distribution; gen.seed(7); // Seed the 7, so everyone gets the same noise normal_distribution = std::normal_distribution(0, 1); Eigen::Matrix universal_noise; universal_noise.setZero(); // Create the noise for all systems for (int t = 0; t < total_time_horizon; ++t) { for (int i = 2; i < 4; ++i) { universal_noise(i, t) = normal_distribution(gen); } } runVanilla(universal_noise); std::cout << "Finished Vanilla" << std::endl; runVanillaLarge(universal_noise); std::cout << "Finished Vanilla Large" << std::endl; runVanillaLargeRC(universal_noise); std::cout << "Finished Vanilla Large with Robust Cost" << std::endl; runTube(universal_noise); std::cout << "Finished Tube with Standard Cost" << std::endl; runTubeRC(universal_noise); std::cout << "Finished Tube with Robust Cost" << std::endl; runRobustSc(universal_noise); std::cout << "Finished RMPPI with 
Standard Cost" << std::endl; runRobustRc(universal_noise); std::cout << "Finished RMPPI with Robust Cost" << std::endl; return 0; } ================================================ FILE: examples/double_integrator_example.cu ================================================ #include #include #include #include #include #include // #include #define USE_COLORED_NOISE const int TIMESTEPS = 65; const int NUM_ROLLOUTS = 128; using DYN = DoubleIntegratorDynamics; using COST = QuadraticCost; using FB_CONTROLLER = DDPFeedback; #ifdef USE_COLORED_NOISE using SAMPLER = mppi::sampling_distributions::ColoredNoiseDistribution; #else using SAMPLER = mppi::sampling_distributions::GaussianDistribution; #endif int main() { float dt = 0.015; // Set the initial state DYN::state_array x; x << -9, -9, 0.1, 0.1; DYN::state_array xdot; SAMPLER::SAMPLING_PARAMS_T sampler_params; for (int i = 0; i < DYN::CONTROL_DIM; i++) { sampler_params.std_dev[i] = 0.5; #ifdef USE_COLORED_NOISE sampler_params.exponents[i] = 1.0; #endif } SAMPLER sampler(sampler_params); // Create the dynamics, cost function, and feedback controller DYN model; COST cost; FB_CONTROLLER fb_controller = FB_CONTROLLER(&model, dt); auto cost_params = cost.getParams(); // Set up cost function DYN::state_array x_goal; x_goal << -4, -4, 0, 0; DYN::state_array q_coeffs; q_coeffs << 5, 5, 0.5, 0.5; for (int i = 0; i < DYN::STATE_DIM; i++) { cost_params.s_coeffs[i] = q_coeffs[i]; cost_params.s_goal[i] = x_goal[i]; } cost.setParams(cost_params); // Create MPPI Controller float lambda = 1; float alpha = 1.0; int max_iter = 1; int total_time_horizon = 300; auto controller = VanillaMPPIController( &model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); auto controller_params = controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 1, 1); controller_params.cost_rollout_dim_ = dim3(64, 1, 1); controller.setParams(controller_params); /********************** Vanilla MPPI **********************/ float 
cumulative_cost = 0; int crash = 0; for (int t = 0; t < total_time_horizon; ++t) { // Compute the control controller.computeControl(x, 1); auto nominal_control = controller.getControlSeq(); auto fe_stat = controller.getFreeEnergyStatistics(); DYN::control_array current_control = nominal_control.col(0); // Propagate dynamics forward DYN::output_array y; DYN::state_array x_next; // model.computeDynamics(x, current_control, xdot); // model.updateState(x, xdot, dt); model.step(x, x_next, xdot, current_control, y, t, dt); x = x_next; if (t % 10 == 0) { std::cout << "T: " << std::fixed << std::setprecision(3) << t * dt; // << "s Free Energy: " << fe_stat.real_sys.freeEnergyMean // << " +- " << fe_stat.real_sys.freeEnergyVariance << std::endl; std::cout << " X: " << x.transpose() << std::endl; } // Slide the control sequence controller.slideControlSequence(1); cumulative_cost += cost.computeRunningCost(y, current_control, t, &crash); } std::cout << "Total Cost: " << cumulative_cost << std::endl; return 0; } ================================================ FILE: include/mppi/controllers/ColoredMPPI/colored_mppi_controller.cu ================================================ #include #include #include #include #include #define ColoredMPPI_TEMPLATE \ template #define ColoredMPPI ColoredMPPIController ColoredMPPI_TEMPLATE ColoredMPPI::ColoredMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, float dt, int max_iter, float lambda, float alpha, int num_timesteps, const Eigen::Ref& init_control_traj, cudaStream_t stream) : PARENT_CLASS(model, cost, fb_controller, sampler, dt, max_iter, lambda, alpha, num_timesteps, init_control_traj, stream) { // Allocate CUDA memory for the controller allocateCUDAMemory(); // Copy the noise std_dev to the device // this->copyControlStdDevToDevice(); chooseAppropriateKernel(); } ColoredMPPI_TEMPLATE ColoredMPPI::ColoredMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, PARAMS_T& 
params, cudaStream_t stream) : PARENT_CLASS(model, cost, fb_controller, sampler, params, stream) { // Allocate CUDA memory for the controller allocateCUDAMemory(); // // Copy the noise std_dev to the device // this->copyControlStdDevToDevice(); chooseAppropriateKernel(); } ColoredMPPI_TEMPLATE void ColoredMPPI::chooseAppropriateKernel() { cudaDeviceProp deviceProp; HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, 0)); unsigned single_kernel_byte_size = mppi::kernels::calcRolloutCombinedKernelSharedMemSize( this->model_, this->cost_, this->sampler_, this->params_.dynamics_rollout_dim_); unsigned split_dyn_kernel_byte_size = mppi::kernels::calcRolloutDynamicsKernelSharedMemSize( this->model_, this->sampler_, this->params_.dynamics_rollout_dim_); unsigned split_cost_kernel_byte_size = mppi::kernels::calcRolloutCostKernelSharedMemSize(this->cost_, this->sampler_, this->params_.cost_rollout_dim_); unsigned vis_single_kernel_byte_size = mppi::kernels::calcVisualizeKernelSharedMemSize( this->model_, this->cost_, this->sampler_, this->getNumTimesteps(), this->params_.visualize_dim_); bool too_much_mem_single_kernel = single_kernel_byte_size > deviceProp.sharedMemPerBlock; bool too_much_mem_vis_kernel = vis_single_kernel_byte_size > deviceProp.sharedMemPerBlock; bool too_much_mem_split_kernel = split_dyn_kernel_byte_size > deviceProp.sharedMemPerBlock; too_much_mem_split_kernel = too_much_mem_split_kernel || split_cost_kernel_byte_size > deviceProp.sharedMemPerBlock; too_much_mem_single_kernel = too_much_mem_single_kernel || too_much_mem_vis_kernel; if (too_much_mem_split_kernel && too_much_mem_single_kernel) { std::string error_msg = "There is not enough shared memory on the GPU for either rollout kernel option. 
The combined rollout kernel " "takes " + std::to_string(single_kernel_byte_size) + " bytes, the cost rollout kernel takes " + std::to_string(split_cost_kernel_byte_size) + " bytes, the dynamics rollout kernel takes " + std::to_string(split_dyn_kernel_byte_size) + " bytes, the combined visualization kernel takes " + std::to_string(vis_single_kernel_byte_size) + " bytes, and the max is " + std::to_string(deviceProp.sharedMemPerBlock) + " bytes. Considering lowering the corresponding thread block sizes."; throw std::runtime_error(error_msg); } else if (too_much_mem_single_kernel) { this->setKernelChoice(kernelType::USE_SPLIT_KERNELS); return; } else if (too_much_mem_split_kernel) { this->setKernelChoice(kernelType::USE_SINGLE_KERNEL); return; } // Send the nominal control to the device this->copyNominalControlToDevice(false); state_array zero_state = this->model_->getZeroState(); // Send zero state to the device HANDLE_ERROR(cudaMemcpyAsync(this->initial_state_d_, zero_state.data(), DYN_T::STATE_DIM * sizeof(float), cudaMemcpyHostToDevice, this->stream_)); // Generate noise data this->sampler_->generateSamples(1, 0, this->gen_, true); float single_kernel_time_ms = std::numeric_limits::infinity(); float split_kernel_time_ms = std::numeric_limits::infinity(); // Evaluate each kernel that is applicable auto start_single_kernel_time = std::chrono::steady_clock::now(); for (int i = 0; i < this->getNumKernelEvaluations() && !too_much_mem_single_kernel; i++) { mppi::kernels::launchRolloutKernel( this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), NUM_ROLLOUTS, this->getLambda(), this->getAlpha(), this->initial_state_d_, this->trajectory_costs_d_, this->params_.dynamics_rollout_dim_, this->stream_, true); } auto end_single_kernel_time = std::chrono::steady_clock::now(); auto start_split_kernel_time = std::chrono::steady_clock::now(); for (int i = 0; i < this->getNumKernelEvaluations() && !too_much_mem_split_kernel; i++) { 
mppi::kernels::launchSplitRolloutKernel( this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), NUM_ROLLOUTS, this->getLambda(), this->getAlpha(), this->initial_state_d_, this->output_d_, this->trajectory_costs_d_, this->params_.dynamics_rollout_dim_, this->params_.cost_rollout_dim_, this->stream_, true); } auto end_split_kernel_time = std::chrono::steady_clock::now(); // calc times if (!too_much_mem_single_kernel) { single_kernel_time_ms = mppi::math::timeDiffms(end_single_kernel_time, start_single_kernel_time); } if (!too_much_mem_split_kernel) { split_kernel_time_ms = mppi::math::timeDiffms(end_split_kernel_time, start_split_kernel_time); } std::string kernel_choice = ""; if (split_kernel_time_ms < single_kernel_time_ms) { this->setKernelChoice(kernelType::USE_SPLIT_KERNELS); kernel_choice = "split "; } else { this->setKernelChoice(kernelType::USE_SINGLE_KERNEL); kernel_choice = "single"; } this->logger_->info("Choosing %s kernel based on split taking %f ms and single taking %f ms after %d iterations\n", kernel_choice.c_str(), split_kernel_time_ms, single_kernel_time_ms, this->getNumKernelEvaluations()); } ColoredMPPI_TEMPLATE ColoredMPPI::~ColoredMPPIController() { // all implemented in standard controller } ColoredMPPI_TEMPLATE void ColoredMPPI::computeControl(const Eigen::Ref& state, int optimization_stride) { this->free_energy_statistics_.real_sys.previousBaseline = this->getBaselineCost(); state_array local_state = state; if (getLeashActive()) { this->model_->enforceLeash(state, this->state_.col(leash_jump_), this->params_.state_leash_dist_, local_state); } // Send the initial condition to the device HANDLE_ERROR(cudaMemcpyAsync(this->initial_state_d_, local_state.data(), DYN_T::STATE_DIM * sizeof(float), cudaMemcpyHostToDevice, this->stream_)); float baseline_prev = 1e8; for (int opt_iter = 0; opt_iter < this->getNumIters(); opt_iter++) { // Send the nominal control to the device this->copyNominalControlToDevice(false); // 
Generate noise data this->sampler_->generateSamples(optimization_stride, opt_iter, this->gen_, false); // Launch the rollout kernel if (this->getKernelChoiceAsEnum() == kernelType::USE_SPLIT_KERNELS) { mppi::kernels::launchSplitRolloutKernel( this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), NUM_ROLLOUTS, this->getLambda(), this->getAlpha(), this->initial_state_d_, this->output_d_, this->trajectory_costs_d_, this->params_.dynamics_rollout_dim_, this->params_.cost_rollout_dim_, this->stream_, false); } else if (this->getKernelChoiceAsEnum() == kernelType::USE_SINGLE_KERNEL) { mppi::kernels::launchRolloutKernel( this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), NUM_ROLLOUTS, this->getLambda(), this->getAlpha(), this->initial_state_d_, this->trajectory_costs_d_, this->params_.dynamics_rollout_dim_, this->stream_, false); } // Copy the costs back to the host HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_.data(), this->trajectory_costs_d_, NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_)); HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); this->setBaseline(mppi::kernels::computeBaselineCost(this->trajectory_costs_.data(), NUM_ROLLOUTS)); if (this->getBaselineCost() > baseline_prev + 1) { this->logger_->debug("Previous Baseline: %f\n Baseline: %f\n", baseline_prev, this->getBaselineCost()); } baseline_prev = this->getBaselineCost(); // Launch the norm exponential kernel if (getGamma() == 0 || getRExp() == 0) { mppi::kernels::launchNormExpKernel(NUM_ROLLOUTS, this->getNormExpThreads(), this->trajectory_costs_d_, 1.0 / this->getLambda(), this->getBaselineCost(), this->stream_, false); } else { mppi::kernels::launchTsallisKernel(NUM_ROLLOUTS, this->getNormExpThreads(), this->trajectory_costs_d_, getGamma(), getRExp(), this->getBaselineCost(), this->stream_, false); } HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_.data(), this->trajectory_costs_d_, NUM_ROLLOUTS * 
sizeof(float), cudaMemcpyDeviceToHost, this->stream_)); HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); // Compute the normalizer this->setNormalizer(mppi::kernels::computeNormalizer(this->trajectory_costs_.data(), NUM_ROLLOUTS)); mppi::kernels::computeFreeEnergy(this->free_energy_statistics_.real_sys.freeEnergyMean, this->free_energy_statistics_.real_sys.freeEnergyVariance, this->free_energy_statistics_.real_sys.freeEnergyModifiedVariance, this->trajectory_costs_.data(), NUM_ROLLOUTS, this->getBaselineCost(), this->getLambda()); // Compute the cost weighted average //TODO SUM_STRIDE is BDIM_X, but should it be its own parameter? this->sampler_->updateDistributionParamsFromDevice(this->trajectory_costs_d_, this->getNormalizerCost(), 0, false); // Transfer the new control to the host this->sampler_->setHostOptimalControlSequence(this->control_.data(), 0, true); } this->free_energy_statistics_.real_sys.normalizerPercent = this->getNormalizerCost() / NUM_ROLLOUTS; this->free_energy_statistics_.real_sys.increase = this->getBaselineCost() - this->free_energy_statistics_.real_sys.previousBaseline; smoothControlTrajectory(); computeStateTrajectory(local_state); state_array zero_state = this->model_->getZeroState(); for (int i = 0; i < this->getNumTimesteps(); i++) { // this->model_->enforceConstraints(zero_state, this->control_.col(i)); this->control_.col(i)[1] = fminf(fmaxf(this->control_.col(i)[1], this->model_->control_rngs_[1].x), this->model_->control_rngs_[1].y); } // Copy back sampled trajectories this->copySampledControlFromDevice(false); if (this->getKernelChoiceAsEnum() == kernelType::USE_SINGLE_KERNEL) { // copy initial state to vis initial state for use with visualizeKernel HANDLE_ERROR(cudaMemcpyAsync(this->vis_initial_state_d_, this->initial_state_d_, sizeof(float) * DYN_T::STATE_DIM, cudaMemcpyDeviceToDevice, this->vis_stream_)); } this->copyTopControlFromDevice(true); } ColoredMPPI_TEMPLATE void ColoredMPPI::allocateCUDAMemory() { 
PARENT_CLASS::allocateCUDAMemoryHelper(); } ColoredMPPI_TEMPLATE void ColoredMPPI::computeStateTrajectory(const Eigen::Ref& x0) { this->computeOutputTrajectoryHelper(this->output_, this->state_, x0, this->control_); } ColoredMPPI_TEMPLATE void ColoredMPPI::slideControlSequence(int steps) { // TODO does the logic of handling control history reasonable? leash_jump_ = steps; // Save the control history this->saveControlHistoryHelper(steps, this->control_, this->control_history_); this->slideControlSequenceHelper(steps, this->control_); } ColoredMPPI_TEMPLATE void ColoredMPPI::smoothControlTrajectory() { this->smoothControlTrajectoryHelper(this->control_, this->control_history_); } ColoredMPPI_TEMPLATE void ColoredMPPI::calculateSampledStateTrajectories() { int num_sampled_trajectories = this->getTotalSampledTrajectories(); // controls already copied in compute control if (this->getKernelChoiceAsEnum() == kernelType::USE_SPLIT_KERNELS) { mppi::kernels::launchVisualizeCostKernel( this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), num_sampled_trajectories, this->getLambda(), this->getAlpha(), this->sampled_outputs_d_, this->sampled_crash_status_d_, this->sampled_costs_d_, this->params_.cost_rollout_dim_, this->stream_, false); } else if (this->getKernelChoiceAsEnum() == kernelType::USE_SINGLE_KERNEL) { mppi::kernels::launchVisualizeKernel( this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), num_sampled_trajectories, this->getLambda(), this->getAlpha(), this->vis_initial_state_d_, this->sampled_outputs_d_, this->sampled_costs_d_, this->sampled_crash_status_d_, this->params_.visualize_dim_, this->stream_, false); } // #if true // mppi::kernels::launchVisualizeCostKernel( // this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), // num_sampled_trajectories, this->getLambda(), this->getAlpha(), this->sampled_outputs_d_, // this->sampled_crash_status_d_, this->sampled_costs_d_, this->params_.cost_rollout_dim_, 
this->vis_stream_, // false); // #else // mppi_common::launchVisualizeCostKernel( // this->cost_, this->getDt(), this->getNumTimesteps(), num_sampled_trajectories, this->getLambda(), // this->getAlpha(), this->sampled_outputs_d_, this->sampled_noise_d_, this->sampled_crash_status_d_, // this->control_std_dev_d_, this->sampled_costs_d_, this->vis_stream_, false); // #endif for (int i = 0; i < num_sampled_trajectories; i++) { // set initial state to the first location // shifted by one since we do not save the initial state HANDLE_ERROR(cudaMemcpyAsync(this->sampled_trajectories_[i].data(), this->sampled_outputs_d_ + i * this->getNumTimesteps() * DYN_T::OUTPUT_DIM, (this->getNumTimesteps() - 1) * DYN_T::OUTPUT_DIM * sizeof(float), cudaMemcpyDeviceToHost, this->vis_stream_)); HANDLE_ERROR( cudaMemcpyAsync(this->sampled_costs_[i].data(), this->sampled_costs_d_ + (i * (this->getNumTimesteps() + 1)), (this->getNumTimesteps() + 1) * sizeof(float), cudaMemcpyDeviceToHost, this->vis_stream_)); HANDLE_ERROR(cudaMemcpyAsync(this->sampled_crash_status_[i].data(), this->sampled_crash_status_d_ + (i * this->getNumTimesteps()), this->getNumTimesteps() * sizeof(float), cudaMemcpyDeviceToHost, this->vis_stream_)); } HANDLE_ERROR(cudaStreamSynchronize(this->vis_stream_)); } #undef ColoredMPPI ================================================ FILE: include/mppi/controllers/ColoredMPPI/colored_mppi_controller.cuh ================================================ /** * Created by jason on 10/30/19. 
 * Creates the API for interfacing with an MPPI controller
 * should define a compute_control based on state as well
 * as return timing info
 **/
#ifndef MPPIGENERIC_COLORED_MPPI_CONTROLLER_CUH
#define MPPIGENERIC_COLORED_MPPI_CONTROLLER_CUH

// NOTE(review): the #include targets and every template parameter list in this
// header appear to have been stripped by the extraction (angle-bracket text lost)
// -- confirm against the original header before building.
#include
#include
#include

// Parameter bundle for the colored-noise MPPI controller. Extends the base
// ControllerParams with Tsallis-weighting constants (r, gamma) and a per-state
// leash distance used to clamp how far the replanning state may jump.
template
struct ColoredMPPIParams : public ControllerParams
{
  float r = 2.0;      // Tsallis r exponent (used by the norm-exp/Tsallis weighting)
  float gamma = 0;    // Tsallis gamma; 0 selects the standard exponential weighting
  // Max allowed per-dimension distance between the measured state and the
  // predicted state before the leash clamps the replanning state.
  Eigen::Matrix state_leash_dist_ = Eigen::Matrix::Zero();

  ColoredMPPIParams() = default;

  // Copy construction goes through the base-class assignment so base fields are
  // copied; the colored-noise exponents copy is currently disabled (see below).
  ColoredMPPIParams(const ColoredMPPIParams& other)
  {
    typedef ControllerParams BASE;
    const BASE& other_item_ref = other;
    *(static_cast(this)) = other_item_ref;
    // this->colored_noise_exponents_ = other.colored_noise_exponents_;
  }

  // Non-const overload mirroring the const copy constructor.
  ColoredMPPIParams(ColoredMPPIParams& other)
  {
    typedef ControllerParams BASE;
    BASE& other_item_ref = other;
    *(static_cast(this)) = other_item_ref;
    // this->colored_noise_exponents_ = other.colored_noise_exponents_;
  }
};

// MPPI controller variant that samples control perturbations from a colored
// (power-law spectrum) noise distribution instead of white Gaussian noise.
template , class PARAMS_T = ColoredMPPIParams>
class ColoredMPPIController : public Controller
{
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW

  typedef Controller PARENT_CLASS;
  // need control_array = ... so that we can initialize
  // Eigen::Matrix with Eigen::Matrix::Zero();
  using control_array = typename PARENT_CLASS::control_array;
  using control_trajectory = typename PARENT_CLASS::control_trajectory;
  using state_trajectory = typename PARENT_CLASS::state_trajectory;
  using state_array = typename PARENT_CLASS::state_array;
  using sampled_cost_traj = typename PARENT_CLASS::sampled_cost_traj;
  using FEEDBACK_GPU = typename PARENT_CLASS::TEMPLATED_FEEDBACK_GPU;

  /**
   *
   * Public member functions
   */
  // Constructor
  ColoredMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, float dt, int max_iter,
                        float lambda, float alpha, int num_timesteps = MAX_TIMESTEPS,
                        const Eigen::Ref& init_control_traj = control_trajectory::Zero(),
                        cudaStream_t stream = nullptr);

  ColoredMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, PARAMS_T& params,
                        cudaStream_t stream = nullptr);

  // Destructor
  ~ColoredMPPIController();

  std::string getControllerName() const override
  {
    return "Colored MPPI";
  };

  /**
   * computes a new control sequence
   * @param state starting position
   */
  void computeControl(const Eigen::Ref& state, int optimization_stride = 1) override;

  /**
   * Slide the control sequence back n steps
   */
  void slideControlSequence(int optimization_stride) override;

  void setPercentageSampledControlTrajectories(float new_perc)
  {
    this->setPercentageSampledControlTrajectoriesHelper(new_perc, 1);
  }

  // Accessors for the Tsallis weighting constants stored in params_.
  void setGamma(float gamma)
  {
    this->params_.gamma = gamma;
  }
  float getGamma()
  {
    return this->params_.gamma;
  }
  void setRExp(float r)
  {
    this->params_.r = r;
  }
  float getRExp()
  {
    return this->params_.r;
  }

  // Pass-throughs to the sampling distribution's offset decay rate.
  void setOffsetDecayRate(float decay_rate)
  {
    this->sampler_->setOffsetDecayRate(decay_rate);
  }
  float getOffsetDecayRate()
  {
    return this->sampler_->getOffsetDecayRate();
  }

  // Writes one colored-noise exponent per control dimension into the sampler.
  void setColoredNoiseExponents(std::vector& new_exponents)
  {
    auto sampler_params = this->sampler_->getParams();
    for (int i = 0; i < new_exponents.size(); i++)
    {
      sampler_params.exponents[i] = new_exponents[i];
    }
    this->sampler_->setParams(sampler_params);
  }

  float getColoredNoiseExponent(int index)
  {
    auto sampler_params = this->sampler_->getParams();
    return sampler_params.exponents[index];
  }

  std::vector getColoredNoiseExponents()
  {
    std::vector exponents;
    auto sampler_params = this->sampler_->getParams();
    for (int i = 0; i < DYN_T::CONTROL_DIM; i++)
    {
      exponents.push_back(sampler_params.exponents[i]);
    }
    return exponents;
  }

  void setNoiseDecay(float new_noise_decay)
  {
    auto sampler_params = this->sampler_->getParams();
    sampler_params.std_dev_decay = new_noise_decay;
    this->sampler_->setParams(sampler_params);
  }

  void setStateLeashLength(float new_state_leash, int index = 0)
  {
    this->params_.state_leash_dist_[index] = new_state_leash;
  }

  float getStateLeashLength(int index)
  {
    return this->params_.state_leash_dist_[index];
  }

  bool getLeashActive()
  {
    return leash_active_;
  }

  void setLeashActive(bool new_leash_active)
  {
    leash_active_ = new_leash_active;
  }

  void calculateSampledStateTrajectories() override;

  void chooseAppropriateKernel() override;

protected:
  std::vector& getColoredNoiseExponentsLValue()
  {
    return this->params_.colored_noise_exponents_;
  }

  void computeStateTrajectory(const Eigen::Ref& x0);

  void smoothControlTrajectory();

  // Timestep offset used when applying the state leash after a slide.
  int leash_jump_ = 1;
  bool leash_active_ = false;
  float control_std_dev_decay_ = 1.0;

private:
  // ======== MUST BE OVERWRITTEN =========
  void allocateCUDAMemory();
  // ======== END MUST BE OVERWRITTEN =====
};

#if __CUDACC__
#include "colored_mppi_controller.cu"
#endif

#endif  // MPPIGENERIC_COLORED_MPPI_CONTROLLER_CUH

================================================
FILE: include/mppi/controllers/MPPI/mppi_controller.cu
================================================
#include
#include
#include
#include
#include
#include
#include

#define VANILLA_MPPI_TEMPLATE \
  template

#define VanillaMPPI VanillaMPPIController

VANILLA_MPPI_TEMPLATE
VanillaMPPI::VanillaMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, float
                                   dt, int max_iter, float lambda, float alpha, int num_timesteps,
                                   const Eigen::Ref& init_control_traj, cudaStream_t stream)
  : PARENT_CLASS(model, cost, fb_controller, sampler, dt, max_iter, lambda, alpha, num_timesteps, init_control_traj,
                 stream)
{
  // Allocate CUDA memory for the controller
  allocateCUDAMemory();
  // Copy the noise std_dev to the device
  // this->copyControlStdDevToDevice();
  chooseAppropriateKernel();
}

VANILLA_MPPI_TEMPLATE
VanillaMPPI::VanillaMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler,
                                   PARAMS_T& params, cudaStream_t stream)
  : PARENT_CLASS(model, cost, fb_controller, sampler, params, stream)
{
  // Allocate CUDA memory for the controller
  allocateCUDAMemory();
  // // Copy the noise std_dev to the device
  // this->copyControlStdDevToDevice();
  chooseAppropriateKernel();
}

// Decides between the combined (single) rollout kernel and the split
// dynamics/cost kernels. If only one fits in the device's shared memory, that
// one is chosen; if both fit, each is timed for getNumKernelEvaluations()
// launches from a zero state and the faster wins.
VANILLA_MPPI_TEMPLATE
void VanillaMPPI::chooseAppropriateKernel()
{
  cudaDeviceProp deviceProp;
  HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, 0));

  // Shared-memory footprints for each kernel configuration.
  unsigned single_kernel_byte_size = mppi::kernels::calcRolloutCombinedKernelSharedMemSize(
      this->model_, this->cost_, this->sampler_, this->params_.dynamics_rollout_dim_);
  unsigned split_dyn_kernel_byte_size = mppi::kernels::calcRolloutDynamicsKernelSharedMemSize(
      this->model_, this->sampler_, this->params_.dynamics_rollout_dim_);
  unsigned split_cost_kernel_byte_size =
      mppi::kernels::calcRolloutCostKernelSharedMemSize(this->cost_, this->sampler_, this->params_.cost_rollout_dim_);
  unsigned vis_single_kernel_byte_size = mppi::kernels::calcVisualizeKernelSharedMemSize(
      this->model_, this->cost_, this->sampler_, this->getNumTimesteps(), this->params_.visualize_dim_);

  bool too_much_mem_single_kernel = single_kernel_byte_size > deviceProp.sharedMemPerBlock;
  bool too_much_mem_vis_kernel = vis_single_kernel_byte_size > deviceProp.sharedMemPerBlock;
  bool too_much_mem_split_kernel = split_dyn_kernel_byte_size > deviceProp.sharedMemPerBlock;
  too_much_mem_split_kernel = too_much_mem_split_kernel || split_cost_kernel_byte_size > deviceProp.sharedMemPerBlock;
  // The single-kernel path also needs the combined visualization kernel to fit.
  too_much_mem_single_kernel = too_much_mem_single_kernel || too_much_mem_vis_kernel;

  if (too_much_mem_split_kernel && too_much_mem_single_kernel)
  {
    std::string error_msg =
        "There is not enough shared memory on the GPU for either rollout kernel option. The combined rollout kernel "
        "takes " +
        std::to_string(single_kernel_byte_size) + " bytes, the cost rollout kernel takes " +
        std::to_string(split_cost_kernel_byte_size) + " bytes, the dynamics rollout kernel takes " +
        std::to_string(split_dyn_kernel_byte_size) + " bytes, the combined visualization kernel takes " +
        std::to_string(vis_single_kernel_byte_size) + " bytes, and the max is " +
        std::to_string(deviceProp.sharedMemPerBlock) +
        " bytes. Considering lowering the corresponding thread block sizes.";
    throw std::runtime_error(error_msg);
  }
  else if (too_much_mem_single_kernel)
  {
    this->setKernelChoice(kernelType::USE_SPLIT_KERNELS);
    return;
  }
  else if (too_much_mem_split_kernel)
  {
    this->setKernelChoice(kernelType::USE_SINGLE_KERNEL);
    return;
  }

  // Both kernels fit: benchmark them. Set up a zero-state rollout as the workload.
  // Send the nominal control to the device
  this->copyNominalControlToDevice(false);
  state_array zero_state = this->model_->getZeroState();
  // Send zero state to the device
  HANDLE_ERROR(cudaMemcpyAsync(this->initial_state_d_, zero_state.data(), DYN_T::STATE_DIM * sizeof(float),
                               cudaMemcpyHostToDevice, this->stream_));
  // Generate noise data
  this->sampler_->generateSamples(1, 0, this->gen_, true);

  float single_kernel_time_ms = std::numeric_limits::infinity();
  float split_kernel_time_ms = std::numeric_limits::infinity();

  // Evaluate each kernel that is applicable
  auto start_single_kernel_time = std::chrono::steady_clock::now();
  for (int i = 0; i < this->getNumKernelEvaluations() && !too_much_mem_single_kernel; i++)
  {
    mppi::kernels::launchRolloutKernel(
        this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), NUM_ROLLOUTS,
        this->getLambda(), this->getAlpha(), this->initial_state_d_, this->trajectory_costs_d_,
        this->params_.dynamics_rollout_dim_, this->stream_, true);
  }
  auto end_single_kernel_time = std::chrono::steady_clock::now();

  auto start_split_kernel_time = std::chrono::steady_clock::now();
  for (int i = 0; i < this->getNumKernelEvaluations() && !too_much_mem_split_kernel; i++)
  {
    mppi::kernels::launchSplitRolloutKernel(
        this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), NUM_ROLLOUTS,
        this->getLambda(), this->getAlpha(), this->initial_state_d_, this->output_d_, this->trajectory_costs_d_,
        this->params_.dynamics_rollout_dim_, this->params_.cost_rollout_dim_, this->stream_, true);
  }
  auto end_split_kernel_time = std::chrono::steady_clock::now();

  // calc times
  if (!too_much_mem_single_kernel)
  {
    single_kernel_time_ms = mppi::math::timeDiffms(end_single_kernel_time, start_single_kernel_time);
  }
  if (!too_much_mem_split_kernel)
  {
    split_kernel_time_ms = mppi::math::timeDiffms(end_split_kernel_time, start_split_kernel_time);
  }

  std::string kernel_choice = "";
  if (split_kernel_time_ms < single_kernel_time_ms)
  {
    this->setKernelChoice(kernelType::USE_SPLIT_KERNELS);
    kernel_choice = "split ";
  }
  else
  {
    this->setKernelChoice(kernelType::USE_SINGLE_KERNEL);
    kernel_choice = "single";
  }
  this->logger_->info("Choosing %s kernel based on split taking %f ms and single taking %f ms after %d iterations\n",
                      kernel_choice.c_str(), split_kernel_time_ms, single_kernel_time_ms,
                      this->getNumKernelEvaluations());
}

VANILLA_MPPI_TEMPLATE
VanillaMPPI::~VanillaMPPIController()
{
  // all implemented in standard controller
}

// One MPPI update: for each optimization iteration, sample noisy rollouts on the
// GPU, compute the softmax (importance-sampling) weights of their costs, and let
// the sampler re-estimate the optimal control sequence.
VANILLA_MPPI_TEMPLATE
void VanillaMPPI::computeControl(const Eigen::Ref& state, int optimization_stride)
{
  this->free_energy_statistics_.real_sys.previousBaseline = this->getBaselineCost();

  // Send the initial condition to the device
  HANDLE_ERROR(cudaMemcpyAsync(this->initial_state_d_, state.data(), DYN_T::STATE_DIM * sizeof(float),
                               cudaMemcpyHostToDevice, this->stream_));

  float baseline_prev = 1e8;

  for (int opt_iter = 0; opt_iter < this->getNumIters(); opt_iter++)
  {
    // Send the nominal control to the device
    this->copyNominalControlToDevice(false);

    // Generate noise data
    this->sampler_->generateSamples(optimization_stride, opt_iter, this->gen_, false);

    // Launch the rollout kernel
    if (this->getKernelChoiceAsEnum() == kernelType::USE_SPLIT_KERNELS)
    {
      mppi::kernels::launchSplitRolloutKernel(
          this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), NUM_ROLLOUTS,
          this->getLambda(), this->getAlpha(), this->initial_state_d_, this->output_d_, this->trajectory_costs_d_,
          this->params_.dynamics_rollout_dim_, this->params_.cost_rollout_dim_, this->stream_, false);
    }
    else if (this->getKernelChoiceAsEnum() == kernelType::USE_SINGLE_KERNEL)
    {
      mppi::kernels::launchRolloutKernel(
          this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), NUM_ROLLOUTS,
          this->getLambda(), this->getAlpha(), this->initial_state_d_, this->trajectory_costs_d_,
          this->params_.dynamics_rollout_dim_, this->stream_, false);
    }

    // Copy the costs back to the host
    HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_.data(), this->trajectory_costs_d_,
                                 NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaStreamSynchronize(this->stream_));

    this->setBaseline(mppi::kernels::computeBaselineCost(this->trajectory_costs_.data(), NUM_ROLLOUTS));

    // Warn when the baseline got noticeably worse between iterations.
    if (this->getBaselineCost() > baseline_prev + 1)
    {
      this->logger_->debug("Previous Baseline: %f\n Baseline: %f\n", baseline_prev, this->getBaselineCost());
    }
    baseline_prev = this->getBaselineCost();

    // Launch the norm exponential kernel
    mppi::kernels::launchNormExpKernel(NUM_ROLLOUTS, this->getNormExpThreads(), this->trajectory_costs_d_,
                                       1.0 / this->getLambda(), this->getBaselineCost(), this->stream_, false);
    HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_.data(), this->trajectory_costs_d_,
                                 NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaStreamSynchronize(this->stream_));

    // Compute the normalizer
    this->setNormalizer(mppi::kernels::computeNormalizer(this->trajectory_costs_.data(), NUM_ROLLOUTS));

    mppi::kernels::computeFreeEnergy(this->free_energy_statistics_.real_sys.freeEnergyMean,
                                     this->free_energy_statistics_.real_sys.freeEnergyVariance,
                                     this->free_energy_statistics_.real_sys.freeEnergyModifiedVariance,
                                     this->trajectory_costs_.data(), NUM_ROLLOUTS, this->getBaselineCost(),
                                     this->getLambda());

    // Let the sampler compute the weighted update from the device-side weights.
    this->sampler_->updateDistributionParamsFromDevice(this->trajectory_costs_d_, this->getNormalizerCost(), 0, false);

    // Transfer the new control to the host
    this->sampler_->setHostOptimalControlSequence(this->control_.data(), 0, true);
  }

  this->free_energy_statistics_.real_sys.normalizerPercent = this->getNormalizerCost() / NUM_ROLLOUTS;
  this->free_energy_statistics_.real_sys.increase =
      this->getBaselineCost() - this->free_energy_statistics_.real_sys.previousBaseline;

  smoothControlTrajectory();
  computeStateTrajectory(state);

  // Clamp the final control sequence to the model's control limits.
  // NOTE(review): constraints are applied with a zero state -- presumably the
  // limits are state-independent here; confirm against DYN_T::enforceConstraints.
  state_array zero_state = this->model_->getZeroState();
  for (int i = 0; i < this->getNumTimesteps(); i++)
  {
    this->model_->enforceConstraints(zero_state, this->control_.col(i));
  }

  // Copy back sampled trajectories
  this->copySampledControlFromDevice(false);
  if (this->getKernelChoiceAsEnum() == kernelType::USE_SINGLE_KERNEL)
  {
    // copy initial state to vis initial state for use with visualizeKernel
    HANDLE_ERROR(cudaMemcpyAsync(this->vis_initial_state_d_, this->initial_state_d_, sizeof(float) * DYN_T::STATE_DIM,
                                 cudaMemcpyDeviceToDevice, this->vis_stream_));
  }
  this->copyTopControlFromDevice(true);
}

VANILLA_MPPI_TEMPLATE
void VanillaMPPI::allocateCUDAMemory()
{
  PARENT_CLASS::allocateCUDAMemoryHelper();
}

VANILLA_MPPI_TEMPLATE
void VanillaMPPI::computeStateTrajectory(const Eigen::Ref& x0)
{
  this->computeOutputTrajectoryHelper(this->output_, this->state_, x0, this->control_);
}

VANILLA_MPPI_TEMPLATE
void VanillaMPPI::smoothControlTrajectory()
{
  this->smoothControlTrajectoryHelper(this->control_, this->control_history_);
}

// Runs the visualization kernel matching the chosen rollout split, then copies
// sampled outputs, costs, and crash statuses back to host buffers on vis_stream_.
VANILLA_MPPI_TEMPLATE
void VanillaMPPI::calculateSampledStateTrajectories()
{
  int num_sampled_trajectories = this->getTotalSampledTrajectories();

  // control already copied in compute control, so run kernel
  if (this->getKernelChoiceAsEnum() == kernelType::USE_SPLIT_KERNELS)
  {
    mppi::kernels::launchVisualizeCostKernel(
        this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), num_sampled_trajectories,
        this->getLambda(), this->getAlpha(), this->sampled_outputs_d_, this->sampled_crash_status_d_,
        this->sampled_costs_d_, this->params_.cost_rollout_dim_, this->stream_, false);
  }
  else if (this->getKernelChoiceAsEnum() == kernelType::USE_SINGLE_KERNEL)
  {
    mppi::kernels::launchVisualizeKernel(
        this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), num_sampled_trajectories,
        this->getLambda(), this->getAlpha(), this->vis_initial_state_d_, this->sampled_outputs_d_,
        this->sampled_costs_d_, this->sampled_crash_status_d_, this->params_.visualize_dim_, this->stream_, false);
  }

  for (int i = 0; i < num_sampled_trajectories; i++)
  {
    // set initial state to the first location
    // shifted by one since we do not save the initial state
    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_trajectories_[i].data(),
                                 this->sampled_outputs_d_ + i * this->getNumTimesteps() * DYN_T::OUTPUT_DIM,
                                 (this->getNumTimesteps() - 1) * DYN_T::OUTPUT_DIM * sizeof(float),
                                 cudaMemcpyDeviceToHost, this->vis_stream_));
    HANDLE_ERROR(
        cudaMemcpyAsync(this->sampled_costs_[i].data(), this->sampled_costs_d_ + (i * (this->getNumTimesteps() + 1)),
                        (this->getNumTimesteps() + 1) * sizeof(float), cudaMemcpyDeviceToHost, this->vis_stream_));
    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_crash_status_[i].data(),
                                 this->sampled_crash_status_d_ + (i * this->getNumTimesteps()),
                                 this->getNumTimesteps() * sizeof(int), cudaMemcpyDeviceToHost, this->vis_stream_));
  }
  HANDLE_ERROR(cudaStreamSynchronize(this->vis_stream_));
}

#undef VANILLA_MPPI_TEMPLATE
#undef VanillaMPPI
================================================
FILE: include/mppi/controllers/MPPI/mppi_controller.cuh
================================================
/**
 * Created by jason on 10/30/19.
 * Creates the API for interfacing with an MPPI controller
 * should define a compute_control based on state as well
 * as return timing info
 **/
#ifndef MPPIGENERIC_MPPI_CONTROLLER_CUH
#define MPPIGENERIC_MPPI_CONTROLLER_CUH

// NOTE(review): the #include targets and every template parameter list in this
// header appear to have been stripped by the extraction (angle-bracket text lost)
// -- confirm against the original header before building.
#include
#include

// Baseline ("vanilla") MPPI controller: samples control perturbations, rolls out
// the dynamics and cost on the GPU, and computes the softmax-weighted control update.
template , class PARAMS_T = ControllerParams>
class VanillaMPPIController : public Controller
{
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW

  // Need control_array = ... so that we can initialize
  // Eigen::Matrix with Eigen::Matrix::Zero();
  typedef Controller PARENT_CLASS;
  using control_array = typename PARENT_CLASS::control_array;
  using control_trajectory = typename PARENT_CLASS::control_trajectory;
  using state_trajectory = typename PARENT_CLASS::state_trajectory;
  using state_array = typename PARENT_CLASS::state_array;
  using sampled_cost_traj = typename PARENT_CLASS::sampled_cost_traj;
  using FEEDBACK_GPU = typename PARENT_CLASS::TEMPLATED_FEEDBACK_GPU;

  /**
   *
   * Public member functions
   */
  // Constructor
  VanillaMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, float dt, int max_iter,
                        float lambda, float alpha, int num_timesteps = MAX_TIMESTEPS,
                        const Eigen::Ref& init_control_traj = control_trajectory::Zero(),
                        cudaStream_t stream = nullptr);

  VanillaMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, PARAMS_T& params,
                        cudaStream_t stream = nullptr);

  // Destructor
  ~VanillaMPPIController();

  std::string getControllerName() const override
  {
    return "Vanilla MPPI";
  };

  /**
   * computes a new control sequence
   * @param state starting position
   */
  void computeControl(const Eigen::Ref& state, int optimization_stride = 1) override;

  void setPercentageSampledControlTrajectories(float new_perc)
  {
    this->setPercentageSampledControlTrajectoriesHelper(new_perc, 1);
  }

  void calculateSampledStateTrajectories() override;

  void chooseAppropriateKernel() override;

protected:
  void computeStateTrajectory(const Eigen::Ref& x0);

  void smoothControlTrajectory();

private:
  // ======== MUST BE OVERWRITTEN =========
  void allocateCUDAMemory();
  // ======== END MUST BE OVERWRITTEN =====
};

#if __CUDACC__
#include "mppi_controller.cu"
#endif

#endif  // MPPIGENERIC_MPPI_CONTROLLER_CUH

================================================
FILE: include/mppi/controllers/Primitives/primitives_controller.cu
================================================
#include
#include
#include
#include
#include
#include

#define Primitives \
  PrimitivesController

// Constructor: sets up colored-noise / piecewise-noise exponent vectors and
// pushes the control std-dev to the device.
template
Primitives::PrimitivesController(DYN_T* model, COST_T* cost, FB_T* fb_controller, float dt, int max_iter, float lambda,
                                 float alpha, const Eigen::Ref& control_std_dev, int num_timesteps,
                                 const Eigen::Ref& init_control_traj, cudaStream_t stream)
  : PARENT_CLASS(model, cost, fb_controller, dt, max_iter, lambda, alpha, control_std_dev, num_timesteps,
                 init_control_traj, stream)
{
  // Allocate CUDA memory for the controller
  allocateCUDAMemory();

  std::vector tmp_vec(DYN_T::CONTROL_DIM, 0.0);
  getColoredNoiseExponentsLValue() = std::move(tmp_vec);
  getScalePiecewiseNoiseLValue() = std::move(tmp_vec);

  // Copy the noise std_dev to the device
  this->copyControlStdDevToDevice();
  control_mppi_history_ = Eigen::Matrix::Zero();
}

// Params-struct constructor: only fills in the exponent vectors when the params
// did not already provide them.
template
Primitives::PrimitivesController(DYN_T* model, COST_T* cost, FB_T* fb_controller, PARAMS_T& params, cudaStream_t stream)
  : PARENT_CLASS(model, cost, fb_controller, params, stream)
{
  // Allocate CUDA memory for the controller
  allocateCUDAMemory();

  if (this->getColoredNoiseExponentsLValue().size() == 0)
  {
    std::vector tmp_vec(DYN_T::CONTROL_DIM, 0.0);
    getColoredNoiseExponentsLValue() = std::move(tmp_vec);
  }
  if (this->getScalePiecewiseNoiseLValue().size() == 0)
  {
    std::vector tmp_vec(DYN_T::CONTROL_DIM, 0.0);
    getScalePiecewiseNoiseLValue() = std::move(tmp_vec);
  }

  // Copy the noise std_dev to the
device
  this->copyControlStdDevToDevice();
}

template
Primitives::~PrimitivesController()
{
  // all implemented in standard controller
}

// Two-phase control update: (1) an intermediate planner that evaluates piecewise
// linear / colored-noise primitive trajectories and keeps the best one, then
// (2) a standard MPPI refinement; finally picks whichever plan has lower cost.
template
void Primitives::computeControl(const Eigen::Ref& state, int optimization_stride)
{
  // this->free_energy_statistics_.real_sys.previousBaseline = this->getBaselineCost();
  // Optionally clamp the replanning state to within the leash of the prediction.
  state_array local_state = state;
  if (getLeashActive())
  {
    this->model_->enforceLeash(state, this->state_.col(leash_jump_), this->params_.state_leash_dist_, local_state);
  }

  // Send the initial condition to the device
  HANDLE_ERROR(cudaMemcpyAsync(this->initial_state_d_, local_state.data(), DYN_T::STATE_DIM * sizeof(float),
                               cudaMemcpyHostToDevice, this->stream_));

  /////////////////
  // BEGIN INTERMEDIATE PLANNER
  // Compute intermediate plan using piecewise linear noise, and choosing the best
  int prev_controls_idx = 1;
  float primitives_baseline = 0.0;
  float baseline_prev = 0.0;
  int best_idx = -1;

  // Send the nominal control to the device
  this->copyNominalControlToDevice(false);

  for (int opt_iter = 0; opt_iter < getNumPrimitiveIterations(); opt_iter++)
  {
    powerlaw_psd_gaussian(getColoredNoiseExponentsLValue(), this->getNumTimesteps(), NUM_ROLLOUTS,
                          this->control_noise_d_, 0, this->gen_, this->getOffsetDecayRate(), this->stream_);

    // Generate piecewise linear noise data, update control_noise_d_
    piecewise_linear_noise(this->getNumTimesteps(), NUM_ROLLOUTS, DYN_T::CONTROL_DIM, getPiecewiseSegments(),
                           optimization_stride, getScalePiecewiseNoiseLValue(), getFracRandomNoiseTrajLValue(),
                           getScaleAddNominalNoiseLValue(), this->control_d_, this->control_noise_d_,
                           this->control_std_dev_d_, this->gen_, this->stream_);

    // // Set nominal controls to zero because we want to use the noise directly
    // this->control_ = control_trajectory::Zero();
    // // Send the zero nominal control to the device
    // this->copyNominalControlToDevice();

    // Launch the rollout kernel
    mppi_common::launchFastRolloutKernel(
        this->model_->model_d_, this->cost_->cost_d_, this->getDt(), this->getNumTimesteps(), optimization_stride,
        this->getLambda(), this->getAlpha(), this->initial_state_d_, this->output_d_, this->control_d_,
        this->control_noise_d_, this->control_std_dev_d_, this->trajectory_costs_d_, this->stream_, false);

    // Copy the costs back to the host
    HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_.data(), this->trajectory_costs_d_,
                                 NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaStreamSynchronize(this->stream_));

    primitives_baseline = mppi_common::computeBaselineCost(this->trajectory_costs_.data(), NUM_ROLLOUTS);

    // get previous control cost (at index = 1, since index = 0 is zero control traj)
    baseline_prev = this->trajectory_costs_.data()[prev_controls_idx];

    if (this->debug_)
    {
      std::cerr << "Previous Baseline: " << baseline_prev << " Baseline: " << this->getBaselineCost() << std::endl;
    }

    // if baseline is too high and trajectory is unsafe, create and issue a stopping trajectory
    // reminder: baseline_ is the average cost along trajectory
    if (getStoppingCostThreshold() > 0 && primitives_baseline > getStoppingCostThreshold())
    {
      std::cerr << "Baseline is too high, issuing stopping trajectory!" << std::endl;
      computeStoppingTrajectory(local_state);
      // Mark the stopping plan as unbeatable so it is kept for this iteration.
      primitives_baseline = std::numeric_limits::min();
    }
    // else if (primitives_baseline > baseline_prev - getHysteresisCostThreshold())
    // {
    //   // baseline is not decreasing enough, use controls from the previous iteration
    //   if (this->debug_)
    //   {
    //     std::cout << "Not enough improvement, use prev controls." << std::endl;
    //   }
    //   HANDLE_ERROR(cudaMemcpyAsync(
    //       this->control_.data(),
    //       this->control_noise_d_ + prev_controls_idx * this->getNumTimesteps() * DYN_T::CONTROL_DIM,
    //       sizeof(float) * this->getNumTimesteps() * DYN_T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_));
    //   primitives_baseline = baseline_prev;
    // }
    else
    {  // otherwise, update the nominal control
      // Copy best control from device to the host
      best_idx = mppi_common::computeBestIndex(this->trajectory_costs_.data(), NUM_ROLLOUTS);
      HANDLE_ERROR(cudaMemcpyAsync(
          this->control_.data(), this->control_noise_d_ + best_idx * this->getNumTimesteps() * DYN_T::CONTROL_DIM,
          sizeof(float) * this->getNumTimesteps() * DYN_T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_));
    }
    this->copyNominalControlToDevice(false);
    cudaStreamSynchronize(this->stream_);
  }

  // Copy back sampled trajectories for visualization
  if (getVisualizePrimitives())
  {
    this->copySampledControlFromDevice(false);
    this->copyTopControlFromDevice(true);
  }
  // END INTERMEDIATE PLANNER
  ////////////////

  ////////////////
  // BEGIN MPPI
  for (int opt_iter = 0; opt_iter < this->getNumIters(); opt_iter++)
  {
    // Send the nominal control to the device
    copyMPPIControlToDevice(false);

    // Generate noise data
    // const int colored_num_timesteps = (this->getNumTimesteps() > optimization_stride) ?
    //                                       this->getNumTimesteps() - optimization_stride :
    //                                       this->getNumTimesteps();
    // const int colored_stride = (this->getNumTimesteps() > optimization_stride) ? optimization_stride : 0;
    // if (colored_stride == 0)
    // {
    //   std::cout << "We tripped the fail-safe" << std::endl;
    // }
    powerlaw_psd_gaussian(getColoredNoiseExponentsLValue(), this->getNumTimesteps(), NUM_ROLLOUTS,
                          this->control_noise_d_, 0, this->gen_, this->getOffsetDecayRate(), this->stream_);
    // curandGenerateNormal(this->gen_, this->control_noise_d_, NUM_ROLLOUTS * this->getNumTimesteps() *
    // DYN_T::CONTROL_DIM,
    //                      0.0, 1.0);
    /*
    std::vector noise = this->getSampledNoise();
    float mean = 0;
    for(int k = 0; k < noise.size(); k++) {
      mean += (noise[k]/noise.size());
    }
    float std_dev = 0;
    for(int k = 0; k < noise.size(); k++) {
      std_dev += powf(noise[k] - mean, 2);
    }
    std_dev = sqrt(std_dev/noise.size());
    printf("CPU 1 side N(%f, %f)\n", mean, std_dev);
    */

    // Launch the rollout kernel
    mppi_common::launchFastRolloutKernel(
        this->model_->model_d_, this->cost_->cost_d_, this->getDt(), this->getNumTimesteps(), optimization_stride,
        this->getLambda(), this->getAlpha(), this->initial_state_d_, this->output_d_, control_mppi_d_,
        this->control_noise_d_, this->control_std_dev_d_, this->trajectory_costs_d_, this->stream_, false);
    /*
    noise = this->getSampledNoise();
    mean = 0;
    for(int k = 0; k < noise.size(); k++) {
      mean += (noise[k]/noise.size());
    }
    std_dev = 0;
    for(int k = 0; k < noise.size(); k++) {
      std_dev += powf(noise[k] - mean, 2);
    }
    std_dev = sqrt(std_dev/noise.size());
    printf("CPU 2 side N(%f, %f)\n", mean, std_dev);
    */

    // Copy the costs back to the host
    HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_.data(), this->trajectory_costs_d_,
                                 NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaStreamSynchronize(this->stream_));

    this->setBaseline(mppi_common::computeBaselineCost(this->trajectory_costs_.data(), NUM_ROLLOUTS));
    // if (this->getBaselineCost() > baseline_prev + 1)
    // {
    //   // TODO handle printing
    //   if (this->debug_)
    //   {
    //     std::cout << "Previous Baseline: " << baseline_prev << std::endl;
    //     std::cout << "         Baseline: " << this->getBaselineCost() << std::endl;
    //   }
    // }
    // baseline_prev = this->getBaselineCost();

    // Launch the norm exponential kernel; Tsallis weighting when gamma/r are set.
    if (getGamma() == 0 || getRExp() == 0)
    {
      mppi_common::launchNormExpKernel(NUM_ROLLOUTS, BDIM_X, this->trajectory_costs_d_, 1.0 / this->getLambda(),
                                       this->getBaselineCost(), this->stream_, false);
    }
    else
    {
      mppi_common::launchTsallisKernel(NUM_ROLLOUTS, BDIM_X, this->trajectory_costs_d_, getGamma(), getRExp(),
                                       this->getBaselineCost(), this->stream_, false);
    }
    HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_.data(), this->trajectory_costs_d_,
                                 NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaStreamSynchronize(this->stream_));

    // Compute the normalizer
    this->setNormalizer(mppi_common::computeNormalizer(this->trajectory_costs_.data(), NUM_ROLLOUTS));

    mppi_common::computeFreeEnergy(this->free_energy_statistics_.real_sys.freeEnergyMean,
                                   this->free_energy_statistics_.real_sys.freeEnergyVariance,
                                   this->free_energy_statistics_.real_sys.freeEnergyModifiedVariance,
                                   this->trajectory_costs_.data(), NUM_ROLLOUTS, this->getBaselineCost(),
                                   this->getLambda());

    // Compute the cost weighted average
    // TODO SUM_STRIDE is BDIM_X, but should it be its own parameter?
    mppi_common::launchWeightedReductionKernel(
        this->trajectory_costs_d_, this->control_noise_d_, control_mppi_d_, this->getNormalizerCost(),
        this->getNumTimesteps(), this->stream_, false);
    /*
    noise = this->getSampledNoise();
    mean = 0;
    for(int k = 0; k < noise.size(); k++) {
      mean += (noise[k]/noise.size());
    }
    std_dev = 0;
    for(int k = 0; k < noise.size(); k++) {
      std_dev += powf(noise[k] - mean, 2);
    }
    std_dev = sqrt(std_dev/noise.size());
    printf("CPU 3 side N(%f, %f)\n", mean, std_dev);
    */

    // Transfer the new control to the host
    HANDLE_ERROR(cudaMemcpyAsync(control_mppi_.data(), control_mppi_d_,
                                 sizeof(float) * this->getNumTimesteps() * DYN_T::CONTROL_DIM, cudaMemcpyDeviceToHost,
                                 this->stream_));
    cudaStreamSynchronize(this->stream_);
  }
  this->free_energy_statistics_.real_sys.normalizerPercent = this->getNormalizerCost() / NUM_ROLLOUTS;
  this->free_energy_statistics_.real_sys.increase =
      this->getBaselineCost() - this->free_energy_statistics_.real_sys.previousBaseline;
  // END MPPI
  ////////////////////////

  // decide between using the MPPI control or the primitives control
  if (this->debug_)
  {
    std::cerr << "mppi baseline: " << this->getBaselineCost() << ", primitives baseline: " << primitives_baseline
              << ", prev baseline: " << baseline_prev << std::endl;
  }
  if ((getNumPrimitiveIterations() == 0 && this->getNumIters() > 0) ||
      ((getNumPrimitiveIterations() > 0 && this->getNumIters() > 0) &&
       (this->getBaselineCost() < primitives_baseline + getHysteresisCostThreshold())))
  {
    this->control_ = control_mppi_;
    this->copyNominalControlToDevice();
    if (this->debug_)
    {
      std::cout << "Using MPPI control" << std::endl;
    }
    this->free_energy_statistics_.nominal_state_used = 0;
  }
  else
  {
    // control_mppi_ = this->control_;  // don't do this, we want to save the MPPI control
    if (this->debug_)
    {
      std::cout << "Using primitives control, ";
    }
    // Classify which primitive family won based on best_idx's position in the
    // rollout layout (colored-on-nominal, piecewise-on-nominal, fresh piecewise).
    if (best_idx > 0 && best_idx <= int((getFracRandomNoiseTrajLValue())[0] * NUM_ROLLOUTS))
    {
      if (this->debug_)
      {
        std::cout << "colored noise added to nominal." << std::endl;
      }
      this->free_energy_statistics_.nominal_state_used = 1;
    }
    else if (best_idx <=
             int((getFracRandomNoiseTrajLValue()[0] + getFracRandomNoiseTrajLValue()[1]) * NUM_ROLLOUTS))
    {
      if (this->debug_)
      {
        std::cout << "piecewise noise added to nominal." << std::endl;
      }
      this->free_energy_statistics_.nominal_state_used = 2;
    }
    else
    {
      if (this->debug_)
      {
        std::cout << "new piecewise trajectory." << std::endl;
      }
      this->free_energy_statistics_.nominal_state_used = 3;
    }
  }

  smoothControlTrajectory();
  computeStateTrajectory(local_state);
  state_array zero_state = state_array::Zero();

  // Clamp both the chosen and the MPPI control sequences to the model limits.
  for (int i = 0; i < this->getNumTimesteps(); i++)
  {
    this->model_->enforceConstraints(zero_state, this->control_.col(i));
    this->model_->enforceConstraints(zero_state, control_mppi_.col(i));
    // this->control_.col(i)[1] =
    //     fminf(fmaxf(this->control_.col(i)[1], this->model_->control_rngs_[1].x), this->model_->control_rngs_[1].y);
  }

  // Copy back sampled trajectories for visualization
  if (!getVisualizePrimitives())
  {
    this->copySampledControlFromDevice(false);
    this->copyTopControlFromDevice(true);
  }
}

template
void Primitives::allocateCUDAMemory()
{
  PARENT_CLASS::allocateCUDAMemoryHelper();
  // Extra device buffer for the MPPI-refined control sequence.
  HANDLE_ERROR(cudaMalloc((void**)&control_mppi_d_, sizeof(float) * DYN_T::CONTROL_DIM * MAX_TIMESTEPS));
}

template
void Primitives::copyMPPIControlToDevice(bool synchronize)
{
  HANDLE_ERROR(cudaMemcpyAsync(control_mppi_d_, control_mppi_.data(), sizeof(float) * control_mppi_.size(),
                               cudaMemcpyHostToDevice, this->stream_));
  if (synchronize)
  {
    HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
  }
}

template
void Primitives::computeStateTrajectory(const Eigen::Ref& x0)
{
  this->computeStateTrajectoryHelper(this->state_, x0, this->control_);
}

// Fills control_ with the model's stopping control, stepping the dynamics on the
// CPU from x0 so each timestep's control matches the predicted state.
template
void Primitives::computeStoppingTrajectory(const Eigen::Ref& x0)
{
  state_array xdot;
  state_array state = x0;
  state_array xnext;
  output_array output;
  control_array u_i = control_array::Zero();
  this->model_->initializeDynamics(state, u_i, output, 0, this->getDt());
  for (int i = 0; i < this->getNumTimesteps() - 1; ++i)
  {
    this->model_->getStoppingControl(state, u_i);
    this->model_->enforceConstraints(state, u_i);
    this->control_.col(i) = u_i;
    this->model_->step(state, xnext, xdot, u_i, output, i, this->getDt());
    state = xnext;
  }
}

template
void Primitives::slideControlSequence(int steps)
{
  // TODO does the logic of handling control history reasonable?
  leash_jump_ = steps;
  // Save the control history
  this->saveControlHistoryHelper(steps, this->control_, this->control_history_);
  this->saveControlHistoryHelper(steps, control_mppi_, control_mppi_history_);

  this->slideControlSequenceHelper(steps, this->control_);
  this->slideControlSequenceHelper(steps, control_mppi_);
}

template
void Primitives::smoothControlTrajectory()
{
  this->smoothControlTrajectoryHelper(this->control_, this->control_history_);
  this->smoothControlTrajectoryHelper(control_mppi_, control_mppi_history_);
}

template
void Primitives::calculateSampledStateTrajectories()
{
  int num_sampled_trajectories = this->getTotalSampledTrajectories();

  // controls already copied in compute control
  mppi_common::launchStateAndCostTrajectoryKernel(
      this->model_->model_d_, this->cost_->cost_d_, this->fb_controller_->getDevicePointer(), this->sampled_noise_d_,
      this->initial_state_d_, this->sampled_outputs_d_, this->sampled_costs_d_, this->sampled_crash_status_d_,
      num_sampled_trajectories, this->getNumTimesteps(), this->getDt(), this->vis_stream_);

  for (int i = 0; i < num_sampled_trajectories; i++)
  {
    // set initial state to the first location
    // shifted by one since we do not save the initial state
    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_trajectories_[i].data(),
                                 this->sampled_outputs_d_ + i * this->getNumTimesteps() * DYN_T::OUTPUT_DIM,
                                 (this->getNumTimesteps() - 1) * DYN_T::OUTPUT_DIM * sizeof(float),
                                 cudaMemcpyDeviceToHost, this->vis_stream_));
    HANDLE_ERROR(
        cudaMemcpyAsync(this->sampled_costs_[i].data(), this->sampled_costs_d_ + (i * (this->getNumTimesteps() + 1)),
                        (this->getNumTimesteps() +
// (continuation of Primitives::calculateSampledStateTrajectories(): finish the
// per-trajectory cost copy, then copy crash-status flags and synchronize.)
1) * sizeof(float), cudaMemcpyDeviceToHost, this->vis_stream_));
    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_crash_status_[i].data(),
                                 this->sampled_crash_status_d_ + (i * this->getNumTimesteps()),
                                 this->getNumTimesteps() * sizeof(float), cudaMemcpyDeviceToHost, this->vis_stream_));
  }
  HANDLE_ERROR(cudaStreamSynchronize(this->vis_stream_));
}
#undef Primitives

================================================
FILE: include/mppi/controllers/Primitives/primitives_controller.cuh
================================================
/**
 * Created by david fan on 04/11/22.
 * Creates the API for interfacing with an MPPI controller
 * should define a compute_control based on state as well
 * as return timing info
 **/

#ifndef MPPIGENERIC_PRIMITIVES_CONTROLLER_CUH
#define MPPIGENERIC_PRIMITIVES_CONTROLLER_CUH

// NOTE(review): the #include targets below were lost when '<...>' spans were
// stripped during extraction; restore from the original repository.
#include
// this is needed to extend the coloredMPPI params and ensure that
// dynamic reconfigure in mppi_generic_racer_plant.cpp works properly
#include
#include

// Parameter struct for PrimitivesController; extends the ColoredMPPI params
// with piecewise-primitive sampling knobs. (The template parameter list and
// the std::vector element types were stripped by extraction.)
template
struct PrimitivesParams : public ColoredMPPIParams
{
  int num_primitive_iters_;
  int num_piecewise_segments_ = 5;        // segments per piecewise-linear primitive
  std::vector scale_piecewise_noise_;     // noise scale for piecewise primitives
  std::vector frac_add_nominal_traj_;     // rollout fractions per sampling family
  std::vector scale_add_nominal_noise_;   // noise scale when perturbing the nominal
  float stopping_cost_threshold_ = 1.0e8;
  float hysteresis_cost_threshold_ = 0.0;
  bool visualize_primitives_ = false;

  PrimitivesParams() = default;

  // Copy constructor: assigns the base subobject, then copies the vector
  // members explicitly.
  // NOTE(review): the scalar members declared above (num_primitive_iters_,
  // num_piecewise_segments_, the thresholds, visualize_primitives_) are NOT
  // copied here and keep their defaults -- possible bug, confirm upstream.
  PrimitivesParams(const PrimitivesParams& other)
  {
    typedef ColoredMPPIParams BASE;
    const BASE& other_item_ref = other;
    *(static_cast(this)) = other_item_ref;
    this->scale_piecewise_noise_ = other.scale_piecewise_noise_;
    this->frac_add_nominal_traj_ = other.frac_add_nominal_traj_;
    this->scale_add_nominal_noise_ = other.scale_add_nominal_noise_;
  }

  // Non-const overload of the copy constructor; same behavior (and the same
  // missing-scalar-copy caveat) as above.
  PrimitivesParams(PrimitivesParams& other)
  {
    typedef ColoredMPPIParams BASE;
    BASE& other_item_ref = other;
    *(static_cast(this)) = other_item_ref;
    this->scale_piecewise_noise_ = other.scale_piecewise_noise_;
    this->frac_add_nominal_traj_ = other.frac_add_nominal_traj_;
    this->scale_add_nominal_noise_ = other.scale_add_nominal_noise_;
  }
};

// Controller that searches over motion-primitive control trajectories
// alongside a colored-noise MPPI trajectory; maintains both this->control_
// (primitive plan) and control_mppi_ (MPPI plan).
template >
class PrimitivesController : public Controller
{
public:
  // EIGEN_MAKE_ALIGNED_OPERATOR_NEW

  // need control_array = ... so that we can initialize
  // Eigen::Matrix with Eigen::Matrix::Zero();
  typedef Controller PARENT_CLASS;
  using control_array = typename PARENT_CLASS::control_array;
  using control_trajectory = typename PARENT_CLASS::control_trajectory;
  using state_trajectory = typename PARENT_CLASS::state_trajectory;
  using state_array = typename PARENT_CLASS::state_array;
  using output_array = typename PARENT_CLASS::output_array;
  using sampled_cost_traj = typename PARENT_CLASS::sampled_cost_traj;
  using FEEDBACK_GPU = typename PARENT_CLASS::TEMPLATED_FEEDBACK_GPU;

  /**
   *
   * Public member functions
   */
  // Constructor
  PrimitivesController(DYN_T* model, COST_T* cost, FB_T* fb_controller, float dt, int max_iter, float lambda,
                       float alpha, const Eigen::Ref& control_std_dev, int num_timesteps = MAX_TIMESTEPS,
                       const Eigen::Ref& init_control_traj = control_trajectory::Zero(),
                       cudaStream_t stream = nullptr);

  PrimitivesController(DYN_T* model, COST_T* cost, FB_T* fb_controller, PARAMS_T& params, cudaStream_t stream = nullptr);

  // Destructor
  ~PrimitivesController();

  std::string getControllerName()
  {
    return "Primitives";
  };

  /**
   * computes a new control sequence
   * @param state starting position
   */
  void computeControl(const Eigen::Ref& state, int optimization_stride = 1) override;

  /**
   * Slide the control sequence back n steps
   */
  void slideControlSequence(int optimization_stride) override;

  void setPercentageSampledControlTrajectories(float new_perc)
  {
    this->setPercentageSampledControlTrajectoriesHelper(new_perc, 1);
  }

  void setNumPrimitiveIterations(int new_num_iter)
  {
    this->params_.num_primitive_iters_ = new_num_iter;
  }
  int getNumPrimitiveIterations()
  {
    return this->params_.num_primitive_iters_;
  }

  void setGamma(float gamma)
  {
    this->params_.gamma = gamma;
  }
  float getGamma()
  {
    return this->params_.gamma;
  }

  void setRExp(float r)
  {
    this->params_.r = r;
  }
  float getRExp()
  {
    return this->params_.r;
  }

  void setOffsetDecayRate(float offset_decay_rate)
  {
    this->params_.offset_decay_rate = offset_decay_rate;
  }
  float getOffsetDecayRate()
  {
    return this->params_.offset_decay_rate;
  }

  void setColoredNoiseExponents(std::vector& new_exponents)
  {
    this->params_.colored_noise_exponents_ = new_exponents;
  }
  float getColoredNoiseExponent(int index) const
  {
    return this->params_.colored_noise_exponents_[index];
  }

  void setPiecewiseSegments(int segments)
  {
    this->params_.num_piecewise_segments_ = segments;
  }
  int getPiecewiseSegments()
  {
    return this->params_.num_piecewise_segments_;
  }

  void setScalePiecewiseNoise(std::vector& new_scale)
  {
    this->params_.scale_piecewise_noise_ = new_scale;
  }
  std::vector getScalePiecewiseNoise()
  {
    return this->params_.scale_piecewise_noise_;
  }

  void setFracRandomNoiseTraj(std::vector frac_add_nominal_traj)
  {
    this->params_.frac_add_nominal_traj_ = frac_add_nominal_traj;
  }
  std::vector getFracRandomNoiseTraj()
  {
    return this->params_.frac_add_nominal_traj_;
  }

  void setScaleAddNominalNoise(std::vector scale_add_nominal_noise)
  {
    this->params_.scale_add_nominal_noise_ = scale_add_nominal_noise;
  }
  std::vector getScaleAddNominalNoise()
  {
    return this->params_.scale_add_nominal_noise_;
  }

  void setStateLeashLength(float new_state_leash, int index = 0)
  {
    this->params_.state_leash_dist_[index] = new_state_leash;
  }
  float getStateLeashLength(int index)
  {
    return this->params_.state_leash_dist_[index];
  }

  bool getLeashActive()
  {
    return leash_active_;
  }
  void setLeashActive(bool new_leash_active)
  {
    leash_active_ = new_leash_active;
  }

  void setStoppingCostThreshold(float new_stopping_cost_threshold)
  {
    this->params_.stopping_cost_threshold_ = new_stopping_cost_threshold;
  }
  float getStoppingCostThreshold()
  {
    return this->params_.stopping_cost_threshold_;
  }

  void setHysteresisCostThreshold(float new_hysteresis_cost_threshold)
  {
    this->params_.hysteresis_cost_threshold_ = new_hysteresis_cost_threshold;
  }
  float getHysteresisCostThreshold()
  {
    return this->params_.hysteresis_cost_threshold_;
  }

  void setVisualizePrimitives(bool visualize_primitives)
  {
    this->params_.visualize_primitives_ = visualize_primitives;
  }
  bool getVisualizePrimitives()
  {
    return this->params_.visualize_primitives_;
  }

  void calculateSampledStateTrajectories() override;

protected:
  // Mutable (l-value) accessors used internally by the implementation file.
  std::vector& getScaleAddNominalNoiseLValue()
  {
    return this->params_.scale_add_nominal_noise_;
  }
  std::vector& getScalePiecewiseNoiseLValue()
  {
    return this->params_.scale_piecewise_noise_;
  }
  std::vector& getFracRandomNoiseTrajLValue()
  {
    return this->params_.frac_add_nominal_traj_;
  }
  std::vector& getColoredNoiseExponentsLValue()
  {
    return this->params_.colored_noise_exponents_;
  }

  void computeStateTrajectory(const Eigen::Ref& x0);

  void copyMPPIControlToDevice(bool synchronize = true);

  void computeStoppingTrajectory(const Eigen::Ref& x0);

  void smoothControlTrajectory();

  int leash_jump_ = 1;
  bool leash_active_ = false;

  float* control_mppi_d_;  // Array of size DYN_T::CONTROL_DIM*NUM_TIMESTEPS
  control_trajectory control_mppi_ = control_trajectory::Zero();  // host side mppi control trajectory
  Eigen::Matrix control_mppi_history_;

private:
  // ======== MUST BE OVERWRITTEN =========
  void allocateCUDAMemory();
  // ======== END MUST BE OVERWRITTEN =====
};

#if __CUDACC__
#include "primitives_controller.cu"
#endif

#endif  // MPPIGENERIC_PRIMITIVES_CONTROLLER_CUH

================================================
FILE: include/mppi/controllers/R-MPPI/robust_mppi_controller.cu
================================================
#include "robust_mppi_controller.cuh"
// NOTE(review): the two #include targets below were stripped during extraction.
#include
#include

// NOTE(review): the template parameter lists of both macros were stripped.
#define ROBUST_MPPI_TEMPLATE \
  template

#define RobustMPPI RobustMPPIController

// Constructor: configures robust-MPPI state (value-function threshold,
// optimization stride, candidate nominal states) on top of the base
// controller; body continues on the next chunk line.
ROBUST_MPPI_TEMPLATE
RobustMPPI::RobustMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, float dt,
                                 int max_iter, float lambda, float alpha, float value_function_threshold,
                                 int num_timesteps, const Eigen::Ref& init_control_traj,
                                 int num_candidate_nominal_states, int
// (continuation of the argument-list constructor of RobustMPPIController)
optimization_stride, cudaStream_t stream)
  : PARENT_CLASS(model, cost, fb_controller, sampler, dt, max_iter, lambda, alpha, num_timesteps, init_control_traj,
                 stream)
{
  setValueFunctionThreshold(value_function_threshold);
  setOptimizationStride(optimization_stride);
  setNumCandidates(num_candidate_nominal_states);
  updateNumCandidates(getNumCandidates());
  setParams(this->params_);
  // Two sampling distributions: real and nominal systems.
  this->sampler_->setNumDistributions(2);

  // Zero the control history
  this->control_history_ = Eigen::Matrix::Zero();
  nominal_control_history_ = Eigen::Matrix::Zero();

  // Allocate CUDA memory for the controller
  allocateCUDAMemory();

  // Copy the noise std_dev to the device
  // this->copyControlStdDevToDevice();

  // Initialize Feedback
  this->fb_controller_->initTrackingController();

  // Initialize the nominal control trajectory
  nominal_control_trajectory_ = init_control_traj;
  this->enable_feedback_ = true;
  chooseAppropriateKernel();
}

// Params-struct constructor: same setup as above, but configuration comes
// from a PARAMS_T object instead of individual arguments.
ROBUST_MPPI_TEMPLATE
RobustMPPI::RobustMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, PARAMS_T& params,
                                 cudaStream_t stream)
  : PARENT_CLASS(model, cost, fb_controller, sampler, params, stream)
{
  updateNumCandidates(getNumCandidates());
  setParams(params);
  this->sampler_->setNumDistributions(2);

  // Zero the control history
  this->control_history_ = Eigen::Matrix::Zero();
  nominal_control_history_ = Eigen::Matrix::Zero();

  // Allocate CUDA memory for the controller
  allocateCUDAMemory();

  // Copy the noise std_dev to the device
  // this->copyControlStdDevToDevice();

  // Initialize Feedback
  this->fb_controller_->initTrackingController();

  // Initialize the nominal control trajectory
  nominal_control_trajectory_ = this->params_.init_control_traj_;
  this->enable_feedback_ = true;
  chooseAppropriateKernel();
}

ROBUST_MPPI_TEMPLATE
RobustMPPI::~RobustMPPIController()
{
  deallocateNominalStateCandidateMemory();
}

// Allocates base-class device buffers; the argument 1 presumably requests
// storage for one additional (nominal) system -- TODO confirm against
// allocateCUDAMemoryHelper.
ROBUST_MPPI_TEMPLATE
void RobustMPPI::allocateCUDAMemory()
{
  PARENT_CLASS::allocateCUDAMemoryHelper(1);
}

// Picks between the combined and split rollout kernels: variants that exceed
// the device's shared-memory-per-block limit are excluded (throwing if none
// fit); otherwise both are timed over getNumKernelEvaluations() launches and
// the faster one is selected.
ROBUST_MPPI_TEMPLATE
void RobustMPPI::chooseAppropriateKernel()
{
  // Get properties of current GPU
  cudaDeviceProp deviceProp;
  HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, 0));

  unsigned single_rollout_kernel_byte_size = mppi::kernels::rmppi::calcRMPPICombinedKernelSharedMemSize(
      this->model_, this->cost_, this->sampler_, this->fb_controller_->getHostPointer().get(),
      this->params_.dynamics_rollout_dim_);
  unsigned rollout_dyn_kernel_byte_size = mppi::kernels::rmppi::calcRMPPIDynKernelSharedMemSize(
      this->model_, this->sampler_, this->fb_controller_->getHostPointer().get(), this->params_.dynamics_rollout_dim_);
  unsigned rollout_cost_kernel_byte_size = mppi::kernels::rmppi::calcRMPPICostKernelSharedMemSize(
      this->cost_, this->sampler_, this->fb_controller_->getHostPointer().get(), this->params_.cost_rollout_dim_);

  // Limit kernel choice to those that fit within shared memory constraints
  bool rollout_dyn_too_large = rollout_dyn_kernel_byte_size > deviceProp.sharedMemPerBlock;
  bool rollout_cost_too_large = rollout_cost_kernel_byte_size > deviceProp.sharedMemPerBlock;
  bool rollout_combined_too_large = single_rollout_kernel_byte_size > deviceProp.sharedMemPerBlock;
  bool rollout_set = false;
  if (rollout_combined_too_large && (rollout_dyn_too_large || rollout_cost_too_large))
  {
    std::string error_msg =
        "There is not enough shared memory on the GPU for either rollout kernel option. The combined rollout kernel "
        "takes " +
        std::to_string(single_rollout_kernel_byte_size) + " bytes, the cost rollout kernel takes " +
        std::to_string(rollout_cost_kernel_byte_size) + " bytes, the dynamics rollout kernel takes " +
        std::to_string(rollout_dyn_kernel_byte_size) + " bytes, and the max is " +
        std::to_string(deviceProp.sharedMemPerBlock) +
        " bytes. Considering lowering the corresponding thread block sizes.";
    throw std::runtime_error(error_msg);
  }
  else if (rollout_dyn_too_large || rollout_cost_too_large)
  {
    this->setKernelChoice(kernelType::USE_SINGLE_KERNEL);
    rollout_set = true;
  }
  else if (rollout_combined_too_large)
  {
    this->setKernelChoice(kernelType::USE_SPLIT_KERNELS);
    rollout_set = true;
  }

  /**
   * Set up for kernel comparison
   */
  state_array zero_state = this->model_->getZeroState();
  float* initial_state_nominal_d = this->initial_state_d_;
  float* initial_state_real_d = this->initial_state_d_ + DYN_T::STATE_DIM;
  // Transfer the initial state to the GPU
  HANDLE_ERROR(cudaMemcpyAsync(initial_state_real_d, zero_state.data(), sizeof(float) * DYN_T::STATE_DIM,
                               cudaMemcpyHostToDevice, this->stream_));
  HANDLE_ERROR(cudaMemcpyAsync(initial_state_nominal_d, zero_state.data(), sizeof(float) * DYN_T::STATE_DIM,
                               cudaMemcpyHostToDevice, this->stream_));
  // Copy the importance sampling control to the system
  this->sampler_->copyImportanceSamplerToDevice(nominal_control_trajectory_.data(), 0, false);
  this->sampler_->copyImportanceSamplerToDevice(nominal_control_trajectory_.data(), 1, false);
  // Send feedback gains to the GPU
  this->fb_controller_->copyToDevice(false);
  // Generate the control perturbations for exploration
  this->sampler_->generateSamples(1, 0, this->gen_, true);

  float single_rollout_kernel_time_ms = std::numeric_limits::infinity();
  float split_rollout_kernel_time_ms = std::numeric_limits::infinity();

  // Time the combined kernel (skipped when the choice is already forced).
  auto start_rollout_single_kernel_time = std::chrono::steady_clock::now();
  for (int i = 0; i < this->getNumKernelEvaluations() && !rollout_set; i++)
  {
    mppi::kernels::rmppi::launchRMPPIRolloutKernel(
        this->model_, this->cost_, this->sampler_, this->fb_controller_->getHostPointer().get(), this->getDt(),
        this->getNumTimesteps(), NUM_ROLLOUTS, this->getLambda(), this->getAlpha(), getValueFunctionThreshold(),
        this->initial_state_d_, this->trajectory_costs_d_, this->params_.dynamics_rollout_dim_, this->stream_, true);
  }
  auto end_rollout_single_kernel_time = std::chrono::steady_clock::now();

  // Time the split dynamics/cost kernel pair.
  auto start_rollout_split_kernel_time = std::chrono::steady_clock::now();
  for (int i = 0; i < this->getNumKernelEvaluations() && !rollout_set; i++)
  {
    mppi::kernels::rmppi::launchSplitRMPPIRolloutKernel(
        this->model_, this->cost_, this->sampler_, this->fb_controller_->getHostPointer().get(), this->getDt(),
        this->getNumTimesteps(), NUM_ROLLOUTS, this->getLambda(), this->getAlpha(), getValueFunctionThreshold(),
        this->initial_state_d_, this->output_d_, this->trajectory_costs_d_, this->params_.dynamics_rollout_dim_,
        this->params_.cost_rollout_dim_, this->stream_, true);
  }
  auto end_rollout_split_kernel_time = std::chrono::steady_clock::now();

  if (!rollout_set)
  {
    single_rollout_kernel_time_ms =
        mppi::math::timeDiffms(end_rollout_single_kernel_time, start_rollout_single_kernel_time);
    split_rollout_kernel_time_ms =
        mppi::math::timeDiffms(end_rollout_split_kernel_time, start_rollout_split_kernel_time);
  }
  std::string kernel_choice = "";
  if (split_rollout_kernel_time_ms < single_rollout_kernel_time_ms)
  {
    this->setKernelChoice(kernelType::USE_SPLIT_KERNELS);
    kernel_choice = "split ";
  }
  else
  {
    this->setKernelChoice(kernelType::USE_SINGLE_KERNEL);
    kernel_choice = "single";
  }
  this->logger_->info(
      "Choosing %s rollout kernel based on split taking %f ms and single taking %f ms after %d iterations\n",
      kernel_choice.c_str(), split_rollout_kernel_time_ms, single_rollout_kernel_time_ms,
      this->getNumKernelEvaluations());

  // Do the same for the eval kernel
  chooseAppropriateEvalKernel();
}

// Same selection strategy as chooseAppropriateKernel(), applied to the
// candidate-evaluation (init eval) kernels.
ROBUST_MPPI_TEMPLATE
void RobustMPPI::chooseAppropriateEvalKernel()
{
  // Get properties of current GPU
  cudaDeviceProp deviceProp;
  HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, 0));

  // Get shared mem sizes for various kernels
  unsigned single_eval_kernel_byte_size = mppi::kernels::rmppi::calcEvalCombinedKernelSharedMemSize(
      this->model_, this->cost_, this->sampler_, getNumEvalRollouts(), getNumEvalSamplesPerCandidate(),
// (continuation of chooseAppropriateEvalKernel(): remaining shared-memory
// size queries, fit checks, then timing of split vs. combined eval kernels.)
this->params_.eval_dyn_kernel_dim_);
  unsigned eval_dyn_kernel_byte_size = mppi::kernels::rmppi::calcEvalDynKernelSharedMemSize(
      this->model_, this->sampler_, this->params_.eval_dyn_kernel_dim_);
  unsigned eval_cost_kernel_byte_size = mppi::kernels::rmppi::calcEvalCostKernelSharedMemSize(
      this->cost_, this->sampler_, getNumEvalRollouts(), getNumEvalSamplesPerCandidate(),
      this->params_.eval_cost_kernel_dim_);

  // Limit kernel choice to those that fit within shared memory constraints
  bool eval_dyn_too_large = eval_dyn_kernel_byte_size > deviceProp.sharedMemPerBlock;
  bool eval_cost_too_large = eval_cost_kernel_byte_size > deviceProp.sharedMemPerBlock;
  bool eval_combined_too_large = single_eval_kernel_byte_size > deviceProp.sharedMemPerBlock;
  bool eval_set = false;
  if (eval_combined_too_large && (eval_dyn_too_large || eval_cost_too_large))
  {
    std::string error_msg =
        "There is not enough shared memory on the GPU for either eval kernel option. The combined eval kernel "
        "takes " +
        std::to_string(single_eval_kernel_byte_size) + " bytes, the cost eval kernel takes " +
        std::to_string(eval_cost_kernel_byte_size) + " bytes, the dynamics eval kernel takes " +
        std::to_string(eval_dyn_kernel_byte_size) + " bytes, and the max is " +
        std::to_string(deviceProp.sharedMemPerBlock) +
        " bytes. Considering lowering the corresponding thread block sizes.";
    throw std::runtime_error(error_msg);
  }
  else if (eval_dyn_too_large || eval_cost_too_large)
  {
    this->setEvalKernelChoice(kernelType::USE_SINGLE_KERNEL);
    eval_set = true;
  }
  else if (eval_combined_too_large)
  {
    this->setEvalKernelChoice(kernelType::USE_SPLIT_KERNELS);
    eval_set = true;
  }

  // Send the nominal state candidates to the GPU
  HANDLE_ERROR(cudaMemcpyAsync(importance_sampling_states_d_, candidate_nominal_states_.data(),
                               sizeof(float) * DYN_T::STATE_DIM * getNumCandidates(), cudaMemcpyHostToDevice,
                               this->stream_));
  Eigen::MatrixXi temp_importance_sampler_strides = Eigen::MatrixXi::Ones(getNumCandidates(), 1);
  // Send the importance sampler strides to the GPU
  HANDLE_ERROR(cudaMemcpyAsync(importance_sampling_strides_d_, temp_importance_sampler_strides.data(),
                               sizeof(int) * getNumCandidates(), cudaMemcpyHostToDevice, this->stream_));
  // Send the nominal control to the GPU
  copyNominalControlToDevice(false);
  // Generate noise for the samples
  this->sampler_->generateSamples(1, 0, this->gen_, true);

  float single_eval_kernel_time_ms = std::numeric_limits::infinity();
  float split_eval_kernel_time_ms = std::numeric_limits::infinity();

  // Time the split eval kernel pair (skipped when the choice is forced).
  auto start_eval_split_kernel_time = std::chrono::steady_clock::now();
  for (int i = 0; i < this->getNumKernelEvaluations() && !eval_set; i++)
  {
    mppi::kernels::rmppi::launchSplitInitEvalKernel(
        this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), getNumEvalRollouts(),
        this->getLambda(), this->getAlpha(), getNumEvalSamplesPerCandidate(), importance_sampling_strides_d_,
        importance_sampling_states_d_, importance_sampling_outputs_d_, importance_sampling_costs_d_,
        this->params_.eval_dyn_kernel_dim_, this->params_.eval_cost_kernel_dim_, this->stream_, true);
  }
  auto end_eval_split_kernel_time = std::chrono::steady_clock::now();

  // Time the combined eval kernel.
  auto start_eval_single_kernel_time = std::chrono::steady_clock::now();
  for (int i = 0; i < this->getNumKernelEvaluations() && !eval_set; i++)
  {
    mppi::kernels::rmppi::launchInitEvalKernel(
        this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), getNumEvalRollouts(),
        this->getLambda(), this->getAlpha(), getNumEvalSamplesPerCandidate(), importance_sampling_strides_d_,
        importance_sampling_states_d_, importance_sampling_costs_d_, this->params_.eval_dyn_kernel_dim_, this->stream_,
        true);
  }
  auto end_eval_single_kernel_time = std::chrono::steady_clock::now();

  if (!eval_set)
  {
    single_eval_kernel_time_ms = mppi::math::timeDiffms(end_eval_single_kernel_time, start_eval_single_kernel_time);
    split_eval_kernel_time_ms = mppi::math::timeDiffms(end_eval_split_kernel_time, start_eval_split_kernel_time);
  }
  std::string kernel_choice = "";
  if (split_eval_kernel_time_ms < single_eval_kernel_time_ms)
  {
    this->setEvalKernelChoice(kernelType::USE_SPLIT_KERNELS);
    kernel_choice = "split ";
  }
  else
  {
    this->setEvalKernelChoice(kernelType::USE_SINGLE_KERNEL);
    kernel_choice = "single";
  }
  this->logger_->info(
      "Choosing %s eval kernel based on split taking %f ms and single taking %f ms after %d iterations\n",
      kernel_choice.c_str(), split_eval_kernel_time_ms, single_eval_kernel_time_ms, this->getNumKernelEvaluations());
}

// Applies new parameters, clamping kernel dimensions to sane minimums, and
// reallocates candidate memory when the eval sample size or the number of
// candidates changed.
ROBUST_MPPI_TEMPLATE
void RobustMPPI::setParams(const PARAMS_T& p)
{
  bool empty_eval_dyn_size = p.eval_dyn_kernel_dim_.x == 0;
  bool changed_sample_size = p.eval_dyn_kernel_dim_.x != this->params_.eval_dyn_kernel_dim_.x;
  bool changed_num_candidates = p.num_candidate_nominal_states_ != this->params_.num_candidate_nominal_states_;
  PARENT_CLASS::setParams(p);
  // Rollout kernels need at least two systems in z (real + nominal).
  this->params_.dynamics_rollout_dim_.z = max(2, p.dynamics_rollout_dim_.z);
  this->params_.cost_rollout_dim_.z = max(2, p.cost_rollout_dim_.z);
  // Set up cost eval kernel dimensions
  if (p.eval_cost_kernel_dim_.x == 0)
  {
    this->params_.eval_cost_kernel_dim_.x = this->getNumTimesteps();
  }
  this->params_.eval_cost_kernel_dim_.y = max(1, p.eval_cost_kernel_dim_.y);
  this->params_.eval_cost_kernel_dim_.z = max(1, p.eval_cost_kernel_dim_.z);
  // Set
up dynamics eval kernel dimensions if (empty_eval_dyn_size) { this->params_.eval_dyn_kernel_dim_.x = 32; changed_sample_size = true; } this->params_.eval_dyn_kernel_dim_.y = max(1, p.eval_dyn_kernel_dim_.y); this->params_.eval_dyn_kernel_dim_.z = max(1, p.eval_dyn_kernel_dim_.z); if (changed_sample_size) { updateCandidateMemory(); } else if (changed_num_candidates) { updateNumCandidates(p.num_candidate_nominal_states_); } } ROBUST_MPPI_TEMPLATE void RobustMPPI::getInitNominalStateCandidates(const Eigen::Ref& nominal_x_k, const Eigen::Ref& nominal_x_kp1, const Eigen::Ref& real_x_kp1) { Eigen::MatrixXf points(DYN_T::STATE_DIM, 3); points << nominal_x_k, nominal_x_kp1, real_x_kp1; auto candidates = points * line_search_weights_; for (int i = 0; i < getNumCandidates(); ++i) { candidate_nominal_states_[i] = candidates.col(i); } } ROBUST_MPPI_TEMPLATE void RobustMPPI::resetCandidateCudaMem() { deallocateNominalStateCandidateMemory(); #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 HANDLE_ERROR( cudaMallocAsync((void**)&importance_sampling_costs_d_, sizeof(float) * getNumEvalRollouts(), this->stream_)); HANDLE_ERROR(cudaMallocAsync((void**)&importance_sampling_outputs_d_, sizeof(float) * getNumEvalRollouts() * this->getNumTimesteps() * DYN_T::OUTPUT_DIM, this->stream_)); HANDLE_ERROR(cudaMallocAsync((void**)&importance_sampling_states_d_, sizeof(float) * DYN_T::STATE_DIM * getNumCandidates(), this->stream_)); HANDLE_ERROR( cudaMallocAsync((void**)&importance_sampling_strides_d_, sizeof(int) * getNumCandidates(), this->stream_)); #else HANDLE_ERROR(cudaMalloc((void**)&importance_sampling_costs_d_, sizeof(float) * getNumEvalRollouts())); HANDLE_ERROR(cudaMalloc((void**)&importance_sampling_outputs_d_, sizeof(float) * getNumEvalRollouts() * this->getNumTimesteps() * DYN_T::OUTPUT_DIM)); HANDLE_ERROR( cudaMalloc((void**)&importance_sampling_states_d_, sizeof(float) * DYN_T::STATE_DIM * getNumCandidates())); 
HANDLE_ERROR(cudaMalloc((void**)&importance_sampling_strides_d_, sizeof(int) * getNumCandidates())); #endif // Set flag so that the we know cudamemory is allocated importance_sampling_cuda_mem_init_ = true; } ROBUST_MPPI_TEMPLATE void RobustMPPI::deallocateNominalStateCandidateMemory() { if (importance_sampling_cuda_mem_init_) { #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 HANDLE_ERROR(cudaFreeAsync(importance_sampling_costs_d_, this->stream_)); HANDLE_ERROR(cudaFreeAsync(importance_sampling_outputs_d_, this->stream_)); HANDLE_ERROR(cudaFreeAsync(importance_sampling_states_d_, this->stream_)); HANDLE_ERROR(cudaFreeAsync(importance_sampling_strides_d_, this->stream_)); #else HANDLE_ERROR(cudaFree(importance_sampling_costs_d_)); HANDLE_ERROR(cudaFree(importance_sampling_outputs_d_)); HANDLE_ERROR(cudaFree(importance_sampling_states_d_)); HANDLE_ERROR(cudaFree(importance_sampling_strides_d_)); #endif importance_sampling_costs_d_ = nullptr; importance_sampling_outputs_d_ = nullptr; importance_sampling_states_d_ = nullptr; importance_sampling_strides_d_ = nullptr; // Set flag so that we know cudamemory has been freed importance_sampling_cuda_mem_init_ = false; } } ROBUST_MPPI_TEMPLATE void RobustMPPI::copyNominalControlToDevice(bool synchronize) { this->sampler_->copyImportanceSamplerToDevice(nominal_control_trajectory_.data(), 0, synchronize); } ROBUST_MPPI_TEMPLATE void RobustMPPI::updateNumCandidates(int new_num_candidates) { if ((new_num_candidates * getNumEvalSamplesPerCandidate()) > NUM_ROLLOUTS) { std::cerr << "ERROR: (number of candidates) * (SAMPLES_PER_CANDIDATE) cannot exceed NUM_ROLLOUTS\n"; std::cerr << "number of candidates: " << new_num_candidates << ", SAMPLES_PER_CANDIDATE: " << getNumEvalSamplesPerCandidate() << ", NUM_ROLLOUTS: " << NUM_ROLLOUTS << "\n"; std::terminate(); } // New number must be odd and greater than 3 if (new_num_candidates < 3) { std::cerr << "ERROR: number of candidates must be greater or equal to 3\n"; std::cerr << "number 
of candidates: " << new_num_candidates << "\n"; std::terminate(); } if (new_num_candidates % 2 == 0) { std::cerr << "ERROR: number of candidates must be odd\n"; std::cerr << "number of candidates: " << new_num_candidates << "\n"; std::terminate(); } // Set the new value of the number of candidates setNumCandidates(new_num_candidates); updateCandidateMemory(); // Recompute the line search weights based on the number of candidates computeLineSearchWeights(); } ROBUST_MPPI_TEMPLATE void RobustMPPI::updateCandidateMemory() { // Resize the vector holding the candidate nominal states candidate_nominal_states_.resize(getNumCandidates()); // Resize the matrix holding the importance sampler strides importance_sampler_strides_.resize(getNumCandidates(), 1); // Resize the trajectory costs matrix candidate_trajectory_costs_.resize(getNumEvalRollouts(), 1); candidate_trajectory_costs_.setZero(); // Resize the free energy costs matrix candidate_free_energy_.resize(getNumCandidates(), 1); candidate_free_energy_.setZero(); // Deallocate and reallocate cuda memory resetCandidateCudaMem(); } ROBUST_MPPI_TEMPLATE void RobustMPPI::computeLineSearchWeights() { line_search_weights_.resize(3, getNumCandidates()); // For a given setup, this never changes.... why recompute every time? 
  // The first half of the candidates interpolate between row 0 (previous
  // nominal state) and row 1 (propagated nominal state); the second half
  // interpolate between row 1 and row 2 (real state).  Each column sums to 1.
  int num_candid_over_2 = getNumCandidates() / 2;
  for (int i = 0; i < num_candid_over_2 + 1; i++)
  {
    line_search_weights_(0, i) = 1 - i / float(num_candid_over_2);
    line_search_weights_(1, i) = i / float(num_candid_over_2);
    line_search_weights_(2, i) = 0.0;
  }
  for (int i = 1; i < num_candid_over_2 + 1; i++)
  {
    line_search_weights_(0, num_candid_over_2 + i) = 0.0;
    line_search_weights_(1, num_candid_over_2 + i) = 1 - i / float(num_candid_over_2);
    line_search_weights_(2, num_candid_over_2 + i) = i / float(num_candid_over_2);
  }
}

// Computes the per-candidate control stride: 0 for the previous nominal
// state, the full stride for the propagated nominal and real states, and the
// rounded weighted blend for the candidates in between.
ROBUST_MPPI_TEMPLATE
void RobustMPPI::computeImportanceSamplerStride(int stride)
{
  Eigen::MatrixXf stride_vec(1, 3);
  stride_vec << 0, stride, stride;
  // Perform matrix multiplication, convert to array so that we can round the floats to the nearest
  // integer. Then cast the resultant float array to an int array. Then set equal to our int matrix.
  // NOTE(review): the template argument of cast() appears to have been lost in
  // extraction — restore the integer cast target from the original file.
  importance_sampler_strides_ = (stride_vec * line_search_weights_).array().round().template cast();
  // importance_sampler_strides_.array() += stride;
}

// Returns the minimum cost over all candidate evaluation rollouts; used as a
// baseline so the exponentials in the free-energy estimate stay bounded.
ROBUST_MPPI_TEMPLATE
float RobustMPPI::computeCandidateBaseline()
{
  float baseline = candidate_trajectory_costs_(0);
  for (int i = 1; i < getNumEvalRollouts(); i++)
  {
    // TODO What is the reasoning behind only using the first condition to get the baseline?
    if (candidate_trajectory_costs_(i) < baseline)
    {
      baseline = candidate_trajectory_costs_(i);
    }
  }
  return baseline;
}

// Estimates the free energy of every candidate from its rollout costs and
// records in best_index_ the highest-indexed candidate whose free energy is
// below the value-function threshold.
ROBUST_MPPI_TEMPLATE
void RobustMPPI::computeBestIndex()
{
  candidate_free_energy_.setZero();
  float baseline = computeCandidateBaseline();
  for (int i = 0; i < getNumCandidates(); i++)
  {
    // Monte-Carlo average of exp(-(cost - baseline)/lambda) over this
    // candidate's rollouts ...
    for (int j = 0; j < getNumEvalSamplesPerCandidate(); j++)
    {
      candidate_free_energy_(i) += expf(
          -1.0 / this->getLambda() *
          (candidate_trajectory_costs_(i * getNumEvalSamplesPerCandidate() + j) - baseline));
    }
    candidate_free_energy_(i) /= (1.0 * getNumEvalSamplesPerCandidate());
    // ... converted back to a free energy via -lambda * log(mean) + baseline.
    candidate_free_energy_(i) = -this->getLambda() * logf(candidate_free_energy_(i)) + baseline;
    if (candidate_free_energy_(i) < getValueFunctionThreshold())
    {
      best_index_ = i;
    }
  }
}

// Updates the importance-sampling (nominal) control before computeControl:
// selects the nominal state/stride, saves both control histories, slides the
// nominal control sequence, recomputes the nominal trajectory, and refreshes
// the feedback gains.
// NOTE(review): the Eigen::Ref template arguments in the signatures below
// appear stripped by extraction; they should reference the state vector type.
ROBUST_MPPI_TEMPLATE
void RobustMPPI::updateImportanceSamplingControl(const Eigen::Ref& state, int stride)
{
  // (Controller Frequency)*(Optimization Time) corresponds to how many timesteps occurred in the last optimization
  real_stride_ = stride;
  computeNominalStateAndStride(state, stride);  // Launches the init eval kernel
  // Save the nominal control history for the importance sampler
  this->saveControlHistoryHelper(nominal_stride_, nominal_control_trajectory_, nominal_control_history_);
  // Save the real control history for the optimal control
  this->saveControlHistoryHelper(real_stride_, this->control_, this->control_history_);
  // Slide the control sequence for the nominal control trajectory
  this->slideControlSequenceHelper(nominal_stride_, nominal_control_trajectory_);
  // Compute the nominal trajectory because we have slid the control sequence and updated the nominal state
  this->computeStateTrajectoryHelper(nominal_state_trajectory_, nominal_state_, nominal_control_trajectory_);
  // Compute the feedback gains and save them to an array
  computeNominalFeedbackGains(state);
}

// Selects the nominal state (and its control stride) for this iteration.  On
// the first call the nominal state is just the measured state; afterwards a
// candidate set is evaluated on the GPU and the best candidate is chosen.
ROBUST_MPPI_TEMPLATE
void RobustMPPI::computeNominalStateAndStride(const Eigen::Ref& state, int stride)
{
  if (!nominal_state_init_)
  {
    nominal_state_ =
        state;
    nominal_state_init_ = true;
    nominal_stride_ = 0;
  }
  else
  {
    // Build the candidate set by interpolating between the previous nominal
    // state, the propagated nominal state, and the current real state.
    getInitNominalStateCandidates(nominal_state_trajectory_.col(0), nominal_state_trajectory_.col(1), state);
    computeImportanceSamplerStride(stride);
    // Send the nominal state candidates to the GPU
    HANDLE_ERROR(cudaMemcpyAsync(importance_sampling_states_d_, candidate_nominal_states_.data(),
                                 sizeof(float) * DYN_T::STATE_DIM * getNumCandidates(), cudaMemcpyHostToDevice,
                                 this->stream_));
    // Send the importance sampler strides to the GPU
    HANDLE_ERROR(cudaMemcpyAsync(importance_sampling_strides_d_, importance_sampler_strides_.data(),
                                 sizeof(int) * getNumCandidates(), cudaMemcpyHostToDevice, this->stream_));
    // Send the nominal control to the GPU
    copyNominalControlToDevice(false);
    // Generate noise for the samples
    this->sampler_->generateSamples(stride, 0, this->gen_, false);
    // Launch the init eval kernel
    if (this->getEvalKernelChoiceAsEnum() == kernelType::USE_SPLIT_KERNELS)
    {
      mppi::kernels::rmppi::launchSplitInitEvalKernel(
          this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), getNumEvalRollouts(),
          this->getLambda(), this->getAlpha(), getNumEvalSamplesPerCandidate(), importance_sampling_strides_d_,
          importance_sampling_states_d_, importance_sampling_outputs_d_, importance_sampling_costs_d_,
          this->params_.eval_dyn_kernel_dim_, this->params_.eval_cost_kernel_dim_, this->stream_, false);
    }
    else if (this->getEvalKernelChoiceAsEnum() == kernelType::USE_SINGLE_KERNEL)
    {
      mppi::kernels::rmppi::launchInitEvalKernel(
          this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), getNumEvalRollouts(),
          this->getLambda(), this->getAlpha(), getNumEvalSamplesPerCandidate(), importance_sampling_strides_d_,
          importance_sampling_states_d_, importance_sampling_costs_d_, this->params_.eval_dyn_kernel_dim_,
          this->stream_, false);
    }
    // Copy the per-rollout candidate costs back to the host.
    HANDLE_ERROR(cudaMemcpyAsync(candidate_trajectory_costs_.data(), importance_sampling_costs_d_,
                                 sizeof(float) * getNumEvalRollouts(),
                                 cudaMemcpyDeviceToHost, this->stream_));
    cudaStreamSynchronize(this->stream_);
    // Compute the best nominal state candidate from the rollouts
    computeBestIndex();  // best_index_ = 8;
    this->free_energy_statistics_.nominal_state_used = best_index_;
    nominal_stride_ = importance_sampler_strides_(best_index_);
    nominal_state_ = candidate_nominal_states_[best_index_];
  }
}

// Computes the feedback gains around the nominal trajectory (used in the
// rollout kernel and for tracking).
ROBUST_MPPI_TEMPLATE
void RobustMPPI::computeNominalFeedbackGains(const Eigen::Ref& state)
{
  this->computeFeedbackHelper(state, nominal_state_trajectory_, nominal_control_trajectory_);
}

// Runs the RMPPI optimization: rolls out the augmented [nominal, real]
// system, exponentiates/normalizes the costs, updates both sampling
// distributions, and reads back the optimal nominal and real controls.
ROBUST_MPPI_TEMPLATE
void RobustMPPI::computeControl(const Eigen::Ref& state, int optimization_stride)
{
  // Handy dandy pointers to nominal data
  float* trajectory_costs_nominal_d = this->trajectory_costs_d_;
  float* trajectory_costs_real_d = this->trajectory_costs_d_ + NUM_ROLLOUTS;
  float* initial_state_nominal_d = this->initial_state_d_;
  float* initial_state_real_d = this->initial_state_d_ + DYN_T::STATE_DIM;

  this->free_energy_statistics_.nominal_sys.previousBaseline = this->getBaselineCost(0);
  this->free_energy_statistics_.real_sys.previousBaseline = this->getBaselineCost(1);

  // Transfer the feedback gains to the GPU
  this->fb_controller_->copyToDevice(false);

  // Transfer the real initial state to the GPU
  HANDLE_ERROR(cudaMemcpyAsync(initial_state_real_d, state.data(), sizeof(float) * DYN_T::STATE_DIM,
                               cudaMemcpyHostToDevice, this->stream_));
  // Transfer the nominal state to the GPU: recall that the device GPU has the augmented state [nominal state, real
  // state]
  HANDLE_ERROR(cudaMemcpyAsync(initial_state_nominal_d, nominal_state_.data(), sizeof(float) * DYN_T::STATE_DIM,
                               cudaMemcpyHostToDevice, this->stream_));

  for (int opt_iter = 0; opt_iter < this->getNumIters(); opt_iter++)
  {
    // Copy the importance sampling control to the system
    this->sampler_->copyImportanceSamplerToDevice(nominal_control_trajectory_.data(), 0, false);
    this->sampler_->copyImportanceSamplerToDevice(nominal_control_trajectory_.data(), 1, false);
    //
    // Generate the control perturbations for exploration
    this->sampler_->generateSamples(optimization_stride, opt_iter, this->gen_, false);
    // Launch the rollout kernel
    if (this->getKernelChoiceAsEnum() == kernelType::USE_SPLIT_KERNELS)
    {
      mppi::kernels::rmppi::launchSplitRMPPIRolloutKernel(
          this->model_, this->cost_, this->sampler_, this->fb_controller_->getHostPointer().get(), this->getDt(),
          this->getNumTimesteps(), NUM_ROLLOUTS, this->getLambda(), this->getAlpha(), getValueFunctionThreshold(),
          this->initial_state_d_, this->output_d_, this->trajectory_costs_d_, this->params_.dynamics_rollout_dim_,
          this->params_.cost_rollout_dim_, this->stream_, false);
    }
    else
    {
      mppi::kernels::rmppi::launchRMPPIRolloutKernel(
          this->model_, this->cost_, this->sampler_, this->fb_controller_->getHostPointer().get(), this->getDt(),
          this->getNumTimesteps(), NUM_ROLLOUTS, this->getLambda(), this->getAlpha(), getValueFunctionThreshold(),
          this->initial_state_d_, this->trajectory_costs_d_, this->params_.dynamics_rollout_dim_, this->stream_,
          false);
    }
    // Return the costs -> nominal, real costs
    HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_.data(), trajectory_costs_real_d, NUM_ROLLOUTS * sizeof(float),
                                 cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_nominal_.data(), trajectory_costs_nominal_d,
                                 NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaStreamSynchronize(this->stream_));

    // Launch the norm exponential kernels for the nominal costs and the real costs
    this->setBaseline(mppi::kernels::computeBaselineCost(trajectory_costs_nominal_.data(), NUM_ROLLOUTS), 0);
    this->setBaseline(mppi::kernels::computeBaselineCost(this->trajectory_costs_.data(), NUM_ROLLOUTS), 1);
    // In this case this->gamma = 1 / lambda
    mppi::kernels::launchNormExpKernel(NUM_ROLLOUTS, this->getNormExpThreads(), trajectory_costs_nominal_d,
                                       1.0 / this->getLambda(), this->getBaselineCost(0), this->stream_, false);
    mppi::kernels::launchNormExpKernel(NUM_ROLLOUTS, this->getNormExpThreads(), trajectory_costs_real_d,
                                       1.0 / this->getLambda(), this->getBaselineCost(1), this->stream_, false);
    HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_.data(), trajectory_costs_real_d, NUM_ROLLOUTS * sizeof(float),
                                 cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_nominal_.data(), trajectory_costs_nominal_d,
                                 NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaStreamSynchronize(this->stream_));

    // Launch the weighted reduction kernel for the nominal costs and the real costs
    this->setNormalizer(mppi::kernels::computeNormalizer(trajectory_costs_nominal_.data(), NUM_ROLLOUTS), 0);
    this->setNormalizer(mppi::kernels::computeNormalizer(this->trajectory_costs_.data(), NUM_ROLLOUTS), 1);

    // Compute real free energy
    mppi::kernels::computeFreeEnergy(this->free_energy_statistics_.real_sys.freeEnergyMean,
                                     this->free_energy_statistics_.real_sys.freeEnergyVariance,
                                     this->free_energy_statistics_.real_sys.freeEnergyModifiedVariance,
                                     this->trajectory_costs_.data(), NUM_ROLLOUTS, this->getBaselineCost(1),
                                     this->getLambda());

    // Compute Nominal State free Energy
    mppi::kernels::computeFreeEnergy(this->free_energy_statistics_.nominal_sys.freeEnergyMean,
                                     this->free_energy_statistics_.nominal_sys.freeEnergyVariance,
                                     this->free_energy_statistics_.nominal_sys.freeEnergyModifiedVariance,
                                     this->trajectory_costs_nominal_.data(), NUM_ROLLOUTS, this->getBaselineCost(0),
                                     this->getLambda());

    // Calculate new optimal trajectories
    this->sampler_->updateDistributionParamsFromDevice(trajectory_costs_nominal_d, this->getNormalizerCost(0), 0,
                                                       false);
    this->sampler_->updateDistributionParamsFromDevice(trajectory_costs_real_d, this->getNormalizerCost(1), 1, false);

    // Transfer the new control to the host
    this->sampler_->setHostOptimalControlSequence(nominal_control_trajectory_.data(), 0, false);
    this->sampler_->setHostOptimalControlSequence(this->control_.data(), 1, true);
  }

  // Smooth the control
  this->smoothControlTrajectoryHelper(this->control_, this->control_history_);
  this->smoothControlTrajectoryHelper(nominal_control_trajectory_, nominal_control_history_);
  // Compute the nominal trajectory because we updated the nominal control!
  this->computeStateTrajectoryHelper(nominal_state_trajectory_, nominal_state_, nominal_control_trajectory_);

  this->free_energy_statistics_.real_sys.normalizerPercent = this->getNormalizerCost(1) / NUM_ROLLOUTS;
  this->free_energy_statistics_.real_sys.increase =
      this->getBaselineCost(1) - this->free_energy_statistics_.real_sys.previousBaseline;
  this->free_energy_statistics_.nominal_sys.normalizerPercent = this->getNormalizerCost(0) / NUM_ROLLOUTS;
  this->free_energy_statistics_.nominal_sys.increase =
      this->getBaselineCost(0) - this->free_energy_statistics_.nominal_sys.previousBaseline;

  // Copy back sampled trajectories
  this->copySampledControlFromDevice(false);
  if (this->getKernelChoiceAsEnum() == kernelType::USE_SINGLE_KERNEL)
  {
    // copy initial state to vis initial state for use with visualizeKernel
    HANDLE_ERROR(cudaMemcpyAsync(this->vis_initial_state_d_, this->initial_state_d_,
                                 sizeof(float) * DYN_T::STATE_DIM * 2, cudaMemcpyDeviceToDevice, this->vis_stream_));
  }
  this->copyTopControlFromDevice(true);
}

// Returns the distance between the feedback-propagated state sequences of the
// two systems plus the gap between the target and propagated sequences.
ROBUST_MPPI_TEMPLATE
float RobustMPPI::computeDF()
{
  return (this->getFeedbackPropagatedStateSeq().col(0) - this->getFeedbackPropagatedStateSeq().col(1)).norm() +
         (this->getTargetStateSeq().col(0) - this->getFeedbackPropagatedStateSeq().col(0)).norm();
}

// Runs the visualization kernel over the currently-sampled controls and
// copies the resulting outputs, costs, and crash statuses back to the host on
// the visualization stream.
ROBUST_MPPI_TEMPLATE
void RobustMPPI::calculateSampledStateTrajectories()
{
  int num_sampled_trajectories = this->getTotalSampledTrajectories();

  // control already copied in compute control, so run kernel
  if (this->getKernelChoiceAsEnum() == kernelType::USE_SPLIT_KERNELS)
  {
    mppi::kernels::launchVisualizeCostKernel(
        this->cost_->cost_d_, this->sampler_->sampling_d_, this->getDt(), this->getNumTimesteps(),
        num_sampled_trajectories, this->getLambda(), this->getAlpha(), this->sampled_outputs_d_,
        this->sampled_crash_status_d_, this->sampled_costs_d_, this->params_.cost_rollout_dim_, this->stream_, false);
  }
  else if (this->getKernelChoiceAsEnum() == kernelType::USE_SINGLE_KERNEL)
  {
    mppi::kernels::launchVisualizeKernel(
        this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), num_sampled_trajectories,
        this->getLambda(), this->getAlpha(), this->vis_initial_state_d_, this->sampled_outputs_d_,
        this->sampled_costs_d_, this->sampled_crash_status_d_, this->params_.visualize_dim_, this->stream_, false);
  }
  // copy back results
  for (int i = 0; i < num_sampled_trajectories * 2; i++)
  {
    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_trajectories_[i].data(),
                                 this->sampled_outputs_d_ + i * this->getNumTimesteps() * DYN_T::OUTPUT_DIM,
                                 this->getNumTimesteps() * DYN_T::OUTPUT_DIM * sizeof(float), cudaMemcpyDeviceToHost,
                                 this->vis_stream_));
  }
  HANDLE_ERROR(cudaMemcpyAsync(this->sampled_costs_.data(), this->sampled_costs_d_,
                               this->getNumTimesteps() * 2 * sizeof(float), cudaMemcpyDeviceToHost,
                               this->vis_stream_));
  HANDLE_ERROR(cudaMemcpyAsync(this->sampled_crash_status_.data(), this->sampled_crash_status_d_,
                               this->getNumTimesteps() * 2 * sizeof(float), cudaMemcpyDeviceToHost,
                               this->vis_stream_));
  HANDLE_ERROR(cudaStreamSynchronize(this->vis_stream_));
}

================================================
FILE: include/mppi/controllers/R-MPPI/robust_mppi_controller.cuh
================================================
/*
 * Software License Agreement (BSD License)
 * Copyright (c) 2013, Georgia Institute of Technology
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /********************************************** * @file mppi_controller.cuh * @author Grady Williams * @date May 24, 2017 * @copyright 2017 Georgia Institute of Technology * @brief Class definition for the MPPI controller. 
***********************************************/
#ifndef MPPI_GEN_RMPPI_CONTROLLER_CUH_
#define MPPI_GEN_RMPPI_CONTROLLER_CUH_

// NOTE(review): the header names of these #include directives (and several
// template argument lists below) appear to have been stripped by text
// extraction — restore them from the original file before compiling.
#include
#include
#include
#include
// Needed for list of candidate states
#include

// Parameter struct for the Robust MPPI controller; extends the base
// controller parameters with the RMPPI-specific tuning values.
template
struct RobustMPPIParams : public ControllerParams
{
  float value_function_threshold_ = 1000.0;  // Free-energy cutoff used when selecting the nominal state candidate
  int optimization_stride_ = 1;              // Timesteps of control applied per optimization
  int num_candidate_nominal_states_ = 9;     // Must be odd and >= 3 (enforced in updateNumCandidates)
  dim3 eval_cost_kernel_dim_;                // Launch dims for the candidate-evaluation cost kernel
  dim3 eval_dyn_kernel_dim_;                 // Launch dims for the candidate-evaluation dynamics kernel
};

// template ,
//           int SAMPLES_PER_CONDITION_MULTIPLIER = 1>
template , class PARAMS_T = RobustMPPIParams>
class RobustMPPIController : public Controller
{
public:
  /**
   * Set up useful types
   */
  typedef Controller PARENT_CLASS;
  using control_array = typename PARENT_CLASS::control_array;
  using control_trajectory = typename PARENT_CLASS::control_trajectory;
  using state_trajectory = typename PARENT_CLASS::state_trajectory;
  using state_array = typename PARENT_CLASS::state_array;
  using sampled_cost_traj = typename PARENT_CLASS::sampled_cost_traj;
  using FEEDBACK_STATE = typename PARENT_CLASS::TEMPLATED_FEEDBACK_STATE;
  using FEEDBACK_PARAMS = typename PARENT_CLASS::TEMPLATED_FEEDBACK_PARAMS;
  using FEEDBACK_GPU = typename PARENT_CLASS::TEMPLATED_FEEDBACK_GPU;
  using NominalCandidateVector = typename util::NamedEigenAlignedVector;

  static const int STATE_DIM = DYN_T::STATE_DIM;
  static const int CONTROL_DIM = DYN_T::CONTROL_DIM;

  // Number of samples per condition must be a multiple of the blockDIM
  // static const int SAMPLES_PER_CANDIDATE = SAMPLES_PER_CANDIDATE_MULTIPLIER;
  // Samples per candidate is the x-dimension of the evaluation dynamics kernel.
  int getNumEvalSamplesPerCandidate() const
  {
    return this->params_.eval_dyn_kernel_dim_.x;
  }

  // float value_function_threshold_ = 1000.0;

  state_array nominal_state_ = state_array::Zero();

  /**
   * @brief Constructor for mppi controller class
   * @param model A basis function model of the system dynamics.
   * @param cost An MppiCost object.
   * @param dt The time increment. horizon = num_timesteps*dt
   * @param max_iter number of times to repeat control sequence calculations
   * @param gamma
   * @param num_timesteps The number of timesteps to look ahead for.
   * TODO Finish this description
   */
  RobustMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, float dt, int max_iter,
                       float lambda, float alpha, float value_function_threshold, int num_timesteps = MAX_TIMESTEPS,
                       const Eigen::Ref& init_control_traj = control_trajectory::Zero(),
                       int num_candidate_nominal_states = 9, int optimization_stride = 1,
                       cudaStream_t stream = nullptr);

  RobustMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, PARAMS_T& params,
                       cudaStream_t stream = nullptr);

  /**
   * @brief Destructor for mppi controller class.
   */
  ~RobustMPPIController();

  std::string getControllerName()
  {
    return "Robust MPPI";
  };

  // Initializes the num_candidates, candidate_nominal_states, linesearch_weights,
  // and allocates the associated CUDA memory
  void updateNumCandidates(int new_num_candidates);

  // Update the importance sampler prior to calling computeControl
  void updateImportanceSamplingControl(const Eigen::Ref& state, int stride);

  /**
   * @brief Compute the control given the current state of the system.
   * @param state The current state of the autorally system.
   */
  void computeControl(const Eigen::Ref& state, int optimization_stride = 1);

  control_trajectory getControlSeq() const override
  {
    return this->control_;
  };

  state_trajectory getTargetStateSeq() const override
  {
    return nominal_state_trajectory_;
  };

  state_array getNominalState() const
  {
    return nominal_state_;
  }

  int getBestIndex() const
  {
    return best_index_;
  };

  float getValueFunctionThreshold() const
  {
    return this->params_.value_function_threshold_;
  }

  int getOptimizationStride() const
  {
    return this->params_.optimization_stride_;
  }

  int getNumCandidates() const
  {
    return this->params_.num_candidate_nominal_states_;
  }

  // Total candidate-evaluation rollouts = candidates * samples per candidate.
  int getNumEvalRollouts() const
  {
    return getNumCandidates() * getNumEvalSamplesPerCandidate();
  }

  int getEvalKernelChoiceAsInt() const
  {
    return static_cast(use_eval_kernel_);
  }

  kernelType getEvalKernelChoiceAsEnum() const
  {
    return use_eval_kernel_;
  }

  // Does nothing. This reason is because the control sliding happens during the importance sampler update.
  // The control applied to the real system (during the MPPI rollouts) is the nominal control (which slides
  // during the importance sampler update), plus the feedback term. Inside the runControlIteration function
  // slideControl sequence is called prior to optimization, after the importance sampler update.
  void slideControlSequence(int steps) override{};

  // Feedback gain computation is done after the importance sampling update. The nominal trajectory computed
  // during the importance sampling update does not change after the optimization, thus the feedback gains will
  // not change either. In the current implementation of runControlIteration, the compute feedback gains is called
  // after the computation of the optimal control.
  // Intentionally empty — see the feedback-gains note above this declaration.
  void computeFeedback(const Eigen::Ref& state) override{};

  Eigen::MatrixXf getCandidateFreeEnergy()
  {
    return candidate_free_energy_;
  };

  float computeDF();

  void setPercentageSampledControlTrajectories(float new_perc)
  {
    // 2 distributions: nominal and real.
    this->setPercentageSampledControlTrajectoriesHelper(new_perc, 2);
  }

  void setValueFunctionThreshold(float value_function_threshold)
  {
    this->params_.value_function_threshold_ = value_function_threshold;
  }

  void setOptimizationStride(float optimization_stride)
  {
    this->params_.optimization_stride_ = optimization_stride;
  }

  void setNumCandidates(int num_candidate_nominal_states)
  {
    this->params_.num_candidate_nominal_states_ = num_candidate_nominal_states;
  }

  void setNumSamplesPerCandidate(const int& num_samples)
  {
    this->params_.eval_dyn_kernel_dim_.x = num_samples;
    updateCandidateMemory();
  }

  void setEvalKernelChoice(const int& kernel_type)
  {
    use_eval_kernel_ = static_cast(kernel_type);
  }

  void setEvalKernelChoice(const kernelType& kernel_type)
  {
    use_eval_kernel_ = kernel_type;
  }

  void setParams(const PARAMS_T& p);

  void calculateSampledStateTrajectories() override;

  void chooseAppropriateKernel() override;
  void chooseAppropriateEvalKernel();

protected:
  bool importance_sampling_cuda_mem_init_ = false;
  // int num_candidate_nominal_states_;
  int best_index_ = 0;  // Selected nominal state candidate
  // int optimization_stride_;  // Number of timesteps to apply the optimal control (== 1 for true MPC)
  int nominal_stride_ = 0;  // Stride for the chosen nominal state of the importance sampler
  int real_stride_ = 0;     // Stride for the optimal controller sliding
  bool nominal_state_init_ = false;

  // Free energy variables
  float nominal_free_energy_mean_ = 0;
  float nominal_free_energy_variance_ = 0;
  float nominal_free_energy_modified_variance_ = 0;

  // Storage classes
  control_trajectory nominal_control_trajectory_ = control_trajectory::Zero();
  state_trajectory nominal_state_trajectory_ = state_trajectory::Zero();
  sampled_cost_traj trajectory_costs_nominal_ = sampled_cost_traj::Zero();

  // Make the control history size flexible, related to issue #30
  Eigen::Matrix nominal_control_history_;  // History used for nominal_state IS

  NominalCandidateVector candidate_nominal_states_ = { state_array::Zero() };
  Eigen::MatrixXf line_search_weights_;  // At minimum there must be 3 candidates
  Eigen::MatrixXi importance_sampler_strides_;  // Time index where control trajectory starts for each nominal state
                                                // candidate
  Eigen::MatrixXf candidate_trajectory_costs_;
  Eigen::MatrixXf candidate_free_energy_;

  void allocateCUDAMemory();
  void copyNominalControlToDevice(bool synchronize = true);
  void deallocateNominalStateCandidateMemory();
  void updateCandidateMemory();
  void resetCandidateCudaMem();

  void getInitNominalStateCandidates(const Eigen::Ref& nominal_x_k,
                                     const Eigen::Ref& nominal_x_kp1,
                                     const Eigen::Ref& real_x_kp1);

  // compute the line search weights
  void computeLineSearchWeights();

  void computeNominalStateAndStride(const Eigen::Ref& state, int stride);

  // compute the importance sampler strides
  void computeImportanceSamplerStride(int stride);

  // Compute the baseline of the candidates
  float computeCandidateBaseline();

  // Get the best index based on the candidate free energy
  void computeBestIndex();

  // Computes and saves the feedback gains used in the rollout kernel and tracking.
  virtual void computeNominalFeedbackGains(const Eigen::Ref& state);

  // CUDA Memory
  float* importance_sampling_costs_d_ = nullptr;
  float* importance_sampling_outputs_d_ = nullptr;
  float* importance_sampling_states_d_ = nullptr;
  int* importance_sampling_strides_d_ = nullptr;
  float* feedback_gain_array_d_ = nullptr;

  kernelType use_eval_kernel_ = kernelType::USE_SPLIT_KERNELS;
};

#if __CUDACC__
#include "robust_mppi_controller.cu"
#endif

#endif /* MPPI_GEN_RMPPI_CONTROLLER_CUH_ */

================================================
FILE: include/mppi/controllers/Tube-MPPI/tube_mppi_controller.cu
================================================
#include "tube_mppi_controller.cuh"
#include

#define TUBE_MPPI_TEMPLATE \
  template

#define TubeMPPI TubeMPPIController

// Constructor from individual tuning values: configures the rollout kernels
// for two parallel systems (nominal + real), allocates CUDA memory, and
// initializes the tracking feedback controller.
TUBE_MPPI_TEMPLATE
TubeMPPI::TubeMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, float dt,
                             int max_iter, float lambda, float alpha, int num_timesteps,
                             const Eigen::Ref& init_control_traj, cudaStream_t stream)
  : PARENT_CLASS(model, cost, fb_controller, sampler, dt, max_iter, lambda, alpha, num_timesteps, init_control_traj,
                 stream)
{
  nominal_control_trajectory_ = init_control_traj;
  // call rollout kernel with z = 2 since we have a nominal state
  this->params_.dynamics_rollout_dim_.z = max(2, this->params_.dynamics_rollout_dim_.z);
  this->params_.cost_rollout_dim_.z = max(2, this->params_.cost_rollout_dim_.z);
  this->sampler_->setNumDistributions(2);
  // Allocate CUDA memory for the controller
  allocateCUDAMemory();
  // Initialize Feedback
  this->fb_controller_->initTrackingController();
  this->enable_feedback_ = true;
  chooseAppropriateKernel();
}

// Constructor driven entirely by a params struct; same setup as above.
TUBE_MPPI_TEMPLATE
TubeMPPI::TubeMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, PARAMS_T& params,
                             cudaStream_t stream)
  : PARENT_CLASS(model, cost, fb_controller, sampler, params, stream)
{
  nominal_control_trajectory_ = this->params_.init_control_traj_;
  // call rollout kernel with z = 2 since we have a nominal
  // ("state" below completes the comment begun on the previous source line:
  //  "... since we have a nominal state")
  this->params_.dynamics_rollout_dim_.z = max(2, this->params_.dynamics_rollout_dim_.z);
  this->params_.cost_rollout_dim_.z = max(2, this->params_.cost_rollout_dim_.z);
  this->sampler_->setNumDistributions(2);
  // Allocate CUDA memory for the controller
  allocateCUDAMemory();
  // Initialize Feedback
  this->fb_controller_->initTrackingController();
  this->enable_feedback_ = true;
  chooseAppropriateKernel();
}

// Benchmarks the combined vs. split rollout kernels (subject to shared-memory
// limits) and selects whichever ran faster; throws std::runtime_error when
// neither kernel variant fits in shared memory.
TUBE_MPPI_TEMPLATE
void TubeMPPI::chooseAppropriateKernel()
{
  cudaDeviceProp deviceProp;
  HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, 0));

  // Shared-memory footprint of every kernel variant we might launch.
  unsigned single_kernel_byte_size = mppi::kernels::calcRolloutCombinedKernelSharedMemSize(
      this->model_, this->cost_, this->sampler_, this->params_.dynamics_rollout_dim_);
  unsigned split_dyn_kernel_byte_size = mppi::kernels::calcRolloutDynamicsKernelSharedMemSize(
      this->model_, this->sampler_, this->params_.dynamics_rollout_dim_);
  unsigned split_cost_kernel_byte_size =
      mppi::kernels::calcRolloutCostKernelSharedMemSize(this->cost_, this->sampler_, this->params_.cost_rollout_dim_);
  unsigned vis_single_kernel_byte_size = mppi::kernels::calcVisualizeKernelSharedMemSize(
      this->model_, this->cost_, this->sampler_, this->getNumTimesteps(), this->params_.visualize_dim_);

  bool too_much_mem_single_kernel = single_kernel_byte_size > deviceProp.sharedMemPerBlock;
  bool too_much_mem_vis_kernel = vis_single_kernel_byte_size > deviceProp.sharedMemPerBlock;
  bool too_much_mem_split_kernel = split_dyn_kernel_byte_size > deviceProp.sharedMemPerBlock;
  too_much_mem_split_kernel = too_much_mem_split_kernel || split_cost_kernel_byte_size > deviceProp.sharedMemPerBlock;
  // The single-kernel path also requires the combined visualization kernel to fit.
  too_much_mem_single_kernel = too_much_mem_single_kernel || too_much_mem_vis_kernel;

  if (too_much_mem_split_kernel && too_much_mem_single_kernel)
  {
    std::string error_msg =
        "There is not enough shared memory on the GPU for either rollout kernel option. The combined rollout kernel "
        "takes " +
        std::to_string(single_kernel_byte_size) + " bytes, the cost rollout kernel takes " +
        std::to_string(split_cost_kernel_byte_size) + " bytes, the dynamics rollout kernel takes " +
        std::to_string(split_dyn_kernel_byte_size) + " bytes, the combined visualization kernel takes " +
        std::to_string(vis_single_kernel_byte_size) + " bytes, and the max is " +
        std::to_string(deviceProp.sharedMemPerBlock) +
        " bytes. Considering lowering the corresponding thread block sizes.";
    throw std::runtime_error(error_msg);
  }
  else if (too_much_mem_single_kernel)
  {
    this->setKernelChoice(kernelType::USE_SPLIT_KERNELS);
    return;
  }
  else if (too_much_mem_split_kernel)
  {
    this->setKernelChoice(kernelType::USE_SINGLE_KERNEL);
    return;
  }

  // Both variants fit; set up a dummy problem and time them.
  // Send the nominal control to the device
  this->copyNominalControlToDevice(false);
  state_array zero_state = this->model_->getZeroState();
  // Send zero state to the device (both the real and nominal slots)
  HANDLE_ERROR(cudaMemcpyAsync(this->initial_state_d_, zero_state.data(), DYN_T::STATE_DIM * sizeof(float),
                               cudaMemcpyHostToDevice, this->stream_));
  HANDLE_ERROR(cudaMemcpyAsync(this->initial_state_d_ + DYN_T::STATE_DIM, zero_state.data(),
                               DYN_T::STATE_DIM * sizeof(float), cudaMemcpyHostToDevice, this->stream_));
  // Generate noise data
  this->sampler_->generateSamples(1, 0, this->gen_, true);

  // NOTE(review): the template argument of numeric_limits appears stripped by
  // extraction; it should be the float timing type.
  float single_kernel_time_ms = std::numeric_limits::infinity();
  float split_kernel_time_ms = std::numeric_limits::infinity();

  // Evaluate each kernel that is applicable
  auto start_single_kernel_time = std::chrono::steady_clock::now();
  for (int i = 0; i < this->getNumKernelEvaluations() && !too_much_mem_single_kernel; i++)
  {
    mppi::kernels::launchRolloutKernel(
        this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), NUM_ROLLOUTS,
        this->getLambda(), this->getAlpha(), this->initial_state_d_, this->trajectory_costs_d_,
        this->params_.dynamics_rollout_dim_, this->stream_, true);
  }
  auto end_single_kernel_time = std::chrono::steady_clock::now();

  auto start_split_kernel_time = std::chrono::steady_clock::now();
  for (int i = 0; i < this->getNumKernelEvaluations() && !too_much_mem_split_kernel; i++)
  {
    mppi::kernels::launchSplitRolloutKernel(
        this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), NUM_ROLLOUTS,
        this->getLambda(), this->getAlpha(), this->initial_state_d_, this->output_d_, this->trajectory_costs_d_,
        this->params_.dynamics_rollout_dim_, this->params_.cost_rollout_dim_, this->stream_, true);
  }
  auto end_split_kernel_time = std::chrono::steady_clock::now();

  // calc times
  if (!too_much_mem_single_kernel)
  {
    single_kernel_time_ms = mppi::math::timeDiffms(end_single_kernel_time, start_single_kernel_time);
  }
  if (!too_much_mem_split_kernel)
  {
    split_kernel_time_ms = mppi::math::timeDiffms(end_split_kernel_time, start_split_kernel_time);
  }

  std::string kernel_choice = "";
  if (split_kernel_time_ms < single_kernel_time_ms)
  {
    this->setKernelChoice(kernelType::USE_SPLIT_KERNELS);
    kernel_choice = "split ";
  }
  else
  {
    this->setKernelChoice(kernelType::USE_SINGLE_KERNEL);
    kernel_choice = "single";
  }
  this->logger_->info("Choosing %s kernel based on split taking %f ms and single taking %f ms after %d iterations\n",
                      kernel_choice.c_str(), split_kernel_time_ms, single_kernel_time_ms,
                      this->getNumKernelEvaluations());
}

// Runs the Tube-MPPI optimization over the augmented [real, nominal] system.
// NOTE(review): this definition continues past the end of this file chunk.
TUBE_MPPI_TEMPLATE
void TubeMPPI::computeControl(const Eigen::Ref& state, int optimization_stride)
{
  if (!nominalStateInit_)
  {
    // set the nominal state to the actual state
    nominal_state_trajectory_.col(0) = state;
    nominalStateInit_ = true;
  }

  this->free_energy_statistics_.real_sys.previousBaseline = this->getBaselineCost(0);
  this->free_energy_statistics_.nominal_sys.previousBaseline = this->getBaselineCost(1);

  // std::cout << "Post disturbance Actual State: "; this->model_->printState(state.data());
  // std::cout << " Nominal State: "; this->model_->printState(nominal_state_trajectory_.col(0).data());

  // Handy reference pointers to the nominal state
  float* trajectory_costs_nominal_d = this->trajectory_costs_d_ + NUM_ROLLOUTS;
  float* initial_state_nominal_d = this->initial_state_d_ + DYN_T::STATE_DIM;

  for (int opt_iter = 0; opt_iter < this->getNumIters(); opt_iter++)
  {
    // Send the initial condition to the device
    HANDLE_ERROR(cudaMemcpyAsync(this->initial_state_d_, state.data(), DYN_T::STATE_DIM * sizeof(float),
                                 cudaMemcpyHostToDevice, this->stream_));
    HANDLE_ERROR(cudaMemcpyAsync(initial_state_nominal_d, nominal_state_trajectory_.data(),
                                 DYN_T::STATE_DIM * sizeof(float), cudaMemcpyHostToDevice, this->stream_));
    // Send the nominal control to the device
    copyControlToDevice(false);
    // Generate noise data
    this->sampler_->generateSamples(optimization_stride, opt_iter, this->gen_, false);
    // call rollout kernel with z = 2 since we have a nominal state
    this->params_.dynamics_rollout_dim_.z = max(2, this->params_.dynamics_rollout_dim_.z);
    this->params_.cost_rollout_dim_.z = max(2, this->params_.cost_rollout_dim_.z);
    // Launch the rollout kernel
    if (this->getKernelChoiceAsEnum() == kernelType::USE_SPLIT_KERNELS)
    {
      mppi::kernels::launchSplitRolloutKernel(
          this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), NUM_ROLLOUTS,
          this->getLambda(), this->getAlpha(), this->initial_state_d_, this->output_d_, this->trajectory_costs_d_,
          this->params_.dynamics_rollout_dim_, this->params_.cost_rollout_dim_, this->stream_, false);
    }
    else
    {
      mppi::kernels::launchRolloutKernel(
          this->model_, this->cost_, this->sampler_, this->getDt(), this->getNumTimesteps(), NUM_ROLLOUTS,
          this->getLambda(), this->getAlpha(), this->initial_state_d_, this->trajectory_costs_d_,
          this->params_.dynamics_rollout_dim_, this->stream_, false);
    }
    // Copy the costs back to the host
    HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_.data(), this->trajectory_costs_d_,
                                 NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_nominal_.data(), trajectory_costs_nominal_d,
                                 NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaStreamSynchronize(this->stream_));

    this->setBaseline(mppi::kernels::computeBaselineCost(this->trajectory_costs_.data(), NUM_ROLLOUTS), 0);
    this->setBaseline(mppi::kernels::computeBaselineCost(this->trajectory_costs_nominal_.data(), NUM_ROLLOUTS), 1);

    // Launch the norm exponential kernel for both actual and nominal
    mppi::kernels::launchNormExpKernel(NUM_ROLLOUTS, this->getNormExpThreads(), this->trajectory_costs_d_,
                                       1.0 / this->getLambda(), this->getBaselineCost(0), this->stream_, false);
    mppi::kernels::launchNormExpKernel(NUM_ROLLOUTS, this->getNormExpThreads(), trajectory_costs_nominal_d,
                                       1.0 / this->getLambda(), this->getBaselineCost(1), this->stream_, false);
    HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_.data(), this->trajectory_costs_d_,
                                 NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_nominal_.data(), trajectory_costs_nominal_d,
                                 NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_));
    HANDLE_ERROR(cudaStreamSynchronize(this->stream_));

    // Compute the normalizer
    this->setNormalizer(mppi::kernels::computeNormalizer(this->trajectory_costs_.data(), NUM_ROLLOUTS), 0);
    this->setNormalizer(mppi::kernels::computeNormalizer(this->trajectory_costs_nominal_.data(), NUM_ROLLOUTS), 1);

    // Compute real free energy
    mppi::kernels::computeFreeEnergy(this->free_energy_statistics_.real_sys.freeEnergyMean,
                                     this->free_energy_statistics_.real_sys.freeEnergyVariance,
                                     this->free_energy_statistics_.real_sys.freeEnergyModifiedVariance,
                                     this->trajectory_costs_.data(), NUM_ROLLOUTS, this->getBaselineCost(0),
                                     this->getLambda());

    // Compute Nominal State free Energy
    mppi::kernels::computeFreeEnergy(this->free_energy_statistics_.nominal_sys.freeEnergyMean,
                                     this->free_energy_statistics_.nominal_sys.freeEnergyVariance,
                                     this->free_energy_statistics_.nominal_sys.freeEnergyModifiedVariance,
                                     this->trajectory_costs_nominal_.data(), NUM_ROLLOUTS,
this->getBaselineCost(1), this->getLambda()); // Compute the cost weighted average this->sampler_->updateDistributionParamsFromDevice(this->trajectory_costs_d_, this->getNormalizerCost(0), 0, false); this->sampler_->updateDistributionParamsFromDevice(trajectory_costs_nominal_d, this->getNormalizerCost(1), 1, false); // Transfer the new control to the host this->sampler_->setHostOptimalControlSequence(this->control_.data(), 0, false); this->sampler_->setHostOptimalControlSequence(this->nominal_control_trajectory_.data(), 1, true); // Compute the nominal and actual state trajectories computeStateTrajectory(state); // Input is the actual state // this->logger_->debug("Actual baseline: %f\n", this->getBaselineCost(0)); // this->logger_->debug("Nominal baseline: %f\n", this->getBaselineCost(1)); if (this->getBaselineCost(0) < this->getBaselineCost(1) + getNominalThreshold()) { // In this case, the disturbance the made the nominal and actual states differ improved the cost. // std::copy(state_trajectory.begin(), state_trajectory.end(), nominal_state_trajectory_.begin()); // std::copy(control_trajectory.begin(), control_trajectory.end(), nominal_control_.begin()); this->free_energy_statistics_.nominal_state_used = 0; nominal_state_trajectory_ = this->state_; nominal_control_trajectory_ = this->control_; } else { this->free_energy_statistics_.nominal_state_used = 1; } // Outside of this loop, we will utilize the nominal state trajectory and the nominal control trajectory to compute // the optimal feedback gains using our ancillary controller, then apply feedback inside our main while loop at the // same rate as our state estimator. 
} smoothControlTrajectory(); computeStateTrajectory(state); // Input is the actual state this->free_energy_statistics_.real_sys.normalizerPercent = this->getNormalizerCost(0) / NUM_ROLLOUTS; this->free_energy_statistics_.real_sys.increase = this->getBaselineCost(0) - this->free_energy_statistics_.real_sys.previousBaseline; this->free_energy_statistics_.nominal_sys.normalizerPercent = this->getNormalizerCost(1) / NUM_ROLLOUTS; this->free_energy_statistics_.nominal_sys.increase = this->getBaselineCost(1) - this->free_energy_statistics_.nominal_sys.previousBaseline; // Copy back sampled trajectories this->copySampledControlFromDevice(false); this->copyTopControlFromDevice(true); } TUBE_MPPI_TEMPLATE void TubeMPPI::copyControlToDevice(bool synchronize) { this->sampler_->copyImportanceSamplerToDevice(this->control_.data(), 0, false); this->sampler_->copyImportanceSamplerToDevice(this->nominal_control_trajectory_.data(), 1, synchronize); } TUBE_MPPI_TEMPLATE void TubeMPPI::allocateCUDAMemory() { PARENT_CLASS::allocateCUDAMemoryHelper(1); } TUBE_MPPI_TEMPLATE void TubeMPPI::slideControlSequence(int steps) { // Propagate the nominal trajectory forward updateNominalState(nominal_control_trajectory_.col(0)); // Save the control history this->saveControlHistoryHelper(steps, nominal_control_trajectory_, this->control_history_); this->slideControlSequenceHelper(steps, nominal_control_trajectory_); this->slideControlSequenceHelper(steps, this->control_); } TUBE_MPPI_TEMPLATE void TubeMPPI::smoothControlTrajectory() { this->smoothControlTrajectoryHelper(nominal_control_trajectory_, this->control_history_); } TUBE_MPPI_TEMPLATE void TubeMPPI::computeStateTrajectory(const Eigen::Ref& x0_actual) { // update the nominal state this->computeStateTrajectoryHelper(nominal_state_trajectory_, nominal_state_trajectory_.col(0), nominal_control_trajectory_); // update the actual state this->computeStateTrajectoryHelper(this->state_, x0_actual, this->control_); } TUBE_MPPI_TEMPLATE void 
TubeMPPI::updateNominalState(const Eigen::Ref& u) { state_array xdot; output_array output; this->model_->step(nominal_state_trajectory_.col(0), nominal_state_trajectory_.col(0), xdot, u, output, 0, this->getDt()); } TUBE_MPPI_TEMPLATE void TubeMPPI::calculateSampledStateTrajectories() { int num_sampled_trajectories = this->getTotalSampledTrajectories(); // control already copied in compute control, so run kernel mppi::kernels::launchVisualizeCostKernel( this->cost_->cost_d_, this->sampler_->sampling_d_, this->getDt(), this->getNumTimesteps(), num_sampled_trajectories, this->getLambda(), this->getAlpha(), this->sampled_outputs_d_, this->sampled_crash_status_d_, this->sampled_costs_d_, this->params_.cost_rollout_dim_, this->stream_, false); // copy back results for (int i = 0; i < num_sampled_trajectories * 2; i++) { HANDLE_ERROR(cudaMemcpyAsync(this->sampled_trajectories_[i].data(), this->sampled_outputs_d_ + i * this->getNumTimesteps() * DYN_T::OUTPUT_DIM, this->getNumTimesteps() * DYN_T::OUTPUT_DIM * sizeof(float), cudaMemcpyDeviceToHost, this->vis_stream_)); } HANDLE_ERROR(cudaMemcpyAsync(this->sampled_costs_.data(), this->sampled_costs_d_, this->getNumTimesteps() * 2 * sizeof(float), cudaMemcpyDeviceToHost, this->vis_stream_)); HANDLE_ERROR(cudaMemcpyAsync(this->sampled_crash_status_.data(), this->sampled_crash_status_d_, this->getNumTimesteps() * 2 * sizeof(float), cudaMemcpyDeviceToHost, this->vis_stream_)); HANDLE_ERROR(cudaStreamSynchronize(this->vis_stream_)); } ================================================ FILE: include/mppi/controllers/Tube-MPPI/tube_mppi_controller.cuh ================================================ /** * Created by Manan Gandhi on 2/14/2020 * API for interfacing with the TUBE MPPI controller. 
*/ #ifndef MPPIGENERIC_TUBE_MPPI_CONTROLLER_CUH #define MPPIGENERIC_TUBE_MPPI_CONTROLLER_CUH // #include #include #include #include #include #include #include template struct TubeMPPIParams : public ControllerParams { float nominal_threshold_ = 20; // How much worse the actual system has to be compared to the nominal }; template , class PARAMS_T = TubeMPPIParams> class TubeMPPIController : public Controller { public: // EIGEN_MAKE_ALIGNED_OPERATOR_NEW unnecessary due to EIGEN_MAX_ALIGN_BYTES=0 /** * Set up useful types */ typedef Controller PARENT_CLASS; using control_array = typename PARENT_CLASS::control_array; using control_trajectory = typename PARENT_CLASS::control_trajectory; using state_trajectory = typename PARENT_CLASS::state_trajectory; using state_array = typename PARENT_CLASS::state_array; using output_array = typename PARENT_CLASS::output_array; using sampled_cost_traj = typename PARENT_CLASS::sampled_cost_traj; using FEEDBACK_PARAMS = typename PARENT_CLASS::TEMPLATED_FEEDBACK_PARAMS; using FEEDBACK_GPU = typename PARENT_CLASS::TEMPLATED_FEEDBACK_GPU; TubeMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, float dt, int max_iter, float lambda, float alpha, int num_timesteps = MAX_TIMESTEPS, const Eigen::Ref& init_control_traj = control_trajectory::Zero(), cudaStream_t stream = nullptr); TubeMPPIController(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, PARAMS_T& params, cudaStream_t stream = nullptr); void computeControl(const Eigen::Ref& state, int optimization_stride = 1) override; std::string getControllerName() { return "Tube MPPI"; }; /** * returns the current nominal control sequence */ control_trajectory getControlSeq() const override { return nominal_control_trajectory_; }; /** * returns the current nominal state sequence */ state_trajectory getTargetStateSeq() const override { return nominal_state_trajectory_; }; /** * returns the current control sequence */ control_trajectory 
getActualControlSeq() { return this->control_; }; /** * Slide the control sequence back n steps */ void slideControlSequence(int steps) override; void smoothControlTrajectory(); void updateNominalState(const Eigen::Ref& u); float getNominalThreshold() const { return this->params_.nominal_threshold_; } void setNominalThreshold(float threshold) { this->params_.nominal_threshold_ = threshold; } void setPercentageSampledControlTrajectories(float new_perc) { this->setPercentageSampledControlTrajectoriesHelper(new_perc, 2); } void calculateSampledStateTrajectories() override; void chooseAppropriateKernel(); private: // float nominal_threshold_ = 20; // How much worse the actual system has to be compared to the nominal // Free energy variables float nominal_free_energy_mean_ = 0; float nominal_free_energy_variance_ = 0; float nominal_free_energy_modified_variance_ = 0; // nominal state CPU side copies control_trajectory nominal_control_trajectory_ = control_trajectory::Zero(); state_trajectory nominal_state_trajectory_ = state_trajectory::Zero(); sampled_cost_traj trajectory_costs_nominal_ = sampled_cost_traj::Zero(); // Check to see if nominal state has been initialized bool nominalStateInit_ = false; void computeStateTrajectory(const Eigen::Ref& x0_actual); protected: // TODO move up and generalize, pass in what to copy and initial location void copyControlToDevice(bool synchronize = true); private: // ======== PURE VIRTUAL ========= void allocateCUDAMemory(); // ======== PURE VIRTUAL END ===== }; #if __CUDACC__ #include "tube_mppi_controller.cu" #endif #endif ================================================ FILE: include/mppi/controllers/controller.cu ================================================ #include #define CONTROLLER_TEMPLATE \ template #define CONTROLLER Controller CONTROLLER_TEMPLATE void CONTROLLER::deallocateCUDAMemory() { if (CUDA_mem_init_) { HANDLE_ERROR(cudaFree(initial_state_d_)); HANDLE_ERROR(cudaFree(vis_initial_state_d_)); 
HANDLE_ERROR(cudaFree(control_d_)); HANDLE_ERROR(cudaFree(output_d_)); HANDLE_ERROR(cudaFree(trajectory_costs_d_)); HANDLE_ERROR(cudaFree(cost_baseline_and_norm_d_)); CUDA_mem_init_ = false; } if (sampled_states_CUDA_mem_init_) { HANDLE_ERROR(cudaFree(sampled_outputs_d_)); HANDLE_ERROR(cudaFree(sampled_costs_d_)); sampled_states_CUDA_mem_init_ = false; } } // CONTROLLER_TEMPLATE // void CONTROLLER::copyControlStdDevToDevice(bool synchronize) // { // if (!CUDA_mem_init_) // { // return; // } // HANDLE_ERROR(cudaMemcpyAsync(control_std_dev_d_, params_.control_std_dev_.data(), // sizeof(float) * params_.control_std_dev_.size(), cudaMemcpyHostToDevice, stream_)); // if (synchronize) // { // HANDLE_ERROR(cudaStreamSynchronize(stream_)); // } // } CONTROLLER_TEMPLATE void CONTROLLER::copyNominalControlToDevice(bool synchronize) { if (!CUDA_mem_init_) { return; } this->sampler_->copyImportanceSamplerToDevice(control_.data(), 0, synchronize); } CONTROLLER_TEMPLATE void CONTROLLER::copySampledControlFromDevice(bool synchronize) { // if mem is not inited don't use it if (!sampled_states_CUDA_mem_init_) { return; } int num_sampled_trajectories = perc_sampled_control_trajectories_ * NUM_ROLLOUTS; std::vector samples(num_sampled_trajectories); if (perc_sampled_control_trajectories_ > 0.98) { // if above threshold just do everything std::iota(samples.begin(), samples.end(), 0); } else { // Create sample list without replacement // removes the top 2% since top 1% are complete noise samples = mppi::math::sample_without_replacement(num_sampled_trajectories, NUM_ROLLOUTS * 0.98); } // this explicitly adds the optimized control sequence HANDLE_ERROR(cudaMemcpyAsync(this->sampled_outputs_d_, this->output_.data(), sizeof(float) * getNumTimesteps() * DYN_T::OUTPUT_DIM, cudaMemcpyHostToDevice, this->vis_stream_)); HANDLE_ERROR(cudaMemcpyAsync( // this->sampled_noise_d_, this->sampler_->getVisControlSample(0, 0, 0), this->control_d_, sizeof(float) * getNumTimesteps() * DYN_T::CONTROL_DIM, 
cudaMemcpyDeviceToDevice, this->vis_stream_)); for (int i = 1; i < num_sampled_trajectories; i++) { HANDLE_ERROR(cudaMemcpyAsync(this->sampled_outputs_d_ + i * getNumTimesteps() * DYN_T::OUTPUT_DIM, this->output_d_ + samples[i] * getNumTimesteps() * DYN_T::OUTPUT_DIM, sizeof(float) * getNumTimesteps() * DYN_T::OUTPUT_DIM, cudaMemcpyDeviceToDevice, this->vis_stream_)); HANDLE_ERROR(cudaMemcpyAsync( // this->sampled_noise_d_ + i * getNumTimesteps() * DYN_T::CONTROL_DIM, this->sampler_->getVisControlSample(i, 0, 0), this->sampler_->getControlSample(samples[i], 0, 0), // this->control_noise_d_ + samples[i] * getNumTimesteps() * DYN_T::CONTROL_DIM, sizeof(float) * getNumTimesteps() * DYN_T::CONTROL_DIM, cudaMemcpyDeviceToDevice, this->vis_stream_)); } if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(this->vis_stream_)); } } CONTROLLER_TEMPLATE std::pair CONTROLLER::findMinIndexAndValue(std::vector& temp_list) { if (temp_list.size() == 0) { return std::make_pair(0, 0.0); } int min_sample_index = 0; float min_sample_value = this->trajectory_costs_[temp_list[min_sample_index]]; for (int index = 1; index < temp_list.size(); index++) { if (this->trajectory_costs_[temp_list[index]] < min_sample_value) { min_sample_value = this->trajectory_costs_[temp_list[index]]; min_sample_index = index; } } return std::make_pair(min_sample_index, min_sample_value); } CONTROLLER_TEMPLATE void CONTROLLER::copyTopControlFromDevice(bool synchronize) { // if mem is not inited don't use it if (!sampled_states_CUDA_mem_init_ || num_top_control_trajectories_ <= 0) { return; } // Important note: Highest weighted trajectories are the ones with the lowest cost int start_top_control_traj_index = perc_sampled_control_trajectories_ * NUM_ROLLOUTS; std::vector samples(num_top_control_trajectories_); // Start by filling in the top samples list with the first n in the trajectory for (int i = 0; i < num_top_control_trajectories_; i++) { samples[i] = i; } // Calculate min weight in the current top 
samples list int min_sample_index = 0; float min_sample_value = 0; std::tie(min_sample_index, min_sample_value) = findMinIndexAndValue(samples); // find top n samples by removing the smallest weights from the list for (int i = num_top_control_trajectories_; i < NUM_ROLLOUTS; i++) { if (trajectory_costs_[i] > min_sample_value) { // Remove the smallest weight in the current list and add the new index samples[min_sample_index] = i; // recalculate min weight in the current list std::tie(min_sample_index, min_sample_value) = findMinIndexAndValue(samples); } } // Copy top n samples to this->sampled_noise_d_ after the randomly sampled trajectories top_n_costs_.resize(num_top_control_trajectories_); for (int i = 0; i < num_top_control_trajectories_; i++) { top_n_costs_[i] = trajectory_costs_[samples[i]] / getNormalizerCost(); HANDLE_ERROR(cudaMemcpyAsync( this->sampled_outputs_d_ + (start_top_control_traj_index + i) * getNumTimesteps() * DYN_T::OUTPUT_DIM, this->output_d_ + samples[i] * getNumTimesteps() * DYN_T::OUTPUT_DIM, sizeof(float) * getNumTimesteps() * DYN_T::OUTPUT_DIM, cudaMemcpyDeviceToDevice, this->vis_stream_)); HANDLE_ERROR(cudaMemcpyAsync( // this->sampled_noise_d_ + (start_top_control_traj_index + i) * getNumTimesteps() * DYN_T::CONTROL_DIM, this->sampler_->getVisControlSample(start_top_control_traj_index + i, 0, 0), this->sampler_->getControlSample(samples[i], 0, 0), // this->control_noise_d_ + samples[i] * getNumTimesteps() * DYN_T::CONTROL_DIM, sizeof(float) * getNumTimesteps() * DYN_T::CONTROL_DIM, cudaMemcpyDeviceToDevice, this->vis_stream_)); } if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(this->vis_stream_)); } } CONTROLLER_TEMPLATE void CONTROLLER::setCUDAStream(cudaStream_t stream) { stream_ = stream; model_->bindToStream(stream); cost_->bindToStream(stream); fb_controller_->bindToStream(stream); sampler_->bindToStream(stream); curandSetStream(gen_, stream); // requires the generator to be created! 
} CONTROLLER_TEMPLATE void CONTROLLER::createAndSeedCUDARandomNumberGen() { // Seed the PseudoRandomGenerator with the CPU time. curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT); setSeedCUDARandomNumberGen(this->params_.seed_); } CONTROLLER_TEMPLATE void CONTROLLER::setSeedCUDARandomNumberGen(unsigned seed) { // Seed the PseudoRandomGenerator with the CPU time. curandSetPseudoRandomGeneratorSeed(gen_, seed); // Reset the offset so setting the seed multiple times returns the same samples curandSetGeneratorOffset(gen_, 0); } CONTROLLER_TEMPLATE void CONTROLLER::allocateCUDAMemoryHelper(int nominal_size, bool allocate_double_noise) { if (nominal_size < 0) { nominal_size = 1; std::cerr << "nominal size cannot be below 0 when allocateCudaMemoryHelper is called" << std::endl; std::exit(-1); } else { // increment by 1 since actual is not included ++nominal_size; } HANDLE_ERROR(cudaMalloc((void**)&initial_state_d_, sizeof(float) * DYN_T::STATE_DIM * nominal_size)); HANDLE_ERROR(cudaMalloc((void**)&vis_initial_state_d_, sizeof(float) * DYN_T::STATE_DIM * nominal_size)); HANDLE_ERROR(cudaMalloc((void**)&control_d_, sizeof(float) * DYN_T::CONTROL_DIM * MAX_TIMESTEPS * nominal_size)); HANDLE_ERROR( cudaMalloc((void**)&output_d_, sizeof(float) * DYN_T::OUTPUT_DIM * MAX_TIMESTEPS * NUM_ROLLOUTS * nominal_size)); HANDLE_ERROR(cudaMalloc((void**)&trajectory_costs_d_, sizeof(float) * NUM_ROLLOUTS * nominal_size)); // HANDLE_ERROR(cudaMalloc((void**)&control_std_dev_d_, sizeof(float) * DYN_T::CONTROL_DIM)); // HANDLE_ERROR(cudaMalloc((void**)&control_noise_d_, sizeof(float) * DYN_T::CONTROL_DIM * MAX_TIMESTEPS * // NUM_ROLLOUTS * // (allocate_double_noise ? 
nominal_size : 1))); HANDLE_ERROR(cudaMalloc((void**)&cost_baseline_and_norm_d_, sizeof(float2) * nominal_size)); cost_baseline_and_norm_.resize(nominal_size, make_float2(0.0, 0.0)); CUDA_mem_init_ = true; } CONTROLLER_TEMPLATE void CONTROLLER::resizeSampledControlTrajectories(float perc, int multiplier, int top_num) { int num_sampled_trajectories = perc * NUM_ROLLOUTS + top_num; if (sampled_states_CUDA_mem_init_) { cudaFree(sampled_outputs_d_); // cudaFree(sampled_noise_d_); cudaFree(sampled_costs_d_); cudaFree(sampled_crash_status_d_); sampled_states_CUDA_mem_init_ = false; } sampled_trajectories_.resize(num_sampled_trajectories * multiplier, output_trajectory::Zero()); sampled_costs_.resize(num_sampled_trajectories * multiplier, cost_trajectory::Zero()); sampled_crash_status_.resize(num_sampled_trajectories * multiplier, crash_status_trajectory::Zero()); sampler_->setNumVisRollouts(num_sampled_trajectories); if (num_sampled_trajectories <= 0) { return; } HANDLE_ERROR(cudaMalloc((void**)&sampled_outputs_d_, sizeof(float) * DYN_T::OUTPUT_DIM * MAX_TIMESTEPS * num_sampled_trajectories * multiplier)); // HANDLE_ERROR(cudaMalloc((void**)&sampled_noise_d_, // sizeof(float) * DYN_T::CONTROL_DIM * MAX_TIMESTEPS * num_sampled_trajectories * // multiplier)); // +1 for terminal cost HANDLE_ERROR(cudaMalloc((void**)&sampled_costs_d_, sizeof(float) * (MAX_TIMESTEPS + 1) * num_sampled_trajectories * multiplier)); HANDLE_ERROR(cudaMalloc((void**)&sampled_crash_status_d_, sizeof(int) * MAX_TIMESTEPS * num_sampled_trajectories * multiplier)); sampled_states_CUDA_mem_init_ = true; } CONTROLLER_TEMPLATE std::vector CONTROLLER::getSampledNoise() { std::vector vector = std::vector(NUM_ROLLOUTS * getNumTimesteps() * DYN_T::CONTROL_DIM, FLT_MIN); HANDLE_ERROR(cudaMemcpyAsync(vector.data(), this->sampler_->getControlSample(0, 0, 0), sizeof(float) * NUM_ROLLOUTS * getNumTimesteps() * DYN_T::CONTROL_DIM, cudaMemcpyDeviceToHost, stream_)); HANDLE_ERROR(cudaStreamSynchronize(stream_)); 
return vector; } #undef CONTROLLER_TEMPLATE #undef CONTROLLER ================================================ FILE: include/mppi/controllers/controller.cuh ================================================ // // Created by jason on 10/30/19. // #ifndef MPPIGENERIC_CONTROLLER_CUH #define MPPIGENERIC_CONTROLLER_CUH #include #include #include #include "curand.h" #include #include #include #include #include #include #include struct freeEnergyEstimate { float increase = -1; float previousBaseline = -1; float freeEnergyMean = -1; float freeEnergyVariance = -1; float freeEnergyModifiedVariance = -1; float normalizerPercent = -1; }; struct MPPIFreeEnergyStatistics { int nominal_state_used = 0; freeEnergyEstimate nominal_sys; freeEnergyEstimate real_sys; }; enum class kernelType : int { USE_SINGLE_KERNEL = 0, // combine dynamics and cost calls into a single kernel USE_SPLIT_KERNELS, // separate kernels for dynamics and cost calls }; template struct ControllerParams { static const int TEMPLATED_STATE_DIM = S_DIM; static const int TEMPLATED_CONTROL_DIM = C_DIM; static const int TEMPLATED_MAX_TIMESTEPS = MAX_TIMESTEPS; float dt_; float lambda_ = 1.0; // Value of the temperature in the softmax. 
float alpha_ = 0.0; // // MAX_TIMESTEPS is defined as an upper bound, if lower that region is just ignored when calculating control // does not reallocate cuda memory int num_timesteps_ = MAX_TIMESTEPS; int num_iters_ = 1; // Number of optimization iterations unsigned seed_ = std::chrono::system_clock::now().time_since_epoch().count(); dim3 dynamics_rollout_dim_; dim3 cost_rollout_dim_; dim3 visualize_dim_ = dim3(32, 1, 1); int norm_exp_kernel_parallelization_ = 64; Eigen::Matrix init_control_traj_ = Eigen::Matrix::Zero(); Eigen::Matrix slide_control_scale_ = Eigen::Matrix::Zero(); }; template > class Controller { public: // EIGEN_MAKE_ALIGNED_OPERATOR_NEW /** * typedefs for access to templated class from outside classes */ typedef DYN_T TEMPLATED_DYNAMICS; typedef COST_T TEMPLATED_COSTS; typedef FB_T TEMPLATED_FEEDBACK; typedef PARAMS_T TEMPLATED_PARAMS; typedef SAMPLING_T TEMPLATED_SAMPLING; using TEMPLATED_FEEDBACK_STATE = typename FB_T::TEMPLATED_FEEDBACK_STATE; using TEMPLATED_FEEDBACK_PARAMS = typename FB_T::TEMPLATED_PARAMS; using TEMPLATED_FEEDBACK_GPU = typename FB_T::TEMPLATED_GPU_FEEDBACK; using TEMPLATED_SAMPLING_PARAMS = typename SAMPLING_T::SAMPLING_PARAMS_T; static const int TEMPLATED_FEEDBACK_TIMESTEPS = FB_T::FB_TIMESTEPS; /** * Aliases */ // Control typedefs using control_array = typename DYN_T::control_array; typedef Eigen::Matrix control_trajectory; // A control trajectory // State typedefs using state_array = typename DYN_T::state_array; typedef Eigen::Matrix state_trajectory; // A state trajectory // Output typedefs using output_array = typename DYN_T::output_array; typedef Eigen::Matrix output_trajectory; // An output trajectory // Cost typedefs typedef Eigen::Matrix cost_trajectory; // +1 for terminal cost typedef Eigen::Matrix sampled_cost_traj; typedef Eigen::Matrix crash_status_trajectory; Controller(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, float dt, int max_iter, float lambda, float alpha, int num_timesteps = 
MAX_TIMESTEPS, const Eigen::Ref& init_control_traj = control_trajectory::Zero(), cudaStream_t stream = nullptr) { // Create the random number generator createAndSeedCUDARandomNumberGen(); model_ = model; cost_ = cost; fb_controller_ = fb_controller; sampler_ = sampler; sampler_->setNumRollouts(NUM_ROLLOUTS); sampler_->setNumDistributions(1); TEMPLATED_PARAMS params; params.dt_ = dt; params.num_iters_ = max_iter; params.lambda_ = lambda; params.alpha_ = alpha; params.num_timesteps_ = num_timesteps; params.init_control_traj_ = init_control_traj; setNumTimesteps(params.num_timesteps_); setParams(params); control_ = init_control_traj; control_history_ = Eigen::Matrix::Zero(); // Bind the model and control to the given stream setCUDAStream(stream); // Create new stream for visualization purposes HANDLE_ERROR(cudaStreamCreate(&vis_stream_)); GPUSetup(); auto logger = std::make_shared(); setLogger(logger); /** * When implementing your own version make sure to write your own allocateCUDAMemory and call it from the * constructor along with any other methods to copy memory to the device and back */ // TODO pass function pointer? 
} Controller(DYN_T* model, COST_T* cost, FB_T* fb_controller, SAMPLING_T* sampler, PARAMS_T& params, cudaStream_t stream = nullptr) { model_ = model; cost_ = cost; fb_controller_ = fb_controller; sampler_ = sampler; sampler_->setNumRollouts(NUM_ROLLOUTS); sampler_->setNumDistributions(1); setNumTimesteps(params_.num_timesteps_); // Create the random number generator createAndSeedCUDARandomNumberGen(); setParams(params); control_ = params_.init_control_traj_; control_history_ = Eigen::Matrix::Zero(); // Bind the model and control to the given stream setCUDAStream(stream); // Create new stream for visualization purposes HANDLE_ERROR(cudaStreamCreate(&vis_stream_)); GPUSetup(); auto logger = std::make_shared(); setLogger(logger); /** * When implementing your own version make sure to write your own allocateCUDAMemory and call it from the * constructor along with any other methods to copy memory to the device and back */ // TODO pass function pointer? } // TODO should be private with test as a friend to ensure it is only used in testing Controller() = default; /** * Destructor must be virtual so that children are properly * destroyed when called from a basePlant reference */ virtual ~Controller() { // Free the CUDA memory of every object if (model_) { model_->freeCudaMem(); } if (cost_) { cost_->freeCudaMem(); } if (fb_controller_) { fb_controller_->freeCudaMem(); } if (sampler_) { sampler_->freeCudaMem(); } // Free the CUDA memory of the controller deallocateCUDAMemory(); } // =================== METHODS THAT SHOULD HAVE NO DEFAULT ========================== // ======== PURE VIRTUAL ========= /** * Given a state, calculates the optimal control sequence using MPPI according * to the cost function used as part of the template * @param state - the current state from which we would like to calculate * a control sequence */ virtual void computeControl(const Eigen::Ref& state, int optimization_stride) = 0; /** * Call a kernel to evaluate the sampled state trajectories for 
visualization * and debugging. */ virtual void calculateSampledStateTrajectories() = 0; // ================ END OF METHODS WITH NO DEFAULT ============= // ======== PURE VIRTUAL END ===== virtual std::string getControllerName() const { return "name not set"; }; virtual std::string getCostFunctionName() const { return cost_->getCostFunctionName(); } virtual std::string getDynamicsModelName() const { return model_->getDynamicsModelName(); } virtual std::string getSamplingDistributionName() const { return sampler_->getSamplingDistributionName(); } virtual std::string getFullName() const { return getControllerName() + "(" + getDynamicsModelName() + ", " + getCostFunctionName() + ", " + getSamplingDistributionName() + ")"; } virtual void initFeedback() { enable_feedback_ = true; fb_controller_->initTrackingController(); }; virtual void GPUSetup() { // Call the GPU setup functions of the model, cost, sampling distribution, and feedback controller model_->GPUSetup(); cost_->GPUSetup(); fb_controller_->GPUSetup(); sampler_->setVisStream(vis_stream_); sampler_->GPUSetup(); } virtual std::vector getSampledOutputTrajectories() const { return sampled_trajectories_; } virtual std::vector getSampledCostTrajectories() const { return sampled_costs_; } virtual std::vector getSampledCrashStatusTrajectories() const { return sampled_crash_status_; } virtual std::vector getTopTransformedCosts() const { return top_n_costs_; } virtual void chooseAppropriateKernel() { use_kernel_ = kernelType::USE_SPLIT_KERNELS; } /** * only used in rmppi, here for generic calls in base_plant. 
Jank as hell * @param state * @param stride */ void updateImportanceSamplingControl(const Eigen::Ref& state, int optimization_stride) { } /** * Used to update the importance sampler * @param nominal_control the new nominal control sequence to sample around */ virtual void updateImportanceSampler(const Eigen::Ref& nominal_control) { // TODO copy to device new control sequence control_ = nominal_control; } /** * determines the control that should * @param state * @param rel_time * @return */ virtual control_array getCurrentControl(Eigen::Ref state, double rel_time, Eigen::Ref target_nominal_state, Eigen::Ref c_traj, TEMPLATED_FEEDBACK_STATE& fb_state) { // MPPI control control_array u_ff = interpolateControls(rel_time, c_traj); control_array u_fb = control_array::Zero(); if (enable_feedback_) { u_fb = interpolateFeedback(state, target_nominal_state, rel_time, fb_state); } control_array result = u_ff + u_fb; state_array empty_state = model_->getZeroState(); model_->enforceConstraints(empty_state, result); return result; } /** * @param steps - number of dt's to slide control sequence forward * Slide the control sequence forwards by 'steps' */ virtual void slideControlSequence(int steps) { // Save the control history this->saveControlHistoryHelper(steps, this->control_, this->control_history_); this->slideControlSequenceHelper(steps, this->control_); } /** * determines the interpolated control from control_seq_, linear interpolation * @param rel_time time since the solution was calculated * @return */ virtual control_array interpolateControls(double rel_time, Eigen::Ref c_traj) { int lower_idx = (int)(rel_time / getDt()); int upper_idx = lower_idx + 1; double alpha = (rel_time - lower_idx * getDt()) / getDt(); control_array interpolated_control; control_array prev_cmd = c_traj.col(lower_idx); control_array next_cmd = c_traj.col(upper_idx); interpolated_control = (1 - alpha) * prev_cmd + alpha * next_cmd; return interpolated_control; } virtual state_array 
interpolateState(Eigen::Ref s_traj, double rel_time)
{
  // Same index/alpha scheme as interpolateControls(); the dynamics model does
  // the actual blending (e.g. to handle angle wrap-around correctly).
  int lower_idx = (int)(rel_time / getDt());
  int upper_idx = lower_idx + 1;
  double alpha = (rel_time - lower_idx * getDt()) / getDt();
  return model_->interpolateState(s_traj.col(lower_idx), s_traj.col(upper_idx), alpha);
}

/**
 * Delegates feedback interpolation to the feedback controller.
 * @param state current (measured) state
 * @param target_nominal_state nominal state being tracked
 * @param rel_time time since the solution was calculated
 * @param fb_state feedback-controller internal state
 * @return interpolated feedback control term
 */
virtual control_array interpolateFeedback(Eigen::Ref state, Eigen::Ref target_nominal_state, double rel_time,
                                          TEMPLATED_FEEDBACK_STATE& fb_state)
{
  return fb_controller_->interpolateFeedback_(state, target_nominal_state, rel_time, fb_state);
}

// Accessor for the CUDA random number generator used for sampling.
virtual curandGenerator_t getGenerator() const
{
  return gen_;
}

/**
 * returns the current control sequence
 */
virtual control_trajectory getControlSeq() const
{
  return control_;
};

/**
 * Gets the state sequence of the nominal trajectory
 */
virtual state_trajectory getTargetStateSeq() const
{
  return state_;
}

/**
 * Gets the output sequence of the nominal trajectory
 */
virtual output_trajectory getTargetOutputSeq() const
{
  return output_;
}

/**
 * Return all the sampled costs sequences
 */
virtual sampled_cost_traj getSampledCostSeq() const
{
  return trajectory_costs_;
};

/**
 * Return control feedback gains; a default-constructed state is returned
 * when feedback is disabled.
 */
// TODO: Think of a better name for this method?
virtual TEMPLATED_FEEDBACK_STATE getFeedbackState() const
{
  if (enable_feedback_)
  {
    return fb_controller_->getFeedbackState();
  }
  else
  {
    TEMPLATED_FEEDBACK_STATE default_state;
    return default_state;
  }
};

// Returns the feedback controller's params, or defaults when feedback is off.
virtual TEMPLATED_FEEDBACK_PARAMS getFeedbackParams() const
{
  if (enable_feedback_)
  {
    return fb_controller_->getParams();
  }
  else
  {
    TEMPLATED_FEEDBACK_PARAMS default_fb_params;
    return default_fb_params;
  }
}

// Indicator for algorithm health, should be between 0.01 and 0.1 anecdotally
float getNormalizerPercent() const
{
  return this->getNormalizerCost() / (float)NUM_ROLLOUTS;
}

/**
 * Computes the actual trajectory given the MPPI optimal control and the
 * feedback gains computed by DDP.
If feedback is not enabled, then we return * zero since this function would not make sense. */ virtual void computeFeedbackPropagatedStateSeq() { if (!enable_feedback_) { return; } // Compute the nominal trajectory propagated_feedback_state_trajectory_.col(0) = getActualStateSeq().col(0); // State that we optimized from state_array xdot; state_array current_state, next_state; output_array output; control_array current_control; for (int i = 0; i < getNumTimesteps() - 1; ++i) { current_state = propagated_feedback_state_trajectory_.col(i); // MPPI control apply feedback at the given timestep against the nominal trajectory at that timestep current_control = getControlSeq().col(i) + getFeedbackControl(current_state, getTargetStateSeq().col(i), i); model_->step(current_state, next_state, xdot, current_control, output, i, getDt()); propagated_feedback_state_trajectory_.col(i + 1) = next_state; } } /** * * @return State trajectory from optimized state with MPPI control and computed feedback gains */ state_trajectory getFeedbackPropagatedStateSeq() const { return propagated_feedback_state_trajectory_; }; float getBaselineCost(int ind = 0) const { return cost_baseline_and_norm_[ind].x; }; float getNormalizerCost(int ind = 0) const { return cost_baseline_and_norm_[ind].y; }; /** * returns the current state sequence */ state_trajectory getActualStateSeq() const { return state_; }; /** * returns the current output sequence */ output_trajectory getActualOutputSeq() const { return output_; }; virtual void computeFeedbackHelper(const Eigen::Ref& state, const Eigen::Ref& state_traj, const Eigen::Ref& control_traj) { if (!enable_feedback_) { return; } fb_controller_->computeFeedback(state, state_traj, control_traj); } virtual void computeFeedback(const Eigen::Ref& state) { computeFeedbackHelper(state, getTargetStateSeq(), getControlSeq()); } virtual control_array getFeedbackControl(const Eigen::Ref& state, const Eigen::Ref& goal_state, int t) { return fb_controller_->k(state, 
goal_state, t);
}

/**
 * Applies a 5-point smoothing filter (coefficients match the classic
 * Savitzky-Golay quadratic window — confirm intent) across the control
 * trajectory, using the 2-step control history to pad the front and the
 * last command to pad the back.
 * @param u control trajectory smoothed in place
 * @param control_history last two executed controls (one column each)
 */
void smoothControlTrajectoryHelper(Eigen::Ref u, const Eigen::Ref>& control_history)
{
  // TODO generalize to any size filter
  // TODO does the logic of handling control history reasonable?

  // Create the filter coefficients
  Eigen::Matrix filter_coefficients;
  filter_coefficients << -3, 12, 17, 12, -3;
  filter_coefficients /= 35.0;

  // Create and fill a control buffer that we can apply the convolution filter
  Eigen::MatrixXf control_buffer(getNumTimesteps() + 4, DYN_T::CONTROL_DIM);

  // Fill the first two timesteps with the control history
  control_buffer.topRows(2) = control_history.transpose();

  // Fill the center timesteps with the current nominal trajectory
  control_buffer.middleRows(2, getNumTimesteps()) = u.transpose();

  // Fill the last two timesteps with the end of the current nominal control trajectory
  control_buffer.row(getNumTimesteps() + 2) = u.transpose().row(getNumTimesteps() - 1);
  control_buffer.row(getNumTimesteps() + 3) = u.transpose().row(getNumTimesteps() - 1);

  // Apply convolutional filter to each timestep
  for (int i = 0; i < getNumTimesteps(); ++i)
  {
    u.col(i) = (filter_coefficients * control_buffer.middleRows(i, 5)).transpose();
  }
}

/**
 * Shifts the control trajectory left by 'steps' columns. Columns slid past
 * the end repeat the final command, scaled about the model's zero control by
 * params_.slide_control_scale_.
 * @param steps number of dt's to slide forward
 * @param u control trajectory modified in place
 */
virtual void slideControlSequenceHelper(int steps, Eigen::Ref u)
{
  for (int i = 0; i < getNumTimesteps(); ++i)
  {
    int ind = std::min(i + steps, getNumTimesteps() - 1);
    u.col(i) = u.col(ind);
    if (i + steps > getNumTimesteps() - 1)
    {
      // Past the horizon: decay/scale the repeated final command about zero_control_
      u.col(i) = (u.col(ind).array() - model_->zero_control_.array()) * params_.slide_control_scale_.array() +
                 model_->zero_control_.array();
    }
  }
}

/**
 * Records the last two executed controls into the fixed 2-column history
 * before the trajectory is slid.
 * @param steps how many dt's the sequence is about to slide
 * @param u_trajectory the current (pre-slide) control trajectory
 * @param u_history 2-column history buffer updated in place
 */
virtual void saveControlHistoryHelper(int steps, const Eigen::Ref& u_trajectory, Eigen::Ref> u_history)
{
  if (steps == 1)
  {  // We only moved one timestep
    u_history.col(0) = u_history.col(1);
    u_history.col(1) = u_trajectory.col(0);
  }
  else if (steps >= 2)
  {  // We have moved more than one timestep, but our history size is still only 2
    u_history.col(0) = u_trajectory.col(steps - 2);
    u_history.col(1) = u_trajectory.col(steps - 1);
  }
}

/**
 * Reset
Controls */ virtual void resetControls(){ // TODO }; virtual void computeStateTrajectoryHelper(Eigen::Ref result, const Eigen::Ref& x0, const Eigen::Ref& u) { result.col(0) = x0; state_array xdot; state_array state, next_state; output_array output; model_->initializeDynamics(result.col(0), u.col(0), output, 0, getDt()); for (int i = 0; i < getNumTimesteps() - 1; ++i) { state = result.col(i); control_array u_i = u.col(i); model_->enforceConstraints(state, u_i); model_->step(state, next_state, xdot, u_i, output, i, getDt()); result.col(i + 1) = next_state; } } virtual void computeOutputTrajectoryHelper(Eigen::Ref output_result, Eigen::Ref state_result, const Eigen::Ref& x0, const Eigen::Ref& u) { state_result.col(0) = x0; state_array xdot; state_array state, next_state; output_array output; model_->initializeDynamics(state_result.col(0), u.col(0), output, 0, getDt()); output_result.col(0) = output; for (int i = 0; i < getNumTimesteps() - 1; ++i) { state = state_result.col(i); control_array u_i = u.col(i); model_->enforceConstraints(state, u_i); model_->step(state, next_state, xdot, u_i, output, i, getDt()); state_result.col(i + 1) = next_state; output_result.col(i + 1) = output; } } void setNumTimesteps(int num_timesteps) { // TODO fix the tracking controller as well if ((num_timesteps <= MAX_TIMESTEPS) && (num_timesteps > 0)) { params_.num_timesteps_ = num_timesteps; } else { params_.num_timesteps_ = MAX_TIMESTEPS; printf("You must give a number of timesteps between [0, %d]\n", MAX_TIMESTEPS); } sampler_->setNumTimesteps(params_.num_timesteps_); } void setBaseline(float baseline, int index = 0) { cost_baseline_and_norm_[index].x = baseline; }; void setNormalizer(float normalizer, int index = 0) { cost_baseline_and_norm_[index].y = normalizer; }; int getNumTimesteps() const { return this->params_.num_timesteps_; } int getNormExpThreads() const { return this->params_.norm_exp_kernel_parallelization_; } /** * updates the scaling factor of noise for sampling around the 
nominal trajectory */ // void updateControlNoiseStdDev(const Eigen::Ref& sigma_u) // { // params_.control_std_dev_ = sigma_u; // copyControlStdDevToDevice(); // } void disableFeedbackController() { enable_feedback_ = false; } void setFeedbackParams(const TEMPLATED_FEEDBACK_PARAMS& fb_params) { fb_controller_->setParams(fb_params); } bool getFeedbackEnabled() const { return enable_feedback_; } float getPercentageSampledControlTrajectories() const { return perc_sampled_control_trajectories_; } /** * Set the percentage of sample control trajectories to copy * back from the GPU. Multiplier is an integer in case the nominal * control trajectories also need to be saved. */ void setPercentageSampledControlTrajectoriesHelper(float new_perc, int multiplier = 1) { perc_sampled_control_trajectories_ = new_perc; sample_multiplier_ = multiplier; resizeSampledControlTrajectories(perc_sampled_control_trajectories_, sample_multiplier_, num_top_control_trajectories_); } void setTopNSampledControlTrajectoriesHelper(int new_top_num_samples) { num_top_control_trajectories_ = new_top_num_samples; resizeSampledControlTrajectories(perc_sampled_control_trajectories_, sample_multiplier_, num_top_control_trajectories_); } void resizeSampledControlTrajectories(float perc, int multiplier, int top_num); int getNumberSampledTrajectories() const { return perc_sampled_control_trajectories_ * NUM_ROLLOUTS; } int getNumberTopControlTrajectories() const { return num_top_control_trajectories_; } int getTotalSampledTrajectories() const { return getNumberSampledTrajectories() + getNumberTopControlTrajectories(); } void setSlideControlScale(const Eigen::Ref& slide_control_scale) { params_.slide_control_scale_ = slide_control_scale; } /** * Return the most recent free energy calculation for the mean */ MPPIFreeEnergyStatistics getFreeEnergyStatistics() const { return free_energy_statistics_; } std::vector getSampledNoise(); /** * Public data members */ DYN_T* model_ = nullptr; COST_T* cost_ = nullptr; 
FB_T* fb_controller_ = nullptr; SAMPLING_T* sampler_ = nullptr; cudaStream_t stream_; cudaStream_t vis_stream_; float getDt() const { return params_.dt_; } void setDt(float dt) { params_.dt_ = dt; if (fb_controller_) { fb_controller_->setDt(dt); } } float getLambda() const { return params_.lambda_; } void setLambda(float lambda) { params_.lambda_ = lambda; } float getAlpha() const { return params_.alpha_; } void setAlpha(float alpha) { params_.alpha_ = alpha; } PARAMS_T getParams() const { return params_; } const TEMPLATED_SAMPLING_PARAMS getSamplingParams() const { return sampler_->getParams(); } void setSamplingParams(const TEMPLATED_SAMPLING_PARAMS& params, bool synchronize = true) { sampler_->setParams(params, synchronize); } void setParams(const PARAMS_T& p) { bool change_seed = p.seed_ != params_.seed_; bool change_num_timesteps = p.num_timesteps_ != params_.num_timesteps_; bool change_dt = p.dt_ != params_.dt_; // bool change_std_dev = p.control_std_dev_ != params_.control_std_dev_; params_ = p; if (change_num_timesteps) { setNumTimesteps(p.num_timesteps_); } if (change_seed) { setSeedCUDARandomNumberGen(params_.seed_); } if (change_dt) { fb_controller_->setDt(p.dt_); } } int getNumIters() const { return params_.num_iters_; } void setNumIters(int num_iter) { params_.num_iters_ = num_iter; } float getDebug() const { return debug_; } void setDebug(bool debug) { debug_ = debug; setLogLevel(mppi::util::LOG_LEVEL::DEBUG); } int getKernelChoiceAsInt() const { return static_cast(use_kernel_); } kernelType getKernelChoiceAsEnum() const { return use_kernel_; } int getNumKernelEvaluations() const { return num_kernel_evaluations_; } void setKernelChoice(const int& kernel_type) { use_kernel_ = static_cast(kernel_type); } void setKernelChoice(const kernelType& kernel_type) { use_kernel_ = kernel_type; } void setNumKernelEvaluations(const int& num_kernel_evaluations) { num_kernel_evaluations_ = num_kernel_evaluations; } void setCUDAStream(cudaStream_t stream); void 
setLogLevel(const mppi::util::LOG_LEVEL& level) { logger_->setLogLevel(level); model_->setLogLevel(level); cost_->setLogLevel(level); sampler_->setLogLevel(level); fb_controller_->setLogLevel(level); } void setLogger(const mppi::util::MPPILoggerPtr& logger) { logger_ = logger; model_->setLogger(logger); cost_->setLogger(logger); sampler_->setLogger(logger); fb_controller_->setLogger(logger); } mppi::util::MPPILoggerPtr getLogger() const { return logger_; } mppi::util::MPPILoggerPtr getLogger() { return logger_; } protected: // no default protected members void deallocateCUDAMemory(); PARAMS_T params_; mppi::util::MPPILoggerPtr logger_ = nullptr; // TODO get raw pointers for different things bool debug_ = false; int num_kernel_evaluations_ = 10; // number of kernel calls to do to determine which kernel to use kernelType use_kernel_ = kernelType::USE_SPLIT_KERNELS; // default is to use the split kernels // Free energy variables MPPIFreeEnergyStatistics free_energy_statistics_; // float normalizer_; // Variable for the normalizing term from sampling. // float baseline_ = 0; // Baseline cost of the system. 
float perc_sampled_control_trajectories_ = 0; // Percentage of sampled trajectories to return int sample_multiplier_ = 1; // How many nominal states we are keeping track of int num_top_control_trajectories_ = 0; // Top n sampled trajectories to visualize std::vector top_n_costs_; curandGenerator_t gen_; // float* control_std_dev_d_; // Array of size DYN_T::CONTROL_DIM float* initial_state_d_; // Array of sizae DYN_T::STATE_DIM * (2 if there is a nominal state) float* vis_initial_state_d_; // Array of sizae DYN_T::STATE_DIM * (2 if there is a nominal state) Eigen::Matrix control_history_; // one array of this size is allocated for each state we care about, // so it can be the size*N for N nominal states // [actual, nominal] float* control_d_; // Array of size DYN_T::CONTROL_DIM*NUM_TIMESTEPS*N float* output_d_; // Array of size DYN_T::OUTPUT_DIM*NUM_ROLLOUTS*N float* trajectory_costs_d_; // Array of size NUM_ROLLOUTS*N // float* control_noise_d_; // Array of size DYN_T::CONTROL_DIM*NUM_TIMESTEPS*NUM_ROLLOUTS*N float2* cost_baseline_and_norm_d_; // Array of size number of systems control_trajectory control_ = control_trajectory::Zero(); state_trajectory state_ = state_trajectory::Zero(); output_trajectory output_ = output_trajectory::Zero(); sampled_cost_traj trajectory_costs_ = sampled_cost_traj::Zero(); std::vector cost_baseline_and_norm_ = { make_float2(0.0, 0.0) }; bool CUDA_mem_init_ = false; bool sampled_states_CUDA_mem_init_ = false; // cudaMalloc, cudaFree boolean float* sampled_outputs_d_; // result of states that have been sampled from state trajectory kernel float* sampled_noise_d_; // noise to be passed to the state trajectory kernel float* sampled_costs_d_; // result of cost that have been sampled from state and cost trajectory kernel int* sampled_crash_status_d_; // result of crash_status that have been sampled std::vector sampled_trajectories_; // sampled state trajectories from state trajectory kernel std::vector sampled_costs_; std::vector 
sampled_crash_status_; // Propagated real state trajectory state_trajectory propagated_feedback_state_trajectory_ = state_trajectory::Zero(); // tracking controller variables bool enable_feedback_ = false; // void copyControlStdDevToDevice(bool synchronize = true); void copyNominalControlToDevice(bool synchronize = true); /** * Saves the sampled controls from the GPU back to the CPU * Must be called after the rolloutKernel as that is when * du_d becomes the sampled controls */ void copySampledControlFromDevice(bool synchronize = true); std::pair findMinIndexAndValue(std::vector& temp_list); void copyTopControlFromDevice(bool synchronize = true); void createAndSeedCUDARandomNumberGen(); void setSeedCUDARandomNumberGen(unsigned seed); /** * Allocates CUDA memory for actual states and nominal states if needed * @param nominal_size if only actual this should be 0 */ void allocateCUDAMemoryHelper(int nominal_size = 0, bool allocate_double_noise = true); // TODO all the copy to device functions to streamline process private: // ======== MUST BE OVERWRITTEN ========= void allocateCUDAMemory() { allocateCUDAMemoryHelper(); }; /** * TODO all copy to device and back functions implemented for specific controller * When you write your own you must control when synchronize stream is called */ // ======== END MUST BE OVERWRITTEN ===== }; #ifdef __CUDACC__ #include "controller.cu" #endif #endif // MPPIGENERIC_CONTROLLER_CUH ================================================ FILE: include/mppi/core/base_plant.hpp ================================================ /** * Created by Bogdan on 2/11/20. 
* Creates the API for interfacing with an MPPI controller * should define a compute_control based on state as well * as return timing info **/ #ifndef BASE_PLANT_H_ #define BASE_PLANT_H_ // Double check if these are included in mppi_common.h #include #include #include #include #include #include #include #include #include #include #include template class BasePlant { public: using c_array = typename CONTROLLER_T::control_array; using c_traj = typename CONTROLLER_T::control_trajectory; using s_array = typename CONTROLLER_T::state_array; using s_traj = typename CONTROLLER_T::state_trajectory; // using K_traj = typename CONTROLLER_T::feedback_gain_trajectory; using o_array = typename CONTROLLER_T::output_array; using o_traj = typename CONTROLLER_T::output_trajectory; using DYN_T = typename CONTROLLER_T::TEMPLATED_DYNAMICS; using DYN_PARAMS_T = typename DYN_T::DYN_PARAMS_T; using COST_T = typename CONTROLLER_T::TEMPLATED_COSTS; using COST_PARAMS_T = typename COST_T::COST_PARAMS_T; using TEMPLATED_CONTROLLER = CONTROLLER_T; using CONTROLLER_PARAMS_T = typename CONTROLLER_T::TEMPLATED_PARAMS; using SAMPLER_T = typename CONTROLLER_T::TEMPLATED_SAMPLING; using SAMPLER_PARAMS_T = typename CONTROLLER_T::TEMPLATED_SAMPLING_PARAMS; // Feedback related aliases using FB_STATE_T = typename CONTROLLER_T::TEMPLATED_FEEDBACK::TEMPLATED_FEEDBACK_STATE; typedef std::map buffer_trajectory; protected: std::mutex access_guard_; mppi::util::MPPILoggerPtr logger_ = nullptr; int hz_ = 10; // Frequency of control publisher int visualization_hz_ = 5; bool debug_mode_ = false; DYN_PARAMS_T dynamics_params_; std::mutex dynamics_params_guard_; COST_PARAMS_T cost_params_; std::mutex cost_params_guard_; CONTROLLER_PARAMS_T controller_params_; std::mutex controller_params_guard_; SAMPLER_PARAMS_T sampler_params_; std::mutex sampler_params_guard_; std::atomic has_new_dynamics_params_{ false }; std::atomic has_new_cost_params_{ false }; std::atomic has_new_controller_params_{ false }; std::atomic 
has_new_sampler_params_{ false }; std::atomic has_received_state_{ false }; // Values needed s_array init_state_ = s_array::Zero(); c_array init_u_ = c_array::Zero(); // Values updated at every time step s_array state_ = s_array::Zero(); c_array u_ = c_array::Zero(); // solution s_traj state_traj_; c_traj control_traj_; o_traj output_traj_; // values sometime updated // TODO init to zero? FB_STATE_T feedback_state_; // from ROSHandle mppi_node int optimization_stride_ = 1; int last_optimization_stride_ = 0; /** * From before while loop */ /** * Robot Time: based off of the time stamps from the state estimator * Wall Clock: always real time per the computer */ // Robot Time: can scale with a simulation std::atomic last_used_state_update_time_{ 0.0 }; // time of the last state update that was used for // optimization std::atomic state_time_{ 0.0 }; // Wall Clock: always real time double optimize_loop_duration_ = 0; // duration of the entire controller run loop double optimization_duration_ = 0; // Most recent time it took to run MPPI iteration double feedback_duration_ = 0; // most recent time it took to run the feedback controller double sleep_duration_ = 0; // how long the most recent loop in runControlLoop slept double avg_loop_time_ms_ = 0; // Average time it takes to run the controller double avg_optimize_time_ms_ = 0; // Average time it takes to runControlLoop double avg_feedback_time_ms_ = 0; // Average time it takes to run the feedback controller double avg_sleep_time_ms_ = 0; // Average time the runControlLoop sleeps between calls int num_iter_ = 0; // number of calls to computeControl /** * represents the status of the vehicle * 0: running normally * 1: not activated or no state information */ int status_ = 1; public: std::shared_ptr controller_; // EIGEN_MAKE_ALIGNED_OPERATOR_NEW BasePlant(std::shared_ptr controller, int hz, int optimization_stride) { controller_ = controller; hz_ = hz; optimization_stride_ = optimization_stride; control_traj_ = 
c_traj::Zero(); state_traj_ = s_traj::Zero(); dynamics_params_ = controller->model_->getParams(); cost_params_ = controller_->cost_->getParams(); init_state_ = controller_->model_->getZeroState(); state_ = controller_->model_->getZeroState(); auto logger = std::make_shared(); setLogger(logger); }; /** * Destructor must be virtual so that children are properly * destroyed when called from a BasePlant reference */ virtual ~BasePlant() = default; // ======== PURE VIRTUAL ========= /** * applies the control to the system * @param u */ virtual void pubControl(const c_array& u) = 0; /** * publishes the target nominal state * @param s */ virtual void pubNominalState(const s_array& s) = 0; virtual void pubFreeEnergyStatistics(MPPIFreeEnergyStatistics& fe_stats) = 0; /** * @brief Checks the system status. * @return An integer specifying the status. 0 means the system is operating * nominally, 1 means something is wrong but no action needs to be taken, * 2 means that the vehicle should stop immediately. 
*/ virtual int checkStatus() = 0; /** * @return the current time not necessarily real time */ virtual double getCurrentTime() = 0; /** * @return the timestamp from the most recent position */ virtual double getPoseTime() = 0; /** * gets the correct time for the state message that has been updated through updateState * @return */ virtual double getStateTime() { return state_time_; } // ======== PURE VIRTUAL END ==== s_traj getStateTraj() { return state_traj_; } c_traj getControlTraj() { return control_traj_; } FB_STATE_T getFeedbackState() { return feedback_state_; } /** * Return the latest state received * @return the latest state */ virtual s_array getState() { std::lock_guard lck(access_guard_); return state_; }; virtual void setState(const Eigen::Ref& state) { state_ = state; } virtual void setControl(const Eigen::Ref& u) { u_ = u; } virtual void setDebugMode(bool mode) { debug_mode_ = mode; } void resetStateTime() { last_used_state_update_time_ = -1; }; double getAvgOptimizationTime() const { return avg_optimize_time_ms_; }; int getTargetOptimizationStride() { return optimization_stride_; }; int getLastOptimizationStride() { return last_optimization_stride_; }; void setTargetOptimizationStride(int new_val) { optimization_stride_ = new_val; } int getHz() const { return hz_; } void setHz(int hz) { hz_ = hz; } int getVisualizationHz() const { return visualization_hz_; } void setVisualizationHz(int hz) { visualization_hz_ = hz; } virtual buffer_trajectory getSmoothedBuffer(double time) { throw std::logic_error("Invalid dynamics with current plant, it requires the buffered plant"); } virtual void setSolution(const Eigen::Ref& state_seq, const Eigen::Ref& control_seq, const Eigen::Ref& output_seq, const FB_STATE_T& fb_state, double timestamp) { last_used_state_update_time_ = timestamp; std::lock_guard guard(access_guard_); state_traj_ = state_seq; output_traj_ = output_seq; control_traj_ = control_seq; feedback_state_ = fb_state; num_iter_++; } /** * updates the 
state and publishes a new control * @param state the most recent state from state estimator * @param time the time of the most recent state from the state estimator */ virtual void updateState(Eigen::Ref state, double time) { // calculate and update all timing variables double temp_last_state_update_time = last_used_state_update_time_; double time_since_last_opt = time - temp_last_state_update_time; state_ = state; state_time_ = time; has_received_state_ = true; if (num_iter_ == 0) { // we have not optimized yet so no reason to publish controls return; } // check if the requested time is in the calculated trajectory bool t_within_trajectory = time >= temp_last_state_update_time && time < temp_last_state_update_time + controller_->getDt() * controller_->getNumTimesteps(); // time_since_last_opt checks if we have a new state or not essentially, only publish control on new state time if (time_since_last_opt > 0 && t_within_trajectory) { s_array target_nominal_state = this->controller_->interpolateState(state_traj_, time_since_last_opt); pubControl(controller_->getCurrentControl(state, time_since_last_opt, target_nominal_state, control_traj_, feedback_state_)); if (debug_mode_) { pubNominalState(target_nominal_state); } } } virtual bool hasNewDynamicsParams() { return has_new_dynamics_params_; }; virtual bool hasNewCostParams() { return has_new_cost_params_; }; virtual bool hasNewControllerParams() { return has_new_controller_params_; }; virtual bool hasNewSamplerParams() { return has_new_sampler_params_; }; virtual DYN_PARAMS_T getNewDynamicsParams(bool set_flag = false) { has_new_dynamics_params_ = set_flag; return dynamics_params_; } virtual COST_PARAMS_T getNewCostParams(bool set_flag = false) { has_new_cost_params_ = set_flag; return cost_params_; } virtual CONTROLLER_PARAMS_T getNewControllerParams(bool set_flag = false) { has_new_controller_params_ = set_flag; return controller_params_; } virtual SAMPLER_PARAMS_T getNewSamplerParams(bool set_flag = false) { 
has_new_sampler_params_ = set_flag; return sampler_params_; } virtual void setDynamicsParams(const DYN_PARAMS_T& params) { std::lock_guard guard(dynamics_params_guard_); dynamics_params_ = params; has_new_dynamics_params_ = true; } virtual void setCostParams(const COST_PARAMS_T& params) { std::lock_guard guard(cost_params_guard_); cost_params_ = params; has_new_cost_params_ = true; } virtual void setControllerParams(const CONTROLLER_PARAMS_T& params) { std::lock_guard guard(controller_params_guard_); controller_params_ = params; has_new_controller_params_ = true; } virtual void setSamplerParams(const SAMPLER_PARAMS_T& params) { std::lock_guard guard(sampler_params_guard_); sampler_params_ = params; has_new_sampler_params_ = true; } virtual void setLogger(const mppi::util::MPPILoggerPtr& logger) { logger_ = logger; controller_->setLogger(logger); } virtual void setLogLevel(const mppi::util::LOG_LEVEL& level) { logger_->setLogLevel(level); controller_->setLogLevel(level); } virtual mppi::util::MPPILoggerPtr getLogger() { return logger_; } virtual mppi::util::MPPILoggerPtr getLogger() const { return logger_; } /** * * @param controller * @param state * @return */ virtual bool updateParameters() { bool changed = false; // Update cost parameters if (hasNewCostParams()) { std::lock_guard guard(cost_params_guard_); changed = true; COST_PARAMS_T cost_params = getNewCostParams(); controller_->cost_->setParams(cost_params); } // Update dynamics params if (hasNewDynamicsParams()) { std::lock_guard guard(dynamics_params_guard_); changed = true; DYN_PARAMS_T dyn_params = getNewDynamicsParams(); controller_->model_->setParams(dyn_params); } // Update controller params if (hasNewControllerParams()) { std::lock_guard guard(controller_params_guard_); changed = true; CONTROLLER_PARAMS_T controller_params = getNewControllerParams(); controller_->setParams(controller_params); } // Update sampler params if (hasNewSamplerParams()) { std::lock_guard guard(sampler_params_guard_); changed 
= true; SAMPLER_PARAMS_T sampler_params = getNewSamplerParams(); controller_->setSamplingParams(sampler_params); } return changed; } /** * two concepts of time * 1. wall clock: how long it takes according to actual time to optimize * 2. robot time: how long has elapsed from the perspective of the robot (per the state estimator) * @param controller * @param is_alive * @return the millisecond number that the loop iteration started at */ void runControlIteration(std::atomic* is_alive) { std::chrono::steady_clock::time_point loop_start = std::chrono::steady_clock::now(); if (!is_alive->load()) { // break out if it should stop return; } double temp_last_state_time = getStateTime(); double temp_last_used_state_update_time = last_used_state_update_time_; // If it is the first iteration and we have received state, we should not wait for timestamps to differ bool skip_first_loop = num_iter_ == 0 && has_received_state_; // wait for a new state to compute control sequence from while (temp_last_used_state_update_time == temp_last_state_time && !skip_first_loop && is_alive->load()) { usleep(50); temp_last_state_time = getStateTime(); // In case when runControlIteration is ran before getting state and state time is specifically 0 skip_first_loop = num_iter_ == 0 && has_received_state_; } if (!is_alive->load()) { // if we have been stopped, exit return; } // updates the parameters if needed to ensure most up to date information updateParameters(); // gets the most recent possible state and time locked together so that we can ensure they are synced and up to date this->access_guard_.lock(); s_array state = state_; temp_last_state_time = state_time_; this->access_guard_.unlock(); // Check if there are nans in the initial state and don't continue if (!isfinite(state.sum())) { return; } if (this->controller_->model_->checkRequiresBuffer()) { std::lock_guard guard(dynamics_params_guard_); this->controller_->model_->updateFromBuffer(this->getSmoothedBuffer(temp_last_state_time)); 
HANDLE_ERROR(cudaStreamSynchronize(this->controller_->model_->stream_)); } // Check the robots status for this optimization int temp_status = checkStatus(); // calculate how much we should slide the control sequence double dt = temp_last_state_time - temp_last_used_state_update_time; if (num_iter_ == 0) { // // should only happen on the first iteration dt = 0; last_optimization_stride_ = 0; } else { last_optimization_stride_ = std::max(int(round(dt / this->controller_->getDt())), optimization_stride_); } // printf("calc optimization stride %f %f %f %d\n", dt, temp_last_used_state_update_time, temp_last_state_time, // last_optimization_stride_); // determine how long we should stride based off of robot time if (last_optimization_stride_ > 0 && last_optimization_stride_ < controller_->getNumTimesteps()) { controller_->updateImportanceSamplingControl(state, last_optimization_stride_); controller_->slideControlSequence(last_optimization_stride_); } // Compute a new control sequence std::chrono::steady_clock::time_point optimization_start = std::chrono::steady_clock::now(); controller_->computeControl(state, last_optimization_stride_); // Compute the nominal control sequence MPPIFreeEnergyStatistics fe_stats = controller_->getFreeEnergyStatistics(); c_traj control_traj = controller_->getControlSeq(); if (!control_traj.allFinite()) { std::cerr << "ERROR: Nan in control inside plant" << std::endl; std::cerr << control_traj << std::endl; throw std::runtime_error("Control Trajectory inside plant has a NaN"); } s_traj state_traj = controller_->getTargetStateSeq(); if (!state_traj.allFinite()) { std::cerr << "ERROR: Nan in state inside plant" << std::endl; std::cerr << state_traj << std::endl; throw std::runtime_error("State Trajectory inside plant has a NaN"); } o_traj output_traj = controller_->getTargetOutputSeq(); if (!output_traj.allFinite()) { std::cerr << "ERROR: Nan in output inside plant" << std::endl; std::cerr << output_traj << std::endl; throw 
std::runtime_error("Output Trajectory inside plant has a NaN"); } optimization_duration_ = mppi::math::timeDiffms(std::chrono::steady_clock::now(), optimization_start); std::chrono::steady_clock::time_point feedback_start = std::chrono::steady_clock::now(); // TODO make sure this is zero by default FB_STATE_T feedback_state; if (controller_->getFeedbackEnabled()) { controller_->computeFeedback(state); feedback_state = controller_->getFeedbackState(); } feedback_duration_ = mppi::math::timeDiffms(std::chrono::steady_clock::now(), feedback_start); // Set the updated solution for execution setSolution(state_traj, control_traj, output_traj, feedback_state, temp_last_state_time); status_ = temp_status; pubFreeEnergyStatistics(fe_stats); // calculate the propogated feedback trajectory controller_->computeFeedbackPropagatedStateSeq(); // Update the average loop time data double prev_iter_percent = (num_iter_ - 1.0) / num_iter_; avg_optimize_time_ms_ = prev_iter_percent * avg_optimize_time_ms_ + optimization_duration_ / num_iter_; avg_feedback_time_ms_ = prev_iter_percent * avg_feedback_time_ms_ + feedback_duration_ / num_iter_; optimize_loop_duration_ = mppi::math::timeDiffms(std::chrono::steady_clock::now(), loop_start); avg_loop_time_ms_ = prev_iter_percent * avg_loop_time_ms_ + optimize_loop_duration_ / num_iter_; } void runControlLoop(std::atomic* is_alive) { // Initial condition of the robot state_ = init_state_; // Initial control value u_ = init_u_; controller_->resetControls(); // Start the control loop. 
while (is_alive->load()) { double wait_until_real_time = getCurrentTime() + (1.0 / hz_) * optimization_stride_; runControlIteration(is_alive); double wait_until_state_time = last_used_state_update_time_ + (1.0 / hz_) * optimization_stride_; // printf("last used state update time %f last_stride = %d\n", last_used_state_update_time_, // last_optimization_stride_); printf("wait until time %f current time %f\n", wait_until_time, getCurrentTime()); std::chrono::steady_clock::time_point sleep_start = std::chrono::steady_clock::now(); while (is_alive->load() && wait_until_state_time > getStateTime()) { updateParameters(); usleep(50); } sleep_duration_ = mppi::math::timeDiffms(std::chrono::steady_clock::now(), sleep_start); double prev_iter_percent = (num_iter_ - 1.0) / num_iter_; avg_sleep_time_ms_ = prev_iter_percent * avg_sleep_time_ms_ + sleep_duration_ / num_iter_; // printf("sleep: %f loop_time %f at time %f", sleep_duration_, optimize_loop_duration_, getCurrentTime()); // std::cout << " loop ended at: " << // std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count() // << std::endl; } } }; #endif // BASE_PLANT_H_ ================================================ FILE: include/mppi/core/buffer.hpp ================================================ /** * @file buffer.hpp * @brief Buffer of states and controls * @author Bogdan Vlahov * @version 0.0.1 * @date 2024-05-04 */ #pragma once #include #include #include #include #include template struct BufferMessage { double time = -1; T data; bool required = true; BufferMessage(double time, T data) { this->time = time; this->data = data; } }; template class Buffer { public: using buffer_trajectory = typename DYN_T::buffer_trajectory; using c_array = typename DYN_T::control_array; void updateExtraValue(const std::string& name, float value, double time) { std::lock_guard guard(this->buffer_guard_); if (prev_extra_.find(name) == prev_extra_.end()) { prev_extra_.emplace(std::make_pair(name, 
std::list>())); } insertionSort(prev_extra_[name], time, value); } void updateControls(c_array& control, double time) { std::lock_guard guard(this->buffer_guard_); insertionSort(prev_controls_, time, control); } template static void insertionSort(std::list>& list, double time, T val) { if (list.empty()) { list.push_back(BufferMessage(time, val)); return; } for (auto it = list.rbegin(); it != list.rend(); it++) { if (it->time < time) { list.insert(it.base(), BufferMessage(time, val)); return; } } list.push_front(BufferMessage(time, val)); } template static void cleanList(std::list>& list, double time, double buffer_time_horizon) { if (list.empty()) { return; } auto it = list.begin(); // iterate until the time is greater than for (; (it != list.end() && it->time < time - buffer_time_horizon); it++) { } list.erase(list.begin(), it); } static Eigen::Quaternionf interp(std::list>& list, double time) { if (list.empty()) { return Eigen::Quaternionf::Identity(); } auto it = list.rbegin(); // iterate until the time is greater than for (; (it != list.rend() && it->time > time); it++) { } if (it == list.rbegin()) { return it->data; } // if we search entire list then use the first index auto it_new = std::prev(it); if (it == list.rend()) { return list.begin()->data; } double diff = it_new->time - it->time; double alpha = (time - it->time) / diff; // ensure we don't check beyond, i.e. 
use closest when interpolating outside bounds alpha = std::min(std::max(alpha, 0.0), 1.0); return it->data.slerp(alpha, it_new->data); } template static T interp(std::list>& list, double time) { if (list.empty()) { return T(); } auto it = list.rbegin(); // iterate until the time is greater than for (; (it != list.rend() && it->time > time); it++) { } if (it == list.rbegin()) { return it->data; } // if we search entire list then use the first index auto it_new = std::prev(it); if (it == list.rend()) { return list.begin()->data; } double diff = it_new->time - it->time; double alpha = (time - it->time) / diff; // ensure we don't check beyond, i.e. use closest when interpolating outside bounds alpha = std::min(std::max(alpha, 0.0), 1.0); return (1 - alpha) * it->data + alpha * it_new->data; } void updateOdometry(Eigen::Vector3f& pos, Eigen::Quaternionf& quat, Eigen::Vector3f& vel, Eigen::Vector3f& omega, double time) { this->buffer_guard_.lock(); // inserts odometry into buffers using insertion sort insertionSort(prev_position_, time, pos); insertionSort(prev_quaternion_, time, quat); insertionSort(prev_velocity_, time, vel); insertionSort(prev_omega_, time, omega); this->buffer_guard_.unlock(); } std::map getInterpState(double time) { std::lock_guard guard(this->buffer_guard_); std::map result; if (prev_position_.empty()) { return result; } Eigen::Vector3f interp_pos = interp(prev_position_, time); result["POS_X"] = interp_pos.x(); result["POS_Y"] = interp_pos.y(); result["POS_Z"] = interp_pos.z(); Eigen::Quaternionf interp_quat = interp(prev_quaternion_, time); result["Q_W"] = interp_quat.w(); result["Q_X"] = interp_quat.x(); result["Q_Y"] = interp_quat.y(); result["Q_Z"] = interp_quat.z(); float roll, pitch, yaw; mppi::math::Quat2EulerNWU(interp_quat, roll, pitch, yaw); result["ROLL"] = roll; result["PITCH"] = pitch; result["YAW"] = yaw; Eigen::Vector3f interp_vel = interp(prev_velocity_, time); result["VEL_X"] = interp_vel.x(); result["VEL_Y"] = interp_vel.y(); 
result["VEL_Z"] = interp_vel.z(); Eigen::Vector3f interp_omega = interp(prev_omega_, time); result["OMEGA_X"] = interp_omega.x(); result["OMEGA_Y"] = interp_omega.y(); result["OMEGA_Z"] = interp_omega.z(); auto controls_interp = interp(prev_controls_, time); for (int i = 0; i < controls_interp.rows(); i++) { result["CONTROL_" + std::to_string(i)] = controls_interp(i); } for (auto& it : prev_extra_) { result[it.first] = interp(it.second, time); } return result; } buffer_trajectory getSmoothedBuffer(double latest_time, double buffer_tau, double buffer_dt) { // does the latest state to make sure we have valid values std::map start_vals = getInterpState(latest_time); buffer_trajectory result; // if not enough data return empty message this->buffer_guard_.lock(); float time_diff = prev_position_.rbegin()->time - prev_position_.begin()->time; if (time_diff < buffer_tau) { std::cout << "not enough time for buffer, returning early" << prev_position_.rbegin()->time << " - " << prev_position_.begin()->time << " = " << time_diff << " < " << buffer_tau << std::endl; this->buffer_guard_.unlock(); return result; } this->buffer_guard_.unlock(); int steps = buffer_tau / buffer_dt + 1; for (const auto& start_val : start_vals) { result[start_val.first] = Eigen::VectorXf(steps); result[start_val.first](steps - 1) = start_val.second; } // goes from [t - tau, t) for (int t = 0; t <= steps - 1; t++) { // get query time double query_time = latest_time - (steps - 1) * buffer_dt + t * buffer_dt; std::map interp_vals = getInterpState(query_time); // interpolate values for (auto& interp_val : interp_vals) { result[interp_val.first](t) = interp_val.second; } } return result; } void cleanBuffers(double time, double horizon) { std::lock_guard guard(this->buffer_guard_); cleanList(prev_position_, time, horizon); cleanList(prev_quaternion_, time, horizon); cleanList(prev_velocity_, time, horizon); cleanList(prev_omega_, time, horizon); cleanList(prev_controls_, time, horizon); for (auto& it : 
prev_extra_) { cleanList(it.second, time, horizon); } } void clearBuffers() { std::lock_guard guard(this->buffer_guard_); prev_position_.clear(); prev_quaternion_.clear(); prev_velocity_.clear(); prev_omega_.clear(); prev_controls_.clear(); prev_extra_.clear(); } std::list> getPrevPositionList() { return prev_position_; } std::list> getPrevQuaternionList() { return prev_quaternion_; } std::list> getPrevVelocityList() { return prev_velocity_; } std::list> getPrevOmegaList() { return prev_omega_; } std::list> getPrevControlList() { return prev_controls_; } std::map>> getPrevExtraList() { return prev_extra_; } double getLatestOdomTime() { if (prev_position_.empty()) { return 0; } return prev_position_.rbegin()->time; } double getOldestOdomTime() { if (prev_position_.empty()) { return 0; } return prev_position_.begin()->time; } private: std::mutex buffer_guard_; std::list> prev_position_; std::list> prev_quaternion_; std::list> prev_velocity_; std::list> prev_omega_; std::list> prev_controls_; std::map>> prev_extra_; }; ================================================ FILE: include/mppi/core/buffered_plant.hpp ================================================ // // Created by jason on 7/20/21. 
// #ifndef MPPIGENERIC_BUFFERED_PLANT_H #define MPPIGENERIC_BUFFERED_PLANT_H #include "buffer.hpp" #include "base_plant.hpp" template class BufferedPlant : public BasePlant { public: using s_array = typename BasePlant::s_array; using c_array = typename BasePlant::c_array; using buffer_trajectory = typename BasePlant::buffer_trajectory; BufferedPlant(std::shared_ptr controller, int hz, int opt_stide) : BasePlant(controller, hz, opt_stide) { } void updateExtraValue(const std::string& name, float value, double time) { buffer_.updateExtraValue(name, value, time); } void updateControls(c_array& control, double time) { buffer_.updateControls(control, time); } void updateOdometry(Eigen::Vector3f& pos, Eigen::Quaternionf& quat, Eigen::Vector3f& vel, Eigen::Vector3f& omega, double time) { buffer_.updateOdometry(pos, quat, vel, omega, time); /** * Uses the most recent odometry information * If any other sources are more delayed it uses the most recent value * If other sources are more recent it gets interpolated to odom time */ std::map smoothed = buffer_.getInterpState(time); s_array state = this->controller_->model_->stateFromMap(smoothed); BasePlant::updateState(state, time); } std::map getInterpState(double time) { return buffer_.getInterpState(time); } bool updateParameters() { // removes extra values from the buffer double time = this->getStateTime(); buffer_.cleanBuffers(time, buffer_time_horizon_); return BasePlant::updateParameters(); } buffer_trajectory getSmoothedBuffer(double latest_time) { return buffer_.getSmoothedBuffer(latest_time, buffer_tau_, buffer_dt_); } void cleanBuffers(double time) { buffer_.cleanBuffers(time, buffer_time_horizon_); } void clearBuffers() { buffer_.clearBuffers(); } double getBufferDt() const { return buffer_dt_; } void setBufferDt(const double buff_dt) { buffer_dt_ = buff_dt; } protected: Buffer buffer_; double buffer_time_horizon_ = 2.0; // how long to store values in the buffer double buffer_tau_ = 1.0; // how in history to create 
well sampled positions from double buffer_dt_ = 0.02; // the spacing between well sampled buffer positions }; #endif // MPPIGENERIC_BUFFERED_PLANT_H ================================================ FILE: include/mppi/core/mppi_common.cu ================================================ #include #include #include #include #include // CUDA barriers were first implemented in CUDA 11 #if defined(CMAKE_USE_CUDA_BARRIERS) && defined(CUDART_VERSION) && CUDART_VERSION > 11000 #include using barrier = cuda::barrier; // Turn on/off various CUDA barriers from CMake configuration #ifdef CMAKE_USE_CUDA_BARRIERS_DYN #define USE_CUDA_BARRIERS_DYN #endif #ifdef CMAKE_USE_CUDA_BARRIERS_COST #define USE_CUDA_BARRIERS_COST #endif #ifdef CMAKE_USE_CUDA_BARRIERS_ROLLOUT #define USE_CUDA_BARRIERS_ROLLOUT #endif #endif #include namespace cg = cooperative_groups; namespace mp1 = mppi::p1; namespace mppi { namespace kernels { /******************************************************************************************************************* * Kernel Functions *******************************************************************************************************************/ template __global__ void rolloutKernel(DYN_T* __restrict__ dynamics, SAMPLING_T* __restrict__ sampling, COST_T* __restrict__ costs, float dt, const int num_timesteps, const int num_rollouts, const float* __restrict__ init_x_d, float lambda, float alpha, float* __restrict__ trajectory_costs_d) { // Get thread and block id const int thread_idx = threadIdx.x; const int thread_idy = threadIdx.y; const int thread_idz = threadIdx.z; const int block_idx = blockIdx.x; const int global_idx = blockDim.x * block_idx + thread_idx; const int shared_idx = blockDim.x * thread_idz + thread_idx; const int distribution_idx = threadIdx.z; const int distribution_dim = blockDim.z; const int sample_dim = blockDim.x; // Ensure that there is enough room for the SHARED_MEM_REQUEST_GRD_BYTES and SHARED_MEM_REQUEST_BLK_BYTES portions to // be 
  // aligned to the float4 boundary.
  const int size_of_theta_s_bytes = calcClassSharedMemSize(dynamics, blockDim);
  const int size_of_theta_d_bytes = calcClassSharedMemSize(sampling, blockDim);
  const int size_of_theta_c_bytes = calcClassSharedMemSize(costs, blockDim);
  // Create shared state and control arrays — the extern shared buffer is partitioned by hand below.
  extern __shared__ float entire_buffer[];
  float* x_shared = entire_buffer;
  float* x_next_shared = &x_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* y_shared = &x_next_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* x_dot_shared = &y_shared[math::nearest_multiple_4(sample_dim * DYN_T::OUTPUT_DIM * distribution_dim)];
  float* u_shared = &x_dot_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* theta_s_shared = &u_shared[math::nearest_multiple_4(sample_dim * DYN_T::CONTROL_DIM * distribution_dim)];
  float* theta_d_shared = &theta_s_shared[size_of_theta_s_bytes / sizeof(float)];
  float* theta_c_shared = &theta_d_shared[size_of_theta_d_bytes / sizeof(float)];
  float* running_cost_shared = &theta_c_shared[size_of_theta_c_bytes / sizeof(float)];
  int* crash_status_shared = (int*)&running_cost_shared[math::nearest_multiple_4(blockDim.x * blockDim.y * blockDim.z)];
#ifdef USE_CUDA_BARRIERS_ROLLOUT
  barrier* barrier_shared = (barrier*)&crash_status_shared[math::nearest_multiple_4(sample_dim * distribution_dim)];
#endif
  // Create local state, state dot and controls (per-thread views into the shared partitions)
  int running_cost_index = thread_idx + blockDim.x * (thread_idy + blockDim.y * thread_idz);
  float* x = &x_shared[shared_idx * DYN_T::STATE_DIM];
  float* x_next = &x_next_shared[shared_idx * DYN_T::STATE_DIM];
  float* x_temp;
  float* xdot = &x_dot_shared[shared_idx * DYN_T::STATE_DIM];
  float* u = &u_shared[shared_idx * DYN_T::CONTROL_DIM];
  float* y = &y_shared[shared_idx * DYN_T::OUTPUT_DIM];
  float* running_cost = &running_cost_shared[running_cost_index];
  running_cost[0] = 0.0f;
  int* crash_status = &crash_status_shared[shared_idx];
  crash_status[0] = 0;  // We have not crashed yet as of the first trajectory.
#ifdef USE_CUDA_BARRIERS_ROLLOUT
  barrier* bar = &barrier_shared[shared_idx];
  if (thread_idy == 0)
  {
    init(bar, blockDim.y);
  }
#endif
  // Load global array to shared array
  loadGlobalToShared(num_rollouts, blockDim.y, global_idx, thread_idy, thread_idz, init_x_d, x, xdot, u);
  __syncthreads();

  /*<----Start of simulation loop-----> */
  dynamics->initializeDynamics(x, u, y, theta_s_shared, 0.0f, dt);
  sampling->initializeDistributions(y, 0.0f, dt, theta_d_shared);
  costs->initializeCosts(y, u, theta_c_shared, 0.0f, dt);
  __syncthreads();
  for (int t = 0; t < num_timesteps; t++)
  {
    // Load noise trajectories scaled by the exploration factor
    sampling->readControlSample(global_idx, t, distribution_idx, u, theta_d_shared, blockDim.y, thread_idy, y);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // applies constraints as defined in dynamics.cuh see specific dynamics class for what happens here
    // usually just control clamping
    dynamics->enforceConstraints(x, u);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // Copy control constraints back to global memory
    sampling->writeControlSample(global_idx, t, distribution_idx, u, theta_d_shared, blockDim.y, thread_idy, y);
    // Increment states
    dynamics->step(x, x_next, xdot, u, y, theta_s_shared, t, dt);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    running_cost[0] += costs->computeRunningCost(y, u, t, theta_c_shared, crash_status) +
                       sampling->computeLikelihoodRatioCost(u, theta_d_shared, global_idx, t, distribution_idx, lambda, alpha);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // swap the current/next state buffers for the next step
    x_temp = x;
    x = x_next;
    x_next = x_temp;
  }
  // Add all costs together
  running_cost = &running_cost_shared[thread_idx + blockDim.x * blockDim.y * thread_idz];
  __syncthreads();
  costArrayReduction(running_cost, blockDim.y, thread_idy, blockDim.y, thread_idy == 0, blockDim.x);
  // Compute terminal cost and the final cost for each thread
  computeAndSaveCost(num_rollouts, num_timesteps, global_idx, costs, y, running_cost[0] / (num_timesteps),
                     theta_c_shared, trajectory_costs_d);
}

// Cost-only kernel: replays saved outputs y_d and accumulates running costs (one block per rollout).
// NOTE(review): template parameter list lost in extraction — restore from upstream.
template
__global__ void rolloutCostKernel(COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling, float dt,
                                  const int num_timesteps, const int num_rollouts, float lambda, float alpha,
                                  const float* __restrict__ y_d, float* __restrict__ trajectory_costs_d)
{
  // Get thread and block id
  const int thread_idx = threadIdx.x;
  const int thread_idy = threadIdx.y;
  const int thread_idz = threadIdx.z;
  const int global_idx = blockIdx.x;
  const int distribution_idx = threadIdx.z;
  const int size_of_theta_d_bytes = calcClassSharedMemSize(sampling, blockDim);
  const int size_of_theta_c_bytes = calcClassSharedMemSize(costs, blockDim);
  int running_cost_index = thread_idx + blockDim.x * (thread_idy + blockDim.y * thread_idz);
  // Create shared state and control arrays
  extern __shared__ float entire_buffer[];
  float* y_shared = entire_buffer;
  float* u_shared = &y_shared[math::nearest_multiple_4(blockDim.x * blockDim.z * COST_T::OUTPUT_DIM)];
  float* running_cost_shared = &u_shared[math::nearest_multiple_4(blockDim.x * blockDim.z * COST_T::CONTROL_DIM)];
  int* crash_status_shared = (int*)&running_cost_shared[math::nearest_multiple_4(blockDim.x * blockDim.y * blockDim.z)];
  float* theta_c = (float*)&crash_status_shared[math::nearest_multiple_4(blockDim.x * blockDim.z)];
  float* theta_d = &theta_c[size_of_theta_c_bytes / sizeof(float)];
#ifdef USE_CUDA_BARRIERS_COST
  barrier* barrier_shared = (barrier*)&theta_d[size_of_theta_d_bytes / sizeof(float)];
#endif
  // Create local state, state dot and controls
  float* y;
  float* u;
  int* crash_status;
  // Initialize running cost and total cost
  float* running_cost;
  int sample_time_offset = 0;
  // Load global array to shared array
  y = &y_shared[(blockDim.x *
                thread_idz + thread_idx) * COST_T::OUTPUT_DIM];
  u = &u_shared[(blockDim.x * thread_idz + thread_idx) * COST_T::CONTROL_DIM];
  crash_status = &crash_status_shared[thread_idz * blockDim.x + thread_idx];
  crash_status[0] = 0;  // We have not crashed yet as of the first trajectory.
  running_cost = &running_cost_shared[running_cost_index];
  running_cost[0] = 0.0f;
#ifdef USE_CUDA_BARRIERS_COST
  barrier* bar = &barrier_shared[(blockDim.x * thread_idz + thread_idx)];
  if (thread_idy == 0)
  {
    init(bar, blockDim.y);
  }
#endif

  /*<----Start of simulation loop-----> */
  // Each x-thread handles a different timestep per iteration, so iterate ceil(T / blockDim.x) times.
#ifdef USE_COST_WITH_OFF_NUM_TIMESTEPS
  const int max_time_iters = ceilf((float)(num_timesteps - 2) / blockDim.x);
#else
  const int max_time_iters = ceilf((float)num_timesteps / blockDim.x);
#endif
  costs->initializeCosts(y, u, theta_c, 0.0f, dt);
  sampling->initializeDistributions(y, 0.0f, dt, theta_d);
  __syncthreads();
  for (int time_iter = 0; time_iter < max_time_iters; ++time_iter)
  {
    int t = thread_idx + time_iter * blockDim.x;
    if (COALESCE)
    {  // Fill entire shared mem sequentially using sequential threads_idx
      int amount_to_fill = (time_iter + 1) * blockDim.x > num_timesteps ?
                           num_timesteps % blockDim.x : blockDim.x;
      mp1::loadArrayParallel(
          y_shared, blockDim.x * thread_idz * COST_T::OUTPUT_DIM, y_d,
          ((num_rollouts * thread_idz + global_idx) * num_timesteps + time_iter * blockDim.x) * COST_T::OUTPUT_DIM,
          COST_T::OUTPUT_DIM * amount_to_fill);
    }
    else if (t < num_timesteps)
    {  // t = num_timesteps is the terminal state for outside this for-loop
      sample_time_offset = (num_rollouts * thread_idz + global_idx) * num_timesteps + t;
      mp1::loadArrayParallel(y, 0, y_d, sample_time_offset * COST_T::OUTPUT_DIM);
    }
    if (t < num_timesteps)
    {  // load controls from t = 0 to t = num_timesteps - 1
      sampling->readControlSample(global_idx, t, distribution_idx, u, theta_d, blockDim.y, thread_idy, y);
    }
#ifdef USE_CUDA_BARRIERS_COST
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // Compute cost
    if (t < num_timesteps)
    {
      running_cost[0] += costs->computeRunningCost(y, u, t, theta_c, crash_status) +
                         sampling->computeLikelihoodRatioCost(u, theta_d, global_idx, t, distribution_idx, lambda, alpha);
    }
#ifdef USE_CUDA_BARRIERS_COST
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
  }
  // Add all costs together
  running_cost = &running_cost_shared[blockDim.x * blockDim.y * thread_idz];
  __syncthreads();
  costArrayReduction(running_cost, blockDim.x * blockDim.y, thread_idx + blockDim.x * thread_idy,
                     blockDim.x * blockDim.y, thread_idx == blockDim.x - 1 && thread_idy == 0);
  // point every thread to the last output at t = NUM_TIMESTEPS for terminal cost calculation
  const int last_y_index = (num_timesteps - 1) % blockDim.x;
  y = &y_shared[(blockDim.x * thread_idz + last_y_index) * COST_T::OUTPUT_DIM];
#ifdef USE_COST_WITH_OFF_NUM_TIMESTEPS
  // load last output array
  const int t = num_timesteps - 1;
  mp1::loadArrayParallel(
      y_shared, (blockDim.x * thread_idz + last_y_index) * COST_T::OUTPUT_DIM, y_d,
      ((global_idx + num_rollouts * thread_idz) * num_timesteps + t) * COST_T::OUTPUT_DIM, COST_T::OUTPUT_DIM);
  __syncthreads();
#endif
  // Compute terminal cost and the final cost for
  // each thread
  computeAndSaveCost(num_rollouts, num_timesteps, global_idx, costs, y, running_cost[0] / (num_timesteps), theta_c,
                     trajectory_costs_d);
}

// Dynamics-only kernel: samples controls, steps the dynamics, and writes every output y to
// global memory y_d for a later cost pass.
// NOTE(review): template parameter list lost in extraction — restore from upstream.
template
__global__ void rolloutDynamicsKernel(DYN_T* __restrict__ dynamics, SAMPLING_T* __restrict__ sampling, float dt,
                                      const int num_timesteps, const int num_rollouts,
                                      const float* __restrict__ init_x_d, float* __restrict__ y_d)
{
  // Get thread and block id
  const int thread_idx = threadIdx.x;
  const int thread_idy = threadIdx.y;
  const int thread_idz = threadIdx.z;
  const int block_idx = blockIdx.x;
  const int global_idx = blockDim.x * block_idx + thread_idx;
  const int shared_idx = blockDim.x * thread_idz + thread_idx;
  const int distribution_idx = threadIdx.z;
  const int distribution_dim = blockDim.z;
  const int sample_dim = blockDim.x;
  // Ensure that there is enough room for the SHARED_MEM_REQUEST_GRD_BYTES and SHARED_MEM_REQUEST_BLK_BYTES portions to
  // be aligned to the float4 boundary.
  const int size_of_theta_s_bytes = calcClassSharedMemSize(dynamics, blockDim);
  const int size_of_theta_d_bytes = calcClassSharedMemSize(sampling, blockDim);
  // Create shared state and control arrays
  extern __shared__ float entire_buffer[];
  float* x_shared = entire_buffer;
  float* x_next_shared = &x_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* y_shared = &x_next_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* x_dot_shared = &y_shared[math::nearest_multiple_4(sample_dim * DYN_T::OUTPUT_DIM * distribution_dim)];
  float* u_shared = &x_dot_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* theta_s_shared = &u_shared[math::nearest_multiple_4(sample_dim * DYN_T::CONTROL_DIM * distribution_dim)];
  float* theta_d_shared = &theta_s_shared[size_of_theta_s_bytes / sizeof(float)];
#ifdef USE_CUDA_BARRIERS_DYN
  barrier* barrier_shared = (barrier*)&theta_d_shared[size_of_theta_d_bytes / sizeof(float)];
#endif
  // Create local state, state dot and controls
  float* x = &x_shared[shared_idx * DYN_T::STATE_DIM];
  float* x_next = &x_next_shared[shared_idx * DYN_T::STATE_DIM];
  float* x_temp;
  float* xdot = &x_dot_shared[shared_idx * DYN_T::STATE_DIM];
  float* u = &u_shared[shared_idx * DYN_T::CONTROL_DIM];
  float* y = &y_shared[shared_idx * DYN_T::OUTPUT_DIM];
#ifdef USE_CUDA_BARRIERS_DYN
  barrier* bar = &barrier_shared[shared_idx];
  if (thread_idy == 0)
  {
    init(bar, blockDim.y);
  }
#endif
  // Load global array to shared array
  loadGlobalToShared(num_rollouts, blockDim.y, global_idx, thread_idy, thread_idz, init_x_d, x, xdot, u);
  __syncthreads();

  /*<----Start of simulation loop-----> */
  dynamics->initializeDynamics(x, u, y, theta_s_shared, 0.0f, dt);
  sampling->initializeDistributions(y, 0.0f, dt, theta_d_shared);
  __syncthreads();
  for (int t = 0; t < num_timesteps; t++)
  {
    // Load noise trajectories scaled by the exploration factor
    sampling->readControlSample(global_idx, t, distribution_idx, u, theta_d_shared, blockDim.y, thread_idy, y);
#ifdef USE_CUDA_BARRIERS_DYN
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // applies constraints as defined in dynamics.cuh see specific dynamics class for what happens here
    // usually just control clamping
    dynamics->enforceConstraints(x, u);
#ifdef USE_CUDA_BARRIERS_DYN
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // Copy control constraints back to global memory
    sampling->writeControlSample(global_idx, t, distribution_idx, u, theta_d_shared, blockDim.y, thread_idy, y);
    // Increment states
    dynamics->step(x, x_next, xdot, u, y, theta_s_shared, t, dt);
#ifdef USE_CUDA_BARRIERS_DYN
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // swap the current/next state buffers for the next step
    x_temp = x;
    x = x_next;
    x_next = x_temp;
    // Copy state to global memory
    int sample_time_offset = (num_rollouts * thread_idz + global_idx) * num_timesteps + t;
    mp1::loadArrayParallel(y_d, sample_time_offset * DYN_T::OUTPUT_DIM, y, 0);
  }
}

// Visualization variant of the rollout kernel: also records per-timestep costs and crash status.
// NOTE(review): template parameter list lost in extraction — restore from upstream.
template
__global__ void visualizeKernel(DYN_T* __restrict__ dynamics, SAMPLING_T*
                                __restrict__ sampling, COST_T* __restrict__ costs, float dt, const int num_timesteps,
                                const int num_rollouts, const float* __restrict__ init_x_d, float lambda, float alpha,
                                float* __restrict__ y_d, float* __restrict__ cost_traj_d,
                                int* __restrict__ crash_status_d)
{
  // Get thread and block id
  const int thread_idx = threadIdx.x;
  const int thread_idy = threadIdx.y;
  const int thread_idz = threadIdx.z;
  const int block_idx = blockIdx.x;
  const int global_idx = blockDim.x * block_idx + thread_idx;
  const int shared_idx = blockDim.x * thread_idz + thread_idx;
  const int distribution_idx = threadIdx.z;
  const int distribution_dim = blockDim.z;
  const int sample_dim = blockDim.x;
  // Ensure that there is enough room for the SHARED_MEM_REQUEST_GRD_BYTES and SHARED_MEM_REQUEST_BLK_BYTES portions to
  // be aligned to the float4 boundary.
  const int size_of_theta_s_bytes = calcClassSharedMemSize(dynamics, blockDim);
  const int size_of_theta_d_bytes = calcClassSharedMemSize(sampling, blockDim);
  const int size_of_theta_c_bytes = calcClassSharedMemSize(costs, blockDim);
  // Create shared state and control arrays
  extern __shared__ float entire_buffer[];
  float* x_shared = entire_buffer;
  float* x_next_shared = &x_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* y_shared = &x_next_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* x_dot_shared = &y_shared[math::nearest_multiple_4(sample_dim * DYN_T::OUTPUT_DIM * distribution_dim)];
  float* u_shared = &x_dot_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* theta_s_shared = &u_shared[math::nearest_multiple_4(sample_dim * DYN_T::CONTROL_DIM * distribution_dim)];
  float* theta_d_shared = &theta_s_shared[size_of_theta_s_bytes / sizeof(float)];
  float* theta_c_shared = &theta_d_shared[size_of_theta_d_bytes / sizeof(float)];
  // running_cost_shared here holds a per-timestep cost row, unlike the rollout kernel
  float* running_cost_shared = &theta_c_shared[size_of_theta_c_bytes / sizeof(float)];
  int* crash_status_shared
      = (int*)&running_cost_shared[math::nearest_multiple_4(num_timesteps * blockDim.y * blockDim.z)];
#ifdef USE_CUDA_BARRIERS_ROLLOUT
  barrier* barrier_shared = (barrier*)&crash_status_shared[math::nearest_multiple_4(sample_dim * distribution_dim)];
#endif
  // Create local state, state dot and controls
  float* x = &x_shared[shared_idx * DYN_T::STATE_DIM];
  float* x_next = &x_next_shared[shared_idx * DYN_T::STATE_DIM];
  float* x_temp;
  float* xdot = &x_dot_shared[shared_idx * DYN_T::STATE_DIM];
  float* u = &u_shared[shared_idx * DYN_T::CONTROL_DIM];
  float* y = &y_shared[shared_idx * DYN_T::OUTPUT_DIM];
  // NOTE(review): this row pointer uses a blockDim.x stride while the buffer above is sized by
  // num_timesteps rows and the copy-out below strides by num_timesteps — confirm upstream.
  float* running_cost = &running_cost_shared[blockDim.x * (thread_idz * blockDim.y + thread_idy)];
  int* crash_status = &crash_status_shared[shared_idx];
  crash_status[0] = 0;  // We have not crashed yet as of the first trajectory.
  int cost_index;
#ifdef USE_CUDA_BARRIERS_ROLLOUT
  barrier* bar = &barrier_shared[shared_idx];
  if (thread_idy == 0)
  {
    init(bar, blockDim.y);
  }
#endif
  // Load global array to shared array
  loadGlobalToShared(num_rollouts, blockDim.y, global_idx, thread_idy, thread_idz, init_x_d, x, xdot, u);
  __syncthreads();

  /*<----Start of simulation loop-----> */
  dynamics->initializeDynamics(x, u, y, theta_s_shared, 0.0f, dt);
  sampling->initializeDistributions(y, 0.0f, dt, theta_d_shared);
  costs->initializeCosts(y, u, theta_c_shared, 0.0f, dt);
  __syncthreads();
  for (int t = 0; t < num_timesteps; t++)
  {
    // Load noise trajectories scaled by the exploration factor
    sampling->readVisControlSample(global_idx, t, distribution_idx, u, theta_d_shared, blockDim.y, thread_idy, y);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // applies constraints as defined in dynamics.cuh see specific dynamics class for what happens here
    // usually just control clamping
    dynamics->enforceConstraints(x, u);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // Increment states
    dynamics->step(x, x_next, xdot, u, y, theta_s_shared, t, dt);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    if (t > 0)
    {
      // record the per-timestep cost and crash status for visualization
      float cost = costs->computeRunningCost(y, u, t, theta_c_shared, crash_status) +
                   sampling->computeLikelihoodRatioCost(u, theta_d_shared, global_idx, t, distribution_idx, lambda, alpha);
      running_cost[t - 1] = cost / (num_timesteps);
      crash_status_d[global_idx * num_timesteps + t] = crash_status[0];
    }
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    x_temp = x;
    x = x_next;
    x_next = x_temp;
    // Copy state to global memory
    int sample_time_offset = (num_rollouts * thread_idz + global_idx) * num_timesteps + t;
    mp1::loadArrayParallel(y_d, sample_time_offset * DYN_T::OUTPUT_DIM, y, 0);
  }
  // Add all thread_y components of cost together
  running_cost = &running_cost_shared[thread_idx + blockDim.x * blockDim.y * thread_idz];
  __syncthreads();
  costArrayReduction(running_cost, blockDim.y, thread_idy, blockDim.y, thread_idy == blockDim.y - 1, blockDim.x);
  // Compute terminal cost for each thread
  if (threadIdx.x == 0 && threadIdx.y == 0)
  {
    cost_index = (threadIdx.z * num_rollouts + global_idx) * (num_timesteps + 1) + num_timesteps;
    cost_traj_d[cost_index] = costs->terminalCost(y, theta_c_shared) / (num_timesteps);
  }
  __syncthreads();
  // Copy to global memory, vectorizing with float4/float2 when num_timesteps allows
  // NOTE(review): the reinterpret_cast target types (float4*/float2*) were lost in extraction.
  int parallel_index, step;
  mp1::getParallel1DIndex(parallel_index, step);
  if (num_timesteps % 4 == 0)
  {
    float4* cost_traj_d4 = reinterpret_cast(&cost_traj_d[(thread_idz * num_rollouts + global_idx) * num_timesteps]);
    float4* running_cost_shared4 = reinterpret_cast(&running_cost_shared[thread_idz * num_timesteps * blockDim.y]);
    for (int i = parallel_index; i < num_timesteps / 4; i += step)
    {
      cost_traj_d4[i] = running_cost_shared4[i];
    }
  }
  else if (num_timesteps % 2 == 0)
  {
    float2* cost_traj_d2 = reinterpret_cast(&cost_traj_d[(thread_idz * num_rollouts + global_idx) * num_timesteps]);
    float2* running_cost_shared2 = reinterpret_cast(&running_cost_shared[thread_idz *
num_timesteps * blockDim.y]); for (int i = parallel_index; i < num_timesteps / 2; i += step) { cost_traj_d2[i] = running_cost_shared2[i]; } } else { for (int i = parallel_index; i < num_timesteps; i += step) { cost_traj_d[(thread_idz * num_rollouts + global_idx) * num_timesteps + i] = running_cost_shared[thread_idz * num_timesteps * blockDim.y + i]; } } } template __global__ void visualizeCostKernel(COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps, const int num_rollouts, const float lambda, float alpha, const float* __restrict__ y_d, float* __restrict__ cost_traj_d, int* __restrict__ crash_status_d) { // Get thread and block id const int thread_idx = threadIdx.x; const int thread_idy = threadIdx.y; const int thread_idz = threadIdx.z; const int global_idx = blockIdx.x; const int shared_idx = blockDim.x * thread_idz + thread_idx; const int distribution_idx = threadIdx.z; const int size_of_theta_c_bytes = calcClassSharedMemSize(costs, blockDim); // Create shared state and control arrays extern __shared__ float entire_buffer[]; float* y_shared = entire_buffer; float* u_shared = &y_shared[math::nearest_multiple_4(blockDim.x * blockDim.z * COST_T::OUTPUT_DIM)]; float* running_cost_shared = &u_shared[math::nearest_multiple_4(blockDim.x * blockDim.z * COST_T::CONTROL_DIM)]; int* crash_status_shared = (int*)&running_cost_shared[math::nearest_multiple_4(blockDim.y * blockDim.z * num_timesteps)]; float* theta_c = (float*)&crash_status_shared[math::nearest_multiple_4(blockDim.x * blockDim.z)]; float* theta_d = &theta_c[size_of_theta_c_bytes / sizeof(float)]; #ifdef USE_CUDA_BARRIERS_COST const int size_of_theta_d_bytes = calcClassSharedMemSize(sampling, blockDim); barrier* barrier_shared = (barrier*)&theta_d[size_of_theta_d_bytes / sizeof(float)]; #endif // Create local state, state dot and controls float* y; float* u; int* crash_status; // Initialize running cost and total cost float* running_cost; int sample_time_offset = 0; 
int cost_index = 0;
  // Load global array to shared array; each thread owns one output/control slot.
  y = &y_shared[shared_idx * COST_T::OUTPUT_DIM];
  u = &u_shared[shared_idx * COST_T::CONTROL_DIM];
  crash_status = &crash_status_shared[shared_idx];
  crash_status[0] = 0;  // We have not crashed yet as of the first trajectory.
#ifdef USE_CUDA_BARRIERS_COST
  barrier* bar = &barrier_shared[(blockDim.x * thread_idz + thread_idx)];
  if (thread_idy == 0)
  {
    init(bar, blockDim.y);
  }
#endif
  /*<----Start of simulation loop-----> */
#ifdef USE_COST_WITH_OFF_NUM_TIMESTEPS
  const int max_time_iters = ceilf((float)(num_timesteps - 2) / blockDim.x);
#else
  const int max_time_iters = ceilf((float)num_timesteps / blockDim.x);
#endif
  costs->initializeCosts(y, u, theta_c, 0.0f, dt);
  sampling->initializeDistributions(y, 0.0f, dt, theta_d);
  __syncthreads();
  for (int time_iter = 0; time_iter < max_time_iters; ++time_iter)
  {
    int t = thread_idx + time_iter * blockDim.x;
    cost_index = (thread_idz * num_rollouts + global_idx) * (num_timesteps) + t - 1;
    running_cost = &running_cost_shared[blockDim.x * (thread_idz * blockDim.y + thread_idy) + t - 1];
    if (COALESCE)
    {  // Fill entire shared mem sequentially using sequential threads_idx
      // Last iteration may be partial: only copy the remaining timesteps.
      int amount_to_fill = (time_iter + 1) * blockDim.x > num_timesteps ? num_timesteps % blockDim.x : blockDim.x;
      mp1::loadArrayParallel(
          y_shared, blockDim.x * thread_idz * COST_T::OUTPUT_DIM, y_d,
          ((num_rollouts * thread_idz + global_idx) * num_timesteps + time_iter * blockDim.x) * COST_T::OUTPUT_DIM,
          COST_T::OUTPUT_DIM * amount_to_fill);
    }
    else if (t < num_timesteps)
    {  // t = num_timesteps is the terminal state for outside this for-loop
      sample_time_offset = (num_rollouts * thread_idz + global_idx) * num_timesteps + t;
      mp1::loadArrayParallel(y, 0, y_d, sample_time_offset * COST_T::OUTPUT_DIM);
    }
    if (t < num_timesteps)
    {  // load controls from t = 0 to t = num_timesteps - 1
      sampling->readVisControlSample(global_idx, t, distribution_idx, u, theta_d, blockDim.y, thread_idy, y);
    }
#ifdef USE_CUDA_BARRIERS_COST
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // Compute cost
    if (t < num_timesteps)
    {
      float cost = costs->computeRunningCost(y, u, t, theta_c, crash_status) +
                   sampling->computeLikelihoodRatioCost(u, theta_d, global_idx, t, distribution_idx, lambda, alpha);
      running_cost[0] = cost / (num_timesteps);
      crash_status_d[global_idx * num_timesteps + t] = crash_status[0];
    }
#ifdef USE_CUDA_BARRIERS_COST
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
  }
  // consolidate y threads into single cost
  running_cost = &running_cost_shared[thread_idx + blockDim.x * blockDim.y * thread_idz];
  __syncthreads();
  costArrayReduction(running_cost, blockDim.y, thread_idy, blockDim.y, thread_idy == blockDim.y - 1, blockDim.x);
  // point every thread to the last output at t = NUM_TIMESTEPS for terminal cost calculation
  const int last_y_index = (num_timesteps - 1) % blockDim.x;
  y = &y_shared[(blockDim.x * thread_idz + last_y_index) * COST_T::OUTPUT_DIM];
#ifdef USE_COST_WITH_OFF_NUM_TIMESTEPS
  // load last output array
  const int t = num_timesteps - 1;
  mp1::loadArrayParallel(
      y_shared, (blockDim.x * thread_idz + last_y_index) * COST_T::OUTPUT_DIM, y_d,
      ((global_idx + num_rollouts * thread_idz) * num_timesteps + t) * COST_T::OUTPUT_DIM,
COST_T::OUTPUT_DIM);
  __syncthreads();
#endif
  // Compute terminal cost for each thread
  if (threadIdx.x == 0 && threadIdx.y == 0)
  {
    // Terminal cost goes in the extra (num_timesteps-th) slot of the rollout's cost row.
    cost_index = (threadIdx.z * num_rollouts + global_idx) * (num_timesteps + 1) + num_timesteps;
    cost_traj_d[cost_index] = costs->terminalCost(y, theta_c) / (num_timesteps);
  }
  __syncthreads();
  // Copy to global memory, vectorized when the horizon length permits.
  if (num_timesteps % 4 == 0)
  {
    float4* cost_traj_d4 = reinterpret_cast(&cost_traj_d[(thread_idz * num_rollouts + global_idx) * num_timesteps]);
    float4* running_cost_shared4 = reinterpret_cast(&running_cost_shared[thread_idz * num_timesteps * blockDim.y]);
    for (int i = thread_idx; i < num_timesteps / 4; i += blockDim.x)
    {
      cost_traj_d4[i] = running_cost_shared4[i];
    }
  }
  else if (num_timesteps % 2 == 0)
  {
    float2* cost_traj_d2 = reinterpret_cast(&cost_traj_d[(thread_idz * num_rollouts + global_idx) * num_timesteps]);
    float2* running_cost_shared2 = reinterpret_cast(&running_cost_shared[thread_idz * num_timesteps * blockDim.y]);
    for (int i = thread_idx; i < num_timesteps / 2; i += blockDim.x)
    {
      cost_traj_d2[i] = running_cost_shared2[i];
    }
  }
  else
  {
    for (int i = thread_idx; i < num_timesteps; i += blockDim.x)
    {
      cost_traj_d[(thread_idz * num_rollouts + global_idx) * num_timesteps + i] =
          running_cost_shared[thread_idz * num_timesteps * blockDim.y + i];
    }
  }
}

// Applies the exponential weight transform exp(-lambda_inv * (cost - baseline))
// in place over all rollout costs; blockDim.z handles multiple systems.
__global__ void normExpKernel(int num_rollouts, float* trajectory_costs_d, float lambda_inv, float baseline)
{
  int global_idx = (blockDim.x * blockIdx.x + threadIdx.x) * blockDim.z + threadIdx.z;
  int global_step = blockDim.x * gridDim.x * blockDim.z * gridDim.z;
  // #if defined(CUDA_VERSION) && CUDA_VERSION > 11060
  //   auto block = cg::this_grid();
  //   int global_idx_b = block.thread_rank() + block.block_rank() * block.num_threads();
  //   int global_step_b = block.num_threads() * block.num_blocks();
  //   if (global_idx == 200 && threadIdx.y == 0 && threadIdx.z == 0)
  //   {
  //     printf("Global ind: %d, thread_rank: %d\n", global_idx, global_idx_b);
  //     printf("Global step: %d, thread_rank: %d\n", global_step, global_step_b);
  //   }
  // #endif
  normExpTransform(num_rollouts * blockDim.z, trajectory_costs_d, lambda_inv, baseline, global_idx, global_step);
}

// Applies the Tsallis weight transform in place over all rollout costs.
__global__ void TsallisKernel(int num_rollouts, float* trajectory_costs_d, float gamma, float r, float baseline)
{
  int global_idx = (blockDim.x * blockIdx.x + threadIdx.x) * blockDim.z + threadIdx.z;
  int global_step = blockDim.x * gridDim.x * blockDim.z * gridDim.z;
  TsallisTransform(num_rollouts * blockDim.z, trajectory_costs_d, gamma, r, baseline, global_idx, global_step);
}

// Weighted-average update of the control plan: one block per timestep, threads
// split the rollouts into sum_stride-sized chunks, partial sums are combined in
// shared memory.  Runtime-sized variant (dynamic shared memory).
template
__global__ void weightedReductionKernel(const float* __restrict__ exp_costs_d, const float* __restrict__ du_d,
                                        float* __restrict__ new_u_d, const float normalizer, const int num_timesteps,
                                        const int num_rollouts, const int sum_stride)
{
  int thread_idx = threadIdx.x;  // Rollout index
  int block_idx = blockIdx.x;    // Timestep
  // Create a shared array for intermediate sums: CONTROL_DIM x NUM_THREADS
  extern __shared__ float u_intermediate[];
  float u[CONTROL_DIM];
  setInitialControlToZero(CONTROL_DIM, thread_idx, u, u_intermediate);
  __syncthreads();
  // Sum the weighted control variations at a desired stride
  strideControlWeightReduction(num_rollouts, num_timesteps, sum_stride, thread_idx, block_idx, CONTROL_DIM,
                               exp_costs_d, normalizer, du_d, u, u_intermediate);
  __syncthreads();
  // Sum all weighted control variations
  rolloutWeightReductionAndSaveControl(thread_idx, block_idx, num_rollouts, num_timesteps, CONTROL_DIM, sum_stride, u,
                                       u_intermediate, new_u_d);
  __syncthreads();
}

// Compile-time-sized variant: shared memory is statically allocated and the
// baseline/normalizer pair is read from device memory (float2: x = baseline, y = normalizer).
template
__global__ void weightedReductionKernel(float* exp_costs_d, float* du_d, float* du_new_d,
                                        float2* baseline_and_normalizer_d, int num_timesteps)
{
  int thread_idx = threadIdx.x;  // Rollout index
  int block_idx = blockIdx.x;    // Timestep
  // Create a shared array for intermediate sums: CONTROL_DIM x NUM_THREADS
  __shared__ float u_intermediate[CONTROL_DIM * ((NUM_ROLLOUTS - 1) / SUM_STRIDE + 1)];
  float u[CONTROL_DIM];
  setInitialControlToZero(CONTROL_DIM,
thread_idx, u, u_intermediate);
  __syncthreads();
  // Sum the weighted control variations at a desired stride
  strideControlWeightReduction(NUM_ROLLOUTS, num_timesteps, SUM_STRIDE, thread_idx, block_idx, CONTROL_DIM,
                               exp_costs_d, baseline_and_normalizer_d->y, du_d, u, u_intermediate);
  __syncthreads();
  // Sum all weighted control variations
  rolloutWeightReductionAndSaveControl(thread_idx, block_idx, NUM_ROLLOUTS, num_timesteps, CONTROL_DIM, SUM_STRIDE, u,
                                       u_intermediate, du_new_d);
  __syncthreads();
}

/*******************************************************************************************************************
 * Rollout Kernel Helpers
 *******************************************************************************************************************/
// Copies the initial state for this system (thread_idz) into per-thread storage and
// zero-initializes the state-derivative and control buffers, vectorizing the fills
// as float4/float2 when the dimensions permit.
template
__device__ void loadGlobalToShared(const int num_rollouts, const int blocksize_y, const int global_idx,
                                   const int thread_idy, const int thread_idz, const float* __restrict__ x_device,
                                   float* __restrict__ x_thread, float* __restrict__ xdot_thread,
                                   float* __restrict__ u_thread)
{
  // Transfer to shared memory
  int i;
  if (global_idx < num_rollouts)
  {
#if true
    mp1::loadArrayParallel(x_thread, 0, x_device, STATE_DIM * thread_idz);
    if (STATE_DIM % 4 == 0)
    {
      float4* xdot4_t = reinterpret_cast(xdot_thread);
      for (i = thread_idy; i < STATE_DIM / 4; i += blocksize_y)
      {
        xdot4_t[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
      }
    }
    else if (STATE_DIM % 2 == 0)
    {
      float2* xdot2_t = reinterpret_cast(xdot_thread);
      for (i = thread_idy; i < STATE_DIM / 2; i += blocksize_y)
      {
        xdot2_t[i] = make_float2(0.0f, 0.0f);
      }
    }
    else
    {
      for (i = thread_idy; i < STATE_DIM; i += blocksize_y)
      {
        xdot_thread[i] = 0.0f;
      }
    }
    if (CONTROL_DIM % 4 == 0)
    {
      float4* u4_t = reinterpret_cast(u_thread);
      for (i = thread_idy; i < CONTROL_DIM / 4; i += blocksize_y)
      {
        u4_t[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
      }
    }
    else if (CONTROL_DIM % 2 == 0)
    {
      float2* u2_t = reinterpret_cast(u_thread);
      for (i = thread_idy; i < CONTROL_DIM / 2; i += blocksize_y)
      {
        u2_t[i] = make_float2(0.0f, 0.0f);
      }
    }
    else
    {
      for (i = thread_idy; i < CONTROL_DIM; i += blocksize_y)
      {
        u_thread[i] = 0.0f;
      }
    }
#else
    // Scalar fallback path (disabled above by the #if true).
    for (i = thread_idy; i < STATE_DIM; i += blocksize_y)
    {
      x_thread[i] = x_device[i + STATE_DIM * thread_idz];
      xdot_thread[i] = 0.0f;
    }
    for (i = thread_idy; i < CONTROL_DIM; i += blocksize_y)
    {
      u_thread[i] = 0.0f;
    }
#endif
  }
}

// Adds the terminal cost (normalized by horizon) to a rollout's accumulated running
// cost and writes the total into the per-rollout cost array.
template
__device__ void computeAndSaveCost(int num_rollouts, int num_timesteps, int global_idx, COST_T* costs, float* output,
                                   float running_cost, float* theta_c, float* cost_rollouts_device)
{
  // only want to save 1 cost per trajectory
  if (threadIdx.y == 0 && global_idx < num_rollouts)
  {
    cost_rollouts_device[global_idx + num_rollouts * threadIdx.z] =
        running_cost + costs->terminalCost(output, theta_c) / (num_timesteps);
  }
}

/*******************************************************************************************************************
 * NormExp Kernel Helpers
 *******************************************************************************************************************/
// Host-side baseline: the minimum cost across all rollouts.
float computeBaselineCost(float* cost_rollouts_host, int num_rollouts)
{
  // TODO if we use standard containers in MPPI, should this be replaced with a min algorithm?
int best_idx = computeBestIndex(cost_rollouts_host, num_rollouts);
  return cost_rollouts_host[best_idx];
}

// Replaces the cost array with a one-hot weight vector selecting the cheapest
// rollout; returns that rollout's (pre-overwrite) cost.  Note: mutates the input array.
float constructBestWeights(float* cost_rollouts_host, int num_rollouts)
{
  int best_idx = computeBestIndex(cost_rollouts_host, num_rollouts);
  float best_cost = cost_rollouts_host[best_idx];

  for (int i = 0; i < num_rollouts; i++)
  {
    if (i == best_idx)
    {
      cost_rollouts_host[i] = 1.0;
    }
    else
    {
      cost_rollouts_host[i] = 0.0;
    }
  }

  // printf("Best idx: %d, cost: %f\n", best_cost_idx, best_cost);
  return best_cost;
}

// Linear scan for the index of the minimum-cost rollout.
int computeBestIndex(float* cost_rollouts_host, int num_rollouts)
{
  float best_cost = cost_rollouts_host[0];
  int best_cost_idx = 0;
  for (int i = 1; i < num_rollouts; i++)
  {
    if (cost_rollouts_host[i] < best_cost)
    {
      best_cost = cost_rollouts_host[i];
      best_cost_idx = i;
    }
  }
  // printf("Best idx: %d, cost: %f\n", best_cost_idx, best_cost);
  return best_cost_idx;
}

// Device-side baseline: parallel min-reduction of the rollout costs through
// reduction_buffer.  Callers must size reduction_buffer to hold num_rollouts floats.
__device__ inline float computeBaselineCost(int num_rollouts, const float* __restrict__ trajectory_costs_d,
                                            float* __restrict__ reduction_buffer, int rollout_idx_global,
                                            int rollout_idx_step)
{
  // Copy costs to shared memory
  float min_cost = 0.0;
#if false  // potential method to speed up copying costs
  int prev_size = min(blockDim.x, num_rollouts);
  float my_val = (rollout_idx_global < num_rollouts) ? trajectory_costs_d[rollout_idx_global] : INFINITY;
  for (int i = rollout_idx_global + rollout_idx_step; i < num_rollouts; i += rollout_idx_step)
  {
    my_val = min(trajectory_costs_d[i], my_val);
  }
  reduction_buffer[rollout_idx_global] = my_val;
  // __syncthreads();
  // if (threadIdx.x == 0)
  // {
  //   for (int i = 0; i < min(blockDim.x, num_rollouts); i++)
  //   {
  //     printf("buff %d: %f\n", i, reduction_buffer[i]);
  //   }
  //   printf("Num rollouts; %d\n", num_rollouts);
  // }
#else
  // First pass: pairwise min from global memory into the reduction buffer.
  int prev_size = num_rollouts / 2;
  for (int i = rollout_idx_global; i < prev_size; i += rollout_idx_step)
  {
    reduction_buffer[i] = min(trajectory_costs_d[i], trajectory_costs_d[i + prev_size]);
  }
  if (num_rollouts - 2 * prev_size == 1 && threadIdx.x == blockDim.x - 1)
  {
    // NOTE(review): for odd num_rollouts this reads reduction_buffer[num_rollouts - 1],
    // which was not written by the loop above; trajectory_costs_d[num_rollouts - 1]
    // looks intended — confirm (benign when num_rollouts is even/power of two).
    reduction_buffer[prev_size - 1] = min(reduction_buffer[num_rollouts - 1], reduction_buffer[prev_size - 1]);
  }
#endif
  __syncthreads();
  // find min along the entire array
  for (int size = prev_size / 2; size > 0; size /= 2)
  {
    for (int i = rollout_idx_global; i < size; i += rollout_idx_step)
    {
      reduction_buffer[i] = min(reduction_buffer[i], reduction_buffer[i + size]);
    }
    __syncthreads();
    if (prev_size - 2 * size == 1 && threadIdx.x == blockDim.x - 1)
    {
      reduction_buffer[size - 1] = min(reduction_buffer[size - 1], reduction_buffer[prev_size - 1]);
    }
    __syncthreads();
    prev_size = size;
  }
  min_cost = reduction_buffer[0];
  return min_cost;
}

// In-place exponential weighting: cost_i -> exp(-lambda_inv * (cost_i - baseline)).
__device__ __host__ inline void normExpTransform(int num_rollouts, float* __restrict__ trajectory_costs_d,
                                                 float lambda_inv, float baseline, int global_idx,
                                                 int rollout_idx_step)
{
  for (int i = global_idx; i < num_rollouts; i += rollout_idx_step)
  {
    float cost_dif = trajectory_costs_d[i] - baseline;
    trajectory_costs_d[i] = expf(-lambda_inv * cost_dif);
  }
}

// In-place Tsallis weighting: costs past the gamma threshold get zero weight.
__device__ __host__ inline void TsallisTransform(int num_rollouts, float* __restrict__ trajectory_costs_d, float gamma,
                                                 float r, float baseline, int global_idx, int rollout_idx_step)
{
  for (int i = global_idx; i < num_rollouts; i += rollout_idx_step)
  {
    float cost_dif
= trajectory_costs_d[i] - baseline;
    // trajectory_costs_d[i] = mppi::math::expr(-lambda_bar_inv * cost_dif);
    // trajectory_costs_d[i] = (cost_dif < gamma) * expf(logf(1.0 - cost_dif / gamma) / (r - 1));
    if (cost_dif < gamma)
    {
      trajectory_costs_d[i] = expf(logf(1.0 - cost_dif / gamma) / (r - 1));
    }
    else
    {
      trajectory_costs_d[i] = 0;
    }
  }
}

// Device-side sum of the (already transformed) rollout weights via parallel
// reduction through reduction_buffer; used as the normalizing constant.
__device__ inline float computeNormalizer(int num_rollouts, const float* __restrict__ trajectory_costs_d,
                                          float* __restrict__ reduction_buffer, int rollout_idx_global,
                                          int rollout_idx_step)
{
  // Copy costs to shared memory
#if false  // potential method to speed up copying costs
  int prev_size = min(blockDim.x, num_rollouts);
  float my_val = (rollout_idx_global < num_rollouts) ? trajectory_costs_d[rollout_idx_global] : 0;
  for (int i = rollout_idx_global + rollout_idx_step; i < num_rollouts; i += rollout_idx_step)
  {
    my_val += trajectory_costs_d[i];
  }
  reduction_buffer[rollout_idx_global] = my_val;
#else
  // First pass: pairwise add from global memory into the reduction buffer.
  int prev_size = num_rollouts / 2;
  for (int i = rollout_idx_global; i < prev_size; i += rollout_idx_step)
  {
    reduction_buffer[i] = trajectory_costs_d[i] + trajectory_costs_d[i + prev_size];
  }
  if (num_rollouts - 2 * prev_size == 1 && threadIdx.x == blockDim.x - 1)
  {
    // NOTE(review): same odd-count pattern as computeBaselineCost — reads a buffer
    // slot the loop above never wrote; confirm trajectory_costs_d was intended.
    reduction_buffer[prev_size - 1] += reduction_buffer[num_rollouts - 1];
  }
#endif
  __syncthreads();
  // sum the entire array
  for (int size = prev_size / 2; size > 0; size /= 2)
  {
    for (int i = rollout_idx_global; i < size; i += rollout_idx_step)
    {
      reduction_buffer[i] += reduction_buffer[i + size];
    }
    __syncthreads();
    if (prev_size - 2 * size == 1 && threadIdx.x == blockDim.x - 1)
    {
      reduction_buffer[size - 1] += reduction_buffer[prev_size - 1];
    }
    __syncthreads();
    prev_size = size;
  }
  return reduction_buffer[0];
}

// Single-block kernel that computes the baseline (min cost), applies the
// exponential transform in place, computes the normalizer, and writes both as a
// float2 (x = baseline, y = normalizer) to `output`.
template
__global__ void fullGPUcomputeWeights(float* __restrict__ trajectory_costs_d, float lambda_inv,
                                      float2* __restrict__ output)
{
  __shared__ float reduction_buffer[NUM_ROLLOUTS];
  // int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
  // int better_global_idx = (blockIdx.x * blockDim.x + threadIdx.x) * blockDim.y + threadIdx.y;
  // int global_idx = (blockDim.x * blockIdx.x + threadIdx.x) * blockDim.z + threadIdx.z;
  // int global_step = blockDim.x * gridDim.x;
  // int better_global_step = blockDim.x * gridDim.x * blockDim.y * gridDim.y;
  int global_idx = threadIdx.x;
  int global_step = blockDim.x;
  float baseline = computeBaselineCost(NUM_ROLLOUTS, trajectory_costs_d, reduction_buffer, global_idx, global_step);
  normExpTransform(NUM_ROLLOUTS, trajectory_costs_d, lambda_inv, baseline, global_idx, global_step);
  __syncthreads();
  float normalizer = computeNormalizer(NUM_ROLLOUTS, trajectory_costs_d, reduction_buffer, global_idx, global_step);
  __syncthreads();
  if (threadIdx.x == 0)
  {
    *output = make_float2(baseline, normalizer);
  }
}

// Host-side normalizer: plain sum of the rollout weights (accumulated in double).
float computeNormalizer(float* cost_rollouts_host, int num_rollouts)
{
  double normalizer = 0.0;
  for (int i = 0; i < num_rollouts; ++i)
  {
    normalizer += cost_rollouts_host[i];
  }
  return normalizer;
}

// Computes the free-energy estimate, its variance, and a modified term from the
// transformed rollout costs.  Outputs are returned through the reference params.
void computeFreeEnergy(float& free_energy, float& free_energy_var, float& free_energy_modified,
                       float* cost_rollouts_host, int num_rollouts, float baseline, float lambda)
{
  float var = 0;
  float norm = 0;
  for (int i = 0; i < num_rollouts; i++)
  {
    norm += cost_rollouts_host[i];
    var += SQ(cost_rollouts_host[i]);
  }
  norm /= num_rollouts;
  free_energy = -lambda * logf(norm) + baseline;
  free_energy_var = lambda * (var / num_rollouts - SQ(norm));
  // TODO Figure out the point of the following lines
  float weird_term = free_energy_var / (norm * sqrtf(1.0 * num_rollouts));
  free_energy_modified = lambda * (weird_term + 0.5 * SQ(weird_term));
}

/*******************************************************************************************************************
 * Weighted Reduction Kernel Helpers
 *******************************************************************************************************************/
// Zeroes a thread's local control accumulator and its slice of the shared
// intermediate buffer, vectorized when control_dim allows.
__device__ void setInitialControlToZero(int control_dim, int thread_idx, float* __restrict__ u, float* __restrict__
u_intermediate)
{
  if (control_dim % 4 == 0)
  {
    for (int i = 0; i < control_dim / 4; i++)
    {
      reinterpret_cast(u)[i] = make_float4(0, 0, 0, 0);
      reinterpret_cast(&u_intermediate[thread_idx * control_dim])[i] = make_float4(0, 0, 0, 0);
    }
  }
  else if (control_dim % 2 == 0)
  {
    for (int i = 0; i < control_dim / 2; i++)
    {
      reinterpret_cast(u)[i] = make_float2(0, 0);
      reinterpret_cast(&u_intermediate[thread_idx * control_dim])[i] = make_float2(0, 0);
    }
  }
  else
  {
    for (int i = 0; i < control_dim; i++)
    {
      u[i] = 0;
      u_intermediate[thread_idx * control_dim + i] = 0;
    }
  }
}

// Each thread accumulates the importance-weighted control perturbations of its
// sum_stride-sized chunk of rollouts (at timestep block_idx) into u_intermediate.
__device__ void strideControlWeightReduction(const int num_rollouts, const int num_timesteps, const int sum_stride,
                                             const int thread_idx, const int block_idx, const int control_dim,
                                             const float* __restrict__ exp_costs_d, const float normalizer,
                                             const float* __restrict__ du_d, float* __restrict__ u,
                                             float* __restrict__ u_intermediate)
{
  // int index = thread_idx * sum_stride + i;
  for (int i = 0; i < sum_stride; ++i)
  {  // Iterate through the size of the subsection
    if ((thread_idx * sum_stride + i) < num_rollouts)
    {  // Ensure we do not go out of bounds
      float weight = exp_costs_d[thread_idx * sum_stride + i] / normalizer;  // compute the importance sampling weight
      for (int j = 0; j < control_dim; ++j)
      {  // Iterate through the control dimensions
        // Rollout index: (thread_idx*sum_stride + i)*(num_timesteps*control_dim)
        // Current timestep: block_idx*control_dim
        u[j] = du_d[(thread_idx * sum_stride + i) * (num_timesteps * control_dim) + block_idx * control_dim + j];
        u_intermediate[thread_idx * control_dim + j] += weight * u[j];
      }
    }
  }
}

// Thread 0 sums all per-chunk partial results and writes the final weighted
// control for timestep block_idx to du_new_d.
__device__ void rolloutWeightReductionAndSaveControl(int thread_idx, int block_idx, int num_rollouts,
                                                     int num_timesteps, int control_dim, int sum_stride, float* u,
                                                     float* u_intermediate, float* du_new_d)
{
  if (thread_idx == 0 && block_idx < num_timesteps)
  {  // block index refers to the current timestep
    for (int i = 0; i < control_dim; ++i)
    {  // TODO replace with memset?
      u[i] = 0;
    }
    for (int i = 0; i < ((num_rollouts - 1) / sum_stride + 1); ++i)
    {  // iterate through the each subsection
      for (int j = 0; j < control_dim; ++j)
      {
        u[j] += u_intermediate[i * control_dim + j];
      }
    }
    for (int i = 0; i < control_dim; i++)
    {
      du_new_d[block_idx * control_dim + i] = u[i];
    }
  }
}

// Unrolled intra-warp sum on a volatile shared array; BLOCKSIZE selects which
// steps are emitted at compile time.  `stride` is the element spacing in sdata.
template
__device__ void warpReduceAdd(volatile float* sdata, const int tid, const int stride)
{
  if (BLOCKSIZE >= 64)
  {
    sdata[tid * stride] += sdata[(tid + 32) * stride];
  }
  if (BLOCKSIZE >= 32)
  {
    sdata[tid * stride] += sdata[(tid + 16) * stride];
  }
  if (BLOCKSIZE >= 16)
  {
    sdata[tid * stride] += sdata[(tid + 8) * stride];
  }
  if (BLOCKSIZE >= 8)
  {
    sdata[tid * stride] += sdata[(tid + 4) * stride];
  }
  if (BLOCKSIZE >= 4)
  {
    sdata[tid * stride] += sdata[(tid + 2) * stride];
  }
  if (BLOCKSIZE >= 2)
  {
    sdata[tid * stride] += sdata[(tid + 1) * stride];
  }
}

// Tree-sum over `start_size` strided entries of running_cost, handling odd sizes
// via the catch_condition thread and finishing power-of-two tails with warpReduceAdd.
__device__ void costArrayReduction(float* running_cost, const int start_size, const int index, const int step,
                                   const bool catch_condition, const int stride)
{
  int prev_size = start_size;
  const bool block_power_of_2 = (prev_size & (prev_size - 1)) == 0;
  const int stop_condition = (block_power_of_2) ?
32 : 0;
  int size;
  int j;
  for (size = prev_size / 2; size > stop_condition; size /= 2)
  {
    for (j = index; j < size; j += step)
    {
      running_cost[j * stride] += running_cost[(j + size) * stride];
    }
    __syncthreads();
    if (prev_size - 2 * size == 1 && catch_condition)
    {
      // Odd-sized level: fold the dangling last element into the reduced half.
      running_cost[(size - 1) * stride] += running_cost[(prev_size - 1) * stride];
    }
    __syncthreads();
    prev_size = size;
  }
  // Power-of-two tail: finish with an unrolled warp-level reduction.
  switch (size * 2)
  {
    case 64:
      if (index < 32)
      {
        warpReduceAdd<64>(running_cost, index, stride);
      }
      break;
    case 32:
      if (index < 16)
      {
        warpReduceAdd<32>(running_cost, index, stride);
      }
      break;
    case 16:
      if (index < 8)
      {
        warpReduceAdd<16>(running_cost, index, stride);
      }
      break;
    case 8:
      if (index < 4)
      {
        warpReduceAdd<8>(running_cost, index, stride);
      }
      break;
    case 4:
      if (index < 2)
      {
        warpReduceAdd<4>(running_cost, index, stride);
      }
      break;
    case 2:
      if (index < 1)
      {
        warpReduceAdd<2>(running_cost, index, stride);
      }
      break;
  }
  __syncthreads();
}

/*******************************************************************************************************************
 * Launch Functions
 *******************************************************************************************************************/
// Launches the split pipeline: the dynamics kernel fills y_d, then the cost kernel
// consumes y_d to produce per-rollout costs.  Exits the process on invalid block sizes.
template
void launchSplitRolloutKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs,
                              SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps,
                              const int num_rollouts, float lambda, float alpha, float* __restrict__ init_x_d,
                              float* __restrict__ y_d, float* __restrict__ trajectory_costs, dim3 dimDynBlock,
                              dim3 dimCostBlock, cudaStream_t stream, bool synchronize)
{
  if (num_rollouts % dimDynBlock.x != 0)
  {
    std::cerr << __FILE__ << " (" << __LINE__ << "): num_rollouts (" << num_rollouts
              << ") must be evenly divided by dynamics block size x (" << dimDynBlock.x << ")" << std::endl;
    exit(EXIT_FAILURE);
  }
  if (num_timesteps < dimCostBlock.x)
  {
    std::cerr << __FILE__ << " (" << __LINE__ << "): num_timesteps (" << num_timesteps
              << ") must be greater than or equal to cost block size x (" << dimCostBlock.x << ")" << std::endl;
    exit(EXIT_FAILURE);
  }
  // Run Dynamics
  const int gridsize_x = math::int_ceil(num_rollouts, dimDynBlock.x);
  dim3 dimGrid(gridsize_x, 1, 1);
  unsigned dynamics_shared_size = calcRolloutDynamicsKernelSharedMemSize(dynamics, sampling, dimDynBlock);
  // Raise the opt-in dynamic shared memory limit before launching.
  HANDLE_ERROR(cudaFuncSetAttribute(rolloutDynamicsKernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                                    dynamics_shared_size));
  rolloutDynamicsKernel<<>>(
      dynamics->model_d_, sampling->sampling_d_, dt, num_timesteps, num_rollouts, init_x_d, y_d);
  HANDLE_ERROR(cudaGetLastError());

  // Run Costs
  dim3 dimCostGrid(num_rollouts, 1, 1);
  unsigned cost_shared_size = calcRolloutCostKernelSharedMemSize(costs, sampling, dimCostBlock);
  rolloutCostKernel<<>>(
      costs->cost_d_, sampling->sampling_d_, dt, num_timesteps, num_rollouts, lambda, alpha, y_d, trajectory_costs);
  HANDLE_ERROR(cudaGetLastError());
  if (synchronize)
  {
    HANDLE_ERROR(cudaStreamSynchronize(stream));
  }
}

// Launches the combined dynamics-plus-cost rollout kernel on one grid.
template
void launchRolloutKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling,
                         float dt, const int num_timesteps, const int num_rollouts, float lambda, float alpha,
                         float* __restrict__ init_x_d, float* __restrict__ trajectory_costs, dim3 dimBlock,
                         cudaStream_t stream, bool synchronize)
{
  if (num_rollouts % dimBlock.x != 0)
  {
    std::cerr << __FILE__ << " (" << __LINE__ << "): num_rollouts (" << num_rollouts
              << ") must be evenly divided by rollout thread block size x (" << dimBlock.x << ")" << std::endl;
    exit(EXIT_FAILURE);
  }
  const int gridsize_x = math::int_ceil(num_rollouts, dimBlock.x);
  dim3 dimGrid(gridsize_x, 1, 1);
  unsigned shared_mem_size = calcRolloutCombinedKernelSharedMemSize(dynamics, costs, sampling, dimBlock);
  HANDLE_ERROR(cudaFuncSetAttribute(rolloutKernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem_size));
  rolloutKernel<<>>(
      dynamics->model_d_, sampling->sampling_d_, costs->cost_d_, dt, num_timesteps, num_rollouts, init_x_d, lambda,
      alpha, trajectory_costs);
  HANDLE_ERROR(cudaGetLastError());
  if
(synchronize) { HANDLE_ERROR(cudaStreamSynchronize(stream)); } } template void launchVisualizeKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps, const int num_rollouts, float lambda, float alpha, float* __restrict__ init_x_d, float* __restrict__ y_d, float* __restrict__ trajectory_costs, int* __restrict__ crash_status_d, dim3 dimVisBlock, cudaStream_t stream, bool synchronize) { if (num_rollouts <= 1) { // Not enough samples to visualize std::cerr << "Not enough samples to visualize" << std::endl; return; } if (num_rollouts % dimVisBlock.x != 0) { std::cerr << __FILE__ << " (" << __LINE__ << "): num_rollouts (" << num_rollouts << ") must be evenly divided by vis block size x (" << dimVisBlock.x << ")" << std::endl; return; } const int gridsize_x = math::int_ceil(num_rollouts, dimVisBlock.x); dim3 dimGrid(gridsize_x, 1, 1); unsigned shared_mem_size = calcVisualizeKernelSharedMemSize(dynamics, costs, sampling, num_timesteps, dimVisBlock); HANDLE_ERROR(cudaFuncSetAttribute(rolloutKernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem_size)); visualizeKernel<<>>( dynamics->model_d_, sampling->sampling_d_, costs->cost_d_, dt, num_timesteps, num_rollouts, init_x_d, lambda, alpha, y_d, trajectory_costs, crash_status_d); HANDLE_ERROR(cudaGetLastError()); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(stream)); } } template void launchVisualizeCostKernel(COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps, const int num_rollouts, float lambda, float alpha, float* __restrict__ y_d, int* __restrict__ sampled_crash_status_d, float* __restrict__ cost_traj_result, dim3 dimBlock, cudaStream_t stream, bool synchronize) { if (num_rollouts <= 1) { // Not enough samples to visualize std::cerr << "Not enough samples to visualize" << std::endl; return; } dim3 dimCostGrid(num_rollouts, 1, 1); unsigned shared_mem_size = 
calcVisCostKernelSharedMemSize(costs, sampling, num_timesteps, dimBlock); visualizeCostKernel<<>>( costs->cost_d_, sampling->sampling_d_, dt, num_timesteps, num_rollouts, lambda, alpha, y_d, cost_traj_result, sampled_crash_status_d); HANDLE_ERROR(cudaGetLastError()); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(stream)); } } template void launchWeightedReductionKernel(const float* __restrict__ exp_costs_d, const float* __restrict__ du_d, float* __restrict__ new_u_d, const float normalizer, const int num_timesteps, const int num_rollouts, const int sum_stride, cudaStream_t stream, bool synchronize) { dim3 dimBlock(math::int_ceil(num_rollouts, sum_stride), 1, 1); dim3 dimGrid(num_timesteps, 1, 1); unsigned shared_mem_size = math::nearest_multiple_4(CONTROL_DIM * dimBlock.x) * sizeof(float); weightedReductionKernel<<>>( exp_costs_d, du_d, new_u_d, normalizer, num_timesteps, num_rollouts, sum_stride); HANDLE_ERROR(cudaGetLastError()); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(stream)); } } void launchNormExpKernel(int num_rollouts, int blocksize_x, float* trajectory_costs_d, float lambda_inv, float baseline, cudaStream_t stream, bool synchronize) { dim3 dimBlock(blocksize_x, 1, 1); dim3 dimGrid((num_rollouts - 1) / blocksize_x + 1, 1, 1); normExpKernel<<>>(num_rollouts, trajectory_costs_d, lambda_inv, baseline); HANDLE_ERROR(cudaGetLastError()); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(stream)); } } void launchTsallisKernel(int num_rollouts, int blocksize_x, float* trajectory_costs_d, float gamma, float r, float baseline, cudaStream_t stream, bool synchronize) { dim3 dimBlock(blocksize_x, 1, 1); dim3 dimGrid((num_rollouts - 1) / blocksize_x + 1, 1, 1); TsallisKernel<<>>(num_rollouts, trajectory_costs_d, gamma, r, baseline); // CudaCheckError(); HANDLE_ERROR(cudaGetLastError()); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(stream)); } } template void launchWeightTransformKernel(float* __restrict__ costs_d, float2* __restrict__ 
baseline_and_norm_d, const float lambda_inv, const int num_systems, cudaStream_t stream, bool synchronize)
{
  // Figure out max size of threads from the device properties (slows down this method a lot)
  // int device_id = 0;
  // cudaDeviceProp deviceProp;
  // cudaGetDeviceProperties(&deviceProp, device_id);
  // int blocksize_x = deviceProp.maxThreadsDim[0];
  const int blocksize_x = 1024;
  dim3 dimBlock(blocksize_x, 1, 1);
  // Can't be split into multiple blocks because we want to do all the math in shared memory
  dim3 dimGrid(1, 1, 1);
  // One single-block launch per system; each writes its own baseline/normalizer pair.
  for (int i = 0; i < num_systems; i++)
  {
    fullGPUcomputeWeights
        <<>>(costs_d + i * NUM_ROLLOUTS, lambda_inv, baseline_and_norm_d + i);
    HANDLE_ERROR(cudaGetLastError());
  }
  if (synchronize)
  {
    HANDLE_ERROR(cudaStreamSynchronize(stream));
  }
}

// Launches the compile-time-sized weighted-reduction kernel (baseline/normalizer
// read on device from baseline_and_normalizer_d).
template
void launchweightedReductionKernel(float* exp_costs_d, float* du_d, float* du_new_d,
                                   float2* baseline_and_normalizer_d, int num_timesteps, cudaStream_t stream,
                                   bool synchronize)
{
  dim3 dimBlock((NUM_ROLLOUTS - 1) / SUM_STRIDE + 1, 1, 1);
  dim3 dimGrid(num_timesteps, 1, 1);
  weightedReductionKernel
      <<>>(exp_costs_d, du_d, du_new_d, baseline_and_normalizer_d, num_timesteps);
  // CudaCheckError();
  HANDLE_ERROR(cudaGetLastError());
  if (synchronize)
  {
    HANDLE_ERROR(cudaStreamSynchronize(stream));
  }
}

/*******************************************************************************************************************
 * Shared Memory Calculation Functions
 *******************************************************************************************************************/
// Bytes of dynamic shared memory needed by rolloutDynamicsKernel: 3 state buffers,
// one output and one control buffer (each padded to 4-float multiples), plus the
// dynamics' and sampler's own shared-memory requirements.
template
unsigned calcRolloutDynamicsKernelSharedMemSize(const DYN_T* dynamics, const SAMPLER_T* sampler, dim3& dimBlock)
{
  const int dynamics_num_shared = dimBlock.x * dimBlock.z;
  unsigned dynamics_shared_size =
      sizeof(float) * (3 * math::nearest_multiple_4(dynamics_num_shared * DYN_T::STATE_DIM) +
                       math::nearest_multiple_4(dynamics_num_shared * DYN_T::OUTPUT_DIM) +
                       math::nearest_multiple_4(dynamics_num_shared * DYN_T::CONTROL_DIM)) +
      calcClassSharedMemSize(dynamics, dimBlock) + calcClassSharedMemSize(sampler, dimBlock);
#ifdef USE_CUDA_BARRIERS_DYN
  dynamics_shared_size += math::int_multiple_const(dynamics_num_shared * sizeof(barrier), 16);
#endif
  return dynamics_shared_size;
}

// Bytes of dynamic shared memory needed by rolloutCostKernel (outputs, controls,
// per-thread running costs, crash flags, plus cost/sampler class requirements).
template
unsigned calcRolloutCostKernelSharedMemSize(const COST_T* cost, const SAMPLER_T* sampler, dim3& dimBlock)
{
  const int cost_num_shared = dimBlock.x * dimBlock.z;
  unsigned cost_shared_size = sizeof(float) * (math::nearest_multiple_4(cost_num_shared * COST_T::OUTPUT_DIM) +
                                               math::nearest_multiple_4(cost_num_shared * COST_T::CONTROL_DIM) +
                                               math::nearest_multiple_4(cost_num_shared * dimBlock.y)) +
                              sizeof(int) * math::nearest_multiple_4(cost_num_shared) +
                              calcClassSharedMemSize(cost, dimBlock) + calcClassSharedMemSize(sampler, dimBlock);
#ifdef USE_CUDA_BARRIERS_COST
  cost_shared_size += math::int_multiple_const(cost_num_shared * sizeof(barrier), 16);
#endif
  return cost_shared_size;
}

// Bytes of dynamic shared memory needed by the combined rolloutKernel
// (union of dynamics and cost requirements in a single launch).
template
unsigned calcRolloutCombinedKernelSharedMemSize(const DYN_T* dynamics, const COST_T* cost, const SAMPLER_T* sampler,
                                                dim3& dimBlock)
{
  const int num_shared = dimBlock.x * dimBlock.z;
  unsigned shared_mem_size =
      sizeof(float) * (3 * math::nearest_multiple_4(num_shared * DYN_T::STATE_DIM) +
                       math::nearest_multiple_4(num_shared * DYN_T::OUTPUT_DIM) +
                       math::nearest_multiple_4(num_shared * DYN_T::CONTROL_DIM) +
                       math::nearest_multiple_4(num_shared * dimBlock.y)) +
      sizeof(int) * math::nearest_multiple_4(num_shared) + calcClassSharedMemSize(dynamics, dimBlock) +
      calcClassSharedMemSize(cost, dimBlock) + calcClassSharedMemSize(sampler, dimBlock);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
  shared_mem_size += math::int_multiple_const(num_shared * sizeof(barrier), 16);
#endif
  return shared_mem_size;
}

// Bytes of dynamic shared memory needed by visualizeKernel; the running-cost
// buffer here scales with num_timesteps rather than block width.
template
unsigned calcVisualizeKernelSharedMemSize(const DYN_T* dynamics, const COST_T* cost, const SAMPLER_T* sampler,
                                          const int& num_timesteps, dim3& dimBlock)
{
  const int num_shared = dimBlock.x * dimBlock.z;
  unsigned shared_mem_size =
      sizeof(float) * (3 * math::nearest_multiple_4(num_shared * DYN_T::STATE_DIM) +
                       math::nearest_multiple_4(num_shared * DYN_T::OUTPUT_DIM) +
                       math::nearest_multiple_4(num_shared * DYN_T::CONTROL_DIM) +
                       math::nearest_multiple_4(num_timesteps * dimBlock.y * dimBlock.z)) +
      sizeof(int) * math::nearest_multiple_4(num_shared) + calcClassSharedMemSize(dynamics, dimBlock) +
      calcClassSharedMemSize(cost, dimBlock) + calcClassSharedMemSize(sampler, dimBlock);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
  shared_mem_size += math::int_multiple_const(num_shared * sizeof(barrier), 16);
#endif
  return shared_mem_size;
}

// Bytes of dynamic shared memory needed by visualizeCostKernel.
template
unsigned calcVisCostKernelSharedMemSize(const COST_T* cost, const SAMPLER_T* sampler, const int& num_timesteps,
                                        dim3& dimBlock)
{
  const int shared_num = dimBlock.x * dimBlock.z;
  unsigned shared_mem_size = sizeof(float) * (math::nearest_multiple_4(shared_num * COST_T::OUTPUT_DIM) +
                                              math::nearest_multiple_4(shared_num * COST_T::CONTROL_DIM) +
                                              math::nearest_multiple_4(dimBlock.z * num_timesteps * dimBlock.y)) +
                             sizeof(int) * math::nearest_multiple_4(dimBlock.z * num_timesteps) +
                             calcClassSharedMemSize(cost, dimBlock) + calcClassSharedMemSize(sampler, dimBlock);
#ifdef USE_CUDA_BARRIERS_COST
  shared_mem_size += math::int_multiple_const(shared_num * sizeof(barrier), 16);
#endif
  return shared_mem_size;
}

// Shared memory a dynamics/cost/sampler class instance requests: one grid-wide
// portion plus one per-(x,z)-thread portion, each rounded up to float4 alignment.
template
__host__ __device__ unsigned calcClassSharedMemSize(const T* class_ptr, const dim3& dimBlock)
{
  const int num_shared = dimBlock.x * dimBlock.z;
  unsigned shared_size = math::int_multiple_const(class_ptr->getGrdSharedSizeBytes(), sizeof(float4)) +
                         num_shared * math::int_multiple_const(class_ptr->getBlkSharedSizeBytes(), sizeof(float4));
  return shared_size;
}
}  // namespace kernels
}  // namespace mppi


================================================
FILE: include/mppi/core/mppi_common.cuh
================================================
//
// Created by Manan Gandhi on 12/2/19.
//
#ifndef MPPIGENERIC_MPPI_COMMON_CUH
#define MPPIGENERIC_MPPI_COMMON_CUH

// NOTE(review): the #include target below (and every "template" parameter list in
// this header) was stripped by angle-bracket removal during extraction; the quoted
// include at the bottom of the file survived, confirming only <...> content is lost.
// Tokens are preserved verbatim. TODO: restore from upstream.
#include

namespace mppi
{
namespace kernels
{
/*******************************************************************************************************************
 * Kernel functions
 *******************************************************************************************************************/
// Cost-only half of the split rollout: reads precomputed outputs y_d and writes
// per-sample trajectory costs.
template __global__ void rolloutCostKernel(COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling, float dt,
                                           const int num_timesteps, const int num_rollouts, float lambda, float alpha,
                                           const float* __restrict__ y_d, float* __restrict__ trajectory_costs_d);

// Combined rollout: propagates dynamics and accumulates cost in a single kernel.
template __global__ void rolloutKernel(DYN_T* __restrict__ dynamics, SAMPLING_T* __restrict__ sampling,
                                       COST_T* __restrict__ costs, float dt, const int num_timesteps,
                                       const int num_rollouts, const float* __restrict__ init_x_d, float lambda,
                                       float alpha, float* __restrict__ trajectory_costs_d);

// Dynamics-only half of the split rollout: propagates states and writes outputs y_d.
template __global__ void rolloutDynamicsKernel(DYN_T* __restrict__ dynamics, SAMPLING_T* __restrict__ sampling,
                                               float dt, const int num_timesteps, const int num_rollouts,
                                               const float* __restrict__ init_x_d, float* __restrict__ y_d);

// Rollout variant that also records per-timestep costs and crash status for visualization.
template __global__ void visualizeKernel(DYN_T* __restrict__ dynamics, SAMPLING_T* __restrict__ sampling,
                                         COST_T* __restrict__ costs, float dt, const int num_timesteps,
                                         const int num_rollouts, const float* __restrict__ init_x_d, float lambda,
                                         float alpha, float* __restrict__ y_d, float* __restrict__ cost_traj_d,
                                         int* __restrict__ crash_status_d);

// Cost-only visualization pass over precomputed outputs.
template __global__ void visualizeCostKernel(COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling, float dt,
                                             const int num_timesteps, const int num_rollouts, const float lambda,
                                             float alpha, const float* __restrict__ y_d,
                                             float* __restrict__ cost_traj_d, int* __restrict__ crash_status_d);

// Produces the importance-sampling-weighted control update from exponentiated costs.
template __global__ void weightedReductionKernel(const float* __restrict__ exp_costs_d, const float* __restrict__ du_d,
                                                 float* __restrict__ new_u_d, const float normalizer,
                                                 const int num_timesteps, const int num_rollouts,
                                                 const int sum_stride);

// Norm Exponential Kernel
__global__ void normExpKernel(int num_rollouts, float* trajectory_costs_d, float gamma, float baseline);

// Tsallis Kernel
__global__ void TsallisKernel(int num_rollouts, float* trajectory_costs_d, float gamma, float r, float baseline);

/*******************************************************************************************************************
 * RolloutKernel Helpers
 *******************************************************************************************************************/
/*
 * loadGlobalToShared
 * Copy global memory into shared memory
 *
 * Args:
 * state_dim: Number of states, defined in DYN_T
 * control_dim: Number of controls, defined in DYN_T
 * num_rollouts: Total number of rollouts
 * blocksize_y: Y dimension of each block of threads
 * global_idx: Current rollout index.
 * thread_idy: Current y index of block dimension.
 * thread_idz: Current z index of block dimension.
 * x0_device: initial condition in device memory
 * x_thread: state in shared memory
 * xdot_thread: state_dot in shared memory
 * u_thread: control / perturbed control in shared memory
 *
 */
template __device__ void loadGlobalToShared(const int num_rollouts, const int blocksize_y, const int global_idx,
                                            const int thread_idy, const int thread_idz,
                                            const float* __restrict__ x_device, float* __restrict__ x_thread,
                                            float* __restrict__ xdot_thread, float* __restrict__ u_thread);

/**
 * @brief Calculate the terminal cost and add it to the trajectories' overall cost
 *
 * @tparam COST_T - Cost Function class
 * @param num_rollouts
 * @param num_timesteps
 * @param global_idx - sample trajectory index
 * @param costs - GPU version of cost function
 * @param x_thread - terminal state x
 * @param running_cost - current cost of the sample trajectory
 * @param theta_c - shared memory for the cost function
 * @param cost_rollouts_device - global memory array storing the cost of each sample
 *
 * @return
 */
template __device__ void computeAndSaveCost(int num_rollouts, int num_timesteps, int global_idx, COST_T* costs,
                                            float* x_thread, float running_cost, float* theta_c,
                                            float* cost_rollouts_device);

/**
 * @brief conduct a warp reduction of addition s[tid * stride] += s[(tid + BLOCKSIZE) * stride]
 *
 * @tparam BLOCKSIZE - how many threads are doing the reduction
 * @param sdata - float array to do the reduction on
 * @param tid - current thread index
 * @param stride - how spaced out the summations should be
 *
 * @return
 */
template __device__ void warpReduceAdd(volatile float* sdata, const int tid, const int stride = 1);

/**
 * @brief conduct a sum of floats in array through a GPU reduction algorithm
 *
 * @param running_cost - array of floats to be summed
 * @param start_size - number of items to be summed
 * @param index - GPU thread index
 * @param step - GPU step to avoid overlap of threads
 * @param catch_condition - when to stop summation
 * @param stride - how far apart the desired floats are in the array
 *
 * @return
 */
__device__ inline void costArrayReduction(float* running_cost, const int start_size, const int index, const int step,
                                          const bool catch_condition, const int stride = 1);

// Norm Exp Kernel Helpers
__device__ __host__ inline void normExpTransform(const int num_rollouts, float* __restrict__ trajectory_costs_d,
                                                 const float lambda_inv, const float baseline, const int global_idx,
                                                 const int rollout_idx_step);

// Tsallis Kernel Helpers
__device__ __host__ inline void TsallisTransform(const int num_rollouts, float* __restrict__ trajectory_costs_d,
                                                 const float gamma, float r, const float baseline,
                                                 const int global_idx, const int rollout_idx_step);

// Host-side reductions over the sampled trajectory costs.
float computeBaselineCost(float* cost_rollouts_host, int num_rollouts);

float computeNormalizer(float* cost_rollouts_host, int num_rollouts);

float constructBestWeights(float* cost_rollouts_host, int num_rollouts);

int computeBestIndex(float* cost_rollouts_host, int num_rollouts);

/**
 * Calculates the free energy mean and variance from the different
 * cost trajectories after normExpKernel
 * Inputs:
 * cost_rollouts_host - sampled cost trajectories
 * num_rollouts - the number of sampled cost trajectories
 * lambda - the lambda term from the definition of free energy
 * baseline - minimum cost trajectory
 * Outputs:
 * free_energy - the free energy of the samples
 * free_energy_var - the variance of the free energy calculation
 */
void computeFreeEnergy(float& free_energy, float& free_energy_var, float& free_energy_modified,
                       float* cost_rollouts_host, int num_rollouts, float baseline, float lambda = 1.0);

/*******************************************************************************************************************
 * Weighted Reduction Kernel Helpers
 *******************************************************************************************************************/
/**
 * @brief set controls to zero
 *
 * @param control_dim
 * @param thread_idx - threadIdx.x
 * @param u - memory for control array
 * @param u_intermediate - shared memory for control array
 *
 * @return
 */
__device__ void setInitialControlToZero(int control_dim, int thread_idx, float* __restrict__ u,
                                        float* __restrict__ u_intermediate);

/**
 * @brief calculated the weighted sum of the controls
 *
 * @param num_rollouts
 * @param num_timesteps
 * @param sum_stride - how many summations to do in a single thread
 * @param thread_idx - threadIdx.x
 * @param block_idx - blockIdx.x
 * @param control_dim
 * @param exp_costs_d - global memory of the weights
 * @param normalizer - sum of all the weights to use as a normalizing term
 * @param du_d - global memory of all sampled controls
 * @param u - local memory to store a control from a single time
 * @param u_intermediate - shared memory containing the weighted sum
 *
 * @return
 */
__device__ void strideControlWeightReduction(const int num_rollouts, const int num_timesteps, const int sum_stride,
                                             const int thread_idx, const int block_idx, const int control_dim,
                                             const float* __restrict__ exp_costs_d, const float normalizer,
                                             const float* __restrict__ du_d, float* __restrict__ u,
                                             float* __restrict__ u_intermediate);

__device__ void rolloutWeightReductionAndSaveControl(int thread_idx, int block_idx, int num_rollouts,
                                                     int num_timesteps, int control_dim, int sum_stride, float* u,
                                                     float* u_intermediate, float* du_new_d);

/**
 * Launch Kernel Methods
 **/
template void launchSplitRolloutKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs,
                                       SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps,
                                       const int num_rollouts, float lambda, float alpha,
                                       float* __restrict__ init_x_d, float* __restrict__ y_d,
                                       float* __restrict__ trajectory_costs, dim3 dimDynBlock, dim3 dimCostBlock,
                                       cudaStream_t stream, bool synchronize = true);

template void launchRolloutKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs,
                                  SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps,
                                  const int num_rollouts, float lambda, float alpha, float* __restrict__ init_x_d,
                                  float* __restrict__ trajectory_costs, dim3 dimBlock, cudaStream_t stream,
                                  bool synchronize = true);

template void launchVisualizeCostKernel(COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling, float dt,
                                        const int num_timesteps, const int num_rollouts, float lambda, float alpha,
                                        float* __restrict__ y_d, int* __restrict__ sampled_crash_status_d,
                                        float* __restrict__ cost_traj_result, dim3 dimBlock, cudaStream_t stream,
                                        bool synchronize = true);

template void launchVisualizeKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs,
                                    SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps,
                                    const int num_rollouts, float lambda, float alpha, float* __restrict__ init_x_d,
                                    float* __restrict__ y_d, float* __restrict__ trajectory_costs,
                                    int* __restrict__ crash_status_d, dim3 dimVisBlock, cudaStream_t stream,
                                    bool synchronize = true);

template void launchWeightedReductionKernel(const float* __restrict__ exp_costs_d, const float* __restrict__ du_d,
                                            float* __restrict__ new_u_d, const float normalizer,
                                            const int num_timesteps, const int num_rollouts, const int sum_stride,
                                            cudaStream_t stream, bool synchronize = true);

void launchNormExpKernel(int num_rollouts, int blocksize_x, float* trajectory_costs_d, float lambda_inv,
                         float baseline, cudaStream_t stream, bool synchronize = true);

void launchTsallisKernel(int num_rollouts, int blocksize_x, float* trajectory_costs_d, float gamma, float r,
                         float baseline, cudaStream_t stream, bool synchronize = true);

template void launchWeightedReductionKernel(float* exp_costs_d, float* du_d, float* du_new_d, float normalizer,
                                            int num_timesteps, cudaStream_t stream, bool synchronize = true);

/*******************************************************************************************************************
 * Shared Memory Calculators for various kernels
 *******************************************************************************************************************/
template unsigned calcRolloutDynamicsKernelSharedMemSize(const DYN_T* dynamics, const SAMPLER_T* sampler,
                                                         dim3& dimBlock);

template unsigned calcRolloutCostKernelSharedMemSize(const COST_T* cost, const SAMPLER_T* sampler, dim3& dimBlock);

template unsigned calcRolloutCombinedKernelSharedMemSize(const DYN_T* dynamics, const COST_T* cost,
                                                         const SAMPLER_T* sampler, dim3& dimBlock);

template unsigned calcVisualizeKernelSharedMemSize(const DYN_T* dynamics, const COST_T* cost, const SAMPLER_T* sampler,
                                                   const int& num_timesteps, dim3& dimBlock);

template unsigned calcVisCostKernelSharedMemSize(const COST_T* cost, const SAMPLER_T* sampler,
                                                 const int& num_timesteps, dim3& dimBlock);

template __host__ __device__ inline unsigned calcClassSharedMemSize(const T* class_ptr, const dim3& dimBlock);

}  // namespace kernels
}  // namespace mppi

// Header-only template library: pull in the definitions when compiling device code.
#if __CUDACC__
#include "mppi_common.cu"
#endif

#endif  // MPPIGENERIC_MPPI_COMMON_CUH


================================================
FILE: include/mppi/core/rmppi_kernels.cu
================================================
// NOTE(review): the #include target and all template parameter lists /
// reinterpret_cast template arguments were stripped by extraction; tokens kept verbatim.
#include

namespace mp1 = mppi::p1;

namespace mppi
{
namespace kernels
{
namespace rmppi
{
// R-MPPI candidate-evaluation dynamics kernel: each of several initial-condition
// candidates (states_d) gets samples_per_condition rollouts; controls are read with
// a per-candidate time shift (strides_d) and the resulting outputs written to y_d.
template __global__ void initEvalDynKernel(DYN_T* __restrict__ dynamics, SAMPLING_T* __restrict__ sampling, float dt,
                                           const int num_timesteps, const int num_rollouts, int samples_per_condition,
                                           const int* __restrict__ strides_d, const float* __restrict__ states_d,
                                           float* __restrict__ y_d)
{
  // Get thread and block id
  const int global_idx = blockDim.x * blockIdx.x + threadIdx.x;
  const int distribution_idx = threadIdx.z;
  // which initial-condition candidate this sample belongs to, and its index within it
  const int candidate_idx = global_idx / samples_per_condition;
  const int candidate_sample_idx = global_idx % samples_per_condition;
  const int tdy = threadIdx.y;
  const int size_of_theta_s_bytes = calcClassSharedMemSize(dynamics, blockDim);
  const int size_of_theta_d_bytes = calcClassSharedMemSize(sampling, blockDim);
  const int shared_idx = blockDim.x * threadIdx.z + threadIdx.x;
  const int distribution_dim = blockDim.z;
  const int sample_dim = blockDim.x;

  // Create shared state and control arrays
  // Layout (must mirror the corresponding shared-mem size calculator):
  // x | x_next | y | xdot | u | dynamics theta | sampler theta | [barriers]
  extern __shared__ float entire_buffer[];
  float* x_shared = entire_buffer;
  float* x_next_shared = &x_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* y_shared = &x_next_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* x_dot_shared = &y_shared[math::nearest_multiple_4(sample_dim * DYN_T::OUTPUT_DIM * distribution_dim)];
  float* u_shared = &x_dot_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* theta_s_shared = &u_shared[math::nearest_multiple_4(sample_dim * DYN_T::CONTROL_DIM * distribution_dim)];
  float* theta_d_shared = &theta_s_shared[size_of_theta_s_bytes / sizeof(float)];
#ifdef USE_CUDA_BARRIERS_DYN
  barrier* barrier_shared = (barrier*)&theta_d_shared[size_of_theta_d_bytes / sizeof(float)];
#endif

  // Create local state, state dot and controls (per-thread slices of the shared buffers)
  float* x = &(reinterpret_cast(x_shared)[shared_idx * DYN_T::STATE_DIM]);
  float* x_next = &(reinterpret_cast(x_next_shared)[shared_idx * DYN_T::STATE_DIM]);
  float* x_temp;
  float* xdot = &(reinterpret_cast(x_dot_shared)[shared_idx * DYN_T::STATE_DIM]);
  float* u = &(reinterpret_cast(u_shared)[shared_idx * DYN_T::CONTROL_DIM]);
  float* y = &(reinterpret_cast(y_shared)[shared_idx * DYN_T::OUTPUT_DIM]);

  // Load global array to shared array: the candidate's initial state
  mppi::p1::loadArrayParallel(x, 0, states_d, candidate_idx * DYN_T::STATE_DIM);
  int stride = strides_d[candidate_idx];
  for (int i = tdy; i < DYN_T::CONTROL_DIM; i += blockDim.y)
  {
    u[i] = 0;
  }
  for (int i = tdy; i < DYN_T::OUTPUT_DIM; i += blockDim.y)
  {
    y[i] = 0;
  }
  __syncthreads();
#ifdef USE_CUDA_BARRIERS_DYN
  barrier* bar = &barrier_shared[(shared_idx)];
  if (tdy == 0)
  {
    init(bar, blockDim.y);
  }
#endif

  /*<----Start of simulation loop-----> */
  dynamics->initializeDynamics(x, u, y, theta_s_shared, 0.0, dt);
  sampling->initializeDistributions(y, 0.0, dt, theta_d_shared);
  __syncthreads();
  for (int t = 0; t < num_timesteps; t++)
  {
    // Load noise trajectories scaled by the exploration factor
    // (shifted by the candidate's stride; clamped to the final timestep)
    int candidate_t = min(t + stride, num_timesteps - 1);
    sampling->readControlSample(candidate_sample_idx, candidate_t, distribution_idx, u, theta_d_shared, blockDim.y,
                                tdy, y);
#ifdef USE_CUDA_BARRIERS_DYN
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif

    // applies constraints as defined in dynamics.cuh see specific dynamics class for what happens here
    // usually just control clamping
    dynamics->enforceConstraints(x, u);
#ifdef USE_CUDA_BARRIERS_DYN
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif

    // Increment states
    dynamics->step(x, x_next, xdot, u, y, theta_s_shared, t, dt);
#ifdef USE_CUDA_BARRIERS_DYN
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // swap current/next state buffers instead of copying
    x_temp = x;
    x = x_next;
    x_next = x_temp;
    // Copy state to global memory
    int sample_time_offset = (num_rollouts * threadIdx.z + global_idx) * num_timesteps + t;
    mp1::loadArrayParallel(y_d, sample_time_offset * DYN_T::OUTPUT_DIM, y, 0);
  }
}

template __global__ void
// R-MPPI candidate-evaluation cost kernel: consumes the outputs y_d produced by
// initEvalDynKernel, re-reads the (stride-shifted) control samples, and accumulates
// the per-sample trajectory cost into trajectory_costs_d.
// (declaration fragment "template __global__ void" appears above; template
// parameter lists were stripped by extraction — tokens kept verbatim)
initEvalCostKernel(COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps,
                   const int num_rollouts, float lambda, float alpha, const int samples_per_condition,
                   const int* __restrict__ strides_d, const float* __restrict__ y_d,
                   float* __restrict__ trajectory_costs_d)
{
  // Get thread and block id (one block per sample; x-threads parallelize over time)
  const int global_idx = blockIdx.x;
  const int distribution_idx = threadIdx.z;
  const int candidate_idx = global_idx / samples_per_condition;
  const int candidate_sample_idx = global_idx % samples_per_condition;
  const int thread_idx = threadIdx.x;
  const int thread_idy = threadIdx.y;
  const int thread_idz = threadIdx.z;
  const int shared_idx = blockDim.x * threadIdx.z + threadIdx.x;
  const int size_of_theta_d_bytes = calcClassSharedMemSize(sampling, blockDim);
  const int size_of_theta_c_bytes = calcClassSharedMemSize(costs, blockDim);

  // Create shared state and control arrays
  // Layout: y | u | running cost | crash status | cost theta | sampler theta | [barriers]
  extern __shared__ float entire_buffer[];
  float* y_shared = entire_buffer;
  float* u_shared = &y_shared[math::nearest_multiple_4(blockDim.x * blockDim.z * COST_T::OUTPUT_DIM)];
  float* running_cost_shared = &u_shared[math::nearest_multiple_4(blockDim.x * blockDim.z * COST_T::CONTROL_DIM)];
  int* crash_status_shared = (int*)&running_cost_shared[math::nearest_multiple_4(blockDim.x * blockDim.z * blockDim.y)];
  float* theta_c = (float*)&crash_status_shared[math::nearest_multiple_4(blockDim.x * blockDim.z)];
  float* theta_d = &theta_c[size_of_theta_c_bytes / sizeof(float)];
#ifdef USE_CUDA_BARRIERS_COST
  barrier* barrier_shared = (barrier*)&theta_d[size_of_theta_d_bytes / sizeof(float)];
#endif

  // Create local state, state dot and controls
  float* y;
  float* u;
  int* crash_status;

  // Initialize running cost and total cost
  float* running_cost;
  int sample_time_offset = 0;
  int stride = strides_d[candidate_idx];

  // Load global array to shared array
  y = &y_shared[(shared_idx)*COST_T::OUTPUT_DIM];
  u = &u_shared[(shared_idx)*COST_T::CONTROL_DIM];
  crash_status = &crash_status_shared[shared_idx];
  crash_status[0] = 0;  // We have not crashed yet as of the first trajectory.
  const int running_cost_index = thread_idx + blockDim.x * (thread_idy + blockDim.y * thread_idz);
  running_cost = &running_cost_shared[running_cost_index];
  running_cost[0] = 0;
#ifdef USE_CUDA_BARRIERS_COST
  barrier* bar = &barrier_shared[(blockDim.x * thread_idz + thread_idx)];
  if (thread_idy == 0)
  {
    init(bar, blockDim.y);
  }
#endif

  /*<----Start of simulation loop-----> */
  // each x-thread handles timesteps t, t + blockDim.x, t + 2*blockDim.x, ...
  const int max_time_iters = ceilf((float)num_timesteps / blockDim.x);
  costs->initializeCosts(y, u, theta_c, 0.0, dt);
  sampling->initializeDistributions(y, 0.0, dt, theta_d);
  __syncthreads();
  for (int time_iter = 0; time_iter < max_time_iters; ++time_iter)
  {
    int t = thread_idx + time_iter * blockDim.x;
    if (COALESCE)
    {  // Fill entire shared mem sequentially using sequential threads_idx
      int amount_to_fill = (time_iter + 1) * blockDim.x > num_timesteps ? num_timesteps % blockDim.x : blockDim.x;
      mp1::loadArrayParallel(
          y_shared, blockDim.x * thread_idz * COST_T::OUTPUT_DIM, y_d,
          ((num_rollouts * thread_idz + global_idx) * num_timesteps + time_iter * blockDim.x) * COST_T::OUTPUT_DIM,
          COST_T::OUTPUT_DIM * amount_to_fill);
    }
    else if (t < num_timesteps)
    {  // t = num_timesteps is the terminal state for outside this for-loop
      sample_time_offset = (num_rollouts * thread_idz + global_idx) * num_timesteps + t;
      mp1::loadArrayParallel(y, 0, y_d, sample_time_offset * COST_T::OUTPUT_DIM);
    }
    if (t < num_timesteps)
    {  // load controls from t = 0 to t = num_timesteps - 1
      int candidate_t = min(t + stride, num_timesteps - 1);
      sampling->readControlSample(candidate_sample_idx, candidate_t, distribution_idx, u, theta_d, blockDim.y,
                                  thread_idy, y);
    }
#ifdef USE_CUDA_BARRIERS_COST
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif

    // Compute cost (running cost plus the sampler's likelihood-ratio penalty)
    if (t < num_timesteps)
    {
      running_cost[0] += costs->computeRunningCost(y, u, t, theta_c, crash_status) +
                         sampling->computeLikelihoodRatioCost(u, theta_d, global_idx, t, distribution_idx, lambda,
                                                              alpha);
    }
#ifdef USE_CUDA_BARRIERS_COST
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
  }

  // Add all costs together
  running_cost = &running_cost_shared[blockDim.x * blockDim.y * thread_idz];
  __syncthreads();
  costArrayReduction(running_cost, blockDim.x * blockDim.y, thread_idx + blockDim.x * thread_idy,
                     blockDim.x * blockDim.y, thread_idx == blockDim.x - 1 && thread_idy == 0);
  // point every thread to the last output at t = NUM_TIMESTEPS for terminal cost calculation
  const int last_y_index = (num_timesteps - 1) % blockDim.x;
  y = &y_shared[(blockDim.x * thread_idz + last_y_index) * COST_T::OUTPUT_DIM];
  // Compute terminal cost and the final cost for each thread
  computeAndSaveCost(num_rollouts, num_timesteps, global_idx, costs, y, running_cost[0] / (num_timesteps), theta_c,
                     trajectory_costs_d);
}

// Combined candidate-evaluation kernel: dynamics propagation AND cost accumulation
// in one launch (the fused alternative to initEvalDynKernel + initEvalCostKernel).
template __global__ void initEvalKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs,
                                        SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps,
                                        const int num_rollouts, int samples_per_condition, float lambda, float alpha,
                                        const int* __restrict__ strides_d, const float* __restrict__ states_d,
                                        float* __restrict__ trajectory_costs_d)
{
  // Get thread and block id
  const int global_idx = blockDim.x * blockIdx.x + threadIdx.x;
  const int distribution_idx = threadIdx.z;
  const int candidate_idx = global_idx / samples_per_condition;
  const int candidate_sample_idx = global_idx % samples_per_condition;
  const int tdy = threadIdx.y;
  const int size_of_theta_s_bytes = calcClassSharedMemSize(dynamics, blockDim);
  const int size_of_theta_d_bytes = calcClassSharedMemSize(sampling, blockDim);
  const int size_of_theta_c_bytes = calcClassSharedMemSize(costs, blockDim);
  const int shared_idx = blockDim.x * threadIdx.z + threadIdx.x;
  const int distribution_dim = blockDim.z;
  const int sample_dim = blockDim.x;

  // Create shared state and control arrays
  extern __shared__ float entire_buffer[];
  float* x_shared = entire_buffer;
  float* x_next_shared =
// --- continuation of initEvalKernel's shared-memory layout (head starts above) ---
// Layout: x | x_next | y | xdot | u | dyn theta | sampler theta | cost theta |
//         running cost | crash status | [barriers]
&x_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* y_shared = &x_next_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* x_dot_shared = &y_shared[math::nearest_multiple_4(sample_dim * DYN_T::OUTPUT_DIM * distribution_dim)];
  float* u_shared = &x_dot_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* theta_s_shared = &u_shared[math::nearest_multiple_4(sample_dim * DYN_T::CONTROL_DIM * distribution_dim)];
  float* theta_d_shared = &theta_s_shared[size_of_theta_s_bytes / sizeof(float)];
  float* theta_c_shared = &theta_d_shared[size_of_theta_d_bytes / sizeof(float)];
  float* running_cost_shared = &theta_c_shared[size_of_theta_c_bytes / sizeof(float)];
  int* crash_status_shared =
      (int*)&running_cost_shared[math::nearest_multiple_4(blockDim.x * blockDim.y * blockDim.z)];
#ifdef USE_CUDA_BARRIERS_ROLLOUT
  barrier* barrier_shared = (barrier*)&crash_status_shared[math::nearest_multiple_4(sample_dim * distribution_dim)];
#endif

  // Create local state, state dot and controls
  // (NOTE(review): reinterpret_cast lost its template argument to extraction stripping)
  int running_cost_index = threadIdx.x + blockDim.x * (tdy + blockDim.y * threadIdx.z);
  float* x = &(reinterpret_cast(x_shared)[shared_idx * DYN_T::STATE_DIM]);
  float* x_next = &(reinterpret_cast(x_next_shared)[shared_idx * DYN_T::STATE_DIM]);
  float* x_temp;
  float* xdot = &(reinterpret_cast(x_dot_shared)[shared_idx * DYN_T::STATE_DIM]);
  float* u = &(reinterpret_cast(u_shared)[shared_idx * DYN_T::CONTROL_DIM]);
  float* y = &(reinterpret_cast(y_shared)[shared_idx * DYN_T::OUTPUT_DIM]);
  float* running_cost = &running_cost_shared[running_cost_index];
  running_cost[0] = 0.0f;
  int* crash_status = &crash_status_shared[shared_idx];
  crash_status[0] = 0;  // We have not crashed yet as of the first trajectory.

  // Load global array to shared array: this candidate's initial state
  mppi::p1::loadArrayParallel(x, 0, states_d, candidate_idx * DYN_T::STATE_DIM);
  int stride = strides_d[candidate_idx];
  int p_index, step;
  mppi::p1::getParallel1DIndex(p_index, step);
  for (int i = p_index; i < DYN_T::CONTROL_DIM; i += step)
  {
    u[i] = 0;
  }
  for (int i = p_index; i < DYN_T::OUTPUT_DIM; i += step)
  {
    y[i] = 0;
  }
  __syncthreads();
#ifdef USE_CUDA_BARRIERS_ROLLOUT
  barrier* bar = &barrier_shared[(shared_idx)];
  if (tdy == 0)
  {
    init(bar, blockDim.y);
  }
#endif

  /*<----Start of simulation loop-----> */
  dynamics->initializeDynamics(x, u, y, theta_s_shared, 0.0, dt);
  sampling->initializeDistributions(y, 0.0, dt, theta_d_shared);
  costs->initializeCosts(y, u, theta_c_shared, 0.0f, dt);
  __syncthreads();
  for (int t = 0; t < num_timesteps; t++)
  {
    // Load noise trajectories scaled by the exploration factor
    int candidate_t = min(t + stride, num_timesteps - 1);
    sampling->readControlSample(candidate_sample_idx, candidate_t, distribution_idx, u, theta_d_shared, blockDim.y,
                                tdy, y);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif

    // applies constraints as defined in dynamics.cuh see specific dynamics class for what happens here
    dynamics->enforceConstraints(x, u);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif

    // Increment states
    dynamics->step(x, x_next, xdot, u, y, theta_s_shared, t, dt);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // running cost plus the sampler's likelihood-ratio penalty for this step
    running_cost[0] += costs->computeRunningCost(y, u, t, theta_c_shared, crash_status) +
                       sampling->computeLikelihoodRatioCost(u, theta_d_shared, global_idx, t, distribution_idx,
                                                            lambda, alpha);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // swap current/next state buffers instead of copying
    x_temp = x;
    x = x_next;
    x_next = x_temp;
  }
  // Add all costs together (reduce over the y-dimension for this (x, z) thread)
  running_cost = &running_cost_shared[threadIdx.x + blockDim.x * blockDim.y * threadIdx.z];
  __syncthreads();
  costArrayReduction(running_cost, blockDim.y, tdy, blockDim.y, tdy == 0, blockDim.x);
  // Compute terminal cost and the final cost for each thread
  computeAndSaveCost(num_rollouts, num_timesteps, global_idx, costs, y, running_cost[0] / (num_timesteps),
                     theta_c_shared, trajectory_costs_d);
}

// R-MPPI rollout dynamics kernel with feedback: the nominal-state slice
// (thread_idz == NOMINAL_STATE_IDX) evolves without feedback; every other slice
// adds the feedback controller's correction K(x, x_nom) before stepping.
template __global__ void rolloutRMPPIDynamicsKernel(DYN_T* __restrict__ dynamics, FB_T* __restrict__ fb_controller,
                                                    SAMPLING_T* __restrict__ sampling, float dt,
                                                    const int num_timesteps, const int num_rollouts,
                                                    const float* __restrict__ init_x_d, float* __restrict__ y_d)
{
  // Get thread and block id
  const int thread_idx = threadIdx.x;
  const int thread_idy = threadIdx.y;
  const int thread_idz = threadIdx.z;
  const int block_idx = blockIdx.x;
  const int global_idx = blockDim.x * block_idx + thread_idx;
  const int shared_idx = blockDim.x * thread_idz + thread_idx;
  // index of this sample's slot in the nominal-state (z = NOMINAL_STATE_IDX) slice
  const int shared_nom_idx = blockDim.x * NOMINAL_STATE_IDX + thread_idx;
  const int distribution_idx = threadIdx.z;
  const int distribution_dim = blockDim.z;
  const int sample_dim = blockDim.x;
  const int size_of_theta_s_bytes = calcClassSharedMemSize(dynamics, blockDim);
  const int size_of_theta_d_bytes = calcClassSharedMemSize(sampling, blockDim);
  const int size_of_theta_fb_bytes = calcClassSharedMemSize(fb_controller, blockDim);

  // Create shared state and control arrays
  extern __shared__ float entire_buffer[];
  float* x_shared = entire_buffer;
  float* x_next_shared = &x_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* y_shared = &x_next_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* x_dot_shared = &y_shared[math::nearest_multiple_4(sample_dim * DYN_T::OUTPUT_DIM * distribution_dim)];
  float* u_shared = &x_dot_shared[math::nearest_multiple_4(sample_dim * DYN_T::STATE_DIM * distribution_dim)];
  float* theta_s_shared = &u_shared[math::nearest_multiple_4(sample_dim * DYN_T::CONTROL_DIM * distribution_dim)];
  float* theta_d_shared = &theta_s_shared[size_of_theta_s_bytes / sizeof(float)];
  float*
// --- continuation of rolloutRMPPIDynamicsKernel (head starts above) ---
// feedback controller's shared-memory region follows the sampler's
theta_fb = &theta_d_shared[size_of_theta_d_bytes / sizeof(float)];
#ifdef USE_CUDA_BARRIERS_DYN
  barrier* barrier_shared = (barrier*)&theta_fb[size_of_theta_fb_bytes / sizeof(float)];
#endif

  // Create local state, state dot and controls
  // (NOTE(review): reinterpret_cast lost its template argument to extraction stripping)
  float* x = &(reinterpret_cast(x_shared)[shared_idx * DYN_T::STATE_DIM]);
  float* x_next = &(reinterpret_cast(x_next_shared)[shared_idx * DYN_T::STATE_DIM]);
  // views into the nominal-state slice used to compute feedback K(x, x_nom)
  float* x_nom = &(reinterpret_cast(x_shared)[shared_nom_idx * DYN_T::STATE_DIM]);
  float* x_nom_next = &(reinterpret_cast(x_next_shared)[shared_nom_idx * DYN_T::STATE_DIM]);
  float* x_temp;
  float* xdot = &(reinterpret_cast(x_dot_shared)[shared_idx * DYN_T::STATE_DIM]);
  float* u = &(reinterpret_cast(u_shared)[shared_idx * DYN_T::CONTROL_DIM]);
  float* y = &(reinterpret_cast(y_shared)[shared_idx * DYN_T::OUTPUT_DIM]);
#ifdef USE_CUDA_BARRIERS_DYN
  barrier* bar = &barrier_shared[shared_idx];
  if (thread_idy == 0)
  {
    init(bar, blockDim.y);
  }
#endif

  // The array to hold K(x,x*)
  float fb_control[DYN_T::CONTROL_DIM];
  int i;

  // Load global array to shared array
  ::mppi::kernels::loadGlobalToShared(num_rollouts, blockDim.y, global_idx, thread_idy, thread_idz, init_x_d, x, xdot,
                                      u);
  __syncthreads();

  /*<----Start of simulation loop-----> */
  dynamics->initializeDynamics(x, u, y, theta_s_shared, 0.0, dt);
  sampling->initializeDistributions(y, 0.0, dt, theta_d_shared);
  fb_controller->initializeFeedback(x, u, theta_fb, 0.0, dt);
  __syncthreads();
  for (int t = 0; t < num_timesteps; t++)
  {
    // Load noise trajectories scaled by the exploration factor
    sampling->readControlSample(global_idx, t, distribution_idx, u, theta_d_shared, blockDim.y, thread_idy, y);
#ifdef USE_CUDA_BARRIERS_DYN
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif

    // Now find feedback control
    for (i = 0; i < DYN_T::CONTROL_DIM; i++)
    {
      fb_control[i] = 0;
    }
    // we do not apply feedback on the nominal state
    if (thread_idz != NOMINAL_STATE_IDX)
    {
      fb_controller->k(x, x_nom, t, theta_fb, fb_control);
    }
    for (i = thread_idy; i < DYN_T::CONTROL_DIM; i += blockDim.y)
    {
      u[i] += fb_control[i];
    }
#ifdef USE_CUDA_BARRIERS_DYN
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif

    // applies constraints as defined in dynamics.cuh see specific dynamics class for what happens here
    // usually just control clamping
    dynamics->enforceConstraints(x, u);
#ifdef USE_CUDA_BARRIERS_DYN
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // copy back feedback-filled controls to global memory
    sampling->writeControlSample(global_idx, t, distribution_idx, u, theta_d_shared, blockDim.y, thread_idy, y);

    // Increment states
    dynamics->step(x, x_next, xdot, u, y, theta_s_shared, t, dt);
#ifdef USE_CUDA_BARRIERS_DYN
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // swap real and nominal current/next state buffers instead of copying
    x_temp = x;
    x = x_next;
    x_next = x_temp;
    x_temp = x_nom;
    x_nom = x_nom_next;
    x_nom_next = x_temp;
    // Copy state to global memory
    int sample_time_offset = (num_rollouts * thread_idz + global_idx) * num_timesteps + t;
    mp1::loadArrayParallel(y_d, sample_time_offset * DYN_T::OUTPUT_DIM, y, 0);
  }
}

// R-MPPI cost kernel: evaluates the real and nominal systems' costs over the
// precomputed outputs y_d, tracking a second accumulator (running_cost_extra)
// for the feedback / likelihood-ratio bookkeeping the R-MPPI update needs.
template __global__ void rolloutRMPPICostKernel(COST_T* __restrict__ costs, DYN_T* __restrict__ dynamics,
                                                FB_T* __restrict__ fb_controller, SAMPLING_T* __restrict__ sampling,
                                                float dt, const int num_timesteps, const int num_rollouts,
                                                float lambda, float alpha, float value_func_threshold,
                                                const float* __restrict__ init_x_d, const float* __restrict__ y_d,
                                                float* __restrict__ trajectory_costs_d)
{
  // Get thread and block id
  const int thread_idx = threadIdx.x;
  const int thread_idy = threadIdx.y;
  const int thread_idz = threadIdx.z;
  const int global_idx = blockIdx.x;
  const int distribution_idx = threadIdx.z;
  const int shared_idx = blockDim.x * thread_idz + thread_idx;
  const int num_shared = blockDim.x * blockDim.z;
  const int size_of_theta_s_bytes = calcClassSharedMemSize(dynamics, blockDim);
  const int size_of_theta_d_bytes = calcClassSharedMemSize(sampling, blockDim);
  const int size_of_theta_c_bytes = calcClassSharedMemSize(costs, blockDim);
  const int
// --- continuation of rolloutRMPPICostKernel (head starts above; body is cut off
// below at the end of this chunk) ---
size_of_theta_fb_bytes = calcClassSharedMemSize(fb_controller, blockDim);

  // Create shared state and control arrays
  // Layout: y | u | running cost (x2: real + extra) | crash status |
  //         cost theta | sampler theta | feedback theta | [barriers]
  extern __shared__ float entire_buffer[];
  float* y_shared = entire_buffer;
  float* u_shared = &y_shared[math::nearest_multiple_4(num_shared * COST_T::OUTPUT_DIM)];
  float* running_cost_shared = &u_shared[math::nearest_multiple_4(num_shared * COST_T::CONTROL_DIM)];
  // twice the usual running-cost space: a second accumulator per thread
  int* crash_status_shared = (int*)&running_cost_shared[math::nearest_multiple_4(num_shared * 2 * blockDim.y)];
  float* theta_c = (float*)&crash_status_shared[math::nearest_multiple_4(num_shared)];
  float* theta_d = &theta_c[size_of_theta_c_bytes / sizeof(float)];
  float* theta_fb = &theta_d[size_of_theta_d_bytes / sizeof(float)];
#ifdef USE_CUDA_BARRIERS_COST
  barrier* barrier_shared = (barrier*)&theta_fb[size_of_theta_fb_bytes / sizeof(float)];
#endif

  // Initialize running cost and total cost
  int sample_time_offset = 0;

  // The array to hold K(x,x*)
  float fb_control[COST_T::CONTROL_DIM];

  // Load global array to shared array
  float* y = &y_shared[shared_idx * COST_T::OUTPUT_DIM];
  // this sample's counterpart in the nominal-state slice
  float* y_nom = &y_shared[(blockDim.x * NOMINAL_STATE_IDX + thread_idx) * COST_T::OUTPUT_DIM];
  float* u = &u_shared[shared_idx * COST_T::CONTROL_DIM];
  int* crash_status = &crash_status_shared[shared_idx];
  const int cost_index = blockDim.x * (thread_idz * blockDim.y + thread_idy) + thread_idx;
  float* running_cost = &running_cost_shared[cost_index];
  // second accumulator lives num_shared * blockDim.y floats past the first
  float* running_cost_extra = &running_cost_shared[cost_index + num_shared * blockDim.y];
  crash_status[0] = 0;  // We have not crashed yet as of the first trajectory.
  running_cost[0] = 0;
  running_cost_extra[0] = 0;
#ifdef USE_CUDA_BARRIERS_COST
  barrier* bar = &barrier_shared[(blockDim.x * thread_idz + thread_idx)];
  if (thread_idy == 0)
  {
    init(bar, blockDim.y);
  }
#endif
  float curr_cost = 0.0f;

  /*<----Start of simulation loop-----> */
  const int max_time_iters = ceilf((float)num_timesteps / blockDim.x);
  costs->initializeCosts(y, u, theta_c, 0.0, dt);
  sampling->initializeDistributions(y, 0.0, dt, theta_d);
  fb_controller->initializeFeedback(y, u, theta_fb, 0.0, dt);
  __syncthreads();
  for (int time_iter = 0; time_iter < max_time_iters; ++time_iter)
  {
    int t = thread_idx + time_iter * blockDim.x;
    if (COALESCE)
    {  // Fill entire shared mem sequentially using sequential threads_idx
      int amount_to_fill = (time_iter + 1) * blockDim.x > num_timesteps ? num_timesteps % blockDim.x : blockDim.x;
      mp1::loadArrayParallel(
          y_shared, blockDim.x * thread_idz * COST_T::OUTPUT_DIM, y_d,
          ((num_rollouts * thread_idz + global_idx) * num_timesteps + time_iter * blockDim.x) * COST_T::OUTPUT_DIM,
          COST_T::OUTPUT_DIM * amount_to_fill);
    }
    else if (t < num_timesteps)
    {  // t = num_timesteps is the terminal state for outside this for-loop
      sample_time_offset = (num_rollouts * thread_idz + global_idx) * num_timesteps + t;
      mp1::loadArrayParallel(y, 0, y_d, sample_time_offset * COST_T::OUTPUT_DIM);
    }
    if (t < num_timesteps)
    {  // load controls from t = 0 to t = num_timesteps - 1
      sampling->readControlSample(global_idx, t, distribution_idx, u, theta_d, blockDim.y, thread_idy, y);
    }
#ifdef USE_CUDA_BARRIERS_COST
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif

    // Get feedback: recover states from outputs, then K(x, x_nom)
    float x[DYN_T::STATE_DIM];
    float x_nom[DYN_T::STATE_DIM];
    if (t < num_timesteps)
    {
      dynamics->outputToState(y, x);
      dynamics->outputToState(y_nom, x_nom);
      fb_controller->k(x, x_nom, t, theta_fb, fb_control);
    }
#ifdef USE_CUDA_BARRIERS_COST
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif

    // Compute cost
    if (t < num_timesteps)
    {
      curr_cost = costs->computeRunningCost(y, u, t, theta_c, crash_status);
    }
    // nominal system: state cost in running_cost, likelihood-ratio cost separately
    if (thread_idz == NOMINAL_STATE_IDX && t < num_timesteps)
    {
      running_cost[0] += curr_cost;
      running_cost_extra[0] +=
          sampling->computeLikelihoodRatioCost(u, theta_d, global_idx, t, distribution_idx, lambda, alpha);
    }
    // real system: full cost in running_cost, cost-with-feedback-penalty in extra
    if (thread_idz != NOMINAL_STATE_IDX && t < num_timesteps)
    {
      running_cost[0] +=
          curr_cost + sampling->computeLikelihoodRatioCost(u, theta_d, global_idx, t, distribution_idx, lambda, alpha);
      running_cost_extra[0] +=
          curr_cost + sampling->computeFeedbackCost(fb_control, theta_d, t, distribution_idx, lambda, alpha);
    }
    // We need running_state_cost_nom and running_control_cost_nom for the nominal system
    // We need running_cost_real and running_cost_feedback for the real system
    // Nominal system needs to know running_state_cost_nom, running_control_cost_nom, and running_cost_feedback
    // Real system needs to know running_cost_real
#ifdef USE_CUDA_BARRIERS_COST
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
  }

  // Add all costs together (both accumulators reduced in lock-step)
  running_cost = &running_cost_shared[blockDim.x * blockDim.y * thread_idz];
  running_cost_extra = &running_cost_shared[blockDim.x * blockDim.y * (blockDim.z + thread_idz)];
  __syncthreads();
  multiCostArrayReduction(running_cost, running_cost_extra, blockDim.x * blockDim.y,
                          thread_idx + blockDim.x * thread_idy, blockDim.x * blockDim.y,
                          thread_idx == blockDim.x - 1 && thread_idy == 0);
  // point every thread to the last output at t = NUM_TIMESTEPS for terminal cost calculation
  const int last_y_index = (num_timesteps - 1) % blockDim.x;
  y = &y_shared[(blockDim.x * thread_idz + last_y_index) * COST_T::OUTPUT_DIM];
  if (thread_idx == 0 && thread_idy == 0)
  {
    *running_cost += costs->terminalCost(y, theta_c);
    if (thread_idz != NOMINAL_STATE_IDX)
    {
      *running_cost_extra += costs->terminalCost(y, theta_c);
    }
    *running_cost /= ((float)num_timesteps);
    *running_cost_extra /= ((float)num_timesteps);
  }
  __syncthreads();
  // (body continues past the end of this chunk)
  if (thread_idz != NOMINAL_STATE_IDX && thread_idx == 0 && thread_idy == 0)
  {
    float* running_nom_cost =
&running_cost_shared[blockDim.x * blockDim.y * NOMINAL_STATE_IDX]; const float* running_nom_cost_likelihood_ratio_cost = &running_cost_shared[blockDim.x * blockDim.y * (blockDim.z + NOMINAL_STATE_IDX)]; *running_nom_cost = 0.5 * *running_nom_cost + 0.5 * fmaxf(fminf(*running_cost_extra, value_func_threshold), *running_nom_cost); *running_nom_cost += *running_nom_cost_likelihood_ratio_cost; } __syncthreads(); if (thread_idy == 0 && thread_idx == 0) { trajectory_costs_d[global_idx + thread_idz * num_rollouts] = running_cost[0]; } } template __global__ void rolloutRMPPIKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs, FB_T* __restrict__ fb_controller, SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps, const int num_rollouts, float lambda, float alpha, float value_func_threshold, const float* __restrict__ init_x_d, float* __restrict__ trajectory_costs_d) { // Get thread and block id const int thread_idx = threadIdx.x; const int thread_idy = threadIdx.y; const int thread_idz = threadIdx.z; const int block_idx = blockIdx.x; const int global_idx = blockDim.x * block_idx + thread_idx; const int shared_idx = blockDim.x * thread_idz + thread_idx; const int shared_nom_idx = blockDim.x * NOMINAL_STATE_IDX + thread_idx; const int distribution_idx = threadIdx.z; const int num_shared = blockDim.x * blockDim.z; const int size_of_theta_s_bytes = calcClassSharedMemSize(dynamics, blockDim); const int size_of_theta_d_bytes = calcClassSharedMemSize(sampling, blockDim); const int size_of_theta_c_bytes = calcClassSharedMemSize(costs, blockDim); const int size_of_theta_fb_bytes = calcClassSharedMemSize(fb_controller, blockDim); // Create shared state and control arrays extern __shared__ float entire_buffer[]; float* x_shared = entire_buffer; float* x_next_shared = &x_shared[math::nearest_multiple_4(num_shared * DYN_T::STATE_DIM)]; float* y_shared = &x_next_shared[math::nearest_multiple_4(num_shared * DYN_T::STATE_DIM)]; float* x_dot_shared = 
&y_shared[math::nearest_multiple_4(num_shared * DYN_T::OUTPUT_DIM)];
  float* u_shared = &x_dot_shared[math::nearest_multiple_4(num_shared * DYN_T::STATE_DIM)];
  float* theta_s_shared = &u_shared[math::nearest_multiple_4(num_shared * DYN_T::CONTROL_DIM)];
  float* theta_d_shared = &theta_s_shared[size_of_theta_s_bytes / sizeof(float)];
  float* theta_c_shared = &theta_d_shared[size_of_theta_d_bytes / sizeof(float)];
  float* theta_fb = &theta_c_shared[size_of_theta_c_bytes / sizeof(float)];
  // two cost accumulators per (x,y,z)-thread, hence the factor of 2
  float* running_cost_shared = &theta_fb[size_of_theta_fb_bytes / sizeof(float)];
  int* crash_status_shared = (int*)&running_cost_shared[math::nearest_multiple_4(num_shared * 2 * blockDim.y)];
#ifdef USE_CUDA_BARRIERS_ROLLOUT
  barrier* barrier_shared = (barrier*)&crash_status_shared[math::nearest_multiple_4(num_shared)];
#endif
  // Create local state, state dot and controls
  float* x = &(reinterpret_cast(x_shared)[shared_idx * DYN_T::STATE_DIM]);
  float* x_next = &(reinterpret_cast(x_next_shared)[shared_idx * DYN_T::STATE_DIM]);
  // aliases into the nominal system's slice of the same buffers
  float* x_nom = &(reinterpret_cast(x_shared)[shared_nom_idx * DYN_T::STATE_DIM]);
  float* x_nom_next = &(reinterpret_cast(x_next_shared)[shared_nom_idx * DYN_T::STATE_DIM]);
  float* x_temp;
  float* xdot = &(reinterpret_cast(x_dot_shared)[shared_idx * DYN_T::STATE_DIM]);
  float* u = &(reinterpret_cast(u_shared)[shared_idx * DYN_T::CONTROL_DIM]);
  float* y = &(reinterpret_cast(y_shared)[shared_idx * DYN_T::OUTPUT_DIM]);
  int* crash_status = &crash_status_shared[shared_idx];
  const int cost_index = blockDim.x * (thread_idz * blockDim.y + thread_idy) + thread_idx;
  float* running_cost = &running_cost_shared[cost_index];
  float* running_cost_extra = &running_cost_shared[cost_index + num_shared * blockDim.y];
  crash_status[0] = 0;  // We have not crashed yet as of the first trajectory.
  running_cost[0] = 0;
  running_cost_extra[0] = 0;
#ifdef USE_CUDA_BARRIERS_ROLLOUT
  barrier* bar = &barrier_shared[shared_idx];
  if (thread_idy == 0)
  {
    init(bar, blockDim.y);
  }
#endif
  // The array to hold K(x,x*)
  float fb_control[DYN_T::CONTROL_DIM];
  int i;
  float curr_cost = 0.0f;
  // Load global array to shared array
  ::mppi::kernels::loadGlobalToShared(num_rollouts, blockDim.y, global_idx, thread_idy, thread_idz, init_x_d, x, xdot,
                                      u);
  __syncthreads();
  /*<----Start of simulation loop-----> */
  dynamics->initializeDynamics(x, u, y, theta_s_shared, 0.0, dt);
  sampling->initializeDistributions(y, 0.0, dt, theta_d_shared);
  costs->initializeCosts(y, u, theta_c_shared, 0.0, dt);
  fb_controller->initializeFeedback(x, u, theta_fb, 0.0, dt);
  __syncthreads();
  for (int t = 0; t < num_timesteps; t++)
  {
    // Load noise trajectories scaled by the exploration factor
    sampling->readControlSample(global_idx, t, distribution_idx, u, theta_d_shared, blockDim.y, thread_idy, y);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // Now find feedback control
    for (i = 0; i < DYN_T::CONTROL_DIM; i++)
    {
      fb_control[i] = 0;
    }
    // we do not apply feedback on the nominal state
    if (thread_idz != NOMINAL_STATE_IDX)
    {
      fb_controller->k(x, x_nom, t, theta_fb, fb_control);
    }
    for (i = thread_idy; i < DYN_T::CONTROL_DIM; i += blockDim.y)
    {
      u[i] += fb_control[i];
    }
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // applies constraints as defined in dynamics.cuh see specific dynamics class for what happens here
    // usually just control clamping
    dynamics->enforceConstraints(x, u);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // copy back feedback-filled controls to global memory
    sampling->writeControlSample(global_idx, t, distribution_idx, u, theta_d_shared, blockDim.y, thread_idy, y);
    // Increment states
    dynamics->step(x, x_next, xdot, u, y, theta_s_shared, t, dt);
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // Compute cost
    if (t < num_timesteps)
    {
      curr_cost = costs->computeRunningCost(y, u, t, theta_c_shared, crash_status);
    }
    // nominal system: running_cost = state cost, extra = likelihood-ratio cost
    if (thread_idz == NOMINAL_STATE_IDX && t < num_timesteps)
    {
      running_cost[0] += curr_cost;
      running_cost_extra[0] +=
          sampling->computeLikelihoodRatioCost(u, theta_d_shared, global_idx, t, distribution_idx, lambda, alpha);
    }
    // real system: running_cost = state + likelihood-ratio, extra = state + feedback cost
    if (thread_idz != NOMINAL_STATE_IDX && t < num_timesteps)
    {
      running_cost[0] += curr_cost + sampling->computeLikelihoodRatioCost(u, theta_d_shared, global_idx, t,
                                                                          distribution_idx, lambda, alpha);
      running_cost_extra[0] +=
          curr_cost + sampling->computeFeedbackCost(fb_control, theta_d_shared, t, distribution_idx, lambda, alpha);
    }
    // We need running_state_cost_nom and running_control_cost_nom for the nominal system
    // We need running_cost_real and running_cost_feedback for the real system
    // Nominal system needs to know running_state_cost_nom, running_control_cost_nom, and running_cost_feedback
    // Real system needs to know running_cost_real
#ifdef USE_CUDA_BARRIERS_ROLLOUT
    bar->arrive_and_wait();
#else
    __syncthreads();
#endif
    // swap double buffers for the next timestep
    x_temp = x;
    x = x_next;
    x_next = x_temp;
    x_temp = x_nom;
    x_nom = x_nom_next;
    x_nom_next = x_temp;
  }
  // Add all costs together: reduce over the blockDim.y partial sums of each rollout
  running_cost = &running_cost_shared[thread_idx + blockDim.x * blockDim.y * thread_idz];
  running_cost_extra = &running_cost_shared[thread_idx + blockDim.x * blockDim.y * (blockDim.z + thread_idz)];
  __syncthreads();
  multiCostArrayReduction(running_cost, running_cost_extra, blockDim.y, thread_idy, blockDim.y, thread_idy == 0,
                          blockDim.x);
  if (thread_idy == 0)
  {
    *running_cost += costs->terminalCost(y, theta_c_shared);
    if (thread_idz != NOMINAL_STATE_IDX)
    {
      *running_cost_extra += costs->terminalCost(y, theta_c_shared);
    }
    *running_cost /= ((float)num_timesteps);
    *running_cost_extra /= ((float)num_timesteps);
  }
  __syncthreads();
  if (thread_idz != NOMINAL_STATE_IDX && thread_idy == 0)
  {
    // RMPPI nominal-cost update: blend nominal cost with the threshold-clamped real feedback cost
    float* running_nom_cost = &running_cost_shared[blockDim.x * blockDim.y * NOMINAL_STATE_IDX];
    const float* running_nom_cost_likelihood_ratio_cost =
        &running_cost_shared[blockDim.x * blockDim.y * (blockDim.z + NOMINAL_STATE_IDX)];
    *running_nom_cost =
        0.5 * *running_nom_cost + 0.5 * fmaxf(fminf(*running_cost_extra, value_func_threshold), *running_nom_cost);
    *running_nom_cost += *running_nom_cost_likelihood_ratio_cost;
  }
  __syncthreads();
  if (thread_idy == 0)
  {
    trajectory_costs_d[global_idx + thread_idz * num_rollouts] = running_cost[0];
  }
}

/**
 * Launches the two-pass init-eval pipeline: a dynamics kernel that fills y_d followed by a cost
 * kernel that reduces y_d into per-rollout costs. Validates block-dimension preconditions first
 * and exits the process on violation.
 */
template
void launchSplitInitEvalKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs,
                               SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps,
                               const int num_rollouts, float lambda, float alpha, int samples_per_condition,
                               int* __restrict__ strides_d, float* __restrict__ init_x_d, float* __restrict__ y_d,
                               float* __restrict__ trajectory_costs, dim3 dimDynBlock, dim3 dimCostBlock,
                               cudaStream_t stream, bool synchronize)
{
  if (num_rollouts % dimDynBlock.x != 0)
  {
    std::cerr << __FILE__ << " (" << __LINE__ << "): num_rollouts (" << num_rollouts
              << ") must be evenly divided by eval dynamics block size x (" << dimDynBlock.x << ")" << std::endl;
    exit(EXIT_FAILURE);
  }
  if (num_timesteps < dimCostBlock.x)
  {
    std::cerr << __FILE__ << " (" << __LINE__ << "): num_timesteps (" << num_timesteps
              << ") must be greater than or equal to eval cost block size x (" << dimCostBlock.x << ")" << std::endl;
    exit(EXIT_FAILURE);
  }
  // Run Dynamics
  const int gridsize_x = math::int_ceil(num_rollouts, dimDynBlock.x);
  dim3 dimGrid(gridsize_x, 1, 1);
  unsigned dynamics_shared_size = calcEvalDynKernelSharedMemSize(dynamics, sampling, dimDynBlock);
  initEvalDynKernel<<>>(dynamics->model_d_, sampling->sampling_d_, dt, num_timesteps, num_rollouts,
                        samples_per_condition, strides_d, init_x_d, y_d);
  // Run Costs (one block per rollout)
  dim3 dimCostGrid(num_rollouts, 1, 1);
  unsigned cost_shared_size =
      calcEvalCostKernelSharedMemSize(costs, sampling, num_rollouts, samples_per_condition, dimCostBlock);
initEvalCostKernel<<>>( costs->cost_d_, sampling->sampling_d_, dt, num_timesteps, num_rollouts, lambda, alpha, samples_per_condition, strides_d, y_d, trajectory_costs); HANDLE_ERROR(cudaGetLastError()); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(stream)); } } template void launchInitEvalKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps, const int num_rollouts, float lambda, float alpha, int samples_per_condition, int* __restrict__ strides_d, float* __restrict__ init_x_d, float* __restrict__ trajectory_costs, dim3 dimBlock, cudaStream_t stream, bool synchronize) { if (num_rollouts % dimBlock.x != 0) { std::cerr << __FILE__ << " (" << __LINE__ << "): num_rollouts (" << num_rollouts << ") must be evenly divided by eval kernel block size x (" << dimBlock.x << ")" << std::endl; exit(EXIT_FAILURE); } // Run initEvalKernel const int gridsize_x = math::int_ceil(num_rollouts, dimBlock.x); dim3 dimGrid(gridsize_x, 1, 1); unsigned shared_size = calcEvalCombinedKernelSharedMemSize(dynamics, costs, sampling, num_rollouts, samples_per_condition, dimBlock); initEvalKernel<<>>( dynamics->model_d_, costs->cost_d_, sampling->sampling_d_, dt, num_timesteps, num_rollouts, samples_per_condition, lambda, alpha, strides_d, init_x_d, trajectory_costs); HANDLE_ERROR(cudaGetLastError()); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(stream)); } } template void launchSplitRMPPIRolloutKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling, FB_T* __restrict__ fb_controller, float dt, const int num_timesteps, const int num_rollouts, float lambda, float alpha, float value_func_threshold, float* __restrict__ init_x_d, float* __restrict__ y_d, float* __restrict__ trajectory_costs, dim3 dimDynBlock, dim3 dimCostBlock, cudaStream_t stream, bool synchronize) { if (num_rollouts % dimDynBlock.x != 0) { std::cerr << __FILE__ << " (" << __LINE__ << "): 
num_rollouts (" << num_rollouts << ") must be evenly divided by dynamics block size x (" << dimDynBlock.x << ")" << std::endl; exit(EXIT_FAILURE); } if (num_timesteps < dimCostBlock.x) { std::cerr << __FILE__ << " (" << __LINE__ << "): num_timesteps (" << num_timesteps << ") must be greater than or equal to cost block size x (" << dimCostBlock.x << ")" << std::endl; exit(EXIT_FAILURE); } if (dimDynBlock.z < 2 || dimDynBlock.z != dimCostBlock.z) { std::cerr << __FILE__ << " (" << __LINE__ << "): number of dynamics systems (" << dimDynBlock.z << ") and cost systems (" << dimCostBlock.z << ") must be equal to each other and greater than 2" << std::endl; exit(EXIT_FAILURE); } // Run Dynamics const int gridsize_x = math::int_ceil(num_rollouts, dimDynBlock.x); dim3 dimGrid(gridsize_x, 1, 1); unsigned dynamics_shared_size = calcRMPPIDynKernelSharedMemSize(dynamics, sampling, fb_controller, dimDynBlock); rolloutRMPPIDynamicsKernel <<>>(dynamics->model_d_, fb_controller->feedback_d_, sampling->sampling_d_, dt, num_timesteps, num_rollouts, init_x_d, y_d); HANDLE_ERROR(cudaGetLastError()); // Run Costs dim3 dimCostGrid(num_rollouts, 1, 1); unsigned cost_shared_size = calcRMPPICostKernelSharedMemSize(costs, sampling, fb_controller, dimCostBlock); rolloutRMPPICostKernel <<>>( costs->cost_d_, dynamics->model_d_, fb_controller->feedback_d_, sampling->sampling_d_, dt, num_timesteps, num_rollouts, lambda, alpha, value_func_threshold, init_x_d, y_d, trajectory_costs); HANDLE_ERROR(cudaGetLastError()); if (synchronize || true) { HANDLE_ERROR(cudaStreamSynchronize(stream)); } HANDLE_ERROR(cudaGetLastError()); } template void launchRMPPIRolloutKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling, FB_T* __restrict__ fb_controller, float dt, const int num_timesteps, const int num_rollouts, float lambda, float alpha, float value_func_threshold, float* __restrict__ init_x_d, float* __restrict__ trajectory_costs, dim3 dimBlock, cudaStream_t 
stream, bool synchronize) { if (num_rollouts % dimBlock.x != 0) { std::cerr << __FILE__ << " (" << __LINE__ << "): num_rollouts (" << num_rollouts << ") must be evenly divided by dynamics block size x (" << dimBlock.x << ")" << std::endl; exit(EXIT_FAILURE); } // if (num_timesteps < dimCostBlock.x) // { // std::cerr << __FILE__ << " (" << __LINE__ << "): num_timesteps (" << num_timesteps // << ") must be greater than or equal to cost block size x (" << dimCostBlock.x << ")" << std::endl; // exit(EXIT_FAILURE); // } if (dimBlock.z < 2) { std::cerr << __FILE__ << " (" << __LINE__ << "): number of dynamics systems (" << dimBlock.z << ") must be greater than 2" << std::endl; exit(EXIT_FAILURE); } // Run RMPPI Rollout kernel const int gridsize_x = math::int_ceil(num_rollouts, dimBlock.x); dim3 dimGrid(gridsize_x, 1, 1); unsigned shared_size = calcRMPPICombinedKernelSharedMemSize(dynamics, costs, sampling, fb_controller, dimBlock); rolloutRMPPIKernel<<>>( dynamics->model_d_, costs->cost_d_, fb_controller->feedback_d_, sampling->sampling_d_, dt, num_timesteps, num_rollouts, lambda, alpha, value_func_threshold, init_x_d, trajectory_costs); HANDLE_ERROR(cudaGetLastError()); if (synchronize || true) { HANDLE_ERROR(cudaStreamSynchronize(stream)); } HANDLE_ERROR(cudaGetLastError()); } template unsigned calcEvalDynKernelSharedMemSize(const DYN_T* dynamics, const SAMPLER_T* sampler, dim3& dimBlock) { const int dynamics_num_shared = dimBlock.x * dimBlock.z; unsigned dynamics_shared_size = sizeof(float) * (3 * math::nearest_multiple_4(dynamics_num_shared * DYN_T::STATE_DIM) + math::nearest_multiple_4(dynamics_num_shared * DYN_T::OUTPUT_DIM) + math::nearest_multiple_4(dynamics_num_shared * DYN_T::CONTROL_DIM)) + calcClassSharedMemSize(dynamics, dimBlock) + calcClassSharedMemSize(sampler, dimBlock); #ifdef USE_CUDA_BARRIERS_DYN dynamics_shared_size += math::int_multiple_const(dynamics_num_shared * sizeof(barrier), 16); #endif return dynamics_shared_size; } template unsigned 
calcEvalCostKernelSharedMemSize(const COST_T* cost, const SAMPLER_T* sampler, const int num_rollouts, const int samples_per_condition, dim3& dimBlock) { const int cost_num_shared = dimBlock.x * dimBlock.z; unsigned cost_shared_size = sizeof(float) * (math::nearest_multiple_4(cost_num_shared * COST_T::OUTPUT_DIM) + math::nearest_multiple_4(cost_num_shared * COST_T::CONTROL_DIM) + math::nearest_multiple_4(cost_num_shared * dimBlock.y)) + sizeof(int) * math::nearest_multiple_4(cost_num_shared) + calcClassSharedMemSize(cost, dimBlock) + calcClassSharedMemSize(sampler, dimBlock) + math::nearest_multiple_4(num_rollouts / samples_per_condition) * sizeof(float); #ifdef USE_CUDA_BARRIERS_COST cost_shared_size += math::int_multiple_const(cost_num_shared * sizeof(barrier), 16); #endif return cost_shared_size; } template unsigned calcEvalCombinedKernelSharedMemSize(const DYN_T* dynamics, const COST_T* cost, const SAMPLER_T* sampler, const int num_rollouts, const int samples_per_condition, dim3& dimBlock) { const int num_shared = dimBlock.x * dimBlock.z; unsigned shared_size = sizeof(float) * (3 * math::nearest_multiple_4(num_shared * DYN_T::STATE_DIM) + math::nearest_multiple_4(num_shared * DYN_T::OUTPUT_DIM) + math::nearest_multiple_4(num_shared * DYN_T::CONTROL_DIM) + math::nearest_multiple_4(num_shared * dimBlock.y)) + sizeof(int) * math::nearest_multiple_4(num_shared) + calcClassSharedMemSize(dynamics, dimBlock) + calcClassSharedMemSize(cost, dimBlock) + calcClassSharedMemSize(sampler, dimBlock); #ifdef USE_CUDA_BARRIERS_ROLLOUT shared_size += math::int_multiple_const(num_shared * sizeof(barrier), 16); #endif return shared_size; } template unsigned calcRMPPIDynKernelSharedMemSize(const DYN_T* dynamics, const SAMPLER_T* sampler, const FB_T* fb_controller, dim3& dimBlock) { const int dynamics_num_shared = dimBlock.x * dimBlock.z; unsigned dynamics_shared_size = sizeof(float) * (3 * math::nearest_multiple_4(dynamics_num_shared * DYN_T::STATE_DIM) + 
math::nearest_multiple_4(dynamics_num_shared * DYN_T::OUTPUT_DIM) + math::nearest_multiple_4(dynamics_num_shared * DYN_T::CONTROL_DIM)) + calcClassSharedMemSize(dynamics, dimBlock) + calcClassSharedMemSize(sampler, dimBlock) + calcClassSharedMemSize(fb_controller, dimBlock); #ifdef USE_CUDA_BARRIERS_DYN dynamics_shared_size += math::int_multiple_const(dynamics_num_shared * sizeof(barrier), 16); #endif return dynamics_shared_size; } template unsigned calcRMPPICostKernelSharedMemSize(const COST_T* cost, const SAMPLER_T* sampler, const FB_T* fb_controller, dim3& dimBlock) { const int cost_num_shared = dimBlock.x * dimBlock.z; unsigned cost_shared_size = sizeof(float) * (math::nearest_multiple_4(cost_num_shared * COST_T::OUTPUT_DIM) + math::nearest_multiple_4(cost_num_shared * COST_T::CONTROL_DIM) + math::nearest_multiple_4(cost_num_shared * dimBlock.y * 2)) + sizeof(int) * math::nearest_multiple_4(cost_num_shared) + calcClassSharedMemSize(cost, dimBlock) + calcClassSharedMemSize(sampler, dimBlock) + calcClassSharedMemSize(fb_controller, dimBlock); #ifdef USE_CUDA_BARRIERS_COST cost_shared_size += math::int_multiple_const(cost_num_shared * sizeof(barrier), 16); #endif return cost_shared_size; } template unsigned calcRMPPICombinedKernelSharedMemSize(const DYN_T* dynamics, const COST_T* cost, const SAMPLER_T* sampler, const FB_T* fb_controller, dim3& dimBlock) { const int num_shared = dimBlock.x * dimBlock.z; unsigned shared_size = sizeof(float) * (3 * math::nearest_multiple_4(num_shared * DYN_T::STATE_DIM) + math::nearest_multiple_4(num_shared * DYN_T::OUTPUT_DIM) + math::nearest_multiple_4(num_shared * DYN_T::CONTROL_DIM) + math::nearest_multiple_4(num_shared * dimBlock.y * 2)) + sizeof(int) * math::nearest_multiple_4(num_shared) + calcClassSharedMemSize(dynamics, dimBlock) + calcClassSharedMemSize(sampler, dimBlock) + calcClassSharedMemSize(fb_controller, dimBlock) + calcClassSharedMemSize(cost, dimBlock); #ifdef USE_CUDA_BARRIERS_DYN shared_size += 
math::int_multiple_const(num_shared * sizeof(barrier), 16);
#endif
  return shared_size;
}

// Pairwise tree reduction of two interleaved cost arrays in shared memory.
// Sums `start_size` strided entries of running_cost and running_cost_extra down to index 0.
// For power-of-two sizes the last 64 entries are finished with warp-level reductions;
// `catch_condition` selects the single thread that folds in the odd element when a halving
// step has an odd remainder. Callers must __syncthreads() before invoking.
__device__ void multiCostArrayReduction(float* running_cost, float* running_cost_extra, const int start_size,
                                        const int index, const int step, const bool catch_condition, const int stride)
{
  int prev_size = start_size;
  const bool block_power_of_2 = (prev_size & (prev_size - 1)) == 0;
  // power-of-two sizes can stop at 32 and finish with warpReduceAdd; otherwise reduce all the way
  const int stop_condition = (block_power_of_2) ? 32 : 0;
  int size;
  int j;
  for (size = prev_size / 2; size > stop_condition; size /= 2)
  {
    for (j = index; j < size; j += step)
    {
      running_cost[j * stride] += running_cost[(j + size) * stride];
      running_cost_extra[j * stride] += running_cost_extra[(j + size) * stride];
    }
    __syncthreads();
    if (prev_size - 2 * size == 1 && catch_condition)
    {  // odd remainder: fold the dangling last element into the final slot of this round
      running_cost[(size - 1) * stride] += running_cost[(prev_size - 1) * stride];
      running_cost_extra[(size - 1) * stride] += running_cost_extra[(prev_size - 1) * stride];
    }
    __syncthreads();
    prev_size = size;
  }
  // finish the reduction inside a single warp (no __syncthreads needed between steps)
  switch (size * 2)
  {
    case 64:
      if (index < 32)
      {
        warpReduceAdd<64>(running_cost, index, stride);
        warpReduceAdd<64>(running_cost_extra, index, stride);
      }
      break;
    case 32:
      if (index < 16)
      {
        warpReduceAdd<32>(running_cost, index, stride);
        warpReduceAdd<32>(running_cost_extra, index, stride);
      }
      break;
    case 16:
      if (index < 8)
      {
        warpReduceAdd<16>(running_cost, index, stride);
        warpReduceAdd<16>(running_cost_extra, index, stride);
      }
      break;
    case 8:
      if (index < 4)
      {
        warpReduceAdd<8>(running_cost, index, stride);
        warpReduceAdd<8>(running_cost_extra, index, stride);
      }
      break;
    case 4:
      if (index < 2)
      {
        warpReduceAdd<4>(running_cost, index, stride);
        warpReduceAdd<4>(running_cost_extra, index, stride);
      }
      break;
    case 2:
      if (index < 1)
      {
        warpReduceAdd<2>(running_cost, index, stride);
        warpReduceAdd<2>(running_cost_extra, index, stride);
      }
      break;
  }
  __syncthreads();
}
}  // namespace rmppi
}  // namespace kernels
}  // namespace mppi
================================================
FILE: include/mppi/core/rmppi_kernels.cuh
================================================
/**
 * Created by Bogdan Vlahov on 3/25/2023
 **/
#pragma once

#include
#include
#include

namespace mppi
{
namespace kernels
{
namespace rmppi
{
/**
 * Kernels Methods
 * NOTE: template parameter lists and #include targets appear to have been lost in extraction.
 **/
// Dynamics pass of the split init-eval pipeline: propagates states and writes outputs to y_d.
template
__global__ void initEvalDynKernel(DYN_T* __restrict__ dynamics, SAMPLING_T* __restrict__ sampling, float dt,
                                  const int num_timesteps, const int num_rollouts, int samples_per_condition,
                                  const int* __restrict__ strides_d, const float* __restrict__ states_d,
                                  float* __restrict__ y_d);
// Cost pass of the split init-eval pipeline: reduces y_d into per-rollout trajectory costs.
template
__global__ void initEvalCostKernel(COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling, float dt,
                                   const int num_timesteps, const int num_rollouts, float lambda, float alpha,
                                   const int samples_per_condition, const int* __restrict__ strides_d,
                                   const float* __restrict__ y_d, float* __restrict__ trajectory_costs_d);
// Dynamics pass of the split RMPPI rollout: real + nominal systems, feedback applied to real.
template
__global__ void rolloutRMPPIDynamicsKernel(DYN_T* __restrict__ dynamics, FB_T* __restrict__ fb_controller,
                                           SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps,
                                           const int num_rollouts, const float* __restrict__ init_x_d,
                                           float* __restrict__ y_d);
// Cost pass of the split RMPPI rollout: likelihood-ratio / feedback cost accumulation.
template
__global__ void rolloutRMPPICostKernel(COST_T* __restrict__ costs, DYN_T* __restrict__ dynamics,
                                       FB_T* __restrict__ fb_controller, SAMPLING_T* __restrict__ sampling, float dt,
                                       const int num_timesteps, const int num_rollouts, float lambda, float alpha,
                                       float value_func_threshold, const float* __restrict__ init_x_d,
                                       const float* __restrict__ y_d, float* __restrict__ trajectory_costs_d);
// Combined init-eval kernel (dynamics + cost in one launch).
template
__global__ void initEvalKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs,
                               SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps,
                               const int num_rollouts, int samples_per_condition, float lambda, float alpha,
                               const int* __restrict__ strides_d, const float* __restrict__ states_d,
                               float* __restrict__ trajectory_costs_d);
// Combined RMPPI rollout kernel (dynamics + feedback + cost in one launch).
template
__global__ void rolloutRMPPIKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs, FB_T* __restrict__
fb_controller, SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps, const int num_rollouts,
                                   float lambda, float alpha, float value_func_threshold,
                                   const float* __restrict__ init_x_d, float* __restrict__ trajectory_costs_d);

/**
 * Launch Methods
 **/
template
void launchSplitInitEvalKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs,
                               SAMPLING_T* __restrict__ sampling, float dt, const int num_timesteps,
                               const int num_rollouts, float lambda, float alpha, int samples_per_condition,
                               int* __restrict__ strides_d, float* __restrict__ init_x_d, float* __restrict__ y_d,
                               float* __restrict__ trajectory_costs, dim3 dimDynBlock, dim3 dimCostBlock,
                               cudaStream_t stream, bool synchronize = true);
template
void launchSplitRMPPIRolloutKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs,
                                   SAMPLING_T* __restrict__ sampling, FB_T* __restrict__ fb_controller, float dt,
                                   const int num_timesteps, const int num_rollouts, float lambda, float alpha,
                                   float value_func_threshold, float* __restrict__ init_x_d, float* __restrict__ y_d,
                                   float* __restrict__ trajectory_costs, dim3 dimDynBlock, dim3 dimCostBlock,
                                   cudaStream_t stream, bool synchronize = true);
template
void launchInitEvalKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs, SAMPLING_T* __restrict__ sampling,
                          float dt, const int num_timesteps, const int num_rollouts, float lambda, float alpha,
                          int samples_per_condition, int* __restrict__ strides_d, float* __restrict__ init_x_d,
                          float* __restrict__ trajectory_costs, dim3 dimBlock, cudaStream_t stream,
                          bool synchronize = true);
template
void launchRMPPIRolloutKernel(DYN_T* __restrict__ dynamics, COST_T* __restrict__ costs,
                              SAMPLING_T* __restrict__ sampling, FB_T* __restrict__ fb_controller, float dt,
                              const int num_timesteps, const int num_rollouts, float lambda, float alpha,
                              float value_func_threshold, float* __restrict__ init_x_d,
                              float* __restrict__ trajectory_costs, dim3 dimBlock, cudaStream_t stream,
                              bool synchronize = true);

/**
 *
 Device-only Kernel Helper Methods
 **/
__device__ void multiCostArrayReduction(float* running_cost, float* running_cost_extra, const int start_size,
                                        const int index, const int step, const bool catch_condition,
                                        const int stride = 1);

/**
 * Shared Memory Calculators for various kernels
 */
template
unsigned calcEvalDynKernelSharedMemSize(const DYN_T* dynamics, const SAMPLER_T* sampler, dim3& dimBlock);
template
unsigned calcEvalCostKernelSharedMemSize(const COST_T* cost, const SAMPLER_T* sampler, const int num_rollouts,
                                         const int samples_per_condition, dim3& dimBlock);
template
unsigned calcEvalCombinedKernelSharedMemSize(const DYN_T* dynamics, const COST_T* cost, const SAMPLER_T* sampler,
                                             const int num_rollouts, const int samples_per_condition, dim3& dimBlock);
template
unsigned calcRMPPIDynKernelSharedMemSize(const DYN_T* dynamics, const SAMPLER_T* sampler, const FB_T* fb_controller,
                                         dim3& dimBlock);
template
unsigned calcRMPPICostKernelSharedMemSize(const COST_T* cost, const SAMPLER_T* sampler, const FB_T* fb_controller,
                                          dim3& dimBlock);
template
unsigned calcRMPPICombinedKernelSharedMemSize(const DYN_T* dynamics, const COST_T* cost, const SAMPLER_T* sampler,
                                              const FB_T* fb_controller, dim3& dimBlock);
}  // namespace rmppi
}  // namespace kernels
}  // namespace mppi

#if __CUDACC__
#include "rmppi_kernels.cu"
#endif

================================================
FILE: include/mppi/cost_functions/autorally/ar_robust_cost.cu
================================================
// Constructor: forwards the CUDA stream to the standard AutoRally cost base class.
template
ARRobustCostImpl::ARRobustCostImpl(cudaStream_t stream) : ARStandardCostImpl(stream)
{
}

template
ARRobustCostImpl::~ARRobustCostImpl()
{
}

// Penalizes slip angle (derived from s[4]=vx, s[5]=vy) and large roll (s[3]).
// Returns slip_coeff * slip plus a crash penalty that ramps up once slip exceeds
// 75% of max_slip_ang, and saturates to crash_coeff when |roll| >= pi/2.
template
__host__ __device__ float ARRobustCostImpl::getStabilizingCost(const float* s)
{
  float penalty_val = 0;
  float slip;
  if (fabs(s[4]) < 0.001)
  {  // avoid dividing by a near-zero longitudinal velocity
    slip = 0;
  }
  else
  {
    slip = fabs(-atan(s[5] / fabs(s[4])));
  }
  if (slip >= 0.75 * this->params_.max_slip_ang)
  {
    float slip_val = fminf(1.0, slip / this->params_.max_slip_ang);
    float alpha = (slip_val - 0.75)
/ (1.0 - 0.75);  // ramp from 0 to 1 over the [0.75, 1.0] slip band
    penalty_val = alpha * this->params_.crash_coeff;
  }
  // crash if roll is too large
  if (fabs(s[3]) >= M_PI_2)
  {
    penalty_val = this->params_.crash_coeff;
  }
  return this->params_.slip_coeff * slip + penalty_val;
}

// Samples the track costmap at the front and back of the car and combines boundary,
// track-position, speed, and heading penalties. On device the costmap is read through a
// texture; on host it is read from track_costs_ with manual clamping that mirrors the
// texture addressing.
template
__host__ __device__ float ARRobustCostImpl::getCostmapCost(const float* s)
{
  float cost = 0;
  // Compute a transformation to get the (x,y) positions of the front and back of the car.
#ifdef __CUDA_ARCH__
  float x_front = s[0] + this->FRONT_D * __cosf(s[2]);
  float y_front = s[1] + this->FRONT_D * __sinf(s[2]);
  float x_back = s[0] + this->BACK_D * __cosf(s[2]);
  float y_back = s[1] + this->BACK_D * __sinf(s[2]);
#else
  float x_front = s[0] + this->FRONT_D * cosf(s[2]);
  float y_front = s[1] + this->FRONT_D * sinf(s[2]);
  float x_back = s[0] + this->BACK_D * cosf(s[2]);
  float y_back = s[1] + this->BACK_D * sinf(s[2]);
#endif
  float u, v, w;  // Transformed coordinates
  // parameters for the front and back of car
  this->coorTransform(x_front, y_front, &u, &v, &w);
#ifdef __CUDA_ARCH__
  float4 track_params_front = tex2D(this->costmap_tex_d_, u / w, v / w);
#else
  // host-side emulation of the texture fetch: scale to pixels, offset by half a texel, clamp
  float2 query = make_float2(u / w * this->width_, v / w * this->height_);
  query.x = query.x - 0.5f;
  query.y = query.y - 0.5f;
  query.x = fmaxf(0.0f, fminf(this->width_ - 1, query.x));
  query.y = fmaxf(0.0f, fminf(this->height_ - 1, query.y));
  float4 track_params_front = this->track_costs_[std::round(query.y) * this->width_ + std::round(query.x)];
#endif
  // Calculate back texture query
  this->coorTransform(x_back, y_back, &u, &v, &w);
#ifdef __CUDA_ARCH__
  float4 track_params_back = tex2D(this->costmap_tex_d_, u / w, v / w);
#else
  query = make_float2(u / w * this->width_, v / w * this->height_);
  query.x = query.x - 0.5f;
  query.y = query.y - 0.5f;
  query.x = fmaxf(0.0f, fminf(this->width_ - 1, query.x));
  query.y = fmaxf(0.0f, fminf(this->height_ - 1, query.y));
  float4 track_params_back = this->track_costs_[std::round(query.y) * this->width_ + std::round(query.x)];
#endif
  // printf("thread (%d %d %d) front val (%f, %f) %f back_val (%f, %f) %f\n", threadIdx.x, threadIdx.y, threadIdx.z,
  //        x_front, y_front, track_params_front.x, x_back, y_back, track_params_back.x);
  // Calculate the constraint penalty: worst of the two track values, ramping up past the
  // boundary threshold
  float constraint_val = fminf(1.0, fmaxf(track_params_front.x, track_params_back.x));
  if (constraint_val >= this->params_.boundary_threshold)
  {
    float alpha = (constraint_val - this->params_.boundary_threshold) / (1.0 - this->params_.boundary_threshold);
    cost += alpha * this->params_.crash_coeff;
  }
  // Calculate the track positioning cost
  if (track_params_front.y > this->params_.track_slop)
  {
    cost += this->params_.track_coeff * track_params_front.y;
  }
  // Calculate the speed cost: track desired speed from the costmap when desired_speed == -1
  if (this->params_.desired_speed == -1)
  {
    cost += this->params_.speed_coeff * fabs(s[4] - track_params_front.z);
  }
  else
  {
    cost += this->params_.speed_coeff * fabs(s[4] - this->params_.desired_speed);
  }
  // Calculate the heading cost
  cost += this->params_.heading_coeff * fabs(sinf(s[2]) + track_params_front.w);
  return cost;
}

// Total state cost = stabilizing (slip/roll) cost + costmap cost, clamped to MAX_COST_VALUE
// (also used when the sum is NaN).
template
inline __host__ __device__ float ARRobustCostImpl::computeStateCost(const float* s, int timestep, float* theta_c,
                                                                    int* crash_status)
{
  float stabilizing_cost = getStabilizingCost(s);
  float costmap_cost = getCostmapCost(s);
  float cost = stabilizing_cost + costmap_cost;
  if (cost > this->MAX_COST_VALUE || isnan(cost))
  {  // TODO Handle max cost value in a generic way
    cost = this->MAX_COST_VALUE;
  }
  return cost;
}

// Eigen host-side overload; forwards to the raw-pointer implementation.
template
inline float ARRobustCostImpl::computeStateCost(const Eigen::Ref y, int timestep, int* crash_status)
{
  return computeStateCost(y.data(), timestep, nullptr, crash_status);
}

================================================
FILE: include/mppi/cost_functions/autorally/ar_robust_cost.cuh
================================================
#ifndef AR_ROBUST_COST_CUH_
#define AR_ROBUST_COST_CUH_

#include

// Parameters for the robust AutoRally cost; inherits the standard-cost fields and overrides
// their defaults in the constructor below.
struct ARRobustCostParams : public ARStandardCostParams
{
  // Miscellaneous
  float heading_coeff = 0.0;
  ARRobustCostParams()
  {
    control_cost_coeff[0] = 0.0;  //
steering_coeff control_cost_coeff[1] = 0.0; // throttle_coeff // values, if negative ignored desired_speed = -1; max_slip_ang = 1.5; // Cost term coefficients track_coeff = 33.0; slip_coeff = 0.0; speed_coeff = 20.0; // Constraint penalty thresholds crash_coeff = 125000; boundary_threshold = 0.75; // Miscellaneous track_slop = 0; } }; template class ARRobustCostImpl : public ARStandardCostImpl { public: using PARENT_CLASS = ARStandardCostImpl; using output_array = typename PARENT_CLASS::output_array; ARRobustCostImpl(cudaStream_t stream = 0); // : ARStandardCost(steam); ~ARRobustCostImpl(); std::string getCostFunctionName() { return "AutoRally robust cost function"; } __host__ __device__ float getStabilizingCost(const float* s); __host__ __device__ float getCostmapCost(const float* s); __host__ __device__ float computeStateCost(const float* s, int timestep, float* theta_c, int* crash_status); float computeStateCost(const Eigen::Ref y, int timestep, int* crash_status); using PARENT_CLASS::terminalCost; private: }; #if __CUDACC__ #include "ar_robust_cost.cu" #endif class ARRobustCost : public ARRobustCostImpl { public: ARRobustCost(cudaStream_t stream = 0) : ARRobustCostImpl(stream) { } }; #endif // AR_ROBUST_COST_CUH_ ================================================ FILE: include/mppi/cost_functions/autorally/ar_standard_cost.cu ================================================ #include template ARStandardCostImpl::ARStandardCostImpl(cudaStream_t stream) { this->bindToStream(stream); } template void ARStandardCostImpl::freeCudaMem() { if (this->GPUMemStatus_) { HANDLE_ERROR(cudaFreeArray(costmapArray_d_)); costmapArray_d_ = nullptr; } PARENT_CLASS::freeCudaMem(); } template void ARStandardCostImpl::paramsToDevice() { HANDLE_ERROR(cudaMemcpyAsync(&this->cost_d_->params_, &this->params_, sizeof(PARAMS_T), cudaMemcpyHostToDevice, this->stream_)); HANDLE_ERROR(cudaMemcpyAsync(&this->cost_d_->width_, &width_, sizeof(float), cudaMemcpyHostToDevice, this->stream_)); 
HANDLE_ERROR( cudaMemcpyAsync(&this->cost_d_->height_, &height_, sizeof(float), cudaMemcpyHostToDevice, this->stream_)); HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); } template bool ARStandardCostImpl::changeCostmapSize(int width, int height) { // TODO set flag at top that indicates memory allocation changes if (height < 0 && width < 0) { std::cerr << "ERROR: cannot resize costmap to size less than 1" << std::endl; return false; } if (height != height_ || width != width_ || costmapArray_d_ == nullptr) { track_costs_.resize(width * height); // Allocate memory for the cuda array which is bound the costmap_tex_ // has been allocated in the past, must be freed if (height_ > 0 && width_ > 0) { HANDLE_ERROR(cudaFreeArray(costmapArray_d_)); } // 4 floats of size 32 bits channelDesc_ = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat); HANDLE_ERROR(cudaMallocArray(&costmapArray_d_, &channelDesc_, width, height)); // set all of the elements in the array to be zero std::vector zero_array(width_ * height_); zero_array.resize(width * height, make_float4(0, 0, 0, 0)); HANDLE_ERROR(cudaMemcpyToArray(costmapArray_d_, 0, 0, zero_array.data(), width * height * sizeof(float4), cudaMemcpyHostToDevice)); } width_ = width; height_ = height; return true; } template void ARStandardCostImpl::clearCostmapCPU(int width, int height) { changeCostmapSize(width, height); if (width_ < 0 && height_ < 0) { return; } for (int i = 0; i < width_ * height_; i++) { track_costs_[i].x = 0; track_costs_[i].y = 0; track_costs_[i].z = 0; track_costs_[i].w = 0; } } template std::vector ARStandardCostImpl::loadTrackData(std::string map_path) { // check if file exists if (!fileExists(map_path)) { std::cerr << "ERROR: map path invalid, " << map_path << std::endl; return std::vector(); } // load the npz file cnpy::npz_t map_dict = cnpy::npz_load(map_path); float x_min, x_max, y_min, y_max, ppm; float* xBounds = map_dict["xBounds"].data(); float* yBounds = map_dict["yBounds"].data(); float* 
pixelsPerMeter = map_dict["pixelsPerMeter"].data(); x_min = xBounds[0]; x_max = xBounds[1]; y_min = yBounds[0]; y_max = yBounds[1]; ppm = pixelsPerMeter[0]; int width = int((x_max - x_min) * ppm); int height = int((y_max - y_min) * ppm); if (!changeCostmapSize(width, height)) { std::cerr << "ERROR: load track has invalid sizes" << std::endl; return std::vector(); } float* channel0 = map_dict["channel0"].data(); float* channel1 = map_dict["channel1"].data(); float* channel2 = map_dict["channel2"].data(); float* channel3 = map_dict["channel3"].data(); // copy the track data into CPU side storage for (int i = 0; i < width_ * height_; i++) { // std::cout << i << " = " << channel0[i] << ", " << channel1[i] << ", " << channel2[i] << ", " << channel3[i] << // std::endl; track_costs_[i].x = channel0[i]; track_costs_[i].y = channel1[i]; track_costs_[i].z = channel2[i]; track_costs_[i].w = channel3[i]; } Eigen::Matrix3f R; Eigen::Array3f trs; // Save the scaling and offset R << 1. / (x_max - x_min), 0, 0, 0, 1. 
/ (y_max - y_min), 0, 0, 0, 1;
  trs << -x_min / (x_max - x_min), -y_min / (y_max - y_min), 1;
  updateTransform(R, trs);

  costmapToTexture();

  return track_costs_;
}

/**
 * Copies the CPU costmap into the CUDA array and (re)creates the texture
 * object over it (normalized coordinates, clamped addressing, point
 * filtering), then pushes the array/texture handles into the device-side
 * object.
 */
template
void ARStandardCostImpl::costmapToTexture()
{
  if (width_ < 0 || height_ < 0)
  {
    std::cerr << "ERROR: cannot allocate texture with zero size" << std::endl;
    return;
  }

  // transfer CPU version of costmap to GPU
  float4* costmap_ptr = track_costs_.data();
  HANDLE_ERROR(
      cudaMemcpyToArray(costmapArray_d_, 0, 0, costmap_ptr, width_ * height_ * sizeof(float4), cudaMemcpyHostToDevice));
  cudaStreamSynchronize(this->stream_);

  // Specify texture struct
  cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeArray;
  resDesc.res.array.array = costmapArray_d_;

  // Specify texture object parameters
  struct cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.addressMode[0] = cudaAddressModeClamp;
  texDesc.addressMode[1] = cudaAddressModeClamp;
  texDesc.filterMode = cudaFilterModePoint;
  texDesc.readMode = cudaReadModeElementType;
  texDesc.normalizedCoords = 1;

  // Destroy current texture and create new texture object
  HANDLE_ERROR(cudaDestroyTextureObject(costmap_tex_d_));
  HANDLE_ERROR(cudaCreateTextureObject(&costmap_tex_d_, &resDesc, &texDesc, NULL));

  // copy over pointers setup up on CPU code to GPU
  HANDLE_ERROR(cudaMemcpyAsync(&this->cost_d_->costmapArray_d_, &costmapArray_d_, sizeof(cudaArray*),
                               cudaMemcpyHostToDevice, this->stream_));
  HANDLE_ERROR(cudaMemcpyAsync(&this->cost_d_->costmap_tex_d_, &costmap_tex_d_, sizeof(cudaTextureObject_t),
                               cudaMemcpyHostToDevice, this->stream_));
  cudaStreamSynchronize(this->stream_);
}

// Raw texture fetch in (already transformed) normalized coordinates.
template
inline __device__ float4 ARStandardCostImpl::queryTexture(float x, float y) const
{
  // printf("\nquerying point (%f, %f)", x, y);
  return tex2D(costmap_tex_d_, x, y);
}

// Installs a new world->costmap projective transform (first two columns of m
// plus the translation trs) and mirrors it to the GPU when one exists.
template
void ARStandardCostImpl::updateTransform(Eigen::MatrixXf m, Eigen::ArrayXf trs)
{
  this->params_.r_c1.x = m(0, 0);
  this->params_.r_c1.y = m(1, 0);
  this->params_.r_c1.z = m(2, 0);
  this->params_.r_c2.x = m(0, 1);
  this->params_.r_c2.y = m(1, 1);
  this->params_.r_c2.z = m(2, 1);
  this->params_.trs.x = trs(0);
  this->params_.trs.y = trs(1);
  this->params_.trs.z = trs(2);

  // Move the updated parameters to gpu memory
  if (this->GPUMemStatus_)
  {
    paramsToDevice();
  }
}

/**
 * Projective transform of world (x, y) into homogeneous texture coordinates
 * (u, v, w); callers divide by w to get the normalized [0-1] position.
 */
template
__host__ __device__ void ARStandardCostImpl::coorTransform(float x, float y, float* u, float* v, float* w)
{
  ////Compute a projective transform of (x, y, 0, 1)
  // printf("coordiante transform %f, %f, %f\n", params_.r_c1.x, params_.r_c2.x, params_.trs.x);
  // converts to the texture [0-1] coordinate system
  u[0] = this->params_.r_c1.x * x + this->params_.r_c2.x * y + this->params_.trs.x;
  v[0] = this->params_.r_c1.y * x + this->params_.r_c2.y * y + this->params_.trs.y;
  w[0] = this->params_.r_c1.z * x + this->params_.r_c2.z * y + this->params_.trs.z;
}

// Transforms world coordinates and samples the costmap; the CPU branch
// emulates the clamped nearest-neighbour texture fetch on track_costs_.
template
__host__ __device__ float4 ARStandardCostImpl::queryTextureTransformed(float x, float y)
{
  float u, v, w;
  coorTransform(x, y, &u, &v, &w);
  // printf("input coordinates: %f, %f\n", x, y);
  // printf("\nu = %f, v = %f, w = %f", u, v, w);
  // printf("transformed coordinates %f, %f = %f\n", u/w, v/w, tex2D(costmap_tex_d_, u/w, v/w).x);
#ifdef __CUDA_ARCH__
  return tex2D(costmap_tex_d_, u / w, v / w);
#else
  float2 query = make_float2(u / w * width_, v / w * height_);
  query.x = query.x - 0.5f;
  query.y = query.y - 0.5f;
  query.x = fmaxf(0.0f, fminf(width_ - 1, query.x));
  query.y = fmaxf(0.0f, fminf(height_ - 1, query.y));
  return this->track_costs_[std::round(query.y) * width_ + std::round(query.x)];
#endif
}

// Returns the rotation/scaling part of the transform as a 3x3 matrix.
template
Eigen::Matrix3f ARStandardCostImpl::getRotation()
{
  Eigen::Matrix3f m;
  m(0, 0) = this->params_.r_c1.x;
  m(1, 0) = this->params_.r_c1.y;
  m(2, 0) = this->params_.r_c1.z;
  m(0, 1) = this->params_.r_c2.x;
  m(1, 1) = this->params_.r_c2.y;
  m(2, 1) = this->params_.r_c2.z;
  m(0, 2) = 0.0;
  m(1, 2) = 0.0;
  m(2, 2) = 1.0;
  return m;
}

// Returns the translation part of the transform.
template
Eigen::Array3f ARStandardCostImpl::getTranslation()
{
  Eigen::Array3f array;
  array(0) = this->params_.trs.x;
  array(1) = this->params_.trs.y;
  array(2) = this->params_.trs.z;
  return array;
}

// Terminal cost is zero for this cost function (GPU version).
template
inline __device__ float ARStandardCostImpl::terminalCost(float* s, float* theta_c)
{
  return 0.0;
}

// Terminal cost is zero for this cost function (CPU version).
template
float ARStandardCostImpl::terminalCost(const Eigen::Ref y)
{
  return 0.0;
}

// Penalty on deviation from desired_speed; L1 or L2 depending on l1_cost_.
template
inline __host__ __device__ float ARStandardCostImpl::getSpeedCost(float* s, int* crash)
{
  float cost = 0;
  float error = s[4] - this->params_.desired_speed;
  if (l1_cost_)
  {
    cost = fabs(error);
  }
  else
  {
    cost = error * error;
  }
  return (this->params_.speed_coeff * cost);
}

// Quadratic slip-angle penalty; adds crash_coeff past max_slip_ang and sets
// the crash flag on excessive roll (s[3]).
template
inline __host__ __device__ float ARStandardCostImpl::getStabilizingCost(float* s, int* crash_status)
{
  float stabilizing_cost = 0;
  if (fabs(s[4]) > 0.001)
  {
    float slip = -atan(s[5] / fabs(s[4]));
    stabilizing_cost = this->params_.slip_coeff * powf(slip, 2);
    if (fabs(-atan(s[5] / fabs(s[4]))) > this->params_.max_slip_ang)
    {
      // If the slip angle is above the max slip angle kill the trajectory.
      stabilizing_cost += this->params_.crash_coeff;
    }
  }
  // if we roll over kill the trajectory
  if (fabs(s[3]) > M_PI_2)
  {
    crash_status[0] = 1;
  }
  // printf("stabilizing %f\n", stabilizing_cost);
  return stabilizing_cost;
}

// Flat crash penalty once the crash flag has been set.
template
inline __host__ __device__ float ARStandardCostImpl::getCrashCost(float* s, int* crash, int num_timestep)
{
  float crash_cost = 0;
  if (crash[0] > 0)
  {
    crash_cost = this->params_.crash_coeff;
  }
  // printf("crash_cost %f\n", crash_cost);
  return crash_cost;
}

// Averaged |costmap| value at the front/back of the car; sets the crash flag
// when either end crosses boundary_threshold.
template
inline __host__ __device__ float ARStandardCostImpl::getTrackCost(float* s, int* crash)
{
  float track_cost = 0;

  // Compute a transformation to get the (x,y) positions of the front and back of the car.
#ifdef __CUDA_ARCH__
  float x_front = s[0] + this->FRONT_D * __cosf(s[2]);
  float y_front = s[1] + this->FRONT_D * __sinf(s[2]);
  float x_back = s[0] + this->BACK_D * __cosf(s[2]);
  float y_back = s[1] + this->BACK_D * __sinf(s[2]);
#else
  float x_front = s[0] + this->FRONT_D * cosf(s[2]);
  float y_front = s[1] + this->FRONT_D * sinf(s[2]);
  float x_back = s[0] + this->BACK_D * cosf(s[2]);
  float y_back = s[1] + this->BACK_D * sinf(s[2]);
#endif

  // Cost of front of the car
  // printf("front before %f, %f\n", x_front, y_front);
  float track_cost_front = queryTextureTransformed(x_front, y_front).x;
  // printf("front after %f, %f = %f\n", x_front, y_front, track_cost_front);
  // Cost for back of the car
  // printf("back before %f, %f\n", x_back, y_back);
  float track_cost_back = queryTextureTransformed(x_back, y_back).x;
  // printf("back after %f, %f = %f\n", x_back, y_back, track_cost_back);

  track_cost = (fabs(track_cost_front) + fabs(track_cost_back)) / 2.0;
  if (fabs(track_cost) < this->params_.track_slop)
  {  // deadband: small costmap values count as on-track
    track_cost = 0;
  }
  else
  {
    track_cost = this->params_.track_coeff * track_cost;
  }
  if (track_cost_front >= this->params_.boundary_threshold || track_cost_back >= this->params_.boundary_threshold)
  {
    crash[0] = 1;
  }
  // printf("track_cost %f\n", track_cost);
  return track_cost;
}

// Sums the track, speed, stabilizing, and (discounted) crash terms,
// saturating at MAX_COST_VALUE (also catches NaN). theta_c is unused.
template
inline __device__ float ARStandardCostImpl::computeStateCost(float* s, int timestep, float* theta_c,
                                                             int* crash_status)
{
  // printf("input state %f %f %f %f %f %f %f\n", s[0], s[1], s[2], s[3], s[4], s[5], s[6]);
  /* int global_idx = blockDim.x * blockIdx.x + threadIdx.x;
  if(global_idx == 0) {
    printf("desired_speed %f\n", this->params_.desired_speed);
    printf("speed_coeff %f\n", this->params_.speed_coeff);
    printf("track_coeff %f\n", this->params_.track_coeff);
    printf("max_slip_angle %f\n", this->params_.max_slip_ang);
    printf("slip_coeff %f\n", this->params_.slip_coeff);
    printf("track_slop %f\n", this->params_.track_slop);
    printf("crash_coeff %f\n", this->params_.crash_coeff);
    printf("discount %f\n", this->params_.discount);
    printf("boundary_threshold %f\n", this->params_.boundary_threshold);
    printf("grid_res %d\n", this->params_.grid_res);
    printf("control_cost_coeff[0] %f\n", this->params_.control_cost_coeff[0]);
    printf("control_cost_coeff[1] %f\n", this->params_.control_cost_coeff[1]);
  }*/
  float track_cost = getTrackCost(s, crash_status);
  float speed_cost = getSpeedCost(s, crash_status);
  // printf("speed %f\n", speed_cost);
  float stabilizing_cost = getStabilizingCost(s, crash_status);
  float crash_cost = powf(this->params_.discount, timestep) * getCrashCost(s, crash_status, timestep);
  float cost = speed_cost + crash_cost + track_cost + stabilizing_cost;
  if (cost > MAX_COST_VALUE || isnan(cost))
  {
    // TODO Handle max cost value in a generic way
    cost = MAX_COST_VALUE;
  }
  return cost;
}


================================================
FILE: include/mppi/cost_functions/autorally/ar_standard_cost.cuh
================================================
#pragma once

#ifndef AR_STANDARD_COST_CUH_
#define AR_STANDARD_COST_CUH_

// NOTE(review): the #include targets below (and several template argument
// lists in this header) appear to have been stripped by the text extraction.
#include
#include
#include
#include
#include
#include
#include

// Tunable parameters for the standard AutoRally costmap-based cost.
struct ARStandardCostParams : public CostParams<2>
{
  float desired_speed = 6.0;
  float speed_coeff = 4.25;
  float track_coeff = 200.0;
  float max_slip_ang = 1.25;
  float slip_coeff = 10.0;
  float track_slop = 0;
  float crash_coeff = 10000;
  float boundary_threshold = 0.65;
  // TODO remove from struct
  int grid_res = 10;
  /*
   * Prospective transform matrix
   * r_c1.x, r_c2.x, trs.x
   * r_c1.y, r_c2.y, trs.y
   * r_c1.z, r_c2.z, trs.z
   */
  float3 r_c1;  // R matrix col 1
  float3 r_c2;  // R matrix col 2
  float3 trs;   // translation vector
  ARStandardCostParams()
  {
    control_cost_coeff[0] = 0.0;  // steering_coeff
    control_cost_coeff[1] = 0.0;  // throttle_coeff
  }
};

template
class ARStandardCostImpl : public Cost
{
public:
  // EIGEN_MAKE_ALIGNED_OPERATOR_NEW
  using PARENT_CLASS = Cost;
  using output_array = typename PARENT_CLASS::output_array;

  static constexpr float MAX_COST_VALUE = 1e16;

  /**
   * Constructor
   * @param width
   *
@param height */ ARStandardCostImpl(cudaStream_t stream = 0); std::string getCostFunctionName() const override { return "AutoRally standard cost function"; } /** * Deallocates the allocated cuda memory for an object */ void freeCudaMem(); inline __host__ __device__ int getHeight() const { return height_; } inline __host__ __device__ int getWidth() const { return width_; } inline std::vector getTrackCostCPU() const { return track_costs_; } inline Eigen::Matrix3f getRotation(); inline Eigen::Array3f getTranslation(); // inline __host__ __device__ cudaArray* getCudaArray() {return costmapArray_d_;} // inline __host__ __device__ cudaArray_t* getCudaArrayPointer() {return &costmapArray_d_;} // inline __host__ __device__ cudaTextureObject_t* getCostmapTex(){return &costmap_tex_d_;}; /** * Copies the parameters to the GPU object */ void paramsToDevice(); /** * alters the costmap size in CPU storage and GPU texture * @param width * @param height * @return */ bool changeCostmapSize(int width, int height); /** * @brief Initializes the costmap to all zeros. * * Initializes a float4 vector to the correct width and height and sets every value to zero on the CPU. * default leaves the sizes alone */ void clearCostmapCPU(int width = -1, int height = -1); /** * @brief Binds the member variable costmap to a CUDA texture. */ void costmapToTexture(); __device__ float4 queryTexture(float x, float y) const; /** * @brief Loads track data from a file. * @param C-string representing the path to the costmap data file. * @param h Matrix representing a transform from world to (offset) costmap coordinates. * @param trs Array representing the offset. */ std::vector loadTrackData(std::string map_path); /** * @brief Updates the current costmap coordinate transform. * @param h Matrix representing a transform from world to (offset) costmap coordinates. * @param trs Array representing the offset. 
*/ void updateTransform(Eigen::MatrixXf m, Eigen::ArrayXf trs); /** * @brief Compute a coordinate transform going from world to costmap coordinates. */ __host__ __device__ void coorTransform(float x, float y, float* u, float* v, float* w); /** * Queries the texture using coorTransform beforehand */ __host__ __device__ float4 queryTextureTransformed(float x, float y); /** *@brief Initializes the debug window for a default 20x20 meter window. */ // void debugDisplayInit(); /** * @brief Initialize and allocate memory for debug window display */ // void debugDisplayInit(int width_m, int height_m, int ppm); bool getDebugDisplayEnabled() { return false; } /** * @brief Display the debug view centered around x and y. * @param x float representing the current x-coordinate * @param y float representing the current y-coordinate */ // cv::Mat getDebugDisplay(float x, float y, float heading); /** * * @param description * @param data */ // void updateCostmap(std::vector description, std::vector data); /** * * @param description * @param data */ // void updateObstacles(std::vector description, std::vector data); /** * @brief Returns whether or not the vehicle has crashed or not */ //__host__ __device__ void getCrash(float* state, int* crash); /** * @brief Compute the cost for achieving a desired speed */ __host__ __device__ float getSpeedCost(float* s, int* crash); /** * @brief Compute a penalty term for crashing */ __host__ __device__ float getCrashCost(float* s, int* crash, int num_timestep); /** * @brief Compute some cost terms that help stabilize the car. */ __host__ __device__ float getStabilizingCost(float* s, int* crash); /** * @brief Compute the current track cost based on the costmap. * Requires using CUDA texture memory so can only be run on the GPU */ __host__ __device__ float getTrackCost(float* s, int* crash); /** * @brief Compute all of the individual cost terms and adds them together. 
*/ inline __device__ float computeStateCost(float* s, int timestep, float* theta_c, int* crash_status); /** * @brief Computes the terminal cost from a state */ __device__ float terminalCost(float* s, float* theta_c); float terminalCost(const Eigen::Ref y); // Constant variables const float FRONT_D = 0.5; ///< Distance from GPS receiver to front of car. const float BACK_D = -0.5; ///< Distance from GPS receiver to back of car. protected: bool l1_cost_ = false; // Whether to use L1 speed cost (if false it is L2) // Primary variables int width_ = -1; ///< width of costmap int height_ = -1; ///< height of costmap. cudaArray* costmapArray_d_; ///< Cuda array for texture binding. cudaChannelFormatDesc channelDesc_; ///< Cuda texture channel description. cudaTextureObject_t costmap_tex_d_; ///< Cuda texture object. // TODO what does this look like on GPU side std::vector track_costs_; // Debugging variables float* debug_data_; ///< Host array for holding debug info. // float* debug_data_d_; ///< Device array for holding debug info. int debug_img_width_; /// Width (in meters) of area imaged by debug view. int debug_img_height_; ///< Height (in meters) of area imaged by debug view. int debug_img_ppm_; ///< Pixels per meter for resolution of debug view. int debug_img_size_; ///< Number of pixels in the debug image. 
bool debugging_; ///< Indicator for if we're in debugging mode }; #if __CUDACC__ #include "ar_standard_cost.cu" #endif class ARStandardCost : public ARStandardCostImpl { public: ARStandardCost(cudaStream_t stream = 0) : ARStandardCostImpl(stream){}; }; #endif // AR_STANDARD_COST_CUH_ ================================================ FILE: include/mppi/cost_functions/cartpole/cartpole_quadratic_cost.cu ================================================ #include CartpoleQuadraticCost::CartpoleQuadraticCost(cudaStream_t stream) { bindToStream(stream); } float CartpoleQuadraticCost::computeStateCost(const Eigen::Ref s, int timestep, int* crash_status) { return (s[0] - params_.desired_terminal_state[0]) * (s[0] - params_.desired_terminal_state[0]) * params_.cart_position_coeff + (s[1] - params_.desired_terminal_state[1]) * (s[1] - params_.desired_terminal_state[1]) * params_.cart_velocity_coeff + (s[2] - params_.desired_terminal_state[2]) * (s[2] - params_.desired_terminal_state[2]) * params_.pole_angle_coeff + (s[3] - params_.desired_terminal_state[3]) * (s[3] - params_.desired_terminal_state[3]) * params_.pole_angular_velocity_coeff; } __device__ float CartpoleQuadraticCost::computeStateCost(float* state, int timestep, float* theta_c, int* crash_status) { return (state[0] - params_.desired_terminal_state[0]) * (state[0] - params_.desired_terminal_state[0]) * params_.cart_position_coeff + (state[1] - params_.desired_terminal_state[1]) * (state[1] - params_.desired_terminal_state[1]) * params_.cart_velocity_coeff + (state[2] - params_.desired_terminal_state[2]) * (state[2] - params_.desired_terminal_state[2]) * params_.pole_angle_coeff + (state[3] - params_.desired_terminal_state[3]) * (state[3] - params_.desired_terminal_state[3]) * params_.pole_angular_velocity_coeff; } __device__ float CartpoleQuadraticCost::terminalCost(float* state, float* theta_c) { return ((state[0] - params_.desired_terminal_state[0]) * (state[0] - params_.desired_terminal_state[0]) * 
params_.cart_position_coeff + (state[1] - params_.desired_terminal_state[1]) * (state[1] - params_.desired_terminal_state[1]) * params_.cart_velocity_coeff + (state[2] - params_.desired_terminal_state[2]) * (state[2] - params_.desired_terminal_state[2]) * params_.pole_angle_coeff + (state[3] - params_.desired_terminal_state[3]) * (state[3] - params_.desired_terminal_state[3]) * params_.pole_angular_velocity_coeff) * params_.terminal_cost_coeff; } float CartpoleQuadraticCost::terminalCost(const Eigen::Ref state) { return ((state[0] - params_.desired_terminal_state[0]) * (state[0] - params_.desired_terminal_state[0]) * params_.cart_position_coeff + (state[1] - params_.desired_terminal_state[1]) * (state[1] - params_.desired_terminal_state[1]) * params_.cart_velocity_coeff + (state[2] - params_.desired_terminal_state[2]) * (state[2] - params_.desired_terminal_state[2]) * params_.pole_angle_coeff + (state[3] - params_.desired_terminal_state[3]) * (state[3] - params_.desired_terminal_state[3]) * params_.pole_angular_velocity_coeff) * params_.terminal_cost_coeff; } ================================================ FILE: include/mppi/cost_functions/cartpole/cartpole_quadratic_cost.cuh ================================================ #pragma once #ifndef CARTPOLE_QUADRATIC_COST_CUH_ #define CARTPOLE_QUADRATIC_COST_CUH_ #include #include #include struct CartpoleQuadraticCostParams : public CostParams<1> { float cart_position_coeff = 1000; float cart_velocity_coeff = 100; float pole_angle_coeff = 2000; float pole_angular_velocity_coeff = 100; float terminal_cost_coeff = 0; float desired_terminal_state[4] = { 0, 0, M_PI, 0 }; CartpoleQuadraticCostParams() { this->control_cost_coeff[0] = 10.0; } }; class CartpoleQuadraticCost : public Cost { public: /** * Constructor * @param width * @param height */ CartpoleQuadraticCost(cudaStream_t stream = 0); /** * @brief Compute the state cost */ __device__ float computeStateCost(float* s, int timestep = 0, float* theta_c = nullptr, int* 
crash_status = nullptr); /** * @brief Compute the state cost on the CPU */ float computeStateCost(const Eigen::Ref s, int timestep = 0, int* crash_status = nullptr); /** * @brief Compute the terminal cost of the system */ __device__ float terminalCost(float* s, float* theta_c); float terminalCost(const Eigen::Ref s); protected: }; #if __CUDACC__ #include "cartpole_quadratic_cost.cu" #endif #endif // CARTPOLE_QUADRATIC_COST_CUH_// Include the cart pole cost. ================================================ FILE: include/mppi/cost_functions/cost.cu ================================================ #include template void Cost::paramsToDevice() { if (GPUMemStatus_) { HANDLE_ERROR(cudaMemcpyAsync(&cost_d_->params_, ¶ms_, sizeof(PARAMS_T), cudaMemcpyHostToDevice, stream_)); HANDLE_ERROR(cudaStreamSynchronize(stream_)); } } template void Cost::freeCudaMem() { if (GPUMemStatus_) { cudaFree(cost_d_); GPUMemStatus_ = false; cost_d_ = nullptr; } } template void Cost::GPUSetup() { CLASS_T* derived = static_cast(this); if (!GPUMemStatus_) { cost_d_ = Managed::GPUSetup(derived); } else { this->logger_->debug("%s: GPU Memory already set.\n", derived->getCostFunctionName().c_str()); } derived->paramsToDevice(); } template __device__ float Cost::computeRunningCost(float* y, float* u, int timestep, float* theta_c, int* crash) { if (threadIdx.y == 0) { CLASS_T* derived = static_cast(this); return derived->computeStateCost(y, timestep, theta_c, crash) + derived->computeControlCost(u, timestep, theta_c, crash); } else { return 0.0f; } } ================================================ FILE: include/mppi/cost_functions/cost.cuh ================================================ #pragma once /* Header file for costs */ #ifndef COSTS_CUH_ #define COSTS_CUH_ #include #include #include #include #include #include template struct CostParams { static const int CONTROL_DIM = C_DIM; float control_cost_coeff[C_DIM]; float discount = 1.0f; CostParams() { // Default set all controls to 1 for (int i = 
0; i < C_DIM; ++i) { control_cost_coeff[i] = 1.0f; } } }; // https://cboard.cprogramming.com/cplusplus-programming/122412-crtp-how-pass-type.html template class Cost : public Managed { public: // EIGEN_MAKE_ALIGNED_OPERATOR_NEW /** * typedefs for access to templated class from outside classes */ using ControlIndex = typename DYN_PARAMS_T::ControlIndex; using OutputIndex = typename DYN_PARAMS_T::OutputIndex; using TEMPLATED_DYN_PARAMS = DYN_PARAMS_T; static const int CONTROL_DIM = E_INDEX(ControlIndex, NUM_CONTROLS); static const int OUTPUT_DIM = E_INDEX(OutputIndex, NUM_OUTPUTS); // TODO typedef CLASS_T COST_T; typedef PARAMS_T COST_PARAMS_T; typedef Eigen::Matrix control_array; // Control at a time t typedef Eigen::Matrix control_matrix; // Control at a time t typedef Eigen::Matrix output_array; // Output at a time t Cost() = default; /** * Destructor must be virtual so that children are properly * destroyed when called from a basePlant reference */ virtual ~Cost() { freeCudaMem(); } virtual std::string getCostFunctionName() const { return "cost function name not set"; } void GPUSetup(); bool getDebugDisplayEnabled() { return false; } /** * Updates the cost parameters * @param params */ void setParams(const PARAMS_T& params) { params_ = params; if (GPUMemStatus_) { CLASS_T& derived = static_cast(*this); derived.paramsToDevice(); } } __host__ __device__ PARAMS_T getParams() { return params_; } void paramsToDevice(); /** * * @param description * @param data */ void updateCostmap(std::vector description, std::vector data){}; /** * deallocates the allocated cuda memory for an object */ void freeCudaMem(); /** * Computes the feedback control cost on CPU for RMPPI */ float computeFeedbackCost(const Eigen::Ref fb_u, const Eigen::Ref std_dev, const float lambda = 1.0f, const float alpha = 0.0f) { float cost = 0.0f; for (int i = 0; i < CONTROL_DIM; i++) { cost += params_.control_cost_coeff[i] * SQ(fb_u(i)) / SQ(std_dev(i)); } return 0.5f * lambda * (1.0f - alpha) * cost; } 
/** * Computes the control cost on CPU. This is the normal control cost calculation * in MPPI and Tube-MPPI */ float computeControlCost(const Eigen::Ref u, int timestep, int* crash) { return 0.0f; } // =================== METHODS THAT SHOULD HAVE NO DEFAULT ========================== /** * Computes the state cost on the CPU. Should be implemented in subclasses */ float computeStateCost(const Eigen::Ref y, int timestep, int* crash_status) { throw std::logic_error("SubClass did not implement computeStateCost"); } /** * * @param s current state as a float array * @return state cost on GPU */ __device__ float computeStateCost(float* y, int timestep, float* theta_c, int* crash_status); /** * Computes the state cost on the CPU. Should be implemented in subclasses */ float terminalCost(const Eigen::Ref y) { throw std::logic_error("SubClass did not implement terminalCost"); } /** * * @param s terminal state as float array * @return terminal cost on GPU */ __device__ float terminalCost(float* y, float* theta_c); /** * Method to allow setup of costs on the CPU. This is needed for * initializing the memory of an LSTM for example */ void initializeCosts(const Eigen::Ref& output, const Eigen::Ref& control, float t_0, float dt) { } /** * Method to allow setup of costs on the GPU. This is needed for * initializing the memory of an LSTM for example */ __device__ void initializeCosts(float* output, float* control, float* theta_c, float t_0, float dt) { } // ================ END OF METHODS WITH NO DEFAULT =========================== // =================== METHODS THAT SHOULD NOT BE OVERWRITTEN ================ /** * Computes the feedback control cost on GPU used in RMPPI. There is an * assumption that we are provided std_dev and the covriance matrix is * diagonal. 
*/ __device__ float computeFeedbackCost(float* fb_u, float* std_dev, float lambda = 1.0f, float alpha = 0.0f) { float cost = 0.0f; for (int i = 0; i < CONTROL_DIM; i++) { cost += params_.control_cost_coeff[i] * SQ(fb_u[i] / std_dev[i]); } return 0.5f * lambda * (1.0f - alpha) * cost; } /** * Computes the normal control cost for MPPI and Tube-MPPI * 0.5 * lambda * (u*^T \Sigma^{-1} u*^T + 2 * u*^T \Sigma^{-1} (u*^T + noise)) * On the GPU, u = u* + noise already, so we need the following to create * the original cost: * 0.5 * lambda * (u - noise)^T \Sigma^{-1} (u + noise) */ __device__ float computeControlCost(float* u, int timestep, float* theta_c, int* crash) { return 0.0f; } // =================== END METHODS THAT SHOULD NOT BE OVERWRITTEN ============ // =================== METHODS THAT CAN BE OVERWRITTEN ======================= float computeRunningCost(const Eigen::Ref y, const Eigen::Ref u, int timestep, int* crash) { CLASS_T* derived = static_cast(this); return derived->computeStateCost(y, timestep, crash) + derived->computeControlCost(u, timestep, crash); } __device__ float computeRunningCost(float* y, float* u, int timestep, float* theta_c, int* crash); // =================== END METHODS THAT CAN BE OVERWRITTEN =================== inline __host__ __device__ PARAMS_T getParams() const { return params_; } CLASS_T* cost_d_ = nullptr; protected: PARAMS_T params_; }; #if __CUDACC__ #include "cost.cu" #endif template const int Cost::CONTROL_DIM; template const int Cost::OUTPUT_DIM; #endif // COSTS_CUH_ ================================================ FILE: include/mppi/cost_functions/double_integrator/double_integrator_circle_cost.cu ================================================ #include DoubleIntegratorCircleCost::DoubleIntegratorCircleCost(cudaStream_t stream) { bindToStream(stream); } __device__ float DoubleIntegratorCircleCost::computeStateCost(float* s, int timestep, float* theta_c, int* crash_status) { float radial_position = s[0] * s[0] + s[1] * s[1]; 
float current_velocity = sqrtf(s[2] * s[2] + s[3] * s[3]); float current_angular_momentum = s[0] * s[3] - s[1] * s[2]; float cost = 0; // if ((radial_position < params_.inner_path_radius2) || // (radial_position > params_.outer_path_radius2)) { // crash_status[0] = 1; // Indicates the system has crashed. // } // // if (crash_status[0] > 0) { // If we've crashed once, constantly add the crash cost. // cost += powf(this->params_.discount, timestep)*params_.crash_cost; // } if ((radial_position < params_.inner_path_radius2) || (radial_position > params_.outer_path_radius2)) { cost += powf(this->params_.discount, timestep) * params_.crash_cost; } cost += params_.velocity_cost * abs(current_velocity - params_.velocity_desired); cost += params_.velocity_cost * abs(current_angular_momentum - params_.angular_momentum_desired); return cost; } float DoubleIntegratorCircleCost::computeStateCost(const Eigen::Ref s, int timestep, int* crash_status) { float radial_position = s[0] * s[0] + s[1] * s[1]; float current_velocity = sqrtf(s[2] * s[2] + s[3] * s[3]); float current_angular_momentum = s[0] * s[3] - s[1] * s[2]; float cost = 0; // if ((radial_position < params_.inner_path_radius2) || // (radial_position > params_.outer_path_radius2)) { // crash_status[0] = 1; // Indicates the system has crashed. // } // // if (crash_status[0] > 0) { // If we've crashed once, constantly add the crash cost. 
// cost += powf(this->params_.discount, timestep)*params_.crash_cost; // } if ((radial_position < params_.inner_path_radius2) || (radial_position > params_.outer_path_radius2)) { cost += powf(this->params_.discount, timestep) * params_.crash_cost; } cost += params_.velocity_cost * abs(current_velocity - params_.velocity_desired); cost += params_.velocity_cost * abs(current_angular_momentum - params_.angular_momentum_desired); return cost; } float DoubleIntegratorCircleCost::terminalCost(const Eigen::Ref s) { return 0; } __device__ float DoubleIntegratorCircleCost::terminalCost(float* state, float* theta_c) { return 0; } ================================================ FILE: include/mppi/cost_functions/double_integrator/double_integrator_circle_cost.cuh ================================================ #pragma once #ifndef DOUBLE_INTEGRATOR_CIRCLE_COST_CUH_ #define DOUBLE_INTEGRATOR_CIRCLE_COST_CUH_ #include #include struct DoubleIntegratorCircleCostParams : public CostParams<2> { float velocity_cost = 1; float crash_cost = 1000; float velocity_desired = 2; float inner_path_radius2 = 1.875 * 1.875; float outer_path_radius2 = 2.125 * 2.125; float angular_momentum_desired = 2 * velocity_desired; // Enforces the system travels counter clockwise DoubleIntegratorCircleCostParams() { control_cost_coeff[0] = 0.01; control_cost_coeff[1] = 0.01; discount = 1.0; } }; class DoubleIntegratorCircleCost : public Cost { public: DoubleIntegratorCircleCost(cudaStream_t stream = nullptr); float computeStateCost(const Eigen::Ref s, int timestep = 0, int* crash_status = nullptr); float terminalCost(const Eigen::Ref s); __device__ float computeStateCost(float* s, int timestep = 0, float* theta_c = nullptr, int* crash_status = nullptr); __device__ float terminalCost(float* s, float* theta_c); }; #if __CUDACC__ #include "double_integrator_circle_cost.cu" #endif #endif //! 
DOUBLE_INTEGRATOR_CIRCLE_COST_CUH_ ================================================ FILE: include/mppi/cost_functions/double_integrator/double_integrator_robust_cost.cu ================================================ #include #include DoubleIntegratorRobustCost::DoubleIntegratorRobustCost(cudaStream_t stream) { bindToStream(stream); } __device__ float DoubleIntegratorRobustCost::computeStateCost(float* s, int timestep, float* theta_c, int* crash_status) { float radial_position = s[0] * s[0] + s[1] * s[1]; float current_velocity = sqrtf(s[2] * s[2] + s[3] * s[3]); float current_angular_momentum = s[0] * s[3] - s[1] * s[2]; float cost = 0; float normalized_dist_from_center = mppi::math::normDistFromCenter( sqrt(radial_position), sqrt(params_.inner_path_radius2), sqrt(params_.outer_path_radius2)); float steep_percent_boundary = 0.5; float steep_cost = 0.5 * params_.crash_cost; // 10 percent of crash cost // Shallow cost region if (normalized_dist_from_center <= steep_percent_boundary) { cost += mppi::math::linInterp(normalized_dist_from_center, 0, steep_percent_boundary, 0, steep_cost); } // Steep cost region if (normalized_dist_from_center > steep_percent_boundary && normalized_dist_from_center <= 1.0) { cost += mppi::math::linInterp(normalized_dist_from_center, steep_percent_boundary, 1, steep_cost, params_.crash_cost); } // Crash Cost region if (normalized_dist_from_center > 1.0) { cost += params_.crash_cost; } cost += params_.velocity_cost * powf(current_velocity - params_.velocity_desired, 2); cost += params_.velocity_cost * powf(current_angular_momentum - params_.angular_momentum_desired, 2); return cost; } float DoubleIntegratorRobustCost::computeStateCost(const Eigen::Ref s, int timestep, int* crash_status) { float radial_position = s[0] * s[0] + s[1] * s[1]; float current_velocity = sqrtf(s[2] * s[2] + s[3] * s[3]); float current_angular_momentum = s[0] * s[3] - s[1] * s[2]; float cost = 0; float normalized_dist_from_center = mppi::math::normDistFromCenter( 
sqrt(radial_position), sqrt(params_.inner_path_radius2), sqrt(params_.outer_path_radius2)); float steep_percent_boundary = 0.75; float steep_cost = 0.1 * params_.crash_cost; if (normalized_dist_from_center <= steep_percent_boundary) { cost += mppi::math::linInterp(normalized_dist_from_center, 0, steep_percent_boundary, 0, steep_cost); } else if (normalized_dist_from_center > steep_percent_boundary && normalized_dist_from_center <= 1.0) { cost += mppi::math::linInterp(normalized_dist_from_center, steep_percent_boundary, 1, steep_cost, params_.crash_cost); } else { cost += params_.crash_cost; } cost += params_.velocity_cost * powf(current_velocity - params_.velocity_desired, 2); cost += params_.velocity_cost * powf(current_angular_momentum - params_.angular_momentum_desired, 2); return cost; } float DoubleIntegratorRobustCost::terminalCost(const Eigen::Ref s) { return 0; } __device__ float DoubleIntegratorRobustCost::terminalCost(float* s, float* theta_c) { return 0; } ================================================ FILE: include/mppi/cost_functions/double_integrator/double_integrator_robust_cost.cuh ================================================ #ifndef DOUBLE_INTEGRATOR_ROBUST_COST_CUH_ #define DOUBLE_INTEGRATOR_ROBUST_COST_CUH_ #include class DoubleIntegratorRobustCost : public Cost { public: EIGEN_MAKE_ALIGNED_OPERATOR_NEW DoubleIntegratorRobustCost(cudaStream_t stream = nullptr); float computeStateCost(const Eigen::Ref s, int timestep = 0, int* crash_status = nullptr); float terminalCost(const Eigen::Ref s); __device__ float computeStateCost(float* s, int timestep = 0, float* theta_c = nullptr, int* crash_status = nullptr); __device__ float terminalCost(float* s, float* theta_c); float getLipshitzConstantCost() { return params_.crash_cost; }; }; #if __CUDACC__ #include "double_integrator_robust_cost.cu" #endif #endif //! 
DOUBLE_INTEGRATOR_ROBUST_COST_CUH_ ================================================ FILE: include/mppi/cost_functions/quadratic_cost/quadratic_cost.cu ================================================ #include template QuadraticCostImpl::QuadraticCostImpl(cudaStream_t stream) { this->bindToStream(stream); } template float QuadraticCostImpl::computeStateCost(const Eigen::Ref s, int timestep, int* crash_status) { float cost = 0; output_array des_state = this->params_.getDesiredState(timestep); Eigen::Matrix coeffs; coeffs = Eigen::Matrix::Zero(); for (int i = 0; i < DYN_T::OUTPUT_DIM; i++) { coeffs(i, i) = this->params_.s_coeffs[i]; } output_array error = s - des_state; cost = error.transpose() * coeffs * error; return cost; } template float QuadraticCostImpl::terminalCost(const Eigen::Ref s) { return 0.0; } template __device__ float QuadraticCostImpl::computeStateCost(float* s, int timestep, float* theta_c, int* crash_status) { float cost = 0; float* desired_state = this->params_.getGoalStatePointer(timestep); for (int i = 0; i < DYN_T::OUTPUT_DIM; i++) { cost += powf(s[i] - desired_state[i], 2) * this->params_.s_coeffs[i]; } return cost; } template __device__ float QuadraticCostImpl::terminalCost(float* s, float* theta_c) { return 0.0; } ================================================ FILE: include/mppi/cost_functions/quadratic_cost/quadratic_cost.cuh ================================================ #pragma once /* * Created on Wed Dec 16 2020 by Bogdan */ #ifndef MPPI_COST_FUNCTIONS_QUADRATIC_COST_CUH_ #define MPPI_COST_FUNCTIONS_QUADRATIC_COST_CUH_ #include #include template struct QuadraticCostTrajectoryParams : public CostParams { typedef DYN_T TEMPLATED_DYN; static const int TIME_HORIZON = SIM_TIME_HORIZON; /** * Defines a general desired state and coeffs */ float s_goal[DYN_T::OUTPUT_DIM * SIM_TIME_HORIZON] = { 0 }; float s_coeffs[DYN_T::OUTPUT_DIM] = { 0 }; int current_time = 0; QuadraticCostTrajectoryParams() { for (int i = 0; i < DYN_T::CONTROL_DIM; i++) { 
this->control_cost_coeff[i] = 0; } for (int i = 0; i < DYN_T::OUTPUT_DIM; i++) { this->s_coeffs[i] = 1; } } const Eigen::Matrix getDesiredState(int t) { Eigen::Matrix s(s_goal + this->getIndex(t)); return s; } __device__ float* getGoalStatePointer(int t) { return s_goal + this->getIndex(t); } __host__ __device__ int getIndex(int t) { int index = (this->current_time + t); if (index >= TIME_HORIZON) { index = (TIME_HORIZON - 1); } index *= DYN_T::OUTPUT_DIM; return index; } __host__ __device__ void setCurrentTime(int new_time) { current_time = new_time; } }; template class QuadraticCostImpl : public Cost { public: typedef Cost PARENT_CLASS; using output_array = typename PARENT_CLASS::output_array; QuadraticCostImpl(cudaStream_t stream = nullptr); static constexpr float MAX_COST_VALUE = 1e16; /** * Host Functions */ virtual std::string getCostFunctionName() const override { return "Quadratic Cost"; } float computeStateCost(const Eigen::Ref s, int timestep = 0, int* crash_status = nullptr); float terminalCost(const Eigen::Ref s); float getLipshitzConstantCost() { // Find the spectral radius of the state matrix float rho_Q = fabsf(this->params_.s_coeffs[0]); for (int i = 1; i < DYN_T::OUTPUT_DIM; i++) { rho_Q = fmaxf(rho_Q, fabsf(this->params_.s_coeffs[i])); } return 2 * rho_Q; }; /** * Device Functions */ __device__ float computeStateCost(float* s, int timestep = 0, float* theta_c = nullptr, int* crash_status = nullptr); // Custom implementation that does a Nan check. 
// __device__ float computeRunningCost(float* s, float* u, float* noise, float* std_dev, float lambda, float alpha, // int timestep, float* theta_c, int* crash_status); __device__ float terminalCost(float* s, float* theta_c); }; #if __CUDACC__ #include "quadratic_cost.cu" #endif template class QuadraticCostTrajectory : public QuadraticCostImpl, DYN_T, QuadraticCostTrajectoryParams> { public: QuadraticCostTrajectory(cudaStream_t stream = nullptr) : QuadraticCostImpl>( stream){}; }; template class QuadraticCost : public QuadraticCostImpl, DYN_T, QuadraticCostTrajectoryParams> { public: QuadraticCost(cudaStream_t stream = nullptr) : QuadraticCostImpl>(stream){}; }; #endif // MPPI_COST_FUNCTIONS_QUADRATIC_COST_CUH_ ================================================ FILE: include/mppi/cost_functions/quadrotor/quadrotor_map_cost.cu ================================================ #include #include #include #include template QuadrotorMapCostImpl::QuadrotorMapCostImpl(cudaStream_t stream) { tex_helper_ = new TwoDTextureHelper(1, stream); this->bindToStream(stream); } template QuadrotorMapCostImpl::~QuadrotorMapCostImpl() { delete tex_helper_; } template void QuadrotorMapCostImpl::bindToStream(cudaStream_t stream) { PARENT_CLASS::bindToStream(stream); tex_helper_->bindToStream(stream); } template void QuadrotorMapCostImpl::freeCudaMem() { if (this->GPUMemStatus_) { // HANDLE_ERROR(cudaFreeArray(costmapArray_d_)); // HANDLE_ERROR(cudaDestroyTextureObject(costmap_tex_d_)); tex_helper_->freeCudaMem(); } PARENT_CLASS::freeCudaMem(); } template void QuadrotorMapCostImpl::GPUSetup() { PARENT_CLASS* derived = static_cast(this); tex_helper_->GPUSetup(); derived->GPUSetup(); HANDLE_ERROR(cudaMemcpyAsync(&(this->cost_d_->tex_helper_), &(tex_helper_->ptr_d_), sizeof(TwoDTextureHelper*), cudaMemcpyHostToDevice, this->stream_)); } template void QuadrotorMapCostImpl::paramsToDevice() { if (this->GPUMemStatus_) { // HANDLE_ERROR(cudaMemcpyAsync(&this->cost_d_->params_, &this->params_, 
sizeof(PARAMS_T), cudaMemcpyHostToDevice, // this->stream_)); // HANDLE_ERROR( // cudaMemcpyAsync(&this->cost_d_->width_, &width_, sizeof(float), cudaMemcpyHostToDevice, this->stream_)); // HANDLE_ERROR( // cudaMemcpyAsync(&this->cost_d_->height_, &height_, sizeof(float), cudaMemcpyHostToDevice, this->stream_)); // HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); tex_helper_->copyToDevice(); } PARENT_CLASS::paramsToDevice(); } template float QuadrotorMapCostImpl::computeStateCost(const Eigen::Ref s, int timestep, int* crash_status) { // TODO query texture on CPU float cost = 0; cost += computeGateSideCost(s.data()); cost += computeHeightCost(s.data()); cost += computeHeadingCost(s.data()); cost += computeSpeedCost(s.data()); cost += computeStabilizingCost(s.data()); cost += computeWaypointCost(s.data()); // Decrease cost if we pass a gate float dist_to_gate = distToWaypoint(s.data(), this->params_.curr_waypoint); if (dist_to_gate < this->params_.gate_margin) { // if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && // blockIdx.z == 0) // { // printf("Passing through a gate: state_vec: (%f, %f, %f)\n", s[O_IND_CLASS(DYN_T::PARAMS_T, POS_X)], // s[O_IND_CLASS(typename DYN_T::PARAMS_T, POS_Y), s[O_IND_CLASS(DYN_T::PARAMS_T, POS_Z)]]); // } cost += this->params_.gate_pass_cost; } return cost; } template __device__ float QuadrotorMapCostImpl::computeStateCost(float* s, int timestep, float* theta_c, int* crash_status) { float cost = 0; float costmap_cost, gate_cost, height_cost, heading_cost, speed_cost, stable_cost; float waypoint_cost; costmap_cost = computeCostmapCost(s); gate_cost = computeGateSideCost(s); height_cost = computeHeightCost(s); heading_cost = computeHeadingCost(s); speed_cost = computeSpeedCost(s); stable_cost = computeStabilizingCost(s); waypoint_cost = computeWaypointCost(s); if (gate_cost != 0) { *crash_status = 1; if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 && blockIdx.x == 0 && 
blockIdx.y == 0 && blockIdx.z == 0) { printf("hitting the gate?\n"); } } // if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == // 0 && timestep == 1) // { // // if (isnan(costmap_cost) || isnan(gate_cost) || isnan(height_cost) || isnan(heading_cost) || isnan(speed_cost) // // || // // isnan(stable_cost) || isnan(waypoint_cost)) // // { // printf( // "Costs rollout %d: Costmap %5.2f, Gate %5.2f, Height %5.2f," // " Heading %5.2f, Speed %5.2f, Stabilization %5.2f, Waypoint %5.2f\n", timestep, // costmap_cost, gate_cost, height_cost, heading_cost, speed_cost, stable_cost, waypoint_cost); // // } // // printf("Costs %d: height - %5.2f, stable - %5.2f prev_height %5.2f, cur_height: %5.2f\n", timestep, // // height_cost, // // stable_cost, this->params_.prev_waypoint.z, this->params_.curr_waypoint.z); // } cost += costmap_cost + gate_cost + height_cost + heading_cost + speed_cost + stable_cost; // Decrease cost if we pass a gate float dist_to_gate = distToWaypoint(s, this->params_.curr_waypoint); if (dist_to_gate < this->params_.gate_margin) { cost += this->params_.gate_pass_cost; } cost += *crash_status * this->params_.crash_coeff; return cost; } template __host__ __device__ float QuadrotorMapCostImpl::distToWaypoint(float* s, float4 waypoint) { float dist = sqrt(SQ(s[E_INDEX(OutputIndex, POS_X)] - waypoint.x) + SQ(s[E_INDEX(OutputIndex, POS_Y)] - waypoint.y) + SQ(s[E_INDEX(OutputIndex, POS_Z)] - waypoint.z)); return dist; } template void QuadrotorMapCostImpl::updateWaypoint(float4 new_waypoint) { updateWaypoint(new_waypoint.x, new_waypoint.y, new_waypoint.z, new_waypoint.w); } template void QuadrotorMapCostImpl::updateWaypoint(float x, float y, float z, float heading) { if (this->params_.updateWaypoint(x, y, z, heading)) { paramsToDevice(); } } template void QuadrotorMapCostImpl::updateGateBoundaries(float3 left_side, float3 right_side) { updateGateBoundaries(left_side.x, left_side.y, left_side.z, 
right_side.x, right_side.y, right_side.z); } template void QuadrotorMapCostImpl::updateGateBoundaries(std::vector boundaries) { if (boundaries.size() < 6) { std::cerr << "You need " << 6 - boundaries.size() << " more floats in the" << " call to updateGateBoundaries" << std::endl; return; } updateGateBoundaries(boundaries[0], boundaries[1], boundaries[2], boundaries[3], boundaries[4], boundaries[5]); } template void QuadrotorMapCostImpl::updateGateBoundaries(float left_x, float left_y, float left_z, float right_x, float right_y, float right_z) { if (this->params_.updateGateBoundaries(left_x, left_y, left_z, right_x, right_y, right_z)) { paramsToDevice(); } } template __host__ __device__ float QuadrotorMapCostImpl::computeStabilizingCost(float* s) { float cost = 0; float roll, pitch, yaw; mppi::math::Quat2EulerNWU(&s[6], roll, pitch, yaw); float quat_dist_from_level = SQ(roll) + SQ(pitch); cost += this->params_.attitude_coeff * quat_dist_from_level; return cost; } template __host__ __device__ float QuadrotorMapCostImpl::computeHeadingCost(float* s) { float cost = 0; // float roll, pitch, yaw; // mppi::math::Quat2EulerNWU(&s[6], roll, pitch, yaw); float R[3][3]; mppi::math::Quat2DCM(&s[E_INDEX(OutputIndex, QUAT_W)], R); const float& vx = s[E_INDEX(OutputIndex, VEL_X)]; const float& vy = s[E_INDEX(OutputIndex, VEL_Y)]; const float& vz = s[E_INDEX(OutputIndex, VEL_Z)]; float3 w_v = make_float3(R[0][0] * vx + R[0][1] * vy + R[0][2] * vz, R[1][0] * vx + R[1][1] * vy + R[1][2] * vz, R[2][0] * vx + R[2][1] * vy + R[2][2] * vz); float yaw = atan2f(w_v.y, w_v.x); // Calculate heading to gate float wx = this->params_.curr_waypoint.x - s[E_INDEX(OutputIndex, POS_X)]; float wy = this->params_.curr_waypoint.y - s[E_INDEX(OutputIndex, POS_Y)]; float w_heading = atan2f(wy, wx); float dist_to_gate = distToWaypoint(s, this->params_.curr_waypoint); // Far away from the gate, we want to be pointing at the gate if (dist_to_gate > this->params_.gate_margin) { cost += 
this->params_.heading_coeff * powf(fabsf(angle_utils::shortestAngularDistance(w_heading, yaw)), this->params_.heading_power); } return cost; } template __host__ __device__ float QuadrotorMapCostImpl::computeSpeedCost(float* s) { float cost = 0; float speed = sqrt(s[E_INDEX(OutputIndex, VEL_X)] * s[E_INDEX(OutputIndex, VEL_X)] + s[E_INDEX(OutputIndex, VEL_Y)] * s[E_INDEX(OutputIndex, VEL_Y)]); float desired_speed = this->params_.desired_speed; // if (this->params_.curr_waypoint == this->params_.end_waypoint) // { // desired_speed = distToWaypoint(s, this->params_.curr_waypoint); // } cost = this->params_.speed_coeff * SQ(speed - desired_speed); return cost; } template __host__ __device__ float QuadrotorMapCostImpl::computeWaypointCost(float* s) { float cost = 0; float dist_to_gate = distToWaypoint(s, this->params_.curr_waypoint); cost = this->params_.dist_to_waypoint_coeff * SQ(dist_to_gate); return cost; } template __host__ __device__ float QuadrotorMapCostImpl::computeGateSideCost(float* s) { float cost = 0; float2 curr_left = make_float2(this->params_.curr_gate_left.x, this->params_.curr_gate_left.y); float2 curr_right = make_float2(this->params_.curr_gate_right.x, this->params_.curr_gate_right.y); // Calculate the side border cost float dist_to_left_side = sqrtf(SQ(s[E_INDEX(OutputIndex, POS_X)] - curr_left.x) + SQ(s[E_INDEX(OutputIndex, POS_Y)] - curr_left.y)); float dist_to_right_side = sqrtf(SQ(s[E_INDEX(OutputIndex, POS_X)] - curr_right.x) + SQ(s[E_INDEX(OutputIndex, POS_Y)] - curr_right.y)); float prev_dist_to_left_side = sqrtf(SQ(s[E_INDEX(OutputIndex, POS_X)] - this->params_.prev_gate_left.x) + SQ(s[E_INDEX(OutputIndex, POS_Y)] - this->params_.prev_gate_left.y)); float prev_dist_to_right_side = sqrtf(SQ(s[E_INDEX(OutputIndex, POS_X)] - this->params_.prev_gate_right.x) + SQ(s[E_INDEX(OutputIndex, POS_Y)] - this->params_.prev_gate_right.y)); float2 gate_vec = curr_left - curr_right; float2 state_vec_right = make_float2(s[E_INDEX(OutputIndex, POS_X)], 
s[E_INDEX(OutputIndex, POS_Y)]) - curr_right; float2 state_vec_left = make_float2(s[E_INDEX(OutputIndex, POS_X)], s[E_INDEX(OutputIndex, POS_Y)]) - curr_left; const float perp_dist = cross(state_vec_right, gate_vec); const float comp_state_along_gate_right = dot(state_vec_right, gate_vec) / dot(gate_vec, gate_vec); const float threshold = 0.5; const float comp_state_along_gate_left = dot(state_vec_left, -gate_vec) / dot(gate_vec, gate_vec); const float outside_gate = fmaxf(comp_state_along_gate_left, comp_state_along_gate_right); // Find the side closest to the current state // const float closest_side_dist = // fminf(dist_to_left_side, fminf(dist_to_right_side, fminf(prev_dist_to_left_side, prev_dist_to_right_side))); // if (fabsf(perp_dist) < this->params_.min_dist_to_gate_side && // fmaxf(comp_state_along_gate_left, comp_state_along_gate_right) > 1.0f) // { // Within perpendicular distance of min_dist and outside of gate // cost += this->params_.crash_coeff * fmaxf(comp_state_along_gate_left, comp_state_along_gate_right); if (fabsf(perp_dist) < this->params_.min_dist_to_gate_side && ((comp_state_along_gate_right < 0.0f && comp_state_along_gate_right >= 0.0f - threshold) || (comp_state_along_gate_right > 1.0f && comp_state_along_gate_right <= 1.0f + threshold))) { cost += this->params_.crash_coeff * fabsf(comp_state_along_gate_right); if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0) { printf("Hitting a gate: state_vec: (%f, %f), gate_vec: (%f, %f), projection onto gate: %f\n", state_vec_right.x, state_vec_right.y, gate_vec.x, gate_vec.y, comp_state_along_gate_right); } } // if (closest_side_dist < this->params_.min_dist_to_gate_side) // { // cost += this->params_.crash_coeff * (this->params_.min_dist_to_gate_side - closest_side_dist); // if (fmaxf(comp_state_along_gate_left, comp_state_along_gate_right) > 1.0f) // { // cost += this->params_.crash_coeff * fmaxf(comp_state_along_gate_left, 
comp_state_along_gate_right); // if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && // blockIdx.z == 0 // ) // { // printf("Hitting a gate\n"); // } // } // } return cost; } template __host__ __device__ float QuadrotorMapCostImpl::computeHeightCost(float* s) { // Calculate height cost float cost = 0; // if (s[E_INDEX(OutputIndex, POS_Z)] < 1) { // return this->params_.crash_coeff; // } float d1 = sqrtf(SQ(s[E_INDEX(OutputIndex, POS_X)] - this->params_.prev_waypoint.x) + SQ(s[E_INDEX(OutputIndex, POS_Y)] - this->params_.prev_waypoint.y)); float d2 = sqrtf(SQ(s[E_INDEX(OutputIndex, POS_X)] - this->params_.curr_waypoint.x) + SQ(s[E_INDEX(OutputIndex, POS_Y)] - this->params_.curr_waypoint.y)); float w1 = d1 / (d1 + d2 + 0.001); float w2 = d2 / (d1 + d2 + 0.001); float interpolated_height = (1.0 - w1) * this->params_.prev_waypoint.z + (1.0 - w2) * this->params_.curr_waypoint.z; float height_diff = SQ(fabsf(s[E_INDEX(OutputIndex, POS_Z)] - interpolated_height)); if (height_diff < 0) { cost += this->params_.crash_coeff * (1 - height_diff); } else { cost += this->params_.height_coeff * height_diff; } // cost += this->params_.height_coeff * height_diff; if (height_diff > this->params_.gate_width) { cost += 400; } return cost; } template __device__ float QuadrotorMapCostImpl::computeCostmapCost(float* s) { float cost = 0; if (!this->tex_helper_->checkTextureUse(0)) { return cost; } float3 query_point = make_float3(s[E_INDEX(OutputIndex, POS_X)], s[E_INDEX(OutputIndex, POS_Y)], s[E_INDEX(OutputIndex, POS_Z)]); float3 tex_coords; this->tex_helper_->worldPoseToTexCoord(0, query_point, tex_coords); if (tex_coords.x < 0.0f || tex_coords.x > 1.0f || tex_coords.y < 0.0f || tex_coords.y > 1.0f) { // The vehicle is not in the map anymore cost += this->params_.crash_coeff; if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0) { printf("left the map\n"); } } float 
track_cost = this->tex_helper_->queryTexture(0, tex_coords); // Calculate cost based on distance from centerline of the track if (track_cost > this->params_.track_slop) { cost += this->params_.track_coeff * track_cost; } if (track_cost > this->params_.track_boundary_cost) { // the cost at this point on the costmap indicates no longer being on the track cost += this->params_.crash_coeff; } return cost; } template float QuadrotorMapCostImpl::terminalCost(const Eigen::Ref s) { return 0; } template __device__ float QuadrotorMapCostImpl::terminalCost(float* s, float* theta_c) { return 0; } template std::vector QuadrotorMapCostImpl::loadTrackData(std::string map_path) { // check if file exists if (!fileExists(map_path)) { std::cerr << "ERROR: map path invalid, " << map_path << std::endl; return std::vector(); } // load the npz file cnpy::npz_t map_dict = cnpy::npz_load(map_path); float x_min, x_max, y_min, y_max, ppm; float* xBounds = map_dict["xBounds"].data(); float* yBounds = map_dict["yBounds"].data(); float* pixelsPerMeter = map_dict["pixelsPerMeter"].data(); x_min = xBounds[0]; x_max = xBounds[1]; y_min = yBounds[0]; y_max = yBounds[1]; ppm = pixelsPerMeter[0]; int width = int((x_max - x_min) * ppm); int height = int((y_max - y_min) * ppm); if (!changeCostmapSize(width, height)) { std::cerr << "ERROR: load track has invalid sizes" << std::endl; return std::vector(); } float* channel0 = map_dict["channel0"].data(); float* channel1 = map_dict["channel1"].data(); float* channel2 = map_dict["channel2"].data(); float* channel3 = map_dict["channel3"].data(); // copy the track data into CPU side storage for (int i = 0; i < width_ * height_; i++) { // std::cout << i << " = " << channel0[i] << ", " << channel1[i] << ", " << channel2[i] << ", " << channel3[i] << // std::endl; track_costs_[i].x = channel0[i]; track_costs_[i].y = channel1[i]; track_costs_[i].z = channel2[i]; track_costs_[i].w = channel3[i]; } Eigen::Matrix3f R; Eigen::Array3f trs; // Save the scaling and 
offset R << 1. / (x_max - x_min), 0, 0, 0, 1. / (y_max - y_min), 0, 0, 0, 1; trs << -x_min / (x_max - x_min), -y_min / (y_max - y_min), 1; updateTransform(R, trs); costmapToTexture(); return track_costs_; } template bool QuadrotorMapCostImpl::changeCostmapSize(int width, int height) { // TODO set flag at top that indicates memory allocation changes if (height < 0 && width < 0) { std::cerr << "ERROR: cannot resize costmap to size less than 1" << std::endl; return false; } if (height != height_ || width != width_) { track_costs_.resize(width * height); // Allocate memory for the cuda array which is bound the costmap_tex_ // has been allocated in the past, must be freed if (height_ > 0 && width_ > 0) { HANDLE_ERROR(cudaFreeArray(costmapArray_d_)); } // 4 floats of size 32 bits channelDesc_ = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat); HANDLE_ERROR(cudaMallocArray(&costmapArray_d_, &channelDesc_, width, height)); // set all of the elements in the array to be zero std::vector zero_array(width_ * height_); zero_array.resize(width * height, make_float4(0, 0, 0, 0)); HANDLE_ERROR(cudaMemcpyToArray(costmapArray_d_, 0, 0, zero_array.data(), width * height * sizeof(float4), cudaMemcpyHostToDevice)); } width_ = width; height_ = height; return true; } template void QuadrotorMapCostImpl::updateTransform(const Eigen::Ref& m, const Eigen::Ref& trs) { this->params_.r_c1.x = m(0, 0); this->params_.r_c1.y = m(1, 0); this->params_.r_c1.z = m(2, 0); this->params_.r_c2.x = m(0, 1); this->params_.r_c2.y = m(1, 1); this->params_.r_c2.z = m(2, 1); this->params_.trs.x = trs(0); this->params_.trs.y = trs(1); this->params_.trs.z = trs(2); // Move the updated parameters to gpu memory if (this->GPUMemStatus_) { paramsToDevice(); } } template void QuadrotorMapCostImpl::costmapToTexture() { if (width_ < 0 || height_ < 0) { std::cerr << "ERROR: cannot allocate texture with zero size" << std::endl; return; } // transfer CPU version of costmap to GPU float4* costmap_ptr = 
track_costs_.data(); HANDLE_ERROR( cudaMemcpyToArray(costmapArray_d_, 0, 0, costmap_ptr, width_ * height_ * sizeof(float4), cudaMemcpyHostToDevice)); cudaStreamSynchronize(this->stream_); // Specify texture struct cudaResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); resDesc.resType = cudaResourceTypeArray; resDesc.res.array.array = costmapArray_d_; // Specify texture object parameters struct cudaTextureDesc texDesc; memset(&texDesc, 0, sizeof(texDesc)); texDesc.addressMode[0] = cudaAddressModeClamp; texDesc.addressMode[1] = cudaAddressModeClamp; texDesc.filterMode = cudaFilterModePoint; texDesc.readMode = cudaReadModeElementType; texDesc.normalizedCoords = 1; // Destroy current texture and create new texture object HANDLE_ERROR(cudaDestroyTextureObject(costmap_tex_d_)); HANDLE_ERROR(cudaCreateTextureObject(&costmap_tex_d_, &resDesc, &texDesc, NULL)); // copy over pointers setup up on CPU code to GPU HANDLE_ERROR(cudaMemcpyAsync(&this->cost_d_->costmapArray_d_, &costmapArray_d_, sizeof(cudaArray*), cudaMemcpyHostToDevice, this->stream_)); HANDLE_ERROR(cudaMemcpyAsync(&this->cost_d_->costmap_tex_d_, &costmap_tex_d_, sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice, this->stream_)); cudaStreamSynchronize(this->stream_); } template __device__ float4 QuadrotorMapCostImpl::queryTexture(float x, float y) const { return tex2D(costmap_tex_d_, x, y); } template __device__ float4 QuadrotorMapCostImpl::queryTextureTransformed(float x, float y) { float u, v, w; coorTransform(x, y, &u, &v, &w); return queryTexture(u / w, v / w); } template __host__ __device__ void QuadrotorMapCostImpl::coorTransform(float x, float y, float* u, float* v, float* w) { ////Compute a projective transform of (x, y, 0, 1) // converts to the texture [0-1] coordinate system u[0] = this->params_.r_c1.x * x + this->params_.r_c2.x * y + this->params_.trs.x; v[0] = this->params_.r_c1.y * x + this->params_.r_c2.y * y + this->params_.trs.y; w[0] = this->params_.r_c1.z * x + this->params_.r_c2.z * 
y + this->params_.trs.z; } ================================================ FILE: include/mppi/cost_functions/quadrotor/quadrotor_map_cost.cuh ================================================ /* * Created on Wed Jul 22 2020 by Bogdan */ #ifndef MPPI_COST_FUNCTIONS_QUADROTOR_MAP_COST_CUH_ #define MPPI_COST_FUNCTIONS_QUADROTOR_MAP_COST_CUH_ #include #include #include #include #include struct QuadrotorMapCostParams : public CostParams<4> { float3 r_c1; float3 r_c2; float3 trs; float attitude_coeff = 10; float crash_coeff = 1000; float dist_to_waypoint_coeff = 0.0; float heading_coeff = 5; float heading_power = 1.0; // power to take the difference in headings to float height_coeff = 5; float track_coeff = 10; float speed_coeff = 5; float track_slop = 0.0; float gate_pass_cost = -150; float4 curr_waypoint; float4 prev_waypoint; float3 curr_gate_left; float3 curr_gate_right; float3 prev_gate_left; float3 prev_gate_right; float4 end_waypoint; // if the costmap cost is above this, we are no longer on the track float desired_speed = 5; // [m/s] float gate_margin = 0.5; // [m] float min_dist_to_gate_side = 0.5; // [m] TODO find correct value for this float track_boundary_cost = 2.5; float gate_width = 2.15; // [m] QuadrotorMapCostParams() { control_cost_coeff[0] = 1; // roll control_cost_coeff[1] = 1; // pitch control_cost_coeff[2] = 1; // yaw control_cost_coeff[3] = 1; // throttle curr_waypoint = make_float4(0, 0, 0, 0); prev_waypoint = make_float4(0, 0, 0, 0); curr_gate_left = make_float3(0, 0, 0); curr_gate_right = make_float3(0, 0, 0); prev_gate_left = make_float3(0, 0, 0); prev_gate_right = make_float3(0, 0, 0); end_waypoint = make_float4(NAN, NAN, NAN, NAN); } bool updateWaypoint(float x, float y, float z, float heading = 0) { bool changed = false; if (curr_waypoint.x != x || curr_waypoint.y != y || curr_waypoint.z != z || curr_waypoint.w != heading) { prev_waypoint = curr_waypoint; curr_waypoint = make_float4(x, y, z, heading); float3 curr_left = make_float3(x + 
cosf(heading) * gate_width, y + sinf(heading) * gate_width, z); float3 curr_right = make_float3(x - cosf(heading) * gate_width, y - sinf(heading) * gate_width, z); updateGateBoundaries(curr_left.x, curr_left.y, curr_left.z, curr_right.x, curr_right.y, curr_right.z); changed = true; } return changed; } bool updateGateBoundaries(float left_x, float left_y, float left_z, float right_x, float right_y, float right_z) { bool changed = false; if (curr_gate_left.x != left_x || curr_gate_left.y != left_y || curr_gate_left.z != left_z || curr_gate_right.x != right_x || curr_gate_right.y != right_y || curr_gate_right.z != right_z) { prev_gate_left = curr_gate_left; prev_gate_right = curr_gate_right; curr_gate_left = make_float3(left_x, left_y, left_z); curr_gate_right = make_float3(right_x, right_y, right_z); changed = true; } return changed; } }; template class QuadrotorMapCostImpl : public Cost { public: // I think these typedefs are needed because this class is itself templated? typedef Cost PARENT_CLASS; using output_array = typename PARENT_CLASS::output_array; using control_array = typename PARENT_CLASS::control_array; using OutputIndex = typename PARENT_CLASS::OutputIndex; using ControlIndex = typename PARENT_CLASS::ControlIndex; using DYN_P = QuadrotorDynamicsParams; QuadrotorMapCostImpl(cudaStream_t stream = 0); ~QuadrotorMapCostImpl(); void bindToStream(cudaStream_t stream); void GPUSetup(); void freeCudaMem(); void paramsToDevice(); std::string getCostFunctionName() const override { return std::string("Quadrotor Map Cost"); } float computeStateCost(const Eigen::Ref s, int timestep, int* crash_status); __device__ float computeStateCost(float* s, int timestep, float* theta_c, int* crash_status); float terminalCost(const Eigen::Ref s); __device__ float terminalCost(float* s, float* theta_c = nullptr); __host__ __device__ float computeGateSideCost(float* s); __host__ __device__ float computeHeadingCost(float* s); __host__ __device__ float computeHeightCost(float* s); 
__host__ __device__ float computeSpeedCost(float* s); __host__ __device__ float computeStabilizingCost(float* s); __host__ __device__ float computeWaypointCost(float* s); __host__ __device__ float distToWaypoint(float* s, float4 waypoint); void updateWaypoint(float4 new_waypoint); void updateWaypoint(float x, float y, float z, float heading = 0); void updateGateBoundaries(float3 left_side, float3 right_side); void updateGateBoundaries(std::vector boundaries); void updateGateBoundaries(float left_x, float left_y, float left_z, float right_x, float right_y, float right_z); /** =================== Cost Map Related Functions ================== **/ __device__ float computeCostmapCost(float* s); std::vector loadTrackData(std::string map_path); __device__ float4 queryTexture(float x, float y) const; /** * alters the costmap size in CPU storage and GPU texture * @param width * @param height * @return */ bool changeCostmapSize(int width, int height); /** * @brief Binds the member variable costmap to a CUDA texture. */ void costmapToTexture(); /** * @brief Updates the current costmap coordinate transform. * @param h Matrix representing a transform from world to (offset) costmap coordinates. * @param trs Array representing the offset. */ void updateTransform(const Eigen::Ref& m, const Eigen::Ref& trs); /** * @brief Compute a coordinate transform going from world to costmap coordinates. */ __host__ __device__ void coorTransform(float x, float y, float* u, float* v, float* w); /** * Queries the texture using coorTransform beforehand */ __device__ float4 queryTextureTransformed(float x, float y); TwoDTextureHelper* tex_helper_ = nullptr; protected: std::string map_path_; // Costmap Related Variables std::vector track_costs_; int width_ = -1; int height_ = -1; cudaArray* costmapArray_d_; ///< Cuda array for texture binding. cudaChannelFormatDesc channelDesc_; ///< Cuda texture channel description. cudaTextureObject_t costmap_tex_d_; ///< Cuda texture object. 
}; class QuadrotorMapCost : public QuadrotorMapCostImpl { public: QuadrotorMapCost(cudaStream_t stream = 0) : QuadrotorMapCostImpl(stream){}; }; #if __CUDACC__ #include "quadrotor_map_cost.cu" #endif #endif // MPPI_COST_FUNCTIONS_QUADROTOR_MAP_COST_CUH_ ================================================ FILE: include/mppi/cost_functions/quadrotor/quadrotor_quadratic_cost.cu ================================================ #include QuadrotorQuadraticCost::QuadrotorQuadraticCost(cudaStream_t stream) { bindToStream(stream); } /** * Host Functions */ float QuadrotorQuadraticCost::computeStateCost(const Eigen::Ref s, int timestep, int* crash_status) { Eigen::Vector3f x, v, w; // Eigen::Vector4f q; Eigen::Map x_g(this->params_.x_goal()); Eigen::Map v_g(this->params_.v_goal()); Eigen::Map w_g(this->params_.w_goal()); x = s.block<3, 1>(E_INDEX(OutputIndex, POS_X), 0); v = s.block<3, 1>(E_INDEX(OutputIndex, VEL_X), 0); w = s.block<3, 1>(E_INDEX(OutputIndex, ANG_VEL_X), 0); // Quaternion/Angle Costs Eigen::Quaternionf q(s[6], s[7], s[8], s[9]); Eigen::Quaternionf q_g(this->params_.q_goal()[0], this->params_.q_goal()[1], this->params_.q_goal()[2], this->params_.q_goal()[3]); Eigen::Quaternionf q_diff; mppi::math::QuatSubtract(q, q_g, q_diff); Eigen::VectorXf rot_diff; Eigen::ArrayXf rot_coeff; if (this->params_.use_euler) { float r_diff, p_diff, y_diff; mppi::math::Quat2EulerNWU(q_diff, r_diff, p_diff, y_diff); Eigen::Vector3f angle_diff; angle_diff << r_diff, p_diff, y_diff; Eigen::Array3f angle_coeff; angle_coeff << this->params_.roll_coeff, this->params_.pitch_coeff, this->params_.yaw_coeff; rot_diff = angle_diff; rot_coeff = angle_coeff; } else { Eigen::Vector4f q_vec; q_vec << q_diff.w(), q_diff.x(), q_diff.y(), q_diff.z(); Eigen::Array4f q_coeff; q_coeff << this->params_.q_coeff, this->params_.q_coeff, this->params_.q_coeff, this->params_.q_coeff; rot_diff = q_vec; rot_coeff = q_coeff; } Eigen::Vector3f x_cost = this->params_.x_coeff * (x - x_g).array().square(); 
Eigen::Vector3f v_cost = this->params_.v_coeff * (v - v_g).array().square(); Eigen::Vector3f q_cost = rot_coeff * rot_diff.array().square(); Eigen::Vector3f w_cost = this->params_.w_coeff * (w - w_g).array().square(); return x_cost.sum() + v_cost.sum() + q_cost.sum() + w_cost.sum(); } float QuadrotorQuadraticCost::terminalCost(const Eigen::Ref s) { return this->params_.terminal_cost_coeff * computeStateCost(s); } /** * Device Functions */ __device__ float QuadrotorQuadraticCost::computeStateCost(float* s, int timestep, float* theta_c, int* crash_status) { float s_diff[OUTPUT_DIM]; int i; float sum = 0; for (i = 0; i < OUTPUT_DIM; i++) { s_diff[i] = powf(s[i] - this->params_.s_goal[i], 2); } float q_diff[4]; mppi::math::QuatSubtract(s + 6, this->params_.s_goal + 6, q_diff); for (i = 0; i < 3; i++) { s_diff[i] *= this->params_.x_coeff; } for (i = 3; i < 6; i++) { s_diff[i] *= this->params_.v_coeff; } if (!this->params_.use_euler) { for (i = 6; i < 10; i++) { s_diff[i] = this->params_.q_coeff * q_diff[i - 6]; } } if (this->params_.use_euler) { // Zero out quaternion portion for later for (i = 6; i < 10; i++) { s_diff[i] = 0; } // Get Euler Angles float r_diff, p_diff, y_diff; mppi::math::Quat2EulerNWU(q_diff, r_diff, p_diff, y_diff); sum += this->params_.roll_coeff * SQ(r_diff); sum += this->params_.pitch_coeff * SQ(p_diff); sum += this->params_.yaw_coeff * SQ(y_diff); } for (i = 10; i < 13; i++) { s_diff[i] *= this->params_.w_coeff; } for (i = 0; i < OUTPUT_DIM; i++) { sum += s_diff[i]; } // do a final nan check return sum * (1 - isnan(sum)) + isnan(sum) * MAX_COST_VALUE; } __device__ float QuadrotorQuadraticCost::terminalCost(float* s, float* theta_c) { float cost = this->params_.terminal_cost_coeff * computeStateCost(s); return cost * (1 - isnan(cost)) + isnan(cost) * MAX_COST_VALUE; } ================================================ FILE: include/mppi/cost_functions/quadrotor/quadrotor_quadratic_cost.cuh ================================================ /* * 
Created on Thu Jun 11 2020 by Bogdan */ #ifndef MPPI_COST_FUNCTIONS_QUADROTOR_QUADRATIC_COST_CUH_ #define MPPI_COST_FUNCTIONS_QUADROTOR_QUADRATIC_COST_CUH_ #include #include #include struct QuadrotorQuadraticCostParams : public CostParams<4> { /** * State for this class is defined as follows: * x - position in 3D space (x, y, z) - meters * v - velocity in 3D space (v_x, v_y_ v_z) - meters/sec * q - quaternion (q_w, q_x, q_y, q_z) * w - angular velocities (roll_dot, pitch_dot, yaw_dot) - rad/sec * * Coordinate Frame is NWU */ float s_goal[13] = { 0, 0, 0, // x 0, 0, 0, // v 1, 0, 0, 0, // q 0, 0, 0 }; // w float* x_goal() { return &s_goal[0]; } float* v_goal() { return &s_goal[3]; } float* q_goal() { return &s_goal[6]; } float* w_goal() { return &s_goal[10]; } float x_coeff = 1.0; float v_coeff = 1.0; // Euler Angle or Quaternion scoring bool use_euler = true; float q_coeff = 1.0; // Quaternion Modifier float roll_coeff = 1.0; float pitch_coeff = 1.0; float yaw_coeff = 1.0; float w_coeff = 1.0; float terminal_cost_coeff = 0; QuadrotorQuadraticCostParams() { control_cost_coeff[0] = 2.0; control_cost_coeff[1] = 2.0; control_cost_coeff[2] = 2.0; control_cost_coeff[3] = 2.0; } Eigen::Matrix getDesiredState() { Eigen::Matrix s; s << s_goal[0], s_goal[1], s_goal[2], s_goal[3], s_goal[4], s_goal[5], s_goal[6], s_goal[7], s_goal[8], s_goal[9], s_goal[10], s_goal[11], s_goal[12]; return s; } }; class QuadrotorQuadraticCost : public Cost { /** * State for this class is defined as follows: * x - position in 3D space (x, y, z) - meters * v - velocity in 3D space (v_x, v_y_ v_z) - meters/sec * q - quaternion (q_w, q_x, q_y, q_z) * w - angular velocities (roll_dot, pitch_dot, yaw_dot) - rad/sec * * Coordinate Frame is NWU * * Control: * roll_rate - rad/sec * pitch_rate - rad/sec * yaw_rate - rad/sec * thrust - Newtons */ public: QuadrotorQuadraticCost(cudaStream_t stream = nullptr); static constexpr float MAX_COST_VALUE = 1e16; /** * Host Functions */ float computeStateCost(const 
/**
 * @brief Tunable parameters for the projected-Newton box-constrained QP solver.
 * NOTE(review): extraction stripped the template parameter list and the
 * static_cast targets; reconstructed as `template <typename T>` / `static_cast<T>`.
 */
template <typename T>
struct BoxQPOptions
{
  int max_iter;           ///< maximum number of QP iterations
  T min_grad_norm;        ///< stop when the free-dimension gradient norm falls below this
  T min_rel_improvement;  ///< stop when the relative objective improvement falls below this
  T step_dec;             ///< multiplicative step decrease used by the Armijo line search
  T min_step;             ///< smallest admissible line-search step before giving up
  T armijo;               ///< Armijo sufficient-decrease parameter
  bool verbose;           ///< emit per-iteration logging when true

  BoxQPOptions(int max_iter, T min_grad_norm, T min_rel_improvement, T step_dec, T min_step, T armijo, bool verbose)
    : max_iter(max_iter)
    , min_grad_norm(min_grad_norm)
    , min_rel_improvement(min_rel_improvement)
    , step_dec(step_dec)
    , min_step(min_step)
    , armijo(armijo)
    , verbose(verbose)
  {
  }

  // Defaults follow the classic boxQP reference implementation.
  BoxQPOptions()
    : BoxQPOptions(100, static_cast<T>(1.0e-8), static_cast<T>(1.0e-8), static_cast<T>(0.6), static_cast<T>(1.0e-22),
                   static_cast<T>(0.1), false)
  {
  }
};
MAX_ITERATIONS_EXCEEDED = 1; static const int MAX_LINESEARCH_ITERATIONS_EXCEEDED = 2; static const int NO_BOUNDS = 3; static const int IMPROVEMENT_TOO_SMALL = 4; static const int GRADIENT_NORM_TOO_SMALL = 5; static const int ALL_DIMENSIONS_CLAMPED = 6; BoxQP(int max_iter, Scalar min_grad_norm, Scalar min_rel_improvement, Scalar step_dec, Scalar min_step, Scalar armijo, Logger* logger, bool verbose) : max_iter_(max_iter) , min_grad_(min_grad_norm) , min_rel_improve_(min_rel_improvement) , step_dec_(step_dec) , min_step_(min_step) , armijo_(armijo) , logger_(logger) , verbose_(verbose) { } BoxQP(const BoxQPOptions& options, Logger* logger) : BoxQP(options.max_iter, options.min_grad_norm, options.min_rel_improvement, options.step_dec, options.min_step, options.armijo, logger, options.verbose) { } BoxQP(Logger* logger) : BoxQP(BoxQPOptions(), logger) { } Result operator()(const Eigen::Ref& H, const Eigen::Ref& g, const Eigen::Ref& lower_bound, const Eigen::Ref& upper_bound, const Eigen::Ref& x0) { Eigen::Matrix clamped = Eigen::Matrix::Constant(false); Eigen::Matrix free = Eigen::Matrix::Constant(true); Vector x = x0.cwiseMin(upper_bound).cwiseMax(lower_bound); Scalar old_value = static_cast(0.0); int result = 0; Scalar gnorm = static_cast(0.0); int nfactor = 0; // Initial objective value Scalar value = x.transpose() * g + static_cast(0.5) * (x.transpose() * H * x).value(); // Main loop int iter; for (iter = 0; iter < max_iter_; ++iter) { if (result != 0) { break; } // Check relative improvement if (iter > 0 && (old_value - value) < min_rel_improve_ * std::abs(old_value)) { result = IMPROVEMENT_TOO_SMALL; if (verbose_) logger_->warning("BoxQP: Relative improvement too small - code %d", result); break; } old_value = value; // Get gradient Gradient grad = g + H * x; // Find clamped dimensions Eigen::Matrix old_clamped = clamped; clamped = Eigen::Matrix::Constant(false); for (int i = 0; i < N; ++i) { clamped(i) = (x(i) <= lower_bound(i) && grad(i) > static_cast(0.0)) || 
(x(i) >= upper_bound(i) && grad(i) < static_cast(0.0)); free(i) = !clamped(i); } // Check for all clamped if (clamped.all()) { result = ALL_DIMENSIONS_CLAMPED; if (verbose_) logger_->warning("BoxQP: All dimensions clamped - code %d", result); break; } // Factorize if clamped has changed if (iter == 0 || (clamped.template cast() - old_clamped.template cast()).template cast().any()) { Eigen::Matrix Hfr(free.count(), N); Eigen::Matrix Hff(free.count(), free.count()); int j = 0, k = 0; for (int i = 0; i < N; ++i) { if (free(i)) { Hfr.row(j++) = H.row(i); } } for (int i = 0; i < N; ++i) { if (free(i)) { Hff.col(k++) = Hfr.col(i); } } llt_.compute(Hff); if (llt_.info() != Eigen::Success) { result = HESSIAN_NOT_POSITIVE_DEFINITE; if (verbose_) logger_->warning("BoxQP: Hessian not positive definite - code %d", result); break; } ++nfactor; } // Check gradient norm gnorm = free.array().select(grad, Gradient::Zero()).norm(); if (gnorm < min_grad_) { result = GRADIENT_NORM_TOO_SMALL; if (verbose_) logger_->warning("BoxQP: Gradient norm too small - code %d", result); break; } // Get search direction Gradient grad_clamped = g + H * (x.cwiseProduct(clamped.template cast())); Vector search = Vector::Zero(); Eigen::Matrix grad_clamped_free(free.count()); Eigen::Matrix x_free(free.count()); int j = 0; for (int i = 0; i < N; ++i) { if (free(i)) { grad_clamped_free(j) = grad_clamped(i); x_free(j) = x(i); ++j; } } auto search_free = -llt_.solve(grad_clamped_free) - x_free; for (int i = N; i > 0; --i) { if (free(i - 1)) { search(i - 1) = search_free(--j); } } // Check for descent direction Scalar sdotg = search.dot(grad); if (sdotg >= static_cast(0.0)) { result = NO_DESCENT_DIRECTION; if (verbose_) logger_->warning("BoxQP: No descent direction - code %d", result); break; } // Armijo linesearch Scalar step = static_cast(1.0); int nstep = 0; Vector xc = (x + step * search).cwiseMin(upper_bound).cwiseMax(lower_bound); Scalar vc = xc.transpose() * g + static_cast(0.5) * (xc.transpose() * H 
* xc).value(); while (((vc - old_value) / (step * sdotg)) < armijo_) { step *= step_dec_; ++nstep; xc = (x + step * search).cwiseMin(upper_bound).cwiseMax(lower_bound); vc = xc.transpose() * g + static_cast(0.5) * (xc.transpose() * H * xc).value(); if (step < min_step_) { result = MAX_LINESEARCH_ITERATIONS_EXCEEDED; if (verbose_) logger_->warning("BoxQP: Max line search iterations exceeded - code %d", result); break; } } if (verbose_) { logger_->info("iter %-3d\tvalue %-9.5g\t|g| %-9.3g\treduction %-9.3g\tlinesearch %g^%-2d\tn_clamped %d\n", iter, vc, gnorm, old_value - vc, step_dec_, nstep, clamped.count()); } x = xc; value = vc; } if (iter >= max_iter_) { result = MAX_ITERATIONS_EXCEEDED; if (verbose_) logger_->info("BoxQP: All iterations completed - code %d", result); } if (verbose_) { logger_->info("Result code %d\titerations %d\tgradient %-12.6g\tfinal value %-12.6g\tfactorizations %d\n", result, iter, gnorm, value, nfactor); } return { x, llt_, free, result }; } private: int max_iter_; Scalar min_grad_; Scalar min_rel_improve_; Scalar step_dec_; Scalar min_step_; Scalar armijo_; Logger* logger_; bool verbose_; Eigen::LLT> llt_; }; } // namespace util #endif // TRAJOPT_BOXQP_HPP ================================================ FILE: include/mppi/ddp/ddp.h ================================================ #ifndef TRAJOPT_DDP_HPP #define TRAJOPT_DDP_HPP #include "boxqp.h" #include "result.h" #include "util.h" #include #include #include template class DDP { using Scalar = typename DynamicsT::Scalar; using State = typename DynamicsT::State; using StateTrajectory = typename DynamicsT::StateTrajectory; using Control = typename DynamicsT::Control; using ControlTrajectory = typename DynamicsT::ControlTrajectory; public: EIGEN_MAKE_ALIGNED_OPERATOR_NEW DDP(Scalar time_step, int time_steps, int iterations, util::Logger* logger, bool verbose, util::BoxQPOptions qp_options = util::BoxQPOptions()) : dt_(time_step) , H_(time_steps) , iter_(iterations) , logger_(logger) , 
verbose_(verbose) , Lk_(H_, Eigen::Matrix::Zero()) , df_(H_, Eigen::Matrix::Zero()) , d2L_(H_, Eigen::Matrix::Zero()) , Vxx_(H_, Eigen::Matrix::Zero()) , VxxT_(H_, Eigen::Matrix::Zero()) , boxqp_(qp_options, logger_) { x_.setZero(DynamicsT::StateSize, H_); u_.setZero(DynamicsT::ControlSize, H_); Is_.setIdentity(); dL_.setZero(DynamicsT::StateSize + DynamicsT::ControlSize, H_); L_.setZero(H_); Vx_.setZero(DynamicsT::StateSize, H_); V_.setZero(H_); qx_.setZero(); qu_.setZero(); qux_.setZero(); qxx_.setZero(); quu_.setZero(); lk_.setZero(DynamicsT::ControlSize, H_); cost_.setZero(iter_, H_); } template OptimizerResult run(const Eigen::Ref& x0, const Eigen::Ref& u, DynamicsT& dynamics, CostFunction& run_cost, TerminalCostFunction& term_cost, const Eigen::Ref& u_min = -Control::Constant(std::numeric_limits::infinity()), const Eigen::Ref& u_max = Control::Constant(std::numeric_limits::infinity())) { x_.col(0) = x0; u_ = u; for (int i = 1; i < H_; ++i) { if (i < H_ - 1) { // u_.col(i-1) += Lk_.at(i)*(x_.col(i-1) - x_.col(i)); u_.col(i - 1) = u_.col(i - 1).cwiseMin(u_max).cwiseMax(u_min); } x_.col(i) = x_.col(i - 1) + dynamics.f(x_.col(i - 1), u_.col(i - 1)) * dt_; } int current_iteration = 0; while (current_iteration < iter_) { // Obtain Jacobians -- this can be parallelized but is pretty bad with OpenMP for (int k = 0; k < H_; ++k) { df_.at(k) = dynamics.df(x_.col(k), u_.col(k)) * dt_; df_.at(k).leftCols(DynamicsT::StateSize) += Is_; d2L_.at(k) = run_cost.d2c(x_.col(k), u_.col(k), k); // std::cout << run_cost.c(x_.col(k), u_.col(k), k) << std::endl; dL_.col(k) = run_cost.dc(x_.col(k), u_.col(k), k); L_(k) = run_cost.c(x_.col(k), u_.col(k), k); } // Evaluate boundary condition Vxx_.at(H_ - 1) = term_cost.d2c(x_.col(H_ - 1)); VxxT_.at(H_ - 1) = Vxx_.at(H_ - 1).transpose(); Vxx_.at(H_ - 1) = static_cast(0.5) * (Vxx_.at(H_ - 1) + VxxT_.at(H_ - 1)); Vx_.col(H_ - 1) = term_cost.dc(x_.col(H_ - 1)); V_(H_ - 1) = term_cost.c(x_.col(H_ - 1)); // Perform backward pass for (int k = 
H_ - 2; k >= 0; --k) { const auto& Phi = df_.at(k).leftCols(DynamicsT::StateSize); const auto& B = df_.at(k).rightCols(DynamicsT::ControlSize); qx_ = dL_.col(k).head(DynamicsT::StateSize) * dt_ + Phi.transpose() * Vx_.col(k + 1); qu_ = dL_.col(k).tail(DynamicsT::ControlSize) * dt_ + B.transpose() * Vx_.col(k + 1); qux_ = d2L_.at(k).bottomLeftCorner(DynamicsT::ControlSize, DynamicsT::StateSize) * dt_ + B.transpose() * Vxx_.at(k + 1) * Phi; qxx_ = d2L_.at(k).topLeftCorner(DynamicsT::StateSize, DynamicsT::StateSize) * dt_ + Phi.transpose() * Vxx_.at(k + 1) * Phi; quu_ = d2L_.at(k).bottomRightCorner(DynamicsT::ControlSize, DynamicsT::ControlSize) * dt_ + B.transpose() * Vxx_.at(k + 1) * B; Eigen::LDLT > ldlt(quu_); if (ldlt.info() == Eigen::Success) { Lk_.at(k) = ldlt.solve(-qux_); lk_.col(k) = ldlt.solve(-qu_); } else { logger_->error("Failed -- DEAD\n"); std::exit(-3); } // Backprop value function Vxx_.at(k) = qxx_ + qux_.transpose() * Lk_.at(k); VxxT_.at(k) = Vxx_.at(k).transpose(); Vxx_.at(k) = static_cast(0.5) * (Vxx_.at(k) + VxxT_.at(k)); Vx_.col(k) = qx_ + qux_.transpose() * lk_.col(k); V_(k) = static_cast(0.5) * (qu_.transpose() * lk_.col(k)).value(); } bool accept = false; Scalar alpha = static_cast(1.0); StateTrajectory xnew = StateTrajectory::Zero(DynamicsT::StateSize, H_); ControlTrajectory unew = ControlTrajectory::Zero(DynamicsT::ControlSize, H_); while (!accept) { xnew.col(0) = x_.col(0); for (int k = 0; k < H_ - 1; ++k) { State dx = xnew.col(k) - x_.col(k); unew.col(k) = u_.col(k) + alpha * lk_.col(k) + Lk_.at(k) * dx; unew.col(k) = unew.col(k).cwiseMin(u_max).cwiseMax(u_min); xnew.col(k + 1) = xnew.col(k) + dynamics.f(xnew.col(k), unew.col(k)) * dt_; cost_(current_iteration, k) = run_cost.c(xnew.col(k), unew.col(k), k) * dt_; } cost_(current_iteration, H_ - 1) = V_(H_ - 1); if (current_iteration == 0 || alpha < static_cast(0.0001) || cost_.row(current_iteration).sum() <= cost_.row(current_iteration - 1).sum()) { u_ = unew; x_ = xnew; accept = true; } 
else { alpha *= static_cast(0.5); cost_.row(current_iteration).setZero(); unew.setZero(); xnew.setZero(); } } ++current_iteration; } return OptimizerResult(current_iteration, H_, cost_.row(current_iteration - 1).sum(), cost_, x_, u_, Lk_, lk_); } private: Scalar dt_; int H_; int iter_; util::Logger* logger_; bool verbose_; util::EigenAlignedVector Lk_; util::EigenAlignedVector df_; util::EigenAlignedVector d2L_; util::EigenAlignedVector Vxx_; util::EigenAlignedVector VxxT_; util::BoxQP boxqp_; StateTrajectory x_; ControlTrajectory u_; Eigen::Matrix Is_; Eigen::Matrix dL_; Eigen::Matrix L_; Eigen::Matrix Vx_; Eigen::Matrix V_; State qx_; Control qu_; Eigen::Matrix qux_; Eigen::Matrix qxx_; Eigen::Matrix quu_; Eigen::Matrix lk_; Eigen::Matrix cost_; }; #endif // TRAJOPT_DDP_HPP ================================================ FILE: include/mppi/ddp/ddp_costs.h ================================================ #ifndef TRAJOPT_COSTS_HPP #define TRAJOPT_COSTS_HPP #include template struct CostFunction { EIGEN_MAKE_ALIGNED_OPERATOR_NEW using Scalar = typename Dynamics::Scalar; using State = typename Dynamics::State; using Control = typename Dynamics::Control; using Gradient = Eigen::Matrix; using Hessian = Eigen::Matrix; CostFunction() = default; CostFunction(const CostFunction& other) = default; CostFunction(CostFunction&& other) = default; CostFunction& operator=(const CostFunction& other) = default; CostFunction& operator=(CostFunction&& other) = default; virtual ~CostFunction() = default; CostFunction(const Eigen::Ref& target) : xf(target) { } const Eigen::Ref& target() const { return xf; } Eigen::Ref target() { return xf; } const Scalar& target(int idx) const { return xf(idx); } Scalar& target(int idx) { return xf(idx); } virtual Scalar c(const Eigen::Ref& x, const Eigen::Ref& u, int t) = 0; virtual Gradient dc(const Eigen::Ref& x, const Eigen::Ref& u, int t) = 0; virtual Hessian d2c(const Eigen::Ref& x, const Eigen::Ref& u, int t) = 0; State xf; }; template struct 
TerminalCostFunction { EIGEN_MAKE_ALIGNED_OPERATOR_NEW using Scalar = typename Dynamics::Scalar; using State = typename Dynamics::State; using Gradient = Eigen::Matrix; using Hessian = Eigen::Matrix; TerminalCostFunction() = default; TerminalCostFunction(const TerminalCostFunction& other) = default; TerminalCostFunction(TerminalCostFunction&& other) = default; TerminalCostFunction& operator=(const TerminalCostFunction& other) = default; TerminalCostFunction& operator=(TerminalCostFunction&& other) = default; virtual ~TerminalCostFunction() = default; TerminalCostFunction(const Eigen::Ref& target) : xf(target) { } const Eigen::Ref& target() const { return xf; } Eigen::Ref target() { return xf; } const Scalar& target(int idx) const { return xf(idx); } Scalar& target(int idx) { return xf(idx); } virtual Scalar c(const Eigen::Ref& x) = 0; virtual Gradient dc(const Eigen::Ref& x) = 0; virtual Hessian d2c(const Eigen::Ref& x) = 0; State xf; }; #endif // TRAJOPT_COSTS_HPP ================================================ FILE: include/mppi/ddp/ddp_dynamics.h ================================================ #ifndef TRAJOPT_DYNAMICS_HPP #define TRAJOPT_DYNAMICS_HPP #include "util.h" #include #include #include namespace internal { template struct Differentiable { /*****************************************************************************/ /*** Replicate Eigen's generic functor implementation to avoid inheritance ***/ /*** We only use the fixed-size functionality ********************************/ /*****************************************************************************/ enum { InputsAtCompileTime = S + C, ValuesAtCompileTime = S }; using Scalar = T; using InputType = Eigen::Matrix; using ValueType = Eigen::Matrix; using JacobianType = Eigen::Matrix; int inputs() const { return InputsAtCompileTime; } int values() const { return ValuesAtCompileTime; } int operator()(const Eigen::Ref& xu, Eigen::Ref dx) const { dx = f_(xu.template head(), xu.template tail()); return 0; } 
/*****************************************************************************/ using DiffFunc = std::function(const Eigen::Ref>&, const Eigen::Ref>&)>; Differentiable(const DiffFunc& f) : f_(f) { } private: DiffFunc f_; }; } // namespace internal namespace DDP_structures { template struct Dynamics { EIGEN_MAKE_ALIGNED_OPERATOR_NEW enum { StateSize = S, ControlSize = C }; using Scalar = T; using State = Eigen::Matrix; using Control = Eigen::Matrix; using StateTrajectory = Eigen::Matrix; using ControlTrajectory = Eigen::Matrix; using Jacobian = typename internal::Differentiable::JacobianType; using StateControl = typename internal::Differentiable::InputType; using FeedbackGain = Eigen::Matrix; using FeedforwardGain = Eigen::Matrix; using FeedbackGainTrajectory = util::EigenAlignedVector; using FeedforwardGainTrajectory = Eigen::Matrix; Dynamics() : diff_([this](const Eigen::Ref& x, const Eigen::Ref& u) -> State { return this->f(x, u); }) , num_diff_(diff_) { } Dynamics(const Dynamics& other) = default; Dynamics(Dynamics&& other) = default; Dynamics& operator=(const Dynamics& other) = default; Dynamics& operator=(Dynamics&& other) = default; virtual ~Dynamics() = default; // Implementation required virtual State f(const Eigen::Ref& x, const Eigen::Ref& u) = 0; // Implementation optional virtual Jacobian df(const Eigen::Ref& x, const Eigen::Ref& u) { num_diff_.df((typename internal::Differentiable::InputType() << x, u).finished(), j_); return j_; } private: Jacobian j_; typename internal::Differentiable diff_; Eigen::NumericalDiff, Eigen::Central> num_diff_; }; } // namespace DDP_structures #endif // TRAJOPT_DYNAMICS_HPP ================================================ FILE: include/mppi/ddp/ddp_model_wrapper.h ================================================ #ifndef DDP_MODEL_WRAPPER_H #define DDP_MODEL_WRAPPER_H #include "ddp_dynamics.h" #include template struct HasAnalyticGrad { template static char Test(decltype(&U::computeGrad)); template static long Test(...); 
static const bool Has = sizeof(Test(0)) == sizeof(char); }; template bool getGrad(T* model, typename DDP_structures::Dynamics::Jacobian& jac, typename DDP_structures::Dynamics::State& x, typename DDP_structures::Dynamics::Control& u, std::true_type) { // T::dfdx A; // T::dfdu B; Eigen::Matrix A = Eigen::Matrix::Zero(); Eigen::Matrix B = Eigen::Matrix::Zero(); bool exists = model->computeGrad(x, u, A, B); jac.block(0, 0, T::STATE_DIM, T::STATE_DIM) = A; jac.block(0, T::STATE_DIM, T::STATE_DIM, T::CONTROL_DIM) = B; // for (int i = 0; i < T::STATE_DIM; i++){ // for (int j = 0; j < T::STATE_DIM; j++){ // jac(i,j) = A(i,j); // } // } return exists; } template bool getGrad(T* model, typename DDP_structures::Dynamics::Jacobian& jac, typename DDP_structures::Dynamics::State& x, typename DDP_structures::Dynamics::Control& u, std::false_type) { return false; } template struct ModelWrapperDDP : public DDP_structures::Dynamics { EIGEN_MAKE_ALIGNED_OPERATOR_NEW using Scalar = float; using State = typename DDP_structures::Dynamics::State; using Control = typename DDP_structures::Dynamics::Control; using Jacobian = typename DDP_structures::Dynamics::Jacobian; using StateTrajectory = typename DDP_structures::Dynamics::StateTrajectory; using ControlTrajectory = typename DDP_structures::Dynamics::ControlTrajectory; State state; Control control; DYNAMICS_T* model_; ModelWrapperDDP(DYNAMICS_T* model) { model_ = model; } State f(const Eigen::Ref& x, const Eigen::Ref& u) { // This section is specific to the neural network implementation for the autorally state = x; control = u; // Compute the state derivative xDot State dx; State next_state; typename DYNAMICS_T::output_array output; // TODO model_->step(state, next_state, dx, control, output, 0, 0.01); return dx; } Jacobian df(const Eigen::Ref& x, const Eigen::Ref& u) { Jacobian j_ = Jacobian::Zero(DYNAMICS_T::STATE_DIM, DYNAMICS_T::STATE_DIM + DYNAMICS_T::CONTROL_DIM); state = x; control = u; bool analyticGradComputed = getGrad(model_, 
j_, state, control, std::integral_constant::Has>()); if (!analyticGradComputed) { j_ = DDP_structures::Dynamics::df(x, u); } return j_; } }; #endif // DDP_MODEL_WRAPPER_H ================================================ FILE: include/mppi/ddp/ddp_tracking_costs.h ================================================ #ifndef DDP_TRACKING_COSTS_ #define DDP_TRACKING_COSTS_ #include "ddp_dynamics.h" #include "ddp_costs.h" template struct TrackingCostDDP : public CostFunction { using Dynamics = DynamicsT; using Scalar = typename Dynamics::Scalar; using State = typename Dynamics::State; using Control = typename Dynamics::Control; using Gradient = typename CostFunction::Gradient; using Hessian = typename CostFunction::Hessian; static const int N = Dynamics::StateSize; static const int M = Dynamics::ControlSize; using StateCostWeight = Eigen::Matrix; using ControlCostWeight = Eigen::Matrix; public: Eigen::MatrixXf traj_target_x_; Eigen::MatrixXf traj_target_u_; TrackingCostDDP(const Eigen::Ref& Q, const Eigen::Ref& R, int num_timesteps) : Q_(Q), R_(R) { QR_.setZero(); QR_.template topLeftCorner() = Q; QR_.template bottomRightCorner() = R; traj_target_x_ = Eigen::MatrixXf::Zero(Dynamics::StateSize, num_timesteps); traj_target_u_ = Eigen::MatrixXf::Zero(Dynamics::ControlSize, num_timesteps); } Scalar c(const Eigen::Ref& x, const Eigen::Ref& u, int t) { // TODO use method inside of dynamics to compute this float state_cost = ((x - traj_target_x_.col(t)).transpose() * Q_ * (x - traj_target_x_.col(t))).value(); float control_cost = ((u - traj_target_u_.col(t)).transpose() * R_ * (u - traj_target_u_.col(t))).value(); return state_cost + control_cost; } Gradient dc(const Eigen::Ref& x, const Eigen::Ref& u, int t) { return (Gradient() << Q_ * (x - traj_target_x_.col(t)), R_ * (u - traj_target_u_.col(t))).finished(); } Hessian d2c(const Eigen::Ref& x, const Eigen::Ref& u, int t) { return QR_; } void setTargets(const float* traj_target, const float* control_target, int timesteps) { for 
(int t = 0; t < timesteps; t++) { for (int i = 0; i < Dynamics::StateSize; i++) { traj_target_x_(i, t) = traj_target[Dynamics::StateSize * t + i]; } } for (int t = 0; t < timesteps; t++) { for (int i = 0; i < Dynamics::ControlSize; i++) { traj_target_u_(i, t) = control_target[Dynamics::ControlSize * t + i]; } } } // Still specific for autorally // void setStop(Eigen::MatrixXf state, int timesteps) // { // for (int t = 0; t < timesteps; t++){ // traj_target_x_.col(t) << state; // } // traj_target_u_ = Eigen::MatrixXf::Zero(Dynamics::ControlSize, timesteps); // Q_.diagonal() << 10.0, 10.0, 25.0, 10.0, 10.0, 10.0, 10.0; // } private: StateCostWeight Q_; ControlCostWeight R_; Hessian QR_; }; template struct TrackingTerminalCost : public TerminalCostFunction { using Dynamics = DynamicsT; using Scalar = typename Dynamics::Scalar; using State = typename Dynamics::State; using Gradient = typename TerminalCostFunction::Gradient; using Hessian = typename TerminalCostFunction::Hessian; static const int N = Dynamics::StateSize; using StateCostWeight = Eigen::Matrix; public: TrackingTerminalCost(const Eigen::Ref& Qf, const Eigen::Ref& xf = State::Zero()) : Qf_(Qf) { this->target() = xf; // Initialize the target to zero } Scalar c(const Eigen::Ref& x) { return ((x - this->target()).transpose() * Qf_ * (x - this->target())).value(); } Gradient dc(const Eigen::Ref& x) { return Qf_ * (x - this->target()); } Hessian d2c(const Eigen::Ref& x) { return Qf_; } private: StateCostWeight Qf_; }; #endif /*DDP_TRACKING_COSTS_H_*/ ================================================ FILE: include/mppi/ddp/result.h ================================================ #ifndef TRAJOPT_RESULT_HPP #define TRAJOPT_RESULT_HPP #include /** * @tparam Dynamics * @brief Represent the return type of an optimization of subject to Dynamics. 
*/ template struct OptimizerResult { using Scalar = typename Dynamics::Scalar; OptimizerResult() = default; OptimizerResult(const OptimizerResult& other) = default; OptimizerResult(OptimizerResult&& other) = default; ~OptimizerResult() = default; OptimizerResult& operator=(const OptimizerResult& other) = default; OptimizerResult& operator=(OptimizerResult&& other) = default; /** * @brief Constructor to use if the optimizer does not produce feedback or feedforward gains. * @param iter Number of iterations performed (I) * @param ts Number of time steps (H) * @param tc Total cost of the final trajectory returned by the optimizer * @param c Cost for each time step in each iteration (I x H) * @param x Optimized state trajectory * @param u Optimized control trajectory */ OptimizerResult(int iter, int ts, Scalar tc, const Eigen::Ref>& c, const Eigen::Ref& x, const Eigen::Ref& u) : iterations(iter), timesteps(ts), total_cost(tc), cost(c), state_trajectory(x), control_trajectory(u) { } /** * @brief Constructor to use if the optimizer produces feedback or feedforward gains. 
* @param iter Number of iterations performed (I) * @param ts Number of time steps (H) * @param tc Total cost of the final trajectory returned by the optimizer * @param c Cost for each time step in each iteration (I x H) * @param x Optimized state trajectory * @param u Optimized control trajectory * @param fb Feedback gains for every time step in the optimized trajectory * @param ff Feedforward gains for every time step in the optimized trajectory */ OptimizerResult(int iter, int ts, Scalar tc, const Eigen::Ref>& c, const Eigen::Ref& x, const Eigen::Ref& u, const typename Dynamics::FeedbackGainTrajectory& fb, const Eigen::Ref& ff) : iterations(iter) , timesteps(ts) , total_cost(tc) , cost(c) , state_trajectory(x) , control_trajectory(u) , feedback_gain(fb) , feedforward_gain(ff) { } int iterations; ///< # of optimizing iterations (I) int timesteps; ///< # of timesteps (H) Scalar total_cost; ///< Total cost over the trajectory returned by the optimizer after I iterations Eigen::Matrix cost; ///< Cost for each time step cost per iteration (I x H) typename Dynamics::StateTrajectory state_trajectory; ///< State trajectory created by optimizer typename Dynamics::ControlTrajectory control_trajectory; ///< Control trajectory created by optimizer typename Dynamics::FeedbackGainTrajectory feedback_gain; ///< Feedback gain at each time step typename Dynamics::FeedforwardGainTrajectory feedforward_gain; ///< Feedforward gain at each time step }; #endif // TRAJOPT_RESULT_HPP ================================================ FILE: include/mppi/ddp/util.h ================================================ #ifndef TRAJOPT_UTIL_HPP #define TRAJOPT_UTIL_HPP #include #include #include #include #include /** * @namespace util Miscellaneous utilities, convenience items, and type definitions. 
*/ namespace util { /** * @tparam T Scalar type of the Eigen::Matrix being stored * @tparam R Number of Matrix rows * @tparam C Number of Matrix columns * @brief Convenience typedef for Eigen-aligned std::vector storing fixed-size Eigen matrices. */ template using EigenAlignedVector = std::vector, Eigen::aligned_allocator>>; /** * @tparam MatrixType Typedef'd Eigen type * @brief Convenience typedef for Eigen-aligned std::vector storing fixed-size Eigen matrices. */ template using NamedEigenAlignedVector = std::vector>; template /** * @brief Convenience function to combine State and Control into a single vector. Note that a copy may be made if * RVO is not applied. * @param x State vector * @param u Control vector * @return Combined State and Control vector of size StateSize + ControlSize */ inline typename Dynamics::StateControl combine_xu(const Eigen::Ref& x, const Eigen::Ref& u) { using StateControl = typename Dynamics::StateControl; return (StateControl() << x, u).finished(); } /** * @brief Return the number of time steps that fit in the time horizon. * @param tf Time horizon * @param dt Step size * @return Ceiling of the number of dt's in tf */ inline int time_steps(double tf, double dt) { return static_cast(std::ceil(tf / dt)); } /** * @brief Abstract class for logging informational, error, and warning messages. */ class Logger { public: Logger() = default; Logger(const Logger& other) = default; Logger(Logger&& other) = default; virtual ~Logger() = default; Logger& operator=(const Logger& other) = default; Logger& operator=(Logger&& other) = default; /** * @brief Pure virtual function to log informational messages. * @param fmt Format string (if additional arguments are passed) or message to display */ virtual void info(const char* fmt, ...) = 0; /** * @brief Pure virtual function to log warnings. * @param fmt Format string (if additional arguments are passed) or message to display */ virtual void warning(const char* fmt, ...) 
= 0; /** * @brief Pure virtual function to log errors. * @param fmt Format string (if additional arguments are passed) or message to display */ virtual void error(const char* fmt, ...) = 0; }; /** * @brief A logger that outputs to stdout for info messages and stderr for warnings and errors. */ class DefaultLogger : public Logger { public: /** * @brief Log info messages to stdout. * @param fmt Format string (if additional arguments are passed) or message to display */ virtual void info(const char* fmt, ...) { std::va_list argptr; va_start(argptr, fmt); std::vprintf(fmt, argptr); va_end(argptr); } /** * @brief Log warnings to stderr. * @param fmt Format string (if additional arguments are passed) or message to display */ virtual void warning(const char* fmt, ...) { std::va_list argptr; va_start(argptr, fmt); std::vfprintf(stderr, fmt, argptr); va_end(argptr); } /** * @brief Log errors to stderr. * @param fmt Format string (if additional arguments are passed) or message to display */ virtual void error(const char* fmt, ...) 
{ std::va_list argptr; va_start(argptr, fmt); std::vfprintf(stderr, fmt, argptr); va_end(argptr); } }; } // namespace util #endif // TRAJOPT_UTIL_HPP ================================================ FILE: include/mppi/dynamics/autorally/ar_nn_model.cu ================================================ #include "ar_nn_model.cuh" template NeuralNetModel::NeuralNetModel(std::array control_rngs, cudaStream_t stream) : PARENT_CLASS(control_rngs, stream) { std::vector args{ 6, 32, 32, 4 }; helper_ = new FNNHelper<>(args, stream); } template NeuralNetModel::NeuralNetModel(cudaStream_t stream) : PARENT_CLASS(stream) { std::vector args{ 6, 32, 32, 4 }; helper_ = new FNNHelper<>(args, stream); } template NeuralNetModel::~NeuralNetModel() { freeCudaMem(); free(helper_); } template void NeuralNetModel::freeCudaMem() { if (this->GPUMemStatus_) { helper_->freeCudaMem(); } PARENT_CLASS::freeCudaMem(); } template void NeuralNetModel::updateModel(const std::vector& description, const std::vector& data) { helper_->updateModel(description, data); } template void NeuralNetModel::paramsToDevice() { if (this->GPUMemStatus_) { helper_->paramsToDevice(); } PARENT_CLASS::paramsToDevice(); } template void NeuralNetModel::loadParams(const std::string& model_path) { helper_->loadParams(model_path); } template bool NeuralNetModel::computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B) { Eigen::Matrix jac; jac.setZero(); // Start with the kinematic and physics model derivatives jac.row(0) << 0, 0, -sin(state(2)) * state(4) - cos(state(2)) * state(5), 0, cos(state(2)), -sin(state(2)), 0, 0, 0; jac.row(1) << 0, 0, cos(state(2)) * state(4) - sin(state(2)) * state(5), 0, sin(state(2)), cos(state(2)), 0, 0, 0; jac.row(2) << 0, 0, 0, 0, 0, 0, -1, 0, 0; state_array state_der; // First do the forward pass computeDynamics(state, control, state_der); Eigen::MatrixXf ip_delta = helper_->getZeroJacobianMatrix(); helper_->computeGrad(ip_delta); 
jac.bottomRightCorner(DYNAMICS_DIM, DYNAMICS_DIM + C_DIM) += ip_delta; A = jac.leftCols(S_DIM); B = jac.rightCols(C_DIM); return true; } template void NeuralNetModel::computeKinematics(const Eigen::Ref& state, Eigen::Ref state_der) { state_der(0) = cosf(state(2)) * state(4) - sinf(state(2)) * state(5); state_der(1) = sinf(state(2)) * state(4) + cosf(state(2)) * state(5); state_der(2) = -state(6); // Pose estimate actually gives the negative yaw derivative } template void NeuralNetModel::computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der) { Eigen::VectorXf input = helper_->getZeroInputVector(); Eigen::VectorXf output = helper_->getZeroOutputVector(); int i; for (i = 0; i < DYNAMICS_DIM; i++) { input(i) = state(i + (S_DIM - DYNAMICS_DIM)); } for (i = 0; i < C_DIM; i++) { input(DYNAMICS_DIM + i) = control(i); } helper_->forward(input, output); for (i = 0; i < DYNAMICS_DIM; i++) { state_der(i + (S_DIM - DYNAMICS_DIM)) = output[i]; } } template __device__ void NeuralNetModel::computeKinematics(float* state, float* state_der) { state_der[0] = cosf(state[2]) * state[4] - sinf(state[2]) * state[5]; state_der[1] = sinf(state[2]) * state[4] + cosf(state[2]) * state[5]; state_der[2] = -state[6]; // Pose estimate actually gives the negative yaw derivative } template __device__ void NeuralNetModel::computeDynamics(float* state, float* control, float* state_der, float* theta_s) { float* curr_act; int tdy = threadIdx.y; int i; // curr_act = &theta_s[SHARED_MEM_REQUEST_GRD_BYTES / sizeof(float) + 1 + (2 * LARGEST_LAYER) * (blockDim.x * tdz + // tdx)]; curr_act = helper_->getInputLocation(theta_s); // iterate through the part of the state that should be an input to the NN for (i = tdy; i < DYNAMICS_DIM; i += blockDim.y) { curr_act[i] = state[i + (S_DIM - DYNAMICS_DIM)]; } // iterate through the control to put into first layer for (i = tdy; i < C_DIM; i += blockDim.y) { curr_act[DYNAMICS_DIM + i] = control[i]; } __syncthreads(); curr_act = 
helper_->forward(nullptr, theta_s); // copies results back into state derivative for (i = tdy; i < DYNAMICS_DIM; i += blockDim.y) { state_der[i + (S_DIM - DYNAMICS_DIM)] = curr_act[i]; } __syncthreads(); } template void NeuralNetModel::GPUSetup() { PARENT_CLASS* derived = static_cast(this); helper_->GPUSetup(); derived->GPUSetup(); HANDLE_ERROR(cudaMemcpyAsync(&(this->model_d_->helper_), &(this->helper_->network_d_), sizeof(FNNHelper<>*), cudaMemcpyHostToDevice, this->stream_)); } template __device__ void NeuralNetModel::initializeDynamics(float* state, float* control, float* output, float* theta_s, float t_0, float dt) { if (output) { for (int i = 0; i < this->OUTPUT_DIM && i < this->STATE_DIM; i++) { output[i] = state[i]; } } helper_->initialize(theta_s); } template NeuralNetModel::state_array NeuralNetModel::stateFromMap(const std::map& map) { state_array s; s(S_INDEX(POS_X)) = map.at("POS_X"); s(S_INDEX(POS_Y)) = map.at("POS_Y"); s(S_INDEX(YAW)) = map.at("YAW"); s(S_INDEX(ROLL)) = map.at("ROLL"); s(S_INDEX(BODY_VEL_X)) = map.at("VEL_X"); s(S_INDEX(BODY_VEL_Y)) = map.at("VEL_Y"); s(S_INDEX(YAW_RATE)) = map.at("OMEGA_Z"); return s; } template __host__ __device__ int NeuralNetModel::getGrdSharedSizeBytes() const { return helper_->getGrdSharedSizeBytes(); } template __host__ __device__ int NeuralNetModel::getBlkSharedSizeBytes() const { return helper_->getBlkSharedSizeBytes(); } ================================================ FILE: include/mppi/dynamics/autorally/ar_nn_model.cuh ================================================ #ifndef AR_NN_DYNAMICS_CUH_ #define AR_NN_DYNAMICS_CUH_ #include #include #include #include #include #include #include #include /** * For AutoRally * State x, y, yaw, roll, u_x, u_y, yaw_rate * NeuralNetModel<7,2,3,6,32,32,4> model(dt, u_constraint); * DYNAMICS_DIM = 4 */ struct NNDynamicsParams : public DynamicsParams { enum class StateIndex : int { POS_X = 0, POS_Y, YAW, ROLL, BODY_VEL_X, BODY_VEL_Y, YAW_RATE, NUM_STATES }; enum class 
ControlIndex : int { STEERING = 0, THROTTLE, NUM_CONTROLS }; enum class OutputIndex : int { POS_X = 0, POS_Y, YAW, ROLL, BODY_VEL_X, BODY_VEL_Y, YAW_RATE, FILLER_1, NUM_OUTPUTS }; }; /** * @file neural_net_model.cuh * @author Grady Williams * @date May 24, 2017 * @copyright 2017 Georgia Institute of Technology * @brief Neural Network Model class definition * @tparam S_DIM state dimension * @tparam C_DIM control dimension * @tparam K_DIM dimensions that are ignored from the state, 1 ignores the first, 2 the first and second, etc. * For example in AutoRally we want to input the last 4 values of our state (dim 7), so our K is 3 * If you use the entire state in the NN K should equal 0 * @tparam layer_args size of the NN layers */ using namespace MPPI_internal; template class NeuralNetModel : public Dynamics, NNDynamicsParams> { public: // TODO remove duplication of calculation of values, pull from the structure EIGEN_MAKE_ALIGNED_OPERATOR_NEW using PARENT_CLASS = Dynamics, NNDynamicsParams>; // Define Eigen fixed size matrices using state_array = typename PARENT_CLASS::state_array; using control_array = typename PARENT_CLASS::control_array; using output_array = typename PARENT_CLASS::output_array; using dfdx = typename PARENT_CLASS::dfdx; using dfdu = typename PARENT_CLASS::dfdu; static const int DYNAMICS_DIM = S_DIM - K_DIM; ///< number of inputs from state NeuralNetModel(cudaStream_t stream = 0); NeuralNetModel(std::array control_rngs, cudaStream_t stream = 0); ~NeuralNetModel(); std::string getDynamicsModelName() const override { return "FCN Autorally Model"; } __device__ int* getNetStructurePtr() { return helper_->getNetStructurePtr(); } __device__ int* getStrideIdcsPtr() { return helper_->getStrideIdcsPtr(); } __device__ float* getThetaPtr() { return helper_->getThetaPtr(); } FNNHelper<>* getHelperPtr() { return helper_; } void paramsToDevice(); __device__ void initializeDynamics(float* state, float* control, float* output, float* theta_s, float t_0, float dt); 
void initializeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref output, float t_0, float dt) { PARENT_CLASS::initializeDynamics(state, control, output, t_0, dt); } void GPUSetup(); void freeCudaMem(); void printState(); void printParams(); void loadParams(const std::string& model_path); void updateModel(const std::vector& description, const std::vector& data); bool computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B); void computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der); void computeKinematics(const Eigen::Ref& state, Eigen::Ref s_der); __device__ void computeDynamics(float* state, float* control, float* state_der, float* theta_s = nullptr); __device__ void computeKinematics(float* state, float* state_der); state_array stateFromMap(const std::map& map) override; __host__ __device__ int getGrdSharedSizeBytes() const; __host__ __device__ int getBlkSharedSizeBytes() const; private: FNNHelper<>* helper_ = nullptr; }; template const int NeuralNetModel::DYNAMICS_DIM; #if __CUDACC__ #include "ar_nn_model.cu" #endif #endif // AR_NN_DYNAMICS_CUH_ ================================================ FILE: include/mppi/dynamics/bicycle_slip/bicycle_slip_parametric.cu ================================================ // // Created by jason on 12/12/22. 
// #include "bicycle_slip_parametric.cuh" template BicycleSlipParametricImpl::BicycleSlipParametricImpl(cudaStream_t stream) : PARENT_CLASS(stream) { normals_tex_helper_ = new TwoDTextureHelper(1, stream); } template void BicycleSlipParametricImpl::paramsToDevice() { normals_tex_helper_->copyToDevice(); PARENT_CLASS::paramsToDevice(); } template BicycleSlipParametricImpl::state_array BicycleSlipParametricImpl::stateFromMap(const std::map& map) { state_array s = state_array::Zero(); if (map.find("VEL_X") == map.end() || map.find("VEL_Y") == map.end() || map.find("POS_X") == map.end() || map.find("POS_Y") == map.end() || map.find("ROLL") == map.end() || map.find("PITCH") == map.end()) { std::cout << "WARNING: could not find all keys for ackerman slip dynamics" << std::endl; for (const auto& it : map) { std::cout << "got key " << it.first << std::endl; } return s; } s(S_INDEX(POS_X)) = map.at("POS_X"); s(S_INDEX(POS_Y)) = map.at("POS_Y"); s(S_INDEX(VEL_X)) = map.at("VEL_X"); s(S_INDEX(VEL_Y)) = map.at("VEL_Y"); s(S_INDEX(OMEGA_Z)) = map.at("OMEGA_Z"); s(S_INDEX(YAW)) = map.at("YAW"); s(S_INDEX(ROLL)) = map.at("ROLL"); s(S_INDEX(PITCH)) = map.at("PITCH"); if (map.find("STEER_ANGLE") != map.end()) { s(S_INDEX(STEER_ANGLE)) = map.at("STEER_ANGLE"); s(S_INDEX(STEER_ANGLE_RATE)) = map.at("STEER_ANGLE_RATE"); } else { std::cout << "WARNING: unable to find BRAKE_STATE or STEER_ANGLE_RATE, using 0" << std::endl; s(S_INDEX(STEER_ANGLE)) = 0; s(S_INDEX(STEER_ANGLE_RATE)) = 0; } if (map.find("BRAKE_STATE") != map.end()) { s(S_INDEX(BRAKE_STATE)) = map.at("BRAKE_STATE"); } else if (map.find("BRAKE_CMD") != map.end()) { std::cout << "WARNING: unable to find BRAKE_STATE" << std::endl; s(S_INDEX(BRAKE_STATE)) = map.at("BRAKE_CMD"); } else { std::cout << "WARNING: unable to find BRAKE_CMD or BRAKE_STATE" << std::endl; s(S_INDEX(BRAKE_STATE)) = 0; } return s; } template void BicycleSlipParametricImpl::GPUSetup() { PARENT_CLASS::GPUSetup(); normals_tex_helper_->GPUSetup(); // makes 
sure that the device ptr sees the correct texture object HANDLE_ERROR(cudaMemcpyAsync(&(this->model_d_->normals_tex_helper_), &(normals_tex_helper_->ptr_d_), sizeof(TwoDTextureHelper*), cudaMemcpyHostToDevice, this->stream_)); } template void BicycleSlipParametricImpl::freeCudaMem() { normals_tex_helper_->freeCudaMem(); PARENT_CLASS::freeCudaMem(); } template void BicycleSlipParametricImpl::computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der) { state_der = state_array::Zero(); bool enable_brake = control[C_INDEX(THROTTLE_BRAKE)] < 0; // runs parametric delay model // this->computeParametricDelayDeriv(state, control, state_der); // // runs the parametric part of the steering model // // TODO this needs to be the NN // this->computeParametricSteerDeriv(state, control, state_der); // // TODO need to predict the brake state change // float normal_x_avg, normal_y_avg, normal_z_avg; // RACER::computeBodyFrameNormals>( // this->normals_tex_helper_, state(S_INDEX(YAW)), state(S_INDEX(POS_X)), state(S_INDEX(POS_Y)), // state(S_INDEX(ROLL)), state(S_INDEX(PITCH)), normal_x_avg, normal_y_avg, normal_z_avg); // const float brake = tanh_vel_scale(state(S_INDEX(BRAKE_STATE)), state(S_INDEX(VEL_X)), this->params.c_brake); // const float drag_x = tanh_scale(state(S_INDEX(VEL_X)), this->params.c_rolling); // const float scaled_rpm = state(S_INDEX(ENGINE_RPM)) / 1.0e3 - this->params_.min_rpm; // const float throttle = tanhf(powf(scaled_rpm, 2) * this->params_.c_rpm[2] + scaled_rpm * this->params_.c_rpm[1] + // this->params_.c_rpm[0]) * // this->params_.rpm_scale; // const float x_force = throttle - brake - drag_x; // const float wheel_angle = tanf(state(S_INDEX(STEER_ANGLE)) / this->params_.steer_angle_scale); // const float parametric_omega = (state(S_INDEX(VEL_X)) / this->params_.wheel_base) * wheel_angle; // state_der(S_INDEX(OMEGA_Z)) = (parametric_omega - state(S_INDEX(OMEGA_Z))) * this->params_.c_omega - // state(S_INDEX(OMEGA_Z)) * 
this->params_.c_v_omega; // const float drag_y = tanh_scale(state(S_INDEX(VEL_Y)), this->params_.c_sliding); // const float y_force = // tanhf(state(S_INDEX(VEL_X)) * state(S_INDEX(OMEGA_Z)) * this->params_.y_f_c[0]) * this->params_.y_f_c[1] - // drag_y; // float wheel_angle, sin_wheel_angle, cos_wheel_angle; // sincosf(wheel_angle, &sin_wheel_angle, &cos_wheel_angle); // state_der(S_INDEX(VEL_X)) = (x_force + x_force * cos_wheel_angle - y_force * sin_wheel_angle) / this->params_.mass // - // state(S_INDEX(VEL_X)) * this->params_.c_vx - normal_x_avg * this->params_.gravity_x + // state(S_INDEX(VEL_Y)) * state(S_INDEX(OMEGA_Z)); // state_der(S_INDEX(VEL_Y)) = (y_force + y_force * cos_wheel_angle + x_force * sin_wheel_angle) / this->params_.mass // - // state(S_INDEX(VEL_Y)) * this->params_.c_vy - normal_y_avg * this->params_.gravity_y - // state(S_INDEX(VEL_X)) * state(S_INDEX(OMEGA_Z)); // // kinematics component // state_der(S_INDEX(YAW)) = state(S_INDEX(OMEGA_Z)); // float yaw, sin_yaw, cos_yaw; // yaw = state[S_INDEX(YAW)]; // sincosf(yaw, &sin_yaw, &cos_yaw); // state_der(S_INDEX(POS_X)) = state(S_INDEX(VEL_X)) * cos_yaw - state(S_INDEX(VEL_Y)) * sin_yaw; // state_der(S_INDEX(POS_Y)) = state(S_INDEX(VEL_X)) * sin_yaw + state(S_INDEX(VEL_Y)) * cos_yaw; } template void BicycleSlipParametricImpl::updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt) { next_state = state + state_der * dt; next_state(S_INDEX(YAW)) = angle_utils::normalizeAngle(next_state(S_INDEX(YAW))); next_state(S_INDEX(STEER_ANGLE)) = max(min(next_state(S_INDEX(STEER_ANGLE)), this->params_.max_steer_angle), -this->params_.max_steer_angle); next_state(S_INDEX(STEER_ANGLE_RATE)) = state_der(S_INDEX(STEER_ANGLE)); next_state(S_INDEX(BRAKE_STATE)) = min(max(next_state(S_INDEX(BRAKE_STATE)), 0.0f), -this->control_rngs_[C_INDEX(THROTTLE_BRAKE)].x); next_state(S_INDEX(ROLL)) = state(S_INDEX(ROLL)); next_state(S_INDEX(PITCH)) = state(S_INDEX(PITCH)); } template 
void BicycleSlipParametricImpl::step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const Eigen::Ref& control, Eigen::Ref output, const float t, const float dt) { computeDynamics(state, control, state_der); updateState(state, next_state, state_der, dt); #ifdef BICYCLE_UNCERTAINTY typename PARENT_CLASS::SharedBlock sb; #endif output = output_array::Zero(); #ifdef BICYCLE_UNCERTAINTY PARENT_CLASS::computeUncertaintyPropagation(state.data(), control.data(), state_der.data(), next_state.data(), dt, &this->params_, &sb, nullptr); #endif float roll = state(S_INDEX(ROLL)); float pitch = state(S_INDEX(PITCH)); RACER::computeStaticSettling>( this->tex_helper_, next_state(S_INDEX(YAW)), next_state(S_INDEX(POS_X)), next_state(S_INDEX(POS_Y)), roll, pitch, output.data()); next_state[S_INDEX(PITCH)] = pitch; next_state[S_INDEX(ROLL)] = roll; this->setOutputs(state_der.data(), next_state.data(), output.data()); output[O_INDEX(BASELINK_VEL_B_Y)] = next_state[S_INDEX(VEL_Y)]; output[O_INDEX(ACCEL_Y)] = state_der[S_INDEX(VEL_Y)]; output[O_INDEX(OMEGA_Z)] = next_state[S_INDEX(OMEGA_Z)]; output[O_INDEX(TOTAL_VELOCITY)] = mppi::math::sign(next_state(S_INDEX(VEL_X))) * sqrt(next_state[S_INDEX(VEL_X)] * next_state[S_INDEX(VEL_X)] + next_state[S_INDEX(VEL_Y)] * next_state[S_INDEX(VEL_Y)]); } template __device__ void BicycleSlipParametricImpl::updateState(float* state, float* next_state, float* state_der, const float dt, typename PARENT_CLASS::DYN_PARAMS_T* params_p) { for (int i = threadIdx.y; i < 8; i += blockDim.y) { next_state[i] = state[i] + state_der[i] * dt; switch (i) { case S_INDEX(YAW): next_state[i] = angle_utils::normalizeAngle(next_state[i]); break; case S_INDEX(STEER_ANGLE): next_state[S_INDEX(STEER_ANGLE)] = max(min(next_state[S_INDEX(STEER_ANGLE)], params_p->max_steer_angle), -params_p->max_steer_angle); next_state[S_INDEX(STEER_ANGLE_RATE)] = state_der[S_INDEX(STEER_ANGLE)]; break; case S_INDEX(BRAKE_STATE): next_state[S_INDEX(BRAKE_STATE)] = 
min(max(next_state[S_INDEX(BRAKE_STATE)], 0.0f), -this->control_rngs_[C_INDEX(THROTTLE_BRAKE)].x); } } __syncthreads(); } template __device__ void BicycleSlipParametricImpl::computeDynamics(float* state, float* control, float* state_der, float* theta) { DYN_PARAMS_T* params_p = nullptr; if (PARENT_CLASS::SHARED_MEM_REQUEST_GRD_BYTES != 0) { // Allows us to turn on or off global or shared memory version of params params_p = (DYN_PARAMS_T*)theta; } else { params_p = &(this->params_); } // parametric part of the brake this->computeParametricDelayDeriv(state, control, state_der, params_p); // runs the parametric part of the steering model this->computeParametricSteerDeriv(state, control, state_der, params_p); // nullptr if not shared memory // bool enable_brake = control[C_INDEX(THROTTLE_BRAKE)] < 0; // const float brake_cmd = -enable_brake * control[C_INDEX(THROTTLE_BRAKE)]; // float linear_brake_slope = params_p->c_b[1] / (0.9f * (2.0f / 0.02)); // int index = (fabsf(state[S_INDEX(VEL_X)]) > linear_brake_slope && fabsf(state[S_INDEX(VEL_X)]) <= 3.0f) + // (fabsf(state[S_INDEX(VEL_X)]) > 3.0f) * 2; // float throttle = params_p->c_t[index] * control[C_INDEX(THROTTLE_BRAKE)] * params_p->gear_sign; // // calculates the average normals // float normal_x_avg, normal_y_avg, normal_z_avg; // RACER::computeBodyFrameNormals>( // this->normals_tex_helper_, state[S_INDEX(YAW)], state[S_INDEX(POS_X)], state[S_INDEX(POS_Y)], // state[S_INDEX(ROLL)], state[S_INDEX(PITCH)], normal_x_avg, normal_y_avg, normal_z_avg); // const float gravity_x_accel = act_func::tanhshrink_scale(normal_x_avg, params_p->min_normal_x) * // params_p->gravity_x; const float gravity_y_accel = act_func::tanhshrink_scale(normal_y_avg, params_p->min_normal_y) // * params_p->gravity_y; // const float brake_vel = mppi::math::clamp(state[S_INDEX(VEL_X)], -params_p->brake_vel, params_p->brake_vel); // const float rolling_vel = // mppi::math::clamp(state[S_INDEX(VEL_X)], -params_p->max_roll_resistance_vel, // 
params_p->max_roll_resistance_vel); // const float sliding_vel = mppi::math::clamp(state[S_INDEX(VEL_Y)], -params_p->max_slide_vel, // params_p->max_slide_vel); // const float brake = params_p->c_b[0] * state[S_INDEX(BRAKE_STATE)] * // mppi::math::clamp(state[S_INDEX(VEL_X)], -params_p->brake_vel, params_p->brake_vel); // const float x_drag = params_p->c_v[0] * state[S_INDEX(VEL_X)] + rolling_vel * normal_z_avg * params_p->c_rolling; // const float y_vel_comp = state[S_INDEX(VEL_Y)] * state[S_INDEX(OMEGA_Z)]; // const float accel_x = throttle - brake - x_drag; // const float mu_actual = (params_p->mu + params_p->environment * params_p->mu_env) * normal_z_avg; // state_der[S_INDEX(VEL_X)] = mppi::math::clamp(accel_x, -mu_actual, mu_actual) - gravity_x_accel + y_vel_comp; // float y_accel = -state[S_INDEX(VEL_X)] * state[S_INDEX(OMEGA_Z)] + // mppi::math::sign(state[S_INDEX(VEL_X)]) * state[S_INDEX(OMEGA_Z)] * params_p->vy_omega; // const float drag_y = params_p->c_vy * state[S_INDEX(VEL_Y)] + sliding_vel * normal_z_avg * params_p->c_sliding; // state_der[S_INDEX(VEL_Y)] = y_accel - drag_y - gravity_y_accel; // const float parametric_omega = // (state[S_INDEX(VEL_X)] / params_p->wheel_base) * tanf(state[S_INDEX(STEER_ANGLE)] / // params_p->steer_angle_scale); // state_der[S_INDEX(OMEGA_Z)] = // (parametric_omega - state[S_INDEX(OMEGA_Z)]) * params_p->c_omega - state[S_INDEX(OMEGA_Z)] * // params_p->c_v_omega; // // kinematics component // state_der[S_INDEX(YAW)] = state[S_INDEX(OMEGA_Z)]; // float yaw, sin_yaw, cos_yaw; // yaw = angle_utils::normalizeAngle(state[S_INDEX(YAW)]); // __sincosf(yaw, &sin_yaw, &cos_yaw); // state_der[S_INDEX(POS_X)] = state[S_INDEX(VEL_X)] * cos_yaw - state[S_INDEX(VEL_Y)] * sin_yaw; // state_der[S_INDEX(POS_Y)] = state[S_INDEX(VEL_X)] * sin_yaw + state[S_INDEX(VEL_Y)] * cos_yaw; } template __device__ void BicycleSlipParametricImpl::step(float* state, float* next_state, float* state_der, float* control, float* output, float* theta_s, 
const float t, const float dt) { DYN_PARAMS_T* params_p; #ifdef BICYCLE_UNCERTAINTY typename PARENT_CLASS::SharedBlock *sb_mem, *sb; #endif if (PARENT_CLASS::SHARED_MEM_REQUEST_GRD_BYTES != 0) { // Allows us to turn on or off global or shared memory version of params params_p = (DYN_PARAMS_T*)theta_s; } else { params_p = &(this->params_); } #ifdef BICYCLE_UNCERTAINTY if (this->SHARED_MEM_REQUEST_BLK_BYTES != 0) { sb_mem = (typename PARENT_CLASS::SharedBlock*)&theta_s[mppi::math::int_multiple_const( this->SHARED_MEM_REQUEST_GRD_BYTES, sizeof(float4)) / sizeof(float)]; sb = &sb_mem[threadIdx.x + blockDim.x * threadIdx.z]; } #endif const uint tdy = threadIdx.y; computeDynamics(state, control, state_der, theta_s); updateState(state, next_state, state_der, dt, params_p); #ifdef BICYCLE_UNCERTAINTY PARENT_CLASS::computeUncertaintyPropagation(state, control, state_der, next_state, dt, params_p, sb, theta_s); #endif if (tdy == 0) { float roll = state[S_INDEX(ROLL)]; float pitch = state[S_INDEX(PITCH)]; RACER::computeStaticSettling>( this->tex_helper_, next_state[S_INDEX(YAW)], next_state[S_INDEX(POS_X)], next_state[S_INDEX(POS_Y)], roll, pitch, output); next_state[S_INDEX(PITCH)] = pitch; next_state[S_INDEX(ROLL)] = roll; } __syncthreads(); this->setOutputs(state_der, next_state, output); // Rather than need a syncthreads, just overwrite the given outputs using the proper thread if (tdy == O_INDEX(BASELINK_VEL_B_Y) % blockIdx.y) { output[O_INDEX(BASELINK_VEL_B_Y)] = next_state[S_INDEX(VEL_Y)]; } if (tdy == O_INDEX(ACCEL_Y) % blockIdx.y) { output[O_INDEX(ACCEL_Y)] = state_der[S_INDEX(VEL_Y)]; } if (tdy == O_INDEX(OMEGA_Z) % blockIdx.y) { output[O_INDEX(OMEGA_Z)] = next_state[S_INDEX(OMEGA_Z)]; } if (tdy == O_INDEX(TOTAL_VELOCITY) % blockIdx.y) { const float vel_x_sign = output[O_INDEX(TOTAL_VELOCITY)] = mppi::math::sign(next_state[S_INDEX(VEL_X)]) * sqrtf(next_state[S_INDEX(VEL_X)] * next_state[S_INDEX(VEL_X)] + next_state[S_INDEX(VEL_Y)] * next_state[S_INDEX(VEL_Y)]); } } 
// Returns the body-frame linear velocity (vx, vy, 0) extracted from the state vector.
// NOTE(review): mixes state[...] and state(...) indexing — both resolve to element
// access on an Eigen vector Ref; behaviorally identical.
template Eigen::Vector3f BicycleSlipParametricImpl::velocityFromState(const Eigen::Ref& state)
{
  return Eigen::Vector3f(state[S_INDEX(VEL_X)], state(S_INDEX(VEL_Y)), 0);
}

// Returns the angular rate (0, 0, omega_z) extracted from the state vector.
template Eigen::Vector3f BicycleSlipParametricImpl::angularRateFromState(const Eigen::Ref& state)
{
  return Eigen::Vector3f(0, 0, state[S_INDEX(OMEGA_Z)]);
}

// Samples the surface-normal texture at the four wheel locations (body-frame
// offsets, presumably meters — TODO confirm units against the texture helper),
// rotates the sampled normals by yaw into a common frame, and averages them.
// Falls back to the identity "up" normal (0, 0, 1) when the texture is unused
// or any averaged component is non-finite.
template __device__ __host__ void RACER::computeBodyFrameNormals(TEX_T* tex_helper, const float yaw, const float x, const float y, const float roll, const float pitch, float& mean_normals_x, float& mean_normals_y, float& mean_normals_z)
{
  // wheel offsets in the body frame: front axle ~2.981 forward, track half-width 0.737
  float3 front_left = make_float3(2.981f, 0.737f, 0.0f);
  float3 front_right = make_float3(2.981f, -0.737f, 0.f);
  float3 rear_left = make_float3(0.0f, 0.737f, 0.0f);
  float3 rear_right = make_float3(0.0f, -0.737f, 0.0f);
  float3 body_pose = make_float3(x, y, 0.0f);
  float3 rotation = make_float3(roll, pitch, yaw);
  if (tex_helper->checkTextureUse(0))
  {
    float4 front_left_normals = tex_helper->queryTextureAtWorldOffsetPose(0, body_pose, front_left, rotation);
    float4 front_right_normals = tex_helper->queryTextureAtWorldOffsetPose(0, body_pose, front_right, rotation);
    float4 rear_left_normals = tex_helper->queryTextureAtWorldOffsetPose(0, body_pose, rear_left, rotation);
    float4 rear_right_normals = tex_helper->queryTextureAtWorldOffsetPose(0, body_pose, rear_right, rotation);
    // TODO need to rotate the normals for x,y
#ifdef __CUDA_ARCH__
    // fast (reduced-precision) intrinsics on device
    const float cos_yaw = __cosf(yaw);
    const float sin_yaw = __sinf(yaw);
#else
    const float cos_yaw = cosf(yaw);
    const float sin_yaw = sinf(yaw);
#endif
    // rotate each sampled normal about z by yaw; z component is unchanged
    float3 front_left_normals_rot;
    front_left_normals_rot.x = cos_yaw * front_left_normals.x - sin_yaw * front_left_normals.y;
    front_left_normals_rot.y = sin_yaw * front_left_normals.x + cos_yaw * front_left_normals.y;
    front_left_normals_rot.z = front_left_normals.z;
    float3 front_right_normals_rot;
    front_right_normals_rot.x = cos_yaw * front_right_normals.x - sin_yaw * front_right_normals.y;
    front_right_normals_rot.y = sin_yaw * front_right_normals.x + cos_yaw * front_right_normals.y;
    front_right_normals_rot.z = front_right_normals.z;
    float3 rear_left_normals_rot;
    rear_left_normals_rot.x = cos_yaw * rear_left_normals.x - sin_yaw * rear_left_normals.y;
    rear_left_normals_rot.y = sin_yaw * rear_left_normals.x + cos_yaw * rear_left_normals.y;
    rear_left_normals_rot.z = rear_left_normals.z;
    float3 rear_right_normals_rot;
    rear_right_normals_rot.x = cos_yaw * rear_right_normals.x - sin_yaw * rear_right_normals.y;
    rear_right_normals_rot.y = sin_yaw * rear_right_normals.x + cos_yaw * rear_right_normals.y;
    rear_right_normals_rot.z = rear_right_normals.z;
    // average of the four rotated wheel normals
    mean_normals_x =
        (front_left_normals_rot.x + front_right_normals_rot.x + rear_right_normals_rot.x + rear_left_normals_rot.x) /
        4.0f;
    mean_normals_y =
        (front_left_normals_rot.y + front_right_normals_rot.y + rear_right_normals_rot.y + rear_left_normals_rot.y) /
        4.0f;
    mean_normals_z =
        (front_left_normals_rot.z + front_right_normals_rot.z + rear_right_normals_rot.z + rear_left_normals_rot.z) /
        4.0f;
    // using 2pi so any rotation that accidently uses this will be using identity
    if (!isfinite(mean_normals_x) || !isfinite(mean_normals_y) || !isfinite(mean_normals_z))
    {
      mean_normals_x = 0.0f;
      mean_normals_y = 0.0f;
      mean_normals_z = 1.0f;
    }
  }
  else
  {
    // no texture bound: default to a flat, upward-facing normal
    mean_normals_x = 0.0f;
    mean_normals_y = 0.0f;
    mean_normals_z = 1.0f;
  }
}

#ifdef BICYCLE_UNCERTAINTY
// Fills the column-major closed-loop Jacobian A = df/dx + df/du * K over the
// 4x4 uncertainty subspace (VEL_X, YAW, POS_X, POS_Y), distributing entries
// across parallel threads. Entries not matched by a case are left untouched.
// NOTE(review): the host path does not normalize yaw before sincosf while the
// device path does — mathematically equivalent since sin/cos are periodic.
template __host__ __device__ bool BicycleSlipParametricImpl::computeUncertaintyJacobian(const float* state, const float* control, float* A, PARAMS_T* params_p)
{
  float sin_yaw, cos_yaw, cos_2_delta;
#ifdef __CUDA_ARCH__
  float yaw_norm = angle_utils::normalizeAngle(state[S_INDEX(YAW)]);
  float delta = angle_utils::normalizeAngle(state[S_INDEX(STEER_ANGLE)] / params_p->steer_angle_scale);
  __sincosf(yaw_norm, &sin_yaw, &cos_yaw);
  cos_2_delta = __cosf(delta) * __cosf(delta);
#else
  sincosf(state[S_INDEX(YAW)], &sin_yaw, &cos_yaw);
  float delta = state[S_INDEX(STEER_ANGLE)] / params_p->steer_angle_scale;
  cos_2_delta = cosf(delta) * cosf(delta);
#endif
  int step, pi;
  mp1::getParallel1DIndex(pi, step);
  // A = df/dx + df/du * K
  for (int i = pi; i < this->UNCERTAINTY_DIM * this->UNCERTAINTY_DIM; i += step)
  {
    switch (i)
    {
      case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(VEL_X), this->UNCERTAINTY_DIM):
        A[i] = -params_p->c_v[0] - params_p->K_vel_x;
        break;
      case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(YAW), this->UNCERTAINTY_DIM):
        A[i] = 0.0f;
        break;
      case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(POS_X), this->UNCERTAINTY_DIM):
        A[i] = -params_p->K_x * cos_yaw;
        break;
      case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(POS_Y), this->UNCERTAINTY_DIM):
        A[i] = -params_p->K_x * sin_yaw;
        break;
      // yaw
      case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(VEL_X), this->UNCERTAINTY_DIM):
        A[i] = 0.0f;
        break;
      case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(YAW), this->UNCERTAINTY_DIM):
        A[i] = -fabsf(state[S_INDEX(VEL_X)]) * params_p->K_yaw / (params_p->wheel_base * cos_2_delta);
        break;
      case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(POS_X), this->UNCERTAINTY_DIM):
        A[i] = state[S_INDEX(VEL_X)] * params_p->K_y * sin_yaw / (params_p->wheel_base * cos_2_delta);
        break;
      case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(POS_Y), this->UNCERTAINTY_DIM):
        A[i] = -state[S_INDEX(VEL_X)] * params_p->K_y * cos_yaw / (params_p->wheel_base * cos_2_delta);
        break;
      // pos x
      case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(VEL_X), this->UNCERTAINTY_DIM):
        A[i] = cos_yaw;
        break;
      case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(YAW), this->UNCERTAINTY_DIM):
        A[i] = -sin_yaw * state[S_INDEX(VEL_X)] - cos_yaw * state[S_INDEX(VEL_Y)];
        break;
      case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(POS_X), this->UNCERTAINTY_DIM):
        A[i] = 0.0f;
        break;
      case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(POS_Y), this->UNCERTAINTY_DIM):
        A[i] = 0.0f;
        break;
      // pos y
      case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(VEL_X), this->UNCERTAINTY_DIM):
        A[i] = sin_yaw;
        break;
      case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(YAW), this->UNCERTAINTY_DIM):
        A[i] = cos_yaw * state[S_INDEX(VEL_X)] - sin_yaw * state[S_INDEX(VEL_Y)];
        break;
      case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_Y), this->UNCERTAINTY_DIM):
        A[i] = 0.0f;
        break;
      case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_X), this->UNCERTAINTY_DIM):
        A[i] = 0.0f;
        break;
    }
  }
  return true;
}
#endif


================================================
FILE: include/mppi/dynamics/bicycle_slip/bicycle_slip_parametric.cuh
================================================
//
// Created by jason on 12/12/22.
//

#ifndef MPPIGENERIC_BICYCLE_SLIP_PARAMTERIC_CUH
#define MPPIGENERIC_BICYCLE_SLIP_PARAMTERIC_CUH

// NOTE(extraction): the include targets were stripped from the angle brackets
#include
#include
#include
#include
#include

#define BICYCLE_UNCERTAINTY

namespace RACER
{
// Averages the body-frame surface normals sampled at the four wheel positions; see the .cu for details.
template __device__ __host__ void computeBodyFrameNormals(TEX_T* tex_helper, const float yaw, const float x,
                                                          const float y, const float roll, const float pitch,
                                                          float& mean_normals_x, float& mean_normals_y,
                                                          float& mean_normals_z);
};

// Parameters for the parametric bicycle-slip model; extends the RACER dubins
// elevation parameters with slip/drag coefficients and a larger state enum
// that carries uncertainty terms.
struct BicycleSlipParametricParams : public RacerDubinsElevationParams
{
  enum class StateIndex : int
  {
    POS_X = 0,
    POS_Y,
    YAW,
    STEER_ANGLE,
    BRAKE_STATE,
    VEL_X,
    VEL_Y,
    OMEGA_Z,
    ROLL,
    PITCH,
    STEER_ANGLE_RATE,
    ENGINE_RPM,
    // TODO: Figure out if more filler is necessary
    UNCERTAINTY_POS_X,
    UNCERTAINTY_POS_Y,
    UNCERTAINTY_YAW,
    UNCERTAINTY_VEL_X,
    UNCERTAINTY_POS_X_Y,
    UNCERTAINTY_POS_X_YAW,
    UNCERTAINTY_POS_X_VEL_X,
    UNCERTAINTY_POS_Y_YAW,
    UNCERTAINTY_POS_Y_VEL_X,
    UNCERTAINTY_YAW_VEL_X,
    NUM_STATES
  };
  float gravity_x = -3.9f;
  float gravity_y = -7.2f;
  float mass = 0.61;
  // vx
  float min_rpm = 1.1f;
  float rpm_scale = 1.3;
  float c_rolling[2] = { 0.3f, 0.4f };
  float c_rpm[3] = { 0.01f, 0.066f, 0.02f };
  float c_vx = 0.12f;
  float c_brake[2] = { 0.4f, 4.23f };
  // vy
  float c_vy = 0.12f;
  float y_f_c[2] = { 0.9f, 0.8f };
  float c_sliding[2] = { 1.5f, 1.7f };
  float c_omega = 2.2f;
  float c_v_omega = 4.2f;

  BicycleSlipParametricParams()
  {
    // override inherited throttle/brake/drag coefficients
    c_t[0] = 2.73f;
    c_t[1] = 0.15f;
    c_t[2] = -0.0145f;
    c_b[0] = 30.0f;
    c_v[0] = 0.079f;  // c_vx
  }
};

template class
// Parametric bicycle-slip dynamics model built on top of the RACER dubins
// elevation model; adds a surface-normals texture helper and uncertainty
// propagation. (Class head split by extraction: "template class" is on the
// preceding line.)
BicycleSlipParametricImpl : public RacerDubinsElevationImpl
{
public:
  using PARENT_CLASS = RacerDubinsElevationImpl;
  typedef PARAMS_T DYN_PARAMS_T;
  // static const int SHARED_MEM_REQUEST_GRD_BYTES = sizeof(PARAMS_T);
  // TODO set to one to prevent array of size 0 error
  // static const int SHARED_MEM_REQUEST_BLK_BYTES = PARENT_CLASS::SHARED_MEM_REQUEST_BLK_BYTES;

  typedef typename PARENT_CLASS::state_array state_array;
  typedef typename PARENT_CLASS::control_array control_array;
  typedef typename PARENT_CLASS::output_array output_array;
  typedef typename PARENT_CLASS::dfdx dfdx;
  typedef typename PARENT_CLASS::dfdu dfdu;

  BicycleSlipParametricImpl(cudaStream_t stream = nullptr);
  BicycleSlipParametricImpl(const std::string& model_path, cudaStream_t stream = nullptr);

  std::string getDynamicsModelName() const override
  {
    return "Bicycle Slip Parametric Model";
  }

  void GPUSetup();
  void freeCudaMem();

  // host-side dynamics/step/update (Eigen interfaces)
  void computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der);
  void step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const Eigen::Ref& control,
            Eigen::Ref output, const float t, const float dt);
  void updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt);

  // no-op: the params-taking overload below is the real device-side update
  __device__ void updateState(float* state, float* next_state, float* state_der, const float dt)
  {
  }
  __device__ void updateState(float* state, float* next_state, float* state_der, const float dt,
                              typename PARENT_CLASS::DYN_PARAMS_T* params_p);

  __device__ void computeDynamics(float* state, float* control, float* state_der, float* theta = nullptr);
  __device__ inline void step(float* state, float* next_state, float* state_der, float* control, float* output,
                              float* theta_s, const float t, const float dt);
#ifdef BICYCLE_UNCERTAINTY
  __host__ __device__ bool computeUncertaintyJacobian(const float* state, const float* control, float* A,
                                                      PARAMS_T* params_p);
#endif
  void paramsToDevice();

  // state_array interpolateState(const Eigen::Ref state_1,
  //                              const Eigen::Ref state_2,
  //                              const float alpha);
  // bool computeGrad(const Eigen::Ref& state, const Eigen::Ref& control,
  //                  Eigen::Ref A, Eigen::Ref B);

  Eigen::Vector3f velocityFromState(const Eigen::Ref& state);
  Eigen::Vector3f angularRateFromState(const Eigen::Ref& state);

  // void enforceLeash(const Eigen::Ref& state_true, const Eigen::Ref& state_nominal,
  //                   const Eigen::Ref& leash_values, Eigen::Ref state_output);

  state_array stateFromMap(const std::map& map) override;

  TwoDTextureHelper* getTextureHelperNormals()
  {
    return normals_tex_helper_;
  }

  // keeps the elevation-map and all three normal-map textures in the same rotation frame
  void updateRotation(std::array& rotation)
  {
    this->tex_helper_->updateRotation(0, rotation);
    this->normals_tex_helper_->updateRotation(0, rotation);
    this->normals_tex_helper_->updateRotation(1, rotation);
    this->normals_tex_helper_->updateRotation(2, rotation);
  }

protected:
  TwoDTextureHelper* normals_tex_helper_ = nullptr;
};

// Concrete, non-template-parameterized alias class for the impl above.
class BicycleSlipParametric : public BicycleSlipParametricImpl
{
public:
  BicycleSlipParametric(cudaStream_t stream = nullptr) : BicycleSlipParametricImpl(stream)
  {
  }
  BicycleSlipParametric(const std::string& model_path, cudaStream_t stream = nullptr)
    : BicycleSlipParametricImpl(model_path, stream)
  {
  }
};

#if __CUDACC__
#include "bicycle_slip_parametric.cu"
#endif

#endif  // MPPIGENERIC_BICYCLE_SLIP_PARAMTERIC_CUH


================================================
FILE: include/mppi/dynamics/cartpole/cartpole_dynamics.cu
================================================
// NOTE(extraction): the include targets were stripped from the angle brackets
#include
#include

// Stores the physical parameters; GPU setup happens separately via GPUSetup().
CartpoleDynamics::CartpoleDynamics(float cart_mass, float pole_mass, float pole_length, cudaStream_t stream)
  : Dynamics(stream)
{
  this->params_ = CartpoleDynamicsParams(cart_mass, pole_mass, pole_length);
}

// Analytic Jacobians of the cartpole dynamics wrt state (A) and control (B).
// Only nonzero entries are written; A and B are expected pre-zeroed by the caller.
bool CartpoleDynamics::computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B)
{
  float theta = state(2);
  float theta_dot = state(3);
  float force = control(0);
  A(0, 1) = 1.0;
  A(1, 2) = (this->params_.pole_mass * cosf(theta) * (this->params_.pole_length * SQ(theta_dot) + gravity_ *
cosf(theta)) -
             gravity_ * this->params_.pole_mass * SQ(sinf(theta))) /
                (this->params_.cart_mass + this->params_.pole_mass * SQ(sinf(theta))) -
            (2 * this->params_.pole_mass * cosf(theta) * sinf(theta) *
             (force + this->params_.pole_mass * sinf(theta) *
                          (this->params_.pole_length * SQ(theta_dot) + gravity_ * cosf(theta)))) /
                powf((this->params_.cart_mass + this->params_.pole_mass * SQ(sinf(theta))), 2.0);
  A(1, 3) = (2 * this->params_.pole_length * this->params_.pole_mass * theta_dot * sinf(theta)) /
            (this->params_.cart_mass + this->params_.pole_mass * SQ(sinf(theta)));
  A(2, 3) = 1.0;
  A(3, 2) = (force * sinf(theta) - gravity_ * cosf(theta) * (this->params_.pole_mass + this->params_.cart_mass) -
             this->params_.pole_length * this->params_.pole_mass * SQ(theta_dot) * SQ(cosf(theta)) +
             this->params_.pole_length * this->params_.pole_mass * SQ(theta_dot) * SQ(sinf(theta))) /
                (this->params_.pole_length * (this->params_.cart_mass + this->params_.pole_mass * SQ(sinf(theta)))) +
            (2 * this->params_.pole_mass * cosf(theta) * sinf(theta) *
             (this->params_.pole_length * this->params_.pole_mass * cosf(theta) * sinf(theta) * SQ(theta_dot) +
              force * cosf(theta) + gravity_ * sinf(theta) * (this->params_.pole_mass + this->params_.cart_mass))) /
                powf(this->params_.pole_length * (this->params_.cart_mass + this->params_.pole_mass * SQ(sinf(theta))),
                     2.0);
  A(3, 3) = -(2 * this->params_.pole_mass * theta_dot * cosf(theta) * sinf(theta)) /
            (this->params_.cart_mass + this->params_.pole_mass * SQ(sinf(theta)));
  B(1, 0) = 1 / (this->params_.cart_mass + this->params_.pole_mass * SQ(sinf(theta)));
  B(3, 0) = -cosf(theta) /
            (this->params_.pole_length * (this->params_.cart_mass + this->params_.pole_mass * SQ(sinf(theta))));
  return true;
}

// Host-side cartpole equations of motion (standard cart + inverted pendulum).
void CartpoleDynamics::computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der)
{
  const float sin_theta = sinf(state(S_INDEX(THETA)));
  const float cos_theta = cosf(state(S_INDEX(THETA)));
  float theta_dot = state[S_INDEX(THETA_DOT)];
  float force = control[C_INDEX(FORCE)];
  const float m_c = this->params_.cart_mass;
  const float m_p = this->params_.pole_mass;
  const float l_p = this->params_.pole_length;
  state_der(S_INDEX(POS_X)) = state(S_INDEX(VEL_X));
  state_der(S_INDEX(VEL_X)) =
      (force + m_p * sin_theta * (l_p * SQ(theta_dot) + gravity_ * cos_theta)) / (m_c + m_p * SQ(sin_theta));
  state_der(S_INDEX(THETA)) = theta_dot;
  state_der(S_INDEX(THETA_DOT)) =
      (-force * cos_theta - m_p * l_p * SQ(theta_dot) * cos_theta * sin_theta - (m_c + m_p) * gravity_ * sin_theta) /
      (l_p * (m_c + m_p * SQ(sin_theta)));
}

void CartpoleDynamics::printState(const Eigen::Ref& state)
{
  printf("Cart position: %f; Cart velocity: %f; Pole angle: %f; Pole rate: %f \n", state(0), state(1), state(2),
         state(3));  // Needs to be completed
}

void CartpoleDynamics::printState(float* state)
{
  printf("Cart position: %f; Cart velocity: %f; Pole angle: %f; Pole rate: %f \n", state[0], state[1], state[2],
         state[3]);  // Needs to be completed
}

void CartpoleDynamics::printParams()
{
  printf("Cart mass: %f; Pole mass: %f; Pole length: %f \n", this->params_.cart_mass, this->params_.pole_mass,
         this->params_.pole_length);
}

// Device-side cartpole dynamics; normalizes theta and uses fast sin/cos intrinsics.
__device__ void CartpoleDynamics::computeDynamics(float* state, float* control, float* state_der, float* theta_s)
{
  float theta = angle_utils::normalizeAngle(state[S_INDEX(THETA)]);
  const float sin_theta = __sinf(theta);
  const float cos_theta = __cosf(theta);
  float theta_dot = state[S_INDEX(THETA_DOT)];
  float force = control[C_INDEX(FORCE)];
  const float m_c = this->params_.cart_mass;
  const float m_p = this->params_.pole_mass;
  const float l_p = this->params_.pole_length;
  state_der[S_INDEX(POS_X)] = state[S_INDEX(VEL_X)];
  state_der[S_INDEX(VEL_X)] =
      (force + m_p * sin_theta * (l_p * SQ(theta_dot) + gravity_ * cos_theta)) / (m_c + m_p * SQ(sin_theta));
  state_der[S_INDEX(THETA)] = theta_dot;
  state_der[S_INDEX(THETA_DOT)] =
      (-force * cos_theta - m_p * l_p * SQ(theta_dot) * cos_theta * sin_theta - (m_c + m_p) * gravity_ * sin_theta) /
      (l_p * (m_c + m_p * SQ(sin_theta)));
}

// Builds a state vector from a name->value map; throws std::out_of_range if a key is missing.
Dynamics::state_array CartpoleDynamics::stateFromMap(const std::map& map)
{
  state_array s;
  s(S_INDEX(POS_X)) = map.at("POS_X");
  s(S_INDEX(VEL_X)) = map.at("VEL_X");
  s(S_INDEX(THETA)) = map.at("THETA");
  s(S_INDEX(THETA_DOT)) = map.at("THETA_DOT");
  return s;
}


================================================
FILE: include/mppi/dynamics/cartpole/cartpole_dynamics.cuh
================================================
#ifndef CARTPOLE_CUH_
#define CARTPOLE_CUH_

// NOTE(extraction): the include target was stripped from the angle brackets
#include

// Cartpole parameters: masses and pole length; state is (x, xdot, theta, theta_dot).
struct CartpoleDynamicsParams : public DynamicsParams
{
  enum class StateIndex : int
  {
    POS_X = 0,
    VEL_X,
    THETA,
    THETA_DOT,
    NUM_STATES
  };
  enum class ControlIndex : int
  {
    FORCE = 0,
    NUM_CONTROLS
  };
  enum class OutputIndex : int
  {
    POS_X = 0,
    VEL_X,
    THETA,
    THETA_DOT,
    NUM_OUTPUTS
  };
  float cart_mass = 1.0f;
  float pole_mass = 1.0f;
  float pole_length = 1.0f;
  CartpoleDynamicsParams() = default;
  CartpoleDynamicsParams(float cart_mass, float pole_mass, float pole_length)
    : cart_mass(cart_mass), pole_mass(pole_mass), pole_length(pole_length){};
};

using namespace MPPI_internal;

class CartpoleDynamics : public Dynamics
{
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW

  using PARENT_CLASS = Dynamics;

  CartpoleDynamics(float cart_mass = 1.0f, float pole_mass = 1.0f, float pole_length = 1.0f, cudaStream_t stream = 0);

  std::string getDynamicsModelName() const override
  {
    return "Cartpole Model";
  }

  /**
   * runs dynamics using state and control and sets it to state
   * derivative. Everything is Eigen Matrices, not Eigen Vectors!
   *
   * @param state input of current state, passed by reference
   * @param control input of current control, passed by reference
   * @param state_der output of new state derivative, passed by reference
   */
  void computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der);

  /**
   * compute the Jacobians with respect to state and control
   *
   * @param state input of current state, passed by reference
   * @param control input of current control, passed by reference
   */
  bool computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B);

  __host__ __device__ float getCartMass()
  {
    return this->params_.cart_mass;
  };
  __host__ __device__ float getPoleMass()
  {
    return this->params_.pole_mass;
  };
  __host__ __device__ float getPoleLength()
  {
    return this->params_.pole_length;
  };
  __host__ __device__ float getGravity()
  {
    return gravity_;
  }

  void printState(const Eigen::Ref& state);
  void printState(float* state);
  void printParams();

  // void computeKinematics(const Eigen::Ref&,
  //                        Eigen::Ref& state_der) {};

  __device__ void computeDynamics(float* state, float* control, float* state_der, float* theta = nullptr);

  state_array stateFromMap(const std::map& map) override;

protected:
  const float gravity_ = 9.81;
};

#if __CUDACC__
#include "cartpole_dynamics.cu"
#endif

#endif  // CARTPOLE_CUH_


================================================
FILE: include/mppi/dynamics/double_integrator/di_dynamics.cu
================================================
// NOTE(extraction): the include target was stripped from the angle brackets
#include

// Stores the noise parameter and seeds the noise RNG from hardware entropy.
DoubleIntegratorDynamics::DoubleIntegratorDynamics(float system_noise, cudaStream_t stream) : Dynamics(stream)
{
  this->params_ = DoubleIntegratorParams(system_noise);
  // Seed the RNG and initialize the system noise distribution
  std::random_device rd;
  gen.seed(rd());  // Seed the RNG with a random number
  setStateVariance(system_noise);
}

// Host-side 2D double-integrator: velocities integrate position, controls are accelerations.
void DoubleIntegratorDynamics::computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der)
{
  state_der(0) = state(2);    // xdot;
  state_der(1) = state(3);    // ydot;
  state_der(2) = control(0);  // x_force;
  state_der(3) = control(1);  // y_force
}

// Constant linear-system Jacobians; only nonzero entries are written.
bool DoubleIntegratorDynamics::computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B)
{
  A(0, 2) = 1;
  A(1, 3) = 1;
  B(2, 0) = 1;
  B(3, 1) = 1;
  return true;
}

void DoubleIntegratorDynamics::printState(float* state)
{
  printf("X position: %f; Y position: %f; X velocity: %f; Y velocity: %f \n", state[0], state[1], state[2], state[3]);
}

void DoubleIntegratorDynamics::printState(const float* state)
{
  printf("X position: %f; Y position: %f; X velocity: %f; Y velocity: %f \n", state[0], state[1], state[2], state[3]);
}

// Device-side mirror of the host dynamics above.
__device__ void DoubleIntegratorDynamics::computeDynamics(float* state, float* control, float* state_der, float* theta_s)
{
  state_der[0] = state[2];    // xdot;
  state_der[1] = state[3];    // ydot;
  state_der[2] = control[0];  // x_force;
  state_der[3] = control[1];  // y_force
}

// Resets the noise distribution to zero-mean with the given variance.
void DoubleIntegratorDynamics::setStateVariance(float system_variance)
{
  normal_distribution = std::normal_distribution(0, sqrtf(system_variance));
}

// Adds Gaussian acceleration noise (velocity channels only), scaled by dt, to the state in place.
void DoubleIntegratorDynamics::computeStateDisturbance(float dt, Eigen::Ref state)
{
  // Generate system noise
  state_array system_noise;
  system_noise << 0.0, 0.0, normal_distribution(gen), normal_distribution(gen);
  state += system_noise * dt;
}

// Control Jacobian as a standalone matrix (state-independent for this model).
DoubleIntegratorDynamics::dfdu DoubleIntegratorDynamics::B(const Eigen::Ref& state)
{
  dfdu B = dfdu::Zero();
  B(2, 0) = 1;
  B(3, 1) = 1;
  return B;
}

// Builds a state vector from a name->value map; throws std::out_of_range if a key is missing.
Dynamics::state_array DoubleIntegratorDynamics::stateFromMap(const std::map& map)
{
  state_array s;
  s(0) = map.at("POS_X");
  s(1) = map.at("POS_Y");
  s(2) = map.at("VEL_X");
  s(3) = map.at("VEL_Y");
  return s;
}


================================================
FILE: include/mppi/dynamics/double_integrator/di_dynamics.cuh
================================================
#pragma once

#ifndef DOUBLE_INTEGRATOR_CUH_
#define DOUBLE_INTEGRATOR_CUH_

// NOTE(extraction): the include targets were stripped from the angle brackets
#include
#include

// 2D double-integrator parameters: state (x, y, vx, vy), control (ax, ay).
struct DoubleIntegratorParams : public DynamicsParams
{
  enum class StateIndex : int
  {
    POS_X = 0,
    POS_Y,
    VEL_X,
    VEL_Y,
    NUM_STATES
  };
  enum class ControlIndex : int
  {
    ACCEL_X = 0,
    ACCEL_Y,
    NUM_CONTROLS
  };
  enum class OutputIndex : int
  {
    POS_X = 0,
    POS_Y,
    VEL_X,
    VEL_Y,
    NUM_OUTPUTS
  };
  float system_noise = 1;
  DoubleIntegratorParams(float noise) : system_noise(noise){};
  DoubleIntegratorParams() = default;
  ~DoubleIntegratorParams() = default;
};

using namespace MPPI_internal;

class DoubleIntegratorDynamics : public Dynamics
{
public:
  // EIGEN_MAKE_ALIGNED_OPERATOR_NEW

  DoubleIntegratorDynamics(float system_noise = 1, cudaStream_t stream = nullptr);

  std::string getDynamicsModelName() const override
  {
    return "2D Double Integrator Model";
  }

  void computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der);

  bool computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B);

  void computeStateDisturbance(float dt, Eigen::Ref state);

  void setStateVariance(float system_variance = 1.0);

  void printState(float* state);
  void printState(const float* state);

  __device__ void computeDynamics(float* state, float* control, float* state_der, float* theta = nullptr);

  dfdu B(const Eigen::Ref& state);

  state_array stateFromMap(const std::map& map);

private:
  // Random number generator for system noise
  std::mt19937 gen;  // Standard mersenne_twister_engine which will be seeded
  std::normal_distribution normal_distribution;
};

#if __CUDACC__
#include "di_dynamics.cu"
#endif

#endif  //!
//! DOUBLE_INTEGRATOR_CUH_ (continuation of the include-guard comment split by extraction)


================================================
FILE: include/mppi/dynamics/dubins/dubins.cu
================================================
// NOTE(extraction): the include target was stripped from the angle brackets
#include

DubinsDynamics::DubinsDynamics(cudaStream_t stream) : Dynamics(stream)
{
  this->params_ = DubinsParams();
}

// Host-side Dubins kinematics: heading-projected forward velocity plus commanded yaw rate.
void DubinsDynamics::computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der)
{
  state_der(S_INDEX(POS_X)) = control(C_INDEX(VEL)) * cos(state(S_INDEX(YAW)));
  state_der(S_INDEX(POS_Y)) = control(C_INDEX(VEL)) * sin(state(S_INDEX(YAW)));
  state_der(S_INDEX(YAW)) = control(C_INDEX(YAW_RATE));
}

// Analytic Jacobians; only nonzero entries are written.
bool DubinsDynamics::computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B)
{
  A(0, 2) = -control(C_INDEX(VEL)) * sin(state(S_INDEX(YAW)));
  A(1, 2) = control(C_INDEX(VEL)) * cos(state(S_INDEX(YAW)));
  B(0, 0) = cos(state(S_INDEX(YAW)));
  B(1, 0) = sin(state(S_INDEX(YAW)));
  B(2, 1) = 1;
  return true;
}

// Euler integration, with yaw wrapped back into the normalized range.
void DubinsDynamics::updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt)
{
  next_state = state + state_der * dt;
  next_state(S_INDEX(YAW)) = angle_utils::normalizeAngle(next_state(S_INDEX(YAW)));
}

// Linear interpolation of the state; yaw uses angle-aware interpolation.
DubinsDynamics::state_array DubinsDynamics::interpolateState(const Eigen::Ref state_1, const Eigen::Ref state_2, const float alpha)
{
  state_array result = (1 - alpha) * state_1 + alpha * state_2;
  result(2) = angle_utils::interpolateEulerAngleLinear(state_1(2), state_2(2), alpha);
  return result;
}

// Device-side Euler integration, parallelized over the y-thread dimension.
__device__ void DubinsDynamics::updateState(float* state, float* next_state, float* state_der, const float dt)
{
  int i;
  int tdy = threadIdx.y;
  // Add the state derivative time dt to the current state.
  // printf("updateState thread %d, %d = %f, %f\n", threadIdx.x, threadIdx.y, state[S_INDEX(POS_X)],
  //        state_der[S_INDEX(POS_X)]);
  for (i = tdy; i < STATE_DIM; i += blockDim.y)
  {
    next_state[i] = state[i] + state_der[i] * dt;
    if (i == S_INDEX(YAW))
    {
      next_state[i] = angle_utils::normalizeAngle(next_state[i]);
    }
  }
}

// Device-side mirror of the host dynamics above.
__device__ void DubinsDynamics::computeDynamics(float* state, float* control, float* state_der, float* theta_s)
{
  state_der[S_INDEX(POS_X)] = control[C_INDEX(VEL)] * cos(state[S_INDEX(YAW)]);
  state_der[S_INDEX(POS_Y)] = control[C_INDEX(VEL)] * sin(state[S_INDEX(YAW)]);
  state_der[S_INDEX(YAW)] = control[C_INDEX(YAW_RATE)];
}

// Builds a state vector from a name->value map; throws std::out_of_range if a key is missing.
Dynamics::state_array DubinsDynamics::stateFromMap(const std::map& map)
{
  state_array s;
  s(S_INDEX(POS_X)) = map.at("POS_X");
  s(S_INDEX(POS_Y)) = map.at("POS_Y");
  s(S_INDEX(YAW)) = map.at("YAW");
  return s;
}


================================================
FILE: include/mppi/dynamics/dubins/dubins.cuh
================================================
#pragma once

#ifndef DUBINS_CUH_
#define DUBINS_CUH_

// NOTE(extraction): the include targets were stripped from the angle brackets
#include
#include
#include

// Dubins-car parameters: state (x, y, yaw), control (velocity, yaw rate).
struct DubinsParams : public DynamicsParams
{
  enum class StateIndex : int
  {
    POS_X = 0,
    POS_Y,
    YAW,
    NUM_STATES
  };
  enum class ControlIndex : int
  {
    VEL = 0,
    YAW_RATE,
    NUM_CONTROLS
  };
  enum class OutputIndex : int
  {
    POS_X = 0,
    POS_Y,
    YAW,
    NUM_OUTPUTS
  };
  DubinsParams() = default;
  ~DubinsParams() = default;
};

using namespace MPPI_internal;
/**
 * state: x, y, theta
 * control: forward velocity, angular velocity
 */
class DubinsDynamics : public Dynamics
{
public:
  DubinsDynamics(cudaStream_t stream = nullptr);

  using PARENT_CLASS = Dynamics;
  using PARENT_CLASS::updateState;  // needed as overloading updateState here hides all parent versions of updateState

  std::string getDynamicsModelName() const override
  {
    return "Dubins Model";
  }

  void computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der);

  void updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt);

  __device__ void updateState(float* state, float* next_state, float* state_der, const float dt);

  state_array interpolateState(const Eigen::Ref state_1, const Eigen::Ref state_2, const float alpha);

  bool computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B);

  __device__ void computeDynamics(float* state, float* control, float* state_der, float* theta = nullptr);

  state_array stateFromMap(const std::map& map) override;

private:
};

#if __CUDACC__
#include "dubins.cu"
#endif

#endif  //! DUBINS_CUH_


================================================
FILE: include/mppi/dynamics/dynamics.cu
================================================
// NOTE(extraction): the include target was stripped from the angle brackets
#include

// Asynchronously copies the params struct and control ranges to the device copy
// of this model; optionally blocks until the copy completes.
template void Dynamics::paramsToDevice(bool synchronize)
{
  if (GPUMemStatus_)
  {
    HANDLE_ERROR(cudaMemcpyAsync(&model_d_->params_, &params_, sizeof(PARAMS_T), cudaMemcpyHostToDevice, stream_));
    HANDLE_ERROR(cudaMemcpyAsync(&model_d_->control_rngs_, &control_rngs_, CONTROL_DIM * sizeof(float2),
                                 cudaMemcpyHostToDevice, stream_));
    if (synchronize)
    {
      HANDLE_ERROR(cudaStreamSynchronize(stream_));
    }
  }
}

// Opens every control channel to (-inf, +inf).
template void Dynamics::setDefaultControlRanges()
{
  for (int i = 0; i < CONTROL_DIM; i++)
  {
    control_rngs_[i].x = -FLT_MAX;
    control_rngs_[i].y = FLT_MAX;
  }
}

// Copies the given per-channel (min, max) ranges and mirrors them to the device copy.
template void Dynamics::setControlRanges(std::array& control_rngs, bool synchronize)
{
  for (int i = 0; i < CONTROL_DIM; i++)
  {
    control_rngs_[i].x = control_rngs[i].x;
    control_rngs_[i].y = control_rngs[i].y;
  }
  if (GPUMemStatus_)
  {
    HANDLE_ERROR(cudaMemcpyAsync(this->model_d_->control_rngs_, this->control_rngs_, CONTROL_DIM * sizeof(float2),
                                 cudaMemcpyHostToDevice, stream_));
    if (synchronize)
    {
      HANDLE_ERROR(cudaStreamSynchronize(stream_));
    }
  }
}

// Copies the per-channel deadband magnitudes and mirrors them to the device copy.
template void Dynamics::setControlDeadbands(std::array& control_deadband, bool synchronize)
{
  for (int i = 0; i < CONTROL_DIM; i++)
  {
    control_deadband_[i] = control_deadband[i];
  }
  if (GPUMemStatus_)
  {
    HANDLE_ERROR(cudaMemcpyAsync(this->model_d_->control_deadband_, this->control_deadband_, CONTROL_DIM *
sizeof(float),
                                 cudaMemcpyHostToDevice, stream_));
    if (synchronize)
    {
      HANDLE_ERROR(cudaStreamSynchronize(stream_));
    }
  }
}

// Allocates the device-side copy of the derived model (once) and pushes params to it.
template void Dynamics::GPUSetup()
{
  CLASS_T* derived = static_cast(this);
  if (!GPUMemStatus_)
  {
    model_d_ = Managed::GPUSetup(derived);
  }
  else
  {
    this->logger_->debug("%s: GPU Memory already set\n", derived->getDynamicsModelName().c_str());
  }
  derived->paramsToDevice();
}

// Frees the device-side model copy, if one was allocated.
template void Dynamics::freeCudaMem()
{
  if (GPUMemStatus_)
  {
    HANDLE_ERROR(cudaFree(model_d_));
    GPUMemStatus_ = false;
    model_d_ = nullptr;
  }
}

// CRTP dispatch: kinematics (y-thread 0 only) followed by the derived model's dynamics.
template __device__ inline void Dynamics::computeStateDeriv(float* state, float* control, float* state_der, float* theta_s)
{
  CLASS_T* derived = static_cast(this);
  // only propagate a single state, i.e. thread.y = 0
  // find the change in x,y,theta based off of the rest of the state
  if (threadIdx.y == 0)
  {
    derived->computeKinematics(state, state_der);
  }
  derived->computeDynamics(state, control, state_der, theta_s);
}

// Applies the deadband (snap-to-zero-control inside it, shift-toward-zero outside)
// then clamps each control channel to its configured range, in parallel.
template __device__ void Dynamics::enforceConstraints(float* state, float* control)
{
  // TODO should control_rngs_ be a constant memory parameter
  int i, p_index, step;
  mppi::p1::getParallel1DIndex(p_index, step);
  // parallelize setting the constraints with y dim
  for (i = p_index; i < CONTROL_DIM; i += step)
  {
    if (fabsf(control[i]) < this->control_deadband_[i])
    {
      control[i] = this->zero_control_[i];
    }
    else
    {
      control[i] += this->control_deadband_[i] * -mppi::math::sign(control[i]);
    }
    control[i] = fminf(fmaxf(this->control_rngs_[i].x, control[i]), this->control_rngs_[i].y);
  }
}

// Default device-side Euler integration, parallelized across threads.
template __device__ void Dynamics::updateState(float* state, float* next_state, float* state_der, const float dt)
{
  int i, p_index, step;
  mppi::p1::getParallel1DIndex(p_index, step);
  // Add the state derivative time dt to the current state.
  for (i = p_index; i < STATE_DIM; i += step)
  {
    next_state[i] = state[i] + state_der[i] * dt;
  }
}

// Default device-side step: derivative -> integrate -> output, with syncs between phases.
template __device__ inline void Dynamics::step(float* state, float* next_state, float* state_der, float* control, float* output, float* theta_s, const float t, const float dt)
{
  CLASS_T* derived = static_cast(this);
  derived->computeStateDeriv(state, control, state_der, theta_s);
  __syncthreads();
  derived->updateState(state, next_state, state_der, dt);
  __syncthreads();
  derived->stateToOutput(next_state, output);
}

// Default output mapping: copy the overlapping prefix of state into output.
template __host__ __device__ inline void Dynamics::stateToOutput(const float* __restrict__ state, float* __restrict__ output)
{
  // TODO this is a hack
  int p_index, step;
  mppi::p1::getParallel1DIndex(p_index, step);
  for (int i = p_index; i < OUTPUT_DIM && i < STATE_DIM; i += step)
  {
    output[i] = state[i];
  }
}

// Inverse of stateToOutput: copy the overlapping prefix of output into state.
template __host__ __device__ inline void Dynamics::outputToState(const float* __restrict__ output, float* __restrict__ state)
{
  // TODO this is a hack
  int p_index, step;
  mppi::p1::getParallel1DIndex(p_index, step);
  for (int i = p_index; i < OUTPUT_DIM && i < STATE_DIM; i += step)
  {
    state[i] = output[i];
  }
}

template Dynamics::state_array Dynamics::getZeroState() const
{
  return state_array::Zero();
}

// Device-side zero-state fill, parallelized across threads.
template __host__ __device__ inline void Dynamics::getZeroState(float* state) const
{
  int p_index, step;
  mppi::p1::getParallel1DIndex(p_index, step);
  for (int i = p_index; i < STATE_DIM; i += step)
  {
    state[i] = 0.0f;
  }
}


================================================
FILE: include/mppi/dynamics/dynamics.cuh
================================================
#pragma once

#ifndef DYNAMICS_CUH_
#define DYNAMICS_CUH_

/* Header file for dynamics */

// NOTE(extraction): the include targets were stripped from the angle brackets
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

// helpful macros to use the enum setup
#ifndef E_INDEX
#define E_INDEX(ENUM, enum_val) static_cast(ENUM::enum_val)
#endif

#ifndef S_INDEX
#define S_IND_CLASS(CLASS, enum_val) E_INDEX(CLASS::StateIndex, enum_val)
#define S_INDEX(enum_val) S_IND_CLASS(PARENT_CLASS::DYN_PARAMS_T, enum_val)
#endif

#ifndef C_INDEX
#define C_IND_CLASS(CLASS, enum_val) E_INDEX(CLASS::ControlIndex, enum_val)
#define C_INDEX(enum_val) C_IND_CLASS(PARENT_CLASS::DYN_PARAMS_T, enum_val)
#endif

#ifndef O_INDEX
#define O_IND_CLASS(CLASS, enum_val) E_INDEX(CLASS::OutputIndex, enum_val)
#define O_INDEX(enum_val) O_IND_CLASS(PARENT_CLASS::DYN_PARAMS_T, enum_val)
#endif

// Base parameter struct all dynamics params must inherit from; enforced below.
struct DynamicsParams
{
  enum class StateIndex : int
  {
    POS_X = 0,
    NUM_STATES
  };
  enum class ControlIndex : int
  {
    VEL_X = 0,
    NUM_CONTROLS
  };
  enum class OutputIndex : int
  {
    POS_X = 0,
    NUM_OUTPUTS
  };
};

template using paramsInheritsFrom = typename std::is_base_of;

namespace MPPI_internal
{
// CRTP base class for all dynamics models; manages the host/device param copies.
template class Dynamics : public Managed
{
  static_assert(paramsInheritsFrom::value, "Dynamics PARAMS_T does not inherit from DynamicsParams");

public:
  // EIGEN_MAKE_ALIGNED_OPERATOR_NEW

  // dimensions derived from the params' enums
  static const int STATE_DIM = S_IND_CLASS(PARAMS_T, NUM_STATES);
  static const int CONTROL_DIM = C_IND_CLASS(PARAMS_T, NUM_CONTROLS);
  static const int OUTPUT_DIM = O_IND_CLASS(PARAMS_T, NUM_OUTPUTS);

  typedef CLASS_T DYN_T;
  typedef PARAMS_T DYN_PARAMS_T;

  /**
   * useful typedefs
   */
  typedef Eigen::Matrix control_array;  // Control at a time t
  typedef Eigen::Matrix state_array;    // State at a time t
  typedef Eigen::Matrix output_array;   // Output at a time t
  typedef Eigen::Matrix dfdx;             // Jacobian wrt x
  typedef Eigen::Matrix dfdu;             // Jacobian wrt u
  typedef Eigen::Matrix feedback_matrix;  // Feedback matrix
  typedef Eigen::Matrix Jacobian;         // Jacobian of x and u
  typedef std::map buffer_trajectory;

  // protected constructor prevent anyone from trying to construct a Dynamics
protected:
  /**
   * sets the default control ranges to -infinity and +infinity
   */
  Dynamics(cudaStream_t stream = 0) : Managed(stream)
  {
    setDefaultControlRanges();
  }

  /**
   * sets the control ranges to the passed in value
   * @param control_rngs
   * @param stream
   */
  Dynamics(std::array& control_rngs, cudaStream_t stream = 0) : Managed(stream)
  {
    setControlRanges(control_rngs);
  }

  Dynamics(PARAMS_T& params, std::array& control_rngs, cudaStream_t stream = 0) : Managed(stream)
  {
    setParams(params);
    setControlRanges(control_rngs);
  }

  Dynamics(PARAMS_T& params, cudaStream_t stream = 0) : Managed(stream)
  {
    setParams(params);
    setDefaultControlRanges();
  }

public:
  // This variable defines what the zero control is
  // For most systems, it should be zero but for things like a quadrotor,
  // it should be the command to hover
  control_array zero_control_ = control_array::Zero();

  /**
   * Destructor must be virtual so that children are properly
   * destroyed when called from a Dynamics reference
   */
  virtual ~Dynamics()
  {
    freeCudaMem();
  }

  virtual std::string getDynamicsModelName() const
  {
    return "Dynamics model name not set";
  }

  /**
   * Allocates all of the GPU memory
   */
  void GPUSetup();

  void setZeroControl(const Eigen::Ref& zero_control)
  {
    zero_control_ = zero_control;
  }

  control_array getZeroControl() const
  {
    return zero_control_;
  }

  std::array getControlRanges()
  {
    std::array result;
    for (int i = 0; i < CONTROL_DIM; i++)
    {
      result[i] = control_rngs_[i];
    }
    return result;
  }

  __host__ __device__ float2* getControlRangesRaw()
  {
    return control_rngs_;
  }

  void setControlRanges(std::array& control_rngs, bool synchronize = true);

  void setControlDeadbands(std::array& control_deadband, bool synchronize = true);

  // Updates params and, if GPU memory exists, mirrors them to the device copy.
  void setParams(const PARAMS_T& params)
  {
    params_ = params;
    if (GPUMemStatus_)
    {
      CLASS_T& derived = static_cast(*this);
      derived.paramsToDevice();
    }
  }

  __device__ __host__ PARAMS_T getParams()
  {
    return params_;
  }

  /*
   *
   */
  void freeCudaMem();

  /**
   *
   */
  void printState(float* state);

  /**
   *
   */
  // TODO should not assume it is going to cout, pass in stream
  void printParams();

  /**
   *
   */
  void paramsToDevice(bool synchronize = true);

  /**
   * loads the .npz at given path
   * @param model_path
   */
  void loadParams(const std::string& model_path);

  /**
   * updates the internals of the dynamics model
   * @param description
   * @param data
   */
  // TODO
generalize void updateModel(std::vector description, std::vector data); /** * compute the Jacobians with respect to state and control * * @param state input of current state, passed by reference * @param control input of currrent control, passed by reference * @param A output Jacobian wrt state, passed by reference * @param B output Jacobian wrt control, passed by reference */ bool computeGrad(const Eigen::Ref& state = state_array(), const Eigen::Ref& control = control_array(), Eigen::Ref A = dfdx(), Eigen::Ref B = dfdu()) { return false; } /** * enforces control constraints * @param state * @param control */ void enforceConstraints(Eigen::Ref state, Eigen::Ref control) { for (int i = 0; i < CONTROL_DIM; i++) { if (fabsf(control[i]) < this->control_deadband_[i]) { control[i] = this->zero_control_[i]; } else { control[i] += this->control_deadband_[i] * -mppi::math::sign(control[i]); } control[i] = fminf(fmaxf(this->control_rngs_[i].x, control[i]), this->control_rngs_[i].y); } } /** * updates the current state using s_der * @param s state * @param s_der */ DEPRECATED void updateState(Eigen::Ref state, Eigen::Ref state_der, const float dt) { CLASS_T* derived = static_cast(this); derived->updateState(state, state, state_der, dt); } void updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt) { next_state = state + state_der * dt; } void step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const Eigen::Ref& control, Eigen::Ref output, const float t, const float dt) { CLASS_T* derived = static_cast(this); derived->computeStateDeriv(state, control, state_der); derived->updateState(state, next_state, state_der, dt); derived->stateToOutput(next_state, output); } void stateToOutput(const Eigen::Ref& state, Eigen::Ref output) { // TODO this is a hack for (int i = 0; i < OUTPUT_DIM && i < STATE_DIM; i++) { output[i] = state[i]; } } __host__ __device__ void stateToOutput(const float* __restrict__ state, float* __restrict__ 
output); __host__ __device__ void outputToState(const float* __restrict__ output, float* __restrict__ state); state_array getZeroState() const; __host__ __device__ void getZeroState(float* state) const; /** * does a linear interpolation of states * @param state_1 * @param state_2 * @param alpha * @return */ state_array interpolateState(const Eigen::Ref& state_1, const Eigen::Ref& state_2, const float alpha) { return (1 - alpha) * state_1 + alpha * state_2; } /** * computes a specific state error * @param pred_state * @param true_state * @return */ state_array computeStateError(const Eigen::Ref& pred_state, const Eigen::Ref& true_state) { return pred_state - true_state; } /** * computes the section of the state derivative that comes form the dyanmics * @param state * @param control * @param state_der */ void computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der); /** * computes the parts of the state that are based off of kinematics * @param s state * @param s_der */ void computeKinematics(const Eigen::Ref& state, Eigen::Ref s_der){}; /** * computes the full state derivative by calling computeKinematics then computeDynamics * @param state * @param control * @param state_der */ void computeStateDeriv(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der) { CLASS_T* derived = static_cast(this); derived->computeKinematics(state, state_der); derived->computeDynamics(state, control, state_der); } /** * computes the section of the state derivative that comes form the dyanmics * @param state * @param control * @param state_der * @param theta_s shared memory that can be used when computation is computed across the same block */ __device__ void computeDynamics(float* state, float* control, float* state_der, float* theta_s = nullptr); /** * computes the parts of the state that are based off of kinematics * parallelized on X only * @param state * @param state_der */ __device__ void computeKinematics(float* state, float* 
state_der){}; /** * @param state * @param control * @param state_der * @param theta_s shared memory that can be used when computation is computed across the same block */ __device__ inline void computeStateDeriv(float* state, float* control, float* state_der, float* theta_s); __device__ inline void step(float* state, float* next_state, float* state_der, float* control, float* output, float* theta_s, const float t, const float dt); /** * applies the state derivative * @param state * @param state_der * @param dt */ DEPRECATED __device__ void updateState(float* state, float* state_der, const float dt) { CLASS_T* derived = static_cast(this); derived->updateState(state, state, state_der, dt); } __device__ void updateState(float* state, float* next_state, float* state_der, const float dt); /** * enforces control constraints */ __device__ void enforceConstraints(float* state, float* control); /** * Method to allow setup of dynamics on the GPU. This is needed for * initializing the memory of an LSTM for example */ void initializeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref output, float t_0, float dt) { for (int i = 0; i < OUTPUT_DIM && i < STATE_DIM; i++) { output[i] = state[i]; } } /** * Method to allow setup of dynamics on the GPU. This is needed for * initializing the memory of an LSTM for example */ __device__ void initializeDynamics(float* state, float* control, float* output, float* theta_s, float t_0, float dt) { for (int i = 0; i < OUTPUT_DIM && i < STATE_DIM; i++) { output[i] = state[i]; } } /** * Method to compute an emergency stopping control */ virtual void getStoppingControl(const Eigen::Ref& state, Eigen::Ref u) { u.setZero(); } /** * Method to enforce a leash on the initial state, which depends on type of dynamics. 
*/ virtual void enforceLeash(const Eigen::Ref& state_true, const Eigen::Ref& state_nominal, const Eigen::Ref& leash_values, Eigen::Ref state_output) { for (int i = 0; i < DYN_T::STATE_DIM; i++) { float diff = fabsf(state_nominal[i] - state_true[i]); if (leash_values[i] < diff) { float leash_dir = fminf(fmaxf(state_nominal[i] - state_true[i], -leash_values[i]), leash_values[i]); state_output[i] = state_true[i] + leash_dir; } else { state_output[i] = state_nominal[i]; } } } virtual bool checkRequiresBuffer() { return requires_buffer_; } bool updateFromBuffer(const buffer_trajectory& buffer) { return true; } bool checkIfKeysInBuffer(const buffer_trajectory& buffer, std::vector& needed_keys) { bool missing_key = false; for (const auto& key : needed_keys) { if (buffer.find(key) == buffer.end()) { // Print out all missing keys this->logger_->warning("Could not find key %s in init buffer for %s\n", key.c_str(), this->getDynamicsModelName().c_str()); missing_key = true; } } return missing_key; } bool checkIfKeysInMap(const std::map& map, std::vector& needed_keys) { bool missing_key = false; for (const auto& key : needed_keys) { if (map.find(key) == map.end()) { // Print out all missing keys this->logger_->warning("Could not find key %s in state map for %s\n", key.c_str(), this->getDynamicsModelName().c_str()); missing_key = true; } } return missing_key; } virtual state_array stateFromMap(const std::map& map) = 0; // control ranges [.x, .y] float2 control_rngs_[CONTROL_DIM]; float control_deadband_[CONTROL_DIM] = { 0.0f }; // device pointer, null on the device CLASS_T* model_d_ = nullptr; protected: // generic parameter structure PARAMS_T params_; bool requires_buffer_ = false; private: void setDefaultControlRanges(); }; #ifdef __CUDACC__ #include "dynamics.cu" #endif template const int Dynamics::STATE_DIM; template const int Dynamics::CONTROL_DIM; template const int Dynamics::OUTPUT_DIM; } // namespace MPPI_internal #endif // DYNAMICS_CUH_ 
================================================ FILE: include/mppi/dynamics/linear/linear.cu ================================================ /** * @file linear.cu * @brief Linear Dynamics * @author Bogdan Vlahov * @version * @date 2024-08-16 */ #include #include #include namespace mm = mppi::matrix_multiplication; namespace mp1 = mppi::p1; #define LINEAR_TEMPLATE template #define LINEAR_DYNAMICS LinearDynamicsImpl LINEAR_TEMPLATE LINEAR_DYNAMICS::LinearDynamicsImpl(cudaStream_t stream) : PARENT_CLASS(stream) { this->SHARED_MEM_REQUEST_GRD_BYTES = sizeof(PARAMS_T); } LINEAR_TEMPLATE LINEAR_DYNAMICS::LinearDynamicsImpl(PARAMS_T& params, cudaStream_t stream) : PARENT_CLASS(params, stream) { this->SHARED_MEM_REQUEST_GRD_BYTES = sizeof(PARAMS_T); } LINEAR_TEMPLATE bool LINEAR_DYNAMICS::computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B) { A = this->params_.getA(); B = this->params_.getB(); return true; } LINEAR_TEMPLATE LINEAR_DYNAMICS::state_array LINEAR_DYNAMICS::stateFromMap(const std::map& map) { state_array x = this->getZeroState(); for (int i = 0; i < this->STATE_DIM; i++) { std::string state_name = "x_" + std::to_string(i); x(i, 0) = map.at(state_name); } return x; } LINEAR_TEMPLATE void LINEAR_DYNAMICS::step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const Eigen::Ref& control, Eigen::Ref output, const float t, const float dt) { state_der = this->params_.getA() * state + this->params_.getB() * control; next_state = state + state_der * dt; this->stateToOutput(next_state, output); } LINEAR_TEMPLATE __device__ inline void LINEAR_DYNAMICS::step(float* state, float* next_state, float* state_der, float* control, float* output, float* theta_s, const float t, const float dt) { PARAMS_T* params_p = &(this->params_); if (this->getGrdSharedSizeBytes() != 0) { params_p = (PARAMS_T*)theta_s; } float* A = params_p->A; float* B = params_p->B; const mp1::Parallel1Dir P_DIR = mp1::Parallel1Dir::THREAD_Y; 
mm::gemm1STATE_DIM, this->STATE_DIM, 1, P_DIR>(A, state, state_der); mm::gemm1STATE_DIM, this->CONTROL_DIM, 1, P_DIR>(B, control, state_der, 1.0f, 1.0f); // __syncthreads(); int index, step; mp1::getParallel1DIndex(index, step); for (int i = index; i < this->STATE_DIM; i += step) { next_state[i] = state[i] + state_der[i] * dt; output[i] = next_state[i]; } } LINEAR_TEMPLATE __device__ void LINEAR_DYNAMICS::initializeDynamics(float* state, float* control, float* output, float* theta_s, float t_0, float dt) { PARENT_CLASS::initializeDynamics(state, control, output, theta_s, t_0, dt); if (this->getGrdSharedSizeBytes() != 0) { PARAMS_T* shared = (PARAMS_T*)theta_s; *shared = this->params_; } } #undef LINEAR_TEMPLATE #undef LINEAR_DYNAMICS ================================================ FILE: include/mppi/dynamics/linear/linear.cuh ================================================ /** * @file linear.cuh * @brief Linear Dynamics * @author Bogdan Vlahov * @version * @date 2024-08-16 */ #pragma once #include template struct LinearDynamicsParams : public DynamicsParams { public: using state_matrix = Eigen::Matrix; using control_matrix = Eigen::Matrix; enum class StateIndex : int { NUM_STATES = STATE_DIM, }; enum class ControlIndex : int { NUM_CONTROLS = CONTROL_DIM, }; enum class OutputIndex : int { NUM_OUTPUTS = STATE_DIM, }; float A[STATE_DIM * STATE_DIM] = { 0.0f }; float B[STATE_DIM * CONTROL_DIM] = { 0.0f }; LinearDynamicsParams() = default; ~LinearDynamicsParams() = default; void setA(const Eigen::Ref& A_eigen) { memcpy(A, A_eigen.data(), sizeof(float) * STATE_DIM * STATE_DIM); } void setB(const Eigen::Ref& B_eigen) { memcpy(B, B_eigen.data(), sizeof(float) * STATE_DIM * CONTROL_DIM); } float* getA() const { return A; } float* getB() const { return B; } Eigen::Map getA() { Eigen::Map A_eigen(A); return A_eigen; } Eigen::Map getB() { Eigen::Map B_eigen(B); return B_eigen; } }; using namespace MPPI_internal; template > class LinearDynamicsImpl : public Dynamics { public: 
using PARENT_CLASS = Dynamics; using state_array = typename PARENT_CLASS::state_array; using control_array = typename PARENT_CLASS::control_array; using output_array = typename PARENT_CLASS::output_array; using dfdx = typename PARENT_CLASS::dfdx; using dfdu = typename PARENT_CLASS::dfdu; using PARENT_CLASS::initializeDynamics; LinearDynamicsImpl(cudaStream_t stream = nullptr); LinearDynamicsImpl(PARAMS_T& params, cudaStream_t stream = nullptr); std::string getDynamicsModelName() const override { return "Linear Dynamics"; } bool computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B); state_array stateFromMap(const std::map& map); void step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const Eigen::Ref& control, Eigen::Ref output, const float t, const float dt); void setA(const Eigen::Ref& A_eigen, bool synchronize = false) { this->params_.setA(A_eigen); this->paramsToDevice(synchronize); } void setB(const Eigen::Ref& B_eigen, bool synchronize = false) { this->params_.setB(B_eigen); this->paramsToDevice(synchronize); } __device__ inline void step(float* state, float* next_state, float* state_der, float* control, float* output, float* theta_s, const float t, const float dt); __device__ void initializeDynamics(float* state, float* control, float* output, float* theta_s, float t_0, float dt); }; template class LinearDynamics : public LinearDynamicsImpl, LinearDynamicsParams> { public: using PARENT_CLASS = LinearDynamicsImpl, LinearDynamicsParams>; using DYN_PARAMS_T = typename PARENT_CLASS::DYN_PARAMS_T; LinearDynamics(cudaStream_t stream = nullptr) : PARENT_CLASS(stream) { } LinearDynamics(DYN_PARAMS_T& params, cudaStream_t stream = nullptr) : PARENT_CLASS(params, stream) { } }; #ifdef __CUDACC__ #include "linear.cu" #endif ================================================ FILE: include/mppi/dynamics/quadrotor/quadrotor_dynamics.cu ================================================ #include #include 
QuadrotorDynamics::QuadrotorDynamics(std::array control_rngs, cudaStream_t stream) : PARENT_CLASS(control_rngs, stream) { this->params_ = QuadrotorDynamicsParams(); zero_control_[3] = mppi::math::GRAVITY; } QuadrotorDynamics::QuadrotorDynamics(cudaStream_t stream) : PARENT_CLASS(stream) { this->params_ = QuadrotorDynamicsParams(); float2 thrust_rng; thrust_rng.x = 0; thrust_rng.y = 36; // TODO Figure out if this is a reasonable amount of thrust this->control_rngs_[3] = thrust_rng; this->zero_control_[3] = mppi::math::GRAVITY; } void QuadrotorDynamics::printState(float* state) { int precision = 4; int total_char = precision + 4; printf("Pos x: %*.*f, y: %*.*f, z: %*.*f\n", total_char, precision, state[0], total_char, precision, state[1], total_char, precision, state[2]); printf("Vel x: %*.*f, y: %*.*f, z: %*.*f\n", total_char, precision, state[3], total_char, precision, state[4], total_char, precision, state[5]); printf("Quat w: %*.*f, x: %*.*f, y: %*.*f, z: %*.*f\n", total_char, precision, state[6], total_char, precision, state[7], total_char, precision, state[8], total_char, precision, state[9]); printf("Ang Vel x: %*.*f, y: %*.*f, z: %*.*f\n", total_char, precision, state[10], total_char, precision, state[11], total_char, precision, state[12]); } bool QuadrotorDynamics::computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B) { Eigen::Quaternionf q(state[6], state[7], state[8], state[9]); Eigen::Matrix3f dcm_lb; // dcm_lb = q.toRotationMatrix(); mppi::math::Quat2DCM(q, dcm_lb); // I can do the math for everything except quaternions A.setZero(); // x_d A.block<3, 3>(0, 3).setIdentity(); // v_d TODO figure out derivative of v_d wrt q // q_d TODO figure out derivative of q_d wrt q // w_d Eigen::Vector3f tau_inv; tau_inv << 1 / this->params_.tau_roll, 1 / this->params_.tau_pitch, 1 / this->params_.tau_yaw; A.block<3, 3>(10, 10) = -1 * tau_inv.asDiagonal(); B.setZero(); // x_d is empty // v_d B.block<3, 1>(3, 3) = dcm_lb.col(2) / 
this->params_.mass; // q_d using omega2edot as reference B.block<4, 3>(6, 0) << -0.5 * q.x(), -0.5 * q.y(), -0.5 * q.z(), 0.5 * q.w(), -0.5 * q.z(), 0.5 * q.y(), 0.5 * q.z(), 0.5 * q.w(), -0.5 * q.x(), -0.5 * q.y(), 0.5 * q.x(), 0.5 * q.w(); // w_d B.block<3, 3>(10, 0) = tau_inv.asDiagonal(); return false; } void QuadrotorDynamics::computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der) { // Fill in state_der.block<3, 1>(0, 0); Eigen::Vector3f x_d, v_d, angular_speed_d, u_pqr; Eigen::Matrix angular_speed, v; Eigen::Quaternionf q_d; Eigen::Matrix tau_inv; float u_thrust = control[C_INDEX(THRUST)]; // Assume quaterion is w, x, y, z in state array Eigen::Quaternionf q(state[6], state[7], state[8], state[9]); u_pqr << control[C_INDEX(ANG_RATE_X)], control[C_INDEX(ANG_RATE_Y)], control[C_INDEX(ANG_RATE_Z)]; v = state.block<3, 1>(3, 0); angular_speed << state(10), state(11), state(12); tau_inv << 1 / this->params_.tau_roll, 1 / this->params_.tau_pitch, 1 / this->params_.tau_yaw; Eigen::Matrix3f dcm_lb = Eigen::Matrix3f::Identity(); // x_d = v x_d = v; // v_d = Lvb * [0 0 T]' + g mppi::math::Quat2DCM(q, dcm_lb); v_d = (u_thrust / this->params_.mass) * dcm_lb.col(2); v_d(2) -= mppi::math::GRAVITY; // q_d = H(q) w mppi::math::omega2edot(angular_speed(0), angular_speed(1), angular_speed(2), q, q_d); // w_d = (u_pqr - w)/ tau // Note we assume that a low level controller makes angular velocity tracking // a first order system angular_speed_d = tau_inv.cwiseProduct(u_pqr - angular_speed); // Copy into state_deriv state_der.block<3, 1>(0, 0) = x_d; state_der.block<3, 1>(3, 0) = v_d; state_der(6) = q_d.w(); state_der(7) = q_d.x(); state_der(8) = q_d.y(); state_der(9) = q_d.z(); state_der.block<3, 1>(10, 0) = angular_speed_d; } void QuadrotorDynamics::updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt) { PARENT_CLASS::updateState(state, next_state, state_der, dt); // Renormalize quaternion 
Eigen::Quaternionf q(next_state[6], next_state[7], next_state[8], next_state[9]); next_state.block<4, 1>(6, 0) /= q.norm() * copysign(1.0, q.w()); } __device__ void QuadrotorDynamics::computeDynamics(float* state, float* control, float* state_der, float* theta) { // Fill in float* v = state + 3; float* q = state + 6; float* w = state + 10; // Derivatives float* x_d = state_der; float* v_d = state_der + 3; float* q_d = state_der + 6; float* w_d = state_der + 10; float* u_pqr = control; float u_thrust = control[C_INDEX(THRUST)]; float dcm_lb[3][3]; int i; // x_d = v for (i = threadIdx.y; i < 3; i += blockDim.y) { x_d[i] = v[i]; } // v_d = Lvb * [0 0 T]' + g mppi::math::Quat2DCM(q, dcm_lb); for (i = threadIdx.y; i < 3; i += blockDim.y) { v_d[i] = (u_thrust / this->params_.mass) * dcm_lb[i][2]; } __syncthreads(); if (threadIdx.y == 0) { v_d[2] -= mppi::math::GRAVITY; } // q_d = H(q) w mppi::math::omega2edot(w[0], w[1], w[2], q, q_d); // w_d = (u - w) / tau w_d[0] = (u_pqr[0] - w[0]) / this->params_.tau_roll; w_d[1] = (u_pqr[1] - w[1]) / this->params_.tau_pitch; w_d[2] = (u_pqr[2] - w[2]) / this->params_.tau_yaw; __syncthreads(); } __device__ void QuadrotorDynamics::updateState(float* state, float* next_state, float* state_der, float dt) { PARENT_CLASS::updateState(state, next_state, state_der, dt); int i = 0; // renormalze quaternion float* q = next_state + 6; float q_norm = sqrtf(q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3]); for (i = threadIdx.y; i < 4; i += blockDim.y) { q[i] /= q_norm * copysignf(1.0, q[0]); } // __syncthreads(); } QuadrotorDynamics::state_array QuadrotorDynamics::stateFromMap(const std::map& map) { state_array s; s(S_INDEX(POS_X)) = map.at("POS_X"); s(S_INDEX(POS_Y)) = map.at("POS_Y"); s(S_INDEX(POS_Z)) = map.at("POS_Z"); s(S_INDEX(VEL_X)) = map.at("VEL_X"); s(S_INDEX(VEL_Y)) = map.at("VEL_Y"); s(S_INDEX(VEL_Z)) = map.at("VEL_Z"); s(S_INDEX(QUAT_X)) = map.at("Q_X"); s(S_INDEX(QUAT_Y)) = map.at("Q_Y"); s(S_INDEX(QUAT_Z)) = map.at("Q_Z"); 
s(S_INDEX(QUAT_W)) = map.at("Q_W"); s(S_INDEX(ANG_VEL_X)) = map.at("OMEGA_X"); s(S_INDEX(ANG_VEL_Y)) = map.at("OMEGA_Y"); s(S_INDEX(ANG_VEL_Z)) = map.at("OMEGA_Z"); return QuadrotorDynamics::state_array(); } QuadrotorDynamics::state_array QuadrotorDynamics::getZeroState() const { state_array zero = state_array::Zero(); zero[S_INDEX(QUAT_W)] = 1.0f; return zero; } __host__ __device__ void QuadrotorDynamics::getZeroState(float* state) const { int p_index, step; mppi::p1::getParallel1DIndex(p_index, step); for (int i = p_index; i < STATE_DIM; i += step) { if (i == S_INDEX(QUAT_W)) { state[i] = 1.0f; } else { state[i] = 0.0f; } } } ================================================ FILE: include/mppi/dynamics/quadrotor/quadrotor_dynamics.cuh ================================================ /* * Created on Tue Jun 02 2020 by Bogdan Vlahov * */ #ifndef QUADROTOR_DYNAMICS_CUH_ #define QUADROTOR_DYNAMICS_CUH_ #include struct QuadrotorDynamicsParams : public DynamicsParams { enum class StateIndex : int { POS_X = 0, POS_Y, POS_Z, VEL_X, VEL_Y, VEL_Z, QUAT_W, QUAT_X, QUAT_Y, QUAT_Z, ANG_VEL_X, ANG_VEL_Y, ANG_VEL_Z, NUM_STATES }; enum class ControlIndex : int { ANG_RATE_X = 0, ANG_RATE_Y, ANG_RATE_Z, THRUST, NUM_CONTROLS }; enum class OutputIndex : int { POS_X = 0, POS_Y, POS_Z, VEL_X, VEL_Y, VEL_Z, QUAT_W, QUAT_X, QUAT_Y, QUAT_Z, ANG_VEL_X, ANG_VEL_Y, ANG_VEL_Z, NUM_OUTPUTS }; float tau_roll = 0.25; float tau_pitch = 0.25; float tau_yaw = 0.25; float mass = 1; // kg QuadrotorDynamicsParams(float mass_in) : mass(mass_in){}; QuadrotorDynamicsParams() = default; ~QuadrotorDynamicsParams() = default; }; using namespace MPPI_internal; class QuadrotorDynamics : public Dynamics { /** * State for this class is defined as follows: * x - position in 3D space (x, y, z) - meters * v - velocity in 3D space (v_x, v_y_ v_z) - meters/sec * q - quaternion (q_w, q_x, q_y, q_z) * w - angular velocities (roll_dot, pitch_dot, yaw_dot) - rad/sec * * Coordinate Frame is NWU * * Control: * roll_rate - 
rad/sec * pitch_rate - rad/sec * yaw_rate - rad/sec * thrust - Newtons */ public: using PARENT_CLASS = Dynamics; using state_array = typename PARENT_CLASS::state_array; using control_array = typename PARENT_CLASS::control_array; using dfdx = typename PARENT_CLASS::dfdx; using dfdu = typename PARENT_CLASS::dfdu; // Constructor QuadrotorDynamics(cudaStream_t stream = 0); QuadrotorDynamics(std::array control_rngs, cudaStream_t stream = 0); using PARENT_CLASS::updateState; // needed as overloading updateState here hides all parent versions of updateState std::string getDynamicsModelName() const override { return "Quadrotor Model"; } void computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der); bool computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B); void updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt); void printState(float* state); __device__ void computeDynamics(float* state, float* control, float* state_der, float* theta = nullptr); __device__ void updateState(float* state, float* next_state, float* state_der, const float dt); state_array stateFromMap(const std::map& map) override; __host__ __device__ void getZeroState(float* state) const; state_array getZeroState() const; }; #if __CUDACC__ #include "quadrotor_dynamics.cu" #endif #endif // QUADROTOR_DYNAMICS_CUH_ ================================================ FILE: include/mppi/dynamics/racer_dubins/racer_dubins.cu ================================================ #include #include template void RacerDubinsImpl::computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der) { bool enable_brake = control(C_INDEX(THROTTLE_BRAKE)) < 0; state_der(S_INDEX(BRAKE_STATE)) = fminf(fmaxf((enable_brake * -control(C_INDEX(THROTTLE_BRAKE)) - state(S_INDEX(BRAKE_STATE))) * this->params_.brake_delay_constant, -this->params_.max_brake_rate_neg), this->params_.max_brake_rate_pos); // 
applying position throttle state_der(S_INDEX(VEL_X)) = (!enable_brake) * this->params_.c_t[0] * control(C_INDEX(THROTTLE_BRAKE)) * this->params_.gear_sign + this->params_.c_b[0] * state(S_INDEX(BRAKE_STATE)) * (state(S_INDEX(VEL_X)) >= 0 ? -1 : 1) - this->params_.c_v[0] * state(S_INDEX(VEL_X)) + this->params_.c_0; state_der(S_INDEX(YAW)) = (state(S_INDEX(VEL_X)) / this->params_.wheel_base) * tan(state(S_INDEX(STEER_ANGLE)) / this->params_.steer_angle_scale); float yaw, sin_yaw, cos_yaw; yaw = state[S_INDEX(YAW)]; sincosf(yaw, &sin_yaw, &cos_yaw); state_der(S_INDEX(POS_X)) = state(S_INDEX(VEL_X)) * cos_yaw; state_der(S_INDEX(POS_Y)) = state(S_INDEX(VEL_X)) * sin_yaw; state_der(S_INDEX(STEER_ANGLE)) = (control(C_INDEX(STEER_CMD)) * this->params_.steer_command_angle_scale - state(S_INDEX(STEER_ANGLE))) * this->params_.steering_constant; state_der(S_INDEX(STEER_ANGLE)) = fmaxf(fminf(state_der(S_INDEX(STEER_ANGLE)), this->params_.max_steer_rate), -this->params_.max_steer_rate); } template bool RacerDubinsImpl::computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B) { return false; } template void RacerDubinsImpl::updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt) { // Segmented it to ensure that roll and pitch don't get overwritten for (int i = 0; i < 6; i++) { next_state[i] = state[i] + state_der[i] * dt; } next_state(S_INDEX(YAW)) = angle_utils::normalizeAngle(next_state(S_INDEX(YAW))); next_state(S_INDEX(STEER_ANGLE)) = fmaxf(fminf(next_state(S_INDEX(STEER_ANGLE)), this->params_.max_steer_angle), -this->params_.max_steer_angle); next_state(S_INDEX(STEER_ANGLE_RATE)) = state_der(S_INDEX(STEER_ANGLE)); next_state(S_INDEX(BRAKE_STATE)) = fminf(fmaxf(next_state(S_INDEX(BRAKE_STATE)), 0.0f), -this->control_rngs_[C_INDEX(THROTTLE_BRAKE)].x); } template RacerDubinsImpl::state_array RacerDubinsImpl::interpolateState( const Eigen::Ref state_1, const Eigen::Ref state_2, const float alpha) { 
state_array result = (1 - alpha) * state_1 + alpha * state_2; result(S_INDEX(YAW)) = angle_utils::interpolateEulerAngleLinear(state_1(S_INDEX(YAW)), state_2(S_INDEX(YAW)), alpha); return result; } template __device__ void RacerDubinsImpl::updateState(float* state, float* next_state, float* state_der, const float dt) { int i; int tdy = threadIdx.y; // Add the state derivative time dt to the current state. // printf("updateState thread %d, %d = %f, %f\n", threadIdx.x, threadIdx.y, state[0], state_der[0]); for (i = tdy; i < 6; i += blockDim.y) { next_state[i] = state[i] + state_der[i] * dt; if (i == S_INDEX(YAW)) { next_state[i] = angle_utils::normalizeAngle(next_state[i]); } if (i == S_INDEX(STEER_ANGLE)) { next_state[i] = fmaxf(fminf(next_state[i], this->params_.max_steer_angle), -this->params_.max_steer_angle); next_state[S_INDEX(STEER_ANGLE_RATE)] = state_der[i]; } if (i == S_INDEX(BRAKE_STATE)) { next_state[i] = fminf(fmaxf(next_state[i], 0.0f), 1.0f); } } } template Eigen::Quaternionf RacerDubinsImpl::attitudeFromState(const Eigen::Ref& state) { float theta = state[S_INDEX(YAW)]; return Eigen::Quaternionf(cos(theta / 2), 0, 0, sin(theta / 2)); } template Eigen::Vector3f RacerDubinsImpl::positionFromState(const Eigen::Ref& state) { return Eigen::Vector3f(state[S_INDEX(POS_X)], state[S_INDEX(POS_Y)], 0); } template Eigen::Vector3f RacerDubinsImpl::velocityFromState(const Eigen::Ref& state) { return Eigen::Vector3f(state[S_INDEX(VEL_X)], 0, 0); } template Eigen::Vector3f RacerDubinsImpl::angularRateFromState(const Eigen::Ref& state) { return Eigen::Vector3f(0, 0, 0); // TODO compute yaw rate from steering angle & vel } template RacerDubinsImpl::state_array RacerDubinsImpl::stateFromOdometry( const Eigen::Quaternionf& q, const Eigen::Vector3f& pos, const Eigen::Vector3f& vel, const Eigen::Vector3f& omega) { state_array s; s.setZero(); s[S_INDEX(POS_X)] = pos[0]; s[S_INDEX(POS_Y)] = pos[1]; s[S_INDEX(VEL_X)] = vel[0]; float _roll, _pitch, yaw; 
mppi::math::Quat2EulerNWU(q, _roll, _pitch, yaw); s[S_INDEX(YAW)] = yaw; return s; } template __device__ void RacerDubinsImpl::computeDynamics(float* state, float* control, float* state_der, float* theta_s) { bool enable_brake = control[C_INDEX(THROTTLE_BRAKE)] < 0; state_der[S_INDEX(BRAKE_STATE)] = fminf(fmaxf((enable_brake * -control[C_INDEX(THROTTLE_BRAKE)] - state[S_INDEX(BRAKE_STATE)]) * this->params_.brake_delay_constant, -this->params_.max_brake_rate_neg), this->params_.max_brake_rate_pos); // applying position throttle state_der[S_INDEX(VEL_X)] = (!enable_brake) * this->params_.c_t[0] * control[0] * this->params_.gear_sign + this->params_.c_b[0] * state[S_INDEX(BRAKE_STATE)] * (state[S_INDEX(VEL_X)] >= 0 ? -1 : 1) - this->params_.c_v[0] * state[S_INDEX(VEL_X)] + this->params_.c_0; state_der[S_INDEX(YAW)] = (state[S_INDEX(VEL_X)] / this->params_.wheel_base) * tan(state[S_INDEX(STEER_ANGLE)] / this->params_.steer_angle_scale); float yaw, sin_yaw, cos_yaw; yaw = angle_utils::normalizeAngle(state[S_INDEX(YAW)]); __sincosf(yaw, &sin_yaw, &cos_yaw); state_der[S_INDEX(POS_X)] = state[S_INDEX(VEL_X)] * cos_yaw; state_der[S_INDEX(POS_Y)] = state[S_INDEX(VEL_X)] * sin_yaw; state_der[S_INDEX(STEER_ANGLE)] = fmaxf(fminf((control[1] * this->params_.steer_command_angle_scale - state[S_INDEX(STEER_ANGLE)]) * this->params_.steering_constant, this->params_.max_steer_rate), -this->params_.max_steer_rate); } template void RacerDubinsImpl::getStoppingControl(const Eigen::Ref& state, Eigen::Ref u) { u[0] = -1.0; // full brake u[1] = 0.0; // no steering } template void RacerDubinsImpl::enforceLeash(const Eigen::Ref& state_true, const Eigen::Ref& state_nominal, const Eigen::Ref& leash_values, Eigen::Ref state_output) { state_output = state_true; // update state_output for leash, need to handle x and y positions specially, convert to body frame and leash in body // frame. 
transform x and y into body frame float dx = state_nominal[S_INDEX(POS_X)] - state_true[S_INDEX(POS_X)]; float dy = state_nominal[S_INDEX(POS_Y)] - state_true[S_INDEX(POS_Y)]; float dx_body = dx * cos(state_true[S_INDEX(YAW)]) + dy * sin(state_true[S_INDEX(YAW)]); float dy_body = -dx * sin(state_true[S_INDEX(YAW)]) + dy * cos(state_true[S_INDEX(YAW)]); // determine leash in body frame float y_leash = leash_values[S_INDEX(POS_Y)]; float x_leash = leash_values[S_INDEX(POS_X)]; dx_body = fminf(fmaxf(dx_body, -x_leash), x_leash); dy_body = fminf(fmaxf(dy_body, -y_leash), y_leash); // transform back to map frame dx = dx_body * cos(state_true[S_INDEX(YAW)]) + -dy_body * sin(state_true[S_INDEX(YAW)]); dy = dx_body * sin(state_true[S_INDEX(YAW)]) + dy_body * cos(state_true[S_INDEX(YAW)]); // apply leash state_output[S_INDEX(POS_X)] += dx; state_output[S_INDEX(POS_Y)] += dy; // handle leash for rest of states float diff; for (int i = 0; i < PARENT_CLASS::STATE_DIM; i++) { // use body x and y for leash if (i == S_INDEX(POS_X) || i == S_INDEX(POS_Y)) { // handle outside for loop continue; } else if (i == S_INDEX(YAW)) { diff = angle_utils::shortestAngularDistance(state_true[i], state_nominal[i]); } else { diff = state_nominal[i] - state_true[i]; } if (leash_values[i] < fabsf(diff)) { float leash_dir = fminf(fmaxf(diff, -leash_values[i]), leash_values[i]); state_output[i] = state_true[i] + leash_dir; if (i == S_INDEX(YAW)) { state_output[i] = angle_utils::normalizeAngle(state_output[i]); } } else { state_output[i] = state_nominal[i]; } } } template RacerDubinsImpl::state_array RacerDubinsImpl::stateFromMap(const std::map& map) { state_array s = state_array::Zero(); if (map.find("VEL_X") == map.end() || map.find("VEL_Y") == map.end() || map.find("POS_X") == map.end() || map.find("POS_Y") == map.end()) { return s; } s(S_INDEX(POS_X)) = map.at("POS_X"); s(S_INDEX(POS_Y)) = map.at("POS_Y"); s(S_INDEX(VEL_X)) = map.at("VEL_X"); s(S_INDEX(YAW)) = map.at("YAW"); if 
(map.find("STEER_ANGLE") != map.end()) { s(S_INDEX(STEER_ANGLE)) = map.at("STEER_ANGLE"); s(S_INDEX(STEER_ANGLE_RATE)) = map.at("STEER_ANGLE_RATE"); } else { s(S_INDEX(STEER_ANGLE)) = 0; s(S_INDEX(STEER_ANGLE_RATE)) = 0; } if (map.find("BRAKE_STATE") != map.end()) { s(S_INDEX(BRAKE_STATE)) = map.at("BRAKE_STATE"); } else if (map.find("BRAKE_CMD") != map.end()) { s(S_INDEX(BRAKE_STATE)) = map.at("BRAKE_CMD"); } else { s(S_INDEX(BRAKE_STATE)) = 0; } return s; } template __device__ void RacerDubinsImpl::computeParametricDelayDeriv(float* state, float* control, float* state_der, PARAMS_T* params_p) { bool enable_brake = control[C_INDEX(THROTTLE_BRAKE)] < 0.0f; const float brake_error = (enable_brake * -control[C_INDEX(THROTTLE_BRAKE)] - state[S_INDEX(BRAKE_STATE)]); // Compute dynamics state_der[S_INDEX(BRAKE_STATE)] = fminf(fmaxf((brake_error > 0) * brake_error * params_p->brake_delay_constant + (brake_error < 0) * brake_error * params_p->brake_delay_constant_neg, -params_p->max_brake_rate_neg), params_p->max_brake_rate_pos); } template __device__ void RacerDubinsImpl::computeParametricSteerDeriv(float* state, float* control, float* state_der, PARAMS_T* params_p) { state_der[S_INDEX(STEER_ANGLE)] = fmaxf(fminf((control[C_INDEX(STEER_CMD)] * params_p->steer_command_angle_scale - state[S_INDEX(STEER_ANGLE)]) * params_p->steering_constant, params_p->max_steer_rate), -params_p->max_steer_rate); } template void RacerDubinsImpl::computeParametricDelayDeriv(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der) { bool enable_brake = control(C_INDEX(THROTTLE_BRAKE)) < 0.0f; const float brake_error = (enable_brake * -control(C_INDEX(THROTTLE_BRAKE)) - state(S_INDEX(BRAKE_STATE))); state_der(S_INDEX(BRAKE_STATE)) = fminf(fmaxf((brake_error > 0) * brake_error * this->params_.brake_delay_constant + (brake_error < 0) * brake_error * this->params_.brake_delay_constant_neg, -this->params_.max_brake_rate_neg), this->params_.max_brake_rate_pos); } template void 
RacerDubinsImpl::computeParametricSteerDeriv(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der) { state_der(S_INDEX(STEER_ANGLE)) = (control(C_INDEX(STEER_CMD)) * this->params_.steer_command_angle_scale - state(S_INDEX(STEER_ANGLE))) * this->params_.steering_constant; state_der(S_INDEX(STEER_ANGLE)) = fmaxf(fminf(state_der(S_INDEX(STEER_ANGLE)), this->params_.max_steer_rate), -this->params_.max_steer_rate); } template __host__ __device__ void RacerDubinsImpl::setOutputs(const float* state_der, const float* next_state, float* output) { // Setup output output[O_INDEX(BASELINK_VEL_B_X)] = next_state[S_INDEX(VEL_X)]; output[O_INDEX(BASELINK_VEL_B_Y)] = 0.0f; output[O_INDEX(BASELINK_VEL_B_Z)] = 0.0f; output[O_INDEX(BASELINK_POS_I_X)] = next_state[S_INDEX(POS_X)]; output[O_INDEX(BASELINK_POS_I_Y)] = next_state[S_INDEX(POS_Y)]; output[O_INDEX(PITCH)] = next_state[S_INDEX(PITCH)]; output[O_INDEX(ROLL)] = next_state[S_INDEX(ROLL)]; output[O_INDEX(YAW)] = next_state[S_INDEX(YAW)]; output[O_INDEX(STEER_ANGLE)] = next_state[S_INDEX(STEER_ANGLE)]; output[O_INDEX(STEER_ANGLE_RATE)] = next_state[S_INDEX(STEER_ANGLE_RATE)]; output[O_INDEX(WHEEL_FORCE_B_FL)] = 10000.0f; output[O_INDEX(WHEEL_FORCE_B_FR)] = 10000.0f; output[O_INDEX(WHEEL_FORCE_B_RL)] = 10000.0f; output[O_INDEX(WHEEL_FORCE_B_RR)] = 10000.0f; output[O_INDEX(ACCEL_X)] = state_der[S_INDEX(VEL_X)]; output[O_INDEX(ACCEL_Y)] = 0.0f; output[O_INDEX(OMEGA_Z)] = state_der[S_INDEX(YAW)]; output[O_INDEX(TOTAL_VELOCITY)] = fabsf(next_state[S_INDEX(VEL_X)]); } template __device__ __host__ void RACER::computeStaticSettling(TEX_T* tex_helper, const float yaw, const float x, const float y, float& roll, float& pitch, float* output) { float height = 0.0f; float3 body_front_left = make_float3(2.981f, 0.737f, 0.0f); float3 body_front_right = make_float3(2.981f, -0.737f, 0.f); float3 body_rear_left = make_float3(0.0f, 0.737f, 0.0f); float3 body_rear_right = make_float3(0.0f, -0.737f, 0.0f); float3 body_pose = 
make_float3(x, y, 0.0f); float3 rotation = make_float3(roll, pitch, yaw); float3 front_left, front_right, rear_left, rear_right; mppi::math::bodyOffsetToWorldPoseEuler(body_front_left, body_pose, rotation, front_left); mppi::math::bodyOffsetToWorldPoseEuler(body_front_right, body_pose, rotation, front_right); mppi::math::bodyOffsetToWorldPoseEuler(body_rear_left, body_pose, rotation, rear_left); mppi::math::bodyOffsetToWorldPoseEuler(body_rear_right, body_pose, rotation, rear_right); // front_left = make_float3(front_left.x * cosf(yaw) - front_left.y * sinf(yaw) + x, // front_left.x * sinf(yaw) + front_left.y * cosf(yaw) + y, 0.0f); // front_right = make_float3(front_right.x * cosf(yaw) - front_right.y * sinf(yaw) + x, // front_right.x * sinf(yaw) + front_right.y * cosf(yaw) + y, 0.0f); // rear_left = make_float3(rear_left.x * cosf(yaw) - rear_left.y * sinf(yaw) + x, // rear_left.x * sinf(yaw) + rear_left.y * cosf(yaw) + y, 0.0f); // rear_right = make_float3(rear_right.x * cosf(yaw) - rear_right.y * sinf(yaw) + x, // rear_right.x * sinf(yaw) + rear_right.y * cosf(yaw) + y, 0.0f); float front_left_height = 0.0f; float front_right_height = 0.0f; float rear_left_height = 0.0f; float rear_right_height = 0.0f; if (tex_helper->checkTextureUse(0)) { front_left_height = tex_helper->queryTextureAtWorldPose(0, front_left); front_right_height = tex_helper->queryTextureAtWorldPose(0, front_right); rear_left_height = tex_helper->queryTextureAtWorldPose(0, rear_left); rear_right_height = tex_helper->queryTextureAtWorldPose(0, rear_right); float front_diff = front_left_height - front_right_height; front_diff = fmaxf(fminf(front_diff, 0.736f * 2.0f), -0.736f * 2.0f); float rear_diff = rear_left_height - rear_right_height; rear_diff = fmaxf(fminf(rear_diff, 0.736f * 2.0f), -0.736f * 2.0f); float front_roll = asinf(front_diff / (0.737f * 2.0f)); float rear_roll = asinf(rear_diff / (0.737f * 2.0f)); roll = (front_roll + rear_roll) / 2.0f; float left_diff = rear_left_height - 
front_left_height; left_diff = fmaxf(fminf(left_diff, 2.98f), -2.98f); float right_diff = rear_right_height - front_right_height; right_diff = fmaxf(fminf(right_diff, 2.98f), -2.98f); float left_pitch = asinf((left_diff) / 2.981f); float right_pitch = asinf((right_diff) / 2.981f); pitch = (left_pitch + right_pitch) / 2.0f; height = (rear_left_height + rear_right_height) / 2.0f; // using 2pi so any rotation that accidently uses this will be using identity if (!isfinite(roll) || fabsf(roll) > static_cast(M_PI)) { roll = static_cast(2.0 * M_PI); } if (!isfinite(pitch) || fabsf(pitch) > static_cast(M_PI)) { pitch = static_cast(2.0 * M_PI); } if (!isfinite(height)) { height = 0.0f; } } else { roll = 0.0f; pitch = 0.0f; height = 0.0f; } output[E_INDEX(OUTPUT_T, BASELINK_POS_I_Z)] = height; } ================================================ FILE: include/mppi/dynamics/racer_dubins/racer_dubins.cuh ================================================ #ifndef MPPIGENERIC_RACER_DUBINS_CUH #define MPPIGENERIC_RACER_DUBINS_CUH #include #include namespace RACER { template __device__ __host__ static void computeStaticSettling(TEX_T* tex_helper, const float yaw, const float x, const float y, float& roll, float& pitch, float* output); }; struct RacerDubinsParams : public DynamicsParams { enum class StateIndex : int { VEL_X = 0, YAW, POS_X, POS_Y, STEER_ANGLE, BRAKE_STATE, STEER_ANGLE_RATE, NUM_STATES }; enum class ControlIndex : int { THROTTLE_BRAKE = 0, STEER_CMD, NUM_CONTROLS }; enum class OutputIndex : int { BASELINK_VEL_B_X = 0, BASELINK_VEL_B_Y, BASELINK_POS_I_X, BASELINK_POS_I_Y, BASELINK_POS_I_Z, YAW, ROLL, PITCH, STEER_ANGLE, STEER_ANGLE_RATE, WHEEL_FORCE_UP_MAX, WHEEL_FORCE_FWD_MAX, WHEEL_FORCE_SIDE_MAX, // WHEEL_FORCE_UP_FL, // WHEEL_FORCE_UP_FR, // WHEEL_FORCE_UP_RL, // WHEEL_FORCE_UP_RR, // WHEEL_FORCE_FWD_FL, // WHEEL_FORCE_FWD_FR, // WHEEL_FORCE_FWD_RL, // WHEEL_FORCE_FWD_RR, // WHEEL_FORCE_SIDE_FL, // WHEEL_FORCE_SIDE_FR, // WHEEL_FORCE_SIDE_RL, // WHEEL_FORCE_SIDE_RR, 
ACCEL_X, ACCEL_Y, OMEGA_Z, TOTAL_VELOCITY, UNCERTAINTY_POS_X, UNCERTAINTY_POS_Y, UNCERTAINTY_YAW, UNCERTAINTY_VEL_X, UNCERTAINTY_POS_X_Y, UNCERTAINTY_POS_X_YAW, UNCERTAINTY_POS_X_VEL_X, UNCERTAINTY_POS_Y_YAW, UNCERTAINTY_POS_Y_VEL_X, UNCERTAINTY_YAW_VEL_X, FILLER_1, NUM_OUTPUTS }; // engine model component float c_t[3] = { 1.3, 2.6, 3.9 }; float c_b[3] = { 2.5, 3.5, 4.5 }; float c_v[3] = { 3.7, 4.7, 5.7 }; float c_0 = 4.9; // steering component float steering_constant = .6; float steer_command_angle_scale = 5; float steer_angle_scale = -9.1; float max_steer_angle = 0.5; float max_steer_rate = 5; float steer_accel_constant = 12.1f; float steer_accel_drag_constant = 1.0f; // brake parametric component float brake_delay_constant = 6.6; float brake_delay_constant_neg = 8.2; float max_brake_rate_neg = 0.9; float max_brake_rate_pos = 0.33; // system parameters float wheel_base = 0.3; float low_min_throttle = 0.13; float gravity = -9.81; int gear_sign = 1; }; using namespace MPPI_internal; template class RacerDubinsImpl : public Dynamics { public: typedef Dynamics PARENT_CLASS; typedef typename PARENT_CLASS::state_array state_array; typedef typename PARENT_CLASS::control_array control_array; typedef typename PARENT_CLASS::output_array output_array; typedef typename PARENT_CLASS::dfdx dfdx; typedef typename PARENT_CLASS::dfdu dfdu; using PARENT_CLASS::updateState; // needed as overloading updateState here hides all parent versions of updateState RacerDubinsImpl(cudaStream_t stream = nullptr) : PARENT_CLASS(stream) { } RacerDubinsImpl(PARAMS_T& params, cudaStream_t stream = nullptr) : PARENT_CLASS(params, stream) { } std::string getDynamicsModelName() const override { return "RACER Dubins Model"; } void computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der); // void computeStateDeriv(const Eigen::Ref& state, const Eigen::Ref& control, // Eigen::Ref state_der, output_array* output=nullptr); // TODO void computeParametricDelayDeriv(const 
Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der); void computeParametricSteerDeriv(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der); __device__ void computeParametricDelayDeriv(float* state, float* control, float* state_der, PARAMS_T* params_p); __device__ void computeParametricSteerDeriv(float* state, float* control, float* state_der, PARAMS_T* params_p); void updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt); __device__ void updateState(float* state, float* next_state, float* state_der, const float dt); state_array interpolateState(const Eigen::Ref state_1, const Eigen::Ref state_2, const float alpha); bool computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B); __device__ void computeDynamics(float* state, float* control, float* state_der, float* theta = nullptr); __host__ __device__ void setOutputs(const float* state_der, const float* next_state, float* output); // __device__ void computeStateDeriv(float* state, float* control, float* state_der, float* theta_s, float // *output=nullptr); // TODO void getStoppingControl(const Eigen::Ref& state, Eigen::Ref u); Eigen::Quaternionf attitudeFromState(const Eigen::Ref& state); Eigen::Vector3f positionFromState(const Eigen::Ref& state); Eigen::Vector3f velocityFromState(const Eigen::Ref& state); Eigen::Vector3f angularRateFromState(const Eigen::Ref& state); state_array stateFromOdometry(const Eigen::Quaternionf& q, const Eigen::Vector3f& pos, const Eigen::Vector3f& vel, const Eigen::Vector3f& omega); void enforceLeash(const Eigen::Ref& state_true, const Eigen::Ref& state_nominal, const Eigen::Ref& leash_values, Eigen::Ref state_output); state_array stateFromMap(const std::map& map) override; }; class RacerDubins : public RacerDubinsImpl { public: RacerDubins(cudaStream_t stream = nullptr) : RacerDubinsImpl(stream) { } RacerDubins(RacerDubinsParams& params, cudaStream_t stream = nullptr) : 
RacerDubinsImpl(params, stream) { } }; #if __CUDACC__ #include "racer_dubins.cu" #endif #endif // MPPIGENERIC_RACER_DUBINS_CUH ================================================ FILE: include/mppi/dynamics/racer_dubins/racer_dubins_elevation.cu ================================================ #include #include namespace mm = mppi::matrix_multiplication; namespace mp1 = mppi::p1; template void RacerDubinsElevationImpl::GPUSetup() { PARENT_CLASS::GPUSetup(); tex_helper_->GPUSetup(); // makes sure that the device ptr sees the correct texture object HANDLE_ERROR(cudaMemcpyAsync(&(this->model_d_->tex_helper_), &(tex_helper_->ptr_d_), sizeof(TwoDTextureHelper*), cudaMemcpyHostToDevice, this->stream_)); } template void RacerDubinsElevationImpl::freeCudaMem() { tex_helper_->freeCudaMem(); PARENT_CLASS::freeCudaMem(); } template void RacerDubinsElevationImpl::paramsToDevice() { // does all the internal texture updates tex_helper_->copyToDevice(); PARENT_CLASS::paramsToDevice(); } template void RacerDubinsElevationImpl::computeParametricAccelDeriv( const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der, const float dt) { float linear_brake_slope = 0.2f; bool enable_brake = control(C_INDEX(THROTTLE_BRAKE)) < 0.0f; int index = (abs(state(S_INDEX(VEL_X))) > linear_brake_slope && abs(state(S_INDEX(VEL_X))) <= 3.0f) + (abs(state(S_INDEX(VEL_X))) > 3.0f) * 2; const float brake_state = fmin(fmax(state(S_INDEX(BRAKE_STATE)), 0.0f), 0.25f); // applying position throttle float throttle = this->params_.c_t[index] * control(C_INDEX(THROTTLE_BRAKE)); float brake = this->params_.c_b[index] * brake_state * (state(S_INDEX(VEL_X)) >= 0.0f ? 
-1.0f : 1.0f); if (abs(state(S_INDEX(VEL_X))) <= linear_brake_slope) { throttle = this->params_.c_t[index] * max(control(C_INDEX(THROTTLE_BRAKE)) - this->params_.low_min_throttle, 0.0f); brake = this->params_.c_b[index] * brake_state * -state(S_INDEX(VEL_X)); } state_der(S_INDEX(VEL_X)) = (!enable_brake) * throttle * this->params_.gear_sign + brake - this->params_.c_v[index] * state(S_INDEX(VEL_X)) + this->params_.c_0; state_der(S_INDEX(VEL_X)) = min(max(state_der(S_INDEX(VEL_X)), -this->params_.clamp_ax), this->params_.clamp_ax); if (fabsf(state[S_INDEX(PITCH)]) < static_cast(M_PI_2)) { state_der[S_INDEX(VEL_X)] -= this->params_.gravity * sinf(state[S_INDEX(PITCH)]); } state_der(S_INDEX(YAW)) = (state(S_INDEX(VEL_X)) / this->params_.wheel_base) * tanf(state(S_INDEX(STEER_ANGLE)) / this->params_.steer_angle_scale); float yaw, sin_yaw, cos_yaw; yaw = state[S_INDEX(YAW)]; sincosf(yaw, &sin_yaw, &cos_yaw); state_der(S_INDEX(POS_X)) = state(S_INDEX(VEL_X)) * cos_yaw; state_der(S_INDEX(POS_Y)) = state(S_INDEX(VEL_X)) * sin_yaw; } template __host__ __device__ void RacerDubinsElevationImpl::setOutputs(const float* state_der, const float* next_state, float* output) { // Setup output // output[O_INDEX(BASELINK_VEL_B_X)] = next_state[S_INDEX(VEL_X)]; // output[O_INDEX(BASELINK_VEL_B_Y)] = 0.0f; // output[O_INDEX(BASELINK_VEL_B_Z)] = 0.0f; // output[O_INDEX(BASELINK_POS_I_X)] = next_state[S_INDEX(POS_X)]; // output[O_INDEX(BASELINK_POS_I_Y)] = next_state[S_INDEX(POS_Y)]; // output[O_INDEX(PITCH)] = next_state[S_INDEX(PITCH)]; // output[O_INDEX(ROLL)] = next_state[S_INDEX(ROLL)]; // output[O_INDEX(YAW)] = next_state[S_INDEX(YAW)]; // output[O_INDEX(STEER_ANGLE)] = next_state[S_INDEX(STEER_ANGLE)]; // output[O_INDEX(STEER_ANGLE_RATE)] = next_state[S_INDEX(STEER_ANGLE_RATE)]; // output[O_INDEX(WHEEL_FORCE_B_FL)] = 10000.0f; // output[O_INDEX(WHEEL_FORCE_B_FR)] = 10000.0f; // output[O_INDEX(WHEEL_FORCE_B_RL)] = 10000.0f; // output[O_INDEX(WHEEL_FORCE_B_RR)] = 10000.0f; // 
output[O_INDEX(ACCEL_X)] = state_der[S_INDEX(VEL_X)]; // output[O_INDEX(ACCEL_Y)] = 0.0f; // output[O_INDEX(OMEGA_Z)] = state_der[S_INDEX(YAW)]; // output[O_INDEX(UNCERTAINTY_VEL_X)] = next_state[S_INDEX(UNCERTAINTY_VEL_X)]; // output[O_INDEX(UNCERTAINTY_YAW_VEL_X)] = next_state[S_INDEX(UNCERTAINTY_YAW_VEL_X)]; // output[O_INDEX(UNCERTAINTY_POS_X_VEL_X)] = next_state[S_INDEX(UNCERTAINTY_POS_X_VEL_X)]; // output[O_INDEX(UNCERTAINTY_POS_Y_VEL_X)] = next_state[S_INDEX(UNCERTAINTY_POS_Y_VEL_X)]; // output[O_INDEX(UNCERTAINTY_YAW)] = next_state[S_INDEX(UNCERTAINTY_YAW)]; // output[O_INDEX(UNCERTAINTY_POS_X_YAW)] = next_state[S_INDEX(UNCERTAINTY_POS_X_YAW)]; // output[O_INDEX(UNCERTAINTY_POS_Y_YAW)] = next_state[S_INDEX(UNCERTAINTY_POS_Y_YAW)]; // output[O_INDEX(UNCERTAINTY_POS_X)] = next_state[S_INDEX(UNCERTAINTY_POS_X)]; // output[O_INDEX(UNCERTAINTY_POS_X_Y)] = next_state[S_INDEX(UNCERTAINTY_POS_X_Y)]; // output[O_INDEX(UNCERTAINTY_POS_Y)] = next_state[S_INDEX(UNCERTAINTY_POS_Y)]; int step, pi; mp1::getParallel1DIndex(pi, step); for (int i = pi; i < this->OUTPUT_DIM; i += step) { switch (i) { case O_INDEX(BASELINK_VEL_B_X): output[i] = next_state[S_INDEX(VEL_X)]; break; case O_INDEX(BASELINK_VEL_B_Y): output[i] = 0.0f; break; // case O_INDEX(BASELINK_VEL_B_Z): // output[i] = 0.0f; // break; case O_INDEX(BASELINK_POS_I_X): output[i] = next_state[S_INDEX(POS_X)]; break; case O_INDEX(BASELINK_POS_I_Y): output[i] = next_state[S_INDEX(POS_Y)]; break; case O_INDEX(PITCH): output[i] = next_state[S_INDEX(PITCH)]; break; case O_INDEX(ROLL): output[i] = next_state[S_INDEX(ROLL)]; break; case O_INDEX(YAW): output[i] = next_state[S_INDEX(YAW)]; break; case O_INDEX(STEER_ANGLE): output[i] = next_state[S_INDEX(STEER_ANGLE)]; break; case O_INDEX(STEER_ANGLE_RATE): output[i] = next_state[S_INDEX(STEER_ANGLE_RATE)]; break; case O_INDEX(WHEEL_FORCE_UP_MAX): output[i] = NAN; break; case O_INDEX(WHEEL_FORCE_FWD_MAX): output[i] = NAN; break; case O_INDEX(WHEEL_FORCE_SIDE_MAX): output[i] = 
NAN; break; // case O_INDEX(WHEEL_FORCE_UP_FL): // output[i] = NAN; // break; // case O_INDEX(WHEEL_FORCE_UP_FR): // output[i] = NAN; // break; // case O_INDEX(WHEEL_FORCE_UP_RL): // output[i] = NAN; // break; // case O_INDEX(WHEEL_FORCE_UP_RR): // output[i] = NAN; // break; // case O_INDEX(WHEEL_FORCE_FWD_FL): // output[i] = NAN; // break; // case O_INDEX(WHEEL_FORCE_FWD_FR): // output[i] = NAN; // break; // case O_INDEX(WHEEL_FORCE_FWD_RL): // output[i] = NAN; // break; // case O_INDEX(WHEEL_FORCE_FWD_RR): // output[i] = NAN; // break; // case O_INDEX(WHEEL_FORCE_SIDE_FL): // output[i] = NAN; // break; // case O_INDEX(WHEEL_FORCE_SIDE_FR): // output[i] = NAN; // break; // case O_INDEX(WHEEL_FORCE_SIDE_RL): // output[i] = NAN; // break; // case O_INDEX(WHEEL_FORCE_SIDE_RR): // output[i] = NAN; // break; case O_INDEX(ACCEL_X): output[i] = state_der[S_INDEX(VEL_X)]; break; case O_INDEX(ACCEL_Y): output[i] = 0.0f; break; case O_INDEX(OMEGA_Z): output[i] = state_der[S_INDEX(YAW)]; break; case O_INDEX(UNCERTAINTY_VEL_X): output[i] = next_state[S_INDEX(UNCERTAINTY_VEL_X)]; break; case O_INDEX(UNCERTAINTY_YAW_VEL_X): output[i] = next_state[S_INDEX(UNCERTAINTY_YAW_VEL_X)]; break; case O_INDEX(UNCERTAINTY_POS_X_VEL_X): output[i] = next_state[S_INDEX(UNCERTAINTY_POS_X_VEL_X)]; break; case O_INDEX(UNCERTAINTY_POS_Y_VEL_X): output[i] = next_state[S_INDEX(UNCERTAINTY_POS_Y_VEL_X)]; break; case O_INDEX(UNCERTAINTY_YAW): output[i] = next_state[S_INDEX(UNCERTAINTY_YAW)]; break; case O_INDEX(UNCERTAINTY_POS_X_YAW): output[i] = next_state[S_INDEX(UNCERTAINTY_POS_X_YAW)]; break; case O_INDEX(UNCERTAINTY_POS_Y_YAW): output[i] = next_state[S_INDEX(UNCERTAINTY_POS_Y_YAW)]; break; case O_INDEX(UNCERTAINTY_POS_X): output[i] = next_state[S_INDEX(UNCERTAINTY_POS_X)]; break; case O_INDEX(UNCERTAINTY_POS_X_Y): output[i] = next_state[S_INDEX(UNCERTAINTY_POS_X_Y)]; break; case O_INDEX(UNCERTAINTY_POS_Y): output[i] = next_state[S_INDEX(UNCERTAINTY_POS_Y)]; break; case O_INDEX(TOTAL_VELOCITY): 
output[i] = fabsf(next_state[S_INDEX(VEL_X)]); break; } } } template void RacerDubinsElevationImpl::step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const Eigen::Ref& control, Eigen::Ref output, const float t, const float dt) { this->computeParametricDelayDeriv(state, control, state_der); this->computeParametricSteerDeriv(state, control, state_der); computeParametricAccelDeriv(state, control, state_der, dt); SharedBlock sb; // Integrate using racer_dubins updateState this->PARENT_CLASS::updateState(state, next_state, state_der, dt); computeUncertaintyPropagation(state.data(), control.data(), state_der.data(), next_state.data(), dt, &this->params_, &sb, nullptr); float roll = state(S_INDEX(ROLL)); float pitch = state(S_INDEX(PITCH)); RACER::computeStaticSettling>( this->tex_helper_, next_state(S_INDEX(YAW)), next_state(S_INDEX(POS_X)), next_state(S_INDEX(POS_Y)), roll, pitch, output.data()); next_state[S_INDEX(PITCH)] = pitch; next_state[S_INDEX(ROLL)] = roll; this->setOutputs(state_der.data(), next_state.data(), output.data()); } template bool RacerDubinsElevationImpl::computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B) { A = dfdx::Zero(); B = dfdu::Zero(); float eps = 0.01f; bool enable_brake = control(C_INDEX(THROTTLE_BRAKE)) < 0.0f; // vx float linear_brake_slope = 0.2f; int index = (abs(state(S_INDEX(VEL_X))) > linear_brake_slope && abs(state(S_INDEX(VEL_X))) <= 3.0f) + (abs(state(S_INDEX(VEL_X))) > 3.0f) * 2; A(0, 0) = -this->params_.c_v[index]; if (abs(state(S_INDEX(VEL_X))) < linear_brake_slope) { A(0, 5) = this->params_.c_b[index] * -state(S_INDEX(VEL_X)); } else { A(0, 5) = this->params_.c_b[index] * (state(S_INDEX(VEL_X)) >= 0.0f ? -1.0f : 1.0f); } // TODO zero out if we are above the threshold to match?? 
// yaw A(1, 0) = (1.0f / this->params_.wheel_base) * tanf(state(S_INDEX(STEER_ANGLE)) / this->params_.steer_angle_scale); A(1, 4) = (state(S_INDEX(VEL_X)) / this->params_.wheel_base) * (1.0 / SQ(cosf(state(S_INDEX(STEER_ANGLE)) / this->params_.steer_angle_scale))) / this->params_.steer_angle_scale; // pos x A(2, 0) = cosf(state(S_INDEX(YAW))); A(2, 1) = -sinf(state(S_INDEX(YAW))) * state(S_INDEX(VEL_X)); // pos y A(3, 0) = sinf(state(S_INDEX(YAW))); A(3, 1) = cosf(state(S_INDEX(YAW))) * state(S_INDEX(VEL_X)); // steer angle float steer_dot = (control(C_INDEX(STEER_CMD)) * this->params_.steer_command_angle_scale - state(S_INDEX(STEER_ANGLE))) * this->params_.steering_constant; if (steer_dot - eps < -this->params_.max_steer_rate || steer_dot + eps > this->params_.max_steer_rate) { A(4, 4) = 0.0f; } else { A(4, 4) = -this->params_.steering_constant; } A(4, 4) = max(min(A(4, 4), this->params_.max_steer_rate), -this->params_.max_steer_rate); // gravity A(0, 7) = -this->params_.gravity * cosf(state(S_INDEX(PITCH))); // brake delay float brake_dot = (enable_brake * -control(C_INDEX(THROTTLE_BRAKE)) - state(S_INDEX(BRAKE_STATE))) * this->params_.brake_delay_constant; if (brake_dot - eps < -this->params_.max_brake_rate_neg || brake_dot + eps > this->params_.max_brake_rate_pos) { A(5, 5) = 0.0f; } else { A(5, 5) = -this->params_.brake_delay_constant; } // steering command B(4, 1) = this->params_.steer_command_angle_scale * this->params_.steering_constant; // throttle command B(0, 0) = this->params_.c_t[index] * this->params_.gear_sign * (!enable_brake); // brake command if ((state(S_INDEX(BRAKE_STATE)) < -this->control_rngs_[C_INDEX(THROTTLE_BRAKE)].x && brake_dot < 0.0f) || (state(S_INDEX(BRAKE_STATE)) > 0.0f && brake_dot > 0.0f)) { B(5, 0) = -this->params_.brake_delay_constant * enable_brake; } return true; } template __host__ __device__ bool RacerDubinsElevationImpl::computeUncertaintyJacobian(const float* state, const float* control, float* A, DYN_PARAMS_T* params_p) { 
float sin_yaw, cos_yaw, tan_steer_angle, cos_2_delta; #ifdef __CUDA_ARCH__ float yaw_norm = angle_utils::normalizeAngle(state[S_INDEX(YAW)]); float delta = state[S_INDEX(STEER_ANGLE)] / params_p->steer_angle_scale; __sincosf(yaw_norm, &sin_yaw, &cos_yaw); tan_steer_angle = __tanf(delta); cos_2_delta = SQ(__cosf(delta)); #else sincosf(state[S_INDEX(YAW)], &sin_yaw, &cos_yaw); float delta = state[S_INDEX(STEER_ANGLE)] / params_p->steer_angle_scale; tan_steer_angle = tanf(delta); cos_2_delta = SQ(cosf(delta)); #endif // const float cos_2_delta = cos_yaw * cos_yaw; // vx float linear_brake_slope = 0.2f; int index = (fabsf(state[S_INDEX(VEL_X)]) > linear_brake_slope && fabsf(state[S_INDEX(VEL_X)]) <= 3.0f) + (fabsf(state[S_INDEX(VEL_X)]) > 3.0f) * 2; int step, pi; mp1::getParallel1DIndex(pi, step); const float brake_state = fmin(fmax(state[S_INDEX(BRAKE_STATE)], 0.0f), 0.25f); // A = df/dx + df/du * K for (int i = pi; i < UNCERTAINTY_DIM * UNCERTAINTY_DIM; i += step) { switch (i) { case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(VEL_X), UNCERTAINTY_DIM): A[i] = -params_p->c_v[index] - params_p->K_vel_x - (index == 0 ? 
1.0f : 0.0f) * params_p->c_b[0] * brake_state; break; case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(YAW), UNCERTAINTY_DIM): A[i] = 0.0f; break; case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(POS_X), UNCERTAINTY_DIM): A[i] = -params_p->K_x * cos_yaw; break; case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(POS_Y), UNCERTAINTY_DIM): A[i] = -params_p->K_x * sin_yaw; break; // yaw case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(VEL_X), UNCERTAINTY_DIM): A[i] = tan_steer_angle / (params_p->wheel_base); break; case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(YAW), UNCERTAINTY_DIM): A[i] = -fabsf(state[S_INDEX(VEL_X)]) * params_p->K_yaw / (params_p->wheel_base * cos_2_delta); break; case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(POS_X), UNCERTAINTY_DIM): A[i] = state[S_INDEX(VEL_X)] * params_p->K_y * sin_yaw / (params_p->wheel_base * cos_2_delta); break; case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(POS_Y), UNCERTAINTY_DIM): A[i] = -state[S_INDEX(VEL_X)] * params_p->K_y * cos_yaw / (params_p->wheel_base * cos_2_delta); break; // pos x case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(VEL_X), UNCERTAINTY_DIM): A[i] = cos_yaw; break; case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(YAW), UNCERTAINTY_DIM): A[i] = -sin_yaw * state[S_INDEX(VEL_X)]; break; case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(POS_X), UNCERTAINTY_DIM): A[i] = 0.0f; break; case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(POS_Y), UNCERTAINTY_DIM): A[i] = 0.0f; break; // pos y case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(VEL_X), UNCERTAINTY_DIM): A[i] = sin_yaw; break; case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(YAW), UNCERTAINTY_DIM): A[i] = cos_yaw * state[S_INDEX(VEL_X)]; break; case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_Y), UNCERTAINTY_DIM): A[i] = 0.0f; break; case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_X), UNCERTAINTY_DIM): A[i] = 0.0f; break; } } return true; } template __host__ __device__ bool RacerDubinsElevationImpl::computeQ(const float* state, 
const float* control, const float* state_der, float* Q, DYN_PARAMS_T* params_p, float* theta_s) { const float abs_vx = fabsf(state[S_INDEX(VEL_X)]); const float abs_acc_x = fabsf(state_der[S_INDEX(VEL_X)]); const float delta = state[S_INDEX(STEER_ANGLE)] / params_p->steer_angle_scale; float sin_yaw, cos_yaw, tan_steer_angle, sin_roll; #ifdef __CUDA_ARCH__ const float yaw_norm = angle_utils::normalizeAngle(state[S_INDEX(YAW)]); __sincosf(yaw_norm, &sin_yaw, &cos_yaw); tan_steer_angle = __tanf(delta); sin_roll = __sinf(angle_utils::normalizeAngle(state[S_INDEX(ROLL)])); #else sincosf(state[S_INDEX(YAW)], &sin_yaw, &cos_yaw); tan_steer_angle = tanf(delta); #endif const float side_force = SQ(abs_vx) * tan_steer_angle / params_p->wheel_base + params_p->gravity * sin_roll; // const float Q_11 = params_p->Q_y_f * side_force * side_force * abs_vx; const float Q_11 = fabsf(params_p->Q_y_f * fabsf(side_force) * fmaxf(abs_vx - 2, 0.0f)); const float linear_brake_slope = 0.2f; const int index = (fabsf(state[S_INDEX(VEL_X)]) > linear_brake_slope && fabsf(state[S_INDEX(VEL_X)]) <= 3.0f) + (fabsf(state[S_INDEX(VEL_X)]) > 3.0f) * 2; int step, pi; mp1::getParallel1DIndex(pi, step); for (int i = pi; i < UNCERTAINTY_DIM * UNCERTAINTY_DIM; i += step) { switch (i) { // vel_x case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(VEL_X), UNCERTAINTY_DIM): Q[i] = params_p->Q_x_acc * abs_acc_x + params_p->Q_x_v[index] * abs_vx; break; case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(YAW), UNCERTAINTY_DIM): Q[i] = 0.0f; break; case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(POS_X), UNCERTAINTY_DIM): Q[i] = 0.0f; break; case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(POS_Y), UNCERTAINTY_DIM): Q[i] = 0.0f; break; // yaw case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(VEL_X), UNCERTAINTY_DIM): Q[i] = 0.0f; break; case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(YAW), UNCERTAINTY_DIM): Q[i] = abs_vx * (params_p->Q_omega_steering * fabsf(delta) + params_p->Q_omega_v); break; case 
mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(POS_X), UNCERTAINTY_DIM): Q[i] = 0.0f; break; case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(POS_Y), UNCERTAINTY_DIM): Q[i] = 0.0f; break; // pos x case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(VEL_X), UNCERTAINTY_DIM): Q[i] = 0.0f; break; case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(YAW), UNCERTAINTY_DIM): Q[i] = 0.0f; break; case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(POS_X), UNCERTAINTY_DIM): Q[i] = Q_11 * sin_yaw * sin_yaw; break; case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(POS_Y), UNCERTAINTY_DIM): Q[i] = -Q_11 * sin_yaw * cos_yaw; break; // pos y case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(VEL_X), UNCERTAINTY_DIM): Q[i] = 0.0f; break; case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(YAW), UNCERTAINTY_DIM): Q[i] = 0.0f; break; case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_Y), UNCERTAINTY_DIM): Q[i] = Q_11 * cos_yaw * cos_yaw; break; case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_X), UNCERTAINTY_DIM): Q[i] = -Q_11 * sin_yaw * cos_yaw; break; } } return true; } template __host__ __device__ void RacerDubinsElevationImpl::uncertaintyStateToMatrix(const float* state, float* uncertainty_matrix) { int step, pi; mp1::getParallel1DIndex(pi, step); for (int i = pi; i < UNCERTAINTY_DIM * UNCERTAINTY_DIM; i += step) { switch (i) { case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(VEL_X), UNCERTAINTY_DIM): uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_VEL_X)]; break; case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(VEL_X), UNCERTAINTY_DIM): uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_YAW_VEL_X)]; break; case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(VEL_X), UNCERTAINTY_DIM): uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_POS_X_VEL_X)]; break; case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(VEL_X), UNCERTAINTY_DIM): uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_POS_Y_VEL_X)]; break; case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(YAW), UNCERTAINTY_DIM): 
        uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_YAW_VEL_X)];
        break;
      case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(YAW), UNCERTAINTY_DIM):
        uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_YAW)];
        break;
      case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(YAW), UNCERTAINTY_DIM):
        uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_POS_X_YAW)];
        break;
      case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(YAW), UNCERTAINTY_DIM):
        uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_POS_Y_YAW)];
        break;
      case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(POS_X), UNCERTAINTY_DIM):
        uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_POS_X_VEL_X)];
        break;
      case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(POS_X), UNCERTAINTY_DIM):
        uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_POS_X_YAW)];
        break;
      case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(POS_X), UNCERTAINTY_DIM):
        uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_POS_X)];
        break;
      case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_X), UNCERTAINTY_DIM):
        uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_POS_X_Y)];
        break;
      case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(POS_Y), UNCERTAINTY_DIM):
        uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_POS_Y_VEL_X)];
        break;
      case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(POS_Y), UNCERTAINTY_DIM):
        uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_POS_Y_YAW)];
        break;
      case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(POS_Y), UNCERTAINTY_DIM):
        uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_POS_X_Y)];
        break;
      case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_Y), UNCERTAINTY_DIM):
        uncertainty_matrix[i] = state[S_INDEX(UNCERTAINTY_POS_Y)];
        break;
    }
  }
}

// Inverse of uncertaintyStateToMatrix: copies the 4x4 column-major covariance
// back into the flattened uncertainty entries of the state vector.  Each
// flattened entry reads a single (row >= col) element, relying on symmetry.
// NOTE(review): template argument lists appear stripped in this extract
// ("template" with no <...>); tokens kept exactly as found.
template
__host__ __device__ void RacerDubinsElevationImpl::uncertaintyMatrixToState(const float* uncertainty_matrix,
                                                                            float* state)
{
  int step, pi;
  mp1::getParallel1DIndex(pi, step);
  // Start each thread at the first uncertainty slot of the state vector.
  for (int i = pi + S_INDEX(UNCERTAINTY_POS_X); i < this->STATE_DIM; i += step)
  {
    switch (i)
    {
      case S_INDEX(UNCERTAINTY_VEL_X):
        state[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(VEL_X), UNCERTAINTY_DIM)];
        break;
      case S_INDEX(UNCERTAINTY_YAW_VEL_X):
        state[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(VEL_X), UNCERTAINTY_DIM)];
        break;
      case S_INDEX(UNCERTAINTY_POS_X_VEL_X):
        state[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(VEL_X), UNCERTAINTY_DIM)];
        break;
      case S_INDEX(UNCERTAINTY_POS_Y_VEL_X):
        state[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(VEL_X), UNCERTAINTY_DIM)];
        break;
      case S_INDEX(UNCERTAINTY_YAW):
        state[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(YAW), UNCERTAINTY_DIM)];
        break;
      case S_INDEX(UNCERTAINTY_POS_X_YAW):
        state[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(YAW), UNCERTAINTY_DIM)];
        break;
      case S_INDEX(UNCERTAINTY_POS_Y_YAW):
        state[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(YAW), UNCERTAINTY_DIM)];
        break;
      case S_INDEX(UNCERTAINTY_POS_X):
        state[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(POS_X), UNCERTAINTY_DIM)];
        break;
      case S_INDEX(UNCERTAINTY_POS_X_Y):
        state[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_X), UNCERTAINTY_DIM)];
        break;
      case S_INDEX(UNCERTAINTY_POS_Y):
        state[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_Y), UNCERTAINTY_DIM)];
        break;
      default:
        break;
    }
  }
}

// Same unpacking as uncertaintyMatrixToState but writes into the output
// vector (O_INDEX slots) instead of the state vector.
template
__host__ __device__ void RacerDubinsElevationImpl::uncertaintyMatrixToOutput(const float* uncertainty_matrix,
                                                                             float* output)
{
  int step, pi;
  mp1::getParallel1DIndex(pi, step);
  for (int i = pi + O_INDEX(UNCERTAINTY_POS_X); i < this->OUTPUT_DIM; i += step)
  {
    switch (i)
    {
      case O_INDEX(UNCERTAINTY_VEL_X):
        output[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(VEL_X), UNCERTAINTY_DIM)];
        break;
      case O_INDEX(UNCERTAINTY_YAW_VEL_X):
        output[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(VEL_X), UNCERTAINTY_DIM)];
        break;
      case
          O_INDEX(UNCERTAINTY_POS_X_VEL_X):
        output[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(VEL_X), UNCERTAINTY_DIM)];
        break;
      case O_INDEX(UNCERTAINTY_POS_Y_VEL_X):
        output[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(VEL_X), UNCERTAINTY_DIM)];
        break;
      case O_INDEX(UNCERTAINTY_YAW):
        output[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(YAW), UNCERTAINTY_DIM)];
        break;
      case O_INDEX(UNCERTAINTY_POS_X_YAW):
        output[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(YAW), UNCERTAINTY_DIM)];
        break;
      case O_INDEX(UNCERTAINTY_POS_Y_YAW):
        output[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(YAW), UNCERTAINTY_DIM)];
        break;
      case O_INDEX(UNCERTAINTY_POS_X):
        output[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(POS_X), UNCERTAINTY_DIM)];
        break;
      case O_INDEX(UNCERTAINTY_POS_X_Y):
        output[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_X), UNCERTAINTY_DIM)];
        break;
      case O_INDEX(UNCERTAINTY_POS_Y):
        output[i] = uncertainty_matrix[mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_Y), UNCERTAINTY_DIM)];
        break;
      default:
        break;
    }
  }
}

// Propagates the uncertainty covariance one time step:
//   Sigma' = (I + A dt) Sigma (I + A dt)^T + Q dt
// On device the products use mm::gemm1 with the SharedBlock scratch matrices
// and explicit __syncthreads(); on host, Eigen maps over the same buffers.
// The result is written back into next_state via uncertaintyMatrixToState.
// NOTE(review): template argument lists (after "template", on static_cast,
// gemm1, Eigen::Matrix/Map) appear stripped in this extract; tokens kept
// exactly as found.
template
__host__ __device__ void RacerDubinsElevationImpl::computeUncertaintyPropagation(
    const float* state, const float* control, const float* state_der, float* next_state, float dt,
    DYN_PARAMS_T* params_p, SharedBlock* uncertainty_data, float* theta_s)
{
  CLASS_T* derived = static_cast(this);
  int step, pi;
  mp1::getParallel1DIndex(pi, step);
  // if (params_p->gear_sign == -1)
  // {  // Set Uncertainty to zero in reverse
  //   for (int i = pi; i < UNCERTAINTY_DIM * UNCERTAINTY_DIM; i += step)
  //   {
  //     uncertainty_data->Sigma_a[i] = 0.0f;
  //   }
  // #ifdef __CUDA_ARCH__
  //   __syncthreads();
  // #endif
  //   derived->uncertaintyMatrixToState(uncertainty_data->Sigma_a, next_state);
  // #ifdef __CUDA_ARCH__
  //   __syncthreads();
  // #endif
  //   return;
  // }
  derived->computeUncertaintyJacobian(state, control, uncertainty_data->A, params_p);
  derived->uncertaintyStateToMatrix(state, uncertainty_data->Sigma_a);
#ifdef __CUDA_ARCH__
  __syncthreads();  // TODO: Check if this syncthreads is even needed
#endif
  // Turn A into (I + A dt); the modulo picks out the diagonal entries of the
  // column-major square matrix.
  for (int i = pi; i < UNCERTAINTY_DIM * UNCERTAINTY_DIM; i += step)
  {
    uncertainty_data->A[i] = (i % (UNCERTAINTY_DIM + 1) == 0) + uncertainty_data->A[i] * dt;
  }
#ifdef __CUDA_ARCH__
  __syncthreads();
  // Sigma_b = (I + A dt) * Sigma_a, then Sigma_a = Sigma_b * (I + A dt)^T
  mm::gemm1(uncertainty_data->A, uncertainty_data->Sigma_a, uncertainty_data->Sigma_b, 1.0f, 0.0f, mm::MAT_OP::NONE,
            mm::MAT_OP::NONE);
  __syncthreads();
  mm::gemm1(uncertainty_data->Sigma_b, uncertainty_data->A, uncertainty_data->Sigma_a, 1.0f, 0.0f, mm::MAT_OP::NONE,
            mm::MAT_OP::TRANSPOSE);
  __syncthreads();
#else
  typedef Eigen::Matrix eigen_uncertainty_matrx;
  Eigen::Map A_eigen(uncertainty_data->A);
  Eigen::Map Sigma_a_eigen(uncertainty_data->Sigma_a);
  Eigen::Map Sigma_b_eigen(uncertainty_data->Sigma_b);
  Sigma_a_eigen = A_eigen * Sigma_a_eigen * A_eigen.transpose();
#endif
  // float Q[UNCERTAINTY_DIM * UNCERTAINTY_DIM] = {
  //   1.0f, 0.0f, 0.0f, 0.0f,
  //   0.0f, 0.01f, 0.0f, 0.0f,
  //   0.0f, 0.0f, 1.0f, 0.0f,
  //   0.0f, 0.0f, 0.0f, 0.25f,
  // };
  // Sigma_b is reused as scratch to hold Q, then accumulated as Q * dt.
  derived->computeQ(state, control, state_der, uncertainty_data->Sigma_b, params_p, theta_s);
#ifdef __CUDA_ARCH__
  __syncthreads();  // TODO: Check if this syncthreads is even needed
#endif
  for (int i = pi; i < UNCERTAINTY_DIM * UNCERTAINTY_DIM; i += step)
  {
    uncertainty_data->Sigma_a[i] += uncertainty_data->Sigma_b[i] * dt;
  }
#ifdef __CUDA_ARCH__
  __syncthreads();
#endif
  derived->uncertaintyMatrixToState(uncertainty_data->Sigma_a, next_state);
#ifdef __CUDA_ARCH__
  __syncthreads();
#endif
}

// Device-side dynamics initialization: after the parent's setup, optionally
// copies the params struct into shared memory when the grid-level shared
// memory request is non-zero.
template
__device__ void RacerDubinsElevationImpl::initializeDynamics(float* state, float* control, float* output,
                                                             float* theta_s, float t_0, float dt)
{
  PARENT_CLASS::initializeDynamics(state, control, output, theta_s, t_0, dt);
  if (this->SHARED_MEM_REQUEST_GRD_BYTES != 0)
  {  // Allows us to turn on or off global or shared memory version of params
    DYN_PARAMS_T*
        shared_params = (DYN_PARAMS_T*)theta_s;
    *shared_params = this->params_;
  }
}

// Device-side longitudinal/kinematic derivative of the parametric model:
// thread y==0 computes the clamped longitudinal acceleration (throttle,
// brake, drag, rolling constant, and pitch-induced gravity); every thread
// writes the yaw-rate and world-frame position derivatives of the bicycle
// model.  dt is accepted but unused here.
// NOTE(review): template argument lists (after "template" and on
// static_cast) appear stripped in this extract; tokens kept exactly as found.
template
__device__ void RacerDubinsElevationImpl::computeParametricAccelDeriv(float* state, float* control, float* state_der,
                                                                      const float dt, DYN_PARAMS_T* params_p)
{
  const int tdy = threadIdx.y;
  float linear_brake_slope = 0.2f;
  // Compute dynamics
  bool enable_brake = control[C_INDEX(THROTTLE_BRAKE)] < 0.0f;
  // Piecewise parameter regime: 0 when |vx| <= 0.2, 1 in (0.2, 3], 2 above 3 m/s
  int index = (fabsf(state[S_INDEX(VEL_X)]) > linear_brake_slope && fabsf(state[S_INDEX(VEL_X)]) <= 3.0f) +
              (fabsf(state[S_INDEX(VEL_X)]) > 3.0f) * 2;
  const float brake_state = fminf(fmaxf(state[S_INDEX(BRAKE_STATE)], 0.0f), 0.25f);
  // applying position throttle
  float throttle = params_p->c_t[index] * control[C_INDEX(THROTTLE_BRAKE)];
  // Brake force opposes the current direction of travel.
  float brake = params_p->c_b[index] * brake_state * (state[S_INDEX(VEL_X)] >= 0.0f ? -1.0f : 1.0f);
  if (fabsf(state[S_INDEX(VEL_X)]) <= linear_brake_slope)
  {
    // Near standstill: dead-band the throttle and make braking proportional
    // to velocity so the vehicle settles instead of oscillating around zero.
    throttle = params_p->c_t[index] * fmaxf(control[C_INDEX(THROTTLE_BRAKE)] - params_p->low_min_throttle, 0.0f);
    brake = params_p->c_b[index] * brake_state * -state[S_INDEX(VEL_X)];
  }
  if (tdy == 0)
  {
    state_der[S_INDEX(VEL_X)] = (!enable_brake) * throttle * params_p->gear_sign + brake -
                                params_p->c_v[index] * state[S_INDEX(VEL_X)] + params_p->c_0;
    state_der[S_INDEX(VEL_X)] = fminf(fmaxf(state_der[S_INDEX(VEL_X)], -params_p->clamp_ax), params_p->clamp_ax);
    if (fabsf(state[S_INDEX(PITCH)]) < static_cast(M_PI_2))
    {
      state_der[S_INDEX(VEL_X)] -= params_p->gravity * __sinf(angle_utils::normalizeAngle(state[S_INDEX(PITCH)]));
    }
  }
  state_der[S_INDEX(YAW)] =
      (state[S_INDEX(VEL_X)] / params_p->wheel_base) *
      __tanf(angle_utils::normalizeAngle(state[S_INDEX(STEER_ANGLE)] / params_p->steer_angle_scale));
  const float yaw_norm = angle_utils::normalizeAngle(state[S_INDEX(YAW)]);
  state_der[S_INDEX(POS_X)] = state[S_INDEX(VEL_X)] * __cosf(yaw_norm);
  state_der[S_INDEX(POS_Y)] = state[S_INDEX(VEL_X)] * __sinf(yaw_norm);
}

// Device-side Euler integration of the first 6 states (later states are
// handled elsewhere), with angle normalization and clamping.
template
__device__ void RacerDubinsElevationImpl::updateState(float* state, float* next_state, float*
state_der, const float dt, DYN_PARAMS_T* params_p) { // const uint tdy = threadIdx.y; int pi, step; mp1::getParallel1DIndex(pi, step); // Set to 6 as the last 3 states do not do euler integration for (int i = pi; i < 6; i += step) { // if (i == S_INDEX(ROLL) || i == S_INDEX(PITCH) || i == S_INDEX(STEER_ANGLE_RATE)) // { // continue; // } next_state[i] = state[i] + state_der[i] * dt; switch (i) { case S_INDEX(YAW): next_state[i] = angle_utils::normalizeAngle(next_state[i]); break; case S_INDEX(STEER_ANGLE): next_state[S_INDEX(STEER_ANGLE)] = fmaxf(fminf(next_state[S_INDEX(STEER_ANGLE)], params_p->max_steer_angle), -params_p->max_steer_angle); next_state[S_INDEX(STEER_ANGLE_RATE)] = state_der[S_INDEX(STEER_ANGLE)]; break; case S_INDEX(BRAKE_STATE): next_state[S_INDEX(BRAKE_STATE)] = fminf(fmaxf(next_state[S_INDEX(BRAKE_STATE)], 0.0f), 1.0f); break; } } __syncthreads(); } template __device__ inline void RacerDubinsElevationImpl::step(float* state, float* next_state, float* state_der, float* control, float* output, float* theta_s, const float t, const float dt) { DYN_PARAMS_T* params_p; SharedBlock* sb; // TODO the below two if statements conflict in a bad way if (this->SHARED_MEM_REQUEST_GRD_BYTES != 0) { // Allows us to turn on or off global or shared memory version of params params_p = (DYN_PARAMS_T*)theta_s; } else { params_p = &(this->params_); } if (this->SHARED_MEM_REQUEST_BLK_BYTES != 0) { float* sb_mem = &theta_s[mppi::math::int_multiple_const(this->SHARED_MEM_REQUEST_GRD_BYTES, sizeof(float4)) / sizeof(float)]; sb = (SharedBlock*)&sb_mem[(this->SHARED_MEM_REQUEST_BLK_BYTES * threadIdx.x + this->SHARED_MEM_REQUEST_BLK_BYTES * blockDim.x * threadIdx.z) / sizeof(float)]; } this->computeParametricDelayDeriv(state, control, state_der, params_p); this->computeParametricSteerDeriv(state, control, state_der, params_p); computeParametricAccelDeriv(state, control, state_der, dt, params_p); updateState(state, next_state, state_der, dt, params_p); 
computeUncertaintyPropagation(state, control, state_der, next_state, dt, params_p, sb, theta_s); if (threadIdx.y == 0) { float roll = state[S_INDEX(ROLL)]; float pitch = state[S_INDEX(PITCH)]; RACER::computeStaticSettling>( this->tex_helper_, next_state[S_INDEX(YAW)], next_state[S_INDEX(POS_X)], next_state[S_INDEX(POS_Y)], roll, pitch, output); next_state[S_INDEX(PITCH)] = pitch; next_state[S_INDEX(ROLL)] = roll; } __syncthreads(); this->setOutputs(state_der, next_state, output); } template RacerDubinsElevationImpl::state_array RacerDubinsElevationImpl::stateFromMap(const std::map& map) { std::vector keys = { "VEL_X", "VEL_Y", "POS_X", "POS_Y", "ROLL", "PITCH", "STEER_ANGLE", "STEER_ANGLE_RATE", "BRAKE_STATE" }; bool look_for_uncertainty = false; if (look_for_uncertainty) { keys.push_back("UNCERTAINTY_POS_X"); keys.push_back("UNCERTAINTY_POS_Y"); keys.push_back("UNCERTAINTY_YAW"); keys.push_back("UNCERTAINTY_VEL_X"); keys.push_back("UNCERTAINTY_POS_X_Y"); keys.push_back("UNCERTAINTY_POS_X_YAW"); keys.push_back("UNCERTAINTY_POS_X_VEL_X"); keys.push_back("UNCERTAINTY_POS_Y_YAW"); keys.push_back("UNCERTAINTY_POS_Y_VEL_X"); keys.push_back("UNCERTAINTY_YAW_VEL_X"); } bool found_all_keys = true; for (const auto& key : keys) { if (map.find(key) == map.end()) { this->logger_->warning("Could not find %s key for elevation dynamics\n", key.c_str()); found_all_keys = false; } } state_array s = this->getZeroState(); if (!found_all_keys) { return state_array::Constant(NAN); } s(S_INDEX(POS_X)) = map.at("POS_X"); s(S_INDEX(POS_Y)) = map.at("POS_Y"); s(S_INDEX(VEL_X)) = map.at("VEL_X"); s(S_INDEX(YAW)) = map.at("YAW"); s(S_INDEX(STEER_ANGLE)) = map.at("STEER_ANGLE"); s(S_INDEX(STEER_ANGLE_RATE)) = map.at("STEER_ANGLE_RATE"); s(S_INDEX(ROLL)) = map.at("ROLL"); s(S_INDEX(PITCH)) = map.at("PITCH"); s(S_INDEX(BRAKE_STATE)) = map.at("BRAKE_STATE"); if (look_for_uncertainty) { s(S_INDEX(UNCERTAINTY_POS_X)) = map.at("UNCERTAINTY_POS_X"); s(S_INDEX(UNCERTAINTY_POS_Y)) = 
map.at("UNCERTAINTY_POS_Y"); s(S_INDEX(UNCERTAINTY_YAW)) = map.at("UNCERTAINTY_YAW"); s(S_INDEX(UNCERTAINTY_VEL_X)) = map.at("UNCERTAINTY_VEL_X"); s(S_INDEX(UNCERTAINTY_POS_X_Y)) = map.at("UNCERTAINTY_POS_X_Y"); s(S_INDEX(UNCERTAINTY_POS_X_YAW)) = map.at("UNCERTAINTY_POS_X_YAW"); s(S_INDEX(UNCERTAINTY_POS_X_VEL_X)) = map.at("UNCERTAINTY_POS_X_VEL_X"); s(S_INDEX(UNCERTAINTY_POS_Y_YAW)) = map.at("UNCERTAINTY_POS_Y_YAW"); s(S_INDEX(UNCERTAINTY_POS_Y_VEL_X)) = map.at("UNCERTAINTY_POS_Y_VEL_X"); s(S_INDEX(UNCERTAINTY_YAW_VEL_X)) = map.at("UNCERTAINTY_YAW_VEL_X"); } float eps = 1e-6f; if (s(S_INDEX(UNCERTAINTY_POS_X)) < eps) { s(S_INDEX(UNCERTAINTY_POS_X)) = eps; } if (s(S_INDEX(UNCERTAINTY_POS_Y)) < eps) { s(S_INDEX(UNCERTAINTY_POS_Y)) = eps; } if (s(S_INDEX(UNCERTAINTY_YAW)) < eps) { s(S_INDEX(UNCERTAINTY_YAW)) = eps; } if (s(S_INDEX(UNCERTAINTY_VEL_X)) < eps) { s(S_INDEX(UNCERTAINTY_VEL_X)) = eps; } return s; } template Eigen::Quaternionf RacerDubinsElevationImpl::attitudeFromState(const Eigen::Ref& state) { Eigen::Quaternionf q; mppi::math::Euler2QuatNWU(state(S_INDEX(ROLL)), state(S_INDEX(PITCH)), state(S_INDEX(YAW)), q); return q; } template Eigen::Vector3f RacerDubinsElevationImpl::positionFromState(const Eigen::Ref& state) { return Eigen::Vector3f(state[S_INDEX(POS_X)], state[S_INDEX(POS_Y)], 0.0f); } ================================================ FILE: include/mppi/dynamics/racer_dubins/racer_dubins_elevation.cuh ================================================ #ifndef MPPIGENERIC_RACER_DUBINS_ELEVATION_CUH #define MPPIGENERIC_RACER_DUBINS_ELEVATION_CUH #include #include #include #include using namespace MPPI_internal; #ifndef U_INDEX #define U_IND_CLASS(CLASS, enum_val) E_INDEX(CLASS::UncertaintyIndex, enum_val) #define U_INDEX(enum_val) U_IND_CLASS(PARENT_CLASS::DYN_PARAMS_T, enum_val) #endif struct RacerDubinsElevationParams : public RacerDubinsParams { enum class StateIndex : int { VEL_X = 0, YAW, POS_X, POS_Y, STEER_ANGLE, BRAKE_STATE, ROLL, PITCH, 
    STEER_ANGLE_RATE,
    UNCERTAINTY_POS_X,
    UNCERTAINTY_POS_Y,
    UNCERTAINTY_YAW,
    UNCERTAINTY_VEL_X,
    UNCERTAINTY_POS_X_Y,
    UNCERTAINTY_POS_X_YAW,
    UNCERTAINTY_POS_X_VEL_X,
    UNCERTAINTY_POS_Y_YAW,
    UNCERTAINTY_POS_Y_VEL_X,
    UNCERTAINTY_YAW_VEL_X,
    NUM_STATES
  };

  // The 4 states whose covariance is propagated; the UNCERTAINTY_* state
  // entries above are the flattened (symmetric) covariance over these.
  enum class UncertaintyIndex : int
  {
    VEL_X = 0,
    YAW,
    POS_X,
    POS_Y,
    NUM_UNCERTAINTIES
  };

  float clamp_ax = 5.5f;  // bound on the magnitude of longitudinal acceleration

  // Uncertainty feedback and Noise coefficients
  float K_x = 1.0f;                                       // feedback for pos_x
  float K_y = 1.0f;                                       // feedback for pos_y
  float K_yaw = 1.0f;                                     // feedback for yaw
  float K_vel_x = 1.0f;                                   // feedback for vel x
  float Q_x_acc = 1.0f;                                   // Add noise to vel x based on accel_x
  float Q_x_v[3] = { 41.74219, -0.8187027, -2.2131343 };  // Add noise to vel x based on vel x
  float Q_y_f = 0.1f;                                     // Add noise to pos x and y based on side force
  float Q_omega_v = 0.001f;                               // Add noise to yaw based on vel x
  float Q_omega_steering = 0.0f;                          // Add noise to yaw based on steering
};

// RACER Dubins dynamics with elevation-map texture lookups and uncertainty
// (covariance) propagation.
// NOTE(review): template parameter lists appear stripped from this extract
// ("template" with no <...>, bare TwoDTextureHelper, etc.); tokens kept
// exactly as found.
template
class RacerDubinsElevationImpl : public RacerDubinsImpl
{
public:
  // static const int SHARED_MEM_REQUEST_GRD_BYTES = sizeof(DYN_PARAMS_T);
  using PARENT_CLASS = RacerDubinsImpl;
  using PARENT_CLASS::initializeDynamics;
  typedef PARAMS_T DYN_PARAMS_T;
  static const int UNCERTAINTY_DIM = U_IND_CLASS(PARAMS_T, NUM_UNCERTAINTIES);

  // Scratch matrices used by computeUncertaintyPropagation (Jacobian plus two
  // covariance buffers), 16-byte aligned.
  struct __align__(16) SharedBlock
  {
    float A[UNCERTAINTY_DIM * UNCERTAINTY_DIM] MPPI_ALIGN(16) = { 0.0f };
    float Sigma_a[UNCERTAINTY_DIM * UNCERTAINTY_DIM] MPPI_ALIGN(16) = { 0.0f };
    float Sigma_b[UNCERTAINTY_DIM * UNCERTAINTY_DIM] MPPI_ALIGN(16) = { 0.0f };
  };

  typedef typename PARENT_CLASS::state_array state_array;
  typedef typename PARENT_CLASS::control_array control_array;
  typedef typename PARENT_CLASS::output_array output_array;
  typedef typename PARENT_CLASS::dfdx dfdx;
  typedef typename PARENT_CLASS::dfdu dfdu;

  RacerDubinsElevationImpl(cudaStream_t stream = nullptr) : PARENT_CLASS(stream)
  {
    tex_helper_ = new TwoDTextureHelper(1, stream);
    this->SHARED_MEM_REQUEST_BLK_BYTES = sizeof(SharedBlock);
  }
  RacerDubinsElevationImpl(PARAMS_T& params, cudaStream_t stream = nullptr) : PARENT_CLASS(params, stream)
  {
    tex_helper_ = new TwoDTextureHelper(1, stream);
    this->SHARED_MEM_REQUEST_BLK_BYTES = sizeof(SharedBlock);
  }

  std::string getDynamicsModelName() const override
  {
    return "RACER Dubins w/ Elevation Model";
  }

  ~RacerDubinsElevationImpl()
  {
    freeCudaMem();
    delete tex_helper_;
  }

  void updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt)
  {
    this->PARENT_CLASS::updateState(state, next_state, state_der, dt);
  }

  void updateRotation(std::array& rotation)
  {
    this->tex_helper_->updateRotation(0, rotation);
  }

  void bindToStream(cudaStream_t stream)
  {
    this->tex_helper_->bindToStream(stream);
    this->PARENT_CLASS::bindToStream(stream);
  }

  void GPUSetup();
  void freeCudaMem();
  void paramsToDevice();

  void computeParametricAccelDeriv(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der,
                                   const float dt);
  __host__ __device__ void setOutputs(const float* state_der, const float* next_state, float* output);
  __device__ void computeParametricAccelDeriv(float* state, float* control, float* state_der, const float dt,
                                              DYN_PARAMS_T* params_p);
  bool computeGrad(const Eigen::Ref& state = state_array(), const Eigen::Ref& control = control_array(),
                   Eigen::Ref A = dfdx(), Eigen::Ref B = dfdu());
  __host__ __device__ void computeUncertaintyPropagation(const float* state, const float* control,
                                                         const float* state_der, float* next_state, float dt,
                                                         DYN_PARAMS_T* params_p, SharedBlock* uncertainty_data,
                                                         float* theta_s);
  __host__ __device__ void uncertaintyMatrixToOutput(const float* uncertainty_matrix, float* output);
  __host__ __device__ void uncertaintyMatrixToState(const float* uncertainty_matrix, float* state);
  __host__ __device__ void uncertaintyStateToMatrix(const float* state, float* uncertainty_matrix);
  __host__ __device__ bool computeUncertaintyJacobian(const float* state, const float* control, float* A,
                                                      DYN_PARAMS_T* params_p);
  __host__ __device__ bool computeQ(const float* state, const float* control, const float* state_der, float* Q,
                                    DYN_PARAMS_T* params_p, float* theta_s);
  __device__ void updateState(float* state, float* next_state, float* state_der, const float dt,
                              DYN_PARAMS_T* params_p);

  void computeStateDeriv(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der)
  {
  }

  // State difference with wrap-around handled for the YAW/ROLL/PITCH angles.
  state_array computeStateError(const Eigen::Ref& pred_state, const Eigen::Ref& true_state)
  {
    state_array state_diff = pred_state - true_state;
    state_diff(S_INDEX(YAW)) = angle_utils::shortestAngularDistance(true_state(S_INDEX(YAW)), pred_state(S_INDEX(YAW)));
    state_diff(S_INDEX(ROLL)) =
        angle_utils::shortestAngularDistance(true_state(S_INDEX(ROLL)), pred_state(S_INDEX(ROLL)));
    state_diff(S_INDEX(PITCH)) =
        angle_utils::shortestAngularDistance(true_state(S_INDEX(PITCH)), pred_state(S_INDEX(PITCH)));
    return state_diff;
  }

  void step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const Eigen::Ref& control,
            Eigen::Ref output, const float t, const float dt);
  __device__ inline void step(float* state, float* next_state, float* state_der, float* control, float* output,
                              float* theta_s, const float t, const float dt);
  __device__ void initializeDynamics(float* state, float* control, float* output, float* theta_s, float t_0, float dt);

  Eigen::Quaternionf attitudeFromState(const Eigen::Ref& state);
  Eigen::Vector3f positionFromState(const Eigen::Ref& state);
  state_array stateFromMap(const std::map& map) override;

  TwoDTextureHelper* getTextureHelper()
  {
    return tex_helper_;
  }

protected:
  TwoDTextureHelper* tex_helper_ = nullptr;
};

// Non-template convenience alias-class using the default params.
class RacerDubinsElevation : public RacerDubinsElevationImpl
{
public:
  RacerDubinsElevation(cudaStream_t stream = nullptr) : RacerDubinsElevationImpl(stream)
  {
  }
  RacerDubinsElevation(RacerDubinsElevationParams& params, cudaStream_t stream = nullptr)
    : RacerDubinsElevationImpl(params, stream)
  {
  }
};

#if __CUDACC__
#include "racer_dubins_elevation.cu"
#endif
#endif  // MPPIGENERIC_RACER_DUBINS_ELEVATION_CUH
================================================
FILE: include/mppi/dynamics/racer_dubins/racer_dubins_elevation_lstm_steering.cu
================================================
//
// Created by jason on 8/31/22.
//

#include "racer_dubins_elevation_lstm_steering.cuh"

// NOTE(review): TEMPLATE_TYPE originally expanded to a full template<...>
// header; its parameter list appears stripped by the extraction and tokens
// are kept exactly as found.
#define TEMPLATE_TYPE template
#define TEMPLATE_NAME RacerDubinsElevationLSTMSteeringImpl

// Constructor that builds the steering LSTM from explicit layer dimensions
// and records the shared-memory footprint (uncertainty scratch + LSTM state).
TEMPLATE_TYPE
TEMPLATE_NAME::RacerDubinsElevationLSTMSteeringImpl(int init_input_dim, int init_hidden_dim,
                                                    std::vector& init_output_layers, int input_dim, int hidden_dim,
                                                    std::vector& output_layers, int init_len, cudaStream_t stream)
  : PARENT_CLASS(stream)
{
  this->requires_buffer_ = true;
  lstm_lstm_helper_ = std::make_shared>(init_input_dim, init_hidden_dim, init_output_layers, input_dim, hidden_dim,
                                         output_layers, init_len, stream);
  this->SHARED_MEM_REQUEST_GRD_BYTES = lstm_lstm_helper_->getLSTMModel()->getGrdSharedSizeBytes();
  this->SHARED_MEM_REQUEST_BLK_BYTES = sizeof(SharedBlock) + lstm_lstm_helper_->getLSTMModel()->getBlkSharedSizeBytes();
}

// Constructor that loads LSTM weights and parametric steering constants from
// an npz archive; exits on a missing file.
TEMPLATE_TYPE
TEMPLATE_NAME::RacerDubinsElevationLSTMSteeringImpl(std::string path, cudaStream_t stream) : PARENT_CLASS(stream)
{
  if (!fileExists(path))
  {
    std::cerr << "Could not load neural net model at path: " << path.c_str();
    exit(-1);
  }
  cnpy::npz_t param_dict = cnpy::npz_load(path);
  this->params_.max_steer_rate = param_dict.at("parameters/max_rate_pos").data()[0];
  this->params_.steering_constant = param_dict.at("parameters/constant").data()[0];
  this->params_.steer_accel_constant = param_dict.at("parameters/accel_constant").data()[0];
  this->params_.steer_accel_drag_constant = param_dict.at("parameters/accel_drag_constant").data()[0];
  lstm_lstm_helper_ = std::make_shared>(path, stream);
  this->requires_buffer_ = true;
  this->SHARED_MEM_REQUEST_GRD_BYTES = lstm_lstm_helper_->getLSTMModel()->getGrdSharedSizeBytes();
  this->SHARED_MEM_REQUEST_BLK_BYTES = sizeof(SharedBlock) + lstm_lstm_helper_->getLSTMModel()->getBlkSharedSizeBytes();
}

TEMPLATE_TYPE
void TEMPLATE_NAME::GPUSetup()
{
  lstm_lstm_helper_->GPUSetup();
  // makes sure that the device ptr sees the correct lstm model
  this->network_d_ = lstm_lstm_helper_->getLSTMDevicePtr();
  PARENT_CLASS::GPUSetup();
}

TEMPLATE_TYPE
void TEMPLATE_NAME::bindToStream(cudaStream_t stream)
{
  PARENT_CLASS::bindToStream(stream);
  lstm_lstm_helper_->getLSTMModel()->bindToStream(stream);
}

TEMPLATE_TYPE
void TEMPLATE_NAME::freeCudaMem()
{
  PARENT_CLASS::freeCudaMem();
  lstm_lstm_helper_->freeCudaMem();
}

// Host-side steering derivative: rate-limited second-order parametric model
// plus an LSTM correction on the steering-rate derivative (scaled by 5.0).
// The 0.2 input scaling presumably matches training-time normalization --
// verify against the model export.
TEMPLATE_TYPE
void TEMPLATE_NAME::computeLSTMSteering(Eigen::Ref state, const Eigen::Ref& control, Eigen::Ref state_der)
{
  const float parametric_accel =
      (control(C_INDEX(STEER_CMD)) * this->params_.steer_command_angle_scale - state(S_INDEX(STEER_ANGLE))) *
      this->params_.steering_constant;
  state_der(S_INDEX(STEER_ANGLE_RATE)) =
      fmaxf(fminf((parametric_accel - state(S_INDEX(STEER_ANGLE_RATE))) * this->params_.steer_accel_constant -
                      state(S_INDEX(STEER_ANGLE_RATE)) * this->params_.steer_accel_drag_constant,
                  this->params_.max_steer_rate),
            -this->params_.max_steer_rate);
  Eigen::VectorXf input = lstm_lstm_helper_->getLSTMModel()->getZeroInputVector();
  input(0) = state(S_INDEX(STEER_ANGLE)) * 0.2f;
  input(1) = state(S_INDEX(STEER_ANGLE_RATE)) * 0.2f;
  input(2) = control(C_INDEX(STEER_CMD));
  input(3) = state_der(S_INDEX(STEER_ANGLE_RATE)) * 0.2f;  // this is the parametric part as input
  Eigen::VectorXf nn_output = lstm_lstm_helper_->getLSTMModel()->getZeroOutputVector();
  lstm_lstm_helper_->forward(input, nn_output);
  state_der(S_INDEX(STEER_ANGLE_RATE)) += nn_output(0) * 5.0f;
  state_der(S_INDEX(STEER_ANGLE)) = state(S_INDEX(STEER_ANGLE_RATE));
}

// Host-side single step: derivatives, integration, uncertainty propagation
// (with a stack-local SharedBlock), elevation settling, and output fill.
TEMPLATE_TYPE
void TEMPLATE_NAME::step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der,
                         const Eigen::Ref& control, Eigen::Ref output, const float t, const float dt)
{
  this->computeParametricDelayDeriv(state, control, state_der);
  this->computeParametricAccelDeriv(state, control, state_der, dt);
  this->computeLSTMSteering(state, control, state_der);
  // Integrate using racer_dubins updateState
  updateState(state, next_state, state_der, dt);
  SharedBlock sb;
  computeUncertaintyPropagation(state.data(), control.data(), state_der.data(), next_state.data(), dt, &this->params_,
                                &sb, nullptr);
  float roll = state(S_INDEX(ROLL));
  float pitch = state(S_INDEX(PITCH));
  RACER::computeStaticSettling>(
      this->tex_helper_, next_state(S_INDEX(YAW)), next_state(S_INDEX(POS_X)), next_state(S_INDEX(POS_Y)), roll, pitch,
      output.data());
  next_state[S_INDEX(PITCH)] = pitch;
  next_state[S_INDEX(ROLL)] = roll;
  setOutputs(state_der.data(), next_state.data(), output.data());
}

// Device-side initialization: sets up the LSTM's shared-memory region
// (offset past the uncertainty SharedBlock) and fills the initial outputs.
TEMPLATE_TYPE
__device__ void TEMPLATE_NAME::initializeDynamics(float* state, float* control, float* output, float* theta_s,
                                                  float t_0, float dt)
{
  // const int shift = PARENT_CLASS::SHARED_MEM_REQUEST_GRD_BYTES / sizeof(float);
  // if (PARENT_CLASS::SHARED_MEM_REQUEST_GRD_BYTES != 0)
  // {  // Allows us to turn on or off global or shared memory version of params
  //   DYN_PARAMS_T* shared_params = (DYN_PARAMS_T*)theta_s;
  //   *shared_params = this->params_;
  // }
  network_d_->initialize(theta_s, this->SHARED_MEM_REQUEST_BLK_BYTES, this->SHARED_MEM_REQUEST_GRD_BYTES,
                         sizeof(SharedBlock) / sizeof(float));
  setOutputs(state, state, output);
}

// Device-side steering derivative: thread y==0 computes the parametric part
// and stages the LSTM inputs; all threads run the forward pass together.
template
__device__ void RacerDubinsElevationLSTMSteeringImpl::computeLSTMSteering(float* state, float* control,
                                                                          float* state_der, DYN_PARAMS_T* params_p,
                                                                          float* theta_s, const int grd_shift,
                                                                          const int blk_shift, const int sb_shift)
{
  const uint tdy = threadIdx.y;
  // loads in the input to the network
  float* input_loc = network_d_->getInputLocation(theta_s, grd_shift, blk_shift, sb_shift);
  if (tdy == 0)
  {
    const float parametric_accel =
        (control[C_INDEX(STEER_CMD)] * params_p->steer_command_angle_scale - state[S_INDEX(STEER_ANGLE)]) *
        params_p->steering_constant;
    state_der[S_INDEX(STEER_ANGLE_RATE)] =
        fmaxf(fminf((parametric_accel - state[S_INDEX(STEER_ANGLE_RATE)]) * params_p->steer_accel_constant -
                        state[S_INDEX(STEER_ANGLE_RATE)]
* params_p->steer_accel_drag_constant, params_p->max_steer_rate), -params_p->max_steer_rate); input_loc[0] = state[S_INDEX(STEER_ANGLE)] * 0.2f; input_loc[1] = state[S_INDEX(STEER_ANGLE_RATE)] * 0.2f; input_loc[2] = control[C_INDEX(STEER_CMD)]; input_loc[3] = state_der[S_INDEX(STEER_ANGLE_RATE)] * 0.2f; // this is the parametric part as input } __syncthreads(); // runs the network float* cur_hidden_cell = network_d_->getHiddenCellLocation(theta_s, grd_shift, blk_shift, sb_shift); float* nn_output = network_d_->forward(nullptr, theta_s, cur_hidden_cell); // copies the results of the network to state derivative if (tdy == 0) { state_der[S_INDEX(STEER_ANGLE_RATE)] += nn_output[0] * 5.0f; state_der[S_INDEX(STEER_ANGLE)] = state[S_INDEX(STEER_ANGLE_RATE)]; } __syncthreads(); } TEMPLATE_TYPE __device__ inline void TEMPLATE_NAME::step(float* state, float* next_state, float* state_der, float* control, float* output, float* theta_s, const float t, const float dt) { DYN_PARAMS_T* params_p; SharedBlock* sb; // TODO below conficts in a bad way // if (PARENT_CLASS::SHARED_MEM_REQUEST_GRD_BYTES != 0) // { // Allows us to turn on or off global or shared memory version of params // params_p = (DYN_PARAMS_T*)theta_s; // } // else // { // params_p = &(this->params_); // } params_p = &(this->params_); const int grd_shift = this->SHARED_MEM_REQUEST_GRD_BYTES / sizeof(float); // grid size to shift by const int blk_shift = this->SHARED_MEM_REQUEST_BLK_BYTES * (threadIdx.x + blockDim.x * threadIdx.z) / sizeof(float); // blk size to shift by const int sb_shift = sizeof(SharedBlock) / sizeof(float); // how much to shift inside a block to lstm values if (this->SHARED_MEM_REQUEST_BLK_BYTES != 0) { float* sb_mem = &theta_s[grd_shift]; // does the grid shift sb = (SharedBlock*)(sb_mem + blk_shift); } computeParametricDelayDeriv(state, control, state_der, params_p); computeParametricAccelDeriv(state, control, state_der, dt, params_p); computeLSTMSteering(state, control, state_der, params_p, 
theta_s, grd_shift, blk_shift, sb_shift);  // completes the computeLSTMSteering call opened on the previous line
updateState(state, next_state, state_der, dt);
computeUncertaintyPropagation(state, control, state_der, next_state, dt, params_p, sb, theta_s);
// Only the first y-thread settles roll/pitch against the elevation map; results broadcast via next_state.
if (threadIdx.y == 0)
{
  float roll = state[S_INDEX(ROLL)];
  float pitch = state[S_INDEX(PITCH)];
  // NOTE(review): template arguments after computeStaticSettling were lost in extraction.
  RACER::computeStaticSettling>(
      this->tex_helper_, next_state[S_INDEX(YAW)], next_state[S_INDEX(POS_X)], next_state[S_INDEX(POS_Y)], roll,
      pitch, output);
  next_state[S_INDEX(PITCH)] = pitch;
  next_state[S_INDEX(ROLL)] = roll;
}
__syncthreads();
setOutputs(state_der, next_state, output);
}

// Re-initializes the steering LSTM's initializer network from a measured trajectory buffer.
// Returns false if any required signal is missing from the buffer.
TEMPLATE_TYPE bool TEMPLATE_NAME::updateFromBuffer(const buffer_trajectory& buffer)
{
  // NOTE(review): the element type of this vector (presumably std::string) was lost in extraction.
  std::vector keys = { "STEER_ANGLE", "STEER_ANGLE_RATE", "CAN_STEER_CMD" };
  bool missing_key = this->checkIfKeysInBuffer(buffer, keys);
  if (missing_key)
  {
    return false;
  }
  Eigen::MatrixXf init_buffer = lstm_lstm_helper_->getEmptyBufferMatrix(buffer.at("STEER_ANGLE").rows());
  // Same 0.2 input scaling used by the device-side computeLSTMSteering().
  init_buffer.row(0) = buffer.at("STEER_ANGLE") * 0.2f;
  init_buffer.row(1) = buffer.at("STEER_ANGLE_RATE") * 0.2f;
  init_buffer.row(2) = buffer.at("CAN_STEER_CMD");
  lstm_lstm_helper_->initializeLSTM(init_buffer);
  return true;
}

// Host-side dynamics initialization: clears the LSTM hidden/cell state, then defers to the parent.
TEMPLATE_TYPE void TEMPLATE_NAME::initializeDynamics(const Eigen::Ref& state, const Eigen::Ref& control,
                                                     Eigen::Ref output, float t_0, float dt)
{
  this->lstm_lstm_helper_->resetLSTMHiddenCellCPU();
  PARENT_CLASS::initializeDynamics(state, control, output, t_0, dt);
}

// Device-side Euler integration of the first 6 states, parallelized across threadIdx.y.
TEMPLATE_TYPE __device__ void TEMPLATE_NAME::updateState(float* state, float* next_state, float* state_der,
                                                         const float dt)
{
  int i;
  int tdy = threadIdx.y;
  // Add the state derivative time dt to the current state.
  for (i = tdy; i < 6; i += blockDim.y)
  {
    next_state[i] = state[i] + state_der[i] * dt;
    if (i == S_INDEX(YAW))
    {
      next_state[i] = angle_utils::normalizeAngle(next_state[i]);
    }
    if (i == S_INDEX(STEER_ANGLE))
    {
      next_state[i] = fmaxf(fminf(next_state[i], this->params_.max_steer_angle), -this->params_.max_steer_angle);
      next_state[S_INDEX(STEER_ANGLE_RATE)] =
          state[S_INDEX(STEER_ANGLE_RATE)] + state_der[S_INDEX(STEER_ANGLE_RATE)] * dt;
    }
    if (i == S_INDEX(BRAKE_STATE))
    {
      // NOTE(review): device clamps brake state to [0, 1] while the host version below clamps to
      // [0, -control_rngs_[THROTTLE_BRAKE].x] — confirm which bound is intended.
      next_state[i] = fminf(fmaxf(next_state[i], 0.0f), 1.0f);
    }
  }
}

// Host-side Euler integration mirroring the device updateState above.
TEMPLATE_TYPE void TEMPLATE_NAME::updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der,
                                              const float dt)
{
  // Segmented it to ensure that roll and pitch don't get overwritten
  for (int i = 0; i < 6; i++)
  {
    next_state[i] = state[i] + state_der[i] * dt;
  }
  next_state(S_INDEX(YAW)) = angle_utils::normalizeAngle(next_state(S_INDEX(YAW)));
  next_state(S_INDEX(STEER_ANGLE)) =
      fmaxf(fminf(next_state(S_INDEX(STEER_ANGLE)), this->params_.max_steer_angle), -this->params_.max_steer_angle);
  next_state(S_INDEX(STEER_ANGLE_RATE)) =
      state(S_INDEX(STEER_ANGLE_RATE)) + state_der(S_INDEX(STEER_ANGLE_RATE)) * dt;
  next_state(S_INDEX(BRAKE_STATE)) =
      fminf(fmaxf(next_state(S_INDEX(BRAKE_STATE)), 0.0f), -this->control_rngs_[C_INDEX(THROTTLE_BRAKE)].x);
}
#undef TEMPLATE_NAME
#undef TEMPLATE_TYPE
================================================ FILE: include/mppi/dynamics/racer_dubins/racer_dubins_elevation_lstm_steering.cuh ================================================
//
// Created by jason on 8/31/22.
//
#include "racer_dubins_elevation.cuh"
// NOTE(review): this include's header name was lost in extraction (angle-bracket text stripped).
#include

#ifndef MPPIGENERIC_RACER_DUBINS_ELEVATION_LSTM_STEERING_CUH
#define MPPIGENERIC_RACER_DUBINS_ELEVATION_LSTM_STEERING_CUH

// RACER Dubins dynamics on an elevation map with an LSTM-corrected steering model.
// NOTE(review): every template parameter/argument list in this header ("template <...>",
// "std::shared_ptr<...>", "Eigen::Ref<...>", "std::vector<...>") was stripped by extraction;
// only the bare keywords remain below.
template class RacerDubinsElevationLSTMSteeringImpl : public RacerDubinsElevationImpl
{
  static_assert(std::is_base_of::value, "Params don't inherit from RacerDubinsElevationParams.");

public:
  using PARENT_CLASS = RacerDubinsElevationImpl;
  using DYN_PARAMS_T = typename PARENT_CLASS::DYN_PARAMS_T;
  using SharedBlock = typename PARENT_CLASS::SharedBlock;
  using state_array = typename PARENT_CLASS::state_array;
  using control_array = typename PARENT_CLASS::control_array;
  using output_array = typename PARENT_CLASS::output_array;
  using buffer_trajectory = typename PARENT_CLASS::buffer_trajectory;
  using PARENT_CLASS::computeParametricAccelDeriv;
  using PARENT_CLASS::computeParametricDelayDeriv;
  using PARENT_CLASS::computeUncertaintyPropagation;
  using PARENT_CLASS::setOutputs;

  // Convenience constructor: unpacks an LSTMLSTMConfig into the dimension-wise constructor below.
  RacerDubinsElevationLSTMSteeringImpl(LSTMLSTMConfig config, cudaStream_t stream = nullptr)
    : RacerDubinsElevationLSTMSteeringImpl(config.init_config.input_dim, config.init_config.hidden_dim,
                                           config.init_config.output_layers, config.pred_config.input_dim,
                                           config.pred_config.hidden_dim, config.pred_config.output_layers,
                                           config.init_len, stream)
  {
  }
  RacerDubinsElevationLSTMSteeringImpl(int init_input_dim, int init_hidden_dim, std::vector& init_output_layers,
                                       int input_dim, int hidden_dim, std::vector& output_layers, int init_len,
                                       cudaStream_t stream = nullptr);
  // Loads network weights from a model file on disk.
  RacerDubinsElevationLSTMSteeringImpl(std::string path, cudaStream_t stream = nullptr);

  std::string getDynamicsModelName() const override
  {
    return "RACER Dubins LSTM Steering Model";
  }

  void bindToStream(cudaStream_t stream);
  void GPUSetup();
  void freeCudaMem();
  bool updateFromBuffer(const buffer_trajectory& buffer);
  // Host-side counterparts of the __device__ methods below.
  void computeLSTMSteering(Eigen::Ref state, const Eigen::Ref& control, Eigen::Ref state_der);
  void step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const Eigen::Ref& control,
            Eigen::Ref output, const float t, const float dt);
  void initializeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref output, float t_0, float dt);
  __device__ inline void step(float* state, float* next_state, float* state_der, float* control, float* output,
                              float* theta_s, const float t, const float dt);
  __device__ void initializeDynamics(float* state, float* control, float* output, float* theta_s, float t_0, float dt);
  __device__ void updateState(float* state, float* next_state, float* state_der, const float dt);
  __device__ void computeLSTMSteering(float* state, float* control, float* state_der, DYN_PARAMS_T* params_p,
                                      float* theta_s, const int grd_shift, const int blk_shift, const int sb_shift);
  void updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt);

  std::shared_ptr> getHelper()
  {
    return lstm_lstm_helper_;
  }

  // Device-side pointer to the steering LSTM (owned/managed by lstm_lstm_helper_).
  LSTMHelper<>* network_d_ = nullptr;

protected:
  std::shared_ptr> lstm_lstm_helper_;

  RacerDubinsElevationLSTMSteeringImpl(cudaStream_t stream = 0)
  {
  }
};

// Concrete instantiation used by client code.
class RacerDubinsElevationLSTMSteering : public RacerDubinsElevationLSTMSteeringImpl
{
public:
  using PARENT_CLASS = RacerDubinsElevationLSTMSteeringImpl;
  RacerDubinsElevationLSTMSteering(int init_input_dim, int init_hidden_dim, std::vector& init_output_layers,
                                   int input_dim, int hidden_dim, std::vector& output_layers, int init_len,
                                   cudaStream_t stream = nullptr)
    : PARENT_CLASS(init_input_dim, init_hidden_dim, init_output_layers, input_dim, hidden_dim, output_layers,
                   init_len, stream)
  {
  }
  RacerDubinsElevationLSTMSteering(std::string path, cudaStream_t stream = nullptr) : PARENT_CLASS(path, stream)
  {
  }

protected:
  RacerDubinsElevationLSTMSteering(cudaStream_t stream = nullptr) : PARENT_CLASS(stream)
  {
  }
};

#if __CUDACC__
#include "racer_dubins_elevation_lstm_steering.cu"
#endif
#endif  // MPPIGENERIC_RACER_DUBINS_ELEVATION_LSTM_STEERING_CUH
================================================ FILE:
include/mppi/dynamics/racer_dubins/racer_dubins_elevation_lstm_unc.cu ================================================
#include "racer_dubins_elevation_lstm_unc.cuh"

// TODO fix the clamping of the brake value
// TODO look into replacing roll/pitch values with static settled values

// Builds the three-network model (steering / mean-correction / uncertainty) from configs and sizes the
// shared-memory requests to hold all three LSTMs plus the suspension SharedBlock.
// NOTE(review): template argument lists (e.g. after std::make_shared) were lost in extraction.
RacerDubinsElevationLSTMUncertainty::RacerDubinsElevationLSTMUncertainty(LSTMLSTMConfig& steer_config,
                                                                         LSTMLSTMConfig& mean_config,
                                                                         LSTMLSTMConfig& unc_config,
                                                                         cudaStream_t stream)
  : PARENT_CLASS(steer_config, stream)
{
  this->mean_helper_ = std::make_shared>(mean_config, stream);
  this->uncertainty_helper_ = std::make_shared>(unc_config, stream);
  // Grid-level shared memory is the sum over all three networks.
  this->SHARED_MEM_REQUEST_GRD_BYTES = lstm_lstm_helper_->getLSTMModel()->getGrdSharedSizeBytes() +
                                       mean_helper_->getLSTMModel()->getGrdSharedSizeBytes() +
                                       uncertainty_helper_->getLSTMModel()->getGrdSharedSizeBytes();
  // Per-thread block shared memory: suspension scratch plus all three networks' workspaces.
  this->SHARED_MEM_REQUEST_BLK_BYTES = sizeof(SharedBlock) +
                                       lstm_lstm_helper_->getLSTMModel()->getBlkSharedSizeBytes() +
                                       mean_helper_->getLSTMModel()->getBlkSharedSizeBytes() +
                                       uncertainty_helper_->getLSTMModel()->getBlkSharedSizeBytes();
  this->requires_buffer_ = true;
}

// Loads all three networks and the parametric vehicle constants from a single npz archive.
RacerDubinsElevationLSTMUncertainty::RacerDubinsElevationLSTMUncertainty(std::string path, cudaStream_t stream)
  : PARENT_CLASS(stream)
{
  if (!fileExists(path))
  {
    std::cerr << "Could not load neural net model at path: " << path.c_str();
    exit(-1);
  }
  lstm_lstm_helper_ = std::make_shared>(path, "steering/model/", stream);
  uncertainty_helper_ = std::make_shared>(path, "terra/uncertainty_network/", stream);
  mean_helper_ = std::make_shared>(path, "terra/mean_network/", stream);
  // empty path means that the parent class will not load things
  cnpy::npz_t param_dict = cnpy::npz_load(path);
  // loads the steering constants
  this->params_.max_steer_rate = param_dict.at("steering/parameters/max_rate_pos").data()[0];
  this->params_.steering_constant = param_dict.at("steering/parameters/constant").data()[0];
  this->params_.steer_accel_constant = param_dict.at("steering/parameters/accel_constant").data()[0];
  this->params_.steer_accel_drag_constant = param_dict.at("steering/parameters/accel_drag_constant").data()[0];
  this->params_.max_brake_rate_pos = param_dict.at("brake/parameters/max_rate_pos").data()[0];
  this->params_.max_brake_rate_neg = param_dict.at("brake/parameters/max_rate_neg").data()[0];
  // 3-term throttle/brake/velocity-drag polynomials and the piecewise-quadratic brake constants.
  for (int i = 0; i < 3; i++)
  {
    this->params_.c_t[i] = param_dict.at("terra/parameters/c_t").data()[i];
    this->params_.c_b[i] = param_dict.at("terra/parameters/c_b").data()[i];
    this->params_.c_v[i] = param_dict.at("terra/parameters/c_v").data()[i];
    this->params_.pos_quad_brake_c[i] = param_dict.at("brake/parameters/constant").data()[i];
    this->params_.neg_quad_brake_c[i] = param_dict.at("brake/parameters/negative_constant").data()[i];
  }
  // Hard-coded vehicle constants (not present in the npz archive).
  this->params_.clamp_ax = 1000.0f;
  this->params_.wheel_base = 3.0f;
  this->params_.c_0 = 0.0f;
  this->params_.max_steer_angle = 5.0f;
  this->params_.low_min_throttle = 0.0f;
  this->params_.gravity = param_dict.at("terra/parameters/gravity").data()[0];
  this->params_.steer_angle_scale = param_dict.at("terra/parameters/steer_angle_scale").data()[0];
  for (int i = 0; i < uncertainty_helper_->getLSTMModel()->getOutputDim(); i++)
  {
    this->params_.unc_scale[i] = param_dict.at("terra/parameters/uncertainty_scale").data()[i];
  }
  // Same shared-memory sizing as the config-based constructor above.
  this->SHARED_MEM_REQUEST_GRD_BYTES = lstm_lstm_helper_->getLSTMModel()->getGrdSharedSizeBytes() +
                                       mean_helper_->getLSTMModel()->getGrdSharedSizeBytes() +
                                       uncertainty_helper_->getLSTMModel()->getGrdSharedSizeBytes();
  this->SHARED_MEM_REQUEST_BLK_BYTES = sizeof(SharedBlock) +
                                       lstm_lstm_helper_->getLSTMModel()->getBlkSharedSizeBytes() +
                                       mean_helper_->getLSTMModel()->getBlkSharedSizeBytes() +
                                       uncertainty_helper_->getLSTMModel()->getBlkSharedSizeBytes();
  this->requires_buffer_ = true;
}

// Uploads the uncertainty and mean networks to the GPU and caches their device pointers
// (continues on the next chunk line; the parent handles the steering network).
void RacerDubinsElevationLSTMUncertainty::GPUSetup()
{
  uncertainty_helper_->GPUSetup();
  this->uncertainty_d_ = uncertainty_helper_->getLSTMDevicePtr();
  mean_helper_->GPUSetup();
  this->mean_d_ =
mean_helper_->getLSTMDevicePtr();  // completes GPUSetup(): cache the mean network's device pointer
  PARENT_CLASS::GPUSetup();
}

void RacerDubinsElevationLSTMUncertainty::bindToStream(cudaStream_t stream)
{
  PARENT_CLASS::bindToStream(stream);
  uncertainty_helper_->getLSTMModel()->bindToStream(stream);
  mean_helper_->getLSTMModel()->bindToStream(stream);
}

void RacerDubinsElevationLSTMUncertainty::freeCudaMem()
{
  uncertainty_helper_->freeCudaMem();
  mean_helper_->freeCudaMem();
  PARENT_CLASS::freeCudaMem();
}

// Re-initializes the mean and uncertainty LSTM initializer networks from measured history; the parent
// handles the steering network. Returns false if any required signal is missing.
bool RacerDubinsElevationLSTMUncertainty::updateFromBuffer(const buffer_trajectory& buffer)
{
  bool parent_success = PARENT_CLASS::updateFromBuffer(buffer);
  // NOTE(review): the element type of this vector (presumably std::string) was lost in extraction.
  std::vector keys = { "STEER_ANGLE",      "STEER_ANGLE_RATE", "CAN_STEER_CMD", "ROLL",          "PITCH",
                       "CAN_THROTTLE_CMD", "VEL_X",            "BRAKE_STATE",   "CAN_BRAKE_CMD", "OMEGA_Z" };
  bool missing_key = this->checkIfKeysInBuffer(buffer, keys);
  if (missing_key || !parent_success)
  {
    return false;
  }
  Eigen::MatrixXf mean_init_buffer = mean_helper_->getEmptyBufferMatrix(buffer.at("VEL_X").rows());
  mean_init_buffer.row(0) = buffer.at("VEL_X");
  mean_init_buffer.row(1) = buffer.at("OMEGA_Z");
  mean_init_buffer.row(2) = buffer.at("BRAKE_STATE");
  mean_init_buffer.row(3) = buffer.at("STEER_ANGLE");
  mean_init_buffer.row(4) = buffer.at("STEER_ANGLE_RATE");
  mean_init_buffer.row(5) = buffer.at("CAN_THROTTLE_CMD");
  mean_init_buffer.row(6) = buffer.at("CAN_BRAKE_CMD");
  mean_init_buffer.row(7) = buffer.at("CAN_STEER_CMD");
  mean_init_buffer.row(8) = buffer.at("PITCH").unaryExpr(&sinf);
  // TODO option here if problem
  // mean_init_buffer.row(9) = buffer.at("ROLL").unaryExpr(&sinf);
  mean_helper_->initializeLSTM(mean_init_buffer);
  // The uncertainty network additionally receives sin(ROLL) as row 9.
  Eigen::MatrixXf unc_init_buffer = uncertainty_helper_->getEmptyBufferMatrix(buffer.at("VEL_X").rows());
  unc_init_buffer.row(0) = buffer.at("VEL_X");
  unc_init_buffer.row(1) = buffer.at("OMEGA_Z");
  unc_init_buffer.row(2) = buffer.at("BRAKE_STATE");
  unc_init_buffer.row(3) = buffer.at("STEER_ANGLE");
  unc_init_buffer.row(4) = buffer.at("STEER_ANGLE_RATE");
  unc_init_buffer.row(5) = buffer.at("CAN_THROTTLE_CMD");
  unc_init_buffer.row(6) = buffer.at("CAN_BRAKE_CMD");
  unc_init_buffer.row(7) = buffer.at("CAN_STEER_CMD");
  unc_init_buffer.row(8) = buffer.at("PITCH").unaryExpr(&sinf);
  unc_init_buffer.row(9) = buffer.at("ROLL").unaryExpr(&sinf);
  uncertainty_helper_->initializeLSTM(unc_init_buffer);
  return true;
}

// Builds a full state vector from named plant signals; returns a NaN-filled state if any key is missing.
RacerDubinsElevationLSTMUncertainty::state_array
RacerDubinsElevationLSTMUncertainty::stateFromMap(const std::map& map)
{
  std::vector needed_keys = { "VEL_X",       "VEL_Z",            "POS_X",       "POS_Y",   "POS_Z",
                              "OMEGA_X",     "OMEGA_Y",          "ROLL",        "PITCH",   "YAW",
                              "STEER_ANGLE", "STEER_ANGLE_RATE", "BRAKE_STATE", "OMEGA_Z" };
  std::vector uncertainty_keys = { "UNCERTAINTY_POS_X",     "UNCERTAINTY_POS_Y",       "UNCERTAINTY_YAW",
                                   "UNCERTAINTY_VEL_X",     "UNCERTAINTY_POS_X_Y",     "UNCERTAINTY_POS_X_YAW",
                                   "UNCERTAINTY_POS_X_VEL_X", "UNCERTAINTY_POS_Y_YAW", "UNCERTAINTY_POS_Y_VEL_X",
                                   "UNCERTAINTY_YAW_VEL_X" };
  // Compile-time switch: uncertainty states are currently zero-initialized instead of read from the map.
  const bool use_uncertainty = false;
  if (use_uncertainty)
  {
    needed_keys.insert(needed_keys.end(), uncertainty_keys.begin(), uncertainty_keys.end());
  }
  bool missing_key = false;
  state_array s = state_array::Zero();
  for (const auto& key : needed_keys)
  {
    if (map.find(key) == map.end())
    {  // Print out all missing keys
      std::cout << "Could not find key " << key << " for elevation with simple suspension dynamics."
<< std::endl;  // completes the missing-key diagnostic begun on the previous line
      missing_key = true;
    }
  }
  if (missing_key)
  {
    return state_array::Constant(NAN);
  }
  s(S_INDEX(POS_X)) = map.at("POS_X");
  s(S_INDEX(POS_Y)) = map.at("POS_Y");
  s(S_INDEX(CG_POS_Z)) = map.at("POS_Z");
  s(S_INDEX(VEL_X)) = map.at("VEL_X");
  // Rotate body-frame vertical velocity through pitch into the inertial frame, then move it to the CG.
  float bl_v_I_z = map.at("VEL_Z") * cosf(map.at("PITCH")) - map.at("VEL_X") * sinf(map.at("PITCH"));
  s(S_INDEX(CG_VEL_I_Z)) = bl_v_I_z - map.at("OMEGA_Y") * this->params_.c_g.x;
  s(S_INDEX(STEER_ANGLE)) = map.at("STEER_ANGLE");
  s(S_INDEX(STEER_ANGLE_RATE)) = map.at("STEER_ANGLE_RATE");
  s(S_INDEX(ROLL)) = map.at("ROLL");
  s(S_INDEX(PITCH)) = map.at("PITCH");
  s(S_INDEX(YAW)) = map.at("YAW");
  s(S_INDEX(OMEGA_Z)) = map.at("OMEGA_Z");
  // Set z position to cg's z position in world frame
  float3 rotation = make_float3(s(S_INDEX(ROLL)), s(S_INDEX(PITCH)), s(S_INDEX(YAW)));
  float3 world_pose = make_float3(s(S_INDEX(POS_X)), s(S_INDEX(POS_Y)), s(S_INDEX(CG_POS_Z)));
  float3 cg_in_world_frame;
  mppi::math::bodyOffsetToWorldPoseEuler(this->params_.c_g, world_pose, rotation, cg_in_world_frame);
  s(S_INDEX(CG_POS_Z)) = cg_in_world_frame.z;
  s(S_INDEX(ROLL_RATE)) = map.at("OMEGA_X");
  s(S_INDEX(PITCH_RATE)) = map.at("OMEGA_Y");
  s(S_INDEX(BRAKE_STATE)) = map.at("BRAKE_STATE");
  if (use_uncertainty)
  {
    s(S_INDEX(UNCERTAINTY_POS_X)) = map.at("UNCERTAINTY_POS_X");
    s(S_INDEX(UNCERTAINTY_POS_Y)) = map.at("UNCERTAINTY_POS_Y");
    s(S_INDEX(UNCERTAINTY_YAW)) = map.at("UNCERTAINTY_YAW");
    s(S_INDEX(UNCERTAINTY_VEL_X)) = map.at("UNCERTAINTY_VEL_X");
    s(S_INDEX(UNCERTAINTY_POS_X_Y)) = map.at("UNCERTAINTY_POS_X_Y");
    s(S_INDEX(UNCERTAINTY_POS_X_YAW)) = map.at("UNCERTAINTY_POS_X_YAW");
    s(S_INDEX(UNCERTAINTY_POS_X_VEL_X)) = map.at("UNCERTAINTY_POS_X_VEL_X");
    s(S_INDEX(UNCERTAINTY_POS_Y_YAW)) = map.at("UNCERTAINTY_POS_Y_YAW");
    s(S_INDEX(UNCERTAINTY_POS_Y_VEL_X)) = map.at("UNCERTAINTY_POS_Y_VEL_X");
    s(S_INDEX(UNCERTAINTY_YAW_VEL_X)) = map.at("UNCERTAINTY_YAW_VEL_X");
  }
  // Floor the covariance diagonal so propagation never starts from an exactly-zero variance.
  float eps = 1e-6f;
  if (s(S_INDEX(UNCERTAINTY_POS_X)) < eps)
  {
    s(S_INDEX(UNCERTAINTY_POS_X)) = eps;
  }
  if (s(S_INDEX(UNCERTAINTY_POS_Y)) < eps)
  {
    s(S_INDEX(UNCERTAINTY_POS_Y)) = eps;
  }
  if (s(S_INDEX(UNCERTAINTY_YAW)) < eps)
  {
    s(S_INDEX(UNCERTAINTY_YAW)) = eps;
  }
  if (s(S_INDEX(UNCERTAINTY_VEL_X)) < eps)
  {
    s(S_INDEX(UNCERTAINTY_VEL_X)) = eps;
  }
  return s;
}

// Host-side initialization: resets the mean/uncertainty LSTMs, then defers to the parent (steering reset).
void RacerDubinsElevationLSTMUncertainty::initializeDynamics(const Eigen::Ref& state, const Eigen::Ref& control,
                                                             Eigen::Ref output, float t_0, float dt)
{
  uncertainty_helper_->resetLSTMHiddenCellCPU();
  mean_helper_->resetLSTMHiddenCellCPU();
  PARENT_CLASS::initializeDynamics(state, control, output, t_0, dt);
}

// Host-side single step: quadratic brake model + parametric accel + LSTM steering + simple suspension,
// with an optional LSTM mean correction on the VEL_X/YAW derivatives when driving forward.
void RacerDubinsElevationLSTMUncertainty::step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der,
                                               const Eigen::Ref& control, Eigen::Ref output, const float t,
                                               const float dt)
{
  // this is the new brake model with quadratics
  bool enable_brake = control(C_INDEX(THROTTLE_BRAKE)) < 0.0f;
  const float brake_error = (enable_brake * -control(C_INDEX(THROTTLE_BRAKE)) - state(S_INDEX(BRAKE_STATE)));
  state_der(S_INDEX(BRAKE_STATE)) =
      fminf(fmaxf((brake_error > 0) * (brake_error * this->params_.pos_quad_brake_c[0] +
                                       brake_error * fabsf(brake_error) * this->params_.pos_quad_brake_c[1]) +
                      (brake_error < 0) * (brake_error * this->params_.neg_quad_brake_c[0] +
                                           brake_error * fabsf(brake_error) * this->params_.neg_quad_brake_c[1]),
                  -this->params_.max_brake_rate_neg),
            this->params_.max_brake_rate_pos);
  this->computeParametricAccelDeriv(state, control, state_der, dt);
  this->computeLSTMSteering(state, control, state_der);
  this->computeSimpleSuspensionStep(state, state_der, output);
  // only on in reverse, default back to parametric in reverse
  if (this->params_.gear_sign == 1)
  {
    // compute the mean LSTM
    Eigen::VectorXf input = mean_helper_->getLSTMModel()->getZeroInputVector();
    input(0) = state(S_INDEX(VEL_X));
    input(1) = state(S_INDEX(OMEGA_Z));
    input(2) = state(S_INDEX(BRAKE_STATE));
    input(3) = state(S_INDEX(STEER_ANGLE));
    input(4) = state(S_INDEX(STEER_ANGLE_RATE));
    input(5) = control(C_INDEX(THROTTLE_BRAKE)) >= 0.0f ?
control(C_INDEX(THROTTLE_BRAKE)) : 0.0f;  // completes input(5): throttle command magnitude
    input(6) = control(C_INDEX(THROTTLE_BRAKE)) <= 0.0f ? -control(C_INDEX(THROTTLE_BRAKE)) : 0.0f;
    input(7) = control(C_INDEX(STEER_CMD));
    input(8) = sinf(state(S_INDEX(STATIC_PITCH)));
    input(9) = state_der(S_INDEX(VEL_X));  // parametric accel, fed back to the network
    input(10) = state_der(S_INDEX(YAW));   // parametric yaw rate
    Eigen::VectorXf mean_output = mean_helper_->getLSTMModel()->getZeroOutputVector();
    mean_helper_->forward(input, mean_output);
    state_der(S_INDEX(VEL_X)) += mean_output(0);
    state_der(S_INDEX(YAW)) += mean_output(1);
  }
  next_state[S_INDEX(OMEGA_Z)] = state_der[S_INDEX(YAW)];
  // Integrate using racer_dubins updateState
  updateState(state, next_state, state_der, dt);
  SharedBlock sb;  // CPU path: uncertainty scratch on the stack
  computeUncertaintyPropagation(state.data(), control.data(), state_der.data(), next_state.data(), dt, &this->params_,
                                &sb, nullptr);
  float roll = state(S_INDEX(STATIC_ROLL));
  float pitch = state(S_INDEX(STATIC_PITCH));
  // NOTE(review): template arguments after computeStaticSettling were lost in extraction.
  RACER::computeStaticSettling>(
      this->tex_helper_, next_state(S_INDEX(YAW)), next_state(S_INDEX(POS_X)), next_state(S_INDEX(POS_Y)), roll,
      pitch, output.data());
  next_state[S_INDEX(STATIC_PITCH)] = pitch;
  next_state[S_INDEX(STATIC_ROLL)] = roll;
  this->setOutputs(state_der.data(), next_state.data(), output.data());
}

// Fills the process-noise matrix Q (UNCERTAINTY_DIM x UNCERTAINTY_DIM, column-major) from the uncertainty
// LSTM's sigmoid-scaled outputs. Shared between host and device via the __CUDA_ARCH__ split below.
__device__ __host__ bool RacerDubinsElevationLSTMUncertainty::computeQ(const float* state, const float* control,
                                                                       const float* state_der, float* Q,
                                                                       RacerDubinsElevationUncertaintyParams* params_p,
                                                                       float* theta_s)
{
  // in reverse just use base level implementation
  // NOTE(review): there is no early return after delegating to the parent, so the parent's Q appears to be
  // overwritten by the network-driven fill below even in reverse — confirm this is intended.
  if (params_p->gear_sign == -1)
  {
    PARENT_CLASS::computeQ(state, control, state_der, Q, params_p, theta_s);
  }
  // TODO make the roll and pitch the ones from static settling, not the ones calculated online
  float sin_yaw, cos_yaw;
  float* uncertainty_output;
  int step, pi;
  // NOTE(review): the template argument of getParallel1DIndex was lost in extraction.
  mp1::getParallel1DIndex(pi, step);
#ifdef __CUDA_ARCH__
  const float yaw_norm = angle_utils::normalizeAngle(state[S_INDEX(YAW)]);
  __sincosf(yaw_norm, &sin_yaw, &cos_yaw);
  // Device path: locate this thread's slice of shared memory behind the steering and mean networks.
  const int grd_shift = this->SHARED_MEM_REQUEST_GRD_BYTES / sizeof(float);  // grid size to shift by
  const int blk_shift = this->SHARED_MEM_REQUEST_BLK_BYTES * (threadIdx.x + blockDim.x * threadIdx.z) /
                        sizeof(float);  // blk size to shift by
  const int sb_shift = sizeof(SharedBlock) / sizeof(float) + network_d_->getBlkSharedSizeBytes() / sizeof(float) +
                       mean_d_->getBlkSharedSizeBytes() /
                           sizeof(float);  // how much to shift inside a block to lstm values
  float* input_loc = uncertainty_d_->getInputLocation(theta_s, grd_shift, blk_shift, sb_shift);
  for (int i = pi; i < uncertainty_d_->getInputDim(); i += step)
  {
    switch (i)
    {
      case 0:  // vx
        input_loc[i] = state[S_INDEX(VEL_X)];
        break;
      case 1:  // omega
        input_loc[i] = state[S_INDEX(OMEGA_Z)];
        break;
      case 2:  // brake state
        input_loc[i] = state[S_INDEX(BRAKE_STATE)];
        break;
      case 3:  // shaft angle
        input_loc[i] = state[S_INDEX(STEER_ANGLE)];
        break;
      case 4:  // shaft velocity
        input_loc[i] = state[S_INDEX(STEER_ANGLE_RATE)];
        break;
      case 5:  // throttle_cmd
        input_loc[i] = control[C_INDEX(THROTTLE_BRAKE)] >= 0.0f ? control[C_INDEX(THROTTLE_BRAKE)] : 0.0f;
        break;
      case 6:  // brake_cmd
        input_loc[i] = control[C_INDEX(THROTTLE_BRAKE)] <= 0.0f ? -control[C_INDEX(THROTTLE_BRAKE)] : 0.0f;
        break;
      case 7:  // steering cmd
        input_loc[i] = control[C_INDEX(STEER_CMD)];
        break;
      case 8:  // sin roll
        input_loc[i] = __sinf(state[S_INDEX(STATIC_ROLL)]);
        break;
      case 9:  // sin pitch
        input_loc[i] = __sinf(state[S_INDEX(STATIC_PITCH)]);
        break;
      case 10:  // accel x
        input_loc[i] = state_der[S_INDEX(VEL_X)];
        break;
      case 11:  // yaw rate calculation
        input_loc[i] = state_der[S_INDEX(YAW)];
        break;
    }
  }
  __syncthreads();
  float* cur_hidden_cell = uncertainty_d_->getHiddenCellLocation(theta_s, grd_shift, blk_shift, sb_shift);
  // theta_s offset skips the steering and mean networks' grid-level regions.
  uncertainty_output = uncertainty_d_->forward(
      nullptr, theta_s + (network_d_->getGrdSharedSizeBytes() + mean_d_->getGrdSharedSizeBytes()) / sizeof(float),
      cur_hidden_cell);
  int output_dim = uncertainty_d_->getOutputDim();
#else
  sincosf(state[S_INDEX(YAW)], &sin_yaw, &cos_yaw);
  Eigen::VectorXf input = uncertainty_helper_->getLSTMModel()->getZeroInputVector();
  input(0) = state[S_INDEX(VEL_X)];
  input(1) = state[S_INDEX(OMEGA_Z)];
  input(2) = state[S_INDEX(BRAKE_STATE)];
  input(3) = state[S_INDEX(STEER_ANGLE)];
  input(4) = state[S_INDEX(STEER_ANGLE_RATE)];
  input(5) = control[C_INDEX(THROTTLE_BRAKE)] >= 0.0f ? control[C_INDEX(THROTTLE_BRAKE)] : 0.0f;
  // NOTE(review): host uses "< 0.0f" here where the device branch uses "<= 0.0f"; the boundary input 0
  // yields 0 either way, but flagged for consistency.
  input(6) = control[C_INDEX(THROTTLE_BRAKE)] < 0.0f ?
-control[C_INDEX(THROTTLE_BRAKE)] : 0.0f;  // completes input(6): brake command magnitude
  input(7) = control[C_INDEX(STEER_CMD)];
  input(8) = sinf(state[S_INDEX(STATIC_ROLL)]);
  input(9) = sinf(state[S_INDEX(STATIC_PITCH)]);
  input(10) = state_der[S_INDEX(VEL_X)];  // these should be using the modified means
  input(11) = state_der[S_INDEX(YAW)];
  Eigen::VectorXf uncertainty_output_eig = uncertainty_helper_->getLSTMModel()->getZeroOutputVector();
  uncertainty_helper_->forward(input, uncertainty_output_eig);
  // TODO need to sigmoid and scale
  uncertainty_output = uncertainty_output_eig.data();
  int output_dim = uncertainty_helper_->getLSTMModel()->getOutputDim();
#endif
  // Squash each raw network output through a sigmoid and scale it by the per-output unc_scale.
  for (int i = pi; i < output_dim; i += step)
  {
    uncertainty_output[i] = fabsf(mppi::nn::sigmoid(uncertainty_output[i]) * params_p->unc_scale[i]);
  }
#ifdef __CUDA_ARCH__
  __syncthreads();
#endif
  // Selects which c_b coefficient applies by speed regime: [0, 0.2], (0.2, 3], (3, inf).
  const float linear_brake_slope = 0.2f;
  const int index = (fabsf(state[S_INDEX(VEL_X)]) > linear_brake_slope && fabsf(state[S_INDEX(VEL_X)]) <= 3.0f) +
                    (fabsf(state[S_INDEX(VEL_X)]) > 3.0f) * 2;
  // TODO add in additional uncertainty based on output dim of network
  for (int i = pi; i < UNCERTAINTY_DIM * UNCERTAINTY_DIM; i += step)
  {
    switch (i)
    {
      // vel_x
      case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(VEL_X), UNCERTAINTY_DIM):
        Q[i] = uncertainty_output[0] +
               SQ(params_p->c_b[index] * (index == 0 ? state[S_INDEX(VEL_X)] : 1.0f)) * uncertainty_output[4];
        if (output_dim == 7)
        {
          Q[i] += uncertainty_output[5];  // additional Q_vx
        }
        break;
      case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(YAW), UNCERTAINTY_DIM):
        Q[i] = 0.0f;
        break;
      case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(POS_X), UNCERTAINTY_DIM):
        Q[i] = 0.0f;
        break;
      case mm::columnMajorIndex(U_INDEX(VEL_X), U_INDEX(POS_Y), UNCERTAINTY_DIM):
        Q[i] = 0.0f;
        break;
      // yaw
      case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(VEL_X), UNCERTAINTY_DIM):
        Q[i] = 0.0f;
        break;
      case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(YAW), UNCERTAINTY_DIM):
#ifdef __CUDA_ARCH__
        Q[i] = uncertainty_output[1] +
               SQ((state[S_INDEX(VEL_X)] / params_p->wheel_base) * 1.0f /
                  (SQ(__cosf(state[S_INDEX(STEER_ANGLE)] / params_p->steer_angle_scale)) *
                   params_p->steer_angle_scale)) *
                   uncertainty_output[3];
#else
        Q[i] = uncertainty_output[1] +
               SQ((state[S_INDEX(VEL_X)] / params_p->wheel_base) * 1.0f /
                  (SQ(cosf(state[S_INDEX(STEER_ANGLE)] / params_p->steer_angle_scale)) *
                   params_p->steer_angle_scale)) *
                   uncertainty_output[3];
#endif
        if (output_dim == 7)
        {
          Q[i] += uncertainty_output[6];  // additional Q_yaw
        }
        break;
      case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(POS_X), UNCERTAINTY_DIM):
        Q[i] = 0.0f;
        break;
      case mm::columnMajorIndex(U_INDEX(YAW), U_INDEX(POS_Y), UNCERTAINTY_DIM):
        Q[i] = 0.0f;
        break;
      // pos x
      case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(VEL_X), UNCERTAINTY_DIM):
        Q[i] = 0.0f;
        break;
      case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(YAW), UNCERTAINTY_DIM):
        Q[i] = 0.0f;
        break;
      case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(POS_X), UNCERTAINTY_DIM):
        Q[i] = uncertainty_output[2] * sin_yaw * sin_yaw;
        break;
      case mm::columnMajorIndex(U_INDEX(POS_X), U_INDEX(POS_Y), UNCERTAINTY_DIM):
        Q[i] = -uncertainty_output[2] * sin_yaw * cos_yaw;
        break;
      // pos y
      case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(VEL_X), UNCERTAINTY_DIM):
        Q[i] = 0.0f;
        break;
      case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(YAW), UNCERTAINTY_DIM):
        Q[i] = 0.0f;
        break;
      case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_Y), UNCERTAINTY_DIM):
        Q[i] = uncertainty_output[2] * cos_yaw * cos_yaw;
        break;
      case mm::columnMajorIndex(U_INDEX(POS_Y), U_INDEX(POS_X), UNCERTAINTY_DIM):
        Q[i] = -uncertainty_output[2] * sin_yaw * cos_yaw;
        break;
    }
  }
  return true;
}

// Device-side step: mirrors the host step() above but runs the mean LSTM in shared memory.
__device__ void RacerDubinsElevationLSTMUncertainty::step(float* state, float* next_state, float* state_der,
                                                          float* control, float* output, float* theta_s, const float t,
                                                          const float dt)
{
  DYN_PARAMS_T* params_p = &(this->params_);
  // Per-thread shared-memory layout: [SharedBlock | steering LSTM | mean LSTM | uncertainty LSTM].
  const int grd_shift = this->SHARED_MEM_REQUEST_GRD_BYTES / sizeof(float);  // grid size to shift by
  const int blk_shift = this->SHARED_MEM_REQUEST_BLK_BYTES * (threadIdx.x + blockDim.x * threadIdx.z) /
                        sizeof(float);  // blk size to shift by
  float* sb_mem = &theta_s[grd_shift];  // does the grid shift
  SharedBlock* sb = (SharedBlock*)(sb_mem + blk_shift);
  const int sb_shift = sizeof(SharedBlock) / sizeof(float) +
                       network_d_->getBlkSharedSizeBytes() /
                           sizeof(float);  // how much to shift inside a block to lstm values
  // this is the new brake model with quadratics
  bool enable_brake = control[C_INDEX(THROTTLE_BRAKE)] < 0.0f;
  const float brake_error = (enable_brake * -control[C_INDEX(THROTTLE_BRAKE)] - state[S_INDEX(BRAKE_STATE)]);
  state_der[S_INDEX(BRAKE_STATE)] =
      fminf(fmaxf((brake_error > 0) * (brake_error * params_p->pos_quad_brake_c[0] +
                                       brake_error * fabsf(brake_error) * params_p->pos_quad_brake_c[1]) +
                      (brake_error < 0) * (brake_error * params_p->neg_quad_brake_c[0] +
                                           brake_error * fabsf(brake_error) * params_p->neg_quad_brake_c[1]),
                  -params_p->max_brake_rate_neg),
            params_p->max_brake_rate_pos);
  computeParametricAccelDeriv(state, control, state_der, dt, params_p);
  computeLSTMSteering(state, control, state_der, params_p, theta_s, grd_shift, blk_shift,
                      sizeof(SharedBlock) / sizeof(float));
  computeSimpleSuspensionStep(state, state_der, output, params_p, theta_s);
  // TODO use the settling pitch and roll not the state based one
  // only on in reverse,
// default back to parametric in reverse (comment continued from the previous line)
  if (params_p->gear_sign == 1)
  {
    // computes the mean compensation LSTM
    float* input_loc = mean_d_->getInputLocation(theta_s, grd_shift, blk_shift, sb_shift);
    int pi, step;
    // NOTE(review): the template argument of getParallel1DIndex was lost in extraction.
    mppi::p1::getParallel1DIndex(pi, step);
    for (int i = pi; i < mean_d_->getInputDim(); i += step)
    {
      switch (i)
      {
        case 0:  // vx
          input_loc[i] = state[S_INDEX(VEL_X)];
          break;
        case 1:  // omega
          input_loc[i] = state[S_INDEX(OMEGA_Z)];
          break;
        case 2:  // brake state
          input_loc[i] = state[S_INDEX(BRAKE_STATE)];
          break;
        case 3:  // shaft angle
          input_loc[i] = state[S_INDEX(STEER_ANGLE)];
          break;
        case 4:  // shaft velocity
          input_loc[i] = state[S_INDEX(STEER_ANGLE_RATE)];
          break;
        case 5:  // throttle_cmd
          input_loc[i] = control[C_INDEX(THROTTLE_BRAKE)] >= 0.0f ? control[C_INDEX(THROTTLE_BRAKE)] : 0.0f;
          break;
        case 6:  // brake_cmd
          input_loc[i] = control[C_INDEX(THROTTLE_BRAKE)] <= 0.0f ? -control[C_INDEX(THROTTLE_BRAKE)] : 0.0f;
          break;
        case 7:  // steering cmd
          input_loc[i] = control[C_INDEX(STEER_CMD)];
          break;
        case 8:  // sin pitch
#ifdef __CUDA_ARCH__
          input_loc[i] = __sinf(state[S_INDEX(STATIC_PITCH)]);
#else
          input_loc[i] = sinf(state[S_INDEX(STATIC_PITCH)]);
#endif
          break;
        case 9:  // accel x
          input_loc[i] = state_der[S_INDEX(VEL_X)];
          break;
        case 10:  // yaw rate calculation
          input_loc[i] = state_der[S_INDEX(YAW)];
          break;
      }
    }
    __syncthreads();
    float* cur_hidden_cell = mean_d_->getHiddenCellLocation(theta_s, grd_shift, blk_shift, sb_shift);
    // theta_s offset skips the steering network's grid-level region.
    float* mean_output =
        mean_d_->forward(nullptr, theta_s + network_d_->getGrdSharedSizeBytes() / sizeof(float), cur_hidden_cell);
    if (threadIdx.y == 0)
    {
      state_der[S_INDEX(VEL_X)] += mean_output[0];
      state_der[S_INDEX(YAW)] += mean_output[1];
      next_state[S_INDEX(OMEGA_Z)] = state_der[S_INDEX(YAW)];
    }
    __syncthreads();
  }
  next_state[S_INDEX(OMEGA_Z)] = state_der[S_INDEX(YAW)];
  updateState(state, next_state, state_der, dt);
  computeUncertaintyPropagation(state, control, state_der, next_state, dt, params_p, sb, theta_s);
  float roll = state[S_INDEX(STATIC_ROLL)];
  float pitch = state[S_INDEX(STATIC_PITCH)];
  // NOTE(review): template arguments after computeStaticSettling were lost in extraction.
  RACER::computeStaticSettling>(
      this->tex_helper_, next_state[S_INDEX(YAW)], next_state[S_INDEX(POS_X)], next_state[S_INDEX(POS_Y)], roll,
      pitch, output);
  next_state[S_INDEX(STATIC_PITCH)] = pitch;
  next_state[S_INDEX(STATIC_ROLL)] = roll;
  //__syncthreads();
  this->setOutputs(state_der, next_state, output);
}

// Device-side initialization: parent sets up the steering LSTM, then the mean and uncertainty LSTMs are
// pointed at their own slices of the per-thread shared-memory block.
__device__ void RacerDubinsElevationLSTMUncertainty::initializeDynamics(float* state, float* control, float* output,
                                                                        float* theta_s, float t_0, float dt)
{
  PARENT_CLASS::initializeDynamics(state, control, output, theta_s, t_0, dt);
  int blk_shift = sizeof(SharedBlock) / sizeof(float) + network_d_->getBlkSharedSizeBytes() / sizeof(float);
  mean_d_->initialize(theta_s, this->SHARED_MEM_REQUEST_BLK_BYTES, this->SHARED_MEM_REQUEST_GRD_BYTES, blk_shift,
                      network_d_->getGrdSharedSizeBytes());
  blk_shift = sizeof(SharedBlock) / sizeof(float) + network_d_->getBlkSharedSizeBytes() / sizeof(float) +
              mean_d_->getBlkSharedSizeBytes() / sizeof(float);
  uncertainty_d_->initialize(theta_s, this->SHARED_MEM_REQUEST_BLK_BYTES, this->SHARED_MEM_REQUEST_GRD_BYTES,
                             blk_shift, network_d_->getGrdSharedSizeBytes() + mean_d_->getGrdSharedSizeBytes());
}
================================================ FILE: include/mppi/dynamics/racer_dubins/racer_dubins_elevation_lstm_unc.cuh ================================================
#include "racer_dubins_elevation_suspension_lstm.cuh"

#ifndef MPPIGENERIC_RACER_DUBINS_ELEVATION_LSTM_UNC_CUH
#define MPPIGENERIC_RACER_DUBINS_ELEVATION_LSTM_UNC_CUH

// Parameter struct adding uncertainty output scaling and the piecewise-quadratic brake constants to the
// suspension params, and extending the state enum with static-settling attitude and covariance entries.
struct RacerDubinsElevationUncertaintyParams : public RacerDubinsElevationSuspensionParams
{
  enum class StateIndex : int
  {
    VEL_X = 0,
    YAW,
    POS_X,
    POS_Y,
    STEER_ANGLE,
    BRAKE_STATE,
    ROLL,
    PITCH,
    CG_POS_Z,
    CG_VEL_I_Z,
    ROLL_RATE,
    PITCH_RATE,
    STEER_ANGLE_RATE,
    OMEGA_Z,
    STATIC_ROLL,
    STATIC_PITCH,
    UNCERTAINTY_POS_X,
    UNCERTAINTY_POS_Y,
    UNCERTAINTY_YAW,
    UNCERTAINTY_VEL_X,
    UNCERTAINTY_POS_X_Y,
    UNCERTAINTY_POS_X_YAW,
    UNCERTAINTY_POS_X_VEL_X,
    UNCERTAINTY_POS_Y_YAW,
    UNCERTAINTY_POS_Y_VEL_X,
    UNCERTAINTY_YAW_VEL_X,
    NUM_STATES
  };
  // Per-output scale applied after the sigmoid in computeQ().
  float unc_scale[7] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
  // NOTE(review): 0.3 lacks the 'f' suffix (double literal narrowed to float) — harmless but inconsistent.
  float pos_quad_brake_c[3] = { 2.0f, 0.5f, 0.3 };
  float neg_quad_brake_c[3] = { 5.84f, 0.15f, 1.7f };
  bool use_static_settling = true;
};

// Three-LSTM dynamics model (steering + mean correction + process-noise uncertainty).
// NOTE(review): this class declaration continues past the end of the visible chunk, and its
// Eigen::Ref/std::map template arguments were lost in extraction.
class RacerDubinsElevationLSTMUncertainty : public RacerDubinsElevationSuspensionImpl
{
public:
  using PARENT_CLASS = RacerDubinsElevationSuspensionImpl;
  typedef typename PARENT_CLASS::DYN_PARAMS_T DYN_PARAMS_T;
  typedef typename PARENT_CLASS::state_array state_array;
  typedef typename PARENT_CLASS::control_array control_array;
  typedef typename PARENT_CLASS::output_array output_array;
  typedef typename PARENT_CLASS::dfdx dfdx;
  typedef typename PARENT_CLASS::dfdu dfdu;
  RacerDubinsElevationLSTMUncertainty(LSTMLSTMConfig& steer_config, LSTMLSTMConfig& mean_config,
                                      LSTMLSTMConfig& unc_config, cudaStream_t stream = nullptr);
  RacerDubinsElevationLSTMUncertainty(std::string path, cudaStream_t stream = nullptr);
  std::string getDynamicsModelName() const override
  {
    // TODO check if mean model is none or not
    return "RACER Dubins LSTM Uncertainty Model";
  }
  void GPUSetup();
  void bindToStream(cudaStream_t stream);
  void freeCudaMem();
  bool updateFromBuffer(const buffer_trajectory& buffer);
  void step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const Eigen::Ref& control,
            Eigen::Ref output, const float t, const float dt);
  void initializeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref output, float t_0, float dt);
  state_array stateFromMap(const std::map& map);
  __host__ __device__ bool computeQ(const float* state, const float* control, const float* state_der, float* Q,
                                    DYN_PARAMS_T* params_p, float* theta_s);
  __device__ inline void step(float* state, float* next_state, float* state_der, float* control, float* output,
                              float* theta_s, const float t, const float dt);
  __device__ void initializeDynamics(float* state, float* control, float* output, float* theta_s, float t_0, float dt);
  //
  __device__ void updateState(float* state, float* next_state, float* state_der, const float dt);
  // void updateState(const Eigen::Ref state, Eigen::Ref next_state,
  //                  Eigen::Ref state_der, const float dt);

  // Accessors for the host-side LSTM helpers (device pointers are public below).
  std::shared_ptr> getUncertaintyHelper()
  {
    return uncertainty_helper_;
  }
  std::shared_ptr> getMeanHelper()
  {
    return mean_helper_;
  }

  // Device-side networks; set up during GPUSetup.
  LSTMHelper<>* uncertainty_d_ = nullptr;
  LSTMHelper<>* mean_d_ = nullptr;

protected:
  std::shared_ptr> uncertainty_helper_;
  std::shared_ptr> mean_helper_;
};

#if __CUDACC__
#include "racer_dubins_elevation_lstm_unc.cu"
#endif
#endif  // MPPIGENERIC_RACER_DUBINS_ELEVATION_LSTM_UNC_CUH

================================================
FILE: include/mppi/dynamics/racer_dubins/racer_dubins_elevation_suspension_lstm.cu
================================================
//
// Created by Bogdan on 02/21/2024
//
#include
#include "racer_dubins_elevation_suspension_lstm.cuh"

#define TEMPLATE_TYPE template
#define TEMPLATE_NAME RacerDubinsElevationSuspensionImpl

// Forwards LSTM configuration to the parent and allocates the surface-normals texture helper.
TEMPLATE_TYPE
TEMPLATE_NAME::RacerDubinsElevationSuspensionImpl(int init_input_dim, int init_hidden_dim,
                                                  std::vector& init_output_layers, int input_dim, int hidden_dim,
                                                  std::vector& output_layers, int init_len, cudaStream_t stream)
  : PARENT_CLASS(init_input_dim, init_hidden_dim, init_output_layers, input_dim, hidden_dim, output_layers, init_len,
                 stream)
{
  normals_tex_helper_ = new TwoDTextureHelper(1, stream);
}

// Loads networks from file via the parent and allocates the surface-normals texture helper.
TEMPLATE_TYPE
TEMPLATE_NAME::RacerDubinsElevationSuspensionImpl(std::string path, cudaStream_t stream) : PARENT_CLASS(path, stream)
{
  normals_tex_helper_ = new TwoDTextureHelper(1, stream);
}

TEMPLATE_TYPE
void TEMPLATE_NAME::bindToStream(cudaStream_t stream)
{
  normals_tex_helper_->bindToStream(stream);
  PARENT_CLASS::bindToStream(stream);
}

TEMPLATE_TYPE
void TEMPLATE_NAME::paramsToDevice()
{
  normals_tex_helper_->copyToDevice();
  PARENT_CLASS::paramsToDevice();
}

TEMPLATE_TYPE
void TEMPLATE_NAME::GPUSetup()
{
  PARENT_CLASS::GPUSetup();
  normals_tex_helper_->GPUSetup();
  // makes sure that
  // the device ptr sees the correct texture object
  HANDLE_ERROR(cudaMemcpyAsync(&(this->model_d_->normals_tex_helper_), &(normals_tex_helper_->ptr_d_),
                               sizeof(TwoDTextureHelper*), cudaMemcpyHostToDevice, this->stream_));
}

TEMPLATE_TYPE
void TEMPLATE_NAME::freeCudaMem()
{
  normals_tex_helper_->freeCudaMem();
  PARENT_CLASS::freeCudaMem();
}

// Host-side spring/damper suspension model: for each wheel, queries elevation and
// normal maps, computes the vertical spring-damper force, and accumulates the
// resulting heave/roll/pitch accelerations plus per-step max wheel forces.
TEMPLATE_TYPE
void TEMPLATE_NAME::computeSimpleSuspensionStep(Eigen::Ref state, Eigen::Ref state_der, Eigen::Ref output)
{
  DYN_PARAMS_T* params_p = &(this->params_);
  // Calculate suspension-based state derivatives
  const float& x = state(S_INDEX(POS_X));
  const float& y = state(S_INDEX(POS_Y));
  const float& roll = state(S_INDEX(ROLL));
  const float& pitch = state(S_INDEX(PITCH));
  const float& yaw = state(S_INDEX(YAW));
  float3 wheel_positions_body[W_INDEX(NUM_WHEELS)];
  float3 wheel_positions_world[W_INDEX(NUM_WHEELS)];
  float3 wheel_positions_cg[W_INDEX(NUM_WHEELS)];
  // Body-frame wheel offsets (NWU: +y left). NOTE(review): FR is -y / FL is +y, but
  // BR is +y / BL is -y — the rear labels look swapped. Harmless for the summed
  // forces since all four corners are still covered, but verify the labeling.
  wheel_positions_body[W_INDEX(FR)] = make_float3(2.981f, -0.737f, 0.f);
  wheel_positions_body[W_INDEX(FL)] = make_float3(2.981f, 0.737f, 0.0f);
  wheel_positions_body[W_INDEX(BR)] = make_float3(0.0f, 0.737f, 0.0f);
  wheel_positions_body[W_INDEX(BL)] = make_float3(0.0f, -0.737f, 0.f);
  float3 body_pose = make_float3(x, y, 0.0f);
  // rotation matrix representation
  // float3 rotation = make_float3(roll, pitch, yaw);
  Eigen::Matrix3f M;
  mppi::math::Euler2DCM_NWU(roll, pitch, yaw, M);
  float wheel_pos_z, wheel_vel_z;
  float wheel_height = 0.0f;
  float4 wheel_normal_world = make_float4(0.0f, 0.0f, 1.0f, 0.0f);
  int pi, step;
  // Kinematic derivatives; force accumulators start at zero, maxes at -FLT_MAX.
  state_der(S_INDEX(ROLL)) = state(S_INDEX(ROLL_RATE));
  state_der(S_INDEX(PITCH)) = state(S_INDEX(PITCH_RATE));
  state_der(S_INDEX(CG_POS_Z)) = state(S_INDEX(CG_VEL_I_Z));
  state_der(S_INDEX(CG_VEL_I_Z)) = 0.0f;
  state_der(S_INDEX(ROLL_RATE)) = 0.0f;
  state_der(S_INDEX(PITCH_RATE)) = 0.0f;
  output(O_INDEX(WHEEL_FORCE_UP_MAX)) = -std::numeric_limits::max();
  output(O_INDEX(WHEEL_FORCE_FWD_MAX)) = -std::numeric_limits::max();
  output(O_INDEX(WHEEL_FORCE_SIDE_MAX)) = -std::numeric_limits::max();
  mppi::p1::getParallel1DIndex(pi, step);
  float wheel_yaw, sin_wheel_yaw, cos_wheel_yaw;
  for (int i = pi; i < W_INDEX(NUM_WHEELS); i += step)
  {
    // Calculate wheel position in different frames
    wheel_positions_cg[i] = wheel_positions_body[i] - params_p->c_g;
    mppi::math::bodyOffsetToWorldPoseDCM(wheel_positions_body[i], body_pose, M, wheel_positions_world[i]);
    if (this->tex_helper_->checkTextureUse(0))
    {
      wheel_height = this->tex_helper_->queryTextureAtWorldPose(0, wheel_positions_world[i]);
      if (!isfinite(wheel_height))
      {
        // off-map query: assume terrain right under the wheel
        wheel_height = state(S_INDEX(CG_POS_Z)) - params_p->wheel_radius;
      }
    }
    if (normals_tex_helper_->checkTextureUse(0))
    {
      wheel_normal_world = normals_tex_helper_->queryTextureAtWorldPose(0, wheel_positions_world[i]);
      if (!isfinite(wheel_normal_world.x) || !isfinite(wheel_normal_world.y) || !isfinite(wheel_normal_world.z))
      {
        wheel_normal_world = make_float4(0.0, 0.0, 1.0, 0.0);
      }
    }
    if (i == W_INDEX(FR) || i == W_INDEX(FL))
    {
      // NOTE(review): S_INDEX(STEER_ANGLE) is the enum index constant, so this adds a
      // fixed offset; presumably state(S_INDEX(STEER_ANGLE)) / -9.1f (steer-ratio
      // conversion) was intended — confirm (the device version has the same pattern).
      wheel_yaw = yaw + S_INDEX(STEER_ANGLE) / -9.1f;
    }
    else
    {
      wheel_yaw = yaw;
    }
    sincosf(wheel_yaw, &sin_wheel_yaw, &cos_wheel_yaw);
    // Calculate wheel heights, velocities, and forces
    // Small-angle approximation of the wheel's vertical position/velocity from roll/pitch.
    wheel_pos_z = state(S_INDEX(CG_POS_Z)) + roll * wheel_positions_cg[i].y - pitch * wheel_positions_cg[i].x -
                  params_p->wheel_radius;
    wheel_vel_z = state(S_INDEX(CG_VEL_I_Z)) + state(S_INDEX(ROLL_RATE)) * wheel_positions_cg[i].y -
                  state(S_INDEX(PITCH_RATE)) * wheel_positions_cg[i].x;
    // h_dot = - V_I * N_I
    float h_dot = -(state[S_INDEX(VEL_X)] * cos_wheel_yaw * wheel_normal_world.x +
                    state[S_INDEX(VEL_X)] * sin_wheel_yaw * wheel_normal_world.y);
    // spring-damper force on the suspension travel
    float wheel_force = -params_p->spring_k * (wheel_pos_z - wheel_height) - params_p->drag_c * (wheel_vel_z - h_dot);
    float up_wheel_force = wheel_force;  // + params_p->mass * (9.81f / 4.0f);
    // project the normal force into forward/side directions of the wheel frame
    float fwd_wheel_force = wheel_force / wheel_normal_world.z *
                            (wheel_normal_world.x * cos_wheel_yaw + wheel_normal_world.y * sin_wheel_yaw +
                             wheel_normal_world.z * (-pitch));
    float side_wheel_force = wheel_force / wheel_normal_world.z *
                             (-wheel_normal_world.x * sin_wheel_yaw + wheel_normal_world.y * cos_wheel_yaw +
                              wheel_normal_world.z * roll);
    output(O_INDEX(WHEEL_FORCE_UP_MAX)) = fmaxf(output(O_INDEX(WHEEL_FORCE_UP_MAX)), up_wheel_force);
    output(O_INDEX(WHEEL_FORCE_FWD_MAX)) = fmaxf(output(O_INDEX(WHEEL_FORCE_FWD_MAX)), fabsf(fwd_wheel_force));
    output(O_INDEX(WHEEL_FORCE_SIDE_MAX)) = fmaxf(output(O_INDEX(WHEEL_FORCE_SIDE_MAX)), fabsf(side_wheel_force));
    // output(O_INDEX(WHEEL_FORCE_UP_FL) + i) = up_wheel_force;
    // output(O_INDEX(WHEEL_FORCE_FWD_FL) + i) = fwd_wheel_force;
    // output(O_INDEX(WHEEL_FORCE_SIDE_FL) + i) = side_wheel_force;
    // Newton/Euler: heave, roll, pitch accelerations from this wheel's force.
    state_der(S_INDEX(CG_VEL_I_Z)) += wheel_force / params_p->mass;
    state_der(S_INDEX(ROLL_RATE)) += wheel_force * wheel_positions_cg[i].y / params_p->I_xx;
    state_der(S_INDEX(PITCH_RATE)) += -wheel_force * wheel_positions_cg[i].x / params_p->I_yy;
  }
  // debug print; exact float compare is intentional as a "never updated" sentinel check
  if (output(O_INDEX(WHEEL_FORCE_UP_MAX)) == 0.0f)
  {
    std::cout << "got state: " << state.transpose() << std::endl;
  }
}

// Host-side step: parametric delay/accel + LSTM steering + suspension, then Euler
// integration and uncertainty propagation.
TEMPLATE_TYPE
void TEMPLATE_NAME::step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der,
                         const Eigen::Ref& control, Eigen::Ref output, const float t, const float dt)
{
  this->computeParametricDelayDeriv(state, control, state_der);
  this->computeParametricAccelDeriv(state, control, state_der, dt);
  this->computeLSTMSteering(state, control, state_der);
  computeSimpleSuspensionStep(state, state_der, output);
  // Integrate using Euler Integration
  updateState(state, next_state, state_der, dt);
  SharedBlock sb;
  computeUncertaintyPropagation(state.data(), control.data(), state_der.data(), next_state.data(), dt, &this->params_,
                                &sb, nullptr);
  // float roll = state(S_INDEX(ROLL));
  // float pitch = state(S_INDEX(PITCH));
  // RACER::computeStaticSettling>(
  //     this->tex_helper_, next_state(S_INDEX(YAW)), next_state(S_INDEX(POS_X)), next_state(S_INDEX(POS_Y)), roll,
  //     pitch, output.data());
  // next_state[S_INDEX(PITCH)] = pitch;
  // next_state[S_INDEX(ROLL)] = roll;
  this->setOutputs(state_der.data(), next_state.data(), output.data());
  // printf("CPU t: %3.0f, VEL_Z(t + 1): %f, VEL_Z(t): %f, VEl_Z'(t): %f\n", t, next_state(S_INDEX(CG_VEL_I_Z)),
  //        state(S_INDEX(CG_VEL_I_Z)),
  //        state_der(S_INDEX(CG_VEL_I_Z)));
}

// Device-side suspension model; mirrors the host version but parallelizes over
// wheels (one per y-thread stride) and accumulates shared derivatives with atomics.
TEMPLATE_TYPE
__device__ void TEMPLATE_NAME::computeSimpleSuspensionStep(float* state, float* state_der, float* output,
                                                           DYN_PARAMS_T* params_p, float* theta_s)
{
  // computes the velocity dot
  int pi, step;
  mppi::p1::getParallel1DIndex(pi, step);
  if (pi == 0)
  {
    // single thread zeroes the accumulators and sets the kinematic derivatives
    state_der[S_INDEX(CG_VEL_I_Z)] = 0.0f;
    state_der[S_INDEX(ROLL_RATE)] = 0.0f;
    state_der[S_INDEX(PITCH_RATE)] = 0.0f;
    state_der[S_INDEX(ROLL)] = state[S_INDEX(ROLL_RATE)];
    state_der[S_INDEX(PITCH)] = state[S_INDEX(PITCH_RATE)];
    state_der[S_INDEX(CG_POS_Z)] = state[S_INDEX(CG_VEL_I_Z)];
    output[O_INDEX(WHEEL_FORCE_UP_MAX)] = 0.0f;
    output[O_INDEX(WHEEL_FORCE_FWD_MAX)] = 0.0f;
    output[O_INDEX(WHEEL_FORCE_SIDE_MAX)] = 0.0f;
  }
  const int grd_shift = this->SHARED_MEM_REQUEST_GRD_BYTES / sizeof(float);  // grid size to shift by
  const int blk_shift =
      this->SHARED_MEM_REQUEST_BLK_BYTES * (threadIdx.x + blockDim.x * threadIdx.z) / sizeof(float);  // blk size to shift by
  // uses same memory as compute space for the uncertainty
  // Scratch arrays of 4 floats each for the per-wheel forces.
  float* wheel_force_up = theta_s + grd_shift + blk_shift;
  float* wheel_force_fwd = theta_s + grd_shift + blk_shift + 4;
  float* wheel_force_side = theta_s + grd_shift + blk_shift + 8;
  __syncthreads();
  // Calculate suspension-based state derivatives
  const float& x = state[S_INDEX(POS_X)];
  const float& y = state[S_INDEX(POS_Y)];
  const float& roll = state[S_INDEX(ROLL)];
  const float& pitch = state[S_INDEX(PITCH)];
  const float& yaw = state[S_INDEX(YAW)];
  float3 wheel_positions_body;
  float3 wheel_positions_world;
  float3 body_pose = make_float3(x, y, 0.0f);
  float3 rotation = make_float3(roll, pitch, yaw);
  // rotation matrix representation
  // TODO Check if M needs to be in shared memory
  float M[3][3];
  mppi::math::Euler2DCM_NWU(roll, pitch, yaw, M);
  // mppi::math::Euler2DCM_NWU(rotation.x, rotation.y, rotation.z, M);
  float wheel_pos_z, wheel_vel_z;
  float wheel_height = 0.0f;
  float h_dot = 0.0f;
  float4 wheel_normal_world = make_float4(0.0f, 0.0f, 1.0f, 0.0f);
  float3 wheel_positions_cg;
  float wheel_yaw, cos_wheel_yaw, sin_wheel_yaw;
  __syncthreads();
  for (int i = pi; i < W_INDEX(NUM_WHEELS); i += step)
  {
    wheel_yaw = yaw;
    // get body frame wheel positions
    // NOTE(review): same rear-wheel labeling (BR +y / BL -y) and the same
    // S_INDEX(STEER_ANGLE) / -9.1f pattern as the host version — the latter adds a
    // constant (enum index / steer ratio) rather than the steering state; confirm
    // state[S_INDEX(STEER_ANGLE)] was intended.
    switch (i)
    {
      case W_INDEX(FR):
        wheel_positions_body = make_float3(2.981f, -0.737f, 0.f);
        wheel_yaw += S_INDEX(STEER_ANGLE) / -9.1f;
        break;
      case W_INDEX(FL):
        wheel_positions_body = make_float3(2.981f, 0.737f, 0.0f);
        wheel_yaw += S_INDEX(STEER_ANGLE) / -9.1f;
        break;
      case W_INDEX(BR):
        wheel_positions_body = make_float3(0.0f, 0.737f, 0.0f);
        break;
      case W_INDEX(BL):
        wheel_positions_body = make_float3(0.0f, -0.737f, 0.f);
        break;
      default:
        break;
    }
    __sincosf(wheel_yaw, &sin_wheel_yaw, &cos_wheel_yaw);
    // Calculate wheel position in different frames
    wheel_positions_cg = wheel_positions_body - params_p->c_g;
    mppi::math::bodyOffsetToWorldPoseDCM(wheel_positions_body, body_pose, M, wheel_positions_world);
    if (this->tex_helper_->checkTextureUse(0))
    {
      wheel_height = this->tex_helper_->queryTextureAtWorldPose(0, wheel_positions_world);
      if (!isfinite(wheel_height))
      {
        // off-map query: assume terrain right under the wheel
        wheel_height = state[S_INDEX(CG_POS_Z)] - params_p->wheel_radius;
      }
    }
    if (normals_tex_helper_->checkTextureUse(0))
    {
      wheel_normal_world = normals_tex_helper_->queryTextureAtWorldPose(0, wheel_positions_world);
      if (!isfinite(wheel_normal_world.x) || !isfinite(wheel_normal_world.y) || !isfinite(wheel_normal_world.z))
      {
        wheel_normal_world = make_float4(0.0, 0.0, 1.0, 0.0);
      }
    }
    // Calculate wheel heights, velocities, and forces
    wheel_pos_z = state[S_INDEX(CG_POS_Z)] + roll * wheel_positions_cg.y - pitch * wheel_positions_cg.x -
                  params_p->wheel_radius;
    wheel_vel_z = state[S_INDEX(CG_VEL_I_Z)] + state[S_INDEX(ROLL_RATE)] * wheel_positions_cg.y -
                  state[S_INDEX(PITCH_RATE)] * wheel_positions_cg.x;
    // h_dot = - V_I * N_I
    h_dot = -(state[S_INDEX(VEL_X)] * cos_wheel_yaw * wheel_normal_world.x +
              state[S_INDEX(VEL_X)] * sin_wheel_yaw * wheel_normal_world.y);
    // spring-damper force on the suspension travel
    float wheel_force = -params_p->spring_k * (wheel_pos_z - wheel_height) - params_p->drag_c * (wheel_vel_z - h_dot);
    float up_wheel_force = wheel_force;  // + params_p->mass * (9.81f / 4.0f);
    float fwd_wheel_force = wheel_force / wheel_normal_world.z *
                            (wheel_normal_world.x * cos_wheel_yaw + wheel_normal_world.y * sin_wheel_yaw +
                             wheel_normal_world.z * (-pitch));
    float side_wheel_force = wheel_force / wheel_normal_world.z *
                             (-wheel_normal_world.x * sin_wheel_yaw + wheel_normal_world.y * cos_wheel_yaw +
                              wheel_normal_world.z * roll);
    wheel_force_up[i] = up_wheel_force;
    wheel_force_fwd[i] = fabsf(fwd_wheel_force);
    wheel_force_side[i] = fabsf(side_wheel_force);
    // output[O_INDEX(WHEEL_FORCE_UP_FL) + i] = up_wheel_force;
    // output[O_INDEX(WHEEL_FORCE_FWD_FL) + i] = fwd_wheel_force;
    // output[O_INDEX(WHEEL_FORCE_SIDE_FL) + i] = side_wheel_force;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 600
    // block-specific atomicAdd is only available on compute capabilities 6.x and greater
    atomicAdd_block(&state_der[S_INDEX(CG_VEL_I_Z)], wheel_force / params_p->mass);
    atomicAdd_block(&state_der[S_INDEX(ROLL_RATE)], wheel_force * wheel_positions_cg.y / params_p->I_xx);
    atomicAdd_block(&state_der[S_INDEX(PITCH_RATE)], -wheel_force * wheel_positions_cg.x / params_p->I_yy);
#elif defined(__CUDA_ARCH__)
    atomicAdd(&state_der[S_INDEX(CG_VEL_I_Z)], wheel_force / params_p->mass);
    atomicAdd(&state_der[S_INDEX(ROLL_RATE)], wheel_force * wheel_positions_cg.y / params_p->I_xx);
    atomicAdd(&state_der[S_INDEX(PITCH_RATE)], -wheel_force * wheel_positions_cg.x / params_p->I_yy);
#endif
  }
  __syncthreads();
  // reduce the four per-wheel scratch values into the output maxima
  output[O_INDEX(WHEEL_FORCE_UP_MAX)] =
      fmaxf(wheel_force_up[0], fmaxf(wheel_force_up[1], fmaxf(wheel_force_up[2], wheel_force_up[3])));
  output[O_INDEX(WHEEL_FORCE_FWD_MAX)] =
      fmaxf(wheel_force_fwd[0], fmaxf(wheel_force_fwd[1], fmaxf(wheel_force_fwd[2], wheel_force_fwd[3])));
  output[O_INDEX(WHEEL_FORCE_SIDE_MAX)] =
      fmaxf(wheel_force_side[0], fmaxf(wheel_force_side[1], fmaxf(wheel_force_side[2], wheel_force_side[3])));
}

// Device-side step: parametric delay/accel + LSTM steering + suspension, then Euler
// integration and uncertainty propagation using block-shared scratch memory.
TEMPLATE_TYPE
__device__ void TEMPLATE_NAME::step(float* state, float* next_state, float* state_der, float* control, float* output,
                                    float* theta_s, const float t, const float dt)
{
  DYN_PARAMS_T* params_p;
  SharedBlock* sb;
  // if (GRANDPARENT_CLASS::SHARED_MEM_REQUEST_GRD_BYTES != 0)
  // {  // Allows us to turn on or off global or shared memory version of params
  //   params_p = (DYN_PARAMS_T*)theta_s;
  // }
  // else
  // {
  //   params_p = &(this->params_);
  // }
  params_p = &(this->params_);
  const int grd_shift = this->SHARED_MEM_REQUEST_GRD_BYTES / sizeof(float);  // grid size to shift by
  const int blk_shift =
      this->SHARED_MEM_REQUEST_BLK_BYTES * (threadIdx.x + blockDim.x * threadIdx.z) / sizeof(float);  // blk size to shift by
  const int sb_shift = sizeof(SharedBlock) / sizeof(float);  // how much to shift inside a block to lstm values
  // NOTE(review): if SHARED_MEM_REQUEST_BLK_BYTES were 0, sb would stay
  // uninitialized yet is still passed to computeUncertaintyPropagation — confirm
  // the request size is always nonzero for this model.
  if (this->SHARED_MEM_REQUEST_BLK_BYTES != 0)
  {
    float* sb_mem = &theta_s[grd_shift];  // does the grid shift
    sb = (SharedBlock*)(sb_mem + blk_shift);
  }
  computeParametricDelayDeriv(state, control, state_der, params_p);
  computeParametricAccelDeriv(state, control, state_der, dt, params_p);
  computeLSTMSteering(state, control, state_der, params_p, theta_s, grd_shift, blk_shift, sb_shift);
  computeSimpleSuspensionStep(state, state_der, output, params_p, theta_s);
  updateState(state, next_state, state_der, dt);
  computeUncertaintyPropagation(state, control, state_der, next_state, dt, params_p, sb, theta_s);
  // if (pi == 0)
  // {
  //   float roll = state[S_INDEX(ROLL)];
  //   float pitch = state[S_INDEX(PITCH)];
  //   RACER::computeStaticSettling>(
  //       this->tex_helper_, next_state[S_INDEX(YAW)], next_state[S_INDEX(POS_X)], next_state[S_INDEX(POS_Y)], roll,
  //       pitch, output);
  //   next_state[S_INDEX(PITCH)] = pitch;
  //   next_state[S_INDEX(ROLL)] = roll;
  // }
  __syncthreads();
  // if (threadIdx.x == 0 && blockIdx.x == 0 && threadIdx.y
  // == 0)
  // {
  //   printf("GPU t: %3d, VEL_Z(t + 1): %f, VEL_Z(t): %f, VEl_Z'(t): %f\n", t, next_state[S_INDEX(CG_VEL_I_Z)],
  //          state[S_INDEX(CG_VEL_I_Z)],
  //          state_der[S_INDEX(CG_VEL_I_Z)]);
  // }
  this->setOutputs(state_der, next_state, output);
}

// Device-side Euler update for all states below STEER_ANGLE_RATE, with yaw wrap,
// steering clamp, and brake-state clamp. STEER_ANGLE_RATE itself is integrated by
// whichever thread handles STEER_ANGLE.
TEMPLATE_TYPE
__device__ void TEMPLATE_NAME::updateState(float* state, float* next_state, float* state_der, const float dt)
{
  int i;
  int tdy, step;
  mppi::p1::getParallel1DIndex(tdy, step);
  // Add the state derivative time dt to the current state.
  for (i = tdy; i < S_INDEX(STEER_ANGLE_RATE); i += step)
  {
    next_state[i] = state[i] + state_der[i] * dt;
    if (i == S_INDEX(YAW))
    {
      next_state[i] = angle_utils::normalizeAngle(next_state[i]);
    }
    if (i == S_INDEX(STEER_ANGLE))
    {
      next_state[i] = fmaxf(fminf(next_state[i], this->params_.max_steer_angle), -this->params_.max_steer_angle);
      next_state[S_INDEX(STEER_ANGLE_RATE)] =
          state[S_INDEX(STEER_ANGLE_RATE)] + state_der[S_INDEX(STEER_ANGLE_RATE)] * dt;
    }
    if (i == S_INDEX(BRAKE_STATE))
    {
      // NOTE(review): device clamps brake state to [0, 1] while the host version
      // below clamps to [0, -control_rngs_[THROTTLE_BRAKE].x] — confirm which
      // upper bound is intended; they only agree when that range limit is -1.
      next_state[i] = fminf(fmaxf(next_state[i], 0.0f), 1.0f);
    }
  }
}

// Host-side Euler update; mirrors the device version sequentially.
TEMPLATE_TYPE
void TEMPLATE_NAME::updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt)
{
  // Segmented it to ensure that roll and pitch don't get overwritten
  for (int i = 0; i < S_INDEX(STEER_ANGLE_RATE); i++)
  {
    next_state[i] = state[i] + state_der[i] * dt;
  }
  next_state(S_INDEX(YAW)) = angle_utils::normalizeAngle(next_state(S_INDEX(YAW)));
  next_state(S_INDEX(STEER_ANGLE)) =
      fmaxf(fminf(next_state(S_INDEX(STEER_ANGLE)), this->params_.max_steer_angle), -this->params_.max_steer_angle);
  next_state(S_INDEX(STEER_ANGLE_RATE)) =
      state(S_INDEX(STEER_ANGLE_RATE)) + state_der(S_INDEX(STEER_ANGLE_RATE)) * dt;
  next_state(S_INDEX(BRAKE_STATE)) =
      fminf(fmaxf(next_state(S_INDEX(BRAKE_STATE)), 0.0f), -this->control_rngs_[C_INDEX(THROTTLE_BRAKE)].x);
}

// Maps next_state / state_der into the output vector, one entry per thread stride.
TEMPLATE_TYPE
__host__ __device__ void TEMPLATE_NAME::setOutputs(const float* state_der, const float* next_state, float* output)
{
  // Setup output
  int step, pi;
  // NOTE(review): sibling functions use mppi::p1::getParallel1DIndex — confirm
  // "mp1" is a valid namespace alias and not a typo.
  mp1::getParallel1DIndex(pi, step);
  for (int i = pi; i < this->OUTPUT_DIM; i += step)
  {
    switch (i)
    {
      case O_INDEX(BASELINK_VEL_B_X):
        output[i] = next_state[S_INDEX(VEL_X)];
        break;
      case O_INDEX(BASELINK_VEL_B_Y):
        output[i] = 0.0f;
        break;
      // case O_INDEX(BASELINK_VEL_B_Z):
      //   output[i] = 0.0f;
      //   break;
      case O_INDEX(BASELINK_POS_I_X):
        output[i] = next_state[S_INDEX(POS_X)];
        break;
      case O_INDEX(BASELINK_POS_I_Y):
        output[i] = next_state[S_INDEX(POS_Y)];
        break;
      case O_INDEX(BASELINK_POS_I_Z):
        // shift cg height back to the baselink using the small-angle pitch lever arm
        output[i] = next_state[S_INDEX(CG_POS_Z)] - next_state[S_INDEX(PITCH)] * (-this->params_.c_g.x);
        break;
      case O_INDEX(PITCH):
        output[i] = next_state[S_INDEX(PITCH)];
        break;
      case O_INDEX(ROLL):
        output[i] = next_state[S_INDEX(ROLL)];
        break;
      case O_INDEX(YAW):
        output[i] = next_state[S_INDEX(YAW)];
        break;
      case O_INDEX(STEER_ANGLE):
        output[i] = next_state[S_INDEX(STEER_ANGLE)];
        break;
      case O_INDEX(STEER_ANGLE_RATE):
        output[i] = next_state[S_INDEX(STEER_ANGLE_RATE)];
        break;
      case O_INDEX(ACCEL_X):
        output[i] = state_der[S_INDEX(VEL_X)];
        break;
      case O_INDEX(ACCEL_Y):
        output[i] = 0.0f;
        break;
      case O_INDEX(OMEGA_Z):
        output[i] = state_der[S_INDEX(YAW)];
        break;
      case O_INDEX(UNCERTAINTY_VEL_X):
        output[i] = next_state[S_INDEX(UNCERTAINTY_VEL_X)];
        break;
      case O_INDEX(UNCERTAINTY_YAW_VEL_X):
        output[i] = next_state[S_INDEX(UNCERTAINTY_YAW_VEL_X)];
        break;
      case O_INDEX(UNCERTAINTY_POS_X_VEL_X):
        output[i] = next_state[S_INDEX(UNCERTAINTY_POS_X_VEL_X)];
        break;
      case O_INDEX(UNCERTAINTY_POS_Y_VEL_X):
        output[i] = next_state[S_INDEX(UNCERTAINTY_POS_Y_VEL_X)];
        break;
      case O_INDEX(UNCERTAINTY_YAW):
        output[i] = next_state[S_INDEX(UNCERTAINTY_YAW)];
        break;
      case O_INDEX(UNCERTAINTY_POS_X_YAW):
        output[i] = next_state[S_INDEX(UNCERTAINTY_POS_X_YAW)];
        break;
      case O_INDEX(UNCERTAINTY_POS_Y_YAW):
        output[i] = next_state[S_INDEX(UNCERTAINTY_POS_Y_YAW)];
        break;
      case O_INDEX(UNCERTAINTY_POS_X):
        output[i] = next_state[S_INDEX(UNCERTAINTY_POS_X)];
        break;
      case O_INDEX(UNCERTAINTY_POS_X_Y):
        output[i] = next_state[S_INDEX(UNCERTAINTY_POS_X_Y)];
        break;
      case O_INDEX(UNCERTAINTY_POS_Y):
        output[i] = next_state[S_INDEX(UNCERTAINTY_POS_Y)];
        break;
      case O_INDEX(TOTAL_VELOCITY):
        output[i] = fabsf(next_state[S_INDEX(VEL_X)]);
        break;
    }
  }
}

// Builds a state vector from a name->value map; returns NaNs when required keys
// are missing. Uncertainty keys are currently disabled (use_uncertainty = false).
TEMPLATE_TYPE
TEMPLATE_NAME::state_array TEMPLATE_NAME::stateFromMap(const std::map& map)
{
  std::vector needed_keys = { "VEL_X",       "VEL_Z", "POS_X", "POS_Y",       "POS_Z",
                              "OMEGA_X",     "OMEGA_Y", "ROLL", "PITCH", "YAW", "STEER_ANGLE",
                              "STEER_ANGLE_RATE", "BRAKE_STATE" };
  std::vector uncertainty_keys = { "UNCERTAINTY_POS_X",     "UNCERTAINTY_POS_Y",       "UNCERTAINTY_YAW",
                                   "UNCERTAINTY_VEL_X",     "UNCERTAINTY_POS_X_Y",     "UNCERTAINTY_POS_X_YAW",
                                   "UNCERTAINTY_POS_X_VEL_X", "UNCERTAINTY_POS_Y_YAW", "UNCERTAINTY_POS_Y_VEL_X",
                                   "UNCERTAINTY_YAW_VEL_X" };
  const bool use_uncertainty = false;
  if (use_uncertainty)
  {
    needed_keys.insert(needed_keys.end(), uncertainty_keys.begin(), uncertainty_keys.end());
  }
  bool missing_key = false;
  state_array s = state_array::Zero();
  for (const auto& key : needed_keys)
  {
    if (map.find(key) == map.end())
    {  // Print out all missing keys
      std::cout << "Could not find key " << key << " for elevation with simple suspension dynamics."
                << std::endl;
      missing_key = true;
    }
  }
  if (missing_key)
  {
    return state_array::Constant(NAN);
  }
  s(S_INDEX(POS_X)) = map.at("POS_X");
  s(S_INDEX(POS_Y)) = map.at("POS_Y");
  s(S_INDEX(CG_POS_Z)) = map.at("POS_Z");
  s(S_INDEX(VEL_X)) = map.at("VEL_X");
  // rotate baselink vertical velocity into the inertial frame, then shift to the cg
  float bl_v_I_z = map.at("VEL_Z") * cosf(map.at("PITCH")) - map.at("VEL_X") * sinf(map.at("PITCH"));
  s(S_INDEX(CG_VEL_I_Z)) = bl_v_I_z - map.at("OMEGA_Y") * this->params_.c_g.x;
  s(S_INDEX(STEER_ANGLE)) = map.at("STEER_ANGLE");
  s(S_INDEX(STEER_ANGLE_RATE)) = map.at("STEER_ANGLE_RATE");
  s(S_INDEX(ROLL)) = map.at("ROLL");
  s(S_INDEX(PITCH)) = map.at("PITCH");
  s(S_INDEX(YAW)) = map.at("YAW");
  // Set z position to cg's z position in world frame
  float3 rotation = make_float3(s(S_INDEX(ROLL)), s(S_INDEX(PITCH)), s(S_INDEX(YAW)));
  float3 world_pose = make_float3(s(S_INDEX(POS_X)), s(S_INDEX(POS_Y)), s(S_INDEX(CG_POS_Z)));
  float3 cg_in_world_frame;
  mppi::math::bodyOffsetToWorldPoseEuler(this->params_.c_g, world_pose, rotation, cg_in_world_frame);
  s(S_INDEX(CG_POS_Z)) = cg_in_world_frame.z;
  s(S_INDEX(ROLL_RATE)) = map.at("OMEGA_X");
  s(S_INDEX(PITCH_RATE)) = map.at("OMEGA_Y");
  s(S_INDEX(BRAKE_STATE)) = map.at("BRAKE_STATE");
  if (use_uncertainty)
  {
    s(S_INDEX(UNCERTAINTY_POS_X)) = map.at("UNCERTAINTY_POS_X");
    s(S_INDEX(UNCERTAINTY_POS_Y)) = map.at("UNCERTAINTY_POS_Y");
    s(S_INDEX(UNCERTAINTY_YAW)) = map.at("UNCERTAINTY_YAW");
    s(S_INDEX(UNCERTAINTY_VEL_X)) = map.at("UNCERTAINTY_VEL_X");
    s(S_INDEX(UNCERTAINTY_POS_X_Y)) = map.at("UNCERTAINTY_POS_X_Y");
    s(S_INDEX(UNCERTAINTY_POS_X_YAW)) = map.at("UNCERTAINTY_POS_X_YAW");
    s(S_INDEX(UNCERTAINTY_POS_X_VEL_X)) = map.at("UNCERTAINTY_POS_X_VEL_X");
    s(S_INDEX(UNCERTAINTY_POS_Y_YAW)) = map.at("UNCERTAINTY_POS_Y_YAW");
    s(S_INDEX(UNCERTAINTY_POS_Y_VEL_X)) = map.at("UNCERTAINTY_POS_Y_VEL_X");
    s(S_INDEX(UNCERTAINTY_YAW_VEL_X)) = map.at("UNCERTAINTY_YAW_VEL_X");
  }
  // keep diagonal uncertainty entries strictly positive
  float eps = 1e-6f;
  if (s(S_INDEX(UNCERTAINTY_POS_X)) < eps)
  {
    s(S_INDEX(UNCERTAINTY_POS_X)) = eps;
  }
  if (s(S_INDEX(UNCERTAINTY_POS_Y)) < eps)
  {
    s(S_INDEX(UNCERTAINTY_POS_Y)) = eps;
  }
  if (s(S_INDEX(UNCERTAINTY_YAW)) < eps)
  {
    s(S_INDEX(UNCERTAINTY_YAW)) = eps;
  }
  if (s(S_INDEX(UNCERTAINTY_VEL_X)) < eps)
  {
    s(S_INDEX(UNCERTAINTY_VEL_X)) = eps;
  }
  return s;
}

#undef TEMPLATE_NAME
#undef TEMPLATE_TYPE

================================================
FILE: include/mppi/dynamics/racer_dubins/racer_dubins_elevation_suspension_lstm.cuh
================================================
//
// Created by Bogdan on 02/21/2024
//
#pragma once

#include "racer_dubins_elevation_lstm_steering.cuh"

#ifndef W_INDEX
#define W_IND_CLASS(CLASS, enum_val) E_INDEX(CLASS::WheelIndex, enum_val)
#define W_INDEX(enum_val) W_IND_CLASS(PARENT_CLASS::DYN_PARAMS_T, enum_val)
#define W_INDEX_NEWLY_CREATED true
#endif

// Params for the suspension model: adds wheel indexing, suspension states, and the
// spring/damper/inertia constants used by computeSimpleSuspensionStep.
struct RacerDubinsElevationSuspensionParams : public RacerDubinsElevationParams
{
  enum class WheelIndex : int
  {
    FL = 0,
    FR,
    BL,
    BR,
    NUM_WHEELS,
  };
  enum class StateIndex : int
  {
    VEL_X = 0,
    YAW,
    POS_X,
    POS_Y,
    STEER_ANGLE,
    BRAKE_STATE,
    ROLL,
    PITCH,
    CG_POS_Z,
    CG_VEL_I_Z,
    ROLL_RATE,
    PITCH_RATE,
    STEER_ANGLE_RATE,
    UNCERTAINTY_POS_X,
    UNCERTAINTY_POS_Y,
    UNCERTAINTY_YAW,
    UNCERTAINTY_VEL_X,
    UNCERTAINTY_POS_X_Y,
    UNCERTAINTY_POS_X_YAW,
    UNCERTAINTY_POS_X_VEL_X,
    UNCERTAINTY_POS_Y_YAW,
    UNCERTAINTY_POS_Y_VEL_X,
    UNCERTAINTY_YAW_VEL_X,
    FILLER_1,
    NUM_STATES
  };
  float spring_k = 14000.0f;  // [N / m]
  float drag_c = 1000.0f;     // [N * s / m]
  float mass = 1447.0f;       // [kg]
  // rod approximations for the roll/pitch inertias
  float I_xx = 1.0f / 12 * mass * 2 * SQ(1.5f);           // [kg * m^2]
  float I_yy = 1.0f / 12 * mass * (SQ(1.5f) + SQ(3.0f));  // [kg * m^2]
  float wheel_radius = 0.32f;                             // [m]
  // Cost force threshold on the order of 3000 N
  // TODO Figure out Center of Gravity
  float3 c_g = make_float3(2.981f * 0.5f, 0.0f, 0.0f);
};

// CRTP dynamics: LSTM steering (from parent) plus a spring-damper suspension over
// elevation/normal map textures.
template class RacerDubinsElevationSuspensionImpl : public RacerDubinsElevationLSTMSteeringImpl
{
public:
  using PARENT_CLASS = RacerDubinsElevationLSTMSteeringImpl;
  using GRANDPARENT_CLASS = typename PARENT_CLASS::PARENT_CLASS;
  using DYN_PARAMS_T = typename PARENT_CLASS::DYN_PARAMS_T;
  using
  SharedBlock = typename PARENT_CLASS::SharedBlock;
  using state_array = typename PARENT_CLASS::state_array;
  using control_array = typename PARENT_CLASS::control_array;
  using output_array = typename PARENT_CLASS::output_array;
  using buffer_trajectory = typename PARENT_CLASS::buffer_trajectory;
  using PARENT_CLASS::computeParametricAccelDeriv;
  using PARENT_CLASS::computeParametricDelayDeriv;
  using PARENT_CLASS::computeUncertaintyPropagation;

  // Convenience constructor: unpacks an LSTMLSTMConfig into the dimension-based ctor.
  RacerDubinsElevationSuspensionImpl(LSTMLSTMConfig config, cudaStream_t stream)
    : RacerDubinsElevationSuspensionImpl(config.init_config.input_dim, config.init_config.hidden_dim,
                                         config.init_config.output_layers, config.pred_config.input_dim,
                                         config.pred_config.hidden_dim, config.pred_config.output_layers,
                                         config.init_len, stream)
  {
  }
  RacerDubinsElevationSuspensionImpl(int init_input_dim, int init_hidden_dim, std::vector& init_output_layers,
                                     int input_dim, int hidden_dim, std::vector& output_layers, int init_len,
                                     cudaStream_t stream);
  RacerDubinsElevationSuspensionImpl(std::string path, cudaStream_t stream = nullptr);

  std::string getDynamicsModelName() const override
  {
    return "RACER Dubins LSTM Steering and Suspension Model";
  }

  void bindToStream(cudaStream_t stream);
  void paramsToDevice();
  void GPUSetup();
  void freeCudaMem();

  void computeSimpleSuspensionStep(Eigen::Ref state, Eigen::Ref state_der, Eigen::Ref output);
  void step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const Eigen::Ref& control,
            Eigen::Ref output, const float t, const float dt);

  __device__ void computeSimpleSuspensionStep(float* state, float* state_der, float* output, DYN_PARAMS_T* params_p,
                                              float* theta_s);
  __device__ inline void step(float* state, float* next_state, float* state_der, float* control, float* output,
                              float* theta_s, const float t, const float dt);
  __device__ void updateState(float* state, float* next_state, float* state_der, const float dt);
  void updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt);

  __host__ __device__ void setOutputs(const float* state_der, const float* next_state, float* output);

  TwoDTextureHelper* getTextureHelperNormals()
  {
    return normals_tex_helper_;
  }

  // Applies the same map rotation to both the elevation and normals textures.
  void updateRotation(std::array& rotation)
  {
    this->tex_helper_->updateRotation(0, rotation);
    this->normals_tex_helper_->updateRotation(0, rotation);
  }

  state_array stateFromMap(const std::map& map);

protected:
  TwoDTextureHelper* normals_tex_helper_ = nullptr;

  RacerDubinsElevationSuspensionImpl(cudaStream_t stream = nullptr) : PARENT_CLASS(stream)
  {
    normals_tex_helper_ = new TwoDTextureHelper(1, stream);
  }
};

class RacerDubinsElevationSuspension : public RacerDubinsElevationSuspensionImpl
{
public:
  using PARENT_CLASS = RacerDubinsElevationSuspensionImpl;
  RacerDubinsElevationSuspension(int init_input_dim, int init_hidden_dim, std::vector& init_output_layers,
                                 int input_dim, int hidden_dim, std::vector& output_layers, int init_len,
                                 cudaStream_t stream = nullptr)
    : PARENT_CLASS(init_input_dim, init_hidden_dim, init_output_layers, input_dim, hidden_dim, output_layers,
                   init_len, stream)
  {
  }
  RacerDubinsElevationSuspension(std::string path, cudaStream_t stream = 0) : PARENT_CLASS(path, stream)
  {
  }

protected:
  // protected constructor so you cannot init without a network setup properly
  // NOTE(review): `PARENT_CLASS(stream = 0)` assigns 0 to the parameter before
  // forwarding it — presumably `PARENT_CLASS(stream)` was intended; confirm.
  RacerDubinsElevationSuspension(cudaStream_t stream = 0) : PARENT_CLASS(stream = 0)
  {
  }
};

#ifdef __CUDACC__
#include "racer_dubins_elevation_suspension_lstm.cu"
#endif

#if W_INDEX_NEWLY_CREATED
#undef W_IND_CLASS
// NOTE(review): W_IND is never defined above (only W_IND_CLASS / W_INDEX) — this
// #undef looks vestigial.
#undef W_IND
#undef W_INDEX
#endif

================================================
FILE: include/mppi/dynamics/racer_suspension/racer_suspension.cu
================================================
#include
#include
#include

void RacerSuspension::GPUSetup()
{
  // CRTP: dispatch to the most-derived GPUSetup after the texture helper is ready
  auto* derived = static_cast(this);
  tex_helper_->GPUSetup();
  derived->GPUSetup();
}

void RacerSuspension::freeCudaMem()
{
  tex_helper_->freeCudaMem();
}

void RacerSuspension::paramsToDevice()
{
  if (this->GPUMemStatus_)
  {
    // does all the internal
texture updates tex_helper_->copyToDevice(); // makes sure that the device ptr sees the correct texture object HANDLE_ERROR(cudaMemcpyAsync(&(this->model_d_->tex_helper_), &(tex_helper_->ptr_d_), sizeof(TwoDTextureHelper*), cudaMemcpyHostToDevice, this->stream_)); } PARENT_CLASS::paramsToDevice(); } // combined computeDynamics & updateState void RacerSuspension::step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const Eigen::Ref control, Eigen::Ref output, const float t, const float dt) { Eigen::Matrix3f omegaJac; computeStateDeriv(state, control, state_der, output, &omegaJac); // approximate implicit euler for angular rate states Eigen::Vector3f deltaOmega = (Eigen::Matrix3f::Identity() - dt * omegaJac).inverse() * dt * state_der.segment<3>(S_INDEX(OMEGA_B_X)); state_array delta_x = state_der * dt; delta_x.segment<3>(S_INDEX(OMEGA_B_X)) = deltaOmega; next_state = state + delta_x; float q_norm = next_state.segment<4>(S_INDEX(ATTITUDE_QW)).norm(); next_state.segment<4>(S_INDEX(ATTITUDE_QW)) /= q_norm; } void RacerSuspension::updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt) { next_state = state + state_der * dt; float q_norm = next_state.segment<4>(S_INDEX(ATTITUDE_QW)).norm(); next_state.segment<4>(S_INDEX(ATTITUDE_QW)) /= q_norm; } __device__ void RacerSuspension::updateState(float* state, float* next_state, float* state_der, const float dt) { unsigned int i; unsigned int tdy = threadIdx.y; // Add the state derivative time dt to the current state. 
// printf("updateState thread %d, %d = %f, %f\n", threadIdx.x, threadIdx.y, state[0], state_der[0]); for (i = tdy; i < PARENT_CLASS::STATE_DIM; i += blockDim.y) { next_state[i] = state[i] + state_der[i] * dt; } __syncthreads(); if (tdy == 0) { float norm = sqrtf(powf(next_state[S_INDEX(ATTITUDE_QW)], 2) + powf(next_state[S_INDEX(ATTITUDE_QX)], 2) + powf(next_state[S_INDEX(ATTITUDE_QY)], 2) + powf(next_state[S_INDEX(ATTITUDE_QZ)], 2)); next_state[S_INDEX(ATTITUDE_QW)] /= norm; next_state[S_INDEX(ATTITUDE_QX)] /= norm; next_state[S_INDEX(ATTITUDE_QY)] /= norm; next_state[S_INDEX(ATTITUDE_QZ)] /= norm; } } __device__ __host__ static float stribeck_friction(float v, float mu_s, float v_slip, float& partial_mu_partial_v) { float mu = v / v_slip * mu_s; partial_mu_partial_v = 0; if (mu > mu_s) { return mu_s; } if (mu < -mu_s) { return -mu_s; } partial_mu_partial_v = mu_s / v_slip; return mu; } __device__ __host__ void RacerSuspension::computeStateDeriv(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der, Eigen::Ref output, Eigen::Matrix3f* omegaJacobian) { Eigen::Vector3f p_I = state.segment<3>(S_INDEX(P_I_X)); Eigen::Vector3f v_I = state.segment<3>(S_INDEX(V_I_X)); Eigen::Vector3f omega = state.segment<3>(S_INDEX(OMEGA_B_X)); Eigen::Quaternionf q(state[S_INDEX(ATTITUDE_QW)], state[S_INDEX(ATTITUDE_QX)], state[S_INDEX(ATTITUDE_QY)], state[S_INDEX(ATTITUDE_QZ)]); Eigen::Matrix3f R = q.toRotationMatrix(); float tan_delta = tan(state[S_INDEX(STEER_ANGLE)]); Eigen::Matrix3f Rdot = R * mppi::math::skewSymmetricMatrix(omega); // linear engine model float vel_x = (R.transpose() * v_I)[0]; float throttle = max(0.0, control[C_INDEX(THROTTLE_BRAKE)]); float brake = max(0.0, -control[C_INDEX(THROTTLE_BRAKE)]); float acc = params_.c_t * throttle - copysign(params_.c_b * brake, vel_x) - params_.c_v * vel_x + params_.c_0; float propulsion_force = params_.mass * acc; Eigen::Vector3f f_B = Eigen::Vector3f::Zero(); Eigen::Vector3f tau_B = Eigen::Vector3f::Zero(); 
Eigen::Matrix3f tau_B_jac = Eigen::Matrix3f::Zero(); Eigen::Vector3f f_r_B[4]; // for each wheel for (int i = 0; i < 4; i++) { // compute suspension from elevation map Eigen::Vector3f p_w_nom_B_i = cudaToEigen(params_.wheel_pos_wrt_base_link[i]) - cudaToEigen(params_.cg_pos_wrt_base_link); Eigen::Vector3f p_w_nom_I_i = p_I + (R * p_w_nom_B_i).eval(); float h_i = 0; Eigen::Vector3f n_I_i(0, 0, 1); // TODO: Enable elevation map querying // if (tex_helper_->checkTextureUse(TEXTURE_ELEVATION_MAP)) // { // float4 wheel_elev = tex_helper_->queryTextureAtWorldPose(TEXTURE_ELEVATION_MAP, EigenToCuda(p_w_nom_I_i)); // h_i = wheel_elev.w; // // std::cout << "h_i " << h_i << std::endl; // n_I_i = Eigen::Vector3f(wheel_elev.x, wheel_elev.y, wheel_elev.z); // } Eigen::Vector3f p_c_I_i(p_w_nom_I_i[0], p_w_nom_I_i[1], h_i); float l_i = p_w_nom_I_i[2] - p_c_I_i[2]; Eigen::Vector3f p_dot_w_nom_I_i = v_I + (Rdot * p_w_nom_B_i).eval(); Eigen::Matrix3f p_dot_w_nom_I_i_Jac = R * Eigen::Matrix3f::Identity().colwise().cross(p_w_nom_B_i); float h_dot_i = -n_I_i[0] / n_I_i[2] * p_dot_w_nom_I_i[0] - n_I_i[1] / n_I_i[2] * p_dot_w_nom_I_i[1]; Eigen::RowVector3f h_dot_i_Jac = (-n_I_i[0] / n_I_i[2] * p_dot_w_nom_I_i_Jac.row(0)).eval() - (n_I_i[1] / n_I_i[2] * p_dot_w_nom_I_i_Jac.row(1)).eval(); float l_dot_i = p_dot_w_nom_I_i[2] - h_dot_i; Eigen::RowVector3f l_dot_i_Jac = p_dot_w_nom_I_i_Jac.row(2) - h_dot_i_Jac; float f_k_i = -params_.k_s[i] * (l_i - params_.l_0[i]) - params_.c_s[i] * l_dot_i; Eigen::RowVector3f f_k_i_Jac = -params_.c_s[i] * l_dot_i_Jac; if (f_k_i < 0) { f_k_i = 0; f_k_i_Jac = Eigen::RowVector3f::Zero(); } // contact frame Eigen::Vector3f p_dot_c_I_i(p_dot_w_nom_I_i[0], p_dot_w_nom_I_i[1], h_dot_i); Eigen::Matrix3f p_dot_c_I_i_Jac; p_dot_c_I_i_Jac.row(0) = p_dot_w_nom_I_i_Jac.row(0); p_dot_c_I_i_Jac.row(1) = p_dot_w_nom_I_i_Jac.row(1); p_dot_c_I_i_Jac.row(2) = h_dot_i_Jac; float delta_i; if (i == RacerSuspensionParams::WHEEL_FRONT_LEFT) { delta_i = atan(params_.wheel_base * 
tan_delta / (params_.wheel_base - tan_delta * params_.width / 2)); } else if (i == RacerSuspensionParams::WHEEL_FRONT_RIGHT) { delta_i = atan(params_.wheel_base * tan_delta / (params_.wheel_base + tan_delta * params_.width / 2)); } else { // rear wheels delta_i = 0; } Eigen::Vector3f n_B_i = R.transpose() * n_I_i; Eigen::Vector3f wheel_dir_B_i(cos(delta_i), sin(delta_i), 0); Eigen::Vector3f s_B_i = n_B_i.cross(wheel_dir_B_i).normalized(); Eigen::Vector3f t_B_i = s_B_i.cross(n_B_i); Eigen::Matrix3f R_C_i_to_B; R_C_i_to_B.col(0) = t_B_i; R_C_i_to_B.col(1) = s_B_i; R_C_i_to_B.col(2) = n_B_i; // contact velocity Eigen::Vector3f p_dot_c_B_i = R.transpose() * p_dot_c_I_i; Eigen::Matrix3f p_dot_c_B_i_Jac = R.transpose() * p_dot_c_I_i_Jac; float v_w_t_i = t_B_i.dot(p_dot_c_B_i); float v_w_s_i = s_B_i.dot(p_dot_c_B_i); Eigen::RowVector3f v_w_s_i_Jac = s_B_i.transpose() * p_dot_c_B_i_Jac; // compute contact forces float f_n_i = f_k_i; float partial_mu_s_partial_v; float mu_s = stribeck_friction(v_w_s_i, params_.mu, params_.v_slip, partial_mu_s_partial_v); float f_s_i = -mu_s * f_n_i; float f_t_i = max(-params_.mu * f_n_i, min(propulsion_force / 4, params_.mu * f_n_i)); Eigen::RowVector3f f_n_i_Jac = f_k_i_Jac; Eigen::RowVector3f f_t_i_Jac = Eigen::RowVector3f::Zero(); if (propulsion_force / 4 > params_.mu * f_n_i) { f_t_i_Jac = params_.mu * f_n_i_Jac; } else if (propulsion_force / 4 < -params_.mu * f_n_i) { f_t_i_Jac = -params_.mu * f_n_i_Jac; } Eigen::RowVector3f f_s_i_Jac = (-f_n_i * partial_mu_s_partial_v * v_w_s_i_Jac).eval() - (mu_s * f_n_i_Jac).eval(); // contact force & location Eigen::Vector3f f_r_C_i(f_t_i, f_s_i, f_n_i); Eigen::Matrix3f f_r_C_i_Jac; f_r_C_i_Jac.row(0) = f_t_i_Jac; f_r_C_i_Jac.row(1) = f_s_i_Jac; f_r_C_i_Jac.row(2) = f_n_i_Jac; f_r_B[i] = R_C_i_to_B * f_r_C_i; Eigen::Matrix3f f_r_B_i_Jac = R_C_i_to_B = f_r_C_i_Jac; Eigen::Vector3f p_c_B_i = R.transpose() * (p_c_I_i - p_I).eval(); // accumulate forces & moments f_B += f_r_B[i]; tau_B += 
(p_c_B_i.cross(f_r_B[i])).eval(); tau_B_jac += -f_r_B_i_Jac.colwise().cross(p_c_B_i); if (output.data() != nullptr) { output[O_INDEX(WHEEL_POS_I_FL_X) + i * 2] = p_w_nom_I_i[0]; output[O_INDEX(WHEEL_POS_I_FL_Y) + i * 2] = p_w_nom_I_i[1]; const float force_magn = sqrtf(powf(f_r_B[i][0], 2.0f) + powf(f_r_B[i][1], 2.0f) + powf(f_r_B[i][2], 2.0f)); output[O_INDEX(WHEEL_FORCE_B_FL) + i] = force_magn; } } Eigen::Vector3f g(0, 0, params_.gravity); // TODO gravity is negative to match dubins model state_der.segment<3>(S_INDEX(P_I_X)) = v_I; state_der.segment<3>(S_INDEX(V_I_X)) = (1 / params_.mass * R * f_B).eval() + g; Eigen::Quaternionf qdot; qdot.coeffs() = 0.5 * (q * Eigen::Quaternionf(0, omega[0], omega[1], omega[2])).coeffs(); state_der[S_INDEX(ATTITUDE_QW)] = qdot.w(); state_der[S_INDEX(ATTITUDE_QX)] = qdot.x(); state_der[S_INDEX(ATTITUDE_QY)] = qdot.y(); state_der[S_INDEX(ATTITUDE_QZ)] = qdot.z(); Eigen::Vector3f J_diag(params_.Jxx, params_.Jyy, params_.Jzz); Eigen::Vector3f J_inv_diag(1.0 / params_.Jxx, 1.0 / params_.Jyy, 1.0 / params_.Jzz); state_der.segment<3>(S_INDEX(OMEGA_B_X)) = J_inv_diag.cwiseProduct(J_diag.cwiseProduct(omega).cross(omega) + tau_B); if (omegaJacobian) { Eigen::Matrix3f J = J_diag.asDiagonal(); Eigen::Matrix3f Jwxw_jac = J.colwise().cross(omega) - Eigen::Matrix3f::Identity().colwise().cross(J_diag.cwiseProduct(omega)); *omegaJacobian = J_inv_diag.asDiagonal() * (Jwxw_jac + tau_B_jac).eval(); } // Steering actuator model float steer = control[C_INDEX(STEER_CMD)] / params_.steer_command_angle_scale; state_der[S_INDEX(STEER_ANGLE)] = params_.steering_constant * (steer - state[S_INDEX(STEER_ANGLE)]); if (output.data() != nullptr) { Eigen::Vector3f COM_v_B = R.transpose() * v_I; Eigen::Vector3f p_base_link_in_B = -cudaToEigen(params_.cg_pos_wrt_base_link); Eigen::Vector3f base_link_v_B = COM_v_B + omega.cross(p_base_link_in_B); output[O_INDEX(BASELINK_VEL_B_X)] = base_link_v_B[0]; output[O_INDEX(BASELINK_VEL_B_X)] = base_link_v_B[1]; 
output[O_INDEX(BASELINK_VEL_B_X)] = base_link_v_B[2]; Eigen::Vector3f base_link_p_I = p_I + (R * p_base_link_in_B).eval(); output[O_INDEX(BASELINK_POS_I_X)] = base_link_p_I[0]; output[O_INDEX(BASELINK_POS_I_Y)] = base_link_p_I[1]; output[O_INDEX(BASELINK_POS_I_Z)] = base_link_p_I[2]; float roll, pitch, yaw; mppi::math::Quat2EulerNWU(q, roll, pitch, yaw); output[O_INDEX(YAW)] = yaw; output[O_INDEX(PITCH)] = pitch; output[O_INDEX(ROLL)] = roll; output[O_INDEX(STEER_ANGLE)] = state[S_INDEX(STEER_ANGLE)]; output[O_INDEX(STEER_ANGLE_RATE)] = state_der[S_INDEX(STEER_ANGLE)]; // output[O_INDEX(CENTER_POS_I_X)] = output[O_INDEX(BASELINK_POS_I_X)]; // TODO // output[O_INDEX(CENTER_POS_I_Y)] = output[O_INDEX(BASELINK_POS_I_Y)]; // output[O_INDEX(CENTER_POS_I_Z)] = output[O_INDEX(BASELINK_POS_I_Z)]; output[O_INDEX(ACCEL_X)] = 0; // TODO: fill in with proper accel_x // #ifdef __CUDA_ARCH__ // if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 && blockIdx.x == 0 && blockIdx.y == 0 && // blockIdx.z == 0) // { // printf("GPU: "); // #else // printf("CPU: "); // #endif // printf("Output: "); // for (int j = 0; j < DYN_T::OUTPUT_DIM; j++) // { // printf("%6.2f, ", output[j]); // } // printf("\n"); // #ifdef __CUDA_ARCH__ // } // #endif } } __device__ void RacerSuspension::step(float* state, float* next_state, float* state_der, float* control, float* output, float* theta_s, const float t, const float dt) { computeStateDeriv(state, control, state_der, theta_s, output); __syncthreads(); updateState(state, next_state, state_der, dt); } __device__ void RacerSuspension::computeStateDeriv(float* state, float* control, float* state_der, float* theta_s, float* output) { Eigen::Map state_v(state); Eigen::Map control_v(control); Eigen::Map state_der_v(state_der); if (output) { Eigen::Map output_v(output); // Eigen::Ref output_r(output_v); computeStateDeriv(state_v, control_v, state_der_v, output_v); } else { Eigen::Map output_v(nullptr); computeStateDeriv(state_v, control_v, 
state_der_v, output_v); } // for (int i = 0; i < PARENT_CLASS::STATE_DIM; i++) // { // state_der[i] = 0; // } // state_der[S_INDEX(V_I_X)] = control[1]; // if (output) // { // for (int i = 0; i < PARENT_CLASS::OUTPUT_DIM; i++) // { // output[i] = 0; // } // } } Eigen::Quaternionf RacerSuspension::attitudeFromState(const Eigen::Ref& state) { return Eigen::Quaternionf(state[S_INDEX(ATTITUDE_QW)], state[S_INDEX(ATTITUDE_QX)], state[S_INDEX(ATTITUDE_QY)], state[S_INDEX(ATTITUDE_QZ)]); } Eigen::Vector3f RacerSuspension::positionFromState(const Eigen::Ref& state) { Eigen::Vector3f p_COM = state.segment<3>(S_INDEX(P_I_X)); Eigen::Quaternionf q = attitudeFromState(state); return p_COM - q * cudaToEigen(params_.cg_pos_wrt_base_link); } Eigen::Vector3f RacerSuspension::velocityFromState(const Eigen::Ref& state) { Eigen::Vector3f COM_v_I = state.segment<3>(S_INDEX(V_I_X)); Eigen::Quaternionf q_B_to_I = attitudeFromState(state); Eigen::Vector3f COM_v_B = q_B_to_I.conjugate() * COM_v_I; Eigen::Vector3f omega = state.segment<3>(S_INDEX(OMEGA_B_X)); Eigen::Vector3f p_base_link_in_B = -cudaToEigen(params_.cg_pos_wrt_base_link); Eigen::Vector3f base_link_v_B = COM_v_B + omega.cross(p_base_link_in_B); return base_link_v_B; } Eigen::Vector3f RacerSuspension::angularRateFromState(const Eigen::Ref& state) { return state.segment<3>(S_INDEX(OMEGA_B_X)); } RacerSuspension::state_array RacerSuspension::stateFromOdometry(const Eigen::Quaternionf& q_B_to_I, const Eigen::Vector3f& pos_base_link_I, const Eigen::Vector3f& vel_base_link_B, const Eigen::Vector3f& omega_B) { state_array s; s.setZero(); s[S_INDEX(ATTITUDE_QW)] = q_B_to_I.w(); s[S_INDEX(ATTITUDE_QX)] = q_B_to_I.x(); s[S_INDEX(ATTITUDE_QY)] = q_B_to_I.y(); s[S_INDEX(ATTITUDE_QZ)] = q_B_to_I.z(); s.segment<3>(S_INDEX(OMEGA_B_X)) = omega_B; Eigen::Vector3f p_COM_wrt_base_link = cudaToEigen(params_.cg_pos_wrt_base_link); Eigen::Vector3f p_I = pos_base_link_I + q_B_to_I * p_COM_wrt_base_link; s.segment<3>(S_INDEX(P_I_X)) = p_I; 
Eigen::Vector3f COM_v_B = vel_base_link_B + omega_B.cross(p_COM_wrt_base_link); Eigen::Vector3f COM_v_I = q_B_to_I * COM_v_B; s.segment<3>(S_INDEX(V_I_X)) = COM_v_I; return s; } void RacerSuspension::enforceLeash(const Eigen::Ref& state_true, const Eigen::Ref& state_nominal, const Eigen::Ref& leash_values, Eigen::Ref state_output) { // update state_output for leash, need to handle x and y positions specially, convert to body frame and leash in body // frame. transform x and y into body frame float dx = state_nominal[S_INDEX(P_I_X)] - state_true[S_INDEX(P_I_X)]; float dy = state_nominal[S_INDEX(P_I_Y)] - state_true[S_INDEX(P_I_Y)]; float roll, pitch, yaw; Eigen::Quaternionf q(state_true[S_INDEX(ATTITUDE_QW)], state_true[S_INDEX(ATTITUDE_QX)], state_true[S_INDEX(ATTITUDE_QY)], state_true[S_INDEX(ATTITUDE_QZ)]); mppi::math::Quat2EulerNWU(q, roll, pitch, yaw); float dx_body = dx * cos(yaw) + dy * sin(yaw); float dy_body = -dx * sin(yaw) + dy * cos(yaw); // determine leash in body frame float x_leash = leash_values[S_INDEX(P_I_X)]; float y_leash = leash_values[S_INDEX(P_I_Y)]; dx_body = fminf(fmaxf(dx_body, -x_leash), x_leash); dy_body = fminf(fmaxf(dy_body, -y_leash), y_leash); // transform back to map frame dx = dx_body * cos(yaw) + -dy_body * sin(yaw); dy = dx_body * sin(yaw) + dy_body * cos(yaw); // apply leash state_output[S_INDEX(P_I_X)] += dx; state_output[S_INDEX(P_I_Y)] += dy; // TODO: Figure out leash for quaternion? 
// handle leash for rest of states float diff; for (int i = 0; i < STATE_DIM; i++) { // use body x and y for leash if (i == S_INDEX(P_I_X) || i == S_INDEX(P_I_Y) || i == S_INDEX(ATTITUDE_QW) || i == S_INDEX(ATTITUDE_QX) || i == S_INDEX(ATTITUDE_QY) || i == S_INDEX(ATTITUDE_QZ)) { // handle outside for loop continue; } else { diff = fabsf(state_nominal[i] - state_true[i]); } if (leash_values[i] < diff) { float leash_dir = fminf(fmaxf(state_nominal[i] - state_true[i], -leash_values[i]), leash_values[i]); state_output[i] = state_true[i] + leash_dir; } else { state_output[i] = state_nominal[i]; } } } RacerSuspension::state_array RacerSuspension::stateFromMap(const std::map& map) { state_array s; // TODO return s; } ================================================ FILE: include/mppi/dynamics/racer_suspension/racer_suspension.cuh ================================================ #ifndef MPPIGENERIC_RACER_SUSPENSION_CUH #define MPPIGENERIC_RACER_SUSPENSION_CUH #include #include #include struct RacerSuspensionParams : public DynamicsParams { enum class StateIndex : int { P_I_X = 0, P_I_Y, P_I_Z, ATTITUDE_QW, ATTITUDE_QX, ATTITUDE_QY, ATTITUDE_QZ, V_I_X, V_I_Y, V_I_Z, OMEGA_B_X, OMEGA_B_Y, OMEGA_B_Z, STEER_ANGLE, NUM_STATES }; enum class ControlIndex : int { THROTTLE_BRAKE = 0, STEER_CMD, NUM_CONTROLS }; enum class OutputIndex : int { BASELINK_VEL_B_X = 0, BASELINK_VEL_B_Y, BASELINK_VEL_B_Z, BASELINK_POS_I_X, BASELINK_POS_I_Y, BASELINK_POS_I_Z, YAW, ROLL, PITCH, STEER_ANGLE, STEER_ANGLE_RATE, WHEEL_POS_I_FL_X, WHEEL_POS_I_FL_Y, WHEEL_POS_I_FR_X, WHEEL_POS_I_FR_Y, WHEEL_POS_I_RL_X, WHEEL_POS_I_RL_Y, WHEEL_POS_I_RR_X, WHEEL_POS_I_RR_Y, WHEEL_FORCE_B_FL, WHEEL_FORCE_B_FR, WHEEL_FORCE_B_RL, WHEEL_FORCE_B_RR, ACCEL_X, ACCEL_Y, OMEGA_Z, NUM_OUTPUTS }; // suspension model params float wheel_radius = 0.32; float mass = 1447; float wheel_base = 2.981; float width = 1.5; float height = 1.5; float gravity = -9.81; float k_s[4] = { 14000, 14000, 14000, 14000 }; float c_s[4] = { 2000, 
2000, 2000, 2000 }; float l_0[4] = { wheel_radius + mass / 4 * (-gravity) / k_s[0], wheel_radius + mass / 4 * (-gravity) / k_s[1], wheel_radius + mass / 4 * (-gravity) / k_s[2], wheel_radius + mass / 4 * (-gravity) / k_s[3], }; float3 cg_pos_wrt_base_link = make_float3(wheel_base / 2, 0, 0.2); // float3 wheel_pos_front_left = make_float3(wheel_base, width/2, 0); // float3 wheel_pos_front_right = make_float3(wheel_base, -width/2, 0); // float3 wheel_pos_rear_left = make_float3(0, width/2, 0); // float3 wheel_pos_rear_right = make_float3(0, -width/2, 0); float3 wheel_pos_wrt_base_link[4] = { make_float3(wheel_base, width / 2, 0), make_float3(wheel_base, -width / 2, 0), make_float3(0, width / 2, 0), make_float3(0, -width / 2, 0) }; float Jxx = 1.0 / 12 * mass * (height * height + width * width); float Jyy = 1.0 / 12 * mass * (height * height + wheel_base * wheel_base); float Jzz = 1.0 / 12 * mass * (wheel_base * wheel_base + width * width); float mu = 0.65; float v_slip = 0.1; static const int WHEEL_FRONT_LEFT = 0; static const int WHEEL_FRONT_RIGHT = 1; static const int WHEEL_REAR_LEFT = 2; static const int WHEEL_REAR_RIGHT = 3; // throttle model params float c_t = 3.0; float c_b = 10.0; float c_v = 0.2; float c_0 = 0; // steering model params float steering_constant = .6; float steer_command_angle_scale = -2.45; int gear_sign = 1; RacerSuspensionParams() { recalcParams(); } void __host__ recalcParams() { cg_pos_wrt_base_link = make_float3(wheel_base / 2, 0, 0.2); l_0[0] = wheel_radius + mass / 4 * (-gravity) / k_s[0]; l_0[1] = wheel_radius + mass / 4 * (-gravity) / k_s[1]; l_0[2] = wheel_radius + mass / 4 * (-gravity) / k_s[2]; l_0[3] = wheel_radius + mass / 4 * (-gravity) / k_s[3]; wheel_pos_wrt_base_link[0] = make_float3(wheel_base, width / 2, 0); wheel_pos_wrt_base_link[1] = make_float3(wheel_base, -width / 2, 0); wheel_pos_wrt_base_link[2] = make_float3(0, width / 2, 0); wheel_pos_wrt_base_link[3] = make_float3(0, -width / 2, 0); Jxx = 1.0 / 12 * mass * (height 
* height + width * width); Jyy = 1.0 / 12 * mass * (height * height + wheel_base * wheel_base); Jzz = 1.0 / 12 * mass * (wheel_base * wheel_base + width * width); } }; using namespace MPPI_internal; /** * state: x, y, z, quat [w, x, y, z], vx_I, vy_I, vz_I, omega_x, omega_y, omega_z, shaft angle, shaft angle velocity * control: throttle/brake, steering angle command * sensors: texture 0 is elevation map (normal x, normal y, normal z, height) */ class RacerSuspension : public Dynamics { public: typedef Dynamics PARENT_CLASS; typedef typename PARENT_CLASS::state_array state_array; typedef typename PARENT_CLASS::control_array control_array; typedef typename PARENT_CLASS::output_array output_array; typedef typename PARENT_CLASS::dfdx dfdx; typedef typename PARENT_CLASS::dfdu dfdu; using PARENT_CLASS::updateState; // needed as overloading updateState here hides all parent versions of updateState // number of floats for computing the state derivative BLOCK_DIM_X * BLOCK_DIM_Z times static const int SHARED_MEM_REQUEST_BLK_BYTES = 0; static const int TEXTURE_ELEVATION_MAP = 0; RacerSuspension(cudaStream_t stream = nullptr) : PARENT_CLASS(stream) { tex_helper_ = new TwoDTextureHelper(1, stream); } RacerSuspension(RacerSuspensionParams& params, cudaStream_t stream) : PARENT_CLASS(params, stream) { tex_helper_ = new TwoDTextureHelper(1, stream); } std::string getDynamicsModelName() const override { return "RACER Suspension Model"; } ~RacerSuspension() { delete tex_helper_; } void GPUSetup(); void freeCudaMem(); void paramsToDevice(); void updateState(const Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const float dt); void step(Eigen::Ref state, Eigen::Ref next_state, Eigen::Ref state_der, const Eigen::Ref control, Eigen::Ref output, const float t, const float dt); __device__ __host__ void computeStateDeriv(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der, Eigen::Ref output, Eigen::Matrix3f* omegaJacobian = nullptr); __device__ void 
updateState(float* state, float* next_state, float* state_der, const float dt); __device__ void computeStateDeriv(float* state, float* control, float* state_der, float* theta_s, float* output = nullptr); void enforceLeash(const Eigen::Ref& state_true, const Eigen::Ref& state_nominal, const Eigen::Ref& leash_values, Eigen::Ref state_output); __device__ void step(float* state, float* next_state, float* state_der, float* control, float* output, float* theta_s, const float t, const float dt); TwoDTextureHelper* getTextureHelper() { return tex_helper_; } Eigen::Quaternionf attitudeFromState(const Eigen::Ref& state); Eigen::Vector3f positionFromState(const Eigen::Ref& state); Eigen::Vector3f velocityFromState(const Eigen::Ref& state); Eigen::Vector3f angularRateFromState(const Eigen::Ref& state); state_array stateFromOdometry(const Eigen::Quaternionf& q_B_to_I, const Eigen::Vector3f& pos_base_link_I, const Eigen::Vector3f& vel_base_link_B, const Eigen::Vector3f& omega_B); state_array stateFromMap(const std::map& map) override; protected: TwoDTextureHelper* tex_helper_ = nullptr; }; #if __CUDACC__ #include "racer_suspension.cu" #endif #endif // MPPIGENERIC_RACER_SUSPENSION_CUH ================================================ FILE: include/mppi/feedback_controllers/CCM/ccm.h ================================================ /* * Created on Tue Jun 30 2020 by Bogdan */ #ifndef FEEDBACK_CONTROLLERS_CCM_CCM_H_ #define FEEDBACK_CONTROLLERS_CCM_CCM_H_ #include #include #include #include namespace ccm { // Convienient Types template using Vectorf = Eigen::Matrix; // ================ Chebyshev Methods ================= // Can also be written as pts, and weights being passed in. template std::tuple, Vectorf> chebyshevPts() { Vectorf pts, weights; int K = N - 1; for (int i = K; i >= 0; i--) { pts[i] = (cosf(M_PI * i / K) + 1) / 2.0; } // Weights calculations. 
Weights look to be symmetric int Kh = K * K - (K % 2 == 0) * 1; weights[0] = 0.5 / Kh; weights[K] = 0.5 / Kh; int K_iteration = K / 2; for (int k = 1; k <= K_iteration; k++) { float w_k = 0; for (int j = 0; j <= K_iteration; j++) { int beta = 2; if (j == 0 || j == K_iteration) { beta = 1; } w_k += beta * cosf((M_2_PI * j * k) / K) / (K * (1 - 4 * j * j)); } weights[k] = w_k; weights[K - k] = w_k; } return std::make_tuple(pts, weights); } // Create a polynomial matrix given a set of points and the max degree of the polynomials template Eigen::Matrix chebyshevPolynomial(const Eigen::Ref>& pts) { Eigen::Matrix T = Eigen::Matrix::Zero(); T.row(0) = Vectorf::Ones(); T.row(1) = pts; for (int i = 2; i < D; i++) { for (int j = 0; j < N; j++) { T(i, j) = 2 * pts(j) * T(i - 1, j) - T(i - 2, j); } } return T; } // Create the derivative of the polynomial matrix provided template Eigen::Matrix chebyshevPolynomialDerivative(const Eigen::Ref>& T, const Eigen::Ref>& pts) { Eigen::Matrix dT = Eigen::Matrix::Zero(); dT.row(1) = Vectorf::Ones(); for (int i = 2; i < D; i++) { for (int j = 0; j < N; j++) { dT(i, j) = 2 * T(i - 1, j) + 2 * pts(j) * dT(i - 1, j) - dT(i - 2, j); } } return dT; } /** * Empty GPU Class for now **/ template class LinearCCMGPU : public GPUFeedbackController, DYN_T, GPUState> { public: // using DYN_T = typename GPUFeedbackController, DYN_T, GPUState>::DYN_T; LinearCCMGPU(cudaStream_t stream = 0) : GPUFeedbackController, DYN_T, GPUState>(stream){}; }; struct CCMParams { int a = 0; }; template class LinearCCM : public FeedbackController, CCMParams, NUM_TIMESTEPS> { public: EIGEN_MAKE_ALIGNED_OPERATOR_NEW // Typedefs using state_array = typename DYN_T::state_array; using control_array = typename DYN_T::control_array; using B_matrix = typename DYN_T::dfdu; using RiemannianMetric = typename DYN_T::dfdx; typedef FeedbackController, CCMParams, NUM_TIMESTEPS> PARENT_CLASS; using TEMPLATED_FEEDBACK_STATE = typename PARENT_CLASS::TEMPLATED_FEEDBACK_STATE; typedef 
Eigen::Matrix state_trajectory; typedef Eigen::Matrix control_trajectory; LinearCCM() = default; ~LinearCCM() = default; LinearCCM(DYN_T* dyn) { model_ = dyn; M_ = RiemannianMetric::Identity(); std::cout << "\033[1;33mCALLED CONSTRUCTOR\033[0m" << std::endl; } /** * Methods that have to be overwritten for FeedbackController **/ void initTrackingController(){}; void computeFeedback(const Eigen::Ref& init_state, const Eigen::Ref& goal_traj, const Eigen::Ref& control_traj) { // setNominalControlTrajectory(control_traj); // setNominalStateTrajectory(goal_traj); } control_array k_(const Eigen::Ref& x_act, const Eigen::Ref& x_goal, int t, TEMPLATED_FEEDBACK_STATE& fb_state) { control_array u_nom_t = u_nominal_traj_.col(t); return u_feedback(x_act, x_goal, u_nom_t); } // Generic Method for calculating the Metric based on state x RiemannianMetric M(const Eigen::Ref& x) { RiemannianMetric W; float x0_2 = 0; // powf(x[0], 2); float x1_2 = 0; // powf(x[1], 2); /** lambda = 0.8 **/ // float w_1_term = 0.0004616 * x0_2 + 0.0004616 * x1_2 + 2.0459139; // float w_2_term = -0.0007369 * x0_2 - 0.007369 * x1_2 - 1.9129315; // float w_3_term = -0.0007369 * x0_2 - 0.007369 * x1_2 - 1.9129315; // float w_4_term = 0.0026403 * x0_2 + 0.0026403 * x1_2 + 5.2164191; /** lmabda = 1.5 **/ // float w_1_term = 0.006889 * x0_2 + 0.006889 * x1_2 + 3.6390396; // float w_2_term = -0.034485 * x0_2 - 0.034485 * x1_2 - 7.1123521; // float w_3_term = -0.034485 * x0_2 - 0.034485 * x1_2 - 7.1123521; // float w_4_term = 0.2511511 * x0_2 + 0.2511511 * x1_2 + 24.9433551; /** lambda = 3.5 **/ float w_1_term = 0.0005948 * x0_2 + 0.0005948 * x1_2 + 2.2416827; float w_2_term = -0.0044842 * x0_2 - 0.0044842 * x1_2 - 8.2434395; float w_3_term = -0.0044842 * x0_2 - 0.0044842 * x1_2 - 8.2434395; float w_4_term = 0.0521421 * x0_2 + 0.0521421 * x1_2 + 59.1072868; W << w_1_term, 0, w_2_term, 0, 0, w_1_term, 0, w_2_term, w_3_term, 0, w_4_term, 0, 0, w_3_term, 0, w_4_term; return W.inverse(); // return M_; } float 
Energy(const Eigen::Ref& delta_x, const Eigen::Ref& x) { return delta_x.transpose() * M(x) * delta_x; } // TODO Replace with some call to Dynamics B_matrix B(const Eigen::Ref& x) { return model_->B(x); } state_array f(const Eigen::Ref& x, const Eigen::Ref& u = control_array::Zero()) { state_array x_der; model_->computeStateDeriv(x, u, x_der); return x_der; } control_array u_feedback(const Eigen::Ref& x_act, int t) { state_array x_nom_t = x_nominal_traj_.col(t); control_array u_nom_t = u_nominal_traj_.col(t); state_array delta_x = x_act - x_nom_t; return u_feedback(x_act, x_nom_t, u_nom_t); } control_array u_feedback(const Eigen::Ref& x_act, const Eigen::Ref& x_nom, const Eigen::Ref& u_nom, bool debug = false) { state_array delta_x = x_act - x_nom; float E = Energy(delta_x, x_act); control_array lhs = 2 * B(x_act).transpose() * M(x_act) * delta_x; float normalize_lhs = lhs.norm() * lhs.norm(); float rhs = -2 * lambda_ * E - 2 * delta_x.transpose() * M(x_act) * (f(x_act) - f(x_nom) + (B(x_act) - B(x_nom)) * u_nom); if (debug) { std::cout << "Delta x: " << delta_x.transpose() << std::endl; std::cout << "E: " << E << ", RHS: " << rhs << ", LHS Norm: " << normalize_lhs << std::endl; } if (rhs > 0 || normalize_lhs == 0) { return control_array::Zero(); } else { return rhs / normalize_lhs * lhs; } } void setNominalControlTrajectory(const Eigen::Ref&& u_traj) { u_nominal_traj_ = u_traj; } void setNominalStateTrajectory(const Eigen::Ref& x_traj) { x_nominal_traj_ = x_traj; } void setM(const Eigen::Ref& M_new) { M_ = M_new; } protected: state_trajectory x_nominal_traj_ = state_trajectory::Zero(); control_trajectory u_nominal_traj_ = control_trajectory::Zero(); float lambda_ = 1.0; RiemannianMetric M_ = RiemannianMetric::Identity(); DYN_T* model_; B_matrix B_ = B_matrix::Zero(); }; } // namespace ccm #endif // FEEDBACK_CONTROLLERS_CCM_CCM_H_ ================================================ FILE: include/mppi/feedback_controllers/DDP/ddp.cu 
================================================ #include template DeviceDDPImpl::DeviceDDPImpl(int num_timesteps, cudaStream_t stream) : num_timesteps_(num_timesteps) , GPUFeedbackController>(stream) { } template __device__ void DeviceDDPImpl::k(const float* __restrict__ x_act, const float* __restrict__ x_goal, const int t, float* __restrict__ theta, float* __restrict__ control_output) { float* fb_gain_t = &(this->state_.fb_gain_traj_[DYN_T::STATE_DIM * DYN_T::CONTROL_DIM * t]); float e = 0; for (int i = 0; i < DYN_T::STATE_DIM; i++) { e = x_act[i] - x_goal[i]; if (DYN_T::CONTROL_DIM % 4 == 0) { // load 4 floats in at a time to save on global memory reads float4* fb_gain_t4 = reinterpret_cast(&fb_gain_t[i * DYN_T::CONTROL_DIM]); for (int j = 0; j < DYN_T::CONTROL_DIM / 4; j++) { reinterpret_cast(control_output)[j] = fb_gain_t4[j] * e; } } else if (DYN_T::CONTROL_DIM % 2 == 0) { // load 2 floats in at a time to save on global memory reads float2* fb_gain_t2 = reinterpret_cast(&fb_gain_t[i * DYN_T::CONTROL_DIM]); for (int j = 0; j < DYN_T::CONTROL_DIM / 2; j++) { reinterpret_cast(control_output)[j] = fb_gain_t2[j] * e; } } else { for (int j = 0; j < DYN_T::CONTROL_DIM; j++) { control_output[j] += fb_gain_t[i * DYN_T::CONTROL_DIM + j] * e; } } } } /** * CPU Class for DDP Methods */ template DDPFeedback::DDPFeedback(DYN_T* model, float dt, int num_timesteps, cudaStream_t stream) { model_ = model; this->dt_ = dt; this->num_timesteps_ = std::max(num_timesteps, NUM_TIMESTEPS); this->gpu_controller_->freeCudaMem(); // Remove allocated CUDA mem from default constructor this->gpu_controller_ = std::make_shared>(this->num_timesteps_, stream); } template void DDPFeedback::initTrackingController() { util::DefaultLogger logger; bool verbose = false; ddp_model_ = std::make_shared>(model_); ddp_solver_ = std::make_shared>>(this->dt_, this->num_timesteps_, this->params_.num_iterations, &logger, verbose); result_ = OptimizerResult>(); result_.feedback_gain = 
feedback_gain_trajectory(this->num_timesteps_); for (int i = 0; i < this->num_timesteps_; i++) { result_.feedback_gain[i] = DYN_T::feedback_matrix::Zero(); } run_cost_ = std::make_shared>>(this->params_.Q, this->params_.R, this->num_timesteps_); terminal_cost_ = std::make_shared>>(this->params_.Q_f); } template void DDPFeedback::setParams(const DDPParams& params) { this->params_ = params; run_cost_ = std::make_shared>>(this->params_.Q, this->params_.R, this->num_timesteps_); terminal_cost_ = std::make_shared>>(this->params_.Q_f); } template void DDPFeedback::computeFeedback(const Eigen::Ref& init_state, const Eigen::Ref& goal_traj, const Eigen::Ref& control_traj) { run_cost_->setTargets(goal_traj.data(), control_traj.data(), this->num_timesteps_); terminal_cost_->xf = run_cost_->traj_target_x_.col(this->num_timesteps_ - 1); // update control ranges for (int i = 0; i < DYN_T::CONTROL_DIM; i++) { control_min_(i) = model_->control_rngs_[i].x; control_max_(i) = model_->control_rngs_[i].y; } result_ = ddp_solver_->run(init_state, control_traj, *ddp_model_, *run_cost_, *terminal_cost_, control_min_, control_max_); // Copy Feedback Gains into Feedback State for (size_t i = 0; i < result_.feedback_gain.size(); i++) { int i_index = i * DYN_T::STATE_DIM * DYN_T::CONTROL_DIM; for (size_t j = 0; j < DYN_T::CONTROL_DIM * DYN_T::STATE_DIM; j++) { this->getFeedbackStatePointer()->fb_gain_traj_[i_index + j] = result_.feedback_gain[i].data()[j]; } } // Actually put new feedback gain trajectory onto the GPU // this->gpu_controller_->copyToDevice(); } ================================================ FILE: include/mppi/feedback_controllers/DDP/ddp.cuh ================================================ /* * Created on Sun Sep 28 2020 by Bogdan */ #ifndef FEEDBACK_CONTROLLERS_DDP_CUH_ #define FEEDBACK_CONTROLLERS_DDP_CUH_ #include #include #include #include #include #include template struct DDPParams { using StateCostWeight = typename TrackingCostDDP>::StateCostWeight; using Hessian = 
typename TrackingTerminalCost>::Hessian;
  using ControlCostWeight = typename TrackingCostDDP>::ControlCostWeight;
  // Quadratic tracking-cost weights; default to identity so the controller is
  // usable before setParams() is called.
  StateCostWeight Q = StateCostWeight::Identity();
  Hessian Q_f = Hessian::Identity();
  ControlCostWeight R = ControlCostWeight::Identity();
  int num_iterations = 1;  // DDP solver iterations per solve
};

// POD container shipped between CPU and GPU: the full trajectory of feedback
// gain matrices, flattened into a single float array.
template struct DDPFeedbackState : GPUState
{
  static const int FEEDBACK_SIZE = DYN_T::CONTROL_DIM * DYN_T::STATE_DIM * N_TIMESTEPS;
  static const int NUM_TIMESTEPS = N_TIMESTEPS;
  /**
   * Variables
   **/
  float fb_gain_traj_[FEEDBACK_SIZE] MPPI_ALIGN(16) = { 0.0 };  // ensure it is aligned to 16 bytes

  /**
   * Methods
   **/
  // Element-wise (exact float) comparison of the stored gain trajectories.
  bool isEqual(const DDPFeedbackState& other) const
  {
    for (int i = 0; i < FEEDBACK_SIZE; i++)
    {
      if (this->fb_gain_traj_[i] != other.fb_gain_traj_[i])
      {
        return false;
      }
    }
    return true;
  }
};

/**
 * Needed for Test in base_plant_tester.cu
 **/
template bool operator==(const DDPFeedbackState& lhs, const DDPFeedbackState& rhs)
{
  return lhs.isEqual(rhs);
};

/**
 * DDP GPU Controller class starting point. This class is where the actual
 * methods for DDP on the GPU are implemented but it is not used directly since
 * setting up the GPU_FB_T value would be painful
 */
template class DeviceDDPImpl : public GPUFeedbackController>
{
public:
  using PARAMS_T = DDPFeedbackState;
  // static const int SHARED_MEM_REQUEST_BLK_BYTES = DYN_T::CONTROL_DIM * DYN_T::STATE_DIM;
  DeviceDDPImpl(int num_timesteps, cudaStream_t stream = 0);
  DeviceDDPImpl(cudaStream_t stream = 0) : GPUFeedbackController>(stream){};

  // No extra device memory beyond the feedback state is required.
  void allocateCUDAMemory(){};
  void deallocateCUDAMemory(){};

  // Device-side feedback law: control_output = K(t) * (x_act - x_goal).
  __device__ void k(const float* __restrict__ x_act, const float* __restrict__ x_goal, const int t,
                    float* __restrict__ theta, float* __restrict__ control_output);

protected:
  // Needed for allocating memory for feedback gains
  int num_timesteps_ = 1;
};

/**
 * Alias class for DDP GPU Controller.
This sets up the class derivation correctly and is * used inside of the CPU version of DDP */ template class DeviceDDP : public DeviceDDPImpl, DYN_T, NUM_TIMESTEPS> { public: DeviceDDP(int num_timesteps, cudaStream_t stream = 0) : DeviceDDPImpl, DYN_T, NUM_TIMESTEPS>(num_timesteps, stream){}; DeviceDDP(cudaStream_t stream = 0) : DeviceDDPImpl, DYN_T, NUM_TIMESTEPS>(stream){}; }; /** * CPU Class for DDP. This is what the user should interact with */ template class DDPFeedback : public FeedbackController, DDPParams, NUM_TIMESTEPS> { public: /** * Aliases **/ typedef util::EigenAlignedVector feedback_gain_trajectory; typedef FeedbackController, DDPParams, NUM_TIMESTEPS> PARENT_CLASS; using control_array = typename PARENT_CLASS::control_array; using state_array = typename PARENT_CLASS::state_array; using state_trajectory = typename PARENT_CLASS::state_trajectory; using control_trajectory = typename PARENT_CLASS::control_trajectory; using INTERNAL_STATE_T = typename PARENT_CLASS::TEMPLATED_FEEDBACK_STATE; using feedback_gain_matrix = typename DYN_T::feedback_matrix; using square_state_matrix = typename DDPParams::StateCostWeight; using square_control_matrix = typename DDPParams::ControlCostWeight; /** * Variables **/ std::shared_ptr> ddp_model_; std::shared_ptr>> run_cost_; std::shared_ptr>> terminal_cost_; std::shared_ptr>> ddp_solver_; OptimizerResult> result_; control_array control_min_; control_array control_max_; DYN_T* model_; DDPFeedback(DYN_T* model, float dt, int num_timesteps = NUM_TIMESTEPS, cudaStream_t stream = 0); void setParams(const DDPParams& params) override; void initTrackingController(); control_array k_(const Eigen::Ref& x_act, const Eigen::Ref& x_goal, int t, INTERNAL_STATE_T& fb_state) { int index = DYN_T::STATE_DIM * DYN_T::CONTROL_DIM * t; Eigen::Map fb_gain(&(fb_state.fb_gain_traj_[index])); control_array u_output = fb_gain * (x_act - x_goal); return u_output; } feedback_gain_trajectory getFeedbackGainsEigen() { return result_.feedback_gain; } 
void computeFeedback(const Eigen::Ref& init_state, const Eigen::Ref& goal_traj, const Eigen::Ref& control_traj); }; #ifdef __CUDACC__ #include "ddp.cu" #endif #endif // FEEDBACK_CONTROLLERS_DDP_CUH_ ================================================ FILE: include/mppi/feedback_controllers/feedback.cu ================================================ #include // ===================== GPUFeedbackController ======================== template void GPUFeedbackController::copyToDevice(bool synchronize) { if (GPUMemStatus_) { HANDLE_ERROR( cudaMemcpyAsync(&feedback_d_->state_, &state_, sizeof(FEEDBACK_STATE_T), cudaMemcpyHostToDevice, stream_)); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(stream_)); } } } template void GPUFeedbackController::freeCudaMem() { if (GPUMemStatus_) { CLASS_T* derived = static_cast(this); derived->deallocateCUDAMemory(); cudaFree(feedback_d_); GPUMemStatus_ = false; feedback_d_ = nullptr; } } template void GPUFeedbackController::GPUSetup() { CLASS_T* derived = static_cast(this); if (!GPUMemStatus_) { feedback_d_ = Managed::GPUSetup(derived); derived->allocateCUDAMemory(); } else { this->logger_->debug("Feedback Controller GPU Memory already set\n"); } derived->copyToDevice(); } // ===================== FeedbackController ======================== ================================================ FILE: include/mppi/feedback_controllers/feedback.cuh ================================================ /* * Created on Sun Sep 6 2020 by Bogdan */ #ifndef FEEDBACK_BASE_CUH_ #define FEEDBACK_BASE_CUH_ #include #include #include #include struct GPUState { }; /** * This is the begining of the mess. So we have a GPU Feedback class that houses the methods and data * that are used on the GPU. The first template argument is for that cool thing that Jason knows the name of. * The second template argument is the dynamics class so that we can automatically pull out the state and control * dims needed. The final template argument is the state class. 
This state class is what will be passed back and forth * from the CPU to the GPU and vice versa (if necessary). For example, this would be your k_p, k_d, and k_i terms for a * PID, or the trajectory of feedback gains for DDP. * * Things a new controller will need: * - a k method. This is how to use the feedback controller on the GPU * - a GPU_STATE_T class. This should contain the relevant data to transfer from CPU to GPU and vice versa * Optional Things to implement: * - copyFromDevice(), copyToDevice(), paramsToDevice(): if your data structure FEEDBACK_STATE_T is complex, you will * need to implement these yourself * - allocateCUDAMemory(), deallocateCUDAMemory(): If your feedback class needs some CUDA memory beyond the * FEEDBACK_STATE_T, you will need to create and clear it here. */ template class GPUFeedbackController : public Managed { public: /** * Type Aliasing */ using DYN_T = TEMPLATED_DYNAMICS; using FEEDBACK_STATE_T = GPU_STATE_T; GPU_FB_T* feedback_d_ = nullptr; /** * Constructors */ GPUFeedbackController() = default; GPUFeedbackController(cudaStream_t stream = 0) : Managed(stream) { this->SHARED_MEM_REQUEST_GRD_BYTES = 0; this->SHARED_MEM_REQUEST_BLK_BYTES = 0; } /** * =================== METHODS THAT SHOULD NOT BE OVERWRITTEN ================ */ virtual ~GPUFeedbackController() { freeCudaMem(); }; // Overwrite of Managed->GPUSetup to call allocateCUDAMemory as well void GPUSetup(); void freeCudaMem(); void setFeedbackState(const FEEDBACK_STATE_T& state) { state_ = state; if (GPUMemStatus_) { GPU_FB_T& derived = static_cast(*this); derived.copyToDevice(); } } __host__ __device__ FEEDBACK_STATE_T getFeedbackState() { return state_; } __host__ __device__ FEEDBACK_STATE_T* getFeedbackStatePointer() { return &state_; } /** * ==================== NECESSARY METHODS TO OVERWRITE ===================== */ __device__ void k(const float* __restrict__ x_act, const float* __restrict__ x_goal, const int t, float* __restrict__ theta, float* __restrict__ 
control_output) { } /** * ===================== OPTIONAL METHODS TO OVERWRITE ====================== */ /** * Only needed to allocate/deallocate additional CUDA memory when appropriate, * GPU pointer is already handled. */ void allocateCUDAMemory() { } void deallocateCUDAMemory() { } __device__ void initializeFeedback(const float* __restrict__ x, const float* __restrict__ u, float* __restrict__ theta, const float t, const float dt) {} // Abstract method to copy information to GPU // void copyToDevice() {} // Copies the params to the device at the moment void copyToDevice(bool synchronize = true); // Method to return potential diagnostic information from GPU void copyFromDevice(bool synchronize = true) { } protected: FEEDBACK_STATE_T state_; }; /** * Steps to making a new one * Create the GPUFeedback class as an impl class like costs but is still templated on DYN * The actual GPUFeedback_act class will then be templated on DYN and inherit from the GPUFeedbackImpl * Write the feedback controller to use the GPUFeedback_act as thee GPU_FEEDBACK_T template option * It will then automatically create the right pointer */ template class FeedbackController { public: // Type Defintions and aliases typedef typename GPU_FB_T::DYN_T DYN_T; // typedef FEEDBACK_STATE_T TEMPLATED_FEEDBACK_STATE; typedef PARAMS_T TEMPLATED_PARAMS; typedef GPU_FB_T TEMPLATED_GPU_FEEDBACK; typedef typename GPU_FB_T::FEEDBACK_STATE_T TEMPLATED_FEEDBACK_STATE; static const int FB_TIMESTEPS = NUM_TIMESTEPS; using state_array = typename DYN_T::state_array; using control_array = typename DYN_T::control_array; typedef Eigen::Matrix control_trajectory; // A control trajectory typedef Eigen::Matrix state_trajectory; // A state trajectory // Constructors and Generators FeedbackController(float dt = 0.01, int num_timesteps = NUM_TIMESTEPS, cudaStream_t stream = 0) : dt_(dt), num_timesteps_(num_timesteps) { gpu_controller_ = std::make_shared(stream); auto logger = std::make_shared(); setLogger(logger); } 
virtual ~FeedbackController() { freeCudaMem(); }; virtual __host__ void GPUSetup() { gpu_controller_->GPUSetup(); } virtual __host__ void freeCudaMem() { gpu_controller_->freeCudaMem(); } virtual __host__ void initTrackingController() = 0; virtual __host__ void setParams(const PARAMS_T& params) { params_ = params; } PARAMS_T getParams() { return params_; } // CPU Methods /** * Compute feedback control method that should not be overwritten. * Input: * - x_act: the state where the system is * - x_goal: the state we want to be at * - index: the number of timesteps from the initial time we are */ virtual __host__ control_array k(const Eigen::Ref& x_act, const Eigen::Ref& x_goal, int t) { TEMPLATED_FEEDBACK_STATE* gpu_feedback_state = getFeedbackStatePointer(); return k_(x_act, x_goal, t, *gpu_feedback_state); } /** * Feeback Control Method to overwrite. */ virtual __host__ control_array k_(const Eigen::Ref& x_act, const Eigen::Ref& x_goal, int t, TEMPLATED_FEEDBACK_STATE& fb_state) = 0; // might not be a needed method virtual __host__ void computeFeedback(const Eigen::Ref& init_state, const Eigen::Ref& goal_traj, const Eigen::Ref& control_traj) = 0; // TODO Construct a default version of this method that uses the state_ variable automatically virtual __host__ control_array interpolateFeedback_(const Eigen::Ref& state, const Eigen::Ref& goal_state, double rel_time, TEMPLATED_FEEDBACK_STATE& fb_state) { int lower_idx = (int)(rel_time / dt_); int upper_idx = lower_idx + 1; double alpha = (rel_time - lower_idx * dt_) / dt_; control_array u_fb = (1 - alpha) * k_(state, goal_state, lower_idx, fb_state) + alpha * k_(state, goal_state, upper_idx, fb_state); return u_fb; } virtual __host__ control_array interpolateFeedback(const Eigen::Ref& state, const Eigen::Ref& goal_state, double rel_time) { TEMPLATED_FEEDBACK_STATE* fb_state = getFeedbackStatePointer(); return interpolateFeedback_(state, goal_state, rel_time, *fb_state); } GPU_FB_T* getDevicePointer() { return 
gpu_controller_->feedback_d_; } std::shared_ptr getHostPointer() { return gpu_controller_; } void bindToStream(cudaStream_t stream) { gpu_controller_->bindToStream(stream); } /** * Calls GPU version */ void copyToDevice(bool synchronize = true) { this->gpu_controller_->copyToDevice(synchronize); } TEMPLATED_FEEDBACK_STATE getFeedbackState() { return this->gpu_controller_->getFeedbackState(); } TEMPLATED_FEEDBACK_STATE* getFeedbackStatePointer() { return this->gpu_controller_->getFeedbackStatePointer(); } void setFeedbackState(const TEMPLATED_FEEDBACK_STATE& gpu_fb_state) { this->gpu_controller_->setFeedbackState(gpu_fb_state); } float getDt() { return dt_; } void setDt(float dt) { dt_ = dt; } __host__ void setLogger(const mppi::util::MPPILoggerPtr& logger) { logger_ = logger; gpu_controller_->setLogger(logger); } __host__ void setLogLevel(const mppi::util::LOG_LEVEL& level) { logger_->setLogLevel(level); gpu_controller_->setLogLevel(level); } __host__ mppi::util::MPPILoggerPtr getLogger() { return logger_; } __host__ mppi::util::MPPILoggerPtr getLogger() const { return logger_; } protected: std::shared_ptr gpu_controller_; float dt_; int num_timesteps_; PARAMS_T params_; mppi::util::MPPILoggerPtr logger_ = nullptr; }; #ifdef __CUDACC__ #include "feedback.cu" #endif #endif // FEEDBACK_BASE_CUH_ ================================================ FILE: include/mppi/instantiations/autorally_mppi/autorally_mppi.cuh ================================================ #ifndef MPPIGENERIC_CONTROLLERERS_AUTORALLY_MPPI_CUH #define MPPIGENERIC_CONTROLLERERS_AUTORALLY_MPPI_CUH #include #include #include #include // #ifdef USE_NEURAL_NETWORK_MODEL__ /*Use neural network dynamics model*/ const int MPPI_NUM_ROLLOUTS__ = 1920; const int BLOCKSIZE_X = 8; const int BLOCKSIZE_Y = 16; const int NUM_TIMESTEPS = 150; typedef NeuralNetModel<7, 2, 3> DynamicsModel; typedef ARStandardCost CostFunctionClass; typedef DDPFeedback FEEDBACK_T; typedef 
mppi::sampling_distributions::GaussianDistribution Sampler; // #elif USE_BASIS_FUNC_MODEL__ /*Use the basis function model* */ // const int MPPI_NUM_ROLLOUTS__ = 2560; // const int BLOCKSIZE_X = 16; // const int BLOCKSIZE_Y = 4; // typedef GeneralizedLinear DynamicsModel; // #endif #endif // MPPIGENERIC_CONTROLLERERS_AUTORALLY_MPPI_CUH ================================================ FILE: include/mppi/instantiations/cartpole_mppi/cartpole_mppi.cuh ================================================ #ifndef MPPIGENERIC_CONTROLLERERS_CARTPOLE_MPPI_CUH #define MPPIGENERIC_CONTROLLERERS_CARTPOLE_MPPI_CUH #include // #include #include #include #include #endif // MPPIGENERIC_CONTROLLERERS_CARTPOLE_MPPI_CUH ================================================ FILE: include/mppi/instantiations/double_integrator_mppi/double_integrator_mppi.cuh ================================================ #ifndef MPPIGENERIC_CONTROLLERERS_DOUBLE_INTEGRATOR_CUH #define MPPIGENERIC_CONTROLLERERS_DOUBLE_INTEGRATOR_CUH #include #include #include #include #include #include #include #endif // MPPIGENERIC_CONTROLLERERS_DOUBLE_INTEGRATOR_CUH ================================================ FILE: include/mppi/instantiations/quadrotor_mppi/quadrotor_mppi.cuh ================================================ #ifndef MPPI_GENERIC_CONTROLLERS_QUADROTOR_MPPI_CUH_ #define MPPI_GENERIC_CONTROLLERS_QUADROTOR_MPPI_CUH_ #include // #include // #include #include #include #include #include #endif // MPPI_GENERIC_CONTROLLERS_QUADROTOR_MPPI_CUH_ ================================================ FILE: include/mppi/sampling_distributions/colored_noise/colored_noise.cu ================================================ /** * Created by Bogdan Vlahov on 3/25/2023 **/ #define COLORED_TEMPLATE template class PARAMS_TEMPLATE, class DYN_PARAMS_T> #define COLORED_NOISE ColoredNoiseDistributionImpl #include #include #include __global__ void configureFrequencyNoise(cufftComplex* noise, float* variance, int num_samples, int 
control_dim, int num_freq)
{
  // One thread per (sample, frequency, control) triple.
  int sample_index = blockDim.x * blockIdx.x + threadIdx.x;
  int freq_index = blockDim.y * blockIdx.y + threadIdx.y;
  int control_index = blockDim.z * blockIdx.z + threadIdx.z;
  if (sample_index < num_samples && freq_index < num_freq && control_index < control_dim)
  {
    int noise_index = (sample_index * control_dim + control_index) * num_freq + freq_index;
    int variance_index = control_index * num_freq + freq_index;
    // Scale the white-noise spectrum by the per-frequency coefficient.
    noise[noise_index].x *= variance[variance_index];
    if (freq_index == 0)
    {
      // DC component of a real signal must be purely real.
      noise[noise_index].y = 0;
    }
    else if (num_freq % 2 == 1 && freq_index == num_freq - 1)
    {
      // NOTE(review): intended to zero the imaginary part of the Nyquist bin, which
      // must be real whenever the time-domain length is even. Both call sites use
      // 2 * num_timesteps samples (always even), so gating on num_freq parity looks
      // wrong for odd num_timesteps — confirm against the colorednoise reference.
      noise[noise_index].y = 0;
    }
    else
    {
      noise[noise_index].y *= variance[variance_index];
    }
  }
}

// Reorders inverse-FFT output from [trajectory][control][time] layout into the
// [trajectory][time][control] layout of the control samples, normalizes it, and
// subtracts a decayed copy of the value at offset_t.
__global__ void rearrangeNoise(float* input, float* output, float* variance, int num_trajectories, int num_timesteps,
                               int control_dim, int offset_t, float decay_rate)
{
  const int sample_index = blockIdx.x * blockDim.x + threadIdx.x;
  const int time_index = blockIdx.y * blockDim.y + threadIdx.y;
  const int control_index = blockIdx.z * blockDim.z + threadIdx.z;
  // decay_rate == 0 disables the offset subtraction entirely.
  const float decayed_offset = decay_rate == 0 ?
0 : powf(decay_rate, time_index);
  if (sample_index < num_trajectories && time_index < (num_timesteps) && control_index < control_dim)
  {
    // cuFFT does not normalize inverse transforms so a division by the num_timesteps is required
    output[(sample_index * num_timesteps + time_index) * control_dim + control_index] =
        (input[(sample_index * control_dim + control_index) * 2 * num_timesteps + time_index] -
         input[(sample_index * control_dim + control_index) * 2 * num_timesteps + offset_t] * decayed_offset) /
        (variance[control_index] * 2 * num_timesteps);
    // printf("ROLLOUT %d CONTROL %d TIME %d: in %f out: %f\n", sample_index, control_index, time_index,
    //        input[(sample_index * control_dim + control_index) * num_timesteps + time_index],
    //        output[(sample_index * num_timesteps + time_index) * control_dim + control_index]);
  }
}

// Generates power-law ("colored") noise on the GPU, following the approach of
// felixpatzelt/colorednoise: sample white noise in the frequency domain, weight
// each frequency by f^(-exponent/2), then inverse-FFT back to the time domain.
void powerlaw_psd_gaussian(std::vector& exponents, int num_timesteps, int num_trajectories, float* control_noise_d,
                           int offset_t, curandGenerator_t& gen, float offset_decay_rate, cudaStream_t stream,
                           float fmin)
{
  const int BLOCKSIZE_X = 32;
  const int BLOCKSIZE_Y = 32;
  const int BLOCKSIZE_Z = 1;
  int control_dim = exponents.size();
  std::vector sample_freq;
  // Sample over twice the horizon; only the first num_timesteps are kept downstream.
  const int sample_num_timesteps = num_timesteps * 2;
  fftfreq(sample_num_timesteps, sample_freq);
  // Frequencies below the cutoff are flattened to the first frequency above it.
  float cutoff_freq = fmaxf(fmin, 1.0f / sample_num_timesteps);
  int freq_size = sample_freq.size();
  int smaller_index = 0;
  Eigen::MatrixXf sample_freqs(freq_size, control_dim);

  // Adjust the weighting of each frequency by the exponents
  for (int i = 0; i < freq_size; i++)
  {
    if (sample_freq[i] < cutoff_freq)
    {
      smaller_index++;
    }
    else if (smaller_index < freq_size)
    {
      // NOTE(review): this back-fill of the below-cutoff entries re-runs for every
      // i at or above the first above-cutoff index; it is idempotent but redundant.
      for (int j = 0; j < smaller_index; j++)
      {
        sample_freq[j] = sample_freq[smaller_index];
        for (int k = 0; k < control_dim; k++)
        {
          sample_freqs(j, k) = powf(sample_freq[smaller_index], -exponents[k] / 2.0f);
        }
      }
    }
    for (int j = 0; j < control_dim; j++)
    {
      sample_freqs(i, j) = powf(sample_freq[i], -exponents[j] / 2.0f);
    }
  }
  // Calculate variance
  float
sigma[control_dim] = { 0 }; for (int i = 0; i < control_dim; i++) { for (int j = 1; j < freq_size - 1; j++) { sigma[i] += SQ(sample_freqs(j, i)); } // std::for_each(sample_freq.begin() + 1, sample_freq.end() - 1, [&sigma, &i](float j) { sigma[i] += powf(j, 2); }); sigma[i] += SQ(sample_freqs(freq_size - 1, i) * ((1.0f + (sample_num_timesteps % 2)) / 2.0f)); sigma[i] = 2.0f * sqrtf(sigma[i]) / sample_num_timesteps; } // Sample the noise in frequency domain and reutrn to time domain cufftHandle plan; const int batch = num_trajectories * control_dim; // Need 2 * (sample_num_timesteps / 2 + 1) * batch of randomly sampled values // float* samples_in_freq_d; float* sigma_d; float* noise_in_time_d; cufftComplex* samples_in_freq_complex_d; float* freq_coeffs_d; // HANDLE_ERROR(cudaMallocAsync((void**)&samples_in_freq_d, sizeof(float) * 2 * batch * freq_size, stream)); // HANDLE_ERROR(cudaMallocAsync((void**)&samples_in_freq_d, sizeof(float) * 2 * batch * sample_num_timesteps, // stream)); #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 HANDLE_ERROR(cudaMallocAsync((void**)&freq_coeffs_d, sizeof(float) * freq_size * control_dim, stream)); HANDLE_ERROR(cudaMallocAsync((void**)&samples_in_freq_complex_d, sizeof(cufftComplex) * batch * freq_size, stream)); HANDLE_ERROR(cudaMallocAsync((void**)&noise_in_time_d, sizeof(float) * batch * sample_num_timesteps, stream)); HANDLE_ERROR(cudaMallocAsync((void**)&sigma_d, sizeof(float) * control_dim, stream)); #else HANDLE_ERROR(cudaMalloc((void**)&freq_coeffs_d, sizeof(float) * freq_size * control_dim)); HANDLE_ERROR(cudaMalloc((void**)&samples_in_freq_complex_d, sizeof(cufftComplex) * batch * freq_size)); HANDLE_ERROR(cudaMalloc((void**)&noise_in_time_d, sizeof(float) * batch * sample_num_timesteps)); HANDLE_ERROR(cudaMalloc((void**)&sigma_d, sizeof(float) * control_dim)); #endif // curandSetStream(gen, stream); HANDLE_CURAND_ERROR(curandGenerateNormal(gen, (float*)samples_in_freq_complex_d, 2 * batch * freq_size, 0.0, 1.0)); 
HANDLE_ERROR(cudaMemcpyAsync(freq_coeffs_d, sample_freqs.data(), sizeof(float) * freq_size * control_dim, cudaMemcpyHostToDevice, stream)); HANDLE_ERROR(cudaMemcpyAsync(sigma_d, sigma, sizeof(float) * control_dim, cudaMemcpyHostToDevice, stream)); const int variance_grid_x = (num_trajectories - 1) / BLOCKSIZE_X + 1; const int variance_grid_y = (freq_size - 1) / BLOCKSIZE_Y + 1; const int variance_grid_z = (control_dim - 1) / BLOCKSIZE_Z + 1; dim3 grid(variance_grid_x, variance_grid_y, variance_grid_z); dim3 block(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z); // configureFrequencyNoise<<>>((cuComplex*) samples_in_freq_d, freq_coeffs_d, freq_size, // batch); configureFrequencyNoise<<>>(samples_in_freq_complex_d, freq_coeffs_d, num_trajectories, control_dim, freq_size); HANDLE_ERROR(cudaGetLastError()); HANDLE_CUFFT_ERROR(cufftPlan1d(&plan, sample_num_timesteps, CUFFT_C2R, batch)); HANDLE_CUFFT_ERROR(cufftSetStream(plan, stream)); // freq_data needs to be batch number of sample_num_timesteps/2 + 1 cuComplex values // time_data needs to be batch * sample_num_timesteps floats HANDLE_CUFFT_ERROR(cufftExecC2R(plan, samples_in_freq_complex_d, noise_in_time_d)); const int reorder_grid_x = (num_trajectories - 1) / BLOCKSIZE_X + 1; const int reorder_grid_y = (num_timesteps - 1) / BLOCKSIZE_Y + 1; const int reorder_grid_z = (control_dim - 1) / BLOCKSIZE_Z + 1; dim3 reorder_grid(reorder_grid_x, reorder_grid_y, reorder_grid_z); dim3 reorder_block(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z); // std::cout << "Grid: " << reorder_grid.x << ", " << reorder_grid.y << ", " << reorder_grid.z << std::endl; // std::cout << "Block: " << reorder_block.x << ", " << reorder_block.y << ", " << reorder_block.z << std::endl; rearrangeNoise<<>>(noise_in_time_d, control_noise_d, sigma_d, num_trajectories, num_timesteps, control_dim, offset_t, offset_decay_rate); HANDLE_ERROR(cudaGetLastError()); HANDLE_ERROR(cudaStreamSynchronize(stream)); HANDLE_CUFFT_ERROR(cufftDestroy(plan)); #if defined(CUDART_VERSION) 
&& CUDART_VERSION > 11200 // HANDLE_ERROR(cudaFreeAsync(samples_in_freq_d, stream)); HANDLE_ERROR(cudaFreeAsync(freq_coeffs_d, stream)); HANDLE_ERROR(cudaFreeAsync(sigma_d, stream)); HANDLE_ERROR(cudaFreeAsync(samples_in_freq_complex_d, stream)); HANDLE_ERROR(cudaFreeAsync(noise_in_time_d, stream)); #else // HANDLE_ERROR(cudaFree(samples_in_freq_d)); HANDLE_ERROR(cudaFree(freq_coeffs_d)); HANDLE_ERROR(cudaFree(sigma_d)); HANDLE_ERROR(cudaFree(samples_in_freq_complex_d)); HANDLE_ERROR(cudaFree(noise_in_time_d)); #endif } namespace mppi { namespace sampling_distributions { COLORED_TEMPLATE COLORED_NOISE::ColoredNoiseDistributionImpl(cudaStream_t stream) : PARENT_CLASS::GaussianDistributionImpl(stream) { } COLORED_TEMPLATE COLORED_NOISE::ColoredNoiseDistributionImpl(const SAMPLING_PARAMS_T& params, cudaStream_t stream) : PARENT_CLASS::GaussianDistributionImpl(params, stream) { } COLORED_TEMPLATE __host__ void COLORED_NOISE::freeCudaMem() { if (this->GPUMemStatus_) { cudaFree(freq_coeffs_d_); cudaFree(samples_in_freq_complex_d_); cudaFree(noise_in_time_d_); cudaFree(frequency_sigma_d_); freq_coeffs_d_ = nullptr; frequency_sigma_d_ = nullptr; noise_in_time_d_ = nullptr; samples_in_freq_complex_d_ = nullptr; cufftDestroy(plan_); } PARENT_CLASS::freeCudaMem(); } COLORED_TEMPLATE __host__ void COLORED_NOISE::allocateCUDAMemoryHelper() { PARENT_CLASS::allocateCUDAMemoryHelper(); if (this->GPUMemStatus_) { const int sample_num_timesteps = 2 * this->getNumTimesteps(); const int freq_size = sample_num_timesteps / 2 + 1; #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 if (frequency_sigma_d_) { HANDLE_ERROR(cudaFreeAsync(frequency_sigma_d_, this->stream_)); } if (samples_in_freq_complex_d_) { HANDLE_ERROR(cudaFreeAsync(samples_in_freq_complex_d_, this->stream_)); } if (noise_in_time_d_) { HANDLE_ERROR(cudaFreeAsync(noise_in_time_d_, this->stream_)); } if (freq_coeffs_d_) { HANDLE_ERROR(cudaFreeAsync(freq_coeffs_d_, this->stream_)); } HANDLE_ERROR( 
cudaMallocAsync((void**)&freq_coeffs_d_, sizeof(float) * freq_size * this->CONTROL_DIM, this->stream_)); HANDLE_ERROR(cudaMallocAsync((void**)&frequency_sigma_d_, sizeof(float) * this->CONTROL_DIM, this->stream_)); HANDLE_ERROR(cudaMallocAsync((void**)&samples_in_freq_complex_d_, sizeof(cufftComplex) * this->getNumRollouts() * this->CONTROL_DIM * freq_size * this->getNumDistributions(), this->stream_)); HANDLE_ERROR(cudaMallocAsync((void**)&noise_in_time_d_, sizeof(float) * this->getNumRollouts() * this->CONTROL_DIM * sample_num_timesteps * this->getNumDistributions(), this->stream_)); #else if (frequency_sigma_d_) { HANDLE_ERROR(cudaFree(frequency_sigma_d_)); } if (samples_in_freq_complex_d_) { HANDLE_ERROR(cudaFree(samples_in_freq_complex_d_)); } if (noise_in_time_d_) { HANDLE_ERROR(cudaFree(noise_in_time_d_)); } if (freq_coeffs_d_) { HANDLE_ERROR(cudaFree(freq_coeffs_d_)); } HANDLE_ERROR(cudaMalloc((void**)&freq_coeffs_d_, sizeof(float) * freq_size * this->CONTROL_DIM)); HANDLE_ERROR(cudaMalloc((void**)&frequency_sigma_d_, sizeof(float) * this->CONTROL_DIM)); HANDLE_ERROR(cudaMalloc((void**)&samples_in_freq_complex_d_, sizeof(cufftComplex) * this->getNumRollouts() * this->CONTROL_DIM * freq_size * this->getNumDistributions())); HANDLE_ERROR(cudaMalloc((void**)&noise_in_time_d_, sizeof(float) * this->getNumRollouts() * this->CONTROL_DIM * sample_num_timesteps * this->getNumDistributions())); #endif // Recreate FFT Plan HANDLE_CUFFT_ERROR(cufftPlan1d(&plan_, sample_num_timesteps, CUFFT_C2R, this->getNumRollouts() * this->getNumDistributions() * this->CONTROL_DIM)); HANDLE_CUFFT_ERROR(cufftSetStream(plan_, this->stream_)); } } COLORED_TEMPLATE __host__ void COLORED_NOISE::generateSamples(const int& optimization_stride, const int& iteration_num, curandGenerator_t& gen, bool synchronize) { const int BLOCKSIZE_X = this->params_.rewrite_controls_block_dim.x; const int BLOCKSIZE_Y = this->params_.rewrite_controls_block_dim.y; const int BLOCKSIZE_Z = 
this->params_.rewrite_controls_block_dim.z; const int num_trajectories = this->getNumRollouts() * this->getNumDistributions(); std::vector sample_freq; const int sample_num_timesteps = 2 * this->getNumTimesteps(); fftfreq(sample_num_timesteps, sample_freq); const float cutoff_freq = fmaxf(this->params_.fmin, 1.0f / sample_num_timesteps); const int freq_size = sample_freq.size(); int smaller_index = 0; const int local_control_dim = this->CONTROL_DIM; // Needed for methods which use pass by reference Eigen::MatrixXf sample_freqs(freq_size, local_control_dim); // Adjust the weighting of each frequency by the exponents for (int i = 0; i < freq_size; i++) { if (sample_freq[i] < cutoff_freq) { smaller_index++; } else if (smaller_index < freq_size) { for (int j = 0; j < smaller_index; j++) { sample_freq[j] = sample_freq[smaller_index]; for (int k = 0; k < this->CONTROL_DIM; k++) { sample_freqs(j, k) = powf(sample_freq[smaller_index], -this->params_.exponents[k] / 2.0f); } } } for (int j = 0; j < this->CONTROL_DIM; j++) { sample_freqs(i, j) = powf(sample_freq[i], -this->params_.exponents[j] / 2.0f); } } // Calculate variance float sigma[this->CONTROL_DIM] = { 0 }; for (int i = 0; i < this->CONTROL_DIM; i++) { for (int j = 1; j < freq_size - 1; j++) { sigma[i] += SQ(sample_freqs(j, i)); } sigma[i] += SQ(sample_freqs(freq_size - 1, i) * ((1.0f + (sample_num_timesteps % 2)) / 2.0f)); sigma[i] = 2.0f * sqrtf(sigma[i]) / sample_num_timesteps; } // Sample the noise in frequency domain and reutrn to time domain const int batch = num_trajectories * this->CONTROL_DIM; // Need 2 * (sample_num_timesteps / 2 + 1) * batch of randomly sampled values HANDLE_CURAND_ERROR(curandGenerateNormal(gen, (float*)samples_in_freq_complex_d_, 2 * batch * freq_size, 0.0, 1.0)); HANDLE_ERROR(cudaMemcpyAsync(freq_coeffs_d_, sample_freqs.data(), sizeof(float) * freq_size * this->CONTROL_DIM, cudaMemcpyHostToDevice, this->stream_)); HANDLE_ERROR(cudaMemcpyAsync(frequency_sigma_d_, sigma, sizeof(float) * 
this->CONTROL_DIM, cudaMemcpyHostToDevice, this->stream_)); const int num_trajectories_grid_x = mppi::math::int_ceil(num_trajectories, BLOCKSIZE_X); const int variance_grid_y = (freq_size - 1) / BLOCKSIZE_Y + 1; const int control_grid_z = mppi::math::int_ceil(local_control_dim, BLOCKSIZE_Z); dim3 grid(num_trajectories_grid_x, variance_grid_y, control_grid_z); dim3 block(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z); configureFrequencyNoise<<stream_>>>(samples_in_freq_complex_d_, freq_coeffs_d_, num_trajectories, this->CONTROL_DIM, freq_size); HANDLE_ERROR(cudaGetLastError()); // freq_data needs to be batch number of num_timesteps/2 + 1 cuComplex values // time_data needs to be batch * num_timesteps floats HANDLE_CUFFT_ERROR(cufftExecC2R(plan_, samples_in_freq_complex_d_, noise_in_time_d_)); // Change axes ordering from [trajectories, control, time] to [trajectories, time, control] const int reorder_grid_y = mppi::math::int_ceil(this->getNumTimesteps(), BLOCKSIZE_Y); dim3 reorder_grid(num_trajectories_grid_x, reorder_grid_y, control_grid_z); rearrangeNoise<<stream_>>>( noise_in_time_d_, this->control_samples_d_, frequency_sigma_d_, num_trajectories, this->getNumTimesteps(), this->CONTROL_DIM, optimization_stride, this->getOffsetDecayRate()); // Rewrite pure noise into actual control samples dim3 control_writing_grid; control_writing_grid.x = mppi::math::int_ceil(this->getNumRollouts(), BLOCKSIZE_X); control_writing_grid.y = mppi::math::int_ceil(this->getNumTimesteps(), BLOCKSIZE_Y); control_writing_grid.z = mppi::math::int_ceil(this->getNumDistributions(), BLOCKSIZE_Z); unsigned int std_dev_mem_size = this->getNumDistributions() * this->CONTROL_DIM; // Allocate shared memory for std_deviations per timestep or constant across the trajectory std_dev_mem_size = mppi::math::nearest_multiple_4( this->params_.time_specific_std_dev ? 
std_dev_mem_size * this->getNumTimesteps() : std_dev_mem_size); unsigned int shared_mem_size = std_dev_mem_size + mppi::math::nearest_multiple_4(this->getNumDistributions() * this->getNumTimesteps() * this->CONTROL_DIM) + mppi::math::nearest_multiple_4(BLOCKSIZE_X * BLOCKSIZE_Y * BLOCKSIZE_Z * this->CONTROL_DIM); shared_mem_size *= sizeof(float); setGaussianControls<<params_.rewrite_controls_block_dim, shared_mem_size, this->stream_>>>( this->control_means_d_, this->std_dev_d_, this->control_samples_d_, this->CONTROL_DIM, this->getNumTimesteps(), this->getNumRollouts(), this->getNumDistributions(), optimization_stride, powf(this->params_.std_dev_decay, iteration_num), this->params_.pure_noise_trajectories_percentage, this->params_.time_specific_std_dev); HANDLE_ERROR(cudaGetLastError()); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); } } } // namespace sampling_distributions } // namespace mppi #undef COLORED_TEMPLATE #undef COLORED_NOISE ================================================ FILE: include/mppi/sampling_distributions/colored_noise/colored_noise.cuh ================================================ #pragma once /** * Created by Bogdan, Dec 16, 2021 * based off of https://github.com/felixpatzelt/colorednoise/blob/master/colorednoise.py */ #include #include #include #include #include #include #include #include __global__ void configureFrequencyNoise(cufftComplex* noise, float* variance, int num_samples, int control_dim, int num_freq); __global__ void rearrangeNoise(float* input, float* output, float* variance, int num_trajectories, int num_timesteps, int control_dim, int offset_t, float decay_rate = 0.0); void fftfreq(const int num_samples, std::vector& result, const float spacing = 1) { // result is of size floor(n/2) + 1 int result_size = num_samples / 2 + 1; result.clear(); result.resize(result_size); for (int i = 0; i < result_size; i++) { result[i] = i / (spacing * num_samples); } } void powerlaw_psd_gaussian(std::vector& 
exponents, int num_timesteps, int num_trajectories, float* control_noise_d, int offset_t, curandGenerator_t& gen, float offset_decay_rate, cudaStream_t stream = 0, float fmin = 0.0); namespace mppi { namespace sampling_distributions { template struct ColoredNoiseParamsImpl : public GaussianParamsImpl { float exponents[C_DIM * MAX_DISTRIBUTIONS] = { 0.0f }; float offset_decay_rate = 0.97; float fmin = 0.0; ColoredNoiseParamsImpl(int num_rollouts = 1, int num_timesteps = 1, int num_distributions = 1) : GaussianParamsImpl(num_rollouts, num_timesteps, num_distributions) { } void copyExponentToDistribution(const int src_distribution_idx, const int dest_distribution_idx) { bool src_out_of_distribution = src_distribution_idx >= MAX_DISTRIBUTIONS; if (src_out_of_distribution || dest_distribution_idx >= MAX_DISTRIBUTIONS) { printf("%s Distribution %d is out of range. There are only %d total distributions\n", src_out_of_distribution ? "Src" : "Dest", src_out_of_distribution ? src_distribution_idx : dest_distribution_idx, MAX_DISTRIBUTIONS); return; } float* exponents_src = exponents[C_DIM * src_distribution_idx]; float* exponents_dest = exponents[C_DIM * dest_distribution_idx]; for (int i = 0; i < C_DIM; i++) { exponents_dest[i] = exponents_src[i]; } } }; template using ColoredNoiseParams = ColoredNoiseParamsImpl; template class PARAMS_TEMPLATE = ColoredNoiseParams, class DYN_PARAMS_T = DynamicsParams> class ColoredNoiseDistributionImpl : public GaussianDistributionImpl { public: using PARENT_CLASS = GaussianDistributionImpl; using SAMPLING_PARAMS_T = typename PARENT_CLASS::SAMPLING_PARAMS_T; using control_array = typename PARENT_CLASS::control_array; static const int CONTROL_DIM = PARENT_CLASS::CONTROL_DIM; ColoredNoiseDistributionImpl(cudaStream_t stream = 0); ColoredNoiseDistributionImpl(const SAMPLING_PARAMS_T& params, cudaStream_t stream = 0); ~ColoredNoiseDistributionImpl() { freeCudaMem(); } __host__ virtual std::string getSamplingDistributionName() const override { 
return "Colored Noise"; } __host__ void allocateCUDAMemoryHelper(); __host__ void freeCudaMem(); __host__ void generateSamples(const int& optimization_stride, const int& iteration_num, curandGenerator_t& gen, bool synchronize = true); __host__ __device__ float getOffsetDecayRate() const { return this->params_.offset_decay_rate; } void setOffsetDecayRate(const float decay_rate) { this->params_.offset_decay_rate = decay_rate; } protected: cufftHandle plan_; float* frequency_sigma_d_ = nullptr; float* noise_in_time_d_ = nullptr; cufftComplex* samples_in_freq_complex_d_ = nullptr; float* freq_coeffs_d_ = nullptr; }; template class ColoredNoiseDistribution : public ColoredNoiseDistributionImpl, ColoredNoiseParams, DYN_PARAMS_T> { public: using PARENT_CLASS = ColoredNoiseDistributionImpl; using SAMPLING_PARAMS_T = typename PARENT_CLASS::SAMPLING_PARAMS_T; ColoredNoiseDistribution(cudaStream_t stream = 0) : PARENT_CLASS(stream) { } ColoredNoiseDistribution(const SAMPLING_PARAMS_T& params, cudaStream_t stream = 0) : PARENT_CLASS(params, stream) { } }; } // namespace sampling_distributions } // namespace mppi #ifdef __CUDACC__ #include "colored_noise.cu" #endif ================================================ FILE: include/mppi/sampling_distributions/gaussian/gaussian.cu ================================================ /** * Created by Bogdan Vlahov on 3/24/2023 **/ #include #include #include #include namespace mppi { namespace sampling_distributions { #define GAUSSIAN_TEMPLATE template class PARAMS_TEMPLATE, class DYN_PARAMS_T> #define GAUSSIAN_CLASS GaussianDistributionImpl __global__ void setGaussianControls(const float* __restrict__ mean_d, const float* __restrict__ std_dev_d, float* __restrict__ control_samples_d, const int control_dim, const int num_timesteps, const int num_rollouts, const int num_distributions, const int optimization_stride, const float std_dev_decay, const float pure_noise_percentage, const bool time_specific_std_dev) { const int trajectory_index = 
threadIdx.x + blockDim.x * blockIdx.x; const int distribution_index = threadIdx.z + blockDim.z * blockIdx.z; const int time_index = threadIdx.y + blockDim.y * blockIdx.y; const bool valid_index = trajectory_index < num_rollouts && time_index < num_timesteps && distribution_index < num_distributions; const auto& num_timesteps_block = blockDim.y; const auto& num_rollouts_block = blockDim.x; const int shared_noise_index = threadIdx.z * num_timesteps_block * num_rollouts_block * control_dim + threadIdx.x * num_timesteps_block * control_dim + threadIdx.y * control_dim; const int global_noise_index = min(distribution_index, num_distributions) * num_timesteps * num_rollouts * control_dim + min(trajectory_index, num_rollouts) * num_timesteps * control_dim + min(time_index, num_timesteps) * control_dim; const int shared_mean_index = distribution_index * num_timesteps * control_dim + time_index * control_dim; // Std Deviation setup int std_dev_size = num_distributions * control_dim; int shared_std_dev_index = threadIdx.z * num_timesteps * control_dim + threadIdx.y * control_dim; int global_std_dev_index = min(distribution_index, num_distributions) * num_timesteps * control_dim + min(time_index, num_timesteps) * control_dim; shared_std_dev_index = time_specific_std_dev ? shared_std_dev_index : 0; global_std_dev_index = time_specific_std_dev ? global_std_dev_index : 0; std_dev_size = time_specific_std_dev ? num_timesteps * std_dev_size : std_dev_size; // local variables int i, j, k; // Shared memory setup /** * @brief Shared memory setup * This kernel has three shared memory arrays, mean_shared, std_dev_shared, and control_samples_shared. 
* In order to prevent memory alignment issues, the memory is being over-allocated to ensure that they are start on * the float4 boundary mean_shared - size should be num_timesteps * num_distributions * control_dim std_dev_shared = * num_distributions * control_dim if time_specific_std_dev is false std_dev_shared = num_distributions * * num_timesteps * control_dim if time_specific_std_dev is true control_samples_shared = BLOCKSIZE_X * BLOCKSIZE_Y * * BLOCKSIZE_Z * control_dim * */ extern __shared__ float entire_buffer[]; // Create memory_aligned shared memory pointers float* mean_shared = entire_buffer; float* std_dev_shared = &mean_shared[mppi::math::nearest_multiple_4(num_timesteps * num_distributions * control_dim)]; float* control_samples_shared = &std_dev_shared[mppi::math::nearest_multiple_4(std_dev_size)]; if (control_dim % 4 == 0) { // Step 1: copy means into shared memory for (i = threadIdx.z; i < num_distributions; i += blockDim.z) { for (j = threadIdx.y; j < num_timesteps; j += blockDim.y) { const int mean_index = (i * num_timesteps + j) * control_dim; float4* mean_shared4 = reinterpret_cast(&mean_shared[mean_index]); const float4* mean_d4 = reinterpret_cast(&mean_d[mean_index]); for (k = threadIdx.x; k < control_dim / 4; k += blockDim.x) { mean_shared4[k] = mean_d4[k]; } } } // Step 2: load std_dev to shared memory const float4* std_dev_d4 = reinterpret_cast(&std_dev_d[global_std_dev_index]); float4* std_dev_shared4 = reinterpret_cast(&std_dev_shared[shared_std_dev_index]); for (i = threadIdx.x; i < control_dim / 4; i += blockDim.x) { std_dev_shared4[i] = std_dev_decay * std_dev_d4[i]; } // Step 3: load noise into shared memory float4* control_samples_shared4 = reinterpret_cast(&control_samples_shared[shared_noise_index]); float4* control_samples_d4 = reinterpret_cast(&control_samples_d[global_noise_index]); // Create const pointre to mean in shared memory as it shouldn't change henceforth const float4* mean_shared4 = 
reinterpret_cast(&mean_shared[shared_mean_index]); for (i = 0; valid_index && i < control_dim / 4; i++) { control_samples_shared4[i] = control_samples_d4[i]; } __syncthreads(); // wait for all copying from global to shared memory to finish // Step 4: do mean + variance calculations if (valid_index && (trajectory_index == 0 || time_index < optimization_stride)) { // 0 noise trajectory for (i = 0; i < control_dim / 4; i++) { control_samples_shared4[i] = mean_shared4[i]; } } else if (valid_index && trajectory_index >= (1.0f - pure_noise_percentage) * num_rollouts) { // doing zero mean trajectories for (i = 0; i < control_dim / 4; i++) { control_samples_shared4[i] = std_dev_shared4[i] * control_samples_shared4[i]; } } else if (valid_index) { for (i = 0; i < control_dim / 4; i++) { control_samples_shared4[i] = mean_shared4[i] + std_dev_shared4[i] * control_samples_shared4[i]; } } // save back to global memory for (i = 0; valid_index && i < control_dim / 4; i++) { control_samples_d4[i] = control_samples_shared4[i]; } } else if (control_dim % 2 == 0) { // Step 1: copy means into shared memory for (i = threadIdx.z; i < num_distributions; i += blockDim.z) { for (j = threadIdx.y; j < num_timesteps; j += blockDim.y) { const int mean_index = (i * num_timesteps + j) * control_dim; float2* mean_shared2 = reinterpret_cast(&mean_shared[mean_index]); const float2* mean_d2 = reinterpret_cast(&mean_d[mean_index]); for (k = threadIdx.x; k < control_dim / 2; k += blockDim.x) { mean_shared2[k] = mean_d2[k]; } } } // Step 2: load std_dev to shared memory const float2* std_dev_d2 = reinterpret_cast(&std_dev_d[global_std_dev_index]); float2* std_dev_shared2 = reinterpret_cast(&std_dev_shared[shared_std_dev_index]); for (i = threadIdx.x; i < control_dim / 2; i += blockDim.x) { std_dev_shared2[i] = std_dev_decay * std_dev_d2[i]; } // Step 3: load noise into shared memory float2* control_samples_shared2 = reinterpret_cast(&control_samples_shared[shared_noise_index]); float2* 
control_samples_d2 = reinterpret_cast(&control_samples_d[global_noise_index]); // Create const pointer to mean in shared memory as it shouldn't change henceforth const float2* mean_shared2 = reinterpret_cast(&mean_shared[shared_mean_index]); for (i = 0; valid_index && i < control_dim / 2; i++) { control_samples_shared2[i] = control_samples_d2[i]; } __syncthreads(); // wait for all copying from global to shared memory to finish // Step 4: do mean + variance calculations if (valid_index && (trajectory_index == 0 || time_index < optimization_stride)) { // 0 noise trajectory for (i = 0; i < control_dim / 2; i++) { control_samples_shared2[i] = mean_shared2[i]; } } else if (valid_index && trajectory_index >= (1.0f - pure_noise_percentage) * num_rollouts) { // doing zero mean trajectories for (i = 0; i < control_dim / 2; i++) { control_samples_shared2[i] = std_dev_shared2[i] * control_samples_shared2[i]; } } else if (valid_index) { for (i = 0; i < control_dim / 2; i++) { control_samples_shared2[i] = mean_shared2[i] + std_dev_shared2[i] * control_samples_shared2[i]; } } // save back to global memory for (i = 0; valid_index && i < control_dim / 2; i++) { control_samples_d2[i] = control_samples_shared2[i]; } } else { // No memory alignment to take advantage of // Step 1: copy means into shared memory for (i = threadIdx.z; i < num_distributions; i += blockDim.z) { for (j = threadIdx.y; j < num_timesteps; j += blockDim.y) { const int mean_index = (i * num_timesteps + j) * control_dim; for (k = threadIdx.x; k < control_dim; k += blockDim.x) { mean_shared[mean_index + k] = mean_d[mean_index + k]; } } } // __syncthreads(); // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && threadIdx.y == 0) // { // printf("Mean: "); // for (i = 0; i < num_timesteps; i++) // { // printf("%f, ", mean_d[i]); // } // printf("\n"); // } // __syncthreads(); // Step 2: load std_dev to shared memory for (i = threadIdx.x; i < control_dim; i += blockDim.x) { 
std_dev_shared[shared_std_dev_index + i] = std_dev_decay * std_dev_d[global_std_dev_index + i]; } // Step 3: load noise into shared memory for (i = 0; valid_index && i < control_dim; i++) { control_samples_shared[shared_noise_index + i] = control_samples_d[global_noise_index + i]; } // __syncthreads(); // if (trajectory_index == 10 && time_index == 20) // { // printf("Control noise %d at time %d: ", trajectory_index, time_index); // for (i = 0; i < control_dim; i++) // { // printf("%f, ", control_samples_shared[shared_noise_index + i]); // } // printf("\n std_dev_decay: %f, optimization_stride: %d\n", std_dev_decay, optimization_stride); // printf("Std Dev: %f\n", std_dev_shared[shared_std_dev_index]); // } // __syncthreads(); __syncthreads(); // wait for all copying from global to shared memory to finish // Step 4: do mean + variance calculations if (valid_index && (trajectory_index == 0 || time_index < optimization_stride)) { // 0 noise trajectory for (i = 0; i < control_dim; i++) { control_samples_shared[shared_noise_index + i] = mean_shared[shared_mean_index + i]; } } else if (valid_index && trajectory_index >= (1.0f - pure_noise_percentage) * num_rollouts) { // doing zero mean trajectories for (i = 0; i < control_dim; i++) { control_samples_shared[shared_noise_index + i] = std_dev_shared[shared_std_dev_index + i] * control_samples_shared[shared_noise_index + i]; } } else if (valid_index) { for (i = 0; i < control_dim; i++) { control_samples_shared[shared_noise_index + i] = mean_shared[shared_mean_index + i] + std_dev_shared[shared_std_dev_index + i] * control_samples_shared[shared_noise_index + i]; } } __syncthreads(); // save back to global memory for (i = 0; valid_index && i < control_dim; i++) { control_samples_d[global_noise_index + i] = control_samples_shared[shared_noise_index + i]; } } } GAUSSIAN_TEMPLATE GAUSSIAN_CLASS::GaussianDistributionImpl(cudaStream_t stream) : PARENT_CLASS::SamplingDistribution(stream) { } GAUSSIAN_TEMPLATE 
GAUSSIAN_CLASS::GaussianDistributionImpl(const SAMPLING_PARAMS_T& params, cudaStream_t stream) : PARENT_CLASS::SamplingDistribution(params, stream) { } GAUSSIAN_TEMPLATE __host__ void GAUSSIAN_CLASS::allocateCUDAMemoryHelper() { if (this->GPUMemStatus_) { if (std_dev_d_) { #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 HANDLE_ERROR(cudaFreeAsync(std_dev_d_, this->stream_)); #else HANDLE_ERROR(cudaFree(std_dev_d_)); #endif } if (control_means_d_) { // deallocate previous memory control trajectory means #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 HANDLE_ERROR(cudaFreeAsync(control_means_d_, this->stream_)); #else HANDLE_ERROR(cudaFree(control_means_d_)); #endif } int std_dev_size = CONTROL_DIM * this->getNumDistributions(); if (this->params_.time_specific_std_dev) { std_dev_size *= this->getNumTimesteps(); } #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 HANDLE_ERROR(cudaMallocAsync((void**)&std_dev_d_, sizeof(float) * std_dev_size, this->stream_)); HANDLE_ERROR(cudaMallocAsync((void**)&control_means_d_, sizeof(float) * this->getNumDistributions() * this->getNumTimesteps() * CONTROL_DIM, this->stream_)); #else HANDLE_ERROR(cudaMalloc((void**)&std_dev_d_, sizeof(float) * std_dev_size)); HANDLE_ERROR(cudaMalloc((void**)&control_means_d_, sizeof(float) * this->getNumDistributions() * this->getNumTimesteps() * CONTROL_DIM)); #endif means_.resize(this->getNumDistributions() * this->getNumTimesteps() * CONTROL_DIM); // Ensure that the device side point knows where the the standard deviation memory is located HANDLE_ERROR(cudaMemcpyAsync(&this->sampling_d_->std_dev_d_, &std_dev_d_, sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); HANDLE_ERROR(cudaMemcpyAsync(&this->sampling_d_->control_means_d_, &control_means_d_, sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); } } GAUSSIAN_TEMPLATE __host__ void GAUSSIAN_CLASS::freeCudaMem() { if (this->GPUMemStatus_) { HANDLE_ERROR(cudaFree(control_means_d_)); HANDLE_ERROR(cudaFree(std_dev_d_)); 
control_means_d_ = nullptr; std_dev_d_ = nullptr; } PARENT_CLASS::freeCudaMem(); } GAUSSIAN_TEMPLATE void GAUSSIAN_CLASS::paramsToDevice(bool synchronize) { PARENT_CLASS::paramsToDevice(false); if (this->GPUMemStatus_) { if (this->params_.time_specific_std_dev) { HANDLE_ERROR(cudaMemcpyAsync(this->std_dev_d_, this->params_.std_dev, sizeof(float) * CONTROL_DIM * this->getNumTimesteps() * this->getNumDistributions(), cudaMemcpyHostToDevice, this->stream_)); } else { HANDLE_ERROR(cudaMemcpyAsync(this->std_dev_d_, this->params_.std_dev, sizeof(float) * CONTROL_DIM * this->getNumDistributions(), cudaMemcpyHostToDevice, this->stream_)); } if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); } } } GAUSSIAN_TEMPLATE __host__ void GAUSSIAN_CLASS::generateSamples(const int& optimization_stride, const int& iteration_num, curandGenerator_t& gen, bool synchronize) { if (this->params_.use_same_noise_for_all_distributions) { HANDLE_CURAND_ERROR(curandGenerateNormal( gen, this->control_samples_d_, this->getNumTimesteps() * this->getNumRollouts() * CONTROL_DIM, 0.0f, 1.0f)); for (int i = 1; i < this->getNumDistributions(); i++) { HANDLE_ERROR(cudaMemcpyAsync( &this->control_samples_d_[this->getNumRollouts() * this->getNumTimesteps() * CONTROL_DIM * i], this->control_samples_d_, sizeof(float) * this->getNumRollouts() * this->getNumTimesteps() * CONTROL_DIM, cudaMemcpyDeviceToDevice, this->stream_)); } } else { HANDLE_CURAND_ERROR(curandGenerateNormal( gen, this->control_samples_d_, this->getNumTimesteps() * this->getNumRollouts() * this->getNumDistributions() * CONTROL_DIM, 0.0f, 1.0f)); } const int BLOCKSIZE_X = this->params_.rewrite_controls_block_dim.x; const int BLOCKSIZE_Y = this->params_.rewrite_controls_block_dim.y; const int BLOCKSIZE_Z = this->params_.rewrite_controls_block_dim.z; /** * Generate noise samples with mean added **/ dim3 control_writing_grid; control_writing_grid.x = mppi::math::int_ceil(this->getNumRollouts(), BLOCKSIZE_X); 
control_writing_grid.y = mppi::math::int_ceil(this->getNumTimesteps(), BLOCKSIZE_Y); control_writing_grid.z = mppi::math::int_ceil(this->getNumDistributions(), BLOCKSIZE_Z); unsigned int std_dev_mem_size = this->getNumDistributions() * CONTROL_DIM; // Allocate shared memory for std_deviations per timestep or constant across the trajectory std_dev_mem_size = mppi::math::nearest_multiple_4( this->params_.time_specific_std_dev ? std_dev_mem_size * this->getNumTimesteps() : std_dev_mem_size); unsigned int shared_mem_size = std_dev_mem_size + mppi::math::nearest_multiple_4(this->getNumDistributions() * this->getNumTimesteps() * CONTROL_DIM) + mppi::math::nearest_multiple_4(BLOCKSIZE_X * BLOCKSIZE_Y * BLOCKSIZE_Z * CONTROL_DIM); shared_mem_size *= sizeof(float); // std::cout << "Shared mem size: " << shared_mem_size << " bytes. BLOCKSIZE_X: " << BLOCKSIZE_X // << ", BLOCKSIZE_Y: " << BLOCKSIZE_Y << ", BLOCKSIZE_Z: " << BLOCKSIZE_Z // << "Grid: (" << control_writing_grid.x << ", " << control_writing_grid.y // << ", " << control_writing_grid.z << ")" << std::endl; setGaussianControls<<params_.rewrite_controls_block_dim, shared_mem_size, this->stream_>>>( this->control_means_d_, this->std_dev_d_, this->control_samples_d_, CONTROL_DIM, this->getNumTimesteps(), this->getNumRollouts(), this->getNumDistributions(), optimization_stride, powf(this->params_.std_dev_decay, iteration_num), this->params_.pure_noise_trajectories_percentage, this->params_.time_specific_std_dev); HANDLE_ERROR(cudaGetLastError()); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); } } GAUSSIAN_TEMPLATE __host__ void GAUSSIAN_CLASS::updateDistributionParamsFromDevice(const float* trajectory_weights_d, float normalizer, const int& distribution_i, bool synchronize) { if (distribution_i >= this->getNumDistributions()) { this->logger_->error( "Updating distributional params for distribution %d out of %d total. 
Distribution out of bounds.\n", distribution_i, this->getNumDistributions()); return; } float* control_samples_i_d = &(this->control_samples_d_[distribution_i * this->getNumRollouts() * this->getNumTimesteps() * CONTROL_DIM]); float* control_mean_i_d = &(this->control_means_d_[distribution_i * this->getNumTimesteps() * CONTROL_DIM]); mppi::kernels::launchWeightedReductionKernel(trajectory_weights_d, control_samples_i_d, control_mean_i_d, normalizer, this->getNumTimesteps(), this->getNumRollouts(), this->params_.sum_strides, this->stream_, synchronize); HANDLE_ERROR(cudaMemcpyAsync(&means_[distribution_i * this->getNumTimesteps() * CONTROL_DIM], control_mean_i_d, sizeof(float) * this->getNumTimesteps() * CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); } } GAUSSIAN_TEMPLATE __host__ void GAUSSIAN_CLASS::setHostOptimalControlSequence(float* optimal_control_trajectory, const int& distribution_i, bool synchronize) { if (distribution_i >= this->getNumDistributions()) { this->logger_->error( "Asking for optimal control sequence from distribution %d out of %d total. Distribution out of bounds.\n", distribution_i, this->getNumDistributions()); return; } HANDLE_ERROR(cudaMemcpyAsync( optimal_control_trajectory, &(this->control_means_d_[this->getNumTimesteps() * CONTROL_DIM * distribution_i]), sizeof(float) * this->getNumTimesteps() * CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); } } GAUSSIAN_TEMPLATE __host__ __device__ float GAUSSIAN_CLASS::computeLikelihoodRatioCost(const float* __restrict__ u, float* __restrict__ theta_d, const int sample_index, const int t, const int distribution_idx, const float lambda, const float alpha) { SAMPLING_PARAMS_T* params_p = (SAMPLING_PARAMS_T*)theta_d; const int distribution_i = distribution_idx >= params_p->num_distributions ? 
0 : distribution_idx; float* std_dev = &(params_p->std_dev[CONTROL_DIM * distribution_i]); if (params_p->time_specific_std_dev) { std_dev = &(params_p->std_dev[(distribution_i * params_p->num_timesteps + t) * CONTROL_DIM]); } float* mean = &(this->control_means_d_[(params_p->num_timesteps * distribution_i + t) * CONTROL_DIM]); float* control_cost_coeff = params_p->control_cost_coeff; float cost = 0.0f; int i, step; mppi::p1::getParallel1DIndex(i, step); if (CONTROL_DIM % 4 == 0) { float4 cost_i = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float4 mean_i, std_dev_i, u_i, control_cost_coeff_i; for (; i < CONTROL_DIM / 4; i += step) { if (sample_index >= (1.0f - params_p->pure_noise_trajectories_percentage) * params_p->num_rollouts) { mean_i = make_float4(0.0f, 0.0f, 0.0f, 0.0f); } else { mean_i = reinterpret_cast(mean)[i]; // read mean value from global memory only once } std_dev_i = reinterpret_cast(std_dev)[i]; u_i = reinterpret_cast(u)[i]; control_cost_coeff_i = reinterpret_cast(control_cost_coeff)[i]; // cost_i += control_cost_coeff_i * mean_i * (mean_i + 2 * (u_i - mean_i)) / (std_dev_i * std_dev_i); cost_i += control_cost_coeff_i * mean_i * (mean_i - 2.0f * u_i) / (std_dev_i * std_dev_i); // Proper way } cost += cost_i.x + cost_i.y + cost_i.z + cost_i.w; } else if (CONTROL_DIM % 2 == 0) { float2 cost_i = make_float2(0.0f, 0.0f); float2 mean_i, std_dev_i, u_i, control_cost_coeff_i; for (; i < CONTROL_DIM / 2; i += step) { if (sample_index >= (1.0f - params_p->pure_noise_trajectories_percentage) * params_p->num_rollouts) { mean_i = make_float2(0.0f, 0.0f); } else { mean_i = reinterpret_cast(mean)[i]; // read mean value from global memory only once } std_dev_i = reinterpret_cast(std_dev)[i]; u_i = reinterpret_cast(u)[i]; control_cost_coeff_i = reinterpret_cast(control_cost_coeff)[i]; // cost_i += control_cost_coeff_i * mean_i * (mean_i + 2 * (u_i - mean_i)) / (std_dev_i * std_dev_i); cost_i += control_cost_coeff_i * mean_i * (mean_i - 2.0f * u_i) / (std_dev_i * 
std_dev_i); // Proper way } cost += cost_i.x + cost_i.y; } else { float mean_i; for (; i < CONTROL_DIM; i += step) { if (sample_index >= (1.0f - params_p->pure_noise_trajectories_percentage) * params_p->num_rollouts) { mean_i = 0.0f; } else { mean_i = mean[i]; // read mean value from global memory only once } cost += control_cost_coeff[i] * mean_i * (mean_i - 2.0f * u[i]) / (std_dev[i] * std_dev[i]); // Proper way // float noise = u[i] - mean_i; // cost += control_cost_coeff[i] * mean_i * (u[i] + noise) / (std_dev[i] * std_dev[i]); // Way in cost kernel } } return 0.5f * lambda * (1.0f - alpha) * cost; } GAUSSIAN_TEMPLATE __host__ __device__ float GAUSSIAN_CLASS::computeFeedbackCost(const float* __restrict__ u_fb, float* __restrict__ theta_d, const int t, const int distribution_idx, const float lambda, const float alpha) { SAMPLING_PARAMS_T* params_p = (SAMPLING_PARAMS_T*)theta_d; const int distribution_i = distribution_idx >= params_p->num_distributions ? 0 : distribution_idx; float* std_dev = &(params_p->std_dev[CONTROL_DIM * distribution_i]); if (params_p->time_specific_std_dev) { std_dev = &(params_p->std_dev[(distribution_i * params_p->num_timesteps + t) * CONTROL_DIM]); } float* control_cost_coeff = params_p->control_cost_coeff; float cost = 0.0f; int i, step; mppi::p1::getParallel1DIndex(i, step); if (CONTROL_DIM % 4 == 0) { float4 cost_i = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float4 std_dev_i, control_cost_coeff_i, u_fb_i; for (; i < CONTROL_DIM / 4; i += step) { std_dev_i = reinterpret_cast(std_dev)[i]; u_fb_i = reinterpret_cast(u_fb)[i]; control_cost_coeff_i = reinterpret_cast(control_cost_coeff)[i]; cost_i += control_cost_coeff_i * (u_fb_i * u_fb_i) / (std_dev_i * std_dev_i); } cost += cost_i.x + cost_i.y + cost_i.z + cost_i.w; } else if (CONTROL_DIM % 2 == 0) { float2 cost_i = make_float2(0.0f, 0.0f); float2 std_dev_i, control_cost_coeff_i, u_fb_i; for (; i < CONTROL_DIM / 2; i += step) { std_dev_i = reinterpret_cast(std_dev)[i]; control_cost_coeff_i = 
reinterpret_cast(control_cost_coeff)[i]; u_fb_i = reinterpret_cast(u_fb)[i]; cost_i += control_cost_coeff_i * (u_fb_i * u_fb_i) / (std_dev_i * std_dev_i); } cost += cost_i.x + cost_i.y; } else { for (; i < CONTROL_DIM; i += step) { cost += control_cost_coeff[i] * (u_fb[i] * u_fb[i]) / (std_dev[i] * std_dev[i]); } } return 0.5f * lambda * (1.0f - alpha) * cost; } GAUSSIAN_TEMPLATE __host__ float GAUSSIAN_CLASS::computeFeedbackCost(const Eigen::Ref& u_fb, const int t, const int distribution_idx, const float lambda, const float alpha) { float cost = 0.0f; const int distribution_i = distribution_idx >= this->params_.num_distributions ? 0 : distribution_idx; float* std_dev = &(this->params_.std_dev[CONTROL_DIM * distribution_i]); if (this->params_.time_specific_std_dev) { std_dev = &(this->params_.std_dev[(distribution_i * this->getNumTimesteps() + t) * CONTROL_DIM]); } for (int i = 0; i < CONTROL_DIM; i++) { cost += this->params_.control_cost_coeff[i] * u_fb(i) * u_fb(i) / (std_dev[i] * std_dev[i]); } return 0.5f * lambda * (1.0f - alpha) * cost; } GAUSSIAN_TEMPLATE __host__ float GAUSSIAN_CLASS::computeLikelihoodRatioCost(const Eigen::Ref& u, const int sample_index, const int t, const int distribution_idx, const float lambda, const float alpha) { float cost = 0.0f; const int distribution_i = distribution_idx >= this->params_.num_distributions ? 
0 : distribution_idx; const int mean_index = (distribution_i * this->getNumTimesteps() + t) * CONTROL_DIM; float* mean = &(this->means_[mean_index]); float* std_dev = &(this->params_.std_dev[CONTROL_DIM * distribution_i]); control_array zero_mean = control_array::Zero(); if (sample_index >= (1.0f - this->params_.pure_noise_trajectories_percentage) * this->params_.num_rollouts) { mean = zero_mean.data(); } if (this->params_.time_specific_std_dev) { std_dev = &(this->params_.std_dev[(distribution_i * this->getNumTimesteps() + t) * CONTROL_DIM]); } for (int i = 0; i < CONTROL_DIM; i++) { cost += this->params_.control_cost_coeff[i] * mean[i] * (mean[i] - 2.0f * u(i)) / (std_dev[i] * std_dev[i]); // Proper way } return 0.5f * lambda * (1.0f - alpha) * cost; } GAUSSIAN_TEMPLATE __host__ void GAUSSIAN_CLASS::copyImportanceSamplerToDevice(const float* importance_sampler, const int& distribution_idx, bool synchronize) { HANDLE_ERROR(cudaMemcpyAsync(&control_means_d_[this->getNumTimesteps() * CONTROL_DIM * distribution_idx], importance_sampler, sizeof(float) * this->getNumTimesteps() * CONTROL_DIM, cudaMemcpyHostToDevice, this->stream_)); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); } } #undef GAUSSIAN_TEMPLATE #undef GAUSSIAN_CLASS } // namespace sampling_distributions } // namespace mppi ================================================ FILE: include/mppi/sampling_distributions/gaussian/gaussian.cuh ================================================ #pragma once /** * Created by Bogdan Vlahov on 3/24/2023 **/ #include #include namespace mppi { namespace sampling_distributions { __global__ void setGaussianControls(const float* __restrict__ mean_d, const float* __restrict__ std_dev_d, float* __restrict__ control_samples_d, const int control_dim, const int num_timesteps, const int num_rollouts, const int num_distributions, const int optimization_stride, const float std_dev_decay, const float pure_noise_percentage, const bool time_specific_std_dev = 
false); // Set the default number of distributions to 2 since that is currently the most we would use template struct GaussianParamsImpl : public SamplingParams { static const int MAX_DISTRIBUTIONS = MAX_DISTRIBUTIONS_T; float std_dev[C_DIM * MAX_DISTRIBUTIONS] MPPI_ALIGN(sizeof(float4)) = { 0.0f }; float control_cost_coeff[C_DIM] MPPI_ALIGN(sizeof(float4)) = { 0.0f }; float pure_noise_trajectories_percentage = 0.01f; float std_dev_decay = 1.0f; // Kernel launching params dim3 rewrite_controls_block_dim = dim3(32, 16, 1); int sum_strides = 32; // Various flags bool time_specific_std_dev = false; GaussianParamsImpl(int num_rollouts = 1, int num_timesteps = 1, int num_distributions = 1) : SamplingParams::SamplingParams(num_rollouts, num_timesteps, num_distributions) { for (int i = 0; i < this->CONTROL_DIM * MAX_DISTRIBUTIONS; i++) { std_dev[i] = 1.0f; } } void copyStdDevToDistribution(const int src_distribution_idx, const int dest_distribution_idx) { bool src_out_of_distribution = src_distribution_idx >= MAX_DISTRIBUTIONS; if (src_out_of_distribution || dest_distribution_idx >= MAX_DISTRIBUTIONS) { printf("%s Distribution %d is out of range. There are only %d total distributions\n", src_out_of_distribution ? "Src" : "Dest", src_out_of_distribution ? 
src_distribution_idx : dest_distribution_idx, MAX_DISTRIBUTIONS); return; } float* std_dev_src = std_dev[this->CONTROL_DIM * src_distribution_idx]; float* std_dev_dest = std_dev[this->CONTROL_DIM * dest_distribution_idx]; for (int i = 0; i < this->CONTROL_DIM; i++) { std_dev_dest[i] = std_dev_src[i]; } } }; template using GaussianParams = GaussianParamsImpl; template struct GaussianTimeVaryingStdDevParams : public GaussianParamsImpl { float std_dev[C_DIM * MAX_TIMESTEPS * MAX_DISTRIBUTIONS_T] = { 0.0f }; GaussianTimeVaryingStdDevParams(int num_rollouts = 1, int num_timesteps = 1, int num_distributions = 1) : GaussianParamsImpl::GaussianParamsImpl(num_rollouts, num_timesteps, num_distributions) { this->time_specific_std_dev = true; for (int i = 0; i < this->CONTROL_DIM * MAX_TIMESTEPS * this->MAX_DISTRIBUTIONS; i++) { std_dev[i] = 1.0f; } } void copyStdDevToDistribution(const int src_distribution_idx, const int dest_distribution_idx) { bool src_out_of_distribution = src_distribution_idx >= this->MAX_DISTRIBUTIONS; if (src_out_of_distribution || dest_distribution_idx >= this->MAX_DISTRIBUTIONS) { printf("%s Distribution %d is out of range. There are only %d total distributions\n", src_out_of_distribution ? "Src" : "Dest", src_out_of_distribution ? 
src_distribution_idx : dest_distribution_idx, this->MAX_DISTRIBUTIONS); return; } float* std_dev_src = std_dev[this->CONTROL_DIM * this->num_timesteps * src_distribution_idx]; float* std_dev_dest = std_dev[this->CONTROL_DIM * this->num_timesteps * dest_distribution_idx]; for (int i = 0; i < this->CONTROL_DIM * this->num_timesteps; i++) { std_dev_dest[i] = std_dev_src[i]; } } }; template class PARAMS_TEMPLATE = GaussianParams, class DYN_PARAMS_T = DynamicsParams> class GaussianDistributionImpl : public SamplingDistribution { public: using PARENT_CLASS = SamplingDistribution; using SAMPLING_PARAMS_T = typename PARENT_CLASS::SAMPLING_PARAMS_T; using control_array = typename PARENT_CLASS::control_array; static const int CONTROL_DIM = PARENT_CLASS::CONTROL_DIM; typedef Eigen::Matrix TEST_TYPE; GaussianDistributionImpl(cudaStream_t stream = 0); GaussianDistributionImpl(const SAMPLING_PARAMS_T& params, cudaStream_t stream = 0); ~GaussianDistributionImpl() { freeCudaMem(); } __host__ virtual std::string getSamplingDistributionName() const override { return "Gaussian"; } __host__ void allocateCUDAMemoryHelper(); __host__ __device__ float computeFeedbackCost(const float* __restrict__ u_fb, float* __restrict__ theta_d, const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); __host__ float computeFeedbackCost(const Eigen::Ref& u_fb, const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); __host__ __device__ float computeLikelihoodRatioCost(const float* __restrict__ u, float* __restrict__ theta_d, const int sample_index, const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); __host__ float computeLikelihoodRatioCost(const Eigen::Ref& u, const int sample_index, const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); __host__ void copyImportanceSamplerToDevice(const float* importance_sampler, const int& distribution_idx, bool 
synchronize = true); __host__ void freeCudaMem(); __host__ void generateSamples(const int& optimization_stride, const int& iteration_num, curandGenerator_t& gen, bool synchronize = true); __host__ void paramsToDevice(bool synchronize = true); __host__ void setHostOptimalControlSequence(float* optimal_control_trajectory, const int& distribution_idx, bool synchronize = true); __host__ void setNumDistributions(const int num_distributions, bool synchronize = false) { if (num_distributions > SAMPLING_PARAMS_T::MAX_DISTRIBUTIONS) { this->logger_->error("GaussianParams can't handle more than %d distributions but %d were requested\n", SAMPLING_PARAMS_T::MAX_DISTRIBUTIONS, num_distributions); throw std::out_of_range("Can't set num distributions higher than allowed in params"); } PARENT_CLASS::setNumDistributions(num_distributions, synchronize); } __host__ void updateDistributionParamsFromDevice(const float* trajectory_weights_d, float normalizer, const int& distribution_i, bool synchronize = false) override; protected: float* std_dev_d_ = nullptr; float* control_means_d_ = nullptr; std::vector means_; }; template class GaussianDistribution : public GaussianDistributionImpl, GaussianParams, DYN_PARAMS_T> { public: using PARENT_CLASS = GaussianDistributionImpl; using SAMPLING_PARAMS_T = typename PARENT_CLASS::SAMPLING_PARAMS_T; GaussianDistribution(cudaStream_t stream = 0) : PARENT_CLASS(stream) { } GaussianDistribution(const SAMPLING_PARAMS_T& params, cudaStream_t stream = 0) : PARENT_CLASS(params, stream) { } }; template const int GaussianParamsImpl::MAX_DISTRIBUTIONS; } // namespace sampling_distributions } // namespace mppi #if __CUDACC__ #include "gaussian.cu" #endif ================================================ FILE: include/mppi/sampling_distributions/nln/nln.cu ================================================ /** * Created by Bogdan Vlahov on Jan 7, 2024 **/ #define NLN_TEMPLATE template class PARAMS_TEMPLATE, class DYN_PARAMS_T> #define NLN_NOISE 
NLNDistributionImpl #include #include #include __global__ void createNLNNoise(float* __restrict__ normal_noise, const float* __restrict__ log_normal_noise, const int num_trajectories, const int num_timesteps, const int control_dim) { const int sample_index = blockIdx.x * blockDim.x + threadIdx.x; const int time_index = blockIdx.y * blockDim.y + threadIdx.y; const int control_index = blockIdx.z * blockDim.z + threadIdx.z; if (sample_index < num_trajectories && time_index < num_timesteps && control_index < control_dim) { const int normal_index = (sample_index * num_timesteps + time_index) * control_dim + control_index; const int log_normal_index = (control_index * num_trajectories + sample_index) * num_timesteps + time_index; normal_noise[normal_index] = normal_noise[normal_index] * log_normal_noise[log_normal_index]; } } namespace mppi { namespace sampling_distributions { NLN_TEMPLATE NLN_NOISE::NLNDistributionImpl(cudaStream_t stream) : PARENT_CLASS::GaussianDistributionImpl(stream) { calculateLogMeanAndVariance(); } NLN_TEMPLATE NLN_NOISE::NLNDistributionImpl(const SAMPLING_PARAMS_T& params, cudaStream_t stream) : PARENT_CLASS::GaussianDistributionImpl(params, stream) { calculateLogMeanAndVariance(); } NLN_TEMPLATE __host__ void NLN_NOISE::freeCudaMem() { if (this->GPUMemStatus_) { cudaFree(log_normal_noise_d_); } PARENT_CLASS::freeCudaMem(); } NLN_TEMPLATE __host__ void NLN_NOISE::allocateCUDAMemoryHelper() { PARENT_CLASS::allocateCUDAMemoryHelper(); if (this->GPUMemStatus_) { if (log_normal_noise_d_) { HANDLE_ERROR(cudaFreeAsync(log_normal_noise_d_, this->stream_)); } HANDLE_ERROR(cudaMallocAsync((void**)&log_normal_noise_d_, sizeof(float) * this->CONTROL_DIM * this->getNumRollouts() * this->getNumTimesteps() * this->getNumDistributions(), this->stream_)); } } NLN_TEMPLATE void NLN_NOISE::setParams(const SAMPLING_PARAMS_T& params, bool synchronize) { bool adjusted_variance = false; for (int i = 0; i < this->CONTROL_DIM * this->getNumDistributions(); i++) { if 
(this->params_.std_dev[i] != params.std_dev[i]) { adjusted_variance = true; break; } } PARENT_CLASS::setParams(params, synchronize); if (adjusted_variance) { calculateLogMeanAndVariance(); } } NLN_TEMPLATE void NLN_NOISE::calculateLogMeanAndVariance() { float normal_variance, log_variance; log_noise_mean_.resize(this->CONTROL_DIM * this->getNumDistributions()); log_noise_std_dev_.resize(this->CONTROL_DIM * this->getNumDistributions()); for (int i = 0; i < this->CONTROL_DIM * this->getNumDistributions(); i++) { normal_variance = this->params_.std_dev[i] * this->params_.std_dev[i]; log_noise_mean_[i] = expf(0.5 * normal_variance); float exp_normal_variance = expf(normal_variance); log_variance = exp_normal_variance * (exp_normal_variance - 1.0f); log_noise_std_dev_[i] = sqrtf(log_variance); } } NLN_TEMPLATE __host__ void NLN_NOISE::generateSamples(const int& optimization_stride, const int& iteration_num, curandGenerator_t& gen, bool synchronize) { // generate log normal noise for (int i = 0; i < this->CONTROL_DIM; i++) { HANDLE_CURAND_ERROR( curandGenerateLogNormal(gen, log_normal_noise_d_ + i * this->getNumRollouts() * this->getNumTimesteps(), this->getNumRollouts() * this->getNumTimesteps(), 0.0f, this->params_.std_dev[i])); } // generate normal noise HANDLE_CURAND_ERROR(curandGenerateNormal(gen, this->control_samples_d_, this->getNumTimesteps() * this->getNumRollouts() * CONTROL_DIM, 0.0f, 1.0f)); // create NLN noise by multiplying normal and log normal noise const int BLOCKSIZE_X = this->params_.rewrite_controls_block_dim.x; const int BLOCKSIZE_Y = this->params_.rewrite_controls_block_dim.y; const int BLOCKSIZE_Z = this->params_.rewrite_controls_block_dim.z; dim3 combine_noise_grid; combine_noise_grid.x = mppi::math::int_ceil(this->getNumRollouts(), BLOCKSIZE_X); combine_noise_grid.y = mppi::math::int_ceil(this->getNumTimesteps(), BLOCKSIZE_Y); combine_noise_grid.z = mppi::math::int_ceil(this->CONTROL_DIM, BLOCKSIZE_Z); 
createNLNNoise<<params_.rewrite_controls_block_dim, 0, this->stream_>>>( this->control_samples_d_, log_normal_noise_d_, this->getNumRollouts(), this->getNumTimesteps(), this->CONTROL_DIM); HANDLE_ERROR(cudaGetLastError()); for (int i = 1; i < this->getNumDistributions(); i++) { HANDLE_ERROR(cudaMemcpyAsync( &this->control_samples_d_[this->getNumRollouts() * this->getNumTimesteps() * CONTROL_DIM * i], this->control_samples_d_, sizeof(float) * this->getNumRollouts() * this->getNumTimesteps() * CONTROL_DIM, cudaMemcpyDeviceToDevice, this->stream_)); } /** * Generate noise samples with mean added **/ dim3 control_writing_grid; control_writing_grid.x = mppi::math::int_ceil(this->getNumRollouts(), BLOCKSIZE_X); control_writing_grid.y = mppi::math::int_ceil(this->getNumTimesteps(), BLOCKSIZE_Y); control_writing_grid.z = mppi::math::int_ceil(this->getNumDistributions(), BLOCKSIZE_Z); unsigned int std_dev_mem_size = this->getNumDistributions() * CONTROL_DIM; // Allocate shared memory for std_deviations per timestep or constant across the trajectory std_dev_mem_size = mppi::math::nearest_multiple_4( this->params_.time_specific_std_dev ? 
std_dev_mem_size * this->getNumTimesteps() : std_dev_mem_size); unsigned int shared_mem_size = std_dev_mem_size + mppi::math::nearest_multiple_4(this->getNumDistributions() * this->getNumTimesteps() * CONTROL_DIM) + mppi::math::nearest_multiple_4(BLOCKSIZE_X * BLOCKSIZE_Y * BLOCKSIZE_Z * CONTROL_DIM); shared_mem_size *= sizeof(float); setGaussianControls<<params_.rewrite_controls_block_dim, shared_mem_size, this->stream_>>>( this->control_means_d_, this->std_dev_d_, this->control_samples_d_, CONTROL_DIM, this->getNumTimesteps(), this->getNumRollouts(), this->getNumDistributions(), optimization_stride, powf(this->params_.std_dev_decay, iteration_num), this->params_.pure_noise_trajectories_percentage, this->params_.time_specific_std_dev); HANDLE_ERROR(cudaGetLastError()); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); } } } // namespace sampling_distributions } // namespace mppi #undef NLN_NOISE #undef NLN_TEMPLATE ================================================ FILE: include/mppi/sampling_distributions/nln/nln.cuh ================================================ #pragma once /** * Created by Bogdan, Jan 7, 2024 * based off of https://github.com/IhabMohamed/log-MPPI_ros */ #include #include __global__ void createNLNNoise(float* __restrict__ normal_noise, const float* __restrict__ log_normal_noise, const int num_trajectories, const int num_timesteps, const int control_dim); namespace mppi { namespace sampling_distributions { template class PARAMS_TEMPLATE = GaussianParams, class DYN_PARAMS_T = DynamicsParams> class NLNDistributionImpl : public GaussianDistributionImpl { public: using PARENT_CLASS = GaussianDistributionImpl; using SAMPLING_PARAMS_T = typename PARENT_CLASS::SAMPLING_PARAMS_T; using control_array = typename PARENT_CLASS::control_array; static const int CONTROL_DIM = PARENT_CLASS::CONTROL_DIM; NLNDistributionImpl(cudaStream_t stream = 0); NLNDistributionImpl(const SAMPLING_PARAMS_T& params, cudaStream_t stream = 0); 
~NLNDistributionImpl() { freeCudaMem(); } __host__ virtual std::string getSamplingDistributionName() const override { return "NLN"; } void setParams(const SAMPLING_PARAMS_T& params, bool synchronize = true); void calculateLogMeanAndVariance(); __host__ void allocateCUDAMemoryHelper(); __host__ void freeCudaMem(); __host__ void generateSamples(const int& optimization_stride, const int& iteration_num, curandGenerator_t& gen, bool synchronize = true); protected: float* log_normal_noise_d_ = nullptr; std::vector log_noise_mean_; std::vector log_noise_std_dev_; }; template class NLNDistribution : public NLNDistributionImpl, GaussianParams, DYN_PARAMS_T> { public: using PARENT_CLASS = NLNDistributionImpl; using SAMPLING_PARAMS_T = typename PARENT_CLASS::SAMPLING_PARAMS_T; NLNDistribution(cudaStream_t stream = 0) : PARENT_CLASS(stream) { } NLNDistribution(const SAMPLING_PARAMS_T& params, cudaStream_t stream = 0) : PARENT_CLASS(params, stream) { } }; } // namespace sampling_distributions } // namespace mppi #ifdef __CUDACC__ #include "nln.cu" #endif ================================================ FILE: include/mppi/sampling_distributions/piecewise_linear/piecewise_linear_noise.cuh ================================================ #pragma once /** * Created by David, Apr 11, 2021 */ #include #include #include #include #include #include __global__ void createPiecewiseLinearNoise(const int num_timesteps, const int num_trajectories, const int control_dim, const int num_piecewise_segments, const int optimization_stride, const float* scale_piecewise_noise, const float* frac_add_nominal_traj, const float* scale_add_nominal_noise, const unsigned int* switch_num, const float* switch_times, const float* switch_values, float* nominal_control, const float* control_std_dev, float* output) { int sample_index = blockIdx.x * blockDim.x + threadIdx.x; int time_index = blockIdx.y * blockDim.y + threadIdx.y; int control_index = blockIdx.z * blockDim.z + threadIdx.z; if (sample_index >= 
num_trajectories || time_index >= num_timesteps || control_index >= control_dim) { return; } float output_val = 0.0; if (sample_index == 1 || time_index < optimization_stride) { // if sample index = 1, use nominal control // sample_index = 0 is replaced by 0 controls later in the kernel output_val = nominal_control[time_index * control_dim + control_index]; } else if (float(sample_index) < frac_add_nominal_traj[0] * float(num_trajectories)) { // randomly vary output_val for frac_add_nominal_traj fraction of the trajectories output_val = output[(sample_index * num_timesteps + time_index) * control_dim + control_index] * scale_add_nominal_noise[0]; output_val += nominal_control[time_index * control_dim + control_index]; } else { // all others, use piecewise linear noise. // determine start/stop times and piecewise fraction int switch_num_val = switch_num[sample_index * control_dim + control_index]; switch_num_val = min(switch_num_val, num_piecewise_segments); int segment_index = 0; float start_time = 0.0f; float end_time = 1.0f; float time_frac = float(time_index) / float(num_timesteps); for (int i = 0; i < switch_num_val; i++) { float switch_time = switch_times[(sample_index * (num_piecewise_segments + 1) + i) * control_dim + control_index]; if (switch_time < time_frac) { segment_index += 1; if (start_time < switch_time) { start_time = switch_time; } } if (switch_time > time_frac) { if (end_time > switch_time) { end_time = switch_time; } } } // compute noise, interpolated between first and second value float first_val = 0; if (start_time < float(optimization_stride) / float(num_timesteps)) { // first value should always be nominal control at optimization stride start first_val = nominal_control[optimization_stride * control_dim + control_index]; start_time = float(optimization_stride) / float(num_timesteps); } else { first_val = switch_values[(sample_index * (num_piecewise_segments + 1) + segment_index) * control_dim + control_index]; } float second_val = 
switch_values[(sample_index * (num_piecewise_segments + 1) + segment_index + 1) * control_dim + control_index]; float frac_interval = (time_frac - start_time) / (end_time - start_time); output_val = (1.0f - frac_interval) * first_val + frac_interval * second_val; output_val = output_val * 2.0 - 1.0; // scale to [-1, 1] output_val = output_val * scale_piecewise_noise[control_index]; // scale by scale_piecewise_noise if (float(sample_index) < (frac_add_nominal_traj[0] + frac_add_nominal_traj[1]) * float(num_trajectories)) { // randomly vary output_val for frac_add_nominal_traj[1] fraction of the trajectories output_val = output_val * scale_add_nominal_noise[1]; output_val += nominal_control[time_index * control_dim + control_index]; } } // control noise output gets scaled by the kernel, and added to nominal control. // so we need to undo these operations to get exactly the control trajectory we want. output[(sample_index * num_timesteps + time_index) * control_dim + control_index] = output_val / control_std_dev[control_index] - nominal_control[time_index * control_dim + control_index]; } void piecewise_linear_noise(const int num_timesteps, const int num_trajectories, const int control_dim, const int num_piecewise_segments, const int optimization_stride, std::vector& scale_piecewise_noise, std::vector& frac_add_nominal_traj, std::vector& scale_add_nominal_noise, float* control_d, float* control_noise_d, const float* control_std_dev_d, curandGenerator_t& gen, cudaStream_t stream = 0) { // generate piecewise linear random noise, in 2 steps: // 1. randomly decide start/stop times of segments (uniformly divide t=0:T) // 2. 
randomly decide start and stop values of segments (uniform within bounds) const int BLOCKSIZE_X = 32; const int BLOCKSIZE_Y = 32; const int BLOCKSIZE_Z = 1; // try to sample num_piecewise_segments with reasonable poisson distribution const float switch_num_poisson_lambda = 0.5f * float(num_piecewise_segments); // create random start/stop times and values // curandSetStream(gen, stream); unsigned int* switch_num_d; float* switch_times_d; float* switch_values_d; HANDLE_ERROR(cudaMalloc((void**)&switch_num_d, sizeof(unsigned int) * num_trajectories * control_dim)); HANDLE_ERROR(cudaMalloc((void**)&switch_times_d, sizeof(float) * num_trajectories * (num_piecewise_segments + 1) * control_dim)); HANDLE_ERROR(cudaMalloc((void**)&switch_values_d, sizeof(float) * num_trajectories * (num_piecewise_segments + 1) * control_dim)); HANDLE_CURAND_ERROR(curandGeneratePoisson(gen, (unsigned int*)switch_num_d, num_trajectories * control_dim, switch_num_poisson_lambda)); HANDLE_CURAND_ERROR(curandGenerateUniform(gen, (float*)switch_times_d, num_trajectories * (num_piecewise_segments + 1) * control_dim)); HANDLE_CURAND_ERROR(curandGenerateUniform(gen, (float*)switch_values_d, num_trajectories * (num_piecewise_segments + 1) * control_dim)); float* scale_piecewise_noise_d; float* frac_add_nominal_traj_d; float* scale_add_nominal_noise_d; HANDLE_ERROR(cudaMalloc((void**)&scale_piecewise_noise_d, sizeof(float) * control_dim)); HANDLE_ERROR(cudaMalloc((void**)&frac_add_nominal_traj_d, sizeof(float) * control_dim)); HANDLE_ERROR(cudaMalloc((void**)&scale_add_nominal_noise_d, sizeof(float) * control_dim)); HANDLE_ERROR(cudaMemcpy(scale_piecewise_noise_d, scale_piecewise_noise.data(), sizeof(float) * control_dim, cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(frac_add_nominal_traj_d, frac_add_nominal_traj.data(), sizeof(float) * control_dim, cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(scale_add_nominal_noise_d, scale_add_nominal_noise.data(), sizeof(float) * control_dim, 
cudaMemcpyHostToDevice)); // create piecewise linear noise const int grid_x = (num_trajectories - 1) / BLOCKSIZE_X + 1; const int grid_y = (num_timesteps - 1) / BLOCKSIZE_Y + 1; const int grid_z = (control_dim - 1) / BLOCKSIZE_Z + 1; dim3 grid(grid_x, grid_y, grid_z); dim3 block(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z); createPiecewiseLinearNoise<<>>( num_timesteps, num_trajectories, control_dim, num_piecewise_segments, optimization_stride, scale_piecewise_noise_d, frac_add_nominal_traj_d, scale_add_nominal_noise_d, switch_num_d, switch_times_d, switch_values_d, control_d, control_std_dev_d, control_noise_d); // cleanup HANDLE_ERROR(cudaGetLastError()); HANDLE_ERROR(cudaStreamSynchronize(stream)); HANDLE_ERROR(cudaFree(switch_times_d)); HANDLE_ERROR(cudaFree(switch_values_d)); HANDLE_ERROR(cudaFree(scale_piecewise_noise_d)); } ================================================ FILE: include/mppi/sampling_distributions/sampling_distribution.cu ================================================ /* Created by Bogdan Vlahov on 3/22/2023 */ #include template class PARAMS_TEMPLATE, class DYN_PARAMS_T> void SamplingDistribution::GPUSetup() { CLASS_T* derived = static_cast(this); if (!GPUMemStatus_) { sampling_d_ = Managed::GPUSetup(derived); allocateCUDAMemory(); resizeVisualizationControlTrajectories(true); } else { this->logger_->debug("%s: GPU Memory already set.\n", derived->getSamplingDistributionName().c_str()); } derived->paramsToDevice(); } template class PARAMS_TEMPLATE, class DYN_PARAMS_T> __host__ void SamplingDistribution::freeCudaMem() { if (GPUMemStatus_) { HANDLE_ERROR(cudaFree(sampling_d_)); HANDLE_ERROR(cudaFree(control_samples_d_)); HANDLE_ERROR(cudaFree(vis_control_samples_d_)); GPUMemStatus_ = false; sampling_d_ = nullptr; control_samples_d_ = nullptr; vis_control_samples_d_ = nullptr; } } template class PARAMS_TEMPLATE, class DYN_PARAMS_T> __device__ void SamplingDistribution::initializeDistributions( const float* __restrict__ output, const float t_0, const 
float dt, float* __restrict__ theta_d) { SAMPLING_PARAMS_T* shared = reinterpret_cast(theta_d); *shared = this->params_; // #ifdef __CUDA_ARCH__ // if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) // { // printf("Num timesteps: %d %d\n", this->params_.num_timesteps, shared->num_timesteps); // } // #endif } template class PARAMS_TEMPLATE, class DYN_PARAMS_T> __host__ void SamplingDistribution::paramsToDevice(bool synchronize) { if (GPUMemStatus_) { HANDLE_ERROR( cudaMemcpyAsync(&sampling_d_->params_, ¶ms_, sizeof(SAMPLING_PARAMS_T), cudaMemcpyHostToDevice, stream_)); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(stream_)); } } } // By default, call the update from device method by first putting the weights into gpu memory template class PARAMS_TEMPLATE, class DYN_PARAMS_T> __host__ void SamplingDistribution::updateDistributionParamsFromHost( const Eigen::Ref& trajectory_weights, float normalizer, const int& distribution_i, bool synchronize) { float* trajectory_weights_d; #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 HANDLE_ERROR(cudaMallocAsync((void**)&trajectory_weights_d, sizeof(float) * this->getNumRollouts(), this->stream_)); #else HANDLE_ERROR(cudaMalloc((void**)&trajectory_weights_d, sizeof(float) * this->getNumRollouts())); #endif HANDLE_ERROR(cudaMemcpyAsync(trajectory_weights_d, trajectory_weights.data(), sizeof(float) * this->getNumRollouts(), cudaMemcpyHostToDevice, this->stream_)); updateDistributionParamsFromDevice(trajectory_weights_d, normalizer, distribution_i, false); #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 HANDLE_ERROR(cudaFreeAsync(trajectory_weights_d, this->stream_)); #else HANDLE_ERROR(cudaFree(trajectory_weights_d)); #endif if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); } } template class PARAMS_TEMPLATE, class DYN_PARAMS_T> __host__ void SamplingDistribution::allocateCUDAMemory(bool synchronize) { if (GPUMemStatus_) { if (control_samples_d_) { // deallocate 
previous memory for control samples #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 HANDLE_ERROR(cudaFreeAsync(control_samples_d_, stream_)); #else HANDLE_ERROR(cudaFree(control_samples_d_)); #endif // control_samples_d_ = nullptr; } #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 HANDLE_ERROR(cudaMallocAsync((void**)&control_samples_d_, sizeof(float) * this->getNumDistributions() * this->getNumRollouts() * this->getNumTimesteps() * CONTROL_DIM, stream_)); #else HANDLE_ERROR(cudaMalloc((void**)&control_samples_d_, sizeof(float) * this->getNumDistributions() * this->getNumRollouts() * this->getNumTimesteps() * CONTROL_DIM)); #endif HANDLE_ERROR(cudaMemcpyAsync(&sampling_d_->control_samples_d_, &control_samples_d_, sizeof(float*), cudaMemcpyHostToDevice, stream_)); } CLASS_T* derived = static_cast(this); derived->allocateCUDAMemoryHelper(); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(stream_)); } } template class PARAMS_TEMPLATE, class DYN_PARAMS_T> __host__ void SamplingDistribution::resizeVisualizationControlTrajectories(bool synchronize) { if (GPUMemStatus_) { if (vis_control_samples_d_) { // deallocate previous memory for control samples #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 HANDLE_ERROR(cudaFreeAsync(vis_control_samples_d_, vis_stream_)); #else HANDLE_ERROR(cudaFree(vis_control_samples_d_)); #endif // vis_control_samples_d_ = nullptr; } if (this->getNumVisRollouts() == 0) { // No need to allocate mmemory if it will end up being zero vis_control_samples_d_ = nullptr; } else { #if defined(CUDART_VERSION) && CUDART_VERSION > 11200 HANDLE_ERROR(cudaMallocAsync((void**)&vis_control_samples_d_, sizeof(float) * this->getNumDistributions() * this->getNumVisRollouts() * this->getNumTimesteps() * CONTROL_DIM, vis_stream_)); #else HANDLE_ERROR(cudaMalloc((void**)&vis_control_samples_d_, sizeof(float) * this->getNumDistributions() * this->getNumVisRollouts() * this->getNumTimesteps() * CONTROL_DIM)); #endif } 
HANDLE_ERROR(cudaMemcpyAsync(&sampling_d_->vis_control_samples_d_, &vis_control_samples_d_, sizeof(float*), cudaMemcpyHostToDevice, vis_stream_)); } if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(vis_stream_)); } } template class PARAMS_TEMPLATE, class DYN_PARAMS_T> __device__ void SamplingDistribution::readControlSample( const int& sample_index, const int& t, const int& distribution_index, float* __restrict__ control, float* __restrict__ theta_d, const int& block_size, const int& thread_index, const float* __restrict__ output) { SAMPLING_PARAMS_T* params_p = (SAMPLING_PARAMS_T*)theta_d; const int distribution_i = distribution_index >= params_p->num_distributions ? 0 : distribution_index; const int control_index = ((params_p->num_rollouts * distribution_i + sample_index) * params_p->num_timesteps + t) * CONTROL_DIM; if (CONTROL_DIM % 4 == 0) { float4* u4 = reinterpret_cast(control); const float4* u4_d = reinterpret_cast(&(this->control_samples_d_[control_index])); for (int i = thread_index; i < CONTROL_DIM / 4; i += block_size) { u4[i] = u4_d[i]; } } else if (CONTROL_DIM % 2 == 0) { float2* u2 = reinterpret_cast(control); const float2* u2_d = reinterpret_cast(&(this->control_samples_d_[control_index])); for (int i = thread_index; i < CONTROL_DIM / 2; i += block_size) { u2[i] = u2_d[i]; } } else { float* u = reinterpret_cast(control); const float* u_d = reinterpret_cast(&(this->control_samples_d_[control_index])); for (int i = thread_index; i < CONTROL_DIM; i += block_size) { u[i] = u_d[i]; } } } template class PARAMS_TEMPLATE, class DYN_PARAMS_T> __device__ void SamplingDistribution::readVisControlSample( const int& sample_index, const int& t, const int& distribution_index, float* __restrict__ control, float* __restrict__ theta_d, const int& block_size, const int& thread_index, const float* __restrict__ output) { SAMPLING_PARAMS_T* params_p = (SAMPLING_PARAMS_T*)theta_d; const int distribution_i = distribution_index >= params_p->num_distributions ? 
0 : distribution_index; const int control_index = ((params_p->num_visualization_rollouts * distribution_i + sample_index) * params_p->num_timesteps + t) * CONTROL_DIM; if (CONTROL_DIM % 4 == 0) { float4* u4 = reinterpret_cast(control); const float4* u4_d = reinterpret_cast(&(this->vis_control_samples_d_[control_index])); for (int i = thread_index; i < CONTROL_DIM / 4; i += block_size) { u4[i] = u4_d[i]; } } else if (CONTROL_DIM % 2 == 0) { float2* u2 = reinterpret_cast(control); const float2* u2_d = reinterpret_cast(&(this->vis_control_samples_d_[control_index])); for (int i = thread_index; i < CONTROL_DIM / 2; i += block_size) { u2[i] = u2_d[i]; } } else { float* u = reinterpret_cast(control); const float* u_d = reinterpret_cast(&(this->vis_control_samples_d_[control_index])); for (int i = thread_index; i < CONTROL_DIM; i += block_size) { u[i] = u_d[i]; } } } template class PARAMS_TEMPLATE, class DYN_PARAMS_T> __host__ __device__ float* SamplingDistribution::getControlSample( const int& sample_index, const int& t, const int& distribution_index, const float* __restrict__ theta_d, const float* __restrict__ output) { #ifdef __CUDA_ARCH__ SAMPLING_PARAMS_T* params_p = (SAMPLING_PARAMS_T*)theta_d; #else SAMPLING_PARAMS_T* params_p = &this->params_; #endif const int distribution_i = distribution_index >= params_p->num_distributions ? 0 : distribution_index; return &this->control_samples_d_[((params_p->num_rollouts * distribution_i + sample_index) * params_p->num_timesteps + t) * CONTROL_DIM]; } template class PARAMS_TEMPLATE, class DYN_PARAMS_T> __host__ __device__ float* SamplingDistribution::getVisControlSample( const int& sample_index, const int& t, const int& distribution_index, const float* __restrict__ theta_d, const float* __restrict__ output) { #ifdef __CUDA_ARCH__ SAMPLING_PARAMS_T* params_p = (SAMPLING_PARAMS_T*)theta_d; #else SAMPLING_PARAMS_T* params_p = &this->params_; #endif const int distribution_i = distribution_index >= params_p->num_distributions ? 
0 : distribution_index; return &this->vis_control_samples_d_ [((params_p->num_visualization_rollouts * distribution_i + sample_index) * params_p->num_timesteps + t) * CONTROL_DIM]; } template class PARAMS_TEMPLATE, class DYN_PARAMS_T> __device__ void SamplingDistribution::writeControlSample( const int& sample_index, const int& t, const int& distribution_index, const float* __restrict__ control, float* __restrict__ theta_d, const int& block_size, const int& thread_index, const float* __restrict__ output) { SAMPLING_PARAMS_T* params_p = (SAMPLING_PARAMS_T*)theta_d; const int distribution_i = distribution_index >= params_p->num_distributions ? 0 : distribution_index; const int control_index = ((params_p->num_rollouts * distribution_i + sample_index) * params_p->num_timesteps + t) * CONTROL_DIM; if (CONTROL_DIM % 4 == 0) { const float4* u4 = reinterpret_cast(control); float4* u4_d = reinterpret_cast(&(this->control_samples_d_[control_index])); for (int i = thread_index; i < CONTROL_DIM / 4; i += block_size) { u4_d[i] = u4[i]; } } else if (CONTROL_DIM % 2 == 0) { const float2* u2 = reinterpret_cast(control); float2* u2_d = reinterpret_cast(&(this->control_samples_d_[control_index])); for (int i = thread_index; i < CONTROL_DIM / 2; i += block_size) { u2_d[i] = u2[i]; } } else { const float* u = reinterpret_cast(control); float* u_d = reinterpret_cast(&(this->control_samples_d_[control_index])); for (int i = thread_index; i < CONTROL_DIM; i += block_size) { u_d[i] = u[i]; } } } ================================================ FILE: include/mppi/sampling_distributions/sampling_distribution.cuh ================================================ #pragma once /* Created by Bogdan Vlahov on 3/22/2023 */ #include #include #include namespace mppi { namespace sampling_distributions { // Auto-align all Parameter structures to float4 so that any data after it in saved memory has full memory alignment template struct alignas(float4) SamplingParams { static const int CONTROL_DIM = 
C_DIM; bool use_same_noise_for_all_distributions = true; int num_rollouts = 1; int num_timesteps = 1; int num_distributions = 1; int num_visualization_rollouts = 0; SamplingParams(int num_rollouts, int num_timesteps, int num_distributions = 1) : num_rollouts{ num_rollouts }, num_timesteps{ num_timesteps }, num_distributions{ num_distributions } { } SamplingParams() = default; }; template typename PARAMS_TEMPLATE, class DYN_PARAMS_T> class SamplingDistribution : public Managed { public: /************************************* * Setup typedefs and aliases *************************************/ typedef CLASS_T SAMPLING_T; using ControlIndex = typename DYN_PARAMS_T::ControlIndex; using OutputIndex = typename DYN_PARAMS_T::OutputIndex; using TEMPLATED_DYN_PARAMS = DYN_PARAMS_T; static const int CONTROL_DIM = C_IND_CLASS(DYN_PARAMS_T, NUM_CONTROLS); // static const int CONTROL_DIM = E_INDEX(ControlIndex, NUM_CONTROLS); typedef PARAMS_TEMPLATE SAMPLING_PARAMS_T; typedef Eigen::Matrix control_array; static_assert(std::is_base_of, SAMPLING_PARAMS_T>::value, "Sampling Distribution PARAMS_T does not inherit from SamplingParams"); /************************************* * Constructors and Destructors *************************************/ SamplingDistribution(cudaStream_t stream = 0) : Managed(stream) { this->SHARED_MEM_REQUEST_GRD_BYTES = sizeof(SAMPLING_PARAMS_T); this->SHARED_MEM_REQUEST_BLK_BYTES = 0; } SamplingDistribution(const SAMPLING_PARAMS_T& params, cudaStream_t stream = 0) : params_{ params }, Managed(stream) { this->SHARED_MEM_REQUEST_GRD_BYTES = sizeof(SAMPLING_PARAMS_T); this->SHARED_MEM_REQUEST_BLK_BYTES = 0; } virtual ~SamplingDistribution() { freeCudaMem(); } /** * @brief Get the Sampling Distribution Name object * * @return std::string - name of the sampling distribution */ __host__ virtual std::string getSamplingDistributionName() const { return "Sampling distribution name not set"; } /************************************* * DEFAULT CLASS METHODS THAT SHOULD 
NOT NEED OVERWRITING *************************************/ void GPUSetup(); /** * Updates the sampling distribution parameters * @param params */ void setParams(const SAMPLING_PARAMS_T& params, bool synchronize = true) { bool reallocate_memory = params_.num_timesteps != params.num_timesteps || params_.num_rollouts != params.num_rollouts || params_.num_distributions != params.num_distributions; bool reallocate_vis_memory = params_.num_timesteps != params.num_timesteps || params_.num_visualization_rollouts != params.num_visualization_rollouts || params_.num_distributions != params.num_distributions; params_ = params; if (GPUMemStatus_) { if (reallocate_memory) { allocateCUDAMemory(false); } if (reallocate_vis_memory) { resizeVisualizationControlTrajectories(true); } CLASS_T& derived = static_cast(*this); derived.paramsToDevice(synchronize); } } // __host__ __device__ SAMPLING_PARAMS_T getParams() const // { // return params_; // } __host__ __device__ const SAMPLING_PARAMS_T getParams() const { return params_; } __host__ __device__ int getNumTimesteps() const { return this->params_.num_timesteps; } __host__ __device__ int getNumRollouts() const { return this->params_.num_rollouts; } __host__ __device__ int getNumVisRollouts() const { return this->params_.num_visualization_rollouts; } __host__ __device__ int getNumDistributions() const { return this->params_.num_distributions; } __host__ void setNumTimesteps(const int num_timesteps, bool synchronize = false) { const bool reallocate_memory = params_.num_timesteps != num_timesteps; this->params_.num_timesteps = num_timesteps; if (GPUMemStatus_ && reallocate_memory) { if (reallocate_memory) { allocateCUDAMemory(false); resizeVisualizationControlTrajectories(true); } CLASS_T& derived = static_cast(*this); derived.paramsToDevice(synchronize); } } __host__ void setNumVisRollouts(const int num_visualization_rollouts, bool synchronize = false) { const bool reallocate_memory = params_.num_visualization_rollouts != 
num_visualization_rollouts; this->params_.num_visualization_rollouts = num_visualization_rollouts; if (GPUMemStatus_ && reallocate_memory) { if (reallocate_memory) { resizeVisualizationControlTrajectories(true); } CLASS_T& derived = static_cast(*this); derived.paramsToDevice(synchronize); } } __host__ void setNumRollouts(const int num_rollouts, bool synchronize = false) { const bool reallocate_memory = params_.num_rollouts != num_rollouts; this->params_.num_rollouts = num_rollouts; if (GPUMemStatus_ && reallocate_memory) { if (reallocate_memory) { allocateCUDAMemory(false); } CLASS_T& derived = static_cast(*this); derived.paramsToDevice(synchronize); } } __host__ void setNumDistributions(const int num_distributions, bool synchronize = false) { const bool reallocate_memory = params_.num_distributions != num_distributions; this->params_.num_distributions = num_distributions; if (GPUMemStatus_ && reallocate_memory) { if (reallocate_memory) { allocateCUDAMemory(false); resizeVisualizationControlTrajectories(true); } CLASS_T& derived = static_cast(*this); derived.paramsToDevice(synchronize); } } __host__ void resizeVisualizationControlTrajectories(bool synchronize = true); __host__ void setVisStream(cudaStream_t stream) { vis_stream_ = stream; } __host__ void allocateCUDAMemory(bool synchronize = false); /** * @brief deallocates the allocated cuda memory for the sampling distribution */ __host__ void freeCudaMem(); /** * @brief Get a pointer to a specific control sample. 
This is useful for plugging into methods like enforceConstraints * * @param sample_index - sample number out of num_rollouts * @param t - timestep out of num_timesteps * @param distribution_index - distribution index (if it is larger than num_distributions, it just defaults to first * distribution for future compatibility with sampling dynamical systems) * @param output - output pointer for compatibility with a output-based sampling distribution * @return float* pointer to the control array that is at [distribution_index][sample_index][t] */ __host__ __device__ float* getControlSample(const int& sample_index, const int& t, const int& distribution_index, const float* __restrict__ theta_d = nullptr, const float* __restrict__ output = nullptr); /** * @brief Get a pointer to a specific visualization control sample. * * @param sample_index - sample number out of num_visualization_rollouts * @param t - timestep out of num_timesteps * @param distribution_index - distribution index (if it is larger than num_distributions, it just defaults to first * distribution for future compatibility with sampling dynamical systems) * @param output - output pointer for compatibility with a output-based sampling distribution * @return float* pointer to the control array that is at [distribution_index][sample_index][t] */ __host__ __device__ float* getVisControlSample(const int& sample_index, const int& t, const int& distribution_index, const float* __restrict__ theta_d = nullptr, const float* __restrict__ output = nullptr); /** * @brief Method for starting up any potential work for distributions. 
By default, it just loads the params into * shared memory * * @param output - initial output * @param t_0 - starting time * @param dt - step size * @param theta_d - shared memory pointer to sampling distribution space */ __device__ void initializeDistributions(const float* __restrict__ output, const float t_0, const float dt, float* __restrict__ theta_d); __host__ void paramsToDevice(bool synchronize = true); /** * @brief Look up a specific control sample located at [distribution_index][sample_index][t] and put it into the * control array * * @param sample_index - sample number out of num_rollouts * @param t - timestep out of num_timesteps * @param distribution_index - distribution index (if it is larger than num_distributions, it just defaults to first * distribution for future compatibility with sampling dynamical systems) * @param control - pointer to fill with the specific control array * @param theta_d - shared memory pointer for passing through params * @param block_size - parallelizable step size for the gpu (normally blockDim.y) * @param thread_index - parallelizable index for the gpu (normally threadIdx.y) * @param output - output pointer for compatibility with a output-based sampling distribution */ __device__ void readControlSample(const int& sample_index, const int& t, const int& distribution_index, float* __restrict__ control, float* __restrict__ theta_d, const int& block_size = 1, const int& thread_index = 1, const float* __restrict__ output = nullptr); /** * @brief Look up a specific visualization control sample located at [distribution_index][sample_index][t] and put it * into the control array * * @param sample_index - sample number out of num_visualization_rollouts * @param t - timestep out of num_timesteps * @param distribution_index - distribution index (if it is larger than num_distributions, it just defaults to first * distribution for future compatibility with sampling dynamical systems) * @param control - pointer to fill with the specific 
control array * @param theta_d - shared memory pointer for passing through params * @param block_size - parallelizable step size for the gpu (normally blockDim.y) * @param thread_index - parallelizable index for the gpu (normally threadIdx.y) * @param output - output pointer for compatibility with a output-based sampling distribution */ __device__ void readVisControlSample(const int& sample_index, const int& t, const int& distribution_index, float* __restrict__ control, float* __restrict__ theta_d, const int& block_size = 1, const int& thread_index = 1, const float* __restrict__ output = nullptr); /** * @brief Update the distribution according to the weights of each sample. Should only be used if weights only exist * on the host side. Otherwise, use updateDistributionParamsFromDevice * * @param trajectory_weights - vector of size num_rollouts containing the weight of each sample * @param normalizer - the sum of all trajectory weights * @param distribution_i - which distribution to update * @param synchronize - whether or not to run cudaStreamSynchronize */ __host__ void updateDistributionParamsFromHost(const Eigen::Ref& trajectory_weights, float normalizer, const int& distribution_i, bool synchronize = false); /************************************* * Methods that need to be overwritten by derived classes *************************************/ /** * @brief method for allocating additional CUDA memory in derived */ __host__ void allocateCUDAMemoryHelper(); __host__ __device__ float computeFeedbackCost(const float* __restrict__ u_fb, float* __restrict__ theta_d, const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); /** * @brief host method to calculate the cost for feedback based on the sampling distribution in RMPPI * * @param u_fb - feedback control * @param t - timestep * @param distribution_idx - distribution index * @param lambda - MPPI temperature parameter * @param alpha - coeff to turn off the likelihood cost (set to 1 -> 
no likelihood cost, set to 0 -> all likelihood * * @return feedback cost */ __host__ float computeFeedbackCost(const Eigen::Ref& u_fb, const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); /** * @brief Device method to calculate the likelihood ratio cost for a given sample u * * @param u - sampled control * @param theta_d - shared memory for sampling distribution * @param t - timestep * @param distribution_idx - distribution index (if it is larger than num_distributions, it just defaults to first * distribution for future compatibility with sampling dynamical systems) * @param lambda - MPPI temperature parameter * @param alpha - coeff to turn off the likelihood cost (set to 1 -> no likelihood cost, set to 0 -> all likelihood * cost) */ __host__ __device__ float computeLikelihoodRatioCost(const float* __restrict__ u, float* __restrict__ theta_d, const int sample_index, const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); /** * @brief Host-side method to calculate the likelihood ration cost for a given sample u * * @param u - sampled control * @param t - timestep * @param distribution_idx - distribution index (if it is larger than num_distributions, it just defaults to first * distribution for future compatibility with sampling dynamical systems) * @param lambda - MPPI temperature parameter * @param alpha - coeff to turn off the likelihood cost (set to 1 -> no likelihood cost, set to 0 -> all likelihood * cost) */ __host__ float computeLikelihoodRatioCost(const Eigen::Ref& u, const int sample_index, const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); /** * @brief Get the latest importance sampler from time-shifting on the controller and update the device importance * sampler * * @param importance_sampler - host pointer to a control sequence that is NUM_TIMESTEPS * CONTROL_DIM * @param distribution_idx - which distribution is the importance sampler 
meant for * @param synchronize - whether or not to run cudaStreamSynchronize */ __host__ void copyImportanceSamplerToDevice(const float* importance_sampler, const int& distribution_idx, bool synchronize = true); /** * @brief Generate control samples that will be on the GPU. * * @param optimization_stride - timestep to start control samples from * @param iteration_num - which iteration of the algorithm we are on. Useful for decaying std_dev * @param gen - pseudo-random noise generator */ __host__ void generateSamples(const int& optimization_stride, const int& iteration_num, curandGenerator_t& gen, bool synchronize = true); /** * @brief Set the Host-side Optimal Control Trajectory * * @param optimal_control_trajectory - pointer to CPU memory location to store the optimal control * @param distribution_idx - which distribution we are looking for the optimal control from (Useful for Tube and * RMPPI) * @param synchronize - whether or not to run cudaStreamSynchronize */ __host__ void setHostOptimalControlSequence(float* optimal_control_trajectory, const int& distribution_idx, bool synchronize = true); /** * @brief takes in the cost of each sample generated and conducts an update of the distribution (For Gaussians, mean * update) * * @param trajectory_weights_d - vector of weights of size num_rollouts located on the GPU * @param normalizer - sum of all weights * @param distribution_i - which distribution to update * @param synchronize - whether or not to run cudaStreamSynchronize */ __host__ virtual void updateDistributionParamsFromDevice(const float* trajectory_weights_d, float normalizer, const int& distribution_i, bool synchronize = false) = 0; /** * @brief Write to a specific control sample located at [distribution_index][sample_index][t] from the * control array * * @param sample_index - sample number out of num_rollouts * @param t - timestep out of num_timesteps * @param distribution_index - distribution index (if it is larger than num_distributions, it just 
defaults to first * distribution for future compatibility with sampling dynamical systems) * @param control - pointer to control array with the desired data * @param theta_d - shared memory pointer for passing through params * @param block_size - parallelizable step size for the gpu (normally blockDim.y) * @param thread_index - parallelizable index for the gpu (normally threadIdx.y) * @param output - output pointer for compatibility with a output-based sampling distribution */ __device__ void writeControlSample(const int& sample_index, const int& t, const int& distribution_index, const float* __restrict__ control, float* __restrict__ theta_d, const int& block_size = 1, const int& thread_index = 1, const float* __restrict__ output = nullptr); CLASS_T* sampling_d_ = nullptr; cudaStream_t vis_stream_ = nullptr; protected: float* control_samples_d_ = nullptr; float* vis_control_samples_d_ = nullptr; SAMPLING_PARAMS_T params_; }; #ifdef __CUDACC__ #include "sampling_distribution.cu" #endif template class PARAMS_TEMPLATE, class DYN_PARAMS_T> const int SamplingDistribution::CONTROL_DIM; // template // const int SamplingParams::CONTROL_DIM; } // namespace sampling_distributions } // namespace mppi ================================================ FILE: include/mppi/sampling_distributions/smooth-MPPI/smooth-MPPI.cu ================================================ /** * Created by Bogdan Vlahov on Jan 8, 2024 **/ #define SMOOTH_MPPI_TEMPLATE template class PARAMS_TEMPLATE, class DYN_PARAMS_T> #define SMOOTH_MPPI_NOISE SmoothMPPIDistributionImpl #include #include #include namespace mppi { namespace sampling_distributions { __global__ void integrateNoise(const float* __restrict__ action_deriv_d, const float* __restrict__ control_mean_d, float* __restrict__ control_d, const int num_samples, const int num_timesteps, const int control_dim, const float dt) { const int sample_index = blockIdx.x * blockDim.x + threadIdx.x; const int control_index = blockIdx.y * blockDim.y + 
threadIdx.y; int noise_index = 0, mean_index = 0; if (sample_index < num_samples && control_index < control_dim) { for (int t = 0; t < num_timesteps; t++) { noise_index = (sample_index * num_timesteps + t) * control_dim + control_index; mean_index = t * control_dim + control_index; control_d[noise_index] = control_mean_d[mean_index] + action_deriv_d[noise_index] * dt; } } } __global__ void shiftControlTrajectory(float* __restrict__ control_trajectory_d, const int num_distributions, const int num_timesteps, const int control_dim, const int shift_index) { extern __shared__ float shared_mean[]; const int dist_index = blockIdx.x; const int control_index = threadIdx.x; mppi::p1::loadArrayParallel( shared_mean, 0, control_trajectory_d, dist_index * num_timesteps * control_dim, num_timesteps * control_dim); __syncthreads(); int p_index, p_step; mppi::p1::getParallel1DIndex(p_index, p_step); for (int t = 0; t < num_timesteps; t++) { int write_index = t * control_dim; int read_index = min(t + shift_index, num_timesteps - 1) * control_dim; // if (control_dim % 4 == 0) // { // for (int i = p_index; i < control_dim / 4; i += 4 * p_step) // { // reinterpret_cast(&shared_mean[write_index])[i] = // reinterpret_cast(&shared_mean[read_index])[i]; // } // } // else if (control_dim % 2 == 0) // { // for (int i = p_index; i < control_dim / 2; i += 2 * p_step) // { // reinterpret_cast(&shared_mean[write_index])[i] = // reinterpret_cast(&shared_mean[read_index])[i]; // } // } // else // { for (int i = 0; i < control_dim; i += 1) { shared_mean[write_index + i] = shared_mean[read_index + i]; } // } } __syncthreads(); mppi::p1::loadArrayParallel( control_trajectory_d, dist_index * num_timesteps * control_dim, shared_mean, 0, num_timesteps * control_dim); } SMOOTH_MPPI_TEMPLATE SMOOTH_MPPI_NOISE::SmoothMPPIDistributionImpl(cudaStream_t stream) : PARENT_CLASS::GaussianDistributionImpl(stream) { } SMOOTH_MPPI_TEMPLATE SMOOTH_MPPI_NOISE::SmoothMPPIDistributionImpl(const SAMPLING_PARAMS_T& 
params, cudaStream_t stream) : PARENT_CLASS::GaussianDistributionImpl(params, stream) { } SMOOTH_MPPI_TEMPLATE __host__ void SMOOTH_MPPI_NOISE::freeCudaMem() { if (this->GPUMemStatus_) { HANDLE_ERROR(cudaFree(deriv_action_mean_d_)); HANDLE_ERROR(cudaFree(deriv_action_noise_d_)); } PARENT_CLASS::freeCudaMem(); } SMOOTH_MPPI_TEMPLATE __host__ void SMOOTH_MPPI_NOISE::allocateCUDAMemoryHelper() { PARENT_CLASS::allocateCUDAMemoryHelper(); if (this->GPUMemStatus_) { if (deriv_action_noise_d_) { HANDLE_ERROR(cudaFreeAsync(deriv_action_noise_d_, this->stream_)); } HANDLE_ERROR(cudaMallocAsync((void**)&deriv_action_noise_d_, sizeof(float) * this->CONTROL_DIM * this->getNumRollouts() * this->getNumDistributions() * this->getNumTimesteps(), this->stream_)); if (deriv_action_mean_d_) { HANDLE_ERROR(cudaFreeAsync(deriv_action_mean_d_, this->stream_)); } HANDLE_ERROR(cudaMallocAsync( (void**)&deriv_action_mean_d_, sizeof(float) * this->CONTROL_DIM * this->getNumTimesteps() * this->getNumDistributions(), this->stream_)); } } SMOOTH_MPPI_TEMPLATE __host__ void SMOOTH_MPPI_NOISE::generateSamples(const int& optimization_stride, const int& iteration_num, curandGenerator_t& gen, bool synchronize) { if (this->params_.use_same_noise_for_all_distributions) { HANDLE_CURAND_ERROR(curandGenerateNormal( gen, this->deriv_action_noise_d_, this->getNumTimesteps() * this->getNumRollouts() * CONTROL_DIM, 0.0f, 1.0f)); for (int i = 1; i < this->getNumDistributions(); i++) { HANDLE_ERROR(cudaMemcpyAsync( &this->deriv_action_noise_d_[this->getNumRollouts() * this->getNumTimesteps() * CONTROL_DIM * i], this->deriv_action_noise_d_, sizeof(float) * this->getNumRollouts() * this->getNumTimesteps() * CONTROL_DIM, cudaMemcpyDeviceToDevice, this->stream_)); } } else { HANDLE_CURAND_ERROR(curandGenerateNormal( gen, this->deriv_action_noise_d_, this->getNumTimesteps() * this->getNumRollouts() * this->getNumDistributions() * CONTROL_DIM, 0.0f, 1.0f)); } // Shift derivative action sequence dim3 control_block = 
dim3(this->CONTROL_DIM, 1, 1); dim3 control_grid = dim3(this->getNumDistributions(), 1, 1); unsigned int shared_mean_size = sizeof(float) * this->CONTROL_DIM * this->getNumTimesteps(); shiftControlTrajectory<<stream_>>>( deriv_action_mean_d_, this->getNumDistributions(), this->getNumTimesteps(), this->CONTROL_DIM, optimization_stride); const int BLOCKSIZE_X = this->params_.rewrite_controls_block_dim.x; const int BLOCKSIZE_Y = this->params_.rewrite_controls_block_dim.y; const int BLOCKSIZE_Z = this->params_.rewrite_controls_block_dim.z; /** * Generate noise samples with mean added **/ dim3 control_writing_grid; control_writing_grid.x = mppi::math::int_ceil(this->getNumRollouts(), BLOCKSIZE_X); control_writing_grid.y = mppi::math::int_ceil(this->getNumTimesteps(), BLOCKSIZE_Y); control_writing_grid.z = mppi::math::int_ceil(this->getNumDistributions(), BLOCKSIZE_Z); unsigned int std_dev_mem_size = this->getNumDistributions() * CONTROL_DIM; // Allocate shared memory for std_deviations per timestep or constant across the trajectory std_dev_mem_size = mppi::math::nearest_multiple_4( this->params_.time_specific_std_dev ? 
std_dev_mem_size * this->getNumTimesteps() : std_dev_mem_size); unsigned int shared_mem_size = std_dev_mem_size + mppi::math::nearest_multiple_4(this->getNumDistributions() * this->getNumTimesteps() * CONTROL_DIM) + mppi::math::nearest_multiple_4(BLOCKSIZE_X * BLOCKSIZE_Y * BLOCKSIZE_Z * CONTROL_DIM); shared_mem_size *= sizeof(float); setGaussianControls<<params_.rewrite_controls_block_dim, shared_mem_size, this->stream_>>>( this->deriv_action_mean_d_, this->std_dev_d_, this->deriv_action_noise_d_, CONTROL_DIM, this->getNumTimesteps(), this->getNumRollouts(), this->getNumDistributions(), optimization_stride, powf(this->params_.std_dev_decay, iteration_num), this->params_.pure_noise_trajectories_percentage, this->params_.time_specific_std_dev); // integrate derivative controls to make true controls dim3 integrate_noise_grid; integrate_noise_grid.x = mppi::math::int_ceil(this->getNumRollouts() * this->getNumDistributions(), BLOCKSIZE_X); integrate_noise_grid.y = 1; integrate_noise_grid.z = 1; dim3 integrate_noise_block(BLOCKSIZE_X, this->CONTROL_DIM, 1); // control_writing_grid.y = mppi::math::int_ceil(this->getNumTimesteps(), BLOCKSIZE_Y); // control_writing_grid.z = mppi::math::int_ceil(this->getNumDistributions(), BLOCKSIZE_Z); // std::cout << "Integrating samples " << this->getNumRollouts() * this->getNumDistributions() << std::endl; integrateNoise<<stream_>>>( this->deriv_action_noise_d_, this->control_means_d_, this->control_samples_d_, this->getNumRollouts() * this->getNumDistributions(), this->getNumTimesteps(), this->CONTROL_DIM, this->params_.dt); HANDLE_ERROR(cudaGetLastError()); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); } } SMOOTH_MPPI_TEMPLATE __host__ void SMOOTH_MPPI_NOISE::updateDistributionParamsFromDevice(const float* trajectory_weights_d, float normalizer, const int& distribution_i, bool synchronize) { if (distribution_i >= this->getNumDistributions()) { std::cerr << "Updating distributional params for distribution " << 
distribution_i << " out of " << this->getNumDistributions() << " total." << std::endl; return; } int mean_index = distribution_i * this->getNumTimesteps() * this->CONTROL_DIM; int sample_index = distribution_i * this->getNumRollouts() * this->getNumTimesteps() * this->CONTROL_DIM; float* deriv_action_noise_i_d = &(this->deriv_action_noise_d_[sample_index]); float* deriv_action_mean_i_d = &(this->deriv_action_mean_d_[mean_index]); mppi::kernels::launchWeightedReductionKernel( trajectory_weights_d, deriv_action_noise_i_d, deriv_action_mean_i_d, normalizer, this->getNumTimesteps(), this->getNumRollouts(), this->params_.sum_strides, this->stream_, synchronize); dim3 grid(1, 1, 1); dim3 block(1, this->CONTROL_DIM, 1); // std::cout << "Integrating optimal sequence" << std::endl; integrateNoise<<stream_>>>(deriv_action_mean_i_d, &this->control_means_d_[mean_index], &this->control_samples_d_[sample_index], 1, this->getNumTimesteps(), this->CONTROL_DIM, this->params_.dt); HANDLE_ERROR(cudaMemcpyAsync(&this->control_means_d_[mean_index], &this->control_samples_d_[sample_index], sizeof(float) * this->getNumTimesteps() * this->CONTROL_DIM, cudaMemcpyDeviceToDevice, this->stream_)); HANDLE_ERROR(cudaMemcpyAsync(&this->means_[mean_index], &this->control_means_d_[mean_index], sizeof(float) * this->getNumTimesteps() * CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); // HANDLE_ERROR(cudaMemcpyAsync(&means_[distribution_i * this->getNumTimesteps() * CONTROL_DIM], // deriv_action_mean_i_d, // sizeof(float) * this->getNumTimesteps() * CONTROL_DIM, cudaMemcpyDeviceToHost, // this->stream_)); if (synchronize) { HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); } } // SMOOTH_MPPI_TEMPLATE // __host__ void SMOOTH_MPPI_NOISE::setHostOptimalControlSequence(float* optimal_control_trajectory, // const int& distribution_i, bool synchronize) // { // if (distribution_i >= this->getNumDistributions()) // { // std::cerr << "Asking for optimal control sequence from distribution " << 
distribution_i << " out of " // << this->getNumDistributions() << " total." << std::endl; // return; // } // HANDLE_ERROR(cudaMemcpyAsync( // optimal_control_trajectory, &(this->control_means_d_[this->getNumTimesteps() * CONTROL_DIM * distribution_i]), // sizeof(float) * this->getNumTimesteps() * CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); // if (synchronize) // { // HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); // } // } } // namespace sampling_distributions } // namespace mppi ================================================ FILE: include/mppi/sampling_distributions/smooth-MPPI/smooth-MPPI.cuh ================================================ #pragma once /** * Created by Bogdan, Jan 8, 2024 */ #include namespace mppi { namespace sampling_distributions { __global__ void integrateNoise(const float* __restrict__ action_deriv_d, const float* __restrict__ control_mean_d, float* __restrict__ control_d, const int num_samples, const int num_timesteps, const int control_dim, const float dt); __global__ void shiftControlTrajectory(float* __restrict__ control_trajectory_d, const int num_distributions, const int num_timesteps, const int control_dim, const int shift_index); template struct SmoothMPPIParamsImpl : public GaussianParamsImpl { float dt = 0.015f; dim3 shift_trajectory_block; SmoothMPPIParamsImpl(int num_rollouts = 1, int num_timesteps = 1, int num_distributions = 1) : GaussianParamsImpl(num_rollouts, num_timesteps, num_distributions) { } }; template using SmoothMPPIParams = SmoothMPPIParamsImpl; template class PARAMS_TEMPLATE = SmoothMPPIParams, class DYN_PARAMS_T = DynamicsParams> class SmoothMPPIDistributionImpl : public GaussianDistributionImpl { public: using PARENT_CLASS = GaussianDistributionImpl; using SAMPLING_PARAMS_T = typename PARENT_CLASS::SAMPLING_PARAMS_T; using control_array = typename PARENT_CLASS::control_array; static const int CONTROL_DIM = PARENT_CLASS::CONTROL_DIM; SmoothMPPIDistributionImpl(cudaStream_t stream = 0); 
SmoothMPPIDistributionImpl(const SAMPLING_PARAMS_T& params, cudaStream_t stream = 0); ~SmoothMPPIDistributionImpl() { freeCudaMem(); } __host__ virtual std::string getSamplingDistributionName() const override { return "Smooth-MPPI"; } __host__ void allocateCUDAMemoryHelper(); __host__ void freeCudaMem(); __host__ void generateSamples(const int& optimization_stride, const int& iteration_num, curandGenerator_t& gen, bool synchronize = true); // __host__ void copyImportanceSamplerToDevice(const float* importance_sampler, // const int& distribution_idx, bool synchronize = true); // __host__ void setHostOptimalControlSequence(float* optimal_control_trajectory, const int& distribution_i, // bool synchronize = true); __host__ void updateDistributionParamsFromDevice(const float* trajectory_weights_d, float normalizer, const int& distribution_i, bool synchronize = false) override; protected: float* deriv_action_mean_d_ = nullptr; float* deriv_action_noise_d_ = nullptr; }; template class SmoothMPPIDistribution : public SmoothMPPIDistributionImpl, SmoothMPPIParams, DYN_PARAMS_T> { public: using PARENT_CLASS = SmoothMPPIDistributionImpl; using SAMPLING_PARAMS_T = typename PARENT_CLASS::SAMPLING_PARAMS_T; SmoothMPPIDistribution(cudaStream_t stream = 0) : PARENT_CLASS(stream) { } SmoothMPPIDistribution(const SAMPLING_PARAMS_T& params, cudaStream_t stream = 0) : PARENT_CLASS(params, stream) { } }; } // namespace sampling_distributions } // namespace mppi #ifdef __CUDACC__ #include "smooth-MPPI.cu" #endif ================================================ FILE: include/mppi/shaping_functions/CEM/cem_shaping_function.cu ================================================ template __host__ __device__ float CEMShapingFunctionImpl::computeWeight( float* traj_cost, float baseline, int global_idx) { if (traj_cost[global_idx] >= baseline) { return (1.0 / ((int)(NUM_ROLLOUTS) * this->params_.gamma + 1)); } else { return 0.0; } } template void CEMShapingFunctionImpl::computeWeights(cost_traj& 
trajectory_costs, float* trajectory_costs_d, cudaStream_t stream) { // Copy the costs back to the host HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs.data(), trajectory_costs_d, NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_)); HANDLE_ERROR(cudaStreamSynchronize(stream)); std::vector indexes(NUM_ROLLOUTS); iota(indexes.begin(), indexes.end(), 0); std::nth_element(indexes.begin(), indexes.begin() + (int)(NUM_ROLLOUTS * this->params_.gamma), indexes.end(), [&trajectory_costs](int i1, int i2) { return trajectory_costs(i1) > trajectory_costs(i2); }); this->baseline_ = trajectory_costs(indexes[(int)NUM_ROLLOUTS * this->params_.gamma]); // launch the WeightKernel to calculate the weights if (this->params_.gamma == 0) { trajectory_costs(indexes[0]) = 1.0; for (int i = 1; i < NUM_ROLLOUTS; i++) { trajectory_costs(indexes[i]) = 0.0; } } else { CLASS_T* derived = static_cast(this); derived->launchWeightKernel(trajectory_costs_d, this->baseline_, stream); HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs.data(), trajectory_costs_d, NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, stream)); HANDLE_ERROR(cudaStreamSynchronize(stream)); } } ================================================ FILE: include/mppi/shaping_functions/CEM/cem_shaping_function.cuh ================================================ #ifndef MPPIGENERIC_CEM_SHAPING_FUNCTION_CUH #define MPPIGENERIC_CEM_SHAPING_FUNCTION_CUH #include #include #include struct CEMShapingFunctionParams { float gamma; // represents the elite sample percentage }; template class CEMShapingFunctionImpl : public ShapingFunctionImpl { public: typedef Eigen::Matrix cost_traj; CEMShapingFunctionImpl() { this->normalizer_ = 1.0; } __host__ __device__ float computeWeight(float* traj_cost, float baseline, int global_idx); void computeWeights(cost_traj& trajectory_costs, float* trajectory_costs_d, cudaStream_t stream = nullptr); /* * TODO what should this be for CEM? 
void computeFreeEnergy(float& free_energy, float& free_energy_var, float& free_energy_modified, float* cost_rollouts_host, float baseline); */ }; template class CEMShapingFunction : public CEMShapingFunctionImpl, CEMShapingFunctionParams, NUM_ROLLOUTS, BDIM_X> { public: }; #if __CUDACC__ #include "cem_shaping_function.cu" #endif #endif // MPPIGENERIC_CEM_SHAPING_FUNCTION_CUH ================================================ FILE: include/mppi/shaping_functions/shaping_function.cu ================================================ #include "shaping_function.cuh" #include namespace mppi_common { template __global__ void weightKernel(float* trajectory_costs_d, float baseline, CLASS_T* shape_function) { int global_idx = (blockDim.x * blockIdx.x + threadIdx.x) * blockDim.z + threadIdx.z; if (global_idx < NUM_ROLLOUTS * blockDim.z) { trajectory_costs_d[global_idx] = shape_function->computeWeight(trajectory_costs_d, baseline, global_idx); } } } // namespace mppi_common template void ShapingFunctionImpl::paramsToDevice() { if (GPUMemStatus_) { HANDLE_ERROR( cudaMemcpyAsync(&shaping_function_d_->params_, ¶ms_, sizeof(PARAMS_T), cudaMemcpyHostToDevice, stream_)); HANDLE_ERROR(cudaStreamSynchronize(stream_)); } } template void ShapingFunctionImpl::freeCudaMem() { if (GPUMemStatus_) { cudaFree(shaping_function_d_); GPUMemStatus_ = false; shaping_function_d_ = nullptr; } } template void ShapingFunctionImpl::GPUSetup() { CLASS_T* derived = static_cast(this); if (!GPUMemStatus_) { shaping_function_d_ = Managed::GPUSetup(derived); } else { std::cout << "GPU Memory already set" << std::endl; // TODO should this be an exception? 
} derived->paramsToDevice(); } template __host__ __device__ float ShapingFunctionImpl::computeWeight(float* traj_cost, float baseline, int global_idx) { return expf(-this->params_.lambda_inv * (traj_cost[global_idx] - baseline)); } template void ShapingFunctionImpl::computeWeights(cost_traj& trajectory_costs, float* trajectory_costs_d, cudaStream_t stream) { // Copy the costs back to the host HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs.data(), trajectory_costs_d, NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_)); HANDLE_ERROR(cudaStreamSynchronize(stream)); baseline_ = trajectory_costs.minCoeff(); // launch the WeightKernel to calculate the weights CLASS_T* derived = static_cast(this); derived->launchWeightKernel(trajectory_costs_d, baseline_, stream); HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs.data(), trajectory_costs_d, NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, stream)); HANDLE_ERROR(cudaStreamSynchronize(stream)); normalizer_ = trajectory_costs.sum(); // TODO after this you should grab the baseline value and the normalizer } template void ShapingFunctionImpl::launchWeightKernel(float* trajectory_costs_d, float baseline, cudaStream_t stream) { dim3 dimBlock(BDIM_X, 1, 1); dim3 dimGrid((NUM_ROLLOUTS - 1) / BDIM_X + 1, 1, 1); mppi_common::weightKernel <<<1, NUM_ROLLOUTS>>>(trajectory_costs_d, baseline, this->shaping_function_d_); CudaCheckError(); } template void ShapingFunctionImpl::computeFreeEnergy( float& free_energy, float& free_energy_var, float& free_energy_modified, float* cost_rollouts_host, float baseline) { float var = 0; float norm = 0; for (int i = 0; i < NUM_ROLLOUTS; i++) { norm += cost_rollouts_host[i]; var += SQ(cost_rollouts_host[i]); } norm /= NUM_ROLLOUTS; free_energy = -1.0f / this->params_.lambda_inv * logf(norm) + baseline; free_energy_var = 1.0f / this->params_.lambda_inv * (var / NUM_ROLLOUTS - SQ(norm)); // TODO Figure out the point of the following lines float weird_term = free_energy_var / (norm * 
sqrtf(1.0 * NUM_ROLLOUTS)); free_energy_modified = 1.0f / this->params_.lambda_inv * (weird_term + 0.5 * SQ(weird_term)); } ================================================ FILE: include/mppi/shaping_functions/shaping_function.cuh ================================================ #ifndef MPPIGENERIC_SHAPING_FUNCTION_CUH #define MPPIGENERIC_SHAPING_FUNCTION_CUH #include #include namespace mppi_common { template __global__ void weightKernel(float* trajectory_costs_d, float baseline, CLASS_T* shape_function); } struct ShapingFunctionParams { float lambda_inv = 1.0; // also known as gamma }; template class ShapingFunctionImpl : public Managed { public: typedef Eigen::Matrix cost_traj; ShapingFunctionImpl() { paramsToDevice(); } __host__ __device__ float computeWeight(float* traj_cost, float baseline, int global_idx); void computeWeights(cost_traj& trajectory_costs, float* trajectory_costs_d, cudaStream_t stream = nullptr); void launchWeightKernel(float* trajectory_costs_d, float baseline, cudaStream_t stream = nullptr); void computeFreeEnergy(float& free_energy, float& free_energy_var, float& free_energy_modified, float* cost_rollouts_host, float baseline); PARAMS_T getParams() { return params_; } float getBaseline() { return baseline_; } float getNormalizer() { return normalizer_; } void setParams(const PARAMS_T& params) { this->params_ = params; paramsToDevice(); } void GPUSetup(); void freeCudaMem(); void paramsToDevice(); CLASS_T* shaping_function_d_ = nullptr; protected: PARAMS_T params_; float baseline_; float normalizer_; }; template class ShapingFunction : public ShapingFunctionImpl, ShapingFunctionParams, NUM_ROLLOUTS, BDIM_X> { public: }; #if __CUDACC__ #include "shaping_function.cu" #endif #endif // MPPIGENERIC_SHAPING_FUNCTION_CUH ================================================ FILE: include/mppi/utils/activation_functions.cuh ================================================ #ifndef ACTIVATION_FUNCTIONS_CUH_ #define ACTIVATION_FUNCTIONS_CUH_ #include 
"math_utils.h" namespace mppi { namespace nn { /** * There is hardware support for the tanh in some GPUs, so the hand written version is slower * Checked on a 3080 and 1050ti * @param input * @return */ inline __host__ __device__ float tanh(float input) { // There is hardware support for tanh starting in Turing (above 750+) that makes this approximation slower // #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 750) // const float num = __expf(2.0f * input); // // return __fdiv_rz(num - 1.0f, num + 1.0f); // return (num - 1.0f) / (num + 1.0f); // #else // return tanhf(input); // #endif return tanhf(input); } inline __host__ __device__ float tanh_accurate(float input) { return tanhf(input); } inline __host__ __device__ float tanh_deriv(float input) { return 1.0f - SQ(mppi::nn::tanh(input)); } inline __host__ __device__ float relu(float input) { return fmaxf(0.0f, input); } /** * Uses hardware support for tanh function, should be faster * Tested on 3080 and 1050ti * @param input * @return */ inline __host__ __device__ float sigmoid(float input) { #ifdef __CUDA_ARCH__ // these three are roughly equivalent on 3080 // return __fmul_rz(1.0f + tanhf(__fmul_rz(input, 0.5f)), 0.5f); // return __fmaf_rz(tanhf(__fmul_rz(input, 0.5f)), 0.5f, 0.5f); return (1.0f + mppi::nn::tanh(input / 2.0f)) / 2.0f; #else return (1.0f / (1.0f + expf(-input))); #endif } inline __host__ __device__ float sigmoid_accurate(float input) { return (1.0f / (1.0f + expf(-input))); } /** * * @param input * @return */ __host__ __device__ static inline float tanh_vel_scale(float state, float vel, const float constants[2]) { return state * constants[1] * mppi::nn::tanh(vel * constants[0]); } /** * * @param input * @return */ __host__ __device__ static inline float tanh_scale(float state, const float constants[2]) { return constants[1] * mppi::nn::tanh(state * constants[0]); } /** * * @param x * @return */ __host__ __device__ static inline float tanhshrink(float x) { return x - mppi::nn::tanh(x); } /** * * 
@param x * @return */ __host__ __device__ static inline float tanhshrink_scale(float x, float scale) { return scale * tanhshrink(x / scale); } } // namespace nn } // namespace mppi #endif // ACTIVATION_FUNCTIONS_CUH_ ================================================ FILE: include/mppi/utils/angle_utils.cuh ================================================ #ifndef ANGLES_CUH_ #define ANGLES_CUH_ #include #include namespace angle_utils { /** * * @param angle * @return */ __host__ __device__ static inline double normalizeAngle(double angle) { const double result = fmod(angle + M_PI, 2.0 * M_PI); if (result <= 0.0) return result + M_PI; return result - M_PI; } // float version might not be exact // different systems will have slightly different values. __host__ __device__ static inline float normalizeAngle(float angle) { const float result = fmodf(angle + static_cast(M_PI), static_cast(2.0 * M_PI)); if (result <= 0.0f) return result + static_cast(M_PI); return result - static_cast(M_PI); } /** * * @param from * @param to * @return */ __host__ __device__ static inline double shortestAngularDistance(double from, double to) { return normalizeAngle(to - from); } __host__ __device__ static inline float shortestAngularDistance(float from, float to) { return normalizeAngle(to - from); } /** * Does a linear interpolation of the euler angle while respecting -pi to pi wrapping * solution from https://www.ri.cmu.edu/pub_files/pub4/kuffner_james_2004_1/kuffner_james_2004_1.pdf algorithm 6 * @param angle_1 * @param angle_2 * @param alpha * @return */ __host__ __device__ static inline double interpolateEulerAngleLinear(double angle_1, double angle_2, double alpha) { double angle_diff = shortestAngularDistance(angle_1, angle_2); return normalizeAngle(angle_1 + alpha * angle_diff); } __host__ __device__ static inline float interpolateEulerAngleLinear(float angle_1, float angle_2, float alpha) { float angle_diff = shortestAngularDistance(angle_1, angle_2); return normalizeAngle(angle_1 + 
alpha * angle_diff); } } // namespace angle_utils #endif // ANGLES_CUH_ ================================================ FILE: include/mppi/utils/cuda_math_utils.cuh ================================================ // // Created by jason on 4/25/22. // #ifndef MPPIGENERIC_CUDA_MATH_UTILS_CUH #define MPPIGENERIC_CUDA_MATH_UTILS_CUH #include #ifndef __UNROLL #define __xstr__(s) __str__(s) #define __str__(s) #s #ifdef __CUDACC__ #define __UNROLL(a) _Pragma("unroll") #elif defined(__GNUC__) // GCC is the compiler and uses different unroll syntax #define __UNROLL(a) _Pragma(__xstr__(GCC unroll a)) #endif #endif // Matching float4 syntax template struct __align__(4 * sizeof(T)) type4 { T x; T y; T z; T w; // Allow writing to struct using array index __host__ __device__ T& operator[](int i) { assert(i >= 0); assert(i < 4); return (i > 1) ? ((i == 2) ? z : w) : ((i == 0) ? x : y); } // Allow reading from struct using array index __host__ __device__ const T& operator[](int i) const { assert(i >= 0); assert(i < 4); return (i > 1) ? ((i == 2) ? z : w) : ((i == 0) ? x : y); } }; template struct __align__(2 * sizeof(T)) type2 { T x; T y; // Allow writing to struct using array index __host__ __device__ T& operator[](int i) { assert(i >= 0); assert(i < 2); return (i == 0) ? x : y; } // Allow reading from struct using array index __host__ __device__ const T& operator[](int i) const { assert(i >= 0); assert(i < 2); return (i == 0) ? 
x : y; } }; // Scalar-Vector Multiplication __host__ __device__ inline float2 operator*(const float2& a, const float& b) { return make_float2(a.x * b, a.y * b); } __host__ __device__ inline float3 operator*(const float3& a, const float& b) { return make_float3(a.x * b, a.y * b, a.z * b); } __host__ __device__ inline float4 operator*(const float4& a, const float& b) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); } __host__ __device__ inline float2 operator*(const float& b, const float2& a) { return make_float2(a.x * b, a.y * b); } __host__ __device__ inline float3 operator*(const float& b, const float3& a) { return make_float3(a.x * b, a.y * b, a.z * b); } __host__ __device__ inline float4 operator*(const float& b, const float4& a) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); } // Scalar-Vector Addition __host__ __device__ inline float2 operator+(const float2& a, const float& b) { return make_float2(a.x + b, a.y + b); } __host__ __device__ inline float3 operator+(const float3& a, const float& b) { return make_float3(a.x + b, a.y + b, a.z + b); } __host__ __device__ inline float4 operator+(const float4& a, const float& b) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); } // Scalar-Vector Subtraction __host__ __device__ inline float2 operator-(const float2& a, const float& b) { return make_float2(a.x - b, a.y - b); } __host__ __device__ inline float3 operator-(const float3& a, const float& b) { return make_float3(a.x - b, a.y - b, a.z - b); } __host__ __device__ inline float4 operator-(const float4& a, const float& b) { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); } // Scalar-Vector Dvision __host__ __device__ inline float2 operator/(const float2& a, const float& b) { return make_float2(a.x / b, a.y / b); } __host__ __device__ inline float3 operator/(const float3& a, const float& b) { return make_float3(a.x / b, a.y / b, a.z / b); } __host__ __device__ inline float4 operator/(const float4& a, const float& b) { return 
make_float4(a.x / b, a.y / b, a.z / b, a.w / b); } // Vector-Vector Addition __host__ __device__ inline float2 operator+(const float2& a, const float2& b) { return make_float2(a.x + b.x, a.y + b.y); } __host__ __device__ inline float3 operator+(const float3& a, const float3& b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); } __host__ __device__ inline float4 operator+(const float4& a, const float4& b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } // Vector-Vector Subtraction __host__ __device__ inline float2 operator-(const float2& a, const float2& b) { return make_float2(a.x - b.x, a.y - b.y); } __host__ __device__ inline float3 operator-(const float3& a, const float3& b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } __host__ __device__ inline float4 operator-(const float4& a, const float4& b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } // Vector-Vector Multiplication __host__ __device__ inline float2 operator*(const float2& a, const float2& b) { return make_float2(a.x * b.x, a.y * b.y); } __host__ __device__ inline float3 operator*(const float3& a, const float3& b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } __host__ __device__ inline float4 operator*(const float4& a, const float4& b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } // Vector-Vector Division __host__ __device__ inline float2 operator/(const float2& a, const float2& b) { return make_float2(a.x / b.x, a.y / b.y); } __host__ __device__ inline float3 operator/(const float3& a, const float3& b) { return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); } __host__ __device__ inline float4 operator/(const float4& a, const float4& b) { return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); } // Scalar-Vector multiply and set __host__ __device__ inline float2& operator*=(float2& a, const float& b) { a.x *= b; a.y *= b; return a; } __host__ __device__ inline float3& operator*=(float3& a, const float& b) { 
a.x *= b; a.y *= b; a.z *= b; return a; } __host__ __device__ inline float4& operator*=(float4& a, const float& b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } // Vector-Vector add and set __host__ __device__ inline float2& operator+=(float2& a, const float2& b) { a.x += b.x; a.y += b.y; return a; } __host__ __device__ inline float3& operator+=(float3& a, const float3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } __host__ __device__ inline float4& operator+=(float4& a, const float4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } __host__ __device__ inline float dot(const float2& a, const float2& b) { return a.x * b.x + a.y * b.y; } __host__ __device__ inline float cross(const float2& a, const float2& b) { return a.x * b.y - a.y * b.x; } __host__ __device__ inline float norm(const float2& a) { return sqrtf(dot(a, a)); } __host__ __device__ inline float dot(const float3& a, const float3& b) { return a.x * b.x + a.y * b.y + a.z * b.z; } __host__ __device__ inline float norm(const float3& a) { return sqrtf(dot(a, a)); } __host__ __device__ inline float dot(const float4& a, const float4& b) { return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; } __host__ __device__ inline float norm(const float4& a) { return sqrtf(dot(a, a)); } __host__ __device__ inline float2 operator-(const float2& a) { return make_float2(-a.x, -a.y); } __host__ __device__ inline float3 operator-(const float3& a) { return make_float3(-a.x, -a.y, -a.z); } __host__ __device__ inline float4 operator-(const float4& a) { return make_float4(-a.x, -a.y, -a.z, -a.w); } __host__ __device__ inline bool operator==(const float2& lhs, const float2& rhs) { return (lhs.x == rhs.x) && (lhs.y == rhs.y); } __host__ __device__ inline bool operator==(const float3& lhs, const float3& rhs) { return (lhs.x == rhs.x) && (lhs.y == rhs.y) && (lhs.z == rhs.z); } __host__ __device__ inline bool operator==(const float4& lhs, const float4& rhs) { return (lhs.x == rhs.x) && (lhs.y == rhs.y) && 
(lhs.z == rhs.z) && (lhs.w == rhs.w); } // Create different data types given the same input. Needed for handling border values in cudaTextures with Border // adress mode template inline __host__ __device__ DATA_T createPartialCudaTuple(const float& r, const float& g, const float& b, const float& a); template <> inline __host__ __device__ float createPartialCudaTuple(const float& r, const float& g, const float& b, const float& a) { return r; } template <> inline __host__ __device__ float2 createPartialCudaTuple(const float& r, const float& g, const float& b, const float& a) { return make_float2(r, g); } template <> inline __host__ __device__ float3 createPartialCudaTuple(const float& r, const float& g, const float& b, const float& a) { return make_float3(r, g, b); } template <> inline __host__ __device__ float4 createPartialCudaTuple(const float& r, const float& g, const float& b, const float& a) { return make_float4(r, g, b, a); } template <> inline __host__ __device__ int createPartialCudaTuple(const float& r, const float& g, const float& b, const float& a) { return static_cast(r); } template <> inline __host__ __device__ int2 createPartialCudaTuple(const float& r, const float& g, const float& b, const float& a) { return make_int2(static_cast(r), static_cast(g)); } template <> inline __host__ __device__ int3 createPartialCudaTuple(const float& r, const float& g, const float& b, const float& a) { return make_int3(static_cast(r), static_cast(g), static_cast(b)); } template <> inline __host__ __device__ int4 createPartialCudaTuple(const float& r, const float& g, const float& b, const float& a) { return make_int4(static_cast(r), static_cast(g), static_cast(b), static_cast(a)); } #endif // MPPIGENERIC_CUDA_MATH_UTILS_CUH ================================================ FILE: include/mppi/utils/eigen_type_conversions.h ================================================ #pragma once #include #include __device__ __host__ Eigen::Vector3f cudaToEigen(const float3& v) { return 
Eigen::Vector3f(v.x, v.y, v.z); } __device__ __host__ float3 EigenToCuda(const Eigen::Vector3f& v) { return make_float3(v[0], v[1], v[2]); } ================================================ FILE: include/mppi/utils/file_utils.h ================================================ // // Created by jgibson37 on 1/9/20. // #ifndef MPPIGENERIC_FILE_UTILS_H #define MPPIGENERIC_FILE_UTILS_H #include #include inline bool fileExists(const std::string& name) { return (access(name.c_str(), F_OK) != -1); } #endif // MPPIGENERIC_FILE_UTILS_H ================================================ FILE: include/mppi/utils/gpu_err_chk.cuh ================================================ #ifndef CUDA_TESTING_GPU_ERR_CHK_CUH #define CUDA_TESTING_GPU_ERR_CHK_CUH #include #include #include #include // For block idx and thread idx, etc #include #ifndef DEPRECATED #if __cplusplus >= 201402L #define DEPRECATED [[deprecated]] #elif defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) #define DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) #define DEPRECATED __declspec(deprecated) #else #pragma message("WARNING: You need to implement DEPRECATED for this compiler") #define DEPRECATED #endif #endif // #ifndef __DEPRECATED__ // #if defined(_WIN32) // # define __DEPRECATED__(msg) __declspec(deprecated(msg)) // #elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) // # define __DEPRECATED__(msg) __attribute__((deprecated)) // #else // # define __DEPRECATED__(msg) __attribute__((deprecated(msg))) // #endif // #endif inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) { if (code != cudaSuccess) { fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } inline void __cudaCheckError(const char* file, const int line) { cudaError err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n", file, 
line, cudaGetErrorString(err)); exit(-1); } // More careful checking. However, this will affect performance. // Comment away if needed. err = cudaDeviceSynchronize(); if (cudaSuccess != err) { fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", file, line, cudaGetErrorString(err)); exit(-1); } } inline const char* cufftGetErrorString(cufftResult& code) { // Codes from https://docs.nvidia.com/cuda/cufft/index.html#cufftresult switch (code) { case CUFFT_SUCCESS: return "Success"; case CUFFT_INVALID_PLAN: return "cuFFT was passed an invalid plan handle"; case CUFFT_ALLOC_FAILED: return "cuFFT failed to allocate GPU or CPU memory"; case CUFFT_INVALID_VALUE: return "User specified an invalid pointer or parameter"; case CUFFT_INTERNAL_ERROR: return "Driver or internal cuFFT library error"; case CUFFT_EXEC_FAILED: return "Failed to execute an FFT on the GPU"; case CUFFT_SETUP_FAILED: return "The cuFFT library failed to initialize"; case CUFFT_INVALID_SIZE: return "User specified an invalid transform size"; case CUFFT_UNALIGNED_DATA: return "No longer used but unaligned data"; case CUFFT_INVALID_DEVICE: return "Execution of a plan was on different GPU than plan creation"; case CUFFT_NO_WORKSPACE: return "No workspace has been provided prior to plan execution"; case CUFFT_NOT_IMPLEMENTED: return "Function does not implement functionality for parameters given."; #if CUDART_VERSION < 13000 case CUFFT_INCOMPLETE_PARAMETER_LIST: return "Missing parameters in call"; case CUFFT_PARSE_ERROR: return "Internal plan database error"; case CUFFT_LICENSE_ERROR: return "License Error. 
Used in previous versions"; #else case CUFFT_MISSING_DEPENDENCY: return "cuFFT is unable to find a dependency"; case CUFFT_NVRTC_FAILURE: return "An NVRTC failure was encountered during a cuFFT operation"; case CUFFT_NVJITLINK_FAILURE: return "An nvJitLink failure was encountered during a cuFFT operation"; case CUFFT_NVSHMEM_FAILURE: return "An NVSHMEM failure was encountered during a cuFFT operation"; #endif case CUFFT_NOT_SUPPORTED: return "Operation is not supported for parameters given."; default: return "cuFFT ERROR"; } } inline const char* curandGetErrorString(curandStatus_t code) { // Codes from https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437 switch (code) { case CURAND_STATUS_SUCCESS: return "No errors."; case CURAND_STATUS_VERSION_MISMATCH: return "Header file and linked library version do not match."; case CURAND_STATUS_NOT_INITIALIZED: return "Generator not initialized."; case CURAND_STATUS_ALLOCATION_FAILED: return "Memory allocation failed."; case CURAND_STATUS_TYPE_ERROR: return "Generator is wrong type."; case CURAND_STATUS_OUT_OF_RANGE: return "Argument out of range."; case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return "Length requested is not a multple of dimension."; case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return "GPU does not have double precision required by MRG32k3a."; case CURAND_STATUS_LAUNCH_FAILURE: return "Kernel launch failure."; case CURAND_STATUS_PREEXISTING_FAILURE: return "Preexisting failure on library entry."; case CURAND_STATUS_INITIALIZATION_FAILED: return "Initialization of CUDA failed."; case CURAND_STATUS_ARCH_MISMATCH: return "Architecture mismatch, GPU does not support requested feature."; case CURAND_STATUS_INTERNAL_ERROR: return "Internal library error."; default: return "Cureand Error"; } } inline void cufftAssert(cufftResult code, const char* file, int line, bool abort = true) { if (code != CUFFT_SUCCESS) { fprintf(stderr, "CUFFTassert: %s %s %d\n", 
cufftGetErrorString(code), file, line); if (abort) { exit(code); } } } inline void curandAssert(curandStatus_t code, const char* file, int line, bool abort = true) { if (code != CURAND_STATUS_SUCCESS) { fprintf(stderr, "Curandassert: %s %s %d\n", curandGetErrorString(code), file, line); if (abort) { exit(code); } } } #define CudaCheckError() __cudaCheckError(__FILE__, __LINE__) #define HANDLE_ERROR(ans) \ { \ gpuAssert((ans), __FILE__, __LINE__); \ } #define HANDLE_CUFFT_ERROR(ans) \ { \ cufftAssert((ans), __FILE__, __LINE__); \ } #define HANDLE_CURAND_ERROR(ans) \ { \ curandAssert((ans), __FILE__, __LINE__); \ } #endif //! CUDA_TESTING_GPU_ERR_CHK_CUH ================================================ FILE: include/mppi/utils/logger.hpp ================================================ #pragma once /** * Created by Bogdan on 11/01/2023 */ #include #include #include #include namespace mppi { namespace util { enum class LOG_LEVEL : int { DEBUG = 0, INFO, WARNING, ERROR, NONE }; static LOG_LEVEL GLOBAL_LOG_LEVEL = LOG_LEVEL::WARNING; const char BLACK[] = "\033[0;30m"; const char RED[] = "\033[0;31m"; const char GREEN[] = "\033[0;32m"; const char YELLOW[] = "\033[0;33m"; const char BLUE[] = "\033[0;34m"; const char MAGENTA[] = "\033[0;35m"; const char CYAN[] = "\033[0;36m"; const char WHITE[] = "\033[0;37m"; const char RESET[] = "\033[0m"; class MPPILogger { public: MPPILogger() = default; MPPILogger(const MPPILogger& other) = default; MPPILogger(MPPILogger&& other) = default; virtual ~MPPILogger() = default; MPPILogger& operator=(const MPPILogger& other) = default; MPPILogger& operator=(MPPILogger&& other) = default; explicit MPPILogger(LOG_LEVEL level) { setLogLevel(level); } /** * @brief Set the Log Level * * @param level */ void setLogLevel(const LOG_LEVEL& level) { log_level_ = level; } /** * @brief Set the Output Stream * * @param output file stream to write (stdout, stderr, nullptr, etc.) 
*/ void setOutputStream(std::FILE* const output) { output_stream_ = output; } /** * @brief Get the Log Level object * * @return LOG_LEVEL */ LOG_LEVEL getLogLevel() const { return log_level_; } /** * @brief Get the Output Stream object * * @return std::FILE* */ std::FILE* getOutputStream() const { return output_stream_; } /** * @brief Log debug messages using virtual debug_impl() method * * @tparam ...Args variadic template type of args used in the format string * @param fmt format string used in printf * @param args extra args used by the format string fmt */ template void debug(const char* fmt, Args const&... args) { std::string message = format_string(fmt, args...); this->debug_impl(message); } /** * @brief Log info messages using virtual info_impl() method * * @tparam ...Args variadic template type of args used in the format string * @param fmt format string used in printf * @param args extra args used by the format string fmt */ template void info(const char* fmt, Args const&... args) { std::string message = format_string(fmt, args...); this->info_impl(message); } /** * @brief Log warning messages using virtual warning_impl() method * * @tparam ...Args variadic template type of args used in the format string * @param fmt format string used in printf * @param args extra args used by the format string fmt */ template void warning(const char* fmt, Args const&... args) { std::string message = format_string(fmt, args...); this->warning_impl(message); } /** * @brief Log errror messages using virtual errror_impl() method * * @tparam ...Args variadic template type of args used in the format string * @param fmt format string used in printf * @param args extra args used by the format string fmt */ template void error(const char* fmt, Args const&... 
args) { std::string message = format_string(fmt, args...); this->error_impl(message); } protected: LOG_LEVEL log_level_ = GLOBAL_LOG_LEVEL; std::FILE* output_stream_ = stdout; virtual void debug_impl(const std::string& message) { if (log_level_ <= LOG_LEVEL::DEBUG) { surround_fprintf(output_stream_, GREEN, RESET, message); } } virtual void info_impl(const std::string& message) { if (log_level_ <= LOG_LEVEL::INFO) { surround_fprintf(output_stream_, CYAN, RESET, message); } } virtual void warning_impl(const std::string& message) { if (log_level_ <= LOG_LEVEL::WARNING) { surround_fprintf(output_stream_, YELLOW, RESET, message); } } virtual void error_impl(const std::string& message) { if (log_level_ <= LOG_LEVEL::ERROR) { surround_fprintf(output_stream_, RED, RESET, message); } } /** * @brief Print message to stream with coloring defined by prefix * * @param fstream where the message will be printed to * @param prefix prefix string to print before message. Expected to be a color code * @param suffix suffix string to print after message. Expected to be a color reset code * @param message actual message to be printed */ virtual void surround_fprintf(std::FILE* fstream, const char* prefix, const char* suffix, const std::string& message) { std::fprintf(fstream, "%s%s%s", prefix, message.c_str(), suffix); } /** * @brief create a string out of format string and variable number of additional arguments * * @tparam ...Args variadic template type for extra arguments passed to format_string() * @param fmt format string defining how to display additional arguments * @param args additional arguments for formatting * * @return std::string containing formatted text */ template std::string format_string(const char* fmt, Args const&... 
args) { // figure out size of formatted string std::vector buf(1 + std::snprintf(nullptr, 0, fmt, args...)); // Fill buffer with formatted string std::snprintf(buf.data(), buf.size(), fmt, args...); return std::string(buf.data()); } }; using MPPILoggerPtr = std::shared_ptr; } // namespace util } // namespace mppi ================================================ FILE: include/mppi/utils/managed.cuh ================================================ /* * Software License Agreement (BSD License) * Copyright (c) 2013, Georgia Institute of Technology * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /********************************************** * @file managed.cuh * @author Grady Williams * @date May 24, 2017 * @copyright 2017 Georgia Institute of Technology * @brief Class to be inherited by classes passed * to CUDA kernels. Helps unify stream management. ***********************************************/ #ifndef MPPI_MANAGED_CUH_ #define MPPI_MANAGED_CUH_ #include "gpu_err_chk.cuh" #include /** * @class Managed managed.cuh * @brief Class for setting the stream to be used by dynamics and cost functions used * by MPPIController. * * This class has one variable, which is the CUDA stream, and a function which sets the * stream. It is meant to be inherited by costs and dynamics classes which are passed into * the MPPIController kernels. In the past, this class used unified memory (hence the managed name), * so that classes could be passed by reference to CUDA kernels. However, as of right now, * the difficulties of using unified memory with multi-threaded CPU programs make getting * good performance with unified memory difficult, so this has been removed. Future implementations * may bring back the unified memory feature. */ class Managed { public: cudaStream_t stream_ = 0; ///< The CUDA Stream that the class is bound too. 0 is the default (NULL) stream. // true when allocated bool GPUMemStatus_ = false; Managed(cudaStream_t stream = 0) { this->bindToStream(stream); auto logger = std::make_shared(); setLogger(logger); } /** @brief Sets the stream and synchronizes the device. @param stream is the CUDA stream that the object is assigned too. 
*/ void bindToStream(cudaStream_t stream) { stream_ = stream; cudaDeviceSynchronize(); } // REQUIRED: basic interface, make sure to implement in each class // GPUSetup - allocates the GPU object // freeCudaMem -> deallocates what is setup in GPUSetup // getParams, setParams -> gets and sets the parameters // paramsToDevice -> copies the parameters over to the GPU side // OPTIONAL: // printParams // other printing methods __host__ void setLogger(const mppi::util::MPPILoggerPtr& logger) { logger_ = logger; } __host__ void setLogLevel(const mppi::util::LOG_LEVEL& level) { logger_->setLogLevel(level); } __host__ mppi::util::MPPILoggerPtr getLogger() { return logger_; } __host__ mppi::util::MPPILoggerPtr getLogger() const { return logger_; } __device__ __host__ int getGrdSharedSizeBytes() const { return SHARED_MEM_REQUEST_GRD_BYTES; } __device__ __host__ int getBlkSharedSizeBytes() const { return SHARED_MEM_REQUEST_BLK_BYTES; } protected: template static T* GPUSetup(T* host_ptr) { // Allocate enough space on the GPU for the object T* device_ptr; cudaMalloc((void**)&device_ptr, sizeof(T)); // Cudamemcpy HANDLE_ERROR(cudaMemcpyAsync(device_ptr, host_ptr, sizeof(T), cudaMemcpyHostToDevice, host_ptr->stream_)); cudaDeviceSynchronize(); host_ptr->GPUMemStatus_ = true; return device_ptr; } mppi::util::MPPILoggerPtr logger_ = nullptr; int SHARED_MEM_REQUEST_GRD_BYTES = 0; ///< Amount of shared memory we need per BLOCK. int SHARED_MEM_REQUEST_BLK_BYTES = 0; ///< Amount of shared memory we need per ROLLOUT. 
// TODO CRTP template this on the base class for allocation and dealloation }; #endif /* MPPI_MANAGED_CUH_*/ ================================================ FILE: include/mppi/utils/math_utils.h ================================================ /* * Created on Mon Jun 01 2020 by Bogdan * */ #ifndef MATH_UTILS_H_ #define MATH_UTILS_H_ // Needed for sampling without replacement #include #include #include #include #include #include #include #include #include #include #include #ifndef SQ #define SQ(a) ((a) * (a)) #endif // SQ // For aligning parameters within structs such as a float array to 16 bytes // Ex: float name[size] MPPI_ALIGN(16) = {0.0f}; #if defined(__CUDACC__) // NVCC #define MPPI_ALIGN(n) __align__(n) #elif defined(__GNUC__) // GCC #define MPPI_ALIGN(n) __attribute__((aligned(n))) #elif defined(_MSC_VER) // MSVC #define MPPI_ALIGN(n) __declspec(align(n)) #else #error "Please provide a definition for MPPI_ALIGN macro for your host compiler!" #endif namespace mppi { namespace math { namespace matrix = ::mppi::matrix_multiplication; const float GRAVITY = 9.81f; // Based off of https://gormanalysis.com/blog/random-numbers-in-cpp inline std::vector sample_without_replacement(const int k, const int N, std::default_random_engine g = std::default_random_engine()) { if (k > N) { throw std::logic_error("Can't sample more than n times without replacement"); } // Create an unordered set to store the samples std::unordered_set samples; // For loop runs k times for (int r = N - k; r < N; r++) { if (r == 0) { samples.insert(N - 1); continue; } int v = std::uniform_int_distribution<>(1, r)(g); // sample between 1 and r if (!samples.insert(v - 1).second) { // if v exists in the set samples.insert(r - 1); } } // Copy set into a vector std::vector final_sequence(samples.begin(), samples.end()); // Shuffle the vector to get the final sequence of sampling std::shuffle(final_sequence.begin(), final_sequence.end(), g); return final_sequence; } inline __host__ __device__ float 
expr(const float r, const float x)
{
  float mid_term = 1.0f + (r - 1.0f) * x;
  // The (mid_term > 0) factor returns 0 instead of NaN when the power base is non-positive
  return (mid_term > 0) * powf(mid_term, 1.0f / (r - 1.0f));
}

/**
 * Linear interpolation
 * Given two coordinates (x_min, y_min) and (x_max, y_max)
 * And the x location of a third (x), return the y location
 * along the line between the two points
 */
inline __host__ __device__ float linInterp(const float x, const float x_min, const float x_max, const float y_min,
                                           const float y_max)
{
  return (x - x_min) / (x_max - x_min) * (y_max - y_min) + y_min;
}

/**
 * Return the sign of a variable (1 for positive, -1 for negative, 0 for 0)
 **/
template <typename T>  // NOTE(review): template parameter list restored; it was stripped by extraction
inline __host__ __device__ int sign(const T& a)
{
  return (a > 0) - (a < 0);
}

// Integer ceil(a / b); a == 0 maps to 0
inline __host__ __device__ int int_ceil(const int& a, const int& b)
{
  return a == 0 ? a : (a - 1) / b + 1;
}

// constexpr variant of int_ceil for compile-time sizing
inline constexpr __host__ __device__ int int_ceil_const(const int& a, const int& b)
{
  return a == 0 ? a : (a - 1) / b + 1;
}

/**
 * @brief gives back the next multiple of b larger than or equal to a.
 * For example, if a = 3, b = 2, this method returns 4
 *
 * @param a - int to be larger than
 * @param b - int to be multiple of
 * @return int - the next multiple of b larger than a
 */
inline constexpr __host__ __device__ int int_multiple_const(const int& a, const int& b)
{
  return a == 0 ? a : ((a - 1) / b + 1) * b;
}

// Returns the int version of ceil(a/4)
inline __host__ __device__ int nearest_quotient_4(const int& a)
{
  return int_ceil(a, 4);
}

// Returns the next multiple of 4 larger than or equal to a. Useful for calculating aligned memory sizes
inline __host__ __device__ int nearest_multiple_4(const int& a)
{
  return int_ceil(a, 4) * 4;
}

/**
 * Calculates the normalized distance from the centerline
 * @param r - current radius
 * @param r_in - the inside radius of a track
 * @param r_out - the outside radius of a track
 * @return norm_dist - a normalized distance away from the centerline
 * norm_dist = 0 -> on the centerline
 * norm_dist = 1 -> on one of the track boundaries inner, or outer
 */
inline __host__ __device__ float normDistFromCenter(const float r, const float r_in, const float r_out)
{
  float r_center = (r_in + r_out) / 2.0f;
  float r_width = (r_out - r_in);
  float dist_from_center = fabsf(r - r_center);
  float norm_dist = dist_from_center / (r_width * 0.5f);
  return norm_dist;
}

/**
 * Multiply two quaternions together which gives you their rotations added together
 * q_3 = q_1 x q_2
 * Inputs:
 * q_1 - first quaternion
 * q_2 - second quaternion
 * q_3 - output quaternion
 * normalize - renormalize q_3 to unit length after the product (default true)
 */
inline __host__ __device__ void QuatMultiply(const float q_1[4], const float q_2[4], float q_3[4],
                                             bool normalize = true)
{
  q_3[0] = q_1[0] * q_2[0] - q_1[1] * q_2[1] - q_1[2] * q_2[2] - q_1[3] * q_2[3];
  q_3[1] = q_1[1] * q_2[0] + q_1[0] * q_2[1] - q_1[3] * q_2[2] + q_1[2] * q_2[3];
  q_3[2] = q_1[2] * q_2[0] + q_1[3] * q_2[1] + q_1[0] * q_2[2] - q_1[1] * q_2[3];
  q_3[3] = q_1[3] * q_2[0] - q_1[2] * q_2[1] + q_1[1] * q_2[2] + q_1[0] * q_2[3];
  if (normalize)
  {
#ifdef __CUDA_ARCH__
    // rsqrtf is the fast device reciprocal square root
    float inv_norm = rsqrtf(SQ(q_3[0]) + SQ(q_3[1]) + SQ(q_3[2]) + SQ(q_3[3]));
#else
    float inv_norm = 1.0f / sqrtf(SQ(q_3[0]) + SQ(q_3[1]) + SQ(q_3[2]) + SQ(q_3[3]));
#endif
    __UNROLL(4)
    for (int i = 0; i < 4; i++)
    {
      q_3[i] *= inv_norm;
    }
  }
}

inline __host__ __device__ void QuatInv(const float q[4], float
q_inv[4]) { #ifdef __CUDA_ARCH__ float inv_norm = rsqrtf(SQ(q[0]) + SQ(q[1]) + SQ(q[2]) + SQ(q[3])); #else float inv_norm = 1.0f / sqrtf(SQ(q[0]) + SQ(q[1]) + SQ(q[2]) + SQ(q[3])); #endif q_inv[0] = q[0] * inv_norm; q_inv[1] = -q[1] * inv_norm; q_inv[2] = -q[2] * inv_norm; q_inv[3] = -q[3] * inv_norm; } /** * Calculate the rotation required to get from q_1 to q_2 * In Euler angles, this would be direct subraction but * in quaternions, it doesn't quite work that way */ inline __device__ void QuatSubtract(float q_1[4], float q_2[4], float q_3[4]) { float q_1_inv[4]; QuatInv(q_1, q_1_inv); QuatMultiply(q_2, q_1_inv, q_3); } /* * The Euler rotation sequence is 3-2-1 (roll, pitch, yaw) from Body to World */ inline __device__ void Euler2QuatNWU(const double& r, const double& p, const double& y, double q[4]) { double phi_2 = r / 2.0; double theta_2 = p / 2.0; double psi_2 = y / 2.0; double cos_phi_2 = cos(phi_2); double sin_phi_2 = sin(phi_2); double cos_theta_2 = cos(theta_2); double sin_theta_2 = sin(theta_2); double cos_psi_2 = cos(psi_2); double sin_psi_2 = sin(psi_2); q[0] = cos_phi_2 * cos_theta_2 * cos_psi_2 + sin_phi_2 * sin_theta_2 * sin_psi_2; q[1] = -cos_phi_2 * sin_theta_2 * sin_psi_2 + cos_theta_2 * cos_psi_2 * sin_phi_2; q[2] = cos_phi_2 * cos_psi_2 * sin_theta_2 + sin_phi_2 * cos_theta_2 * sin_psi_2; q[3] = cos_phi_2 * cos_theta_2 * sin_psi_2 - sin_phi_2 * cos_psi_2 * sin_theta_2; } /* * The Euler rotation sequence is 3-2-1 (roll, pitch, yaw) from Body to World */ inline __host__ __device__ void Euler2QuatNWU(const float& r, const float& p, const float& y, float q[4]) { float sin_phi_2, cos_phi_2, sin_theta_2, cos_theta_2, sin_psi_2, cos_psi_2; #ifdef __CUDA_ARCH__ float r_norm = angle_utils::normalizeAngle(r * 0.5f); float p_norm = angle_utils::normalizeAngle(p * 0.5f); float y_norm = angle_utils::normalizeAngle(y * 0.5f); __sincosf(r_norm, &sin_phi_2, &cos_phi_2); __sincosf(p_norm, &sin_theta_2, &cos_theta_2); __sincosf(y_norm, &sin_psi_2, &cos_psi_2); 
#else sincosf(r * 0.5f, &sin_phi_2, &cos_phi_2); sincosf(p * 0.5f, &sin_theta_2, &cos_theta_2); sincosf(y * 0.5f, &sin_psi_2, &cos_psi_2); #endif q[0] = cos_phi_2 * cos_theta_2 * cos_psi_2 + sin_phi_2 * sin_theta_2 * sin_psi_2; q[1] = -cos_phi_2 * sin_theta_2 * sin_psi_2 + cos_theta_2 * cos_psi_2 * sin_phi_2; q[2] = cos_phi_2 * cos_psi_2 * sin_theta_2 + sin_phi_2 * cos_theta_2 * sin_psi_2; q[3] = cos_phi_2 * cos_theta_2 * sin_psi_2 - sin_phi_2 * cos_psi_2 * sin_theta_2; } // (RPY rotation sequence) /* * Returns an euler sequence 3-2-1 (roll pitch yaw) that when applied takes you from body to world */ inline __host__ __device__ void Quat2EulerNWU(const float q[4], float& r, float& p, float& y) { r = atan2f(2.0f * q[3] * q[2] + 2.0f * q[0] * q[1], q[0] * q[0] + q[3] * q[3] - q[2] * q[2] - q[1] * q[1]); float temp = -2.0f * q[0] * q[2] + 2.0f * q[1] * q[3]; // Clamp value between -1 and 1 to prevent NaNs p = -asinf(fmaxf(fminf(1.0f, temp), -1.0f)); y = atan2f(2.0f * q[2] * q[1] + 2.0f * q[3] * q[0], q[0] * q[0] + q[1] * q[1] - q[2] * q[2] - q[3] * q[3]); } inline __host__ __device__ void Quat2DCM(const float q[4], float M[3][3]) { M[0][0] = SQ(q[0]) + SQ(q[1]) - SQ(q[2]) - SQ(q[3]); M[0][1] = 2 * (q[1] * q[2] - q[0] * q[3]); M[0][2] = 2 * (q[1] * q[3] + q[0] * q[2]); M[1][0] = 2 * (q[1] * q[2] + q[0] * q[3]); M[1][1] = SQ(q[0]) - SQ(q[1]) + SQ(q[2]) - SQ(q[3]); M[1][2] = 2 * (q[2] * q[3] - q[0] * q[1]); M[2][0] = 2 * (q[1] * q[3] - q[0] * q[2]); M[2][1] = 2 * (q[2] * q[3] + q[0] * q[1]); M[2][2] = SQ(q[0]) - SQ(q[1]) - SQ(q[2]) + SQ(q[3]); } inline __host__ __device__ void QuatSubtract(const Eigen::Quaternionf& q_1, const Eigen::Quaternionf& q_2, Eigen::Quaternionf& q_3) { q_3 = q_2 * q_1.inverse(); } inline __host__ __device__ void QuatMultiply(const Eigen::Quaternionf& q_1, const Eigen::Quaternionf& q_2, Eigen::Quaternionf& q_3, bool normalize = true) { q_3 = q_1 * q_2; if (normalize) { q_3.normalize(); } } inline __host__ __device__ void QuatInv(const 
Eigen::Quaternionf& q, Eigen::Quaternionf& q_f) { q_f = q.inverse(); } /* * rotates a point by the given quaternion */ inline __host__ __device__ void RotatePointByQuat(const float q[4], const float3& point, float3& output) { // converts the point into a quaternion format float pq[4] = { 0.0f, point.x, point.y, point.z }; float q_inv[4]; float temp[4]; QuatInv(q, q_inv); QuatMultiply(q, pq, temp, false); QuatMultiply(temp, q_inv, pq, false); // converts the quaternion back into a point output = make_float3(pq[1], pq[2], pq[3]); } /* * rotates a point by the given quaternion */ inline __host__ __device__ void RotatePointByQuat(const Eigen::Quaternionf& q, const float3& point, float3& output) { // converts the point into a quaternion format Eigen::Quaternionf q_inv, temp, pq; pq.w() = 0.0f; pq.x() = point.x; pq.y() = point.y; pq.z() = point.z; QuatInv(q, q_inv); QuatMultiply(q, pq, temp, false); QuatMultiply(temp, q_inv, pq, false); // converts the quaternion back into a point output = make_float3(pq.x(), pq.y(), pq.z()); } /* * rotates a point by the given quaternion */ inline __host__ __device__ void RotatePointByQuat(const Eigen::Quaternionf& q, const Eigen::Ref& point, Eigen::Ref output) { output = q * point; } /* * rotates a point by the given quaternion */ inline __host__ __device__ void RotatePointByQuat(const float q[4], float3& point) { RotatePointByQuat(q, point, point); } /* * rotates a point by the given quaternion */ inline __host__ __device__ void RotatePointByQuat(const Eigen::Quaternionf& q, float3& point) { RotatePointByQuat(q, point, point); } /* * rotates a point by the given quaternion */ inline __host__ __device__ void RotatePointByQuat(const Eigen::Quaternionf& q, Eigen::Ref point) { RotatePointByQuat(q, point, point); } /* * rotates a point by the given DCM Matrix. 
Transpose is used as M stored in row-major */ inline __host__ __device__ void RotatePointByDCM(const float M[3][3], const float3& point, float3& output, matrix::MAT_OP operation = matrix::MAT_OP::NONE) { if (operation == matrix::MAT_OP::NONE) { matrix::gemm1<3, 3, 1, p1::Parallel1Dir::NONE>((float*)M, (const float*)&point, (float*)&output, 1.0f, 0.0f, matrix::MAT_OP::TRANSPOSE); } else if (operation == matrix::MAT_OP::TRANSPOSE) { matrix::gemm1<3, 3, 1, p1::Parallel1Dir::NONE>((float*)M, (const float*)&point, (float*)&output, 1.0f, 0.0f, matrix::MAT_OP::NONE); } } /* * rotates a point by the given DCM Matrix */ inline __host__ __device__ void RotatePointByDCM(const Eigen::Ref& M, const float3& point, float3& output, matrix::MAT_OP operation = matrix::MAT_OP::NONE) { Eigen::Map point_eigen((float*)&point); Eigen::Map output_eigen((float*)&output); if (operation == matrix::MAT_OP::NONE) { output_eigen = M * point_eigen; } else if (operation == matrix::MAT_OP::TRANSPOSE) { output_eigen = M.transpose() * point_eigen; } } /* * rotates a point by the given DCM Matrix */ inline __host__ __device__ void RotatePointByDCM(const Eigen::Ref& M, const Eigen::Ref& point, Eigen::Ref output, matrix::MAT_OP operation = matrix::MAT_OP::NONE) { if (operation == matrix::MAT_OP::NONE) { output = M * point; } else if (operation == matrix::MAT_OP::TRANSPOSE) { output = M.transpose() * point; } } /* * The Euler rotation sequence is 3-2-1 (roll, pitch, yaw) from Body to World */ inline __host__ __device__ void Euler2QuatNWU(const float& r, const float& p, const float& y, Eigen::Quaternionf& q) { // double psi = clamp_radians(euler.roll); // double theta = clamp_radians(euler.pitch); // double phi = clamp_radians(euler.yaw); float sin_phi_2, cos_phi_2, sin_theta_2, cos_theta_2, sin_psi_2, cos_psi_2; #ifdef __CUDA_ARCH__ float r_norm = angle_utils::normalizeAngle(r * 0.5f); float p_norm = angle_utils::normalizeAngle(p * 0.5f); float y_norm = angle_utils::normalizeAngle(y * 0.5f); 
__sincosf(r_norm, &sin_phi_2, &cos_phi_2); __sincosf(p_norm, &sin_theta_2, &cos_theta_2); __sincosf(y_norm, &sin_psi_2, &cos_psi_2); #else sincosf(r * 0.5f, &sin_phi_2, &cos_phi_2); sincosf(p * 0.5f, &sin_theta_2, &cos_theta_2); sincosf(y * 0.5f, &sin_psi_2, &cos_psi_2); #endif q.w() = cos_phi_2 * cos_theta_2 * cos_psi_2 + sin_phi_2 * sin_theta_2 * sin_psi_2; q.x() = -cos_phi_2 * sin_theta_2 * sin_psi_2 + cos_theta_2 * cos_psi_2 * sin_phi_2; q.y() = cos_phi_2 * cos_psi_2 * sin_theta_2 + sin_phi_2 * cos_theta_2 * sin_psi_2; q.z() = cos_phi_2 * cos_theta_2 * sin_psi_2 - sin_phi_2 * cos_psi_2 * sin_theta_2; } /* * The Euler rotation sequence is 3-2-1 (roll, pitch, yaw) from Body to World */ inline __host__ __device__ void Euler2DCM_NWU(const float& r, const float& p, const float& y, float M[3][3]) { float sin_phi, cos_phi, sin_theta, cos_theta, sin_psi, cos_psi; #ifdef __CUDA_ARCH__ float r_norm = angle_utils::normalizeAngle(r); float p_norm = angle_utils::normalizeAngle(p); float y_norm = angle_utils::normalizeAngle(y); __sincosf(r_norm, &sin_phi, &cos_phi); __sincosf(p_norm, &sin_theta, &cos_theta); __sincosf(y_norm, &sin_psi, &cos_psi); #else sincosf(r, &sin_phi, &cos_phi); sincosf(p, &sin_theta, &cos_theta); sincosf(y, &sin_psi, &cos_psi); #endif M[0][0] = cos_theta * cos_psi; M[0][1] = sin_phi * sin_theta * cos_psi - cos_phi * sin_psi; M[0][2] = cos_phi * sin_theta * cos_psi + sin_phi * sin_psi; M[1][0] = cos_theta * sin_psi; M[1][1] = sin_phi * sin_theta * sin_psi + cos_phi * cos_psi; M[1][2] = cos_phi * sin_theta * sin_psi - sin_phi * cos_psi; M[2][0] = -sin_theta; M[2][1] = sin_phi * cos_theta; M[2][2] = cos_phi * cos_theta; } /* * The Euler rotation sequence is 3-2-1 (roll, pitch, yaw) from Body to World */ inline __host__ __device__ void Euler2DCM_NWU(const float& r, const float& p, const float& y, Eigen::Ref M) { float sin_phi, cos_phi, sin_theta, cos_theta, sin_psi, cos_psi; #ifdef __CUDA_ARCH__ float r_norm = angle_utils::normalizeAngle(r); float p_norm = 
angle_utils::normalizeAngle(p); float y_norm = angle_utils::normalizeAngle(y); __sincosf(r_norm, &sin_phi, &cos_phi); __sincosf(p_norm, &sin_theta, &cos_theta); __sincosf(y_norm, &sin_psi, &cos_psi); #else sincosf(r, &sin_phi, &cos_phi); sincosf(p, &sin_theta, &cos_theta); sincosf(y, &sin_psi, &cos_psi); #endif M(0, 0) = cos_theta * cos_psi; M(0, 1) = sin_phi * sin_theta * cos_psi - cos_phi * sin_psi; M(0, 2) = cos_phi * sin_theta * cos_psi + sin_phi * sin_psi; M(1, 0) = cos_theta * sin_psi; M(1, 1) = sin_phi * sin_theta * sin_psi + cos_phi * cos_psi; M(1, 2) = cos_phi * sin_theta * sin_psi - sin_phi * cos_psi; M(2, 0) = -sin_theta; M(2, 1) = sin_phi * cos_theta; M(2, 2) = cos_phi * cos_theta; } // (RPY rotation sequence) /* * Returns an euler sequence 3-2-1 (roll pitch yaw) that when applied takes you from body to world */ inline void __host__ __device__ Quat2EulerNWU(const Eigen::Quaternionf& q, float& r, float& p, float& y) { r = atan2f(2.0f * q.z() * q.y() + 2.0f * q.w() * q.x(), q.w() * q.w() + q.z() * q.z() - q.y() * q.y() - q.x() * q.x()); float temp = -2.0f * q.w() * q.y() + 2.0f * q.x() * q.z(); p = -asinf(fmaxf(-1.0f, fminf(temp, 1.0f))); y = atan2f(2.0f * q.y() * q.x() + 2.0f * q.z() * q.w(), q.w() * q.w() + q.x() * q.x() - q.y() * q.y() - q.z() * q.z()); } inline void Quat2DCM(const Eigen::Quaternionf& q, Eigen::Ref DCM) { DCM = q.toRotationMatrix(); } inline __device__ void omega2edot(const float p, const float q, const float r, const float e[4], float ed[4]) { ed[0] = 0.5f * (-p * e[1] - q * e[2] - r * e[3]); ed[1] = 0.5f * (p * e[0] - q * e[3] + r * e[2]); ed[2] = 0.5f * (p * e[3] + q * e[0] - r * e[1]); ed[3] = 0.5f * (-p * e[2] + q * e[1] + r * e[0]); } // Can't use Eigen::Ref on Quaternions inline void omega2edot(const float p, const float q, const float r, const Eigen::Quaternionf& e, Eigen::Quaternionf& ed) { ed.w() = 0.5f * (-p * e.x() - q * e.y() - r * e.z()); ed.x() = 0.5f * (p * e.w() - q * e.z() + r * e.y()); ed.y() = 0.5f * (p * e.z() + q 
* e.w() - r * e.x()); ed.z() = 0.5f * (-p * e.y() + q * e.x() + r * e.w()); } inline __host__ __device__ void bodyOffsetToWorldPoseQuat(const float3& offset, const float3& body_pose, const float q[4], float3& output) { // rotate body vector into world frame float3 rotated_offset = make_float3(offset.x, offset.y, offset.z); RotatePointByQuat(q, rotated_offset); // add offset to body pose output.x = body_pose.x + rotated_offset.x; output.y = body_pose.y + rotated_offset.y; output.z = body_pose.z + rotated_offset.z; } inline __host__ __device__ void bodyOffsetToWorldPoseQuat(const float3& offset, const float3& body_pose, const Eigen::Quaternionf& q, float3& output) { // rotate body vector into world frame float3 rotated_offset = make_float3(offset.x, offset.y, offset.z); RotatePointByQuat(q, rotated_offset); // add offset to body pose output.x = body_pose.x + rotated_offset.x; output.y = body_pose.y + rotated_offset.y; output.z = body_pose.z + rotated_offset.z; } inline __host__ __device__ void bodyOffsetToWorldPoseQuat(const Eigen::Ref& offset, const Eigen::Ref& body_pose, const Eigen::Quaternionf& q, Eigen::Ref output) { RotatePointByQuat(q, offset, output); // add offset to body pose output += body_pose; } inline __host__ __device__ void bodyOffsetToWorldPoseEuler(const float3& offset, const float3& body_pose, const float3& rotation, float3& output) { // convert RPY to Rotation Matrix float M[3][3]; math::Euler2DCM_NWU(rotation.x, rotation.y, rotation.z, M); RotatePointByDCM(M, offset, output); // add offset to body pose output.x += body_pose.x; output.y += body_pose.y; output.z += body_pose.z; } inline __host__ __device__ void bodyOffsetToWorldPoseEuler(const float3& offset, const float3& body_pose, const Eigen::Ref& rotation, float3& output) { // convert RPY to Rotation Matrix float M[3][3]; math::Euler2DCM_NWU(rotation.x(), rotation.y(), rotation.z(), M); RotatePointByDCM(M, offset, output); // add offset to body pose output.x += body_pose.x; output.y += 
body_pose.y; output.z += body_pose.z; } inline __host__ __device__ void bodyOffsetToWorldPoseEuler(const Eigen::Ref& offset, const Eigen::Ref& body_pose, const Eigen::Ref& rotation, Eigen::Ref output) { // convert RPY to Rotation Matrix Eigen::Matrix3f M; math::Euler2DCM_NWU(rotation.x(), rotation.y(), rotation.z(), M); RotatePointByDCM(M, offset, output); // add offset to body pose output += body_pose; } inline __host__ __device__ void bodyOffsetToWorldPoseDCM(const float3& offset, const float3& body_pose, const float rotation[3][3], float3& output) { RotatePointByDCM(rotation, offset, output); // add offset to body pose output.x += body_pose.x; output.y += body_pose.y; output.z += body_pose.z; } inline __host__ __device__ void bodyOffsetToWorldPoseDCM(const float3& offset, const float3& body_pose, const Eigen::Ref& rotation, float3& output) { RotatePointByDCM(rotation, offset, output); // add offset to body pose output.x += body_pose.x; output.y += body_pose.y; output.z += body_pose.z; } inline __host__ __device__ void bodyOffsetToWorldPoseDCM(const Eigen::Ref& offset, const Eigen::Ref& body_pose, const Eigen::Ref& rotation, Eigen::Ref output) { RotatePointByDCM(rotation, offset, output); // add offset to body pose output += body_pose; } inline __device__ __host__ Eigen::Matrix3f skewSymmetricMatrix(Eigen::Vector3f& v) { Eigen::Matrix3f m; m << 0.0f, -v[2], v[1], v[2], 0.0f, -v[0], -v[1], v[0], 0.0f; return m; } inline __host__ double timeDiffms(const std::chrono::steady_clock::time_point& end, const std::chrono::steady_clock::time_point& start) { return (end - start).count() / 1e6; } inline __host__ __device__ double normalCDF(double x) { return 0.5 * erfc(-x * M_SQRT1_2); } inline __host__ std::vector calculateCk(size_t steps) { // calculate params only when more steps are required static std::vector c_vec; if (c_vec.size() < steps) { c_vec.resize(steps, 0); c_vec[0] = 1.0; for (size_t k = 1; k <= steps; k++) { double c_k = 0; for (size_t m = 0; m < k; m++) { 
c_k += c_vec[m] * c_vec[k - 1 - m] / ((m + 1.0) * (2.0 * m + 1.0)); } c_vec[k] = c_k; } } return c_vec; } /** * Implementation based on * https://en.wikipedia.org/wiki/Error_function#Inverse_functions and Horner's * method */ inline __host__ double inverseErrorFunc(double x, int num_precision = 5) { std::vector c_k = calculateCk(num_precision); double output = 0; for (int i = num_precision; i > 0; i--) { output = (c_k[i] / (2.0 * i + 1.0) + output) * x * x * M_PI / 4.0; } output = (output + c_k[0]) * x / M_2_SQRTPI; return output; } inline __host__ double inverseErrorFuncSlow(double x, int num_precision = 5) { std::vector c_k = calculateCk(num_precision); double slow_output = 0; for (int i = 0; i <= num_precision; i++) { slow_output += c_k[i] / (2.0 * i + 1.0) * std::pow(x / M_2_SQRTPI, 2 * i + 1); } return slow_output; } /** * https://en.wikipedia.org/wiki/Normal_distribution#Quantile_function */ inline __host__ double inverseNormalCDF(double x, int num_precision = 10) { return M_SQRT2 * inverseErrorFunc(2.0 * x - 1.0, num_precision); } inline __host__ double inverseNormalCDFSlow(double x, int num_precision = 10) { return M_SQRT2 * inverseErrorFuncSlow(2.0 * x - 1.0, num_precision); } inline __device__ __host__ float clamp(float value, float min, float max) { return fminf(fmaxf(value, min), max); } inline __device__ __host__ float sign(float value) { return value >= 0 ? 
1 : -1; } } // namespace math } // namespace mppi #endif // MATH_UTILS_H_ ================================================ FILE: include/mppi/utils/matrix_mult_utils.cuh ================================================ // // Created by Bogdan on 8/20/23 // #pragma once #include namespace mppi { namespace matrix_multiplication { /** * Utility Functions **/ inline __host__ __device__ int2 const unravelColumnMajor(const int index, const int num_rows) { int col = index / num_rows; int row = index % num_rows; return make_int2(row, col); } inline __host__ __device__ int2 const unravelRowMajor(const int index, const int num_cols) { int row = index / num_cols; int col = index % num_cols; return make_int2(row, col); } inline __host__ __device__ constexpr int columnMajorIndex(const int row, const int col, const int num_rows) { return col * num_rows + row; } inline __host__ __device__ constexpr int rowMajorIndex(const int row, const int col, const int num_cols) { return row * num_cols + col; } /** * Utility Classes **/ enum class MAT_OP : int { NONE = 0, TRANSPOSE }; template class devMatrix { public: T* data = nullptr; static constexpr int rows = M; static constexpr int cols = N; devMatrix(T* n_data) { data = n_data; }; T operator()(const int i, const int j) const { return data[columnMajorIndex(i, j, rows)]; } }; /** * @brief GEneral Matrix Multiplication * Conducts the operation * C = alpha * op(A) * op(B) + beta * C * on matrices of type T * TODO: Add transpose options like cuBLAS GEMM * Inputs: * op(A) - T-type column-major matrix of size M * K, stored in shared/global mem * op(B) - T-type column-major matrix of size K * N, stored in shared/global mem * alpha - T-type to multiply A * B * beta - T-type multipling C * A_OP - whether or not you should use A or A transpose * B_OP - whether or not you should use B or B transpose * Outputs: * C - float column-major matrix of size M * N, stored in shared/global mem * */ template inline __device__ __host__ void gemm1(const T* A, 
const T* B, T* C, const T alpha = 1, const T beta = 0, const MAT_OP A_OP = MAT_OP::NONE, const MAT_OP B_OP = MAT_OP::NONE) { int parallel_index; int parallel_step; int p, k; p1::getParallel1DIndex(parallel_index, parallel_step); int2 mn; const bool all_stride = (A_OP == MAT_OP::NONE) && (B_OP == MAT_OP::TRANSPOSE); for (p = parallel_index; p < M * N; p += parallel_step) { T accumulator = 0; mn = unravelColumnMajor(p, M); if (K % 4 == 0 && sizeof(type4) <= 16 && !all_stride) { // Fetch 4 B values using single load memory operator of up to 128 bits since B is contiguous wrt k __UNROLL(10) for (k = 0; k < K; k += 4) { if (A_OP == MAT_OP::NONE && B_OP == MAT_OP::NONE) { const type4 b_tmp = reinterpret_cast*>(&B[columnMajorIndex(k, mn.y, K)])[0]; accumulator += A[columnMajorIndex(mn.x, k + 0, M)] * b_tmp[0]; accumulator += A[columnMajorIndex(mn.x, k + 1, M)] * b_tmp[1]; accumulator += A[columnMajorIndex(mn.x, k + 2, M)] * b_tmp[2]; accumulator += A[columnMajorIndex(mn.x, k + 3, M)] * b_tmp[3]; } else if (A_OP == MAT_OP::TRANSPOSE && B_OP == MAT_OP::NONE) { const type4 b_tmp = reinterpret_cast*>(&B[columnMajorIndex(k, mn.y, K)])[0]; const type4 a_tmp = reinterpret_cast*>(&A[rowMajorIndex(mn.x, k, K)])[0]; accumulator += a_tmp[0] * b_tmp[0]; accumulator += a_tmp[1] * b_tmp[1]; accumulator += a_tmp[2] * b_tmp[2]; accumulator += a_tmp[3] * b_tmp[3]; } else if (A_OP == MAT_OP::TRANSPOSE && B_OP == MAT_OP::TRANSPOSE) { // const type4 b_tmp = reinterpret_cast*>(&B[columnMajorIndex(k, mn.y, K)])[0]; const type4 a_tmp = reinterpret_cast*>(&A[rowMajorIndex(mn.x, k, K)])[0]; accumulator += a_tmp[0] * B[rowMajorIndex(k + 0, mn.y, N)]; accumulator += a_tmp[1] * B[rowMajorIndex(k + 1, mn.y, N)]; accumulator += a_tmp[2] * B[rowMajorIndex(k + 2, mn.y, N)]; accumulator += a_tmp[3] * B[rowMajorIndex(k + 3, mn.y, N)]; } } } else if (K % 2 == 0 && sizeof(type2) <= 16 && !all_stride) { // Fetch 2 B values using single load memory operator of up to 128 bits since B is contiguous wrt k 
__UNROLL(10) for (k = 0; k < K; k += 2) { if (A_OP == MAT_OP::NONE && B_OP == MAT_OP::NONE) { const type2 b_tmp = reinterpret_cast*>(&B[columnMajorIndex(k, mn.y, K)])[0]; accumulator += A[columnMajorIndex(mn.x, k + 0, M)] * b_tmp[0]; accumulator += A[columnMajorIndex(mn.x, k + 1, M)] * b_tmp[1]; } else if (A_OP == MAT_OP::TRANSPOSE && B_OP == MAT_OP::NONE) { const type2 b_tmp = reinterpret_cast*>(&B[columnMajorIndex(k, mn.y, K)])[0]; const type2 a_tmp = reinterpret_cast*>(&A[rowMajorIndex(mn.x, k, K)])[0]; accumulator += a_tmp[0] * b_tmp[0]; accumulator += a_tmp[1] * b_tmp[1]; } else if (A_OP == MAT_OP::TRANSPOSE && B_OP == MAT_OP::TRANSPOSE) { const type2 a_tmp = reinterpret_cast*>(&A[rowMajorIndex(mn.x, k, K)])[0]; accumulator += a_tmp[0] * B[rowMajorIndex(k + 0, mn.y, N)]; accumulator += a_tmp[1] * B[rowMajorIndex(k + 1, mn.y, N)]; } } } else { // Either K is odd or sizeof(T) is large enough that T a; T b; __UNROLL(10) for (k = 0; k < K; k++) { if (A_OP == MAT_OP::NONE && B_OP == MAT_OP::NONE) { a = A[columnMajorIndex(mn.x, k, M)]; b = B[columnMajorIndex(k, mn.y, K)]; } else if (A_OP == MAT_OP::TRANSPOSE && B_OP == MAT_OP::NONE) { a = A[rowMajorIndex(mn.x, k, K)]; b = B[columnMajorIndex(k, mn.y, K)]; } else if (A_OP == MAT_OP::TRANSPOSE && B_OP == MAT_OP::TRANSPOSE) { a = A[rowMajorIndex(mn.x, k, K)]; b = B[rowMajorIndex(k, mn.y, N)]; } else { a = A[columnMajorIndex(mn.x, k, M)]; b = B[rowMajorIndex(k, mn.y, N)]; } accumulator += a * b; } } if (beta == 0) { // Special case to remove extraneous memory accesses C[p] = alpha * accumulator; } else { C[p] = alpha * accumulator + beta * C[p]; } } } /** * @brief GEneral Matrix Multiplication * Conducts the operation * C = alpha * A * B + beta * C * using two parallelization directions * TODO: Add transpose options like cuBLAS GEMM * Inputs: * A - float column-major matrix of size M * K, stored in shared/global mem * B - float column-major matrix of size K * N, stored in shared/global mem * alpha - float to multiply A * 
B * beta - float multipling C * Outputs: * C - float column-major matrix of size M * N, stored in shared/global mem * */ template inline __device__ void gemm2(const float* A, const float* B, float* C, const float alpha = 1.0f, const float beta = 0.0f) { int m_ind_start; int m_ind_size; int n_ind_start; int n_ind_size; p2::getParallel2DIndex(m_ind_start, n_ind_start, m_ind_size, n_ind_size); for (int m = m_ind_start; m < M; m += m_ind_size) { for (int n = n_ind_start; n < N; n += n_ind_size) { float accumulator = 0; __UNROLL(10) for (int k = 0; k < K; k++) { accumulator += A[columnMajorIndex(m, k, M)] * B[columnMajorIndex(k, n, K)]; } C[columnMajorIndex(m, n, M)] = alpha * accumulator + beta * C[columnMajorIndex(m, n, M)]; } } } /** * @brief Perform Guass Jordan Elimination in place on columan-major MxN matrix A. * Useful for solving Cx = b where A = [C | b] as well as inverting matrices. * * @tparam M - number of rows * @tparam N - number of cols * @tparam P_DIR - Parallelization axes for on the GPU * @tparam T - type of data in A * @param A - column-major matrix of type T with M rows and N cols * * @return reduced row echelon form of A is returned in A. */ template inline __host__ __device__ void GaussJordanElimination(T* A) { int p_index, step; p1::getParallel1DIndex(p_index, step); int row, col, offset = 0; T accumulator; for (int i = 0; i < M; i++) { // Check if row-swap is needed row = i; while (A[columnMajorIndex(row, i + offset, M)] == 0) { row++; if (row == M) { // column is all zeros, need to move on to the next column offset++; if (i + offset >= N) { // Ran out of columns to check. 
return; } row = i; } } // swap rows if needed (row != i) for (col = i + p_index; row != i && col < N; col += step) { accumulator = A[columnMajorIndex(row, col, M)]; A[columnMajorIndex(row, col, M)] = A[columnMajorIndex(i, col, M)]; A[columnMajorIndex(i, col, M)] = accumulator; } // normalize the current row accumulator = 1.0f / A[columnMajorIndex(i, i + offset, M)]; for (col = i + offset + p_index; col < N; col += step) { A[columnMajorIndex(i, col, M)] *= accumulator; } #ifdef __CUDA_ARCH__ __syncthreads(); #endif // Now eliminate pivot from both rows above and below for (row = p_index; row < M; row += step) { if (row == i) { continue; } accumulator = -A[columnMajorIndex(row, i + offset, M)]; for (col = i + offset; col < N; col++) { A[columnMajorIndex(row, col, M)] += accumulator * A[columnMajorIndex(i, col, M)]; } } #ifdef __CUDA_ARCH__ __syncthreads(); #endif } } template void matMult1(const devMatrix& A, const devMatrix& B, devMatrix& C) { gemm1(A.data, B.data, C.data); } } // namespace matrix_multiplication } // namespace mppi ================================================ FILE: include/mppi/utils/nn_helpers/fnn_helper.cu ================================================ // // Created by jason on 8/18/22. 
// #include "fnn_helper.cuh" template FNNHelper::FNNHelper(const std::vector& layers, cudaStream_t stream) : Managed(stream) { setupMemory(layers); } template FNNHelper::FNNHelper(std::string model_path, cudaStream_t stream) : Managed(stream) { loadParams(model_path); } template FNNHelper::FNNHelper(const cnpy::npz_t& param_dict, cudaStream_t stream) : Managed(stream) { loadParams(param_dict); } template FNNHelper::FNNHelper(const cnpy::npz_t& param_dict, std::string prefix, cudaStream_t stream) : Managed(stream) { loadParams(prefix, param_dict); } template FNNHelper::~FNNHelper() { if (this->GPUMemStatus_) { freeCudaMem(); } delete theta_; delete stride_idcs_; delete net_structure_; } template void FNNHelper::loadParams(const std::string& model_path) { int i, j, k; std::string bias_name = ""; std::string weight_name = ""; if (!fileExists(model_path)) { std::cerr << "Could not load neural net model at path: " << model_path.c_str(); exit(-1); } cnpy::npz_t param_dict = cnpy::npz_load(model_path); loadParams(param_dict); } template void FNNHelper::loadParams(const cnpy::npz_t& param_dict) { loadParams("", param_dict); } template void FNNHelper::loadParams(std::string prefix, const cnpy::npz_t& param_dict, bool add_slash) { int i, j, k; std::string bias_name = ""; std::string weight_name = ""; if (add_slash && !prefix.empty() && *prefix.rbegin() != '/') { prefix.append("/"); } int counter = 1; bias_name = prefix + "dynamics_b" + std::to_string(counter); weight_name = prefix + "dynamics_W" + std::to_string(counter); assert(static_cast(param_dict.at(weight_name).num_vals) % static_cast(param_dict.at(bias_name).num_vals) == 0); std::vector layers = { static_cast(param_dict.at(weight_name).num_vals) / static_cast(param_dict.at(bias_name).num_vals) }; while (param_dict.find(bias_name) != param_dict.end()) { layers.push_back(static_cast(param_dict.at(bias_name).num_vals)); counter += 1; bias_name = prefix + "dynamics_b" + std::to_string(counter); } // TODO more asserts for 
proper sizing of weights // TODO only setup memory if it is different than stored setupMemory(layers); for (i = 0; i < NUM_LAYERS - 1; i++) { // NN index from 1 bias_name = prefix + "dynamics_b" + std::to_string(i + 1); weight_name = prefix + "dynamics_W" + std::to_string(i + 1); cnpy::NpyArray weight_i_raw = param_dict.at(weight_name); cnpy::NpyArray bias_i_raw = param_dict.at(bias_name); double* weight_i = weight_i_raw.data(); double* bias_i = bias_i_raw.data(); // copy over the weights for (j = 0; j < net_structure_[i + 1]; j++) { for (k = 0; k < net_structure_[i]; k++) { theta_[stride_idcs_[2 * i] + j * net_structure_[i] + k] = weight_i[j * net_structure_[i] + k]; } } // copy over the bias for (j = 0; j < net_structure_[i + 1]; j++) { theta_[stride_idcs_[2 * i + 1] + j] = bias_i[j]; } } for (int i = 0; i < NUM_PARAMS; i++) { assert(isfinite(theta_[i])); } changed_weights_ = true; // Save parameters to GPU memory paramsToDevice(); } template void FNNHelper::setupMemory(const std::vector& layers) { assert(layers.size() >= 2); bool setupGPU = this->GPUMemStatus_; // TODO should see if this is different or not if (setupGPU) { freeCudaMem(); } NUM_LAYERS = layers.size(); LARGEST_LAYER = layers[0]; STRIDE_SIZE = (NUM_LAYERS - 1) * 2; for (int i = 1; i < layers.size(); i++) { // the +1 here is from the bias NUM_PARAMS += (layers[i - 1] + 1) * layers[i]; if (layers[i] > LARGEST_LAYER) { LARGEST_LAYER = layers[i]; } } INPUT_DIM = layers.front(); OUTPUT_DIM = layers.back(); // TODO allocate more memory so we can copy as float4's if (theta_ != nullptr) { delete theta_; delete net_structure_; delete stride_idcs_; } theta_ = (float*)::operator new(sizeof(float) * NUM_PARAMS); net_structure_ = (int*)::operator new(sizeof(int) * NUM_LAYERS); stride_idcs_ = (int*)::operator new(sizeof(int) * STRIDE_SIZE); memset(theta_, 0.0, sizeof(float) * NUM_PARAMS); memset(net_structure_, -1, sizeof(int) * NUM_LAYERS); memset(stride_idcs_, -1, sizeof(int) * STRIDE_SIZE); for (int i = 0; i 
< NUM_LAYERS; i++) { net_structure_[i] = layers[i]; } int stride = 0; for (int i = 0; i < NUM_LAYERS - 1; i++) { stride_idcs_[2 * i] = stride; stride += net_structure_[i + 1] * net_structure_[i]; stride_idcs_[2 * i + 1] = stride; stride += net_structure_[i + 1]; } PARAM_SIZE = (NUM_LAYERS + NUM_PARAMS + STRIDE_SIZE) * sizeof(float); this->SHARED_MEM_REQUEST_GRD_BYTES = mppi::math::int_multiple_const(PARAM_SIZE * USE_SHARED, sizeof(float4)); this->SHARED_MEM_REQUEST_BLK_BYTES = mppi::math::int_multiple_const(2 * LARGEST_LAYER * sizeof(float), sizeof(float4)); weighted_in_.resize(NUM_LAYERS - 1); // zeros out every matrix for (int i = 1; i < NUM_LAYERS; i++) { weighted_in_[i - 1] = Eigen::MatrixXf::Zero(net_structure_[i], 1); } if (setupGPU) { GPUSetup(); } } template void FNNHelper::updateModel(const std::vector& data) { assert(data.size() == NUM_PARAMS); std::vector description(NUM_LAYERS); for (int i = 0; i < NUM_LAYERS; i++) { description[i] = net_structure_[i]; } updateModel(description, data); } template void FNNHelper::updateModel(const std::vector& description, const std::vector& data) { // TODO can change the description of the network for (int i = 0; i < description.size(); i++) { if (description[i] != net_structure_[i]) { throw std::invalid_argument("Invalid model trying to to be set for NN"); } } for (int i = 0; i < NUM_LAYERS - 1; i++) { for (int j = 0; j < net_structure_[i + 1]; j++) { for (int k = 0; k < net_structure_[i]; k++) { theta_[stride_idcs_[2 * i] + j * net_structure_[i] + k] = data[stride_idcs_[2 * i] + j * net_structure_[i] + k]; } } } for (int i = 0; i < NUM_LAYERS - 1; i++) { for (int j = 0; j < net_structure_[i + 1]; j++) { theta_[stride_idcs_[2 * i + 1] + j] = data[stride_idcs_[2 * i + 1] + j]; } } for (int i = 0; i < NUM_PARAMS; i++) { assert(isfinite(theta_[i])); } changed_weights_ = true; if (this->GPUMemStatus_) { paramsToDevice(); } } template void FNNHelper::freeCudaMem() { if (this->GPUMemStatus_) { 
HANDLE_ERROR(cudaFree(weights_d_)); HANDLE_ERROR(cudaFree(network_d_)); this->GPUMemStatus_ = false; } } template void FNNHelper::GPUSetup() { if (!this->GPUMemStatus_) { network_d_ = Managed::GPUSetup>(this); HANDLE_ERROR(cudaMalloc((void**)&(this->weights_d_), PARAM_SIZE)); HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->weights_d_), &(weights_d_), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->theta_), &(weights_d_), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); float* incr_ptr = (weights_d_ + NUM_PARAMS); HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->stride_idcs_), &(incr_ptr), sizeof(int*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr = (weights_d_ + NUM_PARAMS + STRIDE_SIZE); HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->net_structure_), &(incr_ptr), sizeof(int*), cudaMemcpyHostToDevice, this->stream_)); changed_weights_ = true; paramsToDevice(); } else { this->logger_->debug("FNN GPU Memory Already set\n"); } } template void FNNHelper::paramsToDevice() { if (this->GPUMemStatus_ && changed_weights_) { HANDLE_ERROR( cudaMemcpyAsync(this->weights_d_, theta_, NUM_PARAMS * sizeof(float), cudaMemcpyHostToDevice, this->stream_)); float* incr_ptr = weights_d_ + NUM_PARAMS; HANDLE_ERROR( cudaMemcpyAsync(incr_ptr, stride_idcs_, STRIDE_SIZE * sizeof(int), cudaMemcpyHostToDevice, this->stream_)); incr_ptr = weights_d_ + NUM_PARAMS + STRIDE_SIZE; HANDLE_ERROR( cudaMemcpyAsync(incr_ptr, net_structure_, NUM_LAYERS * sizeof(int), cudaMemcpyHostToDevice, this->stream_)); changed_weights_ = false; } } template bool FNNHelper::computeGrad(const Eigen::Ref& input, Eigen::Ref A) { // compute forward to see gradient values Eigen::VectorXf output = getZeroOutputVector(); forward(input, output); return computeGrad(A); } template bool FNNHelper::computeGrad(Eigen::Ref A) { // Start backprop Eigen::MatrixXf ip_delta = Eigen::MatrixXf::Identity(OUTPUT_DIM, OUTPUT_DIM); // Main backprop loop for (int i = 
NUM_LAYERS - 2; i > 0; i--) { Eigen::MatrixXf zp = weighted_in_[i - 1]; for (int j = 0; j < net_structure_[i]; j++) { zp(j) = mppi::nn::tanh_deriv(zp(j)); } Eigen::Map> cur_weights( theta_ + stride_idcs_[2 * i], net_structure_[i + 1], net_structure_[i]); ip_delta = ((cur_weights).transpose() * ip_delta).eval(); for (int j = 0; j < OUTPUT_DIM; j++) { ip_delta.col(j) = ip_delta.col(j).array() * zp.array(); } } Eigen::Map> cur_weights( theta_ + stride_idcs_[0], net_structure_[1], net_structure_[0]); ip_delta = (((cur_weights).transpose()) * ip_delta).transpose().eval(); for (int i = 0; i < INPUT_DIM; i++) { A.col(i) = ip_delta.col(i); } return true; } template void FNNHelper::forward(const Eigen::Ref& input, Eigen::Ref output) { int i, j; Eigen::MatrixXf acts = input; for (i = 0; i < NUM_LAYERS - 1; i++) { Eigen::Map> cur_weights( theta_ + stride_idcs_[2 * i], net_structure_[i + 1], net_structure_[i]); Eigen::Map> cur_bias( theta_ + stride_idcs_[2 * i + 1], net_structure_[i + 1], 1); weighted_in_[i] = (cur_weights * acts + cur_bias).eval(); acts = Eigen::MatrixXf::Zero(net_structure_[i + 1], 1); if (i < NUM_LAYERS - 2) { // Last layer doesn't apply any non-linearity for (j = 0; j < net_structure_[i + 1]; j++) { acts(j) = mppi::nn::tanh((weighted_in_[i])(j)); // Nonlinear component. 
} } else { for (j = 0; j < net_structure_[i + 1]; j++) { acts(j) = (weighted_in_[i])(j); } } } output = acts; } template __device__ float* FNNHelper::initialize(float* theta_s) { float* new_theta_s = theta_s; if (USE_SHARED) { // if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 && USE_SHARED != 0) // { // memcpy(theta_s, theta_, NUM_PARAMS * sizeof(float)); // new_theta_s += NUM_PARAMS; // memcpy(new_theta_s, stride_idcs_, STRIDE_SIZE * sizeof(int)); // new_theta_s += STRIDE_SIZE; // memcpy(new_theta_s, net_structure_, NUM_LAYERS * sizeof(int)); // new_theta_s += NUM_LAYERS; // } for (int i = blockDim.x * threadIdx.y + threadIdx.x; i < NUM_PARAMS; i += blockDim.x * blockDim.y) { theta_s[i] = theta_[i]; } int* stride_idcs = (int*)(theta_s + NUM_PARAMS); for (int i = threadIdx.y; i < STRIDE_SIZE; i += blockDim.y) { stride_idcs[i] = stride_idcs_[i]; } int* net_structure = (int*)(theta_s + NUM_PARAMS + STRIDE_SIZE); for (int i = threadIdx.y; i < NUM_LAYERS; i += blockDim.y) { net_structure[i] = net_structure_[i]; } __syncthreads(); } return new_theta_s; } template __device__ float* FNNHelper::forward(float* input, float* theta_s, float* curr_act) { float* next_act; float* tmp_act; float tmp; float* W; float* b; uint tdy = threadIdx.y; uint i, j, k; uint tdx = threadIdx.x; uint tdz = threadIdx.z; float* theta = theta_; int* stride_idcs = stride_idcs_; int* net_structure = net_structure_; if (USE_SHARED != 0) { theta = theta_s; stride_idcs = (int*)(theta_s + NUM_PARAMS); net_structure = (int*)(theta_s + NUM_PARAMS + STRIDE_SIZE); } next_act = &curr_act[LARGEST_LAYER]; // iterate through the part of the state that should be an input to the NN if (input != nullptr) { for (i = tdy; i < INPUT_DIM; i += blockDim.y) { curr_act[i] = input[i]; } __syncthreads(); } // iterate through each layer for (i = 0; i < NUM_LAYERS - 1; i++) { W = &theta[stride_idcs[2 * i]]; // weights b = &theta[stride_idcs[2 * i + 1]]; // biases // for first non input layer until last layer 
this thread deals with // calculates the next activation based on current for (j = tdy; j < net_structure[i + 1]; j += blockDim.y) { tmp = 0; // apply each neuron activation from current layer for (k = 0; k < net_structure[i]; k++) { // No atomic add necessary. tmp += W[j * net_structure[i] + k] * curr_act[k]; } // add bias from next layer and neuron tmp += b[j]; if (i < NUM_LAYERS - 2) { tmp = mppi::nn::tanh(tmp); } next_act[j] = tmp; } // Swap the two pointers tmp_act = curr_act; curr_act = next_act; next_act = tmp_act; __syncthreads(); } return curr_act; } template __device__ float* FNNHelper::forward(float* input, float* theta_s) { uint tdx = threadIdx.x; uint tdz = threadIdx.z; float* curr_act = &theta_s[SHARED_MEM_REQUEST_GRD_BYTES / sizeof(float) + (2 * LARGEST_LAYER) * (blockDim.x * tdz + tdx)]; return forward(input, theta_s, curr_act); } template __device__ float* FNNHelper::getInputLocation(float* theta_s) { uint tdx = threadIdx.x; uint tdz = threadIdx.z; return theta_s + SHARED_MEM_REQUEST_GRD_BYTES / sizeof(float) + (2 * LARGEST_LAYER) * (blockDim.x * tdz + tdx); } ================================================ FILE: include/mppi/utils/nn_helpers/fnn_helper.cuh ================================================ #ifndef MPPIGENERIC_FNN_HELPER_CUH #define MPPIGENERIC_FNN_HELPER_CUH #include "meta_math.h" #include #include #include #include #include #include template class FNNHelper : public Managed { public: // EIGEN_MAKE_ALIGNED_OPERATOR_NEW explicit FNNHelper(const std::vector& layers, cudaStream_t stream = 0); explicit FNNHelper(std::string, cudaStream_t stream = 0); explicit FNNHelper(const cnpy::npz_t& param_dict, cudaStream_t stream = 0); explicit FNNHelper(const cnpy::npz_t& param_dict, std::string prefix, cudaStream_t stream = 0); ~FNNHelper(); void loadParams(const std::string& model_path); void loadParams(const cnpy::npz_t& npz); void loadParams(std::string prefix, const cnpy::npz_t& npz, bool add_slash = true); void GPUSetup(); void 
updateModel(const std::vector& description, const std::vector& data); void updateModel(const std::vector& data); void freeCudaMem(); void paramsToDevice(); bool computeGrad(Eigen::Ref A); bool computeGrad(const Eigen::Ref& input, Eigen::Ref A); void forward(const Eigen::Ref& input, Eigen::Ref output); __device__ float* initialize(float* theta_s); __device__ float* forward(float* input, float* theta_s); __device__ float* forward(float* input, float* theta_s, float* curr_act); __host__ __device__ int* getNetStructurePtr() const { return net_structure_; } __host__ __device__ int* getStrideIdcsPtr() const { return stride_idcs_; } __host__ __device__ float* getThetaPtr() const { return theta_; } __host__ __device__ int getNumLayers() const { return this->NUM_LAYERS; } __host__ __device__ int getNumParams() const { return this->NUM_PARAMS; } __host__ __device__ int getLargestLayer() const { return this->LARGEST_LAYER; } __host__ __device__ int getStrideSize() const { return this->STRIDE_SIZE; } Eigen::VectorXf getZeroInputVector() { return Eigen::VectorXf::Zero(INPUT_DIM); } Eigen::VectorXf getZeroOutputVector() { return Eigen::VectorXf::Zero(OUTPUT_DIM); } Eigen::MatrixXf getZeroJacobianMatrix() { return Eigen::MatrixXf::Zero(OUTPUT_DIM, INPUT_DIM); } __device__ __host__ int getInputDim() const { return INPUT_DIM; } __host__ __device__ int getOutputDim() const { return OUTPUT_DIM; } __device__ float* getInputLocation(float* theta_s); void setAllWeights(float input) { std::vector vals(NUM_PARAMS); std::fill(vals.begin(), vals.end(), input); updateModel(vals); } // device pointer, null on the device FNNHelper* network_d_ = nullptr; float* weights_d_ = nullptr; private: int NUM_LAYERS = 0; ///< Total number of layers (including in/out layer) int LARGEST_LAYER = 0; ///< Number of neurons in the largest layer(including in/out neurons) int NUM_PARAMS = 0; ///< Total number of model parameters; int PARAM_SIZE = 0; ///< Maximum size of the parameters stored in shared memory int 
STRIDE_SIZE = 0; int INPUT_DIM = 0; int OUTPUT_DIM = 0; // packed by all weights that connect layer 1 to layer 2 neuron 1, bias for all connections from layer 1 to layer 2 // then layer 2 neuron 2, etc float* theta_ = nullptr; // index into theta for weights and bias (layer 0 weights start, no bias in input layer, layer 1 weights start, layer1 // bias start... int* stride_idcs_ = nullptr; //[neurons in layer 1, neurons in layer 2, ...] int* net_structure_ = nullptr; std::atomic changed_weights_ = { false }; std::vector weighted_in_; /// eigen aligned does not matter here since we have a variable size eigen matrix void setupMemory(const std::vector& layers); }; #if __CUDACC__ #include "fnn_helper.cu" #endif #endif // MPPIGENERIC_FNN_HELPER_CUH ================================================ FILE: include/mppi/utils/nn_helpers/lstm_helper.cu ================================================ // // Created by jason on 8/20/22. // #include "lstm_helper.cuh" template LSTMHelper::LSTMHelper(LSTMConfig& config, cudaStream_t stream) : LSTMHelper(config.input_dim, config.hidden_dim, config.output_layers, stream) { } template LSTMHelper::LSTMHelper(int input_dim, int hidden_dim, std::vector& output_layers, cudaStream_t stream) : Managed(stream) { output_nn_ = new FNNHelper(output_layers, this->stream_); setupMemory(input_dim, hidden_dim); } template LSTMHelper::LSTMHelper(std::string path, cudaStream_t stream) : Managed(stream) { loadParams(path); } template LSTMHelper::LSTMHelper(const cnpy::npz_t& param_dict, std::string prefix, bool add_slash, cudaStream_t stream) : Managed(stream) { loadParams(prefix, param_dict, add_slash); } template void LSTMHelper::setupMemory(int input_dim, int hidden_dim) { HIDDEN_DIM = hidden_dim; INPUT_DIM = input_dim; OUTPUT_DIM = output_nn_->getOutputDim(); assert(output_nn_->getInputDim() == INPUT_DIM + HIDDEN_DIM); bool setupGPU = this->GPUMemStatus_; if (setupGPU) { freeCudaMem(); } HIDDEN_HIDDEN_SIZE = HIDDEN_DIM * HIDDEN_DIM; 
INPUT_HIDDEN_SIZE = HIDDEN_DIM * INPUT_DIM; // The initial cell and hidden does not need to be stored in shared memory LSTM_PARAM_SIZE_BYTES = (HIDDEN_DIM * 4 + HIDDEN_HIDDEN_SIZE * 4 + INPUT_HIDDEN_SIZE * 4) * sizeof(float); LSTM_SHARED_MEM_GRD_BYTES = mppi::math::int_multiple_const(LSTM_PARAM_SIZE_BYTES * USE_SHARED, sizeof(float4)); SHARED_MEM_REQUEST_GRD_BYTES = output_nn_->getGrdSharedSizeBytes() + LSTM_SHARED_MEM_GRD_BYTES; SHARED_MEM_REQUEST_BLK_BYTES = output_nn_->getBlkSharedSizeBytes() + mppi::math::int_multiple_const((2 * HIDDEN_DIM) * sizeof(float), sizeof(float4)); hidden_state_ = Eigen::VectorXf(HIDDEN_DIM); hidden_state_.setZero(); cell_state_ = Eigen::VectorXf(HIDDEN_DIM); cell_state_.setZero(); if (weights_ != nullptr) { delete weights_; } // allocate all the memory dynamically weights_ = (float*)::operator new(LSTM_PARAM_SIZE_BYTES + 2 * HIDDEN_DIM * sizeof(float)); W_im_ = weights_; W_fm_ = weights_ + HIDDEN_HIDDEN_SIZE; W_om_ = weights_ + 2 * HIDDEN_HIDDEN_SIZE; W_cm_ = weights_ + 3 * HIDDEN_HIDDEN_SIZE; W_ii_ = weights_ + 4 * HIDDEN_HIDDEN_SIZE; W_fi_ = weights_ + 4 * HIDDEN_HIDDEN_SIZE + INPUT_HIDDEN_SIZE; W_oi_ = weights_ + 4 * HIDDEN_HIDDEN_SIZE + 2 * INPUT_HIDDEN_SIZE; W_ci_ = weights_ + 4 * HIDDEN_HIDDEN_SIZE + 3 * INPUT_HIDDEN_SIZE; b_i_ = weights_ + 4 * HIDDEN_HIDDEN_SIZE + 4 * INPUT_HIDDEN_SIZE; b_f_ = weights_ + 4 * HIDDEN_HIDDEN_SIZE + 4 * INPUT_HIDDEN_SIZE + HIDDEN_DIM; b_o_ = weights_ + 4 * HIDDEN_HIDDEN_SIZE + 4 * INPUT_HIDDEN_SIZE + 2 * HIDDEN_DIM; b_c_ = weights_ + 4 * HIDDEN_HIDDEN_SIZE + 4 * INPUT_HIDDEN_SIZE + 3 * HIDDEN_DIM; initial_hidden_ = weights_ + 4 * HIDDEN_HIDDEN_SIZE + 4 * INPUT_HIDDEN_SIZE + 4 * HIDDEN_DIM; initial_cell_ = weights_ + 4 * HIDDEN_HIDDEN_SIZE + 4 * INPUT_HIDDEN_SIZE + 5 * HIDDEN_DIM; memset(weights_, 0.0f, LSTM_PARAM_SIZE_BYTES + HIDDEN_DIM * 2 * sizeof(float)); if (setupGPU) { GPUSetup(); } } template void LSTMHelper::updateLSTMInitialStates(const Eigen::Ref hidden, const Eigen::Ref cell) { 
setHiddenState(hidden); setCellState(cell); } template void LSTMHelper::GPUSetup() { output_nn_->GPUSetup(); if (!this->GPUMemStatus_) { network_d_ = Managed::GPUSetup>(this); HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->output_nn_), &(this->output_nn_->network_d_), sizeof(OUTPUT_FNN_T*), cudaMemcpyHostToDevice, this->stream_)); HANDLE_ERROR(cudaMalloc((void**)&(this->weights_d_), LSTM_PARAM_SIZE_BYTES + HIDDEN_DIM * 2 * sizeof(float))); // copies all pointers to be right on the GPU side float* incr_ptr = weights_d_; HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->weights_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->W_im_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += HIDDEN_HIDDEN_SIZE; HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->W_fm_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += HIDDEN_HIDDEN_SIZE; HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->W_om_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += HIDDEN_HIDDEN_SIZE; HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->W_cm_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += HIDDEN_HIDDEN_SIZE; HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->W_ii_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += INPUT_HIDDEN_SIZE; HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->W_fi_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += INPUT_HIDDEN_SIZE; HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->W_oi_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += INPUT_HIDDEN_SIZE; HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->W_ci_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += INPUT_HIDDEN_SIZE; HANDLE_ERROR( cudaMemcpyAsync(&(this->network_d_->b_i_), &(incr_ptr), 
sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += HIDDEN_DIM; HANDLE_ERROR( cudaMemcpyAsync(&(this->network_d_->b_f_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += HIDDEN_DIM; HANDLE_ERROR( cudaMemcpyAsync(&(this->network_d_->b_o_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += HIDDEN_DIM; HANDLE_ERROR( cudaMemcpyAsync(&(this->network_d_->b_c_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += HIDDEN_DIM; HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->initial_hidden_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); incr_ptr += HIDDEN_DIM; HANDLE_ERROR(cudaMemcpyAsync(&(this->network_d_->initial_cell_), &(incr_ptr), sizeof(float*), cudaMemcpyHostToDevice, this->stream_)); paramsToDevice(); } else { this->logger_->debug("LSTM GPU Memory already set\n"); } } template void LSTMHelper::freeCudaMem() { output_nn_->freeCudaMem(); if (this->GPUMemStatus_) { HANDLE_ERROR(cudaFree(network_d_)); HANDLE_ERROR(cudaFree(weights_d_)); this->GPUMemStatus_ = false; } } template void LSTMHelper::paramsToDevice() { if (this->GPUMemStatus_) { // copies entire params to device HANDLE_ERROR(cudaMemcpyAsync(this->weights_d_, this->weights_, LSTM_PARAM_SIZE_BYTES + HIDDEN_DIM * 2 * sizeof(float), cudaMemcpyHostToDevice, stream_)); } } template __device__ void LSTMHelper::initialize(float* theta_s) { this->initialize(theta_s, SHARED_MEM_REQUEST_BLK_BYTES, SHARED_MEM_REQUEST_GRD_BYTES, 0); } template __device__ void LSTMHelper::initialize(float* theta_s, int blk_size, int grd_size, int blk_offset) { this->initialize(theta_s, blk_size, grd_size, blk_offset, 0); } template __device__ void LSTMHelper::initialize(float* theta_s, int blk_size, int grd_size, int blk_offset, int grd_offset) { if (USE_SHARED) { // if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) // { // memcpy(theta_s, weights_, LSTM_PARAM_SIZE_BYTES); // } for (int i = 
blockDim.x * threadIdx.y + threadIdx.x; i < LSTM_PARAM_SIZE_BYTES / sizeof(float); i += blockDim.x * blockDim.y) { theta_s[i + grd_offset / sizeof(float)] = weights_[i]; } output_nn_->initialize(theta_s + grd_offset / sizeof(float) + LSTM_SHARED_MEM_GRD_BYTES / sizeof(float)); __syncthreads(); } // copies the initial cell and hidden state to the correct place const int shift = grd_size / sizeof(float) + blk_size * (threadIdx.x + blockDim.x * threadIdx.z) / sizeof(float) + blk_offset; for (int i = threadIdx.y; i < HIDDEN_DIM; i += blockDim.y) { (theta_s + shift)[i] = (weights_ + LSTM_PARAM_SIZE_BYTES / sizeof(float))[i]; (theta_s + shift + HIDDEN_DIM)[i] = (weights_ + LSTM_PARAM_SIZE_BYTES / sizeof(float) + HIDDEN_DIM)[i]; } // if (threadIdx.y == 0) // { // memcpy(theta_s + shift, weights_ + LSTM_PARAM_SIZE_BYTES / sizeof(float), 2 * HIDDEN_DIM * sizeof(float)); // } __syncthreads(); } template void LSTMHelper::copyHiddenCellToDevice() { if (this->GPUMemStatus_) { HANDLE_ERROR(cudaMemcpyAsync(this->weights_d_ + 4 * HIDDEN_HIDDEN_SIZE + 4 * INPUT_HIDDEN_SIZE + 4 * HIDDEN_DIM, this->weights_ + 4 * HIDDEN_HIDDEN_SIZE + 4 * INPUT_HIDDEN_SIZE + 4 * HIDDEN_DIM, 2 * HIDDEN_DIM * sizeof(float), cudaMemcpyHostToDevice, stream_)); } } template void LSTMHelper::updateOutputModel(const std::vector& description, const std::vector& data) { output_nn_->updateModel(description, data); } template void LSTMHelper::updateOutputModel(const std::vector& data) { output_nn_->updateModel(data); } template void LSTMHelper::forward(const Eigen::Ref& input) { Eigen::Map> eig_W_im(W_im_, HIDDEN_DIM, HIDDEN_DIM); Eigen::Map> eig_W_fm(W_fm_, HIDDEN_DIM, HIDDEN_DIM); Eigen::Map> eig_W_om(W_om_, HIDDEN_DIM, HIDDEN_DIM); Eigen::Map> eig_W_cm(W_cm_, HIDDEN_DIM, HIDDEN_DIM); Eigen::Map> eig_W_ii(W_ii_, HIDDEN_DIM, INPUT_DIM); Eigen::Map> eig_W_fi(W_fi_, HIDDEN_DIM, INPUT_DIM); Eigen::Map> eig_W_oi(W_oi_, HIDDEN_DIM, INPUT_DIM); Eigen::Map> eig_W_ci(W_ci_, HIDDEN_DIM, INPUT_DIM); Eigen::Map> 
eig_b_i(b_i_, HIDDEN_DIM, 1); Eigen::Map> eig_b_f(b_f_, HIDDEN_DIM, 1); Eigen::Map> eig_b_o(b_o_, HIDDEN_DIM, 1); Eigen::Map> eig_b_c(b_c_, HIDDEN_DIM, 1); Eigen::VectorXf g_i = eig_W_im * hidden_state_ + eig_W_ii * input + eig_b_i; Eigen::VectorXf g_f = eig_W_fm * hidden_state_ + eig_W_fi * input + eig_b_f; Eigen::VectorXf g_o = eig_W_om * hidden_state_ + eig_W_oi * input + eig_b_o; Eigen::VectorXf g_c = eig_W_cm * hidden_state_ + eig_W_ci * input + eig_b_c; g_i = g_i.unaryExpr(&mppi::nn::sigmoid); g_f = g_f.unaryExpr(&mppi::nn::sigmoid); g_o = g_o.unaryExpr(&mppi::nn::sigmoid); g_c = g_c.unaryExpr(&mppi::nn::tanh); Eigen::VectorXf c_next = g_i.cwiseProduct(g_c) + g_f.cwiseProduct(cell_state_); Eigen::VectorXf h_next = g_o.cwiseProduct(c_next.unaryExpr(&mppi::nn::tanh)); hidden_state_ = h_next; cell_state_ = c_next; } template void LSTMHelper::forward(const Eigen::Ref& input, Eigen::Ref output) { forward(input); Eigen::VectorXf nn_input = output_nn_->getZeroInputVector(); for (int i = 0; i < HIDDEN_DIM; i++) { nn_input(i) = hidden_state_(i); } for (int i = 0; i < INPUT_DIM; i++) { nn_input(i + HIDDEN_DIM) = input(i); } output_nn_->forward(nn_input, output); } template __device__ float* LSTMHelper::forward(float* input, float* theta_s) { const int block_idx = (blockDim.x * threadIdx.z + threadIdx.x) * SHARED_MEM_REQUEST_BLK_BYTES / sizeof(float) + SHARED_MEM_REQUEST_GRD_BYTES / sizeof(float); return forward(input, theta_s, theta_s + block_idx); } template __device__ float* LSTMHelper::forward(float* input, float* theta_s, float* block_ptr) { float* c = &block_ptr[0]; float* g_o = &block_ptr[2 * HIDDEN_DIM]; // input gate return forward(input, theta_s, c, g_o); } template __device__ float* LSTMHelper::forward(float* input, float* theta_s, float* hidden_cell, float* block_ptr) { // Weights float* W_ii = this->W_ii_; float* W_im = this->W_im_; float* W_fi = this->W_fi_; float* W_fm = this->W_fm_; float* W_oi = this->W_oi_; float* W_om = this->W_om_; float* W_ci = 
this->W_ci_; float* W_cm = this->W_cm_; // Biases float* b_i = this->b_i_; // hidden_size float* b_f = this->b_f_; // hidden_size float* b_o = this->b_o_; // hidden_size float* b_c = this->b_c_; // hidden_size if (USE_SHARED) { // Weights W_im = &theta_s[0]; W_fm = &theta_s[HIDDEN_HIDDEN_SIZE]; W_om = &theta_s[2 * HIDDEN_HIDDEN_SIZE]; W_cm = &theta_s[3 * HIDDEN_HIDDEN_SIZE]; W_ii = &theta_s[4 * HIDDEN_HIDDEN_SIZE]; W_fi = &theta_s[4 * HIDDEN_HIDDEN_SIZE + INPUT_HIDDEN_SIZE]; W_oi = &theta_s[4 * HIDDEN_HIDDEN_SIZE + 2 * INPUT_HIDDEN_SIZE]; W_ci = &theta_s[4 * HIDDEN_HIDDEN_SIZE + 3 * INPUT_HIDDEN_SIZE]; // Biases b_i = &theta_s[4 * HIDDEN_HIDDEN_SIZE + 4 * INPUT_HIDDEN_SIZE]; b_f = &theta_s[4 * HIDDEN_HIDDEN_SIZE + 4 * INPUT_HIDDEN_SIZE + HIDDEN_DIM]; b_o = &theta_s[4 * HIDDEN_HIDDEN_SIZE + 4 * INPUT_HIDDEN_SIZE + 2 * HIDDEN_DIM]; b_c = &theta_s[4 * HIDDEN_HIDDEN_SIZE + 4 * INPUT_HIDDEN_SIZE + 3 * HIDDEN_DIM]; } uint i, j; // Intermediate outputs // for each block we have prior cell/hidden state float* const h = &hidden_cell[0]; float* const c = &hidden_cell[HIDDEN_DIM]; // each block has place to compute g_o, and input float* const g_o = &block_ptr[0]; // output gate float* const x = &block_ptr[HIDDEN_DIM]; // FNN needs space for input and activations float* const output_act = g_o; uint tdy = threadIdx.y; // load input into theta_s if (input != nullptr) { for (i = tdy; i < INPUT_DIM; i += blockDim.y) { x[i] = input[i]; } __syncthreads(); } float temp_g_i = 0; float temp_g_f = 0; float temp_g_o = 0; float temp_cell_update = 0; // apply each gate in parallel for (i = tdy; i < HIDDEN_DIM; i += blockDim.y) { temp_g_i = 0; temp_g_f = 0; temp_g_o = 0; temp_cell_update = 0; for (j = 0; j < INPUT_DIM; j++) { int index = i * INPUT_DIM + j; temp_g_i += W_ii[index] * x[j]; temp_g_f += W_fi[index] * x[j]; temp_g_o += W_oi[index] * x[j]; temp_cell_update += W_ci[index] * x[j]; } for (j = 0; j < HIDDEN_DIM; j++) { int index = i * HIDDEN_DIM + j; temp_g_i += W_im[index] * h[j]; 
temp_g_f += W_fm[index] * h[j]; temp_g_o += W_om[index] * h[j]; temp_cell_update += W_cm[index] * h[j]; } temp_g_i += b_i[i]; temp_g_f += b_f[i]; temp_g_o += b_o[i]; temp_cell_update += b_c[i]; temp_g_i = mppi::nn::sigmoid(temp_g_i); temp_g_f = mppi::nn::sigmoid(temp_g_f); temp_cell_update = mppi::nn::tanh(temp_cell_update); g_o[i] = mppi::nn::sigmoid(temp_g_o); c[i] = temp_g_i * temp_cell_update + temp_g_f * c[i]; } __syncthreads(); // copy computed hidden/cell state to theta_s for (i = tdy; i < HIDDEN_DIM; i += blockDim.y) { h[i] = mppi::nn::tanh(c[i]) * g_o[i]; // actually using c_next intentionally output_act[i] = h[i]; } // copy input to activation // for (i = tdy; i < INPUT_DIM; i += blockDim.y) // { // output_act[i + HIDDEN_DIM] = x[i]; // } __syncthreads(); return output_nn_->forward(nullptr, theta_s + LSTM_SHARED_MEM_GRD_BYTES / sizeof(float), output_act); } template void LSTMHelper::resetHiddenCellCPU() { for (int i = 0; i < HIDDEN_DIM; i++) { hidden_state_(i) = initial_hidden_[i]; cell_state_(i) = initial_cell_[i]; } } template void LSTMHelper::setHiddenState(const Eigen::Ref hidden_state) { for (int i = 0; i < HIDDEN_DIM; i++) { initial_hidden_[i] = hidden_state(i); hidden_state_(i) = hidden_state(i); } } template void LSTMHelper::setCellState(const Eigen::Ref cell_state) { for (int i = 0; i < HIDDEN_DIM; i++) { initial_cell_[i] = cell_state(i); cell_state_(i) = cell_state(i); } } template void LSTMHelper::loadParams(const std::string& model_path) { if (!fileExists(model_path)) { std::cerr << "Could not load neural net model at path: " << model_path.c_str(); exit(-1); } cnpy::npz_t param_dict = cnpy::npz_load(model_path); loadParams(param_dict); } template void LSTMHelper::loadParams(const cnpy::npz_t& param_dict) { loadParams("", param_dict); } template void LSTMHelper::loadParams(std::string prefix, const cnpy::npz_t& param_dict, bool add_slash) { if (add_slash && !prefix.empty() && *prefix.rbegin() != '/') { prefix.append("/"); } if 
(param_dict.find("model/" + prefix + "lstm/weight_hh_l0") != param_dict.end()) { prefix.insert(0, "model/"); } // assumes it has been unonioned if (output_nn_ == nullptr) { output_nn_ = new FNNHelper(param_dict, prefix + "output/", this->stream_); } else { output_nn_->loadParams(prefix + "output/", param_dict); } cnpy::NpyArray weight_hh_raw = param_dict.at(prefix + "lstm/weight_hh_l0"); cnpy::NpyArray bias_hh_raw = param_dict.at(prefix + "lstm/bias_hh_l0"); cnpy::NpyArray weight_ih_raw = param_dict.at(prefix + "lstm/weight_ih_l0"); cnpy::NpyArray bias_ih_raw = param_dict.at(prefix + "lstm/bias_ih_l0"); double* weight_hh = weight_hh_raw.data(); double* bias_hh = bias_hh_raw.data(); double* weight_ih = weight_ih_raw.data(); double* bias_ih = bias_ih_raw.data(); int input_dim = weight_ih_raw.shape[1]; int hidden_dim = bias_hh_raw.shape[0] / 4; setupMemory(input_dim, hidden_dim); for (int i = 0; i < HIDDEN_HIDDEN_SIZE; i++) { W_im_[i] = weight_hh[i]; W_fm_[i] = weight_hh[i + HIDDEN_HIDDEN_SIZE]; W_cm_[i] = weight_hh[i + 2 * HIDDEN_HIDDEN_SIZE]; W_om_[i] = weight_hh[i + 3 * HIDDEN_HIDDEN_SIZE]; assert(isfinite(W_im_[i])); assert(isfinite(W_fm_[i])); assert(isfinite(W_cm_[i])); assert(isfinite(W_om_[i])); } for (int i = 0; i < INPUT_HIDDEN_SIZE; i++) { W_ii_[i] = weight_ih[i]; W_fi_[i] = weight_ih[i + INPUT_HIDDEN_SIZE]; W_ci_[i] = weight_ih[i + 2 * INPUT_HIDDEN_SIZE]; W_oi_[i] = weight_ih[i + 3 * INPUT_HIDDEN_SIZE]; assert(isfinite(W_ii_[i])); assert(isfinite(W_fi_[i])); assert(isfinite(W_ci_[i])); assert(isfinite(W_oi_[i])); } for (int i = 0; i < HIDDEN_DIM; i++) { b_i_[i] = bias_hh[i] + bias_ih[i]; b_f_[i] = bias_hh[i + HIDDEN_DIM] + bias_ih[i + HIDDEN_DIM]; b_c_[i] = bias_hh[i + 2 * HIDDEN_DIM] + bias_ih[i + 2 * HIDDEN_DIM]; b_o_[i] = bias_hh[i + 3 * HIDDEN_DIM] + bias_ih[i + 3 * HIDDEN_DIM]; assert(isfinite(b_i_[i])); assert(isfinite(b_f_[i])); assert(isfinite(b_c_[i])); assert(isfinite(b_o_[i])); } // Save parameters to GPU memory paramsToDevice(); } template 
__device__ float* LSTMHelper::getInputLocation(float* theta_s)
  {
    // per-thread offset into shared scratch: skip the grid-level region, then
    // this thread's per-block region
    const int block_idx = (blockDim.x * threadIdx.z + threadIdx.x) * SHARED_MEM_REQUEST_BLK_BYTES / sizeof(float) +
                          SHARED_MEM_REQUEST_GRD_BYTES / sizeof(float);
    // the input vector lives immediately after hidden/cell/output-gate scratch
    float* x = theta_s + block_idx + 3 * HIDDEN_DIM;
    return x;
  }

  // Overload with caller-supplied offsets instead of thread-derived ones.
  template
  __device__ float* LSTMHelper::getInputLocation(float* theta_s, const int grd_shift, const int blk_shift,
                                                 const int shift)
  {
    float* x = theta_s + grd_shift + blk_shift + shift + 3 * HIDDEN_DIM;
    return x;
  }

  // Start of the hidden/cell scratch region (no 3 * HIDDEN_DIM skip).
  template
  __device__ float* LSTMHelper::getHiddenCellLocation(float* theta_s, const int grd_shift, const int blk_shift,
                                                      const int shift)
  {
    float* x = theta_s + grd_shift + blk_shift + shift;
    return x;
  }

================================================
FILE: include/mppi/utils/nn_helpers/lstm_helper.cuh
================================================
#ifndef MPPIGENERIC_LSTM_HELPER_CUH
#define MPPIGENERIC_LSTM_HELPER_CUH

#include "fnn_helper.cuh"

// Configuration for a single LSTM + feed-forward output head.
// NOTE(review): the <...> of std::vector (presumably std::vector<int>) and of
// every "template" in this header were stripped by extraction -- restore from VCS.
struct LSTMConfig
{
  int input_dim = -1;
  int hidden_dim = -1;
  std::vector output_layers = { -1 };
};

// Host/device helper wrapping an LSTM layer plus an FNN output network.
// Dimensions are configured at runtime (see setupMemory in the .cu file).
template
class LSTMHelper : public Managed
{
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
  typedef FNNHelper OUTPUT_FNN_T;

  // TODO create destructor that deallocates memory
  LSTMHelper(LSTMConfig& config, cudaStream_t stream = 0);
  LSTMHelper(int input_dim, int hidden_dim, std::vector& output_layers, cudaStream_t stream = 0);
  LSTMHelper(std::string, cudaStream_t stream = 0);
  LSTMHelper(const cnpy::npz_t& param_dict, std::string prefix, bool add_slash = true, cudaStream_t stream = 0);

  ~LSTMHelper()
  {
    freeCudaMem();
  }

  // parameter loading from .npz archives (see lstm_helper.cu)
  void loadParams(const std::string& model_path);
  void loadParams(const cnpy::npz_t& npz);
  void loadParams(std::string prefix, const cnpy::npz_t& npz, bool add_slash = true);

  // device-side shared-memory initialization overloads
  __device__ void initialize(float* theta_s);
  __device__ void initialize(float* theta_s, int blk_size, int grd_size, int blk_offset);
  __device__ void initialize(float* theta_s, int blk_size, int grd_size, int blk_offset, int grd_offset);

  void GPUSetup();
  void
freeCudaMem();
  void paramsToDevice();
  // copies only the hidden/cell state buffers to the device-side copy
  void copyHiddenCellToDevice();

  void updateOutputModel(const std::vector& description, const std::vector& data);
  void updateOutputModel(const std::vector& data);
  void updateLSTMInitialStates(const Eigen::Ref hidden, const Eigen::Ref cell);

  // bool computeGrad(Eigen::Ref A);
  // bool computeGrad(const Eigen::Ref& input, Eigen::Ref A);

  // CPU forward passes (stateful: advance hidden_state_/cell_state_)
  void forward(const Eigen::Ref& input, Eigen::Ref output);
  void forward(const Eigen::Ref& input);
  // GPU forward passes operating on shared-memory scratch
  __device__ float* forward(float* input, float* theta_s);
  __device__ float* forward(float* input, float* theta_s, float* block_ptr);
  __device__ float* forward(float* input, float* theta_s, float* hidden_cell, float* block_ptr);

  __device__ float* getInputLocation(float* theta_s);
  __device__ float* getInputLocation(float* theta_s, const int grd_shift, const int blk_shift, const int shift);
  __device__ float* getHiddenCellLocation(float* theta_s, const int grd_shift, const int blk_shift, const int shift);

  void resetHiddenCellCPU();

  // Bulk-replaces the LSTM weights (+ stored initial hidden/cell, which live
  // contiguously after the weights) and the output network weights.
  void setAllValues(std::vector& lstm, std::vector& output)
  {
    assert(lstm.size() == getNumParams());
    memcpy(this->weights_, lstm.data(), LSTM_PARAM_SIZE_BYTES + HIDDEN_DIM * 2 * sizeof(float));
    resetHiddenCellCPU();
    output_nn_->updateModel(output);
  }
  // Fills every parameter (LSTM + output net) with a single constant; test aid.
  void setAllValues(float input)
  {
    for (int i = 0; i < LSTM_PARAM_SIZE_BYTES / sizeof(float) + 2 * HIDDEN_DIM; i++)
    {
      weights_[i] = input;
    }
    resetHiddenCellCPU();
    output_nn_->setAllWeights(input);
  }

  Eigen::VectorXf getHiddenState()
  {
    return hidden_state_;
  }
  Eigen::VectorXf getCellState()
  {
    return cell_state_;
  }

  __host__ __device__ OUTPUT_FNN_T* getOutputModel() const
  {
    return output_nn_;
  }
  __host__ __device__ float* getOutputWeights() const
  {
    return output_nn_->weights_d_;
  }
  __host__ __device__ int getHiddenDim() const
  {
    return HIDDEN_DIM;
  }
  __host__ __device__ int getInputDim() const
  {
    return INPUT_DIM;
  }
  __host__ __device__ int getOutputDim() const
  {
    return OUTPUT_DIM;
  }
  __host__ __device__ int getHiddenHiddenSize() const
  {
    return
HIDDEN_HIDDEN_SIZE;
  }
  __host__ __device__ int getInputHiddenSize() const
  {
    return INPUT_HIDDEN_SIZE;
  }
  __host__ __device__ int getOutputGrdSharedSizeBytes() const
  {
    return output_nn_->getGrdSharedSizeBytes();
  }
  __host__ __device__ int getLSTMGrdSharedSizeBytes() const
  {
    return LSTM_SHARED_MEM_GRD_BYTES;
  }
  // per-block scratch: hidden + cell + output gate (3 * HIDDEN_DIM) plus input
  __host__ __device__ int getBlkLSTMSharedSizeBytes() const
  {
    return (3 * HIDDEN_DIM + INPUT_DIM) * sizeof(float);
  }
  __host__ __device__ float* getWeights()
  {
    return weights_;
  }
  // weights plus the stored initial hidden and cell state
  int getNumParams() const
  {
    return LSTM_PARAM_SIZE_BYTES / sizeof(float) + 2 * HIDDEN_DIM;
  }
  Eigen::VectorXf getZeroInputVector()
  {
    return Eigen::VectorXf::Zero(INPUT_DIM);
  }
  Eigen::VectorXf getZeroOutputVector()
  {
    return Eigen::VectorXf::Zero(OUTPUT_DIM);
  }

  void setHiddenState(const Eigen::Ref hidden_state);
  void setCellState(const Eigen::Ref hidden_state);

  // device pointer, null on the device
  LSTMHelper* network_d_ = nullptr;
  float* weights_d_ = nullptr;

private:
  // params
  OUTPUT_FNN_T* output_nn_ = nullptr;

  // dimensions configured at runtime by setupMemory()
  int HIDDEN_DIM = 0;
  int INPUT_DIM = 0;
  int OUTPUT_DIM = 0;
  int NUM_PARAMS = 1;
  int LSTM_PARAM_SIZE_BYTES = 0;
  int LSTM_SHARED_MEM_GRD_BYTES = 0;  ///< Amount of shared memory the LSTM needs per block
  int HIDDEN_HIDDEN_SIZE = HIDDEN_DIM * HIDDEN_DIM;
  int INPUT_HIDDEN_SIZE = HIDDEN_DIM * (INPUT_DIM);

  // per-gate weight sub-arrays; NOTE(review): these look like views into the
  // single weights_ allocation (set up in setupMemory) -- confirm in the .cu
  float* W_im_ = nullptr;  ///< HIDDEN_HIDDEN_SIZE
  float* W_fm_ = nullptr;  ///< HIDDEN_HIDDEN_SIZE
  float* W_om_ = nullptr;  ///< HIDDEN_HIDDEN_SIZE
  float* W_cm_ = nullptr;  ///< HIDDEN_HIDDEN_SIZE
  float* W_ii_ = nullptr;  ///< INPUT_HIDDEN_SIZE
  float* W_fi_ = nullptr;  ///< INPUT_HIDDEN_SIZE
  float* W_oi_ = nullptr;  ///< INPUT_HIDDEN_SIZE
  float* W_ci_ = nullptr;  ///< INPUT_HIDDEN_SIZE
  float* b_i_ = nullptr;   ///< HIDDEN_SIZE
  float* b_f_ = nullptr;   ///< HIDDEN_SIZE
  float* b_o_ = nullptr;   ///< HIDDEN_SIZE
  float* b_c_ = nullptr;   ///< HIDDEN_SIZE

  // values restored by resetHiddenCellCPU()
  float* initial_hidden_ = nullptr;
  float* initial_cell_ = nullptr;

  float* weights_ = nullptr;

  // current CPU-side recurrent state
  Eigen::VectorXf hidden_state_;
  Eigen::VectorXf cell_state_;

  void
setupMemory(int input_dim, int hidden_dim);
};

#if __CUDACC__
#include "lstm_helper.cu"
#endif

#endif  // MPPIGENERIC_LSTM_HELPER_CUH

================================================
FILE: include/mppi/utils/nn_helpers/lstm_lstm_helper.cu
================================================
#include "lstm_lstm_helper.cuh"

// NOTE(review): every "template" below lost its <...> parameter list during
// extraction, as did several std::vector / std::make_shared uses -- restore from VCS.

// Builds the init network (CPU-only, produces initial hidden/cell state) and
// the prediction LSTM. The init net must output hidden + cell, hence the 2x.
template
LSTMLSTMHelper::LSTMLSTMHelper(int init_input_dim, int init_hidden_dim, std::vector& init_output_layers,
                               int input_dim, int hidden_dim, std::vector& output_layers, int init_len,
                               cudaStream_t stream)
{
  init_len_ = init_len;
  init_model_ = std::make_shared>(init_input_dim, init_hidden_dim, init_output_layers);
  lstm_ = std::make_shared>(input_dim, hidden_dim, output_layers, stream);
  assert(init_model_->getOutputDim() == lstm_->getHiddenDim() * 2);
}

// Convenience constructor: load from file with an empty prefix.
template
LSTMLSTMHelper::LSTMLSTMHelper(std::string path, cudaStream_t stream)
  : LSTMLSTMHelper::LSTMLSTMHelper(path, "", stream)
{
}

// Loads both networks from one .npz archive; init-net keys use "init_" prefix.
// Exits the process if the file does not exist.
template
LSTMLSTMHelper::LSTMLSTMHelper(std::string path, std::string prefix, cudaStream_t stream)
{
  if (!fileExists(path))
  {
    std::cerr << "Could not load neural net model at path: " << path.c_str();
    exit(-1);
  }
  cnpy::npz_t param_dict = cnpy::npz_load(path);
  init_model_ = std::make_shared>(param_dict, prefix + "init_", false);
  lstm_ = std::make_shared>(param_dict, prefix, true, stream);
  if (param_dict.find("init_length") != param_dict.end())  // this is the new api but assert is meaningless right now
  {
    // TODO remove once all models are updated to the lastest export API
    assert(param_dict.find("init_length") != param_dict.end());
    init_len_ = static_cast(param_dict.at("init_length").data()[0]) + 1;
  }
  else
  {  // old api here
    // derive the init sequence length from the stored training-input tensor
    assert(param_dict.find("num_points") != param_dict.end());
    int num_points = param_dict.at("num_points").data()[0];
    assert(param_dict.find("init_input") != param_dict.end());
    init_len_ = param_dict.at("init_input").shape[0] / (num_points * init_model_->getInputDim());
  }
}

// Runs the init network over the trailing init_len_ columns of `buffer`
// (rows = init-net input dim, cols = time) and seeds the prediction LSTM's
// initial hidden/cell state from the init network's final output.
template
void LSTMLSTMHelper::initializeLSTM(const Eigen::Ref& buffer)
{
assert(buffer.rows() == init_model_->getInputDim());
  assert(buffer.cols() >= init_len_);

  // reset hidden/cell state
  init_model_->resetHiddenCellCPU();

  // warm up over the history, discarding outputs until the final step
  int t = buffer.cols() - init_len_;
  for (; t < buffer.cols() - 1; t++)
  {
    init_model_->forward(buffer.col(t));
  }

  // run full model with output at end
  Eigen::VectorXf output = init_model_->getZeroOutputVector();
  init_model_->forward(buffer.col(t), output);

  // set the lstm initial hidden/cell to output
  lstm_->setHiddenState(output.head(lstm_->getHiddenDim()));
  lstm_->setCellState(output.tail(lstm_->getHiddenDim()));

  // only copies the hidden/cell states
  lstm_->copyHiddenCellToDevice();
}

// -- thin delegation wrappers below --

template
void LSTMLSTMHelper::loadParamsInit(const std::string& model_path)
{
  init_model_->loadParams(model_path);
}

template
void LSTMLSTMHelper::loadParamsInit(const cnpy::npz_t& npz)
{
  init_model_->loadParams(npz);
}

template
void LSTMLSTMHelper::loadParamsLSTM(const std::string& model_path)
{
  lstm_->loadParams(model_path);
}

template
void LSTMLSTMHelper::loadParamsLSTM(const cnpy::npz_t& npz)
{
  lstm_->loadParams(npz);
}

// Loads both networks from one archive under `prefix` ('/' appended if absent).
template
void LSTMLSTMHelper::loadParams(std::string prefix, std::string model_path)
{
  if (!prefix.empty() && *prefix.rbegin() != '/')
  {
    prefix.append("/");
  }
  if (!fileExists(model_path))
  {
    std::cerr << "Could not load neural net model at path: " << model_path.c_str();
    exit(-1);
  }
  cnpy::npz_t param_dict = cnpy::npz_load(model_path);
  lstm_->loadParams(prefix, param_dict);
  init_model_->loadParams(prefix + "init_", param_dict, false);
}

// Only the prediction LSTM is mirrored on the GPU; the init net stays CPU-side.
template
void LSTMLSTMHelper::GPUSetup()
{
  lstm_->GPUSetup();
}

template
void LSTMLSTMHelper::freeCudaMem()
{
  lstm_->freeCudaMem();
}

template
void LSTMLSTMHelper::forward(const Eigen::Ref& input, Eigen::Ref output)
{
  lstm_->forward(input, output);
}

template
void LSTMLSTMHelper::updateOutputModel(const std::vector& description, const std::vector& data)
{
  lstm_->updateOutputModel(description, data);
}

template
void LSTMLSTMHelper::updateOutputModelInit(const std::vector& description,
const std::vector& data)
{
  init_model_->updateOutputModel(description, data);
}

template
void LSTMLSTMHelper::updateOutputModel(const std::vector& data)
{
  lstm_->updateOutputModel(data);
}

template
void LSTMLSTMHelper::updateOutputModelInit(const std::vector& data)
{
  init_model_->updateOutputModel(data);
}

template
std::shared_ptr> LSTMLSTMHelper::getInitModel()
{
  return init_model_;
}

template
std::shared_ptr> LSTMLSTMHelper::getLSTMModel()
{
  return lstm_;
}

================================================
FILE: include/mppi/utils/nn_helpers/lstm_lstm_helper.cuh
================================================
//
// Created by jason on 8/23/22.
//

#ifndef MPPIGENERIC_LSTM_LSTM_HELPER_CUH
#define MPPIGENERIC_LSTM_LSTM_HELPER_CUH

#include "lstm_helper.cuh"

// Configuration pair for the two-stage (init + prediction) LSTM helper.
struct LSTMLSTMConfig
{
  LSTMConfig init_config;
  LSTMConfig pred_config;
  int init_len = 0;
};

// Couples a CPU-only "init" LSTM (which digests a history buffer to produce an
// initial hidden/cell state) with the GPU-capable prediction LSTM.
// NOTE(review): template parameter lists stripped by extraction -- restore from VCS.
template
class LSTMLSTMHelper
{
public:
  LSTMLSTMHelper(LSTMLSTMConfig config, cudaStream_t stream = 0)
    : LSTMLSTMHelper(config.init_config.input_dim, config.init_config.hidden_dim, config.init_config.output_layers,
                     config.pred_config.input_dim, config.pred_config.hidden_dim, config.pred_config.output_layers,
                     config.init_len, stream)
  {
  }

  LSTMLSTMHelper(LSTMConfig init_config, LSTMConfig pred_config, int init_len, cudaStream_t stream = 0)
    : LSTMLSTMHelper(init_config.input_dim, init_config.hidden_dim, init_config.output_layers, pred_config.input_dim,
                     pred_config.hidden_dim, pred_config.output_layers, init_len, stream)
  {
  }

  LSTMLSTMHelper(int init_input_dim, int init_hidden_dim, std::vector& init_output_layers, int input_dim,
                 int hidden_dim, std::vector& output_layers, int init_len, cudaStream_t = 0);

  // LSTMLSTMHelper(std::string init_path, std::string lstm_path, cudaStream_t = 0);
  LSTMLSTMHelper(std::string path, std::string prefix, cudaStream_t = 0);
  LSTMLSTMHelper(std::string path, cudaStream_t = 0);

  void loadParams(std::string prefix, std::string path);

  void loadParamsInit(const std::string& model_path);
  void
loadParamsInit(const cnpy::npz_t& npz);
  void loadParamsLSTM(const std::string& model_path);
  void loadParamsLSTM(const cnpy::npz_t& npz);

  void updateOutputModelInit(const std::vector& description, const std::vector& data);
  void updateOutputModel(const std::vector& description, const std::vector& data);
  void updateOutputModelInit(const std::vector& data);
  void updateOutputModel(const std::vector& data);

  void GPUSetup();
  void freeCudaMem();

  Eigen::VectorXf getZeroInputVector()
  {
    return lstm_->getZeroInputVector();
  }
  Eigen::VectorXf getZeroOutputVector()
  {
    return lstm_->getZeroOutputVector();
  }

  void resetInitHiddenCPU()
  {
    init_model_->resetHiddenCellCPU();
  }
  void resetLSTMHiddenCellCPU()
  {
    lstm_->resetHiddenCellCPU();
  }

  // history buffer sized for the init network (rows = input dim, cols = time)
  Eigen::MatrixXf getEmptyBufferMatrix()
  {
    return Eigen::MatrixXf(init_model_->getInputDim(), init_len_);
  }
  Eigen::MatrixXf getEmptyBufferMatrix(int length)
  {
    return Eigen::MatrixXf(init_model_->getInputDim(), length);
  }

  void initializeLSTM(const Eigen::Ref& buffer);

  void forward(const Eigen::Ref& input, Eigen::Ref output);

  std::shared_ptr> getInitModel();
  std::shared_ptr> getLSTMModel();
  LSTMHelper* getLSTMDevicePtr()
  {
    return lstm_->network_d_;
  }

  int getInitLen()
  {
    return init_len_;
  }

private:
  std::shared_ptr> init_model_ = nullptr;
  std::shared_ptr> lstm_ = nullptr;
  int init_len_ = -1;
};

#if __CUDACC__
#include "lstm_lstm_helper.cu"
#endif

#endif  // MPPIGENERIC_LSTM_LSTM_HELPER_CUH

================================================
FILE: include/mppi/utils/nn_helpers/meta_math.h
================================================
/**********************************************
 * @file meta_math.h
 * @author Grady Williams
 * @date May 24, 2017
 * @copyright 2017 Georgia Institute of Technology
 * @brief template arithmetic for figuring out how
 * much memory to allocate for neural network on GPU
 ***********************************************/

#ifndef MPPIGENERIC_META_MATH_H
#define MPPIGENERIC_META_MATH_H

// NOTE(review): the variadic template headers below (likely "template <class... Args>")
// lost their <...> parameter lists during extraction -- restore from VCS.

// first element of a layer-size pack = network input dimension
template
constexpr int input_dim(int first, Args...
args)
{
  return first;
}

// last element of a layer-size pack = network output dimension
template
constexpr int output_dim(int last)
{
  return last;
}
template
constexpr int output_dim(int first, Args... args)
{
  // recurse until only the last argument remains
  return output_dim(args...);
}

// total parameter count for a fully-connected net: each layer pair
// contributes (in + 1) * out (weights plus bias row)
template
constexpr int param_counter(int first)
{
  return first;
}
template
constexpr int param_counter(int first, int next)
{
  return (first + 1) * next;
}
template
constexpr int param_counter(int first, int next, Args... args)
{
  return (first + 1) * next + param_counter(next, args...);
}

// number of entries in the layer-size pack
template
constexpr int layer_counter(int first)
{
  return 1;
}
template
constexpr int layer_counter(int first, Args... args)
{
  return 1 + layer_counter(args...);
}

// widest layer in the network (max over the pack)
template
constexpr int neuron_counter(int first)
{
  return first;
}
template
constexpr int neuron_counter(int first, Args... args)
{
  return (first > neuron_counter(args...)) ? first : neuron_counter(args...);
}

#endif  // MPPIGENERIC_META_MATH_H

================================================
FILE: include/mppi/utils/numerical_integration.h
================================================
//
// Created by mgandhi3 on 7/16/21.
// #ifndef MPPIGENERIC_NUMERICAL_INTEGRATION_H #define MPPIGENERIC_NUMERICAL_INTEGRATION_H #include template void rk4integrate(DYN* dynamics, float dt, const Eigen::Ref& x_k, const Eigen::Ref& u_k, Eigen::Ref x_kp1) { // Assume a zero order hold on the control typename DYN::state_array k1, k2, k3, k4, x_next1, x_next2, x_next3, x_next4; typename DYN::output_array output; dynamics->step(x_k, x_next1, k1, u_k, output, 0, dt / 2); dynamics->step(x_next1, x_next2, k2, u_k, output, 0, dt / 2); dynamics->step(x_next2, x_next3, k3, u_k, output, 0, dt); dynamics->step(x_next3, x_next4, k4, u_k, output, 0, dt / 2); // dynamics->computeStateDeriv(x_k, u_k, k1); // dynamics->computeStateDeriv(x_k + k1 * dt / 2, u_k, k2); // dynamics->computeStateDeriv(x_k + k2 * dt / 2, u_k, k3); // dynamics->computeStateDeriv(x_k + k3 * dt, u_k, k4); x_kp1 = x_k + (k1 + 2 * k2 + 2 * k3 + k4) * dt / 6; } #endif // MPPIGENERIC_NUMERICAL_INTEGRATION_H ================================================ FILE: include/mppi/utils/parallel_utils.cuh ================================================ // // Created by Bogdan on 8/20/23. 
// #pragma once #include namespace mppi { namespace p1 // parallelize to 1 index and step { enum class Parallel1Dir : int { THREAD_X = 0, THREAD_Y, THREAD_Z, THREAD_XY, THREAD_YX, THREAD_XZ, THREAD_ZX, THREAD_YZ, THREAD_ZY, THREAD_XYZ, GLOBAL_X, GLOBAL_Y, GLOBAL_Z, NONE, }; template inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step); template <> inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step) { #ifdef __CUDA_ARCH__ p_index = threadIdx.x; p_step = blockDim.x; #else p_index = 0; p_step = 1; #endif } template <> inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step) { #ifdef __CUDA_ARCH__ p_index = threadIdx.y; p_step = blockDim.y; #else p_index = 0; p_step = 1; #endif } template <> inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step) { #ifdef __CUDA_ARCH__ p_index = threadIdx.z; p_step = blockDim.z; #else p_index = 0; p_step = 1; #endif } template <> inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step) { #ifdef __CUDA_ARCH__ p_index = threadIdx.x + blockDim.x * threadIdx.y; p_step = blockDim.x * blockDim.y; #else p_index = 0; p_step = 1; #endif } template <> inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step) { #ifdef __CUDA_ARCH__ p_index = threadIdx.x + blockDim.x * threadIdx.z; p_step = blockDim.x * blockDim.z; #else p_index = 0; p_step = 1; #endif } template <> inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step) { #ifdef __CUDA_ARCH__ p_index = threadIdx.y + blockDim.y * threadIdx.x; p_step = blockDim.y * blockDim.x; #else p_index = 0; p_step = 1; #endif } template <> inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step) { #ifdef __CUDA_ARCH__ p_index = threadIdx.y + blockDim.y * threadIdx.z; p_step = blockDim.y * blockDim.z; #else p_index = 0; p_step = 1; #endif } template <> inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step) { 
#ifdef __CUDA_ARCH__
  p_index = threadIdx.z + blockDim.z * threadIdx.x;
  p_step = blockDim.z * blockDim.x;
#else
  p_index = 0;
  p_step = 1;
#endif
}
// THREAD_ZY
template <>
inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step)
{
#ifdef __CUDA_ARCH__
  p_index = threadIdx.z + blockDim.z * threadIdx.y;
  p_step = blockDim.z * blockDim.y;
#else
  p_index = 0;
  p_step = 1;
#endif
}
// THREAD_XYZ: flattens the whole thread block
template <>
inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step)
{
#ifdef __CUDA_ARCH__
  p_index = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  p_step = blockDim.x * blockDim.y * blockDim.z;
#else
  p_index = 0;
  p_step = 1;
#endif
}
// GLOBAL_X: index/stride across the whole grid in x
template <>
inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step)
{
#ifdef __CUDA_ARCH__
  p_index = threadIdx.x + blockDim.x * blockIdx.x;
  p_step = gridDim.x * blockDim.x;
#else
  p_index = 0;
  p_step = 1;
#endif
}
// GLOBAL_Y
template <>
inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step)
{
#ifdef __CUDA_ARCH__
  p_index = threadIdx.y + blockDim.y * blockIdx.y;
  p_step = gridDim.y * blockDim.y;
#else
  p_index = 0;
  p_step = 1;
#endif
}
// GLOBAL_Z
template <>
inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step)
{
#ifdef __CUDA_ARCH__
  p_index = threadIdx.z + blockDim.z * blockIdx.z;
  p_step = gridDim.z * blockDim.z;
#else
  p_index = 0;
  p_step = 1;
#endif
}
// NONE: serial fallback on both host and device
template <>
inline __host__ __device__ void getParallel1DIndex(int& p_index, int& p_step)
{
  p_index = 0;
  p_step = 1;
}

// Cooperatively copies N elements from a2[off2..] into a1[off1..], using
// vectorized (4- or 2-wide) loads when size and alignment permit.
// NOTE(review): the reinterpret_cast targets lost their <...> payloads during
// extraction (presumably casts to the vectorized type4/type2 forms) -- restore from VCS.
template
inline __device__ void loadArrayParallel(T* __restrict__ a1, const int off1, const T* __restrict__ a2, const int off2,
                                         const int N)
{
  int p_index, p_step;
  getParallel1DIndex(p_index, p_step);
  if (N % 4 == 0 && sizeof(type4) <= 16 && off1 % 4 == 0 && off2 % 4 == 0)
  {
    for (int i = p_index; i < N / 4; i += p_step)
    {
      reinterpret_cast*>(&a1[off1])[i] = reinterpret_cast*>(&a2[off2])[i];
    }
  }
  else if (N % 2 == 0 && sizeof(type2) <= 16 && off1 % 2 == 0 && off2 % 2 == 0)
  {
    for (int i = p_index; i < N / 2; i +=
p_step)
    {
      reinterpret_cast*>(&a1[off1])[i] = reinterpret_cast*>(&a2[off2])[i];
    }
  }
  else
  {
    // scalar fallback
    for (int i = p_index; i < N; i += p_step)
    {
      a1[off1 + i] = a2[off2 + i];
    }
  }
}

// Compile-time-size overload delegating to the runtime-size version.
template
inline __device__ void loadArrayParallel(T* __restrict__ a1, const int off1, const T* __restrict__ a2, const int off2)
{
  loadArrayParallel(a1, off1, a2, off2, N);
}
}  // namespace p1

namespace p2  // parallelize using 2 indices and steps
{
// NOTE(review): the "template <>" specializations below lost their
// <Parallel2Dir::...> arguments during extraction -- restore from VCS.
enum class Parallel2Dir : int
{
  THREAD_XY = 0,
  THREAD_XZ,
  THREAD_YZ,
  THREAD_YX,
  THREAD_ZX,
  THREAD_ZY,
  NONE
};

// Writes two independent (index, stride) pairs for the requested direction
// pair; on the host both collapse to serial 0 / 1.
template
inline __host__ __device__ void getParallel2DIndex(int& p1_index, int& p2_index, int& p1_step, int& p2_step)
{
#ifndef __CUDA_ARCH__
  p1_index = 0;
  p2_index = 0;
  p1_step = 1;
  p2_step = 1;
#endif
}
// THREAD_XY
template <>
inline __device__ void getParallel2DIndex(int& p1_index, int& p2_index, int& p1_step, int& p2_step)
{
#ifdef __CUDA_ARCH__
  p1_index = threadIdx.x;
  p1_step = blockDim.x;
  p2_index = threadIdx.y;
  p2_step = blockDim.y;
#else
  p1_index = 0;
  p2_index = 0;
  p1_step = 1;
  p2_step = 1;
#endif
}
// THREAD_YZ
template <>
inline __device__ void getParallel2DIndex(int& p1_index, int& p2_index, int& p1_step, int& p2_step)
{
#ifdef __CUDA_ARCH__
  p1_index = threadIdx.y;
  p1_step = blockDim.y;
  p2_index = threadIdx.z;
  p2_step = blockDim.z;
#else
  p1_index = 0;
  p2_index = 0;
  p1_step = 1;
  p2_step = 1;
#endif
}
// THREAD_XZ
template <>
inline __device__ void getParallel2DIndex(int& p1_index, int& p2_index, int& p1_step, int& p2_step)
{
#ifdef __CUDA_ARCH__
  p1_index = threadIdx.x;
  p1_step = blockDim.x;
  p2_index = threadIdx.z;
  p2_step = blockDim.z;
#else
  p1_index = 0;
  p2_index = 0;
  p1_step = 1;
  p2_step = 1;
#endif
}
// THREAD_YX
template <>
inline __device__ void getParallel2DIndex(int& p1_index, int& p2_index, int& p1_step, int& p2_step)
{
#ifdef __CUDA_ARCH__
  p1_index = threadIdx.y;
  p1_step = blockDim.y;
  p2_index = threadIdx.x;
  p2_step = blockDim.x;
#else
  p1_index = 0;
  p2_index = 0;
  p1_step = 1;
  p2_step = 1;
#endif
}
// THREAD_ZY
template <>
inline __device__ void getParallel2DIndex(int& p1_index, int& p2_index,
int& p1_step, int& p2_step)
{
#ifdef __CUDA_ARCH__
  p1_index = threadIdx.z;
  p1_step = blockDim.z;
  p2_index = threadIdx.y;
  p2_step = blockDim.y;
#else
  p1_index = 0;
  p2_index = 0;
  p1_step = 1;
  p2_step = 1;
#endif
}
// THREAD_ZX
template <>
inline __device__ void getParallel2DIndex(int& p1_index, int& p2_index, int& p1_step, int& p2_step)
{
#ifdef __CUDA_ARCH__
  p1_index = threadIdx.z;
  p1_step = blockDim.z;
  p2_index = threadIdx.x;
  p2_step = blockDim.x;
#else
  p1_index = 0;
  p2_index = 0;
  p1_step = 1;
  p2_step = 1;
#endif
}
// NONE: serial fallback
template <>
inline __device__ void getParallel2DIndex(int& p1_index, int& p2_index, int& p1_step, int& p2_step)
{
  p1_index = 0;
  p1_step = 1;
  p2_index = 0;
  p2_step = 1;
}
}  // namespace p2
}  // namespace mppi

================================================
FILE: include/mppi/utils/risk_utils.cu
================================================
#include
// #include
// #include
#include

namespace mppi
{
// In-place insertion sort, ascending. Used on the device where std::sort is
// unavailable; O(N^2) but fine for the small cost arrays this operates on.
// NOTE(review): the template header lost its <...> (likely "template <class T>")
// during extraction, as did the bare #include lines above -- restore from VCS.
template
__host__ __device__ void insertionSort(T* __restrict__ array, const int N)
{
  T temp;
  int j;
  for (int i = 1; i < N; i++)
  {
    temp = array[i];
    j = i - 1;
    // shift larger elements right until temp's slot is found
    while (j >= 0 && array[j] > temp)
    {
      array[j + 1] = array[j];
      --j;
    }
    array[j + 1] = temp;
  }
}

// Value at Risk: sorts `costs` in place (side effect!), then linearly
// interpolates at fractional rank h_index(num_costs, alpha).
__host__ __device__ float RiskMeasure::var(float* __restrict__ costs, const int num_costs, float alpha)
{
  float cost = 0.0f;
#ifdef __CUDA_ARCH__
  // thrust::device_ptr thrust_ptr = thrust::device_pointer_cast(costs);
  // thrust::sort(thrust::seq, thrust_ptr, thrust_ptr + num_costs);
  insertionSort<>(costs, num_costs);
#else
  std::sort(costs, costs + num_costs);
#endif
  float h_idx = h_index(num_costs, alpha);
  // clamp the bracketing indices so alpha = 0 / 1 stay in range
  int next_idx = min((int)ceilf(h_idx), num_costs - 1);
  int prev_idx = max((int)floorf(h_idx), 0);
  cost = costs[prev_idx] + (h_idx - prev_idx) * (costs[next_idx] - costs[prev_idx]);
  return cost;
}

// Conditional Value at Risk: mean of the interpolated VaR plus all samples at
// rank >= ceil(h_idx).
__host__ __device__ float RiskMeasure::cvar(float* __restrict__ costs, const int num_costs, float alpha)
{
  float cost = 0.0f;
  float value_at_risk = var(costs, num_costs, alpha);  // also sorts costs
  int num_costs_above = 1;
  float
sum_costs_above = value_at_risk;
  float h_idx = h_index(num_costs, alpha);
  // NOTE(review): when h_idx is an integer, costs[(int)h_idx] == value_at_risk
  // and is added again by this loop, so that sample is weighted twice in the
  // average -- confirm this is intentional.
  for (int i = ceilf(h_idx); i < num_costs; i++)
  {
    num_costs_above++;
    sum_costs_above += costs[i];
  }
  cost = sum_costs_above / num_costs_above;
  return cost;
}
}  // namespace mppi

================================================
FILE: include/mppi/utils/risk_utils.cuh
================================================
#pragma once

namespace mppi
{
// Risk-shaping functions that collapse an array of sampled costs into a single
// scalar (max, min, mean, median, VaR, CVaR).
class RiskMeasure
{
public:
  enum FUNC_TYPE : int
  {
    MAX = 0,
    MIN,
    VAR,
    CVAR,
    MEAN,
    MEDIAN,
    NUM_FUNCS
  };

  // instance-level defaults used by the non-static shaping_func overload
  FUNC_TYPE func_ = CVAR;
  float alpha_ = 0.8;

  __host__ __device__ float shaping_func(float* __restrict__ costs, const int num_costs)
  {
    return shaping_func(costs, num_costs, func_, alpha_);
  }

  // Dispatches to the requested risk measure. NOTE: VaR/CVaR/median sort
  // `costs` in place, so the array may be reordered on return.
  static __host__ __device__ float shaping_func(float* __restrict__ costs, const int num_costs,
                                                const FUNC_TYPE type = MEAN, const float risk_tolerance = 0.5f)
  {
    float cost = 0.0f;
    if (num_costs == 1)
    {
      return costs[0];
    }
    switch (type)
    {
      case CVAR:
        cost = cvar(costs, num_costs, risk_tolerance);
        break;
      case MAX:
        cost = max_measure(costs, num_costs);
        break;
      case MEDIAN:
        // median == VaR at alpha = 0.5
        cost = var(costs, num_costs, 0.5f);
        break;
      case MIN:
        cost = min_measure(costs, num_costs);
        break;
      case VAR:
        cost = var(costs, num_costs, risk_tolerance);
        break;
      default:  // go to mean case
      case MEAN:
        cost = mean_measure(costs, num_costs);
        break;
    }
    return cost;
  }

  static __host__ __device__ float max_measure(const float* __restrict__ costs, const int num_costs)
  {
    float max_cost = costs[0];
    for (int i = 1; i < num_costs; i++)
    {
      if (costs[i] > max_cost)
      {
        max_cost = costs[i];
      }
    }
    return max_cost;
  }

  static __host__ __device__ float min_measure(const float* __restrict__ costs, const int num_costs)
  {
    float min_cost = costs[0];
    for (int i = 1; i < num_costs; i++)
    {
      if (costs[i] < min_cost)
      {
        min_cost = costs[i];
      }
    }
    return min_cost;
  }

  static __host__ __device__ float mean_measure(const float* __restrict__ costs, const int num_costs)
  {
    float cost = 0.0f;
    for (int i = 0; i < num_costs; i++)
    {
      cost += costs[i];
    }
return cost / num_costs;
  }

  // fractional rank of the alpha-quantile within a sorted array of num_costs
  static __host__ __device__ float h_index(const int num_costs, const float alpha)
  {
    return alpha * (num_costs - 1);
  }

  // defined in risk_utils.cu; both sort `costs` in place
  static __host__ __device__ float var(float* __restrict__ costs, const int num_costs, float alpha);
  static __host__ __device__ float cvar(float* __restrict__ costs, const int num_costs, float alpha);
};
}  // namespace mppi

#ifdef __CUDACC__
#include "risk_utils.cu"
#endif

================================================
FILE: include/mppi/utils/test_helper.h
================================================
//
// Created by mgandhi3 on 1/13/20.
//

#ifndef MPPIGENERIC_KERNEL_TESTS_TEST_HELPER_H
#define MPPIGENERIC_KERNEL_TESTS_TEST_HELPER_H

// NOTE(review): the four #include lines below lost their <...> payloads during
// extraction (this header uses gtest macros, <vector>, <array>, Eigen) -- restore from VCS.
#include
#include
#include
#include

// Asserts element-wise float equality of two vectors, after checking both
// match the expected size. gtest ASSERT_* macros: aborts the test on mismatch.
inline void array_assert_float_eq(const std::vector& known, const std::vector& compute, int size)
{
  ASSERT_EQ(compute.size(), size) << "The computed vector size is not the given size!";
  ASSERT_EQ(known.size(), compute.size()) << "Two vectors are not the same size!";
  for (int i = 0; i < size; i++)
  {
    ASSERT_FLOAT_EQ(known[i], compute[i]) << "Failed at index: " << i;
  }
}

// Asserts every element of `compute` equals the single expected value.
inline void array_assert_float_eq(const float known, const std::vector& compute, const int size)
{
  ASSERT_EQ(compute.size(), size) << "The computed vector size is not the given size!";
  for (int i = 0; i < size; i++)
  {
    ASSERT_FLOAT_EQ(known, compute[i]) << "Failed at index: " << i;
  }
}

// std::array variant; treats NaN==NaN and inf==inf as matches rather than
// letting ASSERT_FLOAT_EQ fail on non-finite values.
template
inline void array_assert_float_eq(const std::array& known, const std::array& compute)
{
  ASSERT_EQ(compute.size(), size) << "The computed array size is not the given size!";
  for (int i = 0; i < size; i++)
  {
    if (isnan(known[i]) || isnan(compute[i]))
    {
      ASSERT_EQ(isnan(known[i]), isnan(compute[i])) << "NaN check failed at index: " << i;
    }
    else if (isinf(known[i]) || isinf(compute[i]))
    {
      ASSERT_EQ(isinf(known[i]), isinf(compute[i])) << "inf check failed at index: " << i;
    }
    else
    {
      ASSERT_FLOAT_EQ(known[i], compute[i]) << "Failed at index: " << i;
    }
  }
}

// Non-fatal (EXPECT_*) element-wise float comparison for std::array.
template
inline void array_expect_float_eq(const
std::array& known, const std::array& compute)
{
  ASSERT_EQ(compute.size(), size) << "The computed array size is not the given size!";
  for (int i = 0; i < size; i++)
  {
    EXPECT_FLOAT_EQ(known[i], compute[i]) << "Failed at index: " << i;
  }
}

// Non-fatal element-wise comparison with an absolute tolerance.
template
inline void array_expect_near(const std::array& known, const std::array& compute, float tol)
{
  ASSERT_EQ(compute.size(), size) << "The computed array size is not the given size!";
  for (int i = 0; i < size; i++)
  {
    EXPECT_NEAR(known[i], compute[i], tol) << "Failed at index: " << i;
  }
}

// Asserts every element of the std::array equals the single expected value.
template
inline void array_assert_float_eq(const float known, const std::array& compute)
{
  ASSERT_EQ(compute.size(), size) << "The computed array size is not the given size!";
  for (int i = 0; i < size; i++)
  {
    ASSERT_FLOAT_EQ(known, compute[i]) << "Failed at index: " << i;
  }
}

// Element-wise float equality over the raw storage of two Eigen objects.
template
inline void eigen_assert_float_eq(const Eigen::Ref& known, const Eigen::Ref& compute)
{
  for (int i = 0; i < known.size(); i++)
  {
    ASSERT_FLOAT_EQ(known.data()[i], compute.data()[i]) << "Failed at index: " << i;
  }
}

// Relative-tolerance comparison: bound is tol * known[i].
// NOTE(review): a zero or negative reference value makes the bound
// non-positive, so the assert can only pass on exact equality (or always
// fails for negative values) -- confirm callers only use positive references.
template
inline void eigen_assert_float_near(const Eigen::Ref& known, const Eigen::Ref& compute, float tol)
{
  for (int i = 0; i < known.size(); i++)
  {
    ASSERT_NEAR(known.data()[i], compute.data()[i], tol * known.data()[i]) << "Failed at index: " << i;
  }
}

// Absolute-tolerance element-wise comparison for std::array.
template
inline void array_assert_float_near(const std::array& known, const std::array& compute, float tol)
{
  ASSERT_EQ(compute.size(), size) << "The computed array size is not the given size!";
  for (int i = 0; i < size; i++)
  {
    ASSERT_NEAR(known[i], compute[i], tol) << "Failed at index: " << i;
  }
}

#endif  // MPPIGENERIC_KERNEL_TESTS_TEST_HELPER_H

================================================
FILE: include/mppi/utils/texture_helpers/texture_helper.cu
================================================
#include "texture_helper.cuh"

// NOTE(review): every "template" below lost its <...> parameter list during
// extraction (this is a templated CRTP helper) -- restore from VCS.

// Allocates CPU-side bookkeeping for `number` textures; no GPU work yet.
template
TextureHelper::TextureHelper(int number, cudaStream_t stream) : Managed(stream), size_(number)
{
  textures_.resize(number);
  textures_buffer_.resize(number);
cpu_values_.resize(number); cpu_buffer_values_.resize(number); textures_d_ = textures_.data(); } template TextureHelper::~TextureHelper() { freeCudaMem(); } template void TextureHelper::GPUSetup() { if (!GPUMemStatus_) { TEX_T* derived = static_cast(this); ptr_d_ = Managed::GPUSetup(derived); // allocates memory to access params on the GPU by pointer HANDLE_ERROR(cudaMalloc(¶ms_d_, sizeof(TextureParams) * textures_.size())); HANDLE_ERROR(cudaMemcpyAsync(&(ptr_d_->textures_d_), &(params_d_), sizeof(TextureParams*), cudaMemcpyHostToDevice, this->stream_)); copyToDevice(true); } else { std::cout << "GPU Memory already set" << std::endl; } } template void TextureHelper::freeCudaMem() { if (this->GPUMemStatus_) { for (int index = 0; index < textures_.size(); index++) { freeCudaMem(textures_[index]); } if (params_d_ != nullptr) { HANDLE_ERROR(cudaFree(params_d_)); } if (ptr_d_ != nullptr) { HANDLE_ERROR(cudaFree(ptr_d_)); } } this->GPUMemStatus_ = false; this->params_d_ = nullptr; this->ptr_d_ = nullptr; CudaCheckError(); } template void TextureHelper::freeCudaMem(TextureParams& texture) { if (texture.allocated) { HANDLE_ERROR(cudaFreeArray(texture.array_d)); HANDLE_ERROR(cudaDestroyTextureObject(texture.tex_d)); texture.allocated = false; texture.array_d = nullptr; texture.tex_d = 0; } } template void TextureHelper::allocateCudaTexture(int index) { // if already allocated, deallocate if (this->GPUMemStatus_ && textures_[index].allocated) { freeCudaMem(textures_[index]); } } template __host__ __device__ void TextureHelper::bodyOffsetToWorldPose(const float3& offset, const float3& body_pose, const float3& rotation, float3& output) { mppi::math::bodyOffsetToWorldPoseEuler(offset, body_pose, rotation, output); } template __host__ __device__ void TextureHelper::worldPoseToMapPose(const int index, const float3& input, float3& output) { float3 diff = make_float3(input.x - textures_d_[index].origin.x, input.y - textures_d_[index].origin.y, input.z - 
textures_d_[index].origin.z); float3* rotation_mat_ptr = textures_d_[index].rotations; output.x = (rotation_mat_ptr[0].x * diff.x + rotation_mat_ptr[0].y * diff.y + rotation_mat_ptr[0].z * diff.z); output.y = (rotation_mat_ptr[1].x * diff.x + rotation_mat_ptr[1].y * diff.y + rotation_mat_ptr[1].z * diff.z); output.z = (rotation_mat_ptr[2].x * diff.x + rotation_mat_ptr[2].y * diff.y + rotation_mat_ptr[2].z * diff.z); } template __host__ __device__ void TextureHelper::mapPoseToTexCoord(const int index, const float3& input, float3& output) { // printf("res %f %f %f extent %f %f %f\n", textures_d_[index].resolution.x, textures_d_[index].resolution.y, // textures_d_[index].resolution.z, textures_d_[index].extent.width, textures_d_[index].extent.depth); // from map frame to pixels [m] -> [px] output.x = input.x / textures_d_[index].resolution.x; output.y = input.y / textures_d_[index].resolution.y; output.z = input.z / textures_d_[index].resolution.z; // normalize pixel values output.x /= textures_d_[index].extent.width; output.y /= textures_d_[index].extent.height; if (textures_d_[index].extent.depth != 0) { output.z /= textures_d_[index].extent.depth; } } template __host__ __device__ void TextureHelper::worldPoseToTexCoord(const int index, const float3& input, float3& output) { float3 map; worldPoseToMapPose(index, input, map); mapPoseToTexCoord(index, map, output); // printf("world to map (%f, %f, %f) -> (%f, %f, %f) -> (%f, %f, %f)\n", input.x, input.y, input.z, map.x, map.y, // map.z, output.x, output.y, output.z); } template __host__ __device__ void TextureHelper::bodyOffsetWorldToTexCoord(const int index, const float3& offset, const float3& body_pose, const float3& rotation, float3& output) { float3 offset_result; bodyOffsetToWorldPose(offset, body_pose, rotation, offset_result); worldPoseToTexCoord(index, offset_result, output); } template void TextureHelper::copyToDevice(bool synchronize) { // TODO lock the buffer // copies the buffer to the CPU side version for 
(int i = 0; i < textures_buffer_.size(); i++) { if (textures_buffer_[i].update_params) { // copy over params from buffer to object textures_[i].origin = textures_buffer_[i].origin; textures_[i].rotations[0] = textures_buffer_[i].rotations[0]; textures_[i].rotations[1] = textures_buffer_[i].rotations[1]; textures_[i].rotations[2] = textures_buffer_[i].rotations[2]; textures_[i].resolution = textures_buffer_[i].resolution; textures_[i].update_params = true; textures_buffer_[i].update_params = false; } // copy the relevant things over from buffer if (textures_buffer_[i].update_data) { // moves data from cpu buffer to cpu side cpu_values_[i] = std::move(cpu_buffer_values_[i]); // cpu_buffer_values are resized in the updateTexture method textures_[i].update_data = true; textures_buffer_[i].update_data = false; textures_[i].use = textures_buffer_[i].use; } if (textures_buffer_[i].update_mem) { textures_[i].extent = textures_buffer_[i].extent; textures_[i].texDesc = textures_buffer_[i].texDesc; textures_[i].update_mem = true; textures_buffer_[i].update_mem = false; } } // TODO unlock buffer if (!this->GPUMemStatus_) { return; } // goes through and checks what needs to be copied and does it TEX_T* derived = static_cast(this); for (int i = 0; i < textures_.size(); i++) { TextureParams* param = &textures_[i]; // do the allocation and texture creation if (param->update_mem) { derived->allocateCudaTexture(i); derived->createCudaTexture(i, false); } // if allocated if (param->allocated) { // if we have new parameter values copy it over if (param->update_params) { derived->copyParamsToGPU(i, false); } // if we have updated data copy it over if (param->update_data) { // copies data to the GPU derived->copyDataToGPU(i, false); } } } if (synchronize) { cudaStreamSynchronize(this->stream_); } } template void TextureHelper::createCudaTexture(int index, bool sync) { TextureParams* cpu_param = &textures_[index]; cpu_param->resDesc.res.array.array = cpu_param->array_d; 
HANDLE_ERROR(cudaCreateTextureObject(&(cpu_param->tex_d), &cpu_param->resDesc, &cpu_param->texDesc, NULL)); cpu_param->allocated = true; cpu_param->update_mem = false; copyParamsToGPU(index, sync); } template void TextureHelper::addNewTexture(const cudaExtent& extent) { // update the buffer not the actual textures textures_buffer_.resize(textures_buffer_.size() + 1); textures_.resize(textures_.size() + 1); textures_buffer_.back().extent = extent; textures_.back().extent = extent; size_ = textures_.size(); if (this->GPUMemStatus_) { TEX_T* derived = static_cast(this); int index = textures_.size() - 1; // TODO resize the device side array that stores textures derived->allocateCudaTexture(index); derived->createCudaTexture(index); textures_.back().allocated = true; } } template __host__ __device__ DATA_T TextureHelper::queryTextureAtWorldOffsetPose(const int index, const float3& input, const float3& offset, const float3& rotation) { float3 tex_coords; bodyOffsetWorldToTexCoord(index, offset, input, rotation, tex_coords); TEX_T* derived = static_cast(this); return derived->queryTexture(index, tex_coords); } template __host__ __device__ DATA_T TextureHelper::queryTextureAtWorldPose(const int index, const float3& input) { float3 tex_coords; worldPoseToTexCoord(index, input, tex_coords); TEX_T* derived = static_cast(this); return derived->queryTexture(index, tex_coords); } template __host__ __device__ DATA_T TextureHelper::queryTextureAtMapPose(const int index, const float3& input) { float3 tex_coords; mapPoseToTexCoord(index, input, tex_coords); TEX_T* derived = static_cast(this); return derived->queryTexture(index, tex_coords); } template void TextureHelper::updateOrigin(int index, float3 new_origin) { this->textures_buffer_[index].origin = new_origin; this->textures_buffer_[index].update_params = true; } template void TextureHelper::updateRotation(int index, std::array& new_rotation) { this->textures_buffer_[index].rotations[0] = new_rotation[0]; 
this->textures_buffer_[index].rotations[1] = new_rotation[1]; this->textures_buffer_[index].rotations[2] = new_rotation[2]; this->textures_buffer_[index].update_params = true; } template void TextureHelper::updateResolution(int index, float resolution) { this->textures_buffer_[index].resolution.x = resolution; this->textures_buffer_[index].resolution.y = resolution; this->textures_buffer_[index].resolution.z = resolution; this->textures_buffer_[index].update_params = true; } template void TextureHelper::updateResolution(int index, float3 resolution) { this->textures_buffer_[index].resolution.x = resolution.x; this->textures_buffer_[index].resolution.y = resolution.y; this->textures_buffer_[index].resolution.z = resolution.z; this->textures_buffer_[index].update_params = true; } template bool TextureHelper::setExtent(int index, cudaExtent& extent) { // checks if the extent has changed and reallocates if yes TextureParams* param = &textures_buffer_[index]; if (param->extent.width != extent.width || param->extent.height != extent.height || param->extent.depth != extent.depth) { // flag to update mem next time we should param->update_mem = true; this->textures_buffer_[index].extent = extent; return true; } return false; } template void TextureHelper::copyParamsToGPU(int index, bool sync) { TextureParams* cpu_param = &textures_[index]; // Copy entire param structure over from CPU to GPU HANDLE_ERROR(cudaMemcpyAsync(&(params_d_[index]), cpu_param, sizeof(TextureParams), cudaMemcpyHostToDevice, this->stream_)); cpu_param->update_params = false; if (sync) { cudaStreamSynchronize(this->stream_); } } template void TextureHelper::updateAddressMode(int index, cudaTextureAddressMode mode) { this->textures_buffer_[index].texDesc.addressMode[0] = mode; this->textures_buffer_[index].texDesc.addressMode[1] = mode; this->textures_buffer_[index].texDesc.addressMode[2] = mode; this->textures_buffer_[index].update_mem = true; } template void TextureHelper::updateAddressMode(int index, 
int layer, cudaTextureAddressMode mode) { this->textures_buffer_[index].texDesc.addressMode[layer] = mode; this->textures_buffer_[index].update_mem = true; } template void TextureHelper::updateBorder(int index, float value) { this->textures_buffer_[index].texDesc.borderColor[0] = value; this->textures_buffer_[index].texDesc.borderColor[1] = value; this->textures_buffer_[index].texDesc.borderColor[2] = value; this->textures_buffer_[index].texDesc.borderColor[3] = value; this->textures_buffer_[index].update_mem = true; } template void TextureHelper::updateBorder(int index, int layer, float value) { this->textures_buffer_[index].texDesc.borderColor[layer] = value; this->textures_buffer_[index].update_mem = true; } ================================================ FILE: include/mppi/utils/texture_helpers/texture_helper.cuh ================================================ // // Created by jason on 1/5/22. // #ifndef MPPIGENERIC_TEXTURE_HELPER_CUH #define MPPIGENERIC_TEXTURE_HELPER_CUH #include #include #include #include template struct TextureParams { cudaExtent extent; cudaArray* array_d = nullptr; cudaTextureObject_t tex_d = 0; cudaChannelFormatDesc channelDesc; cudaResourceDesc resDesc; cudaTextureDesc texDesc; float3 origin; float3 rotations[3]; float3 resolution; bool use = false; // indicates that the texture is to be used or not, separate from allocation bool allocated = false; // indicates that the texture has been allocated on the GPU bool update_data = false; bool update_mem = false; // indicates the GPU structure should be updated at the next convenient time bool update_params = false; TextureParams() { extent = make_cudaExtent(1, 1, 1); resDesc.resType = cudaResourceTypeArray; channelDesc = cudaCreateChannelDesc(); // clamp memset(&texDesc, 0, sizeof(texDesc)); texDesc.addressMode[0] = cudaAddressModeClamp; texDesc.addressMode[1] = cudaAddressModeClamp; texDesc.addressMode[2] = cudaAddressModeClamp; texDesc.borderColor[0] = 0.0f; texDesc.borderColor[1] = 
0.0f; texDesc.borderColor[2] = 0.0f; texDesc.borderColor[3] = 0.0f; texDesc.filterMode = cudaFilterModeLinear; texDesc.readMode = cudaReadModeElementType; texDesc.normalizedCoords = 1; origin = make_float3(0.0, 0.0, 0.0); rotations[0] = make_float3(1, 0, 0); rotations[1] = make_float3(0, 1, 0); rotations[2] = make_float3(0, 0, 1); resolution = make_float3(1, 1, 1); } }; template class TextureHelper : public Managed { protected: TextureHelper(int number, cudaStream_t stream = 0); public: virtual ~TextureHelper(); void GPUSetup(); static void freeCudaMem(TextureParams& texture); virtual void freeCudaMem(); /** * helper method to deallocate the index before allocating new ones */ virtual void allocateCudaTexture(int index); /** * helper method to create a cuda texture * @param index */ virtual void createCudaTexture(int index, bool sync = true); /** * Copies texture information to the GPU version of the object */ virtual void copyToDevice(bool synchronize = false); /** * */ virtual void addNewTexture(const cudaExtent& extent); __host__ __device__ void bodyOffsetToWorldPose(const float3& offset, const float3& body_pose, const float3& rotation, float3& output); __host__ __device__ void worldPoseToMapPose(const int index, const float3& input, float3& output); __host__ __device__ void mapPoseToTexCoord(const int index, const float3& input, float3& output); __host__ __device__ void worldPoseToTexCoord(const int index, const float3& input, float3& output); __host__ __device__ void bodyOffsetWorldToTexCoord(const int index, const float3& offset, const float3& body_pose, const float3& rotation, float3& output); __host__ __device__ DATA_T queryTextureAtWorldOffsetPose(const int index, const float3& input, const float3& offset, const float3& rotation); __host__ __device__ DATA_T queryTextureAtWorldPose(const int index, const float3& input); __host__ __device__ DATA_T queryTextureAtMapPose(const int index, const float3& input); virtual void updateOrigin(int index, float3 
new_origin); virtual void updateRotation(int index, std::array& new_rotation); virtual void updateResolution(int index, float resolution); virtual void updateResolution(int index, float3 resolution); virtual bool setExtent(int index, cudaExtent& extent); virtual void copyDataToGPU(int index, bool sync = false) = 0; virtual void copyParamsToGPU(int index, bool sync = false); virtual void enableTexture(int index) { this->textures_buffer_[index].update_params = true; this->textures_buffer_[index].use = true; } virtual void disableTexture(int index) { this->textures_buffer_[index].update_params = true; this->textures_buffer_[index].use = false; } __device__ __host__ bool checkTextureUse(int index) const { return this->textures_d_[index].use; } void updateAddressMode(int index, cudaTextureAddressMode mode); void updateAddressMode(int index, int layer, cudaTextureAddressMode mode); void updateBorder(int index, float value); void updateBorder(int index, int layer, float value); std::vector> getTextures() { return textures_; } std::vector> getBufferTextures() { return textures_buffer_; } std::vector> getCpuValues() { return cpu_values_; } std::vector> getCpuBufferValues() { return cpu_buffer_values_; } void updateDataAtIndex(int index) { this->textures_buffer_[index].update_data = true; } __device__ __host__ int size() { return size_; } TEX_T* ptr_d_ = nullptr; __host__ __device__ float3 getOrigin(int index) const { return this->textures_d_[index].origin; } __host__ __device__ float3 getResolution(int index) const { return this->textures_d_[index].resolution; } __host__ __device__ cudaExtent getExtent(int index) const { return this->textures_d_[index].extent; } protected: // std::mutex buffer_lck_; // stores the values we actually use std::vector> textures_; // buffer that can be updated async, does not have correct cuda memory locations std::vector> textures_buffer_; // memory allocation can vary depending on 1D, 2D, or 3D implementation std::vector> cpu_values_; 
std::vector> cpu_buffer_values_; // helper, on CPU points to vector data (textures_.data()), on GPU points to device copy (params_d_ variable) TextureParams* textures_d_ = nullptr; // device pointer to the parameters malloced memory TextureParams* params_d_ = nullptr; int size_ = 0; }; #if __CUDACC__ #include "texture_helper.cu" #endif #endif // MPPIGENERIC_TEXTURE_HELPER_CUH ================================================ FILE: include/mppi/utils/texture_helpers/three_d_texture_helper.cu ================================================ #include "three_d_texture_helper.cuh" template ThreeDTextureHelper::ThreeDTextureHelper(int number, bool synched, cudaStream_t stream) : TextureHelper, DATA_T>(number, stream) { layer_copy_.resize(number); this->synched_ = synched; for (std::vector& layer : layer_copy_) { // sets all current indexes to be true std::fill(layer.begin(), layer.end(), false); } } template void ThreeDTextureHelper::allocateCudaTexture(int index) { TextureHelper, DATA_T>::allocateCudaTexture(index); TextureParams* param = &this->textures_[index]; // TODO check to make sure our alloc is correct, i.e. 
extent is valid HANDLE_ERROR(cudaMalloc3DArray(&(param->array_d), &(param->channelDesc), param->extent)); } template void ThreeDTextureHelper::updateTexture(const int index, const int z_index, std::vector& values, bool column_major) { TextureParams* param = &this->textures_buffer_[index]; int w = param->extent.width; int h = param->extent.height; int d = param->extent.depth; // check that the sizes are correct if (values.size() != w * h) { throw std::runtime_error(std::string("Error: invalid size to updateTexture ") + std::to_string(values.size()) + " != " + std::to_string(w * h)); } // TODO needs to be in the data format used for textures if (this->cpu_buffer_values_[index].size() != w * h * d) { this->cpu_buffer_values_[index].resize(w * h * d); // copies values back to the buffer if it has been recently moved std::copy(this->cpu_values_[index].begin(), this->cpu_values_[index].end(), this->cpu_buffer_values_[index].begin()); } // copy over values to cpu side holder if (column_major) { for (int j = 0; j < w; j++) { for (int i = 0; i < h; i++) { int columnMajorIndex = j * h + i; int rowMajorIndex = (w * h * z_index) + i * w + j; this->cpu_buffer_values_[index][rowMajorIndex] = values[columnMajorIndex]; } } } else { auto start = this->cpu_buffer_values_[index].begin() + (w * h * z_index); std::copy(values.begin(), values.end(), start); } // tells the object to copy it over next time that happens layer_copy_[index][z_index] = true; if (!synched_) { param->update_data = true; } } template void ThreeDTextureHelper::updateTexture( const int index, const int z_index, const Eigen::Ref, 0, Eigen::Stride> values, bool column_major) { TextureParams* param = &this->textures_buffer_[index]; int w = param->extent.width; int h = param->extent.height; int d = param->extent.depth; // check that the sizes are correct if (values.size() != w * h) { throw std::runtime_error(std::string("Error: invalid size to updateTexture ") + std::to_string(values.size()) + " != " + 
std::to_string(w * h)); } if (this->cpu_buffer_values_[index].size() != w * h * d) { this->cpu_buffer_values_[index].resize(w * h * d); // copies values back to the buffer if it has been recently moved std::copy(this->cpu_values_[index].begin(), this->cpu_values_[index].end(), this->cpu_buffer_values_[index].begin()); } // copy over values to cpu side holder if (column_major) { for (int j = 0; j < w; j++) { for (int i = 0; i < h; i++) { int columnMajorIndex = j * h + i; int rowMajorIndex = (w * h * z_index) + i * w + j; this->cpu_buffer_values_[index][rowMajorIndex] = values.data()[columnMajorIndex]; } } } else { auto start = this->cpu_buffer_values_[index].data() + (w * h * z_index); memcpy(start, values.data(), values.size() * sizeof(DATA_T)); } // tells the object to copy it over next time that happens layer_copy_[index][z_index] = true; if (!synched_) { param->update_data = true; } } // TODO update texture where everything is copied over in one go template __host__ __device__ DATA_T ThreeDTextureHelper::queryTexture(const int index, const float3& point) { #ifdef __CUDA_ARCH__ return tex3D(this->textures_d_[index].tex_d, point.x, point.y, point.z); #else TextureParams* param = &this->textures_[index]; float3 query = make_float3(point.x * param->extent.width, point.y * param->extent.height, point.z * param->extent.depth); query.x = query.x - 0.5f; query.y = query.y - 0.5f; query.z = query.z - 0.5f; if (param->texDesc.addressMode[0] == cudaAddressModeClamp) { if (query.x > param->extent.width - 1) { query.x = param->extent.width - 1; } else if (query.x <= 0.0f) { query.x = 0.0; } } else if (param->texDesc.addressMode[0] == cudaAddressModeBorder) { if (query.x > param->extent.width - 1 || query.x <= 0.0) { return createPartialCudaTuple(param->texDesc.borderColor[0], param->texDesc.borderColor[1], param->texDesc.borderColor[2], param->texDesc.borderColor[3]); } } else { throw std::runtime_error(std::string("using unsupported address mode on the CPU in texture 
utils")); } if (param->texDesc.addressMode[1] == cudaAddressModeClamp) { if (query.y > param->extent.height - 1) { query.y = param->extent.height - 1; } else if (query.y <= 0.0f) { query.y = 0.0; } } else if (param->texDesc.addressMode[1] == cudaAddressModeBorder) { if (query.y > param->extent.height - 1 || query.y <= 0.0) { return createPartialCudaTuple(param->texDesc.borderColor[0], param->texDesc.borderColor[1], param->texDesc.borderColor[2], param->texDesc.borderColor[3]); } } else { throw std::runtime_error(std::string("using unsupported address mode on the CPU in texture utils")); } if (param->texDesc.addressMode[2] == cudaAddressModeClamp) { if (query.z > param->extent.depth - 1) { query.z = param->extent.depth - 1; } else if (query.z <= 0.0f) { query.z = 0.0; } } else if (param->texDesc.addressMode[2] == cudaAddressModeBorder) { if (query.z > param->extent.depth - 1 || query.z <= 0.0) { return createPartialCudaTuple(param->texDesc.borderColor[0], param->texDesc.borderColor[1], param->texDesc.borderColor[2], param->texDesc.borderColor[3]); } } else if (param->texDesc.addressMode[2] == cudaAddressModeWrap) { while (query.z > param->extent.depth - 1) { query.z -= param->extent.depth - 1; } while (query.z < 0.0f) { query.z += param->extent.depth - 1; } } else { throw std::runtime_error(std::string("using unsupported address mode on the CPU in texture utils")); } const int w = param->extent.width; const int h = param->extent.height; if (param->texDesc.filterMode == cudaFilterModeLinear) { // the value is distributed evenly in the space starting at half a cell from 0.0 int x_min = std::min((int)std::floor(query.x), w - 2); int x_max = x_min + 1; int y_min = std::min((int)std::floor(query.y), h - 2); int y_max = y_min + 1; int z_min = std::min((int)std::floor(query.z), (int)param->extent.depth - 2); int z_max = z_min + 1; float x_d = (query.x - x_min) / (x_max - x_min); float y_d = (query.y - y_min) / (y_max - y_min); float z_d = (query.z - z_min) / (z_max - 
z_min); /** * does trilinear interpolation https://en.wikipedia.org/wiki/Trilinear_interpolation */ // Query corners of a cube DATA_T c_000 = this->cpu_values_[index][(z_min * h + y_min) * w + x_min]; DATA_T c_100 = this->cpu_values_[index][(z_min * h + y_min) * w + x_max]; DATA_T c_010 = this->cpu_values_[index][(z_min * h + y_max) * w + x_min]; DATA_T c_001 = this->cpu_values_[index][(z_max * h + y_min) * w + x_min]; DATA_T c_110 = this->cpu_values_[index][(z_min * h + y_max) * w + x_max]; DATA_T c_101 = this->cpu_values_[index][(z_max * h + y_min) * w + x_max]; DATA_T c_011 = this->cpu_values_[index][(z_max * h + y_max) * w + x_min]; DATA_T c_111 = this->cpu_values_[index][(z_max * h + y_max) * w + x_max]; // interpolate along x to make a square DATA_T c_00 = c_000 * (1 - x_d) + c_100 * x_d; DATA_T c_01 = c_001 * (1 - x_d) + c_101 * x_d; DATA_T c_10 = c_010 * (1 - x_d) + c_110 * x_d; DATA_T c_11 = c_011 * (1 - x_d) + c_111 * x_d; // inperpolate along y to make a line DATA_T c_0 = c_00 * (1 - y_d) + c_10 * y_d; DATA_T c_1 = c_01 * (1 - y_d) + c_11 * y_d; // interpolate along z to get the point DATA_T result = c_0 * (1 - z_d) + c_1 * z_d; // does the actual interpolation return result; } else if (param->texDesc.filterMode == cudaFilterModePoint) { int rowMajorIndex = (std::round(query.z) * h + std::round(query.y)) * w + std::round(query.x); return this->cpu_values_[index][rowMajorIndex]; } else { throw std::runtime_error(std::string("using unsupported filter mode on the CPU in texture utils")); } #endif } template bool ThreeDTextureHelper::setExtent(int index, cudaExtent& extent) { if (extent.depth == 0) { throw std::runtime_error(std::string("Error: extent in setExtent invalid," " cannot use depth == 0 in 3D texture: using ") + std::to_string(extent.depth)); } if (!TextureHelper, DATA_T>::setExtent(index, extent)) { return false; } this->cpu_buffer_values_[index].resize(extent.width * extent.height * extent.depth); this->cpu_values_[index].resize(extent.width * 
extent.height * extent.depth); // TODO recopy better when depth changes if possible // this means we have changed our extent so we need to copy over all the data layers again for (std::vector& layer : layer_copy_) { // resizes the array to account for change in depth layer.resize(extent.depth); // sets all current indexes to be true std::fill(layer.begin(), layer.end(), true); } return true; } template void ThreeDTextureHelper::copyDataToGPU(int index, bool sync) { TextureParams* param = &this->textures_[index]; auto copy_vec = layer_copy_[index].begin(); int w = param->extent.width; int h = param->extent.height; int d = param->extent.depth; cudaMemcpy3DParms params = { 0 }; params.srcPtr = make_cudaPitchedPtr(this->cpu_values_[index].data(), w * sizeof(DATA_T), w, h); params.dstArray = param->array_d; params.kind = cudaMemcpyHostToDevice; params.dstPos = make_cudaPos(0, 0, 0); params.srcPos = make_cudaPos(0, 0, 0); // TODO check if we just need to copy it all and do that // current index of z we are looking at int cur_z_index = 0; int prev_z_pos = -1; // TODO since we cannot have an extent of zero depth this is fine while (cur_z_index + 1 < d) { if (*(copy_vec + 1) and *copy_vec) { // if next index is true and cur is true keep building up copy copy_vec++; cur_z_index++; continue; } else if (!*(copy_vec + 1) and *copy_vec) { // if the next one is false and current is true begin a copy params.extent = make_cudaExtent(w, h, cur_z_index - prev_z_pos); HANDLE_ERROR(cudaMemcpy3DAsync(¶ms, this->stream_)); prev_z_pos = cur_z_index; } else if (*(copy_vec + 1) and !*copy_vec) { // if the next one is true and cur is false start building up copy params.dstPos = make_cudaPos(0, 0, cur_z_index + 1); params.srcPos = make_cudaPos(0, 0, cur_z_index + 1); prev_z_pos = cur_z_index; } // increment counters copy_vec++; cur_z_index++; } // execute whatever copy is left if (prev_z_pos + 1 != cur_z_index) { params.extent = make_cudaExtent(w, h, cur_z_index - prev_z_pos); 
HANDLE_ERROR(cudaMemcpy3DAsync(¶ms, this->stream_)); } for (std::vector& layer : layer_copy_) { std::fill(layer.begin(), layer.end(), false); } if (sync) { cudaStreamSynchronize(this->stream_); } param->update_data = false; } ================================================ FILE: include/mppi/utils/texture_helpers/three_d_texture_helper.cuh ================================================ // // Created by jason on 1/10/22. // #ifndef MPPIGENERIC_THREE_D_TEXTURE_HELPER_CUH #define MPPIGENERIC_THREE_D_TEXTURE_HELPER_CUH #include // TODO needs to 3D wrap around angles, i.e set 3D to wrap template class ThreeDTextureHelper : public TextureHelper, DATA_T> { public: ThreeDTextureHelper(int number, bool synced = false, cudaStream_t stream = 0); void allocateCudaTexture(int index) override; void updateTexture(const int index, const int z_index, std::vector& data, bool column_major = false); void updateTexture(const int index, const int z_index, const Eigen::Ref, 0, Eigen::Stride> values, bool column_major = true); bool setExtent(int index, cudaExtent& extent) override; void copyDataToGPU(int index, bool sync = false) override; std::vector> getLayerCopy() { return layer_copy_; } __host__ __device__ DATA_T queryTexture(const int index, const float3& point); protected: // cpu values // layer -> 3D array -> actual texture lookup thing std::vector> layer_copy_; // indicator what 2D part of the 3D array needs to be copied over // if we should require every depth to be updated before sending to GPU bool synched_ = false; }; #if __CUDACC__ #include "three_d_texture_helper.cu" #endif #endif // MPPIGENERIC_THREE_D_TEXTURE_HELPER_CUH ================================================ FILE: include/mppi/utils/texture_helpers/two_d_texture_helper.cu ================================================ // // Created by jason on 1/5/22. 
// #include "two_d_texture_helper.cuh" template void TwoDTextureHelper::allocateCudaTexture(int index) { TextureHelper, DATA_T>::allocateCudaTexture(index); TextureParams* param = &this->textures_[index]; int w = param->extent.width; int h = param->extent.height; HANDLE_ERROR(cudaMallocArray(&(param->array_d), &(param->channelDesc), w, h)); } template void TwoDTextureHelper::updateTexture(const int index, std::vector& values, bool column_major) { TextureParams* param = &this->textures_buffer_[index]; int w = param->extent.width; int h = param->extent.height; // check that the sizes are correct if (values.size() != w * h) { throw std::runtime_error(std::string("Error: invalid size to updateTexture ") + std::to_string(values.size()) + " != " + std::to_string(w * h)); } // copy over values to cpu side holder this->cpu_buffer_values_[index].resize(w * h); if (column_major) { for (int j = 0; j < w; j++) { for (int i = 0; i < h; i++) { int columnMajorIndex = j * h + i; int rowMajorIndex = i * w + j; this->cpu_buffer_values_[index][rowMajorIndex] = values[columnMajorIndex]; } } } else { // std::copy(values.begin(), values.end(), cpu_buffer_values_[index].begin()); this->cpu_buffer_values_[index] = std::move(values); } // tells the object to copy it over next time that happens param->update_data = true; } template void TwoDTextureHelper::updateTexture(const int index, std::vector& data, cudaExtent& extent, bool column_major) { setExtent(index, extent); updateTexture(index, data, column_major); } template __host__ __device__ DATA_T TwoDTextureHelper::queryTexture(const int index, const float3& point) { #ifdef __CUDA_ARCH__ return tex2D(this->textures_d_[index].tex_d, point.x, point.y); #else return queryTextureCPU(index, point); #endif } template bool TwoDTextureHelper::setExtent(int index, cudaExtent& extent) { if (extent.depth != 0) { throw std::runtime_error(std::string("Error: extent in setExtent invalid," " cannot use depth != 0 in 2D texture: using ") + 
std::to_string(extent.depth)); } return TextureHelper, DATA_T>::setExtent(index, extent); } template void TwoDTextureHelper::copyDataToGPU(int index, bool sync) { TextureParams* param = &this->textures_[index]; int w = param->extent.width; int h = param->extent.height; HANDLE_ERROR(cudaMemcpy2DToArrayAsync(param->array_d, 0, 0, this->cpu_values_[index].data(), w * sizeof(DATA_T), w * sizeof(DATA_T), h, cudaMemcpyHostToDevice, this->stream_)); if (sync) { cudaStreamSynchronize(this->stream_); } param->update_data = false; } template void TwoDTextureHelper::updateTexture( const int index, const Eigen::Ref, 0, Eigen::Stride> values, bool column_major) { TextureParams* param = &this->textures_buffer_[index]; int w = param->extent.width; int h = param->extent.height; this->cpu_buffer_values_[index].resize(w * h); if (column_major) { // if we are column major transform to row major for (int j = 0; j < w; j++) { for (int i = 0; i < h; i++) { int columnMajorIndex = j * h + i; int rowMajorIndex = i * w + j; this->cpu_buffer_values_[index][rowMajorIndex] = values.data()[columnMajorIndex]; } } } else { // if we row major copy directly memcpy(this->cpu_buffer_values_[index].data(), values.data(), values.size() * sizeof(DATA_T)); } // tells the object to copy it over next time that happens param->update_data = true; } template void TwoDTextureHelper::updateTexture( int index, const Eigen::Ref, 0, Eigen::Stride> values, cudaExtent& extent, bool column_major) { setExtent(index, extent); updateTexture(index, values, column_major); } template DATA_T TwoDTextureHelper::queryTextureCPU(const int index, const float3& point) { TextureParams* param = &this->textures_[index]; // convert normalized to array index float2 query = make_float2(point.x * param->extent.width, point.y * param->extent.height); // we subtract half a grid cell since the elevation map is the elevation at the center of the grid cell query.x = query.x - 0.5f; query.y = query.y - 0.5f; // if 
(this->cpu_values_[index].size() < param->extent.width * param->extent.height) // { // return DATA_T(); // } if (param->texDesc.addressMode[0] == cudaAddressModeClamp) { if (query.x > param->extent.width - 1) { query.x = param->extent.width - 1; } else if (query.x <= 0.0) { query.x = 0.0; } } else if (param->texDesc.addressMode[0] == cudaAddressModeBorder) { if (query.x > param->extent.width - 1 || query.x <= 0.0) { return createPartialCudaTuple(param->texDesc.borderColor[0], param->texDesc.borderColor[1], param->texDesc.borderColor[2], param->texDesc.borderColor[3]); } } else { throw std::runtime_error(std::string("using unsupported address mode on the CPU in texture utils")); } if (param->texDesc.addressMode[1] == cudaAddressModeClamp) { if (query.y > param->extent.height - 1) { query.y = param->extent.height - 1; } else if (query.y <= 0.0) { query.y = 0.0; } } else if (param->texDesc.addressMode[1] == cudaAddressModeBorder) { if (query.y > param->extent.height - 1 || query.y <= 0.0) { return createPartialCudaTuple(param->texDesc.borderColor[0], param->texDesc.borderColor[1], param->texDesc.borderColor[2], param->texDesc.borderColor[3]); } } else { throw std::runtime_error(std::string("using unsupported address mode on the CPU in texture utils")); } int w = param->extent.width; if (param->texDesc.filterMode == cudaFilterModeLinear) { // the value is distributed evenly in the space starting at half a cell from 0.0 int x_min = std::min((int)std::floor(query.x), w - 2); int x_max = x_min + 1; int y_min = std::min((int)std::floor(query.y), (int)param->extent.height - 2); int y_max = y_min + 1; // does bilinear interpolation https://en.wikipedia.org/wiki/Bilinear_interpolation DATA_T Q_11 = this->cpu_values_[index][y_min * w + x_min]; DATA_T Q_12 = this->cpu_values_[index][y_min * w + x_max]; DATA_T Q_21 = this->cpu_values_[index][y_max * w + x_min]; DATA_T Q_22 = this->cpu_values_[index][y_max * w + x_max]; DATA_T y_min_interp = Q_11 * ((x_max - query.x) / (x_max - 
x_min)) + Q_12 * ((query.x - x_min) / (x_max - x_min)); DATA_T y_max_interp = Q_21 * ((x_max - query.x) / (x_max - x_min)) + Q_22 * ((query.x - x_min) / (x_max - x_min)); DATA_T result = y_min_interp * ((y_max - query.y) / (y_max - y_min)) + y_max_interp * ((query.y - y_min) / (y_max - y_min)); // does the actual interpolation return result; } else if (param->texDesc.filterMode == cudaFilterModePoint) { int rowMajorIndex = std::round(query.y) * w + std::round(query.x); return this->cpu_values_[index][rowMajorIndex]; } else { throw std::runtime_error(std::string("using unsupported filter mode on the CPU in texture utils")); } } ================================================ FILE: include/mppi/utils/texture_helpers/two_d_texture_helper.cuh ================================================ // // Created by jason on 1/5/22. // #ifndef MPPIGENERIC_TWO_D_TEXTURE_HELPER_CUH #define MPPIGENERIC_TWO_D_TEXTURE_HELPER_CUH #include template class TwoDTextureHelper : public TextureHelper, DATA_T> { public: TwoDTextureHelper(int number, cudaStream_t stream = 0) : TextureHelper, DATA_T>(number, stream) { } void allocateCudaTexture(int index) override; void updateTexture(int index, const Eigen::Ref, 0, Eigen::Stride>, cudaExtent& extent, bool column_major = true); void updateTexture(const int index, const Eigen::Ref, 0, Eigen::Stride>, bool column_major = true); void updateTexture(const int index, std::vector& data, bool column_major = false); void updateTexture(const int index, std::vector& data, cudaExtent& extent, bool column_major = false); bool setExtent(int index, cudaExtent& extent) override; void copyDataToGPU(int index, bool sync = false); __host__ __device__ DATA_T queryTexture(const int index, const float3& point); DATA_T queryTextureCPU(const int index, const float3& point); protected: }; #if __CUDACC__ #include "two_d_texture_helper.cu" #endif #endif // MPPIGENERIC_TWO_D_TEXTURE_HELPER_CUH ================================================ FILE: 
include/mppi/utils/type_printing.h ================================================ /** * C++ 11 solution to print out variable type * Copied from: * https://stackoverflow.com/questions/81870/is-it-possible-to-print-a-variables-type-in-standard-c * Created by Bogdan on 1/12/2020 */ #ifndef UTILS_TYPE_PRINTING_H_ #define UTILS_TYPE_PRINTING_H_ #include #include #ifndef _MSC_VER #include #endif #include #include #include // tldr use TYPE(variable) to get std::string of the variable's type #ifndef TYPE #define TYPE(a) type_name() #endif template std::string type_name() { typedef typename std::remove_reference::type TR; std::unique_ptr own( #ifndef _MSC_VER abi::__cxa_demangle(typeid(TR).name(), nullptr, nullptr, nullptr), #else nullptr, #endif std::free); std::string r = own != nullptr ? own.get() : typeid(TR).name(); if (std::is_const::value) r += " const"; if (std::is_volatile::value) r += " volatile"; if (std::is_lvalue_reference::value) r += "&"; else if (std::is_rvalue_reference::value) r += "&&"; return r; } #endif /* UTILS_TYPE_PRINTING_H_*/ ================================================ FILE: include/mppi/version.h.in ================================================ /** * @file version.h * @brief Templated File for MPPI-Generic Version * @author Bogdan Vlahov * @version 0.0.1 * @date 2026-04-14 */ #pragma once #define MPPI_GENERIC_VERSION @PROJECT_VERSION@ #define MPPI_GENERIC_VERSION_STRING "@PROJECT_VERSION@" #define MPPI_GENERIC_VERSION_MAJOR @PROJECT_VERSION_MAJOR@ #define MPPI_GENERIC_VERSION_MINOR @PROJECT_VERSION_MINOR@ #define MPPI_GENERIC_VERSION_PATCH @PROJECT_VERSION_PATCH@ ================================================ FILE: resources/PF_1000_cell_init.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:ec9c111b7d37c3074b4b4a19adf63585a1d5b333ddda4af216a562693ad492d1 size 657158 ================================================ FILE: resources/PF_1000_hidden_init.npz 
================================================ version https://git-lfs.github.com/spec/v1 oid sha256:5df4063baca04f0079b94e8c997047722e56d8f12a5de8229274acc6a1340b5d size 657158 ================================================ FILE: resources/PF_1000_lstm.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:375d75e3276e9fa020d0c33c4c7b2a8fc7af5ae93021942488ffc2234262b7f2 size 12078 ================================================ FILE: resources/PF_1000_output.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:6b26b5ec02d2f88b314995819e624d52c0444764a804ac3f334ed74fe4155911 size 1042 ================================================ FILE: resources/ackerman_test.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:6a7a4299fc0f03eff7e7caba21247f70f892f2b4705076ff485a9bea2064442f size 14734540 ================================================ FILE: resources/autorally_nnet_09_12_2018.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:fc67d5a365e513c65ecb5c38c93a2f227dade9ec5c092e3c63878f95e8865694 size 12722 ================================================ FILE: resources/bicycle_slip_hybrid.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:bcd2fbb7262ef3b35637027c15ad7904b02e265ccb9ed8d1b54e73c884f8eb15 size 4512514 ================================================ FILE: resources/bicycle_slip_hybrid_test.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:75f53e00e525d0b2faad589c6776e33e09f338ce69fbb06e46b624683580f0e9 size 4512514 ================================================ FILE: resources/bicycle_slip_kinematic_test.npz ================================================ version https://git-lfs.github.com/spec/v1 oid 
sha256:594e9b4fa900438043b7019923b05a730e7c5add894a2e85c3f0a22c0876a3f3 size 26482688 ================================================ FILE: resources/bicycle_slip_test.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:c3acbdeff79819386ccb0e02fd84845388a784ad34b8e6023a1dc0d7e1bff2fc size 18634406 ================================================ FILE: resources/body_loss_bicycle_slip_kinematic_2023_03_10-12_54_39.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:74dabe9f4d367a6112132efc36a2dbaf5d004da8fa0ef12a863108cf8ed578e3 size 4505014 ================================================ FILE: resources/ccrf_track.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:546f8e88874311082a65fde8d3ebf707bab0f8369746d5ff9b7f725f2d9f1528 size 4321310 ================================================ FILE: resources/cell_init.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:479e450b1dcca2791941b1d50ce285ce7345f5a7ffb5c1133f9bc52887e6ca08 size 132358 ================================================ FILE: resources/hidden_init.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:93965332fd41d8e37d47dd79bbb4755f4a5a6f719a7a55165407722b230eb2d9 size 132358 ================================================ FILE: resources/lstm.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:3135fec388eab4fefec41948a9f0cbff0d11c14657992f26350ed1e1013691f7 size 12078 ================================================ FILE: resources/lstm_lstm_ackerman.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:3ecf222b541fe8767dd835792dc6f06e7878d58e1f216877c082898969ff5302 size 1021466 
================================================ FILE: resources/lstm_lstm_bicycle_slip_kinematic.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:7de7bce16b34fb3f89cc1b53cbd391053eb757b476230b3055438b9510a58714 size 4505014 ================================================ FILE: resources/lstm_lstm_steering.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:83aef869ea23e01cb0a33abcae179ad1baa83d24bc2c60665c59b17950f71e73 size 195760 ================================================ FILE: resources/lstm_lstm_steering_accel.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:c65b093c690ea938e7ae6e49e64dba7f3b042b67027149a46a020d95852dc8a0 size 2489169 ================================================ FILE: resources/lstm_lstm_test.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:3c0a405fc6db14efb3bc736cfc9930dd7bd66faf42f4d7f3941a0ccd185d8388 size 200797 ================================================ FILE: resources/network_rde_2024_03_22-19_00_26.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:77a37ae6d963904c20892c9e4f5f4180f4b5641b0396ad66cc6273d41c474c35 size 18252124 ================================================ FILE: resources/network_rde_test.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:e50bf425bcf93de7ba80b0d3cc49eb8ec8ae3bd6255c149efa8e2472614849c0 size 18250976 ================================================ FILE: resources/output.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:87285415580e5dc351d3c315270750c88af9b158f93b24f4f1c0407cd77452d5 size 1042 ================================================ FILE: resources/sim_PF_200_cell_init.npz 
================================================ version https://git-lfs.github.com/spec/v1 oid sha256:dcd107ddbf3189ae984f05c0f40f88edfdbc834898840548e782e6698b59825a size 132358 ================================================ FILE: resources/sim_PF_200_hidden_init.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:fa86275cfccfec3dd7483fed63422b081f28f9e31f8db8acb5d0a978fb306f13 size 132358 ================================================ FILE: resources/sim_PF_200_lstm.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:41952b66ccaf1084665ec69925d7a1ff6f6ddaec8423b556b194e1cde3b4c7d0 size 12078 ================================================ FILE: resources/sim_PF_200_output.npz ================================================ version https://git-lfs.github.com/spec/v1 oid sha256:ae29fa9fc96be8ac44efb3451f7be332d52d2dffd3ea5a34fff97cc49e4669d0 size 1042 ================================================ FILE: scripts/autorally/lstm_converter.py ================================================ # reads in an LSTM npz and saves it out so that we can read it with cnpy import torch import numpy as np import pickle from torch import Tensor from torch.nn.parameter import Parameter #filename = "/home/jason/Documents/research/MPPI-Generic-Model-Learning/Results/train/autorally/PF_1000_integrated_fixed_1/LSTM_1X15-1000X10_0.02-0_I/model_store/LSTM_1X15-1000X10_0.02-0_I_37_1.npz" filename = "/home/jason/Documents/research/MPPI-Generic-Model-Learning/Results/train/autorally/PF_200_fixed/LSTM_1X15-200X10_0.02-0_I/LSTM_1X15-200X10_0.02-0_I.npz" network = np.load(open(filename, 'rb'), allow_pickle=True) hidden_init = network["hidden_init"].item() cell_init = network["cell_init"].item() output = network["output"].item() lstm = network["lstm"].item() np.savez("PF_200_hidden_init.npz", **hidden_init) np.savez("PF_200_cell_init.npz", **cell_init) 
np.savez("PF_200_output.npz", **output) np.savez("PF_200_lstm.npz", **lstm) test = np.load(open("/home/jason/catkin_ws/ar/src/MPPI-ROS/" "submodules/MPPI-Generic/resources/hidden_init.npz", 'rb'), allow_pickle=True) print(test) print(test.keys()) print(test["dynamics_W1"]) test = np.load(open("PF_1000_hidden_init.npz", 'rb'), allow_pickle=True) print(test) print(test.keys()) print(test["dynamics_W1"]) #np.savez(filename, **pa) ================================================ FILE: scripts/autorally/test/generateTestMaps.py ================================================ #!/usr/bin/env python3 import numpy as np from PIL import Image import math import argparse def genLoadTrackDataTestMap(args): # used to check if the map is being loaded correctly width = 10 height = 20 pixelsPerMeter = 2 channel0 = np.zeros((width*pixelsPerMeter, height*pixelsPerMeter), np.float32) channel1 = np.zeros((width*pixelsPerMeter, height*pixelsPerMeter), np.float32) channel2 = np.zeros((width*pixelsPerMeter, height*pixelsPerMeter), np.float32) channel3 = np.zeros((width*pixelsPerMeter, height*pixelsPerMeter), np.float32) counter = 0 for i in range(0, width*pixelsPerMeter): for j in range(0, height*pixelsPerMeter): # TODO this is actually flipped, but loaded into CUDA correctly #print("putting width and height, ", pixelsPerMeter * height * i + j, " ", counter) counter+= 1 channel0[i, j] = counter channel1[i, j] = counter * 10 channel2[i, j] = counter * 100 channel3[i, j] = counter * 1000 #print(channel0.flatten()) #Save data to numpy array, each channel is saved individually as an array in row major order. 
track_dict = {"xBounds":np.array([-width/2, width/2], dtype = np.float32), "yBounds":np.array([-height/2, height/2], dtype = np.float32), "pixelsPerMeter":np.array([pixelsPerMeter], dtype=np.float32), "channel0":channel0.flatten(), "channel1":channel1.flatten(), "channel2":channel2.flatten(), "channel3":channel3.flatten()} np.savez(args.output+"track_map.npz", **track_dict) # used to check for valid outputs of the AutoRally standard cost function # dims match the standard ones used to marietta track width = 30 height = 30 pixelsPerMeter = 20 channel0 = np.zeros((width*pixelsPerMeter, height*pixelsPerMeter), np.float32) channel1 = np.zeros((width*pixelsPerMeter, height*pixelsPerMeter), np.float32) channel2 = np.zeros((width*pixelsPerMeter, height*pixelsPerMeter), np.float32) channel3 = np.zeros((width*pixelsPerMeter, height*pixelsPerMeter), np.float32) for i in range(0, width*pixelsPerMeter): for j in range(0, height*pixelsPerMeter): x = (j*1.0)/pixelsPerMeter y = (i*1.0)/pixelsPerMeter channel0[i, j] = abs(height/2.0 - y) + ((x)/(width)) #if ((x == 12.5 or x == 13.5) and y == 0): #print(x,y, " = ", channel0[i,j]) #if channel0[i, j] < 1.0: # print("putting width and height ",i , " ", j, " = ", channel0[i,j]) #print(channel0.flatten()) track_dict = {"xBounds":np.array([-13, 17], dtype = np.float32), "yBounds":np.array([-10, 20], dtype = np.float32), "pixelsPerMeter":np.array([pixelsPerMeter], dtype=np.float32), "channel0":channel0.flatten(), "channel1":channel1.flatten(), "channel2":channel2.flatten(), "channel3":channel3.flatten()} np.savez(args.output+"track_map_standard.npz", **track_dict) # used to check for valid outputs of the AutoRally robust cost function # dims match the standard ones used to CCRF track width = 70 height = 55 pixelsPerMeter = 20 channel0 = np.zeros((width*pixelsPerMeter, height*pixelsPerMeter), np.float32) channel1 = np.zeros((width*pixelsPerMeter, height*pixelsPerMeter), np.float32) channel2 = np.zeros((width*pixelsPerMeter, 
height*pixelsPerMeter), np.float32) channel3 = np.zeros((width*pixelsPerMeter, height*pixelsPerMeter), np.float32) for i in range(0, width*pixelsPerMeter): for j in range(0, height*pixelsPerMeter): x = (j*1.0)/pixelsPerMeter y = (i*1.0)/pixelsPerMeter channel1[i, j] = abs(height/2.0 - y) + ((x)/(width)) if x > 50 or x < 15: channel0[i,j] = 1.0 elif x > 40 or x < 25: channel0[i,j] = 0.6 channel2[i,j] = x channel3[i,j] = math.atan2(y,x) #print("putting x,y = ",x , " ", y, " = ", channel0[i,j], channel1[i,j], channel2[i,j], channel3[i,j]) #print(channel3.flatten()) track_dict = {"xBounds":np.array([-25, 45], dtype = np.float32), "yBounds":np.array([-50, 5], dtype = np.float32), "pixelsPerMeter":np.array([pixelsPerMeter], dtype=np.float32), "channel0":channel0.flatten(), "channel1":channel1.flatten(), "channel2":channel2.flatten(), "channel3":channel3.flatten()} np.savez(args.output+"track_map_robust.npz", **track_dict) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Creates a npz map file for testing purposes") # parser.add_argument("-i", "--input", type = str, help = "Costmap in old .txt format") # parser.add_argument("-d", "--display", type = str, # help = "Name of image to save costmap as", # default="display_image.jpg") parser.add_argument("-o", "--output", type = str, help = "File to save map to", default = "../../../resource/autorally/test/") args = parser.parse_args() genLoadTrackDataTestMap(args) ================================================ FILE: scripts/autorally/test/generateTestNetwork.py ================================================ #!/usr/bin/env python3 import numpy as np from PIL import Image import pickle import argparse def genLoadNetworkDataTest(args): param_dict = {} layer_counter = 0 net_structure = [6, 32, 32, 4] params = 0 for i in range(1, len(net_structure)): inc = net_structure[i-1] * net_structure[i] print("param = ", params, " inc = ", inc) param_dict["dynamics_W" + str(i)] = np.arange(params, params + inc, 
dtype=np.float64) params += inc inc = net_structure[i] print("param = ", params, " inc = ", inc) param_dict["dynamics_b" + str(i)] = np.arange(params, params + inc, dtype=np.float64) params += inc print(param_dict) np.savez(args.output+"/neuralNetLoadTest.npz", **param_dict) def genComputationNetworkTest(args): param_dict = {} net_structure = [4, 3, 4] for i in range(1, len(net_structure)): param_dict["dynamics_W" + str(i)] = 1 param_dict["dynamics_b" + str(i)] = 1 print(param_dict) np.savez(args.output+"/neuralNetComputeTest.npz", **param_dict) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Creates a npz file for testing the NN") # parser.add_argument("-i", "--input", type = str, help = "Costmap in old .txt format") # parser.add_argument("-d", "--display", type = str, # help = "Name of image to save costmap as", # default="display_image.jpg") parser.add_argument("-o", "--output", type = str, help = "File to save map to", default = "../../../resource/autorally/test/") args = parser.parse_args() genLoadNetworkDataTest(args) genComputationNetworkTest(args) ================================================ FILE: scripts/colored_noise.py ================================================ """Generate colored noise.""" """Source: https://github.com/felixpatzelt/colorednoise/blob/master/colorednoise.py""" from numpy import sqrt, newaxis from numpy.fft import irfft, rfftfreq from numpy.random import normal from numpy import sum as npsum def powerlaw_psd_gaussian(exponent, size, fmin=0): """Gaussian (1/f)**beta noise. Based on the algorithm in: Timmer, J. and Koenig, M.: On generating power law noise. Astron. Astrophys. 
300, 707-710 (1995) Normalised to unit variance Parameters: ----------- exponent : float The power-spectrum of the generated noise is proportional to S(f) = (1 / f)**beta flicker / pink noise: exponent beta = 1 brown noise: exponent beta = 2 Furthermore, the autocorrelation decays proportional to lag**-gamma with gamma = 1 - beta for 0 < beta < 1. There may be finite-size issues for beta close to one. shape : int or iterable The output has the given shape, and the desired power spectrum in the last coordinate. That is, the last dimension is taken as time, and all other components are independent. fmin : float, optional Low-frequency cutoff. Default: 0 corresponds to original paper. It is not actually zero, but 1/samples. Returns ------- out : array The samples. Examples: --------- # generate 1/f noise == pink noise == flicker noise >>> import colorednoise as cn >>> y = cn.powerlaw_psd_gaussian(1, 5) """ # Make sure size is a list so we can iterate it and assign to it. try: size = list(size) except TypeError: size = [size] # The number of samples in each time series samples = size[-1] # Calculate Frequencies (we asume a sample rate of one) # Use fft functions for real output (-> hermitian spectrum) f = rfftfreq(samples) # Build scaling factors for all frequencies s_scale = f fmin = max(fmin, 1./samples) # Low frequency cutoff ix = npsum(s_scale < fmin) # Index of the cutoff if ix and ix < len(s_scale): s_scale[:ix] = s_scale[ix] s_scale = s_scale**(-exponent/2.) # Calculate theoretical output standard deviation from scaling w = s_scale[1:].copy() w[-1] *= (1 + (samples % 2)) / 2. 
# correct f = +-0.5 sigma = 2 * sqrt(npsum(w**2)) / samples # Adjust size to generate one Fourier component per frequency size[-1] = len(f) # Add empty dimension(s) to broadcast s_scale along last # dimension of generated random power + phase (below) dims_to_add = len(size) - 1 s_scale = s_scale[(newaxis,) * dims_to_add + (Ellipsis,)] # Generate scaled random power + phase sr = normal(scale=s_scale, size=size) si = normal(scale=s_scale, size=size) # If the signal length is even, frequencies +/- 0.5 are equal # so the coefficient must be real. if not (samples % 2): si[...,-1] = 0 # Regardless of signal length, the DC component must be real si[...,0] = 0 # Combine power + corrected phase to Fourier components s = sr + 1J * si # Transform to real time series & scale to unit variance y = irfft(s, n=samples, axis=-1) / sigma return y if __name__ == "__main__": y = powerlaw_psd_gaussian(1, 20, 0.07) print("final samples:\n{}".format(y)) ================================================ FILE: scripts/double_integrator/generate_free_energy_video.py ================================================ import matplotlib.pyplot as plt from matplotlib.animation import FuncAnimation import matplotlib.animation as animation import numpy as np from matplotlib import rc import argparse controller_dict = {'v': 'vanilla_large_', 't': 'tube_', 'rs': 'robust_sc_', 'rr': 'robust_rc_', 'rhv': 'test_mppi_', 'rht': 'test_tmppi_', 'rhr': 'test_rmppi_', 'cc': 'robust_cc_', 'vr': 'vanilla_large_robust_', 'tr': 'tube_robust_'} title_dict = {'v': 'MPPI Standard Cost', 't': 'Tube-MPPI Standard Cost', 'rs': 'RMPPI LQR Standard Cost', 'rr': 'RMPPI LQR Robust Cost', 'rhv': 'MPPI Autorally', 'rht': 'Tube-MPPI Autorally', 'rhr': 'RMPPI Autorally', 'cc': 'RMPPI CCM Robust Cost', 'vr': 'MPPI Robust Cost', 'tr': 'Tube-MPPI Robust Cost'} rc('font', **{'size': 30}) fig, ax = plt.subplots() fig.set_dpi(100) fig.set_size_inches(10, 10) xdata, ydata, yndata = [], [], [] ax.set_ylabel('Log(Free 
Energy)',labelpad=-30, position=(0.5,.6)) ax.set_xlabel('Time (sec)') ax.set_yscale('log') #ax.xaxis.set_ticklabels([]) title = None ln3, = ax.plot([], [], 'r', alpha=0.7, linewidth=3, label='Nominal') ln2, = ax.plot([], [], 'b', alpha=0.7, linewidth=3, label='Real') ln1, = ax.plot(np.linspace(0,1000,100), 1*np.ones(100), 'k', alpha=0.7, linewidth=3, label='Crash') def init(): ax.set_xlim(0, time_window) ax.set_ylim(ymin, ymax) return ln1, ln2, ln3 def update(frame): xdata.append(time[frame]) ydata.append(real_fe[frame]) yndata.append(nominal_fe[frame]) ln2.set_data(xdata, ydata) ln3.set_data(xdata, yndata) ax.legend(loc=2) if (xdata[-1] - xdata[0]) > time_window: xdata.pop(0) ydata.pop(0) yndata.pop(0) ln1.set_data(xdata, 1) ax.set_xlim(xdata[0], xdata[-1]) else: ax.set_xlim(xdata[0], xdata[0]+time_window) return ln1, ln2, ln3 def index_data(time, nominal_fe, real_fe, begin_time, end_time): ind = tuple([begin_time < time]) time = time[ind] real_fe = real_fe[ind] nominal_fe = nominal_fe[ind] ind = tuple([time < end_time]) time = time[ind] real_fe = real_fe[ind] nominal_fe = nominal_fe[ind] return time, nominal_fe, real_fe def main(args): build_dir = args['build_dir'] data_dir = build_dir + 'examples/' controller_name = controller_dict[args['controller']] fps = int(args['fps']) # Let us load the data first global real_fe, nominal_fe, time_window, time, ymin, ymax time_window = int(args['time_window']) # Load the free energy data if args['controller'] == 'v': time = np.linspace(0.02, 0.02*5000, 5000) real_fe = np.load(data_dir + controller_name + 'real_free_energy.npy')/1000 nominal_fe = -1*np.ones_like(real_fe) elif args['controller'] == 'rhv': data = np.load(data_dir + controller_name + 'real_free_energy.npz') time = data['arr_0'] real_fe = data['arr_1']/10000 nominal_fe = -1*np.ones_like(real_fe) data.close() time, nominal_fe, real_fe = index_data(time, nominal_fe, real_fe, 150, 190) elif args['controller'] == 'rht': data = np.load(data_dir + controller_name + 
'real_free_energy.npz') time = data['arr_0'] real_fe = data['arr_1']/10000 data.close() data = np.load(data_dir + controller_name + 'nominal_free_energy.npz') nominal_fe = data['arr_1']/10000 data.close() time, nominal_fe, real_fe = index_data(time, nominal_fe, real_fe, 275, 315) elif args['controller'] == 'rhr': data = np.load(data_dir + controller_name + 'real_free_energy.npz') time = data['arr_0'] real_fe = data['arr_1']/125000 data.close() data = np.load(data_dir + controller_name + 'nominal_free_energy.npz') nominal_fe = data['arr_1']/125000 data.close() time, nominal_fe, real_fe = index_data(time, nominal_fe, real_fe, 180, 220) elif args['controller'] == 'cc': time = np.linspace(0.02, 0.02*3000, 3000) real_fe = np.load(data_dir + controller_name + 'real_free_energy.npy')/100 nominal_fe = np.load(data_dir + controller_name + 'nominal_free_energy.npy')/100 elif args['controller'] == 'vr': time = np.linspace(0.02, 0.02*5000, 5000) real_fe = np.load(data_dir + controller_name + 'real_free_energy.npy')/100 nominal_fe = -1*np.ones_like(real_fe) elif args['controller'] == 'tr': time = np.linspace(0.02, 0.02*5000, 5000) real_fe = np.load(data_dir + controller_name + 'real_free_energy.npy')/100 nominal_fe = np.load(data_dir + controller_name + 'nominal_free_energy.npy')/100 elif not args['controller'] == 'rr': time = np.linspace(0.02, 0.02*5000, 5000) real_fe = np.load(data_dir + controller_name + 'real_free_energy.npy')/1000 nominal_fe = np.load(data_dir + controller_name + 'nominal_free_energy.npy')/1000 else: time = np.linspace(0.02, 0.02*5000, 5000) real_fe = np.load(data_dir + controller_name + 'real_free_energy.npy')/100 nominal_fe = np.load(data_dir + controller_name + 'nominal_free_energy.npy')/100 ax.set_title(title_dict[args['controller']]) if np.mean(nominal_fe) == -1: ymin = np.amin(real_fe) else: ymin = min(np.amin(real_fe), np.amin(nominal_fe)) ymax = max(1.1,max(np.amax(real_fe), np.amax(nominal_fe))) # Set up formatting for the movie files Writer = 
animation.writers['ffmpeg'] writer = Writer(fps=fps, bitrate=-1) ani = FuncAnimation(fig, update, frames=np.arange(0,time.shape[0],1), init_func=init, blit=False, interval=1000/fps, repeat=False) if args['save_mp4']: ani.save(title_dict[args['controller']] + '_free_energy' + '.mp4', writer=writer) else: print('Not saving mp4 file, pass --save_mp4=True if you want to save it') plt.show() if __name__ == "__main__": parser = argparse.ArgumentParser(description = 'Say hello') parser.add_argument('--build_dir', help='Location of MPPI-Generic build folder', required=True) parser.add_argument('--controller', help="Which controller we are plotting", required=True) parser.add_argument('--time_window', help='Time window size (s)', required=False, default=5) parser.add_argument('--fps', required=False, default=50) parser.add_argument('--save_mp4', required=False, default=False) args = vars(parser.parse_args()) main(args) ================================================ FILE: scripts/double_integrator/generate_trajectory_video.py ================================================ import matplotlib.pyplot as plt from matplotlib.animation import FuncAnimation import matplotlib.animation as animation import numpy as np from matplotlib import rc import argparse controller_dict = {'v': 'vanilla_large_', 'vr': 'vanilla_large_robust_', 't': 'tube_', 'tr': 'tube_robust_', 'rs': 'robust_sc_', 'rr': 'robust_rc_', 'cc': 'robust_cc_', 'vr': 'vanilla_large_robust_', 'tr': 'tube_robust_'} title_dict = {'v': 'MPPI Standard Cost', 't': 'Tube-MPPI Standard Cost', 'vr': 'MPPI Robust Cost', 'tr': 'Tube-MPPI Robust Cost', 'rs': 'RMPPI LQR Standard Cost', 'rr': 'RMPPI LQR Robust Cost', 'cc': 'RMPPI CCM Robust Cost'} rc('font', **{'size': 30}) track_radius_outer = 2 + .125 track_radius_inner = 2 - .125 # Draw the bounds for the track theta = np.linspace(0,2*np.pi, 1000) x_track_inner = track_radius_inner*np.cos(theta) y_track_inner = track_radius_inner*np.sin(theta) x_track_outer = 
track_radius_outer*np.cos(theta) y_track_outer = track_radius_outer*np.sin(theta) fig, ax = plt.subplots() fig.set_dpi(100) fig.set_size_inches(10, 10) xdata, ydata = [], [] xndata, yndata = [], [] ax.plot(x_track_outer, y_track_outer, 'k', linewidth=2) ax.plot(x_track_inner, y_track_inner, 'k', linewidth=2) ax.axis('equal') ln3, = ax.plot([],[], 'g', alpha=0.7, linewidth=2, label='Nominal Trajectory') ln2, = ax.plot([], [], 'ro', alpha=0.5, label='Nominal State') ln1, = ax.plot([], [], 'bo', alpha=0.5, label='Actual State') ax.set_ylabel('Y Pos (m)') ax.set_xlabel('X Pos (m)') title = ax.text(0.5,0.5, "", bbox={'facecolor':'w', 'alpha':0.5, 'pad':20}, transform=ax.transAxes, ha="center", fontsize=30) def init(): ax.set_xlim(-2.25, 2.25) ax.set_ylim(-2.25, 2.25) return ln1, ln2, ln3 def update(frame): xndata.append(nominal_state[frame, 0]) yndata.append(nominal_state[frame, 1]) xdata.append(actual_state[frame, 0]) ydata.append(actual_state[frame, 1]) xnt_data = nominal_trajectory[frame,:,0] ynt_data = nominal_trajectory[frame,:,1] if len(xdata) > 20: xdata.pop(0) ydata.pop(0) xndata.pop(0) yndata.pop(0) ln3.set_data(xnt_data, ynt_data) ln2.set_data(xndata, yndata) ln1.set_data(xdata, ydata) # fig.legend() title.set_text("Time: {val:05.2f} (sec)".format(val=frame*0.02+0.02)) return ln1, ln2, ln3, title def main(args): build_dir = args['build_dir'] data_dir = build_dir + 'examples/' controller_name = controller_dict[args['controller']] fps = int(args['fps']) # Let us load the data first global actual_state, nominal_trajectory, nominal_state, real_fe, nominal_fe actual_state = np.load(data_dir + controller_name + 'state_trajectory.npy') nominal_trajectory = np.load(data_dir + controller_name + 'nominal_trajectory.npy') nominal_state = nominal_trajectory[:,0,:] ax.set_title(title_dict[args['controller']]) # Set up formatting for the movie files Writer = animation.writers['ffmpeg'] writer = Writer(fps=fps, bitrate=-1) ani = FuncAnimation(fig, update, 
frames=np.arange(0,actual_state.shape[0],1), init_func=init, blit=True, interval=1000/fps, repeat=False) if args['save_mp4']: ani.save(title_dict[args['controller']] + '_trajectory' + '.mp4', writer=writer) else: print('Not saving mp4 file, pass --save_mp4 if you want to save it') # plt.show() if __name__ == "__main__": parser = argparse.ArgumentParser(description = 'Say hello') parser.add_argument('--build_dir', help='Location of MPPI-Generic build folder', required=True) parser.add_argument('--controller', help="Which controller we are plotting", required=True) parser.add_argument('--fps', required=False, default=50) parser.add_argument('--save_mp4', required=False, default=False) args = vars(parser.parse_args()) main(args) ================================================ FILE: scripts/double_integrator/plot_DI_test_trajectories.py ================================================ import numpy as np import matplotlib.pyplot as plt import argparse track_radius_outer = 2 + .125 track_radius_inner = 2 - .125 # Draw the bounds for the track theta = np.linspace(0,2*np.pi, 1000) x_track_inner = track_radius_inner*np.cos(theta) y_track_inner = track_radius_inner*np.sin(theta) x_track_outer = track_radius_outer*np.cos(theta) y_track_outer = track_radius_outer*np.sin(theta) def plot_trajectories(axis, axis_name, trajectory_array): axis.set_title(axis_name) time_horizon, trajectory_length, state_dim = trajectory_array.shape i = 0 speed_vec = np.zeros((time_horizon)) for i in range(time_horizon): if (trajectory_array[i,0,:] == 0).all(): # Means that the system stopped saving trajectories because of task failure break axis.plot(trajectory_array[i,:,0], trajectory_array[i,:,1], 'b', linewidth=0.5, alpha=0.7) # Compute the speed speed_vec[i] = np.linalg.norm(trajectory_array[i,0,2:]) # Check for tube failure axis.plot(trajectory_array[:i,0,0], trajectory_array[:i,0,1], 'y', linewidth=2, alpha=1.0) average_speed = np.mean(speed_vec) print(axis_name) print(average_speed) 
axis.plot(x_track_inner, y_track_inner, 'r', linewidth=2) axis.plot(x_track_outer, y_track_outer, 'r', linewidth=2) def plot_trajectories2(axis, axis_name, trajectory_array): axis.set_title(axis_name) time_horizon, state_dim = trajectory_array.shape i = 0 speed_vec = np.zeros((time_horizon)) for i in range(time_horizon): # Compute the speed speed_vec[i] = np.linalg.norm(trajectory_array[i,2:]) # Check for tube failure axis.plot(trajectory_array[:i,0], trajectory_array[:i,1], 'y', linewidth=2, alpha=1.0) average_speed = np.mean(speed_vec) print(axis_name) print(average_speed) axis.plot(x_track_inner, y_track_inner, 'r', linewidth=2) axis.plot(x_track_outer, y_track_outer, 'r', linewidth=2) def main(args): build_dir = args['build_dir'] data_dir = build_dir + 'tests/controllers/' CCM_true = args['CCM'] # Create the figures for each subplot fig, axs = plt.subplots(2,2) for ax in axs.flat: ax.set(xlabel='X Position (m)', ylabel='Y Position (m)') # Hide x labels and tick labels for top plots and y ticks for right plots. 
for ax in axs.flat: ax.label_outer() # # Plot the vanilla trajectory # vanilla_nominal = np.load(data_dir + 'vanilla_nominal.npy') # plot_trajectories(axs[0,0], 'Vanilla MPPI Nominal Disturbance', vanilla_nominal) # # # Vanilla MPPI with large noise # vanilla_large = np.load(data_dir + 'vanilla_large_track_feedback.npy') # plot_trajectories(axs[0,1], 'Vanilla MPPI Tracking Large Disturbance', vanilla_large) # # # Vanilla MPPI with large noise + a tracking controller # # vanilla_large_track = np.load(data_dir + 'vanilla_large_track.npy') # # plot_trajectories(axs[1,0], 'Vanilla MPPI Large Disturbance + Tracking', vanilla_large_track) # # # Tube MPPI with large noise # tube_large = np.load(data_dir + 'tube_large_actual.npy') # plot_trajectories(axs[1,0], 'Tube MPPI Large Disturbance', tube_large) # # # RMPPI with large noise # # Create the figures for each subplot # # fig2, axs2 = plt.subplots(2,2) # # # # for ax in axs2.flat: # # ax.set(xlabel='X Position (m)', ylabel='Y Position (m)') # robust_large = np.load(data_dir + 'robust_large_actual.npy') # plot_trajectories(axs[1,1], 'Robust LQR MPPI Actual', robust_large) if (CCM_true): fig, axs = plt.subplots(2,2) for ax in axs.flat: ax.set(xlabel='X Position (m)', ylabel='Y Position (m)') # robust_ccm = np.load(data_dir + 'robust_large_actual_CCM_t_4950.npy') # plot_trajectories(axs[0,0], 'Robust CCM MPPI Actual', robust_ccm) # # robust_ccm = np.load(data_dir + 'robust_large_nominal_CCM_t_4950.npy') # plot_trajectories(axs[0,1], 'Robust CCM MPPI Nominal', robust_ccm) robust_ccm = np.load(data_dir + 'robust_large_actual_traj_CCM_t_2999.npy') plot_trajectories2(axs[0,0], 'Robust CCM MPPI Actual', robust_ccm) robust_ccm = np.load(data_dir + 'robust_large_nominal_traj_CCM_t_2999.npy') plot_trajectories(axs[0,1], 'Robust CCM MPPI Nominal', robust_ccm) robust_large = np.load(data_dir + 'robust_large_actual.npy') plot_trajectories(axs[1,0], 'Robust LQR MPPI Actual', robust_large) robust_large = np.load(data_dir + 
'robust_large_nominal.npy') plot_trajectories(axs[1,1], 'Robust LQR MPPI Nominal', robust_large) # robust_large = np.load(data_dir + 'robust_ancillary.npy') # plot_trajectories(axs2[1,0], 'Robust MPPI Ancillary', robust_large) # # Tube MPPI ancillary trajectories # tube_ancillary = np.load(data_dir + 'tube_ancillary.npy') # plot_trajectories(axs[1,1], 'Tube MPPI Ancillary Solution', tube_ancillary) plt.show() if __name__ == "__main__": parser = argparse.ArgumentParser(description = 'Say hello') parser.add_argument('--build_dir', help='Location of MPPI-Generic build folder', required=True) parser.add_argument('--CCM', help="Plot the CCM stuff?", required=False, default=0) args = vars(parser.parse_args()) main(args) ================================================ FILE: scripts/double_integrator/plot_DI_test_trajectories_tube_only.py ================================================ import numpy as np import matplotlib.pyplot as plt import argparse from matplotlib.figure import Figure from matplotlib.backends.backend_gtk3agg import FigureCanvas from matplotlib.backends.backend_gtk3 import ( NavigationToolbar2GTK3 as NavigationToolbar) track_radius_outer = 2 + .125 track_radius_inner = 2 - .125 # Draw the bounds for the track theta = np.linspace(0,2*np.pi, 1000) x_track_inner = track_radius_inner*np.cos(theta) y_track_inner = track_radius_inner*np.sin(theta) x_track_outer = track_radius_outer*np.cos(theta) y_track_outer = track_radius_outer*np.sin(theta) cur_timestep = 0 nominal = None actual = None feedback = None axs = None def callback(event): global actual, acillary, nominal, feedback global axs ''' this function gets called if we hit a key''' plt.cla() plot_boundaries(axs) global cur_timestep if event.key == "right": # print((actual[cur_timestep,0,:] == 0).all()) if not (actual[cur_timestep,0,:] == 0).all(): # Means that the system stopped saving trajectories because of task failure cur_timestep += 1 elif event.key == "left": if cur_timestep > 0: cur_timestep -= 1 
# only plot the last n states states = 100 min_index = max(cur_timestep - states, 0) # plot actual trajectory axs.plot(actual[cur_timestep,:,0], actual[cur_timestep,:,1], 'b', linewidth=2.0, label='actual trajectory') axs.plot(actual[min_index:cur_timestep,0,0], actual[min_index:cur_timestep,0,1], 'bx', label='actual state') # plot actual state with feedback axs.plot(feedback[cur_timestep,:,0], feedback[cur_timestep,:,1], 'c', linewidth=2.0, label='actual feedback trajectory') # plot ancillary trajectory # axs.plot(ancillary[cur_timestep,:,0], ancillary[cur_timestep,:,1], 'k', linewidth=2.0, label='ancillary trajectory') # plt nominal state axs.plot(nominal[cur_timestep,:,0], nominal[cur_timestep,:,1], 'g', linewidth=2.0, label='nominal trajectory') axs.plot(nominal[min_index:cur_timestep,0,0], nominal[min_index:cur_timestep,0,1], 'g+', label='nominal state') # fix legend plt.legend(bbox_to_anchor=(.80, 1.15), loc='upper left', borderaxespad=0.) def plot_trajectories(axis, axis_name, trajectory_array): axis.set_title(axis_name) time_horizon, trajectory_length, state_dim = trajectory_array.shape i = 0 for i in range(time_horizon): if (trajectory_array[i,0,:] == 0).all(): # Means that the system stopped saving trajectories because of task failure break axis.plot(trajectory_array[i,:,0], trajectory_array[i,:,1], 'b', linewidth=0.5) axis.plot(trajectory_array[:i,0,0], trajectory_array[:i,0,1], 'y', linewidth=2) def plot_boundaries(axis): axis.plot(x_track_inner, y_track_inner, 'r', linewidth=2, label='boundary') axis.plot(x_track_outer, y_track_outer, 'r', linewidth=2) def main(args): build_dir = args['build_dir'] controller_flag = args['controller'] data_dir = build_dir + 'tests/controllers/' # Create the figures for each subplot global axs fig, axs = plt.subplots(1,1) # set up callback for arrow keys fig.canvas.mpl_connect('key_press_event', callback) axs.set(xlabel='X Position (m)', ylabel='Y Position (m)') # # Hide x labels and tick labels for top plots and y ticks 
for right plots. # for ax in axs.flat: # ax.label_outer() # Tube MPPI with large noise global actual, ancillary, nominal, feedback if controller_flag == '0': # 0 for tube, 1 for robust actual = np.load(data_dir + 'tube_large_actual.npy') nominal = np.load(data_dir + 'tube_large_nominal.npy') ancillary = np.load(data_dir + 'tube_ancillary.npy') feedback = np.load(data_dir + 'tube_large_feedback.npy') elif controller_flag == '1': actual = np.load(data_dir + 'robust_large_actual.npy') nominal = np.load(data_dir + 'robust_large_nominal.npy') ancillary = np.load(data_dir + 'robust_ancillary.npy') feedback = np.load(data_dir + 'robust_large_feedback.npy') else: pass plot_boundaries(axs) # TODO plot the initial location plt.show() if __name__ == "__main__": parser = argparse.ArgumentParser(description = 'Say hello') parser.add_argument('--build_dir', help='Location of MPPI-Generic build folder', required=True) parser.add_argument('--controller', help='0: Tube MPPI, 1: Robust MPPI', required=True) args = vars(parser.parse_args()) main(args) ================================================ FILE: scripts/double_integrator/plot_stuff.py ================================================ import numpy as np from matplotlib import rc rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': 16}) rc('text', usetex=True) import matplotlib.pyplot as plt import argparse track_radius_outer = 2 + .125 track_radius_inner = 2 - .125 # Draw the bounds for the track theta = np.linspace(0,2*np.pi, 1000) x_track_inner = track_radius_inner*np.cos(theta) y_track_inner = track_radius_inner*np.sin(theta) x_track_outer = track_radius_outer*np.cos(theta) y_track_outer = track_radius_outer*np.sin(theta) def plot_fe_vs_time(fe_array, crash_cost=1.0, title="", savefig=0): fig = plt.figure() axis = plt.gca() # axis.set_title(title) time = np.linspace(0,fe_array.shape[0]*0.02, fe_array.shape[0]) plt.plot(time, np.ones_like(time), 'r', alpha=0.7, label='Crash Threshold') plt.plot(time, 
fe_array/crash_cost, 'b', alpha=0.7, label='Free Energy') plt.legend() if savefig: fig.savefig(title+'.pdf', bbox_inches='tight') def plot_fe_bounded(fe_array, fe_bound, title="", savefig=0): fig = plt.figure() axis = plt.gca() # axis.set_title(title) # plt.yscale('log') time = np.linspace(0,fe_array.shape[0]*0.02, fe_array.shape[0]) plt.plot(time, fe_array, 'b', alpha=0.7, label='FE Increase') plt.plot(time, fe_bound, 'r', alpha=0.7, label='Bound') plt.legend(bbox_to_anchor=(1.05, 1),loc='upper left') if savefig: fig.savefig(title+'.pdf', bbox_inches='tight') def plot_fe_vs_space(fe_array, state_array): pass def plot_trajectory(trajectory, title="", savefig=0): fig = plt.figure() axis = plt.gca() # axis.set_title(title) axis.set_aspect('equal', 'box') axis.set(xlim=(-2.5, 2.5), ylim=(-2.5, 2.5)) axis.plot(trajectory[:,0], trajectory[:,1], 'limegreen', alpha=0.95, linewidth=1.5, label='Actual Trajectory') axis.plot(x_track_inner, y_track_inner, 'k', linewidth=2) axis.plot(x_track_outer, y_track_outer, 'k', linewidth=2) if savefig: fig.savefig(title+'_state_trajectory.pdf', bbox_inches='tight') def plot_nominal_trajectory(trajectory, nominal_trajectory, title="", savefig=0): fig = plt.figure() axis = plt.gca() # axis.set_title(title) axis.set_aspect('equal', 'box') axis.set(xlim=(-2.3, 2.3), ylim=(-2.3, 2.3)) time_horizon, trajectory_length, state_dim = nominal_trajectory.shape for i in range(time_horizon-1): axis.plot(nominal_trajectory[i,:,0], nominal_trajectory[i,:,1], 'b', linewidth=1.0, alpha=0.7) axis.plot(nominal_trajectory[time_horizon-1,:,0], nominal_trajectory[time_horizon-1,:,1], 'b', linewidth=1.0, alpha=0.7, label='Nom') axis.plot(trajectory[:,0], trajectory[:,1], 'limegreen', alpha=0.95, linewidth=1.5, label='Act') axis.plot(x_track_inner, y_track_inner, 'k', linewidth=2) axis.plot(x_track_outer, y_track_outer, 'k', linewidth=2) # plt.legend(bbox_to_anchor=(1.05, 1),loc='upper left') if savefig: fig.savefig(title+'_nominal_trajectory.pdf', 
bbox_inches='tight') def plot_nominal_state_used_vs_time(ns_array, title=""): plt.figure() axis = plt.gca() axis.set_title(title) time = np.linspace(0,ns_array.shape[0]*0.02, ns_array.shape[0]) plt.scatter(time, ns_array) def main(args): build_dir = args['build_dir'] data_dir = build_dir + 'examples/' show_plot = args['show_fig'] save_fig = args['save_fig'] vanilla_fe = np.load(data_dir + "vanilla_free_energy.npy") vanilla_state_trajectory = np.load(data_dir + "vanilla_state_trajectory.npy") vanilla_robust_state_trajectory = np.load(data_dir + "vanilla_large_robust_state_trajectory.npy") vanilla_robust_fe = np.load(data_dir + "vanilla_large_robust_free_energy.npy") # vanilla_large_fe = np.load(data_dir + "vanilla_large_free_energy.npy") vanilla_large_state_trajectory = np.load(data_dir + "vanilla_large_state_trajectory.npy") vanilla_sc_nominal_trajectory = np.load(data_dir + "vanilla_nominal_trajectory.npy") # tube_large_rfe = np.load(data_dir + "tube_real_free_energy.npy") # tube_large_nfe = np.load(data_dir + "tube_nominal_free_energy.npy") tube_large_state_trajectory = np.load(data_dir + "tube_state_trajectory.npy") tube_large_nominal_state_used = np.load(data_dir + "tube_nominal_state_used.npy") tube_sc_nominal_trajectory = np.load(data_dir + "tube_nominal_trajectory.npy") tube_robust_state_trajectory = np.load(data_dir + "tube_robust_state_trajectory.npy") tube_robust_nominal_trajectory = np.load(data_dir + "tube_robust_nominal_trajectory.npy") tube_robust_rfe = np.load(data_dir + "tube_robust_real_free_energy.npy") robust_sc_rfe = np.load(data_dir + "robust_sc_real_free_energy.npy") # robust_sc_nfe = np.load(data_dir + "robust_sc_nominal_free_energy.npy") robust_sc_state_trajectory = np.load(data_dir + "robust_sc_state_trajectory.npy") robust_sc_nominal_state_used = np.load(data_dir + "robust_sc_nominal_state_used.npy") robust_sc_nominal_trajectory = np.load(data_dir + "robust_sc_nominal_trajectory.npy") robust_rc_rfe = np.load(data_dir + 
"robust_rc_real_free_energy.npy") # robust_rc_nfe = np.load(data_dir + "robust_rc_nominal_free_energy.npy") robust_rc_state_trajectory = np.load(data_dir + "robust_rc_state_trajectory.npy") robust_rc_nominal_state_used = np.load(data_dir + "robust_rc_nominal_state_used.npy") robust_rc_rfe_bound = np.load(data_dir + "robust_rc_real_free_energy_bound.npy") robust_rc_nfe_bound = np.load(data_dir + "robust_rc_nominal_free_energy_bound.npy") robust_rc_rfe_growth_bound = np.load(data_dir + "robust_rc_real_free_energy_growth_bound.npy")[1:] # robust_rc_rfc_growth = np.load(data_dir + "robust_rc_real_free_energy_growth.npy") robust_rc_rfc_growth = robust_rc_rfe[1:] - robust_rc_rfe[:-1] robust_rc_nfc_growth = np.load(data_dir + "robust_rc_nominal_free_energy_growth.npy") robust_rc_nominal_trajectory = np.load(data_dir + "robust_rc_nominal_trajectory.npy") robust_ccm_rfe = np.load(data_dir + "robust_large_actual_free_energy_CCM_t_2999.npy") robust_ccm_nfe = np.load(data_dir + "robust_large_nominal_free_energy_CCM_t_2999.npy") robust_ccm_state_trajectory = np.load(data_dir + "robust_large_actual_traj_CCM_t_2999.npy") robust_ccm_nominal_state_used = np.load(data_dir + "robust_large_nominal_state_used_CCM_t_2999.npy") robust_ccm_rfe_bound = np.load(data_dir + "robust_large_actual_free_energy_bound_CCM_t_2999.npy") robust_ccm_nfe_bound = np.load(data_dir + "robust_large_nominal_free_energy_bound_CCM_t_2999.npy") robust_ccm_rfe_growth_bound = np.load(data_dir + "robust_large_actual_free_energy_growth_bound_CCM_t_2999.npy") # robust_ccm_rfc_growth = np.load(data_dir + "robust_large_actual_free_energy_growth_CCM_t_2999.npy") robust_ccm_rfc_growth = robust_ccm_rfe[1:] - robust_ccm_rfe[:-1] robust_ccm_nfc_growth = np.load(data_dir + "robust_large_nominal_free_energy_growth_CCM_t_2999.npy") robust_ccm_nominal_trajectory = np.load(data_dir + "robust_large_nominal_traj_CCM_t_2999.npy") # plot_fe_vs_time(vanilla_fe, 1000,"Vanilla Real Free Energy") # 
plot_trajectory(vanilla_state_trajectory, "Vanilla State Trajectory", save_fig) # plot_fe_vs_time(vanilla_large_fe, 1000,"Vanilla Large Real Free Energy") # plot_trajectory(vanilla_large_state_trajectory, "Vanilla Large State Trajectory", save_fig) # plot_fe_vs_time(vanilla_robust_fe, 100, "Vanilla Robust Real Free Energy", save_fig) # plot_trajectory(vanilla_robust_state_trajectory, "Vanilla Robust State Trajectory", save_fig) # # plot_fe_vs_time(tube_large_rfe, 1000,"Tube Real Free Energy") # plot_fe_vs_time(tube_large_nfe, 1000,"Tube Nominal Free Energy") # plot_fe_vs_time(tube_robust_rfe, 100, "Tube Robust Real Free Energy", save_fig) # plot_trajectory(tube_large_state_trajectory, "Tube State Trajectory", save_fig) # plot_nominal_trajectory(tube_large_state_trajectory, tube_sc_nominal_trajectory, "Tube Standard State Trajectory", save_fig) # plot_nominal_trajectory(tube_robust_state_trajectory, tube_robust_nominal_trajectory, "Tube Robust State Trajectory", save_fig) # # plot_nominal_state_used_vs_time(tube_large_nominal_state_used) # # plot_fe_vs_time(robust_sc_rfe, 1000, "Robust Standard Real Free Energy", save_fig) # plot_fe_vs_time(robust_sc_nfe, 1000, "Robust Standard Nominal Free Energy") # plot_trajectory(robust_sc_state_trajectory, "Robust Standard State Trajectory") # # plot_nominal_state_used_vs_time(robust_sc_nominal_state_used) # plot_nominal_trajectory(robust_sc_state_trajectory, robust_sc_nominal_trajectory, "Robust Standard State Trajectory", save_fig) # plot_fe_vs_time(robust_rc_rfe, 100, "Robust Robust Real Free Energy", save_fig) plot_fe_bounded(robust_rc_rfc_growth, robust_rc_rfe_growth_bound, "Robust Robust Real Free Energy Growth") # plot_fe_bounded(robust_rc_nfe, robust_rc_nfe_bound, "Robust Robust Nominal Free Energy") # plot_fe_vs_time(robust_rc_nfc_growth, 0,"Robust Robust Nominal Free Energy Growth") # plot_trajectory(robust_rc_state_trajectory, "Robust Robust State Trajectory") # plot_nominal_trajectory(robust_rc_state_trajectory, 
robust_rc_nominal_trajectory, "Robust Robust State Trajectory", save_fig) # plot_nominal_state_used_vs_time(robust_rc_nominal_state_used) # Correct bound incorrect_bound = (robust_ccm_rfe_bound - robust_ccm_nfe) correct_bound = 8*incorrect_bound # plot_fe_vs_time(robust_ccm_rfe, 100, "Robust CCM Real Free Energy", save_fig) plot_fe_bounded(robust_ccm_rfc_growth, (robust_ccm_rfe_growth_bound-incorrect_bound+correct_bound)[1:], "Robust CCM Real Free Energy Growth") # plot_fe_bounded(robust_ccm_nfe, robust_ccm_nfe_bound, "Robust CCM Nominal Free Energy") # plot_fe_vs_time(robust_ccm_nfc_growth, 0,"Robust CCM Nominal Free Energy Growth") # plot_trajectory(robust_ccm_state_trajectory, "Robust CCM State Trajectory") # plot_nominal_state_used_vs_time(robust_ccm_nominal_state_used) # plot_nominal_trajectory(robust_ccm_state_trajectory, robust_ccm_nominal_trajectory, "Robust CCM State Trajectory", save_fig) if show_plot: plt.show() if __name__ == "__main__": parser = argparse.ArgumentParser(description = 'Say hello') parser.add_argument('--build_dir', help='Location of MPPI-Generic build folder', required=True) parser.add_argument('--show_fig', default=False, help='Show the figure upon completion of plotting.', required=False) parser.add_argument('--save_fig', default=False, help='Save the figures after they are generated.', required=False) args = vars(parser.parse_args()) main(args) ================================================ FILE: scripts/run_autoformatter.sh ================================================ #! /usr/bin/env bash if [ "$1" == "" ]; then dir="./" else dir="$1" fi # Format cuda header files echo "Formatting cuh files..." find $dir -name "*.cuh" ! -path "*build*" ! -path "*submodules*" | xargs -I % bash -c "clang-format --style=file -i --assume-filename=cu %" # Format cuda source files echo "Formatting cu files..." find $dir -name "*.cu" ! -path "*build*" ! 
-path "*submodules*" | xargs -I % bash -c "clang-format --style=file -i %" # Format cpp source files echo "Formatting cpp files..." find $dir -name "*.cpp" ! -path "*build*" ! -path "*submodules*" | xargs -I % bash -c "clang-format --style=file -i %" # Format h source files echo "Formatting h files..." find $dir -name "*.h" ! -path "*build*" ! -path "*submodules*" | xargs -I % bash -c "clang-format --style=file -i %" # Format hpp source files echo "Formatting hpp files..." find $dir -name "*.hpp" ! -path "*build*" ! -path "*submodules*" | xargs -I % bash -c "clang-format --style=file -i %" ================================================ FILE: src/CMakeLists.txt ================================================ add_subdirectory(controllers) ================================================ FILE: src/controllers/CMakeLists.txt ================================================ # Needed for methods to be built into libraries in Release mode set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-fno-guess-branch-probability") add_subdirectory(cartpole) add_subdirectory(autorally) add_subdirectory(double_integrator) add_subdirectory(quadrotor) ================================================ FILE: src/controllers/autorally/CMakeLists.txt ================================================ set(LIBRARY_NAME autorally_mppi) SET (LIB_MAJOR 0) SET (LIB_MINOR 0) SET (LIB_RELEASE 1) add_library(${LIBRARY_NAME} SHARED autorally_mppi.cu ) target_link_libraries( ${LIBRARY_NAME} ${YAML_CPP_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME} ) SET (_soversion ${LIB_MAJOR}.${LIB_MINOR}.${LIB_RELEASE}) install(TARGETS ${LIBRARY_NAME} # IMPORTANT: Add the library to the "export-set" EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION bin LIBRARY DESTINATION lib/${CMAKE_PROJECT_NAME} ) ================================================ FILE: src/controllers/autorally/autorally_mppi.cu ================================================ #include /* * This file contains the instantiations of the controller for 
the AutoRally platform. * Will have a dynamics model of AutoRally, some cost function, * and a controller of just MPPI, (not tube or R) */ // Convenience typedef for the MPPI Controller. template class VanillaMPPIController; // template class VanillaMPPIController; ================================================ FILE: src/controllers/cartpole/CMakeLists.txt ================================================ set(LIBRARY_NAME cartpole_mppi) SET (LIB_MAJOR 0) SET (LIB_MINOR 0) SET (LIB_RELEASE 1) add_library(${LIBRARY_NAME} SHARED cartpole_mppi.cu ) target_link_libraries( ${LIBRARY_NAME} ${YAML_CPP_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME} ) SET (_soversion ${LIB_MAJOR}.${LIB_MINOR}.${LIB_RELEASE}) install(TARGETS ${LIBRARY_NAME} # IMPORTANT: Add the library to the "export-set" EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION bin LIBRARY DESTINATION lib/${CMAKE_PROJECT_NAME} ) ================================================ FILE: src/controllers/cartpole/cartpole_mppi.cu ================================================ #include /* * This file contains the instantiations of the controller for the cart pole. 
* Will have a dynamics model of cartpole, some cost function, * and a controller of just MPPI, (not tube or R) */ // Num_timesteps, num_rollouts, blockdim x, blockdim y const int NUMBER_TIMESTEPS = 100; // template class GPUFeedbackController, CartpoleDynamics>, CartpoleDynamics>; // template class DeviceDDPImpl, CartpoleDynamics>; // template class DeviceDDP; template class DDPFeedback; template class DDPFeedback; #define SAMPLER_T mppi::sampling_distributions::GaussianDistribution template class SAMPLER_T; template class mppi::sampling_distributions::GaussianDistributionImpl< SAMPLER_T, mppi::sampling_distributions::GaussianParams, CartpoleDynamics::DYN_PARAMS_T>; // template __host__ void mppi::sampling_distributions::SamplingDistribution::freeCudaMem(); template class mppi::sampling_distributions::SamplingDistribution< SAMPLER_T, mppi::sampling_distributions::GaussianParams, CartpoleDynamics::DYN_PARAMS_T>; template class Controller, SAMPLER_T, NUMBER_TIMESTEPS, 2048>; template class Controller, SAMPLER_T, NUMBER_TIMESTEPS, 256>; template class Controller, SAMPLER_T, 150, 512>; template class VanillaMPPIController, NUMBER_TIMESTEPS, 2048>; template class VanillaMPPIController, NUMBER_TIMESTEPS, 256>; template class VanillaMPPIController, 150, 512>; #undef SAMPLER_T // template class TubeMPPIController, NUMBER_TIMESTEPS, 2048>; ================================================ FILE: src/controllers/double_integrator/CMakeLists.txt ================================================ set(LIBRARY_NAME double_integrator_mppi) SET (LIB_MAJOR 0) SET (LIB_MINOR 0) SET (LIB_RELEASE 1) add_library(${LIBRARY_NAME} SHARED double_integrator_mppi.cu ) target_link_libraries( ${LIBRARY_NAME} ${YAML_CPP_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME} ) SET (_soversion ${LIB_MAJOR}.${LIB_MINOR}.${LIB_RELEASE}) install(TARGETS ${LIBRARY_NAME} # IMPORTANT: Add the library to the "export-set" EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION bin LIBRARY DESTINATION lib/${CMAKE_PROJECT_NAME} ) 
================================================ FILE: src/controllers/double_integrator/double_integrator_mppi.cu ================================================ #include /* * This file contains the instantiations of the controller for the double integrator. * Will have a dynamics model of the double integrator, some cost function, * and a controller of just MPPI, (not tube or R) */ // Num_timesteps, num_rollouts, blockdim x, blockdim y typedef mppi::sampling_distributions::GaussianDistribution Sampler; template class VanillaMPPIController, 100, 512, Sampler>; template class VanillaMPPIController, 50, 1024, Sampler>; template class TubeMPPIController, 100, 512, Sampler>; template class TubeMPPIController, 50, 1024, Sampler>; template class TubeMPPIController, 100, 1024, Sampler>; ================================================ FILE: src/controllers/quadrotor/CMakeLists.txt ================================================ set(LIBRARY_NAME quadrotor_mppi) SET (LIB_MAJOR 0) SET (LIB_MINOR 0) SET (LIB_RELEASE 1) add_library(${LIBRARY_NAME} SHARED quadrotor_mppi.cu ) target_link_libraries( ${LIBRARY_NAME} ${YAML_CPP_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME} ) SET (_soversion ${LIB_MAJOR}.${LIB_MINOR}.${LIB_RELEASE}) install(TARGETS ${LIBRARY_NAME} # IMPORTANT: Add the library to the "export-set" EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION bin LIBRARY DESTINATION lib/${CMAKE_PROJECT_NAME} ) ================================================ FILE: src/controllers/quadrotor/quadrotor_mppi.cu ================================================ #include typedef mppi::sampling_distributions::GaussianDistribution Sampler; template class VanillaMPPIController, 100, 512, Sampler>; template class VanillaMPPIController, 100, 512, Sampler>; ================================================ FILE: tests/CMakeLists.txt ================================================ # Add project includes and cuda includes for g++ #include test utils include_directories(include) # Create header file pointing to test 
neural network set(HEADER_PATH ${CMAKE_CURRENT_SOURCE_DIR}/templated_headers/) # Create header file pointing to test map set(TEST_MAP_FOLDER "${PROJECT_BINARY_DIR}") set(TEST_NETWORK_FOLDER "${PROJECT_BINARY_DIR}") if(MPPI_BUILD_TESTS) message("") find_package(PythonInterp) # Create neural network map execute_process(COMMAND ${PYTHON_EXECUTABLE} ${PROJECT_SOURCE_DIR}/scripts/autorally/test/generateTestNetwork.py -o ${TEST_NETWORK_FOLDER} OUTPUT_FILE /dev/null ) message(STATUS "Created ${TEST_NETWORK_FOLDER}/*.npz") message("") endif() if(MPPI_BUILD_TESTS) message("") find_package(PythonInterp) # Create test map execute_process(COMMAND ${PYTHON_EXECUTABLE} ${PROJECT_SOURCE_DIR}/scripts/autorally/test/generateTestMaps.py -o ${TEST_MAP_FOLDER}/ OUTPUT_FILE /dev/null ) message(STATUS "Created folder ${TEST_MAP_FOLDER}/*.npz") message("") endif() file(GLOB HEADERS ${HEADER_PATH}*) foreach(T_FILE IN LISTS HEADERS) get_filename_component(T_NAME ${T_FILE} NAME_WE) string(REPLACE ".in" "" STRIPPED_NAME ${T_NAME}) message("stripped name ${STRIPPED_NAME}") set(TEST_NETWORK_HEADER_FILE ${PROJECT_BINARY_DIR}/include/${STRIPPED_NAME}.h) configure_file(${T_FILE} ${TEST_NETWORK_HEADER_FILE}) message(STATUS "Generated ${TEST_NETWORK_HEADER_FILE}") endforeach() # Add the generated header files to list of includes include_directories(${PROJECT_BINARY_DIR}/include) message("") add_subdirectory(dynamics) add_subdirectory(cost_functions) add_subdirectory(feedback_controllers) add_subdirectory(mppi_core) add_subdirectory(controllers) add_subdirectory(misc) add_subdirectory(sampling_distributions) add_subdirectory(shaping_functions) add_subdirectory(math_utils) add_subdirectory(integration) add_subdirectory(texture_helpers) add_subdirectory(nn_helpers) ================================================ FILE: tests/controllers/CMakeLists.txt ================================================ set(GTEST_LIBRARIES gtest gmock gtest_main) file(GLOB TARGET_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} 
*.cu) foreach(T_FILE IN LISTS TARGET_SRCS) get_filename_component(T_NAME ${T_FILE} NAME_WE) add_executable(${T_NAME} ${PROJECT_SOURCE_DIR}/tests/test_main.cpp ${T_FILE}) target_link_libraries(${T_NAME} ${GTEST_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME}) # add_test(NAME ${T_NAME} COMMAND ${T_NAME}) gtest_add_tests(TARGET ${T_NAME}) set_target_properties(${T_NAME} PROPERTIES FOLDER test) endforeach() ================================================ FILE: tests/controllers/controller_generic_tests.cu ================================================ #include #include #include #include #include #include #include #include #include #include #include static const int number_rollouts = 1200; static const int NUM_TIMESTEPS = 100; using FEEDBACK_T = DDPFeedback; const dim3 rolloutDim(1, 2, 1); using SAMPLER_T = mppi::sampling_distributions::GaussianDistribution; class TestController : public Controller { public: typedef Controller PARENT_CLASS; using PARAMS_T = PARENT_CLASS::TEMPLATED_PARAMS; TestController(MockDynamics* model, MockCost* cost, FEEDBACK_T* fb_controller, SAMPLER_T* sampler, float dt, int max_iter, float lambda, float alpha, int num_timesteps = 100, const Eigen::Ref& init_control_traj = control_trajectory::Zero(), cudaStream_t stream = nullptr) : PARENT_CLASS(model, cost, fb_controller, sampler, dt, max_iter, lambda, alpha, num_timesteps, init_control_traj, stream) { // Allocate CUDA memory for the controller allocateCUDAMemoryHelper(0); } TestController(MockDynamics* model, MockCost* cost, FEEDBACK_T* fb_controller, SAMPLER_T* sampler, PARAMS_T& params, cudaStream_t stream = nullptr) : PARENT_CLASS(model, cost, fb_controller, sampler, params, stream) { // Allocate CUDA memory for the controller allocateCUDAMemoryHelper(0); } virtual void computeControl(const Eigen::Ref& state, int optimization_stride) override { } virtual void calculateSampledStateTrajectories() override{}; void computeControl(const Eigen::Ref& state, const std::array noise) { int trajectory_size 
= control_trajectory().size(); for (int i = 0; i < number_rollouts; i++) { HANDLE_ERROR(cudaMemcpyAsync(this->sampler_->getControlSample(i, 0, 0), noise[i].data(), sizeof(float) * trajectory_size, cudaMemcpyHostToDevice, stream_)); } HANDLE_ERROR(cudaStreamSynchronize(stream_)); // Normally rolloutKernel would be called here and would transform // control_noise_d_ from u to u + noise // Instead we just get back noise in this test this->copySampledControlFromDevice(); } virtual void slideControlSequence(int steps) override { } cudaStream_t getStream() { return stream_; } }; class ControllerTests : public ::testing::Test { protected: void SetUp() override { mockDynamics = new MockDynamics(); mockCost = new MockCost(); mockFeedback = new FEEDBACK_T(mockDynamics, dt); sampler = new SAMPLER_T(); HANDLE_ERROR(cudaStreamCreate(&stream)); MockDynamics tmp_dynamics(); // expect double check rebind EXPECT_CALL(*mockCost, bindToStream(testing::_)).Times(1); EXPECT_CALL(*mockDynamics, bindToStream(testing::_)).Times(1); // EXPECT_CALL(mockFeedback, bindToStream(stream)).Times(1); // expect GPU setup called again EXPECT_CALL(*mockCost, GPUSetup()).Times(1); EXPECT_CALL(*mockDynamics, GPUSetup()).Times(1); // EXPECT_CALL(mockFeedback, GPUSetup()).Times(1); controller = new TestController(mockDynamics, mockCost, mockFeedback, sampler, dt, max_iter, lambda, alpha); auto controller_params = controller->getParams(); controller_params.dynamics_rollout_dim_ = rolloutDim; controller_params.cost_rollout_dim_ = rolloutDim; controller->setParams(controller_params); } void TearDown() override { delete controller; delete mockDynamics; delete mockCost; delete mockFeedback; delete sampler; } MockDynamics* mockDynamics; MockCost* mockCost; FEEDBACK_T* mockFeedback; SAMPLER_T* sampler; TestController* controller; float dt = 0.1; int max_iter = 1; float lambda = 1.2; float alpha = 0.1; cudaStream_t stream; }; TEST_F(ControllerTests, ConstructorDestructor) { int num_timesteps = 10; 
TestController::control_trajectory init_control_trajectory = TestController::control_trajectory::Ones(); // expect double check rebind EXPECT_CALL(*mockCost, bindToStream(stream)).Times(1); EXPECT_CALL(*mockDynamics, bindToStream(stream)).Times(1); // // EXPECT_CALL(mockFeedback, bindToStream(stream)).Times(1); // // expect GPU setup called again EXPECT_CALL(*mockCost, GPUSetup()).Times(1); EXPECT_CALL(*mockDynamics, GPUSetup()).Times(1); // EXPECT_CALL(mockFeedback, GPUSetup()).Times(1); TestController* controller_test = new TestController(mockDynamics, mockCost, mockFeedback, sampler, dt, max_iter, lambda, alpha, num_timesteps, init_control_trajectory, stream); EXPECT_EQ(controller_test->model_, mockDynamics); EXPECT_EQ(controller_test->cost_, mockCost); EXPECT_EQ(controller_test->getDt(), dt); EXPECT_EQ(controller_test->getNumIters(), max_iter); EXPECT_EQ(controller_test->getLambda(), lambda); EXPECT_EQ(controller_test->getAlpha(), alpha); EXPECT_EQ(controller_test->getNumTimesteps(), num_timesteps); EXPECT_EQ(controller_test->getControlSeq(), init_control_trajectory); EXPECT_EQ(controller_test->getStream(), stream); EXPECT_EQ(controller_test->getFeedbackEnabled(), false); // TODO check that a random seed was set and stream was set // EXPECT_NE(controller_test->getRandomSeed(), 0); // TODO check for correct defaults delete controller_test; } TEST_F(ControllerTests, ParamBasedConstructor) { int num_timesteps = 10; TestController::TEMPLATED_PARAMS controller_params; controller_params.num_timesteps_ = num_timesteps; controller_params.dt_ = dt; controller_params.num_iters_ = max_iter; controller_params.lambda_ = lambda; controller_params.alpha_ = alpha; controller_params.init_control_traj_ = TestController::control_trajectory::Ones(); // expect double check rebind EXPECT_CALL(*mockCost, bindToStream(stream)).Times(1); EXPECT_CALL(*mockDynamics, bindToStream(stream)).Times(1); // // EXPECT_CALL(mockFeedback, bindToStream(stream)).Times(1); // // expect GPU setup 
called again EXPECT_CALL(*mockCost, GPUSetup()).Times(1); EXPECT_CALL(*mockDynamics, GPUSetup()).Times(1); // EXPECT_CALL(mockFeedback, GPUSetup()).Times(1); TestController* controller_test = new TestController(mockDynamics, mockCost, mockFeedback, sampler, controller_params, stream); EXPECT_EQ(controller_test->model_, mockDynamics); EXPECT_EQ(controller_test->cost_, mockCost); EXPECT_EQ(controller_test->getDt(), dt); EXPECT_EQ(controller_test->getNumIters(), max_iter); EXPECT_EQ(controller_test->getLambda(), lambda); EXPECT_EQ(controller_test->getAlpha(), alpha); EXPECT_EQ(controller_test->getNumTimesteps(), num_timesteps); EXPECT_EQ(controller_test->getControlSeq(), controller_params.init_control_traj_); EXPECT_EQ(controller_test->getStream(), stream); EXPECT_EQ(controller_test->getFeedbackEnabled(), false); // TODO check that a random seed was set and stream was set // EXPECT_NE(controller_test->getRandomSeed(), 0); // TODO check for correct defaults delete controller_test; } TEST_F(ControllerTests, setNumTimesteps) { controller->setNumTimesteps(10); EXPECT_EQ(controller->getNumTimesteps(), 10); controller->setNumTimesteps(1000); EXPECT_EQ(controller->getNumTimesteps(), 100); } TEST_F(ControllerTests, smoothControlTrajectory) { TestController::control_trajectory u; u.col(0) = TestController::control_array::Ones(); u.col(1) = 2.0 * TestController::control_array::Ones(); Eigen::Matrix control_history = Eigen::Matrix::Zero(); // smooth using only one value from the control trajectory controller->setNumTimesteps(1); controller->smoothControlTrajectoryHelper(u, control_history); for (int i = 0; i < MockDynamics::CONTROL_DIM; i++) { EXPECT_FLOAT_EQ(u(i, 0), (1.0 * 17 + 1.0 * 12 + 1.0 * -3) / 35.0) << "i = " << i; } // Reset trajectory and smooth over two steps u.col(0) = TestController::control_array::Ones(); controller->setNumTimesteps(2); controller->smoothControlTrajectoryHelper(u, control_history); for (int i = 0; i < MockDynamics::CONTROL_DIM; i++) { 
EXPECT_FLOAT_EQ(u(i, 0), (1.0 * 17 + 2.0 * 12 + 2.0 * -3) / 35.0) << "i = " << i; EXPECT_FLOAT_EQ(u(i, 1), (1.0 * 12 + 2.0 * 17 + 2.0 * 12 + 2.0 * -3) / 35.0) << "i = " << i; } } TEST_F(ControllerTests, slideControlSequenceHelper) { TestController::control_trajectory u; for (int i = 0; i < controller->getNumTimesteps(); i++) { TestController::control_array control = TestController::control_array::Ones(); control = control * i; u.col(i) = control.transpose(); } controller->slideControlSequenceHelper(1, u); for (int i = 0; i < controller->getNumTimesteps(); i++) { for (int j = 0; j < MockDynamics::CONTROL_DIM; j++) { int val = std::min(i + 1, controller->getNumTimesteps() - 1); if (i + 1 > controller->getNumTimesteps() - 1) { EXPECT_FLOAT_EQ(u(j, i), 0); } else { EXPECT_FLOAT_EQ(u(j, i), val); } } } controller->slideControlSequenceHelper(10, u); for (int i = 0; i < controller->getNumTimesteps(); i++) { for (int j = 0; j < MockDynamics::CONTROL_DIM; j++) { int val = std::min(i + 11, controller->getNumTimesteps() - 1); if (i + 10 > controller->getNumTimesteps() - 2) { EXPECT_FLOAT_EQ(u(j, i), 0); } else { EXPECT_FLOAT_EQ(u(j, i), val); } } } } TEST_F(ControllerTests, computeStateTrajectoryHelper) { TestController::state_array x = TestController::state_array::Ones(); TestController::state_array xdot = TestController::state_array::Ones(); EXPECT_CALL(*mockDynamics, step(testing::_, testing::_, testing::_, testing::_, testing::_, testing::_, dt)) .Times(controller->getNumTimesteps() - 1); TestController::state_trajectory result = TestController::state_trajectory::Ones(); TestController::control_trajectory u = TestController::control_trajectory::Zero(); controller->computeStateTrajectoryHelper(result, x, u); // TODO: Figure out if we can actually check output if output is not part of input // for (int i = 0; i < controller->getNumTimesteps(); i++) // { // for (int j = 0; j < MockDynamics::STATE_DIM; j++) // { // EXPECT_FLOAT_EQ(result(j, i), 1.0) << "t: " << i << ", s_dim: 
" << j << "\n"; // } // } } TEST_F(ControllerTests, interpolateControl) { TestController::control_trajectory traj; for (int i = 0; i < controller->getNumTimesteps(); i++) { traj.col(i) = TestController::control_array::Ones() * i; } for (double i = 0; i < controller->getNumTimesteps() - 1; i += 0.25) { TestController::control_array result = controller->interpolateControls(i * controller->getDt(), traj); EXPECT_FLOAT_EQ(result(0), i) << i; } } TEST_F(ControllerTests, interpolateFeedback) { controller->initFeedback(); auto fb_state = controller->getFeedbackState(); for (int i = 0; i < fb_state.FEEDBACK_SIZE; i++) { fb_state.fb_gain_traj_[i] = i; } TestController::state_trajectory s_traj = TestController::state_trajectory::Zero(); TestController::state_array state = TestController::state_array::Ones(); for (double i = 0; i < controller->getNumTimesteps() - 1; i += 0.25) { TestController::state_array interpolated_state = controller->interpolateState(s_traj, i * controller->getDt()); TestController::control_array result = controller->interpolateFeedback(state, interpolated_state, i * controller->getDt(), fb_state); EXPECT_FLOAT_EQ(result(0), i); } } TEST_F(ControllerTests, getCurrentControlTest) { EXPECT_CALL(*mockDynamics, enforceConstraints(testing::_, testing::_)).Times(4 * (controller->getNumTimesteps() - 1)); TestController::control_trajectory traj; controller->initFeedback(); auto fb_state = controller->getFeedbackState(); for (int i = 0; i < controller->getNumTimesteps(); i++) { for (int j = 0; j < MockDynamics::STATE_DIM * MockDynamics::CONTROL_DIM; j++) { int i_index = i * MockDynamics::STATE_DIM * MockDynamics::CONTROL_DIM; fb_state.fb_gain_traj_[i_index + j] = i_index + j; } traj.col(i) = TestController::control_array::Ones() * i; } TestController::state_trajectory s_traj = TestController::state_trajectory::Zero(); TestController::state_array state = TestController::state_array::Ones(); for (double i = 0; i < controller->getNumTimesteps() - 1; i += 0.25) { 
TestController::state_array interpolated_state = controller->interpolateState(s_traj, i * controller->getDt()); TestController::control_array result = controller->getCurrentControl(state, i * controller->getDt(), interpolated_state, traj, fb_state); EXPECT_FLOAT_EQ(result(0), i * 2); } } TEST_F(ControllerTests, saveControlHistoryHelper_1) { int steps = 1; TestController::control_trajectory u = TestController::control_trajectory::Random(); Eigen::Matrix u_history; u_history.setOnes(); controller->saveControlHistoryHelper(steps, u, u_history); for (int i = 0; i < MockDynamics::CONTROL_DIM; ++i) { EXPECT_FLOAT_EQ(u_history(i, 0), 1.0f) << "History column 0 failed"; EXPECT_FLOAT_EQ(u_history(i, 1), u(i, steps - 1)) << "History column 1 failed"; } } TEST_F(ControllerTests, saveControlHistoryHelper_2) { int steps = 4; TestController::control_trajectory u = TestController::control_trajectory::Random(); Eigen::Matrix u_history; u_history.setOnes(); controller->saveControlHistoryHelper(steps, u, u_history); for (int i = 0; i < MockDynamics::CONTROL_DIM; ++i) { EXPECT_FLOAT_EQ(u_history(i, 0), u(i, steps - 2)) << "History column 0 failed"; EXPECT_FLOAT_EQ(u_history(i, 1), u(i, steps - 1)) << "History column 1 failed"; } } ================================================ FILE: tests/controllers/controller_kernel_testing.cu ================================================ #include #include #include #include #include #include #include #include class DoubleIntegratorDummyCost : public Cost { public: DoubleIntegratorDummyCost(cudaStream_t stream = 0) { bindToStream(stream); } __device__ float computeStateCost(float* s, int timestep, float* theta_c, int* crash_status) { float cost = SQ(s[0]); for (int i = 0; i < 100; i++) { cost = abs(cos(cost)); } return cost; } float computeStateCost(const Eigen::Ref s, int timestep, int* crash_status) { float cost = SQ(s[0]); for (int i = 0; i < 100; i++) { cost = abs(cos(cost)); } return cost; } float terminalCost(const Eigen::Ref s) { return 
0; } __device__ float terminalCost(float* s, float* theta_c) { return 0; } }; #include const int MAX_TIMESTEPS = 200; using DI_FEEDBACK_T = DDPFeedback; template using DI_Vanilla = VanillaMPPIController; template using DI_Colored = ColoredMPPIController; template using DI_Tube = TubeMPPIController; template using DI_Robust = RobustMPPIController; // TODO: Add more dynamics/cost function specializations template class ControllerKernelChoiceTest : public ::testing::Test { public: using DYN_T = typename CONTROLLER_T::TEMPLATED_DYNAMICS; using COST_T = typename CONTROLLER_T::TEMPLATED_COSTS; using FB_T = typename CONTROLLER_T::TEMPLATED_FEEDBACK; using SAMPLER_T = typename CONTROLLER_T::TEMPLATED_SAMPLING; using CONTROLLER_PARAMS_T = typename CONTROLLER_T::TEMPLATED_PARAMS; using SAMPLER_PARAMS_T = typename SAMPLER_T::SAMPLING_PARAMS_T; DYN_T* model = nullptr; COST_T* cost = nullptr; SAMPLER_T* sampler = nullptr; FB_T* fb_controller = nullptr; std::shared_ptr controller; mppi::util::MPPILoggerPtr logger = std::make_shared(mppi::util::LOG_LEVEL::DEBUG); void SetUp() override { model = new DYN_T(); cost = new COST_T(); fb_controller = new FB_T(model, dt); SAMPLER_PARAMS_T sampler_params; for (int i = 0; i < DYN_T::CONTROL_DIM; i++) { sampler_params.std_dev[i] = std_dev; } sampler = new SAMPLER_T(sampler_params); HANDLE_ERROR(cudaStreamCreate(&stream)); CONTROLLER_PARAMS_T controller_params; setUpControllerParams(controller_params); controller = std::make_shared(model, cost, fb_controller, sampler, controller_params, stream); controller->setLogger(logger); controller->setLogLevel(mppi::util::LOG_LEVEL::WARNING); } void setUpControllerParams(CONTROLLER_PARAMS_T& params) { params.dt_ = dt; params.num_timesteps_ = num_timesteps; params.dynamics_rollout_dim_ = rollout_dyn; params.cost_rollout_dim_ = rollout_cost; } void TearDown() override { controller.reset(); delete fb_controller; delete sampler; delete model; delete cost; HANDLE_ERROR(cudaStreamSynchronize(stream)); } 
cudaStream_t stream; protected: const int num_timesteps = 150; float dt = 0.01; // float lambda = 1.0; // float alpha = 0.0; float std_dev = 2.0; dim3 rollout_dyn = dim3(64, DYN_T::STATE_DIM, 1); dim3 rollout_cost = dim3(96, 1, 1); dim3 eval_dyn = dim3(64, DYN_T::STATE_DIM, 1); dim3 eval_cost = dim3(96, 1, 1); }; template using DIFFERENT_CONTROLLERS = ::testing::Types, DI_Colored, DI_Tube, DI_Robust, DI_Vanilla, DI_Colored, DI_Tube, DI_Robust, DI_Vanilla, DI_Colored, DI_Tube, DI_Robust, DI_Vanilla, DI_Colored, DI_Tube, DI_Robust>; TYPED_TEST_SUITE(ControllerKernelChoiceTest, DIFFERENT_CONTROLLERS<128>); TYPED_TEST(ControllerKernelChoiceTest, CheckAppropriateKernelSelection) { const int further_evaluations = 20; auto empty_state = this->model->getZeroState(); this->controller->setLogLevel(mppi::util::LOG_LEVEL::DEBUG); this->controller->chooseAppropriateKernel(); auto auto_kernel_choice = this->controller->getKernelChoiceAsEnum(); // Start testing single kernel this->controller->setKernelChoice(kernelType::USE_SINGLE_KERNEL); auto start_single_kernel_time = std::chrono::steady_clock::now(); for (int i = 0; i < further_evaluations; i++) { this->controller->computeControl(empty_state, 1); } auto end_single_kernel_time = std::chrono::steady_clock::now(); // Start testing split kernel this->controller->setKernelChoice(kernelType::USE_SPLIT_KERNELS); auto start_split_kernel_time = std::chrono::steady_clock::now(); for (int i = 0; i < further_evaluations; i++) { this->controller->computeControl(empty_state, 1); } auto end_split_kernel_time = std::chrono::steady_clock::now(); float single_kernel_duration = mppi::math::timeDiffms(end_single_kernel_time, start_single_kernel_time); float split_kernel_duration = mppi::math::timeDiffms(end_split_kernel_time, start_split_kernel_time); if (fabsf(single_kernel_duration - split_kernel_duration) < 1.0f) { // the kernels are within 1 ms of each other this->logger->info("Durations of both kernels too close to determine winner: split = 
%f ms, single = %f ms\n", split_kernel_duration, single_kernel_duration); } else { ASSERT_EQ(split_kernel_duration <= single_kernel_duration, auto_kernel_choice == kernelType::USE_SPLIT_KERNELS) << "chooseAppropriateKernel() did not choose the faster kernel, single: " << single_kernel_duration << " ms, split: " << split_kernel_duration; } } TYPED_TEST(ControllerKernelChoiceTest, KernelChoiceThroughSharedMemCheck) { auto controller_params = this->controller->getParams(); // set Rollout Cost dim so high as to ensure the kernel would run out of shared mem if used controller_params.cost_rollout_dim_.x = 320000; this->controller->setParams(controller_params); this->controller->chooseAppropriateKernel(); ASSERT_EQ(this->controller->getKernelChoiceAsEnum(), kernelType::USE_SINGLE_KERNEL); } TYPED_TEST(ControllerKernelChoiceTest, NoUsableKernelCheck) { auto controller_params = this->controller->getParams(); // set Rollout Cost dim so high as to ensure the kernel would run out of shared mem if used controller_params.cost_rollout_dim_.x = 320000; controller_params.dynamics_rollout_dim_.x = 320000; this->controller->setParams(controller_params); EXPECT_ANY_THROW({ this->controller->chooseAppropriateKernel(); }); } TYPED_TEST(ControllerKernelChoiceTest, MoreEvaluationsDoNotAdjustChoice) { int num_evaluations = this->controller->getNumKernelEvaluations(); this->controller->setLogLevel(mppi::util::LOG_LEVEL::DEBUG); this->controller->chooseAppropriateKernel(); auto curr_select_kernel = this->controller->getKernelChoiceAsEnum(); float prev_num_eval_time = 0; for (int i = 0; i < 5; i++) { this->controller->setNumKernelEvaluations(num_evaluations); auto start_num_eval_time = std::chrono::steady_clock::now(); this->controller->chooseAppropriateKernel(); auto end_num_eval_time = std::chrono::steady_clock::now(); float num_eval_time = mppi::math::timeDiffms(end_num_eval_time, start_num_eval_time); ASSERT_EQ(this->controller->getKernelChoiceAsEnum(), curr_select_kernel); if (i != 0) { 
ASSERT_TRUE(prev_num_eval_time < num_eval_time); } prev_num_eval_time = num_eval_time; num_evaluations *= 2; } } ================================================ FILE: tests/controllers/rmppi_test.cu ================================================ #include #include #include #include #include #include #include #include #include // Used to generate random noise for control trajectories /****************************************************************************** * Test class for RobustControllerPrivateMethods ******************************************************************************/ const int NUM_TIMESTEPS = 100; using DYN = DoubleIntegratorDynamics; using COST = DoubleIntegratorCircleCost; using SAMPLING = mppi::sampling_distributions::GaussianDistribution; using SAMPLER_PARAMS = SAMPLING::SAMPLING_PARAMS_T; using FB_CONTROLLER = DDPFeedback; using CONTROLLER_T = RobustMPPIController; class TestRobust : public CONTROLLER_T { public: TestRobust(DYN* model, COST* cost, DDPFeedback* fb_controller, SAMPLING* sampler, float dt, int max_iter, float lambda, float alpha, float value_function_threshold, int num_timesteps, const Eigen::Ref& init_control_traj, cudaStream_t stream) : RobustMPPIController(model, cost, fb_controller, sampler, dt, max_iter, lambda, alpha, value_function_threshold, num_timesteps, init_control_traj, 9, 1, stream) { } TestRobust(DYN* model, COST* cost, DDPFeedback* fb_controller, SAMPLING* sampler, RobustMPPIController::TEMPLATED_PARAMS& params, cudaStream_t stream) : RobustMPPIController(model, cost, fb_controller, sampler, params, stream) { } // Test to make sure that its nonzero // Test to make sure that cuda memory is allocated NominalCandidateVector getCandidates(const Eigen::Ref& nominal_x_k, const Eigen::Ref& nominal_x_kp1, const Eigen::Ref& real_x_kp1) { getInitNominalStateCandidates(nominal_x_k, nominal_x_kp1, real_x_kp1); return candidate_nominal_states_; }; Eigen::MatrixXf getWeights() { return line_search_weights_; }; void 
updateCandidates(int value) { updateNumCandidates(value); } bool getCudaMemStatus() { return importance_sampling_cuda_mem_init_; } void deallocateNSCMemory() { deallocateNominalStateCandidateMemory(); } void resetNSCMemory() { resetCandidateCudaMem(); } Eigen::MatrixXi getStrideIS(int stride) { computeImportanceSamplerStride(stride); return importance_sampler_strides_; } float getComputeCandidateBaseline(const Eigen::Ref& traj_costs_in) { candidate_trajectory_costs_ = traj_costs_in; return computeCandidateBaseline(); } int getComputeBestIndex(const Eigen::Ref& traj_costs_in) { candidate_trajectory_costs_ = traj_costs_in; computeBestIndex(); return best_index_; } state_array getNominalStateFromOptimization(const Eigen::Ref& nominal_x_k, const Eigen::Ref& nominal_x_kp1, const Eigen::Ref& real_x_kp1, bool nominal_state_init) { nominal_state_trajectory_.col(0) = nominal_x_k; nominal_state_trajectory_.col(1) = nominal_x_kp1; nominal_state_init_ = nominal_state_init; computeNominalStateAndStride(real_x_kp1, 1); // Default the stride to 1 return nominal_state_; } float* getFeedbackGainVector() { return this->fb_controller_->getFeedbackState().fb_gain_traj_; } TEMPLATED_FEEDBACK::feedback_gain_trajectory getFeedbackGainsEigen() { return this->fb_controller_->getFeedbackGainsEigen(); } }; // Text fixture for nominal state selection class RMPPINominalStateCandidates : public ::testing::Test { public: protected: void SetUp() override { model = new DYN(10); // Initialize the double integrator dynamics cost = new COST(); // Initialize the cost function fb_controller = new FB_CONTROLLER(model, dt); SAMPLER_PARAMS sampler_params; sampler_params.std_dev[0] = 0.001; sampler_params.std_dev[1] = 0.001; sampler = new SAMPLING(sampler_params); controller_params.dt_ = dt; controller_params.lambda_ = lambda; controller_params.alpha_ = alpha; // controller_params.control_std_dev_ << 0.0001, 0.0001; controller_params.num_iters_ = 3; controller_params.value_function_threshold_ = 1000.0; 
controller_params.num_timesteps_ = 100; controller_params.init_control_traj_.setZero(); controller_params.dynamics_rollout_dim_ = dim3(64, 4, 2); controller_params.cost_rollout_dim_ = dim3(64, 4, 2); controller_params.eval_dyn_kernel_dim_ = dim3(64, 4, 1); controller_params.eval_cost_kernel_dim_ = dim3(controller_params.num_timesteps_, 1, 1); // Q, Qf, R auto fb_params = fb_controller->getParams(); fb_params.Q.setIdentity(); fb_params.Q_f.setIdentity(); fb_params.R.setIdentity(); fb_controller->setParams(fb_params); test_controller = new TestRobust(model, cost, fb_controller, sampler, controller_params, 0); } void TearDown() override { delete test_controller; delete model; delete cost; delete fb_controller; delete sampler; } DYN* model; COST* cost; TestRobust* test_controller; FB_CONTROLLER* fb_controller; SAMPLING* sampler; TestRobust::TEMPLATED_PARAMS controller_params; float dt = 0.01; float lambda = 0.5; float alpha = 0.01; }; TEST_F(RMPPINominalStateCandidates, UpdateNumCandidates_LessThan3) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; ASSERT_DEATH(test_controller->updateCandidates(1), "ERROR: number of candidates must be greater or equal to 3\n"); } TEST_F(RMPPINominalStateCandidates, UpdateNumCandidates_Negative) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; ASSERT_DEATH(test_controller->updateCandidates(-1), "ERROR: number of candidates must be greater or equal to 3\n"); } TEST_F(RMPPINominalStateCandidates, UpdateNumCandidates_Even) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; ASSERT_DEATH(test_controller->updateCandidates(4), "ERROR: number of candidates must be odd\n"); } TEST_F(RMPPINominalStateCandidates, UpdateNumCandidates_TooManyCandidates) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; ASSERT_DEATH(test_controller->updateCandidates(99), "cannot exceed"); } TEST_F(RMPPINominalStateCandidates, CandidateVectorSizeNonzero) { DYN::state_array x_star_k, x_star_kp1, x_kp1; x_star_k << -4, 0, 0, 0; 
x_star_kp1 << 4, 0, 0, 0; x_kp1 << 4, 4, 0, 0; auto candidates = test_controller->getCandidates(x_star_k, x_star_kp1, x_kp1); ASSERT_TRUE(candidates.size() > 0); } TEST_F(RMPPINominalStateCandidates, CudaMemoryInitialized) { ASSERT_TRUE(test_controller->getCudaMemStatus()); } TEST_F(RMPPINominalStateCandidates, CudaMemoryRemoved) { test_controller->deallocateNSCMemory(); ASSERT_FALSE(test_controller->getCudaMemStatus()); } TEST_F(RMPPINominalStateCandidates, CudaMemoryReset) { test_controller->deallocateNSCMemory(); // Remove memory ASSERT_FALSE(test_controller->getCudaMemStatus()); test_controller->resetNSCMemory(); // Should allocate ASSERT_TRUE(test_controller->getCudaMemStatus()); test_controller->resetNSCMemory(); // Should remove then allocate again ASSERT_TRUE(test_controller->getCudaMemStatus()); } TEST_F(RMPPINominalStateCandidates, LineSearchWeights_9) { test_controller->updateCandidates(9); auto controller_weights = test_controller->getWeights(); Eigen::MatrixXf known_weights(3, 9); known_weights << 1, 3.0 / 4, 1.0 / 2, 1.0 / 4, 0, 0, 0, 0, 0, 0, 1.0 / 4, 1.0 / 2, 3.0 / 4, 1, 3.0 / 4, 1.0 / 2, 1.0 / 4, 0, 0, 0, 0, 0, 0, 1.0 / 4, 1.0 / 2, 3.0 / 4, 1; // std::cout << controller_weights << std::endl; // std::cout << known_weights << std::endl; ASSERT_TRUE(controller_weights == known_weights) << "Known Weights: \n" << known_weights << "\nComputed Weights: \n" << controller_weights; } TEST_F(RMPPINominalStateCandidates, ImportanceSampler_Stride_2) { int stride = 2; test_controller->updateCandidates(9); Eigen::MatrixXi known_stride(1, 9); known_stride << 0, 1, 1, 2, 2, 2, 2, 2, 2; auto compute_stride = test_controller->getStrideIS(stride); ASSERT_TRUE(known_stride == compute_stride) << "Known Stride: \n" << known_stride << "\nComputed Stride: \n" << compute_stride; } TEST_F(RMPPINominalStateCandidates, ImportanceSampler_Stride_4) { int stride = 4; test_controller->updateCandidates(9); Eigen::MatrixXi known_stride(1, 9); known_stride << 0, 1, 2, 3, 4, 4, 4, 4, 
4; auto compute_stride = test_controller->getStrideIS(stride); ASSERT_TRUE(known_stride == compute_stride) << "Known Stride: \n" << known_stride << "\nComputed Stride: \n" << compute_stride; } TEST_F(RMPPINominalStateCandidates, InitEvalSelection_Weights) { /* * This test will ensure that the line search process to select the * number of points for free energy evaluation is correct. */ DYN::state_array x_star_k, x_star_kp1, x_kp1; x_star_k << -4, 0, 0, 0; x_star_kp1 << 0, 4, 0, 0; x_kp1 << 4, 4, 0, 0; const int num_candidates = 9; test_controller->updateCandidates(num_candidates); Eigen::Matrix known_candidates; known_candidates << -4, -3, -2, -1, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0; // Create the controller so we can test functions from it -> this should be a fixture auto candidates = test_controller->getCandidates(x_star_k, x_star_kp1, x_kp1); for (int i = 0; i < num_candidates; ++i) { ASSERT_TRUE(candidates[i] == known_candidates.col(i)) << "Index: " << i << "\nKnown Point: \n" << known_candidates.col(i) << "\nComputed Point: \n" << candidates[i]; } } class RMPPINominalStateSelection : public ::testing::Test { public: int num_samples = 64; int num_candidates = 9; Eigen::MatrixXf trajectory_costs; protected: void SetUp() override { model = new DYN(10); // Initialize the double integrator dynamics cost = new COST; // Initialize the cost function fb_controller = new FB_CONTROLLER(model, dt); SAMPLER_PARAMS sampler_params; sampler_params.std_dev[0] = 0.001; sampler_params.std_dev[1] = 0.001; sampler = new SAMPLING(sampler_params); init_control_traj.setZero(); // Q, Qf, R auto fb_params = fb_controller->getParams(); fb_params.Q.setIdentity(); fb_params.Q_f.setIdentity(); fb_params.R.setIdentity(); fb_controller->setParams(fb_params); test_controller = new TestRobust(model, cost, fb_controller, sampler, dt, 3, lambda, alpha, 1000.0, 100, init_control_traj, 0); auto controller_params = 
test_controller->getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 4, 2); controller_params.cost_rollout_dim_ = dim3(64, 1, 2); controller_params.eval_dyn_kernel_dim_ = dim3(num_samples, 4, 1); controller_params.eval_cost_kernel_dim_ = dim3(controller_params.num_timesteps_, 1, 1); test_controller->setParams(controller_params); test_controller->chooseAppropriateKernel(); // Set the size of the trajectory costs function trajectory_costs.resize(num_samples * num_candidates, 1); // Fill the trajectory costs with random costs trajectory_costs = Eigen::MatrixXf::Random(num_samples * num_candidates, 1); } void TearDown() override { delete test_controller; delete cost; delete model; delete fb_controller; delete sampler; } DYN* model; COST* cost; TestRobust* test_controller; FB_CONTROLLER* fb_controller; SAMPLING* sampler; TestRobust::control_trajectory init_control_traj; float dt = 0.01; float lambda = 0.5; float alpha = 0.01; }; TEST_F(RMPPINominalStateSelection, GetCandidateBaseline) { // Compute baseline float baseline = trajectory_costs(0); for (int i = 0; i < trajectory_costs.size(); i++) { if (trajectory_costs(i) < baseline) { baseline = trajectory_costs(i); } } // float compute_baseline = test_controller->getComputeCandidateBaseline(trajectory_costs); ASSERT_FLOAT_EQ(baseline, compute_baseline); } TEST_F(RMPPINominalStateSelection, ComputeBestCandidate) { float baseline = trajectory_costs(0); for (int i = 0; i < num_samples; i++) { if (trajectory_costs(i) < baseline) { baseline = trajectory_costs(i); } } float value_func_threshold_ = 1000.0; Eigen::MatrixXf candidate_free_energy; candidate_free_energy.resize(num_candidates, 1); candidate_free_energy.setZero(); // Should probably be in a cuda kernel? Will have to profile. 
for (int i = 0; i < num_candidates; i++) { for (int j = 0; j < num_samples; j++) { candidate_free_energy(i) += expf(-1.0 / lambda * (trajectory_costs(i * num_samples + j) - baseline)); } } for (int i = 0; i < num_candidates; i++) { candidate_free_energy(i) /= (1.0 * num_samples); } for (int i = 0; i < num_candidates; i++) { candidate_free_energy(i) = -lambda * logf(candidate_free_energy(i)) + baseline; } // Now get the closest initial condition that is above the threshold. int bestIdx = 0; for (int i = 1; i < num_candidates; i++) { if (candidate_free_energy(i) < value_func_threshold_) { bestIdx = i; } } int best_index_compute = test_controller->getComputeBestIndex(trajectory_costs); ASSERT_EQ(bestIdx, best_index_compute); } TEST_F(RMPPINominalStateSelection, ComputeNominalStateAndStride_InitFalse) { // Set the number of candidates to 3 const int num_candidates = 3; test_controller->updateCandidates(num_candidates); // We know that the cost penalizes any trajectory that exists our donut which // is centered around the origin with radius 2. Set the 3 relevant points of // the system such that x_k_star (the previous nominal state) is the best // free energy candidate. 
DYN::state_array nominal_x_k, nominal_x_kp1, real_x_kp1; nominal_x_k << 2, 0, 0, 0; nominal_x_kp1 << 5, 0, 0, 0; real_x_kp1 << 5, 0, 0, 0; bool nominal_state_init = false; auto nominal_state = test_controller->getNominalStateFromOptimization(nominal_x_k, nominal_x_kp1, real_x_kp1, nominal_state_init); // Since nominal state init is false, this should be the real state ASSERT_EQ(real_x_kp1, nominal_state) << "\nExpected state << \n" << real_x_kp1 << "\nComputed state: \n" << nominal_state; } TEST_F(RMPPINominalStateSelection, ComputeNominalStateAndStride_PreviousNominal) { // Set the number of candidates to 3 const int num_candidates = 3; test_controller->updateCandidates(num_candidates); // We know that the cost penalizes any trajectory that exists our donut which // is centered around the origin with radius 2. Set the 3 relevant points of // the system such that x_k_star (the previous nominal state) is the best // free energy candidate. DYN::state_array nominal_x_k, nominal_x_kp1, real_x_kp1; nominal_x_k << 2, 0, 0, 0; nominal_x_kp1 << 15, 0, 0, 0; real_x_kp1 << 15, 0, 0, 0; bool nominal_state_init = true; auto nominal_state = test_controller->getNominalStateFromOptimization(nominal_x_k, nominal_x_kp1, real_x_kp1, nominal_state_init); // Since nominal state init is false, this should be the real state ASSERT_EQ(nominal_x_k, nominal_state) << "\nExpected state << \n" << nominal_x_k << "\nComputed state: \n" << nominal_state; } TEST_F(RMPPINominalStateSelection, ComputeNominalStateAndStride_CurrentNominal) { // Set the number of candidates to 3 const int num_candidates = 3; test_controller->updateCandidates(num_candidates); // We know that the cost penalizes any trajectory that exists our donut which // is centered around the origin with radius 2. Set the 3 relevant points of // the system such that x_k_star (the nominal state) is the best // free energy candidate. 
DYN::state_array nominal_x_k, nominal_x_kp1, real_x_kp1; nominal_x_k << -100, 0, 0, 0; nominal_x_kp1 << 2, 0, 0, 0; real_x_kp1 << 100, 0, 0, 0; bool nominal_state_init = true; auto nominal_state = test_controller->getNominalStateFromOptimization(nominal_x_k, nominal_x_kp1, real_x_kp1, nominal_state_init); // Since nominal state init is false, this should be the real state ASSERT_EQ(nominal_x_kp1, nominal_state) << "\nExpected state << \n" << nominal_x_kp1 << "\nComputed state: \n" << nominal_state; } TEST_F(RMPPINominalStateSelection, ComputeNominalStateAndStride_CurrentReal) { // Set the number of candidates to 3 const int num_candidates = 3; test_controller->updateCandidates(num_candidates); // We know that the cost penalizes any trajectory that exists our donut which // is centered around the origin with radius 2. Set the 3 relevant points of // the system such that x_k_star (the previous nominal state) is the best // free energy candidate. DYN::state_array nominal_x_k, nominal_x_kp1, real_x_kp1; nominal_x_k << -100, 0, 0, 0; nominal_x_kp1 << -100, 0, 0, 0; real_x_kp1 << 2, 0, 0, 0; bool nominal_state_init = true; auto nominal_state = test_controller->getNominalStateFromOptimization(nominal_x_k, nominal_x_kp1, real_x_kp1, nominal_state_init); // Since nominal state init is false, this should be the real state ASSERT_EQ(real_x_kp1, nominal_state) << "\nExpected state << \n" << real_x_kp1 << "\nComputed state: \n" << nominal_state; } TEST_F(RMPPINominalStateSelection, DDPFeedbackGainInternalStorage) { DYN::state_array x; x << 2, 0, 0, 0; int stride = 1; test_controller->updateImportanceSamplingControl(x, stride); test_controller->updateImportanceSamplingControl(x, stride); auto fb_parm = test_controller->getFeedbackState(); auto feedback_gain_eigen_aligned = test_controller->getFeedbackGainsEigen(); for (size_t i = 0; i < feedback_gain_eigen_aligned.size(); i++) { int i_index = i * DYN::STATE_DIM * DYN::CONTROL_DIM; for (size_t j = 0; j < DYN::CONTROL_DIM * 
DYN::STATE_DIM; j++) { ASSERT_FLOAT_EQ(fb_parm.fb_gain_traj_[i_index + j], feedback_gain_eigen_aligned[i].data()[j]) << " at i = " << i; } } } bool tubeFailure(float* s) { float inner_path_radius2 = 1.675 * 1.675; float outer_path_radius2 = 2.325 * 2.325; float radial_position = s[0] * s[0] + s[1] * s[1]; if ((radial_position < inner_path_radius2) || (radial_position > outer_path_radius2)) { return true; } else { return false; } } TEST(RMPPITest, RobustMPPILargeVariance) { using DYNAMICS = DoubleIntegratorDynamics; using COST_T = DoubleIntegratorCircleCost; const int num_timesteps = 50; // Optimization time horizon using FEEDBACK_T = DDPFeedback; // Noise enters the system during the "true" state propagation. In this case the noise is nominal DYNAMICS model(100); // Initialize the double integrator dynamics COST_T cost; // Initialize the cost function float dt = 0.02; // Timestep of dynamics propagation int max_iter = 1; // Maximum running iterations of optimization float lambda = 4; // Learning rate parameter float alpha = 0.00; const int total_time_horizon = 5000; FEEDBACK_T fb_controller(&model, dt); SAMPLING::SAMPLING_PARAMS_T sampler_params; // control variance sampler_params.std_dev[0] = 1; sampler_params.std_dev[1] = 1; SAMPLING sampler = SAMPLING(sampler_params); std::vector actual_trajectory_save(num_timesteps * total_time_horizon * DYNAMICS::STATE_DIM); std::vector nominal_trajectory_save(num_timesteps * total_time_horizon * DYNAMICS::STATE_DIM); // std::vector ancillary_trajectory_save(num_timesteps*total_time_horizon*DYNAMICS::STATE_DIM); // Set the initial state DYNAMICS::state_array x; x << 2, 0, 0, 1; DYNAMICS::state_array xdot; // DDP cost parameters auto fb_params = fb_controller.getParams(); /** * Q = * [500, 0, 0, 0 * 0, 500, 0, 0 * 0, 0, 100, 0 * 0, 0, 0, 100] */ fb_params.Q.diagonal() << 500, 500, 100, 100; /** * Qf = I */ fb_params.Q_f = DYNAMICS::dfdx::Identity(); /** * R = I */ fb_params.R = FEEDBACK_T::square_control_matrix::Identity(); 
fb_controller.setParams(fb_params); // Value function threshold float value_function_threshold = 10.0; // DoubleIntegratorRobustCost cost2; // auto controller2 = RobustMPPIController(&model, &cost2, dt, max_iter, gamma, value_function_threshold, Q, Qf, R, control_var); // Initialize the R MPPI controller auto controller = RobustMPPIController( &model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha, value_function_threshold); auto controller_params = controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 4, 2); controller_params.cost_rollout_dim_ = dim3(num_timesteps, 1, 2); controller_params.eval_dyn_kernel_dim_ = dim3(64, 4, 1); controller_params.eval_cost_kernel_dim_ = dim3(num_timesteps, 1, 1); controller.setParams(controller_params); int fail_count = 0; // Start the while loop for (int t = 0; t < total_time_horizon; ++t) { // Print the system state if (t % 100 == 0) { printf("Current Time: %f ", t * dt); model.printState(x.data()); std::cout << " Candidate Free Energies: " << controller.getCandidateFreeEnergy().transpose() << std::endl; } if (cost.computeStateCost(x) > 1000) { fail_count++; } if (tubeFailure(x.data())) { cnpy::npy_save("robust_sc_large_actual.npy", actual_trajectory_save.data(), { total_time_horizon, num_timesteps, DYNAMICS::STATE_DIM }, "w"); // cnpy::npy_save("robust_sc_ancillary.npy", ancillary_trajectory_save.data(), // {total_time_horizon, num_timesteps, DYNAMICS::STATE_DIM},"w"); cnpy::npy_save("robust_sc_large_nominal.npy", nominal_trajectory_save.data(), { total_time_horizon, num_timesteps, DYNAMICS::STATE_DIM }, "w"); printf("Current Time: %f ", t * dt); model.printState(x.data()); std::cout << " Candidate Free Energies: " << controller.getCandidateFreeEnergy().transpose() << std::endl; std::cout << "Tube failure!!" 
<< std::endl; FAIL() << "Visualize the trajectories by running scripts/double_integrator/plot_DI_test_trajectories; " "the argument to this python file is the build directory of MPPI-Generic"; } // Update the importance sampler controller.updateImportanceSamplingControl(x, 1); // Compute the control controller.computeControl(x); // Save the trajectory from the nominal state auto nominal_trajectory = controller.getTargetStateSeq(); // Save the ancillary trajectory // auto ancillary_trajectory = controller.getAncillaryStateSeq(); for (int i = 0; i < num_timesteps; i++) { for (int j = 0; j < DYNAMICS::STATE_DIM; j++) { actual_trajectory_save[t * num_timesteps * DYNAMICS::STATE_DIM + i * DYNAMICS::STATE_DIM + j] = x(j); // ancillary_trajectory_save[t * num_timesteps * DYNAMICS::STATE_DIM + // i*DYNAMICS::STATE_DIM + j] = ancillary_trajectory(j, i); nominal_trajectory_save[t * num_timesteps * DYNAMICS::STATE_DIM + i * DYNAMICS::STATE_DIM + j] = nominal_trajectory(j, i); } } // Get the open loop control DYNAMICS::control_array current_control = controller.getControlSeq().col(0); // std::cout << "Current OL control: " << current_control.transpose() << std::endl; // Apply the feedback given the current state current_control += controller.getFeedbackControl(x, controller.getTargetStateSeq().col(0), 0); // Propagate the state forward model.computeDynamics(x, current_control, xdot); model.updateState(x, xdot, dt); // Add the "true" noise of the system model.computeStateDisturbance(dt, x); // Slide the control sequence controller.slideControlSequence(1); } cnpy::npy_save("robust_sc_large_actual.npy", actual_trajectory_save.data(), { total_time_horizon, num_timesteps, DYNAMICS::STATE_DIM }, "w"); // cnpy::npy_save("robust_sc_ancillary.npy",ancillary_trajectory_save.data(), // {total_time_horizon, num_timesteps, DYNAMICS::STATE_DIM},"w"); cnpy::npy_save("robust_sc_large_nominal.npy", nominal_trajectory_save.data(), { total_time_horizon, num_timesteps, DYNAMICS::STATE_DIM }, 
"w"); } TEST(RMPPITest, RobustMPPILargeVarianceRobustCost) { using DYNAMICS = DoubleIntegratorDynamics; using COST_T = DoubleIntegratorRobustCost; const int num_timesteps = 50; // Optimization time horizon using FEEDBACK_T = DDPFeedback; float dt = 0.02; // Timestep of dynamics propagation // Noise enters the system during the "true" state propagation. In this case the noise is nominal DYNAMICS model(100); // Initialize the double integrator dynamics COST_T cost; // Initialize the cost function FEEDBACK_T fb_controller(&model, dt); auto params = cost.getParams(); params.velocity_desired = 2; params.crash_cost = 100; cost.setParams(params); SAMPLING::SAMPLING_PARAMS_T sampler_params; // control variance sampler_params.std_dev[0] = 1; sampler_params.std_dev[1] = 1; SAMPLING sampler = SAMPLING(sampler_params); int max_iter = 3; // Maximum running iterations of optimization float lambda = 2.0; // Learning rate parameter float alpha = 0.0; int crash_status[1] = { 0 }; const int total_time_horizon = 5000; std::vector actual_trajectory_save(num_timesteps * total_time_horizon * DoubleIntegratorDynamics::STATE_DIM); std::vector nominal_trajectory_save(num_timesteps * total_time_horizon * DoubleIntegratorDynamics::STATE_DIM); // std::vector ancillary_trajectory_save(num_timesteps*total_time_horizon*DoubleIntegratorDynamics::STATE_DIM); std::vector feedback_trajectory_save(num_timesteps * total_time_horizon * DoubleIntegratorDynamics::STATE_DIM); // Set the initial state DYNAMICS::state_array x; x << 2, 0, 0, 1; DYNAMICS::state_array xdot; // DDP cost parameters auto fb_params = fb_controller.getParams(); /** * Q = * [500, 0, 0, 0 * 0, 500, 0, 0 * 0, 0, 100, 0 * 0, 0, 0, 100] */ fb_params.Q.diagonal() << 500, 500, 100, 100; /** * Qf = I */ fb_params.Q_f = DYNAMICS::dfdx::Identity(); /** * R = I */ fb_params.R = FEEDBACK_T::square_control_matrix::Identity(); // fb_params.num_iterations = 4; fb_controller.setParams(fb_params); // Value function threshold float 
value_function_threshold = 10.0; // DoubleIntegratorRobustCost cost2; // auto controller2 = RobustMPPIController(&model, &cost2, dt, max_iter, gamma, value_function_threshold, Q, Qf, R, control_var); using CONTROLLER_PARAMS = typename RobustMPPIController::TEMPLATED_PARAMS; CONTROLLER_PARAMS controller_params; controller_params.dt_ = dt; controller_params.num_iters_ = max_iter; controller_params.lambda_ = lambda; controller_params.alpha_ = alpha; controller_params.value_function_threshold_ = value_function_threshold; controller_params.dynamics_rollout_dim_ = dim3(64, 4, 2); controller_params.cost_rollout_dim_ = dim3(num_timesteps, 1, 2); controller_params.eval_dyn_kernel_dim_ = dim3(64, 4, 1); controller_params.eval_cost_kernel_dim_ = dim3(num_timesteps, 1, 1); // Initialize the R MPPI controller auto controller = RobustMPPIController( &model, &cost, &fb_controller, &sampler, controller_params); controller.setKernelChoice(kernelType::USE_SPLIT_KERNELS); int fail_count = 0; // Start the while loop for (int t = 0; t < total_time_horizon; ++t) { if (cost.computeStateCost(x, t, crash_status) > 1000) { fail_count++; crash_status[0] = 0; } if (tubeFailure(x.data())) { cnpy::npy_save("robust_rc_large_actual.npy", actual_trajectory_save.data(), { total_time_horizon, num_timesteps, DYNAMICS::STATE_DIM }, "w"); // cnpy::npy_save("robust_rc_ancillary.npy", ancillary_trajectory_save.data(), // {total_time_horizon, num_timesteps, DYNAMICS::STATE_DIM},"w"); cnpy::npy_save("robust_rc_large_nominal.npy", nominal_trajectory_save.data(), { total_time_horizon, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("robust_rc_large_feedback.npy", feedback_trajectory_save.data(), { total_time_horizon, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); printf("Current Time: %f ", t * dt); model.printState(x.data()); std::cout << " Candidate Free Energies: " << controller.getCandidateFreeEnergy().transpose() << std::endl; std::cout << "Tube failure!!" 
<< std::endl; FAIL() << "Visualize the trajectories by running scripts/double_integrator/plot_DI_test_trajectories; " "the argument to this python file is the build directory of MPPI-Generic"; } // Update the importance sampler controller.updateImportanceSamplingControl(x, 1); // Compute the control controller.computeControl(x, 1); controller.computeFeedbackPropagatedStateSeq(); // Print the system state if (t % 100 == 0) { printf("Current Time: %f ", t * dt); model.printState(x.data()); auto free_energy_stats = controller.getFreeEnergyStatistics(); std::cout << " Candidate Free Energies: " << controller.getCandidateFreeEnergy().transpose() << std::endl; std::cout << "Real FE [mean, variance]: [" << free_energy_stats.real_sys.freeEnergyMean << ", " << free_energy_stats.real_sys.freeEnergyVariance << "]" << std::endl; std::cout << "Nominal FE [mean, variance]: [" << free_energy_stats.nominal_sys.freeEnergyMean << ", " << free_energy_stats.nominal_sys.freeEnergyVariance << "]" << std::endl; std::cout << "Algorithm Health Normalizer: [" << controller.getNormalizerPercent() << "]" << std::endl; std::cout << "DF(x, x0, u): [" << controller.computeDF() << "]\n" << std::endl; } // Save the trajectory from the nominal state auto nominal_trajectory = controller.getTargetStateSeq(); // Save the ancillary trajectory // auto ancillary_trajectory = controller.getAncillaryStateSeq(); // Compute the propagated state trajectory auto propagated_trajectory = controller.getFeedbackPropagatedStateSeq(); // Compute the actual trajectory with no feedback auto actual_trajectory = controller.getTargetStateSeq(); controller.computeStateTrajectoryHelper(actual_trajectory, x, controller.getControlSeq()); for (int i = 0; i < num_timesteps; i++) { for (int j = 0; j < DoubleIntegratorDynamics::STATE_DIM; j++) { actual_trajectory_save[t * num_timesteps * DoubleIntegratorDynamics::STATE_DIM + i * DoubleIntegratorDynamics::STATE_DIM + j] = actual_trajectory(j, i); // ancillary_trajectory_save[t * 
num_timesteps * DoubleIntegratorDynamics::STATE_DIM + // i*DoubleIntegratorDynamics::STATE_DIM + j] = ancillary_trajectory(j, i); nominal_trajectory_save[t * num_timesteps * DoubleIntegratorDynamics::STATE_DIM + i * DoubleIntegratorDynamics::STATE_DIM + j] = nominal_trajectory(j, i); feedback_trajectory_save[t * num_timesteps * DoubleIntegratorDynamics::STATE_DIM + i * DoubleIntegratorDynamics::STATE_DIM + j] = propagated_trajectory(j, i); } } // Get the open loop control DYNAMICS::control_array current_control = controller.getControlSeq().col(0); // std::cout << "Current OL control: " << current_control.transpose() << std::endl; // Apply the feedback given the current state DYNAMICS::control_array fb_control = controller.getFeedbackControl(x, controller.getTargetStateSeq().col(0), 0); current_control += fb_control; // Compute real free energy bound // Nominal free energy bound is always alpha // Bound of real free energy growth // Propagate the state forward model.computeDynamics(x, current_control, xdot); model.updateState(x, xdot, dt); // Add the "true" noise of the system model.computeStateDisturbance(dt, x); // Slide the control sequence controller.slideControlSequence(1); } cnpy::npy_save("robust_rc_large_actual.npy", actual_trajectory_save.data(), { total_time_horizon, num_timesteps, DYNAMICS::STATE_DIM }, "w"); // cnpy::npy_save("robust_rc_ancillary.npy",ancillary_trajectory_save.data(), // {total_time_horizon, num_timesteps, DYNAMICS::STATE_DIM},"w"); cnpy::npy_save("robust_rc_large_nominal.npy", nominal_trajectory_save.data(), { total_time_horizon, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save("robust_rc_large_feedback.npy", feedback_trajectory_save.data(), { total_time_horizon, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); } ================================================ FILE: tests/controllers/tube_mppi_test.cu ================================================ #include #include #include #include #include 
#include #include #include bool tubeFailure(float* s) { float inner_path_radius2 = 1.675 * 1.675; float outer_path_radius2 = 2.325 * 2.325; float radial_position = s[0] * s[0] + s[1] * s[1]; if ((radial_position < inner_path_radius2) || (radial_position > outer_path_radius2)) { return true; } else { return false; } } class DoubleIntegratorTubeMPPI : public ::testing::Test { public: static const int num_timesteps = 100; static const int num_rollouts = 512; using DYN = DoubleIntegratorDynamics; using COST = DoubleIntegratorCircleCost; using FB_CONTROLLER = DDPFeedback; using SAMPLING = mppi::sampling_distributions::GaussianDistribution; using VANILLA_CONTROLLER = VanillaMPPIController; using TUBE_CONTROLLER = TubeMPPIController; void SetUp() override { } }; TEST_F(DoubleIntegratorTubeMPPI, Construction) { // Define the model and cost DYN model; COST cost; float dt = 0.01; auto fb_controller = FB_CONTROLLER(&model, dt); auto fb_params = fb_controller.getParams(); int max_iter = 10; float lambda = 0.5; float alpha = 0.0; // DDP cost parameters Eigen::MatrixXf Q; Eigen::MatrixXf R; fb_params.Q = 100 * FB_CONTROLLER::square_state_matrix::Identity(); fb_params.Q_f = fb_params.Q; fb_params.R = FB_CONTROLLER::square_control_matrix::Identity(); fb_controller.setParams(fb_params); // Sampling Distribution setup SAMPLING sampler = SAMPLING(); auto vanilla_controller = VANILLA_CONTROLLER(&model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); auto controller = TUBE_CONTROLLER(&model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); // This controller needs the ancillary controller running separately for base plant reasons. 
} TEST_F(DoubleIntegratorTubeMPPI, ConstructionUsingParams) { // Define the model and cost DYN model; COST cost; TUBE_CONTROLLER::TEMPLATED_PARAMS controller_params; controller_params.dt_ = 0.01; controller_params.num_iters_ = 10; controller_params.lambda_ = 0.5; controller_params.alpha_ = 0.0; // control std dev // controller_params.control_std_dev_ << 1, 1; auto fb_controller = FB_CONTROLLER(&model, controller_params.dt_); auto fb_params = fb_controller.getParams(); // DDP cost parameters Eigen::MatrixXf Q; Eigen::MatrixXf R; fb_params.Q = 100 * FB_CONTROLLER::square_state_matrix::Identity(); fb_params.Q_f = fb_params.Q; fb_params.R = FB_CONTROLLER::square_control_matrix::Identity(); fb_controller.setParams(fb_params); // Sampling Distribution setup SAMPLING::SAMPLING_PARAMS_T sampler_params; for (int i = 0; i < DYN::CONTROL_DIM; i++) { sampler_params.std_dev[i] = 1; } SAMPLING sampler = SAMPLING(sampler_params); auto vanilla_controller = VANILLA_CONTROLLER(&model, &cost, &fb_controller, &sampler, controller_params); auto controller = TUBE_CONTROLLER(&model, &cost, &fb_controller, &sampler, controller_params); // This controller needs the ancillary controller running separately for base plant reasons. 
} class DoubleIntegratorTracking : public ::testing::Test { public: static const int num_timesteps = 50; static const int num_rollouts = 1024; const unsigned int total_time_horizon = 500; using DYN = DoubleIntegratorDynamics; using COST = DoubleIntegratorCircleCost; using FB_CONTROLLER = DDPFeedback; using SAMPLING = mppi::sampling_distributions::GaussianDistribution; using VANILLA_CONTROLLER = VanillaMPPIController; using TUBE_CONTROLLER = TubeMPPIController; DYN model; COST cost; float dt = 0.02; // Timestep of dynamics propagation int max_iter = 3; // Maximum running iterations of optimization float lambda = 4; // Learning rate parameter float alpha = 0.0; FB_CONTROLLER fb_controller = FB_CONTROLLER(&model, dt); SAMPLING sampler; DYN::state_array x, xdot; void SetUp() override { auto params = cost.getParams(); params.velocity_desired = 2; cost.setParams(params); x << 2, 0, 0, 1; // control std dev SAMPLING::SAMPLING_PARAMS_T sampler_params; for (int i = 0; i < DYN::CONTROL_DIM; i++) { sampler_params.std_dev[i] = 1; sampler_params.control_cost_coeff[i] = 1.0; } sampler = SAMPLING(sampler_params); } }; TEST_F(DoubleIntegratorTracking, VanillaMPPINominalVariance) { std::vector nominal_trajectory_save(num_timesteps * total_time_horizon * DYN::STATE_DIM); // Initialize the vanilla MPPI controller auto vanilla_controller = VANILLA_CONTROLLER(&model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); auto controller_params = vanilla_controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 4, 1); controller_params.cost_rollout_dim_ = dim3(50, 1, 1); // controller_params.seed_ = 42; vanilla_controller.setParams(controller_params); int fail_count = 0; int crash_status[1] = { 0 }; // Start the while loop for (int t = 0; t < total_time_horizon; ++t) { if (cost.computeStateCost(x, t, crash_status) > 1000) { fail_count++; crash_status[0] = 0; } if (tubeFailure(x.data())) { FAIL(); } // Compute the control vanilla_controller.computeControl(x, 1); 
// Save the nominal trajectory auto nominal_trajectory = vanilla_controller.getTargetStateSeq(); for (int i = 0; i < num_timesteps; i++) { for (int j = 0; j < DYN::STATE_DIM; j++) { nominal_trajectory_save[t * num_timesteps * DYN::STATE_DIM + i * DYN::STATE_DIM + j] = nominal_trajectory(j, i); } } // Propagate the state forward model.computeDynamics(x, vanilla_controller.getControlSeq().col(0), xdot); model.updateState(x, xdot, dt); // Add the "true" noise of the system model.computeStateDisturbance(dt, x); // Slide the control sequence vanilla_controller.slideControlSequence(1); } // save it to file cnpy::npy_save("vanilla_nominal.npy", nominal_trajectory_save.data(), { total_time_horizon, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); // std::cout << "Number of times constraints were violated: " << fail_count << std::endl; } TEST_F(DoubleIntegratorTracking, VanillaMPPILargeVariance) { std::vector nominal_trajectory_save(num_timesteps * total_time_horizon * DYN::STATE_DIM); model.setStateVariance(100); // Initialize the vanilla MPPI controller auto vanilla_controller = VANILLA_CONTROLLER(&model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); auto controller_params = vanilla_controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 4, 1); controller_params.cost_rollout_dim_ = dim3(50, 1, 1); // controller_params.seed_ = 42; vanilla_controller.setParams(controller_params); // bool success = false; int fail_count = 0; // Start the while loop for (int t = 0; t < total_time_horizon; ++t) { if (tubeFailure(x.data())) { // success = true; fail_count++; } if (fail_count > 50) { break; } // Compute the control vanilla_controller.computeControl(x, 1); // Save the nominal trajectory auto nominal_trajectory = vanilla_controller.getTargetStateSeq(); for (int i = 0; i < num_timesteps; i++) { for (int j = 0; j < DYN::STATE_DIM; j++) { nominal_trajectory_save[t * num_timesteps * DYN::STATE_DIM + i * DYN::STATE_DIM + j] = 
nominal_trajectory(j, i); } } // Propagate the state forward model.computeDynamics(x, vanilla_controller.getControlSeq().col(0), xdot); model.updateState(x, xdot, dt); // Add the "true" noise of the system model.computeStateDisturbance(dt, x); // Slide the control sequence vanilla_controller.slideControlSequence(1); // if (success) { // break; // } } cnpy::npy_save("vanilla_large.npy", nominal_trajectory_save.data(), { total_time_horizon, num_timesteps, DYN::STATE_DIM }, "w"); // std::cout << "Number of times constraints were violated: " << fail_count << std::endl; } TEST_F(DoubleIntegratorTracking, VanillaMPPILargeVarianceTracking) { std::vector nominal_trajectory_save(num_timesteps * total_time_horizon * DYN::STATE_DIM); std::vector actual_feedback_trajectory_save(num_timesteps * total_time_horizon * DYN::STATE_DIM); model.setStateVariance(100); // Initialize the vanilla MPPI controller auto vanilla_controller = VANILLA_CONTROLLER(&model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); auto controller_params = vanilla_controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 4, 1); controller_params.cost_rollout_dim_ = dim3(50, 1, 1); // controller_params.seed_ = 42; vanilla_controller.setParams(controller_params); // DDP cost parameters auto fb_params = vanilla_controller.getFeedbackParams(); fb_params.Q.diagonal() << 500, 500, 100, 100; vanilla_controller.setFeedbackParams(fb_params); // bool success = false; int fail_count = 0; // Start the while loop for (int t = 0; t < total_time_horizon; ++t) { if (tubeFailure(x.data())) { // success = true; fail_count++; } if (fail_count > 50) { break; } // Compute the control vanilla_controller.computeControl(x, 1); // Compute the feedback gains vanilla_controller.computeFeedback(x); // Save the nominal trajectory auto nominal_trajectory = vanilla_controller.getTargetStateSeq(); auto nominal_control = vanilla_controller.getControlSeq(); 
vanilla_controller.computeFeedbackPropagatedStateSeq(); auto feedback_state_trajectory = vanilla_controller.getFeedbackPropagatedStateSeq(); for (int i = 0; i < num_timesteps; i++) { for (int j = 0; j < DYN::STATE_DIM; j++) { nominal_trajectory_save[t * num_timesteps * DYN::STATE_DIM + i * DYN::STATE_DIM + j] = nominal_trajectory(j, i); actual_feedback_trajectory_save[t * num_timesteps * DYN::STATE_DIM + i * DYN::STATE_DIM + j] = feedback_state_trajectory(j, i); } } // Get the open loop control DYN::control_array current_control = nominal_control.col(0); // Apply the feedback given the current state current_control += vanilla_controller.getFeedbackControl(x, nominal_trajectory.col(0), 0); // Propagate the state forward model.computeDynamics(x, current_control, xdot); model.updateState(x, xdot, dt); // Add the "true" noise of the system model.computeStateDisturbance(dt, x); // Slide the control sequence vanilla_controller.slideControlSequence(1); } cnpy::npy_save("vanilla_large_track_actual.npy", nominal_trajectory_save.data(), { total_time_horizon, num_timesteps, DYN::STATE_DIM }, "w"); cnpy::npy_save("vanilla_large_track_feedback.npy", nominal_trajectory_save.data(), { total_time_horizon, num_timesteps, DYN::STATE_DIM }, "w"); // std::cout << "Number of times constraints were violated: " << fail_count << std::endl; } TEST_F(DoubleIntegratorTracking, TubeMPPILargeVariance) { // Noise enters the system during the "true" state propagation. 
In this case the noise is nominal model.setStateVariance(100); auto fb_params = fb_controller.getParams(); /** * Q = * [500, 0, 0, 0 * 0, 500, 0, 0 * 0, 0, 100, 0 * 0, 0, 0, 100] */ fb_params.Q.diagonal() << 500, 500, 100, 100; /** * Qf = I */ fb_params.Q_f = FB_CONTROLLER::square_state_matrix::Identity(); /** * R = I */ fb_params.R = FB_CONTROLLER::square_control_matrix::Identity(); fb_controller.setParams(fb_params); // To pass it should be lambda = 4, vel desired = 2, vel cost = 1 crash cost 1000, nom threshold 20 std::vector actual_trajectory_save(num_timesteps * total_time_horizon * DYN::STATE_DIM); std::vector nominal_trajectory_save(num_timesteps * total_time_horizon * DYN::STATE_DIM); // std::vector ancillary_trajectory_save(num_timesteps*total_time_horizon*DYN::STATE_DIM); std::vector feedback_trajectory_save(num_timesteps * total_time_horizon * DYN::STATE_DIM); // Initialize the tube MPPI controller auto controller = TUBE_CONTROLLER(&model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); auto controller_params = controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 1, 1); controller_params.cost_rollout_dim_ = dim3(50, 1, 1); // controller_params.seed_ = 42; controller.setParams(controller_params); controller.setNominalThreshold(100); int fail_count = 0; int crash_status[1] = { 0 }; // Start the while loop for (int t = 0; t < total_time_horizon; ++t) { // Print the system state if (t % 100 == 0) { float current_cost = cost.computeStateCost(x, 1, crash_status); printf("Current Time: %f ", t * dt); printf("Current State Cost: %f ", current_cost); model.printState(x.data()); auto free_energy_stats = controller.getFreeEnergyStatistics(); std::cout << "Real FE [mean, variance]: [" << free_energy_stats.real_sys.freeEnergyMean << ", " << free_energy_stats.real_sys.freeEnergyVariance << "]" << std::endl; std::cout << "Nominal FE [mean, variance]: [" << free_energy_stats.nominal_sys.freeEnergyMean << ", " << 
free_energy_stats.nominal_sys.freeEnergyVariance << "]" << std::endl; std::cout << "Algorithm Health Normalizer: [" << controller.getNormalizerPercent() << "]\n" << std::endl; } if (cost.computeStateCost(x, t, crash_status) > 1000) { fail_count++; crash_status[0] = 0; } if (tubeFailure(x.data())) { float current_cost = cost.computeStateCost(x, 1, crash_status); printf("Current Time: %f ", t * dt); printf("Current State Cost: %f ", current_cost); model.printState(x.data()); auto free_energy_stats = controller.getFreeEnergyStatistics(); std::cout << "Real FE [mean, variance]: [" << free_energy_stats.real_sys.freeEnergyMean << ", " << free_energy_stats.real_sys.freeEnergyVariance << "]" << std::endl; std::cout << "Nominal FE [mean, variance]: [" << free_energy_stats.nominal_sys.freeEnergyMean << ", " << free_energy_stats.nominal_sys.freeEnergyVariance << "]" << std::endl; std::cout << "Algorithm Health Normalizer: [" << controller.getNormalizerPercent() << "]\n" << std::endl; cnpy::npy_save("tube_large_actual.npy", actual_trajectory_save.data(), { total_time_horizon, num_timesteps, DYN::STATE_DIM }, "w"); // cnpy::npy_save("tube_ancillary.npy", ancillary_trajectory_save.data(), // {total_time_horizon, num_timesteps, DYN::STATE_DIM},"w"); cnpy::npy_save("tube_large_nominal.npy", nominal_trajectory_save.data(), { total_time_horizon, num_timesteps, DYN::STATE_DIM }, "w"); cnpy::npy_save("tube_large_feedback.npy", feedback_trajectory_save.data(), { total_time_horizon, num_timesteps, DYN::STATE_DIM }, "w"); FAIL() << "Visualize the trajectories by running scripts/double_integrator/plot_DI_test_trajectories; " "the argument to this python file is the build directory of MPPI-Generic"; } // Compute the control controller.computeControl(x, 1); // Save the trajectory from the nominal state auto nominal_trajectory = controller.getTargetStateSeq(); auto actual_trajectory = controller.getActualStateSeq(); // Get the feedback gains associated with the nominal state and control 
trajectory controller.computeFeedback(x); // Save the ancillary trajectory // auto ancillary_trajectory = controller.getAncillaryStateSeq(); // Compute the propagated feedback trajectory controller.computeFeedbackPropagatedStateSeq(); auto propagated_feedback_trajectory = controller.getFeedbackPropagatedStateSeq(); for (int i = 0; i < num_timesteps; i++) { for (int j = 0; j < DYN::STATE_DIM; j++) { actual_trajectory_save[t * num_timesteps * DYN::STATE_DIM + i * DYN::STATE_DIM + j] = actual_trajectory(j, i); // ancillary_trajectory_save[t * num_timesteps * DYN::STATE_DIM + // i*DYN::STATE_DIM + j] = ancillary_trajectory(j, i); nominal_trajectory_save[t * num_timesteps * DYN::STATE_DIM + i * DYN::STATE_DIM + j] = nominal_trajectory(j, i); feedback_trajectory_save[t * num_timesteps * DYN::STATE_DIM + i * DYN::STATE_DIM + j] = propagated_feedback_trajectory(j, i); } } // Get the open loop control DYN::control_array current_control = controller.getControlSeq().col(0); // Apply the feedback given the current state current_control += controller.getFeedbackControl(x, controller.getTargetStateSeq().col(0), 0); // Propagate the state forward model.computeDynamics(x, current_control, xdot); model.updateState(x, xdot, dt); // Add the "true" noise of the system model.computeStateDisturbance(dt, x); // Slide the control sequence controller.slideControlSequence(1); } cnpy::npy_save("tube_large_actual.npy", actual_trajectory_save.data(), { total_time_horizon, num_timesteps, DYN::STATE_DIM }, "w"); // cnpy::npy_save("tube_ancillary.npy",ancillary_trajectory_save.data(), // {total_time_horizon, num_timesteps, DYN::STATE_DIM},"w"); cnpy::npy_save("tube_large_nominal.npy", nominal_trajectory_save.data(), { total_time_horizon, num_timesteps, DYN::STATE_DIM }, "w"); cnpy::npy_save("tube_large_feedback.npy", feedback_trajectory_save.data(), { total_time_horizon, num_timesteps, DYN::STATE_DIM }, "w"); } ================================================ FILE: 
tests/controllers/vanilla_mppi_test.cu ================================================
// NOTE(review): extraction artifact — in this chunk the original #include
// targets and most template argument lists (e.g. on VanillaMPPIController,
// DDPFeedback, GaussianDistribution, DYN_T::state_array) were stripped by the
// text extractor. Tokens are preserved exactly as found; restore against the
// repository before compiling.
#include
#include
#include

// GoogleTest fixture: VanillaMPPI on the cartpole system. Builds the dynamics
// model, quadratic cost, DDP feedback controller, Gaussian sampling
// distribution, and the controller itself, all bound to one CUDA stream.
class Cartpole_VanillaMPPI : public ::testing::Test
{
public:
  static const int NUM_TIMESTEPS = 100;
  static const int NUM_ROLLOUTS = 2048;
  using DYN_T = CartpoleDynamics;
  using COST_T = CartpoleQuadraticCost;
  using FB_T = DDPFeedback;
  using SAMPLING_T = mppi::sampling_distributions::GaussianDistribution;
  using CONTROLLER_T = VanillaMPPIController;
  using control_trajectory = CONTROLLER_T::control_trajectory;
  using control_array = CONTROLLER_T::control_array;

  DYN_T model = DYN_T(1.0, 1.0, 1.0);  // presumably cart mass, pole mass, pole length — TODO confirm
  COST_T cost;
  FB_T* fb_controller;
  SAMPLING_T* sampler;
  CONTROLLER_T* controller;
  float dt = 0.01;
  int max_iter = 10;
  float gamma = 0.5;
  float lambda = 0.25;
  float alpha = 0.01;
  control_trajectory init_control = control_trajectory::Constant(0);
  control_array control_std_dev = control_array::Constant(5.0);
  cudaStream_t stream;

  // Allocates the feedback controller, sampler, and controller on a freshly
  // created CUDA stream.
  void SetUp() override
  {
    fb_controller = new FB_T(&model, dt);
    HANDLE_ERROR(cudaStreamCreate(&stream));
    auto sampler_params = SAMPLING_T::SAMPLING_PARAMS_T();
    for (int i = 0; i < DYN_T::CONTROL_DIM; i++)
    {
      sampler_params.std_dev[i] = control_std_dev[i];
    }
    // Cartpole dynamics should request no extra grid- or block-level shared memory.
    EXPECT_EQ(model.getGrdSharedSizeBytes(), 0);
    EXPECT_EQ(model.getBlkSharedSizeBytes(), 0);
    sampler = new SAMPLING_T(sampler_params);
    controller = new CONTROLLER_T(&model, &cost, fb_controller, sampler, dt, max_iter, lambda, alpha, NUM_TIMESTEPS,
                                  init_control, stream);
  }

  // Frees everything allocated in SetUp (the stream itself is destroyed by the
  // individual tests that own it, e.g. BindToStream).
  void TearDown() override
  {
    delete controller;
    delete fb_controller;
    delete sampler;
  }
};

// The controller constructor must propagate its stream to every sub-object.
TEST_F(Cartpole_VanillaMPPI, BindToStream)
{
  EXPECT_EQ(controller->stream_, controller->model_->stream_) << "Stream bind to dynamics failure";
  EXPECT_EQ(controller->stream_, controller->cost_->stream_) << "Stream bind to cost failure";
  EXPECT_EQ(controller->stream_, controller->fb_controller_->getHostPointer()->stream_) << "Stream bind to feedback "
                                                                                           "failure";
  EXPECT_EQ(controller->stream_, controller->sampler_->stream_) << "Stream bind to sampling distribution failure";
  HANDLE_ERROR(cudaStreamDestroy(stream));
}

// Updating the sampler's std-dev params must be visible through the
// controller's sampler pointer.
TEST_F(Cartpole_VanillaMPPI, UpdateNoiseStdDev)
{
  control_array new_control_std_dev = control_array::Constant(3.5);
  auto sampler_params = sampler->getParams();
  for (int i = 0; i < DYN_T::CONTROL_DIM; i++)
  {
    sampler_params.std_dev[i] = new_control_std_dev[i];
  }
  sampler->setParams(sampler_params, true);
  EXPECT_FLOAT_EQ(new_control_std_dev[0], controller->sampler_->getParams().std_dev[0]);
}

// Closed-loop end-to-end test: drive the cartpole to the desired terminal
// state and expect the baseline cost to drop below 1.0 within 1000 steps.
TEST_F(Cartpole_VanillaMPPI, SwingUpTest)
{
  CartpoleQuadraticCostParams new_params;
  new_params.cart_position_coeff = 100;
  new_params.pole_angle_coeff = 200;
  new_params.cart_velocity_coeff = 10;
  new_params.pole_angular_velocity_coeff = 20;
  new_params.control_cost_coeff[0] = 1;
  new_params.terminal_cost_coeff = 0;
  new_params.desired_terminal_state[0] = -20;
  new_params.desired_terminal_state[1] = 0;
  new_params.desired_terminal_state[2] = M_PI;
  new_params.desired_terminal_state[3] = 0;
  cost.setParams(new_params);
  auto sampler_params = sampler->getParams();
  sampler_params.control_cost_coeff[0] = 1.0;
  sampler_params.pure_noise_trajectories_percentage = 0.01f;
  sampler_params.rewrite_controls_block_dim = dim3(32, 16, 1);
  sampler->setParams(sampler_params);
  auto controller_params = controller->getParams();
  max_iter = 1;
  controller_params.dynamics_rollout_dim_ = dim3(64, 4, 1);
  controller_params.cost_rollout_dim_ = dim3(64, 1, 1);
  controller_params.num_iters_ = max_iter;
  controller_params.slide_control_scale_[0] = 1.0;
  controller->setParams(controller_params);
  controller->chooseAppropriateKernel();
  DYN_T::state_array current_state = DYN_T::state_array::Zero();
  int time_horizon = 1000;
  DYN_T::state_array xdot(4, 1);
  for (int i = 0; i < time_horizon; ++i)
  {
    if (i % 50 == 0)
    {
      printf("Current Time: %f ", i * dt);
      printf("Current Baseline Cost: %f ", controller->getBaselineCost());
      model.printState(current_state.data());
    }
    // Compute the control
    controller->computeControl(current_state, 1);
    DYN_T::control_array control;
    control = controller->getControlSeq().block(0, 0, DYN_T::CONTROL_DIM, 1);
    // Increment the state
    model.computeStateDeriv(current_state, control, xdot);
    model.updateState(current_state, xdot, dt);
    controller->slideControlSequence(1);
  }
  EXPECT_LT(controller->getBaselineCost(), 1.0);
}

// Requesting 25% of sampled trajectories for visualization should yield
// exactly that fraction of NUM_ROLLOUTS output trajectories.
TEST_F(Cartpole_VanillaMPPI, getSampledStateTrajectoriesTest)
{
  // float dt = 0.01;
  // float max_iter = 1;
  // float lambda = 0.25;
  // float alpha = 0.01;
  // CartpoleDynamics::control_array control_std_dev = CartpoleDynamics::control_array::Constant(5.0);
  // auto fb_controller = DDPFeedback(&model, dt);
  // auto controller =
  //     VanillaMPPIController, 100, 2048, 64,
  //                           8>(&model, &cost, &fb_controller, dt, max_iter, lambda, alpha, control_std_dev);
  DYN_T::state_array current_state = DYN_T::state_array::Zero();
  controller->setPercentageSampledControlTrajectories(0.25);
  controller->calculateSampledStateTrajectories();
  const auto outputs = controller->getSampledOutputTrajectories();
  EXPECT_EQ(outputs.size(), 0.25 * NUM_ROLLOUTS);
}

// GoogleTest fixture: VanillaMPPI on the quadrotor system. Same structure as
// the cartpole fixture, but the feedback controller is held by value and the
// initial control sequence is seeded with gravity on the thrust channel.
class Quadrotor_VanillaMPPI : public ::testing::Test
{
public:
  static const int NUM_TIMESTEPS = 150;
  static const int NUM_ROLLOUTS = 2048;
  using DYN_T = QuadrotorDynamics;
  using COST_T = QuadrotorQuadraticCost;
  using FB_T = DDPFeedback;
  using SAMPLING_T = mppi::sampling_distributions::GaussianDistribution;
  using CONTROLLER_T = VanillaMPPIController;
  using control_trajectory = CONTROLLER_T::control_trajectory;
  using control_array = CONTROLLER_T::control_array;

  float dt = 0.01;
  int max_iter = 1;
  float lambda = 4.0;
  float alpha = 0.9;
  cudaStream_t stream;
  control_array control_std_dev = control_array::Constant(0.5);
  control_trajectory init_control = control_trajectory::Constant(0.0);
  DYN_T model;
  COST_T cost;
  FB_T fb_controller = FB_T(&model, dt);
  SAMPLING_T* sampler;
  CONTROLLER_T* controller;

  void SetUp() override
  {
    HANDLE_ERROR(cudaStreamCreate(&stream));
    auto sampler_params = SAMPLING_T::SAMPLING_PARAMS_T();
    for (int i = 0; i < DYN_T::CONTROL_DIM; i++)
    {
      sampler_params.std_dev[i] = control_std_dev[i];
    }
    // Seed the thrust channel (row 3) with gravity so the hover search starts
    // near equilibrium.
    for (int i = 0; i < NUM_TIMESTEPS; i++)
    {
      init_control(3, i) = mppi::math::GRAVITY;
    }
    sampler = new SAMPLING_T(sampler_params);
    controller = new CONTROLLER_T(&model, &cost, &fb_controller, sampler, dt, max_iter, lambda, alpha, NUM_TIMESTEPS,
                                  init_control, stream);
  }

  void TearDown() override
  {
    delete controller;
    delete sampler;
  }
};

// Closed-loop hover test: run 3000 steps, fail on any NaN, and require the
// quadrotor to stay within 0.15 m of the goal at least 90% of the time.
TEST_F(Quadrotor_VanillaMPPI, HoverTest)
{
  QuadrotorQuadraticCostParams new_params;
  new_params.x_goal()[2] = 1;
  new_params.x_coeff = 400;
  new_params.v_coeff = 150;
  // new_params.q_coeff = 15;
  new_params.roll_coeff = 15;
  new_params.pitch_coeff = 15;
  new_params.yaw_coeff = 15;
  new_params.w_coeff = 5;
  float acceptable_distance = 0.15;  // meters
  std::cout << "Goal: " << new_params.getDesiredState().transpose() << std::endl;
  cost.setParams(new_params);
  // int max_iter = 1;
  // float lambda = 4.0;
  // float alpha = 0.9;
  // DYN_T::control_array control_std_dev = DYN_T::control_array::Constant(0.5);
  // control_std_dev[3] = 2;
  auto sampler_params = sampler->getParams();
  sampler_params.std_dev[3] = 2.0f;
  sampler_params.rewrite_controls_block_dim = dim3(32, 16, 1);
  sampler_params.sum_strides = 32;
  sampler->setParams(sampler_params);
  // CONTROLLER::control_trajectory init_control = CONTROLLER::control_trajectory::Zero();
  // for (int i = 0; i < num_timesteps; i++)
  // {
  //   init_control(3, i) = mppi::math::GRAVITY;
  // }
  // auto controller = CONTROLLER(&model, &cost, &fb_controller, dt, max_iter, lambda, alpha, control_std_dev,
  //                              num_timesteps, init_control);
  auto controller_params = controller->getParams();
  controller_params.dynamics_rollout_dim_ = dim3(32, 4, 1);
  controller_params.cost_rollout_dim_ = dim3(32, 4, 1);
  controller->setParams(controller_params);
  controller->setDebug(true);
  controller->chooseAppropriateKernel();
  DYN_T::state_array current_state = DYN_T::state_array::Zero();
  // current_state(6) = 1; // set q_w to 1
  current_state(S_IND_CLASS(DYN_T::DYN_PARAMS_T, QUAT_W)) = 1;  // set q_w to 1
  int time_horizon = 3000;
  // float xdot[DYN_T::STATE_DIM];
  // NOTE(review): the (4, 1) constructor arguments look like a cartpole-sized
  // leftover; for a fixed-size Eigen state_array they are likely ignored —
  // confirm against the un-mangled source.
  DYN_T::state_array xdot(4, 1);
  DYN_T::control_array control;
  control = controller->getControlSeq().block(0, 0, DYN_T::CONTROL_DIM, 1);
  int far_away_cnt = 0;
  Eigen::Vector3f goal_pos = new_params.getDesiredState().block<3, 1>(0, 0);
  std::chrono::steady_clock::time_point loop_start = std::chrono::steady_clock::now();
  std::chrono::steady_clock::time_point loop_end = std::chrono::steady_clock::now();
  for (int i = 0; i < time_horizon; ++i)
  {
    if (i % 50 == 0)
    {
      printf("Current Time: %6.2f ", i * dt);
      printf("Current Baseline Cost: %f \n", controller->getBaselineCost());
      printf("State Cost: %f\n", controller->cost_->computeStateCost(current_state));
      model.printState(current_state.data());
      std::cout << "Control: " << control.transpose() << std::endl;
      std::cout << "ComputeControl took " << mppi::math::timeDiffms(loop_end, loop_start) << " ms" << std::endl;
    }
    if (std::isnan(controller->getBaselineCost()) || control.hasNaN() || current_state.hasNaN())
    {
      printf("ENCOUNTERED A NAN!!\n");
      printf("Current Time: %f ", i * dt);
      printf("Current Baseline Cost: %f \n", controller->getBaselineCost());
      model.printState(current_state.data());
      std::cout << "Control: " << control.transpose() << std::endl;
      FAIL();
    }
    // Compute the control
    loop_start = std::chrono::steady_clock::now();
    controller->computeControl(current_state, 1);
    loop_end = std::chrono::steady_clock::now();
    control = controller->getControlSeq().block(0, 0, DYN_T::CONTROL_DIM, 1);
    // Increment the state
    model.computeStateDeriv(current_state, control, xdot);
    model.updateState(current_state, xdot, dt);
    controller->slideControlSequence(1);
    Eigen::Vector3f pos = current_state.block<3, 1>(0, 0);
    float dist = (pos - goal_pos).norm();
    if (dist > acceptable_distance)
    {
      far_away_cnt++;
    }
  }
  float percentage_in_ball = float(far_away_cnt) / time_horizon;
  std::cout << "Amount of time outside " << acceptable_distance << " m: " << percentage_in_ball * 100 << "%"
            << std::endl;
  EXPECT_LT(percentage_in_ball, 0.1);
}
================================================ FILE: tests/cost_functions/CMakeLists.txt ================================================
# Builds one gtest executable per .cu test source in this directory and
# registers its tests with CTest via gtest_add_tests.
set(GTEST_LIBRARIES gtest gmock gtest_main)
file(GLOB TARGET_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cu)
foreach(T_FILE IN LISTS TARGET_SRCS)
  get_filename_component(T_NAME ${T_FILE} NAME_WE)
  add_executable(${T_NAME} ${PROJECT_SOURCE_DIR}/tests/test_main.cpp ${T_FILE})
  target_link_libraries(${T_NAME} ${GTEST_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME})
  # add_test(NAME ${T_NAME} COMMAND ${T_NAME})
  gtest_add_tests(TARGET ${T_NAME})
  set_target_properties(${T_NAME} PROPERTIES FOLDER test)
endforeach()
================================================ FILE: tests/cost_functions/autorally_robust_cost_test.cu ================================================
//
// Created by jgibson37 on 2/7/20.
//
// NOTE(review): extraction artifact — the original #include targets were
// stripped; bare "#include" tokens are preserved as found.
#include
#include
#include
#include
// Auto-generated header file
#include

// Fixture for the AutoRally robust cost function; SetUp installs a baseline
// parameter set shared by every test.
class ARRobustCostTest : public ::testing::Test
{
public:
  ARRobustCost cost;

protected:
  void SetUp() override
  {
    ARRobustCostParams params;
    params.boundary_threshold = 0.0;
    params.crash_coeff = 10000;
    params.track_slop = 0.0;
    params.desired_speed = 10;
    params.speed_coeff = 10;
    params.heading_coeff = 20;
    cost.setParams(params);
  }

  void TearDown() override
  {
  }
};

TEST_F(ARRobustCostTest, Constructor)
{
  // checks for CRTP
  ARRobustCost* robust = cost.cost_d_;
  EXPECT_EQ(robust, nullptr);
}

// GPUSetup must allocate the device-side cost object and flip GPUMemStatus_.
TEST_F(ARRobustCostTest, GPUSetup)
{
  cudaStream_t stream;
  HANDLE_ERROR(cudaStreamCreate(&stream));
  cost.bindToStream(stream);
  EXPECT_EQ(cost.stream_, stream) << "Stream binding failure.";
  EXPECT_EQ(cost.GPUMemStatus_, false);
  EXPECT_EQ(cost.cost_d_, nullptr);
  cost.GPUSetup();
  EXPECT_EQ(cost.GPUMemStatus_, true);
  EXPECT_NE(cost.cost_d_, nullptr);
  HANDLE_ERROR(cudaStreamDestroy(stream));
}

// Field-by-field comparison of two ARRobustCostParams instances
// (this function's body continues on the next chunk line).
void checkParameters(ARRobustCostParams& params, ARRobustCostParams& result)
{
  EXPECT_FLOAT_EQ(params.speed_coeff, result.speed_coeff);
// (continuation of checkParameters(...) begun on the previous chunk line)
EXPECT_FLOAT_EQ(params.track_coeff, result.track_coeff);
  EXPECT_FLOAT_EQ(params.heading_coeff, result.heading_coeff);
  EXPECT_FLOAT_EQ(params.control_cost_coeff[0], result.control_cost_coeff[0]);
  EXPECT_FLOAT_EQ(params.control_cost_coeff[1], result.control_cost_coeff[1]);
  EXPECT_FLOAT_EQ(params.slip_coeff, result.slip_coeff);
  EXPECT_FLOAT_EQ(params.crash_coeff, result.crash_coeff);
  EXPECT_FLOAT_EQ(params.boundary_threshold, result.boundary_threshold);
  EXPECT_FLOAT_EQ(params.max_slip_ang, result.max_slip_ang);
  EXPECT_FLOAT_EQ(params.track_slop, result.track_slop);
  EXPECT_FLOAT_EQ(params.r_c1.x, result.r_c1.x);
  EXPECT_FLOAT_EQ(params.r_c1.y, result.r_c1.y);
  EXPECT_FLOAT_EQ(params.r_c1.z, result.r_c1.z);
  EXPECT_FLOAT_EQ(params.r_c2.x, result.r_c2.x);
  EXPECT_FLOAT_EQ(params.r_c2.y, result.r_c2.y);
  EXPECT_FLOAT_EQ(params.r_c2.z, result.r_c2.z);
  EXPECT_FLOAT_EQ(params.trs.x, result.trs.x);
  EXPECT_FLOAT_EQ(params.trs.y, result.trs.y);
  EXPECT_FLOAT_EQ(params.trs.z, result.trs.z);
}

// Round-trips a fully-populated parameter struct through setParams/getParams
// on the host, then pushes the cost to the GPU and re-checks.
TEST_F(ARRobustCostTest, setParams)
{
  ARRobustCostParams params;
  params.speed_coeff = 1.0;
  params.track_coeff = 2.0;
  params.heading_coeff = 3.0;
  params.control_cost_coeff[0] = 4.0;
  params.control_cost_coeff[1] = 5.0;
  params.slip_coeff = 6.0;
  params.crash_coeff = 7.0;
  params.boundary_threshold = 8.0;
  params.max_slip_ang = 9.0;
  params.track_slop = 10.0;
  params.r_c1.x = 12;
  params.r_c1.y = 13;
  params.r_c1.z = 14;
  params.r_c2.x = 15;
  params.r_c2.y = 16;
  params.r_c2.z = 17;
  params.trs.x = 18;
  params.trs.y = 19;
  params.trs.z = 20;
  cost.setParams(params);
  ARRobustCostParams result = cost.getParams();
  checkParameters(params, result);
  cost.GPUSetup();
  int width, height;
  launchParameterTestKernel<>(cost, params, width, height);
  // NOTE(review): this re-checks the same host-side `result` captured before
  // GPUSetup(); the kernel's output does not appear to be re-read here —
  // confirm whether the GPU-side params were meant to be compared instead.
  checkParameters(params, result);
}

// Exercises the slip-angle stabilizing cost: zero below the slip threshold,
// slip_coeff-scaled above it, plus crash_coeff once max_slip_ang is exceeded.
TEST_F(ARRobustCostTest, getStabilizingCostTest)
{
  ARRobustCostParams params;
  params.max_slip_ang = 1.25;
  params.crash_coeff = 10000;
  params.slip_coeff = 10;
  cost.setParams(params);
  float s[7];
  for (int i = 0; i < 7; i++)
  {
    s[i] = 0;
  }
  s[4] = 0.24;
  s[5] = 0.0;
  float result = cost.getStabilizingCost(s);
  EXPECT_FLOAT_EQ(result, 0.0);
  s[4] = 1.0;
  s[5] = 1.0;
  result = cost.getStabilizingCost(s);
  EXPECT_FLOAT_EQ(result, 0.785398 * 10);
  // EXPECT_FLOAT_EQ(result, fabs(atan(s[5]/s[4])) * params.slip_coeff + params.crash_coeff);
  s[4] = 1.0;
  s[5] = 10.0;
  result = cost.getStabilizingCost(s);
  EXPECT_FLOAT_EQ(result, 1.4711 * 10 + params.crash_coeff);
  s[3] = 1.5;
  s[4] = 1.0;
  s[5] = 10.0;
  result = cost.getStabilizingCost(s);
  EXPECT_FLOAT_EQ(result, 1.4711 * 10 + params.crash_coeff);
}

// Host-side reference computation of the robust costmap lookup: averages the
// track value sampled at the front and back axle positions, clamped to the
// map bounds.
float calculateRobustCostmapValue(ARRobustCost& cost, float3 state, int width, int height, float x_min, float x_max,
                                  float y_min, float y_max, int ppm)
{
  float x_front = state.x + cost.FRONT_D * cosf(state.z);
  float y_front = state.y + cost.FRONT_D * sinf(state.z);
  float x_back = state.x + cost.BACK_D * cosf(state.z);
  float y_back = state.y + cost.BACK_D * sinf(state.z);
  // check for overflow
  float new_x = max(min(x_front - x_min, x_max - x_min), 0.0f);
  float new_y = max(min(y_front - y_min, y_max - y_min), 0.0f);
  // calculate the track value
  float front_track = fabs(height / 2.0f - (new_y)) + (new_x) / width;
  std::cout << "front point = " << new_x << ", " << new_y << " = " << front_track << std::endl;
  new_x = max(min(x_back - x_min + 1.0 / (width * ppm), x_max - x_min), 0.0f);
  new_y = max(min(y_back - y_min + 1.0 / (height * ppm), y_max - y_min), 0.0f);
  float back_track = fabs(height / 2.0f - (new_y)) + (new_x) / width;
  std::cout << "back point = " << new_x << ", " << new_y << " = " << back_track << std::endl;
  return (front_track + back_track) / 2.0f;
}

// GPU costmap cost with the speed map enabled (desired_speed = -1 selects
// speed-map mode); pins a golden value for a fixed state.
TEST_F(ARRobustCostTest, getCostmapCostSpeedMapTest)
{
  ARRobustCostParams params = cost.getParams();
  params.desired_speed = -1;
  cost.setParams(params);
  cost.GPUSetup();
  cost.loadTrackData(mppi::tests::robust_test_map_file);
  std::vector> states;
  std::array array = { 0.0 };
  array[0] = 3.0;     // X
  array[1] = 0.0;     // Y
  array[2] = M_PI_2;  // Theta
  array[3] = 0.0;     // Roll
  array[4] = 2.0;     // Vx
  array[5] = 1.0;     // Vy
  array[6] = 0.1;     // Yaw dot
  array[7] = 0.5;     // steering
  array[8] = 0.3;     // throttle
  states.push_back(array);
  std::vector cost_results;
  launchGetCostmapCostTestKernel(cost, states, cost_results);
  EXPECT_FLOAT_EQ(cost_results[0], 11629.229);
}

// Same query with the fixture's default desired_speed (no speed map);
// different golden value.
TEST_F(ARRobustCostTest, getCostmapCostSpeedNoMapTest)
{
  cost.GPUSetup();
  cost.loadTrackData(mppi::tests::robust_test_map_file);
  std::vector> states;
  std::array array = { 0.0 };
  array[0] = 3.0;     // X
  array[1] = 0.0;     // Y
  array[2] = M_PI_2;  // Theta
  array[3] = 0.0;     // Roll
  array[4] = 2.0;     // Vx
  array[5] = 1.0;     // Vy
  array[6] = 0.1;     // Yaw dot
  array[7] = 0.5;     // steering
  array[8] = 0.3;     // throttle
  states.push_back(array);
  std::vector cost_results;
  launchGetCostmapCostTestKernel(cost, states, cost_results);
  EXPECT_FLOAT_EQ(cost_results[0], 11349.729);
}

// Full GPU rollout-cost check against the shared test harness.
TEST_F(ARRobustCostTest, computeCostTest)
{
  cost.GPUSetup();
  cost.loadTrackData(mppi::tests::robust_test_map_file);
  checkGPURolloutCost(cost, 0.02);
  // std::vector> states;
  // std::array array = { 0.0 };
  // array[0] = 3.0;     // X
  // array[1] = 0.0;     // Y
  // array[2] = M_PI_2;  // Theta
  // array[3] = 0.0;     // Roll
  // array[4] = 2.0;     // Vx
  // array[5] = 1.0;     // Vy
  // array[6] = 0.1;     // Yaw dot
  // array[7] = 0.5;     // steering
  // array[8] = 0.3;     // throttle
  // states.push_back(array);
  // std::vector cost_results;
  // launchComputeCostTestKernel<>(cost, states, cost_results);
  // EXPECT_FLOAT_EQ(cost_results[0], 11349.729);
}
================================================ FILE: tests/cost_functions/autorally_standard_cost_test.cu ================================================
//
// Created by jason on 1/7/20.
//
// NOTE(review): extraction artifact — the original #include targets were
// stripped from this file; bare "#include" tokens are preserved as found.
#include
#include
#include
// Auto-generated header file
#include

// Default construction must not throw or touch the GPU.
TEST(ARStandardCost, Constructor)
{
  ARStandardCost cost;
}

TEST(ARStandardCost, BindStream)
{
  cudaStream_t stream;
  HANDLE_ERROR(cudaStreamCreate(&stream));
  ARStandardCost cost(stream);
  EXPECT_EQ(cost.stream_, stream) << "Stream binding failure.";
  HANDLE_ERROR(cudaStreamDestroy(stream));
}

// Host-side setParams/getParams round trip over every field.
TEST(ARStandardCost, SetGetParamsHost)
{
  ARStandardCostParams params;
  params.desired_speed = 25;
  params.speed_coeff = 2;
  params.track_coeff = 100;
  params.max_slip_ang = 1.5;
  params.slip_coeff = 1000;
  params.track_slop = 0.2;
  params.crash_coeff = 10000;
  params.control_cost_coeff[0] = 20;
  params.control_cost_coeff[1] = 10;
  params.boundary_threshold = 10;
  params.discount = 0.9;
  params.grid_res = 2;
  params.r_c1.x = 0;
  params.r_c1.y = 1;
  params.r_c1.z = 2;
  params.r_c2.x = 3;
  params.r_c2.y = 4;
  params.r_c2.z = 5;
  params.trs.x = 6;
  params.trs.y = 7;
  params.trs.z = 8;
  ARStandardCost cost;
  cost.setParams(params);
  ARStandardCostParams result_params = cost.getParams();
  EXPECT_FLOAT_EQ(params.desired_speed, result_params.desired_speed);
  EXPECT_FLOAT_EQ(params.speed_coeff, result_params.speed_coeff);
  EXPECT_FLOAT_EQ(params.track_coeff, result_params.track_coeff);
  EXPECT_FLOAT_EQ(params.max_slip_ang, result_params.max_slip_ang);
  EXPECT_FLOAT_EQ(params.slip_coeff, result_params.slip_coeff);
  EXPECT_FLOAT_EQ(params.track_slop, result_params.track_slop);
  EXPECT_FLOAT_EQ(params.crash_coeff, result_params.crash_coeff);
  EXPECT_FLOAT_EQ(params.control_cost_coeff[0], result_params.control_cost_coeff[0]);
  EXPECT_FLOAT_EQ(params.control_cost_coeff[1], result_params.control_cost_coeff[1]);
  EXPECT_FLOAT_EQ(params.boundary_threshold, result_params.boundary_threshold);
  EXPECT_FLOAT_EQ(params.discount, result_params.discount);
  EXPECT_EQ(params.grid_res, result_params.grid_res);
  EXPECT_FLOAT_EQ(params.r_c1.x, result_params.r_c1.x);
  EXPECT_FLOAT_EQ(params.r_c1.y, result_params.r_c1.y);
  EXPECT_FLOAT_EQ(params.r_c1.z, result_params.r_c1.z);
  EXPECT_FLOAT_EQ(params.r_c2.x, result_params.r_c2.x);
  EXPECT_FLOAT_EQ(params.r_c2.y, result_params.r_c2.y);
  EXPECT_FLOAT_EQ(params.r_c2.z, result_params.r_c2.z);
  EXPECT_FLOAT_EQ(params.trs.x, result_params.trs.x);
  EXPECT_FLOAT_EQ(params.trs.y, result_params.trs.y);
  EXPECT_FLOAT_EQ(params.trs.z, result_params.trs.z);
}

// Parameters must survive the trip to the GPU, including after a second
// setParams() once GPU memory is already allocated.
TEST(ARStandardCost, GPUSetupAndParamsToDeviceTest)
{
  // TODO make sure GPUMemStatus_ is false on the GPU so deallocation can be automatic
  ARStandardCostParams params;
  ARStandardCost cost;
  params.desired_speed = 25;
  params.speed_coeff = 2;
  params.track_coeff = 100;
  params.max_slip_ang = 1.5;
  params.slip_coeff = 1000;
  params.track_slop = 0.2;
  params.crash_coeff = 10000;
  params.control_cost_coeff[0] = 20;
  params.control_cost_coeff[1] = 10;
  params.boundary_threshold = 10;
  params.discount = 0.9;
  params.grid_res = 2;
  params.r_c1.x = 0;
  params.r_c1.y = 1;
  params.r_c1.z = 2;
  params.r_c2.x = 3;
  params.r_c2.y = 4;
  params.r_c2.z = 5;
  params.trs.x = 6;
  params.trs.y = 7;
  params.trs.z = 8;
  cost.setParams(params);
  EXPECT_EQ(cost.GPUMemStatus_, false);
  EXPECT_EQ(cost.cost_d_, nullptr);
  cost.GPUSetup();
  EXPECT_EQ(cost.GPUMemStatus_, true);
  EXPECT_NE(cost.cost_d_, nullptr);
  ARStandardCostParams result_params;
  int width, height;
  launchParameterTestKernel<>(cost, result_params, width, height);
  EXPECT_FLOAT_EQ(params.desired_speed, result_params.desired_speed);
  EXPECT_FLOAT_EQ(params.speed_coeff, result_params.speed_coeff);
  EXPECT_FLOAT_EQ(params.track_coeff, result_params.track_coeff);
  EXPECT_FLOAT_EQ(params.max_slip_ang, result_params.max_slip_ang);
  EXPECT_FLOAT_EQ(params.slip_coeff, result_params.slip_coeff);
  EXPECT_FLOAT_EQ(params.track_slop, result_params.track_slop);
  EXPECT_FLOAT_EQ(params.crash_coeff, result_params.crash_coeff);
  EXPECT_FLOAT_EQ(params.control_cost_coeff[0], result_params.control_cost_coeff[0]);
  EXPECT_FLOAT_EQ(params.control_cost_coeff[1], result_params.control_cost_coeff[1]);
  EXPECT_FLOAT_EQ(params.boundary_threshold, result_params.boundary_threshold);
  EXPECT_FLOAT_EQ(params.discount, result_params.discount);
  EXPECT_EQ(params.grid_res, result_params.grid_res);
  EXPECT_FLOAT_EQ(params.r_c1.x, result_params.r_c1.x);
  EXPECT_FLOAT_EQ(params.r_c1.y, result_params.r_c1.y);
  EXPECT_FLOAT_EQ(params.r_c1.z, result_params.r_c1.z);
  EXPECT_FLOAT_EQ(params.r_c2.x, result_params.r_c2.x);
  EXPECT_FLOAT_EQ(params.r_c2.y, result_params.r_c2.y);
  EXPECT_FLOAT_EQ(params.r_c2.z, result_params.r_c2.z);
  EXPECT_FLOAT_EQ(params.trs.x, result_params.trs.x);
  EXPECT_FLOAT_EQ(params.trs.y, result_params.trs.y);
  EXPECT_FLOAT_EQ(params.trs.z, result_params.trs.z);
  // neither should be set by this sequence
  EXPECT_EQ(width, -1);
  EXPECT_EQ(height, -1);
  params.desired_speed = 5;
  params.r_c1.x = 4;
  params.r_c1.y = 5;
  params.r_c1.z = 6;
  cost.setParams(params);
  launchParameterTestKernel<>(cost, result_params, width, height);
  EXPECT_FLOAT_EQ(params.desired_speed, result_params.desired_speed);
  EXPECT_FLOAT_EQ(params.speed_coeff, result_params.speed_coeff);
  EXPECT_FLOAT_EQ(params.track_coeff, result_params.track_coeff);
  EXPECT_FLOAT_EQ(params.max_slip_ang, result_params.max_slip_ang);
  EXPECT_FLOAT_EQ(params.slip_coeff, result_params.slip_coeff);
  EXPECT_FLOAT_EQ(params.track_slop, result_params.track_slop);
  EXPECT_FLOAT_EQ(params.crash_coeff, result_params.crash_coeff);
  EXPECT_FLOAT_EQ(params.control_cost_coeff[0], result_params.control_cost_coeff[0]);
  EXPECT_FLOAT_EQ(params.control_cost_coeff[1], result_params.control_cost_coeff[1]);
  EXPECT_FLOAT_EQ(params.boundary_threshold, result_params.boundary_threshold);
  EXPECT_FLOAT_EQ(params.discount, result_params.discount);
  EXPECT_EQ(params.grid_res, result_params.grid_res);
  EXPECT_FLOAT_EQ(params.r_c1.x, result_params.r_c1.x);
  EXPECT_FLOAT_EQ(params.r_c1.y, result_params.r_c1.y);
  EXPECT_FLOAT_EQ(params.r_c1.z, result_params.r_c1.z);
  EXPECT_FLOAT_EQ(params.r_c2.x, result_params.r_c2.x);
  EXPECT_FLOAT_EQ(params.r_c2.y, result_params.r_c2.y);
  EXPECT_FLOAT_EQ(params.r_c2.z, result_params.r_c2.z);
  EXPECT_FLOAT_EQ(params.trs.x, result_params.trs.x);
  EXPECT_FLOAT_EQ(params.trs.y, result_params.trs.y);
  EXPECT_FLOAT_EQ(params.trs.z, result_params.trs.z);
  // neither should be set by this sequence
  EXPECT_EQ(width, -1);
  EXPECT_EQ(height, -1);
}

// coorTransform applies the affine map built from r_c1/r_c2/trs to (x, y).
TEST(ARStandardCost, coorTransformTest)
{
  float x, y, u, v, w;
  ARStandardCostParams params;
  ARStandardCost cost;
  x = 0;
  y = 10;
  params.r_c1.x = 0;
  params.r_c1.y = 1;
  params.r_c1.z = 2;
  params.r_c2.x = 3;
  params.r_c2.y = 4;
  params.r_c2.z = 5;
  params.trs.x = 6;
  params.trs.y = 7;
  params.trs.z = 8;
  cost.setParams(params);
  cost.coorTransform(x, y, &u, &v, &w);
  EXPECT_FLOAT_EQ(u, 36);
  EXPECT_FLOAT_EQ(v, 47);
  EXPECT_FLOAT_EQ(w, 58);
}

TEST(ARStandardCost, changeCostmapSizeTestValidInputs)
{
  ARStandardCost cost;
  cost.changeCostmapSize(4, 8);
  EXPECT_EQ(cost.getWidth(), 4);
  EXPECT_EQ(cost.getHeight(), 8);
  EXPECT_EQ(cost.getTrackCostCPU().capacity(), 4 * 8);
  std::vector result;
  // launchCheckCudaArray(result, cost.getCudaArray(), 4*8);
  // EXPECT_EQ(result.size(), 4*8);
  // TODO verify that cuda is properly allocating the memory
  /*
  for(int i = 0; i < 4*8; i++) {
    EXPECT_FLOAT_EQ(result[i].x, 0);
    EXPECT_FLOAT_EQ(result[i].y, 0);
    EXPECT_FLOAT_EQ(result[i].z, 0);
    EXPECT_FLOAT_EQ(result[i].w, 0);
  }
  */
}

// Invalid dimensions must be rejected with a message on stderr and leave the
// previous size untouched.
TEST(ARStandardCost, changeCostmapSizeTestFail)
{
  ARStandardCost cost;
  cost.changeCostmapSize(4, 8);
  EXPECT_EQ(cost.getWidth(), 4);
  EXPECT_EQ(cost.getHeight(), 8);
  EXPECT_EQ(cost.getTrackCostCPU().capacity(), 4 * 8);
  testing::internal::CaptureStderr();
  cost.changeCostmapSize(-1, -1);
  std::string error_msg = testing::internal::GetCapturedStderr();
  EXPECT_NE(error_msg, "");
  EXPECT_EQ(cost.getWidth(), 4);
  EXPECT_EQ(cost.getHeight(), 8);
  EXPECT_EQ(cost.getTrackCostCPU().capacity(), 4 * 8);
}

// clearCostmapCPU(w, h) resizes and zero-fills the CPU-side costmap.
TEST(ARStandardCost, clearCostmapTest)
{
  ARStandardCost cost;
  cost.clearCostmapCPU(4, 8);
  EXPECT_EQ(cost.getWidth(), 4);
  EXPECT_EQ(cost.getHeight(), 8);
  EXPECT_EQ(cost.getTrackCostCPU().capacity(), 4 * 8);
  for (int i = 0; i < 4 * 8; i++)
  {
    EXPECT_FLOAT_EQ(cost.getTrackCostCPU().at(i).x, 0);
    EXPECT_FLOAT_EQ(cost.getTrackCostCPU().at(i).y, 0);
    EXPECT_FLOAT_EQ(cost.getTrackCostCPU().at(i).z, 0);
    EXPECT_FLOAT_EQ(cost.getTrackCostCPU().at(i).w, 0);
  }
}

// The no-argument overload reuses the previous size (and warns on stderr).
TEST(ARStandardCost, clearCostmapTestDefault)
{
  ARStandardCost cost;
  cost.clearCostmapCPU(4, 8);
  EXPECT_EQ(cost.getWidth(), 4);
  EXPECT_EQ(cost.getHeight(), 8);
  EXPECT_EQ(cost.getTrackCostCPU().capacity(), 4 * 8);
  for (int i = 0; i < 4 * 8; i++)
  {
    EXPECT_FLOAT_EQ(cost.getTrackCostCPU().at(i).x, 0);
    EXPECT_FLOAT_EQ(cost.getTrackCostCPU().at(i).y, 0);
    EXPECT_FLOAT_EQ(cost.getTrackCostCPU().at(i).z, 0);
    EXPECT_FLOAT_EQ(cost.getTrackCostCPU().at(i).w, 0);
  }
  testing::internal::CaptureStderr();
  cost.clearCostmapCPU();
  std::string error_msg = testing::internal::GetCapturedStderr();
  EXPECT_NE(error_msg, "");
  EXPECT_EQ(cost.getWidth(), 4);
  EXPECT_EQ(cost.getHeight(), 8);
  EXPECT_EQ(cost.getTrackCostCPU().capacity(), 4 * 8);
  for (int i = 0; i < 4 * 8; i++)
  {
    EXPECT_FLOAT_EQ(cost.getTrackCostCPU().at(i).x, 0);
    EXPECT_FLOAT_EQ(cost.getTrackCostCPU().at(i).y, 0);
    EXPECT_FLOAT_EQ(cost.getTrackCostCPU().at(i).z, 0);
    EXPECT_FLOAT_EQ(cost.getTrackCostCPU().at(i).w, 0);
  }
}

// With no prior size the no-argument clear must fail gracefully.
TEST(ARStandardCost, clearCostmapTestDefaultFail)
{
  ARStandardCost cost;
  testing::internal::CaptureStderr();
  cost.clearCostmapCPU();
  std::string error_msg = testing::internal::GetCapturedStderr();
  EXPECT_NE(error_msg, "");
  EXPECT_EQ(cost.getWidth(), -1);
  EXPECT_EQ(cost.getHeight(), -1);
  EXPECT_EQ(cost.getTrackCostCPU().capacity(), 0);
}

// updateTransform packs R's columns into r_c1/r_c2 and the translation into
// trs, without touching the GPU when none is set up.
TEST(ARStandardCost, updateTransformCPUTest)
{
  ARStandardCost cost;
  Eigen::MatrixXf R(3, 3);
  Eigen::ArrayXf trs(3);
  R(0, 0) = 1;
  R(0, 1) = 2;
  R(0, 2) = 3;
  R(1, 0) = 4;
  R(1, 1) = 5;
  R(1, 2) = 6;
  R(2, 0) = 7;
  R(2, 1) = 8;
  R(2, 2) = 9;
  trs(0) = 10;
  trs(1) = 11;
  trs(2) = 12;
  cost.updateTransform(R, trs);
  /*
   * Prospective transform matrix
   * r_c1.x, r_c2.x, trs.x
   * r_c1.y, r_c2.y, trs.y
   * r_c1.z, r_c2.z, trs.z
   */
  EXPECT_FLOAT_EQ(cost.getParams().r_c1.x, 1);
  EXPECT_FLOAT_EQ(cost.getParams().r_c2.x, 2);
  EXPECT_FLOAT_EQ(cost.getParams().trs.x, 10);
  EXPECT_FLOAT_EQ(cost.getParams().r_c1.y, 4);
  EXPECT_FLOAT_EQ(cost.getParams().r_c2.y, 5);
  EXPECT_FLOAT_EQ(cost.getParams().trs.y, 11);
  EXPECT_FLOAT_EQ(cost.getParams().r_c1.z, 7);
  EXPECT_FLOAT_EQ(cost.getParams().r_c2.z, 8);
  EXPECT_FLOAT_EQ(cost.getParams().trs.z, 12);
  EXPECT_EQ(cost.GPUMemStatus_, false);
}

// Same transform, verified on the device via a test kernel.
TEST(ARStandardCost, updateTransformGPUTest)
{
  ARStandardCost cost;
  cost.GPUSetup();
  Eigen::MatrixXf R(3, 3);
  Eigen::ArrayXf trs(3);
  R(0, 0) = 1;
  R(0, 1) = 2;
  R(0, 2) = 3;
  R(1, 0) = 4;
  R(1, 1) = 5;
  R(1, 2) = 6;
  R(2, 0) = 7;
  R(2, 1) = 8;
  R(2, 2) = 9;
  trs(0) = 10;
  trs(1) = 11;
  trs(2) = 12;
  cost.updateTransform(R, trs);
  /*
   * Prospective transform matrix
   * r_c1.x, r_c2.x, trs.x
   * r_c1.y, r_c2.y, trs.y
   * r_c1.z, r_c2.z, trs.z
   */
  std::vector results;
  launchTransformTestKernel<>(results, cost);
  EXPECT_EQ(results.size(), 3);
  EXPECT_FLOAT_EQ(results.at(0).x, 1);
  EXPECT_FLOAT_EQ(results.at(0).y, 4);
  EXPECT_FLOAT_EQ(results.at(0).z, 7);
  EXPECT_FLOAT_EQ(results.at(1).x, 2);
  EXPECT_FLOAT_EQ(results.at(1).y, 5);
  EXPECT_FLOAT_EQ(results.at(1).z, 8);
  EXPECT_FLOAT_EQ(results.at(2).x, 10);
  EXPECT_FLOAT_EQ(results.at(2).y, 11);
  EXPECT_FLOAT_EQ(results.at(2).z, 12);
  EXPECT_EQ(cost.GPUMemStatus_, true);
}

// A bad track-file path must produce an error message naming the path.
TEST(ARStandardCost, LoadTrackDataInvalidLocationTest)
{
  ARStandardCost cost;
  testing::internal::CaptureStderr();
  cost.loadTrackData("/null");
  std::string error_msg = testing::internal::GetCapturedStderr();
  EXPECT_NE(error_msg, "");
  EXPECT_NE(error_msg.find("/null", 0), std::string::npos);
}

// Loads the synthetic test map and verifies costmap contents plus the derived
// normalization transform, on both CPU and GPU.
TEST(ARStandardCost, LoadTrackDataTest)
{
  ARStandardCost cost;
  // TODO set parameters for cost map
  cost.GPUSetup();
  // load
  std::vector costmap = cost.loadTrackData(mppi::tests::test_map_file);
  Eigen::Matrix3f R = cost.getRotation();
  Eigen::Array3f trs = cost.getTranslation();
  EXPECT_FLOAT_EQ(costmap.at(0).x, 1);
  EXPECT_FLOAT_EQ(costmap.at(0).y, 10);
  EXPECT_FLOAT_EQ(costmap.at(0).z, 100);
  EXPECT_FLOAT_EQ(costmap.at(0).w, 1000);
  EXPECT_FLOAT_EQ(costmap.at(1).x, 2);
  EXPECT_FLOAT_EQ(costmap.at(1).y, 20);
  EXPECT_FLOAT_EQ(costmap.at(1).z, 200);
  EXPECT_FLOAT_EQ(costmap.at(1).w, 2000);
  // check transformation, should not have a rotation
  EXPECT_FLOAT_EQ(R(0, 0), 1.0 / (10));
  EXPECT_FLOAT_EQ(R(1, 1), 1.0 / (20));
  EXPECT_FLOAT_EQ(R(2, 2), 1.0);
  EXPECT_FLOAT_EQ(R(0, 1), 0);
  EXPECT_FLOAT_EQ(R(0, 2), 0);
  EXPECT_FLOAT_EQ(R(1, 0), 0);
  EXPECT_FLOAT_EQ(R(1, 2), 0);
  EXPECT_FLOAT_EQ(R(2, 0), 0);
  EXPECT_FLOAT_EQ(R(2, 1), 0);
  EXPECT_FLOAT_EQ(trs(0), 0.5);
  EXPECT_FLOAT_EQ(trs(1), 0.5);
  EXPECT_FLOAT_EQ(trs(2), 1);
  // check on the GPU
  std::vector results;
  launchTransformTestKernel<>(results, cost);
  EXPECT_EQ(results.size(), 3);
  // check diag
  EXPECT_FLOAT_EQ(results.at(0).x, 1.0 / 10);
  EXPECT_FLOAT_EQ(results.at(1).y, 1.0 / (20));
  EXPECT_FLOAT_EQ(results.at(0).y, 0);
  EXPECT_FLOAT_EQ(results.at(0).z, 0);
  EXPECT_FLOAT_EQ(results.at(1).x, 0);
  EXPECT_FLOAT_EQ(results.at(1).z, 0);
  EXPECT_FLOAT_EQ(results.at(2).x, 0.5);
  EXPECT_FLOAT_EQ(results.at(2).y, 0.5);
  EXPECT_FLOAT_EQ(results.at(2).z, 1);
  int counter = 0;
  for (int i = 0; i < 2 * 10; i++)
  {
    for (int j = 0; j < 2 * 20; j++)
    {
      EXPECT_FLOAT_EQ(costmap.at(counter).x, counter + 1);
      EXPECT_FLOAT_EQ(costmap.at(counter).y, (counter + 1) * 10);
      EXPECT_FLOAT_EQ(costmap.at(counter).z, (counter + 1) * 100);
      EXPECT_FLOAT_EQ(costmap.at(counter).w, (counter + 1) * 1000);
      counter++;
    }
  }
}

// costmapToTexture with no size set must warn on stderr.
TEST(ARStandardCost, costmapToTextureNoSizeTest)
{
  ARStandardCost cost;
  cost.GPUSetup();
  testing::internal::CaptureStderr();
  cost.costmapToTexture();
  std::string error_msg = testing::internal::GetCapturedStderr();
  EXPECT_NE(error_msg, "");
}

// A sized-but-unloaded costmap texture must read back all zeros.
TEST(ARStandardCost, costmapToTextureNoLoadTest)
{
  ARStandardCost cost;
  cost.GPUSetup();
  cost.changeCostmapSize(4, 8);
  cost.costmapToTexture();
  std::vector results;
  std::vector query_points;
  float2 point;
  point.x = 0.0f;
  point.y = 0.0f;
  query_points.push_back(point);
  point.x = 1.0f;
  point.y = 0.0f;
  query_points.push_back(point);
  launchTextureTestKernel<>(cost, results, query_points);
  EXPECT_EQ(query_points.size(), results.size());
  EXPECT_FLOAT_EQ(results.at(0).x, 0);
  EXPECT_FLOAT_EQ(results.at(0).y, 0);
  EXPECT_FLOAT_EQ(results.at(0).z, 0);
  EXPECT_FLOAT_EQ(results.at(0).w, 0);
  EXPECT_FLOAT_EQ(results.at(1).x, 0);
  EXPECT_FLOAT_EQ(results.at(1).y, 0);
  EXPECT_FLOAT_EQ(results.at(1).z, 0);
  EXPECT_FLOAT_EQ(results.at(1).w, 0);
}

// After loading the test map into the texture, normalized-coordinate queries
// must return the expected costmap texels.
TEST(ARStandardCost, costmapToTextureLoadTest)
{
  ARStandardCost cost;
  cost.GPUSetup();
  Eigen::Matrix3f R;
  Eigen::Array3f trs;
  std::vector costmap = cost.loadTrackData(mppi::tests::test_map_file);
  cost.costmapToTexture();
  std::vector results;
  std::vector query_points(8);
  query_points[0].x = 0;
  query_points[0].y = 0;
  query_points[1].x = 0.1;
  query_points[1].y = 0;
  query_points[2].x = 0.5;
  query_points[2].y = 0;
  query_points[3].x = 1.0;
  query_points[3].y = 0;
  query_points[4].x = 0.0;
  query_points[4].y = 0.5;
  query_points[5].x = 0.0;
  query_points[5].y = 1.0;
  query_points[6].x = 0.5;
  query_points[6].y = 0.5;
  query_points[7].x = 1.0;
  query_points[7].y = 1.0;
  launchTextureTestKernel<>(cost, results, query_points);
  EXPECT_EQ(query_points.size(), results.size());
  // 0,0
  EXPECT_FLOAT_EQ(results.at(0).x, 1);
  EXPECT_FLOAT_EQ(results.at(0).y, 10);
  EXPECT_FLOAT_EQ(results.at(0).z, 100);
  EXPECT_FLOAT_EQ(results.at(0).w, 1000);
  // 0.1, 0
  EXPECT_FLOAT_EQ(results.at(1).x, 2);
  EXPECT_FLOAT_EQ(results.at(1).y, 20);
  EXPECT_FLOAT_EQ(results.at(1).z, 200);
  EXPECT_FLOAT_EQ(results.at(1).w, 2000);
  // 0.5, 0
  EXPECT_FLOAT_EQ(results.at(2).x, 11);
  EXPECT_FLOAT_EQ(results.at(2).y, 110);
  EXPECT_FLOAT_EQ(results.at(2).z, 1100);
  EXPECT_FLOAT_EQ(results.at(2).w, 11000);
  // 1.0, 0
  EXPECT_FLOAT_EQ(results.at(3).x, 20);
  EXPECT_FLOAT_EQ(results.at(3).y, 200);
  EXPECT_FLOAT_EQ(results.at(3).z, 2000);
  EXPECT_FLOAT_EQ(results.at(3).w, 20000);
  // 0.0, 0.5
  EXPECT_FLOAT_EQ(results.at(4).x, 401);
  EXPECT_FLOAT_EQ(results.at(4).y, 4010);
  EXPECT_FLOAT_EQ(results.at(4).z, 40100);
  EXPECT_FLOAT_EQ(results.at(4).w, 401000);
  // 0.0, 1.0
  EXPECT_FLOAT_EQ(results.at(5).x, 781);
  EXPECT_FLOAT_EQ(results.at(5).y, 7810);
  EXPECT_FLOAT_EQ(results.at(5).z, 78100);
  EXPECT_FLOAT_EQ(results.at(5).w, 781000);
  // 0.5, 0.5
  EXPECT_FLOAT_EQ(results.at(6).x, 411);
  EXPECT_FLOAT_EQ(results.at(6).y, 4110);
  EXPECT_FLOAT_EQ(results.at(6).z, 41100);
  EXPECT_FLOAT_EQ(results.at(6).w, 411000);
  // 1,1
  EXPECT_FLOAT_EQ(results.at(7).x, 800);
  EXPECT_FLOAT_EQ(results.at(7).y, 8000);
  EXPECT_FLOAT_EQ(results.at(7).z, 80000);
  EXPECT_FLOAT_EQ(results.at(7).w, 800000);
}

// Same as above but querying in world coordinates through the transform
// kernel. NOTE(review): this test is truncated here by the chunk boundary and
// continues past the end of this view.
TEST(ARStandardCost, costmapToTextureTransformTest)
{
  ARStandardCost cost;
  cost.GPUSetup();
  std::vector costmap = cost.loadTrackData(mppi::tests::test_map_file);
  cost.costmapToTexture();
  std::vector results;
  std::vector query_points(8);
  query_points[0].x = -5;
  query_points[0].y = -10;
  query_points[1].x = -4;
  query_points[1].y = -10;
  query_points[2].x = 0;
  query_points[2].y = -10;
  query_points[3].x = 5;
  query_points[3].y = -10;
  query_points[4].x = -5;
  query_points[4].y = 0;
  query_points[5].x = -5;
  query_points[5].y = 10;
  query_points[6].x = 0;
  query_points[6].y = 0;
  query_points[7].x = 5;
  query_points[7].y = 10;
  launchTextureTransformTestKernel<>(cost, results, query_points);
  EXPECT_EQ(query_points.size(), results.size());
  // 0,0
  EXPECT_FLOAT_EQ(results.at(0).x, 1);
  EXPECT_FLOAT_EQ(results.at(0).y, 10);
  EXPECT_FLOAT_EQ(results.at(0).z, 100);
  EXPECT_FLOAT_EQ(results.at(0).w, 1000);
  // 0.1, 0
  EXPECT_FLOAT_EQ(results.at(1).x, 2);
  EXPECT_FLOAT_EQ(results.at(1).y, 20);
  EXPECT_FLOAT_EQ(results.at(1).z, 200);
  EXPECT_FLOAT_EQ(results.at(1).w, 2000);
  // 0.5, 0
  EXPECT_FLOAT_EQ(results.at(2).x, 11);
  EXPECT_FLOAT_EQ(results.at(2).y, 110);
  EXPECT_FLOAT_EQ(results.at(2).z, 1100);
  EXPECT_FLOAT_EQ(results.at(2).w, 11000);
  // 1.0, 0
  EXPECT_FLOAT_EQ(results.at(3).x, 20);
  EXPECT_FLOAT_EQ(results.at(3).y, 200);
  EXPECT_FLOAT_EQ(results.at(3).z, 2000);
  EXPECT_FLOAT_EQ(results.at(3).w, 20000);
  // 0.0, 0.5
  EXPECT_FLOAT_EQ(results.at(4).x,
401); EXPECT_FLOAT_EQ(results.at(4).y, 4010); EXPECT_FLOAT_EQ(results.at(4).z, 40100); EXPECT_FLOAT_EQ(results.at(4).w, 401000); // 0.0, 1.0 EXPECT_FLOAT_EQ(results.at(5).x, 781); EXPECT_FLOAT_EQ(results.at(5).y, 7810); EXPECT_FLOAT_EQ(results.at(5).z, 78100); EXPECT_FLOAT_EQ(results.at(5).w, 781000); // 0.5, 0.5 EXPECT_FLOAT_EQ(results.at(6).x, 411); EXPECT_FLOAT_EQ(results.at(6).y, 4110); EXPECT_FLOAT_EQ(results.at(6).z, 41100); EXPECT_FLOAT_EQ(results.at(6).w, 411000); // 1,1 EXPECT_FLOAT_EQ(results.at(7).x, 800); EXPECT_FLOAT_EQ(results.at(7).y, 8000); EXPECT_FLOAT_EQ(results.at(7).z, 80000); EXPECT_FLOAT_EQ(results.at(7).w, 800000); } TEST(ARStandardCost, TerminalCostTest) { ARStandardCost cost; cost.GPUSetup(); std::vector> states; std::array array = { 0.0 }; array[0] = 3.0; // X array[1] = 0.0; // Y array[2] = M_PI_2; // Theta array[3] = 0.0; // Roll array[4] = 2.0; // Vx array[5] = 1.0; // Vy array[6] = 0.1; // Yaw dot states.push_back(array); std::vector cost_results; launchTerminalCostTestKernel<>(cost, states, cost_results); EXPECT_FLOAT_EQ(cost_results[0], 0.0); } TEST(ARStandardCost, getSpeedCostTest) { ARStandardCost cost; ARStandardCostParams params; params.desired_speed = 25; params.speed_coeff = 10; cost.setParams(params); float state[7]; for (int i = 0; i < 7; i++) { state[i] = 0; } int crash = 0; state[4] = 10; float result = cost.getSpeedCost(state, &crash); EXPECT_FLOAT_EQ(result, 15 * 15 * 10); params.desired_speed = 0; params.speed_coeff = 100; cost.setParams(params); result = cost.getSpeedCost(state, &crash); EXPECT_FLOAT_EQ(result, 10 * 10 * 100); } TEST(ARStandardCost, getStablizingCostTest) { ARStandardCost cost; ARStandardCostParams params; params.slip_coeff = 25; params.crash_coeff = 1000; params.max_slip_ang = 0.5; cost.setParams(params); float state[7]; for (int i = 0; i < 7; i++) { state[i] = 0; } state[4] = 0.1; state[5] = 0.0; int crash = 0; float result = cost.getStabilizingCost(state, &crash); EXPECT_FLOAT_EQ(result, 0); 
EXPECT_EQ(crash, 0); state[5] = 0.01; result = cost.getStabilizingCost(state, &crash); EXPECT_FLOAT_EQ(result, 0.2483460072); EXPECT_EQ(crash, 0); state[5] = 0.2; result = cost.getStabilizingCost(state, &crash); EXPECT_FLOAT_EQ(result, 1030.6444); EXPECT_EQ(crash, 0); state[3] = 1.6; state[5] = 0.0; result = cost.getStabilizingCost(state, &crash); EXPECT_FLOAT_EQ(result, 0.0); EXPECT_EQ(crash, 1); state[3] = -1.6; result = cost.getStabilizingCost(state, &crash); EXPECT_FLOAT_EQ(result, 0.0); EXPECT_EQ(crash, 1); } TEST(ARStandardCost, getCrashCostTest) { ARStandardCost cost; ARStandardCostParams params; params.crash_coeff = 10000; cost.setParams(params); float state[7]; for (int i = 0; i < 7; i++) { state[i] = 0; } int crash = 0; state[4] = 10; float result = cost.getCrashCost(state, &crash, 10); EXPECT_FLOAT_EQ(result, 0); crash = 1; result = cost.getCrashCost(state, &crash, 10); EXPECT_FLOAT_EQ(result, 10000); } float calculateStandardCostmapValue(ARStandardCost& cost, float3 state, int width, int height, float x_min, float x_max, float y_min, float y_max, int ppm) { float x_front = state.x + cost.FRONT_D * cosf(state.z); float y_front = state.y + cost.FRONT_D * sinf(state.z); float x_back = state.x + cost.BACK_D * cosf(state.z); float y_back = state.y + cost.BACK_D * sinf(state.z); float new_x = max(min(x_front - x_min, x_max - x_min), 0.0f); float new_y = max(min(y_front - y_min, y_max - y_min), 0.0f); float front = fabs(height / 2.0f - (new_y)) + (new_x) / width; // std::cout << "front point = " << new_x << ", " << new_y << " = " << front << std::endl; new_x = max(min(x_back - x_min + 1.0 / (width * ppm), x_max - x_min), 0.0f); new_y = max(min(y_back - y_min + 1.0 / (height * ppm), y_max - y_min), 0.0f); float back = fabs(height / 2.0f - (new_y)) + (new_x) / width; // std::cout << "back point = " << new_x << ", " << new_y << " = " << back << std::endl; return (front + back) / 2.0f; } TEST(ARStandardCost, getTrackCostTest) { std::cout << 
"==========================\n\n" << std::endl; ARStandardCost cost; ARStandardCostParams params; params.track_coeff = 1; params.track_slop = 0.0; params.boundary_threshold = 1.0; cost.setParams(params); cost.GPUSetup(); cost.loadTrackData(mppi::tests::standard_test_map_file); std::vector states(4); states[0].x = -13.5; states[0].y = -10; states[0].z = 0.0; // theta states[1].x = 0; states[1].y = -10.0; states[1].z = 0.0; // theta states[2].x = 0.0; states[2].y = 0.0; states[2].z = M_PI / 2; // theta states[3].x = 3.0; states[3].y = 0.0; states[3].z = M_PI_2; // theta std::vector cost_results; std::vector crash_results; launchTrackCostTestKernel<>(cost, states, cost_results, crash_results); EXPECT_NEAR(cost_results[0], params.track_coeff * calculateStandardCostmapValue(cost, states[0], 30, 30, -13, 17, -10, 20, 20), 0.001); EXPECT_FLOAT_EQ(crash_results[0], 1.0); EXPECT_NEAR(cost_results[1], params.track_coeff * calculateStandardCostmapValue(cost, states[1], 30, 30, -13, 17, -10, 20, 20), 0.001); EXPECT_FLOAT_EQ(crash_results[1], 1.0); EXPECT_NEAR(cost_results[2], params.track_coeff * calculateStandardCostmapValue(cost, states[2], 30, 30, -13, 17, -10, 20, 20), 0.1); EXPECT_FLOAT_EQ(crash_results[2], 1.0); EXPECT_NEAR(cost_results[3], params.track_coeff * calculateStandardCostmapValue(cost, states[3], 30, 30, -13, 17, -10, 20, 20), 0.1); EXPECT_FLOAT_EQ(crash_results[3], 1.0); } TEST(ARStandardCost, computeCostIndividualTest) { ARStandardCost cost; ARStandardCostParams params; params.track_coeff = 0; params.speed_coeff = 0; params.crash_coeff = 0.0; params.slip_coeff = 0.0; params.control_cost_coeff[0] = 0.0; params.control_cost_coeff[1] = 0.0; params.discount = 0.9; cost.setParams(params); cost.GPUSetup(); cost.loadTrackData(mppi::tests::standard_test_map_file); params = cost.getParams(); std::vector> states; std::array array = { 0.0 }; array[0] = 3.0; // X array[1] = 0.0; // Y array[2] = M_PI_2; // Theta array[3] = 0.0; // Roll array[4] = 2.0; // Vx array[5] = 
1.0; // Vy array[6] = 0.1; // Yaw dot array[7] = 0.5; // steering array[8] = 0.3; // throttle states.push_back(array); std::vector cost_results; std::vector timestep; timestep.push_back(1); std::vector crash; crash.push_back(0); launchComputeCostTestKernel<>(cost, states, cost_results, timestep, crash); EXPECT_FLOAT_EQ(cost_results[0], 0.0); params.speed_coeff = 4.25; cost.setParams(params); float speed_cost = powf(4.0, 2) * 4.25; launchComputeCostTestKernel<>(cost, states, cost_results, timestep, crash); EXPECT_FLOAT_EQ(cost_results[0], speed_cost); params.speed_coeff = 0.0; params.slip_coeff = 10; cost.setParams(params); float slip_cost = powf(-atanf(1.0 / 2.0), 2) * 10; launchComputeCostTestKernel<>(cost, states, cost_results, timestep, crash); EXPECT_FLOAT_EQ(cost_results[0], slip_cost); params.slip_coeff = 0.0; params.track_coeff = 200.0; cost.setParams(params); float track_cost = 1116.3333; launchComputeCostTestKernel<>(cost, states, cost_results, timestep, crash); EXPECT_FLOAT_EQ(cost_results[0], track_cost); params.track_coeff = 0.0; params.crash_coeff = 10000; cost.setParams(params); float crash_cost = 9000; launchComputeCostTestKernel<>(cost, states, cost_results, timestep, crash); EXPECT_FLOAT_EQ(cost_results[0], crash_cost); params.speed_coeff = 4.25; params.track_coeff = 200; params.slip_coeff = 10; params.crash_coeff = 10000; cost.setParams(params); launchComputeCostTestKernel<>(cost, states, cost_results, timestep, crash); EXPECT_FLOAT_EQ(cost_results[0], speed_cost + slip_cost + track_cost + crash_cost); timestep[0] = 4; launchComputeCostTestKernel<>(cost, states, cost_results, timestep, crash); EXPECT_FLOAT_EQ(cost_results[0], speed_cost + slip_cost + track_cost + powf(0.9, timestep[0]) * params.crash_coeff); } TEST(ARStandardCost, computeCostOverflowTest) { ARStandardCost cost; ARStandardCostParams params; params.track_coeff = 0; params.speed_coeff = 10; params.crash_coeff = 0.0; params.slip_coeff = 0.0; params.control_cost_coeff[0] = 0.0; 
params.control_cost_coeff[1] = 0.0; params.desired_speed = ARStandardCost::MAX_COST_VALUE; cost.setParams(params); cost.GPUSetup(); cost.loadTrackData(mppi::tests::standard_test_map_file); params = cost.getParams(); std::vector> states; std::array array = { 0.0 }; array[0] = 3.0; // X array[1] = 0.0; // Y array[2] = M_PI_2; // Theta array[3] = 0.0; // Roll array[4] = 2.0; // Vx array[5] = 1.0; // Vy array[6] = 0.1; // Yaw dot array[7] = 0.5; // steering array[8] = 0.3; // throttle states.push_back(array); std::vector cost_results; std::vector timestep; timestep.push_back(1); std::vector crash; crash.push_back(0); launchComputeCostTestKernel<>(cost, states, cost_results, timestep, crash); EXPECT_FLOAT_EQ(cost_results[0], ARStandardCost::MAX_COST_VALUE); cost_results[0] = 0; params.desired_speed = NAN; cost.setParams(params); launchComputeCostTestKernel<>(cost, states, cost_results, timestep, crash); EXPECT_FLOAT_EQ(cost_results[0], ARStandardCost::MAX_COST_VALUE); } TEST(ARStandardCost, matchAutoRallyTest) { ARStandardCost cost; ARStandardCostParams params; params.desired_speed = 6.0; params.track_coeff = 200; params.speed_coeff = 4.25; params.crash_coeff = 10000.0; params.max_slip_ang = 1.25; params.slip_coeff = 10.0; params.track_slop = 0.0; params.boundary_threshold = 0.65; params.control_cost_coeff[0] = 0.0; params.control_cost_coeff[1] = 0.0; params.discount = 0.9; cost.setParams(params); cost.GPUSetup(); cost.loadTrackData(mppi::tests::ccrf_map); params = cost.getParams(); std::vector> states; std::vector timesteps; std::vector crash; std::array array = { 0.0 }; // input state 19.113796 -36.497066 7.431655 0.055569 4.922609 0.069374 -1.785045 // input control -0.134569 -0.485720 // timestep 1 // crash = 0 // cost returned 122.039955 array = { 19.113796, -36.497066, 7.431655, 0.055569, 4.922609, 0.069374, -1.785045, -0.134569, -0.485720 }; states.push_back(array); timesteps.push_back(1); crash.push_back(0); // input state 19.076693 -36.588978 7.397079 0.066307 
4.951416 0.068694 -1.694298 // input control 0.183665 -0.441535 // timestep 1 // cost returned 128.190033 // crash = 0 array = { 19.076693, -36.588978, 7.397079, 0.066307, 4.951416, 0.068694, -1.694298, 0.183665, -0.441535 }; states.push_back(array); timesteps.push_back(1); crash.push_back(0); // input state -1.470056 -13.121619 1.015522 -0.076247 5.938981 0.375569 2.230900 // input control 0.273864 -0.176446 // timestep, crash 1 0 // cost returned 9137.753906 array = { -1.470056, -13.121619, 1.015522, -0.076247, 5.938981, 0.375569, 2.230900, 0.273864, -0.176446 }; states.push_back(array); timesteps.push_back(1); crash.push_back(0); // input state -1.413821 -13.016726 0.970904 -0.079068 5.893999 0.607982 2.259659 // input control 0.225313 -0.035700 // timestep, crash 2 1 // cost returned 8402.457031 array = { -1.413821, -13.016726, 0.970904, -0.079068, 5.893999, 0.607982, 2.259659, 0.225313, -0.035700 }; states.push_back(array); timesteps.push_back(2); crash.push_back(1); // input state 7.273710 -10.255844 -0.030983 -0.067543 4.353069 -0.036942 -0.177069 // input control -0.038980 -0.335935 // timestep, crash 99 1 // cost returned 2857.133545 array = { 7.273710, -10.255844, -0.030983, -0.067543, 4.353069, -0.036942, -0.177069, -0.038980, -0.335935 }; states.push_back(array); timesteps.push_back(99); crash.push_back(1); std::vector cost_results; launchComputeCostTestKernel<>(cost, states, cost_results, timesteps, crash); EXPECT_FLOAT_EQ(cost_results[0], 122.039955); EXPECT_EQ(crash[0], 0); EXPECT_FLOAT_EQ(cost_results[1], 128.190033); EXPECT_EQ(crash[1], 0); EXPECT_FLOAT_EQ(cost_results[2], 9137.753906); EXPECT_EQ(crash[2], 1); EXPECT_FLOAT_EQ(cost_results[3], 8402.457031); EXPECT_EQ(crash[3], 1); EXPECT_FLOAT_EQ(cost_results[4], 2857.133545); EXPECT_EQ(crash[4], 1); } ================================================ FILE: tests/cost_functions/cartpole_quadratic_cost_test.cu ================================================ #include #include #include #include 
TEST(CartpoleQuadraticCost, Constructor)
{
  CartpoleQuadraticCost cost;
  // Test passes if the object is constructed without runtime error.
}

TEST(CartpoleQuadraticCost, BindStream)
{
  cudaStream_t stream;
  HANDLE_ERROR(cudaStreamCreate(&stream));
  CartpoleQuadraticCost cost(stream);
  EXPECT_EQ(cost.stream_, stream) << "Stream binding failure.";
  HANDLE_ERROR(cudaStreamDestroy(stream));
}

TEST(CartpoleQuadraticCost, GPUMemoryNull)
{
  // Before GPUSetup() is called the device-side pointer must be null.
  CartpoleQuadraticCost cost;
  ASSERT_EQ(cost.cost_d_, nullptr);
}

TEST(CartpoleQuadraticCost, GPUSetup)
{
  CartpoleQuadraticCost cost;
  cost.GPUSetup();
  ASSERT_FALSE(cost.cost_d_ == nullptr);
}

// Setting parameters on the host must be reflected by getParams().
TEST(CartpoleQuadraticCost, SetParamsCPU)
{
  CartpoleQuadraticCostParams new_params;
  new_params.cart_position_coeff = 3;
  new_params.pole_angle_coeff = 3;
  new_params.cart_velocity_coeff = 3;
  new_params.pole_angular_velocity_coeff = 3;
  new_params.control_cost_coeff[0] = 5;
  new_params.terminal_cost_coeff = 20;
  new_params.desired_terminal_state[0] = 3;
  new_params.desired_terminal_state[1] = 2;
  new_params.desired_terminal_state[2] = 3.14;
  new_params.desired_terminal_state[3] = 1;

  CartpoleQuadraticCost cost;
  cost.setParams(new_params);
  auto current_params = cost.getParams();

  EXPECT_FLOAT_EQ(new_params.cart_position_coeff, current_params.cart_position_coeff);
  EXPECT_FLOAT_EQ(new_params.pole_angle_coeff, current_params.pole_angle_coeff);
  EXPECT_FLOAT_EQ(new_params.cart_velocity_coeff, current_params.cart_velocity_coeff);
  EXPECT_FLOAT_EQ(new_params.pole_angular_velocity_coeff, current_params.pole_angular_velocity_coeff);
  EXPECT_FLOAT_EQ(new_params.control_cost_coeff[0], current_params.control_cost_coeff[0]);
  EXPECT_FLOAT_EQ(new_params.terminal_cost_coeff, current_params.terminal_cost_coeff);
  EXPECT_FLOAT_EQ(new_params.desired_terminal_state[0], current_params.desired_terminal_state[0]);
  EXPECT_FLOAT_EQ(new_params.desired_terminal_state[1], current_params.desired_terminal_state[1]);
  EXPECT_FLOAT_EQ(new_params.desired_terminal_state[2],
current_params.desired_terminal_state[2]); EXPECT_FLOAT_EQ(new_params.desired_terminal_state[3], current_params.desired_terminal_state[3]); } TEST(CartpoleQuadraticCost, SetParamsGPU) { CartpoleQuadraticCost cost; cost.GPUSetup(); CartpoleQuadraticCostParams new_params; new_params.cart_position_coeff = 5; new_params.pole_angle_coeff = 6; new_params.cart_velocity_coeff = 7; new_params.pole_angular_velocity_coeff = 8; new_params.control_cost_coeff[0] = 9; new_params.terminal_cost_coeff = 2000; new_params.desired_terminal_state[0] = 3; new_params.desired_terminal_state[1] = 2; new_params.desired_terminal_state[2] = 3.14; new_params.desired_terminal_state[3] = 1; CartpoleQuadraticCostParams gpu_params; cost.setParams(new_params); if (cost.GPUMemStatus_) { // Launch kernel to grab parameters from the GPU launchParameterTestKernel(cost, gpu_params); } else { FAIL() << "GPU Setup has not been called or is not functioning."; } EXPECT_FLOAT_EQ(new_params.cart_position_coeff, gpu_params.cart_position_coeff); EXPECT_FLOAT_EQ(new_params.pole_angle_coeff, gpu_params.pole_angle_coeff); EXPECT_FLOAT_EQ(new_params.cart_velocity_coeff, gpu_params.cart_velocity_coeff); EXPECT_FLOAT_EQ(new_params.pole_angular_velocity_coeff, gpu_params.pole_angular_velocity_coeff); EXPECT_FLOAT_EQ(new_params.control_cost_coeff[0], gpu_params.control_cost_coeff[0]); EXPECT_FLOAT_EQ(new_params.terminal_cost_coeff, gpu_params.terminal_cost_coeff); EXPECT_FLOAT_EQ(new_params.desired_terminal_state[0], gpu_params.desired_terminal_state[0]); EXPECT_FLOAT_EQ(new_params.desired_terminal_state[1], gpu_params.desired_terminal_state[1]); EXPECT_FLOAT_EQ(new_params.desired_terminal_state[2], gpu_params.desired_terminal_state[2]); EXPECT_FLOAT_EQ(new_params.desired_terminal_state[3], gpu_params.desired_terminal_state[3]); } TEST(CartpoleQuadraticCost, ComputeStateCost) { CartpoleQuadraticCost cost; CartpoleQuadraticCost::output_array state; state << 1, 2, 3, 4; float cost_compute = cost.computeStateCost(state); 
float cost_known = (state[0] - cost.getParams().desired_terminal_state[0]) * (state(0) - cost.getParams().desired_terminal_state[0]) * cost.getParams().cart_position_coeff + (state[1] - cost.getParams().desired_terminal_state[1]) * (state(1) - cost.getParams().desired_terminal_state[1]) * cost.getParams().cart_velocity_coeff + (state[2] - cost.getParams().desired_terminal_state[2]) * (state(2) - cost.getParams().desired_terminal_state[2]) * cost.getParams().pole_angle_coeff + (state[3] - cost.getParams().desired_terminal_state[3]) * (state(3) - cost.getParams().desired_terminal_state[3]) * cost.getParams().pole_angular_velocity_coeff; ASSERT_EQ(cost_known, cost_compute); } TEST(CartpoleQuadraticCost, ComputeRunningCost) { CartpoleQuadraticCost cost; CartpoleQuadraticCost::output_array state; CartpoleQuadraticCost::control_array control, noise, std_dev; state << 5, 3, 2, 4; control << 6; noise << 0.3; std_dev << 2; int timestep = 0; int crash_status[1] = { 0 }; float cost_compute = cost.computeRunningCost(state, control, timestep, crash_status); float cost_known = (state[0] - cost.getParams().desired_terminal_state[0]) * (state(0) - cost.getParams().desired_terminal_state[0]) * cost.getParams().cart_position_coeff + (state[1] - cost.getParams().desired_terminal_state[1]) * (state(1) - cost.getParams().desired_terminal_state[1]) * cost.getParams().cart_velocity_coeff + (state[2] - cost.getParams().desired_terminal_state[2]) * (state(2) - cost.getParams().desired_terminal_state[2]) * cost.getParams().pole_angle_coeff + (state[3] - cost.getParams().desired_terminal_state[3]) * (state(3) - cost.getParams().desired_terminal_state[3]) * cost.getParams().pole_angular_velocity_coeff; cost_known = cost_known; ASSERT_EQ(cost_known, cost_compute); } TEST(CartpoleQuadraticCost, ComputeTerminalCost) { CartpoleQuadraticCost cost; std::array state = { 2.f, 3.f, 7.f, 43.f }; // float cost_compute = cost.terminalCost(state); float cost_compute = 0.0f; float cost_known = ((state[0] - 
cost.getParams().desired_terminal_state[0]) * (state[0] - cost.getParams().desired_terminal_state[0]) * cost.getParams().cart_position_coeff + (state[1] - cost.getParams().desired_terminal_state[1]) * (state[1] - cost.getParams().desired_terminal_state[1]) * cost.getParams().cart_velocity_coeff + (state[2] - cost.getParams().desired_terminal_state[2]) * (state[2] - cost.getParams().desired_terminal_state[2]) * cost.getParams().pole_angle_coeff + (state[3] - cost.getParams().desired_terminal_state[3]) * (state[3] - cost.getParams().desired_terminal_state[3]) * cost.getParams().pole_angular_velocity_coeff) * cost.getParams().terminal_cost_coeff; ASSERT_EQ(cost_known, cost_compute); } ================================================ FILE: tests/cost_functions/general_cost_test.cu ================================================ #include #include #include #include #include template __global__ void copyParamsOnGPU(COST_T* cost_d, typename COST_T::COST_PARAMS_T* params_d) { *params_d = cost_d->getParams(); } template class GeneralCostTest : public ::testing::Test { public: COST_T cost; protected: void SetUp() override { } void TearDown() override { } }; using DIFFERENT_COSTS = ::testing::Types; TYPED_TEST_SUITE(GeneralCostTest, DIFFERENT_COSTS); TYPED_TEST(GeneralCostTest, GPUSetup) { ASSERT_EQ(this->cost.cost_d_, nullptr); this->cost.GPUSetup(); ASSERT_NE(this->cost.cost_d_, nullptr); } TYPED_TEST(GeneralCostTest, VerifyGPUParams) { using PARAMS_T = typename TypeParam::COST_PARAMS_T; this->cost.GPUSetup(); PARAMS_T* params_d; HANDLE_ERROR(cudaMalloc((void**)¶ms_d, sizeof(PARAMS_T))); PARAMS_T params_gpu_after, params_gpu_before; copyParamsOnGPU<<<1, 1>>>(this->cost.cost_d_, params_d); HANDLE_ERROR(cudaMemcpy(¶ms_gpu_before, params_d, sizeof(PARAMS_T), cudaMemcpyDeviceToHost)); PARAMS_T cost_params = this->cost.getParams(); cost_params.control_cost_coeff[0] = 27; cost_params.discount = 0.97; this->cost.setParams(cost_params); copyParamsOnGPU<<<1, 1>>>(this->cost.cost_d_, 
params_d); HANDLE_ERROR(cudaMemcpy(¶ms_gpu_after, params_d, sizeof(PARAMS_T), cudaMemcpyDeviceToHost)); // Ensure params before params update are different ASSERT_NE(params_gpu_before.discount, cost_params.discount); ASSERT_NE(params_gpu_before.control_cost_coeff[0], cost_params.control_cost_coeff[0]); // Ensure params after params update are the same ASSERT_EQ(params_gpu_after.discount, cost_params.discount); ASSERT_EQ(params_gpu_after.control_cost_coeff[0], cost_params.control_cost_coeff[0]); } TYPED_TEST(GeneralCostTest, CPUvsGPURolloutCost) { checkGPURolloutCost(this->cost, 0.01); } ================================================ FILE: tests/cost_functions/general_quadratic_costs.cu ================================================ #include #include #include typedef QuadraticCost DIQuadCost; TEST(DIQuadraticCost, Constructor) { DIQuadCost(); } template __global__ void computeCostKernel(COST_T* cost, float* s, int* t, float* result) { int crash = 0; *result = cost->computeStateCost(s, *t, nullptr, &crash); } class DITargetQuadraticCost : public testing::Test { public: using DYN = DoubleIntegratorDynamics; using COST = QuadraticCost; DIQuadCost cost; DIQuadCost::COST_PARAMS_T cost_params; DYN::output_array s; dim3 dimBlock; dim3 dimGrid; void SetUp() override { cost_params = cost.getParams(); cost_params.s_goal[0] = 1; cost_params.s_goal[1] = -5; cost_params.s_goal[2] = 0; cost_params.s_goal[3] = 0.5; cost.setParams(cost_params); s << 0, 0, 0, 0; dimBlock = dim3(10, 11, 5); dimGrid = dim3(1, 1, 1); } }; class DITrajQuadraticCost : public testing::Test { public: using DYN = DoubleIntegratorDynamics; static const int TIME_HORIZON = 5; using COST = QuadraticCostTrajectory; COST cost; COST::COST_PARAMS_T cost_params; float s_traj[28] = { 1, -5, 0, 0.5, 2, -5.5, 0, 1, 5, 5, 1, 10, 1, 1, 1, 1, 2, 3, 4, 5, 11, -5, 3, 8, 8, 8, 8, 8 }; DYN::output_array s; // GPU Variables dim3 dimBlock; dim3 dimGrid; float* s_dev; float* resulting_cost_d; int* time_d; cudaStream_t 
stream; void SetUp() override { cost_params = cost.getParams(); for (int i = 0; i < TIME_HORIZON * DYN::OUTPUT_DIM; i++) { cost_params.s_goal[i] = s_traj[i]; } // cost_params.s_goal[0] = 1; // cost_params.s_goal[1] = -5; // cost_params.s_goal[2] = 0; // cost_params.s_goal[3] = 0.5; cost.setParams(cost_params); s << 0, 0, 0, 0; dimBlock = dim3(10, 11, 5); dimGrid = dim3(1, 1, 1); HANDLE_ERROR(cudaMalloc((void**)&s_dev, sizeof(float) * DYN::OUTPUT_DIM)); HANDLE_ERROR(cudaMalloc((void**)&resulting_cost_d, sizeof(float))); HANDLE_ERROR(cudaMalloc((void**)&time_d, sizeof(int))); HANDLE_ERROR(cudaStreamCreate(&stream)); cost.bindToStream(stream); cost.GPUSetup(); } void TearDown() override { HANDLE_ERROR(cudaFree(s_dev)); HANDLE_ERROR(cudaFree(resulting_cost_d)); HANDLE_ERROR(cudaFree(time_d)); } }; TEST_F(DITargetQuadraticCost, SimpleStateCostCPU) { float state_cost = cost.computeStateCost(s); ASSERT_FLOAT_EQ(state_cost, 26.25); } TEST_F(DITargetQuadraticCost, LateStateCostCPU) { float state_cost = cost.computeStateCost(s, 10); ASSERT_FLOAT_EQ(state_cost, 26.25); } TEST_F(DITargetQuadraticCost, WeightStateCostCPU) { cost_params.s_coeffs[0] = 10; cost_params.s_coeffs[1] = 5; cost_params.s_coeffs[2] = 7; cost_params.s_coeffs[3] = .1; cost.setParams(cost_params); float state_cost = cost.computeStateCost(s); ASSERT_FLOAT_EQ(state_cost, 10 + 5 * 5 * 5 + 0 + 0.025); } TEST_F(DITargetQuadraticCost, LipschitzConstant) { cost_params.s_coeffs[0] = -10; cost_params.s_coeffs[1] = 5; cost_params.s_coeffs[2] = 7; cost_params.s_coeffs[3] = .1; cost.setParams(cost_params); float lipschitz_constant = cost.getLipshitzConstantCost(); ASSERT_FLOAT_EQ(lipschitz_constant, 20); } TEST_F(DITargetQuadraticCost, StateCostGPU) { cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); cost.bindToStream(stream); cost.GPUSetup(); int time = 0; float result_cost_h = -1; // Setup GPU params float* s_dev; float* resulting_cost_d; int* time_d; HANDLE_ERROR(cudaMalloc((void**)&s_dev, sizeof(float) 
* DYN::OUTPUT_DIM)); HANDLE_ERROR(cudaMalloc((void**)&resulting_cost_d, sizeof(float))); HANDLE_ERROR(cudaMalloc((void**)&time_d, sizeof(int))); HANDLE_ERROR(cudaMemcpyAsync(s_dev, s.data(), sizeof(float) * DYN::OUTPUT_DIM, cudaMemcpyHostToDevice, stream)); HANDLE_ERROR(cudaMemcpyAsync(time_d, &time, sizeof(int), cudaMemcpyHostToDevice, stream)); computeCostKernel<<>>(cost.cost_d_, s_dev, time_d, resulting_cost_d); HANDLE_ERROR(cudaMemcpyAsync(&result_cost_h, resulting_cost_d, sizeof(float), cudaMemcpyDeviceToHost, stream)); HANDLE_ERROR(cudaStreamSynchronize(stream)); ASSERT_FLOAT_EQ(result_cost_h, 26.25); } TEST_F(DITargetQuadraticCost, LateStateCostGPU) { cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); cost.bindToStream(stream); cost.GPUSetup(); int time = 10; float result_cost_h = -1; // Setup GPU params float* s_dev; float* resulting_cost_d; int* time_d; HANDLE_ERROR(cudaMalloc((void**)&s_dev, sizeof(float) * DYN::OUTPUT_DIM)); HANDLE_ERROR(cudaMalloc((void**)&resulting_cost_d, sizeof(float))); HANDLE_ERROR(cudaMalloc((void**)&time_d, sizeof(int))); HANDLE_ERROR(cudaMemcpyAsync(s_dev, s.data(), sizeof(float) * DYN::OUTPUT_DIM, cudaMemcpyHostToDevice, stream)); HANDLE_ERROR(cudaMemcpyAsync(time_d, &time, sizeof(int), cudaMemcpyHostToDevice, stream)); computeCostKernel<<>>(cost.cost_d_, s_dev, time_d, resulting_cost_d); HANDLE_ERROR(cudaMemcpyAsync(&result_cost_h, resulting_cost_d, sizeof(float), cudaMemcpyDeviceToHost, stream)); HANDLE_ERROR(cudaStreamSynchronize(stream)); ASSERT_FLOAT_EQ(result_cost_h, 26.25); } TEST_F(DITrajQuadraticCost, MidTrajStateCostGPU) { int time = 2; float result_cost_h = -1; // Setup GPU params HANDLE_ERROR(cudaMemcpyAsync(s_dev, s.data(), sizeof(float) * DYN::OUTPUT_DIM, cudaMemcpyHostToDevice, stream)); HANDLE_ERROR(cudaMemcpyAsync(time_d, &time, sizeof(int), cudaMemcpyHostToDevice, stream)); computeCostKernel<<>>(cost.cost_d_, s_dev, time_d, resulting_cost_d); HANDLE_ERROR(cudaMemcpyAsync(&result_cost_h, 
resulting_cost_d, sizeof(float), cudaMemcpyDeviceToHost, stream)); HANDLE_ERROR(cudaStreamSynchronize(stream)); float ground_truth = powf(cost_params.s_goal[8], 2) + powf(cost_params.s_goal[9], 2) + powf(cost_params.s_goal[10], 2) + powf(cost_params.s_goal[11], 2); ASSERT_FLOAT_EQ(result_cost_h, ground_truth); } ================================================ FILE: tests/cost_functions/quadrotor_map_cost_test.cu ================================================ #include #include #include TEST(QuadrotorMapCost, checkHeadingCost) { GTEST_SKIP_("The calculation has fundamentally changed, uses velocity vector and not orientation"); using COST = QuadrotorMapCost; COST cost; COST::output_array curr_state = COST::output_array::Zero(); Eigen::Quaternionf temp_quat; float deg2rad = M_PI / 180; // Have velocity in the y direction curr_state[4] = 1; // Get quaternion for yaw of 30 degrees mppi::math::Euler2QuatNWU(0, 0, 30.0 * deg2rad, temp_quat); temp_quat.normalize(); curr_state[6] = temp_quat.w(); curr_state[7] = temp_quat.x(); curr_state[8] = temp_quat.y(); curr_state[9] = temp_quat.z(); // Current state has yaw of 30 degrees auto params = cost.getParams(); params.attitude_coeff = 0; params.heading_coeff = 10; params.curr_waypoint = make_float4(0, 1, 0, 0); cost.setParams(params); float expected_cost = params.heading_coeff * powf(M_PI_2 - 30 * deg2rad, 2); float calculated_cost = cost.computeHeadingCost(curr_state.data()); EXPECT_FLOAT_EQ(expected_cost, calculated_cost); } TEST(QuadrotorMapCost, checkSpeedCost) { using COST = QuadrotorMapCost; COST cost; COST::output_array curr_state = COST::output_array::Zero(); Eigen::Quaternionf temp_quat; // Get quaternion for yaw of 30 degrees curr_state[3] = 3; curr_state[4] = 4; // Current state has yaw of 30 degrees auto params = cost.getParams(); params.speed_coeff = 10; params.desired_speed = 10; cost.setParams(params); float expected_cost = params.speed_coeff * powf(10 - 5, 2); float calculated_cost = 
cost.computeSpeedCost(curr_state.data()); EXPECT_FLOAT_EQ(expected_cost, calculated_cost); } ================================================ FILE: tests/cost_functions/quadrotor_quadratic_cost_test.cu ================================================ #include #include #include #include TEST(QuadrotorQuadraticCost, Constructor) { QuadrotorQuadraticCost(); } template void __global__ StateCostKernel(COST_T* costs, float* s_d, float* result) { result[0] = costs->computeStateCost(s_d); result[1] = costs->terminalCost(s_d); } TEST(QuadrotorQuadraticCost, ControlCost) { using COST = QuadrotorQuadraticCost; const int num_results = 2; /** * CPU Setup */ COST cost; COST::output_array s = COST::output_array::Random(); Eigen::Quaternionf q_test(s[6], s[7], s[8], s[9]); q_test.normalize(); s[6] = q_test.w(); s[7] = q_test.x(); s[8] = q_test.y(); s[9] = q_test.z(); // COST::control_array std_dev = COST::control_array::Constant(0.5); // float lambda = 0.8; QuadrotorQuadraticCostParams new_params; new_params.x_coeff = 5; cost.setParams(new_params); /** * GPU Setup */ cost.GPUSetup(); cudaStream_t stream_t; cudaStreamCreate(&stream_t); float* s_d; // float* eps_d; // float* std_dev_d; float* GPU_result_d; // Allocate Memory on GPU size_t state_size = sizeof(float) * COST::OUTPUT_DIM; HANDLE_ERROR(cudaMalloc((void**)&s_d, state_size)); // HANDLE_ERROR(cudaMalloc((void**)&eps_d, state_size)); // HANDLE_ERROR(cudaMalloc((void**)&std_dev_d, state_size)); HANDLE_ERROR(cudaMalloc((void**)&GPU_result_d, sizeof(float) * num_results)); // Copy data to GPU HANDLE_ERROR(cudaMemcpyAsync(s_d, s.data(), state_size, cudaMemcpyHostToDevice, stream_t)); // HANDLE_ERROR(cudaMemcpyAsync(eps_d, eps.data(), state_size, // cudaMemcpyHostToDevice, stream_t)); // HANDLE_ERROR(cudaMemcpyAsync(std_dev_d, std_dev.data(), state_size, // cudaMemcpyHostToDevice, stream_t)); HANDLE_ERROR(cudaStreamSynchronize(stream_t)); std::array CPU_result; CPU_result[0] = cost.computeStateCost(s); CPU_result[1] = 
cost.terminalCost(s); // call GPU Kernel std::array GPU_result; dim3 dimBlock(1, 1, 1); dim3 dimGrid(1, 1, 1); StateCostKernel<<>>(cost.cost_d_, s_d, GPU_result_d); CudaCheckError(); HANDLE_ERROR(cudaStreamSynchronize(stream_t)); // Copy GPU results back to Host HANDLE_ERROR( cudaMemcpyAsync(GPU_result.data(), GPU_result_d, sizeof(float) * num_results, cudaMemcpyDeviceToHost, stream_t)); HANDLE_ERROR(cudaStreamSynchronize(stream_t)); std::cout << "State Cost: " << CPU_result[0] << std::endl; array_assert_float_eq(CPU_result, GPU_result); } ================================================ FILE: tests/dynamics/CMakeLists.txt ================================================ set(GTEST_LIBRARIES gtest gmock gtest_main) file(GLOB TARGET_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cu) foreach(T_FILE IN LISTS TARGET_SRCS) get_filename_component(T_NAME ${T_FILE} NAME_WE) add_executable(${T_NAME} ${PROJECT_SOURCE_DIR}/tests/test_main.cpp ${T_FILE}) target_link_libraries(${T_NAME} ${GTEST_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME}) gtest_discover_tests(${T_NAME}) set_target_properties(${T_NAME} PROPERTIES FOLDER test) endforeach() ## Autorally Dynamics Constant Memory Tests #add_executable(ar_dynamics_nn_test_constant ../test_main.cpp ar_dynamics_nn_test.cu) #target_compile_definitions(ar_dynamics_nn_test_constant PRIVATE MPPI_NNET_USING_CONSTANT_MEM__) #target_link_libraries(ar_dynamics_nn_test_constant # ${GTEST_LIBRARIES} # cnpy) # #gtest_add_tests(TARGET ar_dynamics_nn_test_constant) # #set_target_properties(ar_dynamics_nn_test_constant PROPERTIES FOLDER test) ================================================ FILE: tests/dynamics/angle_utils_test.cu ================================================ #include #include #include #include TEST(AngleUtils, normalizeAngleKnownDouble) { double angle = 0.5; EXPECT_FLOAT_EQ(angle_utils::normalizeAngle(angle), 0.5); angle = M_PI_2; EXPECT_FLOAT_EQ(angle_utils::normalizeAngle(angle), M_PI_2); angle = M_PI; 
EXPECT_FLOAT_EQ(angle_utils::normalizeAngle(angle), M_PI); angle = -M_PI; EXPECT_FLOAT_EQ(angle_utils::normalizeAngle(angle), M_PI); angle = -M_PI * 3; EXPECT_FLOAT_EQ(angle_utils::normalizeAngle(angle), M_PI); angle = M_PI * 8; EXPECT_FLOAT_EQ(angle_utils::normalizeAngle(angle), 0); } TEST(AngleUtils, normalizeAngleKnownFloat) { float angle = 0.5f; EXPECT_FLOAT_EQ(angle_utils::normalizeAngle(angle), 0.5f); angle = static_cast(M_PI_2); EXPECT_FLOAT_EQ(angle_utils::normalizeAngle(angle), static_cast(M_PI_2)); angle = static_cast(M_PI); EXPECT_FLOAT_EQ(angle_utils::normalizeAngle(angle), static_cast(M_PI)); angle = -static_cast(M_PI); EXPECT_FLOAT_EQ(angle_utils::normalizeAngle(angle), static_cast(M_PI)); angle = -static_cast(M_PI) * 3.0f + 1e-3f; EXPECT_FLOAT_EQ(angle_utils::normalizeAngle(angle), -static_cast(M_PI) + 1e-3f); angle = static_cast(M_PI) * 8.0f; EXPECT_NEAR(angle_utils::normalizeAngle(angle), 0.0f, 1e-6); } TEST(AngleUtils, normalizeAngleRandom) { std::random_device rd; std::mt19937 e2(rd()); std::uniform_real_distribution<> dist(-100, 100); for (int i = 0; i < 1000; i++) { double rand = dist(e2); double result = angle_utils::normalizeAngle(rand); EXPECT_TRUE(result <= M_PI && result > -M_PI) << result; } for (int i = 0; i < 1000; i++) { float rand = dist(e2); float result = angle_utils::normalizeAngle(rand); EXPECT_TRUE(result <= M_PI && result > -M_PI) << result; } } TEST(AngleUtils, shortestAngularDistanceDouble) { double angle_1 = 0; double angle_2 = 0; EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), 0); angle_1 = 0; angle_2 = M_PI_2; EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), M_PI_2); angle_1 = 0; angle_2 = -M_PI; EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), M_PI); angle_1 = M_PI_4; angle_2 = -M_PI_4; EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), -M_PI_2); angle_1 = -M_PI_4; angle_2 = M_PI_4; 
EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), M_PI_2); angle_1 = 3 * M_PI_4; angle_2 = -3 * M_PI_4; EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), M_PI_2); angle_1 = -3 * M_PI_4; angle_2 = 3 * M_PI_4; EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), -M_PI_2); } TEST(AngleUtils, shortestAngularDistanceFloat) { float angle_1 = 0; float angle_2 = 0; EXPECT_NEAR(angle_utils::shortestAngularDistance(angle_1, angle_2), 0, 1e-6); angle_1 = 0; angle_2 = M_PI_2; EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), M_PI_2); // TODO is right? angle_1 = 0; angle_2 = -M_PI; EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), M_PI); angle_1 = M_PI_4; angle_2 = -M_PI_4; EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), -M_PI_2); angle_1 = -M_PI_4; angle_2 = M_PI_4; EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), M_PI_2); angle_1 = 3 * M_PI_4; angle_2 = -3 * M_PI_4; EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), M_PI_2); angle_1 = -3 * M_PI_4; angle_2 = 3 * M_PI_4; EXPECT_FLOAT_EQ(angle_utils::shortestAngularDistance(angle_1, angle_2), -M_PI_2); } TEST(AngleUtils, interpolateEulerAngleLinearDouble) { double angle_1 = 0; double angle_2 = 0; double alpha = 0.5; EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, alpha), 0); angle_1 = 0; angle_2 = M_PI_2; EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.0), 0); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.25), M_PI_4 / 2); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.5), M_PI_4); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.75), M_PI_4 * 3 / 2); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 1.0), M_PI_2); angle_1 = M_PI_4; angle_2 = -M_PI_4; 
EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.0), M_PI_4); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.25), M_PI_4 / 2); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.5), 0); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.75), -M_PI_4 / 2); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 1.0), -M_PI_4); angle_1 = 3 * M_PI_4; angle_2 = -3 * M_PI_4; EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.0), 3 * M_PI_4); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.25), 7 * M_PI_4 / 2); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.5), M_PI); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.75), -7 * M_PI_4 / 2); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 1.0), -3 * M_PI_4); } TEST(AngleUtils, interpolateEulerAngleLinearFloat) { float angle_1 = 0; float angle_2 = 0; float alpha = 0.5; EXPECT_NEAR(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, alpha), 0, 1e-7); angle_1 = 0; angle_2 = M_PI_2; EXPECT_NEAR(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.0f), 0, 1e-7); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.25f), M_PI_4 / 2); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.5f), M_PI_4); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.75f), M_PI_4 * 3 / 2); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 1.0f), M_PI_2); angle_1 = M_PI_4; angle_2 = -M_PI_4; EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.0f), M_PI_4); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.25f), M_PI_4 / 2); EXPECT_NEAR(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.5f), 0, 1e-7); 
EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.75f), -M_PI_4 / 2); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 1.0f), -M_PI_4); angle_1 = 3 * M_PI_4; angle_2 = -3 * M_PI_4; EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.0f), 3 * M_PI_4); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.25f), 7 * M_PI_4 / 2); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.5f), M_PI); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 0.75f), -7 * M_PI_4 / 2); EXPECT_FLOAT_EQ(angle_utils::interpolateEulerAngleLinear(angle_1, angle_2, 1.0f), -3 * M_PI_4); } ================================================ FILE: tests/dynamics/ar_dynamics_nn_test.cu ================================================ // // Created by jgibson37 on 1/13/20. // #include #include #include #include #include // Auto-generated header file #include #include "mppi/ddp/ddp_model_wrapper.h" /** * Note: the analytical solution for the test NN is outlined in the python script */ TEST(ARNeuralNetDynamics, verifyTemplateParamters) { int state_dim = NeuralNetModel<7, 2, 3>::STATE_DIM; EXPECT_EQ(state_dim, 7); int control_dim = NeuralNetModel<7, 2, 3>::CONTROL_DIM; EXPECT_EQ(control_dim, 2); int dynamics_dim = NeuralNetModel<7, 2, 3>::DYNAMICS_DIM; EXPECT_EQ(dynamics_dim, 7 - 3); NeuralNetModel<7, 2, 3> model; int* net_structure = model.getHelperPtr()->getNetStructurePtr(); EXPECT_EQ(net_structure[0], 6); EXPECT_EQ(net_structure[1], 32); EXPECT_EQ(net_structure[2], 32); EXPECT_EQ(net_structure[3], 4); EXPECT_EQ(model.getHelperPtr()->getLargestLayer(), 32); EXPECT_EQ(model.getHelperPtr()->getNumParams(), (6 + 1) * 32 + (32 + 1) * 32 + (32 + 1) * 4); EXPECT_EQ(model.getBlkSharedSizeBytes(), 256); EXPECT_EQ(model.getGrdSharedSizeBytes(), 5696); EXPECT_EQ(model.getHelperPtr()->getBlkSharedSizeBytes(), 256); 
EXPECT_EQ(model.getHelperPtr()->getGrdSharedSizeBytes(), 5696);
}

// The stream handed to the constructor must be bound to the model.
// NOTE(review): the extraction dump had stripped the template arguments from
// every std::array declaration in this span; they are restored from context
// (2 control channels with float2 min/max bounds, 1412 network parameters,
// 6 stride indices) -- confirm against the original source.
TEST(ARNeuralNetDynamics, BindStreamControlRanges)
{
  cudaStream_t stream;
  HANDLE_ERROR(cudaStreamCreate(&stream));
  std::array<float2, 2> u_constraint = {};
  u_constraint[0].x = -1.0;
  u_constraint[0].y = 1.0;
  u_constraint[1].x = -2.0;
  u_constraint[1].y = 2.0;
  NeuralNetModel<7, 2, 3> model(u_constraint, stream);
  EXPECT_EQ(model.stream_, stream) << "Stream binding failure.";
  HANDLE_ERROR(cudaStreamDestroy(stream));
}

// Stream binding must also work through the constructor overload that takes
// only a stream (control ranges defaulted).
TEST(ARNeuralNetDynamics, BindStreamDefaultArgRanges)
{
  cudaStream_t stream;
  HANDLE_ERROR(cudaStreamCreate(&stream));
  NeuralNetModel<7, 2, 3> model(stream);
  EXPECT_EQ(model.stream_, stream) << "Stream binding failure.";
  HANDLE_ERROR(cudaStreamDestroy(stream));
}

// Default-constructed models leave every control unconstrained (+/-FLT_MAX).
TEST(ARNeuralNetDynamics, ControlRangesSetDefaultCPU)
{
  NeuralNetModel<7, 2, 3> model;
  std::array<float2, 2> ranges = model.getControlRanges();
  for (int i = 0; i < 2; i++)
  {
    // BUGFIX: the loop previously checked ranges[0] on every iteration; use
    // the loop index so both control channels are actually verified.
    EXPECT_FLOAT_EQ(ranges[i].x, -FLT_MAX);
    EXPECT_FLOAT_EQ(ranges[i].y, FLT_MAX);
  }
}

// Control ranges passed into the constructor must be readable back unchanged.
TEST(ARNeuralNetDynamics, ControlRangesSetCPU)
{
  std::array<float2, 2> u_constraint = {};
  u_constraint[0].x = -1.0;
  u_constraint[0].y = 1.0;
  u_constraint[1].x = -2.0;
  u_constraint[1].y = 2.0;
  NeuralNetModel<7, 2, 3> model(u_constraint);
  std::array<float2, 2> ranges = model.getControlRanges();
  EXPECT_FLOAT_EQ(ranges[0].x, -1.0);
  EXPECT_FLOAT_EQ(ranges[0].y, 1.0);
  EXPECT_FLOAT_EQ(ranges[1].x, -2.0);
  EXPECT_FLOAT_EQ(ranges[1].y, 2.0);
}

// Default weight-stride offsets for the 6-32-32-4 network layout.
TEST(ARNeuralNetDynamics, stideIdcsSetDefault)
{
  NeuralNetModel<7, 2, 3> model;
  int* result = model.getHelperPtr()->getStrideIdcsPtr();
  EXPECT_EQ(result[0], 0);
  EXPECT_EQ(result[1], 192);
  EXPECT_EQ(result[2], 224);
  EXPECT_EQ(result[3], 1248);
  EXPECT_EQ(result[4], 1280);
  EXPECT_EQ(result[5], 1408);
}

// GPUSetup must mirror theta, the stride table, and the net structure onto
// the device; the kernel below copies them back for comparison.
TEST(ARNeuralNetDynamics, GPUSetupAndParamsCheck)
{
  NeuralNetModel<7, 2, 3> model;
  float* theta = model.getHelperPtr()->getThetaPtr();
  int* stride = model.getHelperPtr()->getStrideIdcsPtr();
  int* net_structure = model.getHelperPtr()->getNetStructurePtr();
  std::array<float, 1412> theta_result = {};
  std::array<int, 6>
stride_result = {}; std::array net_structure_result = {}; EXPECT_EQ(model.GPUMemStatus_, false); EXPECT_EQ(model.model_d_, nullptr); EXPECT_NE(model.getHelperPtr(), nullptr); model.GPUSetup(); EXPECT_EQ(model.GPUMemStatus_, true); EXPECT_NE(model.model_d_, nullptr); // launch kernel launchParameterCheckTestKernel, 1412, 6, 4>(model, theta_result, stride_result, net_structure_result); for (int i = 0; i < 1412; i++) { // these are a bunch of mostly random values and nan != nan if (!isnan(theta[i])) { EXPECT_FLOAT_EQ(theta_result[i], theta[i]); } } for (int i = 0; i < 6; i++) { EXPECT_EQ(stride_result[i], stride[i]); } for (int i = 0; i < 4; i++) { EXPECT_EQ(net_structure[i], net_structure_result[i]); } } TEST(ARNeuralNetDynamics, UpdateModelTest) { NeuralNetModel<7, 2, 3> model; float* theta = model.getHelperPtr()->getThetaPtr(); int* stride = model.getHelperPtr()->getStrideIdcsPtr(); int* net_structure = model.getHelperPtr()->getNetStructurePtr(); std::array theta_result = {}; std::array stride_result = {}; std::array net_structure_result = {}; model.GPUSetup(); std::vector theta_vec(1412); srand(time(NULL)); for (int i = 0; i < 1412; i++) { theta_vec[i] = rand(); } model.updateModel({ 6, 32, 32, 4 }, theta_vec); // check CPU for (int i = 0; i < 1412; i++) { // these are a bunch of mostly random values and nan != nan if (!isnan(theta_vec[i])) { EXPECT_FLOAT_EQ(model.getHelperPtr()->getThetaPtr()[i], theta_vec[i]); } } // launch kernel launchParameterCheckTestKernel, 1412, 6, 4>(model, theta_result, stride_result, net_structure_result); for (int i = 0; i < 1412; i++) { // these are a bunch of mostly random values and nan != nan if (!isnan(theta_vec[i])) { EXPECT_FLOAT_EQ(theta_result[i], theta_vec[i]) << "failed at index " << i; } } for (int i = 0; i < 6; i++) { EXPECT_EQ(stride_result[i], stride[i]); } for (int i = 0; i < 4; i++) { EXPECT_EQ(net_structure[i], net_structure_result[i]); } } TEST(ARNeuralNetDynamics, LoadModelTest) { NeuralNetModel<7, 2, 3> model; 
EXPECT_EQ(model.getGrdSharedSizeBytes(), model.getHelperPtr()->getGrdSharedSizeBytes()) << "Shared mem request " "doesn't match between " "Dynamics and Neural " "Network"; EXPECT_EQ(model.getBlkSharedSizeBytes(), model.getHelperPtr()->getBlkSharedSizeBytes()) << "Shared mem request " "doesn't match between " "Dynamics and Neural " "Network"; model.GPUSetup(); // TODO procedurally generate a NN in python and save and run like costs std::string path = mppi::tests::test_load_nn_file; model.loadParams(path); // check CPU for (int i = 0; i < 1412; i++) { EXPECT_FLOAT_EQ(model.getHelperPtr()->getThetaPtr()[i], i) << "failed at index " << i; } std::array theta_result = {}; std::array stride_result = {}; std::array net_structure_result = {}; EXPECT_EQ(model.getGrdSharedSizeBytes(), model.getHelperPtr()->getGrdSharedSizeBytes()) << "Shared mem request " "doesn't match between " "Dynamics and Neural " "Network"; EXPECT_EQ(model.getBlkSharedSizeBytes(), model.getHelperPtr()->getBlkSharedSizeBytes()) << "Shared mem request " "doesn't match between " "Dynamics and Neural " "Network"; // launch kernel launchParameterCheckTestKernel, 1412, 6, 4>(model, theta_result, stride_result, net_structure_result); for (int i = 0; i < 1412; i++) { EXPECT_FLOAT_EQ(theta_result[i], i) << "failed at index " << i; } } TEST(ARNeuralNetDynamics, computeKinematicsTestCPU) { NeuralNetModel<7, 2, 3> model; NeuralNetModel<7, 2, 3>::state_array s(7, 1); NeuralNetModel<7, 2, 3>::state_array s_der(7, 1); s(2) = 0.0; // yaw s(4) = 1.0; // body frame vx s(5) = 2.0; // body frame vy s(6) = 0.0; // yaw dot model.computeKinematics(s, s_der); EXPECT_FLOAT_EQ(s_der(0), 1.0); EXPECT_FLOAT_EQ(s_der(1), 2.0); EXPECT_FLOAT_EQ(s_der(2), 0.0); s(2) = M_PI / 2; // yaw s(4) = 3.0; // body frame vx s(5) = 5.0; // body frame vy s(6) = 1.0; // yaw dot model.computeKinematics(s, s_der); EXPECT_FLOAT_EQ(s_der(0), -5); EXPECT_FLOAT_EQ(s_der(1), 3.0); EXPECT_FLOAT_EQ(s_der(2), -1.0); } TEST(ARNeuralNetDynamics, 
computeKinematicsTestGPU) { NeuralNetModel<7, 2, 3> model; std::vector> s(1); std::vector> s_der(1); model.GPUSetup(); for (int y_dim = 1; y_dim < 17; y_dim++) { s[0] = { 0.0 }; s[0][2] = 0.0; // yaw s[0][4] = 1.0; // body frame vx s[0][5] = 2.0; // body frame vy s[0][6] = 0.0; // yaw dot s_der[0] = { 0.0 }; launchComputeKinematicsTestKernel, 7>(model, s, s_der, y_dim); EXPECT_FLOAT_EQ(s_der[0][0], 1.0); EXPECT_FLOAT_EQ(s_der[0][1], 2.0); EXPECT_FLOAT_EQ(s_der[0][2], 0.0); s[0][2] = M_PI / 2; // yaw s[0][4] = 3.0; // body frame vx s[0][5] = 5.0; // body frame vy s[0][6] = 1.0; // yaw dot launchComputeKinematicsTestKernel, 7>(model, s, s_der, y_dim); EXPECT_FLOAT_EQ(s_der[0][0], -5); EXPECT_FLOAT_EQ(s_der[0][1], 3.0); EXPECT_FLOAT_EQ(s_der[0][2], -1.0); } } TEST(ARNeuralNetDynamics, updateStateGPUTest) { NeuralNetModel<7, 2, 3> model; std::vector> s(1); // x_dot, y_dot, theta_dot std::vector> s_der(1); model.GPUSetup(); for (int j = 1; j < 17; j++) { s[0] = { 0.0 }; s[0][2] = 0.0; // yaw s[0][4] = 1.0; // body frame vx s[0][5] = 2.0; // body frame vy s[0][6] = 0.0; // yaw dot s_der[0] = { 0.0 }; s_der[0][0] = 1.0; s_der[0][1] = 2.0; s_der[0][2] = 3.0; launchUpdateStateTestKernel, 7>(model, s, s_der, 0.1, j); EXPECT_FLOAT_EQ(s_der[0][0], 1); EXPECT_FLOAT_EQ(s_der[0][1], 2); EXPECT_FLOAT_EQ(s_der[0][2], 3); EXPECT_FLOAT_EQ(s_der[0][3], 0); EXPECT_FLOAT_EQ(s_der[0][4], 0); EXPECT_FLOAT_EQ(s_der[0][5], 0); EXPECT_FLOAT_EQ(s_der[0][6], 0); EXPECT_FLOAT_EQ(s[0][0], 0.1); EXPECT_FLOAT_EQ(s[0][1], 0.2); EXPECT_FLOAT_EQ(s[0][2], 0.3); EXPECT_FLOAT_EQ(s[0][3], 0.0); EXPECT_FLOAT_EQ(s[0][4], 1.0); EXPECT_FLOAT_EQ(s[0][5], 2.0); EXPECT_FLOAT_EQ(s[0][6], 0.0); } } /** * * @tparam CLASS_T * @param model * @param s * @param ds * @param u * @param du */ template void compareFiniteDifferenceGradient(CLASS_T& model, Eigen::MatrixXf& s, Eigen::MatrixXf& ds, Eigen::MatrixXf& u, Eigen::MatrixXf& du) { Eigen::MatrixXf s_2(7, 1); s_2 = s + ds; Eigen::MatrixXf u_2(2, 1); u_2 = u + du; 
Eigen::MatrixXf s_der(7, 1); Eigen::MatrixXf s_der_2(7, 1); s_der.setZero(); s_der_2.setZero(); Eigen::MatrixXf calculated_A(7, 7); Eigen::MatrixXf calculated_B(7, 2); model.computeDynamics(s_2, u_2, s_der_2); model.computeDynamics(s, u, s_der); std::cout << "s_der\n" << s_der << std::endl; std::cout << "s_der_2\n" << s_der_2 << std::endl; std::cout << "s_der_2 - s_der\n" << (s_der_2 - s_der) << std::endl; Eigen::MatrixXf A(7, 7); Eigen::MatrixXf B(7, 2); model.computeGrad(s, u, A, B); std::cout << "A = \n" << A << std::endl; std::cout << "B = \n" << B << std::endl; // compare A for (int i = 0; i < 7; i++) { for (int j = 0; j < 7; j++) { EXPECT_NEAR(calculated_A(i, j), A(i, j), 0.01) << "failed at index = " << i << ", " << j; } } // compare B for (int i = 0; i < 7; i++) { for (int j = 0; j < 2; j++) { EXPECT_NEAR(calculated_B(i, j), B(i, j), 0.01) << "failed at index = " << i << ", " << j; } } } /* // Note math for analytical solution is in the python script TEST(ARNeuralNetDynamics, computeGrad) { GTEST_SKIP(); NeuralNetModel<7,2,3,6,32,32,4> model; Eigen::MatrixXf s(7, 1); Eigen::MatrixXf ds(7, 1); Eigen::MatrixXf u(2, 1); Eigen::MatrixXf du(2, 1); s.setZero(); ds.setZero(); u.setZero(); du.setZero(); ds << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0; std::vector theta(1412); std::fill(theta.begin(), theta.end(), 1); model.updateModel({6, 32, 32, 4}, theta); compareFiniteDifferenceGradient(model, s, ds, u, du); } */ TEST(ARNeuralNetDynamics, computeDynamicsCPU) { NeuralNetModel<7, 2, 3> model; NeuralNetModel<7, 2, 3>::state_array s; NeuralNetModel<7, 2, 3>::control_array u; NeuralNetModel<7, 2, 3>::state_array s_der; s.setZero(); s_der.setZero(); u << 1, -1; std::vector theta(1412); std::fill(theta.begin(), theta.end(), 1); model.updateModel({ 6, 32, 32, 4 }, theta); model.computeDynamics(s, u, s_der); EXPECT_FLOAT_EQ(s(0), 0); EXPECT_FLOAT_EQ(s(1), 0); EXPECT_FLOAT_EQ(s(2), 0); EXPECT_FLOAT_EQ(s(3), 0); EXPECT_FLOAT_EQ(s(4), 0); EXPECT_FLOAT_EQ(s(5), 0); 
EXPECT_FLOAT_EQ(s(6), 0); EXPECT_FLOAT_EQ(s_der(0), 0); EXPECT_FLOAT_EQ(s_der(1), 0); EXPECT_FLOAT_EQ(s_der(2), 0); EXPECT_FLOAT_EQ(s_der(3), 33); EXPECT_FLOAT_EQ(s_der(4), 33); EXPECT_FLOAT_EQ(s_der(5), 33); EXPECT_FLOAT_EQ(s_der(6), 33); EXPECT_FLOAT_EQ(u(0), 1.0); EXPECT_FLOAT_EQ(u(1), -1.0); } TEST(ARNeuralNetDynamics, computeDynamicsGPU) { NeuralNetModel<7, 2, 3> model; std::vector> s(1); // x_dot, y_dot, theta_dot std::vector> s_der(1); // steering, throttle std::vector> u(1); std::vector theta(1412); model.GPUSetup(); std::fill(theta.begin(), theta.end(), 1); model.updateModel({ 6, 32, 32, 4 }, theta); for (int y_dim = 1; y_dim < 17; y_dim++) { s[0] = { 0 }; s_der[0] = { 0 }; u[0] = { 0 }; u[0][0] = 1.0; u[0][1] = -1.0; launchComputeDynamicsTestKernel, 7, 2>(model, s, u, s_der, y_dim); EXPECT_FLOAT_EQ(s[0][0], 0); EXPECT_FLOAT_EQ(s[0][1], 0); EXPECT_FLOAT_EQ(s[0][2], 0); EXPECT_FLOAT_EQ(s[0][3], 0); EXPECT_FLOAT_EQ(s[0][4], 0); EXPECT_FLOAT_EQ(s[0][5], 0); EXPECT_FLOAT_EQ(s[0][6], 0); EXPECT_FLOAT_EQ(s_der[0][0], 0); EXPECT_FLOAT_EQ(s_der[0][1], 0); EXPECT_FLOAT_EQ(s_der[0][2], 0); EXPECT_FLOAT_EQ(s_der[0][3], 33) << "at y_dim " << y_dim; EXPECT_FLOAT_EQ(s_der[0][4], 33) << "at y_dim " << y_dim; EXPECT_FLOAT_EQ(s_der[0][5], 33) << "at y_dim " << y_dim; EXPECT_FLOAT_EQ(s_der[0][6], 33) << "at y_dim " << y_dim; EXPECT_FLOAT_EQ(u[0][0], 1.0); EXPECT_FLOAT_EQ(u[0][1], -1.0); } } // TODO compute state deriv CPU TEST(ARNeuralNetDynamics, computeStateDerivCPU) { } TEST(ARNeuralNetDynamics, computeStateDerivGPU) { NeuralNetModel<7, 2, 3> model; model.GPUSetup(); std::vector> s(1); // x_dot, y_dot, theta_dot std::vector> s_der(1); // steering, throttle std::vector> u(1); std::vector theta(1412); std::fill(theta.begin(), theta.end(), 1); model.updateModel({ 6, 32, 32, 4 }, theta); for (int j = 1; j < 17; j++) { s[0] = { 0.0 }; s[0][4] = 1; s[0][5] = 2; s[0][6] = 3; s_der[0] = { 0.0 }; u[0] = { 0.0 }; launchComputeStateDerivTestKernel, 7, 2>(model, s, u, s_der, j); 
EXPECT_FLOAT_EQ(s[0][0], 0); EXPECT_FLOAT_EQ(s[0][1], 0); EXPECT_FLOAT_EQ(s[0][2], 0); EXPECT_FLOAT_EQ(s[0][3], 0); EXPECT_FLOAT_EQ(s[0][4], 1); EXPECT_FLOAT_EQ(s[0][5], 2); EXPECT_FLOAT_EQ(s[0][6], 3); EXPECT_FLOAT_EQ(s_der[0][0], 1); EXPECT_FLOAT_EQ(s_der[0][1], 2); EXPECT_FLOAT_EQ(s_der[0][2], -3); EXPECT_FLOAT_EQ(s_der[0][3], 33); EXPECT_FLOAT_EQ(s_der[0][4], 33); EXPECT_FLOAT_EQ(s_der[0][5], 33); EXPECT_FLOAT_EQ(s_der[0][6], 33); EXPECT_FLOAT_EQ(u[0][0], 0); EXPECT_FLOAT_EQ(u[0][1], 0); } } // TODO add in a generic GPU/CPU method call void parseTextIntoDataPointHelper(std::string text, std::array& state, std::array& state_result, std::array& state_der, std::array& control) { size_t line_pos = 0; size_t prev_line_pos = 1; int what_var = 0; text.append(" *"); while ((line_pos = text.find("*", prev_line_pos)) != std::string::npos) { std::string line = text.substr(prev_line_pos, line_pos - prev_line_pos); line.append(" "); size_t value_pos = 0; size_t prev_value_pos = 0; int counter = 0; while ((value_pos = line.find(" ", prev_value_pos)) != std::string::npos) { std::string value = line.substr(prev_value_pos, value_pos - prev_value_pos); // makes sure it is a number if (!value.empty()) { float number = 0; // = std::stoi(value.substr(0, std::string::npos)); if (value[0] == '-' && isdigit(value[1]) || isdigit(value[0])) { number = std::stof(value); } else { prev_value_pos = value_pos + 1; continue; } if (what_var == 0) { state[counter++] = number; } else if (what_var == 1) { control[counter++] = number; } else if (what_var == 2) { state_der[counter++] = number; } else if (what_var == 3) { state_result[counter++] = number; } } prev_value_pos = std::min(value_pos + 1, line.length()); } what_var++; prev_line_pos = std::min(line_pos + 1, text.length()); } } class DynamicsDummy : public NeuralNetModel<7, 2, 3> { public: bool computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B) { return false; }; }; TEST(ARNeuralNetDynamics, 
computeGradTest) { std::string path = mppi::tests::old_autorally_network_file; Eigen::Matrix::STATE_DIM, NeuralNetModel<7, 2, 3>::STATE_DIM + NeuralNetModel<7, 2, 3>::CONTROL_DIM> numeric_jac; Eigen::Matrix::STATE_DIM, NeuralNetModel<7, 2, 3>::STATE_DIM + NeuralNetModel<7, 2, 3>::CONTROL_DIM> analytic_jac; NeuralNetModel<7, 2, 3>::state_array state; state << 4.264431, -30.974377, -0.955451, -0.028595, 3.346700, 0.048521, 0.315486; NeuralNetModel<7, 2, 3>::control_array control; control << -0.221381, 0.089168; auto analytic_grad_model = NeuralNetModel<7, 2, 3>(); analytic_grad_model.loadParams(path); NeuralNetModel<7, 2, 3>::dfdx A_analytic = NeuralNetModel<7, 2, 3>::dfdx::Zero(); NeuralNetModel<7, 2, 3>::dfdu B_analytic = NeuralNetModel<7, 2, 3>::dfdu::Zero(); analytic_grad_model.computeGrad(state, control, A_analytic, B_analytic); auto numerical_grad_model = DynamicsDummy(); numerical_grad_model.loadParams(path); std::shared_ptr> ddp_model = std::make_shared>(&numerical_grad_model); analytic_jac.leftCols::STATE_DIM>() = A_analytic; analytic_jac.rightCols::CONTROL_DIM>() = B_analytic; numeric_jac = ddp_model->df(state, control); ASSERT_LT((numeric_jac - analytic_jac).norm(), 1e-1) << "Numeric Jacobian\n" << numeric_jac << "\nAnalytic Jacobian\n" << analytic_jac; } ================================================ FILE: tests/dynamics/bicycle_slip_parametric_model_test.cu ================================================ #include #include #include #include #include #include class BicycleSlipParametricTest : public ::testing::Test { public: cudaStream_t stream; void SetUp() override { CudaCheckError(); HANDLE_ERROR(cudaStreamCreate(&stream)); } void TearDown() override { CudaCheckError(); HANDLE_ERROR(cudaStreamDestroy(stream)); } }; TEST_F(BicycleSlipParametricTest, Template) { auto dynamics = BicycleSlipParametric(); EXPECT_EQ(12, BicycleSlipParametric::STATE_DIM); EXPECT_EQ(2, BicycleSlipParametric::CONTROL_DIM); EXPECT_NE(dynamics.getTextureHelper(), nullptr); 
EXPECT_NE(dynamics.getTextureHelperNormals(), nullptr);
}

// A stream passed at construction must be forwarded to both texture helpers.
TEST_F(BicycleSlipParametricTest, BindStream)
{
  auto dynamics = BicycleSlipParametric(stream);
  EXPECT_EQ(dynamics.stream_, stream) << "Stream binding failure.";
  EXPECT_NE(dynamics.getTextureHelper(), nullptr);
  EXPECT_EQ(dynamics.getTextureHelper()->stream_, stream);
  EXPECT_NE(dynamics.getTextureHelperNormals(), nullptr);
  EXPECT_EQ(dynamics.getTextureHelperNormals()->stream_, stream);
}

/*
float c_t = 1.3;
float c_b = 2.5;
float c_v = 3.7;
float c_0 = 4.9;
float wheel_base = 0.3;
*/

TEST_F(BicycleSlipParametricTest, ComputeDynamics)
{
  BicycleSlipParametric dynamics = BicycleSlipParametric();
  auto params = dynamics.getParams();
  BicycleSlipParametric::state_array x = BicycleSlipParametric::state_array::Zero();
  BicycleSlipParametric::control_array u = BicycleSlipParametric::control_array::Zero();
  // computeDynamics should not touch the roll/pitch element
  BicycleSlipParametric::state_array x_dot = BicycleSlipParametric::state_array::Ones() * 0.153;
  dynamics.computeDynamics(x, u, x_dot);
  // Zero state + zero control must produce an all-zero derivative.
  EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0);
  EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0);
  EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0);
  EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0);
  EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0);
  EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), 0);
  EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0);
  EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0);
  EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0);
  EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0);
  EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0);
  x(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)) = 2.0f;
x(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)) = 1.0f; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 2.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 1.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -0.079 * 2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), -6.5); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = M_PI_2; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), -1.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 2.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -0.079 * 2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), -6.5); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, 
OMEGA_Z)) = 0.6f; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), -1.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 2.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0.6); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -0.079 * 2 + 1.0 * 0.6); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), -6.5 - 2 * 0.6 + 0.6 * 1.6); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), -0.6 * 2.2 - 4.2 * 0.6); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = 0.0f; x(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)) = 0.0f; x(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)) = 0.0f; x(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)) = 0.1f; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 2.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0.0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), -0.66); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -0.079 * 2 - 30 * 0.1 * 0.9); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0.0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0.0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, 
ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)) = 0.5f; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0.5f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), -0.66); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -0.079 * 0.5 - 30 * 0.1 * 0.5); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0.0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0.0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)) = -0.5f; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), -0.5f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), -0.66); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), 0.079 * 0.5 + 30 * 0.1 * 0.5); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0.0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0.0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, 
ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)) = -2.0f; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), -2.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), -0.66); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), 0.079 * 2 + 30 * 0.1 * 0.9); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0.0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0.0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)) = 2.0f; x(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)) = 0.0f; x(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)) = 1.0f; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 2.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), -0.6); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -0.079 * 2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0.0); 
EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), (2 / 0.3) * tanf(1.0f / -9.1f) * 2.2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)) = 2.0f; x(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)) = -0.6; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 2.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), -0.6); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), -0.6); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -0.079 * 2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), -0.6 * 1.6 + 2 * 0.6); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), (0.6 + (2 / 0.3) * tanf(1.0f / -9.1f)) * 2.2 + 4.2 * 0.6); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)) = 2.0f; float yaw_rate = (2 / 0.3) * tanf(1.0f / -9.1f); x(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)) = yaw_rate; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 2.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), yaw_rate); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), -0.6); 
EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -0.079 * 2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), yaw_rate * 1.6 - 2 * yaw_rate); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), -4.2 * yaw_rate); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)) = 2.0f; u(C_IND_CLASS(BicycleSlipParametricParams, THROTTLE_BRAKE)) = 1.0; u(C_IND_CLASS(BicycleSlipParametricParams, STEER_CMD)) = 0.3; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 2.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), yaw_rate); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0.3); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0.0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -0.079 * 2 + 4 * -0.0145f + 2 * 0.15 + 2.73); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), yaw_rate * 1.6 - 2 * yaw_rate); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), -4.2 * yaw_rate); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); u(C_IND_CLASS(BicycleSlipParametricParams, THROTTLE_BRAKE)) = 0.3; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 2.0f); 
EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), yaw_rate); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0.3); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0.0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -0.079 * 2 + 0.3 * (4 * -0.0145f + 2 * 0.15 + 2.73)); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), yaw_rate * 1.6 - 2 * yaw_rate); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), -4.2 * yaw_rate); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); } TEST_F(BicycleSlipParametricTest, ComputeDynamicsWithNormals) { BicycleSlipParametric dynamics = BicycleSlipParametric(); auto params = dynamics.getParams(); BicycleSlipParametric::state_array x = BicycleSlipParametric::state_array::Zero(); BicycleSlipParametric::control_array u = BicycleSlipParametric::control_array::Zero(); cudaExtent extent = make_cudaExtent(10, 20, 0); TwoDTextureHelper* helper = dynamics.getTextureHelperNormals(); helper->setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(0.0f, 0.0f, 1.0f, 0.0f); } std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); helper->updateRotation(0, new_rot_mat); helper->updateOrigin(0, make_float3(1, 2, 3)); helper->updateTexture(0, data_vec); helper->updateResolution(0, 10); helper->enableTexture(0); helper->copyToDevice(true); // computeDynamics should not touch the roll/pitch element BicycleSlipParametric::state_array x_dot = 
BicycleSlipParametric::state_array::Ones() * 0.153; // dynamics.computeDynamics(x, u, x_dot); // EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0); // EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0); // EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); // EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); // EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); // EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), 0); // EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0); // EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); // EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); // EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); // EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(0.4f, 0.0f, 1.0 - sqrt(0.4f * 0.4f), 0.0f); } helper->updateTexture(0, data_vec); helper->copyToDevice(true); x = BicycleSlipParametric::state_array::Zero(); x_dot = BicycleSlipParametric::state_array::Ones() * 0.153; u(C_IND_CLASS(BicycleSlipParametricParams, THROTTLE_BRAKE)) = 5.0f; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), 0.4 * 3.9 + 4.02); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0); 
EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); u(C_IND_CLASS(BicycleSlipParametricParams, THROTTLE_BRAKE)) = 0.0f; x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = M_PI_2; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_NEAR(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), 0, 1.0e-7); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0.4 * 7.2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = -M_PI_2; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_NEAR(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), 0, 1.0e-7); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, 
VEL_Y)), -0.4 * 7.2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = M_PI_4; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), sqrtf(2) / 2 * 0.4 * 3.9); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), sqrtf(2) / 2 * 0.4 * 7.2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = -M_PI_4; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), sqrtf(2) / 2 * 0.4 * 3.9); 
EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), -sqrtf(2) / 2 * 0.4 * 7.2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(-0.4f, 0.0f, 1.0 - sqrt(0.4f * 0.4f), 0.0f); } helper->updateTexture(0, data_vec); helper->copyToDevice(true); x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = 0; u(C_IND_CLASS(BicycleSlipParametricParams, THROTTLE_BRAKE)) = 5.0f; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -0.4 * 3.9 + 4.02); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); u(C_IND_CLASS(BicycleSlipParametricParams, THROTTLE_BRAKE)) = 0.0f; x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = M_PI_2; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); 
EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_NEAR(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), 0, 1.0e-7); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), -0.4 * 7.2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = -M_PI_2; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_NEAR(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), 0, 1.0e-7); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0.4 * 7.2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = M_PI_4; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); 
EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -sqrtf(2) / 2 * 0.4 * 3.9); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), -sqrtf(2) / 2 * 0.4 * 7.2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = -M_PI_4; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -sqrtf(2) / 2 * 0.4 * 3.9); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), sqrtf(2) / 2 * 0.4 * 7.2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(0.2f, 0.1f, 1.0 - sqrt(0.2f * 0.2f + 0.1f * 0.1f), 0.0f); } helper->updateTexture(0, data_vec); helper->copyToDevice(true); 
x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = 0; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), 0.2 * 3.9); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0.1 * 7.2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = M_PI_2; float sq2_2 = sqrtf(2) / 2; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), -0.1 * 3.9); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0.2 * 7.2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); 
x(S_IND_CLASS(BicycleSlipParametricParams, YAW)) = -M_PI_2; dynamics.computeDynamics(x, u, x_dot); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.0f); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), 0.1 * 3.9); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), -0.2 * 7.2); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(x_dot(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 0); } TEST_F(BicycleSlipParametricTest, TestModelGPU) { BicycleSlipParametric dynamics = BicycleSlipParametric(); dynamics.GPUSetup(); CudaCheckError(); Eigen::Matrix control_trajectory; control_trajectory = Eigen::Matrix::Random(); Eigen::Matrix state_trajectory; state_trajectory = Eigen::Matrix::Random(); std::vector> s(100); std::vector> s_der(100); // steering, throttle std::vector> u(100); for (int state_index = 0; state_index < s.size(); state_index++) { for (int dim = 0; dim < s[0].size(); dim++) { s[state_index][dim] = state_trajectory.col(state_index)(dim); } for (int dim = 0; dim < u[0].size(); dim++) { u[state_index][dim] = control_trajectory.col(state_index)(dim); } } // These variables will be changed so initialized to the right size only // Run dynamics on dynamicsU // Run dynamics on GPU for (int y_dim = 1; y_dim <= 4; y_dim++) { launchComputeDynamicsTestKernel(dynamics, s, u, s_der, y_dim); for (int point = 0; point < 100; point++) { BicycleSlipParametric::state_array state = 
state_trajectory.col(point); BicycleSlipParametric::control_array control = control_trajectory.col(point); BicycleSlipParametric::state_array state_der_cpu = BicycleSlipParametric::state_array::Zero(); dynamics.computeDynamics(state, control, state_der_cpu); for (int dim = 0; dim < 12; dim++) { EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-5) << "at point " << point << " dim " << dim << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(s_der[point][dim])); } } } dynamics.freeCudaMem(); CudaCheckError(); } TEST_F(BicycleSlipParametricTest, TestModelGPUNormals) { BicycleSlipParametric dynamics = BicycleSlipParametric(); cudaExtent extent = make_cudaExtent(10, 20, 0); TwoDTextureHelper* helper = dynamics.getTextureHelperNormals(); helper->setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(0.2f, 0.1f, 1.0 - sqrt(0.2f * 0.2f + 0.1f * 0.1f), 0.0f); } std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); helper->updateRotation(0, new_rot_mat); helper->updateOrigin(0, make_float3(1, 2, 3)); helper->updateTexture(0, data_vec); helper->updateResolution(0, 10); helper->enableTexture(0); helper->copyToDevice(true); dynamics.GPUSetup(); CudaCheckError(); Eigen::Matrix control_trajectory; control_trajectory = Eigen::Matrix::Random(); Eigen::Matrix state_trajectory; state_trajectory = Eigen::Matrix::Random(); std::vector> s(100); std::vector> s_der(100); // steering, throttle std::vector> u(100); for (int state_index = 0; state_index < s.size(); state_index++) { for (int dim = 0; dim < s[0].size(); dim++) { s[state_index][dim] = state_trajectory.col(state_index)(dim); } for (int dim = 0; dim < u[0].size(); dim++) { u[state_index][dim] = control_trajectory.col(state_index)(dim); } } // These variables will be changed so initialized to the right size only // Run dynamics on dynamicsU // Run dynamics on 
GPU for (int y_dim = 1; y_dim <= 4; y_dim++) { launchComputeDynamicsTestKernel(dynamics, s, u, s_der, y_dim); for (int point = 0; point < 100; point++) { BicycleSlipParametric::state_array state = state_trajectory.col(point); BicycleSlipParametric::control_array control = control_trajectory.col(point); BicycleSlipParametric::state_array state_der_cpu = BicycleSlipParametric::state_array::Zero(); dynamics.computeDynamics(state, control, state_der_cpu); for (int dim = 0; dim < 12; dim++) { EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-5) << "at point " << point << " dim " << dim << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(s_der[point][dim])); } } } dynamics.freeCudaMem(); CudaCheckError(); } TEST_F(BicycleSlipParametricTest, TestModelGPUNormalsReverse) { BicycleSlipParametric dynamics = BicycleSlipParametric(); auto params = dynamics.getParams(); params.gear_sign = -1; dynamics.setParams(params); cudaExtent extent = make_cudaExtent(10, 20, 0); TwoDTextureHelper* helper = dynamics.getTextureHelperNormals(); helper->setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(0.2f, 0.1f, 1.0 - sqrt(0.2f * 0.2f + 0.1f * 0.1f), 0.0f); } std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); helper->updateRotation(0, new_rot_mat); helper->updateOrigin(0, make_float3(1, 2, 3)); helper->updateTexture(0, data_vec); helper->updateResolution(0, 10); helper->enableTexture(0); helper->copyToDevice(true); dynamics.GPUSetup(); CudaCheckError(); Eigen::Matrix control_trajectory; control_trajectory = Eigen::Matrix::Random(); Eigen::Matrix state_trajectory; state_trajectory = Eigen::Matrix::Random(); std::vector> s(100); std::vector> s_der(100); // steering, throttle std::vector> u(100); for (int state_index = 0; state_index < s.size(); state_index++) { for (int dim = 0; dim < s[0].size(); dim++) { 
s[state_index][dim] = state_trajectory.col(state_index)(dim); } for (int dim = 0; dim < u[0].size(); dim++) { u[state_index][dim] = control_trajectory.col(state_index)(dim); } } // These variables will be changed so initialized to the right size only // Run dynamics on dynamicsU // Run dynamics on GPU for (int y_dim = 1; y_dim <= 4; y_dim++) { launchComputeDynamicsTestKernel(dynamics, s, u, s_der, y_dim); for (int point = 0; point < 100; point++) { BicycleSlipParametric::state_array state = state_trajectory.col(point); BicycleSlipParametric::control_array control = control_trajectory.col(point); BicycleSlipParametric::state_array state_der_cpu = BicycleSlipParametric::state_array::Zero(); dynamics.computeDynamics(state, control, state_der_cpu); for (int dim = 0; dim < 12; dim++) { EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-5) << "at point " << point << " dim " << dim << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(s_der[point][dim])); } } } dynamics.freeCudaMem(); CudaCheckError(); } TEST_F(BicycleSlipParametricTest, TestUpdateState) { CudaCheckError(); BicycleSlipParametric dynamics = BicycleSlipParametric(); BicycleSlipParametric::state_array state = BicycleSlipParametric::state_array::Zero(); BicycleSlipParametric::state_array next_state = BicycleSlipParametric::state_array::Ones() * NAN; BicycleSlipParametric::state_array state_der = BicycleSlipParametric::state_array::Ones(); // TODO add in the elevation map dynamics.updateState(state, next_state, state_der, 0.1); EXPECT_FLOAT_EQ(next_state(S_IND_CLASS(BicycleSlipParametricParams, POS_X)), 0.1f); EXPECT_FLOAT_EQ(next_state(S_IND_CLASS(BicycleSlipParametricParams, POS_Y)), 0.1f); EXPECT_NEAR(next_state(S_IND_CLASS(BicycleSlipParametricParams, YAW)), 0.1f, 1.0e-5); EXPECT_FLOAT_EQ(next_state(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE)), 0.1f); EXPECT_FLOAT_EQ(next_state(S_IND_CLASS(BicycleSlipParametricParams, BRAKE_STATE)), 0.1f); 
EXPECT_FLOAT_EQ(next_state(S_IND_CLASS(BicycleSlipParametricParams, VEL_X)), 0.1f); EXPECT_FLOAT_EQ(next_state(S_IND_CLASS(BicycleSlipParametricParams, VEL_Y)), 0.1f); EXPECT_FLOAT_EQ(next_state(S_IND_CLASS(BicycleSlipParametricParams, OMEGA_Z)), 0.1f); EXPECT_FLOAT_EQ(next_state(S_IND_CLASS(BicycleSlipParametricParams, ROLL)), 0); EXPECT_FLOAT_EQ(next_state(S_IND_CLASS(BicycleSlipParametricParams, PITCH)), 0); EXPECT_FLOAT_EQ(next_state(S_IND_CLASS(BicycleSlipParametricParams, STEER_ANGLE_RATE)), 1.0f); CudaCheckError(); } TEST_F(BicycleSlipParametricTest, TestStepGPUvsCPU) { const int num_rollouts = 1; const float dt = 0.1f; CudaCheckError(); BicycleSlipParametric dynamics = BicycleSlipParametric(); cudaExtent extent = make_cudaExtent(10, 20, 0); TwoDTextureHelper* helper = dynamics.getTextureHelper(); helper->setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = i * 1.0f; } std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); helper->updateRotation(0, new_rot_mat); helper->updateOrigin(0, make_float3(1, 2, 3)); helper->updateTexture(0, data_vec); helper->updateResolution(0, 10); helper->enableTexture(0); helper->copyToDevice(true); std::vector normal_data_vec; normal_data_vec.resize(10 * 20); for (int i = 0; i < normal_data_vec.size(); i++) { normal_data_vec[i] = make_float4(0.2f, 0.1f, 1.0 - sqrt(0.2f * 0.2f + 0.1f * 0.1f), 0.0f); } TwoDTextureHelper* normal_helper = dynamics.getTextureHelperNormals(); normal_helper->setExtent(0, extent); normal_helper->updateRotation(0, new_rot_mat); normal_helper->updateOrigin(0, make_float3(1, 2, 3)); normal_helper->updateTexture(0, normal_data_vec); normal_helper->updateResolution(0, 10); normal_helper->enableTexture(0); normal_helper->copyToDevice(true); CudaCheckError(); dynamics.GPUSetup(); CudaCheckError(); Eigen::Matrix control_trajectory; control_trajectory 
= Eigen::Matrix::Random(); Eigen::Matrix state_trajectory; state_trajectory = Eigen::Matrix::Random(); std::vector> s(num_rollouts); std::vector> s_next(num_rollouts); std::vector> s_der(num_rollouts); // steering, throttle std::vector> u(num_rollouts); BicycleSlipParametric::state_array state; BicycleSlipParametric::state_array next_state_cpu; BicycleSlipParametric::control_array control; BicycleSlipParametric::output_array output; BicycleSlipParametric::state_array state_der_cpu = BicycleSlipParametric::state_array::Zero(); // Run dynamics on dynamicsU // Run dynamics on GPU for (int y_dim = 1; y_dim <= 16; y_dim++) { for (int state_index = 0; state_index < num_rollouts; state_index++) { for (int dim = 0; dim < s[0].size(); dim++) { s[state_index][dim] = state_trajectory.col(state_index)(dim); } for (int dim = 0; dim < u[0].size(); dim++) { u[state_index][dim] = control_trajectory.col(state_index)(dim); } } launchStepTestKernel(dynamics, s, u, s_der, s_next, 0, dt, y_dim); for (int point = 0; point < num_rollouts; point++) { state = state_trajectory.col(point); control = control_trajectory.col(point); state_der_cpu = BicycleSlipParametric::state_array::Zero(); dynamics.step(state, next_state_cpu, state_der_cpu, control, output, 0, dt); // -1 for the filler state to get 4 aligned states for (int dim = 0; dim < BicycleSlipParametric::STATE_DIM - 1; dim++) { EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-4) << "at point " << point << " with y_dim " << y_dim << " at dim " << dim; // EXPECT_NEAR(state(dim), s[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; EXPECT_NEAR(next_state_cpu(dim), s_next[point][dim], 1e-4) << "at point " << point << " with y_dim " << y_dim << " at dim " << dim; EXPECT_TRUE(isfinite(s_next[point][dim])); } } } dynamics.freeCudaMem(); } // // TEST_F(BicycleSlipParametricTest, TestStepReverse) // { // CudaCheckError(); // using DYN = BicycleSlipParametric; // const float tol = 1e-6; // DYN dynamics = DYN(); // auto 
params = dynamics.getParams(); // params.c_0 = 0; // params.c_b[0] = 1; // params.c_b[1] = 10; // params.c_b[2] = 100; // params.c_v[0] = 0.25; // params.c_v[1] = 0.5; // params.c_v[2] = 0.75; // params.c_t[0] = 2; // params.c_t[1] = 20; // params.c_t[2] = 200; // params.low_min_throttle = 0.2; // params.steer_command_angle_scale = 0.5; // params.steering_constant = 0.5; // params.wheel_base = 0.5; // params.max_steer_rate = 5; // params.gear_sign = -1; // dynamics.setParams(params); // DYN::state_array state; // DYN::state_array next_state; // DYN::state_array state_der = DYN::state_array::Zero(); // DYN::control_array control; // DYN::output_array output; // float dt = 0.1; // // TODO add in the elevation map // // // Basic initial state and no movement should stay still // state << 0, 0, 0, 0, 0, -0.0, 0.0, 0, 0; // control << 0, 0; // dynamics.step(state, next_state, state_der, control, output, 0, dt); // EXPECT_TRUE(state_der == DYN::state_array::Zero()); // EXPECT_NEAR(next_state(0), 0.0, tol); // EXPECT_NEAR(next_state(1), 0.0, tol); // EXPECT_NEAR(next_state(2), 0.0, tol); // EXPECT_NEAR(next_state(3), 0.0, tol); // EXPECT_NEAR(next_state(4), 0.0, tol); // EXPECT_NEAR(next_state(5), 0.0, tol); // EXPECT_NEAR(next_state(6), 0.0, tol); // EXPECT_NEAR(next_state(7), 0.0, tol); // EXPECT_NEAR(next_state(8), 0.0, tol); // EXPECT_NEAR(output(23), 0.0, tol); // // // Apply full throttle from zero state // state << 0, 0, 0, 0, 0, -0.0, 0.0, 0, 0; // control << 1, 0; // dynamics.step(state, next_state, state_der, control, output, 0, dt); // EXPECT_NEAR(state_der(0), -1.6, tol); // EXPECT_NEAR(next_state(0), -0.16, tol); // EXPECT_NEAR(next_state(1), 0.0, tol); // EXPECT_NEAR(next_state(2), 0.0, tol); // EXPECT_NEAR(next_state(3), 0.0, tol); // EXPECT_NEAR(next_state(4), 0.0, tol); // EXPECT_NEAR(next_state(5), 0.0, tol); // EXPECT_NEAR(next_state(6), 0.0, tol); // EXPECT_NEAR(next_state(7), 0.0, tol); // EXPECT_NEAR(next_state(8), 0.0, tol); // 
EXPECT_NEAR(output(23), -1.6, tol); // // // Apply throttle to a state with positive velocity // state << 1, 0, 0, 0, 0, -0.0, 0.0, 0, 0; // control << 1, 0; // dynamics.step(state, next_state, state_der, control, output, 0, dt); // EXPECT_NEAR(state_der(0), -5.5, tol); // EXPECT_NEAR(next_state(0), 0.45, tol); // EXPECT_NEAR(next_state(1), 0.0, tol); // EXPECT_NEAR(next_state(2), 0.1, tol); // EXPECT_NEAR(next_state(3), 0.0, tol); // EXPECT_NEAR(next_state(4), 0.0, tol); // EXPECT_NEAR(next_state(5), 0.0, tol); // EXPECT_NEAR(next_state(6), 0.0, tol); // EXPECT_NEAR(next_state(7), 0.0, tol); // EXPECT_NEAR(next_state(8), 0.0, tol); // EXPECT_NEAR(output(23), -5.5, tol); // EXPECT_NEAR(output(24), 0.0, tol); // // // Apply full throttle and half left turn to origin state // state << 0, 0, 0, 0, 0, -0.0, 0.0, 0, 0; // control << 1, 0.5; // dynamics.step(state, next_state, state_der, control, output, 0, dt); // EXPECT_NEAR(state_der(0), -1.6, tol); // EXPECT_NEAR(next_state(0), -0.16, tol); // EXPECT_NEAR(next_state(1), 0.0, tol); // EXPECT_NEAR(next_state(2), 0.0, tol); // EXPECT_NEAR(next_state(3), 0.0, tol); // EXPECT_NEAR(next_state(4), powf(0.5, 3) * dt, tol); // EXPECT_NEAR(next_state(5), 0.0, tol); // EXPECT_NEAR(next_state(6), 0.0, tol); // EXPECT_NEAR(next_state(7), 0.0, tol); // EXPECT_NEAR(next_state(8), powf(0.5, 3), tol); // EXPECT_NEAR(output(23), -1.6, tol); // // // Apply full throttle and half left turn to a moving state oriented 30 degrees to the left // float yaw = M_PI / 6; // state << 1.0, yaw, 0, 0, 0, -0.0, 0.0, 0, 0; // control << 1, 0.5; // dynamics.step(state, next_state, state_der, control, output, 0, dt); // EXPECT_NEAR(state_der(0), -5.5, tol); // EXPECT_NEAR(next_state(0), 0.45, tol); // EXPECT_NEAR(next_state(1), yaw, tol); // EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol); // EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol); // EXPECT_NEAR(next_state(4), powf(0.5, 3) * dt, tol); // EXPECT_NEAR(next_state(5), 0.0, tol); // 
EXPECT_NEAR(next_state(6), 0.0, tol); // EXPECT_NEAR(next_state(7), 0.0, tol); // EXPECT_NEAR(next_state(8), powf(0.5, 3), tol); // EXPECT_NEAR(output(23), -5.5, tol); // // // Apply full throttle and half left turn to a moving state oriented 30 degrees to the left which is already // turning float steer_angle = M_PI / 8; state << 1.0, yaw, 0, 0, steer_angle, -0.0, 0.0, 0, 0; control << 1, 0.5; // dynamics.step(state, next_state, state_der, control, output, 0, dt); // EXPECT_NEAR(state_der(0), -5.5, tol); // EXPECT_NEAR(next_state(0), 0.45, tol); // EXPECT_NEAR(next_state(1), yaw + -0.086361105 * dt, tol); // EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol); // EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol); // EXPECT_NEAR(next_state(4), steer_angle + (0.25 - steer_angle) * 0.5 * dt, tol); // EXPECT_NEAR(next_state(5), 0.0, tol); // EXPECT_NEAR(next_state(6), 0.0, tol); // EXPECT_NEAR(next_state(7), 0.0, tol); // EXPECT_NEAR(next_state(8), (0.25 - steer_angle) * 0.5, tol); // EXPECT_NEAR(output(23), -5.5, tol); // // // Apply full brake and half left turn to a moving state oriented 30 degrees to the left which is already turning // state << 1.0, yaw, 0, 0, steer_angle, 1.0, 0, 0.0, 0, 0; // control << -1, 0.5; // dynamics.step(state, next_state, state_der, control, output, 0, dt); // EXPECT_NEAR(state_der(0), -5.5, tol); // EXPECT_NEAR(next_state(0), 1 - 5.5 * dt, tol); // EXPECT_NEAR(next_state(1), yaw + -0.086361105 * dt, tol); // EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol); // EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol); // EXPECT_NEAR(next_state(4), steer_angle + (0.25 - steer_angle) * 0.5 * dt, tol); // EXPECT_NEAR(next_state(5), 1.0, tol); // EXPECT_NEAR(next_state(6), 0.0, tol); // EXPECT_NEAR(next_state(7), 0.0, tol); // EXPECT_NEAR(next_state(8), (0.25 - steer_angle) * 0.5, tol); // EXPECT_NEAR(output(23), -5.5, tol); // // /** // * Apply full brake and half left turn to a moving state oriented 30 degrees to the left which is already 
turning // * and on a downward facing hill // */ // float pitch = 20 * M_PI / 180; // state << 1.0, yaw, 0, 0, steer_angle, 1.0, 0, pitch, 0, 0; // control << -1, 0.5; // dynamics.step(state, next_state, state_der, control, output, 0, dt); // EXPECT_NEAR(next_state(0), 1 + (-5.5 + 9.81 * sinf(pitch)) * dt, tol); // EXPECT_NEAR(next_state(1), yaw + -0.086361105 * dt, tol); // EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol); // EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol); // EXPECT_NEAR(next_state(4), steer_angle + (0.25 - steer_angle) * 0.5 * dt, tol); // EXPECT_NEAR(next_state(5), 1.0, tol); // EXPECT_NEAR(next_state(6), 0.0, tol); // EXPECT_NEAR(next_state(7), 0.0, tol); // EXPECT_NEAR(next_state(8), (0.25 - steer_angle) * 0.5, tol); // EXPECT_NEAR(output(23), (-5.5 + 9.81 * sinf(pitch)), tol); // // /** // * Apply full brake and half left turn to a backwards moving state oriented 30 degrees to the left which is already // * turning and on a downward facing hill // */ // state << -1.0, yaw, 0, 0, steer_angle, 1.0, 0, pitch, 0, 0; // control << -1, 0.5; // dynamics.step(state, next_state, state_der, control, output, 0, dt); // EXPECT_NEAR(next_state(0), -1 + (5.5 + 9.81 * sinf(pitch)) * dt, tol); // EXPECT_NEAR(next_state(1), yaw + 0.086361105 * dt, tol); // EXPECT_NEAR(next_state(2), -1 * cos(yaw) * dt, tol); // EXPECT_NEAR(next_state(3), -1 * sin(yaw) * dt, tol); // EXPECT_NEAR(next_state(4), steer_angle + (0.25 - steer_angle) * 0.5 * dt, tol); // EXPECT_NEAR(next_state(5), 1.0, tol); // EXPECT_NEAR(next_state(6), 0.0, tol); // EXPECT_NEAR(next_state(7), 0.0, tol); // EXPECT_NEAR(next_state(8), (0.25 - steer_angle) * 0.5, tol); // EXPECT_NEAR(output(23), (5.5 + 9.81 * sinf(pitch)), tol); // // /** // * Apply full brake and half right turn to a backwards moving state oriented 30 degrees to the left which is // already // * turning and on a downward facing hill // */ // state << -1.0, yaw, 0, 0, steer_angle, 1.0, 0, pitch, 0, 0; // control << -1, -0.5; // 
dynamics.step(state, next_state, state_der, control, output, 0, dt); // EXPECT_NEAR(next_state(0), -1 + (5.5 + 9.81 * sinf(pitch)) * dt, tol); // EXPECT_NEAR(next_state(1), yaw + 0.086361105 * dt, tol); // EXPECT_NEAR(next_state(2), -1 * cos(yaw) * dt, tol); // EXPECT_NEAR(next_state(3), -1 * sin(yaw) * dt, tol); // EXPECT_NEAR(next_state(4), steer_angle + (-0.25 - steer_angle) * 0.5 * dt, tol); // EXPECT_NEAR(next_state(5), 1.0, tol); // EXPECT_NEAR(next_state(6), 0.0, tol); // EXPECT_NEAR(next_state(7), 0.0, tol); // EXPECT_NEAR(next_state(8), (-0.25 - steer_angle) * 0.5, tol); // EXPECT_NEAR(output(23), (5.5 + 9.81 * sinf(pitch)), tol); // // /** // * Apply full brake and half right turn to a backwards moving state with a huge steering angle to test max steer // * angle and steering rate. We are also on a downward facing hill and are already oriented 30 degrees to the left // */ // steer_angle *= 100; // state << -1.0, yaw, 0, 0, steer_angle, 1.0, 0, pitch, 0, 0; // control << -1, -0.5; // dynamics.step(state, next_state, state_der, control, output, 0, dt); // EXPECT_NEAR(next_state(0), -1 + (5.5 + 9.81 * sinf(pitch)) * dt, tol); // EXPECT_NEAR(next_state(1), yaw + tan(steer_angle / -9.1) * dt * -2, tol); // EXPECT_NEAR(next_state(2), -1 * cos(yaw) * dt, tol); // EXPECT_NEAR(next_state(3), -1 * sin(yaw) * dt, tol); // EXPECT_NEAR(next_state(4), params.max_steer_angle, tol); // EXPECT_NEAR(next_state(5), 1.0, tol); // EXPECT_NEAR(next_state(6), 0.0, tol); // EXPECT_NEAR(next_state(7), 0.0, tol); // EXPECT_NEAR(next_state(8), -params.max_steer_rate, tol); // EXPECT_NEAR(output(23), (5.5 + 9.81 * sinf(pitch)), tol); // } // // TEST_F(BicycleSlipParametricTest, TestStepGPUvsCPUReverse) // { // const int num_rollouts = 2000; // const float dt = 0.1f; // CudaCheckError(); // BicycleSlipParametric dynamics = BicycleSlipParametric(); // auto params = dynamics.getParams(); // params.gear_sign = -1; // dynamics.setParams(params); // // cudaExtent extent = 
make_cudaExtent(10, 20, 0); // TwoDTextureHelper* helper = dynamics.getTextureHelper(); // helper->setExtent(0, extent); // // std::vector data_vec; // data_vec.resize(10 * 20); // for (int i = 0; i < data_vec.size(); i++) // { // data_vec[i] = i * 1.0f; // } // // std::array new_rot_mat{}; // new_rot_mat[0] = make_float3(0, 1, 0); // new_rot_mat[1] = make_float3(1, 0, 0); // new_rot_mat[2] = make_float3(0, 0, 1); // helper->updateRotation(0, new_rot_mat); // helper->updateOrigin(0, make_float3(1, 2, 3)); // // helper->updateTexture(0, data_vec); // helper->updateResolution(0, 10); // helper->enableTexture(0); // helper->copyToDevice(true); // // CudaCheckError(); // dynamics.GPUSetup(); // CudaCheckError(); // // Eigen::Matrix control_trajectory; // control_trajectory = Eigen::Matrix::Random(); // Eigen::Matrix state_trajectory; // state_trajectory = Eigen::Matrix::Random(); // // std::vector> s(num_rollouts); // std::vector> s_next(num_rollouts); // std::vector> s_der(num_rollouts); // // steering, throttle // std::vector> u(num_rollouts); // // BicycleSlipParametric::state_array state; // BicycleSlipParametric::state_array next_state_cpu; // BicycleSlipParametric::control_array control; // BicycleSlipParametric::output_array output; // BicycleSlipParametric::state_array state_der_cpu = BicycleSlipParametric::state_array::Zero(); // // // Run dynamics on dynamicsU // // Run dynamics on GPU // for (int y_dim = 1; y_dim <= 16; y_dim++) // { // for (int state_index = 0; state_index < num_rollouts; state_index++) // { // for (int dim = 0; dim < s[0].size(); dim++) // { // s[state_index][dim] = state_trajectory.col(state_index)(dim); // } // for (int dim = 0; dim < u[0].size(); dim++) // { // u[state_index][dim] = control_trajectory.col(state_index)(dim); // } // } // // launchStepTestKernel(dynamics, s, u, s_der, s_next, 0, dt, y_dim); // for (int point = 0; point < num_rollouts; point++) // { // state = state_trajectory.col(point); // control = 
control_trajectory.col(point); // state_der_cpu = BicycleSlipParametric::state_array::Zero(); // // dynamics.step(state, next_state_cpu, state_der_cpu, control, output, 0, dt); // for (int dim = 0; dim < BicycleSlipParametric::STATE_DIM; dim++) // { // EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; // // EXPECT_NEAR(state(dim), s[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; // EXPECT_NEAR(next_state_cpu(dim), s_next[point][dim], 1e-4) << "at index " << point << " with y_dim " << // y_dim; EXPECT_TRUE(isfinite(s_next[point][dim])); // } // } // } // dynamics.freeCudaMem(); // } // // TEST_F(BicycleSlipParametricTest, ComputeStateTrajectoryFiniteTest) // { // BicycleSlipParametric dynamics = BicycleSlipParametric(); // using PARAMS = BicycleSlipParametric::DYN_PARAMS_T; // PARAMS params; // params.c_t[0] = 3.0; // params.c_b[0] = 0.2; // params.c_v[0] = 0.2; // params.c_0 = 0.2; // params.wheel_base = 3.0; // params.steering_constant = 1.0; // dynamics.setParams(params); // // Eigen::Matrix control_trajectory; // control_trajectory = Eigen::Matrix::Zero(); // Eigen::Matrix state_trajectory; // state_trajectory = Eigen::Matrix::Zero(); // BicycleSlipParametric::state_array state_der; // BicycleSlipParametric::state_array x, x_next; // BicycleSlipParametric::output_array output; // x << 0, 1.46919e-6, 0.0140179, 1.09739e-8, -0.000735827; // // for (int i = 0; i < 500; i++) // { // BicycleSlipParametric::control_array u = control_trajectory.col(i); // dynamics.step(x, x_next, state_der, u, output, i, 0.02); // dynamics.computeDynamics(x, u, state_der); // EXPECT_TRUE(x.allFinite()); // EXPECT_TRUE(x_next.allFinite()); // EXPECT_TRUE(state_der.allFinite()); // EXPECT_TRUE(u.allFinite()); // EXPECT_TRUE(state_der != BicycleSlipParametric::state_array::Zero()); // x = x_next; // } // params.steering_constant = 0.5; // dynamics.setParams(params); // // x << 0, 1.46919e-6, 0.0140179, 
1.09739e-8, -1.0; // for (int i = 0; i < 500; i++) // { // BicycleSlipParametric::control_array u = control_trajectory.col(i); // dynamics.step(x, x_next, state_der, u, output, i, 0.02); // dynamics.computeDynamics(x, u, state_der); // EXPECT_TRUE(x.allFinite()); // EXPECT_TRUE(x_next.allFinite()); // EXPECT_TRUE(state_der.allFinite()); // EXPECT_TRUE(u.allFinite()); // EXPECT_TRUE(state_der != BicycleSlipParametric::state_array::Zero()); // x = x_next; // } // } // // // class LinearDummy : public BicycleSlipParametric { // // public: // // bool computeGrad(const Eigen::Ref & state, // // const Eigen::Ref& control, // // Eigen::Ref A, // // Eigen::Ref B) { // // return false; // // }; // // }; // // // TEST_F(BicycleSlipParametricTest, TestComputeGradComputation) { // // Eigen::Matrix // // numeric_jac; // // Eigen::Matrix // // analytic_jac; // // // // const int num_rollouts = 100; // // Eigen::Matrix control_trajectory; // // control_trajectory = Eigen::Matrix::Random(); // // Eigen::Matrix state_trajectory; // // state_trajectory = Eigen::Matrix::Random(); // // // // // TODO properly scale the random values // // // // auto analytic_grad_model = BicycleSlipParametric(); // // // // BicycleSlipParametric::dfdx A_analytic = BicycleSlipParametric::dfdx::Zero(); // // BicycleSlipParametric::dfdu B_analytic = BicycleSlipParametric::dfdu::Zero(); // // // // auto numerical_grad_model = LinearDummy(); // // // // std::shared_ptr> ddp_model = // // std::make_shared>(&numerical_grad_model); // // // // auto params = analytic_grad_model.getParams(); // // // params.c_t[0] = 3.0; // // // params.c_b[0] = 0.2; // // // params.c_v[0] = 0.2; // // params.c_0 = 0.0; // // params.wheel_base = 3.0; // // params.steering_constant = 1.1; // // params.low_min_throttle = 0.0; // // params.max_brake_rate_pos = 10.0; // // analytic_grad_model.setParams(params); // // numerical_grad_model.setParams(params); // // // // auto limits = analytic_grad_model.getControlRanges(); // // 
limits[0].x = -0.3; // // limits[0].y = 1.0; // // limits[1].x = -1.0; // // limits[1].y = 1.0; // // analytic_grad_model.setControlRanges(limits); // // numerical_grad_model.setControlRanges(limits); // // // // state_trajectory.col(0) << 5, M_PI_4, 1, 1, 3, 0, 0, 0, 2; // // control_trajectory.col(0) << 0.5, 1; // // // // state_trajectory.col(1) << 5, M_PI_4, 1, 1, 3, 0, 0, 0, 2; // // control_trajectory.col(1) << -1.0, 1; // // // // // state_trajectory.col(5) = state_trajectory.col(5).cwiseAbs(); // // // // for (int i = 0; i < num_rollouts; i++) // // { // // BicycleSlipParametric::state_array state = state_trajectory.col(i); // // BicycleSlipParametric::control_array control = control_trajectory.col(i); // // // // if (abs(state(0)) < 1) // // { // // state(0) = state(0) * 10; // // state(1) = state(1) * M_PI; // // state(2) = state(2) * 100; // // state(3) = state(3) * 100; // // state(4) = state(4) * 5; // // state(6) = state(6) * M_PI_2; // // state(7) = state(7) * M_PI_2; // // state(8) = state(8) * 10; // // } // // state(5) = min(abs(state(5) / 0.33f), 0.3f); // // // // // std::cout << "iteration " << i << std::endl; // // // std::cout << "state: " << state.transpose() << std::endl; // // // std::cout << "control: " << control.transpose() << std::endl; // // // // bool analytic_grad = analytic_grad_model.computeGrad(state, control, A_analytic, B_analytic); // // EXPECT_TRUE(analytic_grad); // // // // analytic_jac.leftCols() = A_analytic; // // analytic_jac.rightCols() = B_analytic; // // numeric_jac = ddp_model->df(state, control); // // // // EXPECT_LT((numeric_jac - analytic_jac).norm(), 5e-2) // // << "at index " << i << "\nstate: " << state.transpose() << "\ncontrol " << control.transpose() // // << "\nNumeric Jacobian\n" // // << numeric_jac << "\nAnalytic Jacobian\n" // // << analytic_jac; // // } // // } ================================================ FILE: tests/dynamics/cartpole_dynamics_tests.cu 
================================================ // // Created by mgandhi3 on 10/4/19. // #include #include #include #include #include #include TEST(CartPole, StateDim) { auto CP = CartpoleDynamics(1, 1, 1); EXPECT_EQ(4, CartpoleDynamics::STATE_DIM); } TEST(CartPole, ControlDim) { auto CP = CartpoleDynamics(1, 1, 1); EXPECT_EQ(1, CartpoleDynamics::CONTROL_DIM); } TEST(CartPole, Equilibrium) { auto CP = CartpoleDynamics(1, 1, 1); CartpoleDynamics::state_array state; state << 0, 0, 0, 0; CartpoleDynamics::control_array control; control << 0; CartpoleDynamics::state_array state_dot_compute; state_dot_compute << 1, 1, 1, 1; CartpoleDynamics::state_array state_dot_known; state_dot_known << 0, 0, 0, 0; CP.computeDynamics(state, control, state_dot_compute); for (int i = 0; i < CartpoleDynamics::STATE_DIM; i++) { EXPECT_NEAR(state_dot_known(i), state_dot_compute(i), 1e-4) << "Failed at index: " << i; } } TEST(CartPole, BindStream) { cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); auto CP = CartpoleDynamics(1, 1, 2, stream); EXPECT_EQ(CP.stream_, stream) << "Stream binding failure."; HANDLE_ERROR(cudaStreamDestroy(stream)); } TEST(CartPole, SetGetParamsHost) { auto params = CartpoleDynamicsParams(2.0, 3.0, 4.0); auto CP = CartpoleDynamics(1, 1, 2); CP.setParams(params); auto CP_params = CP.getParams(); EXPECT_FLOAT_EQ(params.cart_mass, CP_params.cart_mass); EXPECT_FLOAT_EQ(params.pole_mass, CP_params.pole_mass); EXPECT_FLOAT_EQ(params.pole_length, CP_params.pole_length); } TEST(CartPole, CartPole_GPUSetup_Test) { auto CP_host = new CartpoleDynamics(1, 2, 2); CP_host->GPUSetup(); // float mass; // launchParameterTestKernel(*CP_host, mass); EXPECT_TRUE(CP_host->GPUMemStatus_); delete (CP_host); } TEST(CartPole, GetCartMassFromGPU) { auto CP_host = new CartpoleDynamics(1, 1, 2); CP_host->GPUSetup(); auto params = CartpoleDynamicsParams(2.0, 3.0, 4.0); CP_host->setParams(params); float mass; launchCartMassTestKernel(*CP_host, mass); 
EXPECT_FLOAT_EQ(params.cart_mass, mass); CP_host->freeCudaMem(); delete (CP_host); } TEST(CartPole, GetPoleMassFromGPU) { auto CP_host = new CartpoleDynamics(1, 1, 2); CP_host->GPUSetup(); auto params = CartpoleDynamicsParams(2.0, 3.0, 4.0); CP_host->setParams(params); float mass; launchPoleMassTestKernel(*CP_host, mass); EXPECT_FLOAT_EQ(params.pole_mass, mass); CP_host->freeCudaMem(); delete (CP_host); } TEST(CartPole, GetPoleLengthFromGPU) { auto CP_host = new CartpoleDynamics(1, 1, 2); CP_host->GPUSetup(); auto params = CartpoleDynamicsParams(2.0, 3.0, 4.0); CP_host->setParams(params); float length; launchPoleLengthTestKernel(*CP_host, length); EXPECT_FLOAT_EQ(params.pole_length, length); CP_host->freeCudaMem(); delete (CP_host); } TEST(CartPole, GetGravityFromGPU) { auto CP_host = new CartpoleDynamics(1, 1, 2); CP_host->GPUSetup(); auto params = CartpoleDynamicsParams(2.0, 3.0, 4.0); CP_host->setParams(params); float gravity_gpu; launchGravityTestKernel(*CP_host, gravity_gpu); EXPECT_FLOAT_EQ(CP_host->getGravity(), gravity_gpu); CP_host->freeCudaMem(); delete (CP_host); } TEST(CartPole, TestDynamicsGPU) { auto CP_host = new CartpoleDynamics(1, 1, 2); CP_host->GPUSetup(); auto params = CartpoleDynamicsParams(2.0, 3.0, 4.0); CP_host->setParams(params); CartpoleDynamics::state_array state; state(0) = 0.1; state(1) = 0.3; state(2) = 0.23; state(3) = 0.334; CartpoleDynamics::control_array control; control(0) = 0.654; // These variables will be changed so initialized to the right size only CartpoleDynamics::state_array state_der_cpu = CartpoleDynamics::state_array::Zero(); float state_der_gpu[CartpoleDynamics::STATE_DIM]; // Run dynamics on CPU CP_host->computeDynamics(state, control, state_der_cpu); // Run dynamics on GPU launchDynamicsTestKernel(*CP_host, state.data(), control.data(), state_der_gpu); // Compare CPU and GPU Results for (int i = 0; i < CartpoleDynamics::STATE_DIM; i++) { EXPECT_FLOAT_EQ(state_der_cpu(i), state_der_gpu[i]); } CP_host->freeCudaMem(); 
delete (CP_host); } class CartpoleDummy : public CartpoleDynamics { public: CartpoleDummy(float cartMass, float poleMass, float poleLength, cudaStream_t stream = 0) : CartpoleDynamics(cartMass, poleMass, poleLength, stream){}; bool computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B) { return false; }; }; TEST(CartPole, TestComputeGradComputation) { Eigen::Matrix numeric_jac; Eigen::Matrix analytic_jac; CartpoleDynamics::state_array state; state << 1, 2, 3, 4; CartpoleDynamics::control_array control; control << 5; auto analytic_grad_model = CartpoleDynamics(1, 1, 1); CartpoleDynamics::dfdx A_analytic = CartpoleDynamics::dfdx::Zero(); CartpoleDynamics::dfdu B_analytic = CartpoleDynamics::dfdu::Zero(); analytic_grad_model.computeGrad(state, control, A_analytic, B_analytic); auto numerical_grad_model = CartpoleDummy(1, 1, 1); std::shared_ptr> ddp_model = std::make_shared>(&numerical_grad_model); analytic_jac.leftCols() = A_analytic; analytic_jac.rightCols() = B_analytic; numeric_jac = ddp_model->df(state, control); ASSERT_LT((numeric_jac - analytic_jac).norm(), 1e-3) << "Numeric Jacobian\n" << numeric_jac << "\nAnalytic Jacobian\n" << analytic_jac; } ================================================ FILE: tests/dynamics/dubins_dynamics_tests.cu ================================================ #include #include #include #include #include #include TEST(DubinsDynamics, Template) { auto dynamics = DubinsDynamics(); EXPECT_EQ(3, DubinsDynamics::STATE_DIM); EXPECT_EQ(2, DubinsDynamics::CONTROL_DIM); } TEST(DubinsDynamics, BindStream) { cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); auto dynamics = DubinsDynamics(stream); EXPECT_EQ(dynamics.stream_, stream) << "Stream binding failure."; HANDLE_ERROR(cudaStreamDestroy(stream)); } TEST(DubinsDynamics, ComputeDynamics) { DubinsDynamics dynamics = DubinsDynamics(); DubinsDynamics::state_array x; x << 0, 0, 0; DubinsDynamics::control_array u; u << 0, 0; 
DubinsDynamics::state_array next_x; dynamics.computeDynamics(x, u, next_x); EXPECT_FLOAT_EQ(next_x(0), 0); EXPECT_FLOAT_EQ(next_x(1), 0); EXPECT_FLOAT_EQ(next_x(2), 0); x << 1, 2, 0; u << 1, 0; dynamics.computeDynamics(x, u, next_x); EXPECT_FLOAT_EQ(next_x(0), 1); EXPECT_FLOAT_EQ(next_x(1), 0); EXPECT_FLOAT_EQ(next_x(2), 0); x << 1, 2, 0; u << 3, 1; dynamics.computeDynamics(x, u, next_x); EXPECT_FLOAT_EQ(next_x(0), 3); EXPECT_FLOAT_EQ(next_x(1), 0); EXPECT_FLOAT_EQ(next_x(2), 1); x << 1, 2, M_PI_2; u << 4, 1; dynamics.computeDynamics(x, u, next_x); EXPECT_NEAR(next_x(0), 0.0, 1e-5); EXPECT_FLOAT_EQ(next_x(1), 4 * sin(M_PI_2)); EXPECT_FLOAT_EQ(next_x(2), 1); // TODO test case for flipping across angle discontinuity x << 1, 2, M_PI_2; u << 4, 1; dynamics.computeDynamics(x, u, next_x); EXPECT_NEAR(next_x(0), 0.0, 1e-5); EXPECT_FLOAT_EQ(next_x(1), 4 * sin(M_PI_2)); EXPECT_FLOAT_EQ(next_x(2), 1); } TEST(DubinsDynamics, TestDynamicsGPU) { DubinsDynamics dynamics = DubinsDynamics(); dynamics.GPUSetup(); DubinsDynamics::state_array state; state(0) = 0.5; state(1) = 0.7; state(2) = M_PI_4; DubinsDynamics::control_array control; control(0) = 3.0; control(1) = 2.0; std::vector> s(1); for (int dim = 0; dim < 3; dim++) { s[0][dim] = state(dim); } std::vector> s_der(1); // steering, throttle std::vector> u(1); for (int dim = 0; dim < 2; dim++) { u[0][dim] = control(dim); } // These variables will be changed so initialized to the right size only DubinsDynamics::state_array state_der_cpu = DubinsDynamics::state_array::Zero(); // Run dynamics on dynamicsU dynamics.computeDynamics(state, control, state_der_cpu); // Run dynamics on GPU for (int y_dim = 1; y_dim <= 3; y_dim++) { launchComputeDynamicsTestKernel(dynamics, s, u, s_der, y_dim); for (int dim = 0; dim < 3; dim++) { EXPECT_FLOAT_EQ(state_der_cpu(dim), s_der[0][dim]); } } dynamics.freeCudaMem(); } TEST(DubinsDynamics, TestUpdateStateGPU) { DubinsDynamics dynamics = DubinsDynamics(); dynamics.GPUSetup(); 
DubinsDynamics::state_array state; state(0) = 0.5; state(1) = 0.7; state(2) = M_PI; DubinsDynamics::control_array control; control(0) = 3.0; control(1) = 2.0; std::vector> s(1); for (int dim = 0; dim < 3; dim++) { s[0][dim] = state(dim); } std::vector> s_der(1); // steering, throttle std::vector> u(1); for (int dim = 0; dim < 2; dim++) { u[0][dim] = control(dim); } // These variables will be changed so initialized to the right size only DubinsDynamics::state_array state_der_cpu = DubinsDynamics::state_array::Zero(); // Run dynamics on dynamicsU dynamics.computeDynamics(state, control, state_der_cpu); dynamics.updateState(state, state_der_cpu, 0.1f); // Run dynamics on GPU for (int y_dim = 1; y_dim <= 3; y_dim++) { launchComputeStateDerivTestKernel(dynamics, s, u, s_der, y_dim); launchUpdateStateTestKernel(dynamics, s, s_der, 0.1f, y_dim); for (int dim = 0; dim < 3; dim++) { EXPECT_FLOAT_EQ(state_der_cpu(dim), s_der[0][dim]); } } dynamics.freeCudaMem(); } class DubinsDummy : public DubinsDynamics { public: bool computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B) { return false; }; }; TEST(DubinsDynamics, TestComputeGradComputation) { Eigen::Matrix numeric_jac; Eigen::Matrix analytic_jac; DubinsDynamics::state_array state; state << 1, 2, 3; DubinsDynamics::control_array control; control << 5; auto analytic_grad_model = DubinsDynamics(); DubinsDynamics::dfdx A_analytic = DubinsDynamics::dfdx::Zero(); DubinsDynamics::dfdu B_analytic = DubinsDynamics::dfdu::Zero(); analytic_grad_model.computeGrad(state, control, A_analytic, B_analytic); auto numerical_grad_model = DubinsDummy(); std::shared_ptr> ddp_model = std::make_shared>(&numerical_grad_model); analytic_jac.leftCols() = A_analytic; analytic_jac.rightCols() = B_analytic; numeric_jac = ddp_model->df(state, control); ASSERT_LT((numeric_jac - analytic_jac).norm(), 1e-3) << "Numeric Jacobian\n" << numeric_jac << "\nAnalytic Jacobian\n" << analytic_jac; } 
================================================ FILE: tests/dynamics/dynamics_generic_tests.cu ================================================ #include #include #include #include #include template struct DynamicsTesterParams : public DynamicsParams { enum class StateIndex : int { NUM_STATES = STATE_DIM }; enum class ControlIndex : int { NUM_CONTROLS = CONTROL_DIM }; enum class OutputIndex : int { NUM_OUTPUTS = OUTPUT_DIM }; int var_1 = 1; int var_2 = 2; float4 var_4; }; template class DynamicsTester : public MPPI_internal::Dynamics, DynamicsTesterParams> { public: typedef MPPI_internal::Dynamics, DynamicsTesterParams> PARENT_CLASS; using state_array = typename PARENT_CLASS::state_array; using control_array = typename PARENT_CLASS::control_array; DynamicsTester(cudaStream_t stream = 0) : PARENT_CLASS(stream) { } DynamicsTester(DynamicsTesterParams& params, cudaStream_t stream = 0) : PARENT_CLASS(params, stream) { } DynamicsTester(std::array control_rngs, cudaStream_t stream = 0) : PARENT_CLASS(control_rngs, stream) { } void computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der) { state_der(1) = control(0); } void computeKinematics(const Eigen::Ref& state, Eigen::Ref s_der) { s_der(0) = state(0) + state(1); }; // TODO must be properly parallelized __device__ void computeDynamics(float* state, float* control, float* state_der, float* theta_s = nullptr) { state_der[1] = control[0]; } // TODO must be properly parallelized __device__ void computeKinematics(float* state, float* state_der) { state_der[0] = state[0] + state[1]; } state_array stateFromMap(const std::map& map) { return state_array::Zero(); } }; TEST(Dynamics, BindStream) { cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); auto tester = DynamicsTester<>(stream); EXPECT_EQ(tester.stream_, stream) << "Stream binding failure."; HANDLE_ERROR(cudaStreamDestroy(stream)); HANDLE_ERROR(cudaStreamCreate(&stream)); std::array tester_ranges{}; tester = 
DynamicsTester<>(tester_ranges, stream); EXPECT_EQ(tester.stream_, stream) << "Stream binding failure."; HANDLE_ERROR(cudaStreamDestroy(stream)); } TEST(Dynamics, GPUSetupAndCudaFree) { DynamicsTester<> tester; EXPECT_EQ(tester.model_d_, nullptr); EXPECT_EQ(tester.GPUMemStatus_, false); tester.GPUSetup(); EXPECT_NE(tester.model_d_, nullptr); EXPECT_EQ(tester.GPUMemStatus_, true); tester.freeCudaMem(); EXPECT_EQ(tester.model_d_, nullptr); EXPECT_EQ(tester.GPUMemStatus_, false); } TEST(Dynamics, setParamsCPU) { DynamicsTester<> tester; DynamicsTesterParams<> params_result = tester.getParams(); EXPECT_EQ(params_result.var_1, 1); EXPECT_EQ(params_result.var_2, 2); DynamicsTesterParams<> params; params.var_1 = 10; params.var_2 = 20; params.var_4.x = 1.5; params.var_4.y = 2.5; params.var_4.z = 3.5; params.var_4.w = 4.5; tester.setParams(params); params_result = tester.getParams(); EXPECT_EQ(params_result.var_1, params.var_1); EXPECT_EQ(params_result.var_2, params.var_2); EXPECT_EQ(params_result.var_4.x, params.var_4.x); EXPECT_EQ(params_result.var_4.y, params.var_4.y); EXPECT_EQ(params_result.var_4.z, params.var_4.z); EXPECT_EQ(params_result.var_4.w, params.var_4.w); // Test params constructor params.var_4.z = 1.5; DynamicsTester<> tester_2 = DynamicsTester<>(params); params_result = tester_2.getParams(); EXPECT_EQ(params_result.var_1, params.var_1); EXPECT_EQ(params_result.var_2, params.var_2); EXPECT_EQ(params_result.var_4.x, params.var_4.x); EXPECT_EQ(params_result.var_4.y, params.var_4.y); EXPECT_EQ(params_result.var_4.z, params.var_4.z); EXPECT_EQ(params_result.var_4.w, params.var_4.w); } TEST(Dynamics, setParamsGPU) { DynamicsTester<> tester; tester.GPUSetup(); DynamicsTesterParams<> params_result = tester.getParams(); EXPECT_EQ(params_result.var_1, 1); EXPECT_EQ(params_result.var_2, 2); DynamicsTesterParams<> params; params.var_1 = 10; params.var_2 = 20; params.var_4.x = 1.5; params.var_4.y = 2.5; params.var_4.z = 3.5; params.var_4.w = 4.5; 
tester.setParams(params); launchParameterTestKernel, DynamicsTesterParams<>>(tester, params_result); EXPECT_EQ(params_result.var_1, params.var_1); EXPECT_EQ(params_result.var_2, params.var_2); EXPECT_EQ(params_result.var_4.x, params.var_4.x); EXPECT_EQ(params_result.var_4.y, params.var_4.y); EXPECT_EQ(params_result.var_4.z, params.var_4.z); EXPECT_EQ(params_result.var_4.w, params.var_4.w); // Test params constructor params.var_4.z = 1.5; DynamicsTester<> tester_2 = DynamicsTester<>(params); tester_2.GPUSetup(); launchParameterTestKernel, DynamicsTesterParams<>>(tester_2, params_result); EXPECT_EQ(params_result.var_1, params.var_1); EXPECT_EQ(params_result.var_2, params.var_2); EXPECT_EQ(params_result.var_4.x, params.var_4.x); EXPECT_EQ(params_result.var_4.y, params.var_4.y); EXPECT_EQ(params_result.var_4.z, params.var_4.z); EXPECT_EQ(params_result.var_4.w, params.var_4.w); } TEST(Dynamics, ClassConstants) { DynamicsTester<> tester; EXPECT_EQ(tester.STATE_DIM, 1); EXPECT_EQ(tester.CONTROL_DIM, 1); EXPECT_EQ(tester.getGrdSharedSizeBytes(), 0); EXPECT_EQ(tester.getBlkSharedSizeBytes(), 0); DynamicsTester<56, 65> tester_2; int state_dim = DynamicsTester<56, 65>::STATE_DIM; EXPECT_EQ(state_dim, 56); int control_dim = DynamicsTester<56, 65>::CONTROL_DIM; EXPECT_EQ(control_dim, 65); EXPECT_EQ(tester_2.getGrdSharedSizeBytes(), 0); EXPECT_EQ(tester_2.getBlkSharedSizeBytes(), 0); } TEST(Dynamics, SetControlRangesDefault) { DynamicsTester<> tester; auto ranges = tester.getControlRanges(); EXPECT_FLOAT_EQ(ranges[0].x, -FLT_MAX); EXPECT_FLOAT_EQ(ranges[0].y, FLT_MAX); DynamicsTester<4, 2> tester_2; auto ranges_2 = tester.getControlRanges(); for (int i = 0; i < ranges_2.size(); i++) { EXPECT_FLOAT_EQ(ranges_2[i].x, -FLT_MAX); EXPECT_FLOAT_EQ(ranges_2[i].y, FLT_MAX); } auto params3 = tester.getParams(); DynamicsTester<> tester_3 = DynamicsTester<>(params3); auto ranges_3 = tester_3.getControlRanges(); EXPECT_FLOAT_EQ(ranges_3[0].x, -FLT_MAX); EXPECT_FLOAT_EQ(ranges_3[0].y, 
FLT_MAX); auto params4 = tester_2.getParams(); DynamicsTester<4, 2> tester_4 = DynamicsTester<4, 2>(params4); auto ranges_4 = tester_4.getControlRanges(); for (int i = 0; i < ranges_4.size(); i++) { EXPECT_FLOAT_EQ(ranges_4[i].x, -FLT_MAX); EXPECT_FLOAT_EQ(ranges_4[i].y, FLT_MAX); } } TEST(Dynamics, SetControlRanges) { std::array tester_ranges{}; tester_ranges[0].x = -2; tester_ranges[0].y = 5; DynamicsTester<> tester(tester_ranges); auto ranges = tester.getControlRanges(); EXPECT_FLOAT_EQ(ranges[0].x, -2); EXPECT_FLOAT_EQ(ranges[0].y, 5); tester_ranges[0].x = -5; tester_ranges[0].y = 6; tester.setControlRanges(tester_ranges); ranges = tester.getControlRanges(); EXPECT_FLOAT_EQ(ranges[0].x, tester_ranges[0].x) << "failed at index: " << 0; EXPECT_FLOAT_EQ(ranges[0].y, tester_ranges[0].y) << "failed at index: " << 0; std::array tester_2_ranges{}; tester_ranges[0].x = -3; tester_ranges[0].y = 8; tester_ranges[1].x = -11; tester_ranges[1].y = 23; DynamicsTester<4, 2> tester_2(tester_2_ranges); auto ranges_2 = tester_2.getControlRanges(); for (int i = 0; i < ranges_2.size(); i++) { EXPECT_FLOAT_EQ(ranges_2[i].x, tester_2_ranges[i].x) << "failed at index: " << i; EXPECT_FLOAT_EQ(ranges_2[i].y, tester_2_ranges[i].y) << "failed at index: " << i; } } TEST(Dynamics, SetControlRangesGPU) { std::array tester_ranges{}; tester_ranges[0].x = -2; tester_ranges[0].y = 5; DynamicsTester<> tester(tester_ranges); tester.GPUSetup(); std::array ranges_result = {}; launchControlRangesTestKernel, 1>(tester, ranges_result); EXPECT_FLOAT_EQ(ranges_result[0].x, -2); EXPECT_FLOAT_EQ(ranges_result[0].y, 5); std::array tester_2_ranges{}; tester_2_ranges[0].x = -5; tester_2_ranges[0].y = 6; tester_2_ranges[1].x = -10; tester_2_ranges[1].y = 20; DynamicsTester<4, 2> tester_2(tester_2_ranges); tester_2.GPUSetup(); std::array ranges_result_2 = {}; launchControlRangesTestKernel, 2>(tester_2, ranges_result_2); for (int i = 0; i < ranges_result_2.size(); i++) { EXPECT_FLOAT_EQ(ranges_result_2[i].x, 
tester_2_ranges[i].x) << "failed at index: " << i;
    EXPECT_FLOAT_EQ(ranges_result_2[i].y, tester_2_ranges[i].y) << "failed at index: " << i;
  }
}

// enforceConstraints() must clamp each control dimension into its [min, max] range and
// leave already-in-range controls (and the state) untouched.
TEST(Dynamics, enforceConstraintsCPU)
{
  std::array<float2, 1> tester_ranges{};
  tester_ranges[0].x = -2;
  tester_ranges[0].y = 5;
  DynamicsTester<> tester(tester_ranges);
  DynamicsTester<>::state_array s(1, 1);
  DynamicsTester<>::control_array u(1, 1);
  u(0) = 100;
  tester.enforceConstraints(s, u);
  EXPECT_FLOAT_EQ(u(0), 5);  // clamped to upper bound
  u(0) = -42178;
  tester.enforceConstraints(s, u);
  EXPECT_FLOAT_EQ(u(0), -2);  // clamped to lower bound
  u(0) = 2;
  tester.enforceConstraints(s, u);
  EXPECT_FLOAT_EQ(u(0), 2);  // in range: unchanged
  u(0) = -1.5;
  tester.enforceConstraints(s, u);
  EXPECT_FLOAT_EQ(u(0), -1.5);  // in range: unchanged
}

// GPU version of the clamping test: four state/control pairs run through the
// device-side enforceConstraints() for several thread-block y dimensions.
TEST(Dynamics, enforceConstraintsGPU)
{
  std::array<float2, 3> tester_ranges{};
  tester_ranges[0].x = -2;
  tester_ranges[0].y = 5;
  tester_ranges[1].x = -6;
  tester_ranges[1].y = 8;
  tester_ranges[2].x = -11;
  tester_ranges[2].y = 16;
  DynamicsTester<1, 3> tester(tester_ranges);
  tester.GPUSetup();

  std::vector<std::array<float, 1>> states(4);
  std::vector<std::array<float, 3>> controls(4);
  states[0][0] = 48;
  states[1][0] = 4518;
  states[2][0] = 451;
  states[3][0] = 4516;
  controls[0][0] = 48;
  controls[0][1] = 48;
  controls[0][2] = 48;
  controls[1][0] = -51;
  controls[1][1] = -51;
  controls[1][2] = -51;
  controls[2][0] = 2;
  controls[2][1] = 2;
  controls[2][2] = 2;
  controls[3][0] = -1.5;
  controls[3][1] = -1.5;
  controls[3][2] = -1.5;
  // try a bunch of different y dim
  for (int j = 1; j < 6; j++)
  {
    // re-seed the inputs since the kernel clamps them in place
    states[0][0] = 48;
    states[1][0] = 4518;
    states[2][0] = 451;
    states[3][0] = 4516;
    controls[0][0] = 48;
    controls[0][1] = 48;
    controls[0][2] = 48;
    controls[1][0] = -51;
    controls[1][1] = -51;
    controls[1][2] = -51;
    controls[2][0] = 2;
    controls[2][1] = 2;
    controls[2][2] = 2;
    controls[3][0] = -1.5;
    controls[3][1] = -1.5;
    controls[3][2] = -1.5;
    launchEnforceConstraintTestKernel<DynamicsTester<1, 3>, 1, 3>(tester, states, controls, j);
    EXPECT_FLOAT_EQ(controls[0][0], 5);
    EXPECT_FLOAT_EQ(controls[0][1], 8);
    EXPECT_FLOAT_EQ(controls[0][2], 16);
    EXPECT_FLOAT_EQ(controls[1][0], -2);
    EXPECT_FLOAT_EQ(controls[1][1], -6);
EXPECT_FLOAT_EQ(controls[1][2], -11); EXPECT_FLOAT_EQ(controls[2][0], 2); EXPECT_FLOAT_EQ(controls[2][1], 2); EXPECT_FLOAT_EQ(controls[2][2], 2); EXPECT_FLOAT_EQ(controls[3][0], -1.5); EXPECT_FLOAT_EQ(controls[3][1], -1.5); EXPECT_FLOAT_EQ(controls[3][2], -1.5); EXPECT_FLOAT_EQ(states[0][0], 48); EXPECT_FLOAT_EQ(states[1][0], 4518); EXPECT_FLOAT_EQ(states[2][0], 451); EXPECT_FLOAT_EQ(states[3][0], 4516); } } TEST(Dynamics, updateStateCPU) { DynamicsTester<> tester; DynamicsTester<>::state_array s; DynamicsTester<>::state_array s_der; s(0) = 5; s_der(0) = 10; tester.updateState(s, s_der, 0.1); EXPECT_FLOAT_EQ(s(0), 6); EXPECT_FLOAT_EQ(s_der(0), 10); } TEST(Dynamics, updateStateGPU) { DynamicsTester<4, 1> tester; std::vector> s(1); std::vector> s_der(1); s[0][0] = 0; s[0][1] = 1; s[0][2] = 2; s[0][3] = 3; s_der[0][0] = 0; s_der[0][1] = 1; s_der[0][2] = 2; s_der[0][3] = 3; for (int i = 1; i < 6; i++) { s[0][0] = 0; s[0][1] = 1; s[0][2] = 2; s[0][3] = 3; s_der[0][0] = 0; s_der[0][1] = 1; s_der[0][2] = 2; s_der[0][3] = 3; launchUpdateStateTestKernel, 4>(tester, s, s_der, 0.1, i); EXPECT_FLOAT_EQ(s[0][0], 0); EXPECT_FLOAT_EQ(s[0][1], 1.1); EXPECT_FLOAT_EQ(s[0][2], 2.2); EXPECT_FLOAT_EQ(s[0][3], 3.3); EXPECT_FLOAT_EQ(s_der[0][0], 0); EXPECT_FLOAT_EQ(s_der[0][1], 1); EXPECT_FLOAT_EQ(s_der[0][2], 2); EXPECT_FLOAT_EQ(s_der[0][3], 3); } } TEST(Dynamics, computeStateDerivCPU) { DynamicsTester<2, 1> tester; DynamicsTester<2, 1>::state_array s; DynamicsTester<2, 1>::state_array s_der; DynamicsTester<2, 1>::control_array u; s(0) = 5; s(1) = 10; s_der(0) = 10; s_der(1) = 20; u(0) = 3; tester.computeStateDeriv(s, u, s_der); EXPECT_FLOAT_EQ(s(0), 5); EXPECT_FLOAT_EQ(s(1), 10); EXPECT_FLOAT_EQ(s_der(0), 15); EXPECT_FLOAT_EQ(s_der(1), 3); EXPECT_FLOAT_EQ(u(0), 3); } TEST(Dynamics, computeStateDerivGPU) { DynamicsTester<2, 1> tester; std::vector> s(1); std::vector> u(1); std::vector> s_der(1); for (int j = 1; j < 6; j++) { s[0][0] = 5; s[0][1] = 10; s_der[0][0] = 10; s_der[0][1] = 20; 
u[0][0] = 3; launchComputeStateDerivTestKernel, 2, 1>(tester, s, u, s_der, j); EXPECT_FLOAT_EQ(s[0][0], 5) << "j = " << j; EXPECT_FLOAT_EQ(s[0][1], 10) << "j = " << j; EXPECT_FLOAT_EQ(s_der[0][0], 15) << "j = " << j; EXPECT_FLOAT_EQ(s_der[0][1], 3) << "j = " << j; EXPECT_FLOAT_EQ(u[0][0], 3) << "j = " << j; } } TEST(Dynamics, stepGPU) { DynamicsTester<2, 1> tester; tester.GPUSetup(); std::vector> s(1); std::vector> u(1); std::vector> s_der(1); std::vector> s_next(1); int t = 0; float dt = 0.5; for (int dim_y = 1; dim_y < 6; dim_y++) { s[0][0] = 5; s[0][1] = 10; s_der[0][0] = 10; s_der[0][1] = 20; u[0][0] = 3; launchStepTestKernel>(tester, s, u, s_der, s_next, t, dt, dim_y); EXPECT_FLOAT_EQ(s[0][0], 5) << "dim_y = " << dim_y; EXPECT_FLOAT_EQ(s[0][1], 10) << "dim_y = " << dim_y; EXPECT_FLOAT_EQ(s_der[0][0], 15) << "dim_y = " << dim_y; EXPECT_FLOAT_EQ(s_der[0][1], 3) << "dim_y = " << dim_y; EXPECT_FLOAT_EQ(s_next[0][0], 5 + 7.5) << "dim_y = " << dim_y; EXPECT_FLOAT_EQ(s_next[0][1], 10 + 1.5) << "dim_y = " << dim_y; EXPECT_FLOAT_EQ(u[0][0], 3) << "dim_y = " << dim_y; } } TEST(Dynamics, stepCPU) { using DYN = DynamicsTester<2, 1>; DYN tester; DYN::state_array x; DYN::state_array x_dot; DYN::state_array x_next; DYN::control_array u; DYN::output_array output; int t = 0; float dt = 0.5; x(0) = 5; x(1) = 10; x_dot(0) = 10; x_dot(1) = 20; u(0) = 3; tester.step(x, x_next, x_dot, u, output, t, dt); EXPECT_FLOAT_EQ(x(0), 5); EXPECT_FLOAT_EQ(x(1), 10); EXPECT_FLOAT_EQ(x_dot(0), 15); EXPECT_FLOAT_EQ(x_dot(1), 3); EXPECT_FLOAT_EQ(x_next(0), 5 + 7.5); EXPECT_FLOAT_EQ(x_next(1), 10 + 1.5); EXPECT_FLOAT_EQ(u(0), 3); } TEST(Dynamics, interpolateState) { DynamicsTester<3, 1> tester; DynamicsTester<3, 1>::state_array s1; DynamicsTester<3, 1>::state_array s2; s1(0) = 1.5; s2(0) = 2.0; s1(1) = -3.0; s2(1) = -3.5; s1(2) = -0.25; s2(2) = 0.25; EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 0)(0), 1.5); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 0.25)(0), 1.625); 
EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 0.5)(0), 1.75); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 0.75)(0), 1.875); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 1.0)(0), 2.0); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 0)(1), -3.0); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 0.25)(1), -3.125); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 0.5)(1), -3.25); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 0.75)(1), -3.375); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 1.0)(1), -3.5); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 0)(2), -0.25); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 0.25)(2), -0.125); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 0.5)(2), 0); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 0.75)(2), 0.125); EXPECT_FLOAT_EQ(tester.interpolateState(s1, s2, 1.0)(2), 0.25); } ================================================ FILE: tests/dynamics/linear_dynamics_tests.cu ================================================ #include #include #include TEST(Linear, Dimensionality) { using DYN_T = LinearDynamics<4, 12>; ASSERT_EQ(4, DYN_T::STATE_DIM); ASSERT_EQ(4, DYN_T::OUTPUT_DIM); ASSERT_EQ(12, DYN_T::CONTROL_DIM); } TEST(Linear, SetParamsA) { using DYN_T = LinearDynamics<4, 12>; using DFDX = typename DYN_T::dfdx; // Set A from params DFDX A = DFDX::Random(); typename DYN_T::DYN_PARAMS_T params; params.setA(A); auto dynamics = DYN_T(params); auto dyn_params = dynamics.getParams(); Eigen::Map A_params_result(dyn_params.A); float diff = (A - A_params_result).sum(); EXPECT_FLOAT_EQ(diff, 0); // Set A from dynamics A = DFDX::Random(); dynamics.setA(A); dyn_params = dynamics.getParams(); Eigen::Map A_class_result(dyn_params.A); diff = (A - A_class_result).sum(); EXPECT_FLOAT_EQ(diff, 0); } TEST(Linear, SetParamsB) { using DYN_T = LinearDynamics<4, 12>; using DFDU = typename DYN_T::dfdu; // Set B from params DFDU B = DFDU::Random(); typename DYN_T::DYN_PARAMS_T params; params.setB(B); auto dynamics = DYN_T(params); auto 
dyn_params = dynamics.getParams(); Eigen::Map B_result(dyn_params.B); float diff = (B - B_result).sum(); EXPECT_FLOAT_EQ(diff, 0); // Set A from dynamics B = DFDU::Random(); dynamics.setB(B); dyn_params = dynamics.getParams(); Eigen::Map B_class_result(dyn_params.B); diff = (B - B_class_result).sum(); EXPECT_FLOAT_EQ(diff, 0); } TEST(Linear, CheckSharedMemorySizes) { using DYN_T = LinearDynamics<4, 12>; auto dynamics = DYN_T(); dynamics.GPUSetup(); int output_gpu[2] = { 0 }; int output_cpu[2] = { 0 }; launchGetSharedMemorySizesKernel(dynamics, output_gpu); output_cpu[0] = dynamics.getGrdSharedSizeBytes(); output_cpu[1] = dynamics.getBlkSharedSizeBytes(); ASSERT_EQ(output_cpu[0], output_gpu[0]); ASSERT_EQ(output_cpu[1], output_gpu[1]); } TEST(Linear, StepCPUGPUComparison) { using DYN_T = LinearDynamics<10, 12>; using DFDU = typename DYN_T::dfdu; using DFDX = typename DYN_T::dfdx; DFDU B = DFDU::Random(); DFDX A = DFDX::Random(); auto dynamics = DYN_T(); dynamics.setA(A); dynamics.setB(B); typename DYN_T::buffer_trajectory buffer; std::vector x_sizes = { 1, 2, 4, 8, 16, 32 }; for (const auto& x_dim : x_sizes) { checkGPUComputationStep(dynamics, 0.01, 32, x_dim, buffer); } } TEST(Linear, JacobianCheck) { using DYN_T = LinearDynamics<10, 6>; using DFDU = typename DYN_T::dfdu; using DFDX = typename DYN_T::dfdx; using state_array = typename DYN_T::state_array; using control_array = typename DYN_T::control_array; auto dynamics = DYN_T(); DFDX A = DFDX::Identity(); DFDU B = DFDU::Random(); dynamics.setA(A); dynamics.setB(B); DFDX Jacobian_A; DFDU Jacobian_B; state_array x = dynamics.getZeroState(); control_array u = dynamics.getZeroControl(); dynamics.computeGrad(x, u, Jacobian_A, Jacobian_B); float a_diff = (Jacobian_A - A).array().abs().sum(); float b_diff = (Jacobian_B - B).array().abs().sum(); ASSERT_EQ(a_diff, 0); ASSERT_EQ(b_diff, 0); } TEST(Linear, HardCodeCPUTest) { using DYN_T = LinearDynamics<3, 1>; using DFDU = typename DYN_T::dfdu; using DFDX = typename 
DYN_T::dfdx; using state_array = typename DYN_T::state_array; using control_array = typename DYN_T::control_array; using output_array = typename DYN_T::output_array; auto dynamics = DYN_T(); DFDX A = DFDX::Identity(); DFDU B = DFDU::Zero(); B(2, 0) = 0.5f; dynamics.setA(A); dynamics.setB(B); state_array x = dynamics.getZeroState(); control_array u = dynamics.getZeroControl(); x << 1, 5, 10; u << 3; float dt = 0.1; output_array y; state_array x_der, x_next; dynamics.step(x, x_next, x_der, u, y, 0, dt); // Check derivative ASSERT_FLOAT_EQ(x_der[0], 1.0f); ASSERT_FLOAT_EQ(x_der[1], 5.0f); ASSERT_FLOAT_EQ(x_der[2], 10.0f + 1.5f); // Check next state ASSERT_FLOAT_EQ(x_next[0], 1.0f + 1.0f * 0.1f); ASSERT_FLOAT_EQ(x_next[1], 5.0f + 5.0f * 0.1f); ASSERT_FLOAT_EQ(x_next[2], 10.0f + (10.0f + 1.5f) * 0.1f); // Check output ASSERT_FLOAT_EQ(y[0], 1.0f + 1.0f * 0.1f); ASSERT_FLOAT_EQ(y[1], 5.0f + 5.0f * 0.1f); ASSERT_FLOAT_EQ(y[2], 10.0f + (10.0f + 1.5f) * 0.1f); } ================================================ FILE: tests/dynamics/quadrotor_dynamics_tests.cu ================================================ #include #include #include template void __global__ DynamicsDerivKernel(DYN_T* model, float* state_d, float* u_d, float* state_deriv_d) { model->computeDynamics(state_d, u_d, state_deriv_d); } template void __global__ UpdateStateKernel(DYN_T* model, float* state_d, float* u_d, float* state_deriv_d, float dt) { model->computeDynamics(state_d, u_d, state_deriv_d); model->updateState(state_d, state_deriv_d, dt); } TEST(QuadrotorDynamics, Constructor) { QuadrotorDynamics(); } TEST(QuadrotorDynamics, CompareDynamics_CPU_GPU) { using DYN = QuadrotorDynamics; DYN model; DYN::state_array s = DYN::state_array::Random(); DYN::control_array u = DYN::control_array::Random(); DYN::state_array state_deriv_cpu, state_deriv_gpu; Eigen::Quaternionf q_test(s[6], s[7], s[8], s[9]); q_test.normalize(); s[6] = q_test.w(); s[7] = q_test.x(); s[8] = q_test.y(); s[9] = q_test.z(); /** * GPU Setup 
*/ model.GPUSetup(); cudaStream_t s1; cudaStreamCreate(&s1); float* s_d; float* u_d; float* state_deriv_GPU; // Allocate GPU Memory // size_t control_size = sizeof(float) * DYN::CONTROL_DIM; HANDLE_ERROR(cudaMalloc((void**)&u_d, sizeof(float) * DYN::CONTROL_DIM)); HANDLE_ERROR(cudaMalloc((void**)&s_d, sizeof(float) * DYN::STATE_DIM)); HANDLE_ERROR(cudaMalloc((void**)&state_deriv_GPU, sizeof(float) * DYN::STATE_DIM)); // Copy data to GPU HANDLE_ERROR(cudaMemcpyAsync(u_d, u.data(), sizeof(float) * DYN::CONTROL_DIM, cudaMemcpyHostToDevice, s1)); HANDLE_ERROR(cudaMemcpyAsync(s_d, s.data(), sizeof(float) * DYN::STATE_DIM, cudaMemcpyHostToDevice, s1)); HANDLE_ERROR(cudaStreamSynchronize(s1)); model.computeDynamics(s, u, state_deriv_cpu); // Call GPU Kernel dim3 dimBlock(1, 5, 1); dim3 dimGrid(1, 1, 1); DynamicsDerivKernel<<>>(model.model_d_, s_d, u_d, state_deriv_GPU); CudaCheckError(); HANDLE_ERROR(cudaStreamSynchronize(s1)); // Copy GPU results back to Host HANDLE_ERROR(cudaMemcpyAsync(state_deriv_gpu.data(), state_deriv_GPU, sizeof(float) * DYN::STATE_DIM, cudaMemcpyDeviceToHost, s1)); HANDLE_ERROR(cudaStreamSynchronize(s1)); eigen_assert_float_eq(state_deriv_cpu, state_deriv_gpu); } TEST(QuadrotorDynamics, UpdateState_CPU_GPU) { using DYN = QuadrotorDynamics; DYN model; QuadrotorDynamicsParams params = QuadrotorDynamicsParams(2.5); DYN::state_array s_cpu = DYN::state_array::Random(); DYN::control_array u = DYN::control_array::Random(); DYN::state_array s_gpu; DYN::state_array state_deriv_cpu, state_deriv_gpu; float dt = 0.01; Eigen::Quaternionf q_test(s_cpu[6], s_cpu[7], s_cpu[8], s_cpu[9]); q_test.normalize(); s_cpu[6] = q_test.w(); s_cpu[7] = q_test.x(); s_cpu[8] = q_test.y(); s_cpu[9] = q_test.z(); s_gpu = s_cpu; /** * GPU Setup */ model.GPUSetup(); model.setParams(params); cudaStream_t s1; cudaStreamCreate(&s1); float* s_d; float* u_d; float* state_deriv_GPU; // Allocate GPU Memory HANDLE_ERROR(cudaMalloc((void**)&u_d, sizeof(float) * DYN::CONTROL_DIM)); 
HANDLE_ERROR(cudaMalloc((void**)&s_d, sizeof(float) * DYN::STATE_DIM)); HANDLE_ERROR(cudaMalloc((void**)&state_deriv_GPU, sizeof(float) * DYN::STATE_DIM)); // Copy data to GPU HANDLE_ERROR(cudaMemcpyAsync(u_d, u.data(), sizeof(float) * DYN::CONTROL_DIM, cudaMemcpyHostToDevice, s1)); HANDLE_ERROR(cudaMemcpyAsync(s_d, s_gpu.data(), sizeof(float) * DYN::STATE_DIM, cudaMemcpyHostToDevice, s1)); HANDLE_ERROR(cudaStreamSynchronize(s1)); model.computeDynamics(s_cpu, u, state_deriv_cpu); model.updateState(s_cpu, state_deriv_cpu, dt); // Call GPU Kernel dim3 dimBlock(1, 5, 1); dim3 dimGrid(1, 1, 1); UpdateStateKernel<<>>(model.model_d_, s_d, u_d, state_deriv_GPU, dt); CudaCheckError(); HANDLE_ERROR(cudaStreamSynchronize(s1)); // Copy GPU results back to Host HANDLE_ERROR(cudaMemcpyAsync(s_gpu.data(), s_d, sizeof(float) * DYN::STATE_DIM, cudaMemcpyDeviceToHost, s1)); HANDLE_ERROR(cudaMemcpyAsync(state_deriv_gpu.data(), state_deriv_GPU, sizeof(float) * DYN::STATE_DIM, cudaMemcpyDeviceToHost, s1)); HANDLE_ERROR(cudaStreamSynchronize(s1)); eigen_assert_float_eq(s_cpu, s_gpu); } ================================================ FILE: tests/dynamics/racer_dubins_elevation_lstm_steering_model_test.cu ================================================ #include #include #include #include #include #include #include class RacerDubinsElevationLSTMSteeringTest : public ::testing::Test { public: cudaStream_t stream; void SetUp() override { CudaCheckError(); HANDLE_ERROR(cudaStreamCreate(&stream)); } void TearDown() override { CudaCheckError(); HANDLE_ERROR(cudaStreamDestroy(stream)); } std::vector init_output_layers = { 23, 100, 8 }; std::vector output_layers = { 8, 20, 1 }; }; TEST_F(RacerDubinsElevationLSTMSteeringTest, Template) { auto dynamics = RacerDubinsElevationLSTMSteering(3, 20, init_output_layers, 4, 4, output_layers, 11); EXPECT_EQ(19, RacerDubinsElevationLSTMSteering::STATE_DIM); EXPECT_EQ(2, RacerDubinsElevationLSTMSteering::CONTROL_DIM); 
EXPECT_TRUE(dynamics.checkRequiresBuffer()); EXPECT_NE(dynamics.getTextureHelper(), nullptr); EXPECT_EQ(dynamics.getBlkSharedSizeBytes(), 384); EXPECT_EQ(dynamics.getGrdSharedSizeBytes(), 1408); } TEST_F(RacerDubinsElevationLSTMSteeringTest, BindStream) { auto dynamics = RacerDubinsElevationLSTMSteering(3, 20, init_output_layers, 4, 4, output_layers, 11); dynamics.bindToStream(stream); EXPECT_EQ(dynamics.stream_, stream) << "Stream binding failure."; EXPECT_NE(dynamics.getTextureHelper(), nullptr); EXPECT_EQ(dynamics.getTextureHelper()->stream_, stream); EXPECT_NE(dynamics.getHelper(), nullptr); EXPECT_EQ(dynamics.getHelper()->getLSTMModel()->stream_, stream); auto dynamics2 = RacerDubinsElevationLSTMSteering(3, 20, init_output_layers, 4, 4, output_layers, 11, stream); EXPECT_EQ(dynamics2.stream_, stream) << "Stream binding failure."; EXPECT_NE(dynamics2.getTextureHelper(), nullptr); EXPECT_EQ(dynamics2.getTextureHelper()->stream_, stream); EXPECT_NE(dynamics2.getHelper(), nullptr); EXPECT_EQ(dynamics2.getHelper()->getLSTMModel()->stream_, stream); } /* float c_t = 1.3; float c_b = 2.5; float c_v = 3.7; float c_0 = 4.9; float wheel_base = 0.3; */ // TEST_F(RacerDubinsElevationLSTMSteeringTest, ComputeDynamics) // { // RacerDubinsElevationLSTMSteering dynamics = RacerDubinsElevationLSTMSteering(); // auto params = dynamics.getParams(); // RacerDubinsElevationLSTMSteering::state_array x = RacerDubinsElevationLSTMSteering::state_array::Zero(); // RacerDubinsElevationLSTMSteering::control_array u = RacerDubinsElevationLSTMSteering::control_array::Zero(); // // computeDynamics should not touch the roll/pitch element // RacerDubinsElevationLSTMSteering::state_array next_x = RacerDubinsElevationLSTMSteering::state_array::Ones() * // 0.153; dynamics.computeDynamics(x, u, next_x); EXPECT_FLOAT_EQ(next_x(0), 4.9); EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), 0); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 
0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << 1, M_PI_2, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << 1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 2.6 - 4.7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), 1); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << 1, M_PI_2, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << -1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 - 3.5 - 4.7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), 1); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << -1, 0, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << 1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 4.7 + 2.6 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), -1); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << -1, 0, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << -1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 3.5 + 4.7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), -1); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << 7, 0, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << 1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 3.9 - 5.7 * 7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), 7); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << -7, 0, 0, 
3, 0, 0.5, -0.5, 0.0, 0.0; // u << 1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 3.9 + 5.7 * 7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), -7); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << 7, 0, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << -1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 - 4.5 - 5.7 * 7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), 7); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << -7, 0, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << -1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 4.5 + 5.7 * 7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), -7); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << 1, M_PI_2, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << 0, 1; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 - 4.7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), (1 / .3) * tan(0)); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), 1); // EXPECT_FLOAT_EQ(next_x(4), 5 * 0.6); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << -1, M_PI_2, 0, 3, 5.0, 0.5, -0.5, 0.0, 0.0; // u << -1, -1; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 4.7 + 3.5 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), (-1 / .3) * tan(5.0 / -10.2)); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), -1); // EXPECT_FLOAT_EQ(next_x(4), -5); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << -0.4, M_PI_2, 0, 
3, 5.0, 0.5, -0.5, 0.0, 0.0; // u << -1, -1; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 3.7 * 0.4 + 2.5 * 0.4 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), (-0.4 / .3) * tan(5.0 / -9.1)); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), -0.4); // EXPECT_FLOAT_EQ(next_x(4), -5); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << 0.4, M_PI_2, 0, 3, 5.0, 0.5, -0.5, 0.0, 0.0; // u << 0.1, -1; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 - 3.7 * 0.4 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), (0.4 / .3) * tan(5.0 / -9.1)); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), 0.4); // EXPECT_FLOAT_EQ(next_x(4), -5); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // } // TEST_F(RacerDubinsElevationLSTMSteeringTest, TestModelGPU) // { // RacerDubinsElevationLSTMSteering dynamics = RacerDubinsElevationLSTMSteering(); // dynamics.GPUSetup(); // CudaCheckError(); // Eigen::Matrix control_trajectory; // control_trajectory = Eigen::Matrix::Random(); // Eigen::Matrix state_trajectory; // state_trajectory = Eigen::Matrix::Random(); // std::vector> s(100); // std::vector> s_der(100); // // steering, throttle // std::vector> u(100); // for (int state_index = 0; state_index < s.size(); state_index++) // { // for (int dim = 0; dim < s[0].size(); dim++) // { // s[state_index][dim] = state_trajectory.col(state_index)(dim); // } // for (int dim = 0; dim < u[0].size(); dim++) // { // u[state_index][dim] = control_trajectory.col(state_index)(dim); // } // } // // These variables will be changed so initialized to the right size only // // Run dynamics on dynamicsU // // Run dynamics on GPU // for (int y_dim = 1; y_dim <= 4; y_dim++) // { // launchComputeDynamicsTestKernel(dynamics, s, u, s_der, y_dim); // for (int point = 0; point < 100; point++) // { // RacerDubinsElevationLSTMSteering::state_array state = 
state_trajectory.col(point); // RacerDubinsElevationLSTMSteering::control_array control = control_trajectory.col(point); // RacerDubinsElevationLSTMSteering::state_array state_der_cpu = // RacerDubinsElevationLSTMSteering::state_array::Zero(); // dynamics.computeDynamics(state, control, state_der_cpu); // for (int dim = 0; dim < 6; dim++) // { // EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-5) // << "at point " << point << " dim " << dim << " with y_dim " << y_dim; // EXPECT_TRUE(isfinite(s_der[point][dim])); // } // } // } // dynamics.freeCudaMem(); // CudaCheckError(); // } // TEST_F(RacerDubinsElevationLSTMSteeringTest, TestUpdateState) // { // CudaCheckError(); // RacerDubinsElevationLSTMSteering dynamics = RacerDubinsElevationLSTMSteering(); // RacerDubinsElevationLSTMSteering::state_array state; // RacerDubinsElevationLSTMSteering::state_array next_state; // RacerDubinsElevationLSTMSteering::state_array state_der; // // TODO add in the elevation map // state << 0, 0, 0, 0, 0, -0.5, 0.5; // state_der << 1, 1, 1, 1, 1, 0, 0; // dynamics.updateState(state, next_state, state_der, 0.1); // EXPECT_TRUE(state_der != RacerDubinsElevationLSTMSteering::state_array::Zero()); // EXPECT_FLOAT_EQ(next_state(0), 0.1); // EXPECT_FLOAT_EQ(next_state(1), 0.1); // EXPECT_FLOAT_EQ(next_state(2), 0.1); // EXPECT_FLOAT_EQ(next_state(3), 0.1); // EXPECT_FLOAT_EQ(next_state(4), 0.1); // EXPECT_FLOAT_EQ(next_state(5), 0.0); // EXPECT_FLOAT_EQ(next_state(6), 0.0); // state << 0, M_PI - 0.1, 0, 0, 0, -0.5, 0.5; // state_der << 1, 1, 1, 1, 1; // dynamics.updateState(state, next_state, state_der, 1.0); // EXPECT_TRUE(state_der != RacerDubinsElevationLSTMSteering::state_array::Zero()); // EXPECT_FLOAT_EQ(next_state(0), 1.0); // EXPECT_FLOAT_EQ(next_state(1), 1.0 - M_PI - 0.1); // EXPECT_FLOAT_EQ(next_state(2), 1.0); // EXPECT_FLOAT_EQ(next_state(3), 1.0); // EXPECT_FLOAT_EQ(next_state(4), 0.5); // EXPECT_FLOAT_EQ(next_state(5), 0.0); // EXPECT_FLOAT_EQ(next_state(6), 0.0); // 
state << 0, -M_PI + 0.1, 0, 0, 0, -0.5, 0.5; // state_der << 1, -1, 1, 1, 1; // dynamics.updateState(state, next_state, state_der, 1.0); // EXPECT_TRUE(state_der != RacerDubinsElevationLSTMSteering::state_array::Zero()); // EXPECT_FLOAT_EQ(next_state(0), 1.0); // EXPECT_FLOAT_EQ(next_state(1), M_PI + 0.1 - 1.0); // EXPECT_FLOAT_EQ(next_state(2), 1.0); // EXPECT_FLOAT_EQ(next_state(3), 1.0); // EXPECT_FLOAT_EQ(next_state(4), 0.5); // EXPECT_FLOAT_EQ(next_state(5), 0.0); // EXPECT_FLOAT_EQ(next_state(6), 0.0); // CudaCheckError(); // } TEST_F(RacerDubinsElevationLSTMSteeringTest, TestStep) { GTEST_SKIP() << "Skipping test since they have not been updated to accel "; CudaCheckError(); using DYN = RacerDubinsElevationLSTMSteering; const float tol = 1e-6; auto dynamics = RacerDubinsElevationLSTMSteering(3, 20, init_output_layers, 4, 4, output_layers, 11); auto params = dynamics.getParams(); params.c_0 = 0; params.c_b[0] = 1; params.c_b[1] = 10; params.c_b[2] = 100; params.c_v[0] = 0.25; params.c_v[1] = 0.5; params.c_v[2] = 0.75; params.c_t[0] = 2; params.c_t[1] = 20; params.c_t[2] = 200; params.low_min_throttle = 0.2; params.steer_command_angle_scale = 0.5; params.steering_constant = 0.5; params.wheel_base = 0.5; params.max_steer_rate = 5; params.max_steer_angle = 5; dynamics.setParams(params); DYN::state_array state; DYN::state_array next_state; DYN::state_array state_der = DYN::state_array::Zero(); DYN::control_array control; DYN::output_array output; float dt = 0.1; // TODO add in the elevation map auto model = dynamics.getHelper(); model->getLSTMModel()->getOutputModel()->setAllWeights(0.1f); model->getInitModel()->getOutputModel()->setAllWeights(0.01f); model->getLSTMModel()->setAllValues(0.3f); model->getInitModel()->setAllValues(0.01f); Eigen::MatrixXf buffer = model->getEmptyBufferMatrix(); buffer.setOnes(); buffer = buffer * 0.01f; // Basic initial state and no movement should stay still state << 0, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 0, 0; 
dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(next_state(0), 0.0, tol); EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.0, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), 4.1500520706176758 * dt, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 4.1500520706176758, tol); EXPECT_NEAR(output(23), 0.0, tol); // Apply full throttle from zero state state << 0, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 1, 0; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), 1.6, tol); EXPECT_NEAR(next_state(0), 0.16, tol); EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.0, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), 5.2751355171203613 * dt, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 5.2751355171203613, tol); EXPECT_NEAR(output(23), 1.6, tol); // Apply throttle to a state with positive velocity state << 1, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 1, 0; model->initializeLSTM(buffer); dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), 5.5, tol); EXPECT_NEAR(next_state(0), 1.55, tol); EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.1, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), 7.1901092529296875 * dt, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 7.1901092529296875, tol); EXPECT_NEAR(output(23), 5.5, tol); // Apply full throttle and half left turn to origin state state << 0, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 1, 0.5; model->initializeLSTM(buffer); dynamics.step(state, next_state, state_der, control, output, 0, dt); 
EXPECT_NEAR(state_der(0), 1.6, tol); EXPECT_NEAR(next_state(0), 0.16, tol); EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.0, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), 6.1967658996582031 * dt, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 6.1967658996582031, tol); EXPECT_NEAR(output(23), 1.6, tol); // Apply full throttle and half left turn to a moving state oriented 30 degrees to the left float yaw = M_PI / 6; state << 1.0, yaw, 0, 0, 0, -0.0, 0.0, 0, 0; control << 1, 0.5; model->initializeLSTM(buffer); dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), 5.5, tol); EXPECT_NEAR(next_state(0), 1.55, tol); EXPECT_NEAR(next_state(1), yaw, tol); EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), 9.0641689300537109 * dt, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 9.0641689300537109, tol); EXPECT_NEAR(output(23), 5.5, tol); // Apply full throttle and half left turn to a moving state oriented 30 degrees to the left which is already turning float steer_angle = M_PI / 8; state << 1.0, yaw, 0, 0, steer_angle, -0.0, 0.0, 0, 0; control << 1, 0.5; model->initializeLSTM(buffer); dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), 5.5, tol); EXPECT_NEAR(next_state(0), 1.55, tol); EXPECT_NEAR(next_state(1), yaw + -0.086361105 * dt, tol); EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), steer_angle + 9.3808889389038086 * dt, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 9.3808889389038086, 
tol); EXPECT_NEAR(output(23), 5.5, tol); // Apply full brake and half left turn to a moving state oriented 30 degrees to the left which is already turning state << 1.0, yaw, 0, 0, steer_angle, 1.0, -0.0, 0.0, 0, 0; control << -1, 0.5; model->initializeLSTM(buffer); dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), -5.5, tol); EXPECT_NEAR(next_state(0), 1 - 5.5 * dt, tol); EXPECT_NEAR(next_state(1), yaw + -0.086361105 * dt, tol); EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), steer_angle + 9.3808889389038086 * dt, tol); EXPECT_NEAR(next_state(5), 1.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 9.3808889389038086, tol); EXPECT_NEAR(output(23), -5.5, tol); /** * Apply full brake and half left turn to a moving state oriented 30 degrees to the left which is already turning * and on a downward facing hill */ float pitch = 20 * M_PI / 180; state << 1.0, yaw, 0, 0, steer_angle, 1, -0.0, pitch, 0, 0; control << -1, 0.5; model->initializeLSTM(buffer); dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(next_state(0), 1 + (-5.5 + 9.81 * sinf(pitch)) * dt, tol); EXPECT_NEAR(next_state(1), yaw + -0.086361105 * dt, tol); EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), steer_angle + 9.3808889389038086 * dt, tol); EXPECT_NEAR(next_state(5), 1.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 9.3808889389038086, tol); EXPECT_NEAR(output(23), (-5.5 + 9.81 * sinf(pitch)), tol); /** * Apply full brake and half left turn to a backwards moving state oriented 30 degrees to the left which is already * turning and on a downward facing hill */ state << -1.0, yaw, 0, 0, steer_angle, 1, -0.0, pitch, 0, 0; control << -1, 0.5; 
model->initializeLSTM(buffer); dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(next_state(0), -1 + (5.5 + 9.81 * sinf(pitch)) * dt, tol); EXPECT_NEAR(next_state(1), yaw + 0.086361105 * dt, tol); EXPECT_NEAR(next_state(2), -1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), -1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), steer_angle + 3.5283551216125488 * dt, tol); EXPECT_NEAR(next_state(5), 1.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 3.5283551216125488, tol); EXPECT_NEAR(output(23), (5.5 + 9.81 * sinf(pitch)), tol); /** * Apply full brake and half right turn to a backwards moving state oriented 30 degrees to the left which is already * turning and on a downward facing hill */ state << -1.0, yaw, 0, 0, steer_angle, 1, -0.0, pitch, 0, 0; control << -1, -0.5; model->initializeLSTM(buffer); dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(next_state(0), -1 + (5.5 + 9.81 * sinf(pitch)) * dt, tol); EXPECT_NEAR(next_state(1), yaw + 0.086361105 * dt, tol); EXPECT_NEAR(next_state(2), -1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), -1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), steer_angle + -0.32771033048629761 * dt, tol); EXPECT_NEAR(next_state(5), 1.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), -0.32771033048629761, tol); EXPECT_NEAR(output(23), (5.5 + 9.81 * sinf(pitch)), tol); /** * Apply full brake and half right turn to a backwards moving state with a huge steering angle to test max steer * angle and steering rate. 
We are also on a downward facing hill and are already oriented 30 degrees to the left */ steer_angle *= 100; state << -1.0, yaw, 0, 0, steer_angle, 1, -0.0, pitch, 0, 0; control << -1, -0.5; model->initializeLSTM(buffer); dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(next_state(0), -1 + (5.5 + 9.81 * sinf(pitch)) * dt, tol); EXPECT_NEAR(next_state(1), yaw + tan(steer_angle / -9.1) * dt * -2, tol); EXPECT_NEAR(next_state(2), -1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), -1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), params.max_steer_angle, tol); EXPECT_NEAR(next_state(5), 1.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 15.97845268249511, tol); EXPECT_NEAR(output(23), (5.5 + 9.81 * sinf(pitch)), tol); } TEST_F(RacerDubinsElevationLSTMSteeringTest, TestStepGPUvsCPUNoNetwork) { const int num_rollouts = 1024; const float dt = 0.1f; CudaCheckError(); using DYN = RacerDubinsElevationLSTMSteering; RacerDubinsElevationLSTMSteering dynamics = RacerDubinsElevationLSTMSteering(3, 20, init_output_layers, 4, 4, output_layers, 11); cudaExtent extent = make_cudaExtent(10, 20, 0); TwoDTextureHelper* helper = dynamics.getTextureHelper(); helper->setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = i * 1.0f; } std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); helper->updateRotation(0, new_rot_mat); helper->updateOrigin(0, make_float3(1, 2, 3)); helper->updateTexture(0, data_vec); helper->updateResolution(0, 10); helper->enableTexture(0); helper->copyToDevice(true); CudaCheckError(); dynamics.GPUSetup(); CudaCheckError(); EXPECT_NE(dynamics.getHelper()->getLSTMDevicePtr(), nullptr); EXPECT_NE(dynamics.network_d_, nullptr); EXPECT_EQ(dynamics.network_d_, dynamics.getHelper()->getLSTMDevicePtr()); 
Eigen::Matrix control_trajectory; control_trajectory = Eigen::Matrix::Random(); Eigen::Matrix state_trajectory; state_trajectory = Eigen::Matrix::Random(); std::vector> s(num_rollouts); std::vector> s_next(num_rollouts); std::vector> s_der(num_rollouts); // steering, throttle std::vector> u(num_rollouts); RacerDubinsElevationLSTMSteering::state_array state; RacerDubinsElevationLSTMSteering::state_array next_state_cpu; RacerDubinsElevationLSTMSteering::control_array control; RacerDubinsElevationLSTMSteering::output_array output; RacerDubinsElevationLSTMSteering::state_array state_der_cpu = RacerDubinsElevationLSTMSteering::state_array::Zero(); // Run dynamics on dynamicsU // Run dynamics on GPU for (int y_dim = 1; y_dim <= 16; y_dim++) { DYN::buffer_trajectory buffer; buffer["VEL_X"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE_RATE"] = Eigen::VectorXf::Random(51); buffer["STEER_CMD"] = Eigen::VectorXf::Random(51); for (int state_index = 0; state_index < num_rollouts; state_index++) { for (int dim = 0; dim < s[0].size(); dim++) { s[state_index][dim] = state_trajectory.col(state_index)(dim); } for (int dim = 0; dim < u[0].size(); dim++) { u[state_index][dim] = control_trajectory.col(state_index)(dim); } } dynamics.updateFromBuffer(buffer); launchStepTestKernel(dynamics, s, u, s_der, s_next, 0, dt, y_dim); for (int point = 0; point < num_rollouts; point++) { dynamics.initializeDynamics(state, control, output, 0, 0); state = state_trajectory.col(point); control = control_trajectory.col(point); state_der_cpu = RacerDubinsElevationLSTMSteering::state_array::Zero(); dynamics.step(state, next_state_cpu, state_der_cpu, control, output, 0, dt); // for (int dim = 0; dim < RacerDubinsElevationLSTMSteering::STATE_DIM; dim++) for (int dim = 0; dim < RacerDubinsElevationLSTMSteering::STATE_DIM; dim++) { EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim << " dim " 
<< dim; // EXPECT_NEAR(state(dim), s[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; EXPECT_NEAR(next_state_cpu(dim), s_next[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim << " dim " << dim; EXPECT_TRUE(isfinite(s_next[point][dim])); } } } dynamics.freeCudaMem(); } TEST_F(RacerDubinsElevationLSTMSteeringTest, TestStepGPUvsCPU) { const int num_rollouts = 64; const float dt = 0.1f; CudaCheckError(); using DYN = RacerDubinsElevationLSTMSteering; RacerDubinsElevationLSTMSteering dynamics = RacerDubinsElevationLSTMSteering(mppi::tests::steering_lstm); EXPECT_FLOAT_EQ(dynamics.getParams().max_steer_rate, 17.590296); EXPECT_FLOAT_EQ(dynamics.getParams().steering_constant, 3.286375); EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_constant, 9.301527); EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_drag_constant, -0.60327667); cudaExtent extent = make_cudaExtent(10, 20, 0); TwoDTextureHelper* helper = dynamics.getTextureHelper(); helper->setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = i * 1.0f; } std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); helper->updateRotation(0, new_rot_mat); helper->updateOrigin(0, make_float3(1, 2, 3)); helper->updateTexture(0, data_vec); helper->updateResolution(0, 10); helper->enableTexture(0); helper->copyToDevice(true); CudaCheckError(); dynamics.GPUSetup(); CudaCheckError(); EXPECT_NE(dynamics.getHelper()->getLSTMDevicePtr(), nullptr); EXPECT_NE(dynamics.network_d_, nullptr); EXPECT_EQ(dynamics.network_d_, dynamics.getHelper()->getLSTMDevicePtr()); DYN::buffer_trajectory buffer; buffer["VEL_X"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE_RATE"] = Eigen::VectorXf::Random(51); buffer["STEER_CMD"] = Eigen::VectorXf::Random(51); checkGPUComputationStep(dynamics, dt, 
16, 32, buffer, 1.0e-4); } // TODO assert they are different when a network is loaded, check params are the same TEST_F(RacerDubinsElevationLSTMSteeringTest, TestStepGPUvsCPUReverse) { using DYN = RacerDubinsElevationLSTMSteering; const int num_rollouts = 1024; const float dt = 0.1f; CudaCheckError(); RacerDubinsElevationLSTMSteering dynamics = RacerDubinsElevationLSTMSteering(mppi::tests::steering_lstm); auto params = dynamics.getParams(); params.gear_sign = -1; dynamics.setParams(params); EXPECT_FLOAT_EQ(dynamics.getParams().max_steer_rate, 17.590296); EXPECT_FLOAT_EQ(dynamics.getParams().steering_constant, 3.286375); EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_constant, 9.301527); EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_drag_constant, -0.60327667); cudaExtent extent = make_cudaExtent(10, 20, 0); TwoDTextureHelper* helper = dynamics.getTextureHelper(); helper->setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = i * 1.0f; } std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); helper->updateRotation(0, new_rot_mat); helper->updateOrigin(0, make_float3(1, 2, 3)); helper->updateTexture(0, data_vec); helper->updateResolution(0, 10); helper->enableTexture(0); helper->copyToDevice(true); CudaCheckError(); dynamics.GPUSetup(); CudaCheckError(); EXPECT_NE(dynamics.getHelper()->getLSTMDevicePtr(), nullptr); EXPECT_NE(dynamics.network_d_, nullptr); EXPECT_EQ(dynamics.network_d_, dynamics.getHelper()->getLSTMDevicePtr()); DYN::buffer_trajectory buffer; buffer["VEL_X"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE_RATE"] = Eigen::VectorXf::Random(51); buffer["STEER_CMD"] = Eigen::VectorXf::Random(51); checkGPUComputationStep(dynamics, dt, 16, 32, buffer, 1.0e-4); } TEST_F(RacerDubinsElevationLSTMSteeringTest, 
compareToElevationWithoutSteering) { // by default the network will output zeros and not effect any states using DYN = RacerDubinsElevationLSTMSteering; using DYN_PARAMS = DYN::DYN_PARAMS_T; const int num_rollouts = 1024; const float dt = 0.1f; CudaCheckError(); auto dynamics = RacerDubinsElevationLSTMSteering(3, 20, init_output_layers, 4, 4, output_layers, 11); RacerDubinsElevation dynamics2 = RacerDubinsElevation(); auto params = dynamics.getParams(); dynamics.setParams(params); dynamics2.setParams(params); cudaExtent extent = make_cudaExtent(10, 20, 0); TwoDTextureHelper* helper = dynamics.getTextureHelper(); helper->setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = i * 1.0f; } std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); helper->updateRotation(0, new_rot_mat); helper->updateOrigin(0, make_float3(1, 2, 3)); helper->updateTexture(0, data_vec); helper->updateResolution(0, 10); helper->enableTexture(0); helper->copyToDevice(true); TwoDTextureHelper* helper2 = dynamics2.getTextureHelper(); helper2->setExtent(0, extent); helper2->updateRotation(0, new_rot_mat); helper2->updateOrigin(0, make_float3(1, 2, 3)); data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = i * 1.0f; } helper2->updateTexture(0, data_vec); helper2->updateResolution(0, 10); helper2->enableTexture(0); helper2->copyToDevice(true); CudaCheckError(); dynamics.GPUSetup(); dynamics2.GPUSetup(); CudaCheckError(); Eigen::Matrix control_trajectory; control_trajectory = Eigen::Matrix::Random(); Eigen::Matrix state_trajectory; state_trajectory = Eigen::Matrix::Random(); RacerDubinsElevationLSTMSteering::state_array state; RacerDubinsElevationLSTMSteering::state_array next_state_cpu; RacerDubinsElevationLSTMSteering::control_array control; RacerDubinsElevationLSTMSteering::output_array output; 
RacerDubinsElevationLSTMSteering::state_array state_der_cpu = RacerDubinsElevationLSTMSteering::state_array::Zero(); RacerDubinsElevationLSTMSteering::state_array state2; RacerDubinsElevationLSTMSteering::state_array next_state_cpu2; RacerDubinsElevationLSTMSteering::control_array control2; RacerDubinsElevationLSTMSteering::output_array output2; RacerDubinsElevationLSTMSteering::state_array state_der_cpu2 = RacerDubinsElevationLSTMSteering::state_array::Zero(); DYN::buffer_trajectory buffer; buffer["VEL_X"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE_RATE"] = Eigen::VectorXf::Random(51); buffer["STEER_CMD"] = Eigen::VectorXf::Random(51); dynamics.updateFromBuffer(buffer); for (int point = 0; point < num_rollouts; point++) { dynamics.initializeDynamics(state, control, output, 0, 0); state = state_trajectory.col(point); control = control_trajectory.col(point); state_der_cpu = RacerDubinsElevationLSTMSteering::state_array::Zero(); dynamics2.initializeDynamics(state2, control2, output2, 0, 0); state2 = state_trajectory.col(point); control2 = control_trajectory.col(point); state_der_cpu2 = RacerDubinsElevationLSTMSteering::state_array::Zero(); dynamics.step(state, next_state_cpu, state_der_cpu, control, output, 0, dt); dynamics2.step(state2, next_state_cpu2, state_der_cpu2, control2, output2, 0, dt); for (int dim = 0; dim < RacerDubinsElevationLSTMSteering::STATE_DIM; dim++) { if (dim == S_IND_CLASS(DYN_PARAMS, STEER_ANGLE) or dim == S_IND_CLASS(DYN_PARAMS, STEER_ANGLE_RATE)) { // this is done since the steering wheel setup is different, accel version continue; } EXPECT_NEAR(state_der_cpu(dim), state_der_cpu2(dim), 1e-4) << "state der at index " << point << " dim " << dim; EXPECT_NEAR(next_state_cpu(dim), next_state_cpu2(dim), 1e-4) << "next state at index " << point << " dim " << dim; } for (int dim = 0; dim < RacerDubinsElevationLSTMSteering::OUTPUT_DIM; dim++) { if (dim == O_IND_CLASS(DYN_PARAMS, 
STEER_ANGLE) or dim == O_IND_CLASS(DYN_PARAMS, STEER_ANGLE_RATE) or dim == O_IND_CLASS(DYN_PARAMS, WHEEL_FORCE_UP_MAX) or dim == O_IND_CLASS(DYN_PARAMS, WHEEL_FORCE_FWD_MAX) or dim == O_IND_CLASS(DYN_PARAMS, WHEEL_FORCE_SIDE_MAX) or dim == O_IND_CLASS(DYN_PARAMS, FILLER_1)) { // this is done since the steering wheel setup is different, accel version continue; } EXPECT_NEAR(output(dim), output2(dim), 1e-4) << "output at index " << point << " dim " << dim; } } params.gear_sign = -1; dynamics.setParams(params); dynamics2.setParams(params); // check in reverse as well for (int point = 0; point < num_rollouts; point++) { dynamics.initializeDynamics(state, control, output, 0, 0); state = state_trajectory.col(point); control = control_trajectory.col(point); state_der_cpu = RacerDubinsElevationLSTMSteering::state_array::Zero(); dynamics2.initializeDynamics(state2, control2, output2, 0, 0); state2 = state_trajectory.col(point); control2 = control_trajectory.col(point); state_der_cpu2 = RacerDubinsElevationLSTMSteering::state_array::Zero(); dynamics.step(state, next_state_cpu, state_der_cpu, control, output, 0, dt); dynamics2.step(state2, next_state_cpu2, state_der_cpu2, control2, output2, 0, dt); for (int dim = 0; dim < RacerDubinsElevationLSTMSteering::STATE_DIM; dim++) { if (dim == S_IND_CLASS(DYN_PARAMS, STEER_ANGLE) or dim == S_IND_CLASS(DYN_PARAMS, STEER_ANGLE_RATE)) { // this is done since the steering wheel setup is different, accel version continue; } EXPECT_NEAR(state_der_cpu(dim), state_der_cpu2(dim), 1e-4) << "at index " << point << " dim " << dim; EXPECT_NEAR(next_state_cpu(dim), next_state_cpu2(dim), 1e-4) << "at index " << point << " dim " << dim; } for (int dim = 0; dim < RacerDubinsElevationLSTMSteering::OUTPUT_DIM; dim++) { if (dim == O_IND_CLASS(DYN_PARAMS, STEER_ANGLE) or dim == O_IND_CLASS(DYN_PARAMS, STEER_ANGLE_RATE) or dim == O_IND_CLASS(DYN_PARAMS, WHEEL_FORCE_UP_MAX) or dim == O_IND_CLASS(DYN_PARAMS, WHEEL_FORCE_FWD_MAX) or dim == 
O_IND_CLASS(DYN_PARAMS, WHEEL_FORCE_SIDE_MAX) or dim == O_IND_CLASS(DYN_PARAMS, FILLER_1)) { // this is done since the steering wheel setup is different, accel version continue; } EXPECT_NEAR(output(dim), output2(dim), 1e-4) << "at index " << point << " dim " << dim; } } dynamics.freeCudaMem(); } /* class LinearDummy : public RacerDubinsElevationLSTMSteering { public: bool computeGrad(const Eigen::Ref & state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B) { return false; }; }; TEST_F(RacerDubinsElevationLSTMSteeringTest, TestComputeGradComputation) { Eigen::Matrix numeric_jac; Eigen::Matrix analytic_jac; RacerDubinsElevationLSTMSteering::state_array state; state << 1, 2, 3, 4; RacerDubinsElevationLSTMSteering::control_array control; control << 5; auto analytic_grad_model = RacerDubinsElevationLSTMSteering(); RacerDubinsElevationLSTMSteering::dfdx A_analytic = RacerDubinsElevationLSTMSteering::dfdx::Zero(); RacerDubinsElevationLSTMSteering::dfdu B_analytic = RacerDubinsElevationLSTMSteering::dfdu::Zero(); analytic_grad_model.computeGrad(state, control, A_analytic, B_analytic); auto numerical_grad_model = LinearDummy(); std::shared_ptr> ddp_model = std::make_shared>(&numerical_grad_model); analytic_jac.leftCols() = A_analytic; analytic_jac.rightCols() = B_analytic; numeric_jac = ddp_model->df(state, control); ASSERT_LT((numeric_jac - analytic_jac).norm(), 1e-3) << "Numeric Jacobian\n" << numeric_jac << "\nAnalytic Jacobian\n" << analytic_jac; } */ ================================================ FILE: tests/dynamics/racer_dubins_elevation_lstm_uncertainty_model_test.cu ================================================ #include #include #include #include #include #include #include #include class RacerDubinsElevationLSTMUncertaintyTest : public ::testing::Test { public: cudaStream_t stream; cudaExtent extent = make_cudaExtent(10, 20, 0); float map_resolution = 10.0f; // [m / px] float3 origin = make_float3(1, 2, 3); // [m, m, m] std::vector data_vec; 
std::vector normal_vec; void SetUp() override { CudaCheckError(); HANDLE_ERROR(cudaStreamCreate(&stream)); steer_config.init_config.input_dim = 3; steer_config.init_config.hidden_dim = 20; steer_config.init_config.output_layers = { 23, 100, 8 }; steer_config.pred_config.input_dim = 4; steer_config.pred_config.hidden_dim = 4; steer_config.pred_config.output_layers = { 8, 20, 1 }; steer_config.init_len = 11; unc_config.init_config.input_dim = 10; unc_config.init_config.hidden_dim = 20; unc_config.init_config.output_layers = { 30, 100, 8 }; unc_config.pred_config.input_dim = 13; unc_config.pred_config.hidden_dim = 4; unc_config.pred_config.output_layers = { 17, 20, 5 }; unc_config.init_len = 11; mean_config.init_config.input_dim = 10; mean_config.init_config.hidden_dim = 20; mean_config.init_config.output_layers = { 30, 100, 8 }; mean_config.pred_config.input_dim = 12; mean_config.pred_config.hidden_dim = 4; mean_config.pred_config.output_layers = { 16, 20, 2 }; mean_config.init_len = 11; data_vec.resize(extent.width * extent.height); normal_vec.resize(extent.width * extent.height); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = i * 1.0f; Eigen::Vector4f random_normal = Eigen::Vector4f::Random(); normal_vec[i] = make_float4(random_normal.x(), random_normal.y(), random_normal.z(), random_normal.w()); } } void TearDown() override { CudaCheckError(); HANDLE_ERROR(cudaStreamDestroy(stream)); } LSTMLSTMConfig steer_config; LSTMLSTMConfig mean_config; LSTMLSTMConfig unc_config; }; TEST_F(RacerDubinsElevationLSTMUncertaintyTest, Template) { auto dynamics = RacerDubinsElevationLSTMUncertainty(steer_config, mean_config, unc_config); EXPECT_EQ(26, RacerDubinsElevationLSTMUncertainty::STATE_DIM); EXPECT_EQ(2, RacerDubinsElevationLSTMUncertainty::CONTROL_DIM); EXPECT_TRUE(dynamics.checkRequiresBuffer()); EXPECT_NE(dynamics.getTextureHelper(), nullptr); EXPECT_NE(dynamics.getHelper(), nullptr); EXPECT_NE(dynamics.getUncertaintyHelper(), nullptr); 
EXPECT_NE(dynamics.getMeanHelper(), nullptr); } TEST_F(RacerDubinsElevationLSTMUncertaintyTest, BindStreamConstructor) { auto dynamics = RacerDubinsElevationLSTMUncertainty(steer_config, mean_config, unc_config, stream); EXPECT_EQ(dynamics.stream_, stream) << "Stream binding failure."; EXPECT_NE(dynamics.getTextureHelper(), nullptr); EXPECT_EQ(dynamics.getTextureHelper()->stream_, stream); EXPECT_EQ(dynamics.getHelper()->getLSTMModel()->stream_, stream); EXPECT_EQ(dynamics.getMeanHelper()->getLSTMModel()->stream_, stream); EXPECT_EQ(dynamics.getUncertaintyHelper()->getLSTMModel()->stream_, stream); } // TEST_F(RacerDubinsElevationLSTMUncertaintyTest, Deconstructor) // { // { // auto dynamics = RacerDubinsElevationLSTMUncertainty(steer_config, mean_config, unc_config); // dynamics = RacerDubinsElevationLSTMUncertainty(steer_config, mean_config, unc_config); // } // } TEST_F(RacerDubinsElevationLSTMUncertaintyTest, BindStream) { auto dynamics = RacerDubinsElevationLSTMUncertainty(steer_config, mean_config, unc_config); dynamics.bindToStream(stream); EXPECT_EQ(dynamics.stream_, stream) << "Stream binding failure."; EXPECT_NE(dynamics.getTextureHelper(), nullptr); EXPECT_EQ(dynamics.getTextureHelper()->stream_, stream); EXPECT_EQ(dynamics.getHelper()->getLSTMModel()->stream_, stream); EXPECT_EQ(dynamics.getMeanHelper()->getLSTMModel()->stream_, stream); EXPECT_EQ(dynamics.getUncertaintyHelper()->getLSTMModel()->stream_, stream); } TEST_F(RacerDubinsElevationLSTMUncertaintyTest, GPUSetup) { auto dynamics = RacerDubinsElevationLSTMUncertainty(steer_config, mean_config, unc_config, stream); dynamics.GPUSetup(); EXPECT_EQ(dynamics.stream_, stream) << "Stream binding failure."; EXPECT_NE(dynamics.getTextureHelper(), nullptr); EXPECT_EQ(dynamics.getTextureHelper()->stream_, stream); EXPECT_EQ(dynamics.getHelper()->getLSTMModel()->stream_, stream); EXPECT_EQ(dynamics.getMeanHelper()->getLSTMModel()->stream_, stream); 
EXPECT_EQ(dynamics.getUncertaintyHelper()->getLSTMModel()->stream_, stream); EXPECT_EQ(dynamics.GPUMemStatus_, true); EXPECT_NE(dynamics.model_d_, nullptr); // steering model EXPECT_NE(dynamics.network_d_, nullptr); EXPECT_EQ(dynamics.getHelper()->getLSTMModel()->GPUMemStatus_, true); EXPECT_NE(dynamics.getHelper()->getLSTMDevicePtr(), nullptr); EXPECT_NE(dynamics.getHelper()->getLSTMModel()->network_d_, nullptr); // mean model EXPECT_NE(dynamics.mean_d_, nullptr); EXPECT_EQ(dynamics.getMeanHelper()->getLSTMModel()->GPUMemStatus_, true); EXPECT_NE(dynamics.getMeanHelper()->getLSTMDevicePtr(), nullptr); EXPECT_NE(dynamics.getMeanHelper()->getLSTMModel()->network_d_, nullptr); // mean model EXPECT_NE(dynamics.uncertainty_d_, nullptr); EXPECT_EQ(dynamics.getUncertaintyHelper()->getLSTMModel()->GPUMemStatus_, true); EXPECT_NE(dynamics.getUncertaintyHelper()->getLSTMDevicePtr(), nullptr); EXPECT_NE(dynamics.getUncertaintyHelper()->getLSTMModel()->network_d_, nullptr); } // TODO test case that checks that I get the same as base class here TEST_F(RacerDubinsElevationLSTMUncertaintyTest, TestLoadParams) { CudaCheckError(); using DYN = RacerDubinsElevationLSTMUncertainty; RacerDubinsElevationLSTMUncertainty dynamics = RacerDubinsElevationLSTMUncertainty(mppi::tests::racer_dubins_elevation_uncertainty_test); EXPECT_FLOAT_EQ(dynamics.getParams().max_steer_angle, 5.0); EXPECT_FLOAT_EQ(dynamics.getParams().max_steer_rate, 17.590296); EXPECT_FLOAT_EQ(dynamics.getParams().steering_constant, 3.286375); EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_constant, 9.30152702); EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_drag_constant, -0.60327667); EXPECT_FLOAT_EQ(dynamics.getParams().max_brake_rate_pos, 10); EXPECT_FLOAT_EQ(dynamics.getParams().max_brake_rate_neg, 10); EXPECT_FLOAT_EQ(dynamics.getParams().pos_quad_brake_c[0], 3); EXPECT_FLOAT_EQ(dynamics.getParams().pos_quad_brake_c[1], 0.1); EXPECT_FLOAT_EQ(dynamics.getParams().pos_quad_brake_c[2], 0.48); 
EXPECT_FLOAT_EQ(dynamics.getParams().neg_quad_brake_c[0], 5.8); EXPECT_FLOAT_EQ(dynamics.getParams().neg_quad_brake_c[1], 0.1); EXPECT_FLOAT_EQ(dynamics.getParams().neg_quad_brake_c[2], 1.4); EXPECT_FLOAT_EQ(dynamics.getParams().c_t[0], 3.0364573); EXPECT_FLOAT_EQ(dynamics.getParams().c_t[1], 4.59772491); EXPECT_FLOAT_EQ(dynamics.getParams().c_t[2], 4.06954288); EXPECT_FLOAT_EQ(dynamics.getParams().c_b[0], 2.56373692); EXPECT_FLOAT_EQ(dynamics.getParams().c_b[1], 6.18813848); EXPECT_FLOAT_EQ(dynamics.getParams().c_b[2], 25.52443695); EXPECT_FLOAT_EQ(dynamics.getParams().c_v[0], 4.39438224e-01); EXPECT_FLOAT_EQ(dynamics.getParams().c_v[1], 2.37689335e-02); EXPECT_FLOAT_EQ(dynamics.getParams().c_v[2], 2.12573977e-05); EXPECT_FLOAT_EQ(dynamics.getParams().steer_angle_scale, -13.25719738); EXPECT_FLOAT_EQ(dynamics.getParams().gravity, -3.1667469); EXPECT_FLOAT_EQ(dynamics.getParams().unc_scale[0], 0.5); EXPECT_FLOAT_EQ(dynamics.getParams().unc_scale[1], 0.2); EXPECT_FLOAT_EQ(dynamics.getParams().unc_scale[2], 0.5); EXPECT_FLOAT_EQ(dynamics.getParams().unc_scale[3], 0.02); EXPECT_FLOAT_EQ(dynamics.getParams().unc_scale[4], 0.02); EXPECT_FLOAT_EQ(dynamics.getParams().unc_scale[5], 0.1); EXPECT_FLOAT_EQ(dynamics.getParams().unc_scale[6], 0.1); EXPECT_FLOAT_EQ(dynamics.getParams().low_min_throttle, 0.0); EXPECT_FLOAT_EQ(dynamics.getParams().c_0, 0.0); EXPECT_FLOAT_EQ(dynamics.getParams().wheel_base, 3.0); EXPECT_GT(dynamics.getParams().clamp_ax, 10.0f); EXPECT_EQ(dynamics.getHelper()->getInitModel()->getInputDim(), 3); EXPECT_EQ(dynamics.getHelper()->getInitModel()->getOutputDim(), 8); EXPECT_EQ(dynamics.getHelper()->getInitModel()->getHiddenDim(), 20); EXPECT_EQ(dynamics.getHelper()->getInitModel()->getOutputModel()->getInputDim(), 23); EXPECT_EQ(dynamics.getHelper()->getInitModel()->getOutputModel()->getOutputDim(), 8); EXPECT_EQ(dynamics.getHelper()->getLSTMModel()->getInputDim(), 4); EXPECT_EQ(dynamics.getHelper()->getLSTMModel()->getOutputDim(), 1); 
EXPECT_EQ(dynamics.getHelper()->getLSTMModel()->getHiddenDim(), 4); EXPECT_EQ(dynamics.getHelper()->getLSTMModel()->getOutputModel()->getInputDim(), 8); EXPECT_EQ(dynamics.getHelper()->getLSTMModel()->getOutputModel()->getOutputDim(), 1); EXPECT_EQ(dynamics.getHelper()->getInitLen(), 11); EXPECT_EQ(dynamics.getMeanHelper()->getInitModel()->getInputDim(), 9); EXPECT_EQ(dynamics.getMeanHelper()->getInitModel()->getOutputDim(), 8); EXPECT_EQ(dynamics.getMeanHelper()->getInitModel()->getHiddenDim(), 20); EXPECT_EQ(dynamics.getMeanHelper()->getInitModel()->getOutputModel()->getInputDim(), 29); EXPECT_EQ(dynamics.getMeanHelper()->getInitModel()->getOutputModel()->getOutputDim(), 8); EXPECT_EQ(dynamics.getMeanHelper()->getLSTMModel()->getInputDim(), 11); EXPECT_EQ(dynamics.getMeanHelper()->getLSTMModel()->getOutputDim(), 2); EXPECT_EQ(dynamics.getMeanHelper()->getLSTMModel()->getHiddenDim(), 4); EXPECT_EQ(dynamics.getMeanHelper()->getLSTMModel()->getOutputModel()->getInputDim(), 15); EXPECT_EQ(dynamics.getMeanHelper()->getLSTMModel()->getOutputModel()->getOutputDim(), 2); EXPECT_EQ(dynamics.getMeanHelper()->getInitLen(), 11); EXPECT_EQ(dynamics.getUncertaintyHelper()->getInitModel()->getInputDim(), 10); EXPECT_EQ(dynamics.getUncertaintyHelper()->getInitModel()->getOutputDim(), 8); EXPECT_EQ(dynamics.getUncertaintyHelper()->getInitModel()->getHiddenDim(), 20); EXPECT_EQ(dynamics.getUncertaintyHelper()->getInitModel()->getOutputModel()->getInputDim(), 30); EXPECT_EQ(dynamics.getUncertaintyHelper()->getInitModel()->getOutputModel()->getOutputDim(), 8); EXPECT_EQ(dynamics.getUncertaintyHelper()->getLSTMModel()->getInputDim(), 12); EXPECT_EQ(dynamics.getUncertaintyHelper()->getLSTMModel()->getOutputDim(), 7); EXPECT_EQ(dynamics.getUncertaintyHelper()->getLSTMModel()->getHiddenDim(), 4); EXPECT_EQ(dynamics.getUncertaintyHelper()->getLSTMModel()->getOutputModel()->getInputDim(), 16); EXPECT_EQ(dynamics.getUncertaintyHelper()->getLSTMModel()->getOutputModel()->getOutputDim(), 7); 
EXPECT_EQ(dynamics.getUncertaintyHelper()->getInitLen(), 11); } TEST_F(RacerDubinsElevationLSTMUncertaintyTest, TestGPUvsCPU) { CudaCheckError(); using DYN = RacerDubinsElevationLSTMUncertainty; RacerDubinsElevationLSTMUncertainty dynamics = RacerDubinsElevationLSTMUncertainty(mppi::tests::racer_dubins_elevation_uncertainty_test); std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); TwoDTextureHelper* helper = dynamics.getTextureHelper(); TwoDTextureHelper* normal_helper = dynamics.getTextureHelperNormals(); helper->updateRotation(0, new_rot_mat); helper->updateOrigin(0, origin); helper->updateTexture(0, data_vec, extent); helper->updateResolution(0, map_resolution); helper->enableTexture(0); helper->copyToDevice(false); normal_helper->updateRotation(0, new_rot_mat); normal_helper->updateOrigin(0, origin); normal_helper->updateTexture(0, normal_vec, extent); normal_helper->updateResolution(0, map_resolution); normal_helper->enableTexture(0); normal_helper->copyToDevice(true); DYN::buffer_trajectory buffer; buffer["VEL_X"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE_RATE"] = Eigen::VectorXf::Random(51); buffer["CAN_STEER_CMD"] = Eigen::VectorXf::Random(51); buffer["ROLL"] = Eigen::VectorXf::Random(51); buffer["PITCH"] = Eigen::VectorXf::Random(51); buffer["CAN_THROTTLE_CMD"] = Eigen::VectorXf::Random(51); buffer["BRAKE_STATE"] = Eigen::VectorXf::Random(51); buffer["CAN_BRAKE_CMD"] = Eigen::VectorXf::Random(51); buffer["OMEGA_Z"] = Eigen::VectorXf::Random(51); checkGPUComputationStep(dynamics, 0.02f, 16, 32, buffer, 1.0e-4); } TEST_F(RacerDubinsElevationLSTMUncertaintyTest, TestStepGPUvsCPUReverse) { using DYN = RacerDubinsElevationLSTMUncertainty; CudaCheckError(); using DYN = RacerDubinsElevationLSTMUncertainty; RacerDubinsElevationLSTMUncertainty dynamics = 
// NOTE(review): extraction artifact — template argument lists (e.g.
// std::array<float3, 3>, TwoDTextureHelper<float>, static_cast<int>) appear to
// have been stripped from this text, and original newlines were collapsed, so
// trailing // comments swallow code that follows on the same physical line.
// Restore against the original source before compiling.
// Tail of the preceding GPU-vs-CPU comparison test: configures the elevation
// and surface-normal textures, fills a 51-sample history buffer with random
// signals, then calls checkGPUComputationStep to compare CPU and GPU step()
// results to 1e-2. TestMatchesPython begins further down this line.
RacerDubinsElevationLSTMUncertainty(mppi::tests::racer_dubins_elevation_uncertainty_test); auto params = dynamics.getParams(); params.gear_sign = -1; dynamics.setParams(params); std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); TwoDTextureHelper* helper = dynamics.getTextureHelper(); TwoDTextureHelper* normal_helper = dynamics.getTextureHelperNormals(); helper->updateRotation(0, new_rot_mat); helper->updateOrigin(0, origin); helper->updateTexture(0, data_vec, extent); helper->updateResolution(0, map_resolution); helper->enableTexture(0); helper->copyToDevice(false); normal_helper->updateRotation(0, new_rot_mat); normal_helper->updateOrigin(0, origin); normal_helper->updateTexture(0, normal_vec, extent); normal_helper->updateResolution(0, map_resolution); normal_helper->enableTexture(0); normal_helper->copyToDevice(true); DYN::buffer_trajectory buffer; buffer["VEL_X"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE_RATE"] = Eigen::VectorXf::Random(51); buffer["CAN_STEER_CMD"] = Eigen::VectorXf::Random(51); buffer["ROLL"] = Eigen::VectorXf::Random(51); buffer["PITCH"] = Eigen::VectorXf::Random(51); buffer["CAN_THROTTLE_CMD"] = Eigen::VectorXf::Random(51); buffer["BRAKE_STATE"] = Eigen::VectorXf::Random(51); buffer["CAN_BRAKE_CMD"] = Eigen::VectorXf::Random(51); buffer["OMEGA_Z"] = Eigen::VectorXf::Random(51); checkGPUComputationStep(dynamics, 0.02f, 16, 32, buffer, 1.0e-2); } TEST_F(RacerDubinsElevationLSTMUncertaintyTest, TestMatchesPython) { using DYN = RacerDubinsElevationLSTMUncertainty; CudaCheckError(); using DYN = RacerDubinsElevationLSTMUncertainty; RacerDubinsElevationLSTMUncertainty dynamics = RacerDubinsElevationLSTMUncertainty(mppi::tests::racer_dubins_elevation_uncertainty_test); auto params = dynamics.getParams(); params.K_x = 0.0f; params.K_y = 0.0f; params.K_yaw = 0.0f; params.K_vel_x = 0.0f;
// NOTE(review): the alias `using DYN = RacerDubinsElevationLSTMUncertainty;`
// above is declared twice in a row — harmless (same-type redeclaration is
// legal) but one of them should be removed.
// TestMatchesPython: zero the K_* feedback gains and set the wheel base, then
// replay trajectories recorded from the Python reference model (loaded from
// the npz file below) and check the C++ dynamics reproduce them. The texture
// and buffer setup mirrors the previous test.
params.wheel_base = 3.0f; dynamics.setParams(params); std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); TwoDTextureHelper* helper = dynamics.getTextureHelper(); TwoDTextureHelper* normal_helper = dynamics.getTextureHelperNormals(); helper->updateRotation(0, new_rot_mat); helper->updateOrigin(0, origin); helper->updateTexture(0, data_vec, extent); helper->updateResolution(0, map_resolution); helper->enableTexture(0); helper->copyToDevice(false); normal_helper->updateRotation(0, new_rot_mat); normal_helper->updateOrigin(0, origin); normal_helper->updateTexture(0, normal_vec, extent); normal_helper->updateResolution(0, map_resolution); normal_helper->enableTexture(0); normal_helper->copyToDevice(true); DYN::buffer_trajectory buffer; buffer["VEL_X"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE"] = Eigen::VectorXf::Random(51); buffer["STEER_ANGLE_RATE"] = Eigen::VectorXf::Random(51); buffer["CAN_STEER_CMD"] = Eigen::VectorXf::Random(51); buffer["ROLL"] = Eigen::VectorXf::Random(51); buffer["PITCH"] = Eigen::VectorXf::Random(51); buffer["CAN_THROTTLE_CMD"] = Eigen::VectorXf::Random(51); buffer["BRAKE_STATE"] = Eigen::VectorXf::Random(51); buffer["CAN_BRAKE_CMD"] = Eigen::VectorXf::Random(51); buffer["OMEGA_Z"] = Eigen::VectorXf::Random(51); EXPECT_TRUE(fileExists(mppi::tests::racer_dubins_elevation_uncertainty_test)); if (!fileExists(mppi::tests::racer_dubins_elevation_uncertainty_test)) { std::cerr << "Could not load neural net model at path: " << mppi::tests::racer_dubins_elevation_uncertainty_test.c_str(); exit(-1); } cnpy::npz_t param_dict = cnpy::npz_load(mppi::tests::racer_dubins_elevation_uncertainty_test); float T = param_dict.at("T").data()[0]; float dt = param_dict.at("dt").data()[0]; float init_length = param_dict.at("init_length").data()[0] + 1; int traj_length = T / dt; int num_points = static_cast(param_dict.at("num_points").data()[0]); int input_dim =
// Read model dimensions and the recorded init/state/input/output arrays from
// the npz dictionary. Per-network hidden/cell references are rebound below;
// init_hidden/init_cell start null and hidden_dim starts 0 until then.
static_cast(param_dict.at("input_dim").data()[0]); int state_dim = static_cast(param_dict.at("state_dim").data()[0]); int state_dim_raw = static_cast(param_dict.at("state_dim_raw").data()[0]); int output_dim = static_cast(param_dict.at("output_dim").data()[0]); int output_dim_raw = static_cast(param_dict.at("output_dim_raw").data()[0]); double* init_inputs = param_dict.at("init_input").data(); double* init_states = param_dict.at("init_state").data(); double* states = param_dict.at("state").data(); double* inputs = param_dict.at("input").data(); double* outputs = param_dict.at("output").data(); // double* inputs = param_dict.at("input").data(); // double* outputs = param_dict.at("output").data(); double* init_hidden = nullptr; // = param_dict.at("init/hidden").data(); double* init_cell = nullptr; // = param_dict.at("init/cell").data(); int hidden_dim = 0; // double* hidden = param_dict.at("hidden").data(); // double* cell = param_dict.at("cell").data(); for (int point = 0; point < num_points; point++) { // std::cout << "\n\n\n\n\n\n\n================= on point " << point << "=============" << std::endl; // sets up the buffer correctly for (int t = 0; t < init_length; t++) { int buffer_index = (51 - init_length) + t; int init_input_shift = point * init_length * input_dim + t * input_dim; buffer["CAN_THROTTLE_CMD"](buffer_index) = init_inputs[init_input_shift + 0]; buffer["CAN_BRAKE_CMD"](buffer_index) = init_inputs[init_input_shift + 1]; buffer["CAN_STEER_CMD"](buffer_index) = init_inputs[init_input_shift + 2]; buffer["PITCH"](buffer_index) = init_inputs[init_input_shift + 3]; buffer["ROLL"](buffer_index) = init_inputs[init_input_shift + 4]; int init_state_shift = point * init_length * state_dim_raw + t * state_dim_raw; buffer["VEL_X"](buffer_index) = init_states[init_state_shift + 3]; buffer["OMEGA_Z"](buffer_index) = init_states[init_state_shift + 5]; buffer["BRAKE_STATE"](buffer_index) = init_states[init_state_shift + 6]; buffer["STEER_ANGLE"](buffer_index) =
// Per test point: the loop above seeds the tail of the 51-sample history
// buffer from the recorded init sequence; below, the initial state is built
// from the recorded trajectory (uncertainty entries seeded at 1e-5) and
// initializeDynamics is run (after updateFromBuffer, when required) so the
// LSTM hidden/cell states are populated from the history.
init_states[init_state_shift + 7]; buffer["STEER_ANGLE_RATE"](buffer_index) = init_states[init_state_shift + 8]; } typename DYN::state_array state = DYN::state_array::Zero(); typename DYN::state_array next_state = DYN::state_array::Zero(); typename DYN::control_array control = DYN::control_array::Zero(); typename DYN::state_array state_der = DYN::state_array::Zero(); typename DYN::output_array output_array = DYN::output_array::Zero(); int state_shift = point * traj_length * state_dim; state(S_IND_CLASS(DYN::DYN_PARAMS_T, POS_X)) = states[state_shift + 0]; state(S_IND_CLASS(DYN::DYN_PARAMS_T, POS_Y)) = states[state_shift + 1]; state(S_IND_CLASS(DYN::DYN_PARAMS_T, YAW)) = states[state_shift + 2]; state(S_IND_CLASS(DYN::DYN_PARAMS_T, VEL_X)) = states[state_shift + 3]; state(S_IND_CLASS(DYN::DYN_PARAMS_T, OMEGA_Z)) = states[state_shift + 5]; state(S_IND_CLASS(DYN::DYN_PARAMS_T, BRAKE_STATE)) = states[state_shift + 6]; state(S_IND_CLASS(DYN::DYN_PARAMS_T, STEER_ANGLE)) = states[state_shift + 7]; state(S_IND_CLASS(DYN::DYN_PARAMS_T, STEER_ANGLE_RATE)) = states[state_shift + 8]; state(S_IND_CLASS(DYN::DYN_PARAMS_T, UNCERTAINTY_POS_X)) = 1.0e-5; state(S_IND_CLASS(DYN::DYN_PARAMS_T, UNCERTAINTY_POS_Y)) = 1.0e-5; state(S_IND_CLASS(DYN::DYN_PARAMS_T, UNCERTAINTY_YAW)) = 1.0e-5; state(S_IND_CLASS(DYN::DYN_PARAMS_T, UNCERTAINTY_VEL_X)) = 1.0e-5; // std::cout << "got initial state " << state.transpose() << std::endl; assert(state(S_IND_CLASS(DYN::DYN_PARAMS_T, BRAKE_STATE)) >= 0.0f && state(S_IND_CLASS(DYN::DYN_PARAMS_T, BRAKE_STATE)) <= 0.25f); assert(state(S_IND_CLASS(DYN::DYN_PARAMS_T, STEER_ANGLE)) >= -5 && state(S_IND_CLASS(DYN::DYN_PARAMS_T, STEER_ANGLE)) <= 5); if (dynamics.checkRequiresBuffer()) { dynamics.updateFromBuffer(buffer); } dynamics.initializeDynamics(state, control, output_array, 0, dt); // checks init values for hidden and cell states hidden_dim = dynamics.getHelper()->getLSTMModel()->getHiddenDim(); init_hidden = param_dict.at("init/steering/hidden").data();
// Compare the post-init hidden/cell state of each sub-network (steering,
// terra mean, terra uncertainty) against the values recorded by Python.
init_cell = param_dict.at("init/steering/cell").data(); for (int i = 0; i < hidden_dim; i++) { EXPECT_NEAR(dynamics.getHelper()->getLSTMModel()->getHiddenState()[i], init_hidden[point * hidden_dim + i], 1.0e-5); EXPECT_NEAR(dynamics.getHelper()->getLSTMModel()->getCellState()[i], init_cell[point * hidden_dim + i], 1.0e-5); } hidden_dim = dynamics.getMeanHelper()->getLSTMModel()->getHiddenDim(); init_hidden = param_dict.at("init/terra/mean_network/hidden").data(); init_cell = param_dict.at("init/terra/mean_network/cell").data(); for (int i = 0; i < hidden_dim; i++) { EXPECT_NEAR(dynamics.getMeanHelper()->getLSTMModel()->getHiddenState()[i], init_hidden[point * hidden_dim + i], 1.0e-5); EXPECT_NEAR(dynamics.getMeanHelper()->getLSTMModel()->getCellState()[i], init_cell[point * hidden_dim + i], 1.0e-5); } hidden_dim = dynamics.getUncertaintyHelper()->getLSTMModel()->getHiddenDim(); init_hidden = param_dict.at("init/terra/uncertainty_network/hidden").data(); init_cell = param_dict.at("init/terra/uncertainty_network/cell").data(); for (int i = 0; i < hidden_dim; i++) { EXPECT_NEAR(dynamics.getUncertaintyHelper()->getLSTMModel()->getHiddenState()[i], init_hidden[point * hidden_dim + i], 1.0e-5); EXPECT_NEAR(dynamics.getUncertaintyHelper()->getLSTMModel()->getCellState()[i], init_cell[point * hidden_dim + i], 1.0e-5); } // TODO find a way to scale the tol as a function of number of computations // TODO scale tolerance by the number of timesteps float tol = 1.0e-5; // run init part of the networks and check hidden and cell states there for (int t = 1; t < 50; t++) { int input_shift = point * traj_length * input_dim + (t - 1) * input_dim; control(C_IND_CLASS(DYN::DYN_PARAMS_T, THROTTLE_BRAKE)) = inputs[input_shift + 0] - inputs[input_shift + 1]; control(C_IND_CLASS(DYN::DYN_PARAMS_T, STEER_CMD)) = inputs[input_shift + 2]; state(S_IND_CLASS(DYN::DYN_PARAMS_T, STATIC_PITCH)) = inputs[input_shift + 3]; state(S_IND_CLASS(DYN::DYN_PARAMS_T, STATIC_ROLL)) = inputs[input_shift +
// Roll the dynamics forward one step at a time, feeding the recorded controls
// (throttle minus brake, steering) and pitch/roll, and compare the state
// derivative entries against the recorded network outputs.
4]; state(S_IND_CLASS(DYN::DYN_PARAMS_T, PITCH)) = inputs[input_shift + 3]; state(S_IND_CLASS(DYN::DYN_PARAMS_T, ROLL)) = inputs[input_shift + 4]; // std::cout << "at time t " << t - 1 << " got control " << control.transpose() << std::endl; // std::cout << "at time t " << t - 1 << " got state " << state.transpose() << std::endl; // std::cout << "at time t " << t - 1 << " got vx " << state(S_IND_CLASS(DYN::DYN_PARAMS_T, VEL_X)) << std::endl; dynamics.step(state, next_state, state_der, control, output_array, 0.0f, dt); int output_shift = point * traj_length * output_dim + (t - 1) * output_dim; EXPECT_NEAR(state_der(S_IND_CLASS(DYN::DYN_PARAMS_T, VEL_X)), outputs[output_shift + 0], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, VEL_X) << " at time " << t; EXPECT_NEAR(state_der(S_IND_CLASS(DYN::DYN_PARAMS_T, YAW)), outputs[output_shift + 2], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, YAW) << " at time " << t; EXPECT_NEAR(state_der(S_IND_CLASS(DYN::DYN_PARAMS_T, BRAKE_STATE)), outputs[output_shift + 3], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, BRAKE_STATE) << " at time " << t; EXPECT_NEAR(state_der(S_IND_CLASS(DYN::DYN_PARAMS_T, STEER_ANGLE_RATE)), outputs[output_shift + 4], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, STEER_ANGLE_RATE) << " at time " << t; state_shift = point * traj_length * state_dim + t * state_dim; EXPECT_NEAR(next_state(S_IND_CLASS(DYN::DYN_PARAMS_T, POS_X)), states[state_shift + 0], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, POS_X) << " at time " << t; EXPECT_NEAR(next_state(S_IND_CLASS(DYN::DYN_PARAMS_T, POS_Y)), states[state_shift + 1], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, POS_Y) << " at time " << t; EXPECT_NEAR(next_state(S_IND_CLASS(DYN::DYN_PARAMS_T, YAW)),
// Compare each propagated next-state dimension against the recorded
// trajectory; yaw is normalized before comparison, and the uncertainty
// entries are read from the block appended after the raw state (offsets
// state_dim_raw + 0/5/10/15 — presumably the diagonal of a 4x4 covariance
// stored row-major; verify against the Python exporter).
angle_utils::normalizeAngle(states[state_shift + 2]), tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, YAW) << " at time " << t; EXPECT_NEAR(next_state(S_IND_CLASS(DYN::DYN_PARAMS_T, VEL_X)), states[state_shift + 3], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, VEL_X) << " at time " << t; EXPECT_NEAR(next_state(S_IND_CLASS(DYN::DYN_PARAMS_T, OMEGA_Z)), states[state_shift + 5], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, OMEGA_Z) << " at time " << t; EXPECT_NEAR(next_state(S_IND_CLASS(DYN::DYN_PARAMS_T, BRAKE_STATE)), states[state_shift + 6], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, BRAKE_STATE) << " at time " << t; EXPECT_NEAR(next_state(S_IND_CLASS(DYN::DYN_PARAMS_T, STEER_ANGLE)), states[state_shift + 7], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, STEER_ANGLE) << " at time " << t; EXPECT_NEAR(next_state(S_IND_CLASS(DYN::DYN_PARAMS_T, STEER_ANGLE_RATE)), states[state_shift + 8], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, STEER_ANGLE_RATE) << " at time " << t; EXPECT_NEAR(next_state(S_IND_CLASS(DYN::DYN_PARAMS_T, UNCERTAINTY_POS_X)), states[state_shift + state_dim_raw], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, UNCERTAINTY_POS_X) << " at time " << t; EXPECT_NEAR(next_state(S_IND_CLASS(DYN::DYN_PARAMS_T, UNCERTAINTY_POS_Y)), states[state_shift + state_dim_raw + 5], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, UNCERTAINTY_POS_Y) << " at time " << t; EXPECT_NEAR(next_state(S_IND_CLASS(DYN::DYN_PARAMS_T, UNCERTAINTY_YAW)), states[state_shift + state_dim_raw + 10], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, UNCERTAINTY_YAW) << " at time " << t;
// After the per-step state checks, verify each network's post-step hidden and
// cell state against the recorded "pred/..." arrays. NOTE(review): the
// failure messages all say "steering ... lstm" even for the terra mean and
// uncertainty networks — consider distinguishing them.
EXPECT_NEAR(next_state(S_IND_CLASS(DYN::DYN_PARAMS_T, UNCERTAINTY_VEL_X)), states[state_shift + state_dim_raw + 15], tol) << "at sample " << point << ", next state dim: " << S_IND_CLASS(DYN::DYN_PARAMS_T, UNCERTAINTY_VEL_X) << " at time " << t; // checks init values for hidden and cell states hidden_dim = dynamics.getHelper()->getLSTMModel()->getHiddenDim(); double* hidden = param_dict.at("pred/steering/hidden").data(); double* cell = param_dict.at("pred/steering/cell").data(); int hidden_shift = point * traj_length * hidden_dim + (t - 1) * hidden_dim; for (int i = 0; i < hidden_dim; i++) { EXPECT_NEAR(dynamics.getHelper()->getLSTMModel()->getHiddenState()[i], hidden[hidden_shift + i], tol) << "steering hidden lstm at point " << point << " dim " << i; EXPECT_NEAR(dynamics.getHelper()->getLSTMModel()->getCellState()[i], cell[hidden_shift + i], tol) << "steering cell lstm at point " << point << " dim " << i; } hidden_dim = dynamics.getMeanHelper()->getLSTMModel()->getHiddenDim(); hidden = param_dict.at("pred/terra/mean_network/hidden").data(); cell = param_dict.at("pred/terra/mean_network/cell").data(); hidden_shift = point * traj_length * hidden_dim + (t - 1) * hidden_dim; for (int i = 0; i < hidden_dim; i++) { EXPECT_NEAR(dynamics.getMeanHelper()->getLSTMModel()->getHiddenState()[i], hidden[hidden_shift + i], tol) << "steering hidden lstm at point " << point << " dim " << i; EXPECT_NEAR(dynamics.getMeanHelper()->getLSTMModel()->getCellState()[i], cell[hidden_shift + i], tol) << "steering cell lstm at point " << point << " dim " << i; } hidden_dim = dynamics.getUncertaintyHelper()->getLSTMModel()->getHiddenDim(); hidden = param_dict.at("pred/terra/uncertainty_network/hidden").data(); cell = param_dict.at("pred/terra/uncertainty_network/cell").data(); hidden_shift = point * traj_length * hidden_dim + (t - 1) * hidden_dim; for (int i = 0; i < hidden_dim; i++) { EXPECT_NEAR(dynamics.getUncertaintyHelper()->getLSTMModel()->getHiddenState()[i], hidden[hidden_shift + i],
// End of TestMatchesPython: carry next_state forward and loosen the tolerance
// slightly each step (tol += 5e-4 / traj_length) to absorb accumulated
// float/double drift against the Python reference.
// NOTE(review): the "================ FILE: ... ================" banner below
// is a concatenation separator from the repo extraction, and the #include
// directives that follow lost their <...> targets — both must be repaired
// before this text can compile.
tol) << "steering hidden lstm at point " << point << " dim " << i; EXPECT_NEAR(dynamics.getUncertaintyHelper()->getLSTMModel()->getCellState()[i], cell[hidden_shift + i], tol) << "steering cell lstm at point " << point << " dim " << i; } state = next_state; tol += 5.0e-4 / traj_length; } } } ================================================ FILE: tests/dynamics/racer_dubins_elevation_model_test.cu ================================================ #include #include #include #include #include #include class RacerDubinsElevationTest : public ::testing::Test { public: cudaStream_t stream; void SetUp() override { CudaCheckError(); HANDLE_ERROR(cudaStreamCreate(&stream)); } void TearDown() override { CudaCheckError(); HANDLE_ERROR(cudaStreamDestroy(stream)); } }; TEST_F(RacerDubinsElevationTest, Template) { auto dynamics = RacerDubinsElevation(); EXPECT_EQ(19, RacerDubinsElevation::STATE_DIM); EXPECT_EQ(2, RacerDubinsElevation::CONTROL_DIM); EXPECT_NE(dynamics.getTextureHelper(), nullptr); } TEST_F(RacerDubinsElevationTest, BindStream) { auto dynamics = RacerDubinsElevation(stream); EXPECT_EQ(dynamics.stream_, stream) << "Stream binding failure."; EXPECT_NE(dynamics.getTextureHelper(), nullptr); EXPECT_EQ(dynamics.getTextureHelper()->stream_, stream); } /* float c_t = 1.3; float c_b = 2.5; float c_v = 3.7; float c_0 = 4.9; float wheel_base = 0.3; */ // TEST_F(RacerDubinsElevationTest, ComputeDynamics) // { // RacerDubinsElevation dynamics = RacerDubinsElevation(); // auto params = dynamics.getParams(); // RacerDubinsElevation::state_array x = RacerDubinsElevation::state_array::Zero(); // RacerDubinsElevation::control_array u = RacerDubinsElevation::control_array::Zero(); // // computeDynamics should not touch the roll/pitch element // RacerDubinsElevation::state_array next_x = RacerDubinsElevation::state_array::Ones() * 0.153; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), 0);
// The fixture above creates/destroys a CUDA stream per test; Template checks
// compile-time dims (STATE_DIM 19, CONTROL_DIM 2) and BindStream checks the
// stream propagates to the texture helper. Everything from here through the
// next TEST_F is a commented-out legacy ComputeDynamics test kept for
// reference (hand-computed expectations using the constants in the /* */
// block above). NOTE(review): the extraction split some "// ..." comment
// lines across physical lines, leaving fragments uncommented — restore the
// original line breaks.
// EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << 1, M_PI_2, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << 1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 2.6 - 4.7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), 1); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << 1, M_PI_2, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << -1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 - 3.5 - 4.7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), 1); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << -1, 0, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << 1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 4.7 + 2.6 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), -1); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << -1, 0, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << -1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 3.5 + 4.7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), -1); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << 7, 0, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << 1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 3.9 - 5.7 * 7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), 7); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4),
// (continuation of the commented-out ComputeDynamics expectations)
0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << -7, 0, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << 1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 3.9 + 5.7 * 7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), -7); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << 7, 0, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << -1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 - 4.5 - 5.7 * 7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), 7); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << -7, 0, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << -1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 4.5 + 5.7 * 7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), -7); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << 1, M_PI_2, 0, 3, 0, 0.5, -0.5, 0.0, 0.0; // u << 0, 1; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 - 4.7 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), (1 / .3) * tan(0)); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), 1); // EXPECT_FLOAT_EQ(next_x(4), 5 * 0.6); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << -1, M_PI_2, 0, 3, 5.0, 0.5, -0.5, 0.0, 0.0; // u << -1, -1; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 4.7 + 3.5 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), (-1 / .3) * tan(5.0 / -10.2)); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), -1); // EXPECT_FLOAT_EQ(next_x(4), -5); //
// (continuation: remaining commented-out ComputeDynamics cases, then a
// commented-out TestModelGPU that compared CPU computeDynamics against a GPU
// kernel launch, and the start of a commented-out TestUpdateState)
EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << -0.4, M_PI_2, 0, 3, 5.0, 0.5, -0.5, 0.0, 0.0; // u << -1, -1; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 3.7 * 0.4 + 2.5 * 0.4 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), (-0.4 / .3) * tan(5.0 / -9.1)); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), -0.4); // EXPECT_FLOAT_EQ(next_x(4), -5); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // x << 0.4, M_PI_2, 0, 3, 5.0, 0.5, -0.5, 0.0, 0.0; // u << 0.1, -1; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 - 3.7 * 0.4 - sinf(-0.5) * -9.81); // EXPECT_FLOAT_EQ(next_x(1), (0.4 / .3) * tan(5.0 / -9.1)); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), 0.4); // EXPECT_FLOAT_EQ(next_x(4), -5); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // } // TEST_F(RacerDubinsElevationTest, TestModelGPU) // { // RacerDubinsElevation dynamics = RacerDubinsElevation(); // dynamics.GPUSetup(); // CudaCheckError(); // Eigen::Matrix control_trajectory; // control_trajectory = Eigen::Matrix::Random(); // Eigen::Matrix state_trajectory; // state_trajectory = Eigen::Matrix::Random(); // std::vector> s(100); // std::vector> s_der(100); // // steering, throttle // std::vector> u(100); // for (int state_index = 0; state_index < s.size(); state_index++) // { // for (int dim = 0; dim < s[0].size(); dim++) // { // s[state_index][dim] = state_trajectory.col(state_index)(dim); // } // for (int dim = 0; dim < u[0].size(); dim++) // { // u[state_index][dim] = control_trajectory.col(state_index)(dim); // } // } // // These variables will be changed so initialized to the right size only // // Run dynamics on dynamicsU // // Run dynamics on GPU // for (int y_dim = 1; y_dim <= 4; y_dim++) // { // launchComputeDynamicsTestKernel(dynamics, s, u, s_der, y_dim); // for (int point = 0; point < 100; point++) // { //
RacerDubinsElevation::state_array state = state_trajectory.col(point); // RacerDubinsElevation::control_array control = control_trajectory.col(point); // RacerDubinsElevation::state_array state_der_cpu = RacerDubinsElevation::state_array::Zero(); // dynamics.computeDynamics(state, control, state_der_cpu); // for (int dim = 0; dim < 6; dim++) // { // EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-5) // << "at point " << point << " dim " << dim << " with y_dim " << y_dim; // EXPECT_TRUE(isfinite(s_der[point][dim])); // } // } // } // dynamics.freeCudaMem(); // CudaCheckError(); // } // TEST_F(RacerDubinsElevationTest, TestUpdateState) // { // CudaCheckError(); // RacerDubinsElevation dynamics = RacerDubinsElevation(); // RacerDubinsElevation::state_array state; // RacerDubinsElevation::state_array next_state; // RacerDubinsElevation::state_array state_der; // // TODO add in the elevation map // state << 0, 0, 0, 0, 0, -0.5, 0.5; // state_der << 1, 1, 1, 1, 1, 0, 0; // dynamics.updateState(state, next_state, state_der, 0.1); // EXPECT_TRUE(state_der != RacerDubinsElevation::state_array::Zero()); // EXPECT_FLOAT_EQ(next_state(0), 0.1); // EXPECT_FLOAT_EQ(next_state(1), 0.1); // EXPECT_FLOAT_EQ(next_state(2), 0.1); // EXPECT_FLOAT_EQ(next_state(3), 0.1); // EXPECT_FLOAT_EQ(next_state(4), 0.1); // EXPECT_FLOAT_EQ(next_state(5), 0.0); // EXPECT_FLOAT_EQ(next_state(6), 0.0); // state << 0, M_PI - 0.1, 0, 0, 0, -0.5, 0.5; // state_der << 1, 1, 1, 1, 1; // dynamics.updateState(state, next_state, state_der, 1.0); // EXPECT_TRUE(state_der != RacerDubinsElevation::state_array::Zero()); // EXPECT_FLOAT_EQ(next_state(0), 1.0); // EXPECT_FLOAT_EQ(next_state(1), 1.0 - M_PI - 0.1); // EXPECT_FLOAT_EQ(next_state(2), 1.0); // EXPECT_FLOAT_EQ(next_state(3), 1.0); // EXPECT_FLOAT_EQ(next_state(4), 0.5); // EXPECT_FLOAT_EQ(next_state(5), 0.0); // EXPECT_FLOAT_EQ(next_state(6), 0.0); // state << 0, -M_PI + 0.1, 0, 0, 0, -0.5, 0.5; // state_der << 1, -1, 1, 1, 1; //
dynamics.updateState(state, next_state, state_der, 1.0); // EXPECT_TRUE(state_der != RacerDubinsElevation::state_array::Zero()); // EXPECT_FLOAT_EQ(next_state(0), 1.0); // EXPECT_FLOAT_EQ(next_state(1), M_PI + 0.1 - 1.0); // EXPECT_FLOAT_EQ(next_state(2), 1.0); // EXPECT_FLOAT_EQ(next_state(3), 1.0); // EXPECT_FLOAT_EQ(next_state(4), 0.5); // EXPECT_FLOAT_EQ(next_state(5), 0.0); // EXPECT_FLOAT_EQ(next_state(6), 0.0); // CudaCheckError(); // } TEST_F(RacerDubinsElevationTest, TestStep) { CudaCheckError(); using DYN = RacerDubinsElevation; const float tol = 1e-6; DYN dynamics = DYN(); auto params = dynamics.getParams(); params.c_0 = 0; params.c_b[0] = 1; params.c_b[1] = 10; params.c_b[2] = 100; params.c_v[0] = 0.25; params.c_v[1] = 0.5; params.c_v[2] = 0.75; params.c_t[0] = 2; params.c_t[1] = 20; params.c_t[2] = 200; params.low_min_throttle = 0.2; params.steer_command_angle_scale = 0.5; params.steering_constant = 0.5; params.wheel_base = 0.5; params.max_steer_rate = 5; dynamics.setParams(params); DYN::state_array state; DYN::state_array next_state; DYN::state_array state_der = DYN::state_array::Zero(); DYN::control_array control; DYN::output_array output; float dt = 0.1; // TODO add in the elevation map // Basic initial state and no movement should stay still state << 0, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 0, 0; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_TRUE(state_der == DYN::state_array::Zero()); EXPECT_NEAR(next_state(0), 0.0, tol); EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.0, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), 0.0, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 0.0, tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), 0.0, tol); // check the first index of throttle state << 0.54, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 0.21, 0; dynamics.step(state, 
next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), -0.115, tol); EXPECT_NEAR(next_state(0), 0.5285, tol); EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.054, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), 0.0, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 0.0, tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), -0.115, tol); // check the start of second index of throttle state << 0.56, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 0.01, 0; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), -0.08, tol); EXPECT_NEAR(next_state(0), 0.552, tol); EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.056, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), 0.0, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 0.0, tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), -0.08, tol); // check the start of second index of throttle with different dt state << 0.56, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 0.21, 0; dynamics.step(state, next_state, state_der, control, output, 0, 0.2); EXPECT_NEAR(state_der(0), -0.12, tol); EXPECT_NEAR(next_state(0), 0.536, tol); EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.112, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), 0.0, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 0.0, tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), -0.12, tol); // check the end of second index of throttle state << 2.99, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 0.01, 0; dynamics.step(state, next_state, state_der, control, output, 0, dt); 
EXPECT_NEAR(state_der(0), -1.295, tol); EXPECT_NEAR(next_state(0), 2.8605, tol); EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.299, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), 0.0, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 0.0, tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), -1.295, tol); // check the end of second index of throttle state << 3.01, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 0.01, 0; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), -0.2575, tol); EXPECT_NEAR(next_state(0), 2.98425, tol); EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.301, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), 0.0, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 0.0, tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), -0.2575, tol); // Apply full throttle from zero state state << 0, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 1, 0; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), 1.6, tol); EXPECT_NEAR(next_state(0), 0.16, tol); EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.0, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), 0.0, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 0.0, tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), 1.6, tol); // Apply throttle to a state with positive velocity state << 1, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 1, 0; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), 5.5, tol); EXPECT_NEAR(next_state(0), 1.55, tol); 
EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.1, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), 0.0, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), 0.0, tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), 5.5, tol); // Apply full throttle and half left turn to origin state state << 0, 0, 0, 0, 0, -0.0, 0.0, 0, 0; control << 1, 0.5; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), 1.6, tol); EXPECT_NEAR(next_state(0), 0.16, tol); EXPECT_NEAR(next_state(1), 0.0, tol); EXPECT_NEAR(next_state(2), 0.0, tol); EXPECT_NEAR(next_state(3), 0.0, tol); EXPECT_NEAR(next_state(4), powf(0.5, 3) * dt, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), powf(0.5, 3), tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), 1.6, tol); // Apply full throttle and half left turn to a moving state oriented 30 degrees to the left float yaw = M_PI / 6; state << 1.0, yaw, 0, 0, 0, -0.0, 0.0, 0, 0; control << 1, 0.5; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), 5.5, tol); EXPECT_NEAR(next_state(0), 1.55, tol); EXPECT_NEAR(next_state(1), yaw, tol); EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), powf(0.5, 3) * dt, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), powf(0.5, 3), tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), 5.5, tol); // Apply full throttle and half left turn to a moving state oriented 30 degrees to the left which is already turning float steer_angle = M_PI / 8; state << 1.0, yaw, 0, 0, steer_angle, -0.0, 0.0, 0, 0; control 
<< 1, 0.5; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), 5.5, tol); EXPECT_NEAR(state_der(1), -0.086361105, tol); EXPECT_NEAR(next_state(0), 1.55, tol); EXPECT_NEAR(next_state(1), yaw + -0.086361105 * dt, tol); EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), steer_angle + (0.25 - steer_angle) * 0.5 * dt, tol); EXPECT_NEAR(next_state(5), 0.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), (0.25 - steer_angle) * 0.5, tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), 5.5, tol); // Apply full brake and half left turn to a moving state oriented 30 degrees to the left which is already turning state << 1.0, yaw, 0, 0, steer_angle, 1.0, 0.0, 0, 0; control << -1, 0.5; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(state_der(0), -5.5, tol); EXPECT_NEAR(next_state(0), 1 - 5.5 * dt, tol); EXPECT_NEAR(next_state(1), yaw + -0.086361105 * dt, tol); EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), steer_angle + (0.25 - steer_angle) * 0.5 * dt, tol); EXPECT_NEAR(next_state(5), 1.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), (0.25 - steer_angle) * 0.5, tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), -5.5, tol); /** * Apply full brake and half left turn to a moving state oriented 30 degrees to the left which is already turning * and on a downward facing hill */ float pitch = 20 * M_PI / 180; state << 1.0, yaw, 0, 0, steer_angle, 1.0, 0, pitch, 0, 0; control << -1, 0.5; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(next_state(0), 1 + (-5.5 + 9.81 * sinf(pitch)) * dt, tol); EXPECT_NEAR(next_state(1), yaw + -0.086361105 * dt, tol); 
EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), steer_angle + (0.25 - steer_angle) * 0.5 * dt, tol); EXPECT_NEAR(next_state(5), 1.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), (0.25 - steer_angle) * 0.5, tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), (-5.5 + 9.81 * sinf(pitch)), tol); /** * Apply full brake and half left turn to a backwards moving state oriented 30 degrees to the left which is already * turning and on a downward facing hill */ state << -1.0, yaw, 0, 0, steer_angle, 1.0, 0, pitch, 0, 0; control << -1, 0.5; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(next_state(0), -1 + (5.5 + 9.81 * sinf(pitch)) * dt, tol); EXPECT_NEAR(state_der(1), 0.086361105, tol); EXPECT_NEAR(next_state(1), yaw + 0.086361105 * dt, tol); EXPECT_NEAR(next_state(2), -1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), -1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), steer_angle + (0.25 - steer_angle) * 0.5 * dt, tol); EXPECT_NEAR(next_state(5), 1.0, tol); EXPECT_NEAR(next_state(6), 0.0, tol); EXPECT_NEAR(next_state(7), 0.0, tol); EXPECT_NEAR(next_state(8), (0.25 - steer_angle) * 0.5, tol); EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), (5.5 + 9.81 * sinf(pitch)), tol); /** * Apply full brake and half right turn to a backwards moving state oriented 30 degrees to the left which is already * turning and on a downward facing hill */ state << -1.0, yaw, 0, 0, steer_angle, 1.0, 0, pitch, 0, 0; control << -1, -0.5; dynamics.step(state, next_state, state_der, control, output, 0, dt); EXPECT_NEAR(next_state(0), -1 + (5.5 + 9.81 * sinf(pitch)) * dt, tol); EXPECT_NEAR(next_state(1), yaw + 0.086361105 * dt, tol); EXPECT_NEAR(next_state(2), -1 * cos(yaw) * dt, tol); EXPECT_NEAR(next_state(3), -1 * sin(yaw) * dt, tol); EXPECT_NEAR(next_state(4), steer_angle + (-0.25 - 
steer_angle) * 0.5 * dt, tol);
EXPECT_NEAR(next_state(5), 1.0, tol);
EXPECT_NEAR(next_state(6), 0.0, tol);
EXPECT_NEAR(next_state(7), 0.0, tol);
EXPECT_NEAR(next_state(8), (-0.25 - steer_angle) * 0.5, tol);
EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), (5.5 + 9.81 * sinf(pitch)), tol);
/**
 * Apply full brake and half right turn to a backwards moving state with a huge steering angle to test max steer
 * angle and steering rate. We are also on a downward facing hill and are already oriented 30 degrees to the left
 */
steer_angle *= 100;
state << -1.0, yaw, 0, 0, steer_angle, 1.0, 0, pitch, 0, 0;
control << -1, -0.5;
dynamics.step(state, next_state, state_der, control, output, 0, dt);
EXPECT_NEAR(next_state(0), -1 + (5.5 + 9.81 * sinf(pitch)) * dt, tol);
EXPECT_NEAR(next_state(1), yaw + tan(steer_angle / -9.1) * dt * -2, tol);
EXPECT_NEAR(next_state(2), -1 * cos(yaw) * dt, tol);
EXPECT_NEAR(next_state(3), -1 * sin(yaw) * dt, tol);
// steering angle and steering rate should saturate at the configured limits
EXPECT_NEAR(next_state(4), params.max_steer_angle, tol);
EXPECT_NEAR(next_state(5), 1.0, tol);
EXPECT_NEAR(next_state(6), 0.0, tol);
EXPECT_NEAR(next_state(7), 0.0, tol);
EXPECT_NEAR(next_state(8), -params.max_steer_rate, tol);
EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), (5.5 + 9.81 * sinf(pitch)), tol);
}

// Compares RacerDubinsElevation::step() on the CPU against the GPU kernel for
// random states/controls, sweeping the kernel's thread-block y-dimension 1..16.
// NOTE(review): template argument lists (on std::vector, std::array,
// Eigen::Matrix, TwoDTextureHelper, the vector-of-array buffers, ...) were
// stripped by text extraction of this file; restore them from the repository
// before compiling.
TEST_F(RacerDubinsElevationTest, TestStepGPUvsCPU)
{
  const int num_rollouts = 1000;
  const float dt = 0.1f;
  CudaCheckError();
  RacerDubinsElevation dynamics = RacerDubinsElevation();
  // build a small synthetic elevation map texture so the GPU model samples real data
  cudaExtent extent = make_cudaExtent(10, 20, 0);
  TwoDTextureHelper* helper = dynamics.getTextureHelper();
  helper->setExtent(0, extent);
  std::vector data_vec;
  data_vec.resize(10 * 20);
  for (int i = 0; i < data_vec.size(); i++)
  {
    data_vec[i] = i * 1.0f;
  }
  std::array new_rot_mat{};
  new_rot_mat[0] = make_float3(0, 1, 0);
  new_rot_mat[1] = make_float3(1, 0, 0);
  new_rot_mat[2] = make_float3(0, 0, 1);
  helper->updateRotation(0, new_rot_mat);
  helper->updateOrigin(0, make_float3(1, 2, 3));
  helper->updateTexture(0, data_vec);
  helper->updateResolution(0, 10);
  helper->enableTexture(0);
  helper->copyToDevice(true);
  CudaCheckError();
  dynamics.GPUSetup();
  CudaCheckError();

  Eigen::Matrix control_trajectory;
  control_trajectory = Eigen::Matrix::Random();
  Eigen::Matrix state_trajectory;
  state_trajectory = Eigen::Matrix::Random();
  std::vector> s(num_rollouts);
  std::vector> s_next(num_rollouts);
  std::vector> s_der(num_rollouts);
  // steering, throttle
  std::vector> u(num_rollouts);
  RacerDubinsElevation::state_array state;
  RacerDubinsElevation::state_array next_state_cpu;
  RacerDubinsElevation::control_array control;
  RacerDubinsElevation::output_array output;
  RacerDubinsElevation::state_array state_der_cpu = RacerDubinsElevation::state_array::Zero();

  // Run dynamics on dynamicsU
  // Run dynamics on GPU
  for (int y_dim = 1; y_dim <= 16; y_dim++)
  {
    // copy the random trajectories into the flat per-rollout buffers the kernel reads
    for (int state_index = 0; state_index < num_rollouts; state_index++)
    {
      for (int dim = 0; dim < s[0].size(); dim++)
      {
        s[state_index][dim] = state_trajectory.col(state_index)(dim);
      }
      for (int dim = 0; dim < u[0].size(); dim++)
      {
        u[state_index][dim] = control_trajectory.col(state_index)(dim);
      }
    }
    launchStepTestKernel(dynamics, s, u, s_der, s_next, 0, dt, y_dim);
    for (int point = 0; point < num_rollouts; point++)
    {
      // recompute the same step on the CPU and compare element-wise
      state = state_trajectory.col(point);
      control = control_trajectory.col(point);
      state_der_cpu = RacerDubinsElevation::state_array::Zero();
      dynamics.step(state, next_state_cpu, state_der_cpu, control, output, 0, dt);
      for (int dim = 0; dim < RacerDubinsElevation::STATE_DIM; dim++)
      {
        EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim;
        // EXPECT_NEAR(state(dim), s[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim;
        EXPECT_NEAR(next_state_cpu(dim), s_next[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim;
        EXPECT_TRUE(isfinite(s_next[point][dim]));
      }
    }
  }
  dynamics.freeCudaMem();
}

// step() unit checks with the transmission in reverse (gear_sign = -1):
// positive throttle should now decelerate / drive the vehicle backwards.
TEST_F(RacerDubinsElevationTest, TestStepReverse)
{
  CudaCheckError();
  using DYN =
RacerDubinsElevation;
  const float tol = 1e-6;
  DYN dynamics = DYN();
  // hand-picked drag/brake/throttle coefficients so the expected values below are easy to derive
  auto params = dynamics.getParams();
  params.c_0 = 0;
  params.c_b[0] = 1;
  params.c_b[1] = 10;
  params.c_b[2] = 100;
  params.c_v[0] = 0.25;
  params.c_v[1] = 0.5;
  params.c_v[2] = 0.75;
  params.c_t[0] = 2;
  params.c_t[1] = 20;
  params.c_t[2] = 200;
  params.low_min_throttle = 0.2;
  params.steer_command_angle_scale = 0.5;
  params.steering_constant = 0.5;
  params.wheel_base = 0.5;
  params.max_steer_rate = 5;
  params.gear_sign = -1;
  dynamics.setParams(params);
  DYN::state_array state;
  DYN::state_array next_state;
  DYN::state_array state_der = DYN::state_array::Zero();
  DYN::control_array control;
  DYN::output_array output;
  float dt = 0.1;
  // TODO add in the elevation map

  // Basic initial state and no movement should stay still
  state << 0, 0, 0, 0, 0, -0.0, 0.0, 0, 0;
  control << 0, 0;
  dynamics.step(state, next_state, state_der, control, output, 0, dt);
  EXPECT_TRUE(state_der == DYN::state_array::Zero());
  EXPECT_NEAR(next_state(0), 0.0, tol);
  EXPECT_NEAR(next_state(1), 0.0, tol);
  EXPECT_NEAR(next_state(2), 0.0, tol);
  EXPECT_NEAR(next_state(3), 0.0, tol);
  EXPECT_NEAR(next_state(4), 0.0, tol);
  EXPECT_NEAR(next_state(5), 0.0, tol);
  EXPECT_NEAR(next_state(6), 0.0, tol);
  EXPECT_NEAR(next_state(7), 0.0, tol);
  EXPECT_NEAR(next_state(8), 0.0, tol);
  EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), 0.0, tol);

  // Apply full throttle from zero state
  state << 0, 0, 0, 0, 0, -0.0, 0.0, 0, 0;
  control << 1, 0;
  dynamics.step(state, next_state, state_der, control, output, 0, dt);
  // in reverse gear full throttle produces a negative acceleration
  EXPECT_NEAR(state_der(0), -1.6, tol);
  EXPECT_NEAR(next_state(0), -0.16, tol);
  EXPECT_NEAR(next_state(1), 0.0, tol);
  EXPECT_NEAR(next_state(2), 0.0, tol);
  EXPECT_NEAR(next_state(3), 0.0, tol);
  EXPECT_NEAR(next_state(4), 0.0, tol);
  EXPECT_NEAR(next_state(5), 0.0, tol);
  EXPECT_NEAR(next_state(6), 0.0, tol);
  EXPECT_NEAR(next_state(7), 0.0, tol);
  EXPECT_NEAR(next_state(8), 0.0, tol);
  EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), -1.6, tol);

  // Apply throttle to a state with positive velocity
  state << 1, 0, 0, 0, 0, -0.0, 0.0, 0, 0;
  control << 1, 0;
  dynamics.step(state, next_state, state_der, control, output, 0, dt);
  EXPECT_NEAR(state_der(0), -5.5, tol);
  EXPECT_NEAR(next_state(0), 0.45, tol);
  EXPECT_NEAR(next_state(1), 0.0, tol);
  EXPECT_NEAR(next_state(2), 0.1, tol);
  EXPECT_NEAR(next_state(3), 0.0, tol);
  EXPECT_NEAR(next_state(4), 0.0, tol);
  EXPECT_NEAR(next_state(5), 0.0, tol);
  EXPECT_NEAR(next_state(6), 0.0, tol);
  EXPECT_NEAR(next_state(7), 0.0, tol);
  EXPECT_NEAR(next_state(8), 0.0, tol);
  EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), -5.5, tol);

  // Apply full throttle and half left turn to origin state
  state << 0, 0, 0, 0, 0, -0.0, 0.0, 0, 0;
  control << 1, 0.5;
  dynamics.step(state, next_state, state_der, control, output, 0, dt);
  EXPECT_NEAR(state_der(0), -1.6, tol);
  EXPECT_NEAR(next_state(0), -0.16, tol);
  EXPECT_NEAR(next_state(1), 0.0, tol);
  EXPECT_NEAR(next_state(2), 0.0, tol);
  EXPECT_NEAR(next_state(3), 0.0, tol);
  EXPECT_NEAR(next_state(4), powf(0.5, 3) * dt, tol);
  EXPECT_NEAR(next_state(5), 0.0, tol);
  EXPECT_NEAR(next_state(6), 0.0, tol);
  EXPECT_NEAR(next_state(7), 0.0, tol);
  EXPECT_NEAR(next_state(8), powf(0.5, 3), tol);
  EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), -1.6, tol);

  // Apply full throttle and half left turn to a moving state oriented 30 degrees to the left
  float yaw = M_PI / 6;
  state << 1.0, yaw, 0, 0, 0, -0.0, 0.0, 0, 0;
  control << 1, 0.5;
  dynamics.step(state, next_state, state_der, control, output, 0, dt);
  EXPECT_NEAR(state_der(0), -5.5, tol);
  EXPECT_NEAR(next_state(0), 0.45, tol);
  EXPECT_NEAR(next_state(1), yaw, tol);
  EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol);
  EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol);
  EXPECT_NEAR(next_state(4), powf(0.5, 3) * dt, tol);
  EXPECT_NEAR(next_state(5), 0.0, tol);
  EXPECT_NEAR(next_state(6), 0.0, tol);
  EXPECT_NEAR(next_state(7), 0.0, tol);
  EXPECT_NEAR(next_state(8), powf(0.5, 3), tol);
  EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), -5.5, tol);

  // Apply full throttle and half left turn to a moving state oriented 30 degrees to the left which is already turning
  float steer_angle = M_PI / 8;
  state << 1.0, yaw, 0, 0, steer_angle, -0.0, 0.0, 0, 0;
  control << 1, 0.5;
  dynamics.step(state, next_state, state_der, control, output, 0, dt);
  EXPECT_NEAR(state_der(0), -5.5, tol);
  EXPECT_NEAR(next_state(0), 0.45, tol);
  EXPECT_NEAR(next_state(1), yaw + -0.086361105 * dt, tol);
  EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol);
  EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol);
  EXPECT_NEAR(next_state(4), steer_angle + (0.25 - steer_angle) * 0.5 * dt, tol);
  EXPECT_NEAR(next_state(5), 0.0, tol);
  EXPECT_NEAR(next_state(6), 0.0, tol);
  EXPECT_NEAR(next_state(7), 0.0, tol);
  EXPECT_NEAR(next_state(8), (0.25 - steer_angle) * 0.5, tol);
  EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), -5.5, tol);

  // Apply full brake and half left turn to a moving state oriented 30 degrees to the left which is already turning
  state << 1.0, yaw, 0, 0, steer_angle, 1.0, 0, 0.0, 0, 0;
  control << -1, 0.5;
  dynamics.step(state, next_state, state_der, control, output, 0, dt);
  EXPECT_NEAR(state_der(0), -5.5, tol);
  EXPECT_NEAR(next_state(0), 1 - 5.5 * dt, tol);
  EXPECT_NEAR(next_state(1), yaw + -0.086361105 * dt, tol);
  EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol);
  EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol);
  EXPECT_NEAR(next_state(4), steer_angle + (0.25 - steer_angle) * 0.5 * dt, tol);
  EXPECT_NEAR(next_state(5), 1.0, tol);
  EXPECT_NEAR(next_state(6), 0.0, tol);
  EXPECT_NEAR(next_state(7), 0.0, tol);
  EXPECT_NEAR(next_state(8), (0.25 - steer_angle) * 0.5, tol);
  EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), -5.5, tol);
  /**
   * Apply full brake and half left turn to a moving state oriented 30 degrees to the left which is already turning
   * and on a downward facing hill
   */
  float pitch = 20 * M_PI / 180;
  state << 1.0, yaw, 0, 0, steer_angle, 1.0, 0, pitch, 0, 0;
  control << -1, 0.5;
  dynamics.step(state, next_state, state_der, control, output, 0, dt);
  // gravity component 9.81 * sin(pitch) adds to the braking deceleration
  EXPECT_NEAR(next_state(0), 1 + (-5.5 + 9.81 * sinf(pitch)) * dt, tol);
  EXPECT_NEAR(next_state(1), yaw + -0.086361105 * dt, tol);
  EXPECT_NEAR(next_state(2), 1 * cos(yaw) * dt, tol);
  EXPECT_NEAR(next_state(3), 1 * sin(yaw) * dt, tol);
  EXPECT_NEAR(next_state(4), steer_angle + (0.25 - steer_angle) * 0.5 * dt, tol);
  EXPECT_NEAR(next_state(5), 1.0, tol);
  EXPECT_NEAR(next_state(6), 0.0, tol);
  EXPECT_NEAR(next_state(7), 0.0, tol);
  EXPECT_NEAR(next_state(8), (0.25 - steer_angle) * 0.5, tol);
  EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), (-5.5 + 9.81 * sinf(pitch)), tol);
  /**
   * Apply full brake and half left turn to a backwards moving state oriented 30 degrees to the left which is already
   * turning and on a downward facing hill
   */
  state << -1.0, yaw, 0, 0, steer_angle, 1.0, 0, pitch, 0, 0;
  control << -1, 0.5;
  dynamics.step(state, next_state, state_der, control, output, 0, dt);
  EXPECT_NEAR(next_state(0), -1 + (5.5 + 9.81 * sinf(pitch)) * dt, tol);
  EXPECT_NEAR(next_state(1), yaw + 0.086361105 * dt, tol);
  EXPECT_NEAR(next_state(2), -1 * cos(yaw) * dt, tol);
  EXPECT_NEAR(next_state(3), -1 * sin(yaw) * dt, tol);
  EXPECT_NEAR(next_state(4), steer_angle + (0.25 - steer_angle) * 0.5 * dt, tol);
  EXPECT_NEAR(next_state(5), 1.0, tol);
  EXPECT_NEAR(next_state(6), 0.0, tol);
  EXPECT_NEAR(next_state(7), 0.0, tol);
  EXPECT_NEAR(next_state(8), (0.25 - steer_angle) * 0.5, tol);
  EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), (5.5 + 9.81 * sinf(pitch)), tol);
  /**
   * Apply full brake and half right turn to a backwards moving state oriented 30 degrees to the left which is already
   * turning and on a downward facing hill
   */
  state << -1.0, yaw, 0, 0, steer_angle, 1.0, 0, pitch, 0, 0;
  control << -1, -0.5;
  dynamics.step(state, next_state, state_der, control, output, 0, dt);
  EXPECT_NEAR(next_state(0), -1 + (5.5 + 9.81 * sinf(pitch)) * dt, tol);
  EXPECT_NEAR(next_state(1), yaw + 0.086361105 * dt, tol);
  EXPECT_NEAR(next_state(2), -1 * cos(yaw) * dt, tol);
  EXPECT_NEAR(next_state(3), -1 * sin(yaw) * dt, tol);
  EXPECT_NEAR(next_state(4), steer_angle + (-0.25 - steer_angle) * 0.5 * dt, tol);
  EXPECT_NEAR(next_state(5), 1.0, tol);
  EXPECT_NEAR(next_state(6), 0.0, tol);
  EXPECT_NEAR(next_state(7), 0.0, tol);
  EXPECT_NEAR(next_state(8), (-0.25 - steer_angle) * 0.5, tol);
  EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), (5.5 + 9.81 * sinf(pitch)), tol);
  /**
   * Apply full brake and half right turn to a backwards moving state with a huge steering angle to test max steer
   * angle and steering rate. We are also on a downward facing hill and are already oriented 30 degrees to the left
   */
  steer_angle *= 100;
  state << -1.0, yaw, 0, 0, steer_angle, 1.0, 0, pitch, 0, 0;
  control << -1, -0.5;
  dynamics.step(state, next_state, state_der, control, output, 0, dt);
  EXPECT_NEAR(next_state(0), -1 + (5.5 + 9.81 * sinf(pitch)) * dt, tol);
  EXPECT_NEAR(next_state(1), yaw + tan(steer_angle / -9.1) * dt * -2, tol);
  EXPECT_NEAR(next_state(2), -1 * cos(yaw) * dt, tol);
  EXPECT_NEAR(next_state(3), -1 * sin(yaw) * dt, tol);
  // steering state should saturate at the configured limits
  EXPECT_NEAR(next_state(4), params.max_steer_angle, tol);
  EXPECT_NEAR(next_state(5), 1.0, tol);
  EXPECT_NEAR(next_state(6), 0.0, tol);
  EXPECT_NEAR(next_state(7), 0.0, tol);
  EXPECT_NEAR(next_state(8), -params.max_steer_rate, tol);
  EXPECT_NEAR(output(O_IND_CLASS(DYN::DYN_PARAMS_T, ACCEL_X)), (5.5 + 9.81 * sinf(pitch)), tol);
}

// GPU-vs-CPU comparison of step() with the transmission in reverse (gear_sign = -1).
// NOTE(review): template argument lists below were stripped by text extraction;
// restore them from the repository before compiling.
TEST_F(RacerDubinsElevationTest, TestStepGPUvsCPUReverse)
{
  const int num_rollouts = 1000;
  const float dt = 0.1f;
  CudaCheckError();
  RacerDubinsElevation dynamics = RacerDubinsElevation();
  auto params = dynamics.getParams();
  params.gear_sign = -1;
  dynamics.setParams(params);
  cudaExtent extent = make_cudaExtent(10, 20, 0);
  TwoDTextureHelper* helper = dynamics.getTextureHelper();
  helper->setExtent(0, extent);
  std::vector data_vec;
  data_vec.resize(10 * 20);
  for (int i = 0; i < data_vec.size(); i++)
  {
    data_vec[i] = i * 1.0f;
  }
  std::array
new_rot_mat{};
  new_rot_mat[0] = make_float3(0, 1, 0);
  new_rot_mat[1] = make_float3(1, 0, 0);
  new_rot_mat[2] = make_float3(0, 0, 1);
  helper->updateRotation(0, new_rot_mat);
  helper->updateOrigin(0, make_float3(1, 2, 3));
  helper->updateTexture(0, data_vec);
  helper->updateResolution(0, 10);
  helper->enableTexture(0);
  helper->copyToDevice(true);
  CudaCheckError();
  dynamics.GPUSetup();
  CudaCheckError();

  Eigen::Matrix control_trajectory;
  control_trajectory = Eigen::Matrix::Random();
  Eigen::Matrix state_trajectory;
  state_trajectory = Eigen::Matrix::Random();
  std::vector> s(num_rollouts);
  std::vector> s_next(num_rollouts);
  std::vector> s_der(num_rollouts);
  // steering, throttle
  std::vector> u(num_rollouts);
  RacerDubinsElevation::state_array state;
  RacerDubinsElevation::state_array next_state_cpu;
  RacerDubinsElevation::control_array control;
  RacerDubinsElevation::output_array output;
  RacerDubinsElevation::state_array state_der_cpu = RacerDubinsElevation::state_array::Zero();

  // Run dynamics on dynamicsU
  // Run dynamics on GPU
  for (int y_dim = 1; y_dim <= 16; y_dim++)
  {
    for (int state_index = 0; state_index < num_rollouts; state_index++)
    {
      for (int dim = 0; dim < s[0].size(); dim++)
      {
        s[state_index][dim] = state_trajectory.col(state_index)(dim);
      }
      for (int dim = 0; dim < u[0].size(); dim++)
      {
        u[state_index][dim] = control_trajectory.col(state_index)(dim);
      }
    }
    launchStepTestKernel(dynamics, s, u, s_der, s_next, 0, dt, y_dim);
    for (int point = 0; point < num_rollouts; point++)
    {
      state = state_trajectory.col(point);
      control = control_trajectory.col(point);
      state_der_cpu = RacerDubinsElevation::state_array::Zero();
      dynamics.step(state, next_state_cpu, state_der_cpu, control, output, 0, dt);
      for (int dim = 0; dim < RacerDubinsElevation::STATE_DIM; dim++)
      {
        EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim;
        // EXPECT_NEAR(state(dim), s[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim;
        EXPECT_NEAR(next_state_cpu(dim), s_next[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim;
        EXPECT_TRUE(isfinite(s_next[point][dim]));
      }
    }
  }
  dynamics.freeCudaMem();
}

// Rolls the dynamics forward 500 steps from a near-zero state with zero controls
// and checks every state/derivative stays finite (regression test for NaN blowups).
// NOTE(review): template argument lists below were stripped by text extraction.
TEST_F(RacerDubinsElevationTest, ComputeStateTrajectoryFiniteTest)
{
  RacerDubinsElevation dynamics = RacerDubinsElevation();
  using PARAMS = RacerDubinsElevation::DYN_PARAMS_T;
  PARAMS params;
  params.c_t[0] = 3.0;
  params.c_b[0] = 0.2;
  params.c_v[0] = 0.2;
  params.c_0 = 0.2;
  params.wheel_base = 3.0;
  params.steering_constant = 1.0;
  dynamics.setParams(params);
  Eigen::Matrix control_trajectory;
  control_trajectory = Eigen::Matrix::Zero();
  Eigen::Matrix state_trajectory;
  state_trajectory = Eigen::Matrix::Zero();
  RacerDubinsElevation::state_array state_der;
  RacerDubinsElevation::state_array x, x_next;
  RacerDubinsElevation::output_array output;
  // NOTE(review): only 5 values in this comma-initializer while other tests in
  // this file index state elements 0..8 — confirm against the repository
  // (possible extraction artifact).
  x << 0, 1.46919e-6, 0.0140179, 1.09739e-8, -0.000735827;
  for (int i = 0; i < 500; i++)
  {
    RacerDubinsElevation::control_array u = control_trajectory.col(i);
    dynamics.step(x, x_next, state_der, u, output, i, 0.02);
    dynamics.computeDynamics(x, u, state_der);
    EXPECT_TRUE(x.allFinite());
    EXPECT_TRUE(x_next.allFinite());
    EXPECT_TRUE(state_der.allFinite());
    EXPECT_TRUE(u.allFinite());
    EXPECT_TRUE(state_der != RacerDubinsElevation::state_array::Zero());
    x = x_next;
  }
  // repeat with a slower steering servo and a large initial steering angle
  params.steering_constant = 0.5;
  dynamics.setParams(params);
  x << 0, 1.46919e-6, 0.0140179, 1.09739e-8, -1.0;
  for (int i = 0; i < 500; i++)
  {
    RacerDubinsElevation::control_array u = control_trajectory.col(i);
    dynamics.step(x, x_next, state_der, u, output, i, 0.02);
    dynamics.computeDynamics(x, u, state_der);
    EXPECT_TRUE(x.allFinite());
    EXPECT_TRUE(x_next.allFinite());
    EXPECT_TRUE(state_der.allFinite());
    EXPECT_TRUE(u.allFinite());
    EXPECT_TRUE(state_der != RacerDubinsElevation::state_array::Zero());
    x = x_next;
  }
}

// Dummy subclass whose computeGrad() always reports failure, forcing the DDP
// model wrapper to fall back to numerical differentiation (used below to
// compare numeric vs analytic Jacobians).
// NOTE(review): Eigen::Ref template arguments were stripped by text extraction.
class LinearDummy : public RacerDubinsElevation
{
public:
  bool computeGrad(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B)
  {
    return false;
  };
};
// Compares the analytic Jacobian from computeGrad() against a numerical
// Jacobian from the DDP model wrapper over random states/controls.
// Currently disabled via GTEST_SKIP().
// NOTE(review): template argument lists (Eigen::Matrix, shared_ptr/make_shared,
// leftCols/rightCols) were stripped by text extraction; restore from the
// repository before compiling.
TEST_F(RacerDubinsElevationTest, TestComputeGradComputation)
{
  GTEST_SKIP();
  Eigen::Matrix numeric_jac;
  Eigen::Matrix analytic_jac;
  const int num_rollouts = 100;
  Eigen::Matrix control_trajectory;
  control_trajectory = Eigen::Matrix::Random();
  Eigen::Matrix state_trajectory;
  state_trajectory = Eigen::Matrix::Random();
  // TODO properly scale the random values
  auto analytic_grad_model = RacerDubinsElevation();
  RacerDubinsElevation::dfdx A_analytic = RacerDubinsElevation::dfdx::Zero();
  RacerDubinsElevation::dfdu B_analytic = RacerDubinsElevation::dfdu::Zero();
  auto numerical_grad_model = LinearDummy();
  std::shared_ptr> ddp_model = std::make_shared>(&numerical_grad_model);
  auto params = analytic_grad_model.getParams();
  // params.c_t[0] = 3.0;
  // params.c_b[0] = 0.2;
  // params.c_v[0] = 0.2;
  params.c_0 = 0.0;
  params.wheel_base = 3.0;
  params.steering_constant = 1.1;
  params.low_min_throttle = 0.0;
  params.max_brake_rate_pos = 10.0;
  // both models must share params/control limits so only the gradient method differs
  analytic_grad_model.setParams(params);
  numerical_grad_model.setParams(params);
  auto limits = analytic_grad_model.getControlRanges();
  limits[0].x = -0.3;
  limits[0].y = 1.0;
  limits[1].x = -1.0;
  limits[1].y = 1.0;
  analytic_grad_model.setControlRanges(limits);
  numerical_grad_model.setControlRanges(limits);
  // pin the first two columns to hand-picked states/controls
  state_trajectory.col(0) << 5, M_PI_4, 1, 1, 3, 0, 0, 0, 2;
  control_trajectory.col(0) << 0.5, 1;
  state_trajectory.col(1) << 5, M_PI_4, 1, 1, 3, 0, 0, 0, 2;
  control_trajectory.col(1) << -1.0, 1;
  // state_trajectory.col(5) = state_trajectory.col(5).cwiseAbs();
  for (int i = 0; i < num_rollouts; i++)
  {
    RacerDubinsElevation::state_array state = state_trajectory.col(i);
    RacerDubinsElevation::control_array control = control_trajectory.col(i);
    // rescale near-zero random states into physically meaningful ranges
    if (abs(state(0)) < 1)
    {
      state(0) = state(0) * 10;
      state(1) = state(1) * M_PI;
      state(2) = state(2) * 100;
      state(3) = state(3) * 100;
      state(4) = state(4) * 5;
      state(6) = state(6) * M_PI_2;
      state(7) = state(7) * M_PI_2;
      state(8) = state(8) * 10;
    }
    state(5) = min(abs(state(5) / 0.33f), 0.3f);
    // std::cout << "iteration " << i << std::endl;
    // std::cout << "state: " << state.transpose() << std::endl;
    // std::cout << "control: " << control.transpose() << std::endl;
    bool analytic_grad = analytic_grad_model.computeGrad(state, control, A_analytic, B_analytic);
    EXPECT_TRUE(analytic_grad);
    analytic_jac.leftCols() = A_analytic;
    analytic_jac.rightCols() = B_analytic;
    numeric_jac = ddp_model->df(state, control);
    EXPECT_LT((numeric_jac - analytic_jac).norm(), 5e-2)
        << "at index " << i << "\nstate: " << state.transpose() << "\ncontrol " << control.transpose()
        << "\nNumeric Jacobian\n" << numeric_jac << "\nAnalytic Jacobian\n" << analytic_jac;
  }
}

================================================
FILE: tests/dynamics/racer_dubins_elevation_suspension_test.cu
================================================
// NOTE(review): the header names of these includes (and the container template
// arguments below) were stripped by text extraction; restore from the repository.
#include
#include
#include
#include
#include
#include
#include

// Fixture for the suspension dynamics tests: owns a CUDA stream and builds a
// small elevation map plus random surface normals in SetUp().
class RacerDubinsElevationSuspensionTest : public ::testing::Test
{
public:
  cudaStream_t stream;
  cudaExtent extent = make_cudaExtent(10, 20, 0);
  float map_resolution = 10.0f;          // [m / px]
  float3 origin = make_float3(1, 2, 3);  // [m, m, m]
  std::vector data_vec;
  std::vector normal_vec;

  void SetUp() override
  {
    CudaCheckError();
    HANDLE_ERROR(cudaStreamCreate(&stream));
    data_vec.resize(extent.width * extent.height);
    normal_vec.resize(extent.width * extent.height);
    for (int i = 0; i < data_vec.size(); i++)
    {
      data_vec[i] = i * 1.0f;
      Eigen::Vector4f random_normal = Eigen::Vector4f::Random();
      normal_vec[i] = make_float4(random_normal.x(), random_normal.y(), random_normal.z(), random_normal.w());
    }
  }

  void TearDown() override
  {
    CudaCheckError();
    HANDLE_ERROR(cudaStreamDestroy(stream));
  }
};

// Sanity check of compile-time dimensions and texture helper availability.
TEST_F(RacerDubinsElevationSuspensionTest, Template)
{
  using DYNAMICS = RacerDubinsElevationSuspension;
  DYNAMICS dynamics = DYNAMICS(mppi::tests::steering_lstm, stream);
  EXPECT_EQ(24, DYNAMICS::STATE_DIM);
  EXPECT_EQ(2, DYNAMICS::CONTROL_DIM);
  EXPECT_NE(dynamics.getTextureHelper(), nullptr);
}

// GPU-vs-CPU step() comparison for the suspension model with a full elevation map.
TEST_F(RacerDubinsElevationSuspensionTest, TestStepGPUvsCPU)
{
const int num_rollouts = 1000;
  const float dt = 0.1f;
  CudaCheckError();
  using DYN = RacerDubinsElevationSuspension;
  DYN dynamics = DYN(mppi::tests::steering_lstm, stream);
  // these constants come from the loaded steering LSTM model file
  EXPECT_FLOAT_EQ(dynamics.getParams().max_steer_rate, 17.590296);
  EXPECT_FLOAT_EQ(dynamics.getParams().steering_constant, 3.286375);
  EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_constant, 9.301527);
  EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_drag_constant, -0.60327667);
  TwoDTextureHelper* helper = dynamics.getTextureHelper();
  helper->setExtent(0, extent);
  std::array new_rot_mat{};
  new_rot_mat[0] = make_float3(0, 1, 0);
  new_rot_mat[1] = make_float3(1, 0, 0);
  new_rot_mat[2] = make_float3(0, 0, 1);
  helper->updateRotation(0, new_rot_mat);
  helper->updateOrigin(0, origin);
  helper->updateTexture(0, data_vec);
  helper->updateResolution(0, map_resolution);
  helper->enableTexture(0);
  helper->copyToDevice(true);
  CudaCheckError();
  dynamics.GPUSetup();
  CudaCheckError();
  EXPECT_NE(dynamics.getHelper()->getLSTMDevicePtr(), nullptr);
  EXPECT_NE(dynamics.network_d_, nullptr);
  EXPECT_EQ(dynamics.network_d_, dynamics.getHelper()->getLSTMDevicePtr());

  // history buffer the LSTM warm-starts from
  DYN::buffer_trajectory buffer;
  buffer["VEL_X"] = Eigen::VectorXf::Random(51);
  buffer["STEER_ANGLE"] = Eigen::VectorXf::Random(51);
  buffer["STEER_ANGLE_RATE"] = Eigen::VectorXf::Random(51);
  buffer["STEER_CMD"] = Eigen::VectorXf::Random(51);

  checkGPUComputationStep(dynamics, dt, 16, 32, buffer, 1.0e-4);
}

// Same comparison but with the first 10 elevation cells set to NaN, exercising
// the partial-map handling path.
// NOTE(review): template argument lists below were stripped by text extraction.
TEST_F(RacerDubinsElevationSuspensionTest, TestStepGPUvsCPUWithPartialElevationMap)
{
  const int num_rollouts = 1000;
  const float dt = 0.1f;
  CudaCheckError();
  using DYN = RacerDubinsElevationSuspension;
  DYN dynamics = DYN(mppi::tests::steering_lstm, stream);
  EXPECT_FLOAT_EQ(dynamics.getParams().max_steer_rate, 17.590296);
  EXPECT_FLOAT_EQ(dynamics.getParams().steering_constant, 3.286375);
  EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_constant, 9.301527);
  EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_drag_constant, -0.60327667);
  // poke NaN holes into the start of the elevation map
  for (int i = 0; i < data_vec.size(); i++)
  {
    if (i < 10)
    {
      data_vec[i] = NAN;
    }
  }
  TwoDTextureHelper* helper = dynamics.getTextureHelper();
  std::array new_rot_mat{};
  new_rot_mat[0] = make_float3(0, 1, 0);
  new_rot_mat[1] = make_float3(1, 0, 0);
  new_rot_mat[2] = make_float3(0, 0, 1);
  helper->updateRotation(0, new_rot_mat);
  helper->updateOrigin(0, origin);
  helper->updateTexture(0, data_vec, extent);
  helper->updateResolution(0, map_resolution);
  helper->enableTexture(0);
  helper->copyToDevice(true);
  CudaCheckError();
  dynamics.GPUSetup();
  CudaCheckError();
  EXPECT_NE(dynamics.getHelper()->getLSTMDevicePtr(), nullptr);
  EXPECT_NE(dynamics.network_d_, nullptr);
  EXPECT_EQ(dynamics.network_d_, dynamics.getHelper()->getLSTMDevicePtr());
  DYN::buffer_trajectory buffer;
  buffer["VEL_X"] = Eigen::VectorXf::Random(51);
  buffer["STEER_ANGLE"] = Eigen::VectorXf::Random(51);
  buffer["STEER_ANGLE_RATE"] = Eigen::VectorXf::Random(51);
  buffer["STEER_CMD"] = Eigen::VectorXf::Random(51);
  checkGPUComputationStep(dynamics, dt, 16, 32, buffer, 1.0e-4);
}

// Partial elevation map AND partial (NaN) surface-normal map together.
TEST_F(RacerDubinsElevationSuspensionTest, TestStepGPUvsCPUWithPartialElevationAndNormalMap)
{
  const int num_rollouts = 1000;
  const float dt = 0.1f;
  CudaCheckError();
  using DYN = RacerDubinsElevationSuspension;
  DYN dynamics = DYN(mppi::tests::steering_lstm, stream);
  EXPECT_FLOAT_EQ(dynamics.getParams().max_steer_rate, 17.590296);
  EXPECT_FLOAT_EQ(dynamics.getParams().steering_constant, 3.286375);
  EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_constant, 9.301527);
  EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_drag_constant, -0.60327667);
  for (int i = 0; i < data_vec.size(); i++)
  {
    if (i < 10)
    {
      data_vec[i] = NAN;
      normal_vec[i] = make_float4(NAN, NAN, NAN, NAN);
    }
  }
  std::array new_rot_mat{};
  new_rot_mat[0] = make_float3(0, 1, 0);
  new_rot_mat[1] = make_float3(1, 0, 0);
  new_rot_mat[2] = make_float3(0, 0, 1);
  TwoDTextureHelper* helper = dynamics.getTextureHelper();
  TwoDTextureHelper* normal_helper = dynamics.getTextureHelperNormals();
  helper->updateRotation(0,
new_rot_mat);
  helper->updateOrigin(0, origin);
  helper->updateTexture(0, data_vec, extent);
  helper->updateResolution(0, map_resolution);
  helper->enableTexture(0);
  // defer the device copy until the normal map is also staged
  helper->copyToDevice(false);
  normal_helper->updateRotation(0, new_rot_mat);
  normal_helper->updateOrigin(0, origin);
  normal_helper->updateTexture(0, normal_vec, extent);
  normal_helper->updateResolution(0, map_resolution);
  normal_helper->enableTexture(0);
  normal_helper->copyToDevice(true);
  CudaCheckError();
  dynamics.GPUSetup();
  CudaCheckError();
  EXPECT_NE(dynamics.getHelper()->getLSTMDevicePtr(), nullptr);
  EXPECT_NE(dynamics.network_d_, nullptr);
  EXPECT_EQ(dynamics.network_d_, dynamics.getHelper()->getLSTMDevicePtr());
  DYN::buffer_trajectory buffer;
  buffer["VEL_X"] = Eigen::VectorXf::Random(51);
  buffer["STEER_ANGLE"] = Eigen::VectorXf::Random(51);
  buffer["STEER_ANGLE_RATE"] = Eigen::VectorXf::Random(51);
  buffer["STEER_CMD"] = Eigen::VectorXf::Random(51);
  checkGPUComputationStep(dynamics, dt, 16, 32, buffer, 1.0e-4);
}

// GPU-vs-CPU comparison including the output array, full elevation map.
// NOTE(review): template argument lists below were stripped by text extraction.
TEST_F(RacerDubinsElevationSuspensionTest, TestStepGPUvsCPUOutput)
{
  const float dt = 0.1f;
  CudaCheckError();
  using DYN = RacerDubinsElevationSuspension;
  DYN dynamics = DYN(mppi::tests::steering_lstm, stream);
  EXPECT_FLOAT_EQ(dynamics.getParams().max_steer_rate, 17.590296);
  EXPECT_FLOAT_EQ(dynamics.getParams().steering_constant, 3.286375);
  EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_constant, 9.301527);
  EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_drag_constant, -0.60327667);
  TwoDTextureHelper* helper = dynamics.getTextureHelper();
  helper->setExtent(0, extent);
  std::array new_rot_mat{};
  new_rot_mat[0] = make_float3(0, 1, 0);
  new_rot_mat[1] = make_float3(1, 0, 0);
  new_rot_mat[2] = make_float3(0, 0, 1);
  helper->updateRotation(0, new_rot_mat);
  helper->updateOrigin(0, origin);
  helper->updateTexture(0, data_vec);
  helper->updateResolution(0, map_resolution);
  helper->enableTexture(0);
  helper->copyToDevice(true);
  CudaCheckError();
  dynamics.GPUSetup();
  CudaCheckError();
  DYN::buffer_trajectory buffer;
  buffer["VEL_X"] = Eigen::VectorXf::Random(51);
  buffer["STEER_ANGLE"] = Eigen::VectorXf::Random(51);
  buffer["STEER_ANGLE_RATE"] = Eigen::VectorXf::Random(51);
  buffer["STEER_CMD"] = Eigen::VectorXf::Random(51);
  checkGPUComputationStep(dynamics, dt, 16, 32, buffer, 1.0e-4);
}

// Output comparison with NaN holes in both the elevation and normal maps.
TEST_F(RacerDubinsElevationSuspensionTest, TestStepGPUvsCPUWithPartialElevationAndNormalMapCheckOutput)
{
  const float dt = 0.1f;
  CudaCheckError();
  using DYN = RacerDubinsElevationSuspension;
  DYN dynamics = DYN(mppi::tests::steering_lstm, stream);
  EXPECT_FLOAT_EQ(dynamics.getParams().max_steer_rate, 17.590296);
  EXPECT_FLOAT_EQ(dynamics.getParams().steering_constant, 3.286375);
  EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_constant, 9.301527);
  EXPECT_FLOAT_EQ(dynamics.getParams().steer_accel_drag_constant, -0.60327667);
  for (int i = 0; i < data_vec.size(); i++)
  {
    if (i < 10)
    {
      data_vec[i] = NAN;
      normal_vec[i] = make_float4(NAN, NAN, NAN, NAN);
    }
  }
  std::array new_rot_mat{};
  new_rot_mat[0] = make_float3(0, 1, 0);
  new_rot_mat[1] = make_float3(1, 0, 0);
  new_rot_mat[2] = make_float3(0, 0, 1);
  TwoDTextureHelper* helper = dynamics.getTextureHelper();
  TwoDTextureHelper* normal_helper = dynamics.getTextureHelperNormals();
  helper->updateRotation(0, new_rot_mat);
  helper->updateOrigin(0, origin);
  helper->updateTexture(0, data_vec, extent);
  helper->updateResolution(0, map_resolution);
  helper->enableTexture(0);
  helper->copyToDevice(false);
  normal_helper->updateRotation(0, new_rot_mat);
  normal_helper->updateOrigin(0, origin);
  normal_helper->updateTexture(0, normal_vec, extent);
  normal_helper->updateResolution(0, map_resolution);
  normal_helper->enableTexture(0);
  normal_helper->copyToDevice(true);
  CudaCheckError();
  dynamics.GPUSetup();
  CudaCheckError();
  DYN::buffer_trajectory buffer;
  buffer["VEL_X"] = Eigen::VectorXf::Random(51);
  buffer["STEER_ANGLE"] = Eigen::VectorXf::Random(51);
  buffer["STEER_ANGLE_RATE"] = Eigen::VectorXf::Random(51);
  buffer["STEER_CMD"] = Eigen::VectorXf::Random(51);
checkGPUComputationStep(dynamics, dt, 16, 32, buffer, 1.0e-4);
}

================================================
FILE: tests/dynamics/racer_dubins_model_test.cu
================================================
// NOTE(review): the header names of these includes were stripped by text
// extraction; restore from the repository.
#include
#include
#include
#include
#include
#include

// Sanity check of the model's compile-time dimensions.
TEST(RacerDubins, Template)
{
  auto dynamics = RacerDubins();
  EXPECT_EQ(7, RacerDubins::STATE_DIM);
  EXPECT_EQ(2, RacerDubins::CONTROL_DIM);
}

// The stream passed to the constructor must be stored on the object.
TEST(RacerDubins, BindStream)
{
  cudaStream_t stream;
  HANDLE_ERROR(cudaStreamCreate(&stream));
  auto dynamics = RacerDubins(stream);
  EXPECT_EQ(dynamics.stream_, stream) << "Stream binding failure.";
  HANDLE_ERROR(cudaStreamDestroy(stream));
}

/* float c_t = 1.3; float c_b = 2.5; float c_v = 3.7; float c_0 = 4.9; float wheel_base = 0.3; */
// Exercises computeDynamics() over hand-picked states/controls; expected
// accelerations are spelled out from the default coefficients listed above
// (c_0 bias, c_t throttle, c_b brake, c_v velocity drag, wheel_base).
TEST(RacerDubins, ComputeDynamics)
{
  RacerDubins dynamics = RacerDubins();
  RacerDubinsParams params = dynamics.getParams();
  RacerDubins::state_array x = RacerDubins::state_array::Zero();
  RacerDubins::control_array u = RacerDubins::control_array::Zero();
  RacerDubins::state_array next_x = RacerDubins::state_array::Zero();
  dynamics.computeDynamics(x, u, next_x);
  EXPECT_FLOAT_EQ(next_x(0), 4.9);
  EXPECT_FLOAT_EQ(next_x(1), 0);
  EXPECT_FLOAT_EQ(next_x(2), 0);
  EXPECT_FLOAT_EQ(next_x(3), 0);
  EXPECT_FLOAT_EQ(next_x(4), 0);
  EXPECT_FLOAT_EQ(next_x(5), 0);
  EXPECT_FLOAT_EQ(next_x(6), 0);

  x << 1, M_PI_2, 0, 3, 0, 0, 0;
  u << 1, 0;
  dynamics.computeDynamics(x, u, next_x);
  EXPECT_FLOAT_EQ(next_x(0), 4.9 + 1.3 - 3.7);
  EXPECT_FLOAT_EQ(next_x(1), 0);
  EXPECT_NEAR(next_x(2), 0, 1e-7);
  EXPECT_FLOAT_EQ(next_x(3), 1);
  EXPECT_FLOAT_EQ(next_x(4), 0);
  EXPECT_FLOAT_EQ(next_x(5), 0);
  EXPECT_FLOAT_EQ(next_x(6), 0);

  x << 1, 0, 0, 3, 0, 0, 0;
  u << -1, 0;
  dynamics.computeDynamics(x, u, next_x);
  EXPECT_FLOAT_EQ(next_x(0), 4.9 - 3.7);
  EXPECT_FLOAT_EQ(next_x(1), 0);
  EXPECT_FLOAT_EQ(next_x(2), 1);
  EXPECT_FLOAT_EQ(next_x(3), 0);
  EXPECT_FLOAT_EQ(next_x(4), 0);
  EXPECT_FLOAT_EQ(next_x(5), 0.33);
  EXPECT_FLOAT_EQ(next_x(6), 0);

  x << 1, 0, 0, 3, 0, 0.33, 0;
  u << -1, 0;
  dynamics.computeDynamics(x, u, next_x);
  EXPECT_FLOAT_EQ(next_x(0), 4.9 - 0.33 * 2.5 - 3.7);
  EXPECT_FLOAT_EQ(next_x(1), 0);
  EXPECT_FLOAT_EQ(next_x(2), 1);
  EXPECT_FLOAT_EQ(next_x(3), 0);
  EXPECT_FLOAT_EQ(next_x(4), 0);
  EXPECT_FLOAT_EQ(next_x(5), 0.33);
  EXPECT_FLOAT_EQ(next_x(6), 0);

  x << 1, 0, 0, 3, 0, 1.0, 0;
  u << 1, 0;
  dynamics.computeDynamics(x, u, next_x);
  EXPECT_FLOAT_EQ(next_x(0), 4.9 - 2.5 + 1.3 - 3.7);
  EXPECT_FLOAT_EQ(next_x(1), 0);
  EXPECT_FLOAT_EQ(next_x(2), 1);
  EXPECT_FLOAT_EQ(next_x(3), 0);
  EXPECT_FLOAT_EQ(next_x(4), 0);
  EXPECT_FLOAT_EQ(next_x(5), -0.9);
  EXPECT_FLOAT_EQ(next_x(6), 0);

  // backwards motion: drag and brake forces flip sign with velocity
  x << -1, 0, 0, 3, 0, 0, 0;
  u << 1, 0;
  dynamics.computeDynamics(x, u, next_x);
  EXPECT_FLOAT_EQ(next_x(0), 4.9 + 3.7 + 1.3);
  EXPECT_FLOAT_EQ(next_x(1), 0);
  EXPECT_FLOAT_EQ(next_x(2), -1);
  EXPECT_FLOAT_EQ(next_x(3), 0);
  EXPECT_FLOAT_EQ(next_x(4), 0);
  EXPECT_FLOAT_EQ(next_x(5), 0);
  EXPECT_FLOAT_EQ(next_x(6), 0);

  x << -1, 0, 0, 3, 0, 1.0, 0;
  u << -1, 0;
  dynamics.computeDynamics(x, u, next_x);
  EXPECT_FLOAT_EQ(next_x(0), 4.9 + 2.5 + 3.7);
  EXPECT_FLOAT_EQ(next_x(1), 0);
  EXPECT_FLOAT_EQ(next_x(2), -1);
  EXPECT_FLOAT_EQ(next_x(3), 0);
  EXPECT_FLOAT_EQ(next_x(4), 0);
  EXPECT_FLOAT_EQ(next_x(5), 0);
  EXPECT_FLOAT_EQ(next_x(6), 0);

  x << -3, 0, 0, 3, 0, 1.0, 0;
  u << -1, 0;
  dynamics.computeDynamics(x, u, next_x);
  EXPECT_FLOAT_EQ(next_x(0), 4.9 + 2.5 + 3.7 * 3);
  EXPECT_FLOAT_EQ(next_x(1), 0);
  EXPECT_FLOAT_EQ(next_x(2), -3);
  EXPECT_FLOAT_EQ(next_x(3), 0);
  EXPECT_FLOAT_EQ(next_x(4), 0);
  EXPECT_FLOAT_EQ(next_x(5), 0);
  EXPECT_FLOAT_EQ(next_x(6), 0);

  x << 4, 0, 0, 3, 0, 1.0, 0;
  u << -1, 0;
  dynamics.computeDynamics(x, u, next_x);
  EXPECT_FLOAT_EQ(next_x(0), 4.9 - 2.5 - 3.7 * 4);
  EXPECT_FLOAT_EQ(next_x(1), 0);
  EXPECT_FLOAT_EQ(next_x(2), 4);
  EXPECT_FLOAT_EQ(next_x(3), 0);
  EXPECT_FLOAT_EQ(next_x(4), 0);
  EXPECT_FLOAT_EQ(next_x(5), 0);
  EXPECT_FLOAT_EQ(next_x(6), 0);

  // steering command drives the steering-angle rate (state 4)
  x << 1, M_PI, 0, 3, 0, 0, 0;
  u << 0, 1;
  dynamics.computeDynamics(x, u, next_x);
  EXPECT_FLOAT_EQ(next_x(0), 4.9 - 3.7);
  EXPECT_FLOAT_EQ(next_x(1), (1 / .3) * tan(0));
  EXPECT_FLOAT_EQ(next_x(2), -1);
  EXPECT_NEAR(next_x(3), 0, 1e-7);
  EXPECT_FLOAT_EQ(next_x(4), 1 * 5 * 0.6);
  EXPECT_FLOAT_EQ(next_x(5), 0);
  EXPECT_FLOAT_EQ(next_x(6), 0);

  x << 1, M_PI, 0, 0, 0.5, 0, 0;
  u << 1, -1;
  dynamics.computeDynamics(x, u, next_x);
  EXPECT_FLOAT_EQ(next_x(0), 4.9 - 3.7 + 1.3);
  EXPECT_FLOAT_EQ(next_x(1), (1 / .3) * tan(0.5 / -9.1));
  EXPECT_FLOAT_EQ(next_x(2), -1);
  EXPECT_NEAR(next_x(3), 0, 1e-7);
  EXPECT_FLOAT_EQ(next_x(4), (-1 * 5 - 0.5) * 0.6);
  EXPECT_FLOAT_EQ(next_x(5), 0);
  EXPECT_FLOAT_EQ(next_x(6), 0);
}

// Checks enforceLeash(): the nominal state is kept only while it stays within
// the per-dimension leash of the true state.
TEST(RacerDubins, enforceLeash)
{
  RacerDubins dynamics = RacerDubins();
  RacerDubins::state_array state_true = RacerDubins::state_array::Zero();
  RacerDubins::state_array state_nominal = RacerDubins::state_array::Zero();
  RacerDubins::state_array leash_values = RacerDubins::state_array::Zero();
  RacerDubins::state_array output;
  double tol = 1e-7;
  // if the two are equal it should match
  state_true = RacerDubins::state_array::Random();
  state_nominal = state_true;
  dynamics.enforceLeash(state_true, state_nominal, leash_values, output);
  // std::cout << "true : " << state_true.transpose() << std::endl;
  // std::cout << "nominal: " << state_nominal.transpose() << std::endl;
  // std::cout << "output : " << output.transpose() << std::endl;
  EXPECT_LT((output - state_true).norm(), tol);
  // if the two are very far apart but zero leash values
  // NOTE(review): 8 values in this comma-initializer while the Template test
  // asserts STATE_DIM == 7 — an over-full Eigen comma-initializer asserts at
  // runtime; confirm against the repository (possible extraction artifact).
  state_nominal << 1, 1, 1, 1, 1, 1, 1, 1;
  dynamics.enforceLeash(state_true, state_nominal, leash_values, output);
  EXPECT_LT((output - state_true).norm(), tol);
  // if the two are far apart but large leash values
  state_nominal << 1, 1, 1, 1, 1, 1, 1, 1;
  leash_values << 2, 2, 2, 2, 2, 2, 2, 2;
  dynamics.enforceLeash(state_true, state_nominal, leash_values, output);
  EXPECT_LT((output - state_nominal).norm(), tol);
  // check yaw discont
  state_true << -3, -3, -3, -3, -3, -3, -3;
  state_nominal << 3, 3, 3, 3, 3, 3, 3;
  leash_values << 0, 1.0, 0, 0, 0, 0, 0;
  dynamics.enforceLeash(state_true,
state_nominal, leash_values, output); EXPECT_FLOAT_EQ(output(0), -3); EXPECT_FLOAT_EQ(output(1), 3); EXPECT_FLOAT_EQ(output(2), -3); EXPECT_FLOAT_EQ(output(3), -3); EXPECT_FLOAT_EQ(output(4), -3); EXPECT_FLOAT_EQ(output(5), -3); EXPECT_FLOAT_EQ(output(6), -3); // check yaw discont clamp state_true << -3, -3, -3, -3, -3, -3, -3; state_nominal << 3, 3, 3, 3, 3, 3, 3; leash_values << 0, 0.15, 0, 0, 0, 0, 0; dynamics.enforceLeash(state_true, state_nominal, leash_values, output); EXPECT_FLOAT_EQ(output(0), -3); EXPECT_FLOAT_EQ(output(1), angle_utils::normalizeAngle(-3 - 0.15)); EXPECT_FLOAT_EQ(output(2), -3); EXPECT_FLOAT_EQ(output(3), -3); EXPECT_FLOAT_EQ(output(4), -3); EXPECT_FLOAT_EQ(output(5), -3); EXPECT_FLOAT_EQ(output(6), -3); // check yaw discont clamp state_true << 3, 3, 3, 3, 3, 3, 3; state_nominal << -3, -3, -3, -3, -3, -3, -3; leash_values << 0, 0.15, 0, 0, 0, 0, 0; dynamics.enforceLeash(state_true, state_nominal, leash_values, output); EXPECT_FLOAT_EQ(output(0), 3); EXPECT_FLOAT_EQ(output(1), angle_utils::normalizeAngle(3 + 0.15)); EXPECT_FLOAT_EQ(output(2), 3); EXPECT_FLOAT_EQ(output(3), 3); EXPECT_FLOAT_EQ(output(4), 3); EXPECT_FLOAT_EQ(output(5), 3); EXPECT_FLOAT_EQ(output(6), 3); leash_values = RacerDubins::state_array::Ones(); std::cout << "=========" << std::endl; for (int i = 0; i < RacerDubins::STATE_DIM; i++) { state_true = RacerDubins::state_array::Zero(); state_nominal = RacerDubins::state_array::Zero(); state_true(i) = 1.0; state_nominal(i) = 1.1; dynamics.enforceLeash(state_true, state_nominal, leash_values, output); for (int j = 0; j < RacerDubins::STATE_DIM; j++) { if (i == j) { EXPECT_FLOAT_EQ(output(j), 1.1); } else { EXPECT_FLOAT_EQ(output(j), 0.0); } } } } TEST(RacerDubins, TestModelGPU) { RacerDubins dynamics = RacerDubins(); dynamics.GPUSetup(); Eigen::Matrix control_trajectory; control_trajectory = Eigen::Matrix::Random(); Eigen::Matrix state_trajectory; state_trajectory = Eigen::Matrix::Random(); std::vector> s(100); std::vector> 
s_der(100); // steering, throttle std::vector> u(100); for (int state_index = 0; state_index < s.size(); state_index++) { for (int dim = 0; dim < s[0].size(); dim++) { s[state_index][dim] = state_trajectory.col(state_index)(dim); } for (int dim = 0; dim < u[0].size(); dim++) { u[state_index][dim] = control_trajectory.col(state_index)(dim); } } // Run dynamics on GPU for (int y_dim = 1; y_dim <= 4; y_dim++) { launchComputeDynamicsTestKernel(dynamics, s, u, s_der, y_dim); for (int point = 0; point < 100; point++) { RacerDubins::state_array state = state_trajectory.col(point); RacerDubins::control_array control = control_trajectory.col(point); RacerDubins::state_array state_der_cpu = RacerDubins::state_array::Zero(); dynamics.computeDynamics(state, control, state_der_cpu); for (int dim = 0; dim < RacerDubins::STATE_DIM; dim++) { EXPECT_NEAR(state(dim), s[point][dim], 1e-5) << "at sample " << point << ", state dim: " << dim << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(s[point][dim])); } for (int dim = 0; dim < RacerDubins::CONTROL_DIM; dim++) { EXPECT_NEAR(control(dim), u[point][dim], 1e-5) << "at sample " << point << ", state dim: " << dim << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(u[point][dim])); } for (int dim = 0; dim < RacerDubins::STATE_DIM; dim++) { EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-5) << "at sample " << point << ", state dim: " << dim << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(s_der[point][dim])); } } } dynamics.freeCudaMem(); } TEST(RacerDubins, TestUpdateState) { RacerDubins dynamics = RacerDubins(); std::array ranges{}; ranges[0].y = FLT_MAX; ranges[0].x = -1; ranges[1].y = FLT_MAX; ranges[1].x = -FLT_MAX; dynamics.setControlRanges(ranges); RacerDubins::state_array state; RacerDubins::state_array state_der; state << 0, 0, 0, 0, 0, 0, 0; state_der << 1, 1, 1, 1, 1, 1, 0; dynamics.updateState(state, state_der, 0.1); EXPECT_TRUE(state_der != RacerDubins::state_array::Zero()); EXPECT_FLOAT_EQ(state(0), 0.1); 
EXPECT_NEAR(state(1), 0.1, 1e-7); EXPECT_FLOAT_EQ(state(2), 0.1); EXPECT_FLOAT_EQ(state(3), 0.1); EXPECT_FLOAT_EQ(state(4), 0.1); EXPECT_FLOAT_EQ(state(5), 0.1); EXPECT_FLOAT_EQ(state(6), 1); state << 0, M_PI - 0.1, 0, 0, 0, 0, 0; state_der << 1, 1, 1, 1, 1, -1, 1; dynamics.updateState(state, state_der, 1.0); EXPECT_TRUE(state_der != RacerDubins::state_array::Zero()); EXPECT_FLOAT_EQ(state(0), 1.0); EXPECT_FLOAT_EQ(state(1), 1.0 - M_PI - 0.1); EXPECT_FLOAT_EQ(state(2), 1.0); EXPECT_FLOAT_EQ(state(3), 1.0); EXPECT_FLOAT_EQ(state(4), 0.5); // max steer angle is 0.5 EXPECT_FLOAT_EQ(state(5), 0); EXPECT_FLOAT_EQ(state(6), 1); state << 0, M_PI - 0.1, 0, 0, 0, 0, 0; state_der << 1, 1, 1, 1, 1, 2, 1; dynamics.updateState(state, state_der, 1.0); EXPECT_TRUE(state_der != RacerDubins::state_array::Zero()); EXPECT_FLOAT_EQ(state(0), 1.0); EXPECT_FLOAT_EQ(state(1), 1.0 - M_PI - 0.1); EXPECT_FLOAT_EQ(state(2), 1.0); EXPECT_FLOAT_EQ(state(3), 1.0); EXPECT_FLOAT_EQ(state(4), 0.5); // max steer angle is 0.5 EXPECT_FLOAT_EQ(state(5), 1.0); EXPECT_FLOAT_EQ(state(6), 1); state << 0, -M_PI + 0.1, 0, 0, 0, 0, 0; state_der << 1, -1, 1, 1, 1, 1, 1; dynamics.updateState(state, state_der, 1.0); EXPECT_TRUE(state_der != RacerDubins::state_array::Zero()); EXPECT_FLOAT_EQ(state(0), 1.0); EXPECT_FLOAT_EQ(state(1), M_PI + 0.1 - 1.0); EXPECT_FLOAT_EQ(state(2), 1.0); EXPECT_FLOAT_EQ(state(3), 1.0); EXPECT_FLOAT_EQ(state(4), 0.5); // max steer angle is 0.5 EXPECT_FLOAT_EQ(state(5), 1); EXPECT_FLOAT_EQ(state(6), 1); } TEST(RacerDubins, TestUpdateStateGPU) { RacerDubins dynamics = RacerDubins(); dynamics.GPUSetup(); Eigen::Matrix control_trajectory; control_trajectory = Eigen::Matrix::Random(); Eigen::Matrix state_trajectory; state_trajectory = Eigen::Matrix::Random(); std::vector> s(100); std::vector> s_der(100); // steering, throttle std::vector> u(100); RacerDubins::state_array state; RacerDubins::control_array control; RacerDubins::state_array state_der_cpu = RacerDubins::state_array::Zero(); // 
Run dynamics on dynamicsU // Run dynamics on GPU for (int y_dim = 1; y_dim <= 4; y_dim++) { for (int state_index = 0; state_index < s.size(); state_index++) { for (int dim = 0; dim < s[0].size(); dim++) { s[state_index][dim] = state_trajectory.col(state_index)(dim); } for (int dim = 0; dim < u[0].size(); dim++) { u[state_index][dim] = control_trajectory.col(state_index)(dim); } } launchComputeStateDerivTestKernel(dynamics, s, u, s_der, y_dim); launchUpdateStateTestKernel(dynamics, s, s_der, 0.1f, y_dim); for (int point = 0; point < 100; point++) { RacerDubins::state_array state = state_trajectory.col(point); RacerDubins::control_array control = control_trajectory.col(point); RacerDubins::state_array state_der_cpu = RacerDubins::state_array::Zero(); dynamics.computeDynamics(state, control, state_der_cpu); dynamics.updateState(state, state_der_cpu, 0.1f); for (int dim = 0; dim < RacerDubins::STATE_DIM; dim++) { EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-5) << "at index " << point << " with y_dim " << y_dim; EXPECT_NEAR(state(dim), s[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(s[point][dim])); } } } dynamics.freeCudaMem(); } TEST(RacerDubins, ComputeStateTrajectoryFiniteTest) { RacerDubins dynamics = RacerDubins(); RacerDubinsParams params; params.c_t[0] = 3.0; params.c_b[0] = 0.2; params.c_v[0] = 0.2; params.c_0 = 0.2; params.wheel_base = 3.0; params.steering_constant = 1.0; dynamics.setParams(params); Eigen::Matrix control_trajectory; control_trajectory = Eigen::Matrix::Zero(); Eigen::Matrix state_trajectory; state_trajectory = Eigen::Matrix::Zero(); RacerDubins::state_array state_der; RacerDubins::state_array x; x << 0, 1.46919e-6, 0.0140179, 1.09739e-8, -0.000735827; for (int i = 0; i < 500; i++) { RacerDubins::control_array u = control_trajectory.col(i); dynamics.computeDynamics(x, u, state_der); EXPECT_TRUE(x.allFinite()); EXPECT_TRUE(u.allFinite()); EXPECT_TRUE(state_der.allFinite()); 
dynamics.updateState(x, state_der, 0.02); EXPECT_TRUE(x.allFinite()); EXPECT_TRUE(u.allFinite()); EXPECT_TRUE(state_der != RacerDubins::state_array::Zero()); } params.steering_constant = 0.5; dynamics.setParams(params); x << 0, 1.46919e-6, 0.0140179, 1.09739e-8, -1.0; for (int i = 0; i < 500; i++) { RacerDubins::control_array u = control_trajectory.col(i); dynamics.computeDynamics(x, u, state_der); EXPECT_TRUE(x.allFinite()); EXPECT_TRUE(u.allFinite()); EXPECT_TRUE(state_der.allFinite()); dynamics.updateState(x, state_der, 0.02); EXPECT_TRUE(x.allFinite()); EXPECT_TRUE(u.allFinite()); EXPECT_TRUE(state_der != RacerDubins::state_array::Zero()); } } /* class LinearDummy : public RacerDubins { public: bool computeGrad(const Eigen::Ref & state, const Eigen::Ref& control, Eigen::Ref A, Eigen::Ref B) { return false; }; }; TEST(RacerDubins, TestComputeGradComputation) { Eigen::Matrix numeric_jac; Eigen::Matrix analytic_jac; RacerDubins::state_array state; state << 1, 2, 3, 4; RacerDubins::control_array control; control << 5; auto analytic_grad_model = RacerDubins(); RacerDubins::dfdx A_analytic = RacerDubins::dfdx::Zero(); RacerDubins::dfdu B_analytic = RacerDubins::dfdu::Zero(); analytic_grad_model.computeGrad(state, control, A_analytic, B_analytic); auto numerical_grad_model = LinearDummy(); std::shared_ptr> ddp_model = std::make_shared>(&numerical_grad_model); analytic_jac.leftCols() = A_analytic; analytic_jac.rightCols() = B_analytic; numeric_jac = ddp_model->df(state, control); ASSERT_LT((numeric_jac - analytic_jac).norm(), 1e-3) << "Numeric Jacobian\n" << numeric_jac << "\nAnalytic Jacobian\n" << analytic_jac; } */ ================================================ FILE: tests/dynamics/racer_suspension_model_test.cu ================================================ #include #include #include #include #include #include #include template __global__ void runGPUDynamics(DYN_T* dynamics, const int num_timesteps, float dt, const float* __restrict__ x_init_d, const float* 
__restrict__ u_d, float* __restrict__ x_next_d, float* __restrict__ output_d)
{
  // Carve the dynamically-sized shared-memory allocation into per-purpose
  // regions; each region is rounded up to a multiple of 4 floats.
  extern __shared__ float entire_buffer[];
  float* x_shared = entire_buffer;
  // x_shared holds 2 states per thread: current and next, swapped each step.
  float* x_dot_shared = &x_shared[mppi::math::nearest_multiple_4(BLOCKSIZE_X * BLOCKSIZE_Z * 2 * DYN_T::STATE_DIM)];
  float* u_shared = &x_dot_shared[mppi::math::nearest_multiple_4(BLOCKSIZE_X * BLOCKSIZE_Z * DYN_T::STATE_DIM)];
  float* y_shared = &u_shared[mppi::math::nearest_multiple_4(BLOCKSIZE_X * BLOCKSIZE_Z * DYN_T::CONTROL_DIM)];
  float* theta_s = &y_shared[mppi::math::nearest_multiple_4(BLOCKSIZE_X * BLOCKSIZE_Z * 2 * DYN_T::OUTPUT_DIM)];
  int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int j = 0;

  // Per-thread views into the shared regions.
  float* x;
  float* x_next;
  float* x_temp;
  float* x_dot;
  float* u;
  float* y;
  x = &x_shared[(BLOCKSIZE_X * threadIdx.z + threadIdx.x + 0) * DYN_T::STATE_DIM];
  x_next = &x_shared[(BLOCKSIZE_X * threadIdx.z + threadIdx.x + 1) * DYN_T::STATE_DIM];
  x_dot = &x_dot_shared[(BLOCKSIZE_X * threadIdx.z + threadIdx.x) * DYN_T::STATE_DIM];
  u = &u_shared[(BLOCKSIZE_X * threadIdx.z + threadIdx.x) * DYN_T::CONTROL_DIM];
  y = &y_shared[(BLOCKSIZE_X * threadIdx.z + threadIdx.x) * DYN_T::OUTPUT_DIM];

  // Every sample starts from the same initial state; the y-threads stripe the
  // copy (j starts at threadIdx.y and advances by blockDim.y).
  for (j = threadIdx.y; j < DYN_T::STATE_DIM; j += blockDim.y)
  {
    x[j] = x_init_d[j];
    x_dot[j] = 0;
  }
  for (j = threadIdx.y; j < DYN_T::OUTPUT_DIM; j += blockDim.y)
  {
    y[j] = 0;
  }
  __syncthreads();
  dynamics->initializeDynamics(x, u, y, theta_s, 0, dt);
  __syncthreads();

  for (int t = 0; t < num_timesteps; t++)
  {
    // Load this sample's control for timestep t.
    for (j = threadIdx.y; j < DYN_T::CONTROL_DIM; j += blockDim.y)
    {
      u[j] = u_d[global_idx * num_timesteps * DYN_T::CONTROL_DIM + t * DYN_T::CONTROL_DIM + j];
    }
    // Write out the PRE-step state and output so slot t holds x_t / y_t.
    for (j = threadIdx.y; j < DYN_T::STATE_DIM; j += blockDim.y)
    {
      x_next_d[global_idx * num_timesteps * DYN_T::STATE_DIM + t * DYN_T::STATE_DIM + j] = x[j];
    }
    for (j = threadIdx.y; j < DYN_T::OUTPUT_DIM; j += blockDim.y)
    {
      output_d[global_idx * num_timesteps * DYN_T::OUTPUT_DIM + t * DYN_T::OUTPUT_DIM + j] = y[j];
    }
    __syncthreads();
    dynamics->enforceConstraints(x, u);
    __syncthreads();
    dynamics->step(x, x_next, x_dot, u, y, theta_s, t, dt);
    __syncthreads();
    // Ping-pong the current/next state buffers for the next iteration.
    x_temp = x;
    x = x_next;
    x_next = x_temp;
  }
}

// Test fixture that owns a CUDA stream for the RacerSuspension tests.
class RacerSuspensionTest : public ::testing::Test
{
public:
  cudaStream_t stream;

  void SetUp() override
  {
    CudaCheckError();
    HANDLE_ERROR(cudaStreamCreate(&stream));
  }

  void TearDown() override
  {
    CudaCheckError();
    HANDLE_ERROR(cudaStreamDestroy(stream));
  }
};

// Compile-time dimensions and default-constructed texture helper.
TEST_F(RacerSuspensionTest, Template)
{
  auto dynamics = RacerSuspension(stream);
  EXPECT_EQ(14, RacerSuspension::STATE_DIM);
  EXPECT_EQ(2, RacerSuspension::CONTROL_DIM);
  EXPECT_NE(dynamics.getTextureHelper(), nullptr);
}

// The construction-time stream must propagate to the texture helper as well.
TEST_F(RacerSuspensionTest, BindStream)
{
  auto dynamics = RacerSuspension(stream);
  EXPECT_EQ(dynamics.stream_, stream) << "Stream binding failure.";
  EXPECT_NE(dynamics.getTextureHelper(), nullptr);
  EXPECT_EQ(dynamics.getTextureHelper()->stream_, stream);
}

// Compares the analytic angular-rate Jacobian returned by computeStateDeriv()
// against a central-free finite-difference approximation (loose tolerance).
// (This test continues on the next extracted chunk.)
TEST_F(RacerSuspensionTest, OmegaJacobian)
{
  using DYN_PARAMS = RacerSuspensionParams;
  auto dynamics = RacerSuspension(stream);
  RacerSuspension::state_array x = RacerSuspension::state_array::Zero();
  x[S_IND_CLASS(DYN_PARAMS, ATTITUDE_QW)] = 1;
  x[S_IND_CLASS(DYN_PARAMS, OMEGA_B_X)] = 0.1;
  x[S_IND_CLASS(DYN_PARAMS, OMEGA_B_Y)] = -0.03;
  x[S_IND_CLASS(DYN_PARAMS, OMEGA_B_Z)] = 0.02;
  x[S_IND_CLASS(DYN_PARAMS, V_I_X)] = 2;
  RacerSuspension::state_array x_dot0 = RacerSuspension::state_array::Zero();
  RacerSuspension::state_array x_dot1 = RacerSuspension::state_array::Zero();
  RacerSuspension::control_array u = RacerSuspension::control_array::Zero();
  RacerSuspension::output_array output = RacerSuspension::output_array::Zero();
  Eigen::Matrix3f omega_jac;
  float delta = 0.001;
  dynamics.computeStateDeriv(x, u, x_dot0, output, &omega_jac);
  for (int i = 0; i < 3; i++)
  {
    // Perturb one omega component, re-evaluate, then restore it.
    x[S_IND_CLASS(DYN_PARAMS, OMEGA_B_X) + i] += delta;
    dynamics.computeStateDeriv(x, u, x_dot1, output);
    x[S_IND_CLASS(DYN_PARAMS, OMEGA_B_X) + i] -= delta;
    float abs_tol = 2;
    EXPECT_NEAR(omega_jac.col(i)[0], (x_dot1[S_IND_CLASS(DYN_PARAMS, OMEGA_B_X)] -
x_dot0[S_IND_CLASS(DYN_PARAMS, OMEGA_B_X)]) / delta, abs_tol); EXPECT_NEAR(omega_jac.col(i)[1], (x_dot1[S_IND_CLASS(DYN_PARAMS, OMEGA_B_Y)] - x_dot0[S_IND_CLASS(DYN_PARAMS, OMEGA_B_Y)]) / delta, abs_tol); EXPECT_NEAR(omega_jac.col(i)[2], (x_dot1[S_IND_CLASS(DYN_PARAMS, OMEGA_B_Z)] - x_dot0[S_IND_CLASS(DYN_PARAMS, OMEGA_B_Z)]) / delta, abs_tol); } } TEST_F(RacerSuspensionTest, CPUvsGPU) { GTEST_SKIP(); const int NUM_PARALLEL_TESTS = 10; // MUST BE EVEN const int NUM_TIMESTEPS = 8; const float dt = 0.02; typedef RacerSuspension DYN; typedef Eigen::Matrix state_traj; typedef Eigen::Matrix control_traj; typedef Eigen::Matrix output_traj; typedef util::EigenAlignedVector control_trajectories; typedef util::EigenAlignedVector state_trajectories; typedef util::EigenAlignedVector output_trajectories; using state_array = typename DYN::state_array; using control_array = typename DYN::control_array; using output_array = typename DYN::output_array; using DYN_PARAMS = RacerSuspensionParams; auto dynamics = RacerSuspension(stream); auto control_range = dynamics.getControlRanges(); control_range[0] = make_float2(-1, 1); control_range[1] = make_float2(-1, 1); dynamics.setControlRanges(control_range); dynamics.GPUSetup(); std::default_random_engine generator(15.0); std::normal_distribution throttle_distribution(0.3, 0.3); std::normal_distribution steering_distribution(0.0, 0.8); float* x_init_d; float* u_d; float* x_next_d; float* output_d; HANDLE_ERROR(cudaMalloc((void**)&x_init_d, sizeof(float) * DYN::STATE_DIM * NUM_PARALLEL_TESTS)); HANDLE_ERROR(cudaMalloc((void**)&u_d, sizeof(float) * DYN::CONTROL_DIM * NUM_PARALLEL_TESTS * NUM_TIMESTEPS)); HANDLE_ERROR(cudaMalloc((void**)&x_next_d, sizeof(float) * DYN::STATE_DIM * NUM_PARALLEL_TESTS * NUM_TIMESTEPS)); HANDLE_ERROR(cudaMalloc((void**)&output_d, sizeof(float) * DYN::OUTPUT_DIM * NUM_PARALLEL_TESTS * NUM_TIMESTEPS)); // Input variables DYN::state_array x_init = DYN::state_array::Zero(); x_init[S_IND_CLASS(DYN_PARAMS, P_I_X)] = 
10; x_init[S_IND_CLASS(DYN_PARAMS, P_I_Y)] = 20; x_init[S_IND_CLASS(DYN_PARAMS, P_I_Z)] = 30; x_init[S_IND_CLASS(DYN_PARAMS, ATTITUDE_QW)] = 1; HANDLE_ERROR( cudaMemcpyAsync(x_init_d, x_init.data(), sizeof(float) * DYN::STATE_DIM, cudaMemcpyHostToDevice, stream)); control_trajectories u_noise; for (int s = 0; s < NUM_PARALLEL_TESTS; s++) { control_traj sample_u = control_traj::Zero(); for (int t = 0; t < NUM_TIMESTEPS; t++) { sample_u(0, t) = throttle_distribution(generator); sample_u(1, t) = steering_distribution(generator); } u_noise.push_back(sample_u); HANDLE_ERROR(cudaMemcpyAsync(u_d + s * NUM_TIMESTEPS * DYN::CONTROL_DIM, u_noise[s].data(), sizeof(float) * NUM_TIMESTEPS * DYN::CONTROL_DIM, cudaMemcpyHostToDevice, stream)); } // Output variables state_trajectories x_next_CPU(NUM_PARALLEL_TESTS); state_trajectories x_next_GPU(NUM_PARALLEL_TESTS); output_trajectories output_CPU(NUM_PARALLEL_TESTS); output_trajectories output_GPU(NUM_PARALLEL_TESTS); // CPU Test for (int s = 0; s < NUM_PARALLEL_TESTS; s++) { state_traj sample_state_traj; output_traj sample_output_traj; state_array xdot; state_array x = x_init; state_array x_next = x_init; output_array output = output_array::Zero(); control_array u; for (int t = 0; t < NUM_TIMESTEPS; t++) { if (t == 0) { dynamics.initializeDynamics(x, u, output, t, dt); } u = u_noise[s].col(t); sample_state_traj.col(t) = x; sample_output_traj.col(t) = output; dynamics.enforceConstraints(x, u); dynamics.step(x, x_next, xdot, u, output, t, dt); x = x_next; } x_next_CPU[s] = sample_state_traj; output_CPU[s] = sample_output_traj; } // GPU Test const int BLOCKSIZE_X = 2; const int BLOCKSIZE_Y = 8; const int NUM_GRIDS_X = (NUM_PARALLEL_TESTS - 1) / BLOCKSIZE_X + 1; dim3 block_dim(BLOCKSIZE_X, BLOCKSIZE_Y, 1); dim3 grid_dim(NUM_GRIDS_X, 1, 1); // Ensure that there won't memory overwriting due to inproper indexing static_assert(NUM_PARALLEL_TESTS % BLOCKSIZE_X == 0); int num_shared = block_dim.x * block_dim.z; unsigned shared_mem = 
mppi::kernels::calcClassSharedMemSize(&dynamics, block_dim) + sizeof(float) * (3 * mppi::math::nearest_multiple_4(num_shared * DYN::STATE_DIM) + mppi::math::nearest_multiple_4(num_shared * DYN::OUTPUT_DIM) + mppi::math::nearest_multiple_4(num_shared * DYN::CONTROL_DIM)); runGPUDynamics<<>>( dynamics.model_d_, NUM_TIMESTEPS, dt, x_init_d, u_d, x_next_d, output_d); for (int s = 0; s < NUM_PARALLEL_TESTS; s++) { HANDLE_ERROR(cudaMemcpyAsync(x_next_GPU[s].data(), x_next_d + s * NUM_TIMESTEPS * DYN::STATE_DIM, sizeof(float) * NUM_TIMESTEPS * DYN::STATE_DIM, cudaMemcpyDeviceToHost, stream)); HANDLE_ERROR(cudaMemcpyAsync(output_GPU[s].data(), output_d + s * NUM_TIMESTEPS * DYN::OUTPUT_DIM, sizeof(float) * NUM_TIMESTEPS * DYN::OUTPUT_DIM, cudaMemcpyDeviceToHost, stream)); } HANDLE_ERROR(cudaStreamSynchronize(stream)); for (int s = 0; s < NUM_PARALLEL_TESTS; s++) { for (int t = 0; t < NUM_TIMESTEPS; t++) { for (int d = 0; d < DYN::STATE_DIM; d++) { ASSERT_NEAR(x_next_CPU[s](d, t), x_next_GPU[s](d, t), 1.0) << "Sample " << s << ", t = " << t << ", state_dim: " << d << std::endl; } for (int d = 0; d < DYN::OUTPUT_DIM; d++) { ASSERT_TRUE(isfinite(output_CPU[s](d, t))) << "NaNs/inf at sample " << s << " t: " << t << ", dim: " << d << std::endl; ASSERT_NEAR(output_CPU[s](d, t), output_GPU[s](d, t), 1.0) << "Sample " << s << ", t = " << t << ", output_dim: " << d << std::endl; } } } } /* float c_t = 1.3; float c_b = 2.5; float c_v = 3.7; float c_0 = 4.9; float wheel_base = 0.3; */ // TEST_F(RacerSuspensionTest, ComputeDynamics) // { // RacerSuspension dynamics = RacerSuspension(); // RacerDubinsParams params = dynamics.getParams(); // RacerSuspension::state_array x = RacerSuspension::state_array::Zero(); // RacerSuspension::control_array u = RacerSuspension::control_array::Zero(); // // // computeDynamics should not touch the roll/pitch element // RacerSuspension::state_array next_x = RacerSuspension::state_array::Ones() * 0.153; // dynamics.computeDynamics(x, u, next_x); // 
EXPECT_FLOAT_EQ(next_x(0), 4.9); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), 0); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // // x << 1, M_PI_2, 0, 3, 0, 0.5, -0.5; // u << 1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 1.3 - 3.7); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_NEAR(next_x(2), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(3), 1); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // // x << 1, 0, 0, 3, 0, 0.5, -0.5; // u << -1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 - 2.5 - 3.7); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), 1); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // // x << -1, 0, 0, 3, 0, 0.5, -0.5; // u << 1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 3.7 + 1.3); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), -1); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // // x << -1, 0, 0, 3, 0, 0.5, -0.5; // u << -1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 2.5 + 3.7); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), -1); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // // x << -3, 0, 0, 3, 0, 0.5, -0.5; // u << -1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 + 2.5 * 3 + 3.7 * 3); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), -3); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 
0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // // x << 4, 0, 0, 3, 0, 0.5, -0.5; // u << -1, 0; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 - 2.5 * 4 - 3.7 * 4); // EXPECT_FLOAT_EQ(next_x(1), 0); // EXPECT_FLOAT_EQ(next_x(2), 4); // EXPECT_FLOAT_EQ(next_x(3), 0); // EXPECT_FLOAT_EQ(next_x(4), 0); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // // x << 1, M_PI, 0, 3, 0, 0.5, -0.5; // u << 0, 1; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 - 3.7); // EXPECT_FLOAT_EQ(next_x(1), (1 / .3) * tan(0)); // EXPECT_FLOAT_EQ(next_x(2), -1); // EXPECT_NEAR(next_x(3), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(4), -1 / 2.45); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // // x << 1, M_PI, 0, 0, 0.5, 0.5, -0.5; // u << 1, -1; // dynamics.computeDynamics(x, u, next_x); // EXPECT_FLOAT_EQ(next_x(0), 4.9 - 3.7 + 1.3); // EXPECT_FLOAT_EQ(next_x(1), (1 / .3) * tan(0.5)); // EXPECT_FLOAT_EQ(next_x(2), -1); // EXPECT_NEAR(next_x(3), 0, 1e-7); // EXPECT_FLOAT_EQ(next_x(4), 1 / 2.45); // EXPECT_FLOAT_EQ(next_x(5), 0.153); // EXPECT_FLOAT_EQ(next_x(6), 0.153); // } // // TEST_F(RacerSuspensionTest, TestModelGPU) // { // RacerSuspension dynamics = RacerSuspension(); // dynamics.GPUSetup(); // CudaCheckError(); // // Eigen::Matrix control_trajectory; // control_trajectory = Eigen::Matrix::Random(); // Eigen::Matrix state_trajectory; // state_trajectory = Eigen::Matrix::Random(); // // std::vector> s(100); // std::vector> s_der(100); // // steering, throttle // std::vector> u(100); // for (int state_index = 0; state_index < s.size(); state_index++) // { // for (int dim = 0; dim < s[0].size(); dim++) // { // s[state_index][dim] = state_trajectory.col(state_index)(dim); // } // for (int dim = 0; dim < u[0].size(); dim++) // { // u[state_index][dim] = control_trajectory.col(state_index)(dim); // } // } // // // These variables will be changed so initialized to the right size 
only // // // Run dynamics on dynamicsU // // Run dynamics on GPU // for (int y_dim = 1; y_dim <= 4; y_dim++) // { // launchComputeDynamicsTestKernel(dynamics, s, u, s_der, y_dim); // for (int point = 0; point < 100; point++) // { // RacerSuspension::state_array state = state_trajectory.col(point); // RacerSuspension::control_array control = control_trajectory.col(point); // RacerSuspension::state_array state_der_cpu = RacerSuspension::state_array::Zero(); // // dynamics.computeDynamics(state, control, state_der_cpu); // for (int dim = 0; dim < RacerSuspension::STATE_DIM; dim++) // { // EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], 1e-5) << "at index " << point << " with y_dim " << y_dim; // EXPECT_TRUE(isfinite(s_der[point][dim])); // } // } // } // // dynamics.freeCudaMem(); // CudaCheckError(); // } // // TEST_F(RacerSuspensionTest, TestUpdateState) // { // CudaCheckError(); // RacerSuspension dynamics = RacerSuspension(); // RacerSuspension::state_array state; // RacerSuspension::state_array state_der; // // // TODO add in the elevation map // // state << 0, 0, 0, 0, 0, -0.5, 0.5; // state_der << 1, 1, 1, 1, 1, 0, 0; // dynamics.updateState(state, state_der, 0.1); // EXPECT_TRUE(state_der == RacerSuspension::state_array::Zero()); // EXPECT_FLOAT_EQ(state(0), 0.1); // EXPECT_FLOAT_EQ(state(1), 0.1); // EXPECT_FLOAT_EQ(state(2), 0.1); // EXPECT_FLOAT_EQ(state(3), 0.1); // EXPECT_FLOAT_EQ(state(4), 1.0 + (0 - 1.0) * expf(-0.6 * 0.1)); // EXPECT_FLOAT_EQ(state(5), -0.5); // EXPECT_FLOAT_EQ(state(6), 0.5); // // state << 0, M_PI - 0.1, 0, 0, 0, -0.5, 0.5; // state_der << 1, 1, 1, 1, 1; // dynamics.updateState(state, state_der, 1.0); // EXPECT_TRUE(state_der == RacerSuspension::state_array::Zero()); // EXPECT_FLOAT_EQ(state(0), 1.0); // EXPECT_FLOAT_EQ(state(1), 1.0 - M_PI - 0.1); // EXPECT_FLOAT_EQ(state(2), 1.0); // EXPECT_FLOAT_EQ(state(3), 1.0); // EXPECT_FLOAT_EQ(state(4), 1.0 + (0 - 1.0) * expf(-0.6 * 1.0)); // EXPECT_FLOAT_EQ(state(5), -0.5); // 
EXPECT_FLOAT_EQ(state(6), 0.5); // // state << 0, -M_PI + 0.1, 0, 0, 0, -0.5, 0.5; // state_der << 1, -1, 1, 1, 1; // dynamics.updateState(state, state_der, 1.0); // EXPECT_TRUE(state_der == RacerSuspension::state_array::Zero()); // EXPECT_FLOAT_EQ(state(0), 1.0); // EXPECT_FLOAT_EQ(state(1), M_PI + 0.1 - 1.0); // EXPECT_FLOAT_EQ(state(2), 1.0); // EXPECT_FLOAT_EQ(state(3), 1.0); // EXPECT_FLOAT_EQ(state(4), 1.0 + (0 - 1.0) * expf(-0.6 * 1.0)); // EXPECT_FLOAT_EQ(state(5), -0.5); // EXPECT_FLOAT_EQ(state(6), 0.5); // // CudaCheckError(); // } // // TEST_F(RacerSuspensionTest, TestUpdateStateGPU) // { // CudaCheckError(); // RacerSuspension dynamics = RacerSuspension(); // CudaCheckError(); // dynamics.GPUSetup(); // CudaCheckError(); // // Eigen::Matrix control_trajectory; // control_trajectory = Eigen::Matrix::Random(); // Eigen::Matrix state_trajectory; // state_trajectory = Eigen::Matrix::Random(); // // std::vector> s(100); // std::vector> s_der(100); // // steering, throttle // std::vector> u(100); // // RacerSuspension::state_array state; // RacerSuspension::control_array control; // RacerSuspension::state_array state_der_cpu = RacerSuspension::state_array::Zero(); // // // Run dynamics on dynamicsU // // Run dynamics on GPU // for (int y_dim = 1; y_dim <= 10; y_dim++) // { // for (int state_index = 0; state_index < s.size(); state_index++) // { // for (int dim = 0; dim < s[0].size(); dim++) // { // s[state_index][dim] = state_trajectory.col(state_index)(dim); // } // for (int dim = 0; dim < u[0].size(); dim++) // { // u[state_index][dim] = control_trajectory.col(state_index)(dim); // } // } // // launchComputeStateDerivTestKernel(dynamics, s, u, s_der, y_dim); // launchUpdateStateTestKernel(dynamics, s, s_der, 0.1f, y_dim); // for (int point = 0; point < 100; point++) // { // RacerSuspension::state_array state = state_trajectory.col(point); // RacerSuspension::control_array control = control_trajectory.col(point); // RacerSuspension::state_array 
state_der_cpu = RacerSuspension::state_array::Zero(); // // dynamics.computeDynamics(state, control, state_der_cpu); // dynamics.updateState(state, state_der_cpu, 0.1f); // for (int dim = 0; dim < RacerSuspension::STATE_DIM; dim++) // { // if (dim < 5) // { // EXPECT_FLOAT_EQ(state_der_cpu(dim), s_der[point][dim]) << "at index " << point << " with y_dim " << y_dim; // EXPECT_NEAR(state(dim), s[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; // EXPECT_TRUE(isfinite(s[point][dim])); // } // else // { // EXPECT_FLOAT_EQ(state_der_cpu(dim), s_der[point][dim]) << "at index " << point << " with y_dim " << y_dim; // EXPECT_NEAR(s[point][dim], 0.0, 1e-4) // << "at index " << point << " with y_dim " << y_dim << " state index " << dim; // EXPECT_TRUE(isfinite(s[point][dim])); // } // } // } // } // dynamics.freeCudaMem(); // } // // TEST_F(RacerSuspensionTest, ComputeStateTrajectoryFiniteTest) // { // RacerSuspension dynamics = RacerSuspension(); // RacerDubinsParams params; // params.c_t = 3.0; // params.c_b = 0.2; // params.c_v = 0.2; // params.c_0 = 0.2; // params.wheel_base = 3.0; // params.steering_constant = 1.0; // dynamics.setParams(params); // // Eigen::Matrix control_trajectory; // control_trajectory = Eigen::Matrix::Zero(); // Eigen::Matrix state_trajectory; // state_trajectory = Eigen::Matrix::Zero(); // RacerSuspension::state_array state_der; // RacerSuspension::state_array x; // x << 0, 1.46919e-6, 0.0140179, 1.09739e-8, -0.000735827; // // for (int i = 0; i < 500; i++) // { // RacerSuspension::control_array u = control_trajectory.col(i); // dynamics.computeDynamics(x, u, state_der); // EXPECT_TRUE(x.allFinite()); // EXPECT_TRUE(u.allFinite()); // EXPECT_TRUE(state_der.allFinite()); // dynamics.updateState(x, state_der, 0.02); // EXPECT_TRUE(x.allFinite()); // EXPECT_TRUE(u.allFinite()); // EXPECT_TRUE(state_der == RacerSuspension::state_array::Zero()); // } // params.steering_constant = 0.5; // dynamics.setParams(params); // // x << 0, 
1.46919e-6, 0.0140179, 1.09739e-8, -1.0; // for (int i = 0; i < 500; i++) // { // RacerSuspension::control_array u = control_trajectory.col(i); // dynamics.computeDynamics(x, u, state_der); // EXPECT_TRUE(x.allFinite()); // EXPECT_TRUE(u.allFinite()); // EXPECT_TRUE(state_der.allFinite()); // dynamics.updateState(x, state_der, 0.02); // EXPECT_TRUE(x.allFinite()); // EXPECT_TRUE(u.allFinite()); // EXPECT_TRUE(state_der == RacerSuspension::state_array::Zero()); // } // } // // /* // class LinearDummy : public RacerSuspension { // public: // bool computeGrad(const Eigen::Ref & state, // const Eigen::Ref& control, // Eigen::Ref A, // Eigen::Ref B) { // return false; // }; // }; // // TEST_F(RacerSuspensionTest, TestComputeGradComputation) { // Eigen::Matrix numeric_jac; Eigen::Matrix analytic_jac; RacerSuspension::state_array // state; state << 1, 2, 3, 4; RacerSuspension::control_array control; control << 5; // // auto analytic_grad_model = RacerSuspension(); // // RacerSuspension::dfdx A_analytic = RacerSuspension::dfdx::Zero(); // RacerSuspension::dfdu B_analytic = RacerSuspension::dfdu::Zero(); // // analytic_grad_model.computeGrad(state, control, A_analytic, B_analytic); // // auto numerical_grad_model = LinearDummy(); // // std::shared_ptr> ddp_model = // std::make_shared>(&numerical_grad_model); // // analytic_jac.leftCols() = A_analytic; // analytic_jac.rightCols() = B_analytic; // numeric_jac = ddp_model->df(state, control); // // ASSERT_LT((numeric_jac - analytic_jac).norm(), 1e-3) << "Numeric Jacobian\n" << numeric_jac << "\nAnalytic // Jacobian\n" // << analytic_jac; // } // // */ ================================================ FILE: tests/feedback_controllers/CMakeLists.txt ================================================ add_executable(generic_feedback_controller_test ${PROJECT_SOURCE_DIR}/tests/test_main.cpp generic_feedback_controller_test.cu) target_link_libraries(generic_feedback_controller_test gtest gmock gtest_main cartpole_mppi) 
gtest_discover_tests(generic_feedback_controller_test) set_target_properties(generic_feedback_controller_test PROPERTIES FOLDER test) add_executable(ddp_tests ${PROJECT_SOURCE_DIR}/tests/test_main.cpp ddp_test.cu) target_link_libraries(ddp_tests gtest gmock gtest_main cartpole_mppi) gtest_discover_tests(ddp_tests) set_target_properties(ddp_tests PROPERTIES FOLDER test) ================================================ FILE: tests/feedback_controllers/ddp_test.cu ================================================ #include // #include // #include // #include #include #include class ModelWrapper_Test : public testing::Test { public: CartpoleDynamics model = CartpoleDynamics(1, 1, 1); std::shared_ptr> ddp_model = std::make_shared>(&model); CartpoleDynamics::state_array state; CartpoleDynamics::control_array control; }; TEST_F(ModelWrapper_Test, StateDerivative_1) { CartpoleDynamics::state_array result; CartpoleDynamics::state_array known_result; state << 0, 0, 0, 0; control << 0; result = ddp_model->f(state, control); known_result << 0, 0, 0, 0; ASSERT_EQ(known_result, result); } TEST_F(ModelWrapper_Test, StateDerivative_2) { CartpoleDynamics::state_array result; CartpoleDynamics::state_array known_result; state << 1, 2, 3, 4; control << 5; result = ddp_model->f(state, control); model.computeStateDeriv(state, control, known_result); ASSERT_EQ(known_result, result); } TEST_F(ModelWrapper_Test, Jacobian_1) { CartpoleDynamics::Jacobian result; CartpoleDynamics::Jacobian known_result; CartpoleDynamics::dfdx A = CartpoleDynamics::dfdx::Zero(); CartpoleDynamics::dfdu B = CartpoleDynamics::dfdu::Zero(); state << 1, 2, 3, 4; control << 5; model.computeGrad(state, control, A, B); known_result.leftCols() = A; known_result.rightCols() = B; result = ddp_model->df(state, control); ASSERT_EQ(known_result, result); } class TrackingCosts_Test : public testing::Test { public: typedef Eigen::Matrix square_u_matrix; typedef Eigen::Matrix square_xu_matrix; CartpoleDynamics::dfdx Q; 
square_u_matrix R; square_xu_matrix QR; int num_timesteps; std::shared_ptr>> tracking_cost; std::shared_ptr>> terminal_cost; CartpoleDynamics::state_array state; CartpoleDynamics::control_array control; protected: void SetUp() override { Q = CartpoleDynamics::dfdx::Identity(); R = square_u_matrix::Identity(); QR = square_xu_matrix::Identity(); num_timesteps = 100; tracking_cost = std::make_shared>>(Q, R, num_timesteps); terminal_cost = std::make_shared>>(Q); state.resize(CartpoleDynamics::STATE_DIM, 1); control.resize(CartpoleDynamics::CONTROL_DIM, 1); } }; TEST_F(TrackingCosts_Test, ComputeCost) { state << 1, 2, 3, 4; control << 5; int timestep = 0; ASSERT_FLOAT_EQ(1 + 4 + 9 + 16 + 25, tracking_cost->c(state, control, timestep)); ASSERT_FLOAT_EQ(1 + 4 + 9 + 16, terminal_cost->c(state)); } TEST_F(TrackingCosts_Test, ComputeCostGradient) { Eigen::Matrix known_result; state << 1, 2, 3, 4; control << 5; known_result.block(0, 0, 4, 1) = state; known_result.block(4, 0, 1, 1) = control; // std::cout << known_result << std::endl; int timestep = 0; ASSERT_EQ(QR * known_result, tracking_cost->dc(state, control, timestep)) << "Known:\n" << QR * known_result << "\nTracking cost: \n" << tracking_cost->dc(state, control, timestep); ASSERT_EQ(Q * state, terminal_cost->dc(state)) << "Known:\n" << Q * state << "\nTerminal cost: \n" << terminal_cost->dc(state); } TEST_F(TrackingCosts_Test, ComputeCostHessian) { state << 1, 2, 3, 4; control << 5; int timestep = 0; ASSERT_EQ(QR, tracking_cost->d2c(state, control, timestep)); ASSERT_EQ(Q, terminal_cost->d2c(state)); } TEST(DDPSolver_Test, Cartpole_Tracking) { using SAMPLER_T = mppi::sampling_distributions::GaussianDistribution; // SETUP THE MPPI CONTROLLER CartpoleDynamics model = CartpoleDynamics(1.0, 1.0, 1.0); CartpoleQuadraticCost cost; CartpoleQuadraticCostParams new_params; new_params.cart_position_coeff = 100; new_params.pole_angle_coeff = 200; new_params.cart_velocity_coeff = 10; new_params.pole_angular_velocity_coeff = 20; 
new_params.control_cost_coeff[0] = 1; new_params.terminal_cost_coeff = 0; new_params.desired_terminal_state[0] = 0; new_params.desired_terminal_state[1] = 0; new_params.desired_terminal_state[2] = M_PI; new_params.desired_terminal_state[3] = 0; cost.setParams(new_params); float dt = 0.01; int max_iter = 100; float lambda = 0.25; float alpha = 0.001; const int num_timesteps = 100; auto fb_controller = DDPFeedback(&model, dt); auto sampler = SAMPLER_T(); auto sampler_params = sampler.getParams(); for (int i = 0; i < CartpoleDynamics::CONTROL_DIM; i++) { sampler_params.std_dev[i] = 5.0; } sampler.setParams(sampler_params); DDPParams fb_params; fb_params.Q = 100 * CartpoleDynamics::dfdx:: Identity(); // Eigen::MatrixXf::Identity(CartpoleDynamics::STATE_DIM,CartpoleDynamics::STATE_DIM); fb_params.R = TrackingCosts_Test::square_u_matrix:: Identity(); // Eigen::MatrixXf::Identity(CartpoleDynamics::CONTROL_DIM,CartpoleDynamics::CONTROL_DIM); fb_params.Q_f = fb_params.Q; fb_params.num_iterations = 20; fb_controller.setParams(fb_params); fb_controller.initTrackingController(); auto controller = VanillaMPPIController, num_timesteps, 2048, SAMPLER_T>( &model, &cost, &fb_controller, &sampler, dt, max_iter, lambda, alpha); auto controller_params = controller.getParams(); controller_params.dynamics_rollout_dim_ = dim3(64, 8, 1); controller_params.cost_rollout_dim_ = dim3(num_timesteps, 1, 1); controller.setParams(controller_params); CartpoleDynamics::state_array current_state = model.getZeroState(); // Compute the control controller.computeControl(current_state, 0); auto nominal_state = controller.getTargetStateSeq(); auto nominal_control = controller.getControlSeq(); // END MPPI CONTROLLER // util::DefaultLogger logger; // bool verbose = false; // int num_iterations = 20; // Eigen::MatrixXf Q; // Eigen::MatrixXf R; // Eigen::MatrixXf QR; // Q = 100*Eigen::MatrixXf::Identity(CartpoleDynamics::STATE_DIM,CartpoleDynamics::STATE_DIM); // R = 
Eigen::MatrixXf::Identity(CartpoleDynamics::CONTROL_DIM,CartpoleDynamics::CONTROL_DIM); // QR = Eigen::MatrixXf(CartpoleDynamics::STATE_DIM+ CartpoleDynamics::CONTROL_DIM, // CartpoleDynamics::STATE_DIM + CartpoleDynamics::CONTROL_DIM); // QR.template topLeftCorner() = Q; // QR.template bottomRightCorner() = R; // std::shared_ptr> ddp_model = // std::make_shared>(&model); // std::shared_ptr>> ddp_solver_ = // std::make_shared>>(dt, num_timesteps, num_iterations, &logger, // verbose); // std::shared_ptr>> tracking_cost = // std::make_shared>>(Q,R,num_timesteps); // std::shared_ptr>> terminal_cost = // std::make_shared>>(Q); // tracking_cost->setTargets(nominal_state.data(), nominal_control.data(), num_timesteps); // terminal_cost->xf = tracking_cost->traj_target_x_.col(num_timesteps - 1); Eigen::Matrix s; s << 0.0, 0.0, 0.0, 0.0; Eigen::MatrixXf control_traj = Eigen::MatrixXf::Zero(CartpoleDynamics::CONTROL_DIM, num_timesteps); fb_controller.computeFeedback(s, nominal_state, nominal_control); // OptimizerResult> result_ = ddp_solver_->run(s, control_traj, // *ddp_model, *tracking_cost, *terminal_cost); // std::cout << result_.state_trajectory << std::endl; for (int i = 0; i < num_timesteps; ++i) { ASSERT_NEAR((nominal_state.col(i) - fb_controller.result_.state_trajectory.col(i)).norm(), 0.0f, 1e-2) << "Failed on timestep: " << i; } } TEST(DDPSolver_Test, Quadrotor_Tracking) { const int num_timesteps = 500; using DYN = QuadrotorDynamics; using COST = QuadrotorQuadraticCost; using CONTROLLER = VanillaMPPIController, num_timesteps, 2048>; std::array control_ranges; for (int i = 0; i < 3; i++) { control_ranges[i] = make_float2(-2.5, 2.5); } control_ranges[3] = make_float2(0, 36); DYN model(control_ranges); DYN::state_array x_goal; x_goal << 6, 4, 3, // position 0, 0, 0, // velocity 0.7071068, 0, 0, 0.7071068, // quaternion 0, 0, 0; // angular speed float dt = 0.01; // Create DDP and find feedback gains util::DefaultLogger logger; // bool verbose = false; // 
CONTROLLER::StateCostWeight Q; // CONTROLLER::ControlCostWeight R; // CONTROLLER::Hessian Qf; DDPParams fb_params; fb_params.Q = DDPParams::StateCostWeight::Identity(); fb_params.R = DDPParams::ControlCostWeight::Identity(); fb_params.Q_f = DDPParams::Hessian::Identity(); fb_params.Q.diagonal() << 25, 25, 300, 15, 15, 300, 0, 0, 0, 0, 30, 30, 30; fb_params.Q_f.diagonal() << 250, 250, 3000, 150, 150, 3000, 0, 0, 0, 0, 300, 300, 300; fb_params.R.diagonal() << 550, 550, 550, 1; fb_params.num_iterations = 100; Eigen::MatrixXf control_traj = CONTROLLER::control_trajectory::Zero(); CONTROLLER::state_trajectory ddp_state_traj = CONTROLLER::state_trajectory::Zero(); for (int i = 0; i < num_timesteps; i++) { ddp_state_traj.col(i) = x_goal; control_traj.col(i) = model.zero_control_; } auto fb_controller = DDPFeedback(&model, dt); fb_controller.setParams(fb_params); fb_controller.initTrackingController(); // auto ddp_model = std::make_shared>(&model); // auto ddp_solver = std::make_shared>>(dt, // num_timesteps, // num_iterations, // &logger, // verbose); // auto tracking_cost = // std::make_shared>>(Q, R, num_timesteps); // auto terminal_cost = // std::make_shared>>(Qf); // tracking_cost->setTargets(ddp_state_traj.data(), control_traj.data(), // num_timesteps); // terminal_cost->xf = tracking_cost->traj_target_x_.col(num_timesteps - 1); DYN::state_array x_real; x_real << 0, -0.5, 0, // position 0, 0.5, 0, // velocity 1, 0, 0, 0, // quaternion 0, 0, 0; // angular speed // DYN::control_array control_min, control_max; // for (int i = 0; i < DYN::CONTROL_DIM; i++) { // control_min(i) = control_ranges[i].x; // control_max(i) = control_ranges[i].y; // } std::cout << "Starting DDP" << std::endl; fb_controller.computeFeedback(x_real, ddp_state_traj, control_traj); // OptimizerResult> result_ = ddp_solver->run(x_real, // control_traj, // *ddp_model, // *tracking_cost, // *terminal_cost, // control_min, // control_max); auto control_feedback_gains = 
fb_controller.result_.feedback_gain; std::cout << "DDP Optimal State Sequence:\n" << fb_controller.result_.state_trajectory.transpose() << std::endl; DYN::state_array x_deriv, x, x_nom; x = x_real; DYN::control_array u_total, fb_u; for (int t = 0; t < num_timesteps; ++t) { x_nom = fb_controller.result_.state_trajectory.col(t); fb_u = fb_controller.k(x, x_nom, t); u_total = fb_u; model.enforceConstraints(x, u_total); std::cout << " t = " << t * dt << ", State_diff norm: " << (x - x_goal).norm() << ", Control: " << u_total.transpose() << std::endl; std::cout << "Feedback matrix:\n" << control_feedback_gains[t] << std::endl; model.computeStateDeriv(x, u_total, x_deriv); model.updateState(x, x_deriv, dt); } std::cout << std::fixed << std::setprecision(5) << "X_goal: " << x_goal.transpose() << "\nX_end: " << x.transpose() << "\ndiff in state: " << (x - x_goal).norm() << std::endl; EXPECT_LE((x - x_goal).norm(), 3); } ================================================ FILE: tests/feedback_controllers/generic_feedback_controller_test.cu ================================================ #include #include #include template struct DynamicsTesterParams : public DynamicsParams { enum class StateIndex : int { NUM_STATES = STATE_DIM }; enum class ControlIndex : int { NUM_CONTROLS = CONTROL_DIM }; enum class OutputIndex : int { NUM_OUTPUTS = OUTPUT_DIM }; int var_1 = 1; int var_2 = 2; float4 var_4; }; struct FeedbackParams { int var_10 = 10; int var_20 = 20; float var_30 = 3.14; }; template class DynamicsTester : public MPPI_internal::Dynamics, DynamicsTesterParams> { public: using PARENT_CLASS = MPPI_internal::Dynamics, DynamicsTesterParams>; using state_array = typename PARENT_CLASS::state_array; using control_array = typename PARENT_CLASS::control_array; DynamicsTester(cudaStream_t stream = 0) : PARENT_CLASS(stream) { } DynamicsTester(std::array control_rngs, cudaStream_t stream = 0) : PARENT_CLASS(control_rngs, stream) { } void computeDynamics(const Eigen::Ref& state, const 
Eigen::Ref& control, Eigen::Ref state_der) { state_der(1) = control(0); } void computeKinematics(const Eigen::Ref& state, Eigen::Ref s_der) { s_der(0) = state(0) + state(1); }; // TODO must be properly parallelized __device__ void computeDynamics(float* state, float* control, float* state_der, float* theta_s = nullptr) { state_der[1] = control[0]; } // TODO must be properly parallelized __device__ void computeKinematics(float* state, float* state_der) { state_der[0] = state[0] + state[1]; } }; struct TestGPUState : GPUState { int testing = 5; }; class TestGPUFeedbackController : public GPUFeedbackController, TestGPUState> { public: typedef MockDynamics DYN_T; typedef GPUFeedbackController, TestGPUState> PARENT_CLASS; TestGPUFeedbackController(cudaStream_t stream = 0) : PARENT_CLASS(stream) { } void allocateCudaMemory() { } void deallocateCUDAMemory() { } void copyToDevice() { } void copyFromDevice() { } }; class TestFeedbackController : public FeedbackController { public: typedef FeedbackController PARENT_CLASS; using INTERNAL_STATE_T = typename PARENT_CLASS::TEMPLATED_FEEDBACK_STATE; TestFeedbackController(float dt = 0.01, int num_timesteps = 10, cudaStream_t stream = 0) : PARENT_CLASS(dt, num_timesteps, stream) { } control_array k_(const Eigen::Ref& x_act, const Eigen::Ref& x_goal, int t, INTERNAL_STATE_T& fb_state) override { } // might not be a needed method void computeFeedback(const Eigen::Ref& init_state, const Eigen::Ref& goal_traj, const Eigen::Ref& control_traj) override { } void initTrackingController() override { } }; TEST(FeedbackController, Constructor) { cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); TestFeedbackController feedbackController(0.01, 10, stream); feedbackController.GPUSetup(); EXPECT_EQ(feedbackController.getHostPointer()->stream_, stream) << "Stream binding failure."; EXPECT_EQ(feedbackController.getHostPointer()->GPUMemStatus_, true) << "GPU not set up"; EXPECT_NE(feedbackController.getDevicePointer(), nullptr) << "GPU 
is not in device pointer"; HANDLE_ERROR(cudaStreamDestroy(stream)); } TEST(FeedbackController, test) { } ================================================ FILE: tests/include/kernel_tests/core/normexp_kernel_test.cu ================================================ #include "normexp_kernel_test.cuh" template void launchNormExp_KernelTest(std::array& trajectory_costs_host, float gamma, float baseline, std::array& normalized_compute) { // Allocate CUDA memory float* trajectory_costs_d; HANDLE_ERROR(cudaMalloc((void**)&trajectory_costs_d, sizeof(float) * trajectory_costs_host.size())) HANDLE_ERROR(cudaMemcpy(trajectory_costs_d, trajectory_costs_host.data(), sizeof(float) * trajectory_costs_host.size(), cudaMemcpyHostToDevice)) mppi::kernels::normExpKernel<<<1, NUM_ROLLOUTS>>>(NUM_ROLLOUTS, trajectory_costs_d, gamma, baseline); CudaCheckError(); HANDLE_ERROR(cudaMemcpy(normalized_compute.data(), trajectory_costs_d, sizeof(float) * trajectory_costs_host.size(), cudaMemcpyDeviceToHost)) cudaFree(trajectory_costs_d); } template void launchGenericNormExpKernelTest(std::array trajectory_costs_host, float gamma, float baseline, std::array& normalized_compute) { // Allocate CUDA memory float* trajectory_costs_d; HANDLE_ERROR(cudaMalloc((void**)&trajectory_costs_d, sizeof(float) * trajectory_costs_host.size())); HANDLE_ERROR(cudaMemcpy(trajectory_costs_d, trajectory_costs_host.data(), sizeof(float) * trajectory_costs_host.size(), cudaMemcpyHostToDevice)); dim3 dimBlock(BLOCKSIZE_X, 1, 1); dim3 dimGrid((NUM_ROLLOUTS - 1) / BLOCKSIZE_X + 1, 1, 1); mppi::kernels::normExpKernel<<>>(NUM_ROLLOUTS, trajectory_costs_d, gamma, baseline); CudaCheckError(); HANDLE_ERROR(cudaMemcpy(normalized_compute.data(), trajectory_costs_d, sizeof(float) * trajectory_costs_host.size(), cudaMemcpyDeviceToHost)); cudaFree(trajectory_costs_d); } ================================================ FILE: tests/include/kernel_tests/core/normexp_kernel_test.cuh ================================================ 
#pragma once #ifndef MPPI_GENERIC_KERNEL_TESTS_MPPI_CORE_NORMEXP_KERNEL_TEST_CUH_ #define MPPI_GENERIC_KERNEL_TESTS_MPPI_CORE_NORMEXP_KERNEL_TEST_CUH_ #include #include template void launchNormExp_KernelTest(std::array& trajectory_costs_host, float gamma, float baseline, std::array& normalized_compute); template void launchGenericNormExpKernelTest(std::array trajectory_costs_host, float gamma, float baseline, std::array& normalized_compute); template void launchAutorallyNormExpKernelTest(std::array trajectory_costs_host, float gamma, float baseline, std::array& normalized_compute); #if __CUDACC__ #include "normexp_kernel_test.cu" #endif #endif //! MPPI_GENERIC_KERNEL_TESTS_MPPI_CORE_NORMEXP_KERNEL_TEST_CUH_ ================================================ FILE: tests/include/kernel_tests/core/rmppi_kernel_test.cu ================================================ // // Created by mgandhi on 5/23/20. // #include "rmppi_kernel_test.cuh" template void launchCPURMPPIRolloutKernel(DYN_T* model, COST_T* cost, SAMPLER_T* sampler, FB_T* fb_controller, const float dt, const int num_timesteps, const int num_rollouts, const float lambda, const float alpha, const float value_func_threshold, const int nominal_idx, const int real_idx, const Eigen::Ref& initial_real_state, const Eigen::Ref& initial_nominal_state, Eigen::Ref trajectory_costs) { using state_array = typename DYN_T::state_array; using output_array = typename DYN_T::output_array; using control_array = typename DYN_T::control_array; Eigen::MatrixXf control_noise_real = Eigen::MatrixXf::Zero(DYN_T::CONTROL_DIM, num_rollouts * num_timesteps); Eigen::MatrixXf control_noise_nom = Eigen::MatrixXf::Zero(DYN_T::CONTROL_DIM, num_rollouts * num_timesteps); Eigen::MatrixXi crash_status_real = Eigen::MatrixXi::Zero(num_rollouts, 1); Eigen::MatrixXi crash_status_nom = Eigen::MatrixXi::Zero(num_rollouts, 1); HANDLE_ERROR(cudaMemcpy(control_noise_nom.data(), sampler->getControlSample(0, 0, nominal_idx), sizeof(float) * 
DYN_T::CONTROL_DIM * num_rollouts * num_timesteps, cudaMemcpyDeviceToHost)); HANDLE_ERROR(cudaMemcpy(control_noise_real.data(), sampler->getControlSample(0, 0, real_idx), sizeof(float) * DYN_T::CONTROL_DIM * num_rollouts * num_timesteps, cudaMemcpyDeviceToHost)); state_array curr_real_state, curr_nom_state, next_real_state, next_nom_state, state_der_real, state_der_nom; output_array output_real, output_nom; control_array u_real, u_nom, u_feedback; auto sampler_params = sampler->getParams(); for (int sample_idx = 0; sample_idx < num_rollouts; sample_idx++) { curr_real_state = initial_real_state; curr_nom_state = initial_nominal_state; float running_real_cost = 0; float running_nom_cost = 0; float tracking_nom_cost = 0; float tracking_real_cost = 0; model->initializeDynamics(curr_real_state, u_real, output_real, 0, dt); cost->initializeCosts(output_real, u_real, 0, dt); for (int t = 0; t < num_timesteps; t++) { u_real = control_noise_real.col(sample_idx * num_timesteps + t); u_nom = control_noise_nom.col(sample_idx * num_timesteps + t); u_feedback = fb_controller->k(curr_real_state, curr_nom_state, t); u_real += u_feedback; model->enforceConstraints(curr_real_state, u_real); model->enforceConstraints(curr_nom_state, u_nom); model->step(curr_real_state, next_real_state, state_der_real, u_real, output_real, t, dt); model->step(curr_nom_state, next_nom_state, state_der_nom, u_nom, output_nom, t, dt); float real_cost = cost->computeRunningCost(output_real, u_real, t, &crash_status_real(sample_idx)); float nom_cost = cost->computeRunningCost(output_nom, u_nom, t, &crash_status_nom(sample_idx)); tracking_real_cost += real_cost + sampler->computeFeedbackCost(u_feedback.data(), (float*)&sampler_params, t, real_idx, lambda, alpha); running_real_cost += real_cost + sampler->computeLikelihoodRatioCost(u_real, t, real_idx, lambda, alpha); running_nom_cost += nom_cost; tracking_nom_cost += sampler->computeLikelihoodRatioCost(u_nom, t, nominal_idx, lambda, alpha); curr_real_state 
= next_real_state; curr_nom_state = next_nom_state; } running_real_cost += cost->terminalCost(output_real); tracking_real_cost += cost->terminalCost(output_real); running_nom_cost += cost->terminalCost(output_nom); tracking_nom_cost /= num_timesteps; tracking_real_cost /= num_timesteps; running_nom_cost /= num_timesteps; running_real_cost /= num_timesteps; running_nom_cost = 0.5f * running_nom_cost + 0.5f * fmaxf(fminf(tracking_real_cost, value_func_threshold), running_nom_cost); running_nom_cost += tracking_nom_cost; trajectory_costs(sample_idx + nominal_idx * num_rollouts, 0) = running_nom_cost; trajectory_costs(sample_idx + real_idx * num_rollouts, 0) = running_real_cost; } } template void launchCPUInitEvalKernel(DYN_T* model, COST_T* cost, SAMPLER_T* sampler, const float dt, const int num_timesteps, const int num_candidates, const int num_samples, const float lambda, const float alpha, const Eigen::Ref& candidates, const Eigen::Ref& strides, Eigen::Ref trajectory_costs) { using state_array = typename DYN_T::state_array; using output_array = typename DYN_T::output_array; using control_array = typename DYN_T::control_array; const int num_rollouts = num_candidates * num_samples; Eigen::MatrixXi crash_status = Eigen::MatrixXi::Zero(num_rollouts, 1); // Get control samples from sampler Eigen::MatrixXf control_noise = Eigen::MatrixXf::Zero(DYN_T::CONTROL_DIM, num_rollouts * num_timesteps); HANDLE_ERROR(cudaMemcpy(control_noise.data(), sampler->getControlSample(0, 0, 0), sizeof(float) * DYN_T::CONTROL_DIM * num_rollouts * num_timesteps, cudaMemcpyDeviceToHost)); state_array curr_state, next_state, state_der; output_array output; control_array u; for (int candidate_idx = 0; candidate_idx < num_candidates; candidate_idx++) { for (int sample_idx = 0; sample_idx < num_samples; sample_idx++) { int global_idx = candidate_idx * num_samples + sample_idx; float& running_cost = trajectory_costs(global_idx, 0); curr_state = candidates.col(candidate_idx); 
model->initializeDynamics(curr_state, u, output, 0, dt); cost->initializeCosts(output, u, 0, dt); // sampler->initializeDistributions(); for (int t = 0; t < num_timesteps; t++) { int control_index = sample_idx * num_timesteps + min(t + strides(candidate_idx, 0), num_timesteps - 1); u = control_noise.col(control_index); model->enforceConstraints(curr_state, u); model->step(curr_state, next_state, state_der, u, output, t, dt); // if (t > 0) // { running_cost += cost->computeRunningCost(output, u, t, &crash_status(global_idx)); running_cost += sampler->computeLikelihoodRatioCost(u, t, 0, lambda, alpha); // } curr_state = next_state; } running_cost += cost->terminalCost(output); running_cost /= (num_timesteps); } } } ================================================ FILE: tests/include/kernel_tests/core/rmppi_kernel_test.cuh ================================================ // // Created by mgandhi on 5/23/20. // #ifndef MPPIGENERIC_RMPPI_KERNEL_TEST_CUH #define MPPIGENERIC_RMPPI_KERNEL_TEST_CUH #include template void launchCPURMPPIRolloutKernel(DYN_T* model, COST_T* cost, SAMPLER_T* sampler, FB_T* fb_controller, const float dt, const int num_timesteps, const int num_rollouts, const float lambda, const float alpha, const float value_func_threshold, const int nominal_idx, const int real_idx, const Eigen::Ref& initial_real_state, const Eigen::Ref& initial_nominal_state, Eigen::Ref trajectory_costs); template void launchCPUInitEvalKernel(DYN_T* model, COST_T* cost, SAMPLER_T* sampler, const float dt, const int num_timesteps, const int num_candidates, const int num_samples, const float lambda, const float alpha, const Eigen::Ref& candidates, const Eigen::Ref& strides, Eigen::Ref trajectory_costs); #if __CUDACC__ #include "rmppi_kernel_test.cu" #endif #endif // MPPIGENERIC_RMPPI_KERNEL_TEST_CUH ================================================ FILE: tests/include/kernel_tests/core/rollout_kernel_test.cu ================================================ #include 
"rollout_kernel_test.cuh" #include #include // const int STATE_DIM = 12; // const int CONTROL_DIM = 3; // const int NUM_ROLLOUTS = 100; // .99 times this number has to be an integer... TODO fix how brittle this is // const int BLOCKSIZE_X = 32; // const int BLOCKSIZE_Y = 8; // Blocksize_y has to be greater than the control dim TODO fix how we step through the // controls template __global__ void loadGlobalToShared_KernelTest(float* x0_device, float* x_thread_device, float* xdot_thread_device, float* u_thread_device) { const int STATE_DIM = 12; const int CONTROL_DIM = 3; const int NUM_ROLLOUTS = 100; // .99 times this number has to be an integer... TODO fix how brittle this is const int BLOCKSIZE_X = 32; const int BLOCKSIZE_Y = 8; // Blocksize_y has to be greater than the control dim TODO fix how we step through the controls int thread_idx = threadIdx.x; int thread_idy = threadIdx.y; int thread_idz = threadIdx.z; int block_idx = blockIdx.x; int global_idx = threadIdx.x + block_idx * blockDim.x; // Create shared arrays which hold state and control data __shared__ float x_shared[BLOCKSIZE_X * STATE_DIM * BLOCKSIZE_Z]; __shared__ float xdot_shared[BLOCKSIZE_X * STATE_DIM * BLOCKSIZE_Z]; __shared__ float u_shared[BLOCKSIZE_X * CONTROL_DIM * BLOCKSIZE_Z]; float* x_thread; float* xdot_thread; float* u_thread; float* du_thread; if (global_idx < NUM_ROLLOUTS) { x_thread = &x_shared[(blockDim.x * thread_idz + thread_idx) * STATE_DIM]; xdot_thread = &xdot_shared[(blockDim.x * thread_idz + thread_idx) * STATE_DIM]; u_thread = &u_shared[(blockDim.x * thread_idz + thread_idx) * CONTROL_DIM]; } __syncthreads(); mppi::kernels::loadGlobalToShared(NUM_ROLLOUTS, BLOCKSIZE_Y, global_idx, thread_idy, thread_idz, x0_device, x_thread, xdot_thread, u_thread); __syncthreads(); // Check if on the first rollout the correct values were copied over // Prevent y threads from all writing to the same memory if (global_idx == 1 && thread_idy == 0) { for (int i = 0; i < STATE_DIM; ++i) { int ind = 
i + thread_idz * STATE_DIM; int ind_thread = i + thread_idz * STATE_DIM * blockDim.x; x_thread_device[ind] = x_shared[ind_thread]; xdot_thread_device[ind] = xdot_shared[ind_thread]; } for (int i = 0; i < CONTROL_DIM; ++i) { int ind = i + thread_idz * CONTROL_DIM; int ind_thread = i + thread_idz * CONTROL_DIM * blockDim.x; u_thread_device[ind] = u_shared[ind_thread]; } __syncthreads(); } // To test what the results are, we have to return them back to the host. } void launchGlobalToShared_KernelTest(const std::vector& x0_host, std::vector& x_thread_host, std::vector& xdot_thread_host, std::vector& u_thread_host) { const int STATE_DIM = 12; const int CONTROL_DIM = 3; const int NUM_ROLLOUTS = 100; // .99 times this number has to be an integer... TODO fix how brittle this is const int BLOCKSIZE_X = 32; const int BLOCKSIZE_Y = 8; // Blocksize_y has to be greater than the control dim TODO fix how we step through the controls // Define the initial condition x0_device and the exploration variance in global device memory float* x0_device; HANDLE_ERROR(cudaMalloc((void**)&x0_device, sizeof(float) * STATE_DIM)); HANDLE_ERROR(cudaMemcpy(x0_device, x0_host.data(), sizeof(float) * STATE_DIM, cudaMemcpyHostToDevice)); // Define the return arguments in global device memory float* x_thread_device; float* xdot_thread_device; float* u_thread_device; HANDLE_ERROR(cudaMalloc((void**)&x_thread_device, sizeof(float) * STATE_DIM)); HANDLE_ERROR(cudaMalloc((void**)&xdot_thread_device, sizeof(float) * STATE_DIM)); HANDLE_ERROR(cudaMalloc((void**)&u_thread_device, sizeof(float) * CONTROL_DIM)); dim3 dimBlock(BLOCKSIZE_X, BLOCKSIZE_Y); dim3 dimGrid(2048); loadGlobalToShared_KernelTest<<>>(x0_device, x_thread_device, xdot_thread_device, u_thread_device); CudaCheckError(); // Copy the data back to the host HANDLE_ERROR(cudaMemcpy(x_thread_host.data(), x_thread_device, sizeof(float) * STATE_DIM, cudaMemcpyDeviceToHost)); HANDLE_ERROR( cudaMemcpy(xdot_thread_host.data(), xdot_thread_device, 
sizeof(float) * STATE_DIM, cudaMemcpyDeviceToHost)); HANDLE_ERROR(cudaMemcpy(u_thread_host.data(), u_thread_device, sizeof(float) * CONTROL_DIM, cudaMemcpyDeviceToHost)); // Free the cuda memory that we allocated cudaFree(x0_device); cudaFree(x_thread_device); cudaFree(xdot_thread_device); cudaFree(u_thread_device); } /** * Tube-MPPI versions of the kernel tests */ // This is to test tube-mppi calls to the kernel void launchGlobalToShared_KernelTest_nom_act( const std::vector& x0_host_act, std::vector& x_thread_host_act, std::vector& xdot_thread_host_act, std::vector& u_thread_host_act, const std::vector& x0_host_nom, std::vector& x_thread_host_nom, std::vector& xdot_thread_host_nom, std::vector& u_thread_host_nom) { const int STATE_DIM = 12; const int CONTROL_DIM = 3; const int NUM_ROLLOUTS = 100; // .99 times this number has to be an integer... TODO fix how brittle this is const int BLOCKSIZE_X = 32; const int BLOCKSIZE_Y = 8; // Blocksize_y has to be greater than the control dim TODO fix how we step through the controls // Define the initial condition x0_device and the exploration variance in global device memory // Need twice as much memory for tube-mppi float* x0_device; HANDLE_ERROR(cudaMalloc((void**)&x0_device, sizeof(float) * STATE_DIM * 2)); // Copy both act and nominal initial state HANDLE_ERROR(cudaMemcpy(x0_device, x0_host_act.data(), sizeof(float) * STATE_DIM, cudaMemcpyHostToDevice)); HANDLE_ERROR( cudaMemcpy(x0_device + STATE_DIM, x0_host_nom.data(), sizeof(float) * STATE_DIM, cudaMemcpyHostToDevice)); // Define the return arguments in global device memory float* x_thread_device; float* xdot_thread_device; float* u_thread_device; HANDLE_ERROR(cudaMalloc((void**)&x_thread_device, sizeof(float) * STATE_DIM * 2)); HANDLE_ERROR(cudaMalloc((void**)&xdot_thread_device, sizeof(float) * STATE_DIM * 2)); HANDLE_ERROR(cudaMalloc((void**)&u_thread_device, sizeof(float) * CONTROL_DIM * 2)); dim3 dimBlock(BLOCKSIZE_X, BLOCKSIZE_Y, 2); dim3 dimGrid(100); 
// NOTE(review): this chunk was garbled during extraction — every "<<>>" below lost its
// <<<grid, block>>> launch configuration and every bare "template" lost its parameter
// list (angle-bracket contents were stripped). Restore them from the repository before
// compiling; the surrounding tokens are unchanged.

// Tail of launchGlobalToShared_KernelTest_nom_act(): launch the 2-system kernel,
// copy actual/nominal results back to the host, and free device memory.
// The actual system occupies the first STATE_DIM / CONTROL_DIM floats of each
// device buffer; the nominal system the next STATE_DIM / CONTROL_DIM floats.
loadGlobalToShared_KernelTest<2> <<>>(x0_device, x_thread_device, xdot_thread_device, u_thread_device);
CudaCheckError();

// Copy the initial_state for actual and nominal
HANDLE_ERROR(
    cudaMemcpy(x_thread_host_act.data(), x_thread_device, sizeof(float) * STATE_DIM, cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(x_thread_host_nom.data(), x_thread_device + STATE_DIM, sizeof(float) * STATE_DIM,
                        cudaMemcpyDeviceToHost));

// Copy the xdot for actual and nominal
HANDLE_ERROR(
    cudaMemcpy(xdot_thread_host_act.data(), xdot_thread_device, sizeof(float) * STATE_DIM, cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(xdot_thread_host_nom.data(), xdot_thread_device + STATE_DIM, sizeof(float) * STATE_DIM,
                        cudaMemcpyDeviceToHost));

// copy the initial u for actual and nominal
HANDLE_ERROR(
    cudaMemcpy(u_thread_host_act.data(), u_thread_device, sizeof(float) * CONTROL_DIM, cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(u_thread_host_nom.data(), u_thread_device + CONTROL_DIM, sizeof(float) * CONTROL_DIM,
                        cudaMemcpyDeviceToHost));

// Free the cuda memory that we allocated
cudaFree(x0_device);
cudaFree(x_thread_device);
cudaFree(xdot_thread_device);
cudaFree(u_thread_device);
}

// Kernel wrapper: each thread (indexed over rollouts) calls
// mppi::kernels::computeAndSaveCost() on its own terminal state and running cost
// and writes the combined result into cost_rollout_device.
// NOTE(review): template parameter list stripped in extraction (COST_T is used in
// the signature) — restore before compiling.
template
__global__ void computeAndSaveCostAllRollouts_KernelTest(COST_T* cost, int state_dim, int num_rollouts,
                                                         float* running_costs, float* terminal_state,
                                                         float* cost_rollout_device)
{
  int tid = blockDim.x * blockIdx.x + threadIdx.x;  // index on rollouts
  // if (tid == 0) {
  //   printf("Current state [%f, %f, %f, %f]\n", terminal_state[state_dim * tid],
  //          terminal_state[state_dim * tid + 1], terminal_state[state_dim * tid + 2],
  //          terminal_state[state_dim * tid + 3]);
  //   printf("Current cost [%f]\n", running_costs[tid]);
  // }
  // The "2" argument is passed as the number of systems; crash-status pointer is nullptr.
  mppi::kernels::computeAndSaveCost(num_rollouts, 2, tid, cost, &terminal_state[state_dim * tid], running_costs[tid],
                                    nullptr, cost_rollout_device);
  // if (tid == 0) {
  //   printf("Total cost [%f]\n", cost_rollout_device[tid]);
  // }
}

// Host driver for computeAndSaveCostAllRollouts_KernelTest: allocates device buffers,
// uploads running costs + terminal states, launches one thread per rollout, and copies
// the per-rollout total costs back into cost_compute.
// NOTE(review): std::array template arguments and the template parameter list (COST_T,
// STATE_DIM, NUM_ROLLOUTS are referenced below) were stripped in extraction.
template
void launchComputeAndSaveCostAllRollouts_KernelTest(COST_T& cost, const std::array& cost_all_rollouts,
                                                    const std::array& terminal_states, std::array& cost_compute)
{
  const int BLOCKSIZE_X = 32;
  const int BLOCKSIZE_Y = 8;  // NOTE(review): declared but not used in this launcher

  // Allocate CUDA memory
  float* cost_all_rollouts_device;
  float* terminal_states_device;
  float* cost_compute_device;
  HANDLE_ERROR(cudaMalloc((void**)&cost_all_rollouts_device, sizeof(float) * cost_all_rollouts.size()));
  HANDLE_ERROR(cudaMalloc((void**)&terminal_states_device, sizeof(float) * terminal_states.size()));
  HANDLE_ERROR(cudaMalloc((void**)&cost_compute_device, sizeof(float) * cost_compute.size()));

  // Copy Host side data to the Device
  HANDLE_ERROR(cudaMemcpy(cost_all_rollouts_device, cost_all_rollouts.data(), sizeof(float) * cost_all_rollouts.size(),
                          cudaMemcpyHostToDevice));
  HANDLE_ERROR(cudaMemcpy(terminal_states_device, terminal_states.data(), sizeof(float) * terminal_states.size(),
                          cudaMemcpyHostToDevice));

  // Launch kernel: 1-D grid covering NUM_ROLLOUTS threads in blocks of BLOCKSIZE_X
  dim3 blocksize(BLOCKSIZE_X, 1);
  dim3 gridsize((NUM_ROLLOUTS + (BLOCKSIZE_X - 1)) / BLOCKSIZE_X, 1);
  // NOTE(review): launch configuration stripped — presumably <<<gridsize, blocksize>>>.
  computeAndSaveCostAllRollouts_KernelTest<<>>(
      cost.cost_d_, STATE_DIM, NUM_ROLLOUTS, cost_all_rollouts_device, terminal_states_device, cost_compute_device);
  CudaCheckError();

  // Copy Device side data to the host
  HANDLE_ERROR(
      cudaMemcpy(cost_compute.data(), cost_compute_device, sizeof(float) * cost_compute.size(), cudaMemcpyDeviceToHost));

  // free cuda Memory
  cudaFree(cost_all_rollouts_device);
  cudaFree(terminal_states_device);
  cudaFree(cost_compute_device);
}

// Host driver that runs the generic rollout kernel with two distributions
// (actual + nominal share the same x0 and nominal control sequence) and returns the
// per-rollout trajectory costs for both systems.
// NOTE(review): template parameter list stripped (DYN_T, COST_T, SAMPLER_T are used).
template
void launchRolloutKernel_nom_act(DYN_T* dynamics, COST_T* costs, SAMPLER_T* sampler, float dt, const int num_timesteps,
                                 const int num_rollouts, float lambda, float alpha, const std::vector& x0,
                                 const std::vector& nom_control_seq, std::vector& trajectory_costs_act,
                                 std::vector& trajectory_costs_nom, cudaStream_t stream)
{
  float* initial_state_d;
  float* trajectory_costs_d;
  const int BLOCKSIZE_X = 32;  // TODO we should change this parameter to be passed in to test multiple of them
  const int BLOCKSIZE_Y = 8;
  /**
   * Ensure dynamics, costs, and sampler exist on GPU
   */
  dynamics->bindToStream(stream);
  costs->bindToStream(stream);
  sampler->bindToStream(stream);
  // Call the GPU setup functions of the dynamics, costs, and sampler
  dynamics->GPUSetup();
  costs->GPUSetup();
  sampler->GPUSetup();
  sampler->setNumTimesteps(num_timesteps);
  sampler->setNumRollouts(num_rollouts);
  sampler->setNumDistributions(2);  // two systems: actual and nominal

  // Create x init cuda array (two copies of the state, one per system)
  HANDLE_ERROR(cudaMalloc((void**)&initial_state_d, sizeof(float) * DYN_T::STATE_DIM * 2));
  // Create cost trajectory cuda array (num_rollouts costs per system)
  HANDLE_ERROR(cudaMalloc((void**)&trajectory_costs_d, sizeof(float) * num_rollouts * 2));
  // Create random noise generator
  curandGenerator_t gen;
  curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
  curandSetStream(gen, stream);

  /**
   * Fill in GPU arrays
   */
  HANDLE_ERROR(
      cudaMemcpyAsync(initial_state_d, x0.data(), sizeof(float) * DYN_T::STATE_DIM, cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaMemcpyAsync(initial_state_d + DYN_T::STATE_DIM, x0.data(), sizeof(float) * DYN_T::STATE_DIM,
                               cudaMemcpyHostToDevice, stream));
  // Same nominal control sequence is used for both distributions (0 and 1)
  sampler->copyImportanceSamplerToDevice(nom_control_seq.data(), 0, false);
  sampler->copyImportanceSamplerToDevice(nom_control_seq.data(), 1, false);
  // Generate samples and do stream synchronize
  sampler->generateSamples(1, 0, gen, true);
  dim3 threadsPerBlock(BLOCKSIZE_X, BLOCKSIZE_Y, 2);  // z-dim = 2 systems
  // Launch rollout kernel
  mppi::kernels::launchRolloutKernel(dynamics, costs, sampler, dt, num_timesteps, num_rollouts, lambda, alpha,
                                     initial_state_d, trajectory_costs_d, threadsPerBlock, stream, false);

  // Copy the costs back to the host: first num_rollouts entries are the actual
  // system, the next num_rollouts are the nominal system.
  HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_act.data(), trajectory_costs_d, num_rollouts * sizeof(float),
                               cudaMemcpyDeviceToHost, stream));
  HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_nom.data(), trajectory_costs_d + num_rollouts,
                               num_rollouts * sizeof(float), cudaMemcpyDeviceToHost, stream));
  HANDLE_ERROR(cudaStreamSynchronize(stream));
// End of launchRolloutKernel_nom_act(): release device buffers.
// NOTE(review): the curand generator created above is not destroyed
// (curandDestroyGenerator) — possible handle leak; confirm against repo intent.
cudaFree(initial_state_d);
cudaFree(trajectory_costs_d);
}

// Legacy AutoRally-style rollout kernel, kept for comparison against the generic
// rollout kernel. Each x-thread simulates one rollout; y-threads parallelize over
// state/control dimensions. Writes per-rollout total cost into costs_d and the
// applied controls back into du_d.
// NOTE(review): the template parameter list was stripped in extraction — the body
// references DYNAMICS_T, COSTS_T plus compile-time BLOCKSIZE_X and NUM_ROLLOUTS.
// Restore from the repository header before compiling.
template
__global__ void autorallyRolloutKernel(int num_timesteps, float* state_d, float* U_d, float* du_d, float* nu_d,
                                       float* costs_d, DYNAMICS_T* dynamics_model, COSTS_T* mppi_costs, int opt_delay,
                                       float lambda, float alpha, float dt)
{
  int i, j;
  int tdx = threadIdx.x;
  int tdy = threadIdx.y;
  int bdx = blockIdx.x;

  // Initialize the local state, controls, and noise
  float* s;
  float* s_der;
  float* u;
  float* nu;
  float* du;
  int* crash;
  const int STATE_DIM = DYNAMICS_T::STATE_DIM;
  const int CONTROL_DIM = DYNAMICS_T::CONTROL_DIM;
  const int SHARED_MEM_REQUEST_GRD = DYNAMICS_T::SHARED_MEM_REQUEST_GRD;
  const int SHARED_MEM_REQUEST_BLK = DYNAMICS_T::SHARED_MEM_REQUEST_BLK;

  // Create shared arrays for holding state and control data.
  __shared__ float state_shared[BLOCKSIZE_X * STATE_DIM];
  __shared__ float state_der_shared[BLOCKSIZE_X * STATE_DIM];
  __shared__ float control_shared[BLOCKSIZE_X * CONTROL_DIM];
  __shared__ float control_var_shared[BLOCKSIZE_X * CONTROL_DIM];
  __shared__ float exploration_variance[BLOCKSIZE_X * CONTROL_DIM];
  __shared__ int crash_status[BLOCKSIZE_X];
  // Create a shared array for the dynamics model to use
  __shared__ float theta[SHARED_MEM_REQUEST_GRD / sizeof(float) + 1 + SHARED_MEM_REQUEST_BLK * BLOCKSIZE_X];
  __shared__ float theta_c[COSTS_T::SHARED_MEM_REQUEST_GRD + COSTS_T::SHARED_MEM_REQUEST_BLK * BLOCKSIZE_X];
  __shared__ float y[DYNAMICS_T::OUTPUT_DIM];

  // Initialize trajectory cost
  float running_cost = 0;

  int global_idx = BLOCKSIZE_X * bdx + tdx;
  if (global_idx < NUM_ROLLOUTS)
  {
    // Portion of the shared array belonging to each x-thread index.
    s = &state_shared[tdx * STATE_DIM];
    s_der = &state_der_shared[tdx * STATE_DIM];
    u = &control_shared[tdx * CONTROL_DIM];
    du = &control_var_shared[tdx * CONTROL_DIM];
    nu = &exploration_variance[tdx * CONTROL_DIM];
    crash = &crash_status[tdx];
    // Load the initial state, nu, and zero the noise
    for (i = tdy; i < STATE_DIM; i += blockDim.y)
    {
      s[i] = state_d[i];
      s_der[i] = 0;
    }
    // Load nu
    for (i = tdy; i < CONTROL_DIM; i += blockDim.y)
    {
      u[i] = 0;
      du[i] = 0;
      nu[i] = nu_d[i];
    }
    crash[0] = 0;
  }
  __syncthreads();

  /*<----Start of simulation loop-----> */
  dynamics_model->initializeDynamics(s, u, y, theta, 0.0, dt);
  mppi_costs->initializeCosts(s, u, theta_c, 0.0, dt);
  for (i = 0; i < num_timesteps; i++)
  {
    if (global_idx < NUM_ROLLOUTS)
    {
      for (j = tdy; j < CONTROL_DIM; j += blockDim.y)
      {
        // Noise free rollout
        if (global_idx == 0 || i < opt_delay)
        {  // Don't optimize variables that are already being executed
          du[j] = 0.0;
          u[j] = U_d[i * CONTROL_DIM + j];
        }
        else if (global_idx >= .99 * NUM_ROLLOUTS)
        {
          // Last ~1% of rollouts: pure noise, no nominal control added
          du[j] = du_d[CONTROL_DIM * num_timesteps * (BLOCKSIZE_X * bdx + tdx) + i * CONTROL_DIM + j] * nu[j];
          u[j] = du[j];
        }
        else
        {
          du[j] = du_d[CONTROL_DIM * num_timesteps * (BLOCKSIZE_X * bdx + tdx) + i * CONTROL_DIM + j] * nu[j];
          u[j] = U_d[i * CONTROL_DIM + j] + du[j];
        }
        // Write the applied control back out so the host can read it from du_d
        du_d[CONTROL_DIM * num_timesteps * (BLOCKSIZE_X * bdx + tdx) + i * CONTROL_DIM + j] = u[j];
      }
    }
    __syncthreads();
    dynamics_model->enforceConstraints(s, &du_d[(global_idx * num_timesteps + i) * CONTROL_DIM]);
    if (tdy == 0 && global_idx < NUM_ROLLOUTS)
    {
      dynamics_model->enforceConstraints(s, u);
    }
    __syncthreads();
    // Compute the cost of the being in the current state
    if (tdy == 0 && global_idx < NUM_ROLLOUTS && i > 0 && crash[0] > -1)
    {
      // Running average formula
      running_cost +=
          (mppi_costs->computeRunningCost(s, u, du, nu, lambda, alpha, i, theta_c, crash) - running_cost) / (1.0 * i);
      // printf("AutoRa Current State rollout %i, timestep: %i: [%f, %f, %f, %f]\n", global_idx, i, s[0], s[1],
      // s[2], s[3]); printf("AutoRa Running Cost rollout %i, timestep: %i: %f\n", global_idx, i, running_cost);
    }
    // Compute the dynamics
    if (global_idx < NUM_ROLLOUTS)
    {
      dynamics_model->computeStateDeriv(s, u, s_der, theta);
    }
    __syncthreads();
    // Update the state
    if (global_idx < NUM_ROLLOUTS)
    {
      dynamics_model->updateState(s, s_der, dt);
    }
    // //Check to see if the rollout will result in a (physical) crash.
    // if (tdy == 0 && global_idx < NUM_ROLLOUTS) {
    //   mppi_costs.getCrash(s, crash);
    // }
  }
  /* <------- End of the simulation loop ----------> */
  if (global_idx < NUM_ROLLOUTS && tdy == 0)
  {  // Write cost results back to global memory.
    costs_d[(BLOCKSIZE_X)*bdx + tdx] = running_cost + mppi_costs->terminalCost(s, theta_c);
  }
}

// Host driver for autorallyRolloutKernel: uploads state / nominal controls / noise /
// exploration variance, launches the kernel, and copies costs and applied controls back.
// NOTE(review): template parameter list and std::array / launch-config template
// arguments were stripped in extraction.
template
void launchAutorallyRolloutKernelTest(DYNAMICS_T* dynamics, COSTS_T* costs, float dt, float lambda, float alpha,
                                      std::array state_array, std::array control_array,
                                      std::array control_noise_array, std::array sigma_u, std::array& costs_out,
                                      std::array& control_noise_out, int opt_delay, cudaStream_t stream)
{
  float* state_d;
  float* U_d;
  float* du_d;
  float* nu_d;
  float* costs_d;

  // Allocate CUDA memory for the rollout
  HANDLE_ERROR(cudaMalloc((void**)&state_d, sizeof(float) * state_array.size()));
  HANDLE_ERROR(cudaMalloc((void**)&U_d, sizeof(float) * control_array.size()));
  HANDLE_ERROR(cudaMalloc((void**)&du_d, sizeof(float) * control_noise_array.size()));
  HANDLE_ERROR(cudaMalloc((void**)&nu_d, sizeof(float) * sigma_u.size()));
  HANDLE_ERROR(cudaMalloc((void**)&costs_d, sizeof(float) * costs_out.size()));

  // Copy the initial values
  HANDLE_ERROR(
      cudaMemcpyAsync(state_d, state_array.data(), sizeof(float) * state_array.size(), cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(
      cudaMemcpyAsync(U_d, control_array.data(), sizeof(float) * control_array.size(), cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaMemcpyAsync(du_d, control_noise_array.data(), sizeof(float) * control_noise_array.size(),
                               cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaMemcpyAsync(nu_d, sigma_u.data(), sizeof(float) * sigma_u.size(), cudaMemcpyHostToDevice, stream));

  const int GRIDSIZE_X = (NUM_ROLLOUTS - 1) / BLOCKSIZE_X + 1;
  dim3 dimBlock(BLOCKSIZE_X, BLOCKSIZE_Y, 1);
  dim3 dimGrid(GRIDSIZE_X, 1, 1);
  // NOTE(review): launch configuration stripped — presumably
  // <<<dimGrid, dimBlock, 0, stream>>> with the kernel's template arguments.
  autorallyRolloutKernel
      <<>>(NUM_TIMESTEPS, state_d, U_d, du_d, nu_d, costs_d, dynamics->model_d_, costs->cost_d_, opt_delay, lambda,
           alpha, dt);
  CudaCheckError();

  // Copy data back
  HANDLE_ERROR(
      cudaMemcpyAsync(costs_out.data(), costs_d, sizeof(float) * costs_out.size(), cudaMemcpyDeviceToHost, stream));
  // Copy the noise back (du_d now holds the applied controls written by the kernel)
  HANDLE_ERROR(cudaMemcpyAsync(control_noise_out.data(), du_d, sizeof(float) * control_noise_out.size(),
                               cudaMemcpyDeviceToHost, stream));
  // NOTE(review): no cudaStreamSynchronize before freeing / returning — the async
  // copies into host arrays may not have completed; verify callers synchronize.

  // Deallocate CUDA Memory
  HANDLE_ERROR(cudaFree(state_d));
  HANDLE_ERROR(cudaFree(U_d));
  HANDLE_ERROR(cudaFree(du_d));
  HANDLE_ERROR(cudaFree(nu_d));
  HANDLE_ERROR(cudaFree(costs_d));
}

// CPU reference implementation of the rollout kernel: pulls the sampled control
// noise off the GPU, then simulates each rollout sequentially with Eigen types,
// accumulating running + likelihood-ratio + terminal cost, averaged over timesteps.
// NOTE(review): template parameter list and Eigen::Ref template arguments stripped.
template
void launchCPURolloutKernel(DYN_T* model, COST_T* cost, SAMPLER_T* sampler, float dt, const int num_timesteps,
                            const int num_rollouts, float lambda, float alpha, const Eigen::Ref& x0,
                            Eigen::Ref trajectory_costs, cudaStream_t stream)
{
  using state_array = typename DYN_T::state_array;
  using output_array = typename DYN_T::output_array;
  using control_array = typename DYN_T::control_array;
  Eigen::MatrixXf control_noise = Eigen::MatrixXf::Zero(DYN_T::CONTROL_DIM, num_rollouts * num_timesteps);
  Eigen::MatrixXi crash_status = Eigen::MatrixXi::Zero(num_rollouts, 1);
  // Pull the full noise block for distribution 0 from the sampler's device memory
  HANDLE_ERROR(cudaMemcpy(control_noise.data(), sampler->getControlSample(0, 0, 0),
                          sizeof(float) * DYN_T::CONTROL_DIM * num_rollouts * num_timesteps, cudaMemcpyDeviceToHost));
  state_array curr_x, next_x, x_der;
  control_array u;
  output_array y;
  for (int sample_idx = 0; sample_idx < num_rollouts; sample_idx++)
  {
    curr_x = x0;
    model->initializeDynamics(curr_x, u, y, 0, dt);
    cost->initializeCosts(y, u, 0, dt);
    float& running_cost = trajectory_costs(sample_idx, 0);
    running_cost = 0.0f;
    for (int t = 0; t < num_timesteps; t++)
    {
      // Column layout: samples are contiguous per rollout, num_timesteps columns each
      u = control_noise.col(t + num_timesteps * sample_idx);
      model->enforceConstraints(curr_x, u);
      model->step(curr_x, next_x, x_der, u, y, t, dt);
      running_cost += cost->computeRunningCost(y, u, t, &crash_status(sample_idx));
      running_cost += sampler->computeLikelihoodRatioCost(u, t, 0, lambda, alpha);
      curr_x = next_x;
    }
    running_cost += cost->terminalCost(y);
    running_cost /= num_timesteps;
  }
}


================================================
FILE: tests/include/kernel_tests/core/rollout_kernel_test.cuh
================================================
#pragma once

#ifndef KERNEL_TESTS_MPPI_CORE_MPPI_CORE_KERNEL_TEST_CUH_
#define KERNEL_TESTS_MPPI_CORE_MPPI_CORE_KERNEL_TEST_CUH_

// NOTE(review): the four #include targets below were stripped in extraction
// (angle-bracket contents lost) — restore from the repository.
#include
#include
#include
#include

// Declare some sizes for the kernel parameters
// NOTE(review): all "template" keywords below lost their parameter lists, and the
// std::vector/std::array declarations lost their template arguments, in extraction.
template
__global__ void loadGlobalToShared_KernelTest(float* x0_device, float* x_thread, float* xdot_thread, float* u_thread);

void launchGlobalToShared_KernelTest(const std::vector& x0_host, std::vector& x_thread_host,
                                     std::vector& xdot_thread_host, std::vector& u_thread_host);

void launchGlobalToShared_KernelTest_nom_act(
    const std::vector& x0_host_act, std::vector& x_thread_host_act, std::vector& xdot_thread_host_act,
    std::vector& u_thread_host_act, const std::vector& x0_host_nom, std::vector& x_thread_host_nom,
    std::vector& xdot_thread_host_nom, std::vector& u_thread_host_nom);

template
void launchRolloutKernel_nom_act(DYN_T* dynamics, COST_T* costs, SAMPLER_T* sampler, float dt, const int num_timesteps,
                                 const int num_rollouts, float lambda, float alpha, const std::vector& x0,
                                 const std::vector& nom_control_seq, std::vector& trajectory_costs_act,
                                 std::vector& trajectory_costs_nom, cudaStream_t stream = 0);

template
__global__ void computeAndSaveCostAllRollouts_KernelTest(COST_T* cost, int state_dim, int num_rollouts,
                                                         float* running_costs, float* terminal_state,
                                                         float* cost_rollout_device);

template
void launchComputeAndSaveCostAllRollouts_KernelTest(COST_T& cost, const std::array& cost_all_rollouts,
                                                    const std::array& terminal_states, std::array& cost_compute);

template
__global__ void autorallyRolloutKernel(int num_timesteps, float* state_d, float* U_d, float* du_d, float* nu_d,
                                       float* costs_d, DYNAMICS_T* dynamics_model, COSTS_T* mppi_costs, int opt_delay,
                                       float lambda, float alpha, float dt);

template
void launchAutorallyRolloutKernelTest(DYNAMICS_T* dynamics, COSTS_T* costs, float dt, float lambda, float alpha,
                                      std::array state_array, std::array control_array,
                                      std::array control_noise_array, std::array sigma_u, std::array& costs_out,
                                      std::array& control_noise_out, int opt_delay, cudaStream_t stream);

#if __CUDACC__
#include "rollout_kernel_test.cu"
#endif

#endif  // !KERNEL_TESTS_MPPI_CORE_MPPI_CORE_KERNEL_TEST_CUH_


================================================
FILE: tests/include/kernel_tests/core/weightedreduction_kernel_test.cu
================================================
#include "weightedreduction_kernel_test.cuh"

// Thin wrapper: one thread per intermediate-control cell zeroes u_d / u_intermediate
// via the library helper.
__global__ void setInitialControlToZero_KernelTest(int control_dim, float* u_d, float* u_intermediate)
{
  int thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
  mppi::kernels::setInitialControlToZero(control_dim, thread_idx, u_d, u_intermediate);
}

// Host driver for setInitialControlToZero_KernelTest.
// NOTE(review): template parameter list stripped (control_dim, num_threads, and the
// std::array sizes are compile-time values referenced below).
template
void launchSetInitialControlToZero_KernelTest(std::array& u_host, std::array& u_intermediate_host)
{
  float* u_dev;
  float* u_intermediate_dev;

  // Allocate Memory
  HANDLE_ERROR(cudaMalloc((void**)&u_dev, sizeof(float) * u_host.size()))
  HANDLE_ERROR(cudaMalloc((void**)&u_intermediate_dev, sizeof(float) * u_intermediate_host.size()))

  setInitialControlToZero_KernelTest<<<1, num_threads>>>(control_dim, u_dev, u_intermediate_dev);
  CudaCheckError();

  HANDLE_ERROR(cudaMemcpy(u_host.data(), u_dev, sizeof(float) * u_host.size(), cudaMemcpyDeviceToHost))
  HANDLE_ERROR(cudaMemcpy(u_intermediate_host.data(), u_intermediate_dev, sizeof(float) * u_intermediate_host.size(),
                          cudaMemcpyDeviceToHost))

  cudaFree(u_dev);
  cudaFree(u_intermediate_dev);
}

// Wrapper kernel for mppi::kernels::strideControlWeightReduction: each block is a
// timestep, each thread a stride-group of rollouts.
// NOTE(review): template parameter list stripped (control_dim is compile-time here).
template
__global__ void strideControlWeightReduction_KernelTest(int num_rollouts, int num_timesteps, int sum_stride,
                                                        float* exp_costs_d, float normalizer, float* du_d,
                                                        float* u_intermediate)
{
  int thread_idx = threadIdx.x;
  int block_idx = blockIdx.x;
  float u_thread[control_dim];
  float* u_intermediate_thread = &u_intermediate[block_idx * control_dim * ((num_rollouts - 1) / sum_stride + 1)];
  mppi::kernels::strideControlWeightReduction(num_rollouts, num_timesteps, sum_stride, thread_idx, block_idx,
                                              control_dim, exp_costs_d, normalizer, du_d, u_thread,
                                              u_intermediate_thread);
}

// Host driver for strideControlWeightReduction_KernelTest.
template
void launchStrideControlWeightReduction_KernelTest(float normalizer, const std::array& exp_costs_host,
                                                   const std::array& du_host, std::array& u_intermediate_host)
{
  float* exp_costs_dev;
  float* du_dev;
  float* u_intermediate_dev;

  // Allocate Memory
  HANDLE_ERROR(cudaMalloc((void**)&exp_costs_dev, sizeof(float) * exp_costs_host.size()));
  HANDLE_ERROR(cudaMalloc((void**)&du_dev, sizeof(float) * du_host.size()))
  HANDLE_ERROR(cudaMalloc((void**)&u_intermediate_dev, sizeof(float) * u_intermediate_host.size()));

  HANDLE_ERROR(
      cudaMemcpy(exp_costs_dev, exp_costs_host.data(), sizeof(float) * exp_costs_host.size(), cudaMemcpyHostToDevice))
  HANDLE_ERROR(cudaMemcpy(du_dev, du_host.data(), sizeof(float) * du_host.size(), cudaMemcpyHostToDevice))

  // One block per timestep; one thread per stride-group of rollouts
  dim3 blockdim((num_rollouts - 1) / sum_stride + 1, 1, 1);
  dim3 griddim(num_timesteps, 1, 1);
  // NOTE(review): launch configuration stripped — presumably <<<griddim, blockdim>>>.
  strideControlWeightReduction_KernelTest<<>>(
      num_rollouts, num_timesteps, sum_stride, exp_costs_dev, normalizer, du_dev, u_intermediate_dev);
  CudaCheckError();

  HANDLE_ERROR(cudaMemcpy(u_intermediate_host.data(), u_intermediate_dev, sizeof(float) * u_intermediate_host.size(),
                          cudaMemcpyDeviceToHost))

  cudaFree(exp_costs_dev);
  cudaFree(du_dev);
  cudaFree(u_intermediate_dev);
}

// Wrapper kernel for mppi::kernels::rolloutWeightReductionAndSaveControl.
template
__global__ void rolloutWeightReductionAndSaveControl_KernelTest(int num_rollouts, int num_timesteps, int sum_stride,
                                                                float* u_intermediate, float* du_new_d)
{
  int thread_idx = threadIdx.x;  // Current cell
  int block_idx = blockIdx.x;    // Current timestep
  float u[control_dim];
  float* u_intermediate_thread = &u_intermediate[block_idx * control_dim * ((num_rollouts - 1) / sum_stride + 1)];
  mppi::kernels::rolloutWeightReductionAndSaveControl(thread_idx, block_idx, num_rollouts, num_timesteps, control_dim,
                                                      sum_stride, u, u_intermediate_thread, du_new_d);
}

// Host driver for rolloutWeightReductionAndSaveControl_KernelTest.
template
void launchRolloutWeightReductionAndSaveControl_KernelTest(const std::array& u_intermediate_host,
                                                           std::array& du_new_host)
{
  float* u_intermediate_dev;
  float* du_new_dev;

  // Allocate Memory
  HANDLE_ERROR(cudaMalloc((void**)&u_intermediate_dev, sizeof(float) * u_intermediate_host.size()))
  HANDLE_ERROR(cudaMalloc((void**)&du_new_dev, sizeof(float) * du_new_host.size()))

  HANDLE_ERROR(cudaMemcpy(u_intermediate_dev, u_intermediate_host.data(), sizeof(float) * u_intermediate_host.size(),
                          cudaMemcpyHostToDevice))

  dim3 blockdim((num_rollouts - 1) / sum_stride + 1, 1, 1);
  dim3 griddim(num_timesteps, 1, 1);
  // NOTE(review): launch configuration stripped — presumably <<<griddim, blockdim>>>.
  rolloutWeightReductionAndSaveControl_KernelTest
      <<>>(num_rollouts, num_timesteps, sum_stride, u_intermediate_dev, du_new_dev);
  CudaCheckError();

  HANDLE_ERROR(cudaMemcpy(du_new_host.data(), du_new_dev, sizeof(float) * du_new_host.size(), cudaMemcpyDeviceToHost))

  cudaFree(u_intermediate_dev);
  cudaFree(du_new_dev);
}

// Old Weighted Reduction Kernel Implementation for comparison
// Each block handles one timestep (bdx); each thread reduces a BLOCKSIZE_WRX-stride
// group of rollouts into shared memory, then thread 0 sums the partials and writes
// the weighted-average control for that timestep back into du_d.
// NOTE(review): template parameter list stripped (CONTROL_DIM, NUM_ROLLOUTS,
// BLOCKSIZE_WRX are compile-time constants referenced below).
template
__global__ void autorallyWeightedReductionKernel(float* states_d, float* du_d, float normalizer, int num_timesteps)
{
  int tdx = threadIdx.x;
  int bdx = blockIdx.x;

  __shared__ float u_system[CONTROL_DIM * ((NUM_ROLLOUTS - 1) / BLOCKSIZE_WRX + 1)];
  int stride = BLOCKSIZE_WRX;

  float u[CONTROL_DIM];
  int i, j;
  for (i = 0; i < CONTROL_DIM; i++)
  {
    u[i] = 0;
  }
  for (j = 0; j < CONTROL_DIM; j++)
  {
    u_system[tdx * CONTROL_DIM + j] = 0;
  }
  __syncthreads();

  if (BLOCKSIZE_WRX * tdx < NUM_ROLLOUTS)
  {
    float weight = 0;
    for (i = 0; i < stride; i++)
    {
      if (stride * tdx + i < NUM_ROLLOUTS)
      {
        // weight = exp cost of rollout / normalizer
        weight = states_d[stride * tdx + i] / normalizer;
        for (j = 0; j < CONTROL_DIM; j++)
        {
          u[j] = du_d[(stride * tdx + i) * (num_timesteps * CONTROL_DIM) + bdx * CONTROL_DIM + j];
          u_system[tdx * CONTROL_DIM + j] += weight * u[j];
        }
      }
    }
  }
  __syncthreads();

  if (tdx == 0 && bdx < num_timesteps)
  {
    for (i = 0; i < CONTROL_DIM; i++)
    {
      u[i] = 0;
    }
    for (i = 0; i < (NUM_ROLLOUTS - 1) / BLOCKSIZE_WRX + 1; i++)
    {
      for (j = 0; j < CONTROL_DIM; j++)
      {
        u[j] += u_system[CONTROL_DIM * i + j];
      }
    }
    // Result is written in-place over the first num_timesteps * CONTROL_DIM
    // entries of du_d
    for (i = 0; i < CONTROL_DIM; i++)
    {
      du_d[CONTROL_DIM * bdx + i] = u[i];
    }
  }
}

// Host driver for the legacy AutoRally weighted-reduction kernel; reads the result
// back out of perturbed_controls_d_ (the kernel writes in-place).
template
void launchAutoRallyWeightedReductionKernelTest(std::array exp_costs, std::array perturbed_controls, float normalizer,
                                                std::array& controls_out, cudaStream_t stream)
{
  float* exp_costs_d_;
  float* perturbed_controls_d_;

  // Allocate CUDA memory for the device pointers
  HANDLE_ERROR(cudaMalloc((void**)&exp_costs_d_, sizeof(float) * exp_costs.size()));
  HANDLE_ERROR(cudaMalloc((void**)&perturbed_controls_d_, sizeof(float) * perturbed_controls.size()));

  // Memcpy the data to the device
  HANDLE_ERROR(cudaMemcpyAsync(exp_costs_d_, exp_costs.data(), sizeof(float) * exp_costs.size(),
                               cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaMemcpyAsync(perturbed_controls_d_, perturbed_controls.data(),
                               sizeof(float) * perturbed_controls.size(), cudaMemcpyHostToDevice, stream));

  // Setup the block and grid sizes
  dim3 dimBlock((NUM_ROLLOUTS - 1) / BLOCKSIZE_WRX + 1, 1, 1);
  dim3 dimGrid(NUM_TIMESTEPS, 1, 1);

  // Kernel launch and error check
  // NOTE(review): launch configuration stripped — presumably
  // <<<dimGrid, dimBlock, 0, stream>>>.
  autorallyWeightedReductionKernel
      <<>>(exp_costs_d_, perturbed_controls_d_, normalizer, NUM_TIMESTEPS);
  CudaCheckError();

  // Copy the result back to the GPU
  HANDLE_ERROR(cudaMemcpyAsync(controls_out.data(), perturbed_controls_d_, sizeof(float) * controls_out.size(),
                               cudaMemcpyDeviceToHost, stream));

  // Free the CUDA memory
  HANDLE_ERROR(cudaFree(exp_costs_d_));
  HANDLE_ERROR(cudaFree(perturbed_controls_d_));
}

// Host driver for the new library kernel mppi::kernels::weightedReductionKernel;
// unlike the AutoRally version, the result goes to a separate output buffer.
template
void launchWeightedReductionKernelTest(std::array exp_costs, std::array perturbed_controls, float normalizer,
                                       std::array& controls_out, cudaStream_t stream)
{
  float* exp_costs_d_;
  float* perturbed_controls_d_;
  float* optimal_controls_d_;

  // Allocate CUDA memory for the device pointers
  HANDLE_ERROR(cudaMalloc((void**)&exp_costs_d_, sizeof(float) * exp_costs.size()));
  HANDLE_ERROR(cudaMalloc((void**)&perturbed_controls_d_, sizeof(float) * perturbed_controls.size()));
  HANDLE_ERROR(cudaMalloc((void**)&optimal_controls_d_, sizeof(float) * controls_out.size()));

  // Memcpy the data to the device
  HANDLE_ERROR(cudaMemcpyAsync(exp_costs_d_, exp_costs.data(), sizeof(float) * exp_costs.size(),
                               cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaMemcpyAsync(perturbed_controls_d_, perturbed_controls.data(),
                               sizeof(float) * perturbed_controls.size(), cudaMemcpyHostToDevice, stream));

  // Setup the block and grid sizes
  dim3 dimBlock((NUM_ROLLOUTS - 1) / SUM_STRIDE + 1, 1, 1);
  dim3 dimGrid(NUM_TIMESTEPS, 1, 1);

  // NOTE(review): launch configuration / template arguments stripped.
  mppi::kernels::weightedReductionKernel<<>>(
      exp_costs_d_, perturbed_controls_d_, optimal_controls_d_, normalizer, NUM_TIMESTEPS, NUM_ROLLOUTS, SUM_STRIDE);
  CudaCheckError();

  // Copy the result back to the GPU
  HANDLE_ERROR(cudaMemcpyAsync(controls_out.data(), optimal_controls_d_, sizeof(float) * controls_out.size(),
                               cudaMemcpyDeviceToHost, stream));

  // Free the CUDA memory
  HANDLE_ERROR(cudaFree(exp_costs_d_));
  HANDLE_ERROR(cudaFree(perturbed_controls_d_));
  HANDLE_ERROR(cudaFree(optimal_controls_d_));
}


================================================
FILE: tests/include/kernel_tests/core/weightedreduction_kernel_test.cuh
================================================
#pragma once

#ifndef MPPI_GENERIC_KERNEL_TESTS_MPPI_CORE_WEIGHTEDREDUCTION_KERNEL_TEST_CUH_
#define MPPI_GENERIC_KERNEL_TESTS_MPPI_CORE_WEIGHTEDREDUCTION_KERNEL_TEST_CUH_

// NOTE(review): include targets and all template parameter lists / template
// arguments below were stripped in extraction — restore from the repository.
#include
#include

__global__ void setInitialControlToZero_KernelTest(int control_dim, float* u_d, float* u_intermediate);

template
void launchSetInitialControlToZero_KernelTest(std::array& u_host, std::array& u_intermediate_host);

template
__global__ void strideControlWeightReduction_KernelTest(int num_rollouts, int num_timesteps, int sum_stride,
                                                        float* exp_costs_d, float normalizer, float* du_d,
                                                        float* u_intermediate);

template
void launchStrideControlWeightReduction_KernelTest(float normalizer, const std::array& exp_costs_host,
                                                   const std::array& du_host, std::array& u_intermediate_host);

template
__global__ void rolloutWeightReductionAndSaveControl_KernelTest(int num_rollouts, int num_timesteps, int sum_stride,
                                                                float* u_intermediate, float* du_new_d);

template
void launchRolloutWeightReductionAndSaveControl_KernelTest(const std::array& u_intermediate_host,
                                                           std::array& du_new_host);

template
__global__ void autorallyWeightedReductionKernel(float* states_d, float* du_d, float normalizer, int num_timesteps);

template
void launchAutoRallyWeightedReductionKernelTest(std::array exp_costs, std::array perturbed_controls, float normalizer,
                                                std::array& controls_out, cudaStream_t stream);

template
void launchWeightedReductionKernelTest(std::array exp_costs, std::array perturbed_controls, float normalizer,
                                       std::array& controls_out, cudaStream_t stream);

#if __CUDACC__
#include "weightedreduction_kernel_test.cu"
#endif

#endif  //! MPPI_GENERIC_KERNEL_TESTS_MPPI_CORE_WEIGHTEDREDUCTION_KERNEL_TEST_CUH_


================================================
FILE: tests/include/kernel_tests/cost_functions/autorally/ar_robust_cost_kernel_test.cu
================================================
//
// Created by jgibson37 on 2/7/20.
// template __global__ void getCostmapCostTestKernel(CLASS_T* cost, float* test_xu, float* cost_results, int num_points) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < num_points) { float* state = &test_xu[tid]; cost_results[tid] = cost->getCostmapCost(state); } } template void launchGetCostmapCostTestKernel(CLASS_T& cost, std::vector>& test_xu, std::vector& cost_results) { int num_test_points = test_xu.size(); cost_results.resize(num_test_points * 9); float* cost_results_d; float* test_xu_d; HANDLE_ERROR(cudaMalloc((void**)&cost_results_d, sizeof(float) * num_test_points)) HANDLE_ERROR(cudaMalloc((void**)&test_xu_d, sizeof(float) * 9 * num_test_points)) for (int i = 0; i < num_test_points; i++) { for (int j = 0; j < 9; j++) { cost_results[9 * i + j] = test_xu[i][j]; } } HANDLE_ERROR(cudaMemcpy(test_xu_d, test_xu.data(), sizeof(float) * 9 * num_test_points, cudaMemcpyHostToDevice)); // TODO amount should depend on the number of query points dim3 threadsPerBlock(num_test_points, 1); dim3 numBlocks(1, 1); getCostmapCostTestKernel <<>>(static_cast(cost.cost_d_), test_xu_d, cost_results_d, num_test_points); CudaCheckError(); cudaDeviceSynchronize(); // Copy the memory back to the host HANDLE_ERROR( cudaMemcpy(cost_results.data(), cost_results_d, sizeof(float) * num_test_points, cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); cudaFree(cost_results_d); cudaFree(test_xu_d); } ================================================ FILE: tests/include/kernel_tests/cost_functions/autorally/ar_robust_cost_kernel_test.cuh ================================================ // // Created by jgibson37 on 2/7/20. 
//

#ifndef MPPIGENERIC_AR_ROBUST_COST_KERNEL_TEST_CUH
#define MPPIGENERIC_AR_ROBUST_COST_KERNEL_TEST_CUH

#include "ar_standard_cost_kernel_test.cuh"

// NOTE(review): all "template" keywords and std::vector template arguments in this
// header lost their angle-bracket contents in extraction — restore from repository.
template
__global__ void getCostmapCostTestKernel(CLASS_T* cost, float* test_xu, float* cost_results, int num_points);

template
void launchGetCostmapCostTestKernel(CLASS_T& cost, std::vector>& test_xu, std::vector& cost_results);

#include "ar_robust_cost_kernel_test.cu"

#endif  // MPPIGENERIC_AR_ROBUST_COST_KERNEL_TEST_CUH


================================================
FILE: tests/include/kernel_tests/cost_functions/autorally/ar_standard_cost_kernel_test.cu
================================================
// NOTE(review): the #include target below was stripped in extraction.
#include

// Thread 0 copies the device-side cost params and costmap width/height into the
// provided references so the host can verify GPU state.
// NOTE(review): template parameter list stripped (COST_T and PARAMS_T are used).
template
__global__ void parameterTestKernel(const COST_T* cost, PARAMS_T& params, int& width, int& height)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // printf("\nEntering the kernel!\n");
  // printf("The thread id is: %i\n", tid);
  if (tid == 0)
  {
    // printf("")
    params = cost->getParams();
    width = cost->getWidth();
    height = cost->getHeight();
  }
}

// Host driver for parameterTestKernel: reads params/width/height off the GPU copy
// of the cost object.
// NOTE(review): width_d/height_d are allocated and memcpy'd with sizeof(float)
// although they are int* — works only because sizeof(int)==sizeof(float) here; also
// width_d and height_d are never cudaFree'd (only params_d is). Flagging, not fixing.
template
void launchParameterTestKernel(const COST_T& cost, PARAMS_T& params, int& width, int& height)
{
  PARAMS_T* params_d;
  int* width_d;
  int* height_d;
  HANDLE_ERROR(cudaMalloc((void**)&params_d, sizeof(PARAMS_T)))
  HANDLE_ERROR(cudaMalloc((void**)&width_d, sizeof(float)))
  HANDLE_ERROR(cudaMalloc((void**)&height_d, sizeof(float)))

  // NOTE(review): static_cast target type stripped — presumably <const COST_T*>.
  parameterTestKernel<<<1, 1>>>(static_cast(cost.cost_d_), *params_d, *width_d, *height_d);
  CudaCheckError();

  // Copy the memory back to the host
  HANDLE_ERROR(cudaMemcpy(&params, params_d, sizeof(PARAMS_T), cudaMemcpyDeviceToHost));
  HANDLE_ERROR(cudaMemcpy(&width, width_d, sizeof(float), cudaMemcpyDeviceToHost));
  HANDLE_ERROR(cudaMemcpy(&height, height_d, sizeof(float), cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();

  cudaFree(params_d);
}

// TODO actually check texture
// Placeholder kernel: currently just zeroes the result array; the cudaArray read
// is commented out.
__global__ void checkCudaArrayKernel(float4* result_arr, cudaArray* array, int number)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // printf("\nEntering the kernel!\n");
  // printf("The thread id is: %i\n", tid);
  if (tid < number)
  {
    // printf("The thread id is: %i\n", tid);
    result_arr[tid].x = 0.0;
    result_arr[tid].y = 0.0;
    result_arr[tid].z = 0.0;
    result_arr[tid].w = 0.0;
    // result_arr[tid] = array[tid];
    // printf(array[tid]);
  }
}

// Host driver for checkCudaArrayKernel.
void launchCheckCudaArray(std::vector& result_arr, cudaArray* array, int number)
{
  float4* results_d;
  HANDLE_ERROR(cudaMalloc((void**)&results_d, sizeof(float4) * number));

  result_arr.resize(number);

  dim3 threadsPerBlock(4, 1);
  dim3 numBlocks(1, 1);
  // NOTE(review): launch configuration stripped — presumably
  // <<<numBlocks, threadsPerBlock>>>.
  checkCudaArrayKernel<<>>(results_d, array, number);
  CudaCheckError();
  cudaDeviceSynchronize();

  // Copy the memory back to the host
  HANDLE_ERROR(cudaMemcpy(result_arr.data(), results_d, sizeof(float4) * number, cudaMemcpyDeviceToHost));

  cudaFree(results_d);
}

// Thread 0 reads the costmap coordinate-transform params (r_c1, r_c2, trs) from the
// device cost object into the results array.
template
__global__ void transformTestKernel(float3* results, COST_T* cost)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // printf("\nEntering the kernel!\n");
  // printf("The thread id is: %i\n", tid);
  if (tid == 0)
  {
    // printf("")
    results[0] = cost->getParams().r_c1;
    results[1] = cost->getParams().r_c2;
    results[2] = cost->getParams().trs;
  }
}

// Host driver for transformTestKernel.
template
void launchTransformTestKernel(std::vector& result, const COST_T& cost)
{
  result.resize(3);
  // Allocate memory on the CPU for checking the mass
  float3* results_d;
  HANDLE_ERROR(cudaMalloc((void**)&results_d, sizeof(float3) * 3))

  transformTestKernel<<<1, 1>>>(results_d, cost.cost_d_);
  CudaCheckError();

  // Copy the memory back to the host
  HANDLE_ERROR(cudaMemcpy(result.data(), results_d, sizeof(float3) * 3, cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();

  cudaFree(results_d);
}

// One thread per query point: samples the costmap texture at the raw (untransformed)
// index and stores the float4 result.
template
__global__ void textureTestKernel(const COST_T& cost, float4* test_results, float2* test_indexes, int num_points)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // printf("\nEntering the kernel!\n");
  // printf("The thread id is: %i\n", tid);
  if (tid < num_points)
  {
    // printf("thread id: %i went to check texture at index %i, %i\n", tid, test_indexes[tid].x, test_indexes[tid].y);
    // query texture
    float4 track_params_back = cost.queryTexture(test_indexes[tid].x, test_indexes[tid].y);
    // put result in array
    // printf("thread id: %i got texture point (%f, %f, %f, %f)\n", tid, track_params_back.x, track_params_back.y,
    // track_params_back.z, track_params_back.w);
    test_results[tid] = track_params_back;
    // test_results[tid].x = 1;
  }
}

// Host driver for textureTestKernel.
template
void launchTextureTestKernel(const COST_T& cost, std::vector& test_results, std::vector& test_indexes)
{
  int num_test_points = test_indexes.size();
  test_results.resize(num_test_points);

  float4* tex_results_d;
  float2* tex_test_indexes_d;
  HANDLE_ERROR(cudaMalloc((void**)&tex_results_d, sizeof(float4) * num_test_points))
  HANDLE_ERROR(cudaMalloc((void**)&tex_test_indexes_d, sizeof(float2) * num_test_points))

  HANDLE_ERROR(
      cudaMemcpy(tex_test_indexes_d, test_indexes.data(), sizeof(float2) * num_test_points, cudaMemcpyHostToDevice));

  // TODO amount should depend on the number of query points
  dim3 threadsPerBlock(num_test_points, 1);
  dim3 numBlocks(1, 1);
  // NOTE(review): launch configuration stripped — presumably
  // <<<numBlocks, threadsPerBlock>>>.
  textureTestKernel<<>>(*cost.cost_d_, tex_results_d, tex_test_indexes_d, num_test_points);
  CudaCheckError();
  cudaDeviceSynchronize();

  // Copy the memory back to the host
  HANDLE_ERROR(
      cudaMemcpy(test_results.data(), tex_results_d, sizeof(float4) * num_test_points, cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();

  cudaFree(tex_results_d);
  cudaFree(tex_test_indexes_d);
}

// Same as textureTestKernel but queries through the coordinate transform
// (queryTextureTransformed).
template
__global__ void textureTransformTestKernel(COST_T& cost, float4* test_results, float2* test_indexes, int num_points)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // printf("\nEntering the kernel!\n");
  // printf("The thread id is: %i\n", tid);
  if (tid < num_points)
  {
    // query texture
    float4 track_params_back = cost.queryTextureTransformed(test_indexes[tid].x, test_indexes[tid].y);
    // put result in array
    test_results[tid] = track_params_back;
    // test_results[tid].x = 1;
  }
}

// Host driver for textureTransformTestKernel.
template
void launchTextureTransformTestKernel(const COST_T& cost, std::vector& test_results, std::vector& test_indexes)
{
  int num_test_points = test_indexes.size();
  test_results.resize(num_test_points);

  float4* tex_results_d;
  float2* tex_test_indexes_d;
  HANDLE_ERROR(cudaMalloc((void**)&tex_results_d, sizeof(float4) * num_test_points))
  HANDLE_ERROR(cudaMalloc((void**)&tex_test_indexes_d, sizeof(float2) * num_test_points))

  HANDLE_ERROR(
      cudaMemcpy(tex_test_indexes_d, test_indexes.data(), sizeof(float2) * num_test_points, cudaMemcpyHostToDevice));

  // TODO amount should depend on the number of query points
  dim3 threadsPerBlock(num_test_points, 1);
  dim3 numBlocks(1, 1);
  // NOTE(review): launch configuration stripped — presumably
  // <<<numBlocks, threadsPerBlock>>>.
  textureTransformTestKernel
      <<>>(*cost.cost_d_, tex_results_d, tex_test_indexes_d, num_test_points);
  CudaCheckError();
  cudaDeviceSynchronize();

  // Copy the memory back to the host
  HANDLE_ERROR(
      cudaMemcpy(test_results.data(), tex_results_d, sizeof(float4) * num_test_points, cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();

  cudaFree(tex_results_d);
  cudaFree(tex_test_indexes_d);
}

// One thread per query point: evaluates getTrackCost at an (x, y, heading) point and
// records both the cost and the resulting crash flag.
template
__global__ void trackCostTestKernel(COST_T* cost, float3* test_indexes, int num_points, float* cost_results,
                                    int* crash_results)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_points)
  {
    float state[7];
    int crash = 0;
    state[0] = test_indexes[tid].x;
    state[1] = test_indexes[tid].y;
    state[2] = test_indexes[tid].z;

    // printf("got test indexes %d, state %f, %f, %f\n", tid, state[0], state[1], state[2]);
    cost_results[tid] = cost->getTrackCost(state, &crash);
    // printf("set results %d\n", tid);
    crash_results[tid] = crash;
    // printf("set crash results %d\n", tid);
  }
}

// Host driver for trackCostTestKernel.
template
void launchTrackCostTestKernel(const COST_T& cost, std::vector& test_indexes, std::vector& cost_results,
                               std::vector& crash_results)
{
  int num_test_points = test_indexes.size();
  crash_results.resize(num_test_points);
  cost_results.resize(num_test_points);

  float* cost_results_d;
  int* crash_results_d;
  float3* test_indexes_d;
  HANDLE_ERROR(cudaMalloc((void**)&cost_results_d, sizeof(float) * num_test_points))
  HANDLE_ERROR(cudaMalloc((void**)&crash_results_d, sizeof(int) * num_test_points))
  HANDLE_ERROR(cudaMalloc((void**)&test_indexes_d, sizeof(float3) * num_test_points))

  HANDLE_ERROR(
      cudaMemcpy(test_indexes_d, test_indexes.data(), sizeof(float3) * num_test_points, cudaMemcpyHostToDevice));

  // TODO amount should depend on the number of query points
  dim3 threadsPerBlock(num_test_points, 1);
  dim3 numBlocks(1, 1);
  // NOTE(review): launch configuration stripped — presumably
  // <<<numBlocks, threadsPerBlock>>>.
  trackCostTestKernel
      <<>>(cost.cost_d_, test_indexes_d, num_test_points, cost_results_d, crash_results_d);
  CudaCheckError();
  cudaDeviceSynchronize();

  // Copy the memory back to the host
  HANDLE_ERROR(
      cudaMemcpy(cost_results.data(), cost_results_d, sizeof(float) * num_test_points, cudaMemcpyDeviceToHost));
  HANDLE_ERROR(
      cudaMemcpy(crash_results.data(), crash_results_d, sizeof(int) * num_test_points, cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();

  cudaFree(cost_results_d);
  cudaFree(crash_results_d);
  cudaFree(test_indexes_d);
}

// One thread per query point: evaluates terminalCost on the state pointer.
// NOTE(review): &test_xu[tid] advances by 1 float per thread rather than by the
// 7-float state stride used on the host side — verify against the repository.
template
__global__ void terminalCostTestKernel(COST_T& cost, float* test_xu, float* cost_results, int num_points)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_points)
  {
    float* state = &test_xu[tid];
    cost_results[tid] = cost.terminalCost(state, nullptr);
  }
}

// Host driver for terminalCostTestKernel: flattens 7-float states, uploads them, and
// launches one thread per test point. (Definition continues beyond this chunk.)
template
void launchTerminalCostTestKernel(const COST_T& cost, std::vector>& x, std::vector& cost_results)
{
  int num_test_points = x.size();
  cost_results.resize(num_test_points * 7);

  float* cost_results_d;
  float* u_d;
  HANDLE_ERROR(cudaMalloc((void**)&cost_results_d, sizeof(float) * num_test_points))
  HANDLE_ERROR(cudaMalloc((void**)&u_d, sizeof(float) * 7 * num_test_points))

  for (int i = 0; i < num_test_points; i++)
  {
    for (int j = 0; j < 7; j++)
    {
      cost_results[7 * i + j] = x[i][j];
    }
  }

  HANDLE_ERROR(cudaMemcpy(u_d, x.data(), sizeof(float) * 7 * num_test_points, cudaMemcpyHostToDevice));

  // TODO amount should depend on the number of query points
  dim3 threadsPerBlock(num_test_points, 1);
  dim3 numBlocks(1, 1);
  terminalCostTestKernel<<>>(*cost.cost_d_, u_d, cost_results_d,
num_test_points); CudaCheckError(); cudaDeviceSynchronize(); // Copy the memory back to the host HANDLE_ERROR( cudaMemcpy(cost_results.data(), cost_results_d, sizeof(float) * num_test_points, cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); cudaFree(cost_results_d); cudaFree(u_d); } template __global__ void computeCostTestKernel(COST_T* cost, float* test_xu, float* cost_results, int* timestep, int* crash, int num_points) { // __shared__ float theta_c[COST_T::SHARED_MEM_REQUEST_GRD_BYTES + COST_T::SHARED_MEM_REQUEST_BLK_BYTES * 1024]; extern __shared__ float theta_c[]; int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < num_points) { float* state = &test_xu[tid * 9]; float* control = &test_xu[tid * 9 + 7]; cost->initializeCosts(state, control, theta_c, 0.0, 0.01); cost_results[tid] = cost->computeRunningCost(state, control, timestep[tid], theta_c, crash + tid); } } template void launchComputeCostTestKernel(const COST_T& cost, std::vector>& test_xu, std::vector& cost_results, std::vector& timesteps, std::vector& crash) { int num_test_points = test_xu.size(); cost_results.resize(num_test_points); float* cost_results_d; float* test_xu_d; int* timestep_d; int* crash_d; HANDLE_ERROR(cudaMalloc((void**)&cost_results_d, sizeof(float) * num_test_points)) HANDLE_ERROR(cudaMalloc((void**)&test_xu_d, sizeof(float) * 9 * num_test_points)) HANDLE_ERROR(cudaMalloc((void**)×tep_d, sizeof(float) * num_test_points)) HANDLE_ERROR(cudaMalloc((void**)&crash_d, sizeof(float) * num_test_points)) HANDLE_ERROR(cudaMemcpy(test_xu_d, test_xu.data(), sizeof(float) * 9 * num_test_points, cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(timestep_d, timesteps.data(), sizeof(int) * num_test_points, cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(crash_d, crash.data(), sizeof(int) * num_test_points, cudaMemcpyHostToDevice)); // TODO amount should depend on the number of query points dim3 threadsPerBlock(num_test_points, 1); dim3 numBlocks(1, 1); unsigned shared_mem_size = 
mppi::kernels::calcClassSharedMemSize(&cost, threadsPerBlock); computeCostTestKernel<<>>(cost.cost_d_, test_xu_d, cost_results_d, timestep_d, crash_d, num_test_points); CudaCheckError(); cudaDeviceSynchronize(); // Copy the memory back to the host HANDLE_ERROR( cudaMemcpy(cost_results.data(), cost_results_d, sizeof(float) * num_test_points, cudaMemcpyDeviceToHost)); HANDLE_ERROR(cudaMemcpy(crash.data(), crash_d, sizeof(float) * num_test_points, cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); cudaFree(cost_results_d); cudaFree(test_xu_d); } ================================================ FILE: tests/include/kernel_tests/cost_functions/autorally/ar_standard_cost_kernel_test.cuh ================================================ // // Created by jason on 1/8/20. // #ifndef MPPIGENERIC_AR_STANDARD_COST_KERNEL_TEST_CUH #define MPPIGENERIC_AR_STANDARD_COST_KERNEL_TEST_CUH #include template __global__ void parameterTestKernel(const COST_T* cost, PARAMS_T& params, int& width, int& height); template void launchParameterTestKernel(const COST_T& cost, PARAMS_T& params, int& width, int& height); void launchCheckCudaArray(std::vector& result_arr, cudaArray* array, int number); __global__ void checkCudaArrayKernel(float4* result_arr, cudaArray* array, int number); template void launchTransformTestKernel(std::vector& result, const COST_T& cost); template __global__ void transformTestKernel(float3* results, COST_T* cost); template void launchTextureTestKernel(const COST_T& cost, std::vector& test_results, std::vector& test_indexes); template __global__ void textureTestKernel(const COST_T& cost, float4* test_results, float2* test_indexes, int num_points); template void launchTextureTransformTestKernel(const COST_T& cost, std::vector& test_results, std::vector& test_indexes); template __global__ void textureTransformTestKernel(COST_T& cost, float4* test_results, float2* test_indexes, int num_points); template void launchTrackCostTestKernel(const COST_T& cost, std::vector& 
test_indexes, std::vector& cost_results, std::vector& crash_results); template __global__ void trackCostTestKernel(COST_T* cost, float3* test_indexes, int num_points, float* cost_results, int* crash_results); template void launchComputeCostTestKernel(const COST_T& cost, std::vector>& test_xu, std::vector& cost_results, std::vector& timesteps); template __global__ void computeCostTestKernel(COST_T& cost, float* test_xu, float* cost_results, int* timesteps, int num_points); #include "ar_standard_cost_kernel_test.cu" #endif // MPPIGENERIC_AR_STANDARD_COST_KERNEL_TEST_CUH ================================================ FILE: tests/include/kernel_tests/cost_functions/cartpole/cartpole_quadratic_cost_kernel_test.cu ================================================ __global__ void parameterTestKernel(CartpoleQuadraticCost* cost_d, CartpoleQuadraticCostParams& params_d) { // The parameters have been set outside of the kernel on the device, copy the current values of the parameters to // params_d int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid == 0) { params_d = cost_d->getParams(); } } void launchParameterTestKernel(const CartpoleQuadraticCost& cost, CartpoleQuadraticCostParams& param_check) { // Allocate memory for the device side parameter structure CartpoleQuadraticCostParams* param_d = nullptr; HANDLE_ERROR(cudaMalloc((void**)¶m_d, sizeof(CartpoleQuadraticCostParams))) parameterTestKernel<<<1, 1>>>(cost.cost_d_, *param_d); CudaCheckError(); HANDLE_ERROR(cudaMemcpy(¶m_check, param_d, sizeof(param_check), cudaMemcpyDeviceToHost)) cudaFree(param_d); } ================================================ FILE: tests/include/kernel_tests/cost_functions/cartpole/cartpole_quadratic_cost_kernel_test.cuh ================================================ #ifndef MPPIGENERIC_CARTPOLE_QUADRATIC_COST_KERNEL_TEST_CUH #define MPPIGENERIC_CARTPOLE_QUADRATIC_COST_KERNEL_TEST_CUH #include /** * In this cost function, we are supplied with a device side cost class, and a device side 
parameter structure */ __global__ void parameterTestKernel(CartpoleQuadraticCost* cost_d, CartpoleQuadraticCostParams* params_d); /** * */ void launchParameterTestKernel(const CartpoleQuadraticCost& cost, CartpoleQuadraticCostParams& param_check); #include "cartpole_quadratic_cost_kernel_test.cu" #endif //! MPPIGENERIC_CARTPOLE_QUADRATIC_COST_KERNEL_TEST_CUH ================================================ FILE: tests/include/kernel_tests/cost_functions/cost_generic_kernel_tests.cu ================================================ #include "cost_generic_kernel_tests.cuh" template __global__ void computeRunningCostTestKernel(COST_T* __restrict__ cost, const float* __restrict__ y_d, const float* __restrict__ u_d, int num_rollouts, int num_timesteps, float dt, float* __restrict__ output_cost) { const int sample_idx = blockIdx.x; const int t = threadIdx.x; const int max_time_iters = ceilf((float)num_timesteps / blockDim.x); const int sys_index = threadIdx.z + blockDim.z * blockIdx.z; extern __shared__ float entire_buffer[]; float* y_shared = entire_buffer; float* u_shared = &y_shared[mppi::math::nearest_multiple_4(blockDim.x * blockDim.z * COST_T::OUTPUT_DIM)]; // int* crash_shared = (int*)&u_shared[mppi::math::nearest_multiple_4(blockDim.x * blockDim.z * COST_T::CONTROL_DIM)]; // float* running_cost_shared = (float*)&crash_shared[mppi::math::nearest_multiple_4(blockDim.x * blockDim.z)]; // float* theta_c_shared = &running_cost_shared[mppi::math::nearest_multiple_4(blockDim.x * blockDim.z * blockDim.y)]; float* running_cost_shared = &u_shared[mppi::math::nearest_multiple_4(blockDim.x * blockDim.z * COST_T::CONTROL_DIM)]; int* crash_shared = (int*)&running_cost_shared[mppi::math::nearest_multiple_4(blockDim.x * blockDim.z * blockDim.y)]; float* theta_c_shared = (float*)&crash_shared[mppi::math::nearest_multiple_4(blockDim.x * blockDim.z)]; const int shared_idx = threadIdx.x + blockDim.x * threadIdx.z; const int cost_idx = threadIdx.x + blockDim.x * (threadIdx.y + 
blockDim.y * threadIdx.z); float* y = &y_shared[shared_idx * COST_T::OUTPUT_DIM]; float* u = &u_shared[shared_idx * COST_T::CONTROL_DIM]; float* running_cost = &running_cost_shared[cost_idx]; int* crash = &crash_shared[shared_idx]; *crash = 0; // Fill in shared memory const int global_mem_index = (num_rollouts * sys_index + sample_idx) * num_timesteps + t; mp1::loadArrayParallel(y, 0, y_d, global_mem_index * COST_T::OUTPUT_DIM); mp1::loadArrayParallel(u, 0, u_d, global_mem_index * COST_T::CONTROL_DIM); __syncthreads(); cost->initializeCosts(y, u, theta_c_shared, 0, dt); __syncthreads(); running_cost[0] = cost->computeRunningCost(y, u, t, theta_c_shared, crash); __syncthreads(); // if (threadIdx.y == 0) // { // int num_zeros = 0; // for (int i = 0; i < blockDim.y; i++) // { // if (running_cost[i * blockDim.x] == 0) // { // num_zeros++; // } // } // if (num_zeros != blockDim.y - 1) // { // printf("Sample %d, t %d block_y %d: ", sample_idx, t, blockDim.y); // for (int i = 0; i < blockDim.y; i++) // { // printf("%f, ", running_cost[i * blockDim.x]); // } // printf("\n"); // } // } // if (sample_idx == 0 && t == 0 && blockDim.y < 4) // { // printf("Cost y %d: %f\n", threadIdx.y, running_cost[0]); // } // __syncthreads(); running_cost = &running_cost_shared[threadIdx.x + blockDim.x * blockDim.y * threadIdx.z]; __syncthreads(); mppi::kernels::costArrayReduction(running_cost, blockDim.y, threadIdx.y, blockDim.y, threadIdx.y == 0, blockDim.x); // int prev_size = blockDim.y; // // Allow for better computation when blockDim.x is a power of 2 // const bool block_power_of_2 = (prev_size & (prev_size - 1)) == 0; // const int stop_condition = (block_power_of_2) ? 
32 : 0; // int size; // const int xy_index = threadIdx.y; // const int xy_step = blockDim.y; // for (size = prev_size / 2; size > stop_condition; size /= 2) // { // for (int j = xy_index; j < size; j += xy_step) // { // running_cost[j * blockDim.x] += running_cost[(j + size) * blockDim.x]; // } // __syncthreads(); // if (prev_size - 2 * size == 1 && xy_index == 0) // { // running_cost[(size - 1) * blockDim.x] += running_cost[(prev_size - 1) * blockDim.x]; // } // __syncthreads(); // prev_size = size; // } // if (xy_index < 32 && stop_condition != 0) // { // unroll the last warp // switch (size * 2) // { // case 64: // mppi::kernels::warpReduceAdd<64>(running_cost, xy_index, blockDim.x); // break; // case 32: // mppi::kernels::warpReduceAdd<32>(running_cost, xy_index, blockDim.x); // break; // case 16: // mppi::kernels::warpReduceAdd<16>(running_cost, xy_index, blockDim.x); // break; // case 8: // mppi::kernels::warpReduceAdd<8>(running_cost, xy_index, blockDim.x); // break; // case 4: // mppi::kernels::warpReduceAdd<4>(running_cost, xy_index, blockDim.x); // break; // case 2: // mppi::kernels::warpReduceAdd<2>(running_cost, xy_index, blockDim.x); // break; // case 1: // mppi::kernels::warpReduceAdd<1>(running_cost, xy_index, blockDim.x); // break; // } // } // __syncthreads(); // __syncthreads(); // if (sample_idx == 0 && t == 0 && blockDim.y < 4) // { // printf("Final Cost y %d: %f\n", threadIdx.y, running_cost[0]); // } __syncthreads(); if (threadIdx.y == 0) { output_cost[global_mem_index] = running_cost[0]; } } template __global__ void computeTerminalCostTestKernel(COST_T* __restrict__ cost, const float* __restrict__ y_d, int num_rollouts, float dt, float* __restrict__ output_cost) { const int sample_idx = threadIdx.x + blockIdx.x * blockDim.x; const int sys_index = threadIdx.z + blockDim.z * blockIdx.z; extern __shared__ float entire_buffer[]; float* y_shared = entire_buffer; float* u_shared = &y_shared[mppi::math::nearest_multiple_4(blockDim.x * blockDim.z * 
COST_T::OUTPUT_DIM)]; float* theta_c_shared = &u_shared[mppi::math::nearest_multiple_4(blockDim.x * blockDim.z * COST_T::CONTROL_DIM)]; const int shared_idx = threadIdx.x + blockDim.x * threadIdx.z; float* y = &y_shared[shared_idx * COST_T::OUTPUT_DIM]; float* u = &u_shared[shared_idx * COST_T::CONTROL_DIM]; // Fill in shared memory const int global_mem_index = num_rollouts * sys_index + sample_idx; mp1::loadArrayParallel(y, 0, y_d, global_mem_index * COST_T::OUTPUT_DIM); for (int i = threadIdx.y; i < COST_T::CONTROL_DIM; i += blockDim.y) { u[i] = 0.0f; } __syncthreads(); cost->initializeCosts(y, u, theta_c_shared, 0, dt); __syncthreads(); output_cost[global_mem_index] = cost->terminalCost(y, theta_c_shared); } template void launchRunningCostTestKernel(COST_T& cost, std::vector>& y, std::vector>& u, int num_rollouts, int num_timesteps, float dt, int dim_y, std::vector& output_costs) { if (y.size() != num_rollouts * num_timesteps) { std::cout << "Number of outputs does not match num_rollouts * num_timesteps. Output: " << y.size() << ", num_rollouts: " << num_rollouts << ", num_timesteps: " << num_timesteps << std::endl; exit(-1); } if (u.size() != y.size()) { std::cout << "Number of controls does not match number of outputs. 
Output: " << y.size() << ", Control: " << u.size() << std::endl; exit(-1); } dim3 block_dim = dim3(num_timesteps, dim_y, 1); dim3 grid_dim = dim3(num_rollouts, 1, 1); cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); // Global Memory setup float* y_d; float* u_d; float* output_costs_d; HANDLE_ERROR(cudaMalloc((void**)&y_d, sizeof(float) * num_rollouts * num_timesteps * COST_T::OUTPUT_DIM)); HANDLE_ERROR(cudaMalloc((void**)&u_d, sizeof(float) * num_rollouts * num_timesteps * COST_T::CONTROL_DIM)); HANDLE_ERROR(cudaMalloc((void**)&output_costs_d, sizeof(float) * num_rollouts * num_timesteps)); // Copy data to GPU for (int k = 0; k < num_rollouts; k++) { for (int t = 0; t < num_timesteps; t++) { const int index = k * num_timesteps + t; HANDLE_ERROR(cudaMemcpyAsync(y_d + index * COST_T::OUTPUT_DIM, y[index].data(), sizeof(float) * COST_T::OUTPUT_DIM, cudaMemcpyHostToDevice, stream)); HANDLE_ERROR(cudaMemcpyAsync(u_d + index * COST_T::CONTROL_DIM, u[index].data(), sizeof(float) * COST_T::CONTROL_DIM, cudaMemcpyHostToDevice, stream)); } } // Figure shared memory size const int block_num_shared = block_dim.x * block_dim.z; const int block_cost_shared_mem_size = mppi::math::int_multiple_const(COST_T::SHARED_MEM_REQUEST_GRD_BYTES, sizeof(float4)) + block_num_shared * mppi::math::int_multiple_const(COST_T::SHARED_MEM_REQUEST_BLK_BYTES, sizeof(float4)); unsigned compute_cost_shared_mem_size = (mppi::math::nearest_multiple_4(block_num_shared * COST_T::OUTPUT_DIM) + mppi::math::nearest_multiple_4(block_num_shared * COST_T::CONTROL_DIM) + mppi::math::nearest_multiple_4(block_num_shared * dim_y)) * sizeof(float) + mppi::math::nearest_multiple_4(block_num_shared) * sizeof(int) + block_cost_shared_mem_size; // Launch kernel computeRunningCostTestKernel<<>>( cost.cost_d_, y_d, u_d, num_rollouts, num_timesteps, dt, output_costs_d); // Copy memory back to CPU output_costs.resize(num_rollouts * num_timesteps); HANDLE_ERROR(cudaMemcpyAsync(output_costs.data(), 
output_costs_d, sizeof(float) * num_rollouts * num_timesteps, cudaMemcpyDeviceToHost, stream)); // Free memory HANDLE_ERROR(cudaFree(y_d)); HANDLE_ERROR(cudaFree(u_d)); HANDLE_ERROR(cudaFree(output_costs_d)); HANDLE_ERROR(cudaStreamSynchronize(stream)); } template void launchRolloutCostKernel(COST_T& cost, SAMPLING_T& sampler, std::vector>& y, int num_rollouts, int num_timesteps, float dt, int dim_y, std::vector& output_costs) { if (y.size() != num_rollouts * num_timesteps) { std::cout << "Number of outputs does not match num_rollouts * num_timesteps. Output: " << y.size() << ", num_rollouts: " << num_rollouts << ", num_timesteps: " << num_timesteps << std::endl; exit(-1); } dim3 block_dim = dim3(num_timesteps, dim_y, 1); dim3 grid_dim = dim3(num_rollouts, 1, 1); cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); sampler.bindToStream(stream); // Global Memory setup float* y_d; float* output_costs_d; HANDLE_ERROR(cudaMalloc((void**)&y_d, sizeof(float) * num_rollouts * num_timesteps * COST_T::OUTPUT_DIM)); HANDLE_ERROR(cudaMalloc((void**)&output_costs_d, sizeof(float) * num_rollouts * num_timesteps)); // Copy data to GPU for (int k = 0; k < num_rollouts; k++) { for (int t = 0; t < num_timesteps; t++) { const int index = k * num_timesteps + t; HANDLE_ERROR(cudaMemcpyAsync(y_d + index * COST_T::OUTPUT_DIM, y[index].data(), sizeof(float) * COST_T::OUTPUT_DIM, cudaMemcpyHostToDevice, stream)); } } // Figure shared memory size // const int block_num_shared = block_dim.x * block_dim.z; unsigned compute_cost_shared_mem_size = mppi::kernels::calcRolloutCostKernelSharedMemSize(&cost, &sampler, block_dim); // sizeof(float) * (mppi::math::nearest_multiple_4(block_num_shared * COST_T::OUTPUT_DIM) + // mppi::math::nearest_multiple_4(block_num_shared * COST_T::CONTROL_DIM) + // mppi::math::nearest_multiple_4(block_num_shared * block_dim.y)) + // sizeof(int) * mppi::math::nearest_multiple_4(block_num_shared) + // 
mppi::math::int_multiple_const(COST_T::SHARED_MEM_REQUEST_GRD_BYTES, sizeof(float4)) + // block_num_shared * mppi::math::int_multiple_const(COST_T::SHARED_MEM_REQUEST_BLK_BYTES, sizeof(float4)) + // mppi::math::int_multiple_const(SAMPLING_T::SHARED_MEM_REQUEST_GRD_BYTES, sizeof(float4)) + // block_num_shared * mppi::math::int_multiple_const(SAMPLING_T::SHARED_MEM_REQUEST_BLK_BYTES, sizeof(float4)); // #ifdef USE_CUDA_BARRIERS_COST // compute_cost_shared_mem_size += mppi::math::int_multiple_const(block_num_shared * sizeof(barrier), 16); // #endif // Launch kernel mppi::kernels::rolloutCostKernel<<>>( cost.cost_d_, sampler.sampling_d_, dt, num_timesteps, num_rollouts, 1.0, 0.0, y_d, output_costs_d); // Copy memory back to CPU output_costs.resize(num_rollouts); HANDLE_ERROR(cudaMemcpyAsync(output_costs.data(), output_costs_d, sizeof(float) * num_rollouts, cudaMemcpyDeviceToHost, stream)); // Free memory HANDLE_ERROR(cudaFree(y_d)); HANDLE_ERROR(cudaFree(output_costs_d)); HANDLE_ERROR(cudaStreamSynchronize(stream)); } template void launchTerminalCostTestKernel(COST_T& cost, std::vector>& y, float dt, int dim_x, int dim_y, std::vector& output_costs) { const int num_rollouts = y.size(); dim3 block_dim = dim3(dim_x, dim_y, 1); dim3 grid_dim = dim3(num_rollouts, 1, 1); cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); // Global Memory setup float* y_d; float* output_costs_d; HANDLE_ERROR(cudaMalloc((void**)&y_d, sizeof(float) * num_rollouts * COST_T::OUTPUT_DIM)); HANDLE_ERROR(cudaMalloc((void**)&output_costs_d, sizeof(float) * num_rollouts)); // Copy data to GPU for (int k = 0; k < num_rollouts; k++) { HANDLE_ERROR(cudaMemcpyAsync(y_d + k * COST_T::OUTPUT_DIM, y[k].data(), sizeof(float) * COST_T::OUTPUT_DIM, cudaMemcpyHostToDevice, stream)); } // Figure shared memory size const int block_num_shared = block_dim.x * block_dim.z; const int block_cost_shared_mem_size = mppi::math::int_multiple_const(COST_T::SHARED_MEM_REQUEST_GRD_BYTES, sizeof(float4)) + 
block_num_shared * mppi::math::int_multiple_const(COST_T::SHARED_MEM_REQUEST_BLK_BYTES, sizeof(float4)); unsigned compute_cost_shared_mem_size = (mppi::math::nearest_multiple_4(block_num_shared * COST_T::OUTPUT_DIM) + mppi::math::nearest_multiple_4(block_num_shared * COST_T::CONTROL_DIM)) * sizeof(float) + block_cost_shared_mem_size; // Launch kernel computeTerminalCostTestKernel<<>>( cost.cost_d_, y_d, num_rollouts, dt, output_costs_d); // Copy memory back to CPU HANDLE_ERROR(cudaMemcpyAsync(output_costs.data(), output_costs_d, sizeof(float) * num_rollouts, cudaMemcpyDeviceToHost, stream)); // Free memory HANDLE_ERROR(cudaFree(y_d)); HANDLE_ERROR(cudaFree(output_costs_d)); HANDLE_ERROR(cudaStreamSynchronize(stream)); } template void checkGPURolloutCost(COST_T& cost, float dt) { cost.GPUSetup(); const int num_rollouts = 512; const int num_timesteps = 8; #ifdef USE_ROLLOUT_COST_KERNEL using SAMPLER_T = mppi::sampling_distributions::GaussianDistribution; using SAMPLER_PARAMS = typename SAMPLER_T::SAMPLING_PARAMS_T; SAMPLER_PARAMS params; for (int i = 0; i < COST_T::CONTROL_DIM; i++) { params.std_dev[i] = 1.0; params.control_cost_coeff[i] = 0.0; } params.num_rollouts = num_rollouts; params.num_timesteps = num_timesteps; SAMPLER_T sampler(params); sampler.GPUSetup(); // Curand generator cudaStream_t stream; cudaStreamCreate(&stream); curandGenerator_t gen; int seed = 42; curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); curandSetPseudoRandomGeneratorSeed(gen, seed); curandSetGeneratorOffset(gen, 0); curandSetStream(gen, stream); sampler.bindToStream(stream); #endif using control_trajectory = Eigen::Matrix; using output_trajectory = Eigen::Matrix; std::vector> y(num_timesteps * num_rollouts); #ifndef USE_ROLLOUT_COST_KERNEL std::vector> u(num_timesteps * num_rollouts); #endif std::vector output_cost_gpu; for (int n = 0; n < num_rollouts; n++) { #ifndef USE_ROLLOUT_COST_KERNEL control_trajectory u_traj = control_trajectory::Random(); #endif output_trajectory y_traj 
= output_trajectory::Random(); for (int t = 0; t < num_timesteps; t++) { int index = t + n * num_timesteps; for (int i = 0; i < COST_T::OUTPUT_DIM; i++) { y[index][i] = y_traj(i, t); } #ifndef USE_ROLLOUT_COST_KERNEL for (int i = 0; i < COST_T::CONTROL_DIM; i++) { u[index][i] = u_traj(i, t); } #endif } } #ifdef USE_ROLLOUT_COST_KERNEL sampler.generateSamples(0, 0, gen, true); #endif for (int y_dim = 1; y_dim < 32; y_dim++) { #ifdef USE_ROLLOUT_COST_KERNEL launchRolloutCostKernel(cost, sampler, y, num_rollouts, num_timesteps, dt, y_dim, output_cost_gpu); #else launchRunningCostTestKernel(cost, y, u, num_rollouts, num_timesteps, dt, y_dim, output_cost_gpu); #endif for (int n = 0; n < num_rollouts; n++) { #ifdef USE_ROLLOUT_COST_KERNEL control_trajectory u_traj_n; HANDLE_ERROR(cudaMemcpy(u_traj_n.data(), sampler.getControlSample(n, 0, 0), sizeof(float) * num_timesteps * COST_T::CONTROL_DIM, cudaMemcpyDeviceToHost)); float total_cost = 0; for (int t = 0; t < num_timesteps; t++) { int index = t + n * num_timesteps; typename COST_T::control_array u_index = u_traj_n.col(t); Eigen::Map y_index(y[index].data()); #else for (int t = 0; t < num_timesteps; t++) { int index = t + n * num_timesteps; Eigen::Map u_index(u[index].data()); Eigen::Map y_index(y[index].data()); #endif int crash = 0; float cpu_cost = cost.computeRunningCost(y_index, u_index, t, &crash); #ifdef USE_ROLLOUT_COST_KERNEL cpu_cost += sampler.computeLikelihoodRatioCost(u_index, n, t, 0); total_cost += cpu_cost; #else ASSERT_NEAR(cpu_cost, output_cost_gpu[index], cpu_cost * 0.0015) << " Sample " << n << ", t " << t << " y_dim " << y_dim; #endif } #ifdef USE_ROLLOUT_COST_KERNEL total_cost /= num_timesteps; Eigen::Map y_index(y[(n + 1) * num_timesteps - 1].data()); total_cost += cost.terminalCost(y_index) / num_timesteps; ASSERT_NEAR(total_cost, output_cost_gpu[n], total_cost * 0.001) << " Sample " << n << " y_dim " << y_dim; #endif } } } ================================================ FILE: 
tests/include/kernel_tests/cost_functions/cost_generic_kernel_tests.cuh ================================================ #pragma once #include #include #include #include #include #include namespace mp1 = ::mppi::p1; #define USE_ROLLOUT_COST_KERNEL template __global__ void computeRunningCostTestKernel(COST_T* __restrict__ cost, const float* __restrict__ y_d, const float* __restrict__ u_d, int num_rollouts, int num_timesteps, float dt, float* __restrict__ output_cost); template __global__ void computeTerminalCostTestKernel(COST_T* __restrict__ cost, const float* __restrict__ y_d, int num_rollouts, float dt, float* __restrict__ output_cost); template void launchRunningCostTestKernel(COST_T& cost, std::vector>& y, std::vector>& u, int num_rollouts, int num_timesteps, float dt, int dim_y, std::vector& output_costs); template void launchRolloutCostKernel(COST_T& cost, SAMPLING_T& sampler, std::vector>& y, int num_rollouts, int num_timesteps, float dt, int dim_y, std::vector& output_costs); template void launchTerminalCostTestKernel(COST_T& cost, std::vector>& y, float dt, int dim_x, int dim_y, std::vector& output_costs); template void checkGPURolloutCost(COST_T& cost, float dt); #ifdef __CUDACC__ #include "cost_generic_kernel_tests.cu" #endif ================================================ FILE: tests/include/kernel_tests/dynamics/autorally/ar_nn_dynamics_kernel_test.cu ================================================ template __global__ void parameterCheckTestKernel(NETWORK_T* model, float* theta, int* stride, int* net_structure) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid == 0) { for (int i = 0; i < THETA_SIZE; i++) { theta[i] = model->getThetaPtr()[i]; } for (int i = 0; i < STRIDE_SIZE; i++) { stride[i] = model->getStrideIdcsPtr()[i]; } for (int i = 0; i < NUM_LAYERS; i++) { net_structure[i] = model->getNetStructurePtr()[i]; } } } template void launchParameterCheckTestKernel(NETWORK_T& model, std::array& theta, std::array& stride, std::array& 
net_structure) { float* theta_d; int* stride_d; int* net_structure_d; HANDLE_ERROR(cudaMalloc((void**)&theta_d, sizeof(float) * theta.size())) HANDLE_ERROR(cudaMalloc((void**)&stride_d, sizeof(int) * stride.size())) HANDLE_ERROR(cudaMalloc((void**)&net_structure_d, sizeof(int) * net_structure.size())) dim3 threadsPerBlock(1, 1); dim3 numBlocks(1, 1); parameterCheckTestKernel <<>>(model.model_d_, theta_d, stride_d, net_structure_d); CudaCheckError(); HANDLE_ERROR(cudaMemcpy(theta.data(), theta_d, sizeof(float) * theta.size(), cudaMemcpyDeviceToHost)) HANDLE_ERROR(cudaMemcpy(stride.data(), stride_d, sizeof(int) * stride.size(), cudaMemcpyDeviceToHost)) HANDLE_ERROR( cudaMemcpy(net_structure.data(), net_structure_d, sizeof(int) * net_structure.size(), cudaMemcpyDeviceToHost)) cudaDeviceSynchronize(); cudaFree(theta_d); cudaFree(stride_d); cudaFree(net_structure_d); } template __global__ void fullARNNTestKernel(NETWORK_T* model, float* state, float* control, float* state_der, float dt) { __shared__ float theta[NETWORK_T::SHARED_MEM_REQUEST_GRD_BYTES / sizeof(float) + 1 + NETWORK_T::SHARED_MEM_REQUEST_BLK_BYTES * BLOCKDIM_X * BLOCKDIM_Z]; __shared__ float output[S_DIM * BLOCKDIM_X * BLOCKDIM_Z]; __shared__ float s_der[S_DIM * BLOCKDIM_X * BLOCKDIM_Z]; model->initializeDynamics(state, control, output, theta, 0.0f, 0.0f); int tid = blockIdx.x * blockDim.x + threadIdx.x; // calls enforce constraints -> compute state derivative -> increment state // printf("before enforceConstraints %d, %d\n", threadIdx.x, threadIdx.y); // printf("enforceConstraints %d, %d\n", threadIdx.x, threadIdx.y); // printf("before enforce Constraints %f, %f\n", (control+(tid*C_DIM))[0], (control+(tid*C_DIM))[1]); model->enforceConstraints(state + (tid * S_DIM), control + (tid * C_DIM)); __syncthreads(); // printf("after enforce Constraints %f, %f\n", (control+(tid*C_DIM))[0], (control+(tid*C_DIM))[1]); model->computeStateDeriv(state + (tid * S_DIM), control + (tid * C_DIM), state_der + (tid * S_DIM), 
theta); __syncthreads(); int shared_indexer = BLOCKDIM_X * threadIdx.z + threadIdx.x; for (int i = threadIdx.y; i < S_DIM; i += blockDim.y) { // printf("index for shared %d, %d\n", S_DIM*shared_indexer + i, S_DIM*tid+i); s_der[S_DIM * shared_indexer + i] = state_der[S_DIM * tid + i]; } __syncthreads(); // if(threadIdx.y == 0) { // printf("state_der = %f, %f, %f, %f, %f, %f, %f\n", state_der[0], state_der[1], state_der[2], state_der[3], // state_der[4], state_der[5], state_der[6]); printf("state = %f, %f, %f, %f, %f, %f, %f\n", state[0], state[1], // state[2], state[3], state[4], state[5], state[6]); printf("s_der = %f, %f, %f, %f, %f, %f, %f\n", s_der[0], // s_der[1], s_der[2], s_der[3], s_der[4], s_der[5], s_der[6]); //} model->updateState(state + (tid * S_DIM), s_der + (shared_indexer * S_DIM), dt); } template void launchFullARNNTestKernel(NETWORK_T& model, std::vector>& state, std::vector>& control, std::vector>& state_der, float dt, int dim_y) { float* state_d; float* state_der_d; float* control_d; HANDLE_ERROR(cudaMalloc((void**)&state_d, sizeof(float) * S_DIM * state.size() * BLOCKDIM_Z)) HANDLE_ERROR(cudaMalloc((void**)&state_der_d, sizeof(float) * S_DIM * state_der.size() * BLOCKDIM_Z)) HANDLE_ERROR(cudaMalloc((void**)&control_d, sizeof(float) * C_DIM * control.size() * BLOCKDIM_Z)) HANDLE_ERROR(cudaMemcpy(state_d, state.data(), sizeof(float) * S_DIM * state.size(), cudaMemcpyHostToDevice)) HANDLE_ERROR( cudaMemcpy(state_der_d, state_der.data(), sizeof(float) * S_DIM * state_der.size(), cudaMemcpyHostToDevice)) HANDLE_ERROR(cudaMemcpy(control_d, control.data(), sizeof(float) * C_DIM * control.size(), cudaMemcpyHostToDevice)) // make sure you cannot use invalid inputs dim3 threadsPerBlock(state.size(), dim_y, BLOCKDIM_Z); dim3 numBlocks(1, 1); // launch kernel fullARNNTestKernel <<>>(model.model_d_, state_d, control_d, state_der_d, dt); CudaCheckError(); HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), 
cudaMemcpyDeviceToHost)) HANDLE_ERROR( cudaMemcpy(state_der.data(), state_der_d, sizeof(float) * S_DIM * state_der.size(), cudaMemcpyDeviceToHost)) HANDLE_ERROR(cudaMemcpy(control.data(), control_d, sizeof(float) * C_DIM * control.size(), cudaMemcpyDeviceToHost)) cudaDeviceSynchronize(); cudaFree(state_d); cudaFree(state_der_d); cudaFree(control_d); } ================================================ FILE: tests/include/kernel_tests/dynamics/autorally/ar_nn_dynamics_kernel_test.cuh ================================================ #ifndef AR_NN_DYNAMICS_KERNEL_TEST_CUH #define AR_NN_DYNAMICS_KERNEL_TEST_CUH #include #include #include "../dynamics_generic_kernel_tests.cuh" #include template void launchParameterCheckTestKernel(NETWORK_T& model, std::array& theta, std::array& stride, std::array& net_structure); template __global__ void parameterCheckTestKernel(NETWORK_T* model, float* theta, int* stride, int* net_structure); template void launchFullARNNTestKernel(NETWORK_T& model, std::vector>& state, std::vector>& control, std::vector>& state_der, float dt, int dim_y); template __global__ void fullARNNTestKernel(NETWORK_T* model, float* state, float* control, float* state_der, float dt); #include "ar_nn_dynamics_kernel_test.cu" #endif // AR_NN_DYNAMICS_KERNEL_TEST_CUH ================================================ FILE: tests/include/kernel_tests/dynamics/cartpole/cartpole_dynamics_kernel_test.cu ================================================ // // Created by mgandhi3 on 1/7/20. 
//
/**
 * Kernels to test device functions of CartpoleDynamics on the GPU.
 */
#include "cartpole_dynamics_kernel_test.cuh"

// NOTE(review): these kernels take `float&`. The host wrappers below bind that
// reference to device memory by writing `*ptr_d` at the launch site; no host
// dereference actually occurs, but the signatures are declared in the header
// and so are kept unchanged.
__global__ void CartMassTestKernel(CartpoleDynamics* CP, float& mass_check)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // Only thread 0 writes the result so multi-thread launches cannot race.
  if (tid == 0)
  {
    mass_check = CP->getCartMass();
  }
}

__global__ void PoleMassTestKernel(CartpoleDynamics* CP, float& mass_check)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid == 0)
  {
    mass_check = CP->getPoleMass();
  }
}

__global__ void GravityTestKernel(CartpoleDynamics* CP, float& gravity_check)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid == 0)
  {
    gravity_check = CP->getGravity();
  }
}

__global__ void PoleLengthTestKernel(CartpoleDynamics* CP, float& length_check)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid == 0)
  {
    length_check = CP->getPoleLength();
  }
}

__global__ void DynamicsTestKernel(CartpoleDynamics* CP, float* state, float* control, float* state_der)
{
  /**
   * Every launched thread calls computeDynamics on the same buffers; the
   * wrapper below launches <<<1, 1>>> so this is currently safe. Fix before
   * launching with more parallelism.
   */
  CP->computeDynamics(state, control, state_der);
}

/**
 * Wrappers for kernels
 */
namespace
{
// Shared implementation for the four scalar-getter launchers below: allocates
// one device float, runs `kernel` on a single thread against the device-side
// model, and copies the result back into `check`.
void launchScalarCheckKernel(void (*kernel)(CartpoleDynamics*, float&), const CartpoleDynamics& CP, float& check)
{
  float* check_d;
  HANDLE_ERROR(cudaMalloc((void**)&check_d, sizeof(float)));
  kernel<<<1, 1>>>(CP.model_d_, *check_d);
  CudaCheckError();
  // Copy the memory back to the host; cudaMemcpy on the default stream
  // synchronizes, so the value is valid afterwards.
  HANDLE_ERROR(cudaMemcpy(&check, check_d, sizeof(float), cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
  cudaFree(check_d);
}
}  // namespace

void launchCartMassTestKernel(const CartpoleDynamics& CP, float& mass_check)
{
  launchScalarCheckKernel(CartMassTestKernel, CP, mass_check);
}

void launchPoleMassTestKernel(const CartpoleDynamics& CP, float& mass_check)
{
  launchScalarCheckKernel(PoleMassTestKernel, CP, mass_check);
}

void launchPoleLengthTestKernel(const CartpoleDynamics& CP, float& length_check)
{
  launchScalarCheckKernel(PoleLengthTestKernel, CP, length_check);
}

void launchGravityTestKernel(const CartpoleDynamics& CP, float& gravity_check)
{
  launchScalarCheckKernel(GravityTestKernel, CP, gravity_check);
}

// Runs computeDynamics once on the GPU copy of the model and returns the
// resulting state derivative in state_der_cpu (STATE_DIM floats).
void launchDynamicsTestKernel(const CartpoleDynamics& CP, float* state_cpu, float* control_cpu, float* state_der_cpu)
{
  float* state_gpu;
  float* control_gpu;
  float* state_der_gpu;
  HANDLE_ERROR(cudaMalloc((void**)&state_gpu, sizeof(float) * CP.STATE_DIM));
  HANDLE_ERROR(cudaMalloc((void**)&control_gpu, sizeof(float) * CP.CONTROL_DIM));
  HANDLE_ERROR(cudaMalloc((void**)&state_der_gpu, sizeof(float) * CP.STATE_DIM));
  HANDLE_ERROR(cudaMemcpy(state_gpu, state_cpu, sizeof(float) * CP.STATE_DIM, cudaMemcpyHostToDevice));
  HANDLE_ERROR(cudaMemcpy(control_gpu, control_cpu, sizeof(float) * CP.CONTROL_DIM, cudaMemcpyHostToDevice));
  DynamicsTestKernel<<<1, 1>>>(CP.model_d_, state_gpu, control_gpu, state_der_gpu);
  CudaCheckError();
  // Copy the memory back to the host (only the derivative is an output).
  HANDLE_ERROR(cudaMemcpy(state_der_cpu, state_der_gpu, sizeof(float) * CP.STATE_DIM, cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
  cudaFree(state_gpu);
  cudaFree(control_gpu);
  cudaFree(state_der_gpu);
}

================================================
FILE: tests/include/kernel_tests/dynamics/cartpole/cartpole_dynamics_kernel_test.cuh
================================================
#pragma once
#ifndef KERNEL_TESTS_DYNAMICS_CARTPOLE_CARTPOLE_KERNEL_TEST_CUH_
#define KERNEL_TESTS_DYNAMICS_CARTPOLE_CARTPOLE_KERNEL_TEST_CUH_

#include  // NOTE(review): include target lost in extraction — presumably <mppi/dynamics/cartpole/cartpole_dynamics.cuh>; verify against repository

// Device-side checks of the parameters stored in the GPU copy of the dynamics.
__global__ void CartMassTestKernel(CartpoleDynamics* CP, float& mass_check);
__global__ void PoleMassTestKernel(CartpoleDynamics* CP, float& mass_check);
__global__ void PoleLengthTestKernel(CartpoleDynamics* CP, float& length_check);
__global__ void GravityTestKernel(CartpoleDynamics* CP, float& gravity_check);
__global__ void DynamicsTestKernel(CartpoleDynamics* CP, float* state, float* control, float* state_der);

// Host wrappers: allocate device memory, launch the matching kernel, copy back.
void launchCartMassTestKernel(const CartpoleDynamics&, float& mass_check);
void launchPoleMassTestKernel(const CartpoleDynamics&, float& mass_check);
void launchPoleLengthTestKernel(const CartpoleDynamics&, float& length_check);
void launchGravityTestKernel(const CartpoleDynamics&, float& gravity_check);
void launchDynamicsTestKernel(const CartpoleDynamics&, float* state_cpu, float* control_cpu, float* state_der_cpu);

#ifdef __CUDACC__
#include "cartpole_dynamics_kernel_test.cu"
#endif
#endif  //!
KERNEL_TESTS_DYNAMICS_CARTPOLE_CARTPOLE_KERNEL_TEST_CUH_
================================================
FILE: tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu
================================================
#include  // NOTE(review): include target lost in extraction — presumably the matching .cuh header

// NOTE(review): the text extraction stripped every <...> span in this file —
// template parameter lists, explicit template arguments, <<<...>>> launch
// configurations, static_cast target types, and std::array / Eigen::Matrix
// element lists. They are reconstructed below from parameter usage and the
// header declarations; verify exact arities against the repository.

// Copies the device-side params struct of `class_t` back through `params`.
template <class CLASS_T, class PARAMS_T>
__global__ void parameterTestKernel(CLASS_T* class_t, PARAMS_T& params)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // single thread reads so concurrent writes cannot race
  if (tid == 0)
  {
    params = class_t->getParams();
  }
}

template <class CLASS_T, class PARAMS_T>
void launchParameterTestKernel(CLASS_T& class_t, PARAMS_T& params)
{
  PARAMS_T* params_d;
  HANDLE_ERROR(cudaMalloc((void**)&params_d, sizeof(PARAMS_T)))
  parameterTestKernel<<<1, 1>>>(static_cast<CLASS_T*>(class_t.model_d_), *params_d);
  HANDLE_ERROR(cudaGetLastError());
  // Copy the memory back to the host
  HANDLE_ERROR(cudaMemcpy(&params, params_d, sizeof(PARAMS_T), cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
  cudaFree(params_d);
}

// Reports the grid- and block-level shared memory byte requirements of the
// device-side dynamics object: output_d[0] = grid, output_d[1] = block.
template <class DYN_T>
__global__ void getSharedMemorySizesKernel(DYN_T* __restrict__ dynamics, int* __restrict__ output_d)
{
  output_d[0] = dynamics->getGrdSharedSizeBytes();
  output_d[1] = dynamics->getBlkSharedSizeBytes();
}

template <class DYN_T>
void launchGetSharedMemorySizesKernel(DYN_T& dynamics, int shared_mem_sizes[2])
{
  int* shared_mem_sizes_d;
  HANDLE_ERROR(cudaMalloc((void**)&shared_mem_sizes_d, sizeof(int) * 2));
  getSharedMemorySizesKernel<<<1, 1>>>(dynamics.model_d_, shared_mem_sizes_d);
  HANDLE_ERROR(cudaGetLastError());
  // Copy the memory back to the host
  HANDLE_ERROR(cudaMemcpy(shared_mem_sizes, shared_mem_sizes_d, sizeof(int) * 2, cudaMemcpyDeviceToHost));
  HANDLE_ERROR(cudaFree(shared_mem_sizes_d));
}

// Copies the C_DIM control ranges out of the device-side dynamics object.
template <class DYNAMICS_T, int C_DIM>
__global__ void controlRangesTestKernel(DYNAMICS_T* dynamics, float2* control_rngs)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid == 0)
  {
    float2* raw_ptr = dynamics->getControlRangesRaw();
    for (int i = 0; i < C_DIM; i++)
    {
      control_rngs[i].x = raw_ptr[i].x;
      control_rngs[i].y = raw_ptr[i].y;
    }
  }
}

template <class DYNAMICS_T, int C_DIM>
void launchControlRangesTestKernel(DYNAMICS_T& dynamics, std::array<float2, C_DIM>& control_rngs)
{
  float2* ranges_d;
  HANDLE_ERROR(cudaMalloc((void**)&ranges_d, sizeof(float2) * control_rngs.size()))
  controlRangesTestKernel<DYNAMICS_T, C_DIM><<<1, 1>>>(static_cast<DYNAMICS_T*>(dynamics.model_d_), ranges_d);
  HANDLE_ERROR(cudaGetLastError());
  // Copy the memory back to the host
  HANDLE_ERROR(
      cudaMemcpy(control_rngs.data(), ranges_d, sizeof(float2) * control_rngs.size(), cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
  cudaFree(ranges_d);
}

// One thread per (state, control) pair clamps its pair in place.
template <class DYNAMICS_T, int S_DIM, int C_DIM>
__global__ void enforceConstraintTestKernel(DYNAMICS_T* dynamics, float* state, float* control, int num)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num)
  {
    dynamics->enforceConstraints(&state[tid * S_DIM], &control[tid * C_DIM]);
  }
}

template <class DYNAMICS_T, int S_DIM, int C_DIM>
void launchEnforceConstraintTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, S_DIM>>& state,
                                       std::vector<std::array<float, C_DIM>>& control, int dim_y)
{
  int count = state.size();
  float* state_d;
  float* control_d;
  HANDLE_ERROR(cudaMalloc((void**)&state_d, sizeof(float) * S_DIM * state.size()))
  HANDLE_ERROR(cudaMalloc((void**)&control_d, sizeof(float) * C_DIM * control.size()))
  HANDLE_ERROR(cudaMemcpy(state_d, state.data(), sizeof(float) * S_DIM * count, cudaMemcpyHostToDevice));
  HANDLE_ERROR(cudaMemcpy(control_d, control.data(), sizeof(float) * C_DIM * count, cudaMemcpyHostToDevice));

  // One block; x indexes the query, y exercises the model's thread-y paths.
  dim3 threadsPerBlock(count, dim_y);
  dim3 numBlocks(1, 1);
  enforceConstraintTestKernel<DYNAMICS_T, S_DIM, C_DIM>
      <<<numBlocks, threadsPerBlock>>>(static_cast<DYNAMICS_T*>(dynamics.model_d_), state_d, control_d, count);
  HANDLE_ERROR(cudaGetLastError());
  // Copy the memory back to the host
  HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), cudaMemcpyDeviceToHost));
  HANDLE_ERROR(cudaMemcpy(control.data(), control_d, sizeof(float) * C_DIM * control.size(), cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
  cudaFree(state_d);
  cudaFree(control_d);
}

// One thread per state integrates state += f(state) * dt via updateState.
template <class DYNAMICS_T, int S_DIM>
__global__ void updateStateTestKernel(DYNAMICS_T* dynamics, float* state, float* state_der, float dt, int num)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num)
  {
    dynamics->updateState(state + (tid * S_DIM), state_der + (tid * S_DIM), dt);
  }
}

template <class DYNAMICS_T, int S_DIM>
void launchUpdateStateTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, S_DIM>>& state,
                                 std::vector<std::array<float, S_DIM>>& state_der, float dt, int dim_y)
{
  int count = state.size();
  float* state_d;
  float* state_der_d;
  HANDLE_ERROR(cudaMalloc((void**)&state_d, sizeof(float) * S_DIM * count))
  HANDLE_ERROR(cudaMalloc((void**)&state_der_d, sizeof(float) * S_DIM * count))
  HANDLE_ERROR(cudaMemcpy(state_d, state.data(), sizeof(float) * S_DIM * count, cudaMemcpyHostToDevice));
  HANDLE_ERROR(cudaMemcpy(state_der_d, state_der.data(), sizeof(float) * S_DIM * count, cudaMemcpyHostToDevice));

  dim3 threadsPerBlock(count, dim_y);
  dim3 numBlocks(1, 1);
  updateStateTestKernel<DYNAMICS_T, S_DIM>
      <<<numBlocks, threadsPerBlock>>>(static_cast<DYNAMICS_T*>(dynamics.model_d_), state_d, state_der_d, dt, count);
  HANDLE_ERROR(cudaGetLastError());
  // Copy the memory back to the host
  HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), cudaMemcpyDeviceToHost));
  HANDLE_ERROR(
      cudaMemcpy(state_der.data(), state_der_d, sizeof(float) * S_DIM * state_der.size(), cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
  cudaFree(state_d);
  cudaFree(state_der_d);
}

// Kinematics only uses thread y == 0 of each query column.
template <class DYNAMICS_T, int S_DIM>
__global__ void computeKinematicsTestKernel(DYNAMICS_T* dynamics, float* state, float* state_der, int num)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num && threadIdx.y == 0)
  {
    dynamics->computeKinematics(state + (tid * S_DIM), state_der + (tid * S_DIM));
  }
}

template <class DYNAMICS_T, int S_DIM>
void launchComputeKinematicsTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, S_DIM>>& state,
                                       std::vector<std::array<float, S_DIM>>& state_der, int dim_y)
{
  int count = state.size();
  float* state_d;
  float* state_der_d;
  HANDLE_ERROR(cudaMalloc((void**)&state_d, sizeof(float) * S_DIM * state.size()))
  HANDLE_ERROR(cudaMalloc((void**)&state_der_d, sizeof(float) * S_DIM * state_der.size()))
  HANDLE_ERROR(cudaMemcpy(state_d, state.data(), sizeof(float) * S_DIM * count, cudaMemcpyHostToDevice));
  HANDLE_ERROR(cudaMemcpy(state_der_d, state_der.data(), sizeof(float) * S_DIM * count, cudaMemcpyHostToDevice));

  dim3 threadsPerBlock(count, dim_y);
  dim3 numBlocks(1, 1);
  computeKinematicsTestKernel<DYNAMICS_T, S_DIM>
      <<<numBlocks, threadsPerBlock>>>(static_cast<DYNAMICS_T*>(dynamics.model_d_), state_d, state_der_d, count);
  HANDLE_ERROR(cudaGetLastError());
  // Copy the memory back to the host
  HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), cudaMemcpyDeviceToHost));
  HANDLE_ERROR(
      cudaMemcpy(state_der.data(), state_der_d, sizeof(float) * S_DIM * state_der.size(), cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
  cudaFree(state_d);
  cudaFree(state_der_d);
}

// Dynamic shared memory layout: [output buffer | theta scratch for the model].
template <class DYNAMICS_T, int S_DIM, int C_DIM>
__global__ void computeDynamicsTestKernel(DYNAMICS_T* model, float* state, float* control, float* state_der, int count)
{
  extern __shared__ float entire_buffer[];
  float* output = entire_buffer;
  float* theta = &output[mppi::math::nearest_multiple_4(blockDim.x * DYNAMICS_T::OUTPUT_DIM)];
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  model->initializeDynamics(state, control, output, theta, 0.0f, 0.0f);
  __syncthreads();
  if (tid < count)
  {
    model->computeDynamics(state + (tid * S_DIM), control + (tid * C_DIM), state_der + (tid * S_DIM), theta);
  }
}

template <class DYNAMICS_T, int S_DIM, int C_DIM, int BLOCKDIM_X>
void launchComputeDynamicsTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, S_DIM>>& state,
                                     std::vector<std::array<float, C_DIM>>& control,
                                     std::vector<std::array<float, S_DIM>>& state_der, int dim_y)
{
  int count = state.size();
  float* state_d;
  float* state_der_d;
  float* control_d;
  HANDLE_ERROR(cudaMalloc((void**)&state_d, sizeof(float) * S_DIM * count))
  HANDLE_ERROR(cudaMalloc((void**)&state_der_d, sizeof(float) * S_DIM * count))
  HANDLE_ERROR(cudaMalloc((void**)&control_d, sizeof(float) * C_DIM * count))
  HANDLE_ERROR(cudaMemcpy(state_d, state.data(), sizeof(float) * S_DIM * count, cudaMemcpyHostToDevice));
  HANDLE_ERROR(cudaMemcpy(state_der_d, state_der.data(), sizeof(float) * S_DIM * count, cudaMemcpyHostToDevice));
  HANDLE_ERROR(cudaMemcpy(control_d, control.data(), sizeof(float) * C_DIM * count, cudaMemcpyHostToDevice));

  // make sure you cannot use invalid inputs
  const int gridsize_x = (count - 1) / BLOCKDIM_X + 1;
  dim3 threadsPerBlock(BLOCKDIM_X, dim_y);
  dim3 numBlocks(gridsize_x, 1);
  // NOTE(review): the second term counts floats, not bytes — the source as
  // extracted has no `* sizeof(float)` here; confirm against the repository
  // whether the shared-memory request is intentionally sized this way.
  unsigned shared_mem = mppi::kernels::calcClassSharedMemSize(&dynamics, threadsPerBlock) +
                        mppi::math::nearest_multiple_4(threadsPerBlock.x * DYNAMICS_T::OUTPUT_DIM);
  // launch kernel
  computeDynamicsTestKernel<DYNAMICS_T, S_DIM, C_DIM>
      <<<numBlocks, threadsPerBlock, shared_mem>>>(dynamics.model_d_, state_d, control_d, state_der_d, count);
  HANDLE_ERROR(cudaGetLastError());
  HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * count, cudaMemcpyDeviceToHost));
  HANDLE_ERROR(cudaMemcpy(state_der.data(), state_der_d, sizeof(float) * S_DIM * count, cudaMemcpyDeviceToHost));
  HANDLE_ERROR(cudaMemcpy(control.data(), control_d, sizeof(float) * C_DIM * count, cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
  cudaFree(state_d);
  cudaFree(state_der_d);
  cudaFree(control_d);
}

template <class DYNAMICS_T, int S_DIM, int C_DIM>
__global__ void computeStateDerivTestKernel(DYNAMICS_T* dynamics, float* state, float* control, float* state_der,
                                            int num)
{
  extern __shared__ float entire_buffer[];
  float* output = entire_buffer;
  float* theta = &output[mppi::math::nearest_multiple_4(blockDim.x * DYNAMICS_T::OUTPUT_DIM)];
  dynamics->initializeDynamics(state, control, output, theta, 0.0f, 0.0f);
  __syncthreads();
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num)
  {
    dynamics->computeStateDeriv(state + (tid * S_DIM), control + (tid * C_DIM), state_der + (tid * S_DIM), theta);
  }
}

template <class DYNAMICS_T, int S_DIM, int C_DIM, int BLOCKDIM_X>
void launchComputeStateDerivTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, S_DIM>>& state,
                                       std::vector<std::array<float, C_DIM>>& control,
                                       std::vector<std::array<float, S_DIM>>& state_der, int dim_y)
{
  int count = state.size();
  float* state_d;
  float* control_d;
  float* state_der_d;
  HANDLE_ERROR(cudaMalloc((void**)&state_d, sizeof(float) * S_DIM * state.size()))
  HANDLE_ERROR(cudaMalloc((void**)&state_der_d, sizeof(float) * S_DIM * state_der.size()))
  HANDLE_ERROR(cudaMalloc((void**)&control_d, sizeof(float) * C_DIM * control.size()))
  HANDLE_ERROR(cudaMemcpy(state_d, state.data(), sizeof(float) * S_DIM * count, cudaMemcpyHostToDevice));
  HANDLE_ERROR(cudaMemcpy(state_der_d, state_der.data(), sizeof(float) * S_DIM * count, cudaMemcpyHostToDevice));
  HANDLE_ERROR(cudaMemcpy(control_d, control.data(), sizeof(float) * C_DIM * count, cudaMemcpyHostToDevice));

  const int gridsize_x = (count - 1) / BLOCKDIM_X + 1;
  dim3 threadsPerBlock(BLOCKDIM_X, dim_y);
  dim3 numBlocks(gridsize_x, 1);
  unsigned shared_mem = mppi::kernels::calcClassSharedMemSize(&dynamics, threadsPerBlock) +
                        mppi::math::nearest_multiple_4(threadsPerBlock.x * DYNAMICS_T::OUTPUT_DIM);
  computeStateDerivTestKernel<DYNAMICS_T, S_DIM, C_DIM><<<numBlocks, threadsPerBlock, shared_mem>>>(
      static_cast<DYNAMICS_T*>(dynamics.model_d_), state_d, control_d, state_der_d, num);
  HANDLE_ERROR(cudaGetLastError());
  // Copy the memory back to the host
  HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), cudaMemcpyDeviceToHost));
  HANDLE_ERROR(
      cudaMemcpy(state_der.data(), state_der_d, sizeof(float) * S_DIM * state_der.size(), cudaMemcpyDeviceToHost));
  HANDLE_ERROR(cudaMemcpy(control.data(), control_d, sizeof(float) * C_DIM * control.size(), cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
  cudaFree(state_d);
  cudaFree(state_der_d);
  cudaFree(control_d);
}

// Exercises the full step() path (constraints + dynamics + integration).
template <class DYNAMICS_T>
__global__ void stepTestKernel(DYNAMICS_T* dynamics, float* state, float* control, float* state_der,
                               float* next_state, float* output, int t, float dt, int num)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  extern __shared__ float entire_buffer[];
  float* theta = entire_buffer;
  // float* theta = reinterpret_cast<float*>(theta_s4);
  // Per-query pointers; computed unconditionally but only dereferenced when
  // tid < num, so out-of-range threads never touch memory.
  float* x = state + (tid * DYNAMICS_T::STATE_DIM);
  float* x_dot = state_der + (tid * DYNAMICS_T::STATE_DIM);
  float* x_next = next_state + (tid * DYNAMICS_T::STATE_DIM);
  float* u = control + (tid * DYNAMICS_T::CONTROL_DIM);
  float* y = output + (tid * DYNAMICS_T::OUTPUT_DIM);
  if (tid < num)
  {
    dynamics->initializeDynamics(state, control, output, theta, 0.0f, dt);
  }
  __syncthreads();
  if (tid < num)
  {
    dynamics->enforceConstraints(x, u);
    dynamics->step(x, x_next, x_dot, u, y, theta, t, dt);
  }
}

template <class DYNAMICS_T>  // here for compatability
void launchStepTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, DYNAMICS_T::STATE_DIM>>& state,
                          std::vector<std::array<float, DYNAMICS_T::CONTROL_DIM>>& control,
                          std::vector<std::array<float, DYNAMICS_T::STATE_DIM>>& state_der,
                          std::vector<std::array<float, DYNAMICS_T::STATE_DIM>>& next_state, int t, float dt,
                          int dim_y, int dim_x = 32)
{
  // Forward with a scratch output buffer the caller does not care about.
  std::vector<std::array<float, DYNAMICS_T::OUTPUT_DIM>> output(state.size());
  launchStepTestKernel<DYNAMICS_T>(dynamics, state, control, state_der, next_state, output, t, dt, dim_y, dim_x);
}

template <class DYNAMICS_T>
void launchStepTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, DYNAMICS_T::STATE_DIM>>& state,
                          std::vector<std::array<float, DYNAMICS_T::CONTROL_DIM>>& control,
                          std::vector<std::array<float, DYNAMICS_T::STATE_DIM>>& state_der,
                          std::vector<std::array<float, DYNAMICS_T::STATE_DIM>>& next_state,
                          std::vector<std::array<float, DYNAMICS_T::OUTPUT_DIM>>& output, int t, float dt, int dim_y,
                          int dim_x)
{
  // Validate parallel-array sizes before touching the GPU.
  if (state.size() != control.size())
  {
    std::cerr << "Num States doesn't match num controls" << std::endl;
    return;
  }
  if (state.size() != state_der.size())
  {
    std::cerr << "Num States doesn't match num state_ders" << std::endl;
    return;
  }
  if (state.size() != next_state.size())
  {
    std::cerr << "Num States doesn't match num next_states" << std::endl;
    return;
  }
  if (state.size() % dim_x != 0)
  {
    std::cerr << "Num States needs be evenly divisible by dim_x: " << state.size() << " state queries, " << dim_x
              << " parallelizable threads per block" << std::endl;
    return;
  }
  int count = state.size();
  float* state_d;
  float* control_d;
  float* state_der_d;
  float* next_state_d;
  float* output_d;
  HANDLE_ERROR(cudaMalloc((void**)&state_d, sizeof(float) * DYNAMICS_T::STATE_DIM * count));
  HANDLE_ERROR(cudaMalloc((void**)&state_der_d, sizeof(float) * DYNAMICS_T::STATE_DIM * count));
  HANDLE_ERROR(cudaMalloc((void**)&next_state_d, sizeof(float) * DYNAMICS_T::STATE_DIM * count));
  HANDLE_ERROR(cudaMalloc((void**)&control_d, sizeof(float) * DYNAMICS_T::CONTROL_DIM * count));
  HANDLE_ERROR(cudaMalloc((void**)&output_d, sizeof(float) * DYNAMICS_T::OUTPUT_DIM * count));
  HANDLE_ERROR(
      cudaMemcpy(state_d, state.data(), sizeof(float) * DYNAMICS_T::STATE_DIM * count, cudaMemcpyHostToDevice));
  // state_der / next_state are pure outputs, so they are intentionally not
  // uploaded (the original upload code was already commented out).
  HANDLE_ERROR(
      cudaMemcpy(control_d, control.data(), sizeof(float) * DYNAMICS_T::CONTROL_DIM * count, cudaMemcpyHostToDevice));

  const int gridsize_x = (count - 1) / dim_x + 1;
  dim3 threadsPerBlock(dim_x, dim_y);
  dim3 numBlocks(gridsize_x, 1);
  unsigned shared_mem = mppi::kernels::calcClassSharedMemSize(&dynamics, threadsPerBlock);
  stepTestKernel<DYNAMICS_T><<<numBlocks, threadsPerBlock, shared_mem>>>(
      dynamics.model_d_, state_d, control_d, state_der_d, next_state_d, output_d, t, dt, count);
  HANDLE_ERROR(cudaGetLastError());
  // Copy the memory back to the host
  HANDLE_ERROR(
      cudaMemcpy(state.data(), state_d, sizeof(float) * DYNAMICS_T::STATE_DIM * count, cudaMemcpyDeviceToHost));
  HANDLE_ERROR(cudaMemcpy(state_der.data(), state_der_d, sizeof(float) * DYNAMICS_T::STATE_DIM * count,
                          cudaMemcpyDeviceToHost));
  HANDLE_ERROR(cudaMemcpy(next_state.data(), next_state_d, sizeof(float) * DYNAMICS_T::STATE_DIM * count,
                          cudaMemcpyDeviceToHost));
  HANDLE_ERROR(
      cudaMemcpy(control.data(), control_d, sizeof(float) * DYNAMICS_T::CONTROL_DIM * count, cudaMemcpyDeviceToHost));
  HANDLE_ERROR(
      cudaMemcpy(output.data(), output_d, sizeof(float) * DYNAMICS_T::OUTPUT_DIM * count, cudaMemcpyDeviceToHost));
  cudaDeviceSynchronize();
  cudaFree(state_d);
  cudaFree(state_der_d);
  cudaFree(control_d);
  cudaFree(next_state_d);
  cudaFree(output_d);
}

/**
 * Cross-checks the GPU step() path against the CPU step() for num_points
 * random (state, control) samples, sweeping the thread-y dimension from 1 to
 * max_y_dim. gtest EXPECT macros report mismatches per sample/dim.
 */
template <class DYN_T>
void checkGPUComputationStep(DYN_T& dynamics, float dt, int max_y_dim, int x_dim,
                             typename DYN_T::buffer_trajectory buffer, double tol)
{
  CudaCheckError();
  dynamics.GPUSetup();
  CudaCheckError();

  const int num_points = 1024;
  Eigen::Matrix<float, DYN_T::CONTROL_DIM, num_points> control_trajectory;
  control_trajectory = Eigen::Matrix<float, DYN_T::CONTROL_DIM, num_points>::Random();
  Eigen::Matrix<float, DYN_T::STATE_DIM, num_points> state_trajectory;
  state_trajectory = Eigen::Matrix<float, DYN_T::STATE_DIM, num_points>::Random();

  std::vector<std::array<float, DYN_T::STATE_DIM>> s(num_points);
  std::vector<std::array<float, DYN_T::STATE_DIM>> s_der(num_points);
  std::vector<std::array<float, DYN_T::STATE_DIM>> s_next(num_points);
  std::vector<std::array<float, DYN_T::OUTPUT_DIM>> output(num_points);
  // steering, throttle
  std::vector<std::array<float, DYN_T::CONTROL_DIM>> u(num_points);
  for (int state_index = 0; state_index < s.size(); state_index++)
  {
    for (int dim = 0; dim < s[0].size(); dim++)
    {
      s[state_index][dim] = state_trajectory.col(state_index)(dim);
    }
    for (int dim = 0; dim < u[0].size(); dim++)
    {
      u[state_index][dim] = control_trajectory.col(state_index)(dim);
    }
  }

  // Run dynamics on GPU
  for (int y_dim = 1; y_dim <= max_y_dim; y_dim++)
  {
    if (dynamics.checkRequiresBuffer())
    {
      dynamics.updateFromBuffer(buffer);
    }
    launchStepTestKernel<DYN_T>(dynamics, s, u, s_der, s_next, output, 0, dt, y_dim, x_dim);
    for (int point = 0; point < num_points; point++)
    {
      typename DYN_T::state_array state = state_trajectory.col(point);
      typename DYN_T::state_array next_state = DYN_T::state_array::Zero();
      typename DYN_T::control_array control = control_trajectory.col(point);
      typename DYN_T::state_array state_der_cpu = DYN_T::state_array::Zero();
      typename DYN_T::output_array output_array_cpu = DYN_T::output_array::Zero();

      dynamics.initializeDynamics(state, control, output_array_cpu, 0, dt);
      dynamics.step(state, next_state, state_der_cpu, control, output_array_cpu, 0.0f, dt);
      for (int dim = 0; dim < DYN_T::STATE_DIM; dim++)
      {
        EXPECT_NEAR(state(dim), s[point][dim], tol)
            << "at sample " << point << ", state dim: " << dim << " with y_dim " << y_dim;
        EXPECT_TRUE(isfinite(s[point][dim]));
      }
      for (int dim = 0; dim < DYN_T::CONTROL_DIM; dim++)
      {
        EXPECT_NEAR(control(dim), u[point][dim], tol)
            << "at sample " << point << ", control dim: " << dim << " with y_dim " << y_dim;
        EXPECT_TRUE(isfinite(u[point][dim]));
      }
      for (int dim = 0; dim < DYN_T::STATE_DIM; dim++)
      {
        EXPECT_NEAR(state_der_cpu(dim), s_der[point][dim], tol)
            << "at sample " << point << ", state deriv dim: " << dim << " with y_dim " << y_dim;
        EXPECT_TRUE(isfinite(s_der[point][dim]));
      }
      for (int dim = 0; dim < DYN_T::STATE_DIM; dim++)
      {
        EXPECT_NEAR(next_state(dim), s_next[point][dim], tol)
            << "at sample " << point << ", next state dim: " << dim << " with y_dim " << y_dim;
        // FIX(review): previously checked s[point][dim] again; the next-state
        // buffer is the value this loop is validating.
        EXPECT_TRUE(isfinite(s_next[point][dim]));
      }
      for (int dim = 0; dim < DYN_T::OUTPUT_DIM; dim++)
      {
        if (isnan(output_array_cpu(dim)) && isnan(output[point][dim]))
        {
          continue;
        }
        EXPECT_NEAR(output_array_cpu(dim), output[point][dim], tol * 1000)  // TODO this is a stupid hack
            << "at sample " << point << ", output dim: " << dim << " with y_dim " << y_dim;
        // FIX(review): previously indexed s_der (STATE_DIM floats) with an
        // OUTPUT_DIM index — out of bounds when OUTPUT_DIM > STATE_DIM.
        EXPECT_TRUE(isfinite(output[point][dim]));
      }
    }
  }
  dynamics.freeCudaMem();
}

================================================
FILE: tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cuh
================================================
#ifndef DYNAMICS_GENERIC_KERNEL_TESTS_CUH_
#define DYNAMICS_GENERIC_KERNEL_TESTS_CUH_

#include  // NOTE(review): three include targets lost in extraction
#include
#include

// NOTE(review): template parameter lists below were stripped by extraction and
// reconstructed to match the definitions in the .cu file; verify arities.
template <class CLASS_T, class PARAMS_T>
__global__ void parameterTestKernel(CLASS_T* class_t, PARAMS_T& params);
template <class CLASS_T, class PARAMS_T>
void launchParameterTestKernel(CLASS_T& class_t, PARAMS_T& params);

template <class DYNAMICS_T, int C_DIM>
__global__ void controlRangesTestKernel(DYNAMICS_T* dynamics, float2* control_rngs);
template <class DYNAMICS_T, int C_DIM>
void launchControlRangesTestKernel(DYNAMICS_T& dynamics, std::array<float2, C_DIM>& control_rngs);

template <class DYNAMICS_T, int S_DIM, int C_DIM>
__global__ void enforceConstraintTestKernel(DYNAMICS_T* dynamics, float* state, float* control, int num);
template <class DYNAMICS_T, int S_DIM, int C_DIM>
void launchEnforceConstraintTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, S_DIM>>& state,
                                       std::vector<std::array<float, C_DIM>>& control, int dim_y);

template <class DYNAMICS_T, int S_DIM>
__global__ void computeKinematicsTestKernel(DYNAMICS_T* model, float* state, float* state_der, int num);
template <class DYNAMICS_T, int S_DIM>
void launchComputeKinematicsTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, S_DIM>>& state,
                                       std::vector<std::array<float, S_DIM>>& state_der, int dim_y);

template <class DYNAMICS_T, int S_DIM, int C_DIM>
__global__ void computeDynamicsTestKernel(DYNAMICS_T* model, float* state, float* control, float* state_der,
                                          int count);
template <class DYNAMICS_T, int S_DIM, int C_DIM, int BLOCKDIM_X>
void launchComputeDynamicsTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, S_DIM>>& state,
                                     std::vector<std::array<float, C_DIM>>& control,
                                     std::vector<std::array<float, S_DIM>>& state_der, int dim_y);

template <class DYNAMICS_T, int S_DIM>
__global__ void updateStateTestKernel(DYNAMICS_T* dynamics, float* state, float* state_der, float dt, int num);
template <class DYNAMICS_T, int S_DIM>
void launchUpdateStateTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, S_DIM>>& state,
                                 std::vector<std::array<float, S_DIM>>& state_der, float dt, int dim_y);

template <class DYNAMICS_T, int S_DIM, int C_DIM>
__global__ void computeStateDerivTestKernel(DYNAMICS_T* dynamics, float* state, float*
control, float* state_der, int num);
template <class DYNAMICS_T, int S_DIM, int C_DIM, int BLOCKDIM_X>
void launchComputeStateDerivTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, S_DIM>>& state,
                                       std::vector<std::array<float, C_DIM>>& control,
                                       std::vector<std::array<float, S_DIM>>& state_der, int dim_y);

template <class DYN_T>
__global__ void getSharedMemorySizesKernel(DYN_T* __restrict__ dynamics, int* __restrict__ output_d);
template <class DYN_T>
void launchGetSharedMemorySizesKernel(DYN_T& dynamics, int shared_mem_sizes[2]);

template <class DYN_T>
void checkGPUComputationStep(DYN_T& dynamics, float dt, int max_y_dim, int x_dim,
                             typename DYN_T::buffer_trajectory buffer, double tol = 1.0e-5);

#if __CUDACC__
#include "dynamics_generic_kernel_tests.cu"
#endif
#endif  // DYNAMICS_GENERIC_KERNEL_TESTS_CUH_

================================================
FILE: tests/include/kernel_tests/shaping_functions/shaping_function_kernels_tests.cu
================================================
#include "shaping_function_kernels_tests.cuh"

// NOTE(review): template parameter lists in this file were stripped by the
// extraction and are reconstructed from usage; verify against the repository.

/**
 * Runs the free-standing mppi_common::weightKernel on NUM_ROLLOUTS costs and
 * returns the normalized weights through normalized_compute.
 */
template <class CLASS_T, int NUM_ROLLOUTS>
void launchShapingFunction_KernelTest(std::array<float, NUM_ROLLOUTS>& trajectory_costs_host, CLASS_T& shape_function,
                                      float baseline, std::array<float, NUM_ROLLOUTS>& normalized_compute)
{
  // Allocate CUDA memory
  float* trajectory_costs_d;
  HANDLE_ERROR(cudaMalloc((void**)&trajectory_costs_d, sizeof(float) * trajectory_costs_host.size()))
  HANDLE_ERROR(cudaMemcpy(trajectory_costs_d, trajectory_costs_host.data(),
                          sizeof(float) * trajectory_costs_host.size(), cudaMemcpyHostToDevice))

  // One block, one thread per rollout.
  mppi_common::weightKernel<<<1, NUM_ROLLOUTS>>>(trajectory_costs_d, baseline, shape_function.shaping_function_d_);
  CudaCheckError();

  HANDLE_ERROR(cudaMemcpy(normalized_compute.data(), trajectory_costs_d, sizeof(float) * trajectory_costs_host.size(),
                          cudaMemcpyDeviceToHost))
  cudaFree(trajectory_costs_d);
}

/**
 * Same as above but drives the shaping function's own launchWeightKernel on
 * the caller-provided stream.
 */
template <class CLASS_T, int NUM_ROLLOUTS>
void launchShapingFunction_KernelTest(typename CLASS_T::cost_traj& trajectory_costs_host, CLASS_T& shape_function,
                                      float baseline, std::array<float, NUM_ROLLOUTS>& normalized_compute,
                                      cudaStream_t stream)
{
  // Allocate CUDA memory
  float* trajectory_costs_d;
  HANDLE_ERROR(cudaMalloc((void**)&trajectory_costs_d, sizeof(float) * trajectory_costs_host.size()))
  HANDLE_ERROR(cudaMemcpy(trajectory_costs_d, trajectory_costs_host.data(),
                          sizeof(float) * trajectory_costs_host.size(), cudaMemcpyHostToDevice))

  shape_function.launchWeightKernel(trajectory_costs_d, baseline, stream);
  CudaCheckError();

  HANDLE_ERROR(cudaMemcpy(normalized_compute.data(), trajectory_costs_d, sizeof(float) * trajectory_costs_host.size(),
                          cudaMemcpyDeviceToHost))
  cudaFree(trajectory_costs_d);
}

================================================
FILE: tests/include/kernel_tests/shaping_functions/shaping_function_kernels_tests.cuh
================================================
#ifndef MPPIGENERIC_SHAPING_FUNCTION_KERNEL_TEST_CUH
#define MPPIGENERIC_SHAPING_FUNCTION_KERNEL_TEST_CUH

#include  // NOTE(review): two include targets lost in extraction
#include

template <class CLASS_T, int NUM_ROLLOUTS>
void launchShapingFunction_KernelTest(typename CLASS_T::cost_traj& trajectory_costs_host, CLASS_T& shape_function,
                                      float baseline, std::array<float, NUM_ROLLOUTS>& normalized_compute,
                                      cudaStream_t stream = nullptr);

template <class CLASS_T, int NUM_ROLLOUTS>
void launchShapingFunction_KernelTest(std::array<float, NUM_ROLLOUTS>& trajectory_costs_host, CLASS_T& shape_function,
                                      float baseline, std::array<float, NUM_ROLLOUTS>& normalized_compute);

#if __CUDACC__
#include "shaping_function_kernels_tests.cu"
#endif
#endif  // MPPIGENERIC_SHAPING_FUNCTION_KERNEL_TEST_CUH

================================================
FILE: tests/include/kernel_tests/utils/network_helper_kernel_test.cuh
================================================
//
// Created by jason on 8/19/22.
// #ifndef MPPIGENERIC_NETWORK_HELPER_KERNEL_TEST_CUH #define MPPIGENERIC_NETWORK_HELPER_KERNEL_TEST_CUH #include #include // TODO check on multiple different blocks template __global__ void parameterCheckTestKernel(NETWORK_T* model, float* theta, int* stride, int* net_structure, float* shared_theta, int* shared_stride, int* shared_net_structure) { extern __shared__ float theta_s[]; model->initialize(theta_s); float* theta_shared_ptr = theta_s; int* stride_shared_ptr = (int*)(theta_s + model->getNumParams()); int* structure_shared_ptr = (int*)(theta_s + model->getNumParams() + model->getStrideSize()); for (int i = 0; i < model->getNumParams(); i++) { theta[i] = model->getThetaPtr()[i]; shared_theta[i] = theta_shared_ptr[i]; } for (int i = 0; i < model->getStrideSize(); i++) { stride[i] = model->getStrideIdcsPtr()[i]; shared_stride[i] = stride_shared_ptr[i]; } for (int i = 0; i < model->getNumLayers(); i++) { net_structure[i] = model->getNetStructurePtr()[i]; shared_net_structure[i] = structure_shared_ptr[i]; } } template void launchParameterCheckTestKernel(NETWORK_T& model, std::array& theta, std::array& stride, std::array& net_structure, std::array& shared_theta, std::array& shared_stride, std::array& shared_net_structure) { float* theta_d; int* stride_d; int* net_structure_d; float* shared_theta_d; int* shared_stride_d; int* shared_net_structure_d; HANDLE_ERROR(cudaMalloc((void**)&theta_d, sizeof(float) * theta.size())) HANDLE_ERROR(cudaMalloc((void**)&stride_d, sizeof(int) * stride.size())) HANDLE_ERROR(cudaMalloc((void**)&net_structure_d, sizeof(int) * net_structure.size())) HANDLE_ERROR(cudaMalloc((void**)&shared_theta_d, sizeof(float) * theta.size())) HANDLE_ERROR(cudaMalloc((void**)&shared_stride_d, sizeof(int) * stride.size())) HANDLE_ERROR(cudaMalloc((void**)&shared_net_structure_d, sizeof(int) * net_structure.size())) dim3 threadsPerBlock(3, 1); dim3 numBlocks(10, 1); unsigned shared_size = mppi::kernels::calcClassSharedMemSize(&model, threadsPerBlock); 
parameterCheckTestKernel<<>>( model.network_d_, theta_d, stride_d, net_structure_d, shared_theta_d, shared_stride_d, shared_net_structure_d); CudaCheckError(); HANDLE_ERROR(cudaMemcpy(theta.data(), theta_d, sizeof(float) * theta.size(), cudaMemcpyDeviceToHost)) HANDLE_ERROR(cudaMemcpy(stride.data(), stride_d, sizeof(int) * stride.size(), cudaMemcpyDeviceToHost)) HANDLE_ERROR( cudaMemcpy(net_structure.data(), net_structure_d, sizeof(int) * net_structure.size(), cudaMemcpyDeviceToHost)) HANDLE_ERROR(cudaMemcpy(shared_theta.data(), shared_theta_d, sizeof(float) * theta.size(), cudaMemcpyDeviceToHost)) HANDLE_ERROR(cudaMemcpy(shared_stride.data(), shared_stride_d, sizeof(int) * stride.size(), cudaMemcpyDeviceToHost)) HANDLE_ERROR(cudaMemcpy(shared_net_structure.data(), shared_net_structure_d, sizeof(int) * net_structure.size(), cudaMemcpyDeviceToHost)) cudaDeviceSynchronize(); cudaFree(theta_d); cudaFree(stride_d); cudaFree(net_structure_d); cudaFree(shared_theta_d); cudaFree(shared_stride_d); cudaFree(shared_net_structure_d); } template __global__ void parameterCheckTestKernel(NETWORK_T* model, float* lstm_params, float* shared_lstm_params, float* fnn_params, float* shared_fnn_params) { extern __shared__ float theta_s[]; model->initialize(theta_s); uint tid = blockIdx.x; float* lstm_params_start = lstm_params + tid * (model->getLSTMGrdSharedSizeBytes() / sizeof(float) + 2 * model->getHiddenDim()); memcpy(lstm_params_start, model->getWeights(), model->getLSTMGrdSharedSizeBytes() + 2 * model->getHiddenDim() * sizeof(float)); float* fnn_params_start = fnn_params + tid * model->getOutputGrdSharedSizeBytes() / sizeof(float); memcpy(fnn_params_start, model->getOutputWeights(), model->getOutputGrdSharedSizeBytes()); float* shared_lstm_params_start = shared_lstm_params + tid * (model->getLSTMGrdSharedSizeBytes() / sizeof(float) + 2 * model->getHiddenDim()); memcpy(shared_lstm_params_start, theta_s, model->getLSTMGrdSharedSizeBytes()); memcpy(shared_lstm_params_start + 
model->getLSTMGrdSharedSizeBytes() / sizeof(float), theta_s + model->getGrdSharedSizeBytes() / sizeof(float), model->getHiddenDim() * 2 * sizeof(float)); float* shared_fnn_params_start = shared_fnn_params + tid * model->getOutputGrdSharedSizeBytes() / sizeof(float); memcpy(shared_fnn_params_start, theta_s + model->getLSTMGrdSharedSizeBytes() / sizeof(float), model->getOutputGrdSharedSizeBytes()); } template void launchParameterCheckTestKernel(NETWORK_T& model, std::vector& lstm_params, std::vector& shared_lstm_params, std::vector& fnn_params, std::vector& shared_fnn_params, int num) { lstm_params.resize((model.getLSTMGrdSharedSizeBytes() / sizeof(float) + 2 * model.getHiddenDim()) * num); shared_lstm_params.resize((model.getLSTMGrdSharedSizeBytes() / sizeof(float) + 2 * model.getHiddenDim()) * num); fnn_params.resize(model.getOutputGrdSharedSizeBytes() / sizeof(float) * num); shared_fnn_params.resize(model.getOutputGrdSharedSizeBytes() / sizeof(float) * num); std::fill(lstm_params.begin(), lstm_params.end(), -1); std::fill(shared_lstm_params.begin(), shared_lstm_params.end(), -1); std::fill(fnn_params.begin(), fnn_params.end(), -2); std::fill(shared_fnn_params.begin(), shared_fnn_params.end(), -2); float* lstm_params_d = nullptr; float* shared_lstm_params_d = nullptr; float* fnn_params_d = nullptr; float* shared_fnn_params_d = nullptr; HANDLE_ERROR(cudaMalloc((void**)&lstm_params_d, (model.getLSTMGrdSharedSizeBytes() + 2 * model.getHiddenDim() * sizeof(float)) * num)); HANDLE_ERROR(cudaMalloc((void**)&shared_lstm_params_d, (model.getLSTMGrdSharedSizeBytes() + 2 * model.getHiddenDim() * sizeof(float)) * num)); HANDLE_ERROR(cudaMalloc((void**)&fnn_params_d, model.getOutputGrdSharedSizeBytes() * num)); HANDLE_ERROR(cudaMalloc((void**)&shared_fnn_params_d, model.getOutputGrdSharedSizeBytes() * num)); dim3 threadsPerBlock(1, 1); dim3 numBlocks(num, 1); unsigned shared_size = mppi::kernels::calcClassSharedMemSize(&model, threadsPerBlock); parameterCheckTestKernel<<>>( 
model.network_d_, lstm_params_d, shared_lstm_params_d, fnn_params_d, shared_fnn_params_d); CudaCheckError(); HANDLE_ERROR(cudaMemcpy(lstm_params.data(), lstm_params_d, (model.getLSTMGrdSharedSizeBytes() + 2 * model.getHiddenDim() * sizeof(float)) * num, cudaMemcpyDeviceToHost)); HANDLE_ERROR(cudaMemcpy(shared_lstm_params.data(), shared_lstm_params_d, (model.getLSTMGrdSharedSizeBytes() + 2 * model.getHiddenDim() * sizeof(float)) * num, cudaMemcpyDeviceToHost)); HANDLE_ERROR( cudaMemcpy(fnn_params.data(), fnn_params_d, model.getOutputGrdSharedSizeBytes() * num, cudaMemcpyDeviceToHost)); HANDLE_ERROR(cudaMemcpy(shared_fnn_params.data(), shared_fnn_params_d, model.getOutputGrdSharedSizeBytes() * num, cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); cudaFree(lstm_params_d); cudaFree(shared_lstm_params_d); cudaFree(fnn_params_d); cudaFree(shared_fnn_params_d); } template __global__ void forwardTestKernel(NETWORK_T* network, float* input, float* output, int num, int steps) { int tid = blockIdx.x * blockDim.x + threadIdx.x; extern __shared__ float theta_s[]; float* local_input = input + (tid * network->getInputDim()); float* local_output = output + (tid * network->getOutputDim()); network->initialize(theta_s); if (tid < num) { float* curr_act = nullptr; for (uint step = 0; step < steps; step++) { curr_act = network->forward(local_input, theta_s); } for (uint i = threadIdx.y; i < network->getOutputDim(); i += blockDim.y) { local_output[i] = curr_act[i]; } __syncthreads(); } } template void launchForwardTestKernel(NETWORK_T& model, std::vector>& input, std::vector>& output, int dim_y, int steps = 1) { if (input.size() != output.size()) { std::cerr << "Num States doesn't match num controls" << std::endl; return; } int count = input.size(); float* input_d; float* output_d; HANDLE_ERROR(cudaMalloc((void**)&input_d, sizeof(float) * model.getInputDim() * count)); HANDLE_ERROR(cudaMalloc((void**)&output_d, sizeof(float) * model.getOutputDim() * count)); 
HANDLE_ERROR(cudaMemcpy(input_d, input.data(), sizeof(float) * model.getInputDim() * count, cudaMemcpyHostToDevice)); HANDLE_ERROR( cudaMemcpy(output_d, input.data(), sizeof(float) * model.getOutputDim() * count, cudaMemcpyHostToDevice)); const int gridsize_x = (count - 1) / BLOCKSIZE_X + 1; dim3 threadsPerBlock(BLOCKSIZE_X, dim_y); dim3 numBlocks(gridsize_x, 1); unsigned shared_size = mppi::kernels::calcClassSharedMemSize(&model, threadsPerBlock); forwardTestKernel <<>>(model.network_d_, input_d, output_d, count, steps); CudaCheckError(); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(input.data(), input_d, sizeof(float) * model.getInputDim() * count, cudaMemcpyDeviceToHost)); HANDLE_ERROR( cudaMemcpy(output.data(), output_d, sizeof(float) * model.getOutputDim() * count, cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); cudaFree(input_d); cudaFree(output_d); } template __global__ void forwardTestKernelPreload(NETWORK_T* network, float* input, float* output, int num, int steps) { int tid = blockIdx.x * blockDim.x + threadIdx.x; extern __shared__ float theta_s[]; float* local_input = input + (tid * network->getInputDim()); float* local_output = output + (tid * network->getOutputDim()); network->initialize(theta_s); if (tid < num) { float* input_loc = network->getInputLocation(theta_s); for (int i = threadIdx.y; i < network->getInputDim(); i += blockDim.y) { input_loc[i] = local_input[i]; } __syncthreads(); float* curr_act = nullptr; for (uint step = 0; step < steps; step++) { curr_act = network->forward(nullptr, theta_s); } for (uint i = threadIdx.y; i < network->getOutputDim(); i += blockDim.y) { local_output[i] = curr_act[i]; } __syncthreads(); } } template void launchForwardTestKernelPreload(NETWORK_T& model, std::vector>& input, std::vector>& output, int dim_y, int steps = 1) { if (input.size() != output.size()) { std::cerr << "Num States doesn't match num controls" << std::endl; return; } int count = input.size(); float* input_d; float* output_d; 
HANDLE_ERROR(cudaMalloc((void**)&input_d, sizeof(float) * INPUT_DIM * count)); HANDLE_ERROR(cudaMalloc((void**)&output_d, sizeof(float) * OUTPUT_DIM * count)); HANDLE_ERROR(cudaMemcpy(input_d, input.data(), sizeof(float) * INPUT_DIM * count, cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(output_d, input.data(), sizeof(float) * OUTPUT_DIM * count, cudaMemcpyHostToDevice)); const int gridsize_x = (count - 1) / BLOCKSIZE_X + 1; dim3 threadsPerBlock(BLOCKSIZE_X, dim_y); dim3 numBlocks(gridsize_x, 1); unsigned shared_size = mppi::kernels::calcClassSharedMemSize(&model, threadsPerBlock); forwardTestKernelPreload <<>>(model.network_d_, input_d, output_d, count, steps); CudaCheckError(); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(input.data(), input_d, sizeof(float) * INPUT_DIM * count, cudaMemcpyDeviceToHost)); HANDLE_ERROR(cudaMemcpy(output.data(), output_d, sizeof(float) * OUTPUT_DIM * count, cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); cudaFree(input_d); cudaFree(output_d); } #endif // MPPIGENERIC_NETWORK_HELPER_KERNEL_TEST_CUH ================================================ FILE: tests/include/kernel_tests/utils/texture_test_kernels.cuh ================================================ // // Created by jason on 1/9/22. 
// #ifndef MPPIGENERIC_TEXTURE_TEST_KERNELS_CUH #define MPPIGENERIC_TEXTURE_TEST_KERNELS_CUH #include template __global__ void textureTestKernel(TEX_T& tex, DATA_T* test_results, float4* test_indexes, int num_points) { int tid = blockIdx.x * blockDim.x + threadIdx.x; // printf("\nEntering the kernel!\n"); // printf("The thread id is: %i\n", tid); if (tid < num_points) { // printf("thread ia: %i went to check texture at index %i, %i\n", tid, test_indexes[tid].x, test_indexes[tid].y); // query texture float3 query_point = make_float3(test_indexes[tid].x, test_indexes[tid].y, test_indexes[tid].z); int index = round(test_indexes[tid].w); test_results[tid] = tex.queryTexture(index, query_point); // printf("query at %d %f,%f,%f = %f, %f, %f, %f\n", index, query_point.x, query_point.y, query_point.z, // test_results[tid].x, test_results[tid].y, test_results[tid].z, test_results[tid].w); } } template std::vector getTextureAtPointsKernel(const TEX_T& helper, std::vector& query_points) { std::vector results; int num_test_points = query_points.size(); results.resize(num_test_points); float4* tex_results_d; float4* tex_query_points_d; HANDLE_ERROR(cudaMalloc((void**)&tex_results_d, sizeof(DATA_T) * num_test_points)) HANDLE_ERROR(cudaMalloc((void**)&tex_query_points_d, sizeof(float4) * num_test_points)) HANDLE_ERROR( cudaMemcpy(tex_query_points_d, query_points.data(), sizeof(float4) * num_test_points, cudaMemcpyHostToDevice)); dim3 threadsPerBlock(num_test_points, 1); dim3 numBlocks(1, 1); textureTestKernel<<>>(*helper.ptr_d_, tex_results_d, tex_query_points_d, num_test_points); CudaCheckError(); cudaDeviceSynchronize(); HANDLE_ERROR(cudaMemcpy(results.data(), tex_results_d, sizeof(DATA_T) * num_test_points, cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); cudaFree(tex_results_d); cudaFree(tex_query_points_d); return results; } template __global__ void textureAtMapPoseTestKernel(TEX_T& tex, DATA_T* test_results, float4* test_indexes, int num_points) { int tid = blockIdx.x * 
blockDim.x + threadIdx.x; if (tid < num_points) { // query texture float3 query_point = make_float3(test_indexes[tid].x, test_indexes[tid].y, test_indexes[tid].z); int index = round(test_indexes[tid].w); test_results[tid] = tex.queryTextureAtMapPose(index, query_point); // printf("query at %d %f,%f,%f = %f, %f, %f, %f\n", index, query_point.x, query_point.y, query_point.z, // test_results[tid].x, test_results[tid].y, test_results[tid].z, test_results[tid].w); } } template std::vector getTextureAtMapPointsKernel(const TEX_T& helper, std::vector& query_points) { std::vector results; int num_test_points = query_points.size(); results.resize(num_test_points); float4* tex_results_d; float4* tex_query_points_d; HANDLE_ERROR(cudaMalloc((void**)&tex_results_d, sizeof(DATA_T) * num_test_points)) HANDLE_ERROR(cudaMalloc((void**)&tex_query_points_d, sizeof(float4) * num_test_points)) HANDLE_ERROR( cudaMemcpy(tex_query_points_d, query_points.data(), sizeof(float4) * num_test_points, cudaMemcpyHostToDevice)); dim3 threadsPerBlock(num_test_points, 1); dim3 numBlocks(1, 1); textureAtMapPoseTestKernel<<>>(*helper.ptr_d_, tex_results_d, tex_query_points_d, num_test_points); CudaCheckError(); cudaDeviceSynchronize(); HANDLE_ERROR(cudaMemcpy(results.data(), tex_results_d, sizeof(DATA_T) * num_test_points, cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); cudaFree(tex_results_d); cudaFree(tex_query_points_d); return results; } template __global__ void textureAtWorldPoseTestKernel(TEX_T& tex, DATA_T* test_results, float4* test_indexes, int num_points) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < num_points) { // query texture float3 query_point = make_float3(test_indexes[tid].x, test_indexes[tid].y, test_indexes[tid].z); int index = round(test_indexes[tid].w); test_results[tid] = tex.queryTextureAtWorldPose(index, query_point); // printf("query at %d %f,%f,%f = %f, %f, %f, %f\n", index, query_point.x, query_point.y, query_point.z, // test_results[tid].x, 
test_results[tid].y, test_results[tid].z, test_results[tid].w); } } template std::vector getTextureAtWorldPointsKernel(const TEX_T& helper, std::vector& query_points) { std::vector results; int num_test_points = query_points.size(); results.resize(num_test_points); float4* tex_results_d; float4* tex_query_points_d; HANDLE_ERROR(cudaMalloc((void**)&tex_results_d, sizeof(DATA_T) * num_test_points)) HANDLE_ERROR(cudaMalloc((void**)&tex_query_points_d, sizeof(float4) * num_test_points)) HANDLE_ERROR( cudaMemcpy(tex_query_points_d, query_points.data(), sizeof(float4) * num_test_points, cudaMemcpyHostToDevice)); dim3 threadsPerBlock(num_test_points, 1); dim3 numBlocks(1, 1); textureAtWorldPoseTestKernel<<>>(*helper.ptr_d_, tex_results_d, tex_query_points_d, num_test_points); CudaCheckError(); cudaDeviceSynchronize(); HANDLE_ERROR(cudaMemcpy(results.data(), tex_results_d, sizeof(DATA_T) * num_test_points, cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); cudaFree(tex_results_d); cudaFree(tex_query_points_d); return results; } #if __CUDACC__ #include "texture_test_kernels.cuh" #endif #endif // MPPIGENERIC_TEXTURE_TEST_KERNELS_CUH ================================================ FILE: tests/include/mppi_test/mock_classes/mock_classes.h ================================================ // // Created by jgibson37 on 2/22/24. // #ifndef MPPIGENERIC_MOCK_CLASSES_H #define MPPIGENERIC_MOCK_CLASSES_H const int NUM_TIMESTEPS = 100; #include #include #include #include #endif // MPPIGENERIC_MOCK_CLASSES_H ================================================ FILE: tests/include/mppi_test/mock_classes/mock_controller.h ================================================ // // Created by jason on 4/14/20. 
// #ifndef MPPIGENERIC_MOCK_CONTROLLER_H #define MPPIGENERIC_MOCK_CONTROLLER_H #include #include #include #include // ===== mock controller ==== class MockController : public Controller { public: MOCK_METHOD0(calculateSampledStateTrajectories, void()); MOCK_METHOD0(resetControls, void()); MOCK_METHOD1(computeFeedback, void(const Eigen::Ref& state)); MOCK_METHOD1(slideControlSequence, void(int stride)); MOCK_METHOD5(getCurrentControl, control_array(Eigen::Ref, double, Eigen::Ref, Eigen::Ref, TEMPLATED_FEEDBACK_STATE&)); MOCK_METHOD2(computeControl, void(const Eigen::Ref& state, int optimization_stride)); MOCK_METHOD(control_trajectory, getControlSeq, (), (const, override)); MOCK_METHOD(state_trajectory, getTargetStateSeq, (), (const, override)); // MOCK_METHOD(output_trajectory, getTargetOutputSeq, (), (const, override)); MOCK_METHOD(TEMPLATED_FEEDBACK_STATE, getFeedbackState, (), (const, override)); MOCK_METHOD(control_array, getFeedbackControl, (const Eigen::Ref&, const Eigen::Ref&, int), (override)); MOCK_METHOD1(updateImportanceSampler, void(const Eigen::Ref& nominal_control)); MOCK_METHOD0(allocateCUDAMemory, void()); MOCK_METHOD0(computeFeedbackPropagatedStateSeq, void()); MOCK_METHOD0(smoothControlTrajectory, void()); MOCK_METHOD1(computeStateTrajectory, void(const Eigen::Ref& x0)); MOCK_METHOD1(setPercentageSampledControlTrajectories, void(float new_perc)); MOCK_METHOD0(getSampledNoise, std::vector()); MOCK_METHOD0(getDt, float()); }; #endif // MPPIGENERIC_MOCK_CONTROLLER_H ================================================ FILE: tests/include/mppi_test/mock_classes/mock_costs.h ================================================ // // Created by jason on 4/14/20. 
// #ifndef MPPIGENERIC_MOCK_COSTS_H #define MPPIGENERIC_MOCK_COSTS_H #include #include #include // ===== mock cost ==== typedef struct { int test = 1; } mockCostParams; class MockCost : public Cost { public: MOCK_METHOD1(bindToStream, void(cudaStream_t stream)); MOCK_METHOD1(setParams, void(mockCostParams params)); MOCK_METHOD0(getParams, mockCostParams()); MOCK_METHOD0(GPUSetup, void()); MOCK_METHOD0(freeCudaMem, void()); }; #endif // MPPIGENERIC_MOCK_COSTS_H ================================================ FILE: tests/include/mppi_test/mock_classes/mock_dynamics.h ================================================ // // Created by jason on 4/14/20. // #ifndef MPPIGENERIC_MOCK_DYNAMICS_H #define MPPIGENERIC_MOCK_DYNAMICS_H #include #include #include // ===== mock dynamics ==== struct mockDynamicsParams : public DynamicsParams { int test = 2; bool copy_everything = false; float buffer[(1 + 1) * 10] = { 0.0 }; void updateBuffer(Eigen::Matrix new_buffer) { } }; class MockDynamics : public MPPI_internal::Dynamics { public: typedef Eigen::Matrix control_array; // Control at a time t typedef Eigen::Matrix state_array; // State at a time t typedef Eigen::Matrix output_array; // Output at a time t typedef Eigen::Matrix dfdx; // Jacobian wrt x typedef Eigen::Matrix dfdu; // Jacobian wrt u typedef Eigen::Matrix feedback_matrix; // Feedback matrix typedef Eigen::Matrix Jacobian; // Jacobian of x and u typedef std::map buffer_trajectory; MOCK_METHOD1(bindToStream, void(cudaStream_t stream)); MOCK_METHOD1(setParams, void(mockDynamicsParams params)); MOCK_METHOD0(GPUSetup, void()); MOCK_METHOD0(getControlRanges, std::array()); MOCK_METHOD0(getControlRangesRaw, void()); MOCK_METHOD0(getParams, mockDynamicsParams()); // MOCK_METHOD0(freeCudaMem, void()); MOCK_METHOD0(paramsToDevice, void()); MOCK_METHOD3(computeDynamics, void(const Eigen::Ref&, const Eigen::Ref&, Eigen::Ref)); MOCK_METHOD2(computeKinematics, void(Eigen::Ref&, Eigen::Ref)); MOCK_METHOD3(computeStateDeriv, void(const 
Eigen::Ref&, const Eigen::Ref&, Eigen::Ref)); MOCK_METHOD4(updateState, void(const Eigen::Ref, Eigen::Ref, Eigen::Ref, float)); MOCK_METHOD7(step, void(Eigen::Ref, Eigen::Ref, Eigen::Ref, const Eigen::Ref&, Eigen::Ref, float, float)); MOCK_METHOD3(updateState, void(const Eigen::Ref, Eigen::Ref, float)); MOCK_METHOD2(enforceConstraints, void(Eigen::Ref, Eigen::Ref)); MOCK_METHOD4(computeGrad, bool(const Eigen::Ref&, const Eigen::Ref, Eigen::Ref, Eigen::Ref)); MOCK_METHOD1(updateFromBuffer, void(const buffer_trajectory& buffer)); MOCK_METHOD1(stateFromMap, state_array(const std::map&)); MOCK_METHOD0(freeCudaMem, void()); }; #endif // MPPIGENERIC_MOCK_DYNAMICS_H ================================================ FILE: tests/include/mppi_test/mock_classes/mock_feedback.h ================================================ // // Created by jason on 2/21/24. // #ifndef MPPIGENERIC_TESTS_INCLUDE_MPPI_TEST_MOCK_CLASSES_MOCK_FEEDBACK_H_ #define MPPIGENERIC_TESTS_INCLUDE_MPPI_TEST_MOCK_CLASSES_MOCK_FEEDBACK_H_ #include #include struct mockGPUFeedbackParams { }; class MockGPUFeedback : public GPUFeedbackController { public: using DYN_T = MockDynamics; using FEEDBACK_STATE_T = GPUState; MockGPUFeedback(cudaStream_t stream) : GPUFeedbackController(stream) { } }; class MockFeedback : public FeedbackController { public: MOCK_METHOD0(initTrackingController, void()); MOCK_METHOD4(k_, control_array(const Eigen::Ref& x_act, const Eigen::Ref& x_goal, int t, GPUState& fb_state)); MOCK_METHOD3(computeFeedback, void(const Eigen::Ref& init_state, const Eigen::Ref& goal_traj, const Eigen::Ref& control_traj)); MOCK_METHOD0(freeCudaMem, void()); }; #endif // MPPIGENERIC_TESTS_INCLUDE_MPPI_TEST_MOCK_CLASSES_MOCK_FEEDBACK_H_ ================================================ FILE: tests/include/mppi_test/mock_classes/mock_sampler.h ================================================ // // Created by jason on 2/21/24. 
// #ifndef MPPIGENERIC_TESTS_INCLUDE_MPPI_TEST_MOCK_CLASSES_MOCK_SAMPLER_H_ #define MPPIGENERIC_TESTS_INCLUDE_MPPI_TEST_MOCK_CLASSES_MOCK_SAMPLER_H_ #include #include class MockSamplingDistribution : public mppi::sampling_distributions::SamplingDistribution< MockSamplingDistribution, mppi::sampling_distributions::SamplingParams, mockDynamicsParams> { public: MOCK_METHOD1(bindToStream, void(cudaStream_t stream)); MOCK_METHOD0(GPUSetup, void()); MOCK_METHOD1(resizeVisualizationCotnrolTrajectories, void(bool synchronize)); MOCK_METHOD1(allocateCUDAMemory, void(bool synchronize)); MOCK_METHOD0(allocateCUDAMemoryHelper, void()); MOCK_METHOD0(freeCudaMem, void()); MOCK_METHOD4(generateSamples, void(const int& opt_stride, const int& iteration_num, curandGenerator_t& gen, bool synchronize)); MOCK_METHOD3(setHostOptimalControlSequence, void(float* optimal_control_trajectory, const int& distribution_idx, bool synchronize)); MOCK_METHOD4(updateDistributionParamsFromDevice, void(const float* trajectory_weights_d, float normalizer, const int& distribution_i, bool synchronize)); }; #endif // MPPIGENERIC_TESTS_INCLUDE_MPPI_TEST_MOCK_CLASSES_MOCK_SAMPLER_H_ ================================================ FILE: tests/integration/CMakeLists.txt ================================================ set(CMAKE_CXX_STANDARD 17) set(GTEST_LIBRARIES gtest gmock gtest_main) file(GLOB TARGET_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cu) foreach(T_FILE IN LISTS TARGET_SRCS) get_filename_component(T_NAME ${T_FILE} NAME_WE) add_executable(${T_NAME} ${PROJECT_SOURCE_DIR}/tests/test_main.cpp ${T_FILE}) target_link_libraries(${T_NAME} ${GTEST_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME}) gtest_add_tests(TARGET ${T_NAME}) set_target_properties(${T_NAME} PROPERTIES FOLDER test) endforeach() ================================================ FILE: tests/integration/integrator_tests.cu ================================================ // // Created by mgandhi3 on 7/22/21. 
// #include #include #include struct TestDynamicsParams : public DynamicsParams { }; using namespace MPPI_internal; class TestDynamics : public Dynamics { public: explicit TestDynamics(cudaStream_t stream = nullptr) : Dynamics(stream){}; void computeDynamics(const Eigen::Ref& state, const Eigen::Ref& control, Eigen::Ref state_der) { state_der = -2 * state; } state_array stateFromMap(const std::map& map) { } }; TEST(Integration, RK4) { using DYN = TestDynamics; DYN dynamics; DYN::state_array x_k, x_kp1; x_k << 3; DYN::control_array u_k; float dt = 0.05; int num_timesteps = 100; for (int i = 0; i < num_timesteps; ++i) { rk4integrate(&dynamics, dt, x_k, u_k, x_kp1); x_k = x_kp1; } ASSERT_NEAR(0.0, x_k[0], 1e-6); } ================================================ FILE: tests/math_utils/CMakeLists.txt ================================================ set(GTEST_LIBRARIES gtest gmock gtest_main) file(GLOB TARGET_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cu) foreach(T_FILE IN LISTS TARGET_SRCS) get_filename_component(T_NAME ${T_FILE} NAME_WE) add_executable(${T_NAME} ${PROJECT_SOURCE_DIR}/tests/test_main.cpp ${T_FILE}) target_link_libraries(${T_NAME} ${GTEST_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME}) gtest_add_tests(TARGET ${T_NAME}) set_target_properties(${T_NAME} PROPERTIES FOLDER test) endforeach() ================================================ FILE: tests/math_utils/cuda_math_utils_tests.cu ================================================ #include #include #include #include template __global__ void VectorVectorAddTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, T* __restrict__ output2) { *output2 = *input1 + *input2; } template __global__ void VectorVectorSubTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, T* __restrict__ output2) { *output2 = *input1 - *input2; } template __global__ void VectorVectorMultTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, T* __restrict__ output2) { *output2 = (*input1) * 
(*input2); } template __global__ void VectorVectorDivTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, T* __restrict__ output2) { *output2 = (*input1) / (*input2); } template __global__ void VectorScalarAddTestKernel(const T* __restrict__ input1, const S* __restrict__ input2, T* __restrict__ output2) { *output2 = *input1 + *input2; } template __global__ void VectorScalarMultTestKernel(const T* __restrict__ input1, const S* __restrict__ input2, T* __restrict__ output2) { *output2 = (*input1) * (*input2); } template __global__ void VectorScalarSubTestKernel(const T* __restrict__ input1, const S* __restrict__ input2, T* __restrict__ output2) { *output2 = *input1 - *input2; } template __global__ void VectorScalarDivTestKernel(const T* __restrict__ input1, const S* __restrict__ input2, T* __restrict__ output2) { *output2 = (*input1) / (*input2); } template __global__ void VectorVectorScalarAddMultTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, const S* __restrict__ scalar, T* __restrict__ output2) { *output2 = *input1 + (*input2) * (*scalar); } template class CudaFloatStructsTests : public ::testing::Test { public: T input1_cpu; T input2_cpu; T output_cpu; T output_gpu; T* input1_d; T* input2_d; T* output_d; float scalar; float* scalar_d; void SetUp() override { HANDLE_ERROR(cudaMalloc((void**)&scalar_d, sizeof(float))); HANDLE_ERROR(cudaMalloc((void**)&input1_d, sizeof(T))); HANDLE_ERROR(cudaMalloc((void**)&input2_d, sizeof(T))); HANDLE_ERROR(cudaMalloc((void**)&output_d, sizeof(T))); // Setup random number generator std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution dist(0.0f, 10.f); initializeStruct(input1_cpu, gen, dist); initializeStruct(input2_cpu, gen, dist); HANDLE_ERROR(cudaMemcpy(input1_d, &input1_cpu, sizeof(T), cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(input2_d, &input2_cpu, sizeof(T), cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(scalar_d, &scalar, sizeof(float), 
cudaMemcpyHostToDevice)); } void initializeStruct(T& val, std::mt19937& gen, std::uniform_real_distribution& dist) { std::cout << "Did not specialize!" << std::endl; } T addVec(const T& val1, const T& val2) { T a; std::cout << "Wrong addVec!" << std::endl; return a; } T addScalar(const T& val1, const float& val2) { T a; std::cout << "Wrong addScalar!" << std::endl; return a; } T subVec(const T& val1, const T& val2) { T a; std::cout << "Wrong subVec!" << std::endl; return a; } T subScalar(const T& val1, const float& val2) { T a; std::cout << "Wrong subScalar!" << std::endl; return a; } T multVec(const T& val1, const T& val2) { T a; std::cout << "Wrong multVec!" << std::endl; return a; } T multScalar(const T& val1, const float& val2) { T a; std::cout << "Wrong multScalar!" << std::endl; return a; } T divVec(const T& val1, const T& val2) { T a; std::cout << "Wrong divVec!" << std::endl; return a; } T divScalar(const T& val1, const float& val2) { T a; std::cout << "Wrong divScalar!" << std::endl; return a; } bool assert_same(const T& val1, const T& val2, const std::string& str = ""); void TearDown() override { HANDLE_ERROR(cudaFree(scalar_d)); HANDLE_ERROR(cudaFree(input1_d)); HANDLE_ERROR(cudaFree(input2_d)); HANDLE_ERROR(cudaFree(output_d)); } }; template <> void CudaFloatStructsTests::initializeStruct(float2& val, std::mt19937& gen, std::uniform_real_distribution& dist) { val = make_float2(dist(gen), dist(gen)); } template <> bool CudaFloatStructsTests::assert_same(const float2& val1, const float2& val2, const std::string& str) { bool result = true; result = result && (val1.x == val2.x); result = result && (val1.y == val2.y); if (!result) { printf("(%f, %f) != (%f, %f)", val1.x, val1.y, val2.x, val2.y); std::cout << str << std::endl; } return result; } template <> float2 CudaFloatStructsTests::addVec(const float2& val1, const float2& val2) { float2 output; output.x = val1.x + val2.x; output.y = val1.y + val2.y; return output; } template <> float2 
CudaFloatStructsTests::multVec(const float2& val1, const float2& val2) { float2 output; output.x = val1.x * val2.x; output.y = val1.y * val2.y; return output; } template <> float2 CudaFloatStructsTests::addScalar(const float2& val1, const float& val2) { float2 output; output.x = val1.x + val2; output.y = val1.y + val2; return output; } template <> float2 CudaFloatStructsTests::multScalar(const float2& val1, const float& val2) { float2 output; output.x = val1.x * val2; output.y = val1.y * val2; return output; } template <> float2 CudaFloatStructsTests::subVec(const float2& val1, const float2& val2) { float2 output; output.x = val1.x - val2.x; output.y = val1.y - val2.y; return output; } template <> float2 CudaFloatStructsTests::divVec(const float2& val1, const float2& val2) { float2 output; output.x = val1.x / val2.x; output.y = val1.y / val2.y; return output; } template <> float2 CudaFloatStructsTests::subScalar(const float2& val1, const float& val2) { float2 output; output.x = val1.x - val2; output.y = val1.y - val2; return output; } template <> float2 CudaFloatStructsTests::divScalar(const float2& val1, const float& val2) { float2 output; output.x = val1.x / val2; output.y = val1.y / val2; return output; } template <> void CudaFloatStructsTests::initializeStruct(float4& val, std::mt19937& gen, std::uniform_real_distribution& dist) { val = make_float4(dist(gen), dist(gen), dist(gen), dist(gen)); } template <> bool CudaFloatStructsTests::assert_same(const float4& val1, const float4& val2, const std::string& str) { bool result = true; result = result && (val1.x == val2.x); result = result && (val1.y == val2.y); result = result && (val1.z == val2.z); result = result && (val1.w == val2.w); if (!result) { printf("(%f, %f, %f, %f) != (%f, %f, %f, %f)", val1.x, val1.y, val1.z, val1.w, val2.x, val2.y, val2.z, val2.w); std::cout << str << std::endl; } return result; } template <> float4 CudaFloatStructsTests::addVec(const float4& val1, const float4& val2) { float4 output; 
output.x = val1.x + val2.x; output.y = val1.y + val2.y; output.z = val1.z + val2.z; output.w = val1.w + val2.w; return output; } template <> float4 CudaFloatStructsTests::multVec(const float4& val1, const float4& val2) { float4 output; output.x = val1.x * val2.x; output.y = val1.y * val2.y; output.z = val1.z * val2.z; output.w = val1.w * val2.w; return output; } template <> float4 CudaFloatStructsTests::addScalar(const float4& val1, const float& val2) { float4 output; output.x = val1.x + val2; output.y = val1.y + val2; output.z = val1.z + val2; output.w = val1.w + val2; return output; } template <> float4 CudaFloatStructsTests::multScalar(const float4& val1, const float& val2) { float4 output; output.x = val1.x * val2; output.y = val1.y * val2; output.z = val1.z * val2; output.w = val1.w * val2; return output; } template <> float4 CudaFloatStructsTests::subVec(const float4& val1, const float4& val2) { float4 output; output.x = val1.x - val2.x; output.y = val1.y - val2.y; output.z = val1.z - val2.z; output.w = val1.w - val2.w; return output; } template <> float4 CudaFloatStructsTests::divVec(const float4& val1, const float4& val2) { float4 output; output.x = val1.x / val2.x; output.y = val1.y / val2.y; output.z = val1.z / val2.z; output.w = val1.w / val2.w; return output; } template <> float4 CudaFloatStructsTests::subScalar(const float4& val1, const float& val2) { float4 output; output.x = val1.x - val2; output.y = val1.y - val2; output.z = val1.z - val2; output.w = val1.w - val2; return output; } template <> float4 CudaFloatStructsTests::divScalar(const float4& val1, const float& val2) { float4 output; output.x = val1.x / val2; output.y = val1.y / val2; output.z = val1.z / val2; output.w = val1.w / val2; return output; } template <> void CudaFloatStructsTests::initializeStruct(float3& val, std::mt19937& gen, std::uniform_real_distribution& dist) { val = make_float3(dist(gen), dist(gen), dist(gen)); } template <> bool CudaFloatStructsTests::assert_same(const 
float3& val1, const float3& val2, const std::string& str) { bool result = true; result = result && (val1.x == val2.x); result = result && (val1.y == val2.y); result = result && (val1.z == val2.z); if (!result) { printf("(%f, %f, %f) != (%f, %f, %f)", val1.x, val1.y, val1.z, val2.x, val2.y, val2.z); std::cout << str << std::endl; } return result; } template <> float3 CudaFloatStructsTests::addVec(const float3& val1, const float3& val2) { float3 output; output.x = val1.x + val2.x; output.y = val1.y + val2.y; output.z = val1.z + val2.z; return output; } template <> float3 CudaFloatStructsTests::multVec(const float3& val1, const float3& val2) { float3 output; output.x = val1.x * val2.x; output.y = val1.y * val2.y; output.z = val1.z * val2.z; return output; } template <> float3 CudaFloatStructsTests::addScalar(const float3& val1, const float& val2) { float3 output; output.x = val1.x + val2; output.y = val1.y + val2; output.z = val1.z + val2; return output; } template <> float3 CudaFloatStructsTests::multScalar(const float3& val1, const float& val2) { float3 output; output.x = val1.x * val2; output.y = val1.y * val2; output.z = val1.z * val2; return output; } template <> float3 CudaFloatStructsTests::subVec(const float3& val1, const float3& val2) { float3 output; output.x = val1.x - val2.x; output.y = val1.y - val2.y; output.z = val1.z - val2.z; return output; } template <> float3 CudaFloatStructsTests::divVec(const float3& val1, const float3& val2) { float3 output; output.x = val1.x / val2.x; output.y = val1.y / val2.y; output.z = val1.z / val2.z; return output; } template <> float3 CudaFloatStructsTests::subScalar(const float3& val1, const float& val2) { float3 output; output.x = val1.x - val2; output.y = val1.y - val2; output.z = val1.z - val2; return output; } template <> float3 CudaFloatStructsTests::divScalar(const float3& val1, const float& val2) { float3 output; output.x = val1.x / val2; output.y = val1.y / val2; output.z = val1.z / val2; return output; } using 
TYPE_TESTS = ::testing::Types; TYPED_TEST_SUITE(CudaFloatStructsTests, TYPE_TESTS); TYPED_TEST(CudaFloatStructsTests, VecAddScalar) { using T = TypeParam; T ground_truth_output = this->addScalar(this->input1_cpu, this->scalar); this->output_cpu = this->input1_cpu + this->scalar; std::string cpu_fail = "Doesn't pass cpu test"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; // Parallelization tests T zero = this->output_gpu; for (int blocks = 1; blocks < 10; blocks++) { for (int threads = 1; threads < 128; threads++) { HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); VectorScalarAddTestKernel<<>>(this->input1_d, this->scalar_d, this->output_d); HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); std::string fail_gpu = " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; } } } TYPED_TEST(CudaFloatStructsTests, VecSubScalar) { using T = TypeParam; T ground_truth_output = this->subScalar(this->input1_cpu, this->scalar); this->output_cpu = this->input1_cpu - this->scalar; std::string cpu_fail = "Doesn't pass cpu test"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; // Parallelization tests T zero = this->output_gpu; for (int blocks = 1; blocks < 10; blocks++) { for (int threads = 1; threads < 128; threads++) { HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); VectorScalarSubTestKernel<<>>(this->input1_d, this->scalar_d, this->output_d); HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); std::string fail_gpu = " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; } } } 
TYPED_TEST(CudaFloatStructsTests, VecMultScalar) { using T = TypeParam; T ground_truth_output = this->multScalar(this->input1_cpu, this->scalar); this->output_cpu = this->input1_cpu * this->scalar; std::string cpu_fail = "Doesn't pass cpu test"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; // Parallelization tests T zero = this->output_gpu; for (int blocks = 1; blocks < 10; blocks++) { for (int threads = 1; threads < 128; threads++) { HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); VectorScalarMultTestKernel<<>>(this->input1_d, this->scalar_d, this->output_d); HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); std::string fail_gpu = " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; } } } TYPED_TEST(CudaFloatStructsTests, VecDivScalar) { using T = TypeParam; T ground_truth_output = this->divScalar(this->input1_cpu, this->scalar); this->output_cpu = this->input1_cpu / this->scalar; std::string cpu_fail = "Doesn't pass cpu test"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; // Parallelization tests T zero = this->output_gpu; for (int blocks = 1; blocks < 10; blocks++) { for (int threads = 1; threads < 128; threads++) { HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); VectorScalarDivTestKernel<<>>(this->input1_d, this->scalar_d, this->output_d); HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); std::string fail_gpu = " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; } } } TYPED_TEST(CudaFloatStructsTests, VecAddVec) { using T = TypeParam; T ground_truth_output 
= this->addVec(this->input1_cpu, this->input2_cpu); this->output_cpu = this->input1_cpu + this->input2_cpu; std::string cpu_fail = "Doesn't pass cpu test"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; // Parallelization tests T zero = this->output_gpu; for (int blocks = 1; blocks < 10; blocks++) { for (int threads = 1; threads < 128; threads++) { HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); VectorVectorAddTestKernel<<>>(this->input1_d, this->input2_d, this->output_d); HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); std::string fail_gpu = " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; } } } TYPED_TEST(CudaFloatStructsTests, VecSubVec) { using T = TypeParam; T ground_truth_output = this->subVec(this->input1_cpu, this->input2_cpu); this->output_cpu = this->input1_cpu - this->input2_cpu; std::string cpu_fail = "Doesn't pass cpu test"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; // Parallelization tests T zero = this->output_gpu; for (int blocks = 1; blocks < 10; blocks++) { for (int threads = 1; threads < 128; threads++) { HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); VectorVectorSubTestKernel<<>>(this->input1_d, this->input2_d, this->output_d); HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); std::string fail_gpu = " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; } } } TYPED_TEST(CudaFloatStructsTests, VecDivVec) { using T = TypeParam; T ground_truth_output = this->divVec(this->input1_cpu, this->input2_cpu); this->output_cpu = this->input1_cpu / 
this->input2_cpu; std::string cpu_fail = "Doesn't pass cpu test"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; // Parallelization tests T zero = this->output_gpu; for (int blocks = 1; blocks < 10; blocks++) { for (int threads = 1; threads < 128; threads++) { HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); VectorVectorDivTestKernel<<>>(this->input1_d, this->input2_d, this->output_d); HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); std::string fail_gpu = " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; } } } TYPED_TEST(CudaFloatStructsTests, VecAddVecMultScalar) { using T = TypeParam; T ground_truth_output = this->addVec(this->input1_cpu, this->multScalar(this->input2_cpu, this->scalar)); this->output_cpu = this->input1_cpu + this->input2_cpu * this->scalar; std::string cpu_fail = "Doesn't pass cpu test"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; // Parallelization tests T zero = this->output_gpu; for (int blocks = 1; blocks < 10; blocks++) { for (int threads = 1; threads < 128; threads++) { HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); VectorVectorScalarAddMultTestKernel <<>>(this->input1_d, this->input2_d, this->scalar_d, this->output_d); HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); std::string fail_gpu = " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; } } } ================================================ FILE: tests/math_utils/math_utils_test.cu ================================================ #include #include #include #include #include 
#include TEST(MATH_UTILS, SQ) { EXPECT_FLOAT_EQ(0.25, SQ(0.5)); EXPECT_FLOAT_EQ(0.25, SQ(0.5 - 1.0)); EXPECT_FLOAT_EQ(25, SQ(5)); EXPECT_FLOAT_EQ(25, SQ(5 - 10)); EXPECT_FLOAT_EQ(625, SQ(5 * 5)); } TEST(MATH_UTILS, QuatInv) { // Create an unnormalized quaternion. std::array q{ 1, 2, 3, 2 }; std::array q_inv{}; mppi::math::QuatInv(q.data(), q_inv.data()); // Compute the inverse using eigen. // Don't use the array constructor because eigen stores it as [x, y, z, w] internally. const Eigen::Quaternionf eigen_q{ q[0], q[1], q[2], q[3] }; const Eigen::Quaternionf eigen_q_inv = eigen_q.inverse().normalized(); std::array correct_q_inv{ eigen_q_inv.w(), eigen_q_inv.x(), eigen_q_inv.y(), eigen_q_inv.z() }; // q_inv should be normalized. constexpr float tol = 1e-7; array_assert_float_near<4>(q_inv, correct_q_inv, tol); } TEST(MATH_UTILS, RotatePointByDCM) { const float tol = 1e-6; Eigen::Quaternionf q_eig = Eigen::Quaternionf::UnitRandom(); Eigen::Vector3f point_eig = Eigen::Vector3f::Random(); std::array q{ q_eig.w(), q_eig.x(), q_eig.y(), q_eig.z() }; float3 point = make_float3(point_eig.x(), point_eig.y(), point_eig.z()); float M[3][3]; Eigen::Matrix3f M_eig; mppi::math::Quat2DCM(q.data(), M); mppi::math::Quat2DCM(q_eig, M_eig); for (int row = 0; row < 3; row++) { for (int col = 0; col < 3; col++) { ASSERT_NEAR(M[row][col], M_eig(row, col), tol) << " failed at row: " << row << ", col: " << col; } } float3 result, result_eig_rotation_matrix; Eigen::Vector3f result_eig; mppi::math::RotatePointByDCM(M, point, result); mppi::math::RotatePointByDCM(M_eig, point, result_eig_rotation_matrix); mppi::math::RotatePointByDCM(M_eig, point_eig, result_eig); ASSERT_NEAR(result.x, result_eig_rotation_matrix.x, tol) << " failed comparing R stored in 2d float array and R in " "Eigen matrix"; ASSERT_NEAR(result.y, result_eig_rotation_matrix.y, tol) << " failed comparing R stored in 2d float array and R in " "Eigen matrix"; ASSERT_NEAR(result.z, result_eig_rotation_matrix.z, tol) << " 
failed comparing R stored in 2d float array and R in " "Eigen matrix"; ASSERT_NEAR(result.x, result_eig.x(), tol) << " failed comparing GPU and Eigen methods of rotating by DCM"; ASSERT_NEAR(result.y, result_eig.y(), tol) << " failed comparing GPU and Eigen methods of rotating by DCM"; ASSERT_NEAR(result.z, result_eig.z(), tol) << " failed comparing GPU and Eigen methods of rotating by DCM"; } TEST(MATH_UTILS, RotatePointByDCMTranspose) { const float tol = 1e-6; Eigen::Quaternionf q_eig = Eigen::Quaternionf::UnitRandom(); Eigen::Vector3f point_eig = Eigen::Vector3f::Random(); std::array q{ q_eig.w(), q_eig.x(), q_eig.y(), q_eig.z() }; float3 point = make_float3(point_eig.x(), point_eig.y(), point_eig.z()); float M[3][3]; Eigen::Matrix3f M_eig; mppi::math::Quat2DCM(q.data(), M); mppi::math::Quat2DCM(q_eig, M_eig); for (int row = 0; row < 3; row++) { for (int col = 0; col < 3; col++) { ASSERT_NEAR(M[row][col], M_eig(row, col), tol) << " failed at row: " << row << ", col: " << col; } } float3 result, result_eig_rotation_matrix; Eigen::Vector3f result_eig; mppi::math::RotatePointByDCM(M, point, result, mppi::matrix_multiplication::MAT_OP::TRANSPOSE); mppi::math::RotatePointByDCM(M_eig, point, result_eig_rotation_matrix, mppi::matrix_multiplication::MAT_OP::TRANSPOSE); mppi::math::RotatePointByDCM(M_eig, point_eig, result_eig, mppi::matrix_multiplication::MAT_OP::TRANSPOSE); ASSERT_NEAR(result.x, result_eig_rotation_matrix.x, tol) << " failed comparing R stored in 2d float array and R in " "Eigen matrix"; ASSERT_NEAR(result.y, result_eig_rotation_matrix.y, tol) << " failed comparing R stored in 2d float array and R in " "Eigen matrix"; ASSERT_NEAR(result.z, result_eig_rotation_matrix.z, tol) << " failed comparing R stored in 2d float array and R in " "Eigen matrix"; ASSERT_NEAR(result.x, result_eig.x(), tol) << " failed comparing GPU and Eigen methods of rotating by DCM"; ASSERT_NEAR(result.y, result_eig.y(), tol) << " failed comparing GPU and Eigen methods of rotating 
by DCM"; ASSERT_NEAR(result.z, result_eig.z(), tol) << " failed comparing GPU and Eigen methods of rotating by DCM"; } TEST(MATH_UTILS, QuatMultiply) { // Create an unnormalized quaternion. std::array q1{ 1, 2, 3, 4 }; std::array q2{ 8, 7, 6, 5 }; std::array q3{}; std::array q3_norm{}; Eigen::Quaternionf eigen_result, eigen_result_norm; // Compare the multiplication using eigen. // Don't use the array constructor because eigen stores it as [x, y, z, w] internally. const Eigen::Quaternionf eigen_q1{ q1[0], q1[1], q1[2], q1[3] }; const Eigen::Quaternionf eigen_q2{ q2[0], q2[1], q2[2], q2[3] }; // Ground Truth const Eigen::Quaternionf eigen_q3 = (eigen_q1 * eigen_q2); const Eigen::Quaternionf eigen_q3_norm = eigen_q3.normalized(); // Our Methods mppi::math::QuatMultiply(q1.data(), q2.data(), q3.data(), false); mppi::math::QuatMultiply(q1.data(), q2.data(), q3_norm.data()); mppi::math::QuatMultiply(eigen_q1, eigen_q2, eigen_result, false); mppi::math::QuatMultiply(eigen_q1, eigen_q2, eigen_result_norm); std::array correct_q3{ eigen_q3.w(), eigen_q3.x(), eigen_q3.y(), eigen_q3.z() }; std::array eigen_q3_array{ eigen_result.w(), eigen_result.x(), eigen_result.y(), eigen_result.z() }; std::array correct_q3_norm{ eigen_q3_norm.w(), eigen_q3_norm.x(), eigen_q3_norm.y(), eigen_q3_norm.z() }; std::array eigen_q3_norm_array{ eigen_result_norm.w(), eigen_result_norm.x(), eigen_result_norm.y(), eigen_result_norm.z() }; // q_inv should be normalized. 
constexpr auto tol = 1e-7; array_assert_float_near<4>(q3, correct_q3, tol); array_assert_float_near<4>(eigen_q3_array, correct_q3, tol); array_assert_float_near<4>(q3_norm, correct_q3_norm, tol); array_assert_float_near<4>(eigen_q3_norm_array, correct_q3_norm, tol); } TEST(MATH_UTILS, RotatePointByQuat) { for (int i = 0; i < 100; i++) { Eigen::Quaternionf rotation = Eigen::Quaternionf::UnitRandom(); Eigen::Vector3f translation = Eigen::Vector3f::Random(); float3 point = make_float3(translation.x(), translation.y(), translation.z()); float q[4] = { rotation.w(), rotation.x(), rotation.y(), rotation.z() }; float3 output, output_eig; Eigen::Vector3f output_all_eig; // Test method with 3 parameters, output going into the last mppi::math::RotatePointByQuat(rotation, point, output_eig); mppi::math::RotatePointByQuat(q, point, output); mppi::math::RotatePointByQuat(rotation, translation, output_all_eig); // Test method with 2 parameters, modifying the second float3 output2 = point; float3 output2_eig = point; Eigen::Vector3f output2_all_eig = translation; mppi::math::RotatePointByQuat(rotation, output2_eig); mppi::math::RotatePointByQuat(q, output2); mppi::math::RotatePointByQuat(rotation, output2_all_eig); // Ground Truth auto result = rotation * translation; EXPECT_NEAR(output.x, result.x(), 1.0e-5); EXPECT_NEAR(output.y, result.y(), 1.0e-5); EXPECT_NEAR(output.z, result.z(), 1.0e-5); EXPECT_NEAR(output_eig.x, result.x(), 1.0e-5); EXPECT_NEAR(output_eig.y, result.y(), 1.0e-5); EXPECT_NEAR(output_eig.z, result.z(), 1.0e-5); EXPECT_NEAR(output_all_eig.x(), result.x(), 1.0e-5); EXPECT_NEAR(output_all_eig.y(), result.y(), 1.0e-5); EXPECT_NEAR(output_all_eig.z(), result.z(), 1.0e-5); EXPECT_NEAR(output2.x, result.x(), 1.0e-5); EXPECT_NEAR(output2.y, result.y(), 1.0e-5); EXPECT_NEAR(output2.z, result.z(), 1.0e-5); EXPECT_NEAR(output2_eig.x, result.x(), 1.0e-5); EXPECT_NEAR(output2_eig.y, result.y(), 1.0e-5); EXPECT_NEAR(output2_eig.z, result.z(), 1.0e-5); 
EXPECT_NEAR(output2_all_eig.x(), result.x(), 1.0e-5); EXPECT_NEAR(output2_all_eig.y(), result.y(), 1.0e-5); EXPECT_NEAR(output2_all_eig.z(), result.z(), 1.0e-5); } } TEST(MATH_UTILS, QuatDCM) { for (int iteration = 0; iteration < 100; iteration++) { Eigen::Quaternionf q_eig = Eigen::Quaternionf::UnitRandom(); float q[4] = { q_eig.w(), q_eig.x(), q_eig.y(), q_eig.z() }; float M[3][3]; Eigen::Matrix3f M_eig; M_eig = q_eig.toRotationMatrix(); mppi::math::Quat2DCM(q, M); for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { EXPECT_NEAR(M_eig(i, j), M[i][j], 1.0e-5); } } } } TEST(MATH_UTILS, EulerDCM) { for (int iteration = 0; iteration < 100; iteration++) { Eigen::Quaternionf q_eig = Eigen::Quaternionf::UnitRandom(); float r, p, y; float M[3][3]; Eigen::Matrix3f M_eig, M_result; M_result = q_eig.toRotationMatrix(); mppi::math::Quat2EulerNWU(q_eig, r, p, y); mppi::math::Euler2DCM_NWU(r, p, y, M); mppi::math::Euler2DCM_NWU(r, p, y, M_eig); for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { EXPECT_NEAR(M_result(i, j), M[i][j], 1.0e-5); EXPECT_NEAR(M_result(i, j), M_eig(i, j), 1.0e-5); } } } } TEST(MATH_UTILS, TranslateVectorByQuat) { const float tol = 1e-6; Eigen::Vector3f origin_eig = Eigen::Vector3f::Random(); Eigen::Vector3f body_pose_eig = Eigen::Vector3f::Random(); Eigen::Quaternionf rotation_eig = Eigen::Quaternionf::UnitRandom(); float3 origin_f3 = make_float3(origin_eig.x(), origin_eig.y(), origin_eig.z()); float3 body_pose_f3 = make_float3(body_pose_eig.x(), body_pose_eig.y(), body_pose_eig.z()); float rotation_array[4] = { rotation_eig.w(), rotation_eig.x(), rotation_eig.y(), rotation_eig.z() }; // Create output variables float3 output_eig, output_array; Eigen::Vector3f output_all_eig; // Ground Truth Eigen::Matrix3f R_eig = rotation_eig.toRotationMatrix(); Eigen::Vector3f correct_value = origin_eig + R_eig * body_pose_eig; mppi::math::bodyOffsetToWorldPoseQuat(body_pose_f3, origin_f3, rotation_eig, output_eig); 
mppi::math::bodyOffsetToWorldPoseQuat(body_pose_f3, origin_f3, rotation_array, output_array); mppi::math::bodyOffsetToWorldPoseQuat(body_pose_eig, origin_eig, rotation_eig, output_all_eig); EXPECT_NEAR(output_eig.x, correct_value.x(), tol); EXPECT_NEAR(output_eig.y, correct_value.y(), tol); EXPECT_NEAR(output_eig.z, correct_value.z(), tol); EXPECT_NEAR(output_array.x, correct_value.x(), tol); EXPECT_NEAR(output_array.y, correct_value.y(), tol); EXPECT_NEAR(output_array.z, correct_value.z(), tol); EXPECT_NEAR(output_all_eig.x(), correct_value.x(), tol); EXPECT_NEAR(output_all_eig.y(), correct_value.y(), tol); EXPECT_NEAR(output_all_eig.z(), correct_value.z(), tol); } TEST(MATH_UTILS, TranslateVectorByDCM) { const float tol = 1e-6; Eigen::Vector3f origin_eig = Eigen::Vector3f::Random(); Eigen::Vector3f body_pose_eig = Eigen::Vector3f::Random(); Eigen::Quaternionf rotation_eig = Eigen::Quaternionf::UnitRandom(); float3 origin_f3 = make_float3(origin_eig.x(), origin_eig.y(), origin_eig.z()); float3 body_pose_f3 = make_float3(body_pose_eig.x(), body_pose_eig.y(), body_pose_eig.z()); Eigen::Matrix3f R_eig = rotation_eig.toRotationMatrix(); float R_float[3][3]; for (int row = 0; row < 3; row++) { for (int col = 0; col < 3; col++) { R_float[row][col] = R_eig(row, col); } } // Create output variables float3 output_eig, output_array; Eigen::Vector3f output_all_eig; // Ground Truth Eigen::Vector3f correct_value = origin_eig + R_eig * body_pose_eig; // Our Methods mppi::math::bodyOffsetToWorldPoseDCM(body_pose_f3, origin_f3, R_eig, output_eig); mppi::math::bodyOffsetToWorldPoseDCM(body_pose_f3, origin_f3, R_float, output_array); mppi::math::bodyOffsetToWorldPoseDCM(body_pose_eig, origin_eig, R_eig, output_all_eig); EXPECT_NEAR(output_eig.x, correct_value.x(), tol); EXPECT_NEAR(output_eig.y, correct_value.y(), tol); EXPECT_NEAR(output_eig.z, correct_value.z(), tol); EXPECT_NEAR(output_array.x, correct_value.x(), tol); EXPECT_NEAR(output_array.y, correct_value.y(), tol); 
EXPECT_NEAR(output_array.z, correct_value.z(), tol); EXPECT_NEAR(output_all_eig.x(), correct_value.x(), tol); EXPECT_NEAR(output_all_eig.y(), correct_value.y(), tol); EXPECT_NEAR(output_all_eig.z(), correct_value.z(), tol); } TEST(MATH_UTILS, TranslateVectorByEuler) { const float tol = 1e-6; Eigen::Vector3f origin_eig = Eigen::Vector3f::Random(); Eigen::Vector3f body_pose_eig = Eigen::Vector3f::Random(); Eigen::Quaternionf q = Eigen::Quaternionf::UnitRandom(); Eigen::Matrix3f R_eig = q.toRotationMatrix(); float3 origin_f3 = make_float3(origin_eig.x(), origin_eig.y(), origin_eig.z()); float3 body_pose_f3 = make_float3(body_pose_eig.x(), body_pose_eig.y(), body_pose_eig.z()); float3 rotation; Eigen::Vector3f rotation_eig; mppi::math::Quat2EulerNWU(q, rotation.x, rotation.y, rotation.z); rotation_eig << rotation.x, rotation.y, rotation.z; // Create output variables float3 output_eig, output_array; Eigen::Vector3f output_all_eig; // Ground Truth Eigen::Vector3f correct_value = origin_eig + R_eig * body_pose_eig; // Our Methods mppi::math::bodyOffsetToWorldPoseEuler(body_pose_f3, origin_f3, rotation_eig, output_eig); mppi::math::bodyOffsetToWorldPoseEuler(body_pose_f3, origin_f3, rotation, output_array); mppi::math::bodyOffsetToWorldPoseEuler(body_pose_eig, origin_eig, rotation_eig, output_all_eig); EXPECT_NEAR(output_eig.x, correct_value.x(), tol); EXPECT_NEAR(output_eig.y, correct_value.y(), tol); EXPECT_NEAR(output_eig.z, correct_value.z(), tol); EXPECT_NEAR(output_array.x, correct_value.x(), tol); EXPECT_NEAR(output_array.y, correct_value.y(), tol); EXPECT_NEAR(output_array.z, correct_value.z(), tol); EXPECT_NEAR(output_all_eig.x(), correct_value.x(), tol); EXPECT_NEAR(output_all_eig.y(), correct_value.y(), tol); EXPECT_NEAR(output_all_eig.z(), correct_value.z(), tol); } TEST(MATH_UTILS, SkewSymmetricMatrixSameAsCrossProd) { Eigen::Vector3f a(1, 2, 3); Eigen::Vector3f b(8, 3, 9); eigen_assert_float_eq(a.cross(b), mppi::math::skewSymmetricMatrix(a) * b); } namespace 
mm = mppi::matrix_multiplication; namespace mp1 = mppi::p1; namespace mp2 = mppi::p2; template __global__ void gemm1d(T* A, T* B, T* C, T alpha = 1, T beta = 0, int shared_mem_size = 0) { extern __shared__ float buff[]; T* A_p; T* B_p; if (shared_mem_size != 0) { A_p = (T*)&buff[0]; B_p = &A_p[M * K]; mp1::loadArrayParallel(A_p, 0, A, 0); mp1::loadArrayParallel((T*)&buff[0], M * K, B, 0); __syncthreads(); } else { A_p = A; B_p = B; } mm::gemm1(A_p, B_p, C, alpha, beta, A_OP, B_OP); } template __global__ void gemm1dBatchKernel(const float* A, const float* B, float* C) { const float* A_local = &A[M * K * threadIdx.x]; const float* B_local = &B[K * N * threadIdx.x]; float* C_local = &C[M * N * threadIdx.x]; mm::gemm1(A_local, B_local, C_local); } template float2 testMatMulHost(int size, T alpha = 1, T beta = 0) { typedef Eigen::Matrix A_MAT; typedef Eigen::Matrix B_MAT; typedef Eigen::Matrix C_MAT; A_MAT A; B_MAT B; C_MAT C_Eigen = C_MAT::Ones(M, N); C_MAT C_ours = C_Eigen; C_Eigen *= beta; float eigen_time_ms = 0; if (A_OP == mm::MAT_OP::NONE && B_OP == mm::MAT_OP::NONE) { A = A_MAT::Random(M, K); B = B_MAT::Random(K, N); auto start = std::chrono::steady_clock::now(); C_Eigen += A * B * alpha; auto stop = std::chrono::steady_clock::now(); eigen_time_ms = mppi::math::timeDiffms(stop, start); } else if (A_OP == mm::MAT_OP::TRANSPOSE && B_OP == mm::MAT_OP::NONE) { A = A_MAT::Random(K, M); B = B_MAT::Random(K, N); auto start = std::chrono::steady_clock::now(); C_Eigen += A.transpose() * B * alpha; auto stop = std::chrono::steady_clock::now(); eigen_time_ms = mppi::math::timeDiffms(stop, start); } if (A_OP == mm::MAT_OP::NONE && B_OP == mm::MAT_OP::TRANSPOSE) { A = A_MAT::Random(M, K); B = B_MAT::Random(N, K); auto start = std::chrono::steady_clock::now(); C_Eigen += A * B.transpose() * alpha; auto stop = std::chrono::steady_clock::now(); eigen_time_ms = mppi::math::timeDiffms(stop, start); } else if (A_OP == mm::MAT_OP::TRANSPOSE && B_OP == mm::MAT_OP::TRANSPOSE) { A = 
A_MAT::Random(K, M); B = B_MAT::Random(N, K); auto start = std::chrono::steady_clock::now(); C_Eigen += A.transpose() * B.transpose() * alpha; auto stop = std::chrono::steady_clock::now(); eigen_time_ms = mppi::math::timeDiffms(stop, start); } auto our_start = std::chrono::steady_clock::now(); mm::gemm1(A.data(), B.data(), C_ours.data(), alpha, beta, A_OP, B_OP); auto our_stop = std::chrono::steady_clock::now(); float ours_time_ms = mppi::math::timeDiffms(our_stop, our_start); double avg_err = (C_Eigen - C_ours).norm() / C_Eigen.size(); EXPECT_TRUE(avg_err < 1e-7) << "C_Eigen:\n" << C_Eigen << "\nC_ours:\n" << C_ours; return make_float2(eigen_time_ms, ours_time_ms); } template float2 testMatMulp1(int size, T alpha = 1, T beta = 0) { typedef Eigen::Matrix A_MAT; typedef Eigen::Matrix B_MAT; typedef Eigen::Matrix C_MAT; cudaStream_t stream; cudaStreamCreate(&stream); cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); A_MAT A; B_MAT B; C_MAT C_CPU = C_MAT::Ones(M, N); C_MAT C_GPU = C_CPU; C_CPU *= beta; // Device Memory Locations T* A_d; T* B_d; T* C_d; dim3 block(1, 1, 1); dim3 grid(1, 1, 1); HANDLE_ERROR(cudaMalloc((void**)&A_d, sizeof(T) * M * K)); HANDLE_ERROR(cudaMalloc((void**)&B_d, sizeof(T) * K * N)); HANDLE_ERROR(cudaMalloc((void**)&C_d, sizeof(T) * M * N)); float cpu_time_ms = 0; if (A_OP == mm::MAT_OP::NONE && B_OP == mm::MAT_OP::NONE) { A = A_MAT::Random(M, K); B = B_MAT::Random(K, N); auto start = std::chrono::steady_clock::now(); C_CPU += A * B * alpha; auto stop = std::chrono::steady_clock::now(); cpu_time_ms = mppi::math::timeDiffms(stop, start); } else if (A_OP == mm::MAT_OP::TRANSPOSE && B_OP == mm::MAT_OP::NONE) { A = A_MAT::Random(K, M); B = B_MAT::Random(K, N); auto start = std::chrono::steady_clock::now(); C_CPU += A.transpose() * B * alpha; auto stop = std::chrono::steady_clock::now(); cpu_time_ms = mppi::math::timeDiffms(stop, start); } if (A_OP == mm::MAT_OP::NONE && B_OP == mm::MAT_OP::TRANSPOSE) { A = A_MAT::Random(M, 
K); B = B_MAT::Random(N, K); auto start = std::chrono::steady_clock::now(); C_CPU += A * B.transpose() * alpha; auto stop = std::chrono::steady_clock::now(); cpu_time_ms = mppi::math::timeDiffms(stop, start); } else if (A_OP == mm::MAT_OP::TRANSPOSE && B_OP == mm::MAT_OP::TRANSPOSE) { A = A_MAT::Random(K, M); B = B_MAT::Random(N, K); auto start = std::chrono::steady_clock::now(); C_CPU += A.transpose() * B.transpose() * alpha; auto stop = std::chrono::steady_clock::now(); cpu_time_ms = mppi::math::timeDiffms(stop, start); } int grid_dim = 1; if (P_DIR == mp1::Parallel1Dir::THREAD_X) { block.x = size; } else if (P_DIR == mp1::Parallel1Dir::THREAD_Y) { block.y = size; } else if (P_DIR == mp1::Parallel1Dir::THREAD_Z) { block.z = size; } else if (P_DIR == mp1::Parallel1Dir::GLOBAL_X) { grid_dim = (M * N - 1) / (size) + 1; block.x = size; grid.x = grid_dim; } else if (P_DIR == mp1::Parallel1Dir::GLOBAL_Y) { grid_dim = (M * N - 1) / (size) + 1; block.y = size; grid.y = grid_dim; } else if (P_DIR == mp1::Parallel1Dir::GLOBAL_Z) { grid_dim = (M * N - 1) / (size) + 1; block.y = size; grid.y = grid_dim; } int shared_mem_num = ((M * K + K * N) * sizeof(T) <= 1 << 16) ? 
(M * K + K * N) : 0; HANDLE_ERROR(cudaMemcpyAsync(A_d, A.data(), sizeof(T) * M * K, cudaMemcpyHostToDevice, stream)); HANDLE_ERROR(cudaMemcpyAsync(B_d, B.data(), sizeof(T) * K * N, cudaMemcpyHostToDevice, stream)); HANDLE_ERROR(cudaMemcpyAsync(C_d, C_GPU.data(), sizeof(T) * M * N, cudaMemcpyHostToDevice, stream)); cudaEventRecord(start, stream); gemm1d <<>>(A_d, B_d, C_d, alpha, beta, shared_mem_num); cudaEventRecord(stop, stream); HANDLE_ERROR(cudaMemcpyAsync(C_GPU.data(), C_d, sizeof(T) * M * N, cudaMemcpyDeviceToHost, stream)); HANDLE_ERROR(cudaStreamSynchronize(stream)); HANDLE_ERROR(cudaEventSynchronize(stop)); float gpu_time_ms = 0; HANDLE_ERROR(cudaEventElapsedTime(&gpu_time_ms, start, stop)); double avg_err = (C_CPU - C_GPU).norm() / C_CPU.size(); EXPECT_TRUE(avg_err < 1e-7) << "C_CPU:\n" << C_CPU << "\nC_GPU:\n" << C_GPU; cudaFree(A_d); cudaFree(B_d); cudaFree(C_d); return make_float2(cpu_time_ms, gpu_time_ms); }; TEST(MATH_UTILS, float_BasicMatrixMult_x) { testMatMulp1<3, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); testMatMulp1<10, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); testMatMulp1<10, 2, 50, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>(256, 1, 0); } TEST(MATH_UTILS, float_BasicMatrixMult_y) { testMatMulp1<3, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_Y, float>(16, 1, 0); testMatMulp1<10, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_Y, float>(16, 1, 0); testMatMulp1<10, 2, 50, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_Y, float>(256, 1, 0); } TEST(MATH_UTILS, float_BasicMatrixMult_z) { testMatMulp1<3, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_Z, float>(16, 1, 0); testMatMulp1<10, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_Z, float>(16, 1, 0); testMatMulp1<10, 2, 50, mm::MAT_OP::NONE, mm::MAT_OP::NONE, 
mp1::Parallel1Dir::THREAD_Z, float>(64, 1, 0); } TEST(MATH_UTILS, float_BasicMatrixMult_none) { testMatMulp1<3, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::NONE, float>(16, 1, 0); testMatMulp1<10, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::NONE, float>(16, 1, 0); testMatMulp1<10, 2, 50, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::NONE, float>(256, 1, 0); } TEST(MATH_UTILS, float_TransposeA_MatrixMult_x) { testMatMulp1<3, 2, 5, mm::MAT_OP::TRANSPOSE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); } TEST(MATH_UTILS, float_TransposeA_TransposeB_MatrixMult_x) { testMatMulp1<3, 2, 5, mm::MAT_OP::TRANSPOSE, mm::MAT_OP::TRANSPOSE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); } TEST(MATH_UTILS, float_TransposeB_MatrixMult_x) { testMatMulp1<3, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::TRANSPOSE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); } TEST(MATH_UTILS, int_BasicMatrixMult_x) { testMatMulp1<3, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_Z, int>(16, 1, 0); } TEST(MATH_UTILS, double_BasicMatrixMult_x) { testMatMulp1<3, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_Z, double>(16, 1, 0); } TEST(MATH_UTILS, float_MatrixMult_UnalignedK_x) { testMatMulp1<3, 17, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); testMatMulp1<3, 17, 5, mm::MAT_OP::TRANSPOSE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); testMatMulp1<3, 17, 5, mm::MAT_OP::NONE, mm::MAT_OP::TRANSPOSE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); testMatMulp1<3, 1, 4, mm::MAT_OP::TRANSPOSE, mm::MAT_OP::TRANSPOSE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); } TEST(MATH_UTILS, float_MatrixMult_alignedK2_x) { testMatMulp1<3, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); testMatMulp1<3, 2, 5, mm::MAT_OP::TRANSPOSE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); testMatMulp1<3, 2, 5, 
mm::MAT_OP::NONE, mm::MAT_OP::TRANSPOSE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); testMatMulp1<3, 2, 5, mm::MAT_OP::TRANSPOSE, mm::MAT_OP::TRANSPOSE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); } TEST(MATH_UTILS, float_MatrixMult_alignedK4_x) { testMatMulp1<3, 4, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); testMatMulp1<3, 4, 5, mm::MAT_OP::TRANSPOSE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); testMatMulp1<3, 4, 5, mm::MAT_OP::NONE, mm::MAT_OP::TRANSPOSE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); testMatMulp1<3, 4, 5, mm::MAT_OP::TRANSPOSE, mm::MAT_OP::TRANSPOSE, mp1::Parallel1Dir::THREAD_X, float>(16, 1, 0); } TEST(MATH_UTILS, float_BasicMatrixMult_alpha_z) { testMatMulp1<3, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_Z, float>(16, 3.0, 0); } TEST(MATH_UTILS, float_BasicMatrixMult_alpha_beta_z) { testMatMulp1<3, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_Z, float>(16, 3.0, 0.2); } TEST(MATH_UTILS, float_BasicMatrixMult_alpha_beta_x) { testMatMulp1<3, 2, 5, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>(5, 3.0, 0.2); } TEST(MATH_UTILS, HostMatrixMult) { const int M = 10; const int K = 5; const int N = 6; const mm::MAT_OP A_OP = mm::MAT_OP::NONE; const mm::MAT_OP B_OP = mm::MAT_OP::NONE; const mp1::Parallel1Dir P_DIR = mp1::Parallel1Dir::THREAD_X; using T = float; T alpha = 1.0; T beta = 0.0; testMatMulHost(alpha, beta); } TEST(MATH_UTILS, SpeedSharedMemMatrixMult) { const int NUM_ITERATIONS = 100; float2 total_ms = make_float2(0, 0); const int thread_dim = 1024; for (int i = 0; i < NUM_ITERATIONS; i++) { total_ms += testMatMulp1<128, 256, 128, mm::MAT_OP::TRANSPOSE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>( thread_dim, 3.0, 0.2); } std::cout << NUM_ITERATIONS << " GPU Shared Mem Matrix Multiplications took " << total_ms.y / NUM_ITERATIONS << " ms on average" << std::endl; std::cout << NUM_ITERATIONS << " 
CPU Shared Mem Matrix Multiplications took " << total_ms.x / NUM_ITERATIONS << " ms on average" << std::endl; } TEST(MATH_UTILS, SpeedLargeMatrixMult) { const int NUM_ITERATIONS = 5; float2 total_ms = make_float2(0, 0); const int thread_dim = 1024; for (int i = 0; i < NUM_ITERATIONS; i++) { total_ms += testMatMulp1<1024, 2048, 1024, mm::MAT_OP::TRANSPOSE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>( thread_dim, 3.0, 0.2); } std::cout << NUM_ITERATIONS << " GPU Matrix Multiplications took " << total_ms.y / NUM_ITERATIONS << " ms on average" << std::endl; std::cout << NUM_ITERATIONS << " CPU Matrix Multiplications took " << total_ms.x / NUM_ITERATIONS << " ms on average" << std::endl; } TEST(MATH_UTILS, SpeedLargeMatrixMultGlobal) { const int NUM_ITERATIONS = 10; float2 total_ms = make_float2(0, 0); const int thread_dim = 1024; for (int i = 0; i < NUM_ITERATIONS; i++) { total_ms += testMatMulp1<1024, 2048, 1024, mm::MAT_OP::TRANSPOSE, mm::MAT_OP::NONE, mp1::Parallel1Dir::GLOBAL_X, float>( thread_dim, 3.0, 0.2); } std::cout << NUM_ITERATIONS << " GPU Matrix Multiplications took " << total_ms.y / NUM_ITERATIONS << " ms on average" << std::endl; std::cout << NUM_ITERATIONS << " CPU Matrix Multiplications took " << total_ms.x / NUM_ITERATIONS << " ms on average" << std::endl; } TEST(MATH_UTILS, SpeedSmallMatrixMult) { const int NUM_ITERATIONS = 10; float2 total_ms = make_float2(0, 0); const int thread_dim = 16; for (int i = 0; i < NUM_ITERATIONS; i++) { total_ms += testMatMulp1<5, 4, 3, mm::MAT_OP::NONE, mm::MAT_OP::NONE, mp1::Parallel1Dir::THREAD_X, float>( thread_dim, 3.0, 0.2); } std::cout << NUM_ITERATIONS << " GPU Matrix Multiplications took " << total_ms.y / NUM_ITERATIONS << " ms on average" << std::endl; std::cout << NUM_ITERATIONS << " CPU Matrix Multiplications took " << total_ms.x / NUM_ITERATIONS << " ms on average" << std::endl; } template __global__ void GaussJordanKernel(T* A_d) { int batch_idx; if (PARALLELIZATION_DIR == 
mp1::Parallel1Dir::THREAD_X) { batch_idx = blockIdx.x; } else if (PARALLELIZATION_DIR == mp1::Parallel1Dir::THREAD_Y) { batch_idx = blockIdx.y; } else if (PARALLELIZATION_DIR == mp1::Parallel1Dir::THREAD_Z) { batch_idx = blockIdx.z; } extern __shared__ float A_shared_s[]; T* A_shared = (T*)A_shared_s; mp1::loadArrayParallel(A_shared, 0, A_d + batch_idx * M * N, 0); __syncthreads(); mm::GaussJordanElimination(A_shared); __syncthreads(); mp1::loadArrayParallel(A_d + batch_idx * M * N, 0, A_shared, 0); } TEST(MATH_UTILS, GaussJordanFactorizationAgainstInverse) { cudaStream_t stream; cudaStreamCreate(&stream); const int M = 3; const int N = 2 * M; Eigen::MatrixXf A = Eigen::MatrixXf::Zero(M, N); Eigen::MatrixXf A_result = A; Eigen::MatrixXf first_A; Eigen::MatrixXf A_inv; dim3 block_size(1, 1, 1); dim3 grid_size(1, 1, 1); int shared_mem_size = sizeof(float) * M * N; float* A_d; cudaMalloc((void**)&A_d, sizeof(float) * M * N); float max_error = 0.0f; Eigen::MatrixXf diff; first_A = Eigen::MatrixXf::Random(M, M); A_inv = first_A.inverse(); for (int tdx = 1; tdx < 64; tdx++) { A << first_A, Eigen::MatrixXf::Identity(M, M); cudaMemcpyAsync(A_d, A.data(), sizeof(float) * M * N, cudaMemcpyHostToDevice, stream); block_size.x = tdx; GaussJordanKernel<<>>(A_d); cudaMemcpyAsync(A_result.data(), A_d, sizeof(float) * M * N, cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); diff = A_result.block(0, M, M, N - M) - A_inv; max_error = diff.norm(); ASSERT_NEAR(max_error, 0.0f, sqrtf(1e-4 * M * N)) << "failed for " << tdx << " parallel x threads.\nDiff:\n" << diff << "\nCPU:\n" << A_inv << "\nGPU:\n" << A_result.block(0, M, M, N - M); } block_size = dim3(1, 1, 1); for (int tdy = 1; tdy < 64; tdy++) { // first_A = Eigen::MatrixXf::Random(M, M); A << first_A, Eigen::MatrixXf::Identity(M, M); // A_inv = first_A.inverse(); cudaMemcpyAsync(A_d, A.data(), sizeof(float) * M * N, cudaMemcpyHostToDevice, stream); block_size.y = tdy; GaussJordanKernel<<>>(A_d); 
cudaMemcpyAsync(A_result.data(), A_d, sizeof(float) * M * N, cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); diff = A_result.block(0, M, M, N - M) - A_inv; max_error = diff.lpNorm(); ASSERT_NEAR(max_error, 0.0f, 1e-4 * A_inv.norm()) << "failed for " << tdy << " parallel y threads.\nDiff:\n" << diff << "\nCPU:\n" << A_inv << "\nGPU:\n" << A_result.block(0, M, M, N - M); } block_size = dim3(1, 1, 1); for (int tdz = 1; tdz < 64; tdz++) { // first_A = Eigen::MatrixXf::Random(M, M); A << first_A, Eigen::MatrixXf::Identity(M, M); // A_inv = first_A.inverse(); cudaMemcpyAsync(A_d, A.data(), sizeof(float) * M * N, cudaMemcpyHostToDevice, stream); block_size.z = tdz; GaussJordanKernel<<>>(A_d); cudaMemcpyAsync(A_result.data(), A_d, sizeof(float) * M * N, cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); diff = A_result.block(0, M, M, N - M) - A_inv; max_error = diff.lpNorm(); ASSERT_NEAR(max_error, 0.0f, 1e-4 * A_inv.norm()) << "failed for " << tdz << " parallel z threads.\nDiff:\n" << diff << "\nCPU:\n" << A_inv << "\nGPU:\n" << A_result.block(0, M, M, N - M); } cudaFree(A_d); } TEST(MATH_UTILS, GaussJordanEliminationColumnSkip) { cudaStream_t stream; cudaStreamCreate(&stream); const int M = 2; const int N = 5; Eigen::MatrixXf A = Eigen::MatrixXf::Zero(M, N); Eigen::MatrixXf A_result = A; // A << 0, 2, 3, 1, 1, 4; A << 1, 2, 2, 3, 4, 2, 4, 4, 1, 8; Eigen::MatrixXf b_d = Eigen::MatrixXf::Zero(M, N - M); // b_d << 2.5, 1.5; b_d << 2, 0, 4, 0, 1, 0; dim3 block_size(1, 1, 1); dim3 grid_size(1, 1, 1); int shared_mem_size = sizeof(float) * M * N; float* A_d; cudaMalloc((void**)&A_d, sizeof(float) * M * N); float max_error = 0.0f; Eigen::MatrixXf diff; for (int tdx = 1; tdx < 65; tdx++) { cudaMemcpyAsync(A_d, A.data(), sizeof(float) * M * N, cudaMemcpyHostToDevice, stream); block_size.x = tdx; GaussJordanKernel<<>>(A_d); cudaMemcpyAsync(A_result.data(), A_d, sizeof(float) * M * N, cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); diff = 
A_result.block(0, M, M, N - M) - b_d; max_error = diff.norm(); ASSERT_NEAR(max_error, 0.0f, 1e-4 * M * N) << "failed for " << tdx << " parallel x threads.\nDiff:\n" << diff << "\nCPU:\n" << b_d << "\nGPU:\n" << A_result.block(0, M, M, N - M); } cudaFree(A_d); } TEST(MATH_UTILS, GaussJordanEliminationRowSwap) { cudaStream_t stream; cudaStreamCreate(&stream); const int M = 2; const int N = 3; Eigen::MatrixXf A = Eigen::MatrixXf::Zero(M, N); Eigen::MatrixXf A_result = A; A << 0, 2, 3, 1, 1, 4; Eigen::MatrixXf b_d = Eigen::MatrixXf::Zero(M, N - M); b_d << 2.5, 1.5; dim3 block_size(1, 1, 1); dim3 grid_size(1, 1, 1); int shared_mem_size = sizeof(float) * M * N; float* A_d; cudaMalloc((void**)&A_d, sizeof(float) * M * N); float max_error = 0.0f; Eigen::MatrixXf diff; for (int tdx = 1; tdx < 65; tdx++) { cudaMemcpyAsync(A_d, A.data(), sizeof(float) * M * N, cudaMemcpyHostToDevice, stream); block_size.x = tdx; GaussJordanKernel<<>>(A_d); cudaMemcpyAsync(A_result.data(), A_d, sizeof(float) * M * N, cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); diff = A_result.block(0, M, M, N - M) - b_d; max_error = diff.norm(); ASSERT_NEAR(max_error, 0.0f, 1e-4 * M * N) << "failed for " << tdx << " parallel x threads.\nDiff:\n" << diff << "\nCPU:\n" << b_d << "\nGPU:\n" << A_result.block(0, M, M, N - M); } cudaFree(A_d); } TEST(MATH_UTILS, GaussJordanFactorizationBatched) { cudaStream_t stream; cudaStreamCreate(&stream); const int M = 3; const int N = 2 * M; dim3 block_size(1, 1, 1); dim3 grid_size(1, 1, 1); int shared_mem_size = sizeof(double) * M * N; double* A_d = nullptr; // cudaMalloc((void**)&A_d, sizeof(double) * M * N); for (int batches = 1; batches < 1000; batches++) { std::vector A_batch(batches * M * N); std::vector A_cpu(batches * M * N); std::vector A_gpu(batches * M * N); if (A_d) { cudaFree(A_d); } cudaMalloc((void**)&A_d, sizeof(double) * batches * M * N); grid_size.x = batches; // Fill in A_batch for (int i = 0; i < batches; i++) { Eigen::Map 
A_batch_i(&A_batch.data()[i * M * N], M, N); Eigen::Map A_cpu_i(&A_cpu.data()[i * M * N], M, N); A_batch_i << Eigen::MatrixXd::Random(M, M) + Eigen::MatrixXd::Identity(M, M), Eigen::MatrixXd::Identity(M, M); A_cpu_i << Eigen::MatrixXd::Identity(M, M), A_batch_i.block(0, 0, M, M).inverse(); } for (int tdx = 1; tdx < 128; tdx++) { // Copy over to GPU and run GJ Elimination cudaMemcpyAsync(A_d, A_batch.data(), sizeof(double) * batches * M * N, cudaMemcpyHostToDevice, stream); block_size.x = tdx; GaussJordanKernel<<>>(A_d); cudaMemcpyAsync(A_gpu.data(), A_d, sizeof(double) * batches * M * N, cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); for (int i = 0; i < A_gpu.size(); i++) { // double tolerance = fabsf(A_cpu[i]) < 1 ? 1e-3 : 1e-3 * fabsf(A_cpu[i]); double tolerance = 0.1f; ASSERT_LT(fabsf(A_gpu[i] - A_cpu[i]), tolerance) << i % (M * N) << "th item in " << " batch: " << i / (M * N) << " out of " << batches << ", CPU: " << A_cpu[i] << ", GPU: " << A_gpu[i]; } } } cudaFree(A_d); } /** 2 Dimensional Parallelization Tests **/ template __global__ void gemm2d(const float* A, const float* B, float* C) { mm::gemm2(A, B, C); } TEST(MATH_UTILS, DISABLED_Matrix3fMultiplicationP2) { const int M = 300; const int K = 30; const int N = 3; const int max_thread_size = 1; typedef Eigen::Matrix A_mat; typedef Eigen::Matrix B_mat; typedef Eigen::Matrix C_mat; cudaEvent_t start; cudaEvent_t stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaStream_t stream; cudaStreamCreate(&stream); A_mat A = A_mat::Random(); B_mat B = B_mat::Random(); C_mat C_CPU = A * B; C_mat C_GPU; // std::cout << "C_CPU:\n" << C_CPU << std::endl; float* A_d; float* B_d; float* C_d; dim3 block(1, 1, 1); dim3 grid(1, 1, 1); HANDLE_ERROR(cudaMalloc((void**)&A_d, sizeof(float) * A.size())); HANDLE_ERROR(cudaMalloc((void**)&B_d, sizeof(float) * B.size())); HANDLE_ERROR(cudaMalloc((void**)&C_d, sizeof(float) * C_CPU.size())); auto inner_loop_before = [&](int size1, int size2, mp2::Parallel2Dir 
P_DIR) { A = A_mat::Random(); B = B_mat::Random(); C_CPU = A * B; block = dim3(1, 1, 1); if (P_DIR == mp2::Parallel2Dir::THREAD_XY) { block.x = size1; block.y = max(size2, 1); } else if (P_DIR == mp2::Parallel2Dir::THREAD_XZ) { block.x = size1; block.z = max(min(size2, 64), 1); } else if (P_DIR == mp2::Parallel2Dir::THREAD_YX) { block.y = size1; block.x = max(size2, 1); } else if (P_DIR == mp2::Parallel2Dir::THREAD_YZ) { block.y = size1; block.z = max(min(size2, 64), 1); } else if (P_DIR == mp2::Parallel2Dir::THREAD_ZY) { block.z = max(min(size1, 64), 1); block.y = max(size2, 1); } else if (P_DIR == mp2::Parallel2Dir::THREAD_ZX) { block.z = max(min(size1, 64), 1); block.x = max(size2, 1); } cudaEventRecord(start, stream); HANDLE_ERROR(cudaMemcpyAsync(A_d, A.data(), sizeof(float) * A.size(), cudaMemcpyHostToDevice, stream)); HANDLE_ERROR(cudaMemcpyAsync(B_d, B.data(), sizeof(float) * B.size(), cudaMemcpyHostToDevice, stream)); }; auto inner_loop_after = [&](int size1, int size2, mp2::Parallel2Dir P_DIR) { HANDLE_ERROR(cudaMemcpyAsync(C_GPU.data(), C_d, sizeof(float) * C_GPU.size(), cudaMemcpyDeviceToHost, stream)); HANDLE_ERROR(cudaStreamSynchronize(stream)); cudaEventRecord(stop, stream); float duration = 0; cudaEventElapsedTime(&duration, start, stop); // std::cout << "C_GPU " << " with " << block_x_size << " parallelization:\n" << C_GPU << std::endl; // std::cout << block_x_size << " parallelization diff " << duration << " ms:\n" << C_CPU - C_GPU << std::endl; double avg_err = (C_CPU - C_GPU).norm() / C_CPU.size(); // std::cout << block_x_size << " parallelization diff " << duration << " ms: " << avg_err << std::endl; // std::string dir_s = "none"; // if (P_DIR == mp2::Parallel2Dir::THREAD_XY) // { // dir_s = "xy"; // } else if (P_DIR == mp2::Parallel2Dir::THREAD_XZ) // { // dir_s = "xz"; // } else if (P_DIR == mp2::Parallel2Dir::THREAD_YX) // { // dir_s = "yx"; // } else if (P_DIR == mp2::Parallel2Dir::THREAD_YZ) // { // dir_s = "yz"; // } else if (P_DIR == 
mp2::Parallel2Dir::THREAD_ZY) // { // dir_s = "zy"; // } else if (P_DIR == mp2::Parallel2Dir::THREAD_ZX) // { // dir_s = "zx"; // } // printf("%d-%d-%d parallelization in %s-dir, time elapsed %f ms, diff: %f\n", block.x, block.y, block.z, // dir_s.c_str(), duration, avg_err); printf("%d-%d-%d parallelization, time elapsed %f ms, diff: %f\n", block.x, block.y, block.z, duration, avg_err); ASSERT_TRUE(avg_err < 1e-07); }; for (int block_size_1 = 1; block_size_1 <= max_thread_size; block_size_1++) { int block_size_2 = max_thread_size - block_size_1; inner_loop_before(block_size_1, block_size_2, mp2::Parallel2Dir::THREAD_XY); gemm2d<<>>(A_d, B_d, C_d); inner_loop_after(block_size_1, block_size_2, mp2::Parallel2Dir::THREAD_XY); } for (int block_size_1 = 1; block_size_1 <= max_thread_size; block_size_1++) { int block_size_2 = max_thread_size - block_size_1; inner_loop_before(block_size_1, block_size_2, mp2::Parallel2Dir::THREAD_XZ); gemm2d<<>>(A_d, B_d, C_d); inner_loop_after(block_size_1, block_size_2, mp2::Parallel2Dir::THREAD_XZ); } for (int block_size_1 = 1; block_size_1 <= max_thread_size; block_size_1++) { int block_size_2 = max_thread_size - block_size_1; inner_loop_before(block_size_1, block_size_2, mp2::Parallel2Dir::THREAD_YZ); gemm2d<<>>(A_d, B_d, C_d); inner_loop_after(block_size_1, block_size_2, mp2::Parallel2Dir::THREAD_YZ); // std::cout << "C_GPU:\n" << C_GPU << "\nC_CPU:\n" << C_CPU << std::endl; } cudaFree(A_d); cudaFree(B_d); cudaFree(C_d); } ================================================ FILE: tests/misc/CMakeLists.txt ================================================ # Double integrator dynamics tests set(TARGET_NAME miscellaneous_tests) add_executable(${TARGET_NAME} ${PROJECT_SOURCE_DIR}/tests/test_main.cpp miscellaneous_tests.cu) target_link_libraries(${TARGET_NAME} gtest gmock gtest_main double_integrator_mppi) set_target_properties(${TARGET_NAME} PROPERTIES FOLDER test) gtest_add_tests(TARGET miscellaneous_tests) 
================================================ FILE: tests/misc/di_dynamics_kernel_tests.cu ================================================ #include "di_dynamics_kernel_tests.cuh" __global__ void CheckModelSize(DoubleIntegratorDynamics* DI, long* model_size_check) { model_size_check[0] = sizeof(*DI); } ================================================ FILE: tests/misc/di_dynamics_kernel_tests.cuh ================================================ #pragma once #ifndef KERNEL_TESTS_DYNAMICS_DOUBLE_INTEGRATOR_DI_KERNEL_TEST_CUH_ #define KERNEL_TESTS_DYNAMICS_DOUBLE_INTEGRATOR_DI_KERNEL_TEST_CUH_ #include __global__ void CheckModelSize(DoubleIntegratorDynamics* DI, long* model_size_check); #include "di_dynamics_kernel_tests.cu" #endif //! KERNEL_TESTS_DYNAMICS_CARTPOLE_CARTPOLE_KERNEL_TEST_CUH_ ================================================ FILE: tests/misc/miscellaneous_tests.cu ================================================ // // Created by mgandhi3 on 3/4/20. // #include #include #include "di_dynamics_kernel_tests.cuh" #include #include #include TEST(Miscellaneous, CompareModelSize) { auto model = std::make_shared(); model->GPUSetup(); long* model_size_GPU_d; HANDLE_ERROR(cudaMalloc((void**)&model_size_GPU_d, sizeof(long))); CheckModelSize<<<1, 1>>>(model->model_d_, model_size_GPU_d); CudaCheckError(); long model_size_GPU; HANDLE_ERROR(cudaMemcpy(&model_size_GPU, model_size_GPU_d, sizeof(long), cudaMemcpyDeviceToHost)); ASSERT_EQ(sizeof(*model), model_size_GPU); std::cout << "Size of the shared pointer to the model:" << sizeof(model) << std::endl; // Should be the size of a pointer so 8 bytes for a 64 bit system std::cout << "Size of the model itself: " << sizeof(*model) << std::endl; // Should be bigger? std::cout << "Size of the model on the GPU: " << model_size_GPU << std::endl; std::cout << "Size of the control ranges in the model: " << sizeof(model->control_rngs_) << std::endl; // Should be 16 bytes ie. 4 floats! 
std::cout << "Size of the parameter structure of the model: " << sizeof(DoubleIntegratorParams) << std::endl; std::cout << "Size of the device pointer of the model: " << sizeof(model->model_d_) << std::endl; std::cout << "Size of the stream: " << sizeof(model->stream_) << std::endl; std::cout << "Size of GPU Memstatus: " << sizeof(model->GPUMemStatus_) << std::endl; } TEST(Miscellaneous, EigenNormalRandomVector) { std::random_device rd; std::mt19937 gen(rd()); // here you could also set a seed std::normal_distribution dis(1, 2); // generate a matrix expression Eigen::MatrixXd M = Eigen::MatrixXd::NullaryExpr(100, 100, [&]() { return dis(gen); }); EXPECT_NEAR(M.mean(), 1, 1e-1); EXPECT_NEAR(sqrtf((M.array() * M.array()).mean() - M.mean() * M.mean()), 2, 1e-1); } TEST(Miscellaneous, CreateRandomStateArray) { DoubleIntegratorDynamics::state_array X; DoubleIntegratorDynamics::state_array temp = DoubleIntegratorDynamics::state_array::Zero(); std::random_device rd; std::mt19937 gen(rd()); // here you could also set a seed std::normal_distribution dis(1, 2); // generate a matrix expression X = DoubleIntegratorDynamics::state_array::NullaryExpr([&]() { return dis(gen); }); std::cout << temp + X * 0.01 << std::endl; } TEST(Miscellaneous, Smoothing) { Eigen::Matrix filter_coefficients; filter_coefficients << -3, 12, 17, 12, -3; filter_coefficients /= 35.0; Eigen::Matrix control_buffer = Eigen::Matrix::Ones(); std::cout << "previous control buffer" << std::endl; std::cout << control_buffer << std::endl; Eigen::Matrix control_history; control_history << 1, 2, 3, 4, 5, 6; control_buffer.topRows<2>() = control_history; Eigen::Matrix nominal_control = 5 * Eigen::Matrix::Ones(); // Fill the last two timesteps with the end of the current nominal control trajectory nominal_control.col(9) << 10, 10, 10; control_buffer.middleRows(2, 10) = nominal_control.transpose(); control_buffer.row(10 + 2) = nominal_control.transpose().row(10 - 1); control_buffer.row(10 + 3) = 
nominal_control.transpose().row(10 - 1); std::cout << "current control buffer" << std::endl; std::cout << control_buffer << std::endl; // Apply convolutional filter to each timestep for (int i = 0; i < 10; ++i) { nominal_control.col(i) = (filter_coefficients * control_buffer.middleRows(i, 5)).transpose(); } std::cout << nominal_control << std::endl; } ================================================ FILE: tests/mppi_core/CCM_tests.cu ================================================ #include #include #include #include #include #include #include #include #include const int NUM_TIMESTEPS = 50; const int NUM_ROLLOUTS_CONST = 1024; // Might be simpler to create a new Controller CLass from RMPPI template class RMPPICCMDoubleIntegratorController : public RobustMPPIController, MAX_TIMESTEPS, NUM_ROLLOUTS, B_X, B_Y, RobustMPPIParams, S> { protected: std::mt19937 rng_gen_; std::normal_distribution control_dist_; ccm::LinearCCM CCM_feedback_controller_; public: typedef RobustMPPIController, MAX_TIMESTEPS, NUM_ROLLOUTS, B_X, B_Y, RobustMPPIParams, S> PARENT_CLASS; using control_array = typename PARENT_CLASS::control_array; using control_trajectory = typename PARENT_CLASS::control_trajectory; using state_array = typename PARENT_CLASS::state_array; // Constructor... 
Yeah It ain't pretty RMPPICCMDoubleIntegratorController( DYN_T* model, COST_T* cost, ccm::LinearCCM* fb_controller, float dt, float lambda, float alpha, float value_function_threshold, const Eigen::Ref& control_std_dev, int num_timesteps = MAX_TIMESTEPS, const Eigen::Ref& init_control_traj = control_trajectory::Zero(), int num_candidate_nominal_states = 9, int optimization_stride = 1, cudaStream_t stream = nullptr) : PARENT_CLASS(model, cost, fb_controller, dt, 1, lambda, alpha, value_function_threshold, control_std_dev, num_timesteps, init_control_traj, num_candidate_nominal_states, optimization_stride, stream) { control_dist_ = std::normal_distribution(0, 1); CCM_feedback_controller_ = ccm::LinearCCM(model); } void ptrToVec(const float* input, int num, std::vector& output) { output.assign(input, input + num); if (output.size() != num) { output.assign(num, 0.0); for (int i = 0; i < num; i++) { output[i] = input[i]; } } } void computeControl(const Eigen::Ref& state, int optimization_stride = 1) override { std::cout << "Candidate chosen: " << this->best_index_ << std::endl; // Rewrite computeControl using the CCM Rollout Kernel int c_dim = DYN_T::CONTROL_DIM; int s_dim = DYN_T::STATE_DIM; int single_control_traj_size = this->getNumTimesteps() * c_dim; int multi_control_traj_size = NUM_ROLLOUTS * single_control_traj_size; // Handy dandy pointers to nominal data float* trajectory_costs_nominal_d = this->trajectory_costs_d_ + NUM_ROLLOUTS; float* initial_state_nominal_d = this->initial_state_d_ + s_dim; float* control_noise_nominal_d = this->control_noise_d_ + multi_control_traj_size; float* control_nominal_d = this->control_d_ + single_control_traj_size; for (int opt_iter = 0; opt_iter < this->getNumIters(); opt_iter++) { // Create noise for trajectories std::vector control_noise_vec(multi_control_traj_size * 2, 0); for (int i = 0; i < multi_control_traj_size; i++) { control_noise_vec[i] = control_dist_(rng_gen_); control_noise_vec[multi_control_traj_size + i] = 
control_noise_vec[i]; } std::vector x_init_act_vec, x_init_nom_vec, u_traj_vec; std::vector control_std_dev_vec; ptrToVec(state.data(), s_dim, x_init_act_vec); ptrToVec(this->nominal_state_.data(), s_dim, x_init_nom_vec); ptrToVec(this->nominal_control_trajectory_.data(), single_control_traj_size, u_traj_vec); ptrToVec(this->getControlStdDev().data(), c_dim, control_std_dev_vec); // Launch rollout kernel using CCM // TODO pass in alpha std::array costs_act_CPU, costs_nom_CPU; launchRMPPIRolloutKernelCCMCPU( this->model_, this->cost_, &CCM_feedback_controller_, this->getDt(), this->getNumTimesteps(), optimization_stride, this->getLambda(), this->getAlpha(), this->getValueFunctionThreshold(), x_init_nom_vec, x_init_act_vec, control_std_dev_vec, u_traj_vec, control_noise_vec, costs_act_CPU, costs_nom_CPU); for (int i = 0; i < NUM_ROLLOUTS; i++) { this->trajectory_costs_(i) = costs_act_CPU[i]; this->trajectory_costs_nominal_(i) = costs_nom_CPU[i]; } // Control noise should be modified to contain u + noise this->setBaseline(mppi_common::computeBaselineCost(this->trajectory_costs_.data(), NUM_ROLLOUTS), 0); this->setBaseline(mppi_common::computeBaselineCost(this->trajectory_costs_nominal_.data(), NUM_ROLLOUTS), 1); // Copy data over to GPU HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_d_, this->trajectory_costs_.data(), NUM_ROLLOUTS * sizeof(float), cudaMemcpyHostToDevice, this->stream_)); HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_nominal_d, this->trajectory_costs_nominal_.data(), NUM_ROLLOUTS * sizeof(float), cudaMemcpyHostToDevice, this->stream_)); HANDLE_ERROR(cudaMemcpyAsync(this->control_noise_d_, control_noise_vec.data(), multi_control_traj_size * sizeof(float), cudaMemcpyHostToDevice, this->stream_)); HANDLE_ERROR(cudaMemcpyAsync(control_noise_nominal_d, control_noise_vec.data() + multi_control_traj_size, multi_control_traj_size * sizeof(float), cudaMemcpyHostToDevice, this->stream_)); // After rollout kernel, control_d_ and nominal_control_d are written 
to // and not read from so there is nothing to copy to them // HANDLE_ERROR(cudaMemcpyAsync(this->control_d_, // this->nominal_control_trajectory_.data(), // single_control_traj_size * sizeof(float), // cudaMemcpyHostToDevice, this->stream_)); // // TODO Not done in RMPPI RolloutKernel but I think it should be // HANDLE_ERROR(cudaMemcpyAsync(control_nominal_d, // this->nominal_control_trajectory_.data(), // single_control_traj_size * sizeof(float), // cudaMemcpyHostToDevice, this->stream_)); HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); // In this case this->gamma = 1 / lambda mppi_common::launchNormExpKernel(NUM_ROLLOUTS, B_X, this->trajectory_costs_d_, 1.0 / this->getLambda(), this->getBaselineCost(0), this->stream_); mppi_common::launchNormExpKernel(NUM_ROLLOUTS, B_X, trajectory_costs_nominal_d, 1.0 / this->getLambda(), this->getBaselineCost(1), this->stream_); HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_.data(), this->trajectory_costs_d_, NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_)); HANDLE_ERROR(cudaMemcpyAsync(this->trajectory_costs_nominal_.data(), trajectory_costs_nominal_d, NUM_ROLLOUTS * sizeof(float), cudaMemcpyDeviceToHost, this->stream_)); HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); this->setNormalizer(mppi_common::computeNormalizer(this->trajectory_costs_.data(), NUM_ROLLOUTS), 0); this->setNormalizer(mppi_common::computeNormalizer(this->trajectory_costs_nominal_.data(), NUM_ROLLOUTS), 1); // Compute real free energy mppi_common::computeFreeEnergy(this->free_energy_statistics_.real_sys.freeEnergyMean, this->free_energy_statistics_.real_sys.freeEnergyVariance, this->free_energy_statistics_.real_sys.freeEnergyModifiedVariance, this->trajectory_costs_.data(), NUM_ROLLOUTS, this->getBaselineCost(0), this->getLambda()); // Compute Nominal State free Energy mppi_common::computeFreeEnergy(this->free_energy_statistics_.nominal_sys.freeEnergyMean, this->free_energy_statistics_.nominal_sys.freeEnergyVariance, 
this->free_energy_statistics_.nominal_sys.freeEnergyModifiedVariance, this->trajectory_costs_nominal_.data(), NUM_ROLLOUTS, this->getBaselineCost(1), this->getLambda()); mppi_common::launchWeightedReductionKernel( this->trajectory_costs_d_, this->control_noise_d_, this->control_d_, this->getNormalizerCost(0), this->getNumTimesteps(), this->stream_); mppi_common::launchWeightedReductionKernel( trajectory_costs_nominal_d, control_noise_nominal_d, control_nominal_d, this->getNormalizerCost(1), this->getNumTimesteps(), this->stream_); // Transfer the new control to the host HANDLE_ERROR(cudaMemcpyAsync(this->control_.data(), this->control_d_, sizeof(float) * single_control_traj_size, cudaMemcpyDeviceToHost, this->stream_)); HANDLE_ERROR(cudaMemcpyAsync(this->nominal_control_trajectory_.data(), control_nominal_d, sizeof(float) * single_control_traj_size, cudaMemcpyDeviceToHost, this->stream_)); cudaStreamSynchronize(this->stream_); } this->computeStateTrajectoryHelper(this->nominal_state_trajectory_, this->nominal_state_, this->nominal_control_trajectory_); // Ugly hack for computeDF() method this->propagated_feedback_state_trajectory_.col(0) = state; this->free_energy_statistics_.real_sys.normalizerPercent = this->getNormalizerCost(0) / NUM_ROLLOUTS; this->free_energy_statistics_.real_sys.increase = this->getBaselineCost(0) - this->free_energy_statistics_.real_sys.previousBaseline; this->free_energy_statistics_.nominal_sys.normalizerPercent = this->getNormalizerCost(1) / NUM_ROLLOUTS; this->free_energy_statistics_.nominal_sys.increase = this->getBaselineCost(1) - this->free_energy_statistics_.nominal_sys.previousBaseline; } // Ugly hack for computeDF() method void setPropogatedFeedbackState(const Eigen::Ref& next_real_state) { this->propagated_feedback_state_trajectory_.col(1) = next_real_state; } void computeNominalFeedbackGains(const Eigen::Ref& state) override { } control_array getCCMFeedbackGains(const Eigen::Ref& x_act, const Eigen::Ref& x_nom, const Eigen::Ref& 
u_nom) { control_array fb_u = CCM_feedback_controller_.u_feedback(x_act, x_nom, u_nom); std::cout << "Act: " << x_act.transpose() << std::endl; std::cout << "nom: " << x_nom.transpose() << std::endl; std::cout << "U: " << u_nom.transpose() << std::endl; std::cout << "Feedback: " << fb_u.transpose() << std::endl; return fb_u; } }; bool tubeFailure(float* s) { float inner_path_radius2 = 1.675 * 1.675; float outer_path_radius2 = 2.325 * 2.325; float radial_position = s[0] * s[0] + s[1] * s[1]; if ((radial_position < inner_path_radius2) || (radial_position > outer_path_radius2)) { return true; } else { return false; } } void saveTraj(const Eigen::Ref>& traj, int t, std::vector& vec) { for (int i = 0; i < NUM_TIMESTEPS; i++) { for (int j = 0; j < DoubleIntegratorDynamics::STATE_DIM; j++) { vec[t * NUM_TIMESTEPS * DoubleIntegratorDynamics::STATE_DIM + i * DoubleIntegratorDynamics::STATE_DIM + j] = traj(j, i); } } } void saveState(const Eigen::Ref& state, int t, std::vector& vec) { for (int j = 0; j < DoubleIntegratorDynamics::STATE_DIM; j++) { vec[t * DoubleIntegratorDynamics::STATE_DIM + j] = state(j); } } TEST(CCMTest, CCMFeedbackTest) { using DYN = DoubleIntegratorDynamics; using COST = DoubleIntegratorCircleCost; DYN model(100); ccm::LinearCCM fb_controller(&model); float dt = 0.02; int mission_length = int(10 / dt); DYN::state_array x, x_nom, x_dot; x << 4, 0, 0, 1; x_nom << -3, 2, 0, 0; DYN::control_array current_control; float two_percent_settle_time = -1; for (int t = 0; t < mission_length; t++) { current_control = fb_controller.u_feedback(x, x_nom, DYN::control_array::Zero()); model.computeDynamics(x, current_control, x_dot); model.updateState(x, x_dot, dt); DYN::state_array abs_diff = x - x_nom; for (int i = 0; i < DYN::STATE_DIM; i++) { if (x_nom(i) >= 1) { abs_diff(i) /= x_nom(i); } } abs_diff = abs_diff.cwiseAbs(); if (abs_diff.block<2, 1>(0, 0).maxCoeff() < 0.02 && two_percent_settle_time < 0) { two_percent_settle_time = t * dt; } if (t % 5 == 0) { 
std::cout << "State at t = " << t * dt << ": " << x.transpose() << std::endl; } } std::cout << "2% settling time is " << two_percent_settle_time << " secs" << std::endl; } TEST(CCMTest, RMPPIRolloutKernel) { // GTEST_SKIP(); using DYN = DoubleIntegratorDynamics; using COST = DoubleIntegratorRobustCost; DYN model(100); COST cost; auto params = cost.getParams(); params.crash_cost = 100; cost.setParams(params); const int num_timesteps = NUM_TIMESTEPS; const int num_rollouts = NUM_ROLLOUTS_CONST; using CONTROLLER = RMPPICCMDoubleIntegratorController; const int state_dim = DYN::STATE_DIM; const int control_dim = DYN::CONTROL_DIM; const float dt = 0.02; // int max_iter = 10; float lambda = 4; float alpha = 0; float value_func_threshold = 20; const int mission_length = int(0.06 / dt); // Create a random number generator // Random number generator for system noise std::mt19937 gen; // Standard mersenne_twister_engine which will be seeded std::normal_distribution normal_distribution; gen.seed(7); // Seed the 7, so everyone gets the same noise normal_distribution = std::normal_distribution(0, 1); Eigen::Matrix universal_noise = Eigen::Matrix::Zero(); // Create the noise for all systems for (int t = 0; t < mission_length; ++t) { for (int i = 2; i < 4; ++i) { universal_noise(i, t) = normal_distribution(gen); } } // std::vector actual_trajectory_save(num_timesteps*mission_length*DYN::STATE_DIM); // std::vector nominal_trajectory_save(num_timesteps*mission_length*DYN::STATE_DIM); // Save actual trajectories, nominal_trajectory, free energy std::vector robust_rc_trajectory(DYN::STATE_DIM * mission_length, 0); std::vector robust_rc_nominal_traj(DYN::STATE_DIM * num_timesteps * mission_length, 0); std::vector robust_rc_nominal_free_energy(mission_length, 0); std::vector robust_rc_real_free_energy(mission_length, 0); std::vector robust_rc_nominal_free_energy_bound(mission_length, 0); std::vector robust_rc_real_free_energy_bound(mission_length, 0); std::vector 
robust_rc_real_free_energy_growth_bound(mission_length, 0); std::vector robust_rc_nominal_free_energy_growth(mission_length, 0); std::vector robust_rc_real_free_energy_growth(mission_length, 0); std::vector robust_rc_nominal_state_used(mission_length, 0); // std::string file_prefix = "/data/bvlahov3/RMPPI_CCM_control_trajectories_CoRL2020/FreeEnergyRMPPIData/"; std::string file_prefix = ""; CONTROLLER::control_array control_std_dev = CONTROLLER::control_array::Constant(1.0); CONTROLLER::control_trajectory u_traj_eigen = CONTROLLER::control_trajectory::Zero(); // Set first control to 1 across entire time u_traj_eigen.row(0) = CONTROLLER::control_trajectory::Constant(1.0).row(0); ccm::LinearCCM fb_controller(&model); CONTROLLER rmppi_controller = CONTROLLER(&model, &cost, &fb_controller, dt, lambda, alpha, value_func_threshold, control_std_dev, num_timesteps, u_traj_eigen); // float x[num_rollouts * state_dim * 2]; // float x_dot[num_rollouts * state_dim * 2]; // float u[num_rollouts * control_dim * 2]; // float du[num_rollouts * control_dim * 2]; // float sigma_u[control_dim] = {0.5, 0.05}; // variance to sample noise from // COST::control_matrix cost_variance = COST::control_matrix::Identity(); // for(int i = 0; i < control_dim; i++) { // cost_variance(i, i) = sigma_u[i]; // } // float fb_u[num_rollouts * control_dim * state_dim]; DYN::state_array x_init_act, x_dot; x_init_act << 2, 0, 0, 1; DYN::state_array x_init_nom; x_init_nom << 0, 0, 0.1, 0; // rmppi_controller.computeControl(x_init_act); DYN::state_array x = x_init_act; std::string act_traj_file_name; std::string nom_traj_file_name; std::string nom_free_energy_name; std::string act_free_energy_name; std::string nom_state_used_name; std::string act_free_energy_bound_name; std::string nom_free_energy_bound_name; std::string act_free_energy_growth_bound_name; std::string act_free_energy_growth_name; std::string nom_free_energy_growth_name; for (int t = 0; t < mission_length; t++) { act_traj_file_name = 
file_prefix + "robust_large_actual_traj_CCM_t_" + std::to_string(t) + ".npy"; nom_traj_file_name = file_prefix + "robust_large_nominal_traj_CCM_t_" + std::to_string(t) + ".npy"; nom_free_energy_name = file_prefix + "robust_large_nominal_free_energy_CCM_t_" + std::to_string(t) + ".npy"; act_free_energy_name = file_prefix + "robust_large_actual_free_energy_CCM_t_" + std::to_string(t) + ".npy"; nom_state_used_name = file_prefix + "robust_large_nominal_state_used_CCM_t_" + std::to_string(t) + ".npy"; act_free_energy_bound_name = file_prefix + "robust_large_actual_free_energy_bound_CCM_t_" + std::to_string(t) + ".npy"; nom_free_energy_bound_name = file_prefix + "robust_large_nominal_free_energy_bound_CCM_t_" + std::to_string(t) + ".npy"; act_free_energy_growth_bound_name = file_prefix + "robust_large_actual_free_energy_growth_bound_CCM_t_" + std::to_string(t) + ".npy"; act_free_energy_growth_name = file_prefix + "robust_large_actual_free_energy_growth_CCM_t_" + std::to_string(t) + ".npy"; nom_free_energy_growth_name = file_prefix + "robust_large_nominal_free_energy_growth_CCM_t_" + std::to_string(t) + ".npy"; // if (cost.computeStateCost(x) > 1000) { // std::cout << "State Cost is " << cost.computeStateCost(x) << std::endl; // std::cout << "State was " << x.transpose() << std::endl; // FAIL(); // } if (tubeFailure(x.data())) { // cnpy::npy_save(act_traj_file_name, actual_trajectory_save.data(), // {mission_length, num_timesteps, DYN::STATE_DIM},"w"); // cnpy::npy_save(nom_traj_file_name, nominal_trajectory_save.data(), // {mission_length, num_timesteps, DYN::STATE_DIM},"w"); // cnpy::npy_save(act_traj_file_name, actual_trajectory_save.data(), // {mission_length, num_timesteps, DYN::STATE_DIM},"w"); // cnpy::npy_save(nom_traj_file_name, nominal_trajectory_save.data(), // {mission_length, num_timesteps, DYN::STATE_DIM},"w"); cnpy::npy_save(act_traj_file_name, robust_rc_trajectory.data(), { mission_length, DoubleIntegratorDynamics::STATE_DIM }, "w"); 
cnpy::npy_save(nom_traj_file_name, robust_rc_nominal_traj.data(), { mission_length, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save(nom_free_energy_name, robust_rc_nominal_free_energy.data(), { mission_length }, "w"); cnpy::npy_save(act_free_energy_name, robust_rc_real_free_energy.data(), { mission_length }, "w"); cnpy::npy_save(nom_state_used_name, robust_rc_nominal_state_used.data(), { mission_length }, "w"); cnpy::npy_save(act_free_energy_bound_name, robust_rc_real_free_energy_bound.data(), { mission_length }, "w"); cnpy::npy_save(nom_free_energy_bound_name, robust_rc_nominal_free_energy_bound.data(), { mission_length }, "w"); cnpy::npy_save(act_free_energy_growth_bound_name, robust_rc_real_free_energy_growth_bound.data(), { mission_length }, "w"); cnpy::npy_save(act_free_energy_growth_name, robust_rc_real_free_energy_growth.data(), { mission_length }, "w"); cnpy::npy_save(nom_free_energy_growth_name, robust_rc_nominal_free_energy_growth.data(), { mission_length }, "w"); printf("Current Time: %f ", t * dt); model.printState(x.data()); std::cout << "\tCandidate Free Energies: " << rmppi_controller.getCandidateFreeEnergy().transpose() << std::endl; std::cout << "Tube failure!!" 
<< std::endl; FAIL() << "Visualize the trajectories by running scripts/double_integrator/plot_DI_test_trajectories; " "the argument to this python file is the build directory of MPPI-Generic"; } if (t % 2 == 0) { printf("Current Time: %5.2f ", t * dt); model.printState(x.data()); std::cout << "\tCandidate Free Energies: " << rmppi_controller.getCandidateFreeEnergy().transpose() << std::endl; } rmppi_controller.updateImportanceSamplingControl(x, 1); rmppi_controller.computeControl(x); auto nominal_trajectory = rmppi_controller.getTargetStateSeq(); auto fe_stat = rmppi_controller.getFreeEnergyStatistics(); // for (int i = 0; i < num_timesteps; i++) { // for (int j = 0; j < DYN::STATE_DIM; j++) { // actual_trajectory_save[t * num_timesteps * DYN::STATE_DIM + // i*DYN::STATE_DIM + j] = x(j); // nominal_trajectory_save[t * num_timesteps * DYN::STATE_DIM + // i*DYN::STATE_DIM + j] = nominal_trajectory(j, i); // } // } // Save everything saveState(x, t, robust_rc_trajectory); saveTraj(nominal_trajectory, t, robust_rc_nominal_traj); robust_rc_nominal_free_energy[t] = fe_stat.nominal_sys.freeEnergyMean; robust_rc_real_free_energy[t] = fe_stat.real_sys.freeEnergyMean; robust_rc_nominal_free_energy_bound[t] = value_func_threshold + 2 * fe_stat.nominal_sys.freeEnergyModifiedVariance; robust_rc_real_free_energy_bound[t] = fe_stat.nominal_sys.freeEnergyMean + cost.getLipshitzConstantCost() * 1 * (x - nominal_trajectory.col(0)).norm(); DYN::state_array x_nom = rmppi_controller.getTargetStateSeq().col(0); DYN::control_array current_control = rmppi_controller.getControlSeq().col(0); current_control += rmppi_controller.getCCMFeedbackGains(x, x_nom, current_control); model.computeDynamics(x, current_control, x_dot); model.updateState(x, x_dot, dt); rmppi_controller.setPropogatedFeedbackState(x); if (x.hasNaN()) { std::cout << "NANANANANA\n\n\n\nNANANANANAN" << std::endl; } robust_rc_real_free_energy_growth_bound[t] = (value_func_threshold - fe_stat.nominal_sys.freeEnergyMean) + 
cost.getLipshitzConstantCost() * 1 * rmppi_controller.computeDF() + 2 * fe_stat.nominal_sys.freeEnergyModifiedVariance; robust_rc_nominal_free_energy_growth[t] = fe_stat.nominal_sys.increase; robust_rc_real_free_energy_growth[t] = fe_stat.real_sys.increase; robust_rc_nominal_state_used[t] = fe_stat.nominal_state_used; x += universal_noise.col(t) * sqrt(model.getParams().system_noise) * dt; if (x.hasNaN()) { std::cout << "NOISEYNOISE\n\n\n\nNANANANANAN" << std::endl; } rmppi_controller.slideControlSequence(1); } // act_traj_file_name = file_prefix + "robust_large_actual_CCM_t_" + // std::to_string(mission_length - 1) + ".npy"; // nom_traj_file_name = file_prefix + "robust_large_nominal_CCM_t_" + // std::to_string(mission_length - 1) + ".npy"; // cnpy::npy_save(act_traj_file_name, actual_trajectory_save.data(), // {mission_length, num_timesteps, DYN::STATE_DIM},"w"); // cnpy::npy_save(nom_traj_file_name, nominal_trajectory_save.data(), // {mission_length, num_timesteps, DYN::STATE_DIM},"w"); cnpy::npy_save(act_traj_file_name, robust_rc_trajectory.data(), { mission_length, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save(nom_traj_file_name, robust_rc_nominal_traj.data(), { mission_length, num_timesteps, DoubleIntegratorDynamics::STATE_DIM }, "w"); cnpy::npy_save(nom_free_energy_name, robust_rc_nominal_free_energy.data(), { mission_length }, "w"); cnpy::npy_save(act_free_energy_name, robust_rc_real_free_energy.data(), { mission_length }, "w"); cnpy::npy_save(nom_state_used_name, robust_rc_nominal_state_used.data(), { mission_length }, "w"); cnpy::npy_save(act_free_energy_bound_name, robust_rc_real_free_energy_bound.data(), { mission_length }, "w"); cnpy::npy_save(nom_free_energy_bound_name, robust_rc_nominal_free_energy_bound.data(), { mission_length }, "w"); cnpy::npy_save(act_free_energy_growth_bound_name, robust_rc_real_free_energy_growth_bound.data(), { mission_length }, "w"); cnpy::npy_save(act_free_energy_growth_name, 
robust_rc_real_free_energy_growth.data(), { mission_length }, "w"); cnpy::npy_save(nom_free_energy_growth_name, robust_rc_nominal_free_energy_growth.data(), { mission_length }, "w"); } ================================================ FILE: tests/mppi_core/CMakeLists.txt ================================================ set(GTEST_LIBRARIES gtest gmock gtest_main) file(GLOB TARGET_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cu) list(REMOVE_ITEM TARGET_SRCS "CCM_tests.cu") foreach(T_FILE IN LISTS TARGET_SRCS) get_filename_component(T_NAME ${T_FILE} NAME_WE) add_executable(${T_NAME} ${PROJECT_SOURCE_DIR}/tests/test_main.cpp ${T_FILE}) target_link_libraries(${T_NAME} ${GTEST_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME}) gtest_discover_tests(${T_NAME}) set_target_properties(${T_NAME} PROPERTIES FOLDER test) endforeach() ================================================ FILE: tests/mppi_core/base_plant_tester.cu ================================================ // // Created by jgibson37 on 2/24/20. // #include #include #include #include #include #include #include #include #include const float DT = 0.05; template class TestPlant : public BasePlant { public: double time_ = 0.0; int pubControlCalled = 0; int pubNominalStateCalled = 0; using c_array = typename CONTROLLER_T::control_array; using c_traj = typename CONTROLLER_T::control_trajectory; using s_array = typename CONTROLLER_T::state_array; using s_traj = typename CONTROLLER_T::state_trajectory; using DYN_T = typename CONTROLLER_T::TEMPLATED_DYNAMICS; using DYN_PARAMS_T = typename DYN_T::DYN_PARAMS_T; using COST_T = typename CONTROLLER_T::TEMPLATED_COSTS; using COST_PARAMS_T = typename COST_T::COST_PARAMS_T; double timestamp_; double loop_speed_; TestPlant(std::shared_ptr controller, int hz = 20, int opt_stride = 1) : BasePlant(controller, hz, opt_stride) { } void pubControl(const c_array& u) override { pubControlCalled++; } void pubNominalState(const s_array& s) override { pubNominalStateCalled++; } void 
pubFreeEnergyStatistics(MPPIFreeEnergyStatistics& fe_stats) override { } void incrementTime() { time_ += DT; } void incrementTime(double dt) { time_ += dt; } int checkStatus() override { return 1; } double getCurrentTime() { return time_ + 0.3421; } double getPoseTime() { return time_; } // accessors for protected members int getNumIter() { return this->num_iter_; } double getLastUsedPoseUpdateTime() { return this->last_used_state_update_time_; } int getStatus() { return this->status_; } bool getDebugMode() { return this->debug_mode_; } double getOptimizationDuration() { return this->optimization_duration_; } double getOptimizationAvg() { return this->avg_optimize_time_ms_; } double getOptimizeLoopDuration() { return this->optimize_loop_duration_; } double getLoopAvg() { return this->avg_loop_time_ms_; } double getFeedbackDuration() { return this->feedback_duration_; } double getFeedbackAvg() { return this->avg_feedback_time_ms_; } void setLastTime(double time) { time_ = time; } void setLastUsedTime(double time) { this->last_used_state_update_time_ = time; } double getSleepTimeAvg() { return this->avg_sleep_time_ms_; } }; typedef TestPlant MockTestPlant; class BasePlantTest : public ::testing::Test { protected: void SetUp() override { EXPECT_CALL(mockCost, getParams()).Times(1); EXPECT_CALL(mockDynamics, getParams()).Times(1); EXPECT_CALL(mockDynamics, freeCudaMem()).Times(1); EXPECT_CALL(mockCost, freeCudaMem()).Times(1); EXPECT_CALL(mockSampler, freeCudaMem()).Times(1); EXPECT_CALL(mockFeedback, freeCudaMem()).Times(1); mockController = std::make_shared(); mockController->setDt(DT); EXPECT_CALL(*mockController, getDt()).WillRepeatedly(testing::Return(DT)); mockController->cost_ = &mockCost; mockController->model_ = &mockDynamics; mockController->sampler_ = &mockSampler; mockController->fb_controller_ = &mockFeedback; plant = std::make_shared(mockController); } void TearDown() override { } MockDynamics mockDynamics; MockCost mockCost; MockFeedback mockFeedback; 
MockSamplingDistribution mockSampler; std::shared_ptr mockController; std::shared_ptr plant; const float SMALL_TIME_MS = 8; }; TEST_F(BasePlantTest, Constructor) { EXPECT_EQ(plant->controller_, mockController); EXPECT_EQ(plant->getHz(), 20); EXPECT_EQ(plant->getTargetOptimizationStride(), 1); EXPECT_EQ(plant->getNumIter(), 0); EXPECT_EQ(plant->getLastUsedPoseUpdateTime(), 0); EXPECT_EQ(plant->getStatus(), 1); EXPECT_EQ(mockController->getFeedbackEnabled(), false); EXPECT_EQ(plant->hasNewCostParams(), false); EXPECT_EQ(plant->hasNewDynamicsParams(), false); } TEST_F(BasePlantTest, getAndSetState) { // check initial state is zerod MockController::state_array state = plant->getState(); for (int i = 0; i < 1; i++) { EXPECT_EQ(state(i), 0.0); } MockController::state_array new_state; for (int i = 0; i < 1; i++) { new_state(i) = i; } plant->setState(new_state); state = plant->getState(); for (int i = 0; i < 1; i++) { EXPECT_EQ(state(i), i); } } TEST_F(BasePlantTest, getSetOptimizationStride) { int optimization_stride = plant->getTargetOptimizationStride(); EXPECT_EQ(optimization_stride, 1); plant->setTargetOptimizationStride(5); optimization_stride = plant->getTargetOptimizationStride(); EXPECT_EQ(optimization_stride, 5); } TEST_F(BasePlantTest, getSetDynamicsParams) { EXPECT_EQ(plant->hasNewDynamicsParams(), false); MockTestPlant::DYN_PARAMS_T params; params.test = 3; plant->setDynamicsParams(params); EXPECT_EQ(plant->hasNewDynamicsParams(), true); MockTestPlant::DYN_PARAMS_T new_params = plant->getNewDynamicsParams(); EXPECT_EQ(plant->hasNewDynamicsParams(), false); EXPECT_EQ(new_params.test, params.test); } TEST_F(BasePlantTest, getSetCostParams) { EXPECT_EQ(plant->hasNewCostParams(), false); MockTestPlant::COST_PARAMS_T params; params.test = 100; plant->setCostParams(params); EXPECT_EQ(plant->hasNewCostParams(), true); auto new_params = plant->getNewCostParams(); EXPECT_EQ(plant->hasNewCostParams(), false); EXPECT_EQ(params.test, new_params.test); } 
TEST_F(BasePlantTest, updateParametersAllFalse)
{
  // No pending params -> neither setParams should be forwarded.
  EXPECT_CALL(mockCost, setParams(testing::_)).Times(0);
  EXPECT_CALL(mockDynamics, setParams(testing::_)).Times(0);
  plant->updateParameters();
}

TEST_F(BasePlantTest, updateParametersAllTrue)
{
  // Pending dynamics and cost params -> both setParams forwarded once.
  EXPECT_CALL(mockCost, setParams(testing::_)).Times(1);
  EXPECT_CALL(mockDynamics, setParams(testing::_)).Times(1);
  plant->setDebugMode(true);
  plant->setDynamicsParams(MockDynamics::DYN_PARAMS_T());
  plant->setCostParams(MockCost::COST_PARAMS_T());
  plant->updateParameters();
}

TEST_F(BasePlantTest, updateStateOutsideTimeTest)
{
  plant->setLastTime(0);
  EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)).Times(0);
  MockController::state_array state = MockController::state_array::Zero();
  // A state stamped beyond the planning horizon must be stored but not acted on.
  plant->updateState(state, mockController->getDt() * mockController->getNumTimesteps() + 0.01);
  EXPECT_EQ(plant->getState(), state);

  // A state stamped before the last known time is likewise ignored for control.
  plant->setLastTime(100);
  plant->updateState(state, 99.99);
  EXPECT_EQ(plant->getState(), state);
  EXPECT_EQ(plant->pubControlCalled, 0);
  EXPECT_EQ(plant->pubNominalStateCalled, 0);
}

TEST_F(BasePlantTest, updateStateTest)
{
  plant->setLastTime(0);
  MockController::state_array state = MockController::state_array::Zero();
  // EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_,
  //             testing::_)).Times(0);
  plant->updateState(state, 12);
  EXPECT_EQ(plant->pubControlCalled, 0);
  EXPECT_EQ(plant->pubNominalStateCalled, 0);

  // Calling updateState() should not pub controls when none have been calculated as of yet
  EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)).Times(0);
  plant->setLastUsedTime(11);
  plant->updateState(state, 12);
  EXPECT_EQ(plant->getState(), state);
  EXPECT_EQ(plant->pubControlCalled, 0);
  EXPECT_EQ(plant->pubNominalStateCalled, 0);
  // TODO in debug should pub nominal state
}

TEST_F(BasePlantTest, pubControlOnlyAfterControlAreCalculatedTest)
{
  ::testing::Sequence s1;
  // Step 1: calling updateState() before controls are calculated should not
  // call controller->getCurrentControl()
  MockController::state_array state = MockController::state_array::Zero();
  EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_))
      .Times(0)
      .InSequence(s1);
  double curr_time = 0.0;
  plant->updateState(state, curr_time);
  EXPECT_EQ(plant->pubControlCalled, 0);
  EXPECT_EQ(plant->pubNominalStateCalled, 0);
  // ::testing::Mock::VerifyAndClearExpectations(mockController.get());

  // Step 2: run control iteration inside plant
  // Create valid outputs from gmock methods to prevent nan detection from triggering
  MockController::control_trajectory valid_control_seq =
      MockController::control_trajectory::Zero(MockDynamics::CONTROL_DIM, NUM_TIMESTEPS);
  MockController::state_trajectory valid_state_seq =
      MockController::state_trajectory::Zero(MockDynamics::STATE_DIM, NUM_TIMESTEPS);
  EXPECT_CALL(*mockController, computeControl(testing::_, testing::_)).Times(1).InSequence(s1);
  EXPECT_CALL(*mockController, getControlSeq()).Times(1).WillRepeatedly(testing::Return(valid_control_seq));
  EXPECT_CALL(*mockController, getTargetStateSeq()).Times(1).WillRepeatedly(testing::Return(valid_state_seq));
  // EXPECT_CALL(*mockController, getTargetOutputSeq()).Times(1);
  std::atomic<bool> is_alive(true);
  plant->runControlIteration(&is_alive);

  // Step 3: calling updateState() now should use controller->getCurrentControl()
  EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_))
      .Times(1)
      .InSequence(s1);
  curr_time++;
  plant->updateState(state, curr_time);
  EXPECT_EQ(plant->pubControlCalled, 1);
  EXPECT_EQ(plant->pubNominalStateCalled, 0);
}

TEST_F(BasePlantTest, EnsureReceivingStateCompletesRunControlIterationTest)
{
  std::atomic<bool> is_alive(true);
  // Create valid outputs from gmock methods to prevent nan detection from triggering
  MockController::control_trajectory valid_control_seq =
MockController::control_trajectory::Zero(MockDynamics::CONTROL_DIM, NUM_TIMESTEPS); MockController::state_trajectory valid_state_seq = MockController::state_trajectory::Zero(MockDynamics::STATE_DIM, NUM_TIMESTEPS); EXPECT_CALL(*mockController, computeControl(testing::_, testing::_)).Times(1); EXPECT_CALL(*mockController, getControlSeq()).Times(1).WillRepeatedly(testing::Return(valid_control_seq)); EXPECT_CALL(*mockController, getTargetStateSeq()).Times(1).WillRepeatedly(testing::Return(valid_state_seq)); // EXPECT_CALL(*mockController, getTargetOutputSeq()).Times(1); std::thread new_thread(&MockTestPlant::runControlIteration, plant.get(), &is_alive); // Wait some period of time and then call updateState() std::cout << "Wait for new state" << std::endl; std::this_thread::sleep_for(std::chrono::milliseconds(300)); EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)).Times(0); MockController::state_array state = MockController::state_array::Zero(); double curr_time = 0.0; plant->updateState(state, curr_time); std::cout << "State sent" << std::endl; std::this_thread::sleep_for(std::chrono::milliseconds(100)); // is_alive.store(false); new_thread.join(); } TEST_F(BasePlantTest, runControlIterationStoppedTest) { EXPECT_CALL(*mockController, slideControlSequence(testing::_)).Times(0); EXPECT_CALL(*mockController, computeControl(testing::_, testing::_)).Times(0); std::atomic is_alive(false); plant->runControlIteration(&is_alive); } // TODO speed up to make tests run faster TEST_F(BasePlantTest, runControlIterationDebugFalseNoFeedbackTest) { double init_time = 100; plant->setLastTime(init_time); MockController::state_array s = MockController::state_array::Zero(); for (int i = 0; i < 2; i++) { double wait_ms = 50 * i; auto wait_function = [wait_ms](const Eigen::Ref& state, int optimization_stride = 0) { usleep(wait_ms * 1e3); }; int expect_opt_stride = i > 0 ? 
1 : 0; MockDynamics::control_array control = MockDynamics::control_array::Zero(); if (i > 0) { EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)) .Times(1) .WillRepeatedly(testing::Return(control)); // called by update state should not be called since time since // last // opt should be zero on first iteration, should be called on second } else { EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)) .Times(0); } EXPECT_CALL(*mockController, slideControlSequence(expect_opt_stride)).Times(i > 0 ? 1 : 0); EXPECT_CALL(*mockController, computeControl(testing::_, testing::_)) .Times(1) .WillRepeatedly(testing::Invoke(wait_function)); MockController::control_trajectory control_seq = MockController::control_trajectory::Zero(); EXPECT_CALL(*mockController, getControlSeq()).Times(1).WillRepeatedly(testing::Return(control_seq)); MockController::state_trajectory state_seq = MockController::state_trajectory::Zero(); EXPECT_CALL(*mockController, getTargetStateSeq()).Times(1).WillRepeatedly(testing::Return(state_seq)); EXPECT_CALL(*mockController, computeFeedback(testing::_)).Times(0); EXPECT_CALL(*mockController, getFeedbackControl(testing::_, testing::_, testing::_)).Times(0); EXPECT_CALL(*mockController, computeFeedbackPropagatedStateSeq()).Times(1); EXPECT_CALL(*mockController, calculateSampledStateTrajectories()).Times(0); EXPECT_EQ(plant->getDebugMode(), false); std::atomic is_alive(true); plant->updateState(s, init_time + i * DT); plant->runControlIteration(&is_alive); plant->incrementTime(); EXPECT_EQ(plant->checkStatus(), 1); EXPECT_EQ(plant->getStateTraj(), state_seq); EXPECT_EQ(plant->getControlTraj(), control_seq); MockController::TEMPLATED_FEEDBACK_STATE feedback = plant->getFeedbackState(); MockController::state_array state = MockController::state_array::Ones(); for (int j = 0; j < 100; j++) { // TODO check that feedback is correct // auto result = 
feedback[i] * state; // float sum = feedback[i] * state; // EXPECT_FLOAT_EQ(result(0), 0); } // check last pose update EXPECT_FLOAT_EQ(plant->getLastUsedPoseUpdateTime(), DT * i + init_time); EXPECT_EQ(plant->getNumIter(), i + 1); EXPECT_EQ(plant->getLastOptimizationStride(), expect_opt_stride); EXPECT_THAT(plant->getOptimizationDuration(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); EXPECT_LT(plant->getOptimizationAvg(), wait_ms + SMALL_TIME_MS); EXPECT_THAT(plant->getOptimizeLoopDuration(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); EXPECT_LT(plant->getLoopAvg(), wait_ms + SMALL_TIME_MS); EXPECT_LE(plant->getFeedbackDuration(), SMALL_TIME_MS); EXPECT_LE(plant->getFeedbackAvg(), SMALL_TIME_MS); } } TEST_F(BasePlantTest, runControlIterationDebugFalseFeedbackTest) { EXPECT_CALL(mockFeedback, initTrackingController()).Times(1); mockController->initFeedback(); double init_time = 51789; plant->setLastTime(init_time); MockController::state_array s = MockController::state_array::Zero(); for (int i = 0; i < 10; i++) { double wait_ms = 50 * i; auto wait_function = [wait_ms](const Eigen::Ref& state, int optimization_stride = 0) { usleep(wait_ms * 1e3); }; int expect_opt_stride = i > 0 ? 1 : 0; MockDynamics::control_array control = MockDynamics::control_array::Zero(); if (i > 0) { EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)) .Times(1) .WillRepeatedly(testing::Return(control)); // called by update state should not be called since time since // last // opt should be zero on first iteration, should be called on rest } else { EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)) .Times(0); } EXPECT_CALL(*mockController, slideControlSequence(expect_opt_stride)).Times(i > 0 ? 
1 : 0); EXPECT_CALL(*mockController, computeControl(testing::_, testing::_)) .Times(1) .WillRepeatedly(testing::Invoke(wait_function)); MockController::control_trajectory control_seq = MockController::control_trajectory::Zero(); EXPECT_CALL(*mockController, getControlSeq()).Times(1).WillRepeatedly(testing::Return(control_seq)); MockController::state_trajectory state_seq = MockController::state_trajectory::Zero(); EXPECT_CALL(*mockController, getTargetStateSeq()).Times(1).WillRepeatedly(testing::Return(state_seq)); EXPECT_CALL(*mockController, computeFeedback(testing::_)).Times(1).WillRepeatedly(testing::Invoke(wait_function)); MockController::TEMPLATED_FEEDBACK_STATE feedback; EXPECT_CALL(*mockController, getFeedbackState()).Times(1).WillRepeatedly(testing::Return(feedback)); EXPECT_CALL(*mockController, computeFeedbackPropagatedStateSeq()).Times(1); EXPECT_CALL(*mockController, calculateSampledStateTrajectories()).Times(0); EXPECT_EQ(plant->getDebugMode(), false); std::atomic is_alive(true); plant->updateState(s, init_time + i * DT); plant->runControlIteration(&is_alive); plant->incrementTime(); EXPECT_EQ(plant->checkStatus(), 1); EXPECT_EQ(plant->getStateTraj(), state_seq); EXPECT_EQ(plant->getControlTraj(), control_seq); // EXPECT_EQ(plant->getFeedbackState(), feedback); // check last pose update EXPECT_FLOAT_EQ(plant->getLastUsedPoseUpdateTime(), DT * i + init_time); EXPECT_EQ(plant->getNumIter(), i + 1); EXPECT_EQ(plant->getLastOptimizationStride(), expect_opt_stride); EXPECT_THAT(plant->getOptimizationDuration(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); EXPECT_LT(plant->getOptimizationAvg(), wait_ms + SMALL_TIME_MS); EXPECT_THAT(plant->getFeedbackDuration(), testing::AllOf(testing::Ge(wait_ms), testing::Le((wait_ms + SMALL_TIME_MS) * 2))); // TODO should be range as well EXPECT_LT(plant->getFeedbackAvg(), wait_ms + SMALL_TIME_MS); EXPECT_THAT(plant->getOptimizeLoopDuration(), testing::AllOf(testing::Ge(wait_ms * 2), 
testing::Le((wait_ms + SMALL_TIME_MS) * 2)));
    EXPECT_LT(plant->getLoopAvg(), (wait_ms + SMALL_TIME_MS) * 2);
  }
}

TEST_F(BasePlantTest, runControlIterationDebugFalseFeedbackAvgTest)
{
  EXPECT_CALL(mockFeedback, initTrackingController()).Times(1);
  mockController->initFeedback();

  double init_time = 51531;
  plant->setLastTime(init_time);
  MockController::state_array s = MockController::state_array::Zero();
  for (int i = 0; i < 10; i++)
  {
    // Constant delay every iteration so the running averages converge to it.
    double wait_ms = 50;
    // NOTE(review): Eigen::Ref template argument reconstructed (stripped by
    // extraction) -- confirm against upstream.
    auto wait_function = [wait_ms](const Eigen::Ref<const MockController::state_array>& state,
                                   int optimization_stride = 0) { usleep(wait_ms * 1e3); };
    int expect_opt_stride = i > 0 ? 1 : 0;
    MockDynamics::control_array control = MockDynamics::control_array::Zero();
    if (i > 0)
    {
      EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_))
          .Times(1)
          .WillRepeatedly(testing::Return(control));
      // called by update state should not be called since time since last
      // opt should be zero on first iteration, should be called on rest
    }
    else
    {
      EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_))
          .Times(0);
    }
    EXPECT_CALL(*mockController, slideControlSequence(expect_opt_stride)).Times(i > 0 ? 1 : 0);
    EXPECT_CALL(*mockController, computeControl(testing::_, testing::_))
        .Times(1)
        .WillRepeatedly(testing::Invoke(wait_function));
    MockController::control_trajectory control_seq = MockController::control_trajectory::Zero();
    EXPECT_CALL(*mockController, getControlSeq()).Times(1).WillRepeatedly(testing::Return(control_seq));
    MockController::state_trajectory state_seq = MockController::state_trajectory::Zero();
    EXPECT_CALL(*mockController, getTargetStateSeq()).Times(1).WillRepeatedly(testing::Return(state_seq));
    EXPECT_CALL(*mockController, computeFeedback(testing::_)).Times(1).WillRepeatedly(testing::Invoke(wait_function));
    MockController::TEMPLATED_FEEDBACK_STATE feedback;
    EXPECT_CALL(*mockController, getFeedbackState()).Times(1).WillRepeatedly(testing::Return(feedback));
    EXPECT_CALL(*mockController, computeFeedbackPropagatedStateSeq()).Times(1);
    EXPECT_CALL(*mockController, calculateSampledStateTrajectories()).Times(0);
    EXPECT_EQ(plant->getDebugMode(), false);

    std::atomic<bool> is_alive(true);
    plant->updateState(s, init_time + i * DT);
    plant->runControlIteration(&is_alive);
    plant->incrementTime();

    EXPECT_EQ(plant->checkStatus(), 1);
    EXPECT_EQ(plant->getStateTraj(), state_seq);
    EXPECT_EQ(plant->getControlTraj(), control_seq);
    // EXPECT_EQ(plant->getFeedbackState(), feedback);

    // check last pose update
    EXPECT_FLOAT_EQ(plant->getLastUsedPoseUpdateTime(), DT * i + init_time);
    EXPECT_EQ(plant->getNumIter(), i + 1);
    EXPECT_EQ(plant->getLastOptimizationStride(), expect_opt_stride);
    EXPECT_THAT(plant->getOptimizationDuration(),
                testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS)));
    EXPECT_THAT(plant->getOptimizationAvg(),
                testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS)));
    EXPECT_THAT(plant->getOptimizeLoopDuration(),
                testing::AllOf(testing::Ge(wait_ms * 2), testing::Le((wait_ms + SMALL_TIME_MS) * 2)));
    EXPECT_THAT(plant->getLoopAvg(),
                testing::AllOf(testing::Ge((wait_ms)*2), testing::Le((wait_ms + SMALL_TIME_MS) * 2)));
EXPECT_THAT(plant->getFeedbackDuration(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); EXPECT_THAT(plant->getFeedbackAvg(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); } } TEST_F(BasePlantTest, runControlLoopRegular) { EXPECT_CALL(mockFeedback, initTrackingController()).Times(1); mockController->initFeedback(); int hz = plant->getHz(); double test_duration = 1.0; // in seconds for how long to run the test // setup mock expected calls EXPECT_CALL(mockCost, setParams(testing::_)).Times(0); EXPECT_CALL(mockDynamics, setParams(testing::_)).Times(0); EXPECT_CALL(*mockController, resetControls()).Times(1); double wait_s = (1.0 / hz) / 3; // divide by 3 since wait is evenly split across computeFeedback, computeControl, and waiting auto wait_function = [wait_s](const Eigen::Ref& state, int optimization_stride = 0) { usleep(wait_s * 1e6); }; int iterations = int(std::round((hz * 1.0) / (test_duration))); // number of times the method will be called MockDynamics::control_array control = MockDynamics::control_array::Zero(); EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)) .Times(46) .WillRepeatedly(testing::Return(control)); // called by update state should not be called since time since last // slide control sequence is skipped on the first iteration EXPECT_CALL(*mockController, slideControlSequence(1)).Times(iterations / 2 - 1); EXPECT_CALL(*mockController, computeControl(testing::_, testing::_)) .Times(iterations / 2) .WillRepeatedly(testing::Invoke(wait_function)); MockController::control_trajectory control_seq = MockController::control_trajectory::Zero(); EXPECT_CALL(*mockController, getControlSeq()).Times(iterations / 2).WillRepeatedly(testing::Return(control_seq)); MockController::state_trajectory state_seq = MockController::state_trajectory::Zero(); EXPECT_CALL(*mockController, getTargetStateSeq()).Times(iterations / 
2).WillRepeatedly(testing::Return(state_seq)); EXPECT_CALL(*mockController, computeFeedback(testing::_)) .Times(iterations / 2) .WillRepeatedly(testing::Invoke(wait_function)); MockController::TEMPLATED_FEEDBACK_STATE feedback; EXPECT_CALL(*mockController, getFeedbackState()).Times(iterations / 2).WillRepeatedly(testing::Return(feedback)); EXPECT_CALL(*mockController, computeFeedbackPropagatedStateSeq()).Times(iterations / 2); EXPECT_CALL(*mockController, calculateSampledStateTrajectories()).Times(0); int init_time = 78; plant->setLastTime(init_time); MockDynamics::state_array state = MockDynamics::state_array::Zero(); plant->updateState(state, init_time); std::atomic is_alive(true); std::thread optimizer(&MockTestPlant::runControlLoop, plant.get(), &is_alive); std::chrono::steady_clock::time_point loop_start = std::chrono::steady_clock::now(); std::chrono::duration loop_duration = std::chrono::steady_clock::now() - loop_start; // counter is number of dts for (int counter = 0; loop_duration.count() < test_duration * 1e3; counter++) { // wait until the correct hz has passed to tick the time // state at 100 Hz while (loop_duration.count() < (test_duration / 100) * 1e3 * counter) { usleep(50); loop_duration = std::chrono::steady_clock::now() - loop_start; } if (counter / 5 > iterations / 2) { // this forces it to block plant->incrementTime(0.01); plant->updateState(state, plant->getPoseTime()); } } is_alive.store(false); optimizer.join(); // check all the things EXPECT_EQ(plant->checkStatus(), 1); EXPECT_EQ(plant->getStateTraj(), state_seq); EXPECT_EQ(plant->getControlTraj(), control_seq); // EXPECT_EQ(plant->getFeedbackState(), feedback); // check last pose update EXPECT_NE(plant->getLastUsedPoseUpdateTime(), 0.0); EXPECT_EQ(plant->getNumIter(), iterations / 2); EXPECT_EQ(plant->getLastOptimizationStride(), 1); double wait_ms = wait_s * 1e3; EXPECT_THAT(plant->getOptimizationDuration(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); 
EXPECT_THAT(plant->getOptimizationAvg(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); EXPECT_THAT(plant->getOptimizeLoopDuration(), testing::AllOf(testing::Ge(wait_ms * 2), testing::Le(wait_ms * 2 + SMALL_TIME_MS))); EXPECT_THAT(plant->getLoopAvg(), testing::AllOf(testing::Ge(wait_ms * 2), testing::Le(wait_ms * 2 + SMALL_TIME_MS))); EXPECT_THAT(plant->getFeedbackDuration(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); EXPECT_THAT(plant->getFeedbackAvg(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); // 10 iters of just waiting, 10 iters of waiting for correct time double expected_avg_wait = ((wait_ms * 3 * 10) + wait_ms * 10) / 10; EXPECT_THAT(plant->getSleepTimeAvg(), testing::AllOf(testing::Gt(expected_avg_wait), testing::Le(expected_avg_wait + SMALL_TIME_MS * 4))); } TEST_F(BasePlantTest, runControlLoopSlowed) { mockController->initFeedback(); int hz = plant->getHz(); double test_duration = 1.0; // in seconds for how long to run the test MockTestPlant::COST_PARAMS_T cost_params; MockTestPlant::DYN_PARAMS_T dyn_params; double wait_s = (1.0 / hz); auto wait_function = [wait_s](const Eigen::Ref& state, int optimization_stride = 0) { usleep(wait_s * 1e6); }; int iterations = int(std::round((hz * 1.0) / (test_duration))); // number of times the method will be called // setup mock expected calls EXPECT_CALL(mockCost, setParams(testing::_)).Times(testing::AtLeast(54)); // 50 from waiting, 5 from run control // iteration EXPECT_CALL(mockDynamics, setParams(testing::_)).Times(testing::AtLeast(54)); EXPECT_CALL(*mockController, resetControls()).Times(1); int expected_iters = iterations / 4 + 1; // half do not count because of waiting, other half since we are slowed, // plus one for init call MockDynamics::control_array control = MockDynamics::control_array::Zero(); EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)) 
.Times(46) .WillRepeatedly(testing::Return(control)); // called by update state should not be called since time since last // slide control sequence is skipped on the first iteration EXPECT_CALL(*mockController, slideControlSequence(1)).Times(1); EXPECT_CALL(*mockController, slideControlSequence(2)).Times(expected_iters - 2); EXPECT_CALL(*mockController, computeControl(testing::_, testing::_)) .Times(expected_iters) .WillRepeatedly(testing::Invoke(wait_function)); MockController::control_trajectory control_seq = MockController::control_trajectory::Zero(); EXPECT_CALL(*mockController, getControlSeq()).Times(expected_iters).WillRepeatedly(testing::Return(control_seq)); MockController::state_trajectory state_seq = MockController::state_trajectory::Zero(); EXPECT_CALL(*mockController, getTargetStateSeq()).Times(expected_iters).WillRepeatedly(testing::Return(state_seq)); EXPECT_CALL(*mockController, computeFeedback(testing::_)) .Times(expected_iters) .WillRepeatedly(testing::Invoke(wait_function)); MockController::TEMPLATED_FEEDBACK_STATE feedback; EXPECT_CALL(*mockController, getFeedbackState()).Times(expected_iters).WillRepeatedly(testing::Return(feedback)); EXPECT_CALL(*mockController, computeFeedbackPropagatedStateSeq()).Times(expected_iters); EXPECT_CALL(*mockController, calculateSampledStateTrajectories()).Times(0); int init_time = 78; plant->setLastTime(init_time); MockDynamics::state_array state = MockDynamics::state_array::Zero(); plant->updateState(state, init_time); std::atomic is_alive(true); std::thread optimizer(&MockTestPlant::runControlLoop, plant.get(), &is_alive); std::chrono::steady_clock::time_point loop_start = std::chrono::steady_clock::now(); std::chrono::duration loop_duration = std::chrono::steady_clock::now() - loop_start; // counter is number of dts for (int counter = 0; loop_duration.count() < test_duration * 1e3; counter++) { // wait until the correct hz has passed to tick the time // state at 100 Hz while (loop_duration.count() < 
(test_duration / 100) * 1e3 * counter) { usleep(50); loop_duration = std::chrono::steady_clock::now() - loop_start; } if (counter / 5 > iterations / 2) { // this forces it to block plant->incrementTime(0.01); plant->updateState(state, plant->getPoseTime()); } plant->setCostParams(cost_params); plant->setDynamicsParams(dyn_params); } is_alive.store(false); optimizer.join(); // check all the things EXPECT_EQ(plant->checkStatus(), 1); EXPECT_EQ(plant->getStateTraj(), state_seq); EXPECT_EQ(plant->getControlTraj(), control_seq); // EXPECT_EQ(plant->getFeedbackState(), feedback); // check last pose update EXPECT_NE(plant->getLastUsedPoseUpdateTime(), 0.0); EXPECT_EQ(plant->getNumIter(), expected_iters); EXPECT_EQ(plant->getLastOptimizationStride(), 2); double wait_ms = wait_s * 1e3; EXPECT_THAT(plant->getOptimizationDuration(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); EXPECT_THAT(plant->getOptimizationAvg(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); EXPECT_THAT(plant->getOptimizeLoopDuration(), testing::AllOf(testing::Ge(wait_ms * 2), testing::Le(wait_ms * 2 + SMALL_TIME_MS))); EXPECT_THAT(plant->getLoopAvg(), testing::AllOf(testing::Ge(wait_ms * 2), testing::Le(wait_ms * 2 + SMALL_TIME_MS))); EXPECT_THAT(plant->getFeedbackDuration(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); EXPECT_THAT(plant->getFeedbackAvg(), testing::AllOf(testing::Ge(wait_ms), testing::Le(wait_ms + SMALL_TIME_MS))); // 10 iters of just waiting double expected_avg_wait = ((wait_ms * 2 * 10)) / 6; // EXPECT_THAT(plant->getSleepTimeAvg(), testing::AllOf(testing::Gt(expected_avg_wait - SMALL_TIME_MS), // testing::Le(expected_avg_wait + SMALL_TIME_MS * 4))); } ================================================ FILE: tests/mppi_core/buffered_plant_tester.cu ================================================ #include #include #include #include #include #include #include #include #include #include #include 
#include

// buffered_plant_tester.cu -- unit tests for BufferedPlant.
// NOTE(review): extraction stripped the contents of every angle-bracket pair in this
// chunk (template arguments and #include headers); tokens are kept verbatim below.

const double precision = 1e-6;
// NOTE(review): self-initialization reads an indeterminate value -- almost certainly
// extraction damage (the original likely initialized DT from a real constant);
// restore the value from the upstream repo.
const float DT = DT;

// Test harness around BufferedPlant that exposes its protected buffers and timing
// members, uses no-op publishers, and drives the pose clock manually via time_.
template
class TestPlant : public BufferedPlant
{
public:
  double time_ = 0.0;  // manually advanced "pose" clock, see incrementTime()
  double avgDurationMs_ = 0;
  double avgTickDuration_ = 0;
  double avgSleepTime_ = 0;
  using c_array = typename CONTROLLER_T::control_array;
  using c_traj = typename CONTROLLER_T::control_trajectory;
  using s_array = typename CONTROLLER_T::state_array;
  using s_traj = typename CONTROLLER_T::state_trajectory;
  using DYN_T = typename CONTROLLER_T::TEMPLATED_DYNAMICS;
  using DYN_PARAMS_T = typename DYN_T::DYN_PARAMS_T;
  using COST_T = typename CONTROLLER_T::TEMPLATED_COSTS;
  using COST_PARAMS_T = typename COST_T::COST_PARAMS_T;
  double timestamp_;
  double loop_speed_;
  using buffer_trajectory = typename BufferedPlant::buffer_trajectory;

  // Pins buffer horizon/tau/dt to 2.0/0.2/0.02 (ignoring buffer_time_horizon) and
  // pushes buffer_dt_ into the controller.
  TestPlant(std::shared_ptr controller, double buffer_time_horizon = 2.0, int hz = 20, int opt_stride = 1)
    : BufferedPlant(controller, hz, opt_stride)
  {
    this->buffer_time_horizon_ = 2.0;
    this->buffer_tau_ = 0.2;
    this->buffer_dt_ = 0.02;
    controller->setDt(this->buffer_dt_);
  }

  // Publishers are no-ops in the test harness.
  void pubControl(const c_array& u) override
  {
  }
  void pubNominalState(const s_array& s) override
  {
  }
  void pubFreeEnergyStatistics(MPPIFreeEnergyStatistics& fe_stats) override
  {
  }
  // Advance the fake pose clock by one DT.
  void incrementTime()
  {
    time_ += DT;
  }
  int checkStatus() override
  {
    return 1;
  }
  // Wall-clock time in seconds since the epoch.
  double getCurrentTime() override
  {
    auto current_time = std::chrono::system_clock::now();
    auto duration_in_seconds = std::chrono::duration(current_time.time_since_epoch());
    return duration_in_seconds.count();
  }
  double getPoseTime() override
  {
    return time_;
  }
  void setStateTimeToPoseTime()
  {
    this->state_time_ = time_;
  }
  // accessors for protected members
  std::list> getPrevPositionList()
  {
    return this->buffer_.getPrevPositionList();
  }
  std::list> getPrevQuaternionList()
  {
    return this->buffer_.getPrevQuaternionList();
  }
  std::list> getPrevVelocityList()
  {
    return this->buffer_.getPrevVelocityList();
  }
  std::list> getPrevOmegaList()
  {
    return this->buffer_.getPrevOmegaList();
  }
  std::list> getPrevControlList()
  {
    return this->buffer_.getPrevControlList();
  }
  std::map>> getPrevExtraList()
  {
    return this->buffer_.getPrevExtraList();
  }
  double getBufferTimeHorizon()
  {
    return this->buffer_time_horizon_;
  }
  double getBufferTau()
  {
    return this->buffer_tau_;
  }
  void setLastUsedUpdateTime(double time)
  {
    this->last_used_state_update_time_ = time;
  }
};

typedef TestPlant MockTestPlant;

// Fixture wiring mock dynamics/cost/feedback/sampler into a mock controller and plant.
class BufferedPlantTest : public ::testing::Test
{
protected:
  void SetUp() override
  {
    mockController = std::make_shared();
    // EXPECT_CALL(*mockController, getDt()).Times(1);
    EXPECT_CALL(*mockController, getDt()).WillRepeatedly(testing::Return(DT));
    mockController->cost_ = &mockCost;
    mockController->model_ = &mockDynamics;
    mockController->fb_controller_ = &mockFeedback;
    mockController->sampler_ = &mockSamplingDistribution;
    EXPECT_CALL(*mockController->cost_, getParams()).Times(1);
    EXPECT_CALL(*mockController->model_, getParams()).Times(1);
    EXPECT_CALL(mockDynamics, freeCudaMem()).Times(1);
    EXPECT_CALL(mockCost, freeCudaMem()).Times(1);
    EXPECT_CALL(mockSamplingDistribution, freeCudaMem()).Times(1);
    EXPECT_CALL(mockFeedback, freeCudaMem()).Times(1);
    plant = std::make_shared(mockController);
    generator = std::default_random_engine(7.0);
    distribution = std::normal_distribution(0.0, 100.0);
  }

  void TearDown() override
  {
    plant = nullptr;
    mockController = nullptr;
  }

  MockSamplingDistribution mockSamplingDistribution;
  MockDynamics mockDynamics;
  MockCost mockCost;
  MockFeedback mockFeedback;
  std::shared_ptr mockController;
  std::shared_ptr plant;
  std::default_random_engine generator;
  std::normal_distribution distribution;
};

// A fresh plant starts with empty buffers and the harness's horizon/tau/dt values.
TEST_F(BufferedPlantTest, Constructor)
{
  auto prev_pos = plant->getPrevPositionList();
  EXPECT_EQ(prev_pos.size(), 0);
  auto prev_quat = plant->getPrevQuaternionList();
  EXPECT_EQ(prev_quat.size(), 0);
  auto prev_vel = plant->getPrevVelocityList();
  EXPECT_EQ(prev_vel.size(), 0);
  auto prev_omega = plant->getPrevOmegaList();
  EXPECT_EQ(prev_omega.size(), 0);
  auto prev_control = plant->getPrevControlList();
  EXPECT_EQ(prev_control.size(), 0);
  auto prev_extra = plant->getPrevExtraList();
  // NOTE(review): re-checks prev_control instead of prev_extra -- likely an
  // upstream typo; prev_extra.size() is never actually asserted
  EXPECT_EQ(prev_control.size(), 0);
  EXPECT_FLOAT_EQ(plant->getBufferTimeHorizon(), 2.0);
  EXPECT_FLOAT_EQ(plant->getBufferTau(), 0.2);
  EXPECT_FLOAT_EQ(plant->getBufferDt(), 0.02);
}

TEST_F(BufferedPlantTest, setBufferDt)
{
  double new_buffer_dt = 30.0;
  plant->setBufferDt(new_buffer_dt);
  EXPECT_FLOAT_EQ(plant->getBufferDt(), new_buffer_dt);
}

// Querying past the last odometry sample should clamp to the newest values.
TEST_F(BufferedPlantTest, interpNew)
{
  Eigen::Vector3f pos = Eigen::Vector3f::Ones();
  Eigen::Quaternionf quat = Eigen::Quaternionf::Identity();
  Eigen::Vector3f vel = Eigen::Vector3f::Random();
  Eigen::Vector3f omega = Eigen::Vector3f::Random();
  MockDynamics::state_array state = MockDynamics::state_array::Random();
  EXPECT_CALL(mockDynamics, stateFromMap(testing::_)).Times(2).WillRepeatedly(testing::Return(state));
  // Controls never calculated so no calls to controller in updateState()
  EXPECT_CALL(*mockController, getDt()).Times(0);
  EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)).Times(0);
  plant->setLastUsedUpdateTime(0);
  plant->updateOdometry(pos, quat, vel, omega, 0.0);
  plant->updateOdometry(pos, quat, vel, omega, 1.0);
  std::map result_state = plant->getInterpState(10);
  EXPECT_FLOAT_EQ(result_state["OMEGA_X"], omega.x());
  EXPECT_FLOAT_EQ(result_state["OMEGA_Y"], omega.y());
  EXPECT_FLOAT_EQ(result_state["OMEGA_Z"], omega.z());
}

// Interleaved inserts keep the control buffer time-sorted; updateParameters()
// trims entries that fall outside the buffer window once the pose time advances.
TEST_F(BufferedPlantTest, updateControls)
{
  MockDynamics::control_array u = MockDynamics::control_array::Zero();
  auto prev_control = plant->getPrevControlList();
  for (int i = 0; i < 20; i++)
  {
    u = MockDynamics::control_array::Ones() * 0.2 * i;
    plant->updateControls(u, 0.2 * i);
    prev_control = plant->getPrevControlList();
    EXPECT_EQ(prev_control.size(), i + 1);
  }
  for (int i = 0; i < 20; i++)
  {
    u = MockDynamics::control_array::Ones() * (0.2 * i + 0.1);
    plant->updateControls(u, 0.2 * i + 0.1);
    prev_control = plant->getPrevControlList();
    EXPECT_EQ(prev_control.size(), i + 21);
  }
  prev_control = plant->getPrevControlList();
  EXPECT_EQ(prev_control.size(), 40);
  double time = 0;
  for (auto it = prev_control.begin(); it != prev_control.end(); it++, time += 0.1)
  {
    EXPECT_FLOAT_EQ(it->time, time);
    for (int i = 0; i < MockDynamics::CONTROL_DIM; i++)
    {
      EXPECT_FLOAT_EQ(it->data(i), time);
    }
  }
  plant->time_ = 4.0;
  plant->setStateTimeToPoseTime();
  plant->updateParameters();
  prev_control = plant->getPrevControlList();
  EXPECT_EQ(prev_control.size(), 20);
}

// Insertion with random timestamps must still yield a time-sorted buffer.
TEST_F(BufferedPlantTest, updateControlsRandom)
{
  MockDynamics::control_array u = MockDynamics::control_array::Random();
  auto prev_control = plant->getPrevControlList();
  for (int i = 0; i < 1000; i++)
  {
    plant->updateControls(u, distribution(generator));
  }
  prev_control = plant->getPrevControlList();
  EXPECT_EQ(prev_control.size(), 1000);
  std::vector times(1000);
  int index = 0;
  for (auto it = prev_control.begin(); it != prev_control.end(); it++, index++)
  {
    times[index] = it->time;
  }
  EXPECT_TRUE(std::is_sorted(times.begin(), times.end()));
}

// TEST_F(BufferedPlantTest, updateControlsInterp)
// {
//   MockDynamics::control_array u = MockDynamics::control_array::Zero();
//   std::list> prev_control = plant->getPrevControlList();
//
//   for (int i = 0; i < 21; i++)
//   {
//     u = MockDynamics::control_array::Ones() * 0.2 * i;
//     plant->updateControls(u, 0.2 * i);
//     prev_control = plant->getPrevControlList();
//     EXPECT_EQ(prev_control.size(), i + 1);
//   }
//   prev_control = plant->getPrevControlList();
//   EXPECT_EQ(prev_control.size(), 21);
//
//   for (double t = -2.0; t < 6.0; t += 0.01)
//   {
//     MockDynamics::control_array u_interp = MockTestPlant::interp(prev_control, t);
//     if (t < 0)
//     {
//       for (int i = 0; i < MockDynamics::CONTROL_DIM; i++)
//       {
//         EXPECT_NEAR(u_interp(i), 0, precision) << "at time " << t;
//       }
//     }
//     else if (t > 4.0)
//     {
//       for (int i = 0; i < MockDynamics::CONTROL_DIM; i++)
//       {
//         EXPECT_NEAR(u_interp(i), 4.0, precision) << "at time " << t;
//       }
//     }
//     else
//     {
//       for (int i = 0; i < MockDynamics::CONTROL_DIM; i++)
//       {
//         EXPECT_NEAR(u_interp(i), t, precision) << "at time " << t;
//       }
//     }
//   }
// }

// Extra named scalar channels grow per key, stay time-sorted, and are trimmed by cleanBuffers().
TEST_F(BufferedPlantTest, extraValues)
{
  auto extra_info = plant->getPrevExtraList();
  for (int i = 0; i < 20; i++)
  {
    plant->updateExtraValue("steering_angle", 0.2 * i, 0.2 * i);
    plant->updateExtraValue("steering_vel", 0.2 * i, 0.2 * i);
    extra_info = plant->getPrevExtraList();
    EXPECT_EQ(extra_info.size(), 2);
    EXPECT_EQ(extra_info["steering_angle"].size(), i + 1);
    EXPECT_EQ(extra_info["steering_vel"].size(), i + 1);
  }
  for (int i = 0; i < 20; i++)
  {
    plant->updateExtraValue("steering_angle", 0.2 * i + 0.1, 0.2 * i + 0.1);
    plant->updateExtraValue("steering_vel", 0.2 * i + 0.1, 0.2 * i + 0.1);
    extra_info = plant->getPrevExtraList();
    EXPECT_EQ(extra_info.size(), 2);
    EXPECT_EQ(extra_info["steering_angle"].size(), i + 21);
    EXPECT_EQ(extra_info["steering_vel"].size(), i + 21);
  }
  extra_info = plant->getPrevExtraList();
  EXPECT_EQ(extra_info.size(), 2);
  EXPECT_EQ(extra_info["steering_angle"].size(), 40);
  EXPECT_EQ(extra_info["steering_vel"].size(), 40);
  for (auto list_it = extra_info.begin(); list_it != extra_info.end(); list_it++)
  {
    double time = 0;
    for (auto it = list_it->second.begin(); it != list_it->second.end(); it++, time += 0.1)
    {
      EXPECT_FLOAT_EQ(it->time, time);
      for (int i = 0; i < MockDynamics::CONTROL_DIM; i++)
      {
        EXPECT_FLOAT_EQ(it->data, time);
      }
    }
  }
  plant->cleanBuffers(4.0);
  extra_info = plant->getPrevExtraList();
  EXPECT_EQ(extra_info.size(), 2);
  EXPECT_EQ(extra_info["steering_angle"].size(), 20);
  EXPECT_EQ(extra_info["steering_vel"].size(), 20);
}

// Odometry samples accumulate in the buffers and interpolate between the two
// stored poses; getInterpState(0.5) should sit halfway between the samples.
TEST_F(BufferedPlantTest, updateOdometry)
{
  Eigen::Vector3f pos = Eigen::Vector3f::Ones();
  Eigen::Quaternionf quat = Eigen::Quaternionf::Identity();
  Eigen::Vector3f vel = Eigen::Vector3f::Ones();
  Eigen::Vector3f omega = Eigen::Vector3f::Ones();
  MockDynamics::state_array state = MockDynamics::state_array::Random();
  EXPECT_CALL(mockDynamics, stateFromMap(testing::_)).Times(2).WillRepeatedly(testing::Return(state));
  // Controls never calculated so no calls to controller in updateState()
  EXPECT_CALL(*mockController, getDt()).Times(0);
  EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)).Times(0);
  plant->setLastUsedUpdateTime(0.0);
  plant->updateOdometry(pos, quat, vel, omega, 0.0);
  auto prev_pos = plant->getPrevPositionList();
  auto prev_quat = plant->getPrevQuaternionList();
  auto prev_vel = plant->getPrevVelocityList();
  auto prev_omega = plant->getPrevOmegaList();
  EXPECT_EQ(prev_pos.size(), 1);
  EXPECT_EQ(prev_quat.size(), 1);
  EXPECT_EQ(prev_vel.size(), 1);
  EXPECT_EQ(prev_omega.size(), 1);
  MockDynamics::state_array result_state = plant->getState();
  EXPECT_LT((state - result_state).norm(), 1e-8);
  pos = Eigen::Vector3f::Ones() * 3;
  vel = Eigen::Vector3f::Ones() * 4;
  omega = Eigen::Vector3f::Ones() * 5;
  quat = Eigen::AngleAxisf(M_PI_2, Eigen::Vector3f::UnitZ());
  plant->updateOdometry(pos, quat, vel, omega, 1.0);
  prev_pos = plant->getPrevPositionList();
  prev_quat = plant->getPrevQuaternionList();
  prev_vel = plant->getPrevVelocityList();
  prev_omega = plant->getPrevOmegaList();
  EXPECT_EQ(prev_pos.size(), 2);
  EXPECT_EQ(prev_quat.size(), 2);
  EXPECT_EQ(prev_vel.size(), 2);
  EXPECT_EQ(prev_omega.size(), 2);
  result_state = plant->getState();
  EXPECT_LT((state - result_state).norm(), 1e-8);
  // midpoint: linear for pos/vel/omega; the quaternion values are cos/sin(22.5 deg),
  // i.e. half of the 90 deg rotation about Z
  std::map interp = plant->getInterpState(0.5);
  EXPECT_FLOAT_EQ(interp.at("POS_X"), 2);
  EXPECT_FLOAT_EQ(interp.at("POS_Y"), 2);
  EXPECT_FLOAT_EQ(interp.at("POS_Z"), 2);
  EXPECT_FLOAT_EQ(interp.at("Q_W"), 0.92387962);
  EXPECT_FLOAT_EQ(interp.at("Q_X"), 0.0);
  EXPECT_FLOAT_EQ(interp.at("Q_Y"), 0.0);
  EXPECT_FLOAT_EQ(interp.at("Q_Z"), 0.3826834);
  EXPECT_FLOAT_EQ(interp.at("VEL_X"), 2.5);
  EXPECT_FLOAT_EQ(interp.at("VEL_Y"), 2.5);
  EXPECT_FLOAT_EQ(interp.at("VEL_Z"), 2.5);
  EXPECT_FLOAT_EQ(interp.at("OMEGA_X"), 3);
  EXPECT_FLOAT_EQ(interp.at("OMEGA_Y"), 3);
  EXPECT_FLOAT_EQ(interp.at("OMEGA_Z"), 3);
}

// Full interpolated-state map: pose, orientation (quaternion + Euler), velocity,
// omega, control, and extra channels are all interpolated at the midpoint.
TEST_F(BufferedPlantTest, getInterpState)
{
  Eigen::Vector3f pos = Eigen::Vector3f::Ones();
  Eigen::Quaternionf quat = Eigen::Quaternionf::Identity();
  Eigen::Vector3f vel = Eigen::Vector3f::Ones();
  Eigen::Vector3f omega = Eigen::Vector3f::Ones();
  MockDynamics::control_array u = MockDynamics::control_array::Ones();
  MockDynamics::state_array state = MockDynamics::state_array::Random();
  plant->setLastUsedUpdateTime(0.0);
  EXPECT_CALL(mockDynamics, stateFromMap(testing::_)).Times(2).WillRepeatedly(testing::Return(state));
  // Controls never calculated so no calls to controller in updateState()
  EXPECT_CALL(*mockController, getDt()).Times(0);
  EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)).Times(0);
  plant->updateOdometry(pos, quat, vel, omega, 0.0);
  plant->updateControls(u, 0.0);
  plant->updateExtraValue("steering_angle", 1, 0);
  plant->updateExtraValue("steering_vel", 1, 0);
  pos = Eigen::Vector3f::Ones() * 2;
  vel = Eigen::Vector3f::Ones() * 2;
  omega = Eigen::Vector3f::Ones() * 2;
  quat = Eigen::AngleAxisf(M_PI_2f32, Eigen::Vector3f::UnitZ());
  u = MockDynamics::control_array::Ones() * 2;
  plant->updateOdometry(pos, quat, vel, omega, 1.0);
  plant->updateControls(u, 1.0);
  plant->updateExtraValue("steering_angle", 2, 1.0);
  plant->updateExtraValue("steering_vel", 2, 1.0);
  std::map map = plant->getInterpState(0.5);
  EXPECT_EQ(map.size(), 19);
  EXPECT_FLOAT_EQ(map.at("POS_X"), 1.5);
  EXPECT_FLOAT_EQ(map.at("POS_Y"), 1.5);
  EXPECT_FLOAT_EQ(map.at("POS_Z"), 1.5);
  EXPECT_FLOAT_EQ(map.at("Q_W"), 0.92387962);
  EXPECT_FLOAT_EQ(map.at("Q_X"), 0.0);
  EXPECT_FLOAT_EQ(map.at("Q_Y"), 0.0);
  EXPECT_FLOAT_EQ(map.at("Q_Z"), 0.3826834);
  EXPECT_FLOAT_EQ(map.at("ROLL"), 0);
  EXPECT_FLOAT_EQ(map.at("PITCH"), 0);
  EXPECT_FLOAT_EQ(map.at("YAW"), M_PI_4f32);
  EXPECT_FLOAT_EQ(map.at("VEL_X"), 1.5);
  EXPECT_FLOAT_EQ(map.at("VEL_Y"), 1.5);
  EXPECT_FLOAT_EQ(map.at("VEL_Z"), 1.5);
  EXPECT_FLOAT_EQ(map.at("OMEGA_X"), 1.5);
  EXPECT_FLOAT_EQ(map.at("OMEGA_Y"), 1.5);
  EXPECT_FLOAT_EQ(map.at("OMEGA_Z"), 1.5);
  EXPECT_FLOAT_EQ(map.at("CONTROL_0"), 1.5);
  EXPECT_FLOAT_EQ(map.at("steering_angle"), 1.5);
  EXPECT_FLOAT_EQ(map.at("steering_vel"), 1.5);
}

// Smoothed buffer trajectory: harness sets tau = 0.2 and buffer_dt = 0.02, so the
// window [0.8, 1.0] yields 11 sample points, each linearly interpolated in time.
TEST_F(BufferedPlantTest, getInterpBuffer)
{
  Eigen::Vector3f pos = Eigen::Vector3f::Zero();
  Eigen::Quaternionf quat = Eigen::Quaternionf::Identity();
  Eigen::Vector3f vel = Eigen::Vector3f::Zero();
  Eigen::Vector3f omega = Eigen::Vector3f::Zero();
  MockDynamics::control_array u = MockDynamics::control_array::Zero();
  MockDynamics::state_array state = MockDynamics::state_array::Random();
  plant->setLastUsedUpdateTime(0.0);
  EXPECT_CALL(mockDynamics, stateFromMap(testing::_)).Times(2).WillRepeatedly(testing::Return(state));
  // Controls never calculated so no calls to controller in updateState()
  EXPECT_CALL(*mockController, getDt()).Times(0);
  EXPECT_CALL(*mockController, getCurrentControl(testing::_, testing::_, testing::_, testing::_, testing::_)).Times(0);
  plant->updateOdometry(pos, quat, vel, omega, 0.0);
  plant->updateControls(u, 0.0);
  plant->updateExtraValue("steering_angle", 0, 0);
  plant->updateExtraValue("steering_vel", 0, 0);
  pos = Eigen::Vector3f::Ones();
  vel = Eigen::Vector3f::Ones();
  omega = Eigen::Vector3f::Ones();
  quat = Eigen::AngleAxisf(M_PI_2, Eigen::Vector3f::UnitZ());
  u = MockDynamics::control_array::Ones();
  plant->updateOdometry(pos, quat, vel, omega, 1.0);
  plant->updateControls(u, 1.0);
  plant->updateExtraValue("steering_angle", 1, 1.0);
  plant->updateExtraValue("steering_vel", 1, 1.0);
  MockTestPlant::buffer_trajectory buffer = plant->getSmoothedBuffer(1.0);
  EXPECT_EQ(buffer.size(), 19);
  EXPECT_EQ(buffer.at("POS_X").size(), 11);
  for (int t = 0; t < 11; t++)
  {
    double time = 0.8 + t * 0.02;
    EXPECT_FLOAT_EQ(buffer.at("POS_X")(t), time) << "at time " << t << " " << time;
    EXPECT_FLOAT_EQ(buffer.at("POS_Y")(t), time) << "at time " << t << " " << time;
    EXPECT_FLOAT_EQ(buffer.at("POS_Z")(t), time) << "at time " << t << " " << time;
    EXPECT_FLOAT_EQ(buffer.at("VEL_X")(t), time) << "at time " << t << " " << time;
    EXPECT_FLOAT_EQ(buffer.at("VEL_Y")(t), time) << "at time " << t << " " << time;
    EXPECT_FLOAT_EQ(buffer.at("VEL_Z")(t), time) << "at time " << t << " " << time;
    EXPECT_FLOAT_EQ(buffer.at("OMEGA_X")(t), time) << "at time " << t << " " << time;
    EXPECT_FLOAT_EQ(buffer.at("OMEGA_Y")(t), time) << "at time " << t << " " << time;
    EXPECT_FLOAT_EQ(buffer.at("OMEGA_Z")(t), time) << "at time " << t << " " << time;
    EXPECT_FLOAT_EQ(buffer.at("ROLL")(t), 0);
    EXPECT_FLOAT_EQ(buffer.at("PITCH")(t), 0);
    EXPECT_FLOAT_EQ(buffer.at("CONTROL_0")(t), time) << "at time " << t << " " << time;
    EXPECT_NEAR(buffer.at("steering_angle")(t), time, precision) << "at time " << t << " " << time;
    EXPECT_NEAR(buffer.at("steering_vel")(t), time, precision) << "at time " << t << " " << time;
  }
}

================================================
FILE: tests/mppi_core/normexp_kernel_tests.cu
================================================
#include
#include
#include
#include
#include
#include
#include

// Fixture providing a seeded RNG producing cost samples ~ N(100, 2).
class NormExpKernel : public testing::Test
{
protected:
  void SetUp() override
  {
    generator = std::default_random_engine(7.0);
    distribution = std::normal_distribution(100.0, 2.0);
  }

  void TearDown() override
  {
  }

  std::default_random_engine generator;
  std::normal_distribution distribution;
};

// Single-block kernel wrapper: reduces all NUM_ROLLOUTS costs through shared
// memory and writes the normalizer (tests below compare it to a host sum).
template
__global__ void computeNormalizerKernel(const float* __restrict__ costs, float* __restrict__ output)
{
  __shared__ float reduction_buffer[NUM_ROLLOUTS];
  int global_idx = threadIdx.x;
  int global_step = blockDim.x;
  *output = mppi::kernels::computeNormalizer(NUM_ROLLOUTS, costs, reduction_buffer, global_idx, global_step);
};

// Single-block kernel wrapper for the baseline cost reduction (tests below
// compare it to a host min).
template
__global__ void computeBaselineCostKernel(const float* __restrict__ costs, float* __restrict__ output)
{
  __shared__ float reduction_buffer[NUM_ROLLOUTS];
  int global_idx = threadIdx.x;
  int global_step = blockDim.x;
  *output = mppi::kernels::computeBaselineCost(NUM_ROLLOUTS, costs, reduction_buffer, global_idx, global_step);
};

TEST_F(NormExpKernel,
computeBaselineCost_Test)
{
  // Host-side baseline (min cost) matches std::min_element.
  const int num_rollouts = 4196;
  std::array cost_vec = { 0 };
  // Use a range based for loop to set the cost
  for (auto& cost : cost_vec)
  {
    cost = distribution(generator);
  }
  float min_cost_known = *std::min_element(cost_vec.begin(), cost_vec.end());
  float min_cost_compute = mppi::kernels::computeBaselineCost(cost_vec.data(), num_rollouts);
  ASSERT_FLOAT_EQ(min_cost_compute, min_cost_known);
}

// Host-side normalizer (sum of costs) matches std::accumulate.
TEST_F(NormExpKernel, computeNormalizer_Test)
{
  const int num_rollouts = 1024;
  std::array cost_vec = { 0 };
  // Use a range based for loop to set the cost
  for (auto& cost : cost_vec)
  {
    cost = distribution(generator);
  }
  float sum_cost_known = std::accumulate(cost_vec.begin(), cost_vec.end(), 0.0);
  float sum_cost_compute = mppi::kernels::computeNormalizer(cost_vec.data(), num_rollouts);
  ASSERT_FLOAT_EQ(sum_cost_compute, sum_cost_known);
}

// Device reduction of the normalizer agrees with std::accumulate on the host.
TEST_F(NormExpKernel, computeNormalizerDevice_Test)
{
  const int num_rollouts = 6048;
  std::array cost_vec = { 0 };
  // Use a range based for loop to set the cost
  for (int i = 0; i < cost_vec.size(); i++)
  {
    cost_vec[i] = distribution(generator);
  }
  float* norm_d;
  float* costs_d;
  float sum_cost_compute;
  HANDLE_ERROR(cudaMalloc((void**)&norm_d, sizeof(float)));
  HANDLE_ERROR(cudaMalloc((void**)&costs_d, sizeof(float) * num_rollouts));
  HANDLE_ERROR(cudaMemcpy(costs_d, cost_vec.data(), sizeof(float) * num_rollouts, cudaMemcpyHostToDevice));
  computeNormalizerKernel<<<1, 1024>>>(costs_d, norm_d);
  HANDLE_ERROR(cudaMemcpy(&sum_cost_compute, norm_d, sizeof(float), cudaMemcpyDeviceToHost));
  float sum_cost_known = std::accumulate(cost_vec.begin(), cost_vec.end(), 0.0);
  ASSERT_FLOAT_EQ(sum_cost_compute, sum_cost_known);
}

// Device reduction of the baseline agrees with std::min_element on the host.
TEST_F(NormExpKernel, computeBaselineCostDevice_Test)
{
  const int num_rollouts = 6048;
  std::array cost_vec = { 0 };
  // Use a range based for loop to set the cost
  for (int i = 0; i < cost_vec.size(); i++)
  {
    cost_vec[i] = cost_vec.size() - i;
  }
  std::cout << std::endl;
  float* norm_d;
  float* costs_d;
  float sum_cost_compute;
  HANDLE_ERROR(cudaMalloc((void**)&norm_d, sizeof(float)));
  HANDLE_ERROR(cudaMalloc((void**)&costs_d, sizeof(float) * num_rollouts));
  HANDLE_ERROR(cudaMemcpy(costs_d, cost_vec.data(), sizeof(float) * num_rollouts, cudaMemcpyHostToDevice));
  computeBaselineCostKernel<<<1, 1024>>>(costs_d, norm_d);
  HANDLE_ERROR(cudaMemcpy(&sum_cost_compute, norm_d, sizeof(float), cudaMemcpyDeviceToHost));
  float sum_cost_known = *std::min_element(cost_vec.begin(), cost_vec.end());
  ASSERT_FLOAT_EQ(sum_cost_compute, sum_cost_known);
}

// The exp-norm kernel must match expf(-gamma * (cost - baseline)) element-wise.
TEST_F(NormExpKernel, computeExpNorm_Test)
{
  const int num_rollouts = 555;
  std::array cost_vec = { 0 };
  std::array normalized_compute = { 0 };
  std::array normalized_known = { 0 };
  float gamma = 0.3;
  // Use a range based for loop to set the cost
  for (auto& cost : cost_vec)
  {
    cost = distribution(generator);
  }
  float baseline = *std::min_element(cost_vec.begin(), cost_vec.end());
  for (int i = 0; i < num_rollouts; i++)
  {
    normalized_known[i] = expf(-gamma * (cost_vec[i] - baseline));
  }
  launchNormExp_KernelTest(cost_vec, gamma, baseline, normalized_compute);
  array_assert_float_eq(normalized_compute, normalized_known);
}

// Generic norm-exp kernel agrees with the reference CPU computation.
TEST_F(NormExpKernel, comparisonTestAutorallyMPPI_Generic)
{
  const int num_rollouts = 28754;
  const int blocksize_x = 8;
  const int blocksize_y = 8;
  std::array cost_vec = { 0 };
  std::array normalized_autorally = { 0 };
  std::array normalized_generic = { 0 };
  float gamma = 0.3;
  // Use a range based for loop to set the cost
  for (auto& cost : cost_vec)
  {
    cost = distribution(generator);
  }
  float baseline = *std::min_element(cost_vec.begin(), cost_vec.end());
  launchGenericNormExpKernelTest(cost_vec, gamma, baseline, normalized_generic);
  for (int i = 0; i < num_rollouts; i++)
  {
    float cost = cost_vec[i] - baseline;
    cost = expf(-gamma * cost);
    EXPECT_FLOAT_EQ(normalized_generic[i], cost);
  }
}

// Benchmarks and cross-checks the old split host/device path (host baseline ->
// device norm-exp -> host normalizer) against the single fused device kernel
// (launchWeightTransformKernel); both must produce identical costs, baseline (x)
// and normalizer (y).
TEST_F(NormExpKernel, comparisonTestHostvsDeviceBaselineNormalizerCalculation)
{
  const int num_rollouts = 10000;
  const int blocksize_x = 8;
  const int num_iterations = 2500;
  std::array cost_vec = { 0 };
  std::array host_dev_costs = { 0 };
  std::array dev_only_costs = { 0 };
  float lambda = 0.3;
  double old_method_ms = 0;
  double new_method_ms = 0;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  float* costs_dev_only_d;
  float* costs_host_only_d;
  float2* baseline_and_normalizer_d;
  float2 host_components, device_components;
  HANDLE_ERROR(cudaMalloc((void**)&baseline_and_normalizer_d, sizeof(float2)));
  HANDLE_ERROR(cudaMalloc((void**)&costs_dev_only_d, sizeof(float) * num_rollouts));
  HANDLE_ERROR(cudaMalloc((void**)&costs_host_only_d, sizeof(float) * num_rollouts));
  // Use a range based for loop to set the cost
  for (int i = 0; i < num_iterations; i++)
  {
    for (auto& cost : cost_vec)
    {
      cost = distribution(generator);
    }
    /**
     * @brief Prep CUDA components
     *
     */
    HANDLE_ERROR(cudaMemcpyAsync(costs_dev_only_d, cost_vec.data(), sizeof(float) * num_rollouts,
                                 cudaMemcpyHostToDevice, stream));
    HANDLE_ERROR(cudaMemcpyAsync(costs_host_only_d, cost_vec.data(), sizeof(float) * num_rollouts,
                                 cudaMemcpyHostToDevice, stream));
    HANDLE_ERROR(cudaStreamSynchronize(stream));
    auto start_old_method_t = std::chrono::steady_clock::now();
    // Run old method to transform costs
    HANDLE_ERROR(cudaMemcpyAsync(host_dev_costs.data(), costs_host_only_d, num_rollouts * sizeof(float),
                                 cudaMemcpyDeviceToHost, stream));
    HANDLE_ERROR(cudaStreamSynchronize(stream));
    host_components.x = mppi::kernels::computeBaselineCost(host_dev_costs.data(), num_rollouts);
    mppi::kernels::launchNormExpKernel(num_rollouts, blocksize_x, costs_host_only_d, 1.0 / lambda, host_components.x,
                                       stream, false);
    HANDLE_ERROR(cudaMemcpyAsync(host_dev_costs.data(), costs_host_only_d, num_rollouts * sizeof(float),
                                 cudaMemcpyDeviceToHost, stream));
    HANDLE_ERROR(cudaStreamSynchronize(stream));
    host_components.y = mppi::kernels::computeNormalizer(host_dev_costs.data(), num_rollouts);
    old_method_ms += (std::chrono::steady_clock::now() - start_old_method_t).count() / 1e6;
    auto start_new_method_t = std::chrono::steady_clock::now();
    // Run new method to transform costs
    mppi::kernels::launchWeightTransformKernel(costs_dev_only_d, baseline_and_normalizer_d, 1.0 / lambda, 1, stream,
                                               false);
    HANDLE_ERROR(cudaMemcpyAsync(dev_only_costs.data(), costs_dev_only_d, num_rollouts * sizeof(float),
                                 cudaMemcpyDeviceToHost, stream));
    HANDLE_ERROR(
        cudaMemcpyAsync(&device_components, baseline_and_normalizer_d, sizeof(float2), cudaMemcpyDeviceToHost, stream));
    HANDLE_ERROR(cudaStreamSynchronize(stream));
    new_method_ms += (std::chrono::steady_clock::now() - start_new_method_t).count() / 1e6;
  }
  std::cout << "Old method averaged " << old_method_ms / num_iterations << " ms and the new method averaged "
            << new_method_ms / num_iterations << " ms" << std::endl;
  for (int i = 0; i < num_rollouts; i++)
  {
    ASSERT_FLOAT_EQ(dev_only_costs[i], host_dev_costs[i]);
  }
  ASSERT_FLOAT_EQ(device_components.x, host_components.x);
  ASSERT_FLOAT_EQ(device_components.y, host_components.y);
}

================================================
FILE: tests/mppi_core/rmppi_kernel_tests.cu
================================================
//
// Created by mgandhi on 5/23/20.
//
// NOTE(review): this extract has lost every #include target and all template
// arguments (e.g. `std::vector possible_thread_x`, `std::make_shared()`,
// `using control_trajectory = Eigen::Matrix;`) — restore them from the
// upstream file before compiling. Code below is otherwise token-identical.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

// Test fixture for the R-MPPI kernels. Builds a double-integrator model, a
// circle cost, a Gaussian sampling distribution, and a DDP feedback
// controller, binds all of them (plus curand) to one CUDA stream, and does
// GPUSetup for each. Per-test device buffers stored in the members
// initial_x_d / cost_trajectories_d are freed in TearDown().
class RMPPIKernels : public ::testing::Test
{
public:
  static const int num_timesteps = 50;
  static const int nominal_idx = 0;  // slot of the nominal system in 2-system device buffers
  static const int real_idx = 1;     // slot of the real system in 2-system device buffers
  using DYN_T = DoubleIntegratorDynamics;
  using COST_T = DoubleIntegratorCircleCost;
  // using COST_T = QuadraticCost;
  using SAMPLER_T = mppi::sampling_distributions::GaussianDistribution;
  using FB_T = DDPFeedback;
  using control_trajectory = Eigen::Matrix;
  using state_array = DYN_T::state_array;
  using output_array = DYN_T::output_array;
  using control_array = DYN_T::control_array;

  float dt = 0.01;
  float lambda = 1.1f;
  float alpha = 0.0;
  float std_dev = 2.0f;
  float value_func_threshold = 20;
  // Device buffers a test may allocate into; freed in TearDown() if non-null.
  float* initial_x_d = nullptr;
  float* cost_trajectories_d = nullptr;
  curandGenerator_t gen;
  cudaStream_t stream;
  mppi::util::MPPILoggerPtr logger = nullptr;

  void SetUp() override
  {
    model = new DYN_T(10);  // Initialize the double integrator DYN_T
    cost = new COST_T;      // Initialize the cost function
    auto sampler_params = SAMPLER_T::SAMPLING_PARAMS_T();
    for (int i = 0; i < DYN_T::CONTROL_DIM; i++)
    {
      sampler_params.std_dev[i] = std_dev;
    }
    sampler_params.num_timesteps = num_timesteps;
    sampler = new SAMPLER_T(sampler_params);
    fb_controller = new FB_T(model, dt);
    HANDLE_CURAND_ERROR(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
    // Seed the RNG from the wall clock so each run draws fresh samples.
    unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
    HANDLE_CURAND_ERROR(curandSetPseudoRandomGeneratorSeed(gen, seed));
    logger = std::make_shared();
    model->setLogger(logger);
    cost->setLogger(logger);
    sampler->setLogger(logger);
    fb_controller->setLogger(logger);
    // Put every component (and curand) on the same stream before GPU setup.
    HANDLE_ERROR(cudaStreamCreate(&stream));
    model->bindToStream(stream);
    cost->bindToStream(stream);
    sampler->bindToStream(stream);
    fb_controller->bindToStream(stream);
    curandSetStream(gen, stream);
    model->GPUSetup();
    cost->GPUSetup();
    sampler->GPUSetup();
    fb_controller->GPUSetup();
  }

  void TearDown() override
  {
    delete model;
    delete cost;
    delete sampler;
    delete fb_controller;
    if (initial_x_d)
    {
      HANDLE_ERROR(cudaFree(initial_x_d));
      initial_x_d = nullptr;
    }
    if (cost_trajectories_d)
    {
      HANDLE_ERROR(cudaFree(cost_trajectories_d));
      cost_trajectories_d = nullptr;
    }
  }

  DYN_T* model;
  COST_T* cost;
  SAMPLER_T* sampler;
  FB_T* fb_controller;
};

// Declare the static variables
const int RMPPIKernels::nominal_idx;
const int RMPPIKernels::real_idx;

/**
 * @brief Runs the combined init eval kernel and compares to one created on the CPU
 * to ensure they produce the same cost trajectories. Multiple runs of the init eval kernel
 * will be done with different parallelizations to ensure that the kernel calculates the same
 * regardless of the parallelization technique.
 */
TEST_F(RMPPIKernels, ValidateCombinedInitEvalKernelAgainstCPU)
{
  /**
   * Set up the problem
   */
  int num_candidates = 9;
  int num_samples = 64;
  int num_rollouts = num_candidates * num_samples;

  /**
   * Setup GPU
   */
  int* strides_d;
  sampler->setNumRollouts(num_rollouts);
  HANDLE_ERROR(cudaMalloc((void**)&strides_d, sizeof(int) * num_candidates));
  HANDLE_ERROR(cudaMalloc((void**)&initial_x_d, sizeof(float) * num_candidates * DYN_T::STATE_DIM));
  HANDLE_ERROR(cudaMalloc((void**)&cost_trajectories_d, sizeof(float) * num_rollouts));

  // Setup inputs on both CPU and GPU
  Eigen::MatrixXf candidates = Eigen::MatrixXf::Random(DYN_T::STATE_DIM, num_candidates);
  Eigen::MatrixXi strides = Eigen::MatrixXi::Zero(num_candidates, 1);
  for (int i = 0; i < num_candidates; i++)
  {
    strides(i) = i + 1;
  }
  Eigen::MatrixXf trajectory_costs_cpu = Eigen::MatrixXf::Zero(num_rollouts, 1);
  Eigen::MatrixXf trajectory_costs_gpu = Eigen::MatrixXf::Zero(num_rollouts, 1);
  control_trajectory nominal_trajectory = control_trajectory::Random();
  sampler->copyImportanceSamplerToDevice(nominal_trajectory.data(), 0, false);
  sampler->generateSamples(1, 0, gen, false);
  HANDLE_ERROR(cudaMemcpyAsync(initial_x_d, candidates.data(), sizeof(float) * DYN_T::STATE_DIM * num_candidates,
                               cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(
      cudaMemcpyAsync(strides_d, strides.data(), sizeof(int) * num_candidates, cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaStreamSynchronize(stream));

  /**
   * Do CPU calculation
   */
  launchCPUInitEvalKernel(model, cost, sampler, dt, num_timesteps, num_candidates, num_samples, lambda, alpha,
                          candidates, strides, trajectory_costs_cpu);

  /**
   * Do GPU Calculation on various thread dimensions
   */
  std::vector possible_thread_x;
  for (int size = num_samples; size > 0; size /= 2)
  {
    possible_thread_x.push_back(size);
  }
  std::vector possible_thread_y{ 1, 2, 3, 8 };
  for (const auto& thread_x : possible_thread_x)
  {
    for (const auto& thread_y : possible_thread_y)
    {
      dim3 threadsPerBlock(thread_x, thread_y, 1);
      logger->info("Testing Combined Eval Kernel on (%d, %d, 1)\n", thread_x, thread_y);
      mppi::kernels::rmppi::launchInitEvalKernel(
          model, cost, sampler, dt, num_timesteps, num_rollouts, lambda, alpha, num_samples, strides_d, initial_x_d,
          cost_trajectories_d, threadsPerBlock, stream, false);
      HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_gpu.data(), cost_trajectories_d, sizeof(float) * num_rollouts,
                                   cudaMemcpyDeviceToHost, stream));
      HANDLE_ERROR(cudaStreamSynchronize(stream));
      // Compare against the CPU reference with a 0.1% relative tolerance.
      for (int i = 0; i < num_rollouts; i++)
      {
        float cost_diff = trajectory_costs_cpu(i) - trajectory_costs_gpu(i);
        std::string error_prefix = "Eval sample " + std::to_string(i) + " (" + std::to_string(threadsPerBlock.x) +
                                   ", " + std::to_string(threadsPerBlock.y) + ", " + std::to_string(threadsPerBlock.z) +
                                   ")";
        EXPECT_LT(fabsf(cost_diff), 1e-3 * trajectory_costs_cpu(i))
            << error_prefix << ": CPU = " << trajectory_costs_cpu(i) << ", GPU = " << trajectory_costs_gpu(i)
            << std::endl;
      }
    }
  }
  HANDLE_ERROR(cudaFree(strides_d));
}

// Same CPU-vs-GPU comparison as above but for the split (separate dynamics
// and cost kernels) init eval kernel, exercised over many dyn/cost block
// shapes and run twice per shape (coalesced and non-coalesced logging paths).
TEST_F(RMPPIKernels, ValidateSplitInitEvalKernelAgainstCPU)
{
  /**
   * Set up the problem
   */
  int num_candidates = 12;
  int num_samples = 64;
  int num_rollouts = num_candidates * num_samples;

  /**
   * Setup GPU
   */
  // NOTE(review): these locals shadow the fixture members of the same name,
  // so TearDown() never frees them; only strides_d and output_d are freed at
  // the end of this test — initial_x_d and cost_trajectories_d leak here.
  int* strides_d;
  float* initial_x_d;
  float* cost_trajectories_d;
  float* output_d;
  sampler->setNumRollouts(num_rollouts);
  HANDLE_ERROR(cudaMalloc((void**)&strides_d, sizeof(int) * num_candidates));
  HANDLE_ERROR(cudaMalloc((void**)&initial_x_d, sizeof(float) * num_candidates * DYN_T::STATE_DIM));
  HANDLE_ERROR(cudaMalloc((void**)&cost_trajectories_d, sizeof(float) * num_rollouts));
  HANDLE_ERROR(cudaMalloc((void**)&output_d, sizeof(float) * num_rollouts * num_timesteps * DYN_T::OUTPUT_DIM));

  // Setup inputs on both CPU and GPU
  Eigen::MatrixXf candidates = Eigen::MatrixXf::Random(DYN_T::STATE_DIM, num_candidates) * 4;
  Eigen::MatrixXi strides = Eigen::MatrixXi::Zero(num_candidates, 1);
  for (int i = 0; i < num_candidates; i++)
  {
    strides(i) = i + 1;
  }
  Eigen::MatrixXf trajectory_costs_cpu = Eigen::MatrixXf::Zero(num_rollouts, 1);
  Eigen::MatrixXf trajectory_costs_gpu = Eigen::MatrixXf::Zero(num_rollouts, 1);
  control_trajectory nominal_trajectory = control_trajectory::Random();
  sampler->copyImportanceSamplerToDevice(nominal_trajectory.data(), 0, false);
  sampler->generateSamples(1, 0, gen, false);
  HANDLE_ERROR(cudaMemcpyAsync(initial_x_d, candidates.data(), sizeof(float) * DYN_T::STATE_DIM * num_candidates,
                               cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(
      cudaMemcpyAsync(strides_d, strides.data(), sizeof(int) * num_candidates, cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaStreamSynchronize(stream));

  /**
   * Do CPU calculation
   */
  launchCPUInitEvalKernel(model, cost, sampler, dt, num_timesteps, num_candidates, num_samples, lambda, alpha,
                          candidates, strides, trajectory_costs_cpu);

  /**
   * Do GPU Calculation on various thread dimensions
   */
  std::vector possible_dyn_thread_x;
  for (int size = num_samples; size > 0; size /= 2)
  {
    possible_dyn_thread_x.push_back(size);
  }
  std::vector possible_cost_thread_x;
  for (int size = num_timesteps; size > 0; size /= 2)
  {
    possible_cost_thread_x.push_back(size);
  }
  std::vector possible_thread_y{ 1, 2, 3, 8 };
  for (const auto& dyn_thread_x : possible_dyn_thread_x)
  {
    for (const auto& dyn_thread_y : possible_thread_y)
    {
      dim3 dynThreadsPerBlock(dyn_thread_x, dyn_thread_y, 1);
      for (const auto& cost_thread_x : possible_cost_thread_x)
      {
        for (const auto& cost_thread_y : possible_thread_y)
        {
          dim3 costThreadsPerBlock(cost_thread_x, cost_thread_y, 1);
          logger->info("Testing coalesced Split Eval Kernel with dyn(%d, %d, %d), cost(%d %d, %d)\n",
                       dynThreadsPerBlock.x, dynThreadsPerBlock.y, dynThreadsPerBlock.z, costThreadsPerBlock.x,
                       costThreadsPerBlock.y, costThreadsPerBlock.z);
          mppi::kernels::rmppi::launchSplitInitEvalKernel(
              model, cost, sampler, dt, num_timesteps, num_rollouts, lambda, alpha, num_samples, strides_d,
              initial_x_d, output_d, cost_trajectories_d, dynThreadsPerBlock, costThreadsPerBlock, stream, false);
          HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_gpu.data(), cost_trajectories_d, sizeof(float) * num_rollouts,
                                       cudaMemcpyDeviceToHost, stream));
          HANDLE_ERROR(cudaStreamSynchronize(stream));
          for (int i = 0; i < num_rollouts; i++)
          {
            float cost_diff = trajectory_costs_cpu(i) - trajectory_costs_gpu(i);
            std::string error_prefix = "Coalesced eval sample " + std::to_string(i) + "/" +
                                       std::to_string(num_rollouts) + " dyn(" + std::to_string(dynThreadsPerBlock.x) +
                                       ", " + std::to_string(dynThreadsPerBlock.y) + ", " +
                                       std::to_string(dynThreadsPerBlock.z) + ") cost(" +
                                       std::to_string(costThreadsPerBlock.x) + ", " +
                                       std::to_string(costThreadsPerBlock.y) + ", " +
                                       std::to_string(costThreadsPerBlock.z) + ")";
            ASSERT_LT(fabsf(cost_diff), 1e-3 * trajectory_costs_cpu(i))
                << error_prefix << ": CPU = " << trajectory_costs_cpu(i) << ", GPU = " << trajectory_costs_gpu(i)
                << std::endl;
          }
          // NOTE(review): in this extract the "non-coalesced" invocation below
          // is token-identical to the coalesced one above — an argument that
          // selected the memory layout was presumably lost in extraction.
          logger->info("Testing non-coalesced Split Eval Kernel with dyn(%d, %d, %d), cost(%d %d, %d)\n",
                       dynThreadsPerBlock.x, dynThreadsPerBlock.y, dynThreadsPerBlock.z, costThreadsPerBlock.x,
                       costThreadsPerBlock.y, costThreadsPerBlock.z);
          mppi::kernels::rmppi::launchSplitInitEvalKernel(
              model, cost, sampler, dt, num_timesteps, num_rollouts, lambda, alpha, num_samples, strides_d,
              initial_x_d, output_d, cost_trajectories_d, dynThreadsPerBlock, costThreadsPerBlock, stream, false);
          HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_gpu.data(), cost_trajectories_d, sizeof(float) * num_rollouts,
                                       cudaMemcpyDeviceToHost, stream));
          HANDLE_ERROR(cudaStreamSynchronize(stream));
          for (int i = 0; i < num_rollouts; i++)
          {
            float cost_diff = trajectory_costs_cpu(i) - trajectory_costs_gpu(i);
            std::string error_prefix = "Non-coalesced eval sample " + std::to_string(i) + "/" +
                                       std::to_string(num_rollouts) + " dyn(" + std::to_string(dynThreadsPerBlock.x) +
                                       ", " + std::to_string(dynThreadsPerBlock.y) + ", " +
                                       std::to_string(dynThreadsPerBlock.z) + ") cost(" +
                                       std::to_string(costThreadsPerBlock.x) + ", " +
                                       std::to_string(costThreadsPerBlock.y) + ", " +
                                       std::to_string(costThreadsPerBlock.z) + ")";
            ASSERT_LT(fabsf(cost_diff), 1e-3 * trajectory_costs_cpu(i))
                << error_prefix << ": CPU = " << trajectory_costs_cpu(i) << ", GPU = " << trajectory_costs_gpu(i)
                << std::endl;
          }
        }
      }
    }
  }
  HANDLE_ERROR(cudaFree(strides_d));
  HANDLE_ERROR(cudaFree(output_d));
}

// Validates the combined R-MPPI rollout kernel (nominal + real system in one
// launch, z-dimension = 2) against the CPU reference implementation across
// several block shapes.
TEST_F(RMPPIKernels, ValidateCombinedRMPPIRolloutKernelAgainstCPU)
{
  int num_rollouts = 2048;
  sampler->setNumRollouts(num_rollouts);
  sampler->setNumDistributions(2);
  /**
   * Setup GPU
   */
  HANDLE_ERROR(cudaMalloc((void**)&initial_x_d, sizeof(float) * 2 * DYN_T::STATE_DIM));
  HANDLE_ERROR(cudaMalloc((void**)&cost_trajectories_d, sizeof(float) * 2 * num_rollouts));
  state_array initial_real_state = state_array::Random();
  state_array initial_nominal_state = state_array::Random();
  control_trajectory nominal_trajectory = control_trajectory::Random();
  sampler->copyImportanceSamplerToDevice(nominal_trajectory.data(), nominal_idx, false);
  sampler->copyImportanceSamplerToDevice(nominal_trajectory.data(), real_idx, false);
  fb_controller->copyToDevice(false);
  sampler->generateSamples(1, 0, gen, false);
  HANDLE_ERROR(cudaMemcpyAsync(initial_x_d + nominal_idx * DYN_T::STATE_DIM, initial_nominal_state.data(),
                               sizeof(float) * DYN_T::STATE_DIM, cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaMemcpyAsync(initial_x_d + real_idx * DYN_T::STATE_DIM, initial_real_state.data(),
                               sizeof(float) * DYN_T::STATE_DIM, cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaStreamSynchronize(stream));
  Eigen::MatrixXf trajectory_costs_cpu = Eigen::MatrixXf::Zero(2 * num_rollouts, 1);
  Eigen::MatrixXf trajectory_costs_gpu = Eigen::MatrixXf::Zero(2 * num_rollouts, 1);
  /**
   * CPU Calculation
   **/
  launchCPURMPPIRolloutKernel(model, cost, sampler, fb_controller, dt, num_timesteps, num_rollouts, lambda, alpha,
                              value_func_threshold, nominal_idx, real_idx, initial_real_state, initial_nominal_state,
                              trajectory_costs_cpu);
  /**
   * GPU Calculation
   **/
  std::vector possible_thread_x;
  for (int size = 64; size > 0; size /= 2)
  {
    possible_thread_x.push_back(size);
  }
  std::vector possible_thread_y{ 1, 2, 3 };
  for (const auto& thread_x : possible_thread_x)
  {
    for (const auto& thread_y : possible_thread_y)
    {
      // z = 2: one slice for the nominal system, one for the real system.
      dim3 threadsPerBlock(thread_x, thread_y, 2);
      logger->info("Testing RMPPI Rollout Kernel with (%d, %d, %d)\n", threadsPerBlock.x, threadsPerBlock.y,
                   threadsPerBlock.z);
      mppi::kernels::rmppi::launchRMPPIRolloutKernel(
          model, cost, sampler, fb_controller->getHostPointer().get(), dt, num_timesteps, num_rollouts, lambda, alpha,
          value_func_threshold, initial_x_d, cost_trajectories_d, threadsPerBlock, stream, false);
      HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_gpu.data(), cost_trajectories_d, sizeof(float) * 2 * num_rollouts,
                                   cudaMemcpyDeviceToHost, stream));
      HANDLE_ERROR(cudaStreamSynchronize(stream));
      for (int i = 0; i < 2 * num_rollouts; i++)
      {
        float cost_diff = trajectory_costs_cpu(i) - trajectory_costs_gpu(i);
        std::string error_prefix = "Rollout sample " + std::to_string(i) + " (" + std::to_string(threadsPerBlock.x) +
                                   ", " + std::to_string(threadsPerBlock.y) + ", " + std::to_string(threadsPerBlock.z) +
                                   ")";
        EXPECT_LT(fabsf(cost_diff), 1e-3 * trajectory_costs_cpu(i))
            << error_prefix << ": CPU = " << trajectory_costs_cpu(i) << ", GPU = " << trajectory_costs_gpu(i)
            << std::endl;
      }
    }
  }
}

// Validates the split (separate dynamics/cost kernel) R-MPPI rollout kernel
// against the CPU reference across many dyn/cost block shapes, run twice per
// shape (coalesced and non-coalesced logging paths).
TEST_F(RMPPIKernels, ValidateSplitRMPPIRolloutKernelAgainstCPU)
{
  int num_rollouts = 2048;
  sampler->setNumRollouts(num_rollouts);
  sampler->setNumDistributions(2);
  /**
   * Setup GPU
   */
  float* output_d = nullptr;
  HANDLE_ERROR(cudaMalloc((void**)&initial_x_d, sizeof(float) * 2 * DYN_T::STATE_DIM));
  HANDLE_ERROR(cudaMalloc((void**)&cost_trajectories_d, sizeof(float) * 2 * num_rollouts));
  HANDLE_ERROR(cudaMalloc((void**)&output_d, sizeof(float) * 2 * num_rollouts * num_timesteps * DYN_T::OUTPUT_DIM));
  state_array initial_real_state = state_array::Random();
  state_array initial_nominal_state = state_array::Random();
  control_trajectory nominal_trajectory = control_trajectory::Random();
  sampler->copyImportanceSamplerToDevice(nominal_trajectory.data(), nominal_idx, false);
  sampler->copyImportanceSamplerToDevice(nominal_trajectory.data(), real_idx, false);
  fb_controller->copyToDevice(false);
  sampler->generateSamples(1, 0, gen, false);
  HANDLE_ERROR(cudaMemcpyAsync(initial_x_d + nominal_idx * DYN_T::STATE_DIM, initial_nominal_state.data(),
                               sizeof(float) * DYN_T::STATE_DIM, cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaMemcpyAsync(initial_x_d + real_idx * DYN_T::STATE_DIM, initial_real_state.data(),
                               sizeof(float) * DYN_T::STATE_DIM, cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaStreamSynchronize(stream));
  Eigen::MatrixXf trajectory_costs_cpu = Eigen::MatrixXf::Zero(2 * num_rollouts, 1);
  Eigen::MatrixXf trajectory_costs_gpu = Eigen::MatrixXf::Zero(2 * num_rollouts, 1);
  /**
   * CPU Calculation
   **/
  launchCPURMPPIRolloutKernel(model, cost, sampler, fb_controller, dt, num_timesteps, num_rollouts, lambda, alpha,
                              value_func_threshold, nominal_idx, real_idx, initial_real_state, initial_nominal_state,
                              trajectory_costs_cpu);
  /**
   * GPU Calculation
   **/
  std::vector possible_dyn_thread_x;
  for (int size = 64; size > 0; size /= 2)
  {
    possible_dyn_thread_x.push_back(size);
  }
  std::vector possible_cost_thread_x;
  for (int size = num_timesteps; size > 0; size /= 2)
  {
    possible_cost_thread_x.push_back(size);
  }
  std::vector possible_thread_y{ 1, 2, 3 };
  for (const auto& dyn_thread_x : possible_dyn_thread_x)
  {
    for (const auto& dyn_thread_y : possible_thread_y)
    {
      dim3 dynThreadsPerBlock(dyn_thread_x, dyn_thread_y, 2);
      for (const auto& cost_thread_x : possible_cost_thread_x)
      {
        for (const auto& cost_thread_y : possible_thread_y)
        {
          dim3 costThreadsPerBlock(cost_thread_x, cost_thread_y, 2);
          logger->info("Testing coalesced RMPPI Rollout Kernel with dyn(%d, %d, %d), cost(%d %d, %d)\n",
                       dynThreadsPerBlock.x, dynThreadsPerBlock.y, dynThreadsPerBlock.z, costThreadsPerBlock.x,
                       costThreadsPerBlock.y, costThreadsPerBlock.z);
          mppi::kernels::rmppi::launchSplitRMPPIRolloutKernel(
              model, cost, sampler, fb_controller->getHostPointer().get(), dt, num_timesteps, num_rollouts, lambda,
              alpha, value_func_threshold, initial_x_d, output_d, cost_trajectories_d, dynThreadsPerBlock,
              costThreadsPerBlock, stream, false);
          HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_gpu.data(), cost_trajectories_d,
                                       sizeof(float) * 2 * num_rollouts, cudaMemcpyDeviceToHost, stream));
          HANDLE_ERROR(cudaStreamSynchronize(stream));
          for (int i = 0; i < 2 * num_rollouts; i++)
          {
            float cost_diff = trajectory_costs_cpu(i) - trajectory_costs_gpu(i);
            std::string error_prefix = "Coalesced rollout sample " + std::to_string(i) + "/" +
                                       std::to_string(num_rollouts) + " dyn(" + std::to_string(dynThreadsPerBlock.x) +
                                       ", " + std::to_string(dynThreadsPerBlock.y) + ", " +
                                       std::to_string(dynThreadsPerBlock.z) + ") cost(" +
                                       std::to_string(costThreadsPerBlock.x) + ", " +
                                       std::to_string(costThreadsPerBlock.y) + ", " +
                                       std::to_string(costThreadsPerBlock.z) + ")";
            ASSERT_LT(fabsf(cost_diff), 1e-3 * trajectory_costs_cpu(i))
                << error_prefix << ": CPU = " << trajectory_costs_cpu(i) << ", GPU = " << trajectory_costs_gpu(i)
                << std::endl;
          }
          // NOTE(review): as in the split eval test, this "non-coalesced"
          // invocation is token-identical to the coalesced one in this
          // extract — the selecting argument was presumably lost.
          logger->info("Testing non-coalesced RMPPI Rollout Kernel with dyn(%d, %d, %d), cost(%d %d, %d)\n",
                       dynThreadsPerBlock.x, dynThreadsPerBlock.y, dynThreadsPerBlock.z, costThreadsPerBlock.x,
                       costThreadsPerBlock.y, costThreadsPerBlock.z);
          mppi::kernels::rmppi::launchSplitRMPPIRolloutKernel(
              model, cost, sampler, fb_controller->getHostPointer().get(), dt, num_timesteps, num_rollouts, lambda,
              alpha, value_func_threshold, initial_x_d, output_d, cost_trajectories_d, dynThreadsPerBlock,
              costThreadsPerBlock, stream, false);
          HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_gpu.data(), cost_trajectories_d,
                                       sizeof(float) * 2 * num_rollouts, cudaMemcpyDeviceToHost, stream));
          HANDLE_ERROR(cudaStreamSynchronize(stream));
          eigen_assert_float_near(trajectory_costs_cpu, trajectory_costs_gpu, 1e-3);
          for (int i = 0; i < 2 * num_rollouts; i++)
          {
            float cost_diff = trajectory_costs_cpu(i) - trajectory_costs_gpu(i);
            std::string error_prefix = "Non-coalesced rollout sample " + std::to_string(i) + "/" +
                                       std::to_string(num_rollouts) + " dyn(" + std::to_string(dynThreadsPerBlock.x) +
                                       ", " + std::to_string(dynThreadsPerBlock.y) + ", " +
                                       std::to_string(dynThreadsPerBlock.z) + ") cost(" +
                                       std::to_string(costThreadsPerBlock.x) + ", " +
                                       std::to_string(costThreadsPerBlock.y) + ", " +
                                       std::to_string(costThreadsPerBlock.z) + ")";
            ASSERT_LT(fabsf(cost_diff), 1e-3 * trajectory_costs_cpu(i))
                << error_prefix << ": CPU = " << trajectory_costs_cpu(i) << ", GPU = " << trajectory_costs_gpu(i)
                << std::endl;
          }
        }
      }
    }
  }
  HANDLE_ERROR(cudaFree(output_d));
}

// With identical nominal/real starts and the same control samples, the R-MPPI
// rollout kernel should reproduce the plain MPPI rollout kernel's costs.
// Control noise is saved up front and restored between kernel launches so
// both kernels consume identical samples.
TEST_F(RMPPIKernels, ValidateCombinedRMPPIRolloutKernelAgainstMPPIRollout)
{
  int num_rollouts = 2048;
  sampler->setNumRollouts(num_rollouts);
  sampler->setNumDistributions(2);
  // Used to reset control samples between calls to rollout kernels
  Eigen::MatrixXf control_noise = Eigen::MatrixXf::Zero(DYN_T::CONTROL_DIM, 2 * num_rollouts * num_timesteps);
  /**
   * Setup GPU
   */
  HANDLE_ERROR(cudaMalloc((void**)&initial_x_d, sizeof(float) * 2 * DYN_T::STATE_DIM));
  HANDLE_ERROR(cudaMalloc((void**)&cost_trajectories_d, sizeof(float) * 2 * num_rollouts));
  state_array initial_real_state = state_array::Random();
  control_trajectory nominal_trajectory = control_trajectory::Random();
  sampler->copyImportanceSamplerToDevice(nominal_trajectory.data(), nominal_idx, false);
  sampler->copyImportanceSamplerToDevice(nominal_trajectory.data(), real_idx, false);
  fb_controller->copyToDevice(false);
  sampler->generateSamples(1, 0, gen, false);
  // Get unaltered control noise for resetting purposes
  HANDLE_ERROR(cudaMemcpyAsync(control_noise.data(), sampler->getControlSample(0, 0, 0),
                               sizeof(float) * 2 * num_rollouts * num_timesteps * DYN_T::CONTROL_DIM,
                               cudaMemcpyDeviceToHost, stream));
  // Start both nominal and real states at the same point
  HANDLE_ERROR(cudaMemcpyAsync(initial_x_d + nominal_idx * DYN_T::STATE_DIM, initial_real_state.data(),
                               sizeof(float) * DYN_T::STATE_DIM, cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaMemcpyAsync(initial_x_d + real_idx * DYN_T::STATE_DIM, initial_real_state.data(),
                               sizeof(float) * DYN_T::STATE_DIM, cudaMemcpyHostToDevice, stream));
  HANDLE_ERROR(cudaStreamSynchronize(stream));
  Eigen::MatrixXf trajectory_costs_rmppi = Eigen::MatrixXf::Zero(2 * num_rollouts, 1);
  Eigen::MatrixXf trajectory_costs_mppi = Eigen::MatrixXf::Zero(2 * num_rollouts, 1);
  /**
   * GPU Calculation
   **/
  std::vector possible_thread_x;
  for (int size = 64; size > 0; size /= 2)
  {
    possible_thread_x.push_back(size);
  }
  std::vector possible_thread_y{ 1, 2, 3 };
  for (const auto& thread_x : possible_thread_x)
  {
    for (const auto& thread_y : possible_thread_y)
    {
      dim3 threadsPerBlock(thread_x, thread_y, 2);
      mppi::kernels::rmppi::launchRMPPIRolloutKernel(
          model, cost, sampler, fb_controller->getHostPointer().get(), dt, num_timesteps, num_rollouts, lambda, alpha,
          value_func_threshold, initial_x_d, cost_trajectories_d, threadsPerBlock, stream, false);
      HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_rmppi.data(), cost_trajectories_d,
                                   sizeof(float) * 2 * num_rollouts, cudaMemcpyDeviceToHost, stream));
      HANDLE_ERROR(cudaStreamSynchronize(stream));
      // Reset control samples
      HANDLE_ERROR(cudaMemcpyAsync(sampler->getControlSample(0, 0, 0), control_noise.data(),
                                   sizeof(float) * 2 * num_rollouts * num_timesteps * DYN_T::CONTROL_DIM,
                                   cudaMemcpyHostToDevice, stream));
      mppi::kernels::launchRolloutKernel(model, cost, sampler, dt, num_timesteps, num_rollouts, lambda, alpha,
                                         initial_x_d, cost_trajectories_d, threadsPerBlock, stream, false);
      // Reset control samples
      HANDLE_ERROR(cudaMemcpyAsync(sampler->getControlSample(0, 0, 0), control_noise.data(),
                                   sizeof(float) * 2 * num_rollouts * num_timesteps * DYN_T::CONTROL_DIM,
                                   cudaMemcpyHostToDevice, stream));
      HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_mppi.data(), cost_trajectories_d,
                                   sizeof(float) * 2 * num_rollouts, cudaMemcpyDeviceToHost, stream));
      HANDLE_ERROR(cudaStreamSynchronize(stream));
      for (int i = 0; i < 2 * num_rollouts; i++)
      {
        float cost_diff = trajectory_costs_mppi(i) - trajectory_costs_rmppi(i);
        std::string error_prefix = "Rollout sample " + std::to_string(i) + " (" + std::to_string(threadsPerBlock.x) +
                                   ", " + std::to_string(threadsPerBlock.y) + ", " + std::to_string(threadsPerBlock.z) +
                                   ")";
        EXPECT_LT(fabsf(cost_diff), 1e-3 * trajectory_costs_mppi(i))
            << error_prefix << ": MPPI = " << trajectory_costs_mppi(i) << ", R-MPPI = " << trajectory_costs_rmppi(i)
            << std::endl;
      }
    }
  }
}

================================================ FILE: tests/mppi_core/rollout_kernel_tests.cu ================================================
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * Here we will test various device functions that are related to cuda kernel things.
 */

// Verifies that loadGlobalToShared copies the initial state into the
// per-thread state buffer and zeroes the xdot and control buffers.
TEST(RolloutKernel, loadGlobalToShared)
{
  const int STATE_DIM = 12;
  const int CONTROL_DIM = 3;
  std::vector x0_host = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2 };
  std::vector x_thread_host(STATE_DIM, 0.f);
  std::vector xdot_thread_host(STATE_DIM, 2.f);
  std::vector u_thread_host(CONTROL_DIM, 3.f);

  launchGlobalToShared_KernelTest(x0_host, x_thread_host, xdot_thread_host, u_thread_host);

  array_assert_float_eq(x0_host, x_thread_host, STATE_DIM);
  array_assert_float_eq(0.f, xdot_thread_host, STATE_DIM);
  array_assert_float_eq(0.f, u_thread_host, CONTROL_DIM);
}

// Same check as loadGlobalToShared, but with separate nominal and actual
// state slots (the two-system R-MPPI layout).
TEST(RolloutKernel, loadGlobalToSharedNominalAndActualState)
{
  const int STATE_DIM = 12;
  const int CONTROL_DIM = 3;
  std::vector x0_host_act = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2 };
  std::vector x0_host_nom = { 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2 };
  std::vector x_thread_host_act(STATE_DIM, 0.f);
  std::vector x_thread_host_nom(STATE_DIM, 0.f);
  std::vector xdot_thread_host_act(STATE_DIM, 2.f);
  std::vector xdot_thread_host_nom(STATE_DIM, 2.f);
  std::vector u_thread_host_act(CONTROL_DIM, 3.f);
  std::vector u_thread_host_nom(CONTROL_DIM, 3.f);

  launchGlobalToShared_KernelTest_nom_act(x0_host_act, x_thread_host_act, xdot_thread_host_act, u_thread_host_act,
                                          x0_host_nom, x_thread_host_nom, xdot_thread_host_nom, u_thread_host_nom);

  // std::cout << "Testing actual x0" << std::endl;
  array_assert_float_eq(x0_host_act, x_thread_host_act, STATE_DIM);
  // std::cout << "Testing nom x0" << std::endl;
  array_assert_float_eq(x0_host_nom, x_thread_host_nom, STATE_DIM);
  // std::cout << "Testing empty" << std::endl;
  array_assert_float_eq(0.f, xdot_thread_host_act, STATE_DIM);
  array_assert_float_eq(0.f, xdot_thread_host_nom, STATE_DIM);
  array_assert_float_eq(0.f, u_thread_host_act, CONTROL_DIM);
  array_assert_float_eq(0.f, u_thread_host_nom, CONTROL_DIM);
}

// Checks the GPU terminal-cost accumulation against a hand-computed CPU
// quadratic terminal cost for random running costs and terminal states.
TEST(RolloutKernel, computeAndSaveCostAllRollouts)
{
  // Define an assortment of costs for a given number of rollouts
  CartpoleQuadraticCost cost;
  cost.GPUSetup();
  const int num_rollouts = 1234;
  std::array cost_all_rollouts = { 0 };
  std::array x_traj_terminal = { 0 };
  std::array cost_known = { 0 };
  std::array cost_compute = { 0 };

  std::default_random_engine generator(7.0);
  std::normal_distribution distribution(1.0, 2.0);
  for (auto& costs : cost_all_rollouts)
  {
    costs = 10 * distribution(generator);
  }
  for (auto& state : x_traj_terminal)
  {
    state = distribution(generator);
  }

  // Compute terminal cost on CPU
  for (int i = 0; i < num_rollouts; ++i)
  {
    cost_known[i] =
        cost_all_rollouts[i] +
        (x_traj_terminal[CartpoleDynamics::STATE_DIM * i] * x_traj_terminal[CartpoleDynamics::STATE_DIM * i] *
             cost.getParams().cart_position_coeff +
         x_traj_terminal[CartpoleDynamics::STATE_DIM * i + 1] * x_traj_terminal[CartpoleDynamics::STATE_DIM * i + 1] *
             cost.getParams().cart_velocity_coeff +
         x_traj_terminal[CartpoleDynamics::STATE_DIM * i + 2] * x_traj_terminal[CartpoleDynamics::STATE_DIM * i + 2] *
             cost.getParams().pole_angle_coeff +
         x_traj_terminal[CartpoleDynamics::STATE_DIM * i + 3] * x_traj_terminal[CartpoleDynamics::STATE_DIM * i + 3] *
             cost.getParams().pole_angular_velocity_coeff) *
            cost.getParams().terminal_cost_coeff;
  }

  // Compute the dynamics on the GPU
  launchComputeAndSaveCostAllRollouts_KernelTest(cost, cost_all_rollouts, x_traj_terminal, cost_compute);

  array_assert_float_eq(cost_known, cost_compute);
}

// Fixture for the standard MPPI rollout kernel tests: cartpole dynamics with
// a quadratic cost and Gaussian sampler; components are constructed and the
// cost parameters configured per test, with a dedicated CUDA stream.
// NOTE(review): TearDown() deletes the components but does not destroy
// `stream` — confirm against upstream whether that is intentional.
class RolloutKernelTests : public ::testing::Test
{
public:
  using DYN_T = CartpoleDynamics;
  using COST_T = CartpoleQuadraticCost;
  using DYN_PARAMS_T = typename DYN_T::DYN_PARAMS_T;
  using COST_PARAMS_T = typename COST_T::COST_PARAMS_T;
  using SAMPLER_T = mppi::sampling_distributions::GaussianDistribution;
  using SAMPLER_PARAMS_T = typename SAMPLER_T::SAMPLING_PARAMS_T;
  using state_array = DYN_T::state_array;

  float dt = 0.01;
  float lambda = 0.5f;
  float alpha = 0.001f;
  float control_std_dev = 0.4f;
  int num_timesteps = 100;
  int num_rollouts = 2048;
  cudaStream_t stream;
  DYN_T* model;
  COST_T* cost;
  SAMPLER_T* sampler;
  mppi::util::MPPILoggerPtr logger;

  void SetUp() override
  {
    model = new DYN_T();
    cost = new COST_T();
    sampler = new SAMPLER_T();
    logger = std::make_shared();
    model->setLogger(logger);
    cost->setLogger(logger);
    sampler->setLogger(logger);
    SAMPLER_PARAMS_T sampler_params;
    for (int i = 0; i < DYN_T::CONTROL_DIM; i++)
    {
      sampler_params.std_dev[i] = control_std_dev;
    }
    sampler_params.num_rollouts = num_rollouts;
    sampler_params.num_timesteps = num_timesteps;
    sampler->setParams(sampler_params);
    COST_PARAMS_T cost_params;
    cost_params.cart_position_coeff = 100;
    cost_params.pole_angle_coeff = 200;
    cost_params.cart_velocity_coeff = 10;
    cost_params.pole_angular_velocity_coeff = 20;
    cost_params.control_cost_coeff[0] = 1;
    cost_params.terminal_cost_coeff = 0;
    cost_params.desired_terminal_state[0] = -20;
    cost_params.desired_terminal_state[1] = 0;
    cost_params.desired_terminal_state[2] = M_PI;
    cost_params.desired_terminal_state[3] = 0;
    cost->setParams(cost_params);
    HANDLE_ERROR(cudaStreamCreate(&stream));
  }

  void TearDown() override
  {
    delete model;
    delete cost;
    delete sampler;
  }
};

// Runs the nominal/actual two-system rollout helper and checks both systems
// produce identical cost trajectories when given the same inputs.
TEST_F(RolloutKernelTests, runRolloutKernelOnMultipleSystems)
{
  std::vector x0(DYN_T::STATE_DIM);
  Eigen::MatrixXf nom_control = Eigen::MatrixXf::Random(DYN_T::CONTROL_DIM, num_timesteps);
  std::vector nominal_control_seq(nom_control.data(), nom_control.data() + nom_control.size());
  std::vector trajectory_costs_act(num_rollouts);
  std::vector trajectory_costs_nom(num_rollouts);

  // set initial state
  for (size_t i = 0; i < x0.size(); i++)
  {
    x0[i] = i * 0.1 + 0.2;
  }

  launchRolloutKernel_nom_act(model, cost, sampler, dt, num_timesteps, num_rollouts, lambda, alpha, x0,
                              nominal_control_seq, trajectory_costs_act, trajectory_costs_nom, stream);

  array_assert_float_eq(trajectory_costs_act, trajectory_costs_nom, num_rollouts);
}

// Compares the combined (dynamics + cost in one launch) rollout kernel
// against the CPU reference implementation over a grid of block shapes.
TEST_F(RolloutKernelTests, CombinedRolloutKernelGPUvsCPU)
{
  state_array x0 = state_array::Random();
  /**
   * GPU Setup
   **/
  model->GPUSetup();
  cost->GPUSetup();
  sampler->GPUSetup();
  Eigen::MatrixXf control_seq = Eigen::MatrixXf::Random(DYN_T::CONTROL_DIM, num_timesteps * num_rollouts);
  sampler->copyImportanceSamplerToDevice(control_seq.data(), 0, false);

  // Generate samples and do stream synchronize
  logger->debug("Generating samples\n");
  curandGenerator_t gen;
  curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
  curandSetStream(gen, stream);
  sampler->generateSamples(1, 0, gen, true);

  Eigen::MatrixXf trajectory_costs_cpu = Eigen::MatrixXf::Zero(num_rollouts, 1);
  Eigen::MatrixXf trajectory_costs_gpu = Eigen::MatrixXf::Zero(num_rollouts, 1);
  logger->debug("Running CPU Rollout\n");
  launchCPURolloutKernel(model, cost, sampler, dt, num_timesteps, num_rollouts, lambda, alpha, x0,
                         trajectory_costs_cpu, stream);

  /** GPU Computations **/
  float* initial_x_d;
  float* trajectory_costs_d;
  HANDLE_ERROR(cudaMalloc((void**)&initial_x_d, sizeof(float) * DYN_T::STATE_DIM));
  HANDLE_ERROR(cudaMalloc((void**)&trajectory_costs_d, sizeof(float) * num_rollouts));
  HANDLE_ERROR(
      cudaMemcpyAsync(initial_x_d, x0.data(), sizeof(float) * DYN_T::STATE_DIM, cudaMemcpyHostToDevice, stream));

  std::vector possible_thread_x;
  for (int i = 64; i > 0; i /= 2)
  {
    possible_thread_x.push_back(i);
  }
  std::vector possible_thread_y = { 1, 2, 3, 4 };
  for (const auto& thread_x : possible_thread_x)
  {
    for (const auto& thread_y : possible_thread_y)
    {
      dim3 threadsPerBlock(thread_x, thread_y, 1);
      logger->debug("Running GPU Combined Rollout on (%d, %d, %d)\n", threadsPerBlock.x, threadsPerBlock.y,
                    threadsPerBlock.z);
      mppi::kernels::launchRolloutKernel(model, cost, sampler, dt, num_timesteps, num_rollouts, lambda, alpha,
                                         initial_x_d, trajectory_costs_d, threadsPerBlock, stream, false);
      HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_gpu.data(), trajectory_costs_d, sizeof(float) * num_rollouts,
                                   cudaMemcpyDeviceToHost, stream));
      HANDLE_ERROR(cudaStreamSynchronize(stream));
      eigen_assert_float_near(trajectory_costs_cpu, trajectory_costs_gpu, 1e-4f);
    }
  }
}

// Compares the split (separate dynamics and cost kernels) rollout kernel
// against the CPU reference over a grid of dyn/cost block shapes, run twice
// per shape (coalesced and non-coalesced logging paths).
TEST_F(RolloutKernelTests, SplitRolloutKernelGPUvsCPU)
{
  state_array x0 = state_array::Random();
  /**
   * GPU Setup
   **/
  model->GPUSetup();
  cost->GPUSetup();
  sampler->GPUSetup();
  Eigen::MatrixXf control_seq = Eigen::MatrixXf::Random(DYN_T::CONTROL_DIM, num_timesteps * num_rollouts);
  sampler->copyImportanceSamplerToDevice(control_seq.data(), 0, false);

  // Generate samples and do stream synchronize
  logger->debug("Generating samples\n");
  curandGenerator_t gen;
  curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
  curandSetStream(gen, stream);
  sampler->generateSamples(1, 0, gen, true);

  Eigen::MatrixXf trajectory_costs_cpu = Eigen::MatrixXf::Zero(num_rollouts, 1);
  Eigen::MatrixXf trajectory_costs_gpu = Eigen::MatrixXf::Zero(num_rollouts, 1);
  logger->debug("Running CPU Rollout\n");
  launchCPURolloutKernel(model, cost, sampler, dt, num_timesteps, num_rollouts, lambda, alpha, x0,
                         trajectory_costs_cpu, stream);

  /** GPU Computations **/
  float* initial_x_d;
  float* output_d;
  float* trajectory_costs_d;
  HANDLE_ERROR(cudaMalloc((void**)&initial_x_d, sizeof(float) * DYN_T::STATE_DIM));
  HANDLE_ERROR(cudaMalloc((void**)&output_d, sizeof(float) * DYN_T::OUTPUT_DIM * num_rollouts * num_timesteps));
  HANDLE_ERROR(cudaMalloc((void**)&trajectory_costs_d, sizeof(float) * num_rollouts));
  HANDLE_ERROR(
      cudaMemcpyAsync(initial_x_d, x0.data(), sizeof(float) * DYN_T::STATE_DIM, cudaMemcpyHostToDevice, stream));

  std::vector possible_dyn_thread_x;
  for (int i = 128; i > 0; i /= 2)
  {
    possible_dyn_thread_x.push_back(i);
  }
  std::vector possible_dyn_thread_y = { 1, 2, 3, 4 };
  std::vector possible_cost_thread_x;
  for (int i = num_timesteps; i > 0; i /= 2)
  {
    possible_cost_thread_x.push_back(i);
  }
  std::vector possible_cost_thread_y = { 1, 2, 3, 4 };
  for (const auto& dyn_thread_x : possible_dyn_thread_x)
  {
    for (const auto& dyn_thread_y : possible_dyn_thread_y)
    {
      for (const auto& cost_thread_x : possible_cost_thread_x)
      {
        for (const auto& cost_thread_y : possible_cost_thread_y)
        {
          dim3 dynThreadsPerBlock(dyn_thread_x, dyn_thread_y, 1);
          dim3 costThreadsPerBlock(cost_thread_x, cost_thread_y, 1);
          logger->debug("Running coalesced GPU Split Rollout with dyn(%d, %d, %d), cost(%d, %d, %d)\n",
                        dynThreadsPerBlock.x, dynThreadsPerBlock.y, dynThreadsPerBlock.z, costThreadsPerBlock.x,
                        costThreadsPerBlock.y, costThreadsPerBlock.z);
          mppi::kernels::launchSplitRolloutKernel(model, cost, sampler, dt, num_timesteps, num_rollouts, lambda, alpha,
                                                  initial_x_d, output_d, trajectory_costs_d, dynThreadsPerBlock,
                                                  costThreadsPerBlock, stream, false);
          HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_gpu.data(), trajectory_costs_d, sizeof(float) * num_rollouts,
                                       cudaMemcpyDeviceToHost, stream));
          HANDLE_ERROR(cudaStreamSynchronize(stream));
          // eigen_assert_float_near(trajectory_costs_cpu, trajectory_costs_gpu, 1e-4f);
          for (int i = 0; i < num_rollouts; i++)
          {
            float cost_diff = trajectory_costs_cpu(i) - trajectory_costs_gpu(i);
            // NOTE(review): `+ +" cost("` applies unary plus to the string
            // literal (it still compiles, decaying to a pointer), and the
            // dyn(...) part is missing its closing ")" — message reads
            // "dyn(x, y, z cost(...)". Cosmetic; confirm upstream intent.
            std::string error_prefix = "Split Rollout sample " + std::to_string(i) + " dyn(" +
                                       std::to_string(dynThreadsPerBlock.x) + ", " +
                                       std::to_string(dynThreadsPerBlock.y) + ", " +
                                       std::to_string(dynThreadsPerBlock.z) + +" cost(" +
                                       std::to_string(costThreadsPerBlock.x) + ", " +
                                       std::to_string(costThreadsPerBlock.y) + ", " +
                                       std::to_string(costThreadsPerBlock.z) + ")";
            ASSERT_LT(fabsf(cost_diff), 1e-3 * trajectory_costs_cpu(i))
                << error_prefix << ": CPU = " << trajectory_costs_cpu(i) << ", GPU = " << trajectory_costs_gpu(i)
                << std::endl;
          }
          // NOTE(review): this "non-coalesced" invocation is token-identical
          // to the coalesced one above in this extract — the selecting
          // argument was presumably lost in extraction.
          logger->debug("Running non-coalesced GPU Split Rollout with dyn(%d, %d, %d), cost(%d, %d, %d)\n",
                        dynThreadsPerBlock.x, dynThreadsPerBlock.y, dynThreadsPerBlock.z, costThreadsPerBlock.x,
                        costThreadsPerBlock.y, costThreadsPerBlock.z);
          mppi::kernels::launchSplitRolloutKernel(model, cost, sampler, dt, num_timesteps, num_rollouts, lambda, alpha,
                                                  initial_x_d, output_d, trajectory_costs_d, dynThreadsPerBlock,
                                                  costThreadsPerBlock, stream, false);
          HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs_gpu.data(), trajectory_costs_d, sizeof(float) * num_rollouts,
                                       cudaMemcpyDeviceToHost, stream));
          HANDLE_ERROR(cudaStreamSynchronize(stream));
          for (int i = 0; i < num_rollouts; i++)
          {
            float cost_diff = trajectory_costs_cpu(i) - trajectory_costs_gpu(i);
            std::string error_prefix = "Split Rollout sample " + std::to_string(i) + " dyn(" +
                                       std::to_string(dynThreadsPerBlock.x) + ", " +
                                       std::to_string(dynThreadsPerBlock.y) + ", " +
                                       std::to_string(dynThreadsPerBlock.z) + +" cost(" +
                                       std::to_string(costThreadsPerBlock.x) + ", " +
                                       std::to_string(costThreadsPerBlock.y) + ", " +
                                       std::to_string(costThreadsPerBlock.z) + ")";
            ASSERT_LT(fabsf(cost_diff), 1e-3 * trajectory_costs_cpu(i))
                << error_prefix << ": CPU = " << trajectory_costs_cpu(i) << ", GPU = " << trajectory_costs_gpu(i)
                << std::endl;
          }
          // eigen_assert_float_near(trajectory_costs_cpu, trajectory_costs_gpu, 1e-4f);
        }
      }
    }
  }
}

================================================ FILE: tests/mppi_core/viz_kernels_test.cu ================================================
#include
#include
#include
#include
#include
#include
#include

// NOTE(review): the remainder of viz_kernels_test.cu is truncated in this
// extract — the class below is reproduced as far as the source text goes.
class VizualizationKernelsTest : public ::testing::Test
{
protected:
  void SetUp() override
  {
    CartpoleQuadraticCostParams new_params;
    new_params.cart_position_coeff = 100;
    new_params.pole_angle_coeff = 200;
    new_params.cart_velocity_coeff = 10;
    new_params.pole_angular_velocity_coeff = 20;
    new_params.control_cost_coeff[0] = 1;
    new_params.terminal_cost_coeff = 0;
    new_params.desired_terminal_state[0] = -20;
    new_params.desired_terminal_state[1] = 0;
    new_params.desired_terminal_state[2] = M_PI;
    new_params.desired_terminal_state[3] = 0;
    cost.setParams(new_params);
    cudaStreamCreate(&stream);
    /**
     * Ensure dynamics and costs exist on GPU
     */
    dynamics.bindToStream(stream);
    cost.bindToStream(stream);
    // Call the GPU setup functions of the model and cost
    dynamics.GPUSetup();
    cost.GPUSetup();
    fb_controller.GPUSetup();
    for (int i = 0; i < x0.rows(); i++)
    {
      x0(i) = i * 0.1 + 0.2;
    }
    for (int sample = 0; sample < num_rollouts; sample++)
    {
      control[sample] = control_trajectory::Random();
    }
    // Create x init
cuda array HANDLE_ERROR(cudaMalloc((void**)&initial_state_d, sizeof(float) * CartpoleDynamics::STATE_DIM * 2)); // create control u trajectory cuda array HANDLE_ERROR( cudaMalloc((void**)&control_d, sizeof(float) * CartpoleDynamics::CONTROL_DIM * MAX_TIMESTEPS * num_rollouts)); // Create cost trajectory cuda array HANDLE_ERROR(cudaMalloc((void**)&trajectory_costs_d, sizeof(float) * num_rollouts * (MAX_TIMESTEPS + 1) * 2)); // Create result state cuda array HANDLE_ERROR(cudaMalloc((void**)&result_state_d, sizeof(float) * num_rollouts * MAX_TIMESTEPS * CartpoleDynamics::STATE_DIM * 2)); // Create result state cuda array HANDLE_ERROR(cudaMalloc((void**)&crash_status_d, sizeof(float) * num_rollouts * MAX_TIMESTEPS * 2)); } void TearDown() override { cudaFree(initial_state_d); cudaFree(control_d); cudaFree(trajectory_costs_d); cudaFree(crash_status_d); cudaFree(result_state_d); } const static int MAX_TIMESTEPS = 100; typedef Eigen::Matrix state_trajectory; // A state trajectory typedef Eigen::Matrix control_trajectory; // A control // trajectory typedef Eigen::Matrix cost_trajectory; typedef Eigen::Matrix crash_status_trajectory; CartpoleDynamics dynamics = CartpoleDynamics(1, 1, 1); CartpoleQuadraticCost cost; DDPFeedback fb_controller = DDPFeedback(&dynamics, dt); float dt = 0.02; int num_rollouts = 20; float lambda = 0.5; float alpha = 0.001; cudaStream_t stream; float* initial_state_d; float* control_d; float* trajectory_costs_d; float* result_state_d; int* crash_status_d; CartpoleDynamics::state_array x0; std::vector control = std::vector(num_rollouts); std::vector result_state = std::vector(num_rollouts); std::vector trajectory_costs = std::vector(num_rollouts); std::vector crash_status = std::vector(num_rollouts); }; // TEST_F(VizualizationKernelsTest, stateAndCostTrajectoryKernelNoZNoFeedbackTest) // { // for (int tdy = 1; tdy < 8; tdy++) // { // /** // * Fill in GPU arrays // */ // HANDLE_ERROR(cudaMemcpyAsync(initial_state_d, x0.data(), sizeof(float) * 
CartpoleDynamics::STATE_DIM, // cudaMemcpyHostToDevice, stream)); // HANDLE_ERROR(cudaMemcpyAsync(initial_state_d + CartpoleDynamics::STATE_DIM, x0.data(), // sizeof(float) * CartpoleDynamics::STATE_DIM, cudaMemcpyHostToDevice, stream)); // for (int i = 0; i < num_rollouts; i++) // { // HANDLE_ERROR(cudaMemcpyAsync(control_d + i * MAX_TIMESTEPS * CartpoleDynamics::CONTROL_DIM, control[i].data(), // MAX_TIMESTEPS * CartpoleDynamics::CONTROL_DIM * sizeof(float), // cudaMemcpyHostToDevice, stream)); // } // HANDLE_ERROR(cudaStreamSynchronize(stream)); // const int gridsize_x = (num_rollouts - 1) / 32 + 1; // dim3 dimBlock(32, tdy, 1); // dim3 dimGrid(gridsize_x, 1, 1); // mppi_common::stateAndCostTrajectoryKernel, 32, 1> // <<>>(dynamics.model_d_, cost.cost_d_, fb_controller.getDevicePointer(), // control_d, // initial_state_d, result_state_d, trajectory_costs_d, crash_status_d, // num_rollouts, MAX_TIMESTEPS, dt, -1); // // Copy the results back to the host // for (int i = 0; i < num_rollouts; i++) // { // result_state[i].col(0) = x0; // // shifted by one since we do not save the initial state // HANDLE_ERROR(cudaMemcpyAsync(result_state[i].data() + (CartpoleDynamics::STATE_DIM), // result_state_d + i * MAX_TIMESTEPS * CartpoleDynamics::STATE_DIM, // (MAX_TIMESTEPS - 1) * CartpoleDynamics::STATE_DIM * sizeof(float), // cudaMemcpyDeviceToHost, stream)); // HANDLE_ERROR(cudaMemcpyAsync(trajectory_costs[i].data(), trajectory_costs_d + i * (MAX_TIMESTEPS + 1), // (MAX_TIMESTEPS + 1) * sizeof(float), cudaMemcpyDeviceToHost, stream)); // HANDLE_ERROR(cudaMemcpyAsync(crash_status[i].data(), crash_status_d + i * MAX_TIMESTEPS, // MAX_TIMESTEPS * sizeof(float), cudaMemcpyDeviceToHost, stream)); // } // HANDLE_ERROR(cudaStreamSynchronize(stream)); // for (int sample = 0; sample < num_rollouts; sample++) // { // CartpoleDynamics::state_array x = x0; // CartpoleDynamics::state_array x_dot; // control_trajectory u_traj = control[sample]; // int crash_status_val = 0; // int t = 0; 
// for (; t < MAX_TIMESTEPS; t++) // { // EXPECT_NEAR(x(0), result_state[sample].col(t)(0), 1e-5) // << "\ntdy: " << tdy << "\nsample: " << sample << "\nat time: " << t; // EXPECT_NEAR(x(1), result_state[sample].col(t)(1), 1e-5) // << "\ntdy: " << tdy << "\nsample: " << sample << "\nat time: " << t; // EXPECT_NEAR(x(2), result_state[sample].col(t)(2), 1e-5) // << "\ntdy: " << tdy << "\nsample: " << sample << "\nat time: " << t; // EXPECT_NEAR(x(3), result_state[sample].col(t)(3), 1e-5) // << "\ntdy: " << tdy << "\nsample: " << sample << "\nat time: " << t; // CartpoleDynamics::control_array u = u_traj.col(t); // float cost_val = cost.computeStateCost(x, t, &crash_status_val); // if (t == 0) // { // // don't weight the first state // EXPECT_FLOAT_EQ(trajectory_costs[sample](t), 0); // } // else // { // EXPECT_FLOAT_EQ(cost_val, trajectory_costs[sample](t)) << "\nsample: " << sample << "\nat time: " << t; // } // EXPECT_EQ(crash_status_val, crash_status[sample](t)) << "\nsample: " << sample << "\nat time: " << t; // dynamics.enforceConstraints(x, u); // dynamics.computeStateDeriv(x, u, x_dot); // dynamics.updateState(x, x_dot, dt); // } // float terminal_cost = cost.terminalCost(x); // EXPECT_FLOAT_EQ(terminal_cost, trajectory_costs[sample](t)) << "\nsample: " << sample << "\nat terminal"; // } // } // } ================================================ FILE: tests/mppi_core/weightedreduction_kernel_tests.cu ================================================ #include #include #include #include TEST(WeightedReductionKernel, setInitialControlToZero) { const int num_threads = 100; const int control_dim = 5; std::array u_host = { 1 }; std::array u_intermediate_host = { 1 }; launchSetInitialControlToZero_KernelTest(u_host, u_intermediate_host); array_assert_float_eq(0.f, u_host); array_assert_float_eq(0.f, u_intermediate_host); } TEST(WeightedReductionKernel, strideControlWeightReduction) { auto generator = std::default_random_engine(7.0); auto distribution = 
std::normal_distribution(1.0, 0.2); const int sum_stride = 64; float normalizer = 1000; const int num_rollouts = 1024; const int control_dim = 6; const int num_timesteps = 100; std::array exp_costs_host = { 1 }; std::array v_host = { 1 }; // Disturbed control for (size_t i = 1; i < exp_costs_host.size(); ++i) { exp_costs_host[i] = 0.001 * distribution(generator); } for (size_t i = 1; i < v_host.size(); ++i) { v_host[i] = distribution(generator); } const int cell_size = ((num_rollouts - 1) / sum_stride + 1); std::array u_intermediate_host = { 0 }; launchStrideControlWeightReduction_KernelTest( normalizer, exp_costs_host, v_host, u_intermediate_host); CudaCheckError(); std::array u_intermediate_known = { 0 }; // Compute weights per rollouts std::array rollout_weights = { 0 }; for (size_t i = 0; i < rollout_weights.size(); ++i) { rollout_weights[i] = exp_costs_host[i] / normalizer; } // Weight all the controls with the appropriate cost std::array weighted_controls = { 0 }; for (int i = 0; i < num_rollouts; ++i) { for (int j = 0; j < num_timesteps; ++j) { for (int k = 0; k < control_dim; ++k) { int index = i * num_timesteps * control_dim + j * control_dim + k; weighted_controls[index] = rollout_weights[i] * v_host[index]; } } } // Iterate through each stride for (int i = 0; i < ((num_rollouts - 1) / sum_stride + 1); ++i) { // i is the cell index for (int j = 0; j < sum_stride; ++j) { // j is the stride index for (int k = 0; k < num_timesteps; ++k) { // k specifies the current timestep for (int w = 0; w < control_dim; ++w) { int rollout_index = (i * sum_stride + j); if (rollout_index < num_rollouts) { u_intermediate_known[k * cell_size * control_dim + i * control_dim + w] += weighted_controls[rollout_index * num_timesteps * control_dim + k * control_dim + w]; } } } } } array_assert_float_eq(u_intermediate_known, u_intermediate_host); } TEST(WeightedReductionKernel, rolloutWeightReductionAndSaveControl) { auto generator = std::default_random_engine(7.0); auto 
distribution = std::normal_distribution(1.0, 0.2); const int sum_stride = 64; const int num_rollouts = 1024; const int control_dim = 6; const int num_timesteps = 100; const int cell_size = ((num_rollouts - 1) / sum_stride + 1); std::array u_intermediate_host = { 0 }; std::array du_new_compute = { 0 }; // Update control std::array du_new_known = { 0 }; for (size_t i = 0; i < u_intermediate_host.size(); ++i) { u_intermediate_host[i] = distribution(generator); } // Compute the control update for (int i = 0; i < ((num_rollouts - 1) / sum_stride + 1); ++i) { // i is the cell index for (int k = 0; k < num_timesteps; ++k) { // k specifies the current timestep for (int w = 0; w < control_dim; ++w) { du_new_known[k * control_dim + w] += u_intermediate_host[k * cell_size * control_dim + i * control_dim + w]; } } } // Launch the test kernel launchRolloutWeightReductionAndSaveControl_KernelTest( u_intermediate_host, du_new_compute); array_assert_float_eq(du_new_known, du_new_compute); } TEST(WeightedReductionKernel, comparisonTestAutorallyMPPI_Generic) { auto generator = std::default_random_engine(7.0); auto distribution = std::normal_distribution(5.0, 1.2); const int sum_stride = 64; const int num_rollouts = 1024; const int control_dim = 4; const int num_timesteps = 100; std::array exp_costs; std::array perturbed_controls; std::array controls_out_autorally; std::array controls_out_mppi_generic; // Initialize the exp costs with positive numbers for (float& exp_cost : exp_costs) { exp_cost = expf(-1 * distribution(generator)); } exp_costs[0] = 0; // Minimum cost // Initialize the control perturbations with random numbers for (float& control : perturbed_controls) { control = distribution(generator); } // The normalizer is the sum of all exponential costs; float normalizer = std::accumulate(exp_costs.begin(), exp_costs.end(), 0.0); launchWeightedReductionKernelTest( exp_costs, perturbed_controls, normalizer, controls_out_mppi_generic, 0); 
launchAutoRallyWeightedReductionKernelTest( exp_costs, perturbed_controls, normalizer, controls_out_autorally, 0); array_expect_float_eq(controls_out_mppi_generic, controls_out_autorally); } ================================================ FILE: tests/nn_helpers/CMakeLists.txt ================================================ set(GTEST_LIBRARIES gtest gmock gtest_main) file(GLOB TARGET_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cu) foreach(T_FILE IN LISTS TARGET_SRCS) get_filename_component(T_NAME ${T_FILE} NAME_WE) add_executable(${T_NAME} ${PROJECT_SOURCE_DIR}/tests/test_main.cpp ${T_FILE}) target_link_libraries(${T_NAME} ${GTEST_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME}) gtest_discover_tests(${T_NAME}) set_target_properties(${T_NAME} PROPERTIES FOLDER test) endforeach() ================================================ FILE: tests/nn_helpers/activation_functions_tests.cu ================================================ #include #include #include #include #include #include #include #include #include "mppi/utils/math_utils.h" class ActivationFunctionTest : public testing::Test { protected: void SetUp() override { generator = std::default_random_engine(7.0); distribution = std::normal_distribution(0.0, 2.0); } void TearDown() override { } std::default_random_engine generator; std::normal_distribution distribution; }; TEST_F(ActivationFunctionTest, TanhCPU) { for (int i = 0; i < 1e5; i++) { float num = distribution(generator); EXPECT_FLOAT_EQ(mppi::nn::tanh(num), std::tanh(num)); } } __global__ void tanhTestKernel(float* input, int num, int times) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < num) { for (int i = 0; i < times; i++) { input[tid] = mppi::nn::tanh(input[tid]); } } } __global__ void tanhStableTestKernel(float* input, int num, int times) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < num) { for (int i = 0; i < times; i++) { input[tid] = mppi::nn::tanh_accurate(input[tid]); } } } template float launchTanhTestKernel(std::vector& 
input, int times = 100) { float* input_d; float* input_stable_d; int count = input.size(); HANDLE_ERROR(cudaMalloc((void**)&input_d, sizeof(float) * count)); HANDLE_ERROR(cudaMalloc((void**)&input_stable_d, sizeof(float) * count)); HANDLE_ERROR(cudaMemcpy(input_d, input.data(), sizeof(float) * count, cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(input_stable_d, input.data(), sizeof(float) * count, cudaMemcpyHostToDevice)); const int gridsize_x = (count - 1) / BLOCKDIM_X + 1; dim3 threadsPerBlock(BLOCKDIM_X); dim3 numBlocks(gridsize_x, 1); cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); cudaEvent_t start, stop; float time_stable, time_fast; cudaEventCreate(&start); cudaEventCreate(&stop); std::cout << "\n===== TANH ======\n"; cudaEventRecord(start, stream); tanhTestKernel<<>>(input_d, count, times); cudaEventRecord(stop, stream); cudaEventSynchronize(stop); cudaEventElapsedTime(&time_fast, start, stop); std::cout << "time to compute fast " << time_fast << std::endl; cudaEventRecord(start, stream); tanhStableTestKernel<<>>(input_stable_d, count, times); cudaEventRecord(stop, stream); cudaEventSynchronize(stop); cudaEventElapsedTime(&time_stable, start, stop); std::cout << "time to compute stable " << time_stable << std::endl; std::cout << "time difference: " << time_stable - time_fast << std::endl; HANDLE_ERROR(cudaMemcpy(input.data(), input_d, sizeof(float) * count, cudaMemcpyDeviceToHost)); cudaFree(input_d); cudaFree(input_stable_d); return time_stable - time_fast; } TEST_F(ActivationFunctionTest, TanhGPU) { std::vector vec(1e8); for (int i = 0; i < vec.size(); i++) { vec[i] = distribution(generator); } std::vector output_vec = vec; launchTanhTestKernel(output_vec, 1); for (int i = 0; i < vec.size(); i++) { EXPECT_NEAR(output_vec[i], std::tanh(vec[i]), 2.0e-7); } float average = 0; for (int i = 0; i < 10; i++) { average += launchTanhTestKernel(output_vec); } average /= 10.0f; std::cout << "got average diff " << average << std::endl; // below 
condition is kind of a janky hardcode, might change on different systems EXPECT_LT(std::abs(average), 3); EXPECT_LT(average, 3); // in ms EXPECT_GT(average, -3); } TEST_F(ActivationFunctionTest, SigmoidCPU) { for (int i = 0; i < 1e5; i++) { float num = distribution(generator); EXPECT_FLOAT_EQ(mppi::nn::sigmoid(num), (1.0f / (1.0f + std::exp(-num)))); } } __global__ void sigmoidTestKernel(float* input, int num, int times) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < num) { for (int i = 0; i < times; i++) { input[tid] = mppi::nn::sigmoid(input[tid]); } } } __global__ void sigmoidStableTestKernel(float* input, int num, int times) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < num) { for (int i = 0; i < times; i++) { input[tid] = mppi::nn::sigmoid_accurate(input[tid]); } } } template float launchSigmoidTestKernel(std::vector& input, int times = 100) { float* input_d; float* input_stable_d; int count = input.size(); HANDLE_ERROR(cudaMalloc((void**)&input_d, sizeof(float) * count)); HANDLE_ERROR(cudaMalloc((void**)&input_stable_d, sizeof(float) * count)); HANDLE_ERROR(cudaMemcpy(input_d, input.data(), sizeof(float) * count, cudaMemcpyHostToDevice)); HANDLE_ERROR(cudaMemcpy(input_stable_d, input.data(), sizeof(float) * count, cudaMemcpyHostToDevice)); const int gridsize_x = (count - 1) / BLOCKDIM_X + 1; dim3 threadsPerBlock(BLOCKDIM_X); dim3 numBlocks(gridsize_x, 1); cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); cudaEvent_t start, stop; float time_fast, time_stable; cudaEventCreate(&start); cudaEventCreate(&stop); std::cout << "\n===== SIGMOID ======\n"; cudaEventRecord(start, stream); sigmoidTestKernel<<>>(input_d, count, times); cudaEventRecord(stop, stream); cudaEventSynchronize(stop); cudaEventElapsedTime(&time_fast, start, stop); std::cout << "time to compute fast " << time_fast << std::endl; cudaEventRecord(start, stream); sigmoidStableTestKernel<<>>(input_stable_d, count, times); cudaEventRecord(stop, stream); 
cudaEventSynchronize(stop); cudaEventElapsedTime(&time_stable, start, stop); std::cout << "time to compute stable " << time_stable << std::endl; std::cout << "time difference: " << time_stable - time_fast << std::endl; HANDLE_ERROR(cudaMemcpy(input.data(), input_d, sizeof(float) * count, cudaMemcpyDeviceToHost)); cudaFree(input_d); cudaFree(input_stable_d); return time_stable - time_fast; } TEST_F(ActivationFunctionTest, SigmoidGPU) { std::vector vec(1e8); for (int i = 0; i < vec.size(); i++) { vec[i] = distribution(generator); } std::vector output_vec = vec; launchSigmoidTestKernel(output_vec, 1); for (int i = 0; i < vec.size(); i++) { EXPECT_NEAR(output_vec[i], (1.0f / (1.0f + std::exp(-vec[i]))), 2.0e-7); } float average = 0.0f; for (int i = 0; i < 10; i++) { average += launchSigmoidTestKernel(output_vec); } average /= 10.0f; std::cout << "got average difference " << average << std::endl; EXPECT_GT(average, 0.0f); // below condition is kind of a janky hardcode, might change on different systems // if should still be faster to use the faster version, so number should be positive EXPECT_GT(average, 1.0f); // in ms } ================================================ FILE: tests/nn_helpers/fnn_helper_test.cu ================================================ #include #include #include #include #include #include // Auto-generated header file #include #include #include #include "mppi/ddp/ddp_dynamics.h" class FNNHelperTest : public testing::Test { protected: void SetUp() override { generator = std::default_random_engine(7.0); distribution = std::normal_distribution(0.0, 1.0); } void TearDown() override { } std::default_random_engine generator; std::normal_distribution distribution; }; TEST_F(FNNHelperTest, ParamsConstructor1) { std::vector vec = { 5, 10, 3 }; FNNHelper<> helper(vec); EXPECT_EQ(helper.getNumLayers(), 3); EXPECT_EQ(helper.getLargestLayer(), 10); EXPECT_EQ(helper.getNumParams(), 93); auto stride = helper.getStrideIdcsPtr(); EXPECT_EQ(stride[0], 0); 
EXPECT_EQ(stride[1], 50); EXPECT_EQ(stride[2], 60); EXPECT_EQ(stride[3], 90); auto structure = helper.getNetStructurePtr(); EXPECT_EQ(structure[0], 5); EXPECT_EQ(structure[1], 10); EXPECT_EQ(structure[2], 3); auto theta = helper.getThetaPtr(); for (int i = 0; i < 93; i++) { EXPECT_FLOAT_EQ(theta[i], 0.0f); } const int shared_mem_grd = helper.getGrdSharedSizeBytes(); const int shared_mem_blk = helper.getBlkSharedSizeBytes(); EXPECT_EQ(shared_mem_grd, 400); EXPECT_EQ(shared_mem_blk, 80); } TEST_F(FNNHelperTest, ParamsConstructor2) { std::vector layers = { 5, 10, 20, 3, 3 }; FNNHelper<> helper(layers); EXPECT_EQ(helper.getNumLayers(), 5); EXPECT_EQ(helper.getNumParams(), 355); EXPECT_EQ(helper.getLargestLayer(), 20); auto stride = helper.getStrideIdcsPtr(); EXPECT_EQ(stride[0], 0); EXPECT_EQ(stride[1], 50); EXPECT_EQ(stride[2], 60); EXPECT_EQ(stride[3], 260); EXPECT_EQ(stride[4], 280); EXPECT_EQ(stride[5], 340); EXPECT_EQ(stride[6], 343); EXPECT_EQ(stride[7], 352); auto structure = helper.getNetStructurePtr(); EXPECT_EQ(structure[0], 5); EXPECT_EQ(structure[1], 10); EXPECT_EQ(structure[2], 20); EXPECT_EQ(structure[3], 3); EXPECT_EQ(structure[4], 3); auto theta = helper.getThetaPtr(); for (int i = 0; i < 355; i++) { EXPECT_FLOAT_EQ(theta[i], 0.0f); } const int shared_mem_grd = helper.getGrdSharedSizeBytes(); const int shared_mem_blk = helper.getBlkSharedSizeBytes(); EXPECT_EQ(shared_mem_grd, 1472); EXPECT_EQ(shared_mem_blk, 160); } TEST_F(FNNHelperTest, ParamsConstructor3) { std::vector layers = { 5, 3 }; FNNHelper<> helper(layers); EXPECT_EQ(helper.getNumLayers(), 2); EXPECT_EQ(helper.getLargestLayer(), 5); EXPECT_EQ(helper.getNumParams(), 18); EXPECT_EQ(helper.getGrdSharedSizeBytes(), 96); EXPECT_EQ(helper.getBlkSharedSizeBytes(), 12 * 4); EXPECT_EQ(5, helper.getInputDim()); EXPECT_EQ(3, helper.getOutputDim()); auto stride = helper.getStrideIdcsPtr(); EXPECT_EQ(stride[0], 0); EXPECT_EQ(stride[1], 15); auto structure = helper.getNetStructurePtr(); 
EXPECT_EQ(structure[0], 5); EXPECT_EQ(structure[1], 3); auto theta = helper.getThetaPtr(); for (int i = 0; i < 18; i++) { EXPECT_FLOAT_EQ(theta[i], 0.0f); } } TEST_F(FNNHelperTest, ParamsConstructor4) { std::vector layers = { 5, 3 }; FNNHelper helper(layers); EXPECT_EQ(helper.getGrdSharedSizeBytes(), 0); EXPECT_EQ(helper.getBlkSharedSizeBytes(), 12 * 4); } TEST_F(FNNHelperTest, BindStream) { cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); std::vector layers = { 5, 3 }; FNNHelper<> helper(layers, stream); EXPECT_EQ(helper.stream_, stream); } TEST_F(FNNHelperTest, GPUSetupAndParamsCheck) { std::vector layers{ 6, 32, 32, 4 }; FNNHelper<> model(layers); float* theta = model.getThetaPtr(); int* stride = model.getStrideIdcsPtr(); int* net_structure = model.getNetStructurePtr(); std::array theta_result = {}; std::array stride_result = {}; std::array net_structure_result = {}; std::array shared_theta_result = {}; std::array shared_stride_result = {}; std::array shared_net_structure_result = {}; EXPECT_EQ(model.GPUMemStatus_, false); EXPECT_EQ(model.network_d_, nullptr); EXPECT_NE(model.getGrdSharedSizeBytes(), 0); model.GPUSetup(); EXPECT_NE(model.getGrdSharedSizeBytes(), 0); EXPECT_EQ(model.GPUMemStatus_, true); EXPECT_NE(model.network_d_, nullptr); // launch kernel launchParameterCheckTestKernel, 1412, 6, 4>(model, theta_result, stride_result, net_structure_result, shared_theta_result, shared_stride_result, shared_net_structure_result); for (int i = 0; i < 1412; i++) { // these are a bunch of mostly random values and nan != nan EXPECT_FLOAT_EQ(theta_result[i], theta[i]); EXPECT_FLOAT_EQ(shared_theta_result[i], theta[i]); } for (int i = 0; i < 6; i++) { EXPECT_EQ(stride_result[i], stride[i]); EXPECT_EQ(shared_stride_result[i], stride[i]); } for (int i = 0; i < 4; i++) { EXPECT_EQ(net_structure[i], net_structure_result[i]); EXPECT_EQ(net_structure[i], shared_net_structure_result[i]); } } TEST_F(FNNHelperTest, UpdateModelTest) { std::vector layers{ 6, 32, 32, 
4 }; FNNHelper<> model(layers); float* theta = model.getThetaPtr(); int* stride = model.getStrideIdcsPtr(); int* net_structure = model.getNetStructurePtr(); std::array theta_result = {}; std::array stride_result = {}; std::array net_structure_result = {}; std::array shared_theta_result = {}; std::array shared_stride_result = {}; std::array shared_net_structure_result = {}; model.GPUSetup(); std::vector theta_vec(1412); for (int i = 0; i < 1412; i++) { theta_vec[i] = distribution(generator); } model.updateModel({ 6, 32, 32, 4 }, theta_vec); // check CPU for (int i = 0; i < 1412; i++) { // these are a bunch of mostly random values and nan != nan EXPECT_FLOAT_EQ(model.getThetaPtr()[i], theta_vec[i]); } // launch kernel launchParameterCheckTestKernel, 1412, 6, 4>(model, theta_result, stride_result, net_structure_result, shared_theta_result, shared_stride_result, shared_net_structure_result); for (int i = 0; i < 1412; i++) { // these are a bunch of mostly random values and nan != nan EXPECT_FLOAT_EQ(theta_result[i], theta_vec[i]); EXPECT_FLOAT_EQ(shared_theta_result[i], theta_vec[i]); } for (int i = 0; i < 6; i++) { EXPECT_EQ(stride_result[i], stride[i]); EXPECT_EQ(shared_stride_result[i], stride[i]); } for (int i = 0; i < 4; i++) { EXPECT_EQ(net_structure[i], net_structure_result[i]); EXPECT_EQ(net_structure[i], shared_net_structure_result[i]); } } TEST_F(FNNHelperTest, LoadModelTest) { std::vector layers{ 6, 4 }; FNNHelper<> model(layers); model.GPUSetup(); std::string path = mppi::tests::test_load_nn_file; model.loadParams(path); std::array stride = { 0, 192, 224, 1248, 1280, 1408 }; std::array net_structure = { 6, 32, 32, 4 }; // check CPU for (int i = 0; i < 1412; i++) { EXPECT_FLOAT_EQ(model.getThetaPtr()[i], i) << "failed at index " << i; } for (int i = 0; i < 6; i++) { EXPECT_EQ(model.getStrideIdcsPtr()[i], stride[i]); } for (int i = 0; i < 4; i++) { EXPECT_EQ(net_structure[i], model.getNetStructurePtr()[i]); } std::array theta_result = {}; std::array 
stride_result = {}; std::array net_structure_result = {}; std::array shared_theta_result = {}; std::array shared_stride_result = {}; std::array shared_net_structure_result = {}; // launch kernel launchParameterCheckTestKernel, 1412, 6, 4>(model, theta_result, stride_result, net_structure_result, shared_theta_result, shared_stride_result, shared_net_structure_result); for (int i = 0; i < 1412; i++) { EXPECT_FLOAT_EQ(theta_result[i], i) << "failed at index " << i; EXPECT_FLOAT_EQ(shared_theta_result[i], i) << "failed at index " << i; } for (int i = 0; i < 6; i++) { EXPECT_EQ(stride_result[i], stride[i]); EXPECT_EQ(shared_stride_result[i], stride[i]); } for (int i = 0; i < 4; i++) { EXPECT_EQ(net_structure[i], net_structure_result[i]); EXPECT_EQ(net_structure[i], shared_net_structure_result[i]); } FNNHelper<> model2(path); model2.GPUSetup(); // launch kernel launchParameterCheckTestKernel, 1412, 6, 4>(model2, theta_result, stride_result, net_structure_result, shared_theta_result, shared_stride_result, shared_net_structure_result); for (int i = 0; i < 1412; i++) { EXPECT_FLOAT_EQ(theta_result[i], i) << "failed at index " << i; EXPECT_FLOAT_EQ(shared_theta_result[i], i) << "failed at index " << i; } for (int i = 0; i < 6; i++) { EXPECT_EQ(stride_result[i], stride[i]); EXPECT_EQ(shared_stride_result[i], stride[i]); } for (int i = 0; i < 4; i++) { EXPECT_EQ(net_structure[i], net_structure_result[i]); EXPECT_EQ(net_structure[i], shared_net_structure_result[i]); } } TEST_F(FNNHelperTest, LoadModelNPZTest) { std::vector layers{ 6, 4 }; FNNHelper<> model(layers); model.GPUSetup(); std::string path = mppi::tests::test_load_nn_file; if (!fileExists(path)) { std::cerr << "Could not load neural net model at path: " << path.c_str(); exit(-1); } cnpy::npz_t param_dict = cnpy::npz_load(path); model.loadParams(param_dict); // check CPU for (int i = 0; i < 1412; i++) { EXPECT_FLOAT_EQ(model.getThetaPtr()[i], i) << "failed at index " << i; } std::array theta_result = {}; std::array 
stride_result = {}; std::array net_structure_result = {}; std::array shared_theta_result = {}; std::array shared_stride_result = {}; std::array shared_net_structure_result = {}; std::array stride = { 0, 192, 224, 1248, 1280, 1408 }; std::array net_structure = { 6, 32, 32, 4 }; // launch kernel launchParameterCheckTestKernel, 1412, 6, 4>(model, theta_result, stride_result, net_structure_result, shared_theta_result, shared_stride_result, shared_net_structure_result); for (int i = 0; i < 1412; i++) { EXPECT_FLOAT_EQ(theta_result[i], i) << "failed at index " << i; EXPECT_FLOAT_EQ(shared_theta_result[i], i) << "failed at index " << i; } for (int i = 0; i < 6; i++) { EXPECT_EQ(stride_result[i], stride[i]); EXPECT_EQ(shared_stride_result[i], stride[i]); } for (int i = 0; i < 4; i++) { EXPECT_EQ(net_structure[i], net_structure_result[i]); EXPECT_EQ(net_structure[i], shared_net_structure_result[i]); } } TEST_F(FNNHelperTest, LoadModelNPZTestNested) { std::vector layers{ 28, 30, 30, 2 }; FNNHelper<> model(layers); model.GPUSetup(); std::string path = mppi::tests::test_lstm_lstm; if (!fileExists(path)) { std::cerr << "Could not load neural net model at path: " << path.c_str(); exit(-1); } cnpy::npz_t param_dict = cnpy::npz_load(path); model.loadParams("output", param_dict); } TEST_F(FNNHelperTest, forwardCPU) { std::vector layers{ 6, 32, 32, 4 }; FNNHelper<> model(layers); Eigen::MatrixXf input = model.getZeroInputVector(); input.setZero(); Eigen::MatrixXf output = model.getZeroOutputVector(); std::vector theta(1412); std::fill(theta.begin(), theta.end(), 1); model.updateModel({ 6, 32, 32, 4 }, theta); model.forward(input, output); EXPECT_FLOAT_EQ(output(0), 33); EXPECT_FLOAT_EQ(output(1), 33); EXPECT_FLOAT_EQ(output(2), 33); EXPECT_FLOAT_EQ(output(3), 33); // modify bias theta[1408] = 2.0; theta[1409] = 3.0; theta[1410] = 4.0; theta[1411] = 5.0; model.updateModel({ 6, 32, 32, 4 }, theta); model.forward(input, output); EXPECT_FLOAT_EQ(output(0), 34); 
EXPECT_FLOAT_EQ(output(1), 35); EXPECT_FLOAT_EQ(output(2), 36); EXPECT_FLOAT_EQ(output(3), 37); // modify weight theta[1280] = 2.0; theta[1312] = 2.0; theta[1344] = 2.0; theta[1376] = 2.0; model.updateModel({ 6, 32, 32, 4 }, theta); model.forward(input, output); EXPECT_FLOAT_EQ(output(0), 35); EXPECT_FLOAT_EQ(output(1), 36); EXPECT_FLOAT_EQ(output(2), 37); EXPECT_FLOAT_EQ(output(3), 38); } TEST_F(FNNHelperTest, forwardGPU) { const int num_rollouts = 1000; std::vector layers{ 6, 32, 32, 4 }; FNNHelper<> model(layers); model.GPUSetup(); std::vector theta(1412); std::fill(theta.begin(), theta.end(), 1); model.updateModel({ 6, 32, 32, 4 }, theta); Eigen::Matrix inputs; inputs = Eigen::Matrix::Zero(); std::vector> input_arr(num_rollouts); std::vector> output_arr(num_rollouts); for (int y_dim = 1; y_dim < 16; y_dim++) { for (int state_index = 0; state_index < num_rollouts; state_index++) { for (int dim = 0; dim < input_arr[0].size(); dim++) { input_arr[state_index][dim] = inputs.col(state_index)(dim); } } launchForwardTestKernel, 6, 4, 32>(model, input_arr, output_arr, y_dim); for (int point = 0; point < num_rollouts; point++) { Eigen::MatrixXf input = inputs.col(point); Eigen::MatrixXf output = model.getZeroOutputVector(); model.forward(input, output); for (int dim = 0; dim < 6; dim++) { EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; } for (int dim = 0; dim < 4; dim++) { EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(output_arr[point][dim])); EXPECT_FLOAT_EQ(output(dim), 33); } } } } TEST_F(FNNHelperTest, forwardGPUCompare) { const int num_rollouts = 1000; std::vector layers{ 6, 32, 32, 4 }; FNNHelper<> model(layers); model.GPUSetup(); std::vector theta(1412); for (int i = 0; i < 1412; i++) { theta[i] = distribution(generator); } model.updateModel({ 6, 32, 32, 4 }, theta); Eigen::Matrix inputs; inputs = Eigen::Matrix::Random(); 
std::vector> input_arr(num_rollouts); std::vector> output_arr(num_rollouts); for (int y_dim = 1; y_dim < 16; y_dim++) { for (int state_index = 0; state_index < num_rollouts; state_index++) { for (int dim = 0; dim < input_arr[0].size(); dim++) { input_arr[state_index][dim] = inputs.col(state_index)(dim); } } launchForwardTestKernel, 6, 4, 32>(model, input_arr, output_arr, y_dim); for (int point = 0; point < num_rollouts; point++) { Eigen::MatrixXf input = inputs.col(point); Eigen::MatrixXf output = model.getZeroOutputVector(); model.forward(input, output); for (int dim = 0; dim < 6; dim++) { EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; } for (int dim = 0; dim < 4; dim++) { EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(output_arr[point][dim])); } } } } TEST_F(FNNHelperTest, forwardGPUCompareNoShared) { const int num_rollouts = 10; std::vector layers{ 6, 32, 32, 4 }; FNNHelper model(layers); model.GPUSetup(); std::vector theta(1412); for (int i = 0; i < 1412; i++) { theta[i] = distribution(generator); } model.updateModel({ 6, 32, 32, 4 }, theta); Eigen::Matrix inputs; inputs = Eigen::Matrix::Random(); std::vector> input_arr(num_rollouts); std::vector> output_arr(num_rollouts); for (int y_dim = 1; y_dim < 16; y_dim++) { for (int state_index = 0; state_index < num_rollouts; state_index++) { for (int dim = 0; dim < input_arr[0].size(); dim++) { input_arr[state_index][dim] = inputs.col(state_index)(dim); } } launchForwardTestKernel, 6, 4, 32>(model, input_arr, output_arr, y_dim); for (int point = 0; point < num_rollouts; point++) { Eigen::MatrixXf input = inputs.col(point); Eigen::MatrixXf output = model.getZeroOutputVector(); model.forward(input, output); for (int dim = 0; dim < 6; dim++) { EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " dim " << dim << " with y_dim " << y_dim; } for (int dim = 0; 
dim < 4; dim++) { EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4) << "at index " << point << " dim " << dim << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(output_arr[point][dim])); } } } } TEST_F(FNNHelperTest, forwardGPUComparePreload) { const int num_rollouts = 1000; std::vector layers{ 6, 32, 32, 4 }; FNNHelper<> model(layers); model.GPUSetup(); std::vector theta(1412); for (int i = 0; i < 1412; i++) { theta[i] = distribution(generator); } model.updateModel({ 6, 32, 32, 4 }, theta); Eigen::Matrix inputs; inputs = Eigen::Matrix::Random(); std::vector> input_arr(num_rollouts); std::vector> output_arr(num_rollouts); for (int y_dim = 1; y_dim < 16; y_dim++) { for (int state_index = 0; state_index < num_rollouts; state_index++) { for (int dim = 0; dim < input_arr[0].size(); dim++) { input_arr[state_index][dim] = inputs.col(state_index)(dim); } } launchForwardTestKernelPreload, 6, 4, 32>(model, input_arr, output_arr, y_dim); for (int point = 0; point < num_rollouts; point++) { Eigen::MatrixXf input = inputs.col(point); Eigen::MatrixXf output = model.getZeroOutputVector(); model.forward(input, output); for (int dim = 0; dim < 6; dim++) { EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; } for (int dim = 0; dim < 4; dim++) { EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(output_arr[point][dim])); } } } } TEST_F(FNNHelperTest, forwardGPUComparePreloadNoShared) { const int num_rollouts = 1000; std::vector layers{ 6, 32, 32, 4 }; FNNHelper model(layers); model.GPUSetup(); std::vector theta(1412); for (int i = 0; i < 1412; i++) { theta[i] = distribution(generator); } model.updateModel({ 6, 32, 32, 4 }, theta); Eigen::Matrix inputs; inputs = Eigen::Matrix::Random(); std::vector> input_arr(num_rollouts); std::vector> output_arr(num_rollouts); for (int y_dim = 1; y_dim < 16; y_dim++) { for (int state_index = 0; state_index < 
num_rollouts; state_index++) { for (int dim = 0; dim < input_arr[0].size(); dim++) { input_arr[state_index][dim] = inputs.col(state_index)(dim); } } launchForwardTestKernelPreload, 6, 4, 32>(model, input_arr, output_arr, y_dim); for (int point = 0; point < num_rollouts; point++) { Eigen::MatrixXf input = inputs.col(point); Eigen::MatrixXf output = model.getZeroOutputVector(); model.forward(input, output); for (int dim = 0; dim < 6; dim++) { EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; } for (int dim = 0; dim < 4; dim++) { EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(output_arr[point][dim])); } } } } TEST_F(FNNHelperTest, TestComputeGradComputationFinite) { std::vector layers{ 6, 32, 32, 4 }; FNNHelper<> model(layers); std::vector theta(1412); for (int i = 0; i < 1412; i++) { theta[i] = distribution(generator); } model.updateModel({ 6, 32, 32, 4 }, theta); Eigen::MatrixXf numeric_jac = model.getZeroJacobianMatrix(); Eigen::MatrixXf analytic_jac = model.getZeroJacobianMatrix(); for (int i = 0; i < 1000; i++) { Eigen::MatrixXf input = model.getZeroInputVector(); input.Random(); model.computeGrad(input, analytic_jac); EXPECT_TRUE(analytic_jac.allFinite()); } } TEST_F(FNNHelperTest, TestComputeGradComputationCompare) { GTEST_SKIP(); std::vector layers{ 6, 32, 32, 4 }; FNNHelper<> model(layers); std::vector theta(1412); for (int i = 0; i < 1412; i++) { theta[i] = distribution(generator); } model.updateModel({ 6, 32, 32, 4 }, theta); Eigen::MatrixXf numeric_jac; Eigen::MatrixXf analytic_jac; Eigen::MatrixXf input; input << 1, 2, 3, 4, 5, 6; model.computeGrad(input, analytic_jac); // numeric_jac = num_diff.df(input, numeric_jac); ASSERT_LT((numeric_jac - analytic_jac).norm(), 1e-3) << "Numeric Jacobian\n" << numeric_jac << "\nAnalytic Jacobian\n" << analytic_jac; } ================================================ FILE: 
tests/nn_helpers/lstm_helper_test.cu ================================================ #include #include #include #include #include #include // Auto-generated header file #include #include #include #include #include "mppi/ddp/ddp_dynamics.h" class LSTMHelperTest : public testing::Test { protected: void SetUp() override { generator = std::default_random_engine(7.0); distribution = std::normal_distribution(0.0, 1.0); } void TearDown() override { } std::default_random_engine generator; std::normal_distribution distribution; }; TEST_F(LSTMHelperTest, ParamsConstructor1) { std::vector vec = { 30, 10, 3 }; LSTMHelper<> model(5, 25, vec); EXPECT_EQ(model.getLSTMGrdSharedSizeBytes(), 12400); EXPECT_EQ(model.getGrdSharedSizeBytes(), 13808); EXPECT_EQ(model.getBlkSharedSizeBytes(), (25 * 2 + 30 * 2) * sizeof(float) + 8); EXPECT_EQ(model.getHiddenDim(), 25); EXPECT_EQ(model.getInputDim(), 5); EXPECT_EQ(model.getOutputDim(), 3); EXPECT_EQ(model.getHiddenHiddenSize(), 25 * 25); EXPECT_EQ(model.getInputHiddenSize(), 5 * 25); float* W_im = model.getWeights(); float* W_fm = model.getWeights() + model.getHiddenHiddenSize(); float* W_om = model.getWeights() + 2 * model.getHiddenHiddenSize(); float* W_cm = model.getWeights() + 3 * model.getHiddenHiddenSize(); for (int i = 0; i < 25 * 25; i++) { EXPECT_FLOAT_EQ(W_im[i], 0.0f); EXPECT_FLOAT_EQ(W_fm[i], 0.0f); EXPECT_FLOAT_EQ(W_cm[i], 0.0f); EXPECT_FLOAT_EQ(W_om[i], 0.0f); } float* W_ii = model.getWeights() + 4 * model.getHiddenHiddenSize(); float* W_fi = model.getWeights() + 4 * model.getHiddenHiddenSize() + model.getInputHiddenSize(); float* W_oi = model.getWeights() + 4 * model.getHiddenHiddenSize() + 2 * model.getInputHiddenSize(); float* W_ci = model.getWeights() + 4 * model.getHiddenHiddenSize() + 3 * model.getInputHiddenSize(); for (int i = 0; i < 5 * 25; i++) { EXPECT_FLOAT_EQ(W_ii[i], 0.0f); EXPECT_FLOAT_EQ(W_fi[i], 0.0f); EXPECT_FLOAT_EQ(W_oi[i], 0.0f); EXPECT_FLOAT_EQ(W_ci[i], 0.0f); } float* b_i = model.getWeights() + 4 * 
model.getHiddenHiddenSize() + 4 * model.getInputHiddenSize(); float* b_f = model.getWeights() + 4 * model.getHiddenHiddenSize() + 4 * model.getInputHiddenSize() + model.getHiddenDim(); float* b_o = model.getWeights() + 4 * model.getHiddenHiddenSize() + 4 * model.getInputHiddenSize() + 2 * model.getHiddenDim(); float* b_c = model.getWeights() + 4 * model.getHiddenHiddenSize() + 4 * model.getInputHiddenSize() + 3 * model.getHiddenDim(); for (int i = 0; i < 25; i++) { EXPECT_FLOAT_EQ(b_i[i], 0.0f); EXPECT_FLOAT_EQ(b_f[i], 0.0f); EXPECT_FLOAT_EQ(b_o[i], 0.0f); EXPECT_FLOAT_EQ(b_c[i], 0.0f); } float* init_hidden = model.getWeights() + 4 * model.getHiddenHiddenSize() + 4 * model.getInputHiddenSize() + 4 * model.getHiddenDim(); float* init_cell = model.getWeights() + 4 * model.getHiddenHiddenSize() + 4 * model.getInputHiddenSize() + 5 * model.getHiddenDim(); for (int i = 0; i < 25; i++) { EXPECT_FLOAT_EQ(init_hidden[i], 0.0f); EXPECT_FLOAT_EQ(init_cell[i], 0.0f); } } TEST_F(LSTMHelperTest, ConfigConstructor) { LSTMConfig config; config.input_dim = 5; config.hidden_dim = 25; config.output_layers = { 30, 10, 3 }; LSTMHelper<> model(config); EXPECT_EQ(model.getLSTMGrdSharedSizeBytes(), 12400); EXPECT_EQ(model.getGrdSharedSizeBytes(), 13808); EXPECT_EQ(model.getBlkSharedSizeBytes(), (25 * 2 + 30 * 2) * sizeof(float) + 8); EXPECT_EQ(model.getHiddenDim(), 25); EXPECT_EQ(model.getInputDim(), 5); EXPECT_EQ(model.getOutputDim(), 3); EXPECT_EQ(model.getHiddenHiddenSize(), 25 * 25); EXPECT_EQ(model.getInputHiddenSize(), 5 * 25); float* W_im = model.getWeights(); float* W_fm = model.getWeights() + model.getHiddenHiddenSize(); float* W_om = model.getWeights() + 2 * model.getHiddenHiddenSize(); float* W_cm = model.getWeights() + 3 * model.getHiddenHiddenSize(); for (int i = 0; i < 25 * 25; i++) { EXPECT_FLOAT_EQ(W_im[i], 0.0f); EXPECT_FLOAT_EQ(W_fm[i], 0.0f); EXPECT_FLOAT_EQ(W_cm[i], 0.0f); EXPECT_FLOAT_EQ(W_om[i], 0.0f); } float* W_ii = model.getWeights() + 4 * 
model.getHiddenHiddenSize(); float* W_fi = model.getWeights() + 4 * model.getHiddenHiddenSize() + model.getInputHiddenSize(); float* W_oi = model.getWeights() + 4 * model.getHiddenHiddenSize() + 2 * model.getInputHiddenSize(); float* W_ci = model.getWeights() + 4 * model.getHiddenHiddenSize() + 3 * model.getInputHiddenSize(); for (int i = 0; i < 5 * 25; i++) { EXPECT_FLOAT_EQ(W_ii[i], 0.0f); EXPECT_FLOAT_EQ(W_fi[i], 0.0f); EXPECT_FLOAT_EQ(W_oi[i], 0.0f); EXPECT_FLOAT_EQ(W_ci[i], 0.0f); } float* b_i = model.getWeights() + 4 * model.getHiddenHiddenSize() + 4 * model.getInputHiddenSize(); float* b_f = model.getWeights() + 4 * model.getHiddenHiddenSize() + 4 * model.getInputHiddenSize() + model.getHiddenDim(); float* b_o = model.getWeights() + 4 * model.getHiddenHiddenSize() + 4 * model.getInputHiddenSize() + 2 * model.getHiddenDim(); float* b_c = model.getWeights() + 4 * model.getHiddenHiddenSize() + 4 * model.getInputHiddenSize() + 3 * model.getHiddenDim(); for (int i = 0; i < 25; i++) { EXPECT_FLOAT_EQ(b_i[i], 0.0f); EXPECT_FLOAT_EQ(b_f[i], 0.0f); EXPECT_FLOAT_EQ(b_o[i], 0.0f); EXPECT_FLOAT_EQ(b_c[i], 0.0f); } float* init_hidden = model.getWeights() + 4 * model.getHiddenHiddenSize() + 4 * model.getInputHiddenSize() + 4 * model.getHiddenDim(); float* init_cell = model.getWeights() + 4 * model.getHiddenHiddenSize() + 4 * model.getInputHiddenSize() + 5 * model.getHiddenDim(); for (int i = 0; i < 25; i++) { EXPECT_FLOAT_EQ(init_hidden[i], 0.0f); EXPECT_FLOAT_EQ(init_cell[i], 0.0f); } } TEST_F(LSTMHelperTest, ParamsConstructor2) { int total_amount = 0; // delay model std::vector delay_vec = { 2, 10, 1 }; LSTMHelper<> delay(1, 1, delay_vec); total_amount += delay.getGrdSharedSizeBytes() / sizeof(float) + 1; total_amount += delay.getBlkSharedSizeBytes() * 32; // terra model std::vector terra_vec = { 18, 10, 3 }; LSTMHelper<> terra(8, 10, terra_vec); total_amount += terra.getGrdSharedSizeBytes() / sizeof(float) + 1; total_amount += terra.getBlkSharedSizeBytes() * 32; // 
engine model std::vector engine_vec = { 9, 10, 1 }; LSTMHelper<> engine(4, 5, engine_vec); total_amount += engine.getGrdSharedSizeBytes() / sizeof(float) + 1; total_amount += engine.getBlkSharedSizeBytes() * 32; // steering model std::vector steering_vec = { 12, 20, 1 }; LSTMHelper<> steering(7, 5, steering_vec); total_amount += steering.getGrdSharedSizeBytes() / sizeof(float) + 1; total_amount += steering.getBlkSharedSizeBytes() * 32; std::cout << "total amount: " << total_amount << std::endl; EXPECT_LT(total_amount, 49152); } TEST_F(LSTMHelperTest, BindStream) { cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); std::vector vec = { 30, 10, 3 }; LSTMHelper<> helper(5, 25, vec, stream); EXPECT_EQ(helper.stream_, stream); } // using LSTM = LSTMHelper, FNNParams<28, 3>>; TEST_F(LSTMHelperTest, GPUSetupAndParamsCheck) { std::vector vec = { 30, 3 }; LSTMHelper<> model(5, 25, vec); std::vector theta_vec(93); for (int i = 0; i < 93; i++) { // theta_vec[i] = distribution(generator); theta_vec[i] = static_cast(i); } model.updateOutputModel({ 30, 3 }, theta_vec); int grid_dim = 5; std::vector lstm_params(grid_dim); std::vector shared_lstm_params(grid_dim); std::vector fnn_params(grid_dim); std::vector shared_fnn_params(grid_dim); EXPECT_EQ(model.GPUMemStatus_, false); EXPECT_EQ(model.network_d_, nullptr); EXPECT_NE(model.getOutputModel(), nullptr); model.GPUSetup(); EXPECT_EQ(model.GPUMemStatus_, true); EXPECT_NE(model.network_d_, nullptr); // launch kernel launchParameterCheckTestKernel>(model, lstm_params, shared_lstm_params, fnn_params, shared_fnn_params, grid_dim); EXPECT_EQ(model.getOutputGrdSharedSizeBytes(), 400); for (int grid = 0; grid < grid_dim; grid++) { // ensure that the output nn matches for (int i = 0; i < 93; i++) { EXPECT_FLOAT_EQ(fnn_params[grid * 100 + i], theta_vec[i]) << "at grid " << grid << " at index " << i; } EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 93], 0); EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 94], 90); 
EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 95], 30); EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 96], 3); EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 97], 0); EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 98], 0); EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 99], 0); const int hh = 25 * 25; const int ih = 5 * 25; int shift = (25 * 6 + hh * 4 + ih * 4) * grid; for (int i = 0; i < 25 * 25; i++) { EXPECT_FLOAT_EQ(lstm_params[shift + i], 0.0f); EXPECT_FLOAT_EQ(lstm_params[shift + hh + i], 0.0f); EXPECT_FLOAT_EQ(lstm_params[shift + 2 * hh + i], 0.0f); EXPECT_FLOAT_EQ(lstm_params[shift + 3 * hh + i], 0.0f); } for (int i = 0; i < 8 * 25; i++) { EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + i], 0.0f); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + ih + i], 0.0f); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 2 * ih + i], 0.0f); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 3 * ih + i], 0.0f); } for (int i = 0; i < 25; i++) { EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + i], 0.0f); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 25 + i], 0.0f); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 50 + i], 0.0f); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 75 + i], 0.0f); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 100 + i], 0.0f) << "at index " << i; EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 125 + i], 0.0f) << "at index " << i; } } for (int grid = 0; grid < grid_dim; grid++) { // ensure that the output nn matches for (int i = 0; i < 93; i++) { EXPECT_FLOAT_EQ(shared_fnn_params[grid * 100 + i], theta_vec[i]) << "at grid " << grid << " at index " << i; } // EXPECT_EQ(static_cast(fnn_params[grid * 97 + 93]), 0); // EXPECT_EQ(static_cast(fnn_params[grid * 97 + 94]), 90); // EXPECT_EQ(static_cast(fnn_params[grid * 97 + 95]), 30); // EXPECT_EQ(static_cast(fnn_params[grid * 97 + 96]), 3); const int hh = 25 * 25; const int ih = 5 * 25; int shift = (20 * 6 + hh * 4 + ih * 4) * grid; for (int i = 0; i < 25 * 25; i++) { 
EXPECT_FLOAT_EQ(shared_lstm_params[shift + i], 0.0f); EXPECT_FLOAT_EQ(shared_lstm_params[shift + hh + i], 0.0f); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 2 * hh + i], 0.0f); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 3 * hh + i], 0.0f); } for (int i = 0; i < 8 * 25; i++) { EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + i], 0.0f); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + ih + i], 0.0f); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 2 * ih + i], 0.0f); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 3 * ih + i], 0.0f); } for (int i = 0; i < 25; i++) { EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + i], 0.0f); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 25 + i], 0.0f); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 50 + i], 0.0f); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 75 + i], 0.0f); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 100 + i], 0.0f) << "at index " << i; EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 125 + i], 0.0f) << "at index " << i; } } } TEST_F(LSTMHelperTest, UpdateModel) { std::vector vec = { 30, 3 }; LSTMHelper<> model(5, 25, vec); int grid_dim = 1; std::vector theta_vec(93); for (int i = 0; i < theta_vec.size(); i++) { theta_vec[i] = static_cast(i); } model.updateOutputModel({ 30, 3 }, theta_vec); float* weights_d = model.getWeights(); for (int i = 0; i < model.getLSTMGrdSharedSizeBytes() / sizeof(float) + 2 * model.getHiddenDim(); i++) { weights_d[i] = static_cast(i); } std::vector lstm_params(grid_dim); std::vector shared_lstm_params(grid_dim); std::vector fnn_params(grid_dim); std::vector shared_fnn_params(grid_dim); EXPECT_EQ(model.GPUMemStatus_, false); EXPECT_EQ(model.network_d_, nullptr); EXPECT_NE(model.getOutputModel(), nullptr); model.GPUSetup(); EXPECT_EQ(model.GPUMemStatus_, true); EXPECT_NE(model.network_d_, nullptr); // launch kernel launchParameterCheckTestKernel>(model, lstm_params, shared_lstm_params, 
fnn_params, shared_fnn_params, grid_dim); EXPECT_EQ(model.getOutputGrdSharedSizeBytes(), 400); for (int grid = 0; grid < grid_dim; grid++) { // ensure that the output nn matches for (int i = 0; i < theta_vec.size(); i++) { EXPECT_FLOAT_EQ(fnn_params[grid * 100 + i], theta_vec[i]) << "at grid " << grid << " at index " << i; } EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 93], 0); EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 94], 90); EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 95], 30); EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 96], 3); EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 97], 0); EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 98], 0); EXPECT_EQ(((int*)fnn_params.data())[grid * 100 + 99], 0); const int hh = 25 * 25; const int ih = 5 * 25; int shift = (25 * 6 + hh * 4 + ih * 4) * grid; for (int i = 0; i < 25 * 25; i++) { EXPECT_FLOAT_EQ(lstm_params[shift + i], weights_d[i]) << "at grid " << grid << " at index " << i; EXPECT_FLOAT_EQ(lstm_params[shift + hh + i], weights_d[hh + i]) << "at grid " << grid << " at index " << i; EXPECT_FLOAT_EQ(lstm_params[shift + 2 * hh + i], weights_d[2 * hh + i]) << "at grid " << grid << " at index " << i; EXPECT_FLOAT_EQ(lstm_params[shift + 3 * hh + i], weights_d[3 * hh + i]) << "at grid " << grid << " at index " << i; } for (int i = 0; i < 8 * 25; i++) { EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + i], weights_d[4 * hh + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + ih + i], weights_d[4 * hh + ih + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 2 * ih + i], weights_d[4 * hh + 2 * ih + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 3 * ih + i], weights_d[4 * hh + 3 * ih + i]); } for (int i = 0; i < 25; i++) { EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + i], weights_d[4 * hh + 4 * ih + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 25 + i], weights_d[4 * hh + 4 * ih + 25 + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 50 + i], weights_d[4 * hh + 4 * ih + 50 + 
i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 75 + i], weights_d[4 * hh + 4 * ih + 75 + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 100 + i], weights_d[4 * hh + 4 * ih + 100 + i]) << "at index " << i; EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 125 + i], weights_d[4 * hh + 4 * ih + 125 + i]) << "at index " << i; } } for (int grid = 0; grid < grid_dim; grid++) { // ensure that the output nn matches for (int i = 0; i < 93; i++) { EXPECT_FLOAT_EQ(shared_fnn_params[grid * 97 + i], theta_vec[i]) << "at grid " << grid << " at index " << i; } // EXPECT_EQ((int)fnn_params[grid * 97 + 93], 0); // EXPECT_EQ((int)fnn_params[grid * 97 + 94], 90); // EXPECT_EQ((int)fnn_params[grid * 97 + 95], 30); // EXPECT_EQ((int)fnn_params[grid * 97 + 96], 3); const int hh = 25 * 25; const int ih = 5 * 25; int shift = (25 * 6 + hh * 4 + ih * 4) * grid; for (int i = 0; i < 25 * 25; i++) { EXPECT_FLOAT_EQ(shared_lstm_params[shift + i], weights_d[i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + hh + i], weights_d[hh + i]) << "at grid " << grid << " at index " << i; EXPECT_FLOAT_EQ(shared_lstm_params[shift + 2 * hh + i], weights_d[2 * hh + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 3 * hh + i], weights_d[3 * hh + i]); } for (int i = 0; i < 8 * 25; i++) { EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + i], weights_d[4 * hh + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + ih + i], weights_d[4 * hh + ih + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 2 * ih + i], weights_d[4 * hh + 2 * ih + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 3 * ih + i], weights_d[4 * hh + 3 * ih + i]); } for (int i = 0; i < 25; i++) { EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + i], weights_d[4 * hh + 4 * ih + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 25 + i], weights_d[4 * hh + 4 * ih + 25 + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 50 + i], weights_d[4 * hh + 4 * ih + 50 + i]); 
EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 75 + i], weights_d[4 * hh + 4 * ih + 75 + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 100 + i], weights_d[4 * hh + 4 * ih + 100 + i]) << "at index " << i; EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 125 + i], weights_d[4 * hh + 4 * ih + 125 + i]) << "at index " << i; } } } TEST_F(LSTMHelperTest, LoadModelPathTest) { std::vector vec = { 2, 1 }; LSTMHelper<> model(1, 1, vec); model.GPUSetup(); std::string path = mppi::tests::test_lstm_lstm; if (!fileExists(path)) { std::cerr << "Could not load neural net model at path: " << path.c_str(); exit(-1); } model.loadParams(path); cnpy::npz_t input_outputs = cnpy::npz_load(path); int num_points = input_outputs.at("num_points").data()[0]; std::cout << "got T " << input_outputs.at("T").data()[0] << " dt " << input_outputs.at("dt").data()[0] << std::endl; std::cout << "num points " << num_points << std::endl; int T = std::round(input_outputs.at("T").data()[0] / input_outputs.at("dt").data()[0]); EXPECT_EQ(num_points, 3); EXPECT_EQ(T, 5); double* inputs = input_outputs.at("input").data(); double* outputs = input_outputs.at("output").data(); double* init_hidden = input_outputs.at("init/hidden").data(); double* init_cell = input_outputs.at("init/cell").data(); double* hidden = input_outputs.at("hidden").data(); double* cell = input_outputs.at("cell").data(); double tol = 1e-5; auto input = model.getZeroInputVector(); auto output = model.getZeroOutputVector(); const int hidden_dim = model.getHiddenDim(); const int input_dim = model.getInputDim(); const int output_dim = model.getOutputDim(); for (int point = 0; point < num_points; point++) { Eigen::VectorXf initial_hidden = Eigen::VectorXf(hidden_dim, 1); Eigen::VectorXf initial_cell = Eigen::VectorXf(hidden_dim, 1); for (int i = 0; i < hidden_dim; i++) { initial_hidden(i) = init_hidden[hidden_dim * point + i]; initial_cell(i) = init_cell[hidden_dim * point + i]; } 
model.updateLSTMInitialStates(initial_hidden, initial_cell); for (int t = 0; t < T; t++) { for (int i = 0; i < input_dim; i++) { input(i) = inputs[point * T * input_dim + t * input_dim + i]; } model.forward(input, output); for (int i = 0; i < output_dim; i++) { EXPECT_NEAR(output(i), outputs[point * T * output_dim + t * output_dim + i], tol) << "t: " << t << " point " << point << " at dim " << i; } for (int i = 0; i < hidden_dim; i++) { EXPECT_NEAR(model.getHiddenState()(i), hidden[point * T * hidden_dim + hidden_dim * t + i], tol) << "t: " << t << " point " << point << " at dim " << i; EXPECT_NEAR(model.getCellState()(i), cell[point * T * hidden_dim + hidden_dim * t + i], tol) << "t: " << t << " point " << point << " at dim " << i; } } } } TEST_F(LSTMHelperTest, LoadModelPathInitTest) { std::vector vec = { 2, 1 }; LSTMHelper<> model(1, 1, vec); model.GPUSetup(); std::string path = mppi::tests::test_lstm_lstm; if (!fileExists(path)) { std::cerr << "Could not load neural net model at path: " << path.c_str(); exit(-1); } cnpy::npz_t param_dict = cnpy::npz_load(path); model.loadParams("init_", param_dict, false); cnpy::npz_t input_outputs = cnpy::npz_load(path); int num_points = input_outputs.at("num_points").data()[0]; int T = std::round(input_outputs.at("tau").data()[0] / input_outputs.at("dt").data()[0]) + 1; double* init_inputs = input_outputs.at("init_input").data(); double* init_hidden = input_outputs.at("init/hidden").data(); double* init_cell = input_outputs.at("init/cell").data(); // TODO not the right config to load this data essentially // double* init_step_hidden = input_outputs.at("init_step_hidden").data(); // double* init_step_cell = input_outputs.at("init_step_cell").data(); EXPECT_EQ(num_points, 3); EXPECT_EQ(T, 6); double tol = 1e-5; auto input = model.getZeroInputVector(); auto output = model.getZeroOutputVector(); const int hidden_dim = model.getHiddenDim(); const int input_dim = model.getInputDim(); const int output_dim = model.getOutputDim(); 
EXPECT_EQ(hidden_dim, 60); EXPECT_EQ(input_dim, 3); EXPECT_EQ(output_dim, 50); for (int point = 0; point < num_points; point++) { // run the init network and ensure initial hidden/cell match model.resetHiddenCellCPU(); for (int t = 0; t < T; t++) { for (int i = 0; i < input_dim; i++) { input(i) = init_inputs[point * T * input_dim + t * input_dim + i]; } model.forward(input, output); // for (int i = 0; i < 60; i++) // { // EXPECT_NEAR(model.getHiddenState()(i), init_step_hidden[60 * point * T + t * 60 + i], tol) // << "at t " << t << " dim " << i; // EXPECT_NEAR(model.getCellState()(i), init_step_cell[60 * point * T + t * 60 + i], tol) // << "at t " << t << " dim " << i; // } } for (int i = 0; i < 25; i++) { EXPECT_NEAR(output(i), init_hidden[25 * point + i], tol) << "incorrect hidden at point " << point << " index " << i; EXPECT_NEAR(output(i + 25), init_cell[25 * point + i], tol) << "incorrect cell at point " << point << " index " << i; } } } TEST_F(LSTMHelperTest, LoadModelNPZTest) { std::vector vec = { 2, 1 }; LSTMHelper<> model(1, 1, vec); model.GPUSetup(); std::string path = mppi::tests::test_lstm_lstm; if (!fileExists(path)) { std::cerr << "Could not load neural net model at path: " << path.c_str(); exit(-1); } cnpy::npz_t param_dict = cnpy::npz_load(path); model.loadParams(param_dict); cnpy::npz_t input_outputs = cnpy::npz_load(path); int num_points = input_outputs.at("num_points").data()[0]; int T = std::round(input_outputs.at("T").data()[0] / input_outputs.at("dt").data()[0]); double* inputs = input_outputs.at("input").data(); double* outputs = input_outputs.at("output").data(); double* init_hidden = input_outputs.at("init/hidden").data(); double* init_cell = input_outputs.at("init/cell").data(); double* hidden = input_outputs.at("hidden").data(); double* cell = input_outputs.at("cell").data(); double tol = 1e-5; auto input = model.getZeroInputVector(); auto output = model.getZeroOutputVector(); const int hidden_dim = model.getHiddenDim(); const int 
input_dim = model.getInputDim(); const int output_dim = model.getOutputDim(); for (int point = 0; point < num_points; point++) { Eigen::VectorXf initial_hidden = Eigen::VectorXf(hidden_dim, 1); Eigen::VectorXf initial_cell = Eigen::VectorXf(hidden_dim, 1); for (int i = 0; i < hidden_dim; i++) { initial_hidden(i) = init_hidden[hidden_dim * point + i]; initial_cell(i) = init_cell[hidden_dim * point + i]; } model.updateLSTMInitialStates(initial_hidden, initial_cell); for (int t = 0; t < T; t++) { for (int i = 0; i < 3; i++) { input(i) = inputs[point * T * 3 + t * 3 + i]; } model.forward(input, output); for (int i = 0; i < 2; i++) { EXPECT_NEAR(output[i], outputs[point * T * 2 + t * 2 + i], tol) << "point " << point << " at dim " << i; } for (int i = 0; i < 25; i++) { EXPECT_NEAR(model.getHiddenState()(i), hidden[point * T * 25 + 25 * t + i], tol) << "point " << point << " at dim " << i; EXPECT_NEAR(model.getCellState()(i), cell[point * T * 25 + 25 * t + i], tol) << "point " << point << " at dim " << i; } } } } TEST_F(LSTMHelperTest, forwardCPU) { std::vector vec = { 28, 3 }; LSTMHelper<> model(8, 20, vec); std::vector theta_vec(87); for (int i = 0; i < theta_vec.size(); i++) { theta_vec[i] = 1.0; } model.updateOutputModel({ 28, 3 }, theta_vec); float* weights_d = model.getWeights(); for (int i = 0; i < model.getLSTMGrdSharedSizeBytes() / sizeof(float) + 2 * model.getHiddenDim(); i++) { weights_d[i] = 1.0f; } model.resetHiddenCellCPU(); auto input = model.getZeroInputVector(); auto output = model.getZeroOutputVector(); input.setOnes(); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 28.28055); EXPECT_FLOAT_EQ(output[1], 28.28055); EXPECT_FLOAT_EQ(output[2], 28.28055); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 28.901096); EXPECT_FLOAT_EQ(output[1], 28.901096); EXPECT_FLOAT_EQ(output[2], 28.901096); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 28.986588); EXPECT_FLOAT_EQ(output[1], 28.986588); EXPECT_FLOAT_EQ(output[2], 28.986588); 
model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 28.998184); EXPECT_FLOAT_EQ(output[1], 28.998184); EXPECT_FLOAT_EQ(output[2], 28.998184); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 28.999756); EXPECT_FLOAT_EQ(output[1], 28.999756); EXPECT_FLOAT_EQ(output[2], 28.999756); } TEST_F(LSTMHelperTest, forwardGPU) { const int num_rollouts = 1000; std::vector vec = { 28, 3 }; LSTMHelper<> model(8, 20, vec); std::vector theta(87); std::fill(theta.begin(), theta.end(), 1); model.updateOutputModel({ 28, 3 }, theta); float* weights_d = model.getWeights(); for (int i = 0; i < model.getLSTMGrdSharedSizeBytes() / sizeof(float) + 2 * model.getHiddenDim(); i++) { weights_d[i] = 1.0f; } model.resetHiddenCellCPU(); model.GPUSetup(); Eigen::Matrix inputs; inputs = Eigen::Matrix::Ones(); Eigen::VectorXf output = model.getZeroOutputVector(); Eigen::VectorXf input = model.getZeroInputVector(); std::array true_vals = { 28.28055, 28.901096, 28.986588, 28.998184, 28.999756 }; std::vector> input_arr(num_rollouts); std::vector> output_arr(num_rollouts); for (int y_dim = 1; y_dim < 16; y_dim++) { for (int state_index = 0; state_index < num_rollouts; state_index++) { for (int dim = 0; dim < input_arr[0].size(); dim++) { input_arr[state_index][dim] = inputs.col(state_index)(dim); } } for (int step = 1; step < 6; step++) { launchForwardTestKernel, 8, 3, 32>(model, input_arr, output_arr, y_dim, step); for (int point = 0; point < num_rollouts; point++) { model.resetHiddenCellCPU(); input = inputs.col(point); for (int cpu_step = 0; cpu_step < step; cpu_step++) { model.forward(input, output); } for (int dim = 0; dim < 8; dim++) { EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; } for (int dim = 0; dim < 3; dim++) { EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(output_arr[point][dim])); EXPECT_FLOAT_EQ(output(dim), true_vals[step - 1]) << "at 
dim " << dim << " step " << step; } } } } } TEST_F(LSTMHelperTest, forwardGPUCompareNoShared) { const int num_rollouts = 1000; std::vector vec = { 28, 3 }; LSTMHelper model(8, 20, vec); std::vector theta_vec(87); for (int i = 0; i < theta_vec.size(); i++) { theta_vec[i] = distribution(generator); } model.updateOutputModel({ 28, 3 }, theta_vec); float* weights_d = model.getWeights(); for (int i = 0; i < model.getLSTMGrdSharedSizeBytes() / sizeof(float) + 2 * model.getHiddenDim(); i++) { weights_d[i] = distribution(generator); } model.resetHiddenCellCPU(); model.GPUSetup(); Eigen::Matrix inputs; inputs = Eigen::Matrix::Random(); Eigen::VectorXf output = model.getZeroOutputVector(); Eigen::VectorXf input = model.getZeroInputVector(); std::vector> input_arr(num_rollouts); std::vector> output_arr(num_rollouts); for (int y_dim = 1; y_dim < 16; y_dim++) { for (int state_index = 0; state_index < num_rollouts; state_index++) { for (int dim = 0; dim < input_arr[0].size(); dim++) { input_arr[state_index][dim] = inputs.col(state_index)(dim); } } for (int step = 1; step < 6; step++) { launchForwardTestKernel, 8, 3, 32>(model, input_arr, output_arr, y_dim, step); for (int point = 0; point < num_rollouts; point++) { model.resetHiddenCellCPU(); input = inputs.col(point); for (int cpu_step = 0; cpu_step < step; cpu_step++) { model.forward(input, output); } for (int dim = 0; dim < 8; dim++) { EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; } for (int dim = 0; dim < 3; dim++) { EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(output_arr[point][dim])); } } } } } TEST_F(LSTMHelperTest, forwardGPUCompareShared) { const int num_rollouts = 1000; std::vector vec = { 28, 3 }; LSTMHelper<> model(8, 20, vec); std::vector theta_vec(87); for (int i = 0; i < theta_vec.size(); i++) { theta_vec[i] = distribution(generator); } model.updateOutputModel({ 28, 3 }, 
theta_vec); float* weights_d = model.getWeights(); for (int i = 0; i < model.getLSTMGrdSharedSizeBytes() / sizeof(float) + 2 * model.getHiddenDim(); i++) { weights_d[i] = distribution(generator); } model.resetHiddenCellCPU(); model.GPUSetup(); Eigen::Matrix inputs; inputs = Eigen::Matrix::Random(); Eigen::VectorXf output = model.getZeroOutputVector(); Eigen::VectorXf input = model.getZeroInputVector(); std::vector> input_arr(num_rollouts); std::vector> output_arr(num_rollouts); for (int y_dim = 1; y_dim < 16; y_dim++) { for (int state_index = 0; state_index < num_rollouts; state_index++) { for (int dim = 0; dim < input_arr[0].size(); dim++) { input_arr[state_index][dim] = inputs.col(state_index)(dim); } } for (int step = 1; step < 6; step++) { launchForwardTestKernel, 8, 3, 32>(model, input_arr, output_arr, y_dim, step); for (int point = 0; point < num_rollouts; point++) { model.resetHiddenCellCPU(); input = inputs.col(point); for (int cpu_step = 0; cpu_step < step; cpu_step++) { model.forward(input, output); } for (int dim = 0; dim < 8; dim++) { EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; } for (int dim = 0; dim < 3; dim++) { EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(output_arr[point][dim])); } } } } } TEST_F(LSTMHelperTest, forwardGPUComparePreloadNoShared) { const int num_rollouts = 1000; std::vector vec = { 28, 3 }; LSTMHelper model(8, 20, vec); std::vector theta_vec(87); for (int i = 0; i < theta_vec.size(); i++) { theta_vec[i] = distribution(generator); } model.updateOutputModel({ 28, 3 }, theta_vec); float* weights_d = model.getWeights(); for (int i = 0; i < model.getLSTMGrdSharedSizeBytes() / sizeof(float) + 2 * model.getHiddenDim(); i++) { weights_d[i] = distribution(generator); } model.resetHiddenCellCPU(); model.GPUSetup(); Eigen::Matrix inputs; inputs = Eigen::Matrix::Random(); Eigen::VectorXf output = 
model.getZeroOutputVector(); Eigen::VectorXf input = model.getZeroInputVector(); std::vector> input_arr(num_rollouts); std::vector> output_arr(num_rollouts); for (int y_dim = 1; y_dim < 16; y_dim++) { for (int state_index = 0; state_index < num_rollouts; state_index++) { for (int dim = 0; dim < input_arr[0].size(); dim++) { input_arr[state_index][dim] = inputs.col(state_index)(dim); } } for (int step = 1; step < 6; step++) { launchForwardTestKernelPreload, 8, 3, 32>(model, input_arr, output_arr, y_dim, step); for (int point = 0; point < num_rollouts; point++) { model.resetHiddenCellCPU(); input = inputs.col(point); for (int cpu_step = 0; cpu_step < step; cpu_step++) { model.forward(input, output); } for (int dim = 0; dim < 8; dim++) { EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; } for (int dim = 0; dim < 3; dim++) { EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(output_arr[point][dim])); } } } } } TEST_F(LSTMHelperTest, forwardGPUComparePreloadShared) { const int num_rollouts = 1000; std::vector vec = { 28, 3 }; LSTMHelper<> model(8, 20, vec); std::vector theta_vec(87); for (int i = 0; i < theta_vec.size(); i++) { theta_vec[i] = distribution(generator); } model.updateOutputModel({ 28, 3 }, theta_vec); float* weights_d = model.getWeights(); for (int i = 0; i < model.getLSTMGrdSharedSizeBytes() / sizeof(float) + 2 * model.getHiddenDim(); i++) { weights_d[i] = distribution(generator); } model.resetHiddenCellCPU(); model.GPUSetup(); Eigen::Matrix inputs; inputs = Eigen::Matrix::Random(); Eigen::VectorXf output = model.getZeroOutputVector(); Eigen::VectorXf input = model.getZeroInputVector(); std::vector> input_arr(num_rollouts); std::vector> output_arr(num_rollouts); for (int y_dim = 1; y_dim < 16; y_dim++) { for (int state_index = 0; state_index < num_rollouts; state_index++) { for (int dim = 0; dim < input_arr[0].size(); dim++) 
{ input_arr[state_index][dim] = inputs.col(state_index)(dim); } } for (int step = 1; step < 6; step++) { launchForwardTestKernelPreload, 8, 3, 32>(model, input_arr, output_arr, y_dim, step); for (int point = 0; point < num_rollouts; point++) { model.resetHiddenCellCPU(); input = inputs.col(point); for (int cpu_step = 0; cpu_step < step; cpu_step++) { model.forward(input, output); } for (int dim = 0; dim < 8; dim++) { EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; } for (int dim = 0; dim < 3; dim++) { EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim; EXPECT_TRUE(isfinite(output_arr[point][dim])); } } } } } TEST_F(LSTMHelperTest, forwardGPUSpeedTest) { const int num_rollouts = 15000; std::vector vec = { 28, 3 }; LSTMHelper<> shared_model(8, 20, vec); LSTMHelper global_model(8, 20, vec); std::vector theta_vec(87); for (int i = 0; i < theta_vec.size(); i++) { theta_vec[i] = distribution(generator); } shared_model.updateOutputModel({ 28, 3 }, theta_vec); global_model.updateOutputModel({ 28, 3 }, theta_vec); float* shared_weights_d = shared_model.getWeights(); float* global_weights_d = global_model.getWeights(); for (int i = 0; i < shared_model.getLSTMGrdSharedSizeBytes() / sizeof(float) + 2 * shared_model.getHiddenDim(); i++) { shared_weights_d[i] = distribution(generator); global_weights_d[i] = shared_weights_d[i]; } shared_model.resetHiddenCellCPU(); shared_model.GPUSetup(); global_model.resetHiddenCellCPU(); global_model.GPUSetup(); Eigen::Matrix inputs; inputs = Eigen::Matrix::Random(); Eigen::VectorXf output = shared_model.getZeroOutputVector(); Eigen::VectorXf input = shared_model.getZeroInputVector(); std::vector> input_arr(num_rollouts); std::vector> output_arr(num_rollouts); for (int state_index = 0; state_index < num_rollouts; state_index++) { for (int dim = 0; dim < input_arr[0].size(); dim++) { input_arr[state_index][dim] = 
inputs.col(state_index % 1000)(dim); } } for (int y_dim = 1; y_dim < 16; y_dim++) { // TODO the way of doing this timing is bad auto shared_start = std::chrono::steady_clock::now(); launchForwardTestKernel, 8, 3, 32>(shared_model, input_arr, output_arr, y_dim, 500); auto shared_stop = std::chrono::steady_clock::now(); auto global_start = std::chrono::steady_clock::now(); launchForwardTestKernel, 8, 3, 32>(global_model, input_arr, output_arr, y_dim, 500); auto global_stop = std::chrono::steady_clock::now(); float shared_time_ms = mppi::math::timeDiffms(shared_stop, shared_start); float global_time_ms = mppi::math::timeDiffms(global_stop, global_start); std::cout << "for y dim " << y_dim << " got shared: " << shared_time_ms << std::endl; std::cout << "for y dim " << y_dim << " got global: " << global_time_ms << std::endl; } } // TEST_F(LSTMHelperTest, TestComputeGradComputationFinite) // { // LSTMHelper> model; // std::vector theta(1412); // for (int i = 0; i < 1412; i++) // { // theta[i] = distribution(generator); // } // model.updateModel({ 6, 32, 32, 4 }, theta); // // LSTMHelper>::dfdx numeric_jac; // LSTMHelper>::dfdx analytic_jac; // // for (int i = 0; i < 1000; i++) // { // LSTMHelper>::input_array input; // input = LSTMHelper>::input_array::Random(); // // model.computeGrad(input, analytic_jac); // EXPECT_TRUE(analytic_jac.allFinite()); // } // } // // TEST_F(LSTMHelperTest, TestComputeGradComputationCompare) // { // GTEST_SKIP(); // LSTMHelper> model; // std::vector theta(1412); // for (int i = 0; i < 1412; i++) // { // theta[i] = distribution(generator); // } // model.updateModel({ 6, 32, 32, 4 }, theta); // // LSTMHelper>::dfdx numeric_jac; // LSTMHelper>::dfdx analytic_jac; // // LSTMHelper>::input_array input; // input << 1, 2, 3, 4, 5, 6; // // model.computeGrad(input, analytic_jac); // // // numeric_jac = num_diff.df(input, numeric_jac); // // ASSERT_LT((numeric_jac - analytic_jac).norm(), 1e-3) << "Numeric Jacobian\n" // << numeric_jac << "\nAnalytic 
Jacobian\n" // << analytic_jac; // } ================================================ FILE: tests/nn_helpers/lstm_lstm_helper_test.cu ================================================ // #include #include #include #include #include #include // Auto-generated header file #include #include #include "mppi/ddp/ddp_dynamics.h" class LSTMLSTMHelperTest : public testing::Test { protected: void SetUp() override { generator = std::default_random_engine(7.0); distribution = std::normal_distribution(0.0, 1.0); } void TearDown() override { } std::default_random_engine generator; std::normal_distribution distribution; std::vector init_output_layers = { 68, 100, 20 }; std::vector output_layers = { 18, 2 }; const int init_hidden_dim = 60; const int hidden_dim = 10; const int init_input_dim = 8; const int input_dim = 8; const int output_dim = 2; const int init_len = 6; }; // template class FNNParams<18, 3>; // template class FNNParams<68, 100, 20>; // template class LSTMParams<8, 10>; // template class LSTMParams<8, 60>; // typedef FNNParams<18, 3> FNN_PARAMS; // typedef FNNParams<68, 100, 20> FNN_INIT_PARAMS; // typedef LSTMHelper, FNN_PARAMS> LSTM; // typedef LSTMHelper, FNN_INIT_PARAMS> INIT_LSTM; // // template class LSTMLSTMHelper; // // typedef LSTMLSTMHelper T; TEST_F(LSTMLSTMHelperTest, BindStreamAndConstructor) { cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); LSTMLSTMHelper<> helper(init_input_dim, init_hidden_dim, init_output_layers, input_dim, hidden_dim, output_layers, init_len, stream); EXPECT_EQ(helper.getLSTMModel()->stream_, stream); EXPECT_NE(helper.getLSTMModel(), nullptr); EXPECT_EQ(helper.getLSTMModel()->getInputDim(), input_dim); EXPECT_EQ(helper.getLSTMModel()->getOutputDim(), output_dim); EXPECT_EQ(helper.getLSTMModel()->getHiddenDim(), hidden_dim); EXPECT_EQ(helper.getInitModel()->getInputDim(), init_input_dim); EXPECT_EQ(helper.getInitModel()->getOutputDim(), 2 * hidden_dim); EXPECT_EQ(helper.getInitModel()->getHiddenDim(), init_hidden_dim); 
auto init_lstm = helper.getInitModel(); EXPECT_NE(init_lstm, nullptr); auto hidden = init_lstm->getHiddenState(); auto cell = init_lstm->getCellState(); for (int i = 0; i < init_lstm->getHiddenDim(); i++) { EXPECT_FLOAT_EQ(hidden(i), 0.0f); EXPECT_FLOAT_EQ(cell(i), 0.0f); } } TEST_F(LSTMLSTMHelperTest, BindStreamAndConstructorConfigLSTM) { cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); LSTMConfig init_config, pred_config; init_config.input_dim = init_input_dim; init_config.hidden_dim = init_hidden_dim; init_config.output_layers = init_output_layers; pred_config.input_dim = input_dim; pred_config.hidden_dim = hidden_dim; pred_config.output_layers = output_layers; LSTMLSTMHelper<> helper(init_config, pred_config, init_len, stream); EXPECT_EQ(helper.getLSTMModel()->stream_, stream); EXPECT_NE(helper.getLSTMModel(), nullptr); EXPECT_EQ(helper.getLSTMModel()->getInputDim(), input_dim); EXPECT_EQ(helper.getLSTMModel()->getOutputDim(), output_dim); EXPECT_EQ(helper.getLSTMModel()->getHiddenDim(), hidden_dim); EXPECT_EQ(helper.getInitModel()->getInputDim(), init_input_dim); EXPECT_EQ(helper.getInitModel()->getOutputDim(), 2 * hidden_dim); EXPECT_EQ(helper.getInitModel()->getHiddenDim(), init_hidden_dim); auto init_lstm = helper.getInitModel(); EXPECT_NE(init_lstm, nullptr); auto hidden = init_lstm->getHiddenState(); auto cell = init_lstm->getCellState(); for (int i = 0; i < init_lstm->getHiddenDim(); i++) { EXPECT_FLOAT_EQ(hidden(i), 0.0f); EXPECT_FLOAT_EQ(cell(i), 0.0f); } } TEST_F(LSTMLSTMHelperTest, BindStreamAndConstructorConfigLSTMLSTM) { cudaStream_t stream; HANDLE_ERROR(cudaStreamCreate(&stream)); LSTMLSTMConfig config; config.init_config.input_dim = init_input_dim; config.init_config.hidden_dim = init_hidden_dim; config.init_config.output_layers = init_output_layers; config.pred_config.input_dim = input_dim; config.pred_config.hidden_dim = hidden_dim; config.pred_config.output_layers = output_layers; config.init_len = init_len; LSTMLSTMHelper<> 
helper(config, stream); EXPECT_EQ(helper.getLSTMModel()->stream_, stream); EXPECT_NE(helper.getLSTMModel(), nullptr); EXPECT_EQ(helper.getLSTMModel()->getInputDim(), input_dim); EXPECT_EQ(helper.getLSTMModel()->getOutputDim(), output_dim); EXPECT_EQ(helper.getLSTMModel()->getHiddenDim(), hidden_dim); EXPECT_EQ(helper.getInitModel()->getInputDim(), init_input_dim); EXPECT_EQ(helper.getInitModel()->getOutputDim(), 2 * hidden_dim); EXPECT_EQ(helper.getInitModel()->getHiddenDim(), init_hidden_dim); auto init_lstm = helper.getInitModel(); EXPECT_NE(init_lstm, nullptr); auto hidden = init_lstm->getHiddenState(); auto cell = init_lstm->getCellState(); for (int i = 0; i < init_lstm->getHiddenDim(); i++) { EXPECT_FLOAT_EQ(hidden(i), 0.0f); EXPECT_FLOAT_EQ(cell(i), 0.0f); } } TEST_F(LSTMLSTMHelperTest, initializeLSTMLSTMTest) { LSTMLSTMHelper<> helper(init_input_dim, init_hidden_dim, init_output_layers, input_dim, hidden_dim, output_layers, init_len); Eigen::Matrix buffer; buffer.setOnes(); helper.getInitModel()->setAllValues(1.0); helper.initializeLSTM(buffer); auto lstm = helper.getLSTMModel(); auto hidden = lstm->getHiddenState(); auto cell = lstm->getHiddenState(); for (int i = 0; i < hidden_dim; i++) { EXPECT_FLOAT_EQ(hidden(i), 101.0f); EXPECT_FLOAT_EQ(cell(i), 101.0f); } } TEST_F(LSTMLSTMHelperTest, GPUSetupAndParamsCheck) { LSTMLSTMHelper<> model(init_input_dim, init_hidden_dim, init_output_layers, input_dim, hidden_dim, output_layers, init_len); std::vector theta_vec(model.getLSTMModel()->getOutputModel()->getNumParams()); for (int i = 0; i < theta_vec.size(); i++) { theta_vec[i] = distribution(generator); } std::vector lstm_vec(model.getLSTMModel()->getNumParams()); for (int i = 0; i < lstm_vec.size(); i++) { lstm_vec[i] = distribution(generator); } model.getLSTMModel()->setAllValues(lstm_vec, theta_vec); int grid_dim = 5; std::vector lstm_params(grid_dim); std::vector shared_lstm_params(grid_dim); std::vector fnn_params(grid_dim); std::vector 
shared_fnn_params(grid_dim); EXPECT_EQ(model.getLSTMModel()->GPUMemStatus_, false); EXPECT_EQ(model.getLSTMModel()->network_d_, nullptr); EXPECT_NE(model.getLSTMModel()->getOutputModel(), nullptr); EXPECT_EQ(model.getInitModel()->GPUMemStatus_, false); EXPECT_EQ(model.getInitModel()->network_d_, nullptr); EXPECT_NE(model.getInitModel()->getOutputModel(), nullptr); model.GPUSetup(); EXPECT_EQ(model.getLSTMModel()->GPUMemStatus_, true); EXPECT_NE(model.getLSTMModel()->network_d_, nullptr); EXPECT_NE(model.getLSTMModel()->getOutputModel(), nullptr); EXPECT_EQ(model.getInitModel()->GPUMemStatus_, false); EXPECT_EQ(model.getInitModel()->network_d_, nullptr); EXPECT_NE(model.getInitModel()->getOutputModel(), nullptr); // launch kernel launchParameterCheckTestKernel>(*model.getLSTMModel(), lstm_params, shared_lstm_params, fnn_params, shared_fnn_params, grid_dim); EXPECT_EQ(model.getLSTMModel()->getOutputModel()->getGrdSharedSizeBytes() / sizeof(float), 44); EXPECT_EQ(model.getLSTMModel()->getOutputModel()->getBlkSharedSizeBytes() / sizeof(float), 36); EXPECT_EQ(model.getLSTMModel()->getOutputModel()->getNumParams(), 38); int grd_size = model.getLSTMModel()->getOutputModel()->getGrdSharedSizeBytes() / sizeof(float); for (int grid = 0; grid < grid_dim; grid++) { // ensure that the output nn matches for (int i = 0; i < theta_vec.size(); i++) { EXPECT_FLOAT_EQ(fnn_params[grid * grd_size + i], theta_vec[i]) << "at grid " << grid << " at index " << i; } const int h = model.getLSTMModel()->getHiddenDim(); const int hh = h * h; const int ih = model.getLSTMModel()->getInputDim() * model.getLSTMModel()->getHiddenDim(); int shift = (h * 6 + hh * 4 + ih * 4) * grid; for (int i = 0; i < hh; i++) { EXPECT_FLOAT_EQ(lstm_params[shift + i], lstm_vec[i]) << "at grid " << grid << " at index " << i; EXPECT_FLOAT_EQ(lstm_params[shift + hh + i], lstm_vec[hh + i]) << "at grid " << grid << " at index " << i; EXPECT_FLOAT_EQ(lstm_params[shift + 2 * hh + i], lstm_vec[2 * hh + i]) << "at grid " << 
grid << " at index " << i; EXPECT_FLOAT_EQ(lstm_params[shift + 3 * hh + i], lstm_vec[3 * hh + i]) << "at grid " << grid << " at index " << i; } for (int i = 0; i < ih; i++) { EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + i], lstm_vec[4 * hh + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + ih + i], lstm_vec[4 * hh + ih + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 2 * ih + i], lstm_vec[4 * hh + 2 * ih + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 3 * ih + i], lstm_vec[4 * hh + 3 * ih + i]); } for (int i = 0; i < h; i++) { EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + i], lstm_vec[4 * hh + 4 * ih + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + h + i], lstm_vec[4 * hh + 4 * ih + h + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 2 * h + i], lstm_vec[4 * hh + 4 * ih + 2 * h + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 3 * h + i], lstm_vec[4 * hh + 4 * ih + 3 * h + i]); EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 4 * h + i], lstm_vec[4 * hh + 4 * ih + 4 * h + i]) << "at index " << i; EXPECT_FLOAT_EQ(lstm_params[shift + 4 * hh + 4 * ih + 5 * h + i], lstm_vec[4 * hh + 4 * ih + 5 * h + i]) << "at index " << i; } } for (int grid = 0; grid < grid_dim; grid++) { // ensure that the output nn matches for (int i = 0; i < theta_vec.size(); i++) { EXPECT_FLOAT_EQ(shared_fnn_params[grid * grd_size + i], theta_vec[i]) << "at grid " << grid << " at index " << i; } const int h = model.getLSTMModel()->getHiddenDim(); const int hh = h * h; const int ih = model.getLSTMModel()->getInputDim() * model.getLSTMModel()->getHiddenDim(); int shift = (h * 6 + hh * 4 + ih * 4) * grid; for (int i = 0; i < hh; i++) { EXPECT_FLOAT_EQ(shared_lstm_params[shift + i], lstm_vec[i]) << "at grid " << grid << " at index " << i; EXPECT_FLOAT_EQ(shared_lstm_params[shift + hh + i], lstm_vec[hh + i]) << "at grid " << grid << " at index " << i; EXPECT_FLOAT_EQ(shared_lstm_params[shift + 2 * hh + i], lstm_vec[2 * hh + i]) << "at grid " << 
grid << " at index " << i; EXPECT_FLOAT_EQ(shared_lstm_params[shift + 3 * hh + i], lstm_vec[3 * hh + i]) << "at grid " << grid << " at index " << i; } for (int i = 0; i < ih; i++) { EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + i], lstm_vec[4 * hh + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + ih + i], lstm_vec[4 * hh + ih + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 2 * ih + i], lstm_vec[4 * hh + 2 * ih + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 3 * ih + i], lstm_vec[4 * hh + 3 * ih + i]); } for (int i = 0; i < h; i++) { EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + i], lstm_vec[4 * hh + 4 * ih + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + h + i], lstm_vec[4 * hh + 4 * ih + h + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 2 * h + i], lstm_vec[4 * hh + 4 * ih + 2 * h + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 3 * h + i], lstm_vec[4 * hh + 4 * ih + 3 * h + i]); EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 4 * h + i], lstm_vec[4 * hh + 4 * ih + 4 * h + i]) << "at index " << i; EXPECT_FLOAT_EQ(shared_lstm_params[shift + 4 * hh + 4 * ih + 5 * h + i], lstm_vec[4 * hh + 4 * ih + 5 * h + i]) << "at index " << i; } } } TEST_F(LSTMLSTMHelperTest, LoadModelPathTest) { std::string path = mppi::tests::test_lstm_lstm; if (!fileExists(path)) { std::cerr << "Could not load neural net model at path: " << path.c_str(); exit(-1); } LSTMLSTMHelper<> model(path); EXPECT_EQ(model.getLSTMModel()->getInputDim(), 3); EXPECT_EQ(model.getLSTMModel()->getHiddenDim(), 25); EXPECT_EQ(model.getLSTMModel()->getOutputDim(), 2); EXPECT_EQ(model.getLSTMModel()->getOutputModel()->getNetStructurePtr()[0], 28); EXPECT_EQ(model.getLSTMModel()->getOutputModel()->getNetStructurePtr()[1], 30); EXPECT_EQ(model.getLSTMModel()->getOutputModel()->getNetStructurePtr()[2], 30); EXPECT_EQ(model.getLSTMModel()->getOutputModel()->getNetStructurePtr()[3], 2); 
EXPECT_EQ(model.getInitModel()->getInputDim(), 3); EXPECT_EQ(model.getInitModel()->getHiddenDim(), 60); EXPECT_EQ(model.getInitModel()->getOutputDim(), 50); EXPECT_EQ(model.getInitModel()->getOutputModel()->getNetStructurePtr()[0], 63); EXPECT_EQ(model.getInitModel()->getOutputModel()->getNetStructurePtr()[1], 15); EXPECT_EQ(model.getInitModel()->getOutputModel()->getNetStructurePtr()[2], 15); EXPECT_EQ(model.getInitModel()->getOutputModel()->getNetStructurePtr()[3], 50); EXPECT_EQ(model.getInitLen(), 6); assert(model.getInitLen() == 6); cnpy::npz_t input_outputs = cnpy::npz_load(path); double* inputs = input_outputs.at("input").data(); double* outputs = input_outputs.at("output").data(); double* init_inputs = input_outputs.at("init_input").data(); double* init_hidden = input_outputs.at("init/hidden").data(); double* init_cell = input_outputs.at("init/cell").data(); double* hidden = input_outputs.at("hidden").data(); double* cell = input_outputs.at("cell").data(); const int hidden_dim = model.getLSTMModel()->getHiddenDim(); const int input_dim = model.getLSTMModel()->getInputDim(); const int output_dim = model.getLSTMModel()->getOutputDim(); const int init_hidden_dim = model.getInitModel()->getHiddenDim(); const int init_input_dim = model.getInitModel()->getInputDim(); // TOOD figure out the number of points from the input int init_len = model.getInitLen(); int num_points = input_outputs.at("num_points").data()[0]; int T = input_outputs.at("output").shape[0] / (num_points * output_dim); EXPECT_EQ(num_points, 3); EXPECT_EQ(input_dim, init_input_dim); double tol = 1e-5; auto input = model.getLSTMModel()->getZeroInputVector(); auto output = model.getLSTMModel()->getZeroOutputVector(); for (int point = 0; point < num_points; point++) { // run the init network and ensure initial hidden/cell match Eigen::MatrixXf buffer(init_input_dim, init_len); model.resetInitHiddenCPU(); for (int t = 0; t < init_len; t++) { for (int i = 0; i < init_input_dim; i++) { buffer(i, t) = 
init_inputs[point * init_len * init_input_dim + t * init_input_dim + i]; } } model.initializeLSTM(buffer); for (int i = 0; i < hidden_dim; i++) { EXPECT_NEAR(model.getLSTMModel()->getHiddenState()(i), init_hidden[hidden_dim * point + i], tol) << "at point " << point << " index " << i; EXPECT_NEAR(model.getLSTMModel()->getCellState()(i), init_cell[hidden_dim * point + i], tol); } for (int t = 0; t < T; t++) { for (int i = 0; i < input_dim; i++) { input(i) = inputs[point * T * input_dim + t * input_dim + i]; } model.forward(input, output); for (int i = 0; i < output_dim; i++) { EXPECT_NEAR(output[i], outputs[point * T * output_dim + t * output_dim + i], tol) << "point " << point << " T " << t << " at dim " << i; } for (int i = 0; i < hidden_dim; i++) { EXPECT_NEAR(model.getLSTMModel()->getHiddenState()(i), hidden[point * T * hidden_dim + hidden_dim * t + i], tol) << "point " << point << " at dim " << i; EXPECT_NEAR(model.getLSTMModel()->getCellState()(i), cell[point * T * hidden_dim + hidden_dim * t + i], tol) << "point " << point << " at dim " << i; } } } } TEST_F(LSTMLSTMHelperTest, LoadModelPathTestLongBuffer) { std::string path = mppi::tests::test_lstm_lstm; if (!fileExists(path)) { std::cerr << "Could not load neural net model at path: " << path.c_str(); exit(-1); } LSTMLSTMHelper<> model(path); EXPECT_EQ(model.getLSTMModel()->getInputDim(), 3); EXPECT_EQ(model.getLSTMModel()->getHiddenDim(), 25); EXPECT_EQ(model.getLSTMModel()->getOutputDim(), 2); EXPECT_EQ(model.getLSTMModel()->getOutputModel()->getNetStructurePtr()[0], 28); EXPECT_EQ(model.getLSTMModel()->getOutputModel()->getNetStructurePtr()[1], 30); EXPECT_EQ(model.getLSTMModel()->getOutputModel()->getNetStructurePtr()[2], 30); EXPECT_EQ(model.getLSTMModel()->getOutputModel()->getNetStructurePtr()[3], 2); EXPECT_EQ(model.getInitModel()->getInputDim(), 3); EXPECT_EQ(model.getInitModel()->getHiddenDim(), 60); EXPECT_EQ(model.getInitModel()->getOutputDim(), 50); 
EXPECT_EQ(model.getInitModel()->getOutputModel()->getNetStructurePtr()[0], 63); EXPECT_EQ(model.getInitModel()->getOutputModel()->getNetStructurePtr()[1], 15); EXPECT_EQ(model.getInitModel()->getOutputModel()->getNetStructurePtr()[2], 15); EXPECT_EQ(model.getInitModel()->getOutputModel()->getNetStructurePtr()[3], 50); cnpy::npz_t input_outputs = cnpy::npz_load(path); double* inputs = input_outputs.at("input").data(); double* outputs = input_outputs.at("output").data(); double* init_inputs = input_outputs.at("init_input").data(); double* init_hidden = input_outputs.at("init/hidden").data(); double* init_cell = input_outputs.at("init/cell").data(); double* hidden = input_outputs.at("hidden").data(); double* cell = input_outputs.at("cell").data(); const int hidden_dim = model.getLSTMModel()->getHiddenDim(); const int input_dim = model.getLSTMModel()->getInputDim(); const int output_dim = model.getLSTMModel()->getOutputDim(); const int init_hidden_dim = model.getInitModel()->getHiddenDim(); const int init_input_dim = model.getInitModel()->getInputDim(); // TOOD figure out the number of points from the input const int init_len = model.getInitLen(); EXPECT_EQ(model.getInitLen(), 6); int num_points = input_outputs.at("num_points").data()[0]; int T = input_outputs.at("output").shape[0] / (num_points * output_dim); double tol = 1e-5; auto input = model.getLSTMModel()->getZeroInputVector(); auto output = model.getLSTMModel()->getZeroOutputVector(); for (int point = 0; point < num_points; point++) { // run the init network and ensure initial hidden/cell match Eigen::MatrixXf buffer(init_input_dim, init_len + 30); model.resetInitHiddenCPU(); for (int t = 0; t < init_len; t++) { for (int i = 0; i < init_input_dim; i++) { buffer(i, t + 30) = init_inputs[point * init_len * init_input_dim + t * init_input_dim + i]; } } model.initializeLSTM(buffer); for (int i = 0; i < hidden_dim; i++) { EXPECT_NEAR(model.getLSTMModel()->getHiddenState()(i), init_hidden[hidden_dim * point + i], 
tol) << "at point " << point << " index " << i; EXPECT_NEAR(model.getLSTMModel()->getCellState()(i), init_cell[hidden_dim * point + i], tol); } for (int t = 0; t < T; t++) { for (int i = 0; i < input_dim; i++) { input(i) = inputs[point * T * input_dim + t * input_dim + i]; } model.forward(input, output); for (int i = 0; i < output_dim; i++) { EXPECT_NEAR(output[i], outputs[point * T * output_dim + t * output_dim + i], tol) << "point " << point << " at dim " << i; } for (int i = 0; i < hidden_dim; i++) { EXPECT_NEAR(model.getLSTMModel()->getHiddenState()(i), hidden[point * T * hidden_dim + hidden_dim * t + i], tol) << "point " << point << " at dim " << i; EXPECT_NEAR(model.getLSTMModel()->getCellState()(i), cell[point * T * hidden_dim + hidden_dim * t + i], tol) << "point " << point << " at dim " << i; } } } } TEST_F(LSTMLSTMHelperTest, forwardCPU) { LSTMLSTMHelper<> model(init_input_dim, init_hidden_dim, init_output_layers, input_dim, hidden_dim, output_layers, 10); model.getLSTMModel()->setAllValues(1.0f); model.getInitModel()->setAllValues(0.1f); model.getInitModel()->getOutputModel()->setAllWeights(0.01f); for (int i = 0; i < init_hidden_dim; i++) { EXPECT_FLOAT_EQ(model.getInitModel()->getHiddenState()(i), 0.1); EXPECT_FLOAT_EQ(model.getInitModel()->getCellState()(i), 0.1); } for (int i = 0; i < hidden_dim; i++) { EXPECT_FLOAT_EQ(model.getLSTMModel()->getHiddenState()(i), 1.0); EXPECT_FLOAT_EQ(model.getLSTMModel()->getCellState()(i), 1.0); } auto input = model.getLSTMModel()->getZeroInputVector(); auto output = model.getLSTMModel()->getZeroOutputVector(); input.setOnes(); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 18.640274); EXPECT_FLOAT_EQ(output[1], 18.640274); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 18.950548); EXPECT_FLOAT_EQ(output[1], 18.950548); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 18.993294); EXPECT_FLOAT_EQ(output[1], 18.993294); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 18.999092); 
EXPECT_FLOAT_EQ(output[1], 18.999092); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 18.999878); EXPECT_FLOAT_EQ(output[1], 18.999878); auto buffer = model.getEmptyBufferMatrix(10); buffer.setOnes(); buffer = buffer * 0.1; model.initializeLSTM(buffer); for (int i = 0; i < init_hidden_dim; i++) { EXPECT_FLOAT_EQ(model.getInitModel()->getHiddenState()(i), 0.99790788); EXPECT_FLOAT_EQ(model.getInitModel()->getCellState()(i), 9.2190571); } for (int i = 0; i < hidden_dim; i++) { EXPECT_FLOAT_EQ(model.getLSTMModel()->getHiddenState()(i), 0.558857381); EXPECT_FLOAT_EQ(model.getLSTMModel()->getCellState()(i), 0.558857381); } input *= 0.1; model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 10.945133); EXPECT_FLOAT_EQ(output[1], 10.945133); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 11.680509); EXPECT_FLOAT_EQ(output[1], 11.680509); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 11.783686); EXPECT_FLOAT_EQ(output[1], 11.783686); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 11.797728); EXPECT_FLOAT_EQ(output[1], 11.797728); model.forward(input, output); EXPECT_FLOAT_EQ(output[0], 11.79963); EXPECT_FLOAT_EQ(output[1], 11.79963); } TEST_F(LSTMLSTMHelperTest, forwardGPU) { const int num_rollouts = 1; LSTMLSTMHelper<> model(init_input_dim, init_hidden_dim, init_output_layers, input_dim, hidden_dim, output_layers, 10); model.getLSTMModel()->setAllValues(1.0f); model.getInitModel()->setAllValues(0.1f); model.getInitModel()->getOutputModel()->setAllWeights(0.01f); model.GPUSetup(); std::array true_vals = { 10.945133, 11.680509, 11.783686, 11.797728, 11.79963 }; std::vector> input_arr(num_rollouts); std::vector> output_arr(num_rollouts); auto input = model.getLSTMModel()->getZeroInputVector(); auto output = model.getLSTMModel()->getZeroOutputVector(); input.setOnes(); input = input * 0.1; for (int i = 0; i < init_hidden_dim; i++) { EXPECT_FLOAT_EQ(model.getInitModel()->getHiddenState()(i), 0.1); 
// NOTE(review): this region is part of an extracted multi-file test dump. C++ template
// argument lists (the contents of <...>) were stripped by the extraction tool (e.g.
// "std::vector theta_vec", "launchForwardTestKernel, 8, 2, 32>", "LSTMLSTMHelper<>"),
// so the code will not compile as-is -- restore the template arguments from the original
// repository before building. Only formatting and comments have been changed here.
//
// Tail of a TEST_F whose opening lies before this chunk: after initializing the LSTM
// from a constant buffer, it checks hard-coded hidden/cell-state values, runs the GPU
// forward kernel, replays the same steps on the CPU, and compares the two.
  EXPECT_FLOAT_EQ(model.getInitModel()->getCellState()(i), 0.1);
}
for (int i = 0; i < hidden_dim; i++)
{
  EXPECT_FLOAT_EQ(model.getLSTMModel()->getHiddenState()(i), 1.0);
  EXPECT_FLOAT_EQ(model.getLSTMModel()->getCellState()(i), 1.0);
}
// Constant 0.1 buffer makes the post-initializeLSTM hidden/cell states deterministic.
auto buffer = model.getEmptyBufferMatrix(10);
buffer.setOnes();
buffer = buffer * 0.1;
for (int state_index = 0; state_index < num_rollouts; state_index++)
{
  for (int dim = 0; dim < input_arr[0].size(); dim++)
  {
    input_arr[state_index][dim] = input(dim);
  }
}
// Sweep over block y-dimensions and forward-step counts.
for (int y_dim = 1; y_dim < 16; y_dim++)
{
  for (int step = 1; step < 6; step++)
  {
    model.initializeLSTM(buffer);
    // Hard-coded expected states produced by the constant-0.1 buffer above.
    for (int i = 0; i < init_hidden_dim; i++)
    {
      EXPECT_FLOAT_EQ(model.getInitModel()->getHiddenState()(i), 0.99790788);
      EXPECT_FLOAT_EQ(model.getInitModel()->getCellState()(i), 9.2190571);
    }
    for (int i = 0; i < hidden_dim; i++)
    {
      EXPECT_FLOAT_EQ(model.getLSTMModel()->getHiddenState()(i), 0.558857381);
      EXPECT_FLOAT_EQ(model.getLSTMModel()->getCellState()(i), 0.558857381);
    }
    // NOTE(review): leading template arguments stripped by extraction.
    launchForwardTestKernel, 8, 2, 32>(*model.getLSTMModel(), input_arr, output_arr, y_dim, step);
    for (int point = 0; point < num_rollouts; point++)
    {
      // model.resetInitHiddenCPU();
      model.resetLSTMHiddenCellCPU();
      input.setOnes();
      input = input * 0.1;
      // CPU replay of the same number of forward steps as the GPU kernel.
      for (int cpu_step = 0; cpu_step < step; cpu_step++)
      {
        model.forward(input, output);
      }
      for (int dim = 0; dim < input_dim; dim++)
      {
        EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim;
      }
      for (int dim = 0; dim < output_dim; dim++)
      {
        // GPU output must be finite, match the CPU pass within 1e-4, and match the
        // precomputed per-step truth value.
        EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4)
            << "at index " << point << " with y_dim " << y_dim << " dim " << dim;
        EXPECT_TRUE(isfinite(output_arr[point][dim]));
        EXPECT_FLOAT_EQ(output(dim), true_vals[step - 1]) << "at dim " << dim << " step " << step;
      }
    }
  }
}
}

// Randomized-weight version: CPU forward pass vs GPU kernel using shared memory.
TEST_F(LSTMLSTMHelperTest, forwardGPUCompareShared)
{
  const int num_rollouts = 3000;
  LSTMLSTMHelper<> model(init_input_dim, init_hidden_dim, init_output_layers, input_dim, hidden_dim, output_layers,
                         init_len);
  // Randomize the output-network and LSTM parameters of the main model...
  std::vector theta_vec(model.getLSTMModel()->getOutputModel()->getNumParams());
  for (int i = 0; i < theta_vec.size(); i++)
  {
    theta_vec[i] = distribution(generator);
  }
  std::vector lstm_vec(model.getLSTMModel()->getNumParams());
  for (int i = 0; i < lstm_vec.size(); i++)
  {
    lstm_vec[i] = distribution(generator);
  }
  model.getLSTMModel()->setAllValues(lstm_vec, theta_vec);
  // ...and of the initialization model.
  theta_vec.resize(model.getInitModel()->getOutputModel()->getNumParams());
  for (int i = 0; i < theta_vec.size(); i++)
  {
    theta_vec[i] = distribution(generator);
  }
  lstm_vec.resize(model.getInitModel()->getNumParams());
  for (int i = 0; i < lstm_vec.size(); i++)
  {
    lstm_vec[i] = distribution(generator);
  }
  model.getInitModel()->setAllValues(lstm_vec, theta_vec);
  model.GPUSetup();
  auto buffer = model.getEmptyBufferMatrix(10);
  std::vector> input_arr(num_rollouts);
  std::vector> output_arr(num_rollouts);
  Eigen::Matrix inputs = Eigen::Matrix::Random();
  auto input = model.getLSTMModel()->getZeroInputVector();
  auto output = model.getLSTMModel()->getZeroOutputVector();
  // Copy the random Eigen inputs into the per-rollout array consumed by the kernel.
  for (int state_index = 0; state_index < num_rollouts; state_index++)
  {
    for (int dim = 0; dim < input_arr[0].size(); dim++)
    {
      input_arr[state_index][dim] = inputs.col(state_index)(dim);
    }
  }
  for (int y_dim = 1; y_dim < 16; y_dim++)
  {
    for (int step = 1; step < 6; step++)
    {
      buffer.setRandom();
      model.initializeLSTM(buffer);
      // NOTE(review): leading template arguments stripped by extraction.
      launchForwardTestKernel, 8, 2, 32>(*model.getLSTMModel(), input_arr, output_arr, y_dim, step);
      for (int point = 0; point < num_rollouts; point++)
      {
        // model.resetInitHiddenCPU();
        model.resetLSTMHiddenCellCPU();
        input = inputs.col(point);
        for (int cpu_step = 0; cpu_step < step; cpu_step++)
        {
          model.forward(input, output);
        }
        for (int dim = 0; dim < input_dim; dim++)
        {
          EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim;
        }
        for (int dim = 0; dim < output_dim; dim++)
        {
          EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4)
              << "at index " << point << " with y_dim " << y_dim << " dim " << dim;
          EXPECT_TRUE(isfinite(output_arr[point][dim]));
        }
      }
    }
  }
}

// Same comparison as forwardGPUCompareShared but with the non-shared-memory kernel
// configuration (note the LSTMLSTMHelper template arguments were stripped here too).
TEST_F(LSTMLSTMHelperTest, forwardGPUCompareNoShared)
{
  const int num_rollouts = 3000;
  LSTMLSTMHelper model(init_input_dim, init_hidden_dim, init_output_layers, input_dim, hidden_dim, output_layers,
                       init_len);
  std::vector theta_vec(model.getLSTMModel()->getOutputModel()->getNumParams());
  for (int i = 0; i < theta_vec.size(); i++)
  {
    theta_vec[i] = distribution(generator);
  }
  std::vector lstm_vec(model.getLSTMModel()->getNumParams());
  for (int i = 0; i < lstm_vec.size(); i++)
  {
    lstm_vec[i] = distribution(generator);
  }
  model.getLSTMModel()->setAllValues(lstm_vec, theta_vec);
  theta_vec.resize(model.getInitModel()->getOutputModel()->getNumParams());
  for (int i = 0; i < theta_vec.size(); i++)
  {
    theta_vec[i] = distribution(generator);
  }
  lstm_vec.resize(model.getInitModel()->getNumParams());
  for (int i = 0; i < lstm_vec.size(); i++)
  {
    lstm_vec[i] = distribution(generator);
  }
  model.getInitModel()->setAllValues(lstm_vec, theta_vec);
  model.GPUSetup();
  auto buffer = model.getEmptyBufferMatrix(10);
  std::vector> input_arr(num_rollouts);
  std::vector> output_arr(num_rollouts);
  Eigen::Matrix inputs = Eigen::Matrix::Random();
  auto input = model.getLSTMModel()->getZeroInputVector();
  auto output = model.getLSTMModel()->getZeroOutputVector();
  for (int state_index = 0; state_index < num_rollouts; state_index++)
  {
    for (int dim = 0; dim < input_arr[0].size(); dim++)
    {
      input_arr[state_index][dim] = inputs.col(state_index)(dim);
    }
  }
  for (int y_dim = 1; y_dim < 16; y_dim++)
  {
    for (int step = 1; step < 6; step++)
    {
      buffer.setRandom();
      model.initializeLSTM(buffer);
      // NOTE(review): leading template arguments stripped by extraction.
      launchForwardTestKernel, 8, 2, 32>(*model.getLSTMModel(), input_arr, output_arr, y_dim, step);
      for (int point = 0; point < num_rollouts; point++)
      {
        // model.resetInitHiddenCPU();
        model.resetLSTMHiddenCellCPU();
        input = inputs.col(point);
        for (int cpu_step = 0; cpu_step < step; cpu_step++)
        {
          model.forward(input, output);
        }
        // Per-dimension CPU-vs-GPU comparison of inputs and outputs.
        for (int dim = 0; dim < input_dim; dim++)
        {
          EXPECT_NEAR(input(dim), input_arr[point][dim], 1e-4) << "at index " << point << " with y_dim " << y_dim;
        }
        for (int dim = 0; dim < output_dim; dim++)
        {
          EXPECT_NEAR(output(dim), output_arr[point][dim], 1e-4)
              << "at index " << point << " with y_dim " << y_dim << " dim " << dim;
          EXPECT_TRUE(isfinite(output_arr[point][dim]));
        }
      }
    }
  }
}

// Disabled/commented-out gradient-computation tests, kept verbatim from the original.
// //
// // TEST_F(LSTMLSTMHelperTest, TestComputeGradComputationFinite)
// // {
// //   LSTMLSTMHelper> model;
// //   std::vector theta(1412);
// //   for (int i = 0; i < 1412; i++)
// //   {
// //     theta[i] = distribution(generator);
// //   }
// //   model.updateModel({ 6, 32, 32, 4 }, theta);
// //
// //   LSTMLSTMHelper>::dfdx numeric_jac;
// //   LSTMLSTMHelper>::dfdx analytic_jac;
// //
// //   for (int i = 0; i < 1000; i++)
// //   {
// //     LSTMLSTMHelper>::input_array input;
// //     input = LSTMLSTMHelper>::input_array::Random();
// //
// //     model.computeGrad(input, analytic_jac);
// //     EXPECT_TRUE(analytic_jac.allFinite());
// //   }
// // }
// //
// // TEST_F(LSTMLSTMHelperTest, TestComputeGradComputationCompare)
// // {
// //   GTEST_SKIP();
// //   LSTMLSTMHelper> model;
// //   std::vector theta(1412);
// //   for (int i = 0; i < 1412; i++)
// //   {
// //     theta[i] = distribution(generator);
// //   }
// //   model.updateModel({ 6, 32, 32, 4 }, theta);
// //
// //   LSTMLSTMHelper>::dfdx numeric_jac;
// //   LSTMLSTMHelper>::dfdx analytic_jac;
// //
// //   LSTMLSTMHelper>::input_array input;
// //   input << 1, 2, 3, 4, 5, 6;
// //
// //   model.computeGrad(input, analytic_jac);
// //
// //   // numeric_jac = num_diff.df(input, numeric_jac);
// //
// //   ASSERT_LT((numeric_jac - analytic_jac).norm(), 1e-3) << "Numeric Jacobian\n"
// //                                                        << numeric_jac << "\nAnalytic Jacobian\n"
// //                                                        << analytic_jac;
// // }
================================================ FILE: tests/sampling_distributions/CMakeLists.txt ================================================
# Build one gtest executable per *.cu test file in this directory.
set(GTEST_LIBRARIES gtest gmock gtest_main)
file(GLOB TARGET_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cu)
foreach(T_FILE IN
LISTS TARGET_SRCS)
  # Each test source becomes its own executable linked against gtest and the
  # header-only MPPI library, registered with CTest via gtest_discover_tests.
  get_filename_component(T_NAME ${T_FILE} NAME_WE)
  add_executable(${T_NAME} ${PROJECT_SOURCE_DIR}/tests/test_main.cpp ${T_FILE})
  target_link_libraries(${T_NAME} ${GTEST_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME})
  gtest_discover_tests(${T_NAME})
  set_target_properties(${T_NAME} PROPERTIES FOLDER test)
endforeach()
================================================ FILE: tests/sampling_distributions/colored_noise_tests.cu ================================================
//
// Created by Bogdan on 12/26/21
//
// NOTE(review): the extraction stripped the <...> header names from the include
// directives below (and template argument lists throughout this file); restore them
// from the original repository before building.
#include #include #include #include "gtest/gtest.h"

// Assert helper: computes the relative error for the failure message, but the actual
// check is ASSERT_NEAR with an *absolute* tolerance of rel_err.
// NOTE(review): 'err' divides by 'known' (not fabsf(known)), so it is negative for
// negative known values and would divide by zero for known == 0 -- message-only impact.
void assert_float_rel_near(const float known, const float compute, float rel_err)
{
  float err = fabsf(compute - known) / known;
  ASSERT_NEAR(known, compute, rel_err) << "Relative error is " << err;
}

// Verifies cuFFT returns the expected error string when handed an uninitialized plan.
TEST(cuFFT, checkErrorCode)
{
  cufftHandle plan;
  cuComplex* input_d;
  float* output_d;
  // As this call is intended to cause issues, disable compiler warning
  // src: https://stackoverflow.com/questions/14831051/how-to-disable-a-specific-nvcc-compiler-warnings
  // https://stackoverflow.com/questions/56193080/how-do-i-apply-a-flag-setting-nvcc-pragma-to-only-a-few-lines-of-code
  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#nv-diagnostic-pragmas
#pragma push
#pragma diag_suppress = used_before_set
  auto status = cufftExecC2R(plan, input_d, output_d);
#pragma pop
  std::string error_string = cufftGetErrorString(status);
  // std::cout << error_string << std::endl;
  EXPECT_TRUE(error_string == "cuFFT was passed an invalid plan handle");
}

// Minimal DynamicsParams stand-in parameterized on the control dimension.
template
struct TestDynamicsParams : public DynamicsParams
{
  enum class ControlIndex : int
  {
    NUM_CONTROLS = C_DIM,
  };
  enum class OutputIndex : int
  {
    EMPTY = 0,
    NUM_OUTPUTS
  };
};

// Typed fixture: builds a ColoredNoiseDistribution sampler on its own CUDA stream with
// a seeded cuRAND generator so the sampled noise is reproducible across runs.
template
class TestNoise : public ::testing::Test
{
public:
  const int NUM_TIMESTEPS = 250;
  const int NUM_ROLLOUTS = 5000;
  const int CONTROL_DIM = C_IND_CLASS(DYN_PARAMS_T, NUM_CONTROLS);
  using SAMPLER_T = mppi::sampling_distributions::ColoredNoiseDistribution;
  using SAMPLER_PARAMS_T =
      typename SAMPLER_T::SAMPLING_PARAMS_T;
  SAMPLER_T* sampler;
  cudaStream_t stream;
  curandGenerator_t* gen;

protected:
  void SetUp() override
  {
    // Exponent 0 == white noise; disable pure-noise trajectories and offset decay so
    // the statistical checks below see plain colored noise.
    SAMPLER_PARAMS_T params;
    for (int i = 0; i < CONTROL_DIM; i++)
    {
      params.exponents[i] = 0.0;
    }
    params.num_timesteps = NUM_TIMESTEPS;
    params.num_rollouts = NUM_ROLLOUTS;
    params.pure_noise_trajectories_percentage = 0.0;
    params.offset_decay_rate = 0.0;
    cudaStreamCreate(&stream);
    gen = new curandGenerator_t();
    curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(*gen, 42);
    curandSetGeneratorOffset(*gen, 0);
    curandSetStream(*gen, stream);
    sampler = new SAMPLER_T(params, stream);
    sampler->GPUSetup();
  }
  void TearDown() override
  {
    // NOTE(review): deletes the generator pointer without curandDestroyGenerator and
    // never destroys 'stream' -- confirm whether these handles leak intentionally.
    delete gen;
    delete sampler;
  }
};

// NOTE(review): the first Types<> element was lost in extraction.
using DIFFERENT_CONTROL_DIMS = ::testing::Types, TestDynamicsParams<2>, TestDynamicsParams<3>, TestDynamicsParams<4>>;
TYPED_TEST_SUITE(TestNoise, DIFFERENT_CONTROL_DIMS);

// With exponent 0 the sampler should produce unit-variance white noise; check the
// empirical 1/2/3-sigma coverage against the 68-95-99.7 rule.
TYPED_TEST(TestNoise, WhiteNoise)
{
  int full_buffer_size = this->NUM_ROLLOUTS * this->NUM_TIMESTEPS * this->CONTROL_DIM;
  float* colored_noise_output = new float[full_buffer_size]{ 0 };
  auto sampler_params = this->sampler->getParams();
  for (int i = 0; i < this->CONTROL_DIM; i++)
  {
    sampler_params.exponents[i] = 0.0;
    sampler_params.std_dev[i] = 1.0;
  }
  this->sampler->setParams(sampler_params);
  this->sampler->generateSamples(0, 0, *(this->gen), false);
  HANDLE_ERROR(cudaMemcpyAsync(colored_noise_output, this->sampler->getControlSample(0, 0, 0),
                               sizeof(float) * full_buffer_size, cudaMemcpyDeviceToHost, this->stream));
  HANDLE_ERROR(cudaStreamSynchronize(this->stream));
  // Histogram samples into the first |z| band they fall in (<1, <2, <3 sigma).
  std::vector num_within_std_dev(3, 0);
  // Ignore first rollout as that will be all zeros
  for (int i = this->NUM_TIMESTEPS * this->CONTROL_DIM; i < full_buffer_size; i++)
  {
    for (int j = 0; j < num_within_std_dev.size(); j++)
    {
      if (fabsf(colored_noise_output[i]) < j + 1.0)
      {
        num_within_std_dev[j]++;
        break;
      }
    }
  }
  // NOTE(review): runtime-sized array (VLA) is a compiler extension, not standard C++.
  float perc_within_n_std_dev[num_within_std_dev.size()];
  // Percentages from https://en.wikipedia.org/wiki/68%E2%80%9395%E2%80%9399.7_rule
  float known_percentages[3] = { 0.6827, 0.9545, 0.9973 };
  for (int i = 0; i < num_within_std_dev.size(); i++)
  {
    // Cumulative sum converts the banded counts into P(|z| < i+1).
    perc_within_n_std_dev[i] =
        std::accumulate(num_within_std_dev.begin(), num_within_std_dev.begin() + i + 1, 0.0) / full_buffer_size;
    assert_float_rel_near(known_percentages[i], perc_within_n_std_dev[i], 0.001);
  }
  delete[] colored_noise_output;
}

// Per-control-dimension exponents (0, 1, 2, ...): dimension 0 stays white and is
// asserted against the sigma rule; dimension 1 (colored) is only printed.
TYPED_TEST(TestNoise, MultiNoise)
{
  int full_buffer_size = this->NUM_ROLLOUTS * this->NUM_TIMESTEPS * this->CONTROL_DIM;
  float* colored_noise_output = new float[full_buffer_size]{ 0 };
  auto sampler_params = this->sampler->getParams();
  for (int i = 0; i < this->CONTROL_DIM; i++)
  {
    sampler_params.exponents[i] = i;
    sampler_params.std_dev[i] = 1.0;
  }
  this->sampler->setParams(sampler_params);
  this->sampler->generateSamples(0, 0, *(this->gen), false);
  HANDLE_ERROR(cudaMemcpyAsync(colored_noise_output, this->sampler->getControlSample(0, 0, 0),
                               sizeof(float) * full_buffer_size, cudaMemcpyDeviceToHost, this->stream));
  HANDLE_ERROR(cudaStreamSynchronize(this->stream));
  const int num_std_devs = 3;
  // std::vector num_within_std_dev(3, 0);
  std::vector> control_std_dev_count;
  for (int i = 0; i < this->CONTROL_DIM; i++)
  {
    std::array std_dev_count_i = { 0, 0, 0 };
    control_std_dev_count.push_back(std_dev_count_i);
  }
  // Skip rollout 0 (n starts at 1) and bin each sample per control dimension.
  for (int n = 1; n < this->NUM_ROLLOUTS; n++)
  {
    for (int t = 0; t < this->NUM_TIMESTEPS; t++)
    {
      for (int i = 0; i < this->CONTROL_DIM; i++)
      {
        const int sample_idx = (n * this->NUM_TIMESTEPS + t) * this->CONTROL_DIM + i;
        for (int j = 0; j < num_std_devs; j++)
        {
          if (fabsf(colored_noise_output[sample_idx]) < j + 1.0)
          {
            control_std_dev_count[i][j]++;
            break;
          }
        }
      }
    }
  }
  float perc_within_n_std_dev[num_std_devs];
  // Percentages from https://en.wikipedia.org/wiki/68%E2%80%9395%E2%80%9399.7_rule
  float known_percentages[num_std_devs] = { 0.6827, 0.9545, 0.9973 };
  float proper_count = (this->NUM_ROLLOUTS - 1) * this->NUM_TIMESTEPS;
  for (int i = 0; i < num_std_devs; i++)
  {
    // Dimension 0 has exponent 0 (white) -- it must satisfy the 68-95-99.7 rule.
    perc_within_n_std_dev[i] =
        std::accumulate(control_std_dev_count[0].begin(), control_std_dev_count[0].begin() + i + 1, 0.0) / proper_count;
    assert_float_rel_near(known_percentages[i], perc_within_n_std_dev[i], 0.001);
  }
  if (this->CONTROL_DIM > 1)
  {
    // Colored dimension 1 is only reported, not asserted.
    for (int i = 0; i < num_std_devs; i++)
    {
      perc_within_n_std_dev[i] =
          std::accumulate(control_std_dev_count[1].begin(), control_std_dev_count[1].begin() + i + 1, 0.0) /
          proper_count;
      std::cout << "Control 1 values within " << i + 1 << " std dev: " << perc_within_n_std_dev[i] << std::endl;
    }
  }
  delete[] colored_noise_output;
}

// Disabled: direct powerlaw_psd_gaussian check with exponent 0 (white noise);
// asserts the empirical sigma coverage. NOTE(review): template arguments on the
// std::vector declarations below were stripped by extraction.
TEST(ColoredNoise, DISABLED_checkWhiteNoise)
{
  int NUM_TIMESTEPS = 50000;
  int NUM_ROLLOUTS = 1;
  int CONTROL_DIM = 1;
  std::vector exponents(CONTROL_DIM, 0.0);
  int full_buffer_size = NUM_ROLLOUTS * NUM_TIMESTEPS * CONTROL_DIM;
  float* colored_noise_d;
  float colored_noise_output[full_buffer_size] = { 0 };
  HANDLE_ERROR(cudaMalloc((void**)&colored_noise_d, sizeof(float) * full_buffer_size));
  cudaStream_t stream;
  curandGenerator_t gen;
  cudaStreamCreate(&stream);
  curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
  curandSetPseudoRandomGeneratorSeed(gen, 42);
  curandSetStream(gen, stream);
  powerlaw_psd_gaussian(exponents, NUM_TIMESTEPS, NUM_ROLLOUTS, colored_noise_d, 0, gen, 0.0f, stream);
  HANDLE_ERROR(cudaMemcpyAsync(colored_noise_output, colored_noise_d, sizeof(float) * full_buffer_size,
                               cudaMemcpyDeviceToHost, stream));
  HANDLE_ERROR(cudaStreamSynchronize(stream));
  // Check percentages for 3 standard deviations
  std::vector num_within_std_dev(3, 0);
  for (int i = 0; i < full_buffer_size; i++)
  {
    for (int j = 0; j < num_within_std_dev.size(); j++)
    {
      if (fabsf(colored_noise_output[i]) < j + 1.0)
      {
        num_within_std_dev[j]++;
        break;
      }
    }
  }
  float perc_within_n_std_dev[num_within_std_dev.size()];
  // Percentages from https://en.wikipedia.org/wiki/68%E2%80%9395%E2%80%9399.7_rule
  float known_percentages[3] = { 0.6827, 0.9545, 0.9973 };
  for (int i = 0; i < num_within_std_dev.size(); i++)
  {
    perc_within_n_std_dev[i] =
        std::accumulate(num_within_std_dev.begin(), num_within_std_dev.begin() + i + 1, 0.0) / full_buffer_size;
    assert_float_rel_near(known_percentages[i], perc_within_n_std_dev[i], 0.001);
  }
}

// Disabled: pink noise (exponent 1). Coverage is only printed -- pink noise does not
// follow the white-noise sigma rule, so the assertions are commented out.
TEST(ColoredNoise, DISABLED_checkPinkNoise)
{
  int NUM_TIMESTEPS = 50000;
  int NUM_ROLLOUTS = 1;
  int CONTROL_DIM = 1;
  std::vector exponents(CONTROL_DIM, 1.0);
  int full_buffer_size = NUM_ROLLOUTS * NUM_TIMESTEPS * CONTROL_DIM;
  float* colored_noise_d;
  float colored_noise_output[full_buffer_size] = { 0 };
  HANDLE_ERROR(cudaMalloc((void**)&colored_noise_d, sizeof(float) * full_buffer_size));
  cudaStream_t stream;
  curandGenerator_t gen;
  cudaStreamCreate(&stream);
  curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
  curandSetPseudoRandomGeneratorSeed(gen, 42);
  curandSetStream(gen, stream);
  powerlaw_psd_gaussian(exponents, NUM_TIMESTEPS, NUM_ROLLOUTS, colored_noise_d, 0, gen, 0.0f, stream);
  HANDLE_ERROR(cudaMemcpyAsync(colored_noise_output, colored_noise_d, sizeof(float) * full_buffer_size,
                               cudaMemcpyDeviceToHost, stream));
  HANDLE_ERROR(cudaStreamSynchronize(stream));
  int within_std_dev = 0;
  int within_2_std_dev = 0;
  for (int i = 0; i < full_buffer_size; i++)
  {
    if (fabsf(colored_noise_output[i]) < 1.0)
    {
      within_std_dev++;
    }
    else if (fabsf(colored_noise_output[i]) < 2.0)
    {
      within_2_std_dev++;
    }
  }
  float perc_one_std_dev = (float)within_std_dev / full_buffer_size;
  float perc_two_std_dev = (float)(within_std_dev + within_2_std_dev) / full_buffer_size;
  std::cout << "Percentage within 1 std dev: " << 100 * perc_one_std_dev << std::endl;
  std::cout << "Percentage within 2 std dev: " << 100 * perc_two_std_dev << std::endl;
  // assert_float_rel_near(0.6827, perc_one_std_dev, 0.001);
  // assert_float_rel_near(0.9545, perc_two_std_dev, 0.001);
}

// Disabled: red/Brownian noise (exponent 2); coverage printed only.
TEST(ColoredNoise, DISABLED_checkRedNoise)
{
  int NUM_TIMESTEPS = 50000;
  int NUM_ROLLOUTS = 1;
  int CONTROL_DIM = 1;
  std::vector exponents(CONTROL_DIM, 2.0);
  int full_buffer_size = NUM_ROLLOUTS * NUM_TIMESTEPS * CONTROL_DIM;
  float* colored_noise_d;
  float colored_noise_output[full_buffer_size] = { 0 };
  HANDLE_ERROR(cudaMalloc((void**)&colored_noise_d, sizeof(float) * full_buffer_size));
  cudaStream_t stream;
  curandGenerator_t gen;
  cudaStreamCreate(&stream);
  curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
  curandSetPseudoRandomGeneratorSeed(gen, 42);
  curandSetStream(gen, stream);
  powerlaw_psd_gaussian(exponents, NUM_TIMESTEPS, NUM_ROLLOUTS, colored_noise_d, 0, gen, 0.0f, stream);
  HANDLE_ERROR(cudaMemcpyAsync(colored_noise_output, colored_noise_d, sizeof(float) * full_buffer_size,
                               cudaMemcpyDeviceToHost, stream));
  HANDLE_ERROR(cudaStreamSynchronize(stream));
  // Check percentages for 3 standard deviations
  std::vector num_within_std_dev(3, 0);
  for (int i = 0; i < full_buffer_size; i++)
  {
    for (int j = 0; j < num_within_std_dev.size(); j++)
    {
      if (fabsf(colored_noise_output[i]) < j + 1.0)
      {
        num_within_std_dev[j]++;
        break;
      }
    }
  }
  float perc_within_n_std_dev[num_within_std_dev.size()];
  // Percentages from https://en.wikipedia.org/wiki/68%E2%80%9395%E2%80%9399.7_rule
  float known_percentages[3] = { 0.6827, 0.9545, 0.9973 };
  for (int i = 0; i < num_within_std_dev.size(); i++)
  {
    perc_within_n_std_dev[i] =
        std::accumulate(num_within_std_dev.begin(), num_within_std_dev.begin() + i + 1, 0.0) / full_buffer_size;
    std::cout << "Percentage within " << i + 1 << " std dev: " << 100 * perc_within_n_std_dev[i] << std::endl;
    // assert_float_rel_near(known_percentages[i], perc_within_n_std_dev[i], 0.001);
  }
}

// Disabled: mixed exponents per control dimension (0.0 / 0.5 / 2.0); only the white
// dimension (exponent == 0) is asserted against the sigma rule.
TEST(ColoredNoise, DISABLED_checkMultiNoise)
{
  int NUM_TIMESTEPS = 6000;
  int NUM_ROLLOUTS = 50;
  int CONTROL_DIM = 3;
  std::vector exponents(CONTROL_DIM, 0.0);
  exponents[1] = 0.5;
  exponents[2] = 2.0;
  // exponents[3] = 1.25;
  // exponents[4] = 0.75;
  int full_buffer_size = NUM_ROLLOUTS * NUM_TIMESTEPS * CONTROL_DIM;
  float* colored_noise_d;
  float colored_noise_output[full_buffer_size] = { 0 };
  HANDLE_ERROR(cudaMalloc((void**)&colored_noise_d, sizeof(float) * full_buffer_size));
  cudaStream_t stream;
  curandGenerator_t gen;
  cudaStreamCreate(&stream);
  curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
  curandSetPseudoRandomGeneratorSeed(gen, 42);
  curandSetStream(gen, stream);
  powerlaw_psd_gaussian(exponents, NUM_TIMESTEPS, NUM_ROLLOUTS, colored_noise_d, 0, gen, 0.0f, stream);
  HANDLE_ERROR(cudaMemcpyAsync(colored_noise_output, colored_noise_d, sizeof(float) * full_buffer_size,
                               cudaMemcpyDeviceToHost, stream));
  HANDLE_ERROR(cudaStreamSynchronize(stream));
  // Check percentages for 3 standard deviations
  std::vector num_within_std_dev(3, 0);
  float perc_within_n_std_dev[num_within_std_dev.size()];
  // Percentages from https://en.wikipedia.org/wiki/68%E2%80%9395%E2%80%9399.7_rule
  float known_percentages[3] = { 0.6827, 0.9545, 0.9973 };
  for (int c = 0; c < CONTROL_DIM; c++)
  {
    std::fill(num_within_std_dev.begin(), num_within_std_dev.end(), 0);
    for (int i = 0; i < NUM_TIMESTEPS * NUM_ROLLOUTS; i++)
    {
      for (int j = 0; j < num_within_std_dev.size(); j++)
      {
        if (fabsf(colored_noise_output[i * CONTROL_DIM + c]) < j + 1.0)
        {
          num_within_std_dev[j]++;
          break;
        }
      }
    }
    for (int i = 0; i < num_within_std_dev.size(); i++)
    {
      perc_within_n_std_dev[i] = std::accumulate(num_within_std_dev.begin(), num_within_std_dev.begin() + i + 1, 0.0) /
                                 (NUM_ROLLOUTS * NUM_TIMESTEPS);
      std::cout << "Colored Noise " << exponents[c] << " ";
      std::cout << "percent of samples within " << i + 1 << " std dev: " << 100 * perc_within_n_std_dev[i] << std::endl;
      if (exponents[c] == 0)
      {
        assert_float_rel_near(known_percentages[i], perc_within_n_std_dev[i], 0.001);
      }
    }
  }
}
================================================ FILE: tests/sampling_distributions/gaussian_noise_tests.cu ================================================
// NOTE(review): header names stripped from the includes by extraction; restore them.
#include #include #include #include #include
namespace ms = mppi::sampling_distributions;

// Typed fixture for Gaussian sampling distributions: owns the sampler, stream, seeded
// cuRAND generator, and a CPU-side copy of the per-distribution means.
template
class GaussianTests : public ::testing::Test
{
public:
  using SAMPLER_PARAMS_T = typename SAMPLER_T::SAMPLING_PARAMS_T;
  static const int OUTPUT_DIM = E_INDEX(SAMPLER_T::OutputIndex, NUM_OUTPUTS);
  typedef Eigen::Matrix
      output_array;
  int num_samples_ = 1000;
  int num_timesteps_ = 100;
  int num_distributions_ = 1;
  int num_verifications_ = 0;
  // Device-side buffers (allocated elsewhere; all start null).
  int* sampling_indices_d_ = nullptr;
  int* times_d_ = nullptr;
  int* distribution_indices_d_ = nullptr;
  float* outputs_d_ = nullptr;
  float* controls_d_ = nullptr;
  float* costs_d_ = nullptr;
  float lambda_ = 1.0f;
  float alpha_ = 0.0f;
  std::shared_ptr sampler_ = nullptr;
  cudaStream_t stream_ = 0;
  curandGenerator_t gen_;
  dim3 thread_block_ = dim3(32, 1, 1);
  std::vector sampled_indices_;
  std::vector sampled_times_;
  std::vector sampled_distributions_;
  std::vector means_cpu_;
  void SetUp() override
  {
    cudaStreamCreate(&stream_);
    sampler_ = std::make_shared(stream_);
    updateSamplerSizes();
    sampler_->GPUSetup();
    HANDLE_CURAND_ERROR(curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT));
    HANDLE_CURAND_ERROR(curandSetPseudoRandomGeneratorSeed(gen_, 42));
  }
  void TearDown() override
  {
    // NOTE(review): intentionally empty? stream_ and gen_ created in SetUp are never
    // destroyed here -- confirm cleanup happens elsewhere.
  }
  // Pushes the fixture's sizes into the sampler and resizes the CPU mean cache.
  void updateSamplerSizes()
  {
    sampler_->setNumTimesteps(num_timesteps_);
    sampler_->setNumRollouts(num_samples_);
    sampler_->setNumDistributions(num_distributions_);
    means_cpu_.resize(num_distributions_);
  }
  // Uploads a random mean trajectory per distribution and keeps a CPU copy for
  // later z-score computation.
  void setRandomMeans()
  {
    for (int i = 0; i < num_distributions_; i++)
    {
      Eigen::MatrixXf mean_i = 5 * Eigen::MatrixXf::Random(SAMPLER_T::CONTROL_DIM, num_timesteps_);
      sampler_->copyImportanceSamplerToDevice(mean_i.data(), i, false);
      means_cpu_[i] = mean_i;
    }
  }
};

// NOTE(review): template argument lists below were stripped by extraction.
template
using GaussianParams3 = ms::GaussianParamsImpl;
// Thin subclass exercising the GaussianDistributionImpl CRTP path with a 3-distribution
// params type.
template
class TestGaussianDistribution : public ms::GaussianDistributionImpl, GaussianParams3, DYN_PARAMS_T>
{
public:
  using PARENT_CLASS = ms::GaussianDistributionImpl;
  using SAMPLING_PARAMS_T = typename PARENT_CLASS::SAMPLING_PARAMS_T;
  TestGaussianDistribution(cudaStream_t stream = 0) : PARENT_CLASS(stream)
  {
  }
  TestGaussianDistribution(const SAMPLING_PARAMS_T& params, cudaStream_t stream = 0) : PARENT_CLASS(params, stream)
  {
  }
};
using TYPE_TESTS = ::testing::Types>, ms::GaussianDistribution>, TestGaussianDistribution>>;
TYPED_TEST_SUITE(GaussianTests, TYPE_TESTS);

// Requesting more distributions than the params type supports must throw; otherwise
// the new count must round-trip through getParams().
TYPED_TEST(GaussianTests, SetNumDistributions)
{
  using T = TypeParam;
  this->num_distributions_ = 3;
  if (this->num_distributions_ > T::SAMPLING_PARAMS_T::MAX_DISTRIBUTIONS)
  {
    EXPECT_THROW(this->sampler_->setNumDistributions(this->num_distributions_), std::out_of_range);
  }
  else
  {
    this->sampler_->setNumDistributions(this->num_distributions_);
    auto params = this->sampler_->getParams();
    EXPECT_EQ(this->num_distributions_, params.num_distributions);
  }
}

// Samples around random means with std dev 2.3 and verifies the empirical z-score
// distribution matches the Gaussian sigma rule.
TYPED_TEST(GaussianTests, CheckSamplesAreGaussian)
{
  using T = TypeParam;
  float std_dev = 2.3f;
  auto params = this->sampler_->getParams();
  params.pure_noise_trajectories_percentage = 0.0f;
  for (int i = 0; i < this->num_distributions_; i++)
  {
    for (int j = 0; j < T::CONTROL_DIM; j++)
    {
      params.std_dev[j + i * T::CONTROL_DIM] = std_dev;
    }
  }
  this->sampler_->setParams(params);
  // NOTE(review): setRandomMeans() is called twice; the second call overwrites the
  // first -- looks like an accidental duplicate.
  this->setRandomMeans();
  this->setRandomMeans();
  this->sampler_->generateSamples(0, 0, this->gen_, false);
  std::vector sampled_controls(T::CONTROL_DIM * this->num_timesteps_ * this->num_samples_ * this->num_distributions_);
  HANDLE_ERROR(cudaMemcpyAsync(sampled_controls.data(), this->sampler_->getControlSample(0, 0, 0),
                               sizeof(float) * this->num_samples_ * this->num_timesteps_ * this->num_distributions_ *
                                   T::CONTROL_DIM,
                               cudaMemcpyDeviceToHost, this->stream_));
  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
  int k_bins = 3;
  std::vector binned_counts(k_bins, 0);
  for (int d = 0; d < this->num_distributions_; d++)
  {
    for (int s = 0; s < this->num_samples_; s++)
    {
      for (int t = 0; t < this->num_timesteps_; t++)
      {
        for (int i = 0; i < T::CONTROL_DIM; i++)
        {
          int index = ((d * this->num_samples_ + s) * this->num_timesteps_ + t) * T::CONTROL_DIM + i;
          float z_score = (sampled_controls[index] - this->means_cpu_[d](i, t)) / std_dev;
          // NOTE(review): j starts at k_bins (3) but binned_counts has only k_bins
          // elements, so binned_counts[3]++ writes out of bounds whenever |z| < 4;
          // the loop should start at k_bins - 1.
          for (int j = k_bins; j >= 0; j--)
          {
            if (fabsf(z_score) < j + 1)
            {
              binned_counts[j]++;
            }
            else
            {
              break;
            }
          }
        }
      }
    }
  }
  // Check how many samples are properly sampled
  std::vector normalized_bins(k_bins, 0.0f);
  std::vector sigma_rules(k_bins);
  for (int i = 0; i < k_bins; i++)
  {
    // Expected Gaussian mass within (i+1) sigma, computed from erf.
    sigma_rules[i] = 0.5 * (erf((i + 1) / sqrt(2.0)) - erf(-(i + 1) / sqrt(2.0)));
    normalized_bins[i] =
        (double)binned_counts[i] / (this->num_distributions_ * this->num_samples_ * this->num_timesteps_ * T::CONTROL_DIM);
    double diff = fabsf(normalized_bins[i] - sigma_rules[i]) / sigma_rules[i];
    EXPECT_NEAR(diff, 0.0, 1e-3);
  }
}

// Same sigma-rule check but with pure_noise_trajectories_percentage = 1.0, so the
// uploaded means should have no effect and the samples are zero-mean.
TYPED_TEST(GaussianTests, CheckZeroMeanSamplesAreGaussian)
{
  using T = TypeParam;
  auto params = this->sampler_->getParams();
  params.pure_noise_trajectories_percentage = 1.0f;
  float std_dev = 2.3f;
  for (int i = 0; i < this->num_distributions_; i++)
  {
    for (int j = 0; j < T::CONTROL_DIM; j++)
    {
      params.std_dev[j + i * T::CONTROL_DIM] = std_dev;
    }
  }
  this->sampler_->setParams(params);
  this->setRandomMeans();  // should end up doing not effecting the samples
  this->sampler_->generateSamples(0, 0, this->gen_, false);
  std::vector sampled_controls(T::CONTROL_DIM * this->num_timesteps_ * this->num_samples_ * this->num_distributions_);
  HANDLE_ERROR(cudaMemcpyAsync(sampled_controls.data(), this->sampler_->getControlSample(0, 0, 0),
                               sizeof(float) * this->num_samples_ * this->num_timesteps_ * this->num_distributions_ *
                                   T::CONTROL_DIM,
                               cudaMemcpyDeviceToHost, this->stream_));
  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
  int k_bins = 3;
  std::vector binned_counts(k_bins, 0);
  for (int d = 0; d < this->num_distributions_; d++)
  {
    for (int s = 0; s < this->num_samples_; s++)
    {
      for (int t = 0; t < this->num_timesteps_; t++)
      {
        for (int i = 0; i < T::CONTROL_DIM; i++)
        {
          int index = ((d * this->num_samples_ + s) * this->num_timesteps_ + t) * T::CONTROL_DIM + i;
          float z_score = (sampled_controls[index]) / std_dev;
          // NOTE(review): same off-by-one as CheckSamplesAreGaussian -- j == k_bins
          // indexes past the end of binned_counts.
          for (int j = k_bins; j >= 0; j--)
          {
            if (fabsf(z_score) < j + 1)
            {
              binned_counts[j]++;
            }
            else
            {
              break;
            }
          }
        }
      }
    }
  }
  // Check how many samples are properly sampled
  std::vector normalized_bins(k_bins, 0.0f);
  std::vector sigma_rules(k_bins);
  for (int i = 0; i < k_bins; i++)
  {
    sigma_rules[i] = 0.5 * (erf((i + 1) / sqrt(2.0)) - erf(-(i + 1) / sqrt(2.0)));
    normalized_bins[i] =
        (double)binned_counts[i] / (this->num_distributions_ * this->num_samples_ * this->num_timesteps_ * T::CONTROL_DIM);
    double diff = fabsf(normalized_bins[i] - sigma_rules[i]) / sigma_rules[i];
    EXPECT_NEAR(diff, 0.0, 1e-3);
  }
}
================================================ FILE: tests/sampling_distributions/generic_sampling_distribution_tests.cu ================================================
// NOTE(review): header names stripped from the includes by extraction; restore them.
#include #include #include #include #include #include #include #include #include
#if __cplusplus < 201703L
// std::void_t is a C++17 feature, used for inherited_from_gaussian struct
namespace std
{
template
struct make_void
{
  typedef void type;
};
template
using void_t = typename make_void::type;
}  // namespace std
#endif
namespace ms = mppi::sampling_distributions;

// Kernel: each x-thread handles one verification entry; outputs are staged in shared
// memory, the distribution is initialized, then the requested control sample is read
// back into global memory for CPU-side comparison.
template
__global__ void getControlSamplesKernel(SAMPLER_T* __restrict__ sampler, const int num_control_samples,
                                        const int* __restrict__ sample_indexes_d, const int* __restrict__ times_d,
                                        const int* __restrict__ distribution_indexes_d,
                                        const float* __restrict__ outputs_d, float* __restrict__ control_samples_d)
{
  const int size_of_theta_d_bytes = mppi::kernels::calcClassSharedMemSize(sampler, blockDim);
  using OutputIndex = typename SAMPLER_T::OutputIndex;
  const int OUTPUT_DIM = E_INDEX(OutputIndex, NUM_OUTPUTS);
  extern __shared__ float entire_buffer[];
  float* theta_d_shared = &entire_buffer[0 / sizeof(float)];
  float* outputs_shared = &entire_buffer[size_of_theta_d_bytes / sizeof(float)];  // THREAD_BLOCK_X * OUTPUT_DIM in size
  int x_index, x_step;
  mppi::p1::getParallel1DIndex(x_index, x_step);
  int y_index, y_step;
  mppi::p1::getParallel1DIndex(y_index, y_step);
  int test_index = threadIdx.x + blockDim.x * blockIdx.x;
  // load output into shared memory
  if (test_index < num_control_samples)
  {
    for (int j = y_index; j < OUTPUT_DIM; j += y_step)
    {
      outputs_shared[j + x_index * OUTPUT_DIM] = outputs_d[j + test_index * OUTPUT_DIM];
    }
  }
  __syncthreads();
  float* output =
      &outputs_shared[x_index * OUTPUT_DIM];
  // initialize sampling distributions
  sampler->initializeDistributions(output, 0.0f, 0.01f, theta_d_shared);
  __syncthreads();
  // get control samples
  if (test_index < num_control_samples)
  {
    int sample_index = sample_indexes_d[test_index];
    int t = times_d[test_index];
    int distribution_idx = distribution_indexes_d[test_index];
    float* u_to_save_to = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
    sampler->readControlSample(sample_index, t, distribution_idx, u_to_save_to, theta_d_shared, y_step, y_index,
                               output);
  }
}

// Kernel: reads a control sample per verification entry, evaluates the sampler's
// likelihood-ratio cost for it, reduces the per-thread costs across the block's
// y-dimension, and writes one cost per entry to costs_d.
template
__global__ void getLikelihoodCostKernel(SAMPLER_T* __restrict__ sampler, const int num_control_samples, float lambda,
                                        float alpha, const int* __restrict__ sample_indexes_d,
                                        const int* __restrict__ times_d,
                                        const int* __restrict__ distribution_indexes_d,
                                        const float* __restrict__ outputs_d, float* __restrict__ control_samples_d,
                                        float* __restrict__ costs_d)
{
  const int size_of_theta_d_bytes = mppi::kernels::calcClassSharedMemSize(sampler, blockDim);
  using OutputIndex = typename SAMPLER_T::OutputIndex;
  const int OUTPUT_DIM = E_INDEX(OutputIndex, NUM_OUTPUTS);
  // Shared-memory layout: [sampler theta | staged outputs | per-thread running costs].
  extern __shared__ float entire_buffer[];
  float* theta_d_shared = &entire_buffer[0 / sizeof(float)];
  float* outputs_shared = &theta_d_shared[size_of_theta_d_bytes / sizeof(float)];  // THREAD_BLOCK_X * OUTPUT_DIM in size
  float* running_cost_shared = &outputs_shared[blockDim.x * OUTPUT_DIM];
  int running_cost_index = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  float* running_cost = &running_cost_shared[running_cost_index];
  running_cost[0] = 0.0f;
  int x_index, x_step;
  mppi::p1::getParallel1DIndex(x_index, x_step);
  int y_index, y_step;
  mppi::p1::getParallel1DIndex(y_index, y_step);
  int test_index = threadIdx.x + blockDim.x * blockIdx.x;
  // load output into shared memory
  if (test_index < num_control_samples)
  {
    for (int j = y_index; j < OUTPUT_DIM; j += y_step)
    {
      outputs_shared[j + x_index * OUTPUT_DIM] = outputs_d[j + test_index * OUTPUT_DIM];
    }
  }
  __syncthreads();
  float* output = &outputs_shared[x_index * OUTPUT_DIM];
  // initialize sampling distributions
  sampler->initializeDistributions(output, 0.0f, 0.01f, theta_d_shared);
  __syncthreads();
  // get control samples
  if (test_index < num_control_samples)
  {
    int sample_index = sample_indexes_d[test_index];
    int t = times_d[test_index];
    int distribution_idx = distribution_indexes_d[test_index];
    float* u_to_save_to = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
    sampler->readControlSample(sample_index, t, distribution_idx, u_to_save_to, theta_d_shared, y_step, y_index,
                               output);
  }
  __syncthreads();
  // Calculate likelihood ratio cost
  if (test_index < num_control_samples)
  {
    int sample_index = sample_indexes_d[test_index];
    int t = times_d[test_index];
    int distribution_idx = distribution_indexes_d[test_index];
    float* u = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
    running_cost[0] =
        sampler->computeLikelihoodRatioCost(u, theta_d_shared, sample_index, t, distribution_idx, lambda, alpha);
  }
  // __syncthreads();
  // if (threadIdx.x == 1 && threadIdx.y + threadIdx.z == 0 && blockIdx.x == 0)
  // {
  //   printf("Running costs:\n");
  //   for (int i = 0; i < blockDim.x; i++)
  //   {
  //     float cost_sum = 0.0f;
  //     for (int j = 0; j < blockDim.y; j++)
  //     {
  //       cost_sum += running_cost_shared[i + j * blockDim.x];
  //       printf("(%2d, %2d): %6.3f\n", i, j, running_cost_shared[i + j * blockDim.x]);
  //     }
  //     printf("Sum of y dim for %2d: %8.5f\n", i, cost_sum);
  //   }
  // }
  // Re-point running_cost at this z-slice's base before the block-wide reduction.
  running_cost = &running_cost_shared[blockDim.x * blockDim.y * threadIdx.z];
  __syncthreads();
  mppi::kernels::costArrayReduction(&running_cost[x_index], blockDim.y, threadIdx.y, blockDim.y, threadIdx.y == 0,
                                    blockDim.x);
  if (test_index < num_control_samples)
  {
    costs_d[test_index] = running_cost[x_index];
  }
}

// Kernel: same staging/readControlSample flow as above, but evaluates the sampler's
// feedback-control cost (computeFeedbackCost) instead of the likelihood-ratio cost.
template
__global__ void getFeedbackCostKernel(SAMPLER_T* __restrict__ sampler, const int num_control_samples, float lambda,
                                      float alpha, const int* __restrict__ sample_indexes_d,
                                      const int* __restrict__ times_d, const int* __restrict__ distribution_indexes_d,
                                      const float* __restrict__ outputs_d, float* __restrict__ control_samples_d,
                                      float* __restrict__ costs_d)
{
  const int size_of_theta_d_bytes = mppi::kernels::calcClassSharedMemSize(sampler, blockDim);
  using OutputIndex = typename SAMPLER_T::OutputIndex;
  const int OUTPUT_DIM = E_INDEX(OutputIndex, NUM_OUTPUTS);
  extern __shared__ float entire_buffer[];
  float* theta_d_shared = &entire_buffer[0 / sizeof(float)];
  float* outputs_shared = &theta_d_shared[size_of_theta_d_bytes / sizeof(float)];  // THREAD_BLOCK_X * OUTPUT_DIM in size
  float* running_cost_shared = &outputs_shared[blockDim.x * OUTPUT_DIM];
  int running_cost_index = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  float* running_cost = &running_cost_shared[running_cost_index];
  running_cost[0] = 0.0f;
  int x_index, x_step;
  mppi::p1::getParallel1DIndex(x_index, x_step);
  int y_index, y_step;
  mppi::p1::getParallel1DIndex(y_index, y_step);
  int test_index = threadIdx.x + blockDim.x * blockIdx.x;
  // load output into shared memory
  if (test_index < num_control_samples)
  {
    for (int j = y_index; j < OUTPUT_DIM; j += y_step)
    {
      outputs_shared[j + x_index * OUTPUT_DIM] = outputs_d[j + test_index * OUTPUT_DIM];
    }
  }
  __syncthreads();
  float* output = &outputs_shared[x_index * OUTPUT_DIM];
  // initialize sampling distributions
  sampler->initializeDistributions(output, 0.0f, 0.01f, theta_d_shared);
  __syncthreads();
  // get control samples
  if (test_index < num_control_samples)
  {
    int sample_index = sample_indexes_d[test_index];
    int t = times_d[test_index];
    int distribution_idx = distribution_indexes_d[test_index];
    float* u_to_save_to = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
    sampler->readControlSample(sample_index, t, distribution_idx, u_to_save_to, theta_d_shared, y_step, y_index,
                               output);
  }
  __syncthreads();
  // Calculate Feedback Control cost using random control samples from distribution
  if (test_index < num_control_samples)
  {
    int
        sample_index = sample_indexes_d[test_index];
    int t = times_d[test_index];
    int distribution_idx = distribution_indexes_d[test_index];
    float* u = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
    running_cost[0] = sampler->computeFeedbackCost(u, theta_d_shared, t, distribution_idx, lambda, alpha);
  }
  // Reduce per-thread costs across the block's y-dimension, then write one cost per
  // verification entry.
  running_cost = &running_cost_shared[blockDim.x * blockDim.y * threadIdx.z];
  __syncthreads();
  mppi::kernels::costArrayReduction(&running_cost[x_index], blockDim.y, threadIdx.y, blockDim.y, threadIdx.y == 0,
                                    blockDim.x);
  if (test_index < num_control_samples)
  {
    costs_d[test_index] = running_cost[x_index];
  }
}

// Typed fixture for generic sampling-distribution tests: owns device buffers for the
// randomly chosen (sample, time, distribution) verification tuples plus outputs,
// controls, and costs. NOTE(review): template argument lists were stripped by
// extraction throughout (std::vector, std::shared_ptr, util::EigenAlignedVector, ...).
template
class SamplingDistributionTests : public ::testing::Test
{
public:
  using SAMPLER_PARAMS_T = typename SAMPLER_T::SAMPLING_PARAMS_T;
  static const int OUTPUT_DIM = E_INDEX(SAMPLER_T::OutputIndex, NUM_OUTPUTS);
  typedef Eigen::Matrix output_array;
  int num_samples_ = 1000;
  int num_timesteps_ = 100;
  int num_distributions_ = 1;
  int num_verifications_ = 0;
  int* sampling_indices_d_ = nullptr;
  int* times_d_ = nullptr;
  int* distribution_indices_d_ = nullptr;
  float* outputs_d_ = nullptr;
  float* controls_d_ = nullptr;
  float* costs_d_ = nullptr;
  float lambda_ = 1.0f;
  float alpha_ = 0.0f;
  std::shared_ptr sampler_ = nullptr;
  cudaStream_t stream_ = 0;
  curandGenerator_t gen_;
  dim3 thread_block_ = dim3(32, 2, 1);
  std::vector sampled_indices_;
  std::vector sampled_times_;
  std::vector sampled_distributions_;
  util::EigenAlignedVector sampled_outputs_;
  util::EigenAlignedVector sampled_controls_cpu_;
  util::EigenAlignedVector sampled_controls_gpu_;
  void SetUp() override
  {
    cudaStreamCreate(&stream_);
    sampler_ = std::make_shared(stream_);
    sampler_->GPUSetup();
    setUpCudaMemory(3000);
    HANDLE_CURAND_ERROR(curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT));
    HANDLE_CURAND_ERROR(curandSetPseudoRandomGeneratorSeed(gen_, 42));
  }
  // cudaFree a device pointer (if set) and null it out.
  template
  void freeCudaPtr(T*& ptr)
  {
    if (ptr != nullptr)
    {
      cudaFree(ptr);
    }
    ptr = nullptr;
  }
  // (Re)allocate all device buffers, but only when the requested count grows.
  void setUpCudaMemory(const int num_verifications)
  {
    if (num_verifications > num_verifications_)
    {
      freeCudaPtr(outputs_d_);
      freeCudaPtr(sampling_indices_d_);
      freeCudaPtr(times_d_);
      freeCudaPtr(distribution_indices_d_);
      freeCudaPtr(controls_d_);
      freeCudaPtr(costs_d_);
      // Allocate GPU memory
      HANDLE_ERROR(cudaMalloc((void**)&sampling_indices_d_, sizeof(int) * num_verifications));
      // NOTE(review): the extracted text contained the mojibake "×_d_" here (HTML
      // "&times;"); restored to "&times_d_" to match the surrounding allocations.
      HANDLE_ERROR(cudaMalloc((void**)&times_d_, sizeof(int) * num_verifications));
      HANDLE_ERROR(cudaMalloc((void**)&distribution_indices_d_, sizeof(int) * num_verifications));
      HANDLE_ERROR(cudaMalloc((void**)&outputs_d_, sizeof(float) * OUTPUT_DIM * num_verifications));
      HANDLE_ERROR(cudaMalloc((void**)&controls_d_, sizeof(float) * SAMPLER_T::CONTROL_DIM * num_verifications));
      HANDLE_ERROR(cudaMalloc((void**)&costs_d_, sizeof(float) * num_verifications));
      num_verifications_ = num_verifications;
    }
  }
  // Draws unique flattened (sample, time, distribution) tuples, uploads them with
  // random outputs, pushes a random mean into every distribution, generates samples,
  // and applies one distribution-parameter update.
  void generateNewSamples()
  {
    // Create sample index
    std::vector samples =
        mppi::math::sample_without_replacement(num_verifications_, num_timesteps_ * num_samples_ * num_distributions_);
    // Fill in sampled indices and copy to GPU
    sampled_indices_.resize(num_verifications_);
    sampled_times_.resize(num_verifications_);
    sampled_distributions_.resize(num_verifications_);
    sampled_outputs_.resize(num_verifications_);
    for (int i = 0; i < num_verifications_; i++)
    {
      // Decode the flattened index into (rollout, timestep, distribution).
      const int sample = samples[i];
      sampled_indices_[i] = (sample / (num_timesteps_ * num_distributions_)) % num_samples_;
      sampled_times_[i] = (sample / num_distributions_) % num_timesteps_;
      sampled_distributions_[i] = sample % num_distributions_;
      sampled_outputs_[i] = output_array::Random();
      HANDLE_ERROR(cudaMemcpyAsync(outputs_d_ + i * OUTPUT_DIM, sampled_outputs_[i].data(), sizeof(float) * OUTPUT_DIM,
                                   cudaMemcpyHostToDevice, stream_));
    }
    HANDLE_ERROR(cudaMemcpyAsync(sampling_indices_d_, sampled_indices_.data(), sizeof(int) * num_verifications_,
                                 cudaMemcpyHostToDevice, stream_));
    HANDLE_ERROR(cudaMemcpyAsync(times_d_, sampled_times_.data(), sizeof(int) * num_verifications_,
                                 cudaMemcpyHostToDevice, stream_));
    HANDLE_ERROR(cudaMemcpyAsync(distribution_indices_d_, sampled_distributions_.data(),
                                 sizeof(int) * num_verifications_, cudaMemcpyHostToDevice, stream_));
    // Create non-zero mean
    Eigen::MatrixXf random_mean = Eigen::MatrixXf::Random(SAMPLER_T::CONTROL_DIM, num_timesteps_);
    for (int i = 0; i < num_distributions_; i++)
    {
      sampler_->copyImportanceSamplerToDevice(random_mean.data(), i, false);
    }
    float mean_update = 1.0f;
    sampled_controls_cpu_.resize(num_verifications_);
    sampled_controls_gpu_.resize(num_verifications_);
    sampler_->generateSamples(0, 0, gen_, false);
    // costs_d_ is reused here as a one-float weight buffer for the update call.
    HANDLE_ERROR(cudaMemcpyAsync(costs_d_, &mean_update, sizeof(float), cudaMemcpyHostToDevice, stream_));
    for (int i = 0; i < num_distributions_; i++)
    {
      sampler_->updateDistributionParamsFromDevice(costs_d_, 1.0f, i, false);
    }
    HANDLE_ERROR(cudaStreamSynchronize(stream_));
  }
  // Pushes the fixture's sizes into the sampler.
  void updateSampler()
  {
    sampler_->setNumTimesteps(num_timesteps_);
    sampler_->setNumRollouts(num_samples_);
    sampler_->setNumDistributions(num_distributions_);
  }
  void TearDown() override
  {
    HANDLE_ERROR(cudaStreamSynchronize(stream_));
    freeCudaPtr(sampling_indices_d_);
    freeCudaPtr(times_d_);
    freeCudaPtr(distribution_indices_d_);
    freeCudaPtr(outputs_d_);
    freeCudaPtr(controls_d_);
    freeCudaPtr(costs_d_);
  }
};
/**
 * Special setup for Gaussian-based distributions
 **/
// Primary template: not Gaussian-derived; its call operator is a no-op setup.
template
struct inherited_from_gaussian : std::false_type
{
public:
  void operator()(std::shared_ptr sampler) const
  {
    // Empty setup
  }
};
// SFINAE specialization (via std::void_t) for Gaussian-derived samplers: sets control
// cost coefficients and per-distribution std devs. NOTE(review): the chunk ends here
// mid-definition; the remainder lies outside this extract.
template
struct inherited_from_gaussian<
    T, std::void_t, typename T::SAMPLING_PARAMS_T>::value, bool>::type>> : std::true_type
{
public:
  void operator()(std::shared_ptr sampler) const
  {
    // Setup std dev and cost coefficients
    auto params = sampler->getParams();
    for (int i = 0; i < T::CONTROL_DIM; i++)
    {
      params.control_cost_coeff[i] = 1.0f;
    }
    for (int dist_i = 0; dist_i < params.num_distributions; dist_i++)
    {
      for (int std_dev_i = 0; std_dev_i < T::CONTROL_DIM; std_dev_i++)
      {
        params.std_dev[std_dev_i + dist_i * T::CONTROL_DIM] = 2.0f;
      }
    }
sampler->setParams(params); } }; using TYPE_TESTS = ::testing::Types< ms::GaussianDistribution>, ms::GaussianDistribution>, ms::GaussianDistribution>, ms::GaussianDistribution>, ms::GaussianDistribution>, ms::ColoredNoiseDistribution>, ms::ColoredNoiseDistribution>, ms::ColoredNoiseDistribution>, ms::ColoredNoiseDistribution>, ms::NLNDistribution>, ms::NLNDistribution>, ms::NLNDistribution>, ms::NLNDistribution>>; TYPED_TEST_SUITE(SamplingDistributionTests, TYPE_TESTS); TYPED_TEST(SamplingDistributionTests, TestCreation) { using T = TypeParam; EXPECT_TRUE(true); // testMethod>(); inherited_from_gaussian()(this->sampler_); } TYPED_TEST(SamplingDistributionTests, SetNumDistributions) { using T = TypeParam; this->num_distributions_ = 2; this->updateSampler(); this->generateNewSamples(); EXPECT_TRUE(true); } TYPED_TEST(SamplingDistributionTests, ReadControlsFromGPU) { using T = TypeParam; this->updateSampler(); this->generateNewSamples(); dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1); std::size_t shared_mem_bytes = mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) + sizeof(float) * this->OUTPUT_DIM * this->thread_block_.x; getControlSamplesKernel<<thread_block_, shared_mem_bytes, this->stream_>>>( this->sampler_->sampling_d_, this->num_verifications_, this->sampling_indices_d_, this->times_d_, this->distribution_indices_d_, this->outputs_d_, this->controls_d_); HANDLE_ERROR(cudaGetLastError()); // Copy back to CPU for (int i = 0; i < this->num_verifications_; i++) { HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM, sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); // Query on CPU float* u_i = this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i], this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data()); 
HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); } HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); for (int i = 0; i < this->num_verifications_; i++) { float diff = (this->sampled_controls_cpu_[i] - this->sampled_controls_gpu_[i]).array().abs().sum(); ASSERT_FLOAT_EQ(diff, 0.0f); } } TYPED_TEST(SamplingDistributionTests, ReadControlsFromGPUMoreDistributions) { using T = TypeParam; this->num_distributions_ = 2; this->updateSampler(); this->generateNewSamples(); dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1); std::size_t shared_mem_bytes = mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) + sizeof(float) * this->OUTPUT_DIM * this->thread_block_.x; getControlSamplesKernel<<thread_block_, shared_mem_bytes, this->stream_>>>( this->sampler_->sampling_d_, this->num_verifications_, this->sampling_indices_d_, this->times_d_, this->distribution_indices_d_, this->outputs_d_, this->controls_d_); HANDLE_ERROR(cudaGetLastError()); // Copy back to CPU for (int i = 0; i < this->num_verifications_; i++) { HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM, sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); // Query on CPU float* u_i = this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i], this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data()); HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); } HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); for (int i = 0; i < this->num_verifications_; i++) { float diff = (this->sampled_controls_cpu_[i] - this->sampled_controls_gpu_[i]).array().abs().sum(); ASSERT_FLOAT_EQ(diff, 0.0f); } } TYPED_TEST(SamplingDistributionTests, 
ReadControlsFromGPUDifferentNoiseForDistributions) { using T = TypeParam; this->num_distributions_ = 2; auto params = this->sampler_->getParams(); params.use_same_noise_for_all_distributions = false; this->sampler_->setParams(params); this->updateSampler(); this->generateNewSamples(); dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1); std::size_t shared_mem_bytes = mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) + sizeof(float) * this->OUTPUT_DIM * this->thread_block_.x; getControlSamplesKernel<<thread_block_, shared_mem_bytes, this->stream_>>>( this->sampler_->sampling_d_, this->num_verifications_, this->sampling_indices_d_, this->times_d_, this->distribution_indices_d_, this->outputs_d_, this->controls_d_); HANDLE_ERROR(cudaGetLastError()); // Copy back to CPU for (int i = 0; i < this->num_verifications_; i++) { HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM, sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); // Query on CPU float* u_i = this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i], this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data()); HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); } HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); for (int i = 0; i < this->num_verifications_; i++) { float diff = (this->sampled_controls_cpu_[i] - this->sampled_controls_gpu_[i]).array().abs().sum(); ASSERT_FLOAT_EQ(diff, 0.0f); } } TYPED_TEST(SamplingDistributionTests, CompareLikelihoodRatioCostsCPUvsGPU) { using T = TypeParam; this->num_distributions_ = 2; auto params = this->sampler_->getParams(); params.use_same_noise_for_all_distributions = false; this->sampler_->setParams(params); inherited_from_gaussian()(this->sampler_); this->updateSampler(); 
this->generateNewSamples(); dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1); std::size_t shared_mem_bytes = mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) + sizeof(float) * (this->OUTPUT_DIM * this->thread_block_.x + this->thread_block_.x * this->thread_block_.y * this->thread_block_.z); getLikelihoodCostKernel<<thread_block_, shared_mem_bytes, this->stream_>>>( this->sampler_->sampling_d_, this->num_verifications_, this->lambda_, this->alpha_, this->sampling_indices_d_, this->times_d_, this->distribution_indices_d_, this->outputs_d_, this->controls_d_, this->costs_d_); HANDLE_ERROR(cudaGetLastError()); std::vector costs_gpu(this->num_verifications_); // Copy back to CPU for (int i = 0; i < this->num_verifications_; i++) { HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM, sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); // Query on CPU float* u_i = this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i], this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data()); HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); } HANDLE_ERROR(cudaMemcpyAsync(costs_gpu.data(), this->costs_d_, sizeof(float) * this->num_verifications_, cudaMemcpyDeviceToHost, this->stream_)); HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); float cost; for (int i = 2; i < this->num_verifications_; i++) { cost = 0.0f; cost = this->sampler_->computeLikelihoodRatioCost(this->sampled_controls_cpu_[i], this->sampled_indices_[i], this->sampled_times_[i], this->sampled_distributions_[i], this->lambda_, this->alpha_); ASSERT_NEAR(cost, costs_gpu[i], fabsf(cost) * 5e-5) << " failed on sample " << this->sampled_indices_[i] << ", time " << this->sampled_times_[i] << ", dist: " << this->sampled_distributions_[i]; } 
} TYPED_TEST(SamplingDistributionTests, CompareFeedbackCostsCPUvsGPU) { using T = TypeParam; this->num_distributions_ = 2; auto params = this->sampler_->getParams(); params.use_same_noise_for_all_distributions = false; this->sampler_->setParams(params); inherited_from_gaussian()(this->sampler_); this->updateSampler(); this->generateNewSamples(); dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1); std::size_t shared_mem_bytes = mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) + sizeof(float) * (this->OUTPUT_DIM * this->thread_block_.x + this->thread_block_.x * this->thread_block_.y * this->thread_block_.z); getFeedbackCostKernel<<thread_block_, shared_mem_bytes, this->stream_>>>( this->sampler_->sampling_d_, this->num_verifications_, this->lambda_, this->alpha_, this->sampling_indices_d_, this->times_d_, this->distribution_indices_d_, this->outputs_d_, this->controls_d_, this->costs_d_); HANDLE_ERROR(cudaGetLastError()); std::vector costs_gpu(this->num_verifications_); // Copy back to CPU for (int i = 0; i < this->num_verifications_; i++) { HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM, sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); // Query on CPU float* u_i = this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i], this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data()); HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_)); } HANDLE_ERROR(cudaMemcpyAsync(costs_gpu.data(), this->costs_d_, sizeof(float) * this->num_verifications_, cudaMemcpyDeviceToHost, this->stream_)); HANDLE_ERROR(cudaStreamSynchronize(this->stream_)); float cost; for (int i = 0; i < this->num_verifications_; i++) { cost = 0.0f; cost = this->sampler_->computeFeedbackCost(this->sampled_controls_cpu_[i], 
this->sampled_times_[i], this->sampled_distributions_[i], this->lambda_, this->alpha_); ASSERT_FLOAT_EQ(cost, costs_gpu[i]) << " failed on sample " << this->sampled_indices_[i] << ", time " << this->sampled_times_[i] << ", dist: " << this->sampled_distributions_[i]; } } ================================================ FILE: tests/shaping_functions/CMakeLists.txt ================================================ # Rollout kernel test suite set(SHAPING_FUNCTION_TARGET_NAME shaping_function_tests) add_executable(${SHAPING_FUNCTION_TARGET_NAME} ${PROJECT_SOURCE_DIR}/tests/test_main.cpp shaping_function_test.cu) target_link_libraries(${SHAPING_FUNCTION_TARGET_NAME} gtest gmock gtest_main ${MPPI_HEADER_LIBRARY_NAME}) gtest_discover_tests(${SHAPING_FUNCTION_TARGET_NAME}) set_target_properties(${SHAPING_FUNCTION_TARGET_NAME} PROPERTIES FOLDER test) # Rollout kernel test suite set(CEM_SHAPING_FUNCTION_TARGET_NAME cem_shaping_function_tests) add_executable(${CEM_SHAPING_FUNCTION_TARGET_NAME} ${PROJECT_SOURCE_DIR}/tests/test_main.cpp cem_shaping_function_test.cu) target_link_libraries(${CEM_SHAPING_FUNCTION_TARGET_NAME} gtest gmock gtest_main ${MPPI_HEADER_LIBRARY_NAME}) gtest_discover_tests(${CEM_SHAPING_FUNCTION_TARGET_NAME}) set_target_properties(${CEM_SHAPING_FUNCTION_TARGET_NAME} PROPERTIES FOLDER test) ================================================ FILE: tests/shaping_functions/cem_shaping_function_test.cu ================================================ #include #include #include #include #include #include #include #include class CEMShapingFunctionTest : public testing::Test { protected: void SetUp() override { generator = std::default_random_engine(7.0); distribution = std::normal_distribution(100.0, 2.0); } void TearDown() override { } std::default_random_engine generator; std::normal_distribution distribution; }; TEST_F(CEMShapingFunctionTest, computeWeightTest) { const int num_rollouts = 500; std::array cost_vec = { 0 }; CEMShapingFunctionParams params; 
CEMShapingFunction shaping_function; // Use a range based for loop to set the cost for (auto& cost : cost_vec) { cost = distribution(generator); } float target_cost = 99; EXPECT_FLOAT_EQ(shaping_function.getNormalizer(), 1.0); for (float gamma = 0.0; gamma <= 1.0; gamma += 0.1) { params.gamma = gamma; shaping_function.setParams(params); // int index = (int) num_rollouts*gamma; for (int i = 0; i < cost_vec.size(); i++) { float weight = shaping_function.computeWeight(cost_vec.data(), target_cost, i); if (cost_vec[i] > target_cost) { EXPECT_FLOAT_EQ(weight, 1.0 / ((int)num_rollouts * params.gamma + 1)); } else { EXPECT_FLOAT_EQ(weight, 0.0); } } } } TEST_F(CEMShapingFunctionTest, weightKernelTest) { const int num_rollouts = 500; std::array cost_vec = { 0 }; std::array result_cost_vec = { 0 }; CEMShapingFunctionParams params; CEMShapingFunction shaping_function; shaping_function.GPUSetup(); // Use a range based for loop to set the cost for (auto& cost : cost_vec) { cost = distribution(generator); } float baseline = 99; for (float gamma = 0.0; gamma <= 1.0; gamma += 0.1) { params.gamma = gamma; shaping_function.setParams(params); launchShapingFunction_KernelTest, num_rollouts, 1>(cost_vec, shaping_function, baseline, result_cost_vec); for (int i = 0; i < cost_vec.size(); i++) { if (cost_vec[i] > baseline) { EXPECT_FLOAT_EQ(result_cost_vec[i], 1.0 / ((int)num_rollouts * params.gamma + 1)); } else { EXPECT_FLOAT_EQ(result_cost_vec[i], 0.0); } } } } /* TEST_F(ShapingFunctionTest, launchWeightKernelTest) { const int num_rollouts = 500; ShapingFunction::cost_traj cost_traj; std::array result_cost_vec = {0}; ShapingFunctionParams params; ShapingFunction shaping_function; shaping_function.GPUSetup(); // Use a range based for loop to set the cost cost_traj = ShapingFunction::cost_traj::Zero(); for (int i = 0; i < num_rollouts; i++) { cost_traj(i) = distribution(generator); } float min_cost_known = cost_traj.minCoeff(); for (float lambda_inv = 0.01; lambda_inv < 3.0; lambda_inv 
+= 0.1) { params.lambda_inv = lambda_inv; shaping_function.setParams(params); launchShapingFunction_KernelTest, num_rollouts>(cost_traj, shaping_function, min_cost_known, result_cost_vec); for (int i = 0; i < result_cost_vec.size(); i++) { EXPECT_FLOAT_EQ(result_cost_vec[i], expf(-lambda_inv * (cost_traj(i) - min_cost_known))); } } } */ TEST_F(CEMShapingFunctionTest, computeWeightsTest) { const int num_rollouts = 500; CEMShapingFunction::cost_traj cost_traj; CEMShapingFunction::cost_traj cost_traj_copy; CEMShapingFunctionParams params; CEMShapingFunction shaping_function; shaping_function.GPUSetup(); // Use a range based for loop to set the cost cost_traj = ShapingFunction::cost_traj::Zero(); cost_traj_copy = ShapingFunction::cost_traj::Zero(); const float gamma_total = 1.0; const float gamma_inc = 0.1; const float gamma_start = 0.0; int index = 0; for (float gamma = gamma_start; gamma <= gamma_total; gamma += gamma_inc) { // create the random vector for (int i = 0; i < num_rollouts; i++) { cost_traj(i) = distribution(generator); } // find the correct costs for the baseline std::sort(cost_traj.data(), cost_traj.data() + num_rollouts, std::greater()); float pivot = cost_traj[gamma * num_rollouts]; std::random_shuffle(cost_traj.data(), cost_traj.data() + num_rollouts); for (int i = 0; i < num_rollouts; i++) { cost_traj_copy(i) = cost_traj(i); } float* trajectory_costs_d; HANDLE_ERROR(cudaMalloc((void**)&trajectory_costs_d, sizeof(float) * num_rollouts)); HANDLE_ERROR( cudaMemcpy(trajectory_costs_d, cost_traj.data(), sizeof(float) * cost_traj.size(), cudaMemcpyHostToDevice)) params.gamma = gamma; shaping_function.setParams(params); shaping_function.computeWeights(cost_traj, trajectory_costs_d); EXPECT_FLOAT_EQ(shaping_function.getBaseline(), pivot); if (gamma == 0) { EXPECT_FLOAT_EQ(shaping_function.getBaseline(), cost_traj_copy.maxCoeff()); } if (gamma == 1.0) { EXPECT_FLOAT_EQ(shaping_function.getBaseline(), cost_traj_copy.minCoeff()); } 
EXPECT_EQ(shaping_function.getNormalizer(), 1.0); EXPECT_NEAR(cost_traj.sum(), 1.0, 1e-5) << cost_traj.sum(); for (int i = 0; i < cost_traj.size(); i++) { if (cost_traj_copy(i) >= pivot) { EXPECT_FLOAT_EQ(cost_traj(i), 1.0 / ((int)num_rollouts * params.gamma + 1)); } else { EXPECT_FLOAT_EQ(cost_traj(i), 0.0); } } index++; } } ================================================ FILE: tests/shaping_functions/shaping_function_test.cu ================================================ #include #include #include #include #include #include #include class ShapingFunctionTest : public testing::Test { protected: void SetUp() override { generator = std::default_random_engine(7.0); distribution = std::normal_distribution(100.0, 2.0); } void TearDown() override { } std::default_random_engine generator; std::normal_distribution distribution; }; TEST_F(ShapingFunctionTest, computeWeightTest) { const int num_rollouts = 500; std::array cost_vec = { 0 }; ShapingFunctionParams params; ShapingFunction shaping_function; // Use a range based for loop to set the cost for (auto& cost : cost_vec) { cost = distribution(generator); } float min_cost_known = *std::min_element(cost_vec.begin(), cost_vec.end()); for (float lambda_inv = 0.0; lambda_inv < 3.0; lambda_inv += 0.1) { params.lambda_inv = lambda_inv; shaping_function.setParams(params); for (int i = 0; i < cost_vec.size(); i++) { float weight = shaping_function.computeWeight(cost_vec.data(), min_cost_known, i); EXPECT_FLOAT_EQ(weight, expf(-lambda_inv * (cost_vec[i] - min_cost_known))); } } } TEST_F(ShapingFunctionTest, weightKernelTest) { const int num_rollouts = 500; std::array cost_vec = { 0 }; std::array result_cost_vec = { 0 }; ShapingFunctionParams params; ShapingFunction shaping_function; shaping_function.GPUSetup(); // Use a range based for loop to set the cost for (auto& cost : cost_vec) { cost = distribution(generator); } float min_cost_known = *std::min_element(cost_vec.begin(), cost_vec.end()); for (float lambda_inv = 0.01; 
lambda_inv < 3.0; lambda_inv += 0.1) { params.lambda_inv = lambda_inv; shaping_function.setParams(params); launchShapingFunction_KernelTest, num_rollouts, 1>( cost_vec, shaping_function, min_cost_known, result_cost_vec); for (int i = 0; i < cost_vec.size(); i++) { EXPECT_FLOAT_EQ(result_cost_vec[i], expf(-lambda_inv * (cost_vec[i] - min_cost_known))); } } } TEST_F(ShapingFunctionTest, launchWeightKernelTest) { const int num_rollouts = 500; ShapingFunction::cost_traj cost_traj; std::array result_cost_vec = { 0 }; ShapingFunctionParams params; ShapingFunction shaping_function; shaping_function.GPUSetup(); // Use a range based for loop to set the cost cost_traj = ShapingFunction::cost_traj::Zero(); for (int i = 0; i < num_rollouts; i++) { cost_traj(i) = distribution(generator); } float min_cost_known = cost_traj.minCoeff(); for (float lambda_inv = 0.01; lambda_inv < 3.0; lambda_inv += 0.1) { params.lambda_inv = lambda_inv; shaping_function.setParams(params); launchShapingFunction_KernelTest, num_rollouts>(cost_traj, shaping_function, min_cost_known, result_cost_vec); for (int i = 0; i < result_cost_vec.size(); i++) { EXPECT_FLOAT_EQ(result_cost_vec[i], expf(-lambda_inv * (cost_traj(i) - min_cost_known))); } } } TEST_F(ShapingFunctionTest, computeWeightsTest) { const int num_rollouts = 500; ShapingFunction::cost_traj cost_traj; ShapingFunction::cost_traj result_cost_traj; ShapingFunctionParams params; ShapingFunction shaping_function; shaping_function.GPUSetup(); // Use a range based for loop to set the cost cost_traj = ShapingFunction::cost_traj::Zero(); for (int i = 0; i < num_rollouts; i++) { cost_traj(i) = distribution(generator); result_cost_traj(i) = cost_traj(i); } float* trajectory_costs_d; HANDLE_ERROR(cudaMalloc((void**)&trajectory_costs_d, sizeof(float) * num_rollouts)); HANDLE_ERROR( cudaMemcpy(trajectory_costs_d, cost_traj.data(), sizeof(float) * cost_traj.size(), cudaMemcpyHostToDevice)) for (float lambda_inv = 0.01; lambda_inv < 3.0; lambda_inv += 0.1) { 
float min_cost_known = cost_traj.minCoeff(); params.lambda_inv = lambda_inv; shaping_function.setParams(params); shaping_function.computeWeights(cost_traj, trajectory_costs_d); float normalizer = cost_traj.sum(); EXPECT_EQ(shaping_function.getBaseline(), min_cost_known); EXPECT_EQ(shaping_function.getNormalizer(), normalizer); for (int i = 0; i < cost_traj.size(); i++) { EXPECT_FLOAT_EQ(cost_traj(i), expf(-lambda_inv * (result_cost_traj(i) - min_cost_known))); result_cost_traj(i) = cost_traj(i); } } } ================================================ FILE: tests/templated_headers/autorally_test_map.h.in ================================================ #ifndef TESTS_COST_FUNCTIONS_AUTORALLY_TEST_MAP_H_ #define TESTS_COST_FUNCTIONS_AUTORALLY_TEST_MAP_H_ #include namespace mppi { namespace tests { const std::string test_map_file = "@TEST_MAP_FOLDER@/track_map.npz"; const std::string standard_test_map_file = "@TEST_MAP_FOLDER@/track_map_standard.npz"; const std::string robust_test_map_file = "@TEST_MAP_FOLDER@/track_map_robust.npz"; const std::string ccrf_map = "@PROJECT_SOURCE_DIR@/resources/ccrf_track.npz"; } } #endif // TESTS_COST_FUNCTIONS_AUTORALLY_TEST_MAP_H_ ================================================ FILE: tests/templated_headers/autorally_test_network.h.in ================================================ #ifndef TESTS_COST_FUNCTIONS_AUTORALLY_TEST_NETWORK_H_ #define TESTS_COST_FUNCTIONS_AUTORALLY_TEST_NETWORK_H_ #include namespace mppi { namespace tests { const std::string test_load_nn_file = "@TEST_NETWORK_FOLDER@/neuralNetLoadTest.npz"; const std::string test_compute_nn_file = "@TEST_NETWORK_FOLDER@/neuralNetComputeTest.npz"; const std::string old_autorally_network_file = "@PROJECT_SOURCE_DIR@/resources/autorally_nnet_09_12_2018.npz"; // const std::string autorally_lstm_network_file = "@PROJECT_SOURCE_DIR@/resources/integrator_FNN_LSTM_25_step_10_pre_no_out_20_1.npz"; // const std::string autorally_lstm_network_file = 
"@PROJECT_SOURCE_DIR@/resources/bogdan.npz"; const std::string autorally_lstm_network_file = "@PROJECT_SOURCE_DIR@/resources/lstm.npz"; const std::string autorally_hidden_network_file = "@PROJECT_SOURCE_DIR@/resources/hidden_init.npz"; const std::string autorally_cell_network_file = "@PROJECT_SOURCE_DIR@/resources/cell_init.npz"; const std::string autorally_output_network_file = "@PROJECT_SOURCE_DIR@/resources/output.npz"; } } #endif // TESTS_COST_FUNCTIONS_AUTORALLY_TEST_NETWORK_H_ ================================================ FILE: tests/templated_headers/racer_test_networks.h.in ================================================ #ifndef TESTS_COST_FUNCTIONS_AUTORALLY_TEST_NETWORK_H_ #define TESTS_COST_FUNCTIONS_AUTORALLY_TEST_NETWORK_H_ #include namespace mppi { namespace tests { //const std::string steering_lstm = "@PROJECT_SOURCE_DIR@/resources/lstm_lstm_steering.npz"; const std::string steering_lstm = "@PROJECT_SOURCE_DIR@/resources/lstm_lstm_steering_accel.npz"; const std::string brake_delay_lstm = "@PROJECT_SOURCE_DIR@/resources/lstm_lstm_brake_delay.npz"; const std::string ackerman_lstm = "@PROJECT_SOURCE_DIR@/resources/lstm_lstm_ackerman.npz"; const std::string ackerman_test = "@PROJECT_SOURCE_DIR@/resources/ackerman_test.npz"; const std::string bicycle_slip_hybrid_test = "@PROJECT_SOURCE_DIR@/resources/bicycle_slip_hybrid_test.npz"; const std::string bicycle_slip_hybrid_true = "@PROJECT_SOURCE_DIR@/resources/bicycle_slip_hybrid.npz"; const std::string bicycle_slip_kinematic_test = "@PROJECT_SOURCE_DIR@/resources/bicycle_slip_kinematic_test.npz"; const std::string bicycle_slip_kinematic_true = "@PROJECT_SOURCE_DIR@/resources/lstm_lstm_bicycle_slip_kinematic.npz"; const std::string racer_dubins_elevation_uncertainty_test = "@PROJECT_SOURCE_DIR@/resources/network_rde_test.npz"; } } #endif // TESTS_COST_FUNCTIONS_AUTORALLY_TEST_NETWORK_H_ ================================================ FILE: tests/templated_headers/test_networks.h.in 
================================================ #ifndef TESTS_COST_FUNCTIONS_AUTORALLY_TEST_NETWORK_H_ #define TESTS_COST_FUNCTIONS_AUTORALLY_TEST_NETWORK_H_ #include namespace mppi { namespace tests { const std::string test_load_nn_file = "@TEST_NETWORK_FOLDER@/neuralNetLoadTest.npz"; const std::string test_lstm_lstm = "@PROJECT_SOURCE_DIR@/resources/lstm_lstm_test.npz"; } } #endif // TESTS_COST_FUNCTIONS_AUTORALLY_TEST_NETWORK_H_ ================================================ FILE: tests/test_main.cpp ================================================ // // Created by mgandhi3 on 11/18/19. // #include int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } ================================================ FILE: tests/texture_helpers/CMakeLists.txt ================================================ set(GTEST_LIBRARIES gtest gmock gtest_main) file(GLOB TARGET_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cu) foreach(T_FILE IN LISTS TARGET_SRCS) get_filename_component(T_NAME ${T_FILE} NAME_WE) add_executable(${T_NAME} ${PROJECT_SOURCE_DIR}/tests/test_main.cpp ${T_FILE}) target_link_libraries(${T_NAME} ${GTEST_LIBRARIES} ${MPPI_HEADER_LIBRARY_NAME}) gtest_discover_tests(${T_NAME}) set_target_properties(${T_NAME} PROPERTIES FOLDER test) endforeach() ================================================ FILE: tests/texture_helpers/texture_helper_test.cu ================================================ #include #include #include #include #include class TextureHelperTest : public testing::Test { protected: void SetUp() override { generator = std::default_random_engine(7.0); distribution = std::normal_distribution(100.0, 2.0); } void TearDown() override { } std::default_random_engine generator; std::normal_distribution distribution; }; class TextureHelperImpl : public TextureHelper { public: TextureHelperImpl(int number, cudaStream_t stream = 0) : TextureHelper(number, stream) { } ~TextureHelperImpl() { for (TextureParams& params : 
textures_) { params.allocated = false; } } std::vector> getTextures() { return textures_; } std::vector> getTexturesBuffer() { return textures_buffer_; } TextureParams* getParamsD() { return this->params_d_; } void setTestExtent(int index, cudaExtent extent) { this->textures_buffer_[index].extent = extent; this->textures_[index].extent = extent; } void setGpuMemStatus(bool flag) { this->GPUMemStatus_ = flag; } void setTestFlagsInParam(int index, bool update_mem, bool update_data, bool allocated, bool update_params) { this->textures_buffer_[index].update_mem = update_mem; this->textures_buffer_[index].update_data = update_data; this->textures_[index].allocated = allocated; this->textures_buffer_[index].update_params = update_params; } void copyDataToGPU(int index, bool sync = false) override { copyDataToGPUCalled++; this->textures_[index].update_data = false; } void allocateCudaTexture(int index) override { if (this->textures_[index].allocated) { freeCudaMem(this->textures_[index]); } allocateCudaTextureCalled++; } void createCudaTexture(int index, bool sync = false) override { createCudaTextureCalled++; this->textures_[index].allocated = true; this->textures_[index].update_mem = false; } void copyParamsToGPU(int index, bool sync = false) { copyParamsToGPUCalled++; this->textures_[index].update_params = false; } void clearCounters() { copyDataToGPUCalled = 0; allocateCudaTextureCalled = 0; createCudaTextureCalled = 0; copyParamsToGPUCalled = 0; freeCudaMemCalled = 0; } void freeCudaMem(TextureParams& texture) { texture.allocated = false; texture.use = false; freeCudaMemCalled++; } TextureParams* getTextureD() { return this->textures_d_; } TextureParams* getTextureDataPtr() { return this->textures_.data(); } int copyDataToGPUCalled = 0; int allocateCudaTextureCalled = 0; int createCudaTextureCalled = 0; int copyParamsToGPUCalled = 0; int freeCudaMemCalled = 0; float4 queryTexture(const int index, const float3& input) { float4 result; result.x = input.x; result.y = 
input.y; result.z = input.z; result.w = index; return result; } }; TEST_F(TextureHelperTest, Constructor) { int number = 5; TextureHelperImpl helper = TextureHelperImpl(number); std::vector> textures = helper.getTextures(); EXPECT_EQ(textures.size(), number); EXPECT_EQ(helper.getCpuValues().size(), number); EXPECT_EQ(helper.getCpuBufferValues().size(), number); EXPECT_NE(helper.getTextureD(), nullptr); EXPECT_NE(helper.getTextureDataPtr(), nullptr); EXPECT_EQ(helper.ptr_d_, nullptr); EXPECT_EQ(helper.getParamsD(), nullptr); EXPECT_EQ(helper.getTextureD(), helper.getTextureDataPtr()); for (const TextureParams texture : textures) { EXPECT_EQ(texture.use, false); EXPECT_EQ(texture.allocated, false); EXPECT_EQ(texture.update_data, false); EXPECT_EQ(texture.update_mem, false); EXPECT_EQ(texture.array_d, nullptr); EXPECT_EQ(texture.tex_d, 0); EXPECT_FLOAT_EQ(texture.origin.x, 0); EXPECT_FLOAT_EQ(texture.origin.y, 0); EXPECT_FLOAT_EQ(texture.origin.z, 0); EXPECT_FLOAT_EQ(texture.rotations[0].x, 1); EXPECT_FLOAT_EQ(texture.rotations[0].y, 0); EXPECT_FLOAT_EQ(texture.rotations[0].z, 0); EXPECT_FLOAT_EQ(texture.rotations[1].x, 0); EXPECT_FLOAT_EQ(texture.rotations[1].y, 1); EXPECT_FLOAT_EQ(texture.rotations[1].z, 0); EXPECT_FLOAT_EQ(texture.rotations[2].x, 0); EXPECT_FLOAT_EQ(texture.rotations[2].y, 0); EXPECT_FLOAT_EQ(texture.rotations[2].z, 1); EXPECT_EQ(texture.resDesc.resType, cudaResourceTypeArray); EXPECT_EQ(texture.channelDesc.x, 32); EXPECT_EQ(texture.channelDesc.y, 32); EXPECT_EQ(texture.channelDesc.z, 32); EXPECT_EQ(texture.channelDesc.w, 32); EXPECT_EQ(texture.texDesc.addressMode[0], cudaAddressModeClamp); EXPECT_EQ(texture.texDesc.addressMode[1], cudaAddressModeClamp); EXPECT_EQ(texture.texDesc.filterMode, cudaFilterModeLinear); EXPECT_EQ(texture.texDesc.readMode, cudaReadModeElementType); EXPECT_EQ(texture.texDesc.normalizedCoords, 1); } textures = helper.getTexturesBuffer(); EXPECT_EQ(textures.size(), number); EXPECT_NE(helper.getTextureD(), nullptr); 
EXPECT_NE(helper.getTextureDataPtr(), nullptr); EXPECT_EQ(helper.ptr_d_, nullptr); EXPECT_EQ(helper.getParamsD(), nullptr); EXPECT_EQ(helper.getTextureD(), helper.getTextureDataPtr()); for (const TextureParams texture : textures) { EXPECT_EQ(texture.use, false); EXPECT_EQ(texture.allocated, false); EXPECT_EQ(texture.update_data, false); EXPECT_EQ(texture.update_mem, false); EXPECT_EQ(texture.array_d, nullptr); EXPECT_EQ(texture.tex_d, 0); EXPECT_FLOAT_EQ(texture.origin.x, 0); EXPECT_FLOAT_EQ(texture.origin.y, 0); EXPECT_FLOAT_EQ(texture.origin.z, 0); EXPECT_FLOAT_EQ(texture.rotations[0].x, 1); EXPECT_FLOAT_EQ(texture.rotations[0].y, 0); EXPECT_FLOAT_EQ(texture.rotations[0].z, 0); EXPECT_FLOAT_EQ(texture.rotations[1].x, 0); EXPECT_FLOAT_EQ(texture.rotations[1].y, 1); EXPECT_FLOAT_EQ(texture.rotations[1].z, 0); EXPECT_FLOAT_EQ(texture.rotations[2].x, 0); EXPECT_FLOAT_EQ(texture.rotations[2].y, 0); EXPECT_FLOAT_EQ(texture.rotations[2].z, 1); EXPECT_EQ(texture.resDesc.resType, cudaResourceTypeArray); EXPECT_EQ(texture.channelDesc.x, 32); EXPECT_EQ(texture.channelDesc.y, 32); EXPECT_EQ(texture.channelDesc.z, 32); EXPECT_EQ(texture.channelDesc.w, 32); EXPECT_EQ(texture.texDesc.addressMode[0], cudaAddressModeClamp); EXPECT_EQ(texture.texDesc.addressMode[1], cudaAddressModeClamp); EXPECT_EQ(texture.texDesc.filterMode, cudaFilterModeLinear); EXPECT_EQ(texture.texDesc.readMode, cudaReadModeElementType); EXPECT_EQ(texture.texDesc.normalizedCoords, 1); } } TEST_F(TextureHelperTest, UpdateOriginTest) { int number = 5; TextureHelperImpl helper = TextureHelperImpl(number); helper.updateOrigin(0, make_float3(1, 2, 3)); helper.updateOrigin(1, make_float3(4, 5, 6)); std::vector> textures = helper.getTextures(); std::vector> textures_buffer = helper.getTexturesBuffer(); EXPECT_FLOAT_EQ(textures[0].origin.x, 0); EXPECT_FLOAT_EQ(textures[0].origin.y, 0); EXPECT_FLOAT_EQ(textures[0].origin.z, 0); EXPECT_FLOAT_EQ(textures_buffer[0].origin.x, 1); 
EXPECT_FLOAT_EQ(textures_buffer[0].origin.y, 2); EXPECT_FLOAT_EQ(textures_buffer[0].origin.z, 3); EXPECT_FLOAT_EQ(textures[1].origin.x, 0); EXPECT_FLOAT_EQ(textures[1].origin.y, 0); EXPECT_FLOAT_EQ(textures[1].origin.z, 0); EXPECT_FLOAT_EQ(textures_buffer[1].origin.x, 4); EXPECT_FLOAT_EQ(textures_buffer[1].origin.y, 5); EXPECT_FLOAT_EQ(textures_buffer[1].origin.z, 6); EXPECT_TRUE(textures_buffer[0].update_params); EXPECT_TRUE(textures_buffer[1].update_params); for (int i = 2; i < number; i++) { EXPECT_FALSE(textures_buffer[i].update_params); } } TEST_F(TextureHelperTest, UpdateRotationTest) { int number = 5; TextureHelperImpl helper = TextureHelperImpl(number); std::array new_rot_mat{}; new_rot_mat[0] = make_float3(1, 2, 3); new_rot_mat[1] = make_float3(4, 5, 6); new_rot_mat[2] = make_float3(7, 8, 9); helper.updateRotation(0, new_rot_mat); std::vector> textures = helper.getTextures(); std::vector> textures_buffer = helper.getTexturesBuffer(); // buffer is updated EXPECT_FLOAT_EQ(textures_buffer[0].rotations[0].x, 1); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[0].y, 2); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[0].z, 3); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[1].x, 4); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[1].y, 5); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[1].z, 6); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[2].x, 7); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[2].y, 8); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[2].z, 9); // non buffer not updated EXPECT_FLOAT_EQ(textures[0].rotations[0].x, 1); EXPECT_FLOAT_EQ(textures[0].rotations[0].y, 0); EXPECT_FLOAT_EQ(textures[0].rotations[0].z, 0); EXPECT_FLOAT_EQ(textures[0].rotations[1].x, 0); EXPECT_FLOAT_EQ(textures[0].rotations[1].y, 1); EXPECT_FLOAT_EQ(textures[0].rotations[1].z, 0); EXPECT_FLOAT_EQ(textures[0].rotations[2].x, 0); EXPECT_FLOAT_EQ(textures[0].rotations[2].y, 0); EXPECT_FLOAT_EQ(textures[0].rotations[2].z, 1); new_rot_mat[2].x = 22; helper.updateRotation(1, new_rot_mat); 
textures_buffer = helper.getTexturesBuffer(); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[0].x, 1); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[0].y, 2); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[0].z, 3); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[1].x, 4); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[1].y, 5); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[1].z, 6); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[2].x, 7); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[2].y, 8); EXPECT_FLOAT_EQ(textures_buffer[0].rotations[2].z, 9); EXPECT_FLOAT_EQ(textures_buffer[1].rotations[0].x, 1); EXPECT_FLOAT_EQ(textures_buffer[1].rotations[0].y, 2); EXPECT_FLOAT_EQ(textures_buffer[1].rotations[0].z, 3); EXPECT_FLOAT_EQ(textures_buffer[1].rotations[1].x, 4); EXPECT_FLOAT_EQ(textures_buffer[1].rotations[1].y, 5); EXPECT_FLOAT_EQ(textures_buffer[1].rotations[1].z, 6); EXPECT_FLOAT_EQ(textures_buffer[1].rotations[2].x, 22); EXPECT_FLOAT_EQ(textures_buffer[1].rotations[2].y, 8); EXPECT_FLOAT_EQ(textures_buffer[1].rotations[2].z, 9); } TEST_F(TextureHelperTest, SetExtentTest) { int number = 5; TextureHelperImpl helper = TextureHelperImpl(number); cudaExtent extent = make_cudaExtent(4, 5, 6); helper.setTestExtent(1, extent); helper.setExtent(0, extent); helper.setExtent(1, extent); extent = make_cudaExtent(4, 5, 0); helper.setExtent(2, extent); helper.setTestExtent(3, extent); helper.setExtent(3, extent); std::vector> textures = helper.getTextures(); std::vector> textures_buffer = helper.getTexturesBuffer(); EXPECT_TRUE(textures_buffer[0].update_mem); EXPECT_EQ(textures_buffer[0].extent.width, 4); EXPECT_EQ(textures_buffer[0].extent.height, 5); EXPECT_EQ(textures_buffer[0].extent.depth, 6); EXPECT_FALSE(textures[0].update_mem); EXPECT_NE(textures[0].extent.width, 4); EXPECT_NE(textures[0].extent.height, 5); EXPECT_NE(textures[0].extent.depth, 6); EXPECT_FALSE(textures_buffer[1].update_mem); EXPECT_EQ(textures_buffer[1].extent.width, 4); 
EXPECT_EQ(textures_buffer[1].extent.height, 5); EXPECT_EQ(textures_buffer[1].extent.depth, 6); EXPECT_TRUE(textures_buffer[2].update_mem); EXPECT_EQ(textures_buffer[2].extent.width, 4); EXPECT_EQ(textures_buffer[2].extent.height, 5); EXPECT_EQ(textures_buffer[2].extent.depth, 0); EXPECT_FALSE(textures_buffer[3].update_mem); EXPECT_EQ(textures_buffer[3].extent.width, 4); EXPECT_EQ(textures_buffer[3].extent.height, 5); EXPECT_EQ(textures_buffer[3].extent.depth, 0); } TEST_F(TextureHelperTest, CopyToDeviceTest) { int number = 16; TextureHelperImpl helper = TextureHelperImpl(number); helper.setTestFlagsInParam(0, false, false, false, false); helper.setTestFlagsInParam(1, false, false, false, true); helper.setTestFlagsInParam(2, false, false, true, false); helper.setTestFlagsInParam(3, false, false, true, true); helper.setTestFlagsInParam(4, false, true, false, false); helper.setTestFlagsInParam(5, false, true, false, true); helper.setTestFlagsInParam(6, false, true, true, false); helper.setTestFlagsInParam(7, false, true, true, true); helper.setTestFlagsInParam(8, true, false, false, false); helper.setTestFlagsInParam(9, true, false, false, true); helper.setTestFlagsInParam(10, true, false, true, false); helper.setTestFlagsInParam(11, true, false, true, true); helper.setTestFlagsInParam(12, true, true, false, false); helper.setTestFlagsInParam(13, true, true, false, true); helper.setTestFlagsInParam(14, true, true, true, false); helper.setTestFlagsInParam(15, true, true, true, true); // nothing should have happened since GPUMemStatus=False helper.copyToDevice(); std::vector> textures = helper.getTextures(); std::vector> textures_buffer = helper.getTexturesBuffer(); for (int i = 0; i < textures_buffer.size(); i++) { EXPECT_FALSE(textures_buffer[i].update_mem); EXPECT_FALSE(textures_buffer[i].update_data); EXPECT_FALSE(textures_buffer[i].allocated); EXPECT_FALSE(textures_buffer[i].update_params); } EXPECT_FALSE(textures[0].update_mem); 
EXPECT_FALSE(textures[0].update_data); EXPECT_FALSE(textures[0].allocated); EXPECT_FALSE(textures[0].update_params); EXPECT_FALSE(textures[1].update_mem); EXPECT_FALSE(textures[1].update_data); EXPECT_FALSE(textures[1].allocated); EXPECT_TRUE(textures[1].update_params); EXPECT_FALSE(textures[2].update_mem); EXPECT_FALSE(textures[2].update_data); EXPECT_TRUE(textures[2].allocated); EXPECT_FALSE(textures[2].update_params); EXPECT_FALSE(textures[3].update_mem); EXPECT_FALSE(textures[3].update_data); EXPECT_TRUE(textures[3].allocated); EXPECT_TRUE(textures[3].update_params); EXPECT_FALSE(textures[4].update_mem); EXPECT_TRUE(textures[4].update_data); EXPECT_FALSE(textures[4].allocated); EXPECT_FALSE(textures[4].update_params); EXPECT_FALSE(textures[5].update_mem); EXPECT_TRUE(textures[5].update_data); EXPECT_FALSE(textures[5].allocated); EXPECT_TRUE(textures[5].update_params); EXPECT_FALSE(textures[6].update_mem); EXPECT_TRUE(textures[6].update_data); EXPECT_TRUE(textures[6].allocated); EXPECT_FALSE(textures[6].update_params); EXPECT_FALSE(textures[7].update_mem); EXPECT_TRUE(textures[7].update_data); EXPECT_TRUE(textures[7].allocated); EXPECT_TRUE(textures[7].update_params); EXPECT_TRUE(textures[8].update_mem); EXPECT_FALSE(textures[8].update_data); EXPECT_FALSE(textures[8].allocated); EXPECT_FALSE(textures[8].update_params); EXPECT_TRUE(textures[9].update_mem); EXPECT_FALSE(textures[9].update_data); EXPECT_FALSE(textures[9].allocated); EXPECT_TRUE(textures[9].update_params); EXPECT_TRUE(textures[10].update_mem); EXPECT_FALSE(textures[10].update_data); EXPECT_TRUE(textures[10].allocated); EXPECT_FALSE(textures[10].update_params); EXPECT_TRUE(textures[11].update_mem); EXPECT_FALSE(textures[11].update_data); EXPECT_TRUE(textures[11].allocated); EXPECT_TRUE(textures[11].update_params); EXPECT_TRUE(textures[12].update_mem); EXPECT_TRUE(textures[12].update_data); EXPECT_FALSE(textures[12].allocated); EXPECT_FALSE(textures[12].update_params); 
EXPECT_TRUE(textures[13].update_mem); EXPECT_TRUE(textures[13].update_data); EXPECT_FALSE(textures[13].allocated); EXPECT_TRUE(textures[13].update_params); EXPECT_TRUE(textures[14].update_mem); EXPECT_TRUE(textures[14].update_data); EXPECT_TRUE(textures[14].allocated); EXPECT_FALSE(textures[14].update_params); EXPECT_TRUE(textures[15].update_mem); EXPECT_TRUE(textures[15].update_data); EXPECT_TRUE(textures[15].allocated); EXPECT_TRUE(textures[15].update_params); EXPECT_EQ(helper.copyDataToGPUCalled, 0); EXPECT_EQ(helper.allocateCudaTextureCalled, 0); EXPECT_EQ(helper.createCudaTextureCalled, 0); EXPECT_EQ(helper.copyParamsToGPUCalled, 0); EXPECT_EQ(helper.freeCudaMemCalled, 0); helper.clearCounters(); helper.setGpuMemStatus(true); helper.copyToDevice(); textures = helper.getTextures(); for (int i = 0; i < textures_buffer.size(); i++) { EXPECT_FALSE(textures_buffer[i].update_mem); EXPECT_FALSE(textures_buffer[i].update_data); EXPECT_FALSE(textures_buffer[i].allocated); EXPECT_FALSE(textures_buffer[i].update_params); } EXPECT_FALSE(textures[0].update_mem); EXPECT_FALSE(textures[0].update_data); EXPECT_FALSE(textures[0].allocated); EXPECT_FALSE(textures[0].update_params); EXPECT_FALSE(textures[1].update_mem); EXPECT_FALSE(textures[1].update_data); EXPECT_FALSE(textures[1].allocated); EXPECT_TRUE(textures[1].update_params); EXPECT_FALSE(textures[2].update_mem); EXPECT_FALSE(textures[2].update_data); EXPECT_TRUE(textures[2].allocated); EXPECT_FALSE(textures[2].update_params); EXPECT_FALSE(textures[3].update_mem); EXPECT_FALSE(textures[3].update_data); EXPECT_TRUE(textures[3].allocated); EXPECT_FALSE(textures[3].update_params); EXPECT_FALSE(textures[4].update_mem); EXPECT_TRUE(textures[4].update_data); EXPECT_FALSE(textures[4].allocated); EXPECT_FALSE(textures[4].update_params); EXPECT_FALSE(textures[5].update_mem); EXPECT_TRUE(textures[5].update_data); EXPECT_FALSE(textures[5].allocated); EXPECT_TRUE(textures[5].update_params); // false, true, true, false 
EXPECT_FALSE(textures[6].update_mem); EXPECT_FALSE(textures[6].update_data); EXPECT_TRUE(textures[6].allocated); EXPECT_FALSE(textures[6].update_params); EXPECT_FALSE(textures[7].update_mem); EXPECT_FALSE(textures[7].update_data); EXPECT_TRUE(textures[7].allocated); EXPECT_FALSE(textures[7].update_params); EXPECT_FALSE(textures[8].update_mem); EXPECT_FALSE(textures[8].update_data); EXPECT_TRUE(textures[8].allocated); EXPECT_FALSE(textures[8].update_params); // true, false, false, true EXPECT_FALSE(textures[9].update_mem); EXPECT_FALSE(textures[9].update_data); EXPECT_TRUE(textures[9].allocated); EXPECT_FALSE(textures[9].update_params); EXPECT_FALSE(textures[10].update_mem); EXPECT_FALSE(textures[10].update_data); EXPECT_TRUE(textures[10].allocated); EXPECT_FALSE(textures[10].update_params); EXPECT_FALSE(textures[11].update_mem); EXPECT_FALSE(textures[11].update_data); EXPECT_TRUE(textures[11].allocated); EXPECT_FALSE(textures[11].update_params); EXPECT_FALSE(textures[12].update_mem); EXPECT_FALSE(textures[12].update_data); EXPECT_TRUE(textures[12].allocated); EXPECT_FALSE(textures[12].update_params); EXPECT_FALSE(textures[13].update_mem); EXPECT_FALSE(textures[13].update_data); EXPECT_TRUE(textures[13].allocated); EXPECT_FALSE(textures[13].update_params); EXPECT_FALSE(textures[14].update_mem); EXPECT_FALSE(textures[14].update_data); EXPECT_TRUE(textures[14].allocated); EXPECT_FALSE(textures[14].update_params); EXPECT_FALSE(textures[15].update_mem); EXPECT_FALSE(textures[15].update_data); EXPECT_TRUE(textures[15].allocated); EXPECT_FALSE(textures[15].update_params); EXPECT_EQ(helper.copyDataToGPUCalled, 6); EXPECT_EQ(helper.allocateCudaTextureCalled, 8); EXPECT_EQ(helper.createCudaTextureCalled, 8); EXPECT_EQ(helper.copyParamsToGPUCalled, 6); EXPECT_EQ(helper.freeCudaMemCalled, 4); } TEST_F(TextureHelperTest, WorldPoseToMapPoseTest) { int number = 5; TextureHelperImpl helper = TextureHelperImpl(number); // set it to be zero std::array new_rot_mat{}; 
helper.updateRotation(0, new_rot_mat); helper.updateOrigin(0, make_float3(0, 0, 0)); float3 input = make_float3(0.1, 0.2, 0.3); float3 output; helper.worldPoseToMapPose(0, input, output); EXPECT_FLOAT_EQ(input.x, 0.1); EXPECT_FLOAT_EQ(input.y, 0.2); EXPECT_FLOAT_EQ(input.z, 0.3); EXPECT_FLOAT_EQ(output.x, 0.1); EXPECT_FLOAT_EQ(output.y, 0.2); EXPECT_FLOAT_EQ(output.z, 0.3); helper.copyToDevice(); helper.worldPoseToMapPose(0, input, output); EXPECT_FLOAT_EQ(input.x, 0.1); EXPECT_FLOAT_EQ(input.y, 0.2); EXPECT_FLOAT_EQ(input.z, 0.3); EXPECT_FLOAT_EQ(output.x, 0); EXPECT_FLOAT_EQ(output.y, 0); EXPECT_FLOAT_EQ(output.z, 0); // set rot mat non zero new_rot_mat[0] = make_float3(1, 2, 3); new_rot_mat[1] = make_float3(4, 5, 6); new_rot_mat[2] = make_float3(7, 8, 9); helper.updateRotation(0, new_rot_mat); helper.copyToDevice(); helper.worldPoseToMapPose(0, input, output); EXPECT_FLOAT_EQ(input.x, 0.1); EXPECT_FLOAT_EQ(input.y, 0.2); EXPECT_FLOAT_EQ(input.z, 0.3); EXPECT_FLOAT_EQ(output.x, 0.1 + 0.2 * 2 + 0.3 * 3); EXPECT_FLOAT_EQ(output.y, 0.1 * 4 + 0.2 * 5 + 0.3 * 6); EXPECT_FLOAT_EQ(output.z, 0.1 * 7 + 0.2 * 8 + 0.3 * 9); // set origin non zero new_rot_mat[0] = make_float3(0, 0, 0); new_rot_mat[1] = make_float3(0, 0, 0); new_rot_mat[2] = make_float3(0, 0, 0); helper.updateRotation(0, new_rot_mat); helper.updateOrigin(0, make_float3(0.1, 0.2, 0.3)); helper.copyToDevice(); helper.worldPoseToMapPose(0, input, output); EXPECT_FLOAT_EQ(input.x, 0.1); EXPECT_FLOAT_EQ(input.y, 0.2); EXPECT_FLOAT_EQ(input.z, 0.3); EXPECT_FLOAT_EQ(output.x, 0); EXPECT_FLOAT_EQ(output.y, 0); EXPECT_FLOAT_EQ(output.z, 0); // set rot mat non zero new_rot_mat[0] = make_float3(1, 2, 3); new_rot_mat[1] = make_float3(4, 5, 6); new_rot_mat[2] = make_float3(7, 8, 9); helper.updateRotation(0, new_rot_mat); helper.copyToDevice(); input = make_float3(0.2, 0.4, 0.6); helper.worldPoseToMapPose(0, input, output); EXPECT_FLOAT_EQ(input.x, 0.2); EXPECT_FLOAT_EQ(input.y, 0.4); EXPECT_FLOAT_EQ(input.z, 0.6); 
EXPECT_FLOAT_EQ(output.x, 0.1 + 0.2 * 2 + 0.3 * 3); EXPECT_FLOAT_EQ(output.y, 0.1 * 4 + 0.2 * 5 + 0.3 * 6); EXPECT_FLOAT_EQ(output.z, 0.1 * 7 + 0.2 * 8 + 0.3 * 9); } TEST_F(TextureHelperTest, BodyOffsetToWorldPoseTest) { int number = 5; TextureHelperImpl helper = TextureHelperImpl(number); // set it to be zero std::array new_rot_mat{}; helper.updateRotation(0, new_rot_mat); helper.updateOrigin(0, make_float3(0, 0, 0)); float3 input = make_float3(0.1, 0.2, 0.3); float3 offset = make_float3(1.1, 1.3, 1.5); float3 rotation = make_float3(0, 0, 0); float3 output; helper.bodyOffsetToWorldPose(offset, input, rotation, output); EXPECT_FLOAT_EQ(input.x, 0.1); EXPECT_FLOAT_EQ(input.y, 0.2); EXPECT_FLOAT_EQ(input.z, 0.3); EXPECT_FLOAT_EQ(output.x, 1.2); EXPECT_FLOAT_EQ(output.y, 1.5); EXPECT_FLOAT_EQ(output.z, 1.8); // rotate by positive 90 degrees yaw rotation = make_float3(0, 0, M_PI_2); helper.copyToDevice(); helper.bodyOffsetToWorldPose(offset, input, rotation, output); EXPECT_FLOAT_EQ(output.x, -1.2); EXPECT_FLOAT_EQ(output.y, 1.3); EXPECT_FLOAT_EQ(output.z, 1.8); // rotate by -90 degrees yaw rotation = make_float3(0, 0, -M_PI_2); helper.copyToDevice(); helper.bodyOffsetToWorldPose(offset, input, rotation, output); EXPECT_FLOAT_EQ(output.x, 0.1 + 1.3); EXPECT_FLOAT_EQ(output.y, 0.2 - 1.1); EXPECT_FLOAT_EQ(output.z, 1.8); // rotate by +45 degrees yaw rotation = make_float3(0, 0, M_PI_4); helper.copyToDevice(); helper.bodyOffsetToWorldPose(offset, input, rotation, output); // EXPECT_FLOAT_EQ(output.x, -0.041421525); EXPECT_NEAR(output.x, -0.041421525, 1.0e-6); EXPECT_FLOAT_EQ(output.y, 1.8970563); EXPECT_FLOAT_EQ(output.z, 1.8); // rotate by +90 degrees roll rotation = make_float3(M_PI_2, 0, 0); helper.copyToDevice(); helper.bodyOffsetToWorldPose(offset, input, rotation, output); EXPECT_FLOAT_EQ(output.x, 0.1 + 1.1); EXPECT_FLOAT_EQ(output.y, 0.2 - 1.5); EXPECT_FLOAT_EQ(output.z, 0.3 + 1.3); // rotate by -90 degrees roll rotation = make_float3(-M_PI_2, 0, 0); 
helper.copyToDevice(); helper.bodyOffsetToWorldPose(offset, input, rotation, output); EXPECT_FLOAT_EQ(output.x, 0.1 + 1.1); EXPECT_FLOAT_EQ(output.y, 0.2 + 1.5); EXPECT_FLOAT_EQ(output.z, 0.3 - 1.3); // rotate by +90 degrees roll rotation = make_float3(0, M_PI_2, 0); helper.copyToDevice(); helper.bodyOffsetToWorldPose(offset, input, rotation, output); EXPECT_FLOAT_EQ(output.x, 0.1 + 1.5); EXPECT_FLOAT_EQ(output.y, 0.2 + 1.3); EXPECT_FLOAT_EQ(output.z, 0.3 - 1.1); // rotate by -90 degrees roll rotation = make_float3(0, -M_PI_2, 0); helper.copyToDevice(); helper.bodyOffsetToWorldPose(offset, input, rotation, output); EXPECT_FLOAT_EQ(output.x, 0.1 - 1.5); EXPECT_FLOAT_EQ(output.y, 0.2 + 1.3); EXPECT_FLOAT_EQ(output.z, 0.3 + 1.1); } TEST_F(TextureHelperTest, MapPoseToTexCoordTest) { int number = 5; TextureHelperImpl helper = TextureHelperImpl(number); float3 input = make_float3(1.0, 2.0, 3.0); float3 output; helper.updateResolution(0, 10); cudaExtent extent = make_cudaExtent(4, 5, 6); helper.setExtent(0, extent); helper.copyToDevice(); helper.mapPoseToTexCoord(0, input, output); EXPECT_FLOAT_EQ(input.x, 1.0); EXPECT_FLOAT_EQ(input.y, 2.0); EXPECT_FLOAT_EQ(input.z, 3.0); EXPECT_FLOAT_EQ(output.x, (0.1) / 4); EXPECT_FLOAT_EQ(output.y, (0.2) / 5); EXPECT_FLOAT_EQ(output.z, (0.3) / 6); } TEST_F(TextureHelperTest, MapPoseToTexCoordIndResTest) { int number = 5; TextureHelperImpl helper = TextureHelperImpl(number); float3 input = make_float3(1.0, 2.0, 3.0); float3 output; float3 resolution = make_float3(10, 20, 30); helper.updateResolution(0, resolution); cudaExtent extent = make_cudaExtent(4, 5, 6); helper.setExtent(0, extent); helper.copyToDevice(); helper.mapPoseToTexCoord(0, input, output); EXPECT_FLOAT_EQ(input.x, 1.0); EXPECT_FLOAT_EQ(input.y, 2.0); EXPECT_FLOAT_EQ(input.z, 3.0); EXPECT_FLOAT_EQ(output.x, (0.1) / 4); EXPECT_FLOAT_EQ(output.y, (0.1) / 5); EXPECT_FLOAT_EQ(output.z, (0.1) / 6); } TEST_F(TextureHelperTest, WorldPoseToTexCoordTest) { int number = 5; 
TextureHelperImpl helper = TextureHelperImpl(number); float3 input = make_float3(0.2, 0.4, 0.6); float3 output; helper.updateResolution(0, 10); cudaExtent extent = make_cudaExtent(4, 5, 6); helper.setExtent(0, extent); std::array new_rot_mat{}; new_rot_mat[0] = make_float3(1, 2, 3); new_rot_mat[1] = make_float3(4, 5, 6); new_rot_mat[2] = make_float3(7, 8, 9); helper.updateRotation(0, new_rot_mat); helper.updateOrigin(0, make_float3(0.1, 0.2, 0.3)); helper.copyToDevice(); helper.worldPoseToTexCoord(0, input, output); EXPECT_FLOAT_EQ(input.x, 0.2); EXPECT_FLOAT_EQ(input.y, 0.4); EXPECT_FLOAT_EQ(input.z, 0.6); EXPECT_FLOAT_EQ(output.x, ((0.1 * 1 + 0.2 * 2 + 0.3 * 3) / 10) / 4); EXPECT_FLOAT_EQ(output.y, ((0.1 * 4 + 0.2 * 5 + 0.3 * 6) / 10) / 5); EXPECT_FLOAT_EQ(output.z, ((0.1 * 7 + 0.2 * 8 + 0.3 * 9) / 10) / 6); } TEST_F(TextureHelperTest, BodyOffsetToTexCoordTest) { int number = 5; TextureHelperImpl helper = TextureHelperImpl(number); const float3 input = make_float3(-0.9, -0.9, -0.9); float3 output; helper.updateResolution(0, 10); cudaExtent extent = make_cudaExtent(4, 5, 6); helper.setExtent(0, extent); std::array new_rot_mat{}; new_rot_mat[0] = make_float3(1, 2, 3); new_rot_mat[1] = make_float3(4, 5, 6); new_rot_mat[2] = make_float3(7, 8, 9); helper.updateRotation(0, new_rot_mat); helper.updateOrigin(0, make_float3(0.1, 0.2, 0.3)); helper.copyToDevice(); float3 offset = make_float3(1.1, 1.3, 1.5); float3 rotation = make_float3(0, 0, 0); helper.bodyOffsetWorldToTexCoord(0, offset, input, rotation, output); EXPECT_FLOAT_EQ(output.x, ((0.1 * 1 + 0.2 * 2 + 0.3 * 3) / 10) / 4); EXPECT_FLOAT_EQ(output.y, ((0.1 * 4 + 0.2 * 5 + 0.3 * 6) / 10) / 5); EXPECT_FLOAT_EQ(output.z, ((0.1 * 7 + 0.2 * 8 + 0.3 * 9) / 10) / 6); } TEST_F(TextureHelperTest, AddNewTextureTest) { int number = 8; TextureHelperImpl helper = TextureHelperImpl(number); cudaExtent extent = make_cudaExtent(4, 5, 6); helper.addNewTexture(extent); std::vector> textures = helper.getTextures(); 
EXPECT_EQ(textures.size(), number + 1); for (int i = 0; i < number + 1; i++) { auto texture = textures.at(i); EXPECT_EQ(texture.use, false); // since GPUMemStatus_ = false, this should always be false EXPECT_EQ(texture.allocated, false); EXPECT_EQ(texture.update_data, false); EXPECT_EQ(texture.update_mem, false); EXPECT_EQ(texture.resDesc.resType, cudaResourceTypeArray); EXPECT_EQ(texture.channelDesc.x, 32); EXPECT_EQ(texture.channelDesc.y, 32); EXPECT_EQ(texture.channelDesc.z, 32); EXPECT_EQ(texture.channelDesc.w, 32); EXPECT_EQ(texture.texDesc.addressMode[0], cudaAddressModeClamp); EXPECT_EQ(texture.texDesc.addressMode[1], cudaAddressModeClamp); EXPECT_EQ(texture.texDesc.filterMode, cudaFilterModeLinear); EXPECT_EQ(texture.texDesc.readMode, cudaReadModeElementType); EXPECT_EQ(texture.texDesc.normalizedCoords, 1); } EXPECT_EQ(textures[8].extent.width, 4); EXPECT_EQ(textures[8].extent.height, 5); EXPECT_EQ(textures[8].extent.depth, 6); helper.setGpuMemStatus(true); helper.addNewTexture(extent); textures = helper.getTextures(); EXPECT_EQ(textures.size(), number + 2); for (int i = 0; i < number + 2; i++) { auto texture = textures.at(i); EXPECT_EQ(texture.use, false); if (i == 9) { EXPECT_EQ(texture.allocated, true); } else { EXPECT_EQ(texture.allocated, false); } EXPECT_EQ(texture.update_data, false); EXPECT_EQ(texture.update_mem, false); EXPECT_EQ(texture.resDesc.resType, cudaResourceTypeArray); EXPECT_EQ(texture.channelDesc.x, 32); EXPECT_EQ(texture.channelDesc.y, 32); EXPECT_EQ(texture.channelDesc.z, 32); EXPECT_EQ(texture.channelDesc.w, 32); EXPECT_EQ(texture.texDesc.addressMode[0], cudaAddressModeClamp); EXPECT_EQ(texture.texDesc.addressMode[1], cudaAddressModeClamp); EXPECT_EQ(texture.texDesc.filterMode, cudaFilterModeLinear); EXPECT_EQ(texture.texDesc.readMode, cudaReadModeElementType); EXPECT_EQ(texture.texDesc.normalizedCoords, 1); } EXPECT_EQ(textures[9].extent.width, 4); EXPECT_EQ(textures[9].extent.height, 5); EXPECT_EQ(textures[9].extent.depth, 6); } 
================================================ FILE: tests/texture_helpers/three_d_texture_helper_test.cu ================================================ #include #include #include #include #include #include class ThreeDTextureHelperTest : public testing::Test { protected: void SetUp() override { CudaCheckError(); generator = std::default_random_engine(7.0); distribution = std::normal_distribution(100.0, 2.0); } void TearDown() override { CudaCheckError(); } std::default_random_engine generator; std::normal_distribution distribution; }; TEST_F(ThreeDTextureHelperTest, Constructor) { ThreeDTextureHelper helper = ThreeDTextureHelper(4); EXPECT_EQ(helper.getCpuValues().size(), 4); EXPECT_EQ(helper.getLayerCopy().size(), 4); } TEST_F(ThreeDTextureHelperTest, ThreeDAllocateCudaTextureTest) { ThreeDTextureHelper helper = ThreeDTextureHelper(2); cudaExtent extent = make_cudaExtent(10, 20, 1); helper.setExtent(0, extent); extent = make_cudaExtent(30, 40, 1); helper.setExtent(1, extent); std::vector> textures = helper.getBufferTextures(); EXPECT_EQ(textures[0].array_d, nullptr); EXPECT_EQ(textures[0].tex_d, 0); EXPECT_TRUE(textures[0].update_mem); EXPECT_FALSE(textures[0].allocated); helper.GPUSetup(); textures = helper.getTextures(); EXPECT_NE(textures[0].array_d, nullptr); EXPECT_NE(textures[0].tex_d, 0); EXPECT_FALSE(textures[0].update_mem); EXPECT_TRUE(textures[0].allocated); } TEST_F(ThreeDTextureHelperTest, UpdateTexture) { ThreeDTextureHelper helper = ThreeDTextureHelper(1); cudaExtent extent = make_cudaExtent(10, 20, 2); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); } helper.updateTexture(0, 0, data_vec); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i + 200, i + 200 + 1, i + 200 + 2, i + 200 + 3); } helper.updateTexture(0, 1, data_vec); std::vector> textures = helper.getBufferTextures(); 
EXPECT_TRUE(textures[0].update_data); auto cpu_values = helper.getCpuBufferValues()[0]; for (int i = 0; i < cpu_values.size(); i++) { EXPECT_FLOAT_EQ(cpu_values[i].x, i); EXPECT_FLOAT_EQ(cpu_values[i].y, i + 1); EXPECT_FLOAT_EQ(cpu_values[i].z, i + 2); EXPECT_FLOAT_EQ(cpu_values[i].w, i + 3); } } TEST_F(ThreeDTextureHelperTest, UpdateTextureColumnMajor) { ThreeDTextureHelper helper = ThreeDTextureHelper(3); cudaExtent extent = make_cudaExtent(10, 20, 3); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); // just in case I am stupid std::set total_set; for (int depth = 0; depth < 3; depth++) { for (int i = 0; i < data_vec.size(); i++) { float val = depth * 20 * 10 + i; data_vec[i] = make_float4(val, val + 1, val + 2, val + 3); total_set.insert(val); } EXPECT_EQ(total_set.size(), 200 * (depth + 1)); helper.updateTexture(0, depth, data_vec, true); } // returns a rowMajor vector auto cpu_values = helper.getCpuBufferValues()[0]; for (int k = 0; k < 3; k++) { for (int i = 0; i < 20; i++) { for (int j = 0; j < 10; j++) { int columnMajorIndex = (k * 10 * 20) + j * 20 + i; int rowVectorIndex = (k * 10 * 20) + i * 10 + j; EXPECT_FLOAT_EQ(cpu_values[rowVectorIndex].x, columnMajorIndex) << " at index: " << i; EXPECT_EQ(total_set.erase(rowVectorIndex), 1); } } } EXPECT_EQ(total_set.size(), 0); } TEST_F(ThreeDTextureHelperTest, UpdateTextureException) { ThreeDTextureHelper helper = ThreeDTextureHelper(2); cudaExtent extent = make_cudaExtent(10, 20, 1); helper.setExtent(0, extent); extent = make_cudaExtent(30, 40, 1); helper.setExtent(1, extent); std::vector data_vec; data_vec.resize(30 * 40); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); } EXPECT_THROW(helper.updateTexture(0, 0, data_vec), std::runtime_error); } TEST_F(ThreeDTextureHelperTest, CopyDataToGPUOneGo) { ThreeDTextureHelper helper = ThreeDTextureHelper(1); helper.GPUSetup(); cudaExtent extent = make_cudaExtent(10, 20, 5); helper.setExtent(0, 
extent); std::vector data_vec; data_vec.resize(10 * 20); for (int depth = 0; depth < 5; depth++) { for (int i = 0; i < data_vec.size(); i++) { float val = depth * 20 * 10 + i; data_vec[i] = make_float4(val, val + 1, val + 2, val + 3); } helper.updateTexture(0, depth, data_vec); } helper.copyToDevice(true); std::vector> textures = helper.getTextures(); EXPECT_TRUE(textures[0].allocated); EXPECT_FALSE(textures[0].update_data); EXPECT_FALSE(textures[0].update_mem); std::vector query_points; // X query_points.push_back(make_float4(0.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.05, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.95, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(1.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.45, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.5, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.55, 0.0, 0.0, 0.0)); // Y query_points.push_back(make_float4(0.0, 0.025, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.05, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.075, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.975, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 1.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.475, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.5, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.525, 0.0, 0.0)); // Z query_points.push_back(make_float4(0.0, 0.0, 0.1, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.2, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.3, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 1.0, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.9, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.8, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.7, 0.0)); auto results = getTextureAtPointsKernel, float4>(helper, query_points); EXPECT_FLOAT_EQ(results.size(), query_points.size()); EXPECT_FLOAT_EQ(results[0].x, 0.0); EXPECT_FLOAT_EQ(results[0].y, 1.0); 
EXPECT_FLOAT_EQ(results[0].z, 2.0); EXPECT_FLOAT_EQ(results[0].w, 3.0); EXPECT_FLOAT_EQ(results[1].x, 0.0); EXPECT_FLOAT_EQ(results[1].y, 1.0); EXPECT_FLOAT_EQ(results[1].z, 2.0); EXPECT_FLOAT_EQ(results[1].w, 3.0); EXPECT_FLOAT_EQ(results[2].x, 9.0); EXPECT_FLOAT_EQ(results[2].y, 10.0); EXPECT_FLOAT_EQ(results[2].z, 11.0); EXPECT_FLOAT_EQ(results[2].w, 12.0); EXPECT_FLOAT_EQ(results[3].x, 9.0); EXPECT_FLOAT_EQ(results[3].y, 10.0); EXPECT_FLOAT_EQ(results[3].z, 11.0); EXPECT_FLOAT_EQ(results[3].w, 12.0); EXPECT_FLOAT_EQ(results[4].x, 4.0); EXPECT_FLOAT_EQ(results[4].y, 5.0); EXPECT_FLOAT_EQ(results[4].z, 6.0); EXPECT_FLOAT_EQ(results[4].w, 7.0); EXPECT_FLOAT_EQ(results[5].x, 4.5); EXPECT_FLOAT_EQ(results[5].y, 5.5); EXPECT_FLOAT_EQ(results[5].z, 6.5); EXPECT_FLOAT_EQ(results[5].w, 7.5); EXPECT_FLOAT_EQ(results[6].x, 5.0); EXPECT_FLOAT_EQ(results[6].y, 6.0); EXPECT_FLOAT_EQ(results[6].z, 7.0); EXPECT_FLOAT_EQ(results[6].w, 8.0); EXPECT_FLOAT_EQ(results[7].x, 0.0); EXPECT_FLOAT_EQ(results[8].x, 5); EXPECT_FLOAT_EQ(results[9].x, 10); EXPECT_FLOAT_EQ(results[10].x, 190); EXPECT_FLOAT_EQ(results[11].x, 190); EXPECT_FLOAT_EQ(results[12].x, 90); EXPECT_FLOAT_EQ(results[13].x, 95); EXPECT_FLOAT_EQ(results[14].x, 100); EXPECT_FLOAT_EQ(results[15].x, 0.0); EXPECT_FLOAT_EQ(results[16].x, 100.0); EXPECT_FLOAT_EQ(results[17].x, 200.0); EXPECT_FLOAT_EQ(results[18].x, 800.0); EXPECT_FLOAT_EQ(results[19].x, 800.0); EXPECT_FLOAT_EQ(results[20].x, 700.0); EXPECT_FLOAT_EQ(results[21].x, 600.0); } TEST_F(ThreeDTextureHelperTest, TestCPUQuery) { ThreeDTextureHelper helper = ThreeDTextureHelper(1); helper.GPUSetup(); const int WIDTH = 10; const int HEIGHT = 20; const int DEPTH = 5; cudaExtent extent = make_cudaExtent(WIDTH, HEIGHT, DEPTH); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(WIDTH * HEIGHT); for (int depth = 0; depth < DEPTH; depth++) { for (int i = 0; i < data_vec.size(); i++) { float val = depth * HEIGHT * WIDTH + i; data_vec[i] = make_float4(val, val + 
1, val + 2, val + 3); } helper.updateTexture(0, depth, data_vec); } helper.copyToDevice(true); std::vector> textures = helper.getTextures(); EXPECT_TRUE(textures[0].allocated); EXPECT_FALSE(textures[0].update_data); EXPECT_FALSE(textures[0].update_mem); std::vector query_points; // X query_points.push_back(make_float4(0.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.05, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.95, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(1.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.45, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.5, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.55, 0.0, 0.0, 0.0)); // Y query_points.push_back(make_float4(0.0, 0.025, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.05, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.075, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.975, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 1.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.475, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.5, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.525, 0.0, 0.0)); // Z query_points.push_back(make_float4(0.0, 0.0, 0.1, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.2, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.3, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 1.0, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.9, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.8, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.7, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 1.2, 0.0)); std::vector results; for (const auto& query : query_points) { float3 xyz_point = make_float3(query.x, query.y, query.z); int index = round(query.w); results.push_back(helper.queryTexture(index, xyz_point)); } EXPECT_FLOAT_EQ(results.size(), query_points.size()); EXPECT_FLOAT_EQ(results[0].x, 0.0); EXPECT_FLOAT_EQ(results[0].y, 1.0); EXPECT_FLOAT_EQ(results[0].z, 2.0); 
EXPECT_FLOAT_EQ(results[0].w, 3.0); EXPECT_FLOAT_EQ(results[1].x, 0.0); EXPECT_FLOAT_EQ(results[1].y, 1.0); EXPECT_FLOAT_EQ(results[1].z, 2.0); EXPECT_FLOAT_EQ(results[1].w, 3.0); EXPECT_FLOAT_EQ(results[2].x, 9.0); EXPECT_FLOAT_EQ(results[2].y, 10.0); EXPECT_FLOAT_EQ(results[2].z, 11.0); EXPECT_FLOAT_EQ(results[2].w, 12.0); EXPECT_FLOAT_EQ(results[3].x, 9.0); EXPECT_FLOAT_EQ(results[3].y, 10.0); EXPECT_FLOAT_EQ(results[3].z, 11.0); EXPECT_FLOAT_EQ(results[3].w, 12.0); EXPECT_FLOAT_EQ(results[4].x, 4.0); EXPECT_FLOAT_EQ(results[4].y, 5.0); EXPECT_FLOAT_EQ(results[4].z, 6.0); EXPECT_FLOAT_EQ(results[4].w, 7.0); EXPECT_FLOAT_EQ(results[5].x, 4.5); EXPECT_FLOAT_EQ(results[5].y, 5.5); EXPECT_FLOAT_EQ(results[5].z, 6.5); EXPECT_FLOAT_EQ(results[5].w, 7.5); EXPECT_FLOAT_EQ(results[6].x, 5.0); EXPECT_FLOAT_EQ(results[6].y, 6.0); EXPECT_FLOAT_EQ(results[6].z, 7.0); EXPECT_FLOAT_EQ(results[6].w, 8.0); EXPECT_FLOAT_EQ(results[7].x, 0.0); EXPECT_FLOAT_EQ(results[8].x, 5); EXPECT_FLOAT_EQ(results[9].x, 10); EXPECT_FLOAT_EQ(results[10].x, 190); EXPECT_FLOAT_EQ(results[11].x, 190); EXPECT_FLOAT_EQ(results[12].x, 90); EXPECT_FLOAT_EQ(results[13].x, 95); EXPECT_FLOAT_EQ(results[14].x, 100); EXPECT_FLOAT_EQ(results[15].x, 0.0); EXPECT_FLOAT_EQ(results[16].x, 100.0); EXPECT_FLOAT_EQ(results[17].x, 200.0); EXPECT_FLOAT_EQ(results[18].x, 800.0); EXPECT_FLOAT_EQ(results[19].x, 800.0); EXPECT_FLOAT_EQ(results[20].x, 700.0); EXPECT_FLOAT_EQ(results[21].x, 600.0); EXPECT_FLOAT_EQ(results[22].x, 800.0); } TEST_F(ThreeDTextureHelperTest, CopyDataToGPUSplitMiddle) { ThreeDTextureHelper helper = ThreeDTextureHelper(1); helper.GPUSetup(); cudaExtent extent = make_cudaExtent(10, 20, 5); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); // 0, 200, 400, 600, 800 for (int depth = 0; depth < 5; depth++) { for (int i = 0; i < data_vec.size(); i++) { float val = depth * 20 * 10 + i; data_vec[i] = make_float4(val, val + 1, val + 2, val + 3); } helper.updateTexture(0, depth, 
data_vec); } // fill in even indexes // 0, 1200, 400, 1600, 800 for (int depth = 1; depth < 5; depth += 2) { for (int i = 0; i < data_vec.size(); i++) { float val = (depth + 5) * 20 * 10 + i; data_vec[i] = make_float4(val, val + 1, val + 2, val + 3); } helper.updateTexture(0, depth, data_vec); } helper.copyToDevice(true); std::vector query_points; query_points.push_back(make_float4(0.0, 0.0, 0.1, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.2, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.3, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.5, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.6, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.7, 0.0)); auto results = getTextureAtPointsKernel, float4>(helper, query_points); EXPECT_FLOAT_EQ(results.size(), query_points.size()); EXPECT_FLOAT_EQ(results[0].x, 0.0); EXPECT_FLOAT_EQ(results[1].x, 600); EXPECT_FLOAT_EQ(results[2].x, 1200); EXPECT_FLOAT_EQ(results[3].x, 400); EXPECT_FLOAT_EQ(results[4].x, 1000); EXPECT_FLOAT_EQ(results[5].x, 1600); // fill in odd indexes // 2000, 1200, 2400, 1600, 2800 for (int depth = 0; depth < 5; depth += 2) { for (int i = 0; i < data_vec.size(); i++) { float val = (depth + 10) * 20 * 10 + i; data_vec[i] = make_float4(val, val + 1, val + 2, val + 3); } helper.updateTexture(0, depth, data_vec); } helper.copyToDevice(true); results = getTextureAtPointsKernel, float4>(helper, query_points); EXPECT_FLOAT_EQ(results.size(), query_points.size()); EXPECT_FLOAT_EQ(results[0].x, 2000.0); EXPECT_FLOAT_EQ(results[1].x, 1600); EXPECT_FLOAT_EQ(results[2].x, 1200); EXPECT_FLOAT_EQ(results[3].x, 2400); EXPECT_FLOAT_EQ(results[4].x, 2000); EXPECT_FLOAT_EQ(results[5].x, 1600); } TEST_F(ThreeDTextureHelperTest, CheckWrapping) { ThreeDTextureHelper helper = ThreeDTextureHelper(1); cudaExtent extent = make_cudaExtent(10, 20, 2); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, 
i + 1, i + 2, i + 3); } helper.updateTexture(0, 0, data_vec); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i + 200, i + 200 + 1, i + 200 + 2, i + 200 + 3); } helper.updateTexture(0, 1, data_vec); helper.updateAddressMode(0, 2, cudaAddressModeWrap); helper.GPUSetup(); helper.copyToDevice(true); std::vector query_points; query_points.push_back(make_float4(0.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.25, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.5, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.75, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 1.0, 0.0)); auto results = getTextureAtPointsKernel, float4>(helper, query_points); EXPECT_FLOAT_EQ(results.size(), query_points.size()); EXPECT_FLOAT_EQ(results[0].x, 100.0); EXPECT_FLOAT_EQ(results[1].x, 0); EXPECT_FLOAT_EQ(results[2].x, 100); EXPECT_FLOAT_EQ(results[3].x, 200); EXPECT_FLOAT_EQ(results[4].x, 100); } TEST_F(ThreeDTextureHelperTest, CheckCPUWrapping) { ThreeDTextureHelper helper = ThreeDTextureHelper(1); cudaExtent extent = make_cudaExtent(10, 20, 2); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); } helper.updateTexture(0, 0, data_vec); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i + 200, i + 200 + 1, i + 200 + 2, i + 200 + 3); } helper.updateTexture(0, 1, data_vec); helper.updateAddressMode(0, 2, cudaAddressModeWrap); helper.GPUSetup(); helper.copyToDevice(true); std::vector query_points; query_points.push_back(make_float4(0.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.25, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.5, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.75, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 1.0, 0.0)); std::vector results; for (const auto& query : query_points) { float3 xyz_point = make_float3(query.x, query.y, query.z); 
int index = round(query.w); results.push_back(helper.queryTexture(index, xyz_point)); } EXPECT_FLOAT_EQ(results.size(), query_points.size()); EXPECT_FLOAT_EQ(results[0].x, 100.0); EXPECT_FLOAT_EQ(results[1].x, 0); EXPECT_FLOAT_EQ(results[2].x, 100); EXPECT_FLOAT_EQ(results[3].x, 200); EXPECT_FLOAT_EQ(results[4].x, 100); } ================================================ FILE: tests/texture_helpers/two_d_texture_helper_test.cu ================================================ #include #include #include #include #include #include class TwoDTextureHelperTest : public testing::Test { protected: void SetUp() override { CudaCheckError(); generator = std::default_random_engine(7.0); distribution = std::normal_distribution(1.0, 3.0); } void TearDown() override { CudaCheckError(); } std::default_random_engine generator; std::normal_distribution distribution; }; TEST_F(TwoDTextureHelperTest, Constructor) { TwoDTextureHelper helper = TwoDTextureHelper(4); EXPECT_EQ(helper.getCpuValues().size(), 4); } TEST_F(TwoDTextureHelperTest, TwoDAllocateCudaTextureTest) { TwoDTextureHelper helper = TwoDTextureHelper(2); cudaExtent extent = make_cudaExtent(10, 20, 0); helper.setExtent(0, extent); extent = make_cudaExtent(30, 40, 0); helper.setExtent(1, extent); helper.GPUSetup(); std::vector> textures = helper.getTextures(); EXPECT_NE(textures[0].array_d, nullptr); EXPECT_NE(textures[0].tex_d, 0); EXPECT_FALSE(textures[0].update_mem); EXPECT_TRUE(textures[0].allocated); EXPECT_TRUE(helper.GPUMemStatus_); EXPECT_NE(helper.ptr_d_, nullptr); helper.freeCudaMem(); textures = helper.getTextures(); EXPECT_EQ(textures[0].array_d, nullptr); EXPECT_EQ(textures[0].tex_d, 0); EXPECT_FALSE(textures[0].update_mem); EXPECT_FALSE(textures[0].allocated); EXPECT_FALSE(helper.GPUMemStatus_); EXPECT_EQ(helper.ptr_d_, nullptr); } TEST_F(TwoDTextureHelperTest, UpdateTextureRowMajor) { TwoDTextureHelper helper = TwoDTextureHelper(2); cudaExtent extent = make_cudaExtent(10, 20, 0); helper.setExtent(0, extent); 
extent = make_cudaExtent(30, 40, 0); helper.setExtent(1, extent); std::vector data_vec; data_vec.resize(30 * 40); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); } helper.updateTexture(1, data_vec); std::vector> textures = helper.getTextures(); std::vector> textures_buffer = helper.getBufferTextures(); EXPECT_FALSE(textures[1].update_data); EXPECT_TRUE(textures_buffer[1].update_data); auto cpu_values = helper.getCpuBufferValues()[1]; for (int i = 0; i < data_vec.size(); i++) { EXPECT_FLOAT_EQ(cpu_values[i].x, data_vec[i].x); EXPECT_FLOAT_EQ(cpu_values[i].y, data_vec[i].y); EXPECT_FLOAT_EQ(cpu_values[i].z, data_vec[i].z); EXPECT_FLOAT_EQ(cpu_values[i].w, data_vec[i].w); } } TEST_F(TwoDTextureHelperTest, UpdateTextureColumnMajor) { TwoDTextureHelper helper = TwoDTextureHelper(2); cudaExtent extent = make_cudaExtent(10, 20, 0); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); // just in case I am stupid std::set total_set; for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); total_set.insert(i); } EXPECT_EQ(total_set.size(), 200); helper.updateTexture(0, data_vec, true); // returns a rowMajor vector auto cpu_values = helper.getCpuBufferValues()[0]; for (int i = 0; i < 20; i++) { for (int j = 0; j < 10; j++) { int rowVectorIndex = i * 10 + j; int columnMajorIndex = j * 20 + i; EXPECT_FLOAT_EQ(cpu_values[rowVectorIndex].x, columnMajorIndex) << " at index: " << i; EXPECT_EQ(total_set.erase(rowVectorIndex), 1); } } EXPECT_EQ(total_set.size(), 0); } TEST_F(TwoDTextureHelperTest, EigenUpdateTextureRowMajor) { TwoDTextureHelper helper = TwoDTextureHelper(2); cudaExtent extent = make_cudaExtent(10, 20, 0); helper.setExtent(0, extent); extent = make_cudaExtent(30, 40, 0); helper.setExtent(1, extent); std::vector data_vec; data_vec.resize(30 * 40); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); } int outer_stride = 
0; int inner_stride = 0; Eigen::Map, 0, Eigen::Stride> eigen_mat(data_vec.data(), 40, 30, Eigen::Stride(outer_stride, inner_stride)); helper.updateTexture(1, eigen_mat, false); std::vector> textures = helper.getBufferTextures(); EXPECT_TRUE(textures[1].update_data); auto cpu_values = helper.getCpuBufferValues()[1]; EXPECT_EQ(cpu_values.size(), 30 * 40); for (int i = 0; i < data_vec.size(); i++) { EXPECT_FLOAT_EQ(cpu_values[i].x, data_vec[i].x); EXPECT_FLOAT_EQ(cpu_values[i].y, data_vec[i].y); EXPECT_FLOAT_EQ(cpu_values[i].z, data_vec[i].z); EXPECT_FLOAT_EQ(cpu_values[i].w, data_vec[i].w); } } TEST_F(TwoDTextureHelperTest, EigenUpdateTextureColumnMajor) { TwoDTextureHelper helper = TwoDTextureHelper(2); cudaExtent extent = make_cudaExtent(10, 20, 0); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); // just in case I am stupid std::set total_set; for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); total_set.insert(i); } EXPECT_EQ(total_set.size(), 200); int outer_stride = 0; int inner_stride = 0; Eigen::Map, 0, Eigen::Stride> eigen_mat(data_vec.data(), 40, 30, Eigen::Stride(outer_stride, inner_stride)); helper.updateTexture(0, eigen_mat); // returns a rowMajor vector auto cpu_values = helper.getCpuBufferValues()[0]; EXPECT_EQ(cpu_values.size(), 200); for (int i = 0; i < 20; i++) { for (int j = 0; j < 10; j++) { int columnMajorIndex = j * 20 + i; int rowVectorIndex = i * 10 + j; EXPECT_FLOAT_EQ(cpu_values[rowVectorIndex].x, columnMajorIndex) << " at index: " << i; EXPECT_EQ(total_set.erase(rowVectorIndex), 1); } } EXPECT_EQ(total_set.size(), 0); } TEST_F(TwoDTextureHelperTest, UpdateTextureException) { TwoDTextureHelper helper = TwoDTextureHelper(2); cudaExtent extent = make_cudaExtent(10, 20, 0); helper.setExtent(0, extent); extent = make_cudaExtent(30, 40, 0); helper.setExtent(1, extent); std::vector data_vec; data_vec.resize(30 * 40); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = 
make_float4(i, i + 1, i + 2, i + 3); } EXPECT_THROW(helper.updateTexture(0, data_vec), std::runtime_error); } void checkEqual(float4 computed, float val) { EXPECT_FLOAT_EQ(computed.x, val); EXPECT_FLOAT_EQ(computed.y, val + 1); EXPECT_FLOAT_EQ(computed.z, val + 2); EXPECT_FLOAT_EQ(computed.w, val + 3); } TEST_F(TwoDTextureHelperTest, UpdateTextureNewSize) { TwoDTextureHelper helper = TwoDTextureHelper(2); cudaExtent extent = make_cudaExtent(10, 20, 0); helper.setExtent(0, extent); extent = make_cudaExtent(30, 40, 0); helper.setExtent(1, extent); std::vector data_vec; data_vec.resize(30 * 40); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); } helper.updateTexture(0, data_vec, extent); std::vector> textures = helper.getBufferTextures(); EXPECT_TRUE(textures[0].update_data); EXPECT_TRUE(textures[0].update_mem); auto cpu_values = helper.getCpuBufferValues()[0]; for (int i = 0; i < data_vec.size(); i++) { checkEqual(cpu_values[i], data_vec[i].x); } } void checkEqualTexGPUCPU(float4 computed, float val, TwoDTextureHelper& helper, float4 query_point) { checkEqual(computed, val); checkEqual(helper.queryTexture(query_point.w, make_float3(query_point.x, query_point.y, query_point.z)), val); } TEST_F(TwoDTextureHelperTest, CopyDataToGPU) { TwoDTextureHelper helper = TwoDTextureHelper(1); helper.GPUSetup(); cudaExtent extent = make_cudaExtent(10, 20, 0); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); } helper.updateTexture(0, data_vec); helper.copyToDevice(true); std::vector> textures = helper.getTextures(); EXPECT_TRUE(textures[0].allocated); EXPECT_FALSE(textures[0].update_data); EXPECT_FALSE(textures[0].update_mem); std::vector query_points; query_points.push_back(make_float4(0.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.05, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.95, 0.0, 0.0, 
0.0)); query_points.push_back(make_float4(1.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.45, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.5, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.55, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.025, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.05, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.075, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.975, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 1.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.475, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.5, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.525, 0.0, 0.0)); query_points.push_back(make_float4(2.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(2.0, 0.075, 0.0, 0.0)); query_points.push_back(make_float4(-2.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(-2.0, 0.075, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 2.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, -2.0, 0.0, 0.0)); auto results = getTextureAtPointsKernel, float4>(helper, query_points); EXPECT_FLOAT_EQ(results.size(), query_points.size()); checkEqualTexGPUCPU(results[0], 0.0, helper, query_points[0]); checkEqualTexGPUCPU(results[1], 0.0, helper, query_points[1]); checkEqualTexGPUCPU(results[2], 9.0, helper, query_points[2]); checkEqualTexGPUCPU(results[3], 9.0, helper, query_points[3]); checkEqualTexGPUCPU(results[4], 4.0, helper, query_points[4]); checkEqualTexGPUCPU(results[5], 4.5, helper, query_points[5]); checkEqualTexGPUCPU(results[6], 5.0, helper, query_points[6]); checkEqualTexGPUCPU(results[7], 0.0, helper, query_points[7]); checkEqualTexGPUCPU(results[8], 0.0, helper, query_points[8]); checkEqualTexGPUCPU(results[9], 5.0, helper, query_points[9]); checkEqualTexGPUCPU(results[10], 10.0, helper, query_points[10]); checkEqualTexGPUCPU(results[11], 190.0, helper, query_points[11]); 
checkEqualTexGPUCPU(results[12], 190.0, helper, query_points[12]); checkEqualTexGPUCPU(results[13], 90.0, helper, query_points[13]); checkEqualTexGPUCPU(results[14], 95.0, helper, query_points[14]); checkEqualTexGPUCPU(results[15], 100.0, helper, query_points[15]); checkEqualTexGPUCPU(results[16], 9.0, helper, query_points[16]); checkEqualTexGPUCPU(results[17], 19.0, helper, query_points[17]); checkEqualTexGPUCPU(results[18], 0.0, helper, query_points[18]); checkEqualTexGPUCPU(results[19], 10.0, helper, query_points[19]); checkEqualTexGPUCPU(results[20], 190.0, helper, query_points[20]); checkEqualTexGPUCPU(results[21], 0.0, helper, query_points[21]); } void checkEqualMapGPUCPU(float4 computed, float val, TwoDTextureHelper& helper, float4 query_point) { checkEqual(computed, val); checkEqual(helper.queryTextureAtMapPose(query_point.w, make_float3(query_point.x, query_point.y, query_point.z)), val); } TEST_F(TwoDTextureHelperTest, QueryTextureAtMapPose) { TwoDTextureHelper helper = TwoDTextureHelper(1); helper.GPUSetup(); cudaExtent extent = make_cudaExtent(10, 20, 0); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); } helper.updateTexture(0, data_vec); helper.updateResolution(0, 10); helper.copyToDevice(true); std::vector query_points; query_points.push_back(make_float4(0.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.05, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.95, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(1.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.45, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.5, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.55, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.025, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.05, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 
0.075, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.975, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 1.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.475, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.5, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.525, 0.0, 0.0)); for (int i = 0; i < query_points.size(); i++) { // resolution * normalized handling query_points[i].x = query_points[i].x * 10 * 10; query_points[i].y = query_points[i].y * 10 * 20; query_points[i].z = query_points[i].z * 10; } auto results = getTextureAtMapPointsKernel, float4>(helper, query_points); EXPECT_FLOAT_EQ(results.size(), query_points.size()); checkEqualMapGPUCPU(results[0], 0.0, helper, query_points[0]); checkEqualMapGPUCPU(results[1], 0.0, helper, query_points[1]); checkEqualMapGPUCPU(results[2], 9.0, helper, query_points[2]); checkEqualMapGPUCPU(results[3], 9.0, helper, query_points[3]); checkEqualMapGPUCPU(results[4], 4.0, helper, query_points[4]); checkEqualMapGPUCPU(results[5], 4.5, helper, query_points[5]); checkEqualMapGPUCPU(results[6], 5.0, helper, query_points[6]); checkEqualMapGPUCPU(results[7], 0.0, helper, query_points[7]); checkEqualMapGPUCPU(results[8], 0.0, helper, query_points[8]); checkEqualMapGPUCPU(results[9], 5.0, helper, query_points[9]); checkEqualMapGPUCPU(results[10], 10., helper, query_points[10]); checkEqualMapGPUCPU(results[11], 190.0, helper, query_points[11]); checkEqualMapGPUCPU(results[12], 190.0, helper, query_points[12]); checkEqualMapGPUCPU(results[13], 90.0, helper, query_points[13]); checkEqualMapGPUCPU(results[14], 95.0, helper, query_points[14]); checkEqualMapGPUCPU(results[15], 100.0, helper, query_points[15]); } void checkEqualWorldGPUCPU(float4 computed, float val, TwoDTextureHelper& helper, float4 query_point) { checkEqual(computed, val); checkEqual(helper.queryTextureAtWorldPose(query_point.w, make_float3(query_point.x, query_point.y, query_point.z)), val); } TEST_F(TwoDTextureHelperTest, 
QueryTextureAtWorldPose) { TwoDTextureHelper helper = TwoDTextureHelper(1); helper.GPUSetup(); cudaExtent extent = make_cudaExtent(10, 20, 0); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); } std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); helper.updateRotation(0, new_rot_mat); helper.updateOrigin(0, make_float3(1, 2, 3)); helper.updateTexture(0, data_vec); helper.updateResolution(0, 10); helper.copyToDevice(true); std::vector query_points; query_points.push_back(make_float4(0.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.05, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.95, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(1.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.45, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.5, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.55, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.025, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.05, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.075, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.975, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 1.0, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.475, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.5, 0.0, 0.0)); query_points.push_back(make_float4(0.0, 0.525, 0.0, 0.0)); for (int i = 0; i < query_points.size(); i++) { if (i == 4) { printf("correct tex value at %d: %f %f\n", i, query_points[i].x, query_points[i].y); } float4 temp = query_points[i]; // resolution * normalized handling query_points[i].x = temp.y * 10.0 * 20.0 + 1; query_points[i].y = temp.x * 10.0 * 10.0 + 2; query_points[i].z = temp.z * 10.0 + 3; if (i == 4) { printf("starting input at %d: %f %f\n", i, 
query_points[i].x, query_points[i].y); } } auto results = getTextureAtWorldPointsKernel, float4>(helper, query_points); EXPECT_FLOAT_EQ(results.size(), query_points.size()); checkEqualWorldGPUCPU(results[0], 0.0, helper, query_points[0]); checkEqualWorldGPUCPU(results[1], 0.0, helper, query_points[1]); checkEqualWorldGPUCPU(results[2], 9.0, helper, query_points[2]); checkEqualWorldGPUCPU(results[3], 9.0, helper, query_points[3]); checkEqualWorldGPUCPU(results[4], 4.0, helper, query_points[4]); checkEqualWorldGPUCPU(results[5], 4.5, helper, query_points[5]); checkEqualWorldGPUCPU(results[6], 5.0, helper, query_points[6]); checkEqualWorldGPUCPU(results[7], 0.0, helper, query_points[7]); checkEqualWorldGPUCPU(results[8], 0.0, helper, query_points[8]); checkEqualWorldGPUCPU(results[9], 5.0, helper, query_points[9]); checkEqualWorldGPUCPU(results[10], 10., helper, query_points[10]); checkEqualWorldGPUCPU(results[11], 190.0, helper, query_points[11]); checkEqualWorldGPUCPU(results[12], 190.0, helper, query_points[12]); checkEqualWorldGPUCPU(results[13], 90.0, helper, query_points[13]); checkEqualWorldGPUCPU(results[14], 95.0, helper, query_points[14]); checkEqualWorldGPUCPU(results[15], 100.0, helper, query_points[15]); } TEST_F(TwoDTextureHelperTest, GPUCPUTextureLookupMatch) { TwoDTextureHelper helper = TwoDTextureHelper(1); helper.GPUSetup(); cudaExtent extent = make_cudaExtent(10, 20, 0); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); } std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); helper.updateRotation(0, new_rot_mat); helper.updateOrigin(0, make_float3(1, 2, 3)); helper.updateTexture(0, data_vec); helper.updateResolution(0, 10); helper.copyToDevice(true); std::vector> textures = helper.getTextures(); EXPECT_TRUE(textures[0].allocated); 
EXPECT_FALSE(textures[0].update_data); EXPECT_FALSE(textures[0].update_mem); std::vector query_points; for (int i = 0; i < 500; i++) { query_points.push_back(make_float4(distribution(generator), distribution(generator), distribution(generator), 0.0)); } auto results = getTextureAtPointsKernel, float4>(helper, query_points); EXPECT_FLOAT_EQ(results.size(), query_points.size()); for (int i = 0; i < results.size(); i++) { float4 CPU_result = helper.queryTexture(query_points[i].w, make_float3(query_points[i].x, query_points[i].y, query_points[i].z)); EXPECT_NEAR(results[i].x, CPU_result.x, 0.05); EXPECT_NEAR(results[i].y, CPU_result.y, 0.05); EXPECT_NEAR(results[i].z, CPU_result.z, 0.05); EXPECT_NEAR(results[i].w, CPU_result.w, 0.05); } } TEST_F(TwoDTextureHelperTest, GPUCPUTextureLookupMatchManyUpdates) { TwoDTextureHelper helper = TwoDTextureHelper(1); helper.GPUSetup(); cudaExtent extent = make_cudaExtent(10, 20, 0); helper.setExtent(0, extent); std::vector data_vec; data_vec.resize(10 * 20); for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); } std::array new_rot_mat{}; new_rot_mat[0] = make_float3(0, 1, 0); new_rot_mat[1] = make_float3(1, 0, 0); new_rot_mat[2] = make_float3(0, 0, 1); helper.updateRotation(0, new_rot_mat); helper.updateOrigin(0, make_float3(1, 2, 3)); helper.updateTexture(0, data_vec); helper.updateResolution(0, 10); helper.copyToDevice(true); std::vector> textures = helper.getTextures(); EXPECT_TRUE(textures[0].allocated); EXPECT_FALSE(textures[0].update_data); EXPECT_FALSE(textures[0].update_mem); std::vector query_points; for (int i = 0; i < 500; i++) { query_points.push_back(make_float4(distribution(generator), distribution(generator), distribution(generator), 0.0)); } for (int iterations = 0; iterations < 10; iterations++) { for (int i = 0; i < data_vec.size(); i++) { data_vec[i] = make_float4(i, i + 1, i + 2, i + 3); data_vec[i] *= iterations; helper.updateTexture(0, data_vec); 
helper.copyToDevice(true); } auto results = getTextureAtPointsKernel, float4>(helper, query_points); EXPECT_FLOAT_EQ(results.size(), query_points.size()); for (int i = 0; i < results.size(); i++) { float4 CPU_result = helper.queryTexture(query_points[i].w, make_float3(query_points[i].x, query_points[i].y, query_points[i].z)); EXPECT_NEAR(results[i].x, CPU_result.x, 0.05); EXPECT_NEAR(results[i].y, CPU_result.y, 0.05); EXPECT_NEAR(results[i].z, CPU_result.z, 0.05); EXPECT_NEAR(results[i].w, CPU_result.w, 0.05); } } } // TEST_F(TwoDTextureHelperTest, CopyToDeviceTest) //{ // int number = 8; // TwoDTextureHelper helper = TwoDTextureHelper(number); // // //helper.setTestFlagsInParam(0, false, false, false); // //helper.setTestFlagsInParam(1, false, false, true); // //helper.setTestFlagsInParam(2, false, true, false); // //helper.setTestFlagsInParam(3, false, true, true); // //helper.setTestFlagsInParam(4, true, false, false); // //helper.setTestFlagsInParam(5, true, false, true); // //helper.setTestFlagsInParam(6, true, true, false); // //helper.setTestFlagsInParam(7, true, true, true); // // // nothing should have happened since GPUMemStatus=False // helper.GPUSetup(); // helper.copyToDevice(); // std::vector> textures = helper.getTextures(); // textures = helper.getTextures(); // // EXPECT_FALSE(textures[0].update_mem); // EXPECT_FALSE(textures[0].update_data); // EXPECT_FALSE(textures[0].allocated); // // EXPECT_FALSE(textures[1].update_mem); // EXPECT_FALSE(textures[1].update_data); // EXPECT_TRUE(textures[1].allocated); // // EXPECT_FALSE(textures[2].update_mem); // EXPECT_TRUE(textures[2].update_data); // EXPECT_FALSE(textures[2].allocated); // // EXPECT_FALSE(textures[3].update_mem); // EXPECT_FALSE(textures[3].update_data); // was TRUE // EXPECT_TRUE(textures[3].allocated); // // EXPECT_FALSE(textures[4].update_mem); // was TRUE // EXPECT_FALSE(textures[4].update_data); // EXPECT_TRUE(textures[4].allocated); // was FALSE // // 
EXPECT_FALSE(textures[5].update_mem); // was TRUE // EXPECT_FALSE(textures[5].update_data); // EXPECT_TRUE(textures[5].allocated); // // EXPECT_FALSE(textures[6].update_mem); // was TRUE // EXPECT_FALSE(textures[6].update_data); // was TRUE // EXPECT_TRUE(textures[6].allocated); // // EXPECT_FALSE(textures[7].update_mem); // was TRUE // EXPECT_FALSE(textures[7].update_data); // was TRUE // EXPECT_TRUE(textures[7].allocated); // // //EXPECT_EQ(helper.copyDataToGPUCalled, 3); // //EXPECT_EQ(helper.allocateCudaTextureCalled, 4); // //EXPECT_EQ(helper.createCudaTextureCalled, 4); //}