Repository: RadeonOpenCompute/ROCR-Runtime
Branch: amd-staging_deprecated
Commit: ba56a24c6132
Files: 652
Total size: 12.5 MB
Directory structure:
gitextract_zn673rxz/
├── .gitignore
├── CMakeLists.txt
├── DEBIAN/
│ ├── Binary/
│ │ ├── postinst.in
│ │ └── prerm.in
│ ├── Dev/
│ │ ├── postinst.in
│ │ └── prerm.in
│ └── preinst
├── LICENSE.txt
├── README.md
├── RPM/
│ ├── Binary/
│ │ ├── post.in
│ │ └── postun.in
│ ├── Dev/
│ │ ├── post.in
│ │ └── postun.in
│ ├── hsa-rocr.spec.in
│ └── preinst
├── _clang-format
├── clang-format-diff.py
├── cmake_modules/
│ └── utils.cmake
├── format
├── libhsakmt/
│ ├── CMakeLists.txt
│ ├── DEBIAN/
│ │ ├── postinst.in
│ │ └── prerm.in
│ ├── LICENSE.md
│ ├── README.md
│ ├── RPM/
│ │ ├── hsakmt-roct-devel.spec.in
│ │ ├── libhsakmt.spec
│ │ ├── post.in
│ │ └── postun.in
│ ├── cmake_modules/
│ │ └── utils.cmake
│ ├── hsakmt-config.cmake.in
│ ├── include/
│ │ └── hsakmt/
│ │ ├── hsakmt.h
│ │ ├── hsakmt_virtio.h
│ │ ├── hsakmtmodel.h
│ │ ├── hsakmtmodeliface.h
│ │ ├── hsakmttypes.h
│ │ └── linux/
│ │ ├── kfd_ioctl.h
│ │ └── udmabuf.h
│ ├── libhsakmt.pc.in
│ ├── src/
│ │ ├── debug.c
│ │ ├── events.c
│ │ ├── fmm.c
│ │ ├── fmm.h
│ │ ├── globals.c
│ │ ├── hsakmtmodel.c
│ │ ├── libhsakmt.c
│ │ ├── libhsakmt.h
│ │ ├── libhsakmt.ver
│ │ ├── memory.c
│ │ ├── openclose.c
│ │ ├── pc_sampling.c
│ │ ├── perfctr.c
│ │ ├── pmc_table.c
│ │ ├── pmc_table.h
│ │ ├── queues.c
│ │ ├── rbtree.c
│ │ ├── rbtree.h
│ │ ├── rbtree_amd.h
│ │ ├── spm.c
│ │ ├── svm.c
│ │ ├── time.c
│ │ ├── topology.c
│ │ ├── version.c
│ │ └── virtio/
│ │ ├── CMakeLists.txt
│ │ ├── hsakmt_virtio_amdgpu.c
│ │ ├── hsakmt_virtio_device.c
│ │ ├── hsakmt_virtio_device.h
│ │ ├── hsakmt_virtio_events.c
│ │ ├── hsakmt_virtio_memory.c
│ │ ├── hsakmt_virtio_openclose.c
│ │ ├── hsakmt_virtio_proto.h
│ │ ├── hsakmt_virtio_queues.c
│ │ ├── hsakmt_virtio_topology.c
│ │ ├── hsakmt_virtio_vm.c
│ │ ├── include/
│ │ │ └── linux/
│ │ │ └── virtgpu_drm.h
│ │ ├── libhsakmt_virtio.ver
│ │ ├── virtio_gpu.c
│ │ └── virtio_gpu.h
│ └── tests/
│ ├── kfdtest/
│ │ ├── .gitignore
│ │ ├── CMakeLists.txt
│ │ ├── LICENSE.kfdtest
│ │ ├── README.txt
│ │ ├── gtest-1.6.0/
│ │ │ ├── gtest/
│ │ │ │ └── gtest.h
│ │ │ └── gtest-all.cpp
│ │ ├── include/
│ │ │ ├── amdp2ptest.h
│ │ │ ├── asic_reg/
│ │ │ │ ├── gfx_7_2_d.h
│ │ │ │ ├── gfx_7_2_enum.h
│ │ │ │ └── gfx_7_2_sh_mask.h
│ │ │ ├── kfd_pm4_opcodes.h
│ │ │ ├── pm4_pkt_struct_ai.h
│ │ │ ├── pm4_pkt_struct_ci.h
│ │ │ ├── pm4_pkt_struct_common.h
│ │ │ ├── pm4_pkt_struct_nv.h
│ │ │ └── sdma_pkt_struct.h
│ │ ├── scripts/
│ │ │ ├── kfdtest.exclude
│ │ │ └── run_kfdtest.sh
│ │ └── src/
│ │ ├── AqlQueue.cpp
│ │ ├── AqlQueue.hpp
│ │ ├── Assemble.cpp
│ │ ├── Assemble.hpp
│ │ ├── BaseDebug.cpp
│ │ ├── BaseDebug.hpp
│ │ ├── BasePacket.cpp
│ │ ├── BasePacket.hpp
│ │ ├── BaseQueue.cpp
│ │ ├── BaseQueue.hpp
│ │ ├── Dispatch.cpp
│ │ ├── Dispatch.hpp
│ │ ├── GoogleTestExtension.cpp
│ │ ├── GoogleTestExtension.hpp
│ │ ├── IndirectBuffer.cpp
│ │ ├── IndirectBuffer.hpp
│ │ ├── KFDASMTest.cpp
│ │ ├── KFDASMTest.hpp
│ │ ├── KFDBaseComponentTest.cpp
│ │ ├── KFDBaseComponentTest.hpp
│ │ ├── KFDCWSRTest.cpp
│ │ ├── KFDCWSRTest.hpp
│ │ ├── KFDDBGTest.cpp
│ │ ├── KFDDBGTest.hpp
│ │ ├── KFDEventTest.cpp
│ │ ├── KFDEventTest.hpp
│ │ ├── KFDEvictTest.cpp
│ │ ├── KFDEvictTest.hpp
│ │ ├── KFDExceptionTest.cpp
│ │ ├── KFDExceptionTest.hpp
│ │ ├── KFDGWSTest.cpp
│ │ ├── KFDGWSTest.hpp
│ │ ├── KFDGraphicsInterop.cpp
│ │ ├── KFDGraphicsInterop.hpp
│ │ ├── KFDHWSTest.cpp
│ │ ├── KFDHWSTest.hpp
│ │ ├── KFDIPCTest.cpp
│ │ ├── KFDIPCTest.hpp
│ │ ├── KFDLocalMemoryTest.cpp
│ │ ├── KFDLocalMemoryTest.hpp
│ │ ├── KFDMemoryTest.cpp
│ │ ├── KFDMemoryTest.hpp
│ │ ├── KFDMultiProcessTest.cpp
│ │ ├── KFDMultiProcessTest.hpp
│ │ ├── KFDNegativeTest.cpp
│ │ ├── KFDNegativeTest.hpp
│ │ ├── KFDOpenCloseKFDTest.cpp
│ │ ├── KFDOpenCloseKFDTest.hpp
│ │ ├── KFDPCSamplingTest.cpp
│ │ ├── KFDPCSamplingTest.hpp
│ │ ├── KFDPMTest.cpp
│ │ ├── KFDPMTest.hpp
│ │ ├── KFDPerfCounters.cpp
│ │ ├── KFDPerfCounters.hpp
│ │ ├── KFDPerformanceTest.cpp
│ │ ├── KFDQMTest.cpp
│ │ ├── KFDQMTest.hpp
│ │ ├── KFDRASTest.cpp
│ │ ├── KFDRASTest.hpp
│ │ ├── KFDSVMEvictTest.cpp
│ │ ├── KFDSVMEvictTest.hpp
│ │ ├── KFDSVMRangeTest.cpp
│ │ ├── KFDSVMRangeTest.hpp
│ │ ├── KFDTestFlags.hpp
│ │ ├── KFDTestMain.cpp
│ │ ├── KFDTestUtil.cpp
│ │ ├── KFDTestUtil.hpp
│ │ ├── KFDTestUtilQueue.cpp
│ │ ├── KFDTestUtilQueue.hpp
│ │ ├── KFDTopologyTest.cpp
│ │ ├── KFDTopologyTest.hpp
│ │ ├── LinuxOSWrapper.cpp
│ │ ├── OSWrapper.hpp
│ │ ├── PM4Packet.cpp
│ │ ├── PM4Packet.hpp
│ │ ├── PM4Queue.cpp
│ │ ├── PM4Queue.hpp
│ │ ├── RDMATest.cpp
│ │ ├── RDMATest.hpp
│ │ ├── RDMAUtil.cpp
│ │ ├── RDMAUtil.hpp
│ │ ├── SDMAPacket.cpp
│ │ ├── SDMAPacket.hpp
│ │ ├── SDMAQueue.cpp
│ │ ├── SDMAQueue.hpp
│ │ ├── SDMAQueueByEngId.hpp
│ │ ├── ShaderStore.cpp
│ │ ├── ShaderStore.hpp
│ │ └── XgmiOptimizedSDMAQueue.hpp
│ ├── rdma/
│ │ └── simple/
│ │ ├── app/
│ │ │ ├── CMakeLists.txt
│ │ │ └── rdma_test.cpp
│ │ └── drv/
│ │ ├── amdp2ptest.c
│ │ └── amdp2ptest.h
│ └── reopen/
│ ├── CMakeLists.txt
│ └── kmtreopen.c
├── rocrtst/
│ ├── .gitignore
│ ├── Kernels/
│ │ ├── CMakeLists.txt
│ │ ├── binary_search_kernel.cl
│ │ ├── read_kernel.cl
│ │ └── write_kernel.cl
│ ├── README.md
│ ├── common/
│ │ ├── base_rocr.cc
│ │ ├── base_rocr.h
│ │ ├── base_rocr_utils.cc
│ │ ├── base_rocr_utils.h
│ │ ├── common.cc
│ │ ├── common.h
│ │ ├── concurrent_utils.cc
│ │ ├── concurrent_utils.h
│ │ ├── helper_funcs.cc
│ │ ├── helper_funcs.h
│ │ ├── hsatimer.cc
│ │ ├── hsatimer.h
│ │ ├── os.cc
│ │ ├── os.h
│ │ ├── rocr.cc
│ │ ├── rocr.h
│ │ └── utils_test/
│ │ ├── CMakeLists.txt
│ │ ├── utils_cpp11_gtest.cpp
│ │ ├── utils_timer_gtest.cpp
│ │ ├── utils_timer_test.cpp
│ │ └── utils_timer_test.hpp
│ ├── gtest/
│ │ ├── CMakeLists.txt
│ │ ├── include/
│ │ │ └── gtest/
│ │ │ ├── gtest-death-test.h
│ │ │ ├── gtest-message.h
│ │ │ ├── gtest-param-test.h
│ │ │ ├── gtest-printers.h
│ │ │ ├── gtest-spi.h
│ │ │ ├── gtest-test-part.h
│ │ │ ├── gtest-typed-test.h
│ │ │ ├── gtest.h
│ │ │ ├── gtest_pred_impl.h
│ │ │ ├── gtest_prod.h
│ │ │ └── internal/
│ │ │ ├── gtest-death-test-internal.h
│ │ │ ├── gtest-filepath.h
│ │ │ ├── gtest-internal.h
│ │ │ ├── gtest-linked_ptr.h
│ │ │ ├── gtest-param-util-generated.h
│ │ │ ├── gtest-param-util-generated.h.pump
│ │ │ ├── gtest-param-util.h
│ │ │ ├── gtest-port.h
│ │ │ ├── gtest-string.h
│ │ │ ├── gtest-tuple.h
│ │ │ ├── gtest-tuple.h.pump
│ │ │ ├── gtest-type-util.h
│ │ │ └── gtest-type-util.h.pump
│ │ └── src/
│ │ ├── gtest-all.cpp
│ │ ├── gtest-death-test.cpp
│ │ ├── gtest-filepath.cpp
│ │ ├── gtest-internal-inl.h
│ │ ├── gtest-port.cpp
│ │ ├── gtest-printers.cpp
│ │ ├── gtest-test-part.cpp
│ │ ├── gtest-typed-test.cpp
│ │ ├── gtest.cpp
│ │ └── gtest_main.cpp
│ ├── samples/
│ │ ├── CMakeLists.txt
│ │ ├── README.txt
│ │ ├── async_mem_copy/
│ │ │ └── async_mem_copy.cc
│ │ ├── binary_search/
│ │ │ ├── binary_search.cc
│ │ │ └── binary_search_kernels.cl
│ │ ├── ipc/
│ │ │ └── ipc.cc
│ │ ├── rocm_async/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Readme.txt
│ │ │ ├── base_test.cpp
│ │ │ ├── base_test.hpp
│ │ │ ├── common.cpp
│ │ │ ├── common.hpp
│ │ │ ├── hsatimer.cpp
│ │ │ ├── hsatimer.hpp
│ │ │ ├── main.cpp
│ │ │ ├── os.cpp
│ │ │ ├── os.hpp
│ │ │ ├── rocm_async.cpp
│ │ │ ├── rocm_async.hpp
│ │ │ ├── rocm_async_io.cpp
│ │ │ ├── rocm_async_parse.cpp
│ │ │ ├── rocm_async_print.cpp
│ │ │ ├── rocm_async_report.cpp
│ │ │ ├── rocm_async_topology.cpp
│ │ │ ├── rocm_async_trans.cpp
│ │ │ └── rocm_async_validate.cpp
│ │ └── rocrinfo/
│ │ └── rocrinfo.cc
│ ├── suites/
│ │ ├── functional/
│ │ │ ├── agent_props.cc
│ │ │ ├── agent_props.h
│ │ │ ├── aql_barrier_bit.cc
│ │ │ ├── aql_barrier_bit.h
│ │ │ ├── concurrent_init.cc
│ │ │ ├── concurrent_init.h
│ │ │ ├── concurrent_init_shutdown.cc
│ │ │ ├── concurrent_init_shutdown.h
│ │ │ ├── concurrent_shutdown.cc
│ │ │ ├── concurrent_shutdown.h
│ │ │ ├── cu_masking.cc
│ │ │ ├── cu_masking.h
│ │ │ ├── deallocation_notifier.cc
│ │ │ ├── deallocation_notifier.h
│ │ │ ├── debug_basic.cc
│ │ │ ├── debug_basic.h
│ │ │ ├── ipc.cc
│ │ │ ├── ipc.h
│ │ │ ├── memory_access.cc
│ │ │ ├── memory_access.h
│ │ │ ├── memory_alignment.cc
│ │ │ ├── memory_alignment.h
│ │ │ ├── memory_allocation.cc
│ │ │ ├── memory_allocation.h
│ │ │ ├── memory_atomics.cc
│ │ │ ├── memory_atomics.h
│ │ │ ├── memory_basic.cc
│ │ │ ├── memory_basic.h
│ │ │ ├── reference_count.cc
│ │ │ ├── reference_count.h
│ │ │ ├── signal_concurrent.cc
│ │ │ ├── signal_concurrent.h
│ │ │ ├── signal_kernel.cc
│ │ │ ├── signal_kernel.h
│ │ │ ├── virtual_memory.cc
│ │ │ └── virtual_memory.h
│ │ ├── negative/
│ │ │ ├── memory_allocate_negative_tests.cc
│ │ │ ├── memory_allocate_negative_tests.h
│ │ │ ├── queue_validation.cc
│ │ │ └── queue_validation.h
│ │ ├── performance/
│ │ │ ├── dispatch_time.cc
│ │ │ ├── dispatch_time.h
│ │ │ ├── enqueueLatency.cc
│ │ │ ├── enqueueLatency.h
│ │ │ ├── memory_async_copy.cc
│ │ │ ├── memory_async_copy.h
│ │ │ ├── memory_async_copy_numa.cc
│ │ │ └── memory_async_copy_numa.h
│ │ ├── stress/
│ │ │ ├── memory_concurrent_tests.cc
│ │ │ ├── memory_concurrent_tests.h
│ │ │ ├── queue_write_index_concurrent_tests.cc
│ │ │ └── queue_write_index_concurrent_tests.h
│ │ └── test_common/
│ │ ├── CMakeLists.txt
│ │ ├── kernels/
│ │ │ ├── atomicOperations_kernels.cl
│ │ │ ├── cu_mask_kernels.cl
│ │ │ ├── dispatch_time_kernels.cl
│ │ │ ├── gpuReadWrite_kernels.cl
│ │ │ ├── groupMemoryDynamic_kernels.cl
│ │ │ ├── signal_operations.cl
│ │ │ ├── test_case_template_kernels.cl
│ │ │ ├── vector_add_debug_trap_kernel.cl
│ │ │ └── vector_add_memory_fault_kernel.cl
│ │ ├── main.cc
│ │ ├── main.h
│ │ ├── test_base.cc
│ │ ├── test_base.h
│ │ ├── test_case_template.cc
│ │ ├── test_case_template.h
│ │ ├── test_common.cc
│ │ └── test_common.h
│ └── thirdparty/
│ ├── include/
│ │ ├── LICENSE
│ │ ├── hwloc/
│ │ │ ├── autogen/
│ │ │ │ └── config.h
│ │ │ ├── bitmap.h
│ │ │ ├── cpuset.h
│ │ │ ├── cuda.h
│ │ │ ├── cudart.h
│ │ │ ├── deprecated.h
│ │ │ ├── diff.h
│ │ │ ├── gl.h
│ │ │ ├── glibc-sched.h
│ │ │ ├── helper.h
│ │ │ ├── inlines.h
│ │ │ ├── intel-mic.h
│ │ │ ├── linux-libnuma.h
│ │ │ ├── linux.h
│ │ │ ├── myriexpress.h
│ │ │ ├── nvml.h
│ │ │ ├── opencl.h
│ │ │ ├── openfabrics-verbs.h
│ │ │ ├── plugins.h
│ │ │ └── rename.h
│ │ └── hwloc.h
│ └── lib/
│ ├── LICENSE
│ └── libhwloc.so.5
├── runtime/
│ ├── cmake_modules/
│ │ ├── COPYING-CMAKE-SCRIPTS
│ │ └── FindLibElf.cmake
│ ├── docs/
│ │ ├── api-reference/
│ │ │ ├── api.rst
│ │ │ ├── c-interface-adaptors.rst
│ │ │ └── environment_variables.rst
│ │ ├── conf.py
│ │ ├── contribution/
│ │ │ └── contributing-to-rocr.rst
│ │ ├── data/
│ │ │ └── env_variables.rst
│ │ ├── index.rst
│ │ ├── install/
│ │ │ └── installation.rst
│ │ ├── license.rst
│ │ ├── sphinx/
│ │ │ ├── _toc.yml.in
│ │ │ ├── requirements.in
│ │ │ └── requirements.txt
│ │ └── what-is-rocr-runtime.rst
│ ├── hsa-ext-finalize/
│ │ └── CMakeLists.txt
│ ├── hsa-ext-image/
│ │ └── CMakeLists.txt
│ ├── hsa-runtime/
│ │ ├── CMakeLists.txt
│ │ ├── LICENSE.md
│ │ ├── cmake_modules/
│ │ │ ├── COPYING-CMAKE-SCRIPTS
│ │ │ ├── FindLibElf.cmake
│ │ │ ├── hsa_common.cmake
│ │ │ └── utils.cmake
│ │ ├── core/
│ │ │ ├── common/
│ │ │ │ ├── hsa_table_interface.cpp
│ │ │ │ └── shared.h
│ │ │ ├── driver/
│ │ │ │ ├── driver.cpp
│ │ │ │ ├── kfd/
│ │ │ │ │ └── amd_kfd_driver.cpp
│ │ │ │ ├── virtio/
│ │ │ │ │ └── amd_kfd_virtio_driver.cpp
│ │ │ │ └── xdna/
│ │ │ │ ├── amd_xdna_driver.cpp
│ │ │ │ └── uapi/
│ │ │ │ └── amdxdna_accel.h
│ │ │ ├── inc/
│ │ │ │ ├── agent.h
│ │ │ │ ├── amd_aie_agent.h
│ │ │ │ ├── amd_aie_aql_queue.h
│ │ │ │ ├── amd_aql_queue.h
│ │ │ │ ├── amd_available_drivers.h
│ │ │ │ ├── amd_blit_kernel.h
│ │ │ │ ├── amd_blit_sdma.h
│ │ │ │ ├── amd_blit_shaders.h
│ │ │ │ ├── amd_core_dump.hpp
│ │ │ │ ├── amd_cpu_agent.h
│ │ │ │ ├── amd_elf_image.hpp
│ │ │ │ ├── amd_filter_device.h
│ │ │ │ ├── amd_gpu_agent.h
│ │ │ │ ├── amd_gpu_pm4.h
│ │ │ │ ├── amd_hsa_code.hpp
│ │ │ │ ├── amd_hsa_loader.hpp
│ │ │ │ ├── amd_kfd_driver.h
│ │ │ │ ├── amd_loader_context.hpp
│ │ │ │ ├── amd_memory_region.h
│ │ │ │ ├── amd_topology.h
│ │ │ │ ├── amd_trap_handler_v1.h
│ │ │ │ ├── amd_virtio_driver.h
│ │ │ │ ├── amd_xdna_driver.h
│ │ │ │ ├── blit.h
│ │ │ │ ├── cache.h
│ │ │ │ ├── checked.h
│ │ │ │ ├── default_signal.h
│ │ │ │ ├── driver.h
│ │ │ │ ├── exceptions.h
│ │ │ │ ├── host_queue.h
│ │ │ │ ├── hsa_amd_tool_int.hpp
│ │ │ │ ├── hsa_api_trace_int.h
│ │ │ │ ├── hsa_ext_amd_impl.h
│ │ │ │ ├── hsa_ext_interface.h
│ │ │ │ ├── hsa_internal.h
│ │ │ │ ├── hsa_table_interface.h
│ │ │ │ ├── hsa_ven_amd_loader_impl.h
│ │ │ │ ├── intercept_queue.h
│ │ │ │ ├── interrupt_signal.h
│ │ │ │ ├── ipc_signal.h
│ │ │ │ ├── isa.h
│ │ │ │ ├── memory_region.h
│ │ │ │ ├── queue.h
│ │ │ │ ├── registers.h
│ │ │ │ ├── runtime.h
│ │ │ │ ├── scratch_cache.h
│ │ │ │ ├── sdma_registers.h
│ │ │ │ ├── signal.h
│ │ │ │ ├── svm_profiler.h
│ │ │ │ └── thunk_loader.h
│ │ │ ├── runtime/
│ │ │ │ ├── amd_aie_agent.cpp
│ │ │ │ ├── amd_aie_aql_queue.cpp
│ │ │ │ ├── amd_aql_queue.cpp
│ │ │ │ ├── amd_blit_kernel.cpp
│ │ │ │ ├── amd_blit_sdma.cpp
│ │ │ │ ├── amd_cpu_agent.cpp
│ │ │ │ ├── amd_filter_device.cpp
│ │ │ │ ├── amd_gpu_agent.cpp
│ │ │ │ ├── amd_hsa_loader.cpp
│ │ │ │ ├── amd_loader_context.cpp
│ │ │ │ ├── amd_memory_region.cpp
│ │ │ │ ├── amd_topology.cpp
│ │ │ │ ├── blit_shaders/
│ │ │ │ │ ├── CMakeLists.txt
│ │ │ │ │ ├── blit_copyAligned.s
│ │ │ │ │ ├── blit_copyMisaligned.s
│ │ │ │ │ ├── blit_fill.s
│ │ │ │ │ └── create_blit_shader_header.sh
│ │ │ │ ├── cache.cpp
│ │ │ │ ├── default_signal.cpp
│ │ │ │ ├── host_queue.cpp
│ │ │ │ ├── hsa.cpp
│ │ │ │ ├── hsa_api_trace.cpp
│ │ │ │ ├── hsa_ext_amd.cpp
│ │ │ │ ├── hsa_ext_interface.cpp
│ │ │ │ ├── hsa_ven_amd_loader.cpp
│ │ │ │ ├── intercept_queue.cpp
│ │ │ │ ├── interrupt_signal.cpp
│ │ │ │ ├── ipc_signal.cpp
│ │ │ │ ├── isa.cpp
│ │ │ │ ├── queue.cpp
│ │ │ │ ├── runtime.cpp
│ │ │ │ ├── signal.cpp
│ │ │ │ ├── svm_profiler.cpp
│ │ │ │ ├── thunk_loader.cpp
│ │ │ │ └── trap_handler/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── create_trap_handler_header.sh
│ │ │ │ ├── trap_handler.s
│ │ │ │ └── trap_handler_gfx12.s
│ │ │ └── util/
│ │ │ ├── atomic_helpers.h
│ │ │ ├── flag.cpp
│ │ │ ├── flag.h
│ │ │ ├── lazy_ptr.h
│ │ │ ├── lnx/
│ │ │ │ └── os_linux.cpp
│ │ │ ├── locks.h
│ │ │ ├── memory.h
│ │ │ ├── os.h
│ │ │ ├── simple_heap.h
│ │ │ ├── small_heap.cpp
│ │ │ ├── small_heap.h
│ │ │ ├── timer.cpp
│ │ │ ├── timer.h
│ │ │ ├── utils.h
│ │ │ └── win/
│ │ │ └── os_win.cpp
│ │ ├── hsa-runtime64-config.cmake.in
│ │ ├── hsacore.so.def
│ │ ├── hsacore.so.link
│ │ ├── image/
│ │ │ ├── addrlib/
│ │ │ │ ├── inc/
│ │ │ │ │ ├── addrinterface.h
│ │ │ │ │ └── addrtypes.h
│ │ │ │ └── src/
│ │ │ │ ├── addrinterface.cpp
│ │ │ │ ├── amdgpu_asic_addr.h
│ │ │ │ ├── chip/
│ │ │ │ │ ├── gfx10/
│ │ │ │ │ │ └── gfx10_gb_reg.h
│ │ │ │ │ ├── gfx11/
│ │ │ │ │ │ └── gfx11_gb_reg.h
│ │ │ │ │ ├── gfx12/
│ │ │ │ │ │ └── gfx12_gb_reg.h
│ │ │ │ │ ├── gfx9/
│ │ │ │ │ │ └── gfx9_gb_reg.h
│ │ │ │ │ └── r800/
│ │ │ │ │ └── si_gb_reg.h
│ │ │ │ ├── core/
│ │ │ │ │ ├── addrcommon.h
│ │ │ │ │ ├── addrelemlib.cpp
│ │ │ │ │ ├── addrelemlib.h
│ │ │ │ │ ├── addrlib.cpp
│ │ │ │ │ ├── addrlib.h
│ │ │ │ │ ├── addrlib1.cpp
│ │ │ │ │ ├── addrlib1.h
│ │ │ │ │ ├── addrlib2.cpp
│ │ │ │ │ ├── addrlib2.h
│ │ │ │ │ ├── addrlib3.cpp
│ │ │ │ │ ├── addrlib3.h
│ │ │ │ │ ├── addrobject.cpp
│ │ │ │ │ ├── addrobject.h
│ │ │ │ │ ├── coord.cpp
│ │ │ │ │ └── coord.h
│ │ │ │ ├── gfx10/
│ │ │ │ │ ├── gfx10SwizzlePattern.h
│ │ │ │ │ ├── gfx10addrlib.cpp
│ │ │ │ │ └── gfx10addrlib.h
│ │ │ │ ├── gfx11/
│ │ │ │ │ ├── gfx11SwizzlePattern.h
│ │ │ │ │ ├── gfx11addrlib.cpp
│ │ │ │ │ └── gfx11addrlib.h
│ │ │ │ ├── gfx12/
│ │ │ │ │ ├── gfx12SwizzlePattern.h
│ │ │ │ │ ├── gfx12addrlib.cpp
│ │ │ │ │ └── gfx12addrlib.h
│ │ │ │ └── gfx9/
│ │ │ │ ├── gfx9addrlib.cpp
│ │ │ │ └── gfx9addrlib.h
│ │ │ ├── blit_kernel.cpp
│ │ │ ├── blit_kernel.h
│ │ │ ├── blit_object_gfx7xx.cpp
│ │ │ ├── blit_object_gfx8xx.cpp
│ │ │ ├── blit_object_gfx9xx.cpp
│ │ │ ├── blit_src/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── README.md
│ │ │ │ ├── create_hsaco_ascii_file.sh
│ │ │ │ └── imageblit_kernels.cl
│ │ │ ├── device_info.cpp
│ │ │ ├── device_info.h
│ │ │ ├── hsa_ext_image.cpp
│ │ │ ├── image_lut.h
│ │ │ ├── image_lut_gfx11.cpp
│ │ │ ├── image_lut_gfx11.h
│ │ │ ├── image_lut_kv.cpp
│ │ │ ├── image_lut_kv.h
│ │ │ ├── image_manager.cpp
│ │ │ ├── image_manager.h
│ │ │ ├── image_manager_ai.cpp
│ │ │ ├── image_manager_ai.h
│ │ │ ├── image_manager_gfx11.cpp
│ │ │ ├── image_manager_gfx11.h
│ │ │ ├── image_manager_gfx12.cpp
│ │ │ ├── image_manager_gfx12.h
│ │ │ ├── image_manager_kv.cpp
│ │ │ ├── image_manager_kv.h
│ │ │ ├── image_manager_nv.cpp
│ │ │ ├── image_manager_nv.h
│ │ │ ├── image_runtime.cpp
│ │ │ ├── image_runtime.h
│ │ │ ├── inc/
│ │ │ │ └── hsa_ext_image_impl.h
│ │ │ ├── resource.h
│ │ │ ├── resource_ai.h
│ │ │ ├── resource_gfx11.h
│ │ │ ├── resource_gfx12.h
│ │ │ ├── resource_kv.h
│ │ │ ├── resource_nv.h
│ │ │ └── util.h
│ │ ├── inc/
│ │ │ ├── Brig.h
│ │ │ ├── amd_hsa_common.h
│ │ │ ├── amd_hsa_elf.h
│ │ │ ├── amd_hsa_kernel_code.h
│ │ │ ├── amd_hsa_queue.h
│ │ │ ├── amd_hsa_signal.h
│ │ │ ├── hsa.h
│ │ │ ├── hsa_amd_tool.h
│ │ │ ├── hsa_api_trace.h
│ │ │ ├── hsa_api_trace_version.h
│ │ │ ├── hsa_ext_amd.h
│ │ │ ├── hsa_ext_finalize.h
│ │ │ ├── hsa_ext_image.h
│ │ │ ├── hsa_ven_amd_aqlprofile.h
│ │ │ ├── hsa_ven_amd_loader.h
│ │ │ └── hsa_ven_amd_pc_sampling.h
│ │ ├── libamdhsacode/
│ │ │ ├── amd_core_dump.cpp
│ │ │ ├── amd_elf_image.cpp
│ │ │ ├── amd_hsa_code.cpp
│ │ │ ├── amd_hsa_code_util.cpp
│ │ │ ├── amd_hsa_code_util.hpp
│ │ │ ├── amd_hsa_locks.cpp
│ │ │ ├── amd_hsa_locks.hpp
│ │ │ ├── amd_options.cpp
│ │ │ └── amd_options.hpp
│ │ ├── loader/
│ │ │ ├── AMDHSAKernelDescriptor.h
│ │ │ ├── executable.cpp
│ │ │ └── executable.hpp
│ │ └── pcs/
│ │ ├── hsa_ven_amd_pc_sampling.cpp
│ │ ├── inc/
│ │ │ └── hsa_ven_amd_pc_sampling_impl.h
│ │ ├── pcs_runtime.cpp
│ │ └── pcs_runtime.h
│ ├── hsa-runtime-tools/
│ │ └── CMakeLists.txt
│ └── packages/
│ ├── hsa-ext-rocr-dev/
│ │ ├── CMakeLists.txt
│ │ ├── Old CMakeLists.txt
│ │ ├── copyright
│ │ ├── description
│ │ ├── postinst
│ │ ├── prerm
│ │ ├── rpm_post
│ │ └── rpm_postun
│ └── rocr_tools_legacy/
│ ├── CMakeLists.txt
│ ├── copyright
│ ├── description
│ ├── postinst
│ ├── prerm
│ ├── rpm_post
│ └── rpm_postun
└── samples/
├── GetInfo/
│ ├── get_info.cpp
│ └── get_info.h
└── common/
├── common.cpp
├── common.hpp
├── common_utility.cpp
├── common_utility.h
├── helper_funcs.cpp
├── helper_funcs.hpp
├── hsa_base_util.cpp
├── hsa_base_util.h
├── hsa_perf_cntrs.cpp
├── hsa_perf_cntrs.hpp
├── hsa_rsrc_factory.cpp
├── hsa_rsrc_factory.hpp
├── hsa_test.cpp
├── hsa_test.h
├── hsatimer.cpp
├── hsatimer.h
├── os.cpp
├── os.h
├── utilities.cpp
└── utilities.h
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
.*
#
# git files that we don't want to ignore even if they are dot-files
#
!.gitignore
!.mailmap
.github*
patches-*
build/
outgoing/
Makefile
# documentation artifacts
_build/
_doxygen/
_images/
_static/
_templates/
_toc.yml
doxygen
================================================
FILE: CMakeLists.txt
================================================
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
cmake_minimum_required(VERSION 3.7)

# Top-level project name.
project("rocr")

# Show the full command lines issued by the generated build system.
set(CMAKE_VERBOSE_MAKEFILE ON)

# Shared vs. static build switch: default to ON when the caller has not defined
# BUILD_SHARED_LIBS, then publish the resulting value as a BOOL cache entry.
if(NOT DEFINED BUILD_SHARED_LIBS)
  set(BUILD_SHARED_LIBS ON)
endif()
set(BUILD_SHARED_LIBS
    ${BUILD_SHARED_LIBS}
    CACHE BOOL "Build shared library (.so) or not.")

# BUILD_ROCR controls whether runtime/hsa-runtime is added further down in this
# file; it defaults to ON when the caller has not defined it.
if(NOT DEFINED BUILD_ROCR)
  set(BUILD_ROCR ON)
endif()
# add_rocm_subdir(<subdir> <subdir_assigns>)
#
# Adds <subdir> to the build after applying a list of variable assignments.
#
#   subdir          Directory passed to add_subdirectory().
#   subdir_assigns  ;-separated list of "VARNAME=VALUE" entries; may be empty.
#
# Every VARNAME is set inside this function's scope. Because add_subdirectory()
# is invoked from inside the function, the subdirectory sees those values while
# the caller's own variables are left untouched.
#
# Only the FIRST "=" of an entry separates the name from the value, so values
# that themselves contain "=" (for example "CMAKE_CXX_FLAGS=-DFOO=1") are kept
# intact. Empty list entries are skipped. An entry with no "=" or with an empty
# name stops configuration with a FATAL_ERROR naming the offending entry.
function(add_rocm_subdir subdir subdir_assigns)
  message("add_rocm_subdir() -- " ${subdir})
  string(STRIP "${subdir_assigns}" subdir_assigns)
  message(" subdir_assigns:" ${subdir_assigns} "EOM")

  # foreach() over an empty list is a no-op, so no separate emptiness check is needed.
  foreach(assignment IN LISTS subdir_assigns)
    if("${assignment}" STREQUAL "")
      # Tolerate stray separators such as "A=1;;B=2" or a trailing ";".
      continue()
    endif()
    message("assignment: " ${assignment})

    # Locate the first "="; -1 means it is missing, 0 means the name is empty.
    string(FIND "${assignment}" "=" eq_pos)
    if(eq_pos LESS 1)
      message(FATAL_ERROR
              "add_rocm_subdir(${subdir}): expected VARNAME=VALUE but got '${assignment}'")
    endif()

    string(SUBSTRING "${assignment}" 0 ${eq_pos} var_name)
    math(EXPR value_pos "${eq_pos} + 1")
    # A length of -1 takes everything up to the end of the string.
    string(SUBSTRING "${assignment}" ${value_pos} -1 var_value)

    # Set variable locally for this function and for the subdirectory
    set(${var_name} "${var_value}")
    message("The value of ${var_name} is: ${${var_name}}")
  endforeach()

  add_subdirectory(${subdir})
endfunction()
# Make the helper modules in cmake_modules/ visible to include().
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules")
include(utils)

## Get version strings
# NOTE(review): get_version() is provided by the utils module; the VERSION_*
# variables used below are presumably populated by this call -- confirm there.
get_version("1.18.0")

# Allow ROCM_PATCH_VERSION to override the patch level. The variable is tested
# by name instead of via if(${ROCM_PATCH_VERSION}) so that a non-numeric value
# is not re-interpreted as the name of another variable.
if(ROCM_PATCH_VERSION)
  set(VERSION_PATCH ${ROCM_PATCH_VERSION})
endif()

set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
set(PACKAGE_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_COMMIT_COUNT}")

# BUILD_SHARED_LIBS is already defaulted and cached near the top of this file,
# so it is always defined here and needs no second default.

# Set hsa pkg dependency with rocprofiler-register package
# for Shared Library Only.
if(BUILD_SHARED_LIBS)
  set(HSA_DEP_ROCPROFILER_REGISTER ON CACHE INTERNAL "")
endif()

if(HSA_DEP_ROCPROFILER_REGISTER)
  # NOTE(review): CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS is assigned with set()
  # later in this file, which discards this append; the dependency is appended
  # again after that assignment. Confirm whether the RPM variable is treated
  # the same way further down.
  string(APPEND CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS ", rocprofiler-register")
  string(APPEND CPACK_RPM_BINARY_PACKAGE_REQUIRES " rocprofiler-register")
endif()

# The virtio thunk subdirectory is only added when BUILD_THUNK_VIRTIO is true;
# it defaults to OFF when the caller has not defined it.
if(NOT DEFINED BUILD_THUNK_VIRTIO)
  set(BUILD_THUNK_VIRTIO OFF)
endif()
# libhsakmt is always added; the hsakmt target's outputs are redirected below
# <build>/libhsakmt/.
add_rocm_subdir(libhsakmt "${THUNK_DEFINITIONS}")
set_target_properties(
  hsakmt
  PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/libhsakmt/archive"
             LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/libhsakmt/lib"
             RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/libhsakmt/runtime")

# Optional virtio flavour of the thunk.
if(BUILD_THUNK_VIRTIO)
  add_rocm_subdir(libhsakmt/src/virtio "${THUNK_VIRTIO_DEFINITIONS}")
endif()

# HSA runtime: outputs go below <build>/rocr/, and the hsa-runtime64 target is
# ordered after the thunk target(s) that match the selected library type.
if(BUILD_ROCR)
  add_rocm_subdir(runtime/hsa-runtime "${ROCR_DEFINITIONS}")
  set_target_properties(
    hsa-runtime64
    PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/rocr/archive"
               LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/rocr/lib"
               RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/rocr/runtime")

  if(BUILD_SHARED_LIBS)
    add_dependencies(hsa-runtime64 hsakmt)
    if(BUILD_THUNK_VIRTIO)
      add_dependencies(hsa-runtime64 hsakmt_virtio)
    endif()
  else()
    add_dependencies(hsa-runtime64 hsakmt-staticdrm)
  endif()
endif()
# Recording the package in the user's CMake package registry is opt-in: the
# switch defaults to "off" unless the caller defined it, and is then exposed as
# a BOOL cache entry.
if(NOT DEFINED EXPORT_TO_USER_PACKAGE_REGISTRY)
  set(EXPORT_TO_USER_PACKAGE_REGISTRY "off")
endif()
set(EXPORT_TO_USER_PACKAGE_REGISTRY
    ${EXPORT_TO_USER_PACKAGE_REGISTRY}
    CACHE BOOL "Add cmake package config location to the user's cmake package registry.")

if(${EXPORT_TO_USER_PACKAGE_REGISTRY})
  # Permit export(PACKAGE) to write to the registry.
  set(CMAKE_EXPORT_PACKAGE_REGISTRY ON)

  # NOTE(review): CORE_RUNTIME_NAME is not assigned anywhere above in this
  # file; presumably a subdirectory makes it visible in this scope -- confirm.
  # Build-tree targets file.
  export(TARGETS ${CORE_RUNTIME_NAME}
         NAMESPACE ${CORE_RUNTIME_NAME}::
         FILE ${CORE_RUNTIME_NAME}Targets.cmake)

  # Register the package in the user's cache.
  export(PACKAGE ${CORE_RUNTIME_NAME})
endif()
## Packaging directives
set(CPACK_VERBOSE 1)
set(CPACK_GENERATOR
    "DEB;RPM"
    CACHE STRING "Package types to build")
set(ENABLE_LDCONFIG
    ON
    CACHE BOOL "Set library links and caches using ldconfig.")

# From libhsakmt: packages install under the configured install prefix.
set(CPACK_PACKAGING_INSTALL_PREFIX
    "${CMAKE_INSTALL_PREFIX}"
    CACHE STRING "Default packaging prefix.")
if(DEFINED CPACK_PACKAGING_INSTALL_PREFIX)
  # NOTE(review): the two paths are joined by a space into a single string;
  # confirm the RPM generator treats that as two separate entries.
  set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION
      "${CPACK_PACKAGING_INSTALL_PREFIX} ${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
endif()

# Component selection: a regular build packages the binary and dev components;
# an ASAN packaging build packages only the asan component (libraries and
# license file).
if(NOT ENABLE_ASAN_PACKAGING)
  set(CPACK_COMPONENTS_ALL binary dev)
else()
  set(CPACK_COMPONENTS_ALL asan)
endif()

# Produce one package per component for both generators.
set(CPACK_DEB_COMPONENT_INSTALL ON)
set(CPACK_RPM_COMPONENT_INSTALL ON)
# Package identity shared by all generators.
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
set(CPACK_PACKAGE_VERSION ${PACKAGE_VERSION_STRING})

# The contact must stay on a single line: a line break inside this value is
# carried verbatim into the generated package metadata.
# NOTE(review): a maintainer contact is normally "Name <email>"; the address is
# not present in this file -- confirm and append it if one is intended.
set(CPACK_PACKAGE_CONTACT "AMD HSA Support")

# Component descriptions.
set(CPACK_COMPONENT_DESCRIPTION "AMD Heterogeneous System Architecture HSA - Linux HSA Runtime for Boltzmann (ROCm) platforms\nIncludes HSAKMT, the user-mode API interfaces used to interact with the ROCk driver.\n Contains the headers, pkgonfig and\n cmake files for ROCT.")
set(CPACK_COMPONENT_BINARY_DESCRIPTION "AMD Heterogeneous System Architecture HSA - Linux HSA Runtime for Boltzmann (ROCm) platforms")
set(CPACK_COMPONENT_DEV_DESCRIPTION "AMD Heterogeneous System Architecture HSA development package.\n This package contains the headers and cmake files for the rocr-runtime package.")
set(CPACK_COMPONENT_ASAN_DESCRIPTION "AMD Heterogeneous System Architecture HSA - Linux HSA instrumented libraries for Boltzmann (ROCm) platforms")

# When ROCM_LIBPATCH_VERSION is present in the environment, append it to the
# package version as an extra dotted component.
if(DEFINED ENV{ROCM_LIBPATCH_VERSION})
  set(CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION}.$ENV{ROCM_LIBPATCH_VERSION}")
  message("Using CPACK_PACKAGE_VERSION ${CPACK_PACKAGE_VERSION}")
endif()
# Debian package specific variables: one package name per component.
set(CPACK_DEBIAN_BINARY_PACKAGE_NAME "hsa-rocr")
set(CPACK_DEBIAN_DEV_PACKAGE_NAME "hsa-rocr-dev")
set(CPACK_DEBIAN_ASAN_PACKAGE_NAME "hsa-rocr-asan")

# Release field: "local" unless CPACK_DEBIAN_PACKAGE_RELEASE is present in the
# environment, in which case the environment value is used.
set(CPACK_DEBIAN_PACKAGE_RELEASE "local")
if(DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
  set(CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE})
endif()
message("Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}")

set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCR-Runtime")

## Process the Debian install/remove scripts to update the CPACK variables
# postinst and prerm are generated from their .in templates with @VAR@
# substitution only; preinst is copied unchanged into the build tree.
foreach(maintainer_script postinst prerm)
  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/Binary/${maintainer_script}.in DEBIAN/Binary/${maintainer_script} @ONLY)
endforeach()
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/preinst DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/DEBIAN)
set(CPACK_DEBIAN_BINARY_PACKAGE_CONTROL_EXTRA "DEBIAN/preinst;DEBIAN/Binary/postinst;DEBIAN/Binary/prerm")
# Debian package relationships and dependencies.
#
# The dev package replaces and provides the legacy hsakmt-roct* and
# hsa-ext-rocr-dev package names.
# Needed since some packages still say they need hsakmt-roct
set(CPACK_DEBIAN_DEV_PACKAGE_REPLACES "hsakmt-roct,hsakmt-roct-dev,hsa-ext-rocr-dev")
set(CPACK_DEBIAN_DEV_PACKAGE_PROVIDES "hsakmt-roct,hsakmt-roct-dev,hsa-ext-rocr-dev")
# TODO: hsa-ext-rocr-dev can be added to the conflicts list, after which
# CPACK_DEBIAN_DEV_PACKAGE_BREAKS (set at the end of this section) can be removed.
set(CPACK_DEBIAN_DEV_PACKAGE_CONFLICTS "hsakmt-roct,hsakmt-roct-dev")
# Package dependencies (variables without a component name in them).
set(CPACK_DEBIAN_PACKAGE_DEPENDS "libdrm-amdgpu-dev | libdrm-dev, rocm-core")
set(CPACK_DEBIAN_PACKAGE_RECOMMENDS "libdrm-amdgpu-dev")
# Dependencies of the devel package; it also depends on the hsa-rocr binary package.
set(CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "libdrm-amdgpu-dev | libdrm-dev, rocm-core, hsa-rocr")
set(CPACK_DEBIAN_DEV_PACKAGE_RECOMMENDS "libdrm-amdgpu-dev")
# Dependencies of the binary and ASAN packages. These are plain set() calls, so
# anything appended to these variables earlier in the file is discarded here.
set(CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS "libdrm-amdgpu-amdgpu1 | libdrm-amdgpu1, libnuma1, libelf1")
set(CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS "libdrm-amdgpu-dev | libdrm-dev, rocm-core-asan, libdrm-amdgpu-amdgpu1 | libdrm-amdgpu1, libnuma1, libelf1")
set(CPACK_DEBIAN_ASAN_PACKAGE_RECOMMENDS "libdrm-amdgpu-dev")
set(CPACK_DEBIAN_BINARY_PACKAGE_RECOMMENDS "libdrm-amdgpu-amdgpu1")
# Optional rocm-core dependency. These appends must stay after the set() calls above.
# NOTE(review): the ASAN list above already contains rocm-core-asan, so the
# second append below adds it a second time -- confirm whether that is intended.
if (ROCM_DEP_ROCMCORE)
string(APPEND CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS ", rocm-core")
string(APPEND CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS ", rocm-core-asan")
endif()
# Optional rocprofiler-register dependency; only the binary package list is extended.
if (HSA_DEP_ROCPROFILER_REGISTER)
string(APPEND CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS ", rocprofiler-register")
endif()
# Declare package relationships (hsa-ext-rocr-dev is a legacy package that we subsume)
set(CPACK_DEBIAN_DEV_PACKAGE_BREAKS "hsa-ext-rocr-dev")
# RPM package specific variables
# EL7_DISTRO is filled in by the project helper Checksetel7 (defined outside
# this section); it is compared against the string "TRUE" further down.
set(EL7_DISTRO "FALSE")
Checksetel7(EL7_DISTRO)
set(CPACK_RPM_BINARY_PACKAGE_NAME "hsa-rocr")
# Since we changed the package name to match RPM specs, take care of older builds that had -dev installed
# Also cover the fact that this now replaces the old binary package hsakmt-roct
set(CPACK_RPM_DEV_PACKAGE_PROVIDES "hsakmt-roct,hsakmt-roct-devel,hsakmt-roct-dev,hsa-ext-rocr-dev")
set(CPACK_RPM_DEV_PACKAGE_OBSOLETES "hsakmt-roct,hsakmt-roct-devel,hsakmt-roct-dev,hsa-ext-rocr-dev")
set(CPACK_RPM_DEV_PACKAGE_NAME "hsa-rocr-devel")
set(CPACK_RPM_ASAN_PACKAGE_NAME "hsa-rocr-asan")

# The release suffix comes from the environment when provided, else "local";
# the distribution tag macro is appended in either case.
if (DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE})
  set(CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE})
else()
  set(CPACK_RPM_PACKAGE_RELEASE "local")
endif()
string(APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}")
set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}")
set(CPACK_RPM_PACKAGE_LICENSE "NCSA")

## Process the Rpm install/remove scripts to update the CPACK variables
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/RPM/Binary/post.in" RPM/Binary/post @ONLY)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/RPM/Binary/postun.in" RPM/Binary/postun @ONLY)
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/RPM/preinst DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/RPM)
set (CPACK_RPM_PRE_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/preinst")
set(CPACK_RPM_BINARY_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/Binary/post")
set(CPACK_RPM_BINARY_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/Binary/postun")

# package dependencies
set(CPACK_RPM_DEV_PACKAGE_REQUIRES "rocm-core , hsa-rocr")
#
# Compare via the variable name rather than ${EL7_DISTRO}: the expanded form is
# dereferenced a second time by if() and is a syntax error when the value is
# empty.
if (EL7_DISTRO STREQUAL "TRUE")
  # EL7 branch: plain dependency names only (no "or" expressions).
  set(CPACK_RPM_BINARY_PACKAGE_REQUIRES "libdrm-amdgpu, numactl-libs")
  set(CPACK_RPM_ASAN_PACKAGE_REQUIRES "libdrm-amdgpu, numactl-libs, libdrm-amdgpu-devel")
  set(CPACK_RPM_PACKAGE_REQUIRES "libdrm-amdgpu-devel")
  string(APPEND CPACK_RPM_DEV_PACKAGE_REQUIRES ", libdrm-amdgpu-devel")
else()
  set(CPACK_RPM_BINARY_PACKAGE_REQUIRES "(libdrm-amdgpu or libdrm or libdrm_amdgpu1), (libnuma1 or numactl-libs)")
  set(CPACK_RPM_ASAN_PACKAGE_REQUIRES "(libdrm-amdgpu or libdrm or libdrm_amdgpu1), (libnuma1 or numactl-libs), (libdrm-amdgpu-devel or libdrm-devel)")
  # Custom spec template that adds a Recommends: tag for the binary package.
  set(CPACK_RPM_USER_BINARY_SPECFILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/hsa-rocr.spec.in")
  set(CPACK_RPM_PACKAGE_RECOMMENDS "libdrm-amdgpu, libdrm-amdgpu-devel")
  set(CPACK_RPM_PACKAGE_REQUIRES "(libdrm-amdgpu-devel or libdrm-devel)")
  string(APPEND CPACK_RPM_DEV_PACKAGE_REQUIRES ", (libdrm-amdgpu-devel or libdrm-devel)")
  set(CPACK_RPM_DEV_PACKAGE_RECOMMENDS "libdrm-amdgpu-devel")
  set(CPACK_RPM_ASAN_PACKAGE_RECOMMENDS "libdrm-amdgpu-devel")
endif()
# Optional rocm-core dependency.
#  - enabled : add it to the RPM binary/asan requirement lists
#  - disabled: strip any rocm-core / rocm-core-asan entry (with its leading
#              ", " separator when present) from every dependency list
if (ROCM_DEP_ROCMCORE)
  string(APPEND CPACK_RPM_BINARY_PACKAGE_REQUIRES ", rocm-core")
  string(APPEND CPACK_RPM_ASAN_PACKAGE_REQUIRES ", rocm-core-asan")
else()
  # The inputs are quoted so that an empty value or one containing ';' is
  # still passed to string() as exactly one argument.
  string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_PACKAGE_REQUIRES "${CPACK_RPM_PACKAGE_REQUIRES}")
  string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}")
  string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_DEV_PACKAGE_REQUIRES "${CPACK_RPM_DEV_PACKAGE_REQUIRES}")
  string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "${CPACK_DEBIAN_DEV_PACKAGE_DEPENDS}")
  string(REGEX REPLACE ",? ?rocm-core-asan" "" CPACK_RPM_ASAN_PACKAGE_REQUIRES "${CPACK_RPM_ASAN_PACKAGE_REQUIRES}")
  string(REGEX REPLACE ",? ?rocm-core-asan" "" CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_ASAN_PACKAGE_DEPENDS}")
endif()

# Optional rocprofiler-register dependency for the RPM binary package.
if (HSA_DEP_ROCPROFILER_REGISTER)
  string(APPEND CPACK_RPM_BINARY_PACKAGE_REQUIRES ", rocprofiler-register")
endif()
# Static builds get their own package; it inherits the binary package's
# dependency lists as they stand at this point.
if(NOT BUILD_SHARED_LIBS)
  # Suffix package name with static
  set(CPACK_RPM_STATIC_PACKAGE_NAME "hsa-rocr-static-devel")
  set(CPACK_DEBIAN_STATIC_PACKAGE_NAME "hsa-rocr-static-dev")
  set(CPACK_COMPONENT_STATIC_DESCRIPTION "HSA (Heterogeneous System Architecture) core runtime - Linux static libraries")
  set(CPACK_RPM_STATIC_PACKAGE_REQUIRES "${CPACK_RPM_BINARY_PACKAGE_REQUIRES}")
  set(CPACK_DEBIAN_STATIC_PACKAGE_DEPENDS "${CPACK_DEBIAN_BINARY_PACKAGE_DEPENDS}")
endif()
## Hand over to CPack; the CPACK_* settings made above are consumed here.
include(CPack)

# Static package generation: when building static libraries, place the
# binary and dev components in a single "static" component group.
if(NOT BUILD_SHARED_LIBS)
  cpack_add_component_group("static")
  foreach(grouped_component binary dev)
    cpack_add_component(${grouped_component} GROUP static)
  endforeach()
endif()

# Component carrying the ASAN-instrumented libraries.
cpack_add_component(
  asan
  DISPLAY_NAME "ASAN"
  DESCRIPTION "ASAN libraries for rocr-runtime"
)
================================================
FILE: DEBIAN/Binary/postinst.in
================================================
#!/bin/bash
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
set -e

# Register the ROCr library directory with the dynamic linker.
# The value tested below is substituted with ON/OFF when the package is
# built; anything other than ON makes this function a no-op.
do_ldconfig() {
    case "@ENABLE_LDCONFIG@" in
        ON)
            echo @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@ > /etc/ld.so.conf.d/rocr-runtime.conf
            ldconfig
            ;;
    esac
}

# Dispatch on the maintainer-script action passed as the first argument.
if [ "$1" = "configure" ]; then
    do_ldconfig
elif [ "$1" = "abort-upgrade" ] || [ "$1" = "abort-remove" ] || [ "$1" = "abort-deconfigure" ]; then
    # Nothing to undo; just report which abort action was requested.
    echo "$1"
else
    exit 0
fi
================================================
FILE: DEBIAN/Binary/prerm.in
================================================
#!/bin/bash
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
set -e

# Drop the linker configuration written at install time and refresh the
# linker cache. The value tested below is substituted with ON/OFF when the
# package is built; anything other than ON makes this function a no-op.
rm_ldconfig() {
    if [ "@ENABLE_LDCONFIG@" != "ON" ]; then
        return 0
    fi
    rm -f /etc/ld.so.conf.d/rocr-runtime.conf
    ldconfig
}

# Only "remove" and "upgrade" need work; every other action (including
# "purge") finishes successfully without doing anything.
if [ "$1" = "remove" ] || [ "$1" = "upgrade" ]; then
    rm_ldconfig
fi
================================================
FILE: DEBIAN/Dev/postinst.in
================================================
#!/bin/bash
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
set -e

# Maintainer script for the dev package: on "configure", create the legacy
# <prefix>/hsa/include/hsa symlink pointing at the real include directory.
case "$1" in
  ( configure )
    # Workaround for CPACK directory symlink handling error.
    mkdir -p "@CPACK_PACKAGING_INSTALL_PREFIX@/hsa/include"
    # -n replaces an existing link in place. Without it, re-running this step
    # while the link already exists would follow the link and drop a nested
    # "hsa" link inside the real include directory.
    ln -sfn "../../@CMAKE_INSTALL_INCLUDEDIR@/hsa" "@CPACK_PACKAGING_INSTALL_PREFIX@/hsa/include/hsa"
  ;;
  ( * )
    exit 0
  ;;
esac
================================================
FILE: DEBIAN/Dev/prerm.in
================================================
#!/bin/bash
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
set -e

# Maintainer script for the dev package: on "remove" or "upgrade", delete the
# legacy <prefix>/hsa tree created by the matching postinst script.
case "$1" in
  ( remove | upgrade )
    # Workaround for CPACK directory symlink handling error.
    # Needed for remove and upgrade scenarios since
    # upgrade installs to new folder and old folders need to be cleaned
    # The path is quoted so a prefix containing whitespace can never be split
    # into several arguments to a recursive delete.
    rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/hsa"
  ;;
  ( * )
    exit 0
  ;;
esac
================================================
FILE: DEBIAN/preinst
================================================
#!/bin/bash
echo "Pre-install check for ROCr."

# Look for leftovers of an old installation in /usr/lib. If any are found,
# the user must agree to their removal, otherwise the installation is aborted.
if ls /usr/lib/libhsa-runtime* > /dev/null 2>&1; then
    printf '%s\n' \
        "An old version of libhsa-runtime was found in /usr/lib." \
        "This must be uninstalled before proceeding with the installation" \
        "to avoid potential incompatibilities."
    read -r -p "Do you want to uninstall the old version? [y/N] " response
    case "$response" in
        y)
            rm -rf /usr/lib/libhsa-runtime* || {
                echo "Failed to remove /usr/lib/libhsa-runtime* files."
                echo "Try to uninstall these files manually."
                exit 1
            }
            echo "Old version uninstalled."
            ;;
        *)
            # Anything other than a lone "y" counts as a refusal.
            echo "The old and new versions of ROCm are incompatible. Installation aborted."
            exit 1
            ;;
    esac
fi
================================================
FILE: LICENSE.txt
================================================
The University of Illinois/NCSA
Open Source License (NCSA)
Copyright (c) 2014-2025, Advanced Micro Devices, Inc. All rights reserved.
Developed by:
AMD Research and AMD HSA Software Development
Advanced Micro Devices, Inc.
www.amd.com
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal with the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimers.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimers in
the documentation and/or other materials provided with the distribution.
- Neither the names of Advanced Micro Devices, Inc,
nor the names of its contributors may be used to endorse or promote
products derived from this Software without specific prior written
permission.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS WITH THE SOFTWARE.
================================================
FILE: README.md
================================================
# ROCR Runtime
> [!CAUTION]
> The ROCR-Runtime repository is retired; please use the [ROCm/rocm-systems](https://github.com/ROCm/rocm-systems/tree/develop/projects/rocr-runtime) repository instead.
This ROCm Runtime (ROCr) repo combines 2 previously separate repos into a single repo:
- The HSA Runtime (`hsa-runtime`) for AMD GPU application development and
- The ROCt Thunk Library (`libhsakmt`), a "thunk" interface to the ROCm kernel driver (ROCk), used by the runtime.
================================================
FILE: RPM/Binary/post.in
================================================
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2016-2021, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
# Post-install step for the binary package.
# The value tested below is substituted with ON/OFF when the package is
# built; only ON registers the library directory with the dynamic linker.
case "@ENABLE_LDCONFIG@" in
    ON)
        echo @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@ > /etc/ld.so.conf.d/hsa-rocr.conf
        ldconfig
        ;;
esac
================================================
FILE: RPM/Binary/postun.in
================================================
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2016-2021, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
# Post-uninstall step for the binary package.
# $1 is the scriptlet argument: the cleanup runs for rpm remove ($1=0) and
# upgrade ($1=1) operations. The inner test value is substituted with ON/OFF
# when the package is built.
# NOTE(review): on an upgrade rpm runs the old package's postun after the new
# package's post, so this may delete the conf file the new post just wrote -
# confirm that is intended.
if [ "$1" -le 1 ]; then
    if [ "@ENABLE_LDCONFIG@" == "ON" ]; then
        rm -f /etc/ld.so.conf.d/hsa-rocr.conf
        ldconfig
    fi
fi
================================================
FILE: RPM/Dev/post.in
================================================
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2016-2021, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
# Post-install step for the devel package: create the legacy
# <prefix>/hsa/include/hsa symlink pointing at the real include directory.
# Workaround for CPACK directory symlink handling error.
mkdir -p "@CPACK_PACKAGING_INSTALL_PREFIX@/hsa/include"
# -n replaces an existing link in place. Without it, running this step while
# the link already exists would follow the link and drop a nested "hsa" link
# inside the real include directory.
ln -sfn "../../@CMAKE_INSTALL_INCLUDEDIR@/hsa" "@CPACK_PACKAGING_INSTALL_PREFIX@/hsa/include/hsa"
================================================
FILE: RPM/Dev/postun.in
================================================
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2016-2021, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
# Post-uninstall step for the devel package; $1 is the scriptlet argument and
# the cleanup runs when it is 0 or 1.
if [ "$1" -le 1 ]; then
    # Workaround for CPACK directory symlink handling error.
    # Needed for uninstall and upgrade scenarios since
    # upgrade install to new folder and old folders need to be cleaned
    # The path is quoted so a prefix containing whitespace can never be split
    # into several arguments to a recursive delete.
    rm -rf "@CPACK_PACKAGING_INSTALL_PREFIX@/hsa"
fi
================================================
FILE: RPM/hsa-rocr.spec.in
================================================
# Custom spec template for the hsa-rocr binary package; the project's
# CMakeLists.txt selects it through CPACK_RPM_USER_BINARY_SPECFILE.
# Tokens wrapped in at-signs are placeholders that are filled in when the
# final spec file is generated.
# Restore old style debuginfo creation for rpm >= 4.14.
%undefine _debugsource_packages
%undefine _debuginfo_subpackages
# -*- rpm-spec -*-
BuildRoot: %_topdir/@CPACK_PACKAGE_FILE_NAME@@CPACK_RPM_PACKAGE_COMPONENT_PART_PATH@
Summary: @CPACK_RPM_PACKAGE_SUMMARY@
Name: @CPACK_RPM_PACKAGE_NAME@
Version: @CPACK_RPM_PACKAGE_VERSION@
Release: @CPACK_RPM_PACKAGE_RELEASE@
License: @CPACK_RPM_PACKAGE_LICENSE@
Group: @CPACK_RPM_PACKAGE_GROUP@
Vendor: @CPACK_RPM_PACKAGE_VENDOR@
# Modifications to allow recommends to be used (not implemented in cpack):
# a Recommends tag is emitted only when the substituted list is non-empty.
%if "@CPACK_RPM_PACKAGE_RECOMMENDS@" != ""
Recommends: @CPACK_RPM_PACKAGE_RECOMMENDS@
%endif
# End of modifications
@TMP_RPM_URL@
@TMP_RPM_REQUIRES@
@TMP_RPM_REQUIRES_PRE@
@TMP_RPM_REQUIRES_POST@
@TMP_RPM_REQUIRES_PREUN@
@TMP_RPM_REQUIRES_POSTUN@
@TMP_RPM_PROVIDES@
@TMP_RPM_OBSOLETES@
@TMP_RPM_CONFLICTS@
@TMP_RPM_SUGGESTS@
@TMP_RPM_AUTOPROV@
@TMP_RPM_AUTOREQ@
@TMP_RPM_AUTOREQPROV@
@TMP_RPM_BUILDARCH@
@TMP_RPM_PREFIXES@
@TMP_RPM_EPOCH@
@TMP_RPM_DEBUGINFO@
%define _rpmdir %_topdir/RPMS
%define _srcrpmdir %_topdir/SRPMS
@FILE_NAME_DEFINE@
%define _unpackaged_files_terminate_build 0
@TMP_RPM_SPEC_INSTALL_POST@
@CPACK_RPM_SPEC_MORE_DEFINE@
@CPACK_RPM_COMPRESSION_TYPE_TMP@
%description
@CPACK_RPM_PACKAGE_DESCRIPTION@
# This is a shortcutted spec file generated by CMake RPM generator
# we skip _install step because CPack does that for us.
# We do only save CPack installed tree in _prepr
# and then restore it in build.
%prep
mv $RPM_BUILD_ROOT %_topdir/tmpBBroot
%install
if [ -e $RPM_BUILD_ROOT ];
then
rm -rf $RPM_BUILD_ROOT
fi
mv %_topdir/tmpBBroot $RPM_BUILD_ROOT
@TMP_RPM_DEBUGINFO_INSTALL@
%clean
%post
@RPM_SYMLINK_POSTINSTALL@
@CPACK_RPM_SPEC_POSTINSTALL@
%posttrans
@CPACK_RPM_SPEC_POSTTRANS@
%postun
@CPACK_RPM_SPEC_POSTUNINSTALL@
%pre
@CPACK_RPM_SPEC_PREINSTALL@
%pretrans
@CPACK_RPM_SPEC_PRETRANS@
%preun
@CPACK_RPM_SPEC_PREUNINSTALL@
%files
%defattr(@TMP_DEFAULT_FILE_PERMISSIONS@,@TMP_DEFAULT_USER@,@TMP_DEFAULT_GROUP@,@TMP_DEFAULT_DIR_PERMISSIONS@)
@CPACK_RPM_INSTALL_FILES@
@CPACK_RPM_ABSOLUTE_INSTALL_FILES@
@CPACK_RPM_USER_INSTALL_FILES@
%changelog
@CPACK_RPM_SPEC_CHANGELOG@
@TMP_OTHER_COMPONENTS@
================================================
FILE: RPM/preinst
================================================
#!/bin/bash
echo "Pre-install check for ROCr."

# Look for leftovers of an old installation in /usr/lib. If any are found,
# the user must agree to their removal, otherwise the installation is aborted.
if ls /usr/lib/libhsa-runtime* > /dev/null 2>&1; then
    printf '%s\n' \
        "An old version of libhsa-runtime was found in /usr/lib." \
        "This must be uninstalled before proceeding with the installation" \
        "to avoid potential incompatibilities."
    read -r -p "Do you want to uninstall the old version? [y/N] " response
    case "$response" in
        y)
            rm -rf /usr/lib/libhsa-runtime* || {
                echo "Failed to remove /usr/lib/libhsa-runtime* files."
                echo "Try to uninstall these files manually."
                exit 1
            }
            echo "Old version uninstalled."
            ;;
        *)
            # Anything other than a lone "y" counts as a refusal.
            echo "The old and new versions of ROCm are incompatible. Installation aborted."
            exit 1
            ;;
    esac
fi
================================================
FILE: _clang-format
================================================
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
ConstructorInitializerIndentWidth: 4
AlignEscapedNewlinesLeft: false
AlignTrailingComments: true
AlignConsecutiveAssignments: false
AlignOperands: false
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AlwaysBreakAfterDefinitionReturnType: false
AlwaysBreakTemplateDeclarations: false
AlwaysBreakBeforeMultilineStrings: true
BreakBeforeBinaryOperators: false
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BinPackParameters: true
ColumnLimit: 100
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ExperimentalAutoDetectBinPacking: false
IndentCaseLabels: true
IndentWrappedFunctionNames: false
IndentFunctionDeclarationAfterType: false
MaxEmptyLinesToKeep: 2
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: None
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakString: 1000
PenaltyBreakFirstLessLess: 120
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
DerivePointerAlignment: false
PointerAlignment: Left
SpacesBeforeTrailingComments: 2
Cpp11BracedListStyle: true
Standard: Auto
IndentWidth: 2
TabWidth: 8
UseTab: Never
BreakBeforeBraces: Attach
SpacesInParentheses: false
SpacesInAngles: false
SpaceInEmptyParentheses: false
SpacesInCStyleCastParentheses: false
SpacesInContainerLiterals: true
SpaceBeforeAssignmentOperators: true
ContinuationIndentWidth: 4
CommentPragmas: '^ IWYU pragma:'
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
SpaceBeforeParens: ControlStatements
DisableFormat: false
SortIncludes: false
...
================================================
FILE: clang-format-diff.py
================================================
#!/usr/bin/env python3
#
#===- clang-format-diff.py - ClangFormat Diff Reformatter ----*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#
"""
This script reads input from a unified diff and reformats all the changed
lines. This is useful to reformat all the lines touched by a specific patch.
Example usage for git/svn users:
git diff -U0 --no-color --relative HEAD^ | clang-format-diff.py -p1 -i
svn diff --diff-cmd=diff -x-U0 | clang-format-diff.py -i
It should be noted that the filename contained in the diff is used unmodified
to determine the source file to update. Users calling this script directly
should be careful to ensure that the path in the diff is correct relative to the
current working directory.
"""
from __future__ import absolute_import, division, print_function
import argparse
import difflib
import re
import subprocess
import sys
if sys.version_info.major >= 3:
from io import StringIO
else:
from io import BytesIO as StringIO
def main():
  """Reformat the lines touched by a unified diff read from stdin.

  Command-line options select which files are considered (-regex / -iregex),
  how many leading path components are stripped from diff file names (-p),
  and whether clang-format edits files in place (-i) or a diff of the
  proposed changes is written to stdout.

  Exits with clang-format's return code if it fails for any file.
  Raises RuntimeError if the clang-format binary cannot be started.
  """
  parser = argparse.ArgumentParser(
      description=__doc__,
      formatter_class=argparse.RawDescriptionHelpFormatter)
  parser.add_argument('-i', action='store_true', default=False,
                      help='apply edits to files instead of displaying a diff')
  # The value is spliced into a regex quantifier below, so only accept
  # integers; anything else is rejected by argparse with a usage error.
  parser.add_argument('-p', metavar='NUM', default=0, type=int,
                      help='strip the smallest prefix containing P slashes')
  parser.add_argument('-regex', metavar='PATTERN', default=None,
                      help='custom pattern selecting file paths to reformat '
                      '(case sensitive, overrides -iregex)')
  parser.add_argument('-iregex', metavar='PATTERN', default=
                      r'.*\.(cpp|cc|c\+\+|cxx|c|cl|h|hh|hpp|hxx|m|mm|inc|js|ts'
                      r'|proto|protodevel|java|cs)',
                      help='custom pattern selecting file paths to reformat '
                      '(case insensitive, overridden by -regex)')
  parser.add_argument('-sort-includes', action='store_true', default=False,
                      help='let clang-format sort include blocks')
  parser.add_argument('-v', '--verbose', action='store_true',
                      help='be more verbose, ineffective without -i')
  parser.add_argument('-style',
                      help='formatting style to apply (LLVM, GNU, Google, Chromium, '
                      'Microsoft, Mozilla, WebKit)')
  parser.add_argument('-binary', default='clang-format',
                      help='location of binary to use for clang-format')
  args = parser.parse_args()

  # Extract changed lines for each file.
  filename = None
  lines_by_file = {}
  for line in sys.stdin:
    # A '+++ path' header switches the file that following hunks belong to.
    match = re.search(r'^\+\+\+\ (.*?/){%s}(\S*)' % args.p, line)
    if match:
      filename = match.group(2)
    if filename is None:
      continue

    if args.regex is not None:
      if not re.match('^%s$' % args.regex, filename):
        continue
    else:
      if not re.match('^%s$' % args.iregex, filename, re.IGNORECASE):
        continue

    # Hunk header: '@@ -a,b +start,count @@'; count defaults to 1 if omitted.
    match = re.search(r'^@@.*\+(\d+)(,(\d+))?', line)
    if match:
      start_line = int(match.group(1))
      line_count = 1
      if match.group(3):
        line_count = int(match.group(3))
      if line_count == 0:
        # Pure deletion: no lines on the new side to reformat.
        continue
      end_line = start_line + line_count - 1
      lines_by_file.setdefault(filename, []).extend(
          ['-lines', str(start_line) + ':' + str(end_line)])

  # Reformat files containing changes in place.
  for filename, lines in lines_by_file.items():
    if args.i and args.verbose:
      print('Formatting {}'.format(filename))
    command = [args.binary, filename]
    if args.i:
      command.append('-i')
    if args.sort_includes:
      command.append('-sort-includes')
    command.extend(lines)
    if args.style:
      command.extend(['-style', args.style])

    try:
      p = subprocess.Popen(command,
                           stdout=subprocess.PIPE,
                           stderr=None,
                           stdin=subprocess.PIPE,
                           universal_newlines=True)
    except OSError as e:
      # Give the user more context when clang-format isn't
      # found/isn't executable, etc.
      raise RuntimeError(
          'Failed to run "%s" - %s' % (" ".join(command), e.strerror))

    # stderr is not piped (stderr=None), so only stdout carries data here.
    stdout, _ = p.communicate()
    if p.returncode != 0:
      sys.exit(p.returncode)

    if not args.i:
      # Without -i clang-format prints the formatted file; show it as a diff.
      with open(filename) as f:
        code = f.readlines()
      formatted_code = StringIO(stdout).readlines()
      diff = difflib.unified_diff(code, formatted_code,
                                  filename, filename,
                                  '(before formatting)', '(after formatting)')
      diff_string = ''.join(diff)
      if diff_string:
        sys.stdout.write(diff_string)
if __name__ == '__main__':
main()
================================================
FILE: cmake_modules/utils.cmake
================================================
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2014-2017, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
## Locates a library or a file and stores the directory that contains it in
## the cache variable named by CACHED_PATH.
##   LIB          - true selects find_library(), false selects find_file().
##   CACHED_PATH  - name of the cache variable; its current value is used as
##                  the first search hint and is kept if it disagrees with the
##                  search result (a warning is printed in that case).
##   HELP         - help text stored with the cache variable.
##   NAMES ...    - candidate names to search for.
##   HINTS ...    - additional search locations.
##   RESULT <var> - optional; <var> receives the success condition in the
##                  caller's scope.
function( get_path LIB CACHED_PATH HELP )
  set( options "" )
  set( oneValueArgs RESULT )
  set( multiValueArgs HINTS NAMES )
  cmake_parse_arguments( ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} )

  # Search for canary file.
  if( ${LIB} )
    find_library( FULLPATH NAMES ${ARGS_NAMES} HINTS ${${CACHED_PATH}} ${ARGS_HINTS} )
  else()
    find_file( FULLPATH NAMES ${ARGS_NAMES} HINTS ${${CACHED_PATH}} ${ARGS_HINTS} )
  endif()

  # RESULT stores the condition as a token list; it is evaluated wherever
  # ${RESULT} is expanded inside an if(), e.g. in the elseif() below.
  set( RESULT (NOT ${FULLPATH} MATCHES NOTFOUND) )

  # Extract path
  get_filename_component ( DIRPATH ${FULLPATH} DIRECTORY )

  # Check path against cache
  if( NOT "${${CACHED_PATH}}" STREQUAL "" )
    if ( NOT "${${CACHED_PATH}}" STREQUAL "${DIRPATH}" )
      message(WARNING "${CACHED_PATH} may be incorrect." )
      set( DIRPATH ${${CACHED_PATH}} )
    endif()
  elseif(NOT ${RESULT})
    message(WARNING "${CACHED_PATH} not located during path search.")
  endif()

  # Set cache variable and help text
  set( ${CACHED_PATH} ${DIRPATH} CACHE PATH ${HELP} FORCE )
  # FULLPATH is a find_* cache entry; drop it so the next call searches again.
  unset( FULLPATH CACHE )

  # Return success flag.
  # Quoted so the result variable's *name* is compared, not its current value:
  # unquoted, a caller variable that already exists but is empty would make
  # this test false and the result would never be returned.
  if( NOT "${ARGS_RESULT}" STREQUAL "" )
    set( ${ARGS_RESULT} ${RESULT} PARENT_SCOPE)
  endif()
endfunction()
## Searches for a file using include paths and stores the directory containing that file
## in the cache, using the cached value if set. Search paths are optional. Returns success
## in RESULT.
## Forwards every argument to get_path() with LIB set to 0, which selects find_file().
## get_include_path( <CACHED_PATH> <HELP> NAMES name1 [name2...] [HINTS path1 [path2 ... ENV var]] [RESULT <var>] )
macro( get_include_path CACHED_PATH HELP )
get_path( 0 ${ARGV} )
endmacro()
## Searches for a file using library paths and stores the directory containing that file
## in the cache, using the cached value if set. Search paths are optional. Returns success
## in RESULT.
## Forwards every argument to get_path() with LIB set to 1, which selects find_library().
## get_library_path( <CACHED_PATH> <HELP> NAMES name1 [name2...] [HINTS path1 [path2 ... ENV var]] [RESULT <var>] )
macro( get_library_path CACHED_PATH HELP )
get_path( 1 ${ARGV} )
endmacro()
## Parses the VERSION_STRING variable and places
## the first, second and third number values in
## the VERSION_MAJOR, VERSION_MINOR and VERSION_PATCH
## variables of the caller's scope. A variable is only
## set when the corresponding number is present.
function( parse_version VERSION_STRING )
  # VERSION_STRING is quoted throughout so that an empty string is passed as
  # an (empty) argument instead of vanishing and breaking the string() calls.
  string ( FIND "${VERSION_STRING}" "-" STRING_INDEX )
  if ( ${STRING_INDEX} GREATER -1 )
    # Everything after the first '-' is the build suffix.
    # NOTE(review): VERSION_BUILD is local to this function and is not
    # propagated to the caller - confirm whether that is intended.
    math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" )
    string ( SUBSTRING "${VERSION_STRING}" ${STRING_INDEX} -1 VERSION_BUILD )
  endif ()

  # Collect every run of digits, in order of appearance.
  string ( REGEX MATCHALL "[0123456789]+" VERSIONS "${VERSION_STRING}" )
  list ( LENGTH VERSIONS VERSION_COUNT )

  if ( ${VERSION_COUNT} GREATER 0)
    list ( GET VERSIONS 0 MAJOR )
    set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE )
  endif ()

  if ( ${VERSION_COUNT} GREATER 1 )
    list ( GET VERSIONS 1 MINOR )
    set ( VERSION_MINOR ${MINOR} PARENT_SCOPE )
  endif ()

  if ( ${VERSION_COUNT} GREATER 2 )
    list ( GET VERSIONS 2 PATCH )
    set ( VERSION_PATCH ${PATCH} PARENT_SCOPE )
  endif ()
endfunction ()
## Gets the current version of the repository.
## The major/minor/patch numbers are parsed from DEFAULT_VERSION_STRING; the
## commit count, short hash and dirty marker are queried from git when a git
## executable is found. Sets in the caller's scope:
##   VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH,
##   VERSION_COMMIT_COUNT (0 if unavailable),
##   VERSION_HASH ("unknown" if unavailable, "-dirty" suffix for a modified tree),
##   VERSION_JOB ("local-build" unless the ROCM_BUILD_ID environment variable is set).
function ( get_version DEFAULT_VERSION_STRING )
  set( VERSION_JOB "local-build" )
  set( VERSION_COMMIT_COUNT 0 )
  set( VERSION_HASH "unknown" )

  find_program( GIT NAMES git )

  if( GIT )
    #execute_process ( COMMAND git describe --tags --dirty --long
    # WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
    # OUTPUT_VARIABLE GIT_TAG_STRING
    # OUTPUT_STRIP_TRAILING_WHITESPACE
    # RESULT_VARIABLE RESULT )

    # All commands below run the executable located by find_program() so that
    # a user-supplied GIT cache entry is honoured.

    # Get branch commit (common ancestor) of current branch and master branch.
    execute_process(COMMAND "${GIT}" merge-base HEAD origin/HEAD
                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
                    OUTPUT_VARIABLE GIT_MERGE_BASE
                    OUTPUT_STRIP_TRAILING_WHITESPACE
                    RESULT_VARIABLE RESULT )

    if( ${RESULT} EQUAL 0 )
      # Count commits from branch point.
      execute_process(COMMAND "${GIT}" rev-list --count ${GIT_MERGE_BASE}..HEAD
                      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
                      OUTPUT_VARIABLE VERSION_COMMIT_COUNT
                      OUTPUT_STRIP_TRAILING_WHITESPACE
                      RESULT_VARIABLE RESULT )
      if(NOT ${RESULT} EQUAL 0 )
        set( VERSION_COMMIT_COUNT 0 )
      endif()
    endif()

    # Get current short hash.
    execute_process(COMMAND "${GIT}" rev-parse --short HEAD
                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
                    OUTPUT_VARIABLE VERSION_HASH
                    OUTPUT_STRIP_TRAILING_WHITESPACE
                    RESULT_VARIABLE RESULT )

    if( ${RESULT} EQUAL 0 )
      # Check for dirty workspace: `git diff --quiet` exits with 1 when the
      # working tree differs from the index.
      execute_process(COMMAND "${GIT}" diff --quiet
                      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
                      RESULT_VARIABLE RESULT )
      if(${RESULT} EQUAL 1)
        set(VERSION_HASH "${VERSION_HASH}-dirty")
      endif()
    else()
      set( VERSION_HASH "unknown" )
    endif()
  endif()

  # Build automation IDs
  if(DEFINED ENV{ROCM_BUILD_ID})
    set( VERSION_JOB $ENV{ROCM_BUILD_ID} )
  endif()

  # parse_version() sets VERSION_MAJOR/MINOR/PATCH in this scope; they are
  # forwarded to our caller below.
  parse_version(${DEFAULT_VERSION_STRING})

  set( VERSION_MAJOR "${VERSION_MAJOR}" PARENT_SCOPE )
  set( VERSION_MINOR "${VERSION_MINOR}" PARENT_SCOPE )
  set( VERSION_PATCH "${VERSION_PATCH}" PARENT_SCOPE )
  set( VERSION_COMMIT_COUNT "${VERSION_COMMIT_COUNT}" PARENT_SCOPE )
  set( VERSION_HASH "${VERSION_HASH}" PARENT_SCOPE )
  set( VERSION_JOB "${VERSION_JOB}" PARENT_SCOPE )

  #message("${VERSION_MAJOR}" )
  #message("${VERSION_MINOR}" )
  #message("${VERSION_PATCH}" )
  #message("${VERSION_COMMIT_COUNT}")
  #message("${VERSION_HASH}")
  #message("${VERSION_JOB}")
endfunction()
## Collects subdirectory names and returns them in a list.
##   DIRPATH        - directory whose immediate children are inspected.
##   SUBDIRECTORIES - name of the variable that receives, in the caller's
##                    scope, the names (relative to DIRPATH) of the children
##                    that are directories.
function ( listsubdirs DIRPATH SUBDIRECTORIES )
  file( GLOB CONTENTS RELATIVE ${DIRPATH} "${DIRPATH}/*" )

  # Start from an empty list. (The variable name must not carry a trailing
  # comma: `FOLDERS,` would be a different variable and a FOLDERS value
  # inherited from the caller's scope would leak into the result.)
  set ( FOLDERS "" )
  foreach( ITEM IN LISTS CONTENTS)
    if( IS_DIRECTORY "${DIRPATH}/${ITEM}" )
      list( APPEND FOLDERS ${ITEM} )
    endif()
  endforeach()

  set (${SUBDIRECTORIES} ${FOLDERS} PARENT_SCOPE)
endfunction()
## Detects an EL7 distribution. The variable named by EL7_DISTRO is set to
## TRUE in the caller's scope when `rpm --eval %{?dist}` succeeds and prints
## ".el7"; in every other case the variable is left untouched.
function (Checksetel7 EL7_DISTRO)
  execute_process(
    COMMAND rpm --eval %{?dist}
    OUTPUT_VARIABLE EVAL_RESULT
    RESULT_VARIABLE PROC_RESULT
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  message("RESULT_VARIABLE ${PROC_RESULT} OUTPUT_VARIABLE: ${EVAL_RESULT}")

  # Nothing to report if rpm failed or produced no dist tag.
  if (NOT PROC_RESULT EQUAL "0" OR EVAL_RESULT STREQUAL "")
    return()
  endif()

  if ("${EVAL_RESULT}" STREQUAL ".el7")
    set (${EL7_DISTRO} TRUE PARENT_SCOPE)
  endif()
endfunction()
================================================
FILE: format
================================================
#!/bin/bash
# Reformats the lines changed since HEAD^ with clang-format, using the
# repository's style file. Runs from the repository root so that the paths
# printed by `git diff` resolve and ./clang-format-diff.py is found.

# Abort if we are not inside a git work tree.
root=$(git rev-parse --show-toplevel) || exit 1

# Quoted so that a checkout path containing spaces still works.
pushd "$root" > /dev/null || exit 1
git diff -U0 HEAD^ | ./clang-format-diff.py -p1 -i -style=file
popd > /dev/null
================================================
FILE: libhsakmt/CMakeLists.txt
================================================
################################################################################
##
## Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
##
## MIT LICENSE:
## Permission is hereby granted, free of charge, to any person obtaining a copy of
## this software and associated documentation files (the "Software"), to deal in
## the Software without restriction, including without limitation the rights to
## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
## of the Software, and to permit persons to whom the Software is furnished to do
## so, subject to the following conditions:
##
## The above copyright notice and this permission notice shall be included in all
## copies or substantial portions of the Software.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
## SOFTWARE.
##
################################################################################
cmake_minimum_required ( VERSION 3.6.3 )
set(CMAKE_VERBOSE_MAKEFILE ON)
set ( HSAKMT "hsakmt" )
set ( HSAKMT_PACKAGE "hsakmt-roct" )
set ( HSAKMT_COMPONENT "lib${HSAKMT}" )
set ( HSAKMT_TARGET "${HSAKMT}" )
set(HSAKMT_STATIC_DRM_TARGET "${HSAKMT_TARGET}-staticdrm")
project ( ${HSAKMT_TARGET} VERSION 1.9.0)
# Optionally, build HSAKMT with ccache.
set(ROCM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build")
if (ROCM_CCACHE_BUILD)
find_program(CCACHE_PROGRAM ccache)
if (CCACHE_PROGRAM)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PROGRAM})
else()
message(WARNING "Unable to find ccache. Falling back to real compiler")
endif() # if (CCACHE_PROGRAM)
endif() # if (ROCM_CCACHE_BUILD)
list( PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules" )
## Include common cmake modules
include ( utils )
include ( GNUInstallDirs )
## Setup the package version.
get_version ( "1.0.0" )
set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} )
set ( BUILD_VERSION_MINOR ${VERSION_MINOR} )
set ( BUILD_VERSION_PATCH ${VERSION_PATCH} )
set ( LIB_VERSION_MAJOR 1)
set ( LIB_VERSION_MINOR 0)
if (${ROCM_PATCH_VERSION})
set ( LIB_VERSION_PATCH ${ROCM_PATCH_VERSION} )
else ()
set ( LIB_VERSION_PATCH 6)
endif ()
set ( LIB_VERSION_STRING "${LIB_VERSION_MAJOR}.${LIB_VERSION_MINOR}.${LIB_VERSION_PATCH}" )
if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" )
message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" )
set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" )
endif ()
set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" )
## Compiler flags
set (HSAKMT_C_FLAGS -fPIC -W -Wall -Wextra -Wno-unused-parameter -Wformat-security -Wswitch-default -Wundef -Wshadow -Wpointer-arith -Wbad-function-cast -Wcast-qual -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations -Wredundant-decls -Wunreachable-code -std=gnu99 -fvisibility=hidden)
if ( CMAKE_COMPILER_IS_GNUCC )
set ( HSAKMT_C_FLAGS "${HSAKMT_C_FLAGS}" -Wlogical-op)
endif ()
if ( ${HSAKMT_WERROR} )
set ( HSAKMT_C_FLAGS "${HSAKMT_C_FLAGS}" -Werror )
endif ()
if ( "${CMAKE_BUILD_TYPE}" STREQUAL Release )
set ( HSAKMT_C_FLAGS "${HSAKMT_C_FLAGS}" -O2 )
else ()
set ( HSAKMT_C_FLAGS "${HSAKMT_C_FLAGS}" -g )
endif ()
set ( HSAKMT_LINKER_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/src/libhsakmt.ver" )
## Linker Flags
## Add --enable-new-dtags to generate DT_RUNPATH
set (HSAKMT_LINK_FLAGS "${HSAKMT_LINK_FLAGS} -Wl,--enable-new-dtags -Wl,--version-script=${HSAKMT_LINKER_SCRIPT} -Wl,-soname=${HSAKMT_COMPONENT}.so.${LIB_VERSION_MAJOR} -Wl,-z,nodelete")
## Address Sanitize Flag
if ( ${ADDRESS_SANITIZER} )
set ( HSAKMT_C_FLAGS "${HSAKMT_C_FLAGS}" -fsanitize=address )
set ( HSAKMT_LINK_FLAGS "${HSAKMT_LINK_FLAGS} -fsanitize=address" )
if ( BUILD_SHARED_LIBS )
set ( HSAKMT_LINK_FLAGS "${HSAKMT_LINK_FLAGS} -shared-libsan" )
else ()
set ( HSAKMT_LINK_FLAGS "${HSAKMT_LINK_FLAGS} -static-libsan" )
endif ()
else ()
if ( CMAKE_COMPILER_IS_GNUCC )
set ( HSAKMT_LINK_FLAGS "${HSAKMT_LINK_FLAGS} -Wl,-no-undefined" )
else ()
set ( HSAKMT_LINK_FLAGS "${HSAKMT_LINK_FLAGS} -Wl,-undefined,error" )
endif ()
endif ()
## Source files
set ( HSAKMT_SRC "src/debug.c"
"src/events.c"
"src/fmm.c"
"src/globals.c"
"src/hsakmtmodel.c"
"src/libhsakmt.c"
"src/memory.c"
"src/openclose.c"
"src/perfctr.c"
"src/pmc_table.c"
"src/queues.c"
"src/time.c"
"src/topology.c"
"src/rbtree.c"
"src/spm.c"
"src/version.c"
"src/svm.c"
"src/pc_sampling.c")
## Declare the library target name
add_library (${HSAKMT_TARGET} STATIC "")
## Add sources
target_sources ( ${HSAKMT_TARGET} PRIVATE ${HSAKMT_SRC} )
## Add headers. The public headers need to point at their location in both build and install
## directory layouts. This declaration allows publishing library use data to downstream clients.
# BUILD_INTERFACE applies when the target is used from this build tree,
# INSTALL_INTERFACE when it is consumed through the installed/exported target.
target_include_directories( ${HSAKMT_TARGET}
  PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
  PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/src )
set_property(TARGET ${HSAKMT_TARGET} PROPERTY LINK_FLAGS ${HSAKMT_LINK_FLAGS})
## Set the VERSION and SOVERSION values
set_property ( TARGET ${HSAKMT_TARGET} PROPERTY VERSION "${LIB_VERSION_STRING}" )
set_property ( TARGET ${HSAKMT_TARGET} PROPERTY SOVERSION "${LIB_VERSION_MAJOR}" )
# Provides pkg_check_modules(), used further down to locate libdrm.
find_package(PkgConfig)
# get OS-info for OS-specific build dependencies
get_os_info()
# Check for libraries required for building
find_library(LIBC NAMES c REQUIRED)
find_package(NUMA)
if(NUMA_FOUND)
set(NUMA "${NUMA_LIBRARIES}")
else()
find_library(NUMA NAMES numa REQUIRED)
endif()
message(STATUS "LIBC: " ${LIBC})
message(STATUS "NUMA: " ${NUMA})
## If environment variable DRM_DIR is set, the script
## will pick up the corresponding libraries from that path.
if(DRM_DIR)
list (PREPEND CMAKE_PREFIX_PATH "${DRM_DIR}")
endif()
# The module name passed to pkg_check_modules() is determined by the
# name of file *.pc
pkg_check_modules(DRM REQUIRED IMPORTED_TARGET libdrm)
pkg_check_modules(DRM_AMDGPU REQUIRED IMPORTED_TARGET libdrm_amdgpu)
include_directories(${DRM_AMDGPU_INCLUDE_DIRS})
include_directories(${DRM_INCLUDE_DIRS})
target_link_libraries ( ${HSAKMT_TARGET}
PRIVATE ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread rt ${LIBC} ${NUMA} ${CMAKE_DL_LIBS}
)
target_compile_options(${HSAKMT_TARGET} PRIVATE ${DRM_CFLAGS} ${HSAKMT_C_FLAGS})
include(CheckFunctionExists)
set(CMAKE_REQUIRED_DEFINITIONS -D__USE_GNU=1)
set(CMAKE_REQUIRED_INCLUDES sys/mman.h)
check_function_exists(memfd_create HAVE_MEMFD_CREATE)
if(HAVE_MEMFD_CREATE)
target_compile_definitions(${HSAKMT_TARGET} PRIVATE -DHAVE_MEMFD_CREATE=1)
endif()
## Define default paths and packages.
if( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT )
set ( CMAKE_INSTALL_PREFIX "/opt/rocm" )
endif()
set ( CMAKE_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX} CACHE STRING "Default installation directory." FORCE )
# Installs binaries and exports the library usage data to ${HSAKMT_TARGET}Targets
install ( TARGETS ${HSAKMT_TARGET} EXPORT ${HSAKMT_TARGET}Targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT asan
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT asan )
install ( TARGETS ${HSAKMT_TARGET}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT binary
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT binary )
# Install public headers
install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/${HSAKMT_TARGET} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
COMPONENT dev PATTERN "linux" EXCLUDE PATTERN "*virtio*" EXCLUDE)
# Record our usage data for clients find_package calls.
install ( EXPORT ${HSAKMT_TARGET}Targets
FILE ${HSAKMT_TARGET}Targets.cmake
NAMESPACE ${HSAKMT_TARGET}::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${HSAKMT_TARGET}
COMPONENT dev)
# Adds the target alias hsakmt::hsakmt to the local cmake cache.
# This isn't necessary today. It's harmless preparation for some
# hypothetical future in which we might be included by add_subdirectory()
# in some other project's cmake file. It allows uniform use of find_package
# and target_link_library() without regard to whether a target is external or
# a subdirectory of the current build.
add_library( ${HSAKMT_TARGET}::${HSAKMT_TARGET} ALIAS ${HSAKMT_TARGET} )
# Create cmake configuration files
include(CMakePackageConfigHelpers)
configure_package_config_file(${HSAKMT_TARGET}-config.cmake.in
${HSAKMT_TARGET}-config.cmake
INSTALL_DESTINATION
${CMAKE_INSTALL_LIBDIR}/cmake/${HSAKMT_TARGET} )
write_basic_package_version_file(${HSAKMT_TARGET}-config-version.cmake
VERSION ${BUILD_VERSION_STRING}
COMPATIBILITY
AnyNewerVersion)
install(FILES
${CMAKE_CURRENT_BINARY_DIR}/${HSAKMT_TARGET}-config.cmake
${CMAKE_CURRENT_BINARY_DIR}/${HSAKMT_TARGET}-config-version.cmake
DESTINATION
${CMAKE_INSTALL_LIBDIR}/cmake/${HSAKMT_TARGET}
COMPONENT dev)
# Optionally record the package's find module in the user's package cache.
if ( NOT DEFINED EXPORT_TO_USER_PACKAGE_REGISTRY )
set ( EXPORT_TO_USER_PACKAGE_REGISTRY "off" )
endif()
set ( EXPORT_TO_USER_PACKAGE_REGISTRY ${EXPORT_TO_USER_PACKAGE_REGISTRY}
CACHE BOOL "Add cmake package config location to the user's cmake package registry.")
if(${EXPORT_TO_USER_PACKAGE_REGISTRY})
# Enable writing to the registry
set(CMAKE_EXPORT_PACKAGE_REGISTRY ON)
# Generate a target file for the build
export(TARGETS ${HSAKMT_TARGET} NAMESPACE ${HSAKMT_TARGET}:: FILE ${HSAKMT_TARGET}Targets.cmake)
# Record the package in the user's cache.
export(PACKAGE ${HSAKMT_TARGET})
endif()
# CPACK_PACKAGING_INSTALL_PREFIX is needed in libhsakmt.pc.in
# TODO: Add support for relocatable packages.
configure_file ( libhsakmt.pc.in libhsakmt.pc @ONLY )
install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/libhsakmt.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig COMPONENT dev)
if ( NOT BUILD_SHARED_LIBS)
  ## Create separate target file for static builds
  ## In static builds, libdrm and libdrm_amdgpu need to be linked statically
  add_library (${HSAKMT_STATIC_DRM_TARGET} STATIC "")
  target_sources (${HSAKMT_STATIC_DRM_TARGET} PRIVATE ${HSAKMT_SRC})

  # BUILD_INTERFACE applies when the target is used from this build tree,
  # INSTALL_INTERFACE when it is consumed through the installed/exported target.
  target_include_directories( ${HSAKMT_STATIC_DRM_TARGET}
    PUBLIC
      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
      $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
    PRIVATE
      ${CMAKE_CURRENT_SOURCE_DIR}/src )

  ## Set the link flags and the VERSION and SOVERSION values.
  ## set_property() accepts a single PROPERTY per call; further PROPERTY
  ## keywords would be appended to the first property's value, so each
  ## property gets its own call.
  set_property ( TARGET ${HSAKMT_STATIC_DRM_TARGET} PROPERTY LINK_FLAGS ${HSAKMT_LINK_FLAGS} )
  set_property ( TARGET ${HSAKMT_STATIC_DRM_TARGET} PROPERTY VERSION "${LIB_VERSION_STRING}" )
  set_property ( TARGET ${HSAKMT_STATIC_DRM_TARGET} PROPERTY SOVERSION "${LIB_VERSION_MAJOR}" )

  #Additional search path for static libraries
  # DISTRO_ID is quoted so that an empty/undefined value does not turn the
  # condition into a syntax error.
  if("${DISTRO_ID}" MATCHES "ubuntu")
    set(AMDGPU_STATIC_LIB_PATHS "-L/opt/amdgpu/lib/x86_64-linux-gnu")
  else()
    set(AMDGPU_STATIC_LIB_PATHS "-L/opt/amdgpu/lib64" "-L/opt/amdgpu/lib")
  endif()

  # Link drm_amdgpu and drm library statically
  target_link_libraries ( ${HSAKMT_STATIC_DRM_TARGET}
    PRIVATE pthread rt c numa ${CMAKE_DL_LIBS}
    INTERFACE -Wl,-Bstatic ${AMDGPU_STATIC_LIB_PATHS} ${DRM_AMDGPU_LDFLAGS} ${DRM_LDFLAGS} -Wl,-Bdynamic
  )

  target_compile_options(${HSAKMT_STATIC_DRM_TARGET} PRIVATE ${DRM_CFLAGS} ${HSAKMT_C_FLAGS})

  install ( TARGETS ${HSAKMT_STATIC_DRM_TARGET} EXPORT ${HSAKMT_STATIC_DRM_TARGET}Targets
            ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT binary
            LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT binary)

  install ( EXPORT ${HSAKMT_STATIC_DRM_TARGET}Targets
            FILE ${HSAKMT_STATIC_DRM_TARGET}Targets.cmake
            NAMESPACE ${HSAKMT_STATIC_DRM_TARGET}::
            DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${HSAKMT_TARGET}
            COMPONENT dev)

  add_library( ${HSAKMT_STATIC_DRM_TARGET}::${HSAKMT_STATIC_DRM_TARGET} ALIAS ${HSAKMT_STATIC_DRM_TARGET} )
endif()
###########################
# Packaging directives
###########################
# Use component packaging
set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.")
================================================
FILE: libhsakmt/DEBIAN/postinst.in
================================================
#!/bin/bash
set -e

# Writes the package's library directory to an ld.so.conf.d entry and
# refreshes the linker cache. Does nothing unless the package was built
# with ENABLE_LDCONFIG = ON (the left-hand term is substituted at package
# build time).
do_ldconfig() {
    [ "@ENABLE_LDCONFIG@" == "ON" ] || return 0
    echo @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@ > /@CMAKE_INSTALL_SYSCONFDIR@/ld.so.conf.d/x86_64-libhsakmt.conf
    ldconfig
}

# $1 is the maintainer-script action passed by the package manager.
case "$1" in
    configure)
        do_ldconfig
        ;;
    abort-upgrade|abort-remove|abort-deconfigure)
        echo "$1"
        ;;
    *)
        exit 0
        ;;
esac
================================================
FILE: libhsakmt/DEBIAN/prerm.in
================================================
#!/bin/bash
set -e

# Removes the package's ld.so.conf.d entry and refreshes the linker cache.
# Does nothing unless the package was built with ENABLE_LDCONFIG = ON (the
# left-hand term is substituted at package build time).
rm_ldconfig() {
    [ "@ENABLE_LDCONFIG@" == "ON" ] || return 0
    rm -f /@CMAKE_INSTALL_SYSCONFDIR@/ld.so.conf.d/x86_64-libhsakmt.conf && ldconfig
}

# $1 is the maintainer-script action passed by the package manager.
case "$1" in
    remove|upgrade)
        rm_ldconfig
        ;;
    purge)
        ;;
    *)
        exit 0
        ;;
esac
================================================
FILE: libhsakmt/LICENSE.md
================================================
ROCT-Thunk Interface LICENSE
Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
MIT LICENSE:
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
This product contains software provided by Nginx, Inc. and its contributors.
Copyright (C) 2002-2018 Igor Sysoev
Copyright (C) 2011-2018 Nginx, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
================================================
FILE: libhsakmt/README.md
================================================
# ROCt Library
This repository includes the user-mode API interfaces used to interact with the ROCk driver.
Starting with the 1.7 release, ROCt uses the drm render device. This requires the user to belong to the video group. Add the user account to the video group with the "sudo usermod -a -G video _username_" command if the user is not part of the video group yet.
NOTE: Users of Ubuntu 20.04 will need to add the user to the new "render" group, as Ubuntu has changed the owner:group of /dev/kfd to render:render as of that release
## ROCk Driver
The ROCt library is not a standalone product and requires that you have the correct ROCk driver installed, or are using a compatible upstream kernel.
Please refer to the ROCm documentation under "Getting Started Guide" for a list of supported Operating Systems and kernel versions, as well as supported hardware.
## Building the Thunk
A simple cmake-based system is available for building the thunk. To build the thunk from the ROCT-Thunk-Interface directory, execute:
```bash
mkdir -p build
cd build
cmake ..
make
```
If the hsakmt-roct and hsakmt-roct-dev packages are desired:
```bash
mkdir -p build
cd build
cmake ..
make package
```
If you choose not to build and install packages, manual installation of the binaries and header files can be done via:
```bash
make install
```
NOTE: For older versions of the thunk where hsakmt-dev.txt is present, "make package-dev" and "make install-dev" are required to generate/install the developer packages. Currently, these are created via the "make package" and "make install" commands
## Disclaimer
The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. No license, including implied or arising by estoppel, to any intellectual property rights is granted by this document. Terms and limitations applicable to the purchase or use of AMD's products are as set forth in a signed agreement between the parties or in AMD's Standard Terms and Conditions of Sale.
AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
Copyright (c) 2014-2023 Advanced Micro Devices, Inc. All rights reserved.
================================================
FILE: libhsakmt/RPM/hsakmt-roct-devel.spec.in
================================================
# Restore old style debuginfo creation for rpm >= 4.14.
%undefine _debugsource_packages
%undefine _debuginfo_subpackages
# -*- rpm-spec -*-
BuildRoot: %_topdir/@CPACK_PACKAGE_FILE_NAME@@CPACK_RPM_PACKAGE_COMPONENT_PART_PATH@
Summary: @CPACK_RPM_PACKAGE_SUMMARY@
Name: @CPACK_RPM_PACKAGE_NAME@
Version: @CPACK_RPM_PACKAGE_VERSION@
Release: @CPACK_RPM_PACKAGE_RELEASE@
License: @CPACK_RPM_PACKAGE_LICENSE@
Group: @CPACK_RPM_PACKAGE_GROUP@
Vendor: @CPACK_RPM_PACKAGE_VENDOR@
@TMP_RPM_URL@
@TMP_RPM_REQUIRES@
@TMP_RPM_REQUIRES_PRE@
@TMP_RPM_REQUIRES_POST@
@TMP_RPM_REQUIRES_PREUN@
@TMP_RPM_REQUIRES_POSTUN@
@TMP_RPM_PROVIDES@
@TMP_RPM_OBSOLETES@
@TMP_RPM_CONFLICTS@
@TMP_RPM_SUGGESTS@
@TMP_RPM_AUTOPROV@
@TMP_RPM_AUTOREQ@
@TMP_RPM_AUTOREQPROV@
@TMP_RPM_BUILDARCH@
@TMP_RPM_PREFIXES@
@TMP_RPM_EPOCH@
# Local modification: emit a Recommends tag when one is configured
# (the stock CPack template does not implement it).
%if "@CPACK_RPM_PACKAGE_RECOMMENDS@" != ""
Recommends: @CPACK_RPM_PACKAGE_RECOMMENDS@
%endif
# End of modifications
@TMP_RPM_DEBUGINFO@
%define _rpmdir %_topdir/RPMS
%define _srcrpmdir %_topdir/SRPMS
@FILE_NAME_DEFINE@
%define _unpackaged_files_terminate_build 0
@TMP_RPM_SPEC_INSTALL_POST@
@CPACK_RPM_SPEC_MORE_DEFINE@
@CPACK_RPM_COMPRESSION_TYPE_TMP@
%description
@CPACK_RPM_PACKAGE_DESCRIPTION@
# This is a shortcut spec file generated by the CMake RPM generator.
# The usual install work is skipped because CPack has already done it.
# The prep section only saves the CPack-installed tree to a temporary
# location, and the install section moves it back into the build root.
%prep
mv $RPM_BUILD_ROOT %_topdir/tmpBBroot
%install
if [ -e $RPM_BUILD_ROOT ];
then
rm -rf $RPM_BUILD_ROOT
fi
mv %_topdir/tmpBBroot $RPM_BUILD_ROOT
@TMP_RPM_DEBUGINFO_INSTALL@
%clean
%post
@RPM_SYMLINK_POSTINSTALL@
@CPACK_RPM_SPEC_POSTINSTALL@
%posttrans
@CPACK_RPM_SPEC_POSTTRANS@
%postun
@CPACK_RPM_SPEC_POSTUNINSTALL@
%pre
@CPACK_RPM_SPEC_PREINSTALL@
%pretrans
@CPACK_RPM_SPEC_PRETRANS@
%preun
@CPACK_RPM_SPEC_PREUNINSTALL@
%files
%defattr(@TMP_DEFAULT_FILE_PERMISSIONS@,@TMP_DEFAULT_USER@,@TMP_DEFAULT_GROUP@,@TMP_DEFAULT_DIR_PERMISSIONS@)
@CPACK_RPM_INSTALL_FILES@
@CPACK_RPM_ABSOLUTE_INSTALL_FILES@
@CPACK_RPM_USER_INSTALL_FILES@
%changelog
@CPACK_RPM_SPEC_CHANGELOG@
@TMP_OTHER_COMPONENTS@
================================================
FILE: libhsakmt/RPM/libhsakmt.spec
================================================
# RPM spec for the hsakmt-rocm-dev package (libhsakmt "Thunk" libraries).
# The package version and the pre-built install tree are read from the
# PACKAGE_VER and PACKAGE_DIR environment variables of the rpmbuild process.
%define name hsakmt-rocm-dev
%define version %{getenv:PACKAGE_VER}
%define packageroot %{getenv:PACKAGE_DIR}
Name: %{name}
Version: %{version}
Release: 1
Summary: Thunk libraries for AMD KFD
Group: System Environment/Libraries
License: Advanced Micro Devices Inc.
# The numactl dependency is named differently on CentOS 6.
%if 0%{?centos} == 6
Requires: numactl
%else
Requires: numactl-libs
%endif
%description
This package includes the libhsakmt (Thunk) libraries
for AMD KFD
%prep
%setup -T -D -c -n %{name}
%install
# Copy the pre-built tree into the build root, then record every
# non-directory entry, with the build-root prefix stripped, in thunk.list
# so it can serve as the package file list.
cp -R %packageroot $RPM_BUILD_ROOT
find $RPM_BUILD_ROOT \! -type d | sed "s|$RPM_BUILD_ROOT||"> thunk.list
%post
ldconfig
%postun
ldconfig
%clean
rm -rf $RPM_BUILD_ROOT
%files -f thunk.list
%defattr(-,root,root,-)
================================================
FILE: libhsakmt/RPM/post.in
================================================
# Post-install step: register the libhsakmt library directory with the
# dynamic linker and refresh the linker cache.
# The left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build.
# POSIX "=" and a plain echo are used so the scriptlet does not depend on
# bash-only extensions of the test and echo builtins.
if [ "@ENABLE_LDCONFIG@" = "ON" ]; then
    echo "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@" > /@CMAKE_INSTALL_SYSCONFDIR@/ld.so.conf.d/x86_64-libhsakmt.conf
    ldconfig
fi
================================================
FILE: libhsakmt/RPM/postun.in
================================================
# Post-uninstall step: drop the libhsakmt linker configuration file and
# refresh the linker cache.
# The second term originates from ENABLE_LDCONFIG = ON/OFF at package build.
# The first positional argument is quoted and POSIX "=" is used so the
# scriptlet does not depend on bash-only behaviour.
if [ "$1" -le 1 ] && [ "@ENABLE_LDCONFIG@" = "ON" ]; then
    # perform the below actions for rpm remove($1=0) or upgrade($1=1) operations
    # NOTE(review): on upgrade this may run after the new package's
    # post-install step has written the same file - confirm that removing it
    # here is intended.
    rm -f /@CMAKE_INSTALL_SYSCONFDIR@/ld.so.conf.d/x86_64-libhsakmt.conf
    ldconfig
fi
================================================
FILE: libhsakmt/cmake_modules/utils.cmake
================================================
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2014-2017, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and#or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and#or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
## Parses VERSION_STRING and publishes its components to the caller's scope.
##
## Results (all set with PARENT_SCOPE):
##   VERSION_MAJOR / VERSION_MINOR / VERSION_PATCH
##       The first, second and third runs of decimal digits found in the
##       input; each is set only when that many numbers are present.
##   VERSION_BUILD
##       Everything after the first "-" in the input. When the input has no
##       "-", a VERSION_BUILD already visible from the calling scope is passed
##       through unchanged.
##   VERSION_STRING
##       "major[.minor[.patch]]" rebuilt from the parsed numbers (empty when
##       the input contains no digits).
##
## The input is quoted at every use so that an empty string is handled as a
## single (empty) argument instead of raising an argument-count error.
function( parse_version VERSION_STRING )
  # Split off the build suffix that follows the first dash, if there is one.
  string ( FIND "${VERSION_STRING}" "-" STRING_INDEX )
  if ( STRING_INDEX GREATER -1 )
    math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" )
    string ( SUBSTRING "${VERSION_STRING}" ${STRING_INDEX} -1 VERSION_BUILD )
  endif ()

  # Collect every run of digits; only the first three are used below.
  string ( REGEX MATCHALL "[0123456789]+" VERSIONS "${VERSION_STRING}" )
  list ( LENGTH VERSIONS VERSION_COUNT )

  if ( VERSION_COUNT GREATER 0 )
    list ( GET VERSIONS 0 MAJOR )
    set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE )
    set ( TEMP_VERSION_STRING "${MAJOR}" )
  endif ()

  if ( VERSION_COUNT GREATER 1 )
    list ( GET VERSIONS 1 MINOR )
    set ( VERSION_MINOR ${MINOR} PARENT_SCOPE )
    set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${MINOR}" )
  endif ()

  if ( VERSION_COUNT GREATER 2 )
    list ( GET VERSIONS 2 PATCH )
    set ( VERSION_PATCH ${PATCH} PARENT_SCOPE )
    set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${PATCH}" )
  endif ()

  if ( DEFINED VERSION_BUILD )
    set ( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE )
  endif ()

  set ( VERSION_STRING "${TEMP_VERSION_STRING}" PARENT_SCOPE )
endfunction ()
## Determines the version of the repository.
##
## DEFAULT_VERSION_STRING is parsed first and supplies the fallback values.
## When a git executable is found and "git describe --tags --dirty --long"
## succeeds in CMAKE_CURRENT_SOURCE_DIR, the described tag is parsed as well
## and overrides the components it provides.
##
## Results (set with PARENT_SCOPE): VERSION_STRING, VERSION_MAJOR,
## VERSION_MINOR, VERSION_PATCH and VERSION_BUILD.
function ( get_version DEFAULT_VERSION_STRING )
  # Quoted so that an empty default is still passed as one argument.
  parse_version ( "${DEFAULT_VERSION_STRING}" )

  find_program ( GIT NAMES git )

  if ( GIT )
    # Run the executable that find_program located rather than whatever
    # "git" happens to resolve to when the command is executed.
    execute_process ( COMMAND "${GIT}" describe --tags --dirty --long
                      WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
                      OUTPUT_VARIABLE GIT_TAG_STRING
                      OUTPUT_STRIP_TRAILING_WHITESPACE
                      RESULT_VARIABLE RESULT )

    # RESULT may hold an error message instead of an exit code; referring to
    # the variable by name keeps a multi-word message from breaking if().
    if ( RESULT EQUAL 0 )
      parse_version ( "${GIT_TAG_STRING}" )
    endif ()
  endif ()

  # Forward what parse_version placed in this scope to our own caller.
  set( VERSION_STRING "${VERSION_STRING}" PARENT_SCOPE )
  set( VERSION_MAJOR  "${VERSION_MAJOR}" PARENT_SCOPE )
  set( VERSION_MINOR  "${VERSION_MINOR}" PARENT_SCOPE )
  set( VERSION_PATCH  "${VERSION_PATCH}" PARENT_SCOPE )
  set( VERSION_BUILD  "${VERSION_BUILD}" PARENT_SCOPE )
endfunction()
# Detects the Linux distribution of the build host.
#
# Reads ID and VERSION_ID from /etc/os-release when that file exists, otherwise
# falls back to parsing /etc/centos-release. The results are handed back to the
# caller as DISTRO_ID and DISTRO_RELEASE (set with PARENT_SCOPE).
function(get_os_info)
  set(os_release_path "/etc/os-release")
  set(centos_release_path "/etc/centos-release")

  if(EXISTS "${os_release_path}")
    # Pull the two key=value lines and strip the key part from each.
    file(STRINGS "${os_release_path}" id_entry REGEX "^ID=")
    file(STRINGS "${os_release_path}" version_entry REGEX "^VERSION_ID=")
    string(REPLACE "ID=" "" DISTRO_ID "${id_entry}")
    string(REPLACE "VERSION_ID=" "" DISTRO_RELEASE "${version_entry}")
    message(STATUS "Detected distribution: ${DISTRO_ID}:${DISTRO_RELEASE}")
  elseif(EXISTS "${centos_release_path}")
    # Example line: CentOS release 6.10 (Final)
    file(STRINGS "${centos_release_path}" release_line REGEX "release")
    # Leading word, lower-cased, is the distribution id.
    string(REGEX MATCH "^[a-zA-Z]+" DISTRO_ID "${release_line}")
    string(TOLOWER "${DISTRO_ID}" DISTRO_ID)
    # First run of digits is the release number.
    string(REGEX MATCH "[0-9]+" DISTRO_RELEASE "${release_line}")
    message(STATUS "Detected distribution: ${DISTRO_ID}:${DISTRO_RELEASE}")
  else()
    message(STATUS "Not able to detect OS")
  endif()

  set(DISTRO_ID "${DISTRO_ID}" PARENT_SCOPE )
  set(DISTRO_RELEASE "${DISTRO_RELEASE}" PARENT_SCOPE )
endfunction()
================================================
FILE: libhsakmt/hsakmt-config.cmake.in
================================================
@PACKAGE_INIT@
include( CMakeFindDependencyMacro )
# Locate dependent packages here. Finding them propagates usage requirements,
# if any, to our clients and ensures that their target names are in scope for
# the build. hsakmt has no cmake project dependencies so there is nothing to
# find. If we switch to use find_package with external (to ROCm) library
# dependencies (ie libnuma) then those packages should be located here using
# find_dependency as shown below.
#find_dependency(Bar 2.0)
# Pick the exported targets file that matches how hsakmt was built.
# If BUILD_SHARED_LIBS was ON, link other dependent libraries dynamically.
# If it was OFF, link libdrm and libdrm_amdgpu statically.
if(@BUILD_SHARED_LIBS@)
include( "${CMAKE_CURRENT_LIST_DIR}/@HSAKMT_TARGET@Targets.cmake" )
else()
include( "${CMAKE_CURRENT_LIST_DIR}/@HSAKMT_STATIC_DRM_TARGET@Targets.cmake" )
endif()
================================================
FILE: libhsakmt/include/hsakmt/hsakmt.h
================================================
/*
* Copyright © 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef _HSAKMT_H_
#define _HSAKMT_H_
#include "hsakmttypes.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
"Opens" the HSA kernel driver for user-kernel mode communication.
On Windows, this function gets a handle to the KFD's AMDKFDIO device object that
is responsible for user-kernel communication, this handle is used internally by
the thunk library to send device I/O control to the HSA kernel driver.
No other thunk library function may be called unless the user-kernel communication
channel is opened first.
On Linux this call opens the "/dev/kfd" device file to establish a communication
path to the kernel.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtOpenKFD( void );
/**
"Closes" the user-kernel communication path.
On Windows, the handle obtained by the hsaKmtOpenKFD() function is closed;
no other communication with the kernel driver is possible after the successful
execution of the hsaKmtCloseKFD() function. Depending on the failure reason,
the user-kernel communication path may or may not be still active.
On Linux the function closes the "/dev/kfd" device file.
No further communication to the kernel driver is allowed until hsaKmtOpenKFD()
function is called again.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtCloseKFD( void );
/**
Returns the user-kernel interface version supported by KFD.
Higher major numbers usually add new features to KFD and may break user-kernel
compatibility; higher minor numbers define additional functionality associated
within a major number.
The calling software should validate that it meets the minimum interface version
as described in the API specification.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtGetVersion(
HsaVersionInfo* VersionInfo //OUT
);
/**
The function takes a "snapshot" of the topology information within the KFD
to avoid any changes during the enumeration process.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtAcquireSystemProperties(
HsaSystemProperties* SystemProperties //OUT
);
/**
Releases the topology "snapshot" taken by hsaKmtAcquireSystemProperties()
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtReleaseSystemProperties( void ) ;
/**
Retrieves the discoverable sub-properties for a given HSA
node. The parameters returned allow the application or runtime to size the
management structures necessary to store the information.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtGetNodeProperties(
HSAuint32 NodeId, //IN
HsaNodeProperties* NodeProperties //OUT
);
/**
Retrieves the memory properties of a specific HSA node.
The memory pointer passed as MemoryProperties is sized as
NumBanks * sizeof(HsaMemoryProperties). NumBanks is retrieved with the
hsaKmtGetNodeProperties() call.
Some of the data returned is optional. Not all implementations may return all
parameters in the HsaMemoryProperties.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtGetNodeMemoryProperties(
HSAuint32 NodeId, //IN
HSAuint32 NumBanks, //IN
HsaMemoryProperties* MemoryProperties //OUT
);
/**
Retrieves the cache properties of a specific HSA node and processor ID.
ProcessorID refers to either a CPU core or a SIMD unit as enumerated earlier
via the hsaKmtGetNodeProperties() call.
The memory pointer passed as CacheProperties is sized as
NumCaches * sizeof(HsaCacheProperties). NumCaches is retrieved with the
hsaKmtGetNodeProperties() call.
The data returned is optional. Not all implementations may return all
parameters in the CacheProperties.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtGetNodeCacheProperties(
HSAuint32 NodeId, //IN
HSAuint32 ProcessorId, //IN
HSAuint32 NumCaches, //IN
HsaCacheProperties* CacheProperties //OUT
);
/**
Retrieves the HSA IO affinity properties of a specific HSA node.
The memory pointer passed as IoLinkProperties is sized as
NumIoLinks * sizeof(HsaIoLinkProperties). NumIoLinks is retrieved with the
hsaKmtGetNodeProperties() call.
The data returned is optional. Not all implementations may return all
parameters in the IoLinkProperties.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtGetNodeIoLinkProperties(
HSAuint32 NodeId, //IN
HSAuint32 NumIoLinks, //IN
HsaIoLinkProperties* IoLinkProperties //OUT
);
/**
Creates an operating system event associated with a HSA event ID
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtCreateEvent(
HsaEventDescriptor* EventDesc, //IN
bool ManualReset, //IN
bool IsSignaled, //IN
HsaEvent** Event //OUT
);
/**
Destroys an operating system event associated with a HSA event ID
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtDestroyEvent(
HsaEvent* Event //IN
);
/**
Sets the specified event object to the signaled state
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtSetEvent(
HsaEvent* Event //IN
);
/**
Sets the specified event object to the non-signaled state
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtResetEvent(
HsaEvent* Event //IN
);
/**
Queries the state of the specified event object
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtQueryEventState(
HsaEvent* Event //IN
);
/**
Checks the current state of the event object. If the object's state is
nonsignaled, the calling thread enters the wait state.
The function returns when one of the following occurs:
- The specified event object is in the signaled state.
- The time-out interval elapses.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtWaitOnEvent(
HsaEvent* Event, //IN
HSAuint32 Milliseconds //IN
);
/**
Checks the current state of the event object. If the object's state is
nonsignaled, the calling thread enters the wait state. event_age can
help avoiding race conditions.
The function returns when one of the following occurs:
- The specified event object is in the signaled state.
- The time-out interval elapses.
- Tracking event age
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtWaitOnEvent_Ext(
HsaEvent* Event, //IN
HSAuint32 Milliseconds, //IN
uint64_t *event_age //IN/OUT
);
/**
Checks the current state of multiple event objects.
The function returns when one of the following occurs:
- Either any one or all of the specified objects are in the signaled state
- if "WaitOnAll" is "true" the function returns when the state of all
objects in array is signaled
- if "WaitOnAll" is "false" the function returns when the state of any
one of the objects is set to signaled
- The time-out interval elapses.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtWaitOnMultipleEvents(
HsaEvent* Events[], //IN
HSAuint32 NumEvents, //IN
bool WaitOnAll, //IN
HSAuint32 Milliseconds //IN
);
/**
Checks the current state of multiple event objects.
event_age can help avoiding race conditions.
The function returns when one of the following occurs:
- Either any one or all of the specified objects are in the signaled state
- if "WaitOnAll" is "true" the function returns when the state of all
objects in array is signaled
- if "WaitOnAll" is "false" the function returns when the state of any
one of the objects is set to signaled
- The time-out interval elapses.
- Tracking event age
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtWaitOnMultipleEvents_Ext(
HsaEvent* Events[], //IN
HSAuint32 NumEvents, //IN
bool WaitOnAll, //IN
HSAuint32 Milliseconds, //IN
uint64_t *event_age //IN/OUT
);
/**
New TEMPORARY function definition - to be used only on the "Trinity + Southern Islands" platform.
If used on other platforms the function will return HSAKMT_STATUS_ERROR.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtReportQueue(
HSA_QUEUEID QueueId, //IN
HsaQueueReport* QueueReport //OUT
);
/**
Creates a GPU queue with user-mode access rights
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtCreateQueue(
HSAuint32 NodeId, //IN
HSA_QUEUE_TYPE Type, //IN
HSAuint32 QueuePercentage, //IN
HSA_QUEUE_PRIORITY Priority, //IN
void* QueueAddress, //IN
HSAuint64 QueueSizeInBytes, //IN
HsaEvent* Event, //IN
HsaQueueResource* QueueResource //OUT
);
/**
Creates a GPU queue with user-mode access rights.
Takes the same parameters as hsaKmtCreateQueue() plus SdmaEngineId.
NOTE(review): SdmaEngineId presumably selects the SDMA engine used for SDMA
queue types - confirm against the implementation.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtCreateQueueExt(
HSAuint32 NodeId, //IN
HSA_QUEUE_TYPE Type, //IN
HSAuint32 QueuePercentage, //IN
HSA_QUEUE_PRIORITY Priority, //IN
HSAuint32 SdmaEngineId, //IN
void* QueueAddress, //IN
HSAuint64 QueueSizeInBytes, //IN
HsaEvent* Event, //IN
HsaQueueResource* QueueResource //OUT
);
/**
Updates a queue
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtUpdateQueue(
HSA_QUEUEID QueueId, //IN
HSAuint32 QueuePercentage,//IN
HSA_QUEUE_PRIORITY Priority, //IN
void* QueueAddress, //IN
HSAuint64 QueueSize, //IN
HsaEvent* Event //IN
);
/**
Destroys a queue
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtDestroyQueue(
HSA_QUEUEID QueueId //IN
);
/**
Set cu mask for a queue
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtSetQueueCUMask(
HSA_QUEUEID QueueId, //IN
HSAuint32 CUMaskCount, //IN
HSAuint32* QueueCUMask //IN
);
/**
Retrieves information about a queue.
NOTE(review): QueueInfo was annotated IN, but as the result of a "Get" call it
is presumably filled in by the callee - confirm against the implementation.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtGetQueueInfo(
HSA_QUEUEID QueueId, //IN
HsaQueueInfo *QueueInfo //OUT (presumed; was annotated IN - confirm)
);
/**
Allows an HSA process to set/change the default and alternate memory coherency, before starting to dispatch.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtSetMemoryPolicy(
HSAuint32 Node, //IN
HSAuint32 DefaultPolicy, //IN
HSAuint32 AlternatePolicy, //IN
void* MemoryAddressAlternate, //IN (page-aligned)
HSAuint64 MemorySizeInBytes //IN (page-aligned)
);
/**
Allocates a memory buffer that may be accessed by the GPU
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtAllocMemory(
HSAuint32 PreferredNode, //IN
HSAuint64 SizeInBytes, //IN (multiple of page size)
HsaMemFlags MemFlags, //IN
void** MemoryAddress //IN/OUT (page-aligned)
);
/**
Allocates a memory buffer with specific alignment that may be accessed by the GPU
If Alignment is 0, the smallest possible alignment will be used
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtAllocMemoryAlign(
HSAuint32 PreferredNode, //IN
HSAuint64 SizeInBytes, //IN (multiple of page size)
HSAuint64 Alignment, //IN (power of 2 and >= page size)
HsaMemFlags MemFlags, //IN
void** MemoryAddress //IN/OUT (page-aligned)
);
/**
Frees a memory buffer
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtFreeMemory(
void* MemoryAddress, //IN (page-aligned)
HSAuint64 SizeInBytes //IN
);
/**
Inquires memory available for allocation as a memory buffer
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtAvailableMemory(
HSAuint32 Node, //IN
HSAuint64 *AvailableBytes //OUT (presumed from the name - confirm)
);
/**
Registers with KFD a memory buffer that may be accessed by the GPU
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtRegisterMemory(
void* MemoryAddress, //IN (cache-aligned)
HSAuint64 MemorySizeInBytes //IN (cache-aligned)
);
/**
Registers with KFD a memory buffer that may be accessed by specific GPUs
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtRegisterMemoryToNodes(
void *MemoryAddress, // IN (cache-aligned)
HSAuint64 MemorySizeInBytes, // IN (cache-aligned)
HSAuint64 NumberOfNodes, // IN
HSAuint32* NodeArray // IN
);
/**
Registers with KFD a memory buffer with memory attributes
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtRegisterMemoryWithFlags(
void *MemoryAddress, // IN (cache-aligned)
HSAuint64 MemorySizeInBytes, // IN (cache-aligned)
HsaMemFlags MemFlags // IN
);
/**
Registers with KFD a graphics buffer and returns graphics metadata
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtRegisterGraphicsHandleToNodes(
HSAuint64 GraphicsResourceHandle, //IN
HsaGraphicsResourceInfo *GraphicsResourceInfo, //OUT
HSAuint64 NumberOfNodes, //IN
HSAuint32* NodeArray //IN
);
/**
Similar to hsaKmtRegisterGraphicsHandleToNodes but provides registration
options via RegisterFlags.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtRegisterGraphicsHandleToNodesExt(
HSAuint64 GraphicsResourceHandle, //IN
HsaGraphicsResourceInfo *GraphicsResourceInfo, //OUT
HSAuint64 NumberOfNodes, //IN
HSAuint32* NodeArray, //IN
HSA_REGISTER_MEM_FLAGS RegisterFlags //IN
);
/**
* Export a dmabuf handle and offset for a given memory address
*
* Validates that @MemoryAddress belongs to a valid allocation and that the
* @MemorySizeInBytes doesn't exceed the end of that allocation. Returns a
* dmabuf fd of the allocation and the offset of MemoryAddress within that
* allocation. The memory will remain allocated even after the allocation is
* freed by hsaKmtFreeMemory for as long as a dmabuf fd remains open or any
* importer of that fd maintains an active reference to the memory.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtExportDMABufHandle(
void *MemoryAddress, //IN
HSAuint64 MemorySizeInBytes, //IN
int *DMABufFd, //OUT
HSAuint64 *Offset //OUT
);
/**
Export a memory buffer for sharing with other processes
NOTE: for the current revision of the thunk spec, SizeInBytes
must match whole allocation.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtShareMemory(
void *MemoryAddress, // IN
HSAuint64 SizeInBytes, // IN
HsaSharedMemoryHandle *SharedMemoryHandle // OUT
);
/**
Register shared memory handle
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtRegisterSharedHandle(
const HsaSharedMemoryHandle *SharedMemoryHandle, // IN
void **MemoryAddress, // OUT
HSAuint64 *SizeInBytes // OUT
);
/**
Register shared memory handle to specific nodes only
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtRegisterSharedHandleToNodes(
const HsaSharedMemoryHandle *SharedMemoryHandle, // IN
void **MemoryAddress, // OUT
HSAuint64 *SizeInBytes, // OUT
HSAuint64 NumberOfNodes, // IN (passed by value; was annotated OUT)
HSAuint32* NodeArray // IN (presumed, matching the other *ToNodes calls; was annotated OUT - confirm)
);
/**
Copy data from the GPU address space of the process identified
by Pid. Size Copied will return actual amount of data copied.
If return is not SUCCESS, partial copies could have happened.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtProcessVMRead(
HSAuint32 Pid, // IN
HsaMemoryRange *LocalMemoryArray, // IN
HSAuint64 LocalMemoryArrayCount, // IN
HsaMemoryRange *RemoteMemoryArray, // IN
HSAuint64 RemoteMemoryArrayCount, // IN
HSAuint64 *SizeCopied // OUT
);
/**
Write data to the GPU address space of the process identified
by Pid. See also hsaKmtProcessVMRead.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtProcessVMWrite(
HSAuint32 Pid, // IN
HsaMemoryRange *LocalMemoryArray, // IN
HSAuint64 LocalMemoryArrayCount, // IN
HsaMemoryRange *RemoteMemoryArray, // IN
HSAuint64 RemoteMemoryArrayCount, // IN
HSAuint64 *SizeCopied // OUT
);
/**
Unregisters with KFD a memory buffer
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtDeregisterMemory(
void* MemoryAddress //IN
);
/**
Ensures that the memory is resident and can be accessed by GPU
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtMapMemoryToGPU(
void* MemoryAddress, //IN (page-aligned)
HSAuint64 MemorySizeInBytes, //IN (page-aligned)
HSAuint64* AlternateVAGPU //OUT (page-aligned)
);
/**
Ensures that the memory is resident and can be accessed by GPUs
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtMapMemoryToGPUNodes(
void* MemoryAddress, //IN (page-aligned)
HSAuint64 MemorySizeInBytes, //IN (page-aligned)
HSAuint64* AlternateVAGPU, //OUT (page-aligned)
HsaMemMapFlags MemMapFlags, //IN
HSAuint64 NumberOfNodes, //IN
HSAuint32* NodeArray //IN
);
/**
Releases the residency of the memory
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtUnmapMemoryToGPU(
void* MemoryAddress //IN (page-aligned)
);
/**
Maps a graphics resource and returns the resulting address in FlatMemoryAddress.
NOTE(review): the previous description was a copy of the hsaKmtDbgRegister()
text. This summary is inferred from the function and parameter names only -
confirm against the implementation.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtMapGraphicHandle(
HSAuint32 NodeId, //IN
HSAuint64 GraphicDeviceHandle, //IN
HSAuint64 GraphicResourceHandle, //IN
HSAuint64 GraphicResourceOffset, //IN
HSAuint64 GraphicResourceSize, //IN
HSAuint64* FlatMemoryAddress //OUT
);
/**
Stub for Unmap Graphic Handle
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtUnmapGraphicHandle(
HSAuint32 NodeId, //IN
HSAuint64 FlatMemoryAddress, //IN
HSAuint64 SizeInBytes //IN
);
/**
* Get an AMDGPU device handle for a GPU node
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtGetAMDGPUDeviceHandle(
HSAuint32 NodeId, //IN
HsaAMDGPUDeviceHandle *DeviceHandle //OUT
);
/**
Allocate GWS resource for a queue
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtAllocQueueGWS(
HSA_QUEUEID QueueId, //IN
HSAuint32 nGWS, //IN
HSAuint32 *firstGWS //OUT
);
/**
Notifies the kernel driver that a process wants to use GPU debugging facilities
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtDbgRegister(
HSAuint32 NodeId //IN
);
/**
Detaches the debugger process from the HW debug established by hsaKmtDbgRegister() API
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtDbgUnregister(
HSAuint32 NodeId //IN
);
/**
Controls a wavefront
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtDbgWavefrontControl(
HSAuint32 NodeId, //IN
HSA_DBG_WAVEOP Operand, //IN
HSA_DBG_WAVEMODE Mode, //IN
HSAuint32 TrapId, //IN
HsaDbgWaveMessage* DbgWaveMsgRing //IN
);
/**
Sets watch points on memory address ranges to generate exception events when the
watched addresses are accessed
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtDbgAddressWatch(
HSAuint32 NodeId, //IN
HSAuint32 NumWatchPoints, //IN
HSA_DBG_WATCH_MODE WatchMode[], //IN
void* WatchAddress[], //IN
HSAuint64 WatchMask[], //IN, optional
HsaEvent* WatchEvent[] //IN, optional
);
/**
Enables the runtime; the counterpart of hsaKmtRuntimeDisable().
NOTE(review): this function was undocumented. The exact meaning of rDebug and
setupTtmp cannot be determined from this header - confirm against the
implementation.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtRuntimeEnable(
void* rDebug, // IN
bool setupTtmp // IN
);
HSAKMT_STATUS
HSAKMTAPI
hsaKmtRuntimeDisable(void);
HSAKMT_STATUS
HSAKMTAPI
hsaKmtGetRuntimeCapabilities(
HSAuint32 *caps_mask // OUT
);
/**
Enable debug trap.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtDbgEnable(
void **runtime_info, //Out
HSAuint32 *data_size //Out
);
/**
Disable debug trap.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtDbgDisable(void);
/**
Get device snapshot.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtDbgGetDeviceData(
void **data, //Out
HSAuint32 *n_entries, //Out
HSAuint32 *entry_size //Out
);
/**
Get queues snapshot.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtDbgGetQueueData(
void **data, //Out
HSAuint32 *n_entries, //Out
HSAuint32 *entry_size, //Out
bool suspend_queues //In
);
/**
Check whether gpu firmware and kernel support debugging
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtCheckRuntimeDebugSupport(
void
);
/**
Debug ops call primarily used for KFD testing
*/
HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(
struct kfd_ioctl_dbg_trap_args *arg,
HSA_QUEUEID *Queues,
HSAuint64 *DebugReturn
);
/**
Gets GPU and CPU clock counters for particular Node
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtGetClockCounters(
HSAuint32 NodeId, //IN
HsaClockCounters* Counters //OUT
);
/**
Retrieves information on the available HSA counters
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPmcGetCounterProperties(
HSAuint32 NodeId, //IN
HsaCounterProperties** CounterProperties //OUT
);
/**
Registers a set of (HW) counters to be used for tracing/profiling
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPmcRegisterTrace(
HSAuint32 NodeId, //IN
HSAuint32 NumberOfCounters, //IN
HsaCounter* Counters, //IN
HsaPmcTraceRoot* TraceRoot //OUT
);
/**
Unregisters a set of (HW) counters used for tracing/profiling
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPmcUnregisterTrace(
HSAuint32 NodeId, //IN
HSATraceId TraceId //IN
);
/**
Allows a user mode process to get exclusive access to the defined set of (HW) counters
used for tracing/profiling
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPmcAcquireTraceAccess(
HSAuint32 NodeId, //IN
HSATraceId TraceId //IN
);
/**
Allows a user mode process to release exclusive access to the defined set of (HW) counters
used for tracing/profiling
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPmcReleaseTraceAccess(
HSAuint32 NodeId, //IN
HSATraceId TraceId //IN
);
/**
Starts tracing operation on a previously established set of performance counters
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPmcStartTrace(
HSATraceId TraceId, //IN
void* TraceBuffer, //IN (page aligned)
HSAuint64 TraceBufferSizeBytes //IN (page aligned)
);
/**
Forces an update of all the counters that a previously started trace operation has registered
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPmcQueryTrace(
HSATraceId TraceId //IN
);
/**
Stops tracing operation on a previously established set of performance counters
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPmcStopTrace(
HSATraceId TraceId //IN
);
/**
Sets trap handler and trap buffer to be used for all queues associated with the specified NodeId within this process context
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtSetTrapHandler(
HSAuint32 NodeId, //IN
void* TrapHandlerBaseAddress, //IN
HSAuint64 TrapHandlerSizeInBytes, //IN
void* TrapBufferBaseAddress, //IN
HSAuint64 TrapBufferSizeInBytes //IN
);
/**
Gets image tile configuration.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtGetTileConfig(
HSAuint32 NodeId, // IN
HsaGpuTileConfig* config // IN & OUT
);
/**
Returns information about pointers
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtQueryPointerInfo(
const void * Pointer, //IN
HsaPointerInfo * PointerInfo //OUT
);
/**
Associates user data with a memory allocation
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtSetMemoryUserData(
const void * Pointer, //IN
void * UserData //IN
);
/**
Requests exclusive use of SPM
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtSPMAcquire(
HSAuint32 PreferredNode //IN
);
/**
Release exclusive use of SPM
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtSPMRelease(
HSAuint32 PreferredNode //IN
);
/**
Set up the destination user mode buffer for stream performance
counter data.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtSPMSetDestBuffer(
HSAuint32 PreferredNode, //IN
HSAuint32 SizeInBytes, //IN
HSAuint32 * timeout, //IN/OUT
HSAuint32 * SizeCopied, //OUT
void *DestMemoryAddress, //IN
bool *isSPMDataLoss //OUT
);
/* Helper functions for calling KFD SVM ioctl */
HSAKMT_STATUS
HSAKMTAPI
hsaKmtSVMSetAttr(
void *start_addr, // IN: Start of the virtual address range (page-aligned)
HSAuint64 size, // IN: size (page-aligned)
unsigned int nattr, // IN: number of attributes
HSA_SVM_ATTRIBUTE *attrs // IN: array of attributes
);
HSAKMT_STATUS
HSAKMTAPI
hsaKmtSVMGetAttr(
void *start_addr, // IN: Start of the virtual address range (page-aligned)
HSAuint64 size, // IN: size (page aligned)
unsigned int nattr, // IN: number of attributes
HSA_SVM_ATTRIBUTE *attrs // IN/OUT: array of attributes
);
HSAKMT_STATUS
HSAKMTAPI
hsaKmtSetXNACKMode(
HSAint32 enable // IN: enable/disable XNACK mode.
);
HSAKMT_STATUS
HSAKMTAPI
hsaKmtGetXNACKMode(
HSAint32 * enable // OUT: returns XNACK value.
);
/**
Open anonymous file handle to enable events and read SMI events.
To enable events, write 64bit events mask to fd, event enums as bit index.
for example, event mask (HSA_SMI_EVENT_MASK_FROM_INDEX(HSA_SMI_EVENT_INDEX_MAX) - 1) to enable all events
Read event from fd is not blocking, use poll with timeout value to check if event is available.
Event is dropped if kernel event fifo is full.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtOpenSMI(
HSAuint32 NodeId, // IN: GPU node_id to receive the SMI event from
int *fd // OUT: anonymous file handle
);
/**
If this is GPU Mapped memory, remap the first page at this address to be normal system memory
This is used in ASAN mode to remap the first page of device memory to share host ASAN logic.
This function is only supported when libhsakmt is compiled in ASAN mode.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtReplaceAsanHeaderPage(
void *addr // IN: Start of the virtual address page
);
/**
If this is GPU Mapped memory, remap the first page back to the original GPU memory
This is used in ASAN mode to remap the first page back to its original mapping.
This function is only supported when libhsakmt is compiled in ASAN mode.
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtReturnAsanHeaderPage(
void *addr // IN: Start of the virtual address page
);
/**
Check whether the kernel supports PC sampling
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPcSamplingSupport(
void
);
/**
* Query device PC Sampling capabilities
*
* Arguments:
* @NodeId (IN) - GPU node_id
* @sample_info (IN) - Pointer to array of HsaPcSamplingInfo
* NOTE(review): annotated IN, but a capabilities query
* presumably fills this array - confirm
* @sample_info_sz(IN) - Size of sample_info in units of HsaPcSamplingInfo
* @sz_needed (OUT)- If sample_info_sz is too small, the sample_info_sz needed
*
* Return:
* HSAKMT_STATUS_ERROR - failed
* HSAKMT_STATUS_SUCCESS - successfully complete
* HSAKMT_STATUS_INVALID_PARAMETER - invalid input
* HSAKMT_STATUS_BUFFER_TOO_SMALL - sample buffer size is too small. Retry with sample_info_sz
* >= sz_needed
* HSAKMT_STATUS_NOT_SUPPORTED - this asic doesn't support pc sampling
*/
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPcSamplingQueryCapabilities(
HSAuint32 NodeId,
void *sample_info,
HSAuint32 sample_info_sz,
HSAuint32 *sz_needed
);
/**
 * Create PC Sampling Session
 *
 * Arguments:
 * @node_id     (IN)  - GPU node_id
 * @sample_info (IN)  - PC Sampling configuration requested
 * @traceId     (OUT) - Unique PC Sampling trace Id
 *
 * Return:
 * HSAKMT_STATUS_ERROR - failed
 * HSAKMT_STATUS_SUCCESS - successfully complete
 * HSAKMT_STATUS_INVALID_PARAMETER - invalid input
 * HSAKMT_STATUS_NO_MEMORY - not enough memory to create new pc sampling session
 * HSAKMT_STATUS_UNAVAILABLE - a different pc sampling session started on this node
 */
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPcSamplingCreate(
HSAuint32 node_id,
HsaPcSamplingInfo *sample_info,
HsaPcSamplingTraceId *traceId
);
/**
 * Destroy PC Sampling Session
 *
 * Arguments:
 * @NodeId  (IN) - GPU node_id
 * @traceId (IN) - PC Sampling trace Id (passed by value)
 *
 * Return:
 * HSAKMT_STATUS_ERROR - failed
 * HSAKMT_STATUS_SUCCESS - successfully complete
 * HSAKMT_STATUS_INVALID_PARAMETER - invalid input
 */
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPcSamplingDestroy(
HSAuint32 NodeId,
HsaPcSamplingTraceId traceId
);
/**
 * Start PC Sampling Session
 *
 * Arguments:
 * @NodeId  (IN) - GPU node_id
 * @traceId (IN) - PC Sampling trace Id (passed by value)
 *
 * Return:
 * HSAKMT_STATUS_ERROR - failed
 * HSAKMT_STATUS_SUCCESS - successfully complete
 * HSAKMT_STATUS_INVALID_PARAMETER - invalid input
 */
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPcSamplingStart(
HSAuint32 NodeId,
HsaPcSamplingTraceId traceId
);
/**
 * Stop PC Sampling Session
 *
 * Arguments:
 * @NodeId  (IN) - GPU node_id
 * @traceId (IN) - PC Sampling trace Id (passed by value)
 *
 * Return:
 * HSAKMT_STATUS_ERROR - failed
 * HSAKMT_STATUS_SUCCESS - successfully complete
 * HSAKMT_STATUS_INVALID_PARAMETER - invalid input
 * HSAKMT_STATUS_KERNEL_ALREADY_OPENED - the session is already stopped
 */
HSAKMT_STATUS
HSAKMTAPI
hsaKmtPcSamplingStop(
HSAuint32 NodeId,
HsaPcSamplingTraceId traceId
);
/**
 * Check if the HSA KMT Model is enabled
 *
 * Arguments:
 * @enable (OUT) - set to true if the HSA KMT Model is enabled, false otherwise
 *
 * Return:
 * HSAKMT_STATUS_ERROR - failed
 * HSAKMT_STATUS_SUCCESS - successfully complete
 */
HSAKMT_STATUS
HSAKMTAPI
hsaKmtModelEnabled(
bool* enable // OUT: receives the enabled state
);
#ifdef __cplusplus
} //extern "C"
#endif
#endif //_HSAKMT_H_
================================================
FILE: libhsakmt/include/hsakmt/hsakmt_virtio.h
================================================
/*
* Copyright © 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef HSAKMT_VIRTIO_H
#define HSAKMT_VIRTIO_H
#include "hsakmt/linux/kfd_ioctl.h"
#include "hsakmt/hsakmt.h"
#include
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Open / close entry points of the virtio interface.
 * NOTE(review): each vhsaKmt* function in this header presumably mirrors the
 * hsaKmt* function of the same name (minus the leading 'v') - confirm against
 * hsakmt.h and the virtio implementation.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtOpenKFD(void);
HSAKMT_STATUS HSAKMTAPI vhsaKmtCloseKFD(void);
/*
 * Memory management entry points of the virtio interface: allocate/free,
 * map/unmap to GPU, available-memory query, and register/deregister.
 * Sizes are passed as HSAuint64 byte counts; nodes are identified by HSAuint32.
 */
/* Allocation: the resulting address is returned through MemoryAddress. */
HSAKMT_STATUS HSAKMTAPI vhsaKmtAllocMemory(HSAuint32 PreferredNode, HSAuint64 SizeInBytes,
HsaMemFlags MemFlags, void** MemoryAddress);
HSAKMT_STATUS HSAKMTAPI vhsaKmtFreeMemory(void* MemoryAddress, HSAuint64 SizeInBytes);
/* Mapping to an explicit list of nodes: NodeArray holds NumberOfNodes entries (presumably - confirm). */
HSAKMT_STATUS HSAKMTAPI vhsaKmtMapMemoryToGPUNodes(void* MemoryAddress, HSAuint64 MemorySizeInBytes,
HSAuint64* AlternateVAGPU,
HsaMemMapFlags MemMapFlags,
HSAuint64 NumberOfNodes, HSAuint32* NodeArray);
HSAKMT_STATUS HSAKMTAPI vhsaKmtUnmapMemoryToGPU(void* MemoryAddress);
/* The available byte count is returned through AvailableBytes. */
HSAKMT_STATUS HSAKMTAPI vhsaKmtAvailableMemory(HSAuint32 Node, HSAuint64* AvailableBytes);
HSAKMT_STATUS HSAKMTAPI vhsaKmtMapMemoryToGPU(void* MemoryAddress, HSAuint64 MemorySizeInBytes,
HSAuint64* AlternateVAGPU);
HSAKMT_STATUS HSAKMTAPI vhsaKmtRegisterMemoryWithFlags(void* MemoryAddress,
HSAuint64 MemorySizeInBytes,
HsaMemFlags MemFlags);
HSAKMT_STATUS HSAKMTAPI vhsaKmtDeregisterMemory(void* MemoryAddress);
/*
 * Version, topology, runtime and miscellaneous query entry points of the
 * virtio interface. Results are returned through the pointer parameters.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetVersion(HsaVersionInfo* v);
/* System properties are acquired and released as a pair. */
HSAKMT_STATUS HSAKMTAPI vhsaKmtAcquireSystemProperties(HsaSystemProperties* SystemProperties);
HSAKMT_STATUS HSAKMTAPI vhsaKmtReleaseSystemProperties(void);
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetNodeProperties(HSAuint32 NodeId,
HsaNodeProperties* NodeProperties);
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetXNACKMode(HSAint32* enable);
/* Runtime enable/disable pair. */
HSAKMT_STATUS HSAKMTAPI vhsaKmtRuntimeEnable(void* rDebug, bool setupTtmp);
HSAKMT_STATUS HSAKMTAPI vhsaKmtRuntimeDisable(void);
/* Per-node property arrays: the Num* argument presumably gives the capacity of the output array - confirm. */
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, HSAuint32 NumBanks,
HsaMemoryProperties* MemoryProperties);
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetNodeCacheProperties(HSAuint32 NodeId, HSAuint32 ProcessorId,
HSAuint32 NumCaches,
HsaCacheProperties* CacheProperties);
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId, HSAuint32 NumIoLinks,
HsaIoLinkProperties* IoLinkProperties);
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetClockCounters(HSAuint32 NodeId, HsaClockCounters* Counters);
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetAMDGPUDeviceHandle(HSAuint32 NodeId,
HsaAMDGPUDeviceHandle* DeviceHandle);
/* Pointer is const: the queried address itself is not modified. */
HSAKMT_STATUS HSAKMTAPI vhsaKmtQueryPointerInfo(const void* Pointer, HsaPointerInfo* PointerInfo);
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetTileConfig(HSAuint32 NodeId, HsaGpuTileConfig* config);
/*
 * Event creation entry point of the virtio interface.
 *
 * EventDesc   - description of the event to create.
 * ManualReset - manual-reset flag for the new event.
 * IsSignaled  - initial signaled state of the new event.
 * Event       - receives the pointer to the created event.
 *
 * The boolean parameters are spelled `bool`, like every other declaration in
 * this header, instead of the C-only keyword `_Bool`: the header is wrapped in
 * extern "C" for C++ consumers, and in C `bool` names the same type.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtCreateEvent(HsaEventDescriptor* EventDesc, bool ManualReset,
bool IsSignaled, HsaEvent** Event);
/*
 * Event entry points of the virtio interface: destroy, set, reset, query
 * state, and wait on one or several events.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtDestroyEvent(HsaEvent* Event);
HSAKMT_STATUS HSAKMTAPI vhsaKmtSetEvent(HsaEvent* Event);
HSAKMT_STATUS HSAKMTAPI vhsaKmtResetEvent(HsaEvent* Event);
HSAKMT_STATUS HSAKMTAPI vhsaKmtQueryEventState(HsaEvent* Event);
/* Waits take a timeout in milliseconds; WaitOnAll selects all-vs-any semantics (presumably - confirm). */
HSAKMT_STATUS HSAKMTAPI vhsaKmtWaitOnMultipleEvents(HsaEvent* Events[], HSAuint32 NumEvents,
bool WaitOnAll, HSAuint32 Milliseconds);
HSAKMT_STATUS HSAKMTAPI vhsaKmtWaitOnEvent(HsaEvent* Event, HSAuint32 Milliseconds);
/* The _Ext variants additionally take an event_age pointer. */
HSAKMT_STATUS HSAKMTAPI vhsaKmtWaitOnEvent_Ext(HsaEvent* Event, HSAuint32 Milliseconds,
uint64_t* event_age);
HSAKMT_STATUS HSAKMTAPI vhsaKmtWaitOnMultipleEvents_Ext(HsaEvent* Events[], HSAuint32 NumEvents,
bool WaitOnAll, HSAuint32 Milliseconds,
uint64_t* event_age);
/*
 * Trap handler setup entry point of the virtio interface.
 * Takes the node, plus base address and byte size for both the trap handler
 * and the trap buffer.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtSetTrapHandler(HSAuint32 NodeId, void* TrapHandlerBaseAddress,
HSAuint64 TrapHandlerSizeInBytes,
void* TrapBufferBaseAddress,
HSAuint64 TrapBufferSizeInBytes);
/*
 * Queue entry points of the virtio interface.
 * vhsaKmtCreateQueueExt takes the same parameters as vhsaKmtCreateQueue plus
 * an SdmaEngineId inserted after Priority. Queue details are returned through
 * QueueResource.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtCreateQueueExt(HSAuint32 NodeId, HSA_QUEUE_TYPE Type,
HSAuint32 QueuePercentage,
HSA_QUEUE_PRIORITY Priority, HSAuint32 SdmaEngineId,
void* QueueAddress, HSAuint64 QueueSizeInBytes,
HsaEvent* Event, HsaQueueResource* QueueResource);
HSAKMT_STATUS HSAKMTAPI vhsaKmtCreateQueue(HSAuint32 NodeId, HSA_QUEUE_TYPE Type,
HSAuint32 QueuePercentage, HSA_QUEUE_PRIORITY Priority,
void* QueueAddress, HSAuint64 QueueSizeInBytes,
HsaEvent* Event, HsaQueueResource* QueueResource);
/* Queues are destroyed by their HSA_QUEUEID. */
HSAKMT_STATUS HSAKMTAPI vhsaKmtDestroyQueue(HSA_QUEUEID QueueId);
/* Graphics interop registration; resource details are returned through GraphicsResourceInfo. */
HSAKMT_STATUS HSAKMTAPI vhsaKmtRegisterGraphicsHandleToNodes(
HSAuint64 GraphicsResourceHandle, HsaGraphicsResourceInfo* GraphicsResourceInfo,
HSAuint64 NumberOfNodes, HSAuint32* NodeArray);
/* The capability bit mask is returned through caps_mask. */
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetRuntimeCapabilities(HSAuint32* caps_mask);
/*
 * Unlike the functions above, this one is declared without HSAKMTAPI and
 * returns a plain int rather than HSAKMT_STATUS. `out` is an untyped buffer;
 * NOTE(review): its expected layout and the meaning of the return value are
 * not visible in this header - confirm against the implementation.
 */
int vamdgpu_query_gpu_info(amdgpu_device_handle dev, void* out);
#ifdef __cplusplus
}
#endif
#endif /* HSAKMT_VIRTIO_H */
================================================
FILE: libhsakmt/include/hsakmt/hsakmtmodel.h
================================================
/*
* Copyright © 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef _HSAKMTMODEL_H_
#define _HSAKMTMODEL_H_
#include
/*
 * Globals and entry points of the HSA KMT software model.
 * NOTE(review): the comments below are inferred from the names only; the
 * definitions are not visible in this header - confirm against hsakmtmodel.c.
 */
/* Presumably true when the software model is used - confirm. */
extern bool hsakmt_use_model;
/* Presumably the topology selected for the model - confirm. */
extern char *hsakmt_model_topology;
/* Initialization entry points; neither takes arguments nor returns a value. */
void model_init_env_vars(void);
void model_init(void);
/* Hand the model a pointer to the MMIO page. */
void model_set_mmio_page(void *ptr);
/* Hand the model a pointer to the event page together with an event limit. */
void model_set_event_page(void *ptr, unsigned event_limit);
/* ioctl-style entry point: takes a request code and an argument pointer, returns int. */
int model_kfd_ioctl(unsigned long request, void *arg);
#endif /* _HSAKMTMODEL_H_ */
================================================
FILE: libhsakmt/include/hsakmt/hsakmtmodeliface.h
================================================
/*
* Copyright © 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef _HSAKMTMODELIFACE_H_
#define _HSAKMTMODELIFACE_H_
#include
// Changelog:
// 0.2: Add set_set_event function to hsakmt_model_functions
// NOTE(review): the minor version below is 4, but no changelog entries are
// recorded for 0.3 or 0.4 - add them.
#define HSAKMT_MODEL_INTERFACE_VERSION_MAJOR 0
#define HSAKMT_MODEL_INTERFACE_VERSION_MINOR 4
// Opaque handle types: only declared here, never defined in this header.
typedef struct hsakmt_model hsakmt_model_t;
typedef struct hsakmt_model_queue hsakmt_model_queue_t;
// Description of a queue to be registered with the model
// (passed to hsakmt_model_functions::register_queue).
//
// Addresses are relative to the global aperture.
struct hsakmt_model_queue_info {
uint64_t ring_base_address; // address of the ring
uint64_t write_pointer_address; // address of the write pointer
uint64_t read_pointer_address; // address of the read pointer
uint64_t *doorbell; // note: a CPU pointer, unlike the uint64_t address fields above
uint32_t ring_size; // in bytes
uint32_t queue_type; // NOTE(review): the set of valid values is not defined in this header
};
// Pointer to a "set event" function, registered with the model through
// hsakmt_model_functions::set_set_event.
//
// data is a user-provided opaque pointer.
// event_id is the ID of the event to set (as in amd_signal_s::event_id).
typedef void (*hsakmt_model_set_event_fn)(void *data, unsigned event_id);
// Interface provided by the software model implementation.
//
// Queried from a shared library by calling an export called
// `get_hsakmt_model_functions`
//
// Interface versioning follows the semantic versioning model: clients that
// know about interface version X.Y can use any implementation that provides
// version X.Z with Z >= Y.
//
// The model is designed to support only one VMID space.
//
// This table is shared between the client and a separately built shared
// library, so the order of its members is part of the interface.
struct hsakmt_model_functions {
uint32_t version_major; // HSAKMT_MODEL_INTERFACE_VERSION_MAJOR
uint32_t version_minor; // HSAKMT_MODEL_INTERFACE_VERSION_MINOR
// Create a GPU device model.
hsakmt_model_t *(*create)(void);
// Destroy a GPU device model.
void (*destroy)(hsakmt_model_t *model);
// Set the global aperture. GPU virtual address 0 is at CPU address `base`.
void (*set_global_aperture)(hsakmt_model_t *model, void *base, uint64_t size);
// Memory notifications taking a base pointer and a size (alloced_memory also
// takes flags).
// NOTE(review): presumably called after an allocation / free of the given
// range; the meaning of `flags` is not defined in this header - confirm.
void (*alloced_memory)(hsakmt_model_t *model, void *base, uint64_t size, uint32_t flags);
void (*freed_memory)(hsakmt_model_t *model, void *base, uint64_t size);
// Register a callback that the model should call when an event is signaled.
// `data` is client data that is opaque to the model.
//
// TODO: Deprecated -- remove this!
void (*set_notify_event)(hsakmt_model_t *model, void (*callback)(void *data), void *data);
// Register a callback that the model should call in order to wait for an
// event to be signaled.
// `data` is client data that is opaque to the model.
// The callback additionally receives an address and an age.
void (*set_wait_event)(hsakmt_model_t *model, void (*callback)(void *data, uint64_t address, uint64_t age), void *data);
// Register a queue with the model. The model will immediately begin
// asynchronous processing of the queue (but by default, the model need not
// provide forward progress guarantees between multiple queues).
hsakmt_model_queue_t *(*register_queue)(hsakmt_model_t *model, struct hsakmt_model_queue_info *info);
// Register a callback that allows the model to set an event.
// (Added in interface version 0.2, see the changelog.)
void (*set_set_event)(hsakmt_model_t *model, hsakmt_model_set_event_fn fn, void *data);
// Destroy a queue that was returned by register_queue.
void (*destroy_queue)(hsakmt_model_t *model, hsakmt_model_queue_t *queue);
};
// Type of a shared library export called `get_hsakmt_model_functions`.
// It takes no arguments and returns a pointer to a const function table.
typedef const struct hsakmt_model_functions *(*get_hsakmt_model_functions_t)(void);
#endif // _HSAKMTMODELIFACE_H_
================================================
FILE: libhsakmt/include/hsakmt/hsakmttypes.h
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef _HSAKMTTYPES_H_
#define _HSAKMTTYPES_H_
// The definitions and the THUNK API are version specific - the version numbers are defined here.
#define HSAKMT_VERSION_MAJOR 0
#define HSAKMT_VERSION_MINOR 99
#ifdef __cplusplus
extern "C" {
#endif
// Platform-specific definition of the HSAKMTAPI calling-convention macro and
// of the fixed-width HSA integer typedefs.
// Note: there is no #else branch, so on a platform that is neither Windows nor
// Linux this block defines neither HSAKMTAPI nor the HSA integer types.
#if defined(_WIN64) || defined(_WINDOWS) || defined(_WIN32)
#if defined(_WIN32)
#define HSAKMTAPI __stdcall
#else
#define HSAKMTAPI
#endif
typedef unsigned char HSAuint8;
// Note: plain char - its signedness is implementation-defined.
typedef char HSAint8;
typedef unsigned short HSAuint16;
typedef signed short HSAint16;
typedef unsigned __int32 HSAuint32;
typedef signed __int32 HSAint32;
typedef signed __int64 HSAint64;
typedef unsigned __int64 HSAuint64;
#elif defined(__linux__)
// NOTE(review): the header names of the next two #include lines are missing
// here. The typedefs below need the uintN_t/intN_t types (normally provided by
// <stdint.h>), and other declarations in this project use bool - restore the
// original header names.
#include
#include
#define HSAKMTAPI
typedef uint8_t HSAuint8;
typedef int8_t HSAint8;
typedef uint16_t HSAuint16;
typedef int16_t HSAint16;
typedef uint32_t HSAuint32;
typedef int32_t HSAint32;
typedef int64_t HSAint64;
typedef uint64_t HSAuint64;
#endif
// Generic opaque handle.
typedef void* HSA_HANDLE;
// Queue identifier (64-bit).
typedef HSAuint64 HSA_QUEUEID;
// An HSA_QUEUEID that is never a valid queue ID (all 64 bits set).
#define INVALID_QUEUEID 0xFFFFFFFFFFFFFFFFULL
// A PID that is never a valid process ID (all 32 bits set).
#define INVALID_PID 0xFFFFFFFF
// A node ID that is never a valid node ID (all 32 bits set).
#define INVALID_NODEID 0xFFFFFFFF
// This is included in order to force the alignments to be 4 bytes so that
// it avoids extra padding added by the compiler when a 64-bit binary is generated.
// The setting is pushed under the label hsakmttypes_h and applies to the type
// definitions that follow; NOTE(review): a matching "#pragma pack(pop, ...)" is
// expected later in the file (not checked here).
#pragma pack(push, hsakmttypes_h, 4)
//
// HSA STATUS codes returned by the KFD Interfaces
//
// The numeric values are assigned explicitly and are not contiguous
// (0-7, 10-13, 20-23, 30-31, 35-37).
//
typedef enum _HSAKMT_STATUS
{
HSAKMT_STATUS_SUCCESS = 0, // Operation successful
HSAKMT_STATUS_ERROR = 1, // General error return if not otherwise specified
HSAKMT_STATUS_DRIVER_MISMATCH = 2, // User mode component is not compatible with kernel HSA driver
HSAKMT_STATUS_INVALID_PARAMETER = 3, // KFD identifies input parameters invalid
HSAKMT_STATUS_INVALID_HANDLE = 4, // KFD identifies handle parameter invalid
HSAKMT_STATUS_INVALID_NODE_UNIT = 5, // KFD identifies node or unit parameter invalid
HSAKMT_STATUS_NO_MEMORY = 6, // No memory available (when allocating queues or memory)
HSAKMT_STATUS_BUFFER_TOO_SMALL = 7, // A buffer needed to handle a request is too small
HSAKMT_STATUS_NOT_IMPLEMENTED = 10, // KFD function is not implemented for this set of parameters
HSAKMT_STATUS_NOT_SUPPORTED = 11, // KFD function is not supported on this node
HSAKMT_STATUS_UNAVAILABLE = 12, // KFD function is not available currently on this node (but
// may be at a later time)
HSAKMT_STATUS_OUT_OF_RESOURCES = 13, // KFD function request exceeds the resources currently available.
HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED = 20, // KFD driver path not opened
HSAKMT_STATUS_KERNEL_COMMUNICATION_ERROR = 21, // user-kernel mode communication failure
HSAKMT_STATUS_KERNEL_ALREADY_OPENED = 22, // KFD driver path already opened
HSAKMT_STATUS_HSAMMU_UNAVAILABLE = 23, // ATS/PRI 1.1 (Address Translation Services) not available
// (IOMMU driver not installed or not-available)
HSAKMT_STATUS_WAIT_FAILURE = 30, // The wait operation failed
HSAKMT_STATUS_WAIT_TIMEOUT = 31, // The wait operation timed out
HSAKMT_STATUS_MEMORY_ALREADY_REGISTERED = 35, // Memory buffer already registered
HSAKMT_STATUS_MEMORY_NOT_REGISTERED = 36, // Memory buffer not registered
HSAKMT_STATUS_MEMORY_ALIGNMENT = 37, // Memory parameter not aligned
} HSAKMT_STATUS;
//
// HSA KFD interface version information. Calling software has to validate that it meets
// the minimum interface version as described in the API specification.
// All future structures will be extended in a backward compatible fashion.
//
typedef struct _HsaVersionInfo
{
HSAuint32 KernelInterfaceMajorVersion; // supported kernel interface major version
HSAuint32 KernelInterfaceMinorVersion; // supported kernel interface minor version
} HsaVersionInfo;
//
// HSA Topology Discovery Infrastructure structure definitions.
// The infrastructure implementation is based on the design specified in the Kernel HSA Driver ADD.
// The discoverable data is retrieved from ACPI structures in the platform infrastructure, as defined
// in the "Heterogeneous System Architecture Detail Topology" specification.
//
// The following structure is returned on a call to hsaKmtAcquireSystemProperties() as output.
// When the call is made within a process context, a "snapshot" of the topology information
// is taken within the KFD to avoid any changes during the enumeration process.
// The snapshot is released when hsaKmtReleaseSystemProperties() is called
// or when the process exits or is terminated.
//
typedef struct _HsaSystemProperties
{
HSAuint32 NumNodes; // the number of "H-NUMA" memory nodes.
// each node represents a discoverable node of the system
// All other enumeration is done on a per-node basis
HSAuint32 PlatformOem; // identifies HSA platform, reflects the OEMID in the CRAT
HSAuint32 PlatformId; // HSA platform ID, reflects OEM TableID in the CRAT
HSAuint32 PlatformRev; // HSA platform revision, reflects Platform Table Revision ID
} HsaSystemProperties;
// Engine identifier: a 32-bit value that can be read whole (Value) or through
// the named bit-fields in ui32 (10 + 6 + 8 + 8 = 32 bits).
typedef union
{
HSAuint32 Value;
struct
{
unsigned int uCode : 10; // ucode packet processor version
unsigned int Major : 6; // GFXIP Major engine version
unsigned int Minor : 8; // GFXIP Minor engine version
unsigned int Stepping : 8; // GFXIP Stepping info
}ui32;
} HSA_ENGINE_ID;
// Engine ucode versions: a 32-bit value that can be read whole (Value) or
// through the bit-fields (10 + 10 + 12 = 32 bits).
// Note: unlike HSA_ENGINE_ID, the inner struct is anonymous, so the bit-fields
// are accessed directly on the union.
typedef union
{
HSAuint32 Value;
struct
{
unsigned int uCodeSDMA: 10; // ucode version SDMA engine
unsigned int uCodeRes : 10; // ucode version (reserved)
unsigned int Reserved : 12; // Reserved, must be 0
};
} HSA_ENGINE_VERSION;
// Node capability flags: a 32-bit value that can be read whole (Value) or
// through the named bit-fields in ui32. The bit-field widths add up to exactly
// 32, so there is no room left for further flags in this union.
typedef union
{
HSAuint32 Value;
struct
{
unsigned int HotPluggable : 1; // the node may be removed by some system action
// (event will be sent)
unsigned int HSAMMUPresent : 1; // This node has an ATS/PRI 1.1 compatible
// translation agent in the system (e.g. IOMMUv2)
unsigned int SharedWithGraphics : 1; // this HSA node's GPU function is also used for OS primary
// graphics render (= UI)
unsigned int QueueSizePowerOfTwo : 1; // This node GPU requires the queue size to be a power of 2 value
unsigned int QueueSize32bit : 1; // This node GPU requires the queue size to be less than 4GB
unsigned int QueueIdleEvent : 1; // This node GPU supports notification on Queue Idle
unsigned int VALimit : 1; // This node GPU has limited VA range for platform
// (typical 40bit). Affects shared VM use for 64bit apps
unsigned int WatchPointsSupported: 1; // Indicates if Watchpoints are available on the node.
unsigned int WatchPointsTotalBits: 4; // Watchpoints available. To determine the number use 2^value
unsigned int DoorbellType : 2; // 0: This node has pre-1.0 doorbell characteristic
// 1: This node has 1.0 doorbell characteristic
// 2,3: reserved for future use
unsigned int AQLQueueDoubleMap : 1; // The unit needs a VA "double map"
unsigned int DebugTrapSupported : 1; // Indicates if Debug Trap is supported on the node.
unsigned int WaveLaunchTrapOverrideSupported: 1; // Indicates if Wave Launch Trap Override is supported on the node.
unsigned int WaveLaunchModeSupported: 1; // Indicates if Wave Launch Mode is supported on the node.
unsigned int PreciseMemoryOperationsSupported: 1; // Indicates if Precise Memory Operations are supported on the node.
unsigned int DEPRECATED_SRAM_EDCSupport: 1; // Old buggy user mode depends on this being 0
unsigned int Mem_EDCSupport: 1; // Indicates if GFX internal DRAM/HBM EDC/ECC functionality is active
unsigned int RASEventNotify: 1; // Indicates if GFX extended RASFeatures and RAS EventNotify status is available
unsigned int ASICRevision: 4; // Indicates the ASIC revision of the chip on this node.
unsigned int SRAM_EDCSupport: 1; // Indicates if GFX internal SRAM EDC/ECC functionality is active
unsigned int SVMAPISupported : 1; // Whether or not the SVM API is supported
unsigned int CoherentHostAccess: 1; // Whether or not device memory can be coherently accessed by the host CPU
unsigned int DebugSupportedFirmware : 1; // Indicates if HWS firmware supports GPU debugging
unsigned int PreciseALUOperationsSupported : 1; // Indicates if precise ALU operations are supported for GPU debugging
unsigned int PerQueueResetSupported : 1; // Indicates per-queue reset supported
} ui32;
} HSA_CAPABILITY;
// Second set of node capability flags: a 32-bit value that can be read whole
// (Value) or through the named bit-fields in ui32 (1 + 31 = 32 bits).
typedef union
{
HSAuint32 Value;
struct
{
unsigned int PerSDMAQueueResetSupported : 1; // Indicates per-sdma queue reset supported
unsigned int Reserved : 31; // Reserved
} ui32;
} HSA_CAPABILITY2;
// Debug Properties and values
// HSA runtime may expose a subset of the capabilities outlined to the application.
//
// A 64-bit value that can be read whole (Value) or through the bit-fields of
// the anonymous struct (4 + 6 + 1 + 1 + 52 = 64 bits).
typedef union
{
HSAuint64 Value;
struct
{
HSAuint64 WatchAddrMaskLoBit: 4; // Only bits
// WatchAddrMaskLoBit..WatchAddrMaskHiBit
// of the watch address mask are used.
HSAuint64 WatchAddrMaskHiBit: 6; // See WatchAddrMaskLoBit.
// 0 is the least significant bit.
HSAuint64 DispatchInfoAlwaysValid: 1; // 0 if control of TTMP setup is
// controlled on a per process
// basis and is not always enabled
// 1 if TTMP setup is always
// enabled
HSAuint64 AddressWatchpointShareKind: 1; // whether the address watchpoint
// is per process or shared with
// all processes
// 0 if shared or unsupported
// (unsupported indicated by
// address_watchpoint_count == 0)
// All current devices have shared watchpoints
// 1 if unshared
HSAuint64 Reserved: 52; // Reserved
};
} HSA_DEBUG_PROPERTIES;
//
// HSA node properties. This structure is an output parameter of hsaKmtGetNodeProperties()
// The application or runtime can use the information herein to size the topology management structures
// Unless there is some very weird setup, there is at most one "GPU" device (with a certain number
// of throughput compute units (= SIMDs)) associated with a H-NUMA node.
//
#define HSA_PUBLIC_NAME_SIZE 64 // Marketing name string size
typedef struct _HsaNodeProperties
{
HSAuint32 NumCPUCores; // # of latency (= CPU) cores present on this HSA node.
// This value is 0 for a HSA node with no such cores,
// e.g a "discrete HSA GPU"
HSAuint32 NumFComputeCores; // # of HSA throughput (= GPU) FCompute cores ("SIMD") present in a node.
// This value is 0 if no FCompute cores are present (e.g. pure "CPU node").
HSAuint32 NumNeuralCores; // # of HSA neural processing units (= AIE) present in a
// node. This value is 0 if there are no NeuralCores.
HSAuint32 NumMemoryBanks; // # of discoverable memory bank affinity properties on this "H-NUMA" node.
HSAuint32 NumCaches; // # of discoverable cache affinity properties on this "H-NUMA" node.
HSAuint32 NumIOLinks; // # of discoverable IO link affinity properties of this node
// connecting to other nodes.
HSAuint32 CComputeIdLo; // low value of the logical processor ID of the latency (= CPU)
// cores available on this node
HSAuint32 FComputeIdLo; // low value of the logical processor ID of the throughput (= GPU)
// units available on this node
HSA_CAPABILITY Capability; // see above
HSA_CAPABILITY2 Capability2; // see above
HSAuint32 MaxWavesPerSIMD; // This identifies the max. number of launched waves per SIMD.
// If NumFComputeCores is 0, this value is ignored.
HSAuint32 LDSSizeInKB; // Size of Local Data Store in Kilobytes per SIMD Wavefront
HSAuint32 GDSSizeInKB; // Size of Global Data Store in Kilobytes shared across SIMD Wavefronts
HSAuint32 WaveFrontSize; // Number of SIMD cores per wavefront executed, typically 64,
// may be 32 or a different value for some HSA based architectures
HSAuint32 NumShaderBanks; // Number of Shader Banks or Shader Engines, typical values are 1 or 2
HSAuint32 NumArrays; // Number of SIMD arrays per engine
HSAuint32 NumCUPerArray; // Number of Compute Units (CU) per SIMD array
HSAuint32 NumSIMDPerCU; // Number of SIMD representing a Compute Unit (CU)
HSAuint32 MaxSlotsScratchCU; // Number of temp. memory ("scratch") wave slots available to access,
// may be 0 if HW has no restrictions
HSA_ENGINE_ID EngineId; // Identifier (rev) of the GPU uEngine or Firmware, may be 0
HSA_ENGINE_ID OverrideEngineId; // Identifier (rev) of the overridden GPU uEngine or Firmware, may be 0
HSAuint16 VendorId; // GPU vendor id; 0 on latency (= CPU)-only nodes
HSAuint16 DeviceId; // GPU device id; 0 on latency (= CPU)-only nodes
HSAuint32 LocationId; // GPU BDF (Bus/Device/function number) - identifies the device
// location in the overall system
HSAuint64 LocalMemSize; // Local memory size
HSAuint32 MaxEngineClockMhzFCompute; // maximum engine clock in MHz for the FCompute (= GPU) function,
// including any boost capabilities
HSAuint32 MaxEngineClockMhzCCompute; // maximum engine clock in MHz for the CCompute (= CPU) function,
// including any boost capabilities
HSAint32 DrmRenderMinor; // DRM render device minor device number
HSAuint16 MarketingName[HSA_PUBLIC_NAME_SIZE]; // Public name of the "device" on the node (board or APU name).
// Unicode string
HSAuint8 AMDName[HSA_PUBLIC_NAME_SIZE]; // CAL Name of the "device", ASCII
HSA_ENGINE_VERSION uCodeEngineVersions; // ucode engine versions, see HSA_ENGINE_VERSION above
HSA_DEBUG_PROPERTIES DebugProperties; // Debug properties of this node.
HSAuint64 HiveID; // XGMI Hive the GPU node belongs to in the system. It is an opaque and static
// number hash created by the PSP
HSAuint32 NumSdmaEngines; // number of PCIe optimized SDMA engines
HSAuint32 NumSdmaXgmiEngines;// number of XGMI optimized SDMA engines
HSAuint8 NumSdmaQueuesPerEngine;// number of SDMA queues per engine
HSAuint8 NumCpQueues; // number of Compute queues
HSAuint8 NumGws; // number of GWS barriers
HSAuint8 Integrated; // 0 - discrete GPU, 1 - integrated GPU (including small APU and APP APU)
HSAuint32 Domain; // PCI domain of the GPU
HSAuint64 UniqueID; // Globally unique immutable id
HSAuint32 VGPRSizePerCU; // VGPR size in bytes per CU
HSAuint32 SGPRSizePerCU; // SGPR size in bytes per CU
HSAuint32 NumXcc; // Number of XCC
HSAuint32 KFDGpuID; // GPU Hash ID generated by KFD
HSAuint32 FamilyID; // GPU family id
} HsaNodeProperties;
// Kinds of memory heap reported in HsaMemoryProperties::HeapType.
typedef enum _HSA_HEAPTYPE
{
HSA_HEAPTYPE_SYSTEM = 0,
HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC = 1, // CPU "visible" part of GPU device local memory (for discrete GPU)
HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE = 2, // CPU "invisible" part of GPU device local memory (for discrete GPU)
// All HSA accessible memory is per definition "CPU visible"
// "Private memory" is relevant for graphics interop only.
HSA_HEAPTYPE_GPU_GDS = 3, // GPU internal memory (GDS)
HSA_HEAPTYPE_GPU_LDS = 4, // GPU internal memory (LDS)
HSA_HEAPTYPE_GPU_SCRATCH = 5, // GPU special memory (scratch)
HSA_HEAPTYPE_DEVICE_SVM = 6, // sys-memory mapped by device page tables
HSA_HEAPTYPE_MMIO_REMAP = 7, // remapped mmio, such as hdp flush registers
HSA_HEAPTYPE_NUMHEAPTYPES, // one past the last heap type (implicitly 8)
HSA_HEAPTYPE_SIZE = 0xFFFFFFFF // presumably forces a 32-bit wide enum - confirm
} HSA_HEAPTYPE;
// Memory property flags: a 32-bit value that can be read whole (MemoryProperty)
// or through the named bit-fields in ui32 (1 + 1 + 30 = 32 bits).
typedef union
{
HSAuint32 MemoryProperty;
struct
{
unsigned int HotPluggable : 1; // the memory may be removed by some system action,
// memory should be used for temporary data
unsigned int NonVolatile : 1; // memory content is preserved across a power-off cycle.
unsigned int Reserved :30;
} ui32;
} HSA_MEMORYPROPERTY;
//
// Discoverable HSA Memory properties.
// The structure is the output parameter of the hsaKmtGetNodeMemoryProperties() function
//
typedef struct _HsaMemoryProperties
{
HSA_HEAPTYPE HeapType; // system or frame buffer,
// The size is available either as one 64-bit value or as two 32-bit halves.
union
{
HSAuint64 SizeInBytes; // physical memory size of the memory range in bytes
struct
{
HSAuint32 SizeInBytesLow; // physical memory size of the memory range in bytes (lower 32bit)
HSAuint32 SizeInBytesHigh; // physical memory size of the memory range in bytes (higher 32bit)
} ui32;
};
HSA_MEMORYPROPERTY Flags; // See definitions above
HSAuint32 Width; // memory width - the number of parallel bits of the memory interface
HSAuint32 MemoryClockMax; // memory clock for the memory, this allows computing the available bandwidth
// to the memory when needed
HSAuint64 VirtualBaseAddress; // if set to value != 0, indicates the virtual base address of the memory
// in process virtual space
} HsaMemoryProperties;
//
// Discoverable Cache Properties. (optional).
// The structure is the output parameter of the hsaKmtGetNodeCacheProperties() function
// Any of the parameters may be 0 (= not defined)
//
#define HSA_CPU_SIBLINGS 256 // number of entries in the SiblingMap arrays below
#define HSA_PROCESSORID_ALL 0xFFFFFFFF // presumably a ProcessorId wildcard selecting all processors - confirm
// Cache type flags: a 32-bit value that can be read whole (Value) or through
// the named bit-fields in ui32 (4 single-bit flags + 28 reserved bits).
typedef union
{
HSAuint32 Value;
struct
{
unsigned int Data : 1;
unsigned int Instruction : 1;
unsigned int CPU : 1;
unsigned int HSACU : 1;
unsigned int Reserved :28;
} ui32;
} HsaCacheType;
// Properties of one cache. Any of the fields may be 0 (= not defined).
// NOTE(review): the struct tag is spelled _HaCacheProperties (missing 's'); it
// is left unchanged because the tag is visible to users of this header.
typedef struct _HaCacheProperties
{
HSAuint32 ProcessorIdLow; // Identifies the processor number
HSAuint32 CacheLevel; // Integer representing level: 1, 2, 3, 4, etc
HSAuint32 CacheSize; // Size of the cache
HSAuint32 CacheLineSize; // Cache line size in bytes
HSAuint32 CacheLinesPerTag; // Cache lines per Cache Tag
HSAuint32 CacheAssociativity; // Cache Associativity
HSAuint32 CacheLatency; // Cache latency in ns
HsaCacheType CacheType; // See HsaCacheType above
HSAuint32 SiblingMap[HSA_CPU_SIBLINGS]; // sibling map with HSA_CPU_SIBLINGS entries
} HsaCacheProperties;
//
// Discoverable CPU Compute Properties. (optional).
// The structure is the output parameter of the hsaKmtGetCComputeProperties() function
// Any of the parameters may be 0 (= not defined)
//
// CPU compute properties: consists solely of a sibling map.
typedef struct _HsaCComputeProperties
{
HSAuint32 SiblingMap[HSA_CPU_SIBLINGS]; // HSA_CPU_SIBLINGS entries; may be 0 (= not defined)
} HsaCComputeProperties;
//
// Discoverable IoLink Properties (optional).
// The structure is the output parameter of the hsaKmtGetIoLinkProperties() function.
// Any of the parameters may be 0 (= not defined)
//
// Bus/interconnect type of an IO link (see HsaIoLinkProperties::IoLinkType).
// Note: enumerator prefixes are inconsistent (HSA_IOLINKTYPE_* for values 0-4,
// HSA_IOLINK_TYPE_* for values 5-16); the names are public and kept as-is.
typedef enum _HSA_IOLINKTYPE {
HSA_IOLINKTYPE_UNDEFINED = 0,
HSA_IOLINKTYPE_HYPERTRANSPORT = 1,
HSA_IOLINKTYPE_PCIEXPRESS = 2,
HSA_IOLINKTYPE_AMBA = 3,
HSA_IOLINKTYPE_MIPI = 4,
HSA_IOLINK_TYPE_QPI_1_1 = 5,
HSA_IOLINK_TYPE_RESERVED1 = 6,
HSA_IOLINK_TYPE_RESERVED2 = 7,
HSA_IOLINK_TYPE_RAPID_IO = 8,
HSA_IOLINK_TYPE_INFINIBAND = 9,
HSA_IOLINK_TYPE_RESERVED3 = 10,
HSA_IOLINK_TYPE_XGMI = 11,
HSA_IOLINK_TYPE_XGOP = 12,
HSA_IOLINK_TYPE_GZ = 13,
HSA_IOLINK_TYPE_ETHERNET_RDMA = 14,
HSA_IOLINK_TYPE_RDMA_OTHER = 15,
HSA_IOLINK_TYPE_OTHER = 16,
HSA_IOLINKTYPE_NUMIOLINKTYPES, // implicit value 17 = number of link types defined above
HSA_IOLINKTYPE_SIZE = 0xFFFFFFFF // widens the enum's value range to 32 bits
} HSA_IOLINKTYPE;
// Override flags for an IO link. LinkProperty exposes all 32 bits at once;
// ui32 exposes the individual flags that overlay the same storage.
typedef union
{
HSAuint32 LinkProperty;
struct
{
unsigned int Override : 1; // bus link properties are determined by this structure
// not by the HSA_IOLINKTYPE. The other flags are valid
// only if this bit is set to one
unsigned int NonCoherent : 1; // The link doesn't support coherent transactions
// memory accesses across must not be set to "host cacheable"!
unsigned int NoAtomics32bit : 1; // The link doesn't support 32bit-wide atomic transactions
unsigned int NoAtomics64bit : 1; // The link doesn't support 64bit-wide atomic transactions
unsigned int NoPeerToPeerDMA : 1; // The link doesn't allow device P2P access
unsigned int Reserved :27; // remaining bits of the 32-bit value
} ui32;
} HSA_LINKPROPERTY;
// Properties of one IO link between two nodes. Any field may be 0 (= not defined).
typedef struct _HsaIoLinkProperties
{
HSA_IOLINKTYPE IoLinkType; // see above
HSAuint32 VersionMajor; // Bus interface version (optional)
HSAuint32 VersionMinor; // Bus interface version (optional)
HSAuint32 NodeFrom; // NOTE(review): presumably the node id the link originates from - confirm
HSAuint32 NodeTo; // NOTE(review): presumably the node id the link leads to - confirm
HSAuint32 Weight; // weight factor (derived from CDIT)
HSAuint32 MinimumLatency; // minimum cost of time to transfer (rounded to ns)
HSAuint32 MaximumLatency; // maximum cost of time to transfer (rounded to ns)
HSAuint32 MinimumBandwidth; // minimum interface Bandwidth in MB/s
HSAuint32 MaximumBandwidth; // maximum interface Bandwidth in MB/s
HSAuint32 RecTransferSize; // recommended transfer size to reach maximum bandwidth in Bytes
HSAuint32 RecSdmaEngIdMask; // recommended sdma engine IDs to reach maximum bandwidth
HSA_LINKPROPERTY Flags; // override flags (may be active for specific platforms)
} HsaIoLinkProperties;
//
// Memory allocation definitions for the KFD HSA interface
//
// Memory allocation flags. The bit-fields in ui32 add up to exactly 32 bits
// and overlay Value, which gives access to all flags as one 32-bit word.
typedef struct _HsaMemFlags
{
union
{
struct
{
unsigned int NonPaged : 1; // default = 0: pageable memory
unsigned int CachePolicy : 2; // see HSA_CACHING_TYPE
unsigned int ReadOnly : 1; // default = 0: Read/Write memory
unsigned int PageSize : 2; // see HSA_PAGE_SIZE
unsigned int HostAccess : 1; // default = 0: GPU access only
unsigned int NoSubstitute: 1; // default = 0: if specific memory is not available on node (e.g. on
// discrete GPU local), allocation may fall back to system memory node 0
// memory (= always available). Otherwise no allocation is possible.
unsigned int GDSMemory : 1; // default = 0: If set, the allocation will occur in GDS heap.
// HostAccess must be 0, all other flags (except NoSubstitute) should
// be 0 when setting this entry to 1. GDS allocation may fail due to
// limited resources. Application code is required to work without
// any allocated GDS memory using regular memory.
// Allocation fails on any node without GPU function.
unsigned int Scratch : 1; // default = 0: If set, the allocation will occur in GPU "scratch area".
// HostAccess must be 0, all other flags (except NoSubstitute) should be 0
// when setting this entry to 1. Scratch allocation may fail due to limited
// resources. Application code is required to work without any allocation.
// Allocation fails on any node without GPU function.
unsigned int AtomicAccessFull: 1; // default = 0: If set, the memory will be allocated and mapped to allow
// atomic ops processing. On AMD APU, this will use the ATC path on system
// memory, irrespective of the NonPaged flag setting (= if NonPaged is set,
// the memory is pagelocked but mapped through IOMMUv2 instead of GPUVM).
// All atomic ops must be supported on this memory.
unsigned int AtomicAccessPartial: 1; // default = 0: See above for AtomicAccessFull description, however
// focused on AMD discrete GPU that support PCIe atomics; the memory
// allocation is mapped to allow for PCIe atomics to operate on system
// memory, irrespective of NonPaged set or the presence of an ATC path
// in the system. The atomic operations supported are limited to SWAP,
// CompareAndSwap (CAS) and FetchAdd (this PCIe op allows both atomic
// increment and decrement via 2-complement arithmetic), which are the
// only atomic ops directly supported in PCI Express.
// On AMD APU, setting this flag will allocate the same type of memory
// as AtomicAccessFull, but it will be considered compatible with
// discrete GPU atomic operations access.
unsigned int ExecuteAccess: 1; // default = 0: Identifies if memory is primarily used for data or accessed
// for executable code (e.g. queue memory) by the host CPU or the device.
// Influences the page attribute setting within the allocation
unsigned int CoarseGrain : 1; // default = 0: The memory can be accessed assuming cache
// coherency maintained by link infrastructure and HSA agents.
// 1: memory consistency needs to be enforced at
// synchronization points at dispatch or other software
// enforced synchronization boundaries.
unsigned int AQLQueueMemory: 1; // default = 0; If 1: The caller indicates that the memory will be used as AQL queue memory.
// The KFD will ensure that the memory returned is allocated in the optimal memory location
// and optimal alignment requirements
unsigned int FixedAddress : 1; // Allocate memory at specified virtual address. Fail if address is not free.
unsigned int NoNUMABind: 1; // Don't bind system memory to a specific NUMA node
unsigned int Uncached: 1; // Caching flag for fine-grained memory on A+A HW platform
unsigned int NoAddress: 1; // only do vram allocation, return a handle, not allocate virtual address.
unsigned int OnlyAddress: 1; // only do virtual address allocation without vram allocation.
unsigned int ExtendedCoherent: 1; // system-scope coherence on atomic instructions
unsigned int GTTAccess: 1; // default = 0; If 1: The caller indicates this memory will be mapped to GART for MES
// KFD will allocate GTT memory with the Preferred_node set as gpu_id for GART mapping
unsigned int Contiguous: 1; // Allocate contiguous VRAM
unsigned int ExecuteBlit: 1; // default = 0; If 1: The caller indicates that the memory is for blit kernel object.
unsigned int Reserved: 8; // pads the flag set to 32 bits
} ui32;
HSAuint32 Value; // all flags as a single 32-bit word
};
} HsaMemFlags;
// Memory mapping flags. The bit-fields in ui32 add up to exactly 32 bits and
// overlay Value, which gives access to all flags as one 32-bit word.
typedef struct _HsaMemMapFlags
{
union
{
struct
{
unsigned int Reserved1 : 1; //
unsigned int CachePolicy : 2; // see HSA_CACHING_TYPE
unsigned int ReadOnly : 1; // memory is not modified while mapped
// allows migration scale-out
unsigned int PageSize : 2; // see HSA_PAGE_SIZE, hint to use
// this page size if possible and
// smaller than default
unsigned int HostAccess : 1; // default = 0: GPU access only
unsigned int Migrate : 1; // Hint: Allows migration to local mem
// of mapped GPU(s), instead of mapping
// physical location
unsigned int Probe : 1; // default = 0: Indicates that a range
// will be mapped by the process soon,
// but does not initiate a map operation
// may trigger eviction of nonessential
// data from the memory, reduces latency
// "cleanup hint" only, may be ignored
unsigned int Reserved : 23; // pads the flag set to 32 bits
} ui32;
HSAuint32 Value; // all flags as a single 32-bit word
};
} HsaMemMapFlags;
// Information about a graphics resource: its address and size, the metadata
// describing it, and the GPU node that exported it.
typedef struct _HsaGraphicsResourceInfo {
void *MemoryAddress; // For use in hsaKmtMapMemoryToGPU(Nodes)
HSAuint64 SizeInBytes; // Buffer size
const void *Metadata; // Pointer to metadata owned by Thunk
HSAuint32 MetadataSizeInBytes; // Size of metadata
HSAuint32 NodeId; // GPU exported the buffer
} HsaGraphicsResourceInfo;
// Caching policy values; referenced by the 2-bit CachePolicy fields of
// HsaMemFlags and HsaMemMapFlags.
typedef enum _HSA_CACHING_TYPE
{
HSA_CACHING_CACHED = 0,
HSA_CACHING_NONCACHED = 1,
HSA_CACHING_WRITECOMBINED = 2,
HSA_CACHING_RESERVED = 3,
HSA_CACHING_NUM_CACHING, // implicit value 4 = number of policies defined above
HSA_CACHING_SIZE = 0xFFFFFFFF // widens the enum's value range to 32 bits
} HSA_CACHING_TYPE;
// Page size selectors; referenced by the 2-bit PageSize fields of
// HsaMemFlags and HsaMemMapFlags.
typedef enum _HSA_PAGE_SIZE
{
HSA_PAGE_SIZE_4KB = 0,
HSA_PAGE_SIZE_64KB = 1, //64KB pages, not generally available in systems
HSA_PAGE_SIZE_2MB = 2,
HSA_PAGE_SIZE_1GB = 3, //1GB pages, not generally available in systems
} HSA_PAGE_SIZE;
// Device kind (CPU or GPU); used e.g. by HsaDeviceStateChange::Device.
typedef enum _HSA_DEVICE
{
HSA_DEVICE_CPU = 0,
HSA_DEVICE_GPU = 1,
MAX_HSA_DEVICE = 2 // number of device kinds defined above
} HSA_DEVICE;
// Queue priority levels, from MINIMUM (-3) to MAXIMUM (3), with NORMAL at 0.
// NOTE(review): the enumerators span -3 .. 0xFFFFFFFF, which fits neither a
// 32-bit int nor a 32-bit unsigned int, so compilers may choose a 64-bit
// underlying type for this enum - confirm sizeof() wherever ABI layout matters.
typedef enum _HSA_QUEUE_PRIORITY
{
HSA_QUEUE_PRIORITY_MINIMUM = -3,
HSA_QUEUE_PRIORITY_LOW = -2,
HSA_QUEUE_PRIORITY_BELOW_NORMAL = -1,
HSA_QUEUE_PRIORITY_NORMAL = 0,
HSA_QUEUE_PRIORITY_ABOVE_NORMAL = 1,
HSA_QUEUE_PRIORITY_HIGH = 2,
HSA_QUEUE_PRIORITY_MAXIMUM = 3,
HSA_QUEUE_PRIORITY_NUM_PRIORITY, // implicit value 4 (MAXIMUM + 1); NOTE(review): not the level count (7) - confirm intent
HSA_QUEUE_PRIORITY_SIZE = 0xFFFFFFFF
} HSA_QUEUE_PRIORITY;
// Queue types. Values are grouped in decades: 1-6 native queues, 11-14 queues
// allowed to reference OS graphics resources, 21-23 AQL packet compatible queues.
typedef enum _HSA_QUEUE_TYPE
{
HSA_QUEUE_COMPUTE = 1, // AMD PM4 compatible Compute Queue
HSA_QUEUE_SDMA = 2, // PCIe optimized SDMA Queue, used for data transport and format conversion (e.g. (de-)tiling, etc).
HSA_QUEUE_MULTIMEDIA_DECODE = 3, // reserved, for HSA multimedia decode queue
HSA_QUEUE_MULTIMEDIA_ENCODE = 4, // reserved, for HSA multimedia encode queue
HSA_QUEUE_SDMA_XGMI = 5, // XGMI optimized SDMA Queue
HSA_QUEUE_SDMA_BY_ENG_ID = 6, // Queue with specified SDMA engine ID
// the following values indicate a queue type permitted to reference OS graphics
// resources through the interoperation API. See [5] "HSA Graphics Interoperation
// specification" for more details on use of such resources.
HSA_QUEUE_COMPUTE_OS = 11, // AMD PM4 compatible Compute Queue
HSA_QUEUE_SDMA_OS = 12, // SDMA Queue, used for data transport and format conversion (e.g. (de-)tiling, etc).
HSA_QUEUE_MULTIMEDIA_DECODE_OS = 13, // reserved, for HSA multimedia decode queue
HSA_QUEUE_MULTIMEDIA_ENCODE_OS = 14, // reserved, for HSA multimedia encode queue
HSA_QUEUE_COMPUTE_AQL = 21, // HSA AQL packet compatible Compute Queue
HSA_QUEUE_DMA_AQL = 22, // HSA AQL packet compatible DMA Queue
HSA_QUEUE_DMA_AQL_XGMI = 23, // HSA AQL packet compatible XGMI optimized DMA Queue
// more types in the future
HSA_QUEUE_TYPE_SIZE = 0xFFFFFFFF //aligns to 32bit enum
} HSA_QUEUE_TYPE;
/**
The user context save area is page aligned. The HsaUserContextSaveAreaHeader
header starts at offset 0. Space for a user space copy of the control stack
comes next and is immediately followed by the user space wave save state. The
start of the user space wave save state is page aligned. The debugger reserved
area comes next and is 64 byte aligned.
The user context save area is valid for the duration that the associated
queue exists. When a context save occurs, the HsaUserContextSaveAreaHeader
header will be updated with information about the context save. The context
save area is not modified by any other operation, including a context resume.
*/
// Header located at offset 0 of the user context save area. All offsets are
// byte offsets measured from the start of that area.
typedef struct
{
HSAuint32 ControlStackOffset; // Byte offset from start of user context
// save area to the last saved top (lowest
// address) of control stack data. Must be
// 4 byte aligned.
HSAuint32 ControlStackSize; // Byte size of the last saved control stack
// data. Must be 4 byte aligned.
HSAuint32 WaveStateOffset; // Byte offset from start of user context save
// area to the last saved base (lowest address)
// of wave state data. Must be 4 byte aligned.
HSAuint32 WaveStateSize; // Byte size of the last saved wave state data.
// Must be 4 byte aligned.
HSAuint32 DebugOffset; // Byte offset from start of the user context
// save area to the memory reserved for the
// debugger. Must be 64 byte aligned.
HSAuint32 DebugSize; // Byte size of the memory reserved for the
// debugger. Must be 64 byte aligned.
volatile HSAint64 *ErrorReason; // Address of the HSA signal payload for
// reporting the error reason bitmask.
// Must be 4 byte aligned.
HSAuint32 ErrorEventId; // Event ID used for exception signalling.
// Must be 4 byte aligned.
HSAuint32 Reserved1; // reserved
} HsaUserContextSaveAreaHeader;
// Queue state information: error state, CU assignment mask, and the location
// and size of the queue's context save area and control stack.
typedef struct
{
HSAuint32 QueueDetailError; // HW specific queue error state
HSAuint32 QueueTypeExtended; // HW specific queue type info.
// 0 = no information
HSAuint32 NumCUAssigned; // size of *CUMaskInfo bit array, Multiple
// of 32, 0 = no information
HSAuint32* CUMaskInfo; // runtime/system CU assignment for realtime
// queue & reserved CU priority. Ptr to
// bit-array, each bit represents one CU.
// NULL = no information
HSAuint32* UserContextSaveArea; // reference to user space context save area
HSAuint64 SaveAreaSizeInBytes; // Must be 4-Byte aligned
HSAuint32* ControlStackTop; // ptr to the TOS
HSAuint64 ControlStackUsedInBytes; // Must be 4-Byte aligned
HsaUserContextSaveAreaHeader *SaveAreaHeader; // pointer to a HsaUserContextSaveAreaHeader
HSAuint64 Reserved2; // runtime/system CU assignment
} HsaQueueInfo;
// Resources describing a queue: its id plus the doorbell, write-pointer and
// read-pointer locations. Each anonymous union overlays a 32-bit pointer view,
// a 64-bit (AQL) pointer view and the raw 64-bit value of the same address.
typedef struct _HsaQueueResource
{
HSA_QUEUEID QueueId; /** queue ID */
/** Doorbell address to notify HW of a new dispatch */
union
{
HSAuint32* Queue_DoorBell;
HSAuint64* Queue_DoorBell_aql;
HSAuint64 QueueDoorBell;
};
/** virtual address to notify HW of queue write ptr value */
union
{
HSAuint32* Queue_write_ptr;
HSAuint64* Queue_write_ptr_aql;
HSAuint64 QueueWptrValue;
};
/** virtual address updated by HW to indicate current read location */
union
{
HSAuint32* Queue_read_ptr;
HSAuint64* Queue_read_ptr_aql;
HSAuint64 QueueRptrValue;
};
volatile HSAint64* ErrorReason; /** exception bits signal payload */
} HsaQueueResource;
//TEMPORARY structure definition - to be used only on "Trinity + Southern Islands" platform
// Report describing a user-mode mapped compute ring: its VMID, address and size.
typedef struct _HsaQueueReport
{
HSAuint32 VMID; //Required on SI to dispatch IB in primary ring
void* QueueAddress; //virtual address of UM mapped compute ring
HSAuint64 QueueSize; //size of the UM mapped compute ring
} HsaQueueReport;
// Debugger wavefront operations. Values start at 1.
typedef enum _HSA_DBG_WAVEOP
{
HSA_DBG_WAVEOP_HALT = 1, //Halts a wavefront
HSA_DBG_WAVEOP_RESUME = 2, //Resumes a wavefront
HSA_DBG_WAVEOP_KILL = 3, //Kills a wavefront
HSA_DBG_WAVEOP_DEBUG = 4, //Causes wavefront to enter debug mode
HSA_DBG_WAVEOP_TRAP = 5, //Causes wavefront to take a trap
HSA_DBG_NUM_WAVEOP = 5, //number of operations; same value as the highest operation (TRAP)
HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF //widens the enum's value range to 32 bits
} HSA_DBG_WAVEOP;
// Scope of a debugger wave command. Value 1 is intentionally not defined
// (see the note on broadcasting to all processes below).
typedef enum _HSA_DBG_WAVEMODE
{
HSA_DBG_WAVEMODE_SINGLE = 0, //send command to a single wave
//Broadcast to all wavefronts of all processes is not supported for HSA user mode
HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, //send to waves within current process
HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, //send to waves within current process on CU
HSA_DBG_NUM_WAVEMODE = 3, //same value as the highest defined mode
HSA_DBG_MAX_WAVEMODE = 0xFFFFFFFF //widens the enum's value range to 32 bits
} HSA_DBG_WAVEMODE;
// Debugger wave message types.
typedef enum _HSA_DBG_WAVEMSG_TYPE
{
HSA_DBG_WAVEMSG_AUTO = 0,
HSA_DBG_WAVEMSG_USER = 1,
HSA_DBG_WAVEMSG_ERROR = 2,
HSA_DBG_NUM_WAVEMSG, // implicit value 3 = number of message types defined above
HSA_DBG_MAX_WAVEMSG = 0xFFFFFFFF // widens the enum's value range to 32 bits
} HSA_DBG_WAVEMSG_TYPE;
// Debugger watch modes: which kinds of memory operation are watched.
typedef enum _HSA_DBG_WATCH_MODE
{
HSA_DBG_WATCH_READ = 0, //Read operations only
HSA_DBG_WATCH_NONREAD = 1, //Write or Atomic operations only
HSA_DBG_WATCH_ATOMIC = 2, //Atomic Operations only
HSA_DBG_WATCH_ALL = 3, //Read, Write or Atomic operations
HSA_DBG_WATCH_NUM //implicit value 4 = number of watch modes defined above
} HSA_DBG_WATCH_MODE;
// How a supplied HSA_DBG_TRAP_MASK is combined with the exception mask.
typedef enum _HSA_DBG_TRAP_OVERRIDE
{
HSA_DBG_TRAP_OVERRIDE_OR = 0, // Bitwise OR exception mask with HSA_DBG_TRAP_MASK
HSA_DBG_TRAP_OVERRIDE_REPLACE = 1, // Replace exception mask with HSA_DBG_TRAP_MASK
HSA_DBG_TRAP_OVERRIDE_NUM // implicit value 2 = number of override modes defined above
} HSA_DBG_TRAP_OVERRIDE;
/*
 * Exception mask bits. Every enumerator occupies its own bit, so values can
 * be OR-ed together into a mask (see HSA_DBG_TRAP_OVERRIDE).
 * "DENOMAL" in HSA_DBG_TRAP_MASK_FP_INPUT_DENOMAL is a historical spelling
 * (sic) of "denormal"; the identifier is public and therefore left unchanged.
 */
typedef enum _HSA_DBG_TRAP_MASK
{
    HSA_DBG_TRAP_MASK_FP_INVALID           = 1 << 0, /* Floating point invalid operation */
    HSA_DBG_TRAP_MASK_FP_INPUT_DENOMAL     = 1 << 1, /* Floating point input denormal */
    HSA_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO    = 1 << 2, /* Floating point divide by zero */
    HSA_DBG_TRAP_MASK_FP_OVERFLOW          = 1 << 3, /* Floating point overflow */
    HSA_DBG_TRAP_MASK_FP_UNDERFLOW         = 1 << 4, /* Floating point underflow */
    HSA_DBG_TRAP_MASK_FP_INEXACT           = 1 << 5, /* Floating point inexact */
    HSA_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO   = 1 << 6, /* Integer divide by zero */
    HSA_DBG_TRAP_MASK_DBG_ADDRESS_WATCH    = 1 << 7, /* Debug address watch */
    HSA_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION = 1 << 8  /* Memory violation */
} HSA_DBG_TRAP_MASK;
// Debug trap exception codes, grouped by scope: per queue (1-31), per device
// (32-36) and per process (48-49). HSA_EC_MASK() turns a code into a bit mask
// by shifting 1 left by (code - 1).
typedef enum _HSA_DBG_TRAP_EXCEPTION_CODE {
HSA_DBG_EC_NONE = 0,
/* per queue */
HSA_DBG_EC_QUEUE_WAVE_ABORT = 1,
HSA_DBG_EC_QUEUE_WAVE_TRAP = 2,
HSA_DBG_EC_QUEUE_WAVE_MATH_ERROR = 3,
HSA_DBG_EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION = 4,
HSA_DBG_EC_QUEUE_WAVE_MEMORY_VIOLATION = 5,
HSA_DBG_EC_QUEUE_WAVE_APERTURE_VIOLATION = 6,
HSA_DBG_EC_QUEUE_PACKET_DISPATCH_DIM_INVALID = 16,
HSA_DBG_EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID = 17,
HSA_DBG_EC_QUEUE_PACKET_DISPATCH_CODE_INVALID = 18,
HSA_DBG_EC_QUEUE_PACKET_RESERVED = 19,
HSA_DBG_EC_QUEUE_PACKET_UNSUPPORTED = 20,
HSA_DBG_EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
HSA_DBG_EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
HSA_DBG_EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
HSA_DBG_EC_QUEUE_PREEMPTION_ERROR = 30,
HSA_DBG_EC_QUEUE_NEW = 31,
/* per device */
HSA_DBG_EC_DEVICE_QUEUE_DELETE = 32,
HSA_DBG_EC_DEVICE_MEMORY_VIOLATION = 33,
HSA_DBG_EC_DEVICE_RAS_ERROR = 34,
HSA_DBG_EC_DEVICE_FATAL_HALT = 35,
HSA_DBG_EC_DEVICE_NEW = 36,
/* per process */
HSA_DBG_EC_PROCESS_RUNTIME = 48,
HSA_DBG_EC_PROCESS_DEVICE_REMOVE = 49,
HSA_DBG_EC_MAX /* implicit value 50 = one past the highest defined code */
} HSA_DBG_TRAP_EXCEPTION_CODE;
/* Mask generated by ecode defined in enum above.
 * Expands to an unsigned 64-bit value with only bit (ecode - 1) set, so
 * exception code 1 maps to bit 0, code 2 to bit 1, and so on.
 * The argument is parenthesized so that expression arguments (for example
 * "cond ? A : B") are evaluated as a whole before the subtraction.
 * ecode must be in the range 1..64; HSA_DBG_EC_NONE (0) has no mask bit and
 * would result in an invalid (negative) shift count.
 */
#define HSA_EC_MASK(ecode) (1ULL << ((ecode) - 1))
// Modes controlling how new wavefronts are launched while debugging.
typedef enum _HSA_DBG_WAVE_LAUNCH_MODE
{
HSA_DBG_WAVE_LAUNCH_MODE_NORMAL = 0, // Wavefront launched normally.
HSA_DBG_WAVE_LAUNCH_MODE_HALT = 1, // Wavefront launched in halted mode.
HSA_DBG_WAVE_LAUNCH_MODE_KILL = 2, // Wavefront is launched but immediately
// terminated before executing any instructions.
HSA_DBG_WAVE_LAUNCH_MODE_SINGLE_STEP = 3, // Wavefront is launched in single step (debug)
// mode. If debug trap is enabled by
// hsaKmtDbgEnableDebugTrap() then causes a
// trap after executing each instruction,
// otherwise behaves the same as
// HSA_DBG_WAVE_LAUNCH_MODE_NORMAL.
HSA_DBG_WAVE_LAUNCH_MODE_DISABLE = 4, // Disable launching any new waves.
HSA_DBG_WAVE_LAUNCH_MODE_NUM // implicit value 5 = number of launch modes defined above
} HSA_DBG_WAVE_LAUNCH_MODE;
/**
* There are no flags currently defined.
*/
// Node control flags for debugging. Only the _MAX sentinel is declared.
// Note: unlike the other enums in this file, the tag has no leading underscore.
typedef enum HSA_DBG_NODE_CONTROL {
HSA_DBG_NODE_CONTROL_FLAG_MAX = 0x01 // NOTE(review): presumably the upper bound for flag values - confirm
} HSA_DBG_NODE_CONTROL;
// Mask selecting bit 31 of a 32-bit capability word.
// NOTE(review): per its name, flags core dump support in the runtime-enable caps - confirm against its users.
#define HSA_RUNTIME_ENABLE_CAPS_SUPPORTS_CORE_DUMP_MASK 0x80000000
//This structure is hardware specific and may change in the future
// Hardware specific ("Gen2") debugger wave message payload: two 32-bit words.
typedef struct _HsaDbgWaveMsgAMDGen2
{
HSAuint32 Value; // message value (hardware specific encoding)
HSAuint32 Reserved2; // reserved
} HsaDbgWaveMsgAMDGen2;
// Union of the hardware specific wave message layouts; currently only Gen2.
typedef union _HsaDbgWaveMessageAMD
{
HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2;
//for future HsaDbgWaveMsgAMDGen3;
} HsaDbgWaveMessageAMD;
// Debugger wave message: a host-accessible data pointer plus the hardware
// specific message payload.
typedef struct _HsaDbgWaveMessage
{
void* MemoryVA; // ptr to associated host-accessible data
HsaDbgWaveMessageAMD DbgWaveMsg; // hardware specific message, see HsaDbgWaveMessageAMD
} HsaDbgWaveMessage;
//
// HSA sync primitive, Event and HW Exception notification API definitions
// The API functions allow the runtime to define a so-called sync-primitive, a SW object
// combining a user-mode provided "syncvar" and a scheduler event that can be signaled
// through a defined GPU interrupt. A syncvar is a process virtual memory location of
// a certain size that can be accessed by CPU and GPU shader code within the process to set
// and query the content within that memory. The definition of the content is determined by
// the HSA runtime and potentially GPU shader code interfacing with the HSA runtime.
// The syncvar values may be commonly written through a PM4 WRITE_DATA packet in the
// user mode instruction stream.
// The OS scheduler event is typically associated and signaled by an interrupt issued by
// the GPU, but other HSA system interrupt conditions from other HW (e.g. IOMMUv2) may be
// surfaced by the KFD by this mechanism, too.
//
// these are the new definitions for events
// Event types; selects which member of HsaEventData::EventData is meaningful.
typedef enum _HSA_EVENTTYPE
{
HSA_EVENTTYPE_SIGNAL = 0, //user-mode generated GPU signal
HSA_EVENTTYPE_NODECHANGE = 1, //HSA node change (attach/detach)
HSA_EVENTTYPE_DEVICESTATECHANGE = 2, //HSA device state change( start/stop )
HSA_EVENTTYPE_HW_EXCEPTION = 3, //GPU shader exception event
HSA_EVENTTYPE_SYSTEM_EVENT = 4, //GPU SYSCALL with parameter info
HSA_EVENTTYPE_DEBUG_EVENT = 5, //GPU signal for debugging
HSA_EVENTTYPE_PROFILE_EVENT = 6, //GPU signal for profiling
HSA_EVENTTYPE_QUEUE_EVENT = 7, //GPU signal queue idle state (EOP pm4)
HSA_EVENTTYPE_MEMORY = 8, //GPU signal for signaling memory access faults and memory subsystem issues
//...
HSA_EVENTTYPE_MAXID, //implicit value 9 = number of event types defined above
HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF //widens the enum's value range to 32 bits
} HSA_EVENTTYPE;
//
// Definitions for types of pending debug events
//
// Types of pending debug events. The values form a two-bit set:
// TRAP (1) | VMFAULT (2) == TRAP_VMFAULT (3).
typedef enum _HSA_DEBUG_EVENT_TYPE
{
HSA_DEBUG_EVENT_TYPE_NONE = 0,
HSA_DEBUG_EVENT_TYPE_TRAP = 1,
HSA_DEBUG_EVENT_TYPE_VMFAULT = 2,
HSA_DEBUG_EVENT_TYPE_TRAP_VMFAULT = 3
} HSA_DEBUG_EVENT_TYPE;
// 32-bit event identifier (see HsaEvent::EventId).
typedef HSAuint32 HSA_EVENTID;
//
// Subdefinitions for various event types: Syncvar
//
// Sync variable descriptor: location of the user-mode data and its size.
// The union overlays the pointer with a 64-bit integer view of the same address.
typedef struct _HsaSyncVar
{
union
{
void* UserData; //pointer to user mode data
HSAuint64 UserDataPtrValue; //64bit compatibility of value
} SyncVar;
HSAuint64 SyncVarSize; //size of the syncvar; NOTE(review): presumably in bytes - confirm
} HsaSyncVar;
//
// Subdefinitions for various event types: NodeChange
//
// Flags for node change events: whether a node was added or removed.
typedef enum _HSA_EVENTTYPE_NODECHANGE_FLAGS
{
HSA_EVENTTYPE_NODECHANGE_ADD = 0,
HSA_EVENTTYPE_NODECHANGE_REMOVE = 1,
HSA_EVENTTYPE_NODECHANGE_SIZE = 0xFFFFFFFF // widens the enum's value range to 32 bits
} HSA_EVENTTYPE_NODECHANGE_FLAGS;
// Event data for a node change event (see HsaEventData::NodeChangeState).
typedef struct _HsaNodeChange
{
HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; // HSA node added/removed on the platform
} HsaNodeChange;
//
// Sub-definitions for various event types: DeviceStateChange
//
// Flags for device state change events.
// Note: the enumerators use "DEVICESTATUSCHANGE" while the type name uses
// "DEVICESTATECHANGE"; both spellings are public and kept as-is.
typedef enum _HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS
{
HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, //device started (and available)
HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, //device stopped (i.e. unavailable)
HSA_EVENTTYPE_DEVICESTATUSCHANGE_SIZE = 0xFFFFFFFF //widens the enum's value range to 32 bits
} HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS;
// Event data for a device state change event (see HsaEventData::DeviceState).
typedef struct _HsaDeviceStateChange
{
HSAuint32 NodeId; // F-NUMA node that contains the device
HSA_DEVICE Device; // device type: GPU or CPU
HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; // event flags
} HsaDeviceStateChange;
//
// Sub-definitions for various event types: Memory exception
//
// Severity of a memory access fault (see HsaMemoryAccessFault::Flags).
typedef enum _HSA_EVENTID_MEMORYFLAGS
{
HSA_EVENTID_MEMORY_RECOVERABLE = 0, //access fault, recoverable after page adjustment
HSA_EVENTID_MEMORY_FATAL_PROCESS = 1, //memory access requires process context destruction, unrecoverable
HSA_EVENTID_MEMORY_FATAL_VM = 2, //memory access requires all GPU VA context destruction, unrecoverable
} HSA_EVENTID_MEMORYFLAGS;
// Bit-field set describing why a memory access failed. The fields add up to
// exactly 32 bits (6 one-bit flags, a 3-bit error type and 23 reserved bits).
typedef struct _HsaAccessAttributeFailure
{
unsigned int NotPresent : 1; // Page not present or supervisor privilege
unsigned int ReadOnly : 1; // Write access to a read-only page
unsigned int NoExecute : 1; // Execute access to a page marked NX
unsigned int GpuAccess : 1; // Host access only
unsigned int ECC : 1; // RAS ECC failure (notification of DRAM ECC - non-recoverable - error, if supported by HW)
unsigned int Imprecise : 1; // Can't determine the exact fault address
unsigned int ErrorType : 3; // Indicates RAS errors or other errors causing the access to GPU to fail
// 0 = no RAS error, 1 = ECC_SRAM, 2 = Link_SYNFLOOD (poison), 3 = GPU hang (not attributable to a specific cause), other values reserved
unsigned int Reserved : 23; // must be 0
} HsaAccessAttributeFailure;
// data associated with HSA_EVENTID_MEMORY
// Event data for a memory access fault (see HsaEventData::MemoryAccessFault).
typedef struct _HsaMemoryAccessFault
{
HSAuint32 NodeId; // H-NUMA node that contains the device where the memory access occurred
HSAuint64 VirtualAddress; // virtual address this occurred on
HsaAccessAttributeFailure Failure; // failure attribute
HSA_EVENTID_MEMORYFLAGS Flags; // event flags
} HsaMemoryAccessFault;
// Cause of a hardware exception (see HsaHwException::ResetCause).
typedef enum _HSA_EVENTID_HW_EXCEPTION_CAUSE
{
HSA_EVENTID_HW_EXCEPTION_GPU_HANG = 0, // GPU Hang
HSA_EVENTID_HW_EXCEPTION_ECC = 1, // SRAM ECC error
} HSA_EVENTID_HW_EXCEPTION_CAUSE;
// data associated with HSA_EVENTID_HW_EXCEPTION
// Event data for a hardware exception (see HsaEventData::HwException).
typedef struct _HsaHwException
{
HSAuint32 NodeId; // Node Id where the memory exception occurred
HSAuint32 ResetType; // NOTE(review): encoding not visible here - confirm against the producer
HSAuint32 MemoryLost; // NOTE(review): presumably non-zero when memory contents were lost - confirm
HSA_EVENTID_HW_EXCEPTION_CAUSE ResetCause; // see HSA_EVENTID_HW_EXCEPTION_CAUSE
} HsaHwException;
// Data attached to an event: the event type, a union holding the type
// specific payload, and three fields reserved for internal use.
typedef struct _HsaEventData
{
HSA_EVENTTYPE EventType; //event type
union
{
// return data associated with HSA_EVENTTYPE_SIGNAL and other events
HsaSyncVar SyncVar;
// data associated with HSA_EVENTTYPE_NODECHANGE
HsaNodeChange NodeChangeState;
// data associated with HSA_EVENTTYPE_DEVICESTATECHANGE
HsaDeviceStateChange DeviceState;
// data associated with HSA_EVENTTYPE_MEMORY
HsaMemoryAccessFault MemoryAccessFault;
// data associated with HSA_EVENTTYPE_HW_EXCEPTION
HsaHwException HwException;
} EventData;
// the following data entries are internal to the KFD & thunk itself.
HSAuint64 HWData1; // internal thunk store for Event data (OsEventHandle)
HSAuint64 HWData2; // internal thunk store for Event data (HWAddress)
HSAuint32 HWData3; // internal thunk store for Event data (HWData)
} HsaEventData;
// Describes an event to allocate: its type, source node and syncvar.
typedef struct _HsaEventDescriptor
{
HSA_EVENTTYPE EventType; // event type to allocate
HSAuint32 NodeId; // H-NUMA node containing GPU device that is event source
HsaSyncVar SyncVar; // pointer to user mode syncvar data, syncvar->UserDataPtrValue may be NULL
} HsaEventDescriptor;
// An event: its identifier together with the associated event data.
typedef struct _HsaEvent
{
HSA_EVENTID EventId; // event identifier
HsaEventData EventData; // type and payload, see HsaEventData
} HsaEvent;
// Special event timeout values.
// Note: the tag is spelled "_HsaEventTimeout" while the typedef name is
// "HsaEventTimeOut" (capital O); both are public and kept as-is.
// NOTE(review): presumably passed as the timeout of event wait calls - confirm.
typedef enum _HsaEventTimeout
{
HSA_EVENTTIMEOUT_IMMEDIATE = 0,
HSA_EVENTTIMEOUT_INFINITE = 0xFFFFFFFF
} HsaEventTimeOut;
// Set of 64-bit clock counter values plus the system clock frequency.
typedef struct _HsaClockCounters
{
HSAuint64 GPUClockCounter; // GPU clock counter value
HSAuint64 CPUClockCounter; // CPU clock counter value
HSAuint64 SystemClockCounter; // system clock counter value
HSAuint64 SystemClockFrequencyHz; // system clock frequency, in Hz per the field name
} HsaClockCounters;
// UUID support. When DEFINE_GUID is not available, HSA_UUID is declared here
// as a 16-byte structure (32 + 16 + 16 + 8x8 bits) and HSA_DEFINE_UUID expands
// to a "static const" object definition. Otherwise both names are mapped onto
// the existing GUID type and DEFINE_GUID macro.
#ifndef DEFINE_GUID
typedef struct _HSA_UUID
{
HSAuint32 Data1;
HSAuint16 Data2;
HSAuint16 Data3;
HSAuint8 Data4[8];
} HSA_UUID;
// Defines a file-local constant HSA_UUID called `name` from its 11 components.
#define HSA_DEFINE_UUID(name, dw, w1, w2, b1, b2, b3, b4, b5, b6, b7, b8) \
static const HSA_UUID name = {dw, w1, w2, {b1, b2, b3, b4, b5, b6, b7, b8}}
#else
#define HSA_UUID GUID
#define HSA_DEFINE_UUID DEFINE_GUID
#endif
// UUID constants identifying the GPU hardware blocks. Each comment shows the
// textual form of the UUID defined right below it.
// HSA_UUID that identifies the GPU ColorBuffer (CB) block
// {9ba429c6-af2d-4b38-b349-157271beac6a}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_CB,
0x9ba429c6, 0xaf2d, 0x4b38, 0xb3, 0x49, 0x15, 0x72, 0x71, 0xbe, 0xac, 0x6a);
// HSA_UUID that identifies the GPU (CPF) block
// {2b0ad2b5-1c43-4f46-a7bc-e119411ea6c9}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_CPF,
0x2b0ad2b5, 0x1c43, 0x4f46, 0xa7, 0xbc, 0xe1, 0x19, 0x41, 0x1e, 0xa6, 0xc9);
// HSA_UUID that identifies the GPU (CPG) block
// {590ec94d-20f0-448f-8dff-316c679de7ff}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_CPG,
0x590ec94d, 0x20f0, 0x448f, 0x8d, 0xff, 0x31, 0x6c, 0x67, 0x9d, 0xe7, 0xff);
// HSA_UUID that identifies the GPU (DB) block
// {3d1a47fc-0013-4ed4-8306-822ca0b7a6c2}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_DB,
0x3d1a47fc, 0x0013, 0x4ed4, 0x83, 0x06, 0x82, 0x2c, 0xa0, 0xb7, 0xa6, 0xc2);
// HSA_UUID that identifies the GPU (GDS) block
// {f59276ec-2526-4bf8-8ec0-118f77700dc9}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_GDS,
0xf59276ec, 0x2526, 0x4bf8, 0x8e, 0xc0, 0x11, 0x8f, 0x77, 0x70, 0x0d, 0xc9);
// HSA_UUID that identifies the GPU (GRBM) block
// {8f00933c-c33d-4801-97b7-7007f78573ad}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_GRBM,
0x8f00933c, 0xc33d, 0x4801, 0x97, 0xb7, 0x70, 0x07, 0xf7, 0x85, 0x73, 0xad);
// HSA_UUID that identifies the GPU (GRBMSE) block
// {34ebd8d7-7c8b-4d15-88fa-0e4e4af59ac1}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_GRBMSE,
0x34ebd8d7, 0x7c8b, 0x4d15, 0x88, 0xfa, 0x0e, 0x4e, 0x4a, 0xf5, 0x9a, 0xc1);
// HSA_UUID that identifies the GPU (IA) block
// {34276944-4264-4fcd-9d6e-ae264582ec51}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_IA,
0x34276944, 0x4264, 0x4fcd, 0x9d, 0x6e, 0xae, 0x26, 0x45, 0x82, 0xec, 0x51);
// HSA_UUID that identifies the GPU Memory Controller (MC) block
// {13900B57-4956-4D98-81D0-68521937F59C}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_MC,
0x13900b57, 0x4956, 0x4d98, 0x81, 0xd0, 0x68, 0x52, 0x19, 0x37, 0xf5, 0x9c);
// HSA_UUID that identifies the GPU (PASC) block
// {b0e7fb5d-0efc-4744-b516-5d23dc1fd56c}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_PASC,
0xb0e7fb5d, 0x0efc, 0x4744, 0xb5, 0x16, 0x5d, 0x23, 0xdc, 0x1f, 0xd5, 0x6c);
// HSA_UUID that identifies the GPU (PASU) block
// {9a152b6a-1fad-45f2-a5bf-f163826bd0cd}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_PASU,
0x9a152b6a, 0x1fad, 0x45f2, 0xa5, 0xbf, 0xf1, 0x63, 0x82, 0x6b, 0xd0, 0xcd);
// HSA_UUID that identifies the GPU (SPI) block
// {eda81044-d62c-47eb-af89-4f6fbf3b38e0}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_SPI,
0xeda81044, 0xd62c, 0x47eb, 0xaf, 0x89, 0x4f, 0x6f, 0xbf, 0x3b, 0x38, 0xe0);
// HSA_UUID that identifies the GPU (SRBM) block
// {9f8040e0-6830-4019-acc8-463c9e445b89}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_SRBM,
0x9f8040e0, 0x6830, 0x4019, 0xac, 0xc8, 0x46, 0x3c, 0x9e, 0x44, 0x5b, 0x89);
// GUID that identifies the GPU Shader Sequencer (SQ) block
// {B5C396B6-D310-47E4-86FC-5CC3043AF508}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_SQ,
0xb5c396b6, 0xd310, 0x47e4, 0x86, 0xfc, 0x5c, 0xc3, 0x4, 0x3a, 0xf5, 0x8);
// HSA_UUID that identifies the GPU (SX) block
// {bdb8d737-43cc-4162-be52-51cfb847beaf}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_SX,
0xbdb8d737, 0x43cc, 0x4162, 0xbe, 0x52, 0x51, 0xcf, 0xb8, 0x47, 0xbe, 0xaf);
// HSA_UUID that identifies the GPU (TA) block
// {c01ee43d-ad92-44b1-8ab9-be5e696ceea7}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_TA,
0xc01ee43d, 0xad92, 0x44b1, 0x8a, 0xb9, 0xbe, 0x5e, 0x69, 0x6c, 0xee, 0xa7);
// HSA_UUID that identifies the GPU TextureCache (TCA) block
// {333e393f-e147-4f49-a6d1-60914c7086b0}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_TCA,
0x333e393f, 0xe147, 0x4f49, 0xa6, 0xd1,0x60, 0x91, 0x4c, 0x70, 0x86, 0xb0);
// HSA_UUID that identifies the GPU TextureCache (TCC) block
// {848ce855-d805-4566-a8ab-73e884cc6bff}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_TCC,
0x848ce855, 0xd805, 0x4566, 0xa8, 0xab, 0x73, 0xe8, 0x84, 0xcc, 0x6b, 0xff);
// HSA_UUID that identifies the GPU (TCP) block
// {e10a013b-17d4-4bf5-b089-429591059b60}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_TCP,
0xe10a013b, 0x17d4, 0x4bf5, 0xb0, 0x89, 0x42, 0x95, 0x91, 0x05, 0x9b, 0x60);
// HSA_UUID that identifies the GPU (TCS) block
// {4126245c-4d96-4d1a-8aed-a939d4cc8ec9}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_TCS,
0x4126245c, 0x4d96, 0x4d1a, 0x8a, 0xed, 0xa9, 0x39, 0xd4, 0xcc, 0x8e, 0xc9);
// HSA_UUID that identifies the GPU (TD) block
// {7d7c0fe4-fe41-4fea-92c9-4544d7706dc6}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_TD,
0x7d7c0fe4, 0xfe41, 0x4fea, 0x92, 0xc9, 0x45, 0x44, 0xd7, 0x70, 0x6d, 0xc6);
// HSA_UUID that identifies the GPU (VGT) block
// {0b6a8cb7-7a01-409f-a22c-3014854f1359}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_VGT,
0x0b6a8cb7, 0x7a01, 0x409f, 0xa2, 0x2c, 0x30, 0x14, 0x85, 0x4f, 0x13, 0x59);
// HSA_UUID that identifies the GPU (WD) block
// {0e176789-46ed-4b02-972a-916d2fac244a}
HSA_DEFINE_UUID(HSA_PROFILEBLOCK_AMD_WD,
0x0e176789, 0x46ed, 0x4b02, 0x97, 0x2a, 0x91, 0x6d, 0x2f, 0xac, 0x24, 0x4a);
// Counter access types (see HsaCounter::Type): privileged vs. non-privileged,
// immediate vs. streaming.
typedef enum _HSA_PROFILE_TYPE
{
HSA_PROFILE_TYPE_PRIVILEGED_IMMEDIATE = 0, //immediate access counter (KFD access only)
HSA_PROFILE_TYPE_PRIVILEGED_STREAMING = 1, //streaming counter, HW continuously
//writes to memory on updates (KFD access only)
HSA_PROFILE_TYPE_NONPRIV_IMMEDIATE = 2, //user-queue accessible counter
HSA_PROFILE_TYPE_NONPRIV_STREAMING = 3, //user-queue accessible counter
//...
HSA_PROFILE_TYPE_NUM, //implicit value 4 = number of profile types defined above
HSA_PROFILE_TYPE_SIZE = 0xFFFFFFFF // In order to align to 32-bit value
} HSA_PROFILE_TYPE;
// Counter property flags. The bit-fields in ui32 add up to exactly 32 bits
// and overlay Value, which gives access to all flags as one 32-bit word.
typedef struct _HsaCounterFlags
{
union
{
struct
{
unsigned int Global : 1; // counter is global
// (not tied to VMID/WAVE/CU, ...)
unsigned int Resettable : 1; // counter can be reset by SW
// (always to 0?)
unsigned int ReadOnly : 1; // counter is read-only
// (but may be reset, if indicated)
unsigned int Stream : 1; // counter has streaming capability
// (after trigger, updates buffer)
unsigned int Reserved : 28; // pads the flag set to 32 bits
} ui32;
HSAuint32 Value; // all flags as a single 32-bit word
};
} HsaCounterFlags;
// Describes a single counter: type, register offset, width, value mask,
// property flags and the block it belongs to.
typedef struct _HsaCounter
{
HSA_PROFILE_TYPE Type; // specifies the counter type
HSAuint64 CounterId; // indicates counter register offset
HSAuint32 CounterSizeInBits; // indicates relevant counter bits
HSAuint64 CounterMask; // bitmask for counter value (if applicable)
HsaCounterFlags Flags; // Property flags (see above)
HSAuint32 BlockIndex; // identifies block the counter belongs to,
// value may be 0 to NumBlocks
} HsaCounter;
// Properties of one profilable block, followed by its counter descriptions.
// Counters[] is declared with a single element but, per the field comments,
// holds NumCounters elements, so the structure is variable-sized.
typedef struct _HsaCounterBlockProperties
{
HSA_UUID BlockId; // specifies the block location
HSAuint32 NumCounters; // How many counters are available?
// (sizes Counters[] array below)
HSAuint32 NumConcurrent; // How many counter slots are available
// in block?
HsaCounter Counters[1]; // Start of counter array
// (NumCounters elements total)
} HsaCounterBlockProperties;
// Top-level counter description: a header followed by NumBlocks block entries.
// NOTE(review): each HsaCounterBlockProperties ends in a variable-length
// Counters[] array, so Blocks[i] for i > 0 presumably cannot be reached by
// plain array indexing -- confirm against the code that fills this structure.
typedef struct _HsaCounterProperties
{
HSAuint32 NumBlocks; // How many profilable blocks are available?
// (sizes Blocks[] array below)
HSAuint32 NumConcurrent; // How many block slots can be queried
// concurrently by HW?
HsaCounterBlockProperties Blocks[1]; // Start of block array
// (NumBlocks elements total)
} HsaCounterProperties;
// 64-bit identifier of a trace (stored in HsaPmcTraceRoot.TraceId).
typedef HSAuint64 HSATraceId;
// Root description of a PMC trace: buffer size requirement, pass count and id.
typedef struct _HsaPmcTraceRoot
{
HSAuint64 TraceBufferMinSizeBytes;// minimum trace buffer size in bytes (page aligned)
HSAuint32 NumberOfPasses; // NOTE(review): presumably the number of passes needed to collect all counters -- confirm
HSATraceId TraceId; // identifier of this trace
} HsaPmcTraceRoot;
// GPU tile configuration: pointers to the tile and macro-tile config arrays
// with their entry counts, plus GbAddrConfig, NumBanks and NumRanks.
// NOTE(review): who allocates the two arrays is not visible here -- confirm
// against the API that takes this structure.
typedef struct _HsaGpuTileConfig
{
HSAuint32 *TileConfig; // array of NumTileConfigs entries
HSAuint32 *MacroTileConfig; // array of NumMacroTileConfigs entries
HSAuint32 NumTileConfigs;
HSAuint32 NumMacroTileConfigs;
HSAuint32 GbAddrConfig;
HSAuint32 NumBanks;
HSAuint32 NumRanks;
/* 9 dwords on 64-bit system (2 pointers = 4 dwords, plus 5 HSAuint32) */
HSAuint32 Reserved[7]; /* Round up to 16 dwords for future extension */
} HsaGpuTileConfig;
// Classification of a pointer, reported in the Type field of HsaPointerInfo.
typedef enum _HSA_POINTER_TYPE {
HSA_POINTER_UNKNOWN = 0,
HSA_POINTER_ALLOCATED = 1, // Allocated with hsaKmtAllocMemory (except scratch)
HSA_POINTER_REGISTERED_USER = 2, // Registered user pointer
HSA_POINTER_REGISTERED_GRAPHICS = 3, // Registered graphics buffer
HSA_POINTER_REGISTERED_SHARED = 4, // Registered shared buffer (IPC)
// (hsaKmtRegisterGraphicsToNodes)
HSA_POINTER_RESERVED_ADDR = 5 // address-only reservation VA
} HSA_POINTER_TYPE;
// Information about a memory pointer: its classification, location, CPU/GPU
// addresses, size, and the nodes it is registered and mapped to.
typedef struct _HsaPointerInfo {
HSA_POINTER_TYPE Type; // Pointer type
HSAuint32 Node; // Node where the memory is located
HsaMemFlags MemFlags; // HsaMemFlags used to alloc memory
void *CPUAddress; // Start address for CPU access
HSAuint64 GPUAddress; // Start address for GPU access
HSAuint64 SizeInBytes; // Size in bytes
HSAuint32 NRegisteredNodes; // Number of nodes the memory is registered to
HSAuint32 NMappedNodes; // Number of nodes the memory is mapped to
const HSAuint32 *RegisteredNodes; // Array of registered nodes (NRegisteredNodes entries)
const HSAuint32 *MappedNodes; // Array of mapped nodes (NMappedNodes entries)
void *UserData; // User data associated with the memory
} HsaPointerInfo;
// Handle for shared memory: an array of eight 32-bit words (32 bytes).
typedef HSAuint32 HsaSharedMemoryHandle[8];
// A contiguous memory range given as start address plus length.
typedef struct _HsaMemoryRange {
void *MemoryAddress; // Pointer to GPU memory
HSAuint64 SizeInBytes; // Size of above memory
} HsaMemoryRange;
// SVM range flags. Each enumerator is a distinct single bit so that values
// can be OR-ed into the bitmask carried by the HSA_SVM_ATTR_SET_FLAGS and
// HSA_SVM_ATTR_CLR_FLAGS attributes.
typedef enum _HSA_SVM_FLAGS {
    // Guarantee host access to memory
    HSA_SVM_FLAG_HOST_ACCESS = (1 << 0),
    // Fine grained coherency between all devices with access
    HSA_SVM_FLAG_COHERENT = (1 << 1),
    // Use any GPU in same hive as preferred device
    HSA_SVM_FLAG_HIVE_LOCAL = (1 << 2),
    // GPUs only read, allows replication
    HSA_SVM_FLAG_GPU_RO = (1 << 3),
    // Allow execution on GPU
    HSA_SVM_FLAG_GPU_EXEC = (1 << 4),
    // GPUs mostly read, may allow similar optimizations as RO, but writes fault
    HSA_SVM_FLAG_GPU_READ_MOSTLY = (1 << 5),
    // Keep GPU memory mapping always valid as if XNACK is disabled
    HSA_SVM_FLAG_GPU_ALWAYS_MAPPED = (1 << 6),
    // Fine grained coherency between all devices using device-scope atomics
    HSA_SVM_FLAG_EXT_COHERENT = (1 << 7)
} HSA_SVM_FLAGS;
// Attribute kinds for SVM ranges; stored in the type field of
// HSA_SVM_ATTRIBUTE. Values are spelled out explicitly; they are the same
// sequential values the compiler assigned before.
typedef enum _HSA_SVM_ATTR_TYPE {
    // gpuid of the preferred location, 0 for system memory,
    // INVALID_NODEID for "don't care"
    HSA_SVM_ATTR_PREFERRED_LOC = 0,
    // gpuid of the prefetch location, 0 for system memory.
    // Setting this triggers an immediate prefetch (migration)
    HSA_SVM_ATTR_PREFETCH_LOC = 1,
    // The next three specify memory access for the gpuid given
    // by the attribute value
    HSA_SVM_ATTR_ACCESS = 2,
    HSA_SVM_ATTR_ACCESS_IN_PLACE = 3,
    HSA_SVM_ATTR_NO_ACCESS = 4,
    // bitmask of flags to set (see HSA_SVM_FLAGS)
    HSA_SVM_ATTR_SET_FLAGS = 5,
    // bitmask of flags to clear
    HSA_SVM_ATTR_CLR_FLAGS = 6,
    // migration granularity (log2 num pages)
    HSA_SVM_ATTR_GRANULARITY = 7
} HSA_SVM_ATTR_TYPE;
// One SVM attribute as a (type, value) pair of 32-bit words. How value is
// interpreted depends on type (gpuid, flag bitmask or granularity; see the
// per-enumerator comments on HSA_SVM_ATTR_TYPE).
typedef struct _HSA_SVM_ATTRIBUTE {
HSAuint32 type; // attribute type (see enum HSA_SVM_ATTR_TYPE)
HSAuint32 value; // attribute value
} HSA_SVM_ATTRIBUTE;
// SMI event indices. Note that the typedef name is HSA_EVENT_TYPE while the
// enum tag is _HSA_SMI_EVENT. Indices start at 1; use
// HSA_SMI_EVENT_MASK_FROM_INDEX() to turn an index into its mask bit.
typedef enum _HSA_SMI_EVENT {
HSA_SMI_EVENT_NONE = 0, /* not used */
HSA_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */
HSA_SMI_EVENT_THERMAL_THROTTLE = 2,
HSA_SMI_EVENT_GPU_PRE_RESET = 3,
HSA_SMI_EVENT_GPU_POST_RESET = 4,
HSA_SMI_EVENT_MIGRATE_START = 5,
HSA_SMI_EVENT_MIGRATE_END = 6,
HSA_SMI_EVENT_PAGE_FAULT_START = 7,
HSA_SMI_EVENT_PAGE_FAULT_END = 8,
HSA_SMI_EVENT_QUEUE_EVICTION = 9,
HSA_SMI_EVENT_QUEUE_RESTORE = 10,
HSA_SMI_EVENT_UNMAP_FROM_GPU = 11,
HSA_SMI_EVENT_INDEX_MAX = 12, /* one past the last event index above */
/*
 * Highest event number, used as a flag bit to request events from all
 * processes. This requires super user permission; without that
 * permission no events from any process are received. When this flag
 * is not set, only events from the same process are received.
 */
HSA_SMI_EVENT_ALL_PROCESS = 64
} HSA_EVENT_TYPE;
// Causes of a migration. Values are written out explicitly and are the same
// sequential values the compiler assigned before.
typedef enum _HSA_MIGRATE_TRIGGERS {
    HSA_MIGRATE_TRIGGER_PREFETCH = 0,
    HSA_MIGRATE_TRIGGER_PAGEFAULT_GPU = 1,
    HSA_MIGRATE_TRIGGER_PAGEFAULT_CPU = 2,
    HSA_MIGRATE_TRIGGER_TTM_EVICTION = 3
} HSA_MIGRATE_TRIGGERS;
// Causes of a queue eviction. Values are written out explicitly and are the
// same sequential values the compiler assigned before. The two CRIU entries
// do not carry the _TRIGGER_ infix used by the others; the names are part of
// the public interface and are kept as they are.
typedef enum _HSA_QUEUE_EVICTION_TRIGGERS {
    HSA_QUEUE_EVICTION_TRIGGER_SVM = 0,
    HSA_QUEUE_EVICTION_TRIGGER_USERPTR = 1,
    HSA_QUEUE_EVICTION_TRIGGER_TTM = 2,
    HSA_QUEUE_EVICTION_TRIGGER_SUSPEND = 3,
    HSA_QUEUE_EVICTION_CRIU_CHECKPOINT = 4,
    HSA_QUEUE_EVICTION_CRIU_RESTORE = 5
} HSA_QUEUE_EVICTION_TRIGGERS;
// Causes of an SVM unmap. Values are written out explicitly and are the same
// sequential values the compiler assigned before.
typedef enum _HSA_SVM_UNMAP_TRIGGERS {
    HSA_SVM_UNMAP_TRIGGER_MMU_NOTIFY = 0,
    HSA_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE = 1,
    HSA_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU = 2
} HSA_SVM_UNMAP_TRIGGERS;
// Converts a 1-based SMI event index (see HSA_EVENT_TYPE) into its 64-bit mask
// bit: index 1 -> bit 0, index 64 -> bit 63. Index 0 (HSA_SMI_EVENT_NONE) is
// not a valid argument, since it would produce a negative shift count.
#define HSA_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
// Size constant for an SMI event message.
// NOTE(review): presumably bytes per event message -- confirm against the reader.
#define HSA_SMI_EVENT_MSG_SIZE 96
// Opaque handle to an AMDGPU device.
typedef void *HsaAMDGPUDeviceHandle;
// 32-bit identifier of a PC sampling trace.
typedef HSAuint32 HsaPcSamplingTraceId;
// PC sampling methods (type of HsaPcSamplingInfo.method). Numbering starts
// at 1; values are written out explicitly and match the previous implicit
// numbering.
typedef enum _HSA_PC_SAMPLING_METHOD_KIND
{
    HSA_PC_SAMPLING_METHOD_KIND_HOSTTRAP_V1 = 1,
    HSA_PC_SAMPLING_METHOD_KIND_STOCHASTIC_V1 = 2
} HSA_PC_SAMPLING_METHOD_KIND;
// Units of a PC sampling interval (type of HsaPcSamplingInfo.units). The enum
// tag is _HSA_PC_SAMPLING_UNITS while the typedef name is
// HSA_PC_SAMPLING_UNIT_INTERVAL. Values are written out explicitly and match
// the previous implicit numbering.
typedef enum _HSA_PC_SAMPLING_UNITS
{
    HSA_PC_SAMPLING_UNIT_INTERVAL_MICROSECONDS = 0,
    HSA_PC_SAMPLING_UNIT_INTERVAL_CYCLES = 1,
    HSA_PC_SAMPLING_UNIT_INTERVAL_INSTRUCTIONS = 2
} HSA_PC_SAMPLING_UNIT_INTERVAL;
// Describes one PC sampling configuration: a sampling method, the units of
// its interval, and a value with its bounds.
// NOTE(review): presumably value is the sampling interval expressed in units,
// and value_min/value_max are the supported bounds -- confirm against the
// PC sampling API that fills this structure.
typedef struct _HsaPcSamplingInfo
{
HSAuint64 value;
HSAuint64 value_min;
HSAuint64 value_max;
HSAuint64 flags; // NOTE(review): flag bit definitions are not visible here
HSA_PC_SAMPLING_METHOD_KIND method; // sampling method
HSA_PC_SAMPLING_UNIT_INTERVAL units; // units of the interval
}
HsaPcSamplingInfo;
// Flags for memory registration. The union overlays the raw 32-bit word
// (Value) with the named bitfield view (ui32); only bit 0 is defined.
typedef union
{
HSAuint32 Value; // all flags viewed as one 32-bit value
struct
{
unsigned int requiresVAddr : 1; // Requires virtual address
} ui32;
} HSA_REGISTER_MEM_FLAGS;
#pragma pack(pop, hsakmttypes_h)
#ifdef __cplusplus
} //extern "C"
#endif
#endif //_HSAKMTTYPES_H_
================================================
FILE: libhsakmt/include/hsakmt/linux/kfd_ioctl.h
================================================
/*
* Copyright 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef KFD_IOCTL_H_INCLUDED
#define KFD_IOCTL_H_INCLUDED
#include
#include
/*
 * KFD ioctl interface version history (no 1.2 entry is recorded here):
 *
 * - 1.1 - initial version
 * - 1.3 - Add SMI events support
 * - 1.4 - Indicate new SRAM EDC bit in device properties
 * - 1.5 - Add SVM API
 * - 1.6 - Query clear flags in SVM get_attr API
 * - 1.7 - Checkpoint Restore (CRIU) API
 * - 1.8 - CRIU - Support for SDMA transfers with GTT BOs
 * - 1.9 - Add available_memory ioctl
 * - 1.10 - Add SMI profiler event log
 * - 1.11 - Add unified memory for ctx save/restore area
 * - 1.12 - Add DMA buf export ioctl
 * - 1.13 - Add debugger API
 * - 1.14 - Update kfd_event_data
 * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
 * - 1.16 - Add contiguous VRAM allocation flag
 * - 1.17 - Add SDMA queue creation with target SDMA engine ID
 */
/* Interface version this header describes; matches the last entry above. */
#define KFD_IOCTL_MAJOR_VERSION 1
#define KFD_IOCTL_MINOR_VERSION 17
/*
 * Output arguments carrying the interface version reported by KFD
 * (compare with KFD_IOCTL_MAJOR_VERSION / KFD_IOCTL_MINOR_VERSION).
 */
struct kfd_ioctl_get_version_args {
__u32 major_version; /* from KFD */
__u32 minor_version; /* from KFD */
};
/* For kfd_ioctl_create_queue_args.queue_type. */
#define KFD_IOC_QUEUE_TYPE_COMPUTE 0x0
#define KFD_IOC_QUEUE_TYPE_SDMA 0x1
#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL 0x2
#define KFD_IOC_QUEUE_TYPE_SDMA_XGMI 0x3
#define KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID 0x4
/*
 * Upper bounds for queue_percentage and queue_priority.
 * NOTE(review): presumably enforced by KFD on queue create/update -- confirm.
 */
#define KFD_MAX_QUEUE_PERCENTAGE 100
#define KFD_MAX_QUEUE_PRIORITY 15
/*
 * Arguments for creating a queue. Each field is marked with its direction:
 * "to KFD" fields are inputs, "from KFD" fields are filled in by KFD.
 */
struct kfd_ioctl_create_queue_args {
__u64 ring_base_address; /* to KFD */
__u64 write_pointer_address; /* from KFD */
__u64 read_pointer_address; /* from KFD */
__u64 doorbell_offset; /* from KFD */
__u32 ring_size; /* to KFD */
__u32 gpu_id; /* to KFD */
__u32 queue_type; /* to KFD, one of KFD_IOC_QUEUE_TYPE_* */
__u32 queue_percentage; /* to KFD */
__u32 queue_priority; /* to KFD */
__u32 queue_id; /* from KFD */
__u64 eop_buffer_address; /* to KFD */
__u64 eop_buffer_size; /* to KFD */
__u64 ctx_save_restore_address; /* to KFD */
__u32 ctx_save_restore_size; /* to KFD */
__u32 ctl_stack_size; /* to KFD */
__u32 sdma_engine_id; /* to KFD */
__u32 pad; /* explicit padding */
};
/* Arguments for destroying a queue, identified by its queue id. */
struct kfd_ioctl_destroy_queue_args {
__u32 queue_id; /* to KFD */
__u32 pad; /* explicit padding */
};
/*
 * Arguments for updating an existing queue (selected by queue_id) with a
 * new ring buffer, percentage and priority. All fields are inputs.
 */
struct kfd_ioctl_update_queue_args {
__u64 ring_base_address; /* to KFD */
__u32 queue_id; /* to KFD */
__u32 ring_size; /* to KFD */
__u32 queue_percentage; /* to KFD */
__u32 queue_priority; /* to KFD */
};
/*
 * Arguments for setting the CU mask of a queue. All fields are inputs.
 * NOTE(review): num_cu_mask is presumably the number of mask bits, and
 * cu_mask_ptr a user pointer to the mask array -- confirm against the caller.
 */
struct kfd_ioctl_set_cu_mask_args {
__u32 queue_id; /* to KFD */
__u32 num_cu_mask; /* to KFD */
__u64 cu_mask_ptr; /* to KFD */
};
/*
 * Arguments for querying the wave state of a queue. The caller supplies
 * the queue id and a control stack address; KFD reports the used sizes.
 */
struct kfd_ioctl_get_queue_wave_state_args {
__u64 ctl_stack_address; /* to KFD */
__u32 ctl_stack_used_size; /* from KFD */
__u32 save_area_used_size; /* from KFD */
__u32 queue_id; /* to KFD */
__u32 pad; /* explicit padding */
};
/*
 * One entry of the queue snapshot buffer used with
 * KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT (see
 * kfd_ioctl_dbg_trap_queue_snapshot_args).
 */
struct kfd_queue_snapshot_entry {
__u64 exception_status;
__u64 ring_base_address;
__u64 write_pointer_address;
__u64 read_pointer_address;
__u64 ctx_save_restore_address;
__u32 queue_id;
__u32 gpu_id;
__u32 ring_size;
__u32 queue_type;
__u32 ctx_save_restore_area_size;
__u32 reserved;
};
/*
 * One entry of the device snapshot buffer used with
 * KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT (see
 * kfd_ioctl_dbg_trap_device_snapshot_args).
 */
struct kfd_dbg_device_info_entry {
__u64 exception_status;
__u64 lds_base;
__u64 lds_limit;
__u64 scratch_base;
__u64 scratch_limit;
__u64 gpuvm_base;
__u64 gpuvm_limit;
__u32 gpu_id;
__u32 location_id;
__u32 vendor_id;
__u32 device_id;
__u32 revision_id;
__u32 subsystem_vendor_id;
__u32 subsystem_device_id;
__u32 fw_version;
__u32 gfx_target_version;
__u32 simd_count;
__u32 max_waves_per_simd;
__u32 array_count;
__u32 simd_arrays_per_engine;
__u32 num_xcc;
__u32 capability;
__u32 debug_prop;
};
/* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
#define KFD_IOC_CACHE_POLICY_COHERENT 0
#define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
/*
 * Arguments for setting the memory policy of a GPU: a default cache policy
 * plus an alternate policy for the aperture described by
 * alternate_aperture_base/size. All fields are inputs.
 */
struct kfd_ioctl_set_memory_policy_args {
__u64 alternate_aperture_base; /* to KFD */
__u64 alternate_aperture_size; /* to KFD */
__u32 gpu_id; /* to KFD */
__u32 default_policy; /* to KFD, KFD_IOC_CACHE_POLICY_* */
__u32 alternate_policy; /* to KFD, KFD_IOC_CACHE_POLICY_* */
__u32 misc_process_flag; /* to KFD */
};
/*
 * All counters are monotonic. They are used for profiling of compute jobs.
 * The profiling is done by userspace.
 *
 * In case of GPU reset, the counter should not be affected.
 *
 * The caller selects the GPU with gpu_id; KFD fills in the counters.
 */
struct kfd_ioctl_get_clock_counters_args {
__u64 gpu_clock_counter; /* from KFD */
__u64 cpu_clock_counter; /* from KFD */
__u64 system_clock_counter; /* from KFD */
__u64 system_clock_freq; /* from KFD */
__u32 gpu_id; /* to KFD */
__u32 pad; /* explicit padding */
};
/*
 * Aperture ranges (LDS, scratch, GPUVM) of one GPU for the calling process.
 * Filled in by KFD; used as the element type by both get-process-apertures
 * argument structures.
 */
struct kfd_process_device_apertures {
__u64 lds_base; /* from KFD */
__u64 lds_limit; /* from KFD */
__u64 scratch_base; /* from KFD */
__u64 scratch_limit; /* from KFD */
__u64 gpuvm_base; /* from KFD */
__u64 gpuvm_limit; /* from KFD */
__u32 gpu_id; /* from KFD */
__u32 pad; /* explicit padding */
};
/*
 * AMDKFD_IOC_GET_PROCESS_APERTURES is deprecated. Use
 * AMDKFD_IOC_GET_PROCESS_APERTURES_NEW instead, which supports an
 * unlimited number of GPUs.
 */
/* Fixed capacity of the process_apertures[] array in the deprecated call. */
#define NUM_OF_SUPPORTED_GPUS 7
struct kfd_ioctl_get_process_apertures_args {
struct kfd_process_device_apertures
process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */
/* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */
__u32 num_of_nodes;
__u32 pad; /* explicit padding */
};
/*
 * Arguments for the non-deprecated get-process-apertures call: the caller
 * provides the output array and its capacity instead of a fixed-size array.
 */
struct kfd_ioctl_get_process_apertures_new_args {
/* User allocated. Pointer to an array of struct
 * kfd_process_device_apertures, filled in by the kernel
 */
__u64 kfd_process_device_apertures_ptr;
/* to KFD - indicates amount of memory present in
 * kfd_process_device_apertures_ptr
 * from KFD - Number of entries filled by KFD.
 */
__u32 num_of_nodes;
__u32 pad; /* explicit padding */
};
/*
 * Limits for the debug address-watch (AW) and wave-control (WAC) buffers.
 * NOTE(review): presumably a watch-point count and two buffer sizes in bytes
 * for kfd_ioctl_dbg_address_watch_args / kfd_ioctl_dbg_wave_control_args --
 * confirm against the code that checks them.
 */
#define MAX_ALLOWED_NUM_POINTS 100
#define MAX_ALLOWED_AW_BUFF_SIZE 4096
#define MAX_ALLOWED_WAC_BUFF_SIZE 128
/* Arguments for the debug-register call; selects the target GPU. */
struct kfd_ioctl_dbg_register_args {
__u32 gpu_id; /* to KFD */
__u32 pad; /* explicit padding */
};
/* Arguments for the debug-unregister call; selects the target GPU. */
struct kfd_ioctl_dbg_unregister_args {
__u32 gpu_id; /* to KFD */
__u32 pad; /* explicit padding */
};
/*
 * Arguments for the debug address-watch call: a variable-sized payload
 * referenced by content_ptr, for the GPU selected by gpu_id.
 */
struct kfd_ioctl_dbg_address_watch_args {
__u64 content_ptr; /* a pointer to the actual content */
__u32 gpu_id; /* to KFD */
__u32 buf_size_in_bytes; /* including gpu_id and buf_size */
};
/*
 * Arguments for the debug wave-control call: a variable-sized payload
 * referenced by content_ptr, for the GPU selected by gpu_id.
 */
struct kfd_ioctl_dbg_wave_control_args {
__u64 content_ptr; /* a pointer to the actual content */
__u32 gpu_id; /* to KFD */
__u32 buf_size_in_bytes; /* including gpu_id and buf_size */
};
/*
 * Debug event flag.
 * NOTE(review): presumably requests clearing the event status when an event
 * is queried -- its consumer is not visible in this header; confirm.
 */
#define KFD_DBG_EV_FLAG_CLEAR_STATUS 1
/*
 * Queue states for suspend/resume.
 *
 * KFD_IOC_DBG_TRAP_SUSPEND_QUEUES / KFD_IOC_DBG_TRAP_RESUME_QUEUES report a
 * per-queue failure by setting one of these bits in the matching (u32) queue
 * id of the caller's queue id array.
 *
 * The masks are built from an unsigned 1: shifting a signed int into bit 31
 * is undefined in C, and a signed mask would sign-extend to
 * 0xffffffff80000000 when combined with a 64-bit value.
 */
#define KFD_DBG_QUEUE_ERROR_BIT		30
#define KFD_DBG_QUEUE_INVALID_BIT	31
#define KFD_DBG_QUEUE_ERROR_MASK	(1U << KFD_DBG_QUEUE_ERROR_BIT)
#define KFD_DBG_QUEUE_INVALID_MASK	(1U << KFD_DBG_QUEUE_INVALID_BIT)
#define KFD_INVALID_GPUID 0xffffffff
#define KFD_INVALID_QUEUEID 0xffffffff
#define KFD_INVALID_FD 0xffffffff
/*
 * Override modes for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE
 * (see kfd_ioctl_dbg_trap_set_wave_launch_override_args.override_mode).
 */
enum kfd_dbg_trap_override_mode {
KFD_DBG_TRAP_OVERRIDE_OR = 0,
KFD_DBG_TRAP_OVERRIDE_REPLACE = 1
};
/*
 * Trap mask bits, referenced by the enable_mask and support_request_mask
 * fields of kfd_ioctl_dbg_trap_set_wave_launch_override_args. Every
 * enumerator is a single bit, written as a shift so the bit position is
 * visible; the numeric values are unchanged.
 */
enum kfd_dbg_trap_mask {
	KFD_DBG_TRAP_MASK_FP_INVALID = (1 << 0),
	KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL = (1 << 1),
	KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO = (1 << 2),
	KFD_DBG_TRAP_MASK_FP_OVERFLOW = (1 << 3),
	KFD_DBG_TRAP_MASK_FP_UNDERFLOW = (1 << 4),
	KFD_DBG_TRAP_MASK_FP_INEXACT = (1 << 5),
	KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO = (1 << 6),
	KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH = (1 << 7),
	KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION = (1 << 8),
	/* the two wave start/end bits sit at the top of the 32-bit word */
	KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START = (1 << 30),
	KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END = (1 << 31)
};
/*
 * Wave launch modes, used with KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE
 * (see kfd_ioctl_dbg_trap_set_wave_launch_mode_args). Value 2 is not
 * defined here.
 */
enum kfd_dbg_trap_wave_launch_mode {
KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL = 0,
KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT = 1,
KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG = 3
};
/*
 * Address watch modes, used with KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH
 * (see kfd_ioctl_dbg_trap_set_node_address_watch_args.mode).
 */
enum kfd_dbg_trap_address_watch_mode {
KFD_DBG_TRAP_ADDRESS_WATCH_MODE_READ = 0,
KFD_DBG_TRAP_ADDRESS_WATCH_MODE_NONREAD = 1,
KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ATOMIC = 2,
KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ALL = 3
};
/*
 * Additional wave settings, used with KFD_IOC_DBG_TRAP_SET_FLAGS
 * (see kfd_ioctl_dbg_trap_set_flags_args.flags).
 */
enum kfd_dbg_trap_flags {
KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP = 1,
};
/*
 * Debug exception codes, grouped by scope (queue, device, process).
 * KFD_EC_MASK() turns a code into its bit in a 64-bit exception mask
 * (bit number = code - 1), so EC_NONE has no corresponding mask bit.
 */
enum kfd_dbg_trap_exception_code {
EC_NONE = 0,
/* per queue */
EC_QUEUE_WAVE_ABORT = 1,
EC_QUEUE_WAVE_TRAP = 2,
EC_QUEUE_WAVE_MATH_ERROR = 3,
EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION = 4,
EC_QUEUE_WAVE_MEMORY_VIOLATION = 5,
EC_QUEUE_WAVE_APERTURE_VIOLATION = 6,
EC_QUEUE_PACKET_DISPATCH_DIM_INVALID = 16,
EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID = 17,
EC_QUEUE_PACKET_DISPATCH_CODE_INVALID = 18,
EC_QUEUE_PACKET_RESERVED = 19,
EC_QUEUE_PACKET_UNSUPPORTED = 20,
EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
EC_QUEUE_PREEMPTION_ERROR = 30,
EC_QUEUE_NEW = 31,
/* per device */
EC_DEVICE_QUEUE_DELETE = 32,
EC_DEVICE_MEMORY_VIOLATION = 33,
EC_DEVICE_RAS_ERROR = 34,
EC_DEVICE_FATAL_HALT = 35,
EC_DEVICE_NEW = 36,
/* per process */
EC_PROCESS_RUNTIME = 48,
EC_PROCESS_DEVICE_REMOVE = 49,
EC_MAX /* one past the last code above (evaluates to 50) */
};
/* Mask generated by ecode defined in enum above. */
#define KFD_EC_MASK(ecode) (1ULL << (ecode - 1))
/*
 * Masks for exception code type checks below. Each mask is the OR of
 * KFD_EC_MASK() over the exception codes of one scope.
 *
 * NOTE(review): EC_QUEUE_PACKET_RESERVED is defined in the enum but is not
 * part of KFD_EC_MASK_QUEUE -- presumably intentional since it is reserved;
 * confirm before relying on it.
 */
#define KFD_EC_MASK_QUEUE (KFD_EC_MASK(EC_QUEUE_WAVE_ABORT) | \
KFD_EC_MASK(EC_QUEUE_WAVE_TRAP) | \
KFD_EC_MASK(EC_QUEUE_WAVE_MATH_ERROR) | \
KFD_EC_MASK(EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION) | \
KFD_EC_MASK(EC_QUEUE_WAVE_MEMORY_VIOLATION) | \
KFD_EC_MASK(EC_QUEUE_WAVE_APERTURE_VIOLATION) | \
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_DIM_INVALID) | \
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID) | \
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_CODE_INVALID) | \
KFD_EC_MASK(EC_QUEUE_PACKET_UNSUPPORTED) | \
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) | \
KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) | \
KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED) | \
KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) | \
KFD_EC_MASK(EC_QUEUE_NEW))
/* All per-device exception codes. */
#define KFD_EC_MASK_DEVICE (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) | \
KFD_EC_MASK(EC_DEVICE_RAS_ERROR) | \
KFD_EC_MASK(EC_DEVICE_FATAL_HALT) | \
KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION) | \
KFD_EC_MASK(EC_DEVICE_NEW))
/* All per-process exception codes. */
#define KFD_EC_MASK_PROCESS (KFD_EC_MASK(EC_PROCESS_RUNTIME) | \
KFD_EC_MASK(EC_PROCESS_DEVICE_REMOVE))
/* Checks for exception code types for KFD search. */
#define KFD_DBG_EC_TYPE_IS_QUEUE(ecode) \
(!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_QUEUE))
#define KFD_DBG_EC_TYPE_IS_DEVICE(ecode) \
(!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_DEVICE))
#define KFD_DBG_EC_TYPE_IS_PROCESS(ecode) \
(!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_PROCESS))
/*
 * Misc. per process flags.
 * NOTE(review): presumably passed in
 * kfd_ioctl_set_memory_policy_args.misc_process_flag -- confirm.
 */
#define ENABLE_MFMA_HIGH_PRECISION (1 << 0)
/* Runtime states, reported in kfd_runtime_info.runtime_state. */
enum kfd_dbg_runtime_state {
DEBUG_RUNTIME_STATE_DISABLED = 0,
DEBUG_RUNTIME_STATE_ENABLED = 1,
DEBUG_RUNTIME_STATE_ENABLED_BUSY = 2,
DEBUG_RUNTIME_STATE_ENABLED_ERROR = 3
};
/*
 * Runtime information saved by KFD and copied to the debugger on
 * KFD_IOC_DBG_TRAP_ENABLE (rinfo_ptr) or when querying EC_PROCESS_RUNTIME
 * with KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO.
 */
struct kfd_runtime_info {
__u64 r_debug; /* see kfd_ioctl_runtime_enable_args.r_debug */
__u32 runtime_state; /* see enum kfd_dbg_runtime_state */
__u32 ttmp_setup; /* NOTE(review): presumably whether trap temporary setup is enabled -- confirm */
};
/* Enable modes for runtime enable (kfd_ioctl_runtime_enable_args.mode_mask) */
#define KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK 1
#define KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK 2
/*
 * Capability bit (bit 31).
 * NOTE(review): presumably used in
 * kfd_ioctl_runtime_enable_args.capabilities_mask -- confirm.
 */
#define KFD_RUNTIME_ENABLE_CAPS_SUPPORTS_CORE_DUMP_MASK 0x80000000
/**
 * kfd_ioctl_runtime_enable_args - Arguments for runtime enable
 *
 * Coordinates debug exception signalling and debug device enablement with runtime.
 *
 * @r_debug - pointer to user struct for sharing information between ROCr and the debugger
 * @mode_mask - mask to set mode
 * KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK - enable runtime for debugging, otherwise disable
 * KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK - enable trap temporary setup (ignore on disable)
 * @capabilities_mask - mask of capabilities
 * NOTE(review): presumably built from KFD_RUNTIME_ENABLE_CAPS_* bits; the
 * direction (IN/OUT) is not documented here -- confirm.
 *
 * Return - 0 on SUCCESS.
 * - EBUSY if runtime enable call already pending.
 * - EEXIST if user queues already active prior to call.
 * If process is debug enabled, runtime enable will enable debug devices and
 * wait for debugger process to send runtime exception EC_PROCESS_RUNTIME
 * to unblock - see kfd_ioctl_dbg_trap_args.
 *
 */
struct kfd_ioctl_runtime_enable_args {
__u64 r_debug;
__u32 mode_mask;
__u32 capabilities_mask;
};
/*
 * Context save area header information.
 *
 * Per the KFD_IOC_DBG_TRAP_SUSPEND_QUEUES documentation, this header is
 * copied into the base of each suspended queue's context save area.
 * Exceptions raised to the runtime are delivered through err_payload_addr
 * (see kfd_ioctl_dbg_trap_send_runtime_event_args).
 */
struct kfd_context_save_area_header {
struct {
__u32 control_stack_offset;
__u32 control_stack_size;
__u32 wave_state_offset;
__u32 wave_state_size;
} wave_state;
__u32 debug_offset;
__u32 debug_size;
__u64 err_payload_addr;
__u32 err_event_id;
__u32 reserved1;
};
/*
 * Debug operations
 *
 * Values for kfd_ioctl_dbg_trap_args.op. Operations marked DBG_HW_OP below
 * are debug hardware operations.
 *
 * For specifics on usage and return values, see documentation per operation
 * below. Otherwise, generic error returns apply:
 * - ESRCH if the process to debug does not exist.
 *
 * - EINVAL (with KFD_IOC_DBG_TRAP_ENABLE exempt) if operation
 * KFD_IOC_DBG_TRAP_ENABLE has not succeeded prior.
 * Also returns this error if GPU hardware scheduling is not supported.
 *
 * - EPERM (with KFD_IOC_DBG_TRAP_DISABLE exempt) if target process is not
 * PTRACE_ATTACHED. KFD_IOC_DBG_TRAP_DISABLE is exempt to allow
 * clean up of debug mode as long as process is debug enabled.
 *
 * - EACCES if any DBG_HW_OP (debug hardware operation) is requested when
 * AMDKFD_IOC_RUNTIME_ENABLE has not succeeded prior.
 *
 * - ENODEV if any GPU does not support debugging on a DBG_HW_OP call.
 *
 * - Other errors may be returned when a DBG_HW_OP occurs while the GPU
 * is in a fatal state.
 *
 */
enum kfd_dbg_trap_operations {
KFD_IOC_DBG_TRAP_ENABLE = 0,
KFD_IOC_DBG_TRAP_DISABLE = 1,
KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT = 2,
KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED = 3,
KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE = 4, /* DBG_HW_OP */
KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE = 5, /* DBG_HW_OP */
KFD_IOC_DBG_TRAP_SUSPEND_QUEUES = 6, /* DBG_HW_OP */
KFD_IOC_DBG_TRAP_RESUME_QUEUES = 7, /* DBG_HW_OP */
KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH = 8, /* DBG_HW_OP */
KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH = 9, /* DBG_HW_OP */
KFD_IOC_DBG_TRAP_SET_FLAGS = 10,
KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT = 11,
KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO = 12,
KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT = 13,
KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT = 14
};
/**
 * kfd_ioctl_dbg_trap_enable_args
 *
 * Arguments for KFD_IOC_DBG_TRAP_ENABLE.
 *
 * Enables debug session for target process. Call @op KFD_IOC_DBG_TRAP_DISABLE in
 * kfd_ioctl_dbg_trap_args to disable debug session.
 *
 * @exception_mask (IN) - exceptions to raise to the debugger
 * @rinfo_ptr (IN) - pointer to runtime info buffer (see kfd_runtime_info)
 * @rinfo_size (IN/OUT) - size of runtime info buffer in bytes
 * @dbg_fd (IN) - fd through which the KFD will notify the debugger of
 * raised exceptions set in exception_mask.
 *
 * Generic errors apply (see kfd_dbg_trap_operations).
 * Return - 0 on SUCCESS.
 * Copies KFD saved kfd_runtime_info to @rinfo_ptr on enable.
 * Size of kfd_runtime_info saved by the KFD is returned in @rinfo_size.
 * - EBADF if KFD cannot get a reference to dbg_fd.
 * - EFAULT if KFD cannot copy runtime info to rinfo_ptr.
 * - EINVAL if target process is already debug enabled.
 *
 */
struct kfd_ioctl_dbg_trap_enable_args {
__u64 exception_mask;
__u64 rinfo_ptr;
__u32 rinfo_size;
__u32 dbg_fd;
};
/**
 * kfd_ioctl_dbg_trap_send_runtime_event_args
 *
 * Arguments for KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT.
 * Raises exceptions to runtime.
 *
 * @exception_mask (IN) - exceptions to raise to runtime
 * @gpu_id (IN) - target device id
 * @queue_id (IN) - target queue id
 *
 * Generic errors apply (see kfd_dbg_trap_operations).
 * Return - 0 on SUCCESS.
 * - ENODEV if gpu_id not found.
 * If exception_mask contains EC_PROCESS_RUNTIME, unblocks pending
 * AMDKFD_IOC_RUNTIME_ENABLE call - see kfd_ioctl_runtime_enable_args.
 * All other exceptions are raised to runtime through err_payload_addr.
 * See kfd_context_save_area_header.
 */
struct kfd_ioctl_dbg_trap_send_runtime_event_args {
__u64 exception_mask;
__u32 gpu_id;
__u32 queue_id;
};
/**
 * kfd_ioctl_dbg_trap_set_exceptions_enabled_args
 *
 * Arguments for KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED
 * Set new exceptions to be raised to the debugger.
 *
 * @exception_mask (IN) - new exceptions to raise to the debugger
 *
 * Generic errors apply (see kfd_dbg_trap_operations).
 * Return - 0 on SUCCESS.
 */
struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args {
__u64 exception_mask;
};
/**
 * kfd_ioctl_dbg_trap_set_wave_launch_override_args
 *
 * Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE
 * Enable HW exceptions to raise trap.
 *
 * @override_mode (IN) - see kfd_dbg_trap_override_mode
 * @enable_mask (IN/OUT) - reference kfd_dbg_trap_mask.
 * IN is the override modes requested to be enabled.
 * OUT is referenced in Return below.
 * @support_request_mask (IN/OUT) - reference kfd_dbg_trap_mask.
 * IN is the override modes requested for support check.
 * OUT is referenced in Return below.
 * @pad - explicit padding
 *
 * Generic errors apply (see kfd_dbg_trap_operations).
 * Return - 0 on SUCCESS.
 * Previous enablement is returned in @enable_mask.
 * Actual override support is returned in @support_request_mask.
 * - EINVAL if override mode is not supported.
 * - EACCES if trap support requested is not actually supported.
 * i.e. enable_mask (IN) is not a subset of support_request_mask (OUT).
 * Otherwise it is considered a generic error (see kfd_dbg_trap_operations).
 */
struct kfd_ioctl_dbg_trap_set_wave_launch_override_args {
__u32 override_mode;
__u32 enable_mask;
__u32 support_request_mask;
__u32 pad;
};
/**
 * kfd_ioctl_dbg_trap_set_wave_launch_mode_args
 *
 * Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE
 * Set wave launch mode.
 *
 * @launch_mode (IN) - see kfd_dbg_trap_wave_launch_mode
 * @pad - explicit padding
 *
 * Generic errors apply (see kfd_dbg_trap_operations).
 * Return - 0 on SUCCESS.
 */
struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args {
__u32 launch_mode;
__u32 pad;
};
/**
 * kfd_ioctl_dbg_trap_suspend_queues_args
 *
 * Arguments for KFD_IOC_DBG_TRAP_SUSPEND_QUEUES
 * Suspend queues.
 *
 * @exception_mask (IN) - raised exceptions to clear
 * @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id)
 * to suspend
 * @num_queues (IN) - number of queues to suspend in @queue_array_ptr
 * @grace_period (IN) - wave time allowance before preemption
 * per 1K GPU clock cycle unit
 *
 * Generic errors apply (see kfd_dbg_trap_operations).
 * Destruction of a suspended queue is blocked until the queue is
 * resumed. This allows the debugger to access queue information and
 * its context save area without running into a race condition on
 * queue destruction.
 * Automatically copies per queue context save area header information
 * into the save area base
 * (see kfd_queue_snapshot_entry and kfd_context_save_area_header).
 *
 * Return - Number of queues suspended on SUCCESS.
 * KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK masked
 * for each queue id in @queue_array_ptr array reports unsuccessful
 * suspend reason.
 * KFD_DBG_QUEUE_ERROR_MASK = HW failure.
 * KFD_DBG_QUEUE_INVALID_MASK = queue does not exist, is new or
 * is being destroyed.
 */
struct kfd_ioctl_dbg_trap_suspend_queues_args {
__u64 exception_mask;
__u64 queue_array_ptr;
__u32 num_queues;
__u32 grace_period;
};
/**
 * kfd_ioctl_dbg_trap_resume_queues_args
 *
 * Arguments for KFD_IOC_DBG_TRAP_RESUME_QUEUES
 * Resume queues.
 *
 * @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id)
 * to resume
 * @num_queues (IN) - number of queues to resume in @queue_array_ptr
 * @pad - explicit padding
 *
 * Generic errors apply (see kfd_dbg_trap_operations).
 * Return - Number of queues resumed on SUCCESS.
 * KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK masked
 * for each queue id in @queue_array_ptr array reports unsuccessful
 * resume reason.
 * KFD_DBG_QUEUE_ERROR_MASK = HW failure.
 * KFD_DBG_QUEUE_INVALID_MASK = queue does not exist.
 */
struct kfd_ioctl_dbg_trap_resume_queues_args {
__u64 queue_array_ptr;
__u32 num_queues;
__u32 pad;
};
/**
 * kfd_ioctl_dbg_trap_set_node_address_watch_args
 *
 * Arguments for KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH
 * Sets address watch for device.
 *
 * @address (IN) - watch address to set
 * @mode (IN) - see kfd_dbg_trap_address_watch_mode
 * @mask (IN) - watch address mask
 * @gpu_id (IN) - target gpu to set watch point
 * @id (OUT) - watch id allocated
 *
 * Generic errors apply (see kfd_dbg_trap_operations).
 * Return - 0 on SUCCESS.
 * Allocated watch ID returned to @id.
 * - ENODEV if gpu_id not found.
 * - ENOMEM if no watch ID can be allocated
 */
struct kfd_ioctl_dbg_trap_set_node_address_watch_args {
__u64 address;
__u32 mode;
__u32 mask;
__u32 gpu_id;
__u32 id;
};
/**
 * kfd_ioctl_dbg_trap_clear_node_address_watch_args
 *
 * Arguments for KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH
 * Clear address watch for device.
 *
 * @gpu_id (IN) - target device to clear watch point
 * @id (IN) - allocated watch id to clear (as returned in @id by
 * KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH)
 *
 * Generic errors apply (see kfd_dbg_trap_operations).
 * Return - 0 on SUCCESS.
 * - ENODEV if gpu_id not found.
 * - EINVAL if watch ID has not been allocated.
 */
struct kfd_ioctl_dbg_trap_clear_node_address_watch_args {
__u32 gpu_id;
__u32 id;
};
/**
 * kfd_ioctl_dbg_trap_set_flags_args
 *
 * Arguments for KFD_IOC_DBG_TRAP_SET_FLAGS
 * Sets flags for wave behaviour.
 *
 * @flags (IN/OUT) - IN = flags to enable, OUT = flags previously enabled
 * (see kfd_dbg_trap_flags)
 * @pad - explicit padding
 *
 * Generic errors apply (see kfd_dbg_trap_operations).
 * Return - 0 on SUCCESS.
 * - EACCES if any debug device does not allow flag options.
 */
struct kfd_ioctl_dbg_trap_set_flags_args {
__u32 flags;
__u32 pad;
};
/**
 * kfd_ioctl_dbg_trap_query_debug_event_args
 *
 * Arguments for KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT
 *
 * Find one or more raised exceptions. This function can return multiple
 * exceptions from a single queue or a single device with one call. To find
 * all raised exceptions, this function must be called repeatedly until it
 * returns -EAGAIN. Returned exceptions can optionally be cleared by
 * setting the corresponding bit in the @exception_mask input parameter.
 * However, clearing an exception prevents retrieving further information
 * about it with KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO.
 *
 * @exception_mask (IN/OUT) - exception to clear (IN) and raised (OUT)
 * @gpu_id (OUT) - gpu id of exceptions raised
 * @queue_id (OUT) - queue id of exceptions raised
 *
 * Generic errors apply (see kfd_dbg_trap_operations).
 * Return - 0 on raised exception found
 * Raised exceptions found are returned in @exception_mask
 * with reported source id returned in @gpu_id or @queue_id.
 * - EAGAIN if no raised exception has been found
 */
struct kfd_ioctl_dbg_trap_query_debug_event_args {
__u64 exception_mask;
__u32 gpu_id;
__u32 queue_id;
};
/**
 * kfd_ioctl_dbg_trap_query_exception_info_args
 *
 * Arguments for KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO
 * Get additional info on raised exception.
 *
 * @info_ptr (IN) - pointer to exception info buffer to copy to
 * @info_size (IN/OUT) - exception info buffer size (bytes)
 * @source_id (IN) - target gpu or queue id
 * @exception_code (IN) - target exception
 * (see kfd_dbg_trap_exception_code)
 * @clear_exception (IN) - clear raised @exception_code exception
 * (0 = false, 1 = true)
 *
 * Generic errors apply (see kfd_dbg_trap_operations).
 * Return - 0 on SUCCESS.
 * If @exception_code is EC_DEVICE_MEMORY_VIOLATION, copy @info_size(OUT)
 * bytes of memory exception data to @info_ptr.
 * If @exception_code is EC_PROCESS_RUNTIME, copy saved
 * kfd_runtime_info to @info_ptr.
 * Actual required @info_ptr size (bytes) is returned in @info_size.
 */
struct kfd_ioctl_dbg_trap_query_exception_info_args {
__u64 info_ptr;
__u32 info_size;
__u32 source_id;
__u32 exception_code;
__u32 clear_exception;
};
/**
* kfd_ioctl_dbg_trap_get_queue_snapshot_args
*
* Arguments KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT
* Get queue information.
*
* @exception_mask (IN) - exceptions raised to clear
* @snapshot_buf_ptr (IN) - queue snapshot entry buffer (see kfd_queue_snapshot_entry)
* @num_queues (IN/OUT) - number of queue snapshot entries
* The debugger specifies the size of the array allocated in @num_queues.
* KFD returns the number of queues that actually existed. If this is
* larger than the size specified by the debugger, KFD will not overflow
* the array allocated by the debugger.
*
* @entry_size (IN/OUT) - size per entry in bytes
* The debugger specifies sizeof(struct kfd_queue_snapshot_entry) in
* @entry_size. KFD returns the number of bytes actually populated per
* entry. The debugger should use the KFD_IOCTL_MINOR_VERSION to determine,
* which fields in struct kfd_queue_snapshot_entry are valid. This allows
* growing the ABI in a backwards compatible manner.
* Note that entry_size(IN) should still be used to stride the snapshot buffer in the
* event that it's larger than actual kfd_queue_snapshot_entry.
*
* Generic errors apply (see kfd_dbg_trap_operations).
* Return - 0 on SUCCESS.
* Copies @num_queues(IN) queue snapshot entries of size @entry_size(IN)
* into @snapshot_buf_ptr if @num_queues(IN) > 0.
* Otherwise return @num_queues(OUT) queue snapshot entries that exist.
*/
struct kfd_ioctl_dbg_trap_queue_snapshot_args {
__u64 exception_mask; /* (IN) exceptions raised to clear */
__u64 snapshot_buf_ptr; /* (IN) buffer of kfd_queue_snapshot_entry records */
__u32 num_queues; /* (IN) entries allocated by caller, (OUT) queues that exist */
__u32 entry_size; /* (IN) caller's per-entry size, (OUT) bytes populated per entry */
};
/**
* kfd_ioctl_dbg_trap_device_snapshot_args
*
* Arguments for KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT
* Get device information.
*
* @exception_mask (IN) - exceptions raised to clear
* @snapshot_buf_ptr (IN) - pointer to snapshot buffer (see kfd_dbg_device_info_entry)
* @num_devices (IN/OUT) - number of debug devices to snapshot
* The debugger specifies the size of the array allocated in @num_devices.
* KFD returns the number of devices that actually existed. If this is
* larger than the size specified by the debugger, KFD will not overflow
* the array allocated by the debugger.
*
* @entry_size (IN/OUT) - size per entry in bytes
* The debugger specifies sizeof(struct kfd_dbg_device_info_entry) in
* @entry_size. KFD returns the number of bytes actually populated. The
* debugger should use KFD_IOCTL_MINOR_VERSION to determine, which fields
* in struct kfd_dbg_device_info_entry are valid. This allows growing the
* ABI in a backwards compatible manner.
* Note that entry_size(IN) should still be used to stride the snapshot buffer in the
* event that it's larger than actual kfd_dbg_device_info_entry.
*
* Generic errors apply (see kfd_dbg_trap_operations).
* Return - 0 on SUCCESS.
* Copies @num_devices(IN) device snapshot entries of size @entry_size(IN)
* into @snapshot_buf_ptr if @num_devices(IN) > 0.
* Otherwise return @num_devices(OUT) device snapshot entries that exist.
*/
struct kfd_ioctl_dbg_trap_device_snapshot_args {
__u64 exception_mask; /* (IN) exceptions raised to clear */
__u64 snapshot_buf_ptr; /* (IN) buffer of kfd_dbg_device_info_entry records */
__u32 num_devices; /* (IN) entries allocated by caller, (OUT) devices that exist */
__u32 entry_size; /* (IN) caller's per-entry size, (OUT) bytes populated */
};
/**
* kfd_ioctl_dbg_trap_args
*
* Arguments to debug target process.
*
* @pid - target process to debug
* @op - debug operation (see kfd_dbg_trap_operations)
*
* @op determines which union struct args to use.
* Refer to kern docs for each kfd_ioctl_dbg_trap_*_args struct.
*/
struct kfd_ioctl_dbg_trap_args {
__u32 pid; /* target process to debug */
__u32 op; /* debug operation (see kfd_dbg_trap_operations) */
/* Per-operation arguments: @op determines which member is used. */
union {
struct kfd_ioctl_dbg_trap_enable_args enable;
struct kfd_ioctl_dbg_trap_send_runtime_event_args send_runtime_event;
struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args set_exceptions_enabled;
struct kfd_ioctl_dbg_trap_set_wave_launch_override_args launch_override;
struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args launch_mode;
struct kfd_ioctl_dbg_trap_suspend_queues_args suspend_queues;
struct kfd_ioctl_dbg_trap_resume_queues_args resume_queues;
struct kfd_ioctl_dbg_trap_set_node_address_watch_args set_node_address_watch;
struct kfd_ioctl_dbg_trap_clear_node_address_watch_args clear_node_address_watch;
struct kfd_ioctl_dbg_trap_set_flags_args set_flags;
struct kfd_ioctl_dbg_trap_query_debug_event_args query_debug_event;
struct kfd_ioctl_dbg_trap_query_exception_info_args query_exception_info;
struct kfd_ioctl_dbg_trap_queue_snapshot_args queue_snapshot;
struct kfd_ioctl_dbg_trap_device_snapshot_args device_snapshot;
};
};
/* Event type codes. Values match HSA_EVENTTYPE. */
#define KFD_IOC_EVENT_SIGNAL 0
#define KFD_IOC_EVENT_NODECHANGE 1
#define KFD_IOC_EVENT_DEVICESTATECHANGE 2
#define KFD_IOC_EVENT_HW_EXCEPTION 3
#define KFD_IOC_EVENT_SYSTEM_EVENT 4
#define KFD_IOC_EVENT_DEBUG_EVENT 5
#define KFD_IOC_EVENT_PROFILE_EVENT 6
#define KFD_IOC_EVENT_QUEUE_EVENT 7
#define KFD_IOC_EVENT_MEMORY 8
/* NOTE(review): presumably the values of kfd_ioctl_wait_events_args.wait_result - confirm */
#define KFD_IOC_WAIT_RESULT_COMPLETE 0
#define KFD_IOC_WAIT_RESULT_TIMEOUT 1
#define KFD_IOC_WAIT_RESULT_FAIL 2
/* NOTE(review): presumably the maximum number of signal events - confirm */
#define KFD_SIGNAL_EVENT_LIMIT 4096
/* For kfd_event_data.hw_exception_data.reset_type. */
#define KFD_HW_EXCEPTION_WHOLE_GPU_RESET 0
#define KFD_HW_EXCEPTION_PER_ENGINE_RESET 1
/* For kfd_event_data.hw_exception_data.reset_cause. */
#define KFD_HW_EXCEPTION_GPU_HANG 0
#define KFD_HW_EXCEPTION_ECC 1
/* For kfd_hsa_memory_exception_data.ErrorType */
#define KFD_MEM_ERR_NO_RAS 0
#define KFD_MEM_ERR_SRAM_ECC 1
#define KFD_MEM_ERR_POISON_CONSUMED 2
#define KFD_MEM_ERR_GPU_HANG 3
/* Argument block for AMDKFD_IOC_CREATE_EVENT. */
struct kfd_ioctl_create_event_args {
__u64 event_page_offset; /* from KFD */
__u32 event_trigger_data; /* from KFD - signal events only */
__u32 event_type; /* to KFD - presumably a KFD_IOC_EVENT_* value; confirm */
__u32 auto_reset; /* to KFD */
__u32 node_id; /* to KFD - only valid for certain
event types */
__u32 event_id; /* from KFD */
__u32 event_slot_index; /* from KFD */
};
/* Argument block for AMDKFD_IOC_DESTROY_EVENT. */
struct kfd_ioctl_destroy_event_args {
__u32 event_id; /* to KFD */
__u32 pad;
};
/* Argument block for AMDKFD_IOC_SET_EVENT. */
struct kfd_ioctl_set_event_args {
__u32 event_id; /* to KFD */
__u32 pad;
};
/* Argument block for AMDKFD_IOC_RESET_EVENT. */
struct kfd_ioctl_reset_event_args {
__u32 event_id; /* to KFD */
__u32 pad;
};
/* Failure details embedded as the `failure` member of
 * struct kfd_hsa_memory_exception_data.
 */
struct kfd_memory_exception_failure {
__u32 NotPresent; /* Page not present or supervisor privilege */
__u32 ReadOnly; /* Write access to a read-only page */
__u32 NoExecute; /* Execute access to a page marked NX */
__u32 imprecise; /* Can't determine the exact fault address */
};
/* memory exception data */
struct kfd_hsa_memory_exception_data {
struct kfd_memory_exception_failure failure; /* failure details */
__u64 va; /* NOTE(review): presumably the faulting virtual address - confirm */
__u32 gpu_id;
__u32 ErrorType; /* 0 = no RAS error,
* 1 = ECC_SRAM,
* 2 = Link_SYNFLOOD (poison),
* 3 = GPU hang (not attributable to a specific cause),
* other values reserved
*/
/* The ErrorType values above are also available as the KFD_MEM_ERR_* macros. */
};
/* hw exception data */
struct kfd_hsa_hw_exception_data {
__u32 reset_type; /* KFD_HW_EXCEPTION_WHOLE_GPU_RESET or KFD_HW_EXCEPTION_PER_ENGINE_RESET */
__u32 reset_cause; /* KFD_HW_EXCEPTION_GPU_HANG or KFD_HW_EXCEPTION_ECC */
__u32 memory_lost;
__u32 gpu_id;
};
/* hsa signal event data */
/* Signal-event member of the union in struct kfd_event_data. */
struct kfd_hsa_signal_event_data {
__u64 last_event_age; /* to and from KFD */
};
/* Event data */
/* One entry of the array that kfd_ioctl_wait_events_args.events_ptr points to. */
struct kfd_event_data {
/* Event-type-specific payload. */
union {
/* From KFD */
struct kfd_hsa_memory_exception_data memory_exception_data;
struct kfd_hsa_hw_exception_data hw_exception_data;
/* To and From KFD */
struct kfd_hsa_signal_event_data signal_event_data;
};
__u64 kfd_event_data_ext; /* pointer to an extension structure
for future exception types */
__u32 event_id; /* to KFD */
__u32 pad;
};
/* Argument block for AMDKFD_IOC_WAIT_EVENTS. */
struct kfd_ioctl_wait_events_args {
__u64 events_ptr; /* pointed to struct
kfd_event_data array, to KFD */
__u32 num_events; /* to KFD */
__u32 wait_for_all; /* to KFD */
__u32 timeout; /* to KFD */
__u32 wait_result; /* from KFD - presumably a KFD_IOC_WAIT_RESULT_* value; confirm */
};
/* Argument block for AMDKFD_IOC_SET_SCRATCH_BACKING_VA. */
struct kfd_ioctl_set_scratch_backing_va_args {
__u64 va_addr; /* to KFD */
__u32 gpu_id; /* to KFD */
__u32 pad;
};
/* Argument block for AMDKFD_IOC_GET_TILE_CONFIG. */
struct kfd_ioctl_get_tile_config_args {
/* to KFD: pointer to tile array */
__u64 tile_config_ptr;
/* to KFD: pointer to macro tile array */
__u64 macro_tile_config_ptr;
/* to KFD: array size allocated by user mode
* from KFD: array size filled by kernel
*/
__u32 num_tile_configs;
/* to KFD: array size allocated by user mode
* from KFD: array size filled by kernel
*/
__u32 num_macro_tile_configs;
__u32 gpu_id; /* to KFD */
__u32 gb_addr_config; /* from KFD */
__u32 num_banks; /* from KFD */
__u32 num_ranks; /* from KFD */
/* struct size can be extended later if needed
* without breaking ABI compatibility
*/
};
/* Argument block for AMDKFD_IOC_SET_TRAP_HANDLER. */
struct kfd_ioctl_set_trap_handler_args {
__u64 tba_addr; /* to KFD */
__u64 tma_addr; /* to KFD */
__u32 gpu_id; /* to KFD */
__u32 pad;
};
/* Argument block for AMDKFD_IOC_ACQUIRE_VM. */
struct kfd_ioctl_acquire_vm_args {
__u32 drm_fd; /* to KFD */
__u32 gpu_id; /* to KFD */
};
/* Allocation flags: memory types */
#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM (1 << 0)
#define KFD_IOC_ALLOC_MEM_FLAGS_GTT (1 << 1)
#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 2)
#define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 3)
#define KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP (1 << 4)
/* Allocation flags: attributes/access options
 *
 * Bit 31 is built from an unsigned literal. Shifting a signed 1 into the
 * sign bit of a 32-bit int is undefined behaviour in C, and in practice
 * produces a negative value that sign-extends to 0xffffffff80000000 when
 * widened to 64 bits. The 32-bit pattern stored in a __u32 flags field is
 * unchanged (0x80000000).
 */
#define KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE (1U << 31)
#define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE (1 << 30)
#define KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC (1 << 29)
#define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28)
#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27)
#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26)
#define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25)
#define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24)
#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23)
/* Allocate memory for later SVM (shared virtual memory) mapping.
*
* @va_addr: virtual address of the memory to be allocated
* all later mappings on all GPUs will use this address
* @size: size in bytes
* @handle: buffer handle returned to user mode, used to refer to
* this allocation for mapping, unmapping and freeing
* @mmap_offset: for CPU-mapping the allocation by mmapping a render node
* for userptrs this is overloaded to specify the CPU address
* @gpu_id: device identifier
* @flags: memory type and attributes. See KFD_IOC_ALLOC_MEM_FLAGS above
*/
struct kfd_ioctl_alloc_memory_of_gpu_args {
/* Argument block for AMDKFD_IOC_ALLOC_MEMORY_OF_GPU. */
__u64 va_addr; /* to KFD */
__u64 size; /* to KFD */
__u64 handle; /* from KFD */
__u64 mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */
__u32 gpu_id; /* to KFD */
__u32 flags; /* memory type and attributes, see KFD_IOC_ALLOC_MEM_FLAGS_* */
};
/* Free memory allocated with kfd_ioctl_alloc_memory_of_gpu
*
* @handle: memory handle returned by alloc
*/
struct kfd_ioctl_free_memory_of_gpu_args {
/* Argument block for AMDKFD_IOC_FREE_MEMORY_OF_GPU. */
__u64 handle; /* to KFD */
};
/* Inquire available memory with kfd_ioctl_get_available_memory
*
* @available: memory available for alloc
*/
struct kfd_ioctl_get_available_memory_args {
/* Argument block for AMDKFD_IOC_AVAILABLE_MEMORY. */
__u64 available; /* from KFD */
__u32 gpu_id; /* to KFD */
__u32 pad;
};
/* Map memory to one or more GPUs
*
* @handle: memory handle returned by alloc
* @device_ids_array_ptr: array of gpu_ids (__u32 per device)
* @n_devices: number of devices in the array
* @n_success: number of devices mapped successfully
*
* @n_success returns information to the caller how many devices from
* the start of the array have mapped the buffer successfully. It can
* be passed into a subsequent retry call to skip those devices. For
* the first call the caller should initialize it to 0.
*
* If the ioctl completes with return code 0 (success), n_success ==
* n_devices.
*/
struct kfd_ioctl_map_memory_to_gpu_args {
/* Argument block for AMDKFD_IOC_MAP_MEMORY_TO_GPU. */
__u64 handle; /* to KFD */
__u64 device_ids_array_ptr; /* to KFD */
__u32 n_devices; /* to KFD */
__u32 n_success; /* to/from KFD */
};
/* Unmap memory from one or more GPUs
*
* same arguments as for mapping
*/
struct kfd_ioctl_unmap_memory_from_gpu_args {
/* Argument block for AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU. */
__u64 handle; /* to KFD */
__u64 device_ids_array_ptr; /* to KFD */
__u32 n_devices; /* to KFD */
__u32 n_success; /* to/from KFD */
};
/* Allocate GWS for specific queue
*
* @queue_id: queue's id that GWS is allocated for
* @num_gws: how many GWS to allocate
* @first_gws: index of the first GWS allocated.
* only support contiguous GWS allocation
*/
struct kfd_ioctl_alloc_queue_gws_args {
/* Argument block for AMDKFD_IOC_ALLOC_QUEUE_GWS. */
__u32 queue_id; /* to KFD */
__u32 num_gws; /* to KFD */
__u32 first_gws; /* from KFD */
__u32 pad;
};
/* Argument block for AMDKFD_IOC_GET_DMABUF_INFO. */
struct kfd_ioctl_get_dmabuf_info_args {
__u64 size; /* from KFD */
__u64 metadata_ptr; /* to KFD */
__u32 metadata_size; /* to KFD (space allocated by user)
* from KFD (actual metadata size)
*/
__u32 gpu_id; /* from KFD */
__u32 flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */
__u32 dmabuf_fd; /* to KFD */
};
/* Argument block for AMDKFD_IOC_IMPORT_DMABUF. */
struct kfd_ioctl_import_dmabuf_args {
__u64 va_addr; /* to KFD */
__u64 handle; /* from KFD */
__u32 gpu_id; /* to KFD */
__u32 dmabuf_fd; /* to KFD */
};
/* Argument block for AMDKFD_IOC_EXPORT_DMABUF. */
struct kfd_ioctl_export_dmabuf_args {
__u64 handle; /* to KFD */
__u32 flags; /* to KFD */
__u32 dmabuf_fd; /* from KFD */
};
/*
* KFD SMI(System Management Interface) events
*/
/* Event indices start at 1; KFD_SMI_EVENT_MASK_FROM_INDEX(i) maps index i
 * to bit (i - 1) of a 64-bit mask.
 */
enum kfd_smi_event {
KFD_SMI_EVENT_NONE = 0, /* not used */
KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */
KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
KFD_SMI_EVENT_GPU_PRE_RESET = 3,
KFD_SMI_EVENT_GPU_POST_RESET = 4,
};
/* Bit mask for SMI event index i: bit (i - 1) of an unsigned long long.
 * i must be at least 1; KFD_SMI_EVENT_NONE (0) would shift by -1.
 */
#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
/* NOTE(review): presumably the size in bytes of one SMI event message - confirm */
#define KFD_SMI_EVENT_MSG_SIZE 96
/* Argument block for AMDKFD_IOC_SMI_EVENTS. */
struct kfd_ioctl_smi_events_args {
__u32 gpuid; /* to KFD */
__u32 anon_fd; /* from KFD */
};
/**
* kfd_ioctl_spm_op - SPM ioctl operations
*
* @KFD_IOCTL_SPM_OP_ACQUIRE: acquire exclusive access to SPM
* @KFD_IOCTL_SPM_OP_RELEASE: release exclusive access to SPM
* @KFD_IOCTL_SPM_OP_SET_DEST_BUF: set or unset destination buffer for SPM streaming
*/
enum kfd_ioctl_spm_op {
KFD_IOCTL_SPM_OP_ACQUIRE, /* acquire exclusive access to SPM */
KFD_IOCTL_SPM_OP_RELEASE, /* release exclusive access to SPM */
KFD_IOCTL_SPM_OP_SET_DEST_BUF /* set or unset the streaming destination buffer */
};
/**
* kfd_ioctl_spm_args - Arguments for SPM ioctl
*
* @op[in]: specifies the operation to perform
* @gpu_id[in]: GPU ID of the GPU to profile
* @dest_buf[in]: used for the address of the destination buffer
* in @KFD_IOCTL_SPM_OP_SET_DEST_BUF
* @buf_size[in]: size of the destination buffer
* @timeout[in/out]: [in]: timeout in milliseconds, [out]: amount of time left
* in the timeout window
* @bytes_copied[out]: total amount of data that was copied to the previous dest_buf
* @has_data_loss: total count for sub-block which has data loss
*
* This ioctl performs different functions depending on the @op parameter.
*
* KFD_IOCTL_SPM_OP_ACQUIRE
* ------------------------
*
* Acquires exclusive access of SPM on the specified @gpu_id for the calling process.
* This must be called before using KFD_IOCTL_SPM_OP_SET_DEST_BUF.
*
* KFD_IOCTL_SPM_OP_RELEASE
* ------------------------
*
* Releases exclusive access of SPM on the specified @gpu_id for the calling process,
* which allows another process to acquire it in the future.
*
* KFD_IOCTL_SPM_OP_SET_DEST_BUF
* -----------------------------
*
* If @dest_buf is NULL, the destination buffer address is unset and copying of counters
* is stopped.
*
* If @dest_buf is not NULL, it specifies the pointer to a new destination buffer.
* @buf_size specifies the size of the buffer.
*
* If @timeout is non-0, the call will wait for up to @timeout ms for the previous
* buffer to be filled. If the previous buffer is filled before the timeout expires,
* @timeout is updated with the time remaining. If the timeout is exceeded, the function
* copies any partial data available into the previous user buffer and returns success.
* The amount of valid data in the previous user buffer is indicated by @bytes_copied.
*
* If @timeout is 0, the function immediately replaces the previous destination buffer
* without waiting for the previous buffer to be filled. That means the previous buffer
* may only be partially filled, and @bytes_copied will indicate how much data has been
* copied to it.
*
* If data was lost, e.g. due to a ring buffer overflow, @has_data_loss will be non-0.
*
* Returns negative error code on failure, 0 on success.
*/
struct kfd_ioctl_spm_args {
/* Argument block for AMDKFD_IOC_RLC_SPM. */
__u64 dest_buf; /* [in] destination buffer address; NULL unsets it */
__u32 buf_size; /* [in] size of the destination buffer */
__u32 op; /* [in] operation to perform (enum kfd_ioctl_spm_op) */
__u32 timeout; /* [in] timeout in milliseconds, [out] time left */
__u32 gpu_id; /* [in] GPU ID of the GPU to profile */
__u32 bytes_copied; /* [out] data copied to the previous dest_buf */
__u32 has_data_loss; /* non-0 if data was lost, e.g. ring buffer overflow */
};
/**
* kfd_ioctl_spm_buffer_header - SPM Buffer header for kfd_ioctl_spm_args->dest_buf
*
* @version [out]: spm version
* @bytes_copied [out]: amount of data for each sub-block
* @has_data_loss: [out]: boolean indicating whether data was lost for each sub-block
* (e.g. due to a ring-buffer overflow)
*/
struct kfd_ioctl_spm_buffer_header {
__u32 version; /* 0-23: minor 24-31: major */
__u32 bytes_copied; /* [out] amount of data for each sub-block */
__u32 has_data_loss; /* [out] whether data was lost for each sub-block */
__u32 reserved[5];
};
/**************************************************************************************************
* CRIU IOCTLs (Checkpoint Restore In Userspace)
*
* When checkpointing a process, the userspace application will perform:
* 1. PROCESS_INFO op to determine current process information. This pauses execution and evicts
* all the queues.
* 2. CHECKPOINT op to checkpoint process contents (BOs, queues, events, svm-ranges)
* 3. UNPAUSE op to un-evict all the queues
*
* When restoring a process, the CRIU userspace application will perform:
*
* 1. RESTORE op to restore process contents
* 2. RESUME op to start the process
*
* Note: Queues are forced into an evicted state after a successful PROCESS_INFO. User
* application needs to perform an UNPAUSE operation after calling PROCESS_INFO.
*/
enum kfd_criu_op {
KFD_CRIU_OP_PROCESS_INFO, /* query process information; pauses execution and evicts all queues */
KFD_CRIU_OP_CHECKPOINT, /* checkpoint process contents (BOs, queues, events, svm-ranges) */
KFD_CRIU_OP_UNPAUSE, /* un-evict all the queues */
KFD_CRIU_OP_RESTORE, /* restore process contents */
KFD_CRIU_OP_RESUME, /* start the restored process */
};
/**
* kfd_ioctl_criu_args - Arguments to perform a CRIU operation
* @devices: [in/out] User pointer to memory location for devices information.
* This is an array of type kfd_criu_device_bucket.
* @bos: [in/out] User pointer to memory location for BOs information
* This is an array of type kfd_criu_bo_bucket.
* @priv_data: [in/out] User pointer to memory location for private data
* @priv_data_size: [in/out] Size of priv_data in bytes
* @num_devices: [in/out] Number of GPUs used by process. Size of @devices array.
* @num_bos [in/out] Number of BOs used by process. Size of @bos array.
* @num_objects: [in/out] Number of objects used by process. Objects are opaque to
* user application.
* @pid: [in/out] PID of the process being checkpointed
* @op [in] Type of operation (kfd_criu_op)
*
* Return: 0 on success, -errno on failure
*/
struct kfd_ioctl_criu_args {
/* Argument block for AMDKFD_IOC_CRIU_OP. */
__u64 devices; /* Used during ops: CHECKPOINT, RESTORE */
__u64 bos; /* Used during ops: CHECKPOINT, RESTORE */
__u64 priv_data; /* Used during ops: CHECKPOINT, RESTORE */
__u64 priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */
__u32 num_devices; /* Used during ops: PROCESS_INFO, RESTORE */
__u32 num_bos; /* Used during ops: PROCESS_INFO, RESTORE */
__u32 num_objects; /* Used during ops: PROCESS_INFO, RESTORE */
__u32 pid; /* Used during ops: PROCESS_INFO, RESUME */
__u32 op; /* [in] type of operation (enum kfd_criu_op) */
};
/* Element type of the array referenced by kfd_ioctl_criu_args.devices. */
struct kfd_criu_device_bucket {
__u32 user_gpu_id;
__u32 actual_gpu_id;
__u32 drm_fd;
__u32 pad;
};
/* Element type of the array referenced by kfd_ioctl_criu_args.bos. */
struct kfd_criu_bo_bucket {
__u64 addr;
__u64 size;
__u64 offset;
__u64 restored_offset; /* During restore, updated offset for BO */
__u32 gpu_id; /* This is the user_gpu_id */
__u32 alloc_flags; /* NOTE(review): presumably KFD_IOC_ALLOC_MEM_FLAGS_* - confirm */
__u32 dmabuf_fd;
__u32 pad;
};
/* CRIU IOCTLs - END */
/**************************************************************************************************/
/* Register offset inside the remapped mmio page
*/
enum kfd_mmio_remap {
KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL = 0, /* offset 0 in the remapped mmio page */
KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL = 4, /* offset 4 in the remapped mmio page */
};
/* Argument block for AMDKFD_IOC_IPC_EXPORT_HANDLE (non-upstream ioctl). */
struct kfd_ioctl_ipc_export_handle_args {
__u64 handle; /* to KFD */
__u32 share_handle[4]; /* from KFD */
__u32 gpu_id; /* to KFD */
__u32 flags; /* to KFD */
};
/* Argument block for AMDKFD_IOC_IPC_IMPORT_HANDLE (non-upstream ioctl). */
struct kfd_ioctl_ipc_import_handle_args {
__u64 handle; /* from KFD */
__u64 va_addr; /* to KFD */
__u64 mmap_offset; /* from KFD */
__u32 share_handle[4]; /* to KFD */
__u32 gpu_id; /* to KFD */
__u32 flags; /* from KFD */
};
/* An address/size pair.
 * NOTE(review): presumably the element type of the src/dst range arrays in
 * struct kfd_ioctl_cross_memory_copy_args - confirm.
 */
struct kfd_memory_range {
__u64 va_addr;
__u64 size;
};
/* flags definitions
* BIT0: 0: read operation, 1: write operation.
* This also identifies if the src or dst array belongs to remote process
*/
#define KFD_CROSS_MEMORY_RW_BIT (1 << 0)
/* The macro argument is parenthesised so that an expression argument such as
 * `a | b` is evaluated as a whole before the bit operation is applied.
 * The SET macros modify their argument in place, so it must be an lvalue.
 */
#define KFD_SET_CROSS_MEMORY_READ(flags) ((flags) &= ~KFD_CROSS_MEMORY_RW_BIT)
#define KFD_SET_CROSS_MEMORY_WRITE(flags) ((flags) |= KFD_CROSS_MEMORY_RW_BIT)
#define KFD_IS_CROSS_MEMORY_WRITE(flags) ((flags) & KFD_CROSS_MEMORY_RW_BIT)
/* Argument block for AMDKFD_IOC_CROSS_MEMORY_COPY (non-upstream ioctl). */
struct kfd_ioctl_cross_memory_copy_args {
/* to KFD: Process ID of the remote process */
__u32 pid;
/* to KFD: See above definition */
__u32 flags;
/* to KFD: Source GPU VM range */
__u64 src_mem_range_array;
/* to KFD: Size of above array */
__u64 src_mem_array_size;
/* to KFD: Destination GPU VM range */
__u64 dst_mem_range_array;
/* to KFD: Size of above array */
__u64 dst_mem_array_size;
/* from KFD: Total amount of bytes copied */
__u64 bytes_copied;
};
/* SVM range flags, set and cleared through KFD_IOCTL_SVM_ATTR_SET_FLAGS and
 * KFD_IOCTL_SVM_ATTR_CLR_FLAGS.
 */
/* Guarantee host access to memory */
#define KFD_IOCTL_SVM_FLAG_HOST_ACCESS 0x00000001
/* Fine grained coherency between all devices with access */
#define KFD_IOCTL_SVM_FLAG_COHERENT 0x00000002
/* Use any GPU in same hive as preferred device */
#define KFD_IOCTL_SVM_FLAG_HIVE_LOCAL 0x00000004
/* GPUs only read, allows replication */
#define KFD_IOCTL_SVM_FLAG_GPU_RO 0x00000008
/* Allow execution on GPU */
#define KFD_IOCTL_SVM_FLAG_GPU_EXEC 0x00000010
/* GPUs mostly read, may allow similar optimizations as RO, but writes fault */
#define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020
/* Keep GPU memory mapping always valid as if XNACK is disabled */
#define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED 0x00000040
/* Fine grained coherency between all devices using device-scope atomics */
#define KFD_IOCTL_SVM_FLAG_EXT_COHERENT 0x00000080
/**
* kfd_ioctl_svm_op - SVM ioctl operations
*
* @KFD_IOCTL_SVM_OP_SET_ATTR: Modify one or more attributes
* @KFD_IOCTL_SVM_OP_GET_ATTR: Query one or more attributes
*/
enum kfd_ioctl_svm_op {
KFD_IOCTL_SVM_OP_SET_ATTR, /* modify one or more attributes */
KFD_IOCTL_SVM_OP_GET_ATTR /* query one or more attributes */
};
/** kfd_ioctl_svm_location - Enum for preferred and prefetch locations
*
* GPU IDs are used to specify GPUs as preferred and prefetch locations.
* Below definitions are used for system memory or for leaving the preferred
* location unspecified.
*/
enum kfd_ioctl_svm_location {
KFD_IOCTL_SVM_LOCATION_SYSMEM = 0, /* system memory */
KFD_IOCTL_SVM_LOCATION_UNDEFINED = 0xffffffff /* location left unspecified */
};
/**
* kfd_ioctl_svm_attr_type - SVM attribute types
*
* @KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: gpuid of the preferred location, 0 for
* system memory
* @KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: gpuid of the prefetch location, 0 for
* system memory. Setting this triggers an
* immediate prefetch (migration).
* @KFD_IOCTL_SVM_ATTR_ACCESS:
* @KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
* @KFD_IOCTL_SVM_ATTR_NO_ACCESS: specify memory access for the gpuid given
* by the attribute value
* @KFD_IOCTL_SVM_ATTR_SET_FLAGS: bitmask of flags to set (see
* KFD_IOCTL_SVM_FLAG_...)
* @KFD_IOCTL_SVM_ATTR_CLR_FLAGS: bitmask of flags to clear
* @KFD_IOCTL_SVM_ATTR_GRANULARITY: migration granularity
* (log2 num pages)
*/
enum kfd_ioctl_svm_attr_type {
KFD_IOCTL_SVM_ATTR_PREFERRED_LOC, /* gpuid of the preferred location, 0 for system memory */
KFD_IOCTL_SVM_ATTR_PREFETCH_LOC, /* gpuid of the prefetch location; setting it triggers a prefetch */
/* The next three specify memory access for the gpuid given by the attribute value. */
KFD_IOCTL_SVM_ATTR_ACCESS,
KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE,
KFD_IOCTL_SVM_ATTR_NO_ACCESS,
KFD_IOCTL_SVM_ATTR_SET_FLAGS, /* bitmask of KFD_IOCTL_SVM_FLAG_* to set */
KFD_IOCTL_SVM_ATTR_CLR_FLAGS, /* bitmask of flags to clear */
KFD_IOCTL_SVM_ATTR_GRANULARITY /* migration granularity (log2 num pages) */
};
/**
* kfd_ioctl_svm_attribute - Attributes as pairs of type and value
*
* The meaning of the @value depends on the attribute type.
*
* @type: attribute type (see enum @kfd_ioctl_svm_attr_type)
* @value: attribute value
*/
struct kfd_ioctl_svm_attribute {
__u32 type; /* attribute type (enum kfd_ioctl_svm_attr_type) */
__u32 value; /* attribute value; meaning depends on @type */
};
/**
* kfd_ioctl_svm_args - Arguments for SVM ioctl
*
* @op specifies the operation to perform (see enum
* @kfd_ioctl_svm_op). @start_addr and @size are common for all
* operations.
*
* A variable number of attributes can be given in @attrs.
* @nattr specifies the number of attributes. New attributes can be
* added in the future without breaking the ABI. If unknown attributes
* are given, the function returns -EINVAL.
*
* @KFD_IOCTL_SVM_OP_SET_ATTR sets attributes for a virtual address
* range. It may overlap existing virtual address ranges. If it does,
* the existing ranges will be split such that the attribute changes
* only apply to the specified address range.
*
* @KFD_IOCTL_SVM_OP_GET_ATTR returns the intersection of attributes
* over all memory in the given range and returns the result as the
* attribute value. If different pages have different preferred or
* prefetch locations, 0xffffffff will be returned for
* @KFD_IOCTL_SVM_ATTR_PREFERRED_LOC or
* @KFD_IOCTL_SVM_ATTR_PREFETCH_LOC respectively. For
* @KFD_IOCTL_SVM_ATTR_SET_FLAGS, flags of all pages will be
* aggregated by bitwise AND. That means, a flag will be set in the
* output, if that flag is set for all pages in the range. For
* @KFD_IOCTL_SVM_ATTR_CLR_FLAGS, flags of all pages will be
* aggregated by bitwise NOR. That means, a flag will be set in the
* output, if that flag is clear for all pages in the range.
* The minimum migration granularity throughout the range will be
* returned for @KFD_IOCTL_SVM_ATTR_GRANULARITY.
*
* Querying of accessibility attributes works by initializing the
* attribute type to @KFD_IOCTL_SVM_ATTR_ACCESS and the value to the
* GPUID being queried. Multiple attributes can be given to allow
* querying multiple GPUIDs. The ioctl function overwrites the
* attribute type to indicate the access for the specified GPU.
*/
struct kfd_ioctl_svm_args {
/* Argument block for AMDKFD_IOC_SVM. */
__u64 start_addr; /* start of the address range, common to all operations */
__u64 size; /* size of the address range, common to all operations */
__u32 op; /* operation to perform (enum kfd_ioctl_svm_op) */
__u32 nattr; /* number of entries in @attrs */
/* Variable length array of attributes */
struct kfd_ioctl_svm_attribute attrs[];
};
/**
* kfd_ioctl_set_xnack_mode_args - Arguments for set_xnack_mode
*
* @xnack_enabled: [in/out] Whether to enable XNACK mode for this process
*
* @xnack_enabled indicates whether recoverable page faults should be
* enabled for the current process. 0 means disabled, positive means
* enabled, negative means leave unchanged. If enabled, virtual address
* translations on GFXv9 and later AMD GPUs can return XNACK and retry
* the access until a valid PTE is available. This is used to implement
* device page faults.
*
* On output, @xnack_enabled returns the (new) current mode (0 or
* positive). Therefore, a negative input value can be used to query
* the current mode without changing it.
*
* The XNACK mode fundamentally changes the way SVM managed memory works
* in the driver, with subtle effects on application performance and
* functionality.
*
* Enabling XNACK mode requires shader programs to be compiled
* differently. Furthermore, not all GPUs support changing the mode
* per-process. Therefore changing the mode is only allowed while no
* user mode queues exist in the process. This ensures that no shader
* code is running that may be compiled for the wrong mode. And GPUs
* that cannot change to the requested mode will prevent the XNACK
* mode from occurring. All GPUs used by the process must be in the
* same XNACK mode.
*
* GFXv8 or older GPUs do not support 48 bit virtual addresses or SVM.
* Therefore those GPUs are not considered for the XNACK mode switch.
*
* Return: 0 on success, -errno on failure
*/
struct kfd_ioctl_set_xnack_mode_args {
/* Argument block for AMDKFD_IOC_SET_XNACK_MODE. */
__s32 xnack_enabled; /* [in] 0 = disable, positive = enable, negative = leave unchanged;
* [out] the (new) current mode, 0 or positive
*/
};
/**
* kfd_ioctl_pc_sample_op - PC Sampling ioctl operations
*
* @KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES: Query device PC Sampling capabilities
* @KFD_IOCTL_PCS_OP_CREATE: Register this process with a per-device PC sampler instance
* @KFD_IOCTL_PCS_OP_DESTROY: Unregister from a previously registered PC sampler instance
* @KFD_IOCTL_PCS_OP_START: Process begins taking samples from a previously registered PC sampler instance
* @KFD_IOCTL_PCS_OP_STOP: Process stops taking samples from a previously registered PC sampler instance
*/
enum kfd_ioctl_pc_sample_op {
KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES, /* query device PC Sampling capabilities */
KFD_IOCTL_PCS_OP_CREATE, /* register with a per-device PC sampler instance */
KFD_IOCTL_PCS_OP_DESTROY, /* unregister from a registered PC sampler instance */
KFD_IOCTL_PCS_OP_START, /* begin taking samples */
KFD_IOCTL_PCS_OP_STOP, /* stop taking samples */
};
/* Restriction flag reported in kfd_pc_sample_info.flags: values have to be a
 * power of 2.
 */
#define KFD_IOCTL_PCS_FLAG_POWER_OF_2 0x00000001
/* Sampling methods; stored in kfd_pc_sample_info.method. Numbering starts at 1. */
enum kfd_ioctl_pc_sample_method {
KFD_IOCTL_PCS_METHOD_HOSTTRAP = 1,
KFD_IOCTL_PCS_METHOD_STOCHASTIC,
};
/* Unit of the sampling interval; stored in kfd_pc_sample_info.type. */
enum kfd_ioctl_pc_sample_type {
KFD_IOCTL_PCS_TYPE_TIME_US, /* interval in microseconds */
KFD_IOCTL_PCS_TYPE_CLOCK_CYCLES, /* interval in graphics core clk cycles */
KFD_IOCTL_PCS_TYPE_INSTRUCTIONS /* interval in instructions issued */
};
/* One entry of the array referenced by kfd_ioctl_pc_sample_args.sample_info_ptr. */
struct kfd_pc_sample_info {
__u64 interval; /* [IN] if KFD_IOCTL_PCS_TYPE_TIME_US: sample interval in us
* if KFD_IOCTL_PCS_TYPE_CLOCK_CYCLES: sample interval in graphics core clk cycles
* if KFD_IOCTL_PCS_TYPE_INSTRUCTIONS: sample interval in instructions issued by
* graphics compute units
*/
__u64 interval_min; /* [OUT] */
__u64 interval_max; /* [OUT] */
__u64 flags; /* [OUT] indicate potential restrictions e.g KFD_IOCTL_PCS_FLAG_POWER_OF_2 */
__u32 method; /* [IN/OUT] kfd_ioctl_pc_sample_method */
__u32 type; /* [IN/OUT] kfd_ioctl_pc_sample_type */
};
/* Query flag for kfd_ioctl_pc_sample_args.flags. */
#define KFD_IOCTL_PCS_QUERY_TYPE_FULL (1 << 0) /* If not set, return current */
/* Argument block for AMDKFD_IOC_PC_SAMPLE (non-upstream ioctl). */
struct kfd_ioctl_pc_sample_args {
__u64 sample_info_ptr; /* array of kfd_pc_sample_info */
__u32 num_sample_info; /* NOTE(review): presumably the entry count of sample_info_ptr - confirm */
__u32 op; /* kfd_ioctl_pc_sample_op */
__u32 gpu_id;
__u32 trace_id;
__u32 flags; /* kfd_ioctl_pcs_query flags */
__u32 reserved;
};
/* All KFD ioctls share the 'K' type code; the wrappers below bind the
 * generic _IO* encoders to that code.
 */
#define AMDKFD_IOCTL_BASE 'K'
#define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr)
#define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type)
#define AMDKFD_IOW(nr, type) _IOW(AMDKFD_IOCTL_BASE, nr, type)
#define AMDKFD_IOWR(nr, type) _IOWR(AMDKFD_IOCTL_BASE, nr, type)

/* Upstream ioctls, command numbers 0x01 - 0x26, one definition per line. */
#define AMDKFD_IOC_GET_VERSION AMDKFD_IOR(0x01, struct kfd_ioctl_get_version_args)
#define AMDKFD_IOC_CREATE_QUEUE AMDKFD_IOWR(0x02, struct kfd_ioctl_create_queue_args)
#define AMDKFD_IOC_DESTROY_QUEUE AMDKFD_IOWR(0x03, struct kfd_ioctl_destroy_queue_args)
#define AMDKFD_IOC_SET_MEMORY_POLICY AMDKFD_IOW(0x04, struct kfd_ioctl_set_memory_policy_args)
#define AMDKFD_IOC_GET_CLOCK_COUNTERS AMDKFD_IOWR(0x05, struct kfd_ioctl_get_clock_counters_args)
#define AMDKFD_IOC_GET_PROCESS_APERTURES AMDKFD_IOR(0x06, struct kfd_ioctl_get_process_apertures_args)
#define AMDKFD_IOC_UPDATE_QUEUE AMDKFD_IOW(0x07, struct kfd_ioctl_update_queue_args)
#define AMDKFD_IOC_CREATE_EVENT AMDKFD_IOWR(0x08, struct kfd_ioctl_create_event_args)
#define AMDKFD_IOC_DESTROY_EVENT AMDKFD_IOW(0x09, struct kfd_ioctl_destroy_event_args)
#define AMDKFD_IOC_SET_EVENT AMDKFD_IOW(0x0A, struct kfd_ioctl_set_event_args)
#define AMDKFD_IOC_RESET_EVENT AMDKFD_IOW(0x0B, struct kfd_ioctl_reset_event_args)
#define AMDKFD_IOC_WAIT_EVENTS AMDKFD_IOWR(0x0C, struct kfd_ioctl_wait_events_args)
#define AMDKFD_IOC_DBG_REGISTER_DEPRECATED AMDKFD_IOW(0x0D, struct kfd_ioctl_dbg_register_args)
#define AMDKFD_IOC_DBG_UNREGISTER_DEPRECATED AMDKFD_IOW(0x0E, struct kfd_ioctl_dbg_unregister_args)
#define AMDKFD_IOC_DBG_ADDRESS_WATCH_DEPRECATED AMDKFD_IOW(0x0F, struct kfd_ioctl_dbg_address_watch_args)
#define AMDKFD_IOC_DBG_WAVE_CONTROL_DEPRECATED AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args)
#define AMDKFD_IOC_SET_SCRATCH_BACKING_VA AMDKFD_IOWR(0x11, struct kfd_ioctl_set_scratch_backing_va_args)
#define AMDKFD_IOC_GET_TILE_CONFIG AMDKFD_IOWR(0x12, struct kfd_ioctl_get_tile_config_args)
#define AMDKFD_IOC_SET_TRAP_HANDLER AMDKFD_IOW(0x13, struct kfd_ioctl_set_trap_handler_args)
#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW AMDKFD_IOWR(0x14, struct kfd_ioctl_get_process_apertures_new_args)
#define AMDKFD_IOC_ACQUIRE_VM AMDKFD_IOW(0x15, struct kfd_ioctl_acquire_vm_args)
#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU AMDKFD_IOWR(0x16, struct kfd_ioctl_alloc_memory_of_gpu_args)
#define AMDKFD_IOC_FREE_MEMORY_OF_GPU AMDKFD_IOW(0x17, struct kfd_ioctl_free_memory_of_gpu_args)
#define AMDKFD_IOC_MAP_MEMORY_TO_GPU AMDKFD_IOWR(0x18, struct kfd_ioctl_map_memory_to_gpu_args)
#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU AMDKFD_IOWR(0x19, struct kfd_ioctl_unmap_memory_from_gpu_args)
#define AMDKFD_IOC_SET_CU_MASK AMDKFD_IOW(0x1A, struct kfd_ioctl_set_cu_mask_args)
#define AMDKFD_IOC_GET_QUEUE_WAVE_STATE AMDKFD_IOWR(0x1B, struct kfd_ioctl_get_queue_wave_state_args)
#define AMDKFD_IOC_GET_DMABUF_INFO AMDKFD_IOWR(0x1C, struct kfd_ioctl_get_dmabuf_info_args)
#define AMDKFD_IOC_IMPORT_DMABUF AMDKFD_IOWR(0x1D, struct kfd_ioctl_import_dmabuf_args)
#define AMDKFD_IOC_ALLOC_QUEUE_GWS AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
#define AMDKFD_IOC_SMI_EVENTS AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
#define AMDKFD_IOC_SVM AMDKFD_IOWR(0x20, struct kfd_ioctl_svm_args)
#define AMDKFD_IOC_SET_XNACK_MODE AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args)
#define AMDKFD_IOC_CRIU_OP AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_args)
#define AMDKFD_IOC_AVAILABLE_MEMORY AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args)
#define AMDKFD_IOC_EXPORT_DMABUF AMDKFD_IOWR(0x24, struct kfd_ioctl_export_dmabuf_args)
#define AMDKFD_IOC_RUNTIME_ENABLE AMDKFD_IOWR(0x25, struct kfd_ioctl_runtime_enable_args)
#define AMDKFD_IOC_DBG_TRAP AMDKFD_IOWR(0x26, struct kfd_ioctl_dbg_trap_args)

/* Upstream command number range: START is the first number defined above,
 * END is one past the last (0x26).
 */
#define AMDKFD_COMMAND_START 0x01
#define AMDKFD_COMMAND_END 0x27
/* Non-upstream ioctls, command numbers 0x80 - 0x85 (0x82 is not defined here). */
#define AMDKFD_IOC_IPC_IMPORT_HANDLE AMDKFD_IOWR(0x80, struct kfd_ioctl_ipc_import_handle_args)
#define AMDKFD_IOC_IPC_EXPORT_HANDLE AMDKFD_IOWR(0x81, struct kfd_ioctl_ipc_export_handle_args)
#define AMDKFD_IOC_CROSS_MEMORY_COPY AMDKFD_IOWR(0x83, struct kfd_ioctl_cross_memory_copy_args)
#define AMDKFD_IOC_RLC_SPM AMDKFD_IOWR(0x84, struct kfd_ioctl_spm_args)
#define AMDKFD_IOC_PC_SAMPLE AMDKFD_IOWR(0x85, struct kfd_ioctl_pc_sample_args)

/* Non-upstream command number range: END_2 is one past the last number (0x85). */
#define AMDKFD_COMMAND_START_2 0x80
#define AMDKFD_COMMAND_END_2 0x86
#endif
================================================
FILE: libhsakmt/include/hsakmt/linux/udmabuf.h
================================================
/* GPL-2.0 WITH Linux-syscall-note */
/*
* This file was copied from the linux-libc-dev package
* This header provides the interface to the Linux kernel udmabuf driver
* Modifications may have been made.
*/
#ifndef _THUNK_UDMABUF_H
#define _THUNK_UDMABUF_H
#include
#include
/* NOTE(review): presumably a bit for the `flags` fields below - confirm */
#define UDMABUF_FLAGS_CLOEXEC 0x01
/* Argument block for the UDMABUF_CREATE ioctl. */
struct udmabuf_create {
__u32 memfd; /* NOTE(review): presumably a memfd file descriptor - confirm */
__u32 flags;
__u64 offset;
__u64 size;
};
/* Element type of udmabuf_create_list.list. */
struct udmabuf_create_item {
__u32 memfd; /* NOTE(review): presumably a memfd file descriptor - confirm */
__u32 __pad;
__u64 offset;
__u64 size;
};
/* Argument block for the UDMABUF_CREATE_LIST ioctl. */
struct udmabuf_create_list {
__u32 flags;
__u32 count; /* NOTE(review): presumably the number of entries in @list - confirm */
struct udmabuf_create_item list[]; /* flexible array member */
};
/* udmabuf ioctls: type code 'u', numbers 0x42 and 0x43, both encoded with _IOW. */
#define UDMABUF_CREATE _IOW('u', 0x42, struct udmabuf_create)
#define UDMABUF_CREATE_LIST _IOW('u', 0x43, struct udmabuf_create_list)
#endif /* _THUNK_UDMABUF_H */
================================================
FILE: libhsakmt/libhsakmt.pc.in
================================================
prefix=${pcfiledir}/../..
exec_prefix=${prefix}
libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: libhsakmt
Description: HSA Kernel Mode Thunk library for AMD KFD support
Version: @LIB_VERSION_STRING@
Libs: -L${libdir} -lhsakmt
Cflags: -I${includedir}
================================================
FILE: libhsakmt/src/debug.c
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "libhsakmt.h"
#include "hsakmt/linux/kfd_ioctl.h"
#include
#include
#include
#include
/* Array of per-node flags, allocated by hsakmt_init_device_debugging_memory()
 * and released by hsakmt_destroy_device_debugging_memory(); NULL while not
 * allocated.
 */
static bool *is_device_debugged;
/* Capability mask returned by the kernel on the last successful
 * hsaKmtRuntimeEnable() call; reported by hsaKmtGetRuntimeCapabilities().
 */
static uint32_t runtime_capabilities_mask = 0;
/*
 * Allocate the per-node debug flag array with every flag cleared.
 *
 * NumNodes: number of flags to allocate.
 *
 * Returns HSAKMT_STATUS_NO_MEMORY when the allocation fails (the global
 * pointer is then NULL), HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS hsakmt_init_device_debugging_memory(unsigned int NumNodes)
{
	/* calloc hands back zeroed storage, i.e. every flag reads as false */
	is_device_debugged = calloc(NumNodes, sizeof(bool));

	return is_device_debugged ? HSAKMT_STATUS_SUCCESS
				  : HSAKMT_STATUS_NO_MEMORY;
}
/*
 * Release the per-node debug flag array and reset the global pointer so the
 * "not allocated" state is observable by the other functions in this file.
 */
void hsakmt_destroy_device_debugging_memory(void)
{
	/* free(NULL) is defined to do nothing, so no guard is required */
	free(is_device_debugged);
	is_device_debugged = NULL;
}
/*
 * Return the debug flag stored for node_id.
 *
 * Returns false when the flag array has not been allocated (or has already
 * been destroyed) instead of dereferencing a NULL pointer.
 *
 * NOTE(review): node_id is not range-checked here because the array length
 * is not stored; callers must pass an index below the NumNodes value given
 * to hsakmt_init_device_debugging_memory().
 */
bool hsakmt_debug_get_reg_status(uint32_t node_id)
{
	if (!is_device_debugged)
		return false;

	return is_device_debugged[node_id];
}
/*
 * Issue the deprecated debug-register ioctl for the GPU behind NodeId.
 *
 * Returns HSAKMT_STATUS_NO_MEMORY when the debug flag array is not
 * allocated, the node validation status when NodeId is rejected,
 * HSAKMT_STATUS_ERROR when the ioctl reports a non-zero result and
 * HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgRegister(HSAuint32 NodeId)
{
	struct kfd_ioctl_dbg_register_args reg_args = {0};
	HSAKMT_STATUS status;
	uint32_t kfd_gpu_id;

	CHECK_KFD_OPEN();

	if (is_device_debugged == NULL)
		return HSAKMT_STATUS_NO_MEMORY;

	status = hsakmt_validate_nodeid(NodeId, &kfd_gpu_id);
	if (status != HSAKMT_STATUS_SUCCESS)
		return status;

	reg_args.gpu_id = kfd_gpu_id;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_REGISTER_DEPRECATED,
			 &reg_args) != 0)
		return HSAKMT_STATUS_ERROR;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Issue the deprecated debug-unregister ioctl for the GPU behind NodeId.
 *
 * Returns HSAKMT_STATUS_NO_MEMORY when the debug flag array is not
 * allocated, the node validation status when NodeId is rejected,
 * HSAKMT_STATUS_ERROR when the ioctl reports a non-zero result and
 * HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgUnregister(HSAuint32 NodeId)
{
	struct kfd_ioctl_dbg_unregister_args unreg_args = {0};
	HSAKMT_STATUS status;
	uint32_t kfd_gpu_id;

	CHECK_KFD_OPEN();

	if (is_device_debugged == NULL)
		return HSAKMT_STATUS_NO_MEMORY;

	status = hsakmt_validate_nodeid(NodeId, &kfd_gpu_id);
	if (status != HSAKMT_STATUS_SUCCESS)
		return status;

	unreg_args.gpu_id = kfd_gpu_id;

	return hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_UNREGISTER_DEPRECATED,
			    &unreg_args) ? HSAKMT_STATUS_ERROR
					 : HSAKMT_STATUS_SUCCESS;
}
/*
 * Send a wave-control request through the deprecated wave-control ioctl.
 *
 * The ioctl buffer is the fixed kfd_ioctl_dbg_wave_control_args header
 * followed by a packed payload: Operand, Mode, TrapId, the DbgWaveMsg member
 * and the MemoryVA member of DbgWaveMsgRing, in that order.
 *
 * NodeId:         node whose GPU receives the request.
 * Operand, Mode:  copied verbatim into the payload.
 * TrapId:         copied verbatim into the payload.
 * DbgWaveMsgRing: source of DbgWaveMsg and MemoryVA; must not be NULL.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL DbgWaveMsgRing, the
 * node validation status for a bad NodeId, HSAKMT_STATUS_ERROR when the
 * allocation or the ioctl fails, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgWavefrontControl(HSAuint32 NodeId,
						  HSA_DBG_WAVEOP Operand,
						  HSA_DBG_WAVEMODE Mode,
						  HSAuint32 TrapId,
						  HsaDbgWaveMessage *DbgWaveMsgRing)
{
	HSAKMT_STATUS result;
	uint32_t gpu_id;
	struct kfd_ioctl_dbg_wave_control_args *args;
	unsigned char *run_ptr;
	uint32_t buff_size;
	long err;

	CHECK_KFD_OPEN();

	/* The payload is built from this structure, so it is mandatory */
	if (!DbgWaveMsgRing)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	result = hsakmt_validate_nodeid(NodeId, &gpu_id);
	if (result != HSAKMT_STATUS_SUCCESS)
		return result;

	/* Determine Size of the ioctl buffer */
	buff_size = sizeof(Operand) + sizeof(Mode) + sizeof(TrapId) +
		    sizeof(DbgWaveMsgRing->DbgWaveMsg) +
		    sizeof(DbgWaveMsgRing->MemoryVA) + sizeof(*args);

	/* zeroed allocation, so header fields not set below stay 0 */
	args = calloc(1, buff_size);
	if (!args)
		return HSAKMT_STATUS_ERROR;

	args->gpu_id = gpu_id;
	args->buf_size_in_bytes = buff_size;

	/* the variable part starts right behind the fixed header */
	run_ptr = (unsigned char *)args + sizeof(*args);

	/* save variable content pointer for kfd */
	args->content_ptr = (uint64_t)run_ptr;

	/*
	 * Pack the items back to back. memcpy is used because the packed
	 * offsets carry no alignment guarantee for the wider items.
	 */
	memcpy(run_ptr, &Operand, sizeof(Operand));
	run_ptr += sizeof(Operand);

	memcpy(run_ptr, &Mode, sizeof(Mode));
	run_ptr += sizeof(Mode);

	memcpy(run_ptr, &TrapId, sizeof(TrapId));
	run_ptr += sizeof(TrapId);

	memcpy(run_ptr, &DbgWaveMsgRing->DbgWaveMsg,
	       sizeof(DbgWaveMsgRing->DbgWaveMsg));
	run_ptr += sizeof(DbgWaveMsgRing->DbgWaveMsg);

	memcpy(run_ptr, &DbgWaveMsgRing->MemoryVA,
	       sizeof(DbgWaveMsgRing->MemoryVA));

	/* send to kernel */
	err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_WAVE_CONTROL_DEPRECATED, args);

	free(args);

	if (err)
		return HSAKMT_STATUS_ERROR;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Program address watch points through the deprecated address-watch ioctl.
 *
 * The ioctl buffer is the fixed kfd_ioctl_dbg_address_watch_args header
 * followed by a packed payload: NumWatchPoints, the WatchMode entries, the
 * WatchAddress entries, the WatchMask entries and, when WatchEvent is not
 * NULL, the WatchEvent entries.
 *
 * NodeId:         node whose GPU receives the request.
 * NumWatchPoints: number of entries in WatchMode/WatchAddress; at most
 *                 MAX_ALLOWED_NUM_POINTS.
 * WatchMode:      NumWatchPoints entries; may be NULL only when
 *                 NumWatchPoints is 0.
 * WatchAddress:   NumWatchPoints entries; may be NULL only when
 *                 NumWatchPoints is 0.
 * WatchMask:      must not be NULL. NumWatchPoints entries are sent when
 *                 WatchMask[0] is non-zero, otherwise only WatchMask[0].
 * WatchEvent:     optional; NumWatchPoints entries are sent when non-NULL.
 *
 * Returns the node validation status for a bad NodeId,
 * HSAKMT_STATUS_INVALID_PARAMETER for too many watch points or a missing
 * mandatory array, HSAKMT_STATUS_ERROR when the allocation or the ioctl
 * fails, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgAddressWatch(HSAuint32 NodeId,
					      HSAuint32 NumWatchPoints,
					      HSA_DBG_WATCH_MODE WatchMode[],
					      void *WatchAddress[],
					      HSAuint64 WatchMask[],
					      HsaEvent *WatchEvent[])
{
	HSAKMT_STATUS result;
	uint32_t gpu_id;
	uint32_t watch_mask_items;
	uint32_t watch_event_items;
	uint32_t buff_size;
	struct kfd_ioctl_dbg_address_watch_args *args;
	unsigned char *run_ptr;
	HSAuint32 i = 0;
	long err;

	CHECK_KFD_OPEN();

	result = hsakmt_validate_nodeid(NodeId, &gpu_id);
	if (result != HSAKMT_STATUS_SUCCESS)
		return result;

	if (NumWatchPoints > MAX_ALLOWED_NUM_POINTS)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	/*
	 * WatchMask[0] is always read below, and the mode/address arrays are
	 * read once per watch point, so reject missing arrays up front.
	 */
	if (!WatchMask || (NumWatchPoints && (!WatchMode || !WatchAddress)))
		return HSAKMT_STATUS_INVALID_PARAMETER;

	/*
	 * Number of mask and event entries appended to the payload: a full
	 * mask vector only when the first mask is non-zero (a single entry
	 * otherwise), and an event vector only when one was supplied.
	 */
	watch_mask_items = WatchMask[0] > 0 ? NumWatchPoints : 1;
	watch_event_items = WatchEvent != NULL ? NumWatchPoints : 0;

	/* Size and structure of the ioctl buffer is dynamic in this case
	 * Here we calculate the buff size.
	 */
	buff_size = sizeof(NumWatchPoints) +
		    (sizeof(WatchMode[0]) + sizeof(WatchAddress[0])) *
			NumWatchPoints +
		    watch_mask_items * sizeof(HSAuint64) +
		    watch_event_items * sizeof(HsaEvent *) + sizeof(*args);

	/* zeroed allocation, so header fields not set below stay 0 */
	args = calloc(1, buff_size);
	if (!args)
		return HSAKMT_STATUS_ERROR;

	args->gpu_id = gpu_id;
	args->buf_size_in_bytes = buff_size;

	/* the variable part starts right behind the fixed header */
	run_ptr = (unsigned char *)args + sizeof(*args);

	/* save variable content pointer for kfd */
	args->content_ptr = (uint64_t)run_ptr;

	/*
	 * Pack the items back to back. memcpy is used because the packed
	 * offsets carry no alignment guarantee for the 64-bit items.
	 */
	memcpy(run_ptr, &NumWatchPoints, sizeof(NumWatchPoints));
	run_ptr += sizeof(NumWatchPoints);

	for (i = 0; i < NumWatchPoints; i++) {
		memcpy(run_ptr, &WatchMode[i], sizeof(WatchMode[i]));
		run_ptr += sizeof(WatchMode[i]);
	}

	for (i = 0; i < NumWatchPoints; i++) {
		memcpy(run_ptr, &WatchAddress[i], sizeof(WatchAddress[i]));
		run_ptr += sizeof(WatchAddress[i]);
	}

	for (i = 0; i < watch_mask_items; i++) {
		memcpy(run_ptr, &WatchMask[i], sizeof(WatchMask[i]));
		run_ptr += sizeof(WatchMask[i]);
	}

	for (i = 0; i < watch_event_items; i++) {
		memcpy(run_ptr, &WatchEvent[i], sizeof(WatchEvent[i]));
		run_ptr += sizeof(WatchEvent[i]);
	}

	/* send to kernel */
	err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_ADDRESS_WATCH_DEPRECATED, args);

	free(args);

	if (err)
		return HSAKMT_STATUS_ERROR;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Kernel interface version compared against by
 * hsaKmtCheckRuntimeDebugSupport(): a major version below
 * HSA_RUNTIME_ENABLE_MAX_MAJOR, or that major version with a minor version
 * below HSA_RUNTIME_ENABLE_MIN_MINOR, is reported as not supported.
 */
#define HSA_RUNTIME_ENABLE_MAX_MAJOR 1
#define HSA_RUNTIME_ENABLE_MIN_MINOR 13

/*
 * Check whether runtime debugging can be enabled on this system.
 *
 * Returns HSAKMT_STATUS_ERROR when the system or node properties cannot be
 * read, HSAKMT_STATUS_NOT_SUPPORTED when a node that is not CPU-only lacks
 * the DebugSupportedFirmware capability, when the version query fails or
 * when the kernel interface version is too old, and HSAKMT_STATUS_SUCCESS
 * otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtCheckRuntimeDebugSupport(void)
{
	HsaVersionInfo kfd_version = {0};
	HsaSystemProperties sys_props = {0};
	HsaNodeProperties node_props = {0};
	uint32_t node_idx;

	memset(&node_props, 0x00, sizeof(node_props));
	memset(&sys_props, 0x00, sizeof(sys_props));

	if (hsaKmtAcquireSystemProperties(&sys_props) != 0)
		return HSAKMT_STATUS_ERROR;

	/* One node without debugger-capable firmware disables the feature */
	for (node_idx = 0; node_idx < sys_props.NumNodes; node_idx++) {
		bool cpu_only;

		if (hsaKmtGetNodeProperties(node_idx, &node_props) != 0)
			return HSAKMT_STATUS_ERROR;

		/* nodes with CPU cores and no compute cores are not checked */
		cpu_only = node_props.NumCPUCores && !node_props.NumFComputeCores;

		if (!cpu_only && !node_props.Capability.ui32.DebugSupportedFirmware)
			return HSAKMT_STATUS_NOT_SUPPORTED;
	}

	if (hsaKmtGetVersion(&kfd_version) != 0)
		return HSAKMT_STATUS_NOT_SUPPORTED;

	if (kfd_version.KernelInterfaceMajorVersion < HSA_RUNTIME_ENABLE_MAX_MAJOR)
		return HSAKMT_STATUS_NOT_SUPPORTED;

	if (kfd_version.KernelInterfaceMajorVersion == HSA_RUNTIME_ENABLE_MAX_MAJOR &&
	    (int)kfd_version.KernelInterfaceMinorVersion < HSA_RUNTIME_ENABLE_MIN_MINOR)
		return HSAKMT_STATUS_NOT_SUPPORTED;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Enable runtime debug mode through the runtime-enable ioctl.
 *
 * rDebug:    value passed to the kernel in the r_debug field.
 * setupTtmp: when true, the TTMP-save mode bit is requested as well.
 *
 * Returns the status of hsaKmtCheckRuntimeDebugSupport() when that check
 * fails, HSAKMT_STATUS_UNAVAILABLE when the ioctl fails with errno EBUSY,
 * HSAKMT_STATUS_ERROR for any other ioctl failure, HSAKMT_STATUS_SUCCESS
 * otherwise. On success the capability mask reported by the kernel is
 * cached for hsaKmtGetRuntimeCapabilities().
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeEnable(void *rDebug,
					    bool setupTtmp)
{
	struct kfd_ioctl_runtime_enable_args enable_args = {0};
	HSAKMT_STATUS support = hsaKmtCheckRuntimeDebugSupport();

	if (support)
		return support;

	memset(&enable_args, 0x00, sizeof(enable_args));

	enable_args.mode_mask = KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK;
	if (setupTtmp)
		enable_args.mode_mask |= KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK;
	enable_args.r_debug = (HSAuint64)rDebug;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RUNTIME_ENABLE, &enable_args))
		return errno == EBUSY ? HSAKMT_STATUS_UNAVAILABLE
				      : HSAKMT_STATUS_ERROR;

	runtime_capabilities_mask = enable_args.capabilities_mask;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Disable runtime debug mode by issuing the runtime-enable ioctl with an
 * all-zero mode mask.
 *
 * Returns the status of hsaKmtCheckRuntimeDebugSupport() when that check
 * fails, HSAKMT_STATUS_ERROR when the ioctl fails, HSAKMT_STATUS_SUCCESS
 * otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRuntimeDisable(void)
{
	struct kfd_ioctl_runtime_enable_args disable_args = {0};
	HSAKMT_STATUS support = hsaKmtCheckRuntimeDebugSupport();

	if (support)
		return support;

	memset(&disable_args, 0x00, sizeof(disable_args));

	/* a mode mask of 0 is the "disable" request */
	disable_args.mode_mask = 0;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RUNTIME_ENABLE, &disable_args) != 0)
		return HSAKMT_STATUS_ERROR;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Report the capability mask cached by the last successful
 * hsaKmtRuntimeEnable() call (0 if there was none).
 *
 * caps_mask: receives the mask; must not be NULL.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL caps_mask,
 * HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetRuntimeCapabilities(HSAuint32 *caps_mask)
{
	if (!caps_mask)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	*caps_mask = runtime_capabilities_mask;

	return HSAKMT_STATUS_SUCCESS;
}
static HSAKMT_STATUS dbg_trap_get_device_data(void *data,
uint32_t *n_entries,
uint32_t entry_size)
{
struct kfd_ioctl_dbg_trap_args args = {0};
args.device_snapshot.snapshot_buf_ptr = (uint64_t) data;
args.device_snapshot.num_devices = *n_entries;
args.device_snapshot.entry_size = entry_size;
args.op = KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT;
args.pid = getpid();
if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
return HSAKMT_STATUS_ERROR;
*n_entries = args.device_snapshot.num_devices;
return HSAKMT_STATUS_SUCCESS;
}
/*
 * Run the GET_QUEUE_SNAPSHOT debug-trap operation for the calling process.
 *
 * data:       buffer handed to the kernel as the snapshot destination; may
 *             be NULL when only the queue count is wanted.
 * n_entries:  in: number of entries offered to the kernel; out: the
 *             num_queues value the kernel wrote back.
 * entry_size: size of one entry as passed to the kernel.
 * queue_ids:  optional; when non-NULL (and data is non-NULL) receives the
 *             queue_id of each snapshot entry. It must have room for the
 *             number of entries offered in *n_entries.
 *
 * Returns HSAKMT_STATUS_ERROR when the ioctl fails (n_entries is then left
 * untouched), HSAKMT_STATUS_SUCCESS otherwise.
 */
static HSAKMT_STATUS dbg_trap_get_queue_data(void *data,
					     uint32_t *n_entries,
					     uint32_t entry_size,
					     uint32_t *queue_ids)
{
	struct kfd_ioctl_dbg_trap_args args = {0};
	/* number of entries the caller's buffers can actually hold */
	const uint32_t capacity = *n_entries;

	args.queue_snapshot.num_queues = *n_entries;
	args.queue_snapshot.entry_size = entry_size;
	args.queue_snapshot.exception_mask = KFD_EC_MASK(EC_QUEUE_NEW);
	args.op = KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT;
	args.queue_snapshot.snapshot_buf_ptr = (uint64_t) data;
	args.pid = getpid();

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
		return HSAKMT_STATUS_ERROR;

	*n_entries = args.queue_snapshot.num_queues;

	if (queue_ids && data) {
		struct kfd_queue_snapshot_entry *queue_entry =
			(struct kfd_queue_snapshot_entry *) data;
		/*
		 * Never walk past what the caller offered: the count written
		 * back by the kernel is not guaranteed by this code to be
		 * bounded by the offered capacity.
		 * NOTE(review): presumably the kernel reports the total
		 * number of queues even when it exceeds the offered count -
		 * confirm against the KFD uapi documentation.
		 */
		uint32_t n_copy = *n_entries < capacity ? *n_entries : capacity;

		for (uint32_t i = 0; i < n_copy; i++)
			queue_ids[i] = queue_entry[i].queue_id;
	}

	return HSAKMT_STATUS_SUCCESS;
}
static HSAKMT_STATUS dbg_trap_suspend_queues(uint32_t *queue_ids,
uint32_t num_queues)
{
struct kfd_ioctl_dbg_trap_args args = {0};
int r;
args.suspend_queues.queue_array_ptr = (uint64_t) queue_ids;
args.suspend_queues.num_queues = num_queues;
args.suspend_queues.exception_mask = KFD_EC_MASK(EC_QUEUE_NEW);
args.op = KFD_IOC_DBG_TRAP_SUSPEND_QUEUES;
args.pid = getpid();
r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
if (r < 0)
return HSAKMT_STATUS_ERROR;
return HSAKMT_STATUS_SUCCESS;
}
/* Debugger support has been available since KFD ABI 1.13. */
#define KFD_MINOR_MIN_DEBUG 13
/*
 * Enable debugging of the calling process through the debug-trap ioctl.
 *
 * runtime_info: receives a malloc'ed buffer of *data_size bytes that is
 *               passed to the kernel as the runtime-info destination. The
 *               caller owns it on success and releases it with free(). On
 *               failure it is set to NULL.
 * data_size:    receives sizeof(struct kfd_runtime_info).
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL argument,
 * HSAKMT_STATUS_NO_MEMORY when the buffer cannot be allocated,
 * HSAKMT_STATUS_ERROR when the ioctl fails, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgEnable(void **runtime_info,
					HSAuint32 *data_size)
{
	struct kfd_ioctl_dbg_trap_args args = {0};

	CHECK_KFD_OPEN();
	CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG);

	if (!runtime_info || !data_size)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	*data_size = sizeof(struct kfd_runtime_info);
	args.enable.rinfo_size = *data_size;
	args.enable.dbg_fd = hsakmt_kfd_fd;

	*runtime_info = malloc(args.enable.rinfo_size);
	if (!*runtime_info)
		return HSAKMT_STATUS_NO_MEMORY;

	args.enable.rinfo_ptr = (uint64_t) *runtime_info;
	args.op = KFD_IOC_DBG_TRAP_ENABLE;
	args.pid = getpid();

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &args)) {
		free(*runtime_info);
		/* do not hand a dangling pointer back to the caller */
		*runtime_info = NULL;
		return HSAKMT_STATUS_ERROR;
	}

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Disable debugging of the calling process through the debug-trap ioctl.
 *
 * Returns HSAKMT_STATUS_ERROR when the ioctl fails and
 * HSAKMT_STATUS_SUCCESS otherwise; the two CHECK_* macros may return early
 * before the ioctl is attempted.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgDisable(void)
{
	struct kfd_ioctl_dbg_trap_args trap_args = {0};

	CHECK_KFD_OPEN();
	CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG);

	trap_args.op = KFD_IOC_DBG_TRAP_DISABLE;
	trap_args.pid = getpid();
	trap_args.enable.dbg_fd = hsakmt_kfd_fd;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, &trap_args) != 0)
		return HSAKMT_STATUS_ERROR;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Fetch the device snapshot of the calling process.
 *
 * data:       receives a malloc'ed buffer holding the snapshot entries. The
 *             caller owns it on success and releases it with free(). It is
 *             set to NULL when the snapshot query fails.
 * n_entries:  receives the device count written back by the kernel.
 * entry_size: receives sizeof(struct kfd_dbg_device_info_entry).
 *
 * Returns HSAKMT_STATUS_NO_MEMORY when the buffer cannot be allocated, the
 * status of the snapshot query otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetDeviceData(void **data,
					       HSAuint32 *n_entries,
					       HSAuint32 *entry_size)
{
	HSAKMT_STATUS ret = HSAKMT_STATUS_NO_MEMORY;

	CHECK_KFD_OPEN();
	CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG);

	*n_entries = UINT32_MAX;
	*entry_size = sizeof(struct kfd_dbg_device_info_entry);
	/*
	 * NOTE(review): both operands are HSAuint32, so this product is
	 * presumably evaluated in 32 bits and wraps; the buffer is then
	 * smaller than UINT32_MAX entries while UINT32_MAX is what gets
	 * offered to the kernel. Left unchanged because the correct sizing
	 * depends on kernel behaviour not visible here - confirm and fix.
	 */
	*data = malloc(*entry_size * *n_entries);
	if (!*data)
		return ret;

	ret = dbg_trap_get_device_data(*data, n_entries, *entry_size);
	if (ret) {
		free(*data);
		/* do not hand a dangling pointer back to the caller */
		*data = NULL;
	}

	return ret;
}
/*
 * Fetch the queue snapshot of the calling process, optionally suspending
 * the queues first.
 *
 * The queue count is queried first with a NULL buffer, then a buffer of that
 * many entries is allocated and filled. When suspend_queues is true and
 * there is at least one queue, the queue ids from the first snapshot are
 * suspended and the snapshot is taken again.
 *
 * data:           receives a malloc'ed snapshot buffer. The caller owns it
 *                 on success and releases it with free(). It is set to NULL
 *                 when a later step fails.
 * n_entries:      receives the number of entries, never more than the
 *                 buffer was allocated for.
 * entry_size:     receives sizeof(struct kfd_queue_snapshot_entry).
 * suspend_queues: whether to suspend the queues before the final snapshot.
 *
 * Returns HSAKMT_STATUS_NO_MEMORY when the snapshot buffer cannot be
 * allocated, HSAKMT_STATUS_ERROR when any other step fails,
 * HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDbgGetQueueData(void **data,
					      HSAuint32 *n_entries,
					      HSAuint32 *entry_size,
					      bool suspend_queues)
{
	uint32_t *queue_ids = NULL;
	uint32_t capacity;

	CHECK_KFD_OPEN();
	CHECK_KFD_MINOR_VERSION(KFD_MINOR_MIN_DEBUG);

	*entry_size = sizeof(struct kfd_queue_snapshot_entry);
	*n_entries = 0;
	if (dbg_trap_get_queue_data(NULL, n_entries, *entry_size, NULL))
		return HSAKMT_STATUS_ERROR;

	/* everything below must stay within this many entries */
	capacity = *n_entries;

	*data = malloc((size_t)capacity * *entry_size);
	if (!*data)
		return HSAKMT_STATUS_NO_MEMORY;

	/*
	 * The id array is only needed when suspending. A NULL queue_ids is
	 * an error only if this allocation was attempted and failed.
	 */
	if (suspend_queues && capacity) {
		queue_ids = (uint32_t *)malloc(sizeof(uint32_t) * capacity);
		if (!queue_ids)
			goto free_data;
	}

	if (dbg_trap_get_queue_data(*data, n_entries, *entry_size, queue_ids))
		goto free_data;

	/*
	 * NOTE(review): presumably the kernel can report more queues than
	 * were offered if queues were created in between - confirm. Clamp so
	 * nothing past the allocated buffers is ever used or reported.
	 */
	if (*n_entries > capacity)
		*n_entries = capacity;

	if (queue_ids) {
		if (dbg_trap_suspend_queues(queue_ids, *n_entries) ||
		    dbg_trap_get_queue_data(*data, n_entries, *entry_size, NULL))
			goto free_data;

		if (*n_entries > capacity)
			*n_entries = capacity;

		free(queue_ids);
	}

	return HSAKMT_STATUS_SUCCESS;

free_data:
	free(*data);
	/* do not hand a dangling pointer back to the caller */
	*data = NULL;
	free(queue_ids);

	return HSAKMT_STATUS_ERROR;
}
/*
 * Forward a caller-built debug-trap request to the kernel.
 *
 * args:        request structure passed to the ioctl as is.
 * Queues:      optional. When non-NULL, the queue ids are converted with
 *              hsakmt_convert_queue_ids() and copied into the array that
 *              args points to: the suspend_queues array for the
 *              SUSPEND_QUEUES operation, the resume_queues array for every
 *              other operation.
 * DebugReturn: optional; receives the raw ioctl return value.
 *
 * Returns HSAKMT_STATUS_NO_MEMORY when the id conversion fails. For the
 * suspend and resume operations the call succeeds when the ioctl result
 * lies between 0 and the requested number of queues; for every other
 * operation only a result of 0 is a success. Anything else yields
 * HSAKMT_STATUS_ERROR.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDebugTrapIoctl(struct kfd_ioctl_dbg_trap_args *args,
					     HSA_QUEUEID *Queues,
					     HSAuint64 *DebugReturn)
{
	long ioctl_ret;
	bool succeeded;

	CHECK_KFD_OPEN();

	if (Queues) {
		int queue_count;
		void *dst_ids;
		uint32_t *kfd_ids;

		if (args->op == KFD_IOC_DBG_TRAP_SUSPEND_QUEUES) {
			queue_count = args->suspend_queues.num_queues;
			dst_ids = (void *)args->suspend_queues.queue_array_ptr;
		} else {
			queue_count = args->resume_queues.num_queues;
			dst_ids = (void *)args->resume_queues.queue_array_ptr;
		}

		kfd_ids = hsakmt_convert_queue_ids(queue_count, Queues);
		if (!kfd_ids)
			return HSAKMT_STATUS_NO_MEMORY;

		memcpy(dst_ids, kfd_ids, queue_count * sizeof(uint32_t));
		free(kfd_ids);
	}

	ioctl_ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DBG_TRAP, args);
	if (DebugReturn)
		*DebugReturn = ioctl_ret;

	/*
	 * For suspend/resume a result of 0 always falls inside the accepted
	 * range, so the plain "== 0" test is only needed for other operations.
	 */
	if (args->op == KFD_IOC_DBG_TRAP_SUSPEND_QUEUES)
		succeeded = ioctl_ret >= 0 &&
			    ioctl_ret <= args->suspend_queues.num_queues;
	else if (args->op == KFD_IOC_DBG_TRAP_RESUME_QUEUES)
		succeeded = ioctl_ret >= 0 &&
			    ioctl_ret <= args->resume_queues.num_queues;
	else
		succeeded = ioctl_ret == 0;

	return succeeded ? HSAKMT_STATUS_SUCCESS : HSAKMT_STATUS_ERROR;
}
================================================
FILE: libhsakmt/src/events.c
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "libhsakmt.h"
#include
#include
#include
#include
#include
#include
#include
#include "hsakmt/linux/kfd_ioctl.h"
#include "fmm.h"
#include "hsakmt/hsakmtmodel.h"
/* Base address of the event slot array used by hsaKmtCreateEvent(); NULL
 * until that function allocates or maps it.
 */
static HSAuint64 *events_page = NULL;
/* Forget the cached events page pointer. Only the pointer is reset here;
 * nothing is unmapped or freed.
 */
void hsakmt_clear_events_page(void)
{
events_page = NULL;
}
/*
 * Tell whether an event type is system-defined, i.e. anything other than a
 * signal event or a debug event (debug events are treated like signal
 * events).
 */
static bool IsSystemEventType(HSA_EVENTTYPE type)
{
	const bool user_signalable = type == HSA_EVENTTYPE_SIGNAL ||
				     type == HSA_EVENTTYPE_DEBUG_EVENT;

	return !user_signalable;
}
/*
 * Create a KFD event and return a handle describing it.
 *
 * EventDesc:   type, node and sync-variable description of the event; must
 *              not be NULL.
 * ManualReset: when false the event is created with auto_reset set.
 * IsSignaled:  when true and the event is not a system event, the event is
 *              set right after creation.
 * Event:       receives the malloc'ed handle on success and NULL on every
 *              failure after parameter validation; must not be NULL. The
 *              handle is released with hsaKmtDestroyEvent().
 *
 * The shared events page is set up on first use, under hsakmt_mutex: on a
 * dGPU it is allocated before the create ioctl, otherwise it is mmap'ed from
 * the offset the create ioctl reports.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for NULL arguments or an
 * out-of-range event type, HSAKMT_STATUS_ERROR for allocation, ioctl or
 * mapping failures, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
					  bool ManualReset, bool IsSignaled,
					  HsaEvent **Event)
{
	/*
	 * Number of slots in events_page. Kept across calls so that the
	 * reduced limit chosen when the page was mapped on an old kernel is
	 * still applied to events created later. Only accessed with
	 * hsakmt_mutex held.
	 */
	static unsigned int event_limit = KFD_SIGNAL_EVENT_LIMIT;

	CHECK_KFD_OPEN();

	if (!EventDesc || !Event)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (EventDesc->EventType >= HSA_EVENTTYPE_MAXID)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	/* callers never see a stale handle on any failure path below */
	*Event = NULL;

	HsaEvent *e = malloc(sizeof(HsaEvent));

	if (!e)
		return HSAKMT_STATUS_ERROR;

	memset(e, 0, sizeof(*e));

	struct kfd_ioctl_create_event_args args = {0};

	args.event_type = EventDesc->EventType;
	args.node_id = EventDesc->NodeId;
	args.auto_reset = !ManualReset;

	/* dGPU code */
	pthread_mutex_lock(&hsakmt_mutex);

	if (hsakmt_is_dgpu && !events_page) {
		events_page = hsakmt_allocate_exec_aligned_memory_gpu(
			KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, 0, true, false, true);
		if (!events_page) {
			free(e);
			pthread_mutex_unlock(&hsakmt_mutex);
			return HSAKMT_STATUS_ERROR;
		}
		/* this page always has the full number of slots */
		event_limit = KFD_SIGNAL_EVENT_LIMIT;
		if (hsakmt_use_model)
			model_set_event_page(events_page, KFD_SIGNAL_EVENT_LIMIT);
		else
			hsakmt_fmm_get_handle(events_page, (uint64_t *)&args.event_page_offset);
	}

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_CREATE_EVENT, &args) != 0) {
		free(e);
		pthread_mutex_unlock(&hsakmt_mutex);
		return HSAKMT_STATUS_ERROR;
	}

	e->EventId = args.event_id;

	if (!events_page && args.event_page_offset > 0) {
		event_limit = KFD_SIGNAL_EVENT_LIMIT;
		events_page = mmap(NULL, event_limit * 8, PROT_WRITE | PROT_READ,
				   MAP_SHARED, hsakmt_kfd_fd, args.event_page_offset);
		if (events_page == MAP_FAILED) {
			/* old kernels only support 256 events */
			event_limit = 256;
			events_page = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ,
					   MAP_SHARED, hsakmt_kfd_fd, args.event_page_offset);
		}
		if (events_page == MAP_FAILED) {
			events_page = NULL;
			pthread_mutex_unlock(&hsakmt_mutex);
			hsaKmtDestroyEvent(e);
			return HSAKMT_STATUS_ERROR;
		}
	}

	/* expose the slot address only when the slot lies inside the page */
	if (args.event_page_offset > 0 && args.event_slot_index < event_limit)
		e->EventData.HWData2 = (HSAuint64)&events_page[args.event_slot_index];

	pthread_mutex_unlock(&hsakmt_mutex);

	e->EventData.EventType = EventDesc->EventType;
	e->EventData.HWData1 = args.event_id;
	e->EventData.HWData3 = args.event_trigger_data;
	e->EventData.EventData.SyncVar.SyncVar.UserData =
		EventDesc->SyncVar.SyncVar.UserData;
	e->EventData.EventData.SyncVar.SyncVarSize =
		EventDesc->SyncVar.SyncVarSize;

	if (IsSignaled && !IsSystemEventType(e->EventData.EventType)) {
		struct kfd_ioctl_set_event_args set_args = {0};

		set_args.event_id = args.event_id;

		if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_EVENT,
				 &set_args) != 0) {
			hsaKmtDestroyEvent(e);
			return HSAKMT_STATUS_ERROR;
		}
	}

	*Event = e;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Destroy a KFD event and release its handle.
 *
 * Returns HSAKMT_STATUS_INVALID_HANDLE for a NULL Event and
 * HSAKMT_STATUS_ERROR when the destroy ioctl fails; in the latter case the
 * handle is NOT freed. Returns HSAKMT_STATUS_SUCCESS after the handle has
 * been freed.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event)
{
	struct kfd_ioctl_destroy_event_args destroy_args = {0};

	CHECK_KFD_OPEN();

	if (Event == NULL)
		return HSAKMT_STATUS_INVALID_HANDLE;

	destroy_args.event_id = Event->EventId;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DESTROY_EVENT, &destroy_args))
		return HSAKMT_STATUS_ERROR;

	free(Event);

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Signal an event through the set-event ioctl.
 *
 * Returns HSAKMT_STATUS_INVALID_HANDLE for a NULL Event,
 * HSAKMT_STATUS_ERROR for a system-defined event type or when the ioctl
 * returns -1, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event)
{
	struct kfd_ioctl_set_event_args set_args = {0};

	CHECK_KFD_OPEN();

	if (Event == NULL)
		return HSAKMT_STATUS_INVALID_HANDLE;

	/* The spec is silent on this, but system-defined events must not be
	 * signaled from user space.
	 */
	if (IsSystemEventType(Event->EventData.EventType))
		return HSAKMT_STATUS_ERROR;

	set_args.event_id = Event->EventId;

	return hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_EVENT, &set_args) == -1
		       ? HSAKMT_STATUS_ERROR
		       : HSAKMT_STATUS_SUCCESS;
}
/*
 * Reset an event through the reset-event ioctl.
 *
 * Returns HSAKMT_STATUS_INVALID_HANDLE for a NULL Event,
 * HSAKMT_STATUS_ERROR for a system-defined event type or when the ioctl
 * returns -1, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event)
{
	struct kfd_ioctl_reset_event_args reset_args = {0};

	CHECK_KFD_OPEN();

	if (Event == NULL)
		return HSAKMT_STATUS_INVALID_HANDLE;

	/* The spec is silent on this, but system-defined events must not be
	 * reset from user space.
	 */
	if (IsSystemEventType(Event->EventData.EventType))
		return HSAKMT_STATUS_ERROR;

	reset_args.event_id = Event->EventId;

	return hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RESET_EVENT, &reset_args) == -1
		       ? HSAKMT_STATUS_ERROR
		       : HSAKMT_STATUS_SUCCESS;
}
/*
 * Validate an event handle. No event state is actually queried: the result
 * is HSAKMT_STATUS_INVALID_HANDLE for a NULL Event and
 * HSAKMT_STATUS_SUCCESS for any other pointer.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event)
{
	CHECK_KFD_OPEN();

	return Event ? HSAKMT_STATUS_SUCCESS : HSAKMT_STATUS_INVALID_HANDLE;
}
/* Wait on a single event. Thin wrapper around hsaKmtWaitOnEvent_Ext() that
 * passes NULL for the event age, so no age is supplied or reported.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event,
HSAuint32 Milliseconds)
{
return hsaKmtWaitOnEvent_Ext(Event, Milliseconds, NULL);
}
/* Wait on a single event, optionally exchanging its event age.
 *
 * Returns HSAKMT_STATUS_INVALID_HANDLE for a NULL Event; otherwise forwards
 * to hsaKmtWaitOnMultipleEvents_Ext() with a one-element event array and
 * WaitOnAll set to true, and returns its status.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent_Ext(HsaEvent *Event,
HSAuint32 Milliseconds, uint64_t *event_age)
{
if (!Event)
return HSAKMT_STATUS_INVALID_HANDLE;
/* &Event acts as an array holding exactly one event pointer */
return hsaKmtWaitOnMultipleEvents_Ext(&Event, 1, true, Milliseconds, event_age);
}
/*
 * Query the SVM attributes of the page at address and log them.
 *
 * Four attributes are requested through the SVM ioctl (GET_ATTR operation)
 * for a PAGE_SIZE range starting at address: preferred location, prefetch
 * location, access (for gpu_id) and the flag set. Each returned attribute is
 * printed with pr_err().
 *
 * Returns HSAKMT_STATUS_ERROR when the ioctl fails or an attribute of an
 * unexpected type comes back, HSAKMT_STATUS_SUCCESS otherwise.
 */
static HSAKMT_STATUS get_mem_info_svm_api(uint64_t address, uint32_t gpu_id)
{
	HSA_SVM_ATTRIBUTE query[] = {
					{HSA_SVM_ATTR_PREFERRED_LOC, 0},
					{HSA_SVM_ATTR_PREFETCH_LOC, 0},
					{HSA_SVM_ATTR_ACCESS, gpu_id},
					{HSA_SVM_ATTR_SET_FLAGS, 0},
				    };
	struct kfd_ioctl_svm_args *svm_args;
	uint32_t node_id = 0;
	HSAuint32 query_bytes;
	HSAuint32 idx;

	CHECK_KFD_OPEN();
	CHECK_KFD_MINOR_VERSION(5);

	query_bytes = sizeof(query);

	/* fixed header plus the attribute array, on the stack */
	svm_args = alloca(sizeof(*svm_args) + query_bytes);

	svm_args->start_addr = address;
	svm_args->size = PAGE_SIZE;
	svm_args->op = KFD_IOCTL_SVM_OP_GET_ATTR;
	svm_args->nattr = query_bytes / sizeof(*query);
	memcpy(svm_args->attrs, query, query_bytes);

	/* the attribute bytes are added to the size encoded in the request */
	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (query_bytes << _IOC_SIZESHIFT), svm_args)) {
		pr_debug("op get range attrs failed %s\n", strerror(errno));
		return HSAKMT_STATUS_ERROR;
	}

	pr_err("GPU address 0x%lx, is Unified memory\n", address);

	for (idx = 0; idx < svm_args->nattr; idx++) {
		/*
		 * The two special location values are kept as they are; any
		 * other value is run through the gpu id to node id lookup.
		 */
		if (svm_args->attrs[idx].value != KFD_IOCTL_SVM_LOCATION_SYSMEM &&
		    svm_args->attrs[idx].value != KFD_IOCTL_SVM_LOCATION_UNDEFINED)
			hsakmt_gpuid_to_nodeid(svm_args->attrs[idx].value, &node_id);
		else
			node_id = svm_args->attrs[idx].value;

		switch (svm_args->attrs[idx].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			pr_err("Preferred location for address 0x%lx is Node id %d\n",
			       address, node_id);
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			pr_err("Prefetch location for address 0x%lx is Node id %d\n",
			       address, node_id);
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
			pr_err("Node id %d has access to address 0x%lx\n",
			       node_id, address);
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
			pr_err("Node id %d has access in place to address 0x%lx\n",
			       node_id, address);
			break;
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			pr_err("Node id %d has no access to address 0x%lx\n",
			       node_id, address);
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			/* one line per flag bit that is set */
			if (svm_args->attrs[idx].value & KFD_IOCTL_SVM_FLAG_COHERENT)
				pr_err("Fine grained coherency between devices\n");
			if (svm_args->attrs[idx].value & KFD_IOCTL_SVM_FLAG_GPU_RO)
				pr_err("Read only\n");
			if (svm_args->attrs[idx].value & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
				pr_err("GPU exec allowed\n");
			if (svm_args->attrs[idx].value & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)
				pr_err("GPU always mapped\n");
			if (svm_args->attrs[idx].value & KFD_IOCTL_SVM_FLAG_EXT_COHERENT)
				pr_err("Extended-scope fine grained coherency between devices\n");
			break;
		default:
			pr_debug("get invalid attr type 0x%x\n", svm_args->attrs[idx].type);
			return HSAKMT_STATUS_ERROR;
		}
	}

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Log a description of a memory exception reported by the kernel.
 *
 * Prints the faulting address, the node and the failure kind, then looks the
 * address up with hsakmt_fmm_get_mem_info(). When that lookup fails, the SVM
 * attribute query is tried instead. When it succeeds, the buffer's address,
 * node, size, type and its registered/mapped node lists are printed. All
 * output goes through pr_err().
 */
static void analysis_memory_exception(struct kfd_hsa_memory_exception_data *
						memory_exception_data)
{
	const uint64_t fault_va = memory_exception_data->va;
	HsaPointerInfo ptr_info;
	uint32_t fault_node = 0;
	unsigned int n;

	hsakmt_gpuid_to_nodeid(memory_exception_data->gpu_id, &fault_node);
	pr_err("Memory exception on virtual address 0x%lx, ", fault_va);
	pr_err("node id %d : ", fault_node);

	/* only the first matching failure reason is reported */
	if (memory_exception_data->failure.NotPresent)
		pr_err("Page not present\n");
	else if (memory_exception_data->failure.ReadOnly)
		pr_err("Writing to readonly page\n");
	else if (memory_exception_data->failure.NoExecute)
		pr_err("Execute to none-executable page\n");

	if (hsakmt_fmm_get_mem_info((const void *)fault_va, &ptr_info) !=
	    HSAKMT_STATUS_SUCCESS) {
		/* not found by the lookup above: try the SVM query */
		if (get_mem_info_svm_api(fault_va, memory_exception_data->gpu_id) !=
		    HSAKMT_STATUS_SUCCESS)
			pr_err("Address does not belong to a known buffer\n");
		return;
	}

	pr_err("GPU address 0x%lx, node id %d, size in byte 0x%lx\n",
	       ptr_info.GPUAddress, ptr_info.Node, ptr_info.SizeInBytes);

	switch (ptr_info.Type) {
	case HSA_POINTER_REGISTERED_SHARED:
		pr_err("Memory is registered shared buffer (IPC)\n");
		break;
	case HSA_POINTER_REGISTERED_GRAPHICS:
		pr_err("Memory is registered graphics buffer\n");
		break;
	case HSA_POINTER_REGISTERED_USER:
		pr_err("Memory is registered user pointer\n");
		pr_err("CPU address of the memory is %p\n", ptr_info.CPUAddress);
		break;
	case HSA_POINTER_ALLOCATED:
		pr_err("Memory is allocated using hsaKmtAllocMemory\n");
		pr_err("CPU address of the memory is %p\n", ptr_info.CPUAddress);
		break;
	case HSA_POINTER_RESERVED_ADDR:
		pr_err("Memory is allocated by OnlyAddress mode\n");
		break;
	default:
		pr_err("Invalid memory type %d\n", ptr_info.Type);
		break;
	}

	if (ptr_info.RegisteredNodes) {
		pr_err("Memory is registered to node id: ");
		n = 0;
		while (n < ptr_info.NRegisteredNodes) {
			pr_err("%d ", ptr_info.RegisteredNodes[n]);
			n++;
		}
		pr_err("\n");
	}

	if (ptr_info.MappedNodes) {
		pr_err("Memory is mapped to node id: ");
		n = 0;
		while (n < ptr_info.NMappedNodes) {
			pr_err("%d ", ptr_info.MappedNodes[n]);
			n++;
		}
		pr_err("\n");
	}
}
/* Wait on several events. Thin wrapper around
 * hsaKmtWaitOnMultipleEvents_Ext() that passes NULL for the event ages, so
 * no ages are supplied or reported.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[],
HSAuint32 NumEvents,
bool WaitOnAll,
HSAuint32 Milliseconds)
{
return hsaKmtWaitOnMultipleEvents_Ext(Events, NumEvents, WaitOnAll, Milliseconds, NULL);
}
/*
 * Wait on a set of events through the wait-events ioctl.
 *
 * Events:       array of NumEvents event handles; neither the array nor any
 *               element may be NULL.
 * NumEvents:    number of entries in Events (and in event_age, if given).
 * WaitOnAll:    passed to the kernel as wait_for_all.
 * Milliseconds: passed to the kernel as the timeout.
 * event_age:    optional array of NumEvents ages. For signal events the
 *               entry is sent to the kernel as last_event_age and updated
 *               from the kernel's answer before returning, on every path
 *               that reaches the ioctl.
 *
 * After a wait that did not time out, memory and hardware exception events
 * whose returned data carries a gpu id get their EventData filled in from
 * it; memory exceptions are additionally logged.
 *
 * Returns HSAKMT_STATUS_INVALID_HANDLE for a NULL array or element,
 * HSAKMT_STATUS_NO_MEMORY when the ioctl buffer cannot be allocated,
 * HSAKMT_STATUS_ERROR when the ioctl returns -1,
 * HSAKMT_STATUS_WAIT_TIMEOUT on timeout, the gpu id lookup status when that
 * lookup fails, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[],
						       HSAuint32 NumEvents,
						       bool WaitOnAll,
						       HSAuint32 Milliseconds,
						       uint64_t *event_age)
{
	HSAKMT_STATUS result;

	CHECK_KFD_OPEN();

	if (!Events)
		return HSAKMT_STATUS_INVALID_HANDLE;

	/* every element is dereferenced below, so reject NULL entries first */
	for (HSAuint32 i = 0; i < NumEvents; i++) {
		if (!Events[i])
			return HSAKMT_STATUS_INVALID_HANDLE;
	}

	struct kfd_event_data *event_data =
		calloc(NumEvents, sizeof(struct kfd_event_data));

	if (!event_data)
		return HSAKMT_STATUS_NO_MEMORY;

	for (HSAuint32 i = 0; i < NumEvents; i++) {
		event_data[i].event_id = Events[i]->EventId;
		event_data[i].kfd_event_data_ext = (uint64_t)(uintptr_t)NULL;
		if (event_age && Events[i]->EventData.EventType == HSA_EVENTTYPE_SIGNAL)
			event_data[i].signal_event_data.last_event_age = event_age[i];
	}

	struct kfd_ioctl_wait_events_args args = {0};

	args.wait_for_all = WaitOnAll;
	args.timeout = Milliseconds;
	args.num_events = NumEvents;
	args.events_ptr = (uint64_t)(uintptr_t)event_data;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_WAIT_EVENTS, &args) == -1)
		result = HSAKMT_STATUS_ERROR;
	else if (args.wait_result == KFD_IOC_WAIT_RESULT_TIMEOUT)
		result = HSAKMT_STATUS_WAIT_TIMEOUT;
	else {
		result = HSAKMT_STATUS_SUCCESS;
		for (HSAuint32 i = 0; i < NumEvents; i++) {
			HsaEvent *ev = Events[i];
			struct kfd_event_data *kd = &event_data[i];

			if (ev->EventData.EventType == HSA_EVENTTYPE_MEMORY &&
			    kd->memory_exception_data.gpu_id) {
				ev->EventData.EventData.MemoryAccessFault.VirtualAddress =
					kd->memory_exception_data.va;
				result = hsakmt_gpuid_to_nodeid(kd->memory_exception_data.gpu_id,
						&ev->EventData.EventData.MemoryAccessFault.NodeId);
				if (result != HSAKMT_STATUS_SUCCESS)
					goto out;
				ev->EventData.EventData.MemoryAccessFault.Failure.NotPresent =
					kd->memory_exception_data.failure.NotPresent;
				ev->EventData.EventData.MemoryAccessFault.Failure.ReadOnly =
					kd->memory_exception_data.failure.ReadOnly;
				ev->EventData.EventData.MemoryAccessFault.Failure.NoExecute =
					kd->memory_exception_data.failure.NoExecute;
				ev->EventData.EventData.MemoryAccessFault.Failure.Imprecise =
					kd->memory_exception_data.failure.imprecise;
				ev->EventData.EventData.MemoryAccessFault.Failure.ErrorType =
					kd->memory_exception_data.ErrorType;
				/* error types 1 and 2 are reported as ECC errors */
				ev->EventData.EventData.MemoryAccessFault.Failure.ECC =
					((kd->memory_exception_data.ErrorType == 1) ||
					 (kd->memory_exception_data.ErrorType == 2)) ? 1 : 0;
				ev->EventData.EventData.MemoryAccessFault.Flags =
					HSA_EVENTID_MEMORY_FATAL_PROCESS;
				analysis_memory_exception(&kd->memory_exception_data);
			} else if (ev->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION &&
				   kd->hw_exception_data.gpu_id) {
				result = hsakmt_gpuid_to_nodeid(kd->hw_exception_data.gpu_id,
						&ev->EventData.EventData.HwException.NodeId);
				if (result != HSAKMT_STATUS_SUCCESS)
					goto out;
				ev->EventData.EventData.HwException.ResetType =
					kd->hw_exception_data.reset_type;
				ev->EventData.EventData.HwException.ResetCause =
					kd->hw_exception_data.reset_cause;
				ev->EventData.EventData.HwException.MemoryLost =
					kd->hw_exception_data.memory_lost;
			}
		}
	}
out:
	/* hand the ages back whatever the outcome of the wait was */
	for (HSAuint32 i = 0; i < NumEvents; i++) {
		if (event_age && Events[i]->EventData.EventType == HSA_EVENTTYPE_SIGNAL)
			event_age[i] = event_data[i].signal_event_data.last_event_age;
	}

	free(event_data);

	return result;
}
/*
 * Open an SMI event file descriptor for a node through the SMI-events ioctl.
 *
 * NodeId: node to open the descriptor for.
 * fd:     receives the anon_fd value returned by the kernel; must not be
 *         NULL.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL fd, the node validation
 * status for a bad NodeId, HSAKMT_STATUS_ERROR when the ioctl fails,
 * HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtOpenSMI(HSAuint32 NodeId, int *fd)
{
	/* zero-initialised so no indeterminate bytes are passed to the kernel */
	struct kfd_ioctl_smi_events_args args = {0};
	HSAKMT_STATUS result;
	uint32_t gpuid;

	CHECK_KFD_OPEN();

	if (!fd)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	pr_debug("[%s] node %d\n", __func__, NodeId);

	result = hsakmt_validate_nodeid(NodeId, &gpuid);
	if (result != HSAKMT_STATUS_SUCCESS) {
		pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
		return result;
	}

	args.gpuid = gpuid;

	/* the ioctl result is an int, not an HSAKMT_STATUS: test it directly */
	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SMI_EVENTS, &args)) {
		pr_debug("open SMI event fd failed %s\n", strerror(errno));
		return HSAKMT_STATUS_ERROR;
	}

	*fd = args.anon_fd;

	return HSAKMT_STATUS_SUCCESS;
}
================================================
FILE: libhsakmt/src/fmm.c
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#define _GNU_SOURCE
#include "libhsakmt.h"
#include "fmm.h"
#include "hsakmt/hsakmtmodel.h"
#include "hsakmt/linux/kfd_ioctl.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "rbtree.h"
#include
#include
#include
#include
#include "hsakmt/linux/udmabuf.h"
#ifndef MPOL_F_STATIC_NODES
/* Bug in numaif.h, this should be defined in there. Definition copied
 * from linux/mempolicy.h.
 */
#define MPOL_F_STATIC_NODES (1 << 15)
#endif
/* gpu_id value that marks a gpu_mem[] entry as not holding a valid GPU */
#define NON_VALID_GPU_ID 0
/* Static initializer for a manageable_aperture_t covering
 * [base_value, limit_value]. The aperture starts with no tracked areas,
 * one guard page, no CPU access and the reserved-aperture allocator.
 * Members that are not listed (tree, user_tree) are zero-initialized.
 */
#define INIT_MANAGEABLE_APERTURE(base_value, limit_value) { \
.base = (void *) base_value, \
.limit = (void *) limit_value, \
.align = 0, \
.guard_pages = 1, \
.vm_ranges = NULL, \
.fmm_mutex = PTHREAD_MUTEX_INITIALIZER, \
.is_cpu_accessible = false, \
.ops = &reserved_aperture_ops \
}
/* Recover a pointer to the enclosing 'type' from a pointer to its 'member'.
 * Uses a GNU statement expression.
 */
#define container_of(ptr, type, member) ({ \
char *__mptr = (void *)(ptr); \
((type *)(__mptr - offsetof(type, member))); })
/* Same as container_of(), named for use with rbtree nodes */
#define rb_entry(ptr, type, member) \
container_of(ptr, type, member)
/* Map an rbtree node back to its vm_object_t: the node is vm_object::node
 * when is_userptr is 0 and vm_object::user_node otherwise.
 */
#define vm_object_entry(n, is_userptr) ({ \
(is_userptr) == 0 ? \
rb_entry(n, vm_object_t, node) : \
rb_entry(n, vm_object_t, user_node); })
/* Select the aperture tree that matches the lookup kind */
#define vm_object_tree(app, is_userptr) \
((is_userptr) ? &(app)->user_tree : &(app)->tree)
/* 1 << 47; used below as the base of mem_handle_aperture */
#define START_NON_CANONICAL_ADDR (1ULL << 47)
/* NOTE(review): built from unsigned long (UL) constants while the START
 * macro uses ULL; the two only agree where unsigned long is 64 bits wide.
 */
#define END_NON_CANONICAL_ADDR (~0UL - (1UL << 47))
/* Bookkeeping record for one allocation tracked inside a manageable
 * aperture. Every object is keyed into the aperture's primary tree through
 * 'node'; 'user_node' is only removed from the user tree on destruction
 * when 'userptr' is non-NULL.
 */
struct vm_object {
void *start; /* start address; forms node.key together with size */
void *userptr; /* address compared by userptr lookups; NULL at creation */
uint64_t userptr_size; /* size compared by userptr lookups; 0 at creation */
uint64_t size; /* size allocated on GPU. When the user requests a random
* size, Thunk aligns it to page size and allocates this
* aligned size on GPU
*/
uint32_t node_id;
rbtree_node_t node; /* link in manageable_aperture::tree */
rbtree_node_t user_node; /* link in manageable_aperture::user_tree */
HsaMemFlags mflags; /* memory allocation flags */
/* Registered nodes to map on SVM mGPU */
uint32_t *registered_device_id_array;
uint32_t registered_device_id_array_size;
uint32_t *registered_node_id_array;
uint32_t registration_count; /* the same memory region can be registered multiple times */
/* Nodes that mapped already */
uint32_t *mapped_device_id_array;
uint32_t mapped_device_id_array_size;
uint32_t *mapped_node_id_array;
uint32_t mapping_count;
/* Metadata of imported graphics buffers; freed together with the object */
void *metadata;
/* User data associated with the memory; not freed by vm_remove_object() */
void *user_data;
/* Flag to indicate imported KFD buffer */
bool is_imported_kfd_bo;
#ifdef SANITIZER_AMDGPU
int mmap_flags;
int mmap_fd;
off_t mmap_offset;
#endif
uint32_t handle_num; /* number of handles */
uint64_t handles[]; /* kfd handles array; flexible member sized at allocation time */
};
typedef struct vm_object vm_object_t;
/* One occupied address range of a reserved aperture. Areas form a doubly
 * linked list headed by manageable_aperture::vm_ranges.
 */
struct vm_area {
void *start; /* first address of the range */
void *end; /* last address of the range (inclusive) */
struct vm_area *next; /* following area, NULL for the last one */
struct vm_area *prev; /* preceding area, NULL for the list head */
};
typedef struct vm_area vm_area_t;
/* Memory manager for an aperture */
typedef struct manageable_aperture manageable_aperture_t;
/* Aperture management function pointers to allow different management
 * schemes.
 *
 * The ops tables in this file are filled positionally, so the member order
 * here must stay allocate_area_aligned first, release_area second.
 */
typedef struct {
/* Reserve 'size' bytes, at 'addr' when it is non-NULL; returns the address or NULL */
void *(*allocate_area_aligned)(manageable_aperture_t *aper, void *addr,
uint64_t size, uint64_t align);
/* Give back a range previously obtained from allocate_area_aligned */
void (*release_area)(manageable_aperture_t *aper,
void *addr, uint64_t size);
} manageable_aperture_ops_t;
/* Reserved aperture type managed by its own address allocator */
static void *reserved_aperture_allocate_aligned(manageable_aperture_t *aper,
void *addr,
uint64_t size, uint64_t align);
static void reserved_aperture_release(manageable_aperture_t *aper,
void *addr, uint64_t size);
static int bind_mem_to_numa(uint32_t node_id, void *mem,
uint64_t SizeInBytes, HsaMemFlags mflags);
/* Ops table for reserved apertures; this is the table installed by
 * INIT_MANAGEABLE_APERTURE.
 */
static const manageable_aperture_ops_t reserved_aperture_ops = {
reserved_aperture_allocate_aligned,
reserved_aperture_release
};
/* Unreserved aperture type using mmap to allocate virtual address space */
static void *mmap_aperture_allocate_aligned(manageable_aperture_t *aper,
void *addr,
uint64_t size, uint64_t align);
static void mmap_aperture_release(manageable_aperture_t *aper,
void *addr, uint64_t size);
/* Ops table for mmap-backed apertures; both callbacks refuse to work on an
 * aperture that is not CPU accessible.
 */
static const manageable_aperture_ops_t mmap_aperture_ops = {
mmap_aperture_allocate_aligned,
mmap_aperture_release
};
/* Address range whose contents are tracked by this file: occupied ranges
 * live in vm_ranges, allocated objects in the two trees.
 */
struct manageable_aperture {
void *base; /* first address of the aperture */
void *limit; /* last address of the aperture (range checks use <= limit) */
uint64_t align; /* default alignment used when a caller passes none */
uint32_t guard_pages; /* number of PAGE_SIZE guard pages added per allocation */
vm_area_t *vm_ranges; /* head of the occupied-area list (reserved apertures) */
rbtree_t tree; /* objects keyed by (start, size) */
rbtree_t user_tree; /* objects looked up by (userptr, userptr_size) */
pthread_mutex_t fmm_mutex; /* callers take this around area and object updates */
bool is_cpu_accessible; /* required by the mmap ops; enables remapping on release */
const manageable_aperture_ops_t *ops; /* address-space allocator in use */
};
/* Plain address range with no allocation tracking */
typedef struct {
void *base; /* start of the range */
void *limit; /* end of the range */
} aperture_t;
/* Per-GPU memory description; one entry of the gpu_mem[] array */
typedef struct {
uint32_t gpu_id; /* NON_VALID_GPU_ID marks an entry without a valid GPU */
uint32_t device_id;
uint32_t node_id; /* searched by gpu_mem_find_by_node_id() */
uint64_t local_mem_size;
HSA_ENGINE_ID EngineId;
aperture_t lds_aperture;
aperture_t scratch_aperture;
aperture_t mmio_aperture;
manageable_aperture_t scratch_physical; /* For dGPU, scratch physical is allocated from
* dgpu_aperture. When requested by RT, each
* GPU will get a different range
*/
manageable_aperture_t gpuvm_aperture; /* used for GPUVM on APU, outside the canonical address range */
int drm_render_fd;
uint32_t usable_peer_id_num;
uint32_t *usable_peer_id_array;
int drm_render_minor;
} gpu_mem_t;
/* Indices into svm_t::apertures */
enum svm_aperture_type {
SVM_DEFAULT = 0,
SVM_COHERENT,
SVM_APERTURE_NUM /* number of entries, used as the array size */
};
/* The main structure for dGPU Shared Virtual Memory Management */
typedef struct {
/* Two apertures can have different MTypes (for coherency) */
manageable_aperture_t apertures[SVM_APERTURE_NUM];
/* Pointers to apertures, may point to the same aperture on
 * GFXv9 and later, where MType is not based on apertures
 */
manageable_aperture_t *dgpu_aperture;
manageable_aperture_t *dgpu_alt_aperture;
/* whether to use userptr for paged memory */
bool userptr_for_paged_mem;
/* whether to check userptrs on registration */
bool check_userptr;
/* whether to check reserve svm on registration */
bool reserve_svm;
/* whether all memory is coherent (GPU cache disabled) */
bool disable_cache;
/* specifies the alignment size as PAGE_SIZE * 2^alignment_order;
 * read by mmap_aperture_allocate_aligned() as the upper bound for
 * automatic alignment growth
 */
uint32_t alignment_order;
} svm_t;
/* The other apertures are specific to each GPU. gpu_mem_t manages GPU
 * specific memory apertures.
 */
static gpu_mem_t *gpu_mem; /* array of gpu_mem_count entries */
static unsigned int gpu_mem_count;
/* Checked for NULL by the SVM ioctl helpers before they issue any request */
static gpu_mem_t *g_first_gpu_mem;
static void *dgpu_shared_aperture_base;
static void *dgpu_shared_aperture_limit;
/* SVM state. Members without an initializer (reserve_svm, alignment_order)
 * start out zeroed; both aperture pointers stay NULL until assigned
 * elsewhere in the file.
 */
static svm_t svm = {
.apertures = {INIT_MANAGEABLE_APERTURE(0, 0),
INIT_MANAGEABLE_APERTURE(0, 0)},
.dgpu_aperture = NULL,
.dgpu_alt_aperture = NULL,
.userptr_for_paged_mem = false,
.check_userptr = false,
.disable_cache = false,
};
/* On APU, for memory allocated on the system memory that GPU doesn't access
 * via GPU driver, they are not managed by GPUVM. cpuvm_aperture keeps track
 * of this part of memory.
 */
static manageable_aperture_t cpuvm_aperture = INIT_MANAGEABLE_APERTURE(0, 0);
/* mem_handle_aperture is used to generate memory handles
 * for allocations that don't have a valid virtual address
 * its size is 47bits: it spans 1 << 47 bytes starting at
 * START_NON_CANONICAL_ADDR.
 */
static manageable_aperture_t mem_handle_aperture = INIT_MANAGEABLE_APERTURE(START_NON_CANONICAL_ADDR, (START_NON_CANONICAL_ADDR + (1ULL << 47)));
/* GPU node array for default mappings */
static uint32_t all_gpu_id_array_size;
static uint32_t *all_gpu_id_array;
/* IPC structures and helper functions */
/* Aperture kinds that fmm_get_aperture() can resolve; UNSUPPORTED and any
 * unknown value resolve to NULL there.
 */
typedef enum _HSA_APERTURE {
HSA_APERTURE_UNSUPPORTED = 0,
HSA_APERTURE_DGPU,
HSA_APERTURE_DGPU_ALT,
HSA_APERTURE_GPUVM,
HSA_APERTURE_CPUVM,
HSA_APERTURE_MEMHANDLE
} HSA_APERTURE;
typedef struct _HsaApertureInfo {
HSA_APERTURE type; // Aperture type
HSAuint32 idx; // Aperture index (index into gpu_mem[] for HSA_APERTURE_GPUVM)
} HsaApertureInfo;
/* Internal view of an HsaSharedMemoryHandle; the cast helpers below
 * reinterpret handle pointers as this struct.
 * NOTE(review): presumably this must not be larger than
 * HsaSharedMemoryHandle - confirm against hsakmttypes.h.
 */
typedef struct _HsaSharedMemoryStruct {
HSAuint32 ShareHandle[4];
HsaApertureInfo ApeInfo;
HSAuint32 SizeInPages;
HSAuint32 ExportGpuId;
} HsaSharedMemoryStruct;
/* Reinterpret a read-only public shared-memory handle as the internal
 * HsaSharedMemoryStruct layout. No data is copied.
 */
static inline const HsaSharedMemoryStruct *to_const_hsa_shared_memory_struct(
			const HsaSharedMemoryHandle *SharedMemoryHandle)
{
	const HsaSharedMemoryStruct *internal_view =
		(const HsaSharedMemoryStruct *)SharedMemoryHandle;

	return internal_view;
}
/* Reinterpret a writable public shared-memory handle as the internal
 * HsaSharedMemoryStruct layout. No data is copied.
 */
static inline HsaSharedMemoryStruct *to_hsa_shared_memory_struct(
			HsaSharedMemoryHandle *SharedMemoryHandle)
{
	HsaSharedMemoryStruct *internal_view =
		(HsaSharedMemoryStruct *)SharedMemoryHandle;

	return internal_view;
}
/* Reinterpret the internal HsaSharedMemoryStruct as the public handle
 * type. Tagged unused so builds without a caller do not warn.
 */
__attribute__((unused))
static inline HsaSharedMemoryHandle *to_hsa_shared_memory_handle(
			HsaSharedMemoryStruct *SharedMemoryStruct)
{
	HsaSharedMemoryHandle *public_view =
		(HsaSharedMemoryHandle *)SharedMemoryStruct;

	return public_view;
}
/* Forward declarations for helpers that are defined later in this file */
static int __fmm_release(vm_object_t *object, manageable_aperture_t *aperture);
static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
manageable_aperture_t *aperture,
void *address);
static void print_device_id_array(uint32_t *device_id_array, uint32_t device_id_array_size);
/* Allocate a vm_area_t describing [start, end] that is not linked into any
 * list yet. Returns NULL when malloc fails.
 */
static vm_area_t *vm_create_and_init_area(void *start, void *end)
{
	vm_area_t *fresh = (vm_area_t *) malloc(sizeof(vm_area_t));

	if (!fresh)
		return NULL;

	fresh->prev = NULL;
	fresh->next = NULL;
	fresh->end = end;
	fresh->start = start;
	return fresh;
}
/* One huge page smaller than 512GB system buffer limit,
 * because 512GB allocation will cause TTM failure.
 */
#define BIGGEST_SINGLE_BUF_SIZE ((1ULL << 39) - GPU_HUGE_PAGE_SIZE)
/* Allocate and initialize a vm_object for the range [start, start + size).
 *
 * @start:  start address of the allocation; also the rbtree key address
 * @size:   size of the allocation; determines how many handle slots are
 *          reserved (one per BIGGEST_SINGLE_BUF_SIZE chunk, at least one)
 * @handle: first KFD handle, stored in handles[0]
 * @mflags: allocation flags recorded in the object
 *
 * The object is not inserted into any tree here. Returns NULL when malloc
 * fails.
 */
static vm_object_t *vm_create_and_init_object(void *start, uint64_t size,
					      uint64_t handle, HsaMemFlags mflags)
{
	vm_object_t *object;
	uint64_t handle_array_size = (size + BIGGEST_SINGLE_BUF_SIZE - 1) /
					BIGGEST_SINGLE_BUF_SIZE;

	/* handles[0] is written unconditionally below, so a zero size must
	 * still reserve one slot; otherwise the store lands past the
	 * allocation.
	 */
	if (handle_array_size == 0)
		handle_array_size = 1;

	object = (vm_object_t *) malloc(sizeof(vm_object_t) +
					handle_array_size * sizeof(uint64_t));
	if (object) {
		object->start = start;
		object->userptr = NULL;
		object->userptr_size = 0;
		object->size = size;
		/* callers assign the real node id later; avoid leaving it indeterminate */
		object->node_id = 0;
		object->handles[0] = handle;
		object->handle_num = 1;
		object->registered_device_id_array_size = 0;
		object->mapped_device_id_array_size = 0;
		object->registered_device_id_array = NULL;
		object->mapped_device_id_array = NULL;
		object->registered_node_id_array = NULL;
		object->mapped_node_id_array = NULL;
		object->registration_count = 0;
		object->mapping_count = 0;
		object->mflags = mflags;
		object->metadata = NULL;
		object->user_data = NULL;
		object->is_imported_kfd_bo = false;
		object->node.key = rbtree_key((unsigned long)start, size);
		object->user_node.key = rbtree_key(0, 0);
#ifdef SANITIZER_AMDGPU
		object->mmap_flags = 0;
		object->mmap_fd = 0;
		object->mmap_offset = 0;
#endif
	}
	return object;
}
/* Unlink 'area' from the aperture's occupied-area list and free it.
 * Handles removal of the head, a middle element and the tail.
 */
static void vm_remove_area(manageable_aperture_t *app, vm_area_t *area)
{
	vm_area_t *successor = area->next;
	vm_area_t *predecessor = area->prev;

	if (predecessor)
		predecessor->next = successor;
	else
		app->vm_ranges = successor;	/* area was the list head */

	if (successor)
		successor->prev = predecessor;

	free(area);
}
/* Drop 'object' from the aperture's trees and release it together with the
 * arrays and metadata it owns. The user tree is only touched for objects
 * that carry a userptr.
 */
static void vm_remove_object(manageable_aperture_t *app, vm_object_t *object)
{
	/* free(NULL) is a no-op, so the owned buffers need no individual checks */
	free(object->registered_device_id_array);
	free(object->mapped_device_id_array);
	free(object->metadata);
	free(object->registered_node_id_array);
	free(object->mapped_node_id_array);

	hsakmt_rbtree_delete(&app->tree, &object->node);
	if (object->userptr != NULL)
		hsakmt_rbtree_delete(&app->user_tree, &object->user_node);

	free(object);
}
/* Link 'new_area' into the list directly behind 'after_this' */
static void vm_add_area_after(vm_area_t *after_this, vm_area_t *new_area)
{
	vm_area_t *follower = after_this->next;

	new_area->prev = after_this;
	new_area->next = follower;
	after_this->next = new_area;

	if (follower != NULL)
		follower->prev = new_area;
}
/* Cut the range [address, address + MemorySizeInBytes) out of the middle
 * of 'area'.
 *
 * Afterwards 'area' covers [area->start, address - 1] and a newly created
 * area covers [address + MemorySizeInBytes, old area->end]. If the new
 * area cannot be allocated, an error is logged and 'area' is left as it
 * was.
 */
static void vm_split_area(manageable_aperture_t *app, vm_area_t *area,
			  void *address, uint64_t MemorySizeInBytes)
{
	void *tail_start = VOID_PTR_ADD(address, MemorySizeInBytes);
	vm_area_t *tail = vm_create_and_init_area(tail_start, area->end);

	if (!tail) {
		pr_err("[%s] Failed to create new area during split.", __func__);
		return;
	}

	/* The original area keeps the part in front of the hole */
	area->end = VOID_PTR_SUB(address, 1);
	vm_add_area_after(area, tail);
}
/* Look up the object that starts exactly at 'address'.
 *
 * @app:        aperture to search
 * @address:    start address to match; compared with vm_object::start, or
 *              with vm_object::userptr when is_userptr is non-zero
 * @size:       expected size; 0 means the size is not compared
 * @is_userptr: 0 searches app->tree, non-zero searches app->user_tree
 *
 * Returns NULL when the nearest node does not start at 'address', when a
 * non-zero 'size' differs from the node's size, or when 'size' is 0 and
 * the second (LEFT) lookup lands on a different node than the first one.
 */
static vm_object_t *vm_find_object_by_address_userptr(manageable_aperture_t *app,
const void *address, uint64_t size, int is_userptr)
{
vm_object_t *cur = NULL;
rbtree_t *tree = vm_object_tree(app, is_userptr);
rbtree_key_t key = rbtree_key((unsigned long)address, size);
void *start;
uint64_t s;
/* rbtree_lookup_nearest(,,,RIGHT) will return a node with
 * its size >= key.size and its address >= key.address
 * if there are two nodes with format(address, size),
 * (0x100, 16) and (0x110, 8). the key is (0x100, 0),
 * then node (0x100, 16) will be returned.
 */
rbtree_node_t *n = rbtree_lookup_nearest(tree, &key, LKP_ALL, RIGHT);
if (n) {
cur = vm_object_entry(n, is_userptr);
/* Pick the address/size pair that belongs to the tree being searched */
if (is_userptr == 0) {
start = cur->start;
s = cur->size;
} else {
start = cur->userptr;
s = cur->userptr_size;
}
if (start != address)
return NULL;
if (size)
return size == s ? cur : NULL;
/* size is 0, make sure there is only one node whose address == key.address*/
key = rbtree_key((unsigned long)address, (unsigned long)-1);
rbtree_node_t *rn = rbtree_lookup_nearest(tree, &key, LKP_ALL, LEFT);
if (rn != n)
return NULL;
}
return cur; /* NULL if not found */
}
/* Find the object whose range [start, start + size) contains 'address'.
 *
 * @app:        aperture to search
 * @address:    any address inside the wanted object
 * @is_userptr: 0 searches app->tree using start/size, non-zero searches
 *              app->user_tree using userptr/userptr_size
 *
 * The search starts at node 'rn' and walks towards smaller keys with
 * hsakmt_rbtree_prev() until node 'ln' has been examined. Returns the
 * first object that contains 'address', or NULL.
 */
static vm_object_t *vm_find_object_by_address_userptr_range(manageable_aperture_t *app,
const void *address, int is_userptr)
{
vm_object_t *cur = NULL;
rbtree_t *tree = vm_object_tree(app, is_userptr);
rbtree_key_t key = rbtree_key((unsigned long)address, 0);
rbtree_node_t *rn = rbtree_lookup_nearest(tree, &key, LKP_ALL, RIGHT);
rbtree_node_t *ln;
void *start;
uint64_t size;
/* all nodes might sit on left side of *address*, in this case rn is NULL.
 * So pick up the rightest one as rn.
 */
if (!rn)
rn = rbtree_min_max(tree, RIGHT);
if (is_userptr) {
/* userptr might overlap. Need walk through the tree from right to left as only left nodes
 * can obtain the *address*
 */
ln = rbtree_min_max(tree, LEFT);
} else {
/* if key->size is -1, it match the node with start <= address.
 * if key->size is 0, it match the node with start < address.
 */
key = rbtree_key((unsigned long)address, -1);
ln = rbtree_lookup_nearest(tree, &key, LKP_ALL, LEFT);
}
/* No left bound means there is nothing to walk */
if (!ln)
return NULL;
while (rn) {
cur = vm_object_entry(rn, is_userptr);
if (is_userptr == 0) {
start = cur->start;
size = cur->size;
} else {
start = cur->userptr;
size = cur->userptr_size;
}
/* Hit: address lies inside the half-open range of this object */
if (address >= start &&
(uint64_t)address < ((uint64_t)start + size))
break;
cur = NULL;
/* The left bound was just examined, stop the walk */
if (ln == rn)
break;
rn = hsakmt_rbtree_prev(tree, rn);
}
return cur; /* NULL if not found */
}
/* Exact-start lookup in the aperture's primary tree */
static vm_object_t *vm_find_object_by_address(manageable_aperture_t *app,
					const void *address, uint64_t size)
{
	const int search_user_tree = 0;

	return vm_find_object_by_address_userptr(app, address, size,
						 search_user_tree);
}
/* Containing-range lookup in the aperture's primary tree */
static vm_object_t *vm_find_object_by_address_range(manageable_aperture_t *app,
					const void *address)
{
	const int search_user_tree = 0;

	return vm_find_object_by_address_userptr_range(app, address,
						       search_user_tree);
}
/* Exact-start lookup in the aperture's userptr tree */
static vm_object_t *vm_find_object_by_userptr(manageable_aperture_t *app,
					const void *address, HSAuint64 size)
{
	const int search_user_tree = 1;

	return vm_find_object_by_address_userptr(app, address, size,
						 search_user_tree);
}
/* Containing-range lookup in the aperture's userptr tree */
static vm_object_t *vm_find_object_by_userptr_range(manageable_aperture_t *app,
					const void *address)
{
	const int search_user_tree = 1;

	return vm_find_object_by_address_userptr_range(app, address,
						       search_user_tree);
}
/* Return the occupied area whose inclusive range [start, end] holds
 * 'address', or NULL when no area of the aperture contains it.
 */
static vm_area_t *vm_find(manageable_aperture_t *app, void *address)
{
	vm_area_t *area;

	for (area = app->vm_ranges; area; area = area->next) {
		if (area->start <= address && area->end >= address)
			return area;
	}

	return NULL;
}
/* An aperture is usable when both bounds are non-NULL and the base lies
 * strictly below the limit.
 */
static bool aperture_is_valid(void *app_base, void *app_limit)
{
	return app_base && app_limit && app_base < app_limit;
}
/* Compute the footprint of an allocation inside a VM area.
 *
 * The aperture's guard pages are appended to the requested size so that
 * at least one guard page follows every object and out-of-bounds accesses
 * are caught by VM faults.
 */
static uint64_t vm_align_area_size(manageable_aperture_t *app, uint64_t size)
{
	const uint64_t guard_bytes = (uint64_t)app->guard_pages * PAGE_SIZE;

	return size + guard_bytes;
}
/*
 * Give an address range back to a reserved aperture.
 *
 * @app:               aperture that owns the range
 * @address:           start of the range being released
 * @MemorySizeInBytes: size of the range without guard pages; the guard
 *                     size is added back through vm_align_area_size()
 *                     before any bookkeeping is done
 *
 * Returns silently when no tracked area contains 'address'. For CPU
 * accessible apertures the range is additionally remapped PROT_NONE so the
 * addresses stay reserved.
 *
 * Assumes that fmm_mutex is locked on entry.
 */
static void reserved_aperture_release(manageable_aperture_t *app,
void *address,
uint64_t MemorySizeInBytes)
{
vm_area_t *area;
uint64_t SizeOfRegion;
MemorySizeInBytes = vm_align_area_size(app, MemorySizeInBytes);
area = vm_find(app, address);
if (!area)
return;
/* area->end is inclusive, hence the + 1 */
SizeOfRegion = VOID_PTRS_SUB(area->end, area->start) + 1;
/* check if block is whole region or part of it */
if (SizeOfRegion == MemorySizeInBytes) {
vm_remove_area(app, area);
} else if (SizeOfRegion > MemorySizeInBytes) {
/* shrink from the start */
if (area->start == address)
area->start =
VOID_PTR_ADD(area->start, MemorySizeInBytes);
/* shrink from the end */
else if (VOID_PTRS_SUB(area->end, address) + 1 ==
MemorySizeInBytes)
area->end = VOID_PTR_SUB(area->end, MemorySizeInBytes);
/* split the area */
else
vm_split_area(app, area, address, MemorySizeInBytes);
}
/* NOTE(review): when the area is smaller than the padded size, the area
 * list is left untouched but the remapping below still runs.
 */
if (app->is_cpu_accessible) {
void *mmap_ret;
/* Reset NUMA policy; the result of mbind is not checked */
mbind(address, MemorySizeInBytes, MPOL_DEFAULT, NULL, 0, 0);
/* Remove any CPU mapping, but keep the address range reserved */
mmap_ret = mmap(address, MemorySizeInBytes, PROT_NONE,
MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE | MAP_FIXED,
-1, 0);
if (mmap_ret == MAP_FAILED && errno == ENOMEM) {
/* When mmap count reaches max_map_count, any mmap will
 * fail. Reduce the count with munmap then map it as
 * NORESERVE immediately.
 */
if (munmap(address, MemorySizeInBytes) == 0) {
/* After unmapping, try mmap again and handle failure */
mmap_ret = mmap(address, MemorySizeInBytes, PROT_NONE,
MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE | MAP_FIXED,
-1, 0);
if (mmap_ret == MAP_FAILED) {
/* Only logged; the range is left unmapped in this case */
pr_err("Failed to remap memory after unmap\n");
}
} else {
/* Only logged; the previous mapping stays in place */
pr_err("Failed to unmap memory\n");
}
}
}
}
/*
 * Reserve an address range inside a reserved aperture.
 *
 * @app:               aperture to allocate from
 * @address:           fixed address to use, or NULL to let the allocator
 *                     pick the first gap that fits
 * @MemorySizeInBytes: requested size; guard pages are added internally
 * @align:             requested alignment; raised to app->align when
 *                     smaller, then grown as described below
 *
 * Walks app->vm_ranges in list order looking for a gap in front of an
 * existing area; if none fits, the space behind the last area is used.
 *
 * returns allocated address or NULL. Assumes, that fmm_mutex is locked
 * on entry.
 */
static void *reserved_aperture_allocate_aligned(manageable_aperture_t *app,
void *address,
uint64_t MemorySizeInBytes,
uint64_t align)
{
uint64_t offset = 0, orig_align = align;
vm_area_t *cur, *next;
void *start;
if (align < app->align)
align = app->align;
/* Align big buffers to the next power-of-2 up to huge page
 * size for flexible fragment size TLB optimizations
 *
 * NOTE(review): if align is still 0 at this point the doubling never
 * makes progress - confirm app->align is always non-zero by the time
 * this runs.
 */
while (align < GPU_HUGE_PAGE_SIZE && MemorySizeInBytes >= (align << 1))
align <<= 1;
/* If no specific alignment was requested, align the end of
 * buffers instead of the start. For fragment optimizations,
 * aligning the start or the end achieves the same effective
 * optimization. End alignment to the TLB cache line size is
 * needed as a workaround for TLB issues on some older GPUs.
 */
if (orig_align <= (uint64_t)PAGE_SIZE)
offset = align - (MemorySizeInBytes & (align - 1));
MemorySizeInBytes = vm_align_area_size(app, MemorySizeInBytes);
/* Find a big enough "hole" in the address space */
cur = NULL;
next = app->vm_ranges;
start = address ? address :
(void *)(ALIGN_UP((uint64_t)app->base, align) + offset);
while (next) {
if (next->start > start &&
VOID_PTRS_SUB(next->start, start) >= MemorySizeInBytes)
break;
cur = next;
next = next->next;
/* Without a fixed address the candidate moves behind the area just skipped */
if (!address)
start = (void *)(ALIGN_UP((uint64_t)cur->end + 1, align) + offset);
}
if (!next && VOID_PTRS_SUB(app->limit, start) + 1 < MemorySizeInBytes)
/* No hole found and not enough space after the last area */
return NULL;
if (cur && address && address < (void *)ALIGN_UP((uint64_t)cur->end + 1, align))
/* Required address is not free or overlaps */
return NULL;
if (cur && VOID_PTR_ADD(cur->end, 1) == start) {
/* extend existing area */
cur->end = VOID_PTR_ADD(start, MemorySizeInBytes-1);
} else {
vm_area_t *new_area;
/* create a new area between cur and next */
new_area = vm_create_and_init_area(start,
VOID_PTR_ADD(start, (MemorySizeInBytes - 1)));
if (!new_area)
return NULL;
new_area->next = next;
new_area->prev = cur;
if (cur)
cur->next = new_area;
else
app->vm_ranges = new_area;
if (next)
next->prev = new_area;
}
return start;
}
/* Map 'size' bytes at an address aligned to 'align' inside an aperture.
 *
 * @prot:       protection for the final mapping; PROT_NONE only reserves
 *              address space
 * @flags:      mmap flags for both the reservation and the final mapping
 * @size:       number of bytes to map
 * @align:      required alignment of the returned address
 *              (NOTE(review): the padding computation subtracts PAGE_SIZE,
 *              so this presumably must be at least PAGE_SIZE - confirm)
 * @guard_size: extra bytes reserved on each side while searching; they are
 *              unmapped again before returning
 * @aper_base:  lowest acceptable address
 * @aper_limit: highest acceptable address of the last mapped byte
 * @fd:         file descriptor for the final mapping (offset 0)
 *
 * An oversized PROT_NONE region is reserved first, the aligned window is
 * picked inside it and the surplus at both ends is unmapped. Returns the
 * mapped address, or NULL on failure, in which case nothing stays mapped.
 */
void *hsakmt_mmap_allocate_aligned(int prot, int flags, uint64_t size, uint64_t align,
			uint64_t guard_size, void *aper_base, void *aper_limit, int fd)
{
	void *addr, *aligned_addr, *aligned_end, *mapping_end;
	uint64_t aligned_padded_size;

	aligned_padded_size = size + guard_size * 2 + (align - PAGE_SIZE);

	/* Map memory PROT_NONE to alloc address space only */
	addr = mmap(0, aligned_padded_size, PROT_NONE, flags | MAP_ANONYMOUS, -1, 0);
	if (addr == MAP_FAILED) {
		pr_err("mmap failed: %s\n", strerror(errno));
		return NULL;
	}

	/* Adjust for alignment and guard pages */
	aligned_addr = (void *)ALIGN_UP((uint64_t)addr + guard_size, align);
	if (aligned_addr < aper_base ||
	    VOID_PTR_ADD(aligned_addr, size - 1) > aper_limit) {
		pr_err("mmap returned %p, out of range %p-%p\n", aligned_addr,
		       aper_base, aper_limit);
		munmap(addr, aligned_padded_size);
		return NULL;
	}

	/* Unmap padding and guard pages */
	if (aligned_addr > addr)
		munmap(addr, VOID_PTRS_SUB(aligned_addr, addr));

	aligned_end = VOID_PTR_ADD(aligned_addr, size);
	mapping_end = VOID_PTR_ADD(addr, aligned_padded_size);
	if (mapping_end > aligned_end)
		munmap(aligned_end, VOID_PTRS_SUB(mapping_end, aligned_end));

	if (prot == PROT_NONE)
		return aligned_addr;

	/* MAP_FIXED to the aligned address with required prot */
	addr = mmap(aligned_addr, size, prot, flags | MAP_FIXED, fd, 0);
	if (addr == MAP_FAILED) {
		pr_err("mmap failed: %s\n", strerror(errno));
		/* The caller only sees NULL and cannot release the PROT_NONE
		 * reservation, so drop it here instead of leaking it.
		 */
		munmap(aligned_addr, size);
		return NULL;
	}

	return addr;
}
/* Allocate address space for an mmap-managed aperture.
 *
 * @aper:    aperture to allocate from; must be CPU accessible
 * @address: fixed address to reserve, or NULL to let the kernel choose
 * @size:    number of bytes to reserve
 * @align:   minimum alignment; only used when 'address' is NULL
 *
 * Returns the reserved PROT_NONE address or NULL on failure.
 */
static void *mmap_aperture_allocate_aligned(manageable_aperture_t *aper,
					    void *address,
					    uint64_t size, uint64_t align)
{
	uint64_t alignment_size = PAGE_SIZE << svm.alignment_order;
	uint64_t guard_size;

	if (!aper->is_cpu_accessible) {
		pr_err("MMap Aperture must be CPU accessible\n");
		return NULL;
	}

	if (address) {
		void *addr;
		int map_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;

#ifdef MAP_FIXED_NOREPLACE
		map_flags |= MAP_FIXED_NOREPLACE;
#endif
		addr = mmap(address, size, PROT_NONE, map_flags, -1, 0);
		if (addr == MAP_FAILED) {
			pr_err("mmap failed: %s\n", strerror(errno));
			return NULL;
		}

		/* Kernels that do not know MAP_FIXED_NOREPLACE treat the
		 * address as a hint and may map elsewhere, so the result is
		 * verified even when the flag was available at build time.
		 */
		if (address != addr) {
			pr_err("mmap failed to return addr asked\n");
			munmap(addr, size);
			return NULL;
		}

		return addr;
	}

	/* Align big buffers to the next power-of-2. By default, the max alignment
	 * size is set to 2MB. This can be modified by the env variable
	 * HSA_MAX_VA_ALIGN. This variable sets the order of the alignment size as
	 * PAGE_SIZE * 2^HSA_MAX_VA_ALIGN. Setting HSA_MAX_VA_ALIGN = 18 (1GB),
	 * improves the time for memory allocation and mapping. But it might lose
	 * performance when GFX access it, specially for big allocations (>3GB).
	 */
	while (align < alignment_size && size >= (align << 1))
		align <<= 1;

	/* Add padding to guarantee proper alignment and leave guard
	 * pages on both sides
	 */
	guard_size = (uint64_t)aper->guard_pages * PAGE_SIZE;

	return hsakmt_mmap_allocate_aligned(PROT_NONE, MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE,
					    size, align, guard_size, aper->base, aper->limit, -1);
}
/* Release a range obtained from an mmap-managed aperture: the NUMA policy
 * is reset to the default and the range is unmapped. Apertures that are
 * not CPU accessible are rejected with an error message.
 */
static void mmap_aperture_release(manageable_aperture_t *aper,
				  void *addr, uint64_t size)
{
	if (aper->is_cpu_accessible) {
		/* Reset NUMA policy before the mapping goes away */
		mbind(addr, size, MPOL_DEFAULT, NULL, 0, 0);
		munmap(addr, size);
		return;
	}

	pr_err("MMap Aperture must be CPU accessible\n");
}
/* Wrapper functions to call aperture-specific VA management functions */

/* Allocate through the aperture's ops table; an 'align' of 0 selects the
 * aperture's default alignment.
 */
static void *aperture_allocate_area_aligned(manageable_aperture_t *app,
					    void *address,
					    uint64_t MemorySizeInBytes,
					    uint64_t align)
{
	uint64_t effective_align = align;

	if (!effective_align)
		effective_align = app->align;

	return app->ops->allocate_area_aligned(app, address, MemorySizeInBytes,
					       effective_align);
}
/* Allocate through the aperture's ops table using its default alignment */
static void *aperture_allocate_area(manageable_aperture_t *app, void *address,
				    uint64_t MemorySizeInBytes)
{
	const manageable_aperture_ops_t *va_ops = app->ops;

	return va_ops->allocate_area_aligned(app, address, MemorySizeInBytes,
					     app->align);
}
/* Release a range through the aperture's ops table */
static void aperture_release_area(manageable_aperture_t *app, void *address,
				  uint64_t MemorySizeInBytes)
{
	const manageable_aperture_ops_t *va_ops = app->ops;

	va_ops->release_area(app, address, MemorySizeInBytes);
}
/* Create a vm_object for [new_address, new_address + MemorySizeInBytes)
 * and insert it into the aperture's primary tree.
 *
 * Returns the new object, or NULL when it could not be allocated.
 * Assumes that fmm_mutex is locked on entry.
 */
static vm_object_t *aperture_allocate_object(manageable_aperture_t *app,
					     void *new_address,
					     uint64_t handle,
					     uint64_t MemorySizeInBytes,
					     HsaMemFlags mflags)
{
	vm_object_t *created = vm_create_and_init_object(new_address,
							 MemorySizeInBytes,
							 handle, mflags);

	if (created)
		hsakmt_rbtree_insert(&app->tree, &created->node);

	return created;
}
/* Return the gpu_mem[] index of the entry with the given gpu_id, or -1
 * when no entry matches.
 */
static int32_t gpu_mem_find_by_gpu_id(uint32_t gpu_id)
{
	uint32_t idx = 0;

	while (idx < gpu_mem_count) {
		if (gpu_mem[idx].gpu_id == gpu_id)
			return idx;
		idx++;
	}

	return -1;
}
/* Return the gpu_mem[] index of the entry with the given node_id, or -1
 * when no entry matches.
 */
static int32_t gpu_mem_find_by_node_id(uint32_t node_id)
{
	uint32_t idx = 0;

	while (idx < gpu_mem_count) {
		if (gpu_mem[idx].node_id == node_id)
			return idx;
		idx++;
	}

	return -1;
}
/* Resolve an aperture descriptor to the aperture it names.
 *
 * @info: aperture type plus, for HSA_APERTURE_GPUVM, the gpu_mem[] index
 *
 * Returns the aperture, or NULL for an unsupported type or a GPUVM index
 * that is outside gpu_mem[]. The dGPU results are whatever the svm
 * pointers currently hold and may therefore be NULL as well.
 */
static manageable_aperture_t *fmm_get_aperture(HsaApertureInfo info)
{
	switch (info.type) {
	case HSA_APERTURE_DGPU:
		return svm.dgpu_aperture;
	case HSA_APERTURE_DGPU_ALT:
		return svm.dgpu_alt_aperture;
	case HSA_APERTURE_GPUVM:
		/* The index is carried inside shared-memory handles, so it
		 * must be validated before it is used to index gpu_mem[].
		 */
		if (info.idx >= gpu_mem_count)
			return NULL;
		return &gpu_mem[info.idx].gpuvm_aperture;
	case HSA_APERTURE_CPUVM:
		return &cpuvm_aperture;
	case HSA_APERTURE_MEMHANDLE:
		return &mem_handle_aperture;
	default:
		return NULL;
	}
}
static gpu_mem_t *fmm_is_scratch_aperture(const void *address)
{
uint32_t i;
for (i = 0; i < gpu_mem_count; i++) {
if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID)
continue;
if ((address >= gpu_mem[i].scratch_physical.base) &&
(address <= gpu_mem[i].scratch_physical.limit))
return &gpu_mem[i];
}
return NULL;
}
/* Inclusive range test shared by the aperture checks below */
static inline bool fmm_addr_within(const void *address, const void *base,
				   const void *limit)
{
	return address >= base && address <= limit;
}

/* Determine which aperture an address belongs to.
 *
 * @address: address to classify
 * @info:    optional output; receives the aperture type and, for GPUVM
 *           apertures, the gpu_mem[] index
 *
 * The memory-handle aperture is checked first. On dGPU, addresses inside
 * the dGPU aperture that fall into a GPU's scratch range resolve to that
 * scratch aperture (the reported type then stays UNSUPPORTED); addresses
 * outside both SVM apertures are attributed to the dGPU aperture because
 * they may be userptr registrations. On APU, the fallback is the CPUVM
 * aperture.
 */
static manageable_aperture_t *fmm_find_aperture(const void *address,
						HsaApertureInfo *info)
{
	HsaApertureInfo found = { .type = HSA_APERTURE_UNSUPPORTED, .idx = 0};
	manageable_aperture_t *result = NULL;
	uint32_t idx;

	if (fmm_addr_within(address, mem_handle_aperture.base,
			    mem_handle_aperture.limit)) {
		found.type = HSA_APERTURE_MEMHANDLE;
		result = &mem_handle_aperture;
	} else if (hsakmt_is_dgpu) {
		if (fmm_addr_within(address, svm.dgpu_aperture->base,
				    svm.dgpu_aperture->limit)) {
			gpu_mem_t *scratch_owner = fmm_is_scratch_aperture(address);

			if (!scratch_owner) {
				found.type = HSA_APERTURE_DGPU;
				result = svm.dgpu_aperture;
			} else {
				result = &scratch_owner->scratch_physical;
			}
		} else if (fmm_addr_within(address, svm.dgpu_alt_aperture->base,
					   svm.dgpu_alt_aperture->limit)) {
			found.type = HSA_APERTURE_DGPU_ALT;
			result = svm.dgpu_alt_aperture;
		} else {
			/* Not in SVM, it can be system memory registered by userptr */
			found.type = HSA_APERTURE_DGPU;
			result = svm.dgpu_aperture;
		}
	} else {
		/* APU */
		if (fmm_addr_within(address, svm.dgpu_aperture->base,
				    svm.dgpu_aperture->limit)) {
			found.type = HSA_APERTURE_DGPU;
			result = svm.dgpu_aperture;
		} else {
			/* Scan every gpuvm_aperture; the last matching entry wins */
			for (idx = 0; idx < gpu_mem_count; idx++) {
				if (!fmm_addr_within(address,
						     gpu_mem[idx].gpuvm_aperture.base,
						     gpu_mem[idx].gpuvm_aperture.limit))
					continue;
				result = &gpu_mem[idx].gpuvm_aperture;
				found.type = HSA_APERTURE_GPUVM;
				found.idx = idx;
			}
		}

		if (result == NULL) {
			/* Not in GPUVM */
			found.type = HSA_APERTURE_CPUVM;
			result = &cpuvm_aperture;
		}
	}

	if (info != NULL)
		*info = found;

	return result;
}
/* Derive HsaMemFlags from KFD allocation ioctl flags.
 *
 * ReadOnly and CoarseGrain are set when the WRITABLE respectively COHERENT
 * ioctl flag is absent; ExtendedCoherent and HostAccess mirror the
 * EXT_COHERENT and PUBLIC ioctl flags. All other HSA flags stay 0.
 */
static HsaMemFlags fmm_translate_ioc_to_hsa_flags(uint32_t ioc_flags)
{
	HsaMemFlags translated = {0};

	translated.ui32.ReadOnly =
		!(ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE);
	translated.ui32.CoarseGrain =
		!(ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT);
	translated.ui32.ExtendedCoherent =
		!!(ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT);
	translated.ui32.HostAccess =
		!!(ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC);

	return translated;
}
static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
uint64_t size,
bool coarse_grain,
bool ext_coherent)
{
struct kfd_ioctl_svm_args *args;
size_t s_attr;
HSAuint32 page_offset = (HSAuint64)address & (PAGE_SIZE-1);
HSAuint64 aligned_addr = (HSAuint64)address - page_offset;
HSAuint64 aligned_size = PAGE_ALIGN_UP(page_offset + size);
if (!g_first_gpu_mem)
return HSAKMT_STATUS_ERROR;
s_attr = 2 * sizeof(struct kfd_ioctl_svm_attribute);
args = alloca(sizeof(*args) + s_attr);
args->start_addr = aligned_addr;
args->size = aligned_size;
args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
args->nattr = 2;
args->attrs[0].type = coarse_grain ?
HSA_SVM_ATTR_CLR_FLAGS : HSA_SVM_ATTR_SET_FLAGS;
args->attrs[0].value = HSA_SVM_FLAG_COHERENT;
args->attrs[1].type = ext_coherent ? HSA_SVM_ATTR_SET_FLAGS : HSA_SVM_ATTR_CLR_FLAGS ;
args->attrs[1].value = HSA_SVM_FLAG_EXT_COHERENT;
pr_debug("Registering to SVM %p size: %ld\n", (void*)aligned_addr,
aligned_size);
/* Driver does one copy_from_user, with extra attrs size */
if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args)) {
pr_debug("op set range attrs failed %s\n", strerror(errno));
return HSAKMT_STATUS_ERROR;
}
return HSAKMT_STATUS_SUCCESS;
}
/* Request in-place access to a range for a set of nodes through the KFD
 * SVM API.
 *
 * @address:          start of the range, passed through unchanged
 * @size:             size of the range, passed through unchanged
 * @nodes_to_map:     values placed in the ACCESS_IN_PLACE attributes
 * @nodes_array_size: number of entries in nodes_to_map
 *
 * Returns HSAKMT_STATUS_ERROR when no GPU memory has been set up or the
 * ioctl fails, HSAKMT_STATUS_SUCCESS otherwise.
 */
static HSAKMT_STATUS fmm_map_mem_svm_api(void *address,
					 uint64_t size,
					 uint32_t *nodes_to_map,
					 uint32_t nodes_array_size)
{
	struct kfd_ioctl_svm_args *svm_args;
	size_t attrs_bytes;
	uint32_t attr_count, idx;

	if (!g_first_gpu_mem)
		return HSAKMT_STATUS_ERROR;

	/* One attribute per node, stored right behind the fixed args */
	attr_count = nodes_array_size;
	attrs_bytes = sizeof(struct kfd_ioctl_svm_attribute) * attr_count;
	svm_args = alloca(sizeof(*svm_args) + attrs_bytes);

	svm_args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
	svm_args->start_addr = (uint64_t)address;
	svm_args->size = size;
	svm_args->nattr = attr_count;

	idx = 0;
	while (idx < nodes_array_size) {
		svm_args->attrs[idx].type = HSA_SVM_ATTR_ACCESS_IN_PLACE;
		svm_args->attrs[idx].value = nodes_to_map[idx];
		idx++;
	}

	/* Driver does one copy_from_user, with extra attrs size */
	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (attrs_bytes << _IOC_SIZESHIFT), svm_args) == 0)
		return HSAKMT_STATUS_SUCCESS;

	pr_debug("op set range attrs failed %s\n", strerror(errno));
	return HSAKMT_STATUS_ERROR;
}
/* After allocating the memory, return the vm_object created for this memory.
 * Return NULL if any failure.
 *
 * @gpu_id:            GPU to allocate on
 * @mem:               virtual address of the allocation; NULL is rejected
 * @MemorySizeInBytes: total size to allocate
 * @aperture:          aperture that will track the resulting object
 * @mmap_offset:       for userptr allocations, input holding the user
 *                     address (required); when non-NULL it receives the
 *                     mmap offset of the first buffer on output
 * @ioc_flags:         KFD_IOC_ALLOC_MEM_FLAGS_* for the ioctl
 *
 * Userptr allocations larger than BIGGEST_SINGLE_BUF_SIZE are split into
 * several KFD buffers whose handles are all stored in the one object. On
 * failure every buffer allocated so far is freed again.
 */
static vm_object_t *fmm_allocate_memory_object(uint32_t gpu_id, void *mem,
						uint64_t MemorySizeInBytes,
						manageable_aperture_t *aperture,
						uint64_t *mmap_offset,
						uint32_t ioc_flags)
{
	struct kfd_ioctl_alloc_memory_of_gpu_args args = {0};
	struct kfd_ioctl_free_memory_of_gpu_args free_args = {0};
	vm_object_t *vm_obj = NULL;
	HsaMemFlags mflags;
	uint64_t offset = 0, total_size, size;

	if (!mem)
		return NULL;

	/* The userptr path reads *mmap_offset below; reject a missing
	 * pointer instead of dereferencing NULL.
	 */
	if ((ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) && !mmap_offset)
		return NULL;

	/* Allocate memory from amdkfd */
	args.gpu_id = gpu_id;
	args.flags = ioc_flags |
		KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE;
	args.va_addr = (uint64_t)mem;
	if (!hsakmt_is_dgpu &&
	    (ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM))
		args.va_addr = VOID_PTRS_SUB(mem, aperture->base);

	/* if allocate vram-only, use an invalid VA */
	if (aperture == &mem_handle_aperture)
		args.va_addr = 0;

	total_size = 0;
	/* Split to multiple buffers, if size is too big */
	if (ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
		size = MemorySizeInBytes < BIGGEST_SINGLE_BUF_SIZE ?
			MemorySizeInBytes : BIGGEST_SINGLE_BUF_SIZE;
		offset = *mmap_offset;
		args.mmap_offset = *mmap_offset;
	} else {
		size = MemorySizeInBytes;
	}

	mflags = fmm_translate_ioc_to_hsa_flags(ioc_flags);

	do {
		args.size = size;
		if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, &args))
			goto err_hsakmt_ioctl_failed;

		/* Allocate object */
		if (!vm_obj) {
			pthread_mutex_lock(&aperture->fmm_mutex);
			vm_obj = aperture_allocate_object(aperture, mem, args.handle,
							  MemorySizeInBytes, mflags);
			pthread_mutex_unlock(&aperture->fmm_mutex);
			if (!vm_obj)
				goto err_object_allocation_failed;
			if (mmap_offset)
				*mmap_offset = args.mmap_offset;
		} else {
			vm_obj->handles[vm_obj->handle_num++] = args.handle;
		}

		/* Advance to the next chunk */
		args.va_addr += size;
		offset += size;
		if (ioc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR)
			args.mmap_offset = offset;
		total_size += size;
		if (total_size + BIGGEST_SINGLE_BUF_SIZE > MemorySizeInBytes)
			size = MemorySizeInBytes - total_size;
		else
			size = BIGGEST_SINGLE_BUF_SIZE;
	} while (total_size < MemorySizeInBytes);

	return vm_obj;

err_object_allocation_failed:
	/* The first buffer exists but has no object tracking it yet */
	free_args.handle = args.handle;
	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args)) {
		pr_err("Failed to free GPU memory with handle: 0x%llx\n", free_args.handle);
	}
err_hsakmt_ioctl_failed:
	if (vm_obj) {
		/* Free every buffer recorded so far, newest first */
		do {
			free_args.handle = vm_obj->handles[--vm_obj->handle_num];
			if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args))
				pr_err("Failed to free GPU memory with handle: 0x%llx\n", free_args.handle);
		} while (vm_obj->handle_num);

		pthread_mutex_lock(&aperture->fmm_mutex);
		vm_remove_object(aperture, vm_obj);
		pthread_mutex_unlock(&aperture->fmm_mutex);
	}

	return NULL;
}
#ifdef DEBUG_PRINT_APERTURE
/* Log base and limit of a plain aperture */
static void aperture_print(aperture_t *app)
{
	void *base = app->base;
	void *limit = app->limit;

	pr_info("\t Base: %p\n", base);
	pr_info("\t Limit: %p\n", limit);
}
/* Log bounds, occupied ranges and tracked objects of a manageable
 * aperture. Objects are taken from the primary tree only.
 */
static void manageable_aperture_print(manageable_aperture_t *app)
{
	vm_area_t *area;
	rbtree_node_t *it;
	vm_object_t *obj;

	pr_info("\t Base: %p\n", app->base);
	pr_info("\t Limit: %p\n", app->limit);

	pr_info("\t Ranges:\n");
	for (area = app->vm_ranges; area; area = area->next)
		pr_info("\t\t Range [%p - %p]\n", area->start, area->end);

	pr_info("\t Objects:\n");
	for (it = rbtree_node_any(&app->tree, LEFT); it;
	     it = hsakmt_rbtree_next(&app->tree, it)) {
		obj = vm_object_entry(it, 0);
		pr_info("\t\t Object [%p - %" PRIu64 "]\n",
			obj->start, obj->size);
	}
}
void hsakmt_fmm_print(uint32_t gpu_id)
{
int32_t gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
if (gpu_mem_id >= 0) { /* Found */
pr_info("LDS aperture:\n");
aperture_print(&gpu_mem[gpu_mem_id].lds_aperture);
pr_info("GPUVM aperture:\n");
manageable_aperture_print(&gpu_mem[gpu_mem_id].gpuvm_aperture);
pr_info("Scratch aperture:\n");
aperture_print(&gpu_mem[gpu_mem_id].scratch_aperture);
pr_info("Scratch backing memory:\n");
manageable_aperture_print(&gpu_mem[gpu_mem_id].scratch_physical);
}
pr_info("dGPU aperture:\n");
manageable_aperture_print(svm.dgpu_aperture);
pr_info("dGPU alt aperture:\n");
if (svm.dgpu_aperture == svm.dgpu_alt_aperture)
pr_info("\t Alias of dGPU aperture\n");
else
manageable_aperture_print(svm.dgpu_alt_aperture);
}
#else
/* Stub used when DEBUG_PRINT_APERTURE is not defined: the aperture dump is
 * compiled out and this function does nothing. @gpu_id is ignored.
 */
void hsakmt_fmm_print(uint32_t gpu_id)
{
}
#endif
/* vm_find_object - Find a VM object in any aperture
 *
 * @addr: VM address of the object
 * @size: size of the object, 0 means "don't care",
 *        UINT64_MAX means addr can match any address within the object
 * @out_aper: Aperture where the object was found
 *
 * The aperture to search is chosen by testing @addr against, in order:
 * each valid GPU's GPUVM aperture, the memory-handle aperture, the dGPU
 * aperture and the dGPU alt aperture. An address outside all of them is
 * treated as a userptr and looked up in the dGPU aperture. When nothing
 * is found and the system is not a dGPU system, the CPUVM aperture is
 * searched as a last resort.
 *
 * Returns a pointer to the object if found, NULL otherwise. If an
 * object is found, this function returns with the
 * (*out_aper)->fmm_mutex locked. On failure no mutex is held and
 * *out_aper is left untouched.
 */
static vm_object_t *vm_find_object(const void *addr, uint64_t size,
				   manageable_aperture_t **out_aper)
{
	manageable_aperture_t *aper = NULL;
	bool range = (size == UINT64_MAX);
	bool userptr = false;
	vm_object_t *obj = NULL;
	uint32_t i;

	/* First candidate: the GPUVM aperture of any valid GPU that
	 * contains addr
	 */
	for (i = 0; i < gpu_mem_count; i++)
		if (gpu_mem[i].gpu_id != NON_VALID_GPU_ID &&
		    addr >= gpu_mem[i].gpuvm_aperture.base &&
		    addr <= gpu_mem[i].gpuvm_aperture.limit) {
			aper = &gpu_mem[i].gpuvm_aperture;
			break;
		}

	/* Second candidate: the memory-handle aperture */
	if (!aper) {
		if ((addr >= mem_handle_aperture.base) &&
		    (addr <= mem_handle_aperture.limit)){
			aper = &mem_handle_aperture;
		}
	}

	/* Third candidate: the SVM apertures, if they exist */
	if (!aper) {
		if (!svm.dgpu_aperture)
			/* No SVM aperture: skip to the CPUVM fallback
			 * with aper still NULL and no lock held
			 */
			goto no_svm;

		if ((addr >= svm.dgpu_aperture->base) &&
		    (addr <= svm.dgpu_aperture->limit))
			aper = svm.dgpu_aperture;
		else if ((addr >= svm.dgpu_alt_aperture->base) &&
			 (addr <= svm.dgpu_alt_aperture->limit))
			aper = svm.dgpu_alt_aperture;
		else {
			/* Address lies outside every aperture: treat it
			 * as a userptr and search the dGPU aperture
			 */
			aper = svm.dgpu_aperture;
			userptr = true;
		}
	}

	pthread_mutex_lock(&aper->fmm_mutex);
	if (range) {
		/* mmap_apertures can have userptrs in them. Try to
		 * look up addresses as userptrs first to sort out any
		 * ambiguity of multiple overlapping mappings at
		 * different GPU addresses.
		 */
		if (userptr || aper->ops == &mmap_aperture_ops)
			obj = vm_find_object_by_userptr_range(aper, addr);
		if (!obj && !userptr)
			obj = vm_find_object_by_address_range(aper, addr);
	} else {
		if (userptr || aper->ops == &mmap_aperture_ops)
			obj = vm_find_object_by_userptr(aper, addr, size);
		if (!obj && !userptr) {
			/* Look up by the page-aligned address; the offset
			 * within the page is checked separately below
			 */
			long page_offset = (long)addr & (PAGE_SIZE-1);
			const void *page_addr = (const uint8_t *)addr - page_offset;

			obj = vm_find_object_by_address(aper, page_addr, 0);
			/* If we find a userptr here, it's a match on
			 * the aligned GPU address. Make sure that the
			 * page offset and size match too.
			 */
			if (obj && obj->userptr &&
			    (((long)obj->userptr & (PAGE_SIZE - 1)) != page_offset ||
			     (size && size != obj->userptr_size)))
				obj = NULL;
		}
	}

no_svm:
	if (!obj && !hsakmt_is_dgpu) {
		/* On APUs try finding it in the CPUVM aperture */
		/* Drop the lock of the aperture searched so far (if any)
		 * before taking the CPUVM aperture's lock
		 */
		if (aper)
			pthread_mutex_unlock(&aper->fmm_mutex);
		aper = &cpuvm_aperture;
		pthread_mutex_lock(&aper->fmm_mutex);
		if (range)
			obj = vm_find_object_by_address_range(aper, addr);
		else
			obj = vm_find_object_by_address(aper, addr, 0);
	}

	if (obj) {
		/* Success: return with aper->fmm_mutex still held */
		*out_aper = aper;
		return obj;
	}

	/* Failure: release whichever lock is currently held. aper is NULL
	 * only when the no_svm jump was taken on a dGPU system.
	 */
	if (aper)
		pthread_mutex_unlock(&aper->fmm_mutex);
	return NULL;
}
/* Touch one byte in every page of [addr, addr + size) through a volatile
 * pointer so an invalid mapping faults right here, where it is easy to
 * debug. Returns the 8-bit sum of the bytes read, so the reads have a use.
 */
static HSAuint8 fmm_check_user_memory(const void *addr, HSAuint64 size)
{
	volatile const HSAuint8 *cursor = addr;
	volatile const HSAuint8 *stop = cursor + size;
	HSAuint8 checksum = 0;

	while (cursor < stop) {
		checksum += *cursor;
		/* Jump to the start of the next page */
		cursor = (void *)PAGE_ALIGN_UP(cursor + 1);
	}

	return checksum;
}
static void fmm_release_scratch(uint32_t gpu_id)
{
int32_t gpu_mem_id;
uint64_t size;
vm_object_t *obj;
manageable_aperture_t *aperture;
rbtree_node_t *n;
gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
if (gpu_mem_id < 0)
return;
aperture = &gpu_mem[gpu_mem_id].scratch_physical;
size = VOID_PTRS_SUB(aperture->limit, aperture->base) + 1;
if (hsakmt_is_dgpu) {
/* unmap and remove all remaining objects */
pthread_mutex_lock(&aperture->fmm_mutex);
while ((n = rbtree_node_any(&aperture->tree, MID))) {
obj = vm_object_entry(n, 0);
void *obj_addr = obj->start;
pthread_mutex_unlock(&aperture->fmm_mutex);
_fmm_unmap_from_gpu_scratch(gpu_id, aperture, obj_addr);
pthread_mutex_lock(&aperture->fmm_mutex);
}
pthread_mutex_unlock(&aperture->fmm_mutex);
/* release address space */
pthread_mutex_lock(&svm.dgpu_aperture->fmm_mutex);
aperture_release_area(svm.dgpu_aperture,
gpu_mem[gpu_mem_id].scratch_physical.base,
size);
pthread_mutex_unlock(&svm.dgpu_aperture->fmm_mutex);
} else
/* release address space */
munmap(gpu_mem[gpu_mem_id].scratch_physical.base, size);
/* invalidate scratch backing aperture */
gpu_mem[gpu_mem_id].scratch_physical.base = NULL;
gpu_mem[gpu_mem_id].scratch_physical.limit = NULL;
}
/* Translate the HsaMemFlags bits handled here into KFD allocation flags:
 *   AQLQueueMemory -> AQL_QUEUE_MEM | UNCACHED
 *   !ReadOnly      -> WRITABLE
 *   ExecuteAccess  -> EXECUTABLE
 * All other HSA flags are left for the caller to translate.
 */
static uint32_t fmm_translate_hsa_to_ioc_flags(HsaMemFlags flags)
{
	const uint32_t aql_bits = flags.ui32.AQLQueueMemory ?
		(KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM |
		 KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED) : 0;
	const uint32_t write_bits = flags.ui32.ReadOnly ?
		0 : KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE;
	const uint32_t exec_bits = flags.ui32.ExecuteAccess ?
		KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE : 0;

	return aql_bits | write_bits | exec_bits;
}
/* Alignment and size granularity (64KB) of the scratch backing allocation */
#define SCRATCH_ALIGN 0x10000

/* hsakmt_fmm_allocate_scratch - Reserve the scratch backing range of a GPU
 *
 * @gpu_id: GPU that will use the scratch memory
 * @address: requested address; must be NULL when hsakmt_is_dgpu is not set
 * @MemorySizeInBytes: requested size, rounded up to SCRATCH_ALIGN
 *
 * Reserves address space (from the dGPU SVM aperture, or with an anonymous
 * private mapping otherwise), records it as the GPU's scratch_physical
 * aperture and reports the base address to KFD.
 *
 * Returns the base address, or NULL if the GPU is unknown, scratch was
 * already allocated for it, the address space could not be reserved, or
 * the KFD ioctl failed.
 */
void *hsakmt_fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes)
{
	manageable_aperture_t *aperture_phy;
	struct kfd_ioctl_set_scratch_backing_va_args args = {0};
	int32_t gpu_mem_id;
	void *mem = NULL;
	uint64_t aligned_size = ALIGN_UP(MemorySizeInBytes, SCRATCH_ALIGN);

	/* Retrieve gpu_mem id according to gpu_id */
	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
	if (gpu_mem_id < 0)
		return NULL;

	aperture_phy = &gpu_mem[gpu_mem_id].scratch_physical;
	if (aperture_phy->base || aperture_phy->limit)
		/* Scratch was already allocated for this GPU */
		return NULL;

	/* Allocate address space for scratch backing, 64KB aligned */
	if (hsakmt_is_dgpu) {
		pthread_mutex_lock(&svm.dgpu_aperture->fmm_mutex);
		mem = aperture_allocate_area_aligned(
			svm.dgpu_aperture, address,
			aligned_size, SCRATCH_ALIGN);
		pthread_mutex_unlock(&svm.dgpu_aperture->fmm_mutex);
	} else {
		if (address)
			return NULL;
		mem = hsakmt_mmap_allocate_aligned(PROT_READ | PROT_WRITE,
						   MAP_PRIVATE | MAP_ANONYMOUS,
						   aligned_size, SCRATCH_ALIGN, 0,
						   0, (void *)LONG_MAX, -1);
	}

	/* Bail out before touching the aperture bookkeeping if no address
	 * space could be reserved. Otherwise base would stay NULL while
	 * limit became non-NULL, VA 0 would be programmed into KFD, and
	 * every later scratch allocation for this GPU would be rejected as
	 * "already allocated".
	 */
	if (!mem)
		return NULL;

	/* Remember scratch backing aperture for later */
	aperture_phy->base = mem;
	aperture_phy->limit = VOID_PTR_ADD(mem, aligned_size-1);
	aperture_phy->is_cpu_accessible = true;

	/* Program SH_HIDDEN_PRIVATE_BASE */
	args.gpu_id = gpu_id;
	args.va_addr = ((uint64_t)mem) >> 16;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_SCRATCH_BACKING_VA, &args)) {
		/* Undo the reservation and clear the aperture again */
		fmm_release_scratch(gpu_id);
		return NULL;
	}

	return mem;
}
/* Reserve a virtual address range in @aperture and back it with a KFD
 * memory object created by fmm_allocate_memory_object().
 *
 * @address/@alignment are forwarded to the aperture allocator; @mmap_offset
 * and @ioc_flags are forwarded to fmm_allocate_memory_object(). When @vm_obj
 * is non-NULL it receives the created object, or NULL if object creation
 * failed; it is not written when the aperture is invalid or no address
 * range could be reserved.
 *
 * Returns the reserved address, or NULL on any failure (in which case the
 * address range has been given back to the aperture).
 */
static void *__fmm_allocate_device(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes,
		manageable_aperture_t *aperture, uint64_t *mmap_offset,
		uint32_t ioc_flags, uint64_t alignment, vm_object_t **vm_obj)
{
	vm_object_t *created;
	void *va;

	/* Refuse apertures that are not properly initialized/supported */
	if (!aperture_is_valid(aperture->base, aperture->limit))
		return NULL;

	/* Step 1: reserve address space */
	pthread_mutex_lock(&aperture->fmm_mutex);
	va = aperture_allocate_area_aligned(aperture, address, MemorySizeInBytes, alignment);
	pthread_mutex_unlock(&aperture->fmm_mutex);
	if (!va)
		return NULL;

	/* Step 2: allocate the memory in the device itself */
	created = fmm_allocate_memory_object(gpu_id, va, MemorySizeInBytes,
					     aperture, mmap_offset, ioc_flags);
	if (created) {
		if (vm_obj)
			*vm_obj = created;
		return va;
	}

	/* Device allocation failed: give the address range back */
	pthread_mutex_lock(&aperture->fmm_mutex);
	aperture_release_area(aperture, va, MemorySizeInBytes);
	pthread_mutex_unlock(&aperture->fmm_mutex);

	if (vm_obj)
		*vm_obj = NULL;

	/* NULL tells the caller that the allocation failed */
	return NULL;
}
/* Map @size bytes of @fd at offset @mmap_offset over the fixed address @mem
 * as a shared mapping. The mapping is read/write when @host_access is true
 * and PROT_NONE otherwise.
 *
 * Returns the mmap() result, i.e. MAP_FAILED on error.
 */
static void *fmm_map_to_cpu(void *mem, uint64_t size, bool host_access,
			    int fd, uint64_t mmap_offset)
{
	int prot = PROT_NONE;
	void *mapped;

	if (host_access)
		prot = PROT_READ | PROT_WRITE;

	mapped = mmap(mem, size, prot, MAP_SHARED | MAP_FIXED, fd, mmap_offset);
	if (mapped == MAP_FAILED)
		return mapped;

	/* MADV_DONTFORK keeps child processes from holding additional
	 * references to mapped BOs; such references can prevent freeing
	 * memory in the parent process and lead to out-of-memory
	 * conditions.
	 */
	madvise(mem, size, MADV_DONTFORK);

	return mapped;
}
/* fmm_allocate_va - Reserve virtual address space without backing memory
 *
 * @gpu_id: unused here; kept for symmetry with the other allocators
 * @address: requested address, forwarded to the aperture allocator
 * @size: size of the range to reserve
 * @aperture: aperture to reserve the range in
 * @alignment: requested alignment, forwarded to the aperture allocator
 * @mflags: memory flags stored with the tracking object
 *
 * Reserves a range in @aperture and creates a VM object with handle 0 (no
 * memory allocated yet) and node_id 0 to track it.
 *
 * Returns the reserved address, or NULL if the aperture is invalid, the
 * range could not be reserved, or the tracking object could not be
 * created (the range is released again in that case).
 */
static void *fmm_allocate_va(uint32_t gpu_id, void *address, uint64_t size,
		manageable_aperture_t *aperture, uint64_t alignment, HsaMemFlags mflags)
{
	void *mem = NULL;
	vm_object_t *vm_obj = NULL;

	/* Check aperture is properly initialized/supported */
	if (!aperture_is_valid(aperture->base, aperture->limit))
		return NULL;

	/* Allocate address space */
	pthread_mutex_lock(&aperture->fmm_mutex);
	mem = aperture_allocate_area_aligned(aperture, address, size, alignment);
	if (mem) {
		/* Assign handle 0 to vm_obj since no memory allocated yet */
		vm_obj = aperture_allocate_object(aperture, mem, 0, size, mflags);
		if (vm_obj) {
			/* Set node_id to 0 for OnlyAddress */
			vm_obj->node_id = 0;
		} else {
			/* Tracking object could not be created: undo the
			 * reservation. vm_obj must not be dereferenced on
			 * this path.
			 */
			aperture_release_area(aperture, mem, size);
			mem = NULL;
		}
	}
	pthread_mutex_unlock(&aperture->fmm_mutex);

	return mem;
}
/* udmabuf_allocation - Allocate system memory through the udmabuf driver
 *
 * @gpu_id: GPU the buffer is imported for
 * @node_id: node id of the GPU; passed to hsakmt_get_direct_link_cpu() to
 *           pick the NUMA node to bind to
 * @size: size of the buffer in bytes
 * @aperture: aperture supplying the address range bounds, default alignment
 *            and guard pages, and tracking the resulting object
 * @alignment: requested alignment, 0 selects aperture->align
 * @mflags: memory flags stored with the object (NoSubstitute is forced on)
 * @vm_obj: receives the tracking object, or NULL on failure
 *
 * Creates a sealed memfd of @size bytes, maps it, binds it to a NUMA node,
 * wraps it in a dma-buf with UDMABUF_CREATE, imports that dma-buf into KFD
 * and registers the result as a VM object in @aperture.
 *
 * Returns the CPU address of the mapping, or NULL on any failure.
 */
static void* udmabuf_allocation(uint32_t gpu_id, uint32_t node_id, uint64_t size,
				manageable_aperture_t *aperture, uint64_t alignment,
				HsaMemFlags mflags, vm_object_t** vm_obj)
{
	struct kfd_ioctl_import_dmabuf_args importArgs = {0};
	int memfd, dmabuf_fd;
	long long node_size, free_size;
	struct udmabuf_create create;
	uint64_t alignment_size;
	uint32_t numa_node_id;
	uint64_t guard_size;
	void *mem;
	int ret;

	dmabuf_fd = -1;
	memfd = -1;
	*vm_obj = NULL;

	memfd = memfd_create("thunk_memfd", MFD_ALLOW_SEALING);
	if (memfd == -1) {
		pr_debug("running kernel does not support memfd\n");
		return NULL;
	}

	if (ftruncate(memfd, size) == -1) {
		pr_debug("ftruncate fail\n");
		goto error_release_memfd;
	}

	pr_debug("PID: %jd; fd: %d; /proc/%jd/fd/%d\n",
		 (intmax_t) getpid(), memfd, (intmax_t) getpid(), memfd);

	/* Seal the file size before handing the memfd to udmabuf */
	if (fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW) < 0) {
		pr_debug("fcntl fail %s\n", strerror(errno));
		goto error_release_memfd;
	}

	/* Grow the alignment in powers of two up to the SVM alignment,
	 * but only while the buffer is at least twice the alignment
	 */
	alignment_size = PAGE_SIZE << svm.alignment_order;
	alignment = alignment ? alignment : aperture->align;
	while (alignment < alignment_size && size >= (alignment << 1))
		alignment <<= 1;

	guard_size = (uint64_t)aperture->guard_pages * PAGE_SIZE;
	mem = hsakmt_mmap_allocate_aligned(PROT_WRITE | PROT_READ, MAP_NORESERVE | MAP_SHARED,
				size, alignment, guard_size, aperture->base, aperture->limit, memfd);
	if (!mem)
		goto error_release_memfd;

	/* set madvise flags to HUGEPAGE if allocate more than 2MB */
	if (size >= (2 * 1024 * 1024))
		madvise(mem, size, MADV_HUGEPAGE);

	/* always bind to numa node */
	mflags.ui32.NoSubstitute = 1;

	/* Bind to NUMA node */
	/* node_id identifies the GPU node; look up the NUMA node to bind to */
	numa_node_id = hsakmt_get_direct_link_cpu(node_id);
	if (bind_mem_to_numa(numa_node_id, mem, size, mflags))
		goto error_release_aperture;

	node_size = numa_node_size64(numa_node_id, &free_size);
	pr_debug("udmabuf_allocation: numa_node_id %d, node_size %lld, free_size %lld\n",
		 numa_node_id, node_size, free_size);

	/* compare free size at numa_node_id with size */
	if ((uint64_t)free_size < size) {
		pr_debug("udmabuf_allocation: has no enough ram on numa_node_id %d, node_size %lld, free_size %lld\n",
			 numa_node_id, node_size, free_size);
		goto error_release_aperture;
	}

	create.memfd = memfd;
	create.flags = UDMABUF_FLAGS_CLOEXEC;
	create.offset = 0;
	create.size = size;
	dmabuf_fd = ioctl(hsakmt_udmabuf_dev_fd, UDMABUF_CREATE, &create);
	if (dmabuf_fd < 0) {
		pr_debug("ioctl UDMABUF_CREATE failed\n");
		goto error_release_aperture;
	}

	importArgs.va_addr = (uint64_t)mem;
	importArgs.gpu_id = gpu_id;
	importArgs.dmabuf_fd = dmabuf_fd;

	ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_IMPORT_DMABUF, (void *)&importArgs);
	if (ret) {
		/* The newline belongs at the end of the message */
		pr_debug("ioctl AMDKFD_IOC_IMPORT_DMABUF failed, ret 0x%x\n", ret);
		goto error_release_dmabuf;
	}

	/* Allocate object */
	pthread_mutex_lock(&aperture->fmm_mutex);
	*vm_obj = aperture_allocate_object(aperture, mem, importArgs.handle,
					   size, mflags);
	pthread_mutex_unlock(&aperture->fmm_mutex);
	/* NOTE(review): on this failure the handle returned by
	 * AMDKFD_IOC_IMPORT_DMABUF is not freed in KFD - confirm whether an
	 * AMDKFD_IOC_FREE_MEMORY_OF_GPU call is needed here.
	 */
	if (*vm_obj == NULL)
		goto error_release_dmabuf;

	/* after import udmabuf into kfd driver close dmabuf_fd
	 * as kfd driver holds the dmabuf
	 */
	close(dmabuf_fd);
	close(memfd);

	return mem;

error_release_dmabuf:
	close(dmabuf_fd);
error_release_aperture:
	/* Release under the aperture lock, like every other caller of
	 * aperture_release_area(); the lock is never held when jumping here
	 */
	pthread_mutex_lock(&aperture->fmm_mutex);
	aperture_release_area(aperture, mem, size);
	pthread_mutex_unlock(&aperture->fmm_mutex);
error_release_memfd:
	close(memfd);

	return NULL;
}
/* hsakmt_fmm_allocate_device - Allocate VRAM for a GPU
 *
 * @gpu_id: GPU to allocate on
 * @node_id: node id, only used by the udmabuf allocation path
 * @address: requested virtual address, forwarded to the aperture allocator
 * @MemorySizeInBytes: requested size
 * @alignment: requested alignment
 * @mflags: HSA memory flags controlling the allocation
 *
 * Translates @mflags into KFD allocation flags, picks the aperture (SVM
 * dGPU aperture, per-GPU GPUVM aperture, or the memory-handle aperture for
 * NoAddress allocations) and allocates the memory. With OnlyAddress set,
 * only address space is reserved. Unless NoAddress is set, memory allocated
 * through KFD is also mapped into the CPU address space.
 *
 * Returns the address of the allocation, or NULL on failure.
 */
void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
				 uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
{
	manageable_aperture_t *aperture;
	int32_t gpu_mem_id;
	uint32_t ioc_flags = KFD_IOC_ALLOC_MEM_FLAGS_VRAM;
	uint64_t size, mmap_offset;
	void *mem;
	vm_object_t *vm_obj = NULL;

	/* Retrieve gpu_mem id according to gpu_id */
	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
	if (gpu_mem_id < 0)
		return NULL;

	size = MemorySizeInBytes;

	if (mflags.ui32.HostAccess)
		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC;

	ioc_flags |= fmm_translate_hsa_to_ioc_flags(mflags);

	if (hsakmt_topology_is_svm_needed(gpu_mem[gpu_mem_id].EngineId)) {
		aperture = svm.dgpu_aperture;
		/* AQL queue memory gets twice the requested size; the CPU
		 * mapping below still covers only MemorySizeInBytes
		 */
		if (mflags.ui32.AQLQueueMemory)
			size = MemorySizeInBytes * 2;
	} else {
		aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture;
	}

	/* special case for va allocation without vram alloc */
	if (mflags.ui32.OnlyAddress)
		return fmm_allocate_va(gpu_id, address, size, aperture, alignment, mflags);

	/* special case for vram allocation without addr */
	if(mflags.ui32.NoAddress)
		aperture = &mem_handle_aperture;

	/* Coherency/caching flags; svm.disable_cache forces both COHERENT
	 * and UNCACHED regardless of the request
	 */
	if (!mflags.ui32.CoarseGrain || svm.disable_cache)
		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;

	if (mflags.ui32.Uncached || svm.disable_cache)
		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED;

	if (mflags.ui32.ExtendedCoherent)
		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT;

	if (mflags.ui32.Contiguous)
		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT;

	mem = NULL;
	/* Try udmabuf first when its device fd is open, the target is the
	 * mmap-managed dGPU aperture and the system is not a dGPU system.
	 * NOTE(review): @address is not passed to this path - confirm that
	 * callers never request a fixed address here.
	 */
	if (hsakmt_udmabuf_dev_fd > 0 && aperture == svm.dgpu_aperture && !hsakmt_is_dgpu
	    && aperture->ops == &mmap_aperture_ops) {
		mem = udmabuf_allocation(gpu_id, node_id, size, aperture, alignment,
					 mflags, &vm_obj);
		pr_debug("udmabuf_allocation mem %p\n", mem);
		if (!mem)
			pr_debug("udmabuf_allocation allocation fail\n");
	}

	/* env HSA_USE_UDMABUF not set, or not apu, or cannot use udmabuf,
	 * fall back to use device driver to allocate memory
	 */
	if (!mem) {
		mem = __fmm_allocate_device(gpu_id, address, size, aperture, &mmap_offset,
					    ioc_flags, alignment, &vm_obj);

		/* if alloc vram-only not mmap to cpu vm since no va */
		if (mem && !mflags.ui32.NoAddress) {
			void *ret = fmm_map_to_cpu(mem, MemorySizeInBytes,
						   mflags.ui32.HostAccess,
						   gpu_mem[gpu_mem_id].drm_render_fd,
						   mmap_offset);

			if (ret == MAP_FAILED) {
				/* CPU mapping failed: free the object and
				 * its address range again
				 */
				__fmm_release(vm_obj, aperture);
				return NULL;
			}
#ifdef SANITIZER_AMDGPU
			/* Record the CPU mapping parameters in the object */
			if (vm_obj) {
				vm_obj->mmap_flags = mflags.ui32.HostAccess ? PROT_READ | PROT_WRITE : PROT_NONE;
				vm_obj->mmap_fd = gpu_mem[gpu_mem_id].drm_render_fd;
				vm_obj->mmap_offset = mmap_offset;
			}
#endif
		}
	}

	if (mem && vm_obj) {
		pthread_mutex_lock(&aperture->fmm_mutex);
		/* Store memory allocation flags, not ioc flags */
		vm_obj->mflags = mflags;
		hsakmt_gpuid_to_nodeid(gpu_id, &vm_obj->node_id);
		pthread_mutex_unlock(&aperture->fmm_mutex);
	}

	return mem;
}
/* hsakmt_fmm_allocate_doorbell - Allocate and CPU-map a doorbell range
 *
 * @gpu_id: GPU owning the doorbells
 * @MemorySizeInBytes: size of the doorbell range
 * @doorbell_mmap_offset: offset to pass to mmap() on the KFD fd
 *
 * Allocates a doorbell object in the dGPU alt aperture, tags the tracking
 * object with NonPaged | HostAccess flags and the GPU's node id, then maps
 * the doorbells read/write at the allocated address.
 *
 * Returns the mapped address, or NULL on failure.
 */
void *hsakmt_fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes,
				   uint64_t doorbell_mmap_offset)
{
	const uint32_t doorbell_flags = KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
					KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
					KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
	manageable_aperture_t *alt_aper;
	vm_object_t *db_obj = NULL;
	void *db_va;

	/* The GPU must be known to the memory manager */
	if (gpu_mem_find_by_gpu_id(gpu_id) < 0)
		return NULL;

	/* Doorbells live in the fine-grained aperture */
	alt_aper = svm.dgpu_alt_aperture;

	db_va = __fmm_allocate_device(gpu_id, NULL, MemorySizeInBytes, alt_aper,
				      NULL, doorbell_flags, 0, &db_obj);
	if (!db_va)
		return NULL;

	if (db_obj) {
		HsaMemFlags stored;

		/* Synthesize the flags kept in the VM object */
		stored.Value = 0;
		stored.ui32.NonPaged = 1;
		stored.ui32.HostAccess = 1;

		pthread_mutex_lock(&alt_aper->fmm_mutex);
		db_obj->mflags = stored;
		hsakmt_gpuid_to_nodeid(gpu_id, &db_obj->node_id);
		pthread_mutex_unlock(&alt_aper->fmm_mutex);
	}

	if (mmap(db_va, MemorySizeInBytes, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_FIXED, hsakmt_kfd_fd,
		 doorbell_mmap_offset) == MAP_FAILED) {
		__fmm_release(db_obj, alt_aper);
		return NULL;
	}

	return db_va;
}
/* fmm_allocate_host_cpu - Allocate system memory with a private anonymous
 * mapping and track it in the CPUVM aperture
 *
 * @address: must be NULL; fixed addresses are not supported here
 * @MemorySizeInBytes: size of the mapping
 * @mflags: ExecuteAccess adds PROT_EXEC, !ReadOnly adds PROT_WRITE; the
 *          flags are also stored with the tracking object
 *
 * Returns the mapped address, or NULL if @address is set, mmap() fails, or
 * the tracking object cannot be created.
 */
static void *fmm_allocate_host_cpu(void *address, uint64_t MemorySizeInBytes,
				   HsaMemFlags mflags)
{
	void *mem = NULL;
	vm_object_t *vm_obj;
	int mmap_prot = PROT_READ;

	if (address)
		return NULL;

	if (mflags.ui32.ExecuteAccess)
		mmap_prot |= PROT_EXEC;

	if (!mflags.ui32.ReadOnly)
		mmap_prot |= PROT_WRITE;

	/* mmap will return a pointer with alignment equal to
	 * sysconf(_SC_PAGESIZE).
	 */
	mem = mmap(NULL, MemorySizeInBytes, mmap_prot,
		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

	if (mem == MAP_FAILED)
		return NULL;

	pthread_mutex_lock(&cpuvm_aperture.fmm_mutex);
	vm_obj = aperture_allocate_object(&cpuvm_aperture, mem, 0,
					  MemorySizeInBytes, mflags);
	if (vm_obj)
		vm_obj->node_id = 0; /* APU systems only have one CPU node */
	pthread_mutex_unlock(&cpuvm_aperture.fmm_mutex);

	if (!vm_obj) {
		/* Without a tracking object hsakmt_fmm_release() could
		 * never find (and unmap) this mapping, so fail the
		 * allocation instead of returning untracked memory.
		 */
		munmap(mem, MemorySizeInBytes);
		return NULL;
	}

	return mem;
}
/* bind_mem_to_numa - Apply a NUMA memory policy to a mapping
 *
 * @numa_node_id: NUMA node to bind to (or prefer)
 * @mem: start of the mapping
 * @SizeInBytes: length of the mapping
 * @mflags: NoNUMABind skips binding; NoSubstitute selects MPOL_BIND
 *          (strict) instead of MPOL_PREFERRED and turns failures into
 *          errors
 *
 * Returns 0 on success or when the failure is tolerated (binding not
 * possible and NoSubstitute not set, or mbind rejected with EPERM),
 * -EFAULT when a strict binding could not be honoured, and -ENOMEM when
 * the node mask could not be allocated.
 */
static int bind_mem_to_numa(uint32_t numa_node_id, void *mem,
			    uint64_t SizeInBytes, HsaMemFlags mflags)
{
	int mode = MPOL_F_STATIC_NODES;
	struct bitmask *node_mask;
	int num_node;
	int mbind_errno;
	long r;

	pr_debug("%s mem %p flags 0x%x size 0x%lx node_id %d\n", __func__,
		 mem, mflags.Value, SizeInBytes, numa_node_id);

	if (mflags.ui32.NoNUMABind || numa_available() == -1) {
		/* but need bind to a numa node */
		if (mflags.ui32.NoSubstitute)
			return -EFAULT;
		else
			return 0;
	}

	num_node = numa_max_node() + 1;

	/* Ignore binding requests to invalid nodes IDs */
	if (numa_node_id >= (unsigned)num_node || numa_node_id == INVALID_NODEID || num_node <= 1) {
		pr_warn("numa_node_id is out range: numa_node_id %d, num_node %d\n", numa_node_id, num_node);
		if (mflags.ui32.NoSubstitute)
			return -EFAULT;
		else
			return 0;
	}

	node_mask = numa_bitmask_alloc(num_node);
	if (!node_mask)
		return -ENOMEM;

#ifdef __PPC64__
	numa_bitmask_setbit(node_mask, numa_node_id * 8);
#else
	numa_bitmask_setbit(node_mask, numa_node_id);
#endif
	mode |= mflags.ui32.NoSubstitute ? MPOL_BIND : MPOL_PREFERRED;
	r = mbind(mem, SizeInBytes, mode, node_mask->maskp, num_node + 1, 0);
	/* Capture errno right away: freeing the bitmask below may modify
	 * it before the error is inspected.
	 */
	mbind_errno = errno;
	numa_bitmask_free(node_mask);

	if (r) {
		/* If applcation is running inside docker, still return
		 * ok because docker seccomp blocks mbind by default,
		 * otherwise application cannot allocate system memory.
		 */
		if (mbind_errno == EPERM) {
			pr_err_once("mbind is blocked by seccomp\n");
			return 0;
		}

		/* Ignore mbind failure if no memory available on node */
		if (!mflags.ui32.NoSubstitute)
			return 0;

		pr_warn_once("Failed to set NUMA policy for %p: %s\n", mem,
			     strerror(mbind_errno));
		return -EFAULT;
	}

	return 0;
}
/* fmm_allocate_host_gpu - Allocate GPU-accessible system memory
 *
 * @gpu_id: preferred GPU; 0 selects gpu_mem[0]
 * @node_id: node id stored in the object and used for NUMA binding
 * @address: requested virtual address, forwarded to the aperture allocator
 * @MemorySizeInBytes: requested size
 * @alignment: requested alignment
 * @mflags: HSA memory flags controlling the allocation
 *
 * CoarseGrain allocations go to the dGPU aperture, all others to the dGPU
 * alt aperture. With OnlyAddress set, only address space is reserved.
 * Otherwise paged memory (when svm.userptr_for_paged_mem is set) is
 * allocated as anonymous pages registered as a userptr BO, and everything
 * else is allocated from KFD as GTT memory.
 *
 * Returns the address of the allocation, or NULL on failure.
 */
static void *fmm_allocate_host_gpu(uint32_t gpu_id, uint32_t node_id, void *address,
				   uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
{
	manageable_aperture_t *aperture;
	vm_object_t *vm_obj = NULL;
	int flags = MADV_DONTFORK;
	uint64_t mmap_offset;
	int32_t gpu_drm_fd;
	uint32_t ioc_flags;
	uint32_t preferred_gpu_id;
	int gpu_mem_id = 0; /* default to g_first_gpu_mem */
	uint64_t size;
	void *mem;

	/* set madvise flags to HUGEPAGE always for 2MB pages */
	if (MemorySizeInBytes >= (2 * 1024 * 1024))
		flags |= MADV_HUGEPAGE;

	if (!g_first_gpu_mem)
		return NULL;

	/* A non-zero gpu_id must refer to a known GPU */
	if (gpu_id) {
		gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
		if (gpu_mem_id < 0)
			return NULL;
	}

	preferred_gpu_id = gpu_mem[gpu_mem_id].gpu_id;
	gpu_drm_fd = gpu_mem[gpu_mem_id].drm_render_fd;

	size = MemorySizeInBytes;
	ioc_flags = 0;
	if (mflags.ui32.CoarseGrain)
		aperture = svm.dgpu_aperture;
	else
		aperture = svm.dgpu_alt_aperture; /* always coherent */

	if (!mflags.ui32.CoarseGrain || svm.disable_cache)
		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;

	if (mflags.ui32.Uncached || svm.disable_cache)
		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED;

	ioc_flags |= fmm_translate_hsa_to_ioc_flags(mflags);

	/* AQL queue memory gets twice the requested size; the CPU mappings
	 * below still cover only MemorySizeInBytes
	 */
	if (mflags.ui32.AQLQueueMemory)
		size = MemorySizeInBytes * 2;

	/* special case for va allocation without real memory alloc */
	if (mflags.ui32.OnlyAddress)
		return fmm_allocate_va(gpu_id, address, size, aperture, alignment, mflags);

	/* Paged memory is allocated as a userptr mapping, non-paged
	 * memory is allocated from KFD
	 */
	if (!mflags.ui32.NonPaged && svm.userptr_for_paged_mem) {
		/* Allocate address space */
		pthread_mutex_lock(&aperture->fmm_mutex);
		mem = aperture_allocate_area_aligned(aperture, address, size, alignment);
		pthread_mutex_unlock(&aperture->fmm_mutex);
		if (!mem)
			return NULL;

		/* Map anonymous pages */
		if (mmap(mem, MemorySizeInBytes, PROT_READ | PROT_WRITE,
			 MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0)
		    == MAP_FAILED)
			goto out_release_area;

		/* Bind to NUMA node */
		if (bind_mem_to_numa(node_id, mem, MemorySizeInBytes, mflags))
			goto out_release_area;

		/* Mappings in the DGPU aperture don't need to be copied on
		 * fork. This avoids MMU notifiers and evictions due to user
		 * memory mappings on fork.
		 */
		madvise(mem, MemorySizeInBytes, flags);

		/* Create userptr BO */
		/* For userptr BOs the CPU address is passed in mmap_offset */
		mmap_offset = (uint64_t)mem;
		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_USERPTR;
		vm_obj = fmm_allocate_memory_object(preferred_gpu_id, mem, size,
						    aperture, &mmap_offset,
						    ioc_flags);
		if (!vm_obj)
			goto out_release_area;
	} else {
		ioc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_GTT;
		mem = __fmm_allocate_device(preferred_gpu_id, address, size, aperture,
					    &mmap_offset, ioc_flags, alignment, &vm_obj);

		/* GTT memory is only CPU-mapped when host access is requested */
		if (mem && mflags.ui32.HostAccess) {
			void *ret = fmm_map_to_cpu(mem, MemorySizeInBytes,
						   mflags.ui32.HostAccess,
						   gpu_drm_fd, mmap_offset);
			if (ret == MAP_FAILED) {
				/* Free the object and its address range again */
				__fmm_release(vm_obj, aperture);
				return NULL;
			}
		}
	}

#ifdef SANITIZER_AMDGPU
	/* Record the CPU mapping parameters in the object */
	if (mem && vm_obj) {
		vm_obj->mmap_flags = mflags.ui32.HostAccess ? PROT_READ | PROT_WRITE : PROT_NONE;
		vm_obj->mmap_fd = gpu_drm_fd;
		vm_obj->mmap_offset = mmap_offset;
	}
#endif

	if (mem && vm_obj) {
		/* Store memory allocation flags, not ioc flags */
		pthread_mutex_lock(&aperture->fmm_mutex);
		vm_obj->mflags = mflags;
		vm_obj->node_id = node_id;
		pthread_mutex_unlock(&aperture->fmm_mutex);
	}

	return mem;

out_release_area:
	/* Release address space */
	/* Only reached from the userptr path, after the range was reserved */
	pthread_mutex_lock(&aperture->fmm_mutex);
	if (mem) {
		aperture_release_area(aperture, mem, size);
	}
	pthread_mutex_unlock(&aperture->fmm_mutex);

	return NULL;
}
/* hsakmt_fmm_allocate_host - Allocate system memory
 *
 * Dispatches to fmm_allocate_host_gpu() when hsakmt_is_dgpu is set and to
 * fmm_allocate_host_cpu() otherwise. The CPU path takes no alignment, so a
 * non-zero @alignment is rejected there.
 *
 * Returns the allocated address, or NULL on failure.
 */
void *hsakmt_fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address,
			       uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags mflags)
{
	if (!hsakmt_is_dgpu) {
		/* Alignment not supported on non-dgpu */
		if (alignment) {
			pr_err("Non-default alignment not supported on non-dgpu\n");
			return NULL;
		}

		return fmm_allocate_host_cpu(address, MemorySizeInBytes, mflags);
	}

	return fmm_allocate_host_gpu(gpu_id, node_id, address,
				     MemorySizeInBytes, alignment, mflags);
}
/* __fmm_release - Free the KFD memory behind a VM object and drop the object
 *
 * Takes @aperture's fmm_mutex itself. For userptr objects the registration
 * count is decremented first, and nothing is freed while it stays positive.
 * Otherwise every non-zero handle of the object is freed in KFD; only if
 * all of those calls succeed are the address range and the object released.
 *
 * Returns 0 on success, -EINVAL for a NULL @object, or the negated errno of
 * the last failing free ioctl.
 */
static int __fmm_release(vm_object_t *object, manageable_aperture_t *aperture)
{
	struct kfd_ioctl_free_memory_of_gpu_args free_args = {0};
	uint32_t idx;
	int status = 0;

	if (!object)
		return -EINVAL;

	pthread_mutex_lock(&aperture->fmm_mutex);

	/* Userptr objects can be registered several times; only the last
	 * release actually frees them
	 */
	if (object->userptr) {
		object->registration_count--;
		if (object->registration_count > 0) {
			pthread_mutex_unlock(&aperture->fmm_mutex);
			return 0;
		}
	}

	/* If memory is user memory and it's still GPU mapped, munmap
	 * would cause an eviction. If the restore happens quickly
	 * enough, restore would also fail with an error message. So
	 * free the BO before unmapping the pages.
	 */
	for (idx = 0; idx < object->handle_num; idx++) {
		free_args.handle = object->handles[idx];
		/* Handle 0 means nothing was allocated for this slot */
		if (free_args.handle != 0 &&
		    hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &free_args))
			status = -errno;
	}

	/* Keep the object and its range if any free failed */
	if (!status) {
		aperture_release_area(aperture, object->start, object->size);
		vm_remove_object(aperture, object);
	}

	pthread_mutex_unlock(&aperture->fmm_mutex);
	return status;
}
/* hsakmt_fmm_release - Release the allocation that starts at @address
 *
 * Scratch memory is handed to fmm_release_scratch(). Objects in the CPUVM
 * aperture are untracked and unmapped directly; all other objects go
 * through __fmm_release().
 *
 * Returns HSAKMT_STATUS_SUCCESS on success. When no object is found the
 * result is HSAKMT_STATUS_SUCCESS if hsakmt_is_svm_api_supported is set and
 * HSAKMT_STATUS_MEMORY_NOT_REGISTERED otherwise. HSAKMT_STATUS_ERROR is
 * returned when __fmm_release() fails.
 */
HSAKMT_STATUS hsakmt_fmm_release(void *address)
{
	manageable_aperture_t *owner = NULL;
	gpu_mem_t *scratch_gpu;
	vm_object_t *found;

	/* Special handling for scratch memory */
	scratch_gpu = fmm_is_scratch_aperture(address);
	if (scratch_gpu) {
		fmm_release_scratch(scratch_gpu->gpu_id);
		return HSAKMT_STATUS_SUCCESS;
	}

	/* On success this returns with owner->fmm_mutex held */
	found = vm_find_object(address, 0, &owner);
	if (!found) {
		if (hsakmt_is_svm_api_supported)
			return HSAKMT_STATUS_SUCCESS;
		return HSAKMT_STATUS_MEMORY_NOT_REGISTERED;
	}

	if (owner != &cpuvm_aperture) {
		/* __fmm_release() takes the aperture lock itself */
		pthread_mutex_unlock(&owner->fmm_mutex);
		if (__fmm_release(found, owner))
			return HSAKMT_STATUS_ERROR;
	} else {
		/* APU system memory */
		uint64_t length = found->size;

		vm_remove_object(&cpuvm_aperture, found);
		pthread_mutex_unlock(&owner->fmm_mutex);
		munmap(address, length);
	}

	return HSAKMT_STATUS_SUCCESS;
}
static int fmm_set_memory_policy(uint32_t gpu_id, int default_policy, int alt_policy,
uintptr_t alt_base, uint64_t alt_size,
uint32_t misc_process_flags)
{
struct kfd_ioctl_set_memory_policy_args args = {0};
args.gpu_id = gpu_id;
args.default_policy = default_policy;
args.alternate_policy = alt_policy;
args.alternate_aperture_base = alt_base;
args.alternate_aperture_size = alt_size;
args.misc_process_flag = misc_process_flags;
return hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_MEMORY_POLICY, &args);
}
/* Return the VM alignment for a device: TONGA_PAGE_SIZE for device IDs in
 * the Tonga (0x6920-0x6939) or Carrizo (0x9870-0x9877) ranges when that is
 * larger than PAGE_SIZE, and PAGE_SIZE otherwise.
 */
static uint32_t get_vm_alignment(uint32_t device_id)
{
	const bool is_tonga = device_id >= 0x6920 && device_id <= 0x6939;
	const bool is_carrizo = device_id >= 0x9870 && device_id <= 0x9877;
	int page_size = (is_tonga || is_carrizo) ? TONGA_PAGE_SIZE : 0;

	return MAX(PAGE_SIZE, page_size);
}
static HSAKMT_STATUS get_process_apertures(
struct kfd_process_device_apertures *process_apertures,
uint32_t *num_of_nodes)
{
struct kfd_ioctl_get_process_apertures_new_args args_new = {0};
struct kfd_ioctl_get_process_apertures_args args_old;
args_new.kfd_process_device_apertures_ptr = (uintptr_t)process_apertures;
args_new.num_of_nodes = *num_of_nodes;
if (!hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_PROCESS_APERTURES_NEW,
(void *)&args_new)) {
*num_of_nodes = args_new.num_of_nodes;
return HSAKMT_STATUS_SUCCESS;
}
/* New IOCTL failed, try the old one in case we're running on
* a really old kernel */
memset(&args_old, 0, sizeof(args_old));
if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_PROCESS_APERTURES,
(void *)&args_old))
return HSAKMT_STATUS_ERROR;
if (args_old.num_of_nodes < *num_of_nodes)
*num_of_nodes = args_old.num_of_nodes;
memcpy(process_apertures, args_old.process_apertures,
sizeof(*process_apertures) * *num_of_nodes);
return HSAKMT_STATUS_SUCCESS;
}
/* The VMs from DRM render nodes are used by KFD for the lifetime of
* the process. Therefore we have to keep using the same FDs for the
* lifetime of the process, even when we close and reopen KFD. There
* are up to 128 render nodes that we cache in this array.
*/
/* Inclusive range of DRM minor numbers accepted as render nodes */
#define DRM_FIRST_RENDER_NODE 128
#define DRM_LAST_RENDER_NODE 255

/* Cached render node FDs, indexed by (minor - DRM_FIRST_RENDER_NODE).
 * An entry of 0 means the node has not been opened yet.
 */
static int drm_render_fds[DRM_LAST_RENDER_NODE + 1 - DRM_FIRST_RENDER_NODE];

/* amdgpu device handle for each gpu that libdrm uses */
/* Indexed like drm_render_fds */
static struct amdgpu_device *amdgpu_handle[DRM_LAST_RENDER_NODE + 1 - DRM_FIRST_RENDER_NODE];
/* hsakmt_open_drm_render_device - Open (or reuse) a DRM render node
 *
 * @minor: DRM minor number, DRM_FIRST_RENDER_NODE..DRM_LAST_RENDER_NODE
 *
 * Opens /dev/dri/renderD<minor> once and caches the FD for later calls.
 * After a successful amdgpu_device_initialize(), and if
 * hsakmt_fn_amdgpu_device_get_fd is available, the FD used by libdrm
 * replaces the one opened here. If that query fails, the libdrm device
 * handle is dropped and the FD opened here stays in use.
 *
 * Returns the render node FD (hsakmt_kfd_fd when running a model), -EINVAL
 * for an out-of-range @minor, or the negated errno of a failed open().
 */
int hsakmt_open_drm_render_device(int minor)
{
	char path[128];
	int index, fd, libdrm_fd, open_errno;
	uint32_t major_drm, minor_drm;
	struct amdgpu_device **device_handle;

	/* Bypass amdgpu if we're running a model. Return hsakmt_kfd_fd, which is the
	 * backing for all our "GPU" memory. */
	if (hsakmt_use_model)
		return hsakmt_kfd_fd;

	if (minor < DRM_FIRST_RENDER_NODE || minor > DRM_LAST_RENDER_NODE) {
		pr_err("DRM render minor %d out of range [%d, %d]\n", minor,
		       DRM_FIRST_RENDER_NODE, DRM_LAST_RENDER_NODE);
		return -EINVAL;
	}
	index = minor - DRM_FIRST_RENDER_NODE;

	/* If the render node was already opened, keep using the same FD */
	if (drm_render_fds[index])
		return drm_render_fds[index];

	snprintf(path, sizeof(path), "/dev/dri/renderD%d", minor);
	fd = open(path, O_RDWR | O_CLOEXEC);
	if (fd < 0) {
		/* Save errno before logging: the logging calls below may
		 * overwrite it, which would corrupt the return value.
		 */
		open_errno = errno;
		if (open_errno != ENOENT && open_errno != EPERM) {
			pr_err("Failed to open %s: %s\n", path, strerror(open_errno));
			if (open_errno == EACCES)
				pr_info("Check user is in \"video\" group\n");
		}
		return -open_errno;
	}
	drm_render_fds[index] = fd;

	device_handle = &amdgpu_handle[index];
	if (!amdgpu_device_initialize(fd, &major_drm, &minor_drm, device_handle)) {
		/* if amdgpu_device_get_fd available query render fd that libdrm uses,
		 * then close drm_render_fds above, replace it by fd libdrm uses.
		 */
		if (hsakmt_fn_amdgpu_device_get_fd) {
			libdrm_fd = hsakmt_fn_amdgpu_device_get_fd(*device_handle);
			if (libdrm_fd > 0) {
				close(drm_render_fds[index]);
				drm_render_fds[index] = libdrm_fd;
			} else {
				pr_err("amdgpu_device_get_fd failed: %d\n", libdrm_fd);
				amdgpu_device_deinitialize(*device_handle);
				*device_handle = NULL;
			}
		}
	}

	/* Always return the cached FD so the first call and later calls
	 * agree. Returning the failed amdgpu_device_get_fd() value would
	 * report an error (-1 reads as -EPERM) although a usable render FD
	 * is cached and will be returned by the next call.
	 */
	return drm_render_fds[index];
}
static HSAKMT_STATUS acquire_vm(uint32_t gpu_id, int fd)
{
struct kfd_ioctl_acquire_vm_args args;
args.gpu_id = gpu_id;
args.drm_fd = fd;
pr_info("acquiring VM for %x using %d\n", gpu_id, fd);
if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_ACQUIRE_VM, (void *)&args)) {
pr_err("AMDKFD_IOC_ACQUIRE_VM failed\n");
return HSAKMT_STATUS_ERROR;
}
return HSAKMT_STATUS_SUCCESS;
}
/* init_mmap_apertures - Set up a single, mmap-managed SVM aperture
 *
 * @base/@limit: bounds of the aperture
 * @align: required alignment; anything above PAGE_SIZE is rejected
 * @guard_pages: number of guard pages stored in the aperture
 *
 * Configures svm.apertures[SVM_DEFAULT] to use mmap_aperture_ops, clears
 * the SVM_COHERENT aperture and probes the setup by allocating and
 * releasing one page. On success both svm.dgpu_aperture and
 * svm.dgpu_alt_aperture point at the default aperture.
 *
 * Returns HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_ERROR if the alignment is
 * unsupported or the probe allocation fails.
 */
static HSAKMT_STATUS init_mmap_apertures(HSAuint64 base, HSAuint64 limit,
					 HSAuint32 align, HSAuint32 guard_pages)
{
	void *probe;

	if (align > (HSAuint32)PAGE_SIZE) {
		/* This should never happen. Alignment constraints
		 * only apply to old GPUs that don't support 48-bit
		 * virtual addresses.
		 */
		pr_info("Falling back to reserved SVM apertures due to alignment constraints.\n");
		return HSAKMT_STATUS_ERROR;
	}

	/* Set up one SVM aperture */
	svm.apertures[SVM_DEFAULT].base = (void *)base;
	svm.apertures[SVM_DEFAULT].limit = (void *)limit;
	svm.apertures[SVM_DEFAULT].align = align;
	svm.apertures[SVM_DEFAULT].guard_pages = guard_pages;
	svm.apertures[SVM_DEFAULT].is_cpu_accessible = true;
	svm.apertures[SVM_DEFAULT].ops = &mmap_aperture_ops;

	/* The coherent aperture is not used in this mode */
	svm.apertures[SVM_COHERENT].limit = NULL;
	svm.apertures[SVM_COHERENT].base = NULL;

	/* Try to allocate one page. If it fails, we'll fall back to
	 * managing our own reserved address range.
	 */
	probe = aperture_allocate_area(&svm.apertures[SVM_DEFAULT], NULL, PAGE_SIZE);
	if (!probe) {
		pr_info("Failed to allocate unreserved SVM address space.\n");
		pr_info("Falling back to reserved SVM apertures.\n");
		return HSAKMT_STATUS_ERROR;
	}

	aperture_release_area(&svm.apertures[SVM_DEFAULT], probe, PAGE_SIZE);

	/* One aperture serves as both the default and the alt aperture */
	svm.dgpu_alt_aperture = &svm.apertures[SVM_DEFAULT];
	svm.dgpu_aperture = &svm.apertures[SVM_DEFAULT];

	pr_info("Initialized unreserved SVM apertures: %p - %p\n",
		svm.apertures[SVM_DEFAULT].base,
		svm.apertures[SVM_DEFAULT].limit);

	return HSAKMT_STATUS_SUCCESS;
}
/* Reserve @len bytes of address space with an inaccessible (PROT_NONE),
 * private, anonymous, MAP_NORESERVE mapping. @addr is passed to mmap() as
 * the placement hint.
 *
 * Returns the address of the mapping, or NULL if @len is 0 or mmap() fails.
 */
static void *reserve_address(void *addr, unsigned long long int len)
{
	void *mapping;

	if (!len)
		return NULL;

	mapping = mmap(addr, len, PROT_NONE,
		       MAP_PRIVATE | MAP_NORESERVE | MAP_ANONYMOUS, -1, 0);

	return mapping == MAP_FAILED ? NULL : mapping;
}
/* Managed SVM aperture limits: only reserve up to 40 bits (1TB, what
* GFX8 supports). Need to find at least 4GB of usable address space.
*/
/* Highest address (inclusive) of the reserved SVM range: 2^40 - 1 */
#define SVM_RESERVATION_LIMIT ((1ULL << 40) - 1)
/* Smallest SVM reservation considered usable: 4GB */
#define SVM_MIN_VM_SIZE (4ULL << 30)
/* True when address a is below 2^47 */
#define IS_CANONICAL_ADDR(a) ((a) < (1ULL << 47))
/* Set up the SVM apertures shared by all GPUs.
 *
 * @base, @limit: inclusive bounds of the address range to choose from
 * @align, @guard_pages: stored as-is in every aperture set up here
 *
 * Three outcomes are possible:
 *  - dgpu_shared_aperture_limit is already set: nothing is done.
 *  - @limit covers at least 47 bits and svm.reserve_svm is not set:
 *    init_mmap_apertures() is tried first.
 *  - otherwise (or if the above failed) a PROT_NONE range is reserved
 *    below SVM_RESERVATION_LIMIT and split into a coherent aperture
 *    (first quarter) and a non-coherent aperture (the rest).
 *
 * Returns HSAKMT_STATUS_SUCCESS when apertures are available,
 * HSAKMT_STATUS_ERROR when no usable range could be reserved.
 */
static HSAKMT_STATUS init_svm_apertures(HSAuint64 base, HSAuint64 limit,
HSAuint32 align, HSAuint32 guard_pages)
{
const HSAuint64 ADDR_INC = GPU_HUGE_PAGE_SIZE;
HSAuint64 len, map_size, alt_base, alt_size;
bool found = false;
void *addr, *ret_addr = NULL;
/* If we already have an SVM aperture initialized (from a
* parent process), keep using it
*/
if (dgpu_shared_aperture_limit)
return HSAKMT_STATUS_SUCCESS;
/* Align base and limit to huge page size */
base = ALIGN_UP(base, GPU_HUGE_PAGE_SIZE);
limit = ((limit + 1) & ~(HSAuint64)(GPU_HUGE_PAGE_SIZE - 1)) - 1;
/* If the limit is greater or equal 47-bits of address space,
* it means we have GFXv9 or later GPUs only. We don't need
* apertures to determine the MTYPE and the virtual address
* space of the GPUs covers the full CPU address range (on
* x86_64) or at least mmap is unlikely to run out of
* addresses the GPUs can handle.
*/
if (limit >= (1ULL << 47) - 1 && !svm.reserve_svm) {
HSAKMT_STATUS status = init_mmap_apertures(base, limit, align,
guard_pages);
if (status == HSAKMT_STATUS_SUCCESS)
return status;
/* fall through: fall back to reserved address space */
}
/* Reserved apertures never extend past SVM_RESERVATION_LIMIT */
if (limit > SVM_RESERVATION_LIMIT)
limit = SVM_RESERVATION_LIMIT;
if (base >= limit) {
pr_err("No SVM range compatible with all GPU and software constraints\n");
return HSAKMT_STATUS_ERROR;
}
/* Try to reserve address space for SVM.
*
* Inner loop: try start addresses in huge-page increments up
* to half the VM size we're trying to reserve
*
* Outer loop: reduce size of the allocation by factor 2 at a
* time and print a warning for every reduction
*/
for (len = limit - base + 1; !found && len >= SVM_MIN_VM_SIZE;
len = (len + 1) >> 1) {
for (addr = (void *)base; (HSAuint64)addr + ((len + 1) >> 1) - 1 <= limit;
addr = (void *)((HSAuint64)addr + ADDR_INC)) {
/* Clamp the request to the limit and round it down to whole pages */
HSAuint64 top = MIN((HSAuint64)addr + len, limit+1);
map_size = (top - (HSAuint64)addr) &
~(HSAuint64)(PAGE_SIZE - 1);
if (map_size < SVM_MIN_VM_SIZE)
break;
/* addr is passed as the placement hint; the returned address is re-checked below */
ret_addr = reserve_address(addr, map_size);
if (!ret_addr)
break;
if ((HSAuint64)ret_addr + ((len + 1) >> 1) - 1 <= limit)
/* At least half the returned address
* space is GPU addressable, we'll
* take it
*/
break;
/* Not enough of it is below the limit: give it back and try the next start address */
munmap(ret_addr, map_size);
ret_addr = NULL;
}
if (!ret_addr) {
pr_warn("Failed to reserve %uGB for SVM ...\n",
(unsigned int)(len >> 30));
continue;
}
if ((HSAuint64)ret_addr + SVM_MIN_VM_SIZE - 1 > limit) {
/* addressable size is less than the minimum */
pr_warn("Got %uGB for SVM at %p with only %dGB usable ...\n",
(unsigned int)(map_size >> 30), ret_addr,
(int)((limit - (HSAint64)ret_addr) >> 30));
munmap(ret_addr, map_size);
ret_addr = NULL;
continue;
} else {
found = true;
break;
}
}
if (!found) {
pr_err("Failed to reserve SVM address range. Giving up.\n");
return HSAKMT_STATUS_ERROR;
}
/* From here on base/limit describe the range that was actually reserved;
* map_size still holds the size of the successful reservation.
*/
base = (HSAuint64)ret_addr;
if (base + map_size - 1 > limit)
/* trim the tail that's not GPU-addressable */
munmap((void *)(limit + 1), base + map_size - 1 - limit);
else
limit = base + map_size - 1;
/* init two apertures for non-coherent and coherent memory */
svm.apertures[SVM_DEFAULT].base = dgpu_shared_aperture_base = ret_addr;
svm.apertures[SVM_DEFAULT].limit = dgpu_shared_aperture_limit = (void *)limit;
svm.apertures[SVM_DEFAULT].align = align;
svm.apertures[SVM_DEFAULT].guard_pages = guard_pages;
svm.apertures[SVM_DEFAULT].is_cpu_accessible = true;
svm.apertures[SVM_DEFAULT].ops = &reserved_aperture_ops;
/* Use the first 1/4 of the dGPU aperture as
* alternate aperture for coherent access.
* Base and size must be 64KB aligned.
*/
alt_base = (HSAuint64)svm.apertures[SVM_DEFAULT].base;
alt_size = (VOID_PTRS_SUB(svm.apertures[SVM_DEFAULT].limit,
svm.apertures[SVM_DEFAULT].base) + 1) >> 2;
alt_base = (alt_base + 0xffff) & ~0xffffULL;
alt_size = (alt_size + 0xffff) & ~0xffffULL;
svm.apertures[SVM_COHERENT].base = (void *)alt_base;
svm.apertures[SVM_COHERENT].limit = (void *)(alt_base + alt_size - 1);
svm.apertures[SVM_COHERENT].align = align;
svm.apertures[SVM_COHERENT].guard_pages = guard_pages;
svm.apertures[SVM_COHERENT].is_cpu_accessible = true;
svm.apertures[SVM_COHERENT].ops = &reserved_aperture_ops;
/* The non-coherent aperture starts right after the coherent one */
svm.apertures[SVM_DEFAULT].base = VOID_PTR_ADD(svm.apertures[SVM_COHERENT].limit, 1);
pr_info("SVM alt (coherent): %12p - %12p\n",
svm.apertures[SVM_COHERENT].base, svm.apertures[SVM_COHERENT].limit);
pr_info("SVM (non-coherent): %12p - %12p\n",
svm.apertures[SVM_DEFAULT].base, svm.apertures[SVM_DEFAULT].limit);
svm.dgpu_aperture = &svm.apertures[SVM_DEFAULT];
svm.dgpu_alt_aperture = &svm.apertures[SVM_COHERENT];
return HSAKMT_STATUS_SUCCESS;
}
/* Initialize the allocation trees of every aperture.
 *
 * The process-wide apertures (both SVM apertures, the CPUVM aperture and
 * the memory-handle aperture) are initialized on the first call only; the
 * per-GPU scratch and GPUVM apertures are (re)initialized on every call.
 */
static void fmm_init_rbtree(void)
{
	static int once;
	int remaining = gpu_mem_count;
	const int first_call = (once++ == 0);

	if (first_call) {
		rbtree_init(&svm.apertures[SVM_DEFAULT].tree);
		rbtree_init(&svm.apertures[SVM_DEFAULT].user_tree);
		rbtree_init(&svm.apertures[SVM_COHERENT].tree);
		rbtree_init(&svm.apertures[SVM_COHERENT].user_tree);
		rbtree_init(&cpuvm_aperture.tree);
		rbtree_init(&cpuvm_aperture.user_tree);
		rbtree_init(&mem_handle_aperture.tree);
		rbtree_init(&mem_handle_aperture.user_tree);
	}

	/* Walk gpu_mem[] from the last entry down to the first */
	while (remaining) {
		--remaining;
		rbtree_init(&gpu_mem[remaining].scratch_physical.tree);
		rbtree_init(&gpu_mem[remaining].scratch_physical.user_tree);
		rbtree_init(&gpu_mem[remaining].gpuvm_aperture.tree);
		rbtree_init(&gpu_mem[remaining].gpuvm_aperture.user_tree);
	}
}
/* Allocate the remapped MMIO page of a GPU in the coherent SVM aperture
 * and make it accessible to both CPU and GPU.
 *
 * @node_id: recorded in the backing VM object
 * @gpu_id:  GPU the page is allocated on
 * @mmap_fd: file descriptor used for the CPU mapping
 *
 * Returns the address of the page, or NULL on failure. When the model is
 * in use (hsakmt_use_model) the page is handed to model_set_mmio_page()
 * and no CPU/GPU mapping is created.
 */
static void *map_mmio(uint32_t node_id, uint32_t gpu_id, int mmap_fd)
{
	manageable_aperture_t *aperture = svm.dgpu_alt_aperture;
	const uint32_t ioc_flags = KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP |
				   KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
				   KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
	vm_object_t *vm_obj = NULL;
	uint64_t mmap_offset;
	HsaMemFlags mflags;
	void *page;

	/* Allocate physical memory and the VM object tracking it */
	page = __fmm_allocate_device(gpu_id, NULL, PAGE_SIZE, aperture,
				     &mmap_offset, ioc_flags, 0, &vm_obj);
	if (!page || !vm_obj)
		return NULL;

	mflags.Value = 0;
	mflags.ui32.NonPaged = 1;
	mflags.ui32.HostAccess = 1;

	/* Object fields are updated under the aperture lock */
	pthread_mutex_lock(&aperture->fmm_mutex);
	vm_obj->mflags = mflags;
	vm_obj->node_id = node_id;
	pthread_mutex_unlock(&aperture->fmm_mutex);

	if (hsakmt_use_model) {
		model_set_mmio_page(page);
		return page;
	}

	/* Map for CPU access first, then for GPU access; either failure
	 * releases the allocation again.
	 */
	if (mmap(page, PAGE_SIZE, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_FIXED, mmap_fd, mmap_offset) == MAP_FAILED ||
	    hsakmt_fmm_map_to_gpu(page, PAGE_SIZE, NULL)) {
		__fmm_release(vm_obj, aperture);
		return NULL;
	}

	return page;
}
static void release_mmio(void)
{
uint32_t gpu_mem_id;
for (gpu_mem_id = 0; (uint32_t)gpu_mem_id < gpu_mem_count; gpu_mem_id++) {
if (!gpu_mem[gpu_mem_id].mmio_aperture.base)
continue;
hsakmt_fmm_unmap_from_gpu(gpu_mem[gpu_mem_id].mmio_aperture.base);
munmap(gpu_mem[gpu_mem_id].mmio_aperture.base, PAGE_SIZE);
hsakmt_fmm_release(gpu_mem[gpu_mem_id].mmio_aperture.base);
}
}
/* Look up the amdgpu device handle belonging to a node.
 *
 * @node_id:      node to look up
 * @DeviceHandle: receives the handle (NULL when the model is in use)
 *
 * Returns HSAKMT_STATUS_INVALID_NODE_UNIT if the node has no gpu_mem
 * entry, HSAKMT_STATUS_INVALID_HANDLE if no handle is stored for its
 * render node, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS hsakmt_fmm_get_amdgpu_device_handle(uint32_t node_id,
						  HsaAMDGPUDeviceHandle *DeviceHandle)
{
	int32_t gpu_mem_id = gpu_mem_find_by_node_id(node_id);
	int handle_idx;

	if (gpu_mem_id < 0)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	if (hsakmt_use_model) {
		*DeviceHandle = NULL;
		return HSAKMT_STATUS_SUCCESS;
	}

	/* Handles are stored per render node, relative to the first one */
	handle_idx = gpu_mem[gpu_mem_id].drm_render_minor - DRM_FIRST_RENDER_NODE;
	if (amdgpu_handle[handle_idx]) {
		*DeviceHandle = amdgpu_handle[handle_idx];
		return HSAKMT_STATUS_SUCCESS;
	}

	return HSAKMT_STATUS_INVALID_HANDLE;
}
/* Report whether two address ranges, each given by its inclusive start
 * and limit, overlap: true when the start of either range lies inside
 * the other range.
 */
static bool two_apertures_overlap(void *start_1, void *limit_1, void *start_2, void *limit_2)
{
	/* Range 1 begins inside range 2 */
	if (start_1 >= start_2 && start_1 <= limit_2)
		return true;

	/* Range 2 begins inside range 1 */
	return start_2 >= start_1 && start_2 <= limit_1;
}
/* Find a home for mem_handle_aperture, the aperture used for buffer
 * handle management.
 *
 * Starting from the aperture's current base/limit, the candidate range
 * is moved up until it overlaps none of the LDS, scratch or GPUVM
 * apertures of any GPU, or until its base reaches
 * END_NON_CANONICAL_ADDR - 1.
 *
 * @align, @guard_pages: stored in the aperture
 *
 * Returns true if a free range was found; otherwise the aperture's base
 * and limit are cleared and false is returned.
 */
static bool init_mem_handle_aperture(HSAuint32 align, HSAuint32 guard_pages)
{
	uint32_t gpu;

	/* init mem_handle_aperture for buffer handler management */
	mem_handle_aperture.ops = &reserved_aperture_ops;
	mem_handle_aperture.is_cpu_accessible = false;
	mem_handle_aperture.guard_pages = guard_pages;
	mem_handle_aperture.align = align;

	while (PORT_VPTR_TO_UINT64(mem_handle_aperture.base) < END_NON_CANONICAL_ADDR - 1) {
		bool conflict = false;

		/* Stop at the first GPU aperture the candidate collides with */
		for (gpu = 0; gpu < gpu_mem_count && !conflict; gpu++) {
			conflict =
				(gpu_mem[gpu].lds_aperture.base &&
				 two_apertures_overlap(gpu_mem[gpu].lds_aperture.base,
						       gpu_mem[gpu].lds_aperture.limit,
						       mem_handle_aperture.base,
						       mem_handle_aperture.limit)) ||
				(gpu_mem[gpu].scratch_aperture.base &&
				 two_apertures_overlap(gpu_mem[gpu].scratch_aperture.base,
						       gpu_mem[gpu].scratch_aperture.limit,
						       mem_handle_aperture.base,
						       mem_handle_aperture.limit)) ||
				(gpu_mem[gpu].gpuvm_aperture.base &&
				 two_apertures_overlap(gpu_mem[gpu].gpuvm_aperture.base,
						       gpu_mem[gpu].gpuvm_aperture.limit,
						       mem_handle_aperture.base,
						       mem_handle_aperture.limit));
		}

		if (!conflict) {
			pr_info("mem_handle_aperture start %p, mem_handle_aperture limit %p\n",
				mem_handle_aperture.base, mem_handle_aperture.limit);
			return true;
		}

		/* increase base by 1UL<<47 to check next hole; the limit
		 * is derived from the already advanced base
		 */
		mem_handle_aperture.base = VOID_PTR_ADD(mem_handle_aperture.base, (1UL << 47));
		mem_handle_aperture.limit = VOID_PTR_ADD(mem_handle_aperture.base, (1ULL << 47));
	}

	/* set invalid aperture if fail locating a hole for it */
	mem_handle_aperture.limit = 0;
	mem_handle_aperture.base = 0;
	return false;
}
/* Build the memory-management state of the process.
 *
 * @NumNodes: number of topology nodes (GPU and CPU nodes alike)
 *
 * Steps, in order:
 *  1. Read the tuning knobs from the environment (HSA_DISABLE_CACHE,
 *     HSA_USERPTR_FOR_PAGED_MEM, HSA_CHECK_USERPTR, HSA_RESERVE_SVM,
 *     HSA_SVM_GUARD_PAGES, HSA_HIGH_PRECISION_MODE, HSA_MAX_VA_ALIGN).
 *  2. Allocate gpu_mem[] and fill one entry per GPU node from the
 *     topology, opening the DRM render node of each GPU.
 *  3. Query the per-GPU apertures from the kernel driver, record them,
 *     build the peer lists from the IO links and acquire each GPU's VM.
 *  4. If at least one GPU uses canonical GPUVM addresses, set up the
 *     shared SVM apertures and the matching memory policy.
 *  5. Initialize the aperture trees, the memory-handle aperture and the
 *     remapped MMIO page of each GPU that needs it.
 *
 * Returns HSAKMT_STATUS_SUCCESS on success. On any failure everything
 * set up so far is torn down through
 * hsakmt_fmm_destroy_process_apertures() and the failing status is
 * returned. Failing to set up the memory-handle aperture or an MMIO page
 * is only logged.
 */
HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes)
{
	uint32_t i;
	int32_t gpu_mem_id = 0;
	struct kfd_process_device_apertures *process_apertures;
	uint32_t num_of_sysfs_nodes;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	char *disableCache, *pagedUserptr, *checkUserptr, *guardPagesStr, *reserveSvm;
	char *maxVaAlignStr, *mfmaHighPrecisionModeStr;
	unsigned int guardPages = 1;
	uint64_t svm_base = 0, svm_limit = 0;
	uint32_t svm_alignment = 0, mfma_high_precision_mode = 0;

	/* If HSA_DISABLE_CACHE is set to a non-0 value, disable caching */
	disableCache = getenv("HSA_DISABLE_CACHE");
	svm.disable_cache = (disableCache && strcmp(disableCache, "0"));

	/* If HSA_USERPTR_FOR_PAGED_MEM is not set or set to a non-0
	 * value, enable userptr for all paged memory allocations
	 */
	pagedUserptr = getenv("HSA_USERPTR_FOR_PAGED_MEM");
	svm.userptr_for_paged_mem = (!pagedUserptr || strcmp(pagedUserptr, "0"));
	if (hsakmt_use_model)
		svm.userptr_for_paged_mem = false;

	/* If HSA_CHECK_USERPTR is set to a non-0 value, check all userptrs
	 * when they are registered
	 */
	checkUserptr = getenv("HSA_CHECK_USERPTR");
	svm.check_userptr = (checkUserptr && strcmp(checkUserptr, "0"));

	/* If HSA_RESERVE_SVM is set to a non-0 value,
	 * enable packet capture and replay mode.
	 */
	reserveSvm = getenv("HSA_RESERVE_SVM");
	svm.reserve_svm = (reserveSvm && strcmp(reserveSvm, "0"));

	/* Specify number of guard pages for SVM apertures, default is 1 */
	guardPagesStr = getenv("HSA_SVM_GUARD_PAGES");
	if (!guardPagesStr || sscanf(guardPagesStr, "%u", &guardPages) != 1)
		guardPages = 1;

	/* Any value other than "0" requests the high precision mode; it
	 * is only forwarded for GFX950 GPUs further down.
	 */
	mfmaHighPrecisionModeStr = getenv("HSA_HIGH_PRECISION_MODE");
	mfma_high_precision_mode = (mfmaHighPrecisionModeStr &&
				    strcmp(mfmaHighPrecisionModeStr, "0"));

	/* Sets the max VA alignment order size during mapping. By default the order
	 * size is set to 18(1G) for GFX950 to reduce TLB misses. If any non-gfx950
	 * ASIC is found in the system, set back to 9(2MB).
	 */
	maxVaAlignStr = getenv("HSA_MAX_VA_ALIGN");
	if (!maxVaAlignStr || sscanf(maxVaAlignStr, "%u", &svm.alignment_order) != 1) {
		svm.alignment_order = 18;
		for (i = 0; i < NumNodes; i++) {
			if (hsakmt_get_gfxv_by_node_id(i) != GFX_VERSION_GFX950) {
				svm.alignment_order = 9;
				break;
			}
		}
	}
	pr_info("SVM alignment default order is %d.\n", svm.alignment_order);

	gpu_mem_count = 0;
	g_first_gpu_mem = NULL;

	/* Trade off - NumNodes includes GPU nodes + CPU Node. So in
	 * systems with CPU node, slightly more memory is allocated than
	 * necessary
	 */
	gpu_mem = (gpu_mem_t *)calloc(NumNodes, sizeof(gpu_mem_t));
	if (!gpu_mem)
		return HSAKMT_STATUS_NO_MEMORY;

	/* Initialize gpu_mem[] from sysfs topology. Rest of the members are
	 * set to 0 by calloc. This is necessary because this function
	 * gets called before hsaKmtAcquireSystemProperties() is called.
	 */
	hsakmt_is_dgpu = false;

	for (i = 0; i < NumNodes; i++) {
		HsaNodeProperties props;

		ret = hsakmt_topology_get_node_props(i, &props);
		if (ret != HSAKMT_STATUS_SUCCESS)
			goto gpu_mem_init_failed;

		hsakmt_topology_setup_is_dgpu_param(&props);

		/* Skip non-GPU nodes */
		if (props.KFDGpuID) {
			int fd = hsakmt_open_drm_render_device(props.DrmRenderMinor);

			if (fd <= 0) {
				ret = HSAKMT_STATUS_ERROR;
				goto gpu_mem_init_failed;
			}

			gpu_mem[gpu_mem_count].drm_render_minor = props.DrmRenderMinor;

			/* The peer list can hold at most one entry per node;
			 * its first entry is always the GPU itself.
			 */
			gpu_mem[gpu_mem_count].usable_peer_id_array =
				calloc(NumNodes, sizeof(uint32_t));
			if (!gpu_mem[gpu_mem_count].usable_peer_id_array) {
				ret = HSAKMT_STATUS_NO_MEMORY;
				goto gpu_mem_init_failed;
			}
			gpu_mem[gpu_mem_count].usable_peer_id_array[0] = props.KFDGpuID;
			gpu_mem[gpu_mem_count].usable_peer_id_num = 1;

			gpu_mem[gpu_mem_count].EngineId.ui32.Major = props.EngineId.ui32.Major;
			gpu_mem[gpu_mem_count].EngineId.ui32.Minor = props.EngineId.ui32.Minor;
			gpu_mem[gpu_mem_count].EngineId.ui32.Stepping = props.EngineId.ui32.Stepping;

			gpu_mem[gpu_mem_count].drm_render_fd = fd;
			gpu_mem[gpu_mem_count].gpu_id = props.KFDGpuID;
			gpu_mem[gpu_mem_count].local_mem_size = props.LocalMemSize;
			gpu_mem[gpu_mem_count].device_id = props.DeviceId;
			gpu_mem[gpu_mem_count].node_id = i;
			hsakmt_is_svm_api_supported &= props.Capability.ui32.SVMAPISupported;

			gpu_mem[gpu_mem_count].scratch_physical.align = PAGE_SIZE;
			gpu_mem[gpu_mem_count].scratch_physical.ops = &reserved_aperture_ops;
			pthread_mutex_init(&gpu_mem[gpu_mem_count].scratch_physical.fmm_mutex, NULL);

			gpu_mem[gpu_mem_count].gpuvm_aperture.align =
				get_vm_alignment(props.DeviceId);
			gpu_mem[gpu_mem_count].gpuvm_aperture.guard_pages = guardPages;
			gpu_mem[gpu_mem_count].gpuvm_aperture.ops = &reserved_aperture_ops;
			pthread_mutex_init(&gpu_mem[gpu_mem_count].gpuvm_aperture.fmm_mutex, NULL);

			if (!g_first_gpu_mem)
				g_first_gpu_mem = &gpu_mem[gpu_mem_count];

			gpu_mem_count++;
		}
	}

	/* The ioctl will also return Number of Nodes if
	 * args.kfd_process_device_apertures_ptr is set to NULL. This is not
	 * required since Number of nodes is already known. Kernel will fill in
	 * the apertures in kfd_process_device_apertures_ptr
	 */
	num_of_sysfs_nodes = hsakmt_get_num_sysfs_nodes();
	if (num_of_sysfs_nodes < gpu_mem_count) {
		ret = HSAKMT_STATUS_ERROR;
		goto sysfs_parse_failed;
	}

	process_apertures = calloc(num_of_sysfs_nodes, sizeof(struct kfd_process_device_apertures));
	if (!process_apertures) {
		ret = HSAKMT_STATUS_NO_MEMORY;
		goto sysfs_parse_failed;
	}

	/* GPU Resource management can disable some of the GPU nodes.
	 * The Kernel driver could be not aware of this.
	 * Get from Kernel driver information of all the nodes and then filter it.
	 */
	ret = get_process_apertures(process_apertures, &num_of_sysfs_nodes);
	if (ret != HSAKMT_STATUS_SUCCESS)
		goto get_aperture_ioctl_failed;

	/* all_gpu_id_array_size counts entries while the array is being
	 * filled and is converted to bytes after the loop.
	 */
	all_gpu_id_array_size = 0;
	all_gpu_id_array = NULL;
	if (num_of_sysfs_nodes > 0) {
		all_gpu_id_array = malloc(sizeof(uint32_t) * gpu_mem_count);
		if (!all_gpu_id_array) {
			ret = HSAKMT_STATUS_NO_MEMORY;
			goto get_aperture_ioctl_failed;
		}
	}

	for (i = 0 ; i < num_of_sysfs_nodes ; i++) {
		HsaNodeProperties nodeProps;
		HsaIoLinkProperties linkProps[NumNodes];
		uint32_t nodeId;
		uint32_t j;

		/* Map Kernel process device data node i <--> gpu_mem_id which
		 * indexes into gpu_mem[] based on gpu_id
		 */
		gpu_mem_id = gpu_mem_find_by_gpu_id(process_apertures[i].gpu_id);
		if (gpu_mem_id < 0)
			continue;

		if (all_gpu_id_array_size == gpu_mem_count) {
			ret = HSAKMT_STATUS_ERROR;
			goto aperture_init_failed;
		}
		all_gpu_id_array[all_gpu_id_array_size++] = process_apertures[i].gpu_id;

		/* Add this GPU to the usable_peer_id_arrays of all GPUs that
		 * this GPU has an IO link to. This GPU can map memory
		 * allocated on those GPUs.
		 */
		nodeId = gpu_mem[gpu_mem_id].node_id;
		ret = hsakmt_topology_get_node_props(nodeId, &nodeProps);
		if (ret != HSAKMT_STATUS_SUCCESS)
			goto aperture_init_failed;
		assert(nodeProps.NumIOLinks <= NumNodes);
		ret = hsakmt_topology_get_iolink_props(nodeId, nodeProps.NumIOLinks,
						       linkProps);
		if (ret != HSAKMT_STATUS_SUCCESS)
			goto aperture_init_failed;
		for (j = 0; j < nodeProps.NumIOLinks; j++) {
			int32_t to_gpu_mem_id =
				gpu_mem_find_by_node_id(linkProps[j].NodeTo);
			uint32_t peer;

			if (to_gpu_mem_id < 0)
				continue;

			assert(gpu_mem[to_gpu_mem_id].usable_peer_id_num < NumNodes);
			peer = gpu_mem[to_gpu_mem_id].usable_peer_id_num++;
			gpu_mem[to_gpu_mem_id].usable_peer_id_array[peer] =
				gpu_mem[gpu_mem_id].gpu_id;
		}

		gpu_mem[gpu_mem_id].lds_aperture.base =
			PORT_UINT64_TO_VPTR(process_apertures[i].lds_base);
		gpu_mem[gpu_mem_id].lds_aperture.limit =
			PORT_UINT64_TO_VPTR(process_apertures[i].lds_limit);

		gpu_mem[gpu_mem_id].scratch_aperture.base =
			PORT_UINT64_TO_VPTR(process_apertures[i].scratch_base);
		gpu_mem[gpu_mem_id].scratch_aperture.limit =
			PORT_UINT64_TO_VPTR(process_apertures[i].scratch_limit);

		if (IS_CANONICAL_ADDR(process_apertures[i].gpuvm_limit)) {
			uint64_t vm_alignment = get_vm_alignment(
				gpu_mem[gpu_mem_id].device_id);

			/* Set proper alignment for scratch backing aperture */
			gpu_mem[gpu_mem_id].scratch_physical.align = vm_alignment;

			/* Non-canonical per-ASIC GPUVM aperture does
			 * not exist on dGPUs in GPUVM64 address mode
			 */
			gpu_mem[gpu_mem_id].gpuvm_aperture.base = NULL;
			gpu_mem[gpu_mem_id].gpuvm_aperture.limit = NULL;

			/* Update SVM aperture limits and alignment: the
			 * shared range is the intersection of all GPU
			 * ranges, with the strictest alignment.
			 */
			if (process_apertures[i].gpuvm_base > svm_base)
				svm_base = process_apertures[i].gpuvm_base;
			if (process_apertures[i].gpuvm_limit < svm_limit ||
			    svm_limit == 0)
				svm_limit = process_apertures[i].gpuvm_limit;
			if (vm_alignment > svm_alignment)
				svm_alignment = vm_alignment;
		} else {
			gpu_mem[gpu_mem_id].gpuvm_aperture.base =
				PORT_UINT64_TO_VPTR(process_apertures[i].gpuvm_base);
			gpu_mem[gpu_mem_id].gpuvm_aperture.limit =
				PORT_UINT64_TO_VPTR(process_apertures[i].gpuvm_limit);
			/* Reserve space at the start of the
			 * aperture. After subtracting the base, we
			 * don't want valid pointers to become NULL.
			 */
			aperture_allocate_area(
				&gpu_mem[gpu_mem_id].gpuvm_aperture,
				NULL,
				gpu_mem[gpu_mem_id].gpuvm_aperture.align);
		}

		/* Acquire the VM from the DRM render node for KFD use */
		ret = acquire_vm(gpu_mem[gpu_mem_id].gpu_id,
				 gpu_mem[gpu_mem_id].drm_render_fd);
		if (ret != HSAKMT_STATUS_SUCCESS)
			goto aperture_init_failed;
	}
	all_gpu_id_array_size *= sizeof(uint32_t);

	if (svm_limit) {
		/* At least one GPU uses GPUVM in canonical address
		 * space. Set up SVM apertures shared by all such GPUs
		 */
		ret = init_svm_apertures(svm_base, svm_limit, svm_alignment,
					 guardPages);
		if (ret != HSAKMT_STATUS_SUCCESS)
			goto init_svm_failed;

		for (i = 0 ; i < num_of_sysfs_nodes ; i++) {
			uintptr_t alt_base;
			uint64_t alt_size;
			int32_t mem_id;
			uint32_t high_precision_mode = 0;
			int err;

			if (!IS_CANONICAL_ADDR(process_apertures[i].gpuvm_limit))
				continue;

			/* i indexes the kernel's aperture list, not the
			 * topology: translate the GPU ID to its node ID
			 * before asking for the GFX version. GPUs without a
			 * gpu_mem entry never get the high precision mode.
			 */
			mem_id = gpu_mem_find_by_gpu_id(process_apertures[i].gpu_id);
			if (mem_id >= 0 &&
			    hsakmt_get_gfxv_by_node_id(gpu_mem[mem_id].node_id) ==
			    GFX_VERSION_GFX950)
				high_precision_mode = mfma_high_precision_mode;

			/* Set memory policy to match the SVM apertures */
			alt_base = (uintptr_t)svm.dgpu_alt_aperture->base;
			alt_size = VOID_PTRS_SUB(svm.dgpu_alt_aperture->limit,
				svm.dgpu_alt_aperture->base) + 1;
			err = fmm_set_memory_policy(process_apertures[i].gpu_id,
						    svm.disable_cache ?
						    KFD_IOC_CACHE_POLICY_COHERENT :
						    KFD_IOC_CACHE_POLICY_NONCOHERENT,
						    KFD_IOC_CACHE_POLICY_COHERENT,
						    alt_base, alt_size,
						    high_precision_mode);
			if (err) {
				pr_err("Failed to set mem policy for GPU [0x%x]\n",
				       process_apertures[i].gpu_id);
				ret = HSAKMT_STATUS_ERROR;
				goto set_memory_policy_failed;
			}
		}
	}

	cpuvm_aperture.align = PAGE_SIZE;
	cpuvm_aperture.limit = (void *)0x7FFFFFFFFFFF; /* 2^47 - 1 */

	fmm_init_rbtree();

	/* Not fatal: only logged */
	if (!init_mem_handle_aperture(PAGE_SIZE, guardPages))
		pr_err("Failed to init mem_handle_aperture\n");

	for (gpu_mem_id = 0; (uint32_t)gpu_mem_id < gpu_mem_count; gpu_mem_id++) {
		if (!hsakmt_topology_is_svm_needed(gpu_mem[gpu_mem_id].EngineId))
			continue;
		gpu_mem[gpu_mem_id].mmio_aperture.base = map_mmio(
				gpu_mem[gpu_mem_id].node_id,
				gpu_mem[gpu_mem_id].gpu_id,
				hsakmt_kfd_fd);
		if (gpu_mem[gpu_mem_id].mmio_aperture.base)
			gpu_mem[gpu_mem_id].mmio_aperture.limit = (void *)
			((char *)gpu_mem[gpu_mem_id].mmio_aperture.base +
			 PAGE_SIZE - 1);
		else
			pr_err("Failed to map remapped mmio page on gpu_mem %d\n",
			       gpu_mem_id);
	}

	free(process_apertures);
	return ret;

aperture_init_failed:
init_svm_failed:
set_memory_policy_failed:
	free(all_gpu_id_array);
	all_gpu_id_array = NULL;
get_aperture_ioctl_failed:
	free(process_apertures);
sysfs_parse_failed:
gpu_mem_init_failed:
	hsakmt_fmm_destroy_process_apertures();
	return ret;
}
/* Undo hsakmt_fmm_init_process_apertures(): release the MMIO pages, then
 * free the GPU ID list, the per-GPU peer lists and gpu_mem[] itself, and
 * reset the associated counters.
 */
void hsakmt_fmm_destroy_process_apertures(void)
{
	/* Must run while gpu_mem[] is still intact */
	release_mmio();

	/* free(NULL) is a no-op, so no guard is needed */
	free(all_gpu_id_array);
	all_gpu_id_array = NULL;
	all_gpu_id_array_size = 0;

	if (gpu_mem) {
		/* Peer lists are freed from the last entry to the first */
		for (; gpu_mem_count > 0; gpu_mem_count--)
			free(gpu_mem[gpu_mem_count - 1].usable_peer_id_array);

		free(gpu_mem);
		gpu_mem = NULL;
	}

	gpu_mem_count = 0;
}
/* Report the base and limit of one aperture of a GPU.
 *
 * @aperture_type:  which aperture to report
 * @gpu_id:         GPU whose aperture is queried
 * @aperture_base:  receives the base address on success
 * @aperture_limit: receives the limit address on success
 *
 * For FMM_SVM a single range is reported, from the base of the
 * fine-grained (alternate) aperture to the limit of the coarse-grained
 * one.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER if @gpu_id is unknown,
 * HSAKMT_STATUS_ERROR if the aperture type is unknown, the aperture is
 * not valid or (FMM_SVM) the SVM apertures were never set up, and
 * HSAKMT_STATUS_SUCCESS otherwise. The outputs are only written on
 * success.
 */
HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSAuint32 gpu_id,
			HSAuint64 *aperture_base, HSAuint64 *aperture_limit)
{
	int32_t slot = gpu_mem_find_by_gpu_id(gpu_id);
	void *base, *limit;

	if (slot < 0)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	/* Pick the bounds first; validation and reporting are common */
	switch (aperture_type) {
	case FMM_GPUVM:
		base = gpu_mem[slot].gpuvm_aperture.base;
		limit = gpu_mem[slot].gpuvm_aperture.limit;
		break;

	case FMM_SCRATCH:
		base = gpu_mem[slot].scratch_aperture.base;
		limit = gpu_mem[slot].scratch_aperture.limit;
		break;

	case FMM_LDS:
		base = gpu_mem[slot].lds_aperture.base;
		limit = gpu_mem[slot].lds_aperture.limit;
		break;

	case FMM_SVM:
		/* The SVM aperture pointers are only assigned when SVM
		 * apertures get initialized; don't dereference them
		 * before that.
		 */
		if (!svm.dgpu_alt_aperture || !svm.dgpu_aperture)
			return HSAKMT_STATUS_ERROR;

		/* Report single SVM aperture, starting at base of
		 * fine-grained, ending at limit of coarse-grained
		 */
		base = svm.dgpu_alt_aperture->base;
		limit = svm.dgpu_aperture->limit;
		break;

	case FMM_MMIO:
		base = gpu_mem[slot].mmio_aperture.base;
		limit = gpu_mem[slot].mmio_aperture.limit;
		break;

	default:
		return HSAKMT_STATUS_ERROR;
	}

	if (!aperture_is_valid(base, limit))
		return HSAKMT_STATUS_ERROR;

	*aperture_base = PORT_VPTR_TO_UINT64(base);
	*aperture_limit = PORT_VPTR_TO_UINT64(limit);
	return HSAKMT_STATUS_SUCCESS;
}
/* Tell whether @id occurs in @ids_array.
 *
 * @ids_array_size is the size of the array in bytes, not the number of
 * entries.
 */
static bool id_in_array(uint32_t id, uint32_t *ids_array,
		uint32_t ids_array_size)
{
	const uint32_t count = ids_array_size / sizeof(uint32_t);
	uint32_t idx = 0;

	while (idx < count) {
		if (ids_array[idx] == id)
			return true;
		idx++;
	}

	return false;
}
/* Drop every ID listed in ids_array from obj->mapped_device_id_array.
 *
 * The surviving IDs are compacted in place and keep their order. Sizes
 * are in bytes. If ids_array is the object's own array, or no ID
 * survives, the array is freed and reset to NULL.
 */
static void remove_device_ids_from_mapped_array(vm_object_t *obj,
		uint32_t *ids_array, uint32_t ids_array_size)
{
	uint32_t src, kept = 0;

	/* Removing the array from itself leaves nothing to keep */
	if (obj->mapped_device_id_array != ids_array) {
		const uint32_t count =
			obj->mapped_device_id_array_size / sizeof(uint32_t);

		for (src = 0; src < count; src++) {
			const uint32_t id = obj->mapped_device_id_array[src];

			if (id_in_array(id, ids_array, ids_array_size))
				continue;
			obj->mapped_device_id_array[kept++] = id;
		}
	}

	obj->mapped_device_id_array_size = kept * sizeof(uint32_t);
	if (kept == 0) {
		/* free(NULL) is a no-op */
		free(obj->mapped_device_id_array);
		obj->mapped_device_id_array = NULL;
	}
}
/* Append ids_array to obj->mapped_device_id_array.
 *
 * IDs already present in the object's array are removed first, so every
 * ID ends up in the array exactly once per call. Sizes are in bytes.
 *
 * If the array cannot be grown, an error is logged and the object keeps
 * the array and size it had after the duplicate removal.
 */
static void add_device_ids_to_mapped_array(vm_object_t *obj,
		uint32_t *ids_array, uint32_t ids_array_size)
{
	uint32_t new_array_size;
	uint32_t *new_array;

	/* Remove any potential duplicated ids */
	remove_device_ids_from_mapped_array(obj, ids_array, ids_array_size);

	new_array_size = obj->mapped_device_id_array_size
			+ ids_array_size;

	/* Nothing mapped and nothing to add: avoid a zero-byte realloc,
	 * whose result is implementation-defined.
	 */
	if (!new_array_size)
		return;

	/* Keep the old pointer until realloc is known to have succeeded;
	 * on failure the original block is still valid and still owned by
	 * the object.
	 */
	new_array = (uint32_t *)realloc(obj->mapped_device_id_array,
					new_array_size);
	if (!new_array) {
		pr_err("Failed to allocate memory for mapped device ID array.\n");
		return;
	}
	obj->mapped_device_id_array = new_array;

	if (ids_array_size)
		memcpy(&obj->mapped_device_id_array
			[obj->mapped_device_id_array_size/sizeof(uint32_t)],
			ids_array, ids_array_size);

	obj->mapped_device_id_array_size = new_array_size;
}
/* Map a memory object to GPUs.
 *
 * If nodes_to_map is not NULL, map the nodes specified; otherwise map
 * the GPUs the object is registered to, or - if it is not registered to
 * any - the usable peers of the object's node (falling back to all
 * GPUs).
 *
 * @aperture:         aperture the object lives in
 * @address:          address used to look up the object when @obj is NULL
 * @size:             not used by this function
 * @obj:              the object, if the caller already holds it. In that
 *                    case the aperture lock is not taken here; otherwise
 *                    it is taken for the duration of the call.
 * @nodes_to_map:     GPU IDs to map to, or NULL
 * @nodes_array_size: size of @nodes_to_map in bytes
 *
 * A userptr object that is already mapped only gets its mapping count
 * incremented.
 *
 * Returns HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_HANDLE if no
 * object is found at @address, or HSAKMT_STATUS_ERROR if mapping a
 * handle failed (handles mapped before the failure are unmapped again).
 */
static HSAKMT_STATUS _fmm_map_to_gpu(manageable_aperture_t *aperture,
			void *address, uint64_t size, vm_object_t *obj,
			uint32_t *nodes_to_map, uint32_t nodes_array_size)
{
	struct kfd_ioctl_map_memory_to_gpu_args args = {0};
	vm_object_t *object;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	int ret_ioctl;
	uint32_t i;

	if (!obj)
		pthread_mutex_lock(&aperture->fmm_mutex);

	object = obj;
	if (!object) {
		/* Find the object to retrieve the handle */
		object = vm_find_object_by_address(aperture, address, 0);
		if (!object) {
			ret = HSAKMT_STATUS_INVALID_HANDLE;
			goto err_object_not_found;
		}
	}

	/* For a memory region that is registered by user pointer, changing
	 * mapping nodes is not allowed, so we don't need to check the mapping
	 * nodes or map if it's already mapped. Just increase the reference.
	 */
	if (object->userptr && object->mapping_count) {
		++object->mapping_count;
		goto exit_ok;
	}

	if (nodes_to_map) {
	/* If specified, map the requested */
		args.device_ids_array_ptr = (uint64_t)nodes_to_map;
		args.n_devices = nodes_array_size / sizeof(uint32_t);
	} else if (object->registered_device_id_array_size > 0) {
	/* otherwise map all registered */
		args.device_ids_array_ptr =
			(uint64_t)object->registered_device_id_array;
		args.n_devices = object->registered_device_id_array_size /
			sizeof(uint32_t);
	} else {
	/* not specified, not registered: map all GPUs */
		/* Use the resolved object here: obj is NULL whenever the
		 * object was looked up by address above.
		 */
		int32_t gpu_mem_id = gpu_mem_find_by_node_id(object->node_id);

		if (!object->userptr && hsakmt_get_device_id_by_node_id(object->node_id) &&
		    gpu_mem_id >= 0) {
			args.device_ids_array_ptr = (uint64_t)
				gpu_mem[gpu_mem_id].usable_peer_id_array;
			args.n_devices =
				gpu_mem[gpu_mem_id].usable_peer_id_num;
		} else {
			args.device_ids_array_ptr = (uint64_t)all_gpu_id_array;
			args.n_devices = all_gpu_id_array_size / sizeof(uint32_t);
		}
	}

	/* Map every handle of the object to the same set of devices */
	for (i = 0; i < object->handle_num; i++) {
		args.n_success = 0;
		args.handle = object->handles[i];
		ret_ioctl = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args);
		if (ret_ioctl) {
			pr_err("GPU mapping failed (%d) for obj at %p, userptr %p, size %lu\n",
			       ret_ioctl, object->start, object->userptr, object->size);
			ret = HSAKMT_STATUS_ERROR;
			goto err_map_failed;
		}
	}

	/* Record the devices reported as mapped by the last ioctl */
	add_device_ids_to_mapped_array(object,
				(uint32_t *)args.device_ids_array_ptr,
				args.n_success * sizeof(uint32_t));
	print_device_id_array((uint32_t *)object->mapped_device_id_array,
			      object->mapped_device_id_array_size);

	object->mapping_count = 1;
	/* Mapping changed and lifecycle of object->mapped_node_id_array
	 * terminates here. Free it and allocate on next query
	 */
	if (object->mapped_node_id_array) {
		free(object->mapped_node_id_array);
		object->mapped_node_id_array = NULL;
	}

err_map_failed:
	/* On failure, undo the handles mapped before the failing one; on
	 * success ret is 0 and the loop does not run.
	 */
	while (ret && i--) {
		args.handle = object->handles[i];
		hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);
	}
exit_ok:
err_object_not_found:
	if (!obj)
		pthread_mutex_unlock(&aperture->fmm_mutex);

	return ret;
}
/* Back a range of a GPU's scratch aperture with memory and map it to
 * that GPU.
 *
 * @gpu_id:   GPU owning the scratch aperture
 * @aperture: scratch backing aperture the range must lie in
 * @address:  start of the range
 * @size:     size of the range in bytes
 *
 * Nothing is done (and success is returned) when hsakmt_is_dgpu is not
 * set. The backing memory is GTT when hsakmt_debug_get_reg_status()
 * reports a debugger for the node, VRAM otherwise.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for an unknown GPU or a range
 * outside the aperture, HSAKMT_STATUS_INVALID_HANDLE if the memory object
 * could not be allocated, HSAKMT_STATUS_ERROR if the CPU mapping failed,
 * or the status of the GPU mapping.
 */
static HSAKMT_STATUS _fmm_map_to_gpu_scratch(uint32_t gpu_id, manageable_aperture_t *aperture,
					     void *address, uint64_t size)
{
	const int32_t gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
	uint64_t mmap_offset = 0;
	HSAKMT_STATUS status;
	bool debugger_attached;
	uint32_t alloc_flags;
	vm_object_t *obj;

	if (gpu_mem_id < 0)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	/* Nothing to do on APU */
	if (!hsakmt_is_dgpu)
		return HSAKMT_STATUS_SUCCESS;

	/* sanity check the address */
	if (address < aperture->base ||
	    VOID_PTR_ADD(address, size - 1) > aperture->limit)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	debugger_attached = hsakmt_debug_get_reg_status(gpu_mem[gpu_mem_id].node_id);
	alloc_flags = debugger_attached ? KFD_IOC_ALLOC_MEM_FLAGS_GTT :
					  KFD_IOC_ALLOC_MEM_FLAGS_VRAM;
	alloc_flags |= KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE;

	/* allocate object within the scratch backing aperture */
	obj = fmm_allocate_memory_object(gpu_id, address, size,
					 aperture, &mmap_offset, alloc_flags);
	if (!obj)
		return HSAKMT_STATUS_INVALID_HANDLE;

	/* Create a CPU mapping for the debugger */
	if (fmm_map_to_cpu(address, size, debugger_attached,
			   gpu_mem[gpu_mem_id].drm_render_fd,
			   mmap_offset) == MAP_FAILED) {
		__fmm_release(obj, aperture);
		return HSAKMT_STATUS_ERROR;
	}

	/* map to this GPU only; the object is looked up by address */
	status = _fmm_map_to_gpu(aperture, address, size, NULL, &gpu_id, sizeof(uint32_t));
	if (status != HSAKMT_STATUS_SUCCESS)
		__fmm_release(obj, aperture);

	return status;
}
/* Map user memory to GPUs.
 *
 * @addr:             user address, not necessarily page aligned
 * @size:             size in bytes
 * @gpuvm_addr:       if not NULL, receives the GPU address of @addr on
 *                    success (mapping start plus the offset of @addr
 *                    within its page)
 * @object:           registered userptr object, or NULL
 * @nodes_to_map:     GPU IDs to map to, or NULL for all GPUs (only used
 *                    without an object)
 * @nodes_array_size: size of @nodes_to_map in bytes
 *
 * With an object, the object is mapped through _fmm_map_to_gpu().
 * Without one, the page-aligned range around @addr is mapped through the
 * SVM API, which must be supported; otherwise HSAKMT_STATUS_ERROR is
 * returned.
 */
static HSAKMT_STATUS _fmm_map_to_gpu_userptr(void *addr, uint64_t size,
			uint64_t *gpuvm_addr, vm_object_t *object,
			uint32_t *nodes_to_map, uint32_t nodes_array_size)
{
	manageable_aperture_t *aperture = svm.dgpu_aperture;
	const HSAuint32 page_offset = (HSAuint64)addr & (PAGE_SIZE-1);
	HSAKMT_STATUS status;
	void *svm_addr;

	if (!object && !hsakmt_is_svm_api_supported) {
		pr_err("Object is null and SVM API is not supported.\n");
		return HSAKMT_STATUS_ERROR;
	}

	if (object) {
		svm_addr = object->start;
		status = _fmm_map_to_gpu(aperture, svm_addr, object->size, object, NULL, 0);
	} else {
		/* Map whole pages, starting at the page containing addr */
		svm_addr = (void*)((HSAuint64)addr - page_offset);
		if (!nodes_to_map) {
			nodes_to_map = all_gpu_id_array;
			nodes_array_size = all_gpu_id_array_size;
		}
		pr_debug("%s Mapping Address %p size aligned: %ld offset: %x\n",
			__func__, svm_addr, PAGE_ALIGN_UP(page_offset + size), page_offset);
		status = fmm_map_mem_svm_api(svm_addr,
					     PAGE_ALIGN_UP(page_offset + size),
					     nodes_to_map,
					     nodes_array_size / sizeof(uint32_t));
	}

	/* Return the GPUVM address adjusted by the offset from the start
	 * of the page
	 */
	if (gpuvm_addr && status == HSAKMT_STATUS_SUCCESS)
		*gpuvm_addr = (uint64_t)svm_addr + page_offset;

	return status;
}
/* Map the memory at @address to GPUs.
 *
 * @address:       start of the memory to map
 * @size:          size in bytes
 * @gpuvm_address: if not NULL, may receive an alternate GPU address (see
 *                 the individual paths below)
 *
 * Dispatches on where @address lives:
 *  - scratch aperture of a GPU: _fmm_map_to_gpu_scratch()
 *  - no object found and SVM API unsupported: on APUs the memory is only
 *    checked via fmm_check_user_memory(), on dGPUs this is an error
 *  - CPUVM aperture: only checked via fmm_check_user_memory()
 *  - userptr object, or no object with SVM API support:
 *    _fmm_map_to_gpu_userptr()
 *  - any other aperture: _fmm_map_to_gpu()
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for objects without a backing
 * handle, for objects in mem_handle_aperture and for unknown addresses on
 * dGPUs without SVM API; otherwise the status of the selected path.
 */
HSAKMT_STATUS hsakmt_fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address)
{
manageable_aperture_t *aperture = NULL;
vm_object_t *object;
HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
gpu_mem_t *gpu_mem_ptr = NULL;
/* Special handling for scratch memory */
gpu_mem_ptr = fmm_is_scratch_aperture(address);
if (gpu_mem_ptr) {
return _fmm_map_to_gpu_scratch(gpu_mem_ptr->gpu_id,
&gpu_mem_ptr->scratch_physical,
address, size);
}
object = vm_find_object(address, size, &aperture);
if (!object && !hsakmt_is_svm_api_supported) {
if (!hsakmt_is_dgpu) {
/* Prefetch memory on APUs with dummy-reads */
fmm_check_user_memory(address, size);
return HSAKMT_STATUS_SUCCESS;
}
pr_err("Object not found at %p\n", address);
return HSAKMT_STATUS_INVALID_PARAMETER;
}
/* Successful vm_find_object returns with the aperture locked */
/* allocate VA only */
if (object && object->handles[0] == 0) {
pthread_mutex_unlock(&aperture->fmm_mutex);
return HSAKMT_STATUS_INVALID_PARAMETER;
}
/* allocate buffer only, should be mapped by GEM API */
if (aperture && (aperture == &mem_handle_aperture)) {
pthread_mutex_unlock(&aperture->fmm_mutex);
return HSAKMT_STATUS_INVALID_PARAMETER;
}
if (aperture && (aperture == &cpuvm_aperture)) {
/* Prefetch memory on APUs with dummy-reads */
fmm_check_user_memory(address, size);
ret = HSAKMT_STATUS_SUCCESS;
} else if ((hsakmt_is_svm_api_supported && !object) || (object && (object->userptr))) {
/* The object (possibly NULL) is passed along, so a found object is not looked up again */
ret = _fmm_map_to_gpu_userptr(address, size, gpuvm_address, object, NULL, 0);
} else if (aperture) {
ret = _fmm_map_to_gpu(aperture, address, size, object, NULL, 0);
/* Update alternate GPUVM address only for
* CPU-invisible apertures on old APUs
*/
if (ret == HSAKMT_STATUS_SUCCESS && gpuvm_address && !aperture->is_cpu_accessible)
*gpuvm_address = VOID_PTRS_SUB(object->start, aperture->base);
}
/* Drop the aperture lock that a successful vm_find_object left held */
if (object)
pthread_mutex_unlock(&aperture->fmm_mutex);
return ret;
}
/* Debug helper: log the contents of a device ID array.
 *
 * @device_id_array_size is in bytes. The function does nothing unless
 * DEBUG_PRINT_APERTURE is defined at build time.
 */
static void print_device_id_array(uint32_t *device_id_array, uint32_t device_id_array_size)
{
#ifdef DEBUG_PRINT_APERTURE
	const uint32_t count = device_id_array_size / sizeof(uint32_t);
	uint32_t pos;

	pr_info("device id array size %d\n", count);

	/* Entries are numbered starting from 1 in the log */
	for (pos = 1; pos <= count; pos++)
		pr_info("%d . 0x%x\n", pos, device_id_array[pos - 1]);
#endif
}
/* Unmap a memory object from GPUs.
 *
 * @aperture:              aperture the object lives in
 * @address:               address inside the object; rounded down to its
 *                         page before the object is looked up
 * @device_ids_array:      GPU IDs to unmap from, or NULL to unmap from
 *                         every GPU the object is recorded as mapped to
 * @device_ids_array_size: size of @device_ids_array in bytes
 * @obj:                   the object, if the caller already holds it. In
 *                         that case the aperture lock is not taken here;
 *                         otherwise it is taken for the duration of the
 *                         call.
 *
 * A userptr object that is mapped more than once only gets its mapping
 * count decremented.
 *
 * Returns 0 on success (including the case where there is nothing to
 * unmap), -1 if no object is found, or the result of the last failing
 * unmap ioctl.
 */
static int _fmm_unmap_from_gpu(manageable_aperture_t *aperture, void *address,
		uint32_t *device_ids_array, uint32_t device_ids_array_size,
		vm_object_t *obj)
{
	struct kfd_ioctl_unmap_memory_from_gpu_args args = {0};
	const HSAuint32 page_offset = (HSAint64)address & (PAGE_SIZE - 1);
	vm_object_t *object = obj;
	int ret = 0;
	uint32_t h;

	if (!obj) {
		pthread_mutex_lock(&aperture->fmm_mutex);

		/* Find the object to retrieve the handle */
		object = vm_find_object_by_address(aperture,
					VOID_PTR_SUB(address, page_offset), 0);
		if (!object) {
			ret = -1;
			goto out;
		}
	}

	if (object->userptr && object->mapping_count > 1) {
		--object->mapping_count;
		goto out;
	}

	if (device_ids_array && device_ids_array_size > 0) {
		args.device_ids_array_ptr = (uint64_t)device_ids_array;
		args.n_devices = device_ids_array_size / sizeof(uint32_t);
	} else if (object->mapped_device_id_array_size > 0) {
		args.device_ids_array_ptr = (uint64_t)object->mapped_device_id_array;
		args.n_devices = object->mapped_device_id_array_size /
			sizeof(uint32_t);
	} else {
		/*
		 * When unmap exits here it should return failing error code as the user tried to
		 * unmap already unmapped buffer. Currently we returns success as KFDTEST and RT
		 * need to deploy the change on there side before thunk fails on this case.
		 */
		ret = 0;
		goto out;
	}
	print_device_id_array((void *)args.device_ids_array_ptr,
			      args.n_devices * sizeof(uint32_t));

	/* Every handle is attempted even if an earlier one failed */
	for (h = 0; h < object->handle_num; h++) {
		int ioctl_ret;

		args.handle = object->handles[h];
		args.n_success = 0;
		ioctl_ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);
		if (ioctl_ret)
			ret = ioctl_ret;
	}

	/* The bookkeeping is left untouched if any unmap failed */
	if (ret)
		goto out;

	remove_device_ids_from_mapped_array(object,
			(uint32_t *)args.device_ids_array_ptr,
			args.n_success * sizeof(uint32_t));

	/* free(NULL) is a no-op */
	free(object->mapped_node_id_array);
	object->mapped_node_id_array = NULL;
	object->mapping_count = 0;

out:
	if (!obj)
		pthread_mutex_unlock(&aperture->fmm_mutex);
	return ret;
}
/*
 * Unmap a scratch buffer from the GPU and release its backing object.
 *
 * @gpu_id:   GPU that owns the scratch aperture
 * @aperture: scratch backing aperture containing the object
 * @address:  start address of the scratch buffer
 *
 * Returns -1 for an unknown @gpu_id, 0 on non-dGPU systems or when the
 * object is not mapped anywhere, -EINVAL if no object exists at @address,
 * the ioctl result if the unmap fails, and otherwise the result of
 * __fmm_release().
 */
static int _fmm_unmap_from_gpu_scratch(uint32_t gpu_id,
				       manageable_aperture_t *aperture,
				       void *address)
{
	struct kfd_ioctl_unmap_memory_from_gpu_args args = {0};
	vm_object_t *object;
	int32_t gpu_mem_id;
	int ret;

	/* Retrieve gpu_mem id according to gpu_id */
	gpu_mem_id = gpu_mem_find_by_gpu_id(gpu_id);
	if (gpu_mem_id < 0)
		return -1;

	/* Nothing to do on APU */
	if (!hsakmt_is_dgpu)
		return 0;

	pthread_mutex_lock(&aperture->fmm_mutex);

	/* Find the object to retrieve the handle and size */
	object = vm_find_object_by_address(aperture, address, 0);
	if (!object) {
		ret = -EINVAL;
		goto unlock;
	}

	/* Not mapped on any device: report success without touching it */
	if (!object->mapped_device_id_array ||
	    object->mapped_device_id_array_size == 0) {
		ret = 0;
		goto unlock;
	}

	/* unmap from GPU */
	args.handle = object->handles[0];
	args.device_ids_array_ptr = (uint64_t)object->mapped_device_id_array;
	args.n_devices = object->mapped_device_id_array_size / sizeof(uint32_t);
	args.n_success = 0;
	ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, &args);

	/* unmap from CPU while keeping the address space reserved */
	mmap(address, object->size, PROT_NONE,
	     MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE | MAP_FIXED,
	     -1, 0);

	/* Bookkeeping is updated even when the ioctl reported an error */
	remove_device_ids_from_mapped_array(object,
					    (uint32_t *)args.device_ids_array_ptr,
					    args.n_success * sizeof(uint32_t));
	if (object->mapped_node_id_array)
		free(object->mapped_node_id_array);
	object->mapped_node_id_array = NULL;

	if (ret)
		goto unlock;

	pthread_mutex_unlock(&aperture->fmm_mutex);

	/* free object in scratch backing aperture */
	return __fmm_release(object, aperture);

unlock:
	pthread_mutex_unlock(&aperture->fmm_mutex);
	return ret;
}
/*
 * Unmap the buffer at @address from the GPU(s) it is mapped on.
 *
 * Scratch addresses are routed to the scratch-specific path. Objects in the
 * CPUVM aperture are left alone and report success. If no object is known at
 * @address, 0 is returned on non-dGPU systems or when the SVM API is
 * supported, and -EINVAL otherwise.
 *
 * Returns 0 on success or a non-zero error code.
 */
int hsakmt_fmm_unmap_from_gpu(void *address)
{
	manageable_aperture_t *aperture;
	vm_object_t *object;
	gpu_mem_t *scratch_owner;
	int ret = 0;

	/* Special handling for scratch memory */
	scratch_owner = fmm_is_scratch_aperture(address);
	if (scratch_owner)
		return _fmm_unmap_from_gpu_scratch(scratch_owner->gpu_id,
						   &scratch_owner->scratch_physical,
						   address);

	object = vm_find_object(address, 0, &aperture);
	if (!object) {
		/* On APUs GPU unmapping of system memory is a no-op */
		if (!hsakmt_is_dgpu || hsakmt_is_svm_api_supported)
			return 0;
		return -EINVAL;
	}

	/* Successful vm_find_object returns with the aperture locked */

	/* On APUs GPU unmapping of system memory is a no-op (ret stays 0) */
	if (aperture != &cpuvm_aperture)
		ret = _fmm_unmap_from_gpu(aperture, address, NULL, 0, object);

	pthread_mutex_unlock(&aperture->fmm_mutex);
	return ret;
}
bool hsakmt_fmm_get_handle(void *address, uint64_t *handle)
{
uint32_t i;
manageable_aperture_t *aperture;
vm_object_t *object;
bool found;
found = false;
aperture = NULL;
/* Find the aperture the requested address belongs to */
for (i = 0; i < gpu_mem_count; i++) {
if (gpu_mem[i].gpu_id == NON_VALID_GPU_ID)
continue;
if ((address >= gpu_mem[i].gpuvm_aperture.base) &&
(address <= gpu_mem[i].gpuvm_aperture.limit)) {
aperture = &gpu_mem[i].gpuvm_aperture;
break;
}
}
if (!aperture) {
if ((address >= svm.dgpu_aperture->base) &&
(address <= svm.dgpu_aperture->limit)) {
aperture = svm.dgpu_aperture;
} else if ((address >= svm.dgpu_alt_aperture->base) &&
(address <= svm.dgpu_alt_aperture->limit)) {
aperture = svm.dgpu_alt_aperture;
}
}
if (!aperture)
return false;
pthread_mutex_lock(&aperture->fmm_mutex);
/* Find the object to retrieve the handle */
object = vm_find_object_by_address(aperture, address, 0);
if (object && handle) {
*handle = object->handles[0];
found = true;
}
pthread_mutex_unlock(&aperture->fmm_mutex);
return found;
}
/*
 * Create a userptr buffer object for a range of user memory and register it
 * in the SVM dGPU aperture's userptr tree.
 *
 * @addr:         start of the user memory (need not be page aligned)
 * @size:         length of the range in bytes
 * @obj_ret:      optional; receives the object representing the range
 * @coarse_grain: when false the BO is allocated with the COHERENT flag
 * @ext_coherent: when true the BO is allocated with the EXT_COHERENT flag
 *
 * The BO covers the page-aligned range enclosing [@addr, @addr + @size).
 * If another thread registered the same userptr in the meantime, that
 * existing object gets its registration count bumped and the freshly
 * allocated one is released again.
 *
 * Returns HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_ERROR when there is no
 * first GPU or the allocation fails.
 */
static HSAKMT_STATUS fmm_register_user_memory(void *addr,
					      HSAuint64 size,
					      vm_object_t **obj_ret,
					      bool coarse_grain,
					      bool ext_coherent)
{
	manageable_aperture_t *aperture = svm.dgpu_aperture;
	HSAuint32 page_offset = (HSAuint64)addr & (PAGE_SIZE-1);
	HSAuint64 aligned_addr = (HSAuint64)addr - page_offset;
	HSAuint64 aligned_size = PAGE_ALIGN_UP(page_offset + size);
	vm_object_t *obj, *duplicate;
	HSAuint32 gpu_id;
	void *svm_addr;

	/* Find first GPU for creating the userptr BO */
	if (!g_first_gpu_mem)
		return HSAKMT_STATUS_ERROR;
	gpu_id = g_first_gpu_mem->gpu_id;

	/* Optionally check that the CPU mapping is valid */
	if (svm.check_userptr)
		fmm_check_user_memory(addr, size);

	/* Allocate BO, userptr address is passed in mmap_offset */
	svm_addr = __fmm_allocate_device(gpu_id, NULL, aligned_size, aperture,
			 &aligned_addr, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR |
			 KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
			 KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
			 (coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) |
			 (ext_coherent ? KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0),
			 0,
			 &obj);
	if (!svm_addr || !obj)
		return HSAKMT_STATUS_ERROR;

	pthread_mutex_lock(&aperture->fmm_mutex);

	/* catch the race condition where some other thread added the userptr
	 * object already after the vm_find_object.
	 */
	duplicate = vm_find_object_by_userptr(aperture, addr, size);
	if (!duplicate) {
		/* First registration: fill in the userptr fields and index it */
		obj->userptr = addr;
		hsakmt_gpuid_to_nodeid(gpu_id, &obj->node_id);
		obj->userptr_size = size;
		obj->registration_count = 1;
		obj->user_node.key = rbtree_key((unsigned long)addr, size);
		hsakmt_rbtree_insert(&aperture->user_tree, &obj->user_node);
	} else {
		++duplicate->registration_count;
	}

	pthread_mutex_unlock(&aperture->fmm_mutex);

	/* Lost the race: drop the new BO and hand out the existing object */
	if (duplicate) {
		__fmm_release(obj, aperture);
		obj = duplicate;
	}

	if (obj_ret)
		*obj_ret = obj;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Register a memory range for GPU access, optionally restricted to a set
 * of GPUs.
 *
 * address, size_in_bytes: range to register
 * gpu_id_array:           optional list of GPU IDs
 * gpu_id_array_size:      size of gpu_id_array in bytes (it is used as the
 *                         memcmp length below)
 * coarse_grain, ext_coherent: forwarded to the userptr / SVM registration;
 *                         requesting both is rejected
 *
 * Ownership of gpu_id_array as visible in this function: when it is stored
 * in the object the object keeps the pointer; when the range is already
 * registered with an identical array the new array is freed here; on the
 * error returns it is left untouched.
 *
 * Returns HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER for bad
 * arguments, HSAKMT_STATUS_INVALID_HANDLE when an existing object is not a
 * userptr, HSAKMT_STATUS_MEMORY_ALREADY_REGISTERED when the node list would
 * change, or the error from fmm_register_user_memory().
 */
HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
uint32_t *gpu_id_array,
uint32_t gpu_id_array_size,
bool coarse_grain,
bool ext_coherent)
{
manageable_aperture_t *aperture = NULL;
vm_object_t *object = NULL;
HSAKMT_STATUS ret;
if (gpu_id_array_size > 0 && !gpu_id_array)
return HSAKMT_STATUS_INVALID_PARAMETER;
if (coarse_grain && ext_coherent)
return HSAKMT_STATUS_INVALID_PARAMETER;
object = vm_find_object(address, size_in_bytes, &aperture);
if (!object) {
if (!hsakmt_is_dgpu)
/* System memory registration on APUs is a no-op */
return HSAKMT_STATUS_SUCCESS;
/* Register a new user ptr */
if (hsakmt_is_svm_api_supported) {
ret = fmm_register_mem_svm_api(address,
size_in_bytes,
coarse_grain,
ext_coherent);
if (ret == HSAKMT_STATUS_SUCCESS)
return ret;
pr_debug("SVM failed, falling back to old registration\n");
}
ret = fmm_register_user_memory(address,
size_in_bytes,
&object,
coarse_grain,
ext_coherent);
if (ret != HSAKMT_STATUS_SUCCESS)
return ret;
if (gpu_id_array_size == 0)
return HSAKMT_STATUS_SUCCESS;
/* No lock is held on this path yet (vm_find_object failed), so take
 * the SVM aperture lock to match the state of the other branches.
 */
aperture = svm.dgpu_aperture;
pthread_mutex_lock(&aperture->fmm_mutex);
/* fall through for registered device ID array setup */
} else if (object->userptr) {
/* Update an existing userptr */
/* NOTE(review): this increment is not undone if the function later
 * returns HSAKMT_STATUS_MEMORY_ALREADY_REGISTERED - confirm that
 * this is intended.
 */
++object->registration_count;
} else {
/* Not a userptr when we are expecting one */
pthread_mutex_unlock(&aperture->fmm_mutex);
return HSAKMT_STATUS_INVALID_HANDLE;
}
/* Successful vm_find_object returns with aperture locked */
if (object->registered_device_id_array_size > 0) {
/* Multiple registration is allowed, but not changing nodes */
if ((gpu_id_array_size != object->registered_device_id_array_size)
|| memcmp(object->registered_device_id_array,
gpu_id_array, gpu_id_array_size)) {
pr_err("Cannot change nodes in a registered addr.\n");
pthread_mutex_unlock(&aperture->fmm_mutex);
return HSAKMT_STATUS_MEMORY_ALREADY_REGISTERED;
} else {
/* Delete the new array, keep the existing one. */
if (gpu_id_array)
free(gpu_id_array);
pthread_mutex_unlock(&aperture->fmm_mutex);
return HSAKMT_STATUS_SUCCESS;
}
}
if (gpu_id_array_size > 0) {
/* The object keeps the caller's array pointer from here on */
object->registered_device_id_array = gpu_id_array;
object->registered_device_id_array_size = gpu_id_array_size;
/* Registration of object changed. Lifecycle of object->
* registered_node_id_array terminates here. Free old one
* and re-allocate on next query
*/
if (object->registered_node_id_array) {
free(object->registered_node_id_array);
object->registered_node_id_array = NULL;
}
}
pthread_mutex_unlock(&aperture->fmm_mutex);
return HSAKMT_STATUS_SUCCESS;
}
/* Size of the first metadata buffer tried when querying a DMA-buf */
#define GRAPHICS_METADATA_DEFAULT_SIZE 64

/*
 * Import a graphics DMA-buf into KFD and track it as a vm_object.
 *
 * @GraphicsResourceHandle: DMA-buf file descriptor to import
 * @GraphicsResourceInfo:   filled with address, size, metadata and node on
 *                          success
 * @gpu_id_array:           optional list of GPU IDs recorded in the object
 * @gpu_id_array_size:      size of @gpu_id_array in bytes
 * @RegisterFlags:          requiresVAddr forces a virtual address even when
 *                          no GPU list is given
 *
 * Without a GPU list and without requiresVAddr the buffer is placed in the
 * handle-only aperture and imported with a VA of 0. Otherwise it goes to the
 * SVM dGPU aperture or to the GPU's own GPUVM aperture, depending on
 * hsakmt_topology_is_svm_needed().
 *
 * On success the metadata buffer and @gpu_id_array are stored in the object.
 * On failure the metadata buffer is freed and everything acquired so far is
 * released; @gpu_id_array is not touched.
 *
 * Returns HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER,
 * HSAKMT_STATUS_NO_MEMORY or HSAKMT_STATUS_ERROR.
 */
HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle,
						  HsaGraphicsResourceInfo *GraphicsResourceInfo,
						  uint32_t *gpu_id_array,
						  uint32_t gpu_id_array_size,
						  HSA_REGISTER_MEM_FLAGS RegisterFlags)
{
	struct kfd_ioctl_get_dmabuf_info_args infoArgs = {0};
	struct kfd_ioctl_import_dmabuf_args importArgs = {0};
	struct kfd_ioctl_free_memory_of_gpu_args freeArgs = {0};
	manageable_aperture_t *aperture;
	HsaMemFlags mflags;
	vm_object_t *obj;
	void *metadata;
	void *mem = NULL, *aperture_base = NULL;
	int32_t gpu_mem_id;
	int r;
	HSAKMT_STATUS status = HSAKMT_STATUS_ERROR;
	static const uint64_t IMAGE_ALIGN = 256*1024;

	if (gpu_id_array_size > 0 && !gpu_id_array)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	/* Query the buffer, starting with a default-sized metadata buffer */
	infoArgs.dmabuf_fd = GraphicsResourceHandle;
	infoArgs.metadata_size = GRAPHICS_METADATA_DEFAULT_SIZE;
	metadata = calloc(infoArgs.metadata_size, 1);
	if (!metadata)
		return HSAKMT_STATUS_NO_MEMORY;
	infoArgs.metadata_ptr = (uint64_t)metadata;
	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_DMABUF_INFO, (void *)&infoArgs);
	if (r && infoArgs.metadata_size > GRAPHICS_METADATA_DEFAULT_SIZE) {
		/* Try again with bigger metadata */
		free(metadata);
		metadata = calloc(infoArgs.metadata_size, 1);
		if (!metadata)
			return HSAKMT_STATUS_NO_MEMORY;
		infoArgs.metadata_ptr = (uint64_t)metadata;
		r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_DMABUF_INFO, (void *)&infoArgs);
	}

	if (r)
		goto error_free_metadata;

	/* Choose aperture based on GPU and allocate virtual address */
	gpu_mem_id = gpu_mem_find_by_gpu_id(infoArgs.gpu_id);
	if (gpu_mem_id < 0)
		goto error_free_metadata;

	/* import DMA buffer without VA assigned */
	if (!gpu_id_array && gpu_id_array_size == 0 && !RegisterFlags.ui32.requiresVAddr) {
		aperture = &mem_handle_aperture;
	} else if (hsakmt_topology_is_svm_needed(gpu_mem[gpu_mem_id].EngineId)) {
		aperture = svm.dgpu_aperture;
	} else {
		aperture = &gpu_mem[gpu_mem_id].gpuvm_aperture;
		/* Only this aperture passes a base-relative VA to the import */
		aperture_base = aperture->base;
	}
	if (!aperture_is_valid(aperture->base, aperture->limit))
		goto error_free_metadata;

	pthread_mutex_lock(&aperture->fmm_mutex);
	mem = aperture_allocate_area_aligned(aperture, NULL, infoArgs.size,
					     IMAGE_ALIGN);
	if (!mem) {
		pthread_mutex_unlock(&aperture->fmm_mutex);
		goto error_free_metadata;
	}

	/* Import DMA buffer */
	if (aperture == &mem_handle_aperture)
		importArgs.va_addr = 0;
	else
		importArgs.va_addr = VOID_PTRS_SUB(mem, aperture_base);
	importArgs.gpu_id = infoArgs.gpu_id;
	importArgs.dmabuf_fd = GraphicsResourceHandle;
	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_IMPORT_DMABUF, (void *)&importArgs);
	if (r) {
		pthread_mutex_unlock(&aperture->fmm_mutex);
		goto error_release_aperture;
	}

	/* Atomically update and register the object */
	mflags = fmm_translate_ioc_to_hsa_flags(infoArgs.flags);
	mflags.ui32.CoarseGrain = 1;
	obj = aperture_allocate_object(aperture, mem, importArgs.handle,
				       infoArgs.size, mflags);
	if (obj) {
		obj->metadata = metadata;
		obj->registered_device_id_array = gpu_id_array;
		obj->registered_device_id_array_size = gpu_id_array_size;
		hsakmt_gpuid_to_nodeid(infoArgs.gpu_id, &obj->node_id);
	}
	pthread_mutex_unlock(&aperture->fmm_mutex);
	if (!obj)
		goto error_release_buffer;

	GraphicsResourceInfo->MemoryAddress = mem;
	GraphicsResourceInfo->SizeInBytes = infoArgs.size;
	GraphicsResourceInfo->Metadata = (void *)(unsigned long)infoArgs.metadata_ptr;
	GraphicsResourceInfo->MetadataSizeInBytes = infoArgs.metadata_size;
	hsakmt_gpuid_to_nodeid(infoArgs.gpu_id, &GraphicsResourceInfo->NodeId);

	return HSAKMT_STATUS_SUCCESS;

error_release_buffer:
	freeArgs.handle = importArgs.handle;
	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &freeArgs) != 0) {
		/* Handle error if memory is not freed properly */
		pr_err("Failed to free GPU memory\n");
	}
error_release_aperture:
	/*
	 * Every path reaching this label has already dropped the aperture
	 * mutex. The area bookkeeping is shared state, so re-take the lock
	 * while returning the reserved range.
	 */
	pthread_mutex_lock(&aperture->fmm_mutex);
	aperture_release_area(aperture, mem, infoArgs.size);
	pthread_mutex_unlock(&aperture->fmm_mutex);
error_free_metadata:
	free(metadata);

	return status;
}
/*
 * Export the buffer object containing a memory range as a DMA-buf.
 *
 * @MemoryAddress:     start of the range; may lie inside an object
 * @MemorySizeInBytes: length of the range
 * @DMABufFd:          receives the exported file descriptor
 * @Offset:            receives the offset of @MemoryAddress from the start
 *                     of the object
 *
 * The whole range must fit inside a single object. The exported descriptor
 * is created with O_CLOEXEC and refers to the object's first handle.
 *
 * Returns HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER when the
 * address belongs to no aperture/object or the range exceeds the object, or
 * HSAKMT_STATUS_ERROR when the export ioctl fails.
 */
HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(void *MemoryAddress,
					   HSAuint64 MemorySizeInBytes,
					   int *DMABufFd,
					   HSAuint64 *Offset)
{
	struct kfd_ioctl_export_dmabuf_args exportArgs = {0};
	manageable_aperture_t *aperture;
	HsaApertureInfo ApeInfo;
	vm_object_t *obj;
	HSAuint64 offset;
	int r;

	aperture = fmm_find_aperture(MemoryAddress, &ApeInfo);
	if (!aperture)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	pthread_mutex_lock(&aperture->fmm_mutex);
	obj = vm_find_object_by_address_range(aperture, MemoryAddress);
	if (obj) {
		offset = VOID_PTRS_SUB(MemoryAddress, obj->start);
		/*
		 * Compare against the remaining space instead of computing
		 * offset + size, which could wrap around for a huge size and
		 * slip past the bounds check.
		 */
		if (offset <= obj->size &&
		    MemorySizeInBytes <= obj->size - offset) {
			/* Copy what the ioctl needs while the lock is held */
			exportArgs.handle = obj->handles[0];
			exportArgs.flags = O_CLOEXEC;
			exportArgs.dmabuf_fd = 0;
		} else {
			obj = NULL;
		}
	}
	pthread_mutex_unlock(&aperture->fmm_mutex);

	if (!obj)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_EXPORT_DMABUF, (void *)&exportArgs);
	if (r)
		return HSAKMT_STATUS_ERROR;

	*DMABufFd = exportArgs.dmabuf_fd;
	*Offset = offset;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Export a buffer object as an IPC handle that another process can import.
 *
 * @MemoryAddress:      start address of the object to share
 * @SizeInBytes:        size recorded in the handle; its page count must fit
 *                      in 32 bits
 * @SharedMemoryHandle: filled with the share handle, aperture info, size in
 *                      pages and exporting GPU ID
 *
 * Everything read from the object (node ID, first handle, memory flags) is
 * copied while the aperture mutex is held, so the object is never
 * dereferenced after the lock has been dropped.
 *
 * Returns HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER for an
 * oversized range or an unknown address, the status of
 * hsakmt_validate_nodeid() if that fails, or HSAKMT_STATUS_ERROR when no
 * GPU is available or the export ioctl fails.
 */
HSAKMT_STATUS hsakmt_fmm_share_memory(void *MemoryAddress,
				      HSAuint64 SizeInBytes,
				      HsaSharedMemoryHandle *SharedMemoryHandle)
{
	int r = 0;
	HSAuint32 gpu_id = 0;
	uint32_t node_id = 0;
	vm_object_t *obj = NULL;
	manageable_aperture_t *aperture = NULL;
	struct kfd_ioctl_ipc_export_handle_args exportArgs = {0};
	HsaApertureInfo ApeInfo;
	HsaSharedMemoryStruct *SharedMemoryStruct =
		to_hsa_shared_memory_struct(SharedMemoryHandle);

	/* SizeInPages is stored in a 32-bit field of the shared handle */
	if (SizeInBytes >= (1ULL << ((sizeof(HSAuint32) * 8) + PAGE_SHIFT)))
		return HSAKMT_STATUS_INVALID_PARAMETER;

	aperture = fmm_find_aperture(MemoryAddress, &ApeInfo);
	if (!aperture)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	pthread_mutex_lock(&aperture->fmm_mutex);
	obj = vm_find_object_by_address(aperture, MemoryAddress, 0);
	if (obj) {
		/* Snapshot the object state needed below under the lock */
		node_id = obj->node_id;
		exportArgs.handle = obj->handles[0];
		exportArgs.flags = obj->mflags.Value;
	}
	pthread_mutex_unlock(&aperture->fmm_mutex);
	if (!obj)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	r = hsakmt_validate_nodeid(node_id, &gpu_id);
	if (r != HSAKMT_STATUS_SUCCESS)
		return r;
	if (!gpu_id && hsakmt_is_dgpu) {
		/* Sharing non paged system memory. Use first GPU which was
		 * used during allocation. See fmm_allocate_host_gpu()
		 */
		if (!g_first_gpu_mem)
			return HSAKMT_STATUS_ERROR;

		gpu_id = g_first_gpu_mem->gpu_id;
	}
	exportArgs.gpu_id = gpu_id;

	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_IPC_EXPORT_HANDLE, (void *)&exportArgs);
	if (r)
		return HSAKMT_STATUS_ERROR;

	memcpy(SharedMemoryStruct->ShareHandle, exportArgs.share_handle,
	       sizeof(SharedMemoryStruct->ShareHandle));
	SharedMemoryStruct->ApeInfo = ApeInfo;
	SharedMemoryStruct->SizeInPages = (HSAuint32) (SizeInBytes >> PAGE_SHIFT);
	SharedMemoryStruct->ExportGpuId = gpu_id;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Import a buffer that another process exported with
 * hsakmt_fmm_share_memory() and track it as a vm_object.
 *
 * @SharedMemoryHandle: handle produced by the exporting process
 * @SizeInBytes:        receives the size of the imported buffer
 * @MemoryAddress:      receives the address reserved for the buffer
 * @gpu_id_array:       optional list of GPU IDs recorded in the object
 * @gpu_id_array_size:  size of @gpu_id_array in bytes
 *
 * Steps: reserve a VA range in the aperture named by the handle, import the
 * IPC handle at that address, create the object and, if the import returned
 * an mmap offset, map the buffer for CPU access.
 *
 * Every failure unwinds exactly what was acquired before it: the object,
 * the imported KFD handle and the reserved VA range. @gpu_id_array is only
 * stored in the object on success.
 *
 * Returns HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER,
 * HSAKMT_STATUS_NO_MEMORY or HSAKMT_STATUS_ERROR.
 */
HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *SharedMemoryHandle,
						HSAuint64 *SizeInBytes,
						void **MemoryAddress,
						uint32_t *gpu_id_array,
						uint32_t gpu_id_array_size)
{
	int r = 0;
	HSAKMT_STATUS err = HSAKMT_STATUS_ERROR;
	vm_object_t *obj = NULL;
	void *reservedMem = NULL;
	manageable_aperture_t *aperture;
	struct kfd_ioctl_ipc_import_handle_args importArgs = {0};
	struct kfd_ioctl_free_memory_of_gpu_args freeArgs = {0};
	const HsaSharedMemoryStruct *SharedMemoryStruct =
		to_const_hsa_shared_memory_struct(SharedMemoryHandle);
	HSAuint64 SizeInPages = SharedMemoryStruct->SizeInPages;
	HsaMemFlags mflags;

	if (gpu_id_array_size > 0 && !gpu_id_array)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	memcpy(importArgs.share_handle, SharedMemoryStruct->ShareHandle,
	       sizeof(importArgs.share_handle));
	importArgs.gpu_id = SharedMemoryStruct->ExportGpuId;

	aperture = fmm_get_aperture(SharedMemoryStruct->ApeInfo);
	if (!aperture)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	pthread_mutex_lock(&aperture->fmm_mutex);
	reservedMem = aperture_allocate_area(aperture, NULL,
					     (SizeInPages << PAGE_SHIFT));
	if (!reservedMem) {
		err = HSAKMT_STATUS_NO_MEMORY;
		goto err_unlock;
	}

	importArgs.va_addr = (uint64_t)reservedMem;
	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_IPC_IMPORT_HANDLE, (void *)&importArgs);
	if (r) {
		err = HSAKMT_STATUS_ERROR;
		goto err_release_area;
	}

	mflags.Value = importArgs.flags;
	obj = aperture_allocate_object(aperture, reservedMem, importArgs.handle,
				       (SizeInPages << PAGE_SHIFT), mflags);
	if (!obj) {
		err = HSAKMT_STATUS_NO_MEMORY;
		goto err_free_handle;
	}

	if (importArgs.mmap_offset) {
		int32_t gpu_mem_id = gpu_mem_find_by_gpu_id(importArgs.gpu_id);
		void *ret;

		if (gpu_mem_id < 0) {
			err = HSAKMT_STATUS_ERROR;
			goto err_remove_object;
		}
		obj->node_id = gpu_mem[gpu_mem_id].node_id;

		/* The CPU mapping is created without the aperture lock held */
		pthread_mutex_unlock(&aperture->fmm_mutex);
		ret = fmm_map_to_cpu(reservedMem, (SizeInPages << PAGE_SHIFT),
				     true, gpu_mem[gpu_mem_id].drm_render_fd,
				     importArgs.mmap_offset);
		if (ret == MAP_FAILED) {
			/* The unwind labels below expect the lock to be held */
			pthread_mutex_lock(&aperture->fmm_mutex);
			err = HSAKMT_STATUS_ERROR;
			goto err_remove_object;
		}
	} else {
		pthread_mutex_unlock(&aperture->fmm_mutex);
	}

	*MemoryAddress = reservedMem;
	*SizeInBytes = (SizeInPages << PAGE_SHIFT);

	if (gpu_id_array_size > 0) {
		obj->registered_device_id_array = gpu_id_array;
		obj->registered_device_id_array_size = gpu_id_array_size;
	}
	obj->is_imported_kfd_bo = true;

	return HSAKMT_STATUS_SUCCESS;

	/* Unwind in reverse order of acquisition; the lock is held throughout */
err_remove_object:
	vm_remove_object(aperture, obj);
err_free_handle:
	freeArgs.handle = importArgs.handle;
	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_FREE_MEMORY_OF_GPU, &freeArgs) != 0) {
		pr_err("Failed to free GPU memory for handle %llu\n", freeArgs.handle);
	}
err_release_area:
	aperture_release_area(aperture, reservedMem,
			      (SizeInPages << PAGE_SHIFT));
err_unlock:
	pthread_mutex_unlock(&aperture->fmm_mutex);
	return err;
}
/*
 * Undo a previous registration of the memory at @address.
 *
 * - Unknown address: success on non-dGPU systems or when the SVM API is
 *   supported, HSAKMT_STATUS_MEMORY_NOT_REGISTERED otherwise.
 * - Object in the CPUVM aperture: nothing to do, success.
 * - Imported graphics buffer (has metadata), userptr or imported KFD BO:
 *   the object is released through __fmm_release().
 * - Any other object: its registered device/node arrays are freed and the
 *   registration count is reset; if it has no registered devices,
 *   HSAKMT_STATUS_MEMORY_NOT_REGISTERED is returned.
 */
HSAKMT_STATUS hsakmt_fmm_deregister_memory(void *address)
{
	HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;
	manageable_aperture_t *aperture;
	vm_object_t *object;

	object = vm_find_object(address, 0, &aperture);
	if (!object) {
		/* On APUs we assume it's a random system memory address
		 * where registration and deregistration is a no-op
		 */
		if (hsakmt_is_dgpu && !hsakmt_is_svm_api_supported)
			return HSAKMT_STATUS_MEMORY_NOT_REGISTERED;
		return HSAKMT_STATUS_SUCCESS;
	}

	/* Successful vm_find_object returns with aperture locked */

	/* API-allocated system memory on APUs, deregistration is a no-op */
	if (aperture == &cpuvm_aperture)
		goto unlock;

	if (object->metadata || object->userptr || object->is_imported_kfd_bo) {
		/* An object with metadata is an imported graphics
		 * buffer. Deregistering imported graphics buffers or
		 * userptrs means releasing the BO.
		 */
		pthread_mutex_unlock(&aperture->fmm_mutex);
		__fmm_release(object, aperture);
		return HSAKMT_STATUS_SUCCESS;
	}

	if (!object->registered_device_id_array ||
	    !(object->registered_device_id_array_size > 0)) {
		status = HSAKMT_STATUS_MEMORY_NOT_REGISTERED;
		goto unlock;
	}

	/* The device array is known to be non-NULL at this point */
	free(object->registered_device_id_array);
	object->registered_device_id_array = NULL;
	object->registered_device_id_array_size = 0;

	/* free(NULL) is a no-op, so no separate NULL test is needed */
	free(object->registered_node_id_array);
	object->registered_node_id_array = NULL;
	object->registration_count = 0;

unlock:
	pthread_mutex_unlock(&aperture->fmm_mutex);
	return status;
}
/*
* This function unmaps all nodes on current mapped nodes list that are not included on nodes_to_map
* and maps nodes_to_map
*/
HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size,
uint32_t *nodes_to_map, uint64_t num_of_nodes,
uint64_t *gpuvm_address)
{
manageable_aperture_t *aperture = NULL;
vm_object_t *object;
uint32_t i;
uint32_t *registered_node_id_array, registered_node_id_array_size;
HSAKMT_STATUS ret;
int retcode = 0;
if (!num_of_nodes || !nodes_to_map || !address)
return HSAKMT_STATUS_INVALID_PARAMETER;
object = vm_find_object(address, size, &aperture);
if (!object && !hsakmt_is_svm_api_supported)
return HSAKMT_STATUS_ERROR;
/* Successful vm_find_object returns with aperture locked */
/* allocates VA only */
if (object && object->handles[0] == 0) {
pthread_mutex_unlock(&aperture->fmm_mutex);
return HSAKMT_STATUS_INVALID_PARAMETER;
}
/* allocates buffer only, should be mapped by GEM API */
if (aperture == &mem_handle_aperture) {
pthread_mutex_unlock(&aperture->fmm_mutex);
return HSAKMT_STATUS_INVALID_PARAMETER;
}
/* APU memory is not supported by this function */
if (aperture &&
(aperture == &cpuvm_aperture || !aperture->is_cpu_accessible)) {
pthread_mutex_unlock(&aperture->fmm_mutex);
return HSAKMT_STATUS_ERROR;
}
if ((hsakmt_is_svm_api_supported && !object) || object->userptr) {
retcode = _fmm_map_to_gpu_userptr(address, size, gpuvm_address,
object, nodes_to_map, num_of_nodes * sizeof(uint32_t));
if (object)
pthread_mutex_unlock(&aperture->fmm_mutex);
return retcode ? HSAKMT_STATUS_ERROR : HSAKMT_STATUS_SUCCESS;
}
/* Verify that all nodes to map are registered already */
registered_node_id_array = all_gpu_id_array;
registered_node_id_array_size = all_gpu_id_array_size;
if (object->registered_device_id_array_size > 0 &&
object->registered_device_id_array) {
registered_node_id_array = object->registered_device_id_array;
registered_node_id_array_size = object->registered_device_id_array_size;
}
for (i = 0 ; i < num_of_nodes; i++) {
if (!id_in_array(nodes_to_map[i], registered_node_id_array,
registered_node_id_array_size)) {
pthread_mutex_unlock(&aperture->fmm_mutex);
return HSAKMT_STATUS_ERROR;
}
}
/* Unmap buffer from all nodes that have this buffer mapped that are not included on nodes_to_map array */
if (object->mapped_device_id_array_size > 0) {
uint32_t temp_node_id_array[object->mapped_device_id_array_size];
uint32_t temp_node_id_array_size = 0;
for (i = 0 ; i < object->mapped_device_id_array_size / sizeof(uint32_t); i++) {
if (!id_in_array(object->mapped_device_id_array[i],
nodes_to_map,
num_of_nodes*sizeof(uint32_t)))
temp_node_id_array[temp_node_id_array_size++] =
object->mapped_device_id_array[i];
}
temp_node_id_array_size *= sizeof(uint32_t);
if (temp_node_id_array_size) {
ret = _fmm_unmap_from_gpu(aperture, address,
temp_node_id_array,
temp_node_id_array_size,
object);
if (ret != HSAKMT_STATUS_SUCCESS) {
pthread_mutex_unlock(&aperture->fmm_mutex);
return ret;
}
}
}
/* Remove already mapped nodes from nodes_to_map
* to generate the final map list
*/
uint32_t map_node_id_array[num_of_nodes];
uint32_t map_node_id_array_size = 0;
for (i = 0; i < num_of_nodes; i++) {
if (!id_in_array(nodes_to_map[i],
object->mapped_device_id_array,
object->mapped_device_id_array_size))
map_node_id_array[map_node_id_array_size++] =
nodes_to_map[i];
}
if (map_node_id_array_size)
retcode = _fmm_map_to_gpu(aperture, address, size, object,
map_node_id_array,
map_node_id_array_size * sizeof(uint32_t));
pthread_mutex_unlock(&aperture->fmm_mutex);
if (retcode != 0)
return HSAKMT_STATUS_ERROR;
return HSAKMT_STATUS_SUCCESS;
}
/*
 * Fill @info with what the FMM knows about the object containing @address.
 *
 * @info is zeroed first. For an unknown address its Type is set to
 * HSA_POINTER_UNKNOWN and HSAKMT_STATUS_ERROR is returned.
 *
 * The registered and mapped node-ID arrays handed out through @info are
 * owned by the object: they are built here on first use from the
 * corresponding GPU-ID arrays and cached in the object.
 *
 * Returns HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_ERROR, or
 * HSAKMT_STATUS_NO_MEMORY if a node-ID array cannot be allocated (in which
 * case @info is left partially filled).
 */
HSAKMT_STATUS hsakmt_fmm_get_mem_info(const void *address, HsaPointerInfo *info)
{
	HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;
	manageable_aperture_t *aperture;
	vm_object_t *entry;
	uint32_t n;

	memset(info, 0, sizeof(HsaPointerInfo));

	entry = vm_find_object(address, UINT64_MAX, &aperture);
	if (!entry) {
		info->Type = HSA_POINTER_UNKNOWN;
		return HSAKMT_STATUS_ERROR;
	}
	/* Successful vm_find_object returns with the aperture locked */

	/* Classify the pointer; the first matching property wins */
	if (entry->is_imported_kfd_bo)
		info->Type = HSA_POINTER_REGISTERED_SHARED;
	else if (entry->metadata)
		info->Type = HSA_POINTER_REGISTERED_GRAPHICS;
	else if (entry->userptr)
		info->Type = HSA_POINTER_REGISTERED_USER;
	else if (entry->handles[0] == 0)
		info->Type = HSA_POINTER_RESERVED_ADDR;
	else
		info->Type = HSA_POINTER_ALLOCATED;

	info->Node = entry->node_id;
	info->GPUAddress = (HSAuint64)entry->start;
	info->SizeInBytes = entry->size;

	/* registered nodes */
	info->NRegisteredNodes =
		entry->registered_device_id_array_size / sizeof(uint32_t);
	if (info->NRegisteredNodes && !entry->registered_node_id_array) {
		/* The array allocated here is freed whenever the registration
		 * changes (deregistration or registration to new nodes) or
		 * the memory is freed.
		 */
		entry->registered_node_id_array =
			(uint32_t *)malloc(entry->registered_device_id_array_size);
		if (!entry->registered_node_id_array) {
			status = HSAKMT_STATUS_NO_MEMORY;
			goto unlock;
		}
		for (n = 0; n < info->NRegisteredNodes; n++)
			hsakmt_gpuid_to_nodeid(entry->registered_device_id_array[n],
					       &entry->registered_node_id_array[n]);
	}
	info->RegisteredNodes = entry->registered_node_id_array;

	/* mapped nodes */
	info->NMappedNodes =
		entry->mapped_device_id_array_size / sizeof(uint32_t);
	if (info->NMappedNodes && !entry->mapped_node_id_array) {
		/* The array allocated here is freed whenever the mapping
		 * changes (unmap or map to new nodes) or the memory is freed.
		 */
		entry->mapped_node_id_array =
			(uint32_t *)malloc(entry->mapped_device_id_array_size);
		if (!entry->mapped_node_id_array) {
			status = HSAKMT_STATUS_NO_MEMORY;
			goto unlock;
		}
		for (n = 0; n < info->NMappedNodes; n++)
			hsakmt_gpuid_to_nodeid(entry->mapped_device_id_array[n],
					       &entry->mapped_node_id_array[n]);
	}
	info->MappedNodes = entry->mapped_node_id_array;

	info->UserData = entry->user_data;
	info->MemFlags = entry->mflags;

	if (info->Type == HSA_POINTER_REGISTERED_USER) {
		/* Report the user's own pointer and size; the GPU address
		 * gets the same in-page offset as the CPU address.
		 */
		info->CPUAddress = entry->userptr;
		info->SizeInBytes = entry->userptr_size;
		info->GPUAddress += ((HSAuint64)info->CPUAddress & (PAGE_SIZE - 1));
	} else if (info->Type == HSA_POINTER_ALLOCATED) {
		info->CPUAddress = entry->start;
	}

unlock:
	pthread_mutex_unlock(&aperture->fmm_mutex);
	return status;
}
#ifdef SANITIZER_AMDGPU
/*
 * Replace the page at @address with anonymous, private, read/write memory.
 *
 * The remap is only done when the object containing @address has a non-zero
 * mmap_fd; otherwise the call succeeds without doing anything.
 *
 * Returns HSAKMT_STATUS_ERROR if no object contains @address or the mmap()
 * fails, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS hsakmt_fmm_replace_asan_header_page(void* address)
{
	manageable_aperture_t* aperture;
	vm_object_t* owner;
	HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;

	owner = vm_find_object(address, UINT64_MAX, &aperture);
	if (!owner)
		return HSAKMT_STATUS_ERROR;

	/* Successful vm_find_object returns with the aperture locked */

	/* If this is a GPU-mapped memory, remap the first page to be normal system memory */
	if (owner->mmap_fd &&
	    mmap(address, PAGE_SIZE, PROT_WRITE | PROT_READ,
		 MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0) == MAP_FAILED)
		status = HSAKMT_STATUS_ERROR;

	pthread_mutex_unlock(&aperture->fmm_mutex);
	return status;
}
/*
 * Map the page at @address back onto the object's original mmap backing,
 * using the object's stored mmap fd, protection flags and offset.
 *
 * The remap is only done when the object containing @address has a non-zero
 * mmap_fd; otherwise the call succeeds without doing anything.
 *
 * Returns HSAKMT_STATUS_ERROR if no object contains @address or the mmap()
 * fails, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS hsakmt_fmm_return_asan_header_page(void* address)
{
	manageable_aperture_t* aperture;
	vm_object_t* owner;
	HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;

	owner = vm_find_object(address, UINT64_MAX, &aperture);
	if (!owner)
		return HSAKMT_STATUS_ERROR;

	/* Successful vm_find_object returns with the aperture locked */

	/* If this is a GPU-mapped memory, remap the first page back to the original GPU memory */
	if (owner->mmap_fd) {
		/* File offset of this page = object offset + distance from its start */
		off_t mmap_offset = owner->mmap_offset + ((char*)address - (char*)owner->start);
		void* mapped;

		mapped = mmap(address, PAGE_SIZE, owner->mmap_flags,
			      MAP_SHARED | MAP_FIXED, owner->mmap_fd, mmap_offset);
		if (mapped == MAP_FAILED)
			status = HSAKMT_STATUS_ERROR;
	}

	pthread_mutex_unlock(&aperture->fmm_mutex);
	return status;
}
#endif
/*
 * Attach an opaque user pointer to the object found for @mem.
 *
 * The value is stored in the object's user_data field, which
 * hsakmt_fmm_get_mem_info() reports back as UserData.
 *
 * Returns HSAKMT_STATUS_ERROR if no object is found, HSAKMT_STATUS_SUCCESS
 * otherwise.
 */
HSAKMT_STATUS hsakmt_fmm_set_mem_user_data(const void *mem, void *usr_data)
{
	manageable_aperture_t *aperture;
	vm_object_t *target = vm_find_object(mem, 0, &aperture);

	if (!target)
		return HSAKMT_STATUS_ERROR;

	/* The aperture is locked here, so the update is serialized */
	target->user_data = usr_data;
	pthread_mutex_unlock(&aperture->fmm_mutex);

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Reset an aperture: re-initialize its mutex, then remove every object in
 * its tree and every area on its vm_ranges list.
 */
static void fmm_clear_aperture(manageable_aperture_t *app)
{
	rbtree_node_t *node;

	pthread_mutex_init(&app->fmm_mutex, NULL);

	/* Keep picking any remaining node until the object tree is empty */
	for (node = rbtree_node_any(&app->tree, MID); node;
	     node = rbtree_node_any(&app->tree, MID))
		vm_remove_object(app, vm_object_entry(node, 0));

	/* Walk the area list, saving the successor before each removal */
	while (app->vm_ranges) {
		void *following = app->vm_ranges->next;

		vm_remove_area(app, app->vm_ranges);
		app->vm_ranges = following;
	}
}
/* This is a special funcion that should be called only from the child process
* after a fork(). This will clear all vm_objects and mmaps duplicated from
* the parent.
*/
void hsakmt_fmm_clear_all_mem(void)
{
uint32_t i;
void *map_addr;
/* Close render node FDs. The child process needs to open new ones */
for (i = 0; i <= DRM_LAST_RENDER_NODE - DRM_FIRST_RENDER_NODE; i++) {
if (amdgpu_handle[i]) {
amdgpu_device_deinitialize(amdgpu_handle[i]);
amdgpu_handle[i] = NULL;
} else if (drm_render_fds[i]) {
close(drm_render_fds[i]);
}
drm_render_fds[i] = 0;
}
fmm_clear_aperture(&mem_handle_aperture);
fmm_clear_aperture(&cpuvm_aperture);
fmm_clear_aperture(&svm.apertures[SVM_DEFAULT]);
fmm_clear_aperture(&svm.apertures[SVM_COHERENT]);
if (dgpu_shared_aperture_limit) {
/* Use the same dgpu range as the parent. If failed, then set
* hsakmt_is_dgpu_mem_init to false. Later on dgpu_mem_init will try
* to get a new range
*/
map_addr = mmap(dgpu_shared_aperture_base, (HSAuint64)(dgpu_shared_aperture_limit)-
(HSAuint64)(dgpu_shared_aperture_base) + 1, PROT_NONE,
MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE | MAP_FIXED, -1, 0);
if (map_addr == MAP_FAILED) {
munmap(dgpu_shared_aperture_base,
(HSAuint64)(dgpu_shared_aperture_limit) -
(HSAuint64)(dgpu_shared_aperture_base) + 1);
dgpu_shared_aperture_base = NULL;
dgpu_shared_aperture_limit = NULL;
}
}
/* Nothing is initialized. */
if (!gpu_mem)
return;
for (i = 0; i < gpu_mem_count; i++) {
fmm_clear_aperture(&gpu_mem[i].gpuvm_aperture);
fmm_clear_aperture(&gpu_mem[i].scratch_physical);
}
hsakmt_fmm_destroy_process_apertures();
}
================================================
FILE: libhsakmt/src/fmm.h
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef FMM_H_
#define FMM_H_
#include "hsakmt/hsakmttypes.h"
#include
/*
 * Aperture kinds known to the FMM layer.
 *
 * FMM_FIRST_APERTURE_TYPE is an alias for FMM_GPUVM (value 0) and
 * FMM_LAST_APERTURE_TYPE is one past FMM_MMIO, so the pair brackets the
 * real aperture types and can serve as loop/validation bounds.
 */
typedef enum {
FMM_FIRST_APERTURE_TYPE = 0,
FMM_GPUVM = FMM_FIRST_APERTURE_TYPE,
FMM_LDS,
FMM_SCRATCH,
FMM_SVM,
FMM_MMIO,
FMM_LAST_APERTURE_TYPE
} aperture_type_e;
/*
 * Description of a single aperture: its kind, its size and where it starts.
 * NOTE(review): size is presumably expressed in bytes - confirm against the
 * users of this struct in fmm.c.
 */
typedef struct {
aperture_type_e app_type;
uint64_t size;
void *start_address;
} aperture_properties_t;
/*
 * FMM entry points implemented in fmm.c. Only the declarations are visible
 * here; the notes below describe the signatures and are hedged wherever the
 * behaviour cannot be read off the prototype.
 */
/* Writes the amdgpu device handle for node_id through DeviceHandle. */
HSAKMT_STATUS hsakmt_fmm_get_amdgpu_device_handle(uint32_t node_id, HsaAMDGPUDeviceHandle *DeviceHandle);
/* Set-up / tear-down of the per-process aperture state.
 * NOTE(review): NumNodes is presumably the topology node count - confirm. */
HSAKMT_STATUS hsakmt_fmm_init_process_apertures(unsigned int NumNodes);
void hsakmt_fmm_destroy_process_apertures(void);
/* Memory interface */
/* The allocators return a pointer.
 * NOTE(review): presumably NULL on failure - confirm in fmm.c. */
void *hsakmt_fmm_allocate_scratch(uint32_t gpu_id, void *address, uint64_t MemorySizeInBytes);
void *hsakmt_fmm_allocate_device(uint32_t gpu_id, uint32_t node_id, void *address,
uint64_t MemorySizeInBytes, uint64_t alignment, HsaMemFlags flags);
void *hsakmt_fmm_allocate_doorbell(uint32_t gpu_id, uint64_t MemorySizeInBytes, uint64_t doorbell_offset);
void *hsakmt_fmm_allocate_host(uint32_t gpu_id, uint32_t node_id, void *address, uint64_t MemorySizeInBytes,
uint64_t alignment, HsaMemFlags flags);
void hsakmt_fmm_print(uint32_t node);
HSAKMT_STATUS hsakmt_fmm_release(void *address);
HSAKMT_STATUS hsakmt_fmm_map_to_gpu(void *address, uint64_t size, uint64_t *gpuvm_address);
/* Unlike its neighbours this one returns a plain int rather than HSAKMT_STATUS. */
int hsakmt_fmm_unmap_from_gpu(void *address);
bool hsakmt_fmm_get_handle(void *address, uint64_t *handle);
HSAKMT_STATUS hsakmt_fmm_get_mem_info(const void *address, HsaPointerInfo *info);
HSAKMT_STATUS hsakmt_fmm_set_mem_user_data(const void *mem, void *usr_data);
/* The two ASAN helpers are only declared when SANITIZER_AMDGPU is defined. */
#ifdef SANITIZER_AMDGPU
HSAKMT_STATUS hsakmt_fmm_replace_asan_header_page(void* address);
HSAKMT_STATUS hsakmt_fmm_return_asan_header_page(void* address);
#endif
/* Topology interface*/
/* Reports the base and limit of one aperture type for gpu_id through the two
 * out-parameters. */
HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(aperture_type_e aperture_type, HSAuint32 gpu_id,
HSAuint64 *aperture_base, HSAuint64 *aperture_limit);
/* Registration functions take the target GPUs as an array plus its size.
 * NOTE(review): whether gpu_id_array_size counts bytes or elements is not
 * visible from the prototype - confirm in fmm.c before relying on it. */
HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
uint32_t *gpu_id_array,
uint32_t gpu_id_array_size,
bool coarse_grain,
bool ext_coherent);
HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle,
HsaGraphicsResourceInfo *GraphicsResourceInfo,
uint32_t *gpu_id_array,
uint32_t gpu_id_array_size,
HSA_REGISTER_MEM_FLAGS RegisterFlags);
HSAKMT_STATUS hsakmt_fmm_deregister_memory(void *address);
/* Exports a dma-buf file descriptor and an offset through DMABufFd / Offset. */
HSAKMT_STATUS hsakmt_fmm_export_dma_buf_fd(void *MemoryAddress,
HSAuint64 MemorySizeInBytes,
int *DMABufFd,
HSAuint64 *Offset);
/* Share / import pair built around HsaSharedMemoryHandle. */
HSAKMT_STATUS hsakmt_fmm_share_memory(void *MemoryAddress,
HSAuint64 SizeInBytes,
HsaSharedMemoryHandle *SharedMemoryHandle);
HSAKMT_STATUS hsakmt_fmm_register_shared_memory(const HsaSharedMemoryHandle *SharedMemoryHandle,
HSAuint64 *SizeInBytes,
void **MemoryAddress,
uint32_t *gpu_id_array,
uint32_t gpu_id_array_size);
HSAKMT_STATUS hsakmt_fmm_map_to_gpu_nodes(void *address, uint64_t size,
uint32_t *nodes_to_map, uint64_t num_of_nodes, uint64_t *gpuvm_address);
/* NOTE(review): presumably returns a file descriptor for the DRM render node
 * with the given minor number, or a negative value on error - confirm. */
int hsakmt_open_drm_render_device(int minor);
/* mmap-based allocator taking an alignment, a guard size and an aperture
 * window [aper_base, aper_limit]; exact placement rules live in fmm.c. */
void *hsakmt_mmap_allocate_aligned(int prot, int flags, uint64_t size, uint64_t align,
uint64_t guard_size, void *aper_base, void *aper_limit, int fd);
/* Function pointer (defined elsewhere) that maps an amdgpu device handle to
 * an int; by its name, a file descriptor. */
extern int (*hsakmt_fn_amdgpu_device_get_fd)(HsaAMDGPUDeviceHandle device_handle);
#endif /* FMM_H_ */
================================================
FILE: libhsakmt/src/globals.c
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "libhsakmt.h"
// HSAKMT global data
/* Descriptor used for KFD ioctls; -1 until one is installed. When the
 * software model is enabled, model_init_env_vars() stores the descriptor of
 * its memory backing file here instead. */
int hsakmt_kfd_fd = -1;
/* NOTE(review): presumably the descriptor of the udmabuf device; -1 while
 * not open - confirm against the code that opens it. */
int hsakmt_udmabuf_dev_fd = -1;
/* Zero-initialised counters; CHECK_KFD_OPEN() treats an open count of 0 as
 * "KFD not opened". */
unsigned long hsakmt_kfd_open_count;
unsigned long hsakmt_system_properties_count;
/* Library-wide mutex, statically initialised. */
pthread_mutex_t hsakmt_mutex = PTHREAD_MUTEX_INITIALIZER;
bool hsakmt_is_dgpu;
/* Backing storage for the PAGE_SIZE / PAGE_SHIFT fallbacks in libhsakmt.h. */
int hsakmt_page_size;
int hsakmt_page_shift;
/* whether to check all dGPUs in the topology support SVM API */
bool hsakmt_is_svm_api_supported;
/* zfb is mainly used during emulation */
int hsakmt_zfb_support;
================================================
FILE: libhsakmt/src/hsakmtmodel.c
================================================
/*
* Copyright © 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "hsakmt/hsakmtmodel.h"
#include "libhsakmt.h"
#include "hsakmt/hsakmttypes.h"
#include "hsakmt/hsakmtmodeliface.h"
#define _GNU_SOURCE
#define __USE_GNU
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/* True when the software model replaces real hardware; set by
 * model_init_env_vars() when HSA_MODEL_TOPOLOGY is present. */
bool hsakmt_use_model;
/* Value of the HSA_MODEL_TOPOLOGY environment variable; used as the root
 * directory when reading nodes/<n>/mem_banks/<m>/properties. */
char *hsakmt_model_topology;
/* Per-topology-node model state, indexed by node id (gpu_id - 1). */
struct model_node
{
/* Set for nodes whose KFDGpuID is non-zero. */
bool is_gpu;
/* PROT_NONE reservation of MODEL_APERTURE_SIZE bytes that stands in for the
 * GPU virtual address space of this node. */
void *aperture;
/* Handle returned by model_functions->create(). */
hsakmt_model_t *model;
/* Offset of this node's 8192-byte doorbell region in the backing file. */
uint64_t doorbell_offset;
/* Sum of size_in_bytes over the node's memory banks. */
uint64_t total_memory_size;
/* Running total maintained by the ALLOC/FREE_MEMORY_OF_GPU ioctls. */
uint64_t allocated_memory_size;
};
/* One slot of the model event table. */
struct model_event
{
/* Copied from the create-event ioctl arguments. */
uint32_t event_type;
uint32_t auto_reset;
/* For signal events: 0 = not signalled, 1 = signalled. */
uint64_t value;
};
/* Bookkeeping for one allocation; its address doubles as the ioctl handle. */
struct model_mem_data
{
uint64_t va_addr;
/* Offset of the allocation inside the backing file. */
uint64_t file_offset;
uint64_t size;
/* Bit n is set while the allocation is mapped into node n's aperture. */
uint64_t mapped_nodes_bitmask;
/* KFD_IOC_ALLOC_MEM_FLAGS_* as passed to the allocation ioctl. */
uint32_t flags;
uint32_t node_id;
};
/* One entry of the queue table; a NULL queue pointer marks a free slot. */
struct model_queue
{
hsakmt_model_queue_t *queue;
uint32_t node_id;
};
/* Capacity of model_queues[]; the array index is used as the queue id. */
#define MAX_MODEL_QUEUES 128
// Use a 256GB aperture for the model.
// (1 << 38 bytes; only address space is reserved, see model_init().)
#define MODEL_APERTURE_SIZE (1llu << 38)
/* Set at most once through model_set_mmio_page(). */
static void *model_mmio_page;
/* Serialises every call into model_kfd_ioctl_locked(); also the mutex paired
 * with model_event_condvar. */
static pthread_mutex_t model_ioctl_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Number of event slots (twice the limit passed to model_set_event_page()). */
static unsigned model_event_limit;
/* One bit per event slot; a set bit means the slot is allocated. */
static uint64_t *model_event_bitmap;
static struct model_event *model_events;
/* Broadcast whenever an event is set; initialised with CLOCK_MONOTONIC so
 * that timed waits use monotonic deadlines. */
static pthread_cond_t model_event_condvar;
/* dlopen() handle of the library named by HSA_MODEL_LIB and its vtable. */
static void *model_library;
static const struct hsakmt_model_functions *model_functions;
/* Current size of the backing file; allocations are carved off its end. */
static uint64_t model_memfd_size;
static uint64_t model_num_nodes;
static struct model_node *model_nodes;
static struct model_queue model_queues[MAX_MODEL_QUEUES];
/*
 * Report whether the thunk is running against the software model.
 *
 * enable: out-parameter receiving the current value of hsakmt_use_model.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER when enable is NULL (instead of
 * dereferencing it), HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtModelEnabled(bool* enable)
{
  if (!enable)
    return HSAKMT_STATUS_INVALID_PARAMETER;

  *enable = hsakmt_use_model;
  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Read the model-related environment variables and, when the model is
 * enabled, perform the early set-up that the rest of the library relies on:
 *
 *  - HSA_MODEL_TOPOLOGY  enables the model and names the topology directory.
 *  - HSA_MODEL_MEMFILE   optional path of a regular file to use as memory
 *                        backing store (created exclusively, then unlinked).
 *                        Without it a memfd is used when available.
 *  - HSA_MODEL_LIB       mandatory path of the model shared object.
 *
 * The backing file descriptor is installed as hsakmt_kfd_fd, the event
 * condition variable is initialised on CLOCK_MONOTONIC, and the model
 * library's function table is loaded and version-checked.
 *
 * Any failure is fatal: a message is printed to stderr and abort() is called.
 */
void model_init_env_vars(void)
{
  /* Check whether to use a model instead of real hardware */
  hsakmt_model_topology = getenv("HSA_MODEL_TOPOLOGY");
  if (hsakmt_model_topology)
    hsakmt_use_model = true;

  if (hsakmt_use_model)
  {
    /* Backing memory file is used to stand in for the kfd_fd,
     * which is needed early, so create it already.
     *
     * For old systems without memfd_create, or if the user prefers,
     * we create a regular backing file. Prefer to use memfd_create
     * by default where possible.
     */
    int fd = -1;
    const char *fname = getenv("HSA_MODEL_MEMFILE");
    if (fname)
    {
      fprintf(stderr, "model: use memory backing file given in HSA_MODEL_MEMFILE: %s\n", fname);
      fd = open(fname, O_CREAT | O_EXCL | O_CLOEXEC | O_RDWR, S_IRUSR | S_IWUSR);
      if (fd < 0)
      {
        perror("model: failed to create backing file");
        abort();
      }
      /* The open descriptor keeps the storage alive; drop the name so the
       * file disappears when the process exits. */
      unlink(fname);
    }
    if (fd < 0)
    {
#ifdef HAVE_MEMFD_CREATE
      fd = memfd_create("hsakmt_model", MFD_CLOEXEC);
      if (fd < 0)
      {
        fprintf(stderr, "model: Failed to create memfd\n");
        abort();
      }
#else
      fprintf(stderr, "model: built without memfd support\n"
                      "model: set HSA_MODEL_MEMFILE to path of a backing file\n");
      abort();
#endif
    }
    assert(hsakmt_kfd_fd < 0);
    hsakmt_kfd_fd = fd;

    /* Timed waits compute their deadline from CLOCK_MONOTONIC, so the
     * condition variable must use the same clock. */
    pthread_condattr_t condattr;
    pthread_condattr_init(&condattr);
    pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC);
    pthread_cond_init(&model_event_condvar, &condattr);
    pthread_condattr_destroy(&condattr);

    const char *libname = getenv("HSA_MODEL_LIB");
    if (!libname)
    {
      fprintf(stderr, "model: HSA_MODEL_LIB environment variable must be set to FFM .so\n");
      abort();
    }
    // model_library = dlmopen(LM_ID_NEWLM, libname, RTLD_NOW);
    model_library = dlopen(libname, RTLD_NOW | RTLD_LOCAL);
    if (!model_library)
    {
      fprintf(stderr, "model: failed to load %s: %s\n", libname, dlerror());
      abort();
    }
    get_hsakmt_model_functions_t getter = dlsym(model_library, "get_hsakmt_model_functions");
    if (!getter)
    {
      fprintf(stderr, "model: Failed to get hsakmt_model_functions\n");
      abort();
    }
    model_functions = getter();
    /* A library that hands back no table would otherwise crash in the
     * version check below; fail with a diagnostic instead. */
    if (!model_functions)
    {
      fprintf(stderr, "model: get_hsakmt_model_functions returned no function table\n");
      abort();
    }
    /* Major version must match exactly; the library may offer a newer minor. */
    if (model_functions->version_major != HSAKMT_MODEL_INTERFACE_VERSION_MAJOR ||
        model_functions->version_minor < HSAKMT_MODEL_INTERFACE_VERSION_MINOR)
    {
      fprintf(stderr, "model: Model has interface version %u.%u, need version %u.%u\n",
              model_functions->version_major, model_functions->version_minor,
              HSAKMT_MODEL_INTERFACE_VERSION_MAJOR, HSAKMT_MODEL_INTERFACE_VERSION_MINOR);
      abort();
    }
  }
}
/*
 * Carve a new region off the end of the backing file.
 *
 * size:  requested size in bytes; rounded up to a multiple of 4096.
 * align: required alignment of the returned offset; 0 selects 4096. Must be
 *        a power of two and at least 4096 (asserted).
 *
 * The file is grown with ftruncate() to cover the new region. Returns the
 * file offset of the region; aborts if the file cannot be resized.
 */
static uint64_t allocate_from_memfd(uint64_t size, uint64_t align)
{
  const uint64_t alignment = align ? align : 4096;

  assert(POWER_OF_2(alignment)); /* must be power of two */
  assert(alignment >= 4096);

  const uint64_t padded_size = (size + 4095) & ~(uint64_t)4095;
  const uint64_t region_start =
      (model_memfd_size + alignment - 1) & ~(alignment - 1);

  /* The new end of file is the aligned start plus the padded size. */
  model_memfd_size = region_start + padded_size;

  if (ftruncate(hsakmt_kfd_fd, model_memfd_size) < 0)
  {
    fprintf(stderr, "model: ftruncate on memfd failed\n");
    abort();
  }

  return region_start;
}
/*
 * Read the "size_in_bytes" property of one memory bank from the model
 * topology directory.
 *
 * node_id / mem_id select the file
 *   <hsakmt_model_topology>/nodes/<node_id>/mem_banks/<mem_id>/properties
 * which is parsed as a sequence of "<name> <unsigned value>" pairs.
 *
 * Returns the value of size_in_bytes. Aborts if the file cannot be opened or
 * does not contain the property.
 */
static uint64_t get_sysfs_mem_bank_size(unsigned node_id, unsigned mem_id)
{
  char prop_name[256];
  char path[256];

  snprintf(path, sizeof(path), "%s/nodes/%u/mem_banks/%u/properties",
           hsakmt_model_topology, node_id, mem_id);

  FILE *f = fopen(path, "r");
  if (!f)
  {
    fprintf(stderr, "model: Failed to open %s\n", path);
    abort();
  }

  uint64_t prop_val;
  /* The field width keeps an over-long token from overflowing prop_name
   * (255 characters plus the terminating NUL). SCNu64 is the scanf-family
   * conversion macro for uint64_t. */
  while (fscanf(f, "%255s %" SCNu64 "\n", prop_name, &prop_val) == 2)
  {
    if (!strcmp(prop_name, "size_in_bytes"))
    {
      fclose(f);
      return prop_val;
    }
  }

  fclose(f);
  fprintf(stderr, "model: Missing size_in_bytes in %s\n", path);
  abort();
}
static void model_set_event(void *data, unsigned event_id)
{
if (!event_id)
return;
if (event_id > model_event_limit)
{
fprintf(stderr, "model_set_event: event_id = %u out of bounds\n",
event_id);
abort();
}
unsigned slot = event_id - 1;
if (!((model_event_bitmap[slot / 64] >> (slot % 64)) & 1))
{
fprintf(stderr, "model_set_event: event_id = %u is not allocated\n",
event_id);
abort();
}
struct model_event *event = &model_events[slot];
if (event->event_type == HSA_EVENTTYPE_SIGNAL)
{
assert(model_events[slot].value <= 1);
model_events[slot].value = 1;
}
else
{
fprintf(stderr, "model: Unimplemented event type\n");
abort();
}
pthread_cond_broadcast(&model_event_condvar);
}
void model_init(void)
{
if (!hsakmt_use_model)
return;
HSAKMT_STATUS result;
HsaSystemProperties props;
/* Read the topology to determine nodes. */
result = hsakmt_topology_sysfs_get_system_props(&props);
if (result != HSAKMT_STATUS_SUCCESS)
{
fprintf(stderr, "model: Failed to parse topology\n");
abort();
}
model_nodes = calloc(props.NumNodes, sizeof(*model_nodes));
if (!model_nodes)
abort();
model_num_nodes = props.NumNodes;
for (unsigned node_id = 0; node_id < props.NumNodes; node_id++)
{
HsaNodeProperties node_props;
result = hsakmt_topology_get_node_props(node_id, &node_props);
if (result != HSAKMT_STATUS_SUCCESS)
{
fprintf(stderr, "model: Failed to get node %u properties\n", node_id);
abort();
}
if (node_props.KFDGpuID == 0)
continue;
if (node_props.KFDGpuID != node_id + 1)
{
fprintf(stderr,
"model: Node %u has KFD GPU ID %u, but should be %u."
" Please change the gpu_id file.\n",
node_id, node_props.KFDGpuID, node_id + 1);
abort();
}
model_nodes[node_id].is_gpu = true;
/* Reserve the VA space for the aperture, but don't fill it with pages. */
model_nodes[node_id].aperture =
mmap(NULL, MODEL_APERTURE_SIZE, PROT_NONE,
MAP_PRIVATE | MAP_NORESERVE | MAP_ANONYMOUS, -1, 0);
pr_debug("Modeling Creating Memory Aperture: %p\n", model_nodes[node_id].aperture);
if (model_nodes[node_id].aperture == MAP_FAILED)
{
fprintf(stderr, "model: Failed to reserve aperture via mmap\n");
abort();
}
/* Create the doorbell region */
model_nodes[node_id].doorbell_offset = allocate_from_memfd(8192, 8192);
for (unsigned mem_id = 0; mem_id < node_props.NumMemoryBanks; ++mem_id)
{
model_nodes[node_id].total_memory_size += get_sysfs_mem_bank_size(node_id, mem_id);
}
/* Create the model */
// TODO: Move this into a separate thread
model_nodes[node_id].model = model_functions->create();
if (!model_nodes[node_id].model)
{
fprintf(stderr, "model: Failed to create model\n");
abort();
}
model_functions->set_global_aperture(model_nodes[node_id].model,
model_nodes[node_id].aperture,
MODEL_APERTURE_SIZE);
model_functions->set_set_event(model_nodes[node_id].model, model_set_event, NULL);
}
}
/*
 * Remember the MMIO page pointer.
 * The pointer may only be installed once; a second call trips the assert.
 */
void model_set_mmio_page(void *ptr)
{
  assert(model_mmio_page == NULL);

  model_mmio_page = ptr;
}
/*
 * Size and allocate the model's event tables.
 *
 * ptr:         event page pointer; not used by the model.
 * event_limit: event count announced by the caller; must be a multiple of 64
 *              (asserted). The tables are sized for twice this number.
 *
 * May only be called once (asserted). Aborts if the tables cannot be
 * allocated, since every later event ioctl would dereference them.
 */
void model_set_event_page(void *ptr, unsigned event_limit)
{
  // TODO: Fully understand what's happening with this page and the event limit.
  //       ROCR-Runtime allocates a pool of 4096 events, but also a handful or so
  //       of additional events, which blows through the event_limit of 4096
  //       that is passed here. And it seems that not using the page at all
  //       is supported?
  (void)ptr;

  assert(!model_event_limit);
  assert(event_limit % 64 == 0);

  event_limit *= 2;
  model_event_limit = event_limit;

  /* One bit per slot, packed into 64-bit (8-byte) words. */
  model_event_bitmap = calloc(event_limit / 64, 8);
  model_events = calloc(event_limit, sizeof(*model_events));

  /* calloc() may legitimately return NULL for a zero-sized request, so only
   * treat NULL as an error when slots were actually requested. */
  if (event_limit && (!model_event_bitmap || !model_events))
  {
    fprintf(stderr, "model: Failed to allocate event tables\n");
    abort();
  }
}
/* Model implementation of KFD ioctl.
 *
 * request: KFD ioctl request code (must belong to AMDKFD_IOCTL_BASE).
 * arg:     pointer to the request-specific argument structure; updated in
 *          place with the results.
 *
 * Must be called with model_ioctl_mutex held: the event wait path releases
 * and re-acquires that mutex through the condition variable.
 *
 * Returns 0 on success, or -1 with errno set (only AMDKFD_IOC_SET_XNACK_MODE
 * does this). Unsupported requests and inconsistent arguments are fatal:
 * a message is printed to stderr and abort() is called.
 */
static int model_kfd_ioctl_locked(unsigned long request, void *arg)
{
  assert(_IOC_TYPE(request) == AMDKFD_IOCTL_BASE);

  if (_IOC_NR(request) == 0x20)
  {
    // This is AMDKFD_IOC_SVM. It is defined / used in an unusual way.
    struct kfd_ioctl_svm_args *args = arg;
    if (args->op == KFD_IOCTL_SVM_OP_SET_ATTR)
    {
      // todo?
      return 0;
    }
    fprintf(stderr, "model: Unimplemented SVM op\n");
    abort();
  }

  switch (request)
  {
  case AMDKFD_IOC_GET_VERSION:
  {
    pr_debug("MODEL IOCTL: AMDKFD_IOC_GET_VERSION\n");
    struct kfd_ioctl_get_version_args *args = arg;
    args->major_version = 1;
    args->minor_version = 14;
    return 0;
  }
  case AMDKFD_IOC_GET_PROCESS_APERTURES_NEW:
  {
    pr_debug("MODEL IOCTL: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n");
    struct kfd_ioctl_get_process_apertures_new_args *args = arg;
    struct kfd_process_device_apertures *apertures =
        (void *)args->kfd_process_device_apertures_ptr;
    assert(args->num_of_nodes == model_num_nodes);
    for (unsigned node_id = 0; node_id < args->num_of_nodes; ++node_id)
    {
      /* Non-GPU nodes report an all-zero entry. */
      memset(&apertures[node_id], 0, sizeof(apertures[node_id]));
      if (!model_nodes[node_id].is_gpu)
        continue;
      apertures[node_id].gpu_id = 1 + node_id;
      apertures[node_id].gpuvm_base = 0x4000llu;
      apertures[node_id].gpuvm_limit = MODEL_APERTURE_SIZE;
      apertures[node_id].lds_base = 0x4000000000000000llu; // 0x1000000000000?
      apertures[node_id].lds_limit = 0x40000000ffffffffllu;
      apertures[node_id].scratch_base = 0x5000000000000000llu; // 0x2000000000000?
      apertures[node_id].scratch_limit = 0x50000000ffffffffllu;
    }
    return 0;
  }
  case AMDKFD_IOC_SET_XNACK_MODE:
  {
    pr_debug("MODEL IOCTL: AMDKFD_IOC_SET_XNACK_MODE\n");
    // Don't support XNACK
    struct kfd_ioctl_set_xnack_mode_args *args = arg;
    /* A negative value is answered with "disabled"; any attempt to set a
     * mode is refused. */
    if (args->xnack_enabled < 0)
    {
      args->xnack_enabled = 0;
      return 0;
    }
    errno = EPERM;
    return -1;
  }
  case AMDKFD_IOC_GET_CLOCK_COUNTERS:
  {
    pr_debug("MODEL IOCTL: AMDKFD_IOC_GET_CLOCK_COUNTERS\n");
    struct kfd_ioctl_get_clock_counters_args *args = arg;
    args->gpu_clock_counter = 0; // TODO
    args->cpu_clock_counter = 0;
    args->system_clock_counter = 0;
    args->system_clock_freq = 0;
    return 0;
  }
  case AMDKFD_IOC_ACQUIRE_VM:
    pr_debug("MODEL IOCTL: AMDKFD_IOC_ACQUIRE_VM\n");
    return 0;
  case AMDKFD_IOC_SET_MEMORY_POLICY:
  {
    pr_debug("MODEL IOCTL: AMDKFD_IOC_SET_MEMORY_POLICY\n");
    // todo?
    return 0;
  }
  case AMDKFD_IOC_AVAILABLE_MEMORY:
  {
    pr_debug("MODEL IOCTL: AMDKFD_IOC_AVAILABLE_MEMORY\n");
    static const uint64_t minimum_reported = 128 * 1024 * 1024;
    struct kfd_ioctl_get_available_memory_args *args = arg;
    unsigned node_id = args->gpu_id - 1;
    /* Validate the index before forming a pointer into model_nodes. */
    assert(node_id < model_num_nodes);
    struct model_node *node = &model_nodes[node_id];
    /* Never report less than minimum_reported, even when over-allocated. */
    if (node->allocated_memory_size + minimum_reported >= node->total_memory_size)
      args->available = minimum_reported;
    else
      args->available = node->total_memory_size - node->allocated_memory_size;
    return 0;
  }
  case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
  {
    // Expect an SVM style allocation: The memory is allocated on the host
    // side e.g. via mmap(), and this IOCTL "only" registers the memory
    // with the GPU. This is a no-op for us because we aren't a GPU.
    struct kfd_ioctl_alloc_memory_of_gpu_args *args = arg;
    unsigned node_id = args->gpu_id - 1;
    assert(node_id < model_num_nodes);
    assert(model_nodes[node_id].is_gpu);
    if (args->va_addr == 0)
    {
      fprintf(stderr, "model: Expect only SVM allocations?\n");
      abort();
    }
    if (args->size % PAGE_SIZE != 0)
    {
      fprintf(stderr, "model: Allocation size not a multiple of page size\n");
      abort();
    }
    if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR)
    {
      fprintf(stderr, "model: userptr not supported\n");
      abort();
    }
    struct model_mem_data *mem_data = calloc(1, sizeof(*mem_data));
    if (!mem_data)
      abort();
    mem_data->va_addr = args->va_addr;
    mem_data->size = args->size;
    mem_data->flags = args->flags;
    mem_data->node_id = node_id;
    if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
    {
      /* Doorbells reuse the region reserved for the node in model_init(). */
      assert(args->size == 8192);
      mem_data->file_offset = model_nodes[node_id].doorbell_offset;
    }
    else
    {
      mem_data->file_offset = allocate_from_memfd(args->size, 0);
    }
    /* The bookkeeping pointer itself is handed out as the opaque handle. */
    args->handle = (__u64)mem_data;
    args->mmap_offset = mem_data->file_offset;
    model_nodes[node_id].allocated_memory_size += args->size;
    pr_debug("MODEL IOCTL: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU: VA: %lx : Size: %lu, Flags: %x\n", mem_data->va_addr, mem_data->size, mem_data->flags);
    model_functions->alloced_memory(model_nodes[node_id].model, (uint64_t *)mem_data->va_addr, mem_data->size, mem_data->flags);
    return 0;
  }
  case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
  {
    struct kfd_ioctl_free_memory_of_gpu_args *args = arg;
    struct model_mem_data *mem_data = (void *)args->handle;
    assert(!mem_data->mapped_nodes_bitmask);
    // Free the memory by punching a hole into the underlying memfd.
    //
    // Ideally, we'd also remember holes in the file and re-use them for
    // allocations to avoid the file size from growing indefinitely. It's
    // unclear whether the current implementation causes kernel data
    // structures to grow. But in practice, it almost certainly never
    // matters.
    int ret = fallocate(hsakmt_kfd_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                        mem_data->file_offset, mem_data->size);
    if (ret != 0)
    {
      perror("model: failed to punch hole in memfd");
      abort();
    }
    model_nodes[mem_data->node_id].allocated_memory_size -= mem_data->size;
    model_functions->freed_memory(model_nodes[mem_data->node_id].model, (uint64_t *)mem_data->va_addr, mem_data->size);
    pr_debug("MODEL IOCTL: AMDKFD_IOC_FREE_MEMORY_OF_GPU: VA: %lx : Size: %lu, Flags: %x\n", mem_data->va_addr, mem_data->size, mem_data->flags);
    free(mem_data);
    return 0;
  }
  case AMDKFD_IOC_MAP_MEMORY_TO_GPU:
  {
    struct kfd_ioctl_map_memory_to_gpu_args *args = arg;
    struct model_mem_data *mem_data = (void *)args->handle;
    /* n_success is both progress counter and index of the next device. */
    while (args->n_success < args->n_devices)
    {
      uint32_t gpu_id = ((uint32_t *)args->device_ids_array_ptr)[args->n_success];
      uint32_t node_id = gpu_id - 1;
      assert(node_id < model_num_nodes);
      if (mem_data->mapped_nodes_bitmask & (1llu << node_id))
      {
        fprintf(stderr, "model: Already mapped\n");
        abort();
      }
      assert(model_nodes[node_id].aperture);
      unsigned prot = PROT_READ;
      if (mem_data->flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE)
        prot |= PROT_WRITE;
      // TODO: Mark *shader*-executable memory?
      pr_debug("MODEL IOCTL: AMDKFD_IOC_MAP_MEMORY_TO_GPU: VA: %lx : Size: %lu, Flags: %x\n", mem_data->va_addr, mem_data->size, mem_data->flags);
      /* Map the backing-file range over the reserved aperture at the GPU
       * virtual address. */
      void *ret = mmap(VOID_PTR_ADD(model_nodes[node_id].aperture, mem_data->va_addr),
                       mem_data->size, prot,
                       MAP_SHARED | MAP_FIXED, hsakmt_kfd_fd, mem_data->file_offset);
      if (ret == MAP_FAILED)
      {
        fprintf(stderr, "model: mmap failed\n");
        abort();
      }
      mem_data->mapped_nodes_bitmask |= (1llu << node_id);
      args->n_success++;
    }
    return 0;
  }
  case AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU:
  {
    pr_debug("MODEL IOCTL: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n");
    struct kfd_ioctl_unmap_memory_from_gpu_args *args = arg;
    struct model_mem_data *mem_data = (void *)args->handle;
    while (args->n_success < args->n_devices)
    {
      uint32_t gpu_id = ((uint32_t *)args->device_ids_array_ptr)[args->n_success];
      uint32_t node_id = gpu_id - 1;
      assert(node_id < model_num_nodes);
      if (!(mem_data->mapped_nodes_bitmask & (1llu << node_id)))
      {
        fprintf(stderr, "model: Not mapped\n");
        abort();
      }
      assert(model_nodes[node_id].aperture);
      /* Overwrite the mapping with an empty mapping to keep
       * it reserved. */
      void *ret = mmap(VOID_PTR_ADD(model_nodes[node_id].aperture, mem_data->va_addr),
                       mem_data->size, PROT_NONE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED | MAP_NORESERVE, -1, 0);
      if (ret == MAP_FAILED)
      {
        perror("model: unmap failed");
        abort();
      }
      mem_data->mapped_nodes_bitmask &= ~(1llu << node_id);
      args->n_success++;
    }
    args->n_success = args->n_devices;
    return 0;
  }
  case AMDKFD_IOC_CREATE_EVENT:
  {
    struct kfd_ioctl_create_event_args *args = arg;
    pr_debug("MODEL IOCTL: AMDKFD_IOC_CREATE_EVENT: %u\n", args->event_type);
    // Find a free slot
    unsigned i;
    for (i = 0; i < model_event_limit; i += 64)
    {
      uint64_t bitmap = model_event_bitmap[i / 64];
      if (bitmap == ~(uint64_t)0)
        continue;
      /* First clear bit of this word (ffsll is 1-based). */
      i += ffsll(~bitmap) - 1;
      break;
    }
    if (i >= model_event_limit)
    {
      fprintf(stderr, "model: Ran out of event slots. Should be an application error.\n");
      abort();
    }
    // Allocate the signal
    model_event_bitmap[i / 64] |= (uint64_t)1 << (i % 64);
    model_events[i].event_type = args->event_type;
    model_events[i].auto_reset = args->auto_reset;
    model_events[i].value = 0;
    args->event_trigger_data = 0xbadf001; // ???
    args->event_id = 1 + i;
    args->event_slot_index = ~0;
    return 0;
  }
  case AMDKFD_IOC_WAIT_EVENTS:
  {
    struct kfd_ioctl_wait_events_args *args = arg;
    struct kfd_event_data *events = (void *)args->events_ptr;
    pr_debug("MODEL IOCTL: AMDKFD_IOC_WAIT_EVENTS: %u\n", args->num_events);

    /* Reject ids outside the event table up front; id 0 wraps to a huge
     * slot number and is caught by the same comparison. */
    for (unsigned i = 0; i < args->num_events; ++i)
    {
      unsigned slot = events[i].event_id - 1;
      if (slot >= model_event_limit)
      {
        fprintf(stderr, "model: trying to wait on an event that doesn't exist.\n");
        abort();
      }
    }

    /* A timeout of 0xffffffff milliseconds means "wait forever". */
    bool have_timeout = args->timeout != 0xffffffffu;
    bool hit_timeout = false;
    struct timespec timeout;
    if (have_timeout)
    {
      /* Absolute deadline on the clock the condvar was initialised with. */
      clock_gettime(CLOCK_MONOTONIC, &timeout);
      timeout.tv_sec += args->timeout / 1000;
      timeout.tv_nsec += (args->timeout % 1000) * 1000000;
      /* tv_nsec must stay below one second; exactly 1000000000 is already
       * invalid and would make pthread_cond_timedwait() fail with EINVAL. */
      if (timeout.tv_nsec >= 1000000000)
      {
        timeout.tv_nsec -= 1000000000;
        timeout.tv_sec++;
      }
    }
    for (;;)
    {
      /* wait_for_all: start from "ready" and stop at the first event that
       * is not; wait-for-any: start from "not ready" and stop at the first
       * event that is. */
      bool final_ready = args->wait_for_all;
      for (unsigned i = 0; i < args->num_events; ++i)
      {
        unsigned slot = events[i].event_id - 1;
        struct model_event *event = &model_events[slot];
        bool this_ready = false;
        if (event->event_type == HSA_EVENTTYPE_SIGNAL)
        {
          uint64_t current_age = event->value;
          uint64_t target_age = events[i].signal_event_data.last_event_age;
          this_ready = current_age >= target_age;
        }
        else if (event->event_type == HSA_EVENTTYPE_HW_EXCEPTION ||
                 event->event_type == HSA_EVENTTYPE_NODECHANGE ||
                 event->event_type == HSA_EVENTTYPE_DEVICESTATECHANGE ||
                 event->event_type == HSA_EVENTTYPE_DEBUG_EVENT ||
                 event->event_type == HSA_EVENTTYPE_PROFILE_EVENT ||
                 event->event_type == HSA_EVENTTYPE_MEMORY)
        {
          // These never happen in the model
        }
        else
        {
          fprintf(stderr, "model: Unimplemented event type\n");
          abort();
        }
        if (final_ready != this_ready)
        {
          final_ready = this_ready;
          break;
        }
      }
      if (final_ready)
        break;
      /* Sleep until some event is set; the mutex is released while waiting. */
      if (have_timeout)
      {
        int ret = pthread_cond_timedwait(
            &model_event_condvar, &model_ioctl_mutex, &timeout);
        if (ret == ETIMEDOUT)
        {
          hit_timeout = true;
          break;
        }
      }
      else
      {
        pthread_cond_wait(&model_event_condvar, &model_ioctl_mutex);
      }
    }
    /* Record most recent event ages and perform auto reset. */
    for (unsigned i = 0; i < args->num_events; ++i)
    {
      unsigned slot = events[i].event_id - 1;
      struct model_event *event = &model_events[slot];
      if (event->event_type == HSA_EVENTTYPE_SIGNAL)
      {
        uint64_t last_age = event->value;
        if (event->auto_reset && last_age >= events[i].signal_event_data.last_event_age)
          event->value = 0;
        events[i].signal_event_data.last_event_age = last_age;
      }
    }
    args->wait_result = hit_timeout ? KFD_IOC_WAIT_RESULT_TIMEOUT
                                    : KFD_IOC_WAIT_RESULT_COMPLETE;
    return 0;
  }
  case AMDKFD_IOC_SET_EVENT:
  {
    struct kfd_ioctl_set_event_args *args = arg;
    model_set_event(NULL, args->event_id);
    return 0;
  }
  case AMDKFD_IOC_RESET_EVENT:
  {
    pr_debug("MODEL IOCTL: AMDKFD_IOC_RESET_EVENT\n");
    struct kfd_ioctl_reset_event_args *args = arg;
    unsigned slot = args->event_id - 1;
    /* Same bounds rule as the other event ioctls; id 0 wraps and fails. */
    if (slot >= model_event_limit)
    {
      fprintf(stderr, "model: trying to reset an event that doesn't exist.\n");
      abort();
    }
    struct model_event *event = &model_events[slot];
    if (event->event_type == HSA_EVENTTYPE_SIGNAL)
    {
      model_events[slot].value = 0;
    }
    else
    {
      fprintf(stderr, "model: Unimplemented event type\n");
      abort();
    }
    return 0;
  }
  case AMDKFD_IOC_DESTROY_EVENT:
  {
    struct kfd_ioctl_destroy_event_args *args = arg;
    unsigned i = args->event_id - 1;
    if (i >= model_event_limit || !(model_event_bitmap[i / 64] & ((uint64_t)1 << (i % 64))))
    {
      fprintf(stderr, "model: trying to destroy an event that doesn't exist.\n");
      abort();
    }
    memset(&model_events[i], 0, sizeof(model_events[i]));
    model_event_bitmap[i / 64] &= ~((uint64_t)1 << (i % 64));
    return 0;
  }
  case AMDKFD_IOC_CREATE_QUEUE:
  {
    pr_debug("MODEL IOCTL: AMDKFD_IOC_CREATE_QUEUE\n");
    struct kfd_ioctl_create_queue_args *args = arg;
    unsigned node_id = args->gpu_id - 1;
    assert(node_id < model_num_nodes);
    assert(model_nodes[node_id].model);
    const bool supported_queue_type = args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL ||
                                      args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA;
    if (!supported_queue_type)
    {
      fprintf(stderr, "model: Unsupported queue type\n");
      abort();
    }
    /* First free table slot becomes the queue id. */
    unsigned queue_id = 0;
    while (queue_id < MAX_MODEL_QUEUES && model_queues[queue_id].queue)
      queue_id++;
    if (queue_id >= MAX_MODEL_QUEUES)
    {
      fprintf(stderr, "model: too many queues\n");
      abort();
    }
    struct hsakmt_model_queue_info info = {0};
    info.ring_base_address = args->ring_base_address;
    info.ring_size = args->ring_size;
    info.write_pointer_address = args->write_pointer_address;
    info.read_pointer_address = args->read_pointer_address;
    info.queue_type = args->queue_type;
    model_queues[queue_id].queue =
        model_functions->register_queue(model_nodes[node_id].model, &info);
    model_queues[queue_id].node_id = node_id;
    args->queue_id = queue_id;
    // Note that strictly speaking, this is the offset into the hsakmt_kfd_fd
    // file, not the DRM fd (but they are the same in our case).
    args->doorbell_offset = model_nodes[node_id].doorbell_offset + 8 * queue_id;
    return 0;
  }
  case AMDKFD_IOC_DESTROY_QUEUE:
  {
    struct kfd_ioctl_destroy_queue_args *args = arg;
    if (args->queue_id >= MAX_MODEL_QUEUES || !model_queues[args->queue_id].queue)
    {
      fprintf(stderr, "model: trying to destroy a queue that doesn't exist\n");
      abort();
    }
    struct model_queue *queue = &model_queues[args->queue_id];
    // Older model versions simply leak the queue.
    if (model_functions->version_minor >= 3)
      model_functions->destroy_queue(model_nodes[queue->node_id].model, queue->queue);
    queue->queue = NULL;
    return 0;
  }
  case AMDKFD_IOC_GET_TILE_CONFIG:
  {
    pr_debug("MODEL IOCTL: AMDKFD_IOC_GET_TILE_CONFIG\n");
    struct kfd_ioctl_get_tile_config_args *args = arg;
    args->gb_addr_config = 0x10000444;
    return 0;
  }
  case AMDKFD_IOC_SET_SCRATCH_BACKING_VA:
    pr_debug("MODEL IOCTL: AMDKFD_IOC_SET_SCRATCH_BACKING_VA\n");
    // no-op -- scratch allocations are communicated via amd_queue_s
    return 0;
  case AMDKFD_IOC_RUNTIME_ENABLE:
    pr_debug("MODEL IOCTL: AMDKFD_IOC_RUNTIME_ENABLE\n");
    fprintf(stderr, "model: Debugger runtime not implemented\n");
    fprintf(stderr, "Fix this by clearing bit 30 of the 'capability' field in $HSA_MODEL_TOPOLOGY/%%d/properties\n");
    abort();
  default:
    fprintf(stderr, "model: Unimplemented KFD ioctl\n");
    abort();
  }
}
/*
 * Entry point for KFD ioctls when the model is in use.
 *
 * Every request is handled under one global mutex. This very simple locking
 * strategy is chosen for correctness: ioctls should be rare and uncontended
 * compared with the cost of running the model itself.
 *
 * The bulk of model execution happens in a separate thread *without*
 * holding the ioctl mutex.
 *
 * Returns whatever model_kfd_ioctl_locked() returns.
 */
int model_kfd_ioctl(unsigned long request, void *arg)
{
  int status;

  pthread_mutex_lock(&model_ioctl_mutex);
  status = model_kfd_ioctl_locked(request, arg);
  pthread_mutex_unlock(&model_ioctl_mutex);

  return status;
}
================================================
FILE: libhsakmt/src/libhsakmt.c
================================================
#include
#include
#include
#include "libhsakmt.h"
#include "hsakmt/hsakmtmodel.h"
/*
 * ioctl() wrapper used for all KFD requests.
 *
 * When the software model is enabled the request is routed to
 * model_kfd_ioctl() and fd is ignored. Otherwise ioctl() is issued and
 * transparently retried for as long as it fails with EINTR or EAGAIN.
 *
 * Returns the ioctl result; on failure (-1) errno describes the error.
 */
int hsakmt_ioctl(int fd, unsigned long request, void *arg)
{
	int rc;

	if (hsakmt_use_model)
		return model_kfd_ioctl(request, arg);

	for (;;) {
		rc = ioctl(fd, request, arg);
		if (rc != -1)
			return rc;
		if (errno != EINTR && errno != EAGAIN)
			break;
	}

	/* rc is -1 from here on. */
	if (errno == EBADF) {
		/* In case pthread_atfork didn't catch it, this will
		 * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN.
		 */
		pr_err("KFD file descriptor not valid in this process\n");
		hsakmt_is_forked_child();
	}

	return rc;
}
================================================
FILE: libhsakmt/src/libhsakmt.h
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef LIBHSAKMT_H_INCLUDED
#define LIBHSAKMT_H_INCLUDED
#include "hsakmt/linux/kfd_ioctl.h"
#include "hsakmt/hsakmt.h"
#include
#include
#include
/* Library-wide state shared between the translation units of libhsakmt. */
extern int hsakmt_kfd_fd;
extern int hsakmt_udmabuf_dev_fd;
extern unsigned long hsakmt_kfd_open_count;
/* Checked by CHECK_KFD_OPEN(): when set, API calls are rejected. */
extern bool hsakmt_forked;
extern pthread_mutex_t hsakmt_mutex;
extern bool hsakmt_is_dgpu;
extern bool hsakmt_is_svm_api_supported;
extern int hsakmt_zfb_support;
/* Its KernelInterfaceMinorVersion field is what CHECK_KFD_MINOR_VERSION()
 * compares against. */
extern HsaVersionInfo hsakmt_kfd_version_info;
/* Inside the library, public entry points are exported by giving them
 * default symbol visibility; any earlier definition is replaced. */
#undef HSAKMTAPI
#define HSAKMTAPI __attribute__((visibility ("default")))
/* SANITIZER_AMDGPU is defined only for clang builds with the address
 * sanitizer feature enabled. */
#if defined(__clang__)
#if __has_feature(address_sanitizer)
#define SANITIZER_AMDGPU 1
#endif
#endif
/*Avoid pointer-to-int-cast warning*/
#define PORT_VPTR_TO_UINT64(vptr) ((uint64_t)(unsigned long)(vptr))
/*Avoid int-to-pointer-cast warning*/
#define PORT_UINT64_TO_VPTR(v) ((void*)(unsigned long)(v))
#define CHECK_KFD_OPEN() \
do { if (hsakmt_kfd_open_count == 0 || hsakmt_forked) return HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; } while (0)
#define CHECK_KFD_MINOR_VERSION(minor) \
do { if ((minor) > hsakmt_kfd_version_info.KernelInterfaceMinorVersion)\
return HSAKMT_STATUS_NOT_SUPPORTED; } while (0)
/* Runtime page size and its log2, used as fallbacks for PAGE_SIZE/PAGE_SHIFT. */
extern int hsakmt_page_size;
extern int hsakmt_page_shift;
/* Might be defined in limits.h on platforms where it is constant (used by musl) */
/* See also: https://pubs.opengroup.org/onlinepubs/7908799/xsh/limits.h.html */
#ifndef PAGE_SIZE
#define PAGE_SIZE hsakmt_page_size
#endif
#ifndef PAGE_SHIFT
#define PAGE_SHIFT hsakmt_page_shift
#endif
/* VI HW bug requires this virtual address alignment */
#define TONGA_PAGE_SIZE 0x8000
/* 64KB BigK fragment size for TLB efficiency */
#define GPU_BIGK_PAGE_SIZE (1 << 16)
/* 2MB huge page size for 4-level page tables on Vega10 and later GPUs */
#define GPU_HUGE_PAGE_SIZE (2 << 20)
/*
 * Returns HSAKMT_STATUS_INVALID_PARAMETER from the calling function when x
 * (a pointer or an integer) is not a multiple of PAGE_SIZE.
 */
#define CHECK_PAGE_MULTIPLE(x) \
do { if ((uint64_t)PORT_VPTR_TO_UINT64(x) % PAGE_SIZE) return HSAKMT_STATUS_INVALID_PARAMETER; } while(0)
/* Round x up to a multiple of align; the mask arithmetic requires align to be a power of two. */
#define ALIGN_UP(x,align) (((uint64_t)(x) + (align) - 1) & ~(uint64_t)((align)-1))
/* 32-bit variant of ALIGN_UP. */
#define ALIGN_UP_32(x,align) (((uint32_t)(x) + (align) - 1) & ~(uint32_t)((align)-1))
#define PAGE_ALIGN_UP(x) ALIGN_UP(x,PAGE_SIZE)
/* 64-bit mask with the n lowest bits set; n == 0 yields 0 instead of shifting by the full width. */
#define BITMASK(n) ((n) ? (UINT64_MAX >> (sizeof(UINT64_MAX) * CHAR_BIT - (n))) : 0)
/* Element count of a true array (not valid for pointers). */
#define ARRAY_LEN(array) (sizeof(array) / sizeof(array[0]))
/* HSA Thunk logging usage */
/* Current verbosity threshold; messages with level <= this value are printed. */
extern int hsakmt_debug_level;
/* printf-style logging to stderr, filtered by hsakmt_debug_level. */
#define hsakmt_print(level, fmt, ...) \
do { if (level <= hsakmt_debug_level) fprintf(stderr, fmt, ##__VA_ARGS__); } while (0)
/* Log levels: DEFAULT (-1) is below every message level, so nothing is printed. */
#define HSAKMT_DEBUG_LEVEL_DEFAULT -1
#define HSAKMT_DEBUG_LEVEL_ERR 3
#define HSAKMT_DEBUG_LEVEL_WARNING 4
#define HSAKMT_DEBUG_LEVEL_INFO 6
#define HSAKMT_DEBUG_LEVEL_DEBUG 7
/* Per-level convenience wrappers around hsakmt_print(). */
#define pr_err(fmt, ...) \
hsakmt_print(HSAKMT_DEBUG_LEVEL_ERR, fmt, ##__VA_ARGS__)
#define pr_warn(fmt, ...) \
hsakmt_print(HSAKMT_DEBUG_LEVEL_WARNING, fmt, ##__VA_ARGS__)
#define pr_info(fmt, ...) \
hsakmt_print(HSAKMT_DEBUG_LEVEL_INFO, fmt, ##__VA_ARGS__)
#define pr_debug(fmt, ...) \
hsakmt_print(HSAKMT_DEBUG_LEVEL_DEBUG, fmt, ##__VA_ARGS__)
/*
 * Emit the message only the first time this call site is reached. The static
 * flag is set even when the current debug level filters the message out.
 * Uses a GNU statement expression.
 */
#define pr_err_once(fmt, ...) \
({ \
static bool __print_once; \
if (!__print_once) { \
__print_once = true; \
pr_err(fmt, ##__VA_ARGS__); \
} \
})
/* Warning-level counterpart of pr_err_once(). */
#define pr_warn_once(fmt, ...) \
({ \
static bool __print_once; \
if (!__print_once) { \
__print_once = true; \
pr_warn(fmt, ##__VA_ARGS__); \
} \
})
/* Expects gfxv (full) in decimal */
/* Decimal layout is MMmmss: two digits each for major, minor and stepping. */
#define HSA_GET_GFX_VERSION_MAJOR(gfxv) (((gfxv) / 10000) % 100)
#define HSA_GET_GFX_VERSION_MINOR(gfxv) (((gfxv) / 100) % 100)
#define HSA_GET_GFX_VERSION_STEP(gfxv) ((gfxv) % 100)
/* Expects HSA_ENGINE_ID.ui32, returns gfxv (full) in hex */
/* Hex layout is (Major << 16) | (Minor << 8) | Stepping. */
#define HSA_GET_GFX_VERSION_FULL(ui32) \
(((ui32.Major) << 16) | ((ui32.Minor) << 8) | (ui32.Stepping))
/*
 * Known GFX versions in the hex "full" form, which appears to be the same
 * (major << 16) | (minor << 8) | stepping encoding that
 * HSA_GET_GFX_VERSION_FULL() produces. Several ASIC names intentionally share
 * one value (e.g. Fiji/Polaris/VegaM are all 0x080003), and the entries are
 * not strictly sorted by value (RENOIR follows GFX950).
 */
enum full_gfx_versions {
GFX_VERSION_KAVERI = 0x070000,
GFX_VERSION_HAWAII = 0x070001,
GFX_VERSION_CARRIZO = 0x080001,
GFX_VERSION_TONGA = 0x080002,
GFX_VERSION_FIJI = 0x080003,
GFX_VERSION_POLARIS10 = 0x080003,
GFX_VERSION_POLARIS11 = 0x080003,
GFX_VERSION_POLARIS12 = 0x080003,
GFX_VERSION_VEGAM = 0x080003,
GFX_VERSION_VEGA10 = 0x090000,
GFX_VERSION_RAVEN = 0x090002,
GFX_VERSION_VEGA12 = 0x090004,
GFX_VERSION_VEGA20 = 0x090006,
GFX_VERSION_ARCTURUS = 0x090008,
GFX_VERSION_ALDEBARAN = 0x09000A,
GFX_VERSION_AQUA_VANJARAM = 0x090400,
GFX_VERSION_GFX950 = 0x090500,
GFX_VERSION_RENOIR = 0x09000C,
GFX_VERSION_NAVI10 = 0x0A0100,
GFX_VERSION_NAVI12 = 0x0A0101,
GFX_VERSION_NAVI14 = 0x0A0102,
GFX_VERSION_CYAN_SKILLFISH = 0x0A0103,
GFX_VERSION_SIENNA_CICHLID = 0x0A0300,
GFX_VERSION_NAVY_FLOUNDER = 0x0A0301,
GFX_VERSION_DIMGREY_CAVEFISH = 0x0A0302,
GFX_VERSION_VANGOGH = 0x0A0303,
GFX_VERSION_BEIGE_GOBY = 0x0A0304,
GFX_VERSION_YELLOW_CARP = 0x0A0305,
GFX_VERSION_PLUM_BONITO = 0x0B0000,
GFX_VERSION_WHEAT_NAS = 0x0B0001,
GFX_VERSION_GFX1200 = 0x0C0000,
GFX_VERSION_GFX1201 = 0x0C0001,
};
/* One row describing a device: its device ID, GFXIP version triple and name. */
struct hsa_gfxip_table {
uint16_t device_id; // Device ID
unsigned char major; // GFXIP Major engine version
unsigned char minor; // GFXIP Minor engine version
unsigned char stepping; // GFXIP Stepping info
const char *amd_name; // CALName of the device
};
/*
 * Internal (non-exported) helpers shared between the library's source files.
 * Their implementations are not part of this header; the notes below only
 * restate what the signatures and visible callers show.
 */
HSAKMT_STATUS hsakmt_init_kfd_version(void);
/* True for GFX versions at or above Vega10 (full hex gfxv expected). */
#define IS_SOC15(gfxv) ((gfxv) >= GFX_VERSION_VEGA10)
/* Node ID <-> GPU ID translation and per-node queries. */
HSAKMT_STATUS hsakmt_validate_nodeid(uint32_t nodeid, uint32_t *gpu_id);
HSAKMT_STATUS hsakmt_gpuid_to_nodeid(uint32_t gpu_id, uint32_t* node_id);
uint32_t hsakmt_get_gfxv_by_node_id(HSAuint32 node_id);
bool hsakmt_prefer_ats(HSAuint32 node_id);
uint16_t hsakmt_get_device_id_by_node_id(HSAuint32 node_id);
uint16_t hsakmt_get_device_id_by_gpu_id(HSAuint32 gpu_id);
uint32_t hsakmt_get_direct_link_cpu(uint32_t gpu_node);
int get_drm_render_fd_by_gpu_id(HSAuint32 gpu_id);
/*
 * Validates NodeArray and hands back an array of GPU IDs through
 * gpu_id_array; callers in memory.c free() that array themselves on failure
 * paths. NOTE(review): NumberOfNodes is 32-bit here while the public APIs
 * pass a 64-bit count - confirm truncation is acceptable.
 */
HSAKMT_STATUS hsakmt_validate_nodeid_array(uint32_t **gpu_id_array,
uint32_t NumberOfNodes, uint32_t *NodeArray);
/* Topology queries. */
HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaSystemProperties *props);
HSAKMT_STATUS hsakmt_topology_get_node_props(HSAuint32 NodeId,
HsaNodeProperties *NodeProperties);
HSAKMT_STATUS hsakmt_topology_get_iolink_props(HSAuint32 NodeId,
HSAuint32 NumIoLinks,
HsaIoLinkProperties *IoLinkProperties);
void hsakmt_topology_setup_is_dgpu_param(HsaNodeProperties *props);
bool hsakmt_topology_is_svm_needed(HSA_ENGINE_ID EngineId);
/* Converts an HSA_PAGE_SIZE_* flag value into a size in bytes (memory.c). */
HSAuint32 hsakmt_PageSizeFromFlags(unsigned int pageSizeFlags);
/* Executable, aligned GPU-accessible allocations and their release. */
void* hsakmt_allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align,
uint32_t gpu_id,
uint32_t NodeId, bool NonPaged,
bool DeviceLocal, bool Uncached);
void hsakmt_free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align);
/* Per-subsystem init/teardown pairs. */
HSAKMT_STATUS hsakmt_init_process_doorbells(unsigned int NumNodes);
void hsakmt_destroy_process_doorbells(void);
HSAKMT_STATUS hsakmt_init_device_debugging_memory(unsigned int NumNodes);
void hsakmt_destroy_device_debugging_memory(void);
bool hsakmt_debug_get_reg_status(uint32_t node_id);
HSAKMT_STATUS hsakmt_init_counter_props(unsigned int NumNodes);
void hsakmt_destroy_counter_props(void);
uint32_t *hsakmt_convert_queue_ids(HSAuint32 NumQueues, HSA_QUEUEID *Queues);
/* ioctl wrapper used for all KFD requests in this library. */
extern int hsakmt_ioctl(int fd, unsigned long request, void *arg);
/*
 * Void pointer arithmetic (or remove -Wpointer-arith to allow void pointers
 * arithmetic).
 *
 * Every macro argument and every expansion is fully parenthesized so that
 * compound arguments (e.g. VOID_PTR_SUB(p, a + b)) and surrounding operators
 * cannot change the meaning of the expansion.
 */
/* ptr + n 32-bit words */
#define VOID_PTR_ADD32(ptr,n) ((void*)((uint32_t*)(ptr) + (n)))
/* ptr + n bytes */
#define VOID_PTR_ADD(ptr,n) ((void*)((uint8_t*)(ptr) + (n)))
/* ptr - n bytes */
#define VOID_PTR_SUB(ptr,n) ((void*)((uint8_t*)(ptr) - (n)))
/* byte distance ptr1 - ptr2 */
#define VOID_PTRS_SUB(ptr1,ptr2) ((uint64_t)((uint8_t*)(ptr1) - (uint8_t*)(ptr2)))
/* Type-generic min/max that evaluate each argument once (GNU extensions). */
#define MIN(a, b) ({ \
typeof(a) tmp1 = (a), tmp2 = (b); \
tmp1 < tmp2 ? tmp1 : tmp2; })
#define MAX(a, b) ({ \
typeof(a) tmp1 = (a), tmp2 = (b); \
tmp1 > tmp2 ? tmp1 : tmp2; })
/* 1 when x is a non-zero power of two, 0 otherwise. x is evaluated more than once. */
#define POWER_OF_2(x) (((x) && (!((x) & ((x) - 1)))) ? 1 : 0)
/*
 * State-reset helpers; clear_after_fork() in openclose.c calls the three
 * hsakmt_*clear* functions when a forked child re-opens KFD.
 */
void hsakmt_clear_events_page(void);
void hsakmt_fmm_clear_all_mem(void);
void hsakmt_clear_process_doorbells(void);
uint32_t hsakmt_get_num_sysfs_nodes(void);
/* Reports whether the process has forked since the library last checked. */
bool hsakmt_is_forked_child(void);
/* Calculate VGPR and SGPR register file size per CU */
uint32_t hsakmt_get_vgpr_size_per_cu(uint32_t gfxv);
/* SGPR file size per CU is a fixed 0x4000 (presumably bytes - confirm). */
#define SGPR_SIZE_PER_CU 0x4000
#endif
================================================
FILE: libhsakmt/src/libhsakmt.ver
================================================
/*
 * Symbol version script for libhsakmt: everything listed under "global" is
 * exported, every other symbol is kept local to the library.
 */
HSAKMT_1
{
global:
hsaKmtOpenKFD;
hsaKmtCloseKFD;
hsaKmtGetVersion;
hsaKmtAcquireSystemProperties;
hsaKmtReleaseSystemProperties;
hsaKmtGetNodeProperties;
hsaKmtGetNodeMemoryProperties;
hsaKmtGetNodeCacheProperties;
hsaKmtGetNodeIoLinkProperties;
hsaKmtCreateEvent;
hsaKmtDestroyEvent;
hsaKmtSetEvent;
hsaKmtResetEvent;
hsaKmtQueryEventState;
hsaKmtWaitOnEvent;
hsaKmtWaitOnMultipleEvents;
hsaKmtCreateQueue;
hsaKmtUpdateQueue;
hsaKmtDestroyQueue;
hsaKmtSetQueueCUMask;
hsaKmtSetMemoryPolicy;
hsaKmtAllocMemory;
hsaKmtAllocMemoryAlign;
hsaKmtFreeMemory;
hsaKmtAvailableMemory;
hsaKmtRegisterMemory;
hsaKmtRegisterMemoryToNodes;
hsaKmtRegisterMemoryWithFlags;
hsaKmtRegisterGraphicsHandleToNodes;
/* Defined with HSAKMTAPI in memory.c; must be listed or "local: *" hides it. */
hsaKmtRegisterGraphicsHandleToNodesExt;
hsaKmtShareMemory;
hsaKmtRegisterSharedHandle;
hsaKmtRegisterSharedHandleToNodes;
hsaKmtProcessVMRead;
hsaKmtProcessVMWrite;
hsaKmtDeregisterMemory;
hsaKmtMapMemoryToGPU;
hsaKmtMapMemoryToGPUNodes;
hsaKmtUnmapMemoryToGPU;
hsaKmtDbgRegister;
hsaKmtDbgUnregister;
hsaKmtDbgWavefrontControl;
hsaKmtDbgAddressWatch;
hsaKmtDbgEnable;
hsaKmtDbgDisable;
hsaKmtDbgGetDeviceData;
hsaKmtDbgGetQueueData;
hsaKmtGetClockCounters;
hsaKmtPmcGetCounterProperties;
hsaKmtPmcRegisterTrace;
hsaKmtPmcUnregisterTrace;
hsaKmtPmcAcquireTraceAccess;
hsaKmtPmcReleaseTraceAccess;
hsaKmtPmcStartTrace;
hsaKmtPmcQueryTrace;
hsaKmtPmcStopTrace;
hsaKmtMapGraphicHandle;
hsaKmtUnmapGraphicHandle;
hsaKmtSetTrapHandler;
hsaKmtGetTileConfig;
hsaKmtQueryPointerInfo;
hsaKmtSetMemoryUserData;
hsaKmtGetQueueInfo;
hsaKmtAllocQueueGWS;
hsaKmtRuntimeEnable;
hsaKmtRuntimeDisable;
hsaKmtCheckRuntimeDebugSupport;
hsaKmtGetRuntimeCapabilities;
hsaKmtDebugTrapIoctl;
hsaKmtSPMAcquire;
hsaKmtSPMRelease;
hsaKmtSPMSetDestBuffer;
hsaKmtSVMSetAttr;
hsaKmtSVMGetAttr;
hsaKmtSetXNACKMode;
hsaKmtGetXNACKMode;
hsaKmtOpenSMI;
hsaKmtExportDMABufHandle;
hsaKmtWaitOnEvent_Ext;
hsaKmtWaitOnMultipleEvents_Ext;
hsaKmtReplaceAsanHeaderPage;
hsaKmtReturnAsanHeaderPage;
hsaKmtGetAMDGPUDeviceHandle;
hsaKmtPcSamplingQueryCapabilities;
hsaKmtPcSamplingCreate;
hsaKmtPcSamplingDestroy;
hsaKmtPcSamplingStart;
hsaKmtPcSamplingStop;
hsaKmtPcSamplingSupport;
local: *;
};
================================================
FILE: libhsakmt/src/memory.c
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "libhsakmt.h"
#include "hsakmt/linux/kfd_ioctl.h"
#include
#include
#include
#include
#include
#include
#include
#include "fmm.h"
/*
 * Set the default and alternate cache policies for a node.
 *
 * Only honoured on Kaveri; every other GFX version gets
 * HSAKMT_STATUS_NOT_IMPLEMENTED. Both policies must be HSA_CACHING_CACHED or
 * HSA_CACHING_NONCACHED, and the alternate aperture address and size must be
 * page multiples, otherwise HSAKMT_STATUS_INVALID_PARAMETER is returned.
 * Returns HSAKMT_STATUS_ERROR when the ioctl reports -1.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryPolicy(HSAuint32 Node,
					      HSAuint32 DefaultPolicy,
					      HSAuint32 AlternatePolicy,
					      void *MemoryAddressAlternate,
					      HSAuint64 MemorySizeInBytes)
{
	struct kfd_ioctl_set_memory_policy_args policy_args = {0};
	HSAKMT_STATUS status;
	uint32_t gpu_id;
	bool default_is_legal;
	bool alternate_is_legal;

	CHECK_KFD_OPEN();

	pr_debug("[%s] node %d; default %d; alternate %d\n",
		 __func__, Node, DefaultPolicy, AlternatePolicy);

	status = hsakmt_validate_nodeid(Node, &gpu_id);
	if (status != HSAKMT_STATUS_SUCCESS)
		return status;

	/* This is a legacy API useful on Kaveri only. On dGPU the alternate
	 * aperture is set up and used automatically for coherent allocations,
	 * so applications are not allowed to override it.
	 */
	if (hsakmt_get_gfxv_by_node_id(Node) != GFX_VERSION_KAVERI)
		return HSAKMT_STATUS_NOT_IMPLEMENTED;

	/* Any legal policy and alternate address location is accepted;
	 * you get CC everywhere anyway.
	 */
	default_is_legal = DefaultPolicy == HSA_CACHING_CACHED ||
			   DefaultPolicy == HSA_CACHING_NONCACHED;
	alternate_is_legal = AlternatePolicy == HSA_CACHING_CACHED ||
			     AlternatePolicy == HSA_CACHING_NONCACHED;
	if (!default_is_legal || !alternate_is_legal)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	CHECK_PAGE_MULTIPLE(MemoryAddressAlternate);
	CHECK_PAGE_MULTIPLE(MemorySizeInBytes);

	policy_args.gpu_id = gpu_id;

	if (DefaultPolicy == HSA_CACHING_CACHED)
		policy_args.default_policy = KFD_IOC_CACHE_POLICY_COHERENT;
	else
		policy_args.default_policy = KFD_IOC_CACHE_POLICY_NONCOHERENT;

	if (AlternatePolicy == HSA_CACHING_CACHED)
		policy_args.alternate_policy = KFD_IOC_CACHE_POLICY_COHERENT;
	else
		policy_args.alternate_policy = KFD_IOC_CACHE_POLICY_NONCOHERENT;

	policy_args.alternate_aperture_base = (uintptr_t) MemoryAddressAlternate;
	policy_args.alternate_aperture_size = MemorySizeInBytes;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_MEMORY_POLICY, &policy_args) == -1)
		return HSAKMT_STATUS_ERROR;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Translate an HSA_PAGE_SIZE_* flag value into a page size in bytes.
 * An unrecognised value trips an assert and, when asserts are compiled out,
 * falls back to 4 KiB.
 */
HSAuint32 hsakmt_PageSizeFromFlags(unsigned int pageSizeFlags)
{
	if (pageSizeFlags == HSA_PAGE_SIZE_4KB)
		return 1u << 12;	/* 4 KiB */
	if (pageSizeFlags == HSA_PAGE_SIZE_64KB)
		return 1u << 16;	/* 64 KiB */
	if (pageSizeFlags == HSA_PAGE_SIZE_2MB)
		return 1u << 21;	/* 2 MiB */
	if (pageSizeFlags == HSA_PAGE_SIZE_1GB)
		return 1u << 30;	/* 1 GiB */

	assert(false);
	return 1u << 12;
}
/*
 * Convenience wrapper: forwards to hsaKmtAllocMemoryAlign() with an
 * Alignment of 0, i.e. no explicit alignment request.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemory(HSAuint32 PreferredNode,
					  HSAuint64 SizeInBytes,
					  HsaMemFlags MemFlags,
					  void **MemoryAddress)
{
	const HSAuint64 no_alignment = 0;

	return hsaKmtAllocMemoryAlign(PreferredNode, SizeInBytes, no_alignment,
				      MemFlags, MemoryAddress);
}
/*
 * Allocate SizeInBytes of memory for PreferredNode, optionally aligned.
 *
 * Parameter checks (HSAKMT_STATUS_INVALID_PARAMETER on failure):
 *  - Alignment, when non-zero, must be a power of two and at least the page
 *    size selected by MemFlags.ui32.PageSize.
 *  - MemoryAddress must be non-NULL; SizeInBytes must be a non-zero multiple
 *    of that page size.
 *  - With FixedAddress set, *MemoryAddress must be non-NULL; without it,
 *    *MemoryAddress is reset to NULL before allocating.
 *  - ExtendedCoherent cannot be combined with CoarseGrain or Uncached.
 *  - OnlyAddress and NoAddress cannot both be set (non-scratch requests).
 *
 * The Contiguous flag additionally requires KFD interface minor version 16.
 *
 * Dispatch, in order: scratch memory (Scratch flag), host/system memory
 * (no GPU, paged request, ZFB mode, GTTAccess or OnlyAddress), otherwise
 * device memory. On success *MemoryAddress holds the allocated address.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtAllocMemoryAlign(HSAuint32 PreferredNode,
					       HSAuint64 SizeInBytes,
					       HSAuint64 Alignment,
					       HsaMemFlags MemFlags,
					       void **MemoryAddress)
{
	HSAKMT_STATUS result;
	uint32_t gpu_id;
	HSAuint64 page_size;

	CHECK_KFD_OPEN();

	if (MemFlags.ui32.Contiguous)
		CHECK_KFD_MINOR_VERSION(16);

	pr_debug("[%s] node %d\n", __func__, PreferredNode);

	result = hsakmt_validate_nodeid(PreferredNode, &gpu_id);
	if (result != HSAKMT_STATUS_SUCCESS) {
		pr_err("[%s] invalid node ID: %d\n", __func__, PreferredNode);
		return result;
	}

	page_size = hsakmt_PageSizeFromFlags(MemFlags.ui32.PageSize);

	if (Alignment && (Alignment < page_size || !POWER_OF_2(Alignment)))
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (!MemoryAddress || !SizeInBytes || (SizeInBytes & (page_size-1)))
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (MemFlags.ui32.FixedAddress) {
		if (*MemoryAddress == NULL)
			return HSAKMT_STATUS_INVALID_PARAMETER;
	} else
		*MemoryAddress = NULL;

	if ((MemFlags.ui32.CoarseGrain && MemFlags.ui32.ExtendedCoherent) ||
	    (MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.Uncached))
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (MemFlags.ui32.Scratch) {
		if (Alignment) {
			// Scratch memory currently forced to SCRATCH_ALIGN
			pr_err("[%s] Alignment not supported for scratch memory: %d\n", __func__, PreferredNode);
			return HSAKMT_STATUS_NOT_IMPLEMENTED;
		}

		*MemoryAddress = hsakmt_fmm_allocate_scratch(gpu_id, *MemoryAddress, SizeInBytes);

		if (!(*MemoryAddress)) {
			pr_err("[%s] failed to allocate %lu bytes from scratch\n",
				__func__, SizeInBytes);
			return HSAKMT_STATUS_NO_MEMORY;
		}

		pr_debug("[%s] node %d address %p size %lu from scratch\n", __func__, PreferredNode, *MemoryAddress, SizeInBytes);
		return HSAKMT_STATUS_SUCCESS;
	}

	/* Sanity check: cannot do OnlyAddress and NoAddress alloc at the same
	 * time. This must run before the host-memory branch below, because
	 * OnlyAddress alone already selects that branch; checking afterwards
	 * (as was done previously) could never trigger.
	 */
	if (MemFlags.ui32.OnlyAddress && MemFlags.ui32.NoAddress) {
		pr_err("[%s] allocate addr-only and memory-only at same time\n",
			__func__);
		return HSAKMT_STATUS_INVALID_PARAMETER;
	}

	/* GPU allocated system memory */
	if (!gpu_id || !MemFlags.ui32.NonPaged || hsakmt_zfb_support || MemFlags.ui32.GTTAccess
		|| MemFlags.ui32.OnlyAddress) {
		/* Backwards compatibility hack: Allocate system memory if app
		 * asks for paged memory from a GPU node.
		 */

		/* If allocate VRAM under ZFB mode */
		if (hsakmt_zfb_support && gpu_id && MemFlags.ui32.NonPaged == 1)
			MemFlags.ui32.CoarseGrain = 1;

		*MemoryAddress = hsakmt_fmm_allocate_host(gpu_id, MemFlags.ui32.GTTAccess ? 0 : PreferredNode,
						*MemoryAddress, SizeInBytes, Alignment, MemFlags);

		if (!(*MemoryAddress)) {
			pr_err("[%s] failed to allocate %lu bytes from host\n",
				__func__, SizeInBytes);
			return HSAKMT_STATUS_ERROR;
		}

		pr_debug("[%s] node %d address %p size %lu from host\n", __func__, PreferredNode, *MemoryAddress, SizeInBytes);
		return HSAKMT_STATUS_SUCCESS;
	}

	/* GPU allocated VRAM */
	*MemoryAddress = hsakmt_fmm_allocate_device(gpu_id, PreferredNode, *MemoryAddress,
						SizeInBytes, Alignment, MemFlags);

	if (!(*MemoryAddress)) {
		pr_err("[%s] failed to allocate %lu bytes from device\n",
			__func__, SizeInBytes);
		return HSAKMT_STATUS_NO_MEMORY;
	}

	pr_debug("[%s] node %d address %p size %lu from device\n", __func__, PreferredNode, *MemoryAddress, SizeInBytes);
	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Release memory previously handed out by this library.
 * SizeInBytes is not used; the release is keyed on the address alone.
 * A NULL address is logged and rejected with HSAKMT_STATUS_ERROR.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void *MemoryAddress,
					 HSAuint64 SizeInBytes)
{
	CHECK_KFD_OPEN();

	pr_debug("[%s] address %p\n", __func__, MemoryAddress);

	if (MemoryAddress)
		return hsakmt_fmm_release(MemoryAddress);

	pr_err("FIXME: freeing NULL pointer\n");
	return HSAKMT_STATUS_ERROR;
}
/*
 * Query the amount of memory currently available on a node.
 *
 * Requires KFD interface minor version 9. On success the value reported by
 * the AMDKFD_IOC_AVAILABLE_MEMORY ioctl is stored in *AvailableBytes.
 *
 * Returns the node-validation error for an invalid Node,
 * HSAKMT_STATUS_INVALID_PARAMETER when AvailableBytes is NULL, and
 * HSAKMT_STATUS_ERROR when the ioctl fails.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node,
					      HSAuint64 *AvailableBytes)
{
	struct kfd_ioctl_get_available_memory_args args = {};
	HSAKMT_STATUS result;

	CHECK_KFD_OPEN();
	CHECK_KFD_MINOR_VERSION(9);

	pr_debug("[%s] node %d\n", __func__, Node);

	result = hsakmt_validate_nodeid(Node, &args.gpu_id);
	if (result != HSAKMT_STATUS_SUCCESS) {
		pr_err("[%s] invalid node ID: %d\n", __func__, Node);
		return result;
	}

	/* Reject a NULL output pointer instead of crashing on the store below.
	 * Checked after node validation so existing error results are unchanged.
	 */
	if (!AvailableBytes)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_AVAILABLE_MEMORY, &args))
		return HSAKMT_STATUS_ERROR;

	*AvailableBytes = args.available;
	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Register a user memory range with the library.
 * When the library is not in dGPU mode this is a no-op that reports success;
 * otherwise the range is registered for all GPUs (no explicit GPU list).
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
					     HSAuint64 MemorySizeInBytes)
{
	CHECK_KFD_OPEN();

	pr_debug("[%s] address %p size %lu\n", __func__, MemoryAddress, MemorySizeInBytes);

	if (hsakmt_is_dgpu)
		return hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes,
						  NULL, 0, true, false);

	/* TODO: support mixed APU and dGPU configurations */
	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Register a user memory range for an explicit list of nodes.
 * Only available in dGPU mode (HSAKMT_STATUS_NOT_SUPPORTED otherwise).
 * The GPU ID array produced by node validation is passed on to the
 * registration call and is freed here only when that call fails.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
						    HSAuint64 MemorySizeInBytes,
						    HSAuint64 NumberOfNodes,
						    HSAuint32 *NodeArray)
{
	uint32_t *gpu_ids;
	HSAKMT_STATUS status;

	CHECK_KFD_OPEN();

	pr_debug("[%s] address %p size %lu number of nodes %lu\n",
		 __func__, MemoryAddress, MemorySizeInBytes, NumberOfNodes);

	/* TODO: support mixed APU and dGPU configurations */
	if (!hsakmt_is_dgpu)
		return HSAKMT_STATUS_NOT_SUPPORTED;

	status = hsakmt_validate_nodeid_array(&gpu_ids, NumberOfNodes, NodeArray);
	if (status != HSAKMT_STATUS_SUCCESS)
		return status;

	status = hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes,
					    gpu_ids,
					    NumberOfNodes*sizeof(uint32_t),
					    true, false);
	if (status != HSAKMT_STATUS_SUCCESS)
		free(gpu_ids);

	return status;
}
/*
 * Register a user memory range with explicit coherence flags.
 * ExtendedCoherent together with CoarseGrain is rejected as an invalid
 * parameter. The range must describe ordinary paged host memory
 * (HostAccess == 1, NonPaged != 1) and the library must be in dGPU mode,
 * otherwise HSAKMT_STATUS_NOT_SUPPORTED is returned.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress,
						      HSAuint64 MemorySizeInBytes,
						      HsaMemFlags MemFlags)
{
	bool is_paged_host_memory;

	CHECK_KFD_OPEN();

	pr_debug("[%s] address %p size %lu\n",
		 __func__, MemoryAddress, MemorySizeInBytes);

	if (MemFlags.ui32.ExtendedCoherent && MemFlags.ui32.CoarseGrain)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	// Registered memory should be ordinary paged host memory.
	is_paged_host_memory = (MemFlags.ui32.HostAccess == 1) &&
			       (MemFlags.ui32.NonPaged != 1);

	/* TODO: support mixed APU and dGPU configurations */
	if (!is_paged_host_memory || !hsakmt_is_dgpu)
		return HSAKMT_STATUS_NOT_SUPPORTED;

	return hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes,
					  NULL, 0,
					  MemFlags.ui32.CoarseGrain,
					  MemFlags.ui32.ExtendedCoherent);
}
/*
 * Register a graphics resource handle for a set of nodes using default
 * (all-zero) registration flags. Thin wrapper around
 * hsaKmtRegisterGraphicsHandleToNodesExt().
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodes(HSAuint64 GraphicsResourceHandle,
							    HsaGraphicsResourceInfo *GraphicsResourceInfo,
							    HSAuint64 NumberOfNodes,
							    HSAuint32 *NodeArray)
{
	HSA_REGISTER_MEM_FLAGS default_flags;

	default_flags.Value = 0;

	return hsaKmtRegisterGraphicsHandleToNodesExt(GraphicsResourceHandle,
						      GraphicsResourceInfo,
						      NumberOfNodes, NodeArray,
						      default_flags);
}
/*
 * Register a graphics resource handle for a set of nodes with caller-supplied
 * registration flags. Node validation is skipped only when NodeArray is NULL
 * and NumberOfNodes is 0; in that case a NULL GPU ID list is passed on.
 * The GPU ID array is freed here only when the registration call fails.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterGraphicsHandleToNodesExt(HSAuint64 GraphicsResourceHandle,
							       HsaGraphicsResourceInfo *GraphicsResourceInfo,
							       HSAuint64 NumberOfNodes,
							       HSAuint32 *NodeArray,
							       HSA_REGISTER_MEM_FLAGS RegisterFlags)
{
	uint32_t *gpu_ids = NULL;
	HSAKMT_STATUS status;

	CHECK_KFD_OPEN();

	pr_debug("[%s] number of nodes %lu\n", __func__, NumberOfNodes);

	if (NodeArray != NULL || NumberOfNodes != 0) {
		status = hsakmt_validate_nodeid_array(&gpu_ids,
						      NumberOfNodes, NodeArray);
		if (status != HSAKMT_STATUS_SUCCESS)
			return status;
	}

	status = hsakmt_fmm_register_graphics_handle(
			GraphicsResourceHandle, GraphicsResourceInfo,
			gpu_ids, NumberOfNodes * sizeof(uint32_t), RegisterFlags);
	if (status != HSAKMT_STATUS_SUCCESS)
		free(gpu_ids);

	return status;
}
/*
 * Export a memory range as a DMA-buf file descriptor plus offset.
 * Requires KFD interface minor version 12; the work is delegated to
 * hsakmt_fmm_export_dma_buf_fd().
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtExportDMABufHandle(void *MemoryAddress,
						 HSAuint64 MemorySizeInBytes,
						 int *DMABufFd,
						 HSAuint64 *Offset)
{
	HSAKMT_STATUS status;

	CHECK_KFD_OPEN();
	CHECK_KFD_MINOR_VERSION(12);

	pr_debug("[%s] address %p\n", __func__, MemoryAddress);

	status = hsakmt_fmm_export_dma_buf_fd(MemoryAddress, MemorySizeInBytes,
					      DMABufFd, Offset);
	return status;
}
/*
 * Produce a shared-memory handle for a memory range.
 * A NULL SharedMemoryHandle is rejected with HSAKMT_STATUS_INVALID_PARAMETER;
 * otherwise the request is delegated to hsakmt_fmm_share_memory().
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtShareMemory(void *MemoryAddress,
					  HSAuint64 SizeInBytes,
					  HsaSharedMemoryHandle *SharedMemoryHandle)
{
	CHECK_KFD_OPEN();

	pr_debug("[%s] address %p\n", __func__, MemoryAddress);

	if (SharedMemoryHandle)
		return hsakmt_fmm_share_memory(MemoryAddress, SizeInBytes,
					       SharedMemoryHandle);

	return HSAKMT_STATUS_INVALID_PARAMETER;
}
/*
 * Register a shared-memory handle without an explicit node list.
 * Equivalent to hsaKmtRegisterSharedHandleToNodes() with zero nodes and a
 * NULL node array.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandle(const HsaSharedMemoryHandle *SharedMemoryHandle,
						   void **MemoryAddress,
						   HSAuint64 *SizeInBytes)
{
	const HSAuint64 no_nodes = 0;

	CHECK_KFD_OPEN();

	pr_debug("[%s] handle %p\n", __func__, SharedMemoryHandle);

	return hsaKmtRegisterSharedHandleToNodes(SharedMemoryHandle, MemoryAddress,
						 SizeInBytes, no_nodes, NULL);
}
/*
 * Register a shared-memory handle, optionally for an explicit node list.
 * A NULL SharedMemoryHandle yields HSAKMT_STATUS_INVALID_PARAMETER. Node
 * validation only happens when NodeArray is non-NULL. Whenever validation or
 * registration fails, the GPU ID array (if any) is freed before returning
 * the failing status.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterSharedHandleToNodes(const HsaSharedMemoryHandle *SharedMemoryHandle,
							  void **MemoryAddress,
							  HSAuint64 *SizeInBytes,
							  HSAuint64 NumberOfNodes,
							  HSAuint32 *NodeArray)
{
	uint32_t *gpu_ids = NULL;
	HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;

	CHECK_KFD_OPEN();

	pr_debug("[%s] handle %p number of nodes %lu\n",
		 __func__, SharedMemoryHandle, NumberOfNodes);

	if (!SharedMemoryHandle)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (NodeArray)
		status = hsakmt_validate_nodeid_array(&gpu_ids, NumberOfNodes, NodeArray);

	if (status == HSAKMT_STATUS_SUCCESS)
		status = hsakmt_fmm_register_shared_memory(SharedMemoryHandle,
							   SizeInBytes,
							   MemoryAddress,
							   gpu_ids,
							   NumberOfNodes*sizeof(uint32_t));

	/* free(NULL) is a no-op, so no separate NULL test is needed. */
	if (status != HSAKMT_STATUS_SUCCESS)
		free(gpu_ids);

	return status;
}
/*
 * Deprecated entry point kept for ABI compatibility: logs an error and
 * always returns HSAKMT_STATUS_NOT_IMPLEMENTED. All parameters are ignored.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMRead(HSAuint32 Pid,
					    HsaMemoryRange *LocalMemoryArray,
					    HSAuint64 LocalMemoryArrayCount,
					    HsaMemoryRange *RemoteMemoryArray,
					    HSAuint64 RemoteMemoryArrayCount,
					    HSAuint64 *SizeCopied)
{
	const HSAKMT_STATUS deprecated = HSAKMT_STATUS_NOT_IMPLEMENTED;

	pr_err("[%s] Deprecated\n", __func__);

	return deprecated;
}
/*
 * Deprecated entry point kept for ABI compatibility: logs an error and
 * always returns HSAKMT_STATUS_NOT_IMPLEMENTED. All parameters are ignored.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtProcessVMWrite(HSAuint32 Pid,
					     HsaMemoryRange *LocalMemoryArray,
					     HSAuint64 LocalMemoryArrayCount,
					     HsaMemoryRange *RemoteMemoryArray,
					     HSAuint64 RemoteMemoryArrayCount,
					     HSAuint64 *SizeCopied)
{
	const HSAKMT_STATUS deprecated = HSAKMT_STATUS_NOT_IMPLEMENTED;

	pr_err("[%s] Deprecated\n", __func__);

	return deprecated;
}
/*
 * Undo a previous memory registration; delegates to
 * hsakmt_fmm_deregister_memory() and returns its status.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDeregisterMemory(void *MemoryAddress)
{
	HSAKMT_STATUS status;

	CHECK_KFD_OPEN();

	pr_debug("[%s] address %p\n", __func__, MemoryAddress);

	status = hsakmt_fmm_deregister_memory(MemoryAddress);
	return status;
}
/*
 * Map a memory range to the GPU(s).
 * A NULL address is logged and rejected with HSAKMT_STATUS_ERROR. When
 * AlternateVAGPU is provided it is zeroed before the mapping call, which may
 * then store a value into it.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPU(void *MemoryAddress,
					     HSAuint64 MemorySizeInBytes,
					     HSAuint64 *AlternateVAGPU)
{
	CHECK_KFD_OPEN();

	pr_debug("[%s] address %p\n", __func__, MemoryAddress);

	if (MemoryAddress == NULL) {
		pr_err("FIXME: mapping NULL pointer\n");
		return HSAKMT_STATUS_ERROR;
	}

	if (AlternateVAGPU != NULL)
		*AlternateVAGPU = 0;

	return hsakmt_fmm_map_to_gpu(MemoryAddress, MemorySizeInBytes, AlternateVAGPU);
}
/*
 * Map a memory range to an explicit list of nodes.
 * MemMapFlags is accepted but not used by this function. A NULL address is
 * logged and rejected with HSAKMT_STATUS_ERROR. Outside dGPU mode a single
 * node request is forwarded to hsaKmtMapMemoryToGPU(). The GPU ID array
 * obtained from node validation is always freed after the mapping call.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtMapMemoryToGPUNodes(void *MemoryAddress,
						  HSAuint64 MemorySizeInBytes,
						  HSAuint64 *AlternateVAGPU,
						  HsaMemMapFlags MemMapFlags,
						  HSAuint64 NumberOfNodes,
						  HSAuint32 *NodeArray)
{
	uint32_t *gpu_ids;
	HSAKMT_STATUS status;

	CHECK_KFD_OPEN();

	pr_debug("[%s] address %p number of nodes %lu\n",
		 __func__, MemoryAddress, NumberOfNodes);

	if (MemoryAddress == NULL) {
		pr_err("FIXME: mapping NULL pointer\n");
		return HSAKMT_STATUS_ERROR;
	}

	if (NumberOfNodes == 1 && !hsakmt_is_dgpu)
		return hsaKmtMapMemoryToGPU(MemoryAddress, MemorySizeInBytes,
					    AlternateVAGPU);

	status = hsakmt_validate_nodeid_array(&gpu_ids, NumberOfNodes, NodeArray);
	if (status != HSAKMT_STATUS_SUCCESS)
		return status;

	status = hsakmt_fmm_map_to_gpu_nodes(MemoryAddress, MemorySizeInBytes,
					     gpu_ids, NumberOfNodes, AlternateVAGPU);

	/* free(NULL) is a no-op, so the array can be released unconditionally. */
	free(gpu_ids);

	return status;
}
/*
 * Unmap a memory range from the GPU(s).
 * A NULL address is logged but reported as success (workaround for a runtime
 * bug). Otherwise a zero result from hsakmt_fmm_unmap_from_gpu() maps to
 * HSAKMT_STATUS_SUCCESS and any other result to HSAKMT_STATUS_ERROR.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapMemoryToGPU(void *MemoryAddress)
{
	CHECK_KFD_OPEN();

	pr_debug("[%s] address %p\n", __func__, MemoryAddress);

	if (MemoryAddress == NULL) {
		/* Workaround for runtime bug */
		pr_err("FIXME: Unmapping NULL pointer\n");
		return HSAKMT_STATUS_SUCCESS;
	}

	return hsakmt_fmm_unmap_from_gpu(MemoryAddress) ? HSAKMT_STATUS_ERROR
							: HSAKMT_STATUS_SUCCESS;
}
/*
 * Unsupported legacy entry point; always returns
 * HSAKMT_STATUS_NOT_IMPLEMENTED and ignores every parameter.
 *
 * This API was only ever implemented in KFD for Kaveri and was never
 * upstreamed. There are no open-source users of this interface. It has been
 * superseded by RegisterGraphicsHandleToNodes.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtMapGraphicHandle(HSAuint32 NodeId,
					       HSAuint64 GraphicDeviceHandle,
					       HSAuint64 GraphicResourceHandle,
					       HSAuint64 GraphicResourceOffset,
					       HSAuint64 GraphicResourceSize,
					       HSAuint64 *FlatMemoryAddress)
{
	const HSAKMT_STATUS unsupported = HSAKMT_STATUS_NOT_IMPLEMENTED;

	return unsupported;
}
/*
 * Unmap a graphics handle mapping by treating FlatMemoryAddress as a pointer
 * and forwarding it to hsaKmtUnmapMemoryToGPU(). NodeId and SizeInBytes are
 * not used.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtUnmapGraphicHandle(HSAuint32 NodeId,
						 HSAuint64 FlatMemoryAddress,
						 HSAuint64 SizeInBytes)
{
	void *flat_ptr;

	CHECK_KFD_OPEN();

	flat_ptr = PORT_UINT64_TO_VPTR(FlatMemoryAddress);
	return hsaKmtUnmapMemoryToGPU(flat_ptr);
}
/*
 * Retrieve the tiling configuration of a node through the
 * AMDKFD_IOC_GET_TILE_CONFIG ioctl.
 *
 * On entry config->TileConfig / config->MacroTileConfig are caller-provided
 * arrays with NumTileConfigs / NumMacroTileConfigs entries. On success the
 * two counts are overwritten with the values reported by the ioctl, and
 * GbAddrConfig, NumBanks and NumRanks are filled in.
 *
 * Returns the node-validation error for an invalid NodeId,
 * HSAKMT_STATUS_INVALID_PARAMETER when config is NULL, and
 * HSAKMT_STATUS_ERROR when the ioctl fails.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetTileConfig(HSAuint32 NodeId, HsaGpuTileConfig *config)
{
	struct kfd_ioctl_get_tile_config_args args = {0};
	uint32_t gpu_id;
	HSAKMT_STATUS result;

	CHECK_KFD_OPEN();

	pr_debug("[%s] node %d\n", __func__, NodeId);

	result = hsakmt_validate_nodeid(NodeId, &gpu_id);
	if (result != HSAKMT_STATUS_SUCCESS)
		return result;

	/* Reject a NULL descriptor instead of crashing on the accesses below.
	 * Checked after node validation so existing error results are unchanged.
	 */
	if (!config)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	/* Avoid Valgrind warnings about uninitialized data. Valgrind doesn't
	 * know that KFD writes this. Skip NULL arrays: passing NULL to memset
	 * is undefined even for a zero length.
	 */
	if (config->TileConfig)
		memset(config->TileConfig, 0,
		       sizeof(*config->TileConfig) * config->NumTileConfigs);
	if (config->MacroTileConfig)
		memset(config->MacroTileConfig, 0,
		       sizeof(*config->MacroTileConfig) * config->NumMacroTileConfigs);

	args.gpu_id = gpu_id;
	/* Use the header's pointer-to-integer helper to avoid cast warnings. */
	args.tile_config_ptr = PORT_VPTR_TO_UINT64(config->TileConfig);
	args.macro_tile_config_ptr = PORT_VPTR_TO_UINT64(config->MacroTileConfig);
	args.num_tile_configs = config->NumTileConfigs;
	args.num_macro_tile_configs = config->NumMacroTileConfigs;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_TILE_CONFIG, &args) != 0)
		return HSAKMT_STATUS_ERROR;

	config->NumTileConfigs = args.num_tile_configs;
	config->NumMacroTileConfigs = args.num_macro_tile_configs;
	config->GbAddrConfig = args.gb_addr_config;
	config->NumBanks = args.num_banks;
	config->NumRanks = args.num_ranks;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Look up information about a pointer.
 * A NULL PointerInfo is rejected with HSAKMT_STATUS_INVALID_PARAMETER;
 * otherwise the query is delegated to hsakmt_fmm_get_mem_info().
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtQueryPointerInfo(const void *Pointer,
					       HsaPointerInfo *PointerInfo)
{
	CHECK_KFD_OPEN();

	pr_debug("[%s] pointer %p\n", __func__, Pointer);

	if (PointerInfo)
		return hsakmt_fmm_get_mem_info(Pointer, PointerInfo);

	return HSAKMT_STATUS_INVALID_PARAMETER;
}
/*
 * Attach caller-defined user data to a pointer; delegates to
 * hsakmt_fmm_set_mem_user_data() and returns its status.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetMemoryUserData(const void *Pointer,
						void *UserData)
{
	HSAKMT_STATUS status;

	CHECK_KFD_OPEN();

	pr_debug("[%s] pointer %p\n", __func__, Pointer);

	status = hsakmt_fmm_set_mem_user_data(Pointer, UserData);
	return status;
}
/*
 * ASAN helper: forwards to hsakmt_fmm_replace_asan_header_page() when the
 * library is built with SANITIZER_AMDGPU; otherwise always returns
 * HSAKMT_STATUS_NOT_SUPPORTED.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtReplaceAsanHeaderPage(void *addr)
{
#ifndef SANITIZER_AMDGPU
	return HSAKMT_STATUS_NOT_SUPPORTED;
#else
	/* The debug trace is emitted before the open check. */
	pr_debug("[%s] address %p\n", __func__, addr);

	CHECK_KFD_OPEN();

	return hsakmt_fmm_replace_asan_header_page(addr);
#endif
}
/*
 * ASAN helper: forwards to hsakmt_fmm_return_asan_header_page() when the
 * library is built with SANITIZER_AMDGPU; otherwise always returns
 * HSAKMT_STATUS_NOT_SUPPORTED.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtReturnAsanHeaderPage(void *addr)
{
#ifndef SANITIZER_AMDGPU
	return HSAKMT_STATUS_NOT_SUPPORTED;
#else
	/* The debug trace is emitted before the open check. */
	pr_debug("[%s] address %p\n", __func__, addr);

	CHECK_KFD_OPEN();

	return hsakmt_fmm_return_asan_header_page(addr);
#endif
}
/*
 * Fetch the amdgpu device handle associated with a node; delegates to
 * hsakmt_fmm_get_amdgpu_device_handle() and returns its status.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetAMDGPUDeviceHandle(HSAuint32 NodeId,
						    HsaAMDGPUDeviceHandle *DeviceHandle)
{
	HSAKMT_STATUS status;

	CHECK_KFD_OPEN();

	status = hsakmt_fmm_get_amdgpu_device_handle(NodeId, DeviceHandle);
	return status;
}
================================================
FILE: libhsakmt/src/openclose.c
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/* glibc macro that enables access some nonstandard GNU/Linux extensions
* such as RTLD_DEFAULT used by dlsym
*/
#define _GNU_SOURCE
#include "libhsakmt.h"
#include "hsakmt/hsakmtmodel.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include "fmm.h"
#include
#include
/* Resolved at open time via dlsym("amdgpu_device_get_fd"); may be NULL. */
int (*hsakmt_fn_amdgpu_device_get_fd)(HsaAMDGPUDeviceHandle device_handle);
/* Device node paths opened by hsaKmtOpenKFD(). */
static const char kfd_device_name[] = "/dev/kfd";
static const char kfd_udmabuf_device_name[] = "/dev/udmabuf";
/* PID remembered by hsakmt_is_forked_child(); -1 means "not recorded yet". */
static pid_t parent_pid = -1;
/* Logging threshold used by hsakmt_print(); set from HSAKMT_DEBUG_LEVEL. */
int hsakmt_debug_level;
/* True once a fork has been detected and until clear_after_fork() runs. */
bool hsakmt_forked;
/* hsakmt_is_forked_child reports whether the process has forked since the
 * PID was last recorded by this function. pthread_atfork alone is not
 * sufficient because a process can fork without going through libc's fork
 * (using clone or issuing the system call directly), so the current PID is
 * compared with the remembered one.
 *
 * The first call (parent_pid == -1) only records the PID and returns false.
 * Once a fork has been detected, hsakmt_forked stays set and every later
 * call returns true.
 */
bool hsakmt_is_forked_child(void)
{
	if (!hsakmt_forked) {
		const pid_t self = getpid();

		if (parent_pid == -1)
			parent_pid = self;
		else if (parent_pid != self)
			hsakmt_forked = true;
	}

	return hsakmt_forked;
}
/* Callbacks from pthread_atfork */
/* Prepare callback: takes hsakmt_mutex so it is held across the fork. */
static void prepare_fork_handler(void)
{
pthread_mutex_lock(&hsakmt_mutex);
}
/* Parent callback: releases hsakmt_mutex taken by prepare_fork_handler(). */
static void parent_fork_handler(void)
{
pthread_mutex_unlock(&hsakmt_mutex);
}
/*
 * Child callback: re-initializes hsakmt_mutex (rather than unlocking it) and
 * marks the process as forked so CHECK_KFD_OPEN() fails until KFD is
 * re-opened.
 */
static void child_fork_handler(void)
{
pthread_mutex_init(&hsakmt_mutex, NULL);
hsakmt_forked = true;
}
/* Call this from the child process after fork. This clears all data that
 * was duplicated from the parent process and is not valid in the child.
 * The topology information duplicated from the parent remains valid in the
 * child process, so it is not cleared.
 *
 * Besides resetting the per-process state, this closes the inherited KFD and
 * udmabuf file descriptors and resets the open count, the remembered parent
 * PID and the forked flag.
 */
static void clear_after_fork(void)
{
	hsakmt_clear_process_doorbells();
	hsakmt_clear_events_page();
	hsakmt_fmm_clear_all_mem();
	hsakmt_destroy_device_debugging_memory();

	/* A descriptor is valid when it is >= 0 (hsaKmtOpenKFD() tests
	 * hsakmt_kfd_fd < 0 for "not open"). Testing for non-zero instead
	 * would call close(-1) and would leak a descriptor numbered 0.
	 */
	if (hsakmt_kfd_fd >= 0) {
		close(hsakmt_kfd_fd);
		hsakmt_kfd_fd = -1;
	}

	/* NOTE(review): "> 0" is kept as-is because the initial value of
	 * hsakmt_udmabuf_dev_fd is defined elsewhere - confirm whether 0 can
	 * ever be a valid udmabuf descriptor here.
	 */
	if (hsakmt_udmabuf_dev_fd > 0) {
		close(hsakmt_udmabuf_dev_fd);
		hsakmt_udmabuf_dev_fd = -1;
	}

	hsakmt_kfd_open_count = 0;
	parent_pid = -1;
	hsakmt_forked = false;
}
static inline void init_page_size(void)
{
hsakmt_page_size = sysconf(_SC_PAGESIZE);
hsakmt_page_shift = ffs(hsakmt_page_size) - 1;
}
/*
 * Initialize library settings from the environment.
 *
 *  - HSAKMT_DEBUG_LEVEL: libraries normally stay silent, so the level starts
 *    at HSAKMT_DEBUG_LEVEL_DEFAULT; a value within [ERR, DEBUG] enables
 *    logging at that level, anything else is ignored.
 *  - HSA_ZFB: when set, its integer value is stored in hsakmt_zfb_support
 *    (zero frame buffer support).
 *
 * Always returns HSAKMT_STATUS_SUCCESS.
 */
static HSAKMT_STATUS init_vars_from_env(void)
{
	const char *level_str;
	const char *zfb_str;

	hsakmt_debug_level = HSAKMT_DEBUG_LEVEL_DEFAULT;

	level_str = getenv("HSAKMT_DEBUG_LEVEL");
	if (level_str) {
		const int requested = atoi(level_str);
		const bool out_of_range = requested < HSAKMT_DEBUG_LEVEL_ERR ||
					  requested > HSAKMT_DEBUG_LEVEL_DEBUG;

		if (!out_of_range)
			hsakmt_debug_level = requested;
	}

	/* Check whether to support Zero frame buffer */
	zfb_str = getenv("HSA_ZFB");
	if (zfb_str)
		hsakmt_zfb_support = atoi(zfb_str);

	return HSAKMT_STATUS_SUCCESS;
}
/* Open the process-wide, reference-counted connection to KFD.
 *
 * The first successful call opens the KFD device (unless the hsakmtmodel
 * is in use), reads the environment configuration, queries the kernel
 * interface version and the system topology, and installs the fork
 * handlers. Later calls only bump the reference count.
 *
 * Returns HSAKMT_STATUS_SUCCESS on the first open,
 * HSAKMT_STATUS_KERNEL_ALREADY_OPENED when the connection already
 * existed (the count is still incremented),
 * HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when the device node cannot
 * be opened, or the status reported by the failing initialization step.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtOpenKFD(void)
{
	HSAKMT_STATUS result;
	int fd = -1;
	/* Zero-initialized: in model mode the sysfs topology query below is
	 * skipped, yet sys_props.NumNodes is still read afterwards.
	 */
	HsaSystemProperties sys_props = {0};
	char *error;
	char *useSvmStr;
	char *useUdmaBuf;

	pthread_mutex_lock(&hsakmt_mutex);

	/* If the process has forked, the child process must re-initialize
	 * its connection to KFD. Any references tracked by
	 * hsakmt_kfd_open_count belong to the parent.
	 */
	if (hsakmt_is_forked_child())
		clear_after_fork();

	if (hsakmt_kfd_open_count == 0) {
		static bool atfork_installed = false;

		/* Clear any pending error first so that the dlerror() check
		 * below reports on this dlsym() call only.
		 */
		dlerror();
		hsakmt_fn_amdgpu_device_get_fd = dlsym(RTLD_DEFAULT, "amdgpu_device_get_fd");
		if ((error = dlerror()) != NULL)
			pr_err("amdgpu_device_get_fd is not available: %s\n", error);
		else
			pr_info("amdgpu_device_get_fd is available %p\n", hsakmt_fn_amdgpu_device_get_fd);

		result = init_vars_from_env();
		if (result != HSAKMT_STATUS_SUCCESS)
			goto open_failed;

		// Check if we are using the hsakmtmodel and setup initial state
		model_init_env_vars();

		if (hsakmt_kfd_fd < 0 && !hsakmt_use_model) {
			fd = open(kfd_device_name, O_RDWR | O_CLOEXEC);
			if (fd == -1) {
				result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
				goto open_failed;
			}
			hsakmt_kfd_fd = fd;
		}

		init_page_size();

		result = hsakmt_init_kfd_version();
		if (result != HSAKMT_STATUS_SUCCESS)
			goto kfd_version_failed;

		/* check if udmabuf is enabled by env HSA_USE_UDMABUF */
		useUdmaBuf = getenv("HSA_USE_UDMABUF");
		if (useUdmaBuf && atoi(useUdmaBuf)) {
			/* open udmabuf device */
			hsakmt_udmabuf_dev_fd = open(kfd_udmabuf_device_name, 0);
			if (hsakmt_udmabuf_dev_fd < 0)
				pr_debug("running kernel does not support udmabuf\n");
			else
				pr_debug("udmabuf is enabled\n");
		} else
			pr_debug("udmabuf is not enabled\n");

		/* SVM API is considered supported unless HSA_USE_SVM is "0" */
		useSvmStr = getenv("HSA_USE_SVM");
		hsakmt_is_svm_api_supported = !(useSvmStr && !strcmp(useSvmStr, "0"));

		/* In model mode result still holds the success status of
		 * hsakmt_init_kfd_version().
		 */
		if (!hsakmt_use_model)
			result = hsakmt_topology_sysfs_get_system_props(&sys_props);
		if (result != HSAKMT_STATUS_SUCCESS)
			goto topology_sysfs_failed;

		hsakmt_kfd_open_count = 1;

		if (hsakmt_init_device_debugging_memory(sys_props.NumNodes) != HSAKMT_STATUS_SUCCESS)
			pr_warn("Insufficient Memory. Debugging unavailable\n");

		hsakmt_init_counter_props(sys_props.NumNodes);

		if (!atfork_installed) {
			/* Atfork handlers cannot be uninstalled and
			 * must be installed only once. Otherwise
			 * prepare will deadlock when trying to take
			 * the same lock multiple times.
			 */
			pthread_atfork(prepare_fork_handler,
				       parent_fork_handler,
				       child_fork_handler);
			atfork_installed = true;
		}
	} else {
		hsakmt_kfd_open_count++;
		result = HSAKMT_STATUS_KERNEL_ALREADY_OPENED;
	}

	pthread_mutex_unlock(&hsakmt_mutex);
	return result;

topology_sysfs_failed:
	/* Do not leak the udmabuf descriptor of a failed open. */
	if (hsakmt_udmabuf_dev_fd > 0) {
		close(hsakmt_udmabuf_dev_fd);
		hsakmt_udmabuf_dev_fd = -1;
	}
kfd_version_failed:
	if (fd >= 0) {
		close(fd);
		/* fd was published in hsakmt_kfd_fd above; reset it so that a
		 * later hsaKmtOpenKFD() reopens the device instead of reusing
		 * a closed descriptor.
		 */
		hsakmt_kfd_fd = -1;
	}
open_failed:
	pthread_mutex_unlock(&hsakmt_mutex);
	return result;
}
/* Drop one reference on the KFD connection.
 *
 * When the last reference goes away the counter properties and the
 * device debugging memory are released. Returns
 * HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when there is no reference
 * to drop, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtCloseKFD(void)
{
	HSAKMT_STATUS status = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;

	pthread_mutex_lock(&hsakmt_mutex);

	if (hsakmt_kfd_open_count > 0) {
		hsakmt_kfd_open_count--;
		if (hsakmt_kfd_open_count == 0) {
			hsakmt_destroy_counter_props();
			hsakmt_destroy_device_debugging_memory();
		}
		status = HSAKMT_STATUS_SUCCESS;
	}

	pthread_mutex_unlock(&hsakmt_mutex);

	return status;
}
================================================
FILE: libhsakmt/src/pc_sampling.c
================================================
/*
* Copyright © 2023 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "libhsakmt.h"
#include "hsakmt/linux/kfd_ioctl.h"
#include
#include
#include
#include
/* Trace ID value that denotes "no trace": hsaKmtPcSamplingCreate() seeds
 * its output and the ioctl arguments with it, and the Destroy/Start/Stop
 * entry points reject it with HSAKMT_STATUS_INVALID_HANDLE.
 */
#define INVALID_TRACE_ID 0x0
/* Report whether PC sampling can be used with the current KFD connection.
 *
 * Returns HSAKMT_STATUS_SUCCESS when both precondition macros pass.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingSupport(void)
{
	/* NOTE(review): both macros are defined outside this file; they
	 * presumably return an error status from this function when KFD is
	 * not open or the kernel interface minor version is below 16.
	 */
	CHECK_KFD_OPEN();
	CHECK_KFD_MINOR_VERSION(16);

	return HSAKMT_STATUS_SUCCESS;
}
/* Query the PC sampling configurations supported by a node.
 *
 * NodeId:         node to query.
 * sample_info:    caller buffer handed to the kernel for the result entries.
 * sample_info_sz: number of entries the caller buffer can hold.
 * size:           receives the entry count reported back by the kernel;
 *                 it is written whether or not the ioctl succeeds.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL size, the node
 * validation status for a bad NodeId, a status translated from errno
 * when the ioctl fails, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingQueryCapabilities(HSAuint32 NodeId, void *sample_info,
						       HSAuint32 sample_info_sz, HSAuint32 *size)
{
	struct kfd_ioctl_pc_sample_args args = {0};
	HSAKMT_STATUS status;
	uint32_t gpu_id;
	int rc;

	if (!size)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	CHECK_KFD_OPEN();
	CHECK_KFD_MINOR_VERSION(16);

	status = hsakmt_validate_nodeid(NodeId, &gpu_id);
	if (status != HSAKMT_STATUS_SUCCESS) {
		pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
		return status;
	}

	/* The caller buffer is passed to the kernel as-is, so both layouts
	 * must have the same size.
	 */
	assert(sizeof(HsaPcSamplingInfo) == sizeof(struct kfd_pc_sample_info));

	args.op = KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES;
	args.gpu_id = gpu_id;
	args.sample_info_ptr = (uint64_t)sample_info;
	args.num_sample_info = sample_info_sz;
	args.flags = 0;

	rc = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args);

	/* Report the count even on failure. */
	*size = args.num_sample_info;

	if (!rc)
		return HSAKMT_STATUS_SUCCESS;

	if (errno == ENOSPC)
		return HSAKMT_STATUS_BUFFER_TOO_SMALL;
	if (errno == EINVAL)
		return HSAKMT_STATUS_INVALID_PARAMETER;
	if (errno == EOPNOTSUPP)
		return HSAKMT_STATUS_NOT_SUPPORTED;
	if (errno == EBUSY)
		return HSAKMT_STATUS_UNAVAILABLE;

	return HSAKMT_STATUS_ERROR;
}
/* Create a PC sampling session on a node.
 *
 * NodeId:      node to sample.
 * sample_info: requested sampling configuration (one entry).
 * traceId:     receives the trace ID of the new session; it is reset to
 *              INVALID_TRACE_ID once the KFD-open check has passed and
 *              only overwritten on success.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for NULL pointers, the node
 * validation status for a bad NodeId, a status translated from errno
 * when the ioctl fails, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingCreate(HSAuint32 NodeId, HsaPcSamplingInfo *sample_info,
					    HsaPcSamplingTraceId *traceId)
{
	struct kfd_ioctl_pc_sample_args args = {0};
	HSAKMT_STATUS status;
	uint32_t gpu_id;

	if (!sample_info || !traceId)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	CHECK_KFD_OPEN();

	*traceId = INVALID_TRACE_ID;

	status = hsakmt_validate_nodeid(NodeId, &gpu_id);
	if (status != HSAKMT_STATUS_SUCCESS) {
		pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
		return status;
	}

	args.op = KFD_IOCTL_PCS_OP_CREATE;
	args.gpu_id = gpu_id;
	args.sample_info_ptr = (uint64_t)sample_info;
	args.num_sample_info = 1;
	args.trace_id = INVALID_TRACE_ID;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args)) {
		if (errno == EINVAL)
			return HSAKMT_STATUS_INVALID_PARAMETER;
		if (errno == ENOMEM)
			return HSAKMT_STATUS_NO_MEMORY;
		if (errno == EBUSY)
			return HSAKMT_STATUS_UNAVAILABLE;
		return HSAKMT_STATUS_ERROR;
	}

	*traceId = args.trace_id;

	return HSAKMT_STATUS_SUCCESS;
}
/* Destroy a PC sampling session.
 *
 * The session is stopped first (the result of that stop is deliberately
 * not checked) and then destroyed through the PC sampling ioctl.
 *
 * Returns HSAKMT_STATUS_INVALID_HANDLE for INVALID_TRACE_ID, the node
 * validation status for a bad NodeId, HSAKMT_STATUS_INVALID_PARAMETER or
 * HSAKMT_STATUS_ERROR when the ioctl fails, HSAKMT_STATUS_SUCCESS
 * otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingDestroy(HSAuint32 NodeId, HsaPcSamplingTraceId traceId)
{
	struct kfd_ioctl_pc_sample_args args = {0};
	HSAKMT_STATUS status;
	uint32_t gpu_id;

	if (traceId == INVALID_TRACE_ID)
		return HSAKMT_STATUS_INVALID_HANDLE;

	CHECK_KFD_OPEN();

	status = hsakmt_validate_nodeid(NodeId, &gpu_id);
	if (status != HSAKMT_STATUS_SUCCESS) {
		pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
		return status;
	}

	(void)hsaKmtPcSamplingStop(NodeId, traceId);

	args.op = KFD_IOCTL_PCS_OP_DESTROY;
	args.gpu_id = gpu_id;
	args.trace_id = traceId;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args)) {
		switch (errno) {
		case EINVAL:
			return HSAKMT_STATUS_INVALID_PARAMETER;
		default:
			return HSAKMT_STATUS_ERROR;
		}
	}

	return HSAKMT_STATUS_SUCCESS;
}
/* Start a previously created PC sampling session.
 *
 * Returns HSAKMT_STATUS_INVALID_HANDLE for INVALID_TRACE_ID, the node
 * validation status for a bad NodeId, a status translated from errno
 * when the ioctl fails (EALREADY maps to
 * HSAKMT_STATUS_KERNEL_ALREADY_OPENED), HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStart(HSAuint32 NodeId, HsaPcSamplingTraceId traceId)
{
	struct kfd_ioctl_pc_sample_args args = {0};
	HSAKMT_STATUS status;
	uint32_t gpu_id;

	if (traceId == INVALID_TRACE_ID)
		return HSAKMT_STATUS_INVALID_HANDLE;

	CHECK_KFD_OPEN();

	status = hsakmt_validate_nodeid(NodeId, &gpu_id);
	if (status != HSAKMT_STATUS_SUCCESS) {
		pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
		return status;
	}

	args.op = KFD_IOCTL_PCS_OP_START;
	args.gpu_id = gpu_id;
	args.trace_id = traceId;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args) == 0)
		return HSAKMT_STATUS_SUCCESS;

	if (errno == EINVAL)
		return HSAKMT_STATUS_INVALID_PARAMETER;
	if (errno == ENOMEM)
		return HSAKMT_STATUS_OUT_OF_RESOURCES;
	if (errno == EBUSY)
		return HSAKMT_STATUS_UNAVAILABLE;
	if (errno == EALREADY)
		return HSAKMT_STATUS_KERNEL_ALREADY_OPENED;

	return HSAKMT_STATUS_ERROR;
}
/* Stop a running PC sampling session.
 *
 * Returns HSAKMT_STATUS_INVALID_HANDLE for INVALID_TRACE_ID, the node
 * validation status for a bad NodeId, a status translated from errno
 * when the ioctl fails (EALREADY maps to
 * HSAKMT_STATUS_KERNEL_ALREADY_OPENED), HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPcSamplingStop(HSAuint32 NodeId, HsaPcSamplingTraceId traceId)
{
	struct kfd_ioctl_pc_sample_args args = {0};
	HSAKMT_STATUS status;
	uint32_t gpu_id;

	if (traceId == INVALID_TRACE_ID)
		return HSAKMT_STATUS_INVALID_HANDLE;

	CHECK_KFD_OPEN();

	status = hsakmt_validate_nodeid(NodeId, &gpu_id);
	if (status != HSAKMT_STATUS_SUCCESS) {
		pr_err("[%s] invalid node ID: %d\n", __func__, NodeId);
		return status;
	}

	args.op = KFD_IOCTL_PCS_OP_STOP;
	args.gpu_id = gpu_id;
	args.trace_id = traceId;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_PC_SAMPLE, &args) == 0)
		return HSAKMT_STATUS_SUCCESS;

	if (errno == EINVAL)
		return HSAKMT_STATUS_INVALID_PARAMETER;
	if (errno == EALREADY)
		return HSAKMT_STATUS_KERNEL_ALREADY_OPENED;

	return HSAKMT_STATUS_ERROR;
}
================================================
FILE: libhsakmt/src/perfctr.c
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include
#include
#include
#include
#include
#include "libhsakmt.h"
#include "pmc_table.h"
#include "hsakmt/linux/kfd_ioctl.h"
#include
#include
#include
#include
#include
#include
/* Divisor used to convert counter sizes given in bits to bytes. */
#define BITS_PER_BYTE CHAR_BIT
/* Tag stored in perf_trace.magic4cc; every API that receives a trace ID
 * compares against it before trusting the handle. The bytes 0x48 0x53
 * 0x41 0x54 spell "HSAT" when stored little-endian.
 */
#define HSA_PERF_MAGIC4CC 0x54415348
/* Run state of a registered trace. A trace is created STOPPED, becomes
 * STARTED in hsaKmtPmcStartTrace() and STOPPED again in
 * hsaKmtPmcStopTrace().
 */
enum perf_trace_state {
PERF_TRACE_STATE__STOPPED = 0,
PERF_TRACE_STATE__STARTED
};
/* Per-block part of a trace: the counters selected in one HW block. */
struct perf_trace_block {
/* Block index in "enum perf_block_id" */
enum perf_block_id block_id;
/* Number of entries in counter_id[] and perf_event_fd[] */
uint32_t num_counters;
/* Counter IDs of this block; points into the owning perf_trace allocation */
uint64_t *counter_id;
/* One descriptor slot per counter; a negative value is treated as
 * "not available" by perf_trace_ioctl() and query_trace()
 */
int *perf_event_fd;
};
/* Trace object behind an HSATraceId: the trace ID handed to callers is
 * the address of this structure.
 */
struct perf_trace {
/* Must equal HSA_PERF_MAGIC4CC for the handle to be accepted */
uint32_t magic4cc;
/* GPU the trace was registered on */
uint32_t gpu_id;
enum perf_trace_state state;
/* Number of entries in blocks[] */
uint32_t num_blocks;
/* Caller buffer and its size, recorded by hsaKmtPmcStartTrace() and
 * filled by hsaKmtPmcQueryTrace()
 */
void *buf;
uint64_t buf_size;
/* Trailing array; the counter ID and fd arrays follow it in the same
 * allocation
 */
struct perf_trace_block blocks[0];
};
/* Record read from a counter's file descriptor in query_trace(); only
 * val is consumed there.
 * NOTE(review): ena/run presumably are the perf "time enabled" and
 * "time running" values - confirm against how the event fds are opened.
 */
struct perf_counts_values {
union {
struct {
uint64_t val;
uint64_t ena;
uint64_t run;
};
/* Same three values, addressable by index */
uint64_t values[3];
};
};
/* Per-node cache of counter properties, indexed by node ID. The table is
 * allocated by hsakmt_init_counter_props(); each entry is filled on first
 * use by hsaKmtPmcGetCounterProperties().
 */
static HsaCounterProperties **counter_props;
/* Number of entries in counter_props */
static unsigned int counter_props_count;
/* Read exactly n bytes from fd into buf, retrying after EINTR and after
 * short reads.
 *
 * Returns n on success, the number of bytes actually read when EOF is
 * reached first, or -errno when read() fails with any other error.
 */
static ssize_t readn(int fd, void *buf, size_t n)
{
	size_t remaining;

	for (remaining = n; remaining != 0; ) {
		const ssize_t got = read(fd, buf, remaining);

		if (got == 0) /* reach EOF */
			return (n - remaining);

		if (got < 0) {
			if (errno != EINTR)
				return -errno;
			continue; /* read got interrupted */
		}

		remaining -= got;
		buf = VOID_PTR_ADD(buf, got);
	}

	return n;
}
HSAKMT_STATUS hsakmt_init_counter_props(unsigned int NumNodes)
{
counter_props = calloc(NumNodes, sizeof(struct HsaCounterProperties *));
if (!counter_props) {
pr_warn("Profiling is not available.\n");
return HSAKMT_STATUS_NO_MEMORY;
}
counter_props_count = NumNodes;
return HSAKMT_STATUS_SUCCESS;
}
/* Release the per-node counter properties table and every cached entry.
 *
 * The table pointer and its length are reset afterwards so that a second
 * call is a no-op and the Pmc entry points see "profiling not available"
 * (NULL table) instead of a dangling pointer.
 */
void hsakmt_destroy_counter_props(void)
{
	unsigned int i;

	if (!counter_props)
		return;

	/* free(NULL) is a no-op, so entries never populated need no check */
	for (i = 0; i < counter_props_count; i++)
		free(counter_props[i]);

	free(counter_props);
	counter_props = NULL;
	counter_props_count = 0;
}
/* Translate a perf block ID into the matching HSA profile block UUID.
 *
 * block_id: block to translate.
 * uuid:     receives the UUID; left untouched for an unknown block.
 *
 * Returns 0 on success and -1 when block_id has no UUID mapping.
 */
static int blockid2uuid(enum perf_block_id block_id, HSA_UUID *uuid)
{
	switch (block_id) {
	case PERFCOUNTER_BLOCKID__CB:
		*uuid = HSA_PROFILEBLOCK_AMD_CB;
		return 0;
	case PERFCOUNTER_BLOCKID__CPF:
		*uuid = HSA_PROFILEBLOCK_AMD_CPF;
		return 0;
	case PERFCOUNTER_BLOCKID__CPG:
		*uuid = HSA_PROFILEBLOCK_AMD_CPG;
		return 0;
	case PERFCOUNTER_BLOCKID__DB:
		*uuid = HSA_PROFILEBLOCK_AMD_DB;
		return 0;
	case PERFCOUNTER_BLOCKID__GDS:
		*uuid = HSA_PROFILEBLOCK_AMD_GDS;
		return 0;
	case PERFCOUNTER_BLOCKID__GRBM:
		*uuid = HSA_PROFILEBLOCK_AMD_GRBM;
		return 0;
	case PERFCOUNTER_BLOCKID__GRBMSE:
		*uuid = HSA_PROFILEBLOCK_AMD_GRBMSE;
		return 0;
	case PERFCOUNTER_BLOCKID__IA:
		*uuid = HSA_PROFILEBLOCK_AMD_IA;
		return 0;
	case PERFCOUNTER_BLOCKID__MC:
		*uuid = HSA_PROFILEBLOCK_AMD_MC;
		return 0;
	case PERFCOUNTER_BLOCKID__PASC:
		*uuid = HSA_PROFILEBLOCK_AMD_PASC;
		return 0;
	case PERFCOUNTER_BLOCKID__PASU:
		*uuid = HSA_PROFILEBLOCK_AMD_PASU;
		return 0;
	case PERFCOUNTER_BLOCKID__SPI:
		*uuid = HSA_PROFILEBLOCK_AMD_SPI;
		return 0;
	case PERFCOUNTER_BLOCKID__SRBM:
		*uuid = HSA_PROFILEBLOCK_AMD_SRBM;
		return 0;
	case PERFCOUNTER_BLOCKID__SQ:
		*uuid = HSA_PROFILEBLOCK_AMD_SQ;
		return 0;
	case PERFCOUNTER_BLOCKID__SX:
		*uuid = HSA_PROFILEBLOCK_AMD_SX;
		return 0;
	case PERFCOUNTER_BLOCKID__TA:
		*uuid = HSA_PROFILEBLOCK_AMD_TA;
		return 0;
	case PERFCOUNTER_BLOCKID__TCA:
		*uuid = HSA_PROFILEBLOCK_AMD_TCA;
		return 0;
	case PERFCOUNTER_BLOCKID__TCC:
		*uuid = HSA_PROFILEBLOCK_AMD_TCC;
		return 0;
	case PERFCOUNTER_BLOCKID__TCP:
		*uuid = HSA_PROFILEBLOCK_AMD_TCP;
		return 0;
	case PERFCOUNTER_BLOCKID__TCS:
		*uuid = HSA_PROFILEBLOCK_AMD_TCS;
		return 0;
	case PERFCOUNTER_BLOCKID__TD:
		*uuid = HSA_PROFILEBLOCK_AMD_TD;
		return 0;
	case PERFCOUNTER_BLOCKID__VGT:
		*uuid = HSA_PROFILEBLOCK_AMD_VGT;
		return 0;
	case PERFCOUNTER_BLOCKID__WD:
		*uuid = HSA_PROFILEBLOCK_AMD_WD;
		return 0;
	default:
		/* If we reach this point, it's a bug */
		return -1;
	}
}
/* Look up how many counters of a block may be active at the same time.
 *
 * node_id:  node whose cached counter properties are searched.
 * block_id: block index (as stored in HsaCounter.BlockIndex).
 *
 * Returns the block's NumConcurrent value, or 0 when the block is not
 * found or no counter properties are cached for the node yet (they are
 * populated by hsaKmtPmcGetCounterProperties()).
 */
static HSAuint32 get_block_concurrent_limit(uint32_t node_id,
					    HSAuint32 block_id)
{
	HsaCounterProperties *props;
	HsaCounterBlockProperties *block;
	uint32_t i;

	if (!counter_props || node_id >= counter_props_count)
		return 0;

	props = counter_props[node_id];
	if (!props)
		return 0;

	/* Blocks are packed back to back with variable length; only
	 * NumBlocks of them exist, so walking any further would read past
	 * the allocation.
	 */
	block = &props->Blocks[0];
	for (i = 0; i < props->NumBlocks; i++) {
		/* A block without counters has no Counters[0] to inspect */
		if (block->NumCounters &&
		    block->Counters[0].BlockIndex == block_id)
			return block->NumConcurrent;
		block = (HsaCounterBlockProperties *)&block->Counters[block->NumCounters];
	}

	return 0;
}
/* Issue the same ioctl command on every counter descriptor of a block.
 *
 * Stops at the first problem: HSAKMT_STATUS_UNAVAILABLE for a negative
 * descriptor, HSAKMT_STATUS_ERROR when ioctl() reports failure.
 * Returns HSAKMT_STATUS_SUCCESS when all counters were handled.
 */
static HSAKMT_STATUS perf_trace_ioctl(struct perf_trace_block *block,
				      uint32_t cmd)
{
	uint32_t idx = 0;

	while (idx < block->num_counters) {
		const int fd = block->perf_event_fd[idx++];

		if (fd < 0)
			return HSAKMT_STATUS_UNAVAILABLE;
		if (ioctl(fd, cmd, NULL) != 0)
			return HSAKMT_STATUS_ERROR;
	}

	return HSAKMT_STATUS_SUCCESS;
}
static HSAKMT_STATUS query_trace(int fd, uint64_t *buf)
{
struct perf_counts_values content;
if (fd < 0)
return HSAKMT_STATUS_ERROR;
if (readn(fd, &content, sizeof(content)) != sizeof(content))
return HSAKMT_STATUS_ERROR;
*buf = content.val;
return HSAKMT_STATUS_SUCCESS;
}
/* Return the counter properties of a node, building and caching them on
 * first use.
 *
 * NodeId:            node to describe.
 * CounterProperties: receives a pointer to the cached description. The
 *                    memory stays owned by the library.
 *
 * The description is one allocation: an HsaCounterProperties header
 * followed by variable-length HsaCounterBlockProperties records packed
 * back to back, one per block that reports a non-zero slot count.
 *
 * Returns HSAKMT_STATUS_NO_MEMORY when profiling is unavailable or the
 * allocation fails, HSAKMT_STATUS_INVALID_PARAMETER for a NULL output
 * pointer, HSAKMT_STATUS_INVALID_NODE_UNIT for a bad NodeId, or the
 * status of a failing block query.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId,
						   HsaCounterProperties **CounterProperties)
{
	HSAKMT_STATUS rc = HSAKMT_STATUS_SUCCESS;
	uint32_t gpu_id, i, block_id;
	size_t counter_props_size;
	uint32_t total_counters = 0;
	uint32_t total_concurrent = 0;
	struct perf_counter_block block = {0};
	uint32_t total_blocks = 0;
	HsaCounterBlockProperties *block_prop;

	if (!counter_props)
		return HSAKMT_STATUS_NO_MEMORY;

	if (!CounterProperties)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (hsakmt_validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	/* Already built for this node: hand out the cached copy */
	if (counter_props[NodeId]) {
		*CounterProperties = counter_props[NodeId];
		return HSAKMT_STATUS_SUCCESS;
	}

	/* First pass: count blocks, counters and slots to size the buffer */
	for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
		rc = hsakmt_get_block_properties(NodeId, i, &block);
		if (rc != HSAKMT_STATUS_SUCCESS)
			return rc;
		total_concurrent += block.num_of_slots;
		total_counters += block.num_of_counters;
		/* If num_of_slots=0, this block doesn't exist */
		if (block.num_of_slots)
			total_blocks++;
	}

	/* The header already embeds one block and each block embeds one
	 * counter, hence the "- 1" and "- total_blocks". The guards keep
	 * the unsigned arithmetic from wrapping when there are no blocks
	 * or fewer counters than blocks.
	 */
	counter_props_size = sizeof(HsaCounterProperties);
	if (total_blocks > 1)
		counter_props_size += sizeof(HsaCounterBlockProperties) * (total_blocks - 1);
	if (total_counters > total_blocks)
		counter_props_size += sizeof(HsaCounter) * (total_counters - total_blocks);

	/* Zeroed allocation: only the Global flag bit is assigned below, so
	 * the remaining flag bits (and a UUID blockid2uuid() does not set)
	 * must not be left indeterminate.
	 */
	counter_props[NodeId] = calloc(1, counter_props_size);
	if (!counter_props[NodeId])
		return HSAKMT_STATUS_NO_MEMORY;

	counter_props[NodeId]->NumBlocks = total_blocks;
	counter_props[NodeId]->NumConcurrent = total_concurrent;

	/* Second pass: fill in one packed record per existing block */
	block_prop = &counter_props[NodeId]->Blocks[0];
	for (block_id = 0; block_id < PERFCOUNTER_BLOCKID__MAX; block_id++) {
		rc = hsakmt_get_block_properties(NodeId, block_id, &block);
		if (rc != HSAKMT_STATUS_SUCCESS) {
			free(counter_props[NodeId]);
			counter_props[NodeId] = NULL;
			return rc;
		}

		if (!block.num_of_slots) /* not a valid block */
			continue;

		blockid2uuid(block_id, &block_prop->BlockId);
		block_prop->NumCounters = block.num_of_counters;
		block_prop->NumConcurrent = block.num_of_slots;
		for (i = 0; i < block.num_of_counters; i++) {
			block_prop->Counters[i].BlockIndex = block_id;
			block_prop->Counters[i].CounterId = block.counter_ids[i];
			block_prop->Counters[i].CounterSizeInBits = block.counter_size_in_bits;
			block_prop->Counters[i].CounterMask = block.counter_mask;
			block_prop->Counters[i].Flags.ui32.Global = 1;
			block_prop->Counters[i].Type = HSA_PROFILE_TYPE_NONPRIV_IMMEDIATE;
		}

		/* The next record starts right after this block's last counter */
		block_prop = (HsaCounterBlockProperties *)&block_prop->Counters[block_prop->NumCounters];
	}

	*CounterProperties = counter_props[NodeId];

	return HSAKMT_STATUS_SUCCESS;
}
/* Registers a set of (HW) counters to be used for tracing/profiling.
 *
 * NodeId:           node the counters belong to.
 * NumberOfCounters: number of entries in Counters (1..512).
 * Counters:         counters to trace; each BlockIndex must be below
 *                   PERFCOUNTER_BLOCKID__MAX.
 * TraceRoot:        receives the trace ID, the minimum trace buffer size
 *                   and the number of passes (always 1).
 *
 * On success the trace object is owned by the returned trace ID and must
 * be released with hsaKmtPmcUnregisterTrace().
 *
 * Returns HSAKMT_STATUS_NO_MEMORY when an allocation fails, profiling is
 * unavailable or NumberOfCounters exceeds the supported maximum,
 * HSAKMT_STATUS_INVALID_NODE_UNIT for a bad NodeId,
 * HSAKMT_STATUS_INVALID_PARAMETER for invalid arguments or when a block
 * is unknown or over-subscribed.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
					    HSAuint32 NumberOfCounters,
					    HsaCounter *Counters,
					    HsaPmcTraceRoot *TraceRoot)
{
	uint32_t gpu_id, i, j;
	uint64_t min_buf_size = 0;
	struct perf_trace *trace = NULL;
	uint32_t concurrent_limit;
	const uint32_t MAX_COUNTERS = 512;
	/* Declare performance counter ID 2D array as a contiguous block */
	uint64_t *counter_id = malloc(
		PERFCOUNTER_BLOCKID__MAX * MAX_COUNTERS * sizeof(uint64_t));
	uint32_t num_counters[PERFCOUNTER_BLOCKID__MAX] = {0};
	uint32_t block, num_blocks = 0, total_counters = 0;
	uint64_t *counter_id_ptr;
	int *fd_ptr;
	size_t trace_size;

	pr_debug("[%s] Number of counters %d\n", __func__, NumberOfCounters);

	if (counter_id == NULL) {
		pr_err("Failed to allocate memory for counter_id. Requested %zu bytes.\n",
		       PERFCOUNTER_BLOCKID__MAX * MAX_COUNTERS * sizeof(uint64_t));
		return HSAKMT_STATUS_NO_MEMORY;
	}

	if (!counter_props) {
		pr_err("Profiling is not available, counter_props is NULL.\n");
		goto no_memory_exit;
	}

	if (!Counters || !TraceRoot || NumberOfCounters == 0)
		goto invalid_parameter_exit;

	if (hsakmt_validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS) {
		free(counter_id);
		return HSAKMT_STATUS_INVALID_NODE_UNIT;
	}

	if (NumberOfCounters > MAX_COUNTERS) {
		pr_err("MAX_COUNTERS is too small for %d.\n", NumberOfCounters);
		goto no_memory_exit;
	}

	/* Calculating the minimum buffer size */
	for (i = 0; i < NumberOfCounters; i++) {
		if (Counters[i].BlockIndex >= PERFCOUNTER_BLOCKID__MAX)
			goto invalid_parameter_exit;
		/* Only privileged counters need to register */
		if (Counters[i].Type > HSA_PROFILE_TYPE_PRIVILEGED_STREAMING)
			continue;
		min_buf_size += Counters[i].CounterSizeInBits/BITS_PER_BYTE;
		/* j: the first blank entry in the block to record counter_id */
		j = num_counters[Counters[i].BlockIndex];
		/* Make sure counter_id stays within bounds */
		if (j >= MAX_COUNTERS) {
			pr_err("Counter ID exceeded MAX_COUNTERS for block %d.\n",
			       Counters[i].BlockIndex);
			goto invalid_parameter_exit;
		}
		/* Initialize counter_id */
		counter_id[Counters[i].BlockIndex * MAX_COUNTERS + j] = Counters[i].CounterId;
		num_counters[Counters[i].BlockIndex]++;
		total_counters++;
	}

	/* Verify that the number of counters per block is not larger than the
	 * number of slots.
	 */
	for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
		if (!num_counters[i])
			continue;
		concurrent_limit = get_block_concurrent_limit(NodeId, i);
		if (!concurrent_limit) {
			pr_err("Invalid block ID: %d\n", i);
			goto invalid_parameter_exit;
		}
		if (num_counters[i] > concurrent_limit) {
			pr_err("Counters exceed the limit.\n");
			goto invalid_parameter_exit;
		}
		num_blocks++;
	}

	if (!num_blocks)
		goto invalid_parameter_exit;

	/* Now we have sorted blocks/counters information in
	 * num_counters[block_id] and counter_id[block_id][]. Allocate trace
	 * and record the information.
	 */
	trace_size = sizeof(struct perf_trace)
		     + sizeof(struct perf_trace_block) * num_blocks
		     + sizeof(uint64_t) * total_counters
		     + sizeof(int) * total_counters;
	trace = (struct perf_trace *)calloc(trace_size, 1);
	if (!trace) {
		pr_err("Failed to allocate memory for trace. Requested %zu bytes.\n",
		       trace_size);
		goto no_memory_exit;
	}

	/* Allocated area is partitioned as:
	 * +---------------------------------+ trace
	 * |    perf_trace                   |
	 * |---------------------------------| trace->blocks[0]
	 * |    perf_trace_block 0           |
	 * |    ....                         |
	 * |    perf_trace_block N-1         | trace->blocks[N-1]
	 * |---------------------------------| <-- counter_id_ptr starts here
	 * | block 0's counter IDs(uint64_t) |
	 * | ......                          |
	 * | block N-1's counter IDs         |
	 * |---------------------------------| <-- perf_event_fd starts here
	 * | block 0's perf_event_fds(int)   |
	 * | ......                          |
	 * | block N-1's perf_event_fds      |
	 * +---------------------------------+
	 */
	block = 0;
	counter_id_ptr = (uint64_t *)((char *)
			trace + sizeof(struct perf_trace)
			+ sizeof(struct perf_trace_block) * num_blocks);
	fd_ptr = (int *)(counter_id_ptr + total_counters);

	/* Fill in each block's information to the TraceId */
	for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
		if (!num_counters[i]) /* not a block to trace */
			continue;
		/* Following perf_trace + perf_trace_block x N are those
		 * counter_id arrays. Assign the counter_id array belonging to
		 * this block.
		 */
		trace->blocks[block].counter_id = counter_id_ptr;
		trace->blocks[block].perf_event_fd = fd_ptr;
		for (j = 0; j < num_counters[i]; j++) {
			/* Fill in counter IDs to the counter_id array. */
			trace->blocks[block].counter_id[j] = counter_id[i * MAX_COUNTERS + j];
			/* No event is opened here. Mark the slot as "no
			 * descriptor" so later ioctl/read attempts are
			 * rejected instead of acting on descriptor 0, which
			 * the zero-filled allocation would otherwise name.
			 */
			trace->blocks[block].perf_event_fd[j] = -1;
		}
		/* how many counters to trace */
		trace->blocks[block].num_counters = num_counters[i];
		/* block index in "enum perf_block_id" */
		trace->blocks[block].block_id = i;
		block++; /* move to next */
		counter_id_ptr += num_counters[i];
		fd_ptr += num_counters[i];
	}

	trace->magic4cc = HSA_PERF_MAGIC4CC;
	trace->gpu_id = gpu_id;
	trace->state = PERF_TRACE_STATE__STOPPED;
	trace->num_blocks = num_blocks;

	TraceRoot->NumberOfPasses = 1;
	TraceRoot->TraceBufferMinSizeBytes = PAGE_ALIGN_UP(min_buf_size);
	/* The trace ID is the address of the trace object, which therefore
	 * must stay allocated until hsaKmtPmcUnregisterTrace() frees it.
	 * Only the scratch counter_id table is released here.
	 */
	TraceRoot->TraceId = PORT_VPTR_TO_UINT64(trace);

	free(counter_id);
	return HSAKMT_STATUS_SUCCESS;

no_memory_exit:
	free(counter_id);
	return HSAKMT_STATUS_NO_MEMORY;

invalid_parameter_exit:
	free(counter_id);
	return HSAKMT_STATUS_INVALID_PARAMETER;
}
/* Unregisters a set of (HW) counters used for tracing/profiling and
 * frees the trace object behind TraceId.
 *
 * A trace that is still started is stopped first; if stopping fails the
 * trace is kept and that status is returned.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a zero trace ID,
 * HSAKMT_STATUS_INVALID_NODE_UNIT when NodeId is invalid or does not
 * match the trace's GPU, HSAKMT_STATUS_INVALID_HANDLE when the magic tag
 * does not match.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,
					      HSATraceId TraceId)
{
	struct perf_trace *handle;
	uint32_t gpu_id;

	pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);

	if (!TraceId)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (hsakmt_validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	handle = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);

	if (handle->magic4cc != HSA_PERF_MAGIC4CC)
		return HSAKMT_STATUS_INVALID_HANDLE;

	if (handle->gpu_id != gpu_id)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	/* If the trace is in the running state, stop it */
	if (handle->state == PERF_TRACE_STATE__STARTED) {
		const HSAKMT_STATUS stop_status = hsaKmtPmcStopTrace(TraceId);

		if (stop_status != HSAKMT_STATUS_SUCCESS)
			return stop_status;
	}

	free(handle);

	return HSAKMT_STATUS_SUCCESS;
}
/* Validate a trace handle and node for trace access.
 *
 * Only validation is performed here. Returns
 * HSAKMT_STATUS_INVALID_PARAMETER for a zero trace ID,
 * HSAKMT_STATUS_INVALID_HANDLE when the magic tag does not match,
 * HSAKMT_STATUS_INVALID_NODE_UNIT for a bad NodeId,
 * HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId,
						 HSATraceId TraceId)
{
	const struct perf_trace *handle;
	uint32_t gpu_id;

	pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);

	if (!TraceId)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	handle = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
	if (handle->magic4cc != HSA_PERF_MAGIC4CC)
		return HSAKMT_STATUS_INVALID_HANDLE;

	if (hsakmt_validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	return HSAKMT_STATUS_SUCCESS;
}
/* Validate a trace handle on release of trace access.
 *
 * NodeId is not examined. Returns HSAKMT_STATUS_INVALID_PARAMETER for a
 * zero trace ID, HSAKMT_STATUS_INVALID_HANDLE when the magic tag does
 * not match, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcReleaseTraceAccess(HSAuint32 NodeId,
						 HSATraceId TraceId)
{
	const struct perf_trace *handle;

	pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);

	if (!TraceId)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	handle = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);

	return handle->magic4cc == HSA_PERF_MAGIC4CC ?
		HSAKMT_STATUS_SUCCESS : HSAKMT_STATUS_INVALID_HANDLE;
}
/* Starts tracing operation on a previously established set of
 * performance counters.
 *
 * TraceId:              trace returned by hsaKmtPmcRegisterTrace().
 * TraceBuffer:          buffer later filled by hsaKmtPmcQueryTrace().
 * TraceBufferSizeBytes: size of TraceBuffer.
 *
 * Blocks are enabled in order; if one fails, the blocks enabled so far
 * are disabled again in reverse order and the failure is returned.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStartTrace(HSATraceId TraceId,
					 void *TraceBuffer,
					 HSAuint64 TraceBufferSizeBytes)
{
	struct perf_trace *handle =
			(struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
	uint32_t idx;

	pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);

	if (TraceId == 0 || !TraceBuffer || TraceBufferSizeBytes == 0)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (handle->magic4cc != HSA_PERF_MAGIC4CC)
		return HSAKMT_STATUS_INVALID_HANDLE;

	for (idx = 0; idx < handle->num_blocks; idx++) {
		const HSAKMT_STATUS status =
			perf_trace_ioctl(&handle->blocks[idx],
					 PERF_EVENT_IOC_ENABLE);

		if (status == HSAKMT_STATUS_SUCCESS)
			continue;

		/* Disable enabled blocks before returning the failure. */
		while (idx > 0)
			perf_trace_ioctl(&handle->blocks[--idx],
					 PERF_EVENT_IOC_DISABLE);

		return status;
	}

	handle->state = PERF_TRACE_STATE__STARTED;
	handle->buf = TraceBuffer;
	handle->buf_size = TraceBufferSizeBytes;

	return HSAKMT_STATUS_SUCCESS;
}
/* Forces an update of all the counters that a previously started trace
 * operation has registered.
 *
 * One 64-bit value per counter is written to the trace buffer recorded
 * by hsaKmtPmcStartTrace(), block by block in registration order.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a zero trace ID,
 * HSAKMT_STATUS_INVALID_HANDLE when the magic tag does not match,
 * HSAKMT_STATUS_NO_MEMORY when the buffer is too small for the next
 * value, or the status of a failing counter read.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcQueryTrace(HSATraceId TraceId)
{
	struct perf_trace *handle =
			(struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
	uint64_t bytes_needed = 0;
	uint64_t *cursor;
	uint32_t b, c;

	if (!TraceId)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (handle->magic4cc != HSA_PERF_MAGIC4CC)
		return HSAKMT_STATUS_INVALID_HANDLE;

	cursor = (uint64_t *)handle->buf;
	pr_debug("[%s] Trace buffer(%p): ", __func__, cursor);

	for (b = 0; b < handle->num_blocks; b++) {
		const struct perf_trace_block *blk = &handle->blocks[b];

		for (c = 0; c < blk->num_counters; c++, cursor++) {
			HSAKMT_STATUS status;

			/* Check for room before writing the next value */
			bytes_needed += sizeof(uint64_t);
			if (bytes_needed > handle->buf_size)
				return HSAKMT_STATUS_NO_MEMORY;

			status = query_trace(blk->perf_event_fd[c], cursor);
			if (status != HSAKMT_STATUS_SUCCESS)
				return status;

			pr_debug("%lu_", *cursor);
		}
	}
	pr_debug("\n");

	return HSAKMT_STATUS_SUCCESS;
}
/* Stops tracing operation on a previously established set of
 * performance counters.
 *
 * Blocks are disabled in order; the first failure is returned right
 * away and leaves the trace state unchanged. Returns
 * HSAKMT_STATUS_INVALID_PARAMETER for a zero trace ID and
 * HSAKMT_STATUS_INVALID_HANDLE when the magic tag does not match.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStopTrace(HSATraceId TraceId)
{
	struct perf_trace *handle =
			(struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
	HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;
	uint32_t idx = 0;

	pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);

	if (!TraceId)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (handle->magic4cc != HSA_PERF_MAGIC4CC)
		return HSAKMT_STATUS_INVALID_HANDLE;

	while (idx < handle->num_blocks) {
		status = perf_trace_ioctl(&handle->blocks[idx],
					  PERF_EVENT_IOC_DISABLE);
		if (status != HSAKMT_STATUS_SUCCESS)
			return status;
		idx++;
	}

	handle->state = PERF_TRACE_STATE__STOPPED;

	return status;
}
================================================
FILE: libhsakmt/src/pmc_table.c
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include
#include
#include
#include
#include
#include "libhsakmt.h"
#include "pmc_table.h"
/****** CB ******/
/* Counter IDs listed for the CB block on gfx7; the table ends at ID 225.
 * NOTE(review): presumably referenced by the per-ASIC block tables
 * further down in this file - not visible from here.
 */
static uint32_t gfx7_cb_counter_ids[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,
181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225
};
/* Counter IDs listed for the CB block on gfx8; the table ends at ID 395.
 * NOTE(review): presumably referenced by the per-ASIC block tables
 * further down in this file - not visible from here.
 */
static uint32_t gfx8_cb_counter_ids[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,
181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196,
197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228,
229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244,
245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260,
261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276,
277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292,
293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308,
309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340,
341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356,
357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372,
373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388,
389, 390, 391, 392, 393, 394, 395
};
/*
 * gfx9 CB counter IDs: every value from 0 through 437 (438 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx9_cb_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
	300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
	310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
	320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
	330, 331, 332, 333, 334, 335, 336, 337, 338, 339,
	340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
	350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
	360, 361, 362, 363, 364, 365, 366, 367, 368, 369,
	370, 371, 372, 373, 374, 375, 376, 377, 378, 379,
	380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
	390, 391, 392, 393, 394, 395, 396, 397, 398, 399,
	400, 401, 402, 403, 404, 405, 406, 407, 408, 409,
	410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
	420, 421, 422, 423, 424, 425, 426, 427, 428, 429,
	430, 431, 432, 433, 434, 435, 436, 437
};
/*
 * gfx10 CB counter IDs: every value from 0 through 460 (461 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx10_cb_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
	300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
	310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
	320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
	330, 331, 332, 333, 334, 335, 336, 337, 338, 339,
	340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
	350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
	360, 361, 362, 363, 364, 365, 366, 367, 368, 369,
	370, 371, 372, 373, 374, 375, 376, 377, 378, 379,
	380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
	390, 391, 392, 393, 394, 395, 396, 397, 398, 399,
	400, 401, 402, 403, 404, 405, 406, 407, 408, 409,
	410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
	420, 421, 422, 423, 424, 425, 426, 427, 428, 429,
	430, 431, 432, 433, 434, 435, 436, 437, 438, 439,
	440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
	450, 451, 452, 453, 454, 455, 456, 457, 458, 459,
	460
};
/****** CPF ******/
/* gfx7 CPF counter IDs: every value from 0 through 16 (17 entries). */
static uint32_t gfx7_cpf_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16
};
/* gfx8 CPF counter IDs: every value from 0 through 19 (20 entries). */
static uint32_t gfx8_cpf_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19
};
/* gfx9 CPF counter IDs: every value from 0 through 31 (32 entries). */
static uint32_t gfx9_cpf_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31
};
/* gfx10 CPF counter IDs: every value from 0 through 39 (40 entries). */
static uint32_t gfx10_cpf_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39
};
/****** CPG ******/
/* gfx7 CPG counter IDs: every value from 0 through 45 (46 entries). */
static uint32_t gfx7_cpg_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45
};
/* gfx8 CPG counter IDs: every value from 0 through 48 (49 entries). */
static uint32_t gfx8_cpg_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48
};
/* gfx9 CPG counter IDs: every value from 0 through 58 (59 entries). */
static uint32_t gfx9_cpg_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58
};
/* gfx10 CPG counter IDs: every value from 0 through 81 (82 entries). */
static uint32_t gfx10_cpg_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81
};
/****** DB ******/
/*
 * gfx7 DB counter IDs: every value from 0 through 256 (257 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx7_db_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256
};
/* gfx8_db_counter_ids are the same as gfx7_db_counter_ids */
/*
 * gfx9 DB counter IDs: every value from 0 through 327 (328 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx9_db_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
	300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
	310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
	320, 321, 322, 323, 324, 325, 326, 327
};
/*
 * gfx10 DB counter IDs: every value from 0 through 369 (370 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx10_db_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
	300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
	310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
	320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
	330, 331, 332, 333, 334, 335, 336, 337, 338, 339,
	340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
	350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
	360, 361, 362, 363, 364, 365, 366, 367, 368, 369
};
/****** GDS ******/
/* gfx7 GDS counter IDs: every value from 0 through 120 (121 entries). */
static uint32_t gfx7_gds_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120
};
/* gfx8_gds_counter_ids are the same as gfx7_gds_counter_ids */
/* gfx9_gds_counter_ids are the same as gfx7_gds_counter_ids */
/* gfx10 GDS counter IDs: every value from 0 through 122 (123 entries). */
static uint32_t gfx10_gds_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122
};
/****** GRBM ******/
/* gfx7 GRBM counter IDs: every value from 0 through 33 (34 entries). */
static uint32_t gfx7_grbm_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33
};
/* gfx8_grbm_counter_ids are the same as gfx7_grbm_counter_ids */
/* gfx9 GRBM counter IDs: every value from 0 through 37 (38 entries). */
static uint32_t gfx9_grbm_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37
};
/* gfx10 GRBM counter IDs: every value from 0 through 46 (47 entries). */
static uint32_t gfx10_grbm_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46
};
/****** GRBMSE ******/
/* gfx7 GRBMSE counter IDs: every value from 0 through 14 (15 entries). */
static uint32_t gfx7_grbmse_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14
};
/* gfx8_grbmse_counter_ids are the same as gfx7_grbmse_counter_ids */
/* gfx9 GRBMSE counter IDs: every value from 0 through 15 (16 entries). */
static uint32_t gfx9_grbmse_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15
};
/* gfx10 GRBMSE counter IDs: every value from 0 through 18 (19 entries). */
static uint32_t gfx10_grbmse_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18
};
/****** IA ******/
/* gfx7 IA counter IDs: every value from 0 through 17 (18 entries). */
static uint32_t gfx7_ia_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17
};
/* gfx8 IA counter IDs: every value from 0 through 23 (24 entries). */
static uint32_t gfx8_ia_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23
};
/* gfx9 IA counter IDs: every value from 0 through 31 (32 entries). */
static uint32_t gfx9_ia_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31
};
/* gfx10 doesn't have IA */
/****** PASC ******/
/*
 * gfx7 PASC counter IDs: every value from 0 through 394 (395 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx7_pasc_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
	300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
	310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
	320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
	330, 331, 332, 333, 334, 335, 336, 337, 338, 339,
	340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
	350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
	360, 361, 362, 363, 364, 365, 366, 367, 368, 369,
	370, 371, 372, 373, 374, 375, 376, 377, 378, 379,
	380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
	390, 391, 392, 393, 394
};
/*
 * gfx8 PASC counter IDs: every value from 0 through 396 (397 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx8_pasc_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
	300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
	310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
	320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
	330, 331, 332, 333, 334, 335, 336, 337, 338, 339,
	340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
	350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
	360, 361, 362, 363, 364, 365, 366, 367, 368, 369,
	370, 371, 372, 373, 374, 375, 376, 377, 378, 379,
	380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
	390, 391, 392, 393, 394, 395, 396
};
/*
 * gfx9 PASC counter IDs: every value from 0 through 490 (491 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx9_pasc_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
	300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
	310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
	320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
	330, 331, 332, 333, 334, 335, 336, 337, 338, 339,
	340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
	350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
	360, 361, 362, 363, 364, 365, 366, 367, 368, 369,
	370, 371, 372, 373, 374, 375, 376, 377, 378, 379,
	380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
	390, 391, 392, 393, 394, 395, 396, 397, 398, 399,
	400, 401, 402, 403, 404, 405, 406, 407, 408, 409,
	410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
	420, 421, 422, 423, 424, 425, 426, 427, 428, 429,
	430, 431, 432, 433, 434, 435, 436, 437, 438, 439,
	440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
	450, 451, 452, 453, 454, 455, 456, 457, 458, 459,
	460, 461, 462, 463, 464, 465, 466, 467, 468, 469,
	470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
	480, 481, 482, 483, 484, 485, 486, 487, 488, 489,
	490
};
/*
 * gfx10 PASC counter IDs: every value from 0 through 551 (552 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx10_pasc_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
	300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
	310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
	320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
	330, 331, 332, 333, 334, 335, 336, 337, 338, 339,
	340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
	350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
	360, 361, 362, 363, 364, 365, 366, 367, 368, 369,
	370, 371, 372, 373, 374, 375, 376, 377, 378, 379,
	380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
	390, 391, 392, 393, 394, 395, 396, 397, 398, 399,
	400, 401, 402, 403, 404, 405, 406, 407, 408, 409,
	410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
	420, 421, 422, 423, 424, 425, 426, 427, 428, 429,
	430, 431, 432, 433, 434, 435, 436, 437, 438, 439,
	440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
	450, 451, 452, 453, 454, 455, 456, 457, 458, 459,
	460, 461, 462, 463, 464, 465, 466, 467, 468, 469,
	470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
	480, 481, 482, 483, 484, 485, 486, 487, 488, 489,
	490, 491, 492, 493, 494, 495, 496, 497, 498, 499,
	500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
	510, 511, 512, 513, 514, 515, 516, 517, 518, 519,
	520, 521, 522, 523, 524, 525, 526, 527, 528, 529,
	530, 531, 532, 533, 534, 535, 536, 537, 538, 539,
	540, 541, 542, 543, 544, 545, 546, 547, 548, 549,
	550, 551
};
/****** PASU ******/
/*
 * gfx7 PASU counter IDs: every value from 0 through 152 (153 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx7_pasu_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152
};
/* gfx8_pasu_counter_ids are the same as gfx7_pasu_counter_ids */
/*
 * gfx9 PASU counter IDs: every value from 0 through 291 (292 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx9_pasu_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291
};
/*
 * gfx10 PASU counter IDs: every value from 0 through 265 (266 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx10_pasu_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265
};
/****** SPI ******/
/*
 * gfx7 SPI counter IDs: every value from 0 through 185 (186 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx7_spi_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185
};
/*
 * gfx8 SPI counter IDs: every value from 0 through 196 (197 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx8_spi_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196
};
/*
 * gfx9 SPI counter IDs: every value from 0 through 195 (196 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx9_spi_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195
};
/*
 * gfx10 SPI counter IDs: every value from 0 through 328 (329 entries).
 * One decade of IDs per row so a gap in the sequence would stand out.
 */
static uint32_t gfx10_spi_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
	300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
	310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
	320, 321, 322, 323, 324, 325, 326, 327, 328
};
/****** SQ ******/
/*
 * gfx7 SQ counter IDs: 0 through 162 and 168 through 250 (246 entries).
 * Unused counters: 163-167, which are left out of the table.
 * One decade of IDs per row, so the hole shows up in the 160s row.
 */
static uint32_t gfx7_sq_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250
};
/*
 * gfx8 SQ counter IDs: 1 through 165, 167 through 291, and 298
 * (291 entries).
 * Unused counters: 166 and 292-297, which are left out of the table.
 * ID 0 is not listed either; the reason is not documented here.
 * One decade of IDs per row, so each hole shows up in its own row.
 */
static uint32_t gfx8_sq_counter_ids[] = {
	1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 298
};
/* Polaris 10/11/12 have the same SQ counter IDs but different from other gfx8's. */
/*
 * Listed IDs: 1-165, 168-274 and 276-295.
 * 167 and 275 are *_DUMMY_LAST and are left out; 0 and 166 are not
 * listed either.
 */
static uint32_t gfx8_pl_sq_counter_ids[] = {
	1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165,
	/* 166 and 167 skipped */
	168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274,
	/* 275 skipped */
	276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 292, 293, 294, 295
};
/*
 * gfx9 SQ counter IDs: 1-183 and 255-372.
 * ID 0 and the range 184-254 are not listed.
 */
static uint32_t gfx9_sq_counter_ids[] = {
	1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183,
	/* 184-254 skipped */
	255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
	300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
	310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
	320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
	330, 331, 332, 333, 334, 335, 336, 337, 338, 339,
	340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
	350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
	360, 361, 362, 363, 364, 365, 366, 367, 368, 369,
	370, 371, 372
};
/* gfx10 SQ counter IDs: the full contiguous range 0-511. */
static uint32_t gfx10_sq_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
	260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
	270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
	280, 281, 282, 283, 284, 285, 286, 287, 288, 289,
	290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
	300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
	310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
	320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
	330, 331, 332, 333, 334, 335, 336, 337, 338, 339,
	340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
	350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
	360, 361, 362, 363, 364, 365, 366, 367, 368, 369,
	370, 371, 372, 373, 374, 375, 376, 377, 378, 379,
	380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
	390, 391, 392, 393, 394, 395, 396, 397, 398, 399,
	400, 401, 402, 403, 404, 405, 406, 407, 408, 409,
	410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
	420, 421, 422, 423, 424, 425, 426, 427, 428, 429,
	430, 431, 432, 433, 434, 435, 436, 437, 438, 439,
	440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
	450, 451, 452, 453, 454, 455, 456, 457, 458, 459,
	460, 461, 462, 463, 464, 465, 466, 467, 468, 469,
	470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
	480, 481, 482, 483, 484, 485, 486, 487, 488, 489,
	490, 491, 492, 493, 494, 495, 496, 497, 498, 499,
	500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
	510, 511
};
/****** SRBM ******/
/* gfx7 SRBM counter IDs: contiguous range 0-18. */
static uint32_t gfx7_srbm_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18
};
/* gfx8 SRBM counter IDs: contiguous range 0-27. */
static uint32_t gfx8_srbm_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27
};
/* gfx9 doesn't have SRBM */
/* gfx10 doesn't have SRBM */
/****** SX ******/
/* gfx7 SX counter IDs: contiguous range 0-33. */
static uint32_t gfx7_sx_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33
};
/* gfx8_sx_counter_ids are the same as gfx7_sx_counter_ids */
/* gfx9 SX counter IDs: contiguous range 0-207. */
static uint32_t gfx9_sx_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207
};
/* gfx10 SX counter IDs: contiguous range 0-224. */
static uint32_t gfx10_sx_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224
};
/****** TA ******/
/* gfx7 TA counter IDs: contiguous range 0-110. */
static uint32_t gfx7_ta_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110
};
/* gfx8 TA counter IDs: contiguous range 0-118. */
static uint32_t gfx8_ta_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118
};
/* gfx9_ta_counter_ids is same as gfx8_ta_counter_ids */
/* gfx10 TA counter IDs: contiguous range 0-225. */
static uint32_t gfx10_ta_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225
};
/****** TCA ******/
/* gfx7 TCA counter IDs: contiguous range 0-38. */
static uint32_t gfx7_tca_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38
};
/* gfx8 TCA counter IDs: contiguous range 1-34 (ID 0 is not listed). */
static uint32_t gfx8_tca_counter_ids[] = {
	1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34
};
/* gfx9_tca_counter_ids is same as gfx8_tca_counter_ids */
/* gfx10 doesn't have TCA */
/****** TCC ******/
/*
 * gfx7 TCC counter IDs: 0-44 and 64-159 (45-63 are not listed).
 *
 * The list previously ended with 159 twice. The duplicate is dropped so
 * that every ID appears exactly once and the sizeof()-derived
 * num_of_counters in the block tables is not inflated by one.
 * NOTE(review): if the trailing entry was meant to be 160 rather than a
 * repeated 159, add it back - confirm against the gfx7 register spec.
 */
static uint32_t gfx7_tcc_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44,
	/* 45-63 skipped */
	64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159
};
/* gfx8 TCC counter IDs: 0-107 and 128-255 (108-127 are not listed). */
static uint32_t gfx8_tcc_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107,
	/* 108-127 skipped */
	128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
	200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
	210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
	220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
	230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
	250, 251, 252, 253, 254, 255
};
/*
 * TCC counter IDs used by the Carrizo table: 0-107 and 128-191
 * (108-127 are not listed).
 */
static uint32_t gfx8_cz_tcc_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107,
	/* 108-127 skipped */
	128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
	190, 191
};
/* gfx9_tcc_counter_ids is same as gfx8_tcc_counter_ids */
/* gfx10 doesn't have TCC */
/****** TCP ******/
/* gfx7 TCP counter IDs: contiguous range 0-153. */
static uint32_t gfx7_tcp_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153
};
/* gfx8 TCP counter IDs: contiguous range 0-182. */
static uint32_t gfx8_tcp_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
	150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
	170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
	180, 181, 182
};
/* gfx9 TCP counter IDs: contiguous range 0-84. */
static uint32_t gfx9_tcp_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84
};
/* gfx10 TCP counter IDs: contiguous range 0-76. */
static uint32_t gfx10_tcp_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76
};
/****** TCS ******/
/* gfx7 TCS counter IDs: 0-20 and 64-127 (21-63 are not listed). */
static uint32_t gfx7_tcs_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20,
	/* 21-63 skipped */
	64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127
};
/* gfx8 doesn't have TCS */
/* gfx9 doesn't have TCS */
/* gfx10 doesn't have TCS */
/****** TD ******/
/* gfx7 TD counter IDs: contiguous range 0-54. */
static uint32_t gfx7_td_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54
};
/* gfx8_td_counter_ids are the same as gfx7_td_counter_ids */
/* gfx9 TD counter IDs: contiguous range 0-56. */
static uint32_t gfx9_td_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56
};
/* gfx10 TD counter IDs: contiguous range 0-60. */
static uint32_t gfx10_td_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60
};
/****** VGT ******/
/* gfx7 VGT counter IDs: contiguous range 0-139. */
static uint32_t gfx7_vgt_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139
};
/* gfx8 VGT counter IDs: contiguous range 0-145. */
static uint32_t gfx8_vgt_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145
};
/* gfx8 "pl" variant VGT counter IDs: contiguous range 0-146. */
static uint32_t gfx8_pl_vgt_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146
};
/* gfx9 VGT counter IDs: contiguous range 0-147. */
static uint32_t gfx9_vgt_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
	100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
	110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
	120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
	130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
	140, 141, 142, 143, 144, 145, 146, 147
};
/* gfx10 doesn't have VGT */
/****** WD ******/
/* gfx7 WD counter IDs: contiguous range 0-9. */
static uint32_t gfx7_wd_counter_ids[] = {
	0, 1, 2, 3, 4,
	5, 6, 7, 8, 9
};
/* gfx8 WD counter IDs: contiguous range 0-36. */
static uint32_t gfx8_wd_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36
};
/* gfx9 WD counter IDs: contiguous range 0-57. */
static uint32_t gfx9_wd_counter_ids[] = {
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
	50, 51, 52, 53, 54, 55, 56, 57
};
/* gfx10 doesn't have WD */
/*
 * Kaveri counter-block table, indexed by PERFCOUNTER_BLOCKID__*.
 * Only the SQ entry is initialized here; every other slot is left
 * zero-initialized by the designated-initializer rules.
 */
static struct perf_counter_block kaveri_blocks[PERFCOUNTER_BLOCKID__MAX] = {
	[PERFCOUNTER_BLOCKID__SQ] = {
		.num_of_slots = 8,
		/* element count of the gfx7 SQ ID list */
		.num_of_counters = sizeof(gfx7_sq_counter_ids) / sizeof(*gfx7_sq_counter_ids),
		.counter_ids = gfx7_sq_counter_ids,
		.counter_size_in_bits = 64,
		.counter_mask = BITMASK(64)
	},
};
/*
 * Hawaii counter-block table, indexed by PERFCOUNTER_BLOCKID__*.
 *
 * Every entry has the same shape: a slot count, an ID list with its
 * element count derived via sizeof, and 64-bit counters with a 64-bit
 * mask. HAWAII_PMC_BLOCK() spells that shape once; it is local to this
 * table and undefined right after it.
 */
#define HAWAII_PMC_BLOCK(slots, ids) { \
	.num_of_slots = (slots), \
	.num_of_counters = sizeof(ids) / sizeof(*(ids)), \
	.counter_ids = (ids), \
	.counter_size_in_bits = 64, \
	.counter_mask = BITMASK(64) \
}
static struct perf_counter_block hawaii_blocks[PERFCOUNTER_BLOCKID__MAX] = {
	[PERFCOUNTER_BLOCKID__CB] = HAWAII_PMC_BLOCK(7, gfx7_cb_counter_ids),
	[PERFCOUNTER_BLOCKID__CPF] = HAWAII_PMC_BLOCK(5, gfx7_cpf_counter_ids),
	[PERFCOUNTER_BLOCKID__CPG] = HAWAII_PMC_BLOCK(5, gfx7_cpg_counter_ids),
	[PERFCOUNTER_BLOCKID__DB] = HAWAII_PMC_BLOCK(12, gfx7_db_counter_ids),
	[PERFCOUNTER_BLOCKID__GDS] = HAWAII_PMC_BLOCK(4, gfx7_gds_counter_ids),
	[PERFCOUNTER_BLOCKID__GRBM] = HAWAII_PMC_BLOCK(2, gfx7_grbm_counter_ids),
	[PERFCOUNTER_BLOCKID__GRBMSE] = HAWAII_PMC_BLOCK(1, gfx7_grbmse_counter_ids),
	[PERFCOUNTER_BLOCKID__IA] = HAWAII_PMC_BLOCK(7, gfx7_ia_counter_ids),
	[PERFCOUNTER_BLOCKID__PASC] = HAWAII_PMC_BLOCK(11, gfx7_pasc_counter_ids),
	[PERFCOUNTER_BLOCKID__PASU] = HAWAII_PMC_BLOCK(10, gfx7_pasu_counter_ids),
	[PERFCOUNTER_BLOCKID__SPI] = HAWAII_PMC_BLOCK(4, gfx7_spi_counter_ids),
	[PERFCOUNTER_BLOCKID__SRBM] = HAWAII_PMC_BLOCK(2, gfx7_srbm_counter_ids),
	[PERFCOUNTER_BLOCKID__SQ] = HAWAII_PMC_BLOCK(8, gfx7_sq_counter_ids),
	[PERFCOUNTER_BLOCKID__SX] = HAWAII_PMC_BLOCK(4, gfx7_sx_counter_ids),
	[PERFCOUNTER_BLOCKID__TA] = HAWAII_PMC_BLOCK(6, gfx7_ta_counter_ids),
	/* TCA slot count: same as CZ */
	[PERFCOUNTER_BLOCKID__TCA] = HAWAII_PMC_BLOCK(10, gfx7_tca_counter_ids),
	[PERFCOUNTER_BLOCKID__TCC] = HAWAII_PMC_BLOCK(10, gfx7_tcc_counter_ids),
	[PERFCOUNTER_BLOCKID__TCP] = HAWAII_PMC_BLOCK(10, gfx7_tcp_counter_ids),
	[PERFCOUNTER_BLOCKID__TCS] = HAWAII_PMC_BLOCK(7, gfx7_tcs_counter_ids),
	[PERFCOUNTER_BLOCKID__TD] = HAWAII_PMC_BLOCK(6, gfx7_td_counter_ids),
	[PERFCOUNTER_BLOCKID__VGT] = HAWAII_PMC_BLOCK(10, gfx7_vgt_counter_ids),
	[PERFCOUNTER_BLOCKID__WD] = HAWAII_PMC_BLOCK(4, gfx7_wd_counter_ids),
};
#undef HAWAII_PMC_BLOCK
/*
 * Carrizo counter-block table, indexed by PERFCOUNTER_BLOCKID__*.
 *
 * Mixes gfx8 ID lists with the gfx7 lists for DB, GDS, GRBM, GRBMSE,
 * PASU, SX and TD. There is no TCS entry. Every entry uses 64-bit
 * counters with a 64-bit mask; CARRIZO_PMC_BLOCK() spells that common
 * shape once and is undefined right after the table.
 */
#define CARRIZO_PMC_BLOCK(slots, ids) { \
	.num_of_slots = (slots), \
	.num_of_counters = sizeof(ids) / sizeof(*(ids)), \
	.counter_ids = (ids), \
	.counter_size_in_bits = 64, \
	.counter_mask = BITMASK(64) \
}
static struct perf_counter_block carrizo_blocks[PERFCOUNTER_BLOCKID__MAX] = {
	[PERFCOUNTER_BLOCKID__CB] = CARRIZO_PMC_BLOCK(7, gfx8_cb_counter_ids),
	[PERFCOUNTER_BLOCKID__CPF] = CARRIZO_PMC_BLOCK(5, gfx8_cpf_counter_ids),
	[PERFCOUNTER_BLOCKID__CPG] = CARRIZO_PMC_BLOCK(5, gfx8_cpg_counter_ids),
	[PERFCOUNTER_BLOCKID__DB] = CARRIZO_PMC_BLOCK(12, gfx7_db_counter_ids),
	[PERFCOUNTER_BLOCKID__GDS] = CARRIZO_PMC_BLOCK(4, gfx7_gds_counter_ids),
	[PERFCOUNTER_BLOCKID__GRBM] = CARRIZO_PMC_BLOCK(2, gfx7_grbm_counter_ids),
	[PERFCOUNTER_BLOCKID__GRBMSE] = CARRIZO_PMC_BLOCK(1, gfx7_grbmse_counter_ids),
	[PERFCOUNTER_BLOCKID__IA] = CARRIZO_PMC_BLOCK(7, gfx8_ia_counter_ids),
	[PERFCOUNTER_BLOCKID__PASC] = CARRIZO_PMC_BLOCK(11, gfx8_pasc_counter_ids),
	[PERFCOUNTER_BLOCKID__PASU] = CARRIZO_PMC_BLOCK(10, gfx7_pasu_counter_ids),
	[PERFCOUNTER_BLOCKID__SPI] = CARRIZO_PMC_BLOCK(4, gfx8_spi_counter_ids),
	[PERFCOUNTER_BLOCKID__SRBM] = CARRIZO_PMC_BLOCK(2, gfx8_srbm_counter_ids),
	[PERFCOUNTER_BLOCKID__SQ] = CARRIZO_PMC_BLOCK(8, gfx8_sq_counter_ids),
	[PERFCOUNTER_BLOCKID__SX] = CARRIZO_PMC_BLOCK(4, gfx7_sx_counter_ids),
	[PERFCOUNTER_BLOCKID__TA] = CARRIZO_PMC_BLOCK(6, gfx8_ta_counter_ids),
	/* PMC0: PERF_SEL~PERF_SEL3, PMC1: PERF_SEL~PERF_SEL3,
	 * PMC2: PERF_SEL, PMC3: PERF_SEL. So 10 PERF_SELs in total
	 */
	[PERFCOUNTER_BLOCKID__TCA] = CARRIZO_PMC_BLOCK(10, gfx8_tca_counter_ids),
	[PERFCOUNTER_BLOCKID__TCC] = CARRIZO_PMC_BLOCK(10, gfx8_cz_tcc_counter_ids),
	[PERFCOUNTER_BLOCKID__TCP] = CARRIZO_PMC_BLOCK(10, gfx8_tcp_counter_ids),
	[PERFCOUNTER_BLOCKID__TD] = CARRIZO_PMC_BLOCK(6, gfx7_td_counter_ids),
	[PERFCOUNTER_BLOCKID__VGT] = CARRIZO_PMC_BLOCK(10, gfx8_vgt_counter_ids),
	[PERFCOUNTER_BLOCKID__WD] = CARRIZO_PMC_BLOCK(4, gfx8_wd_counter_ids),
};
#undef CARRIZO_PMC_BLOCK
static struct perf_counter_block fiji_blocks[PERFCOUNTER_BLOCKID__MAX] = {
[PERFCOUNTER_BLOCKID__CB] = {
.num_of_slots = 7,
.num_of_counters = sizeof(gfx8_cb_counter_ids) /
sizeof(*gfx8_cb_counter_ids),
.counter_ids = gfx8_cb_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__CPF] = {
.num_of_slots = 5,
.num_of_counters = sizeof(gfx8_cpf_counter_ids) /
sizeof(*gfx8_cpf_counter_ids),
.counter_ids = gfx8_cpf_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__CPG] = {
.num_of_slots = 5,
.num_of_counters = sizeof(gfx8_cpg_counter_ids) /
sizeof(*gfx8_cpg_counter_ids),
.counter_ids = gfx8_cpg_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__DB] = {
.num_of_slots = 12,
.num_of_counters = sizeof(gfx7_db_counter_ids) /
sizeof(*gfx7_db_counter_ids),
.counter_ids = gfx7_db_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__GDS] = {
.num_of_slots = 4,
.num_of_counters = sizeof(gfx7_gds_counter_ids) /
sizeof(*gfx7_gds_counter_ids),
.counter_ids = gfx7_gds_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__GRBM] = {
.num_of_slots = 2,
.num_of_counters = sizeof(gfx7_grbm_counter_ids) /
sizeof(*gfx7_grbm_counter_ids),
.counter_ids = gfx7_grbm_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__GRBMSE] = {
.num_of_slots = 1,
.num_of_counters = sizeof(gfx7_grbmse_counter_ids) /
sizeof(*gfx7_grbmse_counter_ids),
.counter_ids = gfx7_grbmse_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__IA] = {
.num_of_slots = 7,
.num_of_counters = sizeof(gfx8_ia_counter_ids) /
sizeof(*gfx8_ia_counter_ids),
.counter_ids = gfx8_ia_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__PASC] = {
.num_of_slots = 11,
.num_of_counters = sizeof(gfx8_pasc_counter_ids) /
sizeof(*gfx8_pasc_counter_ids),
.counter_ids = gfx8_pasc_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__PASU] = {
.num_of_slots = 10,
.num_of_counters = sizeof(gfx7_pasu_counter_ids) /
sizeof(*gfx7_pasu_counter_ids),
.counter_ids = gfx7_pasu_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__SPI] = {
.num_of_slots = 4,
.num_of_counters = sizeof(gfx8_spi_counter_ids) /
sizeof(*gfx8_spi_counter_ids),
.counter_ids = gfx8_spi_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__SRBM] = {
.num_of_slots = 2,
.num_of_counters = sizeof(gfx8_srbm_counter_ids) /
sizeof(*gfx8_srbm_counter_ids),
.counter_ids = gfx8_srbm_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__SQ] = {
.num_of_slots = 8,
.num_of_counters = sizeof(gfx8_sq_counter_ids) /
sizeof(*gfx8_sq_counter_ids),
.counter_ids = gfx8_sq_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__SX] = {
.num_of_slots = 4,
.num_of_counters = sizeof(gfx7_sx_counter_ids) /
sizeof(*gfx7_sx_counter_ids),
.counter_ids = gfx7_sx_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__TA] = {
.num_of_slots = 6,
.num_of_counters = sizeof(gfx8_ta_counter_ids) /
sizeof(*gfx8_ta_counter_ids),
.counter_ids = gfx8_ta_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__TCA] = {
.num_of_slots = 10, /* same as CZ */
.num_of_counters = sizeof(gfx8_tca_counter_ids) /
sizeof(*gfx8_tca_counter_ids),
.counter_ids = gfx8_tca_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__TCC] = {
.num_of_slots = 10,
.num_of_counters = sizeof(gfx8_tcc_counter_ids) /
sizeof(*gfx8_tcc_counter_ids),
.counter_ids = gfx8_tcc_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__TCP] = {
.num_of_slots = 10,
.num_of_counters = sizeof(gfx8_tcp_counter_ids) /
sizeof(*gfx8_tcp_counter_ids),
.counter_ids = gfx8_tcp_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__TD] = {
.num_of_slots = 6,
.num_of_counters = sizeof(gfx7_td_counter_ids) /
sizeof(*gfx7_td_counter_ids),
.counter_ids = gfx7_td_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__VGT] = {
.num_of_slots = 10,
.num_of_counters = sizeof(gfx8_vgt_counter_ids) /
sizeof(*gfx8_vgt_counter_ids),
.counter_ids = gfx8_vgt_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__WD] = {
.num_of_slots = 4,
.num_of_counters = sizeof(gfx8_wd_counter_ids) /
sizeof(*gfx8_wd_counter_ids),
.counter_ids = gfx8_wd_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
};
/*
 * Performance-counter block table for Polaris/VegaM, indexed by
 * enum perf_block_id.
 *
 * Every listed block uses 64-bit counters with a BITMASK(64) mask; only the
 * slot count and the counter-ID array differ, so each entry is spelled with
 * a block-local helper macro.  Block IDs that are not listed stay
 * zero-initialized.
 */
#define POLARIS_PMC_BLOCK(slots, ids) { \
	.num_of_slots = (slots), \
	.num_of_counters = sizeof(ids) / sizeof(*(ids)), \
	.counter_ids = (ids), \
	.counter_size_in_bits = 64, \
	.counter_mask = BITMASK(64) \
}
static struct perf_counter_block polaris_blocks[PERFCOUNTER_BLOCKID__MAX] = {
	[PERFCOUNTER_BLOCKID__CB] = POLARIS_PMC_BLOCK(7, gfx8_cb_counter_ids),
	[PERFCOUNTER_BLOCKID__CPF] = POLARIS_PMC_BLOCK(5, gfx8_cpf_counter_ids),
	[PERFCOUNTER_BLOCKID__CPG] = POLARIS_PMC_BLOCK(5, gfx8_cpg_counter_ids),
	[PERFCOUNTER_BLOCKID__DB] = POLARIS_PMC_BLOCK(12, gfx7_db_counter_ids),
	[PERFCOUNTER_BLOCKID__GDS] = POLARIS_PMC_BLOCK(4, gfx7_gds_counter_ids),
	[PERFCOUNTER_BLOCKID__GRBM] = POLARIS_PMC_BLOCK(2, gfx7_grbm_counter_ids),
	[PERFCOUNTER_BLOCKID__GRBMSE] = POLARIS_PMC_BLOCK(1, gfx7_grbmse_counter_ids),
	[PERFCOUNTER_BLOCKID__IA] = POLARIS_PMC_BLOCK(7, gfx8_ia_counter_ids),
	[PERFCOUNTER_BLOCKID__PASC] = POLARIS_PMC_BLOCK(11, gfx8_pasc_counter_ids),
	[PERFCOUNTER_BLOCKID__PASU] = POLARIS_PMC_BLOCK(10, gfx7_pasu_counter_ids),
	[PERFCOUNTER_BLOCKID__SPI] = POLARIS_PMC_BLOCK(4, gfx8_spi_counter_ids),
	[PERFCOUNTER_BLOCKID__SQ] = POLARIS_PMC_BLOCK(8, gfx8_pl_sq_counter_ids),
	[PERFCOUNTER_BLOCKID__SRBM] = POLARIS_PMC_BLOCK(2, gfx8_srbm_counter_ids),
	[PERFCOUNTER_BLOCKID__SX] = POLARIS_PMC_BLOCK(4, gfx7_sx_counter_ids),
	[PERFCOUNTER_BLOCKID__TA] = POLARIS_PMC_BLOCK(6, gfx8_ta_counter_ids),
	/* TCA slot count is the same as CZ */
	[PERFCOUNTER_BLOCKID__TCA] = POLARIS_PMC_BLOCK(10, gfx8_tca_counter_ids),
	[PERFCOUNTER_BLOCKID__TCC] = POLARIS_PMC_BLOCK(10, gfx8_tcc_counter_ids),
	[PERFCOUNTER_BLOCKID__TCP] = POLARIS_PMC_BLOCK(10, gfx8_tcp_counter_ids),
	[PERFCOUNTER_BLOCKID__TD] = POLARIS_PMC_BLOCK(6, gfx7_td_counter_ids),
	[PERFCOUNTER_BLOCKID__VGT] = POLARIS_PMC_BLOCK(10, gfx8_pl_vgt_counter_ids),
	[PERFCOUNTER_BLOCKID__WD] = POLARIS_PMC_BLOCK(4, gfx8_wd_counter_ids),
};
#undef POLARIS_PMC_BLOCK
static struct perf_counter_block vega_blocks[PERFCOUNTER_BLOCKID__MAX] = {
[PERFCOUNTER_BLOCKID__CB] = {
.num_of_slots = 7,
.num_of_counters = sizeof(gfx9_cb_counter_ids) /
sizeof(*gfx9_cb_counter_ids),
.counter_ids = gfx9_cb_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__CPF] = {
.num_of_slots = 5,
.num_of_counters = sizeof(gfx9_cpf_counter_ids) /
sizeof(*gfx9_cpf_counter_ids),
.counter_ids = gfx9_cpf_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__CPG] = {
.num_of_slots = 5,
.num_of_counters = sizeof(gfx9_cpg_counter_ids) /
sizeof(*gfx9_cpg_counter_ids),
.counter_ids = gfx9_cpg_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__DB] = {
.num_of_slots = 12,
.num_of_counters = sizeof(gfx9_db_counter_ids) /
sizeof(*gfx9_db_counter_ids),
.counter_ids = gfx9_db_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__GDS] = {
.num_of_slots = 4,
.num_of_counters = sizeof(gfx7_gds_counter_ids) /
sizeof(*gfx7_gds_counter_ids),
.counter_ids = gfx7_gds_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__GRBM] = {
.num_of_slots = 2,
.num_of_counters = sizeof(gfx9_grbm_counter_ids) /
sizeof(*gfx9_grbm_counter_ids),
.counter_ids = gfx9_grbm_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__GRBMSE] = {
.num_of_slots = 1,
.num_of_counters = sizeof(gfx9_grbmse_counter_ids) /
sizeof(*gfx9_grbmse_counter_ids),
.counter_ids = gfx9_grbmse_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__IA] = {
.num_of_slots = 7,
.num_of_counters = sizeof(gfx9_ia_counter_ids) /
sizeof(*gfx9_ia_counter_ids),
.counter_ids = gfx9_ia_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__PASC] = {
.num_of_slots = 11,
.num_of_counters = sizeof(gfx9_pasc_counter_ids) /
sizeof(*gfx9_pasc_counter_ids),
.counter_ids = gfx9_pasc_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__PASU] = {
.num_of_slots = 10,
.num_of_counters = sizeof(gfx9_pasu_counter_ids) /
sizeof(*gfx9_pasu_counter_ids),
.counter_ids = gfx9_pasu_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__SPI] = {
.num_of_slots = 18,
.num_of_counters = sizeof(gfx9_spi_counter_ids) /
sizeof(*gfx9_spi_counter_ids),
.counter_ids = gfx9_spi_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__SQ] = {
.num_of_slots = 16,
.num_of_counters = sizeof(gfx9_sq_counter_ids) /
sizeof(*gfx9_sq_counter_ids),
.counter_ids = gfx9_sq_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__SX] = {
.num_of_slots = 4,
.num_of_counters = sizeof(gfx9_sx_counter_ids) /
sizeof(*gfx9_sx_counter_ids),
.counter_ids = gfx9_sx_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__TA] = {
.num_of_slots = 6,
.num_of_counters = sizeof(gfx8_ta_counter_ids) /
sizeof(*gfx8_ta_counter_ids),
.counter_ids = gfx8_ta_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__TCA] = {
.num_of_slots = 10, /* same as Fiji */
/* Greenland has the same TCA counter IDs with Fiji */
.num_of_counters = sizeof(gfx8_tca_counter_ids) /
sizeof(*gfx8_tca_counter_ids),
.counter_ids = gfx8_tca_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__TCC] = {
.num_of_slots = 10,
.num_of_counters = sizeof(gfx8_tcc_counter_ids) /
sizeof(*gfx8_tcc_counter_ids),
.counter_ids = gfx8_tcc_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__TCP] = {
.num_of_slots = 10,
.num_of_counters = sizeof(gfx9_tcp_counter_ids) /
sizeof(*gfx9_tcp_counter_ids),
.counter_ids = gfx9_tcp_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__TD] = {
.num_of_slots = 6,
.num_of_counters = sizeof(gfx9_td_counter_ids) /
sizeof(*gfx9_td_counter_ids),
.counter_ids = gfx9_td_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__VGT] = {
.num_of_slots = 10,
.num_of_counters = sizeof(gfx9_vgt_counter_ids) /
sizeof(*gfx9_vgt_counter_ids),
.counter_ids = gfx9_vgt_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
[PERFCOUNTER_BLOCKID__WD] = {
.num_of_slots = 4,
.num_of_counters = sizeof(gfx9_wd_counter_ids) /
sizeof(*gfx9_wd_counter_ids),
.counter_ids = gfx9_wd_counter_ids,
.counter_size_in_bits = 64,
.counter_mask = BITMASK(64)
},
};
/*
 * Performance-counter block table for Navi, indexed by enum perf_block_id.
 *
 * Every listed block uses 64-bit counters with a BITMASK(64) mask; only the
 * slot count and the counter-ID array differ, so each entry is spelled with
 * a block-local helper macro.  Block IDs that are not listed stay
 * zero-initialized.
 */
#define NAVI_PMC_BLOCK(slots, ids) { \
	.num_of_slots = (slots), \
	.num_of_counters = sizeof(ids) / sizeof(*(ids)), \
	.counter_ids = (ids), \
	.counter_size_in_bits = 64, \
	.counter_mask = BITMASK(64) \
}
static struct perf_counter_block navi_blocks[PERFCOUNTER_BLOCKID__MAX] = {
	[PERFCOUNTER_BLOCKID__CB] = NAVI_PMC_BLOCK(7, gfx10_cb_counter_ids),
	[PERFCOUNTER_BLOCKID__CPF] = NAVI_PMC_BLOCK(6, gfx10_cpf_counter_ids),
	[PERFCOUNTER_BLOCKID__CPG] = NAVI_PMC_BLOCK(6, gfx10_cpg_counter_ids),
	[PERFCOUNTER_BLOCKID__DB] = NAVI_PMC_BLOCK(12, gfx10_db_counter_ids),
	[PERFCOUNTER_BLOCKID__GDS] = NAVI_PMC_BLOCK(10, gfx10_gds_counter_ids),
	[PERFCOUNTER_BLOCKID__GRBM] = NAVI_PMC_BLOCK(2, gfx10_grbm_counter_ids),
	[PERFCOUNTER_BLOCKID__GRBMSE] = NAVI_PMC_BLOCK(1, gfx10_grbmse_counter_ids),
	[PERFCOUNTER_BLOCKID__PASC] = NAVI_PMC_BLOCK(11, gfx10_pasc_counter_ids),
	[PERFCOUNTER_BLOCKID__PASU] = NAVI_PMC_BLOCK(16, gfx10_pasu_counter_ids),
	[PERFCOUNTER_BLOCKID__SPI] = NAVI_PMC_BLOCK(18, gfx10_spi_counter_ids),
	[PERFCOUNTER_BLOCKID__SQ] = NAVI_PMC_BLOCK(16, gfx10_sq_counter_ids),
	[PERFCOUNTER_BLOCKID__SX] = NAVI_PMC_BLOCK(4, gfx10_sx_counter_ids),
	[PERFCOUNTER_BLOCKID__TA] = NAVI_PMC_BLOCK(5, gfx10_ta_counter_ids),
	[PERFCOUNTER_BLOCKID__TCP] = NAVI_PMC_BLOCK(10, gfx10_tcp_counter_ids),
	[PERFCOUNTER_BLOCKID__TD] = NAVI_PMC_BLOCK(5, gfx10_td_counter_ids),
};
#undef NAVI_PMC_BLOCK
/*
 * Look up the performance-counter description of one hardware block.
 *
 * @node_id:  topology node whose GFX version and device ID select the table
 * @block_id: block to look up; must lie in
 *            [PERFCOUNTER_BLOCKID__FIRST, PERFCOUNTER_BLOCKID__MAX)
 * @block:    receives a copy of the selected table entry
 *
 * Returns HSAKMT_STATUS_SUCCESS after filling *block, or
 * HSAKMT_STATUS_INVALID_PARAMETER when block_id is out of range, the node is
 * a Tonga, or the major GFX version is not one of 7, 8, 9 or 10.
 */
HSAKMT_STATUS hsakmt_get_block_properties(uint32_t node_id,
					  enum perf_block_id block_id,
					  struct perf_counter_block *block)
{
	const struct perf_counter_block *table;
	const uint32_t gfxv = hsakmt_get_gfxv_by_node_id(node_id);
	const uint16_t dev_id = hsakmt_get_device_id_by_node_id(node_id);
	const uint32_t gfx_major = gfxv >> 16;

	if (block_id < PERFCOUNTER_BLOCKID__FIRST ||
	    block_id >= PERFCOUNTER_BLOCKID__MAX)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (gfx_major == 7) {
		table = (gfxv == GFX_VERSION_KAVERI) ? kaveri_blocks
						     : hawaii_blocks;
	} else if (gfx_major == 8) {
		if (gfxv == GFX_VERSION_TONGA)
			return HSAKMT_STATUS_INVALID_PARAMETER;

		if (gfxv == GFX_VERSION_CARRIZO)
			table = carrizo_blocks;
		/*
		 * Fiji, Polaris and VegaM all report GFXIP engine version
		 * 8.0.3, so the PCI device ID is the only way to tell Fiji
		 * apart from Polaris/VegaM.
		 */
		else if (dev_id == 0x7300 || dev_id == 0x730F)
			table = fiji_blocks;
		else
			table = polaris_blocks;
	} else if (gfx_major == 9) {
		table = vega_blocks;
	} else if (gfx_major == 10) {
		table = navi_blocks;
	} else {
		return HSAKMT_STATUS_INVALID_PARAMETER;
	}

	*block = table[block_id];
	return HSAKMT_STATUS_SUCCESS;
}
================================================
FILE: libhsakmt/src/pmc_table.h
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef PMC_TABLE_H
#define PMC_TABLE_H
#include "libhsakmt.h"
/*
 * Identifiers of the hardware blocks that can expose performance counters.
 * The values are consecutive starting at PERFCOUNTER_BLOCKID__FIRST and are
 * used as array indices by the per-ASIC perf_counter_block tables, which are
 * sized with PERFCOUNTER_BLOCKID__MAX.
 */
enum perf_block_id {
/* Lowest valid block ID; aliases the first real block below. */
PERFCOUNTER_BLOCKID__FIRST = 0,
/* non-privileged */
PERFCOUNTER_BLOCKID__CB = PERFCOUNTER_BLOCKID__FIRST,
PERFCOUNTER_BLOCKID__CPC,
PERFCOUNTER_BLOCKID__CPF,
PERFCOUNTER_BLOCKID__CPG,
PERFCOUNTER_BLOCKID__DB,
PERFCOUNTER_BLOCKID__GDS,
PERFCOUNTER_BLOCKID__GRBM,
PERFCOUNTER_BLOCKID__GRBMSE,
PERFCOUNTER_BLOCKID__IA,
PERFCOUNTER_BLOCKID__MC,
PERFCOUNTER_BLOCKID__PASC,
PERFCOUNTER_BLOCKID__PASU,
PERFCOUNTER_BLOCKID__SPI,
PERFCOUNTER_BLOCKID__SRBM,
PERFCOUNTER_BLOCKID__SQ,
PERFCOUNTER_BLOCKID__SX,
PERFCOUNTER_BLOCKID__TA,
PERFCOUNTER_BLOCKID__TCA,
PERFCOUNTER_BLOCKID__TCC,
PERFCOUNTER_BLOCKID__TCP,
PERFCOUNTER_BLOCKID__TCS,
PERFCOUNTER_BLOCKID__TD,
PERFCOUNTER_BLOCKID__VGT,
PERFCOUNTER_BLOCKID__WD,
/* privileged (no privileged block IDs are currently defined) */
/* One past the last valid block ID; not a block itself. */
PERFCOUNTER_BLOCKID__MAX
};
/*
 * Description of the performance counters of one hardware block, as filled
 * in by hsakmt_get_block_properties().
 */
struct perf_counter_block {
/* Slot count of the block (presumably the number of counters that can be
 * programmed at once - TODO confirm against the users of this field).
 */
uint32_t num_of_slots;
/* Number of entries in the counter_ids array. */
uint32_t num_of_counters;
/* Array of num_of_counters counter IDs. */
uint32_t *counter_ids;
/* Width of one counter in bits. */
uint32_t counter_size_in_bits;
/* Bit mask for a counter value (presumably covering counter_size_in_bits
 * bits - TODO confirm).
 */
uint64_t counter_mask;
};
/*
 * Copy the performance-counter description of block_id on node node_id into
 * *block.
 *
 * Returns HSAKMT_STATUS_SUCCESS on success, or
 * HSAKMT_STATUS_INVALID_PARAMETER when block_id is outside
 * [PERFCOUNTER_BLOCKID__FIRST, PERFCOUNTER_BLOCKID__MAX) or the node's GFX
 * version has no counter table.
 */
HSAKMT_STATUS hsakmt_get_block_properties(uint32_t node_id,
enum perf_block_id block_id,
struct perf_counter_block *block);
#endif // PMC_TABLE_H
================================================
FILE: libhsakmt/src/queues.c
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "libhsakmt.h"
#include "fmm.h"
#include "hsakmt/linux/kfd_ioctl.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
/* 1024 doorbells, 4 or 8 bytes each doorbell depending on ASIC generation */
/* Size in bytes of one doorbell: 8 when gfxv >= 0x90000, otherwise 4. */
#define DOORBELL_SIZE(gfxv) (((gfxv) >= 0x90000) ? 8 : 4)
/* Size in bytes of 1024 doorbells of ds bytes each. */
#define DOORBELLS_PAGE_SIZE(ds) (1024 * (ds))
/* Per-CU work-group context size in bytes: VGPR + SGPR + LDS + HW register
 * areas.  `node` must be an HsaNodeProperties object (not a pointer) because
 * the expansion accesses node.LDSSizeInKB directly.
 */
#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, node) \
(hsakmt_get_vgpr_size_per_cu(gfxv) + SGPR_SIZE_PER_CU + \
(node.LDSSizeInKB << 10) + HWREG_SIZE_PER_CU)
/* Control-stack bytes per wave: 12 from GFX_VERSION_NAVI10 on, otherwise 8. */
#define CNTL_STACK_BYTES_PER_WAVE(gfxv) \
((gfxv) >= GFX_VERSION_NAVI10 ? 12 : 8)
/* HW register area per CU, in bytes (term of WG_CONTEXT_DATA_SIZE_PER_CU). */
#define HWREG_SIZE_PER_CU 0x1000
/* Alignment, in bytes, applied to the per-queue debugger memory size. */
#define DEBUGGER_BYTES_ALIGN 64
/* Debugger memory per wave, in bytes. */
#define DEBUGGER_BYTES_PER_WAVE 32
/*
 * Library-side bookkeeping for one queue, including the auxiliary buffers
 * that free_queue() releases together with the structure itself.
 */
struct queue {
uint32_t queue_id;
/* presumably the queue's write and read pointers - TODO confirm */
uint64_t wptr;
uint64_t rptr;
/* EOP buffer; released by free_queue() with eop_buffer_size. */
void *eop_buffer;
/* Context save/restore area; released by free_queue() with
 * total_mem_alloc_size.
 */
void *ctx_save_restore;
/* Sizes computed by update_ctx_save_restore_size(). */
uint32_t ctx_save_restore_size;
uint32_t ctl_stack_size;
uint32_t debug_memory_size;
uint32_t eop_buffer_size;
uint32_t total_mem_alloc_size;
/* GFX version value, compared against the GFX_VERSION_* constants. */
uint32_t gfxv;
/* When true the queue's memory is plain CPU mmap() memory released with
 * munmap(); otherwise it comes from the GPU allocation helpers.
 */
bool use_ats;
/* When true free_queue() releases ctx_save_restore with munmap(). */
bool unified_ctx_save_restore;
/* This queue structure is allocated from GPU with page aligned size
 * but only small bytes are used. We use the extra space in the end for
 * cu_mask bits array.
 */
uint32_t cu_mask_count; /* in bits */
uint32_t cu_mask[0];
};
/*
 * Per-node doorbell mapping state of this process.
 */
struct process_doorbells {
/* True when the mapping was created through the fmm doorbell allocator
 * (map_doorbell_dgpu); false when it is a plain mmap() of the KFD fd.
 */
bool use_gpuvm;
/* Size of the mapping in bytes; 0 means the node has no mapping yet. */
uint32_t size;
/* Address of the doorbell mapping, NULL until mapped. */
void *mapping;
/* Held by map_doorbell() while it checks for and creates the mapping. */
pthread_mutex_t mutex;
};
/* Number of entries in the doorbells array (0 when it is not allocated). */
static unsigned int num_doorbells;
/* Per-node doorbell state, indexed by topology NodeId; NULL when not
 * initialized.
 */
static struct process_doorbells *doorbells;
/*
 * Return the per-CU VGPR size in bytes for the given GFX version:
 * 0x80000 for GFX950, the Aqua Vanjaram family, Aldebaran and Arcturus;
 * 0x60000 for Plum Bonito, Wheat Nas, GFX1200 and GFX1201;
 * 0x40000 for everything else.
 */
uint32_t hsakmt_get_vgpr_size_per_cu(uint32_t gfxv)
{
	if (gfxv == GFX_VERSION_GFX950 ||
	    (gfxv & ~(0xff)) == GFX_VERSION_AQUA_VANJARAM ||
	    gfxv == GFX_VERSION_ALDEBARAN ||
	    gfxv == GFX_VERSION_ARCTURUS)
		return 0x80000;

	if (gfxv == GFX_VERSION_PLUM_BONITO ||
	    gfxv == GFX_VERSION_WHEAT_NAS ||
	    gfxv == GFX_VERSION_GFX1200 ||
	    gfxv == GFX_VERSION_GFX1201)
		return 0x60000;

	return 0x40000;
}
HSAKMT_STATUS hsakmt_init_process_doorbells(unsigned int NumNodes)
{
unsigned int i;
HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
/* doorbells[] is accessed using Topology NodeId. This means doorbells[0],
* which corresponds to CPU only Node, might not be used
*/
doorbells = malloc(NumNodes * sizeof(struct process_doorbells));
if (!doorbells)
return HSAKMT_STATUS_NO_MEMORY;
for (i = 0; i < NumNodes; i++) {
doorbells[i].use_gpuvm = false;
doorbells[i].size = 0;
doorbells[i].mapping = NULL;
pthread_mutex_init(&doorbells[i].mutex, NULL);
}
num_doorbells = NumNodes;
return ret;
}
/*
 * Decide how the doorbells of node_id are to be mapped: fill in
 * doorbell->use_gpuvm and doorbell->size (at least one page).
 */
static void get_doorbell_map_info(uint32_t node_id,
				  struct process_doorbells *doorbell)
{
	const uint32_t gfxv = hsakmt_get_gfxv_by_node_id(node_id);
	uint32_t map_bytes = DOORBELLS_PAGE_SIZE(DOORBELL_SIZE(gfxv));

	/*
	 * GPUVM doorbell on Tonga requires a workaround for VM TLB ACTIVE bit
	 * lookup bug. Remove ASIC check when this is implemented in amdgpu.
	 */
	doorbell->use_gpuvm = (hsakmt_is_dgpu && gfxv != GFX_VERSION_TONGA);

	/* Never map less than a full page. */
	if (map_bytes < (uint32_t) PAGE_SIZE)
		map_bytes = PAGE_SIZE;
	doorbell->size = map_bytes;
}
void hsakmt_destroy_process_doorbells(void)
{
unsigned int i;
if (!doorbells)
return;
for (i = 0; i < num_doorbells; i++) {
if (!doorbells[i].size)
continue;
if (doorbells[i].use_gpuvm) {
hsakmt_fmm_unmap_from_gpu(doorbells[i].mapping);
hsakmt_fmm_release(doorbells[i].mapping);
} else
munmap(doorbells[i].mapping, doorbells[i].size);
}
free(doorbells);
doorbells = NULL;
num_doorbells = 0;
}
/* This is a special funcion that should be called only from the child process
* after a fork(). This will clear doorbells duplicated from the parent.
*/
void hsakmt_clear_process_doorbells(void)
{
unsigned int i;
if (!doorbells)
return;
for (i = 0; i < num_doorbells; i++) {
if (!doorbells[i].size)
continue;
if (!doorbells[i].use_gpuvm)
munmap(doorbells[i].mapping, doorbells[i].size);
}
free(doorbells);
doorbells = NULL;
num_doorbells = 0;
}
/*
 * Map the doorbells of NodeId into the process with a shared read/write
 * mmap() of the KFD file descriptor at doorbell_mmap_offset, and record the
 * address in doorbells[NodeId].mapping.  gpu_id is not used.
 *
 * Returns HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_ERROR when mmap() fails.
 */
static HSAKMT_STATUS map_doorbell_apu(HSAuint32 NodeId, HSAuint32 gpu_id,
				      HSAuint64 doorbell_mmap_offset)
{
	void *cpu_addr = mmap(0, doorbells[NodeId].size, PROT_READ|PROT_WRITE,
			      MAP_SHARED, hsakmt_kfd_fd, doorbell_mmap_offset);

	if (cpu_addr != MAP_FAILED) {
		doorbells[NodeId].mapping = cpu_addr;
		return HSAKMT_STATUS_SUCCESS;
	}

	return HSAKMT_STATUS_ERROR;
}
/*
 * Allocate the doorbells of NodeId through the fmm doorbell allocator, map
 * them for GPU access and record the address in doorbells[NodeId].mapping.
 *
 * Returns HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_ERROR when either the
 * allocation or the GPU mapping fails (the allocation is released again in
 * the latter case).
 */
static HSAKMT_STATUS map_doorbell_dgpu(HSAuint32 NodeId, HSAuint32 gpu_id,
				       HSAuint64 doorbell_mmap_offset)
{
	void *db_addr = hsakmt_fmm_allocate_doorbell(gpu_id,
						     doorbells[NodeId].size,
						     doorbell_mmap_offset);

	if (db_addr == NULL)
		return HSAKMT_STATUS_ERROR;

	/* map for GPU access */
	if (hsakmt_fmm_map_to_gpu(db_addr, doorbells[NodeId].size, NULL) != 0) {
		hsakmt_fmm_release(db_addr);
		return HSAKMT_STATUS_ERROR;
	}

	doorbells[NodeId].mapping = db_addr;
	return HSAKMT_STATUS_SUCCESS;
}
/*
 * Make sure the doorbells of NodeId are mapped, creating the mapping on the
 * first call.  The node's mutex is held for the whole check-and-map
 * sequence.
 *
 * Returns HSAKMT_STATUS_SUCCESS when a mapping already exists or was
 * created, otherwise the error of the failed mapping attempt; in that case
 * the entry's size is reset to 0 so that a later call tries again.
 */
static HSAKMT_STATUS map_doorbell(HSAuint32 NodeId, HSAuint32 gpu_id,
				  HSAuint64 doorbell_mmap_offset)
{
	struct process_doorbells *db = &doorbells[NodeId];
	HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;

	pthread_mutex_lock(&db->mutex);

	/* A non-zero size means the mapping already exists. */
	if (!db->size) {
		get_doorbell_map_info(NodeId, db);

		if (db->use_gpuvm) {
			status = map_doorbell_dgpu(NodeId, gpu_id,
						   doorbell_mmap_offset);
			/* Fall back to the old method if KFD doesn't
			 * support doorbells in GPUVM
			 */
			if (status != HSAKMT_STATUS_SUCCESS)
				db->use_gpuvm = false;
		}

		if (!db->use_gpuvm)
			status = map_doorbell_apu(NodeId, gpu_id,
						  doorbell_mmap_offset);

		if (status != HSAKMT_STATUS_SUCCESS)
			db->size = 0;
	}

	pthread_mutex_unlock(&db->mutex);
	return status;
}
/*
 * Allocate size bytes of private, anonymous CPU memory that is readable,
 * writable and executable.
 *
 * Returns the mapping, or NULL when mmap() fails.
 */
static void *allocate_exec_aligned_memory_cpu(uint32_t size)
{
	const int prot = PROT_READ | PROT_WRITE | PROT_EXEC;
	const int map_flags = MAP_ANONYMOUS | MAP_PRIVATE;

	/* mmap will return a pointer with alignment equal to
	 * sysconf(_SC_PAGESIZE).
	 *
	 * MAP_ANONYMOUS initializes the memory to zero.
	 */
	void *region = mmap(NULL, size, prot, map_flags, -1, 0);

	return (region == MAP_FAILED) ? NULL : region;
}
/*
 * Compute the control-stack, debugger-memory and context save/restore sizes
 * of queue q from the properties of node nodeid and store them in q.
 *
 * The bool return indicates whether the queue needs a context-save-restore
 * area: false for GFX versions below Carrizo, when the node properties
 * cannot be read, or when the node reports no compute cores or no SIMDs per
 * CU; true otherwise.
 */
static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q)
{
	HsaNodeProperties node;
	uint32_t cu_num, wave_num;
	uint32_t ctl_stack_size, wg_data_size;

	if (q->gfxv < GFX_VERSION_CARRIZO)
		return false;

	if (hsaKmtGetNodeProperties(nodeid, &node))
		return false;

	if (!node.NumFComputeCores || !node.NumSIMDPerCU)
		return false;

	cu_num = node.NumFComputeCores / node.NumSIMDPerCU / node.NumXcc;

	if (q->gfxv < GFX_VERSION_NAVI10)
		wave_num = MIN(cu_num * 40,
			       node.NumShaderBanks / node.NumArrays * 512);
	else
		wave_num = cu_num * 32;

	ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(q->gfxv) + 8;
	wg_data_size = cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(q->gfxv, node);

	q->ctl_stack_size = PAGE_ALIGN_UP(sizeof(HsaUserContextSaveAreaHeader)
					  + ctl_stack_size);

	if ((q->gfxv & 0x3f0000) == 0xA0000) {
		/* HW design limits control stack size to 0x7000.
		 * This is insufficient for theoretical PM4 cases
		 * but sufficient for AQL, limited by SPI events.
		 */
		q->ctl_stack_size = MIN(q->ctl_stack_size, 0x7000);
	}

	q->debug_memory_size =
		ALIGN_UP(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN);

	q->ctx_save_restore_size = q->ctl_stack_size
				   + PAGE_ALIGN_UP(wg_data_size);

	return true;
}
/*
 * Allocate executable memory and map it for GPU access.
 *
 * @size:        requested size in bytes; rounded up to a multiple of @align
 * @align:       alignment used to round @size up
 * @gpu_id:      GPU the allocation is made for
 * @NodeId:      topology node of that GPU; when non-zero the memory is mapped
 *               to this node only, otherwise hsaKmtMapMemoryToGPU() is used
 * @nonPaged:    request non-paged memory
 * @DeviceLocal: allocate device memory (unless ZFB support is enabled)
 *               instead of host memory; also makes the memory coarse-grained
 *               and not host-accessible
 * @Uncached:    request uncached memory
 *
 * Returns the allocated address, or NULL when the allocation or the GPU
 * mapping fails (the allocation is freed again in the latter case).
 */
void *hsakmt_allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t gpu_id,
					      uint32_t NodeId, bool nonPaged,
					      bool DeviceLocal,
					      bool Uncached)
{
	void *mem = NULL;
	HSAuint64 gpu_va;
	HsaMemFlags flags;
	HSAuint32 cpu_id = 0;
	HSAKMT_STATUS result;

	flags.Value = 0;
	flags.ui32.HostAccess = !DeviceLocal;
	flags.ui32.ExecuteAccess = 1;
	flags.ui32.NonPaged = nonPaged;
	flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
	flags.ui32.CoarseGrain = DeviceLocal;
	flags.ui32.Uncached = Uncached;

	size = ALIGN_UP(size, align);

	if (DeviceLocal && !hsakmt_zfb_support)
		mem = hsakmt_fmm_allocate_device(gpu_id, NodeId, mem, size, 0, flags);
	else {
		/* VRAM under ZFB mode should be supported here without any
		 * additional code
		 */
		/* Get the closest cpu_id to GPU NodeId for system memory allocation
		 * nonPaged=0 system memory allocation uses GTT path
		 */
		if (!nonPaged) {
			cpu_id = hsakmt_get_direct_link_cpu(NodeId);
			if (cpu_id == INVALID_NODEID) {
				flags.ui32.NoNUMABind = 1;
				cpu_id = 0;
			}
		}
		mem = hsakmt_fmm_allocate_host(gpu_id, cpu_id, mem, size, 0, flags);
	}

	if (!mem) {
		/* size is unsigned: print it with %u so that sizes of 2 GiB
		 * and above are not reported as negative numbers.
		 */
		pr_err("Alloc %s memory failed size %u\n",
		       DeviceLocal ? "VRAM" : "GTT", size);
		return NULL;
	}

	if (NodeId != 0) {
		uint32_t nodes_array[1] = {NodeId};
		HsaMemMapFlags map_flags = {0};

		result = hsaKmtMapMemoryToGPUNodes(mem, size, &gpu_va, map_flags, 1, nodes_array);
	} else {
		result = hsaKmtMapMemoryToGPU(mem, size, &gpu_va);
	}

	/* Single cleanup path for both mapping variants. */
	if (result != HSAKMT_STATUS_SUCCESS) {
		hsaKmtFreeMemory(mem, size);
		return NULL;
	}

	return mem;
}
/*
 * Unmap addr from the GPU and free it.  size is rounded up to a multiple of
 * align before freeing.  When the unmap fails the memory is not freed.
 */
void hsakmt_free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align)
{
	const uint32_t aligned_size = ALIGN_UP(size, align);

	if (hsaKmtUnmapMemoryToGPU(addr) != HSAKMT_STATUS_SUCCESS)
		return;

	hsaKmtFreeMemory(addr, aligned_size);
}
/*
 * Allocates executable memory aligned to sysconf(_SC_PAGESIZE).
 *
 * With use_ats the memory is anonymous CPU memory and the remaining
 * arguments are ignored; otherwise it is allocated through
 * hsakmt_allocate_exec_aligned_memory_gpu() with PAGE_SIZE alignment.
 * Returns NULL on failure.
 */
static void *allocate_exec_aligned_memory(uint32_t size,
					  bool use_ats,
					  uint32_t gpu_id,
					  uint32_t NodeId,
					  bool nonPaged,
					  bool DeviceLocal,
					  bool Uncached)
{
	if (use_ats)
		return allocate_exec_aligned_memory_cpu(size);

	return hsakmt_allocate_exec_aligned_memory_gpu(size, PAGE_SIZE, gpu_id,
						       NodeId, nonPaged,
						       DeviceLocal, Uncached);
}
/*
 * Release memory obtained from allocate_exec_aligned_memory().  use_ats must
 * match the value used for the allocation: CPU memory is munmap()ed with the
 * given size (align is not used), GPU memory is unmapped and freed.
 */
static void free_exec_aligned_memory(void *addr, uint32_t size, uint32_t align,
				     bool use_ats)
{
	if (use_ats) {
		munmap(addr, size);
		return;
	}

	hsakmt_free_exec_aligned_memory_gpu(addr, size, align);
}
static HSAKMT_STATUS register_svm_range(void *mem, uint32_t size,
uint32_t gpuNode, uint32_t prefetchNode,
uint32_t preferredNode, bool alwaysMapped)
{
HSA_SVM_ATTRIBUTE *attrs;
HSAuint64 s_attr;
HSAuint32 nattr;
HSAuint32 flags;
flags = HSA_SVM_FLAG_HOST_ACCESS | HSA_SVM_FLAG_GPU_EXEC;
if (alwaysMapped) {
CHECK_KFD_MINOR_VERSION(11);
flags |= HSA_SVM_FLAG_GPU_ALWAYS_MAPPED;
}
nattr = 6;
s_attr = sizeof(*attrs) * nattr;
attrs = (HSA_SVM_ATTRIBUTE *)alloca(s_attr);
attrs[0].type = HSA_SVM_ATTR_PREFETCH_LOC;
attrs[0].value = prefetchNode;
attrs[1].type = HSA_SVM_ATTR_PREFERRED_LOC;
attrs[1].value = preferredNode;
attrs[2].type = HSA_SVM_ATTR_CLR_FLAGS;
attrs[2].value = ~flags;
attrs[3].type = HSA_SVM_ATTR_SET_FLAGS;
attrs[3].value = flags;
attrs[4].type = HSA_SVM_ATTR_ACCESS;
attrs[4].value = gpuNode;
attrs[5].type = HSA_SVM_ATTR_GRANULARITY;
attrs[5].value = 0xFF;
return hsaKmtSVMSetAttr(mem, size, nattr, attrs);
}
/* Releases everything owned by a queue object: the EOP buffer (if any), the
 * context save/restore area (munmap for the unified SVM-backed variant,
 * free_exec_aligned_memory() otherwise) and finally the queue structure
 * itself.
 */
static void free_queue(struct queue *q)
{
	const bool ats = q->use_ats;

	if (q->eop_buffer)
		free_exec_aligned_memory(q->eop_buffer, q->eop_buffer_size,
					 PAGE_SIZE, ats);

	if (q->unified_ctx_save_restore) {
		munmap(q->ctx_save_restore, q->total_mem_alloc_size);
	} else if (q->ctx_save_restore) {
		free_exec_aligned_memory(q->ctx_save_restore,
					 q->total_mem_alloc_size,
					 PAGE_SIZE, ats);
	}

	/* The queue structure goes last: the calls above still read it. */
	free_exec_aligned_memory((void *)q, sizeof(*q), PAGE_SIZE, ats);
}
/* Writes one HsaUserContextSaveAreaHeader per XCC into the save area that
 * starts at addr. Header i is placed at addr + i * q->ctx_save_restore_size.
 * Each header receives the error event id (0 when Event is NULL), the error
 * payload pointer, a DebugOffset of (NumXcc - i) * ctx_save_restore_size and
 * a DebugSize of debug_memory_size * NumXcc.
 */
static inline void fill_cwsr_header(struct queue *q, void *addr,
		HsaEvent *Event, volatile HSAint64 *ErrPayload, HSAuint32 NumXcc)
{
	uint32_t xcc;

	for (xcc = 0; xcc < NumXcc; xcc++) {
		HsaUserContextSaveAreaHeader *hdr =
			(HsaUserContextSaveAreaHeader *)
			((uintptr_t)addr + (xcc * q->ctx_save_restore_size));

		hdr->ErrorEventId = Event ? Event->EventId : 0;
		hdr->ErrorReason = ErrPayload;
		hdr->DebugOffset = (NumXcc - xcc) * q->ctx_save_restore_size;
		hdr->DebugSize = q->debug_memory_size * NumXcc;
	}
}
/* Allocates the per-queue buffers a new queue needs and records them in both
 * the queue object (q) and the create-queue ioctl arguments (args).
 *
 * - SDMA and SDMA_XGMI queues need none of this and return immediately.
 * - When q->eop_buffer_size is non-zero an EOP buffer is allocated
 *   (nonPaged and DeviceLocal both true) and published through args.
 * - When update_ctx_save_restore_size() reports true, a context save/restore
 *   (CWSR) area of (ctx_save_restore_size + debug_memory_size) * NumXcc bytes
 *   is set up: first as an anonymous mapping registered as an SVM range, and
 *   if that is not possible or fails, through allocate_exec_aligned_memory().
 *
 * Returns HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_NO_MEMORY when an allocation
 * fails, or HSAKMT_STATUS_ERROR when the node properties cannot be read.
 * Buffers already stored in q are not released here on failure; the visible
 * caller (hsaKmtCreateQueueExt) calls free_queue() in that case.
 */
static int handle_concrete_asic(struct queue *q,
struct kfd_ioctl_create_queue_args *args,
uint32_t gpu_id,
uint32_t NodeId,
HsaEvent *Event,
volatile HSAint64 *ErrPayload)
{
bool ret;
/* SDMA queues use neither an EOP buffer nor a CWSR area. */
if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA ||
args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
return HSAKMT_STATUS_SUCCESS;
if (q->eop_buffer_size > 0) {
pr_info("Allocating VRAM for EOP\n");
q->eop_buffer = allocate_exec_aligned_memory(q->eop_buffer_size,
q->use_ats, gpu_id,
NodeId, true, true, /* Unused for VRAM */false);
if (!q->eop_buffer)
return HSAKMT_STATUS_NO_MEMORY;
args->eop_buffer_address = (uintptr_t)q->eop_buffer;
args->eop_buffer_size = q->eop_buffer_size;
}
/* A true result means the CWSR sizes in q were filled in and a save area
 * has to be provided.
 */
ret = update_ctx_save_restore_size(NodeId, q);
if (ret) {
HsaNodeProperties node;
if (hsaKmtGetNodeProperties(NodeId, &node))
return HSAKMT_STATUS_ERROR;
args->ctx_save_restore_size = q->ctx_save_restore_size;
args->ctl_stack_size = q->ctl_stack_size;
/* Total memory to be allocated is =
* (Control Stack size + WG size +
* Debug memory area size) * num_xcc
*/
q->total_mem_alloc_size = (q->ctx_save_restore_size +
q->debug_memory_size) * node.NumXcc;
/* Allocate unified memory for context save restore
* area on dGPU.
*/
if (!q->use_ats && hsakmt_is_svm_api_supported) {
uint32_t size = PAGE_ALIGN_UP(q->total_mem_alloc_size);
pr_info("Allocating GTT for CWSR\n");
void *addr = hsakmt_mmap_allocate_aligned(PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE,
size, GPU_HUGE_PAGE_SIZE, 0,
0, (void *)LONG_MAX, -1);
if (!addr) {
/* Not fatal: the non-unified allocation below is used instead. */
pr_err("mmap failed to alloc ctx area size 0x%x: %s\n",
size, strerror(errno));
} else {
/*
* To avoid fork child process COW MMU notifier
* callback evict parent process queues.
*/
if (madvise(addr, size, MADV_DONTFORK))
pr_err("madvise failed -%d\n", errno);
/* Headers are written before the range is registered with the driver. */
fill_cwsr_header(q, addr, Event, ErrPayload, node.NumXcc);
HSAKMT_STATUS r = register_svm_range(addr, size,
NodeId, NodeId, 0, true);
if (r == HSAKMT_STATUS_SUCCESS) {
q->ctx_save_restore = addr;
q->unified_ctx_save_restore = true;
} else {
/* Registration failed: drop the mapping and fall back below. */
munmap(addr, size);
}
}
}
/* Fallback (or ATS / no-SVM) path: regular executable allocation. */
if (!q->unified_ctx_save_restore) {
q->ctx_save_restore = allocate_exec_aligned_memory(
q->total_mem_alloc_size,
q->use_ats, gpu_id, NodeId,
false, false, false);
if (!q->ctx_save_restore)
return HSAKMT_STATUS_NO_MEMORY;
fill_cwsr_header(q, q->ctx_save_restore, Event, ErrPayload, node.NumXcc);
}
args->ctx_save_restore_address = (uintptr_t)q->ctx_save_restore;
}
return HSAKMT_STATUS_SUCCESS;
}
/* A map to translate thunk queue priority (-3 to +3)
* to KFD queue priority (0 to 15).
* Indexed by thunk_queue_priority+3, so the table has seven entries;
* the users in this file range-check Priority against
* HSA_QUEUE_PRIORITY_MINIMUM/MAXIMUM before indexing.
*/
static uint32_t priority_map[] = {0, 3, 5, 7, 9, 11, 15};
/* Creates a queue without selecting an SDMA engine. This is a thin wrapper
 * around hsaKmtCreateQueueExt() that passes an SDMA engine id of 0.
 * HSA_QUEUE_SDMA_BY_ENG_ID is refused with HSAKMT_STATUS_ERROR because this
 * entry point has no engine-id parameter.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId,
					  HSA_QUEUE_TYPE Type,
					  HSAuint32 QueuePercentage,
					  HSA_QUEUE_PRIORITY Priority,
					  void *QueueAddress,
					  HSAuint64 QueueSizeInBytes,
					  HsaEvent *Event,
					  HsaQueueResource *QueueResource)
{
	const HSAuint32 no_engine_id = 0;

	if (Type != HSA_QUEUE_SDMA_BY_ENG_ID)
		return hsaKmtCreateQueueExt(NodeId, Type, QueuePercentage,
					    Priority, no_engine_id,
					    QueueAddress, QueueSizeInBytes,
					    Event, QueueResource);

	return HSAKMT_STATUS_ERROR;
}
/* Creates a KFD queue of the requested type on node NodeId.
 *
 * Steps:
 *  1. Validate Priority, QueueResource and NodeId.
 *  2. Allocate the queue bookkeeping structure and choose the EOP buffer
 *     size from the GFX version.
 *  3. Initialise the CU mask so that every CU of the node is enabled.
 *  4. Translate the queue type, allocate EOP/CWSR buffers through
 *     handle_concrete_asic() and issue AMDKFD_IOC_CREATE_QUEUE.
 *  5. Map the doorbell page and fill in QueueResource (queue handle and
 *     doorbell address; read/write pointer addresses for non-AQL queues).
 *
 * SdmaEngineId is forwarded to the driver in args.sdma_engine_id.
 *
 * Returns HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER for a bad
 * priority, queue type or NULL QueueResource, HSAKMT_STATUS_NO_MEMORY when
 * an allocation fails, the hsakmt_validate_nodeid() status for a bad node,
 * or HSAKMT_STATUS_ERROR when the ioctl or the doorbell mapping fails.
 * Every failure path after the queue structure has been allocated releases
 * it again.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueueExt(HSAuint32 NodeId,
					     HSA_QUEUE_TYPE Type,
					     HSAuint32 QueuePercentage,
					     HSA_QUEUE_PRIORITY Priority,
					     HSAuint32 SdmaEngineId,
					     void *QueueAddress,
					     HSAuint64 QueueSizeInBytes,
					     HsaEvent *Event,
					     HsaQueueResource *QueueResource)
{
	HSAKMT_STATUS result;
	uint32_t gpu_id;
	uint64_t doorbell_mmap_offset;
	unsigned int doorbell_offset;
	int err;
	HsaNodeProperties props;
	uint32_t cu_num, i;

	CHECK_KFD_OPEN();

	/* QueueResource is both read and written below. */
	if (!QueueResource)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (Priority < HSA_QUEUE_PRIORITY_MINIMUM ||
	    Priority > HSA_QUEUE_PRIORITY_MAXIMUM)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	result = hsakmt_validate_nodeid(NodeId, &gpu_id);
	if (result != HSAKMT_STATUS_SUCCESS)
		return result;

	struct queue *q = allocate_exec_aligned_memory(sizeof(*q),
			false, gpu_id, NodeId, true, false, true);
	if (!q)
		return HSAKMT_STATUS_NO_MEMORY;

	memset(q, 0, sizeof(*q));

	q->gfxv = hsakmt_get_gfxv_by_node_id(NodeId);
	q->use_ats = false;

	if (q->gfxv == GFX_VERSION_TONGA)
		q->eop_buffer_size = TONGA_PAGE_SIZE;
	else if ((q->gfxv & ~(0xff)) == GFX_VERSION_AQUA_VANJARAM)
		q->eop_buffer_size = ((Type == HSA_QUEUE_COMPUTE) ? 4096 : 0);
	else if (q->gfxv >= 0x80000)
		q->eop_buffer_size = 4096;

	/* By default, CUs are all turned on. Initialize cu_mask to '1
	 * for all CU bits. A node reporting zero SIMDs per CU is treated
	 * like a failed property query to avoid dividing by zero.
	 */
	if (hsaKmtGetNodeProperties(NodeId, &props) || !props.NumSIMDPerCU)
		q->cu_mask_count = 0;
	else {
		cu_num = props.NumFComputeCores / props.NumSIMDPerCU;
		/* cu_mask_count counts bits. It must be multiple of 32 */
		q->cu_mask_count = ALIGN_UP_32(cu_num, 32);
		for (i = 0; i < cu_num; i++)
			q->cu_mask[i/32] |= (1U << (i % 32));
	}

	struct kfd_ioctl_create_queue_args args = {0};

	args.gpu_id = gpu_id;

	switch (Type) {
	case HSA_QUEUE_COMPUTE:
		args.queue_type = KFD_IOC_QUEUE_TYPE_COMPUTE;
		break;
	case HSA_QUEUE_SDMA:
		args.queue_type = KFD_IOC_QUEUE_TYPE_SDMA;
		break;
	case HSA_QUEUE_SDMA_XGMI:
		args.queue_type = KFD_IOC_QUEUE_TYPE_SDMA_XGMI;
		break;
	case HSA_QUEUE_SDMA_BY_ENG_ID:
		args.queue_type = KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID;
		break;
	case HSA_QUEUE_COMPUTE_AQL:
		args.queue_type = KFD_IOC_QUEUE_TYPE_COMPUTE_AQL;
		break;
	default:
		/* The queue structure is already allocated: do not leak it. */
		free_queue(q);
		return HSAKMT_STATUS_INVALID_PARAMETER;
	}

	/* Non-AQL queues keep their read/write pointers inside the queue
	 * structure; AQL queues use the values supplied by the caller.
	 */
	if (Type != HSA_QUEUE_COMPUTE_AQL) {
		QueueResource->QueueRptrValue = (uintptr_t)&q->rptr;
		QueueResource->QueueWptrValue = (uintptr_t)&q->wptr;
	}

	err = handle_concrete_asic(q, &args, gpu_id, NodeId, Event, QueueResource->ErrorReason);
	if (err != HSAKMT_STATUS_SUCCESS) {
		free_queue(q);
		return err;
	}

	args.read_pointer_address = QueueResource->QueueRptrValue;
	args.write_pointer_address = QueueResource->QueueWptrValue;
	args.ring_base_address = (uintptr_t)QueueAddress;
	args.ring_size = QueueSizeInBytes;
	args.queue_percentage = QueuePercentage;
	args.queue_priority = priority_map[Priority+3];
	args.sdma_engine_id = SdmaEngineId;

	err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_CREATE_QUEUE, &args);
	if (err == -1) {
		free_queue(q);
		return HSAKMT_STATUS_ERROR;
	}

	q->queue_id = args.queue_id;

	if (IS_SOC15(q->gfxv)) {
		HSAuint64 mask = DOORBELLS_PAGE_SIZE(DOORBELL_SIZE(q->gfxv)) - 1;

		/* On SOC15 chips, the doorbell offset within the
		 * doorbell page is included in the doorbell offset
		 * returned by KFD. This allows CP queue doorbells to be
		 * allocated dynamically (while SDMA queue doorbells fixed)
		 * rather than based on the its process queue ID.
		 */
		doorbell_mmap_offset = args.doorbell_offset & ~mask;
		doorbell_offset = args.doorbell_offset & mask;
	} else {
		/* On older chips, the doorbell offset within the
		 * doorbell page is based on the queue ID.
		 */
		doorbell_mmap_offset = args.doorbell_offset;
		doorbell_offset = q->queue_id * DOORBELL_SIZE(q->gfxv);
	}

	err = map_doorbell(NodeId, gpu_id, doorbell_mmap_offset);
	if (err != HSAKMT_STATUS_SUCCESS) {
		/* hsaKmtDestroyQueue() takes the queue handle (the pointer to
		 * the queue structure), not the KFD queue id.
		 */
		hsaKmtDestroyQueue(PORT_VPTR_TO_UINT64(q));
		return HSAKMT_STATUS_ERROR;
	}

	QueueResource->QueueId = PORT_VPTR_TO_UINT64(q);
	QueueResource->Queue_DoorBell = VOID_PTR_ADD(doorbells[NodeId].mapping,
						     doorbell_offset);

	return HSAKMT_STATUS_SUCCESS;
}
/* Updates ring base, ring size, percentage and priority of an existing queue
 * through AMDKFD_IOC_UPDATE_QUEUE. Event is accepted but not used here.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for an out-of-range priority or a
 * NULL queue handle, HSAKMT_STATUS_ERROR when the ioctl fails, and
 * HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtUpdateQueue(HSA_QUEUEID QueueId,
					  HSAuint32 QueuePercentage,
					  HSA_QUEUE_PRIORITY Priority,
					  void *QueueAddress,
					  HSAuint64 QueueSize,
					  HsaEvent *Event)
{
	struct kfd_ioctl_update_queue_args update = {0};
	struct queue *queue = PORT_UINT64_TO_VPTR(QueueId);

	CHECK_KFD_OPEN();

	if (Priority < HSA_QUEUE_PRIORITY_MINIMUM ||
	    Priority > HSA_QUEUE_PRIORITY_MAXIMUM ||
	    queue == NULL)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	update.queue_id = (HSAuint32)queue->queue_id;
	update.ring_base_address = (uintptr_t)QueueAddress;
	update.ring_size = QueueSize;
	update.queue_percentage = QueuePercentage;
	update.queue_priority = priority_map[Priority+3];

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_UPDATE_QUEUE, &update) == -1)
		return HSAKMT_STATUS_ERROR;

	return HSAKMT_STATUS_SUCCESS;
}
/* Destroys the queue identified by QueueId (the handle is the address of the
 * queue structure). The driver-side queue is destroyed first; the queue's
 * buffers are released only when that ioctl succeeded.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL handle,
 * HSAKMT_STATUS_ERROR when the ioctl fails, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyQueue(HSA_QUEUEID QueueId)
{
	CHECK_KFD_OPEN();

	struct kfd_ioctl_destroy_queue_args destroy = {0};
	struct queue *queue = PORT_UINT64_TO_VPTR(QueueId);

	if (queue == NULL)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	destroy.queue_id = queue->queue_id;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_DESTROY_QUEUE, &destroy) == -1) {
		pr_err("Failed to destroy queue: %s\n", strerror(errno));
		return HSAKMT_STATUS_ERROR;
	}

	free_queue(queue);
	return HSAKMT_STATUS_SUCCESS;
}
/* Applies a compute-unit mask to a queue through AMDKFD_IOC_SET_CU_MASK and,
 * on success, mirrors the mask and its bit count in the queue structure.
 *
 * QueueId      queue handle (address of the queue structure)
 * CUMaskCount  number of mask bits; must be non-zero and a multiple of 32
 * QueueCUMask  array holding CUMaskCount bits
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for a NULL queue handle, a NULL
 * mask or an invalid bit count, HSAKMT_STATUS_ERROR when the ioctl fails,
 * HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetQueueCUMask(HSA_QUEUEID QueueId,
					     HSAuint32 CUMaskCount,
					     HSAuint32 *QueueCUMask)
{
	struct queue *q = PORT_UINT64_TO_VPTR(QueueId);
	struct kfd_ioctl_set_cu_mask_args args = {0};

	CHECK_KFD_OPEN();

	/* q is dereferenced below; reject a NULL handle like the other
	 * queue entry points in this file do.
	 */
	if (!q)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (CUMaskCount == 0 || !QueueCUMask || ((CUMaskCount % 32) != 0))
		return HSAKMT_STATUS_INVALID_PARAMETER;

	args.queue_id = q->queue_id;
	args.num_cu_mask = CUMaskCount;
	args.cu_mask_ptr = (uintptr_t)QueueCUMask;

	int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_CU_MASK, &args);

	if (err == -1)
		return HSAKMT_STATUS_ERROR;

	/* CUMaskCount is in bits, memcpy wants bytes. */
	memcpy(q->cu_mask, QueueCUMask, CUMaskCount / 8);
	q->cu_mask_count = CUMaskCount;

	return HSAKMT_STATUS_SUCCESS;
}
/* Queries the wave state of a queue (AMDKFD_IOC_GET_QUEUE_WAVE_STATE) and
 * reports it in QueueInfo together with the cached CU mask.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER when QueueInfo or the queue handle
 * is NULL, HSAKMT_STATUS_ERROR when the queue has no context save area or
 * the ioctl fails, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS
HSAKMTAPI
hsaKmtGetQueueInfo(
	HSA_QUEUEID QueueId,
	HsaQueueInfo *QueueInfo
)
{
	struct kfd_ioctl_get_queue_wave_state_args wave = {0};
	struct queue *queue = PORT_UINT64_TO_VPTR(QueueId);

	CHECK_KFD_OPEN();

	if (!QueueInfo || !queue)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (!queue->ctx_save_restore)
		return HSAKMT_STATUS_ERROR;

	wave.queue_id = queue->queue_id;
	wave.ctl_stack_address = (uintptr_t)queue->ctx_save_restore;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_QUEUE_WAVE_STATE, &wave) < 0)
		return HSAKMT_STATUS_ERROR;

	/* The user save area starts where the control stack region ends; the
	 * control stack top lies ctl_stack_used_size bytes below that point.
	 */
	QueueInfo->UserContextSaveArea = (void *)
			(wave.ctl_stack_address + queue->ctl_stack_size);
	QueueInfo->ControlStackTop = (void *)(wave.ctl_stack_address +
			queue->ctl_stack_size - wave.ctl_stack_used_size);
	QueueInfo->ControlStackUsedInBytes = wave.ctl_stack_used_size;
	QueueInfo->SaveAreaSizeInBytes = wave.save_area_used_size;
	QueueInfo->SaveAreaHeader = queue->ctx_save_restore;

	QueueInfo->CUMaskInfo = queue->cu_mask;
	QueueInfo->NumCUAssigned = queue->cu_mask_count;

	QueueInfo->QueueDetailError = 0;
	QueueInfo->QueueTypeExtended = 0;

	return HSAKMT_STATUS_SUCCESS;
}
/* Installs the trap handler (TBA) and trap buffer (TMA) addresses for a node
 * through AMDKFD_IOC_SET_TRAP_HANDLER. The two size parameters are accepted
 * but not forwarded to the driver.
 *
 * Returns the hsakmt_validate_nodeid() status for an invalid node,
 * HSAKMT_STATUS_ERROR when the ioctl fails, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetTrapHandler(HSAuint32 Node,
					     void *TrapHandlerBaseAddress,
					     HSAuint64 TrapHandlerSizeInBytes,
					     void *TrapBufferBaseAddress,
					     HSAuint64 TrapBufferSizeInBytes)
{
	struct kfd_ioctl_set_trap_handler_args trap = {0};
	uint32_t gpu_id;
	HSAKMT_STATUS status;

	CHECK_KFD_OPEN();

	status = hsakmt_validate_nodeid(Node, &gpu_id);
	if (status != HSAKMT_STATUS_SUCCESS)
		return status;

	trap.gpu_id = gpu_id;
	trap.tba_addr = (uintptr_t)TrapHandlerBaseAddress;
	trap.tma_addr = (uintptr_t)TrapBufferBaseAddress;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_TRAP_HANDLER, &trap) == -1)
		return HSAKMT_STATUS_ERROR;

	return HSAKMT_STATUS_SUCCESS;
}
/* Translates an array of queue handles into a newly malloc'ed array of KFD
 * queue ids. Returns NULL when the input is empty or NULL, when the
 * allocation fails, or when any handle is NULL. The caller owns (and must
 * free) a non-NULL result.
 */
uint32_t *hsakmt_convert_queue_ids(HSAuint32 NumQueues, HSA_QUEUEID *Queues)
{
	uint32_t *ids;
	unsigned int idx;

	if (!NumQueues || !Queues)
		return NULL;

	ids = malloc(NumQueues * sizeof(uint32_t));
	if (ids == NULL)
		return NULL;

	for (idx = 0; idx < NumQueues; idx++) {
		struct queue *entry = PORT_UINT64_TO_VPTR(Queues[idx]);

		if (!entry) {
			/* A single bad handle invalidates the whole request. */
			free(ids);
			return NULL;
		}
		ids[idx] = entry->queue_id;
	}

	return ids;
}
/* Allocates nGWS global wave sync entries for a queue through
 * AMDKFD_IOC_ALLOC_QUEUE_GWS. On success the first allocated entry is stored
 * in *firstGWS when firstGWS is not NULL.
 *
 * Returns HSAKMT_STATUS_SUCCESS on success, HSAKMT_STATUS_INVALID_PARAMETER
 * for a NULL queue handle or errno EINVAL, HSAKMT_STATUS_OUT_OF_RESOURCES for
 * EBUSY, HSAKMT_STATUS_NOT_SUPPORTED for ENODEV and HSAKMT_STATUS_ERROR for
 * any other failure.
 */
HSAKMT_STATUS
HSAKMTAPI
hsaKmtAllocQueueGWS(
	HSA_QUEUEID QueueId,
	HSAuint32 nGWS,
	HSAuint32 *firstGWS)
{
	struct kfd_ioctl_alloc_queue_gws_args args = {0};
	struct queue *q = PORT_UINT64_TO_VPTR(QueueId);

	CHECK_KFD_OPEN();

	/* q is dereferenced below; a NULL handle must not crash the caller. */
	if (!q)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	args.queue_id = (HSAuint32)q->queue_id;
	args.num_gws = nGWS;

	int err = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_ALLOC_QUEUE_GWS, &args);

	if (!err && firstGWS)
		*firstGWS = args.first_gws;

	if (!err)
		return HSAKMT_STATUS_SUCCESS;
	else if (errno == EINVAL)
		return HSAKMT_STATUS_INVALID_PARAMETER;
	else if (errno == EBUSY)
		return HSAKMT_STATUS_OUT_OF_RESOURCES;
	else if (errno == ENODEV)
		return HSAKMT_STATUS_NOT_SUPPORTED;
	else
		return HSAKMT_STATUS_ERROR;
}
================================================
FILE: libhsakmt/src/rbtree.c
================================================
/*
* Copyright (C) 2002-2018 Igor Sysoev
* Copyright (C) 2011-2018 Nginx, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "rbtree.h"
static inline void rbtree_left_rotate(rbtree_node_t **root,
rbtree_node_t *sentinel, rbtree_node_t *node);
static inline void rbtree_right_rotate(rbtree_node_t **root,
rbtree_node_t *sentinel, rbtree_node_t *node);
/* Plain binary-search-tree insertion: descends from temp, going left when
 * node's key compares lower (LKP_ALL comparison) and right otherwise, until
 * an empty (sentinel) child slot is reached. The node is linked there with
 * sentinel children and coloured red; rebalancing is left to the caller.
 */
static void
hsakmt_rbtree_insert_value(rbtree_node_t *temp, rbtree_node_t *node,
    rbtree_node_t *sentinel)
{
    rbtree_node_t *parent = temp;
    rbtree_node_t **slot;

    while (1) {
        if (rbtree_key_compare(LKP_ALL, &node->key, &parent->key) < 0)
            slot = &parent->left;
        else
            slot = &parent->right;

        if (*slot == sentinel)
            break;

        parent = *slot;
    }

    *slot = node;
    node->parent = parent;
    node->left = sentinel;
    node->right = sentinel;
    rbt_red(node);
}
/*
 * Inserts node into tree and restores the red-black colouring.
 *
 * An empty tree (root == sentinel) simply gets node as its black root.
 * Otherwise the node is linked as a red leaf by
 * hsakmt_rbtree_insert_value() and the loop below repairs any red node with
 * a red parent by recolouring and rotating. The root is forced black at the
 * end.
 */
void
hsakmt_rbtree_insert(rbtree_t *tree, rbtree_node_t *node)
{
rbtree_node_t **root, *temp, *sentinel;
/* a binary tree insert */
root = &tree->root;
sentinel = &tree->sentinel;
if (*root == sentinel) {
node->parent = NULL;
node->left = sentinel;
node->right = sentinel;
rbt_black(node);
*root = node;
return;
}
hsakmt_rbtree_insert_value(*root, node, sentinel);
/* re-balance tree */
while (node != *root && rbt_is_red(node->parent)) {
/* temp is the "uncle": the sibling of node's parent */
if (node->parent == node->parent->parent->left) {
temp = node->parent->parent->right;
if (rbt_is_red(temp)) {
/* red uncle: recolour and continue from the grandparent */
rbt_black(node->parent);
rbt_black(temp);
rbt_red(node->parent->parent);
node = node->parent->parent;
} else {
/* black uncle: rotate node to the outside if needed, then
 * rotate the grandparent
 */
if (node == node->parent->right) {
node = node->parent;
rbtree_left_rotate(root, sentinel, node);
}
rbt_black(node->parent);
rbt_red(node->parent->parent);
rbtree_right_rotate(root, sentinel, node->parent->parent);
}
} else {
/* mirror image of the branch above */
temp = node->parent->parent->left;
if (rbt_is_red(temp)) {
rbt_black(node->parent);
rbt_black(temp);
rbt_red(node->parent->parent);
node = node->parent->parent;
} else {
if (node == node->parent->left) {
node = node->parent;
rbtree_right_rotate(root, sentinel, node);
}
rbt_black(node->parent);
rbt_red(node->parent->parent);
rbtree_left_rotate(root, sentinel, node->parent->parent);
}
}
}
rbt_black(*root);
}
/*
 * Unlinks node from tree and restores the red-black colouring.
 *
 * subst is the node that is physically removed from its position: node
 * itself when it has at most one real child, otherwise the minimum of
 * node's right subtree, which then takes over node's place and colour.
 * temp is the child that moves into subst's old position (it may be the
 * sentinel, whose parent field is written here as well). When the removed
 * position was black, the fixup loop rebalances starting from temp.
 *
 * The memory of node is not released here.
 */
void
hsakmt_rbtree_delete(rbtree_t *tree, rbtree_node_t *node)
{
unsigned int red;
rbtree_node_t **root, *sentinel, *subst, *temp, *w;
/* a binary tree delete */
root = &tree->root;
sentinel = &tree->sentinel;
if (node->left == sentinel) {
temp = node->right;
subst = node;
} else if (node->right == sentinel) {
temp = node->left;
subst = node;
} else {
subst = rbtree_min(node->right, sentinel);
if (subst->left != sentinel) {
temp = subst->left;
} else {
temp = subst->right;
}
}
/* removing the root itself: its child (or the sentinel) becomes the root */
if (subst == *root) {
*root = temp;
rbt_black(temp);
return;
}
/* remember the colour of the position that disappears */
red = rbt_is_red(subst);
if (subst == subst->parent->left) {
subst->parent->left = temp;
} else {
subst->parent->right = temp;
}
if (subst == node) {
temp->parent = subst->parent;
} else {
/* subst replaces node: fix temp's parent, then copy node's links
 * and colour into subst
 */
if (subst->parent == node) {
temp->parent = subst;
} else {
temp->parent = subst->parent;
}
subst->left = node->left;
subst->right = node->right;
subst->parent = node->parent;
rbt_copy_color(subst, node);
if (node == *root) {
*root = subst;
} else {
if (node == node->parent->left) {
node->parent->left = subst;
} else {
node->parent->right = subst;
}
}
if (subst->left != sentinel) {
subst->left->parent = subst;
}
if (subst->right != sentinel) {
subst->right->parent = subst;
}
}
/* a red position can vanish without changing any black height */
if (red) {
return;
}
/* a delete fixup */
while (temp != *root && rbt_is_black(temp)) {
/* w is temp's sibling */
if (temp == temp->parent->left) {
w = temp->parent->right;
if (rbt_is_red(w)) {
rbt_black(w);
rbt_red(temp->parent);
rbtree_left_rotate(root, sentinel, temp->parent);
w = temp->parent->right;
}
if (rbt_is_black(w->left) && rbt_is_black(w->right)) {
/* both nephews black: recolour and move the problem up */
rbt_red(w);
temp = temp->parent;
} else {
if (rbt_is_black(w->right)) {
rbt_black(w->left);
rbt_red(w);
rbtree_right_rotate(root, sentinel, w);
w = temp->parent->right;
}
rbt_copy_color(w, temp->parent);
rbt_black(temp->parent);
rbt_black(w->right);
rbtree_left_rotate(root, sentinel, temp->parent);
/* done: setting temp to the root terminates the loop */
temp = *root;
}
} else {
/* mirror image of the branch above */
w = temp->parent->left;
if (rbt_is_red(w)) {
rbt_black(w);
rbt_red(temp->parent);
rbtree_right_rotate(root, sentinel, temp->parent);
w = temp->parent->left;
}
if (rbt_is_black(w->left) && rbt_is_black(w->right)) {
rbt_red(w);
temp = temp->parent;
} else {
if (rbt_is_black(w->left)) {
rbt_black(w->right);
rbt_red(w);
rbtree_left_rotate(root, sentinel, w);
w = temp->parent->left;
}
rbt_copy_color(w, temp->parent);
rbt_black(temp->parent);
rbt_black(w->left);
rbtree_right_rotate(root, sentinel, temp->parent);
temp = *root;
}
}
}
rbt_black(temp);
}
/* Left rotation around node: node's right child (the pivot) takes node's
 * place, node becomes the pivot's left child and the pivot's former left
 * subtree becomes node's right subtree. *root is updated when node was the
 * root.
 */
static inline void
rbtree_left_rotate(rbtree_node_t **root, rbtree_node_t *sentinel,
    rbtree_node_t *node)
{
    rbtree_node_t *pivot = node->right;

    /* hand the pivot's left subtree over to node */
    node->right = pivot->left;
    if (pivot->left != sentinel)
        pivot->left->parent = node;

    /* hook the pivot into node's former position */
    pivot->parent = node->parent;
    if (node == *root)
        *root = pivot;
    else if (node == node->parent->left)
        node->parent->left = pivot;
    else
        node->parent->right = pivot;

    /* node drops below the pivot */
    pivot->left = node;
    node->parent = pivot;
}
/* Right rotation around node: node's left child (the pivot) takes node's
 * place, node becomes the pivot's right child and the pivot's former right
 * subtree becomes node's left subtree. *root is updated when node was the
 * root.
 */
static inline void
rbtree_right_rotate(rbtree_node_t **root, rbtree_node_t *sentinel,
    rbtree_node_t *node)
{
    rbtree_node_t *pivot = node->left;

    /* hand the pivot's right subtree over to node */
    node->left = pivot->right;
    if (pivot->right != sentinel)
        pivot->right->parent = node;

    /* hook the pivot into node's former position */
    pivot->parent = node->parent;
    if (node == *root)
        *root = pivot;
    else if (node == node->parent->right)
        node->parent->right = pivot;
    else
        node->parent->left = pivot;

    /* node drops below the pivot */
    pivot->right = node;
    node->parent = pivot;
}
/* Returns the in-order successor of node, or NULL when node is the last
 * element: the minimum of the right subtree if there is one, otherwise the
 * first ancestor reached from a left child.
 */
rbtree_node_t *
hsakmt_rbtree_next(rbtree_t *tree, rbtree_node_t *node)
{
    rbtree_node_t *nil = &tree->sentinel;
    rbtree_node_t *top;
    rbtree_node_t *cur = node;

    if (cur->right != nil)
        return rbtree_min(cur->right, nil);

    top = tree->root;

    while (cur != top) {
        rbtree_node_t *up = cur->parent;

        if (up->left == cur)
            return up;

        cur = up;
    }

    return NULL;
}
/* Returns the in-order predecessor of node, or NULL when node is the first
 * element: the maximum of the left subtree if there is one, otherwise the
 * first ancestor reached from a right child.
 */
rbtree_node_t *
hsakmt_rbtree_prev(rbtree_t *tree, rbtree_node_t *node)
{
    rbtree_node_t *nil = &tree->sentinel;
    rbtree_node_t *top;
    rbtree_node_t *cur = node;

    if (cur->left != nil)
        return rbtree_max(cur->left, nil);

    top = tree->root;

    while (cur != top) {
        rbtree_node_t *up = cur->parent;

        if (up->right == cur)
            return up;

        cur = up;
    }

    return NULL;
}
================================================
FILE: libhsakmt/src/rbtree.h
================================================
/*
* Copyright (C) 2002-2018 Igor Sysoev
* Copyright (C) 2011-2018 Nginx, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _RBTREE_H_
#define _RBTREE_H_
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "rbtree_amd.h"
/* One node of the red-black tree. */
typedef struct rbtree_node_s rbtree_node_t;
struct rbtree_node_s {
/* ordering key; compared with rbtree_key_compare() */
rbtree_key_t key;
/* child links; the tree's sentinel node marks "no child" */
rbtree_node_t *left;
rbtree_node_t *right;
/* parent link; set to NULL for a root inserted into an empty tree */
rbtree_node_t *parent;
/* 1 = red, 0 = black (see rbt_red()/rbt_black()) */
unsigned char color;
/* NOTE(review): not referenced by the tree code shown here — purpose to be confirmed */
unsigned char data;
};
/* A red-black tree with an embedded sentinel node. */
typedef struct rbtree_s rbtree_t;
struct rbtree_s {
/* root node; points at the sentinel while the tree is empty */
rbtree_node_t *root;
/* black marker node used in place of NULL for missing children */
rbtree_node_t sentinel;
};
#define rbtree_init(tree) \
rbtree_sentinel_init(&(tree)->sentinel); \
(tree)->root = &(tree)->sentinel;
void hsakmt_rbtree_insert(rbtree_t *tree, rbtree_node_t *node);
void hsakmt_rbtree_delete(rbtree_t *tree, rbtree_node_t *node);
rbtree_node_t *hsakmt_rbtree_prev(rbtree_t *tree,
rbtree_node_t *node);
rbtree_node_t *hsakmt_rbtree_next(rbtree_t *tree,
rbtree_node_t *node);
#define rbt_red(node) ((node)->color = 1)
#define rbt_black(node) ((node)->color = 0)
#define rbt_is_red(node) ((node)->color)
#define rbt_is_black(node) (!rbt_is_red(node))
#define rbt_copy_color(n1, n2) (n1->color = n2->color)
/* a sentinel must be black */
#define rbtree_sentinel_init(node) rbt_black(node)
/* Returns the leftmost node of the subtree rooted at node, i.e. the one
 * reached by following left links until the sentinel is hit.
 */
static inline rbtree_node_t *
rbtree_min(rbtree_node_t *node, rbtree_node_t *sentinel)
{
    rbtree_node_t *cur;

    for (cur = node; cur->left != sentinel; cur = cur->left)
        ;

    return cur;
}
#include "rbtree_amd.h"
#endif
================================================
FILE: libhsakmt/src/rbtree_amd.h
================================================
/*
* Copyright © 2018 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef _RBTREE_AMD_H_
#define _RBTREE_AMD_H_
typedef struct rbtree_key_s rbtree_key_t;

/* Bit positions selecting which key fields a comparison looks at. */
#define ADDR_BIT 0
#define SIZE_BIT 1

/* Tree key: an address plus a size. */
struct rbtree_key_s {
	unsigned long addr;
	unsigned long size;
};

#define BIT(x) (1<<(x))
/* Lookup/compare modes; LKP_ALL and LKP_ADDR_SIZE select both fields. */
#define LKP_ALL (BIT(ADDR_BIT) | BIT(SIZE_BIT))
#define LKP_ADDR (BIT(ADDR_BIT))
#define LKP_ADDR_SIZE (BIT(ADDR_BIT) | BIT(SIZE_BIT))

/* Builds a key from an address and a size. */
static inline rbtree_key_t
rbtree_key(unsigned long addr, unsigned long size)
{
	rbtree_key_t key;

	key.addr = addr;
	key.size = size;
	return key;
}

/*
 * Compares two keys field by field: addr first, then size. A field only
 * takes part when its bit is set in type. Returns -1, 0 or 1 when key1 is
 * lower than, equal to or greater than key2 on the selected fields.
 */
static inline int
rbtree_key_compare(unsigned int type, rbtree_key_t *key1, rbtree_key_t *key2)
{
	if (type & (1 << ADDR_BIT)) {
		if (key1->addr > key2->addr)
			return 1;
		if (key1->addr < key2->addr)
			return -1;
	}

	if (type & (1 << SIZE_BIT)) {
		if (key1->size > key2->size)
			return 1;
		if (key1->size < key2->size)
			return -1;
	}

	return 0;
}
#endif /*_RBTREE_AMD_H_*/
/* include this file again with RBTREE_HELPER defined */
#ifndef RBTREE_HELPER
#define RBTREE_HELPER
#else
#ifndef _RBTREE_AMD_H_HELPER_
#define _RBTREE_AMD_H_HELPER_
/* Returns the rightmost node of the subtree rooted at node, i.e. the one
 * reached by following right links until the sentinel is hit.
 */
static inline rbtree_node_t *
rbtree_max(rbtree_node_t *node, rbtree_node_t *sentinel)
{
	rbtree_node_t *cur;

	for (cur = node; cur->right != sentinel; cur = cur->right)
		;

	return cur;
}
/* Selectors used by the helpers below: LEFT picks the minimum / the nearest
 * lower neighbour, RIGHT the maximum / the nearest higher neighbour, and MID
 * (only meaningful for rbtree_node_any()) the root node.
 */
#define LEFT 0
#define RIGHT 1
#define MID 2
/* Returns the smallest (lr == LEFT) or largest (lr == RIGHT) node of the
 * tree, the root for any other lr value, or NULL when the tree is empty.
 */
static inline rbtree_node_t *
rbtree_min_max(rbtree_t *tree, int lr)
{
	rbtree_node_t *nil = &tree->sentinel;
	rbtree_node_t *top = tree->root;

	if (top == nil)
		return NULL;

	switch (lr) {
	case LEFT:
		return rbtree_min(top, nil);
	case RIGHT:
		return rbtree_max(top, nil);
	default:
		return top;
	}
}
/* Returns some node of the tree, or NULL when it is empty: the root for
 * lmr == MID, otherwise whatever rbtree_min_max(tree, lmr) selects.
 */
static inline rbtree_node_t *
rbtree_node_any(rbtree_t *tree, int lmr)
{
	if (tree->root == &tree->sentinel)
		return NULL;

	return (lmr == MID) ? tree->root : rbtree_min_max(tree, lmr);
}
/* Searches the tree for key, comparing only the fields selected by type.
 * An exact match is returned directly. Without one, the result depends on
 * lr: LEFT yields the last node passed on the way down whose key is lower
 * than key, RIGHT the last one whose key is higher, and any other value
 * yields NULL.
 */
static inline rbtree_node_t *
rbtree_lookup_nearest(rbtree_t *rbtree, rbtree_key_t *key,
		unsigned int type, int lr)
{
	rbtree_node_t *nil = &rbtree->sentinel;
	rbtree_node_t *cur = rbtree->root;
	rbtree_node_t *best = NULL;

	while (cur != nil) {
		int order = rbtree_key_compare(type, key, &cur->key);

		if (order == 0)
			return cur;

		if (order < 0) {
			/* key is lower: cur is a candidate upper neighbour */
			if (lr == RIGHT)
				best = cur;
			cur = cur->left;
		} else {
			/* key is higher: cur is a candidate lower neighbour */
			if (lr == LEFT)
				best = cur;
			cur = cur->right;
		}
	}

	return best;
}
/* Exact-match lookup: like rbtree_lookup_nearest(), but with a direction of
 * -1 (neither LEFT nor RIGHT), so NULL is returned when key is not present.
 */
static inline rbtree_node_t *
rbtree_lookup(rbtree_t *rbtree, rbtree_key_t *key,
		unsigned int type)
{
	const int exact_only = -1;

	return rbtree_lookup_nearest(rbtree, key, type, exact_only);
}
#endif /*_RBTREE_AMD_H_HELPER_*/
#endif /*RBTREE_HELPER*/
================================================
FILE: libhsakmt/src/spm.c
================================================
/*
* Copyright © 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "libhsakmt.h"
#include "hsakmt/linux/kfd_ioctl.h"
#include
#include
/* Issues the KFD_IOCTL_SPM_OP_ACQUIRE operation for the GPU behind
 * PreferredNode. An invalid node is logged and its validation status is
 * returned; otherwise the raw return value of the AMDKFD_IOC_RLC_SPM ioctl
 * is returned.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSPMAcquire(HSAuint32 PreferredNode)
{
	struct kfd_ioctl_spm_args spm = {0};
	uint32_t gpu_id;
	int status;

	status = hsakmt_validate_nodeid(PreferredNode, &gpu_id);
	if (status != HSAKMT_STATUS_SUCCESS) {
		pr_err("[%s] invalid node ID: %d\n", __func__, PreferredNode);
		return status;
	}

	spm.op = KFD_IOCTL_SPM_OP_ACQUIRE;
	spm.gpu_id = gpu_id;

	status = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RLC_SPM, &spm);
	return status;
}
/* Issues the KFD_IOCTL_SPM_OP_SET_DEST_BUF operation for the GPU behind
 * PreferredNode.
 *
 * PreferredNode      node whose GPU is addressed
 * SizeInBytes        size of the destination buffer
 * timeout            in: timeout passed to the driver; out: value reported
 *                    back by the driver
 * SizeCopied         out: args.bytes_copied after the ioctl
 * DestMemoryAddress  destination buffer address passed to the driver
 * isSPMDataLoss      out: args.has_data_loss after the ioctl
 *
 * Returns the validation status for an invalid node,
 * HSAKMT_STATUS_INVALID_PARAMETER when one of the in/out pointers is NULL,
 * and otherwise the raw return value of the AMDKFD_IOC_RLC_SPM ioctl. The
 * output values are written regardless of the ioctl result.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSPMSetDestBuffer(HSAuint32 PreferredNode,
						HSAuint32 SizeInBytes,
						HSAuint32 * timeout,
						HSAuint32 * SizeCopied,
						void *DestMemoryAddress,
						bool *isSPMDataLoss)
{
	int ret;
	struct kfd_ioctl_spm_args args = {0};
	uint32_t gpu_id = 0;

	ret = hsakmt_validate_nodeid(PreferredNode, &gpu_id);
	if (ret != HSAKMT_STATUS_SUCCESS) {
		return ret;
	}

	/* All three pointers are dereferenced unconditionally below. */
	if (!timeout || !SizeCopied || !isSPMDataLoss)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	args.timeout = *timeout;
	args.dest_buf = (uint64_t)DestMemoryAddress;
	args.buf_size = SizeInBytes;
	args.op = KFD_IOCTL_SPM_OP_SET_DEST_BUF;
	args.gpu_id = gpu_id;

	ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RLC_SPM, &args);

	*SizeCopied = args.bytes_copied;
	*isSPMDataLoss = args.has_data_loss;
	*timeout = args.timeout;

	return ret;
}
/* Issues the KFD_IOCTL_SPM_OP_RELEASE operation for the GPU behind
 * PreferredNode. An invalid node is logged and its validation status is
 * returned; otherwise the raw return value of the AMDKFD_IOC_RLC_SPM ioctl
 * is returned.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSPMRelease(HSAuint32 PreferredNode)
{
	struct kfd_ioctl_spm_args spm = {0};
	uint32_t gpu_id;
	int status;

	status = hsakmt_validate_nodeid(PreferredNode, &gpu_id);
	if (status != HSAKMT_STATUS_SUCCESS) {
		pr_err("[%s] invalid node ID: %d\n", __func__, PreferredNode);
		return status;
	}

	spm.op = KFD_IOCTL_SPM_OP_RELEASE;
	spm.gpu_id = gpu_id;

	status = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_RLC_SPM, &spm);
	return status;
}
================================================
FILE: libhsakmt/src/svm.c
================================================
/*
* Copyright © 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "libhsakmt.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/* Helper functions for calling KFD SVM ioctl */
/*
 * hsaKmtSVMSetAttr - apply SVM attributes to a virtual address range
 * @start_addr: start of the range; must be non-NULL and PAGE_SIZE aligned
 * @size:       length of the range in bytes; must be non-zero and a
 *              multiple of PAGE_SIZE
 * @nattr:      number of entries in @attrs
 * @attrs:      attributes to set; the caller's array is not modified
 *
 * For location and access attribute types the value is a user node ID.
 * It is translated with hsakmt_validate_nodeid() before being handed to
 * the driver; INVALID_NODEID as a preferred location is passed on as
 * KFD_IOCTL_SVM_LOCATION_UNDEFINED. Access attributes whose translated
 * value is 0 (the CPU node) are rejected.
 *
 * Return: HSAKMT_STATUS_SUCCESS on success,
 * HSAKMT_STATUS_INVALID_PARAMETER for bad arguments, the node validation
 * status or HSAKMT_STATUS_INVALID_NODE_UNIT for bad node IDs, and
 * HSAKMT_STATUS_ERROR if the ioctl fails.
 */
HSAKMT_STATUS HSAKMTAPI
hsaKmtSVMSetAttr(void *start_addr, HSAuint64 size, unsigned int nattr,
		 HSA_SVM_ATTRIBUTE *attrs)
{
	struct kfd_ioctl_svm_args *args;
	HSAuint64 s_attr;
	HSAKMT_STATUS r;
	HSAuint32 i;

	CHECK_KFD_OPEN();
	CHECK_KFD_MINOR_VERSION(5);

	pr_debug("%s: address 0x%p size 0x%lx\n", __func__, start_addr, size);

	if (!start_addr || !size)
		return HSAKMT_STATUS_INVALID_PARAMETER;
	if ((uint64_t)start_addr & (PAGE_SIZE - 1))
		return HSAKMT_STATUS_INVALID_PARAMETER;
	if (size & (PAGE_SIZE - 1))
		return HSAKMT_STATUS_INVALID_PARAMETER;
	if (nattr && !attrs)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	s_attr = sizeof(*attrs) * nattr;
	/* The total argument size is encoded in the size bit-field of the
	 * ioctl number below. Reject anything that would not fit: it would
	 * spill into the neighbouring bits of the request number, and it
	 * also bounds the alloca() that follows.
	 */
	if (s_attr > _IOC_SIZEMASK - sizeof(*args))
		return HSAKMT_STATUS_INVALID_PARAMETER;

	args = alloca(sizeof(*args) + s_attr);

	args->start_addr = (uint64_t)start_addr;
	args->size = size;
	args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
	args->nattr = nattr;
	if (s_attr)
		memcpy(args->attrs, attrs, s_attr);

	/* Translate user node IDs into the IDs the driver expects */
	for (i = 0; i < nattr; i++) {
		if (attrs[i].type != KFD_IOCTL_SVM_ATTR_PREFERRED_LOC &&
		    attrs[i].type != KFD_IOCTL_SVM_ATTR_PREFETCH_LOC &&
		    attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS &&
		    attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE &&
		    attrs[i].type != KFD_IOCTL_SVM_ATTR_NO_ACCESS)
			continue;

		if (attrs[i].type == KFD_IOCTL_SVM_ATTR_PREFERRED_LOC &&
		    attrs[i].value == INVALID_NODEID) {
			args->attrs[i].value = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
			continue;
		}

		r = hsakmt_validate_nodeid(attrs[i].value, &args->attrs[i].value);
		if (r != HSAKMT_STATUS_SUCCESS) {
			pr_debug("invalid node ID: %d\n", attrs[i].value);
			return r;
		} else if (!args->attrs[i].value &&
			   (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS ||
			    attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE ||
			    attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS)) {
			pr_debug("CPU node invalid for access attribute\n");
			return HSAKMT_STATUS_INVALID_NODE_UNIT;
		}
	}

	/* Driver does one copy_from_user, with extra attrs size */
	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args);
	if (r) {
		pr_debug("op set range attrs failed %s\n", strerror(errno));
		return HSAKMT_STATUS_ERROR;
	}

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * hsaKmtSVMGetAttr - query SVM attributes of a virtual address range
 * @start_addr: start of the range; must be non-NULL and PAGE_SIZE aligned
 * @size:       length of the range in bytes; must be non-zero and a
 *              multiple of PAGE_SIZE
 * @nattr:      number of entries in @attrs
 * @attrs:      [IN/OUT] attribute types to query; overwritten with the
 *              driver's answer once the ioctl succeeds
 *
 * On input, access attribute values are user node IDs and are translated
 * with hsakmt_validate_nodeid(); a translated value of 0 (the CPU node)
 * is rejected. On output, location and access values are translated
 * back: KFD_IOCTL_SVM_LOCATION_SYSMEM becomes node 0,
 * KFD_IOCTL_SVM_LOCATION_UNDEFINED becomes INVALID_NODEID, and anything
 * else goes through hsakmt_gpuid_to_nodeid().
 *
 * Return: HSAKMT_STATUS_SUCCESS on success,
 * HSAKMT_STATUS_INVALID_PARAMETER for bad arguments, the translation
 * status or HSAKMT_STATUS_INVALID_NODE_UNIT for bad IDs, and
 * HSAKMT_STATUS_ERROR if the ioctl fails.
 */
HSAKMT_STATUS HSAKMTAPI
hsaKmtSVMGetAttr(void *start_addr, HSAuint64 size, unsigned int nattr,
		 HSA_SVM_ATTRIBUTE *attrs)
{
	struct kfd_ioctl_svm_args *args;
	HSAuint64 s_attr;
	HSAKMT_STATUS r;
	HSAuint32 i;

	CHECK_KFD_OPEN();
	CHECK_KFD_MINOR_VERSION(5);

	pr_debug("%s: address 0x%p size 0x%lx\n", __func__, start_addr, size);

	if (!start_addr || !size)
		return HSAKMT_STATUS_INVALID_PARAMETER;
	if ((uint64_t)start_addr & (PAGE_SIZE - 1))
		return HSAKMT_STATUS_INVALID_PARAMETER;
	if (size & (PAGE_SIZE - 1))
		return HSAKMT_STATUS_INVALID_PARAMETER;
	if (nattr && !attrs)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	s_attr = sizeof(*attrs) * nattr;
	/* The total argument size is encoded in the size bit-field of the
	 * ioctl number below. Reject anything that would not fit: it would
	 * spill into the neighbouring bits of the request number, and it
	 * also bounds the alloca() that follows.
	 */
	if (s_attr > _IOC_SIZEMASK - sizeof(*args))
		return HSAKMT_STATUS_INVALID_PARAMETER;

	args = alloca(sizeof(*args) + s_attr);

	args->start_addr = (uint64_t)start_addr;
	args->size = size;
	args->op = KFD_IOCTL_SVM_OP_GET_ATTR;
	args->nattr = nattr;
	if (s_attr)
		memcpy(args->attrs, attrs, s_attr);

	/* Access queries carry a node ID as input: translate it */
	for (i = 0; i < nattr; i++) {
		if (attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS &&
		    attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE &&
		    attrs[i].type != KFD_IOCTL_SVM_ATTR_NO_ACCESS)
			continue;

		r = hsakmt_validate_nodeid(attrs[i].value, &args->attrs[i].value);
		if (r != HSAKMT_STATUS_SUCCESS) {
			pr_debug("invalid node ID: %d\n", attrs[i].value);
			return r;
		} else if (!args->attrs[i].value) {
			pr_debug("CPU node invalid for access attribute\n");
			return HSAKMT_STATUS_INVALID_NODE_UNIT;
		}
	}

	/* Driver does one copy_from_user, with extra attrs size */
	r = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SVM + (s_attr << _IOC_SIZESHIFT), args);
	if (r) {
		pr_debug("op get range attrs failed %s\n", strerror(errno));
		return HSAKMT_STATUS_ERROR;
	}

	if (s_attr)
		memcpy(attrs, args->attrs, s_attr);

	/* Translate the driver's answers back into user node IDs */
	for (i = 0; i < nattr; i++) {
		if (attrs[i].type != KFD_IOCTL_SVM_ATTR_PREFERRED_LOC &&
		    attrs[i].type != KFD_IOCTL_SVM_ATTR_PREFETCH_LOC &&
		    attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS &&
		    attrs[i].type != KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE &&
		    attrs[i].type != KFD_IOCTL_SVM_ATTR_NO_ACCESS)
			continue;

		switch (attrs[i].value) {
		case KFD_IOCTL_SVM_LOCATION_SYSMEM:
			attrs[i].value = 0;
			break;
		case KFD_IOCTL_SVM_LOCATION_UNDEFINED:
			attrs[i].value = INVALID_NODEID;
			break;
		default:
			r = hsakmt_gpuid_to_nodeid(attrs[i].value, &attrs[i].value);
			if (r != HSAKMT_STATUS_SUCCESS) {
				pr_debug("invalid GPU ID: %d\n",
					 attrs[i].value);
				return r;
			}
		}
	}

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * hsaKmtSetGetXNACKMode - shared worker for the XNACK mode set/get calls
 * @enable: [IN/OUT] value passed to the driver in xnack_enabled; replaced
 *          by the value the driver reports back when the ioctl succeeds
 *
 * Return: HSAKMT_STATUS_SUCCESS on success, HSAKMT_STATUS_NOT_SUPPORTED
 * if the ioctl fails with EPERM, HSAKMT_STATUS_ERROR for any other
 * ioctl failure.
 */
static HSAKMT_STATUS
hsaKmtSetGetXNACKMode(HSAint32 * enable)
{
	struct kfd_ioctl_set_xnack_mode_args args;
	int ioctl_ret;

	CHECK_KFD_OPEN();
	CHECK_KFD_MINOR_VERSION(5);

	args.xnack_enabled = *enable;

	ioctl_ret = hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_SET_XNACK_MODE, &args);
	if (!ioctl_ret) {
		*enable = args.xnack_enabled;
		return HSAKMT_STATUS_SUCCESS;
	}

	switch (errno) {
	case EPERM:
		pr_debug("set mode not supported %s\n",
			 strerror(errno));
		return HSAKMT_STATUS_NOT_SUPPORTED;
	case EBUSY:
		pr_debug("hsakmt_ioctl queues not empty %s\n",
			 strerror(errno));
		break;
	default:
		break;
	}

	return HSAKMT_STATUS_ERROR;
}
/*
 * hsaKmtSetXNACKMode - request an XNACK mode
 * @enable: mode value forwarded to the driver
 *
 * The value the driver reports back is discarded; only the status is
 * returned.
 */
HSAKMT_STATUS HSAKMTAPI
hsaKmtSetXNACKMode(HSAint32 enable)
{
	HSAint32 requested_mode = enable;

	return hsaKmtSetGetXNACKMode(&requested_mode);
}
/*
 * hsaKmtGetXNACKMode - query the current XNACK mode
 * @enable: [OUT] receives the mode reported by the driver
 *
 * The query is issued by passing -1 as the requested mode to the shared
 * set/get worker.
 *
 * Return: HSAKMT_STATUS_INVALID_PARAMETER if @enable is NULL, otherwise
 * the status of the underlying set/get call.
 */
HSAKMT_STATUS HSAKMTAPI
hsaKmtGetXNACKMode(HSAint32 * enable)
{
	if (!enable)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	*enable = -1;
	return hsaKmtSetGetXNACKMode(enable);
}
================================================
FILE: libhsakmt/src/time.c
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "libhsakmt.h"
#include "hsakmt/linux/kfd_ioctl.h"
/*
 * hsaKmtGetClockCounters - read the clock counters of a node
 * @NodeId:   user node ID; translated to a KFD GPU ID before the ioctl
 * @Counters: [OUT] filled with the GPU, CPU and system clock counters and
 *            the system clock frequency; left untouched on failure
 *
 * Return: HSAKMT_STATUS_SUCCESS on success,
 * HSAKMT_STATUS_INVALID_PARAMETER if @Counters is NULL, the node
 * validation status for a bad @NodeId, HSAKMT_STATUS_ERROR if the ioctl
 * fails.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId,
					       HsaClockCounters *Counters)
{
	HSAKMT_STATUS result;
	uint32_t gpu_id;
	struct kfd_ioctl_get_clock_counters_args args = {0};

	CHECK_KFD_OPEN();

	if (!Counters)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	result = hsakmt_validate_nodeid(NodeId, &gpu_id);
	if (result != HSAKMT_STATUS_SUCCESS)
		return result;

	args.gpu_id = gpu_id;

	if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args) < 0)
		return HSAKMT_STATUS_ERROR;

	Counters->GPUClockCounter = args.gpu_clock_counter;
	Counters->CPUClockCounter = args.cpu_clock_counter;
	Counters->SystemClockCounter = args.system_clock_counter;
	Counters->SystemClockFrequencyHz = args.system_clock_freq;

	return HSAKMT_STATUS_SUCCESS;
}
================================================
FILE: libhsakmt/src/topology.c
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
* Copyright 2016-2018 Raptor Engineering, LLC. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "libhsakmt.h"
#include "hsakmt/hsakmtmodel.h"
#include "fmm.h"
/* Number of memory banks added by thunk on top of topology
* This only includes static heaps like LDS, scratch and SVM,
* not for MMIO_REMAP heap. MMIO_REMAP memory bank is reported
* dynamically based on whether mmio aperture was mapped
* successfully on this node.
*/
#define NUM_OF_IGPU_HEAPS 3
#define NUM_OF_DGPU_HEAPS 3
/* SYSFS related: the topology root, and printf-style templates whose
 * "%s" is filled with the topology directory in use (see
 * get_topology_dir()).
 */
#define KFD_SYSFS_PATH "/sys/devices/virtual/kfd/kfd/topology"
#define KFD_SYSFS_PATH_GENERATION_ID "%s/generation_id"
#define KFD_SYSFS_PATH_SYSTEM_PROPERTIES "%s/system_properties"
#define KFD_SYSFS_PATH_NODES "%s/nodes"
/* Pick the topology root directory: the model's topology path when
 * hsakmt_use_model is set, the real KFD sysfs topology path otherwise.
 */
static const char *get_topology_dir(void)
{
	return hsakmt_use_model ? hsakmt_model_topology : KFD_SYSFS_PATH;
}
/* Per-node property bundle: the node properties plus three separately
 * allocated property arrays, each released with free() in
 * free_properties().
 */
typedef struct {
HsaNodeProperties node;
HsaMemoryProperties *mem; /* node->NumBanks elements */
HsaCacheProperties *cache; /* cache properties; element count not shown here - TODO confirm */
HsaIoLinkProperties *link; /* IO link properties; element count not shown here - TODO confirm */
} node_props_t;
/* NOTE(review): presumably the cached topology snapshot (system properties
 * and per-node properties); their users are outside this part of the file.
 */
static HsaSystemProperties *g_system;
static node_props_t *g_props;
/* This array caches sysfs based node IDs of CPU nodes + all supported GPU nodes.
 * It will be used to map user-node IDs to sysfs-node IDs.
 */
static uint32_t *map_user_to_sysfs_node_id;
/* Number of entries allocated for map_user_to_sysfs_node_id */
static uint32_t map_user_to_sysfs_node_id_size;
/* Number of entries found under the sysfs "nodes" directory */
static uint32_t num_sysfs_nodes;
/* Index into SUPPORTED_PROCESSOR_VENDORS, or -1 while not yet detected */
static int processor_vendor = -1;
/* Supported System Vendors */
enum SUPPORTED_PROCESSOR_VENDORS {
GENUINE_INTEL = 0,
AUTHENTIC_AMD,
IBM_POWER
};
/* Vendor strings, indexed by SUPPORTED_PROCESSOR_VENDORS.
 * Adding newline to make the search easier
 */
static const char *supported_processor_vendor_name[] = {
"GenuineIntel\n",
"AuthenticAMD\n",
"\n" // POWER requires a different search method
};
static HSAKMT_STATUS topology_take_snapshot(void);
static void topology_drop_snapshot(void);
/* Static lookup table keyed by PCI device ID, searched linearly by
 * find_hsa_gfxip_device(). Each entry is: device_id, three version
 * numbers (presumably gfx major, minor and stepping - confirm against
 * struct hsa_gfxip_table), and a name string.
 */
static const struct hsa_gfxip_table gfxip_lookup_table[] = {
/* Kaveri Family */
{ 0x1304, 7, 0, 0, "Spectre" },
{ 0x1305, 7, 0, 0, "Spectre" },
{ 0x1306, 7, 0, 0, "Spectre" },
{ 0x1307, 7, 0, 0, "Spectre" },
{ 0x1309, 7, 0, 0, "Spectre" },
{ 0x130A, 7, 0, 0, "Spectre" },
{ 0x130B, 7, 0, 0, "Spectre" },
{ 0x130C, 7, 0, 0, "Spectre" },
{ 0x130D, 7, 0, 0, "Spectre" },
{ 0x130E, 7, 0, 0, "Spectre" },
{ 0x130F, 7, 0, 0, "Spectre" },
{ 0x1310, 7, 0, 0, "Spectre" },
{ 0x1311, 7, 0, 0, "Spectre" },
{ 0x1312, 7, 0, 0, "Spooky" },
{ 0x1313, 7, 0, 0, "Spectre" },
{ 0x1315, 7, 0, 0, "Spectre" },
{ 0x1316, 7, 0, 0, "Spooky" },
{ 0x1317, 7, 0, 0, "Spooky" },
{ 0x1318, 7, 0, 0, "Spectre" },
{ 0x131B, 7, 0, 0, "Spectre" },
{ 0x131C, 7, 0, 0, "Spectre" },
{ 0x131D, 7, 0, 0, "Spectre" },
/* Hawaii Family */
{ 0x67A0, 7, 0, 1, "Hawaii" },
{ 0x67A1, 7, 0, 1, "Hawaii" },
{ 0x67A2, 7, 0, 1, "Hawaii" },
{ 0x67A8, 7, 0, 1, "Hawaii" },
{ 0x67A9, 7, 0, 1, "Hawaii" },
{ 0x67AA, 7, 0, 1, "Hawaii" },
{ 0x67B0, 7, 0, 1, "Hawaii" },
{ 0x67B1, 7, 0, 1, "Hawaii" },
{ 0x67B8, 7, 0, 1, "Hawaii" },
{ 0x67B9, 7, 0, 1, "Hawaii" },
{ 0x67BA, 7, 0, 1, "Hawaii" },
{ 0x67BE, 7, 0, 1, "Hawaii" },
/* Carrizo Family */
{ 0x9870, 8, 0, 1, "Carrizo" },
{ 0x9874, 8, 0, 1, "Carrizo" },
{ 0x9875, 8, 0, 1, "Carrizo" },
{ 0x9876, 8, 0, 1, "Carrizo" },
{ 0x9877, 8, 0, 1, "Carrizo" },
/* Tonga Family */
{ 0x6920, 8, 0, 2, "Tonga" },
{ 0x6921, 8, 0, 2, "Tonga" },
{ 0x6928, 8, 0, 2, "Tonga" },
{ 0x6929, 8, 0, 2, "Tonga" },
{ 0x692B, 8, 0, 2, "Tonga" },
{ 0x692F, 8, 0, 2, "Tonga" },
{ 0x6930, 8, 0, 2, "Tonga" },
{ 0x6938, 8, 0, 2, "Tonga" },
{ 0x6939, 8, 0, 2, "Tonga" },
/* Fiji */
{ 0x7300, 8, 0, 3, "Fiji" },
{ 0x730F, 8, 0, 3, "Fiji" },
/* Polaris10 */
{ 0x67C0, 8, 0, 3, "Polaris10" },
{ 0x67C1, 8, 0, 3, "Polaris10" },
{ 0x67C2, 8, 0, 3, "Polaris10" },
{ 0x67C4, 8, 0, 3, "Polaris10" },
{ 0x67C7, 8, 0, 3, "Polaris10" },
{ 0x67C8, 8, 0, 3, "Polaris10" },
{ 0x67C9, 8, 0, 3, "Polaris10" },
{ 0x67CA, 8, 0, 3, "Polaris10" },
{ 0x67CC, 8, 0, 3, "Polaris10" },
{ 0x67CF, 8, 0, 3, "Polaris10" },
{ 0x67D0, 8, 0, 3, "Polaris10" },
{ 0x67DF, 8, 0, 3, "Polaris10" },
{ 0x6FDF, 8, 0, 3, "Polaris10" },
/* Polaris11 */
{ 0x67E0, 8, 0, 3, "Polaris11" },
{ 0x67E1, 8, 0, 3, "Polaris11" },
{ 0x67E3, 8, 0, 3, "Polaris11" },
{ 0x67E7, 8, 0, 3, "Polaris11" },
{ 0x67E8, 8, 0, 3, "Polaris11" },
{ 0x67E9, 8, 0, 3, "Polaris11" },
{ 0x67EB, 8, 0, 3, "Polaris11" },
{ 0x67EF, 8, 0, 3, "Polaris11" },
{ 0x67FF, 8, 0, 3, "Polaris11" },
/* Polaris12 */
{ 0x6980, 8, 0, 3, "Polaris12" },
{ 0x6981, 8, 0, 3, "Polaris12" },
{ 0x6985, 8, 0, 3, "Polaris12" },
{ 0x6986, 8, 0, 3, "Polaris12" },
{ 0x6987, 8, 0, 3, "Polaris12" },
{ 0x6995, 8, 0, 3, "Polaris12" },
{ 0x6997, 8, 0, 3, "Polaris12" },
{ 0x699F, 8, 0, 3, "Polaris12" },
/* VegaM */
{ 0x694C, 8, 0, 3, "VegaM" },
{ 0x694E, 8, 0, 3, "VegaM" },
{ 0x694F, 8, 0, 3, "VegaM" },
/* Vega10 */
{ 0x6860, 9, 0, 0, "Vega10" },
{ 0x6861, 9, 0, 0, "Vega10" },
{ 0x6862, 9, 0, 0, "Vega10" },
{ 0x6863, 9, 0, 0, "Vega10" },
{ 0x6864, 9, 0, 0, "Vega10" },
{ 0x6867, 9, 0, 0, "Vega10" },
{ 0x6868, 9, 0, 0, "Vega10" },
{ 0x6869, 9, 0, 0, "Vega10" },
{ 0x686A, 9, 0, 0, "Vega10" },
{ 0x686B, 9, 0, 0, "Vega10" },
{ 0x686C, 9, 0, 0, "Vega10" },
{ 0x686D, 9, 0, 0, "Vega10" },
{ 0x686E, 9, 0, 0, "Vega10" },
{ 0x687F, 9, 0, 0, "Vega10" },
/* Vega12 */
{ 0x69A0, 9, 0, 4, "Vega12" },
{ 0x69A1, 9, 0, 4, "Vega12" },
{ 0x69A2, 9, 0, 4, "Vega12" },
{ 0x69A3, 9, 0, 4, "Vega12" },
{ 0x69Af, 9, 0, 4, "Vega12" },
/* Raven */
{ 0x15DD, 9, 0, 2, "Raven" },
{ 0x15D8, 9, 0, 2, "Raven" },
/* Vega20 */
{ 0x66A0, 9, 0, 6, "Vega20" },
{ 0x66A1, 9, 0, 6, "Vega20" },
{ 0x66A2, 9, 0, 6, "Vega20" },
{ 0x66A3, 9, 0, 6, "Vega20" },
{ 0x66A4, 9, 0, 6, "Vega20" },
{ 0x66A7, 9, 0, 6, "Vega20" },
{ 0x66AF, 9, 0, 6, "Vega20" },
/* Arcturus */
{ 0x7388, 9, 0, 8, "Arcturus" },
{ 0x738C, 9, 0, 8, "Arcturus" },
{ 0x738E, 9, 0, 8, "Arcturus" },
{ 0x7390, 9, 0, 8, "Arcturus" },
/* Aldebaran */
{ 0x7408, 9, 0, 10, "Aldebaran" },
{ 0x740C, 9, 0, 10, "Aldebaran" },
{ 0x740F, 9, 0, 10, "Aldebaran" },
{ 0x7410, 9, 0, 10, "Aldebaran" },
/* Renoir */
{ 0x15E7, 9, 0, 12, "Renoir" },
{ 0x1636, 9, 0, 12, "Renoir" },
{ 0x1638, 9, 0, 12, "Renoir" },
{ 0x164C, 9, 0, 12, "Renoir" },
/* Navi10 */
{ 0x7310, 10, 1, 0, "Navi10" },
{ 0x7312, 10, 1, 0, "Navi10" },
{ 0x7318, 10, 1, 0, "Navi10" },
{ 0x731A, 10, 1, 0, "Navi10" },
{ 0x731E, 10, 1, 0, "Navi10" },
{ 0x731F, 10, 1, 0, "Navi10" },
/* cyan_skillfish */
{ 0x13F9, 10, 1, 3, "cyan_skillfish" },
{ 0x13FA, 10, 1, 3, "cyan_skillfish" },
{ 0x13FB, 10, 1, 3, "cyan_skillfish" },
{ 0x13FC, 10, 1, 3, "cyan_skillfish" },
{ 0x13FE, 10, 1, 3, "cyan_skillfish" },
{ 0x143F, 10, 1, 3, "cyan_skillfish" },
/* Navi14 */
{ 0x7340, 10, 1, 2, "Navi14" },
{ 0x7341, 10, 1, 2, "Navi14" },
{ 0x7347, 10, 1, 2, "Navi14" },
/* Navi12 */
{ 0x7360, 10, 1, 1, "Navi12" },
{ 0x7362, 10, 1, 1, "Navi12" },
/* SIENNA_CICHLID */
{ 0x73A0, 10, 3, 0, "SIENNA_CICHLID" },
{ 0x73A1, 10, 3, 0, "SIENNA_CICHLID" },
{ 0x73A2, 10, 3, 0, "SIENNA_CICHLID" },
{ 0x73A3, 10, 3, 0, "SIENNA_CICHLID" },
{ 0x73A5, 10, 3, 0, "SIENNA_CICHLID" },
{ 0x73A8, 10, 3, 0, "SIENNA_CICHLID" },
{ 0x73A9, 10, 3, 0, "SIENNA_CICHLID" },
{ 0x73AC, 10, 3, 0, "SIENNA_CICHLID" },
{ 0x73AD, 10, 3, 0, "SIENNA_CICHLID" },
{ 0x73AB, 10, 3, 0, "SIENNA_CICHLID" },
{ 0x73AE, 10, 3, 0, "SIENNA_CICHLID" },
{ 0x73BF, 10, 3, 0, "SIENNA_CICHLID" },
/* NAVY_FLOUNDER */
{ 0x73C0, 10, 3, 1, "NAVY_FLOUNDER" },
{ 0x73C1, 10, 3, 1, "NAVY_FLOUNDER" },
{ 0x73C3, 10, 3, 1, "NAVY_FLOUNDER" },
{ 0x73DA, 10, 3, 1, "NAVY_FLOUNDER" },
{ 0x73DB, 10, 3, 1, "NAVY_FLOUNDER" },
{ 0x73DC, 10, 3, 1, "NAVY_FLOUNDER" },
{ 0x73DD, 10, 3, 1, "NAVY_FLOUNDER" },
{ 0x73DE, 10, 3, 1, "NAVY_FLOUNDER" },
{ 0x73DF, 10, 3, 1, "NAVY_FLOUNDER" },
/* DIMGREY_CAVEFISH */
{ 0x73E0, 10, 3, 2, "DIMGREY_CAVEFISH" },
{ 0x73E1, 10, 3, 2, "DIMGREY_CAVEFISH" },
{ 0x73E2, 10, 3, 2, "DIMGREY_CAVEFISH" },
{ 0x73E8, 10, 3, 2, "DIMGREY_CAVEFISH" },
{ 0x73E9, 10, 3, 2, "DIMGREY_CAVEFISH" },
{ 0x73EA, 10, 3, 2, "DIMGREY_CAVEFISH" },
{ 0x73EB, 10, 3, 2, "DIMGREY_CAVEFISH" },
{ 0x73EC, 10, 3, 2, "DIMGREY_CAVEFISH" },
{ 0x73ED, 10, 3, 2, "DIMGREY_CAVEFISH" },
{ 0x73EF, 10, 3, 2, "DIMGREY_CAVEFISH" },
{ 0x73FF, 10, 3, 2, "DIMGREY_CAVEFISH" },
/* VanGogh */
{ 0x163F, 10, 3, 3, "VanGogh" },
/* BEIGE_GOBY */
{ 0x7420, 10, 3, 4, "BEIGE_GOBY" },
{ 0x7421, 10, 3, 4, "BEIGE_GOBY" },
{ 0x7422, 10, 3, 4, "BEIGE_GOBY" },
{ 0x7423, 10, 3, 4, "BEIGE_GOBY" },
{ 0x743F, 10, 3, 4, "BEIGE_GOBY" },
/* Yellow_Carp */
{ 0x164D, 10, 3, 5, "YELLOW_CARP" },
{ 0x1681, 10, 3, 5, "YELLOW_CARP" },
};
/* Information from /proc/cpuinfo for one processor. Code in this file
 * indexes arrays of these entries by processor number.
 */
struct proc_cpuinfo {
uint32_t proc_num; /* processor */
uint32_t apicid; /* apicid */
char model_name[HSA_PUBLIC_NAME_SIZE]; /* model name */
};
/* CPU cache table for all CPUs on the system. Each entry has the relative CPU
 * info and caches connected to that CPU.
 */
typedef struct cpu_cacheinfo {
uint32_t len; /* length of the table = number of online procs */
int32_t proc_num; /* this cpu's processor number */
uint32_t num_caches; /* number of caches reported by this cpu; get_cpu_cache_info() lowers it for caches listed under another cpu */
HsaCacheProperties *cache_prop; /* a list of cache properties */
} cpu_cacheinfo_t;
/* Release an array of @size node property bundles, including the mem,
 * cache and link arrays owned by each entry. A NULL @props is a no-op.
 */
static void free_properties(node_props_t *props, int size)
{
	int idx = 0;

	if (!props)
		return;

	while (idx < size) {
		node_props_t *entry = &props[idx];

		free(entry->mem);
		free(entry->cache);
		free(entry->link);
		idx++;
	}

	free(props);
}
/* num_subdirs - count the entries of a directory
 * @dirpath - directory to scan
 * @prefix - only count entry names starting with prefix.
 *           Use blank string, "", to count all.
 * Return - number of matching entries, 0 if the directory cannot be opened
 *
 * "." and ".." are never counted. Entry types are not checked, so every
 * entry is assumed to be a sub-directory.
 */
static int num_subdirs(char *dirpath, char *prefix)
{
	const int prefix_len = strlen(prefix);
	struct dirent *entry;
	int matches = 0;
	DIR *handle = opendir(dirpath);

	if (!handle)
		return 0;

	while ((entry = readdir(handle)) != NULL) {
		const char *name = entry->d_name;

		if (!strcmp(name, ".") || !strcmp(name, ".."))
			continue;
		if (prefix_len == 0 || strncmp(name, prefix, prefix_len) == 0)
			matches++;
	}

	closedir(handle);
	return matches;
}
/* fscanf_dec - read a file whose content is a decimal number
 * @file [IN ] file to read
 * @num [OUT] number in the file; not written when parsing fails
 * Return: HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER if the
 *         file cannot be opened, HSAKMT_STATUS_ERROR if it does not start
 *         with an unsigned decimal number
 */
static HSAKMT_STATUS fscanf_dec(char *file, uint32_t *num)
{
	HSAKMT_STATUS status;
	FILE *stream = fopen(file, "r");

	if (stream == NULL) {
		pr_err("Failed to open %s\n", file);
		return HSAKMT_STATUS_INVALID_PARAMETER;
	}

	if (fscanf(stream, "%u", num) == 1) {
		status = HSAKMT_STATUS_SUCCESS;
	} else {
		pr_err("Failed to parse %s as a decimal.\n", file);
		status = HSAKMT_STATUS_ERROR;
	}

	fclose(stream);
	return status;
}
/* fscanf_str - read the first line of a file as a string
 * @file [IN ] file to read
 * @str [OUT] receives the line, cut at the first '\r' or '\n'
 * @str_size [IN ] capacity of @str in bytes
 * Return: HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER if the
 *         file cannot be opened, HSAKMT_STATUS_ERROR if nothing could be
 *         read
 */
static HSAKMT_STATUS fscanf_str(const char *file, char *str, size_t str_size)
{
	HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;
	FILE *stream = fopen(file, "r");

	if (stream == NULL) {
		pr_err("Failed to open %s\n", file);
		return HSAKMT_STATUS_INVALID_PARAMETER;
	}

	if (fgets(str, (int)str_size, stream) != NULL) {
		/* fgets keeps the line terminator; cut the string there */
		size_t eol = strcspn(str, "\r\n");

		str[eol] = '\0';
	} else {
		pr_err("Failed to read from %s.\n", file);
		status = HSAKMT_STATUS_ERROR;
	}

	fclose(stream);
	return status;
}
/* fscanf_size - read a file whose content represents size as a string
 * @file [IN ] file to read
 * @bytes [OUT] size in bytes
 *
 * The file holds an unsigned decimal number optionally followed by a unit
 * suffix: 'K', 'M' or 'G' (powers of 1024). A number followed only by
 * whitespace (for example the trailing newline) is taken as a plain byte
 * count. The result is kept in 32 bits; larger sizes wrap.
 *
 * Return: HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER if the
 *         file cannot be opened, HSAKMT_STATUS_ERROR if the number cannot
 *         be parsed or the unit suffix is unknown
 */
static HSAKMT_STATUS fscanf_size(char *file, uint32_t *bytes)
{
	FILE *fd;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	char unit;
	int n;

	fd = fopen(file, "r");
	if (!fd) {
		pr_err("Failed to open %s\n", file);
		return HSAKMT_STATUS_INVALID_PARAMETER;
	}

	n = fscanf(fd, "%u%c", bytes, &unit);
	if (n < 1) {
		pr_err("Failed to parse %s\n", file);
		ret = HSAKMT_STATUS_ERROR;
	} else if (n == 2) {
		switch (unit) {
		case 'K':
			*bytes <<= 10;
			break;
		case 'M':
			*bytes <<= 20;
			break;
		case 'G':
			*bytes <<= 30;
			break;
		case '\n':
		case '\r':
		case ' ':
		case '\t':
			/* No unit, just the whitespace after the number */
			break;
		default:
			ret = HSAKMT_STATUS_ERROR;
			break;
		}
	}

	fclose(fd);
	return ret;
}
/* cpumap_to_cpu_ci - translate shared_cpu_map string + cpuinfo->apicid into
 *		SiblingMap in cache
 * @shared_cpu_map [IN ] shared_cpu_map string; modified in place by strtok
 * @cpuinfo [IN ] cpuinfo to get apicid, indexed by processor number
 * @this_cache [OUT] CPU cache to fill in SiblingMap
 *
 * NOTE(review): processor numbers taken from the map are used to index
 * @cpuinfo without a bounds check; the array length is not known here.
 */
static void cpumap_to_cpu_ci(char *shared_cpu_map,
			     struct proc_cpuinfo *cpuinfo,
			     HsaCacheProperties *this_cache)
{
	int num_hexs, bit;
	uint32_t proc, apicid, mask;
	char *ch_ptr;

	/* shared_cpu_map is shown as ...X3,X2,X1 Each X is a hex without 0x
	 * and it's up to 8 characters(32 bits). For the first 32 CPUs(actually
	 * procs), it's presented in X1. The next 32 is in X2, and so on.
	 */
	num_hexs = (strlen(shared_cpu_map) + 8) / 9; /* 8 characters + "," */
	ch_ptr = strtok(shared_cpu_map, ",");
	/* Stop early if the string holds fewer groups than estimated */
	while (num_hexs-- > 0 && ch_ptr) {
		/* strtoul: a group may have bit 31 set */
		mask = (uint32_t)strtoul(ch_ptr, NULL, 16); /* each X */
		for (bit = 0; bit < 32; bit++) {
			/* unsigned shift: 1 << 31 overflows a signed int */
			if (!((1U << bit) & mask))
				continue;
			proc = num_hexs * 32 + bit;
			apicid = cpuinfo[proc].apicid;
			if (apicid >= HSA_CPU_SIBLINGS) {
				pr_warn("SiblingMap buffer %d is too small\n",
					HSA_CPU_SIBLINGS);
				continue;
			}
			this_cache->SiblingMap[apicid] = 1;
		}
		ch_ptr = strtok(NULL, ",");
	}
}
/* get_cpu_cache_info - get specified CPU's cache information from sysfs
 * @prefix [IN] sysfs path for target cpu cache,
 *		/sys/devices/system/node/nodeX/cpuY/cache
 * @cpuinfo [IN] /proc/cpuinfo data to get apicid
 * @cpu_ci: CPU specified. This parameter is an input and also an output.
 *		[IN] cpu_ci->num_caches: number of index dirs
 *		[OUT] cpu_ci->cache_prop: to store cache info collected
 *		[OUT] cpu_ci->num_caches: reduces when shared with other cpu(s)
 * Return: number of cache reported from this cpu
 *
 * Individual property files are read best-effort: a file that cannot be
 * read leaves the corresponding field of the cache entry untouched.
 */
static int get_cpu_cache_info(const char *prefix, struct proc_cpuinfo *cpuinfo,
			      cpu_cacheinfo_t *cpu_ci)
{
	int idx, num_idx;
	uint32_t first_cpu;
	HsaCacheProperties *this_cache;
	char path[256], str[256];
	bool is_power9 = false;

	if (processor_vendor == IBM_POWER &&
	    strcmp(cpuinfo[0].model_name, "POWER9") == 0)
		is_power9 = true;

	this_cache = cpu_ci->cache_prop;
	num_idx = cpu_ci->num_caches;
	for (idx = 0; idx < num_idx; idx++) {
		/* If this cache is shared by multiple CPUs, we only need
		 * to list it in the first CPU.
		 */
		if (is_power9) {
			// POWER9 has SMT4
			if (cpu_ci->proc_num & 0x3) {
				/* proc is not 0,4,8,etc. Skip and reduce the cache count. */
				--cpu_ci->num_caches;
				continue;
			}
		} else {
			snprintf(path, sizeof(path), "%s/index%d/shared_cpu_list", prefix, idx);
			/* shared_cpu_list is shown as n1,n2... or n1-n2,n3-n4...
			 * For both cases, this cache is listed to proc n1 only.
			 * An unreadable list is handled like "not n1" rather
			 * than comparing against an uninitialized value.
			 */
			if (fscanf_dec(path, &first_cpu) != HSAKMT_STATUS_SUCCESS ||
			    cpu_ci->proc_num != (int32_t)first_cpu) {
				/* proc is not n1. Skip and reduce the cache count. */
				--cpu_ci->num_caches;
				continue;
			}
			this_cache->ProcessorIdLow = cpuinfo[cpu_ci->proc_num].apicid;
		}

		/* CacheLevel */
		snprintf(path, sizeof(path), "%s/index%d/level", prefix, idx);
		fscanf_dec(path, &this_cache->CacheLevel);

		/* CacheType */
		snprintf(path, sizeof(path), "%s/index%d/type", prefix, idx);
		memset(str, 0, sizeof(str));
		fscanf_str(path, str, sizeof(str));
		if (!strcmp(str, "Data"))
			this_cache->CacheType.ui32.Data = 1;
		if (!strcmp(str, "Instruction"))
			this_cache->CacheType.ui32.Instruction = 1;
		if (!strcmp(str, "Unified")) {
			this_cache->CacheType.ui32.Data = 1;
			this_cache->CacheType.ui32.Instruction = 1;
		}
		this_cache->CacheType.ui32.CPU = 1;

		/* CacheSize */
		snprintf(path, sizeof(path), "%s/index%d/size", prefix, idx);
		fscanf_size(path, &this_cache->CacheSize);

		/* CacheLineSize */
		snprintf(path, sizeof(path), "%s/index%d/coherency_line_size", prefix, idx);
		fscanf_dec(path, &this_cache->CacheLineSize);

		/* CacheAssociativity */
		snprintf(path, sizeof(path), "%s/index%d/ways_of_associativity", prefix, idx);
		fscanf_dec(path, &this_cache->CacheAssociativity);

		/* CacheLinesPerTag */
		snprintf(path, sizeof(path), "%s/index%d/physical_line_partition", prefix, idx);
		fscanf_dec(path, &this_cache->CacheLinesPerTag);

		/* CacheSiblings */
		snprintf(path, sizeof(path), "%s/index%d/shared_cpu_map", prefix, idx);
		/* Clear the buffer first: if the read fails, the cache type
		 * string read above must not be parsed as a cpu map.
		 */
		memset(str, 0, sizeof(str));
		fscanf_str(path, str, sizeof(str));
		cpumap_to_cpu_ci(str, cpuinfo, this_cache);

		++this_cache;
	}

	return cpu_ci->num_caches;
}
/* topology_sysfs_get_generation - read the topology generation ID
 * @gen [OUT] value read from the generation_id file of the topology
 *	directory in use
 * Return: HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_ERROR if the file
 *	cannot be opened or parsed
 */
static HSAKMT_STATUS topology_sysfs_get_generation(uint32_t *gen)
{
	FILE *fd;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	char path[256];

	assert(gen);
	snprintf(path, sizeof(path), KFD_SYSFS_PATH_GENERATION_ID, get_topology_dir());

	fd = fopen(path, "r");
	if (!fd)
		return HSAKMT_STATUS_ERROR;

	/* gen is a uint32_t, so plain "%u" is the matching conversion */
	if (fscanf(fd, "%u", gen) != 1)
		ret = HSAKMT_STATUS_ERROR;

	fclose(fd);
	return ret;
}
/* topology_sysfs_map_node_id - translate a user node ID to a sysfs node ID
 * @node_id [IN ] user node ID
 * @sys_node_id [OUT] matching sysfs node ID; untouched on failure
 * Return: HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_NOT_SUPPORTED if the
 *	mapping table is not allocated or @node_id is beyond its size
 */
static HSAKMT_STATUS topology_sysfs_map_node_id(uint32_t node_id, uint32_t *sys_node_id)
{
	if (map_user_to_sysfs_node_id && node_id < map_user_to_sysfs_node_id_size) {
		*sys_node_id = map_user_to_sysfs_node_id[node_id];
		return HSAKMT_STATUS_SUCCESS;
	}

	return HSAKMT_STATUS_NOT_SUPPORTED;
}
/* topology_sysfs_get_gpu_id - read the gpu_id file of a sysfs node
 * @sysfs_node_id [IN ] sysfs node ID (not a user node ID)
 * @gpu_id [OUT] value read from the file
 * Return: HSAKMT_STATUS_SUCCESS; HSAKMT_STATUS_NOT_SUPPORTED if reading
 *	fails with EPERM; HSAKMT_STATUS_ERROR if the file cannot be opened
 *	or parsed
 */
static HSAKMT_STATUS topology_sysfs_get_gpu_id(uint32_t sysfs_node_id, uint32_t *gpu_id)
{
	FILE *fd;
	char path[256];
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;

	assert(gpu_id);
	snprintf(path, sizeof(path), KFD_SYSFS_PATH_NODES "/%u/gpu_id", get_topology_dir(), sysfs_node_id);

	fd = fopen(path, "r");
	if (!fd)
		return HSAKMT_STATUS_ERROR;

	/* Clear errno so that a stale EPERM from an earlier call cannot turn
	 * a plain parse failure into "not supported".
	 */
	errno = 0;
	if (fscanf(fd, "%u", gpu_id) != 1)
		ret = (errno == EPERM) ? HSAKMT_STATUS_NOT_SUPPORTED :
					 HSAKMT_STATUS_ERROR;

	fclose(fd);
	return ret;
}
/* Check if the @sysfs_node_id is supported. This function is passed a sysfs
 * node id. It can not use the topology_* helper functions, because those
 * functions use user node ids.
 * A sysfs node is not supported
 *	- if corresponding drm render node is not available.
 *	- if node information is not accessible (EPERM)
 *
 * @sysfs_node_id [IN ] sysfs node ID to check
 * @is_node_supported [OUT] set to true when the node can be used; a node
 *	whose gpu_id is 0 is reported as supported without further checks
 * Return: HSAKMT_STATUS_SUCCESS when the check completed (whatever its
 *	outcome), HSAKMT_STATUS_NO_MEMORY or HSAKMT_STATUS_ERROR when the
 *	node information could not be read or the render device failed to
 *	open for a reason other than ENOENT/EPERM
 */
static HSAKMT_STATUS topology_sysfs_check_node_supported(uint32_t sysfs_node_id, bool *is_node_supported)
{
	uint32_t gpu_id;
	FILE *fd;
	char *read_buf, *p;
	int read_size;
	char prop_name[256];
	char path[256];
	unsigned long long prop_val;
	int prog; /* %n stores an int */
	uint32_t drm_render_minor = 0;
	int ret_value;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;

	*is_node_supported = false;

	/* Retrieve the GPU ID */
	ret = topology_sysfs_get_gpu_id(sysfs_node_id, &gpu_id);
	if (ret == HSAKMT_STATUS_NOT_SUPPORTED)
		return HSAKMT_STATUS_SUCCESS;
	if (ret != HSAKMT_STATUS_SUCCESS)
		return ret;

	if (gpu_id == 0) {
		*is_node_supported = true;
		return HSAKMT_STATUS_SUCCESS;
	}

	read_buf = malloc(PAGE_SIZE);
	if (!read_buf)
		return HSAKMT_STATUS_NO_MEMORY;

	/* Retrieve the node properties */
	snprintf(path, sizeof(path), KFD_SYSFS_PATH_NODES "/%u/properties", get_topology_dir(), sysfs_node_id);
	fd = fopen(path, "r");
	if (!fd) {
		free(read_buf);
		return HSAKMT_STATUS_ERROR;
	}

	read_size = fread(read_buf, 1, PAGE_SIZE, fd);
	if (read_size <= 0) {
		ret = HSAKMT_STATUS_ERROR;
		goto err;
	}

	/* Since we're using the buffer as a string, we make sure the string terminates */
	if (read_size >= PAGE_SIZE)
		read_size = PAGE_SIZE - 1;
	read_buf[read_size] = 0;

	/* Read the node properties; the field width keeps prop_name in bounds */
	prog = 0;
	p = read_buf;
	while (sscanf(p += prog, "%255s %llu\n%n", prop_name, &prop_val, &prog) == 2) {
		if (strcmp(prop_name, "drm_render_minor") == 0) {
			drm_render_minor = (uint32_t)prop_val;
			break;
		}
	}
	if (!drm_render_minor) {
		ret = HSAKMT_STATUS_ERROR;
		goto err;
	}

	/* Open DRM Render device */
	ret_value = hsakmt_open_drm_render_device(drm_render_minor);
	if (ret_value > 0)
		*is_node_supported = true;
	else if (ret_value != -ENOENT && ret_value != -EPERM)
		ret = HSAKMT_STATUS_ERROR;

err:
	free(read_buf);
	fclose(fd);
	return ret;
}
/* hsakmt_topology_sysfs_get_system_props - read system properties and build
 *	the user-to-sysfs node ID map
 * @props [OUT] PlatformOem/PlatformId/PlatformRev are filled from the
 *	system_properties file; NumNodes is set to the number of supported
 *	nodes
 *
 * Every entry under the "nodes" directory is checked with
 * topology_sysfs_check_node_supported(); the sysfs IDs of supported nodes
 * are stored in map_user_to_sysfs_node_id in ascending order, so that the
 * index in the map is the user node ID. The map is (re)allocated only when
 * it does not exist yet or is too small.
 *
 * Return: HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_NO_MEMORY on allocation
 *	failure, HSAKMT_STATUS_ERROR if the properties file cannot be read,
 *	or the failing status of a node check (the map is freed in that
 *	case)
 */
HSAKMT_STATUS hsakmt_topology_sysfs_get_system_props(HsaSystemProperties *props)
{
	FILE *fd;
	char *read_buf, *p;
	char path[256];
	char prop_name[256];
	unsigned long long prop_val;
	int prog; /* %n stores an int */
	int read_size;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	bool is_node_supported = true;
	uint32_t num_supported_nodes = 0;
	uint32_t i;

	assert(props);
	snprintf(path, sizeof(path), KFD_SYSFS_PATH_SYSTEM_PROPERTIES, get_topology_dir());
	fd = fopen(path, "r");
	if (!fd)
		return HSAKMT_STATUS_ERROR;

	read_buf = malloc(PAGE_SIZE);
	if (!read_buf) {
		ret = HSAKMT_STATUS_NO_MEMORY;
		goto out_close;
	}

	read_size = fread(read_buf, 1, PAGE_SIZE, fd);
	if (read_size <= 0) {
		ret = HSAKMT_STATUS_ERROR;
		goto out_free;
	}

	/* Since we're using the buffer as a string, we make sure the string terminates */
	if (read_size >= PAGE_SIZE)
		read_size = PAGE_SIZE - 1;
	read_buf[read_size] = 0;

	/* Read the system properties; the field width keeps prop_name in bounds */
	prog = 0;
	p = read_buf;
	while (sscanf(p += prog, "%255s %llu\n%n", prop_name, &prop_val, &prog) == 2) {
		if (strcmp(prop_name, "platform_oem") == 0)
			props->PlatformOem = (uint32_t)prop_val;
		else if (strcmp(prop_name, "platform_id") == 0)
			props->PlatformId = (uint32_t)prop_val;
		else if (strcmp(prop_name, "platform_rev") == 0)
			props->PlatformRev = (uint32_t)prop_val;
	}

	/*
	 * Discover the number of sysfs nodes:
	 * Assuming that inside nodes folder there are only folders
	 * which represent the node numbers
	 */
	snprintf(path, sizeof(path), KFD_SYSFS_PATH_NODES, get_topology_dir());
	num_sysfs_nodes = num_subdirs(path, "");

	if (!map_user_to_sysfs_node_id ||
	    num_sysfs_nodes > map_user_to_sysfs_node_id_size) {
		/* Trade off - num_sysfs_nodes includes all CPU and GPU nodes.
		 * Slightly more memory is allocated than necessary.
		 */
		free(map_user_to_sysfs_node_id);
		map_user_to_sysfs_node_id_size = 0;
		map_user_to_sysfs_node_id = calloc(num_sysfs_nodes, sizeof(uint32_t));
		if (!map_user_to_sysfs_node_id) {
			ret = HSAKMT_STATUS_NO_MEMORY;
			goto out_free;
		}
		map_user_to_sysfs_node_id_size = num_sysfs_nodes;
	}

	for (i = 0; i < num_sysfs_nodes; i++) {
		ret = topology_sysfs_check_node_supported(i, &is_node_supported);
		if (ret != HSAKMT_STATUS_SUCCESS) {
			/* Drop the partially filled map and its capacity */
			free(map_user_to_sysfs_node_id);
			map_user_to_sysfs_node_id = NULL;
			map_user_to_sysfs_node_id_size = 0;
			goto out_free;
		}
		if (is_node_supported)
			map_user_to_sysfs_node_id[num_supported_nodes++] = i;
	}
	props->NumNodes = num_supported_nodes;

out_free:
	free(read_buf);
out_close:
	fclose(fd);
	return ret;
}
static const struct hsa_gfxip_table *find_hsa_gfxip_device(uint16_t device_id, uint8_t gfxv_major)
{
if (gfxv_major > 10)
return NULL;
uint32_t i, table_size;
table_size = sizeof(gfxip_lookup_table)/sizeof(struct hsa_gfxip_table);
for (i = 0; i < table_size; i++) {
if (gfxip_lookup_table[i].device_id == device_id)
return &gfxip_lookup_table[i];
}
return NULL;
}
/* Latch the global hsakmt_is_dgpu flag when @props describes a node that
 * has GPU compute cores but no CPU cores. The flag is never cleared here.
 */
void hsakmt_topology_setup_is_dgpu_param(HsaNodeProperties *props)
{
	const bool has_cpu_cores = props->NumCPUCores != 0;
	const bool has_gpu_cores = props->NumFComputeCores != 0;

	/* if we found a dGPU node, then treat the whole system as dGPU */
	if (has_gpu_cores && !has_cpu_cores)
		hsakmt_is_dgpu = true;
}
/* Report whether SVM is needed: always on a dGPU system, otherwise only
 * when the engine's full gfx version is at least GFX_VERSION_VEGA10.
 */
bool hsakmt_topology_is_svm_needed(HSA_ENGINE_ID EngineId)
{
	return hsakmt_is_dgpu ||
	       HSA_GET_GFX_VERSION_FULL(EngineId.ui32) >= GFX_VERSION_VEGA10;
}
/* topology_get_cpu_model_name - fill in a node's CPU model name
 * @props [IN/OUT] node whose CComputeIdLo selects the processor; receives
 *	MarketingName and, when DeviceId is 0 (CPU-only node), AMDName
 * @cpuinfo [IN ] /proc/cpuinfo data
 * @num_procs [IN ] number of entries in @cpuinfo
 * Return: HSAKMT_STATUS_SUCCESS, HSAKMT_STATUS_INVALID_PARAMETER if @props
 *	is NULL, HSAKMT_STATUS_ERROR if no entry has a matching apicid
 */
static HSAKMT_STATUS topology_get_cpu_model_name(HsaNodeProperties *props,
				struct proc_cpuinfo *cpuinfo, int num_procs)
{
	int i, j;

	if (!props) {
		pr_err("Invalid props to get cpu model name\n");
		return HSAKMT_STATUS_INVALID_PARAMETER;
	}

	for (i = 0; i < num_procs; i++, cpuinfo++) {
		if (props->CComputeIdLo == cpuinfo->apicid) {
			if (!props->DeviceId) { /* CPU-only node */
				strncpy((char *)props->AMDName, cpuinfo->model_name, sizeof(props->AMDName));
				/* strncpy does not terminate on truncation */
				props->AMDName[sizeof(props->AMDName) - 1] = '\0';
			}
			/* Convert from UTF8 to UTF16 */
			for (j = 0; cpuinfo->model_name[j] != '\0' && j < HSA_PUBLIC_NAME_SIZE - 1; j++)
				props->MarketingName[j] = cpuinfo->model_name[j];
			props->MarketingName[j] = '\0';
			return HSAKMT_STATUS_SUCCESS;
		}
	}

	return HSAKMT_STATUS_ERROR;
}
/* topology_search_processor_vendor - map a vendor string to a vendor index
 * @processor_name: vendor string including its trailing newline
 * Return: index into supported_processor_vendor_name on an exact match,
 *	IBM_POWER for the POWER9 cpu string, -1 otherwise
 */
static int topology_search_processor_vendor(const char *processor_name)
{
	unsigned int vendor;

	for (vendor = 0; vendor < ARRAY_LEN(supported_processor_vendor_name); vendor++) {
		if (strcmp(processor_name, supported_processor_vendor_name[vendor]) == 0)
			return vendor;
	}

	/* POWER9 is recognized by its cpu string rather than a table entry */
	if (strcmp(processor_name, "POWER9, altivec supported\n") == 0)
		return IBM_POWER;

	return -1;
}
/* topology_cpuinfo_value - Locate the value of a "token : value\n" line
 *	@line [IN]: one NUL-terminated line read from /proc/cpuinfo
 *	Return: pointer two characters past the first ':' (i.e. past ": "),
 *		or NULL when the line has no ':' or ends right after it, so
 *		the caller never steps beyond the terminating NUL. The
 *		returned string may be empty.
 */
static char *topology_cpuinfo_value(char *line)
{
	char *colon = strchr(line, ':');

	if (!colon || colon[1] == '\0')
		return NULL;

	return colon + 2;
}

/* topology_parse_cpuinfo - Parse /proc/cpuinfo and fill up required
 *			topology information
 *	@cpuinfo [OUT]: output buffer to hold cpu information
 *	@num_procs [IN]: number of processors the output buffer can hold
 *	Return: HSAKMT_STATUS_SUCCESS on success,
 *		HSAKMT_STATUS_INVALID_PARAMETER when @cpuinfo is NULL,
 *		HSAKMT_STATUS_ERROR when /proc/cpuinfo cannot be opened,
 *		HSAKMT_STATUS_NO_MEMORY when a processor number does not fit
 *		in @cpuinfo (parsing stops at that line)
 *
 *	Also sets the file-level processor_vendor the first time a vendor line
 *	is seen; it falls back to GENUINE_INTEL when none is recognized.
 *	Lines that do not have the expected "token : value" shape are skipped.
 */
static HSAKMT_STATUS topology_parse_cpuinfo(struct proc_cpuinfo *cpuinfo,
					    uint32_t num_procs)
{
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	FILE *fd;
	char read_buf[256];
	char *p;
	uint32_t proc = 0;
	size_t p_len;
	const char *proc_cpuinfo_path = "/proc/cpuinfo";

	if (!cpuinfo) {
		pr_err("CPU information will be missing\n");
		return HSAKMT_STATUS_INVALID_PARAMETER;
	}

	fd = fopen(proc_cpuinfo_path, "r");
	if (!fd) {
		pr_err("Failed to open [%s]. Unable to get CPU information",
			proc_cpuinfo_path);
		return HSAKMT_STATUS_ERROR;
	}

#ifdef __PPC64__
	char *p2;

	/* Each line in /proc/cpuinfo that read_buf is constructed, the format
	 * is like this:
	 * "token       : value\n"
	 * where token is our target like vendor_id, model name, apicid ...
	 * and value is the answer
	 */
	while (fgets(read_buf, sizeof(read_buf), fd)) {
		/* processor number */
		if (!strncmp("processor ", read_buf, sizeof("processor ") - 1)) {
			p = topology_cpuinfo_value(read_buf);
			if (!p)
				continue;
			proc = atoi(p);
			if (proc >= num_procs) {
				pr_warn("cpuinfo contains processor %u larger than %u\n",
					proc, num_procs);
				ret = HSAKMT_STATUS_NO_MEMORY;
				goto exit;
			}
			continue;
		}

		/* vendor name / model name */
		if (!strncmp("cpu ", read_buf, sizeof("cpu ") - 1) &&
			(processor_vendor == -1)) {
			p = topology_cpuinfo_value(read_buf);
			if (!p)
				continue;
			processor_vendor = topology_search_processor_vendor(p);

			/* Cut the value right after its first ',' */
			p2 = strchr(p, ',');
			if (p2 != NULL) {
				p2++;
				*p2 = 0;
			}

			p_len = strlen(p);
			if (p_len == 0) {
				cpuinfo[proc].model_name[0] = '\0';
			} else if (p_len < HSA_PUBLIC_NAME_SIZE) {
				/* -1 drops the last character: the '\n', or
				 * the ',' kept by the truncation above
				 */
				strncpy(cpuinfo[proc].model_name, p, p_len - 1);
				cpuinfo[proc].model_name[p_len - 1] = '\0';
			} else {
				/* Too long: keep what fits and terminate */
				strncpy(cpuinfo[proc].model_name, p,
					HSA_PUBLIC_NAME_SIZE - 1);
				cpuinfo[proc].model_name[HSA_PUBLIC_NAME_SIZE - 1] = '\0';
			}
			continue;
		}
	}
#else
	/* Each line in /proc/cpuinfo that read_buf is constructed, the format
	 * is like this:
	 * "token       : value\n"
	 * where token is our target like vendor_id, model name, apicid ...
	 * and value is the answer
	 */
	while (fgets(read_buf, sizeof(read_buf), fd)) {
		/* processor number */
		if (!strncmp("processor", read_buf, sizeof("processor") - 1)) {
			p = topology_cpuinfo_value(read_buf);
			if (!p)
				continue;
			proc = atoi(p);
			if (proc >= num_procs) {
				pr_warn("cpuinfo contains processor %u larger than %u\n",
					proc, num_procs);
				ret = HSAKMT_STATUS_NO_MEMORY;
				goto exit;
			}
			continue;
		}

		/* vendor name */
		if (!strncmp("vendor_id", read_buf, sizeof("vendor_id") - 1) &&
			(processor_vendor == -1)) {
			p = topology_cpuinfo_value(read_buf);
			if (!p)
				continue;
			processor_vendor = topology_search_processor_vendor(p);
			continue;
		}

		/* model name */
		if (!strncmp("model name", read_buf, sizeof("model name") - 1)) {
			p = topology_cpuinfo_value(read_buf);
			if (!p)
				continue;
			p_len = strlen(p);
			if (p_len == 0) {
				/* Nothing after ": "; avoid writing to index -1 */
				cpuinfo[proc].model_name[0] = '\0';
				continue;
			}
			if (p_len > HSA_PUBLIC_NAME_SIZE)
				p_len = HSA_PUBLIC_NAME_SIZE;
			memcpy(cpuinfo[proc].model_name, p, p_len);
			/* Overwrites the trailing '\n' (or the last byte that
			 * fits) with the terminator
			 */
			cpuinfo[proc].model_name[p_len - 1] = '\0';
			continue;
		}

		/* apicid */
		if (!strncmp("apicid", read_buf, sizeof("apicid") - 1)) {
			p = topology_cpuinfo_value(read_buf);
			if (!p)
				continue;
			cpuinfo[proc].apicid = atoi(p);
		}
	}
#endif

	if (processor_vendor < 0) {
		pr_err("Failed to get Processor Vendor. Setting to %s",
			supported_processor_vendor_name[GENUINE_INTEL]);
		processor_vendor = GENUINE_INTEL;
	}

exit:
	fclose(fd);
	return ret;
}
/* Query libdrm/amdgpu for node properties that sysfs does not provide.
 * Opens the render node given by props->DrmRenderMinor, copies the marketing
 * name (when one is reported) into props->MarketingName, and fills in
 * FamilyID and Integrated from the GPU info query.
 * Return: 0 on success, -1 when @props is NULL or any DRM/amdgpu call fails.
 * The marketing name may already have been written when -1 is returned
 * because of a failing GPU info query.
 */
static int topology_get_node_props_from_drm(HsaNodeProperties *props)
{
	amdgpu_device_handle device_handle;
	struct amdgpu_gpu_info gpu_info;
	uint32_t major_version;
	uint32_t minor_version;
	const char *name;
	int status = -1;
	int drm_fd;
	int i;

	if (!props)
		return -1;

	drm_fd = drmOpenRender(props->DrmRenderMinor);
	if (drm_fd < 0)
		return -1;

	if (amdgpu_device_initialize(drm_fd, &major_version, &minor_version,
				     &device_handle) >= 0) {
		name = amdgpu_get_marketing_name(device_handle);
		if (name) {
			i = 0;
			while (name[i] != 0 && i < HSA_PUBLIC_NAME_SIZE - 1) {
				props->MarketingName[i] = name[i];
				i++;
			}
			props->MarketingName[i] = '\0';
		}

		if (amdgpu_query_gpu_info(device_handle, &gpu_info) == 0) {
			props->FamilyID = gpu_info.family_id;
			props->Integrated =
				(gpu_info.ids_flags & AMDGPU_IDS_FLAGS_FUSION) != 0;
			status = 0;
		}

		amdgpu_device_deinitialize(device_handle);
	}

	drmClose(drm_fd);
	return status;
}
/* topology_sysfs_get_node_props - Read the "properties" file of a topology
 *	node and fill in its HsaNodeProperties
 *	@node_id [IN]: user node id; mapped to the sysfs node id internally
 *	@props [OUT]: node properties to fill in (must not be NULL)
 *	@p2p_links [OUT]: optional; set to true when the node reports a
 *		p2p_links_count property
 *	@num_p2pLinks [OUT]: optional; receives the p2p_links_count value
 *	Return: HSAKMT_STATUS_SUCCESS on success, the error of the node-id
 *		mapping or gpu-id lookup, HSAKMT_STATUS_NO_MEMORY on allocation
 *		failure, HSAKMT_STATUS_ERROR when the file cannot be opened or
 *		read or when HSA_OVERRIDE_GFX_VERSION is malformed
 *
 *	For nodes with compute cores the engine id is derived from the
 *	DID-based gfxip table when the device is listed there, otherwise from
 *	gfx_target_version. HSA_OVERRIDE_GFX_VERSION_<node_id> takes precedence
 *	over HSA_OVERRIDE_GFX_VERSION for OverrideEngineId.
 */
static HSAKMT_STATUS topology_sysfs_get_node_props(uint32_t node_id,
						   HsaNodeProperties *props,
						   bool *p2p_links,
						   uint32_t *num_p2pLinks)
{
	FILE *fd;
	char *read_buf, *p, *envvar, dummy = '\0';
	char prop_name[256];
	char path[256];
	char per_node_override[32];
	unsigned long long prop_val = 0;
	uint32_t prog, major = 0, minor = 0, step = 0;
	int read_size;
	const struct hsa_gfxip_table *hsa_gfxip;
	uint32_t sys_node_id;
	uint32_t gfxv = 0;
	uint8_t gfxv_major, gfxv_minor, gfxv_stepping;
	uint32_t simd_arrays_count = 0;

	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;

	assert(props);
	ret = topology_sysfs_map_node_id(node_id, &sys_node_id);
	if (ret != HSAKMT_STATUS_SUCCESS)
		return ret;

	/* Retrieve the GPU ID */
	ret = topology_sysfs_get_gpu_id(sys_node_id, &props->KFDGpuID);
	if (ret != HSAKMT_STATUS_SUCCESS)
		return ret;

	read_buf = malloc(PAGE_SIZE);
	if (!read_buf)
		return HSAKMT_STATUS_NO_MEMORY;

	/* Retrieve the node properties */
	snprintf(path, 256, KFD_SYSFS_PATH_NODES "/%d/properties", get_topology_dir(), sys_node_id);
	fd = fopen(path, "r");
	if (!fd) {
		free(read_buf);
		return HSAKMT_STATUS_ERROR;
	}

	read_size = fread(read_buf, 1, PAGE_SIZE, fd);
	if (read_size <= 0) {
		ret = HSAKMT_STATUS_ERROR;
		goto out;
	}

	/* Since we're using the buffer as a string, we make sure the string terminates */
	if (read_size >= PAGE_SIZE)
		read_size = PAGE_SIZE - 1;
	read_buf[read_size] = 0;

	/* Read the node properties. Each line is "<name> <value>"; %n reports
	 * how far to advance for the next line. The name is limited to 255
	 * characters so it can never overflow prop_name.
	 */
	prog = 0;
	p = read_buf;
	while (sscanf(p += prog, "%255s %llu\n%n", prop_name, &prop_val, &prog) == 2) {
		if (strcmp(prop_name, "cpu_cores_count") == 0)
			props->NumCPUCores = (uint32_t)prop_val;
		else if (strcmp(prop_name, "simd_count") == 0)
			props->NumFComputeCores = (uint32_t)prop_val;
		else if (strcmp(prop_name, "mem_banks_count") == 0)
			props->NumMemoryBanks = (uint32_t)prop_val;
		else if (strcmp(prop_name, "caches_count") == 0)
			props->NumCaches = (uint32_t)prop_val;
		else if (strcmp(prop_name, "io_links_count") == 0)
			props->NumIOLinks = (uint32_t)prop_val;
		else if (strcmp(prop_name, "p2p_links_count") == 0) {
			/* P2P links are counted together with the IO links */
			props->NumIOLinks += (uint32_t)prop_val;
			if (num_p2pLinks)
				*num_p2pLinks = (uint32_t)prop_val;
			if (p2p_links)
				*p2p_links = true;
		} else if (strcmp(prop_name, "cpu_core_id_base") == 0)
			props->CComputeIdLo = (uint32_t)prop_val;
		else if (strcmp(prop_name, "simd_id_base") == 0)
			props->FComputeIdLo = (uint32_t)prop_val;
		else if (strcmp(prop_name, "capability") == 0)
			props->Capability.Value = (uint32_t)prop_val;
		else if (strcmp(prop_name, "capability2") == 0)
			props->Capability2.Value = (uint32_t)prop_val;
		else if (strcmp(prop_name, "debug_prop") == 0)
			props->DebugProperties.Value = (uint64_t)prop_val;
		else if (strcmp(prop_name, "max_waves_per_simd") == 0)
			props->MaxWavesPerSIMD = (uint32_t)prop_val;
		else if (strcmp(prop_name, "lds_size_in_kb") == 0)
			props->LDSSizeInKB = (uint32_t)prop_val;
		else if (strcmp(prop_name, "gds_size_in_kb") == 0)
			props->GDSSizeInKB = (uint32_t)prop_val;
		else if (strcmp(prop_name, "wave_front_size") == 0)
			props->WaveFrontSize = (uint32_t)prop_val;
		else if (strcmp(prop_name, "array_count") == 0)
			simd_arrays_count = (uint32_t)prop_val;
		else if (strcmp(prop_name, "simd_arrays_per_engine") == 0)
			props->NumArrays = (uint32_t)prop_val;
		else if (strcmp(prop_name, "cu_per_simd_array") == 0)
			props->NumCUPerArray = (uint32_t)prop_val;
		else if (strcmp(prop_name, "simd_per_cu") == 0)
			props->NumSIMDPerCU = (uint32_t)prop_val;
		else if (strcmp(prop_name, "max_slots_scratch_cu") == 0)
			props->MaxSlotsScratchCU = (uint32_t)prop_val;
		else if (strcmp(prop_name, "fw_version") == 0)
			props->EngineId.Value = (uint32_t)prop_val & 0x3ff;
		else if (strcmp(prop_name, "vendor_id") == 0)
			props->VendorId = (uint32_t)prop_val;
		else if (strcmp(prop_name, "device_id") == 0)
			props->DeviceId = (uint32_t)prop_val;
		else if (strcmp(prop_name, "location_id") == 0)
			props->LocationId = (uint32_t)prop_val;
		else if (strcmp(prop_name, "domain") == 0)
			props->Domain = (uint32_t)prop_val;
		else if (strcmp(prop_name, "max_engine_clk_fcompute") == 0)
			props->MaxEngineClockMhzFCompute = (uint32_t)prop_val;
		else if (strcmp(prop_name, "max_engine_clk_ccompute") == 0)
			props->MaxEngineClockMhzCCompute = (uint32_t)prop_val;
		else if (strcmp(prop_name, "local_mem_size") == 0)
			props->LocalMemSize = prop_val;
		else if (strcmp(prop_name, "drm_render_minor") == 0)
			props->DrmRenderMinor = (int32_t)prop_val;
		else if (strcmp(prop_name, "sdma_fw_version") == 0)
			props->uCodeEngineVersions.Value = (uint32_t)prop_val & 0x3ff;
		else if (strcmp(prop_name, "hive_id") == 0)
			props->HiveID = prop_val;
		else if (strcmp(prop_name, "unique_id") == 0)
			props->UniqueID = prop_val;
		else if (strcmp(prop_name, "num_sdma_engines") == 0)
			props->NumSdmaEngines = prop_val;
		else if (strcmp(prop_name, "num_sdma_xgmi_engines") == 0)
			props->NumSdmaXgmiEngines = prop_val;
		else if (strcmp(prop_name, "num_gws") == 0)
			props->NumGws = prop_val;
		else if (strcmp(prop_name, "num_sdma_queues_per_engine") == 0)
			props->NumSdmaQueuesPerEngine = prop_val;
		else if (strcmp(prop_name, "num_cp_queues") == 0)
			props->NumCpQueues = prop_val;
		else if (strcmp(prop_name, "num_xcc") == 0)
			props->NumXcc = prop_val;
		else if (strcmp(prop_name, "family_id") == 0)
			props->FamilyID = prop_val;
		else if (strcmp(prop_name, "gfx_target_version") == 0)
			gfxv = (uint32_t)prop_val;
	}

	if (!hsakmt_is_svm_api_supported)
		props->Capability.ui32.SVMAPISupported = 0;

	/* Bail out early, if a CPU node */
	if (!props->NumFComputeCores)
		goto out;

	if (props->NumArrays != 0)
		props->NumShaderBanks = simd_arrays_count/props->NumArrays;

	gfxv_major = HSA_GET_GFX_VERSION_MAJOR(gfxv);
	gfxv_minor = HSA_GET_GFX_VERSION_MINOR(gfxv);
	gfxv_stepping = HSA_GET_GFX_VERSION_STEP(gfxv);

	hsa_gfxip = find_hsa_gfxip_device(props->DeviceId, gfxv_major);
	if (hsa_gfxip || gfxv) {
		/* The per-node override variable wins over the global one */
		snprintf(per_node_override, sizeof(per_node_override), "HSA_OVERRIDE_GFX_VERSION_%d", node_id);
		if ((envvar = getenv(per_node_override)) || (envvar = getenv("HSA_OVERRIDE_GFX_VERSION"))) {
			/* HSA_OVERRIDE_GFX_VERSION=major.minor.stepping
			 * The trailing %c makes sscanf return 4 when anything
			 * follows the stepping, which is rejected below.
			 */
			if ((sscanf(envvar, "%u.%u.%u%c",
					&major, &minor, &step, &dummy) != 3) ||
				(major > 63 || minor > 255 || step > 255)) {
				pr_err("HSA_OVERRIDE_GFX_VERSION %s is invalid\n",
					envvar);
				ret = HSAKMT_STATUS_ERROR;
				goto out;
			}
			props->OverrideEngineId.ui32.Major = major & 0x3f;
			props->OverrideEngineId.ui32.Minor = minor & 0xff;
			props->OverrideEngineId.ui32.Stepping = step & 0xff;
		}

		/* Prefer the table entry; fall back to gfx_target_version */
		if (hsa_gfxip) {
			props->EngineId.ui32.Major = hsa_gfxip->major & 0x3f;
			props->EngineId.ui32.Minor = hsa_gfxip->minor & 0xff;
			props->EngineId.ui32.Stepping = hsa_gfxip->stepping & 0xff;
		} else {
			props->EngineId.ui32.Major = gfxv_major & 0x3f;
			props->EngineId.ui32.Minor = gfxv_minor & 0xff;
			props->EngineId.ui32.Stepping = gfxv_stepping & 0xff;
		}

		/* Set the CAL name of the node. If DID-based hsa_gfxip lookup was
		 * successful, use that name. Otherwise, set to GFX<GFX_VERSION>.
		 */
		if (hsa_gfxip && hsa_gfxip->amd_name)
			strncpy((char *)props->AMDName, hsa_gfxip->amd_name,
				sizeof(props->AMDName)-1);
		else
			snprintf((char *)props->AMDName, sizeof(props->AMDName)-1, "GFX%06x",
				HSA_GET_GFX_VERSION_FULL(props->EngineId.ui32));

		/* Retrieve the marketing name, family id and integrated flag
		 * of the node through DRM. A failure is only logged.
		 */
		if (topology_get_node_props_from_drm(props))
			pr_info("failed to get marketing name for device ID 0x%x\n", props->DeviceId);

		/* Get VGPR/SGPR size in byte per CU */
		props->SGPRSizePerCU = SGPR_SIZE_PER_CU;
		props->VGPRSizePerCU = hsakmt_get_vgpr_size_per_cu(HSA_GET_GFX_VERSION_FULL(props->EngineId.ui32));

	} else if (props->DeviceId)
		/* still return success */
		pr_err("device ID 0x%x is not supported in libhsakmt\n",
				props->DeviceId);

	if (props->NumFComputeCores)
		assert(props->EngineId.ui32.Major && "HSA_OVERRIDE_GFX_VERSION may be needed");

	/* On Older kernels, num_xcc may not be present in system properties.
	 * Set it to 1 if system properties do not report num_xcc.
	 */
	if (!props->NumXcc)
		props->NumXcc = 1;

out:
	free(read_buf);
	fclose(fd);
	return ret;
}
/* topology_sysfs_get_mem_props - Read the properties of one memory bank
 *	@node_id [IN]: user node id; mapped to the sysfs node id internally
 *	@mem_id [IN]: index of the memory bank under that node
 *	@props [OUT]: memory properties to fill in (must not be NULL)
 *	Return: HSAKMT_STATUS_SUCCESS on success, the error of the node-id
 *		mapping, HSAKMT_STATUS_ERROR when the properties file cannot
 *		be opened or read, HSAKMT_STATUS_NO_MEMORY on allocation
 *		failure
 */
static HSAKMT_STATUS topology_sysfs_get_mem_props(uint32_t node_id,
						  uint32_t mem_id,
						  HsaMemoryProperties *props)
{
	FILE *fd;
	char *read_buf, *p;
	char prop_name[256];
	char path[256];
	unsigned long long prop_val;
	uint32_t prog;
	int read_size;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	uint32_t sys_node_id;

	assert(props);
	ret = topology_sysfs_map_node_id(node_id, &sys_node_id);
	if (ret != HSAKMT_STATUS_SUCCESS)
		return ret;

	snprintf(path, 256, KFD_SYSFS_PATH_NODES "/%d/mem_banks/%d/properties", get_topology_dir(), sys_node_id, mem_id);
	fd = fopen(path, "r");
	if (!fd)
		return HSAKMT_STATUS_ERROR;
	read_buf = malloc(PAGE_SIZE);
	if (!read_buf) {
		ret = HSAKMT_STATUS_NO_MEMORY;
		goto err1;
	}

	read_size = fread(read_buf, 1, PAGE_SIZE, fd);
	if (read_size <= 0) {
		ret = HSAKMT_STATUS_ERROR;
		goto err2;
	}

	/* Since we're using the buffer as a string, we make sure the string terminates */
	if (read_size >= PAGE_SIZE)
		read_size = PAGE_SIZE - 1;
	read_buf[read_size] = 0;

	/* Each line is "<name> <value>"; %n reports how far to advance. The
	 * name is limited to 255 characters so it cannot overflow prop_name.
	 */
	prog = 0;
	p = read_buf;
	while (sscanf(p += prog, "%255s %llu\n%n", prop_name, &prop_val, &prog) == 2) {
		if (strcmp(prop_name, "heap_type") == 0)
			props->HeapType = (uint32_t)prop_val;
		else if (strcmp(prop_name, "size_in_bytes") == 0)
			props->SizeInBytes = (uint64_t)prop_val;
		else if (strcmp(prop_name, "flags") == 0)
			props->Flags.MemoryProperty = (uint32_t)prop_val;
		else if (strcmp(prop_name, "width") == 0)
			props->Width = (uint32_t)prop_val;
		else if (strcmp(prop_name, "mem_clk_max") == 0)
			props->MemoryClockMax = (uint32_t)prop_val;
	}

err2:
	free(read_buf);
err1:
	fclose(fd);
	return ret;
}
/* topology_destroy_temp_cpu_cache_list -
 *	Release a list built by topology_create_temp_cpu_cache_list(): the
 *	cache_prop buffer of each of the first ->len entries, then the list
 *	itself. The entry count is read from the first element. A NULL list
 *	is accepted and ignored.
 */
static void topology_destroy_temp_cpu_cache_list(
			cpu_cacheinfo_t *temp_cpu_ci_list)
{
	uint32_t idx;

	if (!temp_cpu_ci_list)
		return;

	for (idx = 0; idx < temp_cpu_ci_list->len; idx++)
		free(temp_cpu_ci_list[idx].cache_prop);

	free(temp_cpu_ci_list);
}
/* topology_create_temp_cpu_cache_list - Create a temporary cpu-cache list to
 *		store cpu cache information. This list will be used to copy
 *		HsaCacheProperties in the CPU node. Two buffers are allocated
 *		inside this function: cpu_ci list and cache_prop under each
 *		cpu_ci. Must call topology_destroy_temp_cpu_cache_list to free
 *		the memory after the information is copied.
 *	@node [IN] CPU node number
 *	@cpuinfo [IN] /proc/cpuinfo data
 *	@temp_cpu_ci_list [OUT] cpu-cache-info list with data filled
 *	Return: total number of caches under this CPU node. On any failure
 *		0 is returned and *temp_cpu_ci_list is left NULL, so a
 *		non-zero return always comes with a valid list.
 */
static int topology_create_temp_cpu_cache_list(int node,
	struct proc_cpuinfo *cpuinfo, cpu_cacheinfo_t **temp_cpu_ci_list)
{
	/* Get max path size from /sys/devices/system/node/node%d/%s/cache
	 * below, which will max out according to the largest filename,
	 * which can be present twice in the string above. 29 is for the prefix
	 * and the +6 is for the cache suffix
	 */
#ifndef MAXNAMLEN
/* MAXNAMLEN is the BSD name for NAME_MAX. glibc aliases this as NAME_MAX, but not musl */
#define MAXNAMLEN NAME_MAX
#endif
	const uint32_t MAXPATHSIZE = 29 + MAXNAMLEN + (MAXNAMLEN + 6);
	cpu_cacheinfo_t *p_temp_cpu_ci_list; /* a list of cpu_ci */
	char path[MAXPATHSIZE], node_dir[MAXPATHSIZE];
	int max_cpus;
	int node_real = node;
	cpu_cacheinfo_t *this_cpu; /* one cpu_ci in cpu_ci_list */
	int cache_cnt = 0;
	DIR *dirp = NULL;
	struct dirent *dir;
	char *p;

	if (!temp_cpu_ci_list) {
		pr_err("Invalid temp_cpu_ci_list\n");
		goto exit;
	}
	*temp_cpu_ci_list = NULL;

	/* Get info from /sys/devices/system/node/nodeX/cpuY/cache.
	 * On IBM POWER9 the sysfs node number is the node number times 8.
	 */
	if (processor_vendor == IBM_POWER) {
		if (!strcmp(cpuinfo[0].model_name, "POWER9")) {
			node_real = node * 8;
		}
	}
	snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/node/node%d", node_real);
	/* Other than cpuY folders, this dir also has cpulist and cpumap */
	max_cpus = num_subdirs(node_dir, "cpu");
	if (max_cpus <= 0) {
		/* If CONFIG_NUMA is not enabled in the kernel,
		 * /sys/devices/system/node doesn't exist.
		 */
		if (node) { /* CPU node must be 0 or something is wrong */
			pr_err("Fail to get cpu* dirs under %s.", node_dir);
			goto exit;
		}
		/* Fall back to use /sys/devices/system/cpu */
		snprintf(node_dir, MAXPATHSIZE, "/sys/devices/system/cpu");
		max_cpus = num_subdirs(node_dir, "cpu");
		if (max_cpus <= 0) {
			pr_err("Fail to get cpu* dirs under %s\n", node_dir);
			goto exit;
		}
	}

	p_temp_cpu_ci_list = calloc(max_cpus, sizeof(cpu_cacheinfo_t));
	if (!p_temp_cpu_ci_list) {
		pr_err("Fail to allocate p_temp_cpu_ci_list\n");
		goto exit;
	}
	p_temp_cpu_ci_list->len = 0;

	this_cpu = p_temp_cpu_ci_list;
	dirp = opendir(node_dir);
	if (!dirp) {
		/* readdir() must not be called with a NULL stream */
		pr_err("Fail to open %s\n", node_dir);
		free(p_temp_cpu_ci_list);
		goto exit;
	}

	while ((dir = readdir(dirp)) != 0) {
		if (strncmp(dir->d_name, "cpu", 3))
			continue;
		if (!isdigit((unsigned char)dir->d_name[3])) /* ignore files like cpulist */
			continue;

		/* Never write past the max_cpus entries allocated above, even
		 * if the directory content changed since it was counted.
		 */
		if (p_temp_cpu_ci_list->len >= (uint32_t)max_cpus) {
			pr_warn("More cpu dirs under %s than expected\n", node_dir);
			break;
		}

		snprintf(path, MAXPATHSIZE, "%s/%s/cache", node_dir, dir->d_name);
		this_cpu->num_caches = num_subdirs(path, "index");
		this_cpu->cache_prop = calloc(this_cpu->num_caches,
					sizeof(HsaCacheProperties));
		if (!this_cpu->cache_prop) {
			pr_err("Fail to allocate cache_info\n");
			/* Drop what was built so far and report no caches;
			 * otherwise the caller would see a cache count
			 * without a list and the partial list would leak.
			 */
			topology_destroy_temp_cpu_cache_list(p_temp_cpu_ci_list);
			cache_cnt = 0;
			goto exit;
		}
		p = &dir->d_name[3];
		this_cpu->proc_num = atoi(p);
		cache_cnt += get_cpu_cache_info(path, cpuinfo, this_cpu);
		++p_temp_cpu_ci_list->len;
		++this_cpu;
	}
	*temp_cpu_ci_list = p_temp_cpu_ci_list;

exit:
	if (dirp)
		closedir(dirp);
	return cache_cnt;
}
/* topology_get_cpu_cache_props - Read CPU cache information from sysfs
 *	@node [IN] CPU node number
 *	@cpuinfo [IN] /proc/cpuinfo data
 *	@tbl [OUT] the node table to fill up; node.NumCaches and cache are set
 *	Return: HSAKMT_STATUS_SUCCESS in success (including the case where no
 *		cache information is available) or error number in failure
 */
static HSAKMT_STATUS topology_get_cpu_cache_props(int node,
			struct proc_cpuinfo *cpuinfo, node_props_t *tbl)
{
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	cpu_cacheinfo_t *cpu_ci_list = NULL;
	uint32_t n, cache_cnt, i;
	cpu_cacheinfo_t *cpu_ci;
	HsaCacheProperties *this_cache;

	tbl->node.NumCaches = topology_create_temp_cpu_cache_list(
					node, cpuinfo, &cpu_ci_list);
	if (!tbl->node.NumCaches) {
		/* For "Intel Meteor lake Mobile", the cache info is not in sysfs,
		 * That means /sys/devices/system/node/node%d/%s/cache does not exist.
		 * Here AMD will not block on this issue.
		 */
		pr_debug("CPU cache info is not available for node %d \n", node);
		goto exit;
	}

	if (!cpu_ci_list) {
		/* A cache count without a list means list creation was
		 * abandoned part-way; there is nothing to copy from.
		 */
		tbl->node.NumCaches = 0;
		ret = HSAKMT_STATUS_NO_MEMORY;
		goto exit;
	}

	tbl->cache = calloc(tbl->node.NumCaches, sizeof(HsaCacheProperties));
	if (!tbl->cache) {
		ret = HSAKMT_STATUS_NO_MEMORY;
		goto exit;
	}

	/* Now fill in the information to cache properties. Copying stops as
	 * soon as NumCaches entries were written.
	 */
	cache_cnt = 0;
	cpu_ci = cpu_ci_list;
	for (n = 0; n < cpu_ci_list->len; n++, cpu_ci++) {
		this_cache = cpu_ci->cache_prop;
		for (i = 0; i < cpu_ci->num_caches; i++, this_cache++) {
			memcpy(&tbl->cache[cache_cnt++],
			       this_cache,
			       sizeof(HsaCacheProperties));
			if (cache_cnt >= tbl->node.NumCaches)
				goto exit;
		}
	}

exit:
	topology_destroy_temp_cpu_cache_list(cpu_ci_list);
	return ret;
}
/* topology_sysfs_get_cache_props - Read the properties of one cache
 *	@node_id [IN]: user node id; mapped to the sysfs node id internally
 *	@cache_id [IN]: index of the cache under that node
 *	@props [OUT]: cache properties to fill in (must not be NULL)
 *	Return: HSAKMT_STATUS_SUCCESS on success, the error of the node-id
 *		mapping, HSAKMT_STATUS_ERROR when the properties file cannot
 *		be opened or read, HSAKMT_STATUS_NO_MEMORY on allocation
 *		failure
 */
static HSAKMT_STATUS topology_sysfs_get_cache_props(uint32_t node_id,
						    uint32_t cache_id,
						    HsaCacheProperties *props)
{
	FILE *fd;
	char *read_buf, *p;
	char prop_name[256];
	char path[256];
	unsigned long long prop_val;
	uint32_t i, prog;
	int read_size;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	uint32_t sys_node_id;

	assert(props);
	ret = topology_sysfs_map_node_id(node_id, &sys_node_id);
	if (ret != HSAKMT_STATUS_SUCCESS)
		return ret;

	snprintf(path, 256, KFD_SYSFS_PATH_NODES "/%d/caches/%d/properties", get_topology_dir(), sys_node_id, cache_id);
	fd = fopen(path, "r");
	if (!fd)
		return HSAKMT_STATUS_ERROR;
	read_buf = malloc(PAGE_SIZE);
	if (!read_buf) {
		ret = HSAKMT_STATUS_NO_MEMORY;
		goto err1;
	}

	read_size = fread(read_buf, 1, PAGE_SIZE, fd);
	if (read_size <= 0) {
		ret = HSAKMT_STATUS_ERROR;
		goto err2;
	}

	/* Since we're using the buffer as a string, we make sure the string terminates */
	if (read_size >= PAGE_SIZE)
		read_size = PAGE_SIZE - 1;
	read_buf[read_size] = 0;

	/* Each line is "<name> <value>"; %n reports how far to advance. The
	 * name is limited to 255 characters so it cannot overflow prop_name.
	 * The loop stops at sibling_map with p still pointing at the start of
	 * that line, because its value is a list and is parsed separately.
	 */
	prog = 0;
	p = read_buf;
	while (sscanf(p += prog, "%255s %llu\n%n", prop_name, &prop_val, &prog) == 2) {
		if (strcmp(prop_name, "processor_id_low") == 0)
			props->ProcessorIdLow = (uint32_t)prop_val;
		else if (strcmp(prop_name, "level") == 0)
			props->CacheLevel = (uint32_t)prop_val;
		else if (strcmp(prop_name, "size") == 0)
			props->CacheSize = (uint32_t)prop_val;
		else if (strcmp(prop_name, "cache_line_size") == 0)
			props->CacheLineSize = (uint32_t)prop_val;
		else if (strcmp(prop_name, "cache_lines_per_tag") == 0)
			props->CacheLinesPerTag = (uint32_t)prop_val;
		else if (strcmp(prop_name, "association") == 0)
			props->CacheAssociativity = (uint32_t)prop_val;
		else if (strcmp(prop_name, "latency") == 0)
			props->CacheLatency = (uint32_t)prop_val;
		else if (strcmp(prop_name, "type") == 0)
			props->CacheType.Value = (uint32_t)prop_val;
		else if (strcmp(prop_name, "sibling_map") == 0)
			break;
	}

	/* prog stays 0 unless p really points at a "sibling_map " line */
	prog = 0;
	if ((sscanf(p, "sibling_map %n", &prog)) == 0 && prog) {
		for (i = 0; i < HSA_CPU_SIBLINGS; i++) {
			p += prog;
			/* %n is only written when the separator after the
			 * value matched; reset it so a stale offset is never
			 * reused to step past the end of the string.
			 */
			prog = 0;
			if (sscanf(p, "%u%*[,\n]%n", &props->SiblingMap[i], &prog) != 1)
				break;
			if (!prog) /* last value, no trailing separator */
				break;
		}
	}

err2:
	free(read_buf);
err1:
	fclose(fd);
	return ret;
}
/* Reverse lookup in map_user_to_sysfs_node_id[]: find the user node id whose
 * entry equals @sys_node_id and store it in @user_node_id.
 * Return: HSAKMT_STATUS_SUCCESS when found, otherwise
 *	HSAKMT_STATUS_INVALID_NODE_UNIT (and @user_node_id is left untouched).
 */
static HSAKMT_STATUS topology_map_sysfs_to_user_node_id(uint32_t sys_node_id, uint32_t *user_node_id)
{
	uint32_t idx = 0;

	while (idx < map_user_to_sysfs_node_id_size) {
		if (map_user_to_sysfs_node_id[idx] == sys_node_id) {
			*user_node_id = idx;
			return HSAKMT_STATUS_SUCCESS;
		}
		idx++;
	}

	return HSAKMT_STATUS_INVALID_NODE_UNIT;
}
/* For a given Node @node_id the function gets @iolink_id information, i.e. it parses the following sysfs entry:
 * ./nodes/@node_id/io_links/@iolink_id/properties (or .../p2p_links/... when @p2pLink is true).
 * @node_id has to be a valid accessible node.
 *
 * If node_to specified by the @iolink_id is not accessible the function clears @props and returns
 * HSAKMT_STATUS_NOT_SUPPORTED. If node_to is accessible, then node_to is mapped from sysfs_node to
 * user_node and the function returns HSAKMT_STATUS_SUCCESS.
 *
 * Other returns: the error of the node-id mapping, HSAKMT_STATUS_ERROR when the file cannot be opened
 * or read (HSAKMT_STATUS_NOT_SUPPORTED when the read fails with EPERM), HSAKMT_STATUS_NO_MEMORY on
 * allocation failure, HSAKMT_STATUS_INVALID_NODE_UNIT when node_from does not match @node_id.
 */
static HSAKMT_STATUS topology_sysfs_get_iolink_props(uint32_t node_id,
						     uint32_t iolink_id,
						     HsaIoLinkProperties *props, bool p2pLink)
{
	FILE *fd;
	char *read_buf, *p;
	char prop_name[256];
	char path[256];
	unsigned long long prop_val;
	uint32_t prog;
	int read_size;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	uint32_t sys_node_id;

	assert(props);
	ret = topology_sysfs_map_node_id(node_id, &sys_node_id);
	if (ret != HSAKMT_STATUS_SUCCESS)
		return ret;

	snprintf(path, 256, KFD_SYSFS_PATH_NODES "/%d/%s/%d/properties", get_topology_dir(), sys_node_id, p2pLink ? "p2p_links" : "io_links", iolink_id);
	fd = fopen(path, "r");
	if (!fd)
		return HSAKMT_STATUS_ERROR;
	read_buf = malloc(PAGE_SIZE);
	if (!read_buf) {
		ret = HSAKMT_STATUS_NO_MEMORY;
		goto err1;
	}

	/* Clear errno so the EPERM test below only sees an error raised by
	 * this read, not one left over from an earlier call.
	 */
	errno = 0;
	read_size = fread(read_buf, 1, PAGE_SIZE, fd);
	if (read_size <= 0) {
		ret = (errno == EPERM) ? HSAKMT_STATUS_NOT_SUPPORTED :
					 HSAKMT_STATUS_ERROR;
		goto err2;
	}

	/* Since we're using the buffer as a string, we make sure the string terminates */
	if (read_size >= PAGE_SIZE)
		read_size = PAGE_SIZE - 1;
	read_buf[read_size] = 0;

	/* Each line is "<name> <value>"; %n reports how far to advance. The
	 * name is limited to 255 characters so it cannot overflow prop_name.
	 */
	prog = 0;
	p = read_buf;
	while (sscanf(p += prog, "%255s %llu\n%n", prop_name, &prop_val, &prog) == 2) {
		if (strcmp(prop_name, "type") == 0)
			props->IoLinkType = (uint32_t)prop_val;
		else if (strcmp(prop_name, "version_major") == 0)
			props->VersionMajor = (uint32_t)prop_val;
		else if (strcmp(prop_name, "version_minor") == 0)
			props->VersionMinor = (uint32_t)prop_val;
		else if (strcmp(prop_name, "node_from") == 0) {
			if (sys_node_id != (uint32_t)prop_val) {
				ret = HSAKMT_STATUS_INVALID_NODE_UNIT;
				goto err2;
			}
			props->NodeFrom = node_id;
		} else if (strcmp(prop_name, "node_to") == 0) {
			/* Start from "not supported" so the test below is
			 * well defined even if the check fails before
			 * writing its result.
			 */
			bool is_node_supported = false;
			uint32_t sysfs_node_id;

			sysfs_node_id = (uint32_t)prop_val;
			ret = topology_sysfs_check_node_supported(sysfs_node_id, &is_node_supported);
			if (!is_node_supported) {
				ret = HSAKMT_STATUS_NOT_SUPPORTED;
				memset(props, 0, sizeof(*props));
				goto err2;
			}
			ret = topology_map_sysfs_to_user_node_id(sysfs_node_id, &props->NodeTo);
			if (ret != HSAKMT_STATUS_SUCCESS)
				goto err2;
		} else if (strcmp(prop_name, "weight") == 0)
			props->Weight = (uint32_t)prop_val;
		else if (strcmp(prop_name, "min_latency") == 0)
			props->MinimumLatency = (uint32_t)prop_val;
		else if (strcmp(prop_name, "max_latency") == 0)
			props->MaximumLatency = (uint32_t)prop_val;
		else if (strcmp(prop_name, "min_bandwidth") == 0)
			props->MinimumBandwidth = (uint32_t)prop_val;
		else if (strcmp(prop_name, "max_bandwidth") == 0)
			props->MaximumBandwidth = (uint32_t)prop_val;
		else if (strcmp(prop_name, "recommended_transfer_size") == 0)
			props->RecTransferSize = (uint32_t)prop_val;
		else if (strcmp(prop_name, "recommended_sdma_engine_id_mask") == 0)
			props->RecSdmaEngIdMask = (uint32_t)prop_val;
		else if (strcmp(prop_name, "flags") == 0)
			props->Flags.LinkProperty = (uint32_t)prop_val;
	}

err2:
	free(read_buf);
err1:
	fclose(fd);
	return ret;
}
/* topology_get_free_io_link_slot_for_node - Return the first unused entry of
 *	the io_link array of @node_id, i.e. the one at index NumIOLinks.
 *	Return: NULL when @node_id is out of range, when the node has no
 *		io_link array, or when the node already holds NumNodes - 1
 *		links
 */
static HsaIoLinkProperties *topology_get_free_io_link_slot_for_node(uint32_t node_id,
					    const HsaSystemProperties *sys_props,
					    node_props_t *node_props)
{
	node_props_t *entry;

	if (node_id >= sys_props->NumNodes) {
		pr_err("Invalid node [%d]\n", node_id);
		return NULL;
	}

	entry = &node_props[node_id];
	if (!entry->link) {
		pr_err("No io_link reported for Node [%d]\n", node_id);
		return NULL;
	}

	if (entry->node.NumIOLinks >= sys_props->NumNodes - 1) {
		pr_err("No more space for io_link for Node [%d]\n", node_id);
		return NULL;
	}

	return entry->link + entry->node.NumIOLinks;
}
/* topology_add_io_link_for_node - Append an io_link to @node_from when a free
 *	slot is available and bump that node's NumIOLinks.
 *	Only type, endpoints and weight of the new link are written.
 *	Return: HSAKMT_STATUS_SUCCESS, or HSAKMT_STATUS_NO_MEMORY when no slot
 *		could be obtained
 * TODO: Add other members of HsaIoLinkProperties
 */
static HSAKMT_STATUS topology_add_io_link_for_node(uint32_t node_from,
						   const HsaSystemProperties *sys_props,
						   node_props_t *node_props,
						   HSA_IOLINKTYPE IoLinkType,
						   uint32_t node_to,
						   uint32_t Weight)
{
	HsaIoLinkProperties *slot =
		topology_get_free_io_link_slot_for_node(node_from, sys_props,
							node_props);

	if (slot == NULL)
		return HSAKMT_STATUS_NO_MEMORY;

	slot->NodeFrom = node_from;
	slot->NodeTo = node_to;
	slot->IoLinkType = IoLinkType;
	slot->Weight = Weight;

	node_props[node_from].node.NumIOLinks += 1;

	return HSAKMT_STATUS_SUCCESS;
}
/* Find the CPU that this GPU (gpu_node) directly connects to.
 * Scans the GPU's io_links for a PCIe or XGMI link with weight <= 20 (a
 * larger weight is GPU->CPU->GPU) whose target node has no KFDGpuID.
 * Return: that target node id, or -1 when @gpu_node is not a GPU, has no
 * links, or no such link exists.
 */
static int32_t gpu_get_direct_link_cpu(uint32_t gpu_node, node_props_t *node_props)
{
	const node_props_t *gpu = &node_props[gpu_node];
	const HsaIoLinkProperties *link = gpu->link;
	uint32_t idx;

	if (!gpu->node.KFDGpuID || !link || gpu->node.NumIOLinks == 0)
		return -1;

	for (idx = 0; idx < gpu->node.NumIOLinks; idx++, link++) {
		if (link->IoLinkType != HSA_IOLINKTYPE_PCIEXPRESS &&
		    link->IoLinkType != HSA_IOLINK_TYPE_XGMI)
			continue;
		if (link->Weight > 20) /* >20 is GPU->CPU->GPU */
			continue;
		if (!node_props[link->NodeTo].node.KFDGpuID)
			return link->NodeTo;
	}

	return -1;
}
/* Get node1->node2 IO link information. This should be a direct link that has
 * been created in the kernel.
 * @weight and @type are optional outputs; they are only written when a link
 * from @node1 to @node2 is found.
 * Return: HSAKMT_STATUS_SUCCESS when found, HSAKMT_STATUS_INVALID_NODE_UNIT
 * when @node1 has no link array, HSAKMT_STATUS_INVALID_PARAMETER when no link
 * targets @node2.
 */
static HSAKMT_STATUS get_direct_iolink_info(uint32_t node1, uint32_t node2,
					    node_props_t *node_props, HSAuint32 *weight,
					    HSA_IOLINKTYPE *type)
{
	const HsaIoLinkProperties *link = node_props[node1].link;
	uint32_t idx;

	if (link == NULL)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	for (idx = 0; idx < node_props[node1].node.NumIOLinks; idx++, link++) {
		if (link->NodeTo != node2)
			continue;

		if (weight)
			*weight = link->Weight;
		if (type)
			*type = link->IoLinkType;
		return HSAKMT_STATUS_SUCCESS;
	}

	return HSAKMT_STATUS_INVALID_PARAMETER;
}
/* Compute the weight and type of an indirect node1->node2 link that goes
 * through the CPU(s) the GPUs are directly attached to.
 * @weight is reset to 0 and @type to HSA_IOLINKTYPE_UNDEFINED on entry;
 * @weight is only set to the sum of the hop weights on success, while @type
 * is taken from the CPU-side hop as soon as that hop is resolved.
 * Return: HSAKMT_STATUS_SUCCESS on success; an error when the nodes are
 * identical, both are CPUs, both share a non-zero HiveID, neither has a
 * directly linked CPU, a GPU destination has no public frame buffer heap,
 * the path crosses QPI sockets, or any hop lookup fails.
 */
static HSAKMT_STATUS get_indirect_iolink_info(uint32_t node1, uint32_t node2,
					      node_props_t *node_props, HSAuint32 *weight,
					      HSA_IOLINKTYPE *type)
{
	const bool src_is_gpu = node_props[node1].node.KFDGpuID != 0;
	const bool dst_is_gpu = node_props[node2].node.KFDGpuID != 0;
	int32_t dir_cpu1 = -1, dir_cpu2 = -1;
	HSAuint32 weight1 = 0, weight2 = 0, weight3 = 0;
	HSAKMT_STATUS ret;
	uint32_t bank;

	*weight = 0;
	*type = HSA_IOLINKTYPE_UNDEFINED;

	if (node1 == node2)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	/* CPU->CPU is not an indirect link */
	if (!src_is_gpu && !dst_is_gpu)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	/* Nodes sharing the same non-zero hive id are not handled here */
	if (node_props[node1].node.HiveID &&
	    node_props[node2].node.HiveID &&
	    node_props[node1].node.HiveID == node_props[node2].node.HiveID)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (src_is_gpu)
		dir_cpu1 = gpu_get_direct_link_cpu(node1, node_props);
	if (dst_is_gpu)
		dir_cpu2 = gpu_get_direct_link_cpu(node2, node_props);

	if (dir_cpu1 < 0 && dir_cpu2 < 0)
		return HSAKMT_STATUS_ERROR;

	/* if the node2(dst) is GPU , it need to be large bar for host access*/
	if (dst_is_gpu) {
		bank = 0;
		while (bank < node_props[node2].node.NumMemoryBanks &&
		       node_props[node2].mem[bank].HeapType !=
				HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC)
			++bank;
		if (bank >= node_props[node2].node.NumMemoryBanks)
			return HSAKMT_STATUS_ERROR;
	}

	/* Possible topology:
	 *   GPU --(weight1) -- CPU -- (weight2) -- GPU
	 *   GPU --(weight1) -- CPU -- (weight2) -- CPU -- (weight3) -- GPU
	 *   GPU --(weight1) -- CPU -- (weight2) -- CPU
	 *   CPU -- (weight2) -- CPU -- (weight3) -- GPU
	 */
	if (dir_cpu1 >= 0) { /* GPU->CPU ... */
		/* The first hop is the same for every GPU->CPU... shape */
		ret = get_direct_iolink_info(node1, dir_cpu1, node_props,
					     &weight1, NULL);
		if (ret != HSAKMT_STATUS_SUCCESS)
			return ret;

		if (dir_cpu2 < 0 || dir_cpu1 == dir_cpu2) {
			/* GPU->CPU->CPU, or GPU->CPU->GPU via a shared CPU */
			ret = get_direct_iolink_info(dir_cpu1, node2, node_props,
						     &weight2, type);
		} else { /* GPU->CPU->CPU->GPU */
			ret = get_direct_iolink_info(dir_cpu1, dir_cpu2,
						     node_props, &weight2, type);
			if (ret != HSAKMT_STATUS_SUCCESS)
				return ret;
			/* On QPI interconnection, GPUs can't access
			 * each other if they are attached to different
			 * CPU sockets. CPU<->CPU weight larger than 20
			 * means the two CPUs are in different sockets.
			 */
			if (*type == HSA_IOLINK_TYPE_QPI_1_1 && weight2 > 20)
				return HSAKMT_STATUS_NOT_SUPPORTED;
			ret = get_direct_iolink_info(dir_cpu2, node2,
						     node_props, &weight3, NULL);
		}
	} else { /* CPU->CPU->GPU */
		ret = get_direct_iolink_info(node1, dir_cpu2, node_props,
					     &weight2, type);
		if (ret != HSAKMT_STATUS_SUCCESS)
			return ret;
		ret = get_direct_iolink_info(dir_cpu2, node2, node_props,
					     &weight3, NULL);
	}

	if (ret != HSAKMT_STATUS_SUCCESS)
		return ret;

	*weight = weight1 + weight2 + weight3;
	return HSAKMT_STATUS_SUCCESS;
}
/* For every unordered pair of nodes, try to add an indirect io_link in each
 * direction. A direction is skipped when get_indirect_iolink_info() reports
 * a zero weight for it; a failure to store a link is only logged.
 */
static void topology_create_indirect_gpu_links(const HsaSystemProperties *sys_props,
					       node_props_t *node_props)
{
	uint32_t i, j;
	HSAuint32 weight;
	HSA_IOLINKTYPE type;

	/* "i + 1 < NumNodes" rather than "i < NumNodes - 1": the latter wraps
	 * around when NumNodes is 0 and would spin through the whole
	 * uint32_t range.
	 */
	for (i = 0; i + 1 < sys_props->NumNodes; i++) {
		for (j = i + 1; j < sys_props->NumNodes; j++) {
			/* i -> j; weight is reset to 0 by the callee on entry */
			get_indirect_iolink_info(i, j, node_props, &weight, &type);
			if (weight &&
			    topology_add_io_link_for_node(i, sys_props, node_props,
					type, j, weight) != HSAKMT_STATUS_SUCCESS)
				pr_err("Fail to add IO link %d->%d\n", i, j);

			/* j -> i */
			get_indirect_iolink_info(j, i, node_props, &weight, &type);
			if (weight &&
			    topology_add_io_link_for_node(j, sys_props, node_props,
					type, i, weight) != HSAKMT_STATUS_SUCCESS)
				pr_err("Fail to add IO link %d->%d\n", j, i);
		}
	}
}
/* Build a fresh snapshot of the topology and publish it in g_system/g_props.
 *
 * The sysfs generation counter is read before and after the scan; when the
 * two values differ the partially built snapshot is discarded and the scan
 * is restarted from the "retry" label.
 *
 * Returns HSAKMT_STATUS_SUCCESS when the snapshot was published,
 * HSAKMT_STATUS_NO_MEMORY when an allocation failed, or the status returned
 * by the failing sysfs helper. The CPU info scratch buffer is released on
 * every exit path that goes through the "err" label, including success.
 */
HSAKMT_STATUS topology_take_snapshot(void)
{
uint32_t gen_start, gen_end, i, mem_id, cache_id;
HsaSystemProperties sys_props;
node_props_t *temp_props = 0;
HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
struct proc_cpuinfo *cpuinfo;
const uint32_t num_procs = get_nprocs();
uint32_t num_ioLinks;
bool p2p_links = false;
uint32_t num_p2pLinks = 0;
/* One proc_cpuinfo entry per processor reported by get_nprocs() */
cpuinfo = calloc(num_procs, sizeof(struct proc_cpuinfo));
if (!cpuinfo) {
pr_err("Fail to allocate memory for CPU info\n");
return HSAKMT_STATUS_NO_MEMORY;
}
topology_parse_cpuinfo(cpuinfo, num_procs);
/* NOTE(review): p2p_links and num_p2pLinks keep their values from the
 * previous attempt when the scan is restarted here - confirm that
 * topology_sysfs_get_node_props() always rewrites them.
 */
retry:
ret = topology_sysfs_get_generation(&gen_start);
if (ret != HSAKMT_STATUS_SUCCESS)
goto err;
ret = hsakmt_topology_sysfs_get_system_props(&sys_props);
if (ret != HSAKMT_STATUS_SUCCESS)
goto err;
if (sys_props.NumNodes > 0) {
temp_props = calloc(sys_props.NumNodes * sizeof(node_props_t), 1);
if (!temp_props) {
ret = HSAKMT_STATUS_NO_MEMORY;
goto err;
}
for (i = 0; i < sys_props.NumNodes; i++) {
ret = topology_sysfs_get_node_props(i,
&temp_props[i].node,
&p2p_links, &num_p2pLinks);
if (ret != HSAKMT_STATUS_SUCCESS) {
/* Node i owns no sub-arrays yet, so only the i nodes before it are passed */
free_properties(temp_props, i);
goto err;
}
if (temp_props[i].node.NumCPUCores)
topology_get_cpu_model_name(&temp_props[i].node,
cpuinfo, num_procs);
/* Memory banks reported for this node */
if (temp_props[i].node.NumMemoryBanks) {
temp_props[i].mem = calloc(temp_props[i].node.NumMemoryBanks * sizeof(HsaMemoryProperties), 1);
if (!temp_props[i].mem) {
ret = HSAKMT_STATUS_NO_MEMORY;
free_properties(temp_props, i + 1);
goto err;
}
for (mem_id = 0; mem_id < temp_props[i].node.NumMemoryBanks; mem_id++) {
ret = topology_sysfs_get_mem_props(i, mem_id, &temp_props[i].mem[mem_id]);
if (ret != HSAKMT_STATUS_SUCCESS) {
free_properties(temp_props, i + 1);
goto err;
}
}
}
/* Caches: read from sysfs when the node reports any; otherwise, for a
 * node without a KFD GPU id, fall back to the CPU cache helper.
 */
if (temp_props[i].node.NumCaches) {
temp_props[i].cache = calloc(temp_props[i].node.NumCaches * sizeof(HsaCacheProperties), 1);
if (!temp_props[i].cache) {
ret = HSAKMT_STATUS_NO_MEMORY;
free_properties(temp_props, i + 1);
goto err;
}
for (cache_id = 0; cache_id < temp_props[i].node.NumCaches; cache_id++) {
ret = topology_sysfs_get_cache_props(i, cache_id, &temp_props[i].cache[cache_id]);
if (ret != HSAKMT_STATUS_SUCCESS) {
free_properties(temp_props, i + 1);
goto err;
}
}
} else if (!temp_props[i].node.KFDGpuID) { /* a CPU node */
ret = topology_get_cpu_cache_props(
i, cpuinfo, &temp_props[i]);
if (ret != HSAKMT_STATUS_SUCCESS) {
free_properties(temp_props, i + 1);
goto err;
}
}
/* To simplify, allocate maximum needed memory for io_links for each node. This
* removes the need for realloc when indirect and QPI links are added later
*/
/* NOTE(review): with a single node this is calloc(0, ...), whose result
 * may legitimately be NULL and would then be reported as NO_MEMORY.
 */
temp_props[i].link = calloc(sys_props.NumNodes - 1, sizeof(HsaIoLinkProperties));
if (!temp_props[i].link) {
ret = HSAKMT_STATUS_NO_MEMORY;
free_properties(temp_props, i + 1);
goto err;
}
/* NumIOLinks minus the p2p count gives the number of plain io links */
num_ioLinks = temp_props[i].node.NumIOLinks - num_p2pLinks;
/* link_id is the next free slot in link[]; it is shared by both loops below */
uint32_t link_id = 0;
if (num_ioLinks) {
uint32_t sys_link_id = 0;
/* Parse all the sysfs specified io links. Skip the ones where the
* remote node (node_to) is not accessible
*/
while (sys_link_id < num_ioLinks &&
link_id < sys_props.NumNodes - 1) {
ret = topology_sysfs_get_iolink_props(i, sys_link_id++,
&temp_props[i].link[link_id], false);
if (ret == HSAKMT_STATUS_NOT_SUPPORTED) {
continue;
} else if (ret != HSAKMT_STATUS_SUCCESS) {
free_properties(temp_props, i + 1);
goto err;
}
link_id++;
}
/* sysfs specifies all the io links. Limit the number to valid ones */
temp_props[i].node.NumIOLinks = link_id;
}
if (num_p2pLinks) {
uint32_t sys_link_id = 0;
/* Parse all the sysfs specified p2p links.
*/
while (sys_link_id < num_p2pLinks &&
link_id < sys_props.NumNodes - 1) {
ret = topology_sysfs_get_iolink_props(i, sys_link_id++,
&temp_props[i].link[link_id], true);
if (ret == HSAKMT_STATUS_NOT_SUPPORTED) {
continue;
} else if (ret != HSAKMT_STATUS_SUCCESS) {
free_properties(temp_props, i + 1);
goto err;
}
link_id++;
}
/* Total now covers the io links kept above plus the accepted p2p links */
temp_props[i].node.NumIOLinks = link_id;
}
}
}
if (!p2p_links) {
/* All direct IO links are created in the kernel. Here we need to
* connect GPU<->GPU or GPU<->CPU indirect IO links.
*/
topology_create_indirect_gpu_links(&sys_props, temp_props);
}
/* Re-read the generation: a change means the scan above may be inconsistent */
ret = topology_sysfs_get_generation(&gen_end);
if (ret != HSAKMT_STATUS_SUCCESS) {
free_properties(temp_props, sys_props.NumNodes);
goto err;
}
if (gen_start != gen_end) {
free_properties(temp_props, sys_props.NumNodes);
temp_props = 0;
goto retry;
}
/* Publish: g_system is allocated on first use and then overwritten in place */
if (!g_system) {
g_system = malloc(sizeof(HsaSystemProperties));
if (!g_system) {
free_properties(temp_props, sys_props.NumNodes);
ret = HSAKMT_STATUS_NO_MEMORY;
goto err;
}
}
*g_system = sys_props;
/* NOTE(review): only the top-level array of a previous g_props is freed
 * here, not via free_properties() - confirm no per-node arrays can be
 * attached to it at this point.
 */
if (g_props)
free(g_props);
g_props = temp_props;
/* Reached on success as well: ret is still HSAKMT_STATUS_SUCCESS then */
err:
free(cpuinfo);
return ret;
}
/* Drop the Snapshot of the HSA topology information. Assume lock is held.
 *
 * Releases g_props, g_system and the user-to-sysfs node id map and resets
 * the corresponding globals so that a later snapshot starts from scratch.
 */
void topology_drop_snapshot(void)
{
	if (!!g_system != !!g_props)
		pr_warn("Probably inconsistency?\n");

	if (g_props) {
		/* Remove state. In the inconsistent case warned about above
		 * g_system can be NULL while g_props is not; use a node count
		 * of 0 then instead of dereferencing a NULL g_system.
		 */
		free_properties(g_props, g_system ? g_system->NumNodes : 0);
		g_props = NULL;
	}

	free(g_system);
	g_system = NULL;

	if (map_user_to_sysfs_node_id) {
		free(map_user_to_sysfs_node_id);
		map_user_to_sysfs_node_id = NULL;
		map_user_to_sysfs_node_id_size = 0;
	}
}
/* Check that @nodeid indexes a node of the current snapshot.
 *
 * When @gpu_id is not NULL it receives the node's KFDGpuID.
 * Returns HSAKMT_STATUS_INVALID_NODE_UNIT when there is no snapshot or the
 * id is out of range, HSAKMT_STATUS_SUCCESS otherwise.
 */
HSAKMT_STATUS hsakmt_validate_nodeid(uint32_t nodeid, uint32_t *gpu_id)
{
	const bool have_snapshot = g_props && g_system;

	if (!have_snapshot || nodeid >= g_system->NumNodes)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	if (gpu_id != NULL)
		*gpu_id = g_props[nodeid].node.KFDGpuID;

	return HSAKMT_STATUS_SUCCESS;
}
/* Translate a KFD gpu id into the index of the node that carries it.
 *
 * @gpu_id:  KFDGpuID to look for
 * @node_id: receives the index of the first matching node
 *
 * Returns HSAKMT_STATUS_SUCCESS on a match. Returns
 * HSAKMT_STATUS_INVALID_NODE_UNIT when no node matches or when no snapshot
 * is available.
 */
HSAKMT_STATUS hsakmt_gpuid_to_nodeid(uint32_t gpu_id, uint32_t *node_id)
{
	/* 32-bit index: it is stored into a uint32_t output below */
	uint32_t node_idx;

	/* Same snapshot guard as hsakmt_get_device_id_by_gpu_id(); without it
	 * a call made before a snapshot exists dereferences NULL.
	 */
	if (!g_props || !g_system)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	for (node_idx = 0; node_idx < g_system->NumNodes; node_idx++) {
		if (g_props[node_idx].node.KFDGpuID == gpu_id) {
			*node_id = node_idx;
			return HSAKMT_STATUS_SUCCESS;
		}
	}

	return HSAKMT_STATUS_INVALID_NODE_UNIT;
}
/* Return the system properties, taking the topology snapshot on first use.
 *
 * If a snapshot already exists it is simply copied out. Otherwise a snapshot
 * is taken and the process apertures and doorbells are initialised; when one
 * of those steps fails, what was set up so far is torn down again and the
 * failing status is returned.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtAcquireSystemProperties(HsaSystemProperties *SystemProperties)
{
	HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;

	CHECK_KFD_OPEN();

	if (SystemProperties == NULL)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	pthread_mutex_lock(&hsakmt_mutex);

	/* An existing snapshot is reused: initialising twice would leak
	 * memory.
	 */
	if (!g_system) {
		status = topology_take_snapshot();
		if (status != HSAKMT_STATUS_SUCCESS)
			goto unlock;

		assert(g_system);

		if (hsakmt_use_model)
			model_init();

		status = hsakmt_fmm_init_process_apertures(g_system->NumNodes);
		if (status == HSAKMT_STATUS_SUCCESS) {
			status = hsakmt_init_process_doorbells(g_system->NumNodes);
			/* Doorbell setup failed: undo the apertures first */
			if (status != HSAKMT_STATUS_SUCCESS)
				hsakmt_fmm_destroy_process_apertures();
		}

		if (status != HSAKMT_STATUS_SUCCESS) {
			topology_drop_snapshot();
			goto unlock;
		}
	}

	*SystemProperties = *g_system;

unlock:
	pthread_mutex_unlock(&hsakmt_mutex);
	return status;
}
/* Tear down everything hsaKmtAcquireSystemProperties() set up, under
 * hsakmt_mutex: doorbells first, then the process apertures, then the
 * topology snapshot. Always returns HSAKMT_STATUS_SUCCESS.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtReleaseSystemProperties(void)
{
pthread_mutex_lock(&hsakmt_mutex);
/* Reverse of the setup order used when the properties were acquired */
hsakmt_destroy_process_doorbells();
hsakmt_fmm_destroy_process_apertures();
topology_drop_snapshot();
pthread_mutex_unlock(&hsakmt_mutex);
return HSAKMT_STATUS_SUCCESS;
}
/* Copy the snapshot's node properties for @NodeId into @NodeProperties.
 *
 * Returns HSAKMT_STATUS_ERROR when no snapshot exists or @NodeId is out of
 * range, HSAKMT_STATUS_SUCCESS otherwise. Takes no lock itself.
 */
HSAKMT_STATUS hsakmt_topology_get_node_props(HSAuint32 NodeId,
					     HsaNodeProperties *NodeProperties)
{
	if (g_system == NULL || g_props == NULL)
		return HSAKMT_STATUS_ERROR;

	if (NodeId >= g_system->NumNodes)
		return HSAKMT_STATUS_ERROR;

	*NodeProperties = g_props[NodeId].node;
	return HSAKMT_STATUS_SUCCESS;
}
/* Public query for a node's properties.
 *
 * The snapshot's properties are copied out and, for nodes with a non-zero
 * gpu id, NumMemoryBanks is increased by the dGPU or iGPU heap count plus
 * one more bank when an MMIO aperture is available for the GPU.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeProperties(HSAuint32 NodeId,
						HsaNodeProperties *NodeProperties)
{
	HSAKMT_STATUS status;
	uint32_t gpu_id;

	if (NodeProperties == NULL)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	CHECK_KFD_OPEN();

	pthread_mutex_lock(&hsakmt_mutex);

	status = hsakmt_validate_nodeid(NodeId, &gpu_id);
	if (status == HSAKMT_STATUS_SUCCESS)
		status = hsakmt_topology_get_node_props(NodeId, NodeProperties);

	/* For CPU only node don't add any additional GPU memory banks. */
	if (status == HSAKMT_STATUS_SUCCESS && gpu_id != 0) {
		uint64_t mmio_base, mmio_limit;

		if (hsakmt_is_dgpu)
			NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS;
		else
			NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS;

		/* Only the lookup result matters; base/limit are discarded */
		if (hsakmt_fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id,
				&mmio_base, &mmio_limit) == HSAKMT_STATUS_SUCCESS)
			NodeProperties->NumMemoryBanks += 1;
	}

	pthread_mutex_unlock(&hsakmt_mutex);
	return status;
}
/* Fill @MemoryProperties with up to @NumBanks memory banks of node @NodeId.
 *
 * The output array is zeroed, then the banks stored in the snapshot are
 * copied. For nodes with a non-zero gpu id, further entries are appended
 * while slots remain, in this order: LDS, local memory (Kaveri only),
 * scratch, SVM (when needed for the engine) and MMIO remap. Each is added
 * only if its aperture lookup succeeds.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeMemoryProperties(HSAuint32 NodeId,
						      HSAuint32 NumBanks,
						      HsaMemoryProperties *MemoryProperties)
{
	HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;
	uint32_t bank, gpu_id;
	HSAuint64 limit;
	HsaMemoryProperties *slot;

	if (MemoryProperties == NULL)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	CHECK_KFD_OPEN();

	pthread_mutex_lock(&hsakmt_mutex);

	status = hsakmt_validate_nodeid(NodeId, &gpu_id);
	if (status != HSAKMT_STATUS_SUCCESS)
		goto unlock;

	memset(MemoryProperties, 0, NumBanks * sizeof(HsaMemoryProperties));

	/* Banks recorded in the snapshot come first */
	bank = 0;
	while (bank < MIN(g_props[NodeId].node.NumMemoryBanks, NumBanks)) {
		assert(g_props[NodeId].mem);
		MemoryProperties[bank] = g_props[NodeId].mem[bank];
		bank++;
	}

	/* The following memory banks does not apply to CPU only node */
	if (gpu_id == 0)
		goto unlock;

	/* Add LDS */
	if (bank < NumBanks) {
		slot = &MemoryProperties[bank];
		if (hsakmt_fmm_get_aperture_base_and_limit(FMM_LDS, gpu_id,
				&slot->VirtualBaseAddress, &limit) == HSAKMT_STATUS_SUCCESS) {
			slot->HeapType = HSA_HEAPTYPE_GPU_LDS;
			slot->SizeInBytes = g_props[NodeId].node.LDSSizeInKB * 1024;
			bank++;
		}
	}

	/* Add Local memory - HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE.
	 * For dGPU the topology node contains Local Memory and it is added by
	 * the copy loop above
	 */
	if (hsakmt_get_gfxv_by_node_id(NodeId) == GFX_VERSION_KAVERI &&
	    bank < NumBanks && g_props[NodeId].node.LocalMemSize > 0) {
		slot = &MemoryProperties[bank];
		if (hsakmt_fmm_get_aperture_base_and_limit(FMM_GPUVM, gpu_id,
				&slot->VirtualBaseAddress, &limit) == HSAKMT_STATUS_SUCCESS) {
			slot->HeapType = HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE;
			slot->SizeInBytes = g_props[NodeId].node.LocalMemSize;
			bank++;
		}
	}

	/* Add SCRATCH */
	if (bank < NumBanks) {
		slot = &MemoryProperties[bank];
		if (hsakmt_fmm_get_aperture_base_and_limit(FMM_SCRATCH, gpu_id,
				&slot->VirtualBaseAddress, &limit) == HSAKMT_STATUS_SUCCESS) {
			slot->HeapType = HSA_HEAPTYPE_GPU_SCRATCH;
			slot->SizeInBytes = (limit - slot->VirtualBaseAddress) + 1;
			bank++;
		}
	}

	/* Add SVM aperture */
	if (hsakmt_topology_is_svm_needed(g_props[NodeId].node.EngineId) &&
	    bank < NumBanks) {
		slot = &MemoryProperties[bank];
		if (hsakmt_fmm_get_aperture_base_and_limit(FMM_SVM, gpu_id,
				&slot->VirtualBaseAddress, &limit) == HSAKMT_STATUS_SUCCESS) {
			slot->HeapType = HSA_HEAPTYPE_DEVICE_SVM;
			slot->SizeInBytes = (limit - slot->VirtualBaseAddress) + 1;
			bank++;
		}
	}

	/* Add mmio aperture */
	if (bank < NumBanks) {
		slot = &MemoryProperties[bank];
		if (hsakmt_fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id,
				&slot->VirtualBaseAddress, &limit) == HSAKMT_STATUS_SUCCESS) {
			slot->HeapType = HSA_HEAPTYPE_MMIO_REMAP;
			slot->SizeInBytes = (limit - slot->VirtualBaseAddress) + 1;
			bank++;
		}
	}

unlock:
	pthread_mutex_unlock(&hsakmt_mutex);
	return status;
}
/* Copy the first @NumCaches cache descriptors of node @NodeId.
 *
 * Returns HSAKMT_STATUS_INVALID_NODE_UNIT when there is no snapshot or the
 * node id is out of range, and HSAKMT_STATUS_INVALID_PARAMETER when more
 * caches are requested than the node has. @ProcessorId is not used.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeCacheProperties(HSAuint32 NodeId,
						     HSAuint32 ProcessorId,
						     HSAuint32 NumCaches,
						     HsaCacheProperties *CacheProperties)
{
	HSAKMT_STATUS status;
	uint32_t idx;

	if (CacheProperties == NULL)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	CHECK_KFD_OPEN();

	pthread_mutex_lock(&hsakmt_mutex);

	if (!g_system || NodeId >= g_system->NumNodes) {
		/* KFD ADD page 18, snapshot protocol violation */
		status = HSAKMT_STATUS_INVALID_NODE_UNIT;
	} else if (NumCaches > g_props[NodeId].node.NumCaches) {
		status = HSAKMT_STATUS_INVALID_PARAMETER;
	} else {
		/* NumCaches cannot exceed the node's cache count here, so it
		 * is the effective copy bound.
		 */
		for (idx = 0; idx < NumCaches; idx++) {
			assert(g_props[NodeId].cache);
			CacheProperties[idx] = g_props[NodeId].cache[idx];
		}
		status = HSAKMT_STATUS_SUCCESS;
	}

	pthread_mutex_unlock(&hsakmt_mutex);
	return status;
}
/* Copy @NumIoLinks link descriptors of node @NodeId out of the snapshot.
 *
 * Only the snapshot and the node id are checked here; @NumIoLinks is not
 * compared against the node's link count in this function. Returns
 * HSAKMT_STATUS_ERROR when the snapshot is missing or @NodeId is out of
 * range.
 */
HSAKMT_STATUS hsakmt_topology_get_iolink_props(HSAuint32 NodeId,
					       HSAuint32 NumIoLinks,
					       HsaIoLinkProperties *IoLinkProperties)
{
	size_t bytes;

	if (g_system == NULL || g_props == NULL)
		return HSAKMT_STATUS_ERROR;
	if (NodeId >= g_system->NumNodes)
		return HSAKMT_STATUS_ERROR;

	bytes = NumIoLinks * sizeof(*IoLinkProperties);
	memcpy(IoLinkProperties, g_props[NodeId].link, bytes);

	return HSAKMT_STATUS_SUCCESS;
}
/* Public query for a node's IO links.
 *
 * Validates the node id and that no more links are requested than the node
 * has, then delegates the copy to hsakmt_topology_get_iolink_props() while
 * holding hsakmt_mutex.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId,
						      HSAuint32 NumIoLinks,
						      HsaIoLinkProperties *IoLinkProperties)
{
	HSAKMT_STATUS status;

	if (IoLinkProperties == NULL)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	CHECK_KFD_OPEN();

	pthread_mutex_lock(&hsakmt_mutex);

	if (!g_system || NodeId >= g_system->NumNodes) {
		/* KFD ADD page 18, snapshot protocol violation */
		status = HSAKMT_STATUS_INVALID_NODE_UNIT;
	} else if (NumIoLinks > g_props[NodeId].node.NumIOLinks) {
		status = HSAKMT_STATUS_INVALID_PARAMETER;
	} else {
		assert(g_props[NodeId].link);
		status = hsakmt_topology_get_iolink_props(NodeId, NumIoLinks,
							  IoLinkProperties);
	}

	pthread_mutex_unlock(&hsakmt_mutex);
	return status;
}
/* Return the full GFX version derived from the node's EngineId.
 * No snapshot or range check is made on @node_id here.
 */
uint32_t hsakmt_get_gfxv_by_node_id(HSAuint32 node_id)
{
	const HsaNodeProperties *node = &g_props[node_id].node;

	return HSA_GET_GFX_VERSION_FULL(node->EngineId.ui32);
}
/* Return the DeviceId of node @node_id, or 0 when there is no snapshot or
 * the id is out of range.
 */
uint16_t hsakmt_get_device_id_by_node_id(HSAuint32 node_id)
{
	const bool have_snapshot = g_props && g_system;

	if (have_snapshot && node_id < g_system->NumNodes)
		return g_props[node_id].node.DeviceId;

	return 0;
}
/* True when the node reports HSAMMUPresent and has both CPU cores and
 * compute cores. No snapshot or range check is made on @node_id here.
 */
bool hsakmt_prefer_ats(HSAuint32 node_id)
{
	const HsaNodeProperties *node = &g_props[node_id].node;

	if (!node->Capability.ui32.HSAMMUPresent)
		return false;
	if (!node->NumCPUCores)
		return false;

	return node->NumFComputeCores != 0;
}
/* Return the DeviceId of the first node whose KFDGpuID equals @gpu_id, or 0
 * when there is no snapshot or no node matches.
 */
uint16_t hsakmt_get_device_id_by_gpu_id(HSAuint32 gpu_id)
{
	unsigned int node_idx = 0;

	if (g_props == NULL || g_system == NULL)
		return 0;

	while (node_idx < g_system->NumNodes) {
		if (g_props[node_idx].node.KFDGpuID == gpu_id)
			return g_props[node_idx].node.DeviceId;
		node_idx++;
	}

	return 0;
}
/* Return the CPU node directly linked to @gpu_node, provided that CPU node
 * has a non-zero total memory size; INVALID_NODEID otherwise.
 */
uint32_t hsakmt_get_direct_link_cpu(uint32_t gpu_node)
{
	const int32_t cpu_node = gpu_get_direct_link_cpu(gpu_node, g_props);
	HSAuint64 total_bytes = 0;
	HSAuint32 bank;

	if (cpu_node == -1)
		return INVALID_NODEID;

	assert(g_props[cpu_node].mem);

	/* Sum the sizes of all memory banks of the CPU node */
	for (bank = 0; bank < g_props[cpu_node].node.NumMemoryBanks; bank++)
		total_bytes += g_props[cpu_node].mem[bank].SizeInBytes;

	if (total_bytes == 0)
		return INVALID_NODEID;

	return (uint32_t)cpu_node;
}
/* Translate an array of node ids into a newly allocated array of gpu ids.
 *
 * @gpu_id_array:  receives the malloc'ed array of @NumberOfNodes gpu ids
 * @NumberOfNodes: number of entries in @NodeArray; must be non-zero
 * @NodeArray:     node ids to validate and translate
 *
 * On success the array is left allocated in *gpu_id_array (releasing it is
 * presumably up to the caller). On failure nothing stays allocated and
 * *gpu_id_array is reset to NULL.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER for bad arguments,
 * HSAKMT_STATUS_NO_MEMORY when the allocation fails, or the status of the
 * first node id that fails hsakmt_validate_nodeid().
 */
HSAKMT_STATUS hsakmt_validate_nodeid_array(uint32_t **gpu_id_array,
		uint32_t NumberOfNodes, uint32_t *NodeArray)
{
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	unsigned int i;

	if (NumberOfNodes == 0 || !NodeArray || !gpu_id_array)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	/* Translate Node IDs to gpu_ids */
	*gpu_id_array = malloc(NumberOfNodes * sizeof(uint32_t));
	if (!(*gpu_id_array))
		return HSAKMT_STATUS_NO_MEMORY;

	for (i = 0; i < NumberOfNodes; i++) {
		ret = hsakmt_validate_nodeid(NodeArray[i], *gpu_id_array + i);
		if (ret != HSAKMT_STATUS_SUCCESS) {
			free(*gpu_id_array);
			/* Do not hand a dangling pointer back to the caller */
			*gpu_id_array = NULL;
			break;
		}
	}

	return ret;
}
/* Accessor for the file-level num_sysfs_nodes counter. */
inline uint32_t hsakmt_get_num_sysfs_nodes(void)
{
return num_sysfs_nodes;
}
================================================
FILE: libhsakmt/src/version.c
================================================
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "libhsakmt.h"
#include
#include
#include "hsakmt/linux/kfd_ioctl.h"
HsaVersionInfo hsakmt_kfd_version_info;
/* Copy the cached kernel interface version into @VersionInfo.
 *
 * Returns HSAKMT_STATUS_INVALID_PARAMETER when @VersionInfo is NULL,
 * HSAKMT_STATUS_SUCCESS otherwise (after the CHECK_KFD_OPEN() gate).
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtGetVersion(HsaVersionInfo *VersionInfo)
{
	CHECK_KFD_OPEN();

	/* Reject a NULL output pointer instead of writing through it */
	if (!VersionInfo)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	*VersionInfo = hsakmt_kfd_version_info;
	return HSAKMT_STATUS_SUCCESS;
}
HSAKMT_STATUS hsakmt_init_kfd_version(void)
{
struct kfd_ioctl_get_version_args args = {0};
if (hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_VERSION, &args) == -1)
return HSAKMT_STATUS_ERROR;
hsakmt_kfd_version_info.KernelInterfaceMajorVersion = args.major_version;
hsakmt_kfd_version_info.KernelInterfaceMinorVersion = args.minor_version;
if (args.major_version != 1)
return HSAKMT_STATUS_DRIVER_MISMATCH;
return HSAKMT_STATUS_SUCCESS;
}
================================================
FILE: libhsakmt/src/virtio/CMakeLists.txt
================================================
# Copyright 2025 Advanced Micro Devices, Inc.
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# Build script for the hsakmt_virtio static library.
cmake_minimum_required ( VERSION 3.7 )
set (CMAKE_VERBOSE_MAKEFILE ON)
# Target name; also used as the project name below.
set ( HSAKMT_VIRTIO "hsakmt_virtio" )
set ( HSAKMT_VIRTIO_TARGET "${HSAKMT_VIRTIO}" )
project ( ${HSAKMT_VIRTIO_TARGET} VERSION 1.0)
## Compiler flags
set ( HSAKMT_VIRTIO_C_FLAGS -fPIC -W -Wall -Wextra -Wno-unused-parameter -Wformat-security -Wswitch-default -Wundef -Wshadow -Wpointer-arith -Wbad-function-cast -Wcast-qual -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations -Wredundant-decls -Wunreachable-code -std=gnu99 -fvisibility=hidden )
# -Wlogical-op is only added when the C compiler is GCC.
if ( CMAKE_COMPILER_IS_GNUCC )
set ( HSAKMT_VIRTIO_C_FLAGS "${HSAKMT_VIRTIO_C_FLAGS}" -Wlogical-op )
endif ()
# Linker flags: symbol visibility comes from the version script next to this file.
set ( HSAKMT_VIRTIO_LINKER_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/libhsakmt_virtio.ver" )
set ( HSAKMT_VIRTIO_LINK_FLAGS "-Wl,--enable-new-dtags -Wl,--version-script=${HSAKMT_VIRTIO_LINKER_SCRIPT} -Wl,-z,nodelete")
# Release builds get -O2; every other build type gets -g.
if ( "${CMAKE_BUILD_TYPE}" STREQUAL Release )
set ( HSAKMT_VIRTIO_C_FLAGS "${HSAKMT_VIRTIO_C_FLAGS}" -O2 )
else ()
set ( HSAKMT_VIRTIO_C_FLAGS "${HSAKMT_VIRTIO_C_FLAGS}" -g )
endif ()
# Sources of the library; rbtree.c is taken from the parent directory.
set ( HSAKMT_VIRTIO_SRC "virtio_gpu.c"
"hsakmt_virtio_vm.c"
"hsakmt_virtio_device.c"
"hsakmt_virtio_memory.c"
"hsakmt_virtio_amdgpu.c"
"hsakmt_virtio_events.c"
"hsakmt_virtio_queues.c"
"hsakmt_virtio_topology.c"
"hsakmt_virtio_openclose.c"
"../rbtree.c" )
# NOTE(review): the same source list is passed to both add_library() and
# target_sources() - confirm whether the second call is needed.
add_library ( ${HSAKMT_VIRTIO_TARGET} STATIC ${HSAKMT_VIRTIO_SRC} )
target_sources ( ${HSAKMT_VIRTIO_TARGET} PRIVATE ${HSAKMT_VIRTIO_SRC} )
target_compile_options ( ${HSAKMT_VIRTIO_TARGET} PRIVATE ${HSAKMT_VIRTIO_C_FLAGS} )
# NOTE(review): the two bare "$" entries under PUBLIC look like truncated
# generator expressions - confirm against the repository copy of this file.
target_include_directories ( ${HSAKMT_VIRTIO_TARGET}
PUBLIC
$
$
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/virtio
${CMAKE_CURRENT_SOURCE_DIR}/../
${CMAKE_CURRENT_SOURCE_DIR}/../../include
${CMAKE_CURRENT_SOURCE_DIR}/include/linux )
set_property(TARGET ${HSAKMT_VIRTIO_TARGET} PROPERTY LINK_FLAGS ${HSAKMT_VIRTIO_LINK_FLAGS})
find_package ( PkgConfig )

## If DRM_DIR is set (it is read here as a CMake variable), the script
## will pick up the corresponding libraries from that path first.
## set() is used instead of list(PREPEND ...): list(PREPEND) only exists
## since CMake 3.15, while this script declares 3.7 as its minimum version.
set ( CMAKE_PREFIX_PATH "${DRM_DIR}" ${CMAKE_PREFIX_PATH} )

pkg_check_modules ( DRM REQUIRED IMPORTED_TARGET libdrm )
pkg_check_modules ( DRM_AMDGPU REQUIRED IMPORTED_TARGET libdrm_amdgpu )

target_include_directories ( ${HSAKMT_VIRTIO_TARGET} PRIVATE ${DRM_AMDGPU_INCLUDE_DIRS} )
target_include_directories ( ${HSAKMT_VIRTIO_TARGET} PRIVATE ${DRM_INCLUDE_DIRS} )

target_link_libraries ( ${HSAKMT_VIRTIO_TARGET}
  PRIVATE ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread rt c ${CMAKE_DL_LIBS} )
================================================
FILE: libhsakmt/src/virtio/hsakmt_virtio_amdgpu.c
================================================
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "hsakmt/hsakmt_virtio.h"
#include "hsakmt_virtio_device.h"
/* Send a QUERY_INFO / QUERY_GPU_INFO command and copy the returned
 * struct amdgpu_gpu_info into @out.
 *
 * Returns -ENOMEM when no response buffer could be allocated, otherwise the
 * result of vhsakmt_execbuf_cpu(); @out is written only when that result is
 * 0. @handle is not used.
 */
int vamdgpu_query_gpu_info(amdgpu_device_handle handle, void* out) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle vdev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_req request = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_GPU_INFO,
  };
  struct vhsakmt_ccmd_query_info_rsp* reply =
      vhsakmt_alloc_rsp(vdev, &request.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp));
  int status;

  if (reply == NULL) return -ENOMEM;

  status = vhsakmt_execbuf_cpu(vdev, &request.hdr, __FUNCTION__);
  if (status != 0) return status;

  memcpy(out, &reply->gpu_info, sizeof(struct amdgpu_gpu_info));
  return 0;
}
/* Stub: apart from the CHECK_VIRTIO_KFD_OPEN() gate this only reports
 * success. Neither NodeId nor DeviceHandle is read or written here.
 * NOTE(review): callers receive SUCCESS without a handle being set - confirm
 * this is intended.
 */
HSAKMT_STATUS vhsaKmtGetAMDGPUDeviceHandle(HSAuint32 NodeId, HsaAMDGPUDeviceHandle* DeviceHandle) {
CHECK_VIRTIO_KFD_OPEN();
return HSAKMT_STATUS_SUCCESS;
}
================================================
FILE: libhsakmt/src/virtio/hsakmt_virtio_device.c
================================================
/*
* Copyright 2025 Advanced Micro Devices, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "hsakmt_virtio_device.h"
/* Submit @req on the device's virtio GPU handle, passing true as the last
 * argument of virtio_gpu_exec_cmd(). @from is not used.
 */
int vhsakmt_execbuf_cpu(vhsakmt_device_handle dev, struct vhsakmt_ccmd_req* req, const char* from) {
  struct virtio_gpu_device* vgdev = dev->vgdev;

  return virtio_gpu_exec_cmd(vgdev, req, true);
}
/* Obtain a response buffer of @sz bytes for @req from the device's virtio
 * GPU handle; the result of virtio_gpu_alloc_rsp() is returned unchanged.
 */
void* vhsakmt_alloc_rsp(vhsakmt_device_handle dev, struct vhsakmt_ccmd_req* req, uint32_t sz) {
  struct virtio_gpu_device* vgdev = dev->vgdev;

  return virtio_gpu_alloc_rsp(vgdev, req, sz);
}
================================================
FILE: libhsakmt/src/virtio/hsakmt_virtio_device.h
================================================
/*
* Copyright 2025 Advanced Micro Devices, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef HSAKMT_VIRTIO_DEVICE_H
#define HSAKMT_VIRTIO_DEVICE_H
#include "hsakmt_virtio_proto.h"
#include "rbtree.h"
#include "virtio_gpu.h"
#include
#ifdef __cplusplus
extern "C" {
#endif
/* Atomically add/subtract 1 and yield the NEW value (fetch result +/- 1). */
#define vhsakmt_atomic_inc_return(ptr) (atomic_fetch_add((ptr), 1) + 1)
#define vhsakmt_atomic_dec_return(ptr) (atomic_fetch_sub((ptr), 1) - 1)
/* Pointer <-> 64-bit integer conversions, going through unsigned long. */
#define VHSA_VPTR_TO_UINT64(vptr) ((uint64_t)(unsigned long)(vptr))
#define VHSA_UINT64_TO_VPTR(v) ((void*)(unsigned long)(v))
/* Current verbosity threshold; messages with a level above it are dropped. */
extern int vhsakmt_debug_level;
/* Print to stderr when `level` is less than or equal to vhsakmt_debug_level. */
#define vhsakmt_print(level, fmt, ...) \
do { \
if (level <= vhsakmt_debug_level) fprintf(stderr, fmt, ##__VA_ARGS__); \
} while (0)
/* Message levels used by the vhsa_* wrappers below. */
#define VHSAKMT_DEBUG_LEVEL_DEFAULT -1
#define VHSAKMT_DEBUG_LEVEL_ERR 3
#define VHSAKMT_DEBUG_LEVEL_WARNING 4
#define VHSAKMT_DEBUG_LEVEL_INFO 6
#define VHSAKMT_DEBUG_LEVEL_DEBUG 7
/* Convenience wrappers: one per message level. */
#define vhsa_err(fmt, ...) vhsakmt_print(VHSAKMT_DEBUG_LEVEL_ERR, fmt, ##__VA_ARGS__)
#define vhsa_warn(fmt, ...) vhsakmt_print(VHSAKMT_DEBUG_LEVEL_WARNING, fmt, ##__VA_ARGS__)
#define vhsa_info(fmt, ...) vhsakmt_print(VHSAKMT_DEBUG_LEVEL_INFO, fmt, ##__VA_ARGS__)
#define vhsa_debug(fmt, ...) vhsakmt_print(VHSAKMT_DEBUG_LEVEL_DEBUG, fmt, ##__VA_ARGS__)
/* Forward declarations and the handle types used throughout the virtio backend. */
struct vhsakmt_device;
struct vhsakmt_bo;
typedef struct vhsakmt_device* vhsakmt_device_handle;
typedef struct vhsakmt_bo* vhsakmt_bo_handle;
/* A buffer object's entry in the red-black tree. */
typedef rbtree_node_t* bo_entry;
/* Globals defined elsewhere; CHECK_VIRTIO_KFD_OPEN() tests dev_list against NULL. */
extern pthread_mutex_t dev_mutex;
extern vhsakmt_device_handle dev_list;
/* Buffer object type bits (struct vhsakmt_bo::bo_type).
 * Every value is parenthesized so that the macro expands to a single operand:
 * without the parentheses an expression such as `VHSA_BO_CLGL + 1` or
 * `~VHSA_BO_USERPTR` would bind to the operands of the shift instead.
 */
#define VHSA_BO_KFD_MEM (1 << 0) /* allocated from KFD (hsaKmtAllocMemory) */
#define VHSA_BO_USERPTR (1 << 1)
#define VHSA_BO_QUEUE_BUFFER (1 << 2) /* allocated from KFD, but used for queue CMD submit */
#define VHSA_BO_QUEUE_DOORBELL (1 << 3) /* doorbell memory */
#define VHSA_BO_QUEUE_RW_PTR (1 << 4) /* queue read write ptr, from host map to guest */
/* allocated from KFD, but used for AQL queue read write ptr */
#define VHSA_BO_QUEUE_AQL_RW_PTR (1 << 5)
#define VHSA_BO_CLGL (1 << 6) /* CLGL memory, imported from mesa GL */
/* allocated from KFD, but is scratch memory, do not need map and unmap in ioctl */
#define VHSA_BO_SCRATCH (1 << 7)
#define VHSA_BO_QUEUE (1 << 8)
#define VHSA_BO_EVENT (1 << 9)
#define VHSA_BO_SCRATCH_MAP (1 << 10)
#define VHSA_SDMA_NONE UINT32_MAX
/* Early-return guard: makes the enclosing function return
 * HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED while dev_list is NULL.
 * NOTE(review): it is also used in a function that returns int
 * (vamdgpu_query_gpu_info) - the status value is then returned as an int.
 */
#define CHECK_VIRTIO_KFD_OPEN() \
do { \
if (dev_list == NULL) return HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED; \
} while (0)
/* Per-node state kept by a vhsakmt_device (see vhsakmt_device::vhsakmt_nodes). */
struct vhsakmt_node {
/* Node properties for this node */
HsaNodeProperties node_props;
/* Doorbell mapping; presumably set via vhsakmt_set_node_doorbell() - confirm */
void* doorbell_base;
/* Scratch range; presumably set via vhsakmt_set_scratch_area() - confirm */
uint64_t scratch_start;
uint64_t scratch_size;
};
/* State of one virtio-backed device. */
struct vhsakmt_device {
/* Underlying virtio GPU handle; commands and response buffers go through it */
struct virtio_gpu_device* vgdev;
int refcount;
/* Buffer object bookkeeping: mutex plus red-black tree (presumably the
 * mutex guards the tree - confirm)
 */
pthread_mutex_t bo_handles_mutex;
rbtree_t bo_rbt;
struct vhsakmt_bo* shmem_bo;
uint32_t reqbuf_max;
/* Source of blob ids; incremented through vhsakmt_atomic_inc_return() */
uint32_t next_blob_id;
/* VM range; presumably set via vhsakmt_set_vm_area() - confirm */
uint64_t vm_start;
uint64_t vm_size;
pthread_mutex_t vhsakmt_mutex;
/* Array of per-node state and the system properties */
struct vhsakmt_node* vhsakmt_nodes;
HsaSystemProperties* sys_props;
};
/* One buffer object tracked by a vhsakmt_device. */
struct vhsakmt_bo {
/* Tree node linking this object into the device's red-black tree */
rbtree_node_t rbtn;
/* Owning device */
struct vhsakmt_device* dev;
int refcount;
unsigned size;
/* Guest CPU address and the corresponding host-side address */
void* cpu_addr;
void* host_addr;
HsaMemFlags flags;
/* Presumably a combination of the VHSA_BO_* bits - confirm */
uint32_t bo_type;
uint32_t blob_id;
pthread_mutex_t map_mutex;
/* Anonymous union with a single member, accessed as bo->real.<field> */
union {
struct {
uint32_t handle;
/* Resource id; copied into vHsaEvent::res_id for event objects */
uint32_t res_id;
uint64_t offset;
uint64_t alloc_size;
int map_count;
} real;
};
/* Event backing this object; set when an event blob object is created */
vHsaEvent* event;
uint64_t queue_id;
vhsakmt_bo_handle rw_bo;
void* gl_meta_data;
};
/* Internal API of the virtio backend, grouped by implementing source file. */
/*hsakmt_virtio_memory.c*/
vhsakmt_bo_handle vhsakmt_entry_to_bo_handle(bo_entry e);
bo_entry vhsakmt_bo_handle_to_entry(vhsakmt_bo_handle bo);
void vhsakmt_insert_bo(vhsakmt_device_handle dev, vhsakmt_bo_handle bo, void* addr, uint64_t size);
void vhsakmt_remove_bo(vhsakmt_device_handle dev, vhsakmt_bo_handle bo);
vhsakmt_bo_handle vhsakmt_find_bo_by_addr(vhsakmt_device_handle dev, void* addr);
void* vhsakmt_gpu_va(vhsakmt_device_handle dev, void* va);
int vhsakmt_bo_cpu_unmap(vhsakmt_bo_handle bo);
int vhsakmt_bo_cpu_map(vhsakmt_bo_handle bo_handle, void** cpu, void* fixed_cpu);
int vhsakmt_create_mappable_blob_bo(vhsakmt_device_handle dev, size_t size, uint32_t blob_id,
uint32_t bo_type, void* va_handle,
vhsakmt_bo_handle* bo_handle);
int vhsakmt_bo_free(vhsakmt_device_handle dev, vhsakmt_bo_handle bo);
int vhsakmt_init_host_blob(vhsakmt_device_handle dev, size_t size, uint32_t blob_type,
uint32_t blob_flag, uint32_t blob_id, uint32_t bo_type, void* va_handle,
vhsakmt_bo_handle* bo_handle);
/*hsakmt_virtio_openclose.c*/
vhsakmt_device_handle vhsakmt_dev(void);
/*hsakmt_virtio_vm.c*/
void* vhsakmt_vm_start(void);
int vhsakmt_reserve_va(uint64_t start, uint64_t size);
void vhsakmt_dereserve_va(uint64_t start, uint64_t size);
void vhsakmt_set_scratch_area(vhsakmt_device_handle dev, uint32_t node, uint64_t start,
uint64_t size);
void vhsakmt_set_vm_area(vhsakmt_device_handle dev, uint64_t start, uint64_t size);
int vhsakmt_set_node_doorbell(vhsakmt_device_handle dev, uint32_t node, void* doorbell);
void* vhsakmt_node_doorbell(vhsakmt_device_handle dev, uint32_t node);
bool vhsakmt_is_scratch_mem(vhsakmt_device_handle dev, void* addr);
bool vhsakmt_is_userptr(vhsakmt_device_handle dev, void* addr);
/*hsakmt_virtio_device.c*/
int vhsakmt_execbuf_cpu(vhsakmt_device_handle dev, struct vhsakmt_ccmd_req* req, const char* from);
void* vhsakmt_alloc_rsp(vhsakmt_device_handle dev, struct vhsakmt_ccmd_req* req, uint32_t sz);
/*hsakmt_virtio_events.c*/
void* vhsakmt_event_host_handle(HsaEvent* h);
#ifdef __cplusplus
}
#endif
#endif
================================================
FILE: libhsakmt/src/virtio/hsakmt_virtio_events.c
================================================
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "hsakmt/hsakmt_virtio.h"
#include "hsakmt_virtio_device.h"
int vhsakmt_debug_level;
/* Return the event_handle field of the vHsaEvent behind @h as a pointer. */
void* vhsakmt_event_host_handle(HsaEvent* h) {
  vHsaEvent* vevent = (vHsaEvent*)h;

  return (void*)vevent->event_handle;
}
/* Return the res_id field of the vHsaEvent behind @h. */
static inline int32_t vhsakmt_event_res_id(HsaEvent* h) {
  vHsaEvent* vevent = (vHsaEvent*)h;

  return vevent->res_id;
}
/* Return the buffer object handle stored in the vHsaEvent behind @h. */
static inline vhsakmt_bo_handle vhsakmt_event_bo_handle(HsaEvent* h) {
  vHsaEvent* vevent = (vHsaEvent*)h;

  return (vhsakmt_bo_handle)vevent->bo_handle;
}
/* Create the host blob buffer object backing an event and cross-link the two.
 *
 * On success the new object points at @vevent_handle, the event records the
 * object's address and resource id, and the object is inserted into the
 * device's tree keyed by the event's address. Returns the non-zero result of
 * vhsakmt_init_host_blob() on failure, 0 otherwise.
 */
static int vhsakmt_create_event_blob_bo(vhsakmt_device_handle dev, size_t size, uint32_t blob_id,
                                        vHsaEvent* vevent_handle, vhsakmt_bo_handle* bo_handle) {
  vhsakmt_bo_handle bo;
  int err = vhsakmt_init_host_blob(dev, size, VIRTGPU_BLOB_MEM_HOST3D, 0, blob_id, VHSA_BO_EVENT,
                                   (void*)vevent_handle->event_handle, bo_handle);

  if (err != 0) return err;

  bo = *bo_handle;
  bo->event = vevent_handle;
  vevent_handle->bo_handle = (uint64_t)bo;
  vevent_handle->res_id = bo->real.res_id;

  vhsakmt_insert_bo(dev, bo, vevent_handle, size);
  return 0;
}
/*
 * Create an event on the host and a guest-side vHsaEvent mirroring it.
 *
 * Sends a VHSAKMT_CCMD_EVENT_CREATE request carrying a freshly allocated blob
 * id, copies the returned vHsaEvent into heap memory, and binds it to a blob
 * BO. On success *Event points at the new guest event.
 *
 * Returns the host's status, or -ENOMEM when the response buffer, the guest
 * event, or its blob BO cannot be set up. EventDesc and Event are not checked
 * for NULL.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtCreateEvent(HsaEventDescriptor* EventDesc, _Bool ManualReset,
                                           _Bool IsSignaled, HsaEvent** Event) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_event_rsp* rsp;
  vhsakmt_bo_handle blob_bo;
  vHsaEvent* vevent;
  struct vhsakmt_ccmd_event_req req = {
      .hdr = VHSAKMT_CCMD(EVENT, sizeof(struct vhsakmt_ccmd_event_req)),
      .type = VHSAKMT_CCMD_EVENT_CREATE,
      .create_args.EventDesc = *EventDesc,
      .create_args.ManualReset = ManualReset,
      .create_args.IsSignaled = IsSignaled,
      .blob_id = vhsakmt_atomic_inc_return(&dev->next_blob_id),
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_event_rsp));
  if (rsp == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  if (rsp->ret != 0) return rsp->ret;

  /* Guest copy of the event as described by the host. */
  vevent = calloc(1, sizeof(vHsaEvent));
  if (vevent == NULL) return -ENOMEM;
  memcpy(vevent, &rsp->vevent, sizeof(vHsaEvent));

  if (vhsakmt_create_event_blob_bo(dev, sizeof(vHsaEvent), req.blob_id, vevent, &blob_bo) != 0) {
    free(vevent);
    return -ENOMEM;
  }

  *Event = (HsaEvent*)vevent;

  vhsa_debug(
      "%s: event addr: %p, hw123: %lx, %lx, %x, type: %d, id: %x, host handle: 0x%lx, res id: %d\n",
      __FUNCTION__, vevent, vevent->event.EventData.HWData1, vevent->event.EventData.HWData2,
      vevent->event.EventData.HWData3, vevent->event.EventData.EventType, vevent->event.EventId,
      vevent->event_handle, blob_bo->real.res_id);

  return rsp->ret;
}
/*
 * Destroy an event by releasing the BO that backs it.
 *
 * A NULL event, or an event with no BO attached, is treated as already
 * destroyed and reports HSAKMT_STATUS_SUCCESS. Otherwise the result of
 * vhsakmt_bo_free() is returned.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtDestroyEvent(HsaEvent* Event) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle dev = vhsakmt_dev();

  if (Event != NULL) {
    struct vhsakmt_bo* event_bo = vhsakmt_event_bo_handle(Event);

    if (event_bo != NULL) return vhsakmt_bo_free(dev, event_bo);
  }

  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Forward a "set event" request to the host.
 *
 * The request identifies the event by its host handle and resource id.
 * Returns the host's status, or -ENOMEM if no response buffer is available.
 * Event is not checked for NULL.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtSetEvent(HsaEvent* Event) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_event_req req = {
      .hdr = VHSAKMT_CCMD(EVENT, sizeof(struct vhsakmt_ccmd_event_req)),
      .type = VHSAKMT_CCMD_EVENT_SET,
      .event_hanele = vhsakmt_event_host_handle(Event),
      .res_id = vhsakmt_event_res_id(Event),
  };
  struct vhsakmt_ccmd_event_rsp* rsp =
      vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_event_rsp));

  if (rsp == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);

  return rsp->ret;
}
/*
 * Forward a "reset event" request to the host.
 *
 * The request identifies the event by its host handle and resource id.
 * Returns the host's status, or -ENOMEM if no response buffer is available.
 * Event is not checked for NULL.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtResetEvent(HsaEvent* Event) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_event_req req = {
      .hdr = VHSAKMT_CCMD(EVENT, sizeof(struct vhsakmt_ccmd_event_req)),
      .type = VHSAKMT_CCMD_EVENT_RESET,
      .event_hanele = vhsakmt_event_host_handle(Event),
      .res_id = vhsakmt_event_res_id(Event),
  };
  struct vhsakmt_ccmd_event_rsp* rsp =
      vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_event_rsp));

  if (rsp == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);

  return rsp->ret;
}
/*
 * Ask the host for the state of an event.
 *
 * The request identifies the event by its host handle and resource id; the
 * host's status is returned as-is. Returns -ENOMEM if no response buffer is
 * available. Event is not checked for NULL.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtQueryEventState(HsaEvent* Event) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_event_req req = {
      .hdr = VHSAKMT_CCMD(EVENT, sizeof(struct vhsakmt_ccmd_event_req)),
      .type = VHSAKMT_CCMD_EVENT_QUERY_STATE,
      .event_hanele = vhsakmt_event_host_handle(Event),
      .res_id = vhsakmt_event_res_id(Event),
  };
  struct vhsakmt_ccmd_event_rsp* rsp =
      vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_event_rsp));

  if (rsp == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);

  return rsp->ret;
}
/*
 * Placeholder: waiting on events is not implemented in this backend.
 * All arguments are ignored and HSAKMT_STATUS_ERROR is always returned.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtWaitOnMultipleEvents(HsaEvent* Events[], HSAuint32 NumEvents,
                                                    bool WaitOnAll, HSAuint32 Milliseconds) {
  (void)Events;
  (void)NumEvents;
  (void)WaitOnAll;
  (void)Milliseconds;

  return HSAKMT_STATUS_ERROR;
}
/* Wait on a single event by delegating to vhsaKmtWaitOnMultipleEvents() with a
 * one-element list and WaitOnAll set. */
HSAKMT_STATUS HSAKMTAPI vhsaKmtWaitOnEvent(HsaEvent* Event, HSAuint32 Milliseconds) {
  HsaEvent* single[1] = {Event};

  return vhsaKmtWaitOnMultipleEvents(single, 1, true, Milliseconds);
}
/* Extended single-event wait. event_age is ignored; the call is forwarded to
 * vhsaKmtWaitOnMultipleEvents() with a one-element list and WaitOnAll set. */
HSAKMT_STATUS HSAKMTAPI vhsaKmtWaitOnEvent_Ext(HsaEvent* Event, HSAuint32 Milliseconds,
                                               uint64_t* event_age) {
  HsaEvent* single[1] = {Event};

  (void)event_age;

  return vhsaKmtWaitOnMultipleEvents(single, 1, true, Milliseconds);
}
/* Extended multi-event wait. event_age is ignored; every other argument is
 * forwarded unchanged to vhsaKmtWaitOnMultipleEvents(). */
HSAKMT_STATUS HSAKMTAPI vhsaKmtWaitOnMultipleEvents_Ext(HsaEvent* Events[], HSAuint32 NumEvents,
                                                        bool WaitOnAll, HSAuint32 Milliseconds,
                                                        uint64_t* event_age) {
  HSAKMT_STATUS status;

  (void)event_age;

  status = vhsaKmtWaitOnMultipleEvents(Events, NumEvents, WaitOnAll, Milliseconds);
  return status;
}
================================================
FILE: libhsakmt/src/virtio/hsakmt_virtio_memory.c
================================================
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "hsakmt/hsakmt_virtio.h"
#include "hsakmt_virtio_device.h"
/* Number of extra response bytes reserved for GL metadata when
 * vhsaKmtRegisterGraphicsHandleToNodes() allocates its response buffer. */
#define VHSA_GL_METADATA_MAX_SIZE (0x50)
/* Convert a tree entry back into its owning BO by a plain pointer cast.
 * NOTE(review): this is only valid if rbtn is the first member of
 * struct vhsakmt_bo - confirm against the struct definition. */
vhsakmt_bo_handle vhsakmt_entry_to_bo_handle(bo_entry e) {
  vhsakmt_bo_handle owner = (vhsakmt_bo_handle)e;

  return owner;
}
/* Return the tree node embedded in @bo. */
bo_entry vhsakmt_bo_handle_to_entry(vhsakmt_bo_handle bo) {
  bo_entry node = &bo->rbtn;

  return node;
}
/* A BO counts as a plain memory BO when it has neither a queue id nor an
 * event attached. */
static inline bool vhsakmt_is_mem_bo(vhsakmt_bo_handle bo) {
  if (bo->queue_id || bo->event) return false;

  return true;
}
/* Memory is CPU-mappable unless the Scratch flag is set. */
static bool vhsakmt_mappable(HsaMemFlags flags) {
  return flags.ui32.Scratch ? false : true;
}
/* Apply vhsakmt_mappable() to the flags stored in @bo. */
static bool vhsakmt_bo_mappable(vhsakmt_bo_handle bo) {
  HsaMemFlags bo_flags = bo->flags;

  return vhsakmt_mappable(bo_flags);
}
/*
 * Register @bo in the device's BO tree under the key (@addr, @size).
 * The key is written into the BO's embedded node first; the tree insertion
 * itself runs under dev->bo_handles_mutex.
 */
void vhsakmt_insert_bo(vhsakmt_device_handle dev, vhsakmt_bo_handle bo, void* addr, uint64_t size) {
  bo_entry node = &bo->rbtn;

  node->key.addr = (unsigned long)addr;
  node->key.size = (unsigned long)size;

  pthread_mutex_lock(&dev->bo_handles_mutex);
  hsakmt_rbtree_insert(&dev->bo_rbt, node);
  pthread_mutex_unlock(&dev->bo_handles_mutex);
}
/* Delete @entry from the device's BO tree under dev->bo_handles_mutex.
 * A NULL entry is ignored. */
static void vhsakmt_remove_entry(vhsakmt_device_handle dev, bo_entry entry) {
  if (entry) {
    pthread_mutex_lock(&dev->bo_handles_mutex);
    hsakmt_rbtree_delete(&dev->bo_rbt, entry);
    pthread_mutex_unlock(&dev->bo_handles_mutex);
  }
}
/* Remove @bo from the device's BO tree. A BO whose key is still all-zero
 * (address 0 and size 0) is treated as never inserted and left alone. */
void vhsakmt_remove_bo(vhsakmt_device_handle dev, vhsakmt_bo_handle bo) {
  bo_entry node = vhsakmt_bo_handle_to_entry(bo);

  if (node->key.addr != 0 || node->key.size != 0) vhsakmt_remove_entry(dev, node);
}
/*
 * Look up the tree entry whose BO has cpu_addr exactly equal to @addr.
 *
 * The nearest entry for the key (@addr, 0) is fetched under
 * dev->bo_handles_mutex; it is accepted only if the BO's cpu_addr matches
 * @addr exactly. Returns NULL when nothing matches.
 */
static bo_entry vhsakmt_rbt_search(vhsakmt_device_handle dev, void* addr) {
  rbtree_key_t key = rbtree_key((uint64_t)addr, 0);
  bo_entry nearest;

  pthread_mutex_lock(&dev->bo_handles_mutex);
  nearest = rbtree_lookup_nearest(&dev->bo_rbt, &key, LKP_ADDR, RIGHT);
  pthread_mutex_unlock(&dev->bo_handles_mutex);

  if (!nearest) return NULL;

  return (vhsakmt_entry_to_bo_handle(nearest)->cpu_addr == addr) ? nearest : NULL;
}
/* Thin wrapper around vhsakmt_rbt_search(): find the tree entry for @addr. */
static bo_entry vhsakmt_find_entry_by_addr(vhsakmt_device_handle dev, void* addr) {
  bo_entry match = vhsakmt_rbt_search(dev, addr);

  return match;
}
/* Find the memory BO registered at @addr. Returns NULL when no entry exists
 * or when the entry belongs to a queue or an event rather than plain memory. */
vhsakmt_bo_handle vhsakmt_find_bo_by_addr(vhsakmt_device_handle dev, void* addr) {
  vhsakmt_bo_handle candidate;
  bo_entry node = vhsakmt_find_entry_by_addr(dev, addr);

  if (!node) return NULL;

  candidate = vhsakmt_entry_to_bo_handle(node);
  return vhsakmt_is_mem_bo(candidate) ? candidate : NULL;
}
/*
 * Translate a guest address to the address used towards the host.
 * Addresses that are not userptrs are returned unchanged. For a userptr the
 * host_addr of its registered BO is returned, or NULL if none is registered.
 */
void* vhsakmt_gpu_va(vhsakmt_device_handle dev, void* va) {
  bo_entry node;

  if (!vhsakmt_is_userptr(dev, va)) return va;

  node = vhsakmt_find_entry_by_addr(dev, va);
  return node ? vhsakmt_entry_to_bo_handle(node)->host_addr : NULL;
}
/*
 * Map @bo into the CPU address space.
 *
 * @bo:        buffer object to map.
 * @cpu:       out parameter receiving the mapped address.
 * @fixed_cpu: address hint passed through to virtio_gpu_map_handle().
 *
 * Non-mappable BOs are skipped: 0 is returned and *cpu is left untouched.
 * If the BO is not yet mapped it is mapped now, the address is cached in
 * bo->cpu_addr and the map count is incremented. If it is already mapped the
 * cached address is handed back through *cpu, so the caller never reads a
 * stale or uninitialised value.
 *
 * Returns 0 on success, the error from virtio_gpu_map_handle(), or non-zero
 * when the resulting address equals MAP_FAILED.
 */
int vhsakmt_bo_cpu_map(vhsakmt_bo_handle bo, void** cpu, void* fixed_cpu) {
  int r;

  if (!vhsakmt_bo_mappable(bo)) return 0;

  pthread_mutex_lock(&bo->map_mutex);
  if (!bo->cpu_addr) {
    r = virtio_gpu_map_handle(bo->dev->vgdev, bo->real.handle, bo->size, cpu, fixed_cpu);
    if (r) {
      pthread_mutex_unlock(&bo->map_mutex);
      return r;
    }
    bo->cpu_addr = *cpu;
    atomic_fetch_add(&bo->real.map_count, 1);
  } else {
    /* Already mapped: report the existing mapping instead of leaving *cpu as
     * whatever the caller happened to pass in. */
    *cpu = bo->cpu_addr;
  }
  pthread_mutex_unlock(&bo->map_mutex);

  return *cpu == MAP_FAILED;
}
/*
 * Drop one CPU mapping reference on @bo.
 *
 * Nothing happens for non-mappable BOs, or when the BO has no cached address
 * or a zero map count. Otherwise the map count is decremented; once it
 * reaches zero a VHSA_BO_KFD_MEM buffer is unmapped, its address range is
 * handed to vhsakmt_reserve_va(), and the cached address is cleared.
 * Always returns 0.
 */
int vhsakmt_bo_cpu_unmap(vhsakmt_bo_handle bo) {
  if (!vhsakmt_bo_mappable(bo)) return 0;

  pthread_mutex_lock(&bo->map_mutex);

  if (bo->cpu_addr && bo->real.map_count != 0) {
    /* The decrement always happens; the type test only gates the unmap. */
    if (vhsakmt_atomic_dec_return(&bo->real.map_count) <= 0 &&
        (bo->bo_type & VHSA_BO_KFD_MEM)) {
      virtio_gpu_unmap(bo->cpu_addr, bo->size);
      vhsakmt_reserve_va(VHSA_VPTR_TO_UINT64(bo->cpu_addr), bo->size);
      bo->cpu_addr = NULL;
    }
  }

  pthread_mutex_unlock(&bo->map_mutex);

  return 0;
}
/* Destroy the virtio GPU handle owned by @bo and free the BO structure.
 * Returns the status of virtio_gpu_destroy_handle(); @bo is invalid afterwards. */
static int vhsakmt_destroy_handle(vhsakmt_device_handle dev, vhsakmt_bo_handle bo) {
  int status = virtio_gpu_destroy_handle(dev->vgdev, bo->real.handle);

  free(bo);

  return status;
}
/*
 * Create a virtio GPU blob resource and wrap it in a new vhsakmt_bo.
 *
 * @size, @blob_type, @blob_flag and @blob_id describe the blob to create;
 * @bo_type is stored as the BO type and @va_handle as its host address. The
 * new BO starts with refcount 1 and map count 0 and is not inserted into the
 * device tree.
 *
 * Returns 0 and stores the BO in *bo_handle on success, -EINVAL if the blob
 * cannot be created, or -ENOMEM if the BO cannot be allocated (the blob
 * handle is destroyed again in that case).
 */
int vhsakmt_init_host_blob(vhsakmt_device_handle dev, size_t size, uint32_t blob_type,
                           uint32_t blob_flag, uint32_t blob_id, uint32_t bo_type, void* va_handle,
                           vhsakmt_bo_handle* bo_handle) {
  vhsakmt_bo_handle new_bo;
  struct drm_virtgpu_resource_create_blob args = {
      .blob_mem = blob_type,
      .size = size,
      .blob_id = blob_id,
      .blob_flags = blob_flag,
  };

  if (virtio_gpu_create_blob(dev->vgdev, &args) != 0) return -EINVAL;

  new_bo = calloc(1, sizeof(struct vhsakmt_bo));
  if (new_bo == NULL) {
    virtio_gpu_destroy_handle(dev->vgdev, args.bo_handle);
    return -ENOMEM;
  }

  /* Identity and size. */
  new_bo->dev = dev;
  new_bo->bo_type = bo_type;
  new_bo->host_addr = va_handle;
  new_bo->size = size;
  new_bo->real.alloc_size = size;

  /* Mapping and lifetime state. */
  pthread_mutex_init(&new_bo->map_mutex, NULL);
  atomic_store(&new_bo->real.map_count, 0);
  atomic_store(&new_bo->refcount, 1);

  /* Kernel handle and the resource id derived from it. */
  new_bo->real.handle = args.bo_handle;
  virtio_gpu_res_id(dev->vgdev, new_bo->real.handle, &new_bo->real.res_id);

  *bo_handle = new_bo;
  return 0;
}
/*
 * Create a userptr blob (HOST3D_GUEST with the USE_USERPTR flag) for the
 * guest range [@addr, @addr + @size) and wrap it in a new vhsakmt_bo of type
 * VHSA_BO_USERPTR whose cpu_addr is @addr.
 *
 * On success *bo_handle receives the BO and *offset the offset reported by
 * the blob creation call. Returns the non-negative result of
 * virtio_gpu_create_blob() on success, its negative result on failure, or
 * -ENOMEM if the BO cannot be allocated.
 */
static int vhsakmt_init_userptr_blob(vhsakmt_device_handle dev, void* addr, size_t size,
                                     vhsakmt_bo_handle* bo_handle, uint64_t* offset) {
  vhsakmt_bo_handle bo;
  int ret;
  struct drm_virtgpu_resource_create_blob args = {
      .blob_mem = VIRTGPU_BLOB_MEM_HOST3D_GUEST,
      .blob_flags = VIRTGPU_BLOB_FLAG_USE_USERPTR,
      .size = size,
      .blob_id = vhsakmt_atomic_inc_return(&dev->next_blob_id),
      .blob_userptr = (uint64_t)addr,
  };

  ret = virtio_gpu_create_blob(dev->vgdev, &args);
  if (ret < 0) return ret;

  bo = calloc(1, sizeof(struct vhsakmt_bo));
  if (bo == NULL) {
    virtio_gpu_destroy_handle(dev->vgdev, args.bo_handle);
    return -ENOMEM;
  }

  bo->dev = dev;
  bo->bo_type = VHSA_BO_USERPTR;
  bo->cpu_addr = addr;
  bo->size = size;
  bo->real.alloc_size = size;

  pthread_mutex_init(&bo->map_mutex, NULL);
  atomic_store(&bo->real.map_count, 0);
  atomic_store(&bo->refcount, 1);

  bo->real.handle = args.bo_handle;
  virtio_gpu_res_id(dev->vgdev, bo->real.handle, &bo->real.res_id);

  *bo_handle = bo;
  *offset = args.offset;

  return ret;
}
/*
 * Create a mappable HOST3D blob BO, map it into the CPU address space
 * (preferably at @va_handle) and insert it into the device BO tree.
 *
 * @size, @blob_id: blob description.
 * @bo_type:        VHSA_BO_* type stored in the BO.
 * @va_handle:      host address of the buffer, also used as the map hint.
 * @bo_handle:      out parameter receiving the new BO; set to NULL when the
 *                  mapping step fails.
 *
 * Returns 0 on success, the error from vhsakmt_init_host_blob(), or -EINVAL
 * if mapping fails. On a mapping failure the blob handle and the BO's mutex
 * are released together with the BO, rather than only freeing the struct.
 */
int vhsakmt_create_mappable_blob_bo(vhsakmt_device_handle dev, size_t size, uint32_t blob_id,
                                    uint32_t bo_type, void* va_handle,
                                    vhsakmt_bo_handle* bo_handle) {
  vhsakmt_bo_handle bo;
  int r;

  r = vhsakmt_init_host_blob(dev, size, VIRTGPU_BLOB_MEM_HOST3D, VIRTGPU_BLOB_FLAG_USE_MAPPABLE,
                             blob_id, bo_type, va_handle, bo_handle);
  if (r) return r;

  bo = *bo_handle;

  r = vhsakmt_bo_cpu_map(bo, &bo->cpu_addr, va_handle);
  if (r) {
    /* Undo vhsakmt_init_host_blob(): destroy the mutex and the virtio handle
     * (vhsakmt_destroy_handle() also frees the BO structure). */
    pthread_mutex_destroy(&bo->map_mutex);
    vhsakmt_destroy_handle(dev, bo);
    *bo_handle = NULL;
    return -EINVAL;
  }

  if (va_handle && (va_handle != bo->cpu_addr))
    vhsa_warn("%s: target map: %p != real map: %p\n", __FUNCTION__, va_handle, bo->cpu_addr);

  vhsakmt_insert_bo(dev, bo, bo->cpu_addr, bo->size);

  return r;
}
/*
 * Allocate memory on the host and expose it to the guest.
 *
 * @PreferredNode: node forwarded to the host allocator.
 * @SizeInBytes:   allocation size.
 * @MemFlags:      allocation flags; Scratch memory is not CPU-mapped.
 * @MemoryAddress: out parameter receiving the guest-visible address.
 *
 * The host allocation is wrapped in a HOST3D blob BO. Mappable memory is
 * mapped at the host address and inserted into the device BO tree; scratch
 * memory reuses the host address and is registered as the node's scratch
 * area instead.
 *
 * Returns the host's status, -ENOMEM if the response buffer is unavailable,
 * the host returns no handle, or mapping fails, or the error from
 * vhsakmt_init_host_blob(). A mapping failure now releases the blob handle
 * and mutex along with the BO instead of leaking them.
 *
 * NOTE(review): unlike the other entry points in this file this function
 * does not start with CHECK_VIRTIO_KFD_OPEN(); left unchanged - confirm
 * whether that is intentional.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtAllocMemory(HSAuint32 PreferredNode, HSAuint64 SizeInBytes,
                                           HsaMemFlags MemFlags, void** MemoryAddress) {
  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_memory_rsp* rsp;
  vhsakmt_bo_handle bo;
  int r;
  struct vhsakmt_ccmd_memory_req req = {
      .hdr = VHSAKMT_CCMD(MEMORY, sizeof(struct vhsakmt_ccmd_memory_req)),
      .type = VHSAKMT_CCMD_MEMORY_ALLOC,
      .blob_id = vhsakmt_atomic_inc_return(&dev->next_blob_id),
      .alloc_args =
          {
              .PreferredNode = PreferredNode,
              .SizeInBytes = SizeInBytes,
              .MemFlags = MemFlags,
          },
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_memory_rsp));
  if (!rsp) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  if (rsp->ret) return rsp->ret;
  if (!rsp->memory_handle) return -ENOMEM;

  r = vhsakmt_init_host_blob(dev, SizeInBytes, VIRTGPU_BLOB_MEM_HOST3D,
                             vhsakmt_mappable(MemFlags) ? VIRTGPU_BLOB_FLAG_USE_MAPPABLE : 0,
                             req.blob_id, VHSA_BO_KFD_MEM, (void*)rsp->memory_handle, &bo);
  if (r) return r;

  if (!vhsakmt_mappable(MemFlags)) {
    /* Not CPU-mappable: the guest address is simply the host address. */
    bo->cpu_addr = bo->host_addr;
    if (MemFlags.ui32.Scratch) {
      vhsakmt_set_scratch_area(dev, PreferredNode, (uint64_t)bo->cpu_addr, SizeInBytes);
      bo->bo_type |= VHSA_BO_SCRATCH;
    }
  } else {
    r = vhsakmt_bo_cpu_map(bo, &bo->cpu_addr, bo->host_addr);
    if (r) {
      /* Release everything vhsakmt_init_host_blob() created;
       * vhsakmt_destroy_handle() also frees the BO structure. */
      pthread_mutex_destroy(&bo->map_mutex);
      vhsakmt_destroy_handle(dev, bo);
      return -ENOMEM;
    }
  }

  if (!MemFlags.ui32.Scratch) vhsakmt_insert_bo(dev, bo, bo->cpu_addr, bo->size);

  *MemoryAddress = bo->cpu_addr;

  vhsa_debug("alloc mem addr: %p, host addr: %p, size: %lx, res-id: %d, handble: %d\n",
             *MemoryAddress, bo->host_addr, SizeInBytes, bo->real.res_id, bo->real.handle);

  return rsp->ret;
}
/*
 * Drop one reference on @bo and release it once the count reaches zero.
 *
 * Returns 0 while references remain. When the last reference is dropped:
 * a BO whose tree key is still all-zero yields -EINVAL; a doorbell BO is
 * reported and left alone (returns 0); any other BO is removed from the
 * tree, unmapped if it has a CPU address, its attached event and GL metadata
 * are freed, and its handle is destroyed. In that last case the status of
 * vhsakmt_destroy_handle() is returned.
 */
int vhsakmt_bo_free(vhsakmt_device_handle dev, vhsakmt_bo_handle bo) {
  bo_entry node;

  if (vhsakmt_atomic_dec_return(&bo->refcount) > 0) return 0;

  node = vhsakmt_bo_handle_to_entry(bo);
  if (node->key.addr == 0 && node->key.size == 0) return -EINVAL;

  /* do not free BOs of queue, let them be freed with queue */
  if (bo->bo_type & VHSA_BO_QUEUE_DOORBELL) {
    vhsa_err("%s: Try to free VHSA_BO_QUEUE_DOORBELL memory: %p\n", __FUNCTION__, bo->cpu_addr);
    return 0;
  }

  vhsakmt_remove_bo(dev, bo);

  if (bo->cpu_addr) vhsakmt_bo_cpu_unmap(bo);

  /* free(NULL) is a no-op, so the attached objects need no explicit test. */
  free(bo->event);
  free(bo->gl_meta_data);

  pthread_mutex_destroy(&bo->map_mutex);

  return vhsakmt_destroy_handle(dev, bo);
}
/* Drop a userptr BO: take it out of the rbtree and free the BO structure.
 * The virtio GPU handle is not destroyed here. */
static void vhsakmt_remove_userptr_bo(vhsakmt_device_handle dev, vhsakmt_bo_handle bo) {
  vhsakmt_bo_handle victim = bo;

  vhsakmt_remove_bo(dev, victim);
  free(victim);
}
/*
 * Free memory previously returned by vhsaKmtAllocMemory().
 * An address with no registered memory BO reports HSAKMT_STATUS_SUCCESS;
 * otherwise the result of vhsakmt_bo_free() is returned. SizeInBytes is only
 * used for logging.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtFreeMemory(void* MemoryAddress, HSAuint64 SizeInBytes) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle dev = vhsakmt_dev();
  vhsakmt_bo_handle mem_bo = vhsakmt_find_bo_by_addr(dev, MemoryAddress);

  if (mem_bo) {
    vhsa_debug("%s: addr: %p, size: %lx, res_id: %d\n", __FUNCTION__, MemoryAddress, SizeInBytes,
               mem_bo->real.res_id);
    return vhsakmt_bo_free(dev, mem_bo);
  }

  return HSAKMT_STATUS_SUCCESS;
}
/*
 * Map a memory range to a set of GPU nodes on the host.
 *
 * @MemoryAddress:     guest address; translated to the BO's host address when
 *                     a memory BO is registered for it, sent as-is otherwise.
 * @MemorySizeInBytes: size of the range.
 * @AlternateVAGPU:    optional out parameter for the alternate GPU VA reported
 *                     by the host; may be NULL.
 * @MemMapFlags:       mapping flags forwarded to the host.
 * @NumberOfNodes:     number of entries in @NodeArray.
 * @NodeArray:         node ids, sent as the request payload.
 *
 * A userptr BO found for @MemoryAddress is removed from the tree and freed
 * once its host address has been captured.
 *
 * Returns the host's status, or -ENOMEM if the request or response buffer
 * cannot be allocated.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtMapMemoryToGPUNodes(void* MemoryAddress, HSAuint64 MemorySizeInBytes,
                                                   HSAuint64* AlternateVAGPU,
                                                   HsaMemMapFlags MemMapFlags,
                                                   HSAuint64 NumberOfNodes, HSAuint32* NodeArray) {
  CHECK_VIRTIO_KFD_OPEN();
  vhsakmt_device_handle dev = vhsakmt_dev();
  size_t req_len =
      VHSA_ALIGN_UP(sizeof(struct vhsakmt_ccmd_memory_req) + NumberOfNodes * sizeof(*NodeArray), 8);
  struct vhsakmt_ccmd_memory_req* req;
  struct vhsakmt_ccmd_memory_rsp* rsp;
  vhsakmt_bo_handle bo;
  HSAuint64 alternate_va;

  req = (void*)calloc(1, req_len);
  if (!req) return -ENOMEM;

  req->hdr = VHSAKMT_CCMD(MEMORY, req_len);
  req->type = VHSAKMT_CCMD_MEMORY_MAP_TO_GPU_NODES;
  req->map_to_GPU_nodes_args.MemorySizeInBytes = MemorySizeInBytes;
  req->map_to_GPU_nodes_args.MemMapFlags = MemMapFlags;
  req->map_to_GPU_nodes_args.NumberOfNodes = NumberOfNodes;

  bo = vhsakmt_find_bo_by_addr(dev, MemoryAddress);
  if (bo) {
    req->map_to_GPU_nodes_args.MemoryAddress = (uint64_t)bo->host_addr;
    if (bo->bo_type & VHSA_BO_USERPTR) vhsakmt_remove_userptr_bo(dev, bo);
  } else
    req->map_to_GPU_nodes_args.MemoryAddress = (uint64_t)MemoryAddress;

  /* Passing a NULL source to memcpy is undefined even for a zero length, so
   * skip the copy when there are no nodes. */
  if (NumberOfNodes > 0) memcpy(req->payload, NodeArray, NumberOfNodes * sizeof(*NodeArray));

  rsp = vhsakmt_alloc_rsp(dev, &req->hdr, sizeof(struct vhsakmt_ccmd_memory_rsp));
  if (!rsp) {
    free(req);
    return -ENOMEM;
  }

  vhsakmt_execbuf_cpu(dev, &req->hdr, __FUNCTION__);

  /* The output pointer is optional; only write through it when provided. */
  alternate_va = rsp->alternate_vagpu;
  if (AlternateVAGPU) *AlternateVAGPU = alternate_va;

  vhsa_debug("%s: gva: %p, hva: 0x%lx, size: %lx, AlternateVAGPU: %lx, ret: %d\n", __FUNCTION__,
             MemoryAddress, req->map_to_GPU_nodes_args.MemoryAddress, MemorySizeInBytes,
             alternate_va, rsp->ret);

  free(req);
  return rsp->ret;
}
/*
 * Ask the host to unmap the memory registered at @MemoryAddress from the GPU.
 * An address with no registered memory BO reports HSAKMT_STATUS_SUCCESS
 * without contacting the host. Returns the host's status, or -ENOMEM if no
 * response buffer is available.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtUnmapMemoryToGPU(void* MemoryAddress) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle dev = vhsakmt_dev();
  vhsakmt_bo_handle mem_bo = vhsakmt_find_bo_by_addr(dev, MemoryAddress);

  if (mem_bo == NULL) return HSAKMT_STATUS_SUCCESS;

  struct vhsakmt_ccmd_memory_req req = {
      .hdr = VHSAKMT_CCMD(MEMORY, sizeof(struct vhsakmt_ccmd_memory_req)),
      .type = VHSAKMT_CCMD_MEMORY_UNMAP_TO_GPU,
      .MemoryAddress = (uint64_t)mem_bo->host_addr,
  };
  struct vhsakmt_ccmd_memory_rsp* rsp =
      vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_memory_rsp));

  if (rsp == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  vhsa_debug("%s: gva: %p, hva: 0x%lx\n", __FUNCTION__, MemoryAddress, req.MemoryAddress);

  return rsp->ret;
}
/*
 * Query the host for the memory available on @Node.
 * *AvailableBytes receives the value reported by the host. Returns the host's
 * status, or -ENOMEM if no response buffer is available.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtAvailableMemory(HSAuint32 Node, HSAuint64* AvailableBytes) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_memory_req req = {
      .hdr = VHSAKMT_CCMD(MEMORY, sizeof(struct vhsakmt_ccmd_memory_req)),
      .type = VHSAKMT_CCMD_MEMORY_AVAIL_MEM,
      .Node = Node,
  };
  struct vhsakmt_ccmd_memory_rsp* rsp =
      vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_memory_rsp));

  if (rsp == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  *AvailableBytes = rsp->available_bytes;

  return rsp->ret;
}
/*
 * Map a scratch range to the GPU, asking the host to create a BO for it.
 *
 * Sends VHSAKMT_CCMD_MEMORY_MAP_MEM_TO_GPU with need_create_bo set, then wraps
 * the resulting blob in a VHSA_BO_SCRATCH_MAP BO. *AlternateVAGPU receives the
 * alternate VA reported by the host.
 *
 * Returns the host's status, -ENOMEM if no response buffer is available, or
 * the error from vhsakmt_init_host_blob().
 */
static int vhsakmt_create_scratch_map_memory(vhsakmt_device_handle dev, void* MemoryAddress,
                                             HSAuint64 MemorySizeInBytes,
                                             HSAuint64* AlternateVAGPU) {
  struct vhsakmt_ccmd_memory_rsp* rsp;
  vhsakmt_bo_handle scratch_bo;
  int status;
  struct vhsakmt_ccmd_memory_req req = {
      .hdr = VHSAKMT_CCMD(MEMORY, sizeof(struct vhsakmt_ccmd_memory_req)),
      .type = VHSAKMT_CCMD_MEMORY_MAP_MEM_TO_GPU,
      .blob_id = vhsakmt_atomic_inc_return(&dev->next_blob_id),
      .map_to_GPU_args =
          {
              .MemoryAddress = (uint64_t)MemoryAddress,
              .MemorySizeInBytes = MemorySizeInBytes,
              .need_create_bo = true,
          },
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_memory_rsp));
  if (rsp == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  if (rsp->ret != 0) return rsp->ret;

  status = vhsakmt_init_host_blob(dev, MemorySizeInBytes, VIRTGPU_BLOB_MEM_HOST3D, 0, req.blob_id,
                                  VHSA_BO_SCRATCH_MAP, NULL, &scratch_bo);
  if (status != 0) return status;

  // TODO: insert scratch bo into rbtree, or insert it in dev nodes.
  scratch_bo->cpu_addr = MemoryAddress;
  scratch_bo->host_addr = (void*)rsp->memory_handle;

  *AlternateVAGPU = rsp->alternate_vagpu;

  vhsa_debug(
      "%s: create scratch memory, gva: %p, memory_handle: 0x%p, alternate_vagpu: %p, size: %lx\n",
      __FUNCTION__, MemoryAddress, (void*)rsp->memory_handle, (void*)rsp->alternate_vagpu,
      MemorySizeInBytes);

  return rsp->ret;
}
/*
 * Map a memory range to the GPU on the host.
 *
 * @MemoryAddress:     guest address; translated to the BO's host address when
 *                     a memory BO is registered for it, sent as-is otherwise.
 * @MemorySizeInBytes: size of the range.
 * @AlternateVAGPU:    optional out parameter for the alternate GPU VA reported
 *                     by the host; may be NULL.
 *
 * An address without a BO that lies in a scratch area is handled by
 * vhsakmt_create_scratch_map_memory(). A userptr BO found for the address is
 * removed from the tree and freed once its host address has been captured.
 *
 * Returns the host's status, or -ENOMEM if no response buffer is available.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtMapMemoryToGPU(void* MemoryAddress, HSAuint64 MemorySizeInBytes,
                                              HSAuint64* AlternateVAGPU) {
  CHECK_VIRTIO_KFD_OPEN();
  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_memory_rsp* rsp;
  vhsakmt_bo_handle bo = vhsakmt_find_bo_by_addr(dev, MemoryAddress);

  if (!bo && vhsakmt_is_scratch_mem(dev, MemoryAddress)) {
    /* The helper writes its result unconditionally, so give it a local slot
     * and forward the value only when the caller asked for it. The helper
     * stores a value only on its success path, which returns 0. */
    HSAuint64 scratch_va = 0;
    int r = vhsakmt_create_scratch_map_memory(dev, MemoryAddress, MemorySizeInBytes, &scratch_va);

    if (r == 0 && AlternateVAGPU) *AlternateVAGPU = scratch_va;
    return r;
  }

  struct vhsakmt_ccmd_memory_req req = {
      .hdr = VHSAKMT_CCMD(MEMORY, sizeof(struct vhsakmt_ccmd_memory_req)),
      .type = VHSAKMT_CCMD_MEMORY_MAP_MEM_TO_GPU,
      .map_to_GPU_args =
          {
              .MemoryAddress = bo ? (uint64_t)bo->host_addr : (uint64_t)MemoryAddress,
              .MemorySizeInBytes = MemorySizeInBytes,
          },
  };

  if (bo && (bo->bo_type & VHSA_BO_USERPTR)) vhsakmt_remove_userptr_bo(dev, bo);

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_memory_rsp));
  if (!rsp) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  vhsa_debug("%s: gva: %p, hva: 0x%lx, size: %lx\n", __FUNCTION__, MemoryAddress, req.MemoryAddress,
             MemorySizeInBytes);

  /* The output pointer is optional; only write through it when provided. */
  if (AlternateVAGPU) *AlternateVAGPU = rsp->alternate_vagpu;

  return rsp->ret;
}
/*
 * Ask the host to map the userptr resource @res_id.
 *
 * *userptr_handle receives the handle reported by the host; the response
 * field is zeroed before the request is executed. @addr and @size are
 * currently unused. Returns the host's status, or -ENOMEM if no response
 * buffer is available (in which case *userptr_handle is not written).
 */
static int vhsakmt_map_userptr(vhsakmt_device_handle dev, void* addr, size_t size, uint32_t res_id,
                               uint64_t* userptr_handle) {
  struct vhsakmt_ccmd_memory_rsp* rsp;
  struct vhsakmt_ccmd_memory_req req = {
      .hdr = VHSAKMT_CCMD(MEMORY, sizeof(struct vhsakmt_ccmd_memory_req)),
      .type = VHSAKMT_CCMD_MEMORY_MAP_USERPTR,
      .res_id = res_id,
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_memory_rsp));
  if (rsp == NULL) return -ENOMEM;

  rsp->map_userptr_rsp.userptr_handle = 0;
  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  *userptr_handle = rsp->map_userptr_rsp.userptr_handle;

  return rsp->ret;
}
/*
 * Register the guest range [@addr, @addr + @size) as a userptr with the host.
 *
 * Creates a userptr blob BO, asks the host to map it, derives the host
 * address (host handle plus the in-page offset of @addr, plus the blob offset
 * when the range was already registered) and inserts the BO into the device
 * tree keyed by the guest address.
 *
 * Returns the host address on success, NULL on failure.
 */
static void* vhsakmt_map_to_gpu(void* addr, size_t size) {
  vhsakmt_device_handle dev = vhsakmt_dev();
  size_t offset = (uint64_t)addr % getpagesize();
  size_t map_size = (VHSA_ALIGN_UP(size + offset, getpagesize()) / getpagesize()) * getpagesize();
  uint64_t userptr_offset, userptr_handle = 0;
  vhsakmt_bo_handle userptr;
  int r, map_ret;

  vhsa_debug("%s: addr: %p, size: 0x%lx, size + offset: 0x%lx, map_size: 0x%lx\n", __FUNCTION__,
             addr, size, size + offset, map_size);

  r = vhsakmt_init_userptr_blob(dev, addr, size, &userptr, &userptr_offset);
  if (r < 0) {
    vhsa_debug("%s: userptr create failed at address: %p, ret = %d\n", __FUNCTION__, addr, r);
    return NULL;
  }

  map_ret = vhsakmt_map_userptr(dev, addr, size, userptr->real.res_id, &userptr_handle);
  if (!userptr_handle) {
    vhsa_debug("%s: map userptr failed at address: %p, ret = %d\n", __FUNCTION__, addr, map_ret);
    /* The BO has not been inserted into the tree yet, so there is nothing to
     * remove. vhsakmt_destroy_handle() releases the blob handle and frees the
     * BO; it must not be touched afterwards. */
    pthread_mutex_destroy(&userptr->map_mutex);
    vhsakmt_destroy_handle(dev, userptr);
    return NULL;
  }

  userptr->host_addr = VHSA_UINT64_TO_VPTR(VHSA_VPTR_TO_UINT64(userptr_handle) + offset);

  /* A positive result from blob creation means the range was already
   * registered; the blob offset then has to be added as well. */
  if (r > 0) {
    vhsa_debug("%s: userptr: %p already registered, offset: %lx\n", __FUNCTION__, addr,
               userptr_offset);
    userptr->host_addr =
        VHSA_UINT64_TO_VPTR(VHSA_VPTR_TO_UINT64(userptr->host_addr) + userptr_offset);
  }

  vhsakmt_insert_bo(dev, userptr, userptr->cpu_addr, userptr->size);

  vhsa_debug("%s: real gva: %p, gva: %p, hva: %p, size: %lx, offset: %" PRIu64
             ", map_size: 0x%lx\n",
             __FUNCTION__, addr, userptr->cpu_addr, userptr->host_addr, size, offset, map_size);

  return userptr->host_addr;
}
/*
 * Register a guest memory range with the host.
 *
 * Addresses that are not userptrs need no registration and report
 * HSAKMT_STATUS_SUCCESS immediately. Userptrs are first mapped through
 * vhsakmt_map_to_gpu(); the resulting host address is then registered with
 * the given flags.
 *
 * Returns the host's status, HSAKMT_STATUS_ERROR if the userptr cannot be
 * mapped, or -ENOMEM if no response buffer is available.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtRegisterMemoryWithFlags(void* MemoryAddress,
                                                       HSAuint64 MemorySizeInBytes,
                                                       HsaMemFlags MemFlags) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_memory_rsp* rsp;
  void* host_va;

  /* no need to register memory from libhsakmt / not a userptr */
  if (!vhsakmt_is_userptr(dev, MemoryAddress)) return HSAKMT_STATUS_SUCCESS;

  host_va = vhsakmt_map_to_gpu(MemoryAddress, MemorySizeInBytes);
  if (host_va == NULL) {
    vhsa_debug("%s: register memory failed, gva: %p, size: %lx\n", __FUNCTION__, MemoryAddress,
               MemorySizeInBytes);
    return HSAKMT_STATUS_ERROR;
  }

  struct vhsakmt_ccmd_memory_req req = {
      .hdr = VHSAKMT_CCMD(MEMORY, sizeof(struct vhsakmt_ccmd_memory_req)),
      .type = VHSAKMT_CCMD_MEMORY_REG_MEM_WITH_FLAG,
      .reg_mem_with_flag =
          {
              .MemorySizeInBytes = MemorySizeInBytes,
              .MemFlags = MemFlags,
          },
  };
  req.reg_mem_with_flag.MemoryAddress = (uint64_t)host_va;

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_memory_rsp));
  if (rsp == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);

  return rsp->ret;
}
/*
 * Deregister a CL/GL interop BO on the host and release it locally.
 *
 * The deregistration request identifies the BO by resource id and cpu_addr.
 * The BO is released through vhsakmt_bo_free() regardless of the host's
 * answer. Returns the host's status, or -ENOMEM if no response buffer is
 * available (the BO is left untouched in that case).
 */
static int vhsakmt_remove_clgl_bo(vhsakmt_device_handle dev, vhsakmt_bo_handle bo) {
  struct vhsakmt_ccmd_memory_rsp* rsp;
  /* Captured up front: the BO may be freed by vhsakmt_bo_free() below, so it
   * must not be dereferenced for the final log message. */
  void* gva = bo->cpu_addr;
  struct vhsakmt_ccmd_memory_req req = {
      .hdr = VHSAKMT_CCMD(MEMORY, sizeof(struct vhsakmt_ccmd_memory_req)),
      .type = VHSAKMT_CCMD_MEMORY_DEREG_MEM,
      .res_id = bo->real.res_id,
      .MemoryAddress = (uint64_t)bo->cpu_addr,
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_memory_rsp));
  if (!rsp) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  if (rsp->ret) vhsa_err("%s: deregister failed clgl memory gva: %p\n", __FUNCTION__, gva);

  vhsakmt_bo_free(dev, bo);

  vhsa_debug("%s: deregister clgl memory gva: %p, ret: %d\n", __FUNCTION__, gva, rsp->ret);

  return rsp->ret;
}
/*
 * Deregister memory previously registered at @MemoryAddress.
 *
 * An address with no registered memory BO reports HSAKMT_STATUS_SUCCESS.
 * CL/GL interop BOs are deregistered on the host via vhsakmt_remove_clgl_bo();
 * any other BO is removed from the tree and its structure freed, returning 0.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtDeregisterMemory(void* MemoryAddress) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle dev = vhsakmt_dev();
  vhsakmt_bo_handle reg_bo = vhsakmt_find_bo_by_addr(dev, MemoryAddress);

  if (reg_bo == NULL) return HSAKMT_STATUS_SUCCESS;

  vhsa_debug("%s: remove userptr %p size: 0x%lx, res id: %d\n", __FUNCTION__, MemoryAddress,
             (size_t)reg_bo->size, reg_bo->real.res_id);

  if (reg_bo->bo_type & VHSA_BO_CLGL) return vhsakmt_remove_clgl_bo(dev, reg_bo);

  vhsakmt_remove_bo(dev, reg_bo);
  free(reg_bo);

  return 0;
}
/*
 * Query the host for information about @Pointer.
 *
 * The pointer is translated with vhsakmt_gpu_va() first. The host's answer is
 * copied into *PointerInfo. When it reports mapped nodes, the node list
 * (capped at QUERY_PTR_INFO_MAX_MAPPED_NODES entries) is copied from the
 * response payload into a freshly calloc'ed array stored in
 * PointerInfo->MappedNodes.
 *
 * Returns the host's status, -HSAKMT_STATUS_ERROR if the pointer cannot be
 * translated, -ENOMEM if no response buffer is available, or
 * -HSAKMT_STATUS_NO_MEMORY if the node array cannot be allocated.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtQueryPointerInfo(const void* Pointer, HsaPointerInfo* PointerInfo) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle dev = vhsakmt_dev();
  void* host_va = vhsakmt_gpu_va(dev, VHSA_UINT64_TO_VPTR(Pointer));

  if (host_va == NULL) return -HSAKMT_STATUS_ERROR;

  struct vhsakmt_ccmd_query_info_req req = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_POINTER_INFO,
      .pointer = VHSA_VPTR_TO_UINT64(host_va),
  };
  struct vhsakmt_ccmd_query_info_rsp* rsp =
      vhsakmt_alloc_rsp(dev, &req.hdr,
                        sizeof(struct vhsakmt_ccmd_query_info_rsp) +
                            QUERY_PTR_INFO_MAX_MAPPED_NODES * sizeof(uint32_t));

  if (rsp == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  memcpy(PointerInfo, &rsp->ptr_info, sizeof(HsaPointerInfo));

  /* No node list to copy. */
  if (!PointerInfo->NMappedNodes || !PointerInfo->MappedNodes) return rsp->ret;

  if (PointerInfo->NMappedNodes > QUERY_PTR_INFO_MAX_MAPPED_NODES) {
    PointerInfo->NMappedNodes = QUERY_PTR_INFO_MAX_MAPPED_NODES;
    vhsa_debug(
        "%s: query pointer: %p info mapped nodes greater than QUERY_PTR_INFO_MAX_MAPPED_NODES\n",
        __FUNCTION__, Pointer);
  }

  /* Replace the pointer value from the response with a guest-side array. */
  PointerInfo->MappedNodes = calloc(PointerInfo->NMappedNodes, sizeof(uint32_t));
  if (!PointerInfo->MappedNodes) {
    PointerInfo->NMappedNodes = 0;
    return -HSAKMT_STATUS_NO_MEMORY;
  }

  memcpy(VHSA_UINT64_TO_VPTR(PointerInfo->MappedNodes), rsp->payload,
         PointerInfo->NMappedNodes * sizeof(uint32_t));

  return rsp->ret;
}
/*
 * Query the tile configuration of @NodeId from the host.
 *
 * @config: in/out. On entry TileConfig/MacroTileConfig point at caller-owned
 *          arrays and NumTileConfigs/NumMacroTileConfigs give their
 *          capacities. On return the scalar fields hold the host's values and
 *          the arrays are filled from the response payload (tile configs
 *          first, macro tile configs directly after).
 *
 * The response struct is copied over *config wholesale, so the caller's array
 * pointers are saved beforehand and restored afterwards; otherwise the copies
 * would target whatever pointer values the response carries. The returned
 * counts are clamped to the capacities the caller supplied, which are also
 * the sizes the response buffer was allocated for.
 *
 * Returns the host's status, or -ENOMEM if no response buffer is available.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetTileConfig(HSAuint32 NodeId, HsaGpuTileConfig* config) {
  CHECK_VIRTIO_KFD_OPEN();
  vhsakmt_device_handle dev = vhsakmt_dev();
  uint8_t* config_cpy_addr = NULL;
  struct vhsakmt_ccmd_query_info_rsp* rsp;
  /* Snapshot of the caller's view: array pointers and capacities. */
  HsaGpuTileConfig caller = *config;
  unsigned req_len = sizeof(struct vhsakmt_ccmd_query_info_req);
  unsigned rsp_len = sizeof(struct vhsakmt_ccmd_query_info_rsp) +
      config->NumTileConfigs * sizeof(HSAuint32) + config->NumMacroTileConfigs * sizeof(HSAuint32);
  struct vhsakmt_ccmd_query_info_req req = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, req_len),
      .type = VHSAKMT_CCMD_QUERY_TILE_CONFIG,
      .tile_config_args.NodeId = NodeId,
      .tile_config_args.config = *config,
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, rsp_len);
  if (!rsp) return -ENOMEM;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);

  memcpy(config, &rsp->tile_config_rsp, sizeof(HsaGpuTileConfig));

  /* Restore the guest-side destination buffers. */
  config->TileConfig = caller.TileConfig;
  config->MacroTileConfig = caller.MacroTileConfig;

  /* Never copy more than the caller's arrays and the response can hold. */
  if (config->NumTileConfigs > caller.NumTileConfigs)
    config->NumTileConfigs = caller.NumTileConfigs;
  if (config->NumMacroTileConfigs > caller.NumMacroTileConfigs)
    config->NumMacroTileConfigs = caller.NumMacroTileConfigs;

  config_cpy_addr = ((uint8_t*)rsp->payload);
  if (config->TileConfig && config->NumTileConfigs)
    memcpy(config->TileConfig, config_cpy_addr, config->NumTileConfigs * sizeof(HSAuint32));

  config_cpy_addr += config->NumTileConfigs * sizeof(HSAuint32);
  if (config->MacroTileConfig && config->NumMacroTileConfigs)
    memcpy(config->MacroTileConfig, config_cpy_addr,
           config->NumMacroTileConfigs * sizeof(HSAuint32));

  return rsp->ret;
}
/*
 * Create the guest-side BO that tracks a registered CL/GL interop resource.
 *
 * The BO records the resource id, the GL buffer handle, optional metadata and
 * @addr as host address, is flagged VHSA_BO_CLGL, and is inserted into the
 * device tree keyed by (@addr, @size). No blob is created here.
 * Returns 0 on success or -ENOMEM if the BO cannot be allocated.
 *
 * NOTE(review): cpu_addr is left NULL and map_mutex is not explicitly
 * initialised for these BOs - confirm that lookups and teardown cope.
 */
static int vhsakmt_create_clgl_bo(vhsakmt_device_handle dev, void* addr, size_t size,
                                  uint32_t res_id, uint32_t bo_handle, void* meta_data) {
  vhsakmt_bo_handle clgl_bo = calloc(1, sizeof(struct vhsakmt_bo));

  if (clgl_bo == NULL) return -ENOMEM;

  clgl_bo->dev = dev;
  clgl_bo->size = size;
  atomic_store(&clgl_bo->real.map_count, 0);
  atomic_store(&clgl_bo->refcount, 1);

#ifdef CLGL_EXPORT_RESID
  clgl_bo->real.res_id = GraphicsResourceHandle;
#else
  clgl_bo->real.res_id = res_id;
#endif

  /* GL bo handle from GL context*/
  clgl_bo->real.handle = bo_handle;
  clgl_bo->bo_type |= VHSA_BO_CLGL;
  if (meta_data) clgl_bo->gl_meta_data = meta_data;
  clgl_bo->host_addr = addr;

  vhsakmt_insert_bo(dev, clgl_bo, addr, clgl_bo->size);

  return 0;
}
/*
 * Turn a graphics handle into a GEM handle and virtio resource id.
 *
 * @gfx_handle is passed to drmPrimeFDToHandle() as a prime fd on the device's
 * DRM fd; the resulting GEM handle is stored in *bo_handle and its resource id
 * in *res_id. Returns 0 on success or the drmPrimeFDToHandle() error.
 */
static int vhsakmt_gfxhandle_to_resid(vhsakmt_device_handle dev, uint32_t gfx_handle,
                                      uint32_t* res_id, uint32_t* bo_handle) {
  int status = drmPrimeFDToHandle(dev->vgdev->fd, gfx_handle, bo_handle);

  if (status != 0) {
    vhsa_err("%s: drmPrimeFDToHandle failed for handle: %u\n", __FUNCTION__, gfx_handle);
    return status;
  }

  virtio_gpu_res_id(dev->vgdev, *bo_handle, res_id);

  vhsa_debug("%s: register praphics handle: handle: %d, bo_handle: %d, res_id: %d\n", __FUNCTION__,
             gfx_handle, *bo_handle, *res_id);

  return 0;
}
/*
 * Register a graphics (GL interop) resource with a set of nodes on the host.
 *
 * @GraphicsResourceHandle: exported fd of the graphics resource.
 * @GraphicsResourceInfo:   out; filled from the host's answer. Metadata, when
 *                          present, is copied into a calloc'ed buffer.
 * @NumberOfNodes:          number of entries in @NodeArray.
 * @NodeArray:              node ids, sent as the request payload.
 *
 * On success a guest-side CL/GL BO is created for the returned address. The
 * exported fd is closed on the success path and on the failure paths that
 * reach the common exit, as before.
 *
 * Returns the host's status, the fd-import error, or -ENOMEM on allocation
 * failure.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtRegisterGraphicsHandleToNodes(
    HSAuint64 GraphicsResourceHandle, HsaGraphicsResourceInfo* GraphicsResourceInfo,
    HSAuint64 NumberOfNodes, HSAuint32* NodeArray) {
  CHECK_VIRTIO_KFD_OPEN();
  vhsakmt_device_handle dev = vhsakmt_dev();
  /* Zero-initialised so they are defined even when CLGL_EXPORT_RESID skips
   * the fd import below. */
  uint32_t bo_handle = 0, res_id = 0;
  uint64_t meta_data_size = VHSA_GL_METADATA_MAX_SIZE;
  /* The request length is kept as it always was (it reserves pointer-sized
   * slots per node); only the first NumberOfNodes * sizeof(*NodeArray)
   * payload bytes are filled, the rest stays zero from calloc. */
  unsigned req_len = sizeof(struct vhsakmt_ccmd_gl_inter_req) + NumberOfNodes * sizeof(NodeArray);
  struct vhsakmt_ccmd_gl_inter_req* req;
  struct vhsakmt_ccmd_gl_inter_rsp* rsp;
  void* meta = NULL;
  int r;

  req = calloc(1, req_len);
  if (!req) return -ENOMEM;

  req->hdr = VHSAKMT_CCMD(GL_INTER, req_len);
  req->type = VHSAKMT_CCMD_GL_REG_GHD_TO_NODES;
  req->reg_ghd_to_nodes.NumberOfNodes = NumberOfNodes;
  req->reg_ghd_to_nodes.res_handle = GraphicsResourceHandle;
#ifdef CLGL_EXPORT_RESID
  req->reg_ghd_to_nodes.GraphicsResourceHandle = GraphicsResourceHandle;
#else
  r = vhsakmt_gfxhandle_to_resid(dev, GraphicsResourceHandle, &res_id, &bo_handle);
  if (r) {
    /* Do not leak the request buffer on this early exit. */
    free(req);
    return r;
  }
  req->reg_ghd_to_nodes.GraphicsResourceHandle = bo_handle;
  req->reg_ghd_to_nodes.res_handle = res_id;
#endif

  /* Copy exactly the caller's node ids; sizeof(NodeArray) would be the size
   * of the pointer and read past the end of the array. */
  if (NumberOfNodes > 0) memcpy(req->payload, NodeArray, NumberOfNodes * sizeof(*NodeArray));

  rsp =
      vhsakmt_alloc_rsp(dev, &req->hdr, sizeof(struct vhsakmt_ccmd_gl_inter_rsp) + meta_data_size);
  if (!rsp) {
    r = -ENOMEM;
    goto free_out;
  }

  vhsakmt_execbuf_cpu(dev, &req->hdr, __FUNCTION__);
  if (rsp->ret) {
    /* Do not leak the request buffer on this early exit. */
    r = rsp->ret;
    free(req);
    return r;
  }

  memcpy(GraphicsResourceInfo, &rsp->info, sizeof(HsaGraphicsResourceInfo));

  if (rsp->info.MetadataSizeInBytes) {
    meta = calloc(1, GraphicsResourceInfo->MetadataSizeInBytes);
    GraphicsResourceInfo->Metadata = meta;
    if (!meta) {
      r = -ENOMEM;
      goto free_out;
    }
    memcpy(meta, rsp->payload, GraphicsResourceInfo->MetadataSizeInBytes);
  } else
    GraphicsResourceInfo->Metadata = NULL;

  vhsa_debug("%s: register graphics handle: handle: %ld hva: %p, size: %lx\n", __FUNCTION__,
             GraphicsResourceHandle, GraphicsResourceInfo->MemoryAddress,
             GraphicsResourceInfo->SizeInBytes);

  r = vhsakmt_create_clgl_bo(dev, GraphicsResourceInfo->MemoryAddress,
                             GraphicsResourceInfo->SizeInBytes, res_id, bo_handle, meta);
  if (r) {
    /* The BO never took ownership of the metadata copy; release it. */
    free(meta);
    GraphicsResourceInfo->Metadata = NULL;
    goto free_out;
  }

  r = rsp->ret;

free_out:
  /* close exported FD after register or close it when deregister. Close after register here. */
  close(GraphicsResourceHandle);
  free(req);
  return r;
}
================================================
FILE: libhsakmt/src/virtio/hsakmt_virtio_openclose.c
================================================
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "hsakmt/hsakmt_virtio.h"
#include "hsakmt_virtio_device.h"
/* Lock taken by vhsakmt_device_init() and vhsaKmtCloseKFD() around device setup/teardown. */
pthread_mutex_t dev_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Device handle published by vhsakmt_device_init(); NULL until a device has been created. */
vhsakmt_device_handle dev_list = NULL;
/* Accessor for the global device handle (dev_list); yields NULL when none has been set. */
vhsakmt_device_handle vhsakmt_dev(void) {
  vhsakmt_device_handle current = dev_list;

  return current;
}
/*
 * Ask the host to open KFD and reserve the VM range it reports.
 *
 * Sends a QUERY_INFO / OPEN_KFD request carrying the address returned by
 * vhsakmt_vm_start(), stores the host supplied vm_start/vm_size on the device
 * via vhsakmt_set_vm_area() and reserves that range with vhsakmt_reserve_va().
 *
 * Returns the host status (rsp->ret) once the range is reserved. Failures are
 * reported with regular (positive) HSAKMT_STATUS enumerators:
 *   HSAKMT_STATUS_NO_MEMORY - no start address, no response buffer, or the
 *                             reservation failed;
 *   HSAKMT_STATUS_ERROR     - the host reported an empty VM area.
 */
static HSAKMT_STATUS vhsakmt_openKFD_cmd(vhsakmt_device_handle dev) {
  struct vhsakmt_ccmd_query_info_rsp* rsp;
  void* vm_start = vhsakmt_vm_start();

  if (!vm_start) {
    vhsa_err("%s: failed to get current heap start address\n", __FUNCTION__);
    return HSAKMT_STATUS_NO_MEMORY;
  }

  struct vhsakmt_ccmd_query_info_req req = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_OPEN_KFD,
      .open_kfd_args =
          {
              .cur_vm_start = VHSA_VPTR_TO_UINT64(vm_start),
          },
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp));
  if (!rsp) return HSAKMT_STATUS_NO_MEMORY;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);

  /* A zero start or size means the host did not hand out a usable VM area. */
  if (!rsp->open_kfd_rsp.vm_start || !rsp->open_kfd_rsp.vm_size) {
    vhsa_err("%s: failed to get KFD VM area\n", __FUNCTION__);
    return HSAKMT_STATUS_ERROR;
  }

  vhsakmt_set_vm_area(dev, rsp->open_kfd_rsp.vm_start, rsp->open_kfd_rsp.vm_size);

  if (vhsakmt_reserve_va(dev->vm_start, dev->vm_size)) {
    vhsa_err("%s: failed to reserve VM area: [%lx-%lx]-0x%lx\n", __FUNCTION__, dev->vm_start,
             dev->vm_start + dev->vm_size, dev->vm_size);
    return HSAKMT_STATUS_NO_MEMORY;
  }

  vhsa_debug("%s: kfd vm range: [%lx-%lx]-0x%lx\n", __FUNCTION__, dev->vm_start,
             dev->vm_start + dev->vm_size, dev->vm_size);

  return rsp->ret;
}
/*
 * Return the global device, creating it on first use.
 *
 * The whole lookup-or-create sequence runs under dev_mutex so that two
 * concurrent first callers cannot both build (and publish) a device.
 *
 * On creation the virtio-gpu KFD node is opened, the device structure is
 * allocated and initialised (BO tree, blob id counter, refcount = 1, mutexes)
 * and the result is stored in dev_list.
 *
 * Returns the device handle, or NULL if opening the node, allocating the
 * structure, or virtio_gpu_init() failed.
 */
static vhsakmt_device_handle vhsakmt_device_init(void) {
  vhsakmt_device_handle dev;
  int fd;

  pthread_mutex_lock(&dev_mutex);

  /* Re-use the already published device, if any. */
  dev = vhsakmt_dev();
  if (dev) goto out;

  fd = virtio_gpu_kfd_open();
  if (fd < 0) goto out;

  dev = calloc(1, sizeof(struct vhsakmt_device));
  if (!dev) {
    /* Nothing owns the descriptor yet, so release it here. */
    close(fd);
    goto out;
  }

  dev->vgdev = virtio_gpu_init(fd, 0);
  if (!dev->vgdev) {
    /* NOTE(review): whether virtio_gpu_init() closes fd on failure is not
     * visible from this file; confirm and close it here if it does not. */
    free(dev);
    dev = NULL;
    goto out;
  }

  rbtree_init(&dev->bo_rbt);
  atomic_store(&dev->next_blob_id, 1);
  atomic_store(&dev->refcount, 1);
  pthread_mutex_init(&dev->bo_handles_mutex, NULL);
  pthread_mutex_init(&dev->vhsakmt_mutex, NULL);

  dev_list = dev;

out:
  pthread_mutex_unlock(&dev_mutex);
  return dev;
}
/*
 * Open the virtio KFD device.
 *
 * Applies the VHSAKMT_DEBUG_LEVEL environment variable (when set) to
 * vhsakmt_debug_level, makes sure the device exists and then issues the
 * OPEN_KFD command. Returns HSAKMT_STATUS_ERROR when no device could be set
 * up, otherwise whatever vhsakmt_openKFD_cmd() returns.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtOpenKFD(void) {
  const char* level_env = getenv("VHSAKMT_DEBUG_LEVEL");

  if (level_env != NULL) {
    vhsakmt_debug_level = atoi(level_env);
  }

  if (vhsakmt_device_init() == NULL) {
    return HSAKMT_STATUS_ERROR;
  }

  return vhsakmt_openKFD_cmd(vhsakmt_dev());
}
/*
 * Release the resources held by a device: the BO handle mutex, the reserved
 * VA range, the cached system/node properties and the virtio-gpu device.
 *
 * The device structure itself is not freed here and stays reachable through
 * dev_list, so the cached-property pointers are reset after being freed;
 * otherwise later code testing them (e.g. vhsaKmtReleaseSystemProperties)
 * would free or read stale memory.
 */
static void vhsakmt_device_destroy(struct vhsakmt_device* dev) {
  pthread_mutex_destroy(&dev->bo_handles_mutex);
  vhsakmt_dereserve_va(dev->vm_start, dev->vm_size);

  /* free(NULL) is a no-op, so no separate NULL tests are needed. */
  free(dev->sys_props);
  dev->sys_props = NULL;
  free(dev->vhsakmt_nodes);
  dev->vhsakmt_nodes = NULL;

  virtio_gpu_close(dev->vgdev);
}
/*
 * Drop one reference on the global device and tear it down when the count
 * reaches zero.
 *
 * The device handle is looked up while holding dev_mutex. Returns
 * HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED when no device exists (instead of
 * dereferencing a NULL handle), HSAKMT_STATUS_SUCCESS otherwise.
 *
 * NOTE(review): dev_list keeps pointing at the device after it has been
 * destroyed; confirm whether it should be cleared and the structure freed.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtCloseKFD(void) {
  HSAKMT_STATUS status = HSAKMT_STATUS_SUCCESS;
  vhsakmt_device_handle dev;

  pthread_mutex_lock(&dev_mutex);
  dev = vhsakmt_dev();
  if (!dev)
    status = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
  else if (vhsakmt_atomic_dec_return(&dev->refcount) <= 0)
    vhsakmt_device_destroy(dev);
  pthread_mutex_unlock(&dev_mutex);

  return status;
}
================================================
FILE: libhsakmt/src/virtio/hsakmt_virtio_proto.h
================================================
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef VHSAKMT_VIRTIO_PROTO_H
#define VHSAKMT_VIRTIO_PROTO_H
#include "hsakmt/linux/kfd_ioctl.h"
#include "hsakmt/hsakmt.h"
#include
#include
#include
#include "virtio_gpu.h"
/* For GCC-compatible compilers, -Wpadded is promoted to an error from here on, so a struct that
 * would receive implicit padding fails to build; the structs below carry explicit pad fields. */
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wpadded"
#endif
/* Generates to_<child>(): casts a `struct parent` pointer to a `struct child` pointer.
 * Note from the original author: defined in another header file in virglrenderer. */
#define VHSAKMT_DEFINE_CAST(parent, child) \
static inline struct child* to_##child(struct parent* x) { return (struct child*)x; }
/* With C11 or newer: compile-time check that sizeof(struct t) is a multiple of 8 and that its
 * alignment is at most 8. Expands to nothing for older language levels. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
#define VHSAKMT_STATIC_ASSERT_SIZE(t) \
static_assert(sizeof(struct t) % 8 == 0, "sizeof(struct " #t ") not multiple of 8"); \
static_assert(_Alignof(struct t) <= 8, "alignof(struct " #t ") too large");
#else
#define VHSAKMT_STATIC_ASSERT_SIZE(t)
#endif
/* Top-level command ids; VHSAKMT_CCMD() stores one of these in the request header's cmd field.
 * Numbering starts at 1. */
enum vhsakmt_ccmd {
VHSAKMT_CCMD_NOP = 1, /* No payload, can be used to sync with host */
VHSAKMT_CCMD_QUERY_INFO,
VHSAKMT_CCMD_EVENT,
VHSAKMT_CCMD_MEMORY,
VHSAKMT_CCMD_QUEUE,
VHSAKMT_CCMD_GL_INTER,
};
/* An HsaEvent together with the handles/ids tracked alongside it; carried in
 * struct vhsakmt_ccmd_event_rsp. `pad` is explicit padding (see the size assertion below). */
typedef struct _vHsaEvent {
HsaEvent event;
uint64_t event_handle;
uint64_t bo_handle;
uint32_t res_id;
uint32_t pad;
} vHsaEvent;
VHSAKMT_STATIC_ASSERT_SIZE(_vHsaEvent)
/* Header followed by a flexible array of HsaEvent entries.
 * NOTE(review): trigered_events_num presumably counts the valid entries of trigered_events[];
 * confirm against the producer of this structure. */
struct vhsakmt_event_shmem {
uint32_t trigered_events_num;
uint32_t pad;
HsaEvent trigered_events[];
};
VHSAKMT_STATIC_ASSERT_SIZE(vhsakmt_event_shmem)
/* Builds a request header: cmd is VHSAKMT_CCMD_<_cmd> and len is the given request length. */
#define VHSAKMT_CCMD(_cmd, _len) \
((struct vhsakmt_ccmd_req){ \
.cmd = VHSAKMT_CCMD_##_cmd, \
.len = (_len), \
})
/* Request for VHSAKMT_CCMD_NOP: just the header, no arguments. */
struct vhsakmt_ccmd_nop_req {
struct vhsakmt_ccmd_req hdr;
};
/*
* VHSAKMT_CCMD_QUERY_INFO
*
* Sub-commands selected through the `type` field of struct vhsakmt_ccmd_query_info_req.
*/
enum vhsakmt_ccmd_query_type {
VHSAKMT_CCMD_QUERY_GPU_INFO = 0,
VHSAKMT_CCMD_QUERY_OPEN_KFD,
VHSAKMT_CCMD_QUERY_GET_VER,
VHSAKMT_CCMD_QUERY_REL_SYS_PROP,
VHSAKMT_CCMD_QUERY_GET_SYS_PROP,
VHSAKMT_CCMD_QUERY_GET_NODE_PROP,
VHSAKMT_CCMD_QUERY_GET_XNACK_MODE,
VHSAKMT_CCMD_QUERY_RUN_TIME_ENABLE,
VHSAKMT_CCMD_QUERY_RUN_TIME_DISABLE,
VHSAKMT_CCMD_QUERY_GET_NOD_MEM_PROP,
VHSAKMT_CCMD_QUERY_GET_NOD_CACHE_PROP,
VHSAKMT_CCMD_QUERY_GET_NOD_IO_LINK_PROP,
VHSAKMT_CCMD_QUERY_GET_CLOCK_COUNTERS,
VHSAKMT_CCMD_QUERY_POINTER_INFO,
VHSAKMT_CCMD_QUERY_TILE_CONFIG,
VHSAKMT_CCMD_QUERY_NANO_TIME,
VHSAKMT_CCMD_QUERY_GET_RUNTIME_CAPS,
};
/* NOTE(review): not referenced in this part of the file; presumably bounds the mapped-node list
 * of a pointer-info query - confirm at the point of use. */
#define QUERY_PTR_INFO_MAX_MAPPED_NODES 3
/* Arguments for VHSAKMT_CCMD_QUERY_RUN_TIME_ENABLE. */
typedef struct _query_req_run_time_enable_args {
/* void* rDebug, bypassed by payload */
uint8_t pad[3];
uint8_t setupTtmp;
uint32_t __pad;
} query_req_run_time_enable_args;
VHSAKMT_STATIC_ASSERT_SIZE(_query_req_run_time_enable_args)
/* Arguments for VHSAKMT_CCMD_QUERY_GET_NOD_MEM_PROP: node id and number of banks requested. */
typedef struct _query_req_node_mem_prop_args {
uint32_t NodeId;
uint32_t NumBanks;
} query_req_node_mem_prop_args;
VHSAKMT_STATIC_ASSERT_SIZE(_query_req_node_mem_prop_args)
/* Arguments for VHSAKMT_CCMD_QUERY_GET_NOD_CACHE_PROP. */
typedef struct _query_req_node_cache_prop_args {
uint32_t NodeId;
uint32_t ProcessorId;
uint32_t NumCaches;
uint32_t pad;
} query_req_node_cache_prop_args;
VHSAKMT_STATIC_ASSERT_SIZE(_query_req_node_cache_prop_args)
/* Arguments for the node IO-link query (presumably VHSAKMT_CCMD_QUERY_GET_NOD_IO_LINK_PROP). */
typedef struct _query_req_node_io_link_args {
uint32_t NodeId;
uint32_t NumIoLinks;
} query_req_node_io_link_args;
VHSAKMT_STATIC_ASSERT_SIZE(_query_req_node_io_link_args)
/* Tile configuration query arguments: an HsaGpuTileConfig plus the node it refers to. */
typedef struct _query_tile_config {
HsaGpuTileConfig config;
uint32_t NodeId;
uint32_t pad;
} query_tile_config;
VHSAKMT_STATIC_ASSERT_SIZE(_query_tile_config)
/* Arguments for VHSAKMT_CCMD_QUERY_OPEN_KFD: the guest's current VM start address. */
typedef struct _query_open_kfd_args {
uint64_t cur_vm_start;
} query_open_kfd_args;
VHSAKMT_STATIC_ASSERT_SIZE(_query_open_kfd_args)
/* Response to VHSAKMT_CCMD_QUERY_OPEN_KFD: start and size of the VM area reported by the host. */
typedef struct _query_open_kfd_rsp {
uint64_t vm_start;
uint64_t vm_size;
} query_open_kfd_rsp;
VHSAKMT_STATIC_ASSERT_SIZE(_query_open_kfd_rsp)
/* Response carrying a single 64-bit time value (presumably for VHSAKMT_CCMD_QUERY_NANO_TIME). */
typedef struct _query_nano_time_rsp {
uint64_t nano_time;
} query_nano_time_rsp;
VHSAKMT_STATIC_ASSERT_SIZE(_query_nano_time_rsp)
/* Request for VHSAKMT_CCMD_QUERY_INFO. `type` holds an enum vhsakmt_ccmd_query_type value and
 * selects which member of the anonymous union is meaningful; `payload` is optional trailing
 * data. */
struct vhsakmt_ccmd_query_info_req {
struct vhsakmt_ccmd_req hdr;
struct drm_amdgpu_info info;
uint32_t type;
uint32_t pad;
union {
uint64_t pointer;
uint32_t NodeID; /* some query API just need node ID */
query_req_run_time_enable_args run_time_enable_args;
query_req_node_mem_prop_args node_mem_prop_args;
query_req_node_cache_prop_args node_cache_prop_args;
query_req_node_io_link_args node_io_link_args;
query_tile_config tile_config_args;
query_open_kfd_args open_kfd_args;
};
uint8_t payload[];
};
/* Provides to_vhsakmt_ccmd_query_info_req() for casting from the generic request header. */
VHSAKMT_DEFINE_CAST(vhsakmt_ccmd_req, vhsakmt_ccmd_query_info_req)
VHSAKMT_STATIC_ASSERT_SIZE(vhsakmt_ccmd_query_info_req)
/* NOTE(review): these limits are not used in this part of the file; presumably they cap the
 * element counts accepted for the corresponding queries - confirm at the point of use. */
#define VHSAKMT_CCMD_QUERY_MAX_TILE_CONFIG 128
#define VHSAKMT_CCMD_QUERY_MAX_GET_NOD_MEM_PROP 128
#define VHSAKMT_CCMD_QUERY_MAX_GET_NOD_CACHE_PROP 128
#define VHSAKMT_CCMD_QUERY_MAX_GET_NOD_IO_LINK_PROP 128
/* Response for VHSAKMT_CCMD_QUERY_INFO. `ret` is the status callers return to the API user; the
 * union member that is valid depends on the query type that was sent. Variable-length results
 * (e.g. arrays of memory properties) follow in `payload`. */
struct vhsakmt_ccmd_query_info_rsp {
struct vhsakmt_ccmd_rsp hdr;
int32_t ret;
union {
query_open_kfd_rsp open_kfd_rsp;
query_nano_time_rsp nano_time_rsp;
HsaGpuTileConfig tile_config_rsp;
HsaPointerInfo ptr_info;
struct amdgpu_gpu_info gpu_info;
HsaVersionInfo kfd_version;
HsaSystemProperties sys_props;
HsaNodeProperties node_props;
int32_t xnack_mode;
HsaClockCounters clock_counters;
uint32_t caps;
uint64_t pad[9];
};
uint8_t payload[];
};
VHSAKMT_STATIC_ASSERT_SIZE(vhsakmt_ccmd_query_info_rsp)
/*
* VHSAKMT_CCMD_EVENT
*
* Sub-commands selected through the `type` field of struct vhsakmt_ccmd_event_req.
*/
enum vhsakmt_ccmd_event_type {
VHSAKMT_CCMD_EVENT_CREATE,
VHSAKMT_CCMD_EVENT_DESTROY,
VHSAKMT_CCMD_EVENT_SET,
VHSAKMT_CCMD_EVENT_RESET,
VHSAKMT_CCMD_EVENT_QUERY_STATE,
VHSAKMT_CCMD_EVENT_WAIT_ON_MULTI_EVENTS,
VHSAKMT_CCMD_EVENT_SET_TRAP,
};
/* Arguments for event creation: the event descriptor plus the ManualReset / IsSignaled flags,
 * stored as bytes. */
typedef struct _event_req_create_args {
HsaEventDescriptor EventDesc;
uint8_t ManualReset;
uint8_t IsSignaled;
uint8_t pad[6];
} event_req_create_args;
VHSAKMT_STATIC_ASSERT_SIZE(_event_req_create_args)
/* Arguments for waiting on a single event with a timeout in milliseconds. */
typedef struct _event_req_wait_args {
HsaEvent Event;
uint32_t Milliseconds;
uint32_t pad;
} event_req_wait_args;
VHSAKMT_STATIC_ASSERT_SIZE(_event_req_wait_args)
/* Like event_req_wait_args, with an additional event_age value. */
typedef struct _event_req_wait_ext_args {
HsaEvent Event;
uint64_t event_age;
uint32_t Milliseconds;
uint32_t pad;
} event_req_wait_ext_args;
VHSAKMT_STATIC_ASSERT_SIZE(_event_req_wait_ext_args)
/* Arguments for waiting on several events; the events themselves are not part of this struct. */
typedef struct _event_req_wait_on_multi_args {
/* HsaEvent* Events[] is passed in the payload */
uint32_t NumEvents;
uint32_t Milliseconds;
uint8_t WaitOnAll;
uint8_t pad[7];
} event_req_wait_on_multi_args;
VHSAKMT_STATIC_ASSERT_SIZE(_event_req_wait_on_multi_args)
/* Like event_req_wait_on_multi_args, with an additional event_age value. */
typedef struct _event_req_wait_on_multi_ext_args {
/* HsaEvent* Events[] is passed in the payload */
uint32_t NumEvents;
uint32_t Milliseconds;
uint64_t event_age;
uint8_t WaitOnAll;
uint8_t pad[7];
} event_req_wait_on_multi_ext_args;
VHSAKMT_STATIC_ASSERT_SIZE(_event_req_wait_on_multi_ext_args)
/* Arguments for VHSAKMT_CCMD_EVENT_SET_TRAP: trap handler and trap buffer address/size for a
 * node (filled in by vhsaKmtSetTrapHandler). */
typedef struct _event_set_trap_handler_args {
uint64_t TrapHandlerBaseAddress;
uint64_t TrapHandlerSizeInBytes;
uint64_t TrapBufferBaseAddress;
uint64_t TrapBufferSizeInBytes;
uint32_t NodeId;
uint32_t pad;
} event_set_trap_handler_args;
VHSAKMT_STATIC_ASSERT_SIZE(_event_set_trap_handler_args)
/* Request for VHSAKMT_CCMD_EVENT. `type` holds an enum vhsakmt_ccmd_event_type value and selects
 * the meaningful union member; `payload` is optional trailing data.
 * NOTE(review): event_hanele is a raw pointer inside the request, so its width follows the
 * guest's pointer size - confirm the host side expects that. */
struct vhsakmt_ccmd_event_req {
struct vhsakmt_ccmd_req hdr;
union {
HsaEvent Event; /* For set, reset, query. */
HsaEvent* event_hanele;
event_req_wait_args wait_args;
event_req_create_args create_args;
event_req_wait_ext_args wait_ext_args;
event_req_wait_on_multi_args wait_on_multi_args;
event_req_wait_on_multi_ext_args wait_on_multi_ext_args;
event_set_trap_handler_args set_trap_handler_args;
};
uint32_t type;
uint32_t sync_shmem_res_id;
uint64_t blob_id;
uint32_t res_id;
uint32_t pad;
uint8_t payload[];
};
VHSAKMT_STATIC_ASSERT_SIZE(vhsakmt_ccmd_event_req)
/* Provides to_vhsakmt_ccmd_event_req() for casting from the generic request header. */
VHSAKMT_DEFINE_CAST(vhsakmt_ccmd_req, vhsakmt_ccmd_event_req)
/* Response for VHSAKMT_CCMD_EVENT: status in `ret`, the event data in `vevent`, optional trailing
 * data in `payload`. */
struct vhsakmt_ccmd_event_rsp {
struct vhsakmt_ccmd_rsp hdr;
int32_t ret;
vHsaEvent vevent;
uint8_t payload[];
};
VHSAKMT_STATIC_ASSERT_SIZE(vhsakmt_ccmd_event_rsp)
/*
* VHSAKMT_CCMD_MEMORY
*
* Sub-commands selected through the `type` field of struct vhsakmt_ccmd_memory_req.
*/
enum vhsakmt_ccmd_memory_type {
VHSAKMT_CCMD_MEMORY_ALLOC,
VHSAKMT_CCMD_MEMORY_MAP_TO_GPU_NODES,
VHSAKMT_CCMD_MEMORY_FREE,
VHSAKMT_CCMD_MEMORY_UNMAP_TO_GPU,
VHSAKMT_CCMD_MEMORY_AVAIL_MEM,
VHSAKMT_CCMD_MEMORY_MAP_MEM_TO_GPU,
VHSAKMT_CCMD_MEMORY_REG_MEM_WITH_FLAG,
VHSAKMT_CCMD_MEMORY_DEREG_MEM,
VHSAKMT_CCMD_MEMORY_MAP_USERPTR,
};
/* Arguments for a memory allocation request. */
typedef struct _memory_req_alloc_args {
uint32_t PreferredNode;
HsaMemFlags MemFlags;
uint64_t SizeInBytes;
uint64_t MemoryAddress;
} memory_req_alloc_args;
VHSAKMT_STATIC_ASSERT_SIZE(_memory_req_alloc_args)
/* Arguments for a memory free request: address and size of the region. */
typedef struct _memory_req_free_args {
uint64_t MemoryAddress;
uint64_t SizeInBytes;
} memory_req_free_args;
VHSAKMT_STATIC_ASSERT_SIZE(_memory_req_free_args)
/* Arguments for mapping memory to a set of GPU nodes.
 * NOTE(review): NodeArray is a raw pointer inside the request, so its width follows the guest's
 * pointer size - confirm how the host consumes it. */
typedef struct _memory_req_map_to_GPU_nodes_args {
uint64_t MemoryAddress;
uint64_t MemorySizeInBytes;
uint64_t AlternateVAGPU;
HsaMemMapFlags MemMapFlags;
uint32_t pad;
uint64_t NumberOfNodes;
uint32_t* NodeArray;
} memory_req_map_to_GPU_nodes_args;
VHSAKMT_STATIC_ASSERT_SIZE(_memory_req_map_to_GPU_nodes_args)
/* Arguments for mapping memory to the GPU; need_create_bo is a byte-sized flag. */
typedef struct _memory_map_mem_to_gpu_args {
uint64_t MemoryAddress;
uint64_t MemorySizeInBytes;
uint8_t need_create_bo;
uint8_t pad[7];
} memory_map_mem_to_gpu_args;
VHSAKMT_STATIC_ASSERT_SIZE(_memory_map_mem_to_gpu_args)
/* Arguments for registering memory together with HsaMemFlags. */
typedef struct _memory_reg_mem_with_flag {
uint64_t MemoryAddress;
uint64_t MemorySizeInBytes;
HsaMemFlags MemFlags;
uint32_t pad;
} memory_reg_mem_with_flag;
VHSAKMT_STATIC_ASSERT_SIZE(_memory_reg_mem_with_flag)
/* Request for VHSAKMT_CCMD_MEMORY. `type` holds an enum vhsakmt_ccmd_memory_type value and
 * selects the meaningful union member; `payload` is optional trailing data. */
struct vhsakmt_ccmd_memory_req {
struct vhsakmt_ccmd_req hdr;
union {
uint64_t MemoryAddress;
uint32_t Node;
memory_req_alloc_args alloc_args;
memory_req_map_to_GPU_nodes_args map_to_GPU_nodes_args;
memory_req_free_args free_args;
memory_map_mem_to_gpu_args map_to_GPU_args;
memory_reg_mem_with_flag reg_mem_with_flag;
};
uint64_t blob_id;
uint32_t type;
uint32_t res_id;
uint8_t payload[];
};
VHSAKMT_STATIC_ASSERT_SIZE(vhsakmt_ccmd_memory_req)
/* Provides to_vhsakmt_ccmd_memory_req() for casting from the generic request header. */
VHSAKMT_DEFINE_CAST(vhsakmt_ccmd_req, vhsakmt_ccmd_memory_req)
/* Result of a userptr mapping (presumably for VHSAKMT_CCMD_MEMORY_MAP_USERPTR): a handle and a
 * count named npfns. */
typedef struct _vhsakmt_ccmd_memory_map_userptr_rsp {
uint64_t userptr_handle;
uint32_t npfns;
uint32_t pad;
} vhsakmt_ccmd_memory_map_userptr_rsp;
VHSAKMT_STATIC_ASSERT_SIZE(_vhsakmt_ccmd_memory_map_userptr_rsp)
/* Response for VHSAKMT_CCMD_MEMORY: status in `ret`; which union member is valid depends on the
 * request type that was sent. */
struct vhsakmt_ccmd_memory_rsp {
struct vhsakmt_ccmd_rsp hdr;
int32_t ret;
union {
vhsakmt_ccmd_memory_map_userptr_rsp map_userptr_rsp;
uint64_t memory_handle;
uint64_t alternate_vagpu;
uint64_t available_bytes;
};
uint8_t payload[];
};
VHSAKMT_STATIC_ASSERT_SIZE(vhsakmt_ccmd_memory_rsp)
/*
* VHSAKMT_CCMD_QUEUE
*
* Sub-commands selected through the `type` field of struct vhsakmt_ccmd_queue_req.
*/
enum vhsakmt_ccmd_queue_type {
VHSAKMT_CCMD_QUEUE_CREATE,
VHSAKMT_CCMD_QUEUE_DESTROY,
};
/* HsaQueueResource plus host-side values returned on queue creation. vhsaKmtCreateQueueExt uses
 * host_doorbell / host_doorbell_offset to map the doorbell, host_rw_handle with
 * host_write_offset / host_read_offset to locate the read/write pointers, and r.QueueId as the
 * queue id. */
typedef struct _vHsaQueueResource {
HsaQueueResource r;
uint64_t host_doorbell;
uint64_t host_doorbell_offset;
uint64_t host_write_offset;
uint64_t host_read_offset;
uint64_t host_rw_handle;
uint64_t queue_handle;
} vHsaQueueResource;
VHSAKMT_STATIC_ASSERT_SIZE(_vHsaQueueResource)
/* Arguments for VHSAKMT_CCMD_QUEUE_CREATE, mirroring the parameters of vhsaKmtCreateQueueExt.
 * NOTE(review): the trailing members are raw pointers, so their width follows the guest's
 * pointer size - confirm how the host interprets them. */
typedef struct _queue_req_create {
uint32_t NodeId;
HSA_QUEUE_TYPE Type;
uint32_t QueuePercentage;
uint32_t pad;
HSA_QUEUE_PRIORITY Priority;
uint32_t pad1;
uint32_t SdmaEngineId;
uint64_t QueueAddress;
uint64_t QueueSizeInBytes;
HsaEvent* Event;
HsaQueueResource* QueueResource;
uint64_t* Queue_write_ptr_aql;
uint64_t* Queue_read_ptr_aql;
} queue_req_create;
VHSAKMT_STATIC_ASSERT_SIZE(_queue_req_create)
/* Request for VHSAKMT_CCMD_QUEUE. `type` holds an enum vhsakmt_ccmd_queue_type value; the union
 * carries either the creation arguments or a queue id. */
struct vhsakmt_ccmd_queue_req {
struct vhsakmt_ccmd_req hdr;
union {
HSA_QUEUEID QueueId;
queue_req_create create_queue_args;
};
uint64_t blob_id; /* For queue create, queue resource */
uint64_t rw_ptr_blob_id; /* For queue create, r/w ptr memory mapping */
uint64_t doorbell_blob_id; /* For queue create, doorbell ptr memory mapping */
uint32_t res_id;
uint32_t type;
uint32_t queue_mem_res_id;
uint32_t pad;
uint8_t payload[];
};
VHSAKMT_STATIC_ASSERT_SIZE(vhsakmt_ccmd_queue_req)
/* Provides to_vhsakmt_ccmd_queue_req() for casting from the generic request header. */
VHSAKMT_DEFINE_CAST(vhsakmt_ccmd_req, vhsakmt_ccmd_queue_req)
/* Response for VHSAKMT_CCMD_QUEUE: status in `ret` and the created queue's resources in
 * `vqueue_res`. */
struct vhsakmt_ccmd_queue_rsp {
struct vhsakmt_ccmd_rsp hdr;
int32_t ret;
vHsaQueueResource vqueue_res;
uint8_t payload[];
};
VHSAKMT_STATIC_ASSERT_SIZE(vhsakmt_ccmd_queue_rsp)
/*
* VHSAKMT_CCMD_GL_INTER
*
* Sub-commands selected through the `type` field of struct vhsakmt_ccmd_gl_inter_req.
*/
enum vhsakmt_ccmd_gl_inter_type {
VHSAKMT_CCMD_GL_REG_GHD_TO_NODES,
};
/* Arguments for VHSAKMT_CCMD_GL_REG_GHD_TO_NODES; the node array itself travels in the request
 * payload. */
typedef struct _gl_inter_req_reg_ghd_to_nodes {
uint64_t GraphicsResourceHandle;
uint64_t NumberOfNodes; // NodeArray in payload
uint32_t res_handle;
uint32_t pad;
} gl_inter_req_reg_ghd_to_nodes;
VHSAKMT_STATIC_ASSERT_SIZE(_gl_inter_req_reg_ghd_to_nodes)
/* Request for VHSAKMT_CCMD_GL_INTER. `type` holds an enum vhsakmt_ccmd_gl_inter_type value. */
struct vhsakmt_ccmd_gl_inter_req {
struct vhsakmt_ccmd_req hdr;
union {
gl_inter_req_reg_ghd_to_nodes reg_ghd_to_nodes;
};
uint32_t type;
uint32_t pad;
uint8_t payload[];
};
VHSAKMT_STATIC_ASSERT_SIZE(vhsakmt_ccmd_gl_inter_req)
/* Provides to_vhsakmt_ccmd_gl_inter_req() for casting from the generic request header. */
VHSAKMT_DEFINE_CAST(vhsakmt_ccmd_req, vhsakmt_ccmd_gl_inter_req)
/* Response for VHSAKMT_CCMD_GL_INTER: status in `ret`, graphics resource info in `info`, and
 * optional trailing data in `payload`. */
struct vhsakmt_ccmd_gl_inter_rsp {
struct vhsakmt_ccmd_rsp hdr;
int32_t ret;
union {
HsaGraphicsResourceInfo info;
};
uint8_t payload[];
};
VHSAKMT_STATIC_ASSERT_SIZE(vhsakmt_ccmd_gl_inter_rsp)
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#endif
================================================
FILE: libhsakmt/src/virtio/hsakmt_virtio_queues.c
================================================
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "hsakmt/hsakmt_virtio.h"
#include "hsakmt_virtio_device.h"
/* Size in bytes used when mapping a node's doorbell blob (fixed at 0x2000). */
static inline uint64_t vhsakmt_doorbell_page_size(void) {
  const uint64_t doorbell_bytes = 0x2000;

  return doorbell_bytes;
}
/* Size in bytes used for the queue read/write pointer blob: one system page. */
static inline uint64_t vhsakmt_queue_page_size(void) {
  const int page_bytes = getpagesize();

  return page_bytes;
}
/*
 * Forward a trap handler registration to the host.
 *
 * NodeId                  - node the handler is installed for.
 * TrapHandlerBaseAddress,
 * TrapHandlerSizeInBytes  - trap handler code region.
 * TrapBufferBaseAddress,
 * TrapBufferSizeInBytes   - trap buffer region.
 *
 * Returns the host status (rsp->ret), or HSAKMT_STATUS_NO_MEMORY when no
 * response buffer could be allocated.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtSetTrapHandler(HSAuint32 NodeId, void* TrapHandlerBaseAddress,
                                              HSAuint64 TrapHandlerSizeInBytes,
                                              void* TrapBufferBaseAddress,
                                              HSAuint64 TrapBufferSizeInBytes) {
  CHECK_VIRTIO_KFD_OPEN();
  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_event_rsp* rsp;
  struct vhsakmt_ccmd_event_req req = {
      .hdr = VHSAKMT_CCMD(EVENT, sizeof(struct vhsakmt_ccmd_event_req)),
      .type = VHSAKMT_CCMD_EVENT_SET_TRAP,
      .set_trap_handler_args =
          {
              .NodeId = NodeId,
              .TrapHandlerBaseAddress = (uint64_t)TrapHandlerBaseAddress,
              .TrapHandlerSizeInBytes = TrapHandlerSizeInBytes,
              .TrapBufferBaseAddress = (uint64_t)TrapBufferBaseAddress,
              .TrapBufferSizeInBytes = TrapBufferSizeInBytes,
          },
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_event_rsp));
  /* Report allocation failure with an HSAKMT_STATUS value rather than a negated errno. */
  if (!rsp) return HSAKMT_STATUS_NO_MEMORY;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);

  return rsp->ret;
}
/*
 * Look up the BO backing an AQL read/write pointer.
 *
 * aql_ptr is rounded down to a page boundary and used as the lookup address.
 * On a hit the BO is tagged with VHSA_BO_QUEUE_AQL_RW_PTR and its resource id
 * is stored in *aql_bo_res_id. Returns 0 on success, -EINVAL if no BO is found.
 */
static int vhsakmt_find_aql_rw_bo(vhsakmt_device_handle dev, uint64_t aql_ptr,
                                  uint32_t* aql_bo_res_id) {
  uint64_t page_base = VHSA_ALIGN_DOWN(aql_ptr, getpagesize());
  vhsakmt_bo_handle rw_bo = vhsakmt_find_bo_by_addr(dev, (void*)page_base);

  if (rw_bo == NULL) {
    return -EINVAL;
  }

  rw_bo->bo_type |= VHSA_BO_QUEUE_AQL_RW_PTR;
  *aql_bo_res_id = rw_bo->real.res_id;

  return 0;
}
/*
 * Create the mappable blob BO for a node's doorbell and register its CPU
 * address as that node's doorbell. Returns 0 on success or the error from the
 * failing step.
 */
static int vhsakmt_create_doorbell_blob_bo(vhsakmt_device_handle dev, uint32_t node, size_t size,
                                           uint32_t blob_id, uint64_t host_handle,
                                           vhsakmt_bo_handle* bo_handle) {
  int status = vhsakmt_create_mappable_blob_bo(dev, size, blob_id, VHSA_BO_QUEUE_DOORBELL,
                                               (void*)host_handle, bo_handle);

  if (status != 0) {
    return status;
  }

  status = vhsakmt_set_node_doorbell(dev, node, (*bo_handle)->cpu_addr);
  return status;
}
/*
 * Create the mappable blob BO that holds a queue's read/write pointers and
 * remember the host address it corresponds to. Returns 0 on success or the
 * error from the BO creation.
 */
static int vhsakmt_create_queue_rw_blob_bo(vhsakmt_device_handle dev, size_t size, uint32_t blob_id,
                                           uint64_t host_handle, vhsakmt_bo_handle* bo_handle) {
  int status =
      vhsakmt_create_mappable_blob_bo(dev, size, blob_id, VHSA_BO_QUEUE_RW_PTR, NULL, bo_handle);

  if (status == 0) {
    (*bo_handle)->host_addr = (void*)host_handle;
  }

  return status;
}
/*
 * Create the BO that represents a queue on the guest side.
 *
 * Initialises a HOST3D blob of type VHSA_BO_QUEUE, inserts it into the
 * device's BO tracking keyed by the BO itself, and records the queue id and
 * the associated read/write pointer BO (may be NULL). Returns 0 on success or
 * the error from the blob initialisation.
 */
static int vhsakmt_create_queue_blob_bo(vhsakmt_device_handle dev, size_t size, uint32_t blob_id,
                                        uint64_t queue_id, vhsakmt_bo_handle rw_bo_handle,
                                        vhsakmt_bo_handle* bo_handle) {
  int status = vhsakmt_init_host_blob(dev, size, VIRTGPU_BLOB_MEM_HOST3D, 0, blob_id,
                                      VHSA_BO_QUEUE, NULL, bo_handle);

  if (status != 0) {
    return status;
  }

  vhsakmt_bo_handle queue_bo = *bo_handle;

  vhsakmt_insert_bo(dev, queue_bo, queue_bo, queue_bo->size);
  queue_bo->queue_id = queue_id;
  queue_bo->rw_bo = rw_bo_handle;

  return status;
}
/*
 * Create a queue on the host and set up the guest-side objects for it.
 *
 * Steps:
 *  1. Build the create request. For HSA_QUEUE_COMPUTE_AQL the BO containing
 *     the caller's write pointer is looked up and passed by resource id; for
 *     other queue types a blob id is reserved for a separate read/write
 *     pointer mapping.
 *  2. Look up the BO backing QueueAddress and pass its resource id.
 *  3. Execute the request, then map the node doorbell if this node has none
 *     yet, map the read/write pointers (non-AQL queues only) and create the
 *     queue BO.
 *
 * On success QueueResource receives the doorbell address, the read/write
 * pointer addresses (non-AQL queues) and, as QueueId, the guest queue BO
 * handle (vhsaKmtDestroyQueue casts it back).
 *
 * Returns the host status on success; HSAKMT_STATUS_INVALID_PARAMETER when
 * QueueResource is NULL; HSAKMT_STATUS_NO_MEMORY when a required BO or the
 * response buffer is unavailable; otherwise the error of the failing step.
 *
 * NOTE(review): when a guest-side step fails after the host created the
 * queue, the host queue is not torn down here - confirm whether it should be.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtCreateQueueExt(HSAuint32 NodeId, HSA_QUEUE_TYPE Type,
                                              HSAuint32 QueuePercentage,
                                              HSA_QUEUE_PRIORITY Priority, HSAuint32 SdmaEngineId,
                                              void* QueueAddress, HSAuint64 QueueSizeInBytes,
                                              HsaEvent* Event, HsaQueueResource* QueueResource) {
  CHECK_VIRTIO_KFD_OPEN();
  /* QueueResource is read while building the request and written with the results. */
  if (!QueueResource) return HSAKMT_STATUS_INVALID_PARAMETER;

  vhsakmt_device_handle dev = vhsakmt_dev();
  vhsakmt_bo_handle rw_bo_handle = NULL, doorbell_bo, queue_bo, queue_mem_bo;
  struct vhsakmt_ccmd_queue_rsp* rsp;
  struct vhsakmt_ccmd_queue_req req = {
      .hdr = VHSAKMT_CCMD(QUEUE, sizeof(struct vhsakmt_ccmd_queue_req)),
      .type = VHSAKMT_CCMD_QUEUE_CREATE,
      .create_queue_args =
          {
              .NodeId = NodeId,
              .Type = Type,
              .QueuePercentage = QueuePercentage,
              .Priority = Priority,
              .SdmaEngineId = SdmaEngineId,
              .QueueAddress = (uint64_t)QueueAddress,
              .QueueSizeInBytes = QueueSizeInBytes,
              .Event = Event ? vhsakmt_event_host_handle(Event) : 0,
              .Queue_write_ptr_aql = QueueResource->Queue_write_ptr_aql,
              .Queue_read_ptr_aql = QueueResource->Queue_read_ptr_aql,
          },
      .blob_id = vhsakmt_atomic_inc_return(&dev->next_blob_id), /* For queue resource */
      .doorbell_blob_id = vhsakmt_node_doorbell(dev, NodeId)
          ? 0
          : vhsakmt_atomic_inc_return(&dev->next_blob_id), /* For queue doorbell memory map */
  };
  int r;

  /* Queue ptr memory is allocated by hsakmtallocmemory in host then mapped into guest, but their
   * address are not aligned. */
  if (Type == HSA_QUEUE_COMPUTE_AQL) {
    r = vhsakmt_find_aql_rw_bo(dev, QueueResource->QueueWptrValue, &req.res_id);
    if (r) {
      vhsa_debug("%s: can not find the AQL queue R/W BO: %p\n", __FUNCTION__,
                 QueueResource->Queue_write_ptr_aql);
      return HSAKMT_STATUS_NO_MEMORY;
    }
    vhsa_debug("%s: create AQL queue, read ptr: %p, write ptr: %p, res id: %d\n", __FUNCTION__,
               QueueResource->Queue_read_ptr_aql, QueueResource->Queue_write_ptr_aql, req.res_id);
  } else {
    /* For queue not CP AQL, it use r/w ptr by itself. */
    req.rw_ptr_blob_id = vhsakmt_atomic_inc_return(&dev->next_blob_id);
  }

  queue_mem_bo = vhsakmt_find_bo_by_addr(dev, QueueAddress);
  if (!queue_mem_bo) {
    vhsa_err("%s: can not find the queue memory BO: %p\n", __FUNCTION__, QueueAddress);
    return HSAKMT_STATUS_NO_MEMORY;
  }
  queue_mem_bo->bo_type |= VHSA_BO_QUEUE_AQL_RW_PTR;
  req.queue_mem_res_id = queue_mem_bo->real.res_id;

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_queue_rsp));
  /* Report allocation failure with an HSAKMT_STATUS value rather than a negated errno. */
  if (!rsp) return HSAKMT_STATUS_NO_MEMORY;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  if (rsp->ret) {
    vhsa_err("%s: queue create failed, ret: %d\n", __FUNCTION__, rsp->ret);
    return rsp->ret;
  }

  /* Map doorbell */
  if (req.doorbell_blob_id) {
    r = vhsakmt_create_doorbell_blob_bo(
        dev, NodeId, vhsakmt_doorbell_page_size(), req.doorbell_blob_id,
        rsp->vqueue_res.host_doorbell - rsp->vqueue_res.host_doorbell_offset, &doorbell_bo);
    if (r) {
      vhsa_err("%s: doorbell create failed, doorbell: %lx\n", __FUNCTION__,
               rsp->vqueue_res.host_doorbell);
      return r;
    }
    vhsa_debug("%s: create doorbell: %p, size: 0x%x\n", __FUNCTION__, doorbell_bo->cpu_addr,
               doorbell_bo->size);
  }
  QueueResource->Queue_DoorBell_aql = (void*)rsp->vqueue_res.host_doorbell;
  vhsa_debug("%s: queue create, Doorbell: %p\n", __FUNCTION__, QueueResource->Queue_DoorBell_aql);

  /* Map R/W pointer.
   * For a queue is not a COMPUTE AQL, the R/W PTR not using the input address,
   * uses the queue memory allocated by hsakmtallocmemory, a page align address.
   */
  if (Type != HSA_QUEUE_COMPUTE_AQL) {
    r = vhsakmt_create_queue_rw_blob_bo(dev, vhsakmt_queue_page_size(), req.rw_ptr_blob_id,
                                        rsp->vqueue_res.host_rw_handle, &rw_bo_handle);
    if (r) {
      vhsa_debug("%s: queue rw ptr create failed, host addr: %p\n", __FUNCTION__,
                 (void*)rsp->vqueue_res.host_rw_handle);
      return r;
    }
    QueueResource->Queue_write_ptr_aql = VHSA_UINT64_TO_VPTR(
        VHSA_VPTR_TO_UINT64(rw_bo_handle->cpu_addr) + rsp->vqueue_res.host_write_offset);
    QueueResource->Queue_read_ptr_aql = VHSA_UINT64_TO_VPTR(
        VHSA_VPTR_TO_UINT64(rw_bo_handle->cpu_addr) + rsp->vqueue_res.host_read_offset);
    vhsa_debug("%s: queue create: write ptr gva: %p, read ptr gva: %p, base hva: %lx\n",
               __FUNCTION__, QueueResource->Queue_write_ptr_aql, QueueResource->Queue_read_ptr_aql,
               rsp->vqueue_res.host_rw_handle);
  }

  r = vhsakmt_create_queue_blob_bo(dev, QueueSizeInBytes, req.blob_id, rsp->vqueue_res.r.QueueId,
                                   rw_bo_handle, &queue_bo);
  if (r) {
    vhsa_err("%s: queue create failed, queue ID: 0x%lx\n", __FUNCTION__, rsp->vqueue_res.r.QueueId);
    /* No queue BO took ownership of the read/write pointer BO, so release it here. */
    if (rw_bo_handle) vhsakmt_bo_free(dev, rw_bo_handle);
    return r;
  }

  /* The guest-visible queue id is the queue BO handle. */
  QueueResource->QueueId = (uint64_t)queue_bo;

  return rsp->ret;
}
/*
 * Convenience form of vhsaKmtCreateQueueExt() that passes VHSA_SDMA_NONE as
 * the SDMA engine id; all other arguments and the result are forwarded as-is.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtCreateQueue(HSAuint32 NodeId, HSA_QUEUE_TYPE Type,
                                           HSAuint32 QueuePercentage, HSA_QUEUE_PRIORITY Priority,
                                           void* QueueAddress, HSAuint64 QueueSizeInBytes,
                                           HsaEvent* Event, HsaQueueResource* QueueResource) {
  const HSAuint32 sdma_engine_id = VHSA_SDMA_NONE;
  HSAKMT_STATUS status;

  status = vhsaKmtCreateQueueExt(NodeId, Type, QueuePercentage, Priority, sdma_engine_id,
                                 QueueAddress, QueueSizeInBytes, Event, QueueResource);
  return status;
}
/*
 * Destroy a queue created by vhsaKmtCreateQueueExt().
 *
 * QueueId is the guest queue BO handle stored in QueueResource->QueueId. The
 * queue BO is freed first, then its read/write pointer BO if it has one.
 *
 * Returns the result of freeing the queue BO, or
 * HSAKMT_STATUS_INVALID_PARAMETER for a zero QueueId.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtDestroyQueue(HSA_QUEUEID QueueId) {
  CHECK_VIRTIO_KFD_OPEN();
  vhsakmt_device_handle dev = vhsakmt_dev();
  int r;
  /* queue ID: vhsakmt_bo_handle -> real queue ID*/
  vhsakmt_bo_handle bo = (vhsakmt_bo_handle)QueueId;

  if (!bo) return HSAKMT_STATUS_INVALID_PARAMETER;

  /* Copy out everything needed later: bo must not be touched once it has been
   * handed to vhsakmt_bo_free(), which may release it. */
  vhsakmt_bo_handle rw_bo = bo->rw_bo;
  uint32_t res_id = bo->real.res_id;
  uint64_t queue_id = bo->queue_id;

  r = vhsakmt_bo_free(dev, bo);
  if (rw_bo) vhsakmt_bo_free(dev, rw_bo);

  vhsa_debug("%s: queue res id: %d, queue ID: %" PRIu64 ", ret = %d\n", __FUNCTION__, res_id,
             queue_id, r);

  return r;
}
================================================
FILE: libhsakmt/src/virtio/hsakmt_virtio_topology.c
================================================
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "hsakmt/hsakmt_virtio.h"
#include "hsakmt_virtio_device.h"
/*
 * Cache a copy of the system properties on the device.
 *
 * Runs under dev->vhsakmt_mutex. If a copy is already cached nothing is
 * changed. Returns 0 on success (including the already-cached case) or
 * -ENOMEM if the copy could not be allocated.
 */
static int vhsakmt_set_sys_props(vhsakmt_device_handle dev, HsaSystemProperties* sys_props) {
  int status = 0;

  pthread_mutex_lock(&dev->vhsakmt_mutex);

  if (dev->sys_props == NULL) {
    dev->sys_props = calloc(1, sizeof(HsaSystemProperties));
    if (dev->sys_props != NULL) {
      memcpy(dev->sys_props, sys_props, sizeof(HsaSystemProperties));
    } else {
      status = -ENOMEM;
    }
  }

  pthread_mutex_unlock(&dev->vhsakmt_mutex);

  return status;
}
/*
 * Cache a copy of one node's properties on the device.
 *
 * Requires cached system properties and a node index below their NumNodes;
 * otherwise -EINVAL is returned. The per-node array is allocated on first use
 * (NumNodes entries) under dev->vhsakmt_mutex. Returns 0 on success or
 * -ENOMEM if that allocation fails.
 */
static int vhsakmt_set_node_props(vhsakmt_device_handle dev, uint32_t node,
                                  HsaNodeProperties* node_props) {
  int status = 0;

  if (dev->sys_props == NULL || node >= dev->sys_props->NumNodes) {
    return -EINVAL;
  }

  pthread_mutex_lock(&dev->vhsakmt_mutex);

  if (dev->vhsakmt_nodes == NULL) {
    dev->vhsakmt_nodes = calloc(dev->sys_props->NumNodes, sizeof(struct vhsakmt_node));
  }

  if (dev->vhsakmt_nodes != NULL) {
    memcpy(&dev->vhsakmt_nodes[node].node_props, node_props, sizeof(HsaNodeProperties));
  } else {
    status = -ENOMEM;
  }

  pthread_mutex_unlock(&dev->vhsakmt_mutex);

  return status;
}
/*
 * Query the KFD version from the host.
 *
 * v receives the HsaVersionInfo found in the response.
 *
 * Returns the host status (rsp->ret), HSAKMT_STATUS_INVALID_PARAMETER when v
 * is NULL, or HSAKMT_STATUS_NO_MEMORY when no response buffer could be
 * allocated.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetVersion(HsaVersionInfo* v) {
  CHECK_VIRTIO_KFD_OPEN();
  if (!v) return HSAKMT_STATUS_INVALID_PARAMETER;

  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_rsp* rsp;
  struct vhsakmt_ccmd_query_info_req req = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_GET_VER,
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp));
  /* Report allocation failure with an HSAKMT_STATUS value rather than a negated errno. */
  if (!rsp) return HSAKMT_STATUS_NO_MEMORY;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  memcpy(v, &rsp->kfd_version, sizeof(HsaVersionInfo));

  return rsp->ret;
}
/*
 * Fetch the system properties from the host and cache them on the device.
 *
 * SystemProperties receives the properties from the response; a copy is kept
 * on the device via vhsakmt_set_sys_props() (used later to validate node ids).
 *
 * Returns the host status (rsp->ret), HSAKMT_STATUS_INVALID_PARAMETER when
 * SystemProperties is NULL, or HSAKMT_STATUS_NO_MEMORY when the response
 * buffer or the cached copy could not be allocated.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtAcquireSystemProperties(HsaSystemProperties* SystemProperties) {
  CHECK_VIRTIO_KFD_OPEN();
  if (!SystemProperties) return HSAKMT_STATUS_INVALID_PARAMETER;

  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_rsp* rsp;
  struct vhsakmt_ccmd_query_info_req req = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_GET_SYS_PROP,
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp));
  if (!rsp) return HSAKMT_STATUS_NO_MEMORY;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  memcpy(SystemProperties, &rsp->sys_props, sizeof(HsaSystemProperties));

  /* vhsakmt_set_sys_props() only fails when its allocation fails. */
  if (vhsakmt_set_sys_props(dev, SystemProperties)) return HSAKMT_STATUS_NO_MEMORY;

  return rsp->ret;
}
/*
 * Tell the host to release the system properties and drop the device's cached
 * copy.
 *
 * The cached copy is freed under dev->vhsakmt_mutex, the same lock
 * vhsakmt_set_sys_props() holds while installing it.
 *
 * Returns the host status (rsp->ret), or HSAKMT_STATUS_NO_MEMORY when no
 * response buffer could be allocated.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtReleaseSystemProperties(void) {
  CHECK_VIRTIO_KFD_OPEN();
  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_rsp* rsp;
  struct vhsakmt_ccmd_query_info_req req = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_REL_SYS_PROP,
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp));
  if (!rsp) return HSAKMT_STATUS_NO_MEMORY;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);

  pthread_mutex_lock(&dev->vhsakmt_mutex);
  /* free(NULL) is a no-op, so this also covers the nothing-cached case. */
  free(dev->sys_props);
  dev->sys_props = NULL;
  pthread_mutex_unlock(&dev->vhsakmt_mutex);

  return rsp->ret;
}
/*
 * Fetch one node's properties from the host and cache them on the device.
 *
 * NodeId         - node to query.
 * NodeProperties - receives the properties from the response.
 *
 * The copy is cached through vhsakmt_set_node_props(), which requires that
 * system properties were acquired and that NodeId is below their NumNodes.
 *
 * Returns the host status (rsp->ret); HSAKMT_STATUS_INVALID_PARAMETER when
 * NodeProperties is NULL or the node cannot be cached for the reasons above;
 * HSAKMT_STATUS_NO_MEMORY when an allocation fails.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetNodeProperties(HSAuint32 NodeId,
                                                 HsaNodeProperties* NodeProperties) {
  CHECK_VIRTIO_KFD_OPEN();
  if (!NodeProperties) return HSAKMT_STATUS_INVALID_PARAMETER;

  int r;
  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_rsp* rsp;
  struct vhsakmt_ccmd_query_info_req req = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .NodeID = NodeId,
      .type = VHSAKMT_CCMD_QUERY_GET_NODE_PROP,
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp));
  if (!rsp) return HSAKMT_STATUS_NO_MEMORY;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  memcpy(NodeProperties, &rsp->node_props, sizeof(HsaNodeProperties));

  /* Translate the helper's errno-style result into an HSAKMT_STATUS value. */
  r = vhsakmt_set_node_props(dev, NodeId, NodeProperties);
  if (r == -ENOMEM) return HSAKMT_STATUS_NO_MEMORY;
  if (r) return HSAKMT_STATUS_INVALID_PARAMETER;

  return rsp->ret;
}
/*
 * Query the XNACK mode from the host.
 *
 * enable receives the xnack_mode value from the response.
 *
 * Returns the host status (rsp->ret), HSAKMT_STATUS_INVALID_PARAMETER when
 * enable is NULL, or HSAKMT_STATUS_NO_MEMORY when no response buffer could be
 * allocated.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetXNACKMode(HSAint32* enable) {
  CHECK_VIRTIO_KFD_OPEN();
  if (!enable) return HSAKMT_STATUS_INVALID_PARAMETER;

  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_rsp* rsp;
  struct vhsakmt_ccmd_query_info_req req = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_GET_XNACK_MODE,
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp));
  if (!rsp) return HSAKMT_STATUS_NO_MEMORY;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);
  memcpy(enable, &rsp->xnack_mode, sizeof(HSAint32));

  return rsp->ret;
}
/*
 * Send a runtime-enable request to the host.
 *
 * rDebug    - not transmitted by this function.
 *             NOTE(review): the request struct's comment says rDebug is
 *             passed via the payload, but no payload is attached here;
 *             confirm whether that is intended.
 * setupTtmp - forwarded in the request arguments.
 *
 * Returns the host status (rsp->ret), or HSAKMT_STATUS_NO_MEMORY when no
 * response buffer could be allocated.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtRuntimeEnable(void* rDebug, bool setupTtmp) {
  CHECK_VIRTIO_KFD_OPEN();
  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_rsp* rsp;
  struct vhsakmt_ccmd_query_info_req req = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .run_time_enable_args.setupTtmp = setupTtmp,
      .type = VHSAKMT_CCMD_QUERY_RUN_TIME_ENABLE,
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp));
  if (!rsp) return HSAKMT_STATUS_NO_MEMORY;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);

  return rsp->ret;
}
/*
 * Send a runtime-disable request to the host.
 *
 * Returns the host status (rsp->ret), or HSAKMT_STATUS_NO_MEMORY when no
 * response buffer could be allocated.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtRuntimeDisable(void) {
  CHECK_VIRTIO_KFD_OPEN();
  vhsakmt_device_handle dev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_rsp* rsp;
  struct vhsakmt_ccmd_query_info_req req = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_RUN_TIME_DISABLE,
  };

  rsp = vhsakmt_alloc_rsp(dev, &req.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp));
  if (!rsp) return HSAKMT_STATUS_NO_MEMORY;

  vhsakmt_execbuf_cpu(dev, &req.hdr, __FUNCTION__);

  return rsp->ret;
}
/*
 * Fetch NumBanks memory-bank descriptors for NodeId.
 *
 * The response is sized to hold the fixed header plus NumBanks entries; the
 * entries are copied from the response payload into MemoryProperties.
 * Returns -ENOMEM when no response slot could be allocated, otherwise the
 * `ret` field of the response.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetNodeMemoryProperties(HSAuint32 NodeId, HSAuint32 NumBanks,
                                                       HsaMemoryProperties* MemoryProperties) {
  CHECK_VIRTIO_KFD_OPEN();

  const size_t banks_bytes = NumBanks * sizeof(HsaMemoryProperties);
  vhsakmt_device_handle vdev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_req query = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_GET_NOD_MEM_PROP,
      .node_mem_prop_args.NodeId = NodeId,
      .node_mem_prop_args.NumBanks = NumBanks,
  };
  struct vhsakmt_ccmd_query_info_rsp* reply =
      vhsakmt_alloc_rsp(vdev, &query.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp) + banks_bytes);

  if (reply == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(vdev, &query.hdr, __FUNCTION__);

  /* The payload is copied out regardless of the status carried in the reply. */
  memcpy(MemoryProperties, reply->payload, banks_bytes);
  return reply->ret;
}
/*
 * Fetch NumCaches cache descriptors for (NodeId, ProcessorId).
 *
 * The response is sized to hold the fixed header plus NumCaches entries; the
 * entries are copied from the response payload into CacheProperties.
 * Returns -ENOMEM when no response slot could be allocated, otherwise the
 * `ret` field of the response.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetNodeCacheProperties(HSAuint32 NodeId, HSAuint32 ProcessorId,
                                                      HSAuint32 NumCaches,
                                                      HsaCacheProperties* CacheProperties) {
  CHECK_VIRTIO_KFD_OPEN();

  const size_t caches_bytes = NumCaches * sizeof(HsaCacheProperties);
  vhsakmt_device_handle vdev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_req query = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_GET_NOD_CACHE_PROP,
      .node_cache_prop_args.NodeId = NodeId,
      .node_cache_prop_args.ProcessorId = ProcessorId,
      .node_cache_prop_args.NumCaches = NumCaches,
  };
  struct vhsakmt_ccmd_query_info_rsp* reply = vhsakmt_alloc_rsp(
      vdev, &query.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp) + caches_bytes);

  if (reply == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(vdev, &query.hdr, __FUNCTION__);

  /* The payload is copied out regardless of the status carried in the reply. */
  memcpy(CacheProperties, reply->payload, caches_bytes);
  return reply->ret;
}
/*
 * Fetch NumIoLinks IO-link descriptors for NodeId.
 *
 * The response is sized to hold the fixed header plus NumIoLinks entries; the
 * entries are copied from the response payload into IoLinkProperties.
 * Returns -ENOMEM when no response slot could be allocated, otherwise the
 * `ret` field of the response.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetNodeIoLinkProperties(HSAuint32 NodeId, HSAuint32 NumIoLinks,
                                                       HsaIoLinkProperties* IoLinkProperties) {
  CHECK_VIRTIO_KFD_OPEN();

  const size_t links_bytes = NumIoLinks * sizeof(HsaIoLinkProperties);
  vhsakmt_device_handle vdev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_req query = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_GET_NOD_IO_LINK_PROP,
      .node_io_link_args.NodeId = NodeId,
      .node_io_link_args.NumIoLinks = NumIoLinks,
  };
  struct vhsakmt_ccmd_query_info_rsp* reply =
      vhsakmt_alloc_rsp(vdev, &query.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp) + links_bytes);

  if (reply == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(vdev, &query.hdr, __FUNCTION__);

  /* The payload is copied out regardless of the status carried in the reply. */
  memcpy(IoLinkProperties, reply->payload, links_bytes);
  return reply->ret;
}
/*
 * Fetch the clock counters of NodeId and copy them into *Counters.
 *
 * Returns -ENOMEM when no response slot could be allocated, otherwise the
 * `ret` field of the response.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetClockCounters(HSAuint32 NodeId, HsaClockCounters* Counters) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle vdev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_req query = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_GET_CLOCK_COUNTERS,
      .NodeID = NodeId,
  };
  struct vhsakmt_ccmd_query_info_rsp* reply =
      vhsakmt_alloc_rsp(vdev, &query.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp));

  if (reply == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(vdev, &query.hdr, __FUNCTION__);

  /* The counters are copied out regardless of the status carried in the reply. */
  memcpy(Counters, &reply->clock_counters, sizeof(*Counters));
  return reply->ret;
}
/*
 * Fetch the runtime capability mask and store it in *caps_mask.
 *
 * Returns -ENOMEM when no response slot could be allocated, otherwise the
 * `ret` field of the response.
 */
HSAKMT_STATUS HSAKMTAPI vhsaKmtGetRuntimeCapabilities(HSAuint32* caps_mask) {
  CHECK_VIRTIO_KFD_OPEN();

  vhsakmt_device_handle vdev = vhsakmt_dev();
  struct vhsakmt_ccmd_query_info_req query = {
      .hdr = VHSAKMT_CCMD(QUERY_INFO, sizeof(struct vhsakmt_ccmd_query_info_req)),
      .type = VHSAKMT_CCMD_QUERY_GET_RUNTIME_CAPS,
  };
  struct vhsakmt_ccmd_query_info_rsp* reply =
      vhsakmt_alloc_rsp(vdev, &query.hdr, sizeof(struct vhsakmt_ccmd_query_info_rsp));

  if (reply == NULL) return -ENOMEM;

  vhsakmt_execbuf_cpu(vdev, &query.hdr, __FUNCTION__);

  /* The mask is stored regardless of the status carried in the reply. */
  *caps_mask = reply->caps;
  return reply->ret;
}
================================================
FILE: libhsakmt/src/virtio/hsakmt_virtio_vm.c
================================================
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include
#include
#include "hsakmt_virtio_device.h"
/*
 * Return the address at which a probe allocation of one page was placed, or
 * NULL when that allocation fails.
 *
 * The probe is released before returning, so the result is only an address
 * value and must not be dereferenced.
 */
void* vhsakmt_vm_start(void) {
  void* probe = malloc((size_t)getpagesize());
  if (!probe) return NULL;

  /*
   * Capture the address as an integer while the allocation is still live: a
   * pointer value becomes indeterminate once its object has been freed.
   */
  const uintptr_t probe_addr = (uintptr_t)probe;
  free(probe);

  return (void*)probe_addr;
}
/*
 * Reserve [start, start + size) with an inaccessible, private, anonymous
 * mapping placed at exactly `start` (MAP_FIXED), and mark it MADV_DONTFORK.
 *
 * Returns 0 on success and -ENOMEM when the mapping fails or lands elsewhere.
 * The madvise() result is not checked.
 */
int vhsakmt_reserve_va(uint64_t start, uint64_t size) {
  void* const wanted = (void*)start;
  void* const mapped =
      mmap(wanted, size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

  if (mapped == MAP_FAILED || mapped != wanted) return -ENOMEM;

  madvise(mapped, size, MADV_DONTFORK);
  return 0;
}
void vhsakmt_dereserve_va(uint64_t start, uint64_t size) { munmap((void*)start, size); }
/*
 * Record the scratch range of `node` once.
 *
 * Does nothing when the node table or system properties are missing, when
 * `node` is out of range, or when the node already has a non-zero scratch
 * start and size. The update itself runs under dev->vhsakmt_mutex.
 */
void vhsakmt_set_scratch_area(vhsakmt_device_handle dev, uint32_t node, uint64_t start,
                              uint64_t size) {
  if (dev->vhsakmt_nodes == NULL || dev->sys_props == NULL) return;
  if (node >= dev->sys_props->NumNodes) return;

  pthread_mutex_lock(&dev->vhsakmt_mutex);

  /* First writer wins: an already populated entry is left untouched. */
  if (!(dev->vhsakmt_nodes[node].scratch_start && dev->vhsakmt_nodes[node].scratch_size)) {
    dev->vhsakmt_nodes[node].scratch_start = start;
    dev->vhsakmt_nodes[node].scratch_size = size;
  }

  pthread_mutex_unlock(&dev->vhsakmt_mutex);
}
/*
 * Report whether `addr` lies inside the recorded scratch range of any node.
 *
 * Returns false when the node table or system properties are missing.
 * NOTE(review): the upper bound is inclusive (addr == start + size matches),
 * and a node whose range is still 0/0 matches a NULL addr - confirm both are
 * intended.
 */
bool vhsakmt_is_scratch_mem(vhsakmt_device_handle dev, void* addr) {
  uint32_t node;

  if (!dev->vhsakmt_nodes || !dev->sys_props) return false;

  node = 0;
  while (node < dev->sys_props->NumNodes) {
    /* Below the start of this node's range: not a match. */
    const bool below = (uint64_t)addr < dev->vhsakmt_nodes[node].scratch_start;

    if (!below && (uint64_t)addr <= dev->vhsakmt_nodes[node].scratch_start +
                                        dev->vhsakmt_nodes[node].scratch_size)
      return true;

    node++;
  }

  return false;
}
/*
 * Record the VM area [start, size] on the device once.
 *
 * Under dev->vhsakmt_mutex, the values are stored only if the device does not
 * already hold a non-zero vm_start and vm_size.
 */
void vhsakmt_set_vm_area(vhsakmt_device_handle dev, uint64_t start, uint64_t size) {
  pthread_mutex_lock(&dev->vhsakmt_mutex);

  /* First writer wins: an already populated area is left untouched. */
  if (!(dev->vm_start && dev->vm_size)) {
    dev->vm_start = start;
    dev->vm_size = size;
  }

  pthread_mutex_unlock(&dev->vhsakmt_mutex);
}
/*
 * An address counts as a user pointer when it falls outside the recorded VM
 * area, i.e. below vm_start or above vm_start + vm_size.
 *
 * NOTE(review): vm_start + vm_size itself is treated as inside the area -
 * confirm the inclusive upper bound is intended.
 */
bool vhsakmt_is_userptr(vhsakmt_device_handle dev, void* addr) {
  if ((uint64_t)addr < dev->vm_start) return true;

  return (uint64_t)addr > dev->vm_start + dev->vm_size;
}
/*
 * Store `doorbell` as the doorbell base of `node`, under dev->vhsakmt_mutex.
 *
 * Returns 0 on success, or -EINVAL when the node table or system properties
 * are missing or `node` is not below NumNodes.
 */
int vhsakmt_set_node_doorbell(vhsakmt_device_handle dev, uint32_t node, void* doorbell) {
  if (dev->vhsakmt_nodes == NULL || dev->sys_props == NULL ||
      node >= dev->sys_props->NumNodes)
    return -EINVAL;

  pthread_mutex_lock(&dev->vhsakmt_mutex);
  dev->vhsakmt_nodes[node].doorbell_base = doorbell;
  pthread_mutex_unlock(&dev->vhsakmt_mutex);

  return 0;
}
/*
 * Return the doorbell base stored for `node`, or NULL when the node table or
 * system properties are missing or `node` is not below NumNodes.
 * The value is read without taking dev->vhsakmt_mutex.
 */
void* vhsakmt_node_doorbell(vhsakmt_device_handle dev, uint32_t node) {
  if (dev->vhsakmt_nodes == NULL || dev->sys_props == NULL ||
      node >= dev->sys_props->NumNodes)
    return NULL;

  return dev->vhsakmt_nodes[node].doorbell_base;
}
================================================
FILE: libhsakmt/src/virtio/include/linux/virtgpu_drm.h
================================================
/*
* Copyright 2013 Red Hat
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef VIRTGPU_DRM_H
#define VIRTGPU_DRM_H
#include "drm.h"
#if defined(__cplusplus)
extern "C" {
#endif
/* Please note that modifications to all structs defined here are
* subject to backwards-compatibility constraints.
*
* Do not use pointers; use __u64 instead for 32 bit / 64 bit user/kernel
* compatibility. Keep fields aligned to their size.
*/
/*
 * Driver-specific ioctl numbers. The DRM_IOCTL_VIRTGPU_* macros at the end of
 * this header add each of them to DRM_COMMAND_BASE.
 */
#define DRM_VIRTGPU_MAP 0x01
#define DRM_VIRTGPU_EXECBUFFER 0x02
#define DRM_VIRTGPU_GETPARAM 0x03
#define DRM_VIRTGPU_RESOURCE_CREATE 0x04
#define DRM_VIRTGPU_RESOURCE_INFO 0x05
#define DRM_VIRTGPU_TRANSFER_FROM_HOST 0x06
#define DRM_VIRTGPU_TRANSFER_TO_HOST 0x07
#define DRM_VIRTGPU_WAIT 0x08
#define DRM_VIRTGPU_GET_CAPS 0x09
#define DRM_VIRTGPU_RESOURCE_CREATE_BLOB 0x0a
#define DRM_VIRTGPU_CONTEXT_INIT 0x0b
/*
 * Bits for drm_virtgpu_execbuffer.flags. VIRTGPU_EXECBUF_FLAGS is the OR of
 * every bit defined here.
 */
#define VIRTGPU_EXECBUF_FENCE_FD_IN 0x01
#define VIRTGPU_EXECBUF_FENCE_FD_OUT 0x02
#define VIRTGPU_EXECBUF_RING_IDX 0x04
#define VIRTGPU_EXECBUF_FLAGS (\
VIRTGPU_EXECBUF_FENCE_FD_IN |\
VIRTGPU_EXECBUF_FENCE_FD_OUT |\
VIRTGPU_EXECBUF_RING_IDX |\
0)
/*
 * Argument of DRM_IOCTL_VIRTGPU_MAP: the caller supplies a buffer `handle`
 * and passes the resulting `offset` to mmap() on the DRM fd.
 */
struct drm_virtgpu_map {
__u64 offset; /* use for mmap system call */
__u32 handle;
__u32 pad;
};
/*
 * Bits for drm_virtgpu_execbuffer_syncobj.flags; VIRTGPU_EXECBUF_SYNCOBJ_FLAGS
 * is the OR of every bit defined here.
 */
#define VIRTGPU_EXECBUF_SYNCOBJ_RESET 0x01
#define VIRTGPU_EXECBUF_SYNCOBJ_FLAGS ( \
VIRTGPU_EXECBUF_SYNCOBJ_RESET | \
0)
/*
 * One element of the arrays referenced by drm_virtgpu_execbuffer.in_syncobjs
 * and .out_syncobjs; its size is what .syncobj_stride describes.
 */
struct drm_virtgpu_execbuffer_syncobj {
__u32 handle;
__u32 flags;
__u64 point;
};
/*
 * Argument of DRM_IOCTL_VIRTGPU_EXECBUFFER: a command buffer (`command`,
 * `size` bytes) plus optional buffer handles, fence fd and syncobj arrays.
 * User pointers are carried as __u64 values.
 */
/* fence_fd is modified on success if VIRTGPU_EXECBUF_FENCE_FD_OUT flag is set. */
struct drm_virtgpu_execbuffer {
__u32 flags;
__u32 size;
__u64 command; /* void* */
__u64 bo_handles;
__u32 num_bo_handles;
__s32 fence_fd; /* in/out fence fd (see VIRTGPU_EXECBUF_FENCE_FD_IN/OUT) */
__u32 ring_idx; /* command ring index (see VIRTGPU_EXECBUF_RING_IDX) */
__u32 syncobj_stride; /* size of @drm_virtgpu_execbuffer_syncobj */
__u32 num_in_syncobjs;
__u32 num_out_syncobjs;
__u64 in_syncobjs;
__u64 out_syncobjs;
};
/* Parameter ids for drm_virtgpu_getparam.param. */
#define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */
#define VIRTGPU_PARAM_CAPSET_QUERY_FIX 2 /* do we have the capset fix */
#define VIRTGPU_PARAM_RESOURCE_BLOB 3 /* DRM_VIRTGPU_RESOURCE_CREATE_BLOB */
#define VIRTGPU_PARAM_HOST_VISIBLE 4 /* Host blob resources are mappable */
#define VIRTGPU_PARAM_CROSS_DEVICE 5 /* Cross virtio-device resource sharing */
#define VIRTGPU_PARAM_CONTEXT_INIT 6 /* DRM_VIRTGPU_CONTEXT_INIT */
#define VIRTGPU_PARAM_SUPPORTED_CAPSET_IDs 7 /* Bitmask of supported capability set ids */
#define VIRTGPU_PARAM_EXPLICIT_DEBUG_NAME 8 /* Ability to set debug name from userspace */
/* Argument of DRM_IOCTL_VIRTGPU_GETPARAM: one parameter id and its value slot. */
struct drm_virtgpu_getparam {
__u64 param;
__u64 value;
};
/* Argument of DRM_IOCTL_VIRTGPU_RESOURCE_CREATE. */
/* NO_BO flags? NO resource flag? */
/* resource flag for y_0_top */
struct drm_virtgpu_resource_create {
__u32 target;
__u32 format;
__u32 bind;
__u32 width;
__u32 height;
__u32 depth;
__u32 array_size;
__u32 last_level;
__u32 nr_samples;
__u32 flags;
__u32 bo_handle; /* if this is set - recreate a new resource attached to this bo ? */
__u32 res_handle; /* returned by kernel */
__u32 size; /* validate transfer in the host */
__u32 stride; /* validate transfer in the host */
};
/*
 * Argument of DRM_IOCTL_VIRTGPU_RESOURCE_INFO: the caller fills `bo_handle`
 * and reads back `res_handle` (see virtio_gpu_res_id()).
 */
struct drm_virtgpu_resource_info {
__u32 bo_handle;
__u32 res_handle;
__u32 size;
__u32 blob_mem;
};
/*
 * Box embedded in the transfer-to-host / transfer-from-host arguments.
 * NOTE(review): presumably x/y/z are the origin and w/h/d the extent - confirm
 * against the kernel documentation.
 */
struct drm_virtgpu_3d_box {
__u32 x;
__u32 y;
__u32 z;
__u32 w;
__u32 h;
__u32 d;
};
/* Argument of DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST. */
struct drm_virtgpu_3d_transfer_to_host {
__u32 bo_handle;
struct drm_virtgpu_3d_box box;
__u32 level;
__u32 offset;
__u32 stride;
__u32 layer_stride;
};
/* Argument of DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST; same layout as the to-host variant. */
struct drm_virtgpu_3d_transfer_from_host {
__u32 bo_handle;
struct drm_virtgpu_3d_box box;
__u32 level;
__u32 offset;
__u32 stride;
__u32 layer_stride;
};
/* Flag value for drm_virtgpu_3d_wait.flags. */
#define VIRTGPU_WAIT_NOWAIT 1 /* like it */
/* Argument of DRM_IOCTL_VIRTGPU_WAIT. */
struct drm_virtgpu_3d_wait {
__u32 handle; /* 0 is an invalid handle */
__u32 flags;
};
/* Capability-set ids defined by this header. */
#define VIRTGPU_DRM_CAPSET_VIRGL 1
#define VIRTGPU_DRM_CAPSET_VIRGL2 2
#define VIRTGPU_DRM_CAPSET_GFXSTREAM_VULKAN 3
#define VIRTGPU_DRM_CAPSET_VENUS 4
#define VIRTGPU_DRM_CAPSET_CROSS_DOMAIN 5
#define VIRTGPU_DRM_CAPSET_DRM 6
/*
 * Argument of DRM_IOCTL_VIRTGPU_GET_CAPS: `addr`/`size` describe the user
 * buffer that receives the capset identified by `cap_set_id`/`cap_set_ver`
 * (see virtio_gpu_get_capset()).
 */
struct drm_virtgpu_get_caps {
__u32 cap_set_id;
__u32 cap_set_ver;
__u64 addr;
__u32 size;
__u32 pad;
};
/*
 * Argument of DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB. The VIRTGPU_BLOB_MEM_*
 * values go in `blob_mem` and the VIRTGPU_BLOB_FLAG_* bits in `blob_flags`;
 * callers in this tree read `bo_handle` back after the ioctl.
 */
struct drm_virtgpu_resource_create_blob {
#define VIRTGPU_BLOB_MEM_GUEST 0x0001
#define VIRTGPU_BLOB_MEM_HOST3D 0x0002
#define VIRTGPU_BLOB_MEM_HOST3D_GUEST 0x0003
#define VIRTGPU_BLOB_FLAG_USE_MAPPABLE 0x0001
#define VIRTGPU_BLOB_FLAG_USE_SHAREABLE 0x0002
#define VIRTGPU_BLOB_FLAG_USE_CROSS_DEVICE 0x0004
#define VIRTGPU_BLOB_FLAG_USE_USERPTR 0x0008
/* zero is invalid blob_mem */
__u32 blob_mem;
__u32 blob_flags;
__u32 bo_handle;
__u32 res_handle;
__u64 size;
/*
* for 3D contexts with VIRTGPU_BLOB_MEM_HOST3D_GUEST and
* VIRTGPU_BLOB_MEM_HOST3D otherwise, must be zero.
*/
__u32 pad;
__u32 cmd_size;
__u64 cmd;
__u64 blob_id;
__u64 blob_userptr;
__s64 offset;
};
/* Parameter ids for drm_virtgpu_context_set_param.param. */
#define VIRTGPU_CONTEXT_PARAM_CAPSET_ID 0x0001
#define VIRTGPU_CONTEXT_PARAM_NUM_RINGS 0x0002
#define VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK 0x0003
#define VIRTGPU_CONTEXT_PARAM_DEBUG_NAME 0x0004
/* One (param, value) pair of the array referenced by drm_virtgpu_context_init. */
struct drm_virtgpu_context_set_param {
__u64 param;
__u64 value;
};
/*
 * Argument of DRM_IOCTL_VIRTGPU_CONTEXT_INIT: `num_params` entries located at
 * the user address stored in `ctx_set_params`.
 */
struct drm_virtgpu_context_init {
__u32 num_params;
__u32 pad;
/* pointer to drm_virtgpu_context_set_param array */
__u64 ctx_set_params;
};
/*
* Event code that's given when VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK is in
* effect. The event size is sizeof(drm_event), since there is no additional
* payload.
*/
#define VIRTGPU_EVENT_FENCE_SIGNALED 0x90000000
/*
 * Full ioctl request codes: each pairs a DRM_VIRTGPU_* number (offset by
 * DRM_COMMAND_BASE) with its argument struct through DRM_IOWR.
 */
#define DRM_IOCTL_VIRTGPU_MAP \
DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map)
#define DRM_IOCTL_VIRTGPU_EXECBUFFER \
DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER,\
struct drm_virtgpu_execbuffer)
#define DRM_IOCTL_VIRTGPU_GETPARAM \
DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GETPARAM,\
struct drm_virtgpu_getparam)
#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE \
DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE, \
struct drm_virtgpu_resource_create)
#define DRM_IOCTL_VIRTGPU_RESOURCE_INFO \
DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_INFO, \
struct drm_virtgpu_resource_info)
#define DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST \
DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_FROM_HOST, \
struct drm_virtgpu_3d_transfer_from_host)
#define DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST \
DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_TO_HOST, \
struct drm_virtgpu_3d_transfer_to_host)
#define DRM_IOCTL_VIRTGPU_WAIT \
DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_WAIT, \
struct drm_virtgpu_3d_wait)
#define DRM_IOCTL_VIRTGPU_GET_CAPS \
DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GET_CAPS, \
struct drm_virtgpu_get_caps)
#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB \
DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE_BLOB, \
struct drm_virtgpu_resource_create_blob)
#define DRM_IOCTL_VIRTGPU_CONTEXT_INIT \
DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_CONTEXT_INIT, \
struct drm_virtgpu_context_init)
#if defined(__cplusplus)
}
#endif
#endif
================================================
FILE: libhsakmt/src/virtio/libhsakmt_virtio.ver
================================================
/*
 * Linker version script for the virtio backend: the symbols listed under
 * `global` are exported, everything else is made local by the `*` wildcard.
 */
{
global:
vhsaKmtOpenKFD;
vhsaKmtCloseKFD;
vhsaKmtAllocMemory;
vhsaKmtFreeMemory;
vhsaKmtMapMemoryToGPUNodes;
vhsaKmtUnmapMemoryToGPU;
vhsaKmtAvailableMemory;
vhsaKmtMapMemoryToGPU;
vhsaKmtRegisterMemoryWithFlags;
vhsaKmtDeregisterMemory;
vhsaKmtGetVersion;
vhsaKmtAcquireSystemProperties;
vhsaKmtReleaseSystemProperties;
vhsaKmtGetNodeProperties;
vhsaKmtGetXNACKMode;
vhsaKmtRuntimeEnable;
vhsaKmtRuntimeDisable;
vhsaKmtGetNodeMemoryProperties;
vhsaKmtGetNodeCacheProperties;
vhsaKmtGetNodeIoLinkProperties;
vhsaKmtGetClockCounters;
vhsaKmtGetAMDGPUDeviceHandle;
vhsaKmtQueryPointerInfo;
vhsaKmtGetTileConfig;
vhsaKmtCreateEvent;
vhsaKmtDestroyEvent;
vhsaKmtSetEvent;
vhsaKmtResetEvent;
vhsaKmtQueryEventState;
vhsaKmtWaitOnMultipleEvents;
vhsaKmtWaitOnEvent;
vhsaKmtWaitOnEvent_Ext;
vhsaKmtWaitOnMultipleEvents_Ext;
vhsaKmtSetTrapHandler;
vhsaKmtCreateQueueExt;
vhsaKmtCreateQueue;
vhsaKmtDestroyQueue;
vhsaKmtRegisterGraphicsHandleToNodes;
vhsaKmtGetRuntimeCapabilities;
vamdgpu_query_gpu_info;
local: *;
};
================================================
FILE: libhsakmt/src/virtio/virtio_gpu.c
================================================
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include
#include
#include
#include
#include
#include
#include
#include "virtio_gpu.h"
/*
 * Size in bytes used for both the host-shared blob and the request staging
 * buffer allocated in virtio_gpu_init().
 */
#define SHMEM_SZ (25 * 0x1000)
static int set_context(int fd) {
struct drm_virtgpu_context_set_param params[] = {
{VIRTGPU_CONTEXT_PARAM_CAPSET_ID, VIRGL_RENDERER_CAPSET_HSAKMT},
{VIRTGPU_CONTEXT_PARAM_NUM_RINGS, 64},
};
struct drm_virtgpu_context_init args = {
.num_params = ARRAY_SIZE(params),
.ctx_set_params = (uintptr_t)(params),
};
return virtio_gpu_ioctl(fd, VIRTGPU_CONTEXT_INIT, &args);
}
int virtio_gpu_map_handle(struct virtio_gpu_device* vgdev, uint32_t handle, uint64_t size,
void** addr, void* fixed_map) {
struct drm_virtgpu_map args = {
.handle = handle,
};
int r;
r = virtio_gpu_ioctl(vgdev->fd, VIRTGPU_MAP, &args);
if (r) return r;
*addr = mmap(fixed_map, size, PROT_READ | PROT_WRITE, MAP_SHARED | (fixed_map ? MAP_FIXED : 0),
vgdev->fd, args.offset);
if (*addr == MAP_FAILED) return -EINVAL;
return 0;
}
void virtio_gpu_unmap(void* addr, uint64_t size) { munmap(addr, size); }
static void virtio_gpu_bo_close(struct virtio_gpu_device* vgdev, uint32_t handle) {
struct drm_gem_close args = {
.handle = handle,
};
virtio_gpu_ioctl(vgdev->fd, GEM_CLOSE, &args);
}
static int virtio_gpu_shmem_init(struct virtio_gpu_device* vgdev, size_t size) {
struct drm_virtgpu_resource_create_blob args = {
.blob_mem = VIRTGPU_BLOB_MEM_HOST3D,
.blob_flags = VIRTGPU_BLOB_FLAG_USE_MAPPABLE,
.size = size,
.blob_id = 0,
};
int r = virtio_gpu_ioctl(vgdev->fd, VIRTGPU_RESOURCE_CREATE_BLOB, &args);
if (r) return r;
r = virtio_gpu_map_handle(vgdev, args.bo_handle, size, (void**)&vgdev->shmem, NULL);
if (r) {
virtio_gpu_bo_close(vgdev, args.bo_handle);
return r;
}
vgdev->shmem_handle = args.bo_handle;
uint32_t offset = vgdev->shmem->base.rsp_mem_offset;
vgdev->rsp_mem_len = size - offset;
vgdev->rsp_mem = &((uint8_t*)vgdev->shmem)[offset];
return 0;
}
/*
 * Create a device object for an already opened DRM fd.
 *
 * Steps: initialise the context on `fd`, allocate the device structure and a
 * SHMEM_SZ-byte request staging buffer, set up the shared-memory blob, then
 * initialise both mutexes.
 *
 * context_id is currently unused.
 * Returns the new device, or NULL when any step fails; `fd` is not closed on
 * failure.
 */
struct virtio_gpu_device* virtio_gpu_init(int fd, uint32_t context_id) {
  struct virtio_gpu_device* vgdev;
  int r;

  (void)context_id;

  r = set_context(fd);
  if (r) return NULL;

  vgdev = calloc(1, sizeof(*vgdev));
  if (!vgdev) return NULL;

  vgdev->fd = fd;
  vgdev->reqbuf = calloc(1, SHMEM_SZ);
  if (!vgdev->reqbuf) {
    free(vgdev);
    return NULL;
  }

  r = virtio_gpu_shmem_init(vgdev, SHMEM_SZ);
  if (r) {
    /* Release the staging buffer too; freeing only vgdev would leak it. */
    free(vgdev->reqbuf);
    free(vgdev);
    return NULL;
  }

  pthread_mutex_init(&vgdev->rsp_lock, NULL);
  pthread_mutex_init(&vgdev->eb_lock, NULL);

  return vgdev;
}
/*
 * Tear down a device created by virtio_gpu_init(): unmap and close the shared
 * blob, destroy both mutexes, close the DRM fd and free the staging buffer and
 * the device structure itself.
 */
void virtio_gpu_close(struct virtio_gpu_device* vgdev) {
  /* Shared-memory blob: mapping first, then its GEM handle. */
  virtio_gpu_unmap(vgdev->shmem, SHMEM_SZ);
  virtio_gpu_bo_close(vgdev, vgdev->shmem_handle);

  pthread_mutex_destroy(&vgdev->rsp_lock);
  pthread_mutex_destroy(&vgdev->eb_lock);

  /* The fd is closed only after the last ioctl issued on it above. */
  close(vgdev->fd);

  free(vgdev->reqbuf);
  free(vgdev);
}
void* virtio_gpu_alloc_rsp(struct virtio_gpu_device* vgdev, struct virtio_gpu_ccmd_req* req,
uint32_t size) {
uint32_t off;
pthread_mutex_lock(&vgdev->rsp_lock);
size = VHSA_ALIGN_UP(size, 8);
if ((vgdev->next_rsp_off + size) >= vgdev->rsp_mem_len) vgdev->next_rsp_off = 0;
off = vgdev->next_rsp_off;
vgdev->next_rsp_off += size;
pthread_mutex_unlock(&vgdev->rsp_lock);
req->rsp_off = off;
struct virtio_gpu_ccmd_rsp* rsp = (void*)&vgdev->rsp_mem[off];
rsp->len = size;
return rsp;
}
/*
 * Submit one command buffer through the EXECBUFFER ioctl.
 *
 * VIRTGPU_EXECBUF_RING_IDX is always set; the fence-in / fence-out flags
 * follow `in_fence` / `out_fence`. `fence_fd` may be NULL: in that case -1 is
 * passed as the fence fd and no fence is written back.
 *
 * Returns 0 on success or the ioctl error. On success with `out_fence` set and
 * a non-NULL `fence_fd`, *fence_fd receives the fence fd reported by the
 * ioctl.
 */
static int virtio_gpu_execbuffer_locked(struct virtio_gpu_device* vgdev, void* cmd,
                                        uint32_t cmd_size, uint32_t* handles, uint32_t num_handles,
                                        int* fence_fd, int ring_idx, uint32_t num_in_syncobjs,
                                        uint32_t num_out_syncobjs,
                                        struct drm_virtgpu_execbuffer_syncobj* in_syncobjs,
                                        struct drm_virtgpu_execbuffer_syncobj* out_syncobjs,
                                        bool in_fence, bool out_fence) {
  struct drm_virtgpu_execbuffer eb = {
      .flags = (out_fence ? VIRTGPU_EXECBUF_FENCE_FD_OUT : 0) |
               (in_fence ? VIRTGPU_EXECBUF_FENCE_FD_IN : 0) | VIRTGPU_EXECBUF_RING_IDX,
      .size = cmd_size,
      .command = (uintptr_t)cmd,
      .bo_handles = (uintptr_t)handles,
      .num_bo_handles = num_handles,
      /* virtio_gpu_flush_locked() forwards a NULL fence pointer; never dereference it. */
      .fence_fd = fence_fd ? *fence_fd : -1,
      .ring_idx = ring_idx,
      .syncobj_stride = sizeof(struct drm_virtgpu_execbuffer_syncobj),
      .num_in_syncobjs = num_in_syncobjs,
      .num_out_syncobjs = num_out_syncobjs,
      .in_syncobjs = (uintptr_t)in_syncobjs,
      .out_syncobjs = (uintptr_t)out_syncobjs,
  };

  int r = virtio_gpu_ioctl(vgdev->fd, VIRTGPU_EXECBUFFER, &eb);
  if (r) return r;

  if (out_fence && fence_fd) *fence_fd = eb.fence_fd;

  return 0;
}
/*
 * Submit everything queued in the staging buffer as one execbuffer on ring 0.
 *
 * Nothing is submitted when the buffer is empty. An out-fence is requested
 * exactly when `fence` is non-NULL; the pointer is forwarded unchanged.
 * On success the queued byte and request counters are reset.
 * Returns 0 or the submit error (the queue is left untouched on error).
 */
static int virtio_gpu_flush_locked(struct virtio_gpu_device* vgdev, int* fence) {
  const bool want_out_fence = (fence != NULL);
  int err;

  if (vgdev->reqbuf_len == 0) return 0;

  err = virtio_gpu_execbuffer_locked(vgdev, vgdev->reqbuf, vgdev->reqbuf_len, NULL, 0, fence, 0,
                                     0, 0, NULL, NULL, false, want_out_fence);
  if (err != 0) return err;

  vgdev->reqbuf_cnt = 0;
  vgdev->reqbuf_len = 0;

  return 0;
}
static int virtio_gpu_add_cmd(struct virtio_gpu_device* vgdev, struct virtio_gpu_ccmd_req* req) {
req->seqno = ++vgdev->next_seqno;
int r;
if (vgdev->reqbuf_len + req->len > sizeof(vgdev->reqbuf)) {
r = virtio_gpu_flush_locked(vgdev, NULL);
if (r) return r;
}
memcpy(&vgdev->reqbuf[vgdev->reqbuf_len], req, req->len);
vgdev->reqbuf_len += req->len;
vgdev->reqbuf_cnt++;
return 0;
}
/*
 * Wrap-around aware "a is older than b" for 32-bit sequence numbers: the
 * unsigned difference is reinterpreted as signed and tested for being negative.
 */
static inline bool fence_before(uint32_t a, uint32_t b) {
  const int32_t delta = (int32_t)(a - b);

  return delta < 0;
}
/*
 * Spin, yielding the CPU between polls, until the sequence number published in
 * shared memory is no longer before the one assigned to `req`.
 */
static void virtio_gpu_seqno_sync(struct virtio_gpu_device* vgdev,
                                  struct virtio_gpu_ccmd_req* req) {
  for (;;) {
    if (!fence_before(vgdev->shmem->base.seqno, req->seqno)) break;
    sched_yield();
  }
}
/*
 * Queue `req` and, when `sync` is set, submit the queue and wait for it.
 *
 * Queuing and flushing happen under eb_lock. For a synchronous call the
 * function then waits on the out-fence of the submit, closes it, and polls the
 * shared sequence number until `req` is reached.
 * Returns 0 on success or the error from queuing / flushing.
 */
int virtio_gpu_exec_cmd(struct virtio_gpu_device* vgdev, struct virtio_gpu_ccmd_req* req,
                        bool sync) {
  int r;
  /* Start from an invalid fd so the value is never read or closed while still garbage. */
  int fence = -1;

  pthread_mutex_lock(&vgdev->eb_lock);
  r = virtio_gpu_add_cmd(vgdev, req);
  if (!r && sync) r = virtio_gpu_flush_locked(vgdev, &fence);
  pthread_mutex_unlock(&vgdev->eb_lock);

  if (r || !sync) return r;

  /* Only wait on and close a fence that the submit actually produced. */
  if (fence >= 0) {
    sync_wait(fence, -1);
    close(fence);
  }
  virtio_gpu_seqno_sync(vgdev, req);

  return 0;
}
/* Issue RESOURCE_CREATE_BLOB with the caller-prepared `args`; returns the ioctl result. */
int virtio_gpu_create_blob(struct virtio_gpu_device* vgdev,
                           struct drm_virtgpu_resource_create_blob* args) {
  const int err = virtio_gpu_ioctl(vgdev->fd, VIRTGPU_RESOURCE_CREATE_BLOB, args);

  return err;
}
int virtio_gpu_destroy_handle(struct virtio_gpu_device* vgdev, uint32_t bo_handle) {
struct drm_gem_close args = {
.handle = bo_handle,
};
return virtio_gpu_ioctl(vgdev->fd, GEM_CLOSE, &args);
}
int virtio_gpu_res_id(struct virtio_gpu_device* vgdev, uint32_t handle, uint32_t* res_id) {
struct drm_virtgpu_resource_info args = {
.bo_handle = handle,
};
int r = virtio_gpu_ioctl(vgdev->fd, VIRTGPU_RESOURCE_INFO, &args);
if (r) return r;
*res_id = args.res_handle;
return 0;
}
static int virtio_gpu_get_capset(int fd, struct virgl_renderer_capset_hsakmt* caps) {
struct drm_virtgpu_get_caps args = {
.cap_set_id = VIRGL_RENDERER_CAPSET_HSAKMT,
.cap_set_ver = 0,
.addr = (uintptr_t)caps,
.size = sizeof(*caps),
};
memset(caps, 0, sizeof(*caps));
return virtio_gpu_ioctl(fd, VIRTGPU_GET_CAPS, &args);
}
/*
 * Find and open the first DRM render node whose HSAKMT capset reports the
 * AMDGPU context type.
 *
 * Up to VHSA_MAX_DEVICES devices are enumerated. Nodes that cannot be opened,
 * cannot deliver the capset, or report another context type are skipped.
 * Returns the open fd (O_RDWR | O_CLOEXEC), or -1 when no device qualifies or
 * enumeration fails.
 */
int virtio_gpu_kfd_open(void) {
  drmDevicePtr devices[VHSA_MAX_DEVICES];
  int fd = -1;
  int idx;

  const int num_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
  if (num_devices <= 0) return -1;

  for (idx = 0; idx < num_devices; idx++) {
    struct virgl_renderer_capset_hsakmt caps;

    fd = open(devices[idx]->nodes[DRM_NODE_RENDER], O_RDWR | O_CLOEXEC);
    if (fd < 0) continue;

    /* Keep the first matching device. */
    if (virtio_gpu_get_capset(fd, &caps) == 0 &&
        caps.context_type == VIRTGPU_DRM_CONTEXT_AMDGPU)
      break;

    close(fd);
    fd = -1;
  }

  drmFreeDevices(devices, num_devices);
  return fd;
}
================================================
FILE: libhsakmt/src/virtio/virtio_gpu.h
================================================
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef VIRTIO_GPU_H
#define VIRTIO_GPU_H
#include
#include
#include
#include "virtgpu_drm.h"
/* Capset id requested for the context and queried with GET_CAPS in virtio_gpu.c. */
#define VIRGL_RENDERER_CAPSET_HSAKMT 8
/* context_type a capset must report for virtio_gpu_kfd_open() to accept the device. */
#define VIRTGPU_DRM_CONTEXT_AMDGPU 1
/* Upper bound on the number of DRM devices enumerated by virtio_gpu_kfd_open(). */
#define VHSA_MAX_DEVICES 10
#ifndef ARRAY_SIZE
/* Element count of a true array; not meaningful when x is a pointer. */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#endif
/*
 * Round x up / down to a multiple of align. The mask arithmetic only works
 * when align is a power of two; the result has type uint64_t.
 */
#define VHSA_ALIGN_UP(x, align) (((uint64_t)(x) + (align)-1) & ~(uint64_t)((align)-1))
#define VHSA_ALIGN_DOWN(x, align) ((uint64_t)(x) & ~(uint64_t)((align)-1))
/*
 * Issue DRM_IOCTL_<name> on fd through drmIoctl() and yield its int result.
 *
 * Written as a plain call expression: it needs no GNU statement-expression
 * extension and declares no local that could capture a caller variable named
 * `ret` appearing in `args`.
 */
#define virtio_gpu_ioctl(fd, name, args) drmIoctl((fd), DRM_IOCTL_##name, (args))
/*
 * Capset blob filled in by the GET_CAPS ioctl for VIRGL_RENDERER_CAPSET_HSAKMT.
 * virtio_gpu_kfd_open() only inspects context_type.
 */
struct virgl_renderer_capset_hsakmt {
uint32_t wire_format_version;
/* Underlying drm device version: */
uint32_t version_major;
uint32_t version_minor;
uint32_t version_patchlevel;
uint32_t context_type;
uint32_t pad;
};
/* Header at the start of the shared-memory blob. */
struct virtio_gpu_shmem_base {
/* Polled by virtio_gpu_seqno_sync() until it reaches the request's seqno. */
uint32_t seqno;
/* Byte offset of the response area within the blob (read in virtio_gpu_shmem_init()). */
uint32_t rsp_mem_offset;
};
/* Common header at the start of every request copied into the staging buffer. */
struct virtio_gpu_ccmd_req {
uint32_t cmd;
/* Total request size in bytes; this many bytes are copied when the request is queued. */
uint32_t len;
/* Assigned from the device's running counter when the request is queued. */
uint32_t seqno;
/* Offset of the response slot inside the response area, set by virtio_gpu_alloc_rsp(). */
uint32_t rsp_off;
};
/* Common header at the start of every response slot. */
struct virtio_gpu_ccmd_rsp {
/* Slot size in bytes after rounding up to 8, written by virtio_gpu_alloc_rsp(). */
uint32_t len;
};
/*
 * Layout of the start of the shared-memory blob mapped as vgdev->shmem.
 * NOTE(review): async_error and global_faults are not referenced in
 * virtio_gpu.c - presumably maintained by the host; confirm.
 */
struct virtio_gpu_shmem {
struct virtio_gpu_shmem_base base;
uint32_t async_error;
uint32_t global_faults;
};
/* vhsakmt_* spellings of the struct tags above, so `struct vhsakmt_ccmd_req` names the same type. */
#define vhsakmt_shmem virtio_gpu_shmem
#define vhsakmt_ccmd_req virtio_gpu_ccmd_req
#define vhsakmt_ccmd_rsp virtio_gpu_ccmd_rsp
/* Per-device state created by virtio_gpu_init() and released by virtio_gpu_close(). */
struct virtio_gpu_device {
/* DRM fd used for every ioctl; closed by virtio_gpu_close(). */
int fd;
/* Mapping of the shared blob and the GEM handle backing it. */
struct virtio_gpu_shmem* shmem;
uint32_t shmem_handle;
/* Response area: shmem + rsp_mem_offset, extending to the end of the blob. */
uint8_t* rsp_mem;
uint32_t rsp_mem_len;
/* Offset of the next response slot; updated under rsp_lock. */
uint32_t next_rsp_off;
pthread_mutex_t rsp_lock;
/* Held by virtio_gpu_exec_cmd() while queuing and flushing requests. */
pthread_mutex_t eb_lock;
/* Last sequence number handed to a request. */
uint32_t next_seqno;
/* Bytes and number of requests currently queued in reqbuf. */
uint32_t reqbuf_len;
uint32_t reqbuf_cnt;
/* Heap-allocated request staging buffer. */
uint8_t* reqbuf;
};
/* Create a device object for an open DRM fd; returns NULL on failure. */
struct virtio_gpu_device* virtio_gpu_init(int fd, uint32_t context_id);
/* Release everything owned by vgdev, including its fd and the object itself. */
void virtio_gpu_close(struct virtio_gpu_device* vgdev);
/* Queue req; when sync is true also submit the queue and wait for req to complete. */
int virtio_gpu_exec_cmd(struct virtio_gpu_device* vgdev, struct virtio_gpu_ccmd_req* req,
bool sync);
/* Reserve a response slot of at least size bytes and record its offset in req. */
void* virtio_gpu_alloc_rsp(struct virtio_gpu_device* vgdev, struct virtio_gpu_ccmd_req* req,
uint32_t size);
/* mmap the buffer behind handle; a non-NULL fixed_map forces the mapping address. */
int virtio_gpu_map_handle(struct virtio_gpu_device* vgdev, uint32_t handle, uint64_t size,
void** addr, void* fixed_map);
/* munmap size bytes at addr. */
void virtio_gpu_unmap(void* addr, uint64_t size);
/* Issue RESOURCE_CREATE_BLOB with caller-prepared args. */
int virtio_gpu_create_blob(struct virtio_gpu_device* vgdev,
struct drm_virtgpu_resource_create_blob* args);
/* Close GEM handle bo_handle. */
int virtio_gpu_destroy_handle(struct virtio_gpu_device* vgdev, uint32_t bo_handle);
/* Translate GEM handle into its resource handle (*res_id). */
int virtio_gpu_res_id(struct virtio_gpu_device* vgdev, uint32_t handle, uint32_t* res_id);
/* Open the first render node exposing the AMDGPU HSAKMT capset; returns its fd or -1. */
int virtio_gpu_kfd_open(void);
#endif /* VIRTIO_GPU_H */
================================================
FILE: libhsakmt/tests/kfdtest/.gitignore
================================================
================================================
FILE: libhsakmt/tests/kfdtest/CMakeLists.txt
================================================
#
# Copyright (C) 2018 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
#
# If the environment variable DRM_DIR or LIBHSAKMT_PATH is set, the script
# will pick up the corresponding libraries from those paths.
cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
project(KFDTest)
# For DEB/RPM generation
set ( CPACK_PACKAGE_NAME "kfdtest" )
set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." )
set ( CPACK_PACKAGE_DESCRIPTION "This package includes kfdtest, the list of excluded tests for each ASIC, and a convenience script to run the test suite" )
set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "Test suite for ROCK/KFD" )
# Make proper version for appending
# Default Value is 99999, setting it first
set(ROCM_VERSION_FOR_PACKAGE "99999")
if(DEFINED ENV{ROCM_LIBPATCH_VERSION})
set(ROCM_VERSION_FOR_PACKAGE $ENV{ROCM_LIBPATCH_VERSION})
endif()
set ( CPACK_PACKAGE_VERSION_MAJOR "1" )
set ( CPACK_PACKAGE_VERSION_MINOR "0" )
set ( CPACK_PACKAGE_VERSION_PATCH "0" )
set ( CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/ROCm/ROCR-Runtime/" )
set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT")
## Debian package values
set ( CPACK_DEBIAN_PACKAGE_RELEASE "local" )
if( DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE} )
set ( CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE} )
endif()
## RPM package variables
set ( CPACK_RPM_PACKAGE_RELEASE "local" )
if( DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE} )
set ( CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE} )
endif()
## Note: rpm --eval %{?dist} will evaluate to NULL in Debian
## So Debian distros won't append dist tag to CPACK_RPM_PACKAGE_RELEASE.
## Also for debian package name , the dist tag is added from build env
execute_process( COMMAND rpm --eval %{?dist}
RESULT_VARIABLE PROC_RESULT
OUTPUT_VARIABLE EVAL_RESULT
OUTPUT_STRIP_TRAILING_WHITESPACE )
message("RESULT_VARIABLE ${PROC_RESULT} OUTPUT_VARIABLE: ${EVAL_RESULT}")
## Add distribution tag to rpm package name
if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" )
string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" )
endif()
set(PACKAGE_VERSION_STR "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}.${ROCM_VERSION_FOR_PACKAGE}")
set(CPACK_PACKAGE_VERSION "${PACKAGE_VERSION_STR}")
## Cache entries providing defaults for the source location, the install
## prefix, the packaging prefix and the CPack generators. Being cache
## entries, values already present in the cache are left untouched.
set ( SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}
      CACHE STRING "Location of hsakmt source code." )
set ( CMAKE_INSTALL_PREFIX "/opt/rocm"
      CACHE STRING "Default installation directory." )
set ( CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}"
      CACHE STRING "Default packaging prefix." )
set ( CPACK_GENERATOR "DEB;RPM"
      CACHE STRING "Default packaging generators." )

# Package dependencies (RPM and Debian both depend on rocm-core by default).
set ( CPACK_RPM_PACKAGE_REQUIRES "rocm-core" )
set ( CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core" )

# Debian-only: home page field.
set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/ROCm/ROCR-Runtime/" )

#set ( CMAKE_VERBOSE_MAKEFILE on )
# pkg_check_modules() below is provided by the PkgConfig module, so fail
# here with a clear message if pkg-config is not available instead of
# hitting an "unknown command" error later.
find_package(PkgConfig REQUIRED)

# Let a user-supplied DRM_DIR take precedence when searching for packages.
list (PREPEND CMAKE_PREFIX_PATH "${DRM_DIR}")

# The module name passed to pkg_check_modules() is determined by the
# name of file *.pc
pkg_check_modules(DRM REQUIRED libdrm)
pkg_check_modules(DRM_AMDGPU REQUIRED libdrm_amdgpu)

include_directories(${DRM_AMDGPU_INCLUDE_DIRS})
# Locate libhsakmt. Order of precedence:
#   1. the LIBHSAKMT_PATH environment variable, when set;
#   2. a "libhsakmt" pkg-config module looked up under
#      ${ROCM_INSTALL_PATH}/share/pkgconfig (or /opt/rocm/share/pkgconfig);
#   3. the directory named by the OUT_DIR environment variable.
if( DEFINED ENV{LIBHSAKMT_PATH} )
  set ( LIBHSAKMT_PATH $ENV{LIBHSAKMT_PATH} )
  message ( "LIBHSAKMT_PATH environment variable is set" )
else()
  # Test the variable by name. Writing if( ${ROCM_INSTALL_PATH} ) would expand
  # to a path, which if() then interprets as the name of an (undefined)
  # variable, so the condition would be false for every real path.
  if ( ROCM_INSTALL_PATH )
    set ( ENV{PKG_CONFIG_PATH} ${ROCM_INSTALL_PATH}/share/pkgconfig )
  else()
    set ( ENV{PKG_CONFIG_PATH} /opt/rocm/share/pkgconfig )
  endif()

  pkg_check_modules(HSAKMT libhsakmt)
  if( NOT HSAKMT_FOUND )
    # Last resort: expect the library in the build output directory.
    set ( LIBHSAKMT_PATH $ENV{OUT_DIR} )
  endif()
endif()

# An explicit path overrides whatever pkg-config reported.
if( DEFINED LIBHSAKMT_PATH )
  set ( HSAKMT_LIBRARY_DIRS ${LIBHSAKMT_PATH} )
  set ( HSAKMT_LIBRARIES hsakmt )
endif()

message ( "Find libhsakmt at ${HSAKMT_LIBRARY_DIRS}" )
# Opt in to policy CMP0074 where this CMake version knows about it.
if ( POLICY CMP0074 )
  cmake_policy ( SET CMP0074 NEW )
endif ()

# Look for an LLVM ("Lightning") build under $OUT_DIR only; the result is
# deliberately not cached.
find_path ( LIGHTNING_CMAKE_DIR
            NAMES LLVMConfig.cmake
            PATHS $ENV{OUT_DIR}/llvm/lib/cmake/llvm
            NO_CACHE
            NO_DEFAULT_PATH )

if ( NOT ( DEFINED LIGHTNING_CMAKE_DIR AND EXISTS ${LIGHTNING_CMAKE_DIR} ) )
  message( STATUS "Couldn't find Lightning build in compute directory. Searching LLVM_DIR then defaulting to system LLVM install if still not found..." )
else ()
  # Point find_package() at the Lightning build.
  set ( LLVM_DIR ${LIGHTNING_CMAKE_DIR} )
endif ()

find_package ( LLVM REQUIRED CONFIG )

# Reject LLVM older than 7.0 outright; warn for anything older than 14.0.
if ( ${LLVM_PACKAGE_VERSION} VERSION_LESS "7.0" )
  message( FATAL_ERROR "Requires LLVM 7.0 or greater (found ${LLVM_PACKAGE_VERSION})" )
elseif ( ${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0" )
  message( WARNING "Not using latest LLVM version. Some ASIC targets may not work!" )
endif ()

message( STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}" )
message( STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}" )

# Apply LLVM's include directories and compile definitions to this directory.
include_directories ( ${LLVM_INCLUDE_DIRS} )
separate_arguments ( LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS} )
add_definitions ( ${LLVM_DEFINITIONS_LIST} )

# Link against the monolithic LLVM library when LLVM was built that way,
# otherwise against the individual component libraries that are needed.
if ( NOT LLVM_LINK_LLVM_DYLIB )
  llvm_map_components_to_libnames ( llvm_libs AMDGPUAsmParser Core Support )
else ()
  set ( llvm_libs LLVM )
endif ()
# Header search path, in the same order as before: bundled gtest, kfdtest's
# own headers, the include directories two levels up, then libdrm.
include_directories (
  ${PROJECT_SOURCE_DIR}/gtest-1.6.0
  ${PROJECT_SOURCE_DIR}/include
  ${PROJECT_SOURCE_DIR}/../../include
  ${PROJECT_SOURCE_DIR}/../../libhsakmt/include
  ${DRM_INCLUDE_DIRS} )

# Sources of the kfdtest executable. The list is built in three steps but the
# resulting order is unchanged.

# Bundled Google Test.
set ( SRC_FILES
      gtest-1.6.0/gtest-all.cpp )

# Sources listed ahead of the KFD* files.
list ( APPEND SRC_FILES
       src/AqlQueue.cpp
       src/BasePacket.cpp
       src/BaseDebug.cpp
       src/BaseQueue.cpp
       src/Dispatch.cpp
       src/GoogleTestExtension.cpp
       src/IndirectBuffer.cpp
       src/Assemble.cpp
       src/ShaderStore.cpp
       src/LinuxOSWrapper.cpp
       src/PM4Packet.cpp
       src/PM4Queue.cpp
       src/RDMAUtil.cpp
       src/SDMAPacket.cpp
       src/SDMAQueue.cpp )

# KFD* sources, followed by RDMATest.
list ( APPEND SRC_FILES
       src/KFDBaseComponentTest.cpp
       src/KFDMultiProcessTest.cpp
       src/KFDTestMain.cpp
       src/KFDTestUtil.cpp
       src/KFDTestUtilQueue.cpp
       src/KFDOpenCloseKFDTest.cpp
       src/KFDTopologyTest.cpp
       src/KFDMemoryTest.cpp
       src/KFDLocalMemoryTest.cpp
       src/KFDEventTest.cpp
       src/KFDQMTest.cpp
       src/KFDCWSRTest.cpp
       src/KFDExceptionTest.cpp
       src/KFDGraphicsInterop.cpp
       src/KFDPerfCounters.cpp
       src/KFDDBGTest.cpp
       src/KFDGWSTest.cpp
       src/KFDIPCTest.cpp
       src/KFDASMTest.cpp
       src/KFDEvictTest.cpp
       src/KFDHWSTest.cpp
       src/KFDPerformanceTest.cpp
       src/KFDPMTest.cpp
       src/KFDSVMRangeTest.cpp
       src/KFDSVMEvictTest.cpp
       src/KFDRASTest.cpp
       src/KFDPCSamplingTest.cpp
       src/KFDNegativeTest.cpp
       src/RDMATest.cpp )

message( STATUS "PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR} )

# Uncomment to print every source file at configure time.
#message( STATUS "SRC_FILES: ")
#foreach(file ${SRC_FILES})
# message(STATUS "${file}")
#endforeach()
#add_definitions(-Wall -std=c++11)

# Compiler newer than 4.8.0: build as gnu++17 and emit DT_RUNPATH.
# VERSION_GREATER compares component-wise; a plain string comparison would
# rank "10.2.0" below "4.8.0" and silently skip these flags on any compiler
# with a two-digit major version.
# NOTE(review): this keys off the C compiler version while setting C++ flags;
# confirm whether CMAKE_CXX_COMPILER_VERSION was intended.
if ( "${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER "4.8.0" )
  ## Add --enable-new-dtags to generate DT_RUNPATH
  set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17 -Wl,--enable-new-dtags" )
endif()

# Optimise Release builds; every other build type gets debug information.
if ( "${CMAKE_BUILD_TYPE}" STREQUAL Release )
  set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2" )
else ()
  set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" )
endif ()

## Address Sanitize Flag
# Test the variable by name so an unset ADDRESS_SANITIZER is simply false,
# and append to the linker flags instead of discarding whatever the user or
# toolchain file already put there.
if ( ADDRESS_SANITIZER )
  set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address" )
  set ( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address" )
endif ()
# link_directories() only affects targets created after it, so it must come
# before add_executable(). Libraries that pkg_check_modules() found on the
# default pkg-config path need no entry here.
link_directories ( ${HSAKMT_LIBRARY_DIRS} )

add_executable ( kfdtest ${SRC_FILES} )
target_link_libraries ( kfdtest
  ${HSAKMT_LIBRARIES}
  ${DRM_LDFLAGS}
  ${DRM_AMDGPU_LDFLAGS}
  ${llvm_libs}
  pthread
  m
  stdc++
  rt
  numa )

# Copy the helper files from scripts/ into the build directory unchanged.
foreach ( script_name kfdtest.exclude run_kfdtest.sh )
  configure_file ( scripts/${script_name} ${script_name} COPYONLY )
endforeach ()

# The binary and its launcher script go to bin/, the exclude list to
# share/kfdtest/.
install (
  PROGRAMS
    ${CMAKE_CURRENT_BINARY_DIR}/kfdtest
    ${CMAKE_CURRENT_BINARY_DIR}/run_kfdtest.sh
  DESTINATION bin )
install (
  FILES ${CMAKE_CURRENT_BINARY_DIR}/kfdtest.exclude
  DESTINATION share/kfdtest )
# Unless cmake was run with -DROCM_DEP_ROCMCORE=ON, strip "rocm-core" (and a
# preceding ", " if present) from both package dependency lists.
if ( NOT ROCM_DEP_ROCMCORE )
  foreach ( dep_var CPACK_RPM_PACKAGE_REQUIRES CPACK_DEBIAN_PACKAGE_DEPENDS )
    string ( REGEX REPLACE ",? ?rocm-core" "" ${dep_var} ${${dep_var}} )
  endforeach ()
endif ()

# Must come after all CPACK_* variables have been set.
include ( CPack )
================================================
FILE: libhsakmt/tests/kfdtest/LICENSE.kfdtest
================================================
KFDTest - KFD unit tests LICENSE
Copyright (C) 2018 Advanced Micro Devices, Inc. All Rights Reserved.
MIT LICENSE:
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: libhsakmt/tests/kfdtest/README.txt
================================================
1. Note on building kfdtest
To build this kfdtest application, the following libraries should be already
installed on the building machine:
libdrm libdrm_amdgpu libhsakmt
If libhsakmt is not installed, but the headers and libraries are present
locally, you can specify the directory that contains libhsakmt.a by
export LIBHSAKMT_PATH=/path/to/libhsakmt_dir
(the directory itself, not the libhsakmt.a file).
With that, CMake/make will look for the lib at $LIBHSAKMT_PATH/libhsakmt.a
Note that this assumes that you will be building kfdtest from the same thunk found in ../..
2. How to run kfdtest
Just run "./run_kfdtest.sh" under the building output folder. You may need
to specify the directory containing the libhsakmt library through:
export LD_LIBRARY_PATH=/path/to/libhsakmt_dir
Note: you can use "run_kfdtest.sh -h" to see more options.
================================================
FILE: libhsakmt/tests/kfdtest/gtest-1.6.0/gtest/gtest.h
================================================
// Copyright 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: wan@google.com (Zhanyong Wan)
//
// The Google C++ Testing Framework (Google Test)
//
// This header file defines the public API for Google Test. It should be
// included by any test program that uses Google Test.
//
// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
// leave some internal implementation details in this header file.
// They are clearly marked by comments like this:
//
// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
//
// Such code is NOT meant to be used by a user directly, and is subject
// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user
// program!
//
// Acknowledgment: Google Test borrowed the idea of automatic test
// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
// easyUnit framework.
#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
#define GTEST_INCLUDE_GTEST_GTEST_H_
#include
#include
#include
// Copyright 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
//
// The Google C++ Testing Framework (Google Test)
//
// This header file declares functions and macros used internally by
// Google Test. They are subject to change without notice.
#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
// Copyright 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Authors: wan@google.com (Zhanyong Wan)
//
// Low-level types and utilities for porting Google Test to various
// platforms. They are subject to change without notice. DO NOT USE
// THEM IN USER CODE.
#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
// The user can define the following macros in the build script to
// control Google Test's behavior. If the user doesn't define a macro
// in this list, Google Test will define it.
//
// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2)
// is/isn't available.
// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions
// are enabled.
// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string
// is/isn't available (some systems define
// ::string, which is different to std::string).
// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::wstring
// is/isn't available (some systems define
// ::wstring, which is different to std::wstring).
// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular
// expressions are/aren't available.
// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that <pthread.h>
// is/isn't available.
// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't
// enabled.
// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that
// std::wstring does/doesn't work (Google Test can
// be used where std::wstring is unavailable).
// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple
// is/isn't available.
// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the
// compiler supports Microsoft's "Structured
// Exception Handling".
// GTEST_HAS_STREAM_REDIRECTION
// - Define it to 1/0 to indicate whether the
// platform supports I/O stream redirection using
// dup() and dup2().
// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google
// Test's own tr1 tuple implementation should be
// used. Unused when the user sets
// GTEST_HAS_TR1_TUPLE to 0.
// GTEST_LINKED_AS_SHARED_LIBRARY
// - Define to 1 when compiling tests that use
// Google Test as a shared library (known as
// DLL on Windows).
// GTEST_CREATE_SHARED_LIBRARY
// - Define to 1 when compiling Google Test itself
// as a shared library.
// This header defines the following utilities:
//
// Macros indicating the current platform (defined to 1 if compiled on
// the given platform; otherwise undefined):
// GTEST_OS_AIX - IBM AIX
// GTEST_OS_CYGWIN - Cygwin
// GTEST_OS_HPUX - HP-UX
// GTEST_OS_LINUX - Linux
// GTEST_OS_LINUX_ANDROID - Google Android
// GTEST_OS_MAC - Mac OS X
// GTEST_OS_NACL - Google Native Client (NaCl)
// GTEST_OS_SOLARIS - Sun Solaris
// GTEST_OS_SYMBIAN - Symbian
// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile)
// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop
// GTEST_OS_WINDOWS_MINGW - MinGW
// GTEST_OS_WINDOWS_MOBILE - Windows Mobile
// GTEST_OS_ZOS - z/OS
//
// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the
// most stable support. Since core members of the Google Test project
// don't have access to other platforms, support for them may be less
// stable. If you notice any problems on your platform, please notify
// googletestframework@googlegroups.com (patches for fixing them are
// even more welcome!).
//
// Note that it is possible that none of the GTEST_OS_* macros are defined.
//
// Macros indicating available Google Test features (defined to 1 if
// the corresponding feature is supported; otherwise undefined):
// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized
// tests)
// GTEST_HAS_DEATH_TEST - death tests
// GTEST_HAS_PARAM_TEST - value-parameterized tests
// GTEST_HAS_TYPED_TEST - typed tests
// GTEST_HAS_TYPED_TEST_P - type-parameterized tests
// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with
// GTEST_HAS_POSIX_RE (see above) which users can
// define themselves.
// GTEST_USES_SIMPLE_RE - our own simple regex is used;
// the above two are mutually exclusive.
// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ().
//
// Macros for basic C++ coding:
// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a
// variable don't have to be used.
// GTEST_DISALLOW_ASSIGN_ - disables operator=.
// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used.
//
// Synchronization:
// Mutex, MutexLock, ThreadLocal, GetThreadCount()
// - synchronization primitives.
// GTEST_IS_THREADSAFE - defined to 1 to indicate that the above
// synchronization primitives have real implementations
// and Google Test is thread-safe; or 0 otherwise.
//
// Template meta programming:
// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only.
// IteratorTraits - partial implementation of std::iterator_traits, which
// is not available in libCstd when compiled with Sun C++.
//
// Smart pointers:
// scoped_ptr - as in TR2.
//
// Regular expressions:
// RE - a simple regular expression class using the POSIX
// Extended Regular Expression syntax on UNIX-like
// platforms, or a reduced regular expression syntax on
// other platforms, including Windows.
//
// Logging:
// GTEST_LOG_() - logs messages at the specified severity level.
// LogToStderr() - directs all log messages to stderr.
// FlushInfoLog() - flushes informational log messages.
//
// Stdout and stderr capturing:
// CaptureStdout() - starts capturing stdout.
// GetCapturedStdout() - stops capturing stdout and returns the captured
// string.
// CaptureStderr() - starts capturing stderr.
// GetCapturedStderr() - stops capturing stderr and returns the captured
// string.
//
// Integer types:
// TypeWithSize - maps an integer to a int type.
// Int32, UInt32, Int64, UInt64, TimeInMillis
// - integers of known sizes.
// BiggestInt - the biggest signed integer type.
//
// Command-line utilities:
// GTEST_FLAG() - references a flag.
// GTEST_DECLARE_*() - declares a flag.
// GTEST_DEFINE_*() - defines a flag.
// GetArgvs() - returns the command line as a vector of strings.
//
// Environment variable utilities:
// GetEnv() - gets the value of an environment variable.
// BoolFromGTestEnv() - parses a bool environment variable.
// Int32FromGTestEnv() - parses an Int32 environment variable.
// StringFromGTestEnv() - parses a string environment variable.
#include // for isspace, etc
#include // for ptrdiff_t
#include
#include
#include
#ifndef _WIN32_WCE
# include
# include
#endif // !_WIN32_WCE
#include // NOLINT
#include // NOLINT
#include // NOLINT
// String constants describing the Google Test project and the spelling of
// its flag-name prefixes.

// Developer contact address.
// NOTE(review): the address contains a doubled '@' as found here; confirm
// whether that is intentional before relying on it.
#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
// Flag-name prefix in its lowercase/underscore, lowercase/dash and
// uppercase/underscore spellings.
#define GTEST_FLAG_PREFIX_ "gtest_"
#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
// Human-readable project name and project home page.
#define GTEST_NAME_ "Google Test"
#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/"
// Encodes the version of the gcc (or gcc-compatible) compiler building this
// file as one integer; left undefined for compilers that do not define
// __GNUC__.
#if defined(__GNUC__)
// major * 10000 + minor * 100 + patchlevel, so version 4.3.2 becomes 40302.
# define GTEST_GCC_VER_ \
    ((__GNUC__ * 10000) + (__GNUC_MINOR__ * 100) + __GNUC_PATCHLEVEL__)
#endif  // defined(__GNUC__)
// Works out which platform Google Test is being compiled for. The checks
// form a single #if/#elif chain, so at most one top-level GTEST_OS_* macro is
// defined; Windows and Linux additionally get a more specific sub-platform
// macro where one applies.
#if defined(__CYGWIN__)
# define GTEST_OS_CYGWIN 1
#elif defined(__SYMBIAN32__)
# define GTEST_OS_SYMBIAN 1
#elif defined(_WIN32)
# define GTEST_OS_WINDOWS 1
// Windows flavour: Mobile (CE), MinGW, or plain desktop.
# if defined(_WIN32_WCE)
#  define GTEST_OS_WINDOWS_MOBILE 1
# elif defined(__MINGW__) || defined(__MINGW32__)
#  define GTEST_OS_WINDOWS_MINGW 1
# else
#  define GTEST_OS_WINDOWS_DESKTOP 1
# endif  // defined(_WIN32_WCE)
#elif defined(__APPLE__)
# define GTEST_OS_MAC 1
#elif defined(__linux__)
# define GTEST_OS_LINUX 1
// Android is reported as Linux plus an extra marker.
# if defined(ANDROID)
#  define GTEST_OS_LINUX_ANDROID 1
# endif  // defined(ANDROID)
#elif defined(__MVS__)
# define GTEST_OS_ZOS 1
#elif defined(__sun) && defined(__SVR4)
# define GTEST_OS_SOLARIS 1
#elif defined(_AIX)
# define GTEST_OS_AIX 1
#elif defined(__hpux)
# define GTEST_OS_HPUX 1
#elif defined(__native_client__)
# define GTEST_OS_NACL 1
#endif  // defined(__CYGWIN__)
// Brings in definitions for functions used in the testing::internal::posix
// namespace (read, write, close, chdir, isatty, stat). We do not currently
// use them on Windows Mobile.
#if !GTEST_OS_WINDOWS
// This assumes that non-Windows OSes provide unistd.h. For OSes where this
// is not the case, we need to include headers that provide the functions
// mentioned above.
# include
# if !GTEST_OS_NACL
// TODO(vladl@google.com): Remove this condition when Native Client SDK adds
// strings.h (tracked in
// http://code.google.com/p/nativeclient/issues/detail?id=1175).
# include // Native Client doesn't provide strings.h.
# endif
#elif !GTEST_OS_WINDOWS_MOBILE
# include
# include
#endif
#if defined(_MSC_VER)
# include
#endif
// Defines this to true iff Google Test can use POSIX regular expressions.
// Unless the user set it, POSIX regex is assumed on every non-Windows
// platform.
#ifndef GTEST_HAS_POSIX_RE
# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
#endif
// Exactly one of GTEST_USES_POSIX_RE / GTEST_USES_SIMPLE_RE is defined below.
#if GTEST_HAS_POSIX_RE
// On some platforms, <regex.h> needs someone to define size_t, and
// won't compile otherwise. We can #include it here as we already
// included <stdlib.h>, which is guaranteed to define size_t through
// <stddef.h>.
# include // NOLINT
# define GTEST_USES_POSIX_RE 1
#elif GTEST_OS_WINDOWS
// <regex.h> is not available on Windows. Use our own simple regex
// implementation instead.
# define GTEST_USES_SIMPLE_RE 1
#else
// <regex.h> may not be available on this platform. Use our own
// simple regex implementation instead.
# define GTEST_USES_SIMPLE_RE 1
#endif // GTEST_HAS_POSIX_RE
// Decides whether C++ exceptions are enabled, unless the user already set
// GTEST_HAS_EXCEPTIONS.
#if !defined(GTEST_HAS_EXCEPTIONS)
# if defined(_MSC_VER) || defined(__BORLANDC__)
// The STL implementations of MSVC and C++Builder are controlled by
// _HAS_EXCEPTIONS, so follow it; when it is absent, exceptions are taken to
// be enabled.
#  if !defined(_HAS_EXCEPTIONS)
#   define _HAS_EXCEPTIONS 1
#  endif  // !defined(_HAS_EXCEPTIONS)
#  define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
// Compilers for which exceptions are considered enabled:
//  - gcc and IBM xlC, which define __EXCEPTIONS to 1 iff exceptions are on;
//  - Sun Pro CC, which offers no compile-time way to detect the setting, so
//    exceptions are assumed on unless the user says otherwise;
//  - HP aCC, where exception handling is on by default and has to be
//    turned off with the +noeh option if desired.
# elif (defined(__GNUC__) && __EXCEPTIONS) || defined(__SUNPRO_CC) || \
       (defined(__IBMCPP__) && __EXCEPTIONS) || defined(__HP_aCC)
#  define GTEST_HAS_EXCEPTIONS 1
# else
// Any other compiler: conservatively assume exceptions are disabled.
#  define GTEST_HAS_EXCEPTIONS 0
# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
#endif  // !defined(GTEST_HAS_EXCEPTIONS)
// GTEST_HAS_STD_STRING is no longer consulted by Google Test itself but is
// still provided for clients that test it. Explicitly setting it to 0 is
// rejected.
#ifndef GTEST_HAS_STD_STRING
# define GTEST_HAS_STD_STRING 1
#elif !GTEST_HAS_STD_STRING
// The user told us that ::std::string isn't available.
# error "Google Test cannot be used where ::std::string isn't available."
#endif  // GTEST_HAS_STD_STRING

// ::string (a global, non-std string type): off unless the user enables it.
#if !defined(GTEST_HAS_GLOBAL_STRING)
# define GTEST_HAS_GLOBAL_STRING 0
#endif  // !defined(GTEST_HAS_GLOBAL_STRING)

// ::std::wstring: when the user gave no setting, assume it works everywhere
// except on the platforms known to lack it.
// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring
// is available.
#if !defined(GTEST_HAS_STD_WSTRING)
// Cygwin 1.7 and below doesn't support ::std::wstring.
// Solaris' libc++ doesn't support it either. Android has
// no support for it at least as recent as Froyo (2.2).
# define GTEST_HAS_STD_WSTRING \
    (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS))
#endif  // !defined(GTEST_HAS_STD_WSTRING)

// ::wstring: by default available only when both ::std::wstring and
// ::string are.
#if !defined(GTEST_HAS_GLOBAL_WSTRING)
# define GTEST_HAS_GLOBAL_WSTRING \
    (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING)
#endif  // !defined(GTEST_HAS_GLOBAL_WSTRING)
// Decides whether run-time type information is available, unless the user
// already set GTEST_HAS_RTTI. Each compiler family is handled by a pair of
// branches: first "family and its RTTI marker are both present" (1), then
// "family only" (0).
#if !defined(GTEST_HAS_RTTI)
// MSVC defines _CPPRTTI iff RTTI is enabled.
# if defined(_MSC_VER) && defined(_CPPRTTI)
#  define GTEST_HAS_RTTI 1
# elif defined(_MSC_VER)
#  define GTEST_HAS_RTTI 0
// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled.
# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302) && defined(__GXX_RTTI)
#  define GTEST_HAS_RTTI 1
# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302)
#  define GTEST_HAS_RTTI 0
// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
// both the typeid and dynamic_cast features are present.
# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) && defined(__RTTI_ALL__)
#  define GTEST_HAS_RTTI 1
# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
#  define GTEST_HAS_RTTI 0
# else
// For all other compilers, we assume RTTI is enabled.
#  define GTEST_HAS_RTTI 1
# endif
#endif  // !defined(GTEST_HAS_RTTI)
// It's this header's responsibility to #include <typeinfo> when RTTI
// is enabled.
#if GTEST_HAS_RTTI
# include
#endif
// Determines whether Google Test can use the pthreads library.
#ifndef GTEST_HAS_PTHREAD
// The user didn't tell us explicitly, so we assume pthreads support is
// available on Linux, Mac and HP-UX (the three platforms tested below).
//
// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
// to your compiler flags.
# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX)
#endif // GTEST_HAS_PTHREAD
#if GTEST_HAS_PTHREAD
// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
// true.
# include // NOLINT
// For timespec and nanosleep, used below.
# include // NOLINT
#endif
// Controls whether Google Test uses tr1/tuple. Defining this macro to 0
// prevents Google Test from using tuple (any feature depending on tuple
// will be disabled in that mode).
#if !defined(GTEST_HAS_TR1_TUPLE)
// No user setting: this copy defaults to 0, i.e. tuple is not used.
# define GTEST_HAS_TR1_TUPLE 0
#endif  // !defined(GTEST_HAS_TR1_TUPLE)

// Controls whether Google Test's own tr1 tuple implementation is used
// instead of one supplied by the compiler.
#if !defined(GTEST_USE_OWN_TR1_TUPLE)
// No user setting, so decide here. The bundled implementation is chosen
// unless the compiler is known to ship a usable TR1 tuple. At this time,
// GCC 4.0.0+ and MSVC 2010 are the only mainstream compilers that do.
// NVIDIA's CUDA NVCC compiler pretends to be GCC by defining __GNUC__ and
// friends, but cannot compile GCC's tuple implementation. MSVC 2008 (9.0)
// provides TR1 tuple in a 323 MB Feature Pack download, which we cannot
// assume the user has.
# if !((defined(__GNUC__) && !defined(__CUDACC__) && \
        (GTEST_GCC_VER_ >= 40000)) || \
       _MSC_VER >= 1600)
#  define GTEST_USE_OWN_TR1_TUPLE 1
# else
#  define GTEST_USE_OWN_TR1_TUPLE 0
# endif
#endif  // !defined(GTEST_USE_OWN_TR1_TUPLE)
// To avoid conditional compilation everywhere, we make it
// gtest-port.h's responsibility to #include the header implementing
// tr1/tuple.
#if GTEST_HAS_TR1_TUPLE
# if GTEST_USE_OWN_TR1_TUPLE
// This file was GENERATED by a script. DO NOT EDIT BY HAND!!!
// Copyright 2009 Google Inc.
// All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: wan@google.com (Zhanyong Wan)
// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
#include // For ::std::pair.
// The compiler used in Symbian has a bug that prevents us from declaring the
// tuple template as a friend (it complains that tuple is redefined). This
// hack bypasses the bug by declaring the members that should otherwise be
// private as public.
// Sun Studio versions < 12 also have the above bug.
#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
#else
# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
template friend class tuple; \
private:
#endif
// GTEST_n_TUPLE_(T) is the type of an n-tuple.
#define GTEST_0_TUPLE_(T) tuple<>
#define GTEST_1_TUPLE_(T) tuple
#define GTEST_2_TUPLE_(T) tuple
#define GTEST_3_TUPLE_(T) tuple
#define GTEST_4_TUPLE_(T) tuple
#define GTEST_5_TUPLE_(T) tuple
#define GTEST_6_TUPLE_(T) tuple
#define GTEST_7_TUPLE_(T) tuple
#define GTEST_8_TUPLE_(T) tuple
#define GTEST_9_TUPLE_(T) tuple
#define GTEST_10_TUPLE_(T) tuple
// GTEST_n_TYPENAMES_(T) expands to a comma-separated list of n template
// type parameters named T0 .. T(n-1).  The 0-ary form expands to nothing.
#define GTEST_0_TYPENAMES_(T)
#define GTEST_1_TYPENAMES_(T) \
    typename T##0
#define GTEST_2_TYPENAMES_(T) \
    typename T##0, typename T##1
#define GTEST_3_TYPENAMES_(T) \
    typename T##0, typename T##1, typename T##2
#define GTEST_4_TYPENAMES_(T) \
    typename T##0, typename T##1, typename T##2, typename T##3
#define GTEST_5_TYPENAMES_(T) \
    typename T##0, typename T##1, typename T##2, typename T##3, \
    typename T##4
#define GTEST_6_TYPENAMES_(T) \
    typename T##0, typename T##1, typename T##2, typename T##3, \
    typename T##4, typename T##5
#define GTEST_7_TYPENAMES_(T) \
    typename T##0, typename T##1, typename T##2, typename T##3, \
    typename T##4, typename T##5, typename T##6
#define GTEST_8_TYPENAMES_(T) \
    typename T##0, typename T##1, typename T##2, typename T##3, \
    typename T##4, typename T##5, typename T##6, typename T##7
#define GTEST_9_TYPENAMES_(T) \
    typename T##0, typename T##1, typename T##2, typename T##3, \
    typename T##4, typename T##5, typename T##6, typename T##7, \
    typename T##8
#define GTEST_10_TYPENAMES_(T) \
    typename T##0, typename T##1, typename T##2, typename T##3, \
    typename T##4, typename T##5, typename T##6, typename T##7, \
    typename T##8, typename T##9
// In theory, defining stuff in the ::std namespace is undefined
// behavior. We can do this as we are playing the role of a standard
// library vendor.
namespace std {
namespace tr1 {
// Primary tuple template.  It accepts up to 10 field types; every unused
// trailing slot defaults to void, which is how the n-tuple specializations
// (see the GTEST_n_TUPLE_ macros) are selected.
template <typename T0 = void, typename T1 = void, typename T2 = void,
    typename T3 = void, typename T4 = void, typename T5 = void,
    typename T6 = void, typename T7 = void, typename T8 = void,
    typename T9 = void>
class tuple;
// Anything in namespace gtest_internal is Google Test's INTERNAL
// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
namespace gtest_internal {
// ByRef<T>::type is T if T is a reference; otherwise it's const T&.
template <typename T>
struct ByRef { typedef const T& type; };  // NOLINT
// Partial specialization for reference types: the reference is kept as is.
template <typename T>
struct ByRef<T&> { typedef T& type; };  // NOLINT
// A handy wrapper for ByRef: expands to the fully qualified dependent type
// ByRef<T>::type (the leading `typename` means it is meant for use inside
// templates).
#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
// AddRef<T>::type is T if T is a reference; otherwise it's T&.  This
// is the same as tr1::add_reference<T>::type.
template <typename T>
struct AddRef { typedef T& type; };  // NOLINT
// Partial specialization for reference types: the reference is kept as is.
template <typename T>
struct AddRef<T&> { typedef T& type; };  // NOLINT
// A handy wrapper for AddRef: expands to the fully qualified dependent type
// AddRef<T>::type (the leading `typename` means it is meant for use inside
// templates).
#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
// A helper for implementing get<k>().  Only declared here; k is the
// zero-based field index.
template <int k> class Get;
// A helper for implementing tuple_element. kIndexValid is true
// iff k < the number of fields in tuple type T.
template
struct TupleElement;
template
struct TupleElement { typedef T0 type; };
template
struct TupleElement { typedef T1 type; };
template
struct TupleElement { typedef T2 type; };
template
struct TupleElement { typedef T3 type; };
template
struct TupleElement { typedef T4 type; };
template
struct TupleElement { typedef T5 type; };
template
struct TupleElement { typedef T6 type; };
template
struct TupleElement { typedef T7 type; };
template
struct TupleElement { typedef T8 type; };
template
struct TupleElement { typedef T9 type; };
} // namespace gtest_internal
// The 0-tuple.  It has no fields, so construction, copy and assignment have
// nothing to transfer.
template <>
class tuple<> {
 public:
  // Creates an empty tuple.
  tuple() {}

  // Copy construction: there is no state to copy.
  tuple(const tuple& /* other */) {}

  // Copy assignment: there is no state to update; returns the target.
  tuple& operator=(const tuple& /* other */) {
    return *this;
  }
};
template
class GTEST_1_TUPLE_(T) {
public:
template friend class gtest_internal::Get;
tuple() : f0_() {}
explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {}
tuple(const tuple& t) : f0_(t.f0_) {}
template
tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {}
tuple& operator=(const tuple& t) { return CopyFrom(t); }
template
tuple& operator=(const GTEST_1_TUPLE_(U)& t) {
return CopyFrom(t);
}
GTEST_DECLARE_TUPLE_AS_FRIEND_
template
tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) {
f0_ = t.f0_;
return *this;
}
T0 f0_;
};
template
class GTEST_2_TUPLE_(T) {
public:
template friend class gtest_internal::Get;
tuple() : f0_(), f1_() {}
explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0),
f1_(f1) {}
tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {}
template